diff --git "a/codet5_ia3_official_0.0001/checkpoint-117744/trainer_state.json" "b/codet5_ia3_official_0.0001/checkpoint-117744/trainer_state.json" new file mode 100644--- /dev/null +++ "b/codet5_ia3_official_0.0001/checkpoint-117744/trainer_state.json" @@ -0,0 +1,165045 @@ +{ + "best_metric": 0.0021924919669198163, + "best_model_checkpoint": "./results-cc/code-t5/codet5_ia3_official_0.0001/checkpoint-14718", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 117744, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003397200706617747, + "grad_norm": 2.2195968627929688, + "learning_rate": 9.999575349911673e-05, + "loss": 10.0277, + "step": 5 + }, + { + "epoch": 0.0006794401413235494, + "grad_norm": 1.40236496925354, + "learning_rate": 9.999320559858677e-05, + "loss": 10.112, + "step": 10 + }, + { + "epoch": 0.0010191602119853241, + "grad_norm": 2.544349431991577, + "learning_rate": 9.99889590977035e-05, + "loss": 10.7856, + "step": 15 + }, + { + "epoch": 0.001358880282647099, + "grad_norm": 1.8622939586639404, + "learning_rate": 9.998471259682023e-05, + "loss": 9.73, + "step": 20 + }, + { + "epoch": 0.0016986003533088735, + "grad_norm": 2.0292515754699707, + "learning_rate": 9.998046609593695e-05, + "loss": 10.2627, + "step": 25 + }, + { + "epoch": 0.0020383204239706482, + "grad_norm": 1.862714171409607, + "learning_rate": 9.997621959505368e-05, + "loss": 9.8577, + "step": 30 + }, + { + "epoch": 0.002378040494632423, + "grad_norm": 3.0175716876983643, + "learning_rate": 9.997197309417041e-05, + "loss": 9.6583, + "step": 35 + }, + { + "epoch": 0.002717760565294198, + "grad_norm": 2.3832008838653564, + "learning_rate": 9.996772659328714e-05, + "loss": 9.8661, + "step": 40 + }, + { + "epoch": 0.0030574806359559724, + "grad_norm": 1.7646359205245972, + "learning_rate": 9.996348009240387e-05, + "loss": 10.5005, + "step": 45 + }, + { + "epoch": 0.003397200706617747, + "grad_norm": 1.7444077730178833, + "learning_rate": 9.996008289169725e-05, + "loss": 9.5758, + "step": 50 + }, + { + "epoch": 0.0037369207772795215, + "grad_norm": 1.9149781465530396, + "learning_rate": 9.995583639081398e-05, + "loss": 9.735, + "step": 55 + }, + { + "epoch": 0.0040766408479412965, + "grad_norm": 1.6488384008407593, + "learning_rate": 9.995158988993069e-05, + "loss": 9.8686, + "step": 60 + }, + { + "epoch": 0.0044163609186030715, + "grad_norm": 2.184131622314453, + "learning_rate": 9.994734338904743e-05, + "loss": 10.1822, + "step": 65 + }, + { + "epoch": 0.004756080989264846, + "grad_norm": 2.6063928604125977, + "learning_rate": 9.994309688816416e-05, + "loss": 9.9386, + "step": 70 + }, + { + "epoch": 0.005095801059926621, + "grad_norm": 1.925402045249939, + "learning_rate": 9.993885038728087e-05, + "loss": 9.3722, + "step": 75 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 3.882549524307251, + "learning_rate": 9.993545318657427e-05, + "loss": 9.4705, + "step": 80 + }, + { + "epoch": 0.00577524120125017, + "grad_norm": 2.1654701232910156, + "learning_rate": 9.9931206685691e-05, + "loss": 9.5574, + "step": 85 + }, + { + "epoch": 0.006114961271911945, + "grad_norm": 2.0329928398132324, + "learning_rate": 9.992696018480773e-05, + "loss": 9.7831, + "step": 90 + }, + { + "epoch": 0.006454681342573719, + "grad_norm": 2.408450126647949, + "learning_rate": 9.992271368392445e-05, + "loss": 10.2149, + "step": 95 + }, + { + "epoch": 0.006794401413235494, + "grad_norm": 2.039151430130005, + "learning_rate": 9.991846718304117e-05, + "loss": 10.2672, + "step": 100 + }, + { + "epoch": 0.007134121483897269, + "grad_norm": 2.3165321350097656, + "learning_rate": 9.991422068215791e-05, + "loss": 9.4804, + "step": 105 + }, + { + "epoch": 0.007473841554559043, + "grad_norm": 2.117964029312134, + "learning_rate": 9.990997418127464e-05, + "loss": 10.1318, + "step": 110 + }, + { + "epoch": 0.007813561625220818, + "grad_norm": 2.1264026165008545, + "learning_rate": 9.990572768039135e-05, + "loss": 9.6686, + "step": 115 + }, + { + "epoch": 0.008153281695882593, + "grad_norm": 1.2933008670806885, + "learning_rate": 9.99014811795081e-05, + "loss": 9.4348, + "step": 120 + }, + { + "epoch": 0.008493001766544368, + "grad_norm": 2.1963112354278564, + "learning_rate": 9.989723467862482e-05, + "loss": 9.7095, + "step": 125 + }, + { + "epoch": 0.008832721837206143, + "grad_norm": 1.4545302391052246, + "learning_rate": 9.989298817774154e-05, + "loss": 9.5571, + "step": 130 + }, + { + "epoch": 0.009172441907867916, + "grad_norm": 1.4792237281799316, + "learning_rate": 9.988874167685828e-05, + "loss": 8.9859, + "step": 135 + }, + { + "epoch": 0.009512161978529691, + "grad_norm": 1.2513773441314697, + "learning_rate": 9.9884495175975e-05, + "loss": 9.1648, + "step": 140 + }, + { + "epoch": 0.009851882049191466, + "grad_norm": 2.559937000274658, + "learning_rate": 9.988024867509172e-05, + "loss": 9.7622, + "step": 145 + }, + { + "epoch": 0.010191602119853241, + "grad_norm": 1.9447046518325806, + "learning_rate": 9.987600217420846e-05, + "loss": 9.8491, + "step": 150 + }, + { + "epoch": 0.010531322190515016, + "grad_norm": 1.3316339254379272, + "learning_rate": 9.987175567332519e-05, + "loss": 9.0772, + "step": 155 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 1.4368464946746826, + "learning_rate": 9.98675091724419e-05, + "loss": 8.9018, + "step": 160 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 1.5808727741241455, + "learning_rate": 9.986326267155865e-05, + "loss": 8.9733, + "step": 165 + }, + { + "epoch": 0.01155048240250034, + "grad_norm": 1.7572826147079468, + "learning_rate": 9.985901617067537e-05, + "loss": 8.7883, + "step": 170 + }, + { + "epoch": 0.011890202473162114, + "grad_norm": 1.6672322750091553, + "learning_rate": 9.985476966979209e-05, + "loss": 8.8474, + "step": 175 + }, + { + "epoch": 0.01222992254382389, + "grad_norm": 1.3700741529464722, + "learning_rate": 9.985052316890883e-05, + "loss": 9.4713, + "step": 180 + }, + { + "epoch": 0.012569642614485664, + "grad_norm": 1.9305622577667236, + "learning_rate": 9.984627666802555e-05, + "loss": 9.2708, + "step": 185 + }, + { + "epoch": 0.012909362685147438, + "grad_norm": 1.964107871055603, + "learning_rate": 9.984203016714227e-05, + "loss": 8.8052, + "step": 190 + }, + { + "epoch": 0.013249082755809213, + "grad_norm": 1.9907019138336182, + "learning_rate": 9.983778366625901e-05, + "loss": 9.6229, + "step": 195 + }, + { + "epoch": 0.013588802826470988, + "grad_norm": 1.7012939453125, + "learning_rate": 9.983353716537573e-05, + "loss": 8.9995, + "step": 200 + }, + { + "epoch": 0.013928522897132763, + "grad_norm": 1.7597671747207642, + "learning_rate": 9.982929066449246e-05, + "loss": 8.8808, + "step": 205 + }, + { + "epoch": 0.014268242967794538, + "grad_norm": 1.6379801034927368, + "learning_rate": 9.98250441636092e-05, + "loss": 8.8094, + "step": 210 + }, + { + "epoch": 0.014607963038456313, + "grad_norm": 1.9670891761779785, + "learning_rate": 9.982079766272591e-05, + "loss": 9.522, + "step": 215 + }, + { + "epoch": 0.014947683109118086, + "grad_norm": 1.9001445770263672, + "learning_rate": 9.981655116184264e-05, + "loss": 9.3781, + "step": 220 + }, + { + "epoch": 0.015287403179779861, + "grad_norm": 1.9136974811553955, + "learning_rate": 9.981230466095938e-05, + "loss": 8.8219, + "step": 225 + }, + { + "epoch": 0.015627123250441636, + "grad_norm": 2.4746735095977783, + "learning_rate": 9.98080581600761e-05, + "loss": 8.9191, + "step": 230 + }, + { + "epoch": 0.01596684332110341, + "grad_norm": 2.372750759124756, + "learning_rate": 9.980381165919283e-05, + "loss": 9.1705, + "step": 235 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 1.418637752532959, + "learning_rate": 9.980041445848621e-05, + "loss": 8.6098, + "step": 240 + }, + { + "epoch": 0.01664628346242696, + "grad_norm": 2.103688955307007, + "learning_rate": 9.979616795760294e-05, + "loss": 9.1948, + "step": 245 + }, + { + "epoch": 0.016986003533088736, + "grad_norm": 2.14066481590271, + "learning_rate": 9.979192145671968e-05, + "loss": 9.0493, + "step": 250 + }, + { + "epoch": 0.01732572360375051, + "grad_norm": 1.4272230863571167, + "learning_rate": 9.978767495583639e-05, + "loss": 8.7393, + "step": 255 + }, + { + "epoch": 0.017665443674412286, + "grad_norm": 1.5108071565628052, + "learning_rate": 9.978342845495312e-05, + "loss": 9.3101, + "step": 260 + }, + { + "epoch": 0.01800516374507406, + "grad_norm": 2.017267942428589, + "learning_rate": 9.977918195406986e-05, + "loss": 9.0584, + "step": 265 + }, + { + "epoch": 0.018344883815735832, + "grad_norm": 1.4429893493652344, + "learning_rate": 9.977493545318658e-05, + "loss": 8.5374, + "step": 270 + }, + { + "epoch": 0.01868460388639761, + "grad_norm": 1.7082629203796387, + "learning_rate": 9.97706889523033e-05, + "loss": 8.2905, + "step": 275 + }, + { + "epoch": 0.019024323957059382, + "grad_norm": 1.9540777206420898, + "learning_rate": 9.976644245142005e-05, + "loss": 8.9633, + "step": 280 + }, + { + "epoch": 0.01936404402772116, + "grad_norm": 1.4489926099777222, + "learning_rate": 9.976219595053676e-05, + "loss": 8.7871, + "step": 285 + }, + { + "epoch": 0.019703764098382932, + "grad_norm": 1.5926896333694458, + "learning_rate": 9.975794944965349e-05, + "loss": 8.5374, + "step": 290 + }, + { + "epoch": 0.020043484169044706, + "grad_norm": 1.8608131408691406, + "learning_rate": 9.975370294877022e-05, + "loss": 8.4816, + "step": 295 + }, + { + "epoch": 0.020383204239706482, + "grad_norm": 1.6402130126953125, + "learning_rate": 9.974945644788694e-05, + "loss": 8.3059, + "step": 300 + }, + { + "epoch": 0.020722924310368256, + "grad_norm": 1.0616756677627563, + "learning_rate": 9.974520994700367e-05, + "loss": 8.5302, + "step": 305 + }, + { + "epoch": 0.021062644381030032, + "grad_norm": 2.0486464500427246, + "learning_rate": 9.97409634461204e-05, + "loss": 8.6573, + "step": 310 + }, + { + "epoch": 0.021402364451691806, + "grad_norm": 2.764594316482544, + "learning_rate": 9.973671694523713e-05, + "loss": 9.0297, + "step": 315 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 1.457748293876648, + "learning_rate": 9.973247044435386e-05, + "loss": 8.6019, + "step": 320 + }, + { + "epoch": 0.022081804593015356, + "grad_norm": 1.5769116878509521, + "learning_rate": 9.972822394347058e-05, + "loss": 8.2577, + "step": 325 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 1.7383826971054077, + "learning_rate": 9.972397744258731e-05, + "loss": 8.1914, + "step": 330 + }, + { + "epoch": 0.022761244734338906, + "grad_norm": 1.6818331480026245, + "learning_rate": 9.971973094170404e-05, + "loss": 8.8289, + "step": 335 + }, + { + "epoch": 0.02310096480500068, + "grad_norm": 1.429895281791687, + "learning_rate": 9.971548444082077e-05, + "loss": 8.3524, + "step": 340 + }, + { + "epoch": 0.023440684875662456, + "grad_norm": 1.2669081687927246, + "learning_rate": 9.97112379399375e-05, + "loss": 8.289, + "step": 345 + }, + { + "epoch": 0.02378040494632423, + "grad_norm": 1.399524211883545, + "learning_rate": 9.970699143905422e-05, + "loss": 7.8466, + "step": 350 + }, + { + "epoch": 0.024120125016986002, + "grad_norm": 1.6960299015045166, + "learning_rate": 9.970274493817095e-05, + "loss": 8.6849, + "step": 355 + }, + { + "epoch": 0.02445984508764778, + "grad_norm": 1.7302825450897217, + "learning_rate": 9.969849843728768e-05, + "loss": 8.6366, + "step": 360 + }, + { + "epoch": 0.024799565158309552, + "grad_norm": 2.6233043670654297, + "learning_rate": 9.969425193640441e-05, + "loss": 8.3961, + "step": 365 + }, + { + "epoch": 0.02513928522897133, + "grad_norm": 1.480035424232483, + "learning_rate": 9.969000543552114e-05, + "loss": 8.6224, + "step": 370 + }, + { + "epoch": 0.025479005299633102, + "grad_norm": 1.156540870666504, + "learning_rate": 9.968575893463786e-05, + "loss": 7.9226, + "step": 375 + }, + { + "epoch": 0.025818725370294875, + "grad_norm": 1.7962318658828735, + "learning_rate": 9.968151243375459e-05, + "loss": 8.5519, + "step": 380 + }, + { + "epoch": 0.026158445440956652, + "grad_norm": 1.8737194538116455, + "learning_rate": 9.967726593287132e-05, + "loss": 8.4364, + "step": 385 + }, + { + "epoch": 0.026498165511618425, + "grad_norm": 1.6001181602478027, + "learning_rate": 9.967301943198805e-05, + "loss": 7.8641, + "step": 390 + }, + { + "epoch": 0.026837885582280202, + "grad_norm": 1.6181342601776123, + "learning_rate": 9.966877293110478e-05, + "loss": 7.3995, + "step": 395 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 1.5771849155426025, + "learning_rate": 9.96645264302215e-05, + "loss": 7.8526, + "step": 400 + }, + { + "epoch": 0.027517325723603752, + "grad_norm": 1.1884416341781616, + "learning_rate": 9.966027992933823e-05, + "loss": 8.117, + "step": 405 + }, + { + "epoch": 0.027857045794265525, + "grad_norm": 2.015026092529297, + "learning_rate": 9.965603342845496e-05, + "loss": 8.5386, + "step": 410 + }, + { + "epoch": 0.0281967658649273, + "grad_norm": 1.2226543426513672, + "learning_rate": 9.965178692757169e-05, + "loss": 8.1885, + "step": 415 + }, + { + "epoch": 0.028536485935589075, + "grad_norm": 0.8924500942230225, + "learning_rate": 9.964754042668842e-05, + "loss": 7.9768, + "step": 420 + }, + { + "epoch": 0.02887620600625085, + "grad_norm": 1.3788869380950928, + "learning_rate": 9.964329392580513e-05, + "loss": 8.2231, + "step": 425 + }, + { + "epoch": 0.029215926076912625, + "grad_norm": 1.4598475694656372, + "learning_rate": 9.963904742492187e-05, + "loss": 7.9246, + "step": 430 + }, + { + "epoch": 0.0295556461475744, + "grad_norm": 1.8520469665527344, + "learning_rate": 9.96348009240386e-05, + "loss": 8.0527, + "step": 435 + }, + { + "epoch": 0.029895366218236172, + "grad_norm": 1.4544007778167725, + "learning_rate": 9.963055442315531e-05, + "loss": 7.9365, + "step": 440 + }, + { + "epoch": 0.03023508628889795, + "grad_norm": 1.632197380065918, + "learning_rate": 9.962630792227206e-05, + "loss": 7.6173, + "step": 445 + }, + { + "epoch": 0.030574806359559722, + "grad_norm": 1.7875256538391113, + "learning_rate": 9.962206142138878e-05, + "loss": 7.8095, + "step": 450 + }, + { + "epoch": 0.0309145264302215, + "grad_norm": 1.496638298034668, + "learning_rate": 9.961866422068217e-05, + "loss": 8.3371, + "step": 455 + }, + { + "epoch": 0.03125424650088327, + "grad_norm": 1.915732741355896, + "learning_rate": 9.96144177197989e-05, + "loss": 8.5992, + "step": 460 + }, + { + "epoch": 0.03159396657154505, + "grad_norm": 1.1497036218643188, + "learning_rate": 9.961017121891561e-05, + "loss": 7.7974, + "step": 465 + }, + { + "epoch": 0.03193368664220682, + "grad_norm": 1.5241578817367554, + "learning_rate": 9.960592471803235e-05, + "loss": 7.9306, + "step": 470 + }, + { + "epoch": 0.032273406712868595, + "grad_norm": 1.8431288003921509, + "learning_rate": 9.960167821714908e-05, + "loss": 8.0421, + "step": 475 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 1.520870566368103, + "learning_rate": 9.959743171626579e-05, + "loss": 7.443, + "step": 480 + }, + { + "epoch": 0.03295284685419215, + "grad_norm": 1.9101468324661255, + "learning_rate": 9.959318521538253e-05, + "loss": 7.5383, + "step": 485 + }, + { + "epoch": 0.03329256692485392, + "grad_norm": 1.2860801219940186, + "learning_rate": 9.958893871449926e-05, + "loss": 7.4753, + "step": 490 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 1.3139928579330444, + "learning_rate": 9.958469221361598e-05, + "loss": 8.0711, + "step": 495 + }, + { + "epoch": 0.03397200706617747, + "grad_norm": 2.382516622543335, + "learning_rate": 9.958044571273272e-05, + "loss": 7.9498, + "step": 500 + }, + { + "epoch": 0.03431172713683924, + "grad_norm": 1.4085739850997925, + "learning_rate": 9.957619921184945e-05, + "loss": 7.6138, + "step": 505 + }, + { + "epoch": 0.03465144720750102, + "grad_norm": 1.34367835521698, + "learning_rate": 9.957195271096616e-05, + "loss": 7.7689, + "step": 510 + }, + { + "epoch": 0.034991167278162795, + "grad_norm": 1.3598331212997437, + "learning_rate": 9.95677062100829e-05, + "loss": 7.4086, + "step": 515 + }, + { + "epoch": 0.03533088734882457, + "grad_norm": 1.3421567678451538, + "learning_rate": 9.956345970919963e-05, + "loss": 7.8855, + "step": 520 + }, + { + "epoch": 0.03567060741948634, + "grad_norm": 1.5375795364379883, + "learning_rate": 9.955921320831634e-05, + "loss": 7.642, + "step": 525 + }, + { + "epoch": 0.03601032749014812, + "grad_norm": 1.6586527824401855, + "learning_rate": 9.955496670743309e-05, + "loss": 7.1819, + "step": 530 + }, + { + "epoch": 0.036350047560809895, + "grad_norm": 1.179348349571228, + "learning_rate": 9.955072020654981e-05, + "loss": 7.4156, + "step": 535 + }, + { + "epoch": 0.036689767631471665, + "grad_norm": 2.1848621368408203, + "learning_rate": 9.954647370566653e-05, + "loss": 7.4587, + "step": 540 + }, + { + "epoch": 0.03702948770213344, + "grad_norm": 1.3930761814117432, + "learning_rate": 9.954222720478327e-05, + "loss": 7.3459, + "step": 545 + }, + { + "epoch": 0.03736920777279522, + "grad_norm": 1.6147781610488892, + "learning_rate": 9.953798070389998e-05, + "loss": 7.913, + "step": 550 + }, + { + "epoch": 0.03770892784345699, + "grad_norm": 1.2318421602249146, + "learning_rate": 9.953373420301671e-05, + "loss": 7.4613, + "step": 555 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 14.445450782775879, + "learning_rate": 9.952948770213345e-05, + "loss": 7.4797, + "step": 560 + }, + { + "epoch": 0.03838836798478054, + "grad_norm": 1.447785496711731, + "learning_rate": 9.952524120125017e-05, + "loss": 7.663, + "step": 565 + }, + { + "epoch": 0.03872808805544232, + "grad_norm": 1.2133177518844604, + "learning_rate": 9.95209947003669e-05, + "loss": 7.5911, + "step": 570 + }, + { + "epoch": 0.03906780812610409, + "grad_norm": 1.6101973056793213, + "learning_rate": 9.951674819948364e-05, + "loss": 7.7617, + "step": 575 + }, + { + "epoch": 0.039407528196765865, + "grad_norm": 1.5707918405532837, + "learning_rate": 9.951250169860035e-05, + "loss": 7.2746, + "step": 580 + }, + { + "epoch": 0.03974724826742764, + "grad_norm": 1.4747017621994019, + "learning_rate": 9.950825519771708e-05, + "loss": 7.559, + "step": 585 + }, + { + "epoch": 0.04008696833808941, + "grad_norm": 1.2751151323318481, + "learning_rate": 9.950400869683382e-05, + "loss": 7.0742, + "step": 590 + }, + { + "epoch": 0.04042668840875119, + "grad_norm": 1.4686486721038818, + "learning_rate": 9.949976219595054e-05, + "loss": 7.179, + "step": 595 + }, + { + "epoch": 0.040766408479412965, + "grad_norm": 1.333364725112915, + "learning_rate": 9.949551569506727e-05, + "loss": 7.3349, + "step": 600 + }, + { + "epoch": 0.04110612855007474, + "grad_norm": 1.2560040950775146, + "learning_rate": 9.9491269194184e-05, + "loss": 7.2152, + "step": 605 + }, + { + "epoch": 0.04144584862073651, + "grad_norm": 5.9243011474609375, + "learning_rate": 9.948702269330072e-05, + "loss": 7.253, + "step": 610 + }, + { + "epoch": 0.04178556869139829, + "grad_norm": 1.3705462217330933, + "learning_rate": 9.948277619241745e-05, + "loss": 7.2954, + "step": 615 + }, + { + "epoch": 0.042125288762060065, + "grad_norm": 1.3280870914459229, + "learning_rate": 9.947852969153418e-05, + "loss": 7.0023, + "step": 620 + }, + { + "epoch": 0.042465008832721834, + "grad_norm": 1.5480890274047852, + "learning_rate": 9.94742831906509e-05, + "loss": 6.6209, + "step": 625 + }, + { + "epoch": 0.04280472890338361, + "grad_norm": 1.4617500305175781, + "learning_rate": 9.947003668976763e-05, + "loss": 6.6055, + "step": 630 + }, + { + "epoch": 0.04314444897404539, + "grad_norm": 1.5756878852844238, + "learning_rate": 9.946579018888436e-05, + "loss": 7.0135, + "step": 635 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 1.4289640188217163, + "learning_rate": 9.946154368800109e-05, + "loss": 7.1441, + "step": 640 + }, + { + "epoch": 0.043823889115368934, + "grad_norm": 1.3657900094985962, + "learning_rate": 9.945729718711782e-05, + "loss": 7.5154, + "step": 645 + }, + { + "epoch": 0.04416360918603071, + "grad_norm": 1.971498966217041, + "learning_rate": 9.94538999864112e-05, + "loss": 7.2665, + "step": 650 + }, + { + "epoch": 0.04450332925669249, + "grad_norm": 1.4446492195129395, + "learning_rate": 9.944965348552793e-05, + "loss": 7.0674, + "step": 655 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 1.0143150091171265, + "learning_rate": 9.944540698464467e-05, + "loss": 6.7426, + "step": 660 + }, + { + "epoch": 0.045182769398016034, + "grad_norm": 1.3732986450195312, + "learning_rate": 9.944116048376138e-05, + "loss": 7.0236, + "step": 665 + }, + { + "epoch": 0.04552248946867781, + "grad_norm": 1.5511842966079712, + "learning_rate": 9.943691398287811e-05, + "loss": 7.2107, + "step": 670 + }, + { + "epoch": 0.04586220953933958, + "grad_norm": 1.4255778789520264, + "learning_rate": 9.943266748199484e-05, + "loss": 6.817, + "step": 675 + }, + { + "epoch": 0.04620192961000136, + "grad_norm": 1.0669182538986206, + "learning_rate": 9.942842098111157e-05, + "loss": 6.9347, + "step": 680 + }, + { + "epoch": 0.046541649680663134, + "grad_norm": 1.512604832649231, + "learning_rate": 9.94241744802283e-05, + "loss": 7.0817, + "step": 685 + }, + { + "epoch": 0.04688136975132491, + "grad_norm": 1.3859061002731323, + "learning_rate": 9.941992797934502e-05, + "loss": 7.1805, + "step": 690 + }, + { + "epoch": 0.04722108982198668, + "grad_norm": 3.252913236618042, + "learning_rate": 9.941568147846175e-05, + "loss": 6.7985, + "step": 695 + }, + { + "epoch": 0.04756080989264846, + "grad_norm": 1.4156177043914795, + "learning_rate": 9.941143497757848e-05, + "loss": 6.9955, + "step": 700 + }, + { + "epoch": 0.047900529963310234, + "grad_norm": 1.5510213375091553, + "learning_rate": 9.940718847669521e-05, + "loss": 7.0235, + "step": 705 + }, + { + "epoch": 0.048240250033972004, + "grad_norm": 1.3725285530090332, + "learning_rate": 9.940294197581194e-05, + "loss": 6.9692, + "step": 710 + }, + { + "epoch": 0.04857997010463378, + "grad_norm": 1.4986199140548706, + "learning_rate": 9.939869547492866e-05, + "loss": 6.6778, + "step": 715 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 1.2320705652236938, + "learning_rate": 9.939444897404539e-05, + "loss": 6.8953, + "step": 720 + }, + { + "epoch": 0.049259410245957334, + "grad_norm": 0.9118322134017944, + "learning_rate": 9.939020247316212e-05, + "loss": 6.7496, + "step": 725 + }, + { + "epoch": 0.049599130316619104, + "grad_norm": 3.4886631965637207, + "learning_rate": 9.938595597227885e-05, + "loss": 7.0242, + "step": 730 + }, + { + "epoch": 0.04993885038728088, + "grad_norm": 0.9548838138580322, + "learning_rate": 9.938170947139558e-05, + "loss": 6.6306, + "step": 735 + }, + { + "epoch": 0.05027857045794266, + "grad_norm": 0.8389047980308533, + "learning_rate": 9.93774629705123e-05, + "loss": 7.134, + "step": 740 + }, + { + "epoch": 0.05061829052860443, + "grad_norm": 5.4491801261901855, + "learning_rate": 9.937321646962903e-05, + "loss": 6.698, + "step": 745 + }, + { + "epoch": 0.050958010599266204, + "grad_norm": 1.3063551187515259, + "learning_rate": 9.936896996874576e-05, + "loss": 7.0113, + "step": 750 + }, + { + "epoch": 0.05129773066992798, + "grad_norm": 1.470941424369812, + "learning_rate": 9.936472346786249e-05, + "loss": 6.606, + "step": 755 + }, + { + "epoch": 0.05163745074058975, + "grad_norm": 1.9392439126968384, + "learning_rate": 9.936047696697922e-05, + "loss": 7.07, + "step": 760 + }, + { + "epoch": 0.05197717081125153, + "grad_norm": 0.9688730239868164, + "learning_rate": 9.935623046609594e-05, + "loss": 6.5451, + "step": 765 + }, + { + "epoch": 0.052316890881913304, + "grad_norm": 1.4289032220840454, + "learning_rate": 9.935198396521267e-05, + "loss": 6.8784, + "step": 770 + }, + { + "epoch": 0.05265661095257508, + "grad_norm": 1.4620697498321533, + "learning_rate": 9.93477374643294e-05, + "loss": 6.5151, + "step": 775 + }, + { + "epoch": 0.05299633102323685, + "grad_norm": 2.3521432876586914, + "learning_rate": 9.934349096344613e-05, + "loss": 6.5384, + "step": 780 + }, + { + "epoch": 0.05333605109389863, + "grad_norm": 3.160248041152954, + "learning_rate": 9.933924446256286e-05, + "loss": 6.7476, + "step": 785 + }, + { + "epoch": 0.053675771164560404, + "grad_norm": 1.3147598505020142, + "learning_rate": 9.933499796167957e-05, + "loss": 6.8376, + "step": 790 + }, + { + "epoch": 0.054015491235222174, + "grad_norm": 1.6566650867462158, + "learning_rate": 9.933075146079631e-05, + "loss": 6.2506, + "step": 795 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 0.9440861344337463, + "learning_rate": 9.932650495991304e-05, + "loss": 6.9212, + "step": 800 + }, + { + "epoch": 0.05469493137654573, + "grad_norm": 1.1842477321624756, + "learning_rate": 9.932225845902975e-05, + "loss": 6.6892, + "step": 805 + }, + { + "epoch": 0.055034651447207504, + "grad_norm": 1.1736949682235718, + "learning_rate": 9.93180119581465e-05, + "loss": 6.9224, + "step": 810 + }, + { + "epoch": 0.055374371517869274, + "grad_norm": 0.7181898951530457, + "learning_rate": 9.931376545726322e-05, + "loss": 6.5164, + "step": 815 + }, + { + "epoch": 0.05571409158853105, + "grad_norm": 0.9374647736549377, + "learning_rate": 9.930951895637994e-05, + "loss": 6.5026, + "step": 820 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 1.2754137516021729, + "learning_rate": 9.930527245549668e-05, + "loss": 6.4676, + "step": 825 + }, + { + "epoch": 0.0563935317298546, + "grad_norm": 1.0159765481948853, + "learning_rate": 9.930102595461341e-05, + "loss": 6.3455, + "step": 830 + }, + { + "epoch": 0.056733251800516374, + "grad_norm": 1.0118136405944824, + "learning_rate": 9.929677945373012e-05, + "loss": 6.4984, + "step": 835 + }, + { + "epoch": 0.05707297187117815, + "grad_norm": 0.9740552306175232, + "learning_rate": 9.929253295284686e-05, + "loss": 6.665, + "step": 840 + }, + { + "epoch": 0.05741269194183993, + "grad_norm": 2.6464507579803467, + "learning_rate": 9.928828645196359e-05, + "loss": 6.5277, + "step": 845 + }, + { + "epoch": 0.0577524120125017, + "grad_norm": 1.1687380075454712, + "learning_rate": 9.92840399510803e-05, + "loss": 6.5141, + "step": 850 + }, + { + "epoch": 0.058092132083163474, + "grad_norm": 1.0684833526611328, + "learning_rate": 9.927979345019705e-05, + "loss": 6.2975, + "step": 855 + }, + { + "epoch": 0.05843185215382525, + "grad_norm": 3.8520267009735107, + "learning_rate": 9.927554694931378e-05, + "loss": 5.9187, + "step": 860 + }, + { + "epoch": 0.05877157222448702, + "grad_norm": 1.048731803894043, + "learning_rate": 9.927130044843049e-05, + "loss": 6.3076, + "step": 865 + }, + { + "epoch": 0.0591112922951488, + "grad_norm": 0.8034812808036804, + "learning_rate": 9.926705394754723e-05, + "loss": 6.5594, + "step": 870 + }, + { + "epoch": 0.059451012365810574, + "grad_norm": 0.9210667610168457, + "learning_rate": 9.926280744666395e-05, + "loss": 6.0014, + "step": 875 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 1.0861904621124268, + "learning_rate": 9.925856094578067e-05, + "loss": 6.2432, + "step": 880 + }, + { + "epoch": 0.06013045250713412, + "grad_norm": 0.8701607584953308, + "learning_rate": 9.925431444489742e-05, + "loss": 6.1582, + "step": 885 + }, + { + "epoch": 0.0604701725777959, + "grad_norm": 0.691939651966095, + "learning_rate": 9.925006794401413e-05, + "loss": 6.487, + "step": 890 + }, + { + "epoch": 0.060809892648457674, + "grad_norm": 1.1983147859573364, + "learning_rate": 9.924582144313086e-05, + "loss": 6.1766, + "step": 895 + }, + { + "epoch": 0.061149612719119444, + "grad_norm": 1.1613506078720093, + "learning_rate": 9.92415749422476e-05, + "loss": 6.5303, + "step": 900 + }, + { + "epoch": 0.06148933278978122, + "grad_norm": 0.8394651412963867, + "learning_rate": 9.923732844136431e-05, + "loss": 6.1502, + "step": 905 + }, + { + "epoch": 0.061829052860443, + "grad_norm": 1.2242004871368408, + "learning_rate": 9.923308194048104e-05, + "loss": 6.2054, + "step": 910 + }, + { + "epoch": 0.06216877293110477, + "grad_norm": 1.1255033016204834, + "learning_rate": 9.922883543959778e-05, + "loss": 6.4181, + "step": 915 + }, + { + "epoch": 0.06250849300176654, + "grad_norm": 0.7849110960960388, + "learning_rate": 9.92245889387145e-05, + "loss": 6.1411, + "step": 920 + }, + { + "epoch": 0.06284821307242831, + "grad_norm": 1.0032676458358765, + "learning_rate": 9.922034243783123e-05, + "loss": 6.4892, + "step": 925 + }, + { + "epoch": 0.0631879331430901, + "grad_norm": 1.127551555633545, + "learning_rate": 9.921609593694797e-05, + "loss": 6.3768, + "step": 930 + }, + { + "epoch": 0.06352765321375187, + "grad_norm": 1.0425925254821777, + "learning_rate": 9.921184943606468e-05, + "loss": 6.4448, + "step": 935 + }, + { + "epoch": 0.06386737328441364, + "grad_norm": 1.1642504930496216, + "learning_rate": 9.920760293518142e-05, + "loss": 5.7809, + "step": 940 + }, + { + "epoch": 0.06420709335507542, + "grad_norm": 1.8521403074264526, + "learning_rate": 9.920335643429814e-05, + "loss": 5.9249, + "step": 945 + }, + { + "epoch": 0.06454681342573719, + "grad_norm": 1.073219895362854, + "learning_rate": 9.919910993341487e-05, + "loss": 6.0477, + "step": 950 + }, + { + "epoch": 0.06488653349639897, + "grad_norm": 1.2109575271606445, + "learning_rate": 9.919486343253161e-05, + "loss": 6.0364, + "step": 955 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 1.1780409812927246, + "learning_rate": 9.919061693164832e-05, + "loss": 6.4147, + "step": 960 + }, + { + "epoch": 0.06556597363772251, + "grad_norm": 0.8810535073280334, + "learning_rate": 9.918637043076505e-05, + "loss": 6.0036, + "step": 965 + }, + { + "epoch": 0.0659056937083843, + "grad_norm": 0.7648366093635559, + "learning_rate": 9.918212392988179e-05, + "loss": 6.2567, + "step": 970 + }, + { + "epoch": 0.06624541377904607, + "grad_norm": 2.192458391189575, + "learning_rate": 9.91778774289985e-05, + "loss": 6.144, + "step": 975 + }, + { + "epoch": 0.06658513384970784, + "grad_norm": 1.2390516996383667, + "learning_rate": 9.917363092811523e-05, + "loss": 6.0287, + "step": 980 + }, + { + "epoch": 0.06692485392036962, + "grad_norm": 0.8258079886436462, + "learning_rate": 9.916938442723198e-05, + "loss": 6.2043, + "step": 985 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 0.9516924023628235, + "learning_rate": 9.916513792634869e-05, + "loss": 6.0747, + "step": 990 + }, + { + "epoch": 0.06760429406169316, + "grad_norm": 0.856916069984436, + "learning_rate": 9.916089142546542e-05, + "loss": 5.926, + "step": 995 + }, + { + "epoch": 0.06794401413235494, + "grad_norm": 0.8324723839759827, + "learning_rate": 9.915664492458216e-05, + "loss": 6.2707, + "step": 1000 + }, + { + "epoch": 0.06828373420301671, + "grad_norm": 0.7908216714859009, + "learning_rate": 9.915239842369887e-05, + "loss": 5.967, + "step": 1005 + }, + { + "epoch": 0.06862345427367848, + "grad_norm": 0.9094476103782654, + "learning_rate": 9.91481519228156e-05, + "loss": 5.6132, + "step": 1010 + }, + { + "epoch": 0.06896317434434027, + "grad_norm": 0.9734240770339966, + "learning_rate": 9.914390542193233e-05, + "loss": 5.9794, + "step": 1015 + }, + { + "epoch": 0.06930289441500204, + "grad_norm": 0.8310399055480957, + "learning_rate": 9.913965892104906e-05, + "loss": 6.0146, + "step": 1020 + }, + { + "epoch": 0.0696426144856638, + "grad_norm": 0.9436420798301697, + "learning_rate": 9.913541242016579e-05, + "loss": 5.9812, + "step": 1025 + }, + { + "epoch": 0.06998233455632559, + "grad_norm": 1.2283395528793335, + "learning_rate": 9.913116591928251e-05, + "loss": 5.9537, + "step": 1030 + }, + { + "epoch": 0.07032205462698736, + "grad_norm": 0.8751355409622192, + "learning_rate": 9.912691941839924e-05, + "loss": 5.9992, + "step": 1035 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 0.6706697344779968, + "learning_rate": 9.912267291751597e-05, + "loss": 6.0753, + "step": 1040 + }, + { + "epoch": 0.07100149476831091, + "grad_norm": 0.7029627561569214, + "learning_rate": 9.91184264166327e-05, + "loss": 6.0136, + "step": 1045 + }, + { + "epoch": 0.07134121483897268, + "grad_norm": 0.6171499490737915, + "learning_rate": 9.911417991574943e-05, + "loss": 6.0223, + "step": 1050 + }, + { + "epoch": 0.07168093490963447, + "grad_norm": 2.5934255123138428, + "learning_rate": 9.910993341486615e-05, + "loss": 5.807, + "step": 1055 + }, + { + "epoch": 0.07202065498029624, + "grad_norm": 0.9291547536849976, + "learning_rate": 9.910568691398288e-05, + "loss": 5.7086, + "step": 1060 + }, + { + "epoch": 0.072360375050958, + "grad_norm": 1.4394763708114624, + "learning_rate": 9.910144041309961e-05, + "loss": 5.5023, + "step": 1065 + }, + { + "epoch": 0.07270009512161979, + "grad_norm": 0.6298092603683472, + "learning_rate": 9.909719391221634e-05, + "loss": 5.9972, + "step": 1070 + }, + { + "epoch": 0.07303981519228156, + "grad_norm": 0.6151909232139587, + "learning_rate": 9.909294741133307e-05, + "loss": 5.6389, + "step": 1075 + }, + { + "epoch": 0.07337953526294333, + "grad_norm": 1.1861008405685425, + "learning_rate": 9.90887009104498e-05, + "loss": 6.2689, + "step": 1080 + }, + { + "epoch": 0.07371925533360511, + "grad_norm": 0.7876234650611877, + "learning_rate": 9.908445440956652e-05, + "loss": 5.5448, + "step": 1085 + }, + { + "epoch": 0.07405897540426688, + "grad_norm": 0.592897891998291, + "learning_rate": 9.908020790868325e-05, + "loss": 5.8057, + "step": 1090 + }, + { + "epoch": 0.07439869547492865, + "grad_norm": 0.9189316034317017, + "learning_rate": 9.907596140779998e-05, + "loss": 6.0782, + "step": 1095 + }, + { + "epoch": 0.07473841554559044, + "grad_norm": 0.6605345010757446, + "learning_rate": 9.90717149069167e-05, + "loss": 5.6442, + "step": 1100 + }, + { + "epoch": 0.0750781356162522, + "grad_norm": 0.6724756956100464, + "learning_rate": 9.906746840603343e-05, + "loss": 6.2757, + "step": 1105 + }, + { + "epoch": 0.07541785568691398, + "grad_norm": 0.8074867725372314, + "learning_rate": 9.906322190515016e-05, + "loss": 5.6868, + "step": 1110 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 2.2021851539611816, + "learning_rate": 9.905897540426689e-05, + "loss": 5.755, + "step": 1115 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 0.7941934466362, + "learning_rate": 9.905472890338362e-05, + "loss": 5.6318, + "step": 1120 + }, + { + "epoch": 0.07643701589889931, + "grad_norm": 0.9947513937950134, + "learning_rate": 9.905048240250035e-05, + "loss": 5.9247, + "step": 1125 + }, + { + "epoch": 0.07677673596956108, + "grad_norm": 0.6511673927307129, + "learning_rate": 9.904623590161707e-05, + "loss": 5.6326, + "step": 1130 + }, + { + "epoch": 0.07711645604022285, + "grad_norm": 0.6497818231582642, + "learning_rate": 9.90419894007338e-05, + "loss": 5.8753, + "step": 1135 + }, + { + "epoch": 0.07745617611088464, + "grad_norm": 0.5531424880027771, + "learning_rate": 9.903774289985053e-05, + "loss": 5.2715, + "step": 1140 + }, + { + "epoch": 0.0777958961815464, + "grad_norm": 0.7292714715003967, + "learning_rate": 9.903349639896725e-05, + "loss": 5.7824, + "step": 1145 + }, + { + "epoch": 0.07813561625220818, + "grad_norm": 0.6802114248275757, + "learning_rate": 9.902924989808399e-05, + "loss": 5.7912, + "step": 1150 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 0.680204451084137, + "learning_rate": 9.902500339720071e-05, + "loss": 5.961, + "step": 1155 + }, + { + "epoch": 0.07881505639353173, + "grad_norm": 0.596501350402832, + "learning_rate": 9.902075689631743e-05, + "loss": 5.5833, + "step": 1160 + }, + { + "epoch": 0.0791547764641935, + "grad_norm": 0.622715950012207, + "learning_rate": 9.901651039543417e-05, + "loss": 5.933, + "step": 1165 + }, + { + "epoch": 0.07949449653485528, + "grad_norm": 0.9008530974388123, + "learning_rate": 9.90122638945509e-05, + "loss": 5.568, + "step": 1170 + }, + { + "epoch": 0.07983421660551705, + "grad_norm": 0.5429263114929199, + "learning_rate": 9.900801739366761e-05, + "loss": 5.2962, + "step": 1175 + }, + { + "epoch": 0.08017393667617882, + "grad_norm": 0.6079940795898438, + "learning_rate": 9.900377089278435e-05, + "loss": 5.8575, + "step": 1180 + }, + { + "epoch": 0.0805136567468406, + "grad_norm": 0.6796315312385559, + "learning_rate": 9.899952439190108e-05, + "loss": 5.6058, + "step": 1185 + }, + { + "epoch": 0.08085337681750238, + "grad_norm": 0.6909620761871338, + "learning_rate": 9.89952778910178e-05, + "loss": 5.6206, + "step": 1190 + }, + { + "epoch": 0.08119309688816416, + "grad_norm": 0.8451843857765198, + "learning_rate": 9.899103139013454e-05, + "loss": 5.5017, + "step": 1195 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 0.6521558165550232, + "learning_rate": 9.898678488925127e-05, + "loss": 5.6647, + "step": 1200 + }, + { + "epoch": 0.0818725370294877, + "grad_norm": 0.4295422434806824, + "learning_rate": 9.898253838836798e-05, + "loss": 5.96, + "step": 1205 + }, + { + "epoch": 0.08221225710014948, + "grad_norm": 1.2797423601150513, + "learning_rate": 9.897829188748472e-05, + "loss": 5.5158, + "step": 1210 + }, + { + "epoch": 0.08255197717081125, + "grad_norm": 0.663374125957489, + "learning_rate": 9.897404538660144e-05, + "loss": 5.5387, + "step": 1215 + }, + { + "epoch": 0.08289169724147302, + "grad_norm": 0.6130101084709167, + "learning_rate": 9.896979888571817e-05, + "loss": 5.3825, + "step": 1220 + }, + { + "epoch": 0.0832314173121348, + "grad_norm": 0.9059043526649475, + "learning_rate": 9.896555238483491e-05, + "loss": 5.6168, + "step": 1225 + }, + { + "epoch": 0.08357113738279658, + "grad_norm": 0.9198205471038818, + "learning_rate": 9.896130588395162e-05, + "loss": 5.7126, + "step": 1230 + }, + { + "epoch": 0.08391085745345835, + "grad_norm": 0.6826533079147339, + "learning_rate": 9.895705938306835e-05, + "loss": 5.5344, + "step": 1235 + }, + { + "epoch": 0.08425057752412013, + "grad_norm": 0.6488471031188965, + "learning_rate": 9.895281288218509e-05, + "loss": 5.6311, + "step": 1240 + }, + { + "epoch": 0.0845902975947819, + "grad_norm": 1.4997718334197998, + "learning_rate": 9.89485663813018e-05, + "loss": 5.3847, + "step": 1245 + }, + { + "epoch": 0.08493001766544367, + "grad_norm": 0.8614689111709595, + "learning_rate": 9.894431988041853e-05, + "loss": 5.7996, + "step": 1250 + }, + { + "epoch": 0.08526973773610545, + "grad_norm": 0.910275936126709, + "learning_rate": 9.894007337953527e-05, + "loss": 5.6557, + "step": 1255 + }, + { + "epoch": 0.08560945780676722, + "grad_norm": 0.8584810495376587, + "learning_rate": 9.893582687865199e-05, + "loss": 5.6384, + "step": 1260 + }, + { + "epoch": 0.08594917787742899, + "grad_norm": 0.5326058864593506, + "learning_rate": 9.893158037776872e-05, + "loss": 5.4521, + "step": 1265 + }, + { + "epoch": 0.08628889794809078, + "grad_norm": 1.008244276046753, + "learning_rate": 9.892733387688546e-05, + "loss": 5.6282, + "step": 1270 + }, + { + "epoch": 0.08662861801875255, + "grad_norm": 0.9059062600135803, + "learning_rate": 9.892308737600217e-05, + "loss": 5.7698, + "step": 1275 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 0.754760205745697, + "learning_rate": 9.891884087511891e-05, + "loss": 5.7735, + "step": 1280 + }, + { + "epoch": 0.0873080581600761, + "grad_norm": 0.6785455346107483, + "learning_rate": 9.891459437423564e-05, + "loss": 5.8881, + "step": 1285 + }, + { + "epoch": 0.08764777823073787, + "grad_norm": 0.8128915429115295, + "learning_rate": 9.891034787335236e-05, + "loss": 5.5169, + "step": 1290 + }, + { + "epoch": 0.08798749830139965, + "grad_norm": 0.912551760673523, + "learning_rate": 9.89061013724691e-05, + "loss": 5.5467, + "step": 1295 + }, + { + "epoch": 0.08832721837206142, + "grad_norm": 2.0163161754608154, + "learning_rate": 9.890185487158581e-05, + "loss": 5.5448, + "step": 1300 + }, + { + "epoch": 0.08866693844272319, + "grad_norm": 0.5310774445533752, + "learning_rate": 9.889760837070254e-05, + "loss": 5.5268, + "step": 1305 + }, + { + "epoch": 0.08900665851338498, + "grad_norm": 0.603813886642456, + "learning_rate": 9.889336186981928e-05, + "loss": 5.6777, + "step": 1310 + }, + { + "epoch": 0.08934637858404675, + "grad_norm": 0.431869238615036, + "learning_rate": 9.8889115368936e-05, + "loss": 5.5403, + "step": 1315 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 0.7721471190452576, + "learning_rate": 9.888486886805273e-05, + "loss": 5.2792, + "step": 1320 + }, + { + "epoch": 0.0900258187253703, + "grad_norm": 1.0440256595611572, + "learning_rate": 9.888062236716947e-05, + "loss": 5.1811, + "step": 1325 + }, + { + "epoch": 0.09036553879603207, + "grad_norm": 0.5089631080627441, + "learning_rate": 9.887637586628618e-05, + "loss": 5.5219, + "step": 1330 + }, + { + "epoch": 0.09070525886669384, + "grad_norm": 0.5013499855995178, + "learning_rate": 9.887212936540291e-05, + "loss": 5.5286, + "step": 1335 + }, + { + "epoch": 0.09104497893735562, + "grad_norm": 0.8858599662780762, + "learning_rate": 9.886788286451965e-05, + "loss": 5.6356, + "step": 1340 + }, + { + "epoch": 0.09138469900801739, + "grad_norm": 0.45014268159866333, + "learning_rate": 9.886363636363637e-05, + "loss": 5.1792, + "step": 1345 + }, + { + "epoch": 0.09172441907867916, + "grad_norm": 0.6357929706573486, + "learning_rate": 9.88593898627531e-05, + "loss": 5.6234, + "step": 1350 + }, + { + "epoch": 0.09206413914934095, + "grad_norm": 4.850500106811523, + "learning_rate": 9.885514336186984e-05, + "loss": 5.3482, + "step": 1355 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 0.477006196975708, + "learning_rate": 9.885089686098655e-05, + "loss": 5.2999, + "step": 1360 + }, + { + "epoch": 0.0927435792906645, + "grad_norm": 0.5572239756584167, + "learning_rate": 9.884665036010328e-05, + "loss": 5.4801, + "step": 1365 + }, + { + "epoch": 0.09308329936132627, + "grad_norm": 0.3890332877635956, + "learning_rate": 9.884240385922e-05, + "loss": 5.3536, + "step": 1370 + }, + { + "epoch": 0.09342301943198804, + "grad_norm": 0.43894070386886597, + "learning_rate": 9.883815735833673e-05, + "loss": 5.6419, + "step": 1375 + }, + { + "epoch": 0.09376273950264982, + "grad_norm": 0.48212140798568726, + "learning_rate": 9.883391085745346e-05, + "loss": 5.431, + "step": 1380 + }, + { + "epoch": 0.09410245957331159, + "grad_norm": 0.5145598649978638, + "learning_rate": 9.882966435657019e-05, + "loss": 5.7753, + "step": 1385 + }, + { + "epoch": 0.09444217964397336, + "grad_norm": 0.4795299172401428, + "learning_rate": 9.882541785568692e-05, + "loss": 5.5992, + "step": 1390 + }, + { + "epoch": 0.09478189971463515, + "grad_norm": 0.4439328610897064, + "learning_rate": 9.882117135480365e-05, + "loss": 5.3292, + "step": 1395 + }, + { + "epoch": 0.09512161978529692, + "grad_norm": 0.6526133418083191, + "learning_rate": 9.881692485392037e-05, + "loss": 5.3765, + "step": 1400 + }, + { + "epoch": 0.09546133985595869, + "grad_norm": 0.8982023596763611, + "learning_rate": 9.88126783530371e-05, + "loss": 5.4783, + "step": 1405 + }, + { + "epoch": 0.09580105992662047, + "grad_norm": 0.46160343289375305, + "learning_rate": 9.880843185215383e-05, + "loss": 5.502, + "step": 1410 + }, + { + "epoch": 0.09614077999728224, + "grad_norm": 0.6750124096870422, + "learning_rate": 9.880418535127056e-05, + "loss": 5.4252, + "step": 1415 + }, + { + "epoch": 0.09648050006794401, + "grad_norm": 0.42901554703712463, + "learning_rate": 9.879993885038729e-05, + "loss": 5.3442, + "step": 1420 + }, + { + "epoch": 0.09682022013860579, + "grad_norm": 0.6184918284416199, + "learning_rate": 9.879569234950401e-05, + "loss": 5.3717, + "step": 1425 + }, + { + "epoch": 0.09715994020926756, + "grad_norm": 0.5006517171859741, + "learning_rate": 9.879144584862074e-05, + "loss": 5.4458, + "step": 1430 + }, + { + "epoch": 0.09749966027992933, + "grad_norm": 0.4495384693145752, + "learning_rate": 9.878719934773747e-05, + "loss": 5.263, + "step": 1435 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 0.4285268783569336, + "learning_rate": 9.87829528468542e-05, + "loss": 5.2207, + "step": 1440 + }, + { + "epoch": 0.09817910042125289, + "grad_norm": 1.0460352897644043, + "learning_rate": 9.877870634597093e-05, + "loss": 5.3448, + "step": 1445 + }, + { + "epoch": 0.09851882049191467, + "grad_norm": 0.6291869878768921, + "learning_rate": 9.877445984508765e-05, + "loss": 5.3182, + "step": 1450 + }, + { + "epoch": 0.09885854056257644, + "grad_norm": 1.0043153762817383, + "learning_rate": 9.877021334420438e-05, + "loss": 5.4226, + "step": 1455 + }, + { + "epoch": 0.09919826063323821, + "grad_norm": 0.7458539009094238, + "learning_rate": 9.876596684332111e-05, + "loss": 5.6298, + "step": 1460 + }, + { + "epoch": 0.09953798070389999, + "grad_norm": 0.45767852663993835, + "learning_rate": 9.876172034243784e-05, + "loss": 5.5378, + "step": 1465 + }, + { + "epoch": 0.09987770077456176, + "grad_norm": 0.4586849510669708, + "learning_rate": 9.875747384155457e-05, + "loss": 5.7105, + "step": 1470 + }, + { + "epoch": 0.10021742084522353, + "grad_norm": 0.37701913714408875, + "learning_rate": 9.87532273406713e-05, + "loss": 5.1507, + "step": 1475 + }, + { + "epoch": 0.10055714091588532, + "grad_norm": 0.42394065856933594, + "learning_rate": 9.874898083978802e-05, + "loss": 5.5164, + "step": 1480 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 0.6020697355270386, + "learning_rate": 9.874473433890475e-05, + "loss": 5.3247, + "step": 1485 + }, + { + "epoch": 0.10123658105720885, + "grad_norm": 0.5261949300765991, + "learning_rate": 9.874048783802148e-05, + "loss": 5.5159, + "step": 1490 + }, + { + "epoch": 0.10157630112787064, + "grad_norm": 0.4316195547580719, + "learning_rate": 9.87362413371382e-05, + "loss": 5.4615, + "step": 1495 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 0.3691781759262085, + "learning_rate": 9.873199483625492e-05, + "loss": 5.5583, + "step": 1500 + }, + { + "epoch": 0.10225574126919418, + "grad_norm": 0.5686614513397217, + "learning_rate": 9.872774833537166e-05, + "loss": 5.5936, + "step": 1505 + }, + { + "epoch": 0.10259546133985596, + "grad_norm": 0.3296063542366028, + "learning_rate": 9.872350183448839e-05, + "loss": 5.4636, + "step": 1510 + }, + { + "epoch": 0.10293518141051773, + "grad_norm": 0.46032190322875977, + "learning_rate": 9.87192553336051e-05, + "loss": 5.1794, + "step": 1515 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 0.4383249282836914, + "learning_rate": 9.871500883272185e-05, + "loss": 5.3524, + "step": 1520 + }, + { + "epoch": 0.10361462155184128, + "grad_norm": 0.5257749557495117, + "learning_rate": 9.871076233183857e-05, + "loss": 5.5045, + "step": 1525 + }, + { + "epoch": 0.10395434162250305, + "grad_norm": 0.5201256275177002, + "learning_rate": 9.870651583095529e-05, + "loss": 5.3214, + "step": 1530 + }, + { + "epoch": 0.10429406169316484, + "grad_norm": 0.7715117931365967, + "learning_rate": 9.870226933007203e-05, + "loss": 5.3583, + "step": 1535 + }, + { + "epoch": 0.10463378176382661, + "grad_norm": 0.34586212038993835, + "learning_rate": 9.869802282918876e-05, + "loss": 5.3337, + "step": 1540 + }, + { + "epoch": 0.10497350183448838, + "grad_norm": 0.45313313603401184, + "learning_rate": 9.869377632830547e-05, + "loss": 5.2788, + "step": 1545 + }, + { + "epoch": 0.10531322190515016, + "grad_norm": 0.4078027904033661, + "learning_rate": 9.868952982742221e-05, + "loss": 5.4364, + "step": 1550 + }, + { + "epoch": 0.10565294197581193, + "grad_norm": 0.4670262038707733, + "learning_rate": 9.868528332653894e-05, + "loss": 5.308, + "step": 1555 + }, + { + "epoch": 0.1059926620464737, + "grad_norm": 0.4951310455799103, + "learning_rate": 9.868103682565566e-05, + "loss": 5.5171, + "step": 1560 + }, + { + "epoch": 0.10633238211713548, + "grad_norm": 0.7351198792457581, + "learning_rate": 9.86767903247724e-05, + "loss": 5.6146, + "step": 1565 + }, + { + "epoch": 0.10667210218779725, + "grad_norm": 0.5064637660980225, + "learning_rate": 9.867254382388911e-05, + "loss": 5.3591, + "step": 1570 + }, + { + "epoch": 0.10701182225845902, + "grad_norm": 0.39143896102905273, + "learning_rate": 9.866829732300584e-05, + "loss": 5.3523, + "step": 1575 + }, + { + "epoch": 0.10735154232912081, + "grad_norm": 1.2670384645462036, + "learning_rate": 9.866405082212258e-05, + "loss": 5.2359, + "step": 1580 + }, + { + "epoch": 0.10769126239978258, + "grad_norm": 0.3745839297771454, + "learning_rate": 9.86598043212393e-05, + "loss": 5.2194, + "step": 1585 + }, + { + "epoch": 0.10803098247044435, + "grad_norm": 0.26325303316116333, + "learning_rate": 9.865555782035602e-05, + "loss": 5.1649, + "step": 1590 + }, + { + "epoch": 0.10837070254110613, + "grad_norm": 0.3311369717121124, + "learning_rate": 9.865131131947277e-05, + "loss": 5.326, + "step": 1595 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 0.4302009046077728, + "learning_rate": 9.864706481858948e-05, + "loss": 5.1757, + "step": 1600 + }, + { + "epoch": 0.10905014268242967, + "grad_norm": 0.5953149795532227, + "learning_rate": 9.864281831770621e-05, + "loss": 5.3937, + "step": 1605 + }, + { + "epoch": 0.10938986275309145, + "grad_norm": 0.4650028645992279, + "learning_rate": 9.863857181682295e-05, + "loss": 5.4321, + "step": 1610 + }, + { + "epoch": 0.10972958282375322, + "grad_norm": 1.5760172605514526, + "learning_rate": 9.863432531593966e-05, + "loss": 5.1935, + "step": 1615 + }, + { + "epoch": 0.11006930289441501, + "grad_norm": 0.4063778221607208, + "learning_rate": 9.86300788150564e-05, + "loss": 5.0892, + "step": 1620 + }, + { + "epoch": 0.11040902296507678, + "grad_norm": 0.3407844603061676, + "learning_rate": 9.862583231417313e-05, + "loss": 5.2536, + "step": 1625 + }, + { + "epoch": 0.11074874303573855, + "grad_norm": 0.5247534513473511, + "learning_rate": 9.862158581328985e-05, + "loss": 5.3708, + "step": 1630 + }, + { + "epoch": 0.11108846310640033, + "grad_norm": 1.0360844135284424, + "learning_rate": 9.861733931240659e-05, + "loss": 5.3132, + "step": 1635 + }, + { + "epoch": 0.1114281831770621, + "grad_norm": 0.5687776803970337, + "learning_rate": 9.86130928115233e-05, + "loss": 5.2554, + "step": 1640 + }, + { + "epoch": 0.11176790324772387, + "grad_norm": 0.3441666066646576, + "learning_rate": 9.860884631064003e-05, + "loss": 5.1661, + "step": 1645 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 0.6809844970703125, + "learning_rate": 9.860459980975677e-05, + "loss": 5.3175, + "step": 1650 + }, + { + "epoch": 0.11244734338904742, + "grad_norm": 0.4880785644054413, + "learning_rate": 9.860035330887349e-05, + "loss": 5.3368, + "step": 1655 + }, + { + "epoch": 0.1127870634597092, + "grad_norm": 0.5996628999710083, + "learning_rate": 9.859610680799022e-05, + "loss": 5.2202, + "step": 1660 + }, + { + "epoch": 0.11312678353037098, + "grad_norm": 0.3819567561149597, + "learning_rate": 9.859186030710696e-05, + "loss": 5.2171, + "step": 1665 + }, + { + "epoch": 0.11346650360103275, + "grad_norm": 0.8039321899414062, + "learning_rate": 9.858761380622367e-05, + "loss": 5.1053, + "step": 1670 + }, + { + "epoch": 0.11380622367169452, + "grad_norm": 0.7948293685913086, + "learning_rate": 9.85833673053404e-05, + "loss": 5.1902, + "step": 1675 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 0.45758846402168274, + "learning_rate": 9.857912080445714e-05, + "loss": 5.2766, + "step": 1680 + }, + { + "epoch": 0.11448566381301807, + "grad_norm": 0.33638903498649597, + "learning_rate": 9.857487430357386e-05, + "loss": 5.1641, + "step": 1685 + }, + { + "epoch": 0.11482538388367985, + "grad_norm": 0.3370652496814728, + "learning_rate": 9.857062780269058e-05, + "loss": 4.8306, + "step": 1690 + }, + { + "epoch": 0.11516510395434162, + "grad_norm": 0.30151012539863586, + "learning_rate": 9.856638130180733e-05, + "loss": 4.9977, + "step": 1695 + }, + { + "epoch": 0.1155048240250034, + "grad_norm": 0.4379021227359772, + "learning_rate": 9.856213480092404e-05, + "loss": 5.2388, + "step": 1700 + }, + { + "epoch": 0.11584454409566518, + "grad_norm": 0.5139544606208801, + "learning_rate": 9.855788830004077e-05, + "loss": 5.0948, + "step": 1705 + }, + { + "epoch": 0.11618426416632695, + "grad_norm": 0.2696703374385834, + "learning_rate": 9.855364179915751e-05, + "loss": 5.1955, + "step": 1710 + }, + { + "epoch": 0.11652398423698872, + "grad_norm": 0.43758052587509155, + "learning_rate": 9.854939529827422e-05, + "loss": 5.1093, + "step": 1715 + }, + { + "epoch": 0.1168637043076505, + "grad_norm": 0.45877712965011597, + "learning_rate": 9.854514879739095e-05, + "loss": 5.1859, + "step": 1720 + }, + { + "epoch": 0.11720342437831227, + "grad_norm": 0.5116316676139832, + "learning_rate": 9.854090229650768e-05, + "loss": 4.9734, + "step": 1725 + }, + { + "epoch": 0.11754314444897404, + "grad_norm": 0.3733248710632324, + "learning_rate": 9.853665579562441e-05, + "loss": 4.9164, + "step": 1730 + }, + { + "epoch": 0.11788286451963582, + "grad_norm": 0.9365966320037842, + "learning_rate": 9.853240929474114e-05, + "loss": 5.1925, + "step": 1735 + }, + { + "epoch": 0.1182225845902976, + "grad_norm": 0.43242642283439636, + "learning_rate": 9.852816279385786e-05, + "loss": 5.2445, + "step": 1740 + }, + { + "epoch": 0.11856230466095936, + "grad_norm": 0.28928515315055847, + "learning_rate": 9.852391629297459e-05, + "loss": 5.2208, + "step": 1745 + }, + { + "epoch": 0.11890202473162115, + "grad_norm": 0.4645937383174896, + "learning_rate": 9.851966979209132e-05, + "loss": 5.1848, + "step": 1750 + }, + { + "epoch": 0.11924174480228292, + "grad_norm": 0.3553106486797333, + "learning_rate": 9.851542329120805e-05, + "loss": 5.0109, + "step": 1755 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 0.47933322191238403, + "learning_rate": 9.851117679032478e-05, + "loss": 5.2334, + "step": 1760 + }, + { + "epoch": 0.11992118494360647, + "grad_norm": 0.3383587896823883, + "learning_rate": 9.85069302894415e-05, + "loss": 5.3224, + "step": 1765 + }, + { + "epoch": 0.12026090501426824, + "grad_norm": 0.41393041610717773, + "learning_rate": 9.850268378855823e-05, + "loss": 5.062, + "step": 1770 + }, + { + "epoch": 0.12060062508493002, + "grad_norm": 0.43941059708595276, + "learning_rate": 9.849843728767496e-05, + "loss": 5.0013, + "step": 1775 + }, + { + "epoch": 0.1209403451555918, + "grad_norm": 0.35179632902145386, + "learning_rate": 9.849419078679169e-05, + "loss": 5.1989, + "step": 1780 + }, + { + "epoch": 0.12128006522625356, + "grad_norm": 0.7730126976966858, + "learning_rate": 9.848994428590842e-05, + "loss": 5.1821, + "step": 1785 + }, + { + "epoch": 0.12161978529691535, + "grad_norm": 0.45934972167015076, + "learning_rate": 9.848569778502514e-05, + "loss": 5.2454, + "step": 1790 + }, + { + "epoch": 0.12195950536757712, + "grad_norm": 0.6049938797950745, + "learning_rate": 9.848145128414187e-05, + "loss": 5.0269, + "step": 1795 + }, + { + "epoch": 0.12229922543823889, + "grad_norm": 0.9805595874786377, + "learning_rate": 9.84772047832586e-05, + "loss": 4.885, + "step": 1800 + }, + { + "epoch": 0.12263894550890067, + "grad_norm": 0.47447869181632996, + "learning_rate": 9.847295828237533e-05, + "loss": 4.9461, + "step": 1805 + }, + { + "epoch": 0.12297866557956244, + "grad_norm": 0.4037536382675171, + "learning_rate": 9.846871178149206e-05, + "loss": 4.79, + "step": 1810 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 0.6516850590705872, + "learning_rate": 9.846446528060878e-05, + "loss": 4.8244, + "step": 1815 + }, + { + "epoch": 0.123658105720886, + "grad_norm": 0.46356433629989624, + "learning_rate": 9.846021877972551e-05, + "loss": 4.8264, + "step": 1820 + }, + { + "epoch": 0.12399782579154776, + "grad_norm": 1.0530160665512085, + "learning_rate": 9.845597227884224e-05, + "loss": 4.7702, + "step": 1825 + }, + { + "epoch": 0.12433754586220953, + "grad_norm": 1.3012051582336426, + "learning_rate": 9.845172577795897e-05, + "loss": 4.9146, + "step": 1830 + }, + { + "epoch": 0.12467726593287132, + "grad_norm": 0.6158355474472046, + "learning_rate": 9.84474792770757e-05, + "loss": 5.0082, + "step": 1835 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 4.688101768493652, + "learning_rate": 9.844323277619241e-05, + "loss": 4.9467, + "step": 1840 + }, + { + "epoch": 0.12535670607419486, + "grad_norm": 0.5098426342010498, + "learning_rate": 9.843898627530915e-05, + "loss": 4.7443, + "step": 1845 + }, + { + "epoch": 0.12569642614485663, + "grad_norm": 0.6203608512878418, + "learning_rate": 9.843473977442588e-05, + "loss": 5.0459, + "step": 1850 + }, + { + "epoch": 0.12603614621551842, + "grad_norm": 0.5965786576271057, + "learning_rate": 9.84304932735426e-05, + "loss": 5.0615, + "step": 1855 + }, + { + "epoch": 0.1263758662861802, + "grad_norm": 0.7298919558525085, + "learning_rate": 9.842624677265934e-05, + "loss": 4.9333, + "step": 1860 + }, + { + "epoch": 0.12671558635684196, + "grad_norm": 0.514262318611145, + "learning_rate": 9.842200027177606e-05, + "loss": 4.9152, + "step": 1865 + }, + { + "epoch": 0.12705530642750373, + "grad_norm": 0.49908140301704407, + "learning_rate": 9.841775377089278e-05, + "loss": 4.8825, + "step": 1870 + }, + { + "epoch": 0.1273950264981655, + "grad_norm": 0.3929906487464905, + "learning_rate": 9.841350727000952e-05, + "loss": 4.6288, + "step": 1875 + }, + { + "epoch": 0.12773474656882727, + "grad_norm": 0.6221901178359985, + "learning_rate": 9.840926076912625e-05, + "loss": 4.6867, + "step": 1880 + }, + { + "epoch": 0.12807446663948907, + "grad_norm": 0.4237980544567108, + "learning_rate": 9.840501426824296e-05, + "loss": 5.0275, + "step": 1885 + }, + { + "epoch": 0.12841418671015084, + "grad_norm": 0.5076737403869629, + "learning_rate": 9.84007677673597e-05, + "loss": 4.3183, + "step": 1890 + }, + { + "epoch": 0.1287539067808126, + "grad_norm": 0.562611997127533, + "learning_rate": 9.839652126647643e-05, + "loss": 4.8492, + "step": 1895 + }, + { + "epoch": 0.12909362685147438, + "grad_norm": 0.43838977813720703, + "learning_rate": 9.839227476559315e-05, + "loss": 4.7559, + "step": 1900 + }, + { + "epoch": 0.12943334692213615, + "grad_norm": 1.7356271743774414, + "learning_rate": 9.838802826470989e-05, + "loss": 4.4937, + "step": 1905 + }, + { + "epoch": 0.12977306699279795, + "grad_norm": 0.35975855588912964, + "learning_rate": 9.838378176382662e-05, + "loss": 4.8153, + "step": 1910 + }, + { + "epoch": 0.13011278706345972, + "grad_norm": 0.46843382716178894, + "learning_rate": 9.837953526294333e-05, + "loss": 4.7742, + "step": 1915 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 0.49429741501808167, + "learning_rate": 9.837528876206007e-05, + "loss": 4.5403, + "step": 1920 + }, + { + "epoch": 0.13079222720478326, + "grad_norm": 0.496423602104187, + "learning_rate": 9.837104226117679e-05, + "loss": 4.8032, + "step": 1925 + }, + { + "epoch": 0.13113194727544503, + "grad_norm": 0.7953855395317078, + "learning_rate": 9.836679576029352e-05, + "loss": 4.8191, + "step": 1930 + }, + { + "epoch": 0.1314716673461068, + "grad_norm": 0.5093162655830383, + "learning_rate": 9.836254925941026e-05, + "loss": 4.794, + "step": 1935 + }, + { + "epoch": 0.1318113874167686, + "grad_norm": 0.37883055210113525, + "learning_rate": 9.835830275852697e-05, + "loss": 4.3129, + "step": 1940 + }, + { + "epoch": 0.13215110748743036, + "grad_norm": 0.6972466707229614, + "learning_rate": 9.83540562576437e-05, + "loss": 4.6677, + "step": 1945 + }, + { + "epoch": 0.13249082755809213, + "grad_norm": 0.4960924983024597, + "learning_rate": 9.834980975676044e-05, + "loss": 4.7554, + "step": 1950 + }, + { + "epoch": 0.1328305476287539, + "grad_norm": 0.3313211500644684, + "learning_rate": 9.834556325587716e-05, + "loss": 4.5136, + "step": 1955 + }, + { + "epoch": 0.13317026769941567, + "grad_norm": 0.37889447808265686, + "learning_rate": 9.83413167549939e-05, + "loss": 4.6352, + "step": 1960 + }, + { + "epoch": 0.13350998777007744, + "grad_norm": 0.2897196412086487, + "learning_rate": 9.833707025411062e-05, + "loss": 4.2392, + "step": 1965 + }, + { + "epoch": 0.13384970784073924, + "grad_norm": 0.4556117653846741, + "learning_rate": 9.833282375322734e-05, + "loss": 4.4802, + "step": 1970 + }, + { + "epoch": 0.134189427911401, + "grad_norm": 0.24939770996570587, + "learning_rate": 9.832857725234408e-05, + "loss": 4.8028, + "step": 1975 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 0.5589706301689148, + "learning_rate": 9.832433075146081e-05, + "loss": 4.5484, + "step": 1980 + }, + { + "epoch": 0.13486886805272455, + "grad_norm": 0.403367817401886, + "learning_rate": 9.832008425057752e-05, + "loss": 4.6303, + "step": 1985 + }, + { + "epoch": 0.13520858812338632, + "grad_norm": 0.2891002595424652, + "learning_rate": 9.831583774969426e-05, + "loss": 4.6125, + "step": 1990 + }, + { + "epoch": 0.13554830819404812, + "grad_norm": 0.4545519948005676, + "learning_rate": 9.831159124881098e-05, + "loss": 4.6801, + "step": 1995 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 0.2752302289009094, + "learning_rate": 9.830734474792771e-05, + "loss": 4.5331, + "step": 2000 + }, + { + "epoch": 0.13622774833537166, + "grad_norm": 0.4735427498817444, + "learning_rate": 9.830309824704445e-05, + "loss": 4.5487, + "step": 2005 + }, + { + "epoch": 0.13656746840603343, + "grad_norm": 0.2892632782459259, + "learning_rate": 9.829885174616116e-05, + "loss": 4.7872, + "step": 2010 + }, + { + "epoch": 0.1369071884766952, + "grad_norm": 0.3587241768836975, + "learning_rate": 9.829460524527789e-05, + "loss": 4.8017, + "step": 2015 + }, + { + "epoch": 0.13724690854735697, + "grad_norm": 0.8643600940704346, + "learning_rate": 9.829035874439463e-05, + "loss": 4.9978, + "step": 2020 + }, + { + "epoch": 0.13758662861801876, + "grad_norm": 0.3995005786418915, + "learning_rate": 9.828611224351135e-05, + "loss": 4.7966, + "step": 2025 + }, + { + "epoch": 0.13792634868868053, + "grad_norm": 0.5287114381790161, + "learning_rate": 9.828186574262808e-05, + "loss": 4.6836, + "step": 2030 + }, + { + "epoch": 0.1382660687593423, + "grad_norm": 0.356660932302475, + "learning_rate": 9.827761924174482e-05, + "loss": 4.6598, + "step": 2035 + }, + { + "epoch": 0.13860578883000407, + "grad_norm": 0.3594839572906494, + "learning_rate": 9.827337274086153e-05, + "loss": 4.7932, + "step": 2040 + }, + { + "epoch": 0.13894550890066584, + "grad_norm": 0.460989385843277, + "learning_rate": 9.826912623997826e-05, + "loss": 4.8404, + "step": 2045 + }, + { + "epoch": 0.1392852289713276, + "grad_norm": 0.3044515550136566, + "learning_rate": 9.8264879739095e-05, + "loss": 4.4804, + "step": 2050 + }, + { + "epoch": 0.1396249490419894, + "grad_norm": 0.2440759837627411, + "learning_rate": 9.826063323821172e-05, + "loss": 4.6584, + "step": 2055 + }, + { + "epoch": 0.13996466911265118, + "grad_norm": 0.39719679951667786, + "learning_rate": 9.825638673732844e-05, + "loss": 4.6913, + "step": 2060 + }, + { + "epoch": 0.14030438918331295, + "grad_norm": 0.2519219219684601, + "learning_rate": 9.825214023644517e-05, + "loss": 4.7914, + "step": 2065 + }, + { + "epoch": 0.14064410925397472, + "grad_norm": 0.27213895320892334, + "learning_rate": 9.82478937355619e-05, + "loss": 4.4571, + "step": 2070 + }, + { + "epoch": 0.1409838293246365, + "grad_norm": 0.31952184438705444, + "learning_rate": 9.824364723467863e-05, + "loss": 4.4334, + "step": 2075 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 0.2466011643409729, + "learning_rate": 9.823940073379536e-05, + "loss": 4.623, + "step": 2080 + }, + { + "epoch": 0.14166326946596006, + "grad_norm": 0.41923725605010986, + "learning_rate": 9.823515423291208e-05, + "loss": 4.5557, + "step": 2085 + }, + { + "epoch": 0.14200298953662183, + "grad_norm": 0.23959270119667053, + "learning_rate": 9.823090773202881e-05, + "loss": 4.5756, + "step": 2090 + }, + { + "epoch": 0.1423427096072836, + "grad_norm": 0.7019773721694946, + "learning_rate": 9.822666123114554e-05, + "loss": 4.74, + "step": 2095 + }, + { + "epoch": 0.14268242967794537, + "grad_norm": 0.6014403700828552, + "learning_rate": 9.822241473026227e-05, + "loss": 4.4456, + "step": 2100 + }, + { + "epoch": 0.14302214974860714, + "grad_norm": 0.2578621804714203, + "learning_rate": 9.8218168229379e-05, + "loss": 4.6776, + "step": 2105 + }, + { + "epoch": 0.14336186981926893, + "grad_norm": 0.24368084967136383, + "learning_rate": 9.821392172849572e-05, + "loss": 4.7798, + "step": 2110 + }, + { + "epoch": 0.1437015898899307, + "grad_norm": 0.4451867938041687, + "learning_rate": 9.820967522761245e-05, + "loss": 4.2507, + "step": 2115 + }, + { + "epoch": 0.14404130996059247, + "grad_norm": 0.27697330713272095, + "learning_rate": 9.820542872672918e-05, + "loss": 4.6886, + "step": 2120 + }, + { + "epoch": 0.14438103003125424, + "grad_norm": 0.8379690647125244, + "learning_rate": 9.820118222584591e-05, + "loss": 4.5629, + "step": 2125 + }, + { + "epoch": 0.144720750101916, + "grad_norm": 0.9834319353103638, + "learning_rate": 9.819693572496264e-05, + "loss": 4.4945, + "step": 2130 + }, + { + "epoch": 0.14506047017257778, + "grad_norm": 0.45272937417030334, + "learning_rate": 9.819268922407936e-05, + "loss": 4.6099, + "step": 2135 + }, + { + "epoch": 0.14540019024323958, + "grad_norm": 0.517729640007019, + "learning_rate": 9.818844272319609e-05, + "loss": 4.6808, + "step": 2140 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 0.26133647561073303, + "learning_rate": 9.818419622231282e-05, + "loss": 4.4916, + "step": 2145 + }, + { + "epoch": 0.14607963038456312, + "grad_norm": 0.31160035729408264, + "learning_rate": 9.817994972142955e-05, + "loss": 4.4746, + "step": 2150 + }, + { + "epoch": 0.1464193504552249, + "grad_norm": 0.3950839936733246, + "learning_rate": 9.817570322054628e-05, + "loss": 4.8946, + "step": 2155 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 0.254171758890152, + "learning_rate": 9.8171456719663e-05, + "loss": 4.5237, + "step": 2160 + }, + { + "epoch": 0.14709879059654846, + "grad_norm": 0.4314219653606415, + "learning_rate": 9.816721021877973e-05, + "loss": 4.6116, + "step": 2165 + }, + { + "epoch": 0.14743851066721023, + "grad_norm": 0.2894288897514343, + "learning_rate": 9.816296371789646e-05, + "loss": 4.4748, + "step": 2170 + }, + { + "epoch": 0.147778230737872, + "grad_norm": 0.2681034207344055, + "learning_rate": 9.815871721701319e-05, + "loss": 4.5926, + "step": 2175 + }, + { + "epoch": 0.14811795080853377, + "grad_norm": 0.27911391854286194, + "learning_rate": 9.815447071612992e-05, + "loss": 4.5391, + "step": 2180 + }, + { + "epoch": 0.14845767087919554, + "grad_norm": 0.3182697296142578, + "learning_rate": 9.815022421524664e-05, + "loss": 4.5708, + "step": 2185 + }, + { + "epoch": 0.1487973909498573, + "grad_norm": 0.2478509396314621, + "learning_rate": 9.814597771436337e-05, + "loss": 4.3974, + "step": 2190 + }, + { + "epoch": 0.1491371110205191, + "grad_norm": 0.3418025076389313, + "learning_rate": 9.814173121348009e-05, + "loss": 4.5312, + "step": 2195 + }, + { + "epoch": 0.14947683109118087, + "grad_norm": 0.2670694887638092, + "learning_rate": 9.813748471259683e-05, + "loss": 4.5906, + "step": 2200 + }, + { + "epoch": 0.14981655116184264, + "grad_norm": 0.29988008737564087, + "learning_rate": 9.813323821171356e-05, + "loss": 4.4151, + "step": 2205 + }, + { + "epoch": 0.1501562712325044, + "grad_norm": 0.2230396866798401, + "learning_rate": 9.812899171083027e-05, + "loss": 4.6196, + "step": 2210 + }, + { + "epoch": 0.15049599130316618, + "grad_norm": 0.2940434515476227, + "learning_rate": 9.812474520994701e-05, + "loss": 4.5765, + "step": 2215 + }, + { + "epoch": 0.15083571137382795, + "grad_norm": 0.2943139672279358, + "learning_rate": 9.812049870906374e-05, + "loss": 4.7439, + "step": 2220 + }, + { + "epoch": 0.15117543144448975, + "grad_norm": 0.5938501954078674, + "learning_rate": 9.811625220818045e-05, + "loss": 4.6233, + "step": 2225 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.29499292373657227, + "learning_rate": 9.81120057072972e-05, + "loss": 4.43, + "step": 2230 + }, + { + "epoch": 0.1518548715858133, + "grad_norm": 0.21327312290668488, + "learning_rate": 9.810775920641392e-05, + "loss": 4.5865, + "step": 2235 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 0.4112052917480469, + "learning_rate": 9.810351270553064e-05, + "loss": 4.7013, + "step": 2240 + }, + { + "epoch": 0.15253431172713683, + "grad_norm": 0.40261027216911316, + "learning_rate": 9.809926620464738e-05, + "loss": 4.6419, + "step": 2245 + }, + { + "epoch": 0.15287403179779863, + "grad_norm": 0.2737533748149872, + "learning_rate": 9.809501970376411e-05, + "loss": 4.3994, + "step": 2250 + }, + { + "epoch": 0.1532137518684604, + "grad_norm": 0.24050559103488922, + "learning_rate": 9.809077320288082e-05, + "loss": 4.5648, + "step": 2255 + }, + { + "epoch": 0.15355347193912217, + "grad_norm": 0.3781549036502838, + "learning_rate": 9.808652670199756e-05, + "loss": 4.4987, + "step": 2260 + }, + { + "epoch": 0.15389319200978394, + "grad_norm": 0.46098098158836365, + "learning_rate": 9.808228020111428e-05, + "loss": 4.5322, + "step": 2265 + }, + { + "epoch": 0.1542329120804457, + "grad_norm": 0.32969388365745544, + "learning_rate": 9.8078033700231e-05, + "loss": 4.59, + "step": 2270 + }, + { + "epoch": 0.15457263215110748, + "grad_norm": 0.28195780515670776, + "learning_rate": 9.807378719934775e-05, + "loss": 4.4548, + "step": 2275 + }, + { + "epoch": 0.15491235222176927, + "grad_norm": 0.2665387690067291, + "learning_rate": 9.806954069846446e-05, + "loss": 4.4353, + "step": 2280 + }, + { + "epoch": 0.15525207229243104, + "grad_norm": 0.3116438686847687, + "learning_rate": 9.806529419758119e-05, + "loss": 4.3557, + "step": 2285 + }, + { + "epoch": 0.1555917923630928, + "grad_norm": 0.42467501759529114, + "learning_rate": 9.806104769669793e-05, + "loss": 4.3241, + "step": 2290 + }, + { + "epoch": 0.15593151243375458, + "grad_norm": 0.24590204656124115, + "learning_rate": 9.805680119581465e-05, + "loss": 4.3719, + "step": 2295 + }, + { + "epoch": 0.15627123250441635, + "grad_norm": 0.7295488119125366, + "learning_rate": 9.805255469493139e-05, + "loss": 4.5891, + "step": 2300 + }, + { + "epoch": 0.15661095257507815, + "grad_norm": 0.24560780823230743, + "learning_rate": 9.804830819404812e-05, + "loss": 4.6124, + "step": 2305 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 0.2907837927341461, + "learning_rate": 9.804406169316483e-05, + "loss": 4.2532, + "step": 2310 + }, + { + "epoch": 0.1572903927164017, + "grad_norm": 1.0109922885894775, + "learning_rate": 9.803981519228157e-05, + "loss": 4.5454, + "step": 2315 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 0.2637081444263458, + "learning_rate": 9.80355686913983e-05, + "loss": 4.5952, + "step": 2320 + }, + { + "epoch": 0.15796983285772523, + "grad_norm": 0.2559982240200043, + "learning_rate": 9.803132219051501e-05, + "loss": 4.6078, + "step": 2325 + }, + { + "epoch": 0.158309552928387, + "grad_norm": 0.4410446882247925, + "learning_rate": 9.802707568963176e-05, + "loss": 4.6202, + "step": 2330 + }, + { + "epoch": 0.1586492729990488, + "grad_norm": 0.20168878138065338, + "learning_rate": 9.802282918874848e-05, + "loss": 4.4023, + "step": 2335 + }, + { + "epoch": 0.15898899306971057, + "grad_norm": 0.29185861349105835, + "learning_rate": 9.80185826878652e-05, + "loss": 4.543, + "step": 2340 + }, + { + "epoch": 0.15932871314037234, + "grad_norm": 0.22290275990962982, + "learning_rate": 9.801433618698194e-05, + "loss": 4.6697, + "step": 2345 + }, + { + "epoch": 0.1596684332110341, + "grad_norm": 0.7529789805412292, + "learning_rate": 9.801008968609865e-05, + "loss": 4.737, + "step": 2350 + }, + { + "epoch": 0.16000815328169588, + "grad_norm": 0.3712422549724579, + "learning_rate": 9.800584318521538e-05, + "loss": 4.6241, + "step": 2355 + }, + { + "epoch": 0.16034787335235764, + "grad_norm": 0.23941993713378906, + "learning_rate": 9.800159668433212e-05, + "loss": 4.6871, + "step": 2360 + }, + { + "epoch": 0.16068759342301944, + "grad_norm": 0.37533217668533325, + "learning_rate": 9.799735018344884e-05, + "loss": 4.467, + "step": 2365 + }, + { + "epoch": 0.1610273134936812, + "grad_norm": 0.2338525950908661, + "learning_rate": 9.799310368256557e-05, + "loss": 4.435, + "step": 2370 + }, + { + "epoch": 0.16136703356434298, + "grad_norm": 0.26814886927604675, + "learning_rate": 9.798885718168231e-05, + "loss": 4.4838, + "step": 2375 + }, + { + "epoch": 0.16170675363500475, + "grad_norm": 0.3187100887298584, + "learning_rate": 9.798461068079902e-05, + "loss": 4.354, + "step": 2380 + }, + { + "epoch": 0.16204647370566652, + "grad_norm": 0.7054830193519592, + "learning_rate": 9.798036417991575e-05, + "loss": 4.489, + "step": 2385 + }, + { + "epoch": 0.16238619377632832, + "grad_norm": 0.25023216009140015, + "learning_rate": 9.797611767903249e-05, + "loss": 4.6024, + "step": 2390 + }, + { + "epoch": 0.1627259138469901, + "grad_norm": 0.24370110034942627, + "learning_rate": 9.79718711781492e-05, + "loss": 4.3951, + "step": 2395 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 0.23113249242305756, + "learning_rate": 9.796762467726593e-05, + "loss": 4.4352, + "step": 2400 + }, + { + "epoch": 0.16340535398831363, + "grad_norm": 0.4448549747467041, + "learning_rate": 9.796337817638268e-05, + "loss": 4.4063, + "step": 2405 + }, + { + "epoch": 0.1637450740589754, + "grad_norm": 0.20236225426197052, + "learning_rate": 9.795913167549939e-05, + "loss": 4.6175, + "step": 2410 + }, + { + "epoch": 0.16408479412963717, + "grad_norm": 0.5627440810203552, + "learning_rate": 9.795488517461612e-05, + "loss": 4.5675, + "step": 2415 + }, + { + "epoch": 0.16442451420029897, + "grad_norm": 0.28272920846939087, + "learning_rate": 9.795063867373285e-05, + "loss": 4.6146, + "step": 2420 + }, + { + "epoch": 0.16476423427096074, + "grad_norm": 0.2605418264865875, + "learning_rate": 9.794639217284957e-05, + "loss": 4.5697, + "step": 2425 + }, + { + "epoch": 0.1651039543416225, + "grad_norm": 0.23570238053798676, + "learning_rate": 9.79421456719663e-05, + "loss": 4.5072, + "step": 2430 + }, + { + "epoch": 0.16544367441228428, + "grad_norm": 0.20745481550693512, + "learning_rate": 9.793789917108303e-05, + "loss": 4.5735, + "step": 2435 + }, + { + "epoch": 0.16578339448294604, + "grad_norm": 0.23489026725292206, + "learning_rate": 9.793365267019976e-05, + "loss": 4.4731, + "step": 2440 + }, + { + "epoch": 0.16612311455360781, + "grad_norm": 0.4274902939796448, + "learning_rate": 9.792940616931649e-05, + "loss": 4.6706, + "step": 2445 + }, + { + "epoch": 0.1664628346242696, + "grad_norm": 0.25951382517814636, + "learning_rate": 9.792515966843321e-05, + "loss": 4.441, + "step": 2450 + }, + { + "epoch": 0.16680255469493138, + "grad_norm": 1.9463924169540405, + "learning_rate": 9.792091316754994e-05, + "loss": 4.4691, + "step": 2455 + }, + { + "epoch": 0.16714227476559315, + "grad_norm": 0.4177579879760742, + "learning_rate": 9.791666666666667e-05, + "loss": 4.4903, + "step": 2460 + }, + { + "epoch": 0.16748199483625492, + "grad_norm": 0.533138632774353, + "learning_rate": 9.79124201657834e-05, + "loss": 4.3311, + "step": 2465 + }, + { + "epoch": 0.1678217149069167, + "grad_norm": 0.2822255790233612, + "learning_rate": 9.790817366490013e-05, + "loss": 4.5948, + "step": 2470 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 0.29035472869873047, + "learning_rate": 9.790392716401685e-05, + "loss": 4.5585, + "step": 2475 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 2.6457104682922363, + "learning_rate": 9.789968066313358e-05, + "loss": 4.5255, + "step": 2480 + }, + { + "epoch": 0.16884087511890203, + "grad_norm": 0.21925875544548035, + "learning_rate": 9.789543416225031e-05, + "loss": 4.5955, + "step": 2485 + }, + { + "epoch": 0.1691805951895638, + "grad_norm": 0.3095509707927704, + "learning_rate": 9.789118766136704e-05, + "loss": 4.5427, + "step": 2490 + }, + { + "epoch": 0.16952031526022557, + "grad_norm": 1.3866817951202393, + "learning_rate": 9.788694116048377e-05, + "loss": 4.3407, + "step": 2495 + }, + { + "epoch": 0.16986003533088734, + "grad_norm": 0.31529414653778076, + "learning_rate": 9.78826946596005e-05, + "loss": 4.614, + "step": 2500 + }, + { + "epoch": 0.17019975540154914, + "grad_norm": 0.25377875566482544, + "learning_rate": 9.787844815871722e-05, + "loss": 4.5838, + "step": 2505 + }, + { + "epoch": 0.1705394754722109, + "grad_norm": 0.7861871123313904, + "learning_rate": 9.787420165783395e-05, + "loss": 4.5731, + "step": 2510 + }, + { + "epoch": 0.17087919554287267, + "grad_norm": 0.19743318855762482, + "learning_rate": 9.786995515695068e-05, + "loss": 4.3947, + "step": 2515 + }, + { + "epoch": 0.17121891561353444, + "grad_norm": 0.3416430950164795, + "learning_rate": 9.78657086560674e-05, + "loss": 4.5711, + "step": 2520 + }, + { + "epoch": 0.17155863568419621, + "grad_norm": 0.3679373562335968, + "learning_rate": 9.786146215518413e-05, + "loss": 4.6518, + "step": 2525 + }, + { + "epoch": 0.17189835575485798, + "grad_norm": 0.23833996057510376, + "learning_rate": 9.785721565430086e-05, + "loss": 4.4339, + "step": 2530 + }, + { + "epoch": 0.17223807582551978, + "grad_norm": 0.25589922070503235, + "learning_rate": 9.785296915341759e-05, + "loss": 4.6889, + "step": 2535 + }, + { + "epoch": 0.17257779589618155, + "grad_norm": 0.27489981055259705, + "learning_rate": 9.784872265253432e-05, + "loss": 4.3215, + "step": 2540 + }, + { + "epoch": 0.17291751596684332, + "grad_norm": 0.23039469122886658, + "learning_rate": 9.784447615165105e-05, + "loss": 4.4543, + "step": 2545 + }, + { + "epoch": 0.1732572360375051, + "grad_norm": 0.3405773341655731, + "learning_rate": 9.784022965076776e-05, + "loss": 4.5138, + "step": 2550 + }, + { + "epoch": 0.17359695610816686, + "grad_norm": 0.8154670000076294, + "learning_rate": 9.78359831498845e-05, + "loss": 4.4716, + "step": 2555 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 0.30465012788772583, + "learning_rate": 9.783173664900123e-05, + "loss": 4.2262, + "step": 2560 + }, + { + "epoch": 0.17427639624949043, + "grad_norm": 0.3995078504085541, + "learning_rate": 9.782749014811795e-05, + "loss": 4.5862, + "step": 2565 + }, + { + "epoch": 0.1746161163201522, + "grad_norm": 0.2636319398880005, + "learning_rate": 9.782324364723469e-05, + "loss": 4.6933, + "step": 2570 + }, + { + "epoch": 0.17495583639081397, + "grad_norm": 0.3614608943462372, + "learning_rate": 9.781899714635141e-05, + "loss": 4.6504, + "step": 2575 + }, + { + "epoch": 0.17529555646147574, + "grad_norm": 0.3470248878002167, + "learning_rate": 9.781475064546813e-05, + "loss": 4.7038, + "step": 2580 + }, + { + "epoch": 0.1756352765321375, + "grad_norm": 0.39428719878196716, + "learning_rate": 9.781050414458487e-05, + "loss": 4.2662, + "step": 2585 + }, + { + "epoch": 0.1759749966027993, + "grad_norm": 0.22955843806266785, + "learning_rate": 9.78062576437016e-05, + "loss": 4.4044, + "step": 2590 + }, + { + "epoch": 0.17631471667346107, + "grad_norm": 0.2899189293384552, + "learning_rate": 9.780201114281831e-05, + "loss": 4.4223, + "step": 2595 + }, + { + "epoch": 0.17665443674412284, + "grad_norm": 0.4230986535549164, + "learning_rate": 9.779776464193505e-05, + "loss": 4.1781, + "step": 2600 + }, + { + "epoch": 0.17699415681478461, + "grad_norm": 0.32788804173469543, + "learning_rate": 9.779351814105178e-05, + "loss": 4.616, + "step": 2605 + }, + { + "epoch": 0.17733387688544638, + "grad_norm": 0.2200581431388855, + "learning_rate": 9.77892716401685e-05, + "loss": 4.2604, + "step": 2610 + }, + { + "epoch": 0.17767359695610815, + "grad_norm": 0.30823394656181335, + "learning_rate": 9.778502513928524e-05, + "loss": 4.2978, + "step": 2615 + }, + { + "epoch": 0.17801331702676995, + "grad_norm": 0.22299472987651825, + "learning_rate": 9.778077863840195e-05, + "loss": 4.3587, + "step": 2620 + }, + { + "epoch": 0.17835303709743172, + "grad_norm": 0.22951941192150116, + "learning_rate": 9.777653213751868e-05, + "loss": 4.2573, + "step": 2625 + }, + { + "epoch": 0.1786927571680935, + "grad_norm": 0.35953882336616516, + "learning_rate": 9.777228563663542e-05, + "loss": 4.3515, + "step": 2630 + }, + { + "epoch": 0.17903247723875526, + "grad_norm": 0.4688868522644043, + "learning_rate": 9.776803913575214e-05, + "loss": 4.5713, + "step": 2635 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 0.21083256602287292, + "learning_rate": 9.776379263486888e-05, + "loss": 4.4969, + "step": 2640 + }, + { + "epoch": 0.17971191738007883, + "grad_norm": 0.36712825298309326, + "learning_rate": 9.775954613398561e-05, + "loss": 4.5619, + "step": 2645 + }, + { + "epoch": 0.1800516374507406, + "grad_norm": 0.2260504513978958, + "learning_rate": 9.775529963310232e-05, + "loss": 4.6722, + "step": 2650 + }, + { + "epoch": 0.18039135752140237, + "grad_norm": 0.36943840980529785, + "learning_rate": 9.775105313221906e-05, + "loss": 4.4934, + "step": 2655 + }, + { + "epoch": 0.18073107759206414, + "grad_norm": 0.4936888515949249, + "learning_rate": 9.774680663133579e-05, + "loss": 4.348, + "step": 2660 + }, + { + "epoch": 0.1810707976627259, + "grad_norm": 0.21958352625370026, + "learning_rate": 9.77425601304525e-05, + "loss": 4.4938, + "step": 2665 + }, + { + "epoch": 0.18141051773338768, + "grad_norm": 1.1148053407669067, + "learning_rate": 9.773831362956925e-05, + "loss": 4.2976, + "step": 2670 + }, + { + "epoch": 0.18175023780404947, + "grad_norm": 0.39846473932266235, + "learning_rate": 9.773406712868597e-05, + "loss": 4.3586, + "step": 2675 + }, + { + "epoch": 0.18208995787471124, + "grad_norm": 0.28287413716316223, + "learning_rate": 9.772982062780269e-05, + "loss": 4.4437, + "step": 2680 + }, + { + "epoch": 0.18242967794537301, + "grad_norm": 0.3402862846851349, + "learning_rate": 9.772557412691943e-05, + "loss": 4.5291, + "step": 2685 + }, + { + "epoch": 0.18276939801603478, + "grad_norm": 0.3358980715274811, + "learning_rate": 9.772132762603616e-05, + "loss": 4.3738, + "step": 2690 + }, + { + "epoch": 0.18310911808669655, + "grad_norm": 0.19017407298088074, + "learning_rate": 9.771708112515287e-05, + "loss": 4.1701, + "step": 2695 + }, + { + "epoch": 0.18344883815735832, + "grad_norm": 0.2291361540555954, + "learning_rate": 9.771283462426961e-05, + "loss": 4.6092, + "step": 2700 + }, + { + "epoch": 0.18378855822802012, + "grad_norm": 0.42033877968788147, + "learning_rate": 9.770858812338633e-05, + "loss": 4.3813, + "step": 2705 + }, + { + "epoch": 0.1841282782986819, + "grad_norm": 0.22784222662448883, + "learning_rate": 9.770434162250306e-05, + "loss": 4.4408, + "step": 2710 + }, + { + "epoch": 0.18446799836934366, + "grad_norm": 0.23395958542823792, + "learning_rate": 9.77000951216198e-05, + "loss": 4.4935, + "step": 2715 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 0.2610359191894531, + "learning_rate": 9.769584862073651e-05, + "loss": 4.4932, + "step": 2720 + }, + { + "epoch": 0.1851474385106672, + "grad_norm": 0.2646908164024353, + "learning_rate": 9.769160211985324e-05, + "loss": 4.4914, + "step": 2725 + }, + { + "epoch": 0.185487158581329, + "grad_norm": 0.31001701951026917, + "learning_rate": 9.768735561896998e-05, + "loss": 4.5656, + "step": 2730 + }, + { + "epoch": 0.18582687865199077, + "grad_norm": 0.3422091007232666, + "learning_rate": 9.76831091180867e-05, + "loss": 4.4946, + "step": 2735 + }, + { + "epoch": 0.18616659872265254, + "grad_norm": 0.4761231243610382, + "learning_rate": 9.767886261720343e-05, + "loss": 4.1494, + "step": 2740 + }, + { + "epoch": 0.1865063187933143, + "grad_norm": 0.23646193742752075, + "learning_rate": 9.767461611632017e-05, + "loss": 4.3254, + "step": 2745 + }, + { + "epoch": 0.18684603886397608, + "grad_norm": 1.6517447233200073, + "learning_rate": 9.767036961543688e-05, + "loss": 4.5333, + "step": 2750 + }, + { + "epoch": 0.18718575893463785, + "grad_norm": 0.2012016475200653, + "learning_rate": 9.766612311455361e-05, + "loss": 4.7069, + "step": 2755 + }, + { + "epoch": 0.18752547900529964, + "grad_norm": 0.20281845331192017, + "learning_rate": 9.766187661367035e-05, + "loss": 4.4399, + "step": 2760 + }, + { + "epoch": 0.18786519907596141, + "grad_norm": 0.1804925948381424, + "learning_rate": 9.765763011278707e-05, + "loss": 4.5354, + "step": 2765 + }, + { + "epoch": 0.18820491914662318, + "grad_norm": 0.4761740267276764, + "learning_rate": 9.76533836119038e-05, + "loss": 4.6633, + "step": 2770 + }, + { + "epoch": 0.18854463921728495, + "grad_norm": 0.22267234325408936, + "learning_rate": 9.764913711102052e-05, + "loss": 4.5686, + "step": 2775 + }, + { + "epoch": 0.18888435928794672, + "grad_norm": 0.5881355404853821, + "learning_rate": 9.764489061013725e-05, + "loss": 4.4554, + "step": 2780 + }, + { + "epoch": 0.1892240793586085, + "grad_norm": 0.43992605805397034, + "learning_rate": 9.764064410925398e-05, + "loss": 4.1684, + "step": 2785 + }, + { + "epoch": 0.1895637994292703, + "grad_norm": 0.21498017013072968, + "learning_rate": 9.76363976083707e-05, + "loss": 4.5047, + "step": 2790 + }, + { + "epoch": 0.18990351949993206, + "grad_norm": 0.37874165177345276, + "learning_rate": 9.763215110748743e-05, + "loss": 4.2255, + "step": 2795 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 0.2565677762031555, + "learning_rate": 9.762790460660416e-05, + "loss": 4.4333, + "step": 2800 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 0.2246963530778885, + "learning_rate": 9.762365810572089e-05, + "loss": 4.4198, + "step": 2805 + }, + { + "epoch": 0.19092267971191737, + "grad_norm": 0.946719229221344, + "learning_rate": 9.761941160483762e-05, + "loss": 3.9055, + "step": 2810 + }, + { + "epoch": 0.19126239978257917, + "grad_norm": 1.0544602870941162, + "learning_rate": 9.761516510395435e-05, + "loss": 4.4512, + "step": 2815 + }, + { + "epoch": 0.19160211985324094, + "grad_norm": 0.21298794448375702, + "learning_rate": 9.761091860307107e-05, + "loss": 4.4729, + "step": 2820 + }, + { + "epoch": 0.1919418399239027, + "grad_norm": 1.3822523355484009, + "learning_rate": 9.76066721021878e-05, + "loss": 4.3136, + "step": 2825 + }, + { + "epoch": 0.19228155999456448, + "grad_norm": 0.1828567236661911, + "learning_rate": 9.760242560130453e-05, + "loss": 4.5116, + "step": 2830 + }, + { + "epoch": 0.19262128006522625, + "grad_norm": 0.28580307960510254, + "learning_rate": 9.759817910042126e-05, + "loss": 4.4903, + "step": 2835 + }, + { + "epoch": 0.19296100013588802, + "grad_norm": 0.39433717727661133, + "learning_rate": 9.759393259953799e-05, + "loss": 4.4487, + "step": 2840 + }, + { + "epoch": 0.19330072020654981, + "grad_norm": 0.49140483140945435, + "learning_rate": 9.758968609865471e-05, + "loss": 4.4639, + "step": 2845 + }, + { + "epoch": 0.19364044027721158, + "grad_norm": 0.3383556306362152, + "learning_rate": 9.758543959777144e-05, + "loss": 4.3328, + "step": 2850 + }, + { + "epoch": 0.19398016034787335, + "grad_norm": 0.7367972135543823, + "learning_rate": 9.758119309688817e-05, + "loss": 4.5457, + "step": 2855 + }, + { + "epoch": 0.19431988041853512, + "grad_norm": 0.19852545857429504, + "learning_rate": 9.75769465960049e-05, + "loss": 4.2916, + "step": 2860 + }, + { + "epoch": 0.1946596004891969, + "grad_norm": 0.3379197120666504, + "learning_rate": 9.757270009512163e-05, + "loss": 4.4011, + "step": 2865 + }, + { + "epoch": 0.19499932055985866, + "grad_norm": 0.4577140212059021, + "learning_rate": 9.756845359423835e-05, + "loss": 4.2438, + "step": 2870 + }, + { + "epoch": 0.19533904063052046, + "grad_norm": 0.32074615359306335, + "learning_rate": 9.756420709335508e-05, + "loss": 4.3429, + "step": 2875 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 0.4993734359741211, + "learning_rate": 9.755996059247181e-05, + "loss": 4.6458, + "step": 2880 + }, + { + "epoch": 0.196018480771844, + "grad_norm": 0.21413934230804443, + "learning_rate": 9.755571409158854e-05, + "loss": 4.3236, + "step": 2885 + }, + { + "epoch": 0.19635820084250577, + "grad_norm": 0.39588046073913574, + "learning_rate": 9.755146759070527e-05, + "loss": 4.2725, + "step": 2890 + }, + { + "epoch": 0.19669792091316754, + "grad_norm": 0.23066450655460358, + "learning_rate": 9.7547221089822e-05, + "loss": 4.2981, + "step": 2895 + }, + { + "epoch": 0.19703764098382934, + "grad_norm": 0.24343866109848022, + "learning_rate": 9.754297458893872e-05, + "loss": 4.4485, + "step": 2900 + }, + { + "epoch": 0.1973773610544911, + "grad_norm": 0.2774411141872406, + "learning_rate": 9.753872808805544e-05, + "loss": 4.2737, + "step": 2905 + }, + { + "epoch": 0.19771708112515288, + "grad_norm": 0.3360697329044342, + "learning_rate": 9.753448158717218e-05, + "loss": 4.2094, + "step": 2910 + }, + { + "epoch": 0.19805680119581465, + "grad_norm": 0.3886429965496063, + "learning_rate": 9.75302350862889e-05, + "loss": 4.4864, + "step": 2915 + }, + { + "epoch": 0.19839652126647642, + "grad_norm": 0.5242161154747009, + "learning_rate": 9.752598858540562e-05, + "loss": 4.3877, + "step": 2920 + }, + { + "epoch": 0.1987362413371382, + "grad_norm": 0.2082594633102417, + "learning_rate": 9.752174208452236e-05, + "loss": 4.3006, + "step": 2925 + }, + { + "epoch": 0.19907596140779998, + "grad_norm": 1.1216654777526855, + "learning_rate": 9.751749558363909e-05, + "loss": 4.2916, + "step": 2930 + }, + { + "epoch": 0.19941568147846175, + "grad_norm": 0.1812744289636612, + "learning_rate": 9.75132490827558e-05, + "loss": 4.4246, + "step": 2935 + }, + { + "epoch": 0.19975540154912352, + "grad_norm": 0.316278874874115, + "learning_rate": 9.750900258187255e-05, + "loss": 4.2248, + "step": 2940 + }, + { + "epoch": 0.2000951216197853, + "grad_norm": 0.2795095443725586, + "learning_rate": 9.750475608098927e-05, + "loss": 4.4285, + "step": 2945 + }, + { + "epoch": 0.20043484169044706, + "grad_norm": 0.25871169567108154, + "learning_rate": 9.750050958010599e-05, + "loss": 4.3629, + "step": 2950 + }, + { + "epoch": 0.20077456176110883, + "grad_norm": 0.3203955888748169, + "learning_rate": 9.749626307922273e-05, + "loss": 4.8022, + "step": 2955 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 0.897880494594574, + "learning_rate": 9.749201657833946e-05, + "loss": 4.444, + "step": 2960 + }, + { + "epoch": 0.2014540019024324, + "grad_norm": 0.6095696687698364, + "learning_rate": 9.748777007745617e-05, + "loss": 4.3442, + "step": 2965 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 0.8089606761932373, + "learning_rate": 9.748352357657291e-05, + "loss": 4.3223, + "step": 2970 + }, + { + "epoch": 0.20213344204375594, + "grad_norm": 0.5481230616569519, + "learning_rate": 9.747927707568963e-05, + "loss": 4.3059, + "step": 2975 + }, + { + "epoch": 0.2024731621144177, + "grad_norm": 0.24502769112586975, + "learning_rate": 9.747503057480637e-05, + "loss": 4.2946, + "step": 2980 + }, + { + "epoch": 0.2028128821850795, + "grad_norm": 0.20267254114151, + "learning_rate": 9.74707840739231e-05, + "loss": 4.3506, + "step": 2985 + }, + { + "epoch": 0.20315260225574128, + "grad_norm": 1.4581079483032227, + "learning_rate": 9.746653757303981e-05, + "loss": 4.3024, + "step": 2990 + }, + { + "epoch": 0.20349232232640305, + "grad_norm": 0.3428595960140228, + "learning_rate": 9.746229107215655e-05, + "loss": 4.4748, + "step": 2995 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 0.3032056391239166, + "learning_rate": 9.745804457127328e-05, + "loss": 4.3262, + "step": 3000 + }, + { + "epoch": 0.2041717624677266, + "grad_norm": 0.32317832112312317, + "learning_rate": 9.745379807039e-05, + "loss": 4.475, + "step": 3005 + }, + { + "epoch": 0.20451148253838836, + "grad_norm": 0.21782419085502625, + "learning_rate": 9.744955156950674e-05, + "loss": 4.5021, + "step": 3010 + }, + { + "epoch": 0.20485120260905015, + "grad_norm": 0.17983724176883698, + "learning_rate": 9.744530506862347e-05, + "loss": 4.3826, + "step": 3015 + }, + { + "epoch": 0.20519092267971192, + "grad_norm": 0.3824704587459564, + "learning_rate": 9.744105856774018e-05, + "loss": 4.4883, + "step": 3020 + }, + { + "epoch": 0.2055306427503737, + "grad_norm": 0.6417528390884399, + "learning_rate": 9.743681206685692e-05, + "loss": 4.3623, + "step": 3025 + }, + { + "epoch": 0.20587036282103546, + "grad_norm": 0.31229910254478455, + "learning_rate": 9.743256556597365e-05, + "loss": 4.3362, + "step": 3030 + }, + { + "epoch": 0.20621008289169723, + "grad_norm": 0.35579913854599, + "learning_rate": 9.742831906509036e-05, + "loss": 4.3119, + "step": 3035 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 0.21225492656230927, + "learning_rate": 9.74240725642071e-05, + "loss": 4.4159, + "step": 3040 + }, + { + "epoch": 0.2068895230330208, + "grad_norm": 0.5204954147338867, + "learning_rate": 9.741982606332382e-05, + "loss": 4.254, + "step": 3045 + }, + { + "epoch": 0.20722924310368257, + "grad_norm": 0.22649656236171722, + "learning_rate": 9.741557956244055e-05, + "loss": 4.3604, + "step": 3050 + }, + { + "epoch": 0.20756896317434434, + "grad_norm": 0.22533409297466278, + "learning_rate": 9.741133306155729e-05, + "loss": 4.1594, + "step": 3055 + }, + { + "epoch": 0.2079086832450061, + "grad_norm": 0.2681191563606262, + "learning_rate": 9.7407086560674e-05, + "loss": 4.553, + "step": 3060 + }, + { + "epoch": 0.20824840331566788, + "grad_norm": 1.2959145307540894, + "learning_rate": 9.740284005979073e-05, + "loss": 4.2071, + "step": 3065 + }, + { + "epoch": 0.20858812338632968, + "grad_norm": 0.21679522097110748, + "learning_rate": 9.739859355890747e-05, + "loss": 4.5038, + "step": 3070 + }, + { + "epoch": 0.20892784345699145, + "grad_norm": 0.36338409781455994, + "learning_rate": 9.739434705802419e-05, + "loss": 4.3356, + "step": 3075 + }, + { + "epoch": 0.20926756352765322, + "grad_norm": 0.2271890938282013, + "learning_rate": 9.739010055714092e-05, + "loss": 4.3573, + "step": 3080 + }, + { + "epoch": 0.209607283598315, + "grad_norm": 0.2753996253013611, + "learning_rate": 9.738585405625766e-05, + "loss": 4.4467, + "step": 3085 + }, + { + "epoch": 0.20994700366897676, + "grad_norm": 0.32643699645996094, + "learning_rate": 9.738160755537437e-05, + "loss": 4.2911, + "step": 3090 + }, + { + "epoch": 0.21028672373963853, + "grad_norm": 0.1794055551290512, + "learning_rate": 9.73773610544911e-05, + "loss": 4.3936, + "step": 3095 + }, + { + "epoch": 0.21062644381030032, + "grad_norm": 0.2121143937110901, + "learning_rate": 9.737311455360784e-05, + "loss": 4.4391, + "step": 3100 + }, + { + "epoch": 0.2109661638809621, + "grad_norm": 0.6584509015083313, + "learning_rate": 9.736886805272456e-05, + "loss": 4.3936, + "step": 3105 + }, + { + "epoch": 0.21130588395162386, + "grad_norm": 0.2863527834415436, + "learning_rate": 9.736462155184128e-05, + "loss": 4.4768, + "step": 3110 + }, + { + "epoch": 0.21164560402228563, + "grad_norm": 1.1741371154785156, + "learning_rate": 9.736037505095803e-05, + "loss": 4.3373, + "step": 3115 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 0.3653934597969055, + "learning_rate": 9.735612855007474e-05, + "loss": 4.6397, + "step": 3120 + }, + { + "epoch": 0.21232504416360917, + "grad_norm": 0.369391530752182, + "learning_rate": 9.735188204919147e-05, + "loss": 4.1315, + "step": 3125 + }, + { + "epoch": 0.21266476423427097, + "grad_norm": 0.22272364795207977, + "learning_rate": 9.73476355483082e-05, + "loss": 4.4174, + "step": 3130 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 0.26364120841026306, + "learning_rate": 9.734338904742492e-05, + "loss": 4.3034, + "step": 3135 + }, + { + "epoch": 0.2133442043755945, + "grad_norm": 0.2755309045314789, + "learning_rate": 9.733914254654165e-05, + "loss": 4.3209, + "step": 3140 + }, + { + "epoch": 0.21368392444625628, + "grad_norm": 0.27905556559562683, + "learning_rate": 9.733489604565838e-05, + "loss": 4.4046, + "step": 3145 + }, + { + "epoch": 0.21402364451691805, + "grad_norm": 0.25759658217430115, + "learning_rate": 9.733064954477511e-05, + "loss": 4.1602, + "step": 3150 + }, + { + "epoch": 0.21436336458757985, + "grad_norm": 1.0761340856552124, + "learning_rate": 9.732640304389184e-05, + "loss": 4.2794, + "step": 3155 + }, + { + "epoch": 0.21470308465824162, + "grad_norm": 0.18029484152793884, + "learning_rate": 9.732215654300856e-05, + "loss": 4.3554, + "step": 3160 + }, + { + "epoch": 0.2150428047289034, + "grad_norm": 0.373797208070755, + "learning_rate": 9.731791004212529e-05, + "loss": 4.3863, + "step": 3165 + }, + { + "epoch": 0.21538252479956516, + "grad_norm": 0.6202191710472107, + "learning_rate": 9.731366354124202e-05, + "loss": 4.2955, + "step": 3170 + }, + { + "epoch": 0.21572224487022693, + "grad_norm": 0.20301900804042816, + "learning_rate": 9.730941704035875e-05, + "loss": 4.2914, + "step": 3175 + }, + { + "epoch": 0.2160619649408887, + "grad_norm": 0.17571194469928741, + "learning_rate": 9.730517053947548e-05, + "loss": 4.276, + "step": 3180 + }, + { + "epoch": 0.2164016850115505, + "grad_norm": 0.3209381401538849, + "learning_rate": 9.73009240385922e-05, + "loss": 4.3957, + "step": 3185 + }, + { + "epoch": 0.21674140508221226, + "grad_norm": 0.2638840079307556, + "learning_rate": 9.729667753770893e-05, + "loss": 4.6224, + "step": 3190 + }, + { + "epoch": 0.21708112515287403, + "grad_norm": 0.19320239126682281, + "learning_rate": 9.729243103682566e-05, + "loss": 4.3039, + "step": 3195 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 0.43768310546875, + "learning_rate": 9.728818453594239e-05, + "loss": 4.3898, + "step": 3200 + }, + { + "epoch": 0.21776056529419757, + "grad_norm": 0.35756048560142517, + "learning_rate": 9.728393803505912e-05, + "loss": 4.5883, + "step": 3205 + }, + { + "epoch": 0.21810028536485934, + "grad_norm": 0.2380749136209488, + "learning_rate": 9.727969153417584e-05, + "loss": 4.4224, + "step": 3210 + }, + { + "epoch": 0.21844000543552114, + "grad_norm": 0.23136284947395325, + "learning_rate": 9.727544503329257e-05, + "loss": 4.3141, + "step": 3215 + }, + { + "epoch": 0.2187797255061829, + "grad_norm": 0.3109607398509979, + "learning_rate": 9.72711985324093e-05, + "loss": 4.295, + "step": 3220 + }, + { + "epoch": 0.21911944557684468, + "grad_norm": 0.4062863290309906, + "learning_rate": 9.726695203152603e-05, + "loss": 4.2421, + "step": 3225 + }, + { + "epoch": 0.21945916564750645, + "grad_norm": 0.20023144781589508, + "learning_rate": 9.726270553064276e-05, + "loss": 4.3144, + "step": 3230 + }, + { + "epoch": 0.21979888571816822, + "grad_norm": 0.8297600150108337, + "learning_rate": 9.725845902975948e-05, + "loss": 4.2986, + "step": 3235 + }, + { + "epoch": 0.22013860578883002, + "grad_norm": 0.6315371990203857, + "learning_rate": 9.725421252887621e-05, + "loss": 4.4641, + "step": 3240 + }, + { + "epoch": 0.22047832585949179, + "grad_norm": 0.2108875811100006, + "learning_rate": 9.724996602799293e-05, + "loss": 4.2764, + "step": 3245 + }, + { + "epoch": 0.22081804593015356, + "grad_norm": 0.20751313865184784, + "learning_rate": 9.724571952710967e-05, + "loss": 4.3553, + "step": 3250 + }, + { + "epoch": 0.22115776600081533, + "grad_norm": 0.24425362050533295, + "learning_rate": 9.72414730262264e-05, + "loss": 4.243, + "step": 3255 + }, + { + "epoch": 0.2214974860714771, + "grad_norm": 0.22244137525558472, + "learning_rate": 9.723722652534311e-05, + "loss": 4.2259, + "step": 3260 + }, + { + "epoch": 0.22183720614213887, + "grad_norm": 1.1119288206100464, + "learning_rate": 9.723298002445985e-05, + "loss": 4.322, + "step": 3265 + }, + { + "epoch": 0.22217692621280066, + "grad_norm": 0.3089415729045868, + "learning_rate": 9.722873352357658e-05, + "loss": 4.5175, + "step": 3270 + }, + { + "epoch": 0.22251664628346243, + "grad_norm": 0.2517615556716919, + "learning_rate": 9.72244870226933e-05, + "loss": 4.4571, + "step": 3275 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 0.17470265924930573, + "learning_rate": 9.722024052181004e-05, + "loss": 4.2085, + "step": 3280 + }, + { + "epoch": 0.22319608642478597, + "grad_norm": 0.22137637436389923, + "learning_rate": 9.721599402092676e-05, + "loss": 4.597, + "step": 3285 + }, + { + "epoch": 0.22353580649544774, + "grad_norm": 0.3537333309650421, + "learning_rate": 9.721174752004348e-05, + "loss": 4.4751, + "step": 3290 + }, + { + "epoch": 0.22387552656610954, + "grad_norm": 0.24241957068443298, + "learning_rate": 9.720750101916022e-05, + "loss": 4.0842, + "step": 3295 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 0.2881457805633545, + "learning_rate": 9.720325451827695e-05, + "loss": 4.1991, + "step": 3300 + }, + { + "epoch": 0.22455496670743308, + "grad_norm": 0.23095691204071045, + "learning_rate": 9.719900801739366e-05, + "loss": 4.4024, + "step": 3305 + }, + { + "epoch": 0.22489468677809485, + "grad_norm": 0.25291046500205994, + "learning_rate": 9.71947615165104e-05, + "loss": 4.1577, + "step": 3310 + }, + { + "epoch": 0.22523440684875662, + "grad_norm": 0.2241574227809906, + "learning_rate": 9.719051501562713e-05, + "loss": 4.2593, + "step": 3315 + }, + { + "epoch": 0.2255741269194184, + "grad_norm": 0.19019931554794312, + "learning_rate": 9.718626851474386e-05, + "loss": 4.1674, + "step": 3320 + }, + { + "epoch": 0.22591384699008019, + "grad_norm": 0.2490902543067932, + "learning_rate": 9.718202201386059e-05, + "loss": 4.201, + "step": 3325 + }, + { + "epoch": 0.22625356706074196, + "grad_norm": 0.2902776896953583, + "learning_rate": 9.71777755129773e-05, + "loss": 4.5505, + "step": 3330 + }, + { + "epoch": 0.22659328713140373, + "grad_norm": 0.22167052328586578, + "learning_rate": 9.717352901209404e-05, + "loss": 4.282, + "step": 3335 + }, + { + "epoch": 0.2269330072020655, + "grad_norm": 3.241713523864746, + "learning_rate": 9.716928251121077e-05, + "loss": 4.095, + "step": 3340 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.2534385323524475, + "learning_rate": 9.716503601032749e-05, + "loss": 4.325, + "step": 3345 + }, + { + "epoch": 0.22761244734338903, + "grad_norm": 0.2039516121149063, + "learning_rate": 9.716078950944423e-05, + "loss": 4.3627, + "step": 3350 + }, + { + "epoch": 0.22795216741405083, + "grad_norm": 0.20797346532344818, + "learning_rate": 9.715654300856096e-05, + "loss": 4.1518, + "step": 3355 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 0.2560058534145355, + "learning_rate": 9.715229650767767e-05, + "loss": 4.2377, + "step": 3360 + }, + { + "epoch": 0.22863160755537437, + "grad_norm": 0.20020583271980286, + "learning_rate": 9.714805000679441e-05, + "loss": 4.4088, + "step": 3365 + }, + { + "epoch": 0.22897132762603614, + "grad_norm": 0.32701200246810913, + "learning_rate": 9.714380350591114e-05, + "loss": 4.4603, + "step": 3370 + }, + { + "epoch": 0.2293110476966979, + "grad_norm": 0.16908589005470276, + "learning_rate": 9.713955700502786e-05, + "loss": 4.4159, + "step": 3375 + }, + { + "epoch": 0.2296507677673597, + "grad_norm": 10.86708927154541, + "learning_rate": 9.71353105041446e-05, + "loss": 4.373, + "step": 3380 + }, + { + "epoch": 0.22999048783802148, + "grad_norm": 0.2162582278251648, + "learning_rate": 9.713106400326132e-05, + "loss": 4.3303, + "step": 3385 + }, + { + "epoch": 0.23033020790868325, + "grad_norm": 0.1772332489490509, + "learning_rate": 9.712681750237804e-05, + "loss": 4.2396, + "step": 3390 + }, + { + "epoch": 0.23066992797934502, + "grad_norm": 0.36134952306747437, + "learning_rate": 9.712257100149478e-05, + "loss": 4.3838, + "step": 3395 + }, + { + "epoch": 0.2310096480500068, + "grad_norm": 0.32894113659858704, + "learning_rate": 9.71183245006115e-05, + "loss": 4.177, + "step": 3400 + }, + { + "epoch": 0.23134936812066856, + "grad_norm": 0.2267148792743683, + "learning_rate": 9.711407799972822e-05, + "loss": 4.3994, + "step": 3405 + }, + { + "epoch": 0.23168908819133036, + "grad_norm": 0.22997945547103882, + "learning_rate": 9.710983149884496e-05, + "loss": 4.2585, + "step": 3410 + }, + { + "epoch": 0.23202880826199213, + "grad_norm": 0.21913081407546997, + "learning_rate": 9.710558499796168e-05, + "loss": 4.495, + "step": 3415 + }, + { + "epoch": 0.2323685283326539, + "grad_norm": 0.2355417013168335, + "learning_rate": 9.710133849707841e-05, + "loss": 4.4619, + "step": 3420 + }, + { + "epoch": 0.23270824840331567, + "grad_norm": 0.29134589433670044, + "learning_rate": 9.709709199619515e-05, + "loss": 4.3438, + "step": 3425 + }, + { + "epoch": 0.23304796847397743, + "grad_norm": 0.4645059108734131, + "learning_rate": 9.709284549531186e-05, + "loss": 4.467, + "step": 3430 + }, + { + "epoch": 0.2333876885446392, + "grad_norm": 0.3466382324695587, + "learning_rate": 9.708859899442859e-05, + "loss": 4.4636, + "step": 3435 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 0.2788010835647583, + "learning_rate": 9.708435249354533e-05, + "loss": 4.1318, + "step": 3440 + }, + { + "epoch": 0.23406712868596277, + "grad_norm": 0.4784042537212372, + "learning_rate": 9.708010599266205e-05, + "loss": 4.2089, + "step": 3445 + }, + { + "epoch": 0.23440684875662454, + "grad_norm": 0.45934122800827026, + "learning_rate": 9.707585949177878e-05, + "loss": 4.3023, + "step": 3450 + }, + { + "epoch": 0.2347465688272863, + "grad_norm": 0.25707322359085083, + "learning_rate": 9.707161299089552e-05, + "loss": 4.2901, + "step": 3455 + }, + { + "epoch": 0.23508628889794808, + "grad_norm": 0.4797256290912628, + "learning_rate": 9.706736649001223e-05, + "loss": 4.2574, + "step": 3460 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 0.2368171215057373, + "learning_rate": 9.706311998912896e-05, + "loss": 4.3998, + "step": 3465 + }, + { + "epoch": 0.23576572903927165, + "grad_norm": 1.7958965301513672, + "learning_rate": 9.705887348824569e-05, + "loss": 4.212, + "step": 3470 + }, + { + "epoch": 0.23610544910993342, + "grad_norm": 0.24695445597171783, + "learning_rate": 9.705462698736242e-05, + "loss": 4.3325, + "step": 3475 + }, + { + "epoch": 0.2364451691805952, + "grad_norm": 0.24877724051475525, + "learning_rate": 9.705038048647914e-05, + "loss": 4.4167, + "step": 3480 + }, + { + "epoch": 0.23678488925125696, + "grad_norm": 0.2147648185491562, + "learning_rate": 9.704613398559587e-05, + "loss": 4.3511, + "step": 3485 + }, + { + "epoch": 0.23712460932191873, + "grad_norm": 0.38735896348953247, + "learning_rate": 9.70418874847126e-05, + "loss": 4.054, + "step": 3490 + }, + { + "epoch": 0.23746432939258053, + "grad_norm": 0.28407546877861023, + "learning_rate": 9.703764098382933e-05, + "loss": 4.1299, + "step": 3495 + }, + { + "epoch": 0.2378040494632423, + "grad_norm": 0.4963781237602234, + "learning_rate": 9.703339448294606e-05, + "loss": 4.3751, + "step": 3500 + }, + { + "epoch": 0.23814376953390406, + "grad_norm": 0.2363215535879135, + "learning_rate": 9.702914798206278e-05, + "loss": 4.3631, + "step": 3505 + }, + { + "epoch": 0.23848348960456583, + "grad_norm": 0.2752895951271057, + "learning_rate": 9.702490148117951e-05, + "loss": 4.3192, + "step": 3510 + }, + { + "epoch": 0.2388232096752276, + "grad_norm": 0.2011261135339737, + "learning_rate": 9.702065498029624e-05, + "loss": 4.3019, + "step": 3515 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 0.30605676770210266, + "learning_rate": 9.701640847941297e-05, + "loss": 4.2733, + "step": 3520 + }, + { + "epoch": 0.23950264981655117, + "grad_norm": 0.23777063190937042, + "learning_rate": 9.70121619785297e-05, + "loss": 4.4391, + "step": 3525 + }, + { + "epoch": 0.23984236988721294, + "grad_norm": 0.19578081369400024, + "learning_rate": 9.700791547764642e-05, + "loss": 4.3464, + "step": 3530 + }, + { + "epoch": 0.2401820899578747, + "grad_norm": 0.43479400873184204, + "learning_rate": 9.700366897676315e-05, + "loss": 4.1509, + "step": 3535 + }, + { + "epoch": 0.24052181002853648, + "grad_norm": 0.23320983350276947, + "learning_rate": 9.699942247587988e-05, + "loss": 4.2031, + "step": 3540 + }, + { + "epoch": 0.24086153009919825, + "grad_norm": 0.395224004983902, + "learning_rate": 9.699517597499661e-05, + "loss": 4.3575, + "step": 3545 + }, + { + "epoch": 0.24120125016986005, + "grad_norm": 0.22553794085979462, + "learning_rate": 9.699092947411334e-05, + "loss": 4.4234, + "step": 3550 + }, + { + "epoch": 0.24154097024052182, + "grad_norm": 0.21396693587303162, + "learning_rate": 9.698668297323006e-05, + "loss": 4.3933, + "step": 3555 + }, + { + "epoch": 0.2418806903111836, + "grad_norm": 0.2883829176425934, + "learning_rate": 9.698243647234679e-05, + "loss": 4.1656, + "step": 3560 + }, + { + "epoch": 0.24222041038184536, + "grad_norm": 0.3761749267578125, + "learning_rate": 9.697818997146352e-05, + "loss": 4.254, + "step": 3565 + }, + { + "epoch": 0.24256013045250713, + "grad_norm": 0.2654179036617279, + "learning_rate": 9.697394347058025e-05, + "loss": 4.0754, + "step": 3570 + }, + { + "epoch": 0.2428998505231689, + "grad_norm": 0.20404711365699768, + "learning_rate": 9.696969696969698e-05, + "loss": 4.4404, + "step": 3575 + }, + { + "epoch": 0.2432395705938307, + "grad_norm": 0.661677360534668, + "learning_rate": 9.69654504688137e-05, + "loss": 4.4084, + "step": 3580 + }, + { + "epoch": 0.24357929066449246, + "grad_norm": 0.19168756902217865, + "learning_rate": 9.696120396793043e-05, + "loss": 4.3991, + "step": 3585 + }, + { + "epoch": 0.24391901073515423, + "grad_norm": 0.21689128875732422, + "learning_rate": 9.695695746704716e-05, + "loss": 4.2182, + "step": 3590 + }, + { + "epoch": 0.244258730805816, + "grad_norm": 0.1910148561000824, + "learning_rate": 9.695271096616389e-05, + "loss": 4.2426, + "step": 3595 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 0.463371604681015, + "learning_rate": 9.69484644652806e-05, + "loss": 4.1501, + "step": 3600 + }, + { + "epoch": 0.24493817094713954, + "grad_norm": 0.2187051922082901, + "learning_rate": 9.694421796439734e-05, + "loss": 4.2383, + "step": 3605 + }, + { + "epoch": 0.24527789101780134, + "grad_norm": 0.7701082229614258, + "learning_rate": 9.693997146351407e-05, + "loss": 4.2811, + "step": 3610 + }, + { + "epoch": 0.2456176110884631, + "grad_norm": 0.2454994171857834, + "learning_rate": 9.693572496263079e-05, + "loss": 4.3994, + "step": 3615 + }, + { + "epoch": 0.24595733115912488, + "grad_norm": 0.22179093956947327, + "learning_rate": 9.693147846174753e-05, + "loss": 4.1261, + "step": 3620 + }, + { + "epoch": 0.24629705122978665, + "grad_norm": 0.23975835740566254, + "learning_rate": 9.692723196086426e-05, + "loss": 4.308, + "step": 3625 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 0.21660096943378448, + "learning_rate": 9.692298545998097e-05, + "loss": 4.2105, + "step": 3630 + }, + { + "epoch": 0.24697649137111022, + "grad_norm": 0.22534438967704773, + "learning_rate": 9.691873895909771e-05, + "loss": 4.2923, + "step": 3635 + }, + { + "epoch": 0.247316211441772, + "grad_norm": 0.19649091362953186, + "learning_rate": 9.691449245821444e-05, + "loss": 4.4036, + "step": 3640 + }, + { + "epoch": 0.24765593151243376, + "grad_norm": 0.32042601704597473, + "learning_rate": 9.691024595733115e-05, + "loss": 4.4266, + "step": 3645 + }, + { + "epoch": 0.24799565158309553, + "grad_norm": 0.6859878301620483, + "learning_rate": 9.69059994564479e-05, + "loss": 4.2232, + "step": 3650 + }, + { + "epoch": 0.2483353716537573, + "grad_norm": 0.23352079093456268, + "learning_rate": 9.690175295556462e-05, + "loss": 4.0696, + "step": 3655 + }, + { + "epoch": 0.24867509172441907, + "grad_norm": 0.272712767124176, + "learning_rate": 9.689750645468135e-05, + "loss": 4.1759, + "step": 3660 + }, + { + "epoch": 0.24901481179508086, + "grad_norm": 0.22009974718093872, + "learning_rate": 9.689325995379808e-05, + "loss": 4.1973, + "step": 3665 + }, + { + "epoch": 0.24935453186574263, + "grad_norm": 1.4543390274047852, + "learning_rate": 9.68890134529148e-05, + "loss": 4.314, + "step": 3670 + }, + { + "epoch": 0.2496942519364044, + "grad_norm": 0.3941098153591156, + "learning_rate": 9.688476695203154e-05, + "loss": 4.4651, + "step": 3675 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 0.28159454464912415, + "learning_rate": 9.688052045114826e-05, + "loss": 4.2579, + "step": 3680 + }, + { + "epoch": 0.25037369207772797, + "grad_norm": 0.22340060770511627, + "learning_rate": 9.687627395026498e-05, + "loss": 4.2427, + "step": 3685 + }, + { + "epoch": 0.2507134121483897, + "grad_norm": 0.24438177049160004, + "learning_rate": 9.687202744938172e-05, + "loss": 4.1173, + "step": 3690 + }, + { + "epoch": 0.2510531322190515, + "grad_norm": 0.19045932590961456, + "learning_rate": 9.686778094849845e-05, + "loss": 4.3531, + "step": 3695 + }, + { + "epoch": 0.25139285228971325, + "grad_norm": 0.21072247624397278, + "learning_rate": 9.686353444761516e-05, + "loss": 4.3211, + "step": 3700 + }, + { + "epoch": 0.25173257236037505, + "grad_norm": 0.20157082378864288, + "learning_rate": 9.68592879467319e-05, + "loss": 4.3939, + "step": 3705 + }, + { + "epoch": 0.25207229243103685, + "grad_norm": 0.23919062316417694, + "learning_rate": 9.685504144584863e-05, + "loss": 4.2216, + "step": 3710 + }, + { + "epoch": 0.2524120125016986, + "grad_norm": 0.3379192352294922, + "learning_rate": 9.685079494496535e-05, + "loss": 4.15, + "step": 3715 + }, + { + "epoch": 0.2527517325723604, + "grad_norm": 0.2691631615161896, + "learning_rate": 9.684654844408209e-05, + "loss": 4.178, + "step": 3720 + }, + { + "epoch": 0.25309145264302213, + "grad_norm": 0.2460995614528656, + "learning_rate": 9.684230194319882e-05, + "loss": 4.1714, + "step": 3725 + }, + { + "epoch": 0.2534311727136839, + "grad_norm": 0.24896664917469025, + "learning_rate": 9.683805544231553e-05, + "loss": 4.5137, + "step": 3730 + }, + { + "epoch": 0.2537708927843457, + "grad_norm": 0.2998896837234497, + "learning_rate": 9.683380894143227e-05, + "loss": 4.2024, + "step": 3735 + }, + { + "epoch": 0.25411061285500747, + "grad_norm": 0.2170042097568512, + "learning_rate": 9.6829562440549e-05, + "loss": 4.2967, + "step": 3740 + }, + { + "epoch": 0.25445033292566926, + "grad_norm": 0.2534918487071991, + "learning_rate": 9.682531593966571e-05, + "loss": 4.1857, + "step": 3745 + }, + { + "epoch": 0.254790052996331, + "grad_norm": 0.2045327126979828, + "learning_rate": 9.682106943878246e-05, + "loss": 3.9961, + "step": 3750 + }, + { + "epoch": 0.2551297730669928, + "grad_norm": 0.23638245463371277, + "learning_rate": 9.681682293789917e-05, + "loss": 4.3547, + "step": 3755 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 0.22549360990524292, + "learning_rate": 9.68125764370159e-05, + "loss": 4.1726, + "step": 3760 + }, + { + "epoch": 0.25580921320831634, + "grad_norm": 0.24715493619441986, + "learning_rate": 9.680832993613264e-05, + "loss": 4.2836, + "step": 3765 + }, + { + "epoch": 0.25614893327897814, + "grad_norm": 0.33308762311935425, + "learning_rate": 9.680408343524935e-05, + "loss": 4.2667, + "step": 3770 + }, + { + "epoch": 0.2564886533496399, + "grad_norm": 0.4240279197692871, + "learning_rate": 9.679983693436608e-05, + "loss": 4.2174, + "step": 3775 + }, + { + "epoch": 0.2568283734203017, + "grad_norm": 0.26198095083236694, + "learning_rate": 9.679559043348282e-05, + "loss": 4.33, + "step": 3780 + }, + { + "epoch": 0.2571680934909634, + "grad_norm": 0.21898075938224792, + "learning_rate": 9.679134393259954e-05, + "loss": 4.2754, + "step": 3785 + }, + { + "epoch": 0.2575078135616252, + "grad_norm": 0.1936497837305069, + "learning_rate": 9.678709743171627e-05, + "loss": 4.2178, + "step": 3790 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 0.3042401075363159, + "learning_rate": 9.678285093083301e-05, + "loss": 4.2444, + "step": 3795 + }, + { + "epoch": 0.25818725370294876, + "grad_norm": 0.22089192271232605, + "learning_rate": 9.677860442994972e-05, + "loss": 4.1209, + "step": 3800 + }, + { + "epoch": 0.25852697377361056, + "grad_norm": 0.26595672965049744, + "learning_rate": 9.677435792906645e-05, + "loss": 4.3664, + "step": 3805 + }, + { + "epoch": 0.2588666938442723, + "grad_norm": 0.38972827792167664, + "learning_rate": 9.677011142818319e-05, + "loss": 4.3291, + "step": 3810 + }, + { + "epoch": 0.2592064139149341, + "grad_norm": 0.8308687210083008, + "learning_rate": 9.67658649272999e-05, + "loss": 4.3588, + "step": 3815 + }, + { + "epoch": 0.2595461339855959, + "grad_norm": 0.29095426201820374, + "learning_rate": 9.676161842641663e-05, + "loss": 4.2095, + "step": 3820 + }, + { + "epoch": 0.25988585405625764, + "grad_norm": 0.6666823625564575, + "learning_rate": 9.675737192553336e-05, + "loss": 4.249, + "step": 3825 + }, + { + "epoch": 0.26022557412691943, + "grad_norm": 0.2800503373146057, + "learning_rate": 9.675312542465009e-05, + "loss": 4.0724, + "step": 3830 + }, + { + "epoch": 0.2605652941975812, + "grad_norm": 0.31251639127731323, + "learning_rate": 9.674887892376682e-05, + "loss": 4.2177, + "step": 3835 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 0.19203290343284607, + "learning_rate": 9.674463242288355e-05, + "loss": 4.1164, + "step": 3840 + }, + { + "epoch": 0.2612447343389047, + "grad_norm": 0.21506604552268982, + "learning_rate": 9.674038592200027e-05, + "loss": 4.3448, + "step": 3845 + }, + { + "epoch": 0.2615844544095665, + "grad_norm": 0.2286742478609085, + "learning_rate": 9.6736139421117e-05, + "loss": 4.3154, + "step": 3850 + }, + { + "epoch": 0.2619241744802283, + "grad_norm": 0.22341595590114594, + "learning_rate": 9.673189292023373e-05, + "loss": 4.0496, + "step": 3855 + }, + { + "epoch": 0.26226389455089005, + "grad_norm": 3.2631723880767822, + "learning_rate": 9.672764641935046e-05, + "loss": 4.2739, + "step": 3860 + }, + { + "epoch": 0.26260361462155185, + "grad_norm": 0.21692293882369995, + "learning_rate": 9.672339991846719e-05, + "loss": 4.2239, + "step": 3865 + }, + { + "epoch": 0.2629433346922136, + "grad_norm": 0.24772769212722778, + "learning_rate": 9.671915341758391e-05, + "loss": 4.0354, + "step": 3870 + }, + { + "epoch": 0.2632830547628754, + "grad_norm": 0.2190844863653183, + "learning_rate": 9.671490691670064e-05, + "loss": 4.2554, + "step": 3875 + }, + { + "epoch": 0.2636227748335372, + "grad_norm": 0.19608178734779358, + "learning_rate": 9.671066041581737e-05, + "loss": 4.2199, + "step": 3880 + }, + { + "epoch": 0.26396249490419893, + "grad_norm": 0.22313562035560608, + "learning_rate": 9.67064139149341e-05, + "loss": 4.1976, + "step": 3885 + }, + { + "epoch": 0.2643022149748607, + "grad_norm": 0.25129613280296326, + "learning_rate": 9.670216741405083e-05, + "loss": 4.2595, + "step": 3890 + }, + { + "epoch": 0.26464193504552247, + "grad_norm": 0.19212405383586884, + "learning_rate": 9.669792091316755e-05, + "loss": 4.3704, + "step": 3895 + }, + { + "epoch": 0.26498165511618427, + "grad_norm": 0.21401169896125793, + "learning_rate": 9.669367441228428e-05, + "loss": 4.1088, + "step": 3900 + }, + { + "epoch": 0.26532137518684606, + "grad_norm": 0.2625492811203003, + "learning_rate": 9.668942791140101e-05, + "loss": 4.4765, + "step": 3905 + }, + { + "epoch": 0.2656610952575078, + "grad_norm": 0.23690305650234222, + "learning_rate": 9.668518141051774e-05, + "loss": 4.1193, + "step": 3910 + }, + { + "epoch": 0.2660008153281696, + "grad_norm": 0.2038702368736267, + "learning_rate": 9.668093490963447e-05, + "loss": 4.4361, + "step": 3915 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 1.9976972341537476, + "learning_rate": 9.66766884087512e-05, + "loss": 4.448, + "step": 3920 + }, + { + "epoch": 0.26668025546949314, + "grad_norm": 0.2619224488735199, + "learning_rate": 9.667244190786792e-05, + "loss": 4.3061, + "step": 3925 + }, + { + "epoch": 0.2670199755401549, + "grad_norm": 0.17488695681095123, + "learning_rate": 9.666819540698465e-05, + "loss": 3.924, + "step": 3930 + }, + { + "epoch": 0.2673596956108167, + "grad_norm": 0.3555572032928467, + "learning_rate": 9.666394890610138e-05, + "loss": 4.4889, + "step": 3935 + }, + { + "epoch": 0.2676994156814785, + "grad_norm": 0.18651026487350464, + "learning_rate": 9.66597024052181e-05, + "loss": 4.2601, + "step": 3940 + }, + { + "epoch": 0.2680391357521402, + "grad_norm": 0.4118260145187378, + "learning_rate": 9.665545590433483e-05, + "loss": 4.0048, + "step": 3945 + }, + { + "epoch": 0.268378855822802, + "grad_norm": 0.21420472860336304, + "learning_rate": 9.665120940345156e-05, + "loss": 4.5124, + "step": 3950 + }, + { + "epoch": 0.26871857589346376, + "grad_norm": 0.25867247581481934, + "learning_rate": 9.664696290256828e-05, + "loss": 3.9366, + "step": 3955 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 0.9560242295265198, + "learning_rate": 9.664271640168502e-05, + "loss": 4.273, + "step": 3960 + }, + { + "epoch": 0.26939801603478736, + "grad_norm": 0.22547510266304016, + "learning_rate": 9.663846990080175e-05, + "loss": 4.4065, + "step": 3965 + }, + { + "epoch": 0.2697377361054491, + "grad_norm": 0.4761745035648346, + "learning_rate": 9.663422339991846e-05, + "loss": 4.0622, + "step": 3970 + }, + { + "epoch": 0.2700774561761109, + "grad_norm": 0.26078933477401733, + "learning_rate": 9.66299768990352e-05, + "loss": 4.4622, + "step": 3975 + }, + { + "epoch": 0.27041717624677264, + "grad_norm": 0.21970224380493164, + "learning_rate": 9.662573039815193e-05, + "loss": 4.2412, + "step": 3980 + }, + { + "epoch": 0.27075689631743444, + "grad_norm": 0.5211921334266663, + "learning_rate": 9.662148389726865e-05, + "loss": 4.3848, + "step": 3985 + }, + { + "epoch": 0.27109661638809623, + "grad_norm": 0.31244269013404846, + "learning_rate": 9.661723739638539e-05, + "loss": 4.3852, + "step": 3990 + }, + { + "epoch": 0.271436336458758, + "grad_norm": 0.29353293776512146, + "learning_rate": 9.661299089550211e-05, + "loss": 4.091, + "step": 3995 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 0.23753587901592255, + "learning_rate": 9.660874439461884e-05, + "loss": 4.3828, + "step": 4000 + }, + { + "epoch": 0.2721157766000815, + "grad_norm": 0.2865026593208313, + "learning_rate": 9.660449789373557e-05, + "loss": 4.2366, + "step": 4005 + }, + { + "epoch": 0.2724554966707433, + "grad_norm": 0.32267966866493225, + "learning_rate": 9.66002513928523e-05, + "loss": 4.1455, + "step": 4010 + }, + { + "epoch": 0.27279521674140506, + "grad_norm": 0.25711655616760254, + "learning_rate": 9.659600489196903e-05, + "loss": 4.4231, + "step": 4015 + }, + { + "epoch": 0.27313493681206685, + "grad_norm": 0.21570606529712677, + "learning_rate": 9.659175839108575e-05, + "loss": 4.2092, + "step": 4020 + }, + { + "epoch": 0.27347465688272865, + "grad_norm": 0.22739212214946747, + "learning_rate": 9.658751189020247e-05, + "loss": 4.2107, + "step": 4025 + }, + { + "epoch": 0.2738143769533904, + "grad_norm": 0.22698377072811127, + "learning_rate": 9.658326538931921e-05, + "loss": 3.965, + "step": 4030 + }, + { + "epoch": 0.2741540970240522, + "grad_norm": 0.2108132541179657, + "learning_rate": 9.657901888843594e-05, + "loss": 4.3312, + "step": 4035 + }, + { + "epoch": 0.27449381709471393, + "grad_norm": 0.3985457122325897, + "learning_rate": 9.657477238755265e-05, + "loss": 4.1752, + "step": 4040 + }, + { + "epoch": 0.27483353716537573, + "grad_norm": 0.2395816147327423, + "learning_rate": 9.65705258866694e-05, + "loss": 4.6349, + "step": 4045 + }, + { + "epoch": 0.2751732572360375, + "grad_norm": 0.24473239481449127, + "learning_rate": 9.656627938578612e-05, + "loss": 4.4361, + "step": 4050 + }, + { + "epoch": 0.27551297730669927, + "grad_norm": 0.3756929636001587, + "learning_rate": 9.656203288490284e-05, + "loss": 4.2976, + "step": 4055 + }, + { + "epoch": 0.27585269737736107, + "grad_norm": 0.2284708023071289, + "learning_rate": 9.655778638401958e-05, + "loss": 4.2554, + "step": 4060 + }, + { + "epoch": 0.2761924174480228, + "grad_norm": 0.26325544714927673, + "learning_rate": 9.655353988313631e-05, + "loss": 4.152, + "step": 4065 + }, + { + "epoch": 0.2765321375186846, + "grad_norm": 0.20146878063678741, + "learning_rate": 9.654929338225302e-05, + "loss": 4.1147, + "step": 4070 + }, + { + "epoch": 0.2768718575893464, + "grad_norm": 0.21572470664978027, + "learning_rate": 9.654504688136976e-05, + "loss": 4.2792, + "step": 4075 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 0.32967862486839294, + "learning_rate": 9.654080038048649e-05, + "loss": 4.1379, + "step": 4080 + }, + { + "epoch": 0.27755129773066994, + "grad_norm": 0.23224560916423798, + "learning_rate": 9.65365538796032e-05, + "loss": 4.0244, + "step": 4085 + }, + { + "epoch": 0.2778910178013317, + "grad_norm": 0.2101649045944214, + "learning_rate": 9.653230737871995e-05, + "loss": 4.2301, + "step": 4090 + }, + { + "epoch": 0.2782307378719935, + "grad_norm": 0.22994142770767212, + "learning_rate": 9.652806087783666e-05, + "loss": 4.1505, + "step": 4095 + }, + { + "epoch": 0.2785704579426552, + "grad_norm": 0.23458316922187805, + "learning_rate": 9.652381437695339e-05, + "loss": 4.2603, + "step": 4100 + }, + { + "epoch": 0.278910178013317, + "grad_norm": 1.6248669624328613, + "learning_rate": 9.651956787607013e-05, + "loss": 4.1157, + "step": 4105 + }, + { + "epoch": 0.2792498980839788, + "grad_norm": 0.49504292011260986, + "learning_rate": 9.651532137518685e-05, + "loss": 4.2854, + "step": 4110 + }, + { + "epoch": 0.27958961815464056, + "grad_norm": 0.2064412534236908, + "learning_rate": 9.651107487430357e-05, + "loss": 4.2062, + "step": 4115 + }, + { + "epoch": 0.27992933822530236, + "grad_norm": 0.26098746061325073, + "learning_rate": 9.650682837342031e-05, + "loss": 4.1904, + "step": 4120 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 1.5820512771606445, + "learning_rate": 9.650258187253703e-05, + "loss": 4.0721, + "step": 4125 + }, + { + "epoch": 0.2806087783666259, + "grad_norm": 0.28080296516418457, + "learning_rate": 9.649833537165376e-05, + "loss": 4.1324, + "step": 4130 + }, + { + "epoch": 0.2809484984372877, + "grad_norm": 0.186203733086586, + "learning_rate": 9.64940888707705e-05, + "loss": 4.0794, + "step": 4135 + }, + { + "epoch": 0.28128821850794944, + "grad_norm": 0.3637949824333191, + "learning_rate": 9.648984236988721e-05, + "loss": 4.135, + "step": 4140 + }, + { + "epoch": 0.28162793857861124, + "grad_norm": 0.24603451788425446, + "learning_rate": 9.648559586900394e-05, + "loss": 4.3026, + "step": 4145 + }, + { + "epoch": 0.281967658649273, + "grad_norm": 0.21384142339229584, + "learning_rate": 9.648134936812068e-05, + "loss": 4.5439, + "step": 4150 + }, + { + "epoch": 0.2823073787199348, + "grad_norm": 0.23172922432422638, + "learning_rate": 9.64771028672374e-05, + "loss": 4.2762, + "step": 4155 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 0.22065469622612, + "learning_rate": 9.647285636635413e-05, + "loss": 4.3842, + "step": 4160 + }, + { + "epoch": 0.2829868188612583, + "grad_norm": 3.40319561958313, + "learning_rate": 9.646860986547087e-05, + "loss": 4.1898, + "step": 4165 + }, + { + "epoch": 0.2833265389319201, + "grad_norm": 0.20233049988746643, + "learning_rate": 9.646436336458758e-05, + "loss": 4.0764, + "step": 4170 + }, + { + "epoch": 0.28366625900258186, + "grad_norm": 0.586283802986145, + "learning_rate": 9.646011686370431e-05, + "loss": 4.314, + "step": 4175 + }, + { + "epoch": 0.28400597907324365, + "grad_norm": 2.9164395332336426, + "learning_rate": 9.645587036282104e-05, + "loss": 4.0146, + "step": 4180 + }, + { + "epoch": 0.2843456991439054, + "grad_norm": 0.28996986150741577, + "learning_rate": 9.645162386193777e-05, + "loss": 4.2867, + "step": 4185 + }, + { + "epoch": 0.2846854192145672, + "grad_norm": 0.5857749581336975, + "learning_rate": 9.64473773610545e-05, + "loss": 3.9711, + "step": 4190 + }, + { + "epoch": 0.285025139285229, + "grad_norm": 0.2920601963996887, + "learning_rate": 9.644313086017122e-05, + "loss": 4.2076, + "step": 4195 + }, + { + "epoch": 0.28536485935589073, + "grad_norm": 0.5181906223297119, + "learning_rate": 9.643888435928795e-05, + "loss": 4.2119, + "step": 4200 + }, + { + "epoch": 0.28570457942655253, + "grad_norm": 0.28782758116722107, + "learning_rate": 9.643463785840468e-05, + "loss": 4.3009, + "step": 4205 + }, + { + "epoch": 0.28604429949721427, + "grad_norm": 0.23932106792926788, + "learning_rate": 9.64303913575214e-05, + "loss": 4.2094, + "step": 4210 + }, + { + "epoch": 0.28638401956787607, + "grad_norm": 0.3413240611553192, + "learning_rate": 9.642614485663813e-05, + "loss": 4.2305, + "step": 4215 + }, + { + "epoch": 0.28672373963853787, + "grad_norm": 2.542914390563965, + "learning_rate": 9.642189835575486e-05, + "loss": 4.4582, + "step": 4220 + }, + { + "epoch": 0.2870634597091996, + "grad_norm": 0.21945473551750183, + "learning_rate": 9.641765185487159e-05, + "loss": 4.3744, + "step": 4225 + }, + { + "epoch": 0.2874031797798614, + "grad_norm": 0.209177166223526, + "learning_rate": 9.641340535398832e-05, + "loss": 4.2808, + "step": 4230 + }, + { + "epoch": 0.28774289985052315, + "grad_norm": 0.2568071782588959, + "learning_rate": 9.640915885310505e-05, + "loss": 3.9962, + "step": 4235 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 0.31277233362197876, + "learning_rate": 9.640491235222177e-05, + "loss": 4.3054, + "step": 4240 + }, + { + "epoch": 0.28842233999184674, + "grad_norm": 0.24698586761951447, + "learning_rate": 9.64006658513385e-05, + "loss": 4.5058, + "step": 4245 + }, + { + "epoch": 0.2887620600625085, + "grad_norm": 0.23559850454330444, + "learning_rate": 9.639641935045523e-05, + "loss": 4.0805, + "step": 4250 + }, + { + "epoch": 0.2891017801331703, + "grad_norm": 0.22997041046619415, + "learning_rate": 9.639217284957196e-05, + "loss": 3.9677, + "step": 4255 + }, + { + "epoch": 0.289441500203832, + "grad_norm": 0.2484733760356903, + "learning_rate": 9.638792634868869e-05, + "loss": 4.4475, + "step": 4260 + }, + { + "epoch": 0.2897812202744938, + "grad_norm": 0.19737331569194794, + "learning_rate": 9.638367984780541e-05, + "loss": 4.0734, + "step": 4265 + }, + { + "epoch": 0.29012094034515556, + "grad_norm": 0.21674834191799164, + "learning_rate": 9.637943334692214e-05, + "loss": 4.2674, + "step": 4270 + }, + { + "epoch": 0.29046066041581736, + "grad_norm": 1.3918811082839966, + "learning_rate": 9.637518684603887e-05, + "loss": 4.2559, + "step": 4275 + }, + { + "epoch": 0.29080038048647916, + "grad_norm": 0.3141735792160034, + "learning_rate": 9.63709403451556e-05, + "loss": 4.2088, + "step": 4280 + }, + { + "epoch": 0.2911401005571409, + "grad_norm": 0.17463235557079315, + "learning_rate": 9.636669384427233e-05, + "loss": 4.2172, + "step": 4285 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 0.28866469860076904, + "learning_rate": 9.636244734338905e-05, + "loss": 4.1058, + "step": 4290 + }, + { + "epoch": 0.29181954069846444, + "grad_norm": 0.19368840754032135, + "learning_rate": 9.635820084250577e-05, + "loss": 4.3816, + "step": 4295 + }, + { + "epoch": 0.29215926076912624, + "grad_norm": 0.5456646680831909, + "learning_rate": 9.635395434162251e-05, + "loss": 4.004, + "step": 4300 + }, + { + "epoch": 0.29249898083978804, + "grad_norm": 0.5791100263595581, + "learning_rate": 9.634970784073924e-05, + "loss": 3.9493, + "step": 4305 + }, + { + "epoch": 0.2928387009104498, + "grad_norm": 0.18864502012729645, + "learning_rate": 9.634546133985595e-05, + "loss": 4.2906, + "step": 4310 + }, + { + "epoch": 0.2931784209811116, + "grad_norm": 0.6542890071868896, + "learning_rate": 9.63412148389727e-05, + "loss": 4.2943, + "step": 4315 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 0.2639864683151245, + "learning_rate": 9.633696833808942e-05, + "loss": 4.1306, + "step": 4320 + }, + { + "epoch": 0.2938578611224351, + "grad_norm": 0.24884024262428284, + "learning_rate": 9.633272183720614e-05, + "loss": 4.1749, + "step": 4325 + }, + { + "epoch": 0.2941975811930969, + "grad_norm": 1.6146323680877686, + "learning_rate": 9.632847533632288e-05, + "loss": 4.1622, + "step": 4330 + }, + { + "epoch": 0.29453730126375866, + "grad_norm": 0.19550690054893494, + "learning_rate": 9.63242288354396e-05, + "loss": 4.2481, + "step": 4335 + }, + { + "epoch": 0.29487702133442045, + "grad_norm": 0.48053327202796936, + "learning_rate": 9.631998233455633e-05, + "loss": 4.1654, + "step": 4340 + }, + { + "epoch": 0.2952167414050822, + "grad_norm": 0.6082022190093994, + "learning_rate": 9.631573583367306e-05, + "loss": 3.922, + "step": 4345 + }, + { + "epoch": 0.295556461475744, + "grad_norm": 0.410819411277771, + "learning_rate": 9.631148933278979e-05, + "loss": 4.1838, + "step": 4350 + }, + { + "epoch": 0.29589618154640573, + "grad_norm": 0.20050150156021118, + "learning_rate": 9.630724283190652e-05, + "loss": 4.2083, + "step": 4355 + }, + { + "epoch": 0.29623590161706753, + "grad_norm": 0.2641303539276123, + "learning_rate": 9.630299633102325e-05, + "loss": 4.0804, + "step": 4360 + }, + { + "epoch": 0.29657562168772933, + "grad_norm": 0.1961575597524643, + "learning_rate": 9.629874983013997e-05, + "loss": 4.3261, + "step": 4365 + }, + { + "epoch": 0.29691534175839107, + "grad_norm": 0.17782385647296906, + "learning_rate": 9.62945033292567e-05, + "loss": 4.3017, + "step": 4370 + }, + { + "epoch": 0.29725506182905287, + "grad_norm": 1.2139571905136108, + "learning_rate": 9.629025682837343e-05, + "loss": 4.1051, + "step": 4375 + }, + { + "epoch": 0.2975947818997146, + "grad_norm": 0.2687116265296936, + "learning_rate": 9.628601032749014e-05, + "loss": 4.0757, + "step": 4380 + }, + { + "epoch": 0.2979345019703764, + "grad_norm": 0.19845756888389587, + "learning_rate": 9.628176382660689e-05, + "loss": 4.0948, + "step": 4385 + }, + { + "epoch": 0.2982742220410382, + "grad_norm": 0.2517881989479065, + "learning_rate": 9.627751732572361e-05, + "loss": 4.3492, + "step": 4390 + }, + { + "epoch": 0.29861394211169995, + "grad_norm": 0.19443516433238983, + "learning_rate": 9.627327082484033e-05, + "loss": 4.2639, + "step": 4395 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 0.21518000960350037, + "learning_rate": 9.626902432395707e-05, + "loss": 4.0157, + "step": 4400 + }, + { + "epoch": 0.2992933822530235, + "grad_norm": 0.3461875915527344, + "learning_rate": 9.62647778230738e-05, + "loss": 4.2115, + "step": 4405 + }, + { + "epoch": 0.2996331023236853, + "grad_norm": 0.39930984377861023, + "learning_rate": 9.626053132219051e-05, + "loss": 4.3391, + "step": 4410 + }, + { + "epoch": 0.2999728223943471, + "grad_norm": 0.22730666399002075, + "learning_rate": 9.625628482130725e-05, + "loss": 4.4092, + "step": 4415 + }, + { + "epoch": 0.3003125424650088, + "grad_norm": 0.2596425414085388, + "learning_rate": 9.625203832042398e-05, + "loss": 4.2127, + "step": 4420 + }, + { + "epoch": 0.3006522625356706, + "grad_norm": 0.19453459978103638, + "learning_rate": 9.62477918195407e-05, + "loss": 4.338, + "step": 4425 + }, + { + "epoch": 0.30099198260633236, + "grad_norm": 0.5006263852119446, + "learning_rate": 9.624354531865744e-05, + "loss": 4.3887, + "step": 4430 + }, + { + "epoch": 0.30133170267699416, + "grad_norm": 0.5625196099281311, + "learning_rate": 9.623929881777417e-05, + "loss": 4.411, + "step": 4435 + }, + { + "epoch": 0.3016714227476559, + "grad_norm": 0.2295573651790619, + "learning_rate": 9.623505231689088e-05, + "loss": 4.0798, + "step": 4440 + }, + { + "epoch": 0.3020111428183177, + "grad_norm": 0.22928239405155182, + "learning_rate": 9.623080581600762e-05, + "loss": 4.2019, + "step": 4445 + }, + { + "epoch": 0.3023508628889795, + "grad_norm": 0.7567266225814819, + "learning_rate": 9.622655931512434e-05, + "loss": 4.069, + "step": 4450 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 0.3791631758213043, + "learning_rate": 9.622231281424106e-05, + "loss": 4.0739, + "step": 4455 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.21165654063224792, + "learning_rate": 9.62180663133578e-05, + "loss": 4.0467, + "step": 4460 + }, + { + "epoch": 0.3033700231009648, + "grad_norm": 0.2274232804775238, + "learning_rate": 9.621381981247452e-05, + "loss": 4.1655, + "step": 4465 + }, + { + "epoch": 0.3037097431716266, + "grad_norm": 0.2882256507873535, + "learning_rate": 9.620957331159125e-05, + "loss": 4.0162, + "step": 4470 + }, + { + "epoch": 0.3040494632422884, + "grad_norm": 0.21718928217887878, + "learning_rate": 9.620532681070799e-05, + "loss": 4.1421, + "step": 4475 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 0.20482736825942993, + "learning_rate": 9.62010803098247e-05, + "loss": 4.3067, + "step": 4480 + }, + { + "epoch": 0.3047289033836119, + "grad_norm": 0.8644359111785889, + "learning_rate": 9.619683380894143e-05, + "loss": 4.1044, + "step": 4485 + }, + { + "epoch": 0.30506862345427366, + "grad_norm": 0.2353629469871521, + "learning_rate": 9.619258730805817e-05, + "loss": 4.6311, + "step": 4490 + }, + { + "epoch": 0.30540834352493546, + "grad_norm": 0.4746248722076416, + "learning_rate": 9.618834080717489e-05, + "loss": 4.2593, + "step": 4495 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 0.412517786026001, + "learning_rate": 9.618409430629162e-05, + "loss": 4.2203, + "step": 4500 + }, + { + "epoch": 0.306087783666259, + "grad_norm": 0.1828744113445282, + "learning_rate": 9.617984780540836e-05, + "loss": 4.0572, + "step": 4505 + }, + { + "epoch": 0.3064275037369208, + "grad_norm": 0.2963522672653198, + "learning_rate": 9.617560130452507e-05, + "loss": 4.3707, + "step": 4510 + }, + { + "epoch": 0.30676722380758253, + "grad_norm": 0.20454883575439453, + "learning_rate": 9.61713548036418e-05, + "loss": 4.1609, + "step": 4515 + }, + { + "epoch": 0.30710694387824433, + "grad_norm": 0.9719656109809875, + "learning_rate": 9.616710830275853e-05, + "loss": 4.3215, + "step": 4520 + }, + { + "epoch": 0.3074466639489061, + "grad_norm": 0.6544187664985657, + "learning_rate": 9.616286180187526e-05, + "loss": 3.9987, + "step": 4525 + }, + { + "epoch": 0.30778638401956787, + "grad_norm": 0.3224717080593109, + "learning_rate": 9.615861530099198e-05, + "loss": 4.2707, + "step": 4530 + }, + { + "epoch": 0.30812610409022967, + "grad_norm": 0.3716621398925781, + "learning_rate": 9.615436880010871e-05, + "loss": 3.9584, + "step": 4535 + }, + { + "epoch": 0.3084658241608914, + "grad_norm": 0.23874157667160034, + "learning_rate": 9.615012229922544e-05, + "loss": 4.0501, + "step": 4540 + }, + { + "epoch": 0.3088055442315532, + "grad_norm": 0.30314287543296814, + "learning_rate": 9.614587579834217e-05, + "loss": 3.8515, + "step": 4545 + }, + { + "epoch": 0.30914526430221495, + "grad_norm": 0.19416074454784393, + "learning_rate": 9.61416292974589e-05, + "loss": 4.0812, + "step": 4550 + }, + { + "epoch": 0.30948498437287675, + "grad_norm": 0.5242770910263062, + "learning_rate": 9.613738279657562e-05, + "loss": 4.37, + "step": 4555 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 0.20864085853099823, + "learning_rate": 9.613313629569235e-05, + "loss": 4.2711, + "step": 4560 + }, + { + "epoch": 0.3101644245142003, + "grad_norm": 0.20202593505382538, + "learning_rate": 9.612888979480908e-05, + "loss": 4.2423, + "step": 4565 + }, + { + "epoch": 0.3105041445848621, + "grad_norm": 0.5426074266433716, + "learning_rate": 9.612464329392581e-05, + "loss": 4.2333, + "step": 4570 + }, + { + "epoch": 0.3108438646555238, + "grad_norm": 0.21013125777244568, + "learning_rate": 9.612039679304254e-05, + "loss": 4.0275, + "step": 4575 + }, + { + "epoch": 0.3111835847261856, + "grad_norm": 0.3289850056171417, + "learning_rate": 9.611615029215926e-05, + "loss": 4.3321, + "step": 4580 + }, + { + "epoch": 0.3115233047968474, + "grad_norm": 0.20981498062610626, + "learning_rate": 9.611190379127599e-05, + "loss": 4.3237, + "step": 4585 + }, + { + "epoch": 0.31186302486750916, + "grad_norm": 0.2699143886566162, + "learning_rate": 9.610765729039272e-05, + "loss": 4.1158, + "step": 4590 + }, + { + "epoch": 0.31220274493817096, + "grad_norm": 0.27582621574401855, + "learning_rate": 9.610341078950945e-05, + "loss": 4.1325, + "step": 4595 + }, + { + "epoch": 0.3125424650088327, + "grad_norm": 0.2031656950712204, + "learning_rate": 9.609916428862618e-05, + "loss": 4.2928, + "step": 4600 + }, + { + "epoch": 0.3128821850794945, + "grad_norm": 0.17339491844177246, + "learning_rate": 9.60949177877429e-05, + "loss": 4.2046, + "step": 4605 + }, + { + "epoch": 0.3132219051501563, + "grad_norm": 0.2266245186328888, + "learning_rate": 9.609067128685963e-05, + "loss": 4.0126, + "step": 4610 + }, + { + "epoch": 0.31356162522081804, + "grad_norm": 0.2599181830883026, + "learning_rate": 9.608642478597636e-05, + "loss": 4.2055, + "step": 4615 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 0.27137070894241333, + "learning_rate": 9.608217828509309e-05, + "loss": 4.3499, + "step": 4620 + }, + { + "epoch": 0.3142410653621416, + "grad_norm": 0.22108152508735657, + "learning_rate": 9.607793178420982e-05, + "loss": 4.257, + "step": 4625 + }, + { + "epoch": 0.3145807854328034, + "grad_norm": 0.2058125138282776, + "learning_rate": 9.607368528332654e-05, + "loss": 3.919, + "step": 4630 + }, + { + "epoch": 0.3149205055034651, + "grad_norm": 0.4298430383205414, + "learning_rate": 9.606943878244327e-05, + "loss": 4.0876, + "step": 4635 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 0.19494764506816864, + "learning_rate": 9.606519228156e-05, + "loss": 4.2404, + "step": 4640 + }, + { + "epoch": 0.3155999456447887, + "grad_norm": 0.35915589332580566, + "learning_rate": 9.606094578067673e-05, + "loss": 4.1185, + "step": 4645 + }, + { + "epoch": 0.31593966571545046, + "grad_norm": 0.395074725151062, + "learning_rate": 9.605669927979344e-05, + "loss": 4.2395, + "step": 4650 + }, + { + "epoch": 0.31627938578611225, + "grad_norm": 0.3247855007648468, + "learning_rate": 9.605245277891018e-05, + "loss": 3.9987, + "step": 4655 + }, + { + "epoch": 0.316619105856774, + "grad_norm": 0.18057437241077423, + "learning_rate": 9.604820627802691e-05, + "loss": 4.3912, + "step": 4660 + }, + { + "epoch": 0.3169588259274358, + "grad_norm": 0.25986525416374207, + "learning_rate": 9.604395977714363e-05, + "loss": 4.2514, + "step": 4665 + }, + { + "epoch": 0.3172985459980976, + "grad_norm": 0.2058873325586319, + "learning_rate": 9.603971327626037e-05, + "loss": 4.3795, + "step": 4670 + }, + { + "epoch": 0.31763826606875933, + "grad_norm": 0.30512815713882446, + "learning_rate": 9.60354667753771e-05, + "loss": 4.2392, + "step": 4675 + }, + { + "epoch": 0.31797798613942113, + "grad_norm": 0.18412257730960846, + "learning_rate": 9.603122027449382e-05, + "loss": 4.2086, + "step": 4680 + }, + { + "epoch": 0.3183177062100829, + "grad_norm": 0.2081792950630188, + "learning_rate": 9.602697377361055e-05, + "loss": 4.3573, + "step": 4685 + }, + { + "epoch": 0.31865742628074467, + "grad_norm": 0.3510904014110565, + "learning_rate": 9.602272727272728e-05, + "loss": 4.3618, + "step": 4690 + }, + { + "epoch": 0.31899714635140647, + "grad_norm": 0.37145447731018066, + "learning_rate": 9.601848077184401e-05, + "loss": 4.301, + "step": 4695 + }, + { + "epoch": 0.3193368664220682, + "grad_norm": 0.23616161942481995, + "learning_rate": 9.601423427096074e-05, + "loss": 4.2158, + "step": 4700 + }, + { + "epoch": 0.31967658649273, + "grad_norm": 1.332667589187622, + "learning_rate": 9.600998777007746e-05, + "loss": 4.0569, + "step": 4705 + }, + { + "epoch": 0.32001630656339175, + "grad_norm": 0.21901053190231323, + "learning_rate": 9.600574126919419e-05, + "loss": 4.0555, + "step": 4710 + }, + { + "epoch": 0.32035602663405355, + "grad_norm": 0.18509267270565033, + "learning_rate": 9.600149476831092e-05, + "loss": 4.1219, + "step": 4715 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 0.17567375302314758, + "learning_rate": 9.599724826742764e-05, + "loss": 4.2921, + "step": 4720 + }, + { + "epoch": 0.3210354667753771, + "grad_norm": 0.17881886661052704, + "learning_rate": 9.599300176654438e-05, + "loss": 4.2577, + "step": 4725 + }, + { + "epoch": 0.3213751868460389, + "grad_norm": 0.2836432158946991, + "learning_rate": 9.59887552656611e-05, + "loss": 4.1362, + "step": 4730 + }, + { + "epoch": 0.3217149069167006, + "grad_norm": 0.43188366293907166, + "learning_rate": 9.598450876477782e-05, + "loss": 3.7939, + "step": 4735 + }, + { + "epoch": 0.3220546269873624, + "grad_norm": 0.21462103724479675, + "learning_rate": 9.598026226389456e-05, + "loss": 4.166, + "step": 4740 + }, + { + "epoch": 0.32239434705802417, + "grad_norm": 0.16883856058120728, + "learning_rate": 9.597601576301129e-05, + "loss": 3.9182, + "step": 4745 + }, + { + "epoch": 0.32273406712868596, + "grad_norm": 0.32869285345077515, + "learning_rate": 9.5971769262128e-05, + "loss": 4.0832, + "step": 4750 + }, + { + "epoch": 0.32307378719934776, + "grad_norm": 0.22861923277378082, + "learning_rate": 9.596752276124474e-05, + "loss": 4.344, + "step": 4755 + }, + { + "epoch": 0.3234135072700095, + "grad_norm": 0.3567066192626953, + "learning_rate": 9.596327626036147e-05, + "loss": 4.0191, + "step": 4760 + }, + { + "epoch": 0.3237532273406713, + "grad_norm": 0.23706111311912537, + "learning_rate": 9.595902975947819e-05, + "loss": 4.1505, + "step": 4765 + }, + { + "epoch": 0.32409294741133304, + "grad_norm": 0.22384607791900635, + "learning_rate": 9.595478325859493e-05, + "loss": 4.1113, + "step": 4770 + }, + { + "epoch": 0.32443266748199484, + "grad_norm": 0.1979631930589676, + "learning_rate": 9.595053675771166e-05, + "loss": 4.0686, + "step": 4775 + }, + { + "epoch": 0.32477238755265664, + "grad_norm": 0.3480912148952484, + "learning_rate": 9.594629025682837e-05, + "loss": 4.2106, + "step": 4780 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 0.23982703685760498, + "learning_rate": 9.594204375594511e-05, + "loss": 4.2114, + "step": 4785 + }, + { + "epoch": 0.3254518276939802, + "grad_norm": 1.1133886575698853, + "learning_rate": 9.593779725506184e-05, + "loss": 3.894, + "step": 4790 + }, + { + "epoch": 0.3257915477646419, + "grad_norm": 0.1953042596578598, + "learning_rate": 9.593355075417856e-05, + "loss": 4.1392, + "step": 4795 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 0.24198144674301147, + "learning_rate": 9.59293042532953e-05, + "loss": 3.9054, + "step": 4800 + }, + { + "epoch": 0.32647098790596546, + "grad_norm": 0.5570387840270996, + "learning_rate": 9.592505775241201e-05, + "loss": 3.9928, + "step": 4805 + }, + { + "epoch": 0.32681070797662726, + "grad_norm": 0.3616771399974823, + "learning_rate": 9.592081125152874e-05, + "loss": 4.3527, + "step": 4810 + }, + { + "epoch": 0.32715042804728905, + "grad_norm": 0.7734106183052063, + "learning_rate": 9.591656475064548e-05, + "loss": 3.9594, + "step": 4815 + }, + { + "epoch": 0.3274901481179508, + "grad_norm": 0.23176033794879913, + "learning_rate": 9.59123182497622e-05, + "loss": 4.2269, + "step": 4820 + }, + { + "epoch": 0.3278298681886126, + "grad_norm": 0.22131270170211792, + "learning_rate": 9.590807174887892e-05, + "loss": 4.205, + "step": 4825 + }, + { + "epoch": 0.32816958825927434, + "grad_norm": 0.1967305839061737, + "learning_rate": 9.590382524799566e-05, + "loss": 4.1081, + "step": 4830 + }, + { + "epoch": 0.32850930832993613, + "grad_norm": 0.20413081347942352, + "learning_rate": 9.589957874711238e-05, + "loss": 4.2146, + "step": 4835 + }, + { + "epoch": 0.32884902840059793, + "grad_norm": 0.237589493393898, + "learning_rate": 9.589533224622911e-05, + "loss": 4.321, + "step": 4840 + }, + { + "epoch": 0.3291887484712597, + "grad_norm": 0.3263246417045593, + "learning_rate": 9.589108574534585e-05, + "loss": 4.2572, + "step": 4845 + }, + { + "epoch": 0.32952846854192147, + "grad_norm": 0.2639991343021393, + "learning_rate": 9.588683924446256e-05, + "loss": 4.3049, + "step": 4850 + }, + { + "epoch": 0.3298681886125832, + "grad_norm": 0.33223286271095276, + "learning_rate": 9.588259274357929e-05, + "loss": 4.3715, + "step": 4855 + }, + { + "epoch": 0.330207908683245, + "grad_norm": 0.22897298634052277, + "learning_rate": 9.587834624269603e-05, + "loss": 3.93, + "step": 4860 + }, + { + "epoch": 0.3305476287539068, + "grad_norm": 0.3667212128639221, + "learning_rate": 9.587409974181275e-05, + "loss": 4.3149, + "step": 4865 + }, + { + "epoch": 0.33088734882456855, + "grad_norm": 0.22442007064819336, + "learning_rate": 9.586985324092948e-05, + "loss": 4.4189, + "step": 4870 + }, + { + "epoch": 0.33122706889523035, + "grad_norm": 0.18334710597991943, + "learning_rate": 9.58656067400462e-05, + "loss": 3.9457, + "step": 4875 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 0.34593579173088074, + "learning_rate": 9.586136023916293e-05, + "loss": 3.9007, + "step": 4880 + }, + { + "epoch": 0.3319065090365539, + "grad_norm": 0.24884456396102905, + "learning_rate": 9.585711373827966e-05, + "loss": 4.198, + "step": 4885 + }, + { + "epoch": 0.33224622910721563, + "grad_norm": 0.6150047183036804, + "learning_rate": 9.585286723739639e-05, + "loss": 4.1738, + "step": 4890 + }, + { + "epoch": 0.3325859491778774, + "grad_norm": 0.18449634313583374, + "learning_rate": 9.584862073651312e-05, + "loss": 4.3916, + "step": 4895 + }, + { + "epoch": 0.3329256692485392, + "grad_norm": 0.45811331272125244, + "learning_rate": 9.584437423562984e-05, + "loss": 4.0894, + "step": 4900 + }, + { + "epoch": 0.33326538931920097, + "grad_norm": 0.9670056700706482, + "learning_rate": 9.584012773474657e-05, + "loss": 4.0947, + "step": 4905 + }, + { + "epoch": 0.33360510938986276, + "grad_norm": 0.2828699052333832, + "learning_rate": 9.58358812338633e-05, + "loss": 4.27, + "step": 4910 + }, + { + "epoch": 0.3339448294605245, + "grad_norm": 0.22989730536937714, + "learning_rate": 9.583163473298003e-05, + "loss": 4.2658, + "step": 4915 + }, + { + "epoch": 0.3342845495311863, + "grad_norm": 0.4018714427947998, + "learning_rate": 9.582738823209676e-05, + "loss": 4.1278, + "step": 4920 + }, + { + "epoch": 0.3346242696018481, + "grad_norm": 0.5296480059623718, + "learning_rate": 9.582314173121348e-05, + "loss": 4.1865, + "step": 4925 + }, + { + "epoch": 0.33496398967250984, + "grad_norm": 0.2477627843618393, + "learning_rate": 9.581889523033021e-05, + "loss": 4.4925, + "step": 4930 + }, + { + "epoch": 0.33530370974317164, + "grad_norm": 0.24414370954036713, + "learning_rate": 9.581464872944694e-05, + "loss": 4.1706, + "step": 4935 + }, + { + "epoch": 0.3356434298138334, + "grad_norm": 0.15603503584861755, + "learning_rate": 9.581040222856367e-05, + "loss": 4.0303, + "step": 4940 + }, + { + "epoch": 0.3359831498844952, + "grad_norm": 0.17460083961486816, + "learning_rate": 9.58061557276804e-05, + "loss": 4.1986, + "step": 4945 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 0.2035687118768692, + "learning_rate": 9.580190922679712e-05, + "loss": 4.3514, + "step": 4950 + }, + { + "epoch": 0.3366625900258187, + "grad_norm": 0.4037059545516968, + "learning_rate": 9.579766272591385e-05, + "loss": 4.1645, + "step": 4955 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 0.22527378797531128, + "learning_rate": 9.579341622503058e-05, + "loss": 4.1399, + "step": 4960 + }, + { + "epoch": 0.33734203016714226, + "grad_norm": 0.27611520886421204, + "learning_rate": 9.578916972414731e-05, + "loss": 4.1679, + "step": 4965 + }, + { + "epoch": 0.33768175023780406, + "grad_norm": 0.26130980253219604, + "learning_rate": 9.578492322326404e-05, + "loss": 3.9786, + "step": 4970 + }, + { + "epoch": 0.3380214703084658, + "grad_norm": 0.20753854513168335, + "learning_rate": 9.578067672238076e-05, + "loss": 4.0629, + "step": 4975 + }, + { + "epoch": 0.3383611903791276, + "grad_norm": 0.198018416762352, + "learning_rate": 9.577643022149749e-05, + "loss": 4.0956, + "step": 4980 + }, + { + "epoch": 0.3387009104497894, + "grad_norm": 0.21650417149066925, + "learning_rate": 9.577218372061422e-05, + "loss": 4.0965, + "step": 4985 + }, + { + "epoch": 0.33904063052045114, + "grad_norm": 0.3832937777042389, + "learning_rate": 9.576793721973095e-05, + "loss": 4.1086, + "step": 4990 + }, + { + "epoch": 0.33938035059111293, + "grad_norm": 0.23973116278648376, + "learning_rate": 9.576369071884768e-05, + "loss": 4.3533, + "step": 4995 + }, + { + "epoch": 0.3397200706617747, + "grad_norm": 0.3817537724971771, + "learning_rate": 9.57594442179644e-05, + "loss": 4.2127, + "step": 5000 + }, + { + "epoch": 0.3400597907324365, + "grad_norm": 0.2038896679878235, + "learning_rate": 9.575519771708112e-05, + "loss": 4.3264, + "step": 5005 + }, + { + "epoch": 0.34039951080309827, + "grad_norm": 0.23982146382331848, + "learning_rate": 9.575095121619786e-05, + "loss": 4.0702, + "step": 5010 + }, + { + "epoch": 0.34073923087376, + "grad_norm": 0.30504170060157776, + "learning_rate": 9.574670471531459e-05, + "loss": 3.9892, + "step": 5015 + }, + { + "epoch": 0.3410789509444218, + "grad_norm": 0.2355673611164093, + "learning_rate": 9.574245821443132e-05, + "loss": 3.9423, + "step": 5020 + }, + { + "epoch": 0.34141867101508355, + "grad_norm": 0.22874650359153748, + "learning_rate": 9.573821171354804e-05, + "loss": 3.9142, + "step": 5025 + }, + { + "epoch": 0.34175839108574535, + "grad_norm": 0.21437452733516693, + "learning_rate": 9.573396521266477e-05, + "loss": 4.1365, + "step": 5030 + }, + { + "epoch": 0.34209811115640715, + "grad_norm": 0.3095625936985016, + "learning_rate": 9.57297187117815e-05, + "loss": 4.1468, + "step": 5035 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 0.2202177494764328, + "learning_rate": 9.572547221089823e-05, + "loss": 4.01, + "step": 5040 + }, + { + "epoch": 0.3427775512977307, + "grad_norm": 0.32474327087402344, + "learning_rate": 9.572122571001496e-05, + "loss": 4.0868, + "step": 5045 + }, + { + "epoch": 0.34311727136839243, + "grad_norm": 0.2622280716896057, + "learning_rate": 9.571697920913168e-05, + "loss": 4.2103, + "step": 5050 + }, + { + "epoch": 0.3434569914390542, + "grad_norm": 0.7448468208312988, + "learning_rate": 9.571273270824841e-05, + "loss": 4.2945, + "step": 5055 + }, + { + "epoch": 0.34379671150971597, + "grad_norm": 0.2269594371318817, + "learning_rate": 9.570848620736514e-05, + "loss": 3.9127, + "step": 5060 + }, + { + "epoch": 0.34413643158037777, + "grad_norm": 0.2645524740219116, + "learning_rate": 9.570423970648187e-05, + "loss": 3.9423, + "step": 5065 + }, + { + "epoch": 0.34447615165103956, + "grad_norm": 0.246607705950737, + "learning_rate": 9.56999932055986e-05, + "loss": 4.1347, + "step": 5070 + }, + { + "epoch": 0.3448158717217013, + "grad_norm": 0.19342374801635742, + "learning_rate": 9.569574670471531e-05, + "loss": 4.0787, + "step": 5075 + }, + { + "epoch": 0.3451555917923631, + "grad_norm": 0.31122297048568726, + "learning_rate": 9.569150020383205e-05, + "loss": 4.1522, + "step": 5080 + }, + { + "epoch": 0.34549531186302485, + "grad_norm": 1.0425968170166016, + "learning_rate": 9.568725370294878e-05, + "loss": 3.995, + "step": 5085 + }, + { + "epoch": 0.34583503193368664, + "grad_norm": 0.22739467024803162, + "learning_rate": 9.56830072020655e-05, + "loss": 3.9042, + "step": 5090 + }, + { + "epoch": 0.34617475200434844, + "grad_norm": 0.21344348788261414, + "learning_rate": 9.567876070118224e-05, + "loss": 4.1351, + "step": 5095 + }, + { + "epoch": 0.3465144720750102, + "grad_norm": 0.2140883356332779, + "learning_rate": 9.567451420029896e-05, + "loss": 3.9542, + "step": 5100 + }, + { + "epoch": 0.346854192145672, + "grad_norm": 0.1795925498008728, + "learning_rate": 9.567026769941568e-05, + "loss": 4.061, + "step": 5105 + }, + { + "epoch": 0.3471939122163337, + "grad_norm": 0.37569665908813477, + "learning_rate": 9.566602119853242e-05, + "loss": 4.3051, + "step": 5110 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 0.19528646767139435, + "learning_rate": 9.566177469764915e-05, + "loss": 4.1591, + "step": 5115 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 0.1688258945941925, + "learning_rate": 9.565752819676586e-05, + "loss": 3.9876, + "step": 5120 + }, + { + "epoch": 0.34821307242831906, + "grad_norm": 0.17164097726345062, + "learning_rate": 9.56532816958826e-05, + "loss": 4.2081, + "step": 5125 + }, + { + "epoch": 0.34855279249898086, + "grad_norm": 0.18012307584285736, + "learning_rate": 9.564903519499933e-05, + "loss": 3.9195, + "step": 5130 + }, + { + "epoch": 0.3488925125696426, + "grad_norm": 0.2118469625711441, + "learning_rate": 9.564478869411605e-05, + "loss": 4.1027, + "step": 5135 + }, + { + "epoch": 0.3492322326403044, + "grad_norm": 0.21049754321575165, + "learning_rate": 9.564054219323279e-05, + "loss": 4.0254, + "step": 5140 + }, + { + "epoch": 0.34957195271096614, + "grad_norm": 0.5723572969436646, + "learning_rate": 9.563629569234952e-05, + "loss": 3.8725, + "step": 5145 + }, + { + "epoch": 0.34991167278162794, + "grad_norm": 0.22618575394153595, + "learning_rate": 9.563204919146623e-05, + "loss": 4.1794, + "step": 5150 + }, + { + "epoch": 0.35025139285228973, + "grad_norm": 0.1874348223209381, + "learning_rate": 9.562780269058297e-05, + "loss": 3.9515, + "step": 5155 + }, + { + "epoch": 0.3505911129229515, + "grad_norm": 0.2910906672477722, + "learning_rate": 9.562355618969969e-05, + "loss": 4.2863, + "step": 5160 + }, + { + "epoch": 0.3509308329936133, + "grad_norm": 0.19384640455245972, + "learning_rate": 9.561930968881641e-05, + "loss": 4.2367, + "step": 5165 + }, + { + "epoch": 0.351270553064275, + "grad_norm": 0.15815407037734985, + "learning_rate": 9.561506318793316e-05, + "loss": 3.9907, + "step": 5170 + }, + { + "epoch": 0.3516102731349368, + "grad_norm": 0.2070821076631546, + "learning_rate": 9.561081668704987e-05, + "loss": 3.9338, + "step": 5175 + }, + { + "epoch": 0.3519499932055986, + "grad_norm": 0.24302656948566437, + "learning_rate": 9.56065701861666e-05, + "loss": 4.2737, + "step": 5180 + }, + { + "epoch": 0.35228971327626035, + "grad_norm": 0.24706007540225983, + "learning_rate": 9.560232368528334e-05, + "loss": 4.23, + "step": 5185 + }, + { + "epoch": 0.35262943334692215, + "grad_norm": 0.5972357988357544, + "learning_rate": 9.559807718440005e-05, + "loss": 4.157, + "step": 5190 + }, + { + "epoch": 0.3529691534175839, + "grad_norm": 1.1296205520629883, + "learning_rate": 9.559383068351678e-05, + "loss": 4.1556, + "step": 5195 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 0.19614671170711517, + "learning_rate": 9.558958418263352e-05, + "loss": 4.1069, + "step": 5200 + }, + { + "epoch": 0.3536485935589075, + "grad_norm": 0.2329636514186859, + "learning_rate": 9.558533768175024e-05, + "loss": 4.0195, + "step": 5205 + }, + { + "epoch": 0.35398831362956923, + "grad_norm": 0.3606981635093689, + "learning_rate": 9.558109118086697e-05, + "loss": 4.0518, + "step": 5210 + }, + { + "epoch": 0.354328033700231, + "grad_norm": 0.212651789188385, + "learning_rate": 9.557684467998371e-05, + "loss": 4.1561, + "step": 5215 + }, + { + "epoch": 0.35466775377089277, + "grad_norm": 0.2813778817653656, + "learning_rate": 9.557259817910042e-05, + "loss": 4.1709, + "step": 5220 + }, + { + "epoch": 0.35500747384155457, + "grad_norm": 0.1836264729499817, + "learning_rate": 9.556835167821715e-05, + "loss": 3.9968, + "step": 5225 + }, + { + "epoch": 0.3553471939122163, + "grad_norm": 0.24313072860240936, + "learning_rate": 9.556410517733388e-05, + "loss": 4.1021, + "step": 5230 + }, + { + "epoch": 0.3556869139828781, + "grad_norm": 0.21879182755947113, + "learning_rate": 9.55598586764506e-05, + "loss": 4.0678, + "step": 5235 + }, + { + "epoch": 0.3560266340535399, + "grad_norm": 0.19957928359508514, + "learning_rate": 9.555561217556733e-05, + "loss": 4.1579, + "step": 5240 + }, + { + "epoch": 0.35636635412420165, + "grad_norm": 0.2043609321117401, + "learning_rate": 9.555136567468406e-05, + "loss": 3.9197, + "step": 5245 + }, + { + "epoch": 0.35670607419486344, + "grad_norm": 0.1743493527173996, + "learning_rate": 9.554711917380079e-05, + "loss": 4.3003, + "step": 5250 + }, + { + "epoch": 0.3570457942655252, + "grad_norm": 0.3488079309463501, + "learning_rate": 9.554287267291752e-05, + "loss": 4.0098, + "step": 5255 + }, + { + "epoch": 0.357385514336187, + "grad_norm": 0.2585020959377289, + "learning_rate": 9.553862617203425e-05, + "loss": 4.0749, + "step": 5260 + }, + { + "epoch": 0.3577252344068488, + "grad_norm": 0.22201067209243774, + "learning_rate": 9.553437967115097e-05, + "loss": 3.7807, + "step": 5265 + }, + { + "epoch": 0.3580649544775105, + "grad_norm": 0.4632178843021393, + "learning_rate": 9.55301331702677e-05, + "loss": 4.1557, + "step": 5270 + }, + { + "epoch": 0.3584046745481723, + "grad_norm": 0.4491996765136719, + "learning_rate": 9.552588666938443e-05, + "loss": 4.2286, + "step": 5275 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 0.22126582264900208, + "learning_rate": 9.552164016850116e-05, + "loss": 3.9587, + "step": 5280 + }, + { + "epoch": 0.35908411468949586, + "grad_norm": 0.20614346861839294, + "learning_rate": 9.551739366761789e-05, + "loss": 4.1402, + "step": 5285 + }, + { + "epoch": 0.35942383476015766, + "grad_norm": 0.2311069220304489, + "learning_rate": 9.551314716673461e-05, + "loss": 4.1504, + "step": 5290 + }, + { + "epoch": 0.3597635548308194, + "grad_norm": 0.20152784883975983, + "learning_rate": 9.550890066585134e-05, + "loss": 4.2783, + "step": 5295 + }, + { + "epoch": 0.3601032749014812, + "grad_norm": 0.2334737479686737, + "learning_rate": 9.550465416496807e-05, + "loss": 4.2849, + "step": 5300 + }, + { + "epoch": 0.36044299497214294, + "grad_norm": 0.17994678020477295, + "learning_rate": 9.55004076640848e-05, + "loss": 4.3848, + "step": 5305 + }, + { + "epoch": 0.36078271504280474, + "grad_norm": 0.2141488939523697, + "learning_rate": 9.549616116320153e-05, + "loss": 4.309, + "step": 5310 + }, + { + "epoch": 0.3611224351134665, + "grad_norm": 0.2028026133775711, + "learning_rate": 9.549191466231825e-05, + "loss": 4.4349, + "step": 5315 + }, + { + "epoch": 0.3614621551841283, + "grad_norm": 0.1849725842475891, + "learning_rate": 9.548766816143498e-05, + "loss": 3.929, + "step": 5320 + }, + { + "epoch": 0.3618018752547901, + "grad_norm": 0.20538243651390076, + "learning_rate": 9.548342166055171e-05, + "loss": 4.2426, + "step": 5325 + }, + { + "epoch": 0.3621415953254518, + "grad_norm": 0.22145512700080872, + "learning_rate": 9.547917515966844e-05, + "loss": 4.1321, + "step": 5330 + }, + { + "epoch": 0.3624813153961136, + "grad_norm": 0.24293570220470428, + "learning_rate": 9.547492865878517e-05, + "loss": 4.4132, + "step": 5335 + }, + { + "epoch": 0.36282103546677535, + "grad_norm": 3.3797924518585205, + "learning_rate": 9.54706821579019e-05, + "loss": 3.9775, + "step": 5340 + }, + { + "epoch": 0.36316075553743715, + "grad_norm": 0.22349077463150024, + "learning_rate": 9.546643565701862e-05, + "loss": 4.095, + "step": 5345 + }, + { + "epoch": 0.36350047560809895, + "grad_norm": 0.22708293795585632, + "learning_rate": 9.546218915613535e-05, + "loss": 4.1219, + "step": 5350 + }, + { + "epoch": 0.3638401956787607, + "grad_norm": 0.40382349491119385, + "learning_rate": 9.545794265525208e-05, + "loss": 3.8157, + "step": 5355 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 0.1983574777841568, + "learning_rate": 9.54536961543688e-05, + "loss": 4.0346, + "step": 5360 + }, + { + "epoch": 0.36451963582008423, + "grad_norm": 0.3324495851993561, + "learning_rate": 9.544944965348553e-05, + "loss": 4.2765, + "step": 5365 + }, + { + "epoch": 0.36485935589074603, + "grad_norm": 0.2055937945842743, + "learning_rate": 9.544520315260226e-05, + "loss": 3.8416, + "step": 5370 + }, + { + "epoch": 0.3651990759614078, + "grad_norm": 0.18161867558956146, + "learning_rate": 9.544095665171899e-05, + "loss": 4.0468, + "step": 5375 + }, + { + "epoch": 0.36553879603206957, + "grad_norm": 0.2383970320224762, + "learning_rate": 9.543671015083572e-05, + "loss": 4.0359, + "step": 5380 + }, + { + "epoch": 0.36587851610273137, + "grad_norm": 0.1611696481704712, + "learning_rate": 9.543246364995245e-05, + "loss": 4.1817, + "step": 5385 + }, + { + "epoch": 0.3662182361733931, + "grad_norm": 0.3070268929004669, + "learning_rate": 9.542821714906917e-05, + "loss": 4.0901, + "step": 5390 + }, + { + "epoch": 0.3665579562440549, + "grad_norm": 0.17862237989902496, + "learning_rate": 9.54239706481859e-05, + "loss": 3.9993, + "step": 5395 + }, + { + "epoch": 0.36689767631471665, + "grad_norm": 0.30012592673301697, + "learning_rate": 9.541972414730263e-05, + "loss": 4.0737, + "step": 5400 + }, + { + "epoch": 0.36723739638537845, + "grad_norm": 15.268974304199219, + "learning_rate": 9.541547764641936e-05, + "loss": 3.8387, + "step": 5405 + }, + { + "epoch": 0.36757711645604024, + "grad_norm": 0.19847442209720612, + "learning_rate": 9.541123114553609e-05, + "loss": 4.0158, + "step": 5410 + }, + { + "epoch": 0.367916836526702, + "grad_norm": 1.345680832862854, + "learning_rate": 9.540698464465281e-05, + "loss": 4.1187, + "step": 5415 + }, + { + "epoch": 0.3682565565973638, + "grad_norm": 0.15424399077892303, + "learning_rate": 9.540273814376954e-05, + "loss": 4.1746, + "step": 5420 + }, + { + "epoch": 0.3685962766680255, + "grad_norm": 0.47641104459762573, + "learning_rate": 9.539849164288627e-05, + "loss": 3.8821, + "step": 5425 + }, + { + "epoch": 0.3689359967386873, + "grad_norm": 0.27253925800323486, + "learning_rate": 9.539424514200299e-05, + "loss": 4.1306, + "step": 5430 + }, + { + "epoch": 0.3692757168093491, + "grad_norm": 0.5784019231796265, + "learning_rate": 9.538999864111973e-05, + "loss": 4.1907, + "step": 5435 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 0.21910730004310608, + "learning_rate": 9.538575214023645e-05, + "loss": 3.9055, + "step": 5440 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 0.195495143532753, + "learning_rate": 9.538150563935317e-05, + "loss": 4.1521, + "step": 5445 + }, + { + "epoch": 0.3702948770213344, + "grad_norm": 0.20794479548931122, + "learning_rate": 9.537725913846991e-05, + "loss": 3.9962, + "step": 5450 + }, + { + "epoch": 0.3706345970919962, + "grad_norm": 1.305681586265564, + "learning_rate": 9.537301263758664e-05, + "loss": 4.173, + "step": 5455 + }, + { + "epoch": 0.370974317162658, + "grad_norm": 0.1818116158246994, + "learning_rate": 9.536876613670335e-05, + "loss": 4.0849, + "step": 5460 + }, + { + "epoch": 0.37131403723331974, + "grad_norm": 0.38611772656440735, + "learning_rate": 9.53645196358201e-05, + "loss": 3.9796, + "step": 5465 + }, + { + "epoch": 0.37165375730398154, + "grad_norm": 0.2650381922721863, + "learning_rate": 9.536027313493682e-05, + "loss": 4.1452, + "step": 5470 + }, + { + "epoch": 0.3719934773746433, + "grad_norm": 0.2208934873342514, + "learning_rate": 9.535602663405354e-05, + "loss": 4.2661, + "step": 5475 + }, + { + "epoch": 0.3723331974453051, + "grad_norm": 0.20266486704349518, + "learning_rate": 9.535178013317028e-05, + "loss": 4.207, + "step": 5480 + }, + { + "epoch": 0.3726729175159668, + "grad_norm": 0.21677860617637634, + "learning_rate": 9.5347533632287e-05, + "loss": 4.3784, + "step": 5485 + }, + { + "epoch": 0.3730126375866286, + "grad_norm": 0.33356210589408875, + "learning_rate": 9.534328713140372e-05, + "loss": 4.0518, + "step": 5490 + }, + { + "epoch": 0.3733523576572904, + "grad_norm": 0.2748437225818634, + "learning_rate": 9.533904063052046e-05, + "loss": 4.0669, + "step": 5495 + }, + { + "epoch": 0.37369207772795215, + "grad_norm": 0.22416415810585022, + "learning_rate": 9.533479412963718e-05, + "loss": 4.1892, + "step": 5500 + }, + { + "epoch": 0.37403179779861395, + "grad_norm": 0.20975516736507416, + "learning_rate": 9.53305476287539e-05, + "loss": 4.2146, + "step": 5505 + }, + { + "epoch": 0.3743715178692757, + "grad_norm": 0.1820031851530075, + "learning_rate": 9.532630112787065e-05, + "loss": 4.1054, + "step": 5510 + }, + { + "epoch": 0.3747112379399375, + "grad_norm": 0.2546124756336212, + "learning_rate": 9.532205462698736e-05, + "loss": 4.1923, + "step": 5515 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 0.5577950477600098, + "learning_rate": 9.531780812610409e-05, + "loss": 3.851, + "step": 5520 + }, + { + "epoch": 0.37539067808126103, + "grad_norm": 0.2909904718399048, + "learning_rate": 9.531356162522083e-05, + "loss": 3.8513, + "step": 5525 + }, + { + "epoch": 0.37573039815192283, + "grad_norm": 0.20286774635314941, + "learning_rate": 9.530931512433755e-05, + "loss": 3.8725, + "step": 5530 + }, + { + "epoch": 0.37607011822258457, + "grad_norm": 0.20398856699466705, + "learning_rate": 9.530506862345427e-05, + "loss": 4.3243, + "step": 5535 + }, + { + "epoch": 0.37640983829324637, + "grad_norm": 0.1849180907011032, + "learning_rate": 9.530082212257101e-05, + "loss": 4.1106, + "step": 5540 + }, + { + "epoch": 0.37674955836390817, + "grad_norm": 0.1672249287366867, + "learning_rate": 9.529657562168773e-05, + "loss": 4.4955, + "step": 5545 + }, + { + "epoch": 0.3770892784345699, + "grad_norm": 0.7186090350151062, + "learning_rate": 9.529232912080446e-05, + "loss": 4.2498, + "step": 5550 + }, + { + "epoch": 0.3774289985052317, + "grad_norm": 0.17973625659942627, + "learning_rate": 9.52880826199212e-05, + "loss": 4.2932, + "step": 5555 + }, + { + "epoch": 0.37776871857589345, + "grad_norm": 0.23119674623012543, + "learning_rate": 9.528383611903791e-05, + "loss": 4.2597, + "step": 5560 + }, + { + "epoch": 0.37810843864655524, + "grad_norm": 0.27012819051742554, + "learning_rate": 9.527958961815464e-05, + "loss": 4.0075, + "step": 5565 + }, + { + "epoch": 0.378448158717217, + "grad_norm": 0.22133472561836243, + "learning_rate": 9.527534311727138e-05, + "loss": 4.1785, + "step": 5570 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.17481616139411926, + "learning_rate": 9.52710966163881e-05, + "loss": 3.9544, + "step": 5575 + }, + { + "epoch": 0.3791275988585406, + "grad_norm": 0.20295321941375732, + "learning_rate": 9.526685011550483e-05, + "loss": 3.8667, + "step": 5580 + }, + { + "epoch": 0.3794673189292023, + "grad_norm": 0.3702150881290436, + "learning_rate": 9.526260361462155e-05, + "loss": 4.012, + "step": 5585 + }, + { + "epoch": 0.3798070389998641, + "grad_norm": 0.3844399154186249, + "learning_rate": 9.525835711373828e-05, + "loss": 4.0456, + "step": 5590 + }, + { + "epoch": 0.38014675907052586, + "grad_norm": 0.27248868346214294, + "learning_rate": 9.525411061285501e-05, + "loss": 4.0001, + "step": 5595 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 0.4196895360946655, + "learning_rate": 9.524986411197174e-05, + "loss": 4.2368, + "step": 5600 + }, + { + "epoch": 0.38082619921184946, + "grad_norm": 0.2525693476200104, + "learning_rate": 9.524561761108847e-05, + "loss": 4.1713, + "step": 5605 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 0.19002725183963776, + "learning_rate": 9.52413711102052e-05, + "loss": 3.9637, + "step": 5610 + }, + { + "epoch": 0.381505639353173, + "grad_norm": 0.21603484451770782, + "learning_rate": 9.523712460932192e-05, + "loss": 4.0533, + "step": 5615 + }, + { + "epoch": 0.38184535942383474, + "grad_norm": 0.1926129311323166, + "learning_rate": 9.523287810843865e-05, + "loss": 4.0691, + "step": 5620 + }, + { + "epoch": 0.38218507949449654, + "grad_norm": 0.41377758979797363, + "learning_rate": 9.522863160755538e-05, + "loss": 4.0141, + "step": 5625 + }, + { + "epoch": 0.38252479956515834, + "grad_norm": 0.2249571830034256, + "learning_rate": 9.52243851066721e-05, + "loss": 4.1834, + "step": 5630 + }, + { + "epoch": 0.3828645196358201, + "grad_norm": 0.1857426017522812, + "learning_rate": 9.522013860578883e-05, + "loss": 4.0473, + "step": 5635 + }, + { + "epoch": 0.3832042397064819, + "grad_norm": 0.20423340797424316, + "learning_rate": 9.521589210490556e-05, + "loss": 4.1039, + "step": 5640 + }, + { + "epoch": 0.3835439597771436, + "grad_norm": 0.20530074834823608, + "learning_rate": 9.521164560402229e-05, + "loss": 4.1668, + "step": 5645 + }, + { + "epoch": 0.3838836798478054, + "grad_norm": 0.21148528158664703, + "learning_rate": 9.520739910313902e-05, + "loss": 4.0141, + "step": 5650 + }, + { + "epoch": 0.38422339991846716, + "grad_norm": 0.23851540684700012, + "learning_rate": 9.520315260225575e-05, + "loss": 4.2311, + "step": 5655 + }, + { + "epoch": 0.38456311998912895, + "grad_norm": 0.17751434445381165, + "learning_rate": 9.519890610137247e-05, + "loss": 4.0698, + "step": 5660 + }, + { + "epoch": 0.38490284005979075, + "grad_norm": 0.4216800332069397, + "learning_rate": 9.51946596004892e-05, + "loss": 4.0324, + "step": 5665 + }, + { + "epoch": 0.3852425601304525, + "grad_norm": 0.19707581400871277, + "learning_rate": 9.519041309960593e-05, + "loss": 4.0026, + "step": 5670 + }, + { + "epoch": 0.3855822802011143, + "grad_norm": 0.21056298911571503, + "learning_rate": 9.518616659872266e-05, + "loss": 4.0293, + "step": 5675 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 0.18237900733947754, + "learning_rate": 9.518192009783939e-05, + "loss": 3.9467, + "step": 5680 + }, + { + "epoch": 0.38626172034243783, + "grad_norm": 0.1838427186012268, + "learning_rate": 9.517767359695611e-05, + "loss": 3.925, + "step": 5685 + }, + { + "epoch": 0.38660144041309963, + "grad_norm": 0.21086731553077698, + "learning_rate": 9.517342709607284e-05, + "loss": 4.2034, + "step": 5690 + }, + { + "epoch": 0.38694116048376137, + "grad_norm": 0.17555493116378784, + "learning_rate": 9.516918059518957e-05, + "loss": 3.8963, + "step": 5695 + }, + { + "epoch": 0.38728088055442317, + "grad_norm": 0.23491710424423218, + "learning_rate": 9.51649340943063e-05, + "loss": 4.116, + "step": 5700 + }, + { + "epoch": 0.3876206006250849, + "grad_norm": 0.18439505994319916, + "learning_rate": 9.516068759342303e-05, + "loss": 4.1069, + "step": 5705 + }, + { + "epoch": 0.3879603206957467, + "grad_norm": 0.18807227909564972, + "learning_rate": 9.515644109253975e-05, + "loss": 4.104, + "step": 5710 + }, + { + "epoch": 0.3883000407664085, + "grad_norm": 0.4963626265525818, + "learning_rate": 9.515219459165648e-05, + "loss": 4.1994, + "step": 5715 + }, + { + "epoch": 0.38863976083707025, + "grad_norm": 0.24339251220226288, + "learning_rate": 9.514794809077321e-05, + "loss": 4.1082, + "step": 5720 + }, + { + "epoch": 0.38897948090773204, + "grad_norm": 0.17436154186725616, + "learning_rate": 9.514370158988994e-05, + "loss": 4.1256, + "step": 5725 + }, + { + "epoch": 0.3893192009783938, + "grad_norm": 0.2445308268070221, + "learning_rate": 9.513945508900667e-05, + "loss": 4.0222, + "step": 5730 + }, + { + "epoch": 0.3896589210490556, + "grad_norm": 0.6241475939750671, + "learning_rate": 9.51352085881234e-05, + "loss": 4.1132, + "step": 5735 + }, + { + "epoch": 0.3899986411197173, + "grad_norm": 0.16763907670974731, + "learning_rate": 9.513096208724012e-05, + "loss": 4.3379, + "step": 5740 + }, + { + "epoch": 0.3903383611903791, + "grad_norm": 0.1730974316596985, + "learning_rate": 9.512671558635685e-05, + "loss": 4.1179, + "step": 5745 + }, + { + "epoch": 0.3906780812610409, + "grad_norm": 0.19016407430171967, + "learning_rate": 9.512246908547358e-05, + "loss": 4.0632, + "step": 5750 + }, + { + "epoch": 0.39101780133170266, + "grad_norm": 0.19713403284549713, + "learning_rate": 9.51182225845903e-05, + "loss": 4.088, + "step": 5755 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 0.26910024881362915, + "learning_rate": 9.511397608370703e-05, + "loss": 4.2572, + "step": 5760 + }, + { + "epoch": 0.3916972414730262, + "grad_norm": 0.20750823616981506, + "learning_rate": 9.510972958282376e-05, + "loss": 4.0017, + "step": 5765 + }, + { + "epoch": 0.392036961543688, + "grad_norm": 0.18255822360515594, + "learning_rate": 9.510548308194049e-05, + "loss": 4.4284, + "step": 5770 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 0.19282065331935883, + "learning_rate": 9.510123658105722e-05, + "loss": 4.1651, + "step": 5775 + }, + { + "epoch": 0.39271640168501154, + "grad_norm": 0.22694478929042816, + "learning_rate": 9.509699008017395e-05, + "loss": 4.0152, + "step": 5780 + }, + { + "epoch": 0.39305612175567334, + "grad_norm": 0.2607446014881134, + "learning_rate": 9.509274357929066e-05, + "loss": 4.3358, + "step": 5785 + }, + { + "epoch": 0.3933958418263351, + "grad_norm": 0.22173616290092468, + "learning_rate": 9.50884970784074e-05, + "loss": 4.2362, + "step": 5790 + }, + { + "epoch": 0.3937355618969969, + "grad_norm": 0.20545057952404022, + "learning_rate": 9.508425057752413e-05, + "loss": 4.0821, + "step": 5795 + }, + { + "epoch": 0.3940752819676587, + "grad_norm": 0.23421698808670044, + "learning_rate": 9.508000407664084e-05, + "loss": 4.2178, + "step": 5800 + }, + { + "epoch": 0.3944150020383204, + "grad_norm": 0.2095632702112198, + "learning_rate": 9.507575757575759e-05, + "loss": 4.2833, + "step": 5805 + }, + { + "epoch": 0.3947547221089822, + "grad_norm": 0.23404939472675323, + "learning_rate": 9.507151107487431e-05, + "loss": 4.0823, + "step": 5810 + }, + { + "epoch": 0.39509444217964396, + "grad_norm": 0.23966114223003387, + "learning_rate": 9.506726457399103e-05, + "loss": 3.8059, + "step": 5815 + }, + { + "epoch": 0.39543416225030575, + "grad_norm": 0.2027054876089096, + "learning_rate": 9.506301807310777e-05, + "loss": 4.2229, + "step": 5820 + }, + { + "epoch": 0.3957738823209675, + "grad_norm": 0.18689711391925812, + "learning_rate": 9.50587715722245e-05, + "loss": 4.26, + "step": 5825 + }, + { + "epoch": 0.3961136023916293, + "grad_norm": 0.263927698135376, + "learning_rate": 9.505452507134121e-05, + "loss": 4.021, + "step": 5830 + }, + { + "epoch": 0.3964533224622911, + "grad_norm": 0.18399837613105774, + "learning_rate": 9.505027857045795e-05, + "loss": 4.218, + "step": 5835 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 0.17031966149806976, + "learning_rate": 9.504603206957468e-05, + "loss": 3.9079, + "step": 5840 + }, + { + "epoch": 0.39713276260361463, + "grad_norm": 0.3210891783237457, + "learning_rate": 9.50417855686914e-05, + "loss": 3.9466, + "step": 5845 + }, + { + "epoch": 0.3974724826742764, + "grad_norm": 0.1981404423713684, + "learning_rate": 9.503753906780814e-05, + "loss": 4.001, + "step": 5850 + }, + { + "epoch": 0.39781220274493817, + "grad_norm": 0.3136885464191437, + "learning_rate": 9.503329256692485e-05, + "loss": 4.058, + "step": 5855 + }, + { + "epoch": 0.39815192281559997, + "grad_norm": 2.190765857696533, + "learning_rate": 9.502904606604158e-05, + "loss": 4.0696, + "step": 5860 + }, + { + "epoch": 0.3984916428862617, + "grad_norm": 0.17315542697906494, + "learning_rate": 9.502479956515832e-05, + "loss": 4.2626, + "step": 5865 + }, + { + "epoch": 0.3988313629569235, + "grad_norm": 0.33235201239585876, + "learning_rate": 9.502055306427504e-05, + "loss": 3.9844, + "step": 5870 + }, + { + "epoch": 0.39917108302758525, + "grad_norm": 0.23391257226467133, + "learning_rate": 9.501630656339176e-05, + "loss": 3.8401, + "step": 5875 + }, + { + "epoch": 0.39951080309824705, + "grad_norm": 0.24006325006484985, + "learning_rate": 9.50120600625085e-05, + "loss": 4.0724, + "step": 5880 + }, + { + "epoch": 0.39985052316890884, + "grad_norm": 0.17999830842018127, + "learning_rate": 9.500781356162522e-05, + "loss": 4.2167, + "step": 5885 + }, + { + "epoch": 0.4001902432395706, + "grad_norm": 0.18070223927497864, + "learning_rate": 9.500356706074195e-05, + "loss": 4.105, + "step": 5890 + }, + { + "epoch": 0.4005299633102324, + "grad_norm": 0.19634656608104706, + "learning_rate": 9.499932055985869e-05, + "loss": 3.989, + "step": 5895 + }, + { + "epoch": 0.4008696833808941, + "grad_norm": 0.23722241818904877, + "learning_rate": 9.49950740589754e-05, + "loss": 4.1728, + "step": 5900 + }, + { + "epoch": 0.4012094034515559, + "grad_norm": 0.19146768748760223, + "learning_rate": 9.499082755809213e-05, + "loss": 4.1732, + "step": 5905 + }, + { + "epoch": 0.40154912352221767, + "grad_norm": 0.21835100650787354, + "learning_rate": 9.498658105720887e-05, + "loss": 4.0836, + "step": 5910 + }, + { + "epoch": 0.40188884359287946, + "grad_norm": 0.17060807347297668, + "learning_rate": 9.498233455632559e-05, + "loss": 4.0794, + "step": 5915 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 0.9648451805114746, + "learning_rate": 9.497808805544232e-05, + "loss": 4.1605, + "step": 5920 + }, + { + "epoch": 0.402568283734203, + "grad_norm": 0.1983519345521927, + "learning_rate": 9.497384155455904e-05, + "loss": 3.9843, + "step": 5925 + }, + { + "epoch": 0.4029080038048648, + "grad_norm": 0.23070013523101807, + "learning_rate": 9.496959505367577e-05, + "loss": 3.9679, + "step": 5930 + }, + { + "epoch": 0.40324772387552654, + "grad_norm": 1.1979519128799438, + "learning_rate": 9.49653485527925e-05, + "loss": 4.1856, + "step": 5935 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 0.7521958947181702, + "learning_rate": 9.496110205190923e-05, + "loss": 4.082, + "step": 5940 + }, + { + "epoch": 0.40392716401685014, + "grad_norm": 0.192143052816391, + "learning_rate": 9.495685555102596e-05, + "loss": 4.2121, + "step": 5945 + }, + { + "epoch": 0.4042668840875119, + "grad_norm": 0.2611311972141266, + "learning_rate": 9.495260905014268e-05, + "loss": 4.2154, + "step": 5950 + }, + { + "epoch": 0.4046066041581737, + "grad_norm": 0.18073415756225586, + "learning_rate": 9.494836254925941e-05, + "loss": 4.1762, + "step": 5955 + }, + { + "epoch": 0.4049463242288354, + "grad_norm": 0.1921936720609665, + "learning_rate": 9.494411604837614e-05, + "loss": 4.0705, + "step": 5960 + }, + { + "epoch": 0.4052860442994972, + "grad_norm": 0.16377374529838562, + "learning_rate": 9.493986954749287e-05, + "loss": 4.1785, + "step": 5965 + }, + { + "epoch": 0.405625764370159, + "grad_norm": 0.21104343235492706, + "learning_rate": 9.49356230466096e-05, + "loss": 4.2715, + "step": 5970 + }, + { + "epoch": 0.40596548444082076, + "grad_norm": 0.2071741223335266, + "learning_rate": 9.493137654572632e-05, + "loss": 3.9067, + "step": 5975 + }, + { + "epoch": 0.40630520451148255, + "grad_norm": 0.22247660160064697, + "learning_rate": 9.492713004484305e-05, + "loss": 4.0561, + "step": 5980 + }, + { + "epoch": 0.4066449245821443, + "grad_norm": 0.20433616638183594, + "learning_rate": 9.492288354395978e-05, + "loss": 4.0168, + "step": 5985 + }, + { + "epoch": 0.4069846446528061, + "grad_norm": 0.2049606889486313, + "learning_rate": 9.491863704307651e-05, + "loss": 4.3927, + "step": 5990 + }, + { + "epoch": 0.40732436472346784, + "grad_norm": 0.22720476984977722, + "learning_rate": 9.491439054219324e-05, + "loss": 4.2648, + "step": 5995 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.25233665108680725, + "learning_rate": 9.491014404130996e-05, + "loss": 3.9343, + "step": 6000 + }, + { + "epoch": 0.40800380486479143, + "grad_norm": 0.21542391180992126, + "learning_rate": 9.490589754042669e-05, + "loss": 4.2302, + "step": 6005 + }, + { + "epoch": 0.4083435249354532, + "grad_norm": 0.8740308284759521, + "learning_rate": 9.490165103954342e-05, + "loss": 4.0408, + "step": 6010 + }, + { + "epoch": 0.40868324500611497, + "grad_norm": 0.18519490957260132, + "learning_rate": 9.489740453866015e-05, + "loss": 4.0773, + "step": 6015 + }, + { + "epoch": 0.4090229650767767, + "grad_norm": 0.2651638388633728, + "learning_rate": 9.489315803777688e-05, + "loss": 4.1498, + "step": 6020 + }, + { + "epoch": 0.4093626851474385, + "grad_norm": 0.4929540157318115, + "learning_rate": 9.48889115368936e-05, + "loss": 3.9128, + "step": 6025 + }, + { + "epoch": 0.4097024052181003, + "grad_norm": 0.20049385726451874, + "learning_rate": 9.488466503601033e-05, + "loss": 4.0097, + "step": 6030 + }, + { + "epoch": 0.41004212528876205, + "grad_norm": 0.17493902146816254, + "learning_rate": 9.488041853512706e-05, + "loss": 4.1654, + "step": 6035 + }, + { + "epoch": 0.41038184535942385, + "grad_norm": 0.30577751994132996, + "learning_rate": 9.487617203424379e-05, + "loss": 4.0563, + "step": 6040 + }, + { + "epoch": 0.4107215654300856, + "grad_norm": 0.2669510543346405, + "learning_rate": 9.487192553336052e-05, + "loss": 4.0721, + "step": 6045 + }, + { + "epoch": 0.4110612855007474, + "grad_norm": 0.18722812831401825, + "learning_rate": 9.486767903247724e-05, + "loss": 4.1749, + "step": 6050 + }, + { + "epoch": 0.4114010055714092, + "grad_norm": 0.1755664199590683, + "learning_rate": 9.486343253159397e-05, + "loss": 4.1804, + "step": 6055 + }, + { + "epoch": 0.4117407256420709, + "grad_norm": 0.27995565533638, + "learning_rate": 9.48591860307107e-05, + "loss": 3.9794, + "step": 6060 + }, + { + "epoch": 0.4120804457127327, + "grad_norm": 0.2518627345561981, + "learning_rate": 9.485493952982743e-05, + "loss": 4.2129, + "step": 6065 + }, + { + "epoch": 0.41242016578339447, + "grad_norm": 0.21856893599033356, + "learning_rate": 9.485069302894416e-05, + "loss": 4.3444, + "step": 6070 + }, + { + "epoch": 0.41275988585405626, + "grad_norm": 1.2819626331329346, + "learning_rate": 9.484644652806088e-05, + "loss": 3.6595, + "step": 6075 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 0.25162968039512634, + "learning_rate": 9.484220002717761e-05, + "loss": 4.2169, + "step": 6080 + }, + { + "epoch": 0.4134393259953798, + "grad_norm": 0.2659519910812378, + "learning_rate": 9.483795352629434e-05, + "loss": 4.0789, + "step": 6085 + }, + { + "epoch": 0.4137790460660416, + "grad_norm": 0.20415237545967102, + "learning_rate": 9.483370702541107e-05, + "loss": 4.2934, + "step": 6090 + }, + { + "epoch": 0.41411876613670334, + "grad_norm": 0.33283525705337524, + "learning_rate": 9.48294605245278e-05, + "loss": 4.0536, + "step": 6095 + }, + { + "epoch": 0.41445848620736514, + "grad_norm": 0.21782909333705902, + "learning_rate": 9.482521402364452e-05, + "loss": 3.9827, + "step": 6100 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 4.150507926940918, + "learning_rate": 9.482096752276125e-05, + "loss": 3.9654, + "step": 6105 + }, + { + "epoch": 0.4151379263486887, + "grad_norm": 0.20316611230373383, + "learning_rate": 9.481672102187798e-05, + "loss": 4.045, + "step": 6110 + }, + { + "epoch": 0.4154776464193505, + "grad_norm": 1.9692164659500122, + "learning_rate": 9.481247452099471e-05, + "loss": 4.0912, + "step": 6115 + }, + { + "epoch": 0.4158173664900122, + "grad_norm": 0.21312934160232544, + "learning_rate": 9.480822802011144e-05, + "loss": 4.2765, + "step": 6120 + }, + { + "epoch": 0.416157086560674, + "grad_norm": 0.1886243224143982, + "learning_rate": 9.480398151922815e-05, + "loss": 4.0305, + "step": 6125 + }, + { + "epoch": 0.41649680663133576, + "grad_norm": 0.22211480140686035, + "learning_rate": 9.479973501834489e-05, + "loss": 3.906, + "step": 6130 + }, + { + "epoch": 0.41683652670199756, + "grad_norm": 0.24448561668395996, + "learning_rate": 9.479548851746162e-05, + "loss": 3.9652, + "step": 6135 + }, + { + "epoch": 0.41717624677265935, + "grad_norm": 0.2330089956521988, + "learning_rate": 9.479124201657834e-05, + "loss": 4.0935, + "step": 6140 + }, + { + "epoch": 0.4175159668433211, + "grad_norm": 0.354692280292511, + "learning_rate": 9.478699551569508e-05, + "loss": 4.1893, + "step": 6145 + }, + { + "epoch": 0.4178556869139829, + "grad_norm": 0.31616339087486267, + "learning_rate": 9.47827490148118e-05, + "loss": 4.1393, + "step": 6150 + }, + { + "epoch": 0.41819540698464464, + "grad_norm": 0.6363674402236938, + "learning_rate": 9.477850251392852e-05, + "loss": 4.1578, + "step": 6155 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 0.19385862350463867, + "learning_rate": 9.477425601304526e-05, + "loss": 4.0994, + "step": 6160 + }, + { + "epoch": 0.4188748471259682, + "grad_norm": 0.20381571352481842, + "learning_rate": 9.477000951216199e-05, + "loss": 4.1873, + "step": 6165 + }, + { + "epoch": 0.41921456719663, + "grad_norm": 0.20795594155788422, + "learning_rate": 9.47657630112787e-05, + "loss": 4.0544, + "step": 6170 + }, + { + "epoch": 0.41955428726729177, + "grad_norm": 0.3839801847934723, + "learning_rate": 9.476151651039544e-05, + "loss": 4.1777, + "step": 6175 + }, + { + "epoch": 0.4198940073379535, + "grad_norm": 0.2491442710161209, + "learning_rate": 9.475727000951217e-05, + "loss": 3.9298, + "step": 6180 + }, + { + "epoch": 0.4202337274086153, + "grad_norm": 0.1739528328180313, + "learning_rate": 9.475302350862889e-05, + "loss": 4.0482, + "step": 6185 + }, + { + "epoch": 0.42057344747927705, + "grad_norm": 0.19341996312141418, + "learning_rate": 9.474877700774563e-05, + "loss": 3.911, + "step": 6190 + }, + { + "epoch": 0.42091316754993885, + "grad_norm": 0.16241292655467987, + "learning_rate": 9.474453050686236e-05, + "loss": 4.0751, + "step": 6195 + }, + { + "epoch": 0.42125288762060065, + "grad_norm": 0.16985565423965454, + "learning_rate": 9.474028400597907e-05, + "loss": 3.9359, + "step": 6200 + }, + { + "epoch": 0.4215926076912624, + "grad_norm": 0.21724484860897064, + "learning_rate": 9.473603750509581e-05, + "loss": 4.0409, + "step": 6205 + }, + { + "epoch": 0.4219323277619242, + "grad_norm": 0.21480692923069, + "learning_rate": 9.473179100421253e-05, + "loss": 3.987, + "step": 6210 + }, + { + "epoch": 0.42227204783258593, + "grad_norm": 0.2604687809944153, + "learning_rate": 9.472754450332926e-05, + "loss": 3.89, + "step": 6215 + }, + { + "epoch": 0.4226117679032477, + "grad_norm": 0.22292381525039673, + "learning_rate": 9.4723298002446e-05, + "loss": 4.029, + "step": 6220 + }, + { + "epoch": 0.4229514879739095, + "grad_norm": 0.2695325016975403, + "learning_rate": 9.471905150156271e-05, + "loss": 4.0199, + "step": 6225 + }, + { + "epoch": 0.42329120804457127, + "grad_norm": 0.17921492457389832, + "learning_rate": 9.471480500067944e-05, + "loss": 4.0195, + "step": 6230 + }, + { + "epoch": 0.42363092811523306, + "grad_norm": 0.29654955863952637, + "learning_rate": 9.471055849979618e-05, + "loss": 4.2356, + "step": 6235 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 0.3091282844543457, + "learning_rate": 9.47063119989129e-05, + "loss": 3.8808, + "step": 6240 + }, + { + "epoch": 0.4243103682565566, + "grad_norm": 0.20580576360225677, + "learning_rate": 9.470206549802962e-05, + "loss": 4.0886, + "step": 6245 + }, + { + "epoch": 0.42465008832721834, + "grad_norm": 0.19273176789283752, + "learning_rate": 9.469781899714636e-05, + "loss": 3.9993, + "step": 6250 + }, + { + "epoch": 0.42498980839788014, + "grad_norm": 0.18639002740383148, + "learning_rate": 9.469357249626308e-05, + "loss": 4.0441, + "step": 6255 + }, + { + "epoch": 0.42532952846854194, + "grad_norm": 2.978999614715576, + "learning_rate": 9.468932599537981e-05, + "loss": 3.7926, + "step": 6260 + }, + { + "epoch": 0.4256692485392037, + "grad_norm": 0.17055857181549072, + "learning_rate": 9.468507949449655e-05, + "loss": 4.0499, + "step": 6265 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 1.0542086362838745, + "learning_rate": 9.468083299361326e-05, + "loss": 4.2183, + "step": 6270 + }, + { + "epoch": 0.4263486886805272, + "grad_norm": 1.1902449131011963, + "learning_rate": 9.467658649272999e-05, + "loss": 3.9083, + "step": 6275 + }, + { + "epoch": 0.426688408751189, + "grad_norm": 0.26639068126678467, + "learning_rate": 9.467233999184672e-05, + "loss": 3.9113, + "step": 6280 + }, + { + "epoch": 0.4270281288218508, + "grad_norm": 0.2732694149017334, + "learning_rate": 9.466809349096345e-05, + "loss": 4.1468, + "step": 6285 + }, + { + "epoch": 0.42736784889251256, + "grad_norm": 0.27546462416648865, + "learning_rate": 9.466384699008018e-05, + "loss": 3.9733, + "step": 6290 + }, + { + "epoch": 0.42770756896317436, + "grad_norm": 0.1994089037179947, + "learning_rate": 9.46596004891969e-05, + "loss": 4.1123, + "step": 6295 + }, + { + "epoch": 0.4280472890338361, + "grad_norm": 0.20160436630249023, + "learning_rate": 9.465535398831363e-05, + "loss": 4.0467, + "step": 6300 + }, + { + "epoch": 0.4283870091044979, + "grad_norm": 0.2255067527294159, + "learning_rate": 9.465110748743036e-05, + "loss": 3.7712, + "step": 6305 + }, + { + "epoch": 0.4287267291751597, + "grad_norm": 0.1755346655845642, + "learning_rate": 9.464686098654709e-05, + "loss": 4.3119, + "step": 6310 + }, + { + "epoch": 0.42906644924582144, + "grad_norm": 0.16779784858226776, + "learning_rate": 9.464261448566382e-05, + "loss": 4.1447, + "step": 6315 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 0.2779237926006317, + "learning_rate": 9.463836798478054e-05, + "loss": 4.1285, + "step": 6320 + }, + { + "epoch": 0.429745889387145, + "grad_norm": 0.21850286424160004, + "learning_rate": 9.463412148389727e-05, + "loss": 4.118, + "step": 6325 + }, + { + "epoch": 0.4300856094578068, + "grad_norm": 0.2127694934606552, + "learning_rate": 9.4629874983014e-05, + "loss": 4.1073, + "step": 6330 + }, + { + "epoch": 0.4304253295284685, + "grad_norm": 0.24460366368293762, + "learning_rate": 9.462562848213073e-05, + "loss": 4.0523, + "step": 6335 + }, + { + "epoch": 0.4307650495991303, + "grad_norm": 0.18039904534816742, + "learning_rate": 9.462138198124746e-05, + "loss": 4.205, + "step": 6340 + }, + { + "epoch": 0.4311047696697921, + "grad_norm": 0.23940406739711761, + "learning_rate": 9.461713548036418e-05, + "loss": 4.0222, + "step": 6345 + }, + { + "epoch": 0.43144448974045385, + "grad_norm": 0.223390132188797, + "learning_rate": 9.461288897948091e-05, + "loss": 4.0133, + "step": 6350 + }, + { + "epoch": 0.43178420981111565, + "grad_norm": 0.20645423233509064, + "learning_rate": 9.460864247859764e-05, + "loss": 3.9967, + "step": 6355 + }, + { + "epoch": 0.4321239298817774, + "grad_norm": 0.7943032383918762, + "learning_rate": 9.460439597771437e-05, + "loss": 3.927, + "step": 6360 + }, + { + "epoch": 0.4324636499524392, + "grad_norm": 0.1684955656528473, + "learning_rate": 9.46001494768311e-05, + "loss": 4.0099, + "step": 6365 + }, + { + "epoch": 0.432803370023101, + "grad_norm": 0.21439214050769806, + "learning_rate": 9.459590297594782e-05, + "loss": 3.7963, + "step": 6370 + }, + { + "epoch": 0.43314309009376273, + "grad_norm": 0.21478402614593506, + "learning_rate": 9.459165647506455e-05, + "loss": 4.0739, + "step": 6375 + }, + { + "epoch": 0.4334828101644245, + "grad_norm": 0.8936269283294678, + "learning_rate": 9.458740997418128e-05, + "loss": 4.1838, + "step": 6380 + }, + { + "epoch": 0.43382253023508627, + "grad_norm": 0.5096725821495056, + "learning_rate": 9.458316347329801e-05, + "loss": 4.047, + "step": 6385 + }, + { + "epoch": 0.43416225030574807, + "grad_norm": 0.285972535610199, + "learning_rate": 9.457891697241474e-05, + "loss": 4.1083, + "step": 6390 + }, + { + "epoch": 0.43450197037640986, + "grad_norm": 0.1650124043226242, + "learning_rate": 9.457467047153146e-05, + "loss": 4.157, + "step": 6395 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 0.19191978871822357, + "learning_rate": 9.457042397064819e-05, + "loss": 3.9201, + "step": 6400 + }, + { + "epoch": 0.4351814105177334, + "grad_norm": 0.20855942368507385, + "learning_rate": 9.456617746976492e-05, + "loss": 4.1146, + "step": 6405 + }, + { + "epoch": 0.43552113058839514, + "grad_norm": 0.17791011929512024, + "learning_rate": 9.456193096888165e-05, + "loss": 3.9971, + "step": 6410 + }, + { + "epoch": 0.43586085065905694, + "grad_norm": 0.2120276242494583, + "learning_rate": 9.455768446799838e-05, + "loss": 4.0967, + "step": 6415 + }, + { + "epoch": 0.4362005707297187, + "grad_norm": 0.20230713486671448, + "learning_rate": 9.45534379671151e-05, + "loss": 3.902, + "step": 6420 + }, + { + "epoch": 0.4365402908003805, + "grad_norm": 0.5752553343772888, + "learning_rate": 9.454919146623183e-05, + "loss": 3.9801, + "step": 6425 + }, + { + "epoch": 0.4368800108710423, + "grad_norm": 0.19792981445789337, + "learning_rate": 9.454494496534856e-05, + "loss": 4.2249, + "step": 6430 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 0.20685099065303802, + "learning_rate": 9.454069846446529e-05, + "loss": 4.2953, + "step": 6435 + }, + { + "epoch": 0.4375594510123658, + "grad_norm": 0.22175006568431854, + "learning_rate": 9.453645196358202e-05, + "loss": 3.9621, + "step": 6440 + }, + { + "epoch": 0.43789917108302756, + "grad_norm": 0.4952181875705719, + "learning_rate": 9.453220546269874e-05, + "loss": 3.986, + "step": 6445 + }, + { + "epoch": 0.43823889115368936, + "grad_norm": 0.20618560910224915, + "learning_rate": 9.452795896181547e-05, + "loss": 3.9802, + "step": 6450 + }, + { + "epoch": 0.43857861122435116, + "grad_norm": 1.136326551437378, + "learning_rate": 9.45237124609322e-05, + "loss": 4.2695, + "step": 6455 + }, + { + "epoch": 0.4389183312950129, + "grad_norm": 0.22814400494098663, + "learning_rate": 9.451946596004893e-05, + "loss": 3.8791, + "step": 6460 + }, + { + "epoch": 0.4392580513656747, + "grad_norm": 0.24193866550922394, + "learning_rate": 9.451521945916566e-05, + "loss": 3.9838, + "step": 6465 + }, + { + "epoch": 0.43959777143633644, + "grad_norm": 0.25064903497695923, + "learning_rate": 9.451097295828238e-05, + "loss": 3.9355, + "step": 6470 + }, + { + "epoch": 0.43993749150699824, + "grad_norm": 1.2106342315673828, + "learning_rate": 9.450672645739911e-05, + "loss": 4.1844, + "step": 6475 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 0.16492827236652374, + "learning_rate": 9.450247995651583e-05, + "loss": 4.2447, + "step": 6480 + }, + { + "epoch": 0.4406169316483218, + "grad_norm": 0.1795361191034317, + "learning_rate": 9.449823345563257e-05, + "loss": 3.8589, + "step": 6485 + }, + { + "epoch": 0.44095665171898357, + "grad_norm": 0.19358831644058228, + "learning_rate": 9.44939869547493e-05, + "loss": 4.2963, + "step": 6490 + }, + { + "epoch": 0.4412963717896453, + "grad_norm": 0.2732826769351959, + "learning_rate": 9.448974045386601e-05, + "loss": 3.9789, + "step": 6495 + }, + { + "epoch": 0.4416360918603071, + "grad_norm": 0.23638220131397247, + "learning_rate": 9.448549395298275e-05, + "loss": 4.2448, + "step": 6500 + }, + { + "epoch": 0.4419758119309689, + "grad_norm": 0.2072085738182068, + "learning_rate": 9.448124745209948e-05, + "loss": 3.8356, + "step": 6505 + }, + { + "epoch": 0.44231553200163065, + "grad_norm": 3.1101341247558594, + "learning_rate": 9.44770009512162e-05, + "loss": 4.2411, + "step": 6510 + }, + { + "epoch": 0.44265525207229245, + "grad_norm": 0.4264751374721527, + "learning_rate": 9.447275445033294e-05, + "loss": 3.9676, + "step": 6515 + }, + { + "epoch": 0.4429949721429542, + "grad_norm": 0.20776435732841492, + "learning_rate": 9.446850794944966e-05, + "loss": 3.8493, + "step": 6520 + }, + { + "epoch": 0.443334692213616, + "grad_norm": 0.3044533133506775, + "learning_rate": 9.446426144856638e-05, + "loss": 4.1147, + "step": 6525 + }, + { + "epoch": 0.44367441228427773, + "grad_norm": 0.16665169596672058, + "learning_rate": 9.446001494768312e-05, + "loss": 3.9521, + "step": 6530 + }, + { + "epoch": 0.44401413235493953, + "grad_norm": 0.2023710161447525, + "learning_rate": 9.445576844679985e-05, + "loss": 3.9206, + "step": 6535 + }, + { + "epoch": 0.4443538524256013, + "grad_norm": 0.4145415425300598, + "learning_rate": 9.445152194591656e-05, + "loss": 4.0138, + "step": 6540 + }, + { + "epoch": 0.44469357249626307, + "grad_norm": 0.16682837903499603, + "learning_rate": 9.44472754450333e-05, + "loss": 4.0957, + "step": 6545 + }, + { + "epoch": 0.44503329256692487, + "grad_norm": 0.2003334015607834, + "learning_rate": 9.444302894415002e-05, + "loss": 3.8427, + "step": 6550 + }, + { + "epoch": 0.4453730126375866, + "grad_norm": 0.29585328698158264, + "learning_rate": 9.443878244326675e-05, + "loss": 4.2463, + "step": 6555 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 0.20154406130313873, + "learning_rate": 9.443453594238349e-05, + "loss": 4.2644, + "step": 6560 + }, + { + "epoch": 0.4460524527789102, + "grad_norm": 0.23148468136787415, + "learning_rate": 9.44302894415002e-05, + "loss": 3.9205, + "step": 6565 + }, + { + "epoch": 0.44639217284957194, + "grad_norm": 0.1762179434299469, + "learning_rate": 9.442604294061693e-05, + "loss": 4.0293, + "step": 6570 + }, + { + "epoch": 0.44673189292023374, + "grad_norm": 0.4714028835296631, + "learning_rate": 9.442179643973367e-05, + "loss": 4.2011, + "step": 6575 + }, + { + "epoch": 0.4470716129908955, + "grad_norm": 0.368407666683197, + "learning_rate": 9.441754993885039e-05, + "loss": 4.0047, + "step": 6580 + }, + { + "epoch": 0.4474113330615573, + "grad_norm": 0.28887784481048584, + "learning_rate": 9.441330343796711e-05, + "loss": 4.0332, + "step": 6585 + }, + { + "epoch": 0.4477510531322191, + "grad_norm": 0.25729164481163025, + "learning_rate": 9.440905693708386e-05, + "loss": 4.0735, + "step": 6590 + }, + { + "epoch": 0.4480907732028808, + "grad_norm": 0.1723019927740097, + "learning_rate": 9.440481043620057e-05, + "loss": 4.0399, + "step": 6595 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 0.2043658047914505, + "learning_rate": 9.44005639353173e-05, + "loss": 4.2346, + "step": 6600 + }, + { + "epoch": 0.44877021334420436, + "grad_norm": 0.15108297765254974, + "learning_rate": 9.439631743443404e-05, + "loss": 3.9109, + "step": 6605 + }, + { + "epoch": 0.44910993341486616, + "grad_norm": 0.18265971541404724, + "learning_rate": 9.439207093355075e-05, + "loss": 3.845, + "step": 6610 + }, + { + "epoch": 0.4494496534855279, + "grad_norm": 0.9520887732505798, + "learning_rate": 9.438782443266748e-05, + "loss": 4.2181, + "step": 6615 + }, + { + "epoch": 0.4497893735561897, + "grad_norm": 0.28121015429496765, + "learning_rate": 9.438357793178422e-05, + "loss": 4.0234, + "step": 6620 + }, + { + "epoch": 0.4501290936268515, + "grad_norm": 0.21010081470012665, + "learning_rate": 9.437933143090094e-05, + "loss": 4.1509, + "step": 6625 + }, + { + "epoch": 0.45046881369751324, + "grad_norm": 0.23798321187496185, + "learning_rate": 9.437508493001767e-05, + "loss": 4.2879, + "step": 6630 + }, + { + "epoch": 0.45080853376817503, + "grad_norm": 0.20470625162124634, + "learning_rate": 9.43708384291344e-05, + "loss": 4.0134, + "step": 6635 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 0.19223268330097198, + "learning_rate": 9.436659192825112e-05, + "loss": 4.1928, + "step": 6640 + }, + { + "epoch": 0.4514879739094986, + "grad_norm": 0.1938367336988449, + "learning_rate": 9.436234542736785e-05, + "loss": 3.923, + "step": 6645 + }, + { + "epoch": 0.45182769398016037, + "grad_norm": 0.2842964231967926, + "learning_rate": 9.435809892648458e-05, + "loss": 4.2081, + "step": 6650 + }, + { + "epoch": 0.4521674140508221, + "grad_norm": 0.22615915536880493, + "learning_rate": 9.43538524256013e-05, + "loss": 4.0994, + "step": 6655 + }, + { + "epoch": 0.4525071341214839, + "grad_norm": 0.23465953767299652, + "learning_rate": 9.434960592471803e-05, + "loss": 3.8209, + "step": 6660 + }, + { + "epoch": 0.45284685419214565, + "grad_norm": 0.17599263787269592, + "learning_rate": 9.434535942383476e-05, + "loss": 3.8742, + "step": 6665 + }, + { + "epoch": 0.45318657426280745, + "grad_norm": 0.5463417172431946, + "learning_rate": 9.434111292295149e-05, + "loss": 4.1574, + "step": 6670 + }, + { + "epoch": 0.45352629433346925, + "grad_norm": 0.21346516907215118, + "learning_rate": 9.433686642206822e-05, + "loss": 4.3701, + "step": 6675 + }, + { + "epoch": 0.453866014404131, + "grad_norm": 0.2235599011182785, + "learning_rate": 9.433261992118495e-05, + "loss": 4.1951, + "step": 6680 + }, + { + "epoch": 0.4542057344747928, + "grad_norm": 0.2730211615562439, + "learning_rate": 9.432837342030167e-05, + "loss": 3.8224, + "step": 6685 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.1868310272693634, + "learning_rate": 9.43241269194184e-05, + "loss": 3.7951, + "step": 6690 + }, + { + "epoch": 0.45488517461611633, + "grad_norm": 0.3626730442047119, + "learning_rate": 9.431988041853513e-05, + "loss": 4.1127, + "step": 6695 + }, + { + "epoch": 0.45522489468677807, + "grad_norm": 0.22474856674671173, + "learning_rate": 9.431563391765186e-05, + "loss": 3.8253, + "step": 6700 + }, + { + "epoch": 0.45556461475743987, + "grad_norm": 0.3556784689426422, + "learning_rate": 9.431138741676859e-05, + "loss": 4.0104, + "step": 6705 + }, + { + "epoch": 0.45590433482810166, + "grad_norm": 1.793366551399231, + "learning_rate": 9.430714091588531e-05, + "loss": 4.1386, + "step": 6710 + }, + { + "epoch": 0.4562440548987634, + "grad_norm": 0.18291668593883514, + "learning_rate": 9.430289441500204e-05, + "loss": 4.118, + "step": 6715 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 0.3133949339389801, + "learning_rate": 9.429864791411877e-05, + "loss": 3.9527, + "step": 6720 + }, + { + "epoch": 0.45692349504008695, + "grad_norm": 0.2146921306848526, + "learning_rate": 9.42944014132355e-05, + "loss": 3.9596, + "step": 6725 + }, + { + "epoch": 0.45726321511074874, + "grad_norm": 0.17036296427249908, + "learning_rate": 9.429015491235223e-05, + "loss": 4.0381, + "step": 6730 + }, + { + "epoch": 0.45760293518141054, + "grad_norm": 0.32481151819229126, + "learning_rate": 9.428590841146895e-05, + "loss": 4.0932, + "step": 6735 + }, + { + "epoch": 0.4579426552520723, + "grad_norm": 0.18955262005329132, + "learning_rate": 9.428166191058568e-05, + "loss": 4.1081, + "step": 6740 + }, + { + "epoch": 0.4582823753227341, + "grad_norm": 0.2482958883047104, + "learning_rate": 9.427741540970241e-05, + "loss": 4.036, + "step": 6745 + }, + { + "epoch": 0.4586220953933958, + "grad_norm": 0.1786300390958786, + "learning_rate": 9.427316890881914e-05, + "loss": 4.2296, + "step": 6750 + }, + { + "epoch": 0.4589618154640576, + "grad_norm": 0.19114576280117035, + "learning_rate": 9.426892240793587e-05, + "loss": 4.1781, + "step": 6755 + }, + { + "epoch": 0.4593015355347194, + "grad_norm": 0.1777360886335373, + "learning_rate": 9.42646759070526e-05, + "loss": 3.7195, + "step": 6760 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 0.2255825698375702, + "learning_rate": 9.426042940616932e-05, + "loss": 4.0727, + "step": 6765 + }, + { + "epoch": 0.45998097567604296, + "grad_norm": 2.2815988063812256, + "learning_rate": 9.425618290528605e-05, + "loss": 4.1177, + "step": 6770 + }, + { + "epoch": 0.4603206957467047, + "grad_norm": 0.18281744420528412, + "learning_rate": 9.425193640440278e-05, + "loss": 4.1818, + "step": 6775 + }, + { + "epoch": 0.4606604158173665, + "grad_norm": 0.1932012438774109, + "learning_rate": 9.42476899035195e-05, + "loss": 4.207, + "step": 6780 + }, + { + "epoch": 0.46100013588802824, + "grad_norm": 0.3870634138584137, + "learning_rate": 9.424344340263623e-05, + "loss": 4.0997, + "step": 6785 + }, + { + "epoch": 0.46133985595869004, + "grad_norm": 0.20440851151943207, + "learning_rate": 9.423919690175296e-05, + "loss": 3.9454, + "step": 6790 + }, + { + "epoch": 0.46167957602935183, + "grad_norm": 0.21663671731948853, + "learning_rate": 9.423495040086969e-05, + "loss": 4.013, + "step": 6795 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 0.18921007215976715, + "learning_rate": 9.423070389998642e-05, + "loss": 4.0641, + "step": 6800 + }, + { + "epoch": 0.4623590161706754, + "grad_norm": 0.21008360385894775, + "learning_rate": 9.422645739910315e-05, + "loss": 3.8814, + "step": 6805 + }, + { + "epoch": 0.4626987362413371, + "grad_norm": 1.4397491216659546, + "learning_rate": 9.422221089821987e-05, + "loss": 3.977, + "step": 6810 + }, + { + "epoch": 0.4630384563119989, + "grad_norm": 0.1650581657886505, + "learning_rate": 9.42179643973366e-05, + "loss": 3.8511, + "step": 6815 + }, + { + "epoch": 0.4633781763826607, + "grad_norm": 0.24074020981788635, + "learning_rate": 9.421371789645333e-05, + "loss": 4.4683, + "step": 6820 + }, + { + "epoch": 0.46371789645332245, + "grad_norm": 0.2204151600599289, + "learning_rate": 9.420947139557006e-05, + "loss": 4.077, + "step": 6825 + }, + { + "epoch": 0.46405761652398425, + "grad_norm": 0.24461984634399414, + "learning_rate": 9.420522489468679e-05, + "loss": 3.9247, + "step": 6830 + }, + { + "epoch": 0.464397336594646, + "grad_norm": 0.19434142112731934, + "learning_rate": 9.42009783938035e-05, + "loss": 3.8453, + "step": 6835 + }, + { + "epoch": 0.4647370566653078, + "grad_norm": 0.2689877450466156, + "learning_rate": 9.419673189292024e-05, + "loss": 4.0808, + "step": 6840 + }, + { + "epoch": 0.4650767767359696, + "grad_norm": 0.2343098372220993, + "learning_rate": 9.419248539203697e-05, + "loss": 4.0781, + "step": 6845 + }, + { + "epoch": 0.46541649680663133, + "grad_norm": 0.19415900111198425, + "learning_rate": 9.418823889115369e-05, + "loss": 4.1067, + "step": 6850 + }, + { + "epoch": 0.4657562168772931, + "grad_norm": 0.20309092104434967, + "learning_rate": 9.418399239027043e-05, + "loss": 4.0426, + "step": 6855 + }, + { + "epoch": 0.46609593694795487, + "grad_norm": 0.35404348373413086, + "learning_rate": 9.417974588938715e-05, + "loss": 4.0913, + "step": 6860 + }, + { + "epoch": 0.46643565701861667, + "grad_norm": 0.8057840466499329, + "learning_rate": 9.417549938850387e-05, + "loss": 4.0028, + "step": 6865 + }, + { + "epoch": 0.4667753770892784, + "grad_norm": 0.23458503186702728, + "learning_rate": 9.417125288762061e-05, + "loss": 3.878, + "step": 6870 + }, + { + "epoch": 0.4671150971599402, + "grad_norm": 0.2019844651222229, + "learning_rate": 9.416700638673734e-05, + "loss": 3.7162, + "step": 6875 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 0.19805146753787994, + "learning_rate": 9.416275988585405e-05, + "loss": 3.8687, + "step": 6880 + }, + { + "epoch": 0.46779453730126375, + "grad_norm": 0.2559395730495453, + "learning_rate": 9.41585133849708e-05, + "loss": 4.0791, + "step": 6885 + }, + { + "epoch": 0.46813425737192554, + "grad_norm": 0.3069989085197449, + "learning_rate": 9.415426688408752e-05, + "loss": 4.1204, + "step": 6890 + }, + { + "epoch": 0.4684739774425873, + "grad_norm": 0.5808936953544617, + "learning_rate": 9.415002038320424e-05, + "loss": 4.078, + "step": 6895 + }, + { + "epoch": 0.4688136975132491, + "grad_norm": 0.2510988414287567, + "learning_rate": 9.414577388232098e-05, + "loss": 4.23, + "step": 6900 + }, + { + "epoch": 0.4691534175839109, + "grad_norm": 0.2112618386745453, + "learning_rate": 9.414152738143769e-05, + "loss": 4.1868, + "step": 6905 + }, + { + "epoch": 0.4694931376545726, + "grad_norm": 0.22074821591377258, + "learning_rate": 9.413728088055442e-05, + "loss": 4.116, + "step": 6910 + }, + { + "epoch": 0.4698328577252344, + "grad_norm": 0.18397152423858643, + "learning_rate": 9.413303437967116e-05, + "loss": 4.0599, + "step": 6915 + }, + { + "epoch": 0.47017257779589616, + "grad_norm": 0.23679019510746002, + "learning_rate": 9.412878787878788e-05, + "loss": 3.8418, + "step": 6920 + }, + { + "epoch": 0.47051229786655796, + "grad_norm": 0.16602519154548645, + "learning_rate": 9.41245413779046e-05, + "loss": 4.0308, + "step": 6925 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 0.2898738384246826, + "learning_rate": 9.412029487702135e-05, + "loss": 4.2135, + "step": 6930 + }, + { + "epoch": 0.4711917380078815, + "grad_norm": 0.21048447489738464, + "learning_rate": 9.411604837613806e-05, + "loss": 3.9954, + "step": 6935 + }, + { + "epoch": 0.4715314580785433, + "grad_norm": 0.16546538472175598, + "learning_rate": 9.411180187525479e-05, + "loss": 4.1153, + "step": 6940 + }, + { + "epoch": 0.47187117814920504, + "grad_norm": 0.5077167749404907, + "learning_rate": 9.410755537437153e-05, + "loss": 4.1654, + "step": 6945 + }, + { + "epoch": 0.47221089821986684, + "grad_norm": 0.20563165843486786, + "learning_rate": 9.410330887348825e-05, + "loss": 3.9387, + "step": 6950 + }, + { + "epoch": 0.4725506182905286, + "grad_norm": 0.2332395762205124, + "learning_rate": 9.409906237260497e-05, + "loss": 4.2803, + "step": 6955 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 0.9494916796684265, + "learning_rate": 9.409481587172171e-05, + "loss": 4.1342, + "step": 6960 + }, + { + "epoch": 0.4732300584318522, + "grad_norm": 0.4015176296234131, + "learning_rate": 9.409056937083843e-05, + "loss": 4.1532, + "step": 6965 + }, + { + "epoch": 0.4735697785025139, + "grad_norm": 0.17285263538360596, + "learning_rate": 9.408632286995516e-05, + "loss": 3.8368, + "step": 6970 + }, + { + "epoch": 0.4739094985731757, + "grad_norm": 0.24160470068454742, + "learning_rate": 9.408207636907189e-05, + "loss": 4.0496, + "step": 6975 + }, + { + "epoch": 0.47424921864383746, + "grad_norm": 0.16916899383068085, + "learning_rate": 9.407782986818861e-05, + "loss": 4.3213, + "step": 6980 + }, + { + "epoch": 0.47458893871449925, + "grad_norm": 0.1876440942287445, + "learning_rate": 9.407358336730534e-05, + "loss": 4.0836, + "step": 6985 + }, + { + "epoch": 0.47492865878516105, + "grad_norm": 0.20803870260715485, + "learning_rate": 9.406933686642207e-05, + "loss": 4.0582, + "step": 6990 + }, + { + "epoch": 0.4752683788558228, + "grad_norm": 0.5098522305488586, + "learning_rate": 9.40650903655388e-05, + "loss": 3.7007, + "step": 6995 + }, + { + "epoch": 0.4756080989264846, + "grad_norm": 0.28446561098098755, + "learning_rate": 9.406084386465553e-05, + "loss": 3.7383, + "step": 7000 + }, + { + "epoch": 0.47594781899714633, + "grad_norm": 1.344814658164978, + "learning_rate": 9.405659736377225e-05, + "loss": 4.3367, + "step": 7005 + }, + { + "epoch": 0.47628753906780813, + "grad_norm": 0.3608788549900055, + "learning_rate": 9.405235086288898e-05, + "loss": 4.0906, + "step": 7010 + }, + { + "epoch": 0.4766272591384699, + "grad_norm": 0.2733428478240967, + "learning_rate": 9.404810436200571e-05, + "loss": 4.079, + "step": 7015 + }, + { + "epoch": 0.47696697920913167, + "grad_norm": 0.2144654393196106, + "learning_rate": 9.404385786112244e-05, + "loss": 3.5828, + "step": 7020 + }, + { + "epoch": 0.47730669927979347, + "grad_norm": 0.21566329896450043, + "learning_rate": 9.403961136023917e-05, + "loss": 4.1559, + "step": 7025 + }, + { + "epoch": 0.4776464193504552, + "grad_norm": 0.19979317486286163, + "learning_rate": 9.40353648593559e-05, + "loss": 4.078, + "step": 7030 + }, + { + "epoch": 0.477986139421117, + "grad_norm": 0.179921954870224, + "learning_rate": 9.403111835847262e-05, + "loss": 3.9188, + "step": 7035 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 0.17742060124874115, + "learning_rate": 9.402687185758935e-05, + "loss": 4.023, + "step": 7040 + }, + { + "epoch": 0.47866557956244055, + "grad_norm": 0.3981129229068756, + "learning_rate": 9.402262535670608e-05, + "loss": 4.0171, + "step": 7045 + }, + { + "epoch": 0.47900529963310234, + "grad_norm": 0.206672802567482, + "learning_rate": 9.40183788558228e-05, + "loss": 4.175, + "step": 7050 + }, + { + "epoch": 0.4793450197037641, + "grad_norm": 0.1959320604801178, + "learning_rate": 9.401413235493953e-05, + "loss": 4.1638, + "step": 7055 + }, + { + "epoch": 0.4796847397744259, + "grad_norm": 0.2009037733078003, + "learning_rate": 9.400988585405626e-05, + "loss": 4.1122, + "step": 7060 + }, + { + "epoch": 0.4800244598450876, + "grad_norm": 0.17592014372348785, + "learning_rate": 9.400563935317299e-05, + "loss": 4.0237, + "step": 7065 + }, + { + "epoch": 0.4803641799157494, + "grad_norm": 0.26748034358024597, + "learning_rate": 9.400139285228972e-05, + "loss": 3.8357, + "step": 7070 + }, + { + "epoch": 0.4807038999864112, + "grad_norm": 0.16173365712165833, + "learning_rate": 9.399714635140645e-05, + "loss": 4.0592, + "step": 7075 + }, + { + "epoch": 0.48104362005707296, + "grad_norm": 0.3452107906341553, + "learning_rate": 9.399289985052317e-05, + "loss": 4.1379, + "step": 7080 + }, + { + "epoch": 0.48138334012773476, + "grad_norm": 0.20402079820632935, + "learning_rate": 9.39886533496399e-05, + "loss": 4.2539, + "step": 7085 + }, + { + "epoch": 0.4817230601983965, + "grad_norm": 0.3040589690208435, + "learning_rate": 9.398440684875663e-05, + "loss": 4.0941, + "step": 7090 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 0.18901465833187103, + "learning_rate": 9.398016034787336e-05, + "loss": 4.0795, + "step": 7095 + }, + { + "epoch": 0.4824025003397201, + "grad_norm": 0.1665019541978836, + "learning_rate": 9.397591384699009e-05, + "loss": 4.0337, + "step": 7100 + }, + { + "epoch": 0.48274222041038184, + "grad_norm": 0.42847058176994324, + "learning_rate": 9.397166734610681e-05, + "loss": 4.108, + "step": 7105 + }, + { + "epoch": 0.48308194048104364, + "grad_norm": 0.21919941902160645, + "learning_rate": 9.396742084522354e-05, + "loss": 4.2498, + "step": 7110 + }, + { + "epoch": 0.4834216605517054, + "grad_norm": 0.7178875803947449, + "learning_rate": 9.396317434434027e-05, + "loss": 3.938, + "step": 7115 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 0.15290340781211853, + "learning_rate": 9.3958927843457e-05, + "loss": 4.1507, + "step": 7120 + }, + { + "epoch": 0.4841011006930289, + "grad_norm": 0.20199231803417206, + "learning_rate": 9.395468134257373e-05, + "loss": 4.165, + "step": 7125 + }, + { + "epoch": 0.4844408207636907, + "grad_norm": 0.4050713777542114, + "learning_rate": 9.395043484169045e-05, + "loss": 4.1098, + "step": 7130 + }, + { + "epoch": 0.4847805408343525, + "grad_norm": 0.22871138155460358, + "learning_rate": 9.394618834080718e-05, + "loss": 3.8713, + "step": 7135 + }, + { + "epoch": 0.48512026090501426, + "grad_norm": 0.2092018574476242, + "learning_rate": 9.394194183992391e-05, + "loss": 4.13, + "step": 7140 + }, + { + "epoch": 0.48545998097567605, + "grad_norm": 0.23514516651630402, + "learning_rate": 9.393769533904064e-05, + "loss": 4.1008, + "step": 7145 + }, + { + "epoch": 0.4857997010463378, + "grad_norm": 0.19990748167037964, + "learning_rate": 9.393344883815737e-05, + "loss": 4.1277, + "step": 7150 + }, + { + "epoch": 0.4861394211169996, + "grad_norm": 0.23331451416015625, + "learning_rate": 9.39292023372741e-05, + "loss": 4.1307, + "step": 7155 + }, + { + "epoch": 0.4864791411876614, + "grad_norm": 0.4659624397754669, + "learning_rate": 9.392495583639082e-05, + "loss": 4.0373, + "step": 7160 + }, + { + "epoch": 0.48681886125832313, + "grad_norm": 0.350339412689209, + "learning_rate": 9.392070933550755e-05, + "loss": 4.0145, + "step": 7165 + }, + { + "epoch": 0.48715858132898493, + "grad_norm": 0.29896309971809387, + "learning_rate": 9.391646283462428e-05, + "loss": 3.7892, + "step": 7170 + }, + { + "epoch": 0.48749830139964667, + "grad_norm": 0.2809394896030426, + "learning_rate": 9.391221633374099e-05, + "loss": 4.0247, + "step": 7175 + }, + { + "epoch": 0.48783802147030847, + "grad_norm": 0.2700156271457672, + "learning_rate": 9.390796983285773e-05, + "loss": 4.0203, + "step": 7180 + }, + { + "epoch": 0.48817774154097027, + "grad_norm": 0.2099238932132721, + "learning_rate": 9.390372333197446e-05, + "loss": 3.8992, + "step": 7185 + }, + { + "epoch": 0.488517461611632, + "grad_norm": 0.1657472550868988, + "learning_rate": 9.389947683109118e-05, + "loss": 4.0689, + "step": 7190 + }, + { + "epoch": 0.4888571816822938, + "grad_norm": 2.5052669048309326, + "learning_rate": 9.389523033020792e-05, + "loss": 4.2311, + "step": 7195 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 0.18674081563949585, + "learning_rate": 9.389098382932465e-05, + "loss": 4.0554, + "step": 7200 + }, + { + "epoch": 0.48953662182361735, + "grad_norm": 0.17581969499588013, + "learning_rate": 9.388673732844136e-05, + "loss": 3.9992, + "step": 7205 + }, + { + "epoch": 0.4898763418942791, + "grad_norm": 0.2378435879945755, + "learning_rate": 9.38824908275581e-05, + "loss": 4.1965, + "step": 7210 + }, + { + "epoch": 0.4902160619649409, + "grad_norm": 0.21895438432693481, + "learning_rate": 9.387824432667483e-05, + "loss": 3.9994, + "step": 7215 + }, + { + "epoch": 0.4905557820356027, + "grad_norm": 0.18622034788131714, + "learning_rate": 9.387399782579154e-05, + "loss": 4.2384, + "step": 7220 + }, + { + "epoch": 0.4908955021062644, + "grad_norm": 0.21598079800605774, + "learning_rate": 9.386975132490829e-05, + "loss": 4.2434, + "step": 7225 + }, + { + "epoch": 0.4912352221769262, + "grad_norm": 1.403287410736084, + "learning_rate": 9.386550482402501e-05, + "loss": 3.9085, + "step": 7230 + }, + { + "epoch": 0.49157494224758796, + "grad_norm": 0.31629684567451477, + "learning_rate": 9.386125832314173e-05, + "loss": 4.0834, + "step": 7235 + }, + { + "epoch": 0.49191466231824976, + "grad_norm": 0.503648042678833, + "learning_rate": 9.385701182225847e-05, + "loss": 4.0346, + "step": 7240 + }, + { + "epoch": 0.49225438238891156, + "grad_norm": 0.16960012912750244, + "learning_rate": 9.38527653213752e-05, + "loss": 4.0295, + "step": 7245 + }, + { + "epoch": 0.4925941024595733, + "grad_norm": 0.2992265224456787, + "learning_rate": 9.384851882049191e-05, + "loss": 4.147, + "step": 7250 + }, + { + "epoch": 0.4929338225302351, + "grad_norm": 0.37070542573928833, + "learning_rate": 9.384427231960865e-05, + "loss": 3.6991, + "step": 7255 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 0.2252090573310852, + "learning_rate": 9.384002581872537e-05, + "loss": 4.2128, + "step": 7260 + }, + { + "epoch": 0.49361326267155864, + "grad_norm": 0.1695706844329834, + "learning_rate": 9.38357793178421e-05, + "loss": 4.0321, + "step": 7265 + }, + { + "epoch": 0.49395298274222044, + "grad_norm": 0.23216302692890167, + "learning_rate": 9.383153281695884e-05, + "loss": 3.9758, + "step": 7270 + }, + { + "epoch": 0.4942927028128822, + "grad_norm": 0.9034651517868042, + "learning_rate": 9.382728631607555e-05, + "loss": 3.9329, + "step": 7275 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 0.21417531371116638, + "learning_rate": 9.382303981519228e-05, + "loss": 3.8799, + "step": 7280 + }, + { + "epoch": 0.4949721429542057, + "grad_norm": 0.18411661684513092, + "learning_rate": 9.381879331430902e-05, + "loss": 4.2956, + "step": 7285 + }, + { + "epoch": 0.4953118630248675, + "grad_norm": 0.23416036367416382, + "learning_rate": 9.381454681342574e-05, + "loss": 3.8779, + "step": 7290 + }, + { + "epoch": 0.49565158309552926, + "grad_norm": 0.2349618524312973, + "learning_rate": 9.381030031254246e-05, + "loss": 4.027, + "step": 7295 + }, + { + "epoch": 0.49599130316619106, + "grad_norm": 0.2522861659526825, + "learning_rate": 9.38060538116592e-05, + "loss": 4.1115, + "step": 7300 + }, + { + "epoch": 0.49633102323685285, + "grad_norm": 0.18916946649551392, + "learning_rate": 9.380180731077592e-05, + "loss": 4.1002, + "step": 7305 + }, + { + "epoch": 0.4966707433075146, + "grad_norm": 0.20227526128292084, + "learning_rate": 9.379756080989265e-05, + "loss": 4.0984, + "step": 7310 + }, + { + "epoch": 0.4970104633781764, + "grad_norm": 0.20442931354045868, + "learning_rate": 9.379331430900939e-05, + "loss": 4.0096, + "step": 7315 + }, + { + "epoch": 0.49735018344883813, + "grad_norm": 0.19708792865276337, + "learning_rate": 9.37890678081261e-05, + "loss": 4.2264, + "step": 7320 + }, + { + "epoch": 0.49768990351949993, + "grad_norm": 0.2058994174003601, + "learning_rate": 9.378482130724283e-05, + "loss": 3.9661, + "step": 7325 + }, + { + "epoch": 0.49802962359016173, + "grad_norm": 0.18831300735473633, + "learning_rate": 9.378057480635956e-05, + "loss": 4.0001, + "step": 7330 + }, + { + "epoch": 0.49836934366082347, + "grad_norm": 0.5251606702804565, + "learning_rate": 9.377632830547629e-05, + "loss": 4.1706, + "step": 7335 + }, + { + "epoch": 0.49870906373148527, + "grad_norm": 0.17007534205913544, + "learning_rate": 9.377208180459302e-05, + "loss": 4.1699, + "step": 7340 + }, + { + "epoch": 0.499048783802147, + "grad_norm": 0.3484830856323242, + "learning_rate": 9.376783530370974e-05, + "loss": 3.944, + "step": 7345 + }, + { + "epoch": 0.4993885038728088, + "grad_norm": 0.20382869243621826, + "learning_rate": 9.376358880282647e-05, + "loss": 3.7016, + "step": 7350 + }, + { + "epoch": 0.4997282239434706, + "grad_norm": 0.2002745270729065, + "learning_rate": 9.37593423019432e-05, + "loss": 4.1238, + "step": 7355 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 0.17399045825004578, + "learning_rate": 9.375509580105993e-05, + "loss": 4.0909, + "step": 7360 + }, + { + "epoch": 0.5004076640847941, + "grad_norm": 0.24848084151744843, + "learning_rate": 9.375084930017666e-05, + "loss": 4.1379, + "step": 7365 + }, + { + "epoch": 0.5007473841554559, + "grad_norm": 0.5024029016494751, + "learning_rate": 9.374660279929338e-05, + "loss": 4.2591, + "step": 7370 + }, + { + "epoch": 0.5010871042261177, + "grad_norm": 0.22552575170993805, + "learning_rate": 9.374235629841011e-05, + "loss": 4.049, + "step": 7375 + }, + { + "epoch": 0.5014268242967794, + "grad_norm": 0.2249031662940979, + "learning_rate": 9.373810979752684e-05, + "loss": 4.2421, + "step": 7380 + }, + { + "epoch": 0.5017665443674413, + "grad_norm": 0.22408431768417358, + "learning_rate": 9.373386329664357e-05, + "loss": 4.3666, + "step": 7385 + }, + { + "epoch": 0.502106264438103, + "grad_norm": 0.16393537819385529, + "learning_rate": 9.37296167957603e-05, + "loss": 3.7884, + "step": 7390 + }, + { + "epoch": 0.5024459845087648, + "grad_norm": 0.25391802191734314, + "learning_rate": 9.372537029487702e-05, + "loss": 4.1085, + "step": 7395 + }, + { + "epoch": 0.5027857045794265, + "grad_norm": 0.25248852372169495, + "learning_rate": 9.372112379399375e-05, + "loss": 4.0558, + "step": 7400 + }, + { + "epoch": 0.5031254246500884, + "grad_norm": 0.2197033017873764, + "learning_rate": 9.371687729311048e-05, + "loss": 3.979, + "step": 7405 + }, + { + "epoch": 0.5034651447207501, + "grad_norm": 0.20195040106773376, + "learning_rate": 9.371263079222721e-05, + "loss": 3.8918, + "step": 7410 + }, + { + "epoch": 0.5038048647914118, + "grad_norm": 0.1969507336616516, + "learning_rate": 9.370838429134394e-05, + "loss": 3.9211, + "step": 7415 + }, + { + "epoch": 0.5041445848620737, + "grad_norm": 0.26221612095832825, + "learning_rate": 9.370413779046066e-05, + "loss": 4.067, + "step": 7420 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 0.20401246845722198, + "learning_rate": 9.369989128957739e-05, + "loss": 3.9535, + "step": 7425 + }, + { + "epoch": 0.5048240250033972, + "grad_norm": 0.5490508675575256, + "learning_rate": 9.369564478869412e-05, + "loss": 3.9772, + "step": 7430 + }, + { + "epoch": 0.5051637450740589, + "grad_norm": 0.1551188975572586, + "learning_rate": 9.369139828781085e-05, + "loss": 4.1125, + "step": 7435 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 0.31335705518722534, + "learning_rate": 9.368715178692758e-05, + "loss": 4.0064, + "step": 7440 + }, + { + "epoch": 0.5058431852153825, + "grad_norm": 0.23974861204624176, + "learning_rate": 9.36829052860443e-05, + "loss": 3.9679, + "step": 7445 + }, + { + "epoch": 0.5061829052860443, + "grad_norm": 0.15649579465389252, + "learning_rate": 9.367865878516103e-05, + "loss": 4.005, + "step": 7450 + }, + { + "epoch": 0.5065226253567061, + "grad_norm": 0.18585532903671265, + "learning_rate": 9.367441228427776e-05, + "loss": 3.8728, + "step": 7455 + }, + { + "epoch": 0.5068623454273679, + "grad_norm": 0.2581709623336792, + "learning_rate": 9.367016578339449e-05, + "loss": 4.0771, + "step": 7460 + }, + { + "epoch": 0.5072020654980296, + "grad_norm": 0.2468833178281784, + "learning_rate": 9.366591928251122e-05, + "loss": 4.054, + "step": 7465 + }, + { + "epoch": 0.5075417855686915, + "grad_norm": 0.2019713968038559, + "learning_rate": 9.366167278162794e-05, + "loss": 3.6126, + "step": 7470 + }, + { + "epoch": 0.5078815056393532, + "grad_norm": 0.25231555104255676, + "learning_rate": 9.365742628074467e-05, + "loss": 4.4004, + "step": 7475 + }, + { + "epoch": 0.5082212257100149, + "grad_norm": 0.14785149693489075, + "learning_rate": 9.36531797798614e-05, + "loss": 3.7965, + "step": 7480 + }, + { + "epoch": 0.5085609457806767, + "grad_norm": 0.21854764223098755, + "learning_rate": 9.364893327897813e-05, + "loss": 4.3081, + "step": 7485 + }, + { + "epoch": 0.5089006658513385, + "grad_norm": 0.35712555050849915, + "learning_rate": 9.364468677809486e-05, + "loss": 3.7848, + "step": 7490 + }, + { + "epoch": 0.5092403859220003, + "grad_norm": 0.9170289039611816, + "learning_rate": 9.364044027721158e-05, + "loss": 4.0845, + "step": 7495 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.19155257940292358, + "learning_rate": 9.363619377632831e-05, + "loss": 4.0807, + "step": 7500 + }, + { + "epoch": 0.5099198260633239, + "grad_norm": 0.20362383127212524, + "learning_rate": 9.363194727544504e-05, + "loss": 4.1181, + "step": 7505 + }, + { + "epoch": 0.5102595461339856, + "grad_norm": 1.610526204109192, + "learning_rate": 9.362770077456177e-05, + "loss": 4.1332, + "step": 7510 + }, + { + "epoch": 0.5105992662046474, + "grad_norm": 0.36077892780303955, + "learning_rate": 9.36234542736785e-05, + "loss": 4.0379, + "step": 7515 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 0.34184160828590393, + "learning_rate": 9.361920777279522e-05, + "loss": 4.0309, + "step": 7520 + }, + { + "epoch": 0.511278706345971, + "grad_norm": 0.266156941652298, + "learning_rate": 9.361496127191195e-05, + "loss": 4.0715, + "step": 7525 + }, + { + "epoch": 0.5116184264166327, + "grad_norm": 0.21247410774230957, + "learning_rate": 9.361071477102867e-05, + "loss": 4.1461, + "step": 7530 + }, + { + "epoch": 0.5119581464872944, + "grad_norm": 0.3173115849494934, + "learning_rate": 9.360646827014541e-05, + "loss": 4.0909, + "step": 7535 + }, + { + "epoch": 0.5122978665579563, + "grad_norm": 0.1932353973388672, + "learning_rate": 9.360222176926214e-05, + "loss": 4.0699, + "step": 7540 + }, + { + "epoch": 0.512637586628618, + "grad_norm": 0.34887808561325073, + "learning_rate": 9.359797526837885e-05, + "loss": 3.8442, + "step": 7545 + }, + { + "epoch": 0.5129773066992798, + "grad_norm": 0.1603212207555771, + "learning_rate": 9.359372876749559e-05, + "loss": 3.9252, + "step": 7550 + }, + { + "epoch": 0.5133170267699416, + "grad_norm": 0.18673382699489594, + "learning_rate": 9.358948226661232e-05, + "loss": 4.132, + "step": 7555 + }, + { + "epoch": 0.5136567468406034, + "grad_norm": 0.17931464314460754, + "learning_rate": 9.358523576572904e-05, + "loss": 4.2045, + "step": 7560 + }, + { + "epoch": 0.5139964669112651, + "grad_norm": 0.20832332968711853, + "learning_rate": 9.358098926484578e-05, + "loss": 4.1134, + "step": 7565 + }, + { + "epoch": 0.5143361869819268, + "grad_norm": 0.1900588423013687, + "learning_rate": 9.35767427639625e-05, + "loss": 4.1737, + "step": 7570 + }, + { + "epoch": 0.5146759070525887, + "grad_norm": 0.25555694103240967, + "learning_rate": 9.357249626307922e-05, + "loss": 3.9879, + "step": 7575 + }, + { + "epoch": 0.5150156271232504, + "grad_norm": 0.3741958737373352, + "learning_rate": 9.356824976219596e-05, + "loss": 4.0922, + "step": 7580 + }, + { + "epoch": 0.5153553471939122, + "grad_norm": 0.25290772318840027, + "learning_rate": 9.356400326131269e-05, + "loss": 4.1101, + "step": 7585 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 0.23055055737495422, + "learning_rate": 9.35597567604294e-05, + "loss": 3.9841, + "step": 7590 + }, + { + "epoch": 0.5160347873352358, + "grad_norm": 0.2004847675561905, + "learning_rate": 9.355551025954614e-05, + "loss": 4.0573, + "step": 7595 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 0.17430374026298523, + "learning_rate": 9.355126375866287e-05, + "loss": 4.1038, + "step": 7600 + }, + { + "epoch": 0.5167142274765593, + "grad_norm": 0.19129951298236847, + "learning_rate": 9.354701725777959e-05, + "loss": 3.9517, + "step": 7605 + }, + { + "epoch": 0.5170539475472211, + "grad_norm": 0.24866996705532074, + "learning_rate": 9.354277075689633e-05, + "loss": 3.9927, + "step": 7610 + }, + { + "epoch": 0.5173936676178829, + "grad_norm": 0.2479562759399414, + "learning_rate": 9.353852425601304e-05, + "loss": 3.824, + "step": 7615 + }, + { + "epoch": 0.5177333876885446, + "grad_norm": 0.25535717606544495, + "learning_rate": 9.353427775512977e-05, + "loss": 4.1203, + "step": 7620 + }, + { + "epoch": 0.5180731077592065, + "grad_norm": 0.19401293992996216, + "learning_rate": 9.353003125424651e-05, + "loss": 4.2723, + "step": 7625 + }, + { + "epoch": 0.5184128278298682, + "grad_norm": 0.25242581963539124, + "learning_rate": 9.352578475336323e-05, + "loss": 3.7278, + "step": 7630 + }, + { + "epoch": 0.5187525479005299, + "grad_norm": 0.24895191192626953, + "learning_rate": 9.352153825247996e-05, + "loss": 4.0693, + "step": 7635 + }, + { + "epoch": 0.5190922679711918, + "grad_norm": 0.208522230386734, + "learning_rate": 9.35172917515967e-05, + "loss": 4.105, + "step": 7640 + }, + { + "epoch": 0.5194319880418535, + "grad_norm": 0.21266648173332214, + "learning_rate": 9.351304525071341e-05, + "loss": 4.0295, + "step": 7645 + }, + { + "epoch": 0.5197717081125153, + "grad_norm": 0.1788320690393448, + "learning_rate": 9.350879874983014e-05, + "loss": 3.9959, + "step": 7650 + }, + { + "epoch": 0.520111428183177, + "grad_norm": 0.36194828152656555, + "learning_rate": 9.350455224894688e-05, + "loss": 4.0003, + "step": 7655 + }, + { + "epoch": 0.5204511482538389, + "grad_norm": 0.22471646964550018, + "learning_rate": 9.35003057480636e-05, + "loss": 4.0001, + "step": 7660 + }, + { + "epoch": 0.5207908683245006, + "grad_norm": 0.20140118896961212, + "learning_rate": 9.349605924718032e-05, + "loss": 3.802, + "step": 7665 + }, + { + "epoch": 0.5211305883951624, + "grad_norm": 0.2610224485397339, + "learning_rate": 9.349181274629706e-05, + "loss": 4.0378, + "step": 7670 + }, + { + "epoch": 0.5214703084658242, + "grad_norm": 0.19528135657310486, + "learning_rate": 9.348756624541378e-05, + "loss": 4.059, + "step": 7675 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 0.1775432527065277, + "learning_rate": 9.348331974453051e-05, + "loss": 4.0532, + "step": 7680 + }, + { + "epoch": 0.5221497486071477, + "grad_norm": 4.0261311531066895, + "learning_rate": 9.347907324364724e-05, + "loss": 4.1247, + "step": 7685 + }, + { + "epoch": 0.5224894686778094, + "grad_norm": 0.18927231431007385, + "learning_rate": 9.347482674276396e-05, + "loss": 4.1597, + "step": 7690 + }, + { + "epoch": 0.5228291887484713, + "grad_norm": 0.17525963485240936, + "learning_rate": 9.347058024188069e-05, + "loss": 3.7783, + "step": 7695 + }, + { + "epoch": 0.523168908819133, + "grad_norm": 0.15147970616817474, + "learning_rate": 9.346633374099742e-05, + "loss": 4.017, + "step": 7700 + }, + { + "epoch": 0.5235086288897948, + "grad_norm": 0.17710480093955994, + "learning_rate": 9.346208724011415e-05, + "loss": 4.3239, + "step": 7705 + }, + { + "epoch": 0.5238483489604566, + "grad_norm": 0.1875525414943695, + "learning_rate": 9.345784073923088e-05, + "loss": 3.9417, + "step": 7710 + }, + { + "epoch": 0.5241880690311184, + "grad_norm": 0.2575678825378418, + "learning_rate": 9.34535942383476e-05, + "loss": 4.1485, + "step": 7715 + }, + { + "epoch": 0.5245277891017801, + "grad_norm": 0.8326651453971863, + "learning_rate": 9.344934773746433e-05, + "loss": 4.0598, + "step": 7720 + }, + { + "epoch": 0.524867509172442, + "grad_norm": 0.1673835813999176, + "learning_rate": 9.344510123658106e-05, + "loss": 4.1255, + "step": 7725 + }, + { + "epoch": 0.5252072292431037, + "grad_norm": 0.764521598815918, + "learning_rate": 9.344085473569779e-05, + "loss": 3.9019, + "step": 7730 + }, + { + "epoch": 0.5255469493137654, + "grad_norm": 0.17800885438919067, + "learning_rate": 9.343660823481452e-05, + "loss": 3.7869, + "step": 7735 + }, + { + "epoch": 0.5258866693844272, + "grad_norm": 0.1920061856508255, + "learning_rate": 9.343236173393124e-05, + "loss": 4.1229, + "step": 7740 + }, + { + "epoch": 0.526226389455089, + "grad_norm": 3.5604405403137207, + "learning_rate": 9.342811523304797e-05, + "loss": 3.9478, + "step": 7745 + }, + { + "epoch": 0.5265661095257508, + "grad_norm": 0.194805309176445, + "learning_rate": 9.34238687321647e-05, + "loss": 4.2643, + "step": 7750 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 0.22051791846752167, + "learning_rate": 9.341962223128143e-05, + "loss": 4.1483, + "step": 7755 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 0.22619082033634186, + "learning_rate": 9.341622503057481e-05, + "loss": 3.5109, + "step": 7760 + }, + { + "epoch": 0.5275852697377361, + "grad_norm": 0.2676302194595337, + "learning_rate": 9.341197852969154e-05, + "loss": 4.174, + "step": 7765 + }, + { + "epoch": 0.5279249898083979, + "grad_norm": 0.25767549872398376, + "learning_rate": 9.340773202880827e-05, + "loss": 3.9739, + "step": 7770 + }, + { + "epoch": 0.5282647098790596, + "grad_norm": 0.18924906849861145, + "learning_rate": 9.3403485527925e-05, + "loss": 4.2547, + "step": 7775 + }, + { + "epoch": 0.5286044299497215, + "grad_norm": 0.1977754682302475, + "learning_rate": 9.339923902704172e-05, + "loss": 3.8264, + "step": 7780 + }, + { + "epoch": 0.5289441500203832, + "grad_norm": 0.1865202784538269, + "learning_rate": 9.339499252615845e-05, + "loss": 4.2391, + "step": 7785 + }, + { + "epoch": 0.5292838700910449, + "grad_norm": 0.22030463814735413, + "learning_rate": 9.339074602527518e-05, + "loss": 4.1641, + "step": 7790 + }, + { + "epoch": 0.5296235901617068, + "grad_norm": 0.1868383139371872, + "learning_rate": 9.33864995243919e-05, + "loss": 4.247, + "step": 7795 + }, + { + "epoch": 0.5299633102323685, + "grad_norm": 0.19364790618419647, + "learning_rate": 9.338225302350863e-05, + "loss": 3.9474, + "step": 7800 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.1819707751274109, + "learning_rate": 9.337800652262536e-05, + "loss": 4.0559, + "step": 7805 + }, + { + "epoch": 0.5306427503736921, + "grad_norm": 0.25897395610809326, + "learning_rate": 9.337376002174209e-05, + "loss": 3.9502, + "step": 7810 + }, + { + "epoch": 0.5309824704443539, + "grad_norm": 0.23874245584011078, + "learning_rate": 9.336951352085882e-05, + "loss": 3.89, + "step": 7815 + }, + { + "epoch": 0.5313221905150156, + "grad_norm": 0.16747428476810455, + "learning_rate": 9.336526701997555e-05, + "loss": 4.0011, + "step": 7820 + }, + { + "epoch": 0.5316619105856774, + "grad_norm": 0.17043693363666534, + "learning_rate": 9.336102051909227e-05, + "loss": 4.0938, + "step": 7825 + }, + { + "epoch": 0.5320016306563392, + "grad_norm": 0.19424232840538025, + "learning_rate": 9.3356774018209e-05, + "loss": 3.9479, + "step": 7830 + }, + { + "epoch": 0.532341350727001, + "grad_norm": 0.1900462657213211, + "learning_rate": 9.335252751732573e-05, + "loss": 4.2105, + "step": 7835 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 0.2120985984802246, + "learning_rate": 9.334828101644246e-05, + "loss": 4.2686, + "step": 7840 + }, + { + "epoch": 0.5330207908683245, + "grad_norm": 0.24349243938922882, + "learning_rate": 9.334403451555919e-05, + "loss": 4.09, + "step": 7845 + }, + { + "epoch": 0.5333605109389863, + "grad_norm": 0.22167713940143585, + "learning_rate": 9.333978801467591e-05, + "loss": 4.0405, + "step": 7850 + }, + { + "epoch": 0.533700231009648, + "grad_norm": 0.2967727780342102, + "learning_rate": 9.333554151379263e-05, + "loss": 4.3056, + "step": 7855 + }, + { + "epoch": 0.5340399510803098, + "grad_norm": 0.1543935239315033, + "learning_rate": 9.333129501290937e-05, + "loss": 4.0946, + "step": 7860 + }, + { + "epoch": 0.5343796711509716, + "grad_norm": 0.27082520723342896, + "learning_rate": 9.33270485120261e-05, + "loss": 3.9952, + "step": 7865 + }, + { + "epoch": 0.5347193912216334, + "grad_norm": 0.27814018726348877, + "learning_rate": 9.332280201114281e-05, + "loss": 4.2333, + "step": 7870 + }, + { + "epoch": 0.5350591112922951, + "grad_norm": 0.18961313366889954, + "learning_rate": 9.331855551025955e-05, + "loss": 4.1017, + "step": 7875 + }, + { + "epoch": 0.535398831362957, + "grad_norm": 0.19121582806110382, + "learning_rate": 9.331430900937628e-05, + "loss": 4.1663, + "step": 7880 + }, + { + "epoch": 0.5357385514336187, + "grad_norm": 0.20209085941314697, + "learning_rate": 9.3310062508493e-05, + "loss": 3.9602, + "step": 7885 + }, + { + "epoch": 0.5360782715042804, + "grad_norm": 0.222623810172081, + "learning_rate": 9.330581600760974e-05, + "loss": 4.208, + "step": 7890 + }, + { + "epoch": 0.5364179915749423, + "grad_norm": 0.2834756672382355, + "learning_rate": 9.330156950672647e-05, + "loss": 3.8017, + "step": 7895 + }, + { + "epoch": 0.536757711645604, + "grad_norm": 0.26975885033607483, + "learning_rate": 9.329732300584318e-05, + "loss": 3.8618, + "step": 7900 + }, + { + "epoch": 0.5370974317162658, + "grad_norm": 0.24554960429668427, + "learning_rate": 9.329307650495992e-05, + "loss": 4.1339, + "step": 7905 + }, + { + "epoch": 0.5374371517869275, + "grad_norm": 0.19073788821697235, + "learning_rate": 9.328883000407665e-05, + "loss": 4.1798, + "step": 7910 + }, + { + "epoch": 0.5377768718575894, + "grad_norm": 0.1700531542301178, + "learning_rate": 9.328458350319336e-05, + "loss": 4.1152, + "step": 7915 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.2178450971841812, + "learning_rate": 9.32803370023101e-05, + "loss": 3.9352, + "step": 7920 + }, + { + "epoch": 0.5384563119989129, + "grad_norm": 0.20611537992954254, + "learning_rate": 9.327609050142682e-05, + "loss": 4.0871, + "step": 7925 + }, + { + "epoch": 0.5387960320695747, + "grad_norm": 0.20074544847011566, + "learning_rate": 9.327184400054355e-05, + "loss": 4.1689, + "step": 7930 + }, + { + "epoch": 0.5391357521402365, + "grad_norm": 0.18785405158996582, + "learning_rate": 9.326759749966029e-05, + "loss": 4.0912, + "step": 7935 + }, + { + "epoch": 0.5394754722108982, + "grad_norm": 0.2205738127231598, + "learning_rate": 9.3263350998777e-05, + "loss": 4.0251, + "step": 7940 + }, + { + "epoch": 0.5398151922815599, + "grad_norm": 0.2106117606163025, + "learning_rate": 9.325910449789373e-05, + "loss": 3.9616, + "step": 7945 + }, + { + "epoch": 0.5401549123522218, + "grad_norm": 0.20841112732887268, + "learning_rate": 9.325485799701047e-05, + "loss": 4.1365, + "step": 7950 + }, + { + "epoch": 0.5404946324228835, + "grad_norm": 0.20364391803741455, + "learning_rate": 9.325061149612719e-05, + "loss": 3.8211, + "step": 7955 + }, + { + "epoch": 0.5408343524935453, + "grad_norm": 0.1856883019208908, + "learning_rate": 9.324636499524393e-05, + "loss": 4.102, + "step": 7960 + }, + { + "epoch": 0.5411740725642071, + "grad_norm": 0.22017249464988708, + "learning_rate": 9.324211849436066e-05, + "loss": 3.9317, + "step": 7965 + }, + { + "epoch": 0.5415137926348689, + "grad_norm": 0.2391664832830429, + "learning_rate": 9.323787199347737e-05, + "loss": 3.975, + "step": 7970 + }, + { + "epoch": 0.5418535127055306, + "grad_norm": 0.15632979571819305, + "learning_rate": 9.323362549259411e-05, + "loss": 4.1384, + "step": 7975 + }, + { + "epoch": 0.5421932327761925, + "grad_norm": 0.18496164679527283, + "learning_rate": 9.322937899171084e-05, + "loss": 4.1871, + "step": 7980 + }, + { + "epoch": 0.5425329528468542, + "grad_norm": 0.18015219271183014, + "learning_rate": 9.322513249082756e-05, + "loss": 3.9501, + "step": 7985 + }, + { + "epoch": 0.542872672917516, + "grad_norm": 0.19946110248565674, + "learning_rate": 9.32208859899443e-05, + "loss": 3.928, + "step": 7990 + }, + { + "epoch": 0.5432123929881777, + "grad_norm": 0.18122997879981995, + "learning_rate": 9.321663948906103e-05, + "loss": 4.0794, + "step": 7995 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 0.2898665964603424, + "learning_rate": 9.321239298817774e-05, + "loss": 4.1946, + "step": 8000 + }, + { + "epoch": 0.5438918331295013, + "grad_norm": 0.19237551093101501, + "learning_rate": 9.320814648729448e-05, + "loss": 3.8687, + "step": 8005 + }, + { + "epoch": 0.544231553200163, + "grad_norm": 0.18704599142074585, + "learning_rate": 9.32038999864112e-05, + "loss": 4.0483, + "step": 8010 + }, + { + "epoch": 0.5445712732708249, + "grad_norm": 0.16550405323505402, + "learning_rate": 9.319965348552792e-05, + "loss": 3.927, + "step": 8015 + }, + { + "epoch": 0.5449109933414866, + "grad_norm": 0.20201174914836884, + "learning_rate": 9.319540698464467e-05, + "loss": 4.1593, + "step": 8020 + }, + { + "epoch": 0.5452507134121484, + "grad_norm": 0.19555409252643585, + "learning_rate": 9.319116048376138e-05, + "loss": 4.2645, + "step": 8025 + }, + { + "epoch": 0.5455904334828101, + "grad_norm": 0.20266516506671906, + "learning_rate": 9.318691398287811e-05, + "loss": 4.165, + "step": 8030 + }, + { + "epoch": 0.545930153553472, + "grad_norm": 0.18069981038570404, + "learning_rate": 9.318266748199485e-05, + "loss": 4.0198, + "step": 8035 + }, + { + "epoch": 0.5462698736241337, + "grad_norm": 0.1856188029050827, + "learning_rate": 9.317842098111156e-05, + "loss": 3.9087, + "step": 8040 + }, + { + "epoch": 0.5466095936947954, + "grad_norm": 0.19204466044902802, + "learning_rate": 9.317417448022829e-05, + "loss": 4.0723, + "step": 8045 + }, + { + "epoch": 0.5469493137654573, + "grad_norm": 0.19473916292190552, + "learning_rate": 9.316992797934503e-05, + "loss": 3.9512, + "step": 8050 + }, + { + "epoch": 0.547289033836119, + "grad_norm": 0.23411086201667786, + "learning_rate": 9.316568147846175e-05, + "loss": 4.019, + "step": 8055 + }, + { + "epoch": 0.5476287539067808, + "grad_norm": 0.17357757687568665, + "learning_rate": 9.316143497757848e-05, + "loss": 3.9088, + "step": 8060 + }, + { + "epoch": 0.5479684739774426, + "grad_norm": 0.19055502116680145, + "learning_rate": 9.315718847669522e-05, + "loss": 3.9099, + "step": 8065 + }, + { + "epoch": 0.5483081940481044, + "grad_norm": 0.21026012301445007, + "learning_rate": 9.315294197581193e-05, + "loss": 4.0825, + "step": 8070 + }, + { + "epoch": 0.5486479141187661, + "grad_norm": 0.1714550107717514, + "learning_rate": 9.314869547492866e-05, + "loss": 4.1415, + "step": 8075 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 0.18270696699619293, + "learning_rate": 9.314444897404539e-05, + "loss": 4.0709, + "step": 8080 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 0.1924707442522049, + "learning_rate": 9.314020247316212e-05, + "loss": 4.0765, + "step": 8085 + }, + { + "epoch": 0.5496670743307515, + "grad_norm": 1.1790002584457397, + "learning_rate": 9.313595597227884e-05, + "loss": 4.0, + "step": 8090 + }, + { + "epoch": 0.5500067944014132, + "grad_norm": 0.2330000251531601, + "learning_rate": 9.313170947139557e-05, + "loss": 4.0367, + "step": 8095 + }, + { + "epoch": 0.550346514472075, + "grad_norm": 0.21324478089809418, + "learning_rate": 9.31274629705123e-05, + "loss": 3.9346, + "step": 8100 + }, + { + "epoch": 0.5506862345427368, + "grad_norm": 0.1989852339029312, + "learning_rate": 9.312321646962903e-05, + "loss": 3.883, + "step": 8105 + }, + { + "epoch": 0.5510259546133985, + "grad_norm": 0.2093362659215927, + "learning_rate": 9.311896996874576e-05, + "loss": 3.945, + "step": 8110 + }, + { + "epoch": 0.5513656746840603, + "grad_norm": 0.17465944588184357, + "learning_rate": 9.311472346786248e-05, + "loss": 3.9291, + "step": 8115 + }, + { + "epoch": 0.5517053947547221, + "grad_norm": 0.7979787588119507, + "learning_rate": 9.311047696697921e-05, + "loss": 3.9622, + "step": 8120 + }, + { + "epoch": 0.5520451148253839, + "grad_norm": 0.20573332905769348, + "learning_rate": 9.310623046609594e-05, + "loss": 4.0717, + "step": 8125 + }, + { + "epoch": 0.5523848348960456, + "grad_norm": 0.8882215023040771, + "learning_rate": 9.310198396521267e-05, + "loss": 4.045, + "step": 8130 + }, + { + "epoch": 0.5527245549667075, + "grad_norm": 0.1780032366514206, + "learning_rate": 9.30977374643294e-05, + "loss": 3.8284, + "step": 8135 + }, + { + "epoch": 0.5530642750373692, + "grad_norm": 0.19671432673931122, + "learning_rate": 9.309349096344612e-05, + "loss": 4.0132, + "step": 8140 + }, + { + "epoch": 0.553403995108031, + "grad_norm": 0.17597247660160065, + "learning_rate": 9.308924446256285e-05, + "loss": 4.0527, + "step": 8145 + }, + { + "epoch": 0.5537437151786928, + "grad_norm": 0.19633089005947113, + "learning_rate": 9.308499796167958e-05, + "loss": 3.8693, + "step": 8150 + }, + { + "epoch": 0.5540834352493546, + "grad_norm": 0.19133026897907257, + "learning_rate": 9.308075146079631e-05, + "loss": 3.9765, + "step": 8155 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 0.18762901425361633, + "learning_rate": 9.307650495991304e-05, + "loss": 3.9366, + "step": 8160 + }, + { + "epoch": 0.554762875390678, + "grad_norm": 0.21953530609607697, + "learning_rate": 9.307225845902976e-05, + "loss": 4.1145, + "step": 8165 + }, + { + "epoch": 0.5551025954613399, + "grad_norm": 0.1631409376859665, + "learning_rate": 9.306801195814649e-05, + "loss": 4.3299, + "step": 8170 + }, + { + "epoch": 0.5554423155320016, + "grad_norm": 0.1782941222190857, + "learning_rate": 9.306376545726322e-05, + "loss": 4.0921, + "step": 8175 + }, + { + "epoch": 0.5557820356026634, + "grad_norm": 0.18272286653518677, + "learning_rate": 9.305951895637995e-05, + "loss": 4.0609, + "step": 8180 + }, + { + "epoch": 0.5561217556733252, + "grad_norm": 0.5140985250473022, + "learning_rate": 9.305527245549668e-05, + "loss": 3.9297, + "step": 8185 + }, + { + "epoch": 0.556461475743987, + "grad_norm": 1.4922987222671509, + "learning_rate": 9.30510259546134e-05, + "loss": 4.0238, + "step": 8190 + }, + { + "epoch": 0.5568011958146487, + "grad_norm": 3.8372085094451904, + "learning_rate": 9.304677945373013e-05, + "loss": 4.1705, + "step": 8195 + }, + { + "epoch": 0.5571409158853105, + "grad_norm": 0.1523323357105255, + "learning_rate": 9.304253295284686e-05, + "loss": 4.0109, + "step": 8200 + }, + { + "epoch": 0.5574806359559723, + "grad_norm": 0.2263563573360443, + "learning_rate": 9.303828645196359e-05, + "loss": 4.0453, + "step": 8205 + }, + { + "epoch": 0.557820356026634, + "grad_norm": 0.20858174562454224, + "learning_rate": 9.30340399510803e-05, + "loss": 4.0197, + "step": 8210 + }, + { + "epoch": 0.5581600760972958, + "grad_norm": 0.2111169844865799, + "learning_rate": 9.302979345019704e-05, + "loss": 3.9775, + "step": 8215 + }, + { + "epoch": 0.5584997961679576, + "grad_norm": 0.16785529255867004, + "learning_rate": 9.302554694931377e-05, + "loss": 4.1782, + "step": 8220 + }, + { + "epoch": 0.5588395162386194, + "grad_norm": 0.21160061657428741, + "learning_rate": 9.302130044843049e-05, + "loss": 3.7809, + "step": 8225 + }, + { + "epoch": 0.5591792363092811, + "grad_norm": 0.34842145442962646, + "learning_rate": 9.301705394754723e-05, + "loss": 4.0454, + "step": 8230 + }, + { + "epoch": 0.559518956379943, + "grad_norm": 0.22332549095153809, + "learning_rate": 9.301280744666396e-05, + "loss": 3.9447, + "step": 8235 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 0.33598944544792175, + "learning_rate": 9.300856094578067e-05, + "loss": 4.2742, + "step": 8240 + }, + { + "epoch": 0.5601983965212665, + "grad_norm": 0.18852943181991577, + "learning_rate": 9.300431444489741e-05, + "loss": 3.9494, + "step": 8245 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 0.1902894675731659, + "learning_rate": 9.300006794401414e-05, + "loss": 4.0114, + "step": 8250 + }, + { + "epoch": 0.5608778366625901, + "grad_norm": 0.3319913446903229, + "learning_rate": 9.299582144313086e-05, + "loss": 4.1077, + "step": 8255 + }, + { + "epoch": 0.5612175567332518, + "grad_norm": 1.5811398029327393, + "learning_rate": 9.29915749422476e-05, + "loss": 3.9531, + "step": 8260 + }, + { + "epoch": 0.5615572768039135, + "grad_norm": 0.18648011982440948, + "learning_rate": 9.298732844136433e-05, + "loss": 4.0406, + "step": 8265 + }, + { + "epoch": 0.5618969968745754, + "grad_norm": 0.20678003132343292, + "learning_rate": 9.298308194048104e-05, + "loss": 4.0141, + "step": 8270 + }, + { + "epoch": 0.5622367169452371, + "grad_norm": 0.284509539604187, + "learning_rate": 9.297883543959778e-05, + "loss": 3.7511, + "step": 8275 + }, + { + "epoch": 0.5625764370158989, + "grad_norm": 0.23593567311763763, + "learning_rate": 9.29745889387145e-05, + "loss": 3.9993, + "step": 8280 + }, + { + "epoch": 0.5629161570865606, + "grad_norm": 0.221805140376091, + "learning_rate": 9.297034243783122e-05, + "loss": 4.0023, + "step": 8285 + }, + { + "epoch": 0.5632558771572225, + "grad_norm": 0.1965785026550293, + "learning_rate": 9.296609593694797e-05, + "loss": 4.0849, + "step": 8290 + }, + { + "epoch": 0.5635955972278842, + "grad_norm": 0.3942621648311615, + "learning_rate": 9.296184943606468e-05, + "loss": 4.0986, + "step": 8295 + }, + { + "epoch": 0.563935317298546, + "grad_norm": 0.14950452744960785, + "learning_rate": 9.295760293518142e-05, + "loss": 4.0598, + "step": 8300 + }, + { + "epoch": 0.5642750373692078, + "grad_norm": 0.19888624548912048, + "learning_rate": 9.295335643429815e-05, + "loss": 4.1756, + "step": 8305 + }, + { + "epoch": 0.5646147574398696, + "grad_norm": 0.18374282121658325, + "learning_rate": 9.294910993341486e-05, + "loss": 4.0213, + "step": 8310 + }, + { + "epoch": 0.5649544775105313, + "grad_norm": 0.19329077005386353, + "learning_rate": 9.29448634325316e-05, + "loss": 3.8402, + "step": 8315 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 0.4161980152130127, + "learning_rate": 9.294061693164833e-05, + "loss": 3.8012, + "step": 8320 + }, + { + "epoch": 0.5656339176518549, + "grad_norm": 0.20671139657497406, + "learning_rate": 9.293637043076505e-05, + "loss": 4.335, + "step": 8325 + }, + { + "epoch": 0.5659736377225166, + "grad_norm": 0.20330487191677094, + "learning_rate": 9.293212392988179e-05, + "loss": 4.0325, + "step": 8330 + }, + { + "epoch": 0.5663133577931784, + "grad_norm": 0.19175422191619873, + "learning_rate": 9.292787742899852e-05, + "loss": 4.1178, + "step": 8335 + }, + { + "epoch": 0.5666530778638402, + "grad_norm": 0.22935859858989716, + "learning_rate": 9.292363092811523e-05, + "loss": 4.1405, + "step": 8340 + }, + { + "epoch": 0.566992797934502, + "grad_norm": 0.17749212682247162, + "learning_rate": 9.291938442723197e-05, + "loss": 3.9575, + "step": 8345 + }, + { + "epoch": 0.5673325180051637, + "grad_norm": 1.0764920711517334, + "learning_rate": 9.291513792634869e-05, + "loss": 4.3775, + "step": 8350 + }, + { + "epoch": 0.5676722380758256, + "grad_norm": 0.15739497542381287, + "learning_rate": 9.291089142546542e-05, + "loss": 4.1305, + "step": 8355 + }, + { + "epoch": 0.5680119581464873, + "grad_norm": 0.19351720809936523, + "learning_rate": 9.290664492458216e-05, + "loss": 3.8318, + "step": 8360 + }, + { + "epoch": 0.568351678217149, + "grad_norm": 0.16042912006378174, + "learning_rate": 9.290239842369887e-05, + "loss": 4.1028, + "step": 8365 + }, + { + "epoch": 0.5686913982878108, + "grad_norm": 0.1768733263015747, + "learning_rate": 9.28981519228156e-05, + "loss": 3.8895, + "step": 8370 + }, + { + "epoch": 0.5690311183584726, + "grad_norm": 0.15216295421123505, + "learning_rate": 9.289390542193234e-05, + "loss": 4.1829, + "step": 8375 + }, + { + "epoch": 0.5693708384291344, + "grad_norm": 0.20485453307628632, + "learning_rate": 9.288965892104906e-05, + "loss": 4.1128, + "step": 8380 + }, + { + "epoch": 0.5697105584997961, + "grad_norm": 0.2530359625816345, + "learning_rate": 9.288541242016578e-05, + "loss": 4.0308, + "step": 8385 + }, + { + "epoch": 0.570050278570458, + "grad_norm": 0.24390393495559692, + "learning_rate": 9.288116591928253e-05, + "loss": 3.9455, + "step": 8390 + }, + { + "epoch": 0.5703899986411197, + "grad_norm": 0.15447057783603668, + "learning_rate": 9.287691941839924e-05, + "loss": 4.2323, + "step": 8395 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 0.262494295835495, + "learning_rate": 9.287267291751597e-05, + "loss": 3.8537, + "step": 8400 + }, + { + "epoch": 0.5710694387824433, + "grad_norm": 0.2734646499156952, + "learning_rate": 9.286842641663271e-05, + "loss": 4.0975, + "step": 8405 + }, + { + "epoch": 0.5714091588531051, + "grad_norm": 0.3060397803783417, + "learning_rate": 9.286417991574942e-05, + "loss": 4.1002, + "step": 8410 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 0.194105863571167, + "learning_rate": 9.285993341486615e-05, + "loss": 3.7785, + "step": 8415 + }, + { + "epoch": 0.5720885989944285, + "grad_norm": 0.19388654828071594, + "learning_rate": 9.28556869139829e-05, + "loss": 4.0577, + "step": 8420 + }, + { + "epoch": 0.5724283190650904, + "grad_norm": 0.4632340669631958, + "learning_rate": 9.285144041309961e-05, + "loss": 4.0301, + "step": 8425 + }, + { + "epoch": 0.5727680391357521, + "grad_norm": 0.23817621171474457, + "learning_rate": 9.284719391221634e-05, + "loss": 3.9506, + "step": 8430 + }, + { + "epoch": 0.5731077592064139, + "grad_norm": 0.26604166626930237, + "learning_rate": 9.284294741133306e-05, + "loss": 3.7005, + "step": 8435 + }, + { + "epoch": 0.5734474792770757, + "grad_norm": 0.21348132193088531, + "learning_rate": 9.283870091044979e-05, + "loss": 4.0274, + "step": 8440 + }, + { + "epoch": 0.5737871993477375, + "grad_norm": 0.3157108724117279, + "learning_rate": 9.283445440956652e-05, + "loss": 3.8789, + "step": 8445 + }, + { + "epoch": 0.5741269194183992, + "grad_norm": 0.6310774087905884, + "learning_rate": 9.283020790868325e-05, + "loss": 4.0599, + "step": 8450 + }, + { + "epoch": 0.574466639489061, + "grad_norm": 0.18738152086734772, + "learning_rate": 9.282596140779998e-05, + "loss": 3.8024, + "step": 8455 + }, + { + "epoch": 0.5748063595597228, + "grad_norm": 0.3210128843784332, + "learning_rate": 9.28217149069167e-05, + "loss": 4.1008, + "step": 8460 + }, + { + "epoch": 0.5751460796303846, + "grad_norm": 0.180195152759552, + "learning_rate": 9.281746840603343e-05, + "loss": 4.0769, + "step": 8465 + }, + { + "epoch": 0.5754857997010463, + "grad_norm": 0.17868779599666595, + "learning_rate": 9.281322190515016e-05, + "loss": 3.9685, + "step": 8470 + }, + { + "epoch": 0.5758255197717081, + "grad_norm": 0.5827326774597168, + "learning_rate": 9.280897540426689e-05, + "loss": 3.9109, + "step": 8475 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 0.3124859631061554, + "learning_rate": 9.280472890338362e-05, + "loss": 3.8527, + "step": 8480 + }, + { + "epoch": 0.5765049599130316, + "grad_norm": 0.42251700162887573, + "learning_rate": 9.280048240250034e-05, + "loss": 4.1552, + "step": 8485 + }, + { + "epoch": 0.5768446799836935, + "grad_norm": 0.1950671374797821, + "learning_rate": 9.279623590161707e-05, + "loss": 3.836, + "step": 8490 + }, + { + "epoch": 0.5771844000543552, + "grad_norm": 0.8381164073944092, + "learning_rate": 9.27919894007338e-05, + "loss": 4.0999, + "step": 8495 + }, + { + "epoch": 0.577524120125017, + "grad_norm": 0.15643952786922455, + "learning_rate": 9.278774289985053e-05, + "loss": 3.9804, + "step": 8500 + }, + { + "epoch": 0.5778638401956787, + "grad_norm": 0.21564146876335144, + "learning_rate": 9.278349639896726e-05, + "loss": 3.9151, + "step": 8505 + }, + { + "epoch": 0.5782035602663406, + "grad_norm": 0.21207468211650848, + "learning_rate": 9.277924989808398e-05, + "loss": 3.9481, + "step": 8510 + }, + { + "epoch": 0.5785432803370023, + "grad_norm": 0.19761881232261658, + "learning_rate": 9.277500339720071e-05, + "loss": 4.0216, + "step": 8515 + }, + { + "epoch": 0.578883000407664, + "grad_norm": 0.18729913234710693, + "learning_rate": 9.277075689631744e-05, + "loss": 3.9112, + "step": 8520 + }, + { + "epoch": 0.5792227204783259, + "grad_norm": 0.13949252665042877, + "learning_rate": 9.276651039543417e-05, + "loss": 3.928, + "step": 8525 + }, + { + "epoch": 0.5795624405489876, + "grad_norm": 0.1481368988752365, + "learning_rate": 9.27622638945509e-05, + "loss": 4.1387, + "step": 8530 + }, + { + "epoch": 0.5799021606196494, + "grad_norm": 0.2295740246772766, + "learning_rate": 9.275801739366762e-05, + "loss": 4.0267, + "step": 8535 + }, + { + "epoch": 0.5802418806903111, + "grad_norm": 0.22337546944618225, + "learning_rate": 9.275377089278435e-05, + "loss": 3.927, + "step": 8540 + }, + { + "epoch": 0.580581600760973, + "grad_norm": 0.41308972239494324, + "learning_rate": 9.274952439190108e-05, + "loss": 4.1804, + "step": 8545 + }, + { + "epoch": 0.5809213208316347, + "grad_norm": 0.15988604724407196, + "learning_rate": 9.27452778910178e-05, + "loss": 3.8497, + "step": 8550 + }, + { + "epoch": 0.5812610409022965, + "grad_norm": 0.17153027653694153, + "learning_rate": 9.274103139013454e-05, + "loss": 4.1164, + "step": 8555 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 0.2415970116853714, + "learning_rate": 9.273678488925126e-05, + "loss": 4.1781, + "step": 8560 + }, + { + "epoch": 0.5819404810436201, + "grad_norm": 2.764481544494629, + "learning_rate": 9.273253838836798e-05, + "loss": 3.9708, + "step": 8565 + }, + { + "epoch": 0.5822802011142818, + "grad_norm": 0.1980726718902588, + "learning_rate": 9.272829188748472e-05, + "loss": 4.0011, + "step": 8570 + }, + { + "epoch": 0.5826199211849437, + "grad_norm": 0.23337212204933167, + "learning_rate": 9.272404538660145e-05, + "loss": 3.9122, + "step": 8575 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 0.1616441160440445, + "learning_rate": 9.271979888571816e-05, + "loss": 3.8085, + "step": 8580 + }, + { + "epoch": 0.5832993613262671, + "grad_norm": 0.19544613361358643, + "learning_rate": 9.27155523848349e-05, + "loss": 4.0634, + "step": 8585 + }, + { + "epoch": 0.5836390813969289, + "grad_norm": 0.19798190891742706, + "learning_rate": 9.271130588395163e-05, + "loss": 3.9989, + "step": 8590 + }, + { + "epoch": 0.5839788014675907, + "grad_norm": 0.1608157753944397, + "learning_rate": 9.270705938306835e-05, + "loss": 4.3514, + "step": 8595 + }, + { + "epoch": 0.5843185215382525, + "grad_norm": 0.23190398514270782, + "learning_rate": 9.270281288218509e-05, + "loss": 3.9989, + "step": 8600 + }, + { + "epoch": 0.5846582416089142, + "grad_norm": 0.694850742816925, + "learning_rate": 9.269856638130182e-05, + "loss": 4.3087, + "step": 8605 + }, + { + "epoch": 0.5849979616795761, + "grad_norm": 0.2711631655693054, + "learning_rate": 9.269431988041853e-05, + "loss": 3.9979, + "step": 8610 + }, + { + "epoch": 0.5853376817502378, + "grad_norm": 0.17717806994915009, + "learning_rate": 9.269007337953527e-05, + "loss": 3.9771, + "step": 8615 + }, + { + "epoch": 0.5856774018208996, + "grad_norm": 0.258022665977478, + "learning_rate": 9.2685826878652e-05, + "loss": 3.9518, + "step": 8620 + }, + { + "epoch": 0.5860171218915613, + "grad_norm": 1.0233123302459717, + "learning_rate": 9.268158037776871e-05, + "loss": 4.0393, + "step": 8625 + }, + { + "epoch": 0.5863568419622232, + "grad_norm": 0.19119100272655487, + "learning_rate": 9.267733387688546e-05, + "loss": 4.2572, + "step": 8630 + }, + { + "epoch": 0.5866965620328849, + "grad_norm": 0.16495579481124878, + "learning_rate": 9.267308737600217e-05, + "loss": 4.0212, + "step": 8635 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 0.3087138831615448, + "learning_rate": 9.266884087511891e-05, + "loss": 3.6527, + "step": 8640 + }, + { + "epoch": 0.5873760021742085, + "grad_norm": 0.16957888007164001, + "learning_rate": 9.266459437423564e-05, + "loss": 4.2809, + "step": 8645 + }, + { + "epoch": 0.5877157222448702, + "grad_norm": 0.15367551147937775, + "learning_rate": 9.266034787335235e-05, + "loss": 4.2469, + "step": 8650 + }, + { + "epoch": 0.588055442315532, + "grad_norm": 0.2356184720993042, + "learning_rate": 9.26561013724691e-05, + "loss": 3.8713, + "step": 8655 + }, + { + "epoch": 0.5883951623861938, + "grad_norm": 6.177441120147705, + "learning_rate": 9.265185487158582e-05, + "loss": 3.8173, + "step": 8660 + }, + { + "epoch": 0.5887348824568556, + "grad_norm": 0.15291577577590942, + "learning_rate": 9.264760837070254e-05, + "loss": 4.0039, + "step": 8665 + }, + { + "epoch": 0.5890746025275173, + "grad_norm": 0.1881755292415619, + "learning_rate": 9.264336186981928e-05, + "loss": 4.1521, + "step": 8670 + }, + { + "epoch": 0.589414322598179, + "grad_norm": 0.28705185651779175, + "learning_rate": 9.263911536893601e-05, + "loss": 4.1136, + "step": 8675 + }, + { + "epoch": 0.5897540426688409, + "grad_norm": 0.8784993886947632, + "learning_rate": 9.263486886805272e-05, + "loss": 4.0763, + "step": 8680 + }, + { + "epoch": 0.5900937627395026, + "grad_norm": 0.5077646970748901, + "learning_rate": 9.263062236716946e-05, + "loss": 3.818, + "step": 8685 + }, + { + "epoch": 0.5904334828101644, + "grad_norm": 0.17957673966884613, + "learning_rate": 9.262637586628619e-05, + "loss": 3.8784, + "step": 8690 + }, + { + "epoch": 0.5907732028808262, + "grad_norm": 0.2007267326116562, + "learning_rate": 9.26221293654029e-05, + "loss": 4.0291, + "step": 8695 + }, + { + "epoch": 0.591112922951488, + "grad_norm": 0.3630281686782837, + "learning_rate": 9.261788286451965e-05, + "loss": 4.0589, + "step": 8700 + }, + { + "epoch": 0.5914526430221497, + "grad_norm": 0.33113589882850647, + "learning_rate": 9.261363636363636e-05, + "loss": 4.0724, + "step": 8705 + }, + { + "epoch": 0.5917923630928115, + "grad_norm": 0.9370392560958862, + "learning_rate": 9.260938986275309e-05, + "loss": 3.9571, + "step": 8710 + }, + { + "epoch": 0.5921320831634733, + "grad_norm": 0.7348127365112305, + "learning_rate": 9.260514336186983e-05, + "loss": 3.8521, + "step": 8715 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 0.20114554464817047, + "learning_rate": 9.260089686098655e-05, + "loss": 4.0328, + "step": 8720 + }, + { + "epoch": 0.5928115233047968, + "grad_norm": 0.375806987285614, + "learning_rate": 9.259665036010327e-05, + "loss": 4.045, + "step": 8725 + }, + { + "epoch": 0.5931512433754587, + "grad_norm": 0.17481490969657898, + "learning_rate": 9.259240385922002e-05, + "loss": 4.1616, + "step": 8730 + }, + { + "epoch": 0.5934909634461204, + "grad_norm": 0.3022591471672058, + "learning_rate": 9.258815735833673e-05, + "loss": 4.0886, + "step": 8735 + }, + { + "epoch": 0.5938306835167821, + "grad_norm": 0.17564785480499268, + "learning_rate": 9.258391085745346e-05, + "loss": 3.9194, + "step": 8740 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 0.19886420667171478, + "learning_rate": 9.25796643565702e-05, + "loss": 3.9606, + "step": 8745 + }, + { + "epoch": 0.5945101236581057, + "grad_norm": 0.18436846137046814, + "learning_rate": 9.257541785568691e-05, + "loss": 4.0952, + "step": 8750 + }, + { + "epoch": 0.5948498437287675, + "grad_norm": 0.39597442746162415, + "learning_rate": 9.257117135480364e-05, + "loss": 4.0044, + "step": 8755 + }, + { + "epoch": 0.5951895637994292, + "grad_norm": 0.2468959391117096, + "learning_rate": 9.256692485392038e-05, + "loss": 4.2667, + "step": 8760 + }, + { + "epoch": 0.5955292838700911, + "grad_norm": 0.21869252622127533, + "learning_rate": 9.25626783530371e-05, + "loss": 3.862, + "step": 8765 + }, + { + "epoch": 0.5958690039407528, + "grad_norm": 0.18693305552005768, + "learning_rate": 9.255843185215383e-05, + "loss": 4.1253, + "step": 8770 + }, + { + "epoch": 0.5962087240114146, + "grad_norm": 0.2016768604516983, + "learning_rate": 9.255418535127055e-05, + "loss": 4.0373, + "step": 8775 + }, + { + "epoch": 0.5965484440820764, + "grad_norm": 0.20323170721530914, + "learning_rate": 9.254993885038728e-05, + "loss": 3.9236, + "step": 8780 + }, + { + "epoch": 0.5968881641527382, + "grad_norm": 0.17881441116333008, + "learning_rate": 9.254569234950401e-05, + "loss": 3.9083, + "step": 8785 + }, + { + "epoch": 0.5972278842233999, + "grad_norm": 0.22243642807006836, + "learning_rate": 9.254144584862074e-05, + "loss": 4.0484, + "step": 8790 + }, + { + "epoch": 0.5975676042940616, + "grad_norm": 0.160291388630867, + "learning_rate": 9.253719934773747e-05, + "loss": 4.0718, + "step": 8795 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 0.18238534033298492, + "learning_rate": 9.25329528468542e-05, + "loss": 3.8954, + "step": 8800 + }, + { + "epoch": 0.5982470444353852, + "grad_norm": 0.2048415243625641, + "learning_rate": 9.252870634597092e-05, + "loss": 3.9912, + "step": 8805 + }, + { + "epoch": 0.598586764506047, + "grad_norm": 0.17319513857364655, + "learning_rate": 9.252445984508765e-05, + "loss": 4.0986, + "step": 8810 + }, + { + "epoch": 0.5989264845767088, + "grad_norm": 0.24065633118152618, + "learning_rate": 9.252021334420438e-05, + "loss": 4.1444, + "step": 8815 + }, + { + "epoch": 0.5992662046473706, + "grad_norm": 0.20234987139701843, + "learning_rate": 9.251596684332111e-05, + "loss": 3.8242, + "step": 8820 + }, + { + "epoch": 0.5996059247180323, + "grad_norm": 0.21660032868385315, + "learning_rate": 9.251172034243783e-05, + "loss": 3.8864, + "step": 8825 + }, + { + "epoch": 0.5999456447886942, + "grad_norm": 0.17871896922588348, + "learning_rate": 9.250747384155456e-05, + "loss": 3.8432, + "step": 8830 + }, + { + "epoch": 0.6002853648593559, + "grad_norm": 0.16359539330005646, + "learning_rate": 9.250322734067129e-05, + "loss": 4.0026, + "step": 8835 + }, + { + "epoch": 0.6006250849300176, + "grad_norm": 0.2683166265487671, + "learning_rate": 9.249898083978802e-05, + "loss": 4.1686, + "step": 8840 + }, + { + "epoch": 0.6009648050006794, + "grad_norm": 0.16088762879371643, + "learning_rate": 9.249473433890475e-05, + "loss": 3.9448, + "step": 8845 + }, + { + "epoch": 0.6013045250713412, + "grad_norm": 0.21767516434192657, + "learning_rate": 9.249048783802147e-05, + "loss": 4.1642, + "step": 8850 + }, + { + "epoch": 0.601644245142003, + "grad_norm": 1.050563097000122, + "learning_rate": 9.24862413371382e-05, + "loss": 4.0446, + "step": 8855 + }, + { + "epoch": 0.6019839652126647, + "grad_norm": 0.16180701553821564, + "learning_rate": 9.248199483625493e-05, + "loss": 4.014, + "step": 8860 + }, + { + "epoch": 0.6023236852833266, + "grad_norm": 0.1821652054786682, + "learning_rate": 9.247774833537166e-05, + "loss": 4.0171, + "step": 8865 + }, + { + "epoch": 0.6026634053539883, + "grad_norm": 1.0722553730010986, + "learning_rate": 9.247350183448839e-05, + "loss": 3.8606, + "step": 8870 + }, + { + "epoch": 0.6030031254246501, + "grad_norm": 0.22805406153202057, + "learning_rate": 9.246925533360511e-05, + "loss": 3.993, + "step": 8875 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 0.2157077044248581, + "learning_rate": 9.246500883272184e-05, + "loss": 3.7777, + "step": 8880 + }, + { + "epoch": 0.6036825655659737, + "grad_norm": 0.18143793940544128, + "learning_rate": 9.246076233183857e-05, + "loss": 4.127, + "step": 8885 + }, + { + "epoch": 0.6040222856366354, + "grad_norm": 0.18271377682685852, + "learning_rate": 9.24565158309553e-05, + "loss": 4.0305, + "step": 8890 + }, + { + "epoch": 0.6043620057072971, + "grad_norm": 0.2080451250076294, + "learning_rate": 9.245226933007203e-05, + "loss": 3.8488, + "step": 8895 + }, + { + "epoch": 0.604701725777959, + "grad_norm": 1.494086742401123, + "learning_rate": 9.244802282918875e-05, + "loss": 4.0619, + "step": 8900 + }, + { + "epoch": 0.6050414458486207, + "grad_norm": 0.2166607528924942, + "learning_rate": 9.244377632830547e-05, + "loss": 3.8448, + "step": 8905 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 0.24948440492153168, + "learning_rate": 9.243952982742221e-05, + "loss": 3.9839, + "step": 8910 + }, + { + "epoch": 0.6057208859899443, + "grad_norm": 0.33411622047424316, + "learning_rate": 9.243528332653894e-05, + "loss": 4.0442, + "step": 8915 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.17864017188549042, + "learning_rate": 9.243103682565565e-05, + "loss": 3.996, + "step": 8920 + }, + { + "epoch": 0.6064003261312678, + "grad_norm": 0.18109479546546936, + "learning_rate": 9.24267903247724e-05, + "loss": 4.1308, + "step": 8925 + }, + { + "epoch": 0.6067400462019296, + "grad_norm": 0.21822641789913177, + "learning_rate": 9.242254382388912e-05, + "loss": 4.1508, + "step": 8930 + }, + { + "epoch": 0.6070797662725914, + "grad_norm": 0.37898629903793335, + "learning_rate": 9.241829732300584e-05, + "loss": 4.1584, + "step": 8935 + }, + { + "epoch": 0.6074194863432532, + "grad_norm": 0.30412939190864563, + "learning_rate": 9.241405082212258e-05, + "loss": 3.948, + "step": 8940 + }, + { + "epoch": 0.6077592064139149, + "grad_norm": 0.20068703591823578, + "learning_rate": 9.240980432123931e-05, + "loss": 4.1206, + "step": 8945 + }, + { + "epoch": 0.6080989264845768, + "grad_norm": 0.25400644540786743, + "learning_rate": 9.240555782035602e-05, + "loss": 4.3133, + "step": 8950 + }, + { + "epoch": 0.6084386465552385, + "grad_norm": 0.19139589369297028, + "learning_rate": 9.240131131947276e-05, + "loss": 3.9386, + "step": 8955 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 0.16936640441417694, + "learning_rate": 9.239706481858949e-05, + "loss": 4.0571, + "step": 8960 + }, + { + "epoch": 0.609118086696562, + "grad_norm": 0.20352940261363983, + "learning_rate": 9.23928183177062e-05, + "loss": 3.7657, + "step": 8965 + }, + { + "epoch": 0.6094578067672238, + "grad_norm": 0.1573779433965683, + "learning_rate": 9.238857181682295e-05, + "loss": 4.1298, + "step": 8970 + }, + { + "epoch": 0.6097975268378856, + "grad_norm": 0.20279887318611145, + "learning_rate": 9.238432531593966e-05, + "loss": 3.9271, + "step": 8975 + }, + { + "epoch": 0.6101372469085473, + "grad_norm": 16.52877426147461, + "learning_rate": 9.23800788150564e-05, + "loss": 3.8319, + "step": 8980 + }, + { + "epoch": 0.6104769669792092, + "grad_norm": 0.17606152594089508, + "learning_rate": 9.237583231417313e-05, + "loss": 3.7453, + "step": 8985 + }, + { + "epoch": 0.6108166870498709, + "grad_norm": 0.23678942024707794, + "learning_rate": 9.237158581328985e-05, + "loss": 3.9744, + "step": 8990 + }, + { + "epoch": 0.6111564071205327, + "grad_norm": 0.21576398611068726, + "learning_rate": 9.236733931240659e-05, + "loss": 4.1691, + "step": 8995 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.2256469577550888, + "learning_rate": 9.236309281152332e-05, + "loss": 3.9419, + "step": 9000 + }, + { + "epoch": 0.6118358472618562, + "grad_norm": 0.23562292754650116, + "learning_rate": 9.235884631064003e-05, + "loss": 4.1049, + "step": 9005 + }, + { + "epoch": 0.612175567332518, + "grad_norm": 0.2894531190395355, + "learning_rate": 9.235459980975677e-05, + "loss": 3.9535, + "step": 9010 + }, + { + "epoch": 0.6125152874031797, + "grad_norm": 0.22967609763145447, + "learning_rate": 9.23503533088735e-05, + "loss": 3.9537, + "step": 9015 + }, + { + "epoch": 0.6128550074738416, + "grad_norm": 1.3449379205703735, + "learning_rate": 9.234610680799021e-05, + "loss": 4.08, + "step": 9020 + }, + { + "epoch": 0.6131947275445033, + "grad_norm": 0.2513696849346161, + "learning_rate": 9.234186030710696e-05, + "loss": 3.8626, + "step": 9025 + }, + { + "epoch": 0.6135344476151651, + "grad_norm": 0.19834820926189423, + "learning_rate": 9.233761380622368e-05, + "loss": 3.9427, + "step": 9030 + }, + { + "epoch": 0.6138741676858269, + "grad_norm": 0.2118111550807953, + "learning_rate": 9.23333673053404e-05, + "loss": 3.8768, + "step": 9035 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 0.15969218313694, + "learning_rate": 9.232912080445714e-05, + "loss": 4.1679, + "step": 9040 + }, + { + "epoch": 0.6145536078271504, + "grad_norm": 0.184014692902565, + "learning_rate": 9.232487430357387e-05, + "loss": 4.1939, + "step": 9045 + }, + { + "epoch": 0.6148933278978121, + "grad_norm": 0.17408449947834015, + "learning_rate": 9.232062780269058e-05, + "loss": 4.0627, + "step": 9050 + }, + { + "epoch": 0.615233047968474, + "grad_norm": 0.3706364929676056, + "learning_rate": 9.231638130180732e-05, + "loss": 4.1244, + "step": 9055 + }, + { + "epoch": 0.6155727680391357, + "grad_norm": 0.2606452703475952, + "learning_rate": 9.231213480092404e-05, + "loss": 4.0637, + "step": 9060 + }, + { + "epoch": 0.6159124881097975, + "grad_norm": 0.2730484902858734, + "learning_rate": 9.230788830004077e-05, + "loss": 4.227, + "step": 9065 + }, + { + "epoch": 0.6162522081804593, + "grad_norm": 0.17146620154380798, + "learning_rate": 9.230364179915751e-05, + "loss": 3.9837, + "step": 9070 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 0.23479421436786652, + "learning_rate": 9.229939529827422e-05, + "loss": 3.9549, + "step": 9075 + }, + { + "epoch": 0.6169316483217828, + "grad_norm": 0.17351080477237701, + "learning_rate": 9.229514879739095e-05, + "loss": 4.0837, + "step": 9080 + }, + { + "epoch": 0.6172713683924447, + "grad_norm": 0.18759216368198395, + "learning_rate": 9.229090229650769e-05, + "loss": 4.1507, + "step": 9085 + }, + { + "epoch": 0.6176110884631064, + "grad_norm": 0.373471736907959, + "learning_rate": 9.22866557956244e-05, + "loss": 3.8382, + "step": 9090 + }, + { + "epoch": 0.6179508085337682, + "grad_norm": 0.16654084622859955, + "learning_rate": 9.228240929474113e-05, + "loss": 3.9211, + "step": 9095 + }, + { + "epoch": 0.6182905286044299, + "grad_norm": 2.6412665843963623, + "learning_rate": 9.227816279385788e-05, + "loss": 4.1437, + "step": 9100 + }, + { + "epoch": 0.6186302486750918, + "grad_norm": 0.22232688963413239, + "learning_rate": 9.227391629297459e-05, + "loss": 4.3322, + "step": 9105 + }, + { + "epoch": 0.6189699687457535, + "grad_norm": 0.19792711734771729, + "learning_rate": 9.226966979209132e-05, + "loss": 3.6801, + "step": 9110 + }, + { + "epoch": 0.6193096888164152, + "grad_norm": 0.19452767074108124, + "learning_rate": 9.226542329120806e-05, + "loss": 4.1374, + "step": 9115 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 0.17766821384429932, + "learning_rate": 9.226117679032477e-05, + "loss": 3.998, + "step": 9120 + }, + { + "epoch": 0.6199891289577388, + "grad_norm": 0.21171297132968903, + "learning_rate": 9.22569302894415e-05, + "loss": 4.0408, + "step": 9125 + }, + { + "epoch": 0.6203288490284006, + "grad_norm": 0.2353495955467224, + "learning_rate": 9.225268378855823e-05, + "loss": 4.2569, + "step": 9130 + }, + { + "epoch": 0.6206685690990623, + "grad_norm": 0.1814018189907074, + "learning_rate": 9.224843728767496e-05, + "loss": 4.1783, + "step": 9135 + }, + { + "epoch": 0.6210082891697242, + "grad_norm": 0.16691021621227264, + "learning_rate": 9.224419078679169e-05, + "loss": 3.95, + "step": 9140 + }, + { + "epoch": 0.6213480092403859, + "grad_norm": 0.24587292969226837, + "learning_rate": 9.223994428590841e-05, + "loss": 4.187, + "step": 9145 + }, + { + "epoch": 0.6216877293110477, + "grad_norm": 0.12846866250038147, + "learning_rate": 9.223569778502514e-05, + "loss": 3.7833, + "step": 9150 + }, + { + "epoch": 0.6220274493817095, + "grad_norm": 0.15816563367843628, + "learning_rate": 9.223145128414187e-05, + "loss": 3.9622, + "step": 9155 + }, + { + "epoch": 0.6223671694523712, + "grad_norm": 0.5960243344306946, + "learning_rate": 9.22272047832586e-05, + "loss": 4.0261, + "step": 9160 + }, + { + "epoch": 0.622706889523033, + "grad_norm": 0.21092690527439117, + "learning_rate": 9.222295828237533e-05, + "loss": 4.0417, + "step": 9165 + }, + { + "epoch": 0.6230466095936948, + "grad_norm": 0.21227766573429108, + "learning_rate": 9.221871178149205e-05, + "loss": 4.0396, + "step": 9170 + }, + { + "epoch": 0.6233863296643566, + "grad_norm": 0.1947152018547058, + "learning_rate": 9.221446528060878e-05, + "loss": 3.9731, + "step": 9175 + }, + { + "epoch": 0.6237260497350183, + "grad_norm": 0.264658123254776, + "learning_rate": 9.221021877972551e-05, + "loss": 3.7214, + "step": 9180 + }, + { + "epoch": 0.6240657698056801, + "grad_norm": 0.14752690494060516, + "learning_rate": 9.220597227884224e-05, + "loss": 4.262, + "step": 9185 + }, + { + "epoch": 0.6244054898763419, + "grad_norm": 0.20559826493263245, + "learning_rate": 9.220172577795897e-05, + "loss": 4.0443, + "step": 9190 + }, + { + "epoch": 0.6247452099470037, + "grad_norm": 0.17190653085708618, + "learning_rate": 9.21974792770757e-05, + "loss": 4.246, + "step": 9195 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 0.20981894433498383, + "learning_rate": 9.219323277619242e-05, + "loss": 4.0482, + "step": 9200 + }, + { + "epoch": 0.6254246500883273, + "grad_norm": 0.20660968124866486, + "learning_rate": 9.218898627530915e-05, + "loss": 3.9868, + "step": 9205 + }, + { + "epoch": 0.625764370158989, + "grad_norm": 0.21324321627616882, + "learning_rate": 9.218473977442588e-05, + "loss": 4.3044, + "step": 9210 + }, + { + "epoch": 0.6261040902296507, + "grad_norm": 0.45024943351745605, + "learning_rate": 9.21804932735426e-05, + "loss": 3.945, + "step": 9215 + }, + { + "epoch": 0.6264438103003126, + "grad_norm": 0.19029852747917175, + "learning_rate": 9.217624677265933e-05, + "loss": 4.2293, + "step": 9220 + }, + { + "epoch": 0.6267835303709743, + "grad_norm": 0.17948591709136963, + "learning_rate": 9.217200027177606e-05, + "loss": 4.4357, + "step": 9225 + }, + { + "epoch": 0.6271232504416361, + "grad_norm": 0.30317583680152893, + "learning_rate": 9.216775377089279e-05, + "loss": 3.9639, + "step": 9230 + }, + { + "epoch": 0.6274629705122978, + "grad_norm": 0.1628941148519516, + "learning_rate": 9.216350727000952e-05, + "loss": 4.0654, + "step": 9235 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 0.20330828428268433, + "learning_rate": 9.215926076912625e-05, + "loss": 4.0975, + "step": 9240 + }, + { + "epoch": 0.6281424106536214, + "grad_norm": 0.19045989215373993, + "learning_rate": 9.215501426824297e-05, + "loss": 4.0935, + "step": 9245 + }, + { + "epoch": 0.6284821307242832, + "grad_norm": 1.2690106630325317, + "learning_rate": 9.21507677673597e-05, + "loss": 3.8794, + "step": 9250 + }, + { + "epoch": 0.628821850794945, + "grad_norm": 0.3011123538017273, + "learning_rate": 9.214652126647643e-05, + "loss": 3.8597, + "step": 9255 + }, + { + "epoch": 0.6291615708656068, + "grad_norm": 0.17079856991767883, + "learning_rate": 9.214227476559314e-05, + "loss": 3.9687, + "step": 9260 + }, + { + "epoch": 0.6295012909362685, + "grad_norm": 0.18377001583576202, + "learning_rate": 9.213802826470989e-05, + "loss": 3.8344, + "step": 9265 + }, + { + "epoch": 0.6298410110069302, + "grad_norm": 0.26580023765563965, + "learning_rate": 9.213378176382661e-05, + "loss": 3.9417, + "step": 9270 + }, + { + "epoch": 0.6301807310775921, + "grad_norm": 0.24509429931640625, + "learning_rate": 9.212953526294333e-05, + "loss": 3.9923, + "step": 9275 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 0.1840851753950119, + "learning_rate": 9.212528876206007e-05, + "loss": 4.1023, + "step": 9280 + }, + { + "epoch": 0.6308601712189156, + "grad_norm": 0.16701363027095795, + "learning_rate": 9.21210422611768e-05, + "loss": 4.1101, + "step": 9285 + }, + { + "epoch": 0.6311998912895774, + "grad_norm": 0.5241246819496155, + "learning_rate": 9.211679576029351e-05, + "loss": 3.5958, + "step": 9290 + }, + { + "epoch": 0.6315396113602392, + "grad_norm": 0.17940564453601837, + "learning_rate": 9.211254925941025e-05, + "loss": 3.9681, + "step": 9295 + }, + { + "epoch": 0.6318793314309009, + "grad_norm": 0.2497144490480423, + "learning_rate": 9.210830275852698e-05, + "loss": 4.0491, + "step": 9300 + }, + { + "epoch": 0.6322190515015628, + "grad_norm": 0.48636677861213684, + "learning_rate": 9.21040562576437e-05, + "loss": 3.9699, + "step": 9305 + }, + { + "epoch": 0.6325587715722245, + "grad_norm": 0.2682591676712036, + "learning_rate": 9.209980975676044e-05, + "loss": 3.8142, + "step": 9310 + }, + { + "epoch": 0.6328984916428863, + "grad_norm": 0.5112836360931396, + "learning_rate": 9.209556325587717e-05, + "loss": 3.8701, + "step": 9315 + }, + { + "epoch": 0.633238211713548, + "grad_norm": 0.37686988711357117, + "learning_rate": 9.20913167549939e-05, + "loss": 3.9299, + "step": 9320 + }, + { + "epoch": 0.6335779317842098, + "grad_norm": 0.3224041759967804, + "learning_rate": 9.208707025411062e-05, + "loss": 4.0601, + "step": 9325 + }, + { + "epoch": 0.6339176518548716, + "grad_norm": 0.241369366645813, + "learning_rate": 9.208282375322734e-05, + "loss": 4.1394, + "step": 9330 + }, + { + "epoch": 0.6342573719255333, + "grad_norm": 0.1902744024991989, + "learning_rate": 9.207857725234408e-05, + "loss": 4.1937, + "step": 9335 + }, + { + "epoch": 0.6345970919961952, + "grad_norm": 0.20103438198566437, + "learning_rate": 9.20743307514608e-05, + "loss": 4.1394, + "step": 9340 + }, + { + "epoch": 0.6349368120668569, + "grad_norm": 0.17396235466003418, + "learning_rate": 9.207008425057752e-05, + "loss": 4.0629, + "step": 9345 + }, + { + "epoch": 0.6352765321375187, + "grad_norm": 0.1853020042181015, + "learning_rate": 9.206583774969426e-05, + "loss": 4.0948, + "step": 9350 + }, + { + "epoch": 0.6356162522081804, + "grad_norm": 0.15830761194229126, + "learning_rate": 9.206159124881099e-05, + "loss": 4.0189, + "step": 9355 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 0.18493899703025818, + "learning_rate": 9.20573447479277e-05, + "loss": 3.9398, + "step": 9360 + }, + { + "epoch": 0.636295692349504, + "grad_norm": 0.19994007050991058, + "learning_rate": 9.205309824704445e-05, + "loss": 4.1339, + "step": 9365 + }, + { + "epoch": 0.6366354124201657, + "grad_norm": 0.16409240663051605, + "learning_rate": 9.204885174616117e-05, + "loss": 4.3045, + "step": 9370 + }, + { + "epoch": 0.6369751324908276, + "grad_norm": 0.34056273102760315, + "learning_rate": 9.204460524527789e-05, + "loss": 3.9817, + "step": 9375 + }, + { + "epoch": 0.6373148525614893, + "grad_norm": 0.19899314641952515, + "learning_rate": 9.204035874439463e-05, + "loss": 3.8643, + "step": 9380 + }, + { + "epoch": 0.6376545726321511, + "grad_norm": 0.2605150043964386, + "learning_rate": 9.203611224351136e-05, + "loss": 4.0556, + "step": 9385 + }, + { + "epoch": 0.6379942927028129, + "grad_norm": 0.23088324069976807, + "learning_rate": 9.203186574262807e-05, + "loss": 3.9551, + "step": 9390 + }, + { + "epoch": 0.6383340127734747, + "grad_norm": 0.181942418217659, + "learning_rate": 9.202761924174481e-05, + "loss": 4.0324, + "step": 9395 + }, + { + "epoch": 0.6386737328441364, + "grad_norm": 0.1608920693397522, + "learning_rate": 9.202337274086154e-05, + "loss": 3.8843, + "step": 9400 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 0.23045985400676727, + "learning_rate": 9.201912623997826e-05, + "loss": 3.8905, + "step": 9405 + }, + { + "epoch": 0.63935317298546, + "grad_norm": 0.20380303263664246, + "learning_rate": 9.2014879739095e-05, + "loss": 3.988, + "step": 9410 + }, + { + "epoch": 0.6396928930561218, + "grad_norm": 0.6223300695419312, + "learning_rate": 9.201063323821171e-05, + "loss": 3.9791, + "step": 9415 + }, + { + "epoch": 0.6400326131267835, + "grad_norm": 0.8067218065261841, + "learning_rate": 9.200638673732844e-05, + "loss": 4.1077, + "step": 9420 + }, + { + "epoch": 0.6403723331974454, + "grad_norm": 0.20715036988258362, + "learning_rate": 9.200214023644518e-05, + "loss": 4.138, + "step": 9425 + }, + { + "epoch": 0.6407120532681071, + "grad_norm": 0.20288419723510742, + "learning_rate": 9.19978937355619e-05, + "loss": 3.9553, + "step": 9430 + }, + { + "epoch": 0.6410517733387688, + "grad_norm": 0.18636272847652435, + "learning_rate": 9.199364723467862e-05, + "loss": 3.9276, + "step": 9435 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 0.18782015144824982, + "learning_rate": 9.198940073379537e-05, + "loss": 4.0336, + "step": 9440 + }, + { + "epoch": 0.6417312134800924, + "grad_norm": 0.19050072133541107, + "learning_rate": 9.198515423291208e-05, + "loss": 4.0227, + "step": 9445 + }, + { + "epoch": 0.6420709335507542, + "grad_norm": 0.30288147926330566, + "learning_rate": 9.198090773202881e-05, + "loss": 3.8769, + "step": 9450 + }, + { + "epoch": 0.6424106536214159, + "grad_norm": 0.22393420338630676, + "learning_rate": 9.197666123114555e-05, + "loss": 3.9509, + "step": 9455 + }, + { + "epoch": 0.6427503736920778, + "grad_norm": 0.21114152669906616, + "learning_rate": 9.197241473026226e-05, + "loss": 4.1008, + "step": 9460 + }, + { + "epoch": 0.6430900937627395, + "grad_norm": 0.238412544131279, + "learning_rate": 9.196816822937899e-05, + "loss": 3.8772, + "step": 9465 + }, + { + "epoch": 0.6434298138334013, + "grad_norm": 0.6189041137695312, + "learning_rate": 9.196392172849573e-05, + "loss": 3.9286, + "step": 9470 + }, + { + "epoch": 0.6437695339040631, + "grad_norm": 0.26222628355026245, + "learning_rate": 9.195967522761245e-05, + "loss": 4.1328, + "step": 9475 + }, + { + "epoch": 0.6441092539747248, + "grad_norm": 0.21858513355255127, + "learning_rate": 9.195542872672918e-05, + "loss": 3.9626, + "step": 9480 + }, + { + "epoch": 0.6444489740453866, + "grad_norm": 0.21814042329788208, + "learning_rate": 9.19511822258459e-05, + "loss": 4.3128, + "step": 9485 + }, + { + "epoch": 0.6447886941160483, + "grad_norm": 0.19709447026252747, + "learning_rate": 9.194693572496263e-05, + "loss": 3.9969, + "step": 9490 + }, + { + "epoch": 0.6451284141867102, + "grad_norm": 0.172958642244339, + "learning_rate": 9.194268922407936e-05, + "loss": 4.0359, + "step": 9495 + }, + { + "epoch": 0.6454681342573719, + "grad_norm": 0.17147018015384674, + "learning_rate": 9.193844272319609e-05, + "loss": 3.8998, + "step": 9500 + }, + { + "epoch": 0.6458078543280337, + "grad_norm": 0.16436798870563507, + "learning_rate": 9.193419622231282e-05, + "loss": 3.8929, + "step": 9505 + }, + { + "epoch": 0.6461475743986955, + "grad_norm": 0.17490500211715698, + "learning_rate": 9.192994972142954e-05, + "loss": 3.9468, + "step": 9510 + }, + { + "epoch": 0.6464872944693573, + "grad_norm": 0.5229065418243408, + "learning_rate": 9.192570322054627e-05, + "loss": 4.1358, + "step": 9515 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 0.1972028911113739, + "learning_rate": 9.1921456719663e-05, + "loss": 4.1462, + "step": 9520 + }, + { + "epoch": 0.6471667346106807, + "grad_norm": 0.18450862169265747, + "learning_rate": 9.191721021877973e-05, + "loss": 4.0512, + "step": 9525 + }, + { + "epoch": 0.6475064546813426, + "grad_norm": 0.24547728896141052, + "learning_rate": 9.191296371789646e-05, + "loss": 4.0998, + "step": 9530 + }, + { + "epoch": 0.6478461747520043, + "grad_norm": 0.19123290479183197, + "learning_rate": 9.190871721701318e-05, + "loss": 3.8672, + "step": 9535 + }, + { + "epoch": 0.6481858948226661, + "grad_norm": 0.22196923196315765, + "learning_rate": 9.190447071612991e-05, + "loss": 4.1529, + "step": 9540 + }, + { + "epoch": 0.6485256148933279, + "grad_norm": 0.21963347494602203, + "learning_rate": 9.190022421524664e-05, + "loss": 4.1048, + "step": 9545 + }, + { + "epoch": 0.6488653349639897, + "grad_norm": 0.19842089712619781, + "learning_rate": 9.189597771436337e-05, + "loss": 4.0297, + "step": 9550 + }, + { + "epoch": 0.6492050550346514, + "grad_norm": 0.2944156527519226, + "learning_rate": 9.18917312134801e-05, + "loss": 4.0261, + "step": 9555 + }, + { + "epoch": 0.6495447751053133, + "grad_norm": 0.1940944492816925, + "learning_rate": 9.188748471259682e-05, + "loss": 3.9962, + "step": 9560 + }, + { + "epoch": 0.649884495175975, + "grad_norm": 0.1459953486919403, + "learning_rate": 9.188323821171355e-05, + "loss": 4.0297, + "step": 9565 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 0.18244166672229767, + "learning_rate": 9.187899171083028e-05, + "loss": 3.9038, + "step": 9570 + }, + { + "epoch": 0.6505639353172985, + "grad_norm": 0.5632447004318237, + "learning_rate": 9.187474520994701e-05, + "loss": 3.8968, + "step": 9575 + }, + { + "epoch": 0.6509036553879604, + "grad_norm": 0.21160653233528137, + "learning_rate": 9.187049870906374e-05, + "loss": 4.0296, + "step": 9580 + }, + { + "epoch": 0.6512433754586221, + "grad_norm": 0.5150845646858215, + "learning_rate": 9.186625220818046e-05, + "loss": 3.9448, + "step": 9585 + }, + { + "epoch": 0.6515830955292838, + "grad_norm": 0.16149458289146423, + "learning_rate": 9.186200570729719e-05, + "loss": 3.9315, + "step": 9590 + }, + { + "epoch": 0.6519228155999457, + "grad_norm": 0.19759464263916016, + "learning_rate": 9.185775920641392e-05, + "loss": 3.9934, + "step": 9595 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 0.4384717047214508, + "learning_rate": 9.185351270553065e-05, + "loss": 3.932, + "step": 9600 + }, + { + "epoch": 0.6526022557412692, + "grad_norm": 0.17127420008182526, + "learning_rate": 9.184926620464738e-05, + "loss": 4.038, + "step": 9605 + }, + { + "epoch": 0.6529419758119309, + "grad_norm": 0.1805383861064911, + "learning_rate": 9.18450197037641e-05, + "loss": 4.077, + "step": 9610 + }, + { + "epoch": 0.6532816958825928, + "grad_norm": 0.17218123376369476, + "learning_rate": 9.184077320288082e-05, + "loss": 4.0027, + "step": 9615 + }, + { + "epoch": 0.6536214159532545, + "grad_norm": 0.21027123928070068, + "learning_rate": 9.183652670199756e-05, + "loss": 4.004, + "step": 9620 + }, + { + "epoch": 0.6539611360239163, + "grad_norm": 0.18752577900886536, + "learning_rate": 9.183228020111429e-05, + "loss": 4.1132, + "step": 9625 + }, + { + "epoch": 0.6543008560945781, + "grad_norm": 0.2274181991815567, + "learning_rate": 9.1828033700231e-05, + "loss": 4.0189, + "step": 9630 + }, + { + "epoch": 0.6546405761652399, + "grad_norm": 0.5568689107894897, + "learning_rate": 9.182378719934774e-05, + "loss": 3.9176, + "step": 9635 + }, + { + "epoch": 0.6549802962359016, + "grad_norm": 0.20141251385211945, + "learning_rate": 9.181954069846447e-05, + "loss": 4.1433, + "step": 9640 + }, + { + "epoch": 0.6553200163065634, + "grad_norm": 0.1734774261713028, + "learning_rate": 9.181529419758119e-05, + "loss": 3.8714, + "step": 9645 + }, + { + "epoch": 0.6556597363772252, + "grad_norm": 0.20372764766216278, + "learning_rate": 9.181104769669793e-05, + "loss": 3.6352, + "step": 9650 + }, + { + "epoch": 0.6559994564478869, + "grad_norm": 0.22679565846920013, + "learning_rate": 9.180680119581466e-05, + "loss": 4.0213, + "step": 9655 + }, + { + "epoch": 0.6563391765185487, + "grad_norm": 0.19201093912124634, + "learning_rate": 9.180255469493138e-05, + "loss": 4.3365, + "step": 9660 + }, + { + "epoch": 0.6566788965892105, + "grad_norm": 0.23552648723125458, + "learning_rate": 9.179830819404811e-05, + "loss": 3.827, + "step": 9665 + }, + { + "epoch": 0.6570186166598723, + "grad_norm": 0.2363472580909729, + "learning_rate": 9.179406169316484e-05, + "loss": 3.5363, + "step": 9670 + }, + { + "epoch": 0.657358336730534, + "grad_norm": 0.1936025321483612, + "learning_rate": 9.178981519228157e-05, + "loss": 3.6846, + "step": 9675 + }, + { + "epoch": 0.6576980568011959, + "grad_norm": 0.16396436095237732, + "learning_rate": 9.17855686913983e-05, + "loss": 4.2385, + "step": 9680 + }, + { + "epoch": 0.6580377768718576, + "grad_norm": 0.19219662249088287, + "learning_rate": 9.178132219051501e-05, + "loss": 3.8246, + "step": 9685 + }, + { + "epoch": 0.6583774969425193, + "grad_norm": 0.3243776857852936, + "learning_rate": 9.177707568963175e-05, + "loss": 3.9252, + "step": 9690 + }, + { + "epoch": 0.6587172170131811, + "grad_norm": 0.2118336260318756, + "learning_rate": 9.177282918874848e-05, + "loss": 3.9518, + "step": 9695 + }, + { + "epoch": 0.6590569370838429, + "grad_norm": 0.2181885987520218, + "learning_rate": 9.17685826878652e-05, + "loss": 3.8817, + "step": 9700 + }, + { + "epoch": 0.6593966571545047, + "grad_norm": 0.2063203752040863, + "learning_rate": 9.176433618698194e-05, + "loss": 4.0715, + "step": 9705 + }, + { + "epoch": 0.6597363772251664, + "grad_norm": 0.16221602261066437, + "learning_rate": 9.176008968609867e-05, + "loss": 3.8972, + "step": 9710 + }, + { + "epoch": 0.6600760972958283, + "grad_norm": 0.2043311893939972, + "learning_rate": 9.175584318521538e-05, + "loss": 3.9982, + "step": 9715 + }, + { + "epoch": 0.66041581736649, + "grad_norm": 0.15358102321624756, + "learning_rate": 9.175159668433212e-05, + "loss": 4.4382, + "step": 9720 + }, + { + "epoch": 0.6607555374371518, + "grad_norm": 0.1804971694946289, + "learning_rate": 9.174735018344885e-05, + "loss": 4.0575, + "step": 9725 + }, + { + "epoch": 0.6610952575078136, + "grad_norm": 0.8969916701316833, + "learning_rate": 9.174310368256556e-05, + "loss": 3.8002, + "step": 9730 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 0.17219315469264984, + "learning_rate": 9.17388571816823e-05, + "loss": 3.8753, + "step": 9735 + }, + { + "epoch": 0.6617746976491371, + "grad_norm": 1.4561220407485962, + "learning_rate": 9.173461068079903e-05, + "loss": 3.8808, + "step": 9740 + }, + { + "epoch": 0.6621144177197988, + "grad_norm": 0.3004949688911438, + "learning_rate": 9.173036417991575e-05, + "loss": 4.01, + "step": 9745 + }, + { + "epoch": 0.6624541377904607, + "grad_norm": 0.4193626642227173, + "learning_rate": 9.172611767903249e-05, + "loss": 4.1539, + "step": 9750 + }, + { + "epoch": 0.6627938578611224, + "grad_norm": 0.5232547521591187, + "learning_rate": 9.17218711781492e-05, + "loss": 4.1406, + "step": 9755 + }, + { + "epoch": 0.6631335779317842, + "grad_norm": 0.2067098319530487, + "learning_rate": 9.171762467726593e-05, + "loss": 3.9981, + "step": 9760 + }, + { + "epoch": 0.663473298002446, + "grad_norm": 0.2100781351327896, + "learning_rate": 9.171337817638267e-05, + "loss": 3.7371, + "step": 9765 + }, + { + "epoch": 0.6638130180731078, + "grad_norm": 0.28894108533859253, + "learning_rate": 9.170913167549939e-05, + "loss": 4.1151, + "step": 9770 + }, + { + "epoch": 0.6641527381437695, + "grad_norm": 0.2366105318069458, + "learning_rate": 9.170488517461612e-05, + "loss": 3.7963, + "step": 9775 + }, + { + "epoch": 0.6644924582144313, + "grad_norm": 0.15720832347869873, + "learning_rate": 9.170063867373286e-05, + "loss": 4.1177, + "step": 9780 + }, + { + "epoch": 0.6648321782850931, + "grad_norm": 0.16979362070560455, + "learning_rate": 9.169639217284957e-05, + "loss": 4.1974, + "step": 9785 + }, + { + "epoch": 0.6651718983557549, + "grad_norm": 0.1718185693025589, + "learning_rate": 9.16921456719663e-05, + "loss": 3.8146, + "step": 9790 + }, + { + "epoch": 0.6655116184264166, + "grad_norm": 0.1998133510351181, + "learning_rate": 9.168789917108304e-05, + "loss": 3.8062, + "step": 9795 + }, + { + "epoch": 0.6658513384970784, + "grad_norm": 0.19915224611759186, + "learning_rate": 9.168365267019976e-05, + "loss": 3.8217, + "step": 9800 + }, + { + "epoch": 0.6661910585677402, + "grad_norm": 0.19672515988349915, + "learning_rate": 9.167940616931648e-05, + "loss": 3.9699, + "step": 9805 + }, + { + "epoch": 0.6665307786384019, + "grad_norm": 0.18894067406654358, + "learning_rate": 9.167515966843323e-05, + "loss": 3.9818, + "step": 9810 + }, + { + "epoch": 0.6668704987090638, + "grad_norm": 0.2761344611644745, + "learning_rate": 9.167091316754994e-05, + "loss": 4.0251, + "step": 9815 + }, + { + "epoch": 0.6672102187797255, + "grad_norm": 0.20272742211818695, + "learning_rate": 9.166666666666667e-05, + "loss": 4.1925, + "step": 9820 + }, + { + "epoch": 0.6675499388503873, + "grad_norm": 1.1285771131515503, + "learning_rate": 9.166242016578341e-05, + "loss": 4.1825, + "step": 9825 + }, + { + "epoch": 0.667889658921049, + "grad_norm": 0.1726224571466446, + "learning_rate": 9.165817366490012e-05, + "loss": 3.9708, + "step": 9830 + }, + { + "epoch": 0.6682293789917109, + "grad_norm": 0.1987651288509369, + "learning_rate": 9.165392716401685e-05, + "loss": 4.1101, + "step": 9835 + }, + { + "epoch": 0.6685690990623726, + "grad_norm": 0.19084063172340393, + "learning_rate": 9.164968066313358e-05, + "loss": 3.9633, + "step": 9840 + }, + { + "epoch": 0.6689088191330343, + "grad_norm": 0.20953474938869476, + "learning_rate": 9.164543416225031e-05, + "loss": 4.0714, + "step": 9845 + }, + { + "epoch": 0.6692485392036962, + "grad_norm": 0.1689091920852661, + "learning_rate": 9.164118766136704e-05, + "loss": 4.0451, + "step": 9850 + }, + { + "epoch": 0.6695882592743579, + "grad_norm": 0.38470450043678284, + "learning_rate": 9.163694116048376e-05, + "loss": 4.2373, + "step": 9855 + }, + { + "epoch": 0.6699279793450197, + "grad_norm": 0.18749617040157318, + "learning_rate": 9.163269465960049e-05, + "loss": 4.0489, + "step": 9860 + }, + { + "epoch": 0.6702676994156814, + "grad_norm": 0.1558235138654709, + "learning_rate": 9.162844815871722e-05, + "loss": 4.0883, + "step": 9865 + }, + { + "epoch": 0.6706074194863433, + "grad_norm": 0.17656366527080536, + "learning_rate": 9.162420165783395e-05, + "loss": 3.8735, + "step": 9870 + }, + { + "epoch": 0.670947139557005, + "grad_norm": 0.16513271629810333, + "learning_rate": 9.161995515695068e-05, + "loss": 4.3651, + "step": 9875 + }, + { + "epoch": 0.6712868596276668, + "grad_norm": 0.369552880525589, + "learning_rate": 9.16157086560674e-05, + "loss": 4.1642, + "step": 9880 + }, + { + "epoch": 0.6716265796983286, + "grad_norm": 0.19037148356437683, + "learning_rate": 9.161146215518413e-05, + "loss": 4.0021, + "step": 9885 + }, + { + "epoch": 0.6719662997689904, + "grad_norm": 0.1722508817911148, + "learning_rate": 9.160721565430086e-05, + "loss": 3.8246, + "step": 9890 + }, + { + "epoch": 0.6723060198396521, + "grad_norm": 0.16816957294940948, + "learning_rate": 9.160296915341759e-05, + "loss": 3.7951, + "step": 9895 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 0.18479801714420319, + "learning_rate": 9.159872265253432e-05, + "loss": 4.0533, + "step": 9900 + }, + { + "epoch": 0.6729854599809757, + "grad_norm": 0.1568441092967987, + "learning_rate": 9.159447615165104e-05, + "loss": 3.7568, + "step": 9905 + }, + { + "epoch": 0.6733251800516374, + "grad_norm": 0.15381227433681488, + "learning_rate": 9.159022965076777e-05, + "loss": 4.0221, + "step": 9910 + }, + { + "epoch": 0.6736649001222992, + "grad_norm": 0.22633537650108337, + "learning_rate": 9.15859831498845e-05, + "loss": 4.0213, + "step": 9915 + }, + { + "epoch": 0.674004620192961, + "grad_norm": 0.22000354528427124, + "learning_rate": 9.158173664900123e-05, + "loss": 3.9787, + "step": 9920 + }, + { + "epoch": 0.6743443402636228, + "grad_norm": 0.18316060304641724, + "learning_rate": 9.157749014811796e-05, + "loss": 3.939, + "step": 9925 + }, + { + "epoch": 0.6746840603342845, + "grad_norm": 0.3888174891471863, + "learning_rate": 9.157324364723468e-05, + "loss": 3.9585, + "step": 9930 + }, + { + "epoch": 0.6750237804049464, + "grad_norm": 0.24031803011894226, + "learning_rate": 9.156899714635141e-05, + "loss": 3.9616, + "step": 9935 + }, + { + "epoch": 0.6753635004756081, + "grad_norm": 0.22554604709148407, + "learning_rate": 9.156475064546814e-05, + "loss": 4.13, + "step": 9940 + }, + { + "epoch": 0.6757032205462699, + "grad_norm": 0.15670569241046906, + "learning_rate": 9.156050414458487e-05, + "loss": 4.0036, + "step": 9945 + }, + { + "epoch": 0.6760429406169316, + "grad_norm": 0.18224500119686127, + "learning_rate": 9.15562576437016e-05, + "loss": 4.1177, + "step": 9950 + }, + { + "epoch": 0.6763826606875935, + "grad_norm": 0.46049654483795166, + "learning_rate": 9.155201114281831e-05, + "loss": 3.8908, + "step": 9955 + }, + { + "epoch": 0.6767223807582552, + "grad_norm": 0.1925237625837326, + "learning_rate": 9.154776464193505e-05, + "loss": 4.0503, + "step": 9960 + }, + { + "epoch": 0.6770621008289169, + "grad_norm": 0.2237987518310547, + "learning_rate": 9.154351814105178e-05, + "loss": 4.0872, + "step": 9965 + }, + { + "epoch": 0.6774018208995788, + "grad_norm": 0.5446917414665222, + "learning_rate": 9.15392716401685e-05, + "loss": 3.9465, + "step": 9970 + }, + { + "epoch": 0.6777415409702405, + "grad_norm": 0.19259484112262726, + "learning_rate": 9.153502513928524e-05, + "loss": 4.0308, + "step": 9975 + }, + { + "epoch": 0.6780812610409023, + "grad_norm": 0.16928361356258392, + "learning_rate": 9.153077863840196e-05, + "loss": 3.8983, + "step": 9980 + }, + { + "epoch": 0.6784209811115641, + "grad_norm": 0.20402824878692627, + "learning_rate": 9.152653213751868e-05, + "loss": 4.0315, + "step": 9985 + }, + { + "epoch": 0.6787607011822259, + "grad_norm": 0.18891318142414093, + "learning_rate": 9.152228563663542e-05, + "loss": 3.9693, + "step": 9990 + }, + { + "epoch": 0.6791004212528876, + "grad_norm": 0.21101170778274536, + "learning_rate": 9.151803913575215e-05, + "loss": 3.8452, + "step": 9995 + }, + { + "epoch": 0.6794401413235494, + "grad_norm": 0.20866774022579193, + "learning_rate": 9.151379263486888e-05, + "loss": 3.7796, + "step": 10000 + }, + { + "epoch": 0.6797798613942112, + "grad_norm": 0.22108693420886993, + "learning_rate": 9.15095461339856e-05, + "loss": 4.1394, + "step": 10005 + }, + { + "epoch": 0.680119581464873, + "grad_norm": 0.18283884227275848, + "learning_rate": 9.150529963310233e-05, + "loss": 4.0837, + "step": 10010 + }, + { + "epoch": 0.6804593015355347, + "grad_norm": 0.3496660888195038, + "learning_rate": 9.150105313221906e-05, + "loss": 3.9772, + "step": 10015 + }, + { + "epoch": 0.6807990216061965, + "grad_norm": 0.21944838762283325, + "learning_rate": 9.149680663133579e-05, + "loss": 3.9497, + "step": 10020 + }, + { + "epoch": 0.6811387416768583, + "grad_norm": 0.9018047451972961, + "learning_rate": 9.149256013045252e-05, + "loss": 4.1269, + "step": 10025 + }, + { + "epoch": 0.68147846174752, + "grad_norm": 0.1882868856191635, + "learning_rate": 9.148831362956924e-05, + "loss": 3.9114, + "step": 10030 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.9261885285377502, + "learning_rate": 9.148406712868597e-05, + "loss": 4.1938, + "step": 10035 + }, + { + "epoch": 0.6821579018888436, + "grad_norm": 0.20487643778324127, + "learning_rate": 9.147982062780269e-05, + "loss": 4.029, + "step": 10040 + }, + { + "epoch": 0.6824976219595054, + "grad_norm": 0.20628653466701508, + "learning_rate": 9.147557412691943e-05, + "loss": 3.9453, + "step": 10045 + }, + { + "epoch": 0.6828373420301671, + "grad_norm": 0.18048962950706482, + "learning_rate": 9.147132762603616e-05, + "loss": 3.7238, + "step": 10050 + }, + { + "epoch": 0.683177062100829, + "grad_norm": 0.20143994688987732, + "learning_rate": 9.146708112515287e-05, + "loss": 4.0817, + "step": 10055 + }, + { + "epoch": 0.6835167821714907, + "grad_norm": 0.1772017776966095, + "learning_rate": 9.146283462426961e-05, + "loss": 4.3308, + "step": 10060 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 0.17534330487251282, + "learning_rate": 9.145858812338634e-05, + "loss": 3.9769, + "step": 10065 + }, + { + "epoch": 0.6841962223128143, + "grad_norm": 0.3508715331554413, + "learning_rate": 9.145434162250305e-05, + "loss": 4.1162, + "step": 10070 + }, + { + "epoch": 0.684535942383476, + "grad_norm": 0.22710181772708893, + "learning_rate": 9.14500951216198e-05, + "loss": 3.7313, + "step": 10075 + }, + { + "epoch": 0.6848756624541378, + "grad_norm": 0.1621914505958557, + "learning_rate": 9.144584862073652e-05, + "loss": 4.0611, + "step": 10080 + }, + { + "epoch": 0.6852153825247995, + "grad_norm": 0.26334255933761597, + "learning_rate": 9.144160211985324e-05, + "loss": 4.0556, + "step": 10085 + }, + { + "epoch": 0.6855551025954614, + "grad_norm": 0.6280850768089294, + "learning_rate": 9.143735561896998e-05, + "loss": 4.0588, + "step": 10090 + }, + { + "epoch": 0.6858948226661231, + "grad_norm": 0.3040946424007416, + "learning_rate": 9.143310911808671e-05, + "loss": 3.9212, + "step": 10095 + }, + { + "epoch": 0.6862345427367849, + "grad_norm": 0.30600547790527344, + "learning_rate": 9.142886261720342e-05, + "loss": 4.0689, + "step": 10100 + }, + { + "epoch": 0.6865742628074467, + "grad_norm": 0.18050602078437805, + "learning_rate": 9.142461611632016e-05, + "loss": 3.9464, + "step": 10105 + }, + { + "epoch": 0.6869139828781085, + "grad_norm": 0.48490941524505615, + "learning_rate": 9.142036961543688e-05, + "loss": 4.025, + "step": 10110 + }, + { + "epoch": 0.6872537029487702, + "grad_norm": 0.24022653698921204, + "learning_rate": 9.14161231145536e-05, + "loss": 4.0635, + "step": 10115 + }, + { + "epoch": 0.6875934230194319, + "grad_norm": 0.5546215772628784, + "learning_rate": 9.141187661367035e-05, + "loss": 3.9159, + "step": 10120 + }, + { + "epoch": 0.6879331430900938, + "grad_norm": 0.1747526228427887, + "learning_rate": 9.140763011278706e-05, + "loss": 4.0271, + "step": 10125 + }, + { + "epoch": 0.6882728631607555, + "grad_norm": 0.8281142115592957, + "learning_rate": 9.140338361190379e-05, + "loss": 3.8248, + "step": 10130 + }, + { + "epoch": 0.6886125832314173, + "grad_norm": 0.19988751411437988, + "learning_rate": 9.139913711102053e-05, + "loss": 4.022, + "step": 10135 + }, + { + "epoch": 0.6889523033020791, + "grad_norm": 0.5465024709701538, + "learning_rate": 9.139489061013725e-05, + "loss": 3.9433, + "step": 10140 + }, + { + "epoch": 0.6892920233727409, + "grad_norm": 0.704890787601471, + "learning_rate": 9.139064410925397e-05, + "loss": 4.2021, + "step": 10145 + }, + { + "epoch": 0.6896317434434026, + "grad_norm": 0.1683177798986435, + "learning_rate": 9.138639760837072e-05, + "loss": 4.0005, + "step": 10150 + }, + { + "epoch": 0.6899714635140645, + "grad_norm": 0.2499643862247467, + "learning_rate": 9.138215110748743e-05, + "loss": 4.0452, + "step": 10155 + }, + { + "epoch": 0.6903111835847262, + "grad_norm": 0.20261327922344208, + "learning_rate": 9.137790460660416e-05, + "loss": 3.9332, + "step": 10160 + }, + { + "epoch": 0.690650903655388, + "grad_norm": 0.19001875817775726, + "learning_rate": 9.13736581057209e-05, + "loss": 3.9008, + "step": 10165 + }, + { + "epoch": 0.6909906237260497, + "grad_norm": 0.18063665926456451, + "learning_rate": 9.136941160483761e-05, + "loss": 3.8843, + "step": 10170 + }, + { + "epoch": 0.6913303437967115, + "grad_norm": 0.1968679428100586, + "learning_rate": 9.136516510395434e-05, + "loss": 4.0298, + "step": 10175 + }, + { + "epoch": 0.6916700638673733, + "grad_norm": 0.21212510764598846, + "learning_rate": 9.136091860307107e-05, + "loss": 4.1819, + "step": 10180 + }, + { + "epoch": 0.692009783938035, + "grad_norm": 0.2317853420972824, + "learning_rate": 9.13566721021878e-05, + "loss": 3.8627, + "step": 10185 + }, + { + "epoch": 0.6923495040086969, + "grad_norm": 0.17582780122756958, + "learning_rate": 9.135242560130453e-05, + "loss": 4.044, + "step": 10190 + }, + { + "epoch": 0.6926892240793586, + "grad_norm": 0.1663304716348648, + "learning_rate": 9.134817910042125e-05, + "loss": 3.9946, + "step": 10195 + }, + { + "epoch": 0.6930289441500204, + "grad_norm": 0.2554979920387268, + "learning_rate": 9.134393259953798e-05, + "loss": 3.9845, + "step": 10200 + }, + { + "epoch": 0.6933686642206821, + "grad_norm": 0.19808262586593628, + "learning_rate": 9.133968609865471e-05, + "loss": 3.7411, + "step": 10205 + }, + { + "epoch": 0.693708384291344, + "grad_norm": 0.16207602620124817, + "learning_rate": 9.133543959777144e-05, + "loss": 3.9169, + "step": 10210 + }, + { + "epoch": 0.6940481043620057, + "grad_norm": 0.19255559146404266, + "learning_rate": 9.133119309688817e-05, + "loss": 4.0015, + "step": 10215 + }, + { + "epoch": 0.6943878244326674, + "grad_norm": 0.40374597907066345, + "learning_rate": 9.13269465960049e-05, + "loss": 4.0568, + "step": 10220 + }, + { + "epoch": 0.6947275445033293, + "grad_norm": 0.5542440414428711, + "learning_rate": 9.132270009512162e-05, + "loss": 4.0383, + "step": 10225 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 0.2855238914489746, + "learning_rate": 9.131845359423835e-05, + "loss": 3.957, + "step": 10230 + }, + { + "epoch": 0.6954069846446528, + "grad_norm": 0.30983904004096985, + "learning_rate": 9.131420709335508e-05, + "loss": 3.9347, + "step": 10235 + }, + { + "epoch": 0.6957467047153146, + "grad_norm": 0.1820448637008667, + "learning_rate": 9.13099605924718e-05, + "loss": 4.0424, + "step": 10240 + }, + { + "epoch": 0.6960864247859764, + "grad_norm": 0.1688624769449234, + "learning_rate": 9.130571409158853e-05, + "loss": 4.1492, + "step": 10245 + }, + { + "epoch": 0.6964261448566381, + "grad_norm": 0.18944065272808075, + "learning_rate": 9.130146759070526e-05, + "loss": 4.0306, + "step": 10250 + }, + { + "epoch": 0.6967658649272999, + "grad_norm": 0.23642663657665253, + "learning_rate": 9.129722108982199e-05, + "loss": 3.9713, + "step": 10255 + }, + { + "epoch": 0.6971055849979617, + "grad_norm": 0.30841895937919617, + "learning_rate": 9.129297458893872e-05, + "loss": 4.0823, + "step": 10260 + }, + { + "epoch": 0.6974453050686235, + "grad_norm": 0.20149889588356018, + "learning_rate": 9.128872808805545e-05, + "loss": 3.9524, + "step": 10265 + }, + { + "epoch": 0.6977850251392852, + "grad_norm": 0.18833234906196594, + "learning_rate": 9.128448158717217e-05, + "loss": 4.1513, + "step": 10270 + }, + { + "epoch": 0.698124745209947, + "grad_norm": 0.1727929264307022, + "learning_rate": 9.12802350862889e-05, + "loss": 3.928, + "step": 10275 + }, + { + "epoch": 0.6984644652806088, + "grad_norm": 0.19599127769470215, + "learning_rate": 9.127598858540563e-05, + "loss": 3.9861, + "step": 10280 + }, + { + "epoch": 0.6988041853512705, + "grad_norm": 0.19696658849716187, + "learning_rate": 9.127174208452236e-05, + "loss": 3.9878, + "step": 10285 + }, + { + "epoch": 0.6991439054219323, + "grad_norm": 0.24747943878173828, + "learning_rate": 9.126749558363909e-05, + "loss": 3.9343, + "step": 10290 + }, + { + "epoch": 0.6994836254925941, + "grad_norm": 0.23848794400691986, + "learning_rate": 9.126324908275581e-05, + "loss": 3.7876, + "step": 10295 + }, + { + "epoch": 0.6998233455632559, + "grad_norm": 0.18184241652488708, + "learning_rate": 9.125900258187254e-05, + "loss": 4.003, + "step": 10300 + }, + { + "epoch": 0.7001630656339176, + "grad_norm": 0.21852192282676697, + "learning_rate": 9.125475608098927e-05, + "loss": 3.9115, + "step": 10305 + }, + { + "epoch": 0.7005027857045795, + "grad_norm": 0.18718090653419495, + "learning_rate": 9.125050958010599e-05, + "loss": 3.8846, + "step": 10310 + }, + { + "epoch": 0.7008425057752412, + "grad_norm": 0.4584771394729614, + "learning_rate": 9.124626307922273e-05, + "loss": 3.9954, + "step": 10315 + }, + { + "epoch": 0.701182225845903, + "grad_norm": 0.19998236000537872, + "learning_rate": 9.124201657833945e-05, + "loss": 3.9594, + "step": 10320 + }, + { + "epoch": 0.7015219459165648, + "grad_norm": 0.1826915442943573, + "learning_rate": 9.123777007745617e-05, + "loss": 3.8949, + "step": 10325 + }, + { + "epoch": 0.7018616659872265, + "grad_norm": 0.31630200147628784, + "learning_rate": 9.123352357657291e-05, + "loss": 4.0863, + "step": 10330 + }, + { + "epoch": 0.7022013860578883, + "grad_norm": 0.22190065681934357, + "learning_rate": 9.122927707568964e-05, + "loss": 3.9006, + "step": 10335 + }, + { + "epoch": 0.70254110612855, + "grad_norm": 0.14430834352970123, + "learning_rate": 9.122503057480637e-05, + "loss": 3.7831, + "step": 10340 + }, + { + "epoch": 0.7028808261992119, + "grad_norm": 0.23812465369701385, + "learning_rate": 9.12207840739231e-05, + "loss": 4.07, + "step": 10345 + }, + { + "epoch": 0.7032205462698736, + "grad_norm": 0.26677510142326355, + "learning_rate": 9.121653757303982e-05, + "loss": 4.0027, + "step": 10350 + }, + { + "epoch": 0.7035602663405354, + "grad_norm": 1.3501478433609009, + "learning_rate": 9.121229107215655e-05, + "loss": 3.987, + "step": 10355 + }, + { + "epoch": 0.7038999864111972, + "grad_norm": 0.20116499066352844, + "learning_rate": 9.120804457127328e-05, + "loss": 4.2462, + "step": 10360 + }, + { + "epoch": 0.704239706481859, + "grad_norm": 0.22384878993034363, + "learning_rate": 9.120379807039001e-05, + "loss": 3.7601, + "step": 10365 + }, + { + "epoch": 0.7045794265525207, + "grad_norm": 0.2325512170791626, + "learning_rate": 9.119955156950673e-05, + "loss": 4.0985, + "step": 10370 + }, + { + "epoch": 0.7049191466231824, + "grad_norm": 0.16118580102920532, + "learning_rate": 9.119530506862346e-05, + "loss": 4.0038, + "step": 10375 + }, + { + "epoch": 0.7052588666938443, + "grad_norm": 0.16809016466140747, + "learning_rate": 9.119105856774018e-05, + "loss": 4.0822, + "step": 10380 + }, + { + "epoch": 0.705598586764506, + "grad_norm": 0.29214194416999817, + "learning_rate": 9.118681206685692e-05, + "loss": 3.9853, + "step": 10385 + }, + { + "epoch": 0.7059383068351678, + "grad_norm": 0.17380909621715546, + "learning_rate": 9.118256556597365e-05, + "loss": 3.9326, + "step": 10390 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 0.19394385814666748, + "learning_rate": 9.117831906509036e-05, + "loss": 4.0416, + "step": 10395 + }, + { + "epoch": 0.7066177469764914, + "grad_norm": 0.16878750920295715, + "learning_rate": 9.11740725642071e-05, + "loss": 4.2154, + "step": 10400 + }, + { + "epoch": 0.7069574670471531, + "grad_norm": 0.17969490587711334, + "learning_rate": 9.116982606332383e-05, + "loss": 3.8792, + "step": 10405 + }, + { + "epoch": 0.707297187117815, + "grad_norm": 0.18729190528392792, + "learning_rate": 9.116557956244055e-05, + "loss": 4.0986, + "step": 10410 + }, + { + "epoch": 0.7076369071884767, + "grad_norm": 0.16963765025138855, + "learning_rate": 9.116133306155729e-05, + "loss": 4.062, + "step": 10415 + }, + { + "epoch": 0.7079766272591385, + "grad_norm": 0.2884555757045746, + "learning_rate": 9.115708656067402e-05, + "loss": 4.078, + "step": 10420 + }, + { + "epoch": 0.7083163473298002, + "grad_norm": 0.29540711641311646, + "learning_rate": 9.115284005979073e-05, + "loss": 4.0736, + "step": 10425 + }, + { + "epoch": 0.708656067400462, + "grad_norm": 0.41383740305900574, + "learning_rate": 9.114859355890747e-05, + "loss": 4.2034, + "step": 10430 + }, + { + "epoch": 0.7089957874711238, + "grad_norm": 0.17644955217838287, + "learning_rate": 9.11443470580242e-05, + "loss": 4.1137, + "step": 10435 + }, + { + "epoch": 0.7093355075417855, + "grad_norm": 0.1741337925195694, + "learning_rate": 9.114010055714091e-05, + "loss": 4.0893, + "step": 10440 + }, + { + "epoch": 0.7096752276124474, + "grad_norm": 0.2527826130390167, + "learning_rate": 9.113585405625766e-05, + "loss": 4.091, + "step": 10445 + }, + { + "epoch": 0.7100149476831091, + "grad_norm": 0.1837528795003891, + "learning_rate": 9.113160755537438e-05, + "loss": 4.0449, + "step": 10450 + }, + { + "epoch": 0.7103546677537709, + "grad_norm": 0.18858109414577484, + "learning_rate": 9.11273610544911e-05, + "loss": 4.0575, + "step": 10455 + }, + { + "epoch": 0.7106943878244326, + "grad_norm": 0.1998942494392395, + "learning_rate": 9.112311455360784e-05, + "loss": 3.9053, + "step": 10460 + }, + { + "epoch": 0.7110341078950945, + "grad_norm": 0.20997202396392822, + "learning_rate": 9.111886805272455e-05, + "loss": 3.8818, + "step": 10465 + }, + { + "epoch": 0.7113738279657562, + "grad_norm": 0.17449168860912323, + "learning_rate": 9.111462155184128e-05, + "loss": 4.0812, + "step": 10470 + }, + { + "epoch": 0.711713548036418, + "grad_norm": 0.20188404619693756, + "learning_rate": 9.111037505095802e-05, + "loss": 3.918, + "step": 10475 + }, + { + "epoch": 0.7120532681070798, + "grad_norm": 0.16575513780117035, + "learning_rate": 9.110612855007474e-05, + "loss": 4.053, + "step": 10480 + }, + { + "epoch": 0.7123929881777415, + "grad_norm": 0.20615115761756897, + "learning_rate": 9.110188204919147e-05, + "loss": 3.7282, + "step": 10485 + }, + { + "epoch": 0.7127327082484033, + "grad_norm": 0.17066192626953125, + "learning_rate": 9.109763554830821e-05, + "loss": 4.0129, + "step": 10490 + }, + { + "epoch": 0.7130724283190651, + "grad_norm": 0.2495145946741104, + "learning_rate": 9.109338904742492e-05, + "loss": 3.9959, + "step": 10495 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.19648021459579468, + "learning_rate": 9.108914254654165e-05, + "loss": 4.0106, + "step": 10500 + }, + { + "epoch": 0.7137518684603886, + "grad_norm": 0.20270267128944397, + "learning_rate": 9.108489604565839e-05, + "loss": 4.0734, + "step": 10505 + }, + { + "epoch": 0.7140915885310504, + "grad_norm": 0.1632177233695984, + "learning_rate": 9.10806495447751e-05, + "loss": 4.2488, + "step": 10510 + }, + { + "epoch": 0.7144313086017122, + "grad_norm": 0.1604064404964447, + "learning_rate": 9.107640304389183e-05, + "loss": 3.8896, + "step": 10515 + }, + { + "epoch": 0.714771028672374, + "grad_norm": 0.21193253993988037, + "learning_rate": 9.107215654300858e-05, + "loss": 3.9983, + "step": 10520 + }, + { + "epoch": 0.7151107487430357, + "grad_norm": 0.3716839551925659, + "learning_rate": 9.106791004212529e-05, + "loss": 3.9367, + "step": 10525 + }, + { + "epoch": 0.7154504688136976, + "grad_norm": 0.1587960124015808, + "learning_rate": 9.106366354124202e-05, + "loss": 3.7641, + "step": 10530 + }, + { + "epoch": 0.7157901888843593, + "grad_norm": 4.4356184005737305, + "learning_rate": 9.105941704035875e-05, + "loss": 4.0203, + "step": 10535 + }, + { + "epoch": 0.716129908955021, + "grad_norm": 0.2456178516149521, + "learning_rate": 9.105517053947547e-05, + "loss": 4.0491, + "step": 10540 + }, + { + "epoch": 0.7164696290256828, + "grad_norm": 0.5795451402664185, + "learning_rate": 9.10509240385922e-05, + "loss": 3.8471, + "step": 10545 + }, + { + "epoch": 0.7168093490963446, + "grad_norm": 0.20285823941230774, + "learning_rate": 9.104667753770893e-05, + "loss": 4.1314, + "step": 10550 + }, + { + "epoch": 0.7171490691670064, + "grad_norm": 0.16531841456890106, + "learning_rate": 9.104243103682566e-05, + "loss": 4.1928, + "step": 10555 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.1814994513988495, + "learning_rate": 9.103818453594239e-05, + "loss": 4.0319, + "step": 10560 + }, + { + "epoch": 0.71782850930833, + "grad_norm": 0.16023671627044678, + "learning_rate": 9.103393803505911e-05, + "loss": 4.1279, + "step": 10565 + }, + { + "epoch": 0.7181682293789917, + "grad_norm": 0.2011050581932068, + "learning_rate": 9.102969153417584e-05, + "loss": 4.0957, + "step": 10570 + }, + { + "epoch": 0.7185079494496535, + "grad_norm": 0.21786335110664368, + "learning_rate": 9.102544503329257e-05, + "loss": 3.9609, + "step": 10575 + }, + { + "epoch": 0.7188476695203153, + "grad_norm": 0.22446627914905548, + "learning_rate": 9.10211985324093e-05, + "loss": 3.9967, + "step": 10580 + }, + { + "epoch": 0.719187389590977, + "grad_norm": 0.301862508058548, + "learning_rate": 9.101695203152603e-05, + "loss": 3.7026, + "step": 10585 + }, + { + "epoch": 0.7195271096616388, + "grad_norm": 0.19402875006198883, + "learning_rate": 9.101270553064275e-05, + "loss": 4.1842, + "step": 10590 + }, + { + "epoch": 0.7198668297323005, + "grad_norm": 0.2339441329240799, + "learning_rate": 9.100845902975948e-05, + "loss": 3.8018, + "step": 10595 + }, + { + "epoch": 0.7202065498029624, + "grad_norm": 0.1565333753824234, + "learning_rate": 9.100421252887621e-05, + "loss": 3.8915, + "step": 10600 + }, + { + "epoch": 0.7205462698736241, + "grad_norm": 0.16925933957099915, + "learning_rate": 9.099996602799294e-05, + "loss": 4.1202, + "step": 10605 + }, + { + "epoch": 0.7208859899442859, + "grad_norm": 0.13388794660568237, + "learning_rate": 9.099571952710967e-05, + "loss": 3.9878, + "step": 10610 + }, + { + "epoch": 0.7212257100149477, + "grad_norm": 0.15455901622772217, + "learning_rate": 9.09914730262264e-05, + "loss": 3.9719, + "step": 10615 + }, + { + "epoch": 0.7215654300856095, + "grad_norm": 0.19704410433769226, + "learning_rate": 9.098722652534312e-05, + "loss": 3.9981, + "step": 10620 + }, + { + "epoch": 0.7219051501562712, + "grad_norm": 0.1932808756828308, + "learning_rate": 9.098298002445985e-05, + "loss": 4.0945, + "step": 10625 + }, + { + "epoch": 0.722244870226933, + "grad_norm": 0.2587969899177551, + "learning_rate": 9.097873352357658e-05, + "loss": 4.0047, + "step": 10630 + }, + { + "epoch": 0.7225845902975948, + "grad_norm": 0.24855893850326538, + "learning_rate": 9.09744870226933e-05, + "loss": 3.927, + "step": 10635 + }, + { + "epoch": 0.7229243103682566, + "grad_norm": 0.2058570235967636, + "learning_rate": 9.097024052181003e-05, + "loss": 4.0545, + "step": 10640 + }, + { + "epoch": 0.7232640304389183, + "grad_norm": 0.20533056557178497, + "learning_rate": 9.096599402092676e-05, + "loss": 4.0696, + "step": 10645 + }, + { + "epoch": 0.7236037505095801, + "grad_norm": 0.2629019320011139, + "learning_rate": 9.096174752004349e-05, + "loss": 3.9769, + "step": 10650 + }, + { + "epoch": 0.7239434705802419, + "grad_norm": 0.2127770483493805, + "learning_rate": 9.095750101916022e-05, + "loss": 3.9245, + "step": 10655 + }, + { + "epoch": 0.7242831906509036, + "grad_norm": 0.1867874562740326, + "learning_rate": 9.095325451827695e-05, + "loss": 3.5818, + "step": 10660 + }, + { + "epoch": 0.7246229107215655, + "grad_norm": 0.25175002217292786, + "learning_rate": 9.094900801739366e-05, + "loss": 4.0028, + "step": 10665 + }, + { + "epoch": 0.7249626307922272, + "grad_norm": 0.18207040429115295, + "learning_rate": 9.09447615165104e-05, + "loss": 3.9293, + "step": 10670 + }, + { + "epoch": 0.725302350862889, + "grad_norm": 0.2175348699092865, + "learning_rate": 9.094051501562713e-05, + "loss": 4.116, + "step": 10675 + }, + { + "epoch": 0.7256420709335507, + "grad_norm": 0.1736600250005722, + "learning_rate": 9.093626851474386e-05, + "loss": 4.0422, + "step": 10680 + }, + { + "epoch": 0.7259817910042126, + "grad_norm": 0.2036193609237671, + "learning_rate": 9.093202201386059e-05, + "loss": 4.0416, + "step": 10685 + }, + { + "epoch": 0.7263215110748743, + "grad_norm": 0.5849753618240356, + "learning_rate": 9.092777551297731e-05, + "loss": 3.9602, + "step": 10690 + }, + { + "epoch": 0.726661231145536, + "grad_norm": 0.15708011388778687, + "learning_rate": 9.092352901209404e-05, + "loss": 3.9746, + "step": 10695 + }, + { + "epoch": 0.7270009512161979, + "grad_norm": 0.21586155891418457, + "learning_rate": 9.091928251121077e-05, + "loss": 4.1214, + "step": 10700 + }, + { + "epoch": 0.7273406712868596, + "grad_norm": 0.19140325486660004, + "learning_rate": 9.09150360103275e-05, + "loss": 4.0866, + "step": 10705 + }, + { + "epoch": 0.7276803913575214, + "grad_norm": 0.6538243889808655, + "learning_rate": 9.091078950944423e-05, + "loss": 4.1316, + "step": 10710 + }, + { + "epoch": 0.7280201114281831, + "grad_norm": 0.1779155284166336, + "learning_rate": 9.090654300856095e-05, + "loss": 4.0354, + "step": 10715 + }, + { + "epoch": 0.728359831498845, + "grad_norm": 0.24357867240905762, + "learning_rate": 9.090229650767768e-05, + "loss": 3.7766, + "step": 10720 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 0.15026213228702545, + "learning_rate": 9.089805000679441e-05, + "loss": 3.8955, + "step": 10725 + }, + { + "epoch": 0.7290392716401685, + "grad_norm": 0.19349145889282227, + "learning_rate": 9.089380350591114e-05, + "loss": 3.7735, + "step": 10730 + }, + { + "epoch": 0.7293789917108303, + "grad_norm": 0.25646016001701355, + "learning_rate": 9.088955700502785e-05, + "loss": 4.0811, + "step": 10735 + }, + { + "epoch": 0.7297187117814921, + "grad_norm": 0.17351272702217102, + "learning_rate": 9.08853105041446e-05, + "loss": 4.1555, + "step": 10740 + }, + { + "epoch": 0.7300584318521538, + "grad_norm": 0.17100609838962555, + "learning_rate": 9.088106400326132e-05, + "loss": 4.0126, + "step": 10745 + }, + { + "epoch": 0.7303981519228157, + "grad_norm": 0.25536659359931946, + "learning_rate": 9.087681750237804e-05, + "loss": 4.0176, + "step": 10750 + }, + { + "epoch": 0.7307378719934774, + "grad_norm": 0.2601194977760315, + "learning_rate": 9.087257100149478e-05, + "loss": 4.0722, + "step": 10755 + }, + { + "epoch": 0.7310775920641391, + "grad_norm": 0.19794826209545135, + "learning_rate": 9.08683245006115e-05, + "loss": 4.2004, + "step": 10760 + }, + { + "epoch": 0.7314173121348009, + "grad_norm": 0.2230055183172226, + "learning_rate": 9.086407799972822e-05, + "loss": 3.9865, + "step": 10765 + }, + { + "epoch": 0.7317570322054627, + "grad_norm": 0.24480870366096497, + "learning_rate": 9.085983149884496e-05, + "loss": 3.7441, + "step": 10770 + }, + { + "epoch": 0.7320967522761245, + "grad_norm": 0.15868893265724182, + "learning_rate": 9.085558499796169e-05, + "loss": 4.0704, + "step": 10775 + }, + { + "epoch": 0.7324364723467862, + "grad_norm": 0.3648226857185364, + "learning_rate": 9.08513384970784e-05, + "loss": 4.0741, + "step": 10780 + }, + { + "epoch": 0.7327761924174481, + "grad_norm": 1.1779170036315918, + "learning_rate": 9.084709199619515e-05, + "loss": 4.0508, + "step": 10785 + }, + { + "epoch": 0.7331159124881098, + "grad_norm": 0.5466019511222839, + "learning_rate": 9.084284549531187e-05, + "loss": 3.7824, + "step": 10790 + }, + { + "epoch": 0.7334556325587716, + "grad_norm": 0.2697416841983795, + "learning_rate": 9.083859899442859e-05, + "loss": 3.8887, + "step": 10795 + }, + { + "epoch": 0.7337953526294333, + "grad_norm": 0.1877298504114151, + "learning_rate": 9.083435249354533e-05, + "loss": 4.0495, + "step": 10800 + }, + { + "epoch": 0.7341350727000951, + "grad_norm": 0.21535362303256989, + "learning_rate": 9.083010599266204e-05, + "loss": 4.1075, + "step": 10805 + }, + { + "epoch": 0.7344747927707569, + "grad_norm": 0.15432433784008026, + "learning_rate": 9.082585949177877e-05, + "loss": 4.1321, + "step": 10810 + }, + { + "epoch": 0.7348145128414186, + "grad_norm": 0.17613931000232697, + "learning_rate": 9.082161299089551e-05, + "loss": 4.0152, + "step": 10815 + }, + { + "epoch": 0.7351542329120805, + "grad_norm": 0.17943201959133148, + "learning_rate": 9.081736649001223e-05, + "loss": 4.0291, + "step": 10820 + }, + { + "epoch": 0.7354939529827422, + "grad_norm": 0.17358125746250153, + "learning_rate": 9.081311998912896e-05, + "loss": 4.0228, + "step": 10825 + }, + { + "epoch": 0.735833673053404, + "grad_norm": 0.16327157616615295, + "learning_rate": 9.08088734882457e-05, + "loss": 3.8393, + "step": 10830 + }, + { + "epoch": 0.7361733931240658, + "grad_norm": 0.38127797842025757, + "learning_rate": 9.080462698736241e-05, + "loss": 4.0771, + "step": 10835 + }, + { + "epoch": 0.7365131131947276, + "grad_norm": 0.17917278409004211, + "learning_rate": 9.080038048647914e-05, + "loss": 3.9258, + "step": 10840 + }, + { + "epoch": 0.7368528332653893, + "grad_norm": 0.1688838005065918, + "learning_rate": 9.079613398559588e-05, + "loss": 3.8387, + "step": 10845 + }, + { + "epoch": 0.737192553336051, + "grad_norm": 0.1823907047510147, + "learning_rate": 9.07918874847126e-05, + "loss": 3.9959, + "step": 10850 + }, + { + "epoch": 0.7375322734067129, + "grad_norm": 0.20357432961463928, + "learning_rate": 9.078764098382932e-05, + "loss": 3.99, + "step": 10855 + }, + { + "epoch": 0.7378719934773746, + "grad_norm": 0.18823125958442688, + "learning_rate": 9.078339448294607e-05, + "loss": 4.0908, + "step": 10860 + }, + { + "epoch": 0.7382117135480364, + "grad_norm": 0.16174191236495972, + "learning_rate": 9.077914798206278e-05, + "loss": 4.0641, + "step": 10865 + }, + { + "epoch": 0.7385514336186982, + "grad_norm": 0.1720336228609085, + "learning_rate": 9.077490148117951e-05, + "loss": 3.8454, + "step": 10870 + }, + { + "epoch": 0.73889115368936, + "grad_norm": 0.23603537678718567, + "learning_rate": 9.077065498029625e-05, + "loss": 3.9752, + "step": 10875 + }, + { + "epoch": 0.7392308737600217, + "grad_norm": 0.20614124834537506, + "learning_rate": 9.076640847941296e-05, + "loss": 4.045, + "step": 10880 + }, + { + "epoch": 0.7395705938306835, + "grad_norm": 0.30947062373161316, + "learning_rate": 9.076216197852969e-05, + "loss": 3.9429, + "step": 10885 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 0.2017713338136673, + "learning_rate": 9.075791547764642e-05, + "loss": 3.9205, + "step": 10890 + }, + { + "epoch": 0.7402500339720071, + "grad_norm": 1.3917611837387085, + "learning_rate": 9.075366897676315e-05, + "loss": 4.0456, + "step": 10895 + }, + { + "epoch": 0.7405897540426688, + "grad_norm": 0.4103597104549408, + "learning_rate": 9.074942247587988e-05, + "loss": 3.9107, + "step": 10900 + }, + { + "epoch": 0.7409294741133307, + "grad_norm": 0.5144510269165039, + "learning_rate": 9.07451759749966e-05, + "loss": 4.0178, + "step": 10905 + }, + { + "epoch": 0.7412691941839924, + "grad_norm": 0.16965581476688385, + "learning_rate": 9.074092947411333e-05, + "loss": 3.9729, + "step": 10910 + }, + { + "epoch": 0.7416089142546541, + "grad_norm": 0.38037505745887756, + "learning_rate": 9.073668297323006e-05, + "loss": 4.2043, + "step": 10915 + }, + { + "epoch": 0.741948634325316, + "grad_norm": 0.26255086064338684, + "learning_rate": 9.073243647234679e-05, + "loss": 3.8663, + "step": 10920 + }, + { + "epoch": 0.7422883543959777, + "grad_norm": 0.3262099623680115, + "learning_rate": 9.072818997146352e-05, + "loss": 4.0803, + "step": 10925 + }, + { + "epoch": 0.7426280744666395, + "grad_norm": 0.21773168444633484, + "learning_rate": 9.072394347058024e-05, + "loss": 3.9992, + "step": 10930 + }, + { + "epoch": 0.7429677945373012, + "grad_norm": 0.22857216000556946, + "learning_rate": 9.071969696969697e-05, + "loss": 3.8582, + "step": 10935 + }, + { + "epoch": 0.7433075146079631, + "grad_norm": 0.312259316444397, + "learning_rate": 9.07154504688137e-05, + "loss": 4.0318, + "step": 10940 + }, + { + "epoch": 0.7436472346786248, + "grad_norm": 0.1695690155029297, + "learning_rate": 9.071120396793043e-05, + "loss": 3.9077, + "step": 10945 + }, + { + "epoch": 0.7439869547492866, + "grad_norm": 0.29498061537742615, + "learning_rate": 9.070695746704716e-05, + "loss": 4.3087, + "step": 10950 + }, + { + "epoch": 0.7443266748199484, + "grad_norm": 0.24566805362701416, + "learning_rate": 9.070271096616388e-05, + "loss": 3.8372, + "step": 10955 + }, + { + "epoch": 0.7446663948906102, + "grad_norm": 0.163113072514534, + "learning_rate": 9.069846446528061e-05, + "loss": 4.0402, + "step": 10960 + }, + { + "epoch": 0.7450061149612719, + "grad_norm": 0.18011754751205444, + "learning_rate": 9.069421796439734e-05, + "loss": 4.1377, + "step": 10965 + }, + { + "epoch": 0.7453458350319336, + "grad_norm": 0.8807979822158813, + "learning_rate": 9.068997146351407e-05, + "loss": 3.8495, + "step": 10970 + }, + { + "epoch": 0.7456855551025955, + "grad_norm": 0.22865957021713257, + "learning_rate": 9.06857249626308e-05, + "loss": 4.1146, + "step": 10975 + }, + { + "epoch": 0.7460252751732572, + "grad_norm": 0.2118086814880371, + "learning_rate": 9.068147846174752e-05, + "loss": 3.8964, + "step": 10980 + }, + { + "epoch": 0.746364995243919, + "grad_norm": 0.18207617104053497, + "learning_rate": 9.067723196086425e-05, + "loss": 3.9706, + "step": 10985 + }, + { + "epoch": 0.7467047153145808, + "grad_norm": 0.16859905421733856, + "learning_rate": 9.067298545998098e-05, + "loss": 3.7855, + "step": 10990 + }, + { + "epoch": 0.7470444353852426, + "grad_norm": 0.16500097513198853, + "learning_rate": 9.066873895909771e-05, + "loss": 3.7684, + "step": 10995 + }, + { + "epoch": 0.7473841554559043, + "grad_norm": 0.1520179957151413, + "learning_rate": 9.066449245821444e-05, + "loss": 3.8084, + "step": 11000 + }, + { + "epoch": 0.7477238755265662, + "grad_norm": 0.21755331754684448, + "learning_rate": 9.066024595733115e-05, + "loss": 3.8863, + "step": 11005 + }, + { + "epoch": 0.7480635955972279, + "grad_norm": 0.20671890676021576, + "learning_rate": 9.065599945644789e-05, + "loss": 3.9006, + "step": 11010 + }, + { + "epoch": 0.7484033156678896, + "grad_norm": 0.16787393391132355, + "learning_rate": 9.065175295556462e-05, + "loss": 3.7611, + "step": 11015 + }, + { + "epoch": 0.7487430357385514, + "grad_norm": 0.22157283127307892, + "learning_rate": 9.064750645468135e-05, + "loss": 4.1048, + "step": 11020 + }, + { + "epoch": 0.7490827558092132, + "grad_norm": 0.22022277116775513, + "learning_rate": 9.064325995379808e-05, + "loss": 3.8746, + "step": 11025 + }, + { + "epoch": 0.749422475879875, + "grad_norm": 0.2435934692621231, + "learning_rate": 9.06390134529148e-05, + "loss": 3.9949, + "step": 11030 + }, + { + "epoch": 0.7497621959505367, + "grad_norm": 0.18187767267227173, + "learning_rate": 9.063476695203153e-05, + "loss": 4.0584, + "step": 11035 + }, + { + "epoch": 0.7501019160211986, + "grad_norm": 0.18477857112884521, + "learning_rate": 9.063052045114826e-05, + "loss": 4.1217, + "step": 11040 + }, + { + "epoch": 0.7504416360918603, + "grad_norm": 0.1471758335828781, + "learning_rate": 9.062627395026499e-05, + "loss": 4.0513, + "step": 11045 + }, + { + "epoch": 0.7507813561625221, + "grad_norm": 0.20632903277873993, + "learning_rate": 9.062202744938172e-05, + "loss": 3.9865, + "step": 11050 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 0.21105721592903137, + "learning_rate": 9.061778094849844e-05, + "loss": 3.8277, + "step": 11055 + }, + { + "epoch": 0.7514607963038457, + "grad_norm": 0.19280456006526947, + "learning_rate": 9.061353444761517e-05, + "loss": 4.0091, + "step": 11060 + }, + { + "epoch": 0.7518005163745074, + "grad_norm": 0.1918146163225174, + "learning_rate": 9.06092879467319e-05, + "loss": 4.0648, + "step": 11065 + }, + { + "epoch": 0.7521402364451691, + "grad_norm": 0.22963494062423706, + "learning_rate": 9.060504144584863e-05, + "loss": 3.9061, + "step": 11070 + }, + { + "epoch": 0.752479956515831, + "grad_norm": 0.16479997336864471, + "learning_rate": 9.060079494496536e-05, + "loss": 4.0303, + "step": 11075 + }, + { + "epoch": 0.7528196765864927, + "grad_norm": 0.18432816863059998, + "learning_rate": 9.059654844408208e-05, + "loss": 3.8773, + "step": 11080 + }, + { + "epoch": 0.7531593966571545, + "grad_norm": 0.22336050868034363, + "learning_rate": 9.059230194319881e-05, + "loss": 3.7997, + "step": 11085 + }, + { + "epoch": 0.7534991167278163, + "grad_norm": 0.242068812251091, + "learning_rate": 9.058805544231553e-05, + "loss": 3.9268, + "step": 11090 + }, + { + "epoch": 0.7538388367984781, + "grad_norm": 0.14753904938697815, + "learning_rate": 9.058380894143227e-05, + "loss": 4.0985, + "step": 11095 + }, + { + "epoch": 0.7541785568691398, + "grad_norm": 0.19245490431785583, + "learning_rate": 9.0579562440549e-05, + "loss": 3.8995, + "step": 11100 + }, + { + "epoch": 0.7545182769398016, + "grad_norm": 0.18615277111530304, + "learning_rate": 9.057531593966571e-05, + "loss": 4.0072, + "step": 11105 + }, + { + "epoch": 0.7548579970104634, + "grad_norm": 0.19581812620162964, + "learning_rate": 9.057106943878245e-05, + "loss": 3.8758, + "step": 11110 + }, + { + "epoch": 0.7551977170811252, + "grad_norm": 0.15949614346027374, + "learning_rate": 9.056682293789918e-05, + "loss": 3.9473, + "step": 11115 + }, + { + "epoch": 0.7555374371517869, + "grad_norm": 1.359558343887329, + "learning_rate": 9.05625764370159e-05, + "loss": 3.9416, + "step": 11120 + }, + { + "epoch": 0.7558771572224487, + "grad_norm": 0.1593676656484604, + "learning_rate": 9.055832993613264e-05, + "loss": 4.0518, + "step": 11125 + }, + { + "epoch": 0.7562168772931105, + "grad_norm": 0.1715662181377411, + "learning_rate": 9.055408343524937e-05, + "loss": 3.982, + "step": 11130 + }, + { + "epoch": 0.7565565973637722, + "grad_norm": 0.1934783011674881, + "learning_rate": 9.054983693436608e-05, + "loss": 4.1628, + "step": 11135 + }, + { + "epoch": 0.756896317434434, + "grad_norm": 0.16365660727024078, + "learning_rate": 9.054559043348282e-05, + "loss": 4.0758, + "step": 11140 + }, + { + "epoch": 0.7572360375050958, + "grad_norm": 0.2995030879974365, + "learning_rate": 9.054134393259955e-05, + "loss": 3.915, + "step": 11145 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.22450530529022217, + "learning_rate": 9.053709743171626e-05, + "loss": 4.2514, + "step": 11150 + }, + { + "epoch": 0.7579154776464193, + "grad_norm": 3.4520628452301025, + "learning_rate": 9.0532850930833e-05, + "loss": 4.0989, + "step": 11155 + }, + { + "epoch": 0.7582551977170812, + "grad_norm": 0.31930363178253174, + "learning_rate": 9.052860442994972e-05, + "loss": 4.0048, + "step": 11160 + }, + { + "epoch": 0.7585949177877429, + "grad_norm": 0.21884280443191528, + "learning_rate": 9.052435792906645e-05, + "loss": 3.8112, + "step": 11165 + }, + { + "epoch": 0.7589346378584046, + "grad_norm": 0.1630697101354599, + "learning_rate": 9.052011142818319e-05, + "loss": 3.9614, + "step": 11170 + }, + { + "epoch": 0.7592743579290665, + "grad_norm": 0.2021295428276062, + "learning_rate": 9.05158649272999e-05, + "loss": 3.9433, + "step": 11175 + }, + { + "epoch": 0.7596140779997282, + "grad_norm": 0.21674887835979462, + "learning_rate": 9.051161842641663e-05, + "loss": 3.9271, + "step": 11180 + }, + { + "epoch": 0.75995379807039, + "grad_norm": 0.2624286413192749, + "learning_rate": 9.050737192553337e-05, + "loss": 3.8687, + "step": 11185 + }, + { + "epoch": 0.7602935181410517, + "grad_norm": 0.15679609775543213, + "learning_rate": 9.050312542465009e-05, + "loss": 4.0386, + "step": 11190 + }, + { + "epoch": 0.7606332382117136, + "grad_norm": 0.298999547958374, + "learning_rate": 9.049887892376682e-05, + "loss": 4.039, + "step": 11195 + }, + { + "epoch": 0.7609729582823753, + "grad_norm": 0.1916528344154358, + "learning_rate": 9.049463242288356e-05, + "loss": 3.9924, + "step": 11200 + }, + { + "epoch": 0.7613126783530371, + "grad_norm": 0.20813806354999542, + "learning_rate": 9.049038592200027e-05, + "loss": 3.9728, + "step": 11205 + }, + { + "epoch": 0.7616523984236989, + "grad_norm": 0.26955127716064453, + "learning_rate": 9.0486139421117e-05, + "loss": 3.9563, + "step": 11210 + }, + { + "epoch": 0.7619921184943607, + "grad_norm": 0.22610753774642944, + "learning_rate": 9.048189292023374e-05, + "loss": 3.9385, + "step": 11215 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 0.1515149027109146, + "learning_rate": 9.047764641935046e-05, + "loss": 4.0009, + "step": 11220 + }, + { + "epoch": 0.7626715586356841, + "grad_norm": 0.15270265936851501, + "learning_rate": 9.047339991846718e-05, + "loss": 3.8132, + "step": 11225 + }, + { + "epoch": 0.763011278706346, + "grad_norm": 0.256085067987442, + "learning_rate": 9.046915341758391e-05, + "loss": 4.1206, + "step": 11230 + }, + { + "epoch": 0.7633509987770077, + "grad_norm": 0.19662001729011536, + "learning_rate": 9.046490691670064e-05, + "loss": 4.0799, + "step": 11235 + }, + { + "epoch": 0.7636907188476695, + "grad_norm": 0.18421690165996552, + "learning_rate": 9.046066041581737e-05, + "loss": 4.0328, + "step": 11240 + }, + { + "epoch": 0.7640304389183313, + "grad_norm": 0.19748954474925995, + "learning_rate": 9.04564139149341e-05, + "loss": 3.7995, + "step": 11245 + }, + { + "epoch": 0.7643701589889931, + "grad_norm": 0.15954630076885223, + "learning_rate": 9.045216741405082e-05, + "loss": 3.8286, + "step": 11250 + }, + { + "epoch": 0.7647098790596548, + "grad_norm": 0.5489984154701233, + "learning_rate": 9.044792091316755e-05, + "loss": 4.2052, + "step": 11255 + }, + { + "epoch": 0.7650495991303167, + "grad_norm": 0.44495344161987305, + "learning_rate": 9.044367441228428e-05, + "loss": 4.1036, + "step": 11260 + }, + { + "epoch": 0.7653893192009784, + "grad_norm": 0.16555814445018768, + "learning_rate": 9.043942791140101e-05, + "loss": 3.9737, + "step": 11265 + }, + { + "epoch": 0.7657290392716402, + "grad_norm": 0.21692036092281342, + "learning_rate": 9.043518141051774e-05, + "loss": 4.0437, + "step": 11270 + }, + { + "epoch": 0.7660687593423019, + "grad_norm": 1.8791022300720215, + "learning_rate": 9.043093490963446e-05, + "loss": 4.0974, + "step": 11275 + }, + { + "epoch": 0.7664084794129638, + "grad_norm": 0.19241303205490112, + "learning_rate": 9.042668840875119e-05, + "loss": 3.9209, + "step": 11280 + }, + { + "epoch": 0.7667481994836255, + "grad_norm": 0.17803336679935455, + "learning_rate": 9.042244190786792e-05, + "loss": 3.842, + "step": 11285 + }, + { + "epoch": 0.7670879195542872, + "grad_norm": 0.20129168033599854, + "learning_rate": 9.041819540698465e-05, + "loss": 4.0462, + "step": 11290 + }, + { + "epoch": 0.7674276396249491, + "grad_norm": 0.18283264338970184, + "learning_rate": 9.041394890610138e-05, + "loss": 3.9094, + "step": 11295 + }, + { + "epoch": 0.7677673596956108, + "grad_norm": 0.20721754431724548, + "learning_rate": 9.04097024052181e-05, + "loss": 3.9013, + "step": 11300 + }, + { + "epoch": 0.7681070797662726, + "grad_norm": 0.43089064955711365, + "learning_rate": 9.040545590433483e-05, + "loss": 3.964, + "step": 11305 + }, + { + "epoch": 0.7684467998369343, + "grad_norm": 0.24873760342597961, + "learning_rate": 9.040120940345156e-05, + "loss": 4.0062, + "step": 11310 + }, + { + "epoch": 0.7687865199075962, + "grad_norm": 0.22243714332580566, + "learning_rate": 9.039696290256829e-05, + "loss": 3.8801, + "step": 11315 + }, + { + "epoch": 0.7691262399782579, + "grad_norm": 0.41750073432922363, + "learning_rate": 9.039271640168502e-05, + "loss": 4.2682, + "step": 11320 + }, + { + "epoch": 0.7694659600489197, + "grad_norm": 0.369742214679718, + "learning_rate": 9.038846990080174e-05, + "loss": 3.8898, + "step": 11325 + }, + { + "epoch": 0.7698056801195815, + "grad_norm": 0.26420828700065613, + "learning_rate": 9.038422339991847e-05, + "loss": 3.9038, + "step": 11330 + }, + { + "epoch": 0.7701454001902432, + "grad_norm": 0.2597283720970154, + "learning_rate": 9.03799768990352e-05, + "loss": 3.9406, + "step": 11335 + }, + { + "epoch": 0.770485120260905, + "grad_norm": 0.17518769204616547, + "learning_rate": 9.037573039815193e-05, + "loss": 3.6741, + "step": 11340 + }, + { + "epoch": 0.7708248403315668, + "grad_norm": 0.6777191758155823, + "learning_rate": 9.037148389726866e-05, + "loss": 4.0673, + "step": 11345 + }, + { + "epoch": 0.7711645604022286, + "grad_norm": 0.201960951089859, + "learning_rate": 9.036723739638538e-05, + "loss": 4.0074, + "step": 11350 + }, + { + "epoch": 0.7715042804728903, + "grad_norm": 0.4381665587425232, + "learning_rate": 9.036299089550211e-05, + "loss": 3.8382, + "step": 11355 + }, + { + "epoch": 0.7718440005435521, + "grad_norm": 0.1966671347618103, + "learning_rate": 9.035874439461884e-05, + "loss": 4.1766, + "step": 11360 + }, + { + "epoch": 0.7721837206142139, + "grad_norm": 0.16876500844955444, + "learning_rate": 9.035449789373557e-05, + "loss": 3.9402, + "step": 11365 + }, + { + "epoch": 0.7725234406848757, + "grad_norm": 4.147640705108643, + "learning_rate": 9.03502513928523e-05, + "loss": 4.109, + "step": 11370 + }, + { + "epoch": 0.7728631607555374, + "grad_norm": 0.2072206437587738, + "learning_rate": 9.034600489196902e-05, + "loss": 4.0296, + "step": 11375 + }, + { + "epoch": 0.7732028808261993, + "grad_norm": 0.2016468346118927, + "learning_rate": 9.034175839108575e-05, + "loss": 3.7331, + "step": 11380 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 0.47726747393608093, + "learning_rate": 9.033751189020248e-05, + "loss": 4.0178, + "step": 11385 + }, + { + "epoch": 0.7738823209675227, + "grad_norm": 0.17172425985336304, + "learning_rate": 9.033326538931921e-05, + "loss": 4.1782, + "step": 11390 + }, + { + "epoch": 0.7742220410381845, + "grad_norm": 0.1911281943321228, + "learning_rate": 9.032901888843594e-05, + "loss": 3.9972, + "step": 11395 + }, + { + "epoch": 0.7745617611088463, + "grad_norm": 0.14127899706363678, + "learning_rate": 9.032477238755266e-05, + "loss": 3.9405, + "step": 11400 + }, + { + "epoch": 0.7749014811795081, + "grad_norm": 0.1841440349817276, + "learning_rate": 9.032052588666939e-05, + "loss": 4.0964, + "step": 11405 + }, + { + "epoch": 0.7752412012501698, + "grad_norm": 0.17826926708221436, + "learning_rate": 9.031627938578612e-05, + "loss": 4.2658, + "step": 11410 + }, + { + "epoch": 0.7755809213208317, + "grad_norm": 0.2882947325706482, + "learning_rate": 9.031203288490285e-05, + "loss": 4.1508, + "step": 11415 + }, + { + "epoch": 0.7759206413914934, + "grad_norm": 0.21743571758270264, + "learning_rate": 9.030778638401958e-05, + "loss": 4.1659, + "step": 11420 + }, + { + "epoch": 0.7762603614621552, + "grad_norm": 0.23792824149131775, + "learning_rate": 9.03035398831363e-05, + "loss": 3.8291, + "step": 11425 + }, + { + "epoch": 0.776600081532817, + "grad_norm": 0.2318025827407837, + "learning_rate": 9.029929338225302e-05, + "loss": 3.8444, + "step": 11430 + }, + { + "epoch": 0.7769398016034788, + "grad_norm": 0.17949531972408295, + "learning_rate": 9.029504688136976e-05, + "loss": 3.9841, + "step": 11435 + }, + { + "epoch": 0.7772795216741405, + "grad_norm": 0.22435292601585388, + "learning_rate": 9.029080038048649e-05, + "loss": 3.9821, + "step": 11440 + }, + { + "epoch": 0.7776192417448022, + "grad_norm": 0.1865406632423401, + "learning_rate": 9.02865538796032e-05, + "loss": 3.935, + "step": 11445 + }, + { + "epoch": 0.7779589618154641, + "grad_norm": 0.2090293914079666, + "learning_rate": 9.028230737871994e-05, + "loss": 3.9991, + "step": 11450 + }, + { + "epoch": 0.7782986818861258, + "grad_norm": 0.18024842441082, + "learning_rate": 9.027806087783667e-05, + "loss": 4.076, + "step": 11455 + }, + { + "epoch": 0.7786384019567876, + "grad_norm": 0.17997018992900848, + "learning_rate": 9.027381437695339e-05, + "loss": 3.7585, + "step": 11460 + }, + { + "epoch": 0.7789781220274494, + "grad_norm": 0.16544857621192932, + "learning_rate": 9.026956787607013e-05, + "loss": 4.2184, + "step": 11465 + }, + { + "epoch": 0.7793178420981112, + "grad_norm": 0.17606359720230103, + "learning_rate": 9.026532137518686e-05, + "loss": 3.9412, + "step": 11470 + }, + { + "epoch": 0.7796575621687729, + "grad_norm": 0.2205812931060791, + "learning_rate": 9.026107487430357e-05, + "loss": 4.0843, + "step": 11475 + }, + { + "epoch": 0.7799972822394347, + "grad_norm": 0.25740867853164673, + "learning_rate": 9.025682837342031e-05, + "loss": 3.9544, + "step": 11480 + }, + { + "epoch": 0.7803370023100965, + "grad_norm": 0.14909543097019196, + "learning_rate": 9.025258187253704e-05, + "loss": 3.8342, + "step": 11485 + }, + { + "epoch": 0.7806767223807582, + "grad_norm": 0.24682089686393738, + "learning_rate": 9.024833537165375e-05, + "loss": 3.9944, + "step": 11490 + }, + { + "epoch": 0.78101644245142, + "grad_norm": 0.15707463026046753, + "learning_rate": 9.02440888707705e-05, + "loss": 3.9675, + "step": 11495 + }, + { + "epoch": 0.7813561625220818, + "grad_norm": 0.22718797624111176, + "learning_rate": 9.023984236988722e-05, + "loss": 3.6624, + "step": 11500 + }, + { + "epoch": 0.7816958825927436, + "grad_norm": 0.15948626399040222, + "learning_rate": 9.023559586900394e-05, + "loss": 3.9947, + "step": 11505 + }, + { + "epoch": 0.7820356026634053, + "grad_norm": 0.16061913967132568, + "learning_rate": 9.023134936812068e-05, + "loss": 4.1845, + "step": 11510 + }, + { + "epoch": 0.7823753227340672, + "grad_norm": 0.25919288396835327, + "learning_rate": 9.02271028672374e-05, + "loss": 3.9301, + "step": 11515 + }, + { + "epoch": 0.7827150428047289, + "grad_norm": 0.21657872200012207, + "learning_rate": 9.022285636635412e-05, + "loss": 4.0299, + "step": 11520 + }, + { + "epoch": 0.7830547628753907, + "grad_norm": 0.18826550245285034, + "learning_rate": 9.021860986547086e-05, + "loss": 4.212, + "step": 11525 + }, + { + "epoch": 0.7833944829460524, + "grad_norm": 0.2549474835395813, + "learning_rate": 9.021436336458758e-05, + "loss": 4.0705, + "step": 11530 + }, + { + "epoch": 0.7837342030167143, + "grad_norm": 0.6155955195426941, + "learning_rate": 9.02101168637043e-05, + "loss": 3.79, + "step": 11535 + }, + { + "epoch": 0.784073923087376, + "grad_norm": 0.1635499894618988, + "learning_rate": 9.020587036282105e-05, + "loss": 4.1024, + "step": 11540 + }, + { + "epoch": 0.7844136431580377, + "grad_norm": 0.15726587176322937, + "learning_rate": 9.020162386193776e-05, + "loss": 3.9499, + "step": 11545 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 0.16258913278579712, + "learning_rate": 9.019737736105449e-05, + "loss": 3.9322, + "step": 11550 + }, + { + "epoch": 0.7850930832993613, + "grad_norm": 0.2376587688922882, + "learning_rate": 9.019313086017123e-05, + "loss": 3.9946, + "step": 11555 + }, + { + "epoch": 0.7854328033700231, + "grad_norm": 0.1641000360250473, + "learning_rate": 9.018888435928795e-05, + "loss": 4.1131, + "step": 11560 + }, + { + "epoch": 0.7857725234406848, + "grad_norm": 0.18432609736919403, + "learning_rate": 9.018463785840467e-05, + "loss": 3.8617, + "step": 11565 + }, + { + "epoch": 0.7861122435113467, + "grad_norm": 0.31025978922843933, + "learning_rate": 9.018039135752142e-05, + "loss": 3.7286, + "step": 11570 + }, + { + "epoch": 0.7864519635820084, + "grad_norm": 0.18590706586837769, + "learning_rate": 9.017614485663813e-05, + "loss": 3.9636, + "step": 11575 + }, + { + "epoch": 0.7867916836526702, + "grad_norm": 0.18814896047115326, + "learning_rate": 9.017189835575486e-05, + "loss": 4.3321, + "step": 11580 + }, + { + "epoch": 0.787131403723332, + "grad_norm": 0.17569060623645782, + "learning_rate": 9.016765185487159e-05, + "loss": 4.0207, + "step": 11585 + }, + { + "epoch": 0.7874711237939938, + "grad_norm": 0.8084515333175659, + "learning_rate": 9.016340535398831e-05, + "loss": 3.8702, + "step": 11590 + }, + { + "epoch": 0.7878108438646555, + "grad_norm": 0.1719738245010376, + "learning_rate": 9.015915885310504e-05, + "loss": 3.9472, + "step": 11595 + }, + { + "epoch": 0.7881505639353173, + "grad_norm": 0.3733132481575012, + "learning_rate": 9.015491235222177e-05, + "loss": 3.9621, + "step": 11600 + }, + { + "epoch": 0.7884902840059791, + "grad_norm": 0.1931469440460205, + "learning_rate": 9.01506658513385e-05, + "loss": 4.1482, + "step": 11605 + }, + { + "epoch": 0.7888300040766408, + "grad_norm": 0.6097820997238159, + "learning_rate": 9.014641935045523e-05, + "loss": 4.2861, + "step": 11610 + }, + { + "epoch": 0.7891697241473026, + "grad_norm": 0.23092709481716156, + "learning_rate": 9.014217284957195e-05, + "loss": 3.7435, + "step": 11615 + }, + { + "epoch": 0.7895094442179644, + "grad_norm": 0.20437659323215485, + "learning_rate": 9.013792634868868e-05, + "loss": 4.0259, + "step": 11620 + }, + { + "epoch": 0.7898491642886262, + "grad_norm": 0.19561974704265594, + "learning_rate": 9.013367984780541e-05, + "loss": 3.8396, + "step": 11625 + }, + { + "epoch": 0.7901888843592879, + "grad_norm": 0.22799140214920044, + "learning_rate": 9.012943334692214e-05, + "loss": 4.0335, + "step": 11630 + }, + { + "epoch": 0.7905286044299498, + "grad_norm": 0.1820353865623474, + "learning_rate": 9.012518684603887e-05, + "loss": 3.9942, + "step": 11635 + }, + { + "epoch": 0.7908683245006115, + "grad_norm": 0.217819482088089, + "learning_rate": 9.01209403451556e-05, + "loss": 4.1374, + "step": 11640 + }, + { + "epoch": 0.7912080445712733, + "grad_norm": 0.20061899721622467, + "learning_rate": 9.011669384427232e-05, + "loss": 3.6462, + "step": 11645 + }, + { + "epoch": 0.791547764641935, + "grad_norm": 0.21914707124233246, + "learning_rate": 9.011244734338905e-05, + "loss": 4.0377, + "step": 11650 + }, + { + "epoch": 0.7918874847125968, + "grad_norm": 0.2225886732339859, + "learning_rate": 9.010820084250578e-05, + "loss": 4.1587, + "step": 11655 + }, + { + "epoch": 0.7922272047832586, + "grad_norm": 0.23360738158226013, + "learning_rate": 9.01039543416225e-05, + "loss": 4.0995, + "step": 11660 + }, + { + "epoch": 0.7925669248539203, + "grad_norm": 0.20647506415843964, + "learning_rate": 9.009970784073923e-05, + "loss": 4.0943, + "step": 11665 + }, + { + "epoch": 0.7929066449245822, + "grad_norm": 0.17202545702457428, + "learning_rate": 9.009546133985596e-05, + "loss": 3.8883, + "step": 11670 + }, + { + "epoch": 0.7932463649952439, + "grad_norm": 1.380285382270813, + "learning_rate": 9.009121483897269e-05, + "loss": 4.0137, + "step": 11675 + }, + { + "epoch": 0.7935860850659057, + "grad_norm": 0.23098598420619965, + "learning_rate": 9.008696833808942e-05, + "loss": 4.0737, + "step": 11680 + }, + { + "epoch": 0.7939258051365675, + "grad_norm": 0.17068329453468323, + "learning_rate": 9.008272183720615e-05, + "loss": 4.2746, + "step": 11685 + }, + { + "epoch": 0.7942655252072293, + "grad_norm": 0.23422260582447052, + "learning_rate": 9.007847533632287e-05, + "loss": 3.7582, + "step": 11690 + }, + { + "epoch": 0.794605245277891, + "grad_norm": 0.1885872483253479, + "learning_rate": 9.00742288354396e-05, + "loss": 4.0525, + "step": 11695 + }, + { + "epoch": 0.7949449653485527, + "grad_norm": 0.18177750706672668, + "learning_rate": 9.006998233455633e-05, + "loss": 4.0782, + "step": 11700 + }, + { + "epoch": 0.7952846854192146, + "grad_norm": 2.576247453689575, + "learning_rate": 9.006573583367306e-05, + "loss": 3.8726, + "step": 11705 + }, + { + "epoch": 0.7956244054898763, + "grad_norm": 0.16896361112594604, + "learning_rate": 9.006148933278979e-05, + "loss": 3.8062, + "step": 11710 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 0.1680668294429779, + "learning_rate": 9.005724283190651e-05, + "loss": 4.0136, + "step": 11715 + }, + { + "epoch": 0.7963038456311999, + "grad_norm": 0.18001356720924377, + "learning_rate": 9.005299633102324e-05, + "loss": 4.1243, + "step": 11720 + }, + { + "epoch": 0.7966435657018617, + "grad_norm": 0.19494907557964325, + "learning_rate": 9.004874983013997e-05, + "loss": 3.9437, + "step": 11725 + }, + { + "epoch": 0.7969832857725234, + "grad_norm": 0.18916480243206024, + "learning_rate": 9.00445033292567e-05, + "loss": 4.0779, + "step": 11730 + }, + { + "epoch": 0.7973230058431852, + "grad_norm": 0.211675226688385, + "learning_rate": 9.004025682837343e-05, + "loss": 3.8902, + "step": 11735 + }, + { + "epoch": 0.797662725913847, + "grad_norm": 0.2676939368247986, + "learning_rate": 9.003601032749015e-05, + "loss": 4.0427, + "step": 11740 + }, + { + "epoch": 0.7980024459845088, + "grad_norm": 0.20862559974193573, + "learning_rate": 9.003176382660688e-05, + "loss": 4.2145, + "step": 11745 + }, + { + "epoch": 0.7983421660551705, + "grad_norm": 0.20464570820331573, + "learning_rate": 9.002751732572361e-05, + "loss": 4.113, + "step": 11750 + }, + { + "epoch": 0.7986818861258324, + "grad_norm": 0.17028920352458954, + "learning_rate": 9.002327082484034e-05, + "loss": 3.6957, + "step": 11755 + }, + { + "epoch": 0.7990216061964941, + "grad_norm": 0.24813637137413025, + "learning_rate": 9.001902432395707e-05, + "loss": 4.1226, + "step": 11760 + }, + { + "epoch": 0.7993613262671558, + "grad_norm": 1.6223915815353394, + "learning_rate": 9.00147778230738e-05, + "loss": 3.9943, + "step": 11765 + }, + { + "epoch": 0.7997010463378177, + "grad_norm": 0.1639162003993988, + "learning_rate": 9.001053132219052e-05, + "loss": 3.9359, + "step": 11770 + }, + { + "epoch": 0.8000407664084794, + "grad_norm": 0.16888979077339172, + "learning_rate": 9.000628482130725e-05, + "loss": 3.8101, + "step": 11775 + }, + { + "epoch": 0.8003804864791412, + "grad_norm": 0.1576785147190094, + "learning_rate": 9.000203832042398e-05, + "loss": 4.062, + "step": 11780 + }, + { + "epoch": 0.8007202065498029, + "grad_norm": 0.19945354759693146, + "learning_rate": 8.99977918195407e-05, + "loss": 4.2053, + "step": 11785 + }, + { + "epoch": 0.8010599266204648, + "grad_norm": 0.13953137397766113, + "learning_rate": 8.999354531865743e-05, + "loss": 3.6578, + "step": 11790 + }, + { + "epoch": 0.8013996466911265, + "grad_norm": 0.1995120495557785, + "learning_rate": 8.998929881777416e-05, + "loss": 3.9994, + "step": 11795 + }, + { + "epoch": 0.8017393667617883, + "grad_norm": 0.22360244393348694, + "learning_rate": 8.998505231689088e-05, + "loss": 4.1224, + "step": 11800 + }, + { + "epoch": 0.8020790868324501, + "grad_norm": 0.20481501519680023, + "learning_rate": 8.998080581600762e-05, + "loss": 4.2824, + "step": 11805 + }, + { + "epoch": 0.8024188069031118, + "grad_norm": 0.39974507689476013, + "learning_rate": 8.997655931512435e-05, + "loss": 4.1875, + "step": 11810 + }, + { + "epoch": 0.8027585269737736, + "grad_norm": 0.32297125458717346, + "learning_rate": 8.997231281424106e-05, + "loss": 3.8753, + "step": 11815 + }, + { + "epoch": 0.8030982470444353, + "grad_norm": 0.2076197862625122, + "learning_rate": 8.996891561353446e-05, + "loss": 4.0354, + "step": 11820 + }, + { + "epoch": 0.8034379671150972, + "grad_norm": 0.17974001169204712, + "learning_rate": 8.996466911265119e-05, + "loss": 4.0395, + "step": 11825 + }, + { + "epoch": 0.8037776871857589, + "grad_norm": 0.18468719720840454, + "learning_rate": 8.99604226117679e-05, + "loss": 4.0589, + "step": 11830 + }, + { + "epoch": 0.8041174072564207, + "grad_norm": 0.2775026857852936, + "learning_rate": 8.995617611088464e-05, + "loss": 4.0899, + "step": 11835 + }, + { + "epoch": 0.8044571273270825, + "grad_norm": 0.20769546926021576, + "learning_rate": 8.995192961000136e-05, + "loss": 4.1438, + "step": 11840 + }, + { + "epoch": 0.8047968473977443, + "grad_norm": 0.18603581190109253, + "learning_rate": 8.994768310911808e-05, + "loss": 3.9822, + "step": 11845 + }, + { + "epoch": 0.805136567468406, + "grad_norm": 0.3347295820713043, + "learning_rate": 8.994343660823483e-05, + "loss": 4.042, + "step": 11850 + }, + { + "epoch": 0.8054762875390679, + "grad_norm": 0.26557305455207825, + "learning_rate": 8.993919010735154e-05, + "loss": 3.9212, + "step": 11855 + }, + { + "epoch": 0.8058160076097296, + "grad_norm": 0.27433109283447266, + "learning_rate": 8.993494360646827e-05, + "loss": 3.7039, + "step": 11860 + }, + { + "epoch": 0.8061557276803913, + "grad_norm": 0.1835566610097885, + "learning_rate": 8.993069710558501e-05, + "loss": 3.8467, + "step": 11865 + }, + { + "epoch": 0.8064954477510531, + "grad_norm": 0.15933853387832642, + "learning_rate": 8.992645060470172e-05, + "loss": 4.035, + "step": 11870 + }, + { + "epoch": 0.8068351678217149, + "grad_norm": 0.1779545098543167, + "learning_rate": 8.992220410381845e-05, + "loss": 4.0, + "step": 11875 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 0.19771164655685425, + "learning_rate": 8.99179576029352e-05, + "loss": 3.5028, + "step": 11880 + }, + { + "epoch": 0.8075146079630384, + "grad_norm": 0.17675349116325378, + "learning_rate": 8.991371110205191e-05, + "loss": 3.9989, + "step": 11885 + }, + { + "epoch": 0.8078543280337003, + "grad_norm": 0.23120000958442688, + "learning_rate": 8.990946460116864e-05, + "loss": 3.7091, + "step": 11890 + }, + { + "epoch": 0.808194048104362, + "grad_norm": 0.18149258196353912, + "learning_rate": 8.990521810028538e-05, + "loss": 3.9628, + "step": 11895 + }, + { + "epoch": 0.8085337681750238, + "grad_norm": 0.2674315571784973, + "learning_rate": 8.990097159940209e-05, + "loss": 3.7765, + "step": 11900 + }, + { + "epoch": 0.8088734882456855, + "grad_norm": 0.21212173998355865, + "learning_rate": 8.989672509851883e-05, + "loss": 3.8405, + "step": 11905 + }, + { + "epoch": 0.8092132083163474, + "grad_norm": 0.16879509389400482, + "learning_rate": 8.989247859763555e-05, + "loss": 4.0809, + "step": 11910 + }, + { + "epoch": 0.8095529283870091, + "grad_norm": 0.14125306904315948, + "learning_rate": 8.988823209675228e-05, + "loss": 4.0725, + "step": 11915 + }, + { + "epoch": 0.8098926484576708, + "grad_norm": 0.1498613953590393, + "learning_rate": 8.988398559586902e-05, + "loss": 3.7912, + "step": 11920 + }, + { + "epoch": 0.8102323685283327, + "grad_norm": 0.16456682980060577, + "learning_rate": 8.987973909498573e-05, + "loss": 4.0187, + "step": 11925 + }, + { + "epoch": 0.8105720885989944, + "grad_norm": 0.3114604949951172, + "learning_rate": 8.987549259410246e-05, + "loss": 4.0009, + "step": 11930 + }, + { + "epoch": 0.8109118086696562, + "grad_norm": 0.5615077018737793, + "learning_rate": 8.98712460932192e-05, + "loss": 4.0609, + "step": 11935 + }, + { + "epoch": 0.811251528740318, + "grad_norm": 0.27753254771232605, + "learning_rate": 8.986699959233592e-05, + "loss": 3.8107, + "step": 11940 + }, + { + "epoch": 0.8115912488109798, + "grad_norm": 0.21950267255306244, + "learning_rate": 8.986275309145264e-05, + "loss": 3.8093, + "step": 11945 + }, + { + "epoch": 0.8119309688816415, + "grad_norm": 0.17988736927509308, + "learning_rate": 8.985850659056939e-05, + "loss": 3.9343, + "step": 11950 + }, + { + "epoch": 0.8122706889523033, + "grad_norm": 0.23350049555301666, + "learning_rate": 8.98542600896861e-05, + "loss": 4.0691, + "step": 11955 + }, + { + "epoch": 0.8126104090229651, + "grad_norm": 0.19277788698673248, + "learning_rate": 8.985001358880283e-05, + "loss": 3.9519, + "step": 11960 + }, + { + "epoch": 0.8129501290936268, + "grad_norm": 0.21622268855571747, + "learning_rate": 8.984576708791957e-05, + "loss": 3.8689, + "step": 11965 + }, + { + "epoch": 0.8132898491642886, + "grad_norm": 1.2102338075637817, + "learning_rate": 8.984152058703628e-05, + "loss": 3.9296, + "step": 11970 + }, + { + "epoch": 0.8136295692349504, + "grad_norm": 0.2097243219614029, + "learning_rate": 8.983727408615301e-05, + "loss": 4.2595, + "step": 11975 + }, + { + "epoch": 0.8139692893056122, + "grad_norm": 0.3595362603664398, + "learning_rate": 8.983302758526974e-05, + "loss": 3.9542, + "step": 11980 + }, + { + "epoch": 0.8143090093762739, + "grad_norm": 0.18622025847434998, + "learning_rate": 8.982878108438647e-05, + "loss": 3.8882, + "step": 11985 + }, + { + "epoch": 0.8146487294469357, + "grad_norm": 0.19790107011795044, + "learning_rate": 8.98245345835032e-05, + "loss": 4.1604, + "step": 11990 + }, + { + "epoch": 0.8149884495175975, + "grad_norm": 0.21050450205802917, + "learning_rate": 8.982028808261992e-05, + "loss": 3.7831, + "step": 11995 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 0.2178838849067688, + "learning_rate": 8.981604158173665e-05, + "loss": 3.8846, + "step": 12000 + }, + { + "epoch": 0.815667889658921, + "grad_norm": 0.20060613751411438, + "learning_rate": 8.981179508085338e-05, + "loss": 4.0856, + "step": 12005 + }, + { + "epoch": 0.8160076097295829, + "grad_norm": 0.19663146138191223, + "learning_rate": 8.980754857997011e-05, + "loss": 3.9376, + "step": 12010 + }, + { + "epoch": 0.8163473298002446, + "grad_norm": 0.36938565969467163, + "learning_rate": 8.980330207908684e-05, + "loss": 4.121, + "step": 12015 + }, + { + "epoch": 0.8166870498709063, + "grad_norm": 0.17913353443145752, + "learning_rate": 8.979905557820356e-05, + "loss": 3.9507, + "step": 12020 + }, + { + "epoch": 0.8170267699415682, + "grad_norm": 0.18103277683258057, + "learning_rate": 8.979480907732029e-05, + "loss": 3.651, + "step": 12025 + }, + { + "epoch": 0.8173664900122299, + "grad_norm": 0.1673816740512848, + "learning_rate": 8.979056257643702e-05, + "loss": 3.9309, + "step": 12030 + }, + { + "epoch": 0.8177062100828917, + "grad_norm": 0.4948117434978485, + "learning_rate": 8.978631607555375e-05, + "loss": 3.9102, + "step": 12035 + }, + { + "epoch": 0.8180459301535534, + "grad_norm": 0.16142868995666504, + "learning_rate": 8.978206957467048e-05, + "loss": 3.8234, + "step": 12040 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 0.2791318893432617, + "learning_rate": 8.97778230737872e-05, + "loss": 4.1417, + "step": 12045 + }, + { + "epoch": 0.818725370294877, + "grad_norm": 0.17257554829120636, + "learning_rate": 8.977357657290393e-05, + "loss": 4.043, + "step": 12050 + }, + { + "epoch": 0.8190650903655388, + "grad_norm": 0.2123369425535202, + "learning_rate": 8.976933007202066e-05, + "loss": 3.9297, + "step": 12055 + }, + { + "epoch": 0.8194048104362006, + "grad_norm": 0.19854533672332764, + "learning_rate": 8.976508357113739e-05, + "loss": 3.8441, + "step": 12060 + }, + { + "epoch": 0.8197445305068624, + "grad_norm": 0.24193492531776428, + "learning_rate": 8.976083707025412e-05, + "loss": 4.0607, + "step": 12065 + }, + { + "epoch": 0.8200842505775241, + "grad_norm": 0.1961483359336853, + "learning_rate": 8.975659056937084e-05, + "loss": 3.8928, + "step": 12070 + }, + { + "epoch": 0.8204239706481858, + "grad_norm": 0.14383824169635773, + "learning_rate": 8.975234406848757e-05, + "loss": 4.0603, + "step": 12075 + }, + { + "epoch": 0.8207636907188477, + "grad_norm": 0.2458658516407013, + "learning_rate": 8.97480975676043e-05, + "loss": 3.8334, + "step": 12080 + }, + { + "epoch": 0.8211034107895094, + "grad_norm": 0.17008869349956512, + "learning_rate": 8.974385106672103e-05, + "loss": 3.9803, + "step": 12085 + }, + { + "epoch": 0.8214431308601712, + "grad_norm": 0.20078590512275696, + "learning_rate": 8.973960456583776e-05, + "loss": 4.1641, + "step": 12090 + }, + { + "epoch": 0.821782850930833, + "grad_norm": 0.1937909722328186, + "learning_rate": 8.973535806495448e-05, + "loss": 4.0954, + "step": 12095 + }, + { + "epoch": 0.8221225710014948, + "grad_norm": 0.18328414857387543, + "learning_rate": 8.973111156407121e-05, + "loss": 3.9427, + "step": 12100 + }, + { + "epoch": 0.8224622910721565, + "grad_norm": 0.2016650289297104, + "learning_rate": 8.972686506318794e-05, + "loss": 3.9728, + "step": 12105 + }, + { + "epoch": 0.8228020111428184, + "grad_norm": 0.17548047006130219, + "learning_rate": 8.972261856230465e-05, + "loss": 4.1828, + "step": 12110 + }, + { + "epoch": 0.8231417312134801, + "grad_norm": 0.39229270815849304, + "learning_rate": 8.97183720614214e-05, + "loss": 4.0582, + "step": 12115 + }, + { + "epoch": 0.8234814512841419, + "grad_norm": 0.18692153692245483, + "learning_rate": 8.971412556053812e-05, + "loss": 3.9979, + "step": 12120 + }, + { + "epoch": 0.8238211713548036, + "grad_norm": 0.22566412389278412, + "learning_rate": 8.970987905965484e-05, + "loss": 4.0853, + "step": 12125 + }, + { + "epoch": 0.8241608914254654, + "grad_norm": 0.2699925899505615, + "learning_rate": 8.970563255877158e-05, + "loss": 4.2611, + "step": 12130 + }, + { + "epoch": 0.8245006114961272, + "grad_norm": 0.2766724228858948, + "learning_rate": 8.970138605788831e-05, + "loss": 3.8676, + "step": 12135 + }, + { + "epoch": 0.8248403315667889, + "grad_norm": 0.149053156375885, + "learning_rate": 8.969713955700502e-05, + "loss": 4.0457, + "step": 12140 + }, + { + "epoch": 0.8251800516374508, + "grad_norm": 0.29666438698768616, + "learning_rate": 8.969289305612176e-05, + "loss": 4.0984, + "step": 12145 + }, + { + "epoch": 0.8255197717081125, + "grad_norm": 0.1891719549894333, + "learning_rate": 8.968864655523849e-05, + "loss": 4.009, + "step": 12150 + }, + { + "epoch": 0.8258594917787743, + "grad_norm": 0.1801346093416214, + "learning_rate": 8.968440005435521e-05, + "loss": 4.1132, + "step": 12155 + }, + { + "epoch": 0.826199211849436, + "grad_norm": 1.4226734638214111, + "learning_rate": 8.968015355347195e-05, + "loss": 3.9887, + "step": 12160 + }, + { + "epoch": 0.8265389319200979, + "grad_norm": 0.3138851523399353, + "learning_rate": 8.967590705258868e-05, + "loss": 4.0721, + "step": 12165 + }, + { + "epoch": 0.8268786519907596, + "grad_norm": 0.19921836256980896, + "learning_rate": 8.967166055170539e-05, + "loss": 4.0086, + "step": 12170 + }, + { + "epoch": 0.8272183720614213, + "grad_norm": 0.2232120782136917, + "learning_rate": 8.966741405082213e-05, + "loss": 3.8876, + "step": 12175 + }, + { + "epoch": 0.8275580921320832, + "grad_norm": 0.21191275119781494, + "learning_rate": 8.966316754993885e-05, + "loss": 3.9509, + "step": 12180 + }, + { + "epoch": 0.8278978122027449, + "grad_norm": 1.0071362257003784, + "learning_rate": 8.965892104905557e-05, + "loss": 4.0208, + "step": 12185 + }, + { + "epoch": 0.8282375322734067, + "grad_norm": 0.30778366327285767, + "learning_rate": 8.965467454817232e-05, + "loss": 4.0989, + "step": 12190 + }, + { + "epoch": 0.8285772523440685, + "grad_norm": 0.15304256975650787, + "learning_rate": 8.965042804728903e-05, + "loss": 4.1485, + "step": 12195 + }, + { + "epoch": 0.8289169724147303, + "grad_norm": 0.1691897213459015, + "learning_rate": 8.964618154640576e-05, + "loss": 4.1178, + "step": 12200 + }, + { + "epoch": 0.829256692485392, + "grad_norm": 0.2017151266336441, + "learning_rate": 8.96419350455225e-05, + "loss": 4.1713, + "step": 12205 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 0.45046570897102356, + "learning_rate": 8.963768854463921e-05, + "loss": 3.8627, + "step": 12210 + }, + { + "epoch": 0.8299361326267156, + "grad_norm": 0.21693216264247894, + "learning_rate": 8.963344204375594e-05, + "loss": 3.969, + "step": 12215 + }, + { + "epoch": 0.8302758526973774, + "grad_norm": 0.22828508913516998, + "learning_rate": 8.962919554287268e-05, + "loss": 3.9584, + "step": 12220 + }, + { + "epoch": 0.8306155727680391, + "grad_norm": 0.2518628239631653, + "learning_rate": 8.96249490419894e-05, + "loss": 4.0381, + "step": 12225 + }, + { + "epoch": 0.830955292838701, + "grad_norm": 0.1994330883026123, + "learning_rate": 8.962070254110613e-05, + "loss": 3.8909, + "step": 12230 + }, + { + "epoch": 0.8312950129093627, + "grad_norm": 0.1634039431810379, + "learning_rate": 8.961645604022287e-05, + "loss": 3.5102, + "step": 12235 + }, + { + "epoch": 0.8316347329800244, + "grad_norm": 0.17670801281929016, + "learning_rate": 8.961220953933958e-05, + "loss": 3.8131, + "step": 12240 + }, + { + "epoch": 0.8319744530506862, + "grad_norm": 0.19512879848480225, + "learning_rate": 8.960796303845632e-05, + "loss": 3.5444, + "step": 12245 + }, + { + "epoch": 0.832314173121348, + "grad_norm": 0.5432287454605103, + "learning_rate": 8.960371653757305e-05, + "loss": 3.8723, + "step": 12250 + }, + { + "epoch": 0.8326538931920098, + "grad_norm": 0.21648722887039185, + "learning_rate": 8.959947003668977e-05, + "loss": 3.7991, + "step": 12255 + }, + { + "epoch": 0.8329936132626715, + "grad_norm": 0.3105649948120117, + "learning_rate": 8.959522353580651e-05, + "loss": 3.8407, + "step": 12260 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.32569703459739685, + "learning_rate": 8.959097703492322e-05, + "loss": 4.2322, + "step": 12265 + }, + { + "epoch": 0.8336730534039951, + "grad_norm": 0.2628330588340759, + "learning_rate": 8.958673053403995e-05, + "loss": 3.599, + "step": 12270 + }, + { + "epoch": 0.8340127734746569, + "grad_norm": 0.15857040882110596, + "learning_rate": 8.958248403315669e-05, + "loss": 4.0897, + "step": 12275 + }, + { + "epoch": 0.8343524935453187, + "grad_norm": 0.20659282803535461, + "learning_rate": 8.957823753227341e-05, + "loss": 3.8562, + "step": 12280 + }, + { + "epoch": 0.8346922136159804, + "grad_norm": 0.21449913084506989, + "learning_rate": 8.957399103139014e-05, + "loss": 3.9756, + "step": 12285 + }, + { + "epoch": 0.8350319336866422, + "grad_norm": 0.1901203691959381, + "learning_rate": 8.956974453050688e-05, + "loss": 4.025, + "step": 12290 + }, + { + "epoch": 0.8353716537573039, + "grad_norm": 0.20290473103523254, + "learning_rate": 8.956549802962359e-05, + "loss": 3.8794, + "step": 12295 + }, + { + "epoch": 0.8357113738279658, + "grad_norm": 0.1798480749130249, + "learning_rate": 8.956125152874032e-05, + "loss": 3.9994, + "step": 12300 + }, + { + "epoch": 0.8360510938986275, + "grad_norm": 0.25437653064727783, + "learning_rate": 8.955700502785706e-05, + "loss": 3.6909, + "step": 12305 + }, + { + "epoch": 0.8363908139692893, + "grad_norm": 0.3736377954483032, + "learning_rate": 8.955275852697378e-05, + "loss": 4.0622, + "step": 12310 + }, + { + "epoch": 0.8367305340399511, + "grad_norm": 0.18843521177768707, + "learning_rate": 8.95485120260905e-05, + "loss": 3.7766, + "step": 12315 + }, + { + "epoch": 0.8370702541106129, + "grad_norm": 0.18586871027946472, + "learning_rate": 8.954426552520724e-05, + "loss": 4.0302, + "step": 12320 + }, + { + "epoch": 0.8374099741812746, + "grad_norm": 0.17058813571929932, + "learning_rate": 8.954001902432396e-05, + "loss": 4.0756, + "step": 12325 + }, + { + "epoch": 0.8377496942519363, + "grad_norm": 0.21250483393669128, + "learning_rate": 8.953577252344069e-05, + "loss": 3.6402, + "step": 12330 + }, + { + "epoch": 0.8380894143225982, + "grad_norm": 0.18693235516548157, + "learning_rate": 8.953152602255742e-05, + "loss": 4.2171, + "step": 12335 + }, + { + "epoch": 0.83842913439326, + "grad_norm": 0.20142552256584167, + "learning_rate": 8.952727952167414e-05, + "loss": 4.0116, + "step": 12340 + }, + { + "epoch": 0.8387688544639217, + "grad_norm": 0.5936012864112854, + "learning_rate": 8.952303302079087e-05, + "loss": 4.0355, + "step": 12345 + }, + { + "epoch": 0.8391085745345835, + "grad_norm": 0.3252449631690979, + "learning_rate": 8.95187865199076e-05, + "loss": 4.0143, + "step": 12350 + }, + { + "epoch": 0.8394482946052453, + "grad_norm": 0.18693962693214417, + "learning_rate": 8.951454001902433e-05, + "loss": 3.938, + "step": 12355 + }, + { + "epoch": 0.839788014675907, + "grad_norm": 0.36720210313796997, + "learning_rate": 8.951029351814106e-05, + "loss": 3.9352, + "step": 12360 + }, + { + "epoch": 0.8401277347465689, + "grad_norm": 0.14825104176998138, + "learning_rate": 8.950604701725778e-05, + "loss": 4.2168, + "step": 12365 + }, + { + "epoch": 0.8404674548172306, + "grad_norm": 0.23677104711532593, + "learning_rate": 8.950180051637451e-05, + "loss": 3.967, + "step": 12370 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 0.5124611258506775, + "learning_rate": 8.949755401549124e-05, + "loss": 3.9194, + "step": 12375 + }, + { + "epoch": 0.8411468949585541, + "grad_norm": 0.3029448688030243, + "learning_rate": 8.949330751460797e-05, + "loss": 3.9354, + "step": 12380 + }, + { + "epoch": 0.841486615029216, + "grad_norm": 0.20730510354042053, + "learning_rate": 8.94890610137247e-05, + "loss": 4.1058, + "step": 12385 + }, + { + "epoch": 0.8418263350998777, + "grad_norm": 0.22315102815628052, + "learning_rate": 8.948481451284142e-05, + "loss": 3.893, + "step": 12390 + }, + { + "epoch": 0.8421660551705394, + "grad_norm": 0.47029411792755127, + "learning_rate": 8.948056801195815e-05, + "loss": 3.8572, + "step": 12395 + }, + { + "epoch": 0.8425057752412013, + "grad_norm": 0.19684122502803802, + "learning_rate": 8.947632151107488e-05, + "loss": 4.0835, + "step": 12400 + }, + { + "epoch": 0.842845495311863, + "grad_norm": 0.18742690980434418, + "learning_rate": 8.947207501019161e-05, + "loss": 3.9798, + "step": 12405 + }, + { + "epoch": 0.8431852153825248, + "grad_norm": 0.177710622549057, + "learning_rate": 8.946782850930834e-05, + "loss": 4.1085, + "step": 12410 + }, + { + "epoch": 0.8435249354531865, + "grad_norm": 0.18476006388664246, + "learning_rate": 8.946358200842506e-05, + "loss": 4.0936, + "step": 12415 + }, + { + "epoch": 0.8438646555238484, + "grad_norm": 0.16293834149837494, + "learning_rate": 8.946018480771845e-05, + "loss": 3.9854, + "step": 12420 + }, + { + "epoch": 0.8442043755945101, + "grad_norm": 0.22308799624443054, + "learning_rate": 8.945593830683517e-05, + "loss": 4.1042, + "step": 12425 + }, + { + "epoch": 0.8445440956651719, + "grad_norm": 0.16046442091464996, + "learning_rate": 8.94516918059519e-05, + "loss": 4.0012, + "step": 12430 + }, + { + "epoch": 0.8448838157358337, + "grad_norm": 0.26154249906539917, + "learning_rate": 8.944744530506862e-05, + "loss": 4.0459, + "step": 12435 + }, + { + "epoch": 0.8452235358064955, + "grad_norm": 0.23383556306362152, + "learning_rate": 8.944319880418536e-05, + "loss": 4.0728, + "step": 12440 + }, + { + "epoch": 0.8455632558771572, + "grad_norm": 0.22498470544815063, + "learning_rate": 8.943895230330209e-05, + "loss": 3.979, + "step": 12445 + }, + { + "epoch": 0.845902975947819, + "grad_norm": 0.18164518475532532, + "learning_rate": 8.943470580241881e-05, + "loss": 3.8648, + "step": 12450 + }, + { + "epoch": 0.8462426960184808, + "grad_norm": 0.21877458691596985, + "learning_rate": 8.943045930153554e-05, + "loss": 4.2495, + "step": 12455 + }, + { + "epoch": 0.8465824160891425, + "grad_norm": 0.1924646645784378, + "learning_rate": 8.942621280065227e-05, + "loss": 3.8453, + "step": 12460 + }, + { + "epoch": 0.8469221361598043, + "grad_norm": 0.20620742440223694, + "learning_rate": 8.9421966299769e-05, + "loss": 3.9962, + "step": 12465 + }, + { + "epoch": 0.8472618562304661, + "grad_norm": 0.290763795375824, + "learning_rate": 8.941771979888573e-05, + "loss": 3.8139, + "step": 12470 + }, + { + "epoch": 0.8476015763011279, + "grad_norm": 0.16620713472366333, + "learning_rate": 8.941347329800245e-05, + "loss": 3.7297, + "step": 12475 + }, + { + "epoch": 0.8479412963717896, + "grad_norm": 2.936108112335205, + "learning_rate": 8.940922679711918e-05, + "loss": 3.8085, + "step": 12480 + }, + { + "epoch": 0.8482810164424515, + "grad_norm": 0.1852518916130066, + "learning_rate": 8.940498029623591e-05, + "loss": 3.9015, + "step": 12485 + }, + { + "epoch": 0.8486207365131132, + "grad_norm": 0.159807950258255, + "learning_rate": 8.940073379535264e-05, + "loss": 3.9683, + "step": 12490 + }, + { + "epoch": 0.848960456583775, + "grad_norm": 0.15564261376857758, + "learning_rate": 8.939648729446937e-05, + "loss": 4.0309, + "step": 12495 + }, + { + "epoch": 0.8493001766544367, + "grad_norm": 0.18451645970344543, + "learning_rate": 8.93922407935861e-05, + "loss": 4.002, + "step": 12500 + }, + { + "epoch": 0.8496398967250985, + "grad_norm": 0.4409978687763214, + "learning_rate": 8.938799429270281e-05, + "loss": 3.9582, + "step": 12505 + }, + { + "epoch": 0.8499796167957603, + "grad_norm": 0.21767017245292664, + "learning_rate": 8.938374779181955e-05, + "loss": 3.7942, + "step": 12510 + }, + { + "epoch": 0.850319336866422, + "grad_norm": 0.29611897468566895, + "learning_rate": 8.937950129093628e-05, + "loss": 3.8641, + "step": 12515 + }, + { + "epoch": 0.8506590569370839, + "grad_norm": 0.15617810189723969, + "learning_rate": 8.937525479005299e-05, + "loss": 4.0887, + "step": 12520 + }, + { + "epoch": 0.8509987770077456, + "grad_norm": 0.19923017919063568, + "learning_rate": 8.937100828916973e-05, + "loss": 3.9796, + "step": 12525 + }, + { + "epoch": 0.8513384970784074, + "grad_norm": 0.178538978099823, + "learning_rate": 8.936676178828646e-05, + "loss": 4.1373, + "step": 12530 + }, + { + "epoch": 0.8516782171490692, + "grad_norm": 0.20157839357852936, + "learning_rate": 8.936251528740318e-05, + "loss": 4.1656, + "step": 12535 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 0.21917836368083954, + "learning_rate": 8.935826878651992e-05, + "loss": 3.9735, + "step": 12540 + }, + { + "epoch": 0.8523576572903927, + "grad_norm": 0.1743486225605011, + "learning_rate": 8.935402228563665e-05, + "loss": 4.0561, + "step": 12545 + }, + { + "epoch": 0.8526973773610544, + "grad_norm": 0.3735666573047638, + "learning_rate": 8.934977578475336e-05, + "loss": 3.9863, + "step": 12550 + }, + { + "epoch": 0.8530370974317163, + "grad_norm": 0.21841804683208466, + "learning_rate": 8.93455292838701e-05, + "loss": 4.0283, + "step": 12555 + }, + { + "epoch": 0.853376817502378, + "grad_norm": 0.4104025065898895, + "learning_rate": 8.934128278298683e-05, + "loss": 3.7465, + "step": 12560 + }, + { + "epoch": 0.8537165375730398, + "grad_norm": 0.15292176604270935, + "learning_rate": 8.933703628210354e-05, + "loss": 4.0306, + "step": 12565 + }, + { + "epoch": 0.8540562576437016, + "grad_norm": 0.30871346592903137, + "learning_rate": 8.933278978122029e-05, + "loss": 3.9504, + "step": 12570 + }, + { + "epoch": 0.8543959777143634, + "grad_norm": 0.18270432949066162, + "learning_rate": 8.9328543280337e-05, + "loss": 4.1066, + "step": 12575 + }, + { + "epoch": 0.8547356977850251, + "grad_norm": 0.5479381680488586, + "learning_rate": 8.932429677945373e-05, + "loss": 3.9433, + "step": 12580 + }, + { + "epoch": 0.8550754178556869, + "grad_norm": 0.19208583235740662, + "learning_rate": 8.932005027857047e-05, + "loss": 3.7949, + "step": 12585 + }, + { + "epoch": 0.8554151379263487, + "grad_norm": 0.19413863122463226, + "learning_rate": 8.931580377768718e-05, + "loss": 4.07, + "step": 12590 + }, + { + "epoch": 0.8557548579970105, + "grad_norm": 0.19963692128658295, + "learning_rate": 8.931155727680391e-05, + "loss": 4.038, + "step": 12595 + }, + { + "epoch": 0.8560945780676722, + "grad_norm": 0.18308939039707184, + "learning_rate": 8.930731077592065e-05, + "loss": 4.0086, + "step": 12600 + }, + { + "epoch": 0.856434298138334, + "grad_norm": 0.23906344175338745, + "learning_rate": 8.930306427503737e-05, + "loss": 3.835, + "step": 12605 + }, + { + "epoch": 0.8567740182089958, + "grad_norm": 0.15061314404010773, + "learning_rate": 8.92988177741541e-05, + "loss": 3.9051, + "step": 12610 + }, + { + "epoch": 0.8571137382796575, + "grad_norm": 0.4929114282131195, + "learning_rate": 8.929457127327084e-05, + "loss": 3.8619, + "step": 12615 + }, + { + "epoch": 0.8574534583503194, + "grad_norm": 0.23578637838363647, + "learning_rate": 8.929032477238755e-05, + "loss": 4.2782, + "step": 12620 + }, + { + "epoch": 0.8577931784209811, + "grad_norm": 0.2066326141357422, + "learning_rate": 8.928607827150428e-05, + "loss": 4.2725, + "step": 12625 + }, + { + "epoch": 0.8581328984916429, + "grad_norm": 0.22157415747642517, + "learning_rate": 8.928183177062102e-05, + "loss": 4.0701, + "step": 12630 + }, + { + "epoch": 0.8584726185623046, + "grad_norm": 0.9750187397003174, + "learning_rate": 8.927758526973774e-05, + "loss": 3.8094, + "step": 12635 + }, + { + "epoch": 0.8588123386329665, + "grad_norm": 0.17541570961475372, + "learning_rate": 8.927333876885446e-05, + "loss": 4.1766, + "step": 12640 + }, + { + "epoch": 0.8591520587036282, + "grad_norm": 0.1566866785287857, + "learning_rate": 8.92690922679712e-05, + "loss": 4.0965, + "step": 12645 + }, + { + "epoch": 0.85949177877429, + "grad_norm": 0.18229223787784576, + "learning_rate": 8.926484576708792e-05, + "loss": 3.9081, + "step": 12650 + }, + { + "epoch": 0.8598314988449518, + "grad_norm": 0.19233596324920654, + "learning_rate": 8.926059926620465e-05, + "loss": 3.9146, + "step": 12655 + }, + { + "epoch": 0.8601712189156135, + "grad_norm": 0.26399555802345276, + "learning_rate": 8.925635276532138e-05, + "loss": 3.8794, + "step": 12660 + }, + { + "epoch": 0.8605109389862753, + "grad_norm": 0.18803419172763824, + "learning_rate": 8.92521062644381e-05, + "loss": 4.2288, + "step": 12665 + }, + { + "epoch": 0.860850659056937, + "grad_norm": 0.2003091722726822, + "learning_rate": 8.924785976355483e-05, + "loss": 3.9376, + "step": 12670 + }, + { + "epoch": 0.8611903791275989, + "grad_norm": 2.6071043014526367, + "learning_rate": 8.924361326267156e-05, + "loss": 3.9786, + "step": 12675 + }, + { + "epoch": 0.8615300991982606, + "grad_norm": 0.21313896775245667, + "learning_rate": 8.923936676178829e-05, + "loss": 4.0317, + "step": 12680 + }, + { + "epoch": 0.8618698192689224, + "grad_norm": 0.17100752890110016, + "learning_rate": 8.923512026090502e-05, + "loss": 3.9988, + "step": 12685 + }, + { + "epoch": 0.8622095393395842, + "grad_norm": 0.23430535197257996, + "learning_rate": 8.923087376002174e-05, + "loss": 3.7328, + "step": 12690 + }, + { + "epoch": 0.862549259410246, + "grad_norm": 0.1643848717212677, + "learning_rate": 8.922662725913847e-05, + "loss": 4.109, + "step": 12695 + }, + { + "epoch": 0.8628889794809077, + "grad_norm": 0.2526448369026184, + "learning_rate": 8.92223807582552e-05, + "loss": 4.0189, + "step": 12700 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 0.24552328884601593, + "learning_rate": 8.921813425737193e-05, + "loss": 3.9226, + "step": 12705 + }, + { + "epoch": 0.8635684196222313, + "grad_norm": 0.17898543179035187, + "learning_rate": 8.921388775648866e-05, + "loss": 3.7268, + "step": 12710 + }, + { + "epoch": 0.863908139692893, + "grad_norm": 1.15958833694458, + "learning_rate": 8.920964125560538e-05, + "loss": 3.9694, + "step": 12715 + }, + { + "epoch": 0.8642478597635548, + "grad_norm": 0.15424911677837372, + "learning_rate": 8.920539475472211e-05, + "loss": 4.0695, + "step": 12720 + }, + { + "epoch": 0.8645875798342166, + "grad_norm": 0.1732960045337677, + "learning_rate": 8.920114825383884e-05, + "loss": 4.0694, + "step": 12725 + }, + { + "epoch": 0.8649272999048784, + "grad_norm": 0.25387898087501526, + "learning_rate": 8.919690175295557e-05, + "loss": 3.9112, + "step": 12730 + }, + { + "epoch": 0.8652670199755401, + "grad_norm": 0.18854272365570068, + "learning_rate": 8.91926552520723e-05, + "loss": 4.1747, + "step": 12735 + }, + { + "epoch": 0.865606740046202, + "grad_norm": 0.18318019807338715, + "learning_rate": 8.918840875118902e-05, + "loss": 4.0418, + "step": 12740 + }, + { + "epoch": 0.8659464601168637, + "grad_norm": 0.1997986137866974, + "learning_rate": 8.918416225030575e-05, + "loss": 3.9741, + "step": 12745 + }, + { + "epoch": 0.8662861801875255, + "grad_norm": 0.1834951937198639, + "learning_rate": 8.917991574942248e-05, + "loss": 4.0931, + "step": 12750 + }, + { + "epoch": 0.8666259002581872, + "grad_norm": 0.1741950660943985, + "learning_rate": 8.917566924853921e-05, + "loss": 3.8176, + "step": 12755 + }, + { + "epoch": 0.866965620328849, + "grad_norm": 0.18136630952358246, + "learning_rate": 8.917142274765594e-05, + "loss": 4.0065, + "step": 12760 + }, + { + "epoch": 0.8673053403995108, + "grad_norm": 0.22858762741088867, + "learning_rate": 8.916717624677266e-05, + "loss": 3.8951, + "step": 12765 + }, + { + "epoch": 0.8676450604701725, + "grad_norm": 0.15833310782909393, + "learning_rate": 8.916292974588939e-05, + "loss": 3.9966, + "step": 12770 + }, + { + "epoch": 0.8679847805408344, + "grad_norm": 0.16821962594985962, + "learning_rate": 8.915868324500611e-05, + "loss": 4.1885, + "step": 12775 + }, + { + "epoch": 0.8683245006114961, + "grad_norm": 0.16304543614387512, + "learning_rate": 8.915443674412285e-05, + "loss": 4.054, + "step": 12780 + }, + { + "epoch": 0.8686642206821579, + "grad_norm": 0.19145479798316956, + "learning_rate": 8.915019024323958e-05, + "loss": 4.0134, + "step": 12785 + }, + { + "epoch": 0.8690039407528197, + "grad_norm": 0.20590396225452423, + "learning_rate": 8.91459437423563e-05, + "loss": 3.9, + "step": 12790 + }, + { + "epoch": 0.8693436608234815, + "grad_norm": 0.1521267145872116, + "learning_rate": 8.914169724147303e-05, + "loss": 3.9639, + "step": 12795 + }, + { + "epoch": 0.8696833808941432, + "grad_norm": 1.8004759550094604, + "learning_rate": 8.913745074058976e-05, + "loss": 4.1324, + "step": 12800 + }, + { + "epoch": 0.870023100964805, + "grad_norm": 0.1751425862312317, + "learning_rate": 8.913320423970649e-05, + "loss": 4.0504, + "step": 12805 + }, + { + "epoch": 0.8703628210354668, + "grad_norm": 0.21332374215126038, + "learning_rate": 8.912895773882322e-05, + "loss": 4.095, + "step": 12810 + }, + { + "epoch": 0.8707025411061285, + "grad_norm": 0.2132454216480255, + "learning_rate": 8.912471123793994e-05, + "loss": 3.9654, + "step": 12815 + }, + { + "epoch": 0.8710422611767903, + "grad_norm": 0.1583162248134613, + "learning_rate": 8.912046473705667e-05, + "loss": 3.9617, + "step": 12820 + }, + { + "epoch": 0.8713819812474521, + "grad_norm": 0.19870373606681824, + "learning_rate": 8.91162182361734e-05, + "loss": 4.2326, + "step": 12825 + }, + { + "epoch": 0.8717217013181139, + "grad_norm": 0.18757444620132446, + "learning_rate": 8.911197173529013e-05, + "loss": 3.9562, + "step": 12830 + }, + { + "epoch": 0.8720614213887756, + "grad_norm": 0.15151342749595642, + "learning_rate": 8.910772523440686e-05, + "loss": 3.8253, + "step": 12835 + }, + { + "epoch": 0.8724011414594374, + "grad_norm": 0.23465317487716675, + "learning_rate": 8.910347873352358e-05, + "loss": 3.8039, + "step": 12840 + }, + { + "epoch": 0.8727408615300992, + "grad_norm": 0.3286290168762207, + "learning_rate": 8.909923223264031e-05, + "loss": 3.9344, + "step": 12845 + }, + { + "epoch": 0.873080581600761, + "grad_norm": 0.3796158730983734, + "learning_rate": 8.909498573175704e-05, + "loss": 4.2968, + "step": 12850 + }, + { + "epoch": 0.8734203016714227, + "grad_norm": 0.17161943018436432, + "learning_rate": 8.909073923087377e-05, + "loss": 4.0475, + "step": 12855 + }, + { + "epoch": 0.8737600217420846, + "grad_norm": 0.18927636742591858, + "learning_rate": 8.908649272999048e-05, + "loss": 3.9, + "step": 12860 + }, + { + "epoch": 0.8740997418127463, + "grad_norm": 0.17746715247631073, + "learning_rate": 8.908224622910722e-05, + "loss": 4.0236, + "step": 12865 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 0.2432946115732193, + "learning_rate": 8.907799972822395e-05, + "loss": 3.595, + "step": 12870 + }, + { + "epoch": 0.8747791819540699, + "grad_norm": 0.18916042149066925, + "learning_rate": 8.907375322734067e-05, + "loss": 3.9668, + "step": 12875 + }, + { + "epoch": 0.8751189020247316, + "grad_norm": 0.18429867923259735, + "learning_rate": 8.906950672645741e-05, + "loss": 3.8215, + "step": 12880 + }, + { + "epoch": 0.8754586220953934, + "grad_norm": 0.30447253584861755, + "learning_rate": 8.906526022557414e-05, + "loss": 3.94, + "step": 12885 + }, + { + "epoch": 0.8757983421660551, + "grad_norm": 0.1514458954334259, + "learning_rate": 8.906101372469085e-05, + "loss": 3.8559, + "step": 12890 + }, + { + "epoch": 0.876138062236717, + "grad_norm": 0.1644824743270874, + "learning_rate": 8.905676722380759e-05, + "loss": 4.136, + "step": 12895 + }, + { + "epoch": 0.8764777823073787, + "grad_norm": 0.1758906990289688, + "learning_rate": 8.905252072292432e-05, + "loss": 4.2008, + "step": 12900 + }, + { + "epoch": 0.8768175023780405, + "grad_norm": 0.19972476363182068, + "learning_rate": 8.904827422204104e-05, + "loss": 3.9077, + "step": 12905 + }, + { + "epoch": 0.8771572224487023, + "grad_norm": 0.21778126060962677, + "learning_rate": 8.904402772115778e-05, + "loss": 3.9808, + "step": 12910 + }, + { + "epoch": 0.877496942519364, + "grad_norm": 0.19557218253612518, + "learning_rate": 8.90397812202745e-05, + "loss": 3.7706, + "step": 12915 + }, + { + "epoch": 0.8778366625900258, + "grad_norm": 0.1903916746377945, + "learning_rate": 8.903553471939122e-05, + "loss": 3.8328, + "step": 12920 + }, + { + "epoch": 0.8781763826606876, + "grad_norm": 0.21190743148326874, + "learning_rate": 8.903128821850796e-05, + "loss": 3.8016, + "step": 12925 + }, + { + "epoch": 0.8785161027313494, + "grad_norm": 0.22646445035934448, + "learning_rate": 8.902704171762468e-05, + "loss": 3.9702, + "step": 12930 + }, + { + "epoch": 0.8788558228020111, + "grad_norm": 0.2211994081735611, + "learning_rate": 8.90227952167414e-05, + "loss": 4.0118, + "step": 12935 + }, + { + "epoch": 0.8791955428726729, + "grad_norm": 0.23225241899490356, + "learning_rate": 8.901854871585814e-05, + "loss": 4.1305, + "step": 12940 + }, + { + "epoch": 0.8795352629433347, + "grad_norm": 0.20378831028938293, + "learning_rate": 8.901430221497486e-05, + "loss": 3.9627, + "step": 12945 + }, + { + "epoch": 0.8798749830139965, + "grad_norm": 0.15453274548053741, + "learning_rate": 8.901005571409159e-05, + "loss": 3.8527, + "step": 12950 + }, + { + "epoch": 0.8802147030846582, + "grad_norm": 0.19792801141738892, + "learning_rate": 8.900580921320833e-05, + "loss": 4.0273, + "step": 12955 + }, + { + "epoch": 0.8805544231553201, + "grad_norm": 0.17211174964904785, + "learning_rate": 8.900156271232504e-05, + "loss": 3.9116, + "step": 12960 + }, + { + "epoch": 0.8808941432259818, + "grad_norm": 0.5672011375427246, + "learning_rate": 8.899731621144177e-05, + "loss": 3.9642, + "step": 12965 + }, + { + "epoch": 0.8812338632966435, + "grad_norm": 0.18356992304325104, + "learning_rate": 8.899306971055851e-05, + "loss": 3.9994, + "step": 12970 + }, + { + "epoch": 0.8815735833673053, + "grad_norm": 0.18002454936504364, + "learning_rate": 8.898882320967523e-05, + "loss": 3.9536, + "step": 12975 + }, + { + "epoch": 0.8819133034379671, + "grad_norm": 0.18387572467327118, + "learning_rate": 8.898457670879196e-05, + "loss": 4.0552, + "step": 12980 + }, + { + "epoch": 0.8822530235086289, + "grad_norm": 0.27319806814193726, + "learning_rate": 8.89803302079087e-05, + "loss": 3.9619, + "step": 12985 + }, + { + "epoch": 0.8825927435792906, + "grad_norm": 0.2080732136964798, + "learning_rate": 8.897608370702541e-05, + "loss": 4.1625, + "step": 12990 + }, + { + "epoch": 0.8829324636499525, + "grad_norm": 0.17268094420433044, + "learning_rate": 8.897183720614214e-05, + "loss": 3.9614, + "step": 12995 + }, + { + "epoch": 0.8832721837206142, + "grad_norm": 0.15917377173900604, + "learning_rate": 8.896759070525888e-05, + "loss": 3.9529, + "step": 13000 + }, + { + "epoch": 0.883611903791276, + "grad_norm": 0.22826112806797028, + "learning_rate": 8.89633442043756e-05, + "loss": 4.0541, + "step": 13005 + }, + { + "epoch": 0.8839516238619378, + "grad_norm": 0.2643020749092102, + "learning_rate": 8.895909770349232e-05, + "loss": 3.8896, + "step": 13010 + }, + { + "epoch": 0.8842913439325996, + "grad_norm": 0.19719457626342773, + "learning_rate": 8.895485120260905e-05, + "loss": 3.8283, + "step": 13015 + }, + { + "epoch": 0.8846310640032613, + "grad_norm": 0.18295039236545563, + "learning_rate": 8.895060470172578e-05, + "loss": 3.8955, + "step": 13020 + }, + { + "epoch": 0.884970784073923, + "grad_norm": 0.23282389342784882, + "learning_rate": 8.894635820084251e-05, + "loss": 3.9094, + "step": 13025 + }, + { + "epoch": 0.8853105041445849, + "grad_norm": 0.1835237592458725, + "learning_rate": 8.894211169995924e-05, + "loss": 3.9402, + "step": 13030 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 0.2141278088092804, + "learning_rate": 8.893786519907596e-05, + "loss": 4.0951, + "step": 13035 + }, + { + "epoch": 0.8859899442859084, + "grad_norm": 0.16725867986679077, + "learning_rate": 8.893361869819269e-05, + "loss": 3.8523, + "step": 13040 + }, + { + "epoch": 0.8863296643565702, + "grad_norm": 0.15616454184055328, + "learning_rate": 8.892937219730942e-05, + "loss": 3.9671, + "step": 13045 + }, + { + "epoch": 0.886669384427232, + "grad_norm": 0.17522963881492615, + "learning_rate": 8.892512569642615e-05, + "loss": 4.0624, + "step": 13050 + }, + { + "epoch": 0.8870091044978937, + "grad_norm": 0.19032767415046692, + "learning_rate": 8.892087919554288e-05, + "loss": 3.8977, + "step": 13055 + }, + { + "epoch": 0.8873488245685555, + "grad_norm": 0.4839637279510498, + "learning_rate": 8.89166326946596e-05, + "loss": 3.9498, + "step": 13060 + }, + { + "epoch": 0.8876885446392173, + "grad_norm": 0.15772879123687744, + "learning_rate": 8.891238619377633e-05, + "loss": 3.9909, + "step": 13065 + }, + { + "epoch": 0.8880282647098791, + "grad_norm": 0.23280905187129974, + "learning_rate": 8.890813969289306e-05, + "loss": 3.9899, + "step": 13070 + }, + { + "epoch": 0.8883679847805408, + "grad_norm": 0.21939218044281006, + "learning_rate": 8.890389319200979e-05, + "loss": 4.0157, + "step": 13075 + }, + { + "epoch": 0.8887077048512027, + "grad_norm": 0.19345150887966156, + "learning_rate": 8.889964669112652e-05, + "loss": 3.8884, + "step": 13080 + }, + { + "epoch": 0.8890474249218644, + "grad_norm": 0.18044187128543854, + "learning_rate": 8.889540019024324e-05, + "loss": 3.9464, + "step": 13085 + }, + { + "epoch": 0.8893871449925261, + "grad_norm": 0.7762069702148438, + "learning_rate": 8.889115368935997e-05, + "loss": 4.0273, + "step": 13090 + }, + { + "epoch": 0.889726865063188, + "grad_norm": 0.20319564640522003, + "learning_rate": 8.88869071884767e-05, + "loss": 3.943, + "step": 13095 + }, + { + "epoch": 0.8900665851338497, + "grad_norm": 0.4176552891731262, + "learning_rate": 8.888266068759343e-05, + "loss": 3.9463, + "step": 13100 + }, + { + "epoch": 0.8904063052045115, + "grad_norm": 0.6137194633483887, + "learning_rate": 8.887841418671016e-05, + "loss": 3.9732, + "step": 13105 + }, + { + "epoch": 0.8907460252751732, + "grad_norm": 0.3522084355354309, + "learning_rate": 8.887416768582688e-05, + "loss": 4.083, + "step": 13110 + }, + { + "epoch": 0.8910857453458351, + "grad_norm": 0.1649700552225113, + "learning_rate": 8.886992118494361e-05, + "loss": 3.8638, + "step": 13115 + }, + { + "epoch": 0.8914254654164968, + "grad_norm": 0.1624513566493988, + "learning_rate": 8.886567468406034e-05, + "loss": 4.1418, + "step": 13120 + }, + { + "epoch": 0.8917651854871586, + "grad_norm": 0.18356308341026306, + "learning_rate": 8.886142818317707e-05, + "loss": 3.8366, + "step": 13125 + }, + { + "epoch": 0.8921049055578204, + "grad_norm": 0.22682762145996094, + "learning_rate": 8.88571816822938e-05, + "loss": 3.8805, + "step": 13130 + }, + { + "epoch": 0.8924446256284821, + "grad_norm": 0.3890301287174225, + "learning_rate": 8.885293518141052e-05, + "loss": 3.8615, + "step": 13135 + }, + { + "epoch": 0.8927843456991439, + "grad_norm": 0.1815677434206009, + "learning_rate": 8.884868868052725e-05, + "loss": 3.9861, + "step": 13140 + }, + { + "epoch": 0.8931240657698056, + "grad_norm": 0.5793675780296326, + "learning_rate": 8.884444217964398e-05, + "loss": 4.0116, + "step": 13145 + }, + { + "epoch": 0.8934637858404675, + "grad_norm": 0.30733972787857056, + "learning_rate": 8.884019567876071e-05, + "loss": 3.8564, + "step": 13150 + }, + { + "epoch": 0.8938035059111292, + "grad_norm": 0.15819989144802094, + "learning_rate": 8.883594917787744e-05, + "loss": 4.2131, + "step": 13155 + }, + { + "epoch": 0.894143225981791, + "grad_norm": 0.1839209347963333, + "learning_rate": 8.883170267699416e-05, + "loss": 3.8485, + "step": 13160 + }, + { + "epoch": 0.8944829460524528, + "grad_norm": 0.2315215915441513, + "learning_rate": 8.882745617611089e-05, + "loss": 3.7713, + "step": 13165 + }, + { + "epoch": 0.8948226661231146, + "grad_norm": 0.18673592805862427, + "learning_rate": 8.882320967522762e-05, + "loss": 3.7431, + "step": 13170 + }, + { + "epoch": 0.8951623861937763, + "grad_norm": 0.1436724215745926, + "learning_rate": 8.881896317434435e-05, + "loss": 4.0778, + "step": 13175 + }, + { + "epoch": 0.8955021062644382, + "grad_norm": 0.23739652335643768, + "learning_rate": 8.881471667346108e-05, + "loss": 3.925, + "step": 13180 + }, + { + "epoch": 0.8958418263350999, + "grad_norm": 0.17830795049667358, + "learning_rate": 8.88104701725778e-05, + "loss": 4.0997, + "step": 13185 + }, + { + "epoch": 0.8961815464057616, + "grad_norm": 0.20703446865081787, + "learning_rate": 8.880622367169453e-05, + "loss": 3.8833, + "step": 13190 + }, + { + "epoch": 0.8965212664764234, + "grad_norm": 0.18058006465435028, + "learning_rate": 8.880197717081126e-05, + "loss": 4.1206, + "step": 13195 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 0.16602760553359985, + "learning_rate": 8.879773066992799e-05, + "loss": 4.0043, + "step": 13200 + }, + { + "epoch": 0.897200706617747, + "grad_norm": 0.2073521465063095, + "learning_rate": 8.879348416904472e-05, + "loss": 3.8992, + "step": 13205 + }, + { + "epoch": 0.8975404266884087, + "grad_norm": 0.1674661785364151, + "learning_rate": 8.878923766816144e-05, + "loss": 3.9651, + "step": 13210 + }, + { + "epoch": 0.8978801467590706, + "grad_norm": 0.19709919393062592, + "learning_rate": 8.878499116727816e-05, + "loss": 3.8494, + "step": 13215 + }, + { + "epoch": 0.8982198668297323, + "grad_norm": 0.18262673914432526, + "learning_rate": 8.87807446663949e-05, + "loss": 3.9929, + "step": 13220 + }, + { + "epoch": 0.8985595869003941, + "grad_norm": 0.1922248899936676, + "learning_rate": 8.877649816551163e-05, + "loss": 4.1698, + "step": 13225 + }, + { + "epoch": 0.8988993069710558, + "grad_norm": 0.22797107696533203, + "learning_rate": 8.877225166462834e-05, + "loss": 4.1415, + "step": 13230 + }, + { + "epoch": 0.8992390270417177, + "grad_norm": 0.21377937495708466, + "learning_rate": 8.876800516374508e-05, + "loss": 3.7365, + "step": 13235 + }, + { + "epoch": 0.8995787471123794, + "grad_norm": 0.15349645912647247, + "learning_rate": 8.876375866286181e-05, + "loss": 4.21, + "step": 13240 + }, + { + "epoch": 0.8999184671830411, + "grad_norm": 0.17594188451766968, + "learning_rate": 8.875951216197853e-05, + "loss": 3.6277, + "step": 13245 + }, + { + "epoch": 0.900258187253703, + "grad_norm": 0.21915188431739807, + "learning_rate": 8.875526566109527e-05, + "loss": 3.8465, + "step": 13250 + }, + { + "epoch": 0.9005979073243647, + "grad_norm": 0.1609984189271927, + "learning_rate": 8.8751019160212e-05, + "loss": 3.9646, + "step": 13255 + }, + { + "epoch": 0.9009376273950265, + "grad_norm": 0.22858203947544098, + "learning_rate": 8.874677265932871e-05, + "loss": 3.9217, + "step": 13260 + }, + { + "epoch": 0.9012773474656883, + "grad_norm": 0.1865098774433136, + "learning_rate": 8.874252615844545e-05, + "loss": 4.0665, + "step": 13265 + }, + { + "epoch": 0.9016170675363501, + "grad_norm": 0.27133429050445557, + "learning_rate": 8.873827965756218e-05, + "loss": 4.1901, + "step": 13270 + }, + { + "epoch": 0.9019567876070118, + "grad_norm": 0.26253220438957214, + "learning_rate": 8.87340331566789e-05, + "loss": 4.0598, + "step": 13275 + }, + { + "epoch": 0.9022965076776736, + "grad_norm": 0.19666416943073273, + "learning_rate": 8.872978665579564e-05, + "loss": 4.1986, + "step": 13280 + }, + { + "epoch": 0.9026362277483354, + "grad_norm": 0.16205628216266632, + "learning_rate": 8.872554015491235e-05, + "loss": 3.9244, + "step": 13285 + }, + { + "epoch": 0.9029759478189971, + "grad_norm": 0.21423132717609406, + "learning_rate": 8.872129365402908e-05, + "loss": 3.9477, + "step": 13290 + }, + { + "epoch": 0.9033156678896589, + "grad_norm": 0.18254421651363373, + "learning_rate": 8.871704715314582e-05, + "loss": 4.0589, + "step": 13295 + }, + { + "epoch": 0.9036553879603207, + "grad_norm": 0.17389804124832153, + "learning_rate": 8.871280065226253e-05, + "loss": 4.0397, + "step": 13300 + }, + { + "epoch": 0.9039951080309825, + "grad_norm": 0.23967847228050232, + "learning_rate": 8.870855415137926e-05, + "loss": 3.9713, + "step": 13305 + }, + { + "epoch": 0.9043348281016442, + "grad_norm": 0.15660813450813293, + "learning_rate": 8.8704307650496e-05, + "loss": 4.155, + "step": 13310 + }, + { + "epoch": 0.904674548172306, + "grad_norm": 0.1497335135936737, + "learning_rate": 8.870006114961272e-05, + "loss": 4.2737, + "step": 13315 + }, + { + "epoch": 0.9050142682429678, + "grad_norm": 0.16438940167427063, + "learning_rate": 8.869581464872945e-05, + "loss": 3.7956, + "step": 13320 + }, + { + "epoch": 0.9053539883136296, + "grad_norm": 0.19728811085224152, + "learning_rate": 8.869156814784619e-05, + "loss": 3.8597, + "step": 13325 + }, + { + "epoch": 0.9056937083842913, + "grad_norm": 0.19347457587718964, + "learning_rate": 8.86873216469629e-05, + "loss": 3.9375, + "step": 13330 + }, + { + "epoch": 0.9060334284549532, + "grad_norm": 0.1782982498407364, + "learning_rate": 8.868307514607963e-05, + "loss": 4.1215, + "step": 13335 + }, + { + "epoch": 0.9063731485256149, + "grad_norm": 0.16826027631759644, + "learning_rate": 8.867882864519637e-05, + "loss": 3.9993, + "step": 13340 + }, + { + "epoch": 0.9067128685962766, + "grad_norm": 0.1599569171667099, + "learning_rate": 8.867458214431309e-05, + "loss": 3.8991, + "step": 13345 + }, + { + "epoch": 0.9070525886669385, + "grad_norm": 2.387441396713257, + "learning_rate": 8.867033564342981e-05, + "loss": 4.0009, + "step": 13350 + }, + { + "epoch": 0.9073923087376002, + "grad_norm": 0.16353747248649597, + "learning_rate": 8.866608914254654e-05, + "loss": 3.8368, + "step": 13355 + }, + { + "epoch": 0.907732028808262, + "grad_norm": 0.18139252066612244, + "learning_rate": 8.866184264166327e-05, + "loss": 3.8567, + "step": 13360 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 0.28488507866859436, + "learning_rate": 8.865759614078e-05, + "loss": 3.9918, + "step": 13365 + }, + { + "epoch": 0.9084114689495856, + "grad_norm": 0.193324476480484, + "learning_rate": 8.865334963989673e-05, + "loss": 4.0164, + "step": 13370 + }, + { + "epoch": 0.9087511890202473, + "grad_norm": 0.18897615373134613, + "learning_rate": 8.864910313901345e-05, + "loss": 3.6622, + "step": 13375 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.18190960586071014, + "learning_rate": 8.864485663813018e-05, + "loss": 4.0802, + "step": 13380 + }, + { + "epoch": 0.9094306291615709, + "grad_norm": 0.3872554302215576, + "learning_rate": 8.864061013724691e-05, + "loss": 4.1349, + "step": 13385 + }, + { + "epoch": 0.9097703492322327, + "grad_norm": 0.18860095739364624, + "learning_rate": 8.863636363636364e-05, + "loss": 4.1308, + "step": 13390 + }, + { + "epoch": 0.9101100693028944, + "grad_norm": 0.1607290804386139, + "learning_rate": 8.863211713548037e-05, + "loss": 3.9187, + "step": 13395 + }, + { + "epoch": 0.9104497893735561, + "grad_norm": 0.24063172936439514, + "learning_rate": 8.86278706345971e-05, + "loss": 3.7071, + "step": 13400 + }, + { + "epoch": 0.910789509444218, + "grad_norm": 0.2261088341474533, + "learning_rate": 8.862362413371382e-05, + "loss": 4.3313, + "step": 13405 + }, + { + "epoch": 0.9111292295148797, + "grad_norm": 0.2024299055337906, + "learning_rate": 8.861937763283055e-05, + "loss": 3.7482, + "step": 13410 + }, + { + "epoch": 0.9114689495855415, + "grad_norm": 0.32624539732933044, + "learning_rate": 8.861513113194728e-05, + "loss": 3.9679, + "step": 13415 + }, + { + "epoch": 0.9118086696562033, + "grad_norm": 0.2082969844341278, + "learning_rate": 8.8610884631064e-05, + "loss": 4.1913, + "step": 13420 + }, + { + "epoch": 0.9121483897268651, + "grad_norm": 0.20776890218257904, + "learning_rate": 8.860663813018073e-05, + "loss": 4.0744, + "step": 13425 + }, + { + "epoch": 0.9124881097975268, + "grad_norm": 0.16512270271778107, + "learning_rate": 8.860239162929746e-05, + "loss": 4.0043, + "step": 13430 + }, + { + "epoch": 0.9128278298681887, + "grad_norm": 1.4505500793457031, + "learning_rate": 8.859814512841419e-05, + "loss": 3.8782, + "step": 13435 + }, + { + "epoch": 0.9131675499388504, + "grad_norm": 0.25911930203437805, + "learning_rate": 8.859389862753092e-05, + "loss": 3.6646, + "step": 13440 + }, + { + "epoch": 0.9135072700095122, + "grad_norm": 0.23000332713127136, + "learning_rate": 8.858965212664765e-05, + "loss": 4.0076, + "step": 13445 + }, + { + "epoch": 0.9138469900801739, + "grad_norm": 0.19737814366817474, + "learning_rate": 8.858540562576437e-05, + "loss": 3.8124, + "step": 13450 + }, + { + "epoch": 0.9141867101508357, + "grad_norm": 0.1637062281370163, + "learning_rate": 8.85811591248811e-05, + "loss": 3.8238, + "step": 13455 + }, + { + "epoch": 0.9145264302214975, + "grad_norm": 0.24078230559825897, + "learning_rate": 8.857691262399783e-05, + "loss": 4.2022, + "step": 13460 + }, + { + "epoch": 0.9148661502921592, + "grad_norm": 0.17801740765571594, + "learning_rate": 8.857266612311456e-05, + "loss": 4.11, + "step": 13465 + }, + { + "epoch": 0.9152058703628211, + "grad_norm": 0.18943698704242706, + "learning_rate": 8.856841962223129e-05, + "loss": 4.0061, + "step": 13470 + }, + { + "epoch": 0.9155455904334828, + "grad_norm": 0.1784680187702179, + "learning_rate": 8.856417312134801e-05, + "loss": 3.9399, + "step": 13475 + }, + { + "epoch": 0.9158853105041446, + "grad_norm": 0.19242218136787415, + "learning_rate": 8.855992662046474e-05, + "loss": 3.7147, + "step": 13480 + }, + { + "epoch": 0.9162250305748063, + "grad_norm": 0.1983332335948944, + "learning_rate": 8.855568011958147e-05, + "loss": 3.7828, + "step": 13485 + }, + { + "epoch": 0.9165647506454682, + "grad_norm": 0.1877221167087555, + "learning_rate": 8.85514336186982e-05, + "loss": 3.9979, + "step": 13490 + }, + { + "epoch": 0.9169044707161299, + "grad_norm": 0.287514328956604, + "learning_rate": 8.854718711781493e-05, + "loss": 4.0574, + "step": 13495 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 0.1971914917230606, + "learning_rate": 8.854294061693165e-05, + "loss": 3.8822, + "step": 13500 + }, + { + "epoch": 0.9175839108574535, + "grad_norm": 0.19974425435066223, + "learning_rate": 8.853869411604838e-05, + "loss": 4.0866, + "step": 13505 + }, + { + "epoch": 0.9179236309281152, + "grad_norm": 0.1471916139125824, + "learning_rate": 8.853444761516511e-05, + "loss": 4.2775, + "step": 13510 + }, + { + "epoch": 0.918263350998777, + "grad_norm": 0.2695204019546509, + "learning_rate": 8.853020111428184e-05, + "loss": 3.5593, + "step": 13515 + }, + { + "epoch": 0.9186030710694388, + "grad_norm": 0.3328780233860016, + "learning_rate": 8.852595461339857e-05, + "loss": 4.0474, + "step": 13520 + }, + { + "epoch": 0.9189427911401006, + "grad_norm": 0.22376228868961334, + "learning_rate": 8.85217081125153e-05, + "loss": 3.8726, + "step": 13525 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 1.0047399997711182, + "learning_rate": 8.851746161163202e-05, + "loss": 3.6569, + "step": 13530 + }, + { + "epoch": 0.9196222312814241, + "grad_norm": 0.20537297427654266, + "learning_rate": 8.851321511074875e-05, + "loss": 4.0576, + "step": 13535 + }, + { + "epoch": 0.9199619513520859, + "grad_norm": 0.19384372234344482, + "learning_rate": 8.850896860986548e-05, + "loss": 4.1421, + "step": 13540 + }, + { + "epoch": 0.9203016714227477, + "grad_norm": 0.24288725852966309, + "learning_rate": 8.85047221089822e-05, + "loss": 3.9712, + "step": 13545 + }, + { + "epoch": 0.9206413914934094, + "grad_norm": 0.17010895907878876, + "learning_rate": 8.850047560809893e-05, + "loss": 4.1179, + "step": 13550 + }, + { + "epoch": 0.9209811115640713, + "grad_norm": 0.2580501139163971, + "learning_rate": 8.849622910721565e-05, + "loss": 3.963, + "step": 13555 + }, + { + "epoch": 0.921320831634733, + "grad_norm": 0.19051308929920197, + "learning_rate": 8.849198260633239e-05, + "loss": 3.9626, + "step": 13560 + }, + { + "epoch": 0.9216605517053947, + "grad_norm": 0.18489578366279602, + "learning_rate": 8.848773610544912e-05, + "loss": 4.0235, + "step": 13565 + }, + { + "epoch": 0.9220002717760565, + "grad_norm": 0.1751706898212433, + "learning_rate": 8.848348960456583e-05, + "loss": 4.03, + "step": 13570 + }, + { + "epoch": 0.9223399918467183, + "grad_norm": 0.15869379043579102, + "learning_rate": 8.847924310368257e-05, + "loss": 3.9887, + "step": 13575 + }, + { + "epoch": 0.9226797119173801, + "grad_norm": 0.18910722434520721, + "learning_rate": 8.84749966027993e-05, + "loss": 3.9694, + "step": 13580 + }, + { + "epoch": 0.9230194319880418, + "grad_norm": 0.18458014726638794, + "learning_rate": 8.847075010191602e-05, + "loss": 3.7103, + "step": 13585 + }, + { + "epoch": 0.9233591520587037, + "grad_norm": 0.15150383114814758, + "learning_rate": 8.846650360103276e-05, + "loss": 3.9727, + "step": 13590 + }, + { + "epoch": 0.9236988721293654, + "grad_norm": 0.1747668832540512, + "learning_rate": 8.846225710014949e-05, + "loss": 4.0081, + "step": 13595 + }, + { + "epoch": 0.9240385922000272, + "grad_norm": 0.1757059097290039, + "learning_rate": 8.84580105992662e-05, + "loss": 3.8936, + "step": 13600 + }, + { + "epoch": 0.924378312270689, + "grad_norm": 0.17729350924491882, + "learning_rate": 8.845376409838294e-05, + "loss": 4.1143, + "step": 13605 + }, + { + "epoch": 0.9247180323413507, + "grad_norm": 0.18305832147598267, + "learning_rate": 8.844951759749967e-05, + "loss": 4.1294, + "step": 13610 + }, + { + "epoch": 0.9250577524120125, + "grad_norm": 1.0373708009719849, + "learning_rate": 8.844527109661639e-05, + "loss": 4.0552, + "step": 13615 + }, + { + "epoch": 0.9253974724826742, + "grad_norm": 0.18093042075634003, + "learning_rate": 8.844102459573313e-05, + "loss": 4.0405, + "step": 13620 + }, + { + "epoch": 0.9257371925533361, + "grad_norm": 0.1686364710330963, + "learning_rate": 8.843677809484985e-05, + "loss": 4.0729, + "step": 13625 + }, + { + "epoch": 0.9260769126239978, + "grad_norm": 0.12190663069486618, + "learning_rate": 8.843253159396657e-05, + "loss": 3.958, + "step": 13630 + }, + { + "epoch": 0.9264166326946596, + "grad_norm": 0.18013213574886322, + "learning_rate": 8.842828509308331e-05, + "loss": 4.0457, + "step": 13635 + }, + { + "epoch": 0.9267563527653214, + "grad_norm": 0.47123172879219055, + "learning_rate": 8.842403859220003e-05, + "loss": 3.9378, + "step": 13640 + }, + { + "epoch": 0.9270960728359832, + "grad_norm": 0.17332005500793457, + "learning_rate": 8.841979209131675e-05, + "loss": 3.9994, + "step": 13645 + }, + { + "epoch": 0.9274357929066449, + "grad_norm": 0.2330494374036789, + "learning_rate": 8.84155455904335e-05, + "loss": 4.2059, + "step": 13650 + }, + { + "epoch": 0.9277755129773066, + "grad_norm": 0.1844603419303894, + "learning_rate": 8.841129908955021e-05, + "loss": 3.9922, + "step": 13655 + }, + { + "epoch": 0.9281152330479685, + "grad_norm": 0.19840815663337708, + "learning_rate": 8.840705258866694e-05, + "loss": 4.0601, + "step": 13660 + }, + { + "epoch": 0.9284549531186302, + "grad_norm": 0.25802430510520935, + "learning_rate": 8.840280608778368e-05, + "loss": 3.8963, + "step": 13665 + }, + { + "epoch": 0.928794673189292, + "grad_norm": 0.1573478877544403, + "learning_rate": 8.83985595869004e-05, + "loss": 4.1013, + "step": 13670 + }, + { + "epoch": 0.9291343932599538, + "grad_norm": 0.20257075130939484, + "learning_rate": 8.839431308601712e-05, + "loss": 4.0064, + "step": 13675 + }, + { + "epoch": 0.9294741133306156, + "grad_norm": 0.2046387791633606, + "learning_rate": 8.839006658513386e-05, + "loss": 3.948, + "step": 13680 + }, + { + "epoch": 0.9298138334012773, + "grad_norm": 0.29030993580818176, + "learning_rate": 8.838582008425058e-05, + "loss": 3.7141, + "step": 13685 + }, + { + "epoch": 0.9301535534719392, + "grad_norm": 0.1701250672340393, + "learning_rate": 8.83815735833673e-05, + "loss": 3.8852, + "step": 13690 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 0.18545231223106384, + "learning_rate": 8.837732708248405e-05, + "loss": 3.8857, + "step": 13695 + }, + { + "epoch": 0.9308329936132627, + "grad_norm": 0.22790156304836273, + "learning_rate": 8.837308058160076e-05, + "loss": 3.9376, + "step": 13700 + }, + { + "epoch": 0.9311727136839244, + "grad_norm": 0.2060774713754654, + "learning_rate": 8.836883408071749e-05, + "loss": 4.0047, + "step": 13705 + }, + { + "epoch": 0.9315124337545863, + "grad_norm": 0.17100512981414795, + "learning_rate": 8.836458757983422e-05, + "loss": 4.0347, + "step": 13710 + }, + { + "epoch": 0.931852153825248, + "grad_norm": 0.18439409136772156, + "learning_rate": 8.836034107895095e-05, + "loss": 4.0597, + "step": 13715 + }, + { + "epoch": 0.9321918738959097, + "grad_norm": 0.20171356201171875, + "learning_rate": 8.835609457806767e-05, + "loss": 3.8904, + "step": 13720 + }, + { + "epoch": 0.9325315939665716, + "grad_norm": 0.2539232075214386, + "learning_rate": 8.83518480771844e-05, + "loss": 3.7127, + "step": 13725 + }, + { + "epoch": 0.9328713140372333, + "grad_norm": 0.303066611289978, + "learning_rate": 8.834760157630113e-05, + "loss": 3.858, + "step": 13730 + }, + { + "epoch": 0.9332110341078951, + "grad_norm": 10.150884628295898, + "learning_rate": 8.834335507541786e-05, + "loss": 4.214, + "step": 13735 + }, + { + "epoch": 0.9335507541785568, + "grad_norm": 0.17438891530036926, + "learning_rate": 8.833910857453459e-05, + "loss": 3.9798, + "step": 13740 + }, + { + "epoch": 0.9338904742492187, + "grad_norm": 0.1963614672422409, + "learning_rate": 8.833486207365131e-05, + "loss": 4.0225, + "step": 13745 + }, + { + "epoch": 0.9342301943198804, + "grad_norm": 0.33438101410865784, + "learning_rate": 8.833061557276804e-05, + "loss": 3.8237, + "step": 13750 + }, + { + "epoch": 0.9345699143905422, + "grad_norm": 0.15952834486961365, + "learning_rate": 8.832636907188477e-05, + "loss": 4.0374, + "step": 13755 + }, + { + "epoch": 0.934909634461204, + "grad_norm": 0.20450946688652039, + "learning_rate": 8.83221225710015e-05, + "loss": 3.8739, + "step": 13760 + }, + { + "epoch": 0.9352493545318658, + "grad_norm": 0.2039669156074524, + "learning_rate": 8.831787607011823e-05, + "loss": 3.953, + "step": 13765 + }, + { + "epoch": 0.9355890746025275, + "grad_norm": 0.15678437054157257, + "learning_rate": 8.831362956923495e-05, + "loss": 3.9398, + "step": 13770 + }, + { + "epoch": 0.9359287946731893, + "grad_norm": 0.5224528312683105, + "learning_rate": 8.830938306835168e-05, + "loss": 3.8759, + "step": 13775 + }, + { + "epoch": 0.9362685147438511, + "grad_norm": 0.2511097192764282, + "learning_rate": 8.830513656746841e-05, + "loss": 3.9613, + "step": 13780 + }, + { + "epoch": 0.9366082348145128, + "grad_norm": 0.14598341286182404, + "learning_rate": 8.830089006658514e-05, + "loss": 4.0519, + "step": 13785 + }, + { + "epoch": 0.9369479548851746, + "grad_norm": 0.18947695195674896, + "learning_rate": 8.829664356570187e-05, + "loss": 3.9679, + "step": 13790 + }, + { + "epoch": 0.9372876749558364, + "grad_norm": 0.19946783781051636, + "learning_rate": 8.82923970648186e-05, + "loss": 4.0397, + "step": 13795 + }, + { + "epoch": 0.9376273950264982, + "grad_norm": 0.1777060627937317, + "learning_rate": 8.828815056393532e-05, + "loss": 3.8039, + "step": 13800 + }, + { + "epoch": 0.9379671150971599, + "grad_norm": 0.2687058746814728, + "learning_rate": 8.828390406305205e-05, + "loss": 3.9514, + "step": 13805 + }, + { + "epoch": 0.9383068351678218, + "grad_norm": 0.17168082296848297, + "learning_rate": 8.827965756216878e-05, + "loss": 3.6143, + "step": 13810 + }, + { + "epoch": 0.9386465552384835, + "grad_norm": 0.2019256055355072, + "learning_rate": 8.82754110612855e-05, + "loss": 3.9093, + "step": 13815 + }, + { + "epoch": 0.9389862753091452, + "grad_norm": 0.15410295128822327, + "learning_rate": 8.827116456040223e-05, + "loss": 4.0334, + "step": 13820 + }, + { + "epoch": 0.939325995379807, + "grad_norm": 0.18953141570091248, + "learning_rate": 8.826691805951896e-05, + "loss": 3.5909, + "step": 13825 + }, + { + "epoch": 0.9396657154504688, + "grad_norm": 0.4388710856437683, + "learning_rate": 8.826267155863569e-05, + "loss": 4.0782, + "step": 13830 + }, + { + "epoch": 0.9400054355211306, + "grad_norm": 0.5807581543922424, + "learning_rate": 8.825842505775242e-05, + "loss": 3.7892, + "step": 13835 + }, + { + "epoch": 0.9403451555917923, + "grad_norm": 0.14411011338233948, + "learning_rate": 8.825417855686915e-05, + "loss": 3.9576, + "step": 13840 + }, + { + "epoch": 0.9406848756624542, + "grad_norm": 0.21537935733795166, + "learning_rate": 8.824993205598587e-05, + "loss": 3.8762, + "step": 13845 + }, + { + "epoch": 0.9410245957331159, + "grad_norm": 1.785944938659668, + "learning_rate": 8.82456855551026e-05, + "loss": 4.103, + "step": 13850 + }, + { + "epoch": 0.9413643158037777, + "grad_norm": 0.13651463389396667, + "learning_rate": 8.824143905421933e-05, + "loss": 4.032, + "step": 13855 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 0.16791968047618866, + "learning_rate": 8.823719255333606e-05, + "loss": 3.9523, + "step": 13860 + }, + { + "epoch": 0.9420437559451013, + "grad_norm": 0.22517231106758118, + "learning_rate": 8.823294605245279e-05, + "loss": 3.8512, + "step": 13865 + }, + { + "epoch": 0.942383476015763, + "grad_norm": 0.2402997463941574, + "learning_rate": 8.822869955156951e-05, + "loss": 3.9759, + "step": 13870 + }, + { + "epoch": 0.9427231960864247, + "grad_norm": 0.20395690202713013, + "learning_rate": 8.822445305068624e-05, + "loss": 3.9377, + "step": 13875 + }, + { + "epoch": 0.9430629161570866, + "grad_norm": 0.24165527522563934, + "learning_rate": 8.822020654980297e-05, + "loss": 3.9313, + "step": 13880 + }, + { + "epoch": 0.9434026362277483, + "grad_norm": 0.3844189941883087, + "learning_rate": 8.82159600489197e-05, + "loss": 3.7314, + "step": 13885 + }, + { + "epoch": 0.9437423562984101, + "grad_norm": 0.2007877379655838, + "learning_rate": 8.821171354803643e-05, + "loss": 4.0631, + "step": 13890 + }, + { + "epoch": 0.9440820763690719, + "grad_norm": 0.18726769089698792, + "learning_rate": 8.820746704715315e-05, + "loss": 4.055, + "step": 13895 + }, + { + "epoch": 0.9444217964397337, + "grad_norm": 0.38226640224456787, + "learning_rate": 8.820322054626988e-05, + "loss": 4.0028, + "step": 13900 + }, + { + "epoch": 0.9447615165103954, + "grad_norm": 0.1904405802488327, + "learning_rate": 8.819897404538661e-05, + "loss": 3.9834, + "step": 13905 + }, + { + "epoch": 0.9451012365810572, + "grad_norm": 0.1809813529253006, + "learning_rate": 8.819472754450332e-05, + "loss": 4.0637, + "step": 13910 + }, + { + "epoch": 0.945440956651719, + "grad_norm": 0.4116685092449188, + "learning_rate": 8.819048104362007e-05, + "loss": 3.9916, + "step": 13915 + }, + { + "epoch": 0.9457806767223808, + "grad_norm": 1.9458621740341187, + "learning_rate": 8.81862345427368e-05, + "loss": 4.0563, + "step": 13920 + }, + { + "epoch": 0.9461203967930425, + "grad_norm": 0.15543721616268158, + "learning_rate": 8.818198804185351e-05, + "loss": 4.0744, + "step": 13925 + }, + { + "epoch": 0.9464601168637043, + "grad_norm": 0.42934080958366394, + "learning_rate": 8.817774154097025e-05, + "loss": 3.8221, + "step": 13930 + }, + { + "epoch": 0.9467998369343661, + "grad_norm": 0.228485107421875, + "learning_rate": 8.817349504008698e-05, + "loss": 3.9268, + "step": 13935 + }, + { + "epoch": 0.9471395570050278, + "grad_norm": 0.21295882761478424, + "learning_rate": 8.816924853920369e-05, + "loss": 4.2737, + "step": 13940 + }, + { + "epoch": 0.9474792770756897, + "grad_norm": 0.2023978978395462, + "learning_rate": 8.816500203832043e-05, + "loss": 4.0087, + "step": 13945 + }, + { + "epoch": 0.9478189971463514, + "grad_norm": 0.1767956167459488, + "learning_rate": 8.816075553743716e-05, + "loss": 3.9129, + "step": 13950 + }, + { + "epoch": 0.9481587172170132, + "grad_norm": 1.3790664672851562, + "learning_rate": 8.815650903655388e-05, + "loss": 3.6768, + "step": 13955 + }, + { + "epoch": 0.9484984372876749, + "grad_norm": 0.583037793636322, + "learning_rate": 8.815226253567062e-05, + "loss": 3.8518, + "step": 13960 + }, + { + "epoch": 0.9488381573583368, + "grad_norm": 0.16457735002040863, + "learning_rate": 8.814801603478735e-05, + "loss": 3.7035, + "step": 13965 + }, + { + "epoch": 0.9491778774289985, + "grad_norm": 0.2307073473930359, + "learning_rate": 8.814376953390406e-05, + "loss": 4.1654, + "step": 13970 + }, + { + "epoch": 0.9495175974996602, + "grad_norm": 0.3080320358276367, + "learning_rate": 8.81395230330208e-05, + "loss": 3.975, + "step": 13975 + }, + { + "epoch": 0.9498573175703221, + "grad_norm": 0.17688848078250885, + "learning_rate": 8.813527653213752e-05, + "loss": 4.0859, + "step": 13980 + }, + { + "epoch": 0.9501970376409838, + "grad_norm": 0.24346010386943817, + "learning_rate": 8.813103003125424e-05, + "loss": 3.9282, + "step": 13985 + }, + { + "epoch": 0.9505367577116456, + "grad_norm": 0.1712096929550171, + "learning_rate": 8.812678353037099e-05, + "loss": 3.9035, + "step": 13990 + }, + { + "epoch": 0.9508764777823073, + "grad_norm": 0.19020821154117584, + "learning_rate": 8.81225370294877e-05, + "loss": 4.1025, + "step": 13995 + }, + { + "epoch": 0.9512161978529692, + "grad_norm": 0.21272438764572144, + "learning_rate": 8.811829052860443e-05, + "loss": 3.9025, + "step": 14000 + }, + { + "epoch": 0.9515559179236309, + "grad_norm": 0.14927330613136292, + "learning_rate": 8.811404402772117e-05, + "loss": 3.6392, + "step": 14005 + }, + { + "epoch": 0.9518956379942927, + "grad_norm": 0.14234548807144165, + "learning_rate": 8.810979752683788e-05, + "loss": 3.7538, + "step": 14010 + }, + { + "epoch": 0.9522353580649545, + "grad_norm": 0.19329407811164856, + "learning_rate": 8.810555102595461e-05, + "loss": 4.0138, + "step": 14015 + }, + { + "epoch": 0.9525750781356163, + "grad_norm": 1.4322816133499146, + "learning_rate": 8.810130452507135e-05, + "loss": 3.8935, + "step": 14020 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 0.19049431383609772, + "learning_rate": 8.809705802418807e-05, + "loss": 4.0516, + "step": 14025 + }, + { + "epoch": 0.9532545182769399, + "grad_norm": 0.2139282375574112, + "learning_rate": 8.80928115233048e-05, + "loss": 3.8331, + "step": 14030 + }, + { + "epoch": 0.9535942383476016, + "grad_norm": 0.17371760308742523, + "learning_rate": 8.808856502242154e-05, + "loss": 3.9821, + "step": 14035 + }, + { + "epoch": 0.9539339584182633, + "grad_norm": 0.16707171499729156, + "learning_rate": 8.808431852153825e-05, + "loss": 4.1538, + "step": 14040 + }, + { + "epoch": 0.9542736784889251, + "grad_norm": 0.2354259192943573, + "learning_rate": 8.808007202065498e-05, + "loss": 3.9828, + "step": 14045 + }, + { + "epoch": 0.9546133985595869, + "grad_norm": 0.26454171538352966, + "learning_rate": 8.807582551977172e-05, + "loss": 4.1493, + "step": 14050 + }, + { + "epoch": 0.9549531186302487, + "grad_norm": 0.26322662830352783, + "learning_rate": 8.807157901888844e-05, + "loss": 4.0161, + "step": 14055 + }, + { + "epoch": 0.9552928387009104, + "grad_norm": 0.5517749786376953, + "learning_rate": 8.806733251800516e-05, + "loss": 3.8552, + "step": 14060 + }, + { + "epoch": 0.9556325587715723, + "grad_norm": 0.34178662300109863, + "learning_rate": 8.806308601712189e-05, + "loss": 3.9374, + "step": 14065 + }, + { + "epoch": 0.955972278842234, + "grad_norm": 0.4318985044956207, + "learning_rate": 8.805883951623862e-05, + "loss": 3.9312, + "step": 14070 + }, + { + "epoch": 0.9563119989128958, + "grad_norm": 0.7835567593574524, + "learning_rate": 8.805459301535535e-05, + "loss": 4.0294, + "step": 14075 + }, + { + "epoch": 0.9566517189835575, + "grad_norm": 0.248532235622406, + "learning_rate": 8.805034651447208e-05, + "loss": 4.0144, + "step": 14080 + }, + { + "epoch": 0.9569914390542194, + "grad_norm": 0.17486423254013062, + "learning_rate": 8.80461000135888e-05, + "loss": 3.8262, + "step": 14085 + }, + { + "epoch": 0.9573311591248811, + "grad_norm": 0.1894778460264206, + "learning_rate": 8.804185351270553e-05, + "loss": 3.989, + "step": 14090 + }, + { + "epoch": 0.9576708791955428, + "grad_norm": 0.1900128871202469, + "learning_rate": 8.803760701182226e-05, + "loss": 4.0652, + "step": 14095 + }, + { + "epoch": 0.9580105992662047, + "grad_norm": 0.21038229763507843, + "learning_rate": 8.803336051093899e-05, + "loss": 3.7708, + "step": 14100 + }, + { + "epoch": 0.9583503193368664, + "grad_norm": 0.22659200429916382, + "learning_rate": 8.802911401005572e-05, + "loss": 3.701, + "step": 14105 + }, + { + "epoch": 0.9586900394075282, + "grad_norm": 0.18774689733982086, + "learning_rate": 8.802486750917244e-05, + "loss": 3.8036, + "step": 14110 + }, + { + "epoch": 0.95902975947819, + "grad_norm": 0.7720076441764832, + "learning_rate": 8.802062100828917e-05, + "loss": 4.0017, + "step": 14115 + }, + { + "epoch": 0.9593694795488518, + "grad_norm": 0.17789320647716522, + "learning_rate": 8.80163745074059e-05, + "loss": 4.3506, + "step": 14120 + }, + { + "epoch": 0.9597091996195135, + "grad_norm": 0.23013809323310852, + "learning_rate": 8.801212800652263e-05, + "loss": 4.1502, + "step": 14125 + }, + { + "epoch": 0.9600489196901753, + "grad_norm": 0.20013386011123657, + "learning_rate": 8.800788150563936e-05, + "loss": 4.002, + "step": 14130 + }, + { + "epoch": 0.9603886397608371, + "grad_norm": 0.331853449344635, + "learning_rate": 8.800363500475608e-05, + "loss": 3.9042, + "step": 14135 + }, + { + "epoch": 0.9607283598314988, + "grad_norm": 0.19631457328796387, + "learning_rate": 8.799938850387281e-05, + "loss": 4.1109, + "step": 14140 + }, + { + "epoch": 0.9610680799021606, + "grad_norm": 0.38952094316482544, + "learning_rate": 8.799514200298954e-05, + "loss": 3.7741, + "step": 14145 + }, + { + "epoch": 0.9614077999728224, + "grad_norm": 0.15865278244018555, + "learning_rate": 8.799089550210627e-05, + "loss": 4.0511, + "step": 14150 + }, + { + "epoch": 0.9617475200434842, + "grad_norm": 0.20270100235939026, + "learning_rate": 8.7986649001223e-05, + "loss": 3.9072, + "step": 14155 + }, + { + "epoch": 0.9620872401141459, + "grad_norm": 0.17968709766864777, + "learning_rate": 8.798240250033972e-05, + "loss": 4.0201, + "step": 14160 + }, + { + "epoch": 0.9624269601848077, + "grad_norm": 0.19091679155826569, + "learning_rate": 8.797815599945645e-05, + "loss": 4.0135, + "step": 14165 + }, + { + "epoch": 0.9627666802554695, + "grad_norm": 0.18477921187877655, + "learning_rate": 8.797390949857318e-05, + "loss": 4.0118, + "step": 14170 + }, + { + "epoch": 0.9631064003261313, + "grad_norm": 0.1884998232126236, + "learning_rate": 8.796966299768991e-05, + "loss": 3.9669, + "step": 14175 + }, + { + "epoch": 0.963446120396793, + "grad_norm": 0.18764863908290863, + "learning_rate": 8.796541649680664e-05, + "loss": 3.9139, + "step": 14180 + }, + { + "epoch": 0.9637858404674549, + "grad_norm": 0.3251302242279053, + "learning_rate": 8.796116999592336e-05, + "loss": 4.1504, + "step": 14185 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 0.17968013882637024, + "learning_rate": 8.795692349504009e-05, + "loss": 3.8325, + "step": 14190 + }, + { + "epoch": 0.9644652806087783, + "grad_norm": 0.28362056612968445, + "learning_rate": 8.795267699415682e-05, + "loss": 3.945, + "step": 14195 + }, + { + "epoch": 0.9648050006794402, + "grad_norm": 0.18230807781219482, + "learning_rate": 8.794843049327355e-05, + "loss": 3.8641, + "step": 14200 + }, + { + "epoch": 0.9651447207501019, + "grad_norm": 0.20012174546718597, + "learning_rate": 8.794418399239028e-05, + "loss": 3.8536, + "step": 14205 + }, + { + "epoch": 0.9654844408207637, + "grad_norm": 0.16306108236312866, + "learning_rate": 8.7939937491507e-05, + "loss": 3.9824, + "step": 14210 + }, + { + "epoch": 0.9658241608914254, + "grad_norm": 0.19044755399227142, + "learning_rate": 8.793569099062373e-05, + "loss": 4.0477, + "step": 14215 + }, + { + "epoch": 0.9661638809620873, + "grad_norm": 0.9334998726844788, + "learning_rate": 8.793144448974046e-05, + "loss": 4.0839, + "step": 14220 + }, + { + "epoch": 0.966503601032749, + "grad_norm": 0.2080710530281067, + "learning_rate": 8.792719798885719e-05, + "loss": 3.935, + "step": 14225 + }, + { + "epoch": 0.9668433211034108, + "grad_norm": 0.20228326320648193, + "learning_rate": 8.792295148797392e-05, + "loss": 3.8579, + "step": 14230 + }, + { + "epoch": 0.9671830411740726, + "grad_norm": 0.14868243038654327, + "learning_rate": 8.791870498709064e-05, + "loss": 3.9681, + "step": 14235 + }, + { + "epoch": 0.9675227612447344, + "grad_norm": 0.3080695569515228, + "learning_rate": 8.791445848620737e-05, + "loss": 3.7428, + "step": 14240 + }, + { + "epoch": 0.9678624813153961, + "grad_norm": 0.3069162666797638, + "learning_rate": 8.79102119853241e-05, + "loss": 3.9151, + "step": 14245 + }, + { + "epoch": 0.9682022013860578, + "grad_norm": 0.18564817309379578, + "learning_rate": 8.790596548444083e-05, + "loss": 3.9755, + "step": 14250 + }, + { + "epoch": 0.9685419214567197, + "grad_norm": 0.26427149772644043, + "learning_rate": 8.790171898355756e-05, + "loss": 4.0877, + "step": 14255 + }, + { + "epoch": 0.9688816415273814, + "grad_norm": 0.17514817416667938, + "learning_rate": 8.789747248267428e-05, + "loss": 3.9761, + "step": 14260 + }, + { + "epoch": 0.9692213615980432, + "grad_norm": 0.17348946630954742, + "learning_rate": 8.7893225981791e-05, + "loss": 3.9038, + "step": 14265 + }, + { + "epoch": 0.969561081668705, + "grad_norm": 0.20074446499347687, + "learning_rate": 8.788897948090774e-05, + "loss": 3.9772, + "step": 14270 + }, + { + "epoch": 0.9699008017393668, + "grad_norm": 0.24490071833133698, + "learning_rate": 8.788473298002447e-05, + "loss": 4.1249, + "step": 14275 + }, + { + "epoch": 0.9702405218100285, + "grad_norm": 0.2058936506509781, + "learning_rate": 8.788048647914118e-05, + "loss": 3.8624, + "step": 14280 + }, + { + "epoch": 0.9705802418806904, + "grad_norm": 0.16198603808879852, + "learning_rate": 8.787623997825792e-05, + "loss": 3.9449, + "step": 14285 + }, + { + "epoch": 0.9709199619513521, + "grad_norm": 0.15939363837242126, + "learning_rate": 8.787199347737465e-05, + "loss": 3.9548, + "step": 14290 + }, + { + "epoch": 0.9712596820220138, + "grad_norm": 0.15633496642112732, + "learning_rate": 8.786774697649137e-05, + "loss": 3.8646, + "step": 14295 + }, + { + "epoch": 0.9715994020926756, + "grad_norm": 0.18158230185508728, + "learning_rate": 8.786350047560811e-05, + "loss": 4.0045, + "step": 14300 + }, + { + "epoch": 0.9719391221633374, + "grad_norm": 0.2111271321773529, + "learning_rate": 8.785925397472484e-05, + "loss": 4.1019, + "step": 14305 + }, + { + "epoch": 0.9722788422339992, + "grad_norm": 0.19582590460777283, + "learning_rate": 8.785500747384155e-05, + "loss": 4.0289, + "step": 14310 + }, + { + "epoch": 0.9726185623046609, + "grad_norm": 0.18569405376911163, + "learning_rate": 8.785076097295829e-05, + "loss": 3.8372, + "step": 14315 + }, + { + "epoch": 0.9729582823753228, + "grad_norm": 0.1685798466205597, + "learning_rate": 8.784651447207502e-05, + "loss": 3.9108, + "step": 14320 + }, + { + "epoch": 0.9732980024459845, + "grad_norm": 0.236485555768013, + "learning_rate": 8.784226797119174e-05, + "loss": 3.6307, + "step": 14325 + }, + { + "epoch": 0.9736377225166463, + "grad_norm": 0.17849500477313995, + "learning_rate": 8.783802147030848e-05, + "loss": 4.0166, + "step": 14330 + }, + { + "epoch": 0.973977442587308, + "grad_norm": 0.15316098928451538, + "learning_rate": 8.783377496942519e-05, + "loss": 4.0223, + "step": 14335 + }, + { + "epoch": 0.9743171626579699, + "grad_norm": 0.14150911569595337, + "learning_rate": 8.782952846854192e-05, + "loss": 3.9861, + "step": 14340 + }, + { + "epoch": 0.9746568827286316, + "grad_norm": 0.15742090344429016, + "learning_rate": 8.782528196765866e-05, + "loss": 4.0853, + "step": 14345 + }, + { + "epoch": 0.9749966027992933, + "grad_norm": 0.1783253401517868, + "learning_rate": 8.782103546677538e-05, + "loss": 3.8919, + "step": 14350 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 0.16757941246032715, + "learning_rate": 8.78167889658921e-05, + "loss": 3.5916, + "step": 14355 + }, + { + "epoch": 0.9756760429406169, + "grad_norm": 0.25573256611824036, + "learning_rate": 8.781254246500884e-05, + "loss": 3.9848, + "step": 14360 + }, + { + "epoch": 0.9760157630112787, + "grad_norm": 0.1471344381570816, + "learning_rate": 8.780829596412556e-05, + "loss": 3.9848, + "step": 14365 + }, + { + "epoch": 0.9763554830819405, + "grad_norm": 0.17102879285812378, + "learning_rate": 8.780404946324229e-05, + "loss": 3.9778, + "step": 14370 + }, + { + "epoch": 0.9766952031526023, + "grad_norm": 0.21928320825099945, + "learning_rate": 8.779980296235903e-05, + "loss": 3.7909, + "step": 14375 + }, + { + "epoch": 0.977034923223264, + "grad_norm": 0.1851445883512497, + "learning_rate": 8.779555646147574e-05, + "loss": 4.0371, + "step": 14380 + }, + { + "epoch": 0.9773746432939258, + "grad_norm": 0.15043723583221436, + "learning_rate": 8.779130996059247e-05, + "loss": 3.7731, + "step": 14385 + }, + { + "epoch": 0.9777143633645876, + "grad_norm": 0.19180312752723694, + "learning_rate": 8.778706345970921e-05, + "loss": 3.8016, + "step": 14390 + }, + { + "epoch": 0.9780540834352494, + "grad_norm": 0.18517717719078064, + "learning_rate": 8.778281695882593e-05, + "loss": 4.0766, + "step": 14395 + }, + { + "epoch": 0.9783938035059111, + "grad_norm": 0.2040787637233734, + "learning_rate": 8.777857045794266e-05, + "loss": 3.9668, + "step": 14400 + }, + { + "epoch": 0.978733523576573, + "grad_norm": 0.17522788047790527, + "learning_rate": 8.777432395705938e-05, + "loss": 3.7057, + "step": 14405 + }, + { + "epoch": 0.9790732436472347, + "grad_norm": 0.258577823638916, + "learning_rate": 8.777007745617611e-05, + "loss": 3.8998, + "step": 14410 + }, + { + "epoch": 0.9794129637178964, + "grad_norm": 0.16101132333278656, + "learning_rate": 8.776583095529284e-05, + "loss": 4.024, + "step": 14415 + }, + { + "epoch": 0.9797526837885582, + "grad_norm": 0.23143424093723297, + "learning_rate": 8.776158445440957e-05, + "loss": 3.876, + "step": 14420 + }, + { + "epoch": 0.98009240385922, + "grad_norm": 0.5347188711166382, + "learning_rate": 8.77573379535263e-05, + "loss": 3.721, + "step": 14425 + }, + { + "epoch": 0.9804321239298818, + "grad_norm": 3.5879626274108887, + "learning_rate": 8.775309145264302e-05, + "loss": 3.9406, + "step": 14430 + }, + { + "epoch": 0.9807718440005435, + "grad_norm": 0.48771944642066956, + "learning_rate": 8.774884495175975e-05, + "loss": 3.8004, + "step": 14435 + }, + { + "epoch": 0.9811115640712054, + "grad_norm": 0.1710396111011505, + "learning_rate": 8.774459845087648e-05, + "loss": 3.9633, + "step": 14440 + }, + { + "epoch": 0.9814512841418671, + "grad_norm": 0.16899172961711884, + "learning_rate": 8.774035194999321e-05, + "loss": 4.0779, + "step": 14445 + }, + { + "epoch": 0.9817910042125289, + "grad_norm": 0.18617889285087585, + "learning_rate": 8.773610544910994e-05, + "loss": 3.8466, + "step": 14450 + }, + { + "epoch": 0.9821307242831907, + "grad_norm": 0.13299688696861267, + "learning_rate": 8.773185894822666e-05, + "loss": 4.0135, + "step": 14455 + }, + { + "epoch": 0.9824704443538524, + "grad_norm": 0.20678380131721497, + "learning_rate": 8.772761244734339e-05, + "loss": 3.8453, + "step": 14460 + }, + { + "epoch": 0.9828101644245142, + "grad_norm": 0.14222410321235657, + "learning_rate": 8.772336594646012e-05, + "loss": 3.9648, + "step": 14465 + }, + { + "epoch": 0.9831498844951759, + "grad_norm": 0.21488900482654572, + "learning_rate": 8.771911944557685e-05, + "loss": 3.9613, + "step": 14470 + }, + { + "epoch": 0.9834896045658378, + "grad_norm": 0.2058933824300766, + "learning_rate": 8.771487294469358e-05, + "loss": 3.7448, + "step": 14475 + }, + { + "epoch": 0.9838293246364995, + "grad_norm": 0.25969386100769043, + "learning_rate": 8.77106264438103e-05, + "loss": 3.8523, + "step": 14480 + }, + { + "epoch": 0.9841690447071613, + "grad_norm": 0.17804904282093048, + "learning_rate": 8.770637994292703e-05, + "loss": 4.0136, + "step": 14485 + }, + { + "epoch": 0.9845087647778231, + "grad_norm": 0.17092011868953705, + "learning_rate": 8.770213344204376e-05, + "loss": 3.9277, + "step": 14490 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.26749852299690247, + "learning_rate": 8.769788694116049e-05, + "loss": 4.1337, + "step": 14495 + }, + { + "epoch": 0.9851882049191466, + "grad_norm": 1.6659806966781616, + "learning_rate": 8.769364044027722e-05, + "loss": 3.9058, + "step": 14500 + }, + { + "epoch": 0.9855279249898083, + "grad_norm": 0.16872169077396393, + "learning_rate": 8.768939393939394e-05, + "loss": 3.9551, + "step": 14505 + }, + { + "epoch": 0.9858676450604702, + "grad_norm": 0.25871741771698, + "learning_rate": 8.768514743851067e-05, + "loss": 3.9576, + "step": 14510 + }, + { + "epoch": 0.9862073651311319, + "grad_norm": 0.2599276006221771, + "learning_rate": 8.76809009376274e-05, + "loss": 4.0486, + "step": 14515 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 0.17734144628047943, + "learning_rate": 8.767665443674413e-05, + "loss": 3.9271, + "step": 14520 + }, + { + "epoch": 0.9868868052724555, + "grad_norm": 0.15988993644714355, + "learning_rate": 8.767240793586086e-05, + "loss": 3.7398, + "step": 14525 + }, + { + "epoch": 0.9872265253431173, + "grad_norm": 0.3864258825778961, + "learning_rate": 8.766816143497758e-05, + "loss": 4.1109, + "step": 14530 + }, + { + "epoch": 0.987566245413779, + "grad_norm": 0.1917543262243271, + "learning_rate": 8.766391493409431e-05, + "loss": 3.9459, + "step": 14535 + }, + { + "epoch": 0.9879059654844409, + "grad_norm": 0.2340976893901825, + "learning_rate": 8.765966843321104e-05, + "loss": 3.9126, + "step": 14540 + }, + { + "epoch": 0.9882456855551026, + "grad_norm": 0.1463710069656372, + "learning_rate": 8.765542193232777e-05, + "loss": 3.8255, + "step": 14545 + }, + { + "epoch": 0.9885854056257644, + "grad_norm": 0.20803597569465637, + "learning_rate": 8.76511754314445e-05, + "loss": 3.9285, + "step": 14550 + }, + { + "epoch": 0.9889251256964261, + "grad_norm": 0.23730699717998505, + "learning_rate": 8.764692893056122e-05, + "loss": 3.9045, + "step": 14555 + }, + { + "epoch": 0.989264845767088, + "grad_norm": 0.17138421535491943, + "learning_rate": 8.764268242967795e-05, + "loss": 3.8542, + "step": 14560 + }, + { + "epoch": 0.9896045658377497, + "grad_norm": 0.16893406212329865, + "learning_rate": 8.763843592879468e-05, + "loss": 4.0535, + "step": 14565 + }, + { + "epoch": 0.9899442859084114, + "grad_norm": 0.13037355244159698, + "learning_rate": 8.763418942791141e-05, + "loss": 3.7079, + "step": 14570 + }, + { + "epoch": 0.9902840059790733, + "grad_norm": 0.1940174400806427, + "learning_rate": 8.762994292702814e-05, + "loss": 3.9701, + "step": 14575 + }, + { + "epoch": 0.990623726049735, + "grad_norm": 0.19718214869499207, + "learning_rate": 8.762569642614486e-05, + "loss": 3.8213, + "step": 14580 + }, + { + "epoch": 0.9909634461203968, + "grad_norm": 0.6371154189109802, + "learning_rate": 8.762144992526159e-05, + "loss": 3.8541, + "step": 14585 + }, + { + "epoch": 0.9913031661910585, + "grad_norm": 0.15423880517482758, + "learning_rate": 8.761720342437832e-05, + "loss": 4.0511, + "step": 14590 + }, + { + "epoch": 0.9916428862617204, + "grad_norm": 0.1736377775669098, + "learning_rate": 8.761295692349505e-05, + "loss": 3.8275, + "step": 14595 + }, + { + "epoch": 0.9919826063323821, + "grad_norm": 0.19258588552474976, + "learning_rate": 8.760871042261178e-05, + "loss": 4.1035, + "step": 14600 + }, + { + "epoch": 0.9923223264030439, + "grad_norm": 0.43297508358955383, + "learning_rate": 8.760446392172849e-05, + "loss": 4.0978, + "step": 14605 + }, + { + "epoch": 0.9926620464737057, + "grad_norm": 0.20580710470676422, + "learning_rate": 8.760021742084523e-05, + "loss": 3.8437, + "step": 14610 + }, + { + "epoch": 0.9930017665443674, + "grad_norm": 1.218690037727356, + "learning_rate": 8.759597091996196e-05, + "loss": 4.0635, + "step": 14615 + }, + { + "epoch": 0.9933414866150292, + "grad_norm": 0.1623927801847458, + "learning_rate": 8.759172441907867e-05, + "loss": 3.8147, + "step": 14620 + }, + { + "epoch": 0.993681206685691, + "grad_norm": 0.17431679368019104, + "learning_rate": 8.758747791819542e-05, + "loss": 3.9406, + "step": 14625 + }, + { + "epoch": 0.9940209267563528, + "grad_norm": 0.18215017020702362, + "learning_rate": 8.758323141731214e-05, + "loss": 4.1441, + "step": 14630 + }, + { + "epoch": 0.9943606468270145, + "grad_norm": 0.20073819160461426, + "learning_rate": 8.757898491642886e-05, + "loss": 3.9045, + "step": 14635 + }, + { + "epoch": 0.9947003668976763, + "grad_norm": 0.17208696901798248, + "learning_rate": 8.75747384155456e-05, + "loss": 4.223, + "step": 14640 + }, + { + "epoch": 0.9950400869683381, + "grad_norm": 0.18610428273677826, + "learning_rate": 8.757049191466233e-05, + "loss": 3.9652, + "step": 14645 + }, + { + "epoch": 0.9953798070389999, + "grad_norm": 0.2461862713098526, + "learning_rate": 8.756624541377904e-05, + "loss": 3.8623, + "step": 14650 + }, + { + "epoch": 0.9957195271096616, + "grad_norm": 0.37638500332832336, + "learning_rate": 8.756199891289578e-05, + "loss": 3.896, + "step": 14655 + }, + { + "epoch": 0.9960592471803235, + "grad_norm": 0.2814083993434906, + "learning_rate": 8.755775241201251e-05, + "loss": 3.8634, + "step": 14660 + }, + { + "epoch": 0.9963989672509852, + "grad_norm": 0.2636764645576477, + "learning_rate": 8.755350591112923e-05, + "loss": 4.1024, + "step": 14665 + }, + { + "epoch": 0.9967386873216469, + "grad_norm": 0.1730240136384964, + "learning_rate": 8.754925941024597e-05, + "loss": 4.0616, + "step": 14670 + }, + { + "epoch": 0.9970784073923087, + "grad_norm": 0.17876474559307098, + "learning_rate": 8.75450129093627e-05, + "loss": 4.0613, + "step": 14675 + }, + { + "epoch": 0.9974181274629705, + "grad_norm": 0.1803043931722641, + "learning_rate": 8.754076640847941e-05, + "loss": 4.0982, + "step": 14680 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 0.2912727892398834, + "learning_rate": 8.753651990759615e-05, + "loss": 3.8865, + "step": 14685 + }, + { + "epoch": 0.998097567604294, + "grad_norm": 0.21436181664466858, + "learning_rate": 8.753227340671287e-05, + "loss": 4.0897, + "step": 14690 + }, + { + "epoch": 0.9984372876749559, + "grad_norm": 0.1772686094045639, + "learning_rate": 8.75280269058296e-05, + "loss": 4.1523, + "step": 14695 + }, + { + "epoch": 0.9987770077456176, + "grad_norm": 0.269540935754776, + "learning_rate": 8.752378040494634e-05, + "loss": 3.9891, + "step": 14700 + }, + { + "epoch": 0.9991167278162794, + "grad_norm": 0.20457632839679718, + "learning_rate": 8.751953390406305e-05, + "loss": 3.8832, + "step": 14705 + }, + { + "epoch": 0.9994564478869412, + "grad_norm": 0.2145458459854126, + "learning_rate": 8.751528740317978e-05, + "loss": 4.0273, + "step": 14710 + }, + { + "epoch": 0.999796167957603, + "grad_norm": 0.18700024485588074, + "learning_rate": 8.751104090229652e-05, + "loss": 4.1366, + "step": 14715 + }, + { + "epoch": 1.0, + "eval_bertscore": { + "f1": 0.8525333878305233, + "precision": 0.8753549892468697, + "recall": 0.8312223081395127 + }, + "eval_bleu_4": 0.0021924919669198163, + "eval_exact_match": 0.0, + "eval_loss": 3.751943349838257, + "eval_meteor": 0.07837825616589945, + "eval_rouge": { + "rouge1": 0.12766942289604644, + "rouge2": 0.01466477971693661, + "rougeL": 0.11110403147048781, + "rougeLsum": 0.11113999503571864 + }, + "eval_runtime": 404.0965, + "eval_samples_per_second": 25.536, + "eval_steps_per_second": 3.192, + "step": 14718 + }, + { + "epoch": 1.0001358880282647, + "grad_norm": 0.3904990255832672, + "learning_rate": 8.750679440141323e-05, + "loss": 4.0106, + "step": 14720 + }, + { + "epoch": 1.0004756080989265, + "grad_norm": 0.16403785347938538, + "learning_rate": 8.750254790052996e-05, + "loss": 3.8899, + "step": 14725 + }, + { + "epoch": 1.0008153281695882, + "grad_norm": 0.3333182632923126, + "learning_rate": 8.74983013996467e-05, + "loss": 3.858, + "step": 14730 + }, + { + "epoch": 1.00115504824025, + "grad_norm": 0.13622447848320007, + "learning_rate": 8.749405489876342e-05, + "loss": 3.9242, + "step": 14735 + }, + { + "epoch": 1.0014947683109119, + "grad_norm": 0.16796274483203888, + "learning_rate": 8.748980839788015e-05, + "loss": 3.9442, + "step": 14740 + }, + { + "epoch": 1.0018344883815735, + "grad_norm": 0.3433658480644226, + "learning_rate": 8.748556189699689e-05, + "loss": 4.0023, + "step": 14745 + }, + { + "epoch": 1.0021742084522354, + "grad_norm": 0.19067181646823883, + "learning_rate": 8.74813153961136e-05, + "loss": 3.6469, + "step": 14750 + }, + { + "epoch": 1.0025139285228972, + "grad_norm": 0.212546706199646, + "learning_rate": 8.747706889523033e-05, + "loss": 3.7494, + "step": 14755 + }, + { + "epoch": 1.0028536485935589, + "grad_norm": 0.23670955002307892, + "learning_rate": 8.747282239434706e-05, + "loss": 4.0415, + "step": 14760 + }, + { + "epoch": 1.0031933686642207, + "grad_norm": 0.16621361672878265, + "learning_rate": 8.746857589346379e-05, + "loss": 4.0839, + "step": 14765 + }, + { + "epoch": 1.0035330887348826, + "grad_norm": 0.21829703450202942, + "learning_rate": 8.746432939258051e-05, + "loss": 4.0334, + "step": 14770 + }, + { + "epoch": 1.0038728088055442, + "grad_norm": 0.16474294662475586, + "learning_rate": 8.746008289169724e-05, + "loss": 3.9733, + "step": 14775 + }, + { + "epoch": 1.004212528876206, + "grad_norm": 0.196587473154068, + "learning_rate": 8.745583639081397e-05, + "loss": 3.8357, + "step": 14780 + }, + { + "epoch": 1.0045522489468677, + "grad_norm": 0.4421464502811432, + "learning_rate": 8.74515898899307e-05, + "loss": 3.9698, + "step": 14785 + }, + { + "epoch": 1.0048919690175295, + "grad_norm": 0.20670776069164276, + "learning_rate": 8.744734338904743e-05, + "loss": 3.9688, + "step": 14790 + }, + { + "epoch": 1.0052316890881914, + "grad_norm": 0.17343617975711823, + "learning_rate": 8.744309688816415e-05, + "loss": 3.7534, + "step": 14795 + }, + { + "epoch": 1.005571409158853, + "grad_norm": 0.19640295207500458, + "learning_rate": 8.743885038728088e-05, + "loss": 4.0618, + "step": 14800 + }, + { + "epoch": 1.0059111292295149, + "grad_norm": 0.1813947558403015, + "learning_rate": 8.743460388639761e-05, + "loss": 3.9566, + "step": 14805 + }, + { + "epoch": 1.0062508493001767, + "grad_norm": 0.3076678216457367, + "learning_rate": 8.743035738551434e-05, + "loss": 3.7793, + "step": 14810 + }, + { + "epoch": 1.0065905693708384, + "grad_norm": 0.15746279060840607, + "learning_rate": 8.742611088463107e-05, + "loss": 3.7041, + "step": 14815 + }, + { + "epoch": 1.0069302894415002, + "grad_norm": 0.23948338627815247, + "learning_rate": 8.74218643837478e-05, + "loss": 4.1761, + "step": 14820 + }, + { + "epoch": 1.007270009512162, + "grad_norm": 0.22588057816028595, + "learning_rate": 8.741761788286452e-05, + "loss": 3.9168, + "step": 14825 + }, + { + "epoch": 1.0076097295828237, + "grad_norm": 0.173685684800148, + "learning_rate": 8.741337138198125e-05, + "loss": 3.9242, + "step": 14830 + }, + { + "epoch": 1.0079494496534855, + "grad_norm": 0.20637229084968567, + "learning_rate": 8.740912488109798e-05, + "loss": 3.9508, + "step": 14835 + }, + { + "epoch": 1.0082891697241474, + "grad_norm": 0.38505056500434875, + "learning_rate": 8.74048783802147e-05, + "loss": 3.8449, + "step": 14840 + }, + { + "epoch": 1.008628889794809, + "grad_norm": 0.16608548164367676, + "learning_rate": 8.740063187933143e-05, + "loss": 4.2384, + "step": 14845 + }, + { + "epoch": 1.0089686098654709, + "grad_norm": 0.2428790181875229, + "learning_rate": 8.739638537844816e-05, + "loss": 4.07, + "step": 14850 + }, + { + "epoch": 1.0093083299361327, + "grad_norm": 0.1632264107465744, + "learning_rate": 8.739213887756489e-05, + "loss": 4.1535, + "step": 14855 + }, + { + "epoch": 1.0096480500067944, + "grad_norm": 0.21293267607688904, + "learning_rate": 8.738789237668162e-05, + "loss": 3.9345, + "step": 14860 + }, + { + "epoch": 1.0099877700774562, + "grad_norm": 0.2258225679397583, + "learning_rate": 8.738364587579835e-05, + "loss": 4.0907, + "step": 14865 + }, + { + "epoch": 1.0103274901481178, + "grad_norm": 0.1745368391275406, + "learning_rate": 8.737939937491507e-05, + "loss": 3.8419, + "step": 14870 + }, + { + "epoch": 1.0106672102187797, + "grad_norm": 0.18902482092380524, + "learning_rate": 8.73751528740318e-05, + "loss": 4.016, + "step": 14875 + }, + { + "epoch": 1.0110069302894416, + "grad_norm": 0.2323336899280548, + "learning_rate": 8.737090637314853e-05, + "loss": 3.9343, + "step": 14880 + }, + { + "epoch": 1.0113466503601032, + "grad_norm": 0.1533086746931076, + "learning_rate": 8.736665987226526e-05, + "loss": 4.0727, + "step": 14885 + }, + { + "epoch": 1.011686370430765, + "grad_norm": 0.183485209941864, + "learning_rate": 8.736241337138199e-05, + "loss": 4.1709, + "step": 14890 + }, + { + "epoch": 1.012026090501427, + "grad_norm": 0.33692219853401184, + "learning_rate": 8.735816687049871e-05, + "loss": 4.0349, + "step": 14895 + }, + { + "epoch": 1.0123658105720885, + "grad_norm": 0.2901309132575989, + "learning_rate": 8.735392036961544e-05, + "loss": 3.7, + "step": 14900 + }, + { + "epoch": 1.0127055306427504, + "grad_norm": 1.2777307033538818, + "learning_rate": 8.734967386873217e-05, + "loss": 4.2094, + "step": 14905 + }, + { + "epoch": 1.0130452507134122, + "grad_norm": 0.31602752208709717, + "learning_rate": 8.73454273678489e-05, + "loss": 4.2209, + "step": 14910 + }, + { + "epoch": 1.0133849707840739, + "grad_norm": 0.19287407398223877, + "learning_rate": 8.734118086696563e-05, + "loss": 3.8944, + "step": 14915 + }, + { + "epoch": 1.0137246908547357, + "grad_norm": 0.18507331609725952, + "learning_rate": 8.733693436608235e-05, + "loss": 3.7316, + "step": 14920 + }, + { + "epoch": 1.0140644109253976, + "grad_norm": 0.1758868396282196, + "learning_rate": 8.733268786519908e-05, + "loss": 3.8259, + "step": 14925 + }, + { + "epoch": 1.0144041309960592, + "grad_norm": 0.5127626061439514, + "learning_rate": 8.732844136431581e-05, + "loss": 3.8817, + "step": 14930 + }, + { + "epoch": 1.014743851066721, + "grad_norm": 0.19071178138256073, + "learning_rate": 8.732419486343254e-05, + "loss": 3.7527, + "step": 14935 + }, + { + "epoch": 1.015083571137383, + "grad_norm": 0.15322649478912354, + "learning_rate": 8.731994836254927e-05, + "loss": 3.6907, + "step": 14940 + }, + { + "epoch": 1.0154232912080445, + "grad_norm": 0.6302610635757446, + "learning_rate": 8.7315701861666e-05, + "loss": 4.1883, + "step": 14945 + }, + { + "epoch": 1.0157630112787064, + "grad_norm": 0.17898263037204742, + "learning_rate": 8.731145536078272e-05, + "loss": 4.0811, + "step": 14950 + }, + { + "epoch": 1.016102731349368, + "grad_norm": 0.41409072279930115, + "learning_rate": 8.730720885989945e-05, + "loss": 4.0178, + "step": 14955 + }, + { + "epoch": 1.0164424514200299, + "grad_norm": 0.26254400610923767, + "learning_rate": 8.730296235901617e-05, + "loss": 3.9531, + "step": 14960 + }, + { + "epoch": 1.0167821714906917, + "grad_norm": 0.20092613995075226, + "learning_rate": 8.72987158581329e-05, + "loss": 3.8858, + "step": 14965 + }, + { + "epoch": 1.0171218915613534, + "grad_norm": 0.20742516219615936, + "learning_rate": 8.729446935724963e-05, + "loss": 3.752, + "step": 14970 + }, + { + "epoch": 1.0174616116320152, + "grad_norm": 0.17711442708969116, + "learning_rate": 8.729022285636635e-05, + "loss": 3.9607, + "step": 14975 + }, + { + "epoch": 1.017801331702677, + "grad_norm": 0.15855258703231812, + "learning_rate": 8.728597635548309e-05, + "loss": 3.9188, + "step": 14980 + }, + { + "epoch": 1.0181410517733387, + "grad_norm": 0.17750421166419983, + "learning_rate": 8.728172985459982e-05, + "loss": 4.1726, + "step": 14985 + }, + { + "epoch": 1.0184807718440005, + "grad_norm": 0.19674821197986603, + "learning_rate": 8.727748335371653e-05, + "loss": 3.9894, + "step": 14990 + }, + { + "epoch": 1.0188204919146624, + "grad_norm": 0.9088572263717651, + "learning_rate": 8.727323685283327e-05, + "loss": 4.113, + "step": 14995 + }, + { + "epoch": 1.019160211985324, + "grad_norm": 0.2823253571987152, + "learning_rate": 8.726899035195e-05, + "loss": 3.9209, + "step": 15000 + }, + { + "epoch": 1.0194999320559859, + "grad_norm": 0.1664217710494995, + "learning_rate": 8.726474385106672e-05, + "loss": 3.7566, + "step": 15005 + }, + { + "epoch": 1.0198396521266477, + "grad_norm": 0.17631694674491882, + "learning_rate": 8.726049735018346e-05, + "loss": 3.9483, + "step": 15010 + }, + { + "epoch": 1.0201793721973094, + "grad_norm": 0.14859095215797424, + "learning_rate": 8.725625084930019e-05, + "loss": 3.8775, + "step": 15015 + }, + { + "epoch": 1.0205190922679712, + "grad_norm": 0.23279117047786713, + "learning_rate": 8.72520043484169e-05, + "loss": 3.8574, + "step": 15020 + }, + { + "epoch": 1.020858812338633, + "grad_norm": 0.9567646384239197, + "learning_rate": 8.724775784753364e-05, + "loss": 4.0973, + "step": 15025 + }, + { + "epoch": 1.0211985324092947, + "grad_norm": 0.19611890614032745, + "learning_rate": 8.724351134665036e-05, + "loss": 3.6861, + "step": 15030 + }, + { + "epoch": 1.0215382524799566, + "grad_norm": 0.16505566239356995, + "learning_rate": 8.723926484576709e-05, + "loss": 4.2409, + "step": 15035 + }, + { + "epoch": 1.0218779725506182, + "grad_norm": 0.22672072052955627, + "learning_rate": 8.723501834488383e-05, + "loss": 3.8754, + "step": 15040 + }, + { + "epoch": 1.02221769262128, + "grad_norm": 0.1819319725036621, + "learning_rate": 8.723077184400054e-05, + "loss": 4.0001, + "step": 15045 + }, + { + "epoch": 1.022557412691942, + "grad_norm": 1.2608730792999268, + "learning_rate": 8.722652534311727e-05, + "loss": 3.8925, + "step": 15050 + }, + { + "epoch": 1.0228971327626035, + "grad_norm": 0.17352354526519775, + "learning_rate": 8.722227884223401e-05, + "loss": 3.9367, + "step": 15055 + }, + { + "epoch": 1.0232368528332654, + "grad_norm": 0.21542750298976898, + "learning_rate": 8.721803234135073e-05, + "loss": 3.8971, + "step": 15060 + }, + { + "epoch": 1.0235765729039272, + "grad_norm": 0.20792967081069946, + "learning_rate": 8.721378584046745e-05, + "loss": 4.0328, + "step": 15065 + }, + { + "epoch": 1.0239162929745889, + "grad_norm": 0.20066216588020325, + "learning_rate": 8.72095393395842e-05, + "loss": 4.0192, + "step": 15070 + }, + { + "epoch": 1.0242560130452507, + "grad_norm": 0.3969745635986328, + "learning_rate": 8.720529283870091e-05, + "loss": 3.8883, + "step": 15075 + }, + { + "epoch": 1.0245957331159126, + "grad_norm": 0.18898066878318787, + "learning_rate": 8.720104633781764e-05, + "loss": 4.0302, + "step": 15080 + }, + { + "epoch": 1.0249354531865742, + "grad_norm": 0.21829643845558167, + "learning_rate": 8.719679983693438e-05, + "loss": 3.7844, + "step": 15085 + }, + { + "epoch": 1.025275173257236, + "grad_norm": 0.17096686363220215, + "learning_rate": 8.71925533360511e-05, + "loss": 4.0315, + "step": 15090 + }, + { + "epoch": 1.025614893327898, + "grad_norm": 0.2026512771844864, + "learning_rate": 8.718830683516782e-05, + "loss": 4.1566, + "step": 15095 + }, + { + "epoch": 1.0259546133985595, + "grad_norm": 0.46859011054039, + "learning_rate": 8.718406033428456e-05, + "loss": 3.8248, + "step": 15100 + }, + { + "epoch": 1.0262943334692214, + "grad_norm": 0.3532336354255676, + "learning_rate": 8.717981383340128e-05, + "loss": 3.9565, + "step": 15105 + }, + { + "epoch": 1.0266340535398832, + "grad_norm": 0.20714938640594482, + "learning_rate": 8.717641663269467e-05, + "loss": 3.9814, + "step": 15110 + }, + { + "epoch": 1.0269737736105449, + "grad_norm": 0.617857813835144, + "learning_rate": 8.717217013181139e-05, + "loss": 3.984, + "step": 15115 + }, + { + "epoch": 1.0273134936812067, + "grad_norm": 0.17971056699752808, + "learning_rate": 8.716792363092812e-05, + "loss": 3.9386, + "step": 15120 + }, + { + "epoch": 1.0276532137518684, + "grad_norm": 0.2551378309726715, + "learning_rate": 8.716367713004486e-05, + "loss": 3.8141, + "step": 15125 + }, + { + "epoch": 1.0279929338225302, + "grad_norm": 0.18092188239097595, + "learning_rate": 8.715943062916157e-05, + "loss": 4.0005, + "step": 15130 + }, + { + "epoch": 1.028332653893192, + "grad_norm": 0.19282864034175873, + "learning_rate": 8.71551841282783e-05, + "loss": 4.0706, + "step": 15135 + }, + { + "epoch": 1.0286723739638537, + "grad_norm": 0.14662542939186096, + "learning_rate": 8.715093762739504e-05, + "loss": 4.041, + "step": 15140 + }, + { + "epoch": 1.0290120940345155, + "grad_norm": 0.512459397315979, + "learning_rate": 8.714669112651176e-05, + "loss": 3.9389, + "step": 15145 + }, + { + "epoch": 1.0293518141051774, + "grad_norm": 0.18077687919139862, + "learning_rate": 8.714244462562848e-05, + "loss": 4.1245, + "step": 15150 + }, + { + "epoch": 1.029691534175839, + "grad_norm": 0.179152250289917, + "learning_rate": 8.713819812474521e-05, + "loss": 3.9908, + "step": 15155 + }, + { + "epoch": 1.0300312542465009, + "grad_norm": 0.1518973410129547, + "learning_rate": 8.713395162386194e-05, + "loss": 4.0454, + "step": 15160 + }, + { + "epoch": 1.0303709743171627, + "grad_norm": 0.3445395529270172, + "learning_rate": 8.712970512297867e-05, + "loss": 3.8875, + "step": 15165 + }, + { + "epoch": 1.0307106943878244, + "grad_norm": 0.18875689804553986, + "learning_rate": 8.71254586220954e-05, + "loss": 4.0114, + "step": 15170 + }, + { + "epoch": 1.0310504144584862, + "grad_norm": 0.3045675456523895, + "learning_rate": 8.712121212121212e-05, + "loss": 3.6805, + "step": 15175 + }, + { + "epoch": 1.031390134529148, + "grad_norm": 0.20015089213848114, + "learning_rate": 8.711696562032885e-05, + "loss": 4.0247, + "step": 15180 + }, + { + "epoch": 1.0317298545998097, + "grad_norm": 1.3827072381973267, + "learning_rate": 8.711271911944558e-05, + "loss": 4.0037, + "step": 15185 + }, + { + "epoch": 1.0320695746704716, + "grad_norm": 0.26853814721107483, + "learning_rate": 8.710847261856231e-05, + "loss": 4.0757, + "step": 15190 + }, + { + "epoch": 1.0324092947411334, + "grad_norm": 0.19492289423942566, + "learning_rate": 8.710422611767904e-05, + "loss": 3.9609, + "step": 15195 + }, + { + "epoch": 1.032749014811795, + "grad_norm": 0.16671162843704224, + "learning_rate": 8.709997961679576e-05, + "loss": 3.8563, + "step": 15200 + }, + { + "epoch": 1.033088734882457, + "grad_norm": 0.2444605976343155, + "learning_rate": 8.709573311591249e-05, + "loss": 4.1325, + "step": 15205 + }, + { + "epoch": 1.0334284549531185, + "grad_norm": 0.553428590297699, + "learning_rate": 8.709148661502922e-05, + "loss": 4.0776, + "step": 15210 + }, + { + "epoch": 1.0337681750237804, + "grad_norm": 0.14710353314876556, + "learning_rate": 8.708724011414595e-05, + "loss": 4.0418, + "step": 15215 + }, + { + "epoch": 1.0341078950944422, + "grad_norm": 0.5704167485237122, + "learning_rate": 8.708299361326268e-05, + "loss": 4.0805, + "step": 15220 + }, + { + "epoch": 1.0344476151651039, + "grad_norm": 0.6797170639038086, + "learning_rate": 8.70787471123794e-05, + "loss": 3.6392, + "step": 15225 + }, + { + "epoch": 1.0347873352357657, + "grad_norm": 0.25881215929985046, + "learning_rate": 8.707450061149613e-05, + "loss": 3.8805, + "step": 15230 + }, + { + "epoch": 1.0351270553064276, + "grad_norm": 0.16261979937553406, + "learning_rate": 8.707025411061286e-05, + "loss": 4.1071, + "step": 15235 + }, + { + "epoch": 1.0354667753770892, + "grad_norm": 0.1617424488067627, + "learning_rate": 8.706600760972959e-05, + "loss": 3.8198, + "step": 15240 + }, + { + "epoch": 1.035806495447751, + "grad_norm": 0.26581504940986633, + "learning_rate": 8.706176110884632e-05, + "loss": 4.1637, + "step": 15245 + }, + { + "epoch": 1.036146215518413, + "grad_norm": 0.19323405623435974, + "learning_rate": 8.705751460796304e-05, + "loss": 4.0632, + "step": 15250 + }, + { + "epoch": 1.0364859355890745, + "grad_norm": 0.17611320316791534, + "learning_rate": 8.705326810707977e-05, + "loss": 3.925, + "step": 15255 + }, + { + "epoch": 1.0368256556597364, + "grad_norm": 0.21875782310962677, + "learning_rate": 8.70490216061965e-05, + "loss": 4.0839, + "step": 15260 + }, + { + "epoch": 1.0371653757303982, + "grad_norm": 0.4429180324077606, + "learning_rate": 8.704477510531323e-05, + "loss": 3.857, + "step": 15265 + }, + { + "epoch": 1.0375050958010599, + "grad_norm": 0.1597195267677307, + "learning_rate": 8.704052860442996e-05, + "loss": 4.015, + "step": 15270 + }, + { + "epoch": 1.0378448158717217, + "grad_norm": 0.23358796536922455, + "learning_rate": 8.703628210354668e-05, + "loss": 3.5926, + "step": 15275 + }, + { + "epoch": 1.0381845359423836, + "grad_norm": 0.19826273620128632, + "learning_rate": 8.703203560266341e-05, + "loss": 4.0386, + "step": 15280 + }, + { + "epoch": 1.0385242560130452, + "grad_norm": 0.16573616862297058, + "learning_rate": 8.702778910178013e-05, + "loss": 4.1957, + "step": 15285 + }, + { + "epoch": 1.038863976083707, + "grad_norm": 0.14972004294395447, + "learning_rate": 8.702354260089687e-05, + "loss": 4.1942, + "step": 15290 + }, + { + "epoch": 1.0392036961543687, + "grad_norm": 0.20618268847465515, + "learning_rate": 8.70192961000136e-05, + "loss": 3.9232, + "step": 15295 + }, + { + "epoch": 1.0395434162250305, + "grad_norm": 0.17007362842559814, + "learning_rate": 8.701504959913031e-05, + "loss": 3.9043, + "step": 15300 + }, + { + "epoch": 1.0398831362956924, + "grad_norm": 0.22789278626441956, + "learning_rate": 8.701080309824705e-05, + "loss": 4.02, + "step": 15305 + }, + { + "epoch": 1.040222856366354, + "grad_norm": 0.216500386595726, + "learning_rate": 8.700655659736378e-05, + "loss": 3.8103, + "step": 15310 + }, + { + "epoch": 1.0405625764370159, + "grad_norm": 0.21216286718845367, + "learning_rate": 8.70023100964805e-05, + "loss": 4.205, + "step": 15315 + }, + { + "epoch": 1.0409022965076777, + "grad_norm": 0.2747771739959717, + "learning_rate": 8.699806359559724e-05, + "loss": 3.8234, + "step": 15320 + }, + { + "epoch": 1.0412420165783394, + "grad_norm": 0.3563567101955414, + "learning_rate": 8.699381709471396e-05, + "loss": 3.8734, + "step": 15325 + }, + { + "epoch": 1.0415817366490012, + "grad_norm": 0.1629883050918579, + "learning_rate": 8.698957059383068e-05, + "loss": 4.0788, + "step": 15330 + }, + { + "epoch": 1.041921456719663, + "grad_norm": 0.19491197168827057, + "learning_rate": 8.698532409294742e-05, + "loss": 4.0117, + "step": 15335 + }, + { + "epoch": 1.0422611767903247, + "grad_norm": 0.18457502126693726, + "learning_rate": 8.698107759206415e-05, + "loss": 4.2477, + "step": 15340 + }, + { + "epoch": 1.0426008968609866, + "grad_norm": 0.21591152250766754, + "learning_rate": 8.697683109118086e-05, + "loss": 3.9, + "step": 15345 + }, + { + "epoch": 1.0429406169316484, + "grad_norm": 0.17951448261737823, + "learning_rate": 8.69725845902976e-05, + "loss": 3.7959, + "step": 15350 + }, + { + "epoch": 1.04328033700231, + "grad_norm": 0.17192557454109192, + "learning_rate": 8.696833808941432e-05, + "loss": 3.5989, + "step": 15355 + }, + { + "epoch": 1.043620057072972, + "grad_norm": 0.1431371122598648, + "learning_rate": 8.696409158853105e-05, + "loss": 4.0487, + "step": 15360 + }, + { + "epoch": 1.0439597771436337, + "grad_norm": 0.19106970727443695, + "learning_rate": 8.695984508764779e-05, + "loss": 3.6114, + "step": 15365 + }, + { + "epoch": 1.0442994972142954, + "grad_norm": 0.18922775983810425, + "learning_rate": 8.69555985867645e-05, + "loss": 4.0135, + "step": 15370 + }, + { + "epoch": 1.0446392172849572, + "grad_norm": 0.17645229399204254, + "learning_rate": 8.695135208588123e-05, + "loss": 3.9, + "step": 15375 + }, + { + "epoch": 1.0449789373556189, + "grad_norm": 0.1632130742073059, + "learning_rate": 8.694710558499797e-05, + "loss": 3.8935, + "step": 15380 + }, + { + "epoch": 1.0453186574262807, + "grad_norm": 0.20238108932971954, + "learning_rate": 8.694285908411469e-05, + "loss": 3.9347, + "step": 15385 + }, + { + "epoch": 1.0456583774969426, + "grad_norm": 0.24021537601947784, + "learning_rate": 8.693861258323143e-05, + "loss": 3.883, + "step": 15390 + }, + { + "epoch": 1.0459980975676042, + "grad_norm": 0.20223712921142578, + "learning_rate": 8.693436608234816e-05, + "loss": 3.7852, + "step": 15395 + }, + { + "epoch": 1.046337817638266, + "grad_norm": 0.25143828988075256, + "learning_rate": 8.693011958146487e-05, + "loss": 3.7293, + "step": 15400 + }, + { + "epoch": 1.046677537708928, + "grad_norm": 0.21580559015274048, + "learning_rate": 8.692587308058161e-05, + "loss": 3.9184, + "step": 15405 + }, + { + "epoch": 1.0470172577795895, + "grad_norm": 0.2628757655620575, + "learning_rate": 8.692162657969834e-05, + "loss": 4.1083, + "step": 15410 + }, + { + "epoch": 1.0473569778502514, + "grad_norm": 0.17509426176548004, + "learning_rate": 8.691738007881505e-05, + "loss": 4.2208, + "step": 15415 + }, + { + "epoch": 1.0476966979209132, + "grad_norm": 0.2297753542661667, + "learning_rate": 8.69131335779318e-05, + "loss": 3.8002, + "step": 15420 + }, + { + "epoch": 1.0480364179915749, + "grad_norm": 0.1647188365459442, + "learning_rate": 8.690888707704852e-05, + "loss": 3.8605, + "step": 15425 + }, + { + "epoch": 1.0483761380622367, + "grad_norm": 0.30028846859931946, + "learning_rate": 8.690464057616524e-05, + "loss": 3.7821, + "step": 15430 + }, + { + "epoch": 1.0487158581328986, + "grad_norm": 0.2656041383743286, + "learning_rate": 8.690039407528198e-05, + "loss": 3.9917, + "step": 15435 + }, + { + "epoch": 1.0490555782035602, + "grad_norm": 0.47004324197769165, + "learning_rate": 8.68961475743987e-05, + "loss": 3.8301, + "step": 15440 + }, + { + "epoch": 1.049395298274222, + "grad_norm": 0.1800283044576645, + "learning_rate": 8.689190107351542e-05, + "loss": 3.7918, + "step": 15445 + }, + { + "epoch": 1.049735018344884, + "grad_norm": 0.15705004334449768, + "learning_rate": 8.688765457263216e-05, + "loss": 4.0264, + "step": 15450 + }, + { + "epoch": 1.0500747384155455, + "grad_norm": 0.20027688145637512, + "learning_rate": 8.688340807174888e-05, + "loss": 3.989, + "step": 15455 + }, + { + "epoch": 1.0504144584862074, + "grad_norm": 0.29811808466911316, + "learning_rate": 8.68791615708656e-05, + "loss": 3.8581, + "step": 15460 + }, + { + "epoch": 1.050754178556869, + "grad_norm": 0.20250307023525238, + "learning_rate": 8.687491506998235e-05, + "loss": 4.086, + "step": 15465 + }, + { + "epoch": 1.0510938986275309, + "grad_norm": 0.1817096620798111, + "learning_rate": 8.687066856909906e-05, + "loss": 4.0201, + "step": 15470 + }, + { + "epoch": 1.0514336186981927, + "grad_norm": 0.18293219804763794, + "learning_rate": 8.686642206821579e-05, + "loss": 3.9735, + "step": 15475 + }, + { + "epoch": 1.0517733387688544, + "grad_norm": 0.18923459947109222, + "learning_rate": 8.686217556733253e-05, + "loss": 4.0301, + "step": 15480 + }, + { + "epoch": 1.0521130588395162, + "grad_norm": 0.602603554725647, + "learning_rate": 8.685792906644925e-05, + "loss": 4.0435, + "step": 15485 + }, + { + "epoch": 1.052452778910178, + "grad_norm": 1.3263099193572998, + "learning_rate": 8.685368256556597e-05, + "loss": 3.9572, + "step": 15490 + }, + { + "epoch": 1.0527924989808397, + "grad_norm": 0.6051156520843506, + "learning_rate": 8.684943606468272e-05, + "loss": 3.9652, + "step": 15495 + }, + { + "epoch": 1.0531322190515016, + "grad_norm": 0.20413897931575775, + "learning_rate": 8.684518956379943e-05, + "loss": 3.8492, + "step": 15500 + }, + { + "epoch": 1.0534719391221634, + "grad_norm": 0.22272081673145294, + "learning_rate": 8.684094306291616e-05, + "loss": 4.0493, + "step": 15505 + }, + { + "epoch": 1.053811659192825, + "grad_norm": 1.5194166898727417, + "learning_rate": 8.683669656203289e-05, + "loss": 4.184, + "step": 15510 + }, + { + "epoch": 1.054151379263487, + "grad_norm": 2.840766668319702, + "learning_rate": 8.683245006114961e-05, + "loss": 3.937, + "step": 15515 + }, + { + "epoch": 1.0544910993341488, + "grad_norm": 0.19848090410232544, + "learning_rate": 8.682820356026634e-05, + "loss": 4.1022, + "step": 15520 + }, + { + "epoch": 1.0548308194048104, + "grad_norm": 0.21190063655376434, + "learning_rate": 8.682395705938307e-05, + "loss": 3.9305, + "step": 15525 + }, + { + "epoch": 1.0551705394754722, + "grad_norm": 0.15121546387672424, + "learning_rate": 8.68197105584998e-05, + "loss": 3.8715, + "step": 15530 + }, + { + "epoch": 1.055510259546134, + "grad_norm": 0.2186662256717682, + "learning_rate": 8.681546405761653e-05, + "loss": 4.151, + "step": 15535 + }, + { + "epoch": 1.0558499796167957, + "grad_norm": 0.17990659177303314, + "learning_rate": 8.681121755673325e-05, + "loss": 4.2081, + "step": 15540 + }, + { + "epoch": 1.0561896996874576, + "grad_norm": 0.14546173810958862, + "learning_rate": 8.680697105584998e-05, + "loss": 3.9119, + "step": 15545 + }, + { + "epoch": 1.0565294197581192, + "grad_norm": 0.679550051689148, + "learning_rate": 8.680272455496671e-05, + "loss": 3.9611, + "step": 15550 + }, + { + "epoch": 1.056869139828781, + "grad_norm": 0.1960933953523636, + "learning_rate": 8.679847805408344e-05, + "loss": 3.7563, + "step": 15555 + }, + { + "epoch": 1.057208859899443, + "grad_norm": 0.18684549629688263, + "learning_rate": 8.679423155320017e-05, + "loss": 3.6234, + "step": 15560 + }, + { + "epoch": 1.0575485799701045, + "grad_norm": 0.19271914660930634, + "learning_rate": 8.67899850523169e-05, + "loss": 4.0651, + "step": 15565 + }, + { + "epoch": 1.0578883000407664, + "grad_norm": 0.29440221190452576, + "learning_rate": 8.678573855143362e-05, + "loss": 4.0037, + "step": 15570 + }, + { + "epoch": 1.0582280201114282, + "grad_norm": 0.21043658256530762, + "learning_rate": 8.678149205055035e-05, + "loss": 3.93, + "step": 15575 + }, + { + "epoch": 1.0585677401820899, + "grad_norm": 0.16460970044136047, + "learning_rate": 8.677724554966708e-05, + "loss": 3.8842, + "step": 15580 + }, + { + "epoch": 1.0589074602527517, + "grad_norm": 0.3439513146877289, + "learning_rate": 8.677299904878381e-05, + "loss": 4.0125, + "step": 15585 + }, + { + "epoch": 1.0592471803234136, + "grad_norm": 0.16496612131595612, + "learning_rate": 8.676875254790053e-05, + "loss": 3.9941, + "step": 15590 + }, + { + "epoch": 1.0595869003940752, + "grad_norm": 0.19456376135349274, + "learning_rate": 8.676450604701726e-05, + "loss": 4.051, + "step": 15595 + }, + { + "epoch": 1.059926620464737, + "grad_norm": 0.16333982348442078, + "learning_rate": 8.676025954613399e-05, + "loss": 3.9371, + "step": 15600 + }, + { + "epoch": 1.060266340535399, + "grad_norm": 0.17786568403244019, + "learning_rate": 8.675601304525072e-05, + "loss": 3.9135, + "step": 15605 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.21767796576023102, + "learning_rate": 8.675176654436745e-05, + "loss": 4.042, + "step": 15610 + }, + { + "epoch": 1.0609457806767224, + "grad_norm": 0.4282776415348053, + "learning_rate": 8.674752004348417e-05, + "loss": 3.9378, + "step": 15615 + }, + { + "epoch": 1.0612855007473843, + "grad_norm": 0.16007962822914124, + "learning_rate": 8.67432735426009e-05, + "loss": 4.2506, + "step": 15620 + }, + { + "epoch": 1.061625220818046, + "grad_norm": 1.3766793012619019, + "learning_rate": 8.673902704171763e-05, + "loss": 4.0391, + "step": 15625 + }, + { + "epoch": 1.0619649408887077, + "grad_norm": 0.18175499141216278, + "learning_rate": 8.673478054083436e-05, + "loss": 3.9535, + "step": 15630 + }, + { + "epoch": 1.0623046609593694, + "grad_norm": 0.1911720335483551, + "learning_rate": 8.673053403995109e-05, + "loss": 4.1175, + "step": 15635 + }, + { + "epoch": 1.0626443810300312, + "grad_norm": 0.3966716527938843, + "learning_rate": 8.67262875390678e-05, + "loss": 3.7996, + "step": 15640 + }, + { + "epoch": 1.062984101100693, + "grad_norm": 0.1937049925327301, + "learning_rate": 8.672204103818454e-05, + "loss": 3.9307, + "step": 15645 + }, + { + "epoch": 1.0633238211713547, + "grad_norm": 0.1788756549358368, + "learning_rate": 8.671779453730127e-05, + "loss": 4.0341, + "step": 15650 + }, + { + "epoch": 1.0636635412420166, + "grad_norm": 0.19898243248462677, + "learning_rate": 8.671354803641799e-05, + "loss": 4.024, + "step": 15655 + }, + { + "epoch": 1.0640032613126784, + "grad_norm": 0.1986248642206192, + "learning_rate": 8.670930153553473e-05, + "loss": 4.0329, + "step": 15660 + }, + { + "epoch": 1.06434298138334, + "grad_norm": 0.14453712105751038, + "learning_rate": 8.670505503465146e-05, + "loss": 3.7102, + "step": 15665 + }, + { + "epoch": 1.064682701454002, + "grad_norm": 0.1540953516960144, + "learning_rate": 8.670080853376817e-05, + "loss": 3.7639, + "step": 15670 + }, + { + "epoch": 1.0650224215246638, + "grad_norm": 0.21638967096805573, + "learning_rate": 8.669656203288491e-05, + "loss": 3.9911, + "step": 15675 + }, + { + "epoch": 1.0653621415953254, + "grad_norm": 0.17059166729450226, + "learning_rate": 8.669231553200164e-05, + "loss": 4.0997, + "step": 15680 + }, + { + "epoch": 1.0657018616659872, + "grad_norm": 0.636968195438385, + "learning_rate": 8.668806903111835e-05, + "loss": 3.7359, + "step": 15685 + }, + { + "epoch": 1.066041581736649, + "grad_norm": 0.20409801602363586, + "learning_rate": 8.66838225302351e-05, + "loss": 4.1316, + "step": 15690 + }, + { + "epoch": 1.0663813018073107, + "grad_norm": 0.2158990055322647, + "learning_rate": 8.667957602935182e-05, + "loss": 4.0297, + "step": 15695 + }, + { + "epoch": 1.0667210218779726, + "grad_norm": 0.18995846807956696, + "learning_rate": 8.667532952846854e-05, + "loss": 4.0429, + "step": 15700 + }, + { + "epoch": 1.0670607419486344, + "grad_norm": 0.19172509014606476, + "learning_rate": 8.667108302758528e-05, + "loss": 3.7661, + "step": 15705 + }, + { + "epoch": 1.067400462019296, + "grad_norm": 0.20331713557243347, + "learning_rate": 8.6666836526702e-05, + "loss": 3.8456, + "step": 15710 + }, + { + "epoch": 1.067740182089958, + "grad_norm": 0.18738865852355957, + "learning_rate": 8.666259002581872e-05, + "loss": 3.8753, + "step": 15715 + }, + { + "epoch": 1.0680799021606195, + "grad_norm": 0.17677748203277588, + "learning_rate": 8.665834352493546e-05, + "loss": 3.7817, + "step": 15720 + }, + { + "epoch": 1.0684196222312814, + "grad_norm": 0.8454935550689697, + "learning_rate": 8.665409702405218e-05, + "loss": 3.9798, + "step": 15725 + }, + { + "epoch": 1.0687593423019432, + "grad_norm": 0.18706971406936646, + "learning_rate": 8.664985052316892e-05, + "loss": 4.0192, + "step": 15730 + }, + { + "epoch": 1.0690990623726049, + "grad_norm": 0.197980135679245, + "learning_rate": 8.664560402228565e-05, + "loss": 4.0905, + "step": 15735 + }, + { + "epoch": 1.0694387824432667, + "grad_norm": 0.17847314476966858, + "learning_rate": 8.664135752140236e-05, + "loss": 3.9688, + "step": 15740 + }, + { + "epoch": 1.0697785025139286, + "grad_norm": 0.19877523183822632, + "learning_rate": 8.66371110205191e-05, + "loss": 3.8889, + "step": 15745 + }, + { + "epoch": 1.0701182225845902, + "grad_norm": 0.15952861309051514, + "learning_rate": 8.663286451963583e-05, + "loss": 3.7604, + "step": 15750 + }, + { + "epoch": 1.070457942655252, + "grad_norm": 0.16648532450199127, + "learning_rate": 8.662861801875255e-05, + "loss": 4.1835, + "step": 15755 + }, + { + "epoch": 1.070797662725914, + "grad_norm": 0.19479535520076752, + "learning_rate": 8.662437151786929e-05, + "loss": 3.8944, + "step": 15760 + }, + { + "epoch": 1.0711373827965756, + "grad_norm": 0.245815709233284, + "learning_rate": 8.662012501698602e-05, + "loss": 3.8926, + "step": 15765 + }, + { + "epoch": 1.0714771028672374, + "grad_norm": 3.6268773078918457, + "learning_rate": 8.661587851610273e-05, + "loss": 3.6891, + "step": 15770 + }, + { + "epoch": 1.0718168229378993, + "grad_norm": 0.4121844470500946, + "learning_rate": 8.661163201521947e-05, + "loss": 3.952, + "step": 15775 + }, + { + "epoch": 1.072156543008561, + "grad_norm": 0.193523570895195, + "learning_rate": 8.660738551433619e-05, + "loss": 4.0271, + "step": 15780 + }, + { + "epoch": 1.0724962630792227, + "grad_norm": 0.2075708955526352, + "learning_rate": 8.660313901345291e-05, + "loss": 3.7452, + "step": 15785 + }, + { + "epoch": 1.0728359831498846, + "grad_norm": 0.17259186506271362, + "learning_rate": 8.659889251256966e-05, + "loss": 3.8602, + "step": 15790 + }, + { + "epoch": 1.0731757032205462, + "grad_norm": 0.1777859479188919, + "learning_rate": 8.659464601168637e-05, + "loss": 4.1383, + "step": 15795 + }, + { + "epoch": 1.073515423291208, + "grad_norm": 0.26029467582702637, + "learning_rate": 8.65903995108031e-05, + "loss": 3.9322, + "step": 15800 + }, + { + "epoch": 1.07385514336187, + "grad_norm": 0.19683490693569183, + "learning_rate": 8.658615300991984e-05, + "loss": 3.9462, + "step": 15805 + }, + { + "epoch": 1.0741948634325316, + "grad_norm": 0.17535097897052765, + "learning_rate": 8.658190650903655e-05, + "loss": 3.782, + "step": 15810 + }, + { + "epoch": 1.0745345835031934, + "grad_norm": 0.1796151101589203, + "learning_rate": 8.657766000815328e-05, + "loss": 4.0646, + "step": 15815 + }, + { + "epoch": 1.074874303573855, + "grad_norm": 0.8811689019203186, + "learning_rate": 8.657341350727002e-05, + "loss": 3.9336, + "step": 15820 + }, + { + "epoch": 1.075214023644517, + "grad_norm": 0.19488303363323212, + "learning_rate": 8.656916700638674e-05, + "loss": 3.9167, + "step": 15825 + }, + { + "epoch": 1.0755537437151788, + "grad_norm": 0.30913442373275757, + "learning_rate": 8.656492050550347e-05, + "loss": 4.0111, + "step": 15830 + }, + { + "epoch": 1.0758934637858404, + "grad_norm": 0.3397049903869629, + "learning_rate": 8.656067400462021e-05, + "loss": 3.8815, + "step": 15835 + }, + { + "epoch": 1.0762331838565022, + "grad_norm": 0.1829068511724472, + "learning_rate": 8.655642750373692e-05, + "loss": 4.0437, + "step": 15840 + }, + { + "epoch": 1.076572903927164, + "grad_norm": 0.28394851088523865, + "learning_rate": 8.655218100285365e-05, + "loss": 3.5043, + "step": 15845 + }, + { + "epoch": 1.0769126239978257, + "grad_norm": 0.18113264441490173, + "learning_rate": 8.654793450197039e-05, + "loss": 3.727, + "step": 15850 + }, + { + "epoch": 1.0772523440684876, + "grad_norm": 0.15289366245269775, + "learning_rate": 8.65436880010871e-05, + "loss": 3.8797, + "step": 15855 + }, + { + "epoch": 1.0775920641391494, + "grad_norm": 0.17669235169887543, + "learning_rate": 8.653944150020383e-05, + "loss": 4.1871, + "step": 15860 + }, + { + "epoch": 1.077931784209811, + "grad_norm": 0.18803103268146515, + "learning_rate": 8.653519499932056e-05, + "loss": 3.7349, + "step": 15865 + }, + { + "epoch": 1.078271504280473, + "grad_norm": 0.2054901123046875, + "learning_rate": 8.653094849843729e-05, + "loss": 4.0579, + "step": 15870 + }, + { + "epoch": 1.0786112243511348, + "grad_norm": 0.1688840240240097, + "learning_rate": 8.652670199755402e-05, + "loss": 3.8866, + "step": 15875 + }, + { + "epoch": 1.0789509444217964, + "grad_norm": 0.19786614179611206, + "learning_rate": 8.652245549667075e-05, + "loss": 3.8141, + "step": 15880 + }, + { + "epoch": 1.0792906644924583, + "grad_norm": 0.1600450575351715, + "learning_rate": 8.651820899578747e-05, + "loss": 3.9755, + "step": 15885 + }, + { + "epoch": 1.0796303845631199, + "grad_norm": 0.14356589317321777, + "learning_rate": 8.65139624949042e-05, + "loss": 3.9385, + "step": 15890 + }, + { + "epoch": 1.0799701046337817, + "grad_norm": 0.23043042421340942, + "learning_rate": 8.650971599402093e-05, + "loss": 4.0137, + "step": 15895 + }, + { + "epoch": 1.0803098247044436, + "grad_norm": 0.20627442002296448, + "learning_rate": 8.650546949313766e-05, + "loss": 3.6463, + "step": 15900 + }, + { + "epoch": 1.0806495447751052, + "grad_norm": 0.17873930931091309, + "learning_rate": 8.650122299225439e-05, + "loss": 4.2096, + "step": 15905 + }, + { + "epoch": 1.080989264845767, + "grad_norm": 0.5621508359909058, + "learning_rate": 8.649697649137111e-05, + "loss": 4.1769, + "step": 15910 + }, + { + "epoch": 1.081328984916429, + "grad_norm": 0.17645534873008728, + "learning_rate": 8.649272999048784e-05, + "loss": 4.045, + "step": 15915 + }, + { + "epoch": 1.0816687049870906, + "grad_norm": 0.21713340282440186, + "learning_rate": 8.648848348960457e-05, + "loss": 3.9102, + "step": 15920 + }, + { + "epoch": 1.0820084250577524, + "grad_norm": 0.1968790739774704, + "learning_rate": 8.64842369887213e-05, + "loss": 3.8988, + "step": 15925 + }, + { + "epoch": 1.0823481451284143, + "grad_norm": 0.1674720197916031, + "learning_rate": 8.647999048783803e-05, + "loss": 3.8862, + "step": 15930 + }, + { + "epoch": 1.082687865199076, + "grad_norm": 0.17151746153831482, + "learning_rate": 8.647574398695475e-05, + "loss": 3.7607, + "step": 15935 + }, + { + "epoch": 1.0830275852697377, + "grad_norm": 0.23312146961688995, + "learning_rate": 8.647149748607148e-05, + "loss": 3.9772, + "step": 15940 + }, + { + "epoch": 1.0833673053403996, + "grad_norm": 0.16680507361888885, + "learning_rate": 8.646725098518821e-05, + "loss": 3.983, + "step": 15945 + }, + { + "epoch": 1.0837070254110612, + "grad_norm": 0.19017355144023895, + "learning_rate": 8.646300448430494e-05, + "loss": 3.9456, + "step": 15950 + }, + { + "epoch": 1.084046745481723, + "grad_norm": 0.18922637403011322, + "learning_rate": 8.645875798342167e-05, + "loss": 4.184, + "step": 15955 + }, + { + "epoch": 1.084386465552385, + "grad_norm": 0.17443794012069702, + "learning_rate": 8.64545114825384e-05, + "loss": 3.9079, + "step": 15960 + }, + { + "epoch": 1.0847261856230466, + "grad_norm": 0.16345418989658356, + "learning_rate": 8.645026498165512e-05, + "loss": 3.8484, + "step": 15965 + }, + { + "epoch": 1.0850659056937084, + "grad_norm": 0.16860488057136536, + "learning_rate": 8.644601848077185e-05, + "loss": 3.8386, + "step": 15970 + }, + { + "epoch": 1.0854056257643703, + "grad_norm": 0.15834009647369385, + "learning_rate": 8.644177197988858e-05, + "loss": 4.0249, + "step": 15975 + }, + { + "epoch": 1.085745345835032, + "grad_norm": 0.1888628453016281, + "learning_rate": 8.643752547900529e-05, + "loss": 4.032, + "step": 15980 + }, + { + "epoch": 1.0860850659056938, + "grad_norm": 0.18656277656555176, + "learning_rate": 8.643327897812203e-05, + "loss": 3.9654, + "step": 15985 + }, + { + "epoch": 1.0864247859763554, + "grad_norm": 0.1627277135848999, + "learning_rate": 8.642903247723876e-05, + "loss": 3.892, + "step": 15990 + }, + { + "epoch": 1.0867645060470172, + "grad_norm": 0.1699935644865036, + "learning_rate": 8.642478597635548e-05, + "loss": 4.0555, + "step": 15995 + }, + { + "epoch": 1.087104226117679, + "grad_norm": 0.2147267609834671, + "learning_rate": 8.642053947547222e-05, + "loss": 4.082, + "step": 16000 + }, + { + "epoch": 1.0874439461883407, + "grad_norm": 0.2055261880159378, + "learning_rate": 8.641629297458895e-05, + "loss": 4.0327, + "step": 16005 + }, + { + "epoch": 1.0877836662590026, + "grad_norm": 0.21000158786773682, + "learning_rate": 8.641204647370566e-05, + "loss": 4.2682, + "step": 16010 + }, + { + "epoch": 1.0881233863296644, + "grad_norm": 0.14164499938488007, + "learning_rate": 8.64077999728224e-05, + "loss": 3.8245, + "step": 16015 + }, + { + "epoch": 1.088463106400326, + "grad_norm": 0.19035854935646057, + "learning_rate": 8.640355347193913e-05, + "loss": 4.0196, + "step": 16020 + }, + { + "epoch": 1.088802826470988, + "grad_norm": 0.24650296568870544, + "learning_rate": 8.639930697105584e-05, + "loss": 3.9404, + "step": 16025 + }, + { + "epoch": 1.0891425465416498, + "grad_norm": 0.1665068119764328, + "learning_rate": 8.639506047017259e-05, + "loss": 3.9262, + "step": 16030 + }, + { + "epoch": 1.0894822666123114, + "grad_norm": 0.1680706888437271, + "learning_rate": 8.639081396928931e-05, + "loss": 3.959, + "step": 16035 + }, + { + "epoch": 1.0898219866829733, + "grad_norm": 0.1715192198753357, + "learning_rate": 8.638656746840603e-05, + "loss": 3.722, + "step": 16040 + }, + { + "epoch": 1.090161706753635, + "grad_norm": 0.21355275809764862, + "learning_rate": 8.638232096752277e-05, + "loss": 3.8005, + "step": 16045 + }, + { + "epoch": 1.0905014268242967, + "grad_norm": 0.14626453816890717, + "learning_rate": 8.63780744666395e-05, + "loss": 4.1712, + "step": 16050 + }, + { + "epoch": 1.0908411468949586, + "grad_norm": 0.15723629295825958, + "learning_rate": 8.637382796575621e-05, + "loss": 4.173, + "step": 16055 + }, + { + "epoch": 1.0911808669656202, + "grad_norm": 0.1790107786655426, + "learning_rate": 8.636958146487295e-05, + "loss": 3.6412, + "step": 16060 + }, + { + "epoch": 1.091520587036282, + "grad_norm": 0.18286195397377014, + "learning_rate": 8.636533496398967e-05, + "loss": 3.9419, + "step": 16065 + }, + { + "epoch": 1.091860307106944, + "grad_norm": 0.1973506659269333, + "learning_rate": 8.636108846310641e-05, + "loss": 3.9782, + "step": 16070 + }, + { + "epoch": 1.0922000271776056, + "grad_norm": 0.15802575647830963, + "learning_rate": 8.635684196222314e-05, + "loss": 3.95, + "step": 16075 + }, + { + "epoch": 1.0925397472482674, + "grad_norm": 0.1466229259967804, + "learning_rate": 8.635259546133985e-05, + "loss": 3.9789, + "step": 16080 + }, + { + "epoch": 1.0928794673189293, + "grad_norm": 0.20506501197814941, + "learning_rate": 8.63483489604566e-05, + "loss": 4.0093, + "step": 16085 + }, + { + "epoch": 1.093219187389591, + "grad_norm": 0.17356517910957336, + "learning_rate": 8.634410245957332e-05, + "loss": 3.8507, + "step": 16090 + }, + { + "epoch": 1.0935589074602527, + "grad_norm": 0.21622826159000397, + "learning_rate": 8.633985595869004e-05, + "loss": 4.1601, + "step": 16095 + }, + { + "epoch": 1.0938986275309146, + "grad_norm": 0.20221838355064392, + "learning_rate": 8.633560945780678e-05, + "loss": 3.9822, + "step": 16100 + }, + { + "epoch": 1.0942383476015762, + "grad_norm": 0.1798730194568634, + "learning_rate": 8.63313629569235e-05, + "loss": 3.8871, + "step": 16105 + }, + { + "epoch": 1.094578067672238, + "grad_norm": 0.18747274577617645, + "learning_rate": 8.632711645604022e-05, + "loss": 3.9521, + "step": 16110 + }, + { + "epoch": 1.0949177877429, + "grad_norm": 0.31825652718544006, + "learning_rate": 8.632286995515696e-05, + "loss": 3.9403, + "step": 16115 + }, + { + "epoch": 1.0952575078135616, + "grad_norm": 0.1652481108903885, + "learning_rate": 8.631862345427369e-05, + "loss": 4.0651, + "step": 16120 + }, + { + "epoch": 1.0955972278842234, + "grad_norm": 0.19673031568527222, + "learning_rate": 8.63143769533904e-05, + "loss": 3.8475, + "step": 16125 + }, + { + "epoch": 1.0959369479548853, + "grad_norm": 2.468691349029541, + "learning_rate": 8.631013045250715e-05, + "loss": 4.1546, + "step": 16130 + }, + { + "epoch": 1.096276668025547, + "grad_norm": 0.1938968300819397, + "learning_rate": 8.630588395162386e-05, + "loss": 3.7779, + "step": 16135 + }, + { + "epoch": 1.0966163880962088, + "grad_norm": 0.9687556028366089, + "learning_rate": 8.630163745074059e-05, + "loss": 3.9407, + "step": 16140 + }, + { + "epoch": 1.0969561081668706, + "grad_norm": 0.16523803770542145, + "learning_rate": 8.629739094985733e-05, + "loss": 3.8722, + "step": 16145 + }, + { + "epoch": 1.0972958282375322, + "grad_norm": 0.1748422235250473, + "learning_rate": 8.629314444897404e-05, + "loss": 3.9942, + "step": 16150 + }, + { + "epoch": 1.097635548308194, + "grad_norm": 0.16996945440769196, + "learning_rate": 8.628889794809077e-05, + "loss": 4.0963, + "step": 16155 + }, + { + "epoch": 1.0979752683788557, + "grad_norm": 0.15984725952148438, + "learning_rate": 8.628465144720751e-05, + "loss": 3.9372, + "step": 16160 + }, + { + "epoch": 1.0983149884495176, + "grad_norm": 0.23570246994495392, + "learning_rate": 8.628040494632423e-05, + "loss": 3.8955, + "step": 16165 + }, + { + "epoch": 1.0986547085201794, + "grad_norm": 0.23657658696174622, + "learning_rate": 8.627615844544096e-05, + "loss": 3.9892, + "step": 16170 + }, + { + "epoch": 1.098994428590841, + "grad_norm": 0.22472096979618073, + "learning_rate": 8.62719119445577e-05, + "loss": 4.0592, + "step": 16175 + }, + { + "epoch": 1.099334148661503, + "grad_norm": 0.1978127509355545, + "learning_rate": 8.626766544367441e-05, + "loss": 3.8919, + "step": 16180 + }, + { + "epoch": 1.0996738687321648, + "grad_norm": 0.1566731482744217, + "learning_rate": 8.626341894279114e-05, + "loss": 4.0303, + "step": 16185 + }, + { + "epoch": 1.1000135888028264, + "grad_norm": 0.256422221660614, + "learning_rate": 8.625917244190788e-05, + "loss": 3.7957, + "step": 16190 + }, + { + "epoch": 1.1003533088734883, + "grad_norm": 0.1720321774482727, + "learning_rate": 8.62549259410246e-05, + "loss": 4.0208, + "step": 16195 + }, + { + "epoch": 1.10069302894415, + "grad_norm": 0.1642093062400818, + "learning_rate": 8.625067944014132e-05, + "loss": 4.0086, + "step": 16200 + }, + { + "epoch": 1.1010327490148117, + "grad_norm": 0.14341382682323456, + "learning_rate": 8.624643293925805e-05, + "loss": 3.8867, + "step": 16205 + }, + { + "epoch": 1.1013724690854736, + "grad_norm": 0.1669510453939438, + "learning_rate": 8.624218643837478e-05, + "loss": 3.7945, + "step": 16210 + }, + { + "epoch": 1.1017121891561354, + "grad_norm": 0.1497519165277481, + "learning_rate": 8.623793993749151e-05, + "loss": 3.7671, + "step": 16215 + }, + { + "epoch": 1.102051909226797, + "grad_norm": 0.16616341471672058, + "learning_rate": 8.623369343660824e-05, + "loss": 3.9888, + "step": 16220 + }, + { + "epoch": 1.102391629297459, + "grad_norm": 0.18781232833862305, + "learning_rate": 8.622944693572496e-05, + "loss": 4.11, + "step": 16225 + }, + { + "epoch": 1.1027313493681206, + "grad_norm": 0.49014216661453247, + "learning_rate": 8.622520043484169e-05, + "loss": 3.8448, + "step": 16230 + }, + { + "epoch": 1.1030710694387824, + "grad_norm": 0.20546668767929077, + "learning_rate": 8.622095393395842e-05, + "loss": 3.9141, + "step": 16235 + }, + { + "epoch": 1.1034107895094443, + "grad_norm": 0.14401434361934662, + "learning_rate": 8.621670743307515e-05, + "loss": 4.1854, + "step": 16240 + }, + { + "epoch": 1.103750509580106, + "grad_norm": 0.1771813929080963, + "learning_rate": 8.621246093219188e-05, + "loss": 4.1013, + "step": 16245 + }, + { + "epoch": 1.1040902296507678, + "grad_norm": 0.17841222882270813, + "learning_rate": 8.62082144313086e-05, + "loss": 4.0613, + "step": 16250 + }, + { + "epoch": 1.1044299497214296, + "grad_norm": 0.2375301718711853, + "learning_rate": 8.620396793042533e-05, + "loss": 3.9552, + "step": 16255 + }, + { + "epoch": 1.1047696697920912, + "grad_norm": 0.15513014793395996, + "learning_rate": 8.619972142954206e-05, + "loss": 3.9826, + "step": 16260 + }, + { + "epoch": 1.105109389862753, + "grad_norm": 1.3269137144088745, + "learning_rate": 8.619547492865879e-05, + "loss": 3.8172, + "step": 16265 + }, + { + "epoch": 1.105449109933415, + "grad_norm": 0.19027121365070343, + "learning_rate": 8.619122842777552e-05, + "loss": 3.875, + "step": 16270 + }, + { + "epoch": 1.1057888300040766, + "grad_norm": 0.17158430814743042, + "learning_rate": 8.618698192689224e-05, + "loss": 4.0814, + "step": 16275 + }, + { + "epoch": 1.1061285500747384, + "grad_norm": 0.49753719568252563, + "learning_rate": 8.618273542600897e-05, + "loss": 3.7628, + "step": 16280 + }, + { + "epoch": 1.1064682701454003, + "grad_norm": 0.1748051941394806, + "learning_rate": 8.61784889251257e-05, + "loss": 3.9761, + "step": 16285 + }, + { + "epoch": 1.106807990216062, + "grad_norm": 0.32643264532089233, + "learning_rate": 8.617424242424243e-05, + "loss": 3.9474, + "step": 16290 + }, + { + "epoch": 1.1071477102867238, + "grad_norm": 0.8213240504264832, + "learning_rate": 8.616999592335916e-05, + "loss": 3.7241, + "step": 16295 + }, + { + "epoch": 1.1074874303573856, + "grad_norm": 0.29735448956489563, + "learning_rate": 8.616574942247588e-05, + "loss": 3.6733, + "step": 16300 + }, + { + "epoch": 1.1078271504280472, + "grad_norm": 0.1454872339963913, + "learning_rate": 8.616150292159261e-05, + "loss": 3.89, + "step": 16305 + }, + { + "epoch": 1.108166870498709, + "grad_norm": 0.17517416179180145, + "learning_rate": 8.615725642070934e-05, + "loss": 4.0969, + "step": 16310 + }, + { + "epoch": 1.108506590569371, + "grad_norm": 0.18153846263885498, + "learning_rate": 8.615300991982607e-05, + "loss": 3.8348, + "step": 16315 + }, + { + "epoch": 1.1088463106400326, + "grad_norm": 0.23186548054218292, + "learning_rate": 8.61487634189428e-05, + "loss": 3.9116, + "step": 16320 + }, + { + "epoch": 1.1091860307106944, + "grad_norm": 0.15640127658843994, + "learning_rate": 8.614451691805952e-05, + "loss": 4.0668, + "step": 16325 + }, + { + "epoch": 1.109525750781356, + "grad_norm": 0.2013588547706604, + "learning_rate": 8.614027041717625e-05, + "loss": 3.8115, + "step": 16330 + }, + { + "epoch": 1.109865470852018, + "grad_norm": 0.12737514078617096, + "learning_rate": 8.613602391629297e-05, + "loss": 3.9491, + "step": 16335 + }, + { + "epoch": 1.1102051909226798, + "grad_norm": 0.19158364832401276, + "learning_rate": 8.613177741540971e-05, + "loss": 3.8493, + "step": 16340 + }, + { + "epoch": 1.1105449109933414, + "grad_norm": 0.3817283809185028, + "learning_rate": 8.612753091452644e-05, + "loss": 4.1342, + "step": 16345 + }, + { + "epoch": 1.1108846310640033, + "grad_norm": 0.1744743436574936, + "learning_rate": 8.612328441364315e-05, + "loss": 3.9817, + "step": 16350 + }, + { + "epoch": 1.1112243511346651, + "grad_norm": 0.4144933521747589, + "learning_rate": 8.611903791275989e-05, + "loss": 3.9226, + "step": 16355 + }, + { + "epoch": 1.1115640712053267, + "grad_norm": 0.15905387699604034, + "learning_rate": 8.611479141187662e-05, + "loss": 3.6998, + "step": 16360 + }, + { + "epoch": 1.1119037912759886, + "grad_norm": 0.15864962339401245, + "learning_rate": 8.611054491099334e-05, + "loss": 3.8524, + "step": 16365 + }, + { + "epoch": 1.1122435113466504, + "grad_norm": 0.17851097881793976, + "learning_rate": 8.610629841011008e-05, + "loss": 3.7403, + "step": 16370 + }, + { + "epoch": 1.112583231417312, + "grad_norm": 1.0258066654205322, + "learning_rate": 8.61020519092268e-05, + "loss": 3.6328, + "step": 16375 + }, + { + "epoch": 1.112922951487974, + "grad_norm": 0.1692223697900772, + "learning_rate": 8.609780540834352e-05, + "loss": 4.1079, + "step": 16380 + }, + { + "epoch": 1.1132626715586358, + "grad_norm": 0.1587432622909546, + "learning_rate": 8.609355890746026e-05, + "loss": 3.9648, + "step": 16385 + }, + { + "epoch": 1.1136023916292974, + "grad_norm": 0.1481763869524002, + "learning_rate": 8.608931240657699e-05, + "loss": 4.1383, + "step": 16390 + }, + { + "epoch": 1.1139421116999593, + "grad_norm": 0.16203315556049347, + "learning_rate": 8.60850659056937e-05, + "loss": 4.1066, + "step": 16395 + }, + { + "epoch": 1.114281831770621, + "grad_norm": 0.18231908977031708, + "learning_rate": 8.608081940481045e-05, + "loss": 3.8513, + "step": 16400 + }, + { + "epoch": 1.1146215518412828, + "grad_norm": 0.368586003780365, + "learning_rate": 8.607657290392716e-05, + "loss": 3.818, + "step": 16405 + }, + { + "epoch": 1.1149612719119446, + "grad_norm": 0.17134909331798553, + "learning_rate": 8.60723264030439e-05, + "loss": 4.0444, + "step": 16410 + }, + { + "epoch": 1.1153009919826062, + "grad_norm": 0.18825988471508026, + "learning_rate": 8.606807990216063e-05, + "loss": 3.8568, + "step": 16415 + }, + { + "epoch": 1.115640712053268, + "grad_norm": 0.15570901334285736, + "learning_rate": 8.606383340127734e-05, + "loss": 4.129, + "step": 16420 + }, + { + "epoch": 1.11598043212393, + "grad_norm": 0.17736178636550903, + "learning_rate": 8.605958690039409e-05, + "loss": 3.7899, + "step": 16425 + }, + { + "epoch": 1.1163201521945916, + "grad_norm": 0.4241405129432678, + "learning_rate": 8.605534039951081e-05, + "loss": 4.044, + "step": 16430 + }, + { + "epoch": 1.1166598722652534, + "grad_norm": 0.6367011666297913, + "learning_rate": 8.605109389862753e-05, + "loss": 3.7557, + "step": 16435 + }, + { + "epoch": 1.1169995923359153, + "grad_norm": 0.2951098680496216, + "learning_rate": 8.604684739774427e-05, + "loss": 3.9403, + "step": 16440 + }, + { + "epoch": 1.117339312406577, + "grad_norm": 0.1913820505142212, + "learning_rate": 8.6042600896861e-05, + "loss": 3.9853, + "step": 16445 + }, + { + "epoch": 1.1176790324772388, + "grad_norm": 0.2242596447467804, + "learning_rate": 8.603835439597771e-05, + "loss": 4.2887, + "step": 16450 + }, + { + "epoch": 1.1180187525479006, + "grad_norm": 0.332297682762146, + "learning_rate": 8.603410789509445e-05, + "loss": 3.9957, + "step": 16455 + }, + { + "epoch": 1.1183584726185622, + "grad_norm": 0.21807430684566498, + "learning_rate": 8.602986139421118e-05, + "loss": 4.0968, + "step": 16460 + }, + { + "epoch": 1.118698192689224, + "grad_norm": 0.17632004618644714, + "learning_rate": 8.60256148933279e-05, + "loss": 4.1983, + "step": 16465 + }, + { + "epoch": 1.119037912759886, + "grad_norm": 0.2156151682138443, + "learning_rate": 8.602136839244464e-05, + "loss": 3.6547, + "step": 16470 + }, + { + "epoch": 1.1193776328305476, + "grad_norm": 0.16284529864788055, + "learning_rate": 8.601712189156137e-05, + "loss": 3.985, + "step": 16475 + }, + { + "epoch": 1.1197173529012094, + "grad_norm": 0.16307656466960907, + "learning_rate": 8.601287539067808e-05, + "loss": 3.9131, + "step": 16480 + }, + { + "epoch": 1.1200570729718713, + "grad_norm": 0.22563326358795166, + "learning_rate": 8.600862888979482e-05, + "loss": 3.9868, + "step": 16485 + }, + { + "epoch": 1.120396793042533, + "grad_norm": 0.7765183448791504, + "learning_rate": 8.600438238891154e-05, + "loss": 3.808, + "step": 16490 + }, + { + "epoch": 1.1207365131131948, + "grad_norm": 0.4159073233604431, + "learning_rate": 8.600013588802826e-05, + "loss": 3.9451, + "step": 16495 + }, + { + "epoch": 1.1210762331838564, + "grad_norm": 0.19920696318149567, + "learning_rate": 8.5995889387145e-05, + "loss": 3.9598, + "step": 16500 + }, + { + "epoch": 1.1214159532545183, + "grad_norm": 0.18753471970558167, + "learning_rate": 8.599164288626172e-05, + "loss": 3.8349, + "step": 16505 + }, + { + "epoch": 1.1217556733251801, + "grad_norm": 0.22453509271144867, + "learning_rate": 8.598739638537845e-05, + "loss": 3.9091, + "step": 16510 + }, + { + "epoch": 1.1220953933958417, + "grad_norm": 0.8424657583236694, + "learning_rate": 8.598314988449519e-05, + "loss": 4.0252, + "step": 16515 + }, + { + "epoch": 1.1224351134665036, + "grad_norm": 0.17123176157474518, + "learning_rate": 8.59789033836119e-05, + "loss": 4.0061, + "step": 16520 + }, + { + "epoch": 1.1227748335371655, + "grad_norm": 0.26806747913360596, + "learning_rate": 8.597465688272863e-05, + "loss": 3.7403, + "step": 16525 + }, + { + "epoch": 1.123114553607827, + "grad_norm": 0.29179176688194275, + "learning_rate": 8.597041038184537e-05, + "loss": 3.9503, + "step": 16530 + }, + { + "epoch": 1.123454273678489, + "grad_norm": 0.1696764975786209, + "learning_rate": 8.596616388096209e-05, + "loss": 3.9636, + "step": 16535 + }, + { + "epoch": 1.1237939937491508, + "grad_norm": 0.17815542221069336, + "learning_rate": 8.596191738007882e-05, + "loss": 4.097, + "step": 16540 + }, + { + "epoch": 1.1241337138198124, + "grad_norm": 0.1656859666109085, + "learning_rate": 8.595767087919556e-05, + "loss": 3.9587, + "step": 16545 + }, + { + "epoch": 1.1244734338904743, + "grad_norm": 0.16445748507976532, + "learning_rate": 8.595342437831227e-05, + "loss": 3.8764, + "step": 16550 + }, + { + "epoch": 1.1248131539611361, + "grad_norm": 0.1810779720544815, + "learning_rate": 8.5949177877429e-05, + "loss": 4.0428, + "step": 16555 + }, + { + "epoch": 1.1251528740317978, + "grad_norm": 0.19179122149944305, + "learning_rate": 8.594493137654573e-05, + "loss": 3.9419, + "step": 16560 + }, + { + "epoch": 1.1254925941024596, + "grad_norm": 0.18260543048381805, + "learning_rate": 8.594068487566246e-05, + "loss": 4.1125, + "step": 16565 + }, + { + "epoch": 1.1258323141731212, + "grad_norm": 0.16802632808685303, + "learning_rate": 8.593643837477918e-05, + "loss": 3.9159, + "step": 16570 + }, + { + "epoch": 1.126172034243783, + "grad_norm": 1.3238937854766846, + "learning_rate": 8.593219187389591e-05, + "loss": 4.0869, + "step": 16575 + }, + { + "epoch": 1.126511754314445, + "grad_norm": 0.41687366366386414, + "learning_rate": 8.592794537301264e-05, + "loss": 4.0466, + "step": 16580 + }, + { + "epoch": 1.1268514743851066, + "grad_norm": 0.1587451845407486, + "learning_rate": 8.592369887212937e-05, + "loss": 4.0387, + "step": 16585 + }, + { + "epoch": 1.1271911944557684, + "grad_norm": 0.3418107330799103, + "learning_rate": 8.59194523712461e-05, + "loss": 3.8754, + "step": 16590 + }, + { + "epoch": 1.1275309145264303, + "grad_norm": 0.1839505285024643, + "learning_rate": 8.591520587036282e-05, + "loss": 4.1284, + "step": 16595 + }, + { + "epoch": 1.127870634597092, + "grad_norm": 0.32643118500709534, + "learning_rate": 8.591095936947955e-05, + "loss": 3.7703, + "step": 16600 + }, + { + "epoch": 1.1282103546677538, + "grad_norm": 0.16766859591007233, + "learning_rate": 8.590671286859628e-05, + "loss": 3.8694, + "step": 16605 + }, + { + "epoch": 1.1285500747384156, + "grad_norm": 0.712253212928772, + "learning_rate": 8.590246636771301e-05, + "loss": 4.0002, + "step": 16610 + }, + { + "epoch": 1.1288897948090773, + "grad_norm": 0.9820852875709534, + "learning_rate": 8.589821986682974e-05, + "loss": 4.0379, + "step": 16615 + }, + { + "epoch": 1.129229514879739, + "grad_norm": 0.2402014583349228, + "learning_rate": 8.589397336594646e-05, + "loss": 3.6969, + "step": 16620 + }, + { + "epoch": 1.129569234950401, + "grad_norm": 0.1834237426519394, + "learning_rate": 8.588972686506319e-05, + "loss": 3.9438, + "step": 16625 + }, + { + "epoch": 1.1299089550210626, + "grad_norm": 0.22266420722007751, + "learning_rate": 8.588548036417992e-05, + "loss": 3.8202, + "step": 16630 + }, + { + "epoch": 1.1302486750917244, + "grad_norm": 0.15689939260482788, + "learning_rate": 8.588123386329665e-05, + "loss": 4.0411, + "step": 16635 + }, + { + "epoch": 1.1305883951623863, + "grad_norm": 0.18447615206241608, + "learning_rate": 8.587698736241338e-05, + "loss": 3.9692, + "step": 16640 + }, + { + "epoch": 1.130928115233048, + "grad_norm": 0.19857051968574524, + "learning_rate": 8.58727408615301e-05, + "loss": 3.7997, + "step": 16645 + }, + { + "epoch": 1.1312678353037098, + "grad_norm": 0.18129754066467285, + "learning_rate": 8.586849436064683e-05, + "loss": 4.1222, + "step": 16650 + }, + { + "epoch": 1.1316075553743716, + "grad_norm": 0.2746391296386719, + "learning_rate": 8.586424785976356e-05, + "loss": 3.6868, + "step": 16655 + }, + { + "epoch": 1.1319472754450333, + "grad_norm": 0.1754639893770218, + "learning_rate": 8.586000135888029e-05, + "loss": 3.9424, + "step": 16660 + }, + { + "epoch": 1.1322869955156951, + "grad_norm": 0.2614280581474304, + "learning_rate": 8.585575485799702e-05, + "loss": 3.8774, + "step": 16665 + }, + { + "epoch": 1.1326267155863567, + "grad_norm": 0.18188413977622986, + "learning_rate": 8.585150835711374e-05, + "loss": 4.0158, + "step": 16670 + }, + { + "epoch": 1.1329664356570186, + "grad_norm": 0.1900574117898941, + "learning_rate": 8.584726185623047e-05, + "loss": 3.9557, + "step": 16675 + }, + { + "epoch": 1.1333061557276805, + "grad_norm": 0.2411600649356842, + "learning_rate": 8.58430153553472e-05, + "loss": 3.8946, + "step": 16680 + }, + { + "epoch": 1.133645875798342, + "grad_norm": 0.32648876309394836, + "learning_rate": 8.583876885446393e-05, + "loss": 3.9312, + "step": 16685 + }, + { + "epoch": 1.133985595869004, + "grad_norm": 0.2013859748840332, + "learning_rate": 8.583452235358064e-05, + "loss": 3.9893, + "step": 16690 + }, + { + "epoch": 1.1343253159396658, + "grad_norm": 0.15776346623897552, + "learning_rate": 8.583027585269738e-05, + "loss": 3.9688, + "step": 16695 + }, + { + "epoch": 1.1346650360103274, + "grad_norm": 0.17749743163585663, + "learning_rate": 8.582602935181411e-05, + "loss": 3.6437, + "step": 16700 + }, + { + "epoch": 1.1350047560809893, + "grad_norm": 0.16100169718265533, + "learning_rate": 8.582178285093083e-05, + "loss": 3.9158, + "step": 16705 + }, + { + "epoch": 1.1353444761516511, + "grad_norm": 0.19322456419467926, + "learning_rate": 8.581753635004757e-05, + "loss": 4.0565, + "step": 16710 + }, + { + "epoch": 1.1356841962223128, + "grad_norm": 0.14952364563941956, + "learning_rate": 8.58132898491643e-05, + "loss": 3.8898, + "step": 16715 + }, + { + "epoch": 1.1360239162929746, + "grad_norm": 0.20561130344867706, + "learning_rate": 8.580904334828101e-05, + "loss": 4.062, + "step": 16720 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.19884036481380463, + "learning_rate": 8.580479684739775e-05, + "loss": 3.9942, + "step": 16725 + }, + { + "epoch": 1.136703356434298, + "grad_norm": 0.17311780154705048, + "learning_rate": 8.580055034651448e-05, + "loss": 4.0655, + "step": 16730 + }, + { + "epoch": 1.13704307650496, + "grad_norm": 0.1967136263847351, + "learning_rate": 8.57963038456312e-05, + "loss": 4.1502, + "step": 16735 + }, + { + "epoch": 1.1373827965756216, + "grad_norm": 0.37797224521636963, + "learning_rate": 8.579205734474794e-05, + "loss": 3.8114, + "step": 16740 + }, + { + "epoch": 1.1377225166462834, + "grad_norm": 0.19511963427066803, + "learning_rate": 8.578781084386466e-05, + "loss": 3.9825, + "step": 16745 + }, + { + "epoch": 1.1380622367169453, + "grad_norm": 2.275869369506836, + "learning_rate": 8.578356434298139e-05, + "loss": 4.0732, + "step": 16750 + }, + { + "epoch": 1.138401956787607, + "grad_norm": 0.2894071638584137, + "learning_rate": 8.577931784209812e-05, + "loss": 4.1865, + "step": 16755 + }, + { + "epoch": 1.1387416768582688, + "grad_norm": 0.23166050016880035, + "learning_rate": 8.577507134121483e-05, + "loss": 3.8818, + "step": 16760 + }, + { + "epoch": 1.1390813969289306, + "grad_norm": 0.14223459362983704, + "learning_rate": 8.577082484033158e-05, + "loss": 4.0696, + "step": 16765 + }, + { + "epoch": 1.1394211169995923, + "grad_norm": 0.9115033149719238, + "learning_rate": 8.57665783394483e-05, + "loss": 3.9513, + "step": 16770 + }, + { + "epoch": 1.139760837070254, + "grad_norm": 0.17078211903572083, + "learning_rate": 8.576233183856502e-05, + "loss": 3.8712, + "step": 16775 + }, + { + "epoch": 1.140100557140916, + "grad_norm": 0.18300005793571472, + "learning_rate": 8.575808533768176e-05, + "loss": 3.9409, + "step": 16780 + }, + { + "epoch": 1.1404402772115776, + "grad_norm": 0.20243607461452484, + "learning_rate": 8.575383883679849e-05, + "loss": 3.9194, + "step": 16785 + }, + { + "epoch": 1.1407799972822394, + "grad_norm": 0.15823310613632202, + "learning_rate": 8.57495923359152e-05, + "loss": 3.9949, + "step": 16790 + }, + { + "epoch": 1.1411197173529013, + "grad_norm": 0.3241806924343109, + "learning_rate": 8.574534583503194e-05, + "loss": 3.8432, + "step": 16795 + }, + { + "epoch": 1.141459437423563, + "grad_norm": 0.17642635107040405, + "learning_rate": 8.574109933414867e-05, + "loss": 4.0258, + "step": 16800 + }, + { + "epoch": 1.1417991574942248, + "grad_norm": 0.17089220881462097, + "learning_rate": 8.573685283326539e-05, + "loss": 3.974, + "step": 16805 + }, + { + "epoch": 1.1421388775648866, + "grad_norm": 0.1902184635400772, + "learning_rate": 8.573260633238213e-05, + "loss": 3.9692, + "step": 16810 + }, + { + "epoch": 1.1424785976355483, + "grad_norm": 0.19794249534606934, + "learning_rate": 8.572835983149886e-05, + "loss": 3.9779, + "step": 16815 + }, + { + "epoch": 1.1428183177062101, + "grad_norm": 0.1614569127559662, + "learning_rate": 8.572411333061557e-05, + "loss": 3.8165, + "step": 16820 + }, + { + "epoch": 1.143158037776872, + "grad_norm": 0.24350915849208832, + "learning_rate": 8.571986682973231e-05, + "loss": 3.8707, + "step": 16825 + }, + { + "epoch": 1.1434977578475336, + "grad_norm": 0.164867103099823, + "learning_rate": 8.571562032884903e-05, + "loss": 3.8287, + "step": 16830 + }, + { + "epoch": 1.1438374779181955, + "grad_norm": 0.18074341118335724, + "learning_rate": 8.571137382796575e-05, + "loss": 4.0, + "step": 16835 + }, + { + "epoch": 1.144177197988857, + "grad_norm": 0.17711681127548218, + "learning_rate": 8.57071273270825e-05, + "loss": 3.9683, + "step": 16840 + }, + { + "epoch": 1.144516918059519, + "grad_norm": 0.17935657501220703, + "learning_rate": 8.570288082619921e-05, + "loss": 3.912, + "step": 16845 + }, + { + "epoch": 1.1448566381301808, + "grad_norm": 0.21363583207130432, + "learning_rate": 8.569863432531594e-05, + "loss": 4.1652, + "step": 16850 + }, + { + "epoch": 1.1451963582008424, + "grad_norm": 0.19257119297981262, + "learning_rate": 8.569438782443268e-05, + "loss": 3.9201, + "step": 16855 + }, + { + "epoch": 1.1455360782715043, + "grad_norm": 1.1066832542419434, + "learning_rate": 8.56901413235494e-05, + "loss": 4.1146, + "step": 16860 + }, + { + "epoch": 1.1458757983421661, + "grad_norm": 0.17682649195194244, + "learning_rate": 8.568589482266612e-05, + "loss": 3.637, + "step": 16865 + }, + { + "epoch": 1.1462155184128278, + "grad_norm": 0.17803549766540527, + "learning_rate": 8.568164832178286e-05, + "loss": 3.9385, + "step": 16870 + }, + { + "epoch": 1.1465552384834896, + "grad_norm": 0.20264935493469238, + "learning_rate": 8.567740182089958e-05, + "loss": 4.1419, + "step": 16875 + }, + { + "epoch": 1.1468949585541515, + "grad_norm": 0.17953553795814514, + "learning_rate": 8.56731553200163e-05, + "loss": 3.9504, + "step": 16880 + }, + { + "epoch": 1.147234678624813, + "grad_norm": 0.6922492980957031, + "learning_rate": 8.566890881913305e-05, + "loss": 4.0675, + "step": 16885 + }, + { + "epoch": 1.147574398695475, + "grad_norm": 0.17250503599643707, + "learning_rate": 8.566466231824976e-05, + "loss": 3.958, + "step": 16890 + }, + { + "epoch": 1.1479141187661366, + "grad_norm": 0.20890331268310547, + "learning_rate": 8.566041581736649e-05, + "loss": 3.9155, + "step": 16895 + }, + { + "epoch": 1.1482538388367984, + "grad_norm": 0.17603133618831635, + "learning_rate": 8.565616931648323e-05, + "loss": 3.8174, + "step": 16900 + }, + { + "epoch": 1.1485935589074603, + "grad_norm": 0.20147298276424408, + "learning_rate": 8.565192281559995e-05, + "loss": 3.9248, + "step": 16905 + }, + { + "epoch": 1.148933278978122, + "grad_norm": 0.20325009524822235, + "learning_rate": 8.564767631471667e-05, + "loss": 3.8041, + "step": 16910 + }, + { + "epoch": 1.1492729990487838, + "grad_norm": 0.1657760739326477, + "learning_rate": 8.56434298138334e-05, + "loss": 3.8153, + "step": 16915 + }, + { + "epoch": 1.1496127191194456, + "grad_norm": 0.2162741869688034, + "learning_rate": 8.563918331295013e-05, + "loss": 3.8665, + "step": 16920 + }, + { + "epoch": 1.1499524391901073, + "grad_norm": 0.19794046878814697, + "learning_rate": 8.563493681206686e-05, + "loss": 3.8585, + "step": 16925 + }, + { + "epoch": 1.150292159260769, + "grad_norm": 2.0781757831573486, + "learning_rate": 8.563069031118359e-05, + "loss": 3.9181, + "step": 16930 + }, + { + "epoch": 1.150631879331431, + "grad_norm": 0.1887458860874176, + "learning_rate": 8.562644381030031e-05, + "loss": 3.8447, + "step": 16935 + }, + { + "epoch": 1.1509715994020926, + "grad_norm": 0.16661617159843445, + "learning_rate": 8.562219730941704e-05, + "loss": 3.9485, + "step": 16940 + }, + { + "epoch": 1.1513113194727544, + "grad_norm": 0.1476723998785019, + "learning_rate": 8.561795080853377e-05, + "loss": 3.7977, + "step": 16945 + }, + { + "epoch": 1.1516510395434163, + "grad_norm": 0.2958218455314636, + "learning_rate": 8.56137043076505e-05, + "loss": 3.9162, + "step": 16950 + }, + { + "epoch": 1.151990759614078, + "grad_norm": 0.1819857805967331, + "learning_rate": 8.560945780676723e-05, + "loss": 3.7574, + "step": 16955 + }, + { + "epoch": 1.1523304796847398, + "grad_norm": 0.17012345790863037, + "learning_rate": 8.560521130588395e-05, + "loss": 3.6611, + "step": 16960 + }, + { + "epoch": 1.1526701997554016, + "grad_norm": 0.1729269027709961, + "learning_rate": 8.560096480500068e-05, + "loss": 3.9753, + "step": 16965 + }, + { + "epoch": 1.1530099198260633, + "grad_norm": 0.18929387629032135, + "learning_rate": 8.559671830411741e-05, + "loss": 3.7813, + "step": 16970 + }, + { + "epoch": 1.1533496398967251, + "grad_norm": 0.18209058046340942, + "learning_rate": 8.559247180323414e-05, + "loss": 3.8472, + "step": 16975 + }, + { + "epoch": 1.153689359967387, + "grad_norm": 0.1977819800376892, + "learning_rate": 8.558822530235087e-05, + "loss": 3.8335, + "step": 16980 + }, + { + "epoch": 1.1540290800380486, + "grad_norm": 0.19058872759342194, + "learning_rate": 8.55839788014676e-05, + "loss": 4.1012, + "step": 16985 + }, + { + "epoch": 1.1543688001087105, + "grad_norm": 2.193249225616455, + "learning_rate": 8.557973230058432e-05, + "loss": 3.8189, + "step": 16990 + }, + { + "epoch": 1.1547085201793723, + "grad_norm": 0.24926668405532837, + "learning_rate": 8.557548579970105e-05, + "loss": 3.9349, + "step": 16995 + }, + { + "epoch": 1.155048240250034, + "grad_norm": 0.17604751884937286, + "learning_rate": 8.557123929881778e-05, + "loss": 3.946, + "step": 17000 + }, + { + "epoch": 1.1553879603206958, + "grad_norm": 0.223891019821167, + "learning_rate": 8.556699279793451e-05, + "loss": 3.9508, + "step": 17005 + }, + { + "epoch": 1.1557276803913574, + "grad_norm": 0.19086340069770813, + "learning_rate": 8.556274629705123e-05, + "loss": 4.0189, + "step": 17010 + }, + { + "epoch": 1.1560674004620193, + "grad_norm": 2.420696258544922, + "learning_rate": 8.555849979616796e-05, + "loss": 4.206, + "step": 17015 + }, + { + "epoch": 1.1564071205326811, + "grad_norm": 0.21227078139781952, + "learning_rate": 8.555425329528469e-05, + "loss": 3.9761, + "step": 17020 + }, + { + "epoch": 1.1567468406033428, + "grad_norm": 0.1845429390668869, + "learning_rate": 8.555000679440142e-05, + "loss": 4.018, + "step": 17025 + }, + { + "epoch": 1.1570865606740046, + "grad_norm": 0.21767668426036835, + "learning_rate": 8.554576029351813e-05, + "loss": 4.0095, + "step": 17030 + }, + { + "epoch": 1.1574262807446665, + "grad_norm": 0.2368917316198349, + "learning_rate": 8.554151379263487e-05, + "loss": 3.8781, + "step": 17035 + }, + { + "epoch": 1.157766000815328, + "grad_norm": 0.23999758064746857, + "learning_rate": 8.55372672917516e-05, + "loss": 3.8043, + "step": 17040 + }, + { + "epoch": 1.15810572088599, + "grad_norm": 1.4887816905975342, + "learning_rate": 8.553302079086832e-05, + "loss": 4.2156, + "step": 17045 + }, + { + "epoch": 1.1584454409566518, + "grad_norm": 0.18826590478420258, + "learning_rate": 8.552877428998506e-05, + "loss": 3.6684, + "step": 17050 + }, + { + "epoch": 1.1587851610273134, + "grad_norm": 0.13121357560157776, + "learning_rate": 8.552452778910179e-05, + "loss": 3.9394, + "step": 17055 + }, + { + "epoch": 1.1591248810979753, + "grad_norm": 0.565320611000061, + "learning_rate": 8.55202812882185e-05, + "loss": 3.8813, + "step": 17060 + }, + { + "epoch": 1.159464601168637, + "grad_norm": 0.34057068824768066, + "learning_rate": 8.551603478733524e-05, + "loss": 4.0371, + "step": 17065 + }, + { + "epoch": 1.1598043212392988, + "grad_norm": 0.339124470949173, + "learning_rate": 8.551178828645197e-05, + "loss": 3.9389, + "step": 17070 + }, + { + "epoch": 1.1601440413099606, + "grad_norm": 1.2875251770019531, + "learning_rate": 8.550754178556869e-05, + "loss": 4.0045, + "step": 17075 + }, + { + "epoch": 1.1604837613806223, + "grad_norm": 0.2576915919780731, + "learning_rate": 8.550329528468543e-05, + "loss": 3.8932, + "step": 17080 + }, + { + "epoch": 1.1608234814512841, + "grad_norm": 0.25439324975013733, + "learning_rate": 8.549904878380216e-05, + "loss": 3.7743, + "step": 17085 + }, + { + "epoch": 1.161163201521946, + "grad_norm": 0.1842285841703415, + "learning_rate": 8.549480228291888e-05, + "loss": 3.7794, + "step": 17090 + }, + { + "epoch": 1.1615029215926076, + "grad_norm": 0.17083007097244263, + "learning_rate": 8.549055578203561e-05, + "loss": 3.7919, + "step": 17095 + }, + { + "epoch": 1.1618426416632694, + "grad_norm": 0.4583177864551544, + "learning_rate": 8.548630928115234e-05, + "loss": 3.969, + "step": 17100 + }, + { + "epoch": 1.1621823617339313, + "grad_norm": 0.22575242817401886, + "learning_rate": 8.548206278026907e-05, + "loss": 3.892, + "step": 17105 + }, + { + "epoch": 1.162522081804593, + "grad_norm": 0.1475016474723816, + "learning_rate": 8.54778162793858e-05, + "loss": 3.6935, + "step": 17110 + }, + { + "epoch": 1.1628618018752548, + "grad_norm": 0.16757836937904358, + "learning_rate": 8.547356977850251e-05, + "loss": 3.9406, + "step": 17115 + }, + { + "epoch": 1.1632015219459166, + "grad_norm": 0.18592201173305511, + "learning_rate": 8.546932327761925e-05, + "loss": 3.9396, + "step": 17120 + }, + { + "epoch": 1.1635412420165783, + "grad_norm": 0.15868262946605682, + "learning_rate": 8.546507677673598e-05, + "loss": 3.8424, + "step": 17125 + }, + { + "epoch": 1.1638809620872401, + "grad_norm": 0.13644038140773773, + "learning_rate": 8.54608302758527e-05, + "loss": 4.2132, + "step": 17130 + }, + { + "epoch": 1.164220682157902, + "grad_norm": 0.21577060222625732, + "learning_rate": 8.545658377496944e-05, + "loss": 3.7702, + "step": 17135 + }, + { + "epoch": 1.1645604022285636, + "grad_norm": 0.18722379207611084, + "learning_rate": 8.545233727408616e-05, + "loss": 4.0932, + "step": 17140 + }, + { + "epoch": 1.1649001222992255, + "grad_norm": 0.31291109323501587, + "learning_rate": 8.544809077320288e-05, + "loss": 3.9638, + "step": 17145 + }, + { + "epoch": 1.1652398423698873, + "grad_norm": 0.23044340312480927, + "learning_rate": 8.544384427231962e-05, + "loss": 3.9249, + "step": 17150 + }, + { + "epoch": 1.165579562440549, + "grad_norm": 0.161495178937912, + "learning_rate": 8.543959777143635e-05, + "loss": 3.8525, + "step": 17155 + }, + { + "epoch": 1.1659192825112108, + "grad_norm": 0.25017622113227844, + "learning_rate": 8.543535127055306e-05, + "loss": 3.9998, + "step": 17160 + }, + { + "epoch": 1.1662590025818727, + "grad_norm": 0.2178102284669876, + "learning_rate": 8.54311047696698e-05, + "loss": 3.9797, + "step": 17165 + }, + { + "epoch": 1.1665987226525343, + "grad_norm": 0.1246630996465683, + "learning_rate": 8.542685826878653e-05, + "loss": 3.7991, + "step": 17170 + }, + { + "epoch": 1.1669384427231961, + "grad_norm": 0.2031886875629425, + "learning_rate": 8.542261176790325e-05, + "loss": 4.149, + "step": 17175 + }, + { + "epoch": 1.1672781627938578, + "grad_norm": 0.23513466119766235, + "learning_rate": 8.541836526701999e-05, + "loss": 4.0086, + "step": 17180 + }, + { + "epoch": 1.1676178828645196, + "grad_norm": 0.5937551856040955, + "learning_rate": 8.54141187661367e-05, + "loss": 3.9436, + "step": 17185 + }, + { + "epoch": 1.1679576029351815, + "grad_norm": 0.46145716309547424, + "learning_rate": 8.540987226525343e-05, + "loss": 3.7951, + "step": 17190 + }, + { + "epoch": 1.168297323005843, + "grad_norm": 0.30620038509368896, + "learning_rate": 8.540562576437017e-05, + "loss": 3.8283, + "step": 17195 + }, + { + "epoch": 1.168637043076505, + "grad_norm": 0.1912326067686081, + "learning_rate": 8.540137926348689e-05, + "loss": 4.0595, + "step": 17200 + }, + { + "epoch": 1.1689767631471668, + "grad_norm": 0.14411430060863495, + "learning_rate": 8.539713276260361e-05, + "loss": 3.8941, + "step": 17205 + }, + { + "epoch": 1.1693164832178284, + "grad_norm": 0.17155896127223969, + "learning_rate": 8.539288626172036e-05, + "loss": 3.8664, + "step": 17210 + }, + { + "epoch": 1.1696562032884903, + "grad_norm": 0.1715584397315979, + "learning_rate": 8.538863976083707e-05, + "loss": 3.8682, + "step": 17215 + }, + { + "epoch": 1.1699959233591521, + "grad_norm": 0.1329980492591858, + "learning_rate": 8.53843932599538e-05, + "loss": 3.9608, + "step": 17220 + }, + { + "epoch": 1.1703356434298138, + "grad_norm": 0.2576993405818939, + "learning_rate": 8.538014675907054e-05, + "loss": 3.8609, + "step": 17225 + }, + { + "epoch": 1.1706753635004756, + "grad_norm": 0.19327299296855927, + "learning_rate": 8.537590025818725e-05, + "loss": 3.9865, + "step": 17230 + }, + { + "epoch": 1.1710150835711373, + "grad_norm": 0.18302805721759796, + "learning_rate": 8.537165375730398e-05, + "loss": 3.7078, + "step": 17235 + }, + { + "epoch": 1.1713548036417991, + "grad_norm": 0.24862872064113617, + "learning_rate": 8.536740725642072e-05, + "loss": 3.7665, + "step": 17240 + }, + { + "epoch": 1.171694523712461, + "grad_norm": 0.20956410467624664, + "learning_rate": 8.536316075553744e-05, + "loss": 3.8738, + "step": 17245 + }, + { + "epoch": 1.1720342437831226, + "grad_norm": 0.2470242828130722, + "learning_rate": 8.535891425465417e-05, + "loss": 4.0098, + "step": 17250 + }, + { + "epoch": 1.1723739638537845, + "grad_norm": 0.33338814973831177, + "learning_rate": 8.535466775377091e-05, + "loss": 4.0451, + "step": 17255 + }, + { + "epoch": 1.1727136839244463, + "grad_norm": 0.17504683136940002, + "learning_rate": 8.535042125288762e-05, + "loss": 4.0551, + "step": 17260 + }, + { + "epoch": 1.173053403995108, + "grad_norm": 0.16404786705970764, + "learning_rate": 8.534617475200435e-05, + "loss": 3.8969, + "step": 17265 + }, + { + "epoch": 1.1733931240657698, + "grad_norm": 0.20265820622444153, + "learning_rate": 8.534192825112108e-05, + "loss": 3.9714, + "step": 17270 + }, + { + "epoch": 1.1737328441364316, + "grad_norm": 0.19101597368717194, + "learning_rate": 8.53376817502378e-05, + "loss": 3.6584, + "step": 17275 + }, + { + "epoch": 1.1740725642070933, + "grad_norm": 0.2694936692714691, + "learning_rate": 8.533343524935453e-05, + "loss": 4.1199, + "step": 17280 + }, + { + "epoch": 1.1744122842777551, + "grad_norm": 0.22883574664592743, + "learning_rate": 8.532918874847126e-05, + "loss": 3.8207, + "step": 17285 + }, + { + "epoch": 1.174752004348417, + "grad_norm": 0.6247007250785828, + "learning_rate": 8.532494224758799e-05, + "loss": 3.9027, + "step": 17290 + }, + { + "epoch": 1.1750917244190786, + "grad_norm": 0.18652090430259705, + "learning_rate": 8.532069574670472e-05, + "loss": 4.1485, + "step": 17295 + }, + { + "epoch": 1.1754314444897405, + "grad_norm": 0.18046824634075165, + "learning_rate": 8.531644924582145e-05, + "loss": 4.1524, + "step": 17300 + }, + { + "epoch": 1.1757711645604023, + "grad_norm": 0.18683654069900513, + "learning_rate": 8.531220274493817e-05, + "loss": 3.9913, + "step": 17305 + }, + { + "epoch": 1.176110884631064, + "grad_norm": 0.21284620463848114, + "learning_rate": 8.53079562440549e-05, + "loss": 4.0716, + "step": 17310 + }, + { + "epoch": 1.1764506047017258, + "grad_norm": 0.1727413684129715, + "learning_rate": 8.530370974317163e-05, + "loss": 3.9771, + "step": 17315 + }, + { + "epoch": 1.1767903247723877, + "grad_norm": 0.14907076954841614, + "learning_rate": 8.529946324228836e-05, + "loss": 4.2271, + "step": 17320 + }, + { + "epoch": 1.1771300448430493, + "grad_norm": 0.1725512444972992, + "learning_rate": 8.529521674140509e-05, + "loss": 3.8649, + "step": 17325 + }, + { + "epoch": 1.1774697649137111, + "grad_norm": 0.16804037988185883, + "learning_rate": 8.529097024052181e-05, + "loss": 4.1127, + "step": 17330 + }, + { + "epoch": 1.177809484984373, + "grad_norm": 0.15476571023464203, + "learning_rate": 8.528672373963854e-05, + "loss": 4.0975, + "step": 17335 + }, + { + "epoch": 1.1781492050550346, + "grad_norm": 0.17302659153938293, + "learning_rate": 8.528247723875527e-05, + "loss": 3.9171, + "step": 17340 + }, + { + "epoch": 1.1784889251256965, + "grad_norm": 0.2050967961549759, + "learning_rate": 8.5278230737872e-05, + "loss": 3.942, + "step": 17345 + }, + { + "epoch": 1.178828645196358, + "grad_norm": 0.1648249477148056, + "learning_rate": 8.527398423698873e-05, + "loss": 3.7872, + "step": 17350 + }, + { + "epoch": 1.17916836526702, + "grad_norm": 0.18924520909786224, + "learning_rate": 8.526973773610545e-05, + "loss": 3.9058, + "step": 17355 + }, + { + "epoch": 1.1795080853376818, + "grad_norm": 0.2357872575521469, + "learning_rate": 8.526549123522218e-05, + "loss": 4.0327, + "step": 17360 + }, + { + "epoch": 1.1798478054083434, + "grad_norm": 0.20756453275680542, + "learning_rate": 8.526124473433891e-05, + "loss": 3.6292, + "step": 17365 + }, + { + "epoch": 1.1801875254790053, + "grad_norm": 1.4094053506851196, + "learning_rate": 8.525699823345564e-05, + "loss": 3.833, + "step": 17370 + }, + { + "epoch": 1.1805272455496671, + "grad_norm": 0.1797502338886261, + "learning_rate": 8.525275173257237e-05, + "loss": 3.9155, + "step": 17375 + }, + { + "epoch": 1.1808669656203288, + "grad_norm": 0.1424524039030075, + "learning_rate": 8.52485052316891e-05, + "loss": 3.9016, + "step": 17380 + }, + { + "epoch": 1.1812066856909906, + "grad_norm": 0.1629822552204132, + "learning_rate": 8.524425873080581e-05, + "loss": 3.7236, + "step": 17385 + }, + { + "epoch": 1.1815464057616525, + "grad_norm": 0.19388322532176971, + "learning_rate": 8.524001222992255e-05, + "loss": 3.815, + "step": 17390 + }, + { + "epoch": 1.1818861258323141, + "grad_norm": 0.2354399710893631, + "learning_rate": 8.523576572903928e-05, + "loss": 3.7392, + "step": 17395 + }, + { + "epoch": 1.182225845902976, + "grad_norm": 0.15893875062465668, + "learning_rate": 8.523151922815599e-05, + "loss": 3.7614, + "step": 17400 + }, + { + "epoch": 1.1825655659736376, + "grad_norm": 0.20125453174114227, + "learning_rate": 8.522727272727273e-05, + "loss": 3.6865, + "step": 17405 + }, + { + "epoch": 1.1829052860442995, + "grad_norm": 0.1618742197751999, + "learning_rate": 8.522302622638946e-05, + "loss": 3.9964, + "step": 17410 + }, + { + "epoch": 1.1832450061149613, + "grad_norm": 0.14728790521621704, + "learning_rate": 8.521877972550618e-05, + "loss": 3.8379, + "step": 17415 + }, + { + "epoch": 1.183584726185623, + "grad_norm": 3.493596315383911, + "learning_rate": 8.521453322462292e-05, + "loss": 3.8575, + "step": 17420 + }, + { + "epoch": 1.1839244462562848, + "grad_norm": 0.1597021073102951, + "learning_rate": 8.521028672373965e-05, + "loss": 3.844, + "step": 17425 + }, + { + "epoch": 1.1842641663269466, + "grad_norm": 0.5359874367713928, + "learning_rate": 8.520604022285637e-05, + "loss": 4.076, + "step": 17430 + }, + { + "epoch": 1.1846038863976083, + "grad_norm": 0.1780054271221161, + "learning_rate": 8.52017937219731e-05, + "loss": 4.1624, + "step": 17435 + }, + { + "epoch": 1.1849436064682701, + "grad_norm": 0.33140063285827637, + "learning_rate": 8.519754722108983e-05, + "loss": 3.8802, + "step": 17440 + }, + { + "epoch": 1.185283326538932, + "grad_norm": 0.18705520033836365, + "learning_rate": 8.519330072020656e-05, + "loss": 3.8147, + "step": 17445 + }, + { + "epoch": 1.1856230466095936, + "grad_norm": 0.18762825429439545, + "learning_rate": 8.518905421932329e-05, + "loss": 3.8554, + "step": 17450 + }, + { + "epoch": 1.1859627666802555, + "grad_norm": 0.18581523001194, + "learning_rate": 8.518480771844001e-05, + "loss": 4.0329, + "step": 17455 + }, + { + "epoch": 1.1863024867509173, + "grad_norm": 0.15572847425937653, + "learning_rate": 8.518056121755674e-05, + "loss": 3.9357, + "step": 17460 + }, + { + "epoch": 1.186642206821579, + "grad_norm": 0.2783101797103882, + "learning_rate": 8.517631471667347e-05, + "loss": 4.0425, + "step": 17465 + }, + { + "epoch": 1.1869819268922408, + "grad_norm": 0.1746896654367447, + "learning_rate": 8.517206821579018e-05, + "loss": 3.9771, + "step": 17470 + }, + { + "epoch": 1.1873216469629027, + "grad_norm": 0.19351759552955627, + "learning_rate": 8.516782171490693e-05, + "loss": 3.5136, + "step": 17475 + }, + { + "epoch": 1.1876613670335643, + "grad_norm": 0.18616865575313568, + "learning_rate": 8.516357521402365e-05, + "loss": 4.0759, + "step": 17480 + }, + { + "epoch": 1.1880010871042261, + "grad_norm": 0.3031885027885437, + "learning_rate": 8.515932871314037e-05, + "loss": 4.0218, + "step": 17485 + }, + { + "epoch": 1.188340807174888, + "grad_norm": 0.20433250069618225, + "learning_rate": 8.515508221225711e-05, + "loss": 3.5988, + "step": 17490 + }, + { + "epoch": 1.1886805272455496, + "grad_norm": 0.20006012916564941, + "learning_rate": 8.515083571137384e-05, + "loss": 3.7465, + "step": 17495 + }, + { + "epoch": 1.1890202473162115, + "grad_norm": 0.9421390295028687, + "learning_rate": 8.514658921049055e-05, + "loss": 3.7584, + "step": 17500 + }, + { + "epoch": 1.1893599673868733, + "grad_norm": 0.14888514578342438, + "learning_rate": 8.51423427096073e-05, + "loss": 3.8037, + "step": 17505 + }, + { + "epoch": 1.189699687457535, + "grad_norm": 0.14448173344135284, + "learning_rate": 8.513809620872402e-05, + "loss": 3.8154, + "step": 17510 + }, + { + "epoch": 1.1900394075281968, + "grad_norm": 0.19571180641651154, + "learning_rate": 8.513384970784074e-05, + "loss": 4.2558, + "step": 17515 + }, + { + "epoch": 1.1903791275988584, + "grad_norm": 0.19125308096408844, + "learning_rate": 8.512960320695748e-05, + "loss": 3.8016, + "step": 17520 + }, + { + "epoch": 1.1907188476695203, + "grad_norm": 0.32165583968162537, + "learning_rate": 8.51253567060742e-05, + "loss": 3.8769, + "step": 17525 + }, + { + "epoch": 1.1910585677401822, + "grad_norm": 0.1729317307472229, + "learning_rate": 8.512111020519092e-05, + "loss": 3.9856, + "step": 17530 + }, + { + "epoch": 1.1913982878108438, + "grad_norm": 0.17354346811771393, + "learning_rate": 8.511686370430766e-05, + "loss": 4.0383, + "step": 17535 + }, + { + "epoch": 1.1917380078815056, + "grad_norm": 0.17091649770736694, + "learning_rate": 8.511261720342438e-05, + "loss": 3.9232, + "step": 17540 + }, + { + "epoch": 1.1920777279521675, + "grad_norm": 0.1935364007949829, + "learning_rate": 8.51083707025411e-05, + "loss": 4.0085, + "step": 17545 + }, + { + "epoch": 1.1924174480228291, + "grad_norm": 0.18614515662193298, + "learning_rate": 8.510412420165785e-05, + "loss": 4.1707, + "step": 17550 + }, + { + "epoch": 1.192757168093491, + "grad_norm": 0.2323632836341858, + "learning_rate": 8.509987770077456e-05, + "loss": 3.9742, + "step": 17555 + }, + { + "epoch": 1.1930968881641528, + "grad_norm": 0.369140088558197, + "learning_rate": 8.509563119989129e-05, + "loss": 3.9275, + "step": 17560 + }, + { + "epoch": 1.1934366082348145, + "grad_norm": 0.1732921600341797, + "learning_rate": 8.509138469900803e-05, + "loss": 3.6527, + "step": 17565 + }, + { + "epoch": 1.1937763283054763, + "grad_norm": 0.2759207785129547, + "learning_rate": 8.508713819812474e-05, + "loss": 4.0347, + "step": 17570 + }, + { + "epoch": 1.194116048376138, + "grad_norm": 0.20774796605110168, + "learning_rate": 8.508289169724147e-05, + "loss": 3.6112, + "step": 17575 + }, + { + "epoch": 1.1944557684467998, + "grad_norm": 0.3093954026699066, + "learning_rate": 8.507864519635821e-05, + "loss": 3.903, + "step": 17580 + }, + { + "epoch": 1.1947954885174616, + "grad_norm": 0.22841060161590576, + "learning_rate": 8.507439869547493e-05, + "loss": 4.1444, + "step": 17585 + }, + { + "epoch": 1.1951352085881233, + "grad_norm": 0.19268852472305298, + "learning_rate": 8.507015219459166e-05, + "loss": 4.0287, + "step": 17590 + }, + { + "epoch": 1.1954749286587851, + "grad_norm": 0.18337005376815796, + "learning_rate": 8.50659056937084e-05, + "loss": 3.9969, + "step": 17595 + }, + { + "epoch": 1.195814648729447, + "grad_norm": 0.1679060310125351, + "learning_rate": 8.506165919282511e-05, + "loss": 3.9322, + "step": 17600 + }, + { + "epoch": 1.1961543688001086, + "grad_norm": 0.2293335199356079, + "learning_rate": 8.505741269194184e-05, + "loss": 3.8196, + "step": 17605 + }, + { + "epoch": 1.1964940888707705, + "grad_norm": 0.1933789849281311, + "learning_rate": 8.505316619105857e-05, + "loss": 3.5001, + "step": 17610 + }, + { + "epoch": 1.1968338089414323, + "grad_norm": 0.3912888467311859, + "learning_rate": 8.50489196901753e-05, + "loss": 4.1153, + "step": 17615 + }, + { + "epoch": 1.197173529012094, + "grad_norm": 0.17738385498523712, + "learning_rate": 8.504467318929202e-05, + "loss": 3.805, + "step": 17620 + }, + { + "epoch": 1.1975132490827558, + "grad_norm": 0.17735421657562256, + "learning_rate": 8.504042668840875e-05, + "loss": 3.6365, + "step": 17625 + }, + { + "epoch": 1.1978529691534177, + "grad_norm": 0.2203110158443451, + "learning_rate": 8.503618018752548e-05, + "loss": 3.8899, + "step": 17630 + }, + { + "epoch": 1.1981926892240793, + "grad_norm": 0.1532653123140335, + "learning_rate": 8.503193368664221e-05, + "loss": 3.7012, + "step": 17635 + }, + { + "epoch": 1.1985324092947411, + "grad_norm": 0.3337753415107727, + "learning_rate": 8.502768718575894e-05, + "loss": 4.0104, + "step": 17640 + }, + { + "epoch": 1.198872129365403, + "grad_norm": 0.33435937762260437, + "learning_rate": 8.502344068487566e-05, + "loss": 3.9224, + "step": 17645 + }, + { + "epoch": 1.1992118494360646, + "grad_norm": 0.1799783706665039, + "learning_rate": 8.501919418399239e-05, + "loss": 3.9293, + "step": 17650 + }, + { + "epoch": 1.1995515695067265, + "grad_norm": 0.1757258176803589, + "learning_rate": 8.501494768310912e-05, + "loss": 3.9612, + "step": 17655 + }, + { + "epoch": 1.1998912895773883, + "grad_norm": 0.16812027990818024, + "learning_rate": 8.501070118222585e-05, + "loss": 3.872, + "step": 17660 + }, + { + "epoch": 1.20023100964805, + "grad_norm": 0.20549772679805756, + "learning_rate": 8.500645468134258e-05, + "loss": 4.0602, + "step": 17665 + }, + { + "epoch": 1.2005707297187118, + "grad_norm": 0.15647916495800018, + "learning_rate": 8.50022081804593e-05, + "loss": 3.807, + "step": 17670 + }, + { + "epoch": 1.2009104497893737, + "grad_norm": 0.18400496244430542, + "learning_rate": 8.499796167957603e-05, + "loss": 3.8232, + "step": 17675 + }, + { + "epoch": 1.2012501698600353, + "grad_norm": 0.15180744230747223, + "learning_rate": 8.499371517869276e-05, + "loss": 3.6575, + "step": 17680 + }, + { + "epoch": 1.2015898899306972, + "grad_norm": 0.24149100482463837, + "learning_rate": 8.498946867780949e-05, + "loss": 4.0151, + "step": 17685 + }, + { + "epoch": 1.2019296100013588, + "grad_norm": 0.158351868391037, + "learning_rate": 8.498522217692622e-05, + "loss": 4.1815, + "step": 17690 + }, + { + "epoch": 1.2022693300720206, + "grad_norm": 0.15415014326572418, + "learning_rate": 8.498097567604294e-05, + "loss": 3.7625, + "step": 17695 + }, + { + "epoch": 1.2026090501426825, + "grad_norm": 0.13916771113872528, + "learning_rate": 8.497672917515967e-05, + "loss": 3.8619, + "step": 17700 + }, + { + "epoch": 1.2029487702133441, + "grad_norm": 0.17022761702537537, + "learning_rate": 8.49724826742764e-05, + "loss": 3.6778, + "step": 17705 + }, + { + "epoch": 1.203288490284006, + "grad_norm": 0.1659051477909088, + "learning_rate": 8.496823617339313e-05, + "loss": 4.1978, + "step": 17710 + }, + { + "epoch": 1.2036282103546678, + "grad_norm": 0.17350874841213226, + "learning_rate": 8.496398967250986e-05, + "loss": 3.6829, + "step": 17715 + }, + { + "epoch": 1.2039679304253295, + "grad_norm": 0.1609283834695816, + "learning_rate": 8.495974317162658e-05, + "loss": 4.1243, + "step": 17720 + }, + { + "epoch": 1.2043076504959913, + "grad_norm": 0.15750306844711304, + "learning_rate": 8.495549667074331e-05, + "loss": 4.0847, + "step": 17725 + }, + { + "epoch": 1.2046473705666532, + "grad_norm": 0.19416692852973938, + "learning_rate": 8.495125016986004e-05, + "loss": 3.9838, + "step": 17730 + }, + { + "epoch": 1.2049870906373148, + "grad_norm": 0.6967943906784058, + "learning_rate": 8.494700366897677e-05, + "loss": 3.9679, + "step": 17735 + }, + { + "epoch": 1.2053268107079766, + "grad_norm": 0.1592290997505188, + "learning_rate": 8.494275716809348e-05, + "loss": 3.7359, + "step": 17740 + }, + { + "epoch": 1.2056665307786383, + "grad_norm": 0.154512420296669, + "learning_rate": 8.493851066721022e-05, + "loss": 3.9136, + "step": 17745 + }, + { + "epoch": 1.2060062508493001, + "grad_norm": 0.1962956339120865, + "learning_rate": 8.493426416632695e-05, + "loss": 4.0739, + "step": 17750 + }, + { + "epoch": 1.206345970919962, + "grad_norm": 0.23174850642681122, + "learning_rate": 8.493001766544367e-05, + "loss": 3.7593, + "step": 17755 + }, + { + "epoch": 1.2066856909906236, + "grad_norm": 0.24617072939872742, + "learning_rate": 8.492577116456041e-05, + "loss": 3.9182, + "step": 17760 + }, + { + "epoch": 1.2070254110612855, + "grad_norm": 0.15417498350143433, + "learning_rate": 8.492152466367714e-05, + "loss": 4.012, + "step": 17765 + }, + { + "epoch": 1.2073651311319473, + "grad_norm": 0.18142859637737274, + "learning_rate": 8.491727816279386e-05, + "loss": 3.6762, + "step": 17770 + }, + { + "epoch": 1.207704851202609, + "grad_norm": 0.17556129395961761, + "learning_rate": 8.491303166191059e-05, + "loss": 4.0146, + "step": 17775 + }, + { + "epoch": 1.2080445712732708, + "grad_norm": 0.23099839687347412, + "learning_rate": 8.490878516102732e-05, + "loss": 3.9705, + "step": 17780 + }, + { + "epoch": 1.2083842913439327, + "grad_norm": 0.22976244986057281, + "learning_rate": 8.490453866014405e-05, + "loss": 3.8786, + "step": 17785 + }, + { + "epoch": 1.2087240114145943, + "grad_norm": 0.22878305613994598, + "learning_rate": 8.490029215926078e-05, + "loss": 4.2516, + "step": 17790 + }, + { + "epoch": 1.2090637314852561, + "grad_norm": 0.17479437589645386, + "learning_rate": 8.48960456583775e-05, + "loss": 3.9144, + "step": 17795 + }, + { + "epoch": 1.209403451555918, + "grad_norm": 0.1926736831665039, + "learning_rate": 8.489179915749423e-05, + "loss": 4.1098, + "step": 17800 + }, + { + "epoch": 1.2097431716265796, + "grad_norm": 0.18325889110565186, + "learning_rate": 8.488755265661096e-05, + "loss": 3.7186, + "step": 17805 + }, + { + "epoch": 1.2100828916972415, + "grad_norm": 0.22507767379283905, + "learning_rate": 8.488330615572768e-05, + "loss": 3.8564, + "step": 17810 + }, + { + "epoch": 1.2104226117679033, + "grad_norm": 0.2739925980567932, + "learning_rate": 8.487905965484442e-05, + "loss": 4.0392, + "step": 17815 + }, + { + "epoch": 1.210762331838565, + "grad_norm": 0.19921007752418518, + "learning_rate": 8.487481315396115e-05, + "loss": 3.9244, + "step": 17820 + }, + { + "epoch": 1.2111020519092268, + "grad_norm": 0.35412123799324036, + "learning_rate": 8.487056665307786e-05, + "loss": 4.0252, + "step": 17825 + }, + { + "epoch": 1.2114417719798887, + "grad_norm": 0.14085720479488373, + "learning_rate": 8.48663201521946e-05, + "loss": 3.9533, + "step": 17830 + }, + { + "epoch": 1.2117814920505503, + "grad_norm": 0.1868872493505478, + "learning_rate": 8.486207365131133e-05, + "loss": 3.9201, + "step": 17835 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.20532390475273132, + "learning_rate": 8.485782715042804e-05, + "loss": 4.0948, + "step": 17840 + }, + { + "epoch": 1.212460932191874, + "grad_norm": 2.066826105117798, + "learning_rate": 8.485358064954479e-05, + "loss": 4.0192, + "step": 17845 + }, + { + "epoch": 1.2128006522625356, + "grad_norm": 0.21731826663017273, + "learning_rate": 8.484933414866151e-05, + "loss": 3.9632, + "step": 17850 + }, + { + "epoch": 1.2131403723331975, + "grad_norm": 0.20806385576725006, + "learning_rate": 8.484508764777823e-05, + "loss": 4.1693, + "step": 17855 + }, + { + "epoch": 1.2134800924038591, + "grad_norm": 0.1966964602470398, + "learning_rate": 8.484084114689497e-05, + "loss": 3.8883, + "step": 17860 + }, + { + "epoch": 1.213819812474521, + "grad_norm": 0.1501649171113968, + "learning_rate": 8.48365946460117e-05, + "loss": 4.0365, + "step": 17865 + }, + { + "epoch": 1.2141595325451828, + "grad_norm": 0.19150406122207642, + "learning_rate": 8.483234814512841e-05, + "loss": 3.7534, + "step": 17870 + }, + { + "epoch": 1.2144992526158445, + "grad_norm": 0.17300066351890564, + "learning_rate": 8.482810164424515e-05, + "loss": 4.1593, + "step": 17875 + }, + { + "epoch": 1.2148389726865063, + "grad_norm": 3.2041115760803223, + "learning_rate": 8.482385514336188e-05, + "loss": 3.9071, + "step": 17880 + }, + { + "epoch": 1.2151786927571682, + "grad_norm": 0.1739591807126999, + "learning_rate": 8.48196086424786e-05, + "loss": 3.8652, + "step": 17885 + }, + { + "epoch": 1.2155184128278298, + "grad_norm": 0.7555450797080994, + "learning_rate": 8.481536214159534e-05, + "loss": 3.9528, + "step": 17890 + }, + { + "epoch": 1.2158581328984917, + "grad_norm": 0.1411188691854477, + "learning_rate": 8.481111564071205e-05, + "loss": 4.1016, + "step": 17895 + }, + { + "epoch": 1.2161978529691535, + "grad_norm": 0.16973358392715454, + "learning_rate": 8.480686913982878e-05, + "loss": 4.085, + "step": 17900 + }, + { + "epoch": 1.2165375730398151, + "grad_norm": 0.2102062851190567, + "learning_rate": 8.480262263894552e-05, + "loss": 3.8983, + "step": 17905 + }, + { + "epoch": 1.216877293110477, + "grad_norm": 0.16676536202430725, + "learning_rate": 8.479837613806224e-05, + "loss": 3.7503, + "step": 17910 + }, + { + "epoch": 1.2172170131811386, + "grad_norm": 0.31023889780044556, + "learning_rate": 8.479412963717896e-05, + "loss": 3.9107, + "step": 17915 + }, + { + "epoch": 1.2175567332518005, + "grad_norm": 0.1585465371608734, + "learning_rate": 8.47898831362957e-05, + "loss": 4.0752, + "step": 17920 + }, + { + "epoch": 1.2178964533224623, + "grad_norm": 0.18549667298793793, + "learning_rate": 8.478563663541242e-05, + "loss": 4.0288, + "step": 17925 + }, + { + "epoch": 1.218236173393124, + "grad_norm": 0.2188732922077179, + "learning_rate": 8.478139013452915e-05, + "loss": 3.8021, + "step": 17930 + }, + { + "epoch": 1.2185758934637858, + "grad_norm": 0.1689499467611313, + "learning_rate": 8.477714363364589e-05, + "loss": 4.1232, + "step": 17935 + }, + { + "epoch": 1.2189156135344477, + "grad_norm": 0.26732397079467773, + "learning_rate": 8.47728971327626e-05, + "loss": 3.9163, + "step": 17940 + }, + { + "epoch": 1.2192553336051093, + "grad_norm": 0.14491264522075653, + "learning_rate": 8.476865063187933e-05, + "loss": 4.1709, + "step": 17945 + }, + { + "epoch": 1.2195950536757711, + "grad_norm": 0.22153352200984955, + "learning_rate": 8.476440413099607e-05, + "loss": 3.7844, + "step": 17950 + }, + { + "epoch": 1.219934773746433, + "grad_norm": 0.579850971698761, + "learning_rate": 8.476015763011279e-05, + "loss": 3.9282, + "step": 17955 + }, + { + "epoch": 1.2202744938170946, + "grad_norm": 0.18854564428329468, + "learning_rate": 8.475591112922952e-05, + "loss": 3.988, + "step": 17960 + }, + { + "epoch": 1.2206142138877565, + "grad_norm": 0.17499446868896484, + "learning_rate": 8.475166462834624e-05, + "loss": 3.8036, + "step": 17965 + }, + { + "epoch": 1.2209539339584183, + "grad_norm": 0.14125065505504608, + "learning_rate": 8.474741812746297e-05, + "loss": 3.9113, + "step": 17970 + }, + { + "epoch": 1.22129365402908, + "grad_norm": 0.19082510471343994, + "learning_rate": 8.47431716265797e-05, + "loss": 3.967, + "step": 17975 + }, + { + "epoch": 1.2216333740997418, + "grad_norm": 1.026790976524353, + "learning_rate": 8.473892512569643e-05, + "loss": 3.9518, + "step": 17980 + }, + { + "epoch": 1.2219730941704037, + "grad_norm": 0.28648123145103455, + "learning_rate": 8.473467862481316e-05, + "loss": 3.6492, + "step": 17985 + }, + { + "epoch": 1.2223128142410653, + "grad_norm": 0.19966855645179749, + "learning_rate": 8.473043212392988e-05, + "loss": 4.0734, + "step": 17990 + }, + { + "epoch": 1.2226525343117272, + "grad_norm": 0.26493343710899353, + "learning_rate": 8.472618562304661e-05, + "loss": 4.0057, + "step": 17995 + }, + { + "epoch": 1.222992254382389, + "grad_norm": 0.3501884341239929, + "learning_rate": 8.472193912216334e-05, + "loss": 3.8788, + "step": 18000 + }, + { + "epoch": 1.2233319744530506, + "grad_norm": 0.19281838834285736, + "learning_rate": 8.471769262128007e-05, + "loss": 3.872, + "step": 18005 + }, + { + "epoch": 1.2236716945237125, + "grad_norm": 0.1932147890329361, + "learning_rate": 8.47134461203968e-05, + "loss": 3.8303, + "step": 18010 + }, + { + "epoch": 1.2240114145943743, + "grad_norm": 0.15256324410438538, + "learning_rate": 8.470919961951352e-05, + "loss": 3.984, + "step": 18015 + }, + { + "epoch": 1.224351134665036, + "grad_norm": 0.15034160017967224, + "learning_rate": 8.470495311863025e-05, + "loss": 4.0771, + "step": 18020 + }, + { + "epoch": 1.2246908547356978, + "grad_norm": 0.1738007664680481, + "learning_rate": 8.470070661774698e-05, + "loss": 3.8791, + "step": 18025 + }, + { + "epoch": 1.2250305748063597, + "grad_norm": 0.17952094972133636, + "learning_rate": 8.469646011686371e-05, + "loss": 3.7805, + "step": 18030 + }, + { + "epoch": 1.2253702948770213, + "grad_norm": 0.18394187092781067, + "learning_rate": 8.469221361598044e-05, + "loss": 4.172, + "step": 18035 + }, + { + "epoch": 1.2257100149476832, + "grad_norm": 0.2223910391330719, + "learning_rate": 8.468796711509716e-05, + "loss": 4.0108, + "step": 18040 + }, + { + "epoch": 1.2260497350183448, + "grad_norm": 0.5326086282730103, + "learning_rate": 8.468372061421389e-05, + "loss": 4.0129, + "step": 18045 + }, + { + "epoch": 1.2263894550890067, + "grad_norm": 0.17702655494213104, + "learning_rate": 8.467947411333062e-05, + "loss": 3.9277, + "step": 18050 + }, + { + "epoch": 1.2267291751596685, + "grad_norm": 0.2266564965248108, + "learning_rate": 8.467522761244735e-05, + "loss": 4.0063, + "step": 18055 + }, + { + "epoch": 1.2270688952303301, + "grad_norm": 1.4619090557098389, + "learning_rate": 8.467098111156408e-05, + "loss": 3.6856, + "step": 18060 + }, + { + "epoch": 1.227408615300992, + "grad_norm": 0.1942298710346222, + "learning_rate": 8.46667346106808e-05, + "loss": 3.9168, + "step": 18065 + }, + { + "epoch": 1.2277483353716538, + "grad_norm": 0.18967270851135254, + "learning_rate": 8.466248810979753e-05, + "loss": 4.068, + "step": 18070 + }, + { + "epoch": 1.2280880554423155, + "grad_norm": 0.20180198550224304, + "learning_rate": 8.465824160891426e-05, + "loss": 4.0818, + "step": 18075 + }, + { + "epoch": 1.2284277755129773, + "grad_norm": 0.20485012233257294, + "learning_rate": 8.465399510803099e-05, + "loss": 3.8413, + "step": 18080 + }, + { + "epoch": 1.228767495583639, + "grad_norm": 0.18021096289157867, + "learning_rate": 8.464974860714772e-05, + "loss": 3.8718, + "step": 18085 + }, + { + "epoch": 1.2291072156543008, + "grad_norm": 0.19677716493606567, + "learning_rate": 8.464550210626444e-05, + "loss": 4.0483, + "step": 18090 + }, + { + "epoch": 1.2294469357249627, + "grad_norm": 0.21937116980552673, + "learning_rate": 8.464125560538116e-05, + "loss": 3.9051, + "step": 18095 + }, + { + "epoch": 1.2297866557956243, + "grad_norm": 0.1341472715139389, + "learning_rate": 8.46370091044979e-05, + "loss": 4.2012, + "step": 18100 + }, + { + "epoch": 1.2301263758662861, + "grad_norm": 0.1738073229789734, + "learning_rate": 8.463276260361463e-05, + "loss": 3.9627, + "step": 18105 + }, + { + "epoch": 1.230466095936948, + "grad_norm": 3.2245421409606934, + "learning_rate": 8.462851610273136e-05, + "loss": 3.9218, + "step": 18110 + }, + { + "epoch": 1.2308058160076096, + "grad_norm": 0.145183265209198, + "learning_rate": 8.462426960184808e-05, + "loss": 4.1924, + "step": 18115 + }, + { + "epoch": 1.2311455360782715, + "grad_norm": 0.1976163387298584, + "learning_rate": 8.462002310096481e-05, + "loss": 4.1517, + "step": 18120 + }, + { + "epoch": 1.2314852561489333, + "grad_norm": 0.20109686255455017, + "learning_rate": 8.461577660008154e-05, + "loss": 3.6884, + "step": 18125 + }, + { + "epoch": 1.231824976219595, + "grad_norm": 2.8102073669433594, + "learning_rate": 8.461153009919827e-05, + "loss": 3.8938, + "step": 18130 + }, + { + "epoch": 1.2321646962902568, + "grad_norm": 0.18502961099147797, + "learning_rate": 8.4607283598315e-05, + "loss": 3.973, + "step": 18135 + }, + { + "epoch": 1.2325044163609187, + "grad_norm": 0.19922541081905365, + "learning_rate": 8.460303709743172e-05, + "loss": 3.9667, + "step": 18140 + }, + { + "epoch": 1.2328441364315803, + "grad_norm": 0.2200602889060974, + "learning_rate": 8.459879059654845e-05, + "loss": 3.9626, + "step": 18145 + }, + { + "epoch": 1.2331838565022422, + "grad_norm": 0.17551837861537933, + "learning_rate": 8.459454409566518e-05, + "loss": 3.9075, + "step": 18150 + }, + { + "epoch": 1.233523576572904, + "grad_norm": 0.20272567868232727, + "learning_rate": 8.459029759478191e-05, + "loss": 3.747, + "step": 18155 + }, + { + "epoch": 1.2338632966435656, + "grad_norm": 0.19307580590248108, + "learning_rate": 8.458605109389864e-05, + "loss": 3.8978, + "step": 18160 + }, + { + "epoch": 1.2342030167142275, + "grad_norm": 0.3040013313293457, + "learning_rate": 8.458180459301535e-05, + "loss": 3.8423, + "step": 18165 + }, + { + "epoch": 1.2345427367848893, + "grad_norm": 0.21311868727207184, + "learning_rate": 8.457755809213209e-05, + "loss": 3.8674, + "step": 18170 + }, + { + "epoch": 1.234882456855551, + "grad_norm": 0.20960110425949097, + "learning_rate": 8.457331159124882e-05, + "loss": 3.9625, + "step": 18175 + }, + { + "epoch": 1.2352221769262128, + "grad_norm": 0.1869891732931137, + "learning_rate": 8.456906509036553e-05, + "loss": 3.9233, + "step": 18180 + }, + { + "epoch": 1.2355618969968747, + "grad_norm": 0.21703138947486877, + "learning_rate": 8.456481858948228e-05, + "loss": 3.9846, + "step": 18185 + }, + { + "epoch": 1.2359016170675363, + "grad_norm": 0.16441896557807922, + "learning_rate": 8.4560572088599e-05, + "loss": 3.8119, + "step": 18190 + }, + { + "epoch": 1.2362413371381982, + "grad_norm": 0.37163805961608887, + "learning_rate": 8.455632558771572e-05, + "loss": 4.03, + "step": 18195 + }, + { + "epoch": 1.23658105720886, + "grad_norm": 0.25074121356010437, + "learning_rate": 8.455207908683246e-05, + "loss": 4.0184, + "step": 18200 + }, + { + "epoch": 1.2369207772795217, + "grad_norm": 0.21786735951900482, + "learning_rate": 8.454783258594919e-05, + "loss": 3.7732, + "step": 18205 + }, + { + "epoch": 1.2372604973501835, + "grad_norm": 0.24981912970542908, + "learning_rate": 8.45435860850659e-05, + "loss": 3.8362, + "step": 18210 + }, + { + "epoch": 1.2376002174208451, + "grad_norm": 0.15977716445922852, + "learning_rate": 8.453933958418264e-05, + "loss": 3.9227, + "step": 18215 + }, + { + "epoch": 1.237939937491507, + "grad_norm": 0.1689864695072174, + "learning_rate": 8.453509308329937e-05, + "loss": 3.773, + "step": 18220 + }, + { + "epoch": 1.2382796575621688, + "grad_norm": 0.18549127876758575, + "learning_rate": 8.453084658241609e-05, + "loss": 4.1043, + "step": 18225 + }, + { + "epoch": 1.2386193776328305, + "grad_norm": 0.18490347266197205, + "learning_rate": 8.452660008153283e-05, + "loss": 3.9516, + "step": 18230 + }, + { + "epoch": 1.2389590977034923, + "grad_norm": 0.1979372352361679, + "learning_rate": 8.452235358064954e-05, + "loss": 3.9929, + "step": 18235 + }, + { + "epoch": 1.2392988177741542, + "grad_norm": 0.1324065774679184, + "learning_rate": 8.451810707976627e-05, + "loss": 3.9382, + "step": 18240 + }, + { + "epoch": 1.2396385378448158, + "grad_norm": 0.21047401428222656, + "learning_rate": 8.451386057888301e-05, + "loss": 3.7973, + "step": 18245 + }, + { + "epoch": 1.2399782579154777, + "grad_norm": 0.24633465707302094, + "learning_rate": 8.450961407799973e-05, + "loss": 3.907, + "step": 18250 + }, + { + "epoch": 1.2403179779861393, + "grad_norm": 0.20173491537570953, + "learning_rate": 8.450536757711645e-05, + "loss": 3.9064, + "step": 18255 + }, + { + "epoch": 1.2406576980568012, + "grad_norm": 0.167512446641922, + "learning_rate": 8.45011210762332e-05, + "loss": 3.8382, + "step": 18260 + }, + { + "epoch": 1.240997418127463, + "grad_norm": 0.16559776663780212, + "learning_rate": 8.449687457534991e-05, + "loss": 4.1323, + "step": 18265 + }, + { + "epoch": 1.2413371381981246, + "grad_norm": 0.16953441500663757, + "learning_rate": 8.449262807446664e-05, + "loss": 3.7325, + "step": 18270 + }, + { + "epoch": 1.2416768582687865, + "grad_norm": 0.33102941513061523, + "learning_rate": 8.448838157358338e-05, + "loss": 3.7704, + "step": 18275 + }, + { + "epoch": 1.2420165783394483, + "grad_norm": 0.17240281403064728, + "learning_rate": 8.44841350727001e-05, + "loss": 3.9884, + "step": 18280 + }, + { + "epoch": 1.24235629841011, + "grad_norm": 0.1833924800157547, + "learning_rate": 8.447988857181682e-05, + "loss": 4.1975, + "step": 18285 + }, + { + "epoch": 1.2426960184807718, + "grad_norm": 0.2438492476940155, + "learning_rate": 8.447564207093356e-05, + "loss": 3.8595, + "step": 18290 + }, + { + "epoch": 1.2430357385514337, + "grad_norm": 0.17314963042736053, + "learning_rate": 8.447139557005028e-05, + "loss": 3.86, + "step": 18295 + }, + { + "epoch": 1.2433754586220953, + "grad_norm": 0.20306158065795898, + "learning_rate": 8.4467149069167e-05, + "loss": 3.9896, + "step": 18300 + }, + { + "epoch": 1.2437151786927572, + "grad_norm": 0.1843825876712799, + "learning_rate": 8.446290256828375e-05, + "loss": 3.9643, + "step": 18305 + }, + { + "epoch": 1.244054898763419, + "grad_norm": 0.4020126461982727, + "learning_rate": 8.445865606740046e-05, + "loss": 4.0843, + "step": 18310 + }, + { + "epoch": 1.2443946188340806, + "grad_norm": 0.17876341938972473, + "learning_rate": 8.445440956651719e-05, + "loss": 4.0539, + "step": 18315 + }, + { + "epoch": 1.2447343389047425, + "grad_norm": 0.3112374246120453, + "learning_rate": 8.445016306563392e-05, + "loss": 3.9346, + "step": 18320 + }, + { + "epoch": 1.2450740589754044, + "grad_norm": 0.16753847897052765, + "learning_rate": 8.444591656475065e-05, + "loss": 3.8759, + "step": 18325 + }, + { + "epoch": 1.245413779046066, + "grad_norm": 0.1748908907175064, + "learning_rate": 8.444167006386737e-05, + "loss": 3.6978, + "step": 18330 + }, + { + "epoch": 1.2457534991167278, + "grad_norm": 0.22233617305755615, + "learning_rate": 8.44374235629841e-05, + "loss": 3.9539, + "step": 18335 + }, + { + "epoch": 1.2460932191873897, + "grad_norm": 0.3607032895088196, + "learning_rate": 8.443317706210083e-05, + "loss": 3.8774, + "step": 18340 + }, + { + "epoch": 1.2464329392580513, + "grad_norm": 0.17075949907302856, + "learning_rate": 8.442893056121756e-05, + "loss": 3.8618, + "step": 18345 + }, + { + "epoch": 1.2467726593287132, + "grad_norm": 0.3087412416934967, + "learning_rate": 8.442468406033429e-05, + "loss": 3.8636, + "step": 18350 + }, + { + "epoch": 1.247112379399375, + "grad_norm": 0.2744675576686859, + "learning_rate": 8.442043755945101e-05, + "loss": 4.2295, + "step": 18355 + }, + { + "epoch": 1.2474520994700367, + "grad_norm": 0.24336329102516174, + "learning_rate": 8.441619105856774e-05, + "loss": 3.9766, + "step": 18360 + }, + { + "epoch": 1.2477918195406985, + "grad_norm": 0.13749198615550995, + "learning_rate": 8.441194455768447e-05, + "loss": 3.9386, + "step": 18365 + }, + { + "epoch": 1.2481315396113604, + "grad_norm": 0.17800405621528625, + "learning_rate": 8.44076980568012e-05, + "loss": 3.7504, + "step": 18370 + }, + { + "epoch": 1.248471259682022, + "grad_norm": 0.4344328045845032, + "learning_rate": 8.440345155591793e-05, + "loss": 4.0392, + "step": 18375 + }, + { + "epoch": 1.2488109797526838, + "grad_norm": 0.17533355951309204, + "learning_rate": 8.439920505503465e-05, + "loss": 3.6497, + "step": 18380 + }, + { + "epoch": 1.2491506998233455, + "grad_norm": 0.3843276798725128, + "learning_rate": 8.439495855415138e-05, + "loss": 4.0964, + "step": 18385 + }, + { + "epoch": 1.2494904198940073, + "grad_norm": 0.25363999605178833, + "learning_rate": 8.439071205326811e-05, + "loss": 3.8126, + "step": 18390 + }, + { + "epoch": 1.2498301399646692, + "grad_norm": 0.1518462747335434, + "learning_rate": 8.438646555238484e-05, + "loss": 3.8254, + "step": 18395 + }, + { + "epoch": 1.2501698600353308, + "grad_norm": 0.5754070281982422, + "learning_rate": 8.438221905150157e-05, + "loss": 4.0118, + "step": 18400 + }, + { + "epoch": 1.2505095801059927, + "grad_norm": 0.15858344733715057, + "learning_rate": 8.43779725506183e-05, + "loss": 4.0973, + "step": 18405 + }, + { + "epoch": 1.2508493001766543, + "grad_norm": 0.15595321357250214, + "learning_rate": 8.437372604973502e-05, + "loss": 4.0203, + "step": 18410 + }, + { + "epoch": 1.2511890202473162, + "grad_norm": 0.16931505501270294, + "learning_rate": 8.436947954885175e-05, + "loss": 3.7103, + "step": 18415 + }, + { + "epoch": 1.251528740317978, + "grad_norm": 0.1743667721748352, + "learning_rate": 8.436523304796848e-05, + "loss": 3.8191, + "step": 18420 + }, + { + "epoch": 1.2518684603886396, + "grad_norm": 0.18150494992733002, + "learning_rate": 8.436098654708521e-05, + "loss": 3.9796, + "step": 18425 + }, + { + "epoch": 1.2522081804593015, + "grad_norm": 0.24144500494003296, + "learning_rate": 8.435674004620193e-05, + "loss": 3.7808, + "step": 18430 + }, + { + "epoch": 1.2525479005299633, + "grad_norm": 0.557348370552063, + "learning_rate": 8.435249354531865e-05, + "loss": 4.0165, + "step": 18435 + }, + { + "epoch": 1.252887620600625, + "grad_norm": 0.19793453812599182, + "learning_rate": 8.434824704443539e-05, + "loss": 3.8958, + "step": 18440 + }, + { + "epoch": 1.2532273406712868, + "grad_norm": 0.17592406272888184, + "learning_rate": 8.434400054355212e-05, + "loss": 3.8217, + "step": 18445 + }, + { + "epoch": 1.2535670607419487, + "grad_norm": 0.17349916696548462, + "learning_rate": 8.433975404266885e-05, + "loss": 4.0341, + "step": 18450 + }, + { + "epoch": 1.2539067808126103, + "grad_norm": 0.1828591674566269, + "learning_rate": 8.433550754178557e-05, + "loss": 3.7779, + "step": 18455 + }, + { + "epoch": 1.2542465008832722, + "grad_norm": 0.19867847859859467, + "learning_rate": 8.43312610409023e-05, + "loss": 4.0222, + "step": 18460 + }, + { + "epoch": 1.254586220953934, + "grad_norm": 0.3628077208995819, + "learning_rate": 8.432701454001903e-05, + "loss": 3.8701, + "step": 18465 + }, + { + "epoch": 1.2549259410245956, + "grad_norm": 0.2046818733215332, + "learning_rate": 8.432276803913576e-05, + "loss": 3.694, + "step": 18470 + }, + { + "epoch": 1.2552656610952575, + "grad_norm": 0.19502213597297668, + "learning_rate": 8.431852153825249e-05, + "loss": 4.0575, + "step": 18475 + }, + { + "epoch": 1.2556053811659194, + "grad_norm": 0.26125508546829224, + "learning_rate": 8.431427503736921e-05, + "loss": 3.9209, + "step": 18480 + }, + { + "epoch": 1.255945101236581, + "grad_norm": 0.1731896549463272, + "learning_rate": 8.431002853648594e-05, + "loss": 3.7342, + "step": 18485 + }, + { + "epoch": 1.2562848213072428, + "grad_norm": 0.20229969918727875, + "learning_rate": 8.430578203560267e-05, + "loss": 3.9123, + "step": 18490 + }, + { + "epoch": 1.2566245413779047, + "grad_norm": 0.2072359323501587, + "learning_rate": 8.43015355347194e-05, + "loss": 3.989, + "step": 18495 + }, + { + "epoch": 1.2569642614485663, + "grad_norm": 0.19396726787090302, + "learning_rate": 8.429728903383613e-05, + "loss": 3.9213, + "step": 18500 + }, + { + "epoch": 1.2573039815192282, + "grad_norm": 0.3580690920352936, + "learning_rate": 8.429304253295286e-05, + "loss": 4.1172, + "step": 18505 + }, + { + "epoch": 1.25764370158989, + "grad_norm": 0.17838728427886963, + "learning_rate": 8.428879603206958e-05, + "loss": 4.046, + "step": 18510 + }, + { + "epoch": 1.2579834216605517, + "grad_norm": 0.19038140773773193, + "learning_rate": 8.428454953118631e-05, + "loss": 4.0819, + "step": 18515 + }, + { + "epoch": 1.2583231417312135, + "grad_norm": 0.6268328428268433, + "learning_rate": 8.428030303030303e-05, + "loss": 3.9927, + "step": 18520 + }, + { + "epoch": 1.2586628618018754, + "grad_norm": 0.19000442326068878, + "learning_rate": 8.427605652941977e-05, + "loss": 4.158, + "step": 18525 + }, + { + "epoch": 1.259002581872537, + "grad_norm": 0.15976901352405548, + "learning_rate": 8.42718100285365e-05, + "loss": 4.0683, + "step": 18530 + }, + { + "epoch": 1.2593423019431988, + "grad_norm": 0.16093571484088898, + "learning_rate": 8.426756352765321e-05, + "loss": 4.1147, + "step": 18535 + }, + { + "epoch": 1.2596820220138607, + "grad_norm": 0.42320576310157776, + "learning_rate": 8.426331702676995e-05, + "loss": 3.842, + "step": 18540 + }, + { + "epoch": 1.2600217420845223, + "grad_norm": 0.18776413798332214, + "learning_rate": 8.425907052588668e-05, + "loss": 4.0674, + "step": 18545 + }, + { + "epoch": 1.2603614621551842, + "grad_norm": 0.165963813662529, + "learning_rate": 8.42548240250034e-05, + "loss": 4.1429, + "step": 18550 + }, + { + "epoch": 1.260701182225846, + "grad_norm": 0.17807848751544952, + "learning_rate": 8.425057752412014e-05, + "loss": 3.8398, + "step": 18555 + }, + { + "epoch": 1.2610409022965077, + "grad_norm": 0.15975359082221985, + "learning_rate": 8.424633102323686e-05, + "loss": 4.1152, + "step": 18560 + }, + { + "epoch": 1.2613806223671695, + "grad_norm": 0.23029714822769165, + "learning_rate": 8.424208452235358e-05, + "loss": 3.9508, + "step": 18565 + }, + { + "epoch": 1.2617203424378312, + "grad_norm": 0.1886504888534546, + "learning_rate": 8.423783802147032e-05, + "loss": 4.0282, + "step": 18570 + }, + { + "epoch": 1.262060062508493, + "grad_norm": 0.17452242970466614, + "learning_rate": 8.423359152058705e-05, + "loss": 3.8075, + "step": 18575 + }, + { + "epoch": 1.2623997825791546, + "grad_norm": 0.17982973158359528, + "learning_rate": 8.422934501970376e-05, + "loss": 3.8094, + "step": 18580 + }, + { + "epoch": 1.2627395026498165, + "grad_norm": 0.2577313482761383, + "learning_rate": 8.42250985188205e-05, + "loss": 3.9233, + "step": 18585 + }, + { + "epoch": 1.2630792227204783, + "grad_norm": 0.1997264176607132, + "learning_rate": 8.422085201793722e-05, + "loss": 4.0151, + "step": 18590 + }, + { + "epoch": 1.26341894279114, + "grad_norm": 0.1584196388721466, + "learning_rate": 8.421660551705395e-05, + "loss": 3.8609, + "step": 18595 + }, + { + "epoch": 1.2637586628618018, + "grad_norm": 0.4707908630371094, + "learning_rate": 8.421235901617069e-05, + "loss": 4.0304, + "step": 18600 + }, + { + "epoch": 1.2640983829324637, + "grad_norm": 0.4373074769973755, + "learning_rate": 8.42081125152874e-05, + "loss": 4.0685, + "step": 18605 + }, + { + "epoch": 1.2644381030031253, + "grad_norm": 0.15299464762210846, + "learning_rate": 8.420386601440413e-05, + "loss": 3.6958, + "step": 18610 + }, + { + "epoch": 1.2647778230737872, + "grad_norm": 0.16645188629627228, + "learning_rate": 8.419961951352087e-05, + "loss": 3.9988, + "step": 18615 + }, + { + "epoch": 1.265117543144449, + "grad_norm": 0.17924442887306213, + "learning_rate": 8.419537301263759e-05, + "loss": 4.2055, + "step": 18620 + }, + { + "epoch": 1.2654572632151107, + "grad_norm": 0.7658303380012512, + "learning_rate": 8.419112651175431e-05, + "loss": 3.8812, + "step": 18625 + }, + { + "epoch": 1.2657969832857725, + "grad_norm": 0.19311122596263885, + "learning_rate": 8.418688001087106e-05, + "loss": 3.6928, + "step": 18630 + }, + { + "epoch": 1.2661367033564344, + "grad_norm": 0.3949373960494995, + "learning_rate": 8.418263350998777e-05, + "loss": 4.183, + "step": 18635 + }, + { + "epoch": 1.266476423427096, + "grad_norm": 0.1995886266231537, + "learning_rate": 8.41783870091045e-05, + "loss": 3.8113, + "step": 18640 + }, + { + "epoch": 1.2668161434977578, + "grad_norm": 0.16361403465270996, + "learning_rate": 8.417414050822124e-05, + "loss": 3.6914, + "step": 18645 + }, + { + "epoch": 1.2671558635684197, + "grad_norm": 0.17496487498283386, + "learning_rate": 8.416989400733795e-05, + "loss": 3.9014, + "step": 18650 + }, + { + "epoch": 1.2674955836390813, + "grad_norm": 0.15764591097831726, + "learning_rate": 8.416564750645468e-05, + "loss": 4.095, + "step": 18655 + }, + { + "epoch": 1.2678353037097432, + "grad_norm": 0.22676768898963928, + "learning_rate": 8.416140100557141e-05, + "loss": 4.2562, + "step": 18660 + }, + { + "epoch": 1.268175023780405, + "grad_norm": 0.22111302614212036, + "learning_rate": 8.415715450468814e-05, + "loss": 3.931, + "step": 18665 + }, + { + "epoch": 1.2685147438510667, + "grad_norm": 3.7777862548828125, + "learning_rate": 8.415290800380487e-05, + "loss": 3.9217, + "step": 18670 + }, + { + "epoch": 1.2688544639217285, + "grad_norm": 0.17876243591308594, + "learning_rate": 8.41486615029216e-05, + "loss": 3.8474, + "step": 18675 + }, + { + "epoch": 1.2691941839923904, + "grad_norm": 0.16093352437019348, + "learning_rate": 8.414441500203832e-05, + "loss": 3.7843, + "step": 18680 + }, + { + "epoch": 1.269533904063052, + "grad_norm": 0.6908741593360901, + "learning_rate": 8.414016850115505e-05, + "loss": 4.1044, + "step": 18685 + }, + { + "epoch": 1.2698736241337139, + "grad_norm": 0.17436876893043518, + "learning_rate": 8.413592200027178e-05, + "loss": 4.1441, + "step": 18690 + }, + { + "epoch": 1.2702133442043757, + "grad_norm": 0.2128446400165558, + "learning_rate": 8.41316754993885e-05, + "loss": 3.965, + "step": 18695 + }, + { + "epoch": 1.2705530642750373, + "grad_norm": 0.4302290976047516, + "learning_rate": 8.412742899850523e-05, + "loss": 4.0176, + "step": 18700 + }, + { + "epoch": 1.2708927843456992, + "grad_norm": 0.19190801680088043, + "learning_rate": 8.412318249762196e-05, + "loss": 3.8455, + "step": 18705 + }, + { + "epoch": 1.271232504416361, + "grad_norm": 0.17315298318862915, + "learning_rate": 8.411893599673869e-05, + "loss": 3.9856, + "step": 18710 + }, + { + "epoch": 1.2715722244870227, + "grad_norm": 0.19651898741722107, + "learning_rate": 8.411468949585542e-05, + "loss": 3.7731, + "step": 18715 + }, + { + "epoch": 1.2719119445576845, + "grad_norm": 0.371863454580307, + "learning_rate": 8.411044299497215e-05, + "loss": 3.8032, + "step": 18720 + }, + { + "epoch": 1.2722516646283464, + "grad_norm": 0.16243654489517212, + "learning_rate": 8.410619649408887e-05, + "loss": 3.9852, + "step": 18725 + }, + { + "epoch": 1.272591384699008, + "grad_norm": 0.1786739081144333, + "learning_rate": 8.41019499932056e-05, + "loss": 3.6946, + "step": 18730 + }, + { + "epoch": 1.2729311047696699, + "grad_norm": 0.15117411315441132, + "learning_rate": 8.409770349232233e-05, + "loss": 3.8749, + "step": 18735 + }, + { + "epoch": 1.2732708248403315, + "grad_norm": 0.16383428871631622, + "learning_rate": 8.409345699143906e-05, + "loss": 3.7454, + "step": 18740 + }, + { + "epoch": 1.2736105449109933, + "grad_norm": 0.683323860168457, + "learning_rate": 8.408921049055579e-05, + "loss": 3.8446, + "step": 18745 + }, + { + "epoch": 1.273950264981655, + "grad_norm": 0.18205972015857697, + "learning_rate": 8.408496398967251e-05, + "loss": 4.0128, + "step": 18750 + }, + { + "epoch": 1.2742899850523168, + "grad_norm": 0.3738502264022827, + "learning_rate": 8.408071748878924e-05, + "loss": 4.0799, + "step": 18755 + }, + { + "epoch": 1.2746297051229787, + "grad_norm": 0.15621113777160645, + "learning_rate": 8.407647098790597e-05, + "loss": 3.9419, + "step": 18760 + }, + { + "epoch": 1.2749694251936403, + "grad_norm": 0.2715875208377838, + "learning_rate": 8.40722244870227e-05, + "loss": 3.8257, + "step": 18765 + }, + { + "epoch": 1.2753091452643022, + "grad_norm": 0.8423426747322083, + "learning_rate": 8.406797798613943e-05, + "loss": 3.7512, + "step": 18770 + }, + { + "epoch": 1.275648865334964, + "grad_norm": 0.16856074333190918, + "learning_rate": 8.406373148525615e-05, + "loss": 3.9037, + "step": 18775 + }, + { + "epoch": 1.2759885854056257, + "grad_norm": 0.5063284635543823, + "learning_rate": 8.405948498437288e-05, + "loss": 4.1521, + "step": 18780 + }, + { + "epoch": 1.2763283054762875, + "grad_norm": 0.20653338730335236, + "learning_rate": 8.405523848348961e-05, + "loss": 4.0435, + "step": 18785 + }, + { + "epoch": 1.2766680255469494, + "grad_norm": 0.21055996417999268, + "learning_rate": 8.405099198260634e-05, + "loss": 4.2214, + "step": 18790 + }, + { + "epoch": 1.277007745617611, + "grad_norm": 0.3211451470851898, + "learning_rate": 8.404674548172307e-05, + "loss": 4.0186, + "step": 18795 + }, + { + "epoch": 1.2773474656882728, + "grad_norm": 0.2250409871339798, + "learning_rate": 8.40424989808398e-05, + "loss": 3.8362, + "step": 18800 + }, + { + "epoch": 1.2776871857589347, + "grad_norm": 0.2842384874820709, + "learning_rate": 8.403825247995652e-05, + "loss": 4.02, + "step": 18805 + }, + { + "epoch": 1.2780269058295963, + "grad_norm": 0.15172384679317474, + "learning_rate": 8.403400597907325e-05, + "loss": 3.9204, + "step": 18810 + }, + { + "epoch": 1.2783666259002582, + "grad_norm": 0.17894504964351654, + "learning_rate": 8.402975947818998e-05, + "loss": 3.9468, + "step": 18815 + }, + { + "epoch": 1.27870634597092, + "grad_norm": 0.19090019166469574, + "learning_rate": 8.40255129773067e-05, + "loss": 4.0322, + "step": 18820 + }, + { + "epoch": 1.2790460660415817, + "grad_norm": 0.16353973746299744, + "learning_rate": 8.402126647642343e-05, + "loss": 3.8708, + "step": 18825 + }, + { + "epoch": 1.2793857861122435, + "grad_norm": 0.1835135668516159, + "learning_rate": 8.401701997554016e-05, + "loss": 3.7859, + "step": 18830 + }, + { + "epoch": 1.2797255061829054, + "grad_norm": 0.2058762162923813, + "learning_rate": 8.401277347465689e-05, + "loss": 4.1318, + "step": 18835 + }, + { + "epoch": 1.280065226253567, + "grad_norm": 0.20229166746139526, + "learning_rate": 8.400852697377362e-05, + "loss": 4.0256, + "step": 18840 + }, + { + "epoch": 1.2804049463242289, + "grad_norm": 0.5365188717842102, + "learning_rate": 8.400428047289035e-05, + "loss": 3.9889, + "step": 18845 + }, + { + "epoch": 1.2807446663948907, + "grad_norm": 0.19579584896564484, + "learning_rate": 8.400003397200707e-05, + "loss": 3.9242, + "step": 18850 + }, + { + "epoch": 1.2810843864655523, + "grad_norm": 0.1515043079853058, + "learning_rate": 8.39957874711238e-05, + "loss": 3.8711, + "step": 18855 + }, + { + "epoch": 1.2814241065362142, + "grad_norm": 0.18436329066753387, + "learning_rate": 8.399154097024052e-05, + "loss": 3.9537, + "step": 18860 + }, + { + "epoch": 1.281763826606876, + "grad_norm": 4.739215850830078, + "learning_rate": 8.398729446935726e-05, + "loss": 3.9314, + "step": 18865 + }, + { + "epoch": 1.2821035466775377, + "grad_norm": 0.1554199606180191, + "learning_rate": 8.398304796847399e-05, + "loss": 4.1267, + "step": 18870 + }, + { + "epoch": 1.2824432667481995, + "grad_norm": 0.5670509338378906, + "learning_rate": 8.39788014675907e-05, + "loss": 3.854, + "step": 18875 + }, + { + "epoch": 1.2827829868188614, + "grad_norm": 0.18207408487796783, + "learning_rate": 8.397455496670744e-05, + "loss": 3.7142, + "step": 18880 + }, + { + "epoch": 1.283122706889523, + "grad_norm": 0.24660193920135498, + "learning_rate": 8.397030846582417e-05, + "loss": 4.0472, + "step": 18885 + }, + { + "epoch": 1.2834624269601849, + "grad_norm": 0.17908191680908203, + "learning_rate": 8.396606196494088e-05, + "loss": 3.6281, + "step": 18890 + }, + { + "epoch": 1.2838021470308467, + "grad_norm": 0.17493927478790283, + "learning_rate": 8.396181546405763e-05, + "loss": 3.85, + "step": 18895 + }, + { + "epoch": 1.2841418671015083, + "grad_norm": 0.15826581418514252, + "learning_rate": 8.395756896317435e-05, + "loss": 3.8545, + "step": 18900 + }, + { + "epoch": 1.2844815871721702, + "grad_norm": 0.23101332783699036, + "learning_rate": 8.395332246229107e-05, + "loss": 3.89, + "step": 18905 + }, + { + "epoch": 1.2848213072428318, + "grad_norm": 0.15133234858512878, + "learning_rate": 8.394907596140781e-05, + "loss": 4.102, + "step": 18910 + }, + { + "epoch": 1.2851610273134937, + "grad_norm": 0.20011445879936218, + "learning_rate": 8.394482946052454e-05, + "loss": 3.5545, + "step": 18915 + }, + { + "epoch": 1.2855007473841553, + "grad_norm": 0.162436380982399, + "learning_rate": 8.394058295964125e-05, + "loss": 3.8583, + "step": 18920 + }, + { + "epoch": 1.2858404674548172, + "grad_norm": 0.1851741224527359, + "learning_rate": 8.3936336458758e-05, + "loss": 3.7929, + "step": 18925 + }, + { + "epoch": 1.286180187525479, + "grad_norm": 0.18858441710472107, + "learning_rate": 8.393208995787472e-05, + "loss": 3.8529, + "step": 18930 + }, + { + "epoch": 1.2865199075961407, + "grad_norm": 0.4124208390712738, + "learning_rate": 8.392784345699144e-05, + "loss": 3.8056, + "step": 18935 + }, + { + "epoch": 1.2868596276668025, + "grad_norm": 0.2154892385005951, + "learning_rate": 8.392359695610818e-05, + "loss": 4.0072, + "step": 18940 + }, + { + "epoch": 1.2871993477374644, + "grad_norm": 0.25199267268180847, + "learning_rate": 8.391935045522489e-05, + "loss": 3.9312, + "step": 18945 + }, + { + "epoch": 1.287539067808126, + "grad_norm": 0.15789833664894104, + "learning_rate": 8.391510395434162e-05, + "loss": 4.085, + "step": 18950 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.2685810625553131, + "learning_rate": 8.391085745345836e-05, + "loss": 3.9964, + "step": 18955 + }, + { + "epoch": 1.2882185079494497, + "grad_norm": 0.17758676409721375, + "learning_rate": 8.390661095257508e-05, + "loss": 3.754, + "step": 18960 + }, + { + "epoch": 1.2885582280201113, + "grad_norm": 0.19539770483970642, + "learning_rate": 8.39023644516918e-05, + "loss": 3.7676, + "step": 18965 + }, + { + "epoch": 1.2888979480907732, + "grad_norm": 0.177890345454216, + "learning_rate": 8.389811795080855e-05, + "loss": 3.9155, + "step": 18970 + }, + { + "epoch": 1.289237668161435, + "grad_norm": 0.18537719547748566, + "learning_rate": 8.389387144992526e-05, + "loss": 3.8824, + "step": 18975 + }, + { + "epoch": 1.2895773882320967, + "grad_norm": 0.14783592522144318, + "learning_rate": 8.388962494904199e-05, + "loss": 3.821, + "step": 18980 + }, + { + "epoch": 1.2899171083027585, + "grad_norm": 0.2984146773815155, + "learning_rate": 8.388537844815873e-05, + "loss": 4.0337, + "step": 18985 + }, + { + "epoch": 1.2902568283734204, + "grad_norm": 0.39257773756980896, + "learning_rate": 8.388113194727544e-05, + "loss": 3.7821, + "step": 18990 + }, + { + "epoch": 1.290596548444082, + "grad_norm": 0.25558602809906006, + "learning_rate": 8.387688544639217e-05, + "loss": 4.233, + "step": 18995 + }, + { + "epoch": 1.2909362685147439, + "grad_norm": 7.887912750244141, + "learning_rate": 8.387263894550891e-05, + "loss": 3.8292, + "step": 19000 + }, + { + "epoch": 1.2912759885854057, + "grad_norm": 0.30138736963272095, + "learning_rate": 8.386839244462563e-05, + "loss": 3.9217, + "step": 19005 + }, + { + "epoch": 1.2916157086560673, + "grad_norm": 0.16285206377506256, + "learning_rate": 8.386414594374236e-05, + "loss": 3.913, + "step": 19010 + }, + { + "epoch": 1.2919554287267292, + "grad_norm": 0.20984157919883728, + "learning_rate": 8.385989944285908e-05, + "loss": 3.919, + "step": 19015 + }, + { + "epoch": 1.292295148797391, + "grad_norm": 0.23956653475761414, + "learning_rate": 8.385565294197581e-05, + "loss": 3.8979, + "step": 19020 + }, + { + "epoch": 1.2926348688680527, + "grad_norm": 0.24043509364128113, + "learning_rate": 8.385140644109254e-05, + "loss": 3.9719, + "step": 19025 + }, + { + "epoch": 1.2929745889387145, + "grad_norm": 0.18360047042369843, + "learning_rate": 8.384715994020927e-05, + "loss": 4.0478, + "step": 19030 + }, + { + "epoch": 1.2933143090093764, + "grad_norm": 0.25544285774230957, + "learning_rate": 8.3842913439326e-05, + "loss": 4.1586, + "step": 19035 + }, + { + "epoch": 1.293654029080038, + "grad_norm": 0.4439932405948639, + "learning_rate": 8.383866693844272e-05, + "loss": 3.9213, + "step": 19040 + }, + { + "epoch": 1.2939937491506999, + "grad_norm": 0.21582220494747162, + "learning_rate": 8.383442043755945e-05, + "loss": 4.1111, + "step": 19045 + }, + { + "epoch": 1.2943334692213617, + "grad_norm": 0.1527835726737976, + "learning_rate": 8.383017393667618e-05, + "loss": 3.7483, + "step": 19050 + }, + { + "epoch": 1.2946731892920234, + "grad_norm": 0.15534402430057526, + "learning_rate": 8.382592743579291e-05, + "loss": 3.9141, + "step": 19055 + }, + { + "epoch": 1.2950129093626852, + "grad_norm": 0.8908084034919739, + "learning_rate": 8.382168093490964e-05, + "loss": 4.0709, + "step": 19060 + }, + { + "epoch": 1.295352629433347, + "grad_norm": 0.16994062066078186, + "learning_rate": 8.381743443402636e-05, + "loss": 3.7966, + "step": 19065 + }, + { + "epoch": 1.2956923495040087, + "grad_norm": 0.48082512617111206, + "learning_rate": 8.381318793314309e-05, + "loss": 3.7967, + "step": 19070 + }, + { + "epoch": 1.2960320695746705, + "grad_norm": 0.22040043771266937, + "learning_rate": 8.380894143225982e-05, + "loss": 3.9136, + "step": 19075 + }, + { + "epoch": 1.2963717896453322, + "grad_norm": 0.14921611547470093, + "learning_rate": 8.380469493137655e-05, + "loss": 4.1139, + "step": 19080 + }, + { + "epoch": 1.296711509715994, + "grad_norm": 0.43779054284095764, + "learning_rate": 8.380044843049328e-05, + "loss": 4.0746, + "step": 19085 + }, + { + "epoch": 1.2970512297866557, + "grad_norm": 0.18992015719413757, + "learning_rate": 8.379620192961e-05, + "loss": 3.9353, + "step": 19090 + }, + { + "epoch": 1.2973909498573175, + "grad_norm": 0.18347913026809692, + "learning_rate": 8.379195542872673e-05, + "loss": 3.8564, + "step": 19095 + }, + { + "epoch": 1.2977306699279794, + "grad_norm": 0.21664832532405853, + "learning_rate": 8.378770892784346e-05, + "loss": 3.9936, + "step": 19100 + }, + { + "epoch": 1.298070389998641, + "grad_norm": 0.16030964255332947, + "learning_rate": 8.378346242696019e-05, + "loss": 3.868, + "step": 19105 + }, + { + "epoch": 1.2984101100693028, + "grad_norm": 0.2131301313638687, + "learning_rate": 8.377921592607692e-05, + "loss": 3.9912, + "step": 19110 + }, + { + "epoch": 1.2987498301399647, + "grad_norm": 0.2264239639043808, + "learning_rate": 8.377496942519364e-05, + "loss": 3.9649, + "step": 19115 + }, + { + "epoch": 1.2990895502106263, + "grad_norm": 0.1663063019514084, + "learning_rate": 8.377072292431037e-05, + "loss": 4.0163, + "step": 19120 + }, + { + "epoch": 1.2994292702812882, + "grad_norm": 0.1636616587638855, + "learning_rate": 8.37664764234271e-05, + "loss": 3.9985, + "step": 19125 + }, + { + "epoch": 1.29976899035195, + "grad_norm": 0.7488766312599182, + "learning_rate": 8.376222992254383e-05, + "loss": 4.0555, + "step": 19130 + }, + { + "epoch": 1.3001087104226117, + "grad_norm": 0.18963804841041565, + "learning_rate": 8.375798342166056e-05, + "loss": 3.8151, + "step": 19135 + }, + { + "epoch": 1.3004484304932735, + "grad_norm": 0.18976327776908875, + "learning_rate": 8.375373692077728e-05, + "loss": 3.7772, + "step": 19140 + }, + { + "epoch": 1.3007881505639354, + "grad_norm": 0.19920849800109863, + "learning_rate": 8.374949041989401e-05, + "loss": 3.9672, + "step": 19145 + }, + { + "epoch": 1.301127870634597, + "grad_norm": 0.17867040634155273, + "learning_rate": 8.374524391901074e-05, + "loss": 4.1197, + "step": 19150 + }, + { + "epoch": 1.3014675907052589, + "grad_norm": 0.2060815393924713, + "learning_rate": 8.374099741812747e-05, + "loss": 3.7812, + "step": 19155 + }, + { + "epoch": 1.3018073107759207, + "grad_norm": 0.2573358118534088, + "learning_rate": 8.37367509172442e-05, + "loss": 4.039, + "step": 19160 + }, + { + "epoch": 1.3021470308465823, + "grad_norm": 0.33305811882019043, + "learning_rate": 8.373335371653758e-05, + "loss": 3.7584, + "step": 19165 + }, + { + "epoch": 1.3024867509172442, + "grad_norm": 0.17413927614688873, + "learning_rate": 8.372910721565431e-05, + "loss": 4.1722, + "step": 19170 + }, + { + "epoch": 1.302826470987906, + "grad_norm": 0.1954801231622696, + "learning_rate": 8.372486071477104e-05, + "loss": 3.8101, + "step": 19175 + }, + { + "epoch": 1.3031661910585677, + "grad_norm": 0.23614861071109772, + "learning_rate": 8.372061421388776e-05, + "loss": 3.7521, + "step": 19180 + }, + { + "epoch": 1.3035059111292295, + "grad_norm": 0.17084358632564545, + "learning_rate": 8.371636771300448e-05, + "loss": 3.8107, + "step": 19185 + }, + { + "epoch": 1.3038456311998914, + "grad_norm": 0.26063695549964905, + "learning_rate": 8.371212121212122e-05, + "loss": 4.1759, + "step": 19190 + }, + { + "epoch": 1.304185351270553, + "grad_norm": 0.1504248082637787, + "learning_rate": 8.370787471123795e-05, + "loss": 3.773, + "step": 19195 + }, + { + "epoch": 1.3045250713412149, + "grad_norm": 0.19236069917678833, + "learning_rate": 8.370362821035466e-05, + "loss": 4.1685, + "step": 19200 + }, + { + "epoch": 1.3048647914118767, + "grad_norm": 0.24418583512306213, + "learning_rate": 8.36993817094714e-05, + "loss": 3.9897, + "step": 19205 + }, + { + "epoch": 1.3052045114825384, + "grad_norm": 0.29168444871902466, + "learning_rate": 8.369513520858813e-05, + "loss": 3.8933, + "step": 19210 + }, + { + "epoch": 1.3055442315532002, + "grad_norm": 0.20051023364067078, + "learning_rate": 8.369088870770485e-05, + "loss": 4.167, + "step": 19215 + }, + { + "epoch": 1.305883951623862, + "grad_norm": 0.1487528383731842, + "learning_rate": 8.368664220682159e-05, + "loss": 3.8438, + "step": 19220 + }, + { + "epoch": 1.3062236716945237, + "grad_norm": 0.16722460091114044, + "learning_rate": 8.368239570593832e-05, + "loss": 4.1649, + "step": 19225 + }, + { + "epoch": 1.3065633917651855, + "grad_norm": 0.19974221289157867, + "learning_rate": 8.367814920505503e-05, + "loss": 4.0, + "step": 19230 + }, + { + "epoch": 1.3069031118358474, + "grad_norm": 0.18729168176651, + "learning_rate": 8.367390270417177e-05, + "loss": 3.5726, + "step": 19235 + }, + { + "epoch": 1.307242831906509, + "grad_norm": 0.40956801176071167, + "learning_rate": 8.36696562032885e-05, + "loss": 3.9316, + "step": 19240 + }, + { + "epoch": 1.3075825519771709, + "grad_norm": 0.1835280805826187, + "learning_rate": 8.366540970240521e-05, + "loss": 3.9879, + "step": 19245 + }, + { + "epoch": 1.3079222720478325, + "grad_norm": 0.2411351054906845, + "learning_rate": 8.366116320152196e-05, + "loss": 4.1365, + "step": 19250 + }, + { + "epoch": 1.3082619921184944, + "grad_norm": 0.16510212421417236, + "learning_rate": 8.365691670063868e-05, + "loss": 4.148, + "step": 19255 + }, + { + "epoch": 1.308601712189156, + "grad_norm": 0.2643602192401886, + "learning_rate": 8.36526701997554e-05, + "loss": 4.0015, + "step": 19260 + }, + { + "epoch": 1.3089414322598178, + "grad_norm": 0.5195035934448242, + "learning_rate": 8.364842369887214e-05, + "loss": 3.8611, + "step": 19265 + }, + { + "epoch": 1.3092811523304797, + "grad_norm": 0.31154167652130127, + "learning_rate": 8.364417719798885e-05, + "loss": 3.9502, + "step": 19270 + }, + { + "epoch": 1.3096208724011413, + "grad_norm": 0.143663689494133, + "learning_rate": 8.363993069710558e-05, + "loss": 3.8714, + "step": 19275 + }, + { + "epoch": 1.3099605924718032, + "grad_norm": 0.17418330907821655, + "learning_rate": 8.363568419622232e-05, + "loss": 3.6703, + "step": 19280 + }, + { + "epoch": 1.310300312542465, + "grad_norm": 0.1635352373123169, + "learning_rate": 8.363143769533904e-05, + "loss": 3.9495, + "step": 19285 + }, + { + "epoch": 1.3106400326131267, + "grad_norm": 0.5414339900016785, + "learning_rate": 8.362719119445577e-05, + "loss": 3.8316, + "step": 19290 + }, + { + "epoch": 1.3109797526837885, + "grad_norm": 0.18818336725234985, + "learning_rate": 8.362294469357251e-05, + "loss": 3.9911, + "step": 19295 + }, + { + "epoch": 1.3113194727544504, + "grad_norm": 0.15305201709270477, + "learning_rate": 8.361869819268922e-05, + "loss": 3.9743, + "step": 19300 + }, + { + "epoch": 1.311659192825112, + "grad_norm": 1.2789772748947144, + "learning_rate": 8.361445169180595e-05, + "loss": 4.0671, + "step": 19305 + }, + { + "epoch": 1.3119989128957739, + "grad_norm": 0.15755140781402588, + "learning_rate": 8.361020519092269e-05, + "loss": 3.7232, + "step": 19310 + }, + { + "epoch": 1.3123386329664357, + "grad_norm": 0.1621299386024475, + "learning_rate": 8.36059586900394e-05, + "loss": 3.7638, + "step": 19315 + }, + { + "epoch": 1.3126783530370973, + "grad_norm": 0.14383243024349213, + "learning_rate": 8.360171218915613e-05, + "loss": 3.8595, + "step": 19320 + }, + { + "epoch": 1.3130180731077592, + "grad_norm": 0.17951351404190063, + "learning_rate": 8.359746568827288e-05, + "loss": 3.9923, + "step": 19325 + }, + { + "epoch": 1.313357793178421, + "grad_norm": 0.18223224580287933, + "learning_rate": 8.359321918738959e-05, + "loss": 3.908, + "step": 19330 + }, + { + "epoch": 1.3136975132490827, + "grad_norm": 0.19232766330242157, + "learning_rate": 8.358897268650633e-05, + "loss": 3.7973, + "step": 19335 + }, + { + "epoch": 1.3140372333197445, + "grad_norm": 0.6103265881538391, + "learning_rate": 8.358472618562305e-05, + "loss": 3.9275, + "step": 19340 + }, + { + "epoch": 1.3143769533904064, + "grad_norm": 0.2274128943681717, + "learning_rate": 8.358047968473977e-05, + "loss": 4.1221, + "step": 19345 + }, + { + "epoch": 1.314716673461068, + "grad_norm": 0.1885349303483963, + "learning_rate": 8.357623318385652e-05, + "loss": 3.9873, + "step": 19350 + }, + { + "epoch": 1.3150563935317299, + "grad_norm": 0.28178268671035767, + "learning_rate": 8.357198668297323e-05, + "loss": 3.8419, + "step": 19355 + }, + { + "epoch": 1.3153961136023917, + "grad_norm": 0.19138917326927185, + "learning_rate": 8.356774018208996e-05, + "loss": 3.8596, + "step": 19360 + }, + { + "epoch": 1.3157358336730534, + "grad_norm": 0.2312302589416504, + "learning_rate": 8.35634936812067e-05, + "loss": 4.2291, + "step": 19365 + }, + { + "epoch": 1.3160755537437152, + "grad_norm": 0.33398836851119995, + "learning_rate": 8.355924718032341e-05, + "loss": 3.9602, + "step": 19370 + }, + { + "epoch": 1.316415273814377, + "grad_norm": 0.228350430727005, + "learning_rate": 8.355500067944014e-05, + "loss": 4.2246, + "step": 19375 + }, + { + "epoch": 1.3167549938850387, + "grad_norm": 0.31661874055862427, + "learning_rate": 8.355075417855688e-05, + "loss": 3.9789, + "step": 19380 + }, + { + "epoch": 1.3170947139557005, + "grad_norm": 0.22833606600761414, + "learning_rate": 8.35465076776736e-05, + "loss": 3.9451, + "step": 19385 + }, + { + "epoch": 1.3174344340263624, + "grad_norm": 0.2765660881996155, + "learning_rate": 8.354226117679033e-05, + "loss": 3.9986, + "step": 19390 + }, + { + "epoch": 1.317774154097024, + "grad_norm": 0.34042954444885254, + "learning_rate": 8.353801467590707e-05, + "loss": 3.7479, + "step": 19395 + }, + { + "epoch": 1.3181138741676859, + "grad_norm": 0.15123659372329712, + "learning_rate": 8.353376817502378e-05, + "loss": 4.053, + "step": 19400 + }, + { + "epoch": 1.3184535942383477, + "grad_norm": 0.3974605202674866, + "learning_rate": 8.352952167414051e-05, + "loss": 3.9453, + "step": 19405 + }, + { + "epoch": 1.3187933143090094, + "grad_norm": 0.18069498240947723, + "learning_rate": 8.352527517325724e-05, + "loss": 4.0342, + "step": 19410 + }, + { + "epoch": 1.3191330343796712, + "grad_norm": 0.23115777969360352, + "learning_rate": 8.352102867237397e-05, + "loss": 3.9349, + "step": 19415 + }, + { + "epoch": 1.3194727544503329, + "grad_norm": 0.14878331124782562, + "learning_rate": 8.35167821714907e-05, + "loss": 4.1099, + "step": 19420 + }, + { + "epoch": 1.3198124745209947, + "grad_norm": 0.1701647788286209, + "learning_rate": 8.351253567060742e-05, + "loss": 3.9161, + "step": 19425 + }, + { + "epoch": 1.3201521945916566, + "grad_norm": 0.16140282154083252, + "learning_rate": 8.350828916972415e-05, + "loss": 4.0584, + "step": 19430 + }, + { + "epoch": 1.3204919146623182, + "grad_norm": 0.1818205714225769, + "learning_rate": 8.350404266884088e-05, + "loss": 3.8904, + "step": 19435 + }, + { + "epoch": 1.32083163473298, + "grad_norm": 0.20634934306144714, + "learning_rate": 8.34997961679576e-05, + "loss": 3.8103, + "step": 19440 + }, + { + "epoch": 1.3211713548036417, + "grad_norm": 0.17088109254837036, + "learning_rate": 8.349554966707433e-05, + "loss": 3.8141, + "step": 19445 + }, + { + "epoch": 1.3215110748743035, + "grad_norm": 0.20215611159801483, + "learning_rate": 8.349130316619106e-05, + "loss": 3.8729, + "step": 19450 + }, + { + "epoch": 1.3218507949449654, + "grad_norm": 0.18270128965377808, + "learning_rate": 8.348705666530779e-05, + "loss": 4.0808, + "step": 19455 + }, + { + "epoch": 1.322190515015627, + "grad_norm": 0.20336106419563293, + "learning_rate": 8.348281016442452e-05, + "loss": 4.2266, + "step": 19460 + }, + { + "epoch": 1.3225302350862889, + "grad_norm": 0.1644607037305832, + "learning_rate": 8.347856366354125e-05, + "loss": 3.8306, + "step": 19465 + }, + { + "epoch": 1.3228699551569507, + "grad_norm": 0.19405750930309296, + "learning_rate": 8.347431716265797e-05, + "loss": 4.1748, + "step": 19470 + }, + { + "epoch": 1.3232096752276123, + "grad_norm": 0.21239569783210754, + "learning_rate": 8.34700706617747e-05, + "loss": 3.9155, + "step": 19475 + }, + { + "epoch": 1.3235493952982742, + "grad_norm": 0.17459309101104736, + "learning_rate": 8.346582416089143e-05, + "loss": 3.8644, + "step": 19480 + }, + { + "epoch": 1.323889115368936, + "grad_norm": 0.16154392063617706, + "learning_rate": 8.346157766000816e-05, + "loss": 4.0394, + "step": 19485 + }, + { + "epoch": 1.3242288354395977, + "grad_norm": 0.25978603959083557, + "learning_rate": 8.345733115912489e-05, + "loss": 3.7452, + "step": 19490 + }, + { + "epoch": 1.3245685555102595, + "grad_norm": 0.1879960149526596, + "learning_rate": 8.345308465824161e-05, + "loss": 3.9863, + "step": 19495 + }, + { + "epoch": 1.3249082755809214, + "grad_norm": 0.1746322363615036, + "learning_rate": 8.344883815735834e-05, + "loss": 4.0941, + "step": 19500 + }, + { + "epoch": 1.325247995651583, + "grad_norm": 0.25141578912734985, + "learning_rate": 8.344459165647507e-05, + "loss": 3.8441, + "step": 19505 + }, + { + "epoch": 1.3255877157222449, + "grad_norm": 0.2381659895181656, + "learning_rate": 8.34403451555918e-05, + "loss": 3.9221, + "step": 19510 + }, + { + "epoch": 1.3259274357929067, + "grad_norm": 0.1651276797056198, + "learning_rate": 8.343609865470853e-05, + "loss": 4.1022, + "step": 19515 + }, + { + "epoch": 1.3262671558635684, + "grad_norm": 0.16350306570529938, + "learning_rate": 8.343185215382525e-05, + "loss": 3.7792, + "step": 19520 + }, + { + "epoch": 1.3266068759342302, + "grad_norm": 0.18928661942481995, + "learning_rate": 8.342760565294198e-05, + "loss": 3.8899, + "step": 19525 + }, + { + "epoch": 1.326946596004892, + "grad_norm": 0.17762024700641632, + "learning_rate": 8.342335915205871e-05, + "loss": 3.7073, + "step": 19530 + }, + { + "epoch": 1.3272863160755537, + "grad_norm": 0.19783206284046173, + "learning_rate": 8.341911265117544e-05, + "loss": 3.8979, + "step": 19535 + }, + { + "epoch": 1.3276260361462155, + "grad_norm": 0.2024858146905899, + "learning_rate": 8.341486615029215e-05, + "loss": 3.9377, + "step": 19540 + }, + { + "epoch": 1.3279657562168774, + "grad_norm": 0.17724867165088654, + "learning_rate": 8.34106196494089e-05, + "loss": 3.829, + "step": 19545 + }, + { + "epoch": 1.328305476287539, + "grad_norm": 0.17733719944953918, + "learning_rate": 8.340637314852562e-05, + "loss": 3.7078, + "step": 19550 + }, + { + "epoch": 1.3286451963582009, + "grad_norm": 0.21756169199943542, + "learning_rate": 8.340212664764234e-05, + "loss": 4.0279, + "step": 19555 + }, + { + "epoch": 1.3289849164288627, + "grad_norm": 0.16448095440864563, + "learning_rate": 8.339788014675908e-05, + "loss": 3.9283, + "step": 19560 + }, + { + "epoch": 1.3293246364995244, + "grad_norm": 0.1978486180305481, + "learning_rate": 8.33936336458758e-05, + "loss": 3.7435, + "step": 19565 + }, + { + "epoch": 1.3296643565701862, + "grad_norm": 0.1968926340341568, + "learning_rate": 8.338938714499252e-05, + "loss": 3.9159, + "step": 19570 + }, + { + "epoch": 1.330004076640848, + "grad_norm": 0.3300844430923462, + "learning_rate": 8.338514064410926e-05, + "loss": 3.9178, + "step": 19575 + }, + { + "epoch": 1.3303437967115097, + "grad_norm": 0.17931441962718964, + "learning_rate": 8.338089414322599e-05, + "loss": 3.9463, + "step": 19580 + }, + { + "epoch": 1.3306835167821716, + "grad_norm": 0.2748642861843109, + "learning_rate": 8.33766476423427e-05, + "loss": 3.9635, + "step": 19585 + }, + { + "epoch": 1.3310232368528332, + "grad_norm": 0.16187721490859985, + "learning_rate": 8.337240114145945e-05, + "loss": 3.8689, + "step": 19590 + }, + { + "epoch": 1.331362956923495, + "grad_norm": 0.15887784957885742, + "learning_rate": 8.336815464057617e-05, + "loss": 4.1295, + "step": 19595 + }, + { + "epoch": 1.331702676994157, + "grad_norm": 0.19710515439510345, + "learning_rate": 8.336390813969289e-05, + "loss": 3.8243, + "step": 19600 + }, + { + "epoch": 1.3320423970648185, + "grad_norm": 0.24822378158569336, + "learning_rate": 8.335966163880963e-05, + "loss": 4.0529, + "step": 19605 + }, + { + "epoch": 1.3323821171354804, + "grad_norm": 0.29866695404052734, + "learning_rate": 8.335541513792634e-05, + "loss": 3.8158, + "step": 19610 + }, + { + "epoch": 1.332721837206142, + "grad_norm": 0.20870965719223022, + "learning_rate": 8.335116863704307e-05, + "loss": 3.722, + "step": 19615 + }, + { + "epoch": 1.3330615572768039, + "grad_norm": 0.17013707756996155, + "learning_rate": 8.334692213615981e-05, + "loss": 4.0811, + "step": 19620 + }, + { + "epoch": 1.3334012773474657, + "grad_norm": 0.1831049770116806, + "learning_rate": 8.334267563527653e-05, + "loss": 3.9492, + "step": 19625 + }, + { + "epoch": 1.3337409974181273, + "grad_norm": 0.6720523238182068, + "learning_rate": 8.333842913439326e-05, + "loss": 4.2925, + "step": 19630 + }, + { + "epoch": 1.3340807174887892, + "grad_norm": 0.18534238636493683, + "learning_rate": 8.333418263351e-05, + "loss": 3.9825, + "step": 19635 + }, + { + "epoch": 1.334420437559451, + "grad_norm": 0.18676869571208954, + "learning_rate": 8.332993613262671e-05, + "loss": 3.8672, + "step": 19640 + }, + { + "epoch": 1.3347601576301127, + "grad_norm": 0.16528339684009552, + "learning_rate": 8.332568963174344e-05, + "loss": 3.9742, + "step": 19645 + }, + { + "epoch": 1.3350998777007745, + "grad_norm": 0.29641860723495483, + "learning_rate": 8.332144313086018e-05, + "loss": 3.6969, + "step": 19650 + }, + { + "epoch": 1.3354395977714364, + "grad_norm": 0.15875406563282013, + "learning_rate": 8.33171966299769e-05, + "loss": 3.7435, + "step": 19655 + }, + { + "epoch": 1.335779317842098, + "grad_norm": 0.2668616771697998, + "learning_rate": 8.331295012909362e-05, + "loss": 4.0622, + "step": 19660 + }, + { + "epoch": 1.3361190379127599, + "grad_norm": 0.2240106463432312, + "learning_rate": 8.330870362821037e-05, + "loss": 4.0038, + "step": 19665 + }, + { + "epoch": 1.3364587579834217, + "grad_norm": 0.18531446158885956, + "learning_rate": 8.330445712732708e-05, + "loss": 3.9786, + "step": 19670 + }, + { + "epoch": 1.3367984780540834, + "grad_norm": 0.2014482021331787, + "learning_rate": 8.330021062644382e-05, + "loss": 4.1063, + "step": 19675 + }, + { + "epoch": 1.3371381981247452, + "grad_norm": 0.26698142290115356, + "learning_rate": 8.329596412556055e-05, + "loss": 4.1084, + "step": 19680 + }, + { + "epoch": 1.337477918195407, + "grad_norm": 1.672305703163147, + "learning_rate": 8.329171762467727e-05, + "loss": 4.0953, + "step": 19685 + }, + { + "epoch": 1.3378176382660687, + "grad_norm": 0.1762583702802658, + "learning_rate": 8.3287471123794e-05, + "loss": 3.9526, + "step": 19690 + }, + { + "epoch": 1.3381573583367306, + "grad_norm": 0.22734053432941437, + "learning_rate": 8.328322462291072e-05, + "loss": 3.9851, + "step": 19695 + }, + { + "epoch": 1.3384970784073924, + "grad_norm": 0.182522714138031, + "learning_rate": 8.327897812202745e-05, + "loss": 3.6877, + "step": 19700 + }, + { + "epoch": 1.338836798478054, + "grad_norm": 0.342989444732666, + "learning_rate": 8.327473162114419e-05, + "loss": 3.9537, + "step": 19705 + }, + { + "epoch": 1.3391765185487159, + "grad_norm": 0.244001567363739, + "learning_rate": 8.32704851202609e-05, + "loss": 3.7863, + "step": 19710 + }, + { + "epoch": 1.3395162386193777, + "grad_norm": 0.2477254420518875, + "learning_rate": 8.326623861937763e-05, + "loss": 3.8508, + "step": 19715 + }, + { + "epoch": 1.3398559586900394, + "grad_norm": 0.26922059059143066, + "learning_rate": 8.326199211849437e-05, + "loss": 4.1351, + "step": 19720 + }, + { + "epoch": 1.3401956787607012, + "grad_norm": 0.2158743143081665, + "learning_rate": 8.325774561761109e-05, + "loss": 3.9485, + "step": 19725 + }, + { + "epoch": 1.340535398831363, + "grad_norm": 0.7418832778930664, + "learning_rate": 8.325349911672782e-05, + "loss": 3.8862, + "step": 19730 + }, + { + "epoch": 1.3408751189020247, + "grad_norm": 0.1519351750612259, + "learning_rate": 8.324925261584456e-05, + "loss": 3.9494, + "step": 19735 + }, + { + "epoch": 1.3412148389726866, + "grad_norm": 0.7085623741149902, + "learning_rate": 8.324500611496127e-05, + "loss": 3.8926, + "step": 19740 + }, + { + "epoch": 1.3415545590433484, + "grad_norm": 0.4582364857196808, + "learning_rate": 8.3240759614078e-05, + "loss": 3.9742, + "step": 19745 + }, + { + "epoch": 1.34189427911401, + "grad_norm": 0.2128627449274063, + "learning_rate": 8.323651311319474e-05, + "loss": 3.7832, + "step": 19750 + }, + { + "epoch": 1.342233999184672, + "grad_norm": 0.1862819492816925, + "learning_rate": 8.323226661231146e-05, + "loss": 3.9758, + "step": 19755 + }, + { + "epoch": 1.3425737192553335, + "grad_norm": 0.16334497928619385, + "learning_rate": 8.322802011142819e-05, + "loss": 3.784, + "step": 19760 + }, + { + "epoch": 1.3429134393259954, + "grad_norm": 0.4571809470653534, + "learning_rate": 8.322377361054491e-05, + "loss": 3.7794, + "step": 19765 + }, + { + "epoch": 1.3432531593966572, + "grad_norm": 0.3504500985145569, + "learning_rate": 8.321952710966164e-05, + "loss": 3.9104, + "step": 19770 + }, + { + "epoch": 1.3435928794673189, + "grad_norm": 0.23097096383571625, + "learning_rate": 8.321528060877837e-05, + "loss": 3.8286, + "step": 19775 + }, + { + "epoch": 1.3439325995379807, + "grad_norm": 0.3794955313205719, + "learning_rate": 8.32110341078951e-05, + "loss": 4.003, + "step": 19780 + }, + { + "epoch": 1.3442723196086424, + "grad_norm": 0.19196826219558716, + "learning_rate": 8.320678760701183e-05, + "loss": 3.9444, + "step": 19785 + }, + { + "epoch": 1.3446120396793042, + "grad_norm": 0.14066928625106812, + "learning_rate": 8.320254110612855e-05, + "loss": 4.0312, + "step": 19790 + }, + { + "epoch": 1.344951759749966, + "grad_norm": 0.2708784341812134, + "learning_rate": 8.319829460524528e-05, + "loss": 4.1265, + "step": 19795 + }, + { + "epoch": 1.3452914798206277, + "grad_norm": 0.23390352725982666, + "learning_rate": 8.319404810436201e-05, + "loss": 3.8449, + "step": 19800 + }, + { + "epoch": 1.3456311998912895, + "grad_norm": 0.27564239501953125, + "learning_rate": 8.318980160347874e-05, + "loss": 3.8713, + "step": 19805 + }, + { + "epoch": 1.3459709199619514, + "grad_norm": 0.1514866203069687, + "learning_rate": 8.318555510259547e-05, + "loss": 3.7811, + "step": 19810 + }, + { + "epoch": 1.346310640032613, + "grad_norm": 0.16295874118804932, + "learning_rate": 8.31813086017122e-05, + "loss": 4.0793, + "step": 19815 + }, + { + "epoch": 1.3466503601032749, + "grad_norm": 0.8417242169380188, + "learning_rate": 8.317706210082892e-05, + "loss": 3.9772, + "step": 19820 + }, + { + "epoch": 1.3469900801739367, + "grad_norm": 0.1565304696559906, + "learning_rate": 8.317281559994565e-05, + "loss": 3.8967, + "step": 19825 + }, + { + "epoch": 1.3473298002445984, + "grad_norm": 0.14673905074596405, + "learning_rate": 8.316856909906238e-05, + "loss": 3.8072, + "step": 19830 + }, + { + "epoch": 1.3476695203152602, + "grad_norm": 0.1315992772579193, + "learning_rate": 8.31643225981791e-05, + "loss": 4.0175, + "step": 19835 + }, + { + "epoch": 1.348009240385922, + "grad_norm": 0.1678156703710556, + "learning_rate": 8.316007609729583e-05, + "loss": 3.853, + "step": 19840 + }, + { + "epoch": 1.3483489604565837, + "grad_norm": 0.1983521282672882, + "learning_rate": 8.315582959641256e-05, + "loss": 3.8539, + "step": 19845 + }, + { + "epoch": 1.3486886805272456, + "grad_norm": 0.19544869661331177, + "learning_rate": 8.315158309552929e-05, + "loss": 4.0272, + "step": 19850 + }, + { + "epoch": 1.3490284005979074, + "grad_norm": 0.16290587186813354, + "learning_rate": 8.314733659464602e-05, + "loss": 4.0608, + "step": 19855 + }, + { + "epoch": 1.349368120668569, + "grad_norm": 0.27463269233703613, + "learning_rate": 8.314309009376275e-05, + "loss": 3.8993, + "step": 19860 + }, + { + "epoch": 1.349707840739231, + "grad_norm": 0.1842031031847, + "learning_rate": 8.313884359287947e-05, + "loss": 4.0268, + "step": 19865 + }, + { + "epoch": 1.3500475608098927, + "grad_norm": 0.1725267916917801, + "learning_rate": 8.31345970919962e-05, + "loss": 4.01, + "step": 19870 + }, + { + "epoch": 1.3503872808805544, + "grad_norm": 0.23180291056632996, + "learning_rate": 8.313035059111293e-05, + "loss": 4.105, + "step": 19875 + }, + { + "epoch": 1.3507270009512162, + "grad_norm": 0.17762398719787598, + "learning_rate": 8.312610409022966e-05, + "loss": 4.1905, + "step": 19880 + }, + { + "epoch": 1.351066721021878, + "grad_norm": 0.4031653106212616, + "learning_rate": 8.312185758934639e-05, + "loss": 4.0726, + "step": 19885 + }, + { + "epoch": 1.3514064410925397, + "grad_norm": 0.18705089390277863, + "learning_rate": 8.311761108846311e-05, + "loss": 3.751, + "step": 19890 + }, + { + "epoch": 1.3517461611632016, + "grad_norm": 0.2695411443710327, + "learning_rate": 8.311336458757983e-05, + "loss": 3.8954, + "step": 19895 + }, + { + "epoch": 1.3520858812338634, + "grad_norm": 0.18734440207481384, + "learning_rate": 8.310911808669657e-05, + "loss": 3.8349, + "step": 19900 + }, + { + "epoch": 1.352425601304525, + "grad_norm": 0.19522342085838318, + "learning_rate": 8.31048715858133e-05, + "loss": 3.8765, + "step": 19905 + }, + { + "epoch": 1.352765321375187, + "grad_norm": 0.6658803820610046, + "learning_rate": 8.310062508493001e-05, + "loss": 3.9454, + "step": 19910 + }, + { + "epoch": 1.3531050414458488, + "grad_norm": 0.15180611610412598, + "learning_rate": 8.309637858404675e-05, + "loss": 3.948, + "step": 19915 + }, + { + "epoch": 1.3534447615165104, + "grad_norm": 0.44514378905296326, + "learning_rate": 8.309213208316348e-05, + "loss": 4.0144, + "step": 19920 + }, + { + "epoch": 1.3537844815871722, + "grad_norm": 0.19116869568824768, + "learning_rate": 8.30878855822802e-05, + "loss": 3.8441, + "step": 19925 + }, + { + "epoch": 1.3541242016578339, + "grad_norm": 0.21665486693382263, + "learning_rate": 8.308363908139694e-05, + "loss": 4.0415, + "step": 19930 + }, + { + "epoch": 1.3544639217284957, + "grad_norm": 0.15915414690971375, + "learning_rate": 8.307939258051367e-05, + "loss": 3.9731, + "step": 19935 + }, + { + "epoch": 1.3548036417991576, + "grad_norm": 0.6419869065284729, + "learning_rate": 8.307514607963038e-05, + "loss": 3.8781, + "step": 19940 + }, + { + "epoch": 1.3551433618698192, + "grad_norm": 0.15344323217868805, + "learning_rate": 8.307089957874712e-05, + "loss": 3.9665, + "step": 19945 + }, + { + "epoch": 1.355483081940481, + "grad_norm": 0.155173197388649, + "learning_rate": 8.306665307786385e-05, + "loss": 3.973, + "step": 19950 + }, + { + "epoch": 1.3558228020111427, + "grad_norm": 0.23649723827838898, + "learning_rate": 8.306240657698056e-05, + "loss": 3.9524, + "step": 19955 + }, + { + "epoch": 1.3561625220818045, + "grad_norm": 0.2823832929134369, + "learning_rate": 8.30581600760973e-05, + "loss": 4.0862, + "step": 19960 + }, + { + "epoch": 1.3565022421524664, + "grad_norm": 5.308559894561768, + "learning_rate": 8.305391357521402e-05, + "loss": 4.1226, + "step": 19965 + }, + { + "epoch": 1.356841962223128, + "grad_norm": NaN, + "learning_rate": 8.305051637450742e-05, + "loss": 3.889, + "step": 19970 + }, + { + "epoch": 1.3571816822937899, + "grad_norm": 0.14660164713859558, + "learning_rate": 8.304626987362414e-05, + "loss": 3.8358, + "step": 19975 + }, + { + "epoch": 1.3575214023644517, + "grad_norm": 0.15585385262966156, + "learning_rate": 8.304202337274086e-05, + "loss": 3.9773, + "step": 19980 + }, + { + "epoch": 1.3578611224351134, + "grad_norm": 0.1493355929851532, + "learning_rate": 8.30377768718576e-05, + "loss": 3.7931, + "step": 19985 + }, + { + "epoch": 1.3582008425057752, + "grad_norm": 0.15059494972229004, + "learning_rate": 8.303353037097433e-05, + "loss": 3.836, + "step": 19990 + }, + { + "epoch": 1.358540562576437, + "grad_norm": 0.19136165082454681, + "learning_rate": 8.302928387009104e-05, + "loss": 3.8461, + "step": 19995 + }, + { + "epoch": 1.3588802826470987, + "grad_norm": 0.19499917328357697, + "learning_rate": 8.302503736920778e-05, + "loss": 4.0183, + "step": 20000 + }, + { + "epoch": 1.3592200027177606, + "grad_norm": 0.2692849934101105, + "learning_rate": 8.30207908683245e-05, + "loss": 4.0869, + "step": 20005 + }, + { + "epoch": 1.3595597227884224, + "grad_norm": 0.18053746223449707, + "learning_rate": 8.301654436744123e-05, + "loss": 4.0996, + "step": 20010 + }, + { + "epoch": 1.359899442859084, + "grad_norm": 0.16390866041183472, + "learning_rate": 8.301229786655797e-05, + "loss": 4.08, + "step": 20015 + }, + { + "epoch": 1.360239162929746, + "grad_norm": 1.0971055030822754, + "learning_rate": 8.300805136567468e-05, + "loss": 3.9701, + "step": 20020 + }, + { + "epoch": 1.3605788830004077, + "grad_norm": 0.19974741339683533, + "learning_rate": 8.300380486479141e-05, + "loss": 3.9713, + "step": 20025 + }, + { + "epoch": 1.3609186030710694, + "grad_norm": 0.1559068262577057, + "learning_rate": 8.299955836390815e-05, + "loss": 3.8466, + "step": 20030 + }, + { + "epoch": 1.3612583231417312, + "grad_norm": 0.14776596426963806, + "learning_rate": 8.299531186302487e-05, + "loss": 4.0535, + "step": 20035 + }, + { + "epoch": 1.361598043212393, + "grad_norm": 0.15199600160121918, + "learning_rate": 8.29910653621416e-05, + "loss": 4.0992, + "step": 20040 + }, + { + "epoch": 1.3619377632830547, + "grad_norm": 0.1746680587530136, + "learning_rate": 8.298681886125834e-05, + "loss": 3.8478, + "step": 20045 + }, + { + "epoch": 1.3622774833537166, + "grad_norm": 0.18728336691856384, + "learning_rate": 8.298257236037505e-05, + "loss": 4.0462, + "step": 20050 + }, + { + "epoch": 1.3626172034243784, + "grad_norm": 0.16553489863872528, + "learning_rate": 8.297832585949178e-05, + "loss": 4.0803, + "step": 20055 + }, + { + "epoch": 1.36295692349504, + "grad_norm": 0.13172194361686707, + "learning_rate": 8.297407935860852e-05, + "loss": 3.9735, + "step": 20060 + }, + { + "epoch": 1.363296643565702, + "grad_norm": 0.198270782828331, + "learning_rate": 8.296983285772523e-05, + "loss": 3.9527, + "step": 20065 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.17840759456157684, + "learning_rate": 8.296558635684196e-05, + "loss": 4.0108, + "step": 20070 + }, + { + "epoch": 1.3639760837070254, + "grad_norm": 0.182584747672081, + "learning_rate": 8.29613398559587e-05, + "loss": 3.903, + "step": 20075 + }, + { + "epoch": 1.3643158037776872, + "grad_norm": 0.17388814687728882, + "learning_rate": 8.295709335507542e-05, + "loss": 3.8093, + "step": 20080 + }, + { + "epoch": 1.364655523848349, + "grad_norm": 0.19766871631145477, + "learning_rate": 8.295284685419215e-05, + "loss": 3.9119, + "step": 20085 + }, + { + "epoch": 1.3649952439190107, + "grad_norm": 0.18254530429840088, + "learning_rate": 8.294860035330887e-05, + "loss": 3.9031, + "step": 20090 + }, + { + "epoch": 1.3653349639896726, + "grad_norm": 0.43354925513267517, + "learning_rate": 8.29443538524256e-05, + "loss": 3.7736, + "step": 20095 + }, + { + "epoch": 1.3656746840603342, + "grad_norm": 0.20304037630558014, + "learning_rate": 8.294010735154233e-05, + "loss": 3.8566, + "step": 20100 + }, + { + "epoch": 1.366014404130996, + "grad_norm": 0.21336279809474945, + "learning_rate": 8.293586085065906e-05, + "loss": 3.9938, + "step": 20105 + }, + { + "epoch": 1.366354124201658, + "grad_norm": 0.33956122398376465, + "learning_rate": 8.293161434977579e-05, + "loss": 3.8017, + "step": 20110 + }, + { + "epoch": 1.3666938442723195, + "grad_norm": 0.47993090748786926, + "learning_rate": 8.292736784889251e-05, + "loss": 3.8544, + "step": 20115 + }, + { + "epoch": 1.3670335643429814, + "grad_norm": 0.17959949374198914, + "learning_rate": 8.292312134800924e-05, + "loss": 3.5071, + "step": 20120 + }, + { + "epoch": 1.367373284413643, + "grad_norm": 0.1584675908088684, + "learning_rate": 8.291887484712597e-05, + "loss": 3.8139, + "step": 20125 + }, + { + "epoch": 1.3677130044843049, + "grad_norm": 0.17819224298000336, + "learning_rate": 8.29146283462427e-05, + "loss": 3.8649, + "step": 20130 + }, + { + "epoch": 1.3680527245549667, + "grad_norm": 0.1722547709941864, + "learning_rate": 8.291038184535943e-05, + "loss": 3.9831, + "step": 20135 + }, + { + "epoch": 1.3683924446256284, + "grad_norm": 0.19656646251678467, + "learning_rate": 8.290613534447615e-05, + "loss": 3.7208, + "step": 20140 + }, + { + "epoch": 1.3687321646962902, + "grad_norm": 0.29168495535850525, + "learning_rate": 8.290188884359288e-05, + "loss": 3.8531, + "step": 20145 + }, + { + "epoch": 1.369071884766952, + "grad_norm": 0.17680272459983826, + "learning_rate": 8.289764234270961e-05, + "loss": 3.8448, + "step": 20150 + }, + { + "epoch": 1.3694116048376137, + "grad_norm": 0.19573275744915009, + "learning_rate": 8.289339584182634e-05, + "loss": 4.0828, + "step": 20155 + }, + { + "epoch": 1.3697513249082756, + "grad_norm": 0.1910361796617508, + "learning_rate": 8.288914934094307e-05, + "loss": 3.8786, + "step": 20160 + }, + { + "epoch": 1.3700910449789374, + "grad_norm": 0.2344173640012741, + "learning_rate": 8.28849028400598e-05, + "loss": 3.8166, + "step": 20165 + }, + { + "epoch": 1.370430765049599, + "grad_norm": 1.6548504829406738, + "learning_rate": 8.288065633917652e-05, + "loss": 3.7879, + "step": 20170 + }, + { + "epoch": 1.370770485120261, + "grad_norm": 0.14902913570404053, + "learning_rate": 8.287640983829325e-05, + "loss": 3.7688, + "step": 20175 + }, + { + "epoch": 1.3711102051909227, + "grad_norm": 0.13891853392124176, + "learning_rate": 8.287216333740998e-05, + "loss": 3.9153, + "step": 20180 + }, + { + "epoch": 1.3714499252615844, + "grad_norm": 0.14944295585155487, + "learning_rate": 8.28679168365267e-05, + "loss": 3.7051, + "step": 20185 + }, + { + "epoch": 1.3717896453322462, + "grad_norm": 0.21667538583278656, + "learning_rate": 8.286367033564343e-05, + "loss": 4.0364, + "step": 20190 + }, + { + "epoch": 1.372129365402908, + "grad_norm": 0.20431853830814362, + "learning_rate": 8.285942383476016e-05, + "loss": 3.7486, + "step": 20195 + }, + { + "epoch": 1.3724690854735697, + "grad_norm": 0.45885857939720154, + "learning_rate": 8.285517733387689e-05, + "loss": 3.5869, + "step": 20200 + }, + { + "epoch": 1.3728088055442316, + "grad_norm": 0.16328980028629303, + "learning_rate": 8.28509308329936e-05, + "loss": 3.7108, + "step": 20205 + }, + { + "epoch": 1.3731485256148934, + "grad_norm": 0.14327803254127502, + "learning_rate": 8.284668433211035e-05, + "loss": 3.8404, + "step": 20210 + }, + { + "epoch": 1.373488245685555, + "grad_norm": 0.15884236991405487, + "learning_rate": 8.284243783122707e-05, + "loss": 3.8316, + "step": 20215 + }, + { + "epoch": 1.373827965756217, + "grad_norm": 0.17825280129909515, + "learning_rate": 8.28381913303438e-05, + "loss": 3.6574, + "step": 20220 + }, + { + "epoch": 1.3741676858268788, + "grad_norm": 0.18911921977996826, + "learning_rate": 8.283394482946053e-05, + "loss": 3.9764, + "step": 20225 + }, + { + "epoch": 1.3745074058975404, + "grad_norm": 0.16429981589317322, + "learning_rate": 8.282969832857726e-05, + "loss": 3.6322, + "step": 20230 + }, + { + "epoch": 1.3748471259682022, + "grad_norm": 0.2012677788734436, + "learning_rate": 8.282545182769399e-05, + "loss": 3.8832, + "step": 20235 + }, + { + "epoch": 1.375186846038864, + "grad_norm": 0.3244810104370117, + "learning_rate": 8.282120532681071e-05, + "loss": 3.7945, + "step": 20240 + }, + { + "epoch": 1.3755265661095257, + "grad_norm": 0.15442442893981934, + "learning_rate": 8.281695882592744e-05, + "loss": 3.7629, + "step": 20245 + }, + { + "epoch": 1.3758662861801876, + "grad_norm": 0.17104221880435944, + "learning_rate": 8.281271232504417e-05, + "loss": 3.7662, + "step": 20250 + }, + { + "epoch": 1.3762060062508494, + "grad_norm": 0.2611694931983948, + "learning_rate": 8.28084658241609e-05, + "loss": 3.9173, + "step": 20255 + }, + { + "epoch": 1.376545726321511, + "grad_norm": 0.21918843686580658, + "learning_rate": 8.280421932327763e-05, + "loss": 3.6724, + "step": 20260 + }, + { + "epoch": 1.376885446392173, + "grad_norm": 0.30032339692115784, + "learning_rate": 8.279997282239435e-05, + "loss": 3.8668, + "step": 20265 + }, + { + "epoch": 1.3772251664628345, + "grad_norm": 0.18865332007408142, + "learning_rate": 8.279572632151108e-05, + "loss": 4.1219, + "step": 20270 + }, + { + "epoch": 1.3775648865334964, + "grad_norm": 0.7063418626785278, + "learning_rate": 8.279147982062781e-05, + "loss": 3.9512, + "step": 20275 + }, + { + "epoch": 1.3779046066041583, + "grad_norm": 0.5794016122817993, + "learning_rate": 8.278723331974454e-05, + "loss": 4.0552, + "step": 20280 + }, + { + "epoch": 1.3782443266748199, + "grad_norm": 0.22169694304466248, + "learning_rate": 8.278298681886127e-05, + "loss": 3.7675, + "step": 20285 + }, + { + "epoch": 1.3785840467454817, + "grad_norm": 0.5231088399887085, + "learning_rate": 8.277874031797798e-05, + "loss": 4.0273, + "step": 20290 + }, + { + "epoch": 1.3789237668161434, + "grad_norm": 0.19817912578582764, + "learning_rate": 8.277449381709472e-05, + "loss": 4.0461, + "step": 20295 + }, + { + "epoch": 1.3792634868868052, + "grad_norm": 0.14622211456298828, + "learning_rate": 8.277024731621145e-05, + "loss": 4.0928, + "step": 20300 + }, + { + "epoch": 1.379603206957467, + "grad_norm": 0.6965955495834351, + "learning_rate": 8.276600081532817e-05, + "loss": 3.6367, + "step": 20305 + }, + { + "epoch": 1.3799429270281287, + "grad_norm": 0.22916048765182495, + "learning_rate": 8.276175431444491e-05, + "loss": 3.841, + "step": 20310 + }, + { + "epoch": 1.3802826470987906, + "grad_norm": 0.1716703176498413, + "learning_rate": 8.275750781356163e-05, + "loss": 4.0571, + "step": 20315 + }, + { + "epoch": 1.3806223671694524, + "grad_norm": 0.2703772783279419, + "learning_rate": 8.275326131267835e-05, + "loss": 3.9625, + "step": 20320 + }, + { + "epoch": 1.380962087240114, + "grad_norm": 0.2232513725757599, + "learning_rate": 8.274901481179509e-05, + "loss": 3.7933, + "step": 20325 + }, + { + "epoch": 1.381301807310776, + "grad_norm": 0.17531871795654297, + "learning_rate": 8.274476831091182e-05, + "loss": 3.8449, + "step": 20330 + }, + { + "epoch": 1.3816415273814378, + "grad_norm": 0.1798347532749176, + "learning_rate": 8.274052181002853e-05, + "loss": 3.7751, + "step": 20335 + }, + { + "epoch": 1.3819812474520994, + "grad_norm": 0.20792394876480103, + "learning_rate": 8.273627530914527e-05, + "loss": 3.9919, + "step": 20340 + }, + { + "epoch": 1.3823209675227612, + "grad_norm": 0.16131193935871124, + "learning_rate": 8.2732028808262e-05, + "loss": 3.8381, + "step": 20345 + }, + { + "epoch": 1.382660687593423, + "grad_norm": 0.1802690327167511, + "learning_rate": 8.272778230737872e-05, + "loss": 3.8566, + "step": 20350 + }, + { + "epoch": 1.3830004076640847, + "grad_norm": 0.229839026927948, + "learning_rate": 8.272353580649546e-05, + "loss": 3.9012, + "step": 20355 + }, + { + "epoch": 1.3833401277347466, + "grad_norm": 0.16918809711933136, + "learning_rate": 8.271928930561217e-05, + "loss": 3.7401, + "step": 20360 + }, + { + "epoch": 1.3836798478054084, + "grad_norm": 0.168418288230896, + "learning_rate": 8.27150428047289e-05, + "loss": 4.0612, + "step": 20365 + }, + { + "epoch": 1.38401956787607, + "grad_norm": 0.18246056139469147, + "learning_rate": 8.271079630384564e-05, + "loss": 3.9847, + "step": 20370 + }, + { + "epoch": 1.384359287946732, + "grad_norm": 0.167828768491745, + "learning_rate": 8.270654980296236e-05, + "loss": 3.9953, + "step": 20375 + }, + { + "epoch": 1.3846990080173938, + "grad_norm": 0.14558953046798706, + "learning_rate": 8.270230330207909e-05, + "loss": 4.071, + "step": 20380 + }, + { + "epoch": 1.3850387280880554, + "grad_norm": 0.14689557254314423, + "learning_rate": 8.269805680119583e-05, + "loss": 3.9772, + "step": 20385 + }, + { + "epoch": 1.3853784481587172, + "grad_norm": 0.2008514106273651, + "learning_rate": 8.269381030031254e-05, + "loss": 3.8841, + "step": 20390 + }, + { + "epoch": 1.385718168229379, + "grad_norm": 0.21766845881938934, + "learning_rate": 8.268956379942927e-05, + "loss": 3.9536, + "step": 20395 + }, + { + "epoch": 1.3860578883000407, + "grad_norm": 0.16888660192489624, + "learning_rate": 8.268531729854601e-05, + "loss": 3.8925, + "step": 20400 + }, + { + "epoch": 1.3863976083707026, + "grad_norm": 0.2284618318080902, + "learning_rate": 8.268107079766273e-05, + "loss": 3.8376, + "step": 20405 + }, + { + "epoch": 1.3867373284413644, + "grad_norm": 0.483484148979187, + "learning_rate": 8.267682429677945e-05, + "loss": 3.845, + "step": 20410 + }, + { + "epoch": 1.387077048512026, + "grad_norm": 0.2019200176000595, + "learning_rate": 8.26725777958962e-05, + "loss": 3.9293, + "step": 20415 + }, + { + "epoch": 1.387416768582688, + "grad_norm": 0.23234306275844574, + "learning_rate": 8.266833129501291e-05, + "loss": 4.0369, + "step": 20420 + }, + { + "epoch": 1.3877564886533498, + "grad_norm": 0.14562487602233887, + "learning_rate": 8.266408479412964e-05, + "loss": 3.8212, + "step": 20425 + }, + { + "epoch": 1.3880962087240114, + "grad_norm": 0.18297959864139557, + "learning_rate": 8.265983829324637e-05, + "loss": 3.8678, + "step": 20430 + }, + { + "epoch": 1.3884359287946733, + "grad_norm": 0.19302566349506378, + "learning_rate": 8.26555917923631e-05, + "loss": 4.0212, + "step": 20435 + }, + { + "epoch": 1.3887756488653349, + "grad_norm": 0.1777810901403427, + "learning_rate": 8.265134529147982e-05, + "loss": 3.8602, + "step": 20440 + }, + { + "epoch": 1.3891153689359967, + "grad_norm": 0.17040647566318512, + "learning_rate": 8.264709879059655e-05, + "loss": 3.9538, + "step": 20445 + }, + { + "epoch": 1.3894550890066586, + "grad_norm": 0.22637617588043213, + "learning_rate": 8.264285228971328e-05, + "loss": 3.9812, + "step": 20450 + }, + { + "epoch": 1.3897948090773202, + "grad_norm": 0.18493828177452087, + "learning_rate": 8.263860578883e-05, + "loss": 3.7827, + "step": 20455 + }, + { + "epoch": 1.390134529147982, + "grad_norm": 0.1830572485923767, + "learning_rate": 8.263435928794673e-05, + "loss": 4.1226, + "step": 20460 + }, + { + "epoch": 1.3904742492186437, + "grad_norm": 0.15612681210041046, + "learning_rate": 8.263011278706346e-05, + "loss": 3.8599, + "step": 20465 + }, + { + "epoch": 1.3908139692893056, + "grad_norm": 0.1907326877117157, + "learning_rate": 8.262586628618019e-05, + "loss": 3.8173, + "step": 20470 + }, + { + "epoch": 1.3911536893599674, + "grad_norm": 0.19133837521076202, + "learning_rate": 8.262161978529692e-05, + "loss": 3.892, + "step": 20475 + }, + { + "epoch": 1.391493409430629, + "grad_norm": 0.18632636964321136, + "learning_rate": 8.261737328441365e-05, + "loss": 3.9205, + "step": 20480 + }, + { + "epoch": 1.391833129501291, + "grad_norm": 0.20682813227176666, + "learning_rate": 8.261312678353037e-05, + "loss": 3.8794, + "step": 20485 + }, + { + "epoch": 1.3921728495719528, + "grad_norm": 0.1677735298871994, + "learning_rate": 8.26088802826471e-05, + "loss": 3.9011, + "step": 20490 + }, + { + "epoch": 1.3925125696426144, + "grad_norm": 0.2501674294471741, + "learning_rate": 8.260463378176383e-05, + "loss": 3.8333, + "step": 20495 + }, + { + "epoch": 1.3928522897132762, + "grad_norm": 0.26450711488723755, + "learning_rate": 8.260038728088056e-05, + "loss": 3.8244, + "step": 20500 + }, + { + "epoch": 1.393192009783938, + "grad_norm": 0.18345464766025543, + "learning_rate": 8.259614077999729e-05, + "loss": 3.928, + "step": 20505 + }, + { + "epoch": 1.3935317298545997, + "grad_norm": 0.24953937530517578, + "learning_rate": 8.259189427911401e-05, + "loss": 3.8553, + "step": 20510 + }, + { + "epoch": 1.3938714499252616, + "grad_norm": 0.18246859312057495, + "learning_rate": 8.258764777823074e-05, + "loss": 3.8166, + "step": 20515 + }, + { + "epoch": 1.3942111699959234, + "grad_norm": 0.20396870374679565, + "learning_rate": 8.258340127734747e-05, + "loss": 3.9248, + "step": 20520 + }, + { + "epoch": 1.394550890066585, + "grad_norm": 0.17202244699001312, + "learning_rate": 8.25791547764642e-05, + "loss": 3.9021, + "step": 20525 + }, + { + "epoch": 1.394890610137247, + "grad_norm": 0.2120206505060196, + "learning_rate": 8.257490827558093e-05, + "loss": 3.5841, + "step": 20530 + }, + { + "epoch": 1.3952303302079088, + "grad_norm": 0.18400073051452637, + "learning_rate": 8.257066177469765e-05, + "loss": 3.9685, + "step": 20535 + }, + { + "epoch": 1.3955700502785704, + "grad_norm": 0.1451161950826645, + "learning_rate": 8.256641527381438e-05, + "loss": 3.9454, + "step": 20540 + }, + { + "epoch": 1.3959097703492322, + "grad_norm": 0.15899056196212769, + "learning_rate": 8.256216877293111e-05, + "loss": 4.0376, + "step": 20545 + }, + { + "epoch": 1.396249490419894, + "grad_norm": 0.22315488755702972, + "learning_rate": 8.255792227204784e-05, + "loss": 3.7391, + "step": 20550 + }, + { + "epoch": 1.3965892104905557, + "grad_norm": 0.5583938956260681, + "learning_rate": 8.255367577116457e-05, + "loss": 3.6885, + "step": 20555 + }, + { + "epoch": 1.3969289305612176, + "grad_norm": 0.13616697490215302, + "learning_rate": 8.25494292702813e-05, + "loss": 3.8911, + "step": 20560 + }, + { + "epoch": 1.3972686506318794, + "grad_norm": 0.23101896047592163, + "learning_rate": 8.254518276939802e-05, + "loss": 3.9205, + "step": 20565 + }, + { + "epoch": 1.397608370702541, + "grad_norm": 0.1999511867761612, + "learning_rate": 8.254093626851475e-05, + "loss": 3.88, + "step": 20570 + }, + { + "epoch": 1.397948090773203, + "grad_norm": 0.1608119010925293, + "learning_rate": 8.253668976763148e-05, + "loss": 3.66, + "step": 20575 + }, + { + "epoch": 1.3982878108438648, + "grad_norm": 0.17892685532569885, + "learning_rate": 8.25324432667482e-05, + "loss": 3.9773, + "step": 20580 + }, + { + "epoch": 1.3986275309145264, + "grad_norm": 0.2371005415916443, + "learning_rate": 8.252819676586493e-05, + "loss": 3.8546, + "step": 20585 + }, + { + "epoch": 1.3989672509851883, + "grad_norm": 0.2594485878944397, + "learning_rate": 8.252395026498166e-05, + "loss": 4.0152, + "step": 20590 + }, + { + "epoch": 1.3993069710558501, + "grad_norm": 0.31378039717674255, + "learning_rate": 8.251970376409839e-05, + "loss": 3.7924, + "step": 20595 + }, + { + "epoch": 1.3996466911265117, + "grad_norm": 0.20084799826145172, + "learning_rate": 8.251545726321512e-05, + "loss": 3.7907, + "step": 20600 + }, + { + "epoch": 1.3999864111971736, + "grad_norm": 0.16089700162410736, + "learning_rate": 8.251121076233185e-05, + "loss": 4.0315, + "step": 20605 + }, + { + "epoch": 1.4003261312678352, + "grad_norm": 0.1722809374332428, + "learning_rate": 8.250696426144857e-05, + "loss": 3.943, + "step": 20610 + }, + { + "epoch": 1.400665851338497, + "grad_norm": 0.5044102072715759, + "learning_rate": 8.25027177605653e-05, + "loss": 4.042, + "step": 20615 + }, + { + "epoch": 1.401005571409159, + "grad_norm": 0.18929114937782288, + "learning_rate": 8.249847125968203e-05, + "loss": 3.8738, + "step": 20620 + }, + { + "epoch": 1.4013452914798206, + "grad_norm": 0.29690635204315186, + "learning_rate": 8.249422475879876e-05, + "loss": 3.8999, + "step": 20625 + }, + { + "epoch": 1.4016850115504824, + "grad_norm": 0.12846887111663818, + "learning_rate": 8.248997825791547e-05, + "loss": 3.9202, + "step": 20630 + }, + { + "epoch": 1.402024731621144, + "grad_norm": 0.17639724910259247, + "learning_rate": 8.248573175703221e-05, + "loss": 3.7957, + "step": 20635 + }, + { + "epoch": 1.402364451691806, + "grad_norm": 0.16797500848770142, + "learning_rate": 8.248148525614894e-05, + "loss": 4.3257, + "step": 20640 + }, + { + "epoch": 1.4027041717624678, + "grad_norm": 0.2172061800956726, + "learning_rate": 8.247723875526566e-05, + "loss": 4.1237, + "step": 20645 + }, + { + "epoch": 1.4030438918331294, + "grad_norm": 0.22591371834278107, + "learning_rate": 8.24729922543824e-05, + "loss": 3.8891, + "step": 20650 + }, + { + "epoch": 1.4033836119037912, + "grad_norm": 0.6188000440597534, + "learning_rate": 8.246874575349913e-05, + "loss": 3.7984, + "step": 20655 + }, + { + "epoch": 1.403723331974453, + "grad_norm": 0.15622898936271667, + "learning_rate": 8.246449925261584e-05, + "loss": 3.9129, + "step": 20660 + }, + { + "epoch": 1.4040630520451147, + "grad_norm": 0.18553189933300018, + "learning_rate": 8.246025275173258e-05, + "loss": 3.9939, + "step": 20665 + }, + { + "epoch": 1.4044027721157766, + "grad_norm": 0.1783105432987213, + "learning_rate": 8.245600625084931e-05, + "loss": 3.8729, + "step": 20670 + }, + { + "epoch": 1.4047424921864384, + "grad_norm": 0.2580389082431793, + "learning_rate": 8.245175974996602e-05, + "loss": 3.9036, + "step": 20675 + }, + { + "epoch": 1.4050822122571, + "grad_norm": 0.21731121838092804, + "learning_rate": 8.244751324908277e-05, + "loss": 3.9889, + "step": 20680 + }, + { + "epoch": 1.405421932327762, + "grad_norm": 0.15238884091377258, + "learning_rate": 8.24432667481995e-05, + "loss": 3.844, + "step": 20685 + }, + { + "epoch": 1.4057616523984238, + "grad_norm": 0.3508720397949219, + "learning_rate": 8.243902024731621e-05, + "loss": 3.6518, + "step": 20690 + }, + { + "epoch": 1.4061013724690854, + "grad_norm": 0.3375933766365051, + "learning_rate": 8.243477374643295e-05, + "loss": 3.8646, + "step": 20695 + }, + { + "epoch": 1.4064410925397473, + "grad_norm": 0.4046700894832611, + "learning_rate": 8.243052724554968e-05, + "loss": 3.9045, + "step": 20700 + }, + { + "epoch": 1.406780812610409, + "grad_norm": 0.19106018543243408, + "learning_rate": 8.242628074466639e-05, + "loss": 3.8645, + "step": 20705 + }, + { + "epoch": 1.4071205326810707, + "grad_norm": 0.7103424668312073, + "learning_rate": 8.242203424378313e-05, + "loss": 4.2689, + "step": 20710 + }, + { + "epoch": 1.4074602527517326, + "grad_norm": 0.1847032904624939, + "learning_rate": 8.241778774289985e-05, + "loss": 3.8212, + "step": 20715 + }, + { + "epoch": 1.4077999728223944, + "grad_norm": 0.16688205301761627, + "learning_rate": 8.241354124201658e-05, + "loss": 3.9479, + "step": 20720 + }, + { + "epoch": 1.408139692893056, + "grad_norm": 0.20364980399608612, + "learning_rate": 8.240929474113332e-05, + "loss": 3.8265, + "step": 20725 + }, + { + "epoch": 1.408479412963718, + "grad_norm": 0.21920597553253174, + "learning_rate": 8.240504824025003e-05, + "loss": 3.7218, + "step": 20730 + }, + { + "epoch": 1.4088191330343798, + "grad_norm": 0.1633688509464264, + "learning_rate": 8.240080173936676e-05, + "loss": 3.9561, + "step": 20735 + }, + { + "epoch": 1.4091588531050414, + "grad_norm": 0.19013762474060059, + "learning_rate": 8.23965552384835e-05, + "loss": 3.7603, + "step": 20740 + }, + { + "epoch": 1.4094985731757033, + "grad_norm": 0.9276907444000244, + "learning_rate": 8.239230873760022e-05, + "loss": 3.8389, + "step": 20745 + }, + { + "epoch": 1.4098382932463651, + "grad_norm": 0.23153136670589447, + "learning_rate": 8.238806223671694e-05, + "loss": 4.1403, + "step": 20750 + }, + { + "epoch": 1.4101780133170267, + "grad_norm": 0.18778279423713684, + "learning_rate": 8.238381573583369e-05, + "loss": 3.777, + "step": 20755 + }, + { + "epoch": 1.4105177333876886, + "grad_norm": 0.14239093661308289, + "learning_rate": 8.23795692349504e-05, + "loss": 3.8091, + "step": 20760 + }, + { + "epoch": 1.4108574534583505, + "grad_norm": 0.3288138508796692, + "learning_rate": 8.237532273406713e-05, + "loss": 3.9178, + "step": 20765 + }, + { + "epoch": 1.411197173529012, + "grad_norm": 0.20908305048942566, + "learning_rate": 8.237107623318387e-05, + "loss": 4.1386, + "step": 20770 + }, + { + "epoch": 1.411536893599674, + "grad_norm": 0.21411341428756714, + "learning_rate": 8.236682973230058e-05, + "loss": 3.8525, + "step": 20775 + }, + { + "epoch": 1.4118766136703356, + "grad_norm": 0.15740644931793213, + "learning_rate": 8.236258323141731e-05, + "loss": 4.1978, + "step": 20780 + }, + { + "epoch": 1.4122163337409974, + "grad_norm": 0.2701818346977234, + "learning_rate": 8.235833673053404e-05, + "loss": 3.8943, + "step": 20785 + }, + { + "epoch": 1.4125560538116593, + "grad_norm": 0.33300185203552246, + "learning_rate": 8.235409022965077e-05, + "loss": 3.9684, + "step": 20790 + }, + { + "epoch": 1.412895773882321, + "grad_norm": 0.16675721108913422, + "learning_rate": 8.23498437287675e-05, + "loss": 4.047, + "step": 20795 + }, + { + "epoch": 1.4132354939529828, + "grad_norm": 0.3941943645477295, + "learning_rate": 8.234559722788422e-05, + "loss": 3.8967, + "step": 20800 + }, + { + "epoch": 1.4135752140236444, + "grad_norm": 0.15823981165885925, + "learning_rate": 8.234135072700095e-05, + "loss": 3.9109, + "step": 20805 + }, + { + "epoch": 1.4139149340943062, + "grad_norm": 0.16483734548091888, + "learning_rate": 8.233710422611768e-05, + "loss": 3.7376, + "step": 20810 + }, + { + "epoch": 1.414254654164968, + "grad_norm": 0.21595998108386993, + "learning_rate": 8.233285772523441e-05, + "loss": 4.0877, + "step": 20815 + }, + { + "epoch": 1.4145943742356297, + "grad_norm": 0.1910046488046646, + "learning_rate": 8.232861122435114e-05, + "loss": 3.9901, + "step": 20820 + }, + { + "epoch": 1.4149340943062916, + "grad_norm": 0.16039004921913147, + "learning_rate": 8.232436472346786e-05, + "loss": 3.9833, + "step": 20825 + }, + { + "epoch": 1.4152738143769534, + "grad_norm": 0.19336053729057312, + "learning_rate": 8.232011822258459e-05, + "loss": 3.9563, + "step": 20830 + }, + { + "epoch": 1.415613534447615, + "grad_norm": 0.1709805577993393, + "learning_rate": 8.231587172170132e-05, + "loss": 3.8441, + "step": 20835 + }, + { + "epoch": 1.415953254518277, + "grad_norm": 0.15607839822769165, + "learning_rate": 8.231162522081805e-05, + "loss": 4.095, + "step": 20840 + }, + { + "epoch": 1.4162929745889388, + "grad_norm": 0.3460797965526581, + "learning_rate": 8.230737871993478e-05, + "loss": 4.0787, + "step": 20845 + }, + { + "epoch": 1.4166326946596004, + "grad_norm": 0.1900099217891693, + "learning_rate": 8.23031322190515e-05, + "loss": 3.7558, + "step": 20850 + }, + { + "epoch": 1.4169724147302623, + "grad_norm": 0.16840973496437073, + "learning_rate": 8.229888571816823e-05, + "loss": 3.6512, + "step": 20855 + }, + { + "epoch": 1.417312134800924, + "grad_norm": 0.17189863324165344, + "learning_rate": 8.229463921728496e-05, + "loss": 3.8169, + "step": 20860 + }, + { + "epoch": 1.4176518548715857, + "grad_norm": 0.1927618384361267, + "learning_rate": 8.229039271640169e-05, + "loss": 3.8956, + "step": 20865 + }, + { + "epoch": 1.4179915749422476, + "grad_norm": 0.19402828812599182, + "learning_rate": 8.228614621551842e-05, + "loss": 3.9335, + "step": 20870 + }, + { + "epoch": 1.4183312950129094, + "grad_norm": 0.1602359563112259, + "learning_rate": 8.228189971463514e-05, + "loss": 3.9649, + "step": 20875 + }, + { + "epoch": 1.418671015083571, + "grad_norm": 0.1657918244600296, + "learning_rate": 8.227765321375187e-05, + "loss": 4.051, + "step": 20880 + }, + { + "epoch": 1.419010735154233, + "grad_norm": 0.24802999198436737, + "learning_rate": 8.22734067128686e-05, + "loss": 3.942, + "step": 20885 + }, + { + "epoch": 1.4193504552248948, + "grad_norm": 0.2361719012260437, + "learning_rate": 8.226916021198533e-05, + "loss": 3.8222, + "step": 20890 + }, + { + "epoch": 1.4196901752955564, + "grad_norm": 1.6782368421554565, + "learning_rate": 8.226491371110206e-05, + "loss": 4.1212, + "step": 20895 + }, + { + "epoch": 1.4200298953662183, + "grad_norm": 0.17618829011917114, + "learning_rate": 8.226066721021878e-05, + "loss": 4.0812, + "step": 20900 + }, + { + "epoch": 1.4203696154368801, + "grad_norm": 0.19263197481632233, + "learning_rate": 8.225642070933551e-05, + "loss": 3.8659, + "step": 20905 + }, + { + "epoch": 1.4207093355075417, + "grad_norm": 0.1882973611354828, + "learning_rate": 8.225217420845224e-05, + "loss": 4.0679, + "step": 20910 + }, + { + "epoch": 1.4210490555782036, + "grad_norm": 0.6623756289482117, + "learning_rate": 8.224792770756897e-05, + "loss": 3.8181, + "step": 20915 + }, + { + "epoch": 1.4213887756488655, + "grad_norm": 0.18475428223609924, + "learning_rate": 8.22436812066857e-05, + "loss": 4.0344, + "step": 20920 + }, + { + "epoch": 1.421728495719527, + "grad_norm": 0.20617535710334778, + "learning_rate": 8.223943470580242e-05, + "loss": 3.8778, + "step": 20925 + }, + { + "epoch": 1.422068215790189, + "grad_norm": 0.17627649009227753, + "learning_rate": 8.223518820491915e-05, + "loss": 3.8603, + "step": 20930 + }, + { + "epoch": 1.4224079358608508, + "grad_norm": 0.17530901730060577, + "learning_rate": 8.223094170403588e-05, + "loss": 3.8346, + "step": 20935 + }, + { + "epoch": 1.4227476559315124, + "grad_norm": 0.1848442554473877, + "learning_rate": 8.222669520315261e-05, + "loss": 3.9002, + "step": 20940 + }, + { + "epoch": 1.4230873760021743, + "grad_norm": 0.16413180530071259, + "learning_rate": 8.222244870226934e-05, + "loss": 3.8241, + "step": 20945 + }, + { + "epoch": 1.423427096072836, + "grad_norm": 0.2092829793691635, + "learning_rate": 8.221820220138606e-05, + "loss": 3.9645, + "step": 20950 + }, + { + "epoch": 1.4237668161434978, + "grad_norm": 0.169973686337471, + "learning_rate": 8.221395570050279e-05, + "loss": 4.108, + "step": 20955 + }, + { + "epoch": 1.4241065362141596, + "grad_norm": 0.19913798570632935, + "learning_rate": 8.220970919961952e-05, + "loss": 3.982, + "step": 20960 + }, + { + "epoch": 1.4244462562848212, + "grad_norm": 0.15906639397144318, + "learning_rate": 8.220546269873625e-05, + "loss": 3.8677, + "step": 20965 + }, + { + "epoch": 1.424785976355483, + "grad_norm": 0.22668901085853577, + "learning_rate": 8.220121619785298e-05, + "loss": 4.0036, + "step": 20970 + }, + { + "epoch": 1.4251256964261447, + "grad_norm": 0.2476281225681305, + "learning_rate": 8.21969696969697e-05, + "loss": 3.7996, + "step": 20975 + }, + { + "epoch": 1.4254654164968066, + "grad_norm": 0.2064250111579895, + "learning_rate": 8.219272319608643e-05, + "loss": 4.2097, + "step": 20980 + }, + { + "epoch": 1.4258051365674684, + "grad_norm": 0.1753351241350174, + "learning_rate": 8.218847669520315e-05, + "loss": 3.7907, + "step": 20985 + }, + { + "epoch": 1.42614485663813, + "grad_norm": 0.2399117797613144, + "learning_rate": 8.218423019431989e-05, + "loss": 3.7094, + "step": 20990 + }, + { + "epoch": 1.426484576708792, + "grad_norm": 0.31722328066825867, + "learning_rate": 8.217998369343662e-05, + "loss": 3.6369, + "step": 20995 + }, + { + "epoch": 1.4268242967794538, + "grad_norm": 0.19348561763763428, + "learning_rate": 8.217573719255333e-05, + "loss": 3.8866, + "step": 21000 + }, + { + "epoch": 1.4271640168501154, + "grad_norm": 0.18979719281196594, + "learning_rate": 8.217149069167007e-05, + "loss": 4.0057, + "step": 21005 + }, + { + "epoch": 1.4275037369207773, + "grad_norm": 0.19874157011508942, + "learning_rate": 8.21672441907868e-05, + "loss": 4.1563, + "step": 21010 + }, + { + "epoch": 1.427843456991439, + "grad_norm": 0.16244027018547058, + "learning_rate": 8.216299768990352e-05, + "loss": 4.1136, + "step": 21015 + }, + { + "epoch": 1.4281831770621007, + "grad_norm": 0.44101476669311523, + "learning_rate": 8.215875118902026e-05, + "loss": 3.7816, + "step": 21020 + }, + { + "epoch": 1.4285228971327626, + "grad_norm": 0.16287243366241455, + "learning_rate": 8.215450468813698e-05, + "loss": 3.7884, + "step": 21025 + }, + { + "epoch": 1.4288626172034244, + "grad_norm": 0.17161798477172852, + "learning_rate": 8.21502581872537e-05, + "loss": 4.0035, + "step": 21030 + }, + { + "epoch": 1.429202337274086, + "grad_norm": 0.5789480209350586, + "learning_rate": 8.214601168637044e-05, + "loss": 3.9572, + "step": 21035 + }, + { + "epoch": 1.429542057344748, + "grad_norm": 0.1632133424282074, + "learning_rate": 8.214176518548717e-05, + "loss": 4.1409, + "step": 21040 + }, + { + "epoch": 1.4298817774154098, + "grad_norm": 0.15144796669483185, + "learning_rate": 8.213751868460388e-05, + "loss": 3.8937, + "step": 21045 + }, + { + "epoch": 1.4302214974860714, + "grad_norm": 0.16633948683738708, + "learning_rate": 8.213327218372062e-05, + "loss": 4.0276, + "step": 21050 + }, + { + "epoch": 1.4305612175567333, + "grad_norm": 0.18662403523921967, + "learning_rate": 8.212902568283735e-05, + "loss": 3.9667, + "step": 21055 + }, + { + "epoch": 1.4309009376273951, + "grad_norm": 0.23155295848846436, + "learning_rate": 8.212477918195407e-05, + "loss": 4.0141, + "step": 21060 + }, + { + "epoch": 1.4312406576980568, + "grad_norm": 0.16945880651474, + "learning_rate": 8.212053268107081e-05, + "loss": 3.928, + "step": 21065 + }, + { + "epoch": 1.4315803777687186, + "grad_norm": 0.1631777286529541, + "learning_rate": 8.211628618018752e-05, + "loss": 3.8216, + "step": 21070 + }, + { + "epoch": 1.4319200978393805, + "grad_norm": 0.32249847054481506, + "learning_rate": 8.211203967930425e-05, + "loss": 3.8325, + "step": 21075 + }, + { + "epoch": 1.432259817910042, + "grad_norm": 0.18511350452899933, + "learning_rate": 8.210779317842099e-05, + "loss": 3.8539, + "step": 21080 + }, + { + "epoch": 1.432599537980704, + "grad_norm": 0.26636990904808044, + "learning_rate": 8.210354667753771e-05, + "loss": 3.9061, + "step": 21085 + }, + { + "epoch": 1.4329392580513658, + "grad_norm": 0.1834803968667984, + "learning_rate": 8.209930017665444e-05, + "loss": 3.9185, + "step": 21090 + }, + { + "epoch": 1.4332789781220274, + "grad_norm": 0.15184654295444489, + "learning_rate": 8.209505367577118e-05, + "loss": 3.7936, + "step": 21095 + }, + { + "epoch": 1.4336186981926893, + "grad_norm": 0.21445034444332123, + "learning_rate": 8.209080717488789e-05, + "loss": 3.7076, + "step": 21100 + }, + { + "epoch": 1.4339584182633511, + "grad_norm": 0.23629005253314972, + "learning_rate": 8.208656067400462e-05, + "loss": 3.7299, + "step": 21105 + }, + { + "epoch": 1.4342981383340128, + "grad_norm": 0.16976600885391235, + "learning_rate": 8.208231417312136e-05, + "loss": 3.9154, + "step": 21110 + }, + { + "epoch": 1.4346378584046746, + "grad_norm": 0.5715909004211426, + "learning_rate": 8.207806767223808e-05, + "loss": 3.9288, + "step": 21115 + }, + { + "epoch": 1.4349775784753362, + "grad_norm": 0.19846251606941223, + "learning_rate": 8.20738211713548e-05, + "loss": 4.1136, + "step": 21120 + }, + { + "epoch": 1.435317298545998, + "grad_norm": 0.15568068623542786, + "learning_rate": 8.206957467047154e-05, + "loss": 3.8081, + "step": 21125 + }, + { + "epoch": 1.43565701861666, + "grad_norm": 0.1685870736837387, + "learning_rate": 8.206532816958826e-05, + "loss": 4.0102, + "step": 21130 + }, + { + "epoch": 1.4359967386873216, + "grad_norm": 0.13728953897953033, + "learning_rate": 8.206108166870499e-05, + "loss": 4.0357, + "step": 21135 + }, + { + "epoch": 1.4363364587579834, + "grad_norm": 0.18607132136821747, + "learning_rate": 8.205683516782172e-05, + "loss": 4.0537, + "step": 21140 + }, + { + "epoch": 1.436676178828645, + "grad_norm": 0.17334622144699097, + "learning_rate": 8.205258866693844e-05, + "loss": 3.8504, + "step": 21145 + }, + { + "epoch": 1.437015898899307, + "grad_norm": 0.24115973711013794, + "learning_rate": 8.204834216605517e-05, + "loss": 3.8659, + "step": 21150 + }, + { + "epoch": 1.4373556189699688, + "grad_norm": 2.416486978530884, + "learning_rate": 8.20440956651719e-05, + "loss": 3.7734, + "step": 21155 + }, + { + "epoch": 1.4376953390406304, + "grad_norm": 0.25191640853881836, + "learning_rate": 8.203984916428863e-05, + "loss": 3.8277, + "step": 21160 + }, + { + "epoch": 1.4380350591112923, + "grad_norm": 0.19989247620105743, + "learning_rate": 8.203560266340536e-05, + "loss": 3.9746, + "step": 21165 + }, + { + "epoch": 1.438374779181954, + "grad_norm": 0.1565609872341156, + "learning_rate": 8.203135616252208e-05, + "loss": 4.2268, + "step": 21170 + }, + { + "epoch": 1.4387144992526157, + "grad_norm": 0.20465902984142303, + "learning_rate": 8.202710966163881e-05, + "loss": 4.0392, + "step": 21175 + }, + { + "epoch": 1.4390542193232776, + "grad_norm": 0.14893248677253723, + "learning_rate": 8.202286316075554e-05, + "loss": 3.7666, + "step": 21180 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.1665772944688797, + "learning_rate": 8.201861665987227e-05, + "loss": 4.0514, + "step": 21185 + }, + { + "epoch": 1.439733659464601, + "grad_norm": 0.21879713237285614, + "learning_rate": 8.2014370158989e-05, + "loss": 3.767, + "step": 21190 + }, + { + "epoch": 1.440073379535263, + "grad_norm": 0.21291925013065338, + "learning_rate": 8.201012365810572e-05, + "loss": 3.8719, + "step": 21195 + }, + { + "epoch": 1.4404130996059248, + "grad_norm": 0.17573679983615875, + "learning_rate": 8.200587715722245e-05, + "loss": 3.8397, + "step": 21200 + }, + { + "epoch": 1.4407528196765864, + "grad_norm": 0.18540260195732117, + "learning_rate": 8.200163065633918e-05, + "loss": 4.2166, + "step": 21205 + }, + { + "epoch": 1.4410925397472483, + "grad_norm": 0.17198298871517181, + "learning_rate": 8.199738415545591e-05, + "loss": 3.7897, + "step": 21210 + }, + { + "epoch": 1.4414322598179101, + "grad_norm": 0.18037156760692596, + "learning_rate": 8.199313765457264e-05, + "loss": 3.9934, + "step": 21215 + }, + { + "epoch": 1.4417719798885718, + "grad_norm": 0.18208232522010803, + "learning_rate": 8.198889115368936e-05, + "loss": 3.9537, + "step": 21220 + }, + { + "epoch": 1.4421116999592336, + "grad_norm": 0.17620067298412323, + "learning_rate": 8.198464465280609e-05, + "loss": 3.9918, + "step": 21225 + }, + { + "epoch": 1.4424514200298955, + "grad_norm": 0.17730148136615753, + "learning_rate": 8.198039815192282e-05, + "loss": 3.8759, + "step": 21230 + }, + { + "epoch": 1.442791140100557, + "grad_norm": 0.2497914880514145, + "learning_rate": 8.197615165103955e-05, + "loss": 3.8654, + "step": 21235 + }, + { + "epoch": 1.443130860171219, + "grad_norm": 0.1706199049949646, + "learning_rate": 8.197190515015628e-05, + "loss": 3.8929, + "step": 21240 + }, + { + "epoch": 1.4434705802418808, + "grad_norm": 0.17563709616661072, + "learning_rate": 8.1967658649273e-05, + "loss": 4.1412, + "step": 21245 + }, + { + "epoch": 1.4438103003125424, + "grad_norm": 0.19810359179973602, + "learning_rate": 8.196341214838973e-05, + "loss": 3.8461, + "step": 21250 + }, + { + "epoch": 1.4441500203832043, + "grad_norm": 0.2019626498222351, + "learning_rate": 8.195916564750646e-05, + "loss": 4.0521, + "step": 21255 + }, + { + "epoch": 1.4444897404538661, + "grad_norm": 0.507472038269043, + "learning_rate": 8.195491914662319e-05, + "loss": 3.88, + "step": 21260 + }, + { + "epoch": 1.4448294605245278, + "grad_norm": 0.193507120013237, + "learning_rate": 8.195067264573992e-05, + "loss": 3.7817, + "step": 21265 + }, + { + "epoch": 1.4451691805951896, + "grad_norm": 0.1554255485534668, + "learning_rate": 8.194642614485664e-05, + "loss": 3.8584, + "step": 21270 + }, + { + "epoch": 1.4455089006658515, + "grad_norm": 0.24435606598854065, + "learning_rate": 8.194217964397337e-05, + "loss": 3.8308, + "step": 21275 + }, + { + "epoch": 1.445848620736513, + "grad_norm": 0.21548952162265778, + "learning_rate": 8.19379331430901e-05, + "loss": 3.9146, + "step": 21280 + }, + { + "epoch": 1.446188340807175, + "grad_norm": 0.2216891050338745, + "learning_rate": 8.193368664220683e-05, + "loss": 4.0798, + "step": 21285 + }, + { + "epoch": 1.4465280608778366, + "grad_norm": 0.16216230392456055, + "learning_rate": 8.192944014132356e-05, + "loss": 4.1395, + "step": 21290 + }, + { + "epoch": 1.4468677809484984, + "grad_norm": 0.1850496083498001, + "learning_rate": 8.192519364044028e-05, + "loss": 3.8735, + "step": 21295 + }, + { + "epoch": 1.4472075010191603, + "grad_norm": 0.21830938756465912, + "learning_rate": 8.192094713955701e-05, + "loss": 3.8555, + "step": 21300 + }, + { + "epoch": 1.447547221089822, + "grad_norm": 0.18808721005916595, + "learning_rate": 8.191670063867374e-05, + "loss": 3.7999, + "step": 21305 + }, + { + "epoch": 1.4478869411604838, + "grad_norm": 0.17853547632694244, + "learning_rate": 8.191245413779047e-05, + "loss": 3.807, + "step": 21310 + }, + { + "epoch": 1.4482266612311454, + "grad_norm": 0.21069560945034027, + "learning_rate": 8.19082076369072e-05, + "loss": 3.7181, + "step": 21315 + }, + { + "epoch": 1.4485663813018073, + "grad_norm": 0.17692461609840393, + "learning_rate": 8.190396113602392e-05, + "loss": 3.8038, + "step": 21320 + }, + { + "epoch": 1.4489061013724691, + "grad_norm": 0.18058958649635315, + "learning_rate": 8.189971463514065e-05, + "loss": 3.9721, + "step": 21325 + }, + { + "epoch": 1.4492458214431307, + "grad_norm": 0.21195021271705627, + "learning_rate": 8.189546813425738e-05, + "loss": 3.9, + "step": 21330 + }, + { + "epoch": 1.4495855415137926, + "grad_norm": 0.16935381293296814, + "learning_rate": 8.189122163337411e-05, + "loss": 4.1064, + "step": 21335 + }, + { + "epoch": 1.4499252615844545, + "grad_norm": 0.19831079244613647, + "learning_rate": 8.188697513249082e-05, + "loss": 3.7827, + "step": 21340 + }, + { + "epoch": 1.450264981655116, + "grad_norm": 0.17083477973937988, + "learning_rate": 8.188272863160756e-05, + "loss": 4.0263, + "step": 21345 + }, + { + "epoch": 1.450604701725778, + "grad_norm": 0.5333516001701355, + "learning_rate": 8.187848213072429e-05, + "loss": 3.8527, + "step": 21350 + }, + { + "epoch": 1.4509444217964398, + "grad_norm": 0.2195538729429245, + "learning_rate": 8.1874235629841e-05, + "loss": 3.9091, + "step": 21355 + }, + { + "epoch": 1.4512841418671014, + "grad_norm": 0.1787203848361969, + "learning_rate": 8.186998912895775e-05, + "loss": 4.0015, + "step": 21360 + }, + { + "epoch": 1.4516238619377633, + "grad_norm": 0.17264916002750397, + "learning_rate": 8.186574262807448e-05, + "loss": 3.7295, + "step": 21365 + }, + { + "epoch": 1.4519635820084251, + "grad_norm": 0.2010071575641632, + "learning_rate": 8.186149612719119e-05, + "loss": 4.1687, + "step": 21370 + }, + { + "epoch": 1.4523033020790868, + "grad_norm": 0.16693753004074097, + "learning_rate": 8.185724962630793e-05, + "loss": 4.0766, + "step": 21375 + }, + { + "epoch": 1.4526430221497486, + "grad_norm": 0.18354806303977966, + "learning_rate": 8.185300312542466e-05, + "loss": 3.6372, + "step": 21380 + }, + { + "epoch": 1.4529827422204105, + "grad_norm": 0.8385973572731018, + "learning_rate": 8.184875662454137e-05, + "loss": 4.1342, + "step": 21385 + }, + { + "epoch": 1.453322462291072, + "grad_norm": 0.20215268433094025, + "learning_rate": 8.184451012365812e-05, + "loss": 4.0694, + "step": 21390 + }, + { + "epoch": 1.453662182361734, + "grad_norm": 0.2252776175737381, + "learning_rate": 8.184026362277484e-05, + "loss": 3.7176, + "step": 21395 + }, + { + "epoch": 1.4540019024323958, + "grad_norm": 0.16187147796154022, + "learning_rate": 8.183601712189156e-05, + "loss": 3.6411, + "step": 21400 + }, + { + "epoch": 1.4543416225030574, + "grad_norm": 0.44014227390289307, + "learning_rate": 8.18317706210083e-05, + "loss": 3.9457, + "step": 21405 + }, + { + "epoch": 1.4546813425737193, + "grad_norm": 0.2793375849723816, + "learning_rate": 8.182752412012501e-05, + "loss": 3.9946, + "step": 21410 + }, + { + "epoch": 1.4550210626443811, + "grad_norm": 0.21300536394119263, + "learning_rate": 8.182327761924174e-05, + "loss": 3.9029, + "step": 21415 + }, + { + "epoch": 1.4553607827150428, + "grad_norm": 0.22775651514530182, + "learning_rate": 8.181903111835848e-05, + "loss": 3.8267, + "step": 21420 + }, + { + "epoch": 1.4557005027857046, + "grad_norm": 0.15846438705921173, + "learning_rate": 8.18147846174752e-05, + "loss": 3.7452, + "step": 21425 + }, + { + "epoch": 1.4560402228563665, + "grad_norm": 0.3081473708152771, + "learning_rate": 8.181053811659193e-05, + "loss": 3.9311, + "step": 21430 + }, + { + "epoch": 1.456379942927028, + "grad_norm": 0.17808730900287628, + "learning_rate": 8.180629161570867e-05, + "loss": 4.0536, + "step": 21435 + }, + { + "epoch": 1.45671966299769, + "grad_norm": 0.17139832675457, + "learning_rate": 8.180204511482538e-05, + "loss": 4.0515, + "step": 21440 + }, + { + "epoch": 1.4570593830683518, + "grad_norm": 0.20215845108032227, + "learning_rate": 8.179779861394211e-05, + "loss": 3.9739, + "step": 21445 + }, + { + "epoch": 1.4573991031390134, + "grad_norm": 0.17090734839439392, + "learning_rate": 8.179355211305885e-05, + "loss": 4.1245, + "step": 21450 + }, + { + "epoch": 1.4577388232096753, + "grad_norm": 0.17363929748535156, + "learning_rate": 8.178930561217557e-05, + "loss": 4.0509, + "step": 21455 + }, + { + "epoch": 1.458078543280337, + "grad_norm": 0.18686549365520477, + "learning_rate": 8.17850591112923e-05, + "loss": 3.8473, + "step": 21460 + }, + { + "epoch": 1.4584182633509988, + "grad_norm": 0.17829811573028564, + "learning_rate": 8.178081261040904e-05, + "loss": 4.0523, + "step": 21465 + }, + { + "epoch": 1.4587579834216606, + "grad_norm": 0.14514441788196564, + "learning_rate": 8.177656610952575e-05, + "loss": 3.7057, + "step": 21470 + }, + { + "epoch": 1.4590977034923223, + "grad_norm": 0.18036119639873505, + "learning_rate": 8.177231960864248e-05, + "loss": 3.715, + "step": 21475 + }, + { + "epoch": 1.4594374235629841, + "grad_norm": 0.29599207639694214, + "learning_rate": 8.176807310775922e-05, + "loss": 4.1213, + "step": 21480 + }, + { + "epoch": 1.4597771436336457, + "grad_norm": 0.24082861840724945, + "learning_rate": 8.176382660687593e-05, + "loss": 3.9192, + "step": 21485 + }, + { + "epoch": 1.4601168637043076, + "grad_norm": 0.21615785360336304, + "learning_rate": 8.175958010599266e-05, + "loss": 4.1346, + "step": 21490 + }, + { + "epoch": 1.4604565837749695, + "grad_norm": 0.21692131459712982, + "learning_rate": 8.175533360510939e-05, + "loss": 4.0504, + "step": 21495 + }, + { + "epoch": 1.460796303845631, + "grad_norm": 0.16427777707576752, + "learning_rate": 8.175108710422612e-05, + "loss": 3.9508, + "step": 21500 + }, + { + "epoch": 1.461136023916293, + "grad_norm": 0.15842287242412567, + "learning_rate": 8.174684060334285e-05, + "loss": 3.8486, + "step": 21505 + }, + { + "epoch": 1.4614757439869548, + "grad_norm": 0.14675350487232208, + "learning_rate": 8.174259410245957e-05, + "loss": 4.1129, + "step": 21510 + }, + { + "epoch": 1.4618154640576164, + "grad_norm": 0.6849441528320312, + "learning_rate": 8.17383476015763e-05, + "loss": 4.062, + "step": 21515 + }, + { + "epoch": 1.4621551841282783, + "grad_norm": 0.2675241231918335, + "learning_rate": 8.173410110069303e-05, + "loss": 3.7483, + "step": 21520 + }, + { + "epoch": 1.4624949041989401, + "grad_norm": 0.15431304275989532, + "learning_rate": 8.172985459980976e-05, + "loss": 4.0661, + "step": 21525 + }, + { + "epoch": 1.4628346242696018, + "grad_norm": 0.18781180679798126, + "learning_rate": 8.172560809892649e-05, + "loss": 3.9574, + "step": 21530 + }, + { + "epoch": 1.4631743443402636, + "grad_norm": 0.19349415600299835, + "learning_rate": 8.172136159804321e-05, + "loss": 4.0089, + "step": 21535 + }, + { + "epoch": 1.4635140644109255, + "grad_norm": 0.23189841210842133, + "learning_rate": 8.171711509715994e-05, + "loss": 4.0482, + "step": 21540 + }, + { + "epoch": 1.463853784481587, + "grad_norm": 0.15885671973228455, + "learning_rate": 8.171286859627667e-05, + "loss": 3.8203, + "step": 21545 + }, + { + "epoch": 1.464193504552249, + "grad_norm": 0.17275947332382202, + "learning_rate": 8.17086220953934e-05, + "loss": 4.1032, + "step": 21550 + }, + { + "epoch": 1.4645332246229108, + "grad_norm": 0.19999124109745026, + "learning_rate": 8.170437559451013e-05, + "loss": 3.9172, + "step": 21555 + }, + { + "epoch": 1.4648729446935724, + "grad_norm": 0.17095349729061127, + "learning_rate": 8.170012909362685e-05, + "loss": 3.9468, + "step": 21560 + }, + { + "epoch": 1.4652126647642343, + "grad_norm": 0.14642515778541565, + "learning_rate": 8.169588259274358e-05, + "loss": 4.0876, + "step": 21565 + }, + { + "epoch": 1.4655523848348961, + "grad_norm": 0.16695556044578552, + "learning_rate": 8.169163609186031e-05, + "loss": 4.0238, + "step": 21570 + }, + { + "epoch": 1.4658921049055578, + "grad_norm": 0.16036422550678253, + "learning_rate": 8.168738959097704e-05, + "loss": 4.0682, + "step": 21575 + }, + { + "epoch": 1.4662318249762196, + "grad_norm": 0.9543379545211792, + "learning_rate": 8.168314309009377e-05, + "loss": 4.1337, + "step": 21580 + }, + { + "epoch": 1.4665715450468815, + "grad_norm": 0.17027482390403748, + "learning_rate": 8.16788965892105e-05, + "loss": 4.022, + "step": 21585 + }, + { + "epoch": 1.466911265117543, + "grad_norm": 0.13834522664546967, + "learning_rate": 8.167465008832722e-05, + "loss": 3.6904, + "step": 21590 + }, + { + "epoch": 1.467250985188205, + "grad_norm": 0.3163164258003235, + "learning_rate": 8.167040358744395e-05, + "loss": 3.9433, + "step": 21595 + }, + { + "epoch": 1.4675907052588668, + "grad_norm": 0.16317187249660492, + "learning_rate": 8.166615708656068e-05, + "loss": 3.9733, + "step": 21600 + }, + { + "epoch": 1.4679304253295284, + "grad_norm": 0.19929274916648865, + "learning_rate": 8.16619105856774e-05, + "loss": 3.9078, + "step": 21605 + }, + { + "epoch": 1.4682701454001903, + "grad_norm": 0.5431677103042603, + "learning_rate": 8.165766408479413e-05, + "loss": 3.9937, + "step": 21610 + }, + { + "epoch": 1.4686098654708521, + "grad_norm": 0.16875344514846802, + "learning_rate": 8.165341758391086e-05, + "loss": 3.7644, + "step": 21615 + }, + { + "epoch": 1.4689495855415138, + "grad_norm": 0.22793284058570862, + "learning_rate": 8.164917108302759e-05, + "loss": 3.7872, + "step": 21620 + }, + { + "epoch": 1.4692893056121756, + "grad_norm": 4.586470127105713, + "learning_rate": 8.164492458214432e-05, + "loss": 3.902, + "step": 21625 + }, + { + "epoch": 1.4696290256828373, + "grad_norm": 0.16205987334251404, + "learning_rate": 8.164067808126105e-05, + "loss": 3.6601, + "step": 21630 + }, + { + "epoch": 1.4699687457534991, + "grad_norm": 0.1852976679801941, + "learning_rate": 8.163643158037777e-05, + "loss": 3.9805, + "step": 21635 + }, + { + "epoch": 1.470308465824161, + "grad_norm": 0.20962713658809662, + "learning_rate": 8.16321850794945e-05, + "loss": 3.7721, + "step": 21640 + }, + { + "epoch": 1.4706481858948226, + "grad_norm": 0.16768406331539154, + "learning_rate": 8.162793857861123e-05, + "loss": 3.6707, + "step": 21645 + }, + { + "epoch": 1.4709879059654845, + "grad_norm": 0.18406492471694946, + "learning_rate": 8.162369207772796e-05, + "loss": 4.0484, + "step": 21650 + }, + { + "epoch": 1.471327626036146, + "grad_norm": 0.20939980447292328, + "learning_rate": 8.161944557684469e-05, + "loss": 3.8459, + "step": 21655 + }, + { + "epoch": 1.471667346106808, + "grad_norm": 0.21590302884578705, + "learning_rate": 8.161519907596141e-05, + "loss": 3.7887, + "step": 21660 + }, + { + "epoch": 1.4720070661774698, + "grad_norm": 1.8166537284851074, + "learning_rate": 8.161095257507814e-05, + "loss": 4.0137, + "step": 21665 + }, + { + "epoch": 1.4723467862481314, + "grad_norm": 0.1635037213563919, + "learning_rate": 8.160670607419487e-05, + "loss": 4.155, + "step": 21670 + }, + { + "epoch": 1.4726865063187933, + "grad_norm": 0.26899540424346924, + "learning_rate": 8.16024595733116e-05, + "loss": 3.893, + "step": 21675 + }, + { + "epoch": 1.4730262263894551, + "grad_norm": 0.14848864078521729, + "learning_rate": 8.159821307242833e-05, + "loss": 3.8561, + "step": 21680 + }, + { + "epoch": 1.4733659464601168, + "grad_norm": 0.17912627756595612, + "learning_rate": 8.159396657154505e-05, + "loss": 3.7832, + "step": 21685 + }, + { + "epoch": 1.4737056665307786, + "grad_norm": 0.16671313345432281, + "learning_rate": 8.158972007066178e-05, + "loss": 3.8998, + "step": 21690 + }, + { + "epoch": 1.4740453866014405, + "grad_norm": 0.1994152069091797, + "learning_rate": 8.15854735697785e-05, + "loss": 4.0428, + "step": 21695 + }, + { + "epoch": 1.474385106672102, + "grad_norm": 0.15787015855312347, + "learning_rate": 8.158122706889524e-05, + "loss": 3.8654, + "step": 21700 + }, + { + "epoch": 1.474724826742764, + "grad_norm": 0.2328563928604126, + "learning_rate": 8.157698056801197e-05, + "loss": 3.8722, + "step": 21705 + }, + { + "epoch": 1.4750645468134258, + "grad_norm": 0.1907779425382614, + "learning_rate": 8.157273406712868e-05, + "loss": 3.9187, + "step": 21710 + }, + { + "epoch": 1.4754042668840874, + "grad_norm": 0.13368162512779236, + "learning_rate": 8.156848756624542e-05, + "loss": 3.976, + "step": 21715 + }, + { + "epoch": 1.4757439869547493, + "grad_norm": 0.22012904286384583, + "learning_rate": 8.156424106536215e-05, + "loss": 3.9433, + "step": 21720 + }, + { + "epoch": 1.4760837070254111, + "grad_norm": 0.26086264848709106, + "learning_rate": 8.155999456447887e-05, + "loss": 3.989, + "step": 21725 + }, + { + "epoch": 1.4764234270960728, + "grad_norm": 0.7943768501281738, + "learning_rate": 8.155574806359561e-05, + "loss": 3.8233, + "step": 21730 + }, + { + "epoch": 1.4767631471667346, + "grad_norm": 0.163779616355896, + "learning_rate": 8.155150156271233e-05, + "loss": 3.8236, + "step": 21735 + }, + { + "epoch": 1.4771028672373965, + "grad_norm": 0.16415409743785858, + "learning_rate": 8.154725506182905e-05, + "loss": 3.7924, + "step": 21740 + }, + { + "epoch": 1.477442587308058, + "grad_norm": 0.18303167819976807, + "learning_rate": 8.154300856094579e-05, + "loss": 3.7799, + "step": 21745 + }, + { + "epoch": 1.47778230737872, + "grad_norm": 0.1833762228488922, + "learning_rate": 8.153876206006252e-05, + "loss": 3.8077, + "step": 21750 + }, + { + "epoch": 1.4781220274493818, + "grad_norm": 0.24025312066078186, + "learning_rate": 8.153451555917923e-05, + "loss": 3.9481, + "step": 21755 + }, + { + "epoch": 1.4784617475200434, + "grad_norm": 0.16102038323879242, + "learning_rate": 8.153026905829597e-05, + "loss": 4.0759, + "step": 21760 + }, + { + "epoch": 1.4788014675907053, + "grad_norm": 0.20599770545959473, + "learning_rate": 8.152602255741269e-05, + "loss": 4.1731, + "step": 21765 + }, + { + "epoch": 1.4791411876613672, + "grad_norm": 0.2513538897037506, + "learning_rate": 8.152177605652942e-05, + "loss": 4.0091, + "step": 21770 + }, + { + "epoch": 1.4794809077320288, + "grad_norm": 0.15865686535835266, + "learning_rate": 8.151752955564616e-05, + "loss": 3.7982, + "step": 21775 + }, + { + "epoch": 1.4798206278026906, + "grad_norm": 0.20484325289726257, + "learning_rate": 8.151328305476287e-05, + "loss": 3.7247, + "step": 21780 + }, + { + "epoch": 1.4801603478733525, + "grad_norm": 0.29166272282600403, + "learning_rate": 8.15090365538796e-05, + "loss": 4.054, + "step": 21785 + }, + { + "epoch": 1.4805000679440141, + "grad_norm": 0.2248586118221283, + "learning_rate": 8.150479005299634e-05, + "loss": 3.982, + "step": 21790 + }, + { + "epoch": 1.480839788014676, + "grad_norm": 0.17249290645122528, + "learning_rate": 8.150054355211306e-05, + "loss": 4.077, + "step": 21795 + }, + { + "epoch": 1.4811795080853376, + "grad_norm": 0.21560299396514893, + "learning_rate": 8.149629705122979e-05, + "loss": 3.9082, + "step": 21800 + }, + { + "epoch": 1.4815192281559995, + "grad_norm": 0.1390869915485382, + "learning_rate": 8.149205055034653e-05, + "loss": 4.0174, + "step": 21805 + }, + { + "epoch": 1.4818589482266613, + "grad_norm": 0.12950243055820465, + "learning_rate": 8.148780404946324e-05, + "loss": 3.9253, + "step": 21810 + }, + { + "epoch": 1.482198668297323, + "grad_norm": 0.20193003118038177, + "learning_rate": 8.148355754857997e-05, + "loss": 4.0485, + "step": 21815 + }, + { + "epoch": 1.4825383883679848, + "grad_norm": 1.203129768371582, + "learning_rate": 8.147931104769671e-05, + "loss": 3.7355, + "step": 21820 + }, + { + "epoch": 1.4828781084386464, + "grad_norm": 0.2027519792318344, + "learning_rate": 8.147506454681343e-05, + "loss": 4.208, + "step": 21825 + }, + { + "epoch": 1.4832178285093083, + "grad_norm": 0.6469574570655823, + "learning_rate": 8.147081804593015e-05, + "loss": 4.0402, + "step": 21830 + }, + { + "epoch": 1.4835575485799701, + "grad_norm": 3.7862627506256104, + "learning_rate": 8.146657154504688e-05, + "loss": 3.9747, + "step": 21835 + }, + { + "epoch": 1.4838972686506318, + "grad_norm": 0.3205176889896393, + "learning_rate": 8.146232504416361e-05, + "loss": 3.8642, + "step": 21840 + }, + { + "epoch": 1.4842369887212936, + "grad_norm": 0.16341230273246765, + "learning_rate": 8.145807854328034e-05, + "loss": 3.9668, + "step": 21845 + }, + { + "epoch": 1.4845767087919555, + "grad_norm": 0.16365942358970642, + "learning_rate": 8.145383204239707e-05, + "loss": 3.9359, + "step": 21850 + }, + { + "epoch": 1.484916428862617, + "grad_norm": 0.2551836669445038, + "learning_rate": 8.14495855415138e-05, + "loss": 4.0983, + "step": 21855 + }, + { + "epoch": 1.485256148933279, + "grad_norm": 0.18045364320278168, + "learning_rate": 8.144533904063052e-05, + "loss": 3.9366, + "step": 21860 + }, + { + "epoch": 1.4855958690039408, + "grad_norm": 0.18181972205638885, + "learning_rate": 8.144109253974725e-05, + "loss": 4.0676, + "step": 21865 + }, + { + "epoch": 1.4859355890746024, + "grad_norm": 0.19416576623916626, + "learning_rate": 8.143684603886398e-05, + "loss": 3.8397, + "step": 21870 + }, + { + "epoch": 1.4862753091452643, + "grad_norm": 0.16823352873325348, + "learning_rate": 8.14325995379807e-05, + "loss": 3.7101, + "step": 21875 + }, + { + "epoch": 1.4866150292159261, + "grad_norm": 0.2803404927253723, + "learning_rate": 8.142835303709743e-05, + "loss": 3.9152, + "step": 21880 + }, + { + "epoch": 1.4869547492865878, + "grad_norm": 0.19958576560020447, + "learning_rate": 8.142410653621416e-05, + "loss": 3.6583, + "step": 21885 + }, + { + "epoch": 1.4872944693572496, + "grad_norm": 0.21030667424201965, + "learning_rate": 8.141986003533089e-05, + "loss": 3.6892, + "step": 21890 + }, + { + "epoch": 1.4876341894279115, + "grad_norm": 0.4157433807849884, + "learning_rate": 8.141561353444762e-05, + "loss": 3.8998, + "step": 21895 + }, + { + "epoch": 1.487973909498573, + "grad_norm": 0.2651022970676422, + "learning_rate": 8.141136703356435e-05, + "loss": 3.7464, + "step": 21900 + }, + { + "epoch": 1.488313629569235, + "grad_norm": 0.9864382147789001, + "learning_rate": 8.140712053268107e-05, + "loss": 4.0279, + "step": 21905 + }, + { + "epoch": 1.4886533496398968, + "grad_norm": 0.17192131280899048, + "learning_rate": 8.14028740317978e-05, + "loss": 3.5872, + "step": 21910 + }, + { + "epoch": 1.4889930697105584, + "grad_norm": 0.23184671998023987, + "learning_rate": 8.139862753091453e-05, + "loss": 3.8141, + "step": 21915 + }, + { + "epoch": 1.4893327897812203, + "grad_norm": 0.17487375438213348, + "learning_rate": 8.139438103003126e-05, + "loss": 3.8153, + "step": 21920 + }, + { + "epoch": 1.4896725098518822, + "grad_norm": 0.18509329855442047, + "learning_rate": 8.139013452914799e-05, + "loss": 3.6427, + "step": 21925 + }, + { + "epoch": 1.4900122299225438, + "grad_norm": 0.1858031004667282, + "learning_rate": 8.138588802826471e-05, + "loss": 3.9407, + "step": 21930 + }, + { + "epoch": 1.4903519499932056, + "grad_norm": 0.16595908999443054, + "learning_rate": 8.138164152738144e-05, + "loss": 3.8758, + "step": 21935 + }, + { + "epoch": 1.4906916700638675, + "grad_norm": 0.4388197660446167, + "learning_rate": 8.137739502649817e-05, + "loss": 4.0597, + "step": 21940 + }, + { + "epoch": 1.4910313901345291, + "grad_norm": 0.17590069770812988, + "learning_rate": 8.13731485256149e-05, + "loss": 3.8639, + "step": 21945 + }, + { + "epoch": 1.491371110205191, + "grad_norm": 0.1672753542661667, + "learning_rate": 8.136890202473163e-05, + "loss": 3.9801, + "step": 21950 + }, + { + "epoch": 1.4917108302758528, + "grad_norm": 0.1548089236021042, + "learning_rate": 8.136465552384835e-05, + "loss": 3.9075, + "step": 21955 + }, + { + "epoch": 1.4920505503465145, + "grad_norm": 0.17687340080738068, + "learning_rate": 8.136040902296508e-05, + "loss": 3.9456, + "step": 21960 + }, + { + "epoch": 1.4923902704171763, + "grad_norm": 0.1935090273618698, + "learning_rate": 8.135616252208181e-05, + "loss": 3.7437, + "step": 21965 + }, + { + "epoch": 1.492729990487838, + "grad_norm": 0.2118242233991623, + "learning_rate": 8.135191602119854e-05, + "loss": 3.8188, + "step": 21970 + }, + { + "epoch": 1.4930697105584998, + "grad_norm": 0.17160865664482117, + "learning_rate": 8.134766952031527e-05, + "loss": 4.0479, + "step": 21975 + }, + { + "epoch": 1.4934094306291616, + "grad_norm": 0.222242534160614, + "learning_rate": 8.1343423019432e-05, + "loss": 4.0141, + "step": 21980 + }, + { + "epoch": 1.4937491506998233, + "grad_norm": 0.19099248945713043, + "learning_rate": 8.133917651854872e-05, + "loss": 3.644, + "step": 21985 + }, + { + "epoch": 1.4940888707704851, + "grad_norm": 0.15663141012191772, + "learning_rate": 8.133493001766545e-05, + "loss": 3.7937, + "step": 21990 + }, + { + "epoch": 1.4944285908411468, + "grad_norm": 0.1788206398487091, + "learning_rate": 8.133068351678218e-05, + "loss": 3.9206, + "step": 21995 + }, + { + "epoch": 1.4947683109118086, + "grad_norm": 0.3145100176334381, + "learning_rate": 8.13264370158989e-05, + "loss": 3.9613, + "step": 22000 + }, + { + "epoch": 1.4951080309824705, + "grad_norm": 0.19365718960762024, + "learning_rate": 8.132219051501563e-05, + "loss": 4.0367, + "step": 22005 + }, + { + "epoch": 1.495447751053132, + "grad_norm": 0.16117675602436066, + "learning_rate": 8.131794401413236e-05, + "loss": 3.9031, + "step": 22010 + }, + { + "epoch": 1.495787471123794, + "grad_norm": 0.18990375101566315, + "learning_rate": 8.131369751324909e-05, + "loss": 3.9375, + "step": 22015 + }, + { + "epoch": 1.4961271911944558, + "grad_norm": 0.15753480792045593, + "learning_rate": 8.130945101236582e-05, + "loss": 3.9418, + "step": 22020 + }, + { + "epoch": 1.4964669112651174, + "grad_norm": 0.14911150932312012, + "learning_rate": 8.130520451148255e-05, + "loss": 3.8267, + "step": 22025 + }, + { + "epoch": 1.4968066313357793, + "grad_norm": 0.1654355227947235, + "learning_rate": 8.130095801059927e-05, + "loss": 3.8466, + "step": 22030 + }, + { + "epoch": 1.4971463514064411, + "grad_norm": 0.18509690463542938, + "learning_rate": 8.129671150971599e-05, + "loss": 4.1049, + "step": 22035 + }, + { + "epoch": 1.4974860714771028, + "grad_norm": 0.254231333732605, + "learning_rate": 8.129246500883273e-05, + "loss": 3.8883, + "step": 22040 + }, + { + "epoch": 1.4978257915477646, + "grad_norm": 0.17699551582336426, + "learning_rate": 8.128821850794946e-05, + "loss": 3.9302, + "step": 22045 + }, + { + "epoch": 1.4981655116184265, + "grad_norm": 0.15974491834640503, + "learning_rate": 8.128397200706617e-05, + "loss": 3.8923, + "step": 22050 + }, + { + "epoch": 1.4985052316890881, + "grad_norm": 0.1489754468202591, + "learning_rate": 8.127972550618291e-05, + "loss": 3.6922, + "step": 22055 + }, + { + "epoch": 1.49884495175975, + "grad_norm": 0.15716956555843353, + "learning_rate": 8.127547900529964e-05, + "loss": 3.9637, + "step": 22060 + }, + { + "epoch": 1.4991846718304118, + "grad_norm": 0.16464102268218994, + "learning_rate": 8.127123250441636e-05, + "loss": 4.1139, + "step": 22065 + }, + { + "epoch": 1.4995243919010735, + "grad_norm": 0.711264967918396, + "learning_rate": 8.12669860035331e-05, + "loss": 3.7423, + "step": 22070 + }, + { + "epoch": 1.4998641119717353, + "grad_norm": 0.16658902168273926, + "learning_rate": 8.126273950264983e-05, + "loss": 4.0559, + "step": 22075 + }, + { + "epoch": 1.5002038320423972, + "grad_norm": 0.2199849635362625, + "learning_rate": 8.125849300176654e-05, + "loss": 3.9694, + "step": 22080 + }, + { + "epoch": 1.5005435521130588, + "grad_norm": 0.15324361622333527, + "learning_rate": 8.125424650088328e-05, + "loss": 3.7384, + "step": 22085 + }, + { + "epoch": 1.5008832721837206, + "grad_norm": 0.23534561693668365, + "learning_rate": 8.125000000000001e-05, + "loss": 3.8223, + "step": 22090 + }, + { + "epoch": 1.5012229922543825, + "grad_norm": 0.5196974873542786, + "learning_rate": 8.124575349911672e-05, + "loss": 3.8665, + "step": 22095 + }, + { + "epoch": 1.5015627123250441, + "grad_norm": 0.20586290955543518, + "learning_rate": 8.124150699823347e-05, + "loss": 3.7821, + "step": 22100 + }, + { + "epoch": 1.501902432395706, + "grad_norm": 0.23740985989570618, + "learning_rate": 8.12372604973502e-05, + "loss": 3.9502, + "step": 22105 + }, + { + "epoch": 1.5022421524663678, + "grad_norm": 0.17245244979858398, + "learning_rate": 8.123301399646691e-05, + "loss": 3.9457, + "step": 22110 + }, + { + "epoch": 1.5025818725370295, + "grad_norm": 0.20910488069057465, + "learning_rate": 8.122876749558365e-05, + "loss": 3.8946, + "step": 22115 + }, + { + "epoch": 1.5029215926076913, + "grad_norm": 0.14958035945892334, + "learning_rate": 8.122452099470036e-05, + "loss": 3.8352, + "step": 22120 + }, + { + "epoch": 1.5032613126783532, + "grad_norm": 0.16651959717273712, + "learning_rate": 8.122027449381709e-05, + "loss": 3.9115, + "step": 22125 + }, + { + "epoch": 1.5036010327490148, + "grad_norm": 0.17594747245311737, + "learning_rate": 8.121602799293383e-05, + "loss": 3.8693, + "step": 22130 + }, + { + "epoch": 1.5039407528196764, + "grad_norm": 0.22003202140331268, + "learning_rate": 8.121178149205055e-05, + "loss": 3.9589, + "step": 22135 + }, + { + "epoch": 1.5042804728903385, + "grad_norm": 0.23798172175884247, + "learning_rate": 8.120753499116728e-05, + "loss": 3.9015, + "step": 22140 + }, + { + "epoch": 1.5046201929610001, + "grad_norm": 0.44073575735092163, + "learning_rate": 8.120328849028402e-05, + "loss": 4.0646, + "step": 22145 + }, + { + "epoch": 1.5049599130316618, + "grad_norm": 1.7266801595687866, + "learning_rate": 8.119904198940073e-05, + "loss": 4.0155, + "step": 22150 + }, + { + "epoch": 1.5052996331023238, + "grad_norm": 0.17522837221622467, + "learning_rate": 8.119479548851746e-05, + "loss": 3.8606, + "step": 22155 + }, + { + "epoch": 1.5056393531729855, + "grad_norm": 0.309773713350296, + "learning_rate": 8.11905489876342e-05, + "loss": 3.9721, + "step": 22160 + }, + { + "epoch": 1.505979073243647, + "grad_norm": 0.2653729319572449, + "learning_rate": 8.118630248675092e-05, + "loss": 3.9796, + "step": 22165 + }, + { + "epoch": 1.506318793314309, + "grad_norm": 0.18932275474071503, + "learning_rate": 8.118205598586764e-05, + "loss": 3.7778, + "step": 22170 + }, + { + "epoch": 1.5066585133849708, + "grad_norm": 0.20035238564014435, + "learning_rate": 8.117780948498439e-05, + "loss": 3.8119, + "step": 22175 + }, + { + "epoch": 1.5069982334556324, + "grad_norm": 0.1546817421913147, + "learning_rate": 8.11735629841011e-05, + "loss": 3.9432, + "step": 22180 + }, + { + "epoch": 1.5073379535262943, + "grad_norm": 0.16738523542881012, + "learning_rate": 8.116931648321783e-05, + "loss": 4.0707, + "step": 22185 + }, + { + "epoch": 1.5076776735969561, + "grad_norm": 0.20787541568279266, + "learning_rate": 8.116506998233456e-05, + "loss": 3.6642, + "step": 22190 + }, + { + "epoch": 1.5080173936676178, + "grad_norm": 0.23612059652805328, + "learning_rate": 8.116082348145128e-05, + "loss": 3.9457, + "step": 22195 + }, + { + "epoch": 1.5083571137382796, + "grad_norm": 0.17141501605510712, + "learning_rate": 8.115657698056801e-05, + "loss": 3.9473, + "step": 22200 + }, + { + "epoch": 1.5086968338089415, + "grad_norm": 0.6725015640258789, + "learning_rate": 8.115233047968474e-05, + "loss": 4.1258, + "step": 22205 + }, + { + "epoch": 1.5090365538796031, + "grad_norm": 0.1769857406616211, + "learning_rate": 8.114808397880147e-05, + "loss": 3.8573, + "step": 22210 + }, + { + "epoch": 1.509376273950265, + "grad_norm": 0.15409088134765625, + "learning_rate": 8.11438374779182e-05, + "loss": 3.9876, + "step": 22215 + }, + { + "epoch": 1.5097159940209268, + "grad_norm": 0.16695520281791687, + "learning_rate": 8.113959097703492e-05, + "loss": 3.8409, + "step": 22220 + }, + { + "epoch": 1.5100557140915885, + "grad_norm": 0.1558338701725006, + "learning_rate": 8.113534447615165e-05, + "loss": 3.8884, + "step": 22225 + }, + { + "epoch": 1.5103954341622503, + "grad_norm": 0.2463407665491104, + "learning_rate": 8.113109797526838e-05, + "loss": 3.8301, + "step": 22230 + }, + { + "epoch": 1.5107351542329122, + "grad_norm": 0.4138488173484802, + "learning_rate": 8.112685147438511e-05, + "loss": 3.6748, + "step": 22235 + }, + { + "epoch": 1.5110748743035738, + "grad_norm": 0.18995806574821472, + "learning_rate": 8.112260497350184e-05, + "loss": 3.9719, + "step": 22240 + }, + { + "epoch": 1.5114145943742356, + "grad_norm": 1.1736502647399902, + "learning_rate": 8.111835847261856e-05, + "loss": 3.8701, + "step": 22245 + }, + { + "epoch": 1.5117543144448975, + "grad_norm": 0.1819602996110916, + "learning_rate": 8.111411197173529e-05, + "loss": 3.9724, + "step": 22250 + }, + { + "epoch": 1.5120940345155591, + "grad_norm": 0.1881464719772339, + "learning_rate": 8.110986547085202e-05, + "loss": 3.9425, + "step": 22255 + }, + { + "epoch": 1.512433754586221, + "grad_norm": 0.17573055624961853, + "learning_rate": 8.110561896996875e-05, + "loss": 3.8248, + "step": 22260 + }, + { + "epoch": 1.5127734746568828, + "grad_norm": 0.14288708567619324, + "learning_rate": 8.110137246908548e-05, + "loss": 3.9033, + "step": 22265 + }, + { + "epoch": 1.5131131947275445, + "grad_norm": 0.15460045635700226, + "learning_rate": 8.10971259682022e-05, + "loss": 3.9373, + "step": 22270 + }, + { + "epoch": 1.5134529147982063, + "grad_norm": 0.17520169913768768, + "learning_rate": 8.109287946731893e-05, + "loss": 3.7058, + "step": 22275 + }, + { + "epoch": 1.5137926348688682, + "grad_norm": 0.2688046097755432, + "learning_rate": 8.108863296643566e-05, + "loss": 3.7653, + "step": 22280 + }, + { + "epoch": 1.5141323549395298, + "grad_norm": 0.4329849183559418, + "learning_rate": 8.108438646555239e-05, + "loss": 3.9618, + "step": 22285 + }, + { + "epoch": 1.5144720750101917, + "grad_norm": 0.15575814247131348, + "learning_rate": 8.108013996466912e-05, + "loss": 3.9133, + "step": 22290 + }, + { + "epoch": 1.5148117950808535, + "grad_norm": 0.25085195899009705, + "learning_rate": 8.107589346378584e-05, + "loss": 3.9147, + "step": 22295 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 3.459066867828369, + "learning_rate": 8.107164696290257e-05, + "loss": 3.8932, + "step": 22300 + }, + { + "epoch": 1.5154912352221768, + "grad_norm": 0.18564362823963165, + "learning_rate": 8.10674004620193e-05, + "loss": 4.0555, + "step": 22305 + }, + { + "epoch": 1.5158309552928388, + "grad_norm": 0.20934748649597168, + "learning_rate": 8.106315396113603e-05, + "loss": 3.8483, + "step": 22310 + }, + { + "epoch": 1.5161706753635005, + "grad_norm": 0.159783273935318, + "learning_rate": 8.105890746025276e-05, + "loss": 3.8961, + "step": 22315 + }, + { + "epoch": 1.516510395434162, + "grad_norm": 0.5500815510749817, + "learning_rate": 8.105466095936948e-05, + "loss": 3.9609, + "step": 22320 + }, + { + "epoch": 1.5168501155048242, + "grad_norm": 0.6837555766105652, + "learning_rate": 8.105041445848621e-05, + "loss": 3.9005, + "step": 22325 + }, + { + "epoch": 1.5171898355754858, + "grad_norm": 0.27666962146759033, + "learning_rate": 8.104616795760294e-05, + "loss": 3.8588, + "step": 22330 + }, + { + "epoch": 1.5175295556461474, + "grad_norm": 0.14040204882621765, + "learning_rate": 8.104192145671967e-05, + "loss": 3.8473, + "step": 22335 + }, + { + "epoch": 1.5178692757168093, + "grad_norm": 0.20570577681064606, + "learning_rate": 8.10376749558364e-05, + "loss": 3.8824, + "step": 22340 + }, + { + "epoch": 1.5182089957874711, + "grad_norm": 0.17105324566364288, + "learning_rate": 8.103342845495312e-05, + "loss": 3.8309, + "step": 22345 + }, + { + "epoch": 1.5185487158581328, + "grad_norm": 0.5443513989448547, + "learning_rate": 8.102918195406985e-05, + "loss": 3.7526, + "step": 22350 + }, + { + "epoch": 1.5188884359287946, + "grad_norm": 0.16678844392299652, + "learning_rate": 8.102493545318658e-05, + "loss": 3.7768, + "step": 22355 + }, + { + "epoch": 1.5192281559994565, + "grad_norm": 0.1440274566411972, + "learning_rate": 8.102068895230331e-05, + "loss": 3.9555, + "step": 22360 + }, + { + "epoch": 1.5195678760701181, + "grad_norm": 0.3287782371044159, + "learning_rate": 8.101644245142004e-05, + "loss": 3.7823, + "step": 22365 + }, + { + "epoch": 1.51990759614078, + "grad_norm": 0.22004348039627075, + "learning_rate": 8.101219595053676e-05, + "loss": 3.7449, + "step": 22370 + }, + { + "epoch": 1.5202473162114418, + "grad_norm": 0.17569592595100403, + "learning_rate": 8.100794944965349e-05, + "loss": 4.0474, + "step": 22375 + }, + { + "epoch": 1.5205870362821035, + "grad_norm": 0.16899912059307098, + "learning_rate": 8.100370294877022e-05, + "loss": 3.9927, + "step": 22380 + }, + { + "epoch": 1.5209267563527653, + "grad_norm": 0.18638694286346436, + "learning_rate": 8.099945644788695e-05, + "loss": 3.903, + "step": 22385 + }, + { + "epoch": 1.5212664764234272, + "grad_norm": 0.19152046740055084, + "learning_rate": 8.099520994700366e-05, + "loss": 3.9869, + "step": 22390 + }, + { + "epoch": 1.5216061964940888, + "grad_norm": 0.1958150863647461, + "learning_rate": 8.09909634461204e-05, + "loss": 4.0109, + "step": 22395 + }, + { + "epoch": 1.5219459165647506, + "grad_norm": 0.1647811383008957, + "learning_rate": 8.098671694523713e-05, + "loss": 3.9645, + "step": 22400 + }, + { + "epoch": 1.5222856366354125, + "grad_norm": 0.22496366500854492, + "learning_rate": 8.098247044435385e-05, + "loss": 3.9822, + "step": 22405 + }, + { + "epoch": 1.5226253567060741, + "grad_norm": 0.19555050134658813, + "learning_rate": 8.097822394347059e-05, + "loss": 4.1779, + "step": 22410 + }, + { + "epoch": 1.522965076776736, + "grad_norm": 0.15134504437446594, + "learning_rate": 8.097397744258732e-05, + "loss": 3.8983, + "step": 22415 + }, + { + "epoch": 1.5233047968473978, + "grad_norm": 0.17814016342163086, + "learning_rate": 8.096973094170403e-05, + "loss": 3.9419, + "step": 22420 + }, + { + "epoch": 1.5236445169180595, + "grad_norm": 0.1395789235830307, + "learning_rate": 8.096548444082077e-05, + "loss": 3.954, + "step": 22425 + }, + { + "epoch": 1.5239842369887213, + "grad_norm": 0.22126054763793945, + "learning_rate": 8.09612379399375e-05, + "loss": 3.6512, + "step": 22430 + }, + { + "epoch": 1.5243239570593832, + "grad_norm": 0.19779862463474274, + "learning_rate": 8.095699143905422e-05, + "loss": 3.8232, + "step": 22435 + }, + { + "epoch": 1.5246636771300448, + "grad_norm": 0.18976791203022003, + "learning_rate": 8.095274493817096e-05, + "loss": 4.0406, + "step": 22440 + }, + { + "epoch": 1.5250033972007067, + "grad_norm": 0.21518440544605255, + "learning_rate": 8.094849843728768e-05, + "loss": 4.0495, + "step": 22445 + }, + { + "epoch": 1.5253431172713685, + "grad_norm": 0.21706193685531616, + "learning_rate": 8.09442519364044e-05, + "loss": 3.7711, + "step": 22450 + }, + { + "epoch": 1.5256828373420301, + "grad_norm": 0.5312498807907104, + "learning_rate": 8.094000543552114e-05, + "loss": 3.8362, + "step": 22455 + }, + { + "epoch": 1.526022557412692, + "grad_norm": 0.16411226987838745, + "learning_rate": 8.093575893463786e-05, + "loss": 4.081, + "step": 22460 + }, + { + "epoch": 1.5263622774833538, + "grad_norm": 0.15574374794960022, + "learning_rate": 8.093151243375458e-05, + "loss": 3.931, + "step": 22465 + }, + { + "epoch": 1.5267019975540155, + "grad_norm": 0.15604238212108612, + "learning_rate": 8.092726593287132e-05, + "loss": 3.8869, + "step": 22470 + }, + { + "epoch": 1.527041717624677, + "grad_norm": 0.2018536478281021, + "learning_rate": 8.092301943198804e-05, + "loss": 4.1885, + "step": 22475 + }, + { + "epoch": 1.5273814376953392, + "grad_norm": 0.1861005425453186, + "learning_rate": 8.091877293110477e-05, + "loss": 3.9424, + "step": 22480 + }, + { + "epoch": 1.5277211577660008, + "grad_norm": 0.13977813720703125, + "learning_rate": 8.091452643022151e-05, + "loss": 3.9762, + "step": 22485 + }, + { + "epoch": 1.5280608778366624, + "grad_norm": 0.15929043292999268, + "learning_rate": 8.091027992933822e-05, + "loss": 4.1012, + "step": 22490 + }, + { + "epoch": 1.5284005979073245, + "grad_norm": 0.18824906647205353, + "learning_rate": 8.090603342845495e-05, + "loss": 3.9492, + "step": 22495 + }, + { + "epoch": 1.5287403179779862, + "grad_norm": 0.20469623804092407, + "learning_rate": 8.090178692757169e-05, + "loss": 3.9266, + "step": 22500 + }, + { + "epoch": 1.5290800380486478, + "grad_norm": 0.16181810200214386, + "learning_rate": 8.089754042668841e-05, + "loss": 3.7643, + "step": 22505 + }, + { + "epoch": 1.5294197581193096, + "grad_norm": 0.18278400599956512, + "learning_rate": 8.089329392580514e-05, + "loss": 3.7903, + "step": 22510 + }, + { + "epoch": 1.5297594781899715, + "grad_norm": 0.16438040137290955, + "learning_rate": 8.088904742492188e-05, + "loss": 3.6593, + "step": 22515 + }, + { + "epoch": 1.5300991982606331, + "grad_norm": 0.25451090931892395, + "learning_rate": 8.088480092403859e-05, + "loss": 3.5385, + "step": 22520 + }, + { + "epoch": 1.530438918331295, + "grad_norm": 0.16456502676010132, + "learning_rate": 8.088055442315532e-05, + "loss": 4.0585, + "step": 22525 + }, + { + "epoch": 1.5307786384019568, + "grad_norm": 0.31917455792427063, + "learning_rate": 8.087630792227206e-05, + "loss": 3.9106, + "step": 22530 + }, + { + "epoch": 1.5311183584726185, + "grad_norm": 0.17908233404159546, + "learning_rate": 8.087206142138878e-05, + "loss": 4.039, + "step": 22535 + }, + { + "epoch": 1.5314580785432803, + "grad_norm": 0.18773703277111053, + "learning_rate": 8.08678149205055e-05, + "loss": 3.9013, + "step": 22540 + }, + { + "epoch": 1.5317977986139422, + "grad_norm": 0.14723744988441467, + "learning_rate": 8.086356841962223e-05, + "loss": 3.946, + "step": 22545 + }, + { + "epoch": 1.5321375186846038, + "grad_norm": 0.2635185122489929, + "learning_rate": 8.085932191873896e-05, + "loss": 3.8438, + "step": 22550 + }, + { + "epoch": 1.5324772387552656, + "grad_norm": 0.15777957439422607, + "learning_rate": 8.085507541785569e-05, + "loss": 3.9293, + "step": 22555 + }, + { + "epoch": 1.5328169588259275, + "grad_norm": 0.15364667773246765, + "learning_rate": 8.085082891697242e-05, + "loss": 3.9061, + "step": 22560 + }, + { + "epoch": 1.5331566788965891, + "grad_norm": 0.3715648651123047, + "learning_rate": 8.084658241608914e-05, + "loss": 3.7968, + "step": 22565 + }, + { + "epoch": 1.533496398967251, + "grad_norm": 0.1749754101037979, + "learning_rate": 8.084233591520587e-05, + "loss": 3.9718, + "step": 22570 + }, + { + "epoch": 1.5338361190379128, + "grad_norm": 0.6572285294532776, + "learning_rate": 8.08380894143226e-05, + "loss": 3.8534, + "step": 22575 + }, + { + "epoch": 1.5341758391085745, + "grad_norm": 0.44348782300949097, + "learning_rate": 8.083384291343933e-05, + "loss": 3.9551, + "step": 22580 + }, + { + "epoch": 1.5345155591792363, + "grad_norm": 0.2257240265607834, + "learning_rate": 8.082959641255606e-05, + "loss": 4.0271, + "step": 22585 + }, + { + "epoch": 1.5348552792498982, + "grad_norm": 0.25812819600105286, + "learning_rate": 8.082534991167278e-05, + "loss": 4.193, + "step": 22590 + }, + { + "epoch": 1.5351949993205598, + "grad_norm": 0.40430963039398193, + "learning_rate": 8.082110341078951e-05, + "loss": 4.1891, + "step": 22595 + }, + { + "epoch": 1.5355347193912217, + "grad_norm": 9.87376880645752, + "learning_rate": 8.081685690990624e-05, + "loss": 3.7513, + "step": 22600 + }, + { + "epoch": 1.5358744394618835, + "grad_norm": 0.16081424057483673, + "learning_rate": 8.081261040902297e-05, + "loss": 3.9589, + "step": 22605 + }, + { + "epoch": 1.5362141595325451, + "grad_norm": 0.17502115666866302, + "learning_rate": 8.08083639081397e-05, + "loss": 3.8835, + "step": 22610 + }, + { + "epoch": 1.536553879603207, + "grad_norm": 0.1488470584154129, + "learning_rate": 8.080411740725642e-05, + "loss": 4.2128, + "step": 22615 + }, + { + "epoch": 1.5368935996738688, + "grad_norm": 0.16122186183929443, + "learning_rate": 8.079987090637315e-05, + "loss": 3.8305, + "step": 22620 + }, + { + "epoch": 1.5372333197445305, + "grad_norm": 0.1512492299079895, + "learning_rate": 8.079562440548988e-05, + "loss": 3.8493, + "step": 22625 + }, + { + "epoch": 1.5375730398151923, + "grad_norm": 0.2095698118209839, + "learning_rate": 8.079137790460661e-05, + "loss": 3.9078, + "step": 22630 + }, + { + "epoch": 1.5379127598858542, + "grad_norm": 0.17582467198371887, + "learning_rate": 8.078713140372334e-05, + "loss": 4.0295, + "step": 22635 + }, + { + "epoch": 1.5382524799565158, + "grad_norm": 0.1660434454679489, + "learning_rate": 8.078288490284006e-05, + "loss": 3.8114, + "step": 22640 + }, + { + "epoch": 1.5385922000271774, + "grad_norm": 0.3663979470729828, + "learning_rate": 8.077863840195679e-05, + "loss": 3.9335, + "step": 22645 + }, + { + "epoch": 1.5389319200978395, + "grad_norm": 0.17535951733589172, + "learning_rate": 8.077439190107352e-05, + "loss": 3.9079, + "step": 22650 + }, + { + "epoch": 1.5392716401685012, + "grad_norm": 0.18419885635375977, + "learning_rate": 8.077014540019025e-05, + "loss": 3.8868, + "step": 22655 + }, + { + "epoch": 1.5396113602391628, + "grad_norm": 1.9282505512237549, + "learning_rate": 8.076589889930698e-05, + "loss": 3.934, + "step": 22660 + }, + { + "epoch": 1.5399510803098249, + "grad_norm": 0.9594584107398987, + "learning_rate": 8.07616523984237e-05, + "loss": 4.1476, + "step": 22665 + }, + { + "epoch": 1.5402908003804865, + "grad_norm": 0.19354571402072906, + "learning_rate": 8.075740589754043e-05, + "loss": 3.8876, + "step": 22670 + }, + { + "epoch": 1.5406305204511481, + "grad_norm": 0.15643349289894104, + "learning_rate": 8.075315939665716e-05, + "loss": 3.9266, + "step": 22675 + }, + { + "epoch": 1.54097024052181, + "grad_norm": 0.1797167956829071, + "learning_rate": 8.074891289577389e-05, + "loss": 3.8671, + "step": 22680 + }, + { + "epoch": 1.5413099605924718, + "grad_norm": 0.158339723944664, + "learning_rate": 8.074466639489062e-05, + "loss": 3.9677, + "step": 22685 + }, + { + "epoch": 1.5416496806631335, + "grad_norm": 0.20443323254585266, + "learning_rate": 8.074041989400734e-05, + "loss": 3.7935, + "step": 22690 + }, + { + "epoch": 1.5419894007337953, + "grad_norm": 0.16003336012363434, + "learning_rate": 8.073617339312407e-05, + "loss": 3.985, + "step": 22695 + }, + { + "epoch": 1.5423291208044572, + "grad_norm": 0.1840764284133911, + "learning_rate": 8.07319268922408e-05, + "loss": 3.9381, + "step": 22700 + }, + { + "epoch": 1.5426688408751188, + "grad_norm": 0.13099946081638336, + "learning_rate": 8.072768039135753e-05, + "loss": 3.9184, + "step": 22705 + }, + { + "epoch": 1.5430085609457806, + "grad_norm": 0.2026049941778183, + "learning_rate": 8.072343389047426e-05, + "loss": 3.9481, + "step": 22710 + }, + { + "epoch": 1.5433482810164425, + "grad_norm": 0.21639543771743774, + "learning_rate": 8.071918738959098e-05, + "loss": 3.8921, + "step": 22715 + }, + { + "epoch": 1.5436880010871041, + "grad_norm": 0.3877890706062317, + "learning_rate": 8.071494088870771e-05, + "loss": 3.8998, + "step": 22720 + }, + { + "epoch": 1.544027721157766, + "grad_norm": 0.18860408663749695, + "learning_rate": 8.071069438782444e-05, + "loss": 4.0405, + "step": 22725 + }, + { + "epoch": 1.5443674412284278, + "grad_norm": 0.22111770510673523, + "learning_rate": 8.070644788694117e-05, + "loss": 3.947, + "step": 22730 + }, + { + "epoch": 1.5447071612990895, + "grad_norm": 0.32336196303367615, + "learning_rate": 8.07022013860579e-05, + "loss": 3.8188, + "step": 22735 + }, + { + "epoch": 1.5450468813697513, + "grad_norm": 0.17553462088108063, + "learning_rate": 8.069795488517462e-05, + "loss": 3.9761, + "step": 22740 + }, + { + "epoch": 1.5453866014404132, + "grad_norm": 0.17417173087596893, + "learning_rate": 8.069370838429134e-05, + "loss": 3.9678, + "step": 22745 + }, + { + "epoch": 1.5457263215110748, + "grad_norm": 0.18807198107242584, + "learning_rate": 8.068946188340808e-05, + "loss": 4.0894, + "step": 22750 + }, + { + "epoch": 1.5460660415817367, + "grad_norm": 0.1794470250606537, + "learning_rate": 8.068521538252481e-05, + "loss": 3.8719, + "step": 22755 + }, + { + "epoch": 1.5464057616523985, + "grad_norm": 0.17493528127670288, + "learning_rate": 8.068096888164152e-05, + "loss": 3.768, + "step": 22760 + }, + { + "epoch": 1.5467454817230601, + "grad_norm": 0.19775459170341492, + "learning_rate": 8.067672238075826e-05, + "loss": 3.8354, + "step": 22765 + }, + { + "epoch": 1.547085201793722, + "grad_norm": 0.16907206177711487, + "learning_rate": 8.067247587987499e-05, + "loss": 3.8057, + "step": 22770 + }, + { + "epoch": 1.5474249218643839, + "grad_norm": 0.19053472578525543, + "learning_rate": 8.06682293789917e-05, + "loss": 4.0394, + "step": 22775 + }, + { + "epoch": 1.5477646419350455, + "grad_norm": 0.20975057780742645, + "learning_rate": 8.066398287810845e-05, + "loss": 3.8362, + "step": 22780 + }, + { + "epoch": 1.5481043620057073, + "grad_norm": 0.1776856929063797, + "learning_rate": 8.065973637722518e-05, + "loss": 4.0495, + "step": 22785 + }, + { + "epoch": 1.5484440820763692, + "grad_norm": 0.22057822346687317, + "learning_rate": 8.065548987634189e-05, + "loss": 3.9868, + "step": 22790 + }, + { + "epoch": 1.5487838021470308, + "grad_norm": 0.18930856883525848, + "learning_rate": 8.065124337545863e-05, + "loss": 4.0875, + "step": 22795 + }, + { + "epoch": 1.5491235222176927, + "grad_norm": 0.15026040375232697, + "learning_rate": 8.064699687457536e-05, + "loss": 3.7421, + "step": 22800 + }, + { + "epoch": 1.5494632422883545, + "grad_norm": 0.49907609820365906, + "learning_rate": 8.064275037369207e-05, + "loss": 3.6157, + "step": 22805 + }, + { + "epoch": 1.5498029623590162, + "grad_norm": 0.19636879861354828, + "learning_rate": 8.063850387280882e-05, + "loss": 3.798, + "step": 22810 + }, + { + "epoch": 1.5501426824296778, + "grad_norm": 0.17884071171283722, + "learning_rate": 8.063425737192553e-05, + "loss": 4.0631, + "step": 22815 + }, + { + "epoch": 1.5504824025003399, + "grad_norm": 0.14085334539413452, + "learning_rate": 8.063001087104226e-05, + "loss": 3.8156, + "step": 22820 + }, + { + "epoch": 1.5508221225710015, + "grad_norm": 1.7794359922409058, + "learning_rate": 8.0625764370159e-05, + "loss": 3.9262, + "step": 22825 + }, + { + "epoch": 1.5511618426416631, + "grad_norm": 0.17480483651161194, + "learning_rate": 8.062151786927571e-05, + "loss": 4.1028, + "step": 22830 + }, + { + "epoch": 1.5515015627123252, + "grad_norm": 0.18914951384067535, + "learning_rate": 8.061727136839244e-05, + "loss": 3.9678, + "step": 22835 + }, + { + "epoch": 1.5518412827829868, + "grad_norm": 0.20987163484096527, + "learning_rate": 8.061302486750918e-05, + "loss": 3.9491, + "step": 22840 + }, + { + "epoch": 1.5521810028536485, + "grad_norm": 0.17292629182338715, + "learning_rate": 8.06087783666259e-05, + "loss": 3.9795, + "step": 22845 + }, + { + "epoch": 1.5525207229243103, + "grad_norm": 0.473714679479599, + "learning_rate": 8.060453186574263e-05, + "loss": 3.9546, + "step": 22850 + }, + { + "epoch": 1.5528604429949722, + "grad_norm": 0.24531768262386322, + "learning_rate": 8.060028536485937e-05, + "loss": 3.8173, + "step": 22855 + }, + { + "epoch": 1.5532001630656338, + "grad_norm": 0.16557277739048004, + "learning_rate": 8.059603886397608e-05, + "loss": 3.8889, + "step": 22860 + }, + { + "epoch": 1.5535398831362957, + "grad_norm": 0.23514215648174286, + "learning_rate": 8.059179236309281e-05, + "loss": 4.0031, + "step": 22865 + }, + { + "epoch": 1.5538796032069575, + "grad_norm": 0.1967945098876953, + "learning_rate": 8.058754586220955e-05, + "loss": 3.8343, + "step": 22870 + }, + { + "epoch": 1.5542193232776191, + "grad_norm": 0.15545599162578583, + "learning_rate": 8.058329936132627e-05, + "loss": 3.9325, + "step": 22875 + }, + { + "epoch": 1.554559043348281, + "grad_norm": 0.16079239547252655, + "learning_rate": 8.0579052860443e-05, + "loss": 3.9197, + "step": 22880 + }, + { + "epoch": 1.5548987634189428, + "grad_norm": 0.16385380923748016, + "learning_rate": 8.057480635955972e-05, + "loss": 3.9906, + "step": 22885 + }, + { + "epoch": 1.5552384834896045, + "grad_norm": 16.872812271118164, + "learning_rate": 8.057055985867645e-05, + "loss": 3.9292, + "step": 22890 + }, + { + "epoch": 1.5555782035602663, + "grad_norm": 0.23831741511821747, + "learning_rate": 8.056631335779318e-05, + "loss": 4.0138, + "step": 22895 + }, + { + "epoch": 1.5559179236309282, + "grad_norm": 0.3889058530330658, + "learning_rate": 8.05620668569099e-05, + "loss": 3.8109, + "step": 22900 + }, + { + "epoch": 1.5562576437015898, + "grad_norm": 0.13758012652397156, + "learning_rate": 8.055782035602663e-05, + "loss": 3.8772, + "step": 22905 + }, + { + "epoch": 1.5565973637722517, + "grad_norm": 0.1661239117383957, + "learning_rate": 8.055357385514336e-05, + "loss": 3.9328, + "step": 22910 + }, + { + "epoch": 1.5569370838429135, + "grad_norm": 0.15955190360546112, + "learning_rate": 8.054932735426009e-05, + "loss": 3.993, + "step": 22915 + }, + { + "epoch": 1.5572768039135751, + "grad_norm": 0.18747274577617645, + "learning_rate": 8.054508085337682e-05, + "loss": 3.7417, + "step": 22920 + }, + { + "epoch": 1.557616523984237, + "grad_norm": 0.17621882259845734, + "learning_rate": 8.054083435249355e-05, + "loss": 4.1038, + "step": 22925 + }, + { + "epoch": 1.5579562440548989, + "grad_norm": 0.22389636933803558, + "learning_rate": 8.053658785161027e-05, + "loss": 3.7762, + "step": 22930 + }, + { + "epoch": 1.5582959641255605, + "grad_norm": 0.24367506802082062, + "learning_rate": 8.0532341350727e-05, + "loss": 3.7608, + "step": 22935 + }, + { + "epoch": 1.5586356841962223, + "grad_norm": 0.13922227919101715, + "learning_rate": 8.052809484984373e-05, + "loss": 3.9677, + "step": 22940 + }, + { + "epoch": 1.5589754042668842, + "grad_norm": 0.16902963817119598, + "learning_rate": 8.052384834896046e-05, + "loss": 3.9052, + "step": 22945 + }, + { + "epoch": 1.5593151243375458, + "grad_norm": 0.7652144432067871, + "learning_rate": 8.051960184807719e-05, + "loss": 3.9379, + "step": 22950 + }, + { + "epoch": 1.5596548444082077, + "grad_norm": 0.17719703912734985, + "learning_rate": 8.051535534719393e-05, + "loss": 4.007, + "step": 22955 + }, + { + "epoch": 1.5599945644788695, + "grad_norm": 0.1798882782459259, + "learning_rate": 8.051110884631064e-05, + "loss": 3.8501, + "step": 22960 + }, + { + "epoch": 1.5603342845495312, + "grad_norm": 0.3087379038333893, + "learning_rate": 8.050686234542737e-05, + "loss": 3.976, + "step": 22965 + }, + { + "epoch": 1.560674004620193, + "grad_norm": 0.1989341378211975, + "learning_rate": 8.05026158445441e-05, + "loss": 4.1056, + "step": 22970 + }, + { + "epoch": 1.5610137246908549, + "grad_norm": 0.2823804020881653, + "learning_rate": 8.049836934366083e-05, + "loss": 3.8743, + "step": 22975 + }, + { + "epoch": 1.5613534447615165, + "grad_norm": 0.18499642610549927, + "learning_rate": 8.049412284277755e-05, + "loss": 3.6749, + "step": 22980 + }, + { + "epoch": 1.5616931648321781, + "grad_norm": 0.21138516068458557, + "learning_rate": 8.048987634189428e-05, + "loss": 3.9047, + "step": 22985 + }, + { + "epoch": 1.5620328849028402, + "grad_norm": 0.17963111400604248, + "learning_rate": 8.048562984101101e-05, + "loss": 4.2099, + "step": 22990 + }, + { + "epoch": 1.5623726049735018, + "grad_norm": 1.407643437385559, + "learning_rate": 8.048138334012774e-05, + "loss": 3.9895, + "step": 22995 + }, + { + "epoch": 1.5627123250441635, + "grad_norm": 0.1689721792936325, + "learning_rate": 8.047713683924447e-05, + "loss": 3.7905, + "step": 23000 + }, + { + "epoch": 1.5630520451148255, + "grad_norm": 0.17341989278793335, + "learning_rate": 8.04728903383612e-05, + "loss": 3.8096, + "step": 23005 + }, + { + "epoch": 1.5633917651854872, + "grad_norm": 0.16194278001785278, + "learning_rate": 8.046864383747792e-05, + "loss": 3.7436, + "step": 23010 + }, + { + "epoch": 1.5637314852561488, + "grad_norm": 0.18519973754882812, + "learning_rate": 8.046439733659465e-05, + "loss": 4.1771, + "step": 23015 + }, + { + "epoch": 1.5640712053268107, + "grad_norm": 0.17853236198425293, + "learning_rate": 8.046015083571138e-05, + "loss": 3.9996, + "step": 23020 + }, + { + "epoch": 1.5644109253974725, + "grad_norm": 0.18208886682987213, + "learning_rate": 8.04559043348281e-05, + "loss": 3.7302, + "step": 23025 + }, + { + "epoch": 1.5647506454681341, + "grad_norm": 0.4673936665058136, + "learning_rate": 8.045165783394483e-05, + "loss": 3.7828, + "step": 23030 + }, + { + "epoch": 1.565090365538796, + "grad_norm": 0.28109011054039, + "learning_rate": 8.044741133306156e-05, + "loss": 4.2147, + "step": 23035 + }, + { + "epoch": 1.5654300856094578, + "grad_norm": 0.28432148694992065, + "learning_rate": 8.044316483217829e-05, + "loss": 3.8292, + "step": 23040 + }, + { + "epoch": 1.5657698056801195, + "grad_norm": 0.17697305977344513, + "learning_rate": 8.043976763147167e-05, + "loss": 3.946, + "step": 23045 + }, + { + "epoch": 1.5661095257507813, + "grad_norm": 0.1723969578742981, + "learning_rate": 8.04355211305884e-05, + "loss": 3.9556, + "step": 23050 + }, + { + "epoch": 1.5664492458214432, + "grad_norm": 0.22159208357334137, + "learning_rate": 8.043127462970513e-05, + "loss": 3.7769, + "step": 23055 + }, + { + "epoch": 1.5667889658921048, + "grad_norm": 0.13616728782653809, + "learning_rate": 8.042702812882186e-05, + "loss": 4.0244, + "step": 23060 + }, + { + "epoch": 1.5671286859627667, + "grad_norm": 0.25459590554237366, + "learning_rate": 8.042278162793859e-05, + "loss": 4.1792, + "step": 23065 + }, + { + "epoch": 1.5674684060334285, + "grad_norm": 0.16975153982639313, + "learning_rate": 8.04185351270553e-05, + "loss": 4.1502, + "step": 23070 + }, + { + "epoch": 1.5678081261040901, + "grad_norm": 0.18457776308059692, + "learning_rate": 8.041428862617204e-05, + "loss": 3.9969, + "step": 23075 + }, + { + "epoch": 1.568147846174752, + "grad_norm": 0.18233300745487213, + "learning_rate": 8.041004212528877e-05, + "loss": 3.9536, + "step": 23080 + }, + { + "epoch": 1.5684875662454139, + "grad_norm": 0.20997940003871918, + "learning_rate": 8.040579562440548e-05, + "loss": 3.8661, + "step": 23085 + }, + { + "epoch": 1.5688272863160755, + "grad_norm": 0.19371215999126434, + "learning_rate": 8.040154912352223e-05, + "loss": 3.9629, + "step": 23090 + }, + { + "epoch": 1.5691670063867373, + "grad_norm": 0.19901804625988007, + "learning_rate": 8.039730262263895e-05, + "loss": 4.0105, + "step": 23095 + }, + { + "epoch": 1.5695067264573992, + "grad_norm": 0.17610082030296326, + "learning_rate": 8.039305612175567e-05, + "loss": 4.0023, + "step": 23100 + }, + { + "epoch": 1.5698464465280608, + "grad_norm": 0.25788018107414246, + "learning_rate": 8.038880962087241e-05, + "loss": 4.1077, + "step": 23105 + }, + { + "epoch": 1.5701861665987227, + "grad_norm": 0.1607256680727005, + "learning_rate": 8.038456311998914e-05, + "loss": 3.8022, + "step": 23110 + }, + { + "epoch": 1.5705258866693845, + "grad_norm": 0.20446252822875977, + "learning_rate": 8.038031661910585e-05, + "loss": 4.056, + "step": 23115 + }, + { + "epoch": 1.5708656067400462, + "grad_norm": 0.14942526817321777, + "learning_rate": 8.037607011822259e-05, + "loss": 3.8155, + "step": 23120 + }, + { + "epoch": 1.571205326810708, + "grad_norm": 0.1358565390110016, + "learning_rate": 8.037182361733932e-05, + "loss": 3.9761, + "step": 23125 + }, + { + "epoch": 1.5715450468813699, + "grad_norm": 0.1594834327697754, + "learning_rate": 8.036757711645604e-05, + "loss": 4.0048, + "step": 23130 + }, + { + "epoch": 1.5718847669520315, + "grad_norm": 0.14781281352043152, + "learning_rate": 8.036333061557278e-05, + "loss": 4.0448, + "step": 23135 + }, + { + "epoch": 1.5722244870226934, + "grad_norm": 0.17380239069461823, + "learning_rate": 8.035908411468949e-05, + "loss": 3.9664, + "step": 23140 + }, + { + "epoch": 1.5725642070933552, + "grad_norm": 0.16010525822639465, + "learning_rate": 8.035483761380622e-05, + "loss": 3.9684, + "step": 23145 + }, + { + "epoch": 1.5729039271640168, + "grad_norm": 0.17489488422870636, + "learning_rate": 8.035059111292296e-05, + "loss": 4.0137, + "step": 23150 + }, + { + "epoch": 1.5732436472346785, + "grad_norm": 0.22654464840888977, + "learning_rate": 8.034634461203968e-05, + "loss": 4.0658, + "step": 23155 + }, + { + "epoch": 1.5735833673053405, + "grad_norm": 0.15615729987621307, + "learning_rate": 8.034209811115642e-05, + "loss": 3.9144, + "step": 23160 + }, + { + "epoch": 1.5739230873760022, + "grad_norm": 0.14379781484603882, + "learning_rate": 8.033785161027315e-05, + "loss": 4.0182, + "step": 23165 + }, + { + "epoch": 1.5742628074466638, + "grad_norm": 0.3476228415966034, + "learning_rate": 8.033360510938986e-05, + "loss": 4.0271, + "step": 23170 + }, + { + "epoch": 1.5746025275173259, + "grad_norm": 0.4062865078449249, + "learning_rate": 8.03293586085066e-05, + "loss": 4.0889, + "step": 23175 + }, + { + "epoch": 1.5749422475879875, + "grad_norm": 0.19427351653575897, + "learning_rate": 8.032511210762333e-05, + "loss": 4.0721, + "step": 23180 + }, + { + "epoch": 1.5752819676586491, + "grad_norm": 0.17874784767627716, + "learning_rate": 8.032086560674004e-05, + "loss": 3.9605, + "step": 23185 + }, + { + "epoch": 1.575621687729311, + "grad_norm": 0.22739998996257782, + "learning_rate": 8.031661910585679e-05, + "loss": 3.9598, + "step": 23190 + }, + { + "epoch": 1.5759614077999728, + "grad_norm": 0.17597122490406036, + "learning_rate": 8.031237260497351e-05, + "loss": 4.213, + "step": 23195 + }, + { + "epoch": 1.5763011278706345, + "grad_norm": 0.17736169695854187, + "learning_rate": 8.030812610409023e-05, + "loss": 3.5892, + "step": 23200 + }, + { + "epoch": 1.5766408479412963, + "grad_norm": 0.19841057062149048, + "learning_rate": 8.030387960320697e-05, + "loss": 3.9962, + "step": 23205 + }, + { + "epoch": 1.5769805680119582, + "grad_norm": 0.30547213554382324, + "learning_rate": 8.029963310232368e-05, + "loss": 3.8256, + "step": 23210 + }, + { + "epoch": 1.5773202880826198, + "grad_norm": 0.18477849662303925, + "learning_rate": 8.029538660144041e-05, + "loss": 3.8495, + "step": 23215 + }, + { + "epoch": 1.5776600081532817, + "grad_norm": 0.28849464654922485, + "learning_rate": 8.029114010055715e-05, + "loss": 3.7811, + "step": 23220 + }, + { + "epoch": 1.5779997282239435, + "grad_norm": 0.24161672592163086, + "learning_rate": 8.028689359967387e-05, + "loss": 3.992, + "step": 23225 + }, + { + "epoch": 1.5783394482946052, + "grad_norm": 0.26209139823913574, + "learning_rate": 8.02826470987906e-05, + "loss": 3.7144, + "step": 23230 + }, + { + "epoch": 1.578679168365267, + "grad_norm": 0.1968594640493393, + "learning_rate": 8.027840059790734e-05, + "loss": 4.143, + "step": 23235 + }, + { + "epoch": 1.5790188884359289, + "grad_norm": 0.23589549958705902, + "learning_rate": 8.027415409702405e-05, + "loss": 3.7104, + "step": 23240 + }, + { + "epoch": 1.5793586085065905, + "grad_norm": 0.2005378156900406, + "learning_rate": 8.026990759614078e-05, + "loss": 3.9257, + "step": 23245 + }, + { + "epoch": 1.5796983285772523, + "grad_norm": 0.16991844773292542, + "learning_rate": 8.026566109525752e-05, + "loss": 3.9008, + "step": 23250 + }, + { + "epoch": 1.5800380486479142, + "grad_norm": 0.17162741720676422, + "learning_rate": 8.026141459437424e-05, + "loss": 3.9529, + "step": 23255 + }, + { + "epoch": 1.5803777687185758, + "grad_norm": 0.17957571148872375, + "learning_rate": 8.025716809349096e-05, + "loss": 3.9721, + "step": 23260 + }, + { + "epoch": 1.5807174887892377, + "grad_norm": 0.26333001255989075, + "learning_rate": 8.02529215926077e-05, + "loss": 3.9445, + "step": 23265 + }, + { + "epoch": 1.5810572088598995, + "grad_norm": 0.17143364250659943, + "learning_rate": 8.024867509172442e-05, + "loss": 4.0157, + "step": 23270 + }, + { + "epoch": 1.5813969289305612, + "grad_norm": 0.2055358737707138, + "learning_rate": 8.024442859084115e-05, + "loss": 4.0401, + "step": 23275 + }, + { + "epoch": 1.581736649001223, + "grad_norm": 0.18542641401290894, + "learning_rate": 8.024018208995789e-05, + "loss": 4.118, + "step": 23280 + }, + { + "epoch": 1.5820763690718849, + "grad_norm": 0.19009174406528473, + "learning_rate": 8.02359355890746e-05, + "loss": 3.9653, + "step": 23285 + }, + { + "epoch": 1.5824160891425465, + "grad_norm": 0.2027108073234558, + "learning_rate": 8.023168908819133e-05, + "loss": 3.4976, + "step": 23290 + }, + { + "epoch": 1.5827558092132084, + "grad_norm": 0.2694283127784729, + "learning_rate": 8.022744258730806e-05, + "loss": 3.8524, + "step": 23295 + }, + { + "epoch": 1.5830955292838702, + "grad_norm": 0.20329399406909943, + "learning_rate": 8.022319608642479e-05, + "loss": 3.9989, + "step": 23300 + }, + { + "epoch": 1.5834352493545318, + "grad_norm": 0.2255329042673111, + "learning_rate": 8.021894958554152e-05, + "loss": 3.9081, + "step": 23305 + }, + { + "epoch": 1.5837749694251937, + "grad_norm": 0.18434260785579681, + "learning_rate": 8.021470308465824e-05, + "loss": 3.7723, + "step": 23310 + }, + { + "epoch": 1.5841146894958555, + "grad_norm": 0.14204613864421844, + "learning_rate": 8.021045658377497e-05, + "loss": 3.6732, + "step": 23315 + }, + { + "epoch": 1.5844544095665172, + "grad_norm": 0.18785853683948517, + "learning_rate": 8.02062100828917e-05, + "loss": 3.6313, + "step": 23320 + }, + { + "epoch": 1.5847941296371788, + "grad_norm": 0.1770613044500351, + "learning_rate": 8.020196358200843e-05, + "loss": 3.8728, + "step": 23325 + }, + { + "epoch": 1.5851338497078409, + "grad_norm": 0.17172285914421082, + "learning_rate": 8.019771708112516e-05, + "loss": 3.8851, + "step": 23330 + }, + { + "epoch": 1.5854735697785025, + "grad_norm": 0.16711339354515076, + "learning_rate": 8.019347058024188e-05, + "loss": 3.9626, + "step": 23335 + }, + { + "epoch": 1.5858132898491641, + "grad_norm": 0.20618294179439545, + "learning_rate": 8.018922407935861e-05, + "loss": 3.9173, + "step": 23340 + }, + { + "epoch": 1.5861530099198262, + "grad_norm": 0.5611276030540466, + "learning_rate": 8.018497757847534e-05, + "loss": 3.9355, + "step": 23345 + }, + { + "epoch": 1.5864927299904878, + "grad_norm": 0.21194879710674286, + "learning_rate": 8.018073107759207e-05, + "loss": 4.2222, + "step": 23350 + }, + { + "epoch": 1.5868324500611495, + "grad_norm": 0.22641108930110931, + "learning_rate": 8.01764845767088e-05, + "loss": 3.8358, + "step": 23355 + }, + { + "epoch": 1.5871721701318113, + "grad_norm": 0.2807486057281494, + "learning_rate": 8.017223807582552e-05, + "loss": 3.7823, + "step": 23360 + }, + { + "epoch": 1.5875118902024732, + "grad_norm": 0.16357910633087158, + "learning_rate": 8.016799157494225e-05, + "loss": 4.0024, + "step": 23365 + }, + { + "epoch": 1.5878516102731348, + "grad_norm": 0.15183256566524506, + "learning_rate": 8.016374507405898e-05, + "loss": 4.0557, + "step": 23370 + }, + { + "epoch": 1.5881913303437967, + "grad_norm": 0.17596930265426636, + "learning_rate": 8.015949857317571e-05, + "loss": 3.8282, + "step": 23375 + }, + { + "epoch": 1.5885310504144585, + "grad_norm": 0.5453233122825623, + "learning_rate": 8.015525207229244e-05, + "loss": 3.8898, + "step": 23380 + }, + { + "epoch": 1.5888707704851202, + "grad_norm": 0.1968008577823639, + "learning_rate": 8.015100557140916e-05, + "loss": 4.0656, + "step": 23385 + }, + { + "epoch": 1.589210490555782, + "grad_norm": 0.2501211166381836, + "learning_rate": 8.014675907052589e-05, + "loss": 3.8221, + "step": 23390 + }, + { + "epoch": 1.5895502106264439, + "grad_norm": 0.16039659082889557, + "learning_rate": 8.014251256964262e-05, + "loss": 3.9878, + "step": 23395 + }, + { + "epoch": 1.5898899306971055, + "grad_norm": 0.2282872349023819, + "learning_rate": 8.013826606875935e-05, + "loss": 3.8139, + "step": 23400 + }, + { + "epoch": 1.5902296507677673, + "grad_norm": 0.1805654764175415, + "learning_rate": 8.013401956787608e-05, + "loss": 4.0603, + "step": 23405 + }, + { + "epoch": 1.5905693708384292, + "grad_norm": 0.17187590897083282, + "learning_rate": 8.012977306699279e-05, + "loss": 4.0792, + "step": 23410 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.1828698366880417, + "learning_rate": 8.012552656610953e-05, + "loss": 3.9207, + "step": 23415 + }, + { + "epoch": 1.5912488109797527, + "grad_norm": 0.16974280774593353, + "learning_rate": 8.012128006522626e-05, + "loss": 3.732, + "step": 23420 + }, + { + "epoch": 1.5915885310504145, + "grad_norm": 0.20831763744354248, + "learning_rate": 8.011703356434297e-05, + "loss": 4.0383, + "step": 23425 + }, + { + "epoch": 1.5919282511210762, + "grad_norm": 0.22434915602207184, + "learning_rate": 8.011278706345972e-05, + "loss": 3.879, + "step": 23430 + }, + { + "epoch": 1.592267971191738, + "grad_norm": 0.2299998551607132, + "learning_rate": 8.010854056257644e-05, + "loss": 3.661, + "step": 23435 + }, + { + "epoch": 1.5926076912623999, + "grad_norm": 0.19445201754570007, + "learning_rate": 8.010429406169316e-05, + "loss": 3.5565, + "step": 23440 + }, + { + "epoch": 1.5929474113330615, + "grad_norm": 0.5646089911460876, + "learning_rate": 8.01000475608099e-05, + "loss": 3.7393, + "step": 23445 + }, + { + "epoch": 1.5932871314037234, + "grad_norm": 0.17847034335136414, + "learning_rate": 8.009580105992663e-05, + "loss": 4.1388, + "step": 23450 + }, + { + "epoch": 1.5936268514743852, + "grad_norm": 0.18931686878204346, + "learning_rate": 8.009155455904334e-05, + "loss": 3.9139, + "step": 23455 + }, + { + "epoch": 1.5939665715450468, + "grad_norm": 0.19429105520248413, + "learning_rate": 8.008730805816008e-05, + "loss": 4.0856, + "step": 23460 + }, + { + "epoch": 1.5943062916157087, + "grad_norm": 0.48647037148475647, + "learning_rate": 8.008306155727681e-05, + "loss": 3.7857, + "step": 23465 + }, + { + "epoch": 1.5946460116863705, + "grad_norm": 0.24710795283317566, + "learning_rate": 8.007881505639353e-05, + "loss": 4.1288, + "step": 23470 + }, + { + "epoch": 1.5949857317570322, + "grad_norm": 0.1836954951286316, + "learning_rate": 8.007456855551027e-05, + "loss": 3.4876, + "step": 23475 + }, + { + "epoch": 1.595325451827694, + "grad_norm": 0.1482127457857132, + "learning_rate": 8.0070322054627e-05, + "loss": 3.9242, + "step": 23480 + }, + { + "epoch": 1.5956651718983559, + "grad_norm": 0.17236825823783875, + "learning_rate": 8.006607555374371e-05, + "loss": 3.7389, + "step": 23485 + }, + { + "epoch": 1.5960048919690175, + "grad_norm": 0.6602354049682617, + "learning_rate": 8.006182905286045e-05, + "loss": 3.6386, + "step": 23490 + }, + { + "epoch": 1.5963446120396791, + "grad_norm": 0.21019037067890167, + "learning_rate": 8.005758255197717e-05, + "loss": 3.8177, + "step": 23495 + }, + { + "epoch": 1.5966843321103412, + "grad_norm": 0.1805453896522522, + "learning_rate": 8.005333605109391e-05, + "loss": 3.9186, + "step": 23500 + }, + { + "epoch": 1.5970240521810029, + "grad_norm": 0.19113457202911377, + "learning_rate": 8.004908955021064e-05, + "loss": 3.6405, + "step": 23505 + }, + { + "epoch": 1.5973637722516645, + "grad_norm": 0.17619752883911133, + "learning_rate": 8.004484304932735e-05, + "loss": 3.8467, + "step": 23510 + }, + { + "epoch": 1.5977034923223266, + "grad_norm": 0.17658133804798126, + "learning_rate": 8.004059654844409e-05, + "loss": 4.119, + "step": 23515 + }, + { + "epoch": 1.5980432123929882, + "grad_norm": 0.18577221035957336, + "learning_rate": 8.003635004756082e-05, + "loss": 3.976, + "step": 23520 + }, + { + "epoch": 1.5983829324636498, + "grad_norm": 0.1644519716501236, + "learning_rate": 8.003210354667753e-05, + "loss": 4.0577, + "step": 23525 + }, + { + "epoch": 1.5987226525343117, + "grad_norm": 0.15924717485904694, + "learning_rate": 8.002785704579428e-05, + "loss": 3.8063, + "step": 23530 + }, + { + "epoch": 1.5990623726049735, + "grad_norm": 0.19600291550159454, + "learning_rate": 8.0023610544911e-05, + "loss": 3.8024, + "step": 23535 + }, + { + "epoch": 1.5994020926756352, + "grad_norm": 0.20023486018180847, + "learning_rate": 8.001936404402772e-05, + "loss": 4.0346, + "step": 23540 + }, + { + "epoch": 1.599741812746297, + "grad_norm": 0.18615978956222534, + "learning_rate": 8.001511754314446e-05, + "loss": 3.7356, + "step": 23545 + }, + { + "epoch": 1.6000815328169589, + "grad_norm": 0.24618953466415405, + "learning_rate": 8.001087104226119e-05, + "loss": 4.0322, + "step": 23550 + }, + { + "epoch": 1.6004212528876205, + "grad_norm": 0.15581165254116058, + "learning_rate": 8.00066245413779e-05, + "loss": 3.9442, + "step": 23555 + }, + { + "epoch": 1.6007609729582823, + "grad_norm": 0.42651036381721497, + "learning_rate": 8.000237804049464e-05, + "loss": 4.0087, + "step": 23560 + }, + { + "epoch": 1.6011006930289442, + "grad_norm": 0.2290574163198471, + "learning_rate": 7.999813153961136e-05, + "loss": 3.8357, + "step": 23565 + }, + { + "epoch": 1.6014404130996058, + "grad_norm": 0.21513143181800842, + "learning_rate": 7.999388503872809e-05, + "loss": 3.9526, + "step": 23570 + }, + { + "epoch": 1.6017801331702677, + "grad_norm": 0.3907299041748047, + "learning_rate": 7.998963853784483e-05, + "loss": 3.8381, + "step": 23575 + }, + { + "epoch": 1.6021198532409295, + "grad_norm": 0.34273141622543335, + "learning_rate": 7.998539203696154e-05, + "loss": 3.8453, + "step": 23580 + }, + { + "epoch": 1.6024595733115912, + "grad_norm": 0.2489028424024582, + "learning_rate": 7.998114553607827e-05, + "loss": 4.1931, + "step": 23585 + }, + { + "epoch": 1.602799293382253, + "grad_norm": 0.22349414229393005, + "learning_rate": 7.997689903519501e-05, + "loss": 4.0275, + "step": 23590 + }, + { + "epoch": 1.6031390134529149, + "grad_norm": 0.16843025386333466, + "learning_rate": 7.997265253431173e-05, + "loss": 3.8378, + "step": 23595 + }, + { + "epoch": 1.6034787335235765, + "grad_norm": 0.19994820654392242, + "learning_rate": 7.996840603342845e-05, + "loss": 4.0602, + "step": 23600 + }, + { + "epoch": 1.6038184535942384, + "grad_norm": 0.17509299516677856, + "learning_rate": 7.99641595325452e-05, + "loss": 4.0028, + "step": 23605 + }, + { + "epoch": 1.6041581736649002, + "grad_norm": 0.1686173528432846, + "learning_rate": 7.995991303166191e-05, + "loss": 3.8987, + "step": 23610 + }, + { + "epoch": 1.6044978937355618, + "grad_norm": 0.255532443523407, + "learning_rate": 7.995566653077864e-05, + "loss": 3.6604, + "step": 23615 + }, + { + "epoch": 1.6048376138062237, + "grad_norm": 0.25401827692985535, + "learning_rate": 7.995142002989538e-05, + "loss": 3.8695, + "step": 23620 + }, + { + "epoch": 1.6051773338768855, + "grad_norm": 0.17523211240768433, + "learning_rate": 7.99471735290121e-05, + "loss": 3.9903, + "step": 23625 + }, + { + "epoch": 1.6055170539475472, + "grad_norm": 0.1498420536518097, + "learning_rate": 7.994292702812882e-05, + "loss": 3.9668, + "step": 23630 + }, + { + "epoch": 1.605856774018209, + "grad_norm": 0.1798330545425415, + "learning_rate": 7.993868052724555e-05, + "loss": 3.8933, + "step": 23635 + }, + { + "epoch": 1.6061964940888709, + "grad_norm": 0.1712253838777542, + "learning_rate": 7.993443402636228e-05, + "loss": 4.0005, + "step": 23640 + }, + { + "epoch": 1.6065362141595325, + "grad_norm": 0.2947317957878113, + "learning_rate": 7.993018752547901e-05, + "loss": 3.7437, + "step": 23645 + }, + { + "epoch": 1.6068759342301944, + "grad_norm": 0.1601276844739914, + "learning_rate": 7.992594102459573e-05, + "loss": 4.0169, + "step": 23650 + }, + { + "epoch": 1.6072156543008562, + "grad_norm": 0.1736849993467331, + "learning_rate": 7.992169452371246e-05, + "loss": 4.1346, + "step": 23655 + }, + { + "epoch": 1.6075553743715179, + "grad_norm": 0.1532243937253952, + "learning_rate": 7.991744802282919e-05, + "loss": 3.9625, + "step": 23660 + }, + { + "epoch": 1.6078950944421795, + "grad_norm": 0.1759926825761795, + "learning_rate": 7.991320152194592e-05, + "loss": 3.7148, + "step": 23665 + }, + { + "epoch": 1.6082348145128416, + "grad_norm": 0.19011087715625763, + "learning_rate": 7.990895502106265e-05, + "loss": 3.7389, + "step": 23670 + }, + { + "epoch": 1.6085745345835032, + "grad_norm": 0.18898816406726837, + "learning_rate": 7.990470852017937e-05, + "loss": 3.8858, + "step": 23675 + }, + { + "epoch": 1.6089142546541648, + "grad_norm": 0.2232590764760971, + "learning_rate": 7.99004620192961e-05, + "loss": 3.8751, + "step": 23680 + }, + { + "epoch": 1.609253974724827, + "grad_norm": 0.16559503972530365, + "learning_rate": 7.989621551841283e-05, + "loss": 3.9654, + "step": 23685 + }, + { + "epoch": 1.6095936947954885, + "grad_norm": 0.2104642242193222, + "learning_rate": 7.989196901752956e-05, + "loss": 3.9678, + "step": 23690 + }, + { + "epoch": 1.6099334148661502, + "grad_norm": 0.6857727766036987, + "learning_rate": 7.988772251664629e-05, + "loss": 3.6484, + "step": 23695 + }, + { + "epoch": 1.610273134936812, + "grad_norm": 0.2108204960823059, + "learning_rate": 7.988347601576301e-05, + "loss": 4.0503, + "step": 23700 + }, + { + "epoch": 1.6106128550074739, + "grad_norm": 0.15169568359851837, + "learning_rate": 7.987922951487974e-05, + "loss": 3.9232, + "step": 23705 + }, + { + "epoch": 1.6109525750781355, + "grad_norm": 0.17591162025928497, + "learning_rate": 7.987498301399647e-05, + "loss": 4.0599, + "step": 23710 + }, + { + "epoch": 1.6112922951487973, + "grad_norm": 0.1601082980632782, + "learning_rate": 7.98707365131132e-05, + "loss": 3.9266, + "step": 23715 + }, + { + "epoch": 1.6116320152194592, + "grad_norm": 0.17098252475261688, + "learning_rate": 7.986649001222993e-05, + "loss": 4.1028, + "step": 23720 + }, + { + "epoch": 1.6119717352901208, + "grad_norm": 0.15679214894771576, + "learning_rate": 7.986224351134665e-05, + "loss": 3.8743, + "step": 23725 + }, + { + "epoch": 1.6123114553607827, + "grad_norm": 0.501569390296936, + "learning_rate": 7.985799701046338e-05, + "loss": 4.1861, + "step": 23730 + }, + { + "epoch": 1.6126511754314445, + "grad_norm": 0.22477468848228455, + "learning_rate": 7.985375050958011e-05, + "loss": 3.9906, + "step": 23735 + }, + { + "epoch": 1.6129908955021062, + "grad_norm": 0.20664003491401672, + "learning_rate": 7.984950400869684e-05, + "loss": 3.713, + "step": 23740 + }, + { + "epoch": 1.613330615572768, + "grad_norm": 0.19422908127307892, + "learning_rate": 7.984525750781357e-05, + "loss": 3.9289, + "step": 23745 + }, + { + "epoch": 1.6136703356434299, + "grad_norm": 1.1227723360061646, + "learning_rate": 7.98410110069303e-05, + "loss": 3.9712, + "step": 23750 + }, + { + "epoch": 1.6140100557140915, + "grad_norm": 0.2974293529987335, + "learning_rate": 7.983676450604702e-05, + "loss": 4.0278, + "step": 23755 + }, + { + "epoch": 1.6143497757847534, + "grad_norm": 0.15036122500896454, + "learning_rate": 7.983251800516375e-05, + "loss": 3.9517, + "step": 23760 + }, + { + "epoch": 1.6146894958554152, + "grad_norm": 0.2772010862827301, + "learning_rate": 7.982827150428047e-05, + "loss": 3.8772, + "step": 23765 + }, + { + "epoch": 1.6150292159260768, + "grad_norm": 0.16991256177425385, + "learning_rate": 7.982402500339721e-05, + "loss": 3.7141, + "step": 23770 + }, + { + "epoch": 1.6153689359967387, + "grad_norm": 0.1635085642337799, + "learning_rate": 7.981977850251394e-05, + "loss": 4.0778, + "step": 23775 + }, + { + "epoch": 1.6157086560674006, + "grad_norm": 0.21676288545131683, + "learning_rate": 7.981553200163065e-05, + "loss": 4.0274, + "step": 23780 + }, + { + "epoch": 1.6160483761380622, + "grad_norm": 0.16055245697498322, + "learning_rate": 7.981128550074739e-05, + "loss": 3.9156, + "step": 23785 + }, + { + "epoch": 1.616388096208724, + "grad_norm": 0.186578169465065, + "learning_rate": 7.980703899986412e-05, + "loss": 3.9061, + "step": 23790 + }, + { + "epoch": 1.6167278162793859, + "grad_norm": 0.2696215808391571, + "learning_rate": 7.980279249898083e-05, + "loss": 3.8351, + "step": 23795 + }, + { + "epoch": 1.6170675363500475, + "grad_norm": 0.1551664173603058, + "learning_rate": 7.979854599809758e-05, + "loss": 3.941, + "step": 23800 + }, + { + "epoch": 1.6174072564207094, + "grad_norm": 0.15920443832874298, + "learning_rate": 7.97942994972143e-05, + "loss": 3.7827, + "step": 23805 + }, + { + "epoch": 1.6177469764913712, + "grad_norm": 0.19448496401309967, + "learning_rate": 7.979005299633102e-05, + "loss": 3.8097, + "step": 23810 + }, + { + "epoch": 1.6180866965620329, + "grad_norm": 0.20241697132587433, + "learning_rate": 7.978580649544776e-05, + "loss": 4.1276, + "step": 23815 + }, + { + "epoch": 1.6184264166326947, + "grad_norm": 0.14855976402759552, + "learning_rate": 7.978155999456449e-05, + "loss": 3.8784, + "step": 23820 + }, + { + "epoch": 1.6187661367033566, + "grad_norm": 0.23913845419883728, + "learning_rate": 7.97773134936812e-05, + "loss": 3.8483, + "step": 23825 + }, + { + "epoch": 1.6191058567740182, + "grad_norm": 0.20084992051124573, + "learning_rate": 7.977306699279794e-05, + "loss": 3.9628, + "step": 23830 + }, + { + "epoch": 1.6194455768446798, + "grad_norm": 0.19364935159683228, + "learning_rate": 7.976882049191466e-05, + "loss": 3.8376, + "step": 23835 + }, + { + "epoch": 1.619785296915342, + "grad_norm": 0.13114383816719055, + "learning_rate": 7.97645739910314e-05, + "loss": 3.7776, + "step": 23840 + }, + { + "epoch": 1.6201250169860035, + "grad_norm": 0.17281602323055267, + "learning_rate": 7.976032749014813e-05, + "loss": 3.5318, + "step": 23845 + }, + { + "epoch": 1.6204647370566652, + "grad_norm": 0.17594875395298004, + "learning_rate": 7.975608098926484e-05, + "loss": 4.112, + "step": 23850 + }, + { + "epoch": 1.6208044571273272, + "grad_norm": 0.1932700127363205, + "learning_rate": 7.975183448838158e-05, + "loss": 3.9978, + "step": 23855 + }, + { + "epoch": 1.6211441771979889, + "grad_norm": 0.2162492871284485, + "learning_rate": 7.974758798749831e-05, + "loss": 3.7417, + "step": 23860 + }, + { + "epoch": 1.6214838972686505, + "grad_norm": 0.20377536118030548, + "learning_rate": 7.974334148661503e-05, + "loss": 3.8923, + "step": 23865 + }, + { + "epoch": 1.6218236173393124, + "grad_norm": 0.18263210356235504, + "learning_rate": 7.973909498573177e-05, + "loss": 3.8154, + "step": 23870 + }, + { + "epoch": 1.6221633374099742, + "grad_norm": 0.3520301878452301, + "learning_rate": 7.97348484848485e-05, + "loss": 3.8884, + "step": 23875 + }, + { + "epoch": 1.6225030574806358, + "grad_norm": 0.166124626994133, + "learning_rate": 7.973060198396521e-05, + "loss": 3.9625, + "step": 23880 + }, + { + "epoch": 1.6228427775512977, + "grad_norm": 0.17780283093452454, + "learning_rate": 7.972635548308195e-05, + "loss": 4.0434, + "step": 23885 + }, + { + "epoch": 1.6231824976219595, + "grad_norm": 1.1730380058288574, + "learning_rate": 7.972210898219868e-05, + "loss": 4.099, + "step": 23890 + }, + { + "epoch": 1.6235222176926212, + "grad_norm": 0.17068339884281158, + "learning_rate": 7.97178624813154e-05, + "loss": 3.6728, + "step": 23895 + }, + { + "epoch": 1.623861937763283, + "grad_norm": 0.49779629707336426, + "learning_rate": 7.971361598043214e-05, + "loss": 3.8046, + "step": 23900 + }, + { + "epoch": 1.6242016578339449, + "grad_norm": 0.15733446180820465, + "learning_rate": 7.970936947954886e-05, + "loss": 4.0239, + "step": 23905 + }, + { + "epoch": 1.6245413779046065, + "grad_norm": 0.20565132796764374, + "learning_rate": 7.970512297866558e-05, + "loss": 3.9057, + "step": 23910 + }, + { + "epoch": 1.6248810979752684, + "grad_norm": 0.18720857799053192, + "learning_rate": 7.970087647778232e-05, + "loss": 3.9999, + "step": 23915 + }, + { + "epoch": 1.6252208180459302, + "grad_norm": 0.16204072535037994, + "learning_rate": 7.969662997689903e-05, + "loss": 3.8222, + "step": 23920 + }, + { + "epoch": 1.6255605381165918, + "grad_norm": 0.1562989205121994, + "learning_rate": 7.969238347601576e-05, + "loss": 3.9522, + "step": 23925 + }, + { + "epoch": 1.6259002581872537, + "grad_norm": 0.1511804163455963, + "learning_rate": 7.96881369751325e-05, + "loss": 3.833, + "step": 23930 + }, + { + "epoch": 1.6262399782579156, + "grad_norm": 1.5464818477630615, + "learning_rate": 7.968389047424922e-05, + "loss": 3.7497, + "step": 23935 + }, + { + "epoch": 1.6265796983285772, + "grad_norm": 1.3212190866470337, + "learning_rate": 7.967964397336595e-05, + "loss": 3.7713, + "step": 23940 + }, + { + "epoch": 1.626919418399239, + "grad_norm": 0.2094212770462036, + "learning_rate": 7.967539747248269e-05, + "loss": 3.8338, + "step": 23945 + }, + { + "epoch": 1.627259138469901, + "grad_norm": 0.24690334498882294, + "learning_rate": 7.96711509715994e-05, + "loss": 3.6335, + "step": 23950 + }, + { + "epoch": 1.6275988585405625, + "grad_norm": 0.17425140738487244, + "learning_rate": 7.966690447071613e-05, + "loss": 4.0484, + "step": 23955 + }, + { + "epoch": 1.6279385786112244, + "grad_norm": 0.1735134869813919, + "learning_rate": 7.966265796983287e-05, + "loss": 4.172, + "step": 23960 + }, + { + "epoch": 1.6282782986818862, + "grad_norm": 0.20856407284736633, + "learning_rate": 7.965841146894959e-05, + "loss": 3.8079, + "step": 23965 + }, + { + "epoch": 1.6286180187525479, + "grad_norm": 0.1670047640800476, + "learning_rate": 7.965416496806631e-05, + "loss": 4.0131, + "step": 23970 + }, + { + "epoch": 1.6289577388232097, + "grad_norm": 0.18246978521347046, + "learning_rate": 7.964991846718306e-05, + "loss": 3.8564, + "step": 23975 + }, + { + "epoch": 1.6292974588938716, + "grad_norm": 0.25280529260635376, + "learning_rate": 7.964567196629977e-05, + "loss": 3.8665, + "step": 23980 + }, + { + "epoch": 1.6296371789645332, + "grad_norm": 0.16567406058311462, + "learning_rate": 7.96414254654165e-05, + "loss": 4.1064, + "step": 23985 + }, + { + "epoch": 1.629976899035195, + "grad_norm": 0.4880169630050659, + "learning_rate": 7.963717896453323e-05, + "loss": 4.0466, + "step": 23990 + }, + { + "epoch": 1.630316619105857, + "grad_norm": 0.2122366577386856, + "learning_rate": 7.963293246364995e-05, + "loss": 3.8969, + "step": 23995 + }, + { + "epoch": 1.6306563391765185, + "grad_norm": 0.15318627655506134, + "learning_rate": 7.962868596276668e-05, + "loss": 3.8059, + "step": 24000 + }, + { + "epoch": 1.6309960592471802, + "grad_norm": 0.2086213380098343, + "learning_rate": 7.962443946188341e-05, + "loss": 3.9249, + "step": 24005 + }, + { + "epoch": 1.6313357793178422, + "grad_norm": 0.1950540542602539, + "learning_rate": 7.962019296100014e-05, + "loss": 3.6876, + "step": 24010 + }, + { + "epoch": 1.6316754993885039, + "grad_norm": 0.15243861079216003, + "learning_rate": 7.961594646011687e-05, + "loss": 3.7944, + "step": 24015 + }, + { + "epoch": 1.6320152194591655, + "grad_norm": 0.16514404118061066, + "learning_rate": 7.96116999592336e-05, + "loss": 3.9987, + "step": 24020 + }, + { + "epoch": 1.6323549395298276, + "grad_norm": 0.1611555963754654, + "learning_rate": 7.960745345835032e-05, + "loss": 3.988, + "step": 24025 + }, + { + "epoch": 1.6326946596004892, + "grad_norm": 0.1540111005306244, + "learning_rate": 7.960320695746705e-05, + "loss": 3.9708, + "step": 24030 + }, + { + "epoch": 1.6330343796711508, + "grad_norm": 0.17110353708267212, + "learning_rate": 7.959896045658378e-05, + "loss": 4.0325, + "step": 24035 + }, + { + "epoch": 1.633374099741813, + "grad_norm": 0.18728619813919067, + "learning_rate": 7.95947139557005e-05, + "loss": 3.9396, + "step": 24040 + }, + { + "epoch": 1.6337138198124745, + "grad_norm": 0.15820161998271942, + "learning_rate": 7.959046745481723e-05, + "loss": 3.9999, + "step": 24045 + }, + { + "epoch": 1.6340535398831362, + "grad_norm": 0.21352258324623108, + "learning_rate": 7.958622095393396e-05, + "loss": 3.6932, + "step": 24050 + }, + { + "epoch": 1.634393259953798, + "grad_norm": 0.1851387321949005, + "learning_rate": 7.958197445305069e-05, + "loss": 3.5563, + "step": 24055 + }, + { + "epoch": 1.6347329800244599, + "grad_norm": 0.1808813214302063, + "learning_rate": 7.957772795216742e-05, + "loss": 3.918, + "step": 24060 + }, + { + "epoch": 1.6350727000951215, + "grad_norm": 0.16751275956630707, + "learning_rate": 7.957348145128415e-05, + "loss": 4.0617, + "step": 24065 + }, + { + "epoch": 1.6354124201657834, + "grad_norm": 0.2104269117116928, + "learning_rate": 7.956923495040087e-05, + "loss": 4.0817, + "step": 24070 + }, + { + "epoch": 1.6357521402364452, + "grad_norm": 0.1708756685256958, + "learning_rate": 7.95649884495176e-05, + "loss": 3.9209, + "step": 24075 + }, + { + "epoch": 1.6360918603071068, + "grad_norm": 0.18717390298843384, + "learning_rate": 7.956074194863433e-05, + "loss": 4.1886, + "step": 24080 + }, + { + "epoch": 1.6364315803777687, + "grad_norm": 0.3207692801952362, + "learning_rate": 7.955649544775106e-05, + "loss": 4.3937, + "step": 24085 + }, + { + "epoch": 1.6367713004484306, + "grad_norm": 0.21544545888900757, + "learning_rate": 7.955224894686779e-05, + "loss": 3.9264, + "step": 24090 + }, + { + "epoch": 1.6371110205190922, + "grad_norm": 0.1910250186920166, + "learning_rate": 7.954800244598451e-05, + "loss": 3.738, + "step": 24095 + }, + { + "epoch": 1.637450740589754, + "grad_norm": 0.14297524094581604, + "learning_rate": 7.954375594510124e-05, + "loss": 3.8462, + "step": 24100 + }, + { + "epoch": 1.637790460660416, + "grad_norm": 0.17333899438381195, + "learning_rate": 7.953950944421797e-05, + "loss": 3.8957, + "step": 24105 + }, + { + "epoch": 1.6381301807310775, + "grad_norm": 0.1948770433664322, + "learning_rate": 7.95352629433347e-05, + "loss": 3.7236, + "step": 24110 + }, + { + "epoch": 1.6384699008017394, + "grad_norm": 0.18003833293914795, + "learning_rate": 7.953101644245143e-05, + "loss": 3.8184, + "step": 24115 + }, + { + "epoch": 1.6388096208724012, + "grad_norm": 0.17102402448654175, + "learning_rate": 7.952676994156814e-05, + "loss": 3.8014, + "step": 24120 + }, + { + "epoch": 1.6391493409430629, + "grad_norm": 0.16062317788600922, + "learning_rate": 7.952252344068488e-05, + "loss": 3.7374, + "step": 24125 + }, + { + "epoch": 1.6394890610137247, + "grad_norm": 0.165630504488945, + "learning_rate": 7.951827693980161e-05, + "loss": 3.9524, + "step": 24130 + }, + { + "epoch": 1.6398287810843866, + "grad_norm": 0.17556560039520264, + "learning_rate": 7.951403043891832e-05, + "loss": 3.9246, + "step": 24135 + }, + { + "epoch": 1.6401685011550482, + "grad_norm": 0.20636606216430664, + "learning_rate": 7.950978393803507e-05, + "loss": 4.0547, + "step": 24140 + }, + { + "epoch": 1.64050822122571, + "grad_norm": 0.20399969816207886, + "learning_rate": 7.95055374371518e-05, + "loss": 3.803, + "step": 24145 + }, + { + "epoch": 1.640847941296372, + "grad_norm": 0.1642654836177826, + "learning_rate": 7.950129093626851e-05, + "loss": 4.1226, + "step": 24150 + }, + { + "epoch": 1.6411876613670335, + "grad_norm": 0.1758507788181305, + "learning_rate": 7.949704443538525e-05, + "loss": 3.7783, + "step": 24155 + }, + { + "epoch": 1.6415273814376954, + "grad_norm": 0.17428165674209595, + "learning_rate": 7.949279793450198e-05, + "loss": 3.6405, + "step": 24160 + }, + { + "epoch": 1.6418671015083572, + "grad_norm": 0.274393767118454, + "learning_rate": 7.948855143361869e-05, + "loss": 4.0466, + "step": 24165 + }, + { + "epoch": 1.6422068215790189, + "grad_norm": 0.1890534907579422, + "learning_rate": 7.948430493273543e-05, + "loss": 3.7475, + "step": 24170 + }, + { + "epoch": 1.6425465416496805, + "grad_norm": 1.7205244302749634, + "learning_rate": 7.948005843185216e-05, + "loss": 3.8132, + "step": 24175 + }, + { + "epoch": 1.6428862617203426, + "grad_norm": 0.16669093072414398, + "learning_rate": 7.947581193096889e-05, + "loss": 3.9114, + "step": 24180 + }, + { + "epoch": 1.6432259817910042, + "grad_norm": 0.21115164458751678, + "learning_rate": 7.947156543008562e-05, + "loss": 3.9481, + "step": 24185 + }, + { + "epoch": 1.6435657018616658, + "grad_norm": 0.15691471099853516, + "learning_rate": 7.946731892920233e-05, + "loss": 3.6542, + "step": 24190 + }, + { + "epoch": 1.643905421932328, + "grad_norm": 0.17998024821281433, + "learning_rate": 7.946307242831907e-05, + "loss": 3.7132, + "step": 24195 + }, + { + "epoch": 1.6442451420029895, + "grad_norm": 0.2970520555973053, + "learning_rate": 7.94588259274358e-05, + "loss": 4.1075, + "step": 24200 + }, + { + "epoch": 1.6445848620736512, + "grad_norm": 0.1864253431558609, + "learning_rate": 7.945457942655252e-05, + "loss": 4.1143, + "step": 24205 + }, + { + "epoch": 1.6449245821443133, + "grad_norm": 0.1456526219844818, + "learning_rate": 7.945033292566926e-05, + "loss": 3.7388, + "step": 24210 + }, + { + "epoch": 1.6452643022149749, + "grad_norm": 0.19242537021636963, + "learning_rate": 7.944608642478599e-05, + "loss": 3.921, + "step": 24215 + }, + { + "epoch": 1.6456040222856365, + "grad_norm": 0.3201119005680084, + "learning_rate": 7.94418399239027e-05, + "loss": 3.8611, + "step": 24220 + }, + { + "epoch": 1.6459437423562984, + "grad_norm": 0.18175916373729706, + "learning_rate": 7.943759342301944e-05, + "loss": 4.0219, + "step": 24225 + }, + { + "epoch": 1.6462834624269602, + "grad_norm": 0.28742724657058716, + "learning_rate": 7.943334692213617e-05, + "loss": 3.9881, + "step": 24230 + }, + { + "epoch": 1.6466231824976219, + "grad_norm": 0.19212494790554047, + "learning_rate": 7.942910042125288e-05, + "loss": 3.9363, + "step": 24235 + }, + { + "epoch": 1.6469629025682837, + "grad_norm": 0.18037335574626923, + "learning_rate": 7.942485392036963e-05, + "loss": 3.9165, + "step": 24240 + }, + { + "epoch": 1.6473026226389456, + "grad_norm": 0.30591702461242676, + "learning_rate": 7.942060741948635e-05, + "loss": 3.6759, + "step": 24245 + }, + { + "epoch": 1.6476423427096072, + "grad_norm": 0.17178910970687866, + "learning_rate": 7.941636091860307e-05, + "loss": 3.9932, + "step": 24250 + }, + { + "epoch": 1.647982062780269, + "grad_norm": 0.20229366421699524, + "learning_rate": 7.941211441771981e-05, + "loss": 3.7751, + "step": 24255 + }, + { + "epoch": 1.648321782850931, + "grad_norm": 0.21419720351696014, + "learning_rate": 7.940786791683652e-05, + "loss": 4.0039, + "step": 24260 + }, + { + "epoch": 1.6486615029215925, + "grad_norm": 0.2730887234210968, + "learning_rate": 7.940362141595325e-05, + "loss": 3.9142, + "step": 24265 + }, + { + "epoch": 1.6490012229922544, + "grad_norm": 0.24491539597511292, + "learning_rate": 7.939937491507e-05, + "loss": 3.7676, + "step": 24270 + }, + { + "epoch": 1.6493409430629162, + "grad_norm": 0.14415740966796875, + "learning_rate": 7.939512841418671e-05, + "loss": 3.7109, + "step": 24275 + }, + { + "epoch": 1.6496806631335779, + "grad_norm": 0.2772815525531769, + "learning_rate": 7.939088191330344e-05, + "loss": 4.1084, + "step": 24280 + }, + { + "epoch": 1.6500203832042397, + "grad_norm": 0.16489918529987335, + "learning_rate": 7.938663541242018e-05, + "loss": 3.7919, + "step": 24285 + }, + { + "epoch": 1.6503601032749016, + "grad_norm": 0.25432080030441284, + "learning_rate": 7.938238891153689e-05, + "loss": 3.8082, + "step": 24290 + }, + { + "epoch": 1.6506998233455632, + "grad_norm": 0.24341252446174622, + "learning_rate": 7.937814241065362e-05, + "loss": 4.1277, + "step": 24295 + }, + { + "epoch": 1.651039543416225, + "grad_norm": 0.13834118843078613, + "learning_rate": 7.937389590977036e-05, + "loss": 3.8603, + "step": 24300 + }, + { + "epoch": 1.651379263486887, + "grad_norm": 0.15653274953365326, + "learning_rate": 7.936964940888708e-05, + "loss": 3.8198, + "step": 24305 + }, + { + "epoch": 1.6517189835575485, + "grad_norm": 0.1972159594297409, + "learning_rate": 7.93654029080038e-05, + "loss": 4.1495, + "step": 24310 + }, + { + "epoch": 1.6520587036282104, + "grad_norm": 0.1429699808359146, + "learning_rate": 7.936115640712055e-05, + "loss": 3.9132, + "step": 24315 + }, + { + "epoch": 1.6523984236988722, + "grad_norm": 0.2528495192527771, + "learning_rate": 7.935690990623726e-05, + "loss": 3.7837, + "step": 24320 + }, + { + "epoch": 1.6527381437695339, + "grad_norm": 0.45410969853401184, + "learning_rate": 7.935266340535399e-05, + "loss": 3.943, + "step": 24325 + }, + { + "epoch": 1.6530778638401957, + "grad_norm": 0.1773313730955124, + "learning_rate": 7.934841690447073e-05, + "loss": 3.6774, + "step": 24330 + }, + { + "epoch": 1.6534175839108576, + "grad_norm": 0.7874429821968079, + "learning_rate": 7.934417040358744e-05, + "loss": 3.8536, + "step": 24335 + }, + { + "epoch": 1.6537573039815192, + "grad_norm": 0.1676434874534607, + "learning_rate": 7.933992390270417e-05, + "loss": 3.7092, + "step": 24340 + }, + { + "epoch": 1.6540970240521808, + "grad_norm": 0.1533070206642151, + "learning_rate": 7.93356774018209e-05, + "loss": 3.7544, + "step": 24345 + }, + { + "epoch": 1.654436744122843, + "grad_norm": 0.1748826503753662, + "learning_rate": 7.933143090093763e-05, + "loss": 3.8996, + "step": 24350 + }, + { + "epoch": 1.6547764641935045, + "grad_norm": 0.2941761314868927, + "learning_rate": 7.932718440005436e-05, + "loss": 3.8862, + "step": 24355 + }, + { + "epoch": 1.6551161842641662, + "grad_norm": 0.15498290956020355, + "learning_rate": 7.932293789917108e-05, + "loss": 3.9512, + "step": 24360 + }, + { + "epoch": 1.6554559043348283, + "grad_norm": 0.18281693756580353, + "learning_rate": 7.931869139828781e-05, + "loss": 3.8565, + "step": 24365 + }, + { + "epoch": 1.6557956244054899, + "grad_norm": 0.18015894293785095, + "learning_rate": 7.931444489740454e-05, + "loss": 3.8939, + "step": 24370 + }, + { + "epoch": 1.6561353444761515, + "grad_norm": 0.2272566556930542, + "learning_rate": 7.931019839652127e-05, + "loss": 3.8989, + "step": 24375 + }, + { + "epoch": 1.6564750645468136, + "grad_norm": 0.14183594286441803, + "learning_rate": 7.9305951895638e-05, + "loss": 3.8357, + "step": 24380 + }, + { + "epoch": 1.6568147846174752, + "grad_norm": 0.1696137636899948, + "learning_rate": 7.930170539475472e-05, + "loss": 4.1266, + "step": 24385 + }, + { + "epoch": 1.6571545046881369, + "grad_norm": 0.17644689977169037, + "learning_rate": 7.929745889387145e-05, + "loss": 3.7296, + "step": 24390 + }, + { + "epoch": 1.6574942247587987, + "grad_norm": 0.20541059970855713, + "learning_rate": 7.929321239298818e-05, + "loss": 3.9605, + "step": 24395 + }, + { + "epoch": 1.6578339448294606, + "grad_norm": 0.4709828197956085, + "learning_rate": 7.928896589210491e-05, + "loss": 3.9894, + "step": 24400 + }, + { + "epoch": 1.6581736649001222, + "grad_norm": 0.2560257911682129, + "learning_rate": 7.928471939122164e-05, + "loss": 3.7891, + "step": 24405 + }, + { + "epoch": 1.658513384970784, + "grad_norm": 0.19005152583122253, + "learning_rate": 7.928047289033836e-05, + "loss": 3.9181, + "step": 24410 + }, + { + "epoch": 1.658853105041446, + "grad_norm": 0.19898542761802673, + "learning_rate": 7.927622638945509e-05, + "loss": 3.9626, + "step": 24415 + }, + { + "epoch": 1.6591928251121075, + "grad_norm": 0.5456429719924927, + "learning_rate": 7.927197988857182e-05, + "loss": 3.9496, + "step": 24420 + }, + { + "epoch": 1.6595325451827694, + "grad_norm": 0.18062038719654083, + "learning_rate": 7.926773338768855e-05, + "loss": 3.9342, + "step": 24425 + }, + { + "epoch": 1.6598722652534312, + "grad_norm": 0.1970699578523636, + "learning_rate": 7.926348688680528e-05, + "loss": 4.0193, + "step": 24430 + }, + { + "epoch": 1.6602119853240929, + "grad_norm": 0.23547931015491486, + "learning_rate": 7.9259240385922e-05, + "loss": 3.8365, + "step": 24435 + }, + { + "epoch": 1.6605517053947547, + "grad_norm": 0.2527804970741272, + "learning_rate": 7.925499388503873e-05, + "loss": 3.9595, + "step": 24440 + }, + { + "epoch": 1.6608914254654166, + "grad_norm": 0.1774808168411255, + "learning_rate": 7.925074738415546e-05, + "loss": 3.887, + "step": 24445 + }, + { + "epoch": 1.6612311455360782, + "grad_norm": 0.1556062251329422, + "learning_rate": 7.924650088327219e-05, + "loss": 4.0998, + "step": 24450 + }, + { + "epoch": 1.66157086560674, + "grad_norm": 0.20335814356803894, + "learning_rate": 7.924225438238892e-05, + "loss": 3.8372, + "step": 24455 + }, + { + "epoch": 1.661910585677402, + "grad_norm": 0.17355573177337646, + "learning_rate": 7.923800788150563e-05, + "loss": 4.0016, + "step": 24460 + }, + { + "epoch": 1.6622503057480635, + "grad_norm": 0.20219679176807404, + "learning_rate": 7.923376138062237e-05, + "loss": 4.0143, + "step": 24465 + }, + { + "epoch": 1.6625900258187254, + "grad_norm": 0.7917993068695068, + "learning_rate": 7.92295148797391e-05, + "loss": 4.0577, + "step": 24470 + }, + { + "epoch": 1.6629297458893872, + "grad_norm": 0.18906843662261963, + "learning_rate": 7.922526837885582e-05, + "loss": 4.2456, + "step": 24475 + }, + { + "epoch": 1.6632694659600489, + "grad_norm": 0.16311801970005035, + "learning_rate": 7.922102187797256e-05, + "loss": 4.139, + "step": 24480 + }, + { + "epoch": 1.6636091860307107, + "grad_norm": 0.2580588459968567, + "learning_rate": 7.921677537708929e-05, + "loss": 3.8523, + "step": 24485 + }, + { + "epoch": 1.6639489061013726, + "grad_norm": 0.19363470375537872, + "learning_rate": 7.9212528876206e-05, + "loss": 3.8049, + "step": 24490 + }, + { + "epoch": 1.6642886261720342, + "grad_norm": 0.38199466466903687, + "learning_rate": 7.920828237532274e-05, + "loss": 4.0983, + "step": 24495 + }, + { + "epoch": 1.664628346242696, + "grad_norm": 0.16901031136512756, + "learning_rate": 7.920403587443947e-05, + "loss": 4.1684, + "step": 24500 + }, + { + "epoch": 1.664968066313358, + "grad_norm": 0.35095927119255066, + "learning_rate": 7.919978937355618e-05, + "loss": 3.7749, + "step": 24505 + }, + { + "epoch": 1.6653077863840196, + "grad_norm": 0.15602275729179382, + "learning_rate": 7.919554287267293e-05, + "loss": 3.8426, + "step": 24510 + }, + { + "epoch": 1.6656475064546812, + "grad_norm": 0.15190468728542328, + "learning_rate": 7.919129637178965e-05, + "loss": 3.6892, + "step": 24515 + }, + { + "epoch": 1.6659872265253433, + "grad_norm": 0.28014689683914185, + "learning_rate": 7.918704987090638e-05, + "loss": 3.728, + "step": 24520 + }, + { + "epoch": 1.6663269465960049, + "grad_norm": 0.13023699820041656, + "learning_rate": 7.918280337002311e-05, + "loss": 3.9152, + "step": 24525 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.766687273979187, + "learning_rate": 7.917855686913984e-05, + "loss": 3.9641, + "step": 24530 + }, + { + "epoch": 1.6670063867373286, + "grad_norm": 0.2289053350687027, + "learning_rate": 7.917431036825657e-05, + "loss": 4.0466, + "step": 24535 + }, + { + "epoch": 1.6673461068079902, + "grad_norm": 0.18119145929813385, + "learning_rate": 7.917006386737329e-05, + "loss": 3.8409, + "step": 24540 + }, + { + "epoch": 1.6676858268786519, + "grad_norm": 15.513923645019531, + "learning_rate": 7.916581736649001e-05, + "loss": 3.9725, + "step": 24545 + }, + { + "epoch": 1.668025546949314, + "grad_norm": 0.18886922299861908, + "learning_rate": 7.916157086560675e-05, + "loss": 3.8847, + "step": 24550 + }, + { + "epoch": 1.6683652670199756, + "grad_norm": 0.17136278748512268, + "learning_rate": 7.915732436472348e-05, + "loss": 4.1517, + "step": 24555 + }, + { + "epoch": 1.6687049870906372, + "grad_norm": 0.1670803725719452, + "learning_rate": 7.915307786384019e-05, + "loss": 3.8905, + "step": 24560 + }, + { + "epoch": 1.669044707161299, + "grad_norm": 0.17153289914131165, + "learning_rate": 7.914883136295693e-05, + "loss": 3.9441, + "step": 24565 + }, + { + "epoch": 1.669384427231961, + "grad_norm": 0.1640990823507309, + "learning_rate": 7.914458486207366e-05, + "loss": 4.0846, + "step": 24570 + }, + { + "epoch": 1.6697241473026225, + "grad_norm": 0.24264968931674957, + "learning_rate": 7.914033836119038e-05, + "loss": 3.9668, + "step": 24575 + }, + { + "epoch": 1.6700638673732844, + "grad_norm": 0.22977758944034576, + "learning_rate": 7.913609186030712e-05, + "loss": 3.9312, + "step": 24580 + }, + { + "epoch": 1.6704035874439462, + "grad_norm": 0.20148779451847076, + "learning_rate": 7.913184535942385e-05, + "loss": 3.9131, + "step": 24585 + }, + { + "epoch": 1.6707433075146079, + "grad_norm": 0.15561577677726746, + "learning_rate": 7.912759885854056e-05, + "loss": 3.6978, + "step": 24590 + }, + { + "epoch": 1.6710830275852697, + "grad_norm": 0.1999325305223465, + "learning_rate": 7.91233523576573e-05, + "loss": 3.9485, + "step": 24595 + }, + { + "epoch": 1.6714227476559316, + "grad_norm": 0.17174272239208221, + "learning_rate": 7.911910585677403e-05, + "loss": 3.7367, + "step": 24600 + }, + { + "epoch": 1.6717624677265932, + "grad_norm": 0.16479277610778809, + "learning_rate": 7.911485935589074e-05, + "loss": 3.9589, + "step": 24605 + }, + { + "epoch": 1.672102187797255, + "grad_norm": 0.8449418544769287, + "learning_rate": 7.911061285500749e-05, + "loss": 4.0846, + "step": 24610 + }, + { + "epoch": 1.672441907867917, + "grad_norm": 0.19327273964881897, + "learning_rate": 7.91063663541242e-05, + "loss": 4.0513, + "step": 24615 + }, + { + "epoch": 1.6727816279385785, + "grad_norm": 0.20895454287528992, + "learning_rate": 7.910211985324093e-05, + "loss": 3.8606, + "step": 24620 + }, + { + "epoch": 1.6731213480092404, + "grad_norm": 0.22944612801074982, + "learning_rate": 7.909787335235767e-05, + "loss": 3.9255, + "step": 24625 + }, + { + "epoch": 1.6734610680799022, + "grad_norm": 0.1641892045736313, + "learning_rate": 7.909362685147438e-05, + "loss": 4.0247, + "step": 24630 + }, + { + "epoch": 1.6738007881505639, + "grad_norm": 0.157088041305542, + "learning_rate": 7.908938035059111e-05, + "loss": 3.8922, + "step": 24635 + }, + { + "epoch": 1.6741405082212257, + "grad_norm": 0.18174982070922852, + "learning_rate": 7.908513384970785e-05, + "loss": 3.8923, + "step": 24640 + }, + { + "epoch": 1.6744802282918876, + "grad_norm": 0.6013708710670471, + "learning_rate": 7.908088734882457e-05, + "loss": 3.866, + "step": 24645 + }, + { + "epoch": 1.6748199483625492, + "grad_norm": 0.1545679271221161, + "learning_rate": 7.90766408479413e-05, + "loss": 3.835, + "step": 24650 + }, + { + "epoch": 1.675159668433211, + "grad_norm": 0.16208264231681824, + "learning_rate": 7.907239434705804e-05, + "loss": 4.0795, + "step": 24655 + }, + { + "epoch": 1.675499388503873, + "grad_norm": 0.16745969653129578, + "learning_rate": 7.906814784617475e-05, + "loss": 3.8006, + "step": 24660 + }, + { + "epoch": 1.6758391085745346, + "grad_norm": 0.21567237377166748, + "learning_rate": 7.906390134529148e-05, + "loss": 4.0386, + "step": 24665 + }, + { + "epoch": 1.6761788286451964, + "grad_norm": 0.23796403408050537, + "learning_rate": 7.905965484440822e-05, + "loss": 3.8471, + "step": 24670 + }, + { + "epoch": 1.6765185487158583, + "grad_norm": 0.48861321806907654, + "learning_rate": 7.905540834352494e-05, + "loss": 4.0926, + "step": 24675 + }, + { + "epoch": 1.67685826878652, + "grad_norm": 0.2219013124704361, + "learning_rate": 7.905116184264166e-05, + "loss": 3.7942, + "step": 24680 + }, + { + "epoch": 1.6771979888571815, + "grad_norm": 0.19302813708782196, + "learning_rate": 7.904691534175839e-05, + "loss": 4.1025, + "step": 24685 + }, + { + "epoch": 1.6775377089278436, + "grad_norm": 0.299867182970047, + "learning_rate": 7.904266884087512e-05, + "loss": 3.8444, + "step": 24690 + }, + { + "epoch": 1.6778774289985052, + "grad_norm": 0.18383780121803284, + "learning_rate": 7.903842233999185e-05, + "loss": 3.9753, + "step": 24695 + }, + { + "epoch": 1.6782171490691669, + "grad_norm": 0.16705390810966492, + "learning_rate": 7.903417583910858e-05, + "loss": 3.9721, + "step": 24700 + }, + { + "epoch": 1.678556869139829, + "grad_norm": 0.17858977615833282, + "learning_rate": 7.90299293382253e-05, + "loss": 3.8638, + "step": 24705 + }, + { + "epoch": 1.6788965892104906, + "grad_norm": 0.1406175047159195, + "learning_rate": 7.902568283734203e-05, + "loss": 4.0423, + "step": 24710 + }, + { + "epoch": 1.6792363092811522, + "grad_norm": 0.15125994384288788, + "learning_rate": 7.902143633645876e-05, + "loss": 4.0982, + "step": 24715 + }, + { + "epoch": 1.6795760293518143, + "grad_norm": 0.17645205557346344, + "learning_rate": 7.901718983557549e-05, + "loss": 3.9483, + "step": 24720 + }, + { + "epoch": 1.679915749422476, + "grad_norm": 0.15454550087451935, + "learning_rate": 7.901294333469222e-05, + "loss": 3.7015, + "step": 24725 + }, + { + "epoch": 1.6802554694931375, + "grad_norm": 0.21556589007377625, + "learning_rate": 7.900869683380894e-05, + "loss": 3.9092, + "step": 24730 + }, + { + "epoch": 1.6805951895637994, + "grad_norm": 0.19907844066619873, + "learning_rate": 7.900445033292567e-05, + "loss": 3.8623, + "step": 24735 + }, + { + "epoch": 1.6809349096344612, + "grad_norm": 0.16469287872314453, + "learning_rate": 7.90002038320424e-05, + "loss": 3.8743, + "step": 24740 + }, + { + "epoch": 1.6812746297051229, + "grad_norm": 1.904208779335022, + "learning_rate": 7.899595733115913e-05, + "loss": 4.1633, + "step": 24745 + }, + { + "epoch": 1.6816143497757847, + "grad_norm": 0.17544928193092346, + "learning_rate": 7.899171083027586e-05, + "loss": 4.0481, + "step": 24750 + }, + { + "epoch": 1.6819540698464466, + "grad_norm": 0.22521540522575378, + "learning_rate": 7.898746432939258e-05, + "loss": 3.9329, + "step": 24755 + }, + { + "epoch": 1.6822937899171082, + "grad_norm": 0.18599478900432587, + "learning_rate": 7.898321782850931e-05, + "loss": 3.7483, + "step": 24760 + }, + { + "epoch": 1.68263350998777, + "grad_norm": 0.7569705843925476, + "learning_rate": 7.897897132762604e-05, + "loss": 3.458, + "step": 24765 + }, + { + "epoch": 1.682973230058432, + "grad_norm": 0.20300547778606415, + "learning_rate": 7.897472482674277e-05, + "loss": 3.8353, + "step": 24770 + }, + { + "epoch": 1.6833129501290935, + "grad_norm": 0.15606099367141724, + "learning_rate": 7.89704783258595e-05, + "loss": 3.9055, + "step": 24775 + }, + { + "epoch": 1.6836526701997554, + "grad_norm": 0.19566331803798676, + "learning_rate": 7.896623182497622e-05, + "loss": 4.1395, + "step": 24780 + }, + { + "epoch": 1.6839923902704172, + "grad_norm": 0.17520058155059814, + "learning_rate": 7.896198532409295e-05, + "loss": 3.9097, + "step": 24785 + }, + { + "epoch": 1.6843321103410789, + "grad_norm": 0.19911041855812073, + "learning_rate": 7.895773882320968e-05, + "loss": 3.7572, + "step": 24790 + }, + { + "epoch": 1.6846718304117407, + "grad_norm": 0.1680784523487091, + "learning_rate": 7.895349232232641e-05, + "loss": 4.0063, + "step": 24795 + }, + { + "epoch": 1.6850115504824026, + "grad_norm": 0.15861257910728455, + "learning_rate": 7.894924582144314e-05, + "loss": 3.7826, + "step": 24800 + }, + { + "epoch": 1.6853512705530642, + "grad_norm": 0.2248765528202057, + "learning_rate": 7.894499932055986e-05, + "loss": 3.705, + "step": 24805 + }, + { + "epoch": 1.685690990623726, + "grad_norm": 0.22191625833511353, + "learning_rate": 7.894075281967659e-05, + "loss": 3.6166, + "step": 24810 + }, + { + "epoch": 1.686030710694388, + "grad_norm": 0.17160432040691376, + "learning_rate": 7.89365063187933e-05, + "loss": 3.8617, + "step": 24815 + }, + { + "epoch": 1.6863704307650496, + "grad_norm": 0.17551803588867188, + "learning_rate": 7.893225981791005e-05, + "loss": 3.7428, + "step": 24820 + }, + { + "epoch": 1.6867101508357114, + "grad_norm": 1.0323764085769653, + "learning_rate": 7.892801331702678e-05, + "loss": 3.924, + "step": 24825 + }, + { + "epoch": 1.6870498709063733, + "grad_norm": 0.16687646508216858, + "learning_rate": 7.892376681614349e-05, + "loss": 3.9574, + "step": 24830 + }, + { + "epoch": 1.687389590977035, + "grad_norm": 0.20594742894172668, + "learning_rate": 7.891952031526023e-05, + "loss": 3.8771, + "step": 24835 + }, + { + "epoch": 1.6877293110476967, + "grad_norm": 0.1527002602815628, + "learning_rate": 7.891527381437696e-05, + "loss": 3.985, + "step": 24840 + }, + { + "epoch": 1.6880690311183586, + "grad_norm": 0.19883416593074799, + "learning_rate": 7.891102731349367e-05, + "loss": 3.9944, + "step": 24845 + }, + { + "epoch": 1.6884087511890202, + "grad_norm": 0.32702088356018066, + "learning_rate": 7.890678081261042e-05, + "loss": 3.9083, + "step": 24850 + }, + { + "epoch": 1.6887484712596819, + "grad_norm": 0.15654422342777252, + "learning_rate": 7.890253431172714e-05, + "loss": 3.6697, + "step": 24855 + }, + { + "epoch": 1.689088191330344, + "grad_norm": 0.17276668548583984, + "learning_rate": 7.889828781084387e-05, + "loss": 3.9487, + "step": 24860 + }, + { + "epoch": 1.6894279114010056, + "grad_norm": 0.15820203721523285, + "learning_rate": 7.88940413099606e-05, + "loss": 3.7651, + "step": 24865 + }, + { + "epoch": 1.6897676314716672, + "grad_norm": 0.1623237282037735, + "learning_rate": 7.888979480907733e-05, + "loss": 3.6669, + "step": 24870 + }, + { + "epoch": 1.6901073515423293, + "grad_norm": 0.1379580944776535, + "learning_rate": 7.888554830819406e-05, + "loss": 3.9417, + "step": 24875 + }, + { + "epoch": 1.690447071612991, + "grad_norm": 0.15247198939323425, + "learning_rate": 7.888130180731078e-05, + "loss": 3.8024, + "step": 24880 + }, + { + "epoch": 1.6907867916836525, + "grad_norm": 0.1784202754497528, + "learning_rate": 7.88770553064275e-05, + "loss": 3.5637, + "step": 24885 + }, + { + "epoch": 1.6911265117543146, + "grad_norm": 0.17432157695293427, + "learning_rate": 7.887280880554424e-05, + "loss": 3.8442, + "step": 24890 + }, + { + "epoch": 1.6914662318249762, + "grad_norm": 0.22485899925231934, + "learning_rate": 7.886856230466097e-05, + "loss": 3.902, + "step": 24895 + }, + { + "epoch": 1.6918059518956379, + "grad_norm": 0.18886318802833557, + "learning_rate": 7.886431580377768e-05, + "loss": 3.6893, + "step": 24900 + }, + { + "epoch": 1.6921456719662997, + "grad_norm": 0.23344670236110687, + "learning_rate": 7.886006930289442e-05, + "loss": 3.7719, + "step": 24905 + }, + { + "epoch": 1.6924853920369616, + "grad_norm": 0.14375576376914978, + "learning_rate": 7.885582280201115e-05, + "loss": 3.9724, + "step": 24910 + }, + { + "epoch": 1.6928251121076232, + "grad_norm": 0.20028933882713318, + "learning_rate": 7.885157630112787e-05, + "loss": 3.7822, + "step": 24915 + }, + { + "epoch": 1.693164832178285, + "grad_norm": 0.18122731149196625, + "learning_rate": 7.884732980024461e-05, + "loss": 4.0248, + "step": 24920 + }, + { + "epoch": 1.693504552248947, + "grad_norm": 0.16876550018787384, + "learning_rate": 7.884308329936134e-05, + "loss": 4.192, + "step": 24925 + }, + { + "epoch": 1.6938442723196085, + "grad_norm": 1.6714425086975098, + "learning_rate": 7.883883679847805e-05, + "loss": 3.6576, + "step": 24930 + }, + { + "epoch": 1.6941839923902704, + "grad_norm": 0.14793816208839417, + "learning_rate": 7.883459029759479e-05, + "loss": 4.0956, + "step": 24935 + }, + { + "epoch": 1.6945237124609323, + "grad_norm": 0.5837845802307129, + "learning_rate": 7.883034379671152e-05, + "loss": 3.8922, + "step": 24940 + }, + { + "epoch": 1.6948634325315939, + "grad_norm": 0.2746259272098541, + "learning_rate": 7.882609729582823e-05, + "loss": 3.8964, + "step": 24945 + }, + { + "epoch": 1.6952031526022557, + "grad_norm": 0.9002888798713684, + "learning_rate": 7.882185079494498e-05, + "loss": 3.9459, + "step": 24950 + }, + { + "epoch": 1.6955428726729176, + "grad_norm": 0.1935095340013504, + "learning_rate": 7.88176042940617e-05, + "loss": 3.8441, + "step": 24955 + }, + { + "epoch": 1.6958825927435792, + "grad_norm": 0.16913118958473206, + "learning_rate": 7.881335779317842e-05, + "loss": 3.6909, + "step": 24960 + }, + { + "epoch": 1.696222312814241, + "grad_norm": 0.1677272468805313, + "learning_rate": 7.880911129229516e-05, + "loss": 3.7779, + "step": 24965 + }, + { + "epoch": 1.696562032884903, + "grad_norm": 0.132848858833313, + "learning_rate": 7.880486479141187e-05, + "loss": 4.0242, + "step": 24970 + }, + { + "epoch": 1.6969017529555646, + "grad_norm": 0.16878566145896912, + "learning_rate": 7.88006182905286e-05, + "loss": 3.9448, + "step": 24975 + }, + { + "epoch": 1.6972414730262264, + "grad_norm": 0.1811150163412094, + "learning_rate": 7.879637178964534e-05, + "loss": 3.7488, + "step": 24980 + }, + { + "epoch": 1.6975811930968883, + "grad_norm": 0.24220719933509827, + "learning_rate": 7.879212528876206e-05, + "loss": 3.6223, + "step": 24985 + }, + { + "epoch": 1.69792091316755, + "grad_norm": 0.19750110805034637, + "learning_rate": 7.878787878787879e-05, + "loss": 3.846, + "step": 24990 + }, + { + "epoch": 1.6982606332382117, + "grad_norm": 0.1806909143924713, + "learning_rate": 7.878363228699553e-05, + "loss": 3.9761, + "step": 24995 + }, + { + "epoch": 1.6986003533088736, + "grad_norm": 0.1541869342327118, + "learning_rate": 7.877938578611224e-05, + "loss": 4.0497, + "step": 25000 + }, + { + "epoch": 1.6989400733795352, + "grad_norm": 0.16378028690814972, + "learning_rate": 7.877513928522897e-05, + "loss": 3.9157, + "step": 25005 + }, + { + "epoch": 1.699279793450197, + "grad_norm": 0.21256224811077118, + "learning_rate": 7.877089278434571e-05, + "loss": 4.0286, + "step": 25010 + }, + { + "epoch": 1.699619513520859, + "grad_norm": 0.192632794380188, + "learning_rate": 7.876664628346243e-05, + "loss": 3.84, + "step": 25015 + }, + { + "epoch": 1.6999592335915206, + "grad_norm": 0.16443713009357452, + "learning_rate": 7.876239978257915e-05, + "loss": 3.9679, + "step": 25020 + }, + { + "epoch": 1.7002989536621822, + "grad_norm": 1.0100945234298706, + "learning_rate": 7.87581532816959e-05, + "loss": 3.874, + "step": 25025 + }, + { + "epoch": 1.7006386737328443, + "grad_norm": 0.18409784138202667, + "learning_rate": 7.875390678081261e-05, + "loss": 3.8486, + "step": 25030 + }, + { + "epoch": 1.700978393803506, + "grad_norm": 0.25915852189064026, + "learning_rate": 7.874966027992934e-05, + "loss": 3.5774, + "step": 25035 + }, + { + "epoch": 1.7013181138741675, + "grad_norm": 0.19250795245170593, + "learning_rate": 7.874541377904607e-05, + "loss": 3.9698, + "step": 25040 + }, + { + "epoch": 1.7016578339448296, + "grad_norm": 0.19508054852485657, + "learning_rate": 7.87411672781628e-05, + "loss": 3.947, + "step": 25045 + }, + { + "epoch": 1.7019975540154912, + "grad_norm": 0.2270628660917282, + "learning_rate": 7.873692077727952e-05, + "loss": 3.8637, + "step": 25050 + }, + { + "epoch": 1.7023372740861529, + "grad_norm": 0.2161751091480255, + "learning_rate": 7.873267427639625e-05, + "loss": 3.8807, + "step": 25055 + }, + { + "epoch": 1.702676994156815, + "grad_norm": 0.18651194870471954, + "learning_rate": 7.872842777551298e-05, + "loss": 3.8256, + "step": 25060 + }, + { + "epoch": 1.7030167142274766, + "grad_norm": 0.14387968182563782, + "learning_rate": 7.872418127462971e-05, + "loss": 3.7965, + "step": 25065 + }, + { + "epoch": 1.7033564342981382, + "grad_norm": 0.17202286422252655, + "learning_rate": 7.871993477374643e-05, + "loss": 3.9312, + "step": 25070 + }, + { + "epoch": 1.7036961543688, + "grad_norm": 0.18648800253868103, + "learning_rate": 7.871568827286316e-05, + "loss": 3.695, + "step": 25075 + }, + { + "epoch": 1.704035874439462, + "grad_norm": 0.18257348239421844, + "learning_rate": 7.871144177197989e-05, + "loss": 3.7781, + "step": 25080 + }, + { + "epoch": 1.7043755945101235, + "grad_norm": 0.19214104115962982, + "learning_rate": 7.870719527109662e-05, + "loss": 4.0052, + "step": 25085 + }, + { + "epoch": 1.7047153145807854, + "grad_norm": 1.8989408016204834, + "learning_rate": 7.870294877021335e-05, + "loss": 4.1226, + "step": 25090 + }, + { + "epoch": 1.7050550346514473, + "grad_norm": 0.17684803903102875, + "learning_rate": 7.869870226933007e-05, + "loss": 4.1619, + "step": 25095 + }, + { + "epoch": 1.7053947547221089, + "grad_norm": 0.19603700935840607, + "learning_rate": 7.86944557684468e-05, + "loss": 3.9109, + "step": 25100 + }, + { + "epoch": 1.7057344747927707, + "grad_norm": 0.393204003572464, + "learning_rate": 7.869020926756353e-05, + "loss": 3.8971, + "step": 25105 + }, + { + "epoch": 1.7060741948634326, + "grad_norm": 0.16296792030334473, + "learning_rate": 7.868596276668026e-05, + "loss": 3.8626, + "step": 25110 + }, + { + "epoch": 1.7064139149340942, + "grad_norm": 0.19251331686973572, + "learning_rate": 7.868171626579699e-05, + "loss": 3.8534, + "step": 25115 + }, + { + "epoch": 1.706753635004756, + "grad_norm": 0.22725585103034973, + "learning_rate": 7.867746976491371e-05, + "loss": 3.8204, + "step": 25120 + }, + { + "epoch": 1.707093355075418, + "grad_norm": 0.4820252060890198, + "learning_rate": 7.867322326403044e-05, + "loss": 3.8851, + "step": 25125 + }, + { + "epoch": 1.7074330751460796, + "grad_norm": 0.13679246604442596, + "learning_rate": 7.866897676314717e-05, + "loss": 4.0925, + "step": 25130 + }, + { + "epoch": 1.7077727952167414, + "grad_norm": 0.15524838864803314, + "learning_rate": 7.86647302622639e-05, + "loss": 3.7139, + "step": 25135 + }, + { + "epoch": 1.7081125152874033, + "grad_norm": 0.15836882591247559, + "learning_rate": 7.866048376138063e-05, + "loss": 4.0474, + "step": 25140 + }, + { + "epoch": 1.708452235358065, + "grad_norm": 0.2603413164615631, + "learning_rate": 7.865623726049735e-05, + "loss": 3.8505, + "step": 25145 + }, + { + "epoch": 1.7087919554287267, + "grad_norm": 0.1815950721502304, + "learning_rate": 7.865199075961408e-05, + "loss": 4.2164, + "step": 25150 + }, + { + "epoch": 1.7091316754993886, + "grad_norm": 0.17414537072181702, + "learning_rate": 7.864774425873081e-05, + "loss": 3.9754, + "step": 25155 + }, + { + "epoch": 1.7094713955700502, + "grad_norm": 0.8545550107955933, + "learning_rate": 7.864349775784754e-05, + "loss": 4.062, + "step": 25160 + }, + { + "epoch": 1.709811115640712, + "grad_norm": 0.17726565897464752, + "learning_rate": 7.863925125696427e-05, + "loss": 3.8747, + "step": 25165 + }, + { + "epoch": 1.710150835711374, + "grad_norm": 0.2013719230890274, + "learning_rate": 7.863500475608098e-05, + "loss": 3.9993, + "step": 25170 + }, + { + "epoch": 1.7104905557820356, + "grad_norm": 0.18580962717533112, + "learning_rate": 7.863075825519772e-05, + "loss": 3.9086, + "step": 25175 + }, + { + "epoch": 1.7108302758526974, + "grad_norm": 0.20207525789737701, + "learning_rate": 7.862651175431445e-05, + "loss": 4.1409, + "step": 25180 + }, + { + "epoch": 1.7111699959233593, + "grad_norm": 1.037609338760376, + "learning_rate": 7.862226525343117e-05, + "loss": 4.0451, + "step": 25185 + }, + { + "epoch": 1.711509715994021, + "grad_norm": 0.2452026605606079, + "learning_rate": 7.861801875254791e-05, + "loss": 4.019, + "step": 25190 + }, + { + "epoch": 1.7118494360646825, + "grad_norm": 0.2016415297985077, + "learning_rate": 7.861377225166464e-05, + "loss": 4.0161, + "step": 25195 + }, + { + "epoch": 1.7121891561353446, + "grad_norm": 0.2686477303504944, + "learning_rate": 7.860952575078136e-05, + "loss": 3.83, + "step": 25200 + }, + { + "epoch": 1.7125288762060062, + "grad_norm": 2.34346079826355, + "learning_rate": 7.860527924989809e-05, + "loss": 3.7403, + "step": 25205 + }, + { + "epoch": 1.7128685962766679, + "grad_norm": 0.18789605796337128, + "learning_rate": 7.860103274901482e-05, + "loss": 3.8704, + "step": 25210 + }, + { + "epoch": 1.71320831634733, + "grad_norm": 0.1953544020652771, + "learning_rate": 7.859678624813155e-05, + "loss": 3.698, + "step": 25215 + }, + { + "epoch": 1.7135480364179916, + "grad_norm": 0.1879119724035263, + "learning_rate": 7.859253974724828e-05, + "loss": 3.9055, + "step": 25220 + }, + { + "epoch": 1.7138877564886532, + "grad_norm": 0.4923626482486725, + "learning_rate": 7.8588293246365e-05, + "loss": 4.0051, + "step": 25225 + }, + { + "epoch": 1.7142274765593153, + "grad_norm": 0.18192274868488312, + "learning_rate": 7.858404674548173e-05, + "loss": 3.7504, + "step": 25230 + }, + { + "epoch": 1.714567196629977, + "grad_norm": 0.20617520809173584, + "learning_rate": 7.857980024459846e-05, + "loss": 3.8507, + "step": 25235 + }, + { + "epoch": 1.7149069167006386, + "grad_norm": 0.24653705954551697, + "learning_rate": 7.857555374371517e-05, + "loss": 4.3187, + "step": 25240 + }, + { + "epoch": 1.7152466367713004, + "grad_norm": 0.7135516405105591, + "learning_rate": 7.857130724283192e-05, + "loss": 3.8118, + "step": 25245 + }, + { + "epoch": 1.7155863568419623, + "grad_norm": 0.1549421101808548, + "learning_rate": 7.856706074194864e-05, + "loss": 3.9249, + "step": 25250 + }, + { + "epoch": 1.7159260769126239, + "grad_norm": 0.16622740030288696, + "learning_rate": 7.856281424106536e-05, + "loss": 3.9156, + "step": 25255 + }, + { + "epoch": 1.7162657969832857, + "grad_norm": 0.29805418848991394, + "learning_rate": 7.85585677401821e-05, + "loss": 3.7001, + "step": 25260 + }, + { + "epoch": 1.7166055170539476, + "grad_norm": 0.17936047911643982, + "learning_rate": 7.855432123929883e-05, + "loss": 3.954, + "step": 25265 + }, + { + "epoch": 1.7169452371246092, + "grad_norm": 0.49838945269584656, + "learning_rate": 7.855007473841554e-05, + "loss": 3.9371, + "step": 25270 + }, + { + "epoch": 1.717284957195271, + "grad_norm": 0.1510656625032425, + "learning_rate": 7.854582823753228e-05, + "loss": 3.8714, + "step": 25275 + }, + { + "epoch": 1.717624677265933, + "grad_norm": 0.21676041185855865, + "learning_rate": 7.854158173664901e-05, + "loss": 3.9565, + "step": 25280 + }, + { + "epoch": 1.7179643973365946, + "grad_norm": 0.17177796363830566, + "learning_rate": 7.853733523576573e-05, + "loss": 3.9917, + "step": 25285 + }, + { + "epoch": 1.7183041174072564, + "grad_norm": 0.19897349178791046, + "learning_rate": 7.853308873488247e-05, + "loss": 4.0083, + "step": 25290 + }, + { + "epoch": 1.7186438374779183, + "grad_norm": 0.7110993266105652, + "learning_rate": 7.85288422339992e-05, + "loss": 3.7236, + "step": 25295 + }, + { + "epoch": 1.71898355754858, + "grad_norm": 0.14355027675628662, + "learning_rate": 7.852459573311591e-05, + "loss": 3.8533, + "step": 25300 + }, + { + "epoch": 1.7193232776192418, + "grad_norm": 0.1822226196527481, + "learning_rate": 7.852034923223265e-05, + "loss": 3.8112, + "step": 25305 + }, + { + "epoch": 1.7196629976899036, + "grad_norm": 0.18613746762275696, + "learning_rate": 7.851610273134938e-05, + "loss": 4.2187, + "step": 25310 + }, + { + "epoch": 1.7200027177605652, + "grad_norm": 0.39568841457366943, + "learning_rate": 7.85118562304661e-05, + "loss": 3.9892, + "step": 25315 + }, + { + "epoch": 1.720342437831227, + "grad_norm": 0.16394305229187012, + "learning_rate": 7.850760972958284e-05, + "loss": 3.9599, + "step": 25320 + }, + { + "epoch": 1.720682157901889, + "grad_norm": 0.24391448497772217, + "learning_rate": 7.850336322869955e-05, + "loss": 3.7646, + "step": 25325 + }, + { + "epoch": 1.7210218779725506, + "grad_norm": 1.4941072463989258, + "learning_rate": 7.849911672781628e-05, + "loss": 4.0834, + "step": 25330 + }, + { + "epoch": 1.7213615980432124, + "grad_norm": 0.2775137424468994, + "learning_rate": 7.849487022693302e-05, + "loss": 3.8833, + "step": 25335 + }, + { + "epoch": 1.7217013181138743, + "grad_norm": 0.19075067341327667, + "learning_rate": 7.849062372604973e-05, + "loss": 3.8172, + "step": 25340 + }, + { + "epoch": 1.722041038184536, + "grad_norm": 0.18662239611148834, + "learning_rate": 7.848637722516646e-05, + "loss": 3.7342, + "step": 25345 + }, + { + "epoch": 1.7223807582551978, + "grad_norm": 0.16784369945526123, + "learning_rate": 7.84821307242832e-05, + "loss": 3.6477, + "step": 25350 + }, + { + "epoch": 1.7227204783258596, + "grad_norm": 0.1852727234363556, + "learning_rate": 7.847788422339992e-05, + "loss": 3.8582, + "step": 25355 + }, + { + "epoch": 1.7230601983965212, + "grad_norm": 0.17256638407707214, + "learning_rate": 7.847363772251665e-05, + "loss": 3.7418, + "step": 25360 + }, + { + "epoch": 1.7233999184671829, + "grad_norm": 0.19924387335777283, + "learning_rate": 7.846939122163339e-05, + "loss": 3.9121, + "step": 25365 + }, + { + "epoch": 1.723739638537845, + "grad_norm": 0.18148118257522583, + "learning_rate": 7.84651447207501e-05, + "loss": 4.1183, + "step": 25370 + }, + { + "epoch": 1.7240793586085066, + "grad_norm": 0.19901499152183533, + "learning_rate": 7.846089821986683e-05, + "loss": 3.7818, + "step": 25375 + }, + { + "epoch": 1.7244190786791682, + "grad_norm": 0.19984771311283112, + "learning_rate": 7.845665171898357e-05, + "loss": 3.8452, + "step": 25380 + }, + { + "epoch": 1.7247587987498303, + "grad_norm": 0.1917155385017395, + "learning_rate": 7.845240521810029e-05, + "loss": 3.9645, + "step": 25385 + }, + { + "epoch": 1.725098518820492, + "grad_norm": 0.16998791694641113, + "learning_rate": 7.844815871721701e-05, + "loss": 3.8708, + "step": 25390 + }, + { + "epoch": 1.7254382388911536, + "grad_norm": 0.41859400272369385, + "learning_rate": 7.844391221633374e-05, + "loss": 3.9933, + "step": 25395 + }, + { + "epoch": 1.7257779589618156, + "grad_norm": 0.1642247885465622, + "learning_rate": 7.843966571545047e-05, + "loss": 3.8383, + "step": 25400 + }, + { + "epoch": 1.7261176790324773, + "grad_norm": 0.18220822513103485, + "learning_rate": 7.84354192145672e-05, + "loss": 3.9109, + "step": 25405 + }, + { + "epoch": 1.726457399103139, + "grad_norm": 0.16029395163059235, + "learning_rate": 7.843117271368393e-05, + "loss": 3.8931, + "step": 25410 + }, + { + "epoch": 1.7267971191738007, + "grad_norm": 0.19661064445972443, + "learning_rate": 7.842692621280065e-05, + "loss": 4.0602, + "step": 25415 + }, + { + "epoch": 1.7271368392444626, + "grad_norm": 0.18555328249931335, + "learning_rate": 7.842267971191738e-05, + "loss": 4.0796, + "step": 25420 + }, + { + "epoch": 1.7274765593151242, + "grad_norm": 0.1959945410490036, + "learning_rate": 7.841843321103411e-05, + "loss": 3.9548, + "step": 25425 + }, + { + "epoch": 1.727816279385786, + "grad_norm": 0.21380814909934998, + "learning_rate": 7.841418671015084e-05, + "loss": 3.7294, + "step": 25430 + }, + { + "epoch": 1.728155999456448, + "grad_norm": 0.21466611325740814, + "learning_rate": 7.840994020926757e-05, + "loss": 4.0025, + "step": 25435 + }, + { + "epoch": 1.7284957195271096, + "grad_norm": 2.9268760681152344, + "learning_rate": 7.84056937083843e-05, + "loss": 4.0204, + "step": 25440 + }, + { + "epoch": 1.7288354395977714, + "grad_norm": 0.18211546540260315, + "learning_rate": 7.840144720750102e-05, + "loss": 3.7564, + "step": 25445 + }, + { + "epoch": 1.7291751596684333, + "grad_norm": 0.1869932860136032, + "learning_rate": 7.839720070661775e-05, + "loss": 3.8649, + "step": 25450 + }, + { + "epoch": 1.729514879739095, + "grad_norm": 0.19354258477687836, + "learning_rate": 7.839295420573448e-05, + "loss": 3.9566, + "step": 25455 + }, + { + "epoch": 1.7298545998097568, + "grad_norm": 0.16194108128547668, + "learning_rate": 7.83887077048512e-05, + "loss": 4.2018, + "step": 25460 + }, + { + "epoch": 1.7301943198804186, + "grad_norm": 0.37122881412506104, + "learning_rate": 7.838446120396793e-05, + "loss": 3.9942, + "step": 25465 + }, + { + "epoch": 1.7305340399510802, + "grad_norm": 0.4129442572593689, + "learning_rate": 7.838021470308466e-05, + "loss": 3.7841, + "step": 25470 + }, + { + "epoch": 1.730873760021742, + "grad_norm": 0.15450887382030487, + "learning_rate": 7.837596820220139e-05, + "loss": 4.003, + "step": 25475 + }, + { + "epoch": 1.731213480092404, + "grad_norm": 0.19513477385044098, + "learning_rate": 7.837172170131812e-05, + "loss": 3.6385, + "step": 25480 + }, + { + "epoch": 1.7315532001630656, + "grad_norm": 0.25509554147720337, + "learning_rate": 7.836747520043485e-05, + "loss": 3.9306, + "step": 25485 + }, + { + "epoch": 1.7318929202337274, + "grad_norm": 0.23531799018383026, + "learning_rate": 7.836322869955157e-05, + "loss": 4.0504, + "step": 25490 + }, + { + "epoch": 1.7322326403043893, + "grad_norm": 0.2702413499355316, + "learning_rate": 7.83589821986683e-05, + "loss": 3.7088, + "step": 25495 + }, + { + "epoch": 1.732572360375051, + "grad_norm": 0.22877845168113708, + "learning_rate": 7.835473569778503e-05, + "loss": 3.9612, + "step": 25500 + }, + { + "epoch": 1.7329120804457128, + "grad_norm": 0.16843707859516144, + "learning_rate": 7.835048919690176e-05, + "loss": 3.9468, + "step": 25505 + }, + { + "epoch": 1.7332518005163746, + "grad_norm": 0.18650801479816437, + "learning_rate": 7.834624269601849e-05, + "loss": 4.1764, + "step": 25510 + }, + { + "epoch": 1.7335915205870362, + "grad_norm": 0.2074943482875824, + "learning_rate": 7.834199619513521e-05, + "loss": 3.8008, + "step": 25515 + }, + { + "epoch": 1.733931240657698, + "grad_norm": 0.20668894052505493, + "learning_rate": 7.833774969425194e-05, + "loss": 3.8203, + "step": 25520 + }, + { + "epoch": 1.73427096072836, + "grad_norm": 0.21346059441566467, + "learning_rate": 7.833350319336866e-05, + "loss": 3.5714, + "step": 25525 + }, + { + "epoch": 1.7346106807990216, + "grad_norm": 0.21038301289081573, + "learning_rate": 7.83292566924854e-05, + "loss": 3.6954, + "step": 25530 + }, + { + "epoch": 1.7349504008696832, + "grad_norm": 0.1618669480085373, + "learning_rate": 7.832501019160213e-05, + "loss": 3.7115, + "step": 25535 + }, + { + "epoch": 1.7352901209403453, + "grad_norm": 0.169974222779274, + "learning_rate": 7.832076369071885e-05, + "loss": 3.7971, + "step": 25540 + }, + { + "epoch": 1.735629841011007, + "grad_norm": 0.20559339225292206, + "learning_rate": 7.831651718983558e-05, + "loss": 3.8862, + "step": 25545 + }, + { + "epoch": 1.7359695610816686, + "grad_norm": 0.255675733089447, + "learning_rate": 7.831227068895231e-05, + "loss": 3.9162, + "step": 25550 + }, + { + "epoch": 1.7363092811523306, + "grad_norm": 0.17540304362773895, + "learning_rate": 7.830802418806904e-05, + "loss": 4.087, + "step": 25555 + }, + { + "epoch": 1.7366490012229923, + "grad_norm": 0.14303384721279144, + "learning_rate": 7.830377768718577e-05, + "loss": 3.8401, + "step": 25560 + }, + { + "epoch": 1.736988721293654, + "grad_norm": 0.21565908193588257, + "learning_rate": 7.82995311863025e-05, + "loss": 3.833, + "step": 25565 + }, + { + "epoch": 1.737328441364316, + "grad_norm": 0.2927127182483673, + "learning_rate": 7.829528468541922e-05, + "loss": 3.6521, + "step": 25570 + }, + { + "epoch": 1.7376681614349776, + "grad_norm": 0.1676911562681198, + "learning_rate": 7.829103818453595e-05, + "loss": 3.7477, + "step": 25575 + }, + { + "epoch": 1.7380078815056392, + "grad_norm": 0.1994667947292328, + "learning_rate": 7.828679168365268e-05, + "loss": 3.9503, + "step": 25580 + }, + { + "epoch": 1.738347601576301, + "grad_norm": 0.41535788774490356, + "learning_rate": 7.82825451827694e-05, + "loss": 3.8049, + "step": 25585 + }, + { + "epoch": 1.738687321646963, + "grad_norm": 0.24231450259685516, + "learning_rate": 7.827829868188613e-05, + "loss": 4.0715, + "step": 25590 + }, + { + "epoch": 1.7390270417176246, + "grad_norm": 0.19360168278217316, + "learning_rate": 7.827405218100285e-05, + "loss": 3.9849, + "step": 25595 + }, + { + "epoch": 1.7393667617882864, + "grad_norm": 0.24657608568668365, + "learning_rate": 7.826980568011959e-05, + "loss": 4.092, + "step": 25600 + }, + { + "epoch": 1.7397064818589483, + "grad_norm": 0.18783694505691528, + "learning_rate": 7.826555917923632e-05, + "loss": 3.9738, + "step": 25605 + }, + { + "epoch": 1.74004620192961, + "grad_norm": 0.9105113744735718, + "learning_rate": 7.826131267835303e-05, + "loss": 3.8916, + "step": 25610 + }, + { + "epoch": 1.7403859220002718, + "grad_norm": 0.16742384433746338, + "learning_rate": 7.825706617746977e-05, + "loss": 4.1018, + "step": 25615 + }, + { + "epoch": 1.7407256420709336, + "grad_norm": 0.4461096525192261, + "learning_rate": 7.82528196765865e-05, + "loss": 3.6712, + "step": 25620 + }, + { + "epoch": 1.7410653621415952, + "grad_norm": 0.18471583724021912, + "learning_rate": 7.824857317570322e-05, + "loss": 3.9215, + "step": 25625 + }, + { + "epoch": 1.741405082212257, + "grad_norm": 0.21674783527851105, + "learning_rate": 7.824432667481996e-05, + "loss": 3.9338, + "step": 25630 + }, + { + "epoch": 1.741744802282919, + "grad_norm": 0.21410222351551056, + "learning_rate": 7.824008017393669e-05, + "loss": 4.0055, + "step": 25635 + }, + { + "epoch": 1.7420845223535806, + "grad_norm": 0.17411504685878754, + "learning_rate": 7.82358336730534e-05, + "loss": 3.7398, + "step": 25640 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.21899643540382385, + "learning_rate": 7.823158717217014e-05, + "loss": 3.7917, + "step": 25645 + }, + { + "epoch": 1.7427639624949043, + "grad_norm": 0.2132015973329544, + "learning_rate": 7.822734067128687e-05, + "loss": 3.7849, + "step": 25650 + }, + { + "epoch": 1.743103682565566, + "grad_norm": 0.18211622536182404, + "learning_rate": 7.822309417040358e-05, + "loss": 3.9235, + "step": 25655 + }, + { + "epoch": 1.7434434026362278, + "grad_norm": 0.22423851490020752, + "learning_rate": 7.821884766952033e-05, + "loss": 3.7939, + "step": 25660 + }, + { + "epoch": 1.7437831227068896, + "grad_norm": 0.2434370219707489, + "learning_rate": 7.821460116863704e-05, + "loss": 3.8666, + "step": 25665 + }, + { + "epoch": 1.7441228427775513, + "grad_norm": 0.15904605388641357, + "learning_rate": 7.821035466775377e-05, + "loss": 4.1991, + "step": 25670 + }, + { + "epoch": 1.744462562848213, + "grad_norm": 0.18933552503585815, + "learning_rate": 7.820610816687051e-05, + "loss": 3.7115, + "step": 25675 + }, + { + "epoch": 1.744802282918875, + "grad_norm": 0.1613510102033615, + "learning_rate": 7.820186166598722e-05, + "loss": 3.9445, + "step": 25680 + }, + { + "epoch": 1.7451420029895366, + "grad_norm": 0.17829404771327972, + "learning_rate": 7.819761516510395e-05, + "loss": 3.7043, + "step": 25685 + }, + { + "epoch": 1.7454817230601984, + "grad_norm": 0.1927444487810135, + "learning_rate": 7.81933686642207e-05, + "loss": 4.0279, + "step": 25690 + }, + { + "epoch": 1.7458214431308603, + "grad_norm": 0.16500389575958252, + "learning_rate": 7.818912216333741e-05, + "loss": 3.9423, + "step": 25695 + }, + { + "epoch": 1.746161163201522, + "grad_norm": 0.19629210233688354, + "learning_rate": 7.818487566245414e-05, + "loss": 3.8024, + "step": 25700 + }, + { + "epoch": 1.7465008832721836, + "grad_norm": 0.256438285112381, + "learning_rate": 7.818062916157088e-05, + "loss": 4.0245, + "step": 25705 + }, + { + "epoch": 1.7468406033428456, + "grad_norm": 0.23489029705524445, + "learning_rate": 7.817638266068759e-05, + "loss": 3.7544, + "step": 25710 + }, + { + "epoch": 1.7471803234135073, + "grad_norm": 0.27524852752685547, + "learning_rate": 7.817213615980432e-05, + "loss": 3.8794, + "step": 25715 + }, + { + "epoch": 1.747520043484169, + "grad_norm": 0.1856842339038849, + "learning_rate": 7.816788965892106e-05, + "loss": 3.8133, + "step": 25720 + }, + { + "epoch": 1.747859763554831, + "grad_norm": 0.16379818320274353, + "learning_rate": 7.816364315803778e-05, + "loss": 3.8696, + "step": 25725 + }, + { + "epoch": 1.7481994836254926, + "grad_norm": 0.16315363347530365, + "learning_rate": 7.81593966571545e-05, + "loss": 3.8633, + "step": 25730 + }, + { + "epoch": 1.7485392036961542, + "grad_norm": 0.14635157585144043, + "learning_rate": 7.815515015627125e-05, + "loss": 3.618, + "step": 25735 + }, + { + "epoch": 1.7488789237668163, + "grad_norm": 0.15115930140018463, + "learning_rate": 7.815090365538796e-05, + "loss": 3.9356, + "step": 25740 + }, + { + "epoch": 1.749218643837478, + "grad_norm": 0.1424102932214737, + "learning_rate": 7.814665715450469e-05, + "loss": 3.5629, + "step": 25745 + }, + { + "epoch": 1.7495583639081396, + "grad_norm": 0.16285699605941772, + "learning_rate": 7.814241065362142e-05, + "loss": 4.0615, + "step": 25750 + }, + { + "epoch": 1.7498980839788014, + "grad_norm": 0.23774859309196472, + "learning_rate": 7.813816415273814e-05, + "loss": 3.8449, + "step": 25755 + }, + { + "epoch": 1.7502378040494633, + "grad_norm": 0.1964176744222641, + "learning_rate": 7.813391765185487e-05, + "loss": 3.9878, + "step": 25760 + }, + { + "epoch": 1.750577524120125, + "grad_norm": 0.1776086837053299, + "learning_rate": 7.81296711509716e-05, + "loss": 3.9883, + "step": 25765 + }, + { + "epoch": 1.7509172441907868, + "grad_norm": 0.17602093517780304, + "learning_rate": 7.812542465008833e-05, + "loss": 3.6705, + "step": 25770 + }, + { + "epoch": 1.7512569642614486, + "grad_norm": 0.18847081065177917, + "learning_rate": 7.812117814920506e-05, + "loss": 3.9983, + "step": 25775 + }, + { + "epoch": 1.7515966843321102, + "grad_norm": 0.1770462840795517, + "learning_rate": 7.811693164832178e-05, + "loss": 3.9272, + "step": 25780 + }, + { + "epoch": 1.751936404402772, + "grad_norm": 0.17940519750118256, + "learning_rate": 7.811268514743851e-05, + "loss": 3.7528, + "step": 25785 + }, + { + "epoch": 1.752276124473434, + "grad_norm": 0.15378354489803314, + "learning_rate": 7.810843864655524e-05, + "loss": 4.1171, + "step": 25790 + }, + { + "epoch": 1.7526158445440956, + "grad_norm": 0.15963229537010193, + "learning_rate": 7.810419214567197e-05, + "loss": 3.8995, + "step": 25795 + }, + { + "epoch": 1.7529555646147574, + "grad_norm": 0.1572151631116867, + "learning_rate": 7.80999456447887e-05, + "loss": 3.9166, + "step": 25800 + }, + { + "epoch": 1.7532952846854193, + "grad_norm": 0.21655048429965973, + "learning_rate": 7.809569914390542e-05, + "loss": 4.092, + "step": 25805 + }, + { + "epoch": 1.753635004756081, + "grad_norm": 0.15344221889972687, + "learning_rate": 7.809145264302215e-05, + "loss": 4.0045, + "step": 25810 + }, + { + "epoch": 1.7539747248267428, + "grad_norm": 0.23213127255439758, + "learning_rate": 7.808720614213888e-05, + "loss": 4.0688, + "step": 25815 + }, + { + "epoch": 1.7543144448974046, + "grad_norm": 0.20509281754493713, + "learning_rate": 7.808295964125561e-05, + "loss": 4.0596, + "step": 25820 + }, + { + "epoch": 1.7546541649680663, + "grad_norm": 0.28146520256996155, + "learning_rate": 7.807871314037234e-05, + "loss": 3.785, + "step": 25825 + }, + { + "epoch": 1.754993885038728, + "grad_norm": 0.22088941931724548, + "learning_rate": 7.807446663948906e-05, + "loss": 4.0262, + "step": 25830 + }, + { + "epoch": 1.75533360510939, + "grad_norm": 0.1806650161743164, + "learning_rate": 7.807022013860579e-05, + "loss": 3.9567, + "step": 25835 + }, + { + "epoch": 1.7556733251800516, + "grad_norm": 2.506441593170166, + "learning_rate": 7.806597363772252e-05, + "loss": 3.8689, + "step": 25840 + }, + { + "epoch": 1.7560130452507134, + "grad_norm": 0.21518169343471527, + "learning_rate": 7.806172713683925e-05, + "loss": 4.035, + "step": 25845 + }, + { + "epoch": 1.7563527653213753, + "grad_norm": 0.17455333471298218, + "learning_rate": 7.805748063595598e-05, + "loss": 3.7574, + "step": 25850 + }, + { + "epoch": 1.756692485392037, + "grad_norm": 0.19762565195560455, + "learning_rate": 7.80532341350727e-05, + "loss": 3.913, + "step": 25855 + }, + { + "epoch": 1.7570322054626988, + "grad_norm": 0.23111023008823395, + "learning_rate": 7.804898763418943e-05, + "loss": 3.9347, + "step": 25860 + }, + { + "epoch": 1.7573719255333606, + "grad_norm": 0.17037637531757355, + "learning_rate": 7.804474113330615e-05, + "loss": 3.9856, + "step": 25865 + }, + { + "epoch": 1.7577116456040223, + "grad_norm": 0.16872669756412506, + "learning_rate": 7.804049463242289e-05, + "loss": 3.7488, + "step": 25870 + }, + { + "epoch": 1.758051365674684, + "grad_norm": 0.19018787145614624, + "learning_rate": 7.803624813153962e-05, + "loss": 3.7528, + "step": 25875 + }, + { + "epoch": 1.758391085745346, + "grad_norm": 0.17614497244358063, + "learning_rate": 7.803200163065634e-05, + "loss": 3.7168, + "step": 25880 + }, + { + "epoch": 1.7587308058160076, + "grad_norm": 0.14886288344860077, + "learning_rate": 7.802775512977307e-05, + "loss": 3.9095, + "step": 25885 + }, + { + "epoch": 1.7590705258866692, + "grad_norm": 0.16838903725147247, + "learning_rate": 7.80235086288898e-05, + "loss": 3.9062, + "step": 25890 + }, + { + "epoch": 1.7594102459573313, + "grad_norm": 0.6134260296821594, + "learning_rate": 7.801926212800653e-05, + "loss": 3.9085, + "step": 25895 + }, + { + "epoch": 1.759749966027993, + "grad_norm": 0.17483386397361755, + "learning_rate": 7.801501562712326e-05, + "loss": 3.5791, + "step": 25900 + }, + { + "epoch": 1.7600896860986546, + "grad_norm": 0.22112756967544556, + "learning_rate": 7.801076912623999e-05, + "loss": 4.0267, + "step": 25905 + }, + { + "epoch": 1.7604294061693166, + "grad_norm": 0.237503319978714, + "learning_rate": 7.800652262535671e-05, + "loss": 3.9415, + "step": 25910 + }, + { + "epoch": 1.7607691262399783, + "grad_norm": 0.1721808910369873, + "learning_rate": 7.800227612447344e-05, + "loss": 3.9764, + "step": 25915 + }, + { + "epoch": 1.76110884631064, + "grad_norm": 0.6190228462219238, + "learning_rate": 7.799802962359017e-05, + "loss": 3.7267, + "step": 25920 + }, + { + "epoch": 1.7614485663813018, + "grad_norm": 0.8414190411567688, + "learning_rate": 7.79937831227069e-05, + "loss": 3.9224, + "step": 25925 + }, + { + "epoch": 1.7617882864519636, + "grad_norm": 0.24957755208015442, + "learning_rate": 7.798953662182363e-05, + "loss": 4.1043, + "step": 25930 + }, + { + "epoch": 1.7621280065226252, + "grad_norm": 0.23723457753658295, + "learning_rate": 7.798529012094035e-05, + "loss": 3.967, + "step": 25935 + }, + { + "epoch": 1.762467726593287, + "grad_norm": 0.17850160598754883, + "learning_rate": 7.798104362005708e-05, + "loss": 3.8183, + "step": 25940 + }, + { + "epoch": 1.762807446663949, + "grad_norm": 0.5890353322029114, + "learning_rate": 7.797679711917381e-05, + "loss": 4.1371, + "step": 25945 + }, + { + "epoch": 1.7631471667346106, + "grad_norm": 0.2072901874780655, + "learning_rate": 7.797255061829052e-05, + "loss": 3.8073, + "step": 25950 + }, + { + "epoch": 1.7634868868052724, + "grad_norm": 0.17181119322776794, + "learning_rate": 7.796830411740727e-05, + "loss": 4.0762, + "step": 25955 + }, + { + "epoch": 1.7638266068759343, + "grad_norm": 0.309935599565506, + "learning_rate": 7.796405761652399e-05, + "loss": 4.0514, + "step": 25960 + }, + { + "epoch": 1.764166326946596, + "grad_norm": 0.1484830230474472, + "learning_rate": 7.795981111564071e-05, + "loss": 3.8094, + "step": 25965 + }, + { + "epoch": 1.7645060470172578, + "grad_norm": 0.18948422372341156, + "learning_rate": 7.795556461475745e-05, + "loss": 4.2525, + "step": 25970 + }, + { + "epoch": 1.7648457670879196, + "grad_norm": 0.36065876483917236, + "learning_rate": 7.795131811387418e-05, + "loss": 3.9623, + "step": 25975 + }, + { + "epoch": 1.7651854871585813, + "grad_norm": 0.15411554276943207, + "learning_rate": 7.794707161299089e-05, + "loss": 3.7769, + "step": 25980 + }, + { + "epoch": 1.765525207229243, + "grad_norm": 0.2350635677576065, + "learning_rate": 7.794282511210763e-05, + "loss": 3.797, + "step": 25985 + }, + { + "epoch": 1.765864927299905, + "grad_norm": 0.16473321616649628, + "learning_rate": 7.793857861122436e-05, + "loss": 3.8117, + "step": 25990 + }, + { + "epoch": 1.7662046473705666, + "grad_norm": 0.19410017132759094, + "learning_rate": 7.793433211034108e-05, + "loss": 4.2416, + "step": 25995 + }, + { + "epoch": 1.7665443674412284, + "grad_norm": 0.17998336255550385, + "learning_rate": 7.793008560945782e-05, + "loss": 3.9656, + "step": 26000 + }, + { + "epoch": 1.7668840875118903, + "grad_norm": 0.16076351702213287, + "learning_rate": 7.792583910857455e-05, + "loss": 3.9613, + "step": 26005 + }, + { + "epoch": 1.767223807582552, + "grad_norm": 0.17469412088394165, + "learning_rate": 7.792159260769126e-05, + "loss": 4.205, + "step": 26010 + }, + { + "epoch": 1.7675635276532138, + "grad_norm": 0.5149888396263123, + "learning_rate": 7.7917346106808e-05, + "loss": 3.8798, + "step": 26015 + }, + { + "epoch": 1.7679032477238756, + "grad_norm": 0.19410598278045654, + "learning_rate": 7.791309960592472e-05, + "loss": 3.7613, + "step": 26020 + }, + { + "epoch": 1.7682429677945373, + "grad_norm": 0.17819464206695557, + "learning_rate": 7.790885310504144e-05, + "loss": 3.7189, + "step": 26025 + }, + { + "epoch": 1.7685826878651991, + "grad_norm": 0.1662750095129013, + "learning_rate": 7.790460660415819e-05, + "loss": 3.7944, + "step": 26030 + }, + { + "epoch": 1.768922407935861, + "grad_norm": 0.16187137365341187, + "learning_rate": 7.79003601032749e-05, + "loss": 3.7017, + "step": 26035 + }, + { + "epoch": 1.7692621280065226, + "grad_norm": 0.160844624042511, + "learning_rate": 7.789611360239163e-05, + "loss": 3.9634, + "step": 26040 + }, + { + "epoch": 1.7696018480771842, + "grad_norm": 0.1839676797389984, + "learning_rate": 7.789186710150837e-05, + "loss": 3.8044, + "step": 26045 + }, + { + "epoch": 1.7699415681478463, + "grad_norm": 0.22456395626068115, + "learning_rate": 7.788762060062508e-05, + "loss": 3.9537, + "step": 26050 + }, + { + "epoch": 1.770281288218508, + "grad_norm": 0.20848336815834045, + "learning_rate": 7.788337409974181e-05, + "loss": 3.6595, + "step": 26055 + }, + { + "epoch": 1.7706210082891696, + "grad_norm": 0.5317763686180115, + "learning_rate": 7.787912759885855e-05, + "loss": 3.8877, + "step": 26060 + }, + { + "epoch": 1.7709607283598316, + "grad_norm": 0.1977878361940384, + "learning_rate": 7.787488109797527e-05, + "loss": 3.8282, + "step": 26065 + }, + { + "epoch": 1.7713004484304933, + "grad_norm": 0.1787205934524536, + "learning_rate": 7.7870634597092e-05, + "loss": 3.6516, + "step": 26070 + }, + { + "epoch": 1.771640168501155, + "grad_norm": 0.20221813023090363, + "learning_rate": 7.786638809620874e-05, + "loss": 3.7358, + "step": 26075 + }, + { + "epoch": 1.771979888571817, + "grad_norm": 0.19951555132865906, + "learning_rate": 7.786214159532545e-05, + "loss": 4.0884, + "step": 26080 + }, + { + "epoch": 1.7723196086424786, + "grad_norm": 0.18455719947814941, + "learning_rate": 7.785789509444218e-05, + "loss": 3.6975, + "step": 26085 + }, + { + "epoch": 1.7726593287131402, + "grad_norm": 0.16093185544013977, + "learning_rate": 7.785364859355891e-05, + "loss": 3.7811, + "step": 26090 + }, + { + "epoch": 1.772999048783802, + "grad_norm": 0.22813080251216888, + "learning_rate": 7.784940209267564e-05, + "loss": 3.7521, + "step": 26095 + }, + { + "epoch": 1.773338768854464, + "grad_norm": 0.16026708483695984, + "learning_rate": 7.784515559179236e-05, + "loss": 3.8626, + "step": 26100 + }, + { + "epoch": 1.7736784889251256, + "grad_norm": 0.13210180401802063, + "learning_rate": 7.784090909090909e-05, + "loss": 3.9591, + "step": 26105 + }, + { + "epoch": 1.7740182089957874, + "grad_norm": 0.2761272192001343, + "learning_rate": 7.783666259002582e-05, + "loss": 3.6413, + "step": 26110 + }, + { + "epoch": 1.7743579290664493, + "grad_norm": 0.17390525341033936, + "learning_rate": 7.783241608914255e-05, + "loss": 3.8723, + "step": 26115 + }, + { + "epoch": 1.774697649137111, + "grad_norm": 0.19960345327854156, + "learning_rate": 7.782816958825928e-05, + "loss": 3.9727, + "step": 26120 + }, + { + "epoch": 1.7750373692077728, + "grad_norm": 0.1846652776002884, + "learning_rate": 7.7823923087376e-05, + "loss": 3.8858, + "step": 26125 + }, + { + "epoch": 1.7753770892784346, + "grad_norm": 0.21528063714504242, + "learning_rate": 7.781967658649273e-05, + "loss": 3.8378, + "step": 26130 + }, + { + "epoch": 1.7757168093490963, + "grad_norm": 0.20016834139823914, + "learning_rate": 7.781543008560946e-05, + "loss": 3.8415, + "step": 26135 + }, + { + "epoch": 1.7760565294197581, + "grad_norm": 0.6123644709587097, + "learning_rate": 7.781118358472619e-05, + "loss": 3.8653, + "step": 26140 + }, + { + "epoch": 1.77639624949042, + "grad_norm": 0.20084905624389648, + "learning_rate": 7.780693708384292e-05, + "loss": 3.9054, + "step": 26145 + }, + { + "epoch": 1.7767359695610816, + "grad_norm": 0.2044343799352646, + "learning_rate": 7.780269058295964e-05, + "loss": 3.727, + "step": 26150 + }, + { + "epoch": 1.7770756896317434, + "grad_norm": 0.22257715463638306, + "learning_rate": 7.779844408207637e-05, + "loss": 3.6154, + "step": 26155 + }, + { + "epoch": 1.7774154097024053, + "grad_norm": 0.1693069487810135, + "learning_rate": 7.77941975811931e-05, + "loss": 3.912, + "step": 26160 + }, + { + "epoch": 1.777755129773067, + "grad_norm": 0.176688551902771, + "learning_rate": 7.778995108030983e-05, + "loss": 4.0402, + "step": 26165 + }, + { + "epoch": 1.7780948498437288, + "grad_norm": 0.2110009640455246, + "learning_rate": 7.778570457942656e-05, + "loss": 4.0025, + "step": 26170 + }, + { + "epoch": 1.7784345699143906, + "grad_norm": 0.1862836629152298, + "learning_rate": 7.778145807854328e-05, + "loss": 3.6532, + "step": 26175 + }, + { + "epoch": 1.7787742899850523, + "grad_norm": 0.15873485803604126, + "learning_rate": 7.777721157766001e-05, + "loss": 4.1377, + "step": 26180 + }, + { + "epoch": 1.7791140100557141, + "grad_norm": 0.21204356849193573, + "learning_rate": 7.777296507677674e-05, + "loss": 4.0433, + "step": 26185 + }, + { + "epoch": 1.779453730126376, + "grad_norm": 0.19908350706100464, + "learning_rate": 7.776871857589347e-05, + "loss": 3.6427, + "step": 26190 + }, + { + "epoch": 1.7797934501970376, + "grad_norm": 0.17096982896327972, + "learning_rate": 7.77644720750102e-05, + "loss": 3.6355, + "step": 26195 + }, + { + "epoch": 1.7801331702676995, + "grad_norm": 0.24354848265647888, + "learning_rate": 7.776022557412692e-05, + "loss": 3.7925, + "step": 26200 + }, + { + "epoch": 1.7804728903383613, + "grad_norm": 0.2315327376127243, + "learning_rate": 7.775597907324365e-05, + "loss": 3.8772, + "step": 26205 + }, + { + "epoch": 1.780812610409023, + "grad_norm": 0.299398273229599, + "learning_rate": 7.775173257236038e-05, + "loss": 3.9325, + "step": 26210 + }, + { + "epoch": 1.7811523304796846, + "grad_norm": 0.20360209047794342, + "learning_rate": 7.774748607147711e-05, + "loss": 3.8099, + "step": 26215 + }, + { + "epoch": 1.7814920505503467, + "grad_norm": 0.19338242709636688, + "learning_rate": 7.774323957059384e-05, + "loss": 3.8968, + "step": 26220 + }, + { + "epoch": 1.7818317706210083, + "grad_norm": 0.14649468660354614, + "learning_rate": 7.773899306971056e-05, + "loss": 3.9739, + "step": 26225 + }, + { + "epoch": 1.78217149069167, + "grad_norm": 0.17696213722229004, + "learning_rate": 7.773474656882729e-05, + "loss": 4.0409, + "step": 26230 + }, + { + "epoch": 1.782511210762332, + "grad_norm": 0.1328064650297165, + "learning_rate": 7.773050006794402e-05, + "loss": 3.8708, + "step": 26235 + }, + { + "epoch": 1.7828509308329936, + "grad_norm": 0.1680843085050583, + "learning_rate": 7.772625356706075e-05, + "loss": 4.0825, + "step": 26240 + }, + { + "epoch": 1.7831906509036552, + "grad_norm": 0.1538754254579544, + "learning_rate": 7.772200706617748e-05, + "loss": 3.9587, + "step": 26245 + }, + { + "epoch": 1.7835303709743173, + "grad_norm": 0.16726773977279663, + "learning_rate": 7.77177605652942e-05, + "loss": 4.0076, + "step": 26250 + }, + { + "epoch": 1.783870091044979, + "grad_norm": 0.30208492279052734, + "learning_rate": 7.771351406441093e-05, + "loss": 3.6829, + "step": 26255 + }, + { + "epoch": 1.7842098111156406, + "grad_norm": 0.234908789396286, + "learning_rate": 7.770926756352766e-05, + "loss": 3.799, + "step": 26260 + }, + { + "epoch": 1.7845495311863024, + "grad_norm": 0.1720590442419052, + "learning_rate": 7.770502106264439e-05, + "loss": 3.7713, + "step": 26265 + }, + { + "epoch": 1.7848892512569643, + "grad_norm": 0.15488260984420776, + "learning_rate": 7.770077456176112e-05, + "loss": 3.9516, + "step": 26270 + }, + { + "epoch": 1.785228971327626, + "grad_norm": 0.17541341483592987, + "learning_rate": 7.769652806087784e-05, + "loss": 3.8889, + "step": 26275 + }, + { + "epoch": 1.7855686913982878, + "grad_norm": 0.13884133100509644, + "learning_rate": 7.769228155999457e-05, + "loss": 4.0354, + "step": 26280 + }, + { + "epoch": 1.7859084114689496, + "grad_norm": 0.20109477639198303, + "learning_rate": 7.76880350591113e-05, + "loss": 3.8592, + "step": 26285 + }, + { + "epoch": 1.7862481315396113, + "grad_norm": 0.15505924820899963, + "learning_rate": 7.768378855822801e-05, + "loss": 3.9259, + "step": 26290 + }, + { + "epoch": 1.7865878516102731, + "grad_norm": 0.2134423851966858, + "learning_rate": 7.767954205734476e-05, + "loss": 3.9373, + "step": 26295 + }, + { + "epoch": 1.786927571680935, + "grad_norm": 0.15244637429714203, + "learning_rate": 7.767529555646148e-05, + "loss": 3.7413, + "step": 26300 + }, + { + "epoch": 1.7872672917515966, + "grad_norm": 0.1615348905324936, + "learning_rate": 7.76710490555782e-05, + "loss": 3.8393, + "step": 26305 + }, + { + "epoch": 1.7876070118222585, + "grad_norm": 0.20147714018821716, + "learning_rate": 7.766680255469494e-05, + "loss": 3.9051, + "step": 26310 + }, + { + "epoch": 1.7879467318929203, + "grad_norm": 0.2078302502632141, + "learning_rate": 7.766255605381167e-05, + "loss": 3.9881, + "step": 26315 + }, + { + "epoch": 1.788286451963582, + "grad_norm": 0.2673831284046173, + "learning_rate": 7.765830955292838e-05, + "loss": 3.8146, + "step": 26320 + }, + { + "epoch": 1.7886261720342438, + "grad_norm": 0.2913552522659302, + "learning_rate": 7.765406305204512e-05, + "loss": 3.715, + "step": 26325 + }, + { + "epoch": 1.7889658921049056, + "grad_norm": 0.20574414730072021, + "learning_rate": 7.76506658513385e-05, + "loss": 3.8187, + "step": 26330 + }, + { + "epoch": 1.7893056121755673, + "grad_norm": 0.16493472456932068, + "learning_rate": 7.764641935045522e-05, + "loss": 3.8184, + "step": 26335 + }, + { + "epoch": 1.7896453322462291, + "grad_norm": 0.22878111898899078, + "learning_rate": 7.764217284957196e-05, + "loss": 4.0797, + "step": 26340 + }, + { + "epoch": 1.789985052316891, + "grad_norm": 0.1575300693511963, + "learning_rate": 7.763792634868868e-05, + "loss": 3.9282, + "step": 26345 + }, + { + "epoch": 1.7903247723875526, + "grad_norm": 0.18347293138504028, + "learning_rate": 7.76336798478054e-05, + "loss": 4.2289, + "step": 26350 + }, + { + "epoch": 1.7906644924582145, + "grad_norm": 0.21829676628112793, + "learning_rate": 7.762943334692215e-05, + "loss": 4.0262, + "step": 26355 + }, + { + "epoch": 1.7910042125288763, + "grad_norm": 0.16191798448562622, + "learning_rate": 7.762518684603886e-05, + "loss": 3.911, + "step": 26360 + }, + { + "epoch": 1.791343932599538, + "grad_norm": 0.19974899291992188, + "learning_rate": 7.762094034515559e-05, + "loss": 3.9179, + "step": 26365 + }, + { + "epoch": 1.7916836526701998, + "grad_norm": 0.18920102715492249, + "learning_rate": 7.761669384427233e-05, + "loss": 3.9492, + "step": 26370 + }, + { + "epoch": 1.7920233727408617, + "grad_norm": 0.1907794326543808, + "learning_rate": 7.761244734338905e-05, + "loss": 3.9208, + "step": 26375 + }, + { + "epoch": 1.7923630928115233, + "grad_norm": 0.2118396759033203, + "learning_rate": 7.760820084250577e-05, + "loss": 4.0686, + "step": 26380 + }, + { + "epoch": 1.792702812882185, + "grad_norm": 0.16273494064807892, + "learning_rate": 7.760395434162251e-05, + "loss": 4.0353, + "step": 26385 + }, + { + "epoch": 1.793042532952847, + "grad_norm": 0.22527606785297394, + "learning_rate": 7.759970784073923e-05, + "loss": 4.0473, + "step": 26390 + }, + { + "epoch": 1.7933822530235086, + "grad_norm": 0.12903249263763428, + "learning_rate": 7.759546133985596e-05, + "loss": 4.0943, + "step": 26395 + }, + { + "epoch": 1.7937219730941703, + "grad_norm": 0.1792658567428589, + "learning_rate": 7.75912148389727e-05, + "loss": 3.8941, + "step": 26400 + }, + { + "epoch": 1.7940616931648323, + "grad_norm": 0.16873827576637268, + "learning_rate": 7.758696833808941e-05, + "loss": 3.8967, + "step": 26405 + }, + { + "epoch": 1.794401413235494, + "grad_norm": 0.1558201014995575, + "learning_rate": 7.758272183720614e-05, + "loss": 3.8145, + "step": 26410 + }, + { + "epoch": 1.7947411333061556, + "grad_norm": 0.1358812153339386, + "learning_rate": 7.757847533632287e-05, + "loss": 3.7916, + "step": 26415 + }, + { + "epoch": 1.7950808533768177, + "grad_norm": 0.3915088474750519, + "learning_rate": 7.75742288354396e-05, + "loss": 3.7476, + "step": 26420 + }, + { + "epoch": 1.7954205734474793, + "grad_norm": 0.1896940916776657, + "learning_rate": 7.756998233455634e-05, + "loss": 3.7508, + "step": 26425 + }, + { + "epoch": 1.795760293518141, + "grad_norm": 0.18188120424747467, + "learning_rate": 7.756573583367305e-05, + "loss": 3.7945, + "step": 26430 + }, + { + "epoch": 1.7961000135888028, + "grad_norm": 0.24403336644172668, + "learning_rate": 7.756148933278978e-05, + "loss": 3.6651, + "step": 26435 + }, + { + "epoch": 1.7964397336594646, + "grad_norm": 0.1830359846353531, + "learning_rate": 7.755724283190652e-05, + "loss": 3.9832, + "step": 26440 + }, + { + "epoch": 1.7967794537301263, + "grad_norm": 0.1751657873392105, + "learning_rate": 7.755299633102324e-05, + "loss": 3.6626, + "step": 26445 + }, + { + "epoch": 1.7971191738007881, + "grad_norm": 0.2703894376754761, + "learning_rate": 7.754874983013997e-05, + "loss": 3.5894, + "step": 26450 + }, + { + "epoch": 1.79745889387145, + "grad_norm": 0.19956360757350922, + "learning_rate": 7.75445033292567e-05, + "loss": 3.9104, + "step": 26455 + }, + { + "epoch": 1.7977986139421116, + "grad_norm": 0.18153636157512665, + "learning_rate": 7.754025682837342e-05, + "loss": 4.0352, + "step": 26460 + }, + { + "epoch": 1.7981383340127735, + "grad_norm": 0.40402752161026, + "learning_rate": 7.753601032749015e-05, + "loss": 3.8658, + "step": 26465 + }, + { + "epoch": 1.7984780540834353, + "grad_norm": 0.18831366300582886, + "learning_rate": 7.753176382660689e-05, + "loss": 3.9311, + "step": 26470 + }, + { + "epoch": 1.798817774154097, + "grad_norm": 0.22592391073703766, + "learning_rate": 7.75275173257236e-05, + "loss": 3.8296, + "step": 26475 + }, + { + "epoch": 1.7991574942247588, + "grad_norm": 0.14901496469974518, + "learning_rate": 7.752327082484033e-05, + "loss": 4.0578, + "step": 26480 + }, + { + "epoch": 1.7994972142954206, + "grad_norm": 0.16719041764736176, + "learning_rate": 7.751902432395706e-05, + "loss": 3.8608, + "step": 26485 + }, + { + "epoch": 1.7998369343660823, + "grad_norm": 0.19839486479759216, + "learning_rate": 7.751477782307379e-05, + "loss": 3.8629, + "step": 26490 + }, + { + "epoch": 1.8001766544367441, + "grad_norm": 0.15507622063159943, + "learning_rate": 7.751053132219052e-05, + "loss": 3.985, + "step": 26495 + }, + { + "epoch": 1.800516374507406, + "grad_norm": 0.15592411160469055, + "learning_rate": 7.750628482130725e-05, + "loss": 3.8909, + "step": 26500 + }, + { + "epoch": 1.8008560945780676, + "grad_norm": 0.2941340208053589, + "learning_rate": 7.750203832042397e-05, + "loss": 3.8877, + "step": 26505 + }, + { + "epoch": 1.8011958146487295, + "grad_norm": 0.1613299697637558, + "learning_rate": 7.74977918195407e-05, + "loss": 4.1809, + "step": 26510 + }, + { + "epoch": 1.8015355347193913, + "grad_norm": 0.1743617206811905, + "learning_rate": 7.749354531865743e-05, + "loss": 3.8592, + "step": 26515 + }, + { + "epoch": 1.801875254790053, + "grad_norm": 0.14489513635635376, + "learning_rate": 7.748929881777416e-05, + "loss": 3.6117, + "step": 26520 + }, + { + "epoch": 1.8022149748607148, + "grad_norm": 0.18078920245170593, + "learning_rate": 7.748505231689089e-05, + "loss": 4.0161, + "step": 26525 + }, + { + "epoch": 1.8025546949313767, + "grad_norm": 0.16817781329154968, + "learning_rate": 7.748080581600761e-05, + "loss": 3.7905, + "step": 26530 + }, + { + "epoch": 1.8028944150020383, + "grad_norm": 0.5751657485961914, + "learning_rate": 7.747655931512434e-05, + "loss": 4.1054, + "step": 26535 + }, + { + "epoch": 1.8032341350727001, + "grad_norm": 0.19804297387599945, + "learning_rate": 7.747316211441772e-05, + "loss": 4.0228, + "step": 26540 + }, + { + "epoch": 1.803573855143362, + "grad_norm": 0.1686244010925293, + "learning_rate": 7.746891561353445e-05, + "loss": 3.9354, + "step": 26545 + }, + { + "epoch": 1.8039135752140236, + "grad_norm": 0.18759752810001373, + "learning_rate": 7.746466911265118e-05, + "loss": 3.847, + "step": 26550 + }, + { + "epoch": 1.8042532952846853, + "grad_norm": 0.19520717859268188, + "learning_rate": 7.746042261176791e-05, + "loss": 4.1062, + "step": 26555 + }, + { + "epoch": 1.8045930153553473, + "grad_norm": 0.2201915830373764, + "learning_rate": 7.745617611088464e-05, + "loss": 4.0295, + "step": 26560 + }, + { + "epoch": 1.804932735426009, + "grad_norm": 0.17630118131637573, + "learning_rate": 7.745192961000136e-05, + "loss": 3.8856, + "step": 26565 + }, + { + "epoch": 1.8052724554966706, + "grad_norm": 0.384357750415802, + "learning_rate": 7.744768310911809e-05, + "loss": 3.8884, + "step": 26570 + }, + { + "epoch": 1.8056121755673327, + "grad_norm": 0.18431872129440308, + "learning_rate": 7.744343660823482e-05, + "loss": 3.7695, + "step": 26575 + }, + { + "epoch": 1.8059518956379943, + "grad_norm": 0.5442685484886169, + "learning_rate": 7.743919010735155e-05, + "loss": 3.8101, + "step": 26580 + }, + { + "epoch": 1.806291615708656, + "grad_norm": 0.15294799208641052, + "learning_rate": 7.743494360646826e-05, + "loss": 3.7637, + "step": 26585 + }, + { + "epoch": 1.806631335779318, + "grad_norm": 0.17018234729766846, + "learning_rate": 7.7430697105585e-05, + "loss": 3.9587, + "step": 26590 + }, + { + "epoch": 1.8069710558499796, + "grad_norm": 0.22204944491386414, + "learning_rate": 7.742645060470173e-05, + "loss": 3.9475, + "step": 26595 + }, + { + "epoch": 1.8073107759206413, + "grad_norm": 0.24529770016670227, + "learning_rate": 7.742220410381845e-05, + "loss": 4.0402, + "step": 26600 + }, + { + "epoch": 1.8076504959913031, + "grad_norm": 0.17744654417037964, + "learning_rate": 7.741795760293519e-05, + "loss": 3.833, + "step": 26605 + }, + { + "epoch": 1.807990216061965, + "grad_norm": 0.15817943215370178, + "learning_rate": 7.741371110205192e-05, + "loss": 3.8664, + "step": 26610 + }, + { + "epoch": 1.8083299361326266, + "grad_norm": 0.1521889865398407, + "learning_rate": 7.740946460116863e-05, + "loss": 3.9098, + "step": 26615 + }, + { + "epoch": 1.8086696562032885, + "grad_norm": 0.7224039435386658, + "learning_rate": 7.740521810028537e-05, + "loss": 3.9201, + "step": 26620 + }, + { + "epoch": 1.8090093762739503, + "grad_norm": 0.1789894998073578, + "learning_rate": 7.74009715994021e-05, + "loss": 3.7298, + "step": 26625 + }, + { + "epoch": 1.809349096344612, + "grad_norm": 0.20942912995815277, + "learning_rate": 7.739672509851883e-05, + "loss": 3.8598, + "step": 26630 + }, + { + "epoch": 1.8096888164152738, + "grad_norm": 0.23930658400058746, + "learning_rate": 7.739247859763556e-05, + "loss": 4.1655, + "step": 26635 + }, + { + "epoch": 1.8100285364859356, + "grad_norm": 0.19513458013534546, + "learning_rate": 7.738823209675228e-05, + "loss": 3.8148, + "step": 26640 + }, + { + "epoch": 1.8103682565565973, + "grad_norm": 0.162079855799675, + "learning_rate": 7.738398559586901e-05, + "loss": 3.9533, + "step": 26645 + }, + { + "epoch": 1.8107079766272591, + "grad_norm": 0.168344184756279, + "learning_rate": 7.737973909498574e-05, + "loss": 3.8739, + "step": 26650 + }, + { + "epoch": 1.811047696697921, + "grad_norm": 0.17121872305870056, + "learning_rate": 7.737549259410245e-05, + "loss": 3.8449, + "step": 26655 + }, + { + "epoch": 1.8113874167685826, + "grad_norm": 0.18654745817184448, + "learning_rate": 7.73712460932192e-05, + "loss": 3.8796, + "step": 26660 + }, + { + "epoch": 1.8117271368392445, + "grad_norm": 0.13394124805927277, + "learning_rate": 7.736699959233592e-05, + "loss": 3.8176, + "step": 26665 + }, + { + "epoch": 1.8120668569099063, + "grad_norm": 0.17908333241939545, + "learning_rate": 7.736275309145264e-05, + "loss": 3.8384, + "step": 26670 + }, + { + "epoch": 1.812406576980568, + "grad_norm": 0.1721053272485733, + "learning_rate": 7.735850659056938e-05, + "loss": 3.8347, + "step": 26675 + }, + { + "epoch": 1.8127462970512298, + "grad_norm": 0.16780465841293335, + "learning_rate": 7.735426008968611e-05, + "loss": 3.6114, + "step": 26680 + }, + { + "epoch": 1.8130860171218917, + "grad_norm": 0.1828881949186325, + "learning_rate": 7.735001358880282e-05, + "loss": 3.7794, + "step": 26685 + }, + { + "epoch": 1.8134257371925533, + "grad_norm": 0.1550731062889099, + "learning_rate": 7.734576708791956e-05, + "loss": 3.8947, + "step": 26690 + }, + { + "epoch": 1.8137654572632151, + "grad_norm": 0.2008993923664093, + "learning_rate": 7.734152058703629e-05, + "loss": 4.1944, + "step": 26695 + }, + { + "epoch": 1.814105177333877, + "grad_norm": 0.12791430950164795, + "learning_rate": 7.7337274086153e-05, + "loss": 3.9616, + "step": 26700 + }, + { + "epoch": 1.8144448974045386, + "grad_norm": 0.19235379993915558, + "learning_rate": 7.733302758526975e-05, + "loss": 3.6643, + "step": 26705 + }, + { + "epoch": 1.8147846174752005, + "grad_norm": 0.15170611441135406, + "learning_rate": 7.732878108438648e-05, + "loss": 4.0216, + "step": 26710 + }, + { + "epoch": 1.8151243375458623, + "grad_norm": 0.1943473368883133, + "learning_rate": 7.732453458350319e-05, + "loss": 3.8052, + "step": 26715 + }, + { + "epoch": 1.815464057616524, + "grad_norm": 0.19505806267261505, + "learning_rate": 7.732028808261993e-05, + "loss": 3.7902, + "step": 26720 + }, + { + "epoch": 1.8158037776871856, + "grad_norm": 0.4347198009490967, + "learning_rate": 7.731604158173666e-05, + "loss": 3.9582, + "step": 26725 + }, + { + "epoch": 1.8161434977578477, + "grad_norm": 0.1943882405757904, + "learning_rate": 7.731179508085337e-05, + "loss": 4.0501, + "step": 26730 + }, + { + "epoch": 1.8164832178285093, + "grad_norm": 0.17931057512760162, + "learning_rate": 7.730754857997012e-05, + "loss": 3.9631, + "step": 26735 + }, + { + "epoch": 1.816822937899171, + "grad_norm": 0.23831497132778168, + "learning_rate": 7.730330207908683e-05, + "loss": 3.7391, + "step": 26740 + }, + { + "epoch": 1.817162657969833, + "grad_norm": 0.1836778223514557, + "learning_rate": 7.729905557820356e-05, + "loss": 3.9821, + "step": 26745 + }, + { + "epoch": 1.8175023780404946, + "grad_norm": 0.2629704773426056, + "learning_rate": 7.72948090773203e-05, + "loss": 4.1296, + "step": 26750 + }, + { + "epoch": 1.8178420981111563, + "grad_norm": 0.2441583126783371, + "learning_rate": 7.729056257643701e-05, + "loss": 3.8592, + "step": 26755 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.22288045287132263, + "learning_rate": 7.728631607555374e-05, + "loss": 3.9113, + "step": 26760 + }, + { + "epoch": 1.81852153825248, + "grad_norm": 0.17395071685314178, + "learning_rate": 7.728206957467048e-05, + "loss": 3.8675, + "step": 26765 + }, + { + "epoch": 1.8188612583231416, + "grad_norm": 0.15598538517951965, + "learning_rate": 7.72778230737872e-05, + "loss": 3.951, + "step": 26770 + }, + { + "epoch": 1.8192009783938035, + "grad_norm": 0.20137055218219757, + "learning_rate": 7.727357657290393e-05, + "loss": 3.9048, + "step": 26775 + }, + { + "epoch": 1.8195406984644653, + "grad_norm": 0.21375055611133575, + "learning_rate": 7.726933007202067e-05, + "loss": 3.6937, + "step": 26780 + }, + { + "epoch": 1.819880418535127, + "grad_norm": 0.17277921736240387, + "learning_rate": 7.726508357113738e-05, + "loss": 4.0101, + "step": 26785 + }, + { + "epoch": 1.8202201386057888, + "grad_norm": 0.23173852264881134, + "learning_rate": 7.726083707025411e-05, + "loss": 3.995, + "step": 26790 + }, + { + "epoch": 1.8205598586764506, + "grad_norm": 0.44868725538253784, + "learning_rate": 7.725659056937085e-05, + "loss": 3.7907, + "step": 26795 + }, + { + "epoch": 1.8208995787471123, + "grad_norm": 0.17940400540828705, + "learning_rate": 7.725234406848757e-05, + "loss": 3.9358, + "step": 26800 + }, + { + "epoch": 1.8212392988177741, + "grad_norm": 0.20047830045223236, + "learning_rate": 7.72480975676043e-05, + "loss": 4.0194, + "step": 26805 + }, + { + "epoch": 1.821579018888436, + "grad_norm": 1.350032925605774, + "learning_rate": 7.724385106672102e-05, + "loss": 3.9512, + "step": 26810 + }, + { + "epoch": 1.8219187389590976, + "grad_norm": 0.17680513858795166, + "learning_rate": 7.723960456583775e-05, + "loss": 3.8325, + "step": 26815 + }, + { + "epoch": 1.8222584590297595, + "grad_norm": 0.18534226715564728, + "learning_rate": 7.723535806495448e-05, + "loss": 3.7897, + "step": 26820 + }, + { + "epoch": 1.8225981791004213, + "grad_norm": 0.19992922246456146, + "learning_rate": 7.72311115640712e-05, + "loss": 3.8049, + "step": 26825 + }, + { + "epoch": 1.822937899171083, + "grad_norm": 0.17379717528820038, + "learning_rate": 7.722686506318793e-05, + "loss": 3.8531, + "step": 26830 + }, + { + "epoch": 1.8232776192417448, + "grad_norm": 0.3135410249233246, + "learning_rate": 7.722261856230466e-05, + "loss": 3.7761, + "step": 26835 + }, + { + "epoch": 1.8236173393124067, + "grad_norm": 0.1589503288269043, + "learning_rate": 7.721837206142139e-05, + "loss": 3.9269, + "step": 26840 + }, + { + "epoch": 1.8239570593830683, + "grad_norm": 0.2093554139137268, + "learning_rate": 7.721412556053812e-05, + "loss": 3.8858, + "step": 26845 + }, + { + "epoch": 1.8242967794537301, + "grad_norm": 0.15273746848106384, + "learning_rate": 7.720987905965485e-05, + "loss": 4.0234, + "step": 26850 + }, + { + "epoch": 1.824636499524392, + "grad_norm": 0.2138567566871643, + "learning_rate": 7.720563255877157e-05, + "loss": 3.9036, + "step": 26855 + }, + { + "epoch": 1.8249762195950536, + "grad_norm": 0.18151448667049408, + "learning_rate": 7.72013860578883e-05, + "loss": 3.732, + "step": 26860 + }, + { + "epoch": 1.8253159396657155, + "grad_norm": 0.14510342478752136, + "learning_rate": 7.719713955700503e-05, + "loss": 3.9535, + "step": 26865 + }, + { + "epoch": 1.8256556597363773, + "grad_norm": 0.16447903215885162, + "learning_rate": 7.719289305612176e-05, + "loss": 3.9162, + "step": 26870 + }, + { + "epoch": 1.825995379807039, + "grad_norm": 0.7571980357170105, + "learning_rate": 7.718864655523849e-05, + "loss": 4.2166, + "step": 26875 + }, + { + "epoch": 1.8263350998777008, + "grad_norm": 0.2977351248264313, + "learning_rate": 7.718440005435521e-05, + "loss": 3.9288, + "step": 26880 + }, + { + "epoch": 1.8266748199483627, + "grad_norm": 0.17228025197982788, + "learning_rate": 7.718015355347194e-05, + "loss": 3.8618, + "step": 26885 + }, + { + "epoch": 1.8270145400190243, + "grad_norm": 0.23647838830947876, + "learning_rate": 7.717590705258867e-05, + "loss": 3.8678, + "step": 26890 + }, + { + "epoch": 1.827354260089686, + "grad_norm": 0.17129547894001007, + "learning_rate": 7.71716605517054e-05, + "loss": 3.7641, + "step": 26895 + }, + { + "epoch": 1.827693980160348, + "grad_norm": 0.18799041211605072, + "learning_rate": 7.716741405082213e-05, + "loss": 3.7812, + "step": 26900 + }, + { + "epoch": 1.8280337002310096, + "grad_norm": 0.37234434485435486, + "learning_rate": 7.716316754993885e-05, + "loss": 4.0104, + "step": 26905 + }, + { + "epoch": 1.8283734203016713, + "grad_norm": 0.15697382390499115, + "learning_rate": 7.715892104905558e-05, + "loss": 3.8832, + "step": 26910 + }, + { + "epoch": 1.8287131403723333, + "grad_norm": 0.17677617073059082, + "learning_rate": 7.715467454817231e-05, + "loss": 3.9919, + "step": 26915 + }, + { + "epoch": 1.829052860442995, + "grad_norm": 0.16849415004253387, + "learning_rate": 7.715042804728904e-05, + "loss": 3.8434, + "step": 26920 + }, + { + "epoch": 1.8293925805136566, + "grad_norm": 0.5477182269096375, + "learning_rate": 7.714618154640577e-05, + "loss": 3.7655, + "step": 26925 + }, + { + "epoch": 1.8297323005843187, + "grad_norm": 0.1828964650630951, + "learning_rate": 7.71419350455225e-05, + "loss": 3.9189, + "step": 26930 + }, + { + "epoch": 1.8300720206549803, + "grad_norm": 0.17466621100902557, + "learning_rate": 7.713768854463922e-05, + "loss": 3.9798, + "step": 26935 + }, + { + "epoch": 1.830411740725642, + "grad_norm": 0.15011201798915863, + "learning_rate": 7.713344204375594e-05, + "loss": 3.965, + "step": 26940 + }, + { + "epoch": 1.8307514607963038, + "grad_norm": 0.15971961617469788, + "learning_rate": 7.712919554287268e-05, + "loss": 3.9663, + "step": 26945 + }, + { + "epoch": 1.8310911808669657, + "grad_norm": 0.1873934417963028, + "learning_rate": 7.712494904198941e-05, + "loss": 3.9867, + "step": 26950 + }, + { + "epoch": 1.8314309009376273, + "grad_norm": 0.8789135217666626, + "learning_rate": 7.712070254110612e-05, + "loss": 3.9212, + "step": 26955 + }, + { + "epoch": 1.8317706210082891, + "grad_norm": 1.3156040906906128, + "learning_rate": 7.711645604022286e-05, + "loss": 3.8321, + "step": 26960 + }, + { + "epoch": 1.832110341078951, + "grad_norm": 0.1950591802597046, + "learning_rate": 7.711220953933959e-05, + "loss": 3.9354, + "step": 26965 + }, + { + "epoch": 1.8324500611496126, + "grad_norm": 0.20417684316635132, + "learning_rate": 7.710796303845632e-05, + "loss": 4.0194, + "step": 26970 + }, + { + "epoch": 1.8327897812202745, + "grad_norm": 0.17682234942913055, + "learning_rate": 7.710371653757305e-05, + "loss": 4.1154, + "step": 26975 + }, + { + "epoch": 1.8331295012909363, + "grad_norm": 0.15526516735553741, + "learning_rate": 7.709947003668977e-05, + "loss": 3.9578, + "step": 26980 + }, + { + "epoch": 1.833469221361598, + "grad_norm": 0.16138868033885956, + "learning_rate": 7.70952235358065e-05, + "loss": 3.8311, + "step": 26985 + }, + { + "epoch": 1.8338089414322598, + "grad_norm": 0.2100953906774521, + "learning_rate": 7.709097703492323e-05, + "loss": 3.9163, + "step": 26990 + }, + { + "epoch": 1.8341486615029217, + "grad_norm": 1.1291425228118896, + "learning_rate": 7.708673053403996e-05, + "loss": 3.5091, + "step": 26995 + }, + { + "epoch": 1.8344883815735833, + "grad_norm": 0.18962626159191132, + "learning_rate": 7.708248403315669e-05, + "loss": 4.0424, + "step": 27000 + }, + { + "epoch": 1.8348281016442451, + "grad_norm": 0.22387515008449554, + "learning_rate": 7.707823753227341e-05, + "loss": 3.6625, + "step": 27005 + }, + { + "epoch": 1.835167821714907, + "grad_norm": 0.21133656799793243, + "learning_rate": 7.707399103139013e-05, + "loss": 3.7867, + "step": 27010 + }, + { + "epoch": 1.8355075417855686, + "grad_norm": 0.41492897272109985, + "learning_rate": 7.706974453050687e-05, + "loss": 3.9254, + "step": 27015 + }, + { + "epoch": 1.8358472618562305, + "grad_norm": 0.17780150473117828, + "learning_rate": 7.70654980296236e-05, + "loss": 3.9441, + "step": 27020 + }, + { + "epoch": 1.8361869819268923, + "grad_norm": 0.18978863954544067, + "learning_rate": 7.706125152874031e-05, + "loss": 4.0264, + "step": 27025 + }, + { + "epoch": 1.836526701997554, + "grad_norm": 0.27804186940193176, + "learning_rate": 7.705700502785705e-05, + "loss": 3.7239, + "step": 27030 + }, + { + "epoch": 1.8368664220682158, + "grad_norm": 0.3949762284755707, + "learning_rate": 7.705275852697378e-05, + "loss": 3.9582, + "step": 27035 + }, + { + "epoch": 1.8372061421388777, + "grad_norm": 0.16728833317756653, + "learning_rate": 7.70485120260905e-05, + "loss": 3.5546, + "step": 27040 + }, + { + "epoch": 1.8375458622095393, + "grad_norm": 0.20488834381103516, + "learning_rate": 7.704426552520724e-05, + "loss": 3.8321, + "step": 27045 + }, + { + "epoch": 1.8378855822802012, + "grad_norm": 0.15300732851028442, + "learning_rate": 7.704001902432397e-05, + "loss": 3.9059, + "step": 27050 + }, + { + "epoch": 1.838225302350863, + "grad_norm": 0.1591406613588333, + "learning_rate": 7.703577252344068e-05, + "loss": 3.8425, + "step": 27055 + }, + { + "epoch": 1.8385650224215246, + "grad_norm": 0.16721488535404205, + "learning_rate": 7.703152602255742e-05, + "loss": 3.8911, + "step": 27060 + }, + { + "epoch": 1.8389047424921863, + "grad_norm": 0.19458164274692535, + "learning_rate": 7.702727952167415e-05, + "loss": 3.9166, + "step": 27065 + }, + { + "epoch": 1.8392444625628483, + "grad_norm": 2.4516851902008057, + "learning_rate": 7.702303302079087e-05, + "loss": 3.8524, + "step": 27070 + }, + { + "epoch": 1.83958418263351, + "grad_norm": 0.27723902463912964, + "learning_rate": 7.701878651990761e-05, + "loss": 4.0063, + "step": 27075 + }, + { + "epoch": 1.8399239027041716, + "grad_norm": 1.5805820226669312, + "learning_rate": 7.701454001902433e-05, + "loss": 3.8867, + "step": 27080 + }, + { + "epoch": 1.8402636227748337, + "grad_norm": 0.3097696900367737, + "learning_rate": 7.701029351814105e-05, + "loss": 3.848, + "step": 27085 + }, + { + "epoch": 1.8406033428454953, + "grad_norm": 0.20208622515201569, + "learning_rate": 7.700604701725779e-05, + "loss": 3.9012, + "step": 27090 + }, + { + "epoch": 1.840943062916157, + "grad_norm": 0.18587054312229156, + "learning_rate": 7.70018005163745e-05, + "loss": 3.7011, + "step": 27095 + }, + { + "epoch": 1.841282782986819, + "grad_norm": 0.21121825277805328, + "learning_rate": 7.699755401549123e-05, + "loss": 3.776, + "step": 27100 + }, + { + "epoch": 1.8416225030574807, + "grad_norm": 7.188265800476074, + "learning_rate": 7.699330751460798e-05, + "loss": 3.8733, + "step": 27105 + }, + { + "epoch": 1.8419622231281423, + "grad_norm": 0.15485529601573944, + "learning_rate": 7.698906101372469e-05, + "loss": 3.9543, + "step": 27110 + }, + { + "epoch": 1.8423019431988041, + "grad_norm": 0.18555189669132233, + "learning_rate": 7.698481451284142e-05, + "loss": 3.7265, + "step": 27115 + }, + { + "epoch": 1.842641663269466, + "grad_norm": 0.3609059751033783, + "learning_rate": 7.698056801195816e-05, + "loss": 3.9924, + "step": 27120 + }, + { + "epoch": 1.8429813833401276, + "grad_norm": 0.2118442803621292, + "learning_rate": 7.697632151107487e-05, + "loss": 3.8761, + "step": 27125 + }, + { + "epoch": 1.8433211034107895, + "grad_norm": 0.43577340245246887, + "learning_rate": 7.69720750101916e-05, + "loss": 4.0011, + "step": 27130 + }, + { + "epoch": 1.8436608234814513, + "grad_norm": 1.2183961868286133, + "learning_rate": 7.696782850930834e-05, + "loss": 3.9599, + "step": 27135 + }, + { + "epoch": 1.844000543552113, + "grad_norm": 0.15343034267425537, + "learning_rate": 7.696358200842506e-05, + "loss": 3.9815, + "step": 27140 + }, + { + "epoch": 1.8443402636227748, + "grad_norm": 0.1646152287721634, + "learning_rate": 7.695933550754179e-05, + "loss": 3.6328, + "step": 27145 + }, + { + "epoch": 1.8446799836934367, + "grad_norm": 0.20106656849384308, + "learning_rate": 7.695508900665853e-05, + "loss": 3.9895, + "step": 27150 + }, + { + "epoch": 1.8450197037640983, + "grad_norm": 0.15894967317581177, + "learning_rate": 7.695084250577524e-05, + "loss": 4.0036, + "step": 27155 + }, + { + "epoch": 1.8453594238347601, + "grad_norm": 0.3401646614074707, + "learning_rate": 7.694659600489197e-05, + "loss": 3.8168, + "step": 27160 + }, + { + "epoch": 1.845699143905422, + "grad_norm": 0.21853439509868622, + "learning_rate": 7.69423495040087e-05, + "loss": 3.6768, + "step": 27165 + }, + { + "epoch": 1.8460388639760836, + "grad_norm": 0.839600682258606, + "learning_rate": 7.693810300312543e-05, + "loss": 3.9179, + "step": 27170 + }, + { + "epoch": 1.8463785840467455, + "grad_norm": 0.16769981384277344, + "learning_rate": 7.693385650224215e-05, + "loss": 4.1247, + "step": 27175 + }, + { + "epoch": 1.8467183041174073, + "grad_norm": 0.21335577964782715, + "learning_rate": 7.692961000135888e-05, + "loss": 4.0711, + "step": 27180 + }, + { + "epoch": 1.847058024188069, + "grad_norm": 0.19433391094207764, + "learning_rate": 7.692536350047561e-05, + "loss": 3.9987, + "step": 27185 + }, + { + "epoch": 1.8473977442587308, + "grad_norm": 0.18129394948482513, + "learning_rate": 7.692111699959234e-05, + "loss": 4.1656, + "step": 27190 + }, + { + "epoch": 1.8477374643293927, + "grad_norm": 0.15720495581626892, + "learning_rate": 7.691687049870907e-05, + "loss": 4.092, + "step": 27195 + }, + { + "epoch": 1.8480771844000543, + "grad_norm": 0.15841859579086304, + "learning_rate": 7.69126239978258e-05, + "loss": 3.9017, + "step": 27200 + }, + { + "epoch": 1.8484169044707162, + "grad_norm": 0.1489601582288742, + "learning_rate": 7.690837749694252e-05, + "loss": 4.0379, + "step": 27205 + }, + { + "epoch": 1.848756624541378, + "grad_norm": 0.19507953524589539, + "learning_rate": 7.690413099605925e-05, + "loss": 3.8796, + "step": 27210 + }, + { + "epoch": 1.8490963446120396, + "grad_norm": 0.1569787859916687, + "learning_rate": 7.689988449517598e-05, + "loss": 3.7195, + "step": 27215 + }, + { + "epoch": 1.8494360646827015, + "grad_norm": 0.17715218663215637, + "learning_rate": 7.68956379942927e-05, + "loss": 4.2934, + "step": 27220 + }, + { + "epoch": 1.8497757847533634, + "grad_norm": 0.22942742705345154, + "learning_rate": 7.689139149340943e-05, + "loss": 3.7418, + "step": 27225 + }, + { + "epoch": 1.850115504824025, + "grad_norm": 0.1896630972623825, + "learning_rate": 7.688714499252616e-05, + "loss": 3.7445, + "step": 27230 + }, + { + "epoch": 1.8504552248946866, + "grad_norm": 0.1693107932806015, + "learning_rate": 7.688289849164289e-05, + "loss": 3.9436, + "step": 27235 + }, + { + "epoch": 1.8507949449653487, + "grad_norm": 0.1921326071023941, + "learning_rate": 7.687865199075962e-05, + "loss": 3.95, + "step": 27240 + }, + { + "epoch": 1.8511346650360103, + "grad_norm": 0.19475969672203064, + "learning_rate": 7.687440548987635e-05, + "loss": 3.9519, + "step": 27245 + }, + { + "epoch": 1.851474385106672, + "grad_norm": 0.6033923029899597, + "learning_rate": 7.687015898899307e-05, + "loss": 3.7752, + "step": 27250 + }, + { + "epoch": 1.851814105177334, + "grad_norm": 0.17315849661827087, + "learning_rate": 7.68659124881098e-05, + "loss": 3.7472, + "step": 27255 + }, + { + "epoch": 1.8521538252479957, + "grad_norm": 0.17223069071769714, + "learning_rate": 7.686166598722653e-05, + "loss": 3.9644, + "step": 27260 + }, + { + "epoch": 1.8524935453186573, + "grad_norm": 0.17473316192626953, + "learning_rate": 7.685741948634326e-05, + "loss": 4.0854, + "step": 27265 + }, + { + "epoch": 1.8528332653893194, + "grad_norm": 0.20422104001045227, + "learning_rate": 7.685317298545999e-05, + "loss": 3.9929, + "step": 27270 + }, + { + "epoch": 1.853172985459981, + "grad_norm": 0.21733246743679047, + "learning_rate": 7.684892648457671e-05, + "loss": 4.1776, + "step": 27275 + }, + { + "epoch": 1.8535127055306426, + "grad_norm": 0.16571736335754395, + "learning_rate": 7.684467998369344e-05, + "loss": 3.9185, + "step": 27280 + }, + { + "epoch": 1.8538524256013045, + "grad_norm": 0.45876020193099976, + "learning_rate": 7.684043348281017e-05, + "loss": 3.825, + "step": 27285 + }, + { + "epoch": 1.8541921456719663, + "grad_norm": 0.12175784260034561, + "learning_rate": 7.68361869819269e-05, + "loss": 4.004, + "step": 27290 + }, + { + "epoch": 1.854531865742628, + "grad_norm": 0.20976248383522034, + "learning_rate": 7.683194048104361e-05, + "loss": 3.8518, + "step": 27295 + }, + { + "epoch": 1.8548715858132898, + "grad_norm": 0.2357460856437683, + "learning_rate": 7.682769398016035e-05, + "loss": 4.0698, + "step": 27300 + }, + { + "epoch": 1.8552113058839517, + "grad_norm": 0.22299830615520477, + "learning_rate": 7.682344747927708e-05, + "loss": 3.8195, + "step": 27305 + }, + { + "epoch": 1.8555510259546133, + "grad_norm": 0.2081936001777649, + "learning_rate": 7.681920097839381e-05, + "loss": 3.9598, + "step": 27310 + }, + { + "epoch": 1.8558907460252752, + "grad_norm": 0.22136381268501282, + "learning_rate": 7.681495447751054e-05, + "loss": 4.0684, + "step": 27315 + }, + { + "epoch": 1.856230466095937, + "grad_norm": 0.178492933511734, + "learning_rate": 7.681070797662727e-05, + "loss": 3.7194, + "step": 27320 + }, + { + "epoch": 1.8565701861665986, + "grad_norm": 0.18417038023471832, + "learning_rate": 7.6806461475744e-05, + "loss": 3.8076, + "step": 27325 + }, + { + "epoch": 1.8569099062372605, + "grad_norm": 0.19161632657051086, + "learning_rate": 7.680221497486072e-05, + "loss": 3.7891, + "step": 27330 + }, + { + "epoch": 1.8572496263079223, + "grad_norm": 0.16976769268512726, + "learning_rate": 7.679796847397745e-05, + "loss": 3.7517, + "step": 27335 + }, + { + "epoch": 1.857589346378584, + "grad_norm": 0.15468204021453857, + "learning_rate": 7.679372197309418e-05, + "loss": 3.7302, + "step": 27340 + }, + { + "epoch": 1.8579290664492458, + "grad_norm": 0.20010165870189667, + "learning_rate": 7.67894754722109e-05, + "loss": 3.8131, + "step": 27345 + }, + { + "epoch": 1.8582687865199077, + "grad_norm": 0.1875891089439392, + "learning_rate": 7.678522897132763e-05, + "loss": 3.7242, + "step": 27350 + }, + { + "epoch": 1.8586085065905693, + "grad_norm": 0.2521427571773529, + "learning_rate": 7.678098247044436e-05, + "loss": 3.832, + "step": 27355 + }, + { + "epoch": 1.8589482266612312, + "grad_norm": 0.14770916104316711, + "learning_rate": 7.677673596956109e-05, + "loss": 3.8524, + "step": 27360 + }, + { + "epoch": 1.859287946731893, + "grad_norm": 0.16722725331783295, + "learning_rate": 7.67724894686778e-05, + "loss": 3.8935, + "step": 27365 + }, + { + "epoch": 1.8596276668025546, + "grad_norm": 0.17294961214065552, + "learning_rate": 7.676824296779455e-05, + "loss": 3.7492, + "step": 27370 + }, + { + "epoch": 1.8599673868732165, + "grad_norm": 0.23418164253234863, + "learning_rate": 7.676399646691127e-05, + "loss": 3.7994, + "step": 27375 + }, + { + "epoch": 1.8603071069438784, + "grad_norm": 0.18365445733070374, + "learning_rate": 7.675974996602799e-05, + "loss": 3.8819, + "step": 27380 + }, + { + "epoch": 1.86064682701454, + "grad_norm": 0.8726001977920532, + "learning_rate": 7.675550346514473e-05, + "loss": 3.8242, + "step": 27385 + }, + { + "epoch": 1.8609865470852018, + "grad_norm": 0.19356444478034973, + "learning_rate": 7.675125696426146e-05, + "loss": 3.6645, + "step": 27390 + }, + { + "epoch": 1.8613262671558637, + "grad_norm": 0.42371001839637756, + "learning_rate": 7.674701046337817e-05, + "loss": 4.0722, + "step": 27395 + }, + { + "epoch": 1.8616659872265253, + "grad_norm": 0.22512434422969818, + "learning_rate": 7.674276396249491e-05, + "loss": 4.1704, + "step": 27400 + }, + { + "epoch": 1.862005707297187, + "grad_norm": 0.1988096833229065, + "learning_rate": 7.673851746161164e-05, + "loss": 3.7197, + "step": 27405 + }, + { + "epoch": 1.862345427367849, + "grad_norm": 0.20652662217617035, + "learning_rate": 7.673427096072836e-05, + "loss": 4.1489, + "step": 27410 + }, + { + "epoch": 1.8626851474385107, + "grad_norm": 0.6438788175582886, + "learning_rate": 7.67300244598451e-05, + "loss": 4.2167, + "step": 27415 + }, + { + "epoch": 1.8630248675091723, + "grad_norm": 0.16326725482940674, + "learning_rate": 7.672577795896183e-05, + "loss": 4.0029, + "step": 27420 + }, + { + "epoch": 1.8633645875798344, + "grad_norm": 0.1724574714899063, + "learning_rate": 7.672153145807854e-05, + "loss": 3.8683, + "step": 27425 + }, + { + "epoch": 1.863704307650496, + "grad_norm": 0.16898061335086823, + "learning_rate": 7.671728495719528e-05, + "loss": 4.0637, + "step": 27430 + }, + { + "epoch": 1.8640440277211576, + "grad_norm": 0.13067100942134857, + "learning_rate": 7.6713038456312e-05, + "loss": 3.9528, + "step": 27435 + }, + { + "epoch": 1.8643837477918197, + "grad_norm": 0.2141728550195694, + "learning_rate": 7.670879195542872e-05, + "loss": 4.1472, + "step": 27440 + }, + { + "epoch": 1.8647234678624813, + "grad_norm": 0.16571970283985138, + "learning_rate": 7.670454545454547e-05, + "loss": 3.972, + "step": 27445 + }, + { + "epoch": 1.865063187933143, + "grad_norm": 0.1808907836675644, + "learning_rate": 7.670029895366218e-05, + "loss": 3.7435, + "step": 27450 + }, + { + "epoch": 1.8654029080038048, + "grad_norm": 0.17581528425216675, + "learning_rate": 7.669605245277891e-05, + "loss": 3.725, + "step": 27455 + }, + { + "epoch": 1.8657426280744667, + "grad_norm": 0.14017711579799652, + "learning_rate": 7.669180595189565e-05, + "loss": 3.9525, + "step": 27460 + }, + { + "epoch": 1.8660823481451283, + "grad_norm": 0.2425866723060608, + "learning_rate": 7.668755945101236e-05, + "loss": 3.8372, + "step": 27465 + }, + { + "epoch": 1.8664220682157902, + "grad_norm": 0.19868521392345428, + "learning_rate": 7.668331295012909e-05, + "loss": 4.0322, + "step": 27470 + }, + { + "epoch": 1.866761788286452, + "grad_norm": 2.4394636154174805, + "learning_rate": 7.667906644924583e-05, + "loss": 3.7711, + "step": 27475 + }, + { + "epoch": 1.8671015083571136, + "grad_norm": 0.2630516588687897, + "learning_rate": 7.667481994836255e-05, + "loss": 3.9318, + "step": 27480 + }, + { + "epoch": 1.8674412284277755, + "grad_norm": 0.2509152591228485, + "learning_rate": 7.667057344747928e-05, + "loss": 3.8091, + "step": 27485 + }, + { + "epoch": 1.8677809484984373, + "grad_norm": 0.18180425465106964, + "learning_rate": 7.666632694659602e-05, + "loss": 3.7485, + "step": 27490 + }, + { + "epoch": 1.868120668569099, + "grad_norm": 0.2684369683265686, + "learning_rate": 7.666208044571273e-05, + "loss": 4.0839, + "step": 27495 + }, + { + "epoch": 1.8684603886397608, + "grad_norm": 0.18869712948799133, + "learning_rate": 7.665783394482946e-05, + "loss": 3.7126, + "step": 27500 + }, + { + "epoch": 1.8688001087104227, + "grad_norm": 0.23211011290550232, + "learning_rate": 7.66535874439462e-05, + "loss": 4.1075, + "step": 27505 + }, + { + "epoch": 1.8691398287810843, + "grad_norm": 0.21079802513122559, + "learning_rate": 7.664934094306292e-05, + "loss": 3.5652, + "step": 27510 + }, + { + "epoch": 1.8694795488517462, + "grad_norm": 0.42643171548843384, + "learning_rate": 7.664509444217964e-05, + "loss": 3.8463, + "step": 27515 + }, + { + "epoch": 1.869819268922408, + "grad_norm": 0.16720914840698242, + "learning_rate": 7.664084794129637e-05, + "loss": 3.795, + "step": 27520 + }, + { + "epoch": 1.8701589889930696, + "grad_norm": 0.22448597848415375, + "learning_rate": 7.66366014404131e-05, + "loss": 3.738, + "step": 27525 + }, + { + "epoch": 1.8704987090637315, + "grad_norm": 0.1353236883878708, + "learning_rate": 7.663235493952983e-05, + "loss": 3.9405, + "step": 27530 + }, + { + "epoch": 1.8708384291343934, + "grad_norm": 0.3555552661418915, + "learning_rate": 7.662810843864656e-05, + "loss": 3.7861, + "step": 27535 + }, + { + "epoch": 1.871178149205055, + "grad_norm": 0.1685716062784195, + "learning_rate": 7.662386193776328e-05, + "loss": 4.2102, + "step": 27540 + }, + { + "epoch": 1.8715178692757168, + "grad_norm": 0.14676855504512787, + "learning_rate": 7.661961543688001e-05, + "loss": 3.7757, + "step": 27545 + }, + { + "epoch": 1.8718575893463787, + "grad_norm": 0.1538759469985962, + "learning_rate": 7.661536893599674e-05, + "loss": 4.1983, + "step": 27550 + }, + { + "epoch": 1.8721973094170403, + "grad_norm": 0.1710328757762909, + "learning_rate": 7.661112243511347e-05, + "loss": 3.8043, + "step": 27555 + }, + { + "epoch": 1.8725370294877022, + "grad_norm": 0.17969588935375214, + "learning_rate": 7.66068759342302e-05, + "loss": 3.654, + "step": 27560 + }, + { + "epoch": 1.872876749558364, + "grad_norm": 0.1716558188199997, + "learning_rate": 7.660262943334692e-05, + "loss": 3.8712, + "step": 27565 + }, + { + "epoch": 1.8732164696290257, + "grad_norm": 0.14902648329734802, + "learning_rate": 7.659838293246365e-05, + "loss": 3.8109, + "step": 27570 + }, + { + "epoch": 1.8735561896996873, + "grad_norm": 0.1789960265159607, + "learning_rate": 7.659413643158038e-05, + "loss": 3.9336, + "step": 27575 + }, + { + "epoch": 1.8738959097703494, + "grad_norm": 0.20441824197769165, + "learning_rate": 7.658988993069711e-05, + "loss": 4.0275, + "step": 27580 + }, + { + "epoch": 1.874235629841011, + "grad_norm": 0.17010632157325745, + "learning_rate": 7.658564342981384e-05, + "loss": 3.847, + "step": 27585 + }, + { + "epoch": 1.8745753499116726, + "grad_norm": 0.13975846767425537, + "learning_rate": 7.658139692893056e-05, + "loss": 3.8302, + "step": 27590 + }, + { + "epoch": 1.8749150699823347, + "grad_norm": 0.2813519239425659, + "learning_rate": 7.657715042804729e-05, + "loss": 3.8553, + "step": 27595 + }, + { + "epoch": 1.8752547900529963, + "grad_norm": 0.25548166036605835, + "learning_rate": 7.657290392716402e-05, + "loss": 3.8049, + "step": 27600 + }, + { + "epoch": 1.875594510123658, + "grad_norm": 0.17439569532871246, + "learning_rate": 7.656865742628075e-05, + "loss": 4.1535, + "step": 27605 + }, + { + "epoch": 1.87593423019432, + "grad_norm": 0.15901914238929749, + "learning_rate": 7.656441092539748e-05, + "loss": 3.8342, + "step": 27610 + }, + { + "epoch": 1.8762739502649817, + "grad_norm": 0.1591397523880005, + "learning_rate": 7.65601644245142e-05, + "loss": 3.948, + "step": 27615 + }, + { + "epoch": 1.8766136703356433, + "grad_norm": 0.1738409698009491, + "learning_rate": 7.655591792363093e-05, + "loss": 3.9437, + "step": 27620 + }, + { + "epoch": 1.8769533904063052, + "grad_norm": 0.2030322104692459, + "learning_rate": 7.655167142274766e-05, + "loss": 4.064, + "step": 27625 + }, + { + "epoch": 1.877293110476967, + "grad_norm": 0.12977442145347595, + "learning_rate": 7.654742492186439e-05, + "loss": 3.8154, + "step": 27630 + }, + { + "epoch": 1.8776328305476286, + "grad_norm": 0.17576083540916443, + "learning_rate": 7.65431784209811e-05, + "loss": 3.6391, + "step": 27635 + }, + { + "epoch": 1.8779725506182905, + "grad_norm": 0.4466758966445923, + "learning_rate": 7.653893192009784e-05, + "loss": 3.8916, + "step": 27640 + }, + { + "epoch": 1.8783122706889523, + "grad_norm": 0.2188110202550888, + "learning_rate": 7.653468541921457e-05, + "loss": 3.7705, + "step": 27645 + }, + { + "epoch": 1.878651990759614, + "grad_norm": 0.2263517677783966, + "learning_rate": 7.65304389183313e-05, + "loss": 3.8717, + "step": 27650 + }, + { + "epoch": 1.8789917108302758, + "grad_norm": 0.15267635881900787, + "learning_rate": 7.652619241744803e-05, + "loss": 4.072, + "step": 27655 + }, + { + "epoch": 1.8793314309009377, + "grad_norm": 0.1457611620426178, + "learning_rate": 7.652194591656476e-05, + "loss": 3.9451, + "step": 27660 + }, + { + "epoch": 1.8796711509715993, + "grad_norm": 0.17449326813220978, + "learning_rate": 7.651769941568148e-05, + "loss": 4.0991, + "step": 27665 + }, + { + "epoch": 1.8800108710422612, + "grad_norm": 0.1539296805858612, + "learning_rate": 7.651345291479821e-05, + "loss": 3.9089, + "step": 27670 + }, + { + "epoch": 1.880350591112923, + "grad_norm": 0.17494313418865204, + "learning_rate": 7.650920641391494e-05, + "loss": 3.9831, + "step": 27675 + }, + { + "epoch": 1.8806903111835847, + "grad_norm": 0.24165533483028412, + "learning_rate": 7.650495991303167e-05, + "loss": 4.0277, + "step": 27680 + }, + { + "epoch": 1.8810300312542465, + "grad_norm": 0.17158551514148712, + "learning_rate": 7.65007134121484e-05, + "loss": 3.8907, + "step": 27685 + }, + { + "epoch": 1.8813697513249084, + "grad_norm": 0.1494707465171814, + "learning_rate": 7.649646691126512e-05, + "loss": 3.8701, + "step": 27690 + }, + { + "epoch": 1.88170947139557, + "grad_norm": 0.16832149028778076, + "learning_rate": 7.649222041038185e-05, + "loss": 4.0896, + "step": 27695 + }, + { + "epoch": 1.8820491914662318, + "grad_norm": 0.17952796816825867, + "learning_rate": 7.648797390949858e-05, + "loss": 3.7769, + "step": 27700 + }, + { + "epoch": 1.8823889115368937, + "grad_norm": 0.21506807208061218, + "learning_rate": 7.648372740861531e-05, + "loss": 3.8537, + "step": 27705 + }, + { + "epoch": 1.8827286316075553, + "grad_norm": 0.18382273614406586, + "learning_rate": 7.647948090773204e-05, + "loss": 3.9737, + "step": 27710 + }, + { + "epoch": 1.8830683516782172, + "grad_norm": 0.14591960608959198, + "learning_rate": 7.647523440684876e-05, + "loss": 3.968, + "step": 27715 + }, + { + "epoch": 1.883408071748879, + "grad_norm": 0.1666860282421112, + "learning_rate": 7.647098790596548e-05, + "loss": 3.8459, + "step": 27720 + }, + { + "epoch": 1.8837477918195407, + "grad_norm": 0.1698896884918213, + "learning_rate": 7.646674140508222e-05, + "loss": 3.9945, + "step": 27725 + }, + { + "epoch": 1.8840875118902025, + "grad_norm": 0.20038585364818573, + "learning_rate": 7.646249490419895e-05, + "loss": 3.9, + "step": 27730 + }, + { + "epoch": 1.8844272319608644, + "grad_norm": 0.2127290517091751, + "learning_rate": 7.645824840331566e-05, + "loss": 3.914, + "step": 27735 + }, + { + "epoch": 1.884766952031526, + "grad_norm": 0.19418194890022278, + "learning_rate": 7.64540019024324e-05, + "loss": 4.0278, + "step": 27740 + }, + { + "epoch": 1.8851066721021879, + "grad_norm": 0.14979887008666992, + "learning_rate": 7.644975540154913e-05, + "loss": 3.8191, + "step": 27745 + }, + { + "epoch": 1.8854463921728497, + "grad_norm": 0.14931653439998627, + "learning_rate": 7.644550890066585e-05, + "loss": 3.7495, + "step": 27750 + }, + { + "epoch": 1.8857861122435113, + "grad_norm": 0.4573568105697632, + "learning_rate": 7.644126239978259e-05, + "loss": 3.645, + "step": 27755 + }, + { + "epoch": 1.886125832314173, + "grad_norm": 0.19637127220630646, + "learning_rate": 7.643701589889932e-05, + "loss": 4.0387, + "step": 27760 + }, + { + "epoch": 1.886465552384835, + "grad_norm": 0.1981884390115738, + "learning_rate": 7.643276939801603e-05, + "loss": 3.8141, + "step": 27765 + }, + { + "epoch": 1.8868052724554967, + "grad_norm": 0.19389156997203827, + "learning_rate": 7.642852289713277e-05, + "loss": 4.0692, + "step": 27770 + }, + { + "epoch": 1.8871449925261583, + "grad_norm": 0.19055671989917755, + "learning_rate": 7.64242763962495e-05, + "loss": 3.7315, + "step": 27775 + }, + { + "epoch": 1.8874847125968204, + "grad_norm": 0.1292697936296463, + "learning_rate": 7.642002989536622e-05, + "loss": 3.6713, + "step": 27780 + }, + { + "epoch": 1.887824432667482, + "grad_norm": 0.23367278277873993, + "learning_rate": 7.641578339448296e-05, + "loss": 3.7363, + "step": 27785 + }, + { + "epoch": 1.8881641527381436, + "grad_norm": 0.17080868780612946, + "learning_rate": 7.641153689359967e-05, + "loss": 3.9439, + "step": 27790 + }, + { + "epoch": 1.8885038728088055, + "grad_norm": 0.15967053174972534, + "learning_rate": 7.64072903927164e-05, + "loss": 3.8123, + "step": 27795 + }, + { + "epoch": 1.8888435928794673, + "grad_norm": 0.16855283081531525, + "learning_rate": 7.640304389183314e-05, + "loss": 3.9186, + "step": 27800 + }, + { + "epoch": 1.889183312950129, + "grad_norm": 0.14828741550445557, + "learning_rate": 7.639879739094986e-05, + "loss": 3.8666, + "step": 27805 + }, + { + "epoch": 1.8895230330207908, + "grad_norm": 0.17310476303100586, + "learning_rate": 7.639455089006658e-05, + "loss": 4.0962, + "step": 27810 + }, + { + "epoch": 1.8898627530914527, + "grad_norm": 0.5709404349327087, + "learning_rate": 7.639030438918333e-05, + "loss": 3.8457, + "step": 27815 + }, + { + "epoch": 1.8902024731621143, + "grad_norm": 0.1866549849510193, + "learning_rate": 7.638605788830004e-05, + "loss": 3.7048, + "step": 27820 + }, + { + "epoch": 1.8905421932327762, + "grad_norm": 0.1262911558151245, + "learning_rate": 7.638181138741677e-05, + "loss": 3.7346, + "step": 27825 + }, + { + "epoch": 1.890881913303438, + "grad_norm": 0.19286301732063293, + "learning_rate": 7.637756488653351e-05, + "loss": 4.1216, + "step": 27830 + }, + { + "epoch": 1.8912216333740997, + "grad_norm": 0.18326547741889954, + "learning_rate": 7.637331838565022e-05, + "loss": 4.0033, + "step": 27835 + }, + { + "epoch": 1.8915613534447615, + "grad_norm": 0.17711366713047028, + "learning_rate": 7.636907188476695e-05, + "loss": 3.7842, + "step": 27840 + }, + { + "epoch": 1.8919010735154234, + "grad_norm": 0.19333603978157043, + "learning_rate": 7.636482538388369e-05, + "loss": 3.8365, + "step": 27845 + }, + { + "epoch": 1.892240793586085, + "grad_norm": 0.24956129491329193, + "learning_rate": 7.636057888300041e-05, + "loss": 3.982, + "step": 27850 + }, + { + "epoch": 1.8925805136567468, + "grad_norm": 0.16625599563121796, + "learning_rate": 7.635633238211714e-05, + "loss": 4.1444, + "step": 27855 + }, + { + "epoch": 1.8929202337274087, + "grad_norm": 0.18385055661201477, + "learning_rate": 7.635208588123386e-05, + "loss": 4.044, + "step": 27860 + }, + { + "epoch": 1.8932599537980703, + "grad_norm": 0.20541439950466156, + "learning_rate": 7.634783938035059e-05, + "loss": 3.8208, + "step": 27865 + }, + { + "epoch": 1.8935996738687322, + "grad_norm": 0.2051268219947815, + "learning_rate": 7.634359287946732e-05, + "loss": 4.1081, + "step": 27870 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.17864134907722473, + "learning_rate": 7.633934637858405e-05, + "loss": 3.9344, + "step": 27875 + }, + { + "epoch": 1.8942791140100557, + "grad_norm": 0.3572849929332733, + "learning_rate": 7.633509987770078e-05, + "loss": 4.0381, + "step": 27880 + }, + { + "epoch": 1.8946188340807175, + "grad_norm": 0.1685326099395752, + "learning_rate": 7.63308533768175e-05, + "loss": 3.8583, + "step": 27885 + }, + { + "epoch": 1.8949585541513794, + "grad_norm": 0.14638446271419525, + "learning_rate": 7.632660687593423e-05, + "loss": 3.8346, + "step": 27890 + }, + { + "epoch": 1.895298274222041, + "grad_norm": 1.697165608406067, + "learning_rate": 7.632236037505096e-05, + "loss": 4.1046, + "step": 27895 + }, + { + "epoch": 1.8956379942927029, + "grad_norm": 0.19732946157455444, + "learning_rate": 7.631811387416769e-05, + "loss": 3.905, + "step": 27900 + }, + { + "epoch": 1.8959777143633647, + "grad_norm": 0.3966963291168213, + "learning_rate": 7.631386737328442e-05, + "loss": 3.8481, + "step": 27905 + }, + { + "epoch": 1.8963174344340263, + "grad_norm": 0.16861890256404877, + "learning_rate": 7.630962087240114e-05, + "loss": 3.9277, + "step": 27910 + }, + { + "epoch": 1.8966571545046882, + "grad_norm": 0.1623014360666275, + "learning_rate": 7.630537437151787e-05, + "loss": 3.8092, + "step": 27915 + }, + { + "epoch": 1.89699687457535, + "grad_norm": 0.17898115515708923, + "learning_rate": 7.63011278706346e-05, + "loss": 3.9095, + "step": 27920 + }, + { + "epoch": 1.8973365946460117, + "grad_norm": 0.1884016990661621, + "learning_rate": 7.629688136975133e-05, + "loss": 3.9718, + "step": 27925 + }, + { + "epoch": 1.8976763147166733, + "grad_norm": 0.15691201388835907, + "learning_rate": 7.629263486886806e-05, + "loss": 3.9485, + "step": 27930 + }, + { + "epoch": 1.8980160347873354, + "grad_norm": 0.2972804009914398, + "learning_rate": 7.628838836798478e-05, + "loss": 3.8479, + "step": 27935 + }, + { + "epoch": 1.898355754857997, + "grad_norm": 0.33188462257385254, + "learning_rate": 7.628414186710151e-05, + "loss": 3.8421, + "step": 27940 + }, + { + "epoch": 1.8986954749286586, + "grad_norm": 0.17929308116436005, + "learning_rate": 7.627989536621824e-05, + "loss": 3.7514, + "step": 27945 + }, + { + "epoch": 1.8990351949993207, + "grad_norm": 0.14026951789855957, + "learning_rate": 7.627564886533497e-05, + "loss": 3.7759, + "step": 27950 + }, + { + "epoch": 1.8993749150699824, + "grad_norm": 0.16127042472362518, + "learning_rate": 7.62714023644517e-05, + "loss": 3.6566, + "step": 27955 + }, + { + "epoch": 1.899714635140644, + "grad_norm": 0.18114839494228363, + "learning_rate": 7.626715586356842e-05, + "loss": 3.8313, + "step": 27960 + }, + { + "epoch": 1.9000543552113058, + "grad_norm": 0.19873599708080292, + "learning_rate": 7.626290936268515e-05, + "loss": 3.9408, + "step": 27965 + }, + { + "epoch": 1.9003940752819677, + "grad_norm": 0.1640951782464981, + "learning_rate": 7.625866286180188e-05, + "loss": 4.1048, + "step": 27970 + }, + { + "epoch": 1.9007337953526293, + "grad_norm": 0.36264804005622864, + "learning_rate": 7.625441636091861e-05, + "loss": 3.9583, + "step": 27975 + }, + { + "epoch": 1.9010735154232912, + "grad_norm": 0.15769626200199127, + "learning_rate": 7.625016986003534e-05, + "loss": 3.8927, + "step": 27980 + }, + { + "epoch": 1.901413235493953, + "grad_norm": 0.1568615436553955, + "learning_rate": 7.624592335915206e-05, + "loss": 4.1305, + "step": 27985 + }, + { + "epoch": 1.9017529555646147, + "grad_norm": 0.47777891159057617, + "learning_rate": 7.624167685826879e-05, + "loss": 3.8552, + "step": 27990 + }, + { + "epoch": 1.9020926756352765, + "grad_norm": 0.22238169610500336, + "learning_rate": 7.623743035738552e-05, + "loss": 3.6845, + "step": 27995 + }, + { + "epoch": 1.9024323957059384, + "grad_norm": 0.1473531872034073, + "learning_rate": 7.623318385650225e-05, + "loss": 3.8041, + "step": 28000 + }, + { + "epoch": 1.9027721157766, + "grad_norm": 0.17733487486839294, + "learning_rate": 7.622893735561898e-05, + "loss": 4.0026, + "step": 28005 + }, + { + "epoch": 1.9031118358472618, + "grad_norm": 0.1637508124113083, + "learning_rate": 7.62246908547357e-05, + "loss": 4.082, + "step": 28010 + }, + { + "epoch": 1.9034515559179237, + "grad_norm": 0.197715625166893, + "learning_rate": 7.622044435385243e-05, + "loss": 3.7224, + "step": 28015 + }, + { + "epoch": 1.9037912759885853, + "grad_norm": 0.18062236905097961, + "learning_rate": 7.621619785296916e-05, + "loss": 3.9629, + "step": 28020 + }, + { + "epoch": 1.9041309960592472, + "grad_norm": 0.17274343967437744, + "learning_rate": 7.621195135208589e-05, + "loss": 3.9874, + "step": 28025 + }, + { + "epoch": 1.904470716129909, + "grad_norm": 0.13803553581237793, + "learning_rate": 7.620770485120262e-05, + "loss": 3.9678, + "step": 28030 + }, + { + "epoch": 1.9048104362005707, + "grad_norm": 0.1767314225435257, + "learning_rate": 7.620345835031934e-05, + "loss": 3.9548, + "step": 28035 + }, + { + "epoch": 1.9051501562712325, + "grad_norm": 0.5290346145629883, + "learning_rate": 7.619921184943607e-05, + "loss": 3.9746, + "step": 28040 + }, + { + "epoch": 1.9054898763418944, + "grad_norm": 0.15992562472820282, + "learning_rate": 7.61949653485528e-05, + "loss": 4.1131, + "step": 28045 + }, + { + "epoch": 1.905829596412556, + "grad_norm": 0.1825106292963028, + "learning_rate": 7.619071884766953e-05, + "loss": 3.6634, + "step": 28050 + }, + { + "epoch": 1.9061693164832179, + "grad_norm": 0.21828171610832214, + "learning_rate": 7.618647234678626e-05, + "loss": 3.7912, + "step": 28055 + }, + { + "epoch": 1.9065090365538797, + "grad_norm": 0.15395250916481018, + "learning_rate": 7.618222584590297e-05, + "loss": 4.072, + "step": 28060 + }, + { + "epoch": 1.9068487566245413, + "grad_norm": 0.16495516896247864, + "learning_rate": 7.617797934501971e-05, + "loss": 3.6592, + "step": 28065 + }, + { + "epoch": 1.9071884766952032, + "grad_norm": 0.17774321138858795, + "learning_rate": 7.617373284413644e-05, + "loss": 3.7191, + "step": 28070 + }, + { + "epoch": 1.907528196765865, + "grad_norm": 0.15801410377025604, + "learning_rate": 7.616948634325315e-05, + "loss": 3.8994, + "step": 28075 + }, + { + "epoch": 1.9078679168365267, + "grad_norm": 0.1364717185497284, + "learning_rate": 7.61652398423699e-05, + "loss": 3.8975, + "step": 28080 + }, + { + "epoch": 1.9082076369071885, + "grad_norm": 0.21180878579616547, + "learning_rate": 7.616099334148662e-05, + "loss": 3.7871, + "step": 28085 + }, + { + "epoch": 1.9085473569778504, + "grad_norm": 1.3650767803192139, + "learning_rate": 7.615674684060334e-05, + "loss": 4.0118, + "step": 28090 + }, + { + "epoch": 1.908887077048512, + "grad_norm": 0.21111010015010834, + "learning_rate": 7.615250033972008e-05, + "loss": 3.7438, + "step": 28095 + }, + { + "epoch": 1.9092267971191736, + "grad_norm": 0.19963420927524567, + "learning_rate": 7.614825383883681e-05, + "loss": 3.7724, + "step": 28100 + }, + { + "epoch": 1.9095665171898357, + "grad_norm": 0.194320410490036, + "learning_rate": 7.614400733795352e-05, + "loss": 3.7762, + "step": 28105 + }, + { + "epoch": 1.9099062372604974, + "grad_norm": 0.2132817506790161, + "learning_rate": 7.613976083707026e-05, + "loss": 3.9261, + "step": 28110 + }, + { + "epoch": 1.910245957331159, + "grad_norm": 0.17743398249149323, + "learning_rate": 7.613551433618699e-05, + "loss": 3.8194, + "step": 28115 + }, + { + "epoch": 1.910585677401821, + "grad_norm": 0.28071609139442444, + "learning_rate": 7.61312678353037e-05, + "loss": 3.9722, + "step": 28120 + }, + { + "epoch": 1.9109253974724827, + "grad_norm": 0.8313693404197693, + "learning_rate": 7.612702133442045e-05, + "loss": 4.0656, + "step": 28125 + }, + { + "epoch": 1.9112651175431443, + "grad_norm": 0.19384852051734924, + "learning_rate": 7.612277483353718e-05, + "loss": 3.9651, + "step": 28130 + }, + { + "epoch": 1.9116048376138062, + "grad_norm": 0.20798298716545105, + "learning_rate": 7.611852833265389e-05, + "loss": 3.8397, + "step": 28135 + }, + { + "epoch": 1.911944557684468, + "grad_norm": 0.1828797161579132, + "learning_rate": 7.611428183177063e-05, + "loss": 4.2017, + "step": 28140 + }, + { + "epoch": 1.9122842777551297, + "grad_norm": 0.14113403856754303, + "learning_rate": 7.611003533088735e-05, + "loss": 3.9846, + "step": 28145 + }, + { + "epoch": 1.9126239978257915, + "grad_norm": 0.21827705204486847, + "learning_rate": 7.610578883000407e-05, + "loss": 3.8319, + "step": 28150 + }, + { + "epoch": 1.9129637178964534, + "grad_norm": 0.17757326364517212, + "learning_rate": 7.610154232912082e-05, + "loss": 4.0741, + "step": 28155 + }, + { + "epoch": 1.913303437967115, + "grad_norm": 0.21422694623470306, + "learning_rate": 7.609729582823753e-05, + "loss": 4.068, + "step": 28160 + }, + { + "epoch": 1.9136431580377768, + "grad_norm": 0.17750786244869232, + "learning_rate": 7.609304932735426e-05, + "loss": 4.1113, + "step": 28165 + }, + { + "epoch": 1.9139828781084387, + "grad_norm": 0.19200639426708221, + "learning_rate": 7.6088802826471e-05, + "loss": 3.8495, + "step": 28170 + }, + { + "epoch": 1.9143225981791003, + "grad_norm": 0.2152918577194214, + "learning_rate": 7.608455632558771e-05, + "loss": 3.6414, + "step": 28175 + }, + { + "epoch": 1.9146623182497622, + "grad_norm": 0.16820789873600006, + "learning_rate": 7.608030982470444e-05, + "loss": 3.9532, + "step": 28180 + }, + { + "epoch": 1.915002038320424, + "grad_norm": 0.2005823403596878, + "learning_rate": 7.607606332382118e-05, + "loss": 3.8826, + "step": 28185 + }, + { + "epoch": 1.9153417583910857, + "grad_norm": 0.19100531935691833, + "learning_rate": 7.60718168229379e-05, + "loss": 3.8814, + "step": 28190 + }, + { + "epoch": 1.9156814784617475, + "grad_norm": 0.20539595186710358, + "learning_rate": 7.606757032205463e-05, + "loss": 3.6989, + "step": 28195 + }, + { + "epoch": 1.9160211985324094, + "grad_norm": 0.1599256843328476, + "learning_rate": 7.606332382117137e-05, + "loss": 4.2637, + "step": 28200 + }, + { + "epoch": 1.916360918603071, + "grad_norm": 0.2545475363731384, + "learning_rate": 7.605907732028808e-05, + "loss": 3.8098, + "step": 28205 + }, + { + "epoch": 1.9167006386737329, + "grad_norm": 0.12170267850160599, + "learning_rate": 7.605483081940481e-05, + "loss": 3.8355, + "step": 28210 + }, + { + "epoch": 1.9170403587443947, + "grad_norm": 0.18811744451522827, + "learning_rate": 7.605058431852154e-05, + "loss": 3.7514, + "step": 28215 + }, + { + "epoch": 1.9173800788150563, + "grad_norm": 0.1738954782485962, + "learning_rate": 7.604633781763827e-05, + "loss": 3.8473, + "step": 28220 + }, + { + "epoch": 1.9177197988857182, + "grad_norm": 0.18142423033714294, + "learning_rate": 7.6042091316755e-05, + "loss": 3.7802, + "step": 28225 + }, + { + "epoch": 1.91805951895638, + "grad_norm": 0.2605923116207123, + "learning_rate": 7.603784481587172e-05, + "loss": 3.5701, + "step": 28230 + }, + { + "epoch": 1.9183992390270417, + "grad_norm": 0.29425787925720215, + "learning_rate": 7.603359831498845e-05, + "loss": 3.851, + "step": 28235 + }, + { + "epoch": 1.9187389590977035, + "grad_norm": 0.1539212316274643, + "learning_rate": 7.602935181410518e-05, + "loss": 3.9696, + "step": 28240 + }, + { + "epoch": 1.9190786791683654, + "grad_norm": 0.26808542013168335, + "learning_rate": 7.60251053132219e-05, + "loss": 3.9217, + "step": 28245 + }, + { + "epoch": 1.919418399239027, + "grad_norm": 0.6515941023826599, + "learning_rate": 7.602085881233863e-05, + "loss": 3.8249, + "step": 28250 + }, + { + "epoch": 1.9197581193096889, + "grad_norm": 0.181682288646698, + "learning_rate": 7.601661231145536e-05, + "loss": 4.0242, + "step": 28255 + }, + { + "epoch": 1.9200978393803507, + "grad_norm": 0.15553699433803558, + "learning_rate": 7.601236581057209e-05, + "loss": 3.9072, + "step": 28260 + }, + { + "epoch": 1.9204375594510124, + "grad_norm": 0.22848089039325714, + "learning_rate": 7.600811930968882e-05, + "loss": 3.9673, + "step": 28265 + }, + { + "epoch": 1.920777279521674, + "grad_norm": 0.1700098216533661, + "learning_rate": 7.600387280880555e-05, + "loss": 3.9604, + "step": 28270 + }, + { + "epoch": 1.921116999592336, + "grad_norm": 0.19649413228034973, + "learning_rate": 7.599962630792227e-05, + "loss": 3.9426, + "step": 28275 + }, + { + "epoch": 1.9214567196629977, + "grad_norm": 0.14789098501205444, + "learning_rate": 7.5995379807039e-05, + "loss": 3.8841, + "step": 28280 + }, + { + "epoch": 1.9217964397336593, + "grad_norm": 1.4909374713897705, + "learning_rate": 7.599113330615573e-05, + "loss": 3.9234, + "step": 28285 + }, + { + "epoch": 1.9221361598043214, + "grad_norm": 0.30204498767852783, + "learning_rate": 7.598688680527246e-05, + "loss": 3.9947, + "step": 28290 + }, + { + "epoch": 1.922475879874983, + "grad_norm": 0.21168002486228943, + "learning_rate": 7.598264030438919e-05, + "loss": 3.8856, + "step": 28295 + }, + { + "epoch": 1.9228155999456447, + "grad_norm": 0.16094359755516052, + "learning_rate": 7.597839380350591e-05, + "loss": 3.9182, + "step": 28300 + }, + { + "epoch": 1.9231553200163065, + "grad_norm": 0.6102222204208374, + "learning_rate": 7.597414730262264e-05, + "loss": 3.9363, + "step": 28305 + }, + { + "epoch": 1.9234950400869684, + "grad_norm": 0.15424862504005432, + "learning_rate": 7.596990080173937e-05, + "loss": 3.9082, + "step": 28310 + }, + { + "epoch": 1.92383476015763, + "grad_norm": 0.15269304811954498, + "learning_rate": 7.59656543008561e-05, + "loss": 4.0164, + "step": 28315 + }, + { + "epoch": 1.9241744802282919, + "grad_norm": 0.1650102138519287, + "learning_rate": 7.596140779997283e-05, + "loss": 4.0425, + "step": 28320 + }, + { + "epoch": 1.9245142002989537, + "grad_norm": 0.16988715529441833, + "learning_rate": 7.595716129908955e-05, + "loss": 3.9825, + "step": 28325 + }, + { + "epoch": 1.9248539203696153, + "grad_norm": 0.23558145761489868, + "learning_rate": 7.595291479820628e-05, + "loss": 3.8519, + "step": 28330 + }, + { + "epoch": 1.9251936404402772, + "grad_norm": 0.17035606503486633, + "learning_rate": 7.594866829732301e-05, + "loss": 3.9221, + "step": 28335 + }, + { + "epoch": 1.925533360510939, + "grad_norm": 0.1563226580619812, + "learning_rate": 7.594442179643974e-05, + "loss": 3.9689, + "step": 28340 + }, + { + "epoch": 1.9258730805816007, + "grad_norm": 0.18391691148281097, + "learning_rate": 7.594017529555647e-05, + "loss": 3.9295, + "step": 28345 + }, + { + "epoch": 1.9262128006522625, + "grad_norm": 0.24077385663986206, + "learning_rate": 7.59359287946732e-05, + "loss": 3.7646, + "step": 28350 + }, + { + "epoch": 1.9265525207229244, + "grad_norm": 0.18604795634746552, + "learning_rate": 7.593168229378992e-05, + "loss": 3.6653, + "step": 28355 + }, + { + "epoch": 1.926892240793586, + "grad_norm": 0.1942649483680725, + "learning_rate": 7.592743579290665e-05, + "loss": 3.7061, + "step": 28360 + }, + { + "epoch": 1.9272319608642479, + "grad_norm": 0.17356139421463013, + "learning_rate": 7.592318929202338e-05, + "loss": 3.8926, + "step": 28365 + }, + { + "epoch": 1.9275716809349097, + "grad_norm": 0.14925870299339294, + "learning_rate": 7.59189427911401e-05, + "loss": 3.7836, + "step": 28370 + }, + { + "epoch": 1.9279114010055713, + "grad_norm": 0.18402747809886932, + "learning_rate": 7.591469629025683e-05, + "loss": 3.8598, + "step": 28375 + }, + { + "epoch": 1.9282511210762332, + "grad_norm": 0.18687181174755096, + "learning_rate": 7.591044978937356e-05, + "loss": 3.9218, + "step": 28380 + }, + { + "epoch": 1.928590841146895, + "grad_norm": 0.13704098761081696, + "learning_rate": 7.590620328849029e-05, + "loss": 3.7285, + "step": 28385 + }, + { + "epoch": 1.9289305612175567, + "grad_norm": 0.3879941701889038, + "learning_rate": 7.590195678760702e-05, + "loss": 4.0019, + "step": 28390 + }, + { + "epoch": 1.9292702812882185, + "grad_norm": 0.16433273255825043, + "learning_rate": 7.589771028672375e-05, + "loss": 3.9535, + "step": 28395 + }, + { + "epoch": 1.9296100013588804, + "grad_norm": 0.22391854226589203, + "learning_rate": 7.589346378584047e-05, + "loss": 4.019, + "step": 28400 + }, + { + "epoch": 1.929949721429542, + "grad_norm": 0.21234647929668427, + "learning_rate": 7.58892172849572e-05, + "loss": 3.8815, + "step": 28405 + }, + { + "epoch": 1.9302894415002039, + "grad_norm": 0.17976419627666473, + "learning_rate": 7.588497078407393e-05, + "loss": 3.8199, + "step": 28410 + }, + { + "epoch": 1.9306291615708657, + "grad_norm": 0.5078787803649902, + "learning_rate": 7.588072428319065e-05, + "loss": 3.7314, + "step": 28415 + }, + { + "epoch": 1.9309688816415274, + "grad_norm": 0.415224552154541, + "learning_rate": 7.587647778230739e-05, + "loss": 4.0452, + "step": 28420 + }, + { + "epoch": 1.9313086017121892, + "grad_norm": 0.18565645813941956, + "learning_rate": 7.587223128142411e-05, + "loss": 3.9368, + "step": 28425 + }, + { + "epoch": 1.931648321782851, + "grad_norm": 0.26604071259498596, + "learning_rate": 7.586798478054083e-05, + "loss": 3.7558, + "step": 28430 + }, + { + "epoch": 1.9319880418535127, + "grad_norm": 0.20461969077587128, + "learning_rate": 7.586373827965757e-05, + "loss": 3.8349, + "step": 28435 + }, + { + "epoch": 1.9323277619241743, + "grad_norm": 0.17417210340499878, + "learning_rate": 7.58594917787743e-05, + "loss": 3.7898, + "step": 28440 + }, + { + "epoch": 1.9326674819948364, + "grad_norm": 0.1681414693593979, + "learning_rate": 7.585524527789101e-05, + "loss": 3.769, + "step": 28445 + }, + { + "epoch": 1.933007202065498, + "grad_norm": 0.22479256987571716, + "learning_rate": 7.585099877700775e-05, + "loss": 3.8817, + "step": 28450 + }, + { + "epoch": 1.9333469221361597, + "grad_norm": 0.16596832871437073, + "learning_rate": 7.584675227612448e-05, + "loss": 3.9302, + "step": 28455 + }, + { + "epoch": 1.9336866422068217, + "grad_norm": 0.14525513350963593, + "learning_rate": 7.58425057752412e-05, + "loss": 4.0555, + "step": 28460 + }, + { + "epoch": 1.9340263622774834, + "grad_norm": 0.19593417644500732, + "learning_rate": 7.583825927435794e-05, + "loss": 3.9594, + "step": 28465 + }, + { + "epoch": 1.934366082348145, + "grad_norm": 0.15694987773895264, + "learning_rate": 7.583401277347467e-05, + "loss": 3.7313, + "step": 28470 + }, + { + "epoch": 1.9347058024188069, + "grad_norm": 0.2306661456823349, + "learning_rate": 7.582976627259138e-05, + "loss": 4.1184, + "step": 28475 + }, + { + "epoch": 1.9350455224894687, + "grad_norm": 0.14993150532245636, + "learning_rate": 7.582551977170812e-05, + "loss": 3.9693, + "step": 28480 + }, + { + "epoch": 1.9353852425601303, + "grad_norm": 0.19698427617549896, + "learning_rate": 7.582127327082484e-05, + "loss": 3.8219, + "step": 28485 + }, + { + "epoch": 1.9357249626307922, + "grad_norm": 0.35421696305274963, + "learning_rate": 7.581702676994157e-05, + "loss": 3.7024, + "step": 28490 + }, + { + "epoch": 1.936064682701454, + "grad_norm": 0.2005046159029007, + "learning_rate": 7.581278026905831e-05, + "loss": 3.9377, + "step": 28495 + }, + { + "epoch": 1.9364044027721157, + "grad_norm": 0.2570323348045349, + "learning_rate": 7.580853376817502e-05, + "loss": 3.9278, + "step": 28500 + }, + { + "epoch": 1.9367441228427775, + "grad_norm": 0.1934492290019989, + "learning_rate": 7.580428726729175e-05, + "loss": 3.916, + "step": 28505 + }, + { + "epoch": 1.9370838429134394, + "grad_norm": 0.1637314110994339, + "learning_rate": 7.580004076640849e-05, + "loss": 3.7532, + "step": 28510 + }, + { + "epoch": 1.937423562984101, + "grad_norm": 0.1614295095205307, + "learning_rate": 7.57957942655252e-05, + "loss": 4.0572, + "step": 28515 + }, + { + "epoch": 1.9377632830547629, + "grad_norm": 0.19173380732536316, + "learning_rate": 7.579154776464193e-05, + "loss": 3.7465, + "step": 28520 + }, + { + "epoch": 1.9381030031254247, + "grad_norm": 0.2315157800912857, + "learning_rate": 7.578730126375868e-05, + "loss": 3.9825, + "step": 28525 + }, + { + "epoch": 1.9384427231960863, + "grad_norm": 0.1621650755405426, + "learning_rate": 7.578305476287539e-05, + "loss": 3.9392, + "step": 28530 + }, + { + "epoch": 1.9387824432667482, + "grad_norm": 0.16905152797698975, + "learning_rate": 7.577880826199212e-05, + "loss": 3.8929, + "step": 28535 + }, + { + "epoch": 1.93912216333741, + "grad_norm": 0.20569667220115662, + "learning_rate": 7.577456176110886e-05, + "loss": 4.1916, + "step": 28540 + }, + { + "epoch": 1.9394618834080717, + "grad_norm": 0.20626264810562134, + "learning_rate": 7.577031526022557e-05, + "loss": 3.8966, + "step": 28545 + }, + { + "epoch": 1.9398016034787335, + "grad_norm": 0.174543559551239, + "learning_rate": 7.57660687593423e-05, + "loss": 3.927, + "step": 28550 + }, + { + "epoch": 1.9401413235493954, + "grad_norm": 0.15784618258476257, + "learning_rate": 7.576182225845904e-05, + "loss": 3.6855, + "step": 28555 + }, + { + "epoch": 1.940481043620057, + "grad_norm": 0.3341679871082306, + "learning_rate": 7.575757575757576e-05, + "loss": 4.0198, + "step": 28560 + }, + { + "epoch": 1.9408207636907189, + "grad_norm": 0.23390629887580872, + "learning_rate": 7.575332925669249e-05, + "loss": 3.7836, + "step": 28565 + }, + { + "epoch": 1.9411604837613807, + "grad_norm": 0.574409544467926, + "learning_rate": 7.574908275580921e-05, + "loss": 3.9034, + "step": 28570 + }, + { + "epoch": 1.9415002038320424, + "grad_norm": 0.18634304404258728, + "learning_rate": 7.574483625492594e-05, + "loss": 3.8669, + "step": 28575 + }, + { + "epoch": 1.9418399239027042, + "grad_norm": 0.1903749257326126, + "learning_rate": 7.574058975404267e-05, + "loss": 3.9099, + "step": 28580 + }, + { + "epoch": 1.942179643973366, + "grad_norm": 0.2019668072462082, + "learning_rate": 7.57363432531594e-05, + "loss": 4.0368, + "step": 28585 + }, + { + "epoch": 1.9425193640440277, + "grad_norm": 0.17365911602973938, + "learning_rate": 7.573209675227613e-05, + "loss": 3.9398, + "step": 28590 + }, + { + "epoch": 1.9428590841146895, + "grad_norm": 0.21469604969024658, + "learning_rate": 7.572785025139285e-05, + "loss": 3.6867, + "step": 28595 + }, + { + "epoch": 1.9431988041853514, + "grad_norm": 0.33557629585266113, + "learning_rate": 7.572360375050958e-05, + "loss": 3.72, + "step": 28600 + }, + { + "epoch": 1.943538524256013, + "grad_norm": 0.1792975515127182, + "learning_rate": 7.571935724962631e-05, + "loss": 3.9011, + "step": 28605 + }, + { + "epoch": 1.9438782443266747, + "grad_norm": 0.18394911289215088, + "learning_rate": 7.571511074874304e-05, + "loss": 3.9018, + "step": 28610 + }, + { + "epoch": 1.9442179643973367, + "grad_norm": 0.17852739989757538, + "learning_rate": 7.571086424785977e-05, + "loss": 3.8287, + "step": 28615 + }, + { + "epoch": 1.9445576844679984, + "grad_norm": 0.23222753405570984, + "learning_rate": 7.57066177469765e-05, + "loss": 3.3701, + "step": 28620 + }, + { + "epoch": 1.94489740453866, + "grad_norm": 1.2480146884918213, + "learning_rate": 7.570237124609322e-05, + "loss": 3.7037, + "step": 28625 + }, + { + "epoch": 1.945237124609322, + "grad_norm": 0.1594918817281723, + "learning_rate": 7.569812474520995e-05, + "loss": 3.8822, + "step": 28630 + }, + { + "epoch": 1.9455768446799837, + "grad_norm": 0.6279609203338623, + "learning_rate": 7.569387824432668e-05, + "loss": 3.7909, + "step": 28635 + }, + { + "epoch": 1.9459165647506453, + "grad_norm": 0.15612271428108215, + "learning_rate": 7.56896317434434e-05, + "loss": 3.6813, + "step": 28640 + }, + { + "epoch": 1.9462562848213072, + "grad_norm": 0.15630017220973969, + "learning_rate": 7.568538524256013e-05, + "loss": 3.8183, + "step": 28645 + }, + { + "epoch": 1.946596004891969, + "grad_norm": 0.16744762659072876, + "learning_rate": 7.568113874167686e-05, + "loss": 3.6298, + "step": 28650 + }, + { + "epoch": 1.9469357249626307, + "grad_norm": 0.1678406149148941, + "learning_rate": 7.567689224079359e-05, + "loss": 3.9312, + "step": 28655 + }, + { + "epoch": 1.9472754450332925, + "grad_norm": 0.2061745524406433, + "learning_rate": 7.567264573991032e-05, + "loss": 4.1418, + "step": 28660 + }, + { + "epoch": 1.9476151651039544, + "grad_norm": 0.20069412887096405, + "learning_rate": 7.566839923902705e-05, + "loss": 3.6094, + "step": 28665 + }, + { + "epoch": 1.947954885174616, + "grad_norm": 0.30117034912109375, + "learning_rate": 7.566415273814377e-05, + "loss": 3.8877, + "step": 28670 + }, + { + "epoch": 1.9482946052452779, + "grad_norm": 0.21468403935432434, + "learning_rate": 7.56599062372605e-05, + "loss": 3.8385, + "step": 28675 + }, + { + "epoch": 1.9486343253159397, + "grad_norm": 0.173647940158844, + "learning_rate": 7.565565973637723e-05, + "loss": 4.0412, + "step": 28680 + }, + { + "epoch": 1.9489740453866014, + "grad_norm": 0.18057386577129364, + "learning_rate": 7.565141323549396e-05, + "loss": 3.9703, + "step": 28685 + }, + { + "epoch": 1.9493137654572632, + "grad_norm": 0.15432140231132507, + "learning_rate": 7.564716673461069e-05, + "loss": 3.8456, + "step": 28690 + }, + { + "epoch": 1.949653485527925, + "grad_norm": 0.16141648590564728, + "learning_rate": 7.564292023372741e-05, + "loss": 3.9864, + "step": 28695 + }, + { + "epoch": 1.9499932055985867, + "grad_norm": 0.25995954871177673, + "learning_rate": 7.563867373284414e-05, + "loss": 3.8852, + "step": 28700 + }, + { + "epoch": 1.9503329256692485, + "grad_norm": 0.22354675829410553, + "learning_rate": 7.563442723196087e-05, + "loss": 3.7647, + "step": 28705 + }, + { + "epoch": 1.9506726457399104, + "grad_norm": 0.19499213993549347, + "learning_rate": 7.56301807310776e-05, + "loss": 4.0943, + "step": 28710 + }, + { + "epoch": 1.951012365810572, + "grad_norm": 0.18912269175052643, + "learning_rate": 7.562593423019433e-05, + "loss": 3.8971, + "step": 28715 + }, + { + "epoch": 1.9513520858812339, + "grad_norm": 0.1645275354385376, + "learning_rate": 7.562168772931105e-05, + "loss": 3.8716, + "step": 28720 + }, + { + "epoch": 1.9516918059518957, + "grad_norm": 0.2009490579366684, + "learning_rate": 7.561744122842778e-05, + "loss": 3.9142, + "step": 28725 + }, + { + "epoch": 1.9520315260225574, + "grad_norm": 0.2077329009771347, + "learning_rate": 7.561319472754451e-05, + "loss": 4.0766, + "step": 28730 + }, + { + "epoch": 1.9523712460932192, + "grad_norm": 0.1625940203666687, + "learning_rate": 7.560894822666124e-05, + "loss": 4.0909, + "step": 28735 + }, + { + "epoch": 1.952710966163881, + "grad_norm": 0.18995901942253113, + "learning_rate": 7.560470172577797e-05, + "loss": 3.9231, + "step": 28740 + }, + { + "epoch": 1.9530506862345427, + "grad_norm": 0.1825612336397171, + "learning_rate": 7.56004552248947e-05, + "loss": 4.0585, + "step": 28745 + }, + { + "epoch": 1.9533904063052046, + "grad_norm": 0.15993399918079376, + "learning_rate": 7.559620872401142e-05, + "loss": 3.7725, + "step": 28750 + }, + { + "epoch": 1.9537301263758664, + "grad_norm": 0.20699810981750488, + "learning_rate": 7.559196222312815e-05, + "loss": 3.8649, + "step": 28755 + }, + { + "epoch": 1.954069846446528, + "grad_norm": 0.17233701050281525, + "learning_rate": 7.558771572224488e-05, + "loss": 4.1081, + "step": 28760 + }, + { + "epoch": 1.95440956651719, + "grad_norm": 0.2557596266269684, + "learning_rate": 7.55834692213616e-05, + "loss": 4.0374, + "step": 28765 + }, + { + "epoch": 1.9547492865878517, + "grad_norm": 0.1986154317855835, + "learning_rate": 7.557922272047832e-05, + "loss": 4.0581, + "step": 28770 + }, + { + "epoch": 1.9550890066585134, + "grad_norm": 0.21419759094715118, + "learning_rate": 7.557497621959506e-05, + "loss": 3.8865, + "step": 28775 + }, + { + "epoch": 1.955428726729175, + "grad_norm": 0.1564425677061081, + "learning_rate": 7.557072971871179e-05, + "loss": 3.9438, + "step": 28780 + }, + { + "epoch": 1.955768446799837, + "grad_norm": 0.5971413850784302, + "learning_rate": 7.55664832178285e-05, + "loss": 3.9651, + "step": 28785 + }, + { + "epoch": 1.9561081668704987, + "grad_norm": 0.3650059401988983, + "learning_rate": 7.556223671694525e-05, + "loss": 4.0239, + "step": 28790 + }, + { + "epoch": 1.9564478869411603, + "grad_norm": 0.27154508233070374, + "learning_rate": 7.555799021606197e-05, + "loss": 3.7856, + "step": 28795 + }, + { + "epoch": 1.9567876070118224, + "grad_norm": 1.1051714420318604, + "learning_rate": 7.555374371517869e-05, + "loss": 4.0547, + "step": 28800 + }, + { + "epoch": 1.957127327082484, + "grad_norm": 0.3950001299381256, + "learning_rate": 7.554949721429543e-05, + "loss": 3.5807, + "step": 28805 + }, + { + "epoch": 1.9574670471531457, + "grad_norm": 0.5057965517044067, + "learning_rate": 7.554525071341216e-05, + "loss": 3.9705, + "step": 28810 + }, + { + "epoch": 1.9578067672238075, + "grad_norm": 0.22920554876327515, + "learning_rate": 7.554100421252887e-05, + "loss": 3.8461, + "step": 28815 + }, + { + "epoch": 1.9581464872944694, + "grad_norm": 0.17659617960453033, + "learning_rate": 7.553675771164561e-05, + "loss": 3.8774, + "step": 28820 + }, + { + "epoch": 1.958486207365131, + "grad_norm": 0.15254487097263336, + "learning_rate": 7.553251121076234e-05, + "loss": 3.9112, + "step": 28825 + }, + { + "epoch": 1.9588259274357929, + "grad_norm": 0.21564850211143494, + "learning_rate": 7.552826470987906e-05, + "loss": 3.7513, + "step": 28830 + }, + { + "epoch": 1.9591656475064547, + "grad_norm": 0.2114938646554947, + "learning_rate": 7.55240182089958e-05, + "loss": 3.6898, + "step": 28835 + }, + { + "epoch": 1.9595053675771164, + "grad_norm": 0.19365867972373962, + "learning_rate": 7.551977170811251e-05, + "loss": 3.9097, + "step": 28840 + }, + { + "epoch": 1.9598450876477782, + "grad_norm": 0.2042960673570633, + "learning_rate": 7.551552520722924e-05, + "loss": 3.938, + "step": 28845 + }, + { + "epoch": 1.96018480771844, + "grad_norm": 1.962403416633606, + "learning_rate": 7.551127870634598e-05, + "loss": 3.8731, + "step": 28850 + }, + { + "epoch": 1.9605245277891017, + "grad_norm": 0.1525605320930481, + "learning_rate": 7.55070322054627e-05, + "loss": 3.9347, + "step": 28855 + }, + { + "epoch": 1.9608642478597635, + "grad_norm": 0.2115081250667572, + "learning_rate": 7.550278570457942e-05, + "loss": 4.1186, + "step": 28860 + }, + { + "epoch": 1.9612039679304254, + "grad_norm": 0.1531156301498413, + "learning_rate": 7.549853920369617e-05, + "loss": 3.8669, + "step": 28865 + }, + { + "epoch": 1.961543688001087, + "grad_norm": 0.18353340029716492, + "learning_rate": 7.549429270281288e-05, + "loss": 3.9375, + "step": 28870 + }, + { + "epoch": 1.9618834080717489, + "grad_norm": 0.17165622115135193, + "learning_rate": 7.549004620192961e-05, + "loss": 4.155, + "step": 28875 + }, + { + "epoch": 1.9622231281424107, + "grad_norm": 0.1533842831850052, + "learning_rate": 7.548579970104635e-05, + "loss": 3.7737, + "step": 28880 + }, + { + "epoch": 1.9625628482130724, + "grad_norm": 0.8411090970039368, + "learning_rate": 7.548155320016306e-05, + "loss": 3.668, + "step": 28885 + }, + { + "epoch": 1.9629025682837342, + "grad_norm": 0.16673916578292847, + "learning_rate": 7.547730669927979e-05, + "loss": 3.8621, + "step": 28890 + }, + { + "epoch": 1.963242288354396, + "grad_norm": 0.2007731944322586, + "learning_rate": 7.547306019839653e-05, + "loss": 3.8968, + "step": 28895 + }, + { + "epoch": 1.9635820084250577, + "grad_norm": 0.3047294616699219, + "learning_rate": 7.546881369751325e-05, + "loss": 3.7774, + "step": 28900 + }, + { + "epoch": 1.9639217284957196, + "grad_norm": 0.24949844181537628, + "learning_rate": 7.546456719662998e-05, + "loss": 4.0364, + "step": 28905 + }, + { + "epoch": 1.9642614485663814, + "grad_norm": 0.17516323924064636, + "learning_rate": 7.54603206957467e-05, + "loss": 3.6837, + "step": 28910 + }, + { + "epoch": 1.964601168637043, + "grad_norm": 0.1515410989522934, + "learning_rate": 7.545607419486343e-05, + "loss": 3.8065, + "step": 28915 + }, + { + "epoch": 1.964940888707705, + "grad_norm": 0.1432599574327469, + "learning_rate": 7.545182769398016e-05, + "loss": 4.0156, + "step": 28920 + }, + { + "epoch": 1.9652806087783667, + "grad_norm": 0.1828065812587738, + "learning_rate": 7.544758119309689e-05, + "loss": 3.9206, + "step": 28925 + }, + { + "epoch": 1.9656203288490284, + "grad_norm": 0.17855244874954224, + "learning_rate": 7.544333469221362e-05, + "loss": 3.9816, + "step": 28930 + }, + { + "epoch": 1.9659600489196902, + "grad_norm": 0.17481978237628937, + "learning_rate": 7.543908819133034e-05, + "loss": 3.8134, + "step": 28935 + }, + { + "epoch": 1.966299768990352, + "grad_norm": 0.14466944336891174, + "learning_rate": 7.543484169044707e-05, + "loss": 3.9186, + "step": 28940 + }, + { + "epoch": 1.9666394890610137, + "grad_norm": 0.16603897511959076, + "learning_rate": 7.54305951895638e-05, + "loss": 4.018, + "step": 28945 + }, + { + "epoch": 1.9669792091316753, + "grad_norm": 0.22208024561405182, + "learning_rate": 7.542634868868053e-05, + "loss": 4.1084, + "step": 28950 + }, + { + "epoch": 1.9673189292023374, + "grad_norm": 0.21492783725261688, + "learning_rate": 7.542210218779726e-05, + "loss": 3.803, + "step": 28955 + }, + { + "epoch": 1.967658649272999, + "grad_norm": 0.16949887573719025, + "learning_rate": 7.541785568691398e-05, + "loss": 3.6145, + "step": 28960 + }, + { + "epoch": 1.9679983693436607, + "grad_norm": 0.21943798661231995, + "learning_rate": 7.541360918603071e-05, + "loss": 4.0773, + "step": 28965 + }, + { + "epoch": 1.9683380894143228, + "grad_norm": 0.19466625154018402, + "learning_rate": 7.540936268514744e-05, + "loss": 3.9082, + "step": 28970 + }, + { + "epoch": 1.9686778094849844, + "grad_norm": 0.19159992039203644, + "learning_rate": 7.540511618426417e-05, + "loss": 3.7375, + "step": 28975 + }, + { + "epoch": 1.969017529555646, + "grad_norm": 0.1805427223443985, + "learning_rate": 7.54008696833809e-05, + "loss": 3.7456, + "step": 28980 + }, + { + "epoch": 1.9693572496263079, + "grad_norm": 0.20300701260566711, + "learning_rate": 7.539662318249762e-05, + "loss": 3.8153, + "step": 28985 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.1711605340242386, + "learning_rate": 7.539237668161435e-05, + "loss": 3.8414, + "step": 28990 + }, + { + "epoch": 1.9700366897676314, + "grad_norm": 0.15425744652748108, + "learning_rate": 7.538813018073108e-05, + "loss": 3.6377, + "step": 28995 + }, + { + "epoch": 1.9703764098382932, + "grad_norm": 0.1586846113204956, + "learning_rate": 7.538388367984781e-05, + "loss": 4.0355, + "step": 29000 + }, + { + "epoch": 1.970716129908955, + "grad_norm": 0.16863371431827545, + "learning_rate": 7.537963717896454e-05, + "loss": 3.8204, + "step": 29005 + }, + { + "epoch": 1.9710558499796167, + "grad_norm": 0.32749268412590027, + "learning_rate": 7.537539067808126e-05, + "loss": 4.0279, + "step": 29010 + }, + { + "epoch": 1.9713955700502785, + "grad_norm": 0.177074134349823, + "learning_rate": 7.537114417719799e-05, + "loss": 3.9316, + "step": 29015 + }, + { + "epoch": 1.9717352901209404, + "grad_norm": 0.17786149680614471, + "learning_rate": 7.536689767631472e-05, + "loss": 3.9019, + "step": 29020 + }, + { + "epoch": 1.972075010191602, + "grad_norm": 0.19821983575820923, + "learning_rate": 7.536265117543145e-05, + "loss": 3.8532, + "step": 29025 + }, + { + "epoch": 1.9724147302622639, + "grad_norm": 0.2140025645494461, + "learning_rate": 7.535840467454818e-05, + "loss": 4.0828, + "step": 29030 + }, + { + "epoch": 1.9727544503329257, + "grad_norm": 0.3477388024330139, + "learning_rate": 7.53541581736649e-05, + "loss": 3.8458, + "step": 29035 + }, + { + "epoch": 1.9730941704035874, + "grad_norm": 0.17392589151859283, + "learning_rate": 7.534991167278163e-05, + "loss": 3.7742, + "step": 29040 + }, + { + "epoch": 1.9734338904742492, + "grad_norm": 0.2178051471710205, + "learning_rate": 7.534566517189836e-05, + "loss": 3.7284, + "step": 29045 + }, + { + "epoch": 1.973773610544911, + "grad_norm": 0.19181449711322784, + "learning_rate": 7.534141867101509e-05, + "loss": 3.852, + "step": 29050 + }, + { + "epoch": 1.9741133306155727, + "grad_norm": 0.21679338812828064, + "learning_rate": 7.533717217013182e-05, + "loss": 4.0727, + "step": 29055 + }, + { + "epoch": 1.9744530506862346, + "grad_norm": 0.171362042427063, + "learning_rate": 7.533292566924854e-05, + "loss": 3.8534, + "step": 29060 + }, + { + "epoch": 1.9747927707568964, + "grad_norm": 0.18933825194835663, + "learning_rate": 7.532867916836527e-05, + "loss": 4.1266, + "step": 29065 + }, + { + "epoch": 1.975132490827558, + "grad_norm": 0.18015968799591064, + "learning_rate": 7.5324432667482e-05, + "loss": 3.8916, + "step": 29070 + }, + { + "epoch": 1.97547221089822, + "grad_norm": 0.2384442538022995, + "learning_rate": 7.532018616659873e-05, + "loss": 3.7611, + "step": 29075 + }, + { + "epoch": 1.9758119309688817, + "grad_norm": 0.23865869641304016, + "learning_rate": 7.531593966571546e-05, + "loss": 3.729, + "step": 29080 + }, + { + "epoch": 1.9761516510395434, + "grad_norm": 0.18216106295585632, + "learning_rate": 7.531169316483218e-05, + "loss": 3.9772, + "step": 29085 + }, + { + "epoch": 1.9764913711102052, + "grad_norm": 0.3407329320907593, + "learning_rate": 7.530744666394891e-05, + "loss": 3.9104, + "step": 29090 + }, + { + "epoch": 1.976831091180867, + "grad_norm": 0.15204742550849915, + "learning_rate": 7.530320016306564e-05, + "loss": 4.1264, + "step": 29095 + }, + { + "epoch": 1.9771708112515287, + "grad_norm": 0.21321195363998413, + "learning_rate": 7.529895366218237e-05, + "loss": 3.5481, + "step": 29100 + }, + { + "epoch": 1.9775105313221906, + "grad_norm": 0.15588432550430298, + "learning_rate": 7.52947071612991e-05, + "loss": 3.9693, + "step": 29105 + }, + { + "epoch": 1.9778502513928524, + "grad_norm": 0.2210242748260498, + "learning_rate": 7.529046066041581e-05, + "loss": 3.7492, + "step": 29110 + }, + { + "epoch": 1.978189971463514, + "grad_norm": 0.14939425885677338, + "learning_rate": 7.528621415953255e-05, + "loss": 3.7852, + "step": 29115 + }, + { + "epoch": 1.9785296915341757, + "grad_norm": 0.1740017533302307, + "learning_rate": 7.528196765864928e-05, + "loss": 3.914, + "step": 29120 + }, + { + "epoch": 1.9788694116048378, + "grad_norm": 0.1691030114889145, + "learning_rate": 7.5277721157766e-05, + "loss": 4.0772, + "step": 29125 + }, + { + "epoch": 1.9792091316754994, + "grad_norm": 0.6876809597015381, + "learning_rate": 7.527347465688274e-05, + "loss": 3.9235, + "step": 29130 + }, + { + "epoch": 1.979548851746161, + "grad_norm": 0.16820546984672546, + "learning_rate": 7.526922815599946e-05, + "loss": 3.9099, + "step": 29135 + }, + { + "epoch": 1.979888571816823, + "grad_norm": 0.19930388033390045, + "learning_rate": 7.526498165511618e-05, + "loss": 4.0105, + "step": 29140 + }, + { + "epoch": 1.9802282918874847, + "grad_norm": 0.16012413799762726, + "learning_rate": 7.526073515423292e-05, + "loss": 3.8592, + "step": 29145 + }, + { + "epoch": 1.9805680119581464, + "grad_norm": 0.16773782670497894, + "learning_rate": 7.525648865334965e-05, + "loss": 3.8575, + "step": 29150 + }, + { + "epoch": 1.9809077320288082, + "grad_norm": 0.15467670559883118, + "learning_rate": 7.525224215246636e-05, + "loss": 3.8103, + "step": 29155 + }, + { + "epoch": 1.98124745209947, + "grad_norm": 0.23219053447246552, + "learning_rate": 7.52479956515831e-05, + "loss": 3.8471, + "step": 29160 + }, + { + "epoch": 1.9815871721701317, + "grad_norm": 0.3983686566352844, + "learning_rate": 7.524374915069983e-05, + "loss": 3.8597, + "step": 29165 + }, + { + "epoch": 1.9819268922407935, + "grad_norm": 0.4048886001110077, + "learning_rate": 7.523950264981655e-05, + "loss": 3.9894, + "step": 29170 + }, + { + "epoch": 1.9822666123114554, + "grad_norm": 0.14300625026226044, + "learning_rate": 7.523525614893329e-05, + "loss": 4.0042, + "step": 29175 + }, + { + "epoch": 1.982606332382117, + "grad_norm": 0.1831045150756836, + "learning_rate": 7.523100964805002e-05, + "loss": 3.8204, + "step": 29180 + }, + { + "epoch": 1.9829460524527789, + "grad_norm": 0.14238326251506805, + "learning_rate": 7.522676314716673e-05, + "loss": 3.9191, + "step": 29185 + }, + { + "epoch": 1.9832857725234407, + "grad_norm": 0.18976551294326782, + "learning_rate": 7.522251664628347e-05, + "loss": 3.9912, + "step": 29190 + }, + { + "epoch": 1.9836254925941024, + "grad_norm": 0.1751544028520584, + "learning_rate": 7.521827014540019e-05, + "loss": 4.002, + "step": 29195 + }, + { + "epoch": 1.9839652126647642, + "grad_norm": 0.16446055471897125, + "learning_rate": 7.521402364451692e-05, + "loss": 4.1016, + "step": 29200 + }, + { + "epoch": 1.984304932735426, + "grad_norm": 0.17112456262111664, + "learning_rate": 7.520977714363366e-05, + "loss": 3.8937, + "step": 29205 + }, + { + "epoch": 1.9846446528060877, + "grad_norm": 0.17526963353157043, + "learning_rate": 7.520553064275037e-05, + "loss": 3.6733, + "step": 29210 + }, + { + "epoch": 1.9849843728767496, + "grad_norm": 0.18425685167312622, + "learning_rate": 7.52012841418671e-05, + "loss": 3.9485, + "step": 29215 + }, + { + "epoch": 1.9853240929474114, + "grad_norm": 0.2309459149837494, + "learning_rate": 7.519703764098384e-05, + "loss": 3.8701, + "step": 29220 + }, + { + "epoch": 1.985663813018073, + "grad_norm": 0.17401134967803955, + "learning_rate": 7.519279114010056e-05, + "loss": 3.9747, + "step": 29225 + }, + { + "epoch": 1.986003533088735, + "grad_norm": 0.34715521335601807, + "learning_rate": 7.518854463921728e-05, + "loss": 3.7779, + "step": 29230 + }, + { + "epoch": 1.9863432531593967, + "grad_norm": 0.23896299302577972, + "learning_rate": 7.518429813833402e-05, + "loss": 3.8424, + "step": 29235 + }, + { + "epoch": 1.9866829732300584, + "grad_norm": 0.3459424078464508, + "learning_rate": 7.518005163745074e-05, + "loss": 3.6811, + "step": 29240 + }, + { + "epoch": 1.9870226933007202, + "grad_norm": 0.482655793428421, + "learning_rate": 7.517580513656747e-05, + "loss": 3.9797, + "step": 29245 + }, + { + "epoch": 1.987362413371382, + "grad_norm": 0.45322373509407043, + "learning_rate": 7.517155863568421e-05, + "loss": 4.146, + "step": 29250 + }, + { + "epoch": 1.9877021334420437, + "grad_norm": 0.22944483160972595, + "learning_rate": 7.516731213480092e-05, + "loss": 3.8581, + "step": 29255 + }, + { + "epoch": 1.9880418535127056, + "grad_norm": 0.256619930267334, + "learning_rate": 7.516306563391765e-05, + "loss": 4.089, + "step": 29260 + }, + { + "epoch": 1.9883815735833674, + "grad_norm": 0.794204592704773, + "learning_rate": 7.515881913303438e-05, + "loss": 3.8387, + "step": 29265 + }, + { + "epoch": 1.988721293654029, + "grad_norm": 0.1650339514017105, + "learning_rate": 7.515457263215111e-05, + "loss": 3.7986, + "step": 29270 + }, + { + "epoch": 1.989061013724691, + "grad_norm": 0.19367657601833344, + "learning_rate": 7.515032613126784e-05, + "loss": 3.7254, + "step": 29275 + }, + { + "epoch": 1.9894007337953528, + "grad_norm": 0.14963364601135254, + "learning_rate": 7.514607963038456e-05, + "loss": 3.7913, + "step": 29280 + }, + { + "epoch": 1.9897404538660144, + "grad_norm": 0.2537325620651245, + "learning_rate": 7.514183312950129e-05, + "loss": 3.977, + "step": 29285 + }, + { + "epoch": 1.990080173936676, + "grad_norm": 7.223372936248779, + "learning_rate": 7.513758662861802e-05, + "loss": 3.784, + "step": 29290 + }, + { + "epoch": 1.990419894007338, + "grad_norm": 0.16349056363105774, + "learning_rate": 7.513334012773475e-05, + "loss": 3.9683, + "step": 29295 + }, + { + "epoch": 1.9907596140779997, + "grad_norm": 0.1990077793598175, + "learning_rate": 7.512909362685148e-05, + "loss": 4.1468, + "step": 29300 + }, + { + "epoch": 1.9910993341486614, + "grad_norm": 0.19930794835090637, + "learning_rate": 7.51248471259682e-05, + "loss": 3.8559, + "step": 29305 + }, + { + "epoch": 1.9914390542193234, + "grad_norm": 0.1327536255121231, + "learning_rate": 7.512060062508493e-05, + "loss": 3.9432, + "step": 29310 + }, + { + "epoch": 1.991778774289985, + "grad_norm": 0.15193022787570953, + "learning_rate": 7.511635412420166e-05, + "loss": 3.8336, + "step": 29315 + }, + { + "epoch": 1.9921184943606467, + "grad_norm": 0.14728865027427673, + "learning_rate": 7.511210762331839e-05, + "loss": 3.8643, + "step": 29320 + }, + { + "epoch": 1.9924582144313085, + "grad_norm": 0.20128199458122253, + "learning_rate": 7.510786112243512e-05, + "loss": 3.9977, + "step": 29325 + }, + { + "epoch": 1.9927979345019704, + "grad_norm": 0.16108326613903046, + "learning_rate": 7.510361462155184e-05, + "loss": 3.6128, + "step": 29330 + }, + { + "epoch": 1.993137654572632, + "grad_norm": 0.16531729698181152, + "learning_rate": 7.509936812066857e-05, + "loss": 3.9088, + "step": 29335 + }, + { + "epoch": 1.9934773746432939, + "grad_norm": 1.185957908630371, + "learning_rate": 7.50951216197853e-05, + "loss": 4.0379, + "step": 29340 + }, + { + "epoch": 1.9938170947139557, + "grad_norm": 0.19775107502937317, + "learning_rate": 7.509087511890203e-05, + "loss": 3.7136, + "step": 29345 + }, + { + "epoch": 1.9941568147846174, + "grad_norm": 0.1505029797554016, + "learning_rate": 7.508662861801876e-05, + "loss": 4.1196, + "step": 29350 + }, + { + "epoch": 1.9944965348552792, + "grad_norm": 0.19217808544635773, + "learning_rate": 7.508238211713548e-05, + "loss": 3.7892, + "step": 29355 + }, + { + "epoch": 1.994836254925941, + "grad_norm": 0.2800290584564209, + "learning_rate": 7.507813561625221e-05, + "loss": 4.0082, + "step": 29360 + }, + { + "epoch": 1.9951759749966027, + "grad_norm": 0.1712721884250641, + "learning_rate": 7.507388911536894e-05, + "loss": 3.9143, + "step": 29365 + }, + { + "epoch": 1.9955156950672646, + "grad_norm": 0.15520340204238892, + "learning_rate": 7.506964261448567e-05, + "loss": 4.0002, + "step": 29370 + }, + { + "epoch": 1.9958554151379264, + "grad_norm": 0.48389121890068054, + "learning_rate": 7.50653961136024e-05, + "loss": 3.9726, + "step": 29375 + }, + { + "epoch": 1.996195135208588, + "grad_norm": 0.14380806684494019, + "learning_rate": 7.506114961271912e-05, + "loss": 4.0489, + "step": 29380 + }, + { + "epoch": 1.99653485527925, + "grad_norm": 0.2424589991569519, + "learning_rate": 7.505690311183585e-05, + "loss": 3.9004, + "step": 29385 + }, + { + "epoch": 1.9968745753499118, + "grad_norm": 0.18293659389019012, + "learning_rate": 7.505265661095258e-05, + "loss": 3.838, + "step": 29390 + }, + { + "epoch": 1.9972142954205734, + "grad_norm": 0.1429549604654312, + "learning_rate": 7.504841011006931e-05, + "loss": 4.0699, + "step": 29395 + }, + { + "epoch": 1.9975540154912352, + "grad_norm": 0.19257469475269318, + "learning_rate": 7.504416360918604e-05, + "loss": 4.0369, + "step": 29400 + }, + { + "epoch": 1.997893735561897, + "grad_norm": 0.17624278366565704, + "learning_rate": 7.503991710830276e-05, + "loss": 3.9489, + "step": 29405 + }, + { + "epoch": 1.9982334556325587, + "grad_norm": 0.16862809658050537, + "learning_rate": 7.503567060741949e-05, + "loss": 3.8545, + "step": 29410 + }, + { + "epoch": 1.9985731757032206, + "grad_norm": 0.19477686285972595, + "learning_rate": 7.503142410653622e-05, + "loss": 3.8932, + "step": 29415 + }, + { + "epoch": 1.9989128957738824, + "grad_norm": 0.1890822798013687, + "learning_rate": 7.502717760565295e-05, + "loss": 3.9093, + "step": 29420 + }, + { + "epoch": 1.999252615844544, + "grad_norm": 0.15944001078605652, + "learning_rate": 7.502293110476968e-05, + "loss": 3.8241, + "step": 29425 + }, + { + "epoch": 1.999592335915206, + "grad_norm": 0.1564100831747055, + "learning_rate": 7.50186846038864e-05, + "loss": 3.7864, + "step": 29430 + }, + { + "epoch": 1.9999320559858678, + "grad_norm": 0.2901493012905121, + "learning_rate": 7.501443810300313e-05, + "loss": 4.0359, + "step": 29435 + }, + { + "epoch": 2.0, + "eval_bertscore": { + "f1": 0.852735464177121, + "precision": 0.8770126333208045, + "recall": 0.830094712327439 + }, + "eval_bleu_4": 0.0017295643758668197, + "eval_exact_match": 0.0, + "eval_loss": 3.6980581283569336, + "eval_meteor": 0.07461650280120449, + "eval_rouge": { + "rouge1": 0.12319974492882868, + "rouge2": 0.014841134031800911, + "rougeL": 0.10882816327333575, + "rougeLsum": 0.10886444748618009 + }, + "eval_runtime": 362.6176, + "eval_samples_per_second": 28.457, + "eval_steps_per_second": 3.557, + "step": 29436 + }, + { + "epoch": 2.0002717760565294, + "grad_norm": 0.2506450116634369, + "learning_rate": 7.501019160211986e-05, + "loss": 3.9558, + "step": 29440 + }, + { + "epoch": 2.000611496127191, + "grad_norm": 0.14834557473659515, + "learning_rate": 7.500594510123659e-05, + "loss": 3.8714, + "step": 29445 + }, + { + "epoch": 2.000951216197853, + "grad_norm": 0.1569802165031433, + "learning_rate": 7.500169860035332e-05, + "loss": 3.8355, + "step": 29450 + }, + { + "epoch": 2.0012909362685147, + "grad_norm": 0.16737738251686096, + "learning_rate": 7.499745209947004e-05, + "loss": 3.761, + "step": 29455 + }, + { + "epoch": 2.0016306563391764, + "grad_norm": 0.25586214661598206, + "learning_rate": 7.499320559858677e-05, + "loss": 3.9702, + "step": 29460 + }, + { + "epoch": 2.0019703764098384, + "grad_norm": 0.2027866393327713, + "learning_rate": 7.498895909770349e-05, + "loss": 3.6058, + "step": 29465 + }, + { + "epoch": 2.0023100964805, + "grad_norm": 0.37925732135772705, + "learning_rate": 7.498471259682023e-05, + "loss": 3.8035, + "step": 29470 + }, + { + "epoch": 2.0026498165511617, + "grad_norm": 0.24594746530056, + "learning_rate": 7.498046609593696e-05, + "loss": 3.866, + "step": 29475 + }, + { + "epoch": 2.0029895366218238, + "grad_norm": 0.23450279235839844, + "learning_rate": 7.497621959505367e-05, + "loss": 3.8653, + "step": 29480 + }, + { + "epoch": 2.0033292566924854, + "grad_norm": 0.3036451041698456, + "learning_rate": 7.497197309417041e-05, + "loss": 3.9517, + "step": 29485 + }, + { + "epoch": 2.003668976763147, + "grad_norm": 0.15258914232254028, + "learning_rate": 7.496772659328714e-05, + "loss": 3.9497, + "step": 29490 + }, + { + "epoch": 2.004008696833809, + "grad_norm": 0.13921470940113068, + "learning_rate": 7.496348009240385e-05, + "loss": 3.9438, + "step": 29495 + }, + { + "epoch": 2.0043484169044707, + "grad_norm": 1.5267518758773804, + "learning_rate": 7.49592335915206e-05, + "loss": 3.5371, + "step": 29500 + }, + { + "epoch": 2.0046881369751324, + "grad_norm": 0.19988133013248444, + "learning_rate": 7.495498709063732e-05, + "loss": 3.8509, + "step": 29505 + }, + { + "epoch": 2.0050278570457944, + "grad_norm": 0.1689959168434143, + "learning_rate": 7.495074058975404e-05, + "loss": 4.0953, + "step": 29510 + }, + { + "epoch": 2.005367577116456, + "grad_norm": 0.16203072667121887, + "learning_rate": 7.494649408887078e-05, + "loss": 3.7592, + "step": 29515 + }, + { + "epoch": 2.0057072971871177, + "grad_norm": 0.2856031060218811, + "learning_rate": 7.494224758798751e-05, + "loss": 3.8566, + "step": 29520 + }, + { + "epoch": 2.00604701725778, + "grad_norm": 0.24463525414466858, + "learning_rate": 7.493800108710422e-05, + "loss": 3.9532, + "step": 29525 + }, + { + "epoch": 2.0063867373284414, + "grad_norm": 0.1393943727016449, + "learning_rate": 7.493375458622096e-05, + "loss": 3.9673, + "step": 29530 + }, + { + "epoch": 2.006726457399103, + "grad_norm": 0.417354941368103, + "learning_rate": 7.492950808533769e-05, + "loss": 4.0793, + "step": 29535 + }, + { + "epoch": 2.007066177469765, + "grad_norm": 0.16933859884738922, + "learning_rate": 7.49252615844544e-05, + "loss": 3.891, + "step": 29540 + }, + { + "epoch": 2.0074058975404268, + "grad_norm": 0.16113397479057312, + "learning_rate": 7.492101508357115e-05, + "loss": 4.1241, + "step": 29545 + }, + { + "epoch": 2.0077456176110884, + "grad_norm": 0.19718100130558014, + "learning_rate": 7.491676858268786e-05, + "loss": 3.5671, + "step": 29550 + }, + { + "epoch": 2.0080853376817505, + "grad_norm": 0.3452725410461426, + "learning_rate": 7.491252208180459e-05, + "loss": 3.9334, + "step": 29555 + }, + { + "epoch": 2.008425057752412, + "grad_norm": 0.43188270926475525, + "learning_rate": 7.490827558092133e-05, + "loss": 3.5557, + "step": 29560 + }, + { + "epoch": 2.0087647778230737, + "grad_norm": 2.131880760192871, + "learning_rate": 7.490402908003805e-05, + "loss": 3.8287, + "step": 29565 + }, + { + "epoch": 2.0091044978937354, + "grad_norm": 0.15078526735305786, + "learning_rate": 7.489978257915477e-05, + "loss": 4.0118, + "step": 29570 + }, + { + "epoch": 2.0094442179643974, + "grad_norm": 0.15730661153793335, + "learning_rate": 7.489553607827152e-05, + "loss": 3.8627, + "step": 29575 + }, + { + "epoch": 2.009783938035059, + "grad_norm": 0.18580208718776703, + "learning_rate": 7.489128957738823e-05, + "loss": 3.9322, + "step": 29580 + }, + { + "epoch": 2.0101236581057207, + "grad_norm": 0.1876026839017868, + "learning_rate": 7.488704307650496e-05, + "loss": 3.7756, + "step": 29585 + }, + { + "epoch": 2.0104633781763828, + "grad_norm": 0.21731789410114288, + "learning_rate": 7.48827965756217e-05, + "loss": 3.9083, + "step": 29590 + }, + { + "epoch": 2.0108030982470444, + "grad_norm": 0.20345892012119293, + "learning_rate": 7.487855007473841e-05, + "loss": 3.9151, + "step": 29595 + }, + { + "epoch": 2.011142818317706, + "grad_norm": 0.209834024310112, + "learning_rate": 7.487430357385514e-05, + "loss": 3.6614, + "step": 29600 + }, + { + "epoch": 2.011482538388368, + "grad_norm": 0.17878244817256927, + "learning_rate": 7.487005707297188e-05, + "loss": 3.7798, + "step": 29605 + }, + { + "epoch": 2.0118222584590297, + "grad_norm": 0.17197681963443756, + "learning_rate": 7.48658105720886e-05, + "loss": 3.7759, + "step": 29610 + }, + { + "epoch": 2.0121619785296914, + "grad_norm": 0.18007656931877136, + "learning_rate": 7.486156407120533e-05, + "loss": 3.9337, + "step": 29615 + }, + { + "epoch": 2.0125016986003534, + "grad_norm": 0.18287988007068634, + "learning_rate": 7.485731757032205e-05, + "loss": 3.8724, + "step": 29620 + }, + { + "epoch": 2.012841418671015, + "grad_norm": 0.16474643349647522, + "learning_rate": 7.485307106943878e-05, + "loss": 3.6943, + "step": 29625 + }, + { + "epoch": 2.0131811387416767, + "grad_norm": 0.24428102374076843, + "learning_rate": 7.484882456855551e-05, + "loss": 4.0487, + "step": 29630 + }, + { + "epoch": 2.0135208588123388, + "grad_norm": 0.17122042179107666, + "learning_rate": 7.484457806767224e-05, + "loss": 4.0902, + "step": 29635 + }, + { + "epoch": 2.0138605788830004, + "grad_norm": 0.19222109019756317, + "learning_rate": 7.484033156678897e-05, + "loss": 3.6545, + "step": 29640 + }, + { + "epoch": 2.014200298953662, + "grad_norm": 0.1432534158229828, + "learning_rate": 7.48360850659057e-05, + "loss": 3.7381, + "step": 29645 + }, + { + "epoch": 2.014540019024324, + "grad_norm": 0.15326546132564545, + "learning_rate": 7.483183856502242e-05, + "loss": 3.8505, + "step": 29650 + }, + { + "epoch": 2.0148797390949857, + "grad_norm": 0.19128629565238953, + "learning_rate": 7.482759206413915e-05, + "loss": 4.1913, + "step": 29655 + }, + { + "epoch": 2.0152194591656474, + "grad_norm": 0.1629490703344345, + "learning_rate": 7.482334556325588e-05, + "loss": 4.0384, + "step": 29660 + }, + { + "epoch": 2.0155591792363095, + "grad_norm": 0.16680137813091278, + "learning_rate": 7.48190990623726e-05, + "loss": 3.8832, + "step": 29665 + }, + { + "epoch": 2.015898899306971, + "grad_norm": 0.4145749807357788, + "learning_rate": 7.481485256148933e-05, + "loss": 3.7529, + "step": 29670 + }, + { + "epoch": 2.0162386193776327, + "grad_norm": 0.18294429779052734, + "learning_rate": 7.481060606060606e-05, + "loss": 3.7635, + "step": 29675 + }, + { + "epoch": 2.016578339448295, + "grad_norm": 0.9447112679481506, + "learning_rate": 7.480635955972279e-05, + "loss": 4.1694, + "step": 29680 + }, + { + "epoch": 2.0169180595189564, + "grad_norm": 0.19893784821033478, + "learning_rate": 7.480211305883952e-05, + "loss": 3.7547, + "step": 29685 + }, + { + "epoch": 2.017257779589618, + "grad_norm": 0.33556368947029114, + "learning_rate": 7.479786655795625e-05, + "loss": 3.9718, + "step": 29690 + }, + { + "epoch": 2.01759749966028, + "grad_norm": 0.1496516615152359, + "learning_rate": 7.479362005707297e-05, + "loss": 4.0235, + "step": 29695 + }, + { + "epoch": 2.0179372197309418, + "grad_norm": 0.176497682929039, + "learning_rate": 7.47893735561897e-05, + "loss": 3.8679, + "step": 29700 + }, + { + "epoch": 2.0182769398016034, + "grad_norm": 0.8855968117713928, + "learning_rate": 7.478512705530643e-05, + "loss": 3.8389, + "step": 29705 + }, + { + "epoch": 2.0186166598722655, + "grad_norm": 0.1732775717973709, + "learning_rate": 7.478088055442316e-05, + "loss": 4.0467, + "step": 29710 + }, + { + "epoch": 2.018956379942927, + "grad_norm": 0.24626374244689941, + "learning_rate": 7.477663405353989e-05, + "loss": 3.9311, + "step": 29715 + }, + { + "epoch": 2.0192961000135887, + "grad_norm": 0.14932487905025482, + "learning_rate": 7.477238755265661e-05, + "loss": 3.7343, + "step": 29720 + }, + { + "epoch": 2.0196358200842504, + "grad_norm": 0.17489221692085266, + "learning_rate": 7.476814105177334e-05, + "loss": 3.8519, + "step": 29725 + }, + { + "epoch": 2.0199755401549124, + "grad_norm": 0.5738584399223328, + "learning_rate": 7.476389455089007e-05, + "loss": 3.5621, + "step": 29730 + }, + { + "epoch": 2.020315260225574, + "grad_norm": 0.252485990524292, + "learning_rate": 7.47596480500068e-05, + "loss": 3.7517, + "step": 29735 + }, + { + "epoch": 2.0206549802962357, + "grad_norm": 0.4976951777935028, + "learning_rate": 7.475540154912353e-05, + "loss": 3.8375, + "step": 29740 + }, + { + "epoch": 2.0209947003668978, + "grad_norm": 0.2789100706577301, + "learning_rate": 7.475115504824025e-05, + "loss": 3.7621, + "step": 29745 + }, + { + "epoch": 2.0213344204375594, + "grad_norm": 0.16491210460662842, + "learning_rate": 7.474690854735698e-05, + "loss": 3.9116, + "step": 29750 + }, + { + "epoch": 2.021674140508221, + "grad_norm": 0.22749063372612, + "learning_rate": 7.474266204647371e-05, + "loss": 3.8361, + "step": 29755 + }, + { + "epoch": 2.022013860578883, + "grad_norm": 0.1598631888628006, + "learning_rate": 7.473841554559044e-05, + "loss": 3.9414, + "step": 29760 + }, + { + "epoch": 2.0223535806495447, + "grad_norm": 0.17591282725334167, + "learning_rate": 7.473416904470717e-05, + "loss": 3.8556, + "step": 29765 + }, + { + "epoch": 2.0226933007202064, + "grad_norm": 0.18570509552955627, + "learning_rate": 7.47299225438239e-05, + "loss": 3.7539, + "step": 29770 + }, + { + "epoch": 2.0230330207908684, + "grad_norm": 0.28502100706100464, + "learning_rate": 7.472567604294062e-05, + "loss": 3.7849, + "step": 29775 + }, + { + "epoch": 2.02337274086153, + "grad_norm": 0.21022728085517883, + "learning_rate": 7.472142954205735e-05, + "loss": 3.7848, + "step": 29780 + }, + { + "epoch": 2.0237124609321917, + "grad_norm": 0.17657993733882904, + "learning_rate": 7.471718304117408e-05, + "loss": 3.864, + "step": 29785 + }, + { + "epoch": 2.024052181002854, + "grad_norm": 0.18260274827480316, + "learning_rate": 7.47129365402908e-05, + "loss": 3.9249, + "step": 29790 + }, + { + "epoch": 2.0243919010735154, + "grad_norm": 0.19945229589939117, + "learning_rate": 7.470869003940753e-05, + "loss": 3.8922, + "step": 29795 + }, + { + "epoch": 2.024731621144177, + "grad_norm": 0.21584486961364746, + "learning_rate": 7.470444353852426e-05, + "loss": 4.2253, + "step": 29800 + }, + { + "epoch": 2.025071341214839, + "grad_norm": 0.1868549883365631, + "learning_rate": 7.470019703764099e-05, + "loss": 3.9302, + "step": 29805 + }, + { + "epoch": 2.0254110612855007, + "grad_norm": 0.1653360277414322, + "learning_rate": 7.469595053675772e-05, + "loss": 3.9937, + "step": 29810 + }, + { + "epoch": 2.0257507813561624, + "grad_norm": 0.7149417400360107, + "learning_rate": 7.469170403587445e-05, + "loss": 3.8635, + "step": 29815 + }, + { + "epoch": 2.0260905014268245, + "grad_norm": 1.7713325023651123, + "learning_rate": 7.468745753499116e-05, + "loss": 3.8401, + "step": 29820 + }, + { + "epoch": 2.026430221497486, + "grad_norm": 0.21355466544628143, + "learning_rate": 7.46832110341079e-05, + "loss": 4.0203, + "step": 29825 + }, + { + "epoch": 2.0267699415681477, + "grad_norm": 0.20464441180229187, + "learning_rate": 7.467896453322463e-05, + "loss": 3.7151, + "step": 29830 + }, + { + "epoch": 2.02710966163881, + "grad_norm": 0.18254591524600983, + "learning_rate": 7.467471803234135e-05, + "loss": 3.6983, + "step": 29835 + }, + { + "epoch": 2.0274493817094714, + "grad_norm": 0.21587646007537842, + "learning_rate": 7.467047153145809e-05, + "loss": 4.0295, + "step": 29840 + }, + { + "epoch": 2.027789101780133, + "grad_norm": 0.18752549588680267, + "learning_rate": 7.466622503057481e-05, + "loss": 3.6611, + "step": 29845 + }, + { + "epoch": 2.028128821850795, + "grad_norm": 0.15292049944400787, + "learning_rate": 7.466197852969153e-05, + "loss": 3.7995, + "step": 29850 + }, + { + "epoch": 2.0284685419214568, + "grad_norm": 0.15837156772613525, + "learning_rate": 7.465773202880827e-05, + "loss": 3.9987, + "step": 29855 + }, + { + "epoch": 2.0288082619921184, + "grad_norm": 0.1639314591884613, + "learning_rate": 7.4653485527925e-05, + "loss": 3.7868, + "step": 29860 + }, + { + "epoch": 2.0291479820627805, + "grad_norm": 0.21752771735191345, + "learning_rate": 7.464923902704171e-05, + "loss": 3.9901, + "step": 29865 + }, + { + "epoch": 2.029487702133442, + "grad_norm": 0.2117740660905838, + "learning_rate": 7.464499252615845e-05, + "loss": 3.9397, + "step": 29870 + }, + { + "epoch": 2.0298274222041037, + "grad_norm": 0.14550113677978516, + "learning_rate": 7.464074602527518e-05, + "loss": 4.1296, + "step": 29875 + }, + { + "epoch": 2.030167142274766, + "grad_norm": 0.25024721026420593, + "learning_rate": 7.46364995243919e-05, + "loss": 4.092, + "step": 29880 + }, + { + "epoch": 2.0305068623454274, + "grad_norm": 0.18264682590961456, + "learning_rate": 7.463225302350864e-05, + "loss": 3.8854, + "step": 29885 + }, + { + "epoch": 2.030846582416089, + "grad_norm": 0.19197991490364075, + "learning_rate": 7.462800652262535e-05, + "loss": 3.8405, + "step": 29890 + }, + { + "epoch": 2.031186302486751, + "grad_norm": 0.4321473240852356, + "learning_rate": 7.462376002174208e-05, + "loss": 3.5106, + "step": 29895 + }, + { + "epoch": 2.0315260225574128, + "grad_norm": 0.1789269745349884, + "learning_rate": 7.461951352085882e-05, + "loss": 3.6003, + "step": 29900 + }, + { + "epoch": 2.0318657426280744, + "grad_norm": 0.18249399960041046, + "learning_rate": 7.461526701997554e-05, + "loss": 4.1188, + "step": 29905 + }, + { + "epoch": 2.032205462698736, + "grad_norm": 0.1391228586435318, + "learning_rate": 7.461102051909227e-05, + "loss": 4.1387, + "step": 29910 + }, + { + "epoch": 2.032545182769398, + "grad_norm": 0.21730300784111023, + "learning_rate": 7.460677401820901e-05, + "loss": 3.8125, + "step": 29915 + }, + { + "epoch": 2.0328849028400597, + "grad_norm": 0.1614730954170227, + "learning_rate": 7.460252751732572e-05, + "loss": 3.7854, + "step": 29920 + }, + { + "epoch": 2.0332246229107214, + "grad_norm": 0.2603830099105835, + "learning_rate": 7.459828101644245e-05, + "loss": 4.0274, + "step": 29925 + }, + { + "epoch": 2.0335643429813834, + "grad_norm": 0.14939945936203003, + "learning_rate": 7.459403451555919e-05, + "loss": 3.8916, + "step": 29930 + }, + { + "epoch": 2.033904063052045, + "grad_norm": 0.933394730091095, + "learning_rate": 7.45897880146759e-05, + "loss": 3.8967, + "step": 29935 + }, + { + "epoch": 2.0342437831227067, + "grad_norm": 0.8142344355583191, + "learning_rate": 7.458554151379263e-05, + "loss": 4.0369, + "step": 29940 + }, + { + "epoch": 2.034583503193369, + "grad_norm": 0.17824101448059082, + "learning_rate": 7.458129501290937e-05, + "loss": 3.7761, + "step": 29945 + }, + { + "epoch": 2.0349232232640304, + "grad_norm": 0.13716597855091095, + "learning_rate": 7.457704851202609e-05, + "loss": 3.8889, + "step": 29950 + }, + { + "epoch": 2.035262943334692, + "grad_norm": 0.18697237968444824, + "learning_rate": 7.457280201114282e-05, + "loss": 3.8933, + "step": 29955 + }, + { + "epoch": 2.035602663405354, + "grad_norm": 0.1780213564634323, + "learning_rate": 7.456855551025956e-05, + "loss": 3.9807, + "step": 29960 + }, + { + "epoch": 2.0359423834760157, + "grad_norm": 0.17982184886932373, + "learning_rate": 7.456430900937627e-05, + "loss": 3.6817, + "step": 29965 + }, + { + "epoch": 2.0362821035466774, + "grad_norm": 0.22634078562259674, + "learning_rate": 7.4560062508493e-05, + "loss": 3.9624, + "step": 29970 + }, + { + "epoch": 2.0366218236173395, + "grad_norm": 0.15620841085910797, + "learning_rate": 7.455581600760973e-05, + "loss": 3.9196, + "step": 29975 + }, + { + "epoch": 2.036961543688001, + "grad_norm": 0.24123652279376984, + "learning_rate": 7.455156950672646e-05, + "loss": 3.7174, + "step": 29980 + }, + { + "epoch": 2.0373012637586627, + "grad_norm": 0.16304905712604523, + "learning_rate": 7.454732300584319e-05, + "loss": 3.8104, + "step": 29985 + }, + { + "epoch": 2.037640983829325, + "grad_norm": 0.14329934120178223, + "learning_rate": 7.454307650495991e-05, + "loss": 4.0106, + "step": 29990 + }, + { + "epoch": 2.0379807038999864, + "grad_norm": 0.1746898889541626, + "learning_rate": 7.453883000407664e-05, + "loss": 4.0501, + "step": 29995 + }, + { + "epoch": 2.038320423970648, + "grad_norm": 0.17680193483829498, + "learning_rate": 7.453458350319337e-05, + "loss": 3.9997, + "step": 30000 + }, + { + "epoch": 2.03866014404131, + "grad_norm": 0.14578060805797577, + "learning_rate": 7.45303370023101e-05, + "loss": 3.9712, + "step": 30005 + }, + { + "epoch": 2.0389998641119718, + "grad_norm": 0.19629256427288055, + "learning_rate": 7.452609050142683e-05, + "loss": 4.1729, + "step": 30010 + }, + { + "epoch": 2.0393395841826334, + "grad_norm": 0.19633962213993073, + "learning_rate": 7.452184400054355e-05, + "loss": 4.129, + "step": 30015 + }, + { + "epoch": 2.0396793042532955, + "grad_norm": 0.16068527102470398, + "learning_rate": 7.451759749966028e-05, + "loss": 3.8758, + "step": 30020 + }, + { + "epoch": 2.040019024323957, + "grad_norm": 0.23561440408229828, + "learning_rate": 7.451335099877701e-05, + "loss": 3.8282, + "step": 30025 + }, + { + "epoch": 2.0403587443946187, + "grad_norm": 0.15773312747478485, + "learning_rate": 7.450910449789374e-05, + "loss": 4.1423, + "step": 30030 + }, + { + "epoch": 2.040698464465281, + "grad_norm": 0.2616139054298401, + "learning_rate": 7.450485799701047e-05, + "loss": 3.8878, + "step": 30035 + }, + { + "epoch": 2.0410381845359424, + "grad_norm": 0.25757837295532227, + "learning_rate": 7.45006114961272e-05, + "loss": 3.7846, + "step": 30040 + }, + { + "epoch": 2.041377904606604, + "grad_norm": 0.17961274087429047, + "learning_rate": 7.449636499524392e-05, + "loss": 3.7707, + "step": 30045 + }, + { + "epoch": 2.041717624677266, + "grad_norm": 0.22420617938041687, + "learning_rate": 7.449211849436065e-05, + "loss": 3.8434, + "step": 30050 + }, + { + "epoch": 2.0420573447479278, + "grad_norm": 0.2009170800447464, + "learning_rate": 7.448787199347738e-05, + "loss": 3.7759, + "step": 30055 + }, + { + "epoch": 2.0423970648185894, + "grad_norm": 0.5323528051376343, + "learning_rate": 7.44836254925941e-05, + "loss": 3.9789, + "step": 30060 + }, + { + "epoch": 2.042736784889251, + "grad_norm": 0.1750987470149994, + "learning_rate": 7.447937899171083e-05, + "loss": 3.8194, + "step": 30065 + }, + { + "epoch": 2.043076504959913, + "grad_norm": 0.1373085081577301, + "learning_rate": 7.447513249082756e-05, + "loss": 4.1871, + "step": 30070 + }, + { + "epoch": 2.0434162250305747, + "grad_norm": 0.14213398098945618, + "learning_rate": 7.447088598994429e-05, + "loss": 3.6782, + "step": 30075 + }, + { + "epoch": 2.0437559451012364, + "grad_norm": 0.20015956461429596, + "learning_rate": 7.446663948906102e-05, + "loss": 3.7659, + "step": 30080 + }, + { + "epoch": 2.0440956651718984, + "grad_norm": 0.16470271348953247, + "learning_rate": 7.446239298817775e-05, + "loss": 3.8876, + "step": 30085 + }, + { + "epoch": 2.04443538524256, + "grad_norm": 0.17309890687465668, + "learning_rate": 7.445814648729447e-05, + "loss": 4.029, + "step": 30090 + }, + { + "epoch": 2.0447751053132217, + "grad_norm": 0.27391552925109863, + "learning_rate": 7.44538999864112e-05, + "loss": 3.7865, + "step": 30095 + }, + { + "epoch": 2.045114825383884, + "grad_norm": 0.23906074464321136, + "learning_rate": 7.444965348552793e-05, + "loss": 3.7766, + "step": 30100 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.16704797744750977, + "learning_rate": 7.444540698464466e-05, + "loss": 3.9242, + "step": 30105 + }, + { + "epoch": 2.045794265525207, + "grad_norm": 0.18890663981437683, + "learning_rate": 7.444116048376139e-05, + "loss": 3.7676, + "step": 30110 + }, + { + "epoch": 2.046133985595869, + "grad_norm": 0.14636637270450592, + "learning_rate": 7.443691398287811e-05, + "loss": 4.0154, + "step": 30115 + }, + { + "epoch": 2.0464737056665308, + "grad_norm": 0.1715923547744751, + "learning_rate": 7.443266748199484e-05, + "loss": 3.9357, + "step": 30120 + }, + { + "epoch": 2.0468134257371924, + "grad_norm": 0.188579261302948, + "learning_rate": 7.442842098111157e-05, + "loss": 3.8905, + "step": 30125 + }, + { + "epoch": 2.0471531458078545, + "grad_norm": 0.15642209351062775, + "learning_rate": 7.44241744802283e-05, + "loss": 3.8117, + "step": 30130 + }, + { + "epoch": 2.047492865878516, + "grad_norm": 0.17435868084430695, + "learning_rate": 7.441992797934503e-05, + "loss": 3.8025, + "step": 30135 + }, + { + "epoch": 2.0478325859491777, + "grad_norm": 0.15586501359939575, + "learning_rate": 7.441568147846175e-05, + "loss": 3.9439, + "step": 30140 + }, + { + "epoch": 2.04817230601984, + "grad_norm": 0.3708030879497528, + "learning_rate": 7.441143497757848e-05, + "loss": 4.0174, + "step": 30145 + }, + { + "epoch": 2.0485120260905014, + "grad_norm": 0.19760610163211823, + "learning_rate": 7.440718847669521e-05, + "loss": 3.7158, + "step": 30150 + }, + { + "epoch": 2.048851746161163, + "grad_norm": 0.15812024474143982, + "learning_rate": 7.440294197581194e-05, + "loss": 3.7871, + "step": 30155 + }, + { + "epoch": 2.049191466231825, + "grad_norm": 0.16974589228630066, + "learning_rate": 7.439869547492867e-05, + "loss": 3.9246, + "step": 30160 + }, + { + "epoch": 2.0495311863024868, + "grad_norm": 0.1529001146554947, + "learning_rate": 7.43944489740454e-05, + "loss": 3.7669, + "step": 30165 + }, + { + "epoch": 2.0498709063731484, + "grad_norm": 0.47523850202560425, + "learning_rate": 7.439020247316212e-05, + "loss": 3.7807, + "step": 30170 + }, + { + "epoch": 2.0502106264438105, + "grad_norm": 0.19162774085998535, + "learning_rate": 7.438595597227884e-05, + "loss": 3.6397, + "step": 30175 + }, + { + "epoch": 2.050550346514472, + "grad_norm": 0.14052987098693848, + "learning_rate": 7.438170947139558e-05, + "loss": 4.0708, + "step": 30180 + }, + { + "epoch": 2.0508900665851337, + "grad_norm": 0.24684879183769226, + "learning_rate": 7.43774629705123e-05, + "loss": 3.7016, + "step": 30185 + }, + { + "epoch": 2.051229786655796, + "grad_norm": 0.9789308905601501, + "learning_rate": 7.437321646962902e-05, + "loss": 3.753, + "step": 30190 + }, + { + "epoch": 2.0515695067264574, + "grad_norm": 0.20065449178218842, + "learning_rate": 7.436896996874576e-05, + "loss": 3.7844, + "step": 30195 + }, + { + "epoch": 2.051909226797119, + "grad_norm": 0.17069724202156067, + "learning_rate": 7.436472346786249e-05, + "loss": 4.1353, + "step": 30200 + }, + { + "epoch": 2.052248946867781, + "grad_norm": 0.13853542506694794, + "learning_rate": 7.43604769669792e-05, + "loss": 3.8899, + "step": 30205 + }, + { + "epoch": 2.0525886669384428, + "grad_norm": 0.3496767282485962, + "learning_rate": 7.435623046609595e-05, + "loss": 3.7931, + "step": 30210 + }, + { + "epoch": 2.0529283870091044, + "grad_norm": 0.2068018764257431, + "learning_rate": 7.435198396521267e-05, + "loss": 3.9135, + "step": 30215 + }, + { + "epoch": 2.0532681070797665, + "grad_norm": 0.17450375854969025, + "learning_rate": 7.434773746432939e-05, + "loss": 4.0479, + "step": 30220 + }, + { + "epoch": 2.053607827150428, + "grad_norm": 0.14304186403751373, + "learning_rate": 7.434349096344613e-05, + "loss": 3.8392, + "step": 30225 + }, + { + "epoch": 2.0539475472210897, + "grad_norm": 0.3454066812992096, + "learning_rate": 7.433924446256286e-05, + "loss": 3.5124, + "step": 30230 + }, + { + "epoch": 2.054287267291752, + "grad_norm": 0.13930504024028778, + "learning_rate": 7.433499796167957e-05, + "loss": 3.8469, + "step": 30235 + }, + { + "epoch": 2.0546269873624134, + "grad_norm": 0.18690112233161926, + "learning_rate": 7.433075146079631e-05, + "loss": 3.9102, + "step": 30240 + }, + { + "epoch": 2.054966707433075, + "grad_norm": 0.14362984895706177, + "learning_rate": 7.432650495991303e-05, + "loss": 3.9257, + "step": 30245 + }, + { + "epoch": 2.0553064275037367, + "grad_norm": 0.26014912128448486, + "learning_rate": 7.432225845902976e-05, + "loss": 3.7724, + "step": 30250 + }, + { + "epoch": 2.055646147574399, + "grad_norm": 0.15759232640266418, + "learning_rate": 7.43180119581465e-05, + "loss": 4.1559, + "step": 30255 + }, + { + "epoch": 2.0559858676450604, + "grad_norm": 0.21459101140499115, + "learning_rate": 7.431376545726321e-05, + "loss": 4.1959, + "step": 30260 + }, + { + "epoch": 2.056325587715722, + "grad_norm": 0.15583892166614532, + "learning_rate": 7.430951895637994e-05, + "loss": 3.9636, + "step": 30265 + }, + { + "epoch": 2.056665307786384, + "grad_norm": 0.16969901323318481, + "learning_rate": 7.430527245549668e-05, + "loss": 3.6031, + "step": 30270 + }, + { + "epoch": 2.0570050278570458, + "grad_norm": 0.18525397777557373, + "learning_rate": 7.43010259546134e-05, + "loss": 3.9982, + "step": 30275 + }, + { + "epoch": 2.0573447479277074, + "grad_norm": 0.20426896214485168, + "learning_rate": 7.429677945373012e-05, + "loss": 3.6279, + "step": 30280 + }, + { + "epoch": 2.0576844679983695, + "grad_norm": 0.2686280906200409, + "learning_rate": 7.429253295284687e-05, + "loss": 4.0474, + "step": 30285 + }, + { + "epoch": 2.058024188069031, + "grad_norm": 0.15159833431243896, + "learning_rate": 7.428828645196358e-05, + "loss": 3.9579, + "step": 30290 + }, + { + "epoch": 2.0583639081396927, + "grad_norm": 0.24202151596546173, + "learning_rate": 7.428403995108031e-05, + "loss": 3.8706, + "step": 30295 + }, + { + "epoch": 2.058703628210355, + "grad_norm": 0.20743761956691742, + "learning_rate": 7.427979345019705e-05, + "loss": 4.0336, + "step": 30300 + }, + { + "epoch": 2.0590433482810164, + "grad_norm": 0.1940789520740509, + "learning_rate": 7.427554694931376e-05, + "loss": 3.9665, + "step": 30305 + }, + { + "epoch": 2.059383068351678, + "grad_norm": 0.2013208270072937, + "learning_rate": 7.427130044843049e-05, + "loss": 3.9269, + "step": 30310 + }, + { + "epoch": 2.05972278842234, + "grad_norm": 0.1505092978477478, + "learning_rate": 7.426705394754722e-05, + "loss": 3.8708, + "step": 30315 + }, + { + "epoch": 2.0600625084930018, + "grad_norm": 0.19856363534927368, + "learning_rate": 7.426280744666395e-05, + "loss": 3.9159, + "step": 30320 + }, + { + "epoch": 2.0604022285636634, + "grad_norm": 0.16691060364246368, + "learning_rate": 7.425856094578068e-05, + "loss": 4.0825, + "step": 30325 + }, + { + "epoch": 2.0607419486343255, + "grad_norm": 0.19938109815120697, + "learning_rate": 7.42543144448974e-05, + "loss": 3.7227, + "step": 30330 + }, + { + "epoch": 2.061081668704987, + "grad_norm": 0.19157074391841888, + "learning_rate": 7.425006794401413e-05, + "loss": 3.8386, + "step": 30335 + }, + { + "epoch": 2.0614213887756487, + "grad_norm": 0.19192638993263245, + "learning_rate": 7.424582144313086e-05, + "loss": 3.8078, + "step": 30340 + }, + { + "epoch": 2.061761108846311, + "grad_norm": 0.16541238129138947, + "learning_rate": 7.424157494224759e-05, + "loss": 3.7049, + "step": 30345 + }, + { + "epoch": 2.0621008289169724, + "grad_norm": 0.44250714778900146, + "learning_rate": 7.423732844136432e-05, + "loss": 3.8817, + "step": 30350 + }, + { + "epoch": 2.062440548987634, + "grad_norm": 0.19357651472091675, + "learning_rate": 7.423308194048104e-05, + "loss": 3.8183, + "step": 30355 + }, + { + "epoch": 2.062780269058296, + "grad_norm": 0.24058417975902557, + "learning_rate": 7.422883543959777e-05, + "loss": 3.9854, + "step": 30360 + }, + { + "epoch": 2.0631199891289578, + "grad_norm": 0.17792759835720062, + "learning_rate": 7.42245889387145e-05, + "loss": 3.6364, + "step": 30365 + }, + { + "epoch": 2.0634597091996194, + "grad_norm": 0.2884472906589508, + "learning_rate": 7.422034243783123e-05, + "loss": 3.9425, + "step": 30370 + }, + { + "epoch": 2.0637994292702815, + "grad_norm": 0.1919865608215332, + "learning_rate": 7.421609593694796e-05, + "loss": 3.6744, + "step": 30375 + }, + { + "epoch": 2.064139149340943, + "grad_norm": 0.3682226240634918, + "learning_rate": 7.421184943606468e-05, + "loss": 3.7733, + "step": 30380 + }, + { + "epoch": 2.0644788694116047, + "grad_norm": 0.2153066098690033, + "learning_rate": 7.420760293518143e-05, + "loss": 3.9384, + "step": 30385 + }, + { + "epoch": 2.064818589482267, + "grad_norm": 0.5063872933387756, + "learning_rate": 7.420335643429814e-05, + "loss": 3.9486, + "step": 30390 + }, + { + "epoch": 2.0651583095529285, + "grad_norm": 0.19251838326454163, + "learning_rate": 7.419910993341487e-05, + "loss": 3.883, + "step": 30395 + }, + { + "epoch": 2.06549802962359, + "grad_norm": 0.16862496733665466, + "learning_rate": 7.41948634325316e-05, + "loss": 3.5984, + "step": 30400 + }, + { + "epoch": 2.0658377496942517, + "grad_norm": 0.17777062952518463, + "learning_rate": 7.419061693164832e-05, + "loss": 3.7382, + "step": 30405 + }, + { + "epoch": 2.066177469764914, + "grad_norm": 0.1690099984407425, + "learning_rate": 7.418637043076505e-05, + "loss": 3.9367, + "step": 30410 + }, + { + "epoch": 2.0665171898355754, + "grad_norm": 0.19786378741264343, + "learning_rate": 7.418212392988178e-05, + "loss": 3.7563, + "step": 30415 + }, + { + "epoch": 2.066856909906237, + "grad_norm": 0.19303883612155914, + "learning_rate": 7.417787742899851e-05, + "loss": 3.7399, + "step": 30420 + }, + { + "epoch": 2.067196629976899, + "grad_norm": 0.1609804481267929, + "learning_rate": 7.417363092811524e-05, + "loss": 3.8271, + "step": 30425 + }, + { + "epoch": 2.0675363500475608, + "grad_norm": 0.1421130746603012, + "learning_rate": 7.416938442723196e-05, + "loss": 3.9389, + "step": 30430 + }, + { + "epoch": 2.0678760701182224, + "grad_norm": 0.14269603788852692, + "learning_rate": 7.416513792634869e-05, + "loss": 3.8299, + "step": 30435 + }, + { + "epoch": 2.0682157901888845, + "grad_norm": 0.1352069228887558, + "learning_rate": 7.416089142546542e-05, + "loss": 4.0401, + "step": 30440 + }, + { + "epoch": 2.068555510259546, + "grad_norm": 0.24732816219329834, + "learning_rate": 7.415664492458215e-05, + "loss": 3.9361, + "step": 30445 + }, + { + "epoch": 2.0688952303302077, + "grad_norm": 0.21389953792095184, + "learning_rate": 7.415239842369888e-05, + "loss": 4.0705, + "step": 30450 + }, + { + "epoch": 2.06923495040087, + "grad_norm": 0.9384064078330994, + "learning_rate": 7.41481519228156e-05, + "loss": 3.6998, + "step": 30455 + }, + { + "epoch": 2.0695746704715314, + "grad_norm": 0.19300983846187592, + "learning_rate": 7.414390542193233e-05, + "loss": 3.8522, + "step": 30460 + }, + { + "epoch": 2.069914390542193, + "grad_norm": 0.1568325310945511, + "learning_rate": 7.413965892104906e-05, + "loss": 4.0256, + "step": 30465 + }, + { + "epoch": 2.070254110612855, + "grad_norm": 0.1614760160446167, + "learning_rate": 7.413541242016579e-05, + "loss": 3.88, + "step": 30470 + }, + { + "epoch": 2.0705938306835168, + "grad_norm": 0.16910649836063385, + "learning_rate": 7.413116591928252e-05, + "loss": 3.6341, + "step": 30475 + }, + { + "epoch": 2.0709335507541784, + "grad_norm": 0.22375118732452393, + "learning_rate": 7.412691941839924e-05, + "loss": 3.7438, + "step": 30480 + }, + { + "epoch": 2.0712732708248405, + "grad_norm": 0.17668516933918, + "learning_rate": 7.412267291751597e-05, + "loss": 3.7226, + "step": 30485 + }, + { + "epoch": 2.071612990895502, + "grad_norm": 0.20110471546649933, + "learning_rate": 7.41184264166327e-05, + "loss": 3.8728, + "step": 30490 + }, + { + "epoch": 2.0719527109661637, + "grad_norm": 0.20512743294239044, + "learning_rate": 7.411417991574943e-05, + "loss": 4.0027, + "step": 30495 + }, + { + "epoch": 2.072292431036826, + "grad_norm": 0.21736864745616913, + "learning_rate": 7.410993341486616e-05, + "loss": 3.6234, + "step": 30500 + }, + { + "epoch": 2.0726321511074874, + "grad_norm": 0.16684429347515106, + "learning_rate": 7.410568691398288e-05, + "loss": 4.0623, + "step": 30505 + }, + { + "epoch": 2.072971871178149, + "grad_norm": 0.14699675142765045, + "learning_rate": 7.410144041309961e-05, + "loss": 3.9728, + "step": 30510 + }, + { + "epoch": 2.073311591248811, + "grad_norm": 0.4093308746814728, + "learning_rate": 7.409719391221633e-05, + "loss": 3.791, + "step": 30515 + }, + { + "epoch": 2.073651311319473, + "grad_norm": 0.20326736569404602, + "learning_rate": 7.409294741133307e-05, + "loss": 3.9511, + "step": 30520 + }, + { + "epoch": 2.0739910313901344, + "grad_norm": 0.14199787378311157, + "learning_rate": 7.40887009104498e-05, + "loss": 4.0346, + "step": 30525 + }, + { + "epoch": 2.0743307514607965, + "grad_norm": 0.1662047952413559, + "learning_rate": 7.408445440956651e-05, + "loss": 4.1237, + "step": 30530 + }, + { + "epoch": 2.074670471531458, + "grad_norm": 0.22127766907215118, + "learning_rate": 7.408020790868325e-05, + "loss": 3.6413, + "step": 30535 + }, + { + "epoch": 2.0750101916021197, + "grad_norm": 0.1863972395658493, + "learning_rate": 7.407596140779998e-05, + "loss": 3.6837, + "step": 30540 + }, + { + "epoch": 2.075349911672782, + "grad_norm": 0.16231435537338257, + "learning_rate": 7.40717149069167e-05, + "loss": 3.8573, + "step": 30545 + }, + { + "epoch": 2.0756896317434435, + "grad_norm": 0.15180107951164246, + "learning_rate": 7.406746840603344e-05, + "loss": 3.8114, + "step": 30550 + }, + { + "epoch": 2.076029351814105, + "grad_norm": 0.16054798662662506, + "learning_rate": 7.406322190515016e-05, + "loss": 3.8926, + "step": 30555 + }, + { + "epoch": 2.076369071884767, + "grad_norm": 0.2293163686990738, + "learning_rate": 7.405897540426688e-05, + "loss": 3.7809, + "step": 30560 + }, + { + "epoch": 2.076708791955429, + "grad_norm": 0.2555900514125824, + "learning_rate": 7.405472890338362e-05, + "loss": 3.8983, + "step": 30565 + }, + { + "epoch": 2.0770485120260904, + "grad_norm": 0.16651158034801483, + "learning_rate": 7.405048240250035e-05, + "loss": 3.7604, + "step": 30570 + }, + { + "epoch": 2.0773882320967525, + "grad_norm": 0.18363377451896667, + "learning_rate": 7.404623590161706e-05, + "loss": 3.8538, + "step": 30575 + }, + { + "epoch": 2.077727952167414, + "grad_norm": 0.16887740790843964, + "learning_rate": 7.40419894007338e-05, + "loss": 4.0091, + "step": 30580 + }, + { + "epoch": 2.0780676722380758, + "grad_norm": 0.18305829167366028, + "learning_rate": 7.403774289985053e-05, + "loss": 3.7085, + "step": 30585 + }, + { + "epoch": 2.0784073923087374, + "grad_norm": 0.17579318583011627, + "learning_rate": 7.403349639896725e-05, + "loss": 3.7795, + "step": 30590 + }, + { + "epoch": 2.0787471123793995, + "grad_norm": 0.17064177989959717, + "learning_rate": 7.402924989808399e-05, + "loss": 3.9684, + "step": 30595 + }, + { + "epoch": 2.079086832450061, + "grad_norm": 0.23884384334087372, + "learning_rate": 7.40250033972007e-05, + "loss": 3.9114, + "step": 30600 + }, + { + "epoch": 2.0794265525207227, + "grad_norm": 0.1769154667854309, + "learning_rate": 7.402075689631743e-05, + "loss": 3.9827, + "step": 30605 + }, + { + "epoch": 2.079766272591385, + "grad_norm": 0.15434648096561432, + "learning_rate": 7.401651039543417e-05, + "loss": 4.0351, + "step": 30610 + }, + { + "epoch": 2.0801059926620464, + "grad_norm": 0.18263523280620575, + "learning_rate": 7.401226389455089e-05, + "loss": 3.8801, + "step": 30615 + }, + { + "epoch": 2.080445712732708, + "grad_norm": 0.22344529628753662, + "learning_rate": 7.400801739366762e-05, + "loss": 3.7795, + "step": 30620 + }, + { + "epoch": 2.08078543280337, + "grad_norm": 0.1482914835214615, + "learning_rate": 7.400377089278436e-05, + "loss": 4.0836, + "step": 30625 + }, + { + "epoch": 2.0811251528740318, + "grad_norm": 0.13719600439071655, + "learning_rate": 7.399952439190107e-05, + "loss": 3.7343, + "step": 30630 + }, + { + "epoch": 2.0814648729446934, + "grad_norm": 0.1791633814573288, + "learning_rate": 7.39952778910178e-05, + "loss": 3.8321, + "step": 30635 + }, + { + "epoch": 2.0818045930153555, + "grad_norm": 0.18803595006465912, + "learning_rate": 7.399103139013454e-05, + "loss": 3.9302, + "step": 30640 + }, + { + "epoch": 2.082144313086017, + "grad_norm": 0.2841731905937195, + "learning_rate": 7.398678488925126e-05, + "loss": 4.1923, + "step": 30645 + }, + { + "epoch": 2.0824840331566787, + "grad_norm": 0.18086862564086914, + "learning_rate": 7.398253838836798e-05, + "loss": 3.9923, + "step": 30650 + }, + { + "epoch": 2.082823753227341, + "grad_norm": 0.20006488263607025, + "learning_rate": 7.397829188748472e-05, + "loss": 4.203, + "step": 30655 + }, + { + "epoch": 2.0831634732980024, + "grad_norm": 0.21996557712554932, + "learning_rate": 7.397404538660144e-05, + "loss": 3.9678, + "step": 30660 + }, + { + "epoch": 2.083503193368664, + "grad_norm": 0.23745614290237427, + "learning_rate": 7.396979888571817e-05, + "loss": 3.7661, + "step": 30665 + }, + { + "epoch": 2.083842913439326, + "grad_norm": 0.19451391696929932, + "learning_rate": 7.39655523848349e-05, + "loss": 3.8405, + "step": 30670 + }, + { + "epoch": 2.084182633509988, + "grad_norm": 0.1798413097858429, + "learning_rate": 7.396130588395162e-05, + "loss": 3.8407, + "step": 30675 + }, + { + "epoch": 2.0845223535806494, + "grad_norm": 0.3202071189880371, + "learning_rate": 7.395705938306835e-05, + "loss": 3.8708, + "step": 30680 + }, + { + "epoch": 2.0848620736513115, + "grad_norm": 0.21981047093868256, + "learning_rate": 7.395281288218508e-05, + "loss": 3.9433, + "step": 30685 + }, + { + "epoch": 2.085201793721973, + "grad_norm": 0.1685931235551834, + "learning_rate": 7.394856638130181e-05, + "loss": 3.7499, + "step": 30690 + }, + { + "epoch": 2.0855415137926347, + "grad_norm": 0.16589052975177765, + "learning_rate": 7.394431988041854e-05, + "loss": 3.7873, + "step": 30695 + }, + { + "epoch": 2.085881233863297, + "grad_norm": 0.6286324262619019, + "learning_rate": 7.394007337953526e-05, + "loss": 3.6498, + "step": 30700 + }, + { + "epoch": 2.0862209539339585, + "grad_norm": 0.14839491248130798, + "learning_rate": 7.393582687865199e-05, + "loss": 3.8338, + "step": 30705 + }, + { + "epoch": 2.08656067400462, + "grad_norm": 0.20429304242134094, + "learning_rate": 7.393158037776872e-05, + "loss": 3.8719, + "step": 30710 + }, + { + "epoch": 2.086900394075282, + "grad_norm": 0.1546333134174347, + "learning_rate": 7.392733387688545e-05, + "loss": 3.9224, + "step": 30715 + }, + { + "epoch": 2.087240114145944, + "grad_norm": 0.5702571868896484, + "learning_rate": 7.392308737600218e-05, + "loss": 3.9634, + "step": 30720 + }, + { + "epoch": 2.0875798342166054, + "grad_norm": 0.16540437936782837, + "learning_rate": 7.391884087511892e-05, + "loss": 4.2425, + "step": 30725 + }, + { + "epoch": 2.0879195542872675, + "grad_norm": 0.14852741360664368, + "learning_rate": 7.391459437423563e-05, + "loss": 4.0311, + "step": 30730 + }, + { + "epoch": 2.088259274357929, + "grad_norm": 1.0979334115982056, + "learning_rate": 7.391034787335236e-05, + "loss": 3.949, + "step": 30735 + }, + { + "epoch": 2.0885989944285908, + "grad_norm": 0.1397494524717331, + "learning_rate": 7.390610137246909e-05, + "loss": 3.8623, + "step": 30740 + }, + { + "epoch": 2.0889387144992524, + "grad_norm": 0.1849268078804016, + "learning_rate": 7.390185487158582e-05, + "loss": 4.0214, + "step": 30745 + }, + { + "epoch": 2.0892784345699145, + "grad_norm": 0.1660557985305786, + "learning_rate": 7.389760837070254e-05, + "loss": 3.9664, + "step": 30750 + }, + { + "epoch": 2.089618154640576, + "grad_norm": 0.16351982951164246, + "learning_rate": 7.389336186981927e-05, + "loss": 4.0476, + "step": 30755 + }, + { + "epoch": 2.0899578747112377, + "grad_norm": 0.20639602839946747, + "learning_rate": 7.3889115368936e-05, + "loss": 3.819, + "step": 30760 + }, + { + "epoch": 2.0902975947819, + "grad_norm": 0.1339769810438156, + "learning_rate": 7.388486886805273e-05, + "loss": 4.3403, + "step": 30765 + }, + { + "epoch": 2.0906373148525614, + "grad_norm": 0.13626378774642944, + "learning_rate": 7.388062236716946e-05, + "loss": 3.8078, + "step": 30770 + }, + { + "epoch": 2.090977034923223, + "grad_norm": 0.24054476618766785, + "learning_rate": 7.387637586628618e-05, + "loss": 3.6883, + "step": 30775 + }, + { + "epoch": 2.091316754993885, + "grad_norm": 0.3773823082447052, + "learning_rate": 7.387212936540291e-05, + "loss": 3.9644, + "step": 30780 + }, + { + "epoch": 2.0916564750645468, + "grad_norm": 0.1809372901916504, + "learning_rate": 7.386788286451964e-05, + "loss": 3.9257, + "step": 30785 + }, + { + "epoch": 2.0919961951352084, + "grad_norm": 0.131139874458313, + "learning_rate": 7.386363636363637e-05, + "loss": 3.9259, + "step": 30790 + }, + { + "epoch": 2.0923359152058705, + "grad_norm": 2.1228065490722656, + "learning_rate": 7.38593898627531e-05, + "loss": 3.9566, + "step": 30795 + }, + { + "epoch": 2.092675635276532, + "grad_norm": 0.13642403483390808, + "learning_rate": 7.385514336186982e-05, + "loss": 3.8253, + "step": 30800 + }, + { + "epoch": 2.0930153553471937, + "grad_norm": 0.2613891065120697, + "learning_rate": 7.385089686098655e-05, + "loss": 3.7239, + "step": 30805 + }, + { + "epoch": 2.093355075417856, + "grad_norm": 0.19712123274803162, + "learning_rate": 7.384665036010328e-05, + "loss": 3.8354, + "step": 30810 + }, + { + "epoch": 2.0936947954885174, + "grad_norm": 0.14497503638267517, + "learning_rate": 7.384240385922001e-05, + "loss": 3.9791, + "step": 30815 + }, + { + "epoch": 2.094034515559179, + "grad_norm": 0.17098523676395416, + "learning_rate": 7.383815735833674e-05, + "loss": 3.744, + "step": 30820 + }, + { + "epoch": 2.094374235629841, + "grad_norm": 0.17676371335983276, + "learning_rate": 7.383391085745346e-05, + "loss": 3.7825, + "step": 30825 + }, + { + "epoch": 2.094713955700503, + "grad_norm": 0.2137291133403778, + "learning_rate": 7.382966435657019e-05, + "loss": 4.0449, + "step": 30830 + }, + { + "epoch": 2.0950536757711644, + "grad_norm": 0.22910864651203156, + "learning_rate": 7.382541785568692e-05, + "loss": 3.8459, + "step": 30835 + }, + { + "epoch": 2.0953933958418265, + "grad_norm": 0.2695213258266449, + "learning_rate": 7.382117135480365e-05, + "loss": 3.8889, + "step": 30840 + }, + { + "epoch": 2.095733115912488, + "grad_norm": 0.3335666060447693, + "learning_rate": 7.381692485392038e-05, + "loss": 4.1058, + "step": 30845 + }, + { + "epoch": 2.0960728359831498, + "grad_norm": 0.17091625928878784, + "learning_rate": 7.38126783530371e-05, + "loss": 4.0891, + "step": 30850 + }, + { + "epoch": 2.096412556053812, + "grad_norm": 0.2663472294807434, + "learning_rate": 7.380843185215383e-05, + "loss": 3.8043, + "step": 30855 + }, + { + "epoch": 2.0967522761244735, + "grad_norm": 0.1716795712709427, + "learning_rate": 7.380418535127056e-05, + "loss": 3.9794, + "step": 30860 + }, + { + "epoch": 2.097091996195135, + "grad_norm": 0.20687024295330048, + "learning_rate": 7.379993885038729e-05, + "loss": 3.8138, + "step": 30865 + }, + { + "epoch": 2.097431716265797, + "grad_norm": 0.16000781953334808, + "learning_rate": 7.3795692349504e-05, + "loss": 3.9404, + "step": 30870 + }, + { + "epoch": 2.097771436336459, + "grad_norm": 0.5640997290611267, + "learning_rate": 7.379144584862074e-05, + "loss": 3.8608, + "step": 30875 + }, + { + "epoch": 2.0981111564071204, + "grad_norm": 0.21805191040039062, + "learning_rate": 7.378719934773747e-05, + "loss": 3.756, + "step": 30880 + }, + { + "epoch": 2.0984508764777825, + "grad_norm": 0.16211572289466858, + "learning_rate": 7.378295284685419e-05, + "loss": 4.0862, + "step": 30885 + }, + { + "epoch": 2.098790596548444, + "grad_norm": 0.16025607287883759, + "learning_rate": 7.377870634597093e-05, + "loss": 3.9095, + "step": 30890 + }, + { + "epoch": 2.0991303166191058, + "grad_norm": 0.23869997262954712, + "learning_rate": 7.377445984508766e-05, + "loss": 3.7683, + "step": 30895 + }, + { + "epoch": 2.099470036689768, + "grad_norm": 0.17254634201526642, + "learning_rate": 7.377021334420437e-05, + "loss": 3.8854, + "step": 30900 + }, + { + "epoch": 2.0998097567604295, + "grad_norm": 0.16997206211090088, + "learning_rate": 7.376596684332111e-05, + "loss": 4.0727, + "step": 30905 + }, + { + "epoch": 2.100149476831091, + "grad_norm": 0.16491830348968506, + "learning_rate": 7.376172034243784e-05, + "loss": 3.9336, + "step": 30910 + }, + { + "epoch": 2.100489196901753, + "grad_norm": 0.18880970776081085, + "learning_rate": 7.375747384155455e-05, + "loss": 3.9816, + "step": 30915 + }, + { + "epoch": 2.100828916972415, + "grad_norm": NaN, + "learning_rate": 7.375407664084795e-05, + "loss": 3.5882, + "step": 30920 + }, + { + "epoch": 2.1011686370430764, + "grad_norm": 0.8958574533462524, + "learning_rate": 7.374983013996466e-05, + "loss": 4.0707, + "step": 30925 + }, + { + "epoch": 2.101508357113738, + "grad_norm": 0.15629899501800537, + "learning_rate": 7.37455836390814e-05, + "loss": 3.7275, + "step": 30930 + }, + { + "epoch": 2.1018480771844, + "grad_norm": 0.1739453226327896, + "learning_rate": 7.374133713819813e-05, + "loss": 3.9423, + "step": 30935 + }, + { + "epoch": 2.1021877972550618, + "grad_norm": 0.2162434160709381, + "learning_rate": 7.373709063731485e-05, + "loss": 3.9803, + "step": 30940 + }, + { + "epoch": 2.1025275173257234, + "grad_norm": 0.3497511148452759, + "learning_rate": 7.373284413643159e-05, + "loss": 3.5312, + "step": 30945 + }, + { + "epoch": 2.1028672373963855, + "grad_norm": 0.17760758101940155, + "learning_rate": 7.372859763554832e-05, + "loss": 3.7607, + "step": 30950 + }, + { + "epoch": 2.103206957467047, + "grad_norm": 0.22855043411254883, + "learning_rate": 7.372435113466503e-05, + "loss": 3.9893, + "step": 30955 + }, + { + "epoch": 2.1035466775377087, + "grad_norm": 0.1899823695421219, + "learning_rate": 7.372010463378177e-05, + "loss": 4.1077, + "step": 30960 + }, + { + "epoch": 2.103886397608371, + "grad_norm": 0.18350671231746674, + "learning_rate": 7.37158581328985e-05, + "loss": 3.8848, + "step": 30965 + }, + { + "epoch": 2.1042261176790324, + "grad_norm": 0.5779078006744385, + "learning_rate": 7.371161163201522e-05, + "loss": 3.5182, + "step": 30970 + }, + { + "epoch": 2.104565837749694, + "grad_norm": 0.19188663363456726, + "learning_rate": 7.370736513113196e-05, + "loss": 3.874, + "step": 30975 + }, + { + "epoch": 2.104905557820356, + "grad_norm": 0.1735420823097229, + "learning_rate": 7.370311863024869e-05, + "loss": 3.793, + "step": 30980 + }, + { + "epoch": 2.105245277891018, + "grad_norm": 0.1519380658864975, + "learning_rate": 7.36988721293654e-05, + "loss": 3.7655, + "step": 30985 + }, + { + "epoch": 2.1055849979616794, + "grad_norm": 0.23968344926834106, + "learning_rate": 7.369462562848214e-05, + "loss": 3.9071, + "step": 30990 + }, + { + "epoch": 2.1059247180323415, + "grad_norm": 0.173060342669487, + "learning_rate": 7.369037912759886e-05, + "loss": 4.0527, + "step": 30995 + }, + { + "epoch": 2.106264438103003, + "grad_norm": 0.1784730851650238, + "learning_rate": 7.368613262671558e-05, + "loss": 4.0334, + "step": 31000 + }, + { + "epoch": 2.1066041581736648, + "grad_norm": 0.16062365472316742, + "learning_rate": 7.368188612583233e-05, + "loss": 3.7915, + "step": 31005 + }, + { + "epoch": 2.106943878244327, + "grad_norm": 0.18324166536331177, + "learning_rate": 7.367763962494904e-05, + "loss": 3.9851, + "step": 31010 + }, + { + "epoch": 2.1072835983149885, + "grad_norm": 0.16399532556533813, + "learning_rate": 7.367339312406577e-05, + "loss": 4.0958, + "step": 31015 + }, + { + "epoch": 2.10762331838565, + "grad_norm": 0.16129423677921295, + "learning_rate": 7.366914662318251e-05, + "loss": 3.8331, + "step": 31020 + }, + { + "epoch": 2.107963038456312, + "grad_norm": 0.19042037427425385, + "learning_rate": 7.366490012229922e-05, + "loss": 3.8061, + "step": 31025 + }, + { + "epoch": 2.108302758526974, + "grad_norm": 0.16174407303333282, + "learning_rate": 7.366065362141595e-05, + "loss": 3.9835, + "step": 31030 + }, + { + "epoch": 2.1086424785976354, + "grad_norm": 0.197922945022583, + "learning_rate": 7.36564071205327e-05, + "loss": 3.6738, + "step": 31035 + }, + { + "epoch": 2.1089821986682975, + "grad_norm": 0.6686575412750244, + "learning_rate": 7.365216061964941e-05, + "loss": 3.825, + "step": 31040 + }, + { + "epoch": 2.109321918738959, + "grad_norm": 0.13669951260089874, + "learning_rate": 7.364791411876614e-05, + "loss": 3.9931, + "step": 31045 + }, + { + "epoch": 2.1096616388096208, + "grad_norm": 0.20641569793224335, + "learning_rate": 7.364366761788288e-05, + "loss": 3.8871, + "step": 31050 + }, + { + "epoch": 2.110001358880283, + "grad_norm": 0.4954425096511841, + "learning_rate": 7.363942111699959e-05, + "loss": 4.0881, + "step": 31055 + }, + { + "epoch": 2.1103410789509445, + "grad_norm": 1.9927634000778198, + "learning_rate": 7.363517461611632e-05, + "loss": 4.0076, + "step": 31060 + }, + { + "epoch": 2.110680799021606, + "grad_norm": 1.6645833253860474, + "learning_rate": 7.363092811523305e-05, + "loss": 3.9438, + "step": 31065 + }, + { + "epoch": 2.111020519092268, + "grad_norm": 0.168278306722641, + "learning_rate": 7.362668161434978e-05, + "loss": 3.9924, + "step": 31070 + }, + { + "epoch": 2.11136023916293, + "grad_norm": 0.3083663880825043, + "learning_rate": 7.36224351134665e-05, + "loss": 4.0865, + "step": 31075 + }, + { + "epoch": 2.1116999592335914, + "grad_norm": 0.1852981150150299, + "learning_rate": 7.361818861258323e-05, + "loss": 3.7537, + "step": 31080 + }, + { + "epoch": 2.112039679304253, + "grad_norm": 0.2601427733898163, + "learning_rate": 7.361394211169996e-05, + "loss": 3.7647, + "step": 31085 + }, + { + "epoch": 2.112379399374915, + "grad_norm": 0.1618819236755371, + "learning_rate": 7.360969561081669e-05, + "loss": 3.903, + "step": 31090 + }, + { + "epoch": 2.1127191194455768, + "grad_norm": 0.14667843282222748, + "learning_rate": 7.360544910993342e-05, + "loss": 3.8183, + "step": 31095 + }, + { + "epoch": 2.1130588395162384, + "grad_norm": 0.14909020066261292, + "learning_rate": 7.360120260905014e-05, + "loss": 3.7505, + "step": 31100 + }, + { + "epoch": 2.1133985595869005, + "grad_norm": 0.19462253153324127, + "learning_rate": 7.359695610816687e-05, + "loss": 3.9549, + "step": 31105 + }, + { + "epoch": 2.113738279657562, + "grad_norm": 0.18227654695510864, + "learning_rate": 7.35927096072836e-05, + "loss": 3.9672, + "step": 31110 + }, + { + "epoch": 2.1140779997282237, + "grad_norm": 0.15363836288452148, + "learning_rate": 7.358846310640033e-05, + "loss": 3.6625, + "step": 31115 + }, + { + "epoch": 2.114417719798886, + "grad_norm": 0.14230315387248993, + "learning_rate": 7.358421660551706e-05, + "loss": 4.0072, + "step": 31120 + }, + { + "epoch": 2.1147574398695475, + "grad_norm": 0.38873088359832764, + "learning_rate": 7.357997010463379e-05, + "loss": 4.0977, + "step": 31125 + }, + { + "epoch": 2.115097159940209, + "grad_norm": 0.17385372519493103, + "learning_rate": 7.357572360375051e-05, + "loss": 3.7646, + "step": 31130 + }, + { + "epoch": 2.115436880010871, + "grad_norm": 0.17894768714904785, + "learning_rate": 7.357147710286724e-05, + "loss": 3.5717, + "step": 31135 + }, + { + "epoch": 2.115776600081533, + "grad_norm": 0.22375242412090302, + "learning_rate": 7.356723060198397e-05, + "loss": 3.73, + "step": 31140 + }, + { + "epoch": 2.1161163201521944, + "grad_norm": 0.15766006708145142, + "learning_rate": 7.35629841011007e-05, + "loss": 4.1436, + "step": 31145 + }, + { + "epoch": 2.1164560402228565, + "grad_norm": 0.15430475771427155, + "learning_rate": 7.355873760021743e-05, + "loss": 4.1124, + "step": 31150 + }, + { + "epoch": 2.116795760293518, + "grad_norm": 0.17033500969409943, + "learning_rate": 7.355449109933415e-05, + "loss": 3.6838, + "step": 31155 + }, + { + "epoch": 2.1171354803641798, + "grad_norm": 0.18709613382816315, + "learning_rate": 7.355024459845088e-05, + "loss": 3.7383, + "step": 31160 + }, + { + "epoch": 2.117475200434842, + "grad_norm": 0.14130111038684845, + "learning_rate": 7.354599809756761e-05, + "loss": 4.1333, + "step": 31165 + }, + { + "epoch": 2.1178149205055035, + "grad_norm": 0.9152103662490845, + "learning_rate": 7.354175159668434e-05, + "loss": 3.7129, + "step": 31170 + }, + { + "epoch": 2.118154640576165, + "grad_norm": 0.14519061148166656, + "learning_rate": 7.353750509580107e-05, + "loss": 3.9732, + "step": 31175 + }, + { + "epoch": 2.118494360646827, + "grad_norm": 0.1687147468328476, + "learning_rate": 7.353325859491779e-05, + "loss": 3.9133, + "step": 31180 + }, + { + "epoch": 2.118834080717489, + "grad_norm": 0.1999950408935547, + "learning_rate": 7.352901209403452e-05, + "loss": 3.8844, + "step": 31185 + }, + { + "epoch": 2.1191738007881504, + "grad_norm": 0.392113596200943, + "learning_rate": 7.352476559315125e-05, + "loss": 3.7149, + "step": 31190 + }, + { + "epoch": 2.1195135208588125, + "grad_norm": 0.14162960648536682, + "learning_rate": 7.352051909226796e-05, + "loss": 4.0424, + "step": 31195 + }, + { + "epoch": 2.119853240929474, + "grad_norm": 0.25717848539352417, + "learning_rate": 7.35162725913847e-05, + "loss": 3.9208, + "step": 31200 + }, + { + "epoch": 2.1201929610001358, + "grad_norm": 0.16042651236057281, + "learning_rate": 7.351202609050143e-05, + "loss": 4.0043, + "step": 31205 + }, + { + "epoch": 2.120532681070798, + "grad_norm": 0.19016847014427185, + "learning_rate": 7.350777958961815e-05, + "loss": 3.7084, + "step": 31210 + }, + { + "epoch": 2.1208724011414595, + "grad_norm": 0.18586397171020508, + "learning_rate": 7.350353308873489e-05, + "loss": 3.9695, + "step": 31215 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.17690399289131165, + "learning_rate": 7.349928658785162e-05, + "loss": 3.9081, + "step": 31220 + }, + { + "epoch": 2.121551841282783, + "grad_norm": 0.20745107531547546, + "learning_rate": 7.349504008696833e-05, + "loss": 3.8053, + "step": 31225 + }, + { + "epoch": 2.121891561353445, + "grad_norm": 0.14877015352249146, + "learning_rate": 7.349079358608507e-05, + "loss": 3.9493, + "step": 31230 + }, + { + "epoch": 2.1222312814241064, + "grad_norm": 0.17993314564228058, + "learning_rate": 7.34865470852018e-05, + "loss": 3.9577, + "step": 31235 + }, + { + "epoch": 2.1225710014947685, + "grad_norm": 0.16484317183494568, + "learning_rate": 7.348230058431852e-05, + "loss": 3.9652, + "step": 31240 + }, + { + "epoch": 2.12291072156543, + "grad_norm": 0.16067925095558167, + "learning_rate": 7.347805408343526e-05, + "loss": 3.9618, + "step": 31245 + }, + { + "epoch": 2.123250441636092, + "grad_norm": 0.15908122062683105, + "learning_rate": 7.347380758255199e-05, + "loss": 4.0635, + "step": 31250 + }, + { + "epoch": 2.123590161706754, + "grad_norm": 0.2551096975803375, + "learning_rate": 7.34695610816687e-05, + "loss": 3.9216, + "step": 31255 + }, + { + "epoch": 2.1239298817774155, + "grad_norm": 0.16490308940410614, + "learning_rate": 7.346531458078544e-05, + "loss": 3.8181, + "step": 31260 + }, + { + "epoch": 2.124269601848077, + "grad_norm": 0.23827433586120605, + "learning_rate": 7.346106807990216e-05, + "loss": 3.7092, + "step": 31265 + }, + { + "epoch": 2.1246093219187387, + "grad_norm": 0.18305811285972595, + "learning_rate": 7.34568215790189e-05, + "loss": 3.7657, + "step": 31270 + }, + { + "epoch": 2.124949041989401, + "grad_norm": 0.3218519389629364, + "learning_rate": 7.345257507813563e-05, + "loss": 3.6543, + "step": 31275 + }, + { + "epoch": 2.1252887620600625, + "grad_norm": 0.17016340792179108, + "learning_rate": 7.344832857725234e-05, + "loss": 3.9319, + "step": 31280 + }, + { + "epoch": 2.125628482130724, + "grad_norm": 0.1497194916009903, + "learning_rate": 7.344408207636908e-05, + "loss": 4.0092, + "step": 31285 + }, + { + "epoch": 2.125968202201386, + "grad_norm": 0.18689477443695068, + "learning_rate": 7.343983557548581e-05, + "loss": 3.9386, + "step": 31290 + }, + { + "epoch": 2.126307922272048, + "grad_norm": 0.15753714740276337, + "learning_rate": 7.343558907460252e-05, + "loss": 3.6259, + "step": 31295 + }, + { + "epoch": 2.1266476423427094, + "grad_norm": 0.17515061795711517, + "learning_rate": 7.343134257371927e-05, + "loss": 3.6898, + "step": 31300 + }, + { + "epoch": 2.1269873624133715, + "grad_norm": 0.14745591580867767, + "learning_rate": 7.3427096072836e-05, + "loss": 3.6574, + "step": 31305 + }, + { + "epoch": 2.127327082484033, + "grad_norm": 0.16269992291927338, + "learning_rate": 7.342284957195271e-05, + "loss": 3.743, + "step": 31310 + }, + { + "epoch": 2.1276668025546948, + "grad_norm": 0.1960616260766983, + "learning_rate": 7.341860307106945e-05, + "loss": 3.7624, + "step": 31315 + }, + { + "epoch": 2.128006522625357, + "grad_norm": 0.2432655692100525, + "learning_rate": 7.341435657018618e-05, + "loss": 3.7684, + "step": 31320 + }, + { + "epoch": 2.1283462426960185, + "grad_norm": 0.1782282590866089, + "learning_rate": 7.341011006930289e-05, + "loss": 3.8825, + "step": 31325 + }, + { + "epoch": 2.12868596276668, + "grad_norm": 0.23048779368400574, + "learning_rate": 7.340586356841963e-05, + "loss": 3.9127, + "step": 31330 + }, + { + "epoch": 2.129025682837342, + "grad_norm": 0.5039675831794739, + "learning_rate": 7.340161706753636e-05, + "loss": 4.0203, + "step": 31335 + }, + { + "epoch": 2.129365402908004, + "grad_norm": 0.1486896276473999, + "learning_rate": 7.339737056665308e-05, + "loss": 3.9332, + "step": 31340 + }, + { + "epoch": 2.1297051229786654, + "grad_norm": 0.28251707553863525, + "learning_rate": 7.339312406576982e-05, + "loss": 3.7637, + "step": 31345 + }, + { + "epoch": 2.1300448430493275, + "grad_norm": 0.16480061411857605, + "learning_rate": 7.338887756488653e-05, + "loss": 3.8669, + "step": 31350 + }, + { + "epoch": 2.130384563119989, + "grad_norm": 0.2128925621509552, + "learning_rate": 7.338463106400326e-05, + "loss": 3.9034, + "step": 31355 + }, + { + "epoch": 2.1307242831906508, + "grad_norm": 0.16587050259113312, + "learning_rate": 7.338038456312e-05, + "loss": 3.631, + "step": 31360 + }, + { + "epoch": 2.131064003261313, + "grad_norm": 0.3036096394062042, + "learning_rate": 7.337613806223672e-05, + "loss": 3.7637, + "step": 31365 + }, + { + "epoch": 2.1314037233319745, + "grad_norm": 0.15034744143486023, + "learning_rate": 7.337189156135344e-05, + "loss": 3.9571, + "step": 31370 + }, + { + "epoch": 2.131743443402636, + "grad_norm": 0.18869967758655548, + "learning_rate": 7.336764506047019e-05, + "loss": 3.9889, + "step": 31375 + }, + { + "epoch": 2.132083163473298, + "grad_norm": 0.3263240158557892, + "learning_rate": 7.33633985595869e-05, + "loss": 3.8252, + "step": 31380 + }, + { + "epoch": 2.13242288354396, + "grad_norm": 0.3614191710948944, + "learning_rate": 7.335915205870363e-05, + "loss": 3.8865, + "step": 31385 + }, + { + "epoch": 2.1327626036146214, + "grad_norm": 0.19941161572933197, + "learning_rate": 7.335490555782037e-05, + "loss": 4.0462, + "step": 31390 + }, + { + "epoch": 2.1331023236852835, + "grad_norm": 0.1964806169271469, + "learning_rate": 7.335065905693708e-05, + "loss": 3.6938, + "step": 31395 + }, + { + "epoch": 2.133442043755945, + "grad_norm": 0.17679435014724731, + "learning_rate": 7.334641255605381e-05, + "loss": 3.8874, + "step": 31400 + }, + { + "epoch": 2.133781763826607, + "grad_norm": 0.19814153015613556, + "learning_rate": 7.334216605517055e-05, + "loss": 3.7404, + "step": 31405 + }, + { + "epoch": 2.134121483897269, + "grad_norm": 0.16167019307613373, + "learning_rate": 7.333791955428727e-05, + "loss": 4.0077, + "step": 31410 + }, + { + "epoch": 2.1344612039679305, + "grad_norm": 0.21578367054462433, + "learning_rate": 7.3333673053404e-05, + "loss": 3.8562, + "step": 31415 + }, + { + "epoch": 2.134800924038592, + "grad_norm": 0.20550388097763062, + "learning_rate": 7.332942655252072e-05, + "loss": 4.0128, + "step": 31420 + }, + { + "epoch": 2.1351406441092537, + "grad_norm": 0.2634829878807068, + "learning_rate": 7.332518005163745e-05, + "loss": 4.286, + "step": 31425 + }, + { + "epoch": 2.135480364179916, + "grad_norm": 0.2138887494802475, + "learning_rate": 7.332093355075418e-05, + "loss": 3.9744, + "step": 31430 + }, + { + "epoch": 2.1358200842505775, + "grad_norm": 0.7117077708244324, + "learning_rate": 7.331668704987091e-05, + "loss": 3.6743, + "step": 31435 + }, + { + "epoch": 2.136159804321239, + "grad_norm": 0.46604064106941223, + "learning_rate": 7.331244054898764e-05, + "loss": 3.9329, + "step": 31440 + }, + { + "epoch": 2.136499524391901, + "grad_norm": 1.6563799381256104, + "learning_rate": 7.330819404810436e-05, + "loss": 3.7297, + "step": 31445 + }, + { + "epoch": 2.136839244462563, + "grad_norm": 0.21232225000858307, + "learning_rate": 7.330394754722109e-05, + "loss": 3.7957, + "step": 31450 + }, + { + "epoch": 2.1371789645332244, + "grad_norm": 0.1922462433576584, + "learning_rate": 7.329970104633782e-05, + "loss": 3.8412, + "step": 31455 + }, + { + "epoch": 2.1375186846038865, + "grad_norm": 0.14248128235340118, + "learning_rate": 7.329545454545455e-05, + "loss": 3.763, + "step": 31460 + }, + { + "epoch": 2.137858404674548, + "grad_norm": 0.16815796494483948, + "learning_rate": 7.329120804457128e-05, + "loss": 3.7784, + "step": 31465 + }, + { + "epoch": 2.1381981247452098, + "grad_norm": 0.19031696021556854, + "learning_rate": 7.3286961543688e-05, + "loss": 3.7936, + "step": 31470 + }, + { + "epoch": 2.138537844815872, + "grad_norm": 0.25130775570869446, + "learning_rate": 7.328271504280473e-05, + "loss": 3.6784, + "step": 31475 + }, + { + "epoch": 2.1388775648865335, + "grad_norm": 1.8823745250701904, + "learning_rate": 7.327846854192146e-05, + "loss": 3.5133, + "step": 31480 + }, + { + "epoch": 2.139217284957195, + "grad_norm": 0.1785174310207367, + "learning_rate": 7.327422204103819e-05, + "loss": 3.9334, + "step": 31485 + }, + { + "epoch": 2.139557005027857, + "grad_norm": 0.21518029272556305, + "learning_rate": 7.326997554015492e-05, + "loss": 3.9889, + "step": 31490 + }, + { + "epoch": 2.139896725098519, + "grad_norm": 0.21763555705547333, + "learning_rate": 7.326572903927164e-05, + "loss": 3.8071, + "step": 31495 + }, + { + "epoch": 2.1402364451691804, + "grad_norm": 0.17755292356014252, + "learning_rate": 7.326148253838837e-05, + "loss": 3.6014, + "step": 31500 + }, + { + "epoch": 2.1405761652398425, + "grad_norm": 0.4255022406578064, + "learning_rate": 7.32572360375051e-05, + "loss": 3.8717, + "step": 31505 + }, + { + "epoch": 2.140915885310504, + "grad_norm": 0.20479018986225128, + "learning_rate": 7.325298953662183e-05, + "loss": 3.7873, + "step": 31510 + }, + { + "epoch": 2.1412556053811658, + "grad_norm": 0.15194299817085266, + "learning_rate": 7.324874303573856e-05, + "loss": 3.6936, + "step": 31515 + }, + { + "epoch": 2.141595325451828, + "grad_norm": 0.17999492585659027, + "learning_rate": 7.324449653485528e-05, + "loss": 3.8081, + "step": 31520 + }, + { + "epoch": 2.1419350455224895, + "grad_norm": 0.19694867730140686, + "learning_rate": 7.324025003397201e-05, + "loss": 3.7068, + "step": 31525 + }, + { + "epoch": 2.142274765593151, + "grad_norm": 0.17488504946231842, + "learning_rate": 7.323600353308874e-05, + "loss": 4.0597, + "step": 31530 + }, + { + "epoch": 2.142614485663813, + "grad_norm": 0.15230461955070496, + "learning_rate": 7.323175703220547e-05, + "loss": 3.8581, + "step": 31535 + }, + { + "epoch": 2.142954205734475, + "grad_norm": 0.14944453537464142, + "learning_rate": 7.32275105313222e-05, + "loss": 4.0115, + "step": 31540 + }, + { + "epoch": 2.1432939258051364, + "grad_norm": 0.2063811719417572, + "learning_rate": 7.322326403043892e-05, + "loss": 3.918, + "step": 31545 + }, + { + "epoch": 2.1436336458757985, + "grad_norm": 0.15119388699531555, + "learning_rate": 7.321901752955564e-05, + "loss": 4.0949, + "step": 31550 + }, + { + "epoch": 2.14397336594646, + "grad_norm": 0.2164253294467926, + "learning_rate": 7.321477102867238e-05, + "loss": 4.1417, + "step": 31555 + }, + { + "epoch": 2.144313086017122, + "grad_norm": 0.605436384677887, + "learning_rate": 7.321052452778911e-05, + "loss": 4.0177, + "step": 31560 + }, + { + "epoch": 2.144652806087784, + "grad_norm": 0.21497578918933868, + "learning_rate": 7.320627802690582e-05, + "loss": 4.1392, + "step": 31565 + }, + { + "epoch": 2.1449925261584455, + "grad_norm": 0.5597323179244995, + "learning_rate": 7.320203152602256e-05, + "loss": 3.6805, + "step": 31570 + }, + { + "epoch": 2.145332246229107, + "grad_norm": 0.17142678797245026, + "learning_rate": 7.319778502513929e-05, + "loss": 4.079, + "step": 31575 + }, + { + "epoch": 2.145671966299769, + "grad_norm": 0.2073289453983307, + "learning_rate": 7.3193538524256e-05, + "loss": 3.9606, + "step": 31580 + }, + { + "epoch": 2.146011686370431, + "grad_norm": 0.20710396766662598, + "learning_rate": 7.318929202337275e-05, + "loss": 3.7865, + "step": 31585 + }, + { + "epoch": 2.1463514064410925, + "grad_norm": 0.19659818708896637, + "learning_rate": 7.318504552248948e-05, + "loss": 3.8396, + "step": 31590 + }, + { + "epoch": 2.1466911265117545, + "grad_norm": 0.2154000848531723, + "learning_rate": 7.318079902160619e-05, + "loss": 3.8077, + "step": 31595 + }, + { + "epoch": 2.147030846582416, + "grad_norm": 0.22669672966003418, + "learning_rate": 7.317655252072293e-05, + "loss": 3.7756, + "step": 31600 + }, + { + "epoch": 2.147370566653078, + "grad_norm": 0.23023571074008942, + "learning_rate": 7.317230601983966e-05, + "loss": 3.7597, + "step": 31605 + }, + { + "epoch": 2.14771028672374, + "grad_norm": 0.17460429668426514, + "learning_rate": 7.316805951895639e-05, + "loss": 3.7603, + "step": 31610 + }, + { + "epoch": 2.1480500067944015, + "grad_norm": 0.1575312614440918, + "learning_rate": 7.316381301807312e-05, + "loss": 3.8546, + "step": 31615 + }, + { + "epoch": 2.148389726865063, + "grad_norm": 0.17946597933769226, + "learning_rate": 7.315956651718983e-05, + "loss": 3.8166, + "step": 31620 + }, + { + "epoch": 2.1487294469357248, + "grad_norm": 0.26248130202293396, + "learning_rate": 7.315532001630657e-05, + "loss": 4.0273, + "step": 31625 + }, + { + "epoch": 2.149069167006387, + "grad_norm": 0.2261710911989212, + "learning_rate": 7.31510735154233e-05, + "loss": 3.8007, + "step": 31630 + }, + { + "epoch": 2.1494088870770485, + "grad_norm": 0.7925760746002197, + "learning_rate": 7.314682701454001e-05, + "loss": 3.8424, + "step": 31635 + }, + { + "epoch": 2.14974860714771, + "grad_norm": 0.22325529158115387, + "learning_rate": 7.314258051365676e-05, + "loss": 3.9979, + "step": 31640 + }, + { + "epoch": 2.150088327218372, + "grad_norm": 0.1774800717830658, + "learning_rate": 7.313833401277348e-05, + "loss": 3.9289, + "step": 31645 + }, + { + "epoch": 2.150428047289034, + "grad_norm": 0.1669335514307022, + "learning_rate": 7.31340875118902e-05, + "loss": 3.9706, + "step": 31650 + }, + { + "epoch": 2.1507677673596954, + "grad_norm": 0.4654673933982849, + "learning_rate": 7.312984101100694e-05, + "loss": 4.018, + "step": 31655 + }, + { + "epoch": 2.1511074874303575, + "grad_norm": 0.4956892430782318, + "learning_rate": 7.312559451012367e-05, + "loss": 4.0606, + "step": 31660 + }, + { + "epoch": 2.151447207501019, + "grad_norm": 0.1810349076986313, + "learning_rate": 7.312134800924038e-05, + "loss": 3.9306, + "step": 31665 + }, + { + "epoch": 2.1517869275716808, + "grad_norm": 0.2979622185230255, + "learning_rate": 7.311710150835712e-05, + "loss": 3.9112, + "step": 31670 + }, + { + "epoch": 2.152126647642343, + "grad_norm": 0.40965086221694946, + "learning_rate": 7.311285500747385e-05, + "loss": 3.9473, + "step": 31675 + }, + { + "epoch": 2.1524663677130045, + "grad_norm": 0.21751263737678528, + "learning_rate": 7.310860850659057e-05, + "loss": 3.7271, + "step": 31680 + }, + { + "epoch": 2.152806087783666, + "grad_norm": 0.19504979252815247, + "learning_rate": 7.310436200570731e-05, + "loss": 3.7751, + "step": 31685 + }, + { + "epoch": 2.153145807854328, + "grad_norm": 0.20270827412605286, + "learning_rate": 7.310011550482402e-05, + "loss": 4.1203, + "step": 31690 + }, + { + "epoch": 2.15348552792499, + "grad_norm": 0.17704129219055176, + "learning_rate": 7.309586900394075e-05, + "loss": 3.4927, + "step": 31695 + }, + { + "epoch": 2.1538252479956514, + "grad_norm": 0.15753594040870667, + "learning_rate": 7.309162250305749e-05, + "loss": 3.9072, + "step": 31700 + }, + { + "epoch": 2.1541649680663135, + "grad_norm": 0.7239094376564026, + "learning_rate": 7.308737600217421e-05, + "loss": 3.9115, + "step": 31705 + }, + { + "epoch": 2.154504688136975, + "grad_norm": 0.2185813933610916, + "learning_rate": 7.308312950129093e-05, + "loss": 3.858, + "step": 31710 + }, + { + "epoch": 2.154844408207637, + "grad_norm": 0.15574412047863007, + "learning_rate": 7.307888300040768e-05, + "loss": 3.8274, + "step": 31715 + }, + { + "epoch": 2.155184128278299, + "grad_norm": 0.221171036362648, + "learning_rate": 7.307463649952439e-05, + "loss": 3.8388, + "step": 31720 + }, + { + "epoch": 2.1555238483489605, + "grad_norm": 0.248666912317276, + "learning_rate": 7.307038999864112e-05, + "loss": 3.8068, + "step": 31725 + }, + { + "epoch": 2.155863568419622, + "grad_norm": 0.19957281649112701, + "learning_rate": 7.306614349775786e-05, + "loss": 4.009, + "step": 31730 + }, + { + "epoch": 2.156203288490284, + "grad_norm": 0.18795907497406006, + "learning_rate": 7.306189699687457e-05, + "loss": 3.6347, + "step": 31735 + }, + { + "epoch": 2.156543008560946, + "grad_norm": 0.19629710912704468, + "learning_rate": 7.30576504959913e-05, + "loss": 4.0389, + "step": 31740 + }, + { + "epoch": 2.1568827286316075, + "grad_norm": 0.18084336817264557, + "learning_rate": 7.305340399510804e-05, + "loss": 3.7799, + "step": 31745 + }, + { + "epoch": 2.1572224487022695, + "grad_norm": 0.14649920165538788, + "learning_rate": 7.304915749422476e-05, + "loss": 3.5689, + "step": 31750 + }, + { + "epoch": 2.157562168772931, + "grad_norm": 0.19734984636306763, + "learning_rate": 7.304491099334149e-05, + "loss": 3.969, + "step": 31755 + }, + { + "epoch": 2.157901888843593, + "grad_norm": 0.17120154201984406, + "learning_rate": 7.304066449245823e-05, + "loss": 3.915, + "step": 31760 + }, + { + "epoch": 2.1582416089142544, + "grad_norm": 0.1899290531873703, + "learning_rate": 7.303641799157494e-05, + "loss": 3.7958, + "step": 31765 + }, + { + "epoch": 2.1585813289849165, + "grad_norm": 0.16951562464237213, + "learning_rate": 7.303217149069167e-05, + "loss": 4.0088, + "step": 31770 + }, + { + "epoch": 2.158921049055578, + "grad_norm": 0.18368422985076904, + "learning_rate": 7.30279249898084e-05, + "loss": 3.6925, + "step": 31775 + }, + { + "epoch": 2.1592607691262398, + "grad_norm": 0.22717119753360748, + "learning_rate": 7.302367848892513e-05, + "loss": 3.9294, + "step": 31780 + }, + { + "epoch": 2.159600489196902, + "grad_norm": 0.14775168895721436, + "learning_rate": 7.301943198804185e-05, + "loss": 3.8278, + "step": 31785 + }, + { + "epoch": 2.1599402092675635, + "grad_norm": 0.179066464304924, + "learning_rate": 7.301518548715858e-05, + "loss": 4.1523, + "step": 31790 + }, + { + "epoch": 2.160279929338225, + "grad_norm": 0.20306143164634705, + "learning_rate": 7.301093898627531e-05, + "loss": 3.8589, + "step": 31795 + }, + { + "epoch": 2.160619649408887, + "grad_norm": 0.20280425250530243, + "learning_rate": 7.300669248539204e-05, + "loss": 3.6911, + "step": 31800 + }, + { + "epoch": 2.160959369479549, + "grad_norm": 0.23462747037410736, + "learning_rate": 7.300244598450877e-05, + "loss": 4.0746, + "step": 31805 + }, + { + "epoch": 2.1612990895502104, + "grad_norm": 0.2074826955795288, + "learning_rate": 7.29981994836255e-05, + "loss": 4.2465, + "step": 31810 + }, + { + "epoch": 2.1616388096208725, + "grad_norm": 0.7173203229904175, + "learning_rate": 7.299395298274222e-05, + "loss": 3.9146, + "step": 31815 + }, + { + "epoch": 2.161978529691534, + "grad_norm": 0.1572432667016983, + "learning_rate": 7.298970648185895e-05, + "loss": 3.8187, + "step": 31820 + }, + { + "epoch": 2.1623182497621958, + "grad_norm": 0.1555764526128769, + "learning_rate": 7.298545998097568e-05, + "loss": 3.9506, + "step": 31825 + }, + { + "epoch": 2.162657969832858, + "grad_norm": 0.20672738552093506, + "learning_rate": 7.298121348009241e-05, + "loss": 3.953, + "step": 31830 + }, + { + "epoch": 2.1629976899035195, + "grad_norm": 0.21768257021903992, + "learning_rate": 7.297696697920913e-05, + "loss": 3.872, + "step": 31835 + }, + { + "epoch": 2.163337409974181, + "grad_norm": 0.21232566237449646, + "learning_rate": 7.297272047832586e-05, + "loss": 4.1439, + "step": 31840 + }, + { + "epoch": 2.163677130044843, + "grad_norm": 0.15957492589950562, + "learning_rate": 7.296847397744259e-05, + "loss": 4.103, + "step": 31845 + }, + { + "epoch": 2.164016850115505, + "grad_norm": 0.24550633132457733, + "learning_rate": 7.296422747655932e-05, + "loss": 3.8027, + "step": 31850 + }, + { + "epoch": 2.1643565701861665, + "grad_norm": 0.14649060368537903, + "learning_rate": 7.295998097567605e-05, + "loss": 3.9393, + "step": 31855 + }, + { + "epoch": 2.1646962902568285, + "grad_norm": 0.19388172030448914, + "learning_rate": 7.295573447479278e-05, + "loss": 3.8849, + "step": 31860 + }, + { + "epoch": 2.16503601032749, + "grad_norm": 0.16194479167461395, + "learning_rate": 7.29514879739095e-05, + "loss": 4.0124, + "step": 31865 + }, + { + "epoch": 2.165375730398152, + "grad_norm": 0.16643749177455902, + "learning_rate": 7.294724147302623e-05, + "loss": 3.9625, + "step": 31870 + }, + { + "epoch": 2.165715450468814, + "grad_norm": 0.16119295358657837, + "learning_rate": 7.294299497214296e-05, + "loss": 3.8661, + "step": 31875 + }, + { + "epoch": 2.1660551705394755, + "grad_norm": 0.20701657235622406, + "learning_rate": 7.293874847125969e-05, + "loss": 3.7339, + "step": 31880 + }, + { + "epoch": 2.166394890610137, + "grad_norm": 0.19613933563232422, + "learning_rate": 7.293450197037642e-05, + "loss": 4.0483, + "step": 31885 + }, + { + "epoch": 2.166734610680799, + "grad_norm": 0.13892313838005066, + "learning_rate": 7.293025546949313e-05, + "loss": 3.9051, + "step": 31890 + }, + { + "epoch": 2.167074330751461, + "grad_norm": 0.1702692061662674, + "learning_rate": 7.292600896860987e-05, + "loss": 4.1194, + "step": 31895 + }, + { + "epoch": 2.1674140508221225, + "grad_norm": 0.17909814417362213, + "learning_rate": 7.29217624677266e-05, + "loss": 3.7902, + "step": 31900 + }, + { + "epoch": 2.1677537708927845, + "grad_norm": 0.1584734171628952, + "learning_rate": 7.291751596684331e-05, + "loss": 3.9372, + "step": 31905 + }, + { + "epoch": 2.168093490963446, + "grad_norm": 0.19880163669586182, + "learning_rate": 7.291326946596006e-05, + "loss": 4.0504, + "step": 31910 + }, + { + "epoch": 2.168433211034108, + "grad_norm": 0.227665513753891, + "learning_rate": 7.290902296507678e-05, + "loss": 3.8878, + "step": 31915 + }, + { + "epoch": 2.16877293110477, + "grad_norm": 0.20126572251319885, + "learning_rate": 7.29047764641935e-05, + "loss": 3.9376, + "step": 31920 + }, + { + "epoch": 2.1691126511754315, + "grad_norm": 0.16015078127384186, + "learning_rate": 7.290052996331024e-05, + "loss": 3.84, + "step": 31925 + }, + { + "epoch": 2.169452371246093, + "grad_norm": 0.15428532660007477, + "learning_rate": 7.289628346242697e-05, + "loss": 3.6974, + "step": 31930 + }, + { + "epoch": 2.169792091316755, + "grad_norm": 1.5260536670684814, + "learning_rate": 7.289203696154368e-05, + "loss": 4.0908, + "step": 31935 + }, + { + "epoch": 2.170131811387417, + "grad_norm": 0.1610811948776245, + "learning_rate": 7.288779046066042e-05, + "loss": 3.9106, + "step": 31940 + }, + { + "epoch": 2.1704715314580785, + "grad_norm": 0.16142091155052185, + "learning_rate": 7.288354395977715e-05, + "loss": 3.9246, + "step": 31945 + }, + { + "epoch": 2.1708112515287405, + "grad_norm": 0.15768378973007202, + "learning_rate": 7.287929745889388e-05, + "loss": 3.6037, + "step": 31950 + }, + { + "epoch": 2.171150971599402, + "grad_norm": 0.2974212169647217, + "learning_rate": 7.287505095801061e-05, + "loss": 4.0393, + "step": 31955 + }, + { + "epoch": 2.171490691670064, + "grad_norm": 0.18562129139900208, + "learning_rate": 7.287080445712734e-05, + "loss": 3.8738, + "step": 31960 + }, + { + "epoch": 2.1718304117407254, + "grad_norm": 0.1814710944890976, + "learning_rate": 7.286655795624406e-05, + "loss": 3.9412, + "step": 31965 + }, + { + "epoch": 2.1721701318113875, + "grad_norm": 0.1555013805627823, + "learning_rate": 7.286231145536079e-05, + "loss": 3.8673, + "step": 31970 + }, + { + "epoch": 2.172509851882049, + "grad_norm": 0.20260664820671082, + "learning_rate": 7.28580649544775e-05, + "loss": 3.9277, + "step": 31975 + }, + { + "epoch": 2.172849571952711, + "grad_norm": 0.2736378312110901, + "learning_rate": 7.285381845359425e-05, + "loss": 3.7608, + "step": 31980 + }, + { + "epoch": 2.173189292023373, + "grad_norm": 0.27590566873550415, + "learning_rate": 7.284957195271098e-05, + "loss": 3.9016, + "step": 31985 + }, + { + "epoch": 2.1735290120940345, + "grad_norm": 0.15818724036216736, + "learning_rate": 7.284532545182769e-05, + "loss": 3.8058, + "step": 31990 + }, + { + "epoch": 2.173868732164696, + "grad_norm": 0.3889075815677643, + "learning_rate": 7.284107895094443e-05, + "loss": 3.9651, + "step": 31995 + }, + { + "epoch": 2.174208452235358, + "grad_norm": 0.1762801855802536, + "learning_rate": 7.283683245006116e-05, + "loss": 3.7558, + "step": 32000 + }, + { + "epoch": 2.17454817230602, + "grad_norm": 0.24084338545799255, + "learning_rate": 7.283258594917787e-05, + "loss": 4.1959, + "step": 32005 + }, + { + "epoch": 2.1748878923766815, + "grad_norm": 0.1935751736164093, + "learning_rate": 7.282833944829462e-05, + "loss": 3.8784, + "step": 32010 + }, + { + "epoch": 2.1752276124473435, + "grad_norm": 0.20524778962135315, + "learning_rate": 7.282409294741134e-05, + "loss": 3.9333, + "step": 32015 + }, + { + "epoch": 2.175567332518005, + "grad_norm": 0.27659788727760315, + "learning_rate": 7.281984644652806e-05, + "loss": 3.7756, + "step": 32020 + }, + { + "epoch": 2.175907052588667, + "grad_norm": 0.19038884341716766, + "learning_rate": 7.28155999456448e-05, + "loss": 3.9825, + "step": 32025 + }, + { + "epoch": 2.176246772659329, + "grad_norm": 0.18403401970863342, + "learning_rate": 7.281135344476153e-05, + "loss": 3.8884, + "step": 32030 + }, + { + "epoch": 2.1765864927299905, + "grad_norm": 0.213815838098526, + "learning_rate": 7.280710694387824e-05, + "loss": 3.9809, + "step": 32035 + }, + { + "epoch": 2.176926212800652, + "grad_norm": 0.13817846775054932, + "learning_rate": 7.280286044299498e-05, + "loss": 3.7819, + "step": 32040 + }, + { + "epoch": 2.177265932871314, + "grad_norm": 0.15645885467529297, + "learning_rate": 7.27986139421117e-05, + "loss": 3.9514, + "step": 32045 + }, + { + "epoch": 2.177605652941976, + "grad_norm": 0.16220805048942566, + "learning_rate": 7.279436744122843e-05, + "loss": 3.7364, + "step": 32050 + }, + { + "epoch": 2.1779453730126375, + "grad_norm": 0.17392036318778992, + "learning_rate": 7.279012094034517e-05, + "loss": 3.8594, + "step": 32055 + }, + { + "epoch": 2.1782850930832995, + "grad_norm": 0.26267528533935547, + "learning_rate": 7.278587443946188e-05, + "loss": 4.0259, + "step": 32060 + }, + { + "epoch": 2.178624813153961, + "grad_norm": 0.16119059920310974, + "learning_rate": 7.278162793857861e-05, + "loss": 4.0014, + "step": 32065 + }, + { + "epoch": 2.178964533224623, + "grad_norm": 0.15942543745040894, + "learning_rate": 7.277738143769535e-05, + "loss": 4.1173, + "step": 32070 + }, + { + "epoch": 2.179304253295285, + "grad_norm": 0.21408048272132874, + "learning_rate": 7.277313493681207e-05, + "loss": 3.8616, + "step": 32075 + }, + { + "epoch": 2.1796439733659465, + "grad_norm": 0.2084580659866333, + "learning_rate": 7.27688884359288e-05, + "loss": 3.9724, + "step": 32080 + }, + { + "epoch": 2.179983693436608, + "grad_norm": 0.5399390459060669, + "learning_rate": 7.276464193504554e-05, + "loss": 4.1185, + "step": 32085 + }, + { + "epoch": 2.18032341350727, + "grad_norm": 0.16034239530563354, + "learning_rate": 7.276039543416225e-05, + "loss": 3.4816, + "step": 32090 + }, + { + "epoch": 2.180663133577932, + "grad_norm": 0.18410351872444153, + "learning_rate": 7.275614893327898e-05, + "loss": 3.7889, + "step": 32095 + }, + { + "epoch": 2.1810028536485935, + "grad_norm": 0.14784325659275055, + "learning_rate": 7.275190243239572e-05, + "loss": 3.9356, + "step": 32100 + }, + { + "epoch": 2.181342573719255, + "grad_norm": 0.2236083745956421, + "learning_rate": 7.274765593151243e-05, + "loss": 3.9695, + "step": 32105 + }, + { + "epoch": 2.181682293789917, + "grad_norm": 0.22387093305587769, + "learning_rate": 7.274340943062916e-05, + "loss": 3.9127, + "step": 32110 + }, + { + "epoch": 2.182022013860579, + "grad_norm": 0.23005805909633636, + "learning_rate": 7.273916292974589e-05, + "loss": 3.729, + "step": 32115 + }, + { + "epoch": 2.1823617339312404, + "grad_norm": 0.1834970861673355, + "learning_rate": 7.273491642886262e-05, + "loss": 3.9276, + "step": 32120 + }, + { + "epoch": 2.1827014540019025, + "grad_norm": 0.15589368343353271, + "learning_rate": 7.273066992797935e-05, + "loss": 4.0852, + "step": 32125 + }, + { + "epoch": 2.183041174072564, + "grad_norm": 0.13922220468521118, + "learning_rate": 7.272642342709607e-05, + "loss": 3.7652, + "step": 32130 + }, + { + "epoch": 2.183380894143226, + "grad_norm": 0.2236020416021347, + "learning_rate": 7.27221769262128e-05, + "loss": 3.9031, + "step": 32135 + }, + { + "epoch": 2.183720614213888, + "grad_norm": 0.286957710981369, + "learning_rate": 7.271793042532953e-05, + "loss": 3.9359, + "step": 32140 + }, + { + "epoch": 2.1840603342845495, + "grad_norm": 0.21395406126976013, + "learning_rate": 7.271368392444626e-05, + "loss": 3.7655, + "step": 32145 + }, + { + "epoch": 2.184400054355211, + "grad_norm": 0.1681520789861679, + "learning_rate": 7.270943742356299e-05, + "loss": 3.7696, + "step": 32150 + }, + { + "epoch": 2.184739774425873, + "grad_norm": 0.17820137739181519, + "learning_rate": 7.270519092267971e-05, + "loss": 3.5789, + "step": 32155 + }, + { + "epoch": 2.185079494496535, + "grad_norm": 0.15652498602867126, + "learning_rate": 7.270094442179644e-05, + "loss": 3.9706, + "step": 32160 + }, + { + "epoch": 2.1854192145671965, + "grad_norm": 0.15502065420150757, + "learning_rate": 7.269669792091317e-05, + "loss": 3.8317, + "step": 32165 + }, + { + "epoch": 2.1857589346378585, + "grad_norm": 0.43888577818870544, + "learning_rate": 7.26924514200299e-05, + "loss": 3.878, + "step": 32170 + }, + { + "epoch": 2.18609865470852, + "grad_norm": 0.26602602005004883, + "learning_rate": 7.268820491914663e-05, + "loss": 3.6976, + "step": 32175 + }, + { + "epoch": 2.186438374779182, + "grad_norm": 0.16525058448314667, + "learning_rate": 7.268395841826335e-05, + "loss": 3.9027, + "step": 32180 + }, + { + "epoch": 2.186778094849844, + "grad_norm": 0.1805514693260193, + "learning_rate": 7.267971191738008e-05, + "loss": 4.004, + "step": 32185 + }, + { + "epoch": 2.1871178149205055, + "grad_norm": 0.38465961813926697, + "learning_rate": 7.267546541649681e-05, + "loss": 3.9813, + "step": 32190 + }, + { + "epoch": 2.187457534991167, + "grad_norm": 0.15717457234859467, + "learning_rate": 7.267121891561354e-05, + "loss": 4.0347, + "step": 32195 + }, + { + "epoch": 2.187797255061829, + "grad_norm": 0.16581982374191284, + "learning_rate": 7.266697241473027e-05, + "loss": 3.6685, + "step": 32200 + }, + { + "epoch": 2.188136975132491, + "grad_norm": 0.8884585499763489, + "learning_rate": 7.2662725913847e-05, + "loss": 3.9558, + "step": 32205 + }, + { + "epoch": 2.1884766952031525, + "grad_norm": 0.15444910526275635, + "learning_rate": 7.265847941296372e-05, + "loss": 4.0879, + "step": 32210 + }, + { + "epoch": 2.1888164152738145, + "grad_norm": 0.33418309688568115, + "learning_rate": 7.265423291208045e-05, + "loss": 3.8673, + "step": 32215 + }, + { + "epoch": 2.189156135344476, + "grad_norm": 0.18639114499092102, + "learning_rate": 7.264998641119718e-05, + "loss": 3.8882, + "step": 32220 + }, + { + "epoch": 2.189495855415138, + "grad_norm": 0.1885489523410797, + "learning_rate": 7.26457399103139e-05, + "loss": 3.8131, + "step": 32225 + }, + { + "epoch": 2.1898355754858, + "grad_norm": 0.22887979447841644, + "learning_rate": 7.264149340943063e-05, + "loss": 4.1122, + "step": 32230 + }, + { + "epoch": 2.1901752955564615, + "grad_norm": 0.18868611752986908, + "learning_rate": 7.263724690854736e-05, + "loss": 4.0832, + "step": 32235 + }, + { + "epoch": 2.190515015627123, + "grad_norm": 0.17855460941791534, + "learning_rate": 7.263300040766409e-05, + "loss": 3.902, + "step": 32240 + }, + { + "epoch": 2.190854735697785, + "grad_norm": 0.37608101963996887, + "learning_rate": 7.26287539067808e-05, + "loss": 3.5734, + "step": 32245 + }, + { + "epoch": 2.191194455768447, + "grad_norm": 0.13431788980960846, + "learning_rate": 7.262450740589755e-05, + "loss": 3.9101, + "step": 32250 + }, + { + "epoch": 2.1915341758391085, + "grad_norm": 4.8957648277282715, + "learning_rate": 7.262026090501427e-05, + "loss": 3.8486, + "step": 32255 + }, + { + "epoch": 2.1918738959097706, + "grad_norm": 0.18035094439983368, + "learning_rate": 7.261601440413099e-05, + "loss": 3.695, + "step": 32260 + }, + { + "epoch": 2.192213615980432, + "grad_norm": 0.334176242351532, + "learning_rate": 7.261176790324773e-05, + "loss": 4.1347, + "step": 32265 + }, + { + "epoch": 2.192553336051094, + "grad_norm": 0.17351758480072021, + "learning_rate": 7.260752140236446e-05, + "loss": 3.9343, + "step": 32270 + }, + { + "epoch": 2.192893056121756, + "grad_norm": 0.1632409691810608, + "learning_rate": 7.260327490148117e-05, + "loss": 3.8641, + "step": 32275 + }, + { + "epoch": 2.1932327761924175, + "grad_norm": 0.2113005369901657, + "learning_rate": 7.259902840059791e-05, + "loss": 3.9512, + "step": 32280 + }, + { + "epoch": 2.193572496263079, + "grad_norm": 0.2471776008605957, + "learning_rate": 7.259478189971464e-05, + "loss": 3.3578, + "step": 32285 + }, + { + "epoch": 2.1939122163337412, + "grad_norm": 0.23105821013450623, + "learning_rate": 7.259053539883137e-05, + "loss": 3.9146, + "step": 32290 + }, + { + "epoch": 2.194251936404403, + "grad_norm": 0.19851747155189514, + "learning_rate": 7.25862888979481e-05, + "loss": 3.885, + "step": 32295 + }, + { + "epoch": 2.1945916564750645, + "grad_norm": 0.17704175412654877, + "learning_rate": 7.258204239706483e-05, + "loss": 3.8182, + "step": 32300 + }, + { + "epoch": 2.194931376545726, + "grad_norm": 0.19821980595588684, + "learning_rate": 7.257779589618155e-05, + "loss": 3.9775, + "step": 32305 + }, + { + "epoch": 2.195271096616388, + "grad_norm": 0.3010588586330414, + "learning_rate": 7.257354939529828e-05, + "loss": 3.8389, + "step": 32310 + }, + { + "epoch": 2.19561081668705, + "grad_norm": 0.1607079654932022, + "learning_rate": 7.2569302894415e-05, + "loss": 3.6536, + "step": 32315 + }, + { + "epoch": 2.1959505367577115, + "grad_norm": 0.20202195644378662, + "learning_rate": 7.256505639353174e-05, + "loss": 3.5996, + "step": 32320 + }, + { + "epoch": 2.1962902568283735, + "grad_norm": 0.16463324427604675, + "learning_rate": 7.256080989264847e-05, + "loss": 3.9852, + "step": 32325 + }, + { + "epoch": 2.196629976899035, + "grad_norm": 0.3532581031322479, + "learning_rate": 7.255656339176518e-05, + "loss": 3.7085, + "step": 32330 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.20867325365543365, + "learning_rate": 7.255231689088192e-05, + "loss": 3.9127, + "step": 32335 + }, + { + "epoch": 2.197309417040359, + "grad_norm": 0.20770137012004852, + "learning_rate": 7.254807038999865e-05, + "loss": 3.9434, + "step": 32340 + }, + { + "epoch": 2.1976491371110205, + "grad_norm": 0.18354299664497375, + "learning_rate": 7.254382388911536e-05, + "loss": 3.9802, + "step": 32345 + }, + { + "epoch": 2.197988857181682, + "grad_norm": 0.20711280405521393, + "learning_rate": 7.25395773882321e-05, + "loss": 4.1614, + "step": 32350 + }, + { + "epoch": 2.198328577252344, + "grad_norm": 0.20095595717430115, + "learning_rate": 7.253533088734883e-05, + "loss": 3.9152, + "step": 32355 + }, + { + "epoch": 2.198668297323006, + "grad_norm": 0.45739614963531494, + "learning_rate": 7.253108438646555e-05, + "loss": 3.8763, + "step": 32360 + }, + { + "epoch": 2.1990080173936675, + "grad_norm": 0.1527908891439438, + "learning_rate": 7.252683788558229e-05, + "loss": 3.7674, + "step": 32365 + }, + { + "epoch": 2.1993477374643295, + "grad_norm": 0.1537390798330307, + "learning_rate": 7.252259138469902e-05, + "loss": 3.6157, + "step": 32370 + }, + { + "epoch": 2.199687457534991, + "grad_norm": 0.17341235280036926, + "learning_rate": 7.251834488381573e-05, + "loss": 3.6746, + "step": 32375 + }, + { + "epoch": 2.200027177605653, + "grad_norm": 0.149677574634552, + "learning_rate": 7.251409838293247e-05, + "loss": 3.8651, + "step": 32380 + }, + { + "epoch": 2.200366897676315, + "grad_norm": 0.18025921285152435, + "learning_rate": 7.25098518820492e-05, + "loss": 4.0378, + "step": 32385 + }, + { + "epoch": 2.2007066177469765, + "grad_norm": 0.15491366386413574, + "learning_rate": 7.250560538116592e-05, + "loss": 4.207, + "step": 32390 + }, + { + "epoch": 2.201046337817638, + "grad_norm": 0.39977768063545227, + "learning_rate": 7.250135888028266e-05, + "loss": 3.6144, + "step": 32395 + }, + { + "epoch": 2.2013860578883, + "grad_norm": 0.16322045028209686, + "learning_rate": 7.249711237939937e-05, + "loss": 3.9203, + "step": 32400 + }, + { + "epoch": 2.201725777958962, + "grad_norm": 0.15594050288200378, + "learning_rate": 7.24928658785161e-05, + "loss": 3.9203, + "step": 32405 + }, + { + "epoch": 2.2020654980296235, + "grad_norm": 0.17471525073051453, + "learning_rate": 7.248861937763284e-05, + "loss": 3.7588, + "step": 32410 + }, + { + "epoch": 2.2024052181002856, + "grad_norm": 0.16063815355300903, + "learning_rate": 7.248437287674956e-05, + "loss": 3.9733, + "step": 32415 + }, + { + "epoch": 2.202744938170947, + "grad_norm": 0.18090678751468658, + "learning_rate": 7.248012637586628e-05, + "loss": 3.9223, + "step": 32420 + }, + { + "epoch": 2.203084658241609, + "grad_norm": 1.5027004480361938, + "learning_rate": 7.247587987498303e-05, + "loss": 3.89, + "step": 32425 + }, + { + "epoch": 2.203424378312271, + "grad_norm": 0.2761104106903076, + "learning_rate": 7.247163337409974e-05, + "loss": 3.7562, + "step": 32430 + }, + { + "epoch": 2.2037640983829325, + "grad_norm": 0.1741229146718979, + "learning_rate": 7.246738687321647e-05, + "loss": 3.88, + "step": 32435 + }, + { + "epoch": 2.204103818453594, + "grad_norm": 0.16095873713493347, + "learning_rate": 7.246314037233321e-05, + "loss": 4.0639, + "step": 32440 + }, + { + "epoch": 2.204443538524256, + "grad_norm": 0.1826382875442505, + "learning_rate": 7.245889387144992e-05, + "loss": 4.0326, + "step": 32445 + }, + { + "epoch": 2.204783258594918, + "grad_norm": 0.17246341705322266, + "learning_rate": 7.245464737056665e-05, + "loss": 3.7116, + "step": 32450 + }, + { + "epoch": 2.2051229786655795, + "grad_norm": 0.1495402753353119, + "learning_rate": 7.24504008696834e-05, + "loss": 3.7776, + "step": 32455 + }, + { + "epoch": 2.205462698736241, + "grad_norm": 0.16991867125034332, + "learning_rate": 7.244615436880011e-05, + "loss": 3.7621, + "step": 32460 + }, + { + "epoch": 2.205802418806903, + "grad_norm": 0.16698318719863892, + "learning_rate": 7.244190786791684e-05, + "loss": 3.9626, + "step": 32465 + }, + { + "epoch": 2.206142138877565, + "grad_norm": 0.16266398131847382, + "learning_rate": 7.243766136703356e-05, + "loss": 3.896, + "step": 32470 + }, + { + "epoch": 2.2064818589482265, + "grad_norm": 0.17279106378555298, + "learning_rate": 7.243341486615029e-05, + "loss": 4.1053, + "step": 32475 + }, + { + "epoch": 2.2068215790188885, + "grad_norm": 0.2499086558818817, + "learning_rate": 7.242916836526702e-05, + "loss": 3.6753, + "step": 32480 + }, + { + "epoch": 2.20716129908955, + "grad_norm": 0.17951621115207672, + "learning_rate": 7.242492186438375e-05, + "loss": 4.0893, + "step": 32485 + }, + { + "epoch": 2.207501019160212, + "grad_norm": 0.23826122283935547, + "learning_rate": 7.242067536350048e-05, + "loss": 3.67, + "step": 32490 + }, + { + "epoch": 2.207840739230874, + "grad_norm": 0.21968746185302734, + "learning_rate": 7.24164288626172e-05, + "loss": 3.9625, + "step": 32495 + }, + { + "epoch": 2.2081804593015355, + "grad_norm": 0.15708112716674805, + "learning_rate": 7.241218236173393e-05, + "loss": 3.7708, + "step": 32500 + }, + { + "epoch": 2.208520179372197, + "grad_norm": 0.20706376433372498, + "learning_rate": 7.240793586085066e-05, + "loss": 3.9734, + "step": 32505 + }, + { + "epoch": 2.208859899442859, + "grad_norm": 0.1990256905555725, + "learning_rate": 7.240368935996739e-05, + "loss": 3.953, + "step": 32510 + }, + { + "epoch": 2.209199619513521, + "grad_norm": 0.21668708324432373, + "learning_rate": 7.239944285908412e-05, + "loss": 4.1221, + "step": 32515 + }, + { + "epoch": 2.2095393395841825, + "grad_norm": 0.17436078190803528, + "learning_rate": 7.239519635820084e-05, + "loss": 3.9104, + "step": 32520 + }, + { + "epoch": 2.2098790596548445, + "grad_norm": 0.17630061507225037, + "learning_rate": 7.239094985731757e-05, + "loss": 4.1429, + "step": 32525 + }, + { + "epoch": 2.210218779725506, + "grad_norm": 0.1892368048429489, + "learning_rate": 7.23867033564343e-05, + "loss": 3.917, + "step": 32530 + }, + { + "epoch": 2.210558499796168, + "grad_norm": 0.8624678254127502, + "learning_rate": 7.238245685555103e-05, + "loss": 3.9299, + "step": 32535 + }, + { + "epoch": 2.21089821986683, + "grad_norm": 0.22433412075042725, + "learning_rate": 7.237821035466776e-05, + "loss": 4.2013, + "step": 32540 + }, + { + "epoch": 2.2112379399374915, + "grad_norm": 0.19506129622459412, + "learning_rate": 7.237396385378448e-05, + "loss": 3.9779, + "step": 32545 + }, + { + "epoch": 2.211577660008153, + "grad_norm": 0.16294309496879578, + "learning_rate": 7.236971735290121e-05, + "loss": 3.9899, + "step": 32550 + }, + { + "epoch": 2.211917380078815, + "grad_norm": 0.183319553732872, + "learning_rate": 7.236547085201794e-05, + "loss": 3.8771, + "step": 32555 + }, + { + "epoch": 2.212257100149477, + "grad_norm": 0.2051389217376709, + "learning_rate": 7.236122435113467e-05, + "loss": 3.7493, + "step": 32560 + }, + { + "epoch": 2.2125968202201385, + "grad_norm": 0.7087448835372925, + "learning_rate": 7.23569778502514e-05, + "loss": 3.9685, + "step": 32565 + }, + { + "epoch": 2.2129365402908006, + "grad_norm": 0.2831578552722931, + "learning_rate": 7.235273134936813e-05, + "loss": 3.8083, + "step": 32570 + }, + { + "epoch": 2.213276260361462, + "grad_norm": 0.14787006378173828, + "learning_rate": 7.234848484848485e-05, + "loss": 3.7481, + "step": 32575 + }, + { + "epoch": 2.213615980432124, + "grad_norm": 0.20758341252803802, + "learning_rate": 7.234423834760158e-05, + "loss": 3.9352, + "step": 32580 + }, + { + "epoch": 2.213955700502786, + "grad_norm": 0.1573704481124878, + "learning_rate": 7.233999184671831e-05, + "loss": 4.1178, + "step": 32585 + }, + { + "epoch": 2.2142954205734475, + "grad_norm": 0.4370795488357544, + "learning_rate": 7.233574534583504e-05, + "loss": 3.7349, + "step": 32590 + }, + { + "epoch": 2.214635140644109, + "grad_norm": 0.1925032138824463, + "learning_rate": 7.233149884495177e-05, + "loss": 3.7238, + "step": 32595 + }, + { + "epoch": 2.2149748607147712, + "grad_norm": 0.17317524552345276, + "learning_rate": 7.232725234406848e-05, + "loss": 3.9742, + "step": 32600 + }, + { + "epoch": 2.215314580785433, + "grad_norm": 0.1581128090620041, + "learning_rate": 7.232300584318522e-05, + "loss": 3.7956, + "step": 32605 + }, + { + "epoch": 2.2156543008560945, + "grad_norm": 0.2069024294614792, + "learning_rate": 7.231875934230195e-05, + "loss": 3.6325, + "step": 32610 + }, + { + "epoch": 2.2159940209267566, + "grad_norm": 0.19826950132846832, + "learning_rate": 7.231451284141866e-05, + "loss": 4.0699, + "step": 32615 + }, + { + "epoch": 2.216333740997418, + "grad_norm": 0.4247535765171051, + "learning_rate": 7.23102663405354e-05, + "loss": 3.926, + "step": 32620 + }, + { + "epoch": 2.21667346106808, + "grad_norm": 0.19720378518104553, + "learning_rate": 7.230601983965213e-05, + "loss": 4.1544, + "step": 32625 + }, + { + "epoch": 2.217013181138742, + "grad_norm": 0.32859617471694946, + "learning_rate": 7.230177333876886e-05, + "loss": 3.826, + "step": 32630 + }, + { + "epoch": 2.2173529012094035, + "grad_norm": 0.25300586223602295, + "learning_rate": 7.229752683788559e-05, + "loss": 3.796, + "step": 32635 + }, + { + "epoch": 2.217692621280065, + "grad_norm": 0.7959072589874268, + "learning_rate": 7.229328033700232e-05, + "loss": 3.7969, + "step": 32640 + }, + { + "epoch": 2.218032341350727, + "grad_norm": 0.16852198541164398, + "learning_rate": 7.228903383611905e-05, + "loss": 3.7791, + "step": 32645 + }, + { + "epoch": 2.218372061421389, + "grad_norm": 0.15717250108718872, + "learning_rate": 7.228478733523577e-05, + "loss": 3.6655, + "step": 32650 + }, + { + "epoch": 2.2187117814920505, + "grad_norm": 0.2568720877170563, + "learning_rate": 7.22805408343525e-05, + "loss": 3.92, + "step": 32655 + }, + { + "epoch": 2.219051501562712, + "grad_norm": 0.7709200382232666, + "learning_rate": 7.227629433346923e-05, + "loss": 3.8094, + "step": 32660 + }, + { + "epoch": 2.219391221633374, + "grad_norm": 0.15872985124588013, + "learning_rate": 7.227204783258596e-05, + "loss": 3.5879, + "step": 32665 + }, + { + "epoch": 2.219730941704036, + "grad_norm": 0.1740356981754303, + "learning_rate": 7.226780133170267e-05, + "loss": 3.9136, + "step": 32670 + }, + { + "epoch": 2.2200706617746975, + "grad_norm": 0.19331534206867218, + "learning_rate": 7.226355483081941e-05, + "loss": 3.8891, + "step": 32675 + }, + { + "epoch": 2.2204103818453595, + "grad_norm": 0.1989338994026184, + "learning_rate": 7.225930832993614e-05, + "loss": 3.7666, + "step": 32680 + }, + { + "epoch": 2.220750101916021, + "grad_norm": 0.29960718750953674, + "learning_rate": 7.225506182905286e-05, + "loss": 3.7731, + "step": 32685 + }, + { + "epoch": 2.221089821986683, + "grad_norm": 0.22474512457847595, + "learning_rate": 7.22508153281696e-05, + "loss": 3.8351, + "step": 32690 + }, + { + "epoch": 2.221429542057345, + "grad_norm": 0.24010442197322845, + "learning_rate": 7.224656882728633e-05, + "loss": 3.788, + "step": 32695 + }, + { + "epoch": 2.2217692621280065, + "grad_norm": 0.18482977151870728, + "learning_rate": 7.224232232640304e-05, + "loss": 3.917, + "step": 32700 + }, + { + "epoch": 2.222108982198668, + "grad_norm": 0.19073881208896637, + "learning_rate": 7.223807582551978e-05, + "loss": 3.7529, + "step": 32705 + }, + { + "epoch": 2.2224487022693302, + "grad_norm": 0.16896192729473114, + "learning_rate": 7.223382932463651e-05, + "loss": 3.8908, + "step": 32710 + }, + { + "epoch": 2.222788422339992, + "grad_norm": 0.1943991333246231, + "learning_rate": 7.222958282375322e-05, + "loss": 3.9901, + "step": 32715 + }, + { + "epoch": 2.2231281424106535, + "grad_norm": 0.1849452406167984, + "learning_rate": 7.222533632286997e-05, + "loss": 3.873, + "step": 32720 + }, + { + "epoch": 2.2234678624813156, + "grad_norm": 0.1735709309577942, + "learning_rate": 7.22210898219867e-05, + "loss": 3.767, + "step": 32725 + }, + { + "epoch": 2.223807582551977, + "grad_norm": 0.21989227831363678, + "learning_rate": 7.221684332110341e-05, + "loss": 3.887, + "step": 32730 + }, + { + "epoch": 2.224147302622639, + "grad_norm": 0.19092266261577606, + "learning_rate": 7.221259682022015e-05, + "loss": 3.8919, + "step": 32735 + }, + { + "epoch": 2.224487022693301, + "grad_norm": 0.17549388110637665, + "learning_rate": 7.220835031933686e-05, + "loss": 3.9932, + "step": 32740 + }, + { + "epoch": 2.2248267427639625, + "grad_norm": 0.20701493322849274, + "learning_rate": 7.220410381845359e-05, + "loss": 3.8788, + "step": 32745 + }, + { + "epoch": 2.225166462834624, + "grad_norm": 0.2581508159637451, + "learning_rate": 7.219985731757033e-05, + "loss": 3.6688, + "step": 32750 + }, + { + "epoch": 2.2255061829052862, + "grad_norm": 0.2064242660999298, + "learning_rate": 7.219561081668705e-05, + "loss": 3.8434, + "step": 32755 + }, + { + "epoch": 2.225845902975948, + "grad_norm": 0.15417373180389404, + "learning_rate": 7.219136431580378e-05, + "loss": 4.0269, + "step": 32760 + }, + { + "epoch": 2.2261856230466095, + "grad_norm": 0.22962354123592377, + "learning_rate": 7.218711781492052e-05, + "loss": 3.7533, + "step": 32765 + }, + { + "epoch": 2.2265253431172716, + "grad_norm": 0.18707793951034546, + "learning_rate": 7.218287131403723e-05, + "loss": 3.9337, + "step": 32770 + }, + { + "epoch": 2.226865063187933, + "grad_norm": 0.2598256766796112, + "learning_rate": 7.217862481315396e-05, + "loss": 3.873, + "step": 32775 + }, + { + "epoch": 2.227204783258595, + "grad_norm": 0.2104845941066742, + "learning_rate": 7.21743783122707e-05, + "loss": 3.9586, + "step": 32780 + }, + { + "epoch": 2.2275445033292565, + "grad_norm": 0.1780114620923996, + "learning_rate": 7.217013181138742e-05, + "loss": 3.7182, + "step": 32785 + }, + { + "epoch": 2.2278842233999185, + "grad_norm": 0.1577623188495636, + "learning_rate": 7.216588531050414e-05, + "loss": 3.7516, + "step": 32790 + }, + { + "epoch": 2.22822394347058, + "grad_norm": 0.1829080730676651, + "learning_rate": 7.216163880962089e-05, + "loss": 3.9721, + "step": 32795 + }, + { + "epoch": 2.228563663541242, + "grad_norm": 0.20386314392089844, + "learning_rate": 7.21573923087376e-05, + "loss": 4.0669, + "step": 32800 + }, + { + "epoch": 2.228903383611904, + "grad_norm": 0.5111936926841736, + "learning_rate": 7.215314580785433e-05, + "loss": 3.7681, + "step": 32805 + }, + { + "epoch": 2.2292431036825655, + "grad_norm": 0.1619226336479187, + "learning_rate": 7.214889930697107e-05, + "loss": 3.8242, + "step": 32810 + }, + { + "epoch": 2.229582823753227, + "grad_norm": 0.16803240776062012, + "learning_rate": 7.214465280608778e-05, + "loss": 3.9898, + "step": 32815 + }, + { + "epoch": 2.229922543823889, + "grad_norm": 0.1986560970544815, + "learning_rate": 7.214040630520451e-05, + "loss": 3.89, + "step": 32820 + }, + { + "epoch": 2.230262263894551, + "grad_norm": 0.1403447389602661, + "learning_rate": 7.213615980432124e-05, + "loss": 3.8803, + "step": 32825 + }, + { + "epoch": 2.2306019839652125, + "grad_norm": 0.20711904764175415, + "learning_rate": 7.213191330343797e-05, + "loss": 4.0675, + "step": 32830 + }, + { + "epoch": 2.2309417040358746, + "grad_norm": 0.19400465488433838, + "learning_rate": 7.21276668025547e-05, + "loss": 3.9794, + "step": 32835 + }, + { + "epoch": 2.231281424106536, + "grad_norm": 0.2173852175474167, + "learning_rate": 7.212342030167142e-05, + "loss": 3.8335, + "step": 32840 + }, + { + "epoch": 2.231621144177198, + "grad_norm": 0.19035492837429047, + "learning_rate": 7.211917380078815e-05, + "loss": 3.8409, + "step": 32845 + }, + { + "epoch": 2.23196086424786, + "grad_norm": 0.16666001081466675, + "learning_rate": 7.211492729990488e-05, + "loss": 3.8835, + "step": 32850 + }, + { + "epoch": 2.2323005843185215, + "grad_norm": 0.14318807423114777, + "learning_rate": 7.211068079902161e-05, + "loss": 3.7603, + "step": 32855 + }, + { + "epoch": 2.232640304389183, + "grad_norm": 0.20153585076332092, + "learning_rate": 7.210643429813834e-05, + "loss": 3.5964, + "step": 32860 + }, + { + "epoch": 2.2329800244598452, + "grad_norm": 0.2123516947031021, + "learning_rate": 7.210218779725506e-05, + "loss": 3.7307, + "step": 32865 + }, + { + "epoch": 2.233319744530507, + "grad_norm": 0.15104936063289642, + "learning_rate": 7.209794129637179e-05, + "loss": 4.0579, + "step": 32870 + }, + { + "epoch": 2.2336594646011685, + "grad_norm": 0.3451225459575653, + "learning_rate": 7.209369479548852e-05, + "loss": 3.993, + "step": 32875 + }, + { + "epoch": 2.2339991846718306, + "grad_norm": 0.16047649085521698, + "learning_rate": 7.208944829460525e-05, + "loss": 3.8844, + "step": 32880 + }, + { + "epoch": 2.234338904742492, + "grad_norm": 0.17082031071186066, + "learning_rate": 7.208520179372198e-05, + "loss": 3.8836, + "step": 32885 + }, + { + "epoch": 2.234678624813154, + "grad_norm": 0.18137559294700623, + "learning_rate": 7.20809552928387e-05, + "loss": 3.8137, + "step": 32890 + }, + { + "epoch": 2.235018344883816, + "grad_norm": 0.26225224137306213, + "learning_rate": 7.207670879195543e-05, + "loss": 3.7401, + "step": 32895 + }, + { + "epoch": 2.2353580649544775, + "grad_norm": 0.28071925044059753, + "learning_rate": 7.207246229107216e-05, + "loss": 3.8964, + "step": 32900 + }, + { + "epoch": 2.235697785025139, + "grad_norm": 0.18243293464183807, + "learning_rate": 7.206821579018889e-05, + "loss": 3.817, + "step": 32905 + }, + { + "epoch": 2.2360375050958012, + "grad_norm": 0.8295483589172363, + "learning_rate": 7.206396928930562e-05, + "loss": 4.1369, + "step": 32910 + }, + { + "epoch": 2.236377225166463, + "grad_norm": 0.25945645570755005, + "learning_rate": 7.205972278842234e-05, + "loss": 4.1043, + "step": 32915 + }, + { + "epoch": 2.2367169452371245, + "grad_norm": 0.3617568910121918, + "learning_rate": 7.205547628753907e-05, + "loss": 3.6668, + "step": 32920 + }, + { + "epoch": 2.2370566653077866, + "grad_norm": 1.3376972675323486, + "learning_rate": 7.20512297866558e-05, + "loss": 4.0678, + "step": 32925 + }, + { + "epoch": 2.237396385378448, + "grad_norm": 0.1728859841823578, + "learning_rate": 7.204698328577253e-05, + "loss": 4.0691, + "step": 32930 + }, + { + "epoch": 2.23773610544911, + "grad_norm": 0.16583450138568878, + "learning_rate": 7.204273678488926e-05, + "loss": 3.8455, + "step": 32935 + }, + { + "epoch": 2.238075825519772, + "grad_norm": 0.3183199465274811, + "learning_rate": 7.203849028400597e-05, + "loss": 3.6381, + "step": 32940 + }, + { + "epoch": 2.2384155455904335, + "grad_norm": 0.20117223262786865, + "learning_rate": 7.203424378312271e-05, + "loss": 4.119, + "step": 32945 + }, + { + "epoch": 2.238755265661095, + "grad_norm": 0.1535332351922989, + "learning_rate": 7.202999728223944e-05, + "loss": 3.9492, + "step": 32950 + }, + { + "epoch": 2.2390949857317572, + "grad_norm": 0.1537209451198578, + "learning_rate": 7.202575078135615e-05, + "loss": 3.8627, + "step": 32955 + }, + { + "epoch": 2.239434705802419, + "grad_norm": 0.1486196666955948, + "learning_rate": 7.20215042804729e-05, + "loss": 3.8643, + "step": 32960 + }, + { + "epoch": 2.2397744258730805, + "grad_norm": 0.23392526805400848, + "learning_rate": 7.201725777958962e-05, + "loss": 3.9241, + "step": 32965 + }, + { + "epoch": 2.2401141459437426, + "grad_norm": 0.21028755605220795, + "learning_rate": 7.201301127870635e-05, + "loss": 3.9397, + "step": 32970 + }, + { + "epoch": 2.240453866014404, + "grad_norm": 0.21871845424175262, + "learning_rate": 7.200876477782308e-05, + "loss": 3.9309, + "step": 32975 + }, + { + "epoch": 2.240793586085066, + "grad_norm": 0.19653216004371643, + "learning_rate": 7.200451827693981e-05, + "loss": 3.898, + "step": 32980 + }, + { + "epoch": 2.2411333061557275, + "grad_norm": 0.17270530760288239, + "learning_rate": 7.200027177605654e-05, + "loss": 3.8613, + "step": 32985 + }, + { + "epoch": 2.2414730262263896, + "grad_norm": 0.18931029736995697, + "learning_rate": 7.199602527517326e-05, + "loss": 3.7317, + "step": 32990 + }, + { + "epoch": 2.241812746297051, + "grad_norm": 0.1772434562444687, + "learning_rate": 7.199177877428999e-05, + "loss": 3.8472, + "step": 32995 + }, + { + "epoch": 2.242152466367713, + "grad_norm": 0.6844156980514526, + "learning_rate": 7.198753227340672e-05, + "loss": 3.8963, + "step": 33000 + }, + { + "epoch": 2.242492186438375, + "grad_norm": 0.1517947018146515, + "learning_rate": 7.198328577252345e-05, + "loss": 3.8459, + "step": 33005 + }, + { + "epoch": 2.2428319065090365, + "grad_norm": 0.19754000008106232, + "learning_rate": 7.197903927164018e-05, + "loss": 3.6174, + "step": 33010 + }, + { + "epoch": 2.243171626579698, + "grad_norm": 0.17129679024219513, + "learning_rate": 7.19747927707569e-05, + "loss": 4.0437, + "step": 33015 + }, + { + "epoch": 2.2435113466503602, + "grad_norm": 0.38179558515548706, + "learning_rate": 7.197054626987363e-05, + "loss": 3.958, + "step": 33020 + }, + { + "epoch": 2.243851066721022, + "grad_norm": 0.1802847981452942, + "learning_rate": 7.196629976899035e-05, + "loss": 4.0013, + "step": 33025 + }, + { + "epoch": 2.2441907867916835, + "grad_norm": 0.31891149282455444, + "learning_rate": 7.196205326810709e-05, + "loss": 3.8573, + "step": 33030 + }, + { + "epoch": 2.2445305068623456, + "grad_norm": 0.3007993996143341, + "learning_rate": 7.195780676722382e-05, + "loss": 3.9608, + "step": 33035 + }, + { + "epoch": 2.244870226933007, + "grad_norm": 0.19720762968063354, + "learning_rate": 7.195356026634053e-05, + "loss": 3.8465, + "step": 33040 + }, + { + "epoch": 2.245209947003669, + "grad_norm": 0.23990698158740997, + "learning_rate": 7.194931376545727e-05, + "loss": 3.9061, + "step": 33045 + }, + { + "epoch": 2.245549667074331, + "grad_norm": 0.1943071037530899, + "learning_rate": 7.1945067264574e-05, + "loss": 4.3124, + "step": 33050 + }, + { + "epoch": 2.2458893871449925, + "grad_norm": 0.20591843128204346, + "learning_rate": 7.194082076369071e-05, + "loss": 4.0123, + "step": 33055 + }, + { + "epoch": 2.246229107215654, + "grad_norm": 0.1863325983285904, + "learning_rate": 7.193657426280746e-05, + "loss": 3.9307, + "step": 33060 + }, + { + "epoch": 2.2465688272863162, + "grad_norm": 0.1978752464056015, + "learning_rate": 7.193232776192418e-05, + "loss": 3.952, + "step": 33065 + }, + { + "epoch": 2.246908547356978, + "grad_norm": 0.26166045665740967, + "learning_rate": 7.19280812610409e-05, + "loss": 3.8347, + "step": 33070 + }, + { + "epoch": 2.2472482674276395, + "grad_norm": 0.19126182794570923, + "learning_rate": 7.192383476015764e-05, + "loss": 4.0147, + "step": 33075 + }, + { + "epoch": 2.2475879874983016, + "grad_norm": 0.32238999009132385, + "learning_rate": 7.191958825927437e-05, + "loss": 3.8888, + "step": 33080 + }, + { + "epoch": 2.247927707568963, + "grad_norm": 0.19845977425575256, + "learning_rate": 7.191534175839108e-05, + "loss": 3.8476, + "step": 33085 + }, + { + "epoch": 2.248267427639625, + "grad_norm": 0.17749014496803284, + "learning_rate": 7.191109525750782e-05, + "loss": 3.6831, + "step": 33090 + }, + { + "epoch": 2.248607147710287, + "grad_norm": 0.19815604388713837, + "learning_rate": 7.190684875662454e-05, + "loss": 3.6774, + "step": 33095 + }, + { + "epoch": 2.2489468677809485, + "grad_norm": 0.23072487115859985, + "learning_rate": 7.190260225574127e-05, + "loss": 3.8091, + "step": 33100 + }, + { + "epoch": 2.24928658785161, + "grad_norm": 0.24032799899578094, + "learning_rate": 7.189835575485801e-05, + "loss": 3.6769, + "step": 33105 + }, + { + "epoch": 2.2496263079222723, + "grad_norm": 0.17884665727615356, + "learning_rate": 7.189410925397472e-05, + "loss": 3.7307, + "step": 33110 + }, + { + "epoch": 2.249966027992934, + "grad_norm": 0.4255974292755127, + "learning_rate": 7.188986275309145e-05, + "loss": 3.8669, + "step": 33115 + }, + { + "epoch": 2.2503057480635955, + "grad_norm": 0.2701708674430847, + "learning_rate": 7.188561625220819e-05, + "loss": 3.7426, + "step": 33120 + }, + { + "epoch": 2.250645468134257, + "grad_norm": 0.4801843464374542, + "learning_rate": 7.18813697513249e-05, + "loss": 3.9744, + "step": 33125 + }, + { + "epoch": 2.250985188204919, + "grad_norm": 0.16404716670513153, + "learning_rate": 7.187712325044163e-05, + "loss": 4.0791, + "step": 33130 + }, + { + "epoch": 2.251324908275581, + "grad_norm": 0.16959205269813538, + "learning_rate": 7.187287674955838e-05, + "loss": 3.9756, + "step": 33135 + }, + { + "epoch": 2.2516646283462425, + "grad_norm": 0.2007073163986206, + "learning_rate": 7.186863024867509e-05, + "loss": 3.9796, + "step": 33140 + }, + { + "epoch": 2.2520043484169046, + "grad_norm": 0.19775402545928955, + "learning_rate": 7.186438374779182e-05, + "loss": 3.8637, + "step": 33145 + }, + { + "epoch": 2.252344068487566, + "grad_norm": 0.21035252511501312, + "learning_rate": 7.186013724690856e-05, + "loss": 3.7115, + "step": 33150 + }, + { + "epoch": 2.252683788558228, + "grad_norm": 0.17229366302490234, + "learning_rate": 7.185589074602527e-05, + "loss": 4.0841, + "step": 33155 + }, + { + "epoch": 2.25302350862889, + "grad_norm": 0.19923605024814606, + "learning_rate": 7.1851644245142e-05, + "loss": 3.955, + "step": 33160 + }, + { + "epoch": 2.2533632286995515, + "grad_norm": 0.8353069424629211, + "learning_rate": 7.184739774425873e-05, + "loss": 4.0406, + "step": 33165 + }, + { + "epoch": 2.253702948770213, + "grad_norm": 0.24251116812229156, + "learning_rate": 7.184315124337546e-05, + "loss": 3.785, + "step": 33170 + }, + { + "epoch": 2.2540426688408752, + "grad_norm": 2.5886390209198, + "learning_rate": 7.183890474249219e-05, + "loss": 3.7995, + "step": 33175 + }, + { + "epoch": 2.254382388911537, + "grad_norm": 0.5043595433235168, + "learning_rate": 7.183465824160891e-05, + "loss": 3.6664, + "step": 33180 + }, + { + "epoch": 2.2547221089821985, + "grad_norm": 0.1549055129289627, + "learning_rate": 7.183041174072564e-05, + "loss": 3.9833, + "step": 33185 + }, + { + "epoch": 2.2550618290528606, + "grad_norm": 0.19492986798286438, + "learning_rate": 7.182616523984237e-05, + "loss": 3.8183, + "step": 33190 + }, + { + "epoch": 2.255401549123522, + "grad_norm": 0.23240990936756134, + "learning_rate": 7.18219187389591e-05, + "loss": 3.9604, + "step": 33195 + }, + { + "epoch": 2.255741269194184, + "grad_norm": 0.30934080481529236, + "learning_rate": 7.181767223807583e-05, + "loss": 3.9077, + "step": 33200 + }, + { + "epoch": 2.256080989264846, + "grad_norm": 0.40974095463752747, + "learning_rate": 7.181342573719255e-05, + "loss": 4.001, + "step": 33205 + }, + { + "epoch": 2.2564207093355075, + "grad_norm": 0.1593775898218155, + "learning_rate": 7.180917923630928e-05, + "loss": 3.9428, + "step": 33210 + }, + { + "epoch": 2.256760429406169, + "grad_norm": 0.22154617309570312, + "learning_rate": 7.180493273542601e-05, + "loss": 3.8433, + "step": 33215 + }, + { + "epoch": 2.2571001494768312, + "grad_norm": 0.21985360980033875, + "learning_rate": 7.180068623454274e-05, + "loss": 3.9897, + "step": 33220 + }, + { + "epoch": 2.257439869547493, + "grad_norm": 0.2663731575012207, + "learning_rate": 7.179643973365947e-05, + "loss": 3.7266, + "step": 33225 + }, + { + "epoch": 2.2577795896181545, + "grad_norm": 0.1901504248380661, + "learning_rate": 7.17921932327762e-05, + "loss": 3.7622, + "step": 33230 + }, + { + "epoch": 2.2581193096888166, + "grad_norm": 0.1499297171831131, + "learning_rate": 7.178794673189292e-05, + "loss": 3.9354, + "step": 33235 + }, + { + "epoch": 2.258459029759478, + "grad_norm": 0.1658245027065277, + "learning_rate": 7.178370023100965e-05, + "loss": 3.8306, + "step": 33240 + }, + { + "epoch": 2.25879874983014, + "grad_norm": 0.20522591471672058, + "learning_rate": 7.177945373012638e-05, + "loss": 3.6947, + "step": 33245 + }, + { + "epoch": 2.259138469900802, + "grad_norm": 0.22414501011371613, + "learning_rate": 7.177520722924311e-05, + "loss": 3.9391, + "step": 33250 + }, + { + "epoch": 2.2594781899714635, + "grad_norm": 0.22275273501873016, + "learning_rate": 7.177096072835983e-05, + "loss": 3.7588, + "step": 33255 + }, + { + "epoch": 2.259817910042125, + "grad_norm": 0.33584660291671753, + "learning_rate": 7.176671422747656e-05, + "loss": 4.052, + "step": 33260 + }, + { + "epoch": 2.2601576301127873, + "grad_norm": 0.2278933972120285, + "learning_rate": 7.176246772659329e-05, + "loss": 4.1226, + "step": 33265 + }, + { + "epoch": 2.260497350183449, + "grad_norm": 0.1832197606563568, + "learning_rate": 7.175822122571002e-05, + "loss": 3.7119, + "step": 33270 + }, + { + "epoch": 2.2608370702541105, + "grad_norm": 0.18679380416870117, + "learning_rate": 7.175397472482675e-05, + "loss": 3.8617, + "step": 33275 + }, + { + "epoch": 2.2611767903247726, + "grad_norm": 0.18932166695594788, + "learning_rate": 7.174972822394348e-05, + "loss": 3.8768, + "step": 33280 + }, + { + "epoch": 2.261516510395434, + "grad_norm": 0.24310176074504852, + "learning_rate": 7.17454817230602e-05, + "loss": 3.9054, + "step": 33285 + }, + { + "epoch": 2.261856230466096, + "grad_norm": 0.17377562820911407, + "learning_rate": 7.174123522217693e-05, + "loss": 3.8688, + "step": 33290 + }, + { + "epoch": 2.262195950536758, + "grad_norm": 0.19484102725982666, + "learning_rate": 7.173698872129365e-05, + "loss": 3.913, + "step": 33295 + }, + { + "epoch": 2.2625356706074196, + "grad_norm": 0.1831160932779312, + "learning_rate": 7.173274222041039e-05, + "loss": 3.7889, + "step": 33300 + }, + { + "epoch": 2.262875390678081, + "grad_norm": 0.16917267441749573, + "learning_rate": 7.172849571952712e-05, + "loss": 3.7721, + "step": 33305 + }, + { + "epoch": 2.2632151107487433, + "grad_norm": 0.22589050233364105, + "learning_rate": 7.172424921864384e-05, + "loss": 3.8276, + "step": 33310 + }, + { + "epoch": 2.263554830819405, + "grad_norm": 0.17123013734817505, + "learning_rate": 7.172000271776057e-05, + "loss": 4.1028, + "step": 33315 + }, + { + "epoch": 2.2638945508900665, + "grad_norm": 0.20784686505794525, + "learning_rate": 7.17157562168773e-05, + "loss": 3.8916, + "step": 33320 + }, + { + "epoch": 2.2642342709607286, + "grad_norm": 0.16869431734085083, + "learning_rate": 7.171150971599403e-05, + "loss": 3.8266, + "step": 33325 + }, + { + "epoch": 2.2645739910313902, + "grad_norm": 0.2343980073928833, + "learning_rate": 7.170726321511076e-05, + "loss": 3.8865, + "step": 33330 + }, + { + "epoch": 2.264913711102052, + "grad_norm": 0.17854757606983185, + "learning_rate": 7.170301671422748e-05, + "loss": 3.7931, + "step": 33335 + }, + { + "epoch": 2.2652534311727135, + "grad_norm": 0.18094027042388916, + "learning_rate": 7.169877021334421e-05, + "loss": 3.9197, + "step": 33340 + }, + { + "epoch": 2.2655931512433756, + "grad_norm": 0.18724893033504486, + "learning_rate": 7.169452371246094e-05, + "loss": 3.9053, + "step": 33345 + }, + { + "epoch": 2.265932871314037, + "grad_norm": 0.18492181599140167, + "learning_rate": 7.169027721157767e-05, + "loss": 3.9821, + "step": 33350 + }, + { + "epoch": 2.266272591384699, + "grad_norm": 0.21740753948688507, + "learning_rate": 7.16860307106944e-05, + "loss": 4.0089, + "step": 33355 + }, + { + "epoch": 2.266612311455361, + "grad_norm": 0.1554931402206421, + "learning_rate": 7.168178420981112e-05, + "loss": 4.0292, + "step": 33360 + }, + { + "epoch": 2.2669520315260225, + "grad_norm": 0.1901857554912567, + "learning_rate": 7.167753770892784e-05, + "loss": 3.8846, + "step": 33365 + }, + { + "epoch": 2.267291751596684, + "grad_norm": 0.17580756545066833, + "learning_rate": 7.167329120804458e-05, + "loss": 3.9946, + "step": 33370 + }, + { + "epoch": 2.2676314716673462, + "grad_norm": 0.17764413356781006, + "learning_rate": 7.166904470716131e-05, + "loss": 3.8258, + "step": 33375 + }, + { + "epoch": 2.267971191738008, + "grad_norm": 0.4496670067310333, + "learning_rate": 7.166479820627802e-05, + "loss": 4.0429, + "step": 33380 + }, + { + "epoch": 2.2683109118086695, + "grad_norm": 0.15851478278636932, + "learning_rate": 7.166055170539476e-05, + "loss": 3.5909, + "step": 33385 + }, + { + "epoch": 2.2686506318793316, + "grad_norm": 0.186979740858078, + "learning_rate": 7.165630520451149e-05, + "loss": 3.8922, + "step": 33390 + }, + { + "epoch": 2.268990351949993, + "grad_norm": 0.18307413160800934, + "learning_rate": 7.16520587036282e-05, + "loss": 3.8299, + "step": 33395 + }, + { + "epoch": 2.269330072020655, + "grad_norm": 0.16941972076892853, + "learning_rate": 7.164781220274495e-05, + "loss": 3.6695, + "step": 33400 + }, + { + "epoch": 2.269669792091317, + "grad_norm": 0.20379313826560974, + "learning_rate": 7.164356570186168e-05, + "loss": 3.6436, + "step": 33405 + }, + { + "epoch": 2.2700095121619785, + "grad_norm": 0.31415966153144836, + "learning_rate": 7.163931920097839e-05, + "loss": 3.6928, + "step": 33410 + }, + { + "epoch": 2.27034923223264, + "grad_norm": 0.17186102271080017, + "learning_rate": 7.163507270009513e-05, + "loss": 3.9076, + "step": 33415 + }, + { + "epoch": 2.2706889523033023, + "grad_norm": 0.17343521118164062, + "learning_rate": 7.163082619921186e-05, + "loss": 4.1058, + "step": 33420 + }, + { + "epoch": 2.271028672373964, + "grad_norm": 0.1676948219537735, + "learning_rate": 7.162657969832857e-05, + "loss": 3.7754, + "step": 33425 + }, + { + "epoch": 2.2713683924446255, + "grad_norm": 0.193776935338974, + "learning_rate": 7.162233319744532e-05, + "loss": 3.8159, + "step": 33430 + }, + { + "epoch": 2.2717081125152876, + "grad_norm": 0.13933351635932922, + "learning_rate": 7.161808669656204e-05, + "loss": 3.8259, + "step": 33435 + }, + { + "epoch": 2.2720478325859492, + "grad_norm": 0.15225152671337128, + "learning_rate": 7.161384019567876e-05, + "loss": 3.6579, + "step": 33440 + }, + { + "epoch": 2.272387552656611, + "grad_norm": 0.14554491639137268, + "learning_rate": 7.16095936947955e-05, + "loss": 3.8433, + "step": 33445 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.17570161819458008, + "learning_rate": 7.160534719391221e-05, + "loss": 3.7441, + "step": 33450 + }, + { + "epoch": 2.2730669927979346, + "grad_norm": 0.5071001648902893, + "learning_rate": 7.160110069302894e-05, + "loss": 3.8475, + "step": 33455 + }, + { + "epoch": 2.273406712868596, + "grad_norm": 0.26464685797691345, + "learning_rate": 7.159685419214568e-05, + "loss": 3.8553, + "step": 33460 + }, + { + "epoch": 2.273746432939258, + "grad_norm": 0.18169206380844116, + "learning_rate": 7.15926076912624e-05, + "loss": 3.7586, + "step": 33465 + }, + { + "epoch": 2.27408615300992, + "grad_norm": 0.367622971534729, + "learning_rate": 7.158836119037913e-05, + "loss": 3.9556, + "step": 33470 + }, + { + "epoch": 2.2744258730805815, + "grad_norm": 0.16520164906978607, + "learning_rate": 7.158411468949587e-05, + "loss": 3.916, + "step": 33475 + }, + { + "epoch": 2.274765593151243, + "grad_norm": 1.0610811710357666, + "learning_rate": 7.157986818861258e-05, + "loss": 3.8553, + "step": 33480 + }, + { + "epoch": 2.2751053132219052, + "grad_norm": 0.15802854299545288, + "learning_rate": 7.157562168772931e-05, + "loss": 3.7982, + "step": 33485 + }, + { + "epoch": 2.275445033292567, + "grad_norm": 0.17433717846870422, + "learning_rate": 7.157137518684605e-05, + "loss": 3.9885, + "step": 33490 + }, + { + "epoch": 2.2757847533632285, + "grad_norm": 0.1752845048904419, + "learning_rate": 7.156712868596277e-05, + "loss": 3.9711, + "step": 33495 + }, + { + "epoch": 2.2761244734338906, + "grad_norm": 0.14478828012943268, + "learning_rate": 7.15628821850795e-05, + "loss": 4.1626, + "step": 33500 + }, + { + "epoch": 2.276464193504552, + "grad_norm": 0.24581988155841827, + "learning_rate": 7.155863568419624e-05, + "loss": 4.0829, + "step": 33505 + }, + { + "epoch": 2.276803913575214, + "grad_norm": 0.18216434121131897, + "learning_rate": 7.155438918331295e-05, + "loss": 3.8128, + "step": 33510 + }, + { + "epoch": 2.277143633645876, + "grad_norm": 0.25158578157424927, + "learning_rate": 7.155014268242968e-05, + "loss": 3.8658, + "step": 33515 + }, + { + "epoch": 2.2774833537165375, + "grad_norm": 0.1919831931591034, + "learning_rate": 7.15458961815464e-05, + "loss": 3.7684, + "step": 33520 + }, + { + "epoch": 2.277823073787199, + "grad_norm": 0.14625497162342072, + "learning_rate": 7.154164968066313e-05, + "loss": 3.7948, + "step": 33525 + }, + { + "epoch": 2.2781627938578612, + "grad_norm": 0.18487650156021118, + "learning_rate": 7.153740317977986e-05, + "loss": 3.9077, + "step": 33530 + }, + { + "epoch": 2.278502513928523, + "grad_norm": 0.16735979914665222, + "learning_rate": 7.153315667889659e-05, + "loss": 3.8415, + "step": 33535 + }, + { + "epoch": 2.2788422339991845, + "grad_norm": 0.22768963873386383, + "learning_rate": 7.152891017801332e-05, + "loss": 4.1461, + "step": 33540 + }, + { + "epoch": 2.2791819540698466, + "grad_norm": 0.16857054829597473, + "learning_rate": 7.152466367713005e-05, + "loss": 3.7733, + "step": 33545 + }, + { + "epoch": 2.279521674140508, + "grad_norm": 0.17888092994689941, + "learning_rate": 7.152041717624677e-05, + "loss": 3.8599, + "step": 33550 + }, + { + "epoch": 2.27986139421117, + "grad_norm": 0.2282891422510147, + "learning_rate": 7.15161706753635e-05, + "loss": 3.8938, + "step": 33555 + }, + { + "epoch": 2.280201114281832, + "grad_norm": 0.1962967813014984, + "learning_rate": 7.151192417448023e-05, + "loss": 3.7165, + "step": 33560 + }, + { + "epoch": 2.2805408343524936, + "grad_norm": 0.16763916611671448, + "learning_rate": 7.150767767359696e-05, + "loss": 3.754, + "step": 33565 + }, + { + "epoch": 2.280880554423155, + "grad_norm": 0.3291618824005127, + "learning_rate": 7.150343117271369e-05, + "loss": 4.0277, + "step": 33570 + }, + { + "epoch": 2.2812202744938173, + "grad_norm": 0.17663203179836273, + "learning_rate": 7.149918467183041e-05, + "loss": 3.7553, + "step": 33575 + }, + { + "epoch": 2.281559994564479, + "grad_norm": 0.25080183148384094, + "learning_rate": 7.149493817094714e-05, + "loss": 3.6961, + "step": 33580 + }, + { + "epoch": 2.2818997146351405, + "grad_norm": 0.2729285955429077, + "learning_rate": 7.149069167006387e-05, + "loss": 3.9198, + "step": 33585 + }, + { + "epoch": 2.2822394347058026, + "grad_norm": 0.19085875153541565, + "learning_rate": 7.14864451691806e-05, + "loss": 3.8354, + "step": 33590 + }, + { + "epoch": 2.2825791547764642, + "grad_norm": 0.574005663394928, + "learning_rate": 7.148219866829733e-05, + "loss": 3.788, + "step": 33595 + }, + { + "epoch": 2.282918874847126, + "grad_norm": 0.19303672015666962, + "learning_rate": 7.147795216741405e-05, + "loss": 3.9461, + "step": 33600 + }, + { + "epoch": 2.283258594917788, + "grad_norm": 0.198695108294487, + "learning_rate": 7.147370566653078e-05, + "loss": 3.9211, + "step": 33605 + }, + { + "epoch": 2.2835983149884496, + "grad_norm": 0.1995697170495987, + "learning_rate": 7.146945916564751e-05, + "loss": 3.9512, + "step": 33610 + }, + { + "epoch": 2.283938035059111, + "grad_norm": 0.17618048191070557, + "learning_rate": 7.146521266476424e-05, + "loss": 3.7892, + "step": 33615 + }, + { + "epoch": 2.2842777551297733, + "grad_norm": 0.1950692981481552, + "learning_rate": 7.146096616388097e-05, + "loss": 3.8941, + "step": 33620 + }, + { + "epoch": 2.284617475200435, + "grad_norm": 0.27112337946891785, + "learning_rate": 7.14567196629977e-05, + "loss": 4.0642, + "step": 33625 + }, + { + "epoch": 2.2849571952710965, + "grad_norm": 0.19941969215869904, + "learning_rate": 7.145247316211442e-05, + "loss": 3.8245, + "step": 33630 + }, + { + "epoch": 2.2852969153417586, + "grad_norm": 0.18470698595046997, + "learning_rate": 7.144822666123115e-05, + "loss": 3.8602, + "step": 33635 + }, + { + "epoch": 2.2856366354124202, + "grad_norm": 0.2135244458913803, + "learning_rate": 7.144398016034788e-05, + "loss": 4.0668, + "step": 33640 + }, + { + "epoch": 2.285976355483082, + "grad_norm": 0.17999140918254852, + "learning_rate": 7.14397336594646e-05, + "loss": 3.9108, + "step": 33645 + }, + { + "epoch": 2.286316075553744, + "grad_norm": 0.17635738849639893, + "learning_rate": 7.143548715858133e-05, + "loss": 3.7621, + "step": 33650 + }, + { + "epoch": 2.2866557956244056, + "grad_norm": 0.19083411991596222, + "learning_rate": 7.143124065769806e-05, + "loss": 3.9116, + "step": 33655 + }, + { + "epoch": 2.286995515695067, + "grad_norm": 0.1773659735918045, + "learning_rate": 7.142699415681479e-05, + "loss": 3.9662, + "step": 33660 + }, + { + "epoch": 2.2873352357657293, + "grad_norm": 0.4037283658981323, + "learning_rate": 7.142274765593152e-05, + "loss": 3.7613, + "step": 33665 + }, + { + "epoch": 2.287674955836391, + "grad_norm": 0.15501853823661804, + "learning_rate": 7.141850115504825e-05, + "loss": 3.8143, + "step": 33670 + }, + { + "epoch": 2.2880146759070525, + "grad_norm": 0.4571472704410553, + "learning_rate": 7.141425465416497e-05, + "loss": 3.6141, + "step": 33675 + }, + { + "epoch": 2.288354395977714, + "grad_norm": 0.37283310294151306, + "learning_rate": 7.14100081532817e-05, + "loss": 3.7436, + "step": 33680 + }, + { + "epoch": 2.2886941160483762, + "grad_norm": 0.1553613841533661, + "learning_rate": 7.140576165239843e-05, + "loss": 3.9062, + "step": 33685 + }, + { + "epoch": 2.289033836119038, + "grad_norm": 0.1930345743894577, + "learning_rate": 7.140151515151516e-05, + "loss": 3.9082, + "step": 33690 + }, + { + "epoch": 2.2893735561896995, + "grad_norm": 0.15060670673847198, + "learning_rate": 7.139726865063189e-05, + "loss": 3.9545, + "step": 33695 + }, + { + "epoch": 2.2897132762603616, + "grad_norm": 0.15339234471321106, + "learning_rate": 7.139302214974861e-05, + "loss": 4.0158, + "step": 33700 + }, + { + "epoch": 2.290052996331023, + "grad_norm": 0.14945641160011292, + "learning_rate": 7.138877564886534e-05, + "loss": 3.9205, + "step": 33705 + }, + { + "epoch": 2.290392716401685, + "grad_norm": 0.20205023884773254, + "learning_rate": 7.138452914798207e-05, + "loss": 3.69, + "step": 33710 + }, + { + "epoch": 2.290732436472347, + "grad_norm": 0.18704114854335785, + "learning_rate": 7.13802826470988e-05, + "loss": 3.6467, + "step": 33715 + }, + { + "epoch": 2.2910721565430086, + "grad_norm": 0.16664418578147888, + "learning_rate": 7.137603614621551e-05, + "loss": 3.6894, + "step": 33720 + }, + { + "epoch": 2.29141187661367, + "grad_norm": 0.15274091064929962, + "learning_rate": 7.137178964533225e-05, + "loss": 3.8672, + "step": 33725 + }, + { + "epoch": 2.2917515966843323, + "grad_norm": 0.6551703214645386, + "learning_rate": 7.136754314444898e-05, + "loss": 4.1407, + "step": 33730 + }, + { + "epoch": 2.292091316754994, + "grad_norm": 0.1827123463153839, + "learning_rate": 7.13632966435657e-05, + "loss": 3.8326, + "step": 33735 + }, + { + "epoch": 2.2924310368256555, + "grad_norm": 0.2048230916261673, + "learning_rate": 7.135905014268244e-05, + "loss": 3.8248, + "step": 33740 + }, + { + "epoch": 2.2927707568963176, + "grad_norm": 0.18320338428020477, + "learning_rate": 7.135480364179917e-05, + "loss": 3.7342, + "step": 33745 + }, + { + "epoch": 2.2931104769669792, + "grad_norm": 0.1585918664932251, + "learning_rate": 7.135055714091588e-05, + "loss": 3.9874, + "step": 33750 + }, + { + "epoch": 2.293450197037641, + "grad_norm": 0.26423370838165283, + "learning_rate": 7.134631064003262e-05, + "loss": 3.9634, + "step": 33755 + }, + { + "epoch": 2.293789917108303, + "grad_norm": 0.14517532289028168, + "learning_rate": 7.134206413914935e-05, + "loss": 3.6447, + "step": 33760 + }, + { + "epoch": 2.2941296371789646, + "grad_norm": 0.15339674055576324, + "learning_rate": 7.133781763826606e-05, + "loss": 3.7984, + "step": 33765 + }, + { + "epoch": 2.294469357249626, + "grad_norm": 0.1475820392370224, + "learning_rate": 7.13335711373828e-05, + "loss": 3.9767, + "step": 33770 + }, + { + "epoch": 2.2948090773202883, + "grad_norm": 0.2495499849319458, + "learning_rate": 7.132932463649953e-05, + "loss": 3.7771, + "step": 33775 + }, + { + "epoch": 2.29514879739095, + "grad_norm": 0.17700563371181488, + "learning_rate": 7.132507813561625e-05, + "loss": 3.8377, + "step": 33780 + }, + { + "epoch": 2.2954885174616115, + "grad_norm": 0.1817101240158081, + "learning_rate": 7.132083163473299e-05, + "loss": 3.898, + "step": 33785 + }, + { + "epoch": 2.295828237532273, + "grad_norm": 0.19806788861751556, + "learning_rate": 7.131658513384972e-05, + "loss": 3.954, + "step": 33790 + }, + { + "epoch": 2.2961679576029352, + "grad_norm": 0.24994045495986938, + "learning_rate": 7.131233863296643e-05, + "loss": 3.9505, + "step": 33795 + }, + { + "epoch": 2.296507677673597, + "grad_norm": 0.13836371898651123, + "learning_rate": 7.130809213208317e-05, + "loss": 4.0293, + "step": 33800 + }, + { + "epoch": 2.2968473977442585, + "grad_norm": 0.22501905262470245, + "learning_rate": 7.130384563119989e-05, + "loss": 3.9376, + "step": 33805 + }, + { + "epoch": 2.2971871178149206, + "grad_norm": 0.20930305123329163, + "learning_rate": 7.129959913031662e-05, + "loss": 3.8521, + "step": 33810 + }, + { + "epoch": 2.297526837885582, + "grad_norm": 0.15476341545581818, + "learning_rate": 7.129535262943336e-05, + "loss": 3.9562, + "step": 33815 + }, + { + "epoch": 2.297866557956244, + "grad_norm": 0.17671902477741241, + "learning_rate": 7.129110612855007e-05, + "loss": 3.828, + "step": 33820 + }, + { + "epoch": 2.298206278026906, + "grad_norm": 0.16252413392066956, + "learning_rate": 7.12868596276668e-05, + "loss": 3.7357, + "step": 33825 + }, + { + "epoch": 2.2985459980975675, + "grad_norm": 0.17199711501598358, + "learning_rate": 7.128261312678354e-05, + "loss": 3.8995, + "step": 33830 + }, + { + "epoch": 2.298885718168229, + "grad_norm": 0.16923239827156067, + "learning_rate": 7.127836662590026e-05, + "loss": 3.7091, + "step": 33835 + }, + { + "epoch": 2.2992254382388913, + "grad_norm": 0.1728765070438385, + "learning_rate": 7.127412012501698e-05, + "loss": 3.7502, + "step": 33840 + }, + { + "epoch": 2.299565158309553, + "grad_norm": 0.15826866030693054, + "learning_rate": 7.126987362413373e-05, + "loss": 4.1707, + "step": 33845 + }, + { + "epoch": 2.2999048783802145, + "grad_norm": 0.15378648042678833, + "learning_rate": 7.126562712325044e-05, + "loss": 3.902, + "step": 33850 + }, + { + "epoch": 2.3002445984508766, + "grad_norm": 0.1499037891626358, + "learning_rate": 7.126138062236717e-05, + "loss": 3.8846, + "step": 33855 + }, + { + "epoch": 2.300584318521538, + "grad_norm": 0.1642501801252365, + "learning_rate": 7.125713412148391e-05, + "loss": 3.645, + "step": 33860 + }, + { + "epoch": 2.3009240385922, + "grad_norm": 0.7283478379249573, + "learning_rate": 7.125288762060062e-05, + "loss": 3.9968, + "step": 33865 + }, + { + "epoch": 2.301263758662862, + "grad_norm": 0.15556499361991882, + "learning_rate": 7.124864111971735e-05, + "loss": 3.9124, + "step": 33870 + }, + { + "epoch": 2.3016034787335236, + "grad_norm": 0.1592804491519928, + "learning_rate": 7.124439461883408e-05, + "loss": 3.5962, + "step": 33875 + }, + { + "epoch": 2.301943198804185, + "grad_norm": 0.18209852278232574, + "learning_rate": 7.124014811795081e-05, + "loss": 3.8237, + "step": 33880 + }, + { + "epoch": 2.3022829188748473, + "grad_norm": 0.18341630697250366, + "learning_rate": 7.123590161706754e-05, + "loss": 3.8359, + "step": 33885 + }, + { + "epoch": 2.302622638945509, + "grad_norm": 0.14941655099391937, + "learning_rate": 7.123165511618426e-05, + "loss": 3.9127, + "step": 33890 + }, + { + "epoch": 2.3029623590161705, + "grad_norm": 0.16373921930789948, + "learning_rate": 7.122740861530099e-05, + "loss": 3.9308, + "step": 33895 + }, + { + "epoch": 2.3033020790868326, + "grad_norm": 0.29273003339767456, + "learning_rate": 7.122316211441772e-05, + "loss": 3.7052, + "step": 33900 + }, + { + "epoch": 2.3036417991574942, + "grad_norm": 0.16547784209251404, + "learning_rate": 7.121891561353445e-05, + "loss": 3.9185, + "step": 33905 + }, + { + "epoch": 2.303981519228156, + "grad_norm": 0.17886173725128174, + "learning_rate": 7.121466911265118e-05, + "loss": 3.8626, + "step": 33910 + }, + { + "epoch": 2.304321239298818, + "grad_norm": 0.18447156250476837, + "learning_rate": 7.12104226117679e-05, + "loss": 4.0692, + "step": 33915 + }, + { + "epoch": 2.3046609593694796, + "grad_norm": 0.16887839138507843, + "learning_rate": 7.120617611088463e-05, + "loss": 3.9068, + "step": 33920 + }, + { + "epoch": 2.305000679440141, + "grad_norm": 0.16491159796714783, + "learning_rate": 7.120192961000136e-05, + "loss": 3.9247, + "step": 33925 + }, + { + "epoch": 2.3053403995108033, + "grad_norm": 0.4073471426963806, + "learning_rate": 7.119768310911809e-05, + "loss": 3.7265, + "step": 33930 + }, + { + "epoch": 2.305680119581465, + "grad_norm": 0.16800765693187714, + "learning_rate": 7.119343660823482e-05, + "loss": 3.8808, + "step": 33935 + }, + { + "epoch": 2.3060198396521265, + "grad_norm": 0.22007273137569427, + "learning_rate": 7.118919010735154e-05, + "loss": 3.834, + "step": 33940 + }, + { + "epoch": 2.3063595597227886, + "grad_norm": 0.16712543368339539, + "learning_rate": 7.118494360646827e-05, + "loss": 3.9804, + "step": 33945 + }, + { + "epoch": 2.3066992797934502, + "grad_norm": 0.15407276153564453, + "learning_rate": 7.1180697105585e-05, + "loss": 3.766, + "step": 33950 + }, + { + "epoch": 2.307038999864112, + "grad_norm": 0.9897381663322449, + "learning_rate": 7.117645060470173e-05, + "loss": 3.8065, + "step": 33955 + }, + { + "epoch": 2.307378719934774, + "grad_norm": 0.16995452344417572, + "learning_rate": 7.117220410381846e-05, + "loss": 3.9571, + "step": 33960 + }, + { + "epoch": 2.3077184400054356, + "grad_norm": 0.2272217720746994, + "learning_rate": 7.116795760293518e-05, + "loss": 3.8498, + "step": 33965 + }, + { + "epoch": 2.308058160076097, + "grad_norm": 0.22835375368595123, + "learning_rate": 7.116371110205191e-05, + "loss": 3.6094, + "step": 33970 + }, + { + "epoch": 2.3083978801467593, + "grad_norm": 0.15004165470600128, + "learning_rate": 7.115946460116864e-05, + "loss": 3.9976, + "step": 33975 + }, + { + "epoch": 2.308737600217421, + "grad_norm": 0.19977013766765594, + "learning_rate": 7.115521810028537e-05, + "loss": 3.708, + "step": 33980 + }, + { + "epoch": 2.3090773202880825, + "grad_norm": 0.3631936311721802, + "learning_rate": 7.11509715994021e-05, + "loss": 3.954, + "step": 33985 + }, + { + "epoch": 2.3094170403587446, + "grad_norm": 0.22640596330165863, + "learning_rate": 7.114672509851883e-05, + "loss": 3.7949, + "step": 33990 + }, + { + "epoch": 2.3097567604294063, + "grad_norm": 0.4275730848312378, + "learning_rate": 7.114247859763555e-05, + "loss": 4.0459, + "step": 33995 + }, + { + "epoch": 2.310096480500068, + "grad_norm": 0.3507051467895508, + "learning_rate": 7.113823209675228e-05, + "loss": 3.9926, + "step": 34000 + }, + { + "epoch": 2.31043620057073, + "grad_norm": 0.17367447912693024, + "learning_rate": 7.113398559586901e-05, + "loss": 3.8482, + "step": 34005 + }, + { + "epoch": 2.3107759206413916, + "grad_norm": 1.1871066093444824, + "learning_rate": 7.112973909498574e-05, + "loss": 4.0183, + "step": 34010 + }, + { + "epoch": 2.311115640712053, + "grad_norm": 0.22978776693344116, + "learning_rate": 7.112549259410247e-05, + "loss": 3.9002, + "step": 34015 + }, + { + "epoch": 2.311455360782715, + "grad_norm": 0.15607568621635437, + "learning_rate": 7.112124609321919e-05, + "loss": 3.8926, + "step": 34020 + }, + { + "epoch": 2.311795080853377, + "grad_norm": 0.18066610395908356, + "learning_rate": 7.111699959233592e-05, + "loss": 3.9068, + "step": 34025 + }, + { + "epoch": 2.3121348009240386, + "grad_norm": 0.17973747849464417, + "learning_rate": 7.111275309145265e-05, + "loss": 3.6078, + "step": 34030 + }, + { + "epoch": 2.3124745209947, + "grad_norm": 0.17589737474918365, + "learning_rate": 7.110850659056938e-05, + "loss": 3.9229, + "step": 34035 + }, + { + "epoch": 2.3128142410653623, + "grad_norm": 0.1861152946949005, + "learning_rate": 7.11042600896861e-05, + "loss": 4.0178, + "step": 34040 + }, + { + "epoch": 2.313153961136024, + "grad_norm": 0.16954651474952698, + "learning_rate": 7.110001358880283e-05, + "loss": 3.974, + "step": 34045 + }, + { + "epoch": 2.3134936812066855, + "grad_norm": 0.23011283576488495, + "learning_rate": 7.109576708791956e-05, + "loss": 3.8897, + "step": 34050 + }, + { + "epoch": 2.3138334012773476, + "grad_norm": 1.177659273147583, + "learning_rate": 7.109152058703629e-05, + "loss": 3.6446, + "step": 34055 + }, + { + "epoch": 2.3141731213480092, + "grad_norm": 0.8952947854995728, + "learning_rate": 7.108727408615302e-05, + "loss": 3.5732, + "step": 34060 + }, + { + "epoch": 2.314512841418671, + "grad_norm": 0.23246270418167114, + "learning_rate": 7.108302758526975e-05, + "loss": 3.8877, + "step": 34065 + }, + { + "epoch": 2.314852561489333, + "grad_norm": 0.22671149671077728, + "learning_rate": 7.107878108438647e-05, + "loss": 3.8344, + "step": 34070 + }, + { + "epoch": 2.3151922815599946, + "grad_norm": 0.17816615104675293, + "learning_rate": 7.107453458350319e-05, + "loss": 3.7771, + "step": 34075 + }, + { + "epoch": 2.315532001630656, + "grad_norm": 0.49890071153640747, + "learning_rate": 7.107028808261993e-05, + "loss": 4.0961, + "step": 34080 + }, + { + "epoch": 2.3158717217013183, + "grad_norm": 0.17725642025470734, + "learning_rate": 7.106604158173666e-05, + "loss": 4.1345, + "step": 34085 + }, + { + "epoch": 2.31621144177198, + "grad_norm": 0.14607340097427368, + "learning_rate": 7.106179508085337e-05, + "loss": 3.9563, + "step": 34090 + }, + { + "epoch": 2.3165511618426415, + "grad_norm": 0.15269334614276886, + "learning_rate": 7.105754857997011e-05, + "loss": 4.0438, + "step": 34095 + }, + { + "epoch": 2.3168908819133036, + "grad_norm": 0.15415242314338684, + "learning_rate": 7.105330207908684e-05, + "loss": 4.0407, + "step": 34100 + }, + { + "epoch": 2.3172306019839652, + "grad_norm": 0.17387503385543823, + "learning_rate": 7.104905557820356e-05, + "loss": 3.6794, + "step": 34105 + }, + { + "epoch": 2.317570322054627, + "grad_norm": 0.15863646566867828, + "learning_rate": 7.10448090773203e-05, + "loss": 4.0824, + "step": 34110 + }, + { + "epoch": 2.317910042125289, + "grad_norm": 0.2011251449584961, + "learning_rate": 7.104056257643703e-05, + "loss": 3.9401, + "step": 34115 + }, + { + "epoch": 2.3182497621959506, + "grad_norm": 0.20264077186584473, + "learning_rate": 7.103631607555374e-05, + "loss": 3.9223, + "step": 34120 + }, + { + "epoch": 2.318589482266612, + "grad_norm": 0.17002882063388824, + "learning_rate": 7.103206957467048e-05, + "loss": 3.9178, + "step": 34125 + }, + { + "epoch": 2.318929202337274, + "grad_norm": 0.19088712334632874, + "learning_rate": 7.102782307378721e-05, + "loss": 3.8872, + "step": 34130 + }, + { + "epoch": 2.319268922407936, + "grad_norm": 0.24126885831356049, + "learning_rate": 7.102357657290392e-05, + "loss": 4.0309, + "step": 34135 + }, + { + "epoch": 2.3196086424785975, + "grad_norm": 0.16632422804832458, + "learning_rate": 7.101933007202067e-05, + "loss": 4.0231, + "step": 34140 + }, + { + "epoch": 2.319948362549259, + "grad_norm": 0.18969963490962982, + "learning_rate": 7.101508357113738e-05, + "loss": 3.701, + "step": 34145 + }, + { + "epoch": 2.3202880826199213, + "grad_norm": 0.17067667841911316, + "learning_rate": 7.101083707025411e-05, + "loss": 3.923, + "step": 34150 + }, + { + "epoch": 2.320627802690583, + "grad_norm": 0.18874643743038177, + "learning_rate": 7.100659056937085e-05, + "loss": 3.8639, + "step": 34155 + }, + { + "epoch": 2.3209675227612445, + "grad_norm": 0.25210362672805786, + "learning_rate": 7.100234406848756e-05, + "loss": 3.7317, + "step": 34160 + }, + { + "epoch": 2.3213072428319066, + "grad_norm": 0.16832157969474792, + "learning_rate": 7.099809756760429e-05, + "loss": 3.7379, + "step": 34165 + }, + { + "epoch": 2.3216469629025682, + "grad_norm": 0.1497519165277481, + "learning_rate": 7.099385106672103e-05, + "loss": 3.8587, + "step": 34170 + }, + { + "epoch": 2.32198668297323, + "grad_norm": 0.16294331848621368, + "learning_rate": 7.098960456583775e-05, + "loss": 3.6167, + "step": 34175 + }, + { + "epoch": 2.322326403043892, + "grad_norm": 0.16162621974945068, + "learning_rate": 7.098535806495448e-05, + "loss": 3.7356, + "step": 34180 + }, + { + "epoch": 2.3226661231145536, + "grad_norm": 2.9324333667755127, + "learning_rate": 7.098111156407122e-05, + "loss": 3.7842, + "step": 34185 + }, + { + "epoch": 2.323005843185215, + "grad_norm": 0.20083405077457428, + "learning_rate": 7.097686506318793e-05, + "loss": 3.8465, + "step": 34190 + }, + { + "epoch": 2.3233455632558773, + "grad_norm": 0.15570175647735596, + "learning_rate": 7.097261856230466e-05, + "loss": 4.0049, + "step": 34195 + }, + { + "epoch": 2.323685283326539, + "grad_norm": 0.17300452291965485, + "learning_rate": 7.09683720614214e-05, + "loss": 3.846, + "step": 34200 + }, + { + "epoch": 2.3240250033972005, + "grad_norm": 0.2785402238368988, + "learning_rate": 7.096412556053812e-05, + "loss": 3.8151, + "step": 34205 + }, + { + "epoch": 2.3243647234678626, + "grad_norm": 0.14827406406402588, + "learning_rate": 7.095987905965484e-05, + "loss": 4.1671, + "step": 34210 + }, + { + "epoch": 2.3247044435385242, + "grad_norm": 0.14036738872528076, + "learning_rate": 7.095563255877159e-05, + "loss": 3.7804, + "step": 34215 + }, + { + "epoch": 2.325044163609186, + "grad_norm": 0.3090738356113434, + "learning_rate": 7.09513860578883e-05, + "loss": 4.0631, + "step": 34220 + }, + { + "epoch": 2.325383883679848, + "grad_norm": 0.22116605937480927, + "learning_rate": 7.094713955700503e-05, + "loss": 3.7587, + "step": 34225 + }, + { + "epoch": 2.3257236037505096, + "grad_norm": 0.18208985030651093, + "learning_rate": 7.094289305612176e-05, + "loss": 4.1032, + "step": 34230 + }, + { + "epoch": 2.326063323821171, + "grad_norm": 0.15003274381160736, + "learning_rate": 7.093864655523848e-05, + "loss": 3.8463, + "step": 34235 + }, + { + "epoch": 2.3264030438918333, + "grad_norm": 0.1640012264251709, + "learning_rate": 7.093440005435521e-05, + "loss": 3.9323, + "step": 34240 + }, + { + "epoch": 2.326742763962495, + "grad_norm": 0.16545815765857697, + "learning_rate": 7.093015355347194e-05, + "loss": 3.94, + "step": 34245 + }, + { + "epoch": 2.3270824840331565, + "grad_norm": 0.21223360300064087, + "learning_rate": 7.092590705258867e-05, + "loss": 3.8517, + "step": 34250 + }, + { + "epoch": 2.3274222041038186, + "grad_norm": 0.18628254532814026, + "learning_rate": 7.09216605517054e-05, + "loss": 3.9404, + "step": 34255 + }, + { + "epoch": 2.3277619241744802, + "grad_norm": 0.18660961091518402, + "learning_rate": 7.091741405082212e-05, + "loss": 3.959, + "step": 34260 + }, + { + "epoch": 2.328101644245142, + "grad_norm": 0.19228799641132355, + "learning_rate": 7.091316754993885e-05, + "loss": 3.6649, + "step": 34265 + }, + { + "epoch": 2.328441364315804, + "grad_norm": 0.18053950369358063, + "learning_rate": 7.090892104905558e-05, + "loss": 4.0449, + "step": 34270 + }, + { + "epoch": 2.3287810843864656, + "grad_norm": 0.20605526864528656, + "learning_rate": 7.090467454817231e-05, + "loss": 3.7803, + "step": 34275 + }, + { + "epoch": 2.329120804457127, + "grad_norm": 0.13962699472904205, + "learning_rate": 7.090042804728904e-05, + "loss": 4.1362, + "step": 34280 + }, + { + "epoch": 2.3294605245277893, + "grad_norm": 0.1643078476190567, + "learning_rate": 7.089618154640576e-05, + "loss": 3.8085, + "step": 34285 + }, + { + "epoch": 2.329800244598451, + "grad_norm": 0.8849371075630188, + "learning_rate": 7.089193504552249e-05, + "loss": 3.8723, + "step": 34290 + }, + { + "epoch": 2.3301399646691126, + "grad_norm": 0.14667470753192902, + "learning_rate": 7.088768854463922e-05, + "loss": 3.9541, + "step": 34295 + }, + { + "epoch": 2.3304796847397746, + "grad_norm": 0.15413177013397217, + "learning_rate": 7.088344204375595e-05, + "loss": 3.7522, + "step": 34300 + }, + { + "epoch": 2.3308194048104363, + "grad_norm": 0.1949261724948883, + "learning_rate": 7.087919554287268e-05, + "loss": 4.0373, + "step": 34305 + }, + { + "epoch": 2.331159124881098, + "grad_norm": 0.15861326456069946, + "learning_rate": 7.08749490419894e-05, + "loss": 3.9298, + "step": 34310 + }, + { + "epoch": 2.33149884495176, + "grad_norm": 0.18697470426559448, + "learning_rate": 7.087070254110613e-05, + "loss": 3.9055, + "step": 34315 + }, + { + "epoch": 2.3318385650224216, + "grad_norm": 0.2925753891468048, + "learning_rate": 7.086645604022286e-05, + "loss": 3.7087, + "step": 34320 + }, + { + "epoch": 2.3321782850930832, + "grad_norm": 0.18173125386238098, + "learning_rate": 7.086220953933959e-05, + "loss": 3.7581, + "step": 34325 + }, + { + "epoch": 2.3325180051637453, + "grad_norm": 0.18532507121562958, + "learning_rate": 7.085796303845632e-05, + "loss": 4.0735, + "step": 34330 + }, + { + "epoch": 2.332857725234407, + "grad_norm": 0.16436217725276947, + "learning_rate": 7.085371653757304e-05, + "loss": 3.7087, + "step": 34335 + }, + { + "epoch": 2.3331974453050686, + "grad_norm": 0.21637341380119324, + "learning_rate": 7.084947003668977e-05, + "loss": 3.8418, + "step": 34340 + }, + { + "epoch": 2.3335371653757306, + "grad_norm": 0.22207073867321014, + "learning_rate": 7.08452235358065e-05, + "loss": 3.8672, + "step": 34345 + }, + { + "epoch": 2.3338768854463923, + "grad_norm": 0.18552108108997345, + "learning_rate": 7.084097703492323e-05, + "loss": 3.9976, + "step": 34350 + }, + { + "epoch": 2.334216605517054, + "grad_norm": 0.2246922254562378, + "learning_rate": 7.083673053403996e-05, + "loss": 3.8711, + "step": 34355 + }, + { + "epoch": 2.3345563255877155, + "grad_norm": 0.13715097308158875, + "learning_rate": 7.083248403315668e-05, + "loss": 4.0619, + "step": 34360 + }, + { + "epoch": 2.3348960456583776, + "grad_norm": 0.2089153230190277, + "learning_rate": 7.082823753227341e-05, + "loss": 3.9831, + "step": 34365 + }, + { + "epoch": 2.3352357657290392, + "grad_norm": 0.1666397601366043, + "learning_rate": 7.082399103139014e-05, + "loss": 3.978, + "step": 34370 + }, + { + "epoch": 2.335575485799701, + "grad_norm": 0.5932078957557678, + "learning_rate": 7.081974453050687e-05, + "loss": 3.9046, + "step": 34375 + }, + { + "epoch": 2.335915205870363, + "grad_norm": 0.1672128438949585, + "learning_rate": 7.08154980296236e-05, + "loss": 3.9349, + "step": 34380 + }, + { + "epoch": 2.3362549259410246, + "grad_norm": 0.2197096049785614, + "learning_rate": 7.081125152874032e-05, + "loss": 3.9566, + "step": 34385 + }, + { + "epoch": 2.336594646011686, + "grad_norm": 0.22982755303382874, + "learning_rate": 7.080700502785705e-05, + "loss": 3.8922, + "step": 34390 + }, + { + "epoch": 2.3369343660823483, + "grad_norm": 0.19160060584545135, + "learning_rate": 7.080275852697378e-05, + "loss": 4.0046, + "step": 34395 + }, + { + "epoch": 2.33727408615301, + "grad_norm": 0.2064497023820877, + "learning_rate": 7.079851202609051e-05, + "loss": 3.9985, + "step": 34400 + }, + { + "epoch": 2.3376138062236715, + "grad_norm": 0.1582537591457367, + "learning_rate": 7.079426552520724e-05, + "loss": 3.8924, + "step": 34405 + }, + { + "epoch": 2.3379535262943336, + "grad_norm": 0.1499197781085968, + "learning_rate": 7.079001902432396e-05, + "loss": 3.9728, + "step": 34410 + }, + { + "epoch": 2.3382932463649952, + "grad_norm": 0.37996822595596313, + "learning_rate": 7.078577252344069e-05, + "loss": 3.9121, + "step": 34415 + }, + { + "epoch": 2.338632966435657, + "grad_norm": 0.17819368839263916, + "learning_rate": 7.078152602255742e-05, + "loss": 3.764, + "step": 34420 + }, + { + "epoch": 2.338972686506319, + "grad_norm": 0.15893596410751343, + "learning_rate": 7.077727952167415e-05, + "loss": 3.5964, + "step": 34425 + }, + { + "epoch": 2.3393124065769806, + "grad_norm": 0.39164525270462036, + "learning_rate": 7.077303302079086e-05, + "loss": 3.9734, + "step": 34430 + }, + { + "epoch": 2.339652126647642, + "grad_norm": 0.14418861269950867, + "learning_rate": 7.07687865199076e-05, + "loss": 3.9293, + "step": 34435 + }, + { + "epoch": 2.3399918467183043, + "grad_norm": 0.19149868190288544, + "learning_rate": 7.076454001902433e-05, + "loss": 3.7981, + "step": 34440 + }, + { + "epoch": 2.340331566788966, + "grad_norm": 0.17384204268455505, + "learning_rate": 7.076029351814105e-05, + "loss": 3.8336, + "step": 34445 + }, + { + "epoch": 2.3406712868596276, + "grad_norm": 0.1818857491016388, + "learning_rate": 7.075604701725779e-05, + "loss": 3.9437, + "step": 34450 + }, + { + "epoch": 2.3410110069302896, + "grad_norm": 0.16701366007328033, + "learning_rate": 7.075180051637452e-05, + "loss": 4.0207, + "step": 34455 + }, + { + "epoch": 2.3413507270009513, + "grad_norm": 0.23688434064388275, + "learning_rate": 7.074755401549123e-05, + "loss": 3.9684, + "step": 34460 + }, + { + "epoch": 2.341690447071613, + "grad_norm": 0.18319766223430634, + "learning_rate": 7.074330751460797e-05, + "loss": 3.844, + "step": 34465 + }, + { + "epoch": 2.3420301671422745, + "grad_norm": 0.14829102158546448, + "learning_rate": 7.07390610137247e-05, + "loss": 3.796, + "step": 34470 + }, + { + "epoch": 2.3423698872129366, + "grad_norm": 0.535907506942749, + "learning_rate": 7.073481451284141e-05, + "loss": 3.7111, + "step": 34475 + }, + { + "epoch": 2.3427096072835982, + "grad_norm": 0.18503440916538239, + "learning_rate": 7.073056801195816e-05, + "loss": 3.9862, + "step": 34480 + }, + { + "epoch": 2.34304932735426, + "grad_norm": 0.20278683304786682, + "learning_rate": 7.072632151107488e-05, + "loss": 3.806, + "step": 34485 + }, + { + "epoch": 2.343389047424922, + "grad_norm": 0.17911866307258606, + "learning_rate": 7.07220750101916e-05, + "loss": 4.0215, + "step": 34490 + }, + { + "epoch": 2.3437287674955836, + "grad_norm": 0.17367051541805267, + "learning_rate": 7.071782850930834e-05, + "loss": 4.1483, + "step": 34495 + }, + { + "epoch": 2.344068487566245, + "grad_norm": 0.15876546502113342, + "learning_rate": 7.071358200842505e-05, + "loss": 3.6324, + "step": 34500 + }, + { + "epoch": 2.3444082076369073, + "grad_norm": 0.2105153650045395, + "learning_rate": 7.070933550754178e-05, + "loss": 3.9519, + "step": 34505 + }, + { + "epoch": 2.344747927707569, + "grad_norm": 0.2158297300338745, + "learning_rate": 7.070508900665852e-05, + "loss": 3.9724, + "step": 34510 + }, + { + "epoch": 2.3450876477782305, + "grad_norm": 0.18328741192817688, + "learning_rate": 7.070084250577524e-05, + "loss": 3.8017, + "step": 34515 + }, + { + "epoch": 2.3454273678488926, + "grad_norm": 0.1934075504541397, + "learning_rate": 7.069659600489197e-05, + "loss": 3.7245, + "step": 34520 + }, + { + "epoch": 2.3457670879195542, + "grad_norm": 0.15489766001701355, + "learning_rate": 7.069234950400871e-05, + "loss": 4.028, + "step": 34525 + }, + { + "epoch": 2.346106807990216, + "grad_norm": 0.16190364956855774, + "learning_rate": 7.068810300312542e-05, + "loss": 3.971, + "step": 34530 + }, + { + "epoch": 2.346446528060878, + "grad_norm": 0.2130613625049591, + "learning_rate": 7.068385650224215e-05, + "loss": 4.1162, + "step": 34535 + }, + { + "epoch": 2.3467862481315396, + "grad_norm": 0.1791907697916031, + "learning_rate": 7.067961000135889e-05, + "loss": 3.634, + "step": 34540 + }, + { + "epoch": 2.347125968202201, + "grad_norm": 0.20863191783428192, + "learning_rate": 7.06753635004756e-05, + "loss": 3.8724, + "step": 34545 + }, + { + "epoch": 2.3474656882728633, + "grad_norm": 0.1662396490573883, + "learning_rate": 7.067111699959233e-05, + "loss": 3.9512, + "step": 34550 + }, + { + "epoch": 2.347805408343525, + "grad_norm": 0.1989622563123703, + "learning_rate": 7.066687049870908e-05, + "loss": 4.0245, + "step": 34555 + }, + { + "epoch": 2.3481451284141865, + "grad_norm": 0.20041723549365997, + "learning_rate": 7.066262399782579e-05, + "loss": 3.9614, + "step": 34560 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.1988714188337326, + "learning_rate": 7.065837749694252e-05, + "loss": 4.0063, + "step": 34565 + }, + { + "epoch": 2.3488245685555103, + "grad_norm": 0.16369350254535675, + "learning_rate": 7.065413099605925e-05, + "loss": 4.0373, + "step": 34570 + }, + { + "epoch": 2.349164288626172, + "grad_norm": 0.27216342091560364, + "learning_rate": 7.064988449517597e-05, + "loss": 3.8631, + "step": 34575 + }, + { + "epoch": 2.349504008696834, + "grad_norm": 0.20823942124843597, + "learning_rate": 7.06456379942927e-05, + "loss": 3.9232, + "step": 34580 + }, + { + "epoch": 2.3498437287674956, + "grad_norm": 0.4466433525085449, + "learning_rate": 7.064139149340943e-05, + "loss": 3.832, + "step": 34585 + }, + { + "epoch": 2.350183448838157, + "grad_norm": 0.13917317986488342, + "learning_rate": 7.063714499252616e-05, + "loss": 3.9598, + "step": 34590 + }, + { + "epoch": 2.3505231689088193, + "grad_norm": 1.1654977798461914, + "learning_rate": 7.063289849164289e-05, + "loss": 3.7045, + "step": 34595 + }, + { + "epoch": 2.350862888979481, + "grad_norm": 0.23542742431163788, + "learning_rate": 7.062865199075961e-05, + "loss": 3.8988, + "step": 34600 + }, + { + "epoch": 2.3512026090501426, + "grad_norm": 0.25812768936157227, + "learning_rate": 7.062440548987634e-05, + "loss": 3.8449, + "step": 34605 + }, + { + "epoch": 2.3515423291208046, + "grad_norm": 0.18766945600509644, + "learning_rate": 7.062015898899307e-05, + "loss": 4.034, + "step": 34610 + }, + { + "epoch": 2.3518820491914663, + "grad_norm": 0.285710871219635, + "learning_rate": 7.06159124881098e-05, + "loss": 3.8145, + "step": 34615 + }, + { + "epoch": 2.352221769262128, + "grad_norm": 0.1458282470703125, + "learning_rate": 7.061166598722653e-05, + "loss": 3.7645, + "step": 34620 + }, + { + "epoch": 2.35256148933279, + "grad_norm": 0.6241632103919983, + "learning_rate": 7.060741948634325e-05, + "loss": 4.0115, + "step": 34625 + }, + { + "epoch": 2.3529012094034516, + "grad_norm": 0.1675604283809662, + "learning_rate": 7.060317298545998e-05, + "loss": 3.8916, + "step": 34630 + }, + { + "epoch": 2.3532409294741132, + "grad_norm": 0.3273068368434906, + "learning_rate": 7.059892648457671e-05, + "loss": 4.1005, + "step": 34635 + }, + { + "epoch": 2.3535806495447753, + "grad_norm": 0.18352384865283966, + "learning_rate": 7.059467998369344e-05, + "loss": 3.7188, + "step": 34640 + }, + { + "epoch": 2.353920369615437, + "grad_norm": 0.1502200812101364, + "learning_rate": 7.059043348281017e-05, + "loss": 3.8257, + "step": 34645 + }, + { + "epoch": 2.3542600896860986, + "grad_norm": 0.17979788780212402, + "learning_rate": 7.05861869819269e-05, + "loss": 3.7918, + "step": 34650 + }, + { + "epoch": 2.3545998097567606, + "grad_norm": 0.17876629531383514, + "learning_rate": 7.058194048104362e-05, + "loss": 4.0048, + "step": 34655 + }, + { + "epoch": 2.3549395298274223, + "grad_norm": 0.1901116669178009, + "learning_rate": 7.057769398016035e-05, + "loss": 3.871, + "step": 34660 + }, + { + "epoch": 2.355279249898084, + "grad_norm": 0.8371573090553284, + "learning_rate": 7.057344747927708e-05, + "loss": 3.9041, + "step": 34665 + }, + { + "epoch": 2.355618969968746, + "grad_norm": 0.15629322826862335, + "learning_rate": 7.056920097839381e-05, + "loss": 3.9364, + "step": 34670 + }, + { + "epoch": 2.3559586900394076, + "grad_norm": 0.5274776816368103, + "learning_rate": 7.056495447751053e-05, + "loss": 3.9226, + "step": 34675 + }, + { + "epoch": 2.3562984101100692, + "grad_norm": 0.21830227971076965, + "learning_rate": 7.056070797662726e-05, + "loss": 3.8433, + "step": 34680 + }, + { + "epoch": 2.3566381301807313, + "grad_norm": 0.1475214958190918, + "learning_rate": 7.055646147574399e-05, + "loss": 3.899, + "step": 34685 + }, + { + "epoch": 2.356977850251393, + "grad_norm": 0.20169053971767426, + "learning_rate": 7.055221497486072e-05, + "loss": 3.9218, + "step": 34690 + }, + { + "epoch": 2.3573175703220546, + "grad_norm": 0.22861157357692719, + "learning_rate": 7.054796847397745e-05, + "loss": 3.8641, + "step": 34695 + }, + { + "epoch": 2.357657290392716, + "grad_norm": 0.19108867645263672, + "learning_rate": 7.054372197309417e-05, + "loss": 4.1334, + "step": 34700 + }, + { + "epoch": 2.3579970104633783, + "grad_norm": 0.19039931893348694, + "learning_rate": 7.05394754722109e-05, + "loss": 3.8593, + "step": 34705 + }, + { + "epoch": 2.35833673053404, + "grad_norm": 0.20883595943450928, + "learning_rate": 7.053522897132763e-05, + "loss": 3.6304, + "step": 34710 + }, + { + "epoch": 2.3586764506047015, + "grad_norm": 0.22621451318264008, + "learning_rate": 7.053098247044436e-05, + "loss": 3.7069, + "step": 34715 + }, + { + "epoch": 2.3590161706753636, + "grad_norm": 0.22170856595039368, + "learning_rate": 7.052673596956109e-05, + "loss": 3.8942, + "step": 34720 + }, + { + "epoch": 2.3593558907460253, + "grad_norm": 0.18467478454113007, + "learning_rate": 7.052248946867782e-05, + "loss": 3.8805, + "step": 34725 + }, + { + "epoch": 2.359695610816687, + "grad_norm": 0.19355931878089905, + "learning_rate": 7.051824296779454e-05, + "loss": 3.8423, + "step": 34730 + }, + { + "epoch": 2.360035330887349, + "grad_norm": 0.14389380812644958, + "learning_rate": 7.051399646691127e-05, + "loss": 3.8198, + "step": 34735 + }, + { + "epoch": 2.3603750509580106, + "grad_norm": 0.2344195693731308, + "learning_rate": 7.0509749966028e-05, + "loss": 3.7513, + "step": 34740 + }, + { + "epoch": 2.360714771028672, + "grad_norm": 0.16804787516593933, + "learning_rate": 7.050550346514473e-05, + "loss": 3.9161, + "step": 34745 + }, + { + "epoch": 2.3610544910993343, + "grad_norm": 0.9522013664245605, + "learning_rate": 7.050125696426146e-05, + "loss": 3.8492, + "step": 34750 + }, + { + "epoch": 2.361394211169996, + "grad_norm": 0.19575490057468414, + "learning_rate": 7.049701046337818e-05, + "loss": 3.8785, + "step": 34755 + }, + { + "epoch": 2.3617339312406576, + "grad_norm": 0.24174806475639343, + "learning_rate": 7.049276396249491e-05, + "loss": 3.897, + "step": 34760 + }, + { + "epoch": 2.3620736513113196, + "grad_norm": 0.18435640633106232, + "learning_rate": 7.048851746161164e-05, + "loss": 3.9338, + "step": 34765 + }, + { + "epoch": 2.3624133713819813, + "grad_norm": 0.24936258792877197, + "learning_rate": 7.048427096072835e-05, + "loss": 3.8286, + "step": 34770 + }, + { + "epoch": 2.362753091452643, + "grad_norm": 0.20814988017082214, + "learning_rate": 7.04800244598451e-05, + "loss": 3.8348, + "step": 34775 + }, + { + "epoch": 2.363092811523305, + "grad_norm": 0.20506484806537628, + "learning_rate": 7.047577795896182e-05, + "loss": 3.8475, + "step": 34780 + }, + { + "epoch": 2.3634325315939666, + "grad_norm": 0.35939764976501465, + "learning_rate": 7.047153145807854e-05, + "loss": 3.9306, + "step": 34785 + }, + { + "epoch": 2.3637722516646282, + "grad_norm": 0.22172527015209198, + "learning_rate": 7.046728495719528e-05, + "loss": 4.0405, + "step": 34790 + }, + { + "epoch": 2.3641119717352903, + "grad_norm": 0.13932347297668457, + "learning_rate": 7.046303845631201e-05, + "loss": 3.8692, + "step": 34795 + }, + { + "epoch": 2.364451691805952, + "grad_norm": 0.24414552748203278, + "learning_rate": 7.045879195542872e-05, + "loss": 3.9057, + "step": 34800 + }, + { + "epoch": 2.3647914118766136, + "grad_norm": 0.2334882915019989, + "learning_rate": 7.045454545454546e-05, + "loss": 3.9048, + "step": 34805 + }, + { + "epoch": 2.365131131947275, + "grad_norm": 0.16039611399173737, + "learning_rate": 7.045029895366219e-05, + "loss": 3.8446, + "step": 34810 + }, + { + "epoch": 2.3654708520179373, + "grad_norm": 0.16360166668891907, + "learning_rate": 7.04460524527789e-05, + "loss": 3.7311, + "step": 34815 + }, + { + "epoch": 2.365810572088599, + "grad_norm": 0.15799468755722046, + "learning_rate": 7.044180595189565e-05, + "loss": 3.7723, + "step": 34820 + }, + { + "epoch": 2.3661502921592605, + "grad_norm": 0.1764535903930664, + "learning_rate": 7.043755945101238e-05, + "loss": 3.4509, + "step": 34825 + }, + { + "epoch": 2.3664900122299226, + "grad_norm": 0.16846711933612823, + "learning_rate": 7.043331295012909e-05, + "loss": 3.5802, + "step": 34830 + }, + { + "epoch": 2.3668297323005842, + "grad_norm": 0.14287205040454865, + "learning_rate": 7.042906644924583e-05, + "loss": 4.0484, + "step": 34835 + }, + { + "epoch": 2.367169452371246, + "grad_norm": 0.17756585776805878, + "learning_rate": 7.042481994836256e-05, + "loss": 4.0787, + "step": 34840 + }, + { + "epoch": 2.367509172441908, + "grad_norm": 0.1961568295955658, + "learning_rate": 7.042057344747927e-05, + "loss": 3.8918, + "step": 34845 + }, + { + "epoch": 2.3678488925125696, + "grad_norm": 0.3508131802082062, + "learning_rate": 7.041632694659602e-05, + "loss": 3.9614, + "step": 34850 + }, + { + "epoch": 2.368188612583231, + "grad_norm": 0.17458845674991608, + "learning_rate": 7.041208044571273e-05, + "loss": 4.0121, + "step": 34855 + }, + { + "epoch": 2.3685283326538933, + "grad_norm": 0.31105902791023254, + "learning_rate": 7.040783394482946e-05, + "loss": 3.8328, + "step": 34860 + }, + { + "epoch": 2.368868052724555, + "grad_norm": 0.18908126652240753, + "learning_rate": 7.04035874439462e-05, + "loss": 3.8614, + "step": 34865 + }, + { + "epoch": 2.3692077727952165, + "grad_norm": 0.1832255721092224, + "learning_rate": 7.039934094306291e-05, + "loss": 3.9791, + "step": 34870 + }, + { + "epoch": 2.3695474928658786, + "grad_norm": 0.21219098567962646, + "learning_rate": 7.039509444217964e-05, + "loss": 4.0415, + "step": 34875 + }, + { + "epoch": 2.3698872129365403, + "grad_norm": 0.1648825705051422, + "learning_rate": 7.039084794129638e-05, + "loss": 3.8205, + "step": 34880 + }, + { + "epoch": 2.370226933007202, + "grad_norm": 0.18148626387119293, + "learning_rate": 7.03866014404131e-05, + "loss": 3.9021, + "step": 34885 + }, + { + "epoch": 2.370566653077864, + "grad_norm": 0.23025447130203247, + "learning_rate": 7.038235493952983e-05, + "loss": 3.8516, + "step": 34890 + }, + { + "epoch": 2.3709063731485256, + "grad_norm": 0.23982490599155426, + "learning_rate": 7.037810843864657e-05, + "loss": 3.9326, + "step": 34895 + }, + { + "epoch": 2.3712460932191872, + "grad_norm": 0.15724866092205048, + "learning_rate": 7.037386193776328e-05, + "loss": 3.8898, + "step": 34900 + }, + { + "epoch": 2.3715858132898493, + "grad_norm": 0.30242204666137695, + "learning_rate": 7.036961543688001e-05, + "loss": 3.9117, + "step": 34905 + }, + { + "epoch": 2.371925533360511, + "grad_norm": 0.19598232209682465, + "learning_rate": 7.036536893599675e-05, + "loss": 3.7749, + "step": 34910 + }, + { + "epoch": 2.3722652534311726, + "grad_norm": 0.1497187465429306, + "learning_rate": 7.036112243511347e-05, + "loss": 3.8849, + "step": 34915 + }, + { + "epoch": 2.3726049735018346, + "grad_norm": 0.1966252624988556, + "learning_rate": 7.03568759342302e-05, + "loss": 3.9843, + "step": 34920 + }, + { + "epoch": 2.3729446935724963, + "grad_norm": 1.263934850692749, + "learning_rate": 7.035262943334692e-05, + "loss": 3.8262, + "step": 34925 + }, + { + "epoch": 2.373284413643158, + "grad_norm": 0.16025008261203766, + "learning_rate": 7.034838293246365e-05, + "loss": 3.8628, + "step": 34930 + }, + { + "epoch": 2.37362413371382, + "grad_norm": 0.24759645760059357, + "learning_rate": 7.034413643158038e-05, + "loss": 4.0295, + "step": 34935 + }, + { + "epoch": 2.3739638537844816, + "grad_norm": 0.1671355813741684, + "learning_rate": 7.03398899306971e-05, + "loss": 4.025, + "step": 34940 + }, + { + "epoch": 2.3743035738551432, + "grad_norm": 0.17542287707328796, + "learning_rate": 7.033564342981383e-05, + "loss": 3.8187, + "step": 34945 + }, + { + "epoch": 2.3746432939258053, + "grad_norm": 0.22473275661468506, + "learning_rate": 7.033139692893056e-05, + "loss": 3.9648, + "step": 34950 + }, + { + "epoch": 2.374983013996467, + "grad_norm": 0.16561877727508545, + "learning_rate": 7.032715042804729e-05, + "loss": 3.7429, + "step": 34955 + }, + { + "epoch": 2.3753227340671286, + "grad_norm": 0.15233370661735535, + "learning_rate": 7.032290392716402e-05, + "loss": 3.8869, + "step": 34960 + }, + { + "epoch": 2.3756624541377906, + "grad_norm": 0.16744542121887207, + "learning_rate": 7.031865742628075e-05, + "loss": 3.9152, + "step": 34965 + }, + { + "epoch": 2.3760021742084523, + "grad_norm": 0.18345046043395996, + "learning_rate": 7.031441092539747e-05, + "loss": 3.9541, + "step": 34970 + }, + { + "epoch": 2.376341894279114, + "grad_norm": 0.1784452348947525, + "learning_rate": 7.03101644245142e-05, + "loss": 3.8959, + "step": 34975 + }, + { + "epoch": 2.376681614349776, + "grad_norm": 0.1984395831823349, + "learning_rate": 7.030591792363093e-05, + "loss": 3.8343, + "step": 34980 + }, + { + "epoch": 2.3770213344204376, + "grad_norm": 0.2020058035850525, + "learning_rate": 7.030167142274766e-05, + "loss": 3.8856, + "step": 34985 + }, + { + "epoch": 2.3773610544910992, + "grad_norm": 0.2967301309108734, + "learning_rate": 7.029742492186439e-05, + "loss": 4.0074, + "step": 34990 + }, + { + "epoch": 2.3777007745617613, + "grad_norm": 0.20043456554412842, + "learning_rate": 7.029317842098111e-05, + "loss": 3.7353, + "step": 34995 + }, + { + "epoch": 2.378040494632423, + "grad_norm": 0.151007279753685, + "learning_rate": 7.028893192009784e-05, + "loss": 4.0275, + "step": 35000 + }, + { + "epoch": 2.3783802147030846, + "grad_norm": 0.4795514643192291, + "learning_rate": 7.028468541921457e-05, + "loss": 3.8436, + "step": 35005 + }, + { + "epoch": 2.3787199347737467, + "grad_norm": 0.17003563046455383, + "learning_rate": 7.02804389183313e-05, + "loss": 3.9183, + "step": 35010 + }, + { + "epoch": 2.3790596548444083, + "grad_norm": 0.21616415679454803, + "learning_rate": 7.027619241744803e-05, + "loss": 3.7692, + "step": 35015 + }, + { + "epoch": 2.37939937491507, + "grad_norm": 0.2076455056667328, + "learning_rate": 7.027194591656475e-05, + "loss": 3.9471, + "step": 35020 + }, + { + "epoch": 2.379739094985732, + "grad_norm": 0.22068947553634644, + "learning_rate": 7.026769941568148e-05, + "loss": 3.9975, + "step": 35025 + }, + { + "epoch": 2.3800788150563936, + "grad_norm": 0.2420058250427246, + "learning_rate": 7.026345291479821e-05, + "loss": 3.9012, + "step": 35030 + }, + { + "epoch": 2.3804185351270553, + "grad_norm": 0.16254417598247528, + "learning_rate": 7.025920641391494e-05, + "loss": 3.9779, + "step": 35035 + }, + { + "epoch": 2.380758255197717, + "grad_norm": 0.2774917185306549, + "learning_rate": 7.025495991303167e-05, + "loss": 4.0968, + "step": 35040 + }, + { + "epoch": 2.381097975268379, + "grad_norm": 1.1040141582489014, + "learning_rate": 7.02507134121484e-05, + "loss": 3.6793, + "step": 35045 + }, + { + "epoch": 2.3814376953390406, + "grad_norm": 6.010372638702393, + "learning_rate": 7.024646691126512e-05, + "loss": 3.6087, + "step": 35050 + }, + { + "epoch": 2.3817774154097022, + "grad_norm": 0.24799270927906036, + "learning_rate": 7.024222041038185e-05, + "loss": 3.8575, + "step": 35055 + }, + { + "epoch": 2.3821171354803643, + "grad_norm": 0.16899791359901428, + "learning_rate": 7.023797390949858e-05, + "loss": 3.8739, + "step": 35060 + }, + { + "epoch": 2.382456855551026, + "grad_norm": 0.16555453836917877, + "learning_rate": 7.02337274086153e-05, + "loss": 3.8766, + "step": 35065 + }, + { + "epoch": 2.3827965756216876, + "grad_norm": 0.18497055768966675, + "learning_rate": 7.022948090773203e-05, + "loss": 3.8868, + "step": 35070 + }, + { + "epoch": 2.3831362956923496, + "grad_norm": 0.21046394109725952, + "learning_rate": 7.022523440684876e-05, + "loss": 3.8311, + "step": 35075 + }, + { + "epoch": 2.3834760157630113, + "grad_norm": 0.1471758335828781, + "learning_rate": 7.022183720614214e-05, + "loss": 3.8464, + "step": 35080 + }, + { + "epoch": 2.383815735833673, + "grad_norm": 0.1914798766374588, + "learning_rate": 7.021759070525887e-05, + "loss": 3.767, + "step": 35085 + }, + { + "epoch": 2.384155455904335, + "grad_norm": 0.15741200745105743, + "learning_rate": 7.02133442043756e-05, + "loss": 3.8513, + "step": 35090 + }, + { + "epoch": 2.3844951759749966, + "grad_norm": 0.18695376813411713, + "learning_rate": 7.020909770349231e-05, + "loss": 4.1669, + "step": 35095 + }, + { + "epoch": 2.3848348960456582, + "grad_norm": 0.1580781191587448, + "learning_rate": 7.020485120260906e-05, + "loss": 4.062, + "step": 35100 + }, + { + "epoch": 2.3851746161163203, + "grad_norm": 0.15757638216018677, + "learning_rate": 7.020060470172578e-05, + "loss": 4.1593, + "step": 35105 + }, + { + "epoch": 2.385514336186982, + "grad_norm": 0.1477503776550293, + "learning_rate": 7.01963582008425e-05, + "loss": 3.9972, + "step": 35110 + }, + { + "epoch": 2.3858540562576436, + "grad_norm": 0.19915081560611725, + "learning_rate": 7.019211169995924e-05, + "loss": 3.8051, + "step": 35115 + }, + { + "epoch": 2.3861937763283056, + "grad_norm": 0.17227745056152344, + "learning_rate": 7.018786519907597e-05, + "loss": 4.1152, + "step": 35120 + }, + { + "epoch": 2.3865334963989673, + "grad_norm": 1.2069405317306519, + "learning_rate": 7.018361869819268e-05, + "loss": 3.9396, + "step": 35125 + }, + { + "epoch": 2.386873216469629, + "grad_norm": 0.24876196682453156, + "learning_rate": 7.017937219730942e-05, + "loss": 4.1814, + "step": 35130 + }, + { + "epoch": 2.387212936540291, + "grad_norm": 0.22099217772483826, + "learning_rate": 7.017512569642615e-05, + "loss": 3.8183, + "step": 35135 + }, + { + "epoch": 2.3875526566109526, + "grad_norm": 0.26032349467277527, + "learning_rate": 7.017087919554287e-05, + "loss": 3.9303, + "step": 35140 + }, + { + "epoch": 2.3878923766816142, + "grad_norm": 0.16413246095180511, + "learning_rate": 7.016663269465961e-05, + "loss": 3.9876, + "step": 35145 + }, + { + "epoch": 2.388232096752276, + "grad_norm": 0.32890209555625916, + "learning_rate": 7.016238619377634e-05, + "loss": 3.9305, + "step": 35150 + }, + { + "epoch": 2.388571816822938, + "grad_norm": 0.15593382716178894, + "learning_rate": 7.015813969289305e-05, + "loss": 3.8979, + "step": 35155 + }, + { + "epoch": 2.3889115368935996, + "grad_norm": 0.2113686203956604, + "learning_rate": 7.015389319200979e-05, + "loss": 4.0027, + "step": 35160 + }, + { + "epoch": 2.389251256964261, + "grad_norm": 2.297987699508667, + "learning_rate": 7.014964669112651e-05, + "loss": 4.031, + "step": 35165 + }, + { + "epoch": 2.3895909770349233, + "grad_norm": 0.17299415171146393, + "learning_rate": 7.014540019024324e-05, + "loss": 3.7632, + "step": 35170 + }, + { + "epoch": 2.389930697105585, + "grad_norm": 0.19930002093315125, + "learning_rate": 7.014115368935998e-05, + "loss": 3.9145, + "step": 35175 + }, + { + "epoch": 2.3902704171762466, + "grad_norm": 0.1799178570508957, + "learning_rate": 7.013690718847669e-05, + "loss": 3.7222, + "step": 35180 + }, + { + "epoch": 2.3906101372469086, + "grad_norm": 0.17523592710494995, + "learning_rate": 7.013266068759342e-05, + "loss": 3.7925, + "step": 35185 + }, + { + "epoch": 2.3909498573175703, + "grad_norm": 0.23907101154327393, + "learning_rate": 7.012841418671016e-05, + "loss": 3.9168, + "step": 35190 + }, + { + "epoch": 2.391289577388232, + "grad_norm": 0.22770938277244568, + "learning_rate": 7.012416768582688e-05, + "loss": 3.9561, + "step": 35195 + }, + { + "epoch": 2.391629297458894, + "grad_norm": 0.6419442296028137, + "learning_rate": 7.01199211849436e-05, + "loss": 4.2013, + "step": 35200 + }, + { + "epoch": 2.3919690175295556, + "grad_norm": 0.2501198649406433, + "learning_rate": 7.011567468406034e-05, + "loss": 3.4686, + "step": 35205 + }, + { + "epoch": 2.3923087376002172, + "grad_norm": 0.1930820196866989, + "learning_rate": 7.011142818317706e-05, + "loss": 4.1928, + "step": 35210 + }, + { + "epoch": 2.3926484576708793, + "grad_norm": 0.18877695500850677, + "learning_rate": 7.01071816822938e-05, + "loss": 3.9415, + "step": 35215 + }, + { + "epoch": 2.392988177741541, + "grad_norm": 0.1495112180709839, + "learning_rate": 7.010293518141053e-05, + "loss": 3.9921, + "step": 35220 + }, + { + "epoch": 2.3933278978122026, + "grad_norm": 0.1751498132944107, + "learning_rate": 7.009868868052724e-05, + "loss": 4.0399, + "step": 35225 + }, + { + "epoch": 2.3936676178828646, + "grad_norm": 0.19117720425128937, + "learning_rate": 7.009444217964398e-05, + "loss": 3.9743, + "step": 35230 + }, + { + "epoch": 2.3940073379535263, + "grad_norm": 0.19562087953090668, + "learning_rate": 7.009019567876071e-05, + "loss": 3.7052, + "step": 35235 + }, + { + "epoch": 2.394347058024188, + "grad_norm": 0.19477924704551697, + "learning_rate": 7.008594917787743e-05, + "loss": 4.016, + "step": 35240 + }, + { + "epoch": 2.39468677809485, + "grad_norm": 0.16971172392368317, + "learning_rate": 7.008170267699417e-05, + "loss": 3.7487, + "step": 35245 + }, + { + "epoch": 2.3950264981655116, + "grad_norm": 0.16426925361156464, + "learning_rate": 7.007745617611088e-05, + "loss": 3.735, + "step": 35250 + }, + { + "epoch": 2.3953662182361732, + "grad_norm": 0.14565309882164001, + "learning_rate": 7.007320967522761e-05, + "loss": 3.8849, + "step": 35255 + }, + { + "epoch": 2.3957059383068353, + "grad_norm": 0.2386237531900406, + "learning_rate": 7.006896317434435e-05, + "loss": 3.8603, + "step": 35260 + }, + { + "epoch": 2.396045658377497, + "grad_norm": 0.16097943484783173, + "learning_rate": 7.006471667346107e-05, + "loss": 4.0981, + "step": 35265 + }, + { + "epoch": 2.3963853784481586, + "grad_norm": 0.14299938082695007, + "learning_rate": 7.00604701725778e-05, + "loss": 4.0037, + "step": 35270 + }, + { + "epoch": 2.3967250985188207, + "grad_norm": 0.15071888267993927, + "learning_rate": 7.005622367169454e-05, + "loss": 3.8635, + "step": 35275 + }, + { + "epoch": 2.3970648185894823, + "grad_norm": 0.20038768649101257, + "learning_rate": 7.005197717081125e-05, + "loss": 3.9456, + "step": 35280 + }, + { + "epoch": 2.397404538660144, + "grad_norm": 0.23678311705589294, + "learning_rate": 7.004773066992798e-05, + "loss": 3.9584, + "step": 35285 + }, + { + "epoch": 2.397744258730806, + "grad_norm": 0.24175770580768585, + "learning_rate": 7.004348416904472e-05, + "loss": 3.9588, + "step": 35290 + }, + { + "epoch": 2.3980839788014676, + "grad_norm": 0.14926986396312714, + "learning_rate": 7.003923766816144e-05, + "loss": 3.9633, + "step": 35295 + }, + { + "epoch": 2.3984236988721293, + "grad_norm": 0.2778499722480774, + "learning_rate": 7.003499116727816e-05, + "loss": 3.9763, + "step": 35300 + }, + { + "epoch": 2.3987634189427913, + "grad_norm": 0.13911688327789307, + "learning_rate": 7.00307446663949e-05, + "loss": 4.0237, + "step": 35305 + }, + { + "epoch": 2.399103139013453, + "grad_norm": 0.1766834259033203, + "learning_rate": 7.002649816551162e-05, + "loss": 3.8223, + "step": 35310 + }, + { + "epoch": 2.3994428590841146, + "grad_norm": 0.30721908807754517, + "learning_rate": 7.002225166462835e-05, + "loss": 3.7123, + "step": 35315 + }, + { + "epoch": 2.3997825791547767, + "grad_norm": 0.18673056364059448, + "learning_rate": 7.001800516374508e-05, + "loss": 3.8027, + "step": 35320 + }, + { + "epoch": 2.4001222992254383, + "grad_norm": 0.17892174422740936, + "learning_rate": 7.00137586628618e-05, + "loss": 3.9689, + "step": 35325 + }, + { + "epoch": 2.4004620192961, + "grad_norm": 0.15162548422813416, + "learning_rate": 7.000951216197853e-05, + "loss": 3.7557, + "step": 35330 + }, + { + "epoch": 2.400801739366762, + "grad_norm": 0.14508214592933655, + "learning_rate": 7.000526566109526e-05, + "loss": 4.0367, + "step": 35335 + }, + { + "epoch": 2.4011414594374236, + "grad_norm": 0.3082200586795807, + "learning_rate": 7.000101916021199e-05, + "loss": 3.8428, + "step": 35340 + }, + { + "epoch": 2.4014811795080853, + "grad_norm": 0.17048580944538116, + "learning_rate": 6.999677265932872e-05, + "loss": 3.8762, + "step": 35345 + }, + { + "epoch": 2.4018208995787473, + "grad_norm": 0.22295649349689484, + "learning_rate": 6.999252615844544e-05, + "loss": 3.8061, + "step": 35350 + }, + { + "epoch": 2.402160619649409, + "grad_norm": 0.20875748991966248, + "learning_rate": 6.998827965756217e-05, + "loss": 3.7872, + "step": 35355 + }, + { + "epoch": 2.4025003397200706, + "grad_norm": 0.3772265911102295, + "learning_rate": 6.99840331566789e-05, + "loss": 3.8731, + "step": 35360 + }, + { + "epoch": 2.4028400597907327, + "grad_norm": 0.17891767621040344, + "learning_rate": 6.997978665579563e-05, + "loss": 3.7204, + "step": 35365 + }, + { + "epoch": 2.4031797798613943, + "grad_norm": 0.24207191169261932, + "learning_rate": 6.997554015491236e-05, + "loss": 4.0196, + "step": 35370 + }, + { + "epoch": 2.403519499932056, + "grad_norm": 0.1306188851594925, + "learning_rate": 6.997129365402908e-05, + "loss": 3.7078, + "step": 35375 + }, + { + "epoch": 2.4038592200027176, + "grad_norm": 0.1807194948196411, + "learning_rate": 6.996704715314581e-05, + "loss": 3.7552, + "step": 35380 + }, + { + "epoch": 2.4041989400733796, + "grad_norm": 0.16450022161006927, + "learning_rate": 6.996280065226254e-05, + "loss": 3.8745, + "step": 35385 + }, + { + "epoch": 2.4045386601440413, + "grad_norm": 0.15915901958942413, + "learning_rate": 6.995855415137927e-05, + "loss": 3.8155, + "step": 35390 + }, + { + "epoch": 2.404878380214703, + "grad_norm": 0.17143741250038147, + "learning_rate": 6.9954307650496e-05, + "loss": 3.7657, + "step": 35395 + }, + { + "epoch": 2.405218100285365, + "grad_norm": 0.1634187400341034, + "learning_rate": 6.995006114961272e-05, + "loss": 3.7735, + "step": 35400 + }, + { + "epoch": 2.4055578203560266, + "grad_norm": 0.18030096590518951, + "learning_rate": 6.994581464872945e-05, + "loss": 4.0822, + "step": 35405 + }, + { + "epoch": 2.4058975404266882, + "grad_norm": 0.23381423950195312, + "learning_rate": 6.994156814784618e-05, + "loss": 3.7822, + "step": 35410 + }, + { + "epoch": 2.4062372604973503, + "grad_norm": 0.1430894136428833, + "learning_rate": 6.993732164696291e-05, + "loss": 3.6749, + "step": 35415 + }, + { + "epoch": 2.406576980568012, + "grad_norm": 0.1748279482126236, + "learning_rate": 6.993307514607964e-05, + "loss": 3.8155, + "step": 35420 + }, + { + "epoch": 2.4069167006386736, + "grad_norm": 0.1962091326713562, + "learning_rate": 6.992882864519636e-05, + "loss": 3.9876, + "step": 35425 + }, + { + "epoch": 2.4072564207093357, + "grad_norm": 0.1815182864665985, + "learning_rate": 6.992458214431309e-05, + "loss": 3.9228, + "step": 35430 + }, + { + "epoch": 2.4075961407799973, + "grad_norm": 0.30948641896247864, + "learning_rate": 6.992033564342982e-05, + "loss": 3.8654, + "step": 35435 + }, + { + "epoch": 2.407935860850659, + "grad_norm": 0.7766483426094055, + "learning_rate": 6.991608914254655e-05, + "loss": 3.8872, + "step": 35440 + }, + { + "epoch": 2.408275580921321, + "grad_norm": 0.21183514595031738, + "learning_rate": 6.991184264166328e-05, + "loss": 4.083, + "step": 35445 + }, + { + "epoch": 2.4086153009919826, + "grad_norm": 0.1901647448539734, + "learning_rate": 6.990759614077999e-05, + "loss": 4.2176, + "step": 35450 + }, + { + "epoch": 2.4089550210626443, + "grad_norm": 0.27889496088027954, + "learning_rate": 6.990334963989673e-05, + "loss": 3.7604, + "step": 35455 + }, + { + "epoch": 2.4092947411333063, + "grad_norm": 0.15199705958366394, + "learning_rate": 6.989910313901346e-05, + "loss": 4.0593, + "step": 35460 + }, + { + "epoch": 2.409634461203968, + "grad_norm": 0.2709583640098572, + "learning_rate": 6.989485663813017e-05, + "loss": 3.7427, + "step": 35465 + }, + { + "epoch": 2.4099741812746296, + "grad_norm": 0.15564188361167908, + "learning_rate": 6.989061013724692e-05, + "loss": 3.913, + "step": 35470 + }, + { + "epoch": 2.4103139013452917, + "grad_norm": 0.15495608747005463, + "learning_rate": 6.988636363636364e-05, + "loss": 3.8977, + "step": 35475 + }, + { + "epoch": 2.4106536214159533, + "grad_norm": 0.1607818901538849, + "learning_rate": 6.988211713548036e-05, + "loss": 3.7982, + "step": 35480 + }, + { + "epoch": 2.410993341486615, + "grad_norm": 0.1949816197156906, + "learning_rate": 6.98778706345971e-05, + "loss": 3.9639, + "step": 35485 + }, + { + "epoch": 2.4113330615572766, + "grad_norm": 0.16188134253025055, + "learning_rate": 6.987362413371383e-05, + "loss": 4.1211, + "step": 35490 + }, + { + "epoch": 2.4116727816279386, + "grad_norm": 0.23789745569229126, + "learning_rate": 6.986937763283054e-05, + "loss": 4.0525, + "step": 35495 + }, + { + "epoch": 2.4120125016986003, + "grad_norm": 0.1793557107448578, + "learning_rate": 6.986513113194728e-05, + "loss": 4.0613, + "step": 35500 + }, + { + "epoch": 2.412352221769262, + "grad_norm": 0.17384912073612213, + "learning_rate": 6.986088463106401e-05, + "loss": 3.6036, + "step": 35505 + }, + { + "epoch": 2.412691941839924, + "grad_norm": 0.1701529622077942, + "learning_rate": 6.985663813018073e-05, + "loss": 3.8854, + "step": 35510 + }, + { + "epoch": 2.4130316619105856, + "grad_norm": 0.14212077856063843, + "learning_rate": 6.985239162929747e-05, + "loss": 3.9215, + "step": 35515 + }, + { + "epoch": 2.4133713819812472, + "grad_norm": 0.22250454127788544, + "learning_rate": 6.984814512841418e-05, + "loss": 4.1357, + "step": 35520 + }, + { + "epoch": 2.4137111020519093, + "grad_norm": 0.22357754409313202, + "learning_rate": 6.984389862753091e-05, + "loss": 3.8985, + "step": 35525 + }, + { + "epoch": 2.414050822122571, + "grad_norm": 0.1753927618265152, + "learning_rate": 6.983965212664765e-05, + "loss": 4.0901, + "step": 35530 + }, + { + "epoch": 2.4143905421932326, + "grad_norm": 0.13322322070598602, + "learning_rate": 6.983540562576437e-05, + "loss": 4.0241, + "step": 35535 + }, + { + "epoch": 2.4147302622638946, + "grad_norm": 0.1865941286087036, + "learning_rate": 6.98311591248811e-05, + "loss": 3.8735, + "step": 35540 + }, + { + "epoch": 2.4150699823345563, + "grad_norm": 0.17113398015499115, + "learning_rate": 6.982691262399784e-05, + "loss": 3.9378, + "step": 35545 + }, + { + "epoch": 2.415409702405218, + "grad_norm": 0.18200746178627014, + "learning_rate": 6.982266612311455e-05, + "loss": 3.7954, + "step": 35550 + }, + { + "epoch": 2.41574942247588, + "grad_norm": 0.18082289397716522, + "learning_rate": 6.981841962223129e-05, + "loss": 4.0418, + "step": 35555 + }, + { + "epoch": 2.4160891425465416, + "grad_norm": 3.8521430492401123, + "learning_rate": 6.981417312134802e-05, + "loss": 3.9671, + "step": 35560 + }, + { + "epoch": 2.4164288626172032, + "grad_norm": 0.14055785536766052, + "learning_rate": 6.980992662046473e-05, + "loss": 3.7418, + "step": 35565 + }, + { + "epoch": 2.4167685826878653, + "grad_norm": 0.18189401924610138, + "learning_rate": 6.980568011958148e-05, + "loss": 3.8168, + "step": 35570 + }, + { + "epoch": 2.417108302758527, + "grad_norm": 0.22798043489456177, + "learning_rate": 6.98014336186982e-05, + "loss": 3.8101, + "step": 35575 + }, + { + "epoch": 2.4174480228291886, + "grad_norm": 0.17143777012825012, + "learning_rate": 6.979718711781492e-05, + "loss": 3.8992, + "step": 35580 + }, + { + "epoch": 2.4177877428998507, + "grad_norm": 0.30104443430900574, + "learning_rate": 6.979294061693166e-05, + "loss": 3.901, + "step": 35585 + }, + { + "epoch": 2.4181274629705123, + "grad_norm": 2.696350574493408, + "learning_rate": 6.978869411604839e-05, + "loss": 3.9062, + "step": 35590 + }, + { + "epoch": 2.418467183041174, + "grad_norm": 0.3592741787433624, + "learning_rate": 6.97844476151651e-05, + "loss": 4.0626, + "step": 35595 + }, + { + "epoch": 2.418806903111836, + "grad_norm": 0.19103389978408813, + "learning_rate": 6.978020111428184e-05, + "loss": 4.0319, + "step": 35600 + }, + { + "epoch": 2.4191466231824976, + "grad_norm": 0.13852714002132416, + "learning_rate": 6.977595461339856e-05, + "loss": 3.8958, + "step": 35605 + }, + { + "epoch": 2.4194863432531593, + "grad_norm": 0.15781597793102264, + "learning_rate": 6.977170811251529e-05, + "loss": 3.8151, + "step": 35610 + }, + { + "epoch": 2.4198260633238213, + "grad_norm": 0.1800476610660553, + "learning_rate": 6.976746161163203e-05, + "loss": 3.9576, + "step": 35615 + }, + { + "epoch": 2.420165783394483, + "grad_norm": 0.25098028779029846, + "learning_rate": 6.976321511074874e-05, + "loss": 3.91, + "step": 35620 + }, + { + "epoch": 2.4205055034651446, + "grad_norm": 1.7809762954711914, + "learning_rate": 6.975896860986547e-05, + "loss": 3.7779, + "step": 35625 + }, + { + "epoch": 2.4208452235358067, + "grad_norm": 0.18984760344028473, + "learning_rate": 6.975472210898221e-05, + "loss": 3.9136, + "step": 35630 + }, + { + "epoch": 2.4211849436064683, + "grad_norm": 0.194640651345253, + "learning_rate": 6.975047560809893e-05, + "loss": 3.998, + "step": 35635 + }, + { + "epoch": 2.42152466367713, + "grad_norm": 0.19488584995269775, + "learning_rate": 6.974622910721565e-05, + "loss": 3.7936, + "step": 35640 + }, + { + "epoch": 2.421864383747792, + "grad_norm": 0.1482219696044922, + "learning_rate": 6.97419826063324e-05, + "loss": 3.8019, + "step": 35645 + }, + { + "epoch": 2.4222041038184536, + "grad_norm": 0.28963547945022583, + "learning_rate": 6.973773610544911e-05, + "loss": 3.7614, + "step": 35650 + }, + { + "epoch": 2.4225438238891153, + "grad_norm": 0.1749502718448639, + "learning_rate": 6.973348960456584e-05, + "loss": 3.6873, + "step": 35655 + }, + { + "epoch": 2.4228835439597773, + "grad_norm": 0.4637705683708191, + "learning_rate": 6.972924310368258e-05, + "loss": 4.0803, + "step": 35660 + }, + { + "epoch": 2.423223264030439, + "grad_norm": 0.14008869230747223, + "learning_rate": 6.97249966027993e-05, + "loss": 3.8543, + "step": 35665 + }, + { + "epoch": 2.4235629841011006, + "grad_norm": 0.1736019253730774, + "learning_rate": 6.972075010191602e-05, + "loss": 4.0391, + "step": 35670 + }, + { + "epoch": 2.4239027041717627, + "grad_norm": 0.39295482635498047, + "learning_rate": 6.971650360103275e-05, + "loss": 3.714, + "step": 35675 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.27989619970321655, + "learning_rate": 6.971225710014948e-05, + "loss": 3.9146, + "step": 35680 + }, + { + "epoch": 2.424582144313086, + "grad_norm": 0.16213306784629822, + "learning_rate": 6.97080105992662e-05, + "loss": 3.785, + "step": 35685 + }, + { + "epoch": 2.424921864383748, + "grad_norm": 0.1965455710887909, + "learning_rate": 6.970376409838293e-05, + "loss": 3.8482, + "step": 35690 + }, + { + "epoch": 2.4252615844544096, + "grad_norm": 0.22674986720085144, + "learning_rate": 6.969951759749966e-05, + "loss": 3.7871, + "step": 35695 + }, + { + "epoch": 2.4256013045250713, + "grad_norm": 0.3413090109825134, + "learning_rate": 6.969527109661639e-05, + "loss": 3.854, + "step": 35700 + }, + { + "epoch": 2.4259410245957334, + "grad_norm": 0.19491694867610931, + "learning_rate": 6.969102459573312e-05, + "loss": 3.4217, + "step": 35705 + }, + { + "epoch": 2.426280744666395, + "grad_norm": 0.15379123389720917, + "learning_rate": 6.968677809484985e-05, + "loss": 4.1464, + "step": 35710 + }, + { + "epoch": 2.4266204647370566, + "grad_norm": 0.14035077393054962, + "learning_rate": 6.968253159396657e-05, + "loss": 3.7218, + "step": 35715 + }, + { + "epoch": 2.4269601848077182, + "grad_norm": 0.3222501277923584, + "learning_rate": 6.96782850930833e-05, + "loss": 3.8749, + "step": 35720 + }, + { + "epoch": 2.4272999048783803, + "grad_norm": 0.140783429145813, + "learning_rate": 6.967403859220003e-05, + "loss": 3.7579, + "step": 35725 + }, + { + "epoch": 2.427639624949042, + "grad_norm": 0.22320470213890076, + "learning_rate": 6.966979209131676e-05, + "loss": 4.0451, + "step": 35730 + }, + { + "epoch": 2.4279793450197036, + "grad_norm": 0.15971311926841736, + "learning_rate": 6.966554559043349e-05, + "loss": 3.8289, + "step": 35735 + }, + { + "epoch": 2.4283190650903657, + "grad_norm": 0.1684703528881073, + "learning_rate": 6.966129908955021e-05, + "loss": 3.8074, + "step": 35740 + }, + { + "epoch": 2.4286587851610273, + "grad_norm": 0.15857058763504028, + "learning_rate": 6.965705258866694e-05, + "loss": 3.6332, + "step": 35745 + }, + { + "epoch": 2.428998505231689, + "grad_norm": 0.2200189083814621, + "learning_rate": 6.965280608778367e-05, + "loss": 3.9125, + "step": 35750 + }, + { + "epoch": 2.429338225302351, + "grad_norm": 0.18016751110553741, + "learning_rate": 6.96485595869004e-05, + "loss": 3.7505, + "step": 35755 + }, + { + "epoch": 2.4296779453730126, + "grad_norm": 0.18371470272541046, + "learning_rate": 6.964431308601713e-05, + "loss": 4.0741, + "step": 35760 + }, + { + "epoch": 2.4300176654436743, + "grad_norm": 0.19834037125110626, + "learning_rate": 6.964006658513385e-05, + "loss": 4.0359, + "step": 35765 + }, + { + "epoch": 2.4303573855143363, + "grad_norm": 0.160971000790596, + "learning_rate": 6.963582008425058e-05, + "loss": 3.9375, + "step": 35770 + }, + { + "epoch": 2.430697105584998, + "grad_norm": 0.19047199189662933, + "learning_rate": 6.963157358336731e-05, + "loss": 3.9978, + "step": 35775 + }, + { + "epoch": 2.4310368256556596, + "grad_norm": 0.4023791253566742, + "learning_rate": 6.962732708248404e-05, + "loss": 3.7813, + "step": 35780 + }, + { + "epoch": 2.4313765457263217, + "grad_norm": 0.1660010665655136, + "learning_rate": 6.962308058160077e-05, + "loss": 3.8777, + "step": 35785 + }, + { + "epoch": 2.4317162657969833, + "grad_norm": 0.18059125542640686, + "learning_rate": 6.96188340807175e-05, + "loss": 3.8412, + "step": 35790 + }, + { + "epoch": 2.432055985867645, + "grad_norm": 0.1970418244600296, + "learning_rate": 6.961458757983422e-05, + "loss": 4.034, + "step": 35795 + }, + { + "epoch": 2.432395705938307, + "grad_norm": 0.16715186834335327, + "learning_rate": 6.961034107895095e-05, + "loss": 4.0112, + "step": 35800 + }, + { + "epoch": 2.4327354260089686, + "grad_norm": 0.3058704733848572, + "learning_rate": 6.960609457806766e-05, + "loss": 3.7805, + "step": 35805 + }, + { + "epoch": 2.4330751460796303, + "grad_norm": 0.1596057415008545, + "learning_rate": 6.96018480771844e-05, + "loss": 4.1052, + "step": 35810 + }, + { + "epoch": 2.4334148661502923, + "grad_norm": 0.19252650439739227, + "learning_rate": 6.959760157630113e-05, + "loss": 3.7515, + "step": 35815 + }, + { + "epoch": 2.433754586220954, + "grad_norm": 0.17683137953281403, + "learning_rate": 6.959335507541785e-05, + "loss": 4.191, + "step": 35820 + }, + { + "epoch": 2.4340943062916156, + "grad_norm": 0.18876998126506805, + "learning_rate": 6.958910857453459e-05, + "loss": 4.014, + "step": 35825 + }, + { + "epoch": 2.4344340263622772, + "grad_norm": 0.1320532262325287, + "learning_rate": 6.958486207365132e-05, + "loss": 3.8823, + "step": 35830 + }, + { + "epoch": 2.4347737464329393, + "grad_norm": 0.21703828871250153, + "learning_rate": 6.958061557276803e-05, + "loss": 3.7705, + "step": 35835 + }, + { + "epoch": 2.435113466503601, + "grad_norm": 0.1484573930501938, + "learning_rate": 6.957636907188477e-05, + "loss": 3.8172, + "step": 35840 + }, + { + "epoch": 2.4354531865742626, + "grad_norm": 0.1982671469449997, + "learning_rate": 6.95721225710015e-05, + "loss": 3.8367, + "step": 35845 + }, + { + "epoch": 2.4357929066449246, + "grad_norm": 0.20273710787296295, + "learning_rate": 6.956787607011822e-05, + "loss": 3.7317, + "step": 35850 + }, + { + "epoch": 2.4361326267155863, + "grad_norm": 0.4039521813392639, + "learning_rate": 6.956362956923496e-05, + "loss": 3.9261, + "step": 35855 + }, + { + "epoch": 2.436472346786248, + "grad_norm": 0.17542827129364014, + "learning_rate": 6.955938306835169e-05, + "loss": 4.0257, + "step": 35860 + }, + { + "epoch": 2.43681206685691, + "grad_norm": 0.16899430751800537, + "learning_rate": 6.95551365674684e-05, + "loss": 3.7482, + "step": 35865 + }, + { + "epoch": 2.4371517869275716, + "grad_norm": 0.18863551318645477, + "learning_rate": 6.955089006658514e-05, + "loss": 3.7367, + "step": 35870 + }, + { + "epoch": 2.4374915069982332, + "grad_norm": 0.1788477748632431, + "learning_rate": 6.954664356570186e-05, + "loss": 4.0243, + "step": 35875 + }, + { + "epoch": 2.4378312270688953, + "grad_norm": 0.23642119765281677, + "learning_rate": 6.954239706481859e-05, + "loss": 3.9973, + "step": 35880 + }, + { + "epoch": 2.438170947139557, + "grad_norm": 0.18261738121509552, + "learning_rate": 6.953815056393533e-05, + "loss": 3.6015, + "step": 35885 + }, + { + "epoch": 2.4385106672102186, + "grad_norm": 0.22067059576511383, + "learning_rate": 6.953390406305204e-05, + "loss": 4.0167, + "step": 35890 + }, + { + "epoch": 2.4388503872808807, + "grad_norm": 0.17295710742473602, + "learning_rate": 6.952965756216878e-05, + "loss": 3.8479, + "step": 35895 + }, + { + "epoch": 2.4391901073515423, + "grad_norm": 0.17328527569770813, + "learning_rate": 6.952541106128551e-05, + "loss": 3.7107, + "step": 35900 + }, + { + "epoch": 2.439529827422204, + "grad_norm": 0.2019263058900833, + "learning_rate": 6.952116456040223e-05, + "loss": 3.8506, + "step": 35905 + }, + { + "epoch": 2.439869547492866, + "grad_norm": 15.286571502685547, + "learning_rate": 6.951691805951897e-05, + "loss": 4.0811, + "step": 35910 + }, + { + "epoch": 2.4402092675635276, + "grad_norm": 0.1578136831521988, + "learning_rate": 6.95126715586357e-05, + "loss": 3.8386, + "step": 35915 + }, + { + "epoch": 2.4405489876341893, + "grad_norm": 0.1790495216846466, + "learning_rate": 6.950842505775241e-05, + "loss": 3.8688, + "step": 35920 + }, + { + "epoch": 2.4408887077048513, + "grad_norm": 0.17588889598846436, + "learning_rate": 6.950417855686915e-05, + "loss": 3.7294, + "step": 35925 + }, + { + "epoch": 2.441228427775513, + "grad_norm": 0.2111557126045227, + "learning_rate": 6.949993205598588e-05, + "loss": 3.9887, + "step": 35930 + }, + { + "epoch": 2.4415681478461746, + "grad_norm": 0.16743060946464539, + "learning_rate": 6.949568555510259e-05, + "loss": 4.0333, + "step": 35935 + }, + { + "epoch": 2.4419078679168367, + "grad_norm": 0.16518808901309967, + "learning_rate": 6.949143905421933e-05, + "loss": 3.8704, + "step": 35940 + }, + { + "epoch": 2.4422475879874983, + "grad_norm": 0.2130100578069687, + "learning_rate": 6.948719255333605e-05, + "loss": 3.8843, + "step": 35945 + }, + { + "epoch": 2.44258730805816, + "grad_norm": 0.2778122127056122, + "learning_rate": 6.948294605245278e-05, + "loss": 3.7291, + "step": 35950 + }, + { + "epoch": 2.442927028128822, + "grad_norm": 0.13165327906608582, + "learning_rate": 6.947869955156952e-05, + "loss": 4.0083, + "step": 35955 + }, + { + "epoch": 2.4432667481994836, + "grad_norm": 0.16578881442546844, + "learning_rate": 6.947445305068623e-05, + "loss": 3.6845, + "step": 35960 + }, + { + "epoch": 2.4436064682701453, + "grad_norm": 0.17065231502056122, + "learning_rate": 6.947020654980296e-05, + "loss": 3.896, + "step": 35965 + }, + { + "epoch": 2.4439461883408073, + "grad_norm": 0.1928863525390625, + "learning_rate": 6.94659600489197e-05, + "loss": 3.6639, + "step": 35970 + }, + { + "epoch": 2.444285908411469, + "grad_norm": 0.21527524292469025, + "learning_rate": 6.946171354803642e-05, + "loss": 3.7562, + "step": 35975 + }, + { + "epoch": 2.4446256284821306, + "grad_norm": 0.21453236043453217, + "learning_rate": 6.945746704715315e-05, + "loss": 3.585, + "step": 35980 + }, + { + "epoch": 2.4449653485527927, + "grad_norm": 0.18916966021060944, + "learning_rate": 6.945322054626989e-05, + "loss": 3.7244, + "step": 35985 + }, + { + "epoch": 2.4453050686234543, + "grad_norm": 0.1621418446302414, + "learning_rate": 6.94489740453866e-05, + "loss": 4.0956, + "step": 35990 + }, + { + "epoch": 2.445644788694116, + "grad_norm": 0.17796798050403595, + "learning_rate": 6.944472754450333e-05, + "loss": 3.7938, + "step": 35995 + }, + { + "epoch": 2.445984508764778, + "grad_norm": 0.20030507445335388, + "learning_rate": 6.944048104362007e-05, + "loss": 3.8936, + "step": 36000 + }, + { + "epoch": 2.4463242288354397, + "grad_norm": 0.2747809886932373, + "learning_rate": 6.943623454273679e-05, + "loss": 4.1238, + "step": 36005 + }, + { + "epoch": 2.4466639489061013, + "grad_norm": 0.17187118530273438, + "learning_rate": 6.943198804185351e-05, + "loss": 4.0124, + "step": 36010 + }, + { + "epoch": 2.4470036689767634, + "grad_norm": 0.15999282896518707, + "learning_rate": 6.942774154097025e-05, + "loss": 3.8903, + "step": 36015 + }, + { + "epoch": 2.447343389047425, + "grad_norm": 0.14726600050926208, + "learning_rate": 6.942349504008697e-05, + "loss": 3.6803, + "step": 36020 + }, + { + "epoch": 2.4476831091180866, + "grad_norm": 0.1673300415277481, + "learning_rate": 6.94192485392037e-05, + "loss": 3.8148, + "step": 36025 + }, + { + "epoch": 2.4480228291887487, + "grad_norm": 0.18858996033668518, + "learning_rate": 6.941500203832043e-05, + "loss": 3.803, + "step": 36030 + }, + { + "epoch": 2.4483625492594103, + "grad_norm": 0.17100179195404053, + "learning_rate": 6.941075553743715e-05, + "loss": 3.9026, + "step": 36035 + }, + { + "epoch": 2.448702269330072, + "grad_norm": 0.3090517222881317, + "learning_rate": 6.940650903655388e-05, + "loss": 3.7307, + "step": 36040 + }, + { + "epoch": 2.449041989400734, + "grad_norm": 0.15081068873405457, + "learning_rate": 6.940226253567061e-05, + "loss": 4.0042, + "step": 36045 + }, + { + "epoch": 2.4493817094713957, + "grad_norm": 0.6286077499389648, + "learning_rate": 6.939801603478734e-05, + "loss": 4.0234, + "step": 36050 + }, + { + "epoch": 2.4497214295420573, + "grad_norm": 0.19811220467090607, + "learning_rate": 6.939376953390407e-05, + "loss": 3.8621, + "step": 36055 + }, + { + "epoch": 2.4500611496127194, + "grad_norm": 0.22905570268630981, + "learning_rate": 6.93895230330208e-05, + "loss": 3.9638, + "step": 36060 + }, + { + "epoch": 2.450400869683381, + "grad_norm": 0.19862687587738037, + "learning_rate": 6.938527653213752e-05, + "loss": 3.9406, + "step": 36065 + }, + { + "epoch": 2.4507405897540426, + "grad_norm": 1.374951958656311, + "learning_rate": 6.938103003125425e-05, + "loss": 3.8503, + "step": 36070 + }, + { + "epoch": 2.4510803098247043, + "grad_norm": 0.40026697516441345, + "learning_rate": 6.937678353037098e-05, + "loss": 3.9305, + "step": 36075 + }, + { + "epoch": 2.4514200298953663, + "grad_norm": 0.13687218725681305, + "learning_rate": 6.93725370294877e-05, + "loss": 4.0672, + "step": 36080 + }, + { + "epoch": 2.451759749966028, + "grad_norm": 0.17260117828845978, + "learning_rate": 6.936829052860443e-05, + "loss": 4.0906, + "step": 36085 + }, + { + "epoch": 2.4520994700366896, + "grad_norm": 0.1388218104839325, + "learning_rate": 6.936404402772116e-05, + "loss": 3.777, + "step": 36090 + }, + { + "epoch": 2.4524391901073517, + "grad_norm": 0.17588473856449127, + "learning_rate": 6.935979752683789e-05, + "loss": 3.7966, + "step": 36095 + }, + { + "epoch": 2.4527789101780133, + "grad_norm": 0.7815591096878052, + "learning_rate": 6.935555102595462e-05, + "loss": 3.743, + "step": 36100 + }, + { + "epoch": 2.453118630248675, + "grad_norm": 0.1511906385421753, + "learning_rate": 6.935130452507135e-05, + "loss": 3.7144, + "step": 36105 + }, + { + "epoch": 2.453458350319337, + "grad_norm": 0.16056989133358002, + "learning_rate": 6.934705802418807e-05, + "loss": 3.9991, + "step": 36110 + }, + { + "epoch": 2.4537980703899986, + "grad_norm": 0.1409963071346283, + "learning_rate": 6.93428115233048e-05, + "loss": 4.0325, + "step": 36115 + }, + { + "epoch": 2.4541377904606603, + "grad_norm": 0.1763191819190979, + "learning_rate": 6.933856502242153e-05, + "loss": 4.1328, + "step": 36120 + }, + { + "epoch": 2.4544775105313223, + "grad_norm": 0.1618926078081131, + "learning_rate": 6.933431852153826e-05, + "loss": 3.805, + "step": 36125 + }, + { + "epoch": 2.454817230601984, + "grad_norm": 0.17236918210983276, + "learning_rate": 6.933007202065499e-05, + "loss": 3.9011, + "step": 36130 + }, + { + "epoch": 2.4551569506726456, + "grad_norm": 0.22457876801490784, + "learning_rate": 6.932582551977171e-05, + "loss": 3.9007, + "step": 36135 + }, + { + "epoch": 2.4554966707433077, + "grad_norm": 0.16952508687973022, + "learning_rate": 6.932157901888844e-05, + "loss": 3.6635, + "step": 36140 + }, + { + "epoch": 2.4558363908139693, + "grad_norm": 0.17558632791042328, + "learning_rate": 6.931733251800516e-05, + "loss": 4.103, + "step": 36145 + }, + { + "epoch": 2.456176110884631, + "grad_norm": 0.1599593162536621, + "learning_rate": 6.93130860171219e-05, + "loss": 3.9496, + "step": 36150 + }, + { + "epoch": 2.456515830955293, + "grad_norm": 0.8620944619178772, + "learning_rate": 6.930883951623863e-05, + "loss": 3.9324, + "step": 36155 + }, + { + "epoch": 2.4568555510259547, + "grad_norm": 0.19004808366298676, + "learning_rate": 6.930459301535534e-05, + "loss": 3.9474, + "step": 36160 + }, + { + "epoch": 2.4571952710966163, + "grad_norm": 0.21392235159873962, + "learning_rate": 6.930034651447208e-05, + "loss": 4.0176, + "step": 36165 + }, + { + "epoch": 2.457534991167278, + "grad_norm": 0.4230898916721344, + "learning_rate": 6.929610001358881e-05, + "loss": 3.8541, + "step": 36170 + }, + { + "epoch": 2.45787471123794, + "grad_norm": 0.14488035440444946, + "learning_rate": 6.929185351270552e-05, + "loss": 3.8719, + "step": 36175 + }, + { + "epoch": 2.4582144313086016, + "grad_norm": 0.17695112526416779, + "learning_rate": 6.928760701182227e-05, + "loss": 3.939, + "step": 36180 + }, + { + "epoch": 2.4585541513792633, + "grad_norm": 0.290048748254776, + "learning_rate": 6.9283360510939e-05, + "loss": 3.925, + "step": 36185 + }, + { + "epoch": 2.4588938714499253, + "grad_norm": 0.2367110699415207, + "learning_rate": 6.927911401005571e-05, + "loss": 3.9152, + "step": 36190 + }, + { + "epoch": 2.459233591520587, + "grad_norm": 0.18144668638706207, + "learning_rate": 6.927486750917245e-05, + "loss": 3.9696, + "step": 36195 + }, + { + "epoch": 2.4595733115912486, + "grad_norm": 0.20814472436904907, + "learning_rate": 6.927062100828918e-05, + "loss": 3.7831, + "step": 36200 + }, + { + "epoch": 2.4599130316619107, + "grad_norm": 0.16599467396736145, + "learning_rate": 6.926637450740589e-05, + "loss": 4.3674, + "step": 36205 + }, + { + "epoch": 2.4602527517325723, + "grad_norm": 0.15751208364963531, + "learning_rate": 6.926212800652263e-05, + "loss": 3.9097, + "step": 36210 + }, + { + "epoch": 2.460592471803234, + "grad_norm": 0.17406146228313446, + "learning_rate": 6.925788150563936e-05, + "loss": 3.7997, + "step": 36215 + }, + { + "epoch": 2.460932191873896, + "grad_norm": 0.19217590987682343, + "learning_rate": 6.925363500475608e-05, + "loss": 4.1065, + "step": 36220 + }, + { + "epoch": 2.4612719119445576, + "grad_norm": 1.2237894535064697, + "learning_rate": 6.924938850387282e-05, + "loss": 3.7048, + "step": 36225 + }, + { + "epoch": 2.4616116320152193, + "grad_norm": 0.5970996022224426, + "learning_rate": 6.924514200298953e-05, + "loss": 4.0778, + "step": 36230 + }, + { + "epoch": 2.4619513520858813, + "grad_norm": 0.14686471223831177, + "learning_rate": 6.924089550210627e-05, + "loss": 3.6792, + "step": 36235 + }, + { + "epoch": 2.462291072156543, + "grad_norm": 0.637578010559082, + "learning_rate": 6.9236649001223e-05, + "loss": 3.9073, + "step": 36240 + }, + { + "epoch": 2.4626307922272046, + "grad_norm": 0.15968813002109528, + "learning_rate": 6.923240250033972e-05, + "loss": 3.8424, + "step": 36245 + }, + { + "epoch": 2.4629705122978667, + "grad_norm": 0.18262220919132233, + "learning_rate": 6.922815599945646e-05, + "loss": 3.8553, + "step": 36250 + }, + { + "epoch": 2.4633102323685283, + "grad_norm": 0.2182403802871704, + "learning_rate": 6.922390949857319e-05, + "loss": 3.7754, + "step": 36255 + }, + { + "epoch": 2.46364995243919, + "grad_norm": 0.20825578272342682, + "learning_rate": 6.92196629976899e-05, + "loss": 3.9317, + "step": 36260 + }, + { + "epoch": 2.463989672509852, + "grad_norm": 0.2173466831445694, + "learning_rate": 6.921541649680664e-05, + "loss": 3.9657, + "step": 36265 + }, + { + "epoch": 2.4643293925805136, + "grad_norm": 0.7486411929130554, + "learning_rate": 6.921116999592337e-05, + "loss": 4.125, + "step": 36270 + }, + { + "epoch": 2.4646691126511753, + "grad_norm": 0.1681060642004013, + "learning_rate": 6.920692349504008e-05, + "loss": 3.9197, + "step": 36275 + }, + { + "epoch": 2.4650088327218374, + "grad_norm": 0.20195026695728302, + "learning_rate": 6.920267699415683e-05, + "loss": 3.8416, + "step": 36280 + }, + { + "epoch": 2.465348552792499, + "grad_norm": 0.1837283968925476, + "learning_rate": 6.919843049327355e-05, + "loss": 3.895, + "step": 36285 + }, + { + "epoch": 2.4656882728631606, + "grad_norm": 0.2285485416650772, + "learning_rate": 6.919418399239027e-05, + "loss": 3.8962, + "step": 36290 + }, + { + "epoch": 2.4660279929338227, + "grad_norm": 0.1608094871044159, + "learning_rate": 6.918993749150701e-05, + "loss": 3.5814, + "step": 36295 + }, + { + "epoch": 2.4663677130044843, + "grad_norm": 0.18003389239311218, + "learning_rate": 6.918569099062372e-05, + "loss": 3.9318, + "step": 36300 + }, + { + "epoch": 2.466707433075146, + "grad_norm": 0.13759389519691467, + "learning_rate": 6.918144448974045e-05, + "loss": 3.8396, + "step": 36305 + }, + { + "epoch": 2.467047153145808, + "grad_norm": 0.19837671518325806, + "learning_rate": 6.91771979888572e-05, + "loss": 3.9478, + "step": 36310 + }, + { + "epoch": 2.4673868732164697, + "grad_norm": 0.18797710537910461, + "learning_rate": 6.917295148797391e-05, + "loss": 4.1373, + "step": 36315 + }, + { + "epoch": 2.4677265932871313, + "grad_norm": 0.13907471299171448, + "learning_rate": 6.916870498709064e-05, + "loss": 4.0903, + "step": 36320 + }, + { + "epoch": 2.4680663133577934, + "grad_norm": 0.19448839128017426, + "learning_rate": 6.916445848620738e-05, + "loss": 3.9002, + "step": 36325 + }, + { + "epoch": 2.468406033428455, + "grad_norm": 0.15740589797496796, + "learning_rate": 6.916021198532409e-05, + "loss": 3.8897, + "step": 36330 + }, + { + "epoch": 2.4687457534991166, + "grad_norm": 0.1946292519569397, + "learning_rate": 6.915596548444082e-05, + "loss": 4.022, + "step": 36335 + }, + { + "epoch": 2.4690854735697787, + "grad_norm": 0.20069767534732819, + "learning_rate": 6.915171898355756e-05, + "loss": 3.6472, + "step": 36340 + }, + { + "epoch": 2.4694251936404403, + "grad_norm": 0.36691924929618835, + "learning_rate": 6.914747248267428e-05, + "loss": 3.9973, + "step": 36345 + }, + { + "epoch": 2.469764913711102, + "grad_norm": 0.19483885169029236, + "learning_rate": 6.9143225981791e-05, + "loss": 4.0293, + "step": 36350 + }, + { + "epoch": 2.470104633781764, + "grad_norm": 0.3769546151161194, + "learning_rate": 6.913897948090775e-05, + "loss": 3.5762, + "step": 36355 + }, + { + "epoch": 2.4704443538524257, + "grad_norm": 0.2694959342479706, + "learning_rate": 6.913473298002446e-05, + "loss": 3.7481, + "step": 36360 + }, + { + "epoch": 2.4707840739230873, + "grad_norm": 0.5239735841751099, + "learning_rate": 6.913048647914119e-05, + "loss": 4.1037, + "step": 36365 + }, + { + "epoch": 2.4711237939937494, + "grad_norm": 0.17668843269348145, + "learning_rate": 6.912623997825792e-05, + "loss": 3.9794, + "step": 36370 + }, + { + "epoch": 2.471463514064411, + "grad_norm": NaN, + "learning_rate": 6.91228427775513e-05, + "loss": 3.78, + "step": 36375 + }, + { + "epoch": 2.4718032341350726, + "grad_norm": 0.21169064939022064, + "learning_rate": 6.911859627666803e-05, + "loss": 3.9234, + "step": 36380 + }, + { + "epoch": 2.4721429542057347, + "grad_norm": 0.17324987053871155, + "learning_rate": 6.911434977578475e-05, + "loss": 3.9848, + "step": 36385 + }, + { + "epoch": 2.4724826742763963, + "grad_norm": 0.16439926624298096, + "learning_rate": 6.911010327490148e-05, + "loss": 3.7583, + "step": 36390 + }, + { + "epoch": 2.472822394347058, + "grad_norm": 0.17169155180454254, + "learning_rate": 6.910585677401821e-05, + "loss": 3.9106, + "step": 36395 + }, + { + "epoch": 2.47316211441772, + "grad_norm": 0.14684489369392395, + "learning_rate": 6.910161027313494e-05, + "loss": 3.6925, + "step": 36400 + }, + { + "epoch": 2.4735018344883817, + "grad_norm": 0.15346768498420715, + "learning_rate": 6.909736377225167e-05, + "loss": 3.9752, + "step": 36405 + }, + { + "epoch": 2.4738415545590433, + "grad_norm": 0.17185169458389282, + "learning_rate": 6.90931172713684e-05, + "loss": 3.9219, + "step": 36410 + }, + { + "epoch": 2.474181274629705, + "grad_norm": 0.22774919867515564, + "learning_rate": 6.908887077048512e-05, + "loss": 3.8514, + "step": 36415 + }, + { + "epoch": 2.474520994700367, + "grad_norm": 0.18486806750297546, + "learning_rate": 6.908462426960185e-05, + "loss": 3.8778, + "step": 36420 + }, + { + "epoch": 2.4748607147710286, + "grad_norm": 0.17202502489089966, + "learning_rate": 6.908037776871858e-05, + "loss": 3.6759, + "step": 36425 + }, + { + "epoch": 2.4752004348416903, + "grad_norm": 0.14686466753482819, + "learning_rate": 6.90761312678353e-05, + "loss": 3.808, + "step": 36430 + }, + { + "epoch": 2.4755401549123524, + "grad_norm": 0.20533357560634613, + "learning_rate": 6.907188476695203e-05, + "loss": 3.8847, + "step": 36435 + }, + { + "epoch": 2.475879874983014, + "grad_norm": 0.19715513288974762, + "learning_rate": 6.906763826606876e-05, + "loss": 3.9363, + "step": 36440 + }, + { + "epoch": 2.4762195950536756, + "grad_norm": 0.37111571431159973, + "learning_rate": 6.906339176518549e-05, + "loss": 4.0004, + "step": 36445 + }, + { + "epoch": 2.4765593151243377, + "grad_norm": 0.14974269270896912, + "learning_rate": 6.905914526430222e-05, + "loss": 3.974, + "step": 36450 + }, + { + "epoch": 2.4768990351949993, + "grad_norm": 0.7100102305412292, + "learning_rate": 6.905489876341895e-05, + "loss": 4.1026, + "step": 36455 + }, + { + "epoch": 2.477238755265661, + "grad_norm": 0.1699613332748413, + "learning_rate": 6.905065226253567e-05, + "loss": 3.962, + "step": 36460 + }, + { + "epoch": 2.477578475336323, + "grad_norm": 2.957294225692749, + "learning_rate": 6.90464057616524e-05, + "loss": 3.9006, + "step": 36465 + }, + { + "epoch": 2.4779181954069847, + "grad_norm": 0.16161565482616425, + "learning_rate": 6.904215926076913e-05, + "loss": 3.8419, + "step": 36470 + }, + { + "epoch": 2.4782579154776463, + "grad_norm": 0.18636150658130646, + "learning_rate": 6.903791275988586e-05, + "loss": 3.9178, + "step": 36475 + }, + { + "epoch": 2.4785976355483084, + "grad_norm": 0.196391761302948, + "learning_rate": 6.903366625900259e-05, + "loss": 3.8478, + "step": 36480 + }, + { + "epoch": 2.47893735561897, + "grad_norm": 0.20189696550369263, + "learning_rate": 6.902941975811931e-05, + "loss": 3.8303, + "step": 36485 + }, + { + "epoch": 2.4792770756896316, + "grad_norm": 0.3984149992465973, + "learning_rate": 6.902517325723604e-05, + "loss": 3.8051, + "step": 36490 + }, + { + "epoch": 2.4796167957602937, + "grad_norm": 0.15551187098026276, + "learning_rate": 6.902092675635277e-05, + "loss": 3.9498, + "step": 36495 + }, + { + "epoch": 2.4799565158309553, + "grad_norm": 0.17533212900161743, + "learning_rate": 6.90166802554695e-05, + "loss": 3.7421, + "step": 36500 + }, + { + "epoch": 2.480296235901617, + "grad_norm": 0.1503629833459854, + "learning_rate": 6.901243375458623e-05, + "loss": 3.9795, + "step": 36505 + }, + { + "epoch": 2.4806359559722786, + "grad_norm": 0.1473758965730667, + "learning_rate": 6.900818725370295e-05, + "loss": 3.7965, + "step": 36510 + }, + { + "epoch": 2.4809756760429407, + "grad_norm": 0.2715904116630554, + "learning_rate": 6.900394075281968e-05, + "loss": 3.6924, + "step": 36515 + }, + { + "epoch": 2.4813153961136023, + "grad_norm": 0.19850610196590424, + "learning_rate": 6.899969425193641e-05, + "loss": 3.8136, + "step": 36520 + }, + { + "epoch": 2.481655116184264, + "grad_norm": 0.21291278302669525, + "learning_rate": 6.899544775105314e-05, + "loss": 3.9542, + "step": 36525 + }, + { + "epoch": 2.481994836254926, + "grad_norm": 0.35965272784233093, + "learning_rate": 6.899120125016987e-05, + "loss": 3.8554, + "step": 36530 + }, + { + "epoch": 2.4823345563255876, + "grad_norm": 0.15751507878303528, + "learning_rate": 6.89869547492866e-05, + "loss": 3.9351, + "step": 36535 + }, + { + "epoch": 2.4826742763962493, + "grad_norm": 0.14773280918598175, + "learning_rate": 6.898270824840331e-05, + "loss": 3.716, + "step": 36540 + }, + { + "epoch": 2.4830139964669113, + "grad_norm": 0.17989963293075562, + "learning_rate": 6.897846174752005e-05, + "loss": 3.931, + "step": 36545 + }, + { + "epoch": 2.483353716537573, + "grad_norm": 0.18733417987823486, + "learning_rate": 6.897421524663678e-05, + "loss": 3.7158, + "step": 36550 + }, + { + "epoch": 2.4836934366082346, + "grad_norm": 0.22946175932884216, + "learning_rate": 6.89699687457535e-05, + "loss": 3.8742, + "step": 36555 + }, + { + "epoch": 2.4840331566788967, + "grad_norm": 0.20864397287368774, + "learning_rate": 6.896572224487023e-05, + "loss": 3.6692, + "step": 36560 + }, + { + "epoch": 2.4843728767495583, + "grad_norm": 0.21347060799598694, + "learning_rate": 6.896147574398696e-05, + "loss": 3.6318, + "step": 36565 + }, + { + "epoch": 2.48471259682022, + "grad_norm": 0.1865234524011612, + "learning_rate": 6.895722924310368e-05, + "loss": 3.7677, + "step": 36570 + }, + { + "epoch": 2.485052316890882, + "grad_norm": 0.29156431555747986, + "learning_rate": 6.895298274222042e-05, + "loss": 4.0313, + "step": 36575 + }, + { + "epoch": 2.4853920369615436, + "grad_norm": 0.19256886839866638, + "learning_rate": 6.894873624133715e-05, + "loss": 4.0834, + "step": 36580 + }, + { + "epoch": 2.4857317570322053, + "grad_norm": 0.1544439196586609, + "learning_rate": 6.894448974045386e-05, + "loss": 3.8155, + "step": 36585 + }, + { + "epoch": 2.4860714771028674, + "grad_norm": 0.15387749671936035, + "learning_rate": 6.89402432395706e-05, + "loss": 3.983, + "step": 36590 + }, + { + "epoch": 2.486411197173529, + "grad_norm": 0.26898708939552307, + "learning_rate": 6.893599673868733e-05, + "loss": 3.6406, + "step": 36595 + }, + { + "epoch": 2.4867509172441906, + "grad_norm": 0.16805878281593323, + "learning_rate": 6.893175023780405e-05, + "loss": 3.9483, + "step": 36600 + }, + { + "epoch": 2.4870906373148527, + "grad_norm": 0.19128736853599548, + "learning_rate": 6.892750373692079e-05, + "loss": 3.7505, + "step": 36605 + }, + { + "epoch": 2.4874303573855143, + "grad_norm": 0.21433793008327484, + "learning_rate": 6.892325723603751e-05, + "loss": 4.0699, + "step": 36610 + }, + { + "epoch": 2.487770077456176, + "grad_norm": 0.2221391797065735, + "learning_rate": 6.891901073515423e-05, + "loss": 4.0631, + "step": 36615 + }, + { + "epoch": 2.488109797526838, + "grad_norm": 0.17594419419765472, + "learning_rate": 6.891476423427097e-05, + "loss": 3.9975, + "step": 36620 + }, + { + "epoch": 2.4884495175974997, + "grad_norm": 0.14687439799308777, + "learning_rate": 6.891051773338769e-05, + "loss": 3.8941, + "step": 36625 + }, + { + "epoch": 2.4887892376681613, + "grad_norm": 0.41012752056121826, + "learning_rate": 6.890627123250441e-05, + "loss": 3.7533, + "step": 36630 + }, + { + "epoch": 2.4891289577388234, + "grad_norm": 0.31904882192611694, + "learning_rate": 6.890202473162116e-05, + "loss": 3.9749, + "step": 36635 + }, + { + "epoch": 2.489468677809485, + "grad_norm": 0.18380270898342133, + "learning_rate": 6.889777823073787e-05, + "loss": 3.8465, + "step": 36640 + }, + { + "epoch": 2.4898083978801466, + "grad_norm": 0.22088150680065155, + "learning_rate": 6.88935317298546e-05, + "loss": 3.9137, + "step": 36645 + }, + { + "epoch": 2.4901481179508087, + "grad_norm": 0.1601017415523529, + "learning_rate": 6.888928522897134e-05, + "loss": 3.7843, + "step": 36650 + }, + { + "epoch": 2.4904878380214703, + "grad_norm": 0.2096797525882721, + "learning_rate": 6.888503872808805e-05, + "loss": 4.126, + "step": 36655 + }, + { + "epoch": 2.490827558092132, + "grad_norm": 0.33961784839630127, + "learning_rate": 6.888079222720478e-05, + "loss": 3.7226, + "step": 36660 + }, + { + "epoch": 2.491167278162794, + "grad_norm": 0.5138489603996277, + "learning_rate": 6.887654572632152e-05, + "loss": 3.6675, + "step": 36665 + }, + { + "epoch": 2.4915069982334557, + "grad_norm": 0.1841944307088852, + "learning_rate": 6.887229922543824e-05, + "loss": 3.7279, + "step": 36670 + }, + { + "epoch": 2.4918467183041173, + "grad_norm": 0.17057378590106964, + "learning_rate": 6.886805272455497e-05, + "loss": 3.7042, + "step": 36675 + }, + { + "epoch": 2.4921864383747794, + "grad_norm": 0.18169564008712769, + "learning_rate": 6.886380622367171e-05, + "loss": 3.7882, + "step": 36680 + }, + { + "epoch": 2.492526158445441, + "grad_norm": 0.1736309975385666, + "learning_rate": 6.885955972278842e-05, + "loss": 3.6839, + "step": 36685 + }, + { + "epoch": 2.4928658785161026, + "grad_norm": 0.19687415659427643, + "learning_rate": 6.885531322190515e-05, + "loss": 3.612, + "step": 36690 + }, + { + "epoch": 2.4932055985867647, + "grad_norm": 0.15839624404907227, + "learning_rate": 6.885106672102188e-05, + "loss": 3.8889, + "step": 36695 + }, + { + "epoch": 2.4935453186574263, + "grad_norm": 0.15997445583343506, + "learning_rate": 6.88468202201386e-05, + "loss": 3.9605, + "step": 36700 + }, + { + "epoch": 2.493885038728088, + "grad_norm": 0.20338478684425354, + "learning_rate": 6.884257371925533e-05, + "loss": 3.5935, + "step": 36705 + }, + { + "epoch": 2.49422475879875, + "grad_norm": 0.19186271727085114, + "learning_rate": 6.883832721837206e-05, + "loss": 3.8726, + "step": 36710 + }, + { + "epoch": 2.4945644788694117, + "grad_norm": 0.16073182225227356, + "learning_rate": 6.883408071748879e-05, + "loss": 4.0546, + "step": 36715 + }, + { + "epoch": 2.4949041989400733, + "grad_norm": 0.35230588912963867, + "learning_rate": 6.882983421660552e-05, + "loss": 3.8475, + "step": 36720 + }, + { + "epoch": 2.4952439190107354, + "grad_norm": 0.19177311658859253, + "learning_rate": 6.882558771572225e-05, + "loss": 3.7469, + "step": 36725 + }, + { + "epoch": 2.495583639081397, + "grad_norm": 0.13811123371124268, + "learning_rate": 6.882134121483897e-05, + "loss": 3.8118, + "step": 36730 + }, + { + "epoch": 2.4959233591520587, + "grad_norm": 0.21237850189208984, + "learning_rate": 6.88170947139557e-05, + "loss": 4.0898, + "step": 36735 + }, + { + "epoch": 2.4962630792227207, + "grad_norm": 0.2072351723909378, + "learning_rate": 6.881284821307243e-05, + "loss": 3.9222, + "step": 36740 + }, + { + "epoch": 2.4966027992933824, + "grad_norm": 4.950149059295654, + "learning_rate": 6.880860171218916e-05, + "loss": 3.7785, + "step": 36745 + }, + { + "epoch": 2.496942519364044, + "grad_norm": 0.148143470287323, + "learning_rate": 6.880435521130589e-05, + "loss": 4.0277, + "step": 36750 + }, + { + "epoch": 2.4972822394347056, + "grad_norm": 0.515773594379425, + "learning_rate": 6.880010871042261e-05, + "loss": 3.779, + "step": 36755 + }, + { + "epoch": 2.4976219595053677, + "grad_norm": 0.14682301878929138, + "learning_rate": 6.879586220953934e-05, + "loss": 3.6341, + "step": 36760 + }, + { + "epoch": 2.4979616795760293, + "grad_norm": 0.15449778735637665, + "learning_rate": 6.879161570865607e-05, + "loss": 3.7955, + "step": 36765 + }, + { + "epoch": 2.498301399646691, + "grad_norm": 0.19297455251216888, + "learning_rate": 6.87873692077728e-05, + "loss": 3.8604, + "step": 36770 + }, + { + "epoch": 2.498641119717353, + "grad_norm": 0.955293595790863, + "learning_rate": 6.878312270688953e-05, + "loss": 3.7316, + "step": 36775 + }, + { + "epoch": 2.4989808397880147, + "grad_norm": 0.14109186828136444, + "learning_rate": 6.877887620600625e-05, + "loss": 3.8783, + "step": 36780 + }, + { + "epoch": 2.4993205598586763, + "grad_norm": 0.18919122219085693, + "learning_rate": 6.877462970512298e-05, + "loss": 3.8088, + "step": 36785 + }, + { + "epoch": 2.4996602799293384, + "grad_norm": 0.23277361690998077, + "learning_rate": 6.877038320423971e-05, + "loss": 3.7538, + "step": 36790 + }, + { + "epoch": 2.5, + "grad_norm": 0.17489562928676605, + "learning_rate": 6.876613670335644e-05, + "loss": 3.7379, + "step": 36795 + }, + { + "epoch": 2.5003397200706616, + "grad_norm": 0.1922277957201004, + "learning_rate": 6.876189020247317e-05, + "loss": 3.9552, + "step": 36800 + }, + { + "epoch": 2.5006794401413237, + "grad_norm": 0.7307527661323547, + "learning_rate": 6.87576437015899e-05, + "loss": 3.9696, + "step": 36805 + }, + { + "epoch": 2.5010191602119853, + "grad_norm": 0.18264976143836975, + "learning_rate": 6.875339720070662e-05, + "loss": 4.0197, + "step": 36810 + }, + { + "epoch": 2.501358880282647, + "grad_norm": 0.15874779224395752, + "learning_rate": 6.874915069982335e-05, + "loss": 3.8462, + "step": 36815 + }, + { + "epoch": 2.5016986003533086, + "grad_norm": 0.17949306964874268, + "learning_rate": 6.874490419894008e-05, + "loss": 4.0236, + "step": 36820 + }, + { + "epoch": 2.5020383204239707, + "grad_norm": 0.1570228636264801, + "learning_rate": 6.87406576980568e-05, + "loss": 3.9732, + "step": 36825 + }, + { + "epoch": 2.5023780404946323, + "grad_norm": 0.21446073055267334, + "learning_rate": 6.873641119717353e-05, + "loss": 3.8508, + "step": 36830 + }, + { + "epoch": 2.502717760565294, + "grad_norm": 0.18032142519950867, + "learning_rate": 6.873216469629026e-05, + "loss": 3.6901, + "step": 36835 + }, + { + "epoch": 2.503057480635956, + "grad_norm": 0.1520104557275772, + "learning_rate": 6.872791819540699e-05, + "loss": 3.8753, + "step": 36840 + }, + { + "epoch": 2.5033972007066176, + "grad_norm": 0.26639553904533386, + "learning_rate": 6.872367169452372e-05, + "loss": 3.989, + "step": 36845 + }, + { + "epoch": 2.5037369207772793, + "grad_norm": 0.1733635514974594, + "learning_rate": 6.871942519364045e-05, + "loss": 3.825, + "step": 36850 + }, + { + "epoch": 2.5040766408479413, + "grad_norm": 0.172599196434021, + "learning_rate": 6.871517869275717e-05, + "loss": 3.8003, + "step": 36855 + }, + { + "epoch": 2.504416360918603, + "grad_norm": 0.18700887262821198, + "learning_rate": 6.87109321918739e-05, + "loss": 4.0965, + "step": 36860 + }, + { + "epoch": 2.5047560809892646, + "grad_norm": 0.19338759779930115, + "learning_rate": 6.870668569099063e-05, + "loss": 3.9177, + "step": 36865 + }, + { + "epoch": 2.5050958010599267, + "grad_norm": 0.1625017672777176, + "learning_rate": 6.870243919010736e-05, + "loss": 3.5424, + "step": 36870 + }, + { + "epoch": 2.5054355211305883, + "grad_norm": 0.27526217699050903, + "learning_rate": 6.869819268922409e-05, + "loss": 3.7464, + "step": 36875 + }, + { + "epoch": 2.50577524120125, + "grad_norm": 0.1613265573978424, + "learning_rate": 6.869394618834081e-05, + "loss": 3.8147, + "step": 36880 + }, + { + "epoch": 2.506114961271912, + "grad_norm": 1.4922893047332764, + "learning_rate": 6.868969968745754e-05, + "loss": 3.792, + "step": 36885 + }, + { + "epoch": 2.5064546813425737, + "grad_norm": 0.17755438387393951, + "learning_rate": 6.868545318657427e-05, + "loss": 3.8747, + "step": 36890 + }, + { + "epoch": 2.5067944014132353, + "grad_norm": 0.236799955368042, + "learning_rate": 6.868120668569098e-05, + "loss": 3.7507, + "step": 36895 + }, + { + "epoch": 2.5071341214838974, + "grad_norm": 0.18457596004009247, + "learning_rate": 6.867696018480773e-05, + "loss": 3.6472, + "step": 36900 + }, + { + "epoch": 2.507473841554559, + "grad_norm": 0.14698894321918488, + "learning_rate": 6.867271368392445e-05, + "loss": 3.8866, + "step": 36905 + }, + { + "epoch": 2.5078135616252206, + "grad_norm": 0.13453157246112823, + "learning_rate": 6.866846718304117e-05, + "loss": 3.6853, + "step": 36910 + }, + { + "epoch": 2.5081532816958827, + "grad_norm": 0.21283742785453796, + "learning_rate": 6.866422068215791e-05, + "loss": 3.8756, + "step": 36915 + }, + { + "epoch": 2.5084930017665443, + "grad_norm": 0.2871435880661011, + "learning_rate": 6.865997418127464e-05, + "loss": 3.7318, + "step": 36920 + }, + { + "epoch": 2.508832721837206, + "grad_norm": 0.7925368547439575, + "learning_rate": 6.865572768039135e-05, + "loss": 4.0384, + "step": 36925 + }, + { + "epoch": 2.509172441907868, + "grad_norm": 0.4974973201751709, + "learning_rate": 6.86514811795081e-05, + "loss": 3.7602, + "step": 36930 + }, + { + "epoch": 2.5095121619785297, + "grad_norm": 0.20807147026062012, + "learning_rate": 6.864723467862482e-05, + "loss": 4.0365, + "step": 36935 + }, + { + "epoch": 2.5098518820491913, + "grad_norm": 0.14254695177078247, + "learning_rate": 6.864298817774154e-05, + "loss": 3.9309, + "step": 36940 + }, + { + "epoch": 2.5101916021198534, + "grad_norm": 0.20508956909179688, + "learning_rate": 6.863874167685828e-05, + "loss": 3.9114, + "step": 36945 + }, + { + "epoch": 2.510531322190515, + "grad_norm": 0.21126489341259003, + "learning_rate": 6.8634495175975e-05, + "loss": 3.9451, + "step": 36950 + }, + { + "epoch": 2.5108710422611766, + "grad_norm": 0.1898466795682907, + "learning_rate": 6.863024867509172e-05, + "loss": 3.8809, + "step": 36955 + }, + { + "epoch": 2.5112107623318387, + "grad_norm": 0.1262410283088684, + "learning_rate": 6.862600217420846e-05, + "loss": 3.946, + "step": 36960 + }, + { + "epoch": 2.5115504824025003, + "grad_norm": 0.23894302546977997, + "learning_rate": 6.862175567332518e-05, + "loss": 3.7579, + "step": 36965 + }, + { + "epoch": 2.511890202473162, + "grad_norm": 0.15802593529224396, + "learning_rate": 6.86175091724419e-05, + "loss": 3.8838, + "step": 36970 + }, + { + "epoch": 2.512229922543824, + "grad_norm": 0.15118566155433655, + "learning_rate": 6.861326267155865e-05, + "loss": 3.9696, + "step": 36975 + }, + { + "epoch": 2.5125696426144857, + "grad_norm": 0.23705926537513733, + "learning_rate": 6.860901617067536e-05, + "loss": 3.8995, + "step": 36980 + }, + { + "epoch": 2.5129093626851473, + "grad_norm": 0.16452059149742126, + "learning_rate": 6.860476966979209e-05, + "loss": 3.8568, + "step": 36985 + }, + { + "epoch": 2.5132490827558094, + "grad_norm": 0.1710859090089798, + "learning_rate": 6.860052316890883e-05, + "loss": 4.0499, + "step": 36990 + }, + { + "epoch": 2.513588802826471, + "grad_norm": 0.1798645704984665, + "learning_rate": 6.859627666802554e-05, + "loss": 4.0634, + "step": 36995 + }, + { + "epoch": 2.5139285228971326, + "grad_norm": 0.18639619648456573, + "learning_rate": 6.859203016714227e-05, + "loss": 3.8354, + "step": 37000 + }, + { + "epoch": 2.5142682429677947, + "grad_norm": 0.17033587396144867, + "learning_rate": 6.858778366625901e-05, + "loss": 4.1374, + "step": 37005 + }, + { + "epoch": 2.5146079630384564, + "grad_norm": 0.1570757031440735, + "learning_rate": 6.858353716537573e-05, + "loss": 3.6808, + "step": 37010 + }, + { + "epoch": 2.514947683109118, + "grad_norm": 3.318307876586914, + "learning_rate": 6.857929066449246e-05, + "loss": 3.8253, + "step": 37015 + }, + { + "epoch": 2.51528740317978, + "grad_norm": 5.834548473358154, + "learning_rate": 6.85750441636092e-05, + "loss": 3.7383, + "step": 37020 + }, + { + "epoch": 2.5156271232504417, + "grad_norm": 0.1615428626537323, + "learning_rate": 6.857079766272591e-05, + "loss": 3.7453, + "step": 37025 + }, + { + "epoch": 2.5159668433211033, + "grad_norm": 0.1419934332370758, + "learning_rate": 6.856655116184264e-05, + "loss": 3.8005, + "step": 37030 + }, + { + "epoch": 2.5163065633917654, + "grad_norm": 2.5508172512054443, + "learning_rate": 6.856230466095938e-05, + "loss": 3.6519, + "step": 37035 + }, + { + "epoch": 2.516646283462427, + "grad_norm": 0.1440742015838623, + "learning_rate": 6.85580581600761e-05, + "loss": 3.8325, + "step": 37040 + }, + { + "epoch": 2.5169860035330887, + "grad_norm": 0.19688092172145844, + "learning_rate": 6.855381165919282e-05, + "loss": 3.7759, + "step": 37045 + }, + { + "epoch": 2.5173257236037507, + "grad_norm": 0.18967199325561523, + "learning_rate": 6.854956515830955e-05, + "loss": 3.9638, + "step": 37050 + }, + { + "epoch": 2.5176654436744124, + "grad_norm": 1.2481505870819092, + "learning_rate": 6.854531865742628e-05, + "loss": 3.9547, + "step": 37055 + }, + { + "epoch": 2.518005163745074, + "grad_norm": 0.22856104373931885, + "learning_rate": 6.854107215654301e-05, + "loss": 3.8434, + "step": 37060 + }, + { + "epoch": 2.518344883815736, + "grad_norm": 0.261053204536438, + "learning_rate": 6.853682565565974e-05, + "loss": 3.9378, + "step": 37065 + }, + { + "epoch": 2.5186846038863977, + "grad_norm": 0.17151080071926117, + "learning_rate": 6.853257915477646e-05, + "loss": 3.9127, + "step": 37070 + }, + { + "epoch": 2.5190243239570593, + "grad_norm": 0.14898762106895447, + "learning_rate": 6.852833265389319e-05, + "loss": 3.5786, + "step": 37075 + }, + { + "epoch": 2.5193640440277214, + "grad_norm": 0.1968889981508255, + "learning_rate": 6.852408615300992e-05, + "loss": 4.094, + "step": 37080 + }, + { + "epoch": 2.519703764098383, + "grad_norm": 0.15498849749565125, + "learning_rate": 6.851983965212665e-05, + "loss": 3.8282, + "step": 37085 + }, + { + "epoch": 2.5200434841690447, + "grad_norm": 0.1642228364944458, + "learning_rate": 6.851559315124338e-05, + "loss": 3.8813, + "step": 37090 + }, + { + "epoch": 2.5203832042397067, + "grad_norm": 0.1818743497133255, + "learning_rate": 6.85113466503601e-05, + "loss": 3.9483, + "step": 37095 + }, + { + "epoch": 2.5207229243103684, + "grad_norm": 0.16608983278274536, + "learning_rate": 6.850710014947683e-05, + "loss": 3.8149, + "step": 37100 + }, + { + "epoch": 2.52106264438103, + "grad_norm": 0.19420260190963745, + "learning_rate": 6.850285364859356e-05, + "loss": 3.6272, + "step": 37105 + }, + { + "epoch": 2.521402364451692, + "grad_norm": 0.2887604236602783, + "learning_rate": 6.849860714771029e-05, + "loss": 3.963, + "step": 37110 + }, + { + "epoch": 2.5217420845223537, + "grad_norm": 3.278733491897583, + "learning_rate": 6.849436064682702e-05, + "loss": 3.8522, + "step": 37115 + }, + { + "epoch": 2.5220818045930153, + "grad_norm": 0.17978015542030334, + "learning_rate": 6.849011414594374e-05, + "loss": 4.0351, + "step": 37120 + }, + { + "epoch": 2.522421524663677, + "grad_norm": 0.22979359328746796, + "learning_rate": 6.848586764506047e-05, + "loss": 3.917, + "step": 37125 + }, + { + "epoch": 2.522761244734339, + "grad_norm": 0.2545952796936035, + "learning_rate": 6.84816211441772e-05, + "loss": 3.7174, + "step": 37130 + }, + { + "epoch": 2.5231009648050007, + "grad_norm": 0.17535294592380524, + "learning_rate": 6.847737464329393e-05, + "loss": 4.0413, + "step": 37135 + }, + { + "epoch": 2.5234406848756623, + "grad_norm": 0.1276407241821289, + "learning_rate": 6.847312814241066e-05, + "loss": 3.9433, + "step": 37140 + }, + { + "epoch": 2.5237804049463244, + "grad_norm": 0.16340528428554535, + "learning_rate": 6.846888164152738e-05, + "loss": 3.7465, + "step": 37145 + }, + { + "epoch": 2.524120125016986, + "grad_norm": 0.1809440553188324, + "learning_rate": 6.846463514064411e-05, + "loss": 3.9136, + "step": 37150 + }, + { + "epoch": 2.5244598450876476, + "grad_norm": 0.2046525776386261, + "learning_rate": 6.846038863976084e-05, + "loss": 3.8066, + "step": 37155 + }, + { + "epoch": 2.5247995651583093, + "grad_norm": 0.18030549585819244, + "learning_rate": 6.845614213887757e-05, + "loss": 3.7311, + "step": 37160 + }, + { + "epoch": 2.5251392852289714, + "grad_norm": 0.30200332403182983, + "learning_rate": 6.84518956379943e-05, + "loss": 4.0714, + "step": 37165 + }, + { + "epoch": 2.525479005299633, + "grad_norm": 0.17760871350765228, + "learning_rate": 6.844764913711102e-05, + "loss": 3.8087, + "step": 37170 + }, + { + "epoch": 2.5258187253702946, + "grad_norm": 0.2034587562084198, + "learning_rate": 6.844340263622775e-05, + "loss": 3.8646, + "step": 37175 + }, + { + "epoch": 2.5261584454409567, + "grad_norm": 0.2572284936904907, + "learning_rate": 6.843915613534448e-05, + "loss": 3.8529, + "step": 37180 + }, + { + "epoch": 2.5264981655116183, + "grad_norm": 0.16097688674926758, + "learning_rate": 6.843490963446121e-05, + "loss": 3.7431, + "step": 37185 + }, + { + "epoch": 2.52683788558228, + "grad_norm": 0.19346089661121368, + "learning_rate": 6.843066313357794e-05, + "loss": 3.8231, + "step": 37190 + }, + { + "epoch": 2.527177605652942, + "grad_norm": 0.2016330063343048, + "learning_rate": 6.842641663269466e-05, + "loss": 4.0369, + "step": 37195 + }, + { + "epoch": 2.5275173257236037, + "grad_norm": 0.19659675657749176, + "learning_rate": 6.842217013181139e-05, + "loss": 3.815, + "step": 37200 + }, + { + "epoch": 2.5278570457942653, + "grad_norm": 0.16766713559627533, + "learning_rate": 6.841792363092812e-05, + "loss": 3.8456, + "step": 37205 + }, + { + "epoch": 2.5281967658649274, + "grad_norm": 0.18897923827171326, + "learning_rate": 6.841367713004485e-05, + "loss": 3.9218, + "step": 37210 + }, + { + "epoch": 2.528536485935589, + "grad_norm": 0.19941921532154083, + "learning_rate": 6.840943062916158e-05, + "loss": 3.7606, + "step": 37215 + }, + { + "epoch": 2.5288762060062506, + "grad_norm": 0.1483783721923828, + "learning_rate": 6.84051841282783e-05, + "loss": 3.966, + "step": 37220 + }, + { + "epoch": 2.5292159260769127, + "grad_norm": 0.8834887146949768, + "learning_rate": 6.840093762739503e-05, + "loss": 3.8724, + "step": 37225 + }, + { + "epoch": 2.5295556461475743, + "grad_norm": 0.15295159816741943, + "learning_rate": 6.839669112651176e-05, + "loss": 3.7561, + "step": 37230 + }, + { + "epoch": 2.529895366218236, + "grad_norm": 0.2185949981212616, + "learning_rate": 6.839244462562849e-05, + "loss": 3.7139, + "step": 37235 + }, + { + "epoch": 2.530235086288898, + "grad_norm": 0.21206295490264893, + "learning_rate": 6.838819812474522e-05, + "loss": 3.8375, + "step": 37240 + }, + { + "epoch": 2.5305748063595597, + "grad_norm": 1.2398308515548706, + "learning_rate": 6.838395162386194e-05, + "loss": 3.6989, + "step": 37245 + }, + { + "epoch": 2.5309145264302213, + "grad_norm": 0.15674518048763275, + "learning_rate": 6.837970512297866e-05, + "loss": 3.8764, + "step": 37250 + }, + { + "epoch": 2.5312542465008834, + "grad_norm": 0.26454171538352966, + "learning_rate": 6.83754586220954e-05, + "loss": 4.0263, + "step": 37255 + }, + { + "epoch": 2.531593966571545, + "grad_norm": 0.1856747716665268, + "learning_rate": 6.837121212121213e-05, + "loss": 3.8761, + "step": 37260 + }, + { + "epoch": 2.5319336866422066, + "grad_norm": 2.554631233215332, + "learning_rate": 6.836696562032884e-05, + "loss": 3.7247, + "step": 37265 + }, + { + "epoch": 2.5322734067128687, + "grad_norm": 0.6254364848136902, + "learning_rate": 6.836271911944558e-05, + "loss": 3.7315, + "step": 37270 + }, + { + "epoch": 2.5326131267835303, + "grad_norm": 0.15747632086277008, + "learning_rate": 6.835847261856231e-05, + "loss": 3.9082, + "step": 37275 + }, + { + "epoch": 2.532952846854192, + "grad_norm": 0.1845155507326126, + "learning_rate": 6.835422611767903e-05, + "loss": 3.8556, + "step": 37280 + }, + { + "epoch": 2.533292566924854, + "grad_norm": 0.20521917939186096, + "learning_rate": 6.834997961679577e-05, + "loss": 3.8626, + "step": 37285 + }, + { + "epoch": 2.5336322869955157, + "grad_norm": 0.5883710980415344, + "learning_rate": 6.83457331159125e-05, + "loss": 3.9828, + "step": 37290 + }, + { + "epoch": 2.5339720070661773, + "grad_norm": 0.19409498572349548, + "learning_rate": 6.834148661502921e-05, + "loss": 3.7104, + "step": 37295 + }, + { + "epoch": 2.5343117271368394, + "grad_norm": 0.2062024474143982, + "learning_rate": 6.833724011414595e-05, + "loss": 3.8031, + "step": 37300 + }, + { + "epoch": 2.534651447207501, + "grad_norm": 0.2450551986694336, + "learning_rate": 6.833299361326268e-05, + "loss": 3.9642, + "step": 37305 + }, + { + "epoch": 2.5349911672781626, + "grad_norm": 0.21008940041065216, + "learning_rate": 6.83287471123794e-05, + "loss": 3.8196, + "step": 37310 + }, + { + "epoch": 2.5353308873488247, + "grad_norm": 0.19845548272132874, + "learning_rate": 6.832450061149614e-05, + "loss": 3.8538, + "step": 37315 + }, + { + "epoch": 2.5356706074194864, + "grad_norm": 0.2679832875728607, + "learning_rate": 6.832025411061285e-05, + "loss": 3.8913, + "step": 37320 + }, + { + "epoch": 2.536010327490148, + "grad_norm": 0.1847391277551651, + "learning_rate": 6.831600760972958e-05, + "loss": 3.8725, + "step": 37325 + }, + { + "epoch": 2.53635004756081, + "grad_norm": 0.16986484825611115, + "learning_rate": 6.831176110884632e-05, + "loss": 3.9835, + "step": 37330 + }, + { + "epoch": 2.5366897676314717, + "grad_norm": 0.21023723483085632, + "learning_rate": 6.830751460796304e-05, + "loss": 3.8494, + "step": 37335 + }, + { + "epoch": 2.5370294877021333, + "grad_norm": 0.1405934989452362, + "learning_rate": 6.830326810707976e-05, + "loss": 3.8805, + "step": 37340 + }, + { + "epoch": 2.5373692077727954, + "grad_norm": 0.4292110800743103, + "learning_rate": 6.82990216061965e-05, + "loss": 4.1275, + "step": 37345 + }, + { + "epoch": 2.537708927843457, + "grad_norm": 0.18910269439220428, + "learning_rate": 6.829477510531322e-05, + "loss": 3.7382, + "step": 37350 + }, + { + "epoch": 2.5380486479141187, + "grad_norm": 0.13618789613246918, + "learning_rate": 6.829052860442995e-05, + "loss": 3.9925, + "step": 37355 + }, + { + "epoch": 2.5383883679847807, + "grad_norm": 0.19233736395835876, + "learning_rate": 6.828628210354669e-05, + "loss": 3.8968, + "step": 37360 + }, + { + "epoch": 2.5387280880554424, + "grad_norm": 0.17334012687206268, + "learning_rate": 6.82820356026634e-05, + "loss": 3.7256, + "step": 37365 + }, + { + "epoch": 2.539067808126104, + "grad_norm": 0.2075682431459427, + "learning_rate": 6.827778910178013e-05, + "loss": 3.9457, + "step": 37370 + }, + { + "epoch": 2.539407528196766, + "grad_norm": 0.14943023025989532, + "learning_rate": 6.827354260089687e-05, + "loss": 3.9622, + "step": 37375 + }, + { + "epoch": 2.5397472482674277, + "grad_norm": 0.36123406887054443, + "learning_rate": 6.826929610001359e-05, + "loss": 3.8448, + "step": 37380 + }, + { + "epoch": 2.5400869683380893, + "grad_norm": 0.16059312224388123, + "learning_rate": 6.826504959913032e-05, + "loss": 3.9459, + "step": 37385 + }, + { + "epoch": 2.5404266884087514, + "grad_norm": 0.44739997386932373, + "learning_rate": 6.826080309824706e-05, + "loss": 3.9415, + "step": 37390 + }, + { + "epoch": 2.540766408479413, + "grad_norm": 1.6643913984298706, + "learning_rate": 6.825655659736377e-05, + "loss": 3.8184, + "step": 37395 + }, + { + "epoch": 2.5411061285500747, + "grad_norm": 0.1674775928258896, + "learning_rate": 6.82523100964805e-05, + "loss": 4.0072, + "step": 37400 + }, + { + "epoch": 2.5414458486207367, + "grad_norm": 0.15180304646492004, + "learning_rate": 6.824806359559723e-05, + "loss": 3.6227, + "step": 37405 + }, + { + "epoch": 2.5417855686913984, + "grad_norm": 0.3795605003833771, + "learning_rate": 6.824381709471396e-05, + "loss": 3.8564, + "step": 37410 + }, + { + "epoch": 2.54212528876206, + "grad_norm": 0.3689451515674591, + "learning_rate": 6.823957059383068e-05, + "loss": 4.1729, + "step": 37415 + }, + { + "epoch": 2.542465008832722, + "grad_norm": 0.16798289120197296, + "learning_rate": 6.823532409294741e-05, + "loss": 4.2136, + "step": 37420 + }, + { + "epoch": 2.5428047289033837, + "grad_norm": 0.17149914801120758, + "learning_rate": 6.823107759206414e-05, + "loss": 3.9222, + "step": 37425 + }, + { + "epoch": 2.5431444489740453, + "grad_norm": 0.18577635288238525, + "learning_rate": 6.822683109118087e-05, + "loss": 3.89, + "step": 37430 + }, + { + "epoch": 2.5434841690447074, + "grad_norm": 0.21951167285442352, + "learning_rate": 6.82225845902976e-05, + "loss": 3.9858, + "step": 37435 + }, + { + "epoch": 2.543823889115369, + "grad_norm": 0.20003795623779297, + "learning_rate": 6.821833808941432e-05, + "loss": 3.746, + "step": 37440 + }, + { + "epoch": 2.5441636091860307, + "grad_norm": 0.199764683842659, + "learning_rate": 6.821409158853105e-05, + "loss": 3.6965, + "step": 37445 + }, + { + "epoch": 2.5445033292566928, + "grad_norm": 0.18052585422992706, + "learning_rate": 6.820984508764778e-05, + "loss": 3.8418, + "step": 37450 + }, + { + "epoch": 2.5448430493273544, + "grad_norm": 0.20761530101299286, + "learning_rate": 6.820559858676451e-05, + "loss": 3.7387, + "step": 37455 + }, + { + "epoch": 2.545182769398016, + "grad_norm": 0.501528263092041, + "learning_rate": 6.820135208588124e-05, + "loss": 4.0116, + "step": 37460 + }, + { + "epoch": 2.5455224894686777, + "grad_norm": 0.21214178204536438, + "learning_rate": 6.819710558499796e-05, + "loss": 3.7276, + "step": 37465 + }, + { + "epoch": 2.5458622095393397, + "grad_norm": 0.15767259895801544, + "learning_rate": 6.819285908411469e-05, + "loss": 3.8607, + "step": 37470 + }, + { + "epoch": 2.5462019296100014, + "grad_norm": 0.24336351454257965, + "learning_rate": 6.818861258323142e-05, + "loss": 3.842, + "step": 37475 + }, + { + "epoch": 2.546541649680663, + "grad_norm": 0.2135043442249298, + "learning_rate": 6.818436608234815e-05, + "loss": 3.8399, + "step": 37480 + }, + { + "epoch": 2.546881369751325, + "grad_norm": 0.16207145154476166, + "learning_rate": 6.818011958146488e-05, + "loss": 4.0022, + "step": 37485 + }, + { + "epoch": 2.5472210898219867, + "grad_norm": 0.19801953434944153, + "learning_rate": 6.81758730805816e-05, + "loss": 4.0122, + "step": 37490 + }, + { + "epoch": 2.5475608098926483, + "grad_norm": 0.21286916732788086, + "learning_rate": 6.817162657969833e-05, + "loss": 3.6767, + "step": 37495 + }, + { + "epoch": 2.54790052996331, + "grad_norm": 0.17849916219711304, + "learning_rate": 6.816738007881506e-05, + "loss": 3.8674, + "step": 37500 + }, + { + "epoch": 2.548240250033972, + "grad_norm": 0.15600281953811646, + "learning_rate": 6.816313357793179e-05, + "loss": 3.9013, + "step": 37505 + }, + { + "epoch": 2.5485799701046337, + "grad_norm": 0.45286133885383606, + "learning_rate": 6.815888707704852e-05, + "loss": 3.9971, + "step": 37510 + }, + { + "epoch": 2.5489196901752953, + "grad_norm": 0.16927848756313324, + "learning_rate": 6.815464057616524e-05, + "loss": 3.915, + "step": 37515 + }, + { + "epoch": 2.5492594102459574, + "grad_norm": 0.15050309896469116, + "learning_rate": 6.815039407528197e-05, + "loss": 3.6953, + "step": 37520 + }, + { + "epoch": 2.549599130316619, + "grad_norm": 0.18721088767051697, + "learning_rate": 6.81461475743987e-05, + "loss": 4.0704, + "step": 37525 + }, + { + "epoch": 2.5499388503872806, + "grad_norm": 2.4701156616210938, + "learning_rate": 6.814190107351543e-05, + "loss": 3.8449, + "step": 37530 + }, + { + "epoch": 2.5502785704579427, + "grad_norm": 0.13386307656764984, + "learning_rate": 6.813765457263216e-05, + "loss": 4.0506, + "step": 37535 + }, + { + "epoch": 2.5506182905286043, + "grad_norm": 0.16844066977500916, + "learning_rate": 6.813340807174888e-05, + "loss": 3.8509, + "step": 37540 + }, + { + "epoch": 2.550958010599266, + "grad_norm": 0.21340049803256989, + "learning_rate": 6.812916157086561e-05, + "loss": 3.8405, + "step": 37545 + }, + { + "epoch": 2.551297730669928, + "grad_norm": 0.19400613009929657, + "learning_rate": 6.812491506998234e-05, + "loss": 3.6957, + "step": 37550 + }, + { + "epoch": 2.5516374507405897, + "grad_norm": 0.1926385909318924, + "learning_rate": 6.812066856909907e-05, + "loss": 3.5689, + "step": 37555 + }, + { + "epoch": 2.5519771708112513, + "grad_norm": 0.17507615685462952, + "learning_rate": 6.81164220682158e-05, + "loss": 3.543, + "step": 37560 + }, + { + "epoch": 2.5523168908819134, + "grad_norm": 0.23209261894226074, + "learning_rate": 6.811217556733252e-05, + "loss": 3.8137, + "step": 37565 + }, + { + "epoch": 2.552656610952575, + "grad_norm": 0.21988238394260406, + "learning_rate": 6.810792906644925e-05, + "loss": 3.8507, + "step": 37570 + }, + { + "epoch": 2.5529963310232366, + "grad_norm": 0.15827305614948273, + "learning_rate": 6.810368256556598e-05, + "loss": 3.6983, + "step": 37575 + }, + { + "epoch": 2.5533360510938987, + "grad_norm": 0.21784718334674835, + "learning_rate": 6.809943606468271e-05, + "loss": 4.0944, + "step": 37580 + }, + { + "epoch": 2.5536757711645603, + "grad_norm": 0.1466989368200302, + "learning_rate": 6.809518956379944e-05, + "loss": 3.6597, + "step": 37585 + }, + { + "epoch": 2.554015491235222, + "grad_norm": 0.18900702893733978, + "learning_rate": 6.809094306291616e-05, + "loss": 3.8889, + "step": 37590 + }, + { + "epoch": 2.554355211305884, + "grad_norm": 0.19810500741004944, + "learning_rate": 6.808669656203289e-05, + "loss": 3.85, + "step": 37595 + }, + { + "epoch": 2.5546949313765457, + "grad_norm": 0.1593780815601349, + "learning_rate": 6.808245006114962e-05, + "loss": 3.6895, + "step": 37600 + }, + { + "epoch": 2.5550346514472073, + "grad_norm": 0.7104688286781311, + "learning_rate": 6.807820356026633e-05, + "loss": 3.9656, + "step": 37605 + }, + { + "epoch": 2.5553743715178694, + "grad_norm": 1.5622491836547852, + "learning_rate": 6.807395705938308e-05, + "loss": 3.8924, + "step": 37610 + }, + { + "epoch": 2.555714091588531, + "grad_norm": 0.27539727091789246, + "learning_rate": 6.80697105584998e-05, + "loss": 3.7921, + "step": 37615 + }, + { + "epoch": 2.5560538116591927, + "grad_norm": 0.16209836304187775, + "learning_rate": 6.806546405761652e-05, + "loss": 4.0312, + "step": 37620 + }, + { + "epoch": 2.5563935317298547, + "grad_norm": 0.2975935637950897, + "learning_rate": 6.806121755673326e-05, + "loss": 4.0881, + "step": 37625 + }, + { + "epoch": 2.5567332518005164, + "grad_norm": 0.1771213710308075, + "learning_rate": 6.805697105584999e-05, + "loss": 3.7846, + "step": 37630 + }, + { + "epoch": 2.557072971871178, + "grad_norm": 0.44063931703567505, + "learning_rate": 6.80527245549667e-05, + "loss": 3.9526, + "step": 37635 + }, + { + "epoch": 2.55741269194184, + "grad_norm": 0.2503744661808014, + "learning_rate": 6.804847805408344e-05, + "loss": 3.7513, + "step": 37640 + }, + { + "epoch": 2.5577524120125017, + "grad_norm": 0.18494661152362823, + "learning_rate": 6.804423155320017e-05, + "loss": 3.7866, + "step": 37645 + }, + { + "epoch": 2.5580921320831633, + "grad_norm": 0.21615538001060486, + "learning_rate": 6.803998505231689e-05, + "loss": 3.7493, + "step": 37650 + }, + { + "epoch": 2.5584318521538254, + "grad_norm": 0.1533680558204651, + "learning_rate": 6.803573855143363e-05, + "loss": 3.8788, + "step": 37655 + }, + { + "epoch": 2.558771572224487, + "grad_norm": 0.18440574407577515, + "learning_rate": 6.803149205055036e-05, + "loss": 3.8783, + "step": 37660 + }, + { + "epoch": 2.5591112922951487, + "grad_norm": 0.3476037085056305, + "learning_rate": 6.802724554966707e-05, + "loss": 3.6921, + "step": 37665 + }, + { + "epoch": 2.5594510123658107, + "grad_norm": 0.1444246917963028, + "learning_rate": 6.802299904878381e-05, + "loss": 3.8523, + "step": 37670 + }, + { + "epoch": 2.5597907324364724, + "grad_norm": 0.17224757373332977, + "learning_rate": 6.801875254790053e-05, + "loss": 3.8664, + "step": 37675 + }, + { + "epoch": 2.560130452507134, + "grad_norm": 0.17976199090480804, + "learning_rate": 6.801450604701725e-05, + "loss": 3.9499, + "step": 37680 + }, + { + "epoch": 2.560470172577796, + "grad_norm": 0.17257413268089294, + "learning_rate": 6.8010259546134e-05, + "loss": 3.9055, + "step": 37685 + }, + { + "epoch": 2.5608098926484577, + "grad_norm": 0.23615749180316925, + "learning_rate": 6.800601304525071e-05, + "loss": 4.0315, + "step": 37690 + }, + { + "epoch": 2.5611496127191193, + "grad_norm": 0.18252849578857422, + "learning_rate": 6.800176654436744e-05, + "loss": 3.8867, + "step": 37695 + }, + { + "epoch": 2.5614893327897814, + "grad_norm": 0.15434594452381134, + "learning_rate": 6.799752004348418e-05, + "loss": 3.8458, + "step": 37700 + }, + { + "epoch": 2.561829052860443, + "grad_norm": 0.19554725289344788, + "learning_rate": 6.79932735426009e-05, + "loss": 3.886, + "step": 37705 + }, + { + "epoch": 2.5621687729311047, + "grad_norm": 0.20709633827209473, + "learning_rate": 6.798902704171762e-05, + "loss": 3.8921, + "step": 37710 + }, + { + "epoch": 2.5625084930017668, + "grad_norm": 0.17742562294006348, + "learning_rate": 6.798478054083436e-05, + "loss": 3.9112, + "step": 37715 + }, + { + "epoch": 2.5628482130724284, + "grad_norm": 0.1939844787120819, + "learning_rate": 6.798053403995108e-05, + "loss": 3.8264, + "step": 37720 + }, + { + "epoch": 2.56318793314309, + "grad_norm": 0.19827881455421448, + "learning_rate": 6.79762875390678e-05, + "loss": 3.6452, + "step": 37725 + }, + { + "epoch": 2.563527653213752, + "grad_norm": 0.1751987338066101, + "learning_rate": 6.797204103818455e-05, + "loss": 3.9943, + "step": 37730 + }, + { + "epoch": 2.5638673732844137, + "grad_norm": 0.1597607433795929, + "learning_rate": 6.796779453730126e-05, + "loss": 3.8576, + "step": 37735 + }, + { + "epoch": 2.5642070933550754, + "grad_norm": 0.14269274473190308, + "learning_rate": 6.796354803641799e-05, + "loss": 3.9946, + "step": 37740 + }, + { + "epoch": 2.5645468134257374, + "grad_norm": 0.14154091477394104, + "learning_rate": 6.795930153553472e-05, + "loss": 4.0577, + "step": 37745 + }, + { + "epoch": 2.564886533496399, + "grad_norm": 0.17050643265247345, + "learning_rate": 6.795505503465145e-05, + "loss": 3.7914, + "step": 37750 + }, + { + "epoch": 2.5652262535670607, + "grad_norm": 0.14155824482440948, + "learning_rate": 6.795080853376817e-05, + "loss": 3.7881, + "step": 37755 + }, + { + "epoch": 2.5655659736377228, + "grad_norm": 0.1620555818080902, + "learning_rate": 6.79465620328849e-05, + "loss": 3.8171, + "step": 37760 + }, + { + "epoch": 2.5659056937083844, + "grad_norm": 0.2741548717021942, + "learning_rate": 6.794231553200163e-05, + "loss": 3.7996, + "step": 37765 + }, + { + "epoch": 2.566245413779046, + "grad_norm": 0.18328417837619781, + "learning_rate": 6.793806903111836e-05, + "loss": 3.8448, + "step": 37770 + }, + { + "epoch": 2.566585133849708, + "grad_norm": 0.2482127845287323, + "learning_rate": 6.793382253023509e-05, + "loss": 4.0367, + "step": 37775 + }, + { + "epoch": 2.5669248539203697, + "grad_norm": 0.18423694372177124, + "learning_rate": 6.792957602935181e-05, + "loss": 3.9035, + "step": 37780 + }, + { + "epoch": 2.5672645739910314, + "grad_norm": 0.17012067139148712, + "learning_rate": 6.792532952846854e-05, + "loss": 3.7761, + "step": 37785 + }, + { + "epoch": 2.5676042940616934, + "grad_norm": 0.15058687329292297, + "learning_rate": 6.792108302758527e-05, + "loss": 4.0868, + "step": 37790 + }, + { + "epoch": 2.567944014132355, + "grad_norm": 0.18021239340305328, + "learning_rate": 6.7916836526702e-05, + "loss": 3.7944, + "step": 37795 + }, + { + "epoch": 2.5682837342030167, + "grad_norm": 0.35091543197631836, + "learning_rate": 6.791259002581873e-05, + "loss": 4.0643, + "step": 37800 + }, + { + "epoch": 2.5686234542736783, + "grad_norm": 0.18168523907661438, + "learning_rate": 6.790834352493545e-05, + "loss": 3.8641, + "step": 37805 + }, + { + "epoch": 2.5689631743443404, + "grad_norm": 0.15439322590827942, + "learning_rate": 6.790409702405218e-05, + "loss": 3.8419, + "step": 37810 + }, + { + "epoch": 2.569302894415002, + "grad_norm": 0.33021417260169983, + "learning_rate": 6.789985052316892e-05, + "loss": 3.8836, + "step": 37815 + }, + { + "epoch": 2.5696426144856637, + "grad_norm": 0.18430358171463013, + "learning_rate": 6.789560402228564e-05, + "loss": 3.938, + "step": 37820 + }, + { + "epoch": 2.5699823345563257, + "grad_norm": 0.5247917175292969, + "learning_rate": 6.789135752140237e-05, + "loss": 3.948, + "step": 37825 + }, + { + "epoch": 2.5703220546269874, + "grad_norm": 0.1723906397819519, + "learning_rate": 6.78871110205191e-05, + "loss": 3.7563, + "step": 37830 + }, + { + "epoch": 2.570661774697649, + "grad_norm": 0.18128974735736847, + "learning_rate": 6.788286451963582e-05, + "loss": 3.766, + "step": 37835 + }, + { + "epoch": 2.5710014947683106, + "grad_norm": 0.34230056405067444, + "learning_rate": 6.787861801875255e-05, + "loss": 3.7865, + "step": 37840 + }, + { + "epoch": 2.5713412148389727, + "grad_norm": 0.2738439738750458, + "learning_rate": 6.787437151786928e-05, + "loss": 3.9021, + "step": 37845 + }, + { + "epoch": 2.5716809349096343, + "grad_norm": 0.2191234976053238, + "learning_rate": 6.7870125016986e-05, + "loss": 4.0199, + "step": 37850 + }, + { + "epoch": 2.572020654980296, + "grad_norm": 0.1668914258480072, + "learning_rate": 6.786587851610273e-05, + "loss": 4.1348, + "step": 37855 + }, + { + "epoch": 2.572360375050958, + "grad_norm": 0.1662742793560028, + "learning_rate": 6.786163201521946e-05, + "loss": 3.923, + "step": 37860 + }, + { + "epoch": 2.5727000951216197, + "grad_norm": 0.604958713054657, + "learning_rate": 6.785738551433619e-05, + "loss": 4.0649, + "step": 37865 + }, + { + "epoch": 2.5730398151922813, + "grad_norm": 0.15393346548080444, + "learning_rate": 6.785313901345292e-05, + "loss": 3.8283, + "step": 37870 + }, + { + "epoch": 2.5733795352629434, + "grad_norm": 0.1839604675769806, + "learning_rate": 6.784889251256965e-05, + "loss": 3.9532, + "step": 37875 + }, + { + "epoch": 2.573719255333605, + "grad_norm": 0.1913987398147583, + "learning_rate": 6.784464601168637e-05, + "loss": 3.7503, + "step": 37880 + }, + { + "epoch": 2.5740589754042666, + "grad_norm": 0.298496276140213, + "learning_rate": 6.78403995108031e-05, + "loss": 3.8244, + "step": 37885 + }, + { + "epoch": 2.5743986954749287, + "grad_norm": 0.1461142897605896, + "learning_rate": 6.783615300991983e-05, + "loss": 4.0077, + "step": 37890 + }, + { + "epoch": 2.5747384155455904, + "grad_norm": 0.2640954852104187, + "learning_rate": 6.783190650903656e-05, + "loss": 3.9445, + "step": 37895 + }, + { + "epoch": 2.575078135616252, + "grad_norm": 0.684228241443634, + "learning_rate": 6.782766000815329e-05, + "loss": 4.0396, + "step": 37900 + }, + { + "epoch": 2.575417855686914, + "grad_norm": 0.16707473993301392, + "learning_rate": 6.782341350727001e-05, + "loss": 4.067, + "step": 37905 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.2726849913597107, + "learning_rate": 6.781916700638674e-05, + "loss": 3.9295, + "step": 37910 + }, + { + "epoch": 2.5760972958282373, + "grad_norm": 0.2541827857494354, + "learning_rate": 6.781492050550347e-05, + "loss": 3.8949, + "step": 37915 + }, + { + "epoch": 2.5764370158988994, + "grad_norm": 0.15255402028560638, + "learning_rate": 6.78106740046202e-05, + "loss": 3.7638, + "step": 37920 + }, + { + "epoch": 2.576776735969561, + "grad_norm": 0.18284012377262115, + "learning_rate": 6.780642750373693e-05, + "loss": 3.8898, + "step": 37925 + }, + { + "epoch": 2.5771164560402227, + "grad_norm": 0.39086785912513733, + "learning_rate": 6.780218100285365e-05, + "loss": 3.697, + "step": 37930 + }, + { + "epoch": 2.5774561761108847, + "grad_norm": 0.19182196259498596, + "learning_rate": 6.779793450197038e-05, + "loss": 3.743, + "step": 37935 + }, + { + "epoch": 2.5777958961815464, + "grad_norm": 0.17669744789600372, + "learning_rate": 6.779368800108711e-05, + "loss": 3.7575, + "step": 37940 + }, + { + "epoch": 2.578135616252208, + "grad_norm": 0.9946192502975464, + "learning_rate": 6.778944150020383e-05, + "loss": 3.8401, + "step": 37945 + }, + { + "epoch": 2.57847533632287, + "grad_norm": 0.21237725019454956, + "learning_rate": 6.778519499932057e-05, + "loss": 3.7481, + "step": 37950 + }, + { + "epoch": 2.5788150563935317, + "grad_norm": 0.2892763912677765, + "learning_rate": 6.77809484984373e-05, + "loss": 3.7799, + "step": 37955 + }, + { + "epoch": 2.5791547764641933, + "grad_norm": 0.21286733448505402, + "learning_rate": 6.777670199755401e-05, + "loss": 3.628, + "step": 37960 + }, + { + "epoch": 2.5794944965348554, + "grad_norm": 0.16037790477275848, + "learning_rate": 6.777245549667075e-05, + "loss": 3.9827, + "step": 37965 + }, + { + "epoch": 2.579834216605517, + "grad_norm": 0.17625358700752258, + "learning_rate": 6.776820899578748e-05, + "loss": 3.815, + "step": 37970 + }, + { + "epoch": 2.5801739366761787, + "grad_norm": 0.15890543162822723, + "learning_rate": 6.77639624949042e-05, + "loss": 3.7199, + "step": 37975 + }, + { + "epoch": 2.5805136567468407, + "grad_norm": 0.21482513844966888, + "learning_rate": 6.775971599402093e-05, + "loss": 3.9446, + "step": 37980 + }, + { + "epoch": 2.5808533768175024, + "grad_norm": 0.18750596046447754, + "learning_rate": 6.775546949313766e-05, + "loss": 3.7575, + "step": 37985 + }, + { + "epoch": 2.581193096888164, + "grad_norm": 0.19857539236545563, + "learning_rate": 6.775122299225438e-05, + "loss": 3.9464, + "step": 37990 + }, + { + "epoch": 2.581532816958826, + "grad_norm": 0.1795317828655243, + "learning_rate": 6.774697649137112e-05, + "loss": 3.9826, + "step": 37995 + }, + { + "epoch": 2.5818725370294877, + "grad_norm": 0.29272058606147766, + "learning_rate": 6.774272999048785e-05, + "loss": 3.8572, + "step": 38000 + }, + { + "epoch": 2.5822122571001493, + "grad_norm": 1.7742429971694946, + "learning_rate": 6.773848348960456e-05, + "loss": 3.651, + "step": 38005 + }, + { + "epoch": 2.5825519771708114, + "grad_norm": 0.1289520263671875, + "learning_rate": 6.77342369887213e-05, + "loss": 4.1219, + "step": 38010 + }, + { + "epoch": 2.582891697241473, + "grad_norm": 0.19417613744735718, + "learning_rate": 6.772999048783803e-05, + "loss": 3.9939, + "step": 38015 + }, + { + "epoch": 2.5832314173121347, + "grad_norm": 0.16135282814502716, + "learning_rate": 6.772574398695475e-05, + "loss": 4.0724, + "step": 38020 + }, + { + "epoch": 2.5835711373827968, + "grad_norm": 0.8927068710327148, + "learning_rate": 6.772149748607149e-05, + "loss": 3.7653, + "step": 38025 + }, + { + "epoch": 2.5839108574534584, + "grad_norm": 0.2606629729270935, + "learning_rate": 6.77172509851882e-05, + "loss": 3.8147, + "step": 38030 + }, + { + "epoch": 2.58425057752412, + "grad_norm": 0.21709518134593964, + "learning_rate": 6.771300448430493e-05, + "loss": 3.8762, + "step": 38035 + }, + { + "epoch": 2.584590297594782, + "grad_norm": 0.16106659173965454, + "learning_rate": 6.770875798342167e-05, + "loss": 3.9745, + "step": 38040 + }, + { + "epoch": 2.5849300176654437, + "grad_norm": 0.14284293353557587, + "learning_rate": 6.770451148253839e-05, + "loss": 3.8892, + "step": 38045 + }, + { + "epoch": 2.5852697377361054, + "grad_norm": 0.1863427460193634, + "learning_rate": 6.770026498165511e-05, + "loss": 3.7053, + "step": 38050 + }, + { + "epoch": 2.5856094578067674, + "grad_norm": 0.16267238557338715, + "learning_rate": 6.769601848077186e-05, + "loss": 3.9894, + "step": 38055 + }, + { + "epoch": 2.585949177877429, + "grad_norm": 0.2674639821052551, + "learning_rate": 6.769177197988857e-05, + "loss": 3.8434, + "step": 38060 + }, + { + "epoch": 2.5862888979480907, + "grad_norm": 0.6588161587715149, + "learning_rate": 6.76875254790053e-05, + "loss": 4.0355, + "step": 38065 + }, + { + "epoch": 2.5866286180187528, + "grad_norm": 0.17213179171085358, + "learning_rate": 6.768327897812204e-05, + "loss": 3.9706, + "step": 38070 + }, + { + "epoch": 2.5869683380894144, + "grad_norm": 0.19334153831005096, + "learning_rate": 6.767903247723875e-05, + "loss": 3.8805, + "step": 38075 + }, + { + "epoch": 2.587308058160076, + "grad_norm": 0.16863657534122467, + "learning_rate": 6.767478597635548e-05, + "loss": 3.6853, + "step": 38080 + }, + { + "epoch": 2.587647778230738, + "grad_norm": 0.21339772641658783, + "learning_rate": 6.767053947547222e-05, + "loss": 3.8982, + "step": 38085 + }, + { + "epoch": 2.5879874983013997, + "grad_norm": 0.16198043525218964, + "learning_rate": 6.766629297458894e-05, + "loss": 3.8574, + "step": 38090 + }, + { + "epoch": 2.5883272183720614, + "grad_norm": 0.19046585261821747, + "learning_rate": 6.766204647370567e-05, + "loss": 3.9336, + "step": 38095 + }, + { + "epoch": 2.5886669384427234, + "grad_norm": 0.21580398082733154, + "learning_rate": 6.76577999728224e-05, + "loss": 3.8267, + "step": 38100 + }, + { + "epoch": 2.589006658513385, + "grad_norm": 0.19663399457931519, + "learning_rate": 6.765355347193912e-05, + "loss": 4.1832, + "step": 38105 + }, + { + "epoch": 2.5893463785840467, + "grad_norm": 0.19285112619400024, + "learning_rate": 6.764930697105585e-05, + "loss": 4.1397, + "step": 38110 + }, + { + "epoch": 2.589686098654709, + "grad_norm": 0.2204139679670334, + "learning_rate": 6.764506047017258e-05, + "loss": 3.8379, + "step": 38115 + }, + { + "epoch": 2.5900258187253704, + "grad_norm": 0.2462388277053833, + "learning_rate": 6.76408139692893e-05, + "loss": 3.7105, + "step": 38120 + }, + { + "epoch": 2.590365538796032, + "grad_norm": 0.22405076026916504, + "learning_rate": 6.763656746840603e-05, + "loss": 3.9451, + "step": 38125 + }, + { + "epoch": 2.590705258866694, + "grad_norm": 4.331444263458252, + "learning_rate": 6.763232096752276e-05, + "loss": 3.8612, + "step": 38130 + }, + { + "epoch": 2.5910449789373557, + "grad_norm": 0.9973567724227905, + "learning_rate": 6.762807446663949e-05, + "loss": 3.8815, + "step": 38135 + }, + { + "epoch": 2.5913846990080174, + "grad_norm": 0.19833683967590332, + "learning_rate": 6.762382796575622e-05, + "loss": 3.7508, + "step": 38140 + }, + { + "epoch": 2.591724419078679, + "grad_norm": 0.17250585556030273, + "learning_rate": 6.761958146487295e-05, + "loss": 4.0876, + "step": 38145 + }, + { + "epoch": 2.592064139149341, + "grad_norm": 0.6895924210548401, + "learning_rate": 6.761533496398967e-05, + "loss": 3.8449, + "step": 38150 + }, + { + "epoch": 2.5924038592200027, + "grad_norm": 0.14451231062412262, + "learning_rate": 6.761108846310642e-05, + "loss": 3.778, + "step": 38155 + }, + { + "epoch": 2.5927435792906643, + "grad_norm": 0.17559008300304413, + "learning_rate": 6.760684196222313e-05, + "loss": 3.8814, + "step": 38160 + }, + { + "epoch": 2.5930832993613264, + "grad_norm": 0.17641183733940125, + "learning_rate": 6.760259546133986e-05, + "loss": 3.7963, + "step": 38165 + }, + { + "epoch": 2.593423019431988, + "grad_norm": 0.2170080989599228, + "learning_rate": 6.759834896045659e-05, + "loss": 3.8933, + "step": 38170 + }, + { + "epoch": 2.5937627395026497, + "grad_norm": 0.27776995301246643, + "learning_rate": 6.759410245957331e-05, + "loss": 3.8283, + "step": 38175 + }, + { + "epoch": 2.5941024595733113, + "grad_norm": 0.16453078389167786, + "learning_rate": 6.758985595869004e-05, + "loss": 3.8114, + "step": 38180 + }, + { + "epoch": 2.5944421796439734, + "grad_norm": 0.18194791674613953, + "learning_rate": 6.758560945780677e-05, + "loss": 3.8262, + "step": 38185 + }, + { + "epoch": 2.594781899714635, + "grad_norm": 0.1801554560661316, + "learning_rate": 6.75813629569235e-05, + "loss": 3.7521, + "step": 38190 + }, + { + "epoch": 2.5951216197852967, + "grad_norm": 0.1571815460920334, + "learning_rate": 6.757711645604023e-05, + "loss": 3.9986, + "step": 38195 + }, + { + "epoch": 2.5954613398559587, + "grad_norm": 0.1766660362482071, + "learning_rate": 6.757286995515695e-05, + "loss": 3.8519, + "step": 38200 + }, + { + "epoch": 2.5958010599266204, + "grad_norm": 0.17216385900974274, + "learning_rate": 6.756862345427368e-05, + "loss": 3.9497, + "step": 38205 + }, + { + "epoch": 2.596140779997282, + "grad_norm": 0.16460520029067993, + "learning_rate": 6.756437695339041e-05, + "loss": 3.9818, + "step": 38210 + }, + { + "epoch": 2.596480500067944, + "grad_norm": 0.21435554325580597, + "learning_rate": 6.756013045250714e-05, + "loss": 3.8493, + "step": 38215 + }, + { + "epoch": 2.5968202201386057, + "grad_norm": 0.833254873752594, + "learning_rate": 6.755588395162387e-05, + "loss": 3.7925, + "step": 38220 + }, + { + "epoch": 2.5971599402092673, + "grad_norm": 0.15411555767059326, + "learning_rate": 6.75516374507406e-05, + "loss": 3.8591, + "step": 38225 + }, + { + "epoch": 2.5974996602799294, + "grad_norm": 0.2759561836719513, + "learning_rate": 6.754739094985732e-05, + "loss": 3.9966, + "step": 38230 + }, + { + "epoch": 2.597839380350591, + "grad_norm": 0.16826587915420532, + "learning_rate": 6.754314444897405e-05, + "loss": 3.8573, + "step": 38235 + }, + { + "epoch": 2.5981791004212527, + "grad_norm": 0.5612097382545471, + "learning_rate": 6.753889794809078e-05, + "loss": 3.7376, + "step": 38240 + }, + { + "epoch": 2.5985188204919147, + "grad_norm": 0.18011586368083954, + "learning_rate": 6.75346514472075e-05, + "loss": 3.9084, + "step": 38245 + }, + { + "epoch": 2.5988585405625764, + "grad_norm": 0.19024650752544403, + "learning_rate": 6.753040494632423e-05, + "loss": 4.0595, + "step": 38250 + }, + { + "epoch": 2.599198260633238, + "grad_norm": 0.16418933868408203, + "learning_rate": 6.752615844544096e-05, + "loss": 3.9448, + "step": 38255 + }, + { + "epoch": 2.5995379807039, + "grad_norm": 0.15870077908039093, + "learning_rate": 6.752191194455769e-05, + "loss": 3.91, + "step": 38260 + }, + { + "epoch": 2.5998777007745617, + "grad_norm": 0.4953071177005768, + "learning_rate": 6.751766544367442e-05, + "loss": 3.7399, + "step": 38265 + }, + { + "epoch": 2.6002174208452233, + "grad_norm": 0.197297602891922, + "learning_rate": 6.751341894279115e-05, + "loss": 4.0662, + "step": 38270 + }, + { + "epoch": 2.6005571409158854, + "grad_norm": 0.16428737342357635, + "learning_rate": 6.750917244190787e-05, + "loss": 3.7028, + "step": 38275 + }, + { + "epoch": 2.600896860986547, + "grad_norm": 0.1585882008075714, + "learning_rate": 6.75049259410246e-05, + "loss": 3.805, + "step": 38280 + }, + { + "epoch": 2.6012365810572087, + "grad_norm": 0.15096329152584076, + "learning_rate": 6.750067944014133e-05, + "loss": 4.1154, + "step": 38285 + }, + { + "epoch": 2.6015763011278707, + "grad_norm": 1.0723063945770264, + "learning_rate": 6.749643293925806e-05, + "loss": 3.6827, + "step": 38290 + }, + { + "epoch": 2.6019160211985324, + "grad_norm": 0.17549487948417664, + "learning_rate": 6.749218643837479e-05, + "loss": 3.7944, + "step": 38295 + }, + { + "epoch": 2.602255741269194, + "grad_norm": 0.2011825442314148, + "learning_rate": 6.74879399374915e-05, + "loss": 3.9473, + "step": 38300 + }, + { + "epoch": 2.602595461339856, + "grad_norm": 0.19717353582382202, + "learning_rate": 6.748369343660824e-05, + "loss": 3.722, + "step": 38305 + }, + { + "epoch": 2.6029351814105177, + "grad_norm": 0.15078048408031464, + "learning_rate": 6.747944693572497e-05, + "loss": 3.7956, + "step": 38310 + }, + { + "epoch": 2.6032749014811793, + "grad_norm": 0.334347665309906, + "learning_rate": 6.747520043484168e-05, + "loss": 3.9531, + "step": 38315 + }, + { + "epoch": 2.6036146215518414, + "grad_norm": 0.16437943279743195, + "learning_rate": 6.747095393395843e-05, + "loss": 3.8117, + "step": 38320 + }, + { + "epoch": 2.603954341622503, + "grad_norm": 0.23359781503677368, + "learning_rate": 6.746670743307515e-05, + "loss": 4.0905, + "step": 38325 + }, + { + "epoch": 2.6042940616931647, + "grad_norm": 0.13728085160255432, + "learning_rate": 6.746246093219187e-05, + "loss": 3.9944, + "step": 38330 + }, + { + "epoch": 2.6046337817638268, + "grad_norm": 0.16939868032932281, + "learning_rate": 6.745821443130861e-05, + "loss": 4.0168, + "step": 38335 + }, + { + "epoch": 2.6049735018344884, + "grad_norm": 0.22858409583568573, + "learning_rate": 6.745396793042534e-05, + "loss": 3.9274, + "step": 38340 + }, + { + "epoch": 2.60531322190515, + "grad_norm": 0.1679362952709198, + "learning_rate": 6.744972142954205e-05, + "loss": 3.8527, + "step": 38345 + }, + { + "epoch": 2.605652941975812, + "grad_norm": 0.2819637656211853, + "learning_rate": 6.74454749286588e-05, + "loss": 3.798, + "step": 38350 + }, + { + "epoch": 2.6059926620464737, + "grad_norm": 0.33677324652671814, + "learning_rate": 6.744122842777552e-05, + "loss": 3.6877, + "step": 38355 + }, + { + "epoch": 2.6063323821171354, + "grad_norm": 0.2282998412847519, + "learning_rate": 6.743698192689224e-05, + "loss": 3.777, + "step": 38360 + }, + { + "epoch": 2.6066721021877974, + "grad_norm": 0.19791744649410248, + "learning_rate": 6.743273542600898e-05, + "loss": 3.623, + "step": 38365 + }, + { + "epoch": 2.607011822258459, + "grad_norm": 0.14286257326602936, + "learning_rate": 6.742848892512569e-05, + "loss": 4.0284, + "step": 38370 + }, + { + "epoch": 2.6073515423291207, + "grad_norm": 0.13591022789478302, + "learning_rate": 6.742424242424242e-05, + "loss": 4.1921, + "step": 38375 + }, + { + "epoch": 2.6076912623997828, + "grad_norm": 0.6068320870399475, + "learning_rate": 6.741999592335916e-05, + "loss": 4.1002, + "step": 38380 + }, + { + "epoch": 2.6080309824704444, + "grad_norm": 0.1824735403060913, + "learning_rate": 6.741574942247588e-05, + "loss": 3.8785, + "step": 38385 + }, + { + "epoch": 2.608370702541106, + "grad_norm": 1.190566062927246, + "learning_rate": 6.74115029215926e-05, + "loss": 3.9478, + "step": 38390 + }, + { + "epoch": 2.608710422611768, + "grad_norm": 0.16579128801822662, + "learning_rate": 6.740725642070935e-05, + "loss": 4.0632, + "step": 38395 + }, + { + "epoch": 2.6090501426824297, + "grad_norm": 0.14418628811836243, + "learning_rate": 6.740300991982606e-05, + "loss": 3.9068, + "step": 38400 + }, + { + "epoch": 2.6093898627530914, + "grad_norm": 0.2701079845428467, + "learning_rate": 6.739876341894279e-05, + "loss": 3.7626, + "step": 38405 + }, + { + "epoch": 2.6097295828237534, + "grad_norm": 0.19721166789531708, + "learning_rate": 6.739451691805953e-05, + "loss": 3.8771, + "step": 38410 + }, + { + "epoch": 2.610069302894415, + "grad_norm": 0.19662493467330933, + "learning_rate": 6.739027041717624e-05, + "loss": 3.9784, + "step": 38415 + }, + { + "epoch": 2.6104090229650767, + "grad_norm": 0.17218834161758423, + "learning_rate": 6.738602391629297e-05, + "loss": 3.8315, + "step": 38420 + }, + { + "epoch": 2.610748743035739, + "grad_norm": 0.1991235315799713, + "learning_rate": 6.738177741540971e-05, + "loss": 3.8098, + "step": 38425 + }, + { + "epoch": 2.6110884631064004, + "grad_norm": 0.1570056825876236, + "learning_rate": 6.737753091452643e-05, + "loss": 3.7112, + "step": 38430 + }, + { + "epoch": 2.611428183177062, + "grad_norm": 0.17220328748226166, + "learning_rate": 6.737328441364316e-05, + "loss": 3.7484, + "step": 38435 + }, + { + "epoch": 2.611767903247724, + "grad_norm": 0.23488670587539673, + "learning_rate": 6.73690379127599e-05, + "loss": 4.0312, + "step": 38440 + }, + { + "epoch": 2.6121076233183858, + "grad_norm": 0.2193462997674942, + "learning_rate": 6.736479141187661e-05, + "loss": 3.9504, + "step": 38445 + }, + { + "epoch": 2.6124473433890474, + "grad_norm": 0.15138201415538788, + "learning_rate": 6.736054491099334e-05, + "loss": 3.8094, + "step": 38450 + }, + { + "epoch": 2.6127870634597095, + "grad_norm": 0.17017897963523865, + "learning_rate": 6.735629841011007e-05, + "loss": 3.8121, + "step": 38455 + }, + { + "epoch": 2.613126783530371, + "grad_norm": 0.20006722211837769, + "learning_rate": 6.73520519092268e-05, + "loss": 3.8998, + "step": 38460 + }, + { + "epoch": 2.6134665036010327, + "grad_norm": 0.18689799308776855, + "learning_rate": 6.734780540834352e-05, + "loss": 3.9705, + "step": 38465 + }, + { + "epoch": 2.613806223671695, + "grad_norm": 0.22982634603977203, + "learning_rate": 6.734355890746025e-05, + "loss": 3.8345, + "step": 38470 + }, + { + "epoch": 2.6141459437423564, + "grad_norm": 0.1849014312028885, + "learning_rate": 6.733931240657698e-05, + "loss": 3.7943, + "step": 38475 + }, + { + "epoch": 2.614485663813018, + "grad_norm": 0.19971823692321777, + "learning_rate": 6.733506590569371e-05, + "loss": 4.0127, + "step": 38480 + }, + { + "epoch": 2.6148253838836797, + "grad_norm": 0.16188167035579681, + "learning_rate": 6.733081940481044e-05, + "loss": 3.8316, + "step": 38485 + }, + { + "epoch": 2.6151651039543418, + "grad_norm": 0.24610526859760284, + "learning_rate": 6.732657290392716e-05, + "loss": 3.7263, + "step": 38490 + }, + { + "epoch": 2.6155048240250034, + "grad_norm": 0.18459458649158478, + "learning_rate": 6.73223264030439e-05, + "loss": 3.9183, + "step": 38495 + }, + { + "epoch": 2.615844544095665, + "grad_norm": 0.22705473005771637, + "learning_rate": 6.731807990216062e-05, + "loss": 3.7329, + "step": 38500 + }, + { + "epoch": 2.616184264166327, + "grad_norm": 0.17980900406837463, + "learning_rate": 6.731383340127735e-05, + "loss": 3.6351, + "step": 38505 + }, + { + "epoch": 2.6165239842369887, + "grad_norm": 0.17771373689174652, + "learning_rate": 6.730958690039409e-05, + "loss": 3.8835, + "step": 38510 + }, + { + "epoch": 2.6168637043076504, + "grad_norm": 0.2257130891084671, + "learning_rate": 6.73053403995108e-05, + "loss": 3.7033, + "step": 38515 + }, + { + "epoch": 2.617203424378312, + "grad_norm": 0.18520115315914154, + "learning_rate": 6.730109389862753e-05, + "loss": 3.8042, + "step": 38520 + }, + { + "epoch": 2.617543144448974, + "grad_norm": 0.20030522346496582, + "learning_rate": 6.729684739774426e-05, + "loss": 3.7107, + "step": 38525 + }, + { + "epoch": 2.6178828645196357, + "grad_norm": 0.17382997274398804, + "learning_rate": 6.729260089686099e-05, + "loss": 3.7295, + "step": 38530 + }, + { + "epoch": 2.6182225845902973, + "grad_norm": 0.12412247061729431, + "learning_rate": 6.728835439597772e-05, + "loss": 3.576, + "step": 38535 + }, + { + "epoch": 2.6185623046609594, + "grad_norm": 0.16499686241149902, + "learning_rate": 6.728410789509444e-05, + "loss": 3.9439, + "step": 38540 + }, + { + "epoch": 2.618902024731621, + "grad_norm": 0.21657343208789825, + "learning_rate": 6.727986139421117e-05, + "loss": 3.8443, + "step": 38545 + }, + { + "epoch": 2.6192417448022827, + "grad_norm": 0.15216438472270966, + "learning_rate": 6.72756148933279e-05, + "loss": 3.7369, + "step": 38550 + }, + { + "epoch": 2.6195814648729447, + "grad_norm": 0.1550162434577942, + "learning_rate": 6.727136839244463e-05, + "loss": 3.8869, + "step": 38555 + }, + { + "epoch": 2.6199211849436064, + "grad_norm": 0.7907409071922302, + "learning_rate": 6.726712189156136e-05, + "loss": 3.7908, + "step": 38560 + }, + { + "epoch": 2.620260905014268, + "grad_norm": 0.19839037954807281, + "learning_rate": 6.726287539067808e-05, + "loss": 3.9563, + "step": 38565 + }, + { + "epoch": 2.62060062508493, + "grad_norm": 0.17914487421512604, + "learning_rate": 6.725862888979481e-05, + "loss": 4.1588, + "step": 38570 + }, + { + "epoch": 2.6209403451555917, + "grad_norm": 0.15906545519828796, + "learning_rate": 6.725438238891154e-05, + "loss": 3.8544, + "step": 38575 + }, + { + "epoch": 2.6212800652262533, + "grad_norm": 0.46324074268341064, + "learning_rate": 6.725013588802827e-05, + "loss": 4.0059, + "step": 38580 + }, + { + "epoch": 2.6216197852969154, + "grad_norm": 0.17961528897285461, + "learning_rate": 6.7245889387145e-05, + "loss": 3.8612, + "step": 38585 + }, + { + "epoch": 2.621959505367577, + "grad_norm": 0.23738789558410645, + "learning_rate": 6.724164288626172e-05, + "loss": 3.8299, + "step": 38590 + }, + { + "epoch": 2.6222992254382387, + "grad_norm": 0.6924583315849304, + "learning_rate": 6.723739638537845e-05, + "loss": 3.8493, + "step": 38595 + }, + { + "epoch": 2.6226389455089008, + "grad_norm": 0.3250090777873993, + "learning_rate": 6.723314988449518e-05, + "loss": 3.8863, + "step": 38600 + }, + { + "epoch": 2.6229786655795624, + "grad_norm": 0.30978941917419434, + "learning_rate": 6.722890338361191e-05, + "loss": 4.0041, + "step": 38605 + }, + { + "epoch": 2.623318385650224, + "grad_norm": 0.14681269228458405, + "learning_rate": 6.722465688272864e-05, + "loss": 3.8169, + "step": 38610 + }, + { + "epoch": 2.623658105720886, + "grad_norm": 0.17817670106887817, + "learning_rate": 6.722041038184536e-05, + "loss": 3.8048, + "step": 38615 + }, + { + "epoch": 2.6239978257915477, + "grad_norm": 0.17980530858039856, + "learning_rate": 6.721616388096209e-05, + "loss": 3.818, + "step": 38620 + }, + { + "epoch": 2.6243375458622094, + "grad_norm": 0.16020677983760834, + "learning_rate": 6.721191738007882e-05, + "loss": 4.0642, + "step": 38625 + }, + { + "epoch": 2.6246772659328714, + "grad_norm": 0.16309854388237, + "learning_rate": 6.720767087919555e-05, + "loss": 4.0552, + "step": 38630 + }, + { + "epoch": 2.625016986003533, + "grad_norm": 0.2003224939107895, + "learning_rate": 6.720342437831228e-05, + "loss": 4.1832, + "step": 38635 + }, + { + "epoch": 2.6253567060741947, + "grad_norm": 0.17885400354862213, + "learning_rate": 6.7199177877429e-05, + "loss": 3.9323, + "step": 38640 + }, + { + "epoch": 2.6256964261448568, + "grad_norm": 0.33830058574676514, + "learning_rate": 6.719493137654573e-05, + "loss": 3.8225, + "step": 38645 + }, + { + "epoch": 2.6260361462155184, + "grad_norm": 0.20505572855472565, + "learning_rate": 6.719068487566246e-05, + "loss": 4.0058, + "step": 38650 + }, + { + "epoch": 2.62637586628618, + "grad_norm": 0.1894165426492691, + "learning_rate": 6.718643837477918e-05, + "loss": 3.7348, + "step": 38655 + }, + { + "epoch": 2.626715586356842, + "grad_norm": 0.42541173100471497, + "learning_rate": 6.718219187389592e-05, + "loss": 3.863, + "step": 38660 + }, + { + "epoch": 2.6270553064275037, + "grad_norm": 0.24420730769634247, + "learning_rate": 6.717794537301264e-05, + "loss": 3.72, + "step": 38665 + }, + { + "epoch": 2.6273950264981654, + "grad_norm": 0.22123098373413086, + "learning_rate": 6.717369887212936e-05, + "loss": 3.8404, + "step": 38670 + }, + { + "epoch": 2.6277347465688274, + "grad_norm": 0.24667398631572723, + "learning_rate": 6.71694523712461e-05, + "loss": 3.8077, + "step": 38675 + }, + { + "epoch": 2.628074466639489, + "grad_norm": 0.20732860267162323, + "learning_rate": 6.716520587036283e-05, + "loss": 4.0147, + "step": 38680 + }, + { + "epoch": 2.6284141867101507, + "grad_norm": 0.1612192988395691, + "learning_rate": 6.716095936947954e-05, + "loss": 4.0052, + "step": 38685 + }, + { + "epoch": 2.6287539067808128, + "grad_norm": 0.14912736415863037, + "learning_rate": 6.715671286859628e-05, + "loss": 4.0191, + "step": 38690 + }, + { + "epoch": 2.6290936268514744, + "grad_norm": 10.335192680358887, + "learning_rate": 6.715246636771301e-05, + "loss": 3.808, + "step": 38695 + }, + { + "epoch": 2.629433346922136, + "grad_norm": 0.14809978008270264, + "learning_rate": 6.714821986682973e-05, + "loss": 3.8001, + "step": 38700 + }, + { + "epoch": 2.629773066992798, + "grad_norm": 0.17667439579963684, + "learning_rate": 6.714397336594647e-05, + "loss": 4.0228, + "step": 38705 + }, + { + "epoch": 2.6301127870634597, + "grad_norm": 0.13751813769340515, + "learning_rate": 6.71397268650632e-05, + "loss": 4.0608, + "step": 38710 + }, + { + "epoch": 2.6304525071341214, + "grad_norm": 0.20633287727832794, + "learning_rate": 6.713548036417991e-05, + "loss": 3.876, + "step": 38715 + }, + { + "epoch": 2.6307922272047835, + "grad_norm": 0.1966109722852707, + "learning_rate": 6.713123386329665e-05, + "loss": 3.9946, + "step": 38720 + }, + { + "epoch": 2.631131947275445, + "grad_norm": 0.1634412705898285, + "learning_rate": 6.712698736241337e-05, + "loss": 3.8975, + "step": 38725 + }, + { + "epoch": 2.6314716673461067, + "grad_norm": 0.21814756095409393, + "learning_rate": 6.71227408615301e-05, + "loss": 3.821, + "step": 38730 + }, + { + "epoch": 2.631811387416769, + "grad_norm": 0.18450957536697388, + "learning_rate": 6.711849436064684e-05, + "loss": 3.7871, + "step": 38735 + }, + { + "epoch": 2.6321511074874304, + "grad_norm": 0.17740853130817413, + "learning_rate": 6.711424785976355e-05, + "loss": 3.8541, + "step": 38740 + }, + { + "epoch": 2.632490827558092, + "grad_norm": 0.17699556052684784, + "learning_rate": 6.711000135888028e-05, + "loss": 3.9423, + "step": 38745 + }, + { + "epoch": 2.632830547628754, + "grad_norm": 0.3138996362686157, + "learning_rate": 6.710575485799702e-05, + "loss": 3.8431, + "step": 38750 + }, + { + "epoch": 2.6331702676994158, + "grad_norm": 0.33835873007774353, + "learning_rate": 6.710150835711374e-05, + "loss": 3.7717, + "step": 38755 + }, + { + "epoch": 2.6335099877700774, + "grad_norm": 0.2208409309387207, + "learning_rate": 6.709726185623046e-05, + "loss": 3.8926, + "step": 38760 + }, + { + "epoch": 2.6338497078407395, + "grad_norm": 0.16839522123336792, + "learning_rate": 6.70930153553472e-05, + "loss": 3.6863, + "step": 38765 + }, + { + "epoch": 2.634189427911401, + "grad_norm": 0.2082993984222412, + "learning_rate": 6.708876885446392e-05, + "loss": 4.0847, + "step": 38770 + }, + { + "epoch": 2.6345291479820627, + "grad_norm": 0.2201123684644699, + "learning_rate": 6.708452235358065e-05, + "loss": 3.8541, + "step": 38775 + }, + { + "epoch": 2.634868868052725, + "grad_norm": 0.14829425513744354, + "learning_rate": 6.708027585269739e-05, + "loss": 4.0468, + "step": 38780 + }, + { + "epoch": 2.6352085881233864, + "grad_norm": 0.18978525698184967, + "learning_rate": 6.70760293518141e-05, + "loss": 4.245, + "step": 38785 + }, + { + "epoch": 2.635548308194048, + "grad_norm": 0.19819572567939758, + "learning_rate": 6.707178285093083e-05, + "loss": 4.2246, + "step": 38790 + }, + { + "epoch": 2.63588802826471, + "grad_norm": 0.5437256693840027, + "learning_rate": 6.706753635004756e-05, + "loss": 3.9343, + "step": 38795 + }, + { + "epoch": 2.6362277483353718, + "grad_norm": 0.2518259584903717, + "learning_rate": 6.706328984916429e-05, + "loss": 4.0663, + "step": 38800 + }, + { + "epoch": 2.6365674684060334, + "grad_norm": 0.18224813044071198, + "learning_rate": 6.705904334828102e-05, + "loss": 3.7837, + "step": 38805 + }, + { + "epoch": 2.6369071884766955, + "grad_norm": 0.25703683495521545, + "learning_rate": 6.705479684739774e-05, + "loss": 3.842, + "step": 38810 + }, + { + "epoch": 2.637246908547357, + "grad_norm": 0.23397263884544373, + "learning_rate": 6.705055034651447e-05, + "loss": 3.6841, + "step": 38815 + }, + { + "epoch": 2.6375866286180187, + "grad_norm": 0.18520790338516235, + "learning_rate": 6.70463038456312e-05, + "loss": 4.1531, + "step": 38820 + }, + { + "epoch": 2.6379263486886804, + "grad_norm": 0.20380187034606934, + "learning_rate": 6.704205734474793e-05, + "loss": 3.9379, + "step": 38825 + }, + { + "epoch": 2.6382660687593424, + "grad_norm": 0.1579570323228836, + "learning_rate": 6.703781084386466e-05, + "loss": 3.8421, + "step": 38830 + }, + { + "epoch": 2.638605788830004, + "grad_norm": 0.16143319010734558, + "learning_rate": 6.70335643429814e-05, + "loss": 3.9735, + "step": 38835 + }, + { + "epoch": 2.6389455089006657, + "grad_norm": 0.26952221989631653, + "learning_rate": 6.702931784209811e-05, + "loss": 3.7047, + "step": 38840 + }, + { + "epoch": 2.639285228971328, + "grad_norm": 0.14281821250915527, + "learning_rate": 6.702507134121484e-05, + "loss": 3.8713, + "step": 38845 + }, + { + "epoch": 2.6396249490419894, + "grad_norm": 0.21622486412525177, + "learning_rate": 6.702082484033158e-05, + "loss": 3.7347, + "step": 38850 + }, + { + "epoch": 2.639964669112651, + "grad_norm": 0.34749603271484375, + "learning_rate": 6.70165783394483e-05, + "loss": 3.9229, + "step": 38855 + }, + { + "epoch": 2.640304389183313, + "grad_norm": 0.21533218026161194, + "learning_rate": 6.701233183856502e-05, + "loss": 3.9096, + "step": 38860 + }, + { + "epoch": 2.6406441092539747, + "grad_norm": 0.2215920388698578, + "learning_rate": 6.700808533768177e-05, + "loss": 3.8253, + "step": 38865 + }, + { + "epoch": 2.6409838293246364, + "grad_norm": 0.16950154304504395, + "learning_rate": 6.700383883679848e-05, + "loss": 3.7487, + "step": 38870 + }, + { + "epoch": 2.641323549395298, + "grad_norm": 0.18355531990528107, + "learning_rate": 6.699959233591521e-05, + "loss": 4.0732, + "step": 38875 + }, + { + "epoch": 2.64166326946596, + "grad_norm": 0.14184200763702393, + "learning_rate": 6.699534583503194e-05, + "loss": 3.8386, + "step": 38880 + }, + { + "epoch": 2.6420029895366217, + "grad_norm": 0.20677277445793152, + "learning_rate": 6.699109933414866e-05, + "loss": 4.0369, + "step": 38885 + }, + { + "epoch": 2.6423427096072833, + "grad_norm": 0.14885276556015015, + "learning_rate": 6.698685283326539e-05, + "loss": 3.995, + "step": 38890 + }, + { + "epoch": 2.6426824296779454, + "grad_norm": 0.16940414905548096, + "learning_rate": 6.698260633238212e-05, + "loss": 3.7868, + "step": 38895 + }, + { + "epoch": 2.643022149748607, + "grad_norm": 0.2022908627986908, + "learning_rate": 6.697835983149885e-05, + "loss": 3.5785, + "step": 38900 + }, + { + "epoch": 2.6433618698192687, + "grad_norm": 0.18654383718967438, + "learning_rate": 6.697411333061558e-05, + "loss": 3.8008, + "step": 38905 + }, + { + "epoch": 2.6437015898899308, + "grad_norm": 0.1588413119316101, + "learning_rate": 6.69698668297323e-05, + "loss": 3.8385, + "step": 38910 + }, + { + "epoch": 2.6440413099605924, + "grad_norm": 0.18405820429325104, + "learning_rate": 6.696562032884903e-05, + "loss": 3.8205, + "step": 38915 + }, + { + "epoch": 2.644381030031254, + "grad_norm": 0.17168614268302917, + "learning_rate": 6.696137382796576e-05, + "loss": 4.0108, + "step": 38920 + }, + { + "epoch": 2.644720750101916, + "grad_norm": 0.177122563123703, + "learning_rate": 6.695712732708249e-05, + "loss": 3.802, + "step": 38925 + }, + { + "epoch": 2.6450604701725777, + "grad_norm": 0.19957537949085236, + "learning_rate": 6.695288082619922e-05, + "loss": 3.7472, + "step": 38930 + }, + { + "epoch": 2.6454001902432394, + "grad_norm": 0.16347818076610565, + "learning_rate": 6.694863432531594e-05, + "loss": 3.6631, + "step": 38935 + }, + { + "epoch": 2.6457399103139014, + "grad_norm": 0.15724751353263855, + "learning_rate": 6.694438782443267e-05, + "loss": 3.6305, + "step": 38940 + }, + { + "epoch": 2.646079630384563, + "grad_norm": 0.16771399974822998, + "learning_rate": 6.69401413235494e-05, + "loss": 3.9449, + "step": 38945 + }, + { + "epoch": 2.6464193504552247, + "grad_norm": 0.2148045301437378, + "learning_rate": 6.693589482266613e-05, + "loss": 3.8467, + "step": 38950 + }, + { + "epoch": 2.6467590705258868, + "grad_norm": 0.18609458208084106, + "learning_rate": 6.693164832178286e-05, + "loss": 3.8519, + "step": 38955 + }, + { + "epoch": 2.6470987905965484, + "grad_norm": 0.20492902398109436, + "learning_rate": 6.692740182089958e-05, + "loss": 3.5738, + "step": 38960 + }, + { + "epoch": 2.64743851066721, + "grad_norm": 0.3353732228279114, + "learning_rate": 6.692315532001631e-05, + "loss": 3.7882, + "step": 38965 + }, + { + "epoch": 2.647778230737872, + "grad_norm": 0.22696205973625183, + "learning_rate": 6.691890881913304e-05, + "loss": 3.8576, + "step": 38970 + }, + { + "epoch": 2.6481179508085337, + "grad_norm": 0.15303649008274078, + "learning_rate": 6.691466231824977e-05, + "loss": 3.8113, + "step": 38975 + }, + { + "epoch": 2.6484576708791954, + "grad_norm": 0.1756230890750885, + "learning_rate": 6.69104158173665e-05, + "loss": 3.9117, + "step": 38980 + }, + { + "epoch": 2.6487973909498574, + "grad_norm": 0.1536557376384735, + "learning_rate": 6.690616931648322e-05, + "loss": 3.7511, + "step": 38985 + }, + { + "epoch": 2.649137111020519, + "grad_norm": 0.19993172585964203, + "learning_rate": 6.690192281559995e-05, + "loss": 3.7149, + "step": 38990 + }, + { + "epoch": 2.6494768310911807, + "grad_norm": 0.21317540109157562, + "learning_rate": 6.689767631471667e-05, + "loss": 3.6889, + "step": 38995 + }, + { + "epoch": 2.649816551161843, + "grad_norm": 0.2587644159793854, + "learning_rate": 6.689342981383341e-05, + "loss": 4.028, + "step": 39000 + }, + { + "epoch": 2.6501562712325044, + "grad_norm": 0.17905676364898682, + "learning_rate": 6.688918331295014e-05, + "loss": 3.8486, + "step": 39005 + }, + { + "epoch": 2.650495991303166, + "grad_norm": 0.20106825232505798, + "learning_rate": 6.688493681206685e-05, + "loss": 3.9385, + "step": 39010 + }, + { + "epoch": 2.650835711373828, + "grad_norm": 0.1860274076461792, + "learning_rate": 6.688069031118359e-05, + "loss": 3.9146, + "step": 39015 + }, + { + "epoch": 2.6511754314444897, + "grad_norm": 0.43984508514404297, + "learning_rate": 6.687644381030032e-05, + "loss": 3.6248, + "step": 39020 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.203241229057312, + "learning_rate": 6.687219730941703e-05, + "loss": 3.9223, + "step": 39025 + }, + { + "epoch": 2.6518548715858135, + "grad_norm": 0.17739947140216827, + "learning_rate": 6.686795080853378e-05, + "loss": 3.9954, + "step": 39030 + }, + { + "epoch": 2.652194591656475, + "grad_norm": 0.18529129028320312, + "learning_rate": 6.68637043076505e-05, + "loss": 4.0318, + "step": 39035 + }, + { + "epoch": 2.6525343117271367, + "grad_norm": 0.19578388333320618, + "learning_rate": 6.685945780676722e-05, + "loss": 3.9908, + "step": 39040 + }, + { + "epoch": 2.652874031797799, + "grad_norm": 0.18821455538272858, + "learning_rate": 6.685521130588396e-05, + "loss": 3.8051, + "step": 39045 + }, + { + "epoch": 2.6532137518684604, + "grad_norm": 0.18732252717018127, + "learning_rate": 6.685096480500069e-05, + "loss": 3.778, + "step": 39050 + }, + { + "epoch": 2.653553471939122, + "grad_norm": 0.3152923882007599, + "learning_rate": 6.68467183041174e-05, + "loss": 3.5502, + "step": 39055 + }, + { + "epoch": 2.653893192009784, + "grad_norm": 0.24027685821056366, + "learning_rate": 6.684247180323414e-05, + "loss": 3.9756, + "step": 39060 + }, + { + "epoch": 2.6542329120804458, + "grad_norm": 0.1418560892343521, + "learning_rate": 6.683822530235087e-05, + "loss": 3.5913, + "step": 39065 + }, + { + "epoch": 2.6545726321511074, + "grad_norm": 0.16635659337043762, + "learning_rate": 6.683397880146759e-05, + "loss": 4.0477, + "step": 39070 + }, + { + "epoch": 2.6549123522217695, + "grad_norm": 0.21965044736862183, + "learning_rate": 6.682973230058433e-05, + "loss": 3.8068, + "step": 39075 + }, + { + "epoch": 2.655252072292431, + "grad_norm": 0.16828322410583496, + "learning_rate": 6.682548579970104e-05, + "loss": 3.8555, + "step": 39080 + }, + { + "epoch": 2.6555917923630927, + "grad_norm": 0.15556932985782623, + "learning_rate": 6.682123929881777e-05, + "loss": 3.8566, + "step": 39085 + }, + { + "epoch": 2.655931512433755, + "grad_norm": 0.16386288404464722, + "learning_rate": 6.681699279793451e-05, + "loss": 3.9696, + "step": 39090 + }, + { + "epoch": 2.6562712325044164, + "grad_norm": 0.1571277678012848, + "learning_rate": 6.681274629705123e-05, + "loss": 3.9024, + "step": 39095 + }, + { + "epoch": 2.656610952575078, + "grad_norm": 0.7435330152511597, + "learning_rate": 6.680849979616795e-05, + "loss": 3.8411, + "step": 39100 + }, + { + "epoch": 2.65695067264574, + "grad_norm": 0.15638592839241028, + "learning_rate": 6.68042532952847e-05, + "loss": 3.7922, + "step": 39105 + }, + { + "epoch": 2.6572903927164018, + "grad_norm": 0.21691729128360748, + "learning_rate": 6.680000679440141e-05, + "loss": 3.5973, + "step": 39110 + }, + { + "epoch": 2.6576301127870634, + "grad_norm": 0.3828490376472473, + "learning_rate": 6.679576029351814e-05, + "loss": 3.7122, + "step": 39115 + }, + { + "epoch": 2.6579698328577255, + "grad_norm": 0.20334403216838837, + "learning_rate": 6.679151379263488e-05, + "loss": 4.0111, + "step": 39120 + }, + { + "epoch": 2.658309552928387, + "grad_norm": 0.13864688575267792, + "learning_rate": 6.67872672917516e-05, + "loss": 3.5636, + "step": 39125 + }, + { + "epoch": 2.6586492729990487, + "grad_norm": 0.1788858026266098, + "learning_rate": 6.678302079086832e-05, + "loss": 3.7515, + "step": 39130 + }, + { + "epoch": 2.658988993069711, + "grad_norm": 0.19848498702049255, + "learning_rate": 6.677877428998506e-05, + "loss": 4.0114, + "step": 39135 + }, + { + "epoch": 2.6593287131403724, + "grad_norm": 0.17475007474422455, + "learning_rate": 6.677452778910178e-05, + "loss": 3.8279, + "step": 39140 + }, + { + "epoch": 2.659668433211034, + "grad_norm": 0.16448117792606354, + "learning_rate": 6.67702812882185e-05, + "loss": 3.6498, + "step": 39145 + }, + { + "epoch": 2.660008153281696, + "grad_norm": 0.18287302553653717, + "learning_rate": 6.676603478733523e-05, + "loss": 3.8318, + "step": 39150 + }, + { + "epoch": 2.660347873352358, + "grad_norm": 0.18452897667884827, + "learning_rate": 6.676178828645196e-05, + "loss": 4.2835, + "step": 39155 + }, + { + "epoch": 2.6606875934230194, + "grad_norm": 0.15913204848766327, + "learning_rate": 6.675754178556869e-05, + "loss": 3.779, + "step": 39160 + }, + { + "epoch": 2.661027313493681, + "grad_norm": 0.2426171600818634, + "learning_rate": 6.675329528468542e-05, + "loss": 4.0519, + "step": 39165 + }, + { + "epoch": 2.661367033564343, + "grad_norm": 0.17499865591526031, + "learning_rate": 6.674904878380215e-05, + "loss": 3.7961, + "step": 39170 + }, + { + "epoch": 2.6617067536350048, + "grad_norm": 0.18635042011737823, + "learning_rate": 6.674480228291889e-05, + "loss": 3.7945, + "step": 39175 + }, + { + "epoch": 2.6620464737056664, + "grad_norm": 0.1712559163570404, + "learning_rate": 6.67405557820356e-05, + "loss": 3.6853, + "step": 39180 + }, + { + "epoch": 2.6623861937763285, + "grad_norm": 0.7862617373466492, + "learning_rate": 6.673630928115233e-05, + "loss": 3.5508, + "step": 39185 + }, + { + "epoch": 2.66272591384699, + "grad_norm": 0.24846701323986053, + "learning_rate": 6.673206278026907e-05, + "loss": 3.7936, + "step": 39190 + }, + { + "epoch": 2.6630656339176517, + "grad_norm": 0.1629224568605423, + "learning_rate": 6.672781627938579e-05, + "loss": 3.9428, + "step": 39195 + }, + { + "epoch": 2.663405353988314, + "grad_norm": 0.17935431003570557, + "learning_rate": 6.672356977850251e-05, + "loss": 4.0974, + "step": 39200 + }, + { + "epoch": 2.6637450740589754, + "grad_norm": 0.1948002427816391, + "learning_rate": 6.671932327761926e-05, + "loss": 3.7851, + "step": 39205 + }, + { + "epoch": 2.664084794129637, + "grad_norm": 0.18528540432453156, + "learning_rate": 6.671507677673597e-05, + "loss": 3.8205, + "step": 39210 + }, + { + "epoch": 2.6644245142002987, + "grad_norm": 0.37473979592323303, + "learning_rate": 6.67108302758527e-05, + "loss": 3.7826, + "step": 39215 + }, + { + "epoch": 2.6647642342709608, + "grad_norm": 0.19198155403137207, + "learning_rate": 6.670658377496943e-05, + "loss": 3.7263, + "step": 39220 + }, + { + "epoch": 2.6651039543416224, + "grad_norm": 0.19562086462974548, + "learning_rate": 6.670233727408615e-05, + "loss": 4.012, + "step": 39225 + }, + { + "epoch": 2.665443674412284, + "grad_norm": 0.17615118622779846, + "learning_rate": 6.669809077320288e-05, + "loss": 3.6048, + "step": 39230 + }, + { + "epoch": 2.665783394482946, + "grad_norm": 0.17155416309833527, + "learning_rate": 6.669384427231961e-05, + "loss": 3.9201, + "step": 39235 + }, + { + "epoch": 2.6661231145536077, + "grad_norm": 0.2704753279685974, + "learning_rate": 6.668959777143634e-05, + "loss": 3.967, + "step": 39240 + }, + { + "epoch": 2.6664628346242694, + "grad_norm": 0.16010276973247528, + "learning_rate": 6.668535127055307e-05, + "loss": 3.9672, + "step": 39245 + }, + { + "epoch": 2.6668025546949314, + "grad_norm": 0.17409323155879974, + "learning_rate": 6.66811047696698e-05, + "loss": 3.9513, + "step": 39250 + }, + { + "epoch": 2.667142274765593, + "grad_norm": 0.16481657326221466, + "learning_rate": 6.667685826878652e-05, + "loss": 3.7468, + "step": 39255 + }, + { + "epoch": 2.6674819948362547, + "grad_norm": 0.205617293715477, + "learning_rate": 6.667261176790325e-05, + "loss": 4.0308, + "step": 39260 + }, + { + "epoch": 2.6678217149069168, + "grad_norm": 0.19739383459091187, + "learning_rate": 6.666836526701998e-05, + "loss": 3.8077, + "step": 39265 + }, + { + "epoch": 2.6681614349775784, + "grad_norm": 0.18930460512638092, + "learning_rate": 6.66641187661367e-05, + "loss": 3.851, + "step": 39270 + }, + { + "epoch": 2.66850115504824, + "grad_norm": 0.1555471569299698, + "learning_rate": 6.665987226525343e-05, + "loss": 4.1532, + "step": 39275 + }, + { + "epoch": 2.668840875118902, + "grad_norm": 0.19139571487903595, + "learning_rate": 6.665562576437016e-05, + "loss": 3.9297, + "step": 39280 + }, + { + "epoch": 2.6691805951895637, + "grad_norm": 0.18166138231754303, + "learning_rate": 6.665137926348689e-05, + "loss": 4.0878, + "step": 39285 + }, + { + "epoch": 2.6695203152602254, + "grad_norm": 0.29773038625717163, + "learning_rate": 6.664713276260362e-05, + "loss": 3.8114, + "step": 39290 + }, + { + "epoch": 2.6698600353308874, + "grad_norm": 0.5080788135528564, + "learning_rate": 6.664288626172035e-05, + "loss": 3.9851, + "step": 39295 + }, + { + "epoch": 2.670199755401549, + "grad_norm": 0.1650705188512802, + "learning_rate": 6.663863976083707e-05, + "loss": 3.8817, + "step": 39300 + }, + { + "epoch": 2.6705394754722107, + "grad_norm": 0.3579939305782318, + "learning_rate": 6.66343932599538e-05, + "loss": 3.7593, + "step": 39305 + }, + { + "epoch": 2.670879195542873, + "grad_norm": 0.1607590913772583, + "learning_rate": 6.663014675907053e-05, + "loss": 3.8166, + "step": 39310 + }, + { + "epoch": 2.6712189156135344, + "grad_norm": 0.19355127215385437, + "learning_rate": 6.662590025818726e-05, + "loss": 3.8352, + "step": 39315 + }, + { + "epoch": 2.671558635684196, + "grad_norm": 0.20755428075790405, + "learning_rate": 6.662165375730399e-05, + "loss": 4.1121, + "step": 39320 + }, + { + "epoch": 2.671898355754858, + "grad_norm": 0.1421874612569809, + "learning_rate": 6.661740725642071e-05, + "loss": 3.8417, + "step": 39325 + }, + { + "epoch": 2.6722380758255198, + "grad_norm": 0.23598141968250275, + "learning_rate": 6.661316075553744e-05, + "loss": 3.9024, + "step": 39330 + }, + { + "epoch": 2.6725777958961814, + "grad_norm": 0.18979984521865845, + "learning_rate": 6.660891425465417e-05, + "loss": 3.8052, + "step": 39335 + }, + { + "epoch": 2.6729175159668435, + "grad_norm": 0.3973280191421509, + "learning_rate": 6.66046677537709e-05, + "loss": 3.9715, + "step": 39340 + }, + { + "epoch": 2.673257236037505, + "grad_norm": 0.18423056602478027, + "learning_rate": 6.660042125288763e-05, + "loss": 3.8122, + "step": 39345 + }, + { + "epoch": 2.6735969561081667, + "grad_norm": 0.3587031960487366, + "learning_rate": 6.659617475200434e-05, + "loss": 3.8794, + "step": 39350 + }, + { + "epoch": 2.673936676178829, + "grad_norm": 0.15070907771587372, + "learning_rate": 6.659192825112108e-05, + "loss": 3.9589, + "step": 39355 + }, + { + "epoch": 2.6742763962494904, + "grad_norm": 0.14258132874965668, + "learning_rate": 6.658768175023781e-05, + "loss": 3.6405, + "step": 39360 + }, + { + "epoch": 2.674616116320152, + "grad_norm": 0.18048407137393951, + "learning_rate": 6.658343524935453e-05, + "loss": 3.7725, + "step": 39365 + }, + { + "epoch": 2.674955836390814, + "grad_norm": 0.17306837439537048, + "learning_rate": 6.657918874847127e-05, + "loss": 3.9384, + "step": 39370 + }, + { + "epoch": 2.6752955564614758, + "grad_norm": 0.22409477829933167, + "learning_rate": 6.6574942247588e-05, + "loss": 3.9643, + "step": 39375 + }, + { + "epoch": 2.6756352765321374, + "grad_norm": 0.15564727783203125, + "learning_rate": 6.657069574670471e-05, + "loss": 3.8316, + "step": 39380 + }, + { + "epoch": 2.6759749966027995, + "grad_norm": 0.15814858675003052, + "learning_rate": 6.656644924582145e-05, + "loss": 3.9425, + "step": 39385 + }, + { + "epoch": 2.676314716673461, + "grad_norm": 0.16711993515491486, + "learning_rate": 6.656220274493818e-05, + "loss": 3.9561, + "step": 39390 + }, + { + "epoch": 2.6766544367441227, + "grad_norm": 0.18639622628688812, + "learning_rate": 6.65579562440549e-05, + "loss": 4.0069, + "step": 39395 + }, + { + "epoch": 2.676994156814785, + "grad_norm": 0.18486663699150085, + "learning_rate": 6.655370974317163e-05, + "loss": 4.134, + "step": 39400 + }, + { + "epoch": 2.6773338768854464, + "grad_norm": 0.2933027148246765, + "learning_rate": 6.654946324228836e-05, + "loss": 3.965, + "step": 39405 + }, + { + "epoch": 2.677673596956108, + "grad_norm": 0.17986251413822174, + "learning_rate": 6.654521674140508e-05, + "loss": 3.7952, + "step": 39410 + }, + { + "epoch": 2.67801331702677, + "grad_norm": 0.13847807049751282, + "learning_rate": 6.654097024052182e-05, + "loss": 3.9745, + "step": 39415 + }, + { + "epoch": 2.6783530370974318, + "grad_norm": 1.06475830078125, + "learning_rate": 6.653672373963853e-05, + "loss": 3.8382, + "step": 39420 + }, + { + "epoch": 2.6786927571680934, + "grad_norm": 0.1789160668849945, + "learning_rate": 6.653247723875526e-05, + "loss": 3.9702, + "step": 39425 + }, + { + "epoch": 2.6790324772387555, + "grad_norm": 0.19135092198848724, + "learning_rate": 6.6528230737872e-05, + "loss": 3.8738, + "step": 39430 + }, + { + "epoch": 2.679372197309417, + "grad_norm": 0.17377467453479767, + "learning_rate": 6.652398423698872e-05, + "loss": 3.9093, + "step": 39435 + }, + { + "epoch": 2.6797119173800787, + "grad_norm": 0.14184941351413727, + "learning_rate": 6.651973773610545e-05, + "loss": 3.8844, + "step": 39440 + }, + { + "epoch": 2.680051637450741, + "grad_norm": 0.15905126929283142, + "learning_rate": 6.651549123522219e-05, + "loss": 3.9719, + "step": 39445 + }, + { + "epoch": 2.6803913575214025, + "grad_norm": 0.15074039995670319, + "learning_rate": 6.65112447343389e-05, + "loss": 3.9636, + "step": 39450 + }, + { + "epoch": 2.680731077592064, + "grad_norm": 0.19336898624897003, + "learning_rate": 6.650699823345563e-05, + "loss": 3.833, + "step": 39455 + }, + { + "epoch": 2.681070797662726, + "grad_norm": 0.16907581686973572, + "learning_rate": 6.650275173257237e-05, + "loss": 3.8143, + "step": 39460 + }, + { + "epoch": 2.681410517733388, + "grad_norm": 0.16990873217582703, + "learning_rate": 6.649850523168909e-05, + "loss": 3.7794, + "step": 39465 + }, + { + "epoch": 2.6817502378040494, + "grad_norm": 0.14004813134670258, + "learning_rate": 6.649425873080581e-05, + "loss": 3.9059, + "step": 39470 + }, + { + "epoch": 2.6820899578747115, + "grad_norm": 0.37999001145362854, + "learning_rate": 6.649001222992255e-05, + "loss": 3.7908, + "step": 39475 + }, + { + "epoch": 2.682429677945373, + "grad_norm": 0.13811467587947845, + "learning_rate": 6.648576572903927e-05, + "loss": 3.7795, + "step": 39480 + }, + { + "epoch": 2.6827693980160348, + "grad_norm": 0.197072371840477, + "learning_rate": 6.6481519228156e-05, + "loss": 4.0253, + "step": 39485 + }, + { + "epoch": 2.683109118086697, + "grad_norm": 0.18520306050777435, + "learning_rate": 6.647727272727274e-05, + "loss": 3.983, + "step": 39490 + }, + { + "epoch": 2.6834488381573585, + "grad_norm": 0.16379015147686005, + "learning_rate": 6.647302622638945e-05, + "loss": 3.8434, + "step": 39495 + }, + { + "epoch": 2.68378855822802, + "grad_norm": 0.15837998688220978, + "learning_rate": 6.646877972550618e-05, + "loss": 3.7335, + "step": 39500 + }, + { + "epoch": 2.6841282782986817, + "grad_norm": 0.18392331898212433, + "learning_rate": 6.646453322462291e-05, + "loss": 3.8193, + "step": 39505 + }, + { + "epoch": 2.684467998369344, + "grad_norm": 0.23194344341754913, + "learning_rate": 6.646028672373964e-05, + "loss": 3.8866, + "step": 39510 + }, + { + "epoch": 2.6848077184400054, + "grad_norm": 0.19044387340545654, + "learning_rate": 6.645604022285638e-05, + "loss": 3.7335, + "step": 39515 + }, + { + "epoch": 2.685147438510667, + "grad_norm": 0.27145281434059143, + "learning_rate": 6.64517937219731e-05, + "loss": 3.7586, + "step": 39520 + }, + { + "epoch": 2.685487158581329, + "grad_norm": 0.26841282844543457, + "learning_rate": 6.644754722108982e-05, + "loss": 3.7374, + "step": 39525 + }, + { + "epoch": 2.6858268786519908, + "grad_norm": 0.20673756301403046, + "learning_rate": 6.644330072020656e-05, + "loss": 3.8455, + "step": 39530 + }, + { + "epoch": 2.6861665987226524, + "grad_norm": 7.591858386993408, + "learning_rate": 6.643905421932328e-05, + "loss": 3.9965, + "step": 39535 + }, + { + "epoch": 2.6865063187933145, + "grad_norm": 0.1811532825231552, + "learning_rate": 6.643480771844e-05, + "loss": 3.8152, + "step": 39540 + }, + { + "epoch": 2.686846038863976, + "grad_norm": 0.3619449734687805, + "learning_rate": 6.643056121755675e-05, + "loss": 3.6979, + "step": 39545 + }, + { + "epoch": 2.6871857589346377, + "grad_norm": 0.2250664085149765, + "learning_rate": 6.642631471667346e-05, + "loss": 3.9152, + "step": 39550 + }, + { + "epoch": 2.6875254790052994, + "grad_norm": 0.22007302939891815, + "learning_rate": 6.642206821579019e-05, + "loss": 3.9038, + "step": 39555 + }, + { + "epoch": 2.6878651990759614, + "grad_norm": 0.2050907164812088, + "learning_rate": 6.641782171490693e-05, + "loss": 3.7938, + "step": 39560 + }, + { + "epoch": 2.688204919146623, + "grad_norm": 0.158903568983078, + "learning_rate": 6.641357521402365e-05, + "loss": 3.9762, + "step": 39565 + }, + { + "epoch": 2.6885446392172847, + "grad_norm": 0.25433260202407837, + "learning_rate": 6.640932871314037e-05, + "loss": 3.9299, + "step": 39570 + }, + { + "epoch": 2.688884359287947, + "grad_norm": 0.16117116808891296, + "learning_rate": 6.64050822122571e-05, + "loss": 3.8442, + "step": 39575 + }, + { + "epoch": 2.6892240793586084, + "grad_norm": 0.28073304891586304, + "learning_rate": 6.640083571137383e-05, + "loss": 3.8709, + "step": 39580 + }, + { + "epoch": 2.68956379942927, + "grad_norm": 0.18658871948719025, + "learning_rate": 6.639658921049056e-05, + "loss": 4.1196, + "step": 39585 + }, + { + "epoch": 2.689903519499932, + "grad_norm": 0.8785406947135925, + "learning_rate": 6.639234270960729e-05, + "loss": 4.0303, + "step": 39590 + }, + { + "epoch": 2.6902432395705937, + "grad_norm": 0.1607578843832016, + "learning_rate": 6.638809620872401e-05, + "loss": 3.9262, + "step": 39595 + }, + { + "epoch": 2.6905829596412554, + "grad_norm": 0.6528551578521729, + "learning_rate": 6.638384970784074e-05, + "loss": 3.9307, + "step": 39600 + }, + { + "epoch": 2.6909226797119175, + "grad_norm": 0.15952497720718384, + "learning_rate": 6.637960320695747e-05, + "loss": 3.8217, + "step": 39605 + }, + { + "epoch": 2.691262399782579, + "grad_norm": 0.18700894713401794, + "learning_rate": 6.63753567060742e-05, + "loss": 3.8485, + "step": 39610 + }, + { + "epoch": 2.6916021198532407, + "grad_norm": 0.9578309059143066, + "learning_rate": 6.637111020519093e-05, + "loss": 4.0137, + "step": 39615 + }, + { + "epoch": 2.691941839923903, + "grad_norm": 0.15109091997146606, + "learning_rate": 6.636686370430765e-05, + "loss": 3.8342, + "step": 39620 + }, + { + "epoch": 2.6922815599945644, + "grad_norm": 0.1892438679933548, + "learning_rate": 6.636261720342438e-05, + "loss": 3.957, + "step": 39625 + }, + { + "epoch": 2.692621280065226, + "grad_norm": 0.16824592649936676, + "learning_rate": 6.635837070254111e-05, + "loss": 3.8375, + "step": 39630 + }, + { + "epoch": 2.692961000135888, + "grad_norm": 0.20034334063529968, + "learning_rate": 6.635412420165784e-05, + "loss": 3.8725, + "step": 39635 + }, + { + "epoch": 2.6933007202065498, + "grad_norm": 0.1790536493062973, + "learning_rate": 6.634987770077457e-05, + "loss": 3.8681, + "step": 39640 + }, + { + "epoch": 2.6936404402772114, + "grad_norm": 0.22760643064975739, + "learning_rate": 6.63456311998913e-05, + "loss": 3.6271, + "step": 39645 + }, + { + "epoch": 2.6939801603478735, + "grad_norm": 4.584059238433838, + "learning_rate": 6.634138469900802e-05, + "loss": 3.865, + "step": 39650 + }, + { + "epoch": 2.694319880418535, + "grad_norm": 0.17609962821006775, + "learning_rate": 6.633713819812475e-05, + "loss": 3.8464, + "step": 39655 + }, + { + "epoch": 2.6946596004891967, + "grad_norm": 0.22584731876850128, + "learning_rate": 6.633289169724148e-05, + "loss": 3.8556, + "step": 39660 + }, + { + "epoch": 2.694999320559859, + "grad_norm": 0.18846385180950165, + "learning_rate": 6.63286451963582e-05, + "loss": 3.9956, + "step": 39665 + }, + { + "epoch": 2.6953390406305204, + "grad_norm": 0.1520487666130066, + "learning_rate": 6.632439869547493e-05, + "loss": 3.8056, + "step": 39670 + }, + { + "epoch": 2.695678760701182, + "grad_norm": 0.19814421236515045, + "learning_rate": 6.632015219459166e-05, + "loss": 3.6232, + "step": 39675 + }, + { + "epoch": 2.696018480771844, + "grad_norm": 0.14729301631450653, + "learning_rate": 6.631590569370839e-05, + "loss": 3.8493, + "step": 39680 + }, + { + "epoch": 2.6963582008425058, + "grad_norm": 0.1963522732257843, + "learning_rate": 6.631165919282512e-05, + "loss": 3.8582, + "step": 39685 + }, + { + "epoch": 2.6966979209131674, + "grad_norm": 0.1826598197221756, + "learning_rate": 6.630741269194185e-05, + "loss": 3.7874, + "step": 39690 + }, + { + "epoch": 2.6970376409838295, + "grad_norm": 0.15741696953773499, + "learning_rate": 6.630316619105857e-05, + "loss": 3.8341, + "step": 39695 + }, + { + "epoch": 2.697377361054491, + "grad_norm": 0.20099465548992157, + "learning_rate": 6.62989196901753e-05, + "loss": 3.9654, + "step": 39700 + }, + { + "epoch": 2.6977170811251527, + "grad_norm": 0.18356665968894958, + "learning_rate": 6.629467318929202e-05, + "loss": 3.7643, + "step": 39705 + }, + { + "epoch": 2.698056801195815, + "grad_norm": 0.21813075244426727, + "learning_rate": 6.629042668840876e-05, + "loss": 3.93, + "step": 39710 + }, + { + "epoch": 2.6983965212664764, + "grad_norm": 0.26092728972435, + "learning_rate": 6.628618018752549e-05, + "loss": 3.7554, + "step": 39715 + }, + { + "epoch": 2.698736241337138, + "grad_norm": 0.2901889979839325, + "learning_rate": 6.62819336866422e-05, + "loss": 4.0609, + "step": 39720 + }, + { + "epoch": 2.6990759614078, + "grad_norm": 0.1503990888595581, + "learning_rate": 6.627768718575894e-05, + "loss": 3.8503, + "step": 39725 + }, + { + "epoch": 2.699415681478462, + "grad_norm": 0.15313531458377838, + "learning_rate": 6.627344068487567e-05, + "loss": 3.728, + "step": 39730 + }, + { + "epoch": 2.6997554015491234, + "grad_norm": 0.2051745504140854, + "learning_rate": 6.626919418399238e-05, + "loss": 3.8485, + "step": 39735 + }, + { + "epoch": 2.7000951216197855, + "grad_norm": 0.21384038031101227, + "learning_rate": 6.626494768310913e-05, + "loss": 3.5808, + "step": 39740 + }, + { + "epoch": 2.700434841690447, + "grad_norm": 0.17155736684799194, + "learning_rate": 6.626070118222585e-05, + "loss": 3.856, + "step": 39745 + }, + { + "epoch": 2.7007745617611087, + "grad_norm": 0.21808399260044098, + "learning_rate": 6.625645468134257e-05, + "loss": 4.0063, + "step": 39750 + }, + { + "epoch": 2.701114281831771, + "grad_norm": 0.14718694984912872, + "learning_rate": 6.625220818045931e-05, + "loss": 3.7663, + "step": 39755 + }, + { + "epoch": 2.7014540019024325, + "grad_norm": 0.25667688250541687, + "learning_rate": 6.624796167957604e-05, + "loss": 3.9097, + "step": 39760 + }, + { + "epoch": 2.701793721973094, + "grad_norm": 0.15612122416496277, + "learning_rate": 6.624371517869275e-05, + "loss": 4.0019, + "step": 39765 + }, + { + "epoch": 2.702133442043756, + "grad_norm": 0.16179856657981873, + "learning_rate": 6.62394686778095e-05, + "loss": 3.802, + "step": 39770 + }, + { + "epoch": 2.702473162114418, + "grad_norm": 0.1495223194360733, + "learning_rate": 6.623522217692621e-05, + "loss": 3.6474, + "step": 39775 + }, + { + "epoch": 2.7028128821850794, + "grad_norm": 0.19371001422405243, + "learning_rate": 6.623097567604294e-05, + "loss": 3.7462, + "step": 39780 + }, + { + "epoch": 2.7031526022557415, + "grad_norm": 0.17205655574798584, + "learning_rate": 6.622672917515968e-05, + "loss": 3.8506, + "step": 39785 + }, + { + "epoch": 2.703492322326403, + "grad_norm": 0.4129747748374939, + "learning_rate": 6.622248267427639e-05, + "loss": 3.9493, + "step": 39790 + }, + { + "epoch": 2.7038320423970648, + "grad_norm": 0.22093883156776428, + "learning_rate": 6.621823617339312e-05, + "loss": 3.9803, + "step": 39795 + }, + { + "epoch": 2.704171762467727, + "grad_norm": 0.27696123719215393, + "learning_rate": 6.621398967250986e-05, + "loss": 3.8362, + "step": 39800 + }, + { + "epoch": 2.7045114825383885, + "grad_norm": 0.18172483146190643, + "learning_rate": 6.620974317162658e-05, + "loss": 3.8541, + "step": 39805 + }, + { + "epoch": 2.70485120260905, + "grad_norm": 0.28370410203933716, + "learning_rate": 6.62054966707433e-05, + "loss": 3.8253, + "step": 39810 + }, + { + "epoch": 2.705190922679712, + "grad_norm": 0.18817420303821564, + "learning_rate": 6.620125016986005e-05, + "loss": 3.7454, + "step": 39815 + }, + { + "epoch": 2.705530642750374, + "grad_norm": 0.17720147967338562, + "learning_rate": 6.619700366897676e-05, + "loss": 3.9119, + "step": 39820 + }, + { + "epoch": 2.7058703628210354, + "grad_norm": 0.16161775588989258, + "learning_rate": 6.619275716809349e-05, + "loss": 3.7585, + "step": 39825 + }, + { + "epoch": 2.7062100828916975, + "grad_norm": 1.1930475234985352, + "learning_rate": 6.618851066721023e-05, + "loss": 3.7589, + "step": 39830 + }, + { + "epoch": 2.706549802962359, + "grad_norm": 0.17353136837482452, + "learning_rate": 6.618426416632694e-05, + "loss": 3.7958, + "step": 39835 + }, + { + "epoch": 2.7068895230330208, + "grad_norm": 0.1739385724067688, + "learning_rate": 6.618001766544367e-05, + "loss": 3.7778, + "step": 39840 + }, + { + "epoch": 2.7072292431036824, + "grad_norm": 0.17696277797222137, + "learning_rate": 6.617577116456041e-05, + "loss": 3.7335, + "step": 39845 + }, + { + "epoch": 2.7075689631743445, + "grad_norm": 0.19879163801670074, + "learning_rate": 6.617152466367713e-05, + "loss": 3.9534, + "step": 39850 + }, + { + "epoch": 2.707908683245006, + "grad_norm": 0.18951986730098724, + "learning_rate": 6.616727816279387e-05, + "loss": 3.812, + "step": 39855 + }, + { + "epoch": 2.7082484033156677, + "grad_norm": 0.25068920850753784, + "learning_rate": 6.616303166191058e-05, + "loss": 3.8314, + "step": 39860 + }, + { + "epoch": 2.70858812338633, + "grad_norm": 0.21665506064891815, + "learning_rate": 6.615878516102731e-05, + "loss": 3.9486, + "step": 39865 + }, + { + "epoch": 2.7089278434569914, + "grad_norm": 0.22324860095977783, + "learning_rate": 6.615453866014405e-05, + "loss": 3.9916, + "step": 39870 + }, + { + "epoch": 2.709267563527653, + "grad_norm": 1.0307663679122925, + "learning_rate": 6.615029215926077e-05, + "loss": 3.8132, + "step": 39875 + }, + { + "epoch": 2.709607283598315, + "grad_norm": 0.5131462812423706, + "learning_rate": 6.61460456583775e-05, + "loss": 3.7907, + "step": 39880 + }, + { + "epoch": 2.709947003668977, + "grad_norm": 0.3580879271030426, + "learning_rate": 6.614179915749424e-05, + "loss": 3.9488, + "step": 39885 + }, + { + "epoch": 2.7102867237396384, + "grad_norm": 0.16872943937778473, + "learning_rate": 6.613755265661095e-05, + "loss": 3.8775, + "step": 39890 + }, + { + "epoch": 2.7106264438103, + "grad_norm": 0.26246222853660583, + "learning_rate": 6.613330615572768e-05, + "loss": 3.8709, + "step": 39895 + }, + { + "epoch": 2.710966163880962, + "grad_norm": 0.4791730046272278, + "learning_rate": 6.612905965484442e-05, + "loss": 3.8326, + "step": 39900 + }, + { + "epoch": 2.7113058839516238, + "grad_norm": 0.210224449634552, + "learning_rate": 6.612481315396114e-05, + "loss": 3.9615, + "step": 39905 + }, + { + "epoch": 2.7116456040222854, + "grad_norm": 0.18140962719917297, + "learning_rate": 6.612056665307786e-05, + "loss": 3.9622, + "step": 39910 + }, + { + "epoch": 2.7119853240929475, + "grad_norm": 0.1813540756702423, + "learning_rate": 6.61163201521946e-05, + "loss": 3.8529, + "step": 39915 + }, + { + "epoch": 2.712325044163609, + "grad_norm": 0.14694467186927795, + "learning_rate": 6.611207365131132e-05, + "loss": 3.9863, + "step": 39920 + }, + { + "epoch": 2.7126647642342707, + "grad_norm": 0.14785657823085785, + "learning_rate": 6.610782715042805e-05, + "loss": 4.0107, + "step": 39925 + }, + { + "epoch": 2.713004484304933, + "grad_norm": 0.20011214911937714, + "learning_rate": 6.610358064954478e-05, + "loss": 4.0168, + "step": 39930 + }, + { + "epoch": 2.7133442043755944, + "grad_norm": 0.22491669654846191, + "learning_rate": 6.60993341486615e-05, + "loss": 3.9823, + "step": 39935 + }, + { + "epoch": 2.713683924446256, + "grad_norm": 0.1897081583738327, + "learning_rate": 6.609508764777823e-05, + "loss": 3.7362, + "step": 39940 + }, + { + "epoch": 2.714023644516918, + "grad_norm": 0.15833528339862823, + "learning_rate": 6.609084114689496e-05, + "loss": 3.6499, + "step": 39945 + }, + { + "epoch": 2.7143633645875798, + "grad_norm": 0.21260908246040344, + "learning_rate": 6.608659464601169e-05, + "loss": 3.7418, + "step": 39950 + }, + { + "epoch": 2.7147030846582414, + "grad_norm": 0.16621440649032593, + "learning_rate": 6.608234814512842e-05, + "loss": 3.8024, + "step": 39955 + }, + { + "epoch": 2.7150428047289035, + "grad_norm": 0.17978914082050323, + "learning_rate": 6.607810164424514e-05, + "loss": 4.0924, + "step": 39960 + }, + { + "epoch": 2.715382524799565, + "grad_norm": 0.23326647281646729, + "learning_rate": 6.607385514336187e-05, + "loss": 3.844, + "step": 39965 + }, + { + "epoch": 2.7157222448702267, + "grad_norm": 0.15309691429138184, + "learning_rate": 6.60696086424786e-05, + "loss": 3.518, + "step": 39970 + }, + { + "epoch": 2.716061964940889, + "grad_norm": 0.16847506165504456, + "learning_rate": 6.606536214159533e-05, + "loss": 3.94, + "step": 39975 + }, + { + "epoch": 2.7164016850115504, + "grad_norm": 0.23509420454502106, + "learning_rate": 6.606111564071206e-05, + "loss": 3.7791, + "step": 39980 + }, + { + "epoch": 2.716741405082212, + "grad_norm": 0.1719091832637787, + "learning_rate": 6.605686913982878e-05, + "loss": 3.838, + "step": 39985 + }, + { + "epoch": 2.717081125152874, + "grad_norm": 0.16519078612327576, + "learning_rate": 6.605262263894551e-05, + "loss": 3.8194, + "step": 39990 + }, + { + "epoch": 2.7174208452235358, + "grad_norm": 0.171021968126297, + "learning_rate": 6.604837613806224e-05, + "loss": 3.8939, + "step": 39995 + }, + { + "epoch": 2.7177605652941974, + "grad_norm": 0.1478385627269745, + "learning_rate": 6.604412963717897e-05, + "loss": 3.9125, + "step": 40000 + }, + { + "epoch": 2.7181002853648595, + "grad_norm": 0.24623793363571167, + "learning_rate": 6.60398831362957e-05, + "loss": 3.965, + "step": 40005 + }, + { + "epoch": 2.718440005435521, + "grad_norm": 0.22753620147705078, + "learning_rate": 6.603563663541242e-05, + "loss": 3.8258, + "step": 40010 + }, + { + "epoch": 2.7187797255061827, + "grad_norm": 0.1599360853433609, + "learning_rate": 6.603139013452915e-05, + "loss": 3.7718, + "step": 40015 + }, + { + "epoch": 2.719119445576845, + "grad_norm": 0.1641189008951187, + "learning_rate": 6.602714363364588e-05, + "loss": 4.0665, + "step": 40020 + }, + { + "epoch": 2.7194591656475064, + "grad_norm": 0.17975294589996338, + "learning_rate": 6.602289713276261e-05, + "loss": 3.9797, + "step": 40025 + }, + { + "epoch": 2.719798885718168, + "grad_norm": 0.22373978793621063, + "learning_rate": 6.601865063187934e-05, + "loss": 4.0896, + "step": 40030 + }, + { + "epoch": 2.72013860578883, + "grad_norm": 0.18035510182380676, + "learning_rate": 6.601440413099606e-05, + "loss": 3.6566, + "step": 40035 + }, + { + "epoch": 2.720478325859492, + "grad_norm": 1.1899406909942627, + "learning_rate": 6.601015763011279e-05, + "loss": 3.9387, + "step": 40040 + }, + { + "epoch": 2.7208180459301534, + "grad_norm": 0.17865894734859467, + "learning_rate": 6.600591112922952e-05, + "loss": 3.8446, + "step": 40045 + }, + { + "epoch": 2.7211577660008155, + "grad_norm": 0.22594071924686432, + "learning_rate": 6.600166462834625e-05, + "loss": 3.7914, + "step": 40050 + }, + { + "epoch": 2.721497486071477, + "grad_norm": 0.18844184279441833, + "learning_rate": 6.599741812746298e-05, + "loss": 3.9151, + "step": 40055 + }, + { + "epoch": 2.7218372061421388, + "grad_norm": 0.2650431990623474, + "learning_rate": 6.599317162657969e-05, + "loss": 3.617, + "step": 40060 + }, + { + "epoch": 2.722176926212801, + "grad_norm": 0.44453850388526917, + "learning_rate": 6.598892512569643e-05, + "loss": 3.8429, + "step": 40065 + }, + { + "epoch": 2.7225166462834625, + "grad_norm": 0.17432065308094025, + "learning_rate": 6.598467862481316e-05, + "loss": 3.8675, + "step": 40070 + }, + { + "epoch": 2.722856366354124, + "grad_norm": 0.1920483559370041, + "learning_rate": 6.598043212392988e-05, + "loss": 3.9324, + "step": 40075 + }, + { + "epoch": 2.723196086424786, + "grad_norm": 0.20909032225608826, + "learning_rate": 6.597618562304662e-05, + "loss": 3.6866, + "step": 40080 + }, + { + "epoch": 2.723535806495448, + "grad_norm": 0.17149361968040466, + "learning_rate": 6.597193912216334e-05, + "loss": 3.9968, + "step": 40085 + }, + { + "epoch": 2.7238755265661094, + "grad_norm": 0.28957027196884155, + "learning_rate": 6.596769262128006e-05, + "loss": 3.912, + "step": 40090 + }, + { + "epoch": 2.7242152466367715, + "grad_norm": 0.1618124395608902, + "learning_rate": 6.59634461203968e-05, + "loss": 4.0366, + "step": 40095 + }, + { + "epoch": 2.724554966707433, + "grad_norm": 0.17989583313465118, + "learning_rate": 6.595919961951353e-05, + "loss": 3.8824, + "step": 40100 + }, + { + "epoch": 2.7248946867780948, + "grad_norm": 0.2584648132324219, + "learning_rate": 6.595495311863024e-05, + "loss": 3.6319, + "step": 40105 + }, + { + "epoch": 2.725234406848757, + "grad_norm": 0.38605040311813354, + "learning_rate": 6.595070661774698e-05, + "loss": 4.041, + "step": 40110 + }, + { + "epoch": 2.7255741269194185, + "grad_norm": 0.19282405078411102, + "learning_rate": 6.594646011686371e-05, + "loss": 3.8692, + "step": 40115 + }, + { + "epoch": 2.72591384699008, + "grad_norm": 0.23990386724472046, + "learning_rate": 6.594221361598043e-05, + "loss": 3.7904, + "step": 40120 + }, + { + "epoch": 2.726253567060742, + "grad_norm": 0.16096271574497223, + "learning_rate": 6.593796711509717e-05, + "loss": 3.6122, + "step": 40125 + }, + { + "epoch": 2.726593287131404, + "grad_norm": 0.20079128444194794, + "learning_rate": 6.593372061421388e-05, + "loss": 3.9399, + "step": 40130 + }, + { + "epoch": 2.7269330072020654, + "grad_norm": 0.19019706547260284, + "learning_rate": 6.592947411333061e-05, + "loss": 3.8701, + "step": 40135 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.1883193999528885, + "learning_rate": 6.592522761244735e-05, + "loss": 3.6792, + "step": 40140 + }, + { + "epoch": 2.727612447343389, + "grad_norm": 0.2996148467063904, + "learning_rate": 6.592098111156407e-05, + "loss": 3.9258, + "step": 40145 + }, + { + "epoch": 2.7279521674140508, + "grad_norm": 0.18609175086021423, + "learning_rate": 6.59167346106808e-05, + "loss": 3.868, + "step": 40150 + }, + { + "epoch": 2.728291887484713, + "grad_norm": 0.12854738533496857, + "learning_rate": 6.591248810979754e-05, + "loss": 3.8805, + "step": 40155 + }, + { + "epoch": 2.7286316075553745, + "grad_norm": 0.1844760924577713, + "learning_rate": 6.590824160891425e-05, + "loss": 3.7392, + "step": 40160 + }, + { + "epoch": 2.728971327626036, + "grad_norm": 0.7045885324478149, + "learning_rate": 6.590399510803098e-05, + "loss": 4.2123, + "step": 40165 + }, + { + "epoch": 2.729311047696698, + "grad_norm": 0.21266746520996094, + "learning_rate": 6.589974860714772e-05, + "loss": 3.8436, + "step": 40170 + }, + { + "epoch": 2.72965076776736, + "grad_norm": 5.829078197479248, + "learning_rate": 6.589550210626444e-05, + "loss": 3.9811, + "step": 40175 + }, + { + "epoch": 2.7299904878380215, + "grad_norm": 0.22741572558879852, + "learning_rate": 6.589125560538116e-05, + "loss": 4.0817, + "step": 40180 + }, + { + "epoch": 2.730330207908683, + "grad_norm": 0.1738944947719574, + "learning_rate": 6.58870091044979e-05, + "loss": 3.9622, + "step": 40185 + }, + { + "epoch": 2.730669927979345, + "grad_norm": 0.15869717299938202, + "learning_rate": 6.588276260361462e-05, + "loss": 3.9534, + "step": 40190 + }, + { + "epoch": 2.731009648050007, + "grad_norm": 0.15207569301128387, + "learning_rate": 6.587851610273136e-05, + "loss": 3.848, + "step": 40195 + }, + { + "epoch": 2.7313493681206684, + "grad_norm": 1.98995840549469, + "learning_rate": 6.587426960184808e-05, + "loss": 4.0886, + "step": 40200 + }, + { + "epoch": 2.7316890881913305, + "grad_norm": 0.30862078070640564, + "learning_rate": 6.58700231009648e-05, + "loss": 3.6193, + "step": 40205 + }, + { + "epoch": 2.732028808261992, + "grad_norm": 0.1568174809217453, + "learning_rate": 6.586577660008155e-05, + "loss": 3.9364, + "step": 40210 + }, + { + "epoch": 2.7323685283326538, + "grad_norm": 0.1731138825416565, + "learning_rate": 6.586153009919826e-05, + "loss": 3.8855, + "step": 40215 + }, + { + "epoch": 2.732708248403316, + "grad_norm": 0.1631261557340622, + "learning_rate": 6.585728359831499e-05, + "loss": 3.8591, + "step": 40220 + }, + { + "epoch": 2.7330479684739775, + "grad_norm": 0.23158396780490875, + "learning_rate": 6.585303709743173e-05, + "loss": 3.8871, + "step": 40225 + }, + { + "epoch": 2.733387688544639, + "grad_norm": 0.16576160490512848, + "learning_rate": 6.584879059654844e-05, + "loss": 3.9189, + "step": 40230 + }, + { + "epoch": 2.7337274086153007, + "grad_norm": 0.4576801061630249, + "learning_rate": 6.584454409566517e-05, + "loss": 3.8331, + "step": 40235 + }, + { + "epoch": 2.734067128685963, + "grad_norm": 0.18447548151016235, + "learning_rate": 6.584029759478191e-05, + "loss": 3.9916, + "step": 40240 + }, + { + "epoch": 2.7344068487566244, + "grad_norm": 0.16919472813606262, + "learning_rate": 6.583605109389863e-05, + "loss": 3.7385, + "step": 40245 + }, + { + "epoch": 2.734746568827286, + "grad_norm": 0.21097266674041748, + "learning_rate": 6.583180459301536e-05, + "loss": 4.0224, + "step": 40250 + }, + { + "epoch": 2.735086288897948, + "grad_norm": 0.1827019900083542, + "learning_rate": 6.58275580921321e-05, + "loss": 4.0062, + "step": 40255 + }, + { + "epoch": 2.7354260089686098, + "grad_norm": 0.5310953259468079, + "learning_rate": 6.582331159124881e-05, + "loss": 3.9651, + "step": 40260 + }, + { + "epoch": 2.7357657290392714, + "grad_norm": 0.16291071474552155, + "learning_rate": 6.581906509036554e-05, + "loss": 3.7247, + "step": 40265 + }, + { + "epoch": 2.7361054491099335, + "grad_norm": 0.15003302693367004, + "learning_rate": 6.581481858948228e-05, + "loss": 3.8629, + "step": 40270 + }, + { + "epoch": 2.736445169180595, + "grad_norm": 0.19892387092113495, + "learning_rate": 6.5810572088599e-05, + "loss": 3.8214, + "step": 40275 + }, + { + "epoch": 2.7367848892512567, + "grad_norm": 0.1816316843032837, + "learning_rate": 6.580632558771572e-05, + "loss": 3.9516, + "step": 40280 + }, + { + "epoch": 2.737124609321919, + "grad_norm": 0.18819041550159454, + "learning_rate": 6.580207908683245e-05, + "loss": 3.7749, + "step": 40285 + }, + { + "epoch": 2.7374643293925804, + "grad_norm": 0.1544538289308548, + "learning_rate": 6.579783258594918e-05, + "loss": 3.8327, + "step": 40290 + }, + { + "epoch": 2.737804049463242, + "grad_norm": 0.14061178267002106, + "learning_rate": 6.579443538524256e-05, + "loss": 3.8083, + "step": 40295 + }, + { + "epoch": 2.738143769533904, + "grad_norm": 0.5974332690238953, + "learning_rate": 6.579018888435929e-05, + "loss": 3.7347, + "step": 40300 + }, + { + "epoch": 2.738483489604566, + "grad_norm": 0.16835050284862518, + "learning_rate": 6.578594238347602e-05, + "loss": 3.7531, + "step": 40305 + }, + { + "epoch": 2.7388232096752274, + "grad_norm": 0.7008256316184998, + "learning_rate": 6.578169588259275e-05, + "loss": 4.062, + "step": 40310 + }, + { + "epoch": 2.7391629297458895, + "grad_norm": 0.2814078629016876, + "learning_rate": 6.577744938170947e-05, + "loss": 3.8779, + "step": 40315 + }, + { + "epoch": 2.739502649816551, + "grad_norm": 0.5351347327232361, + "learning_rate": 6.57732028808262e-05, + "loss": 3.5861, + "step": 40320 + }, + { + "epoch": 2.7398423698872127, + "grad_norm": 0.19020280241966248, + "learning_rate": 6.576895637994293e-05, + "loss": 4.0736, + "step": 40325 + }, + { + "epoch": 2.740182089957875, + "grad_norm": 0.18085609376430511, + "learning_rate": 6.576470987905966e-05, + "loss": 3.8753, + "step": 40330 + }, + { + "epoch": 2.7405218100285365, + "grad_norm": 0.14648064970970154, + "learning_rate": 6.576046337817639e-05, + "loss": 3.8376, + "step": 40335 + }, + { + "epoch": 2.740861530099198, + "grad_norm": 0.29427334666252136, + "learning_rate": 6.575621687729311e-05, + "loss": 3.8457, + "step": 40340 + }, + { + "epoch": 2.74120125016986, + "grad_norm": 0.3152358829975128, + "learning_rate": 6.575197037640984e-05, + "loss": 3.9346, + "step": 40345 + }, + { + "epoch": 2.741540970240522, + "grad_norm": 0.19927142560482025, + "learning_rate": 6.574772387552657e-05, + "loss": 3.4393, + "step": 40350 + }, + { + "epoch": 2.7418806903111834, + "grad_norm": 0.16474118828773499, + "learning_rate": 6.57434773746433e-05, + "loss": 3.6226, + "step": 40355 + }, + { + "epoch": 2.7422204103818455, + "grad_norm": 1.1669189929962158, + "learning_rate": 6.573923087376003e-05, + "loss": 3.996, + "step": 40360 + }, + { + "epoch": 2.742560130452507, + "grad_norm": 0.2000553160905838, + "learning_rate": 6.573498437287675e-05, + "loss": 3.9665, + "step": 40365 + }, + { + "epoch": 2.7428998505231688, + "grad_norm": 0.15570257604122162, + "learning_rate": 6.573073787199347e-05, + "loss": 4.0611, + "step": 40370 + }, + { + "epoch": 2.743239570593831, + "grad_norm": 0.24418023228645325, + "learning_rate": 6.572649137111021e-05, + "loss": 3.9471, + "step": 40375 + }, + { + "epoch": 2.7435792906644925, + "grad_norm": 0.19269193708896637, + "learning_rate": 6.572224487022694e-05, + "loss": 3.9739, + "step": 40380 + }, + { + "epoch": 2.743919010735154, + "grad_norm": 0.44593295454978943, + "learning_rate": 6.571799836934365e-05, + "loss": 3.9395, + "step": 40385 + }, + { + "epoch": 2.744258730805816, + "grad_norm": 0.628871500492096, + "learning_rate": 6.57137518684604e-05, + "loss": 3.9744, + "step": 40390 + }, + { + "epoch": 2.744598450876478, + "grad_norm": 0.18951943516731262, + "learning_rate": 6.570950536757712e-05, + "loss": 3.8479, + "step": 40395 + }, + { + "epoch": 2.7449381709471394, + "grad_norm": 0.16621486842632294, + "learning_rate": 6.570525886669385e-05, + "loss": 3.6889, + "step": 40400 + }, + { + "epoch": 2.7452778910178015, + "grad_norm": 0.17868536710739136, + "learning_rate": 6.570101236581058e-05, + "loss": 3.9637, + "step": 40405 + }, + { + "epoch": 2.745617611088463, + "grad_norm": 0.21636687219142914, + "learning_rate": 6.56967658649273e-05, + "loss": 3.8438, + "step": 40410 + }, + { + "epoch": 2.7459573311591248, + "grad_norm": 0.16046690940856934, + "learning_rate": 6.569251936404403e-05, + "loss": 3.9113, + "step": 40415 + }, + { + "epoch": 2.746297051229787, + "grad_norm": 0.14451414346694946, + "learning_rate": 6.568827286316076e-05, + "loss": 3.8904, + "step": 40420 + }, + { + "epoch": 2.7466367713004485, + "grad_norm": 0.14863041043281555, + "learning_rate": 6.568402636227749e-05, + "loss": 3.9555, + "step": 40425 + }, + { + "epoch": 2.74697649137111, + "grad_norm": 0.15235179662704468, + "learning_rate": 6.567977986139422e-05, + "loss": 3.5804, + "step": 40430 + }, + { + "epoch": 2.747316211441772, + "grad_norm": 0.19257758557796478, + "learning_rate": 6.567553336051095e-05, + "loss": 3.8306, + "step": 40435 + }, + { + "epoch": 2.747655931512434, + "grad_norm": 0.17143167555332184, + "learning_rate": 6.567128685962767e-05, + "loss": 3.9823, + "step": 40440 + }, + { + "epoch": 2.7479956515830954, + "grad_norm": 0.16271299123764038, + "learning_rate": 6.56670403587444e-05, + "loss": 3.8297, + "step": 40445 + }, + { + "epoch": 2.7483353716537575, + "grad_norm": 0.19221942126750946, + "learning_rate": 6.566279385786113e-05, + "loss": 3.9872, + "step": 40450 + }, + { + "epoch": 2.748675091724419, + "grad_norm": 0.14956679940223694, + "learning_rate": 6.565854735697784e-05, + "loss": 3.8153, + "step": 40455 + }, + { + "epoch": 2.749014811795081, + "grad_norm": 0.2005520910024643, + "learning_rate": 6.565430085609459e-05, + "loss": 4.0717, + "step": 40460 + }, + { + "epoch": 2.749354531865743, + "grad_norm": 0.209285169839859, + "learning_rate": 6.565005435521131e-05, + "loss": 3.8395, + "step": 40465 + }, + { + "epoch": 2.7496942519364045, + "grad_norm": 0.21852613985538483, + "learning_rate": 6.564580785432803e-05, + "loss": 3.7326, + "step": 40470 + }, + { + "epoch": 2.750033972007066, + "grad_norm": 0.1734120398759842, + "learning_rate": 6.564156135344477e-05, + "loss": 3.9393, + "step": 40475 + }, + { + "epoch": 2.750373692077728, + "grad_norm": 0.18120142817497253, + "learning_rate": 6.56373148525615e-05, + "loss": 3.9683, + "step": 40480 + }, + { + "epoch": 2.75071341214839, + "grad_norm": 0.1512751430273056, + "learning_rate": 6.563306835167821e-05, + "loss": 3.8864, + "step": 40485 + }, + { + "epoch": 2.7510531322190515, + "grad_norm": 0.15235792100429535, + "learning_rate": 6.562882185079495e-05, + "loss": 3.7902, + "step": 40490 + }, + { + "epoch": 2.7513928522897135, + "grad_norm": 0.15778039395809174, + "learning_rate": 6.562457534991168e-05, + "loss": 3.8714, + "step": 40495 + }, + { + "epoch": 2.751732572360375, + "grad_norm": 0.17292603850364685, + "learning_rate": 6.56203288490284e-05, + "loss": 3.8589, + "step": 40500 + }, + { + "epoch": 2.752072292431037, + "grad_norm": 0.2646377384662628, + "learning_rate": 6.561608234814514e-05, + "loss": 3.7369, + "step": 40505 + }, + { + "epoch": 2.752412012501699, + "grad_norm": 0.16285067796707153, + "learning_rate": 6.561183584726187e-05, + "loss": 3.8616, + "step": 40510 + }, + { + "epoch": 2.7527517325723605, + "grad_norm": 0.19102561473846436, + "learning_rate": 6.560758934637858e-05, + "loss": 3.8916, + "step": 40515 + }, + { + "epoch": 2.753091452643022, + "grad_norm": 2.8341064453125, + "learning_rate": 6.560334284549532e-05, + "loss": 3.863, + "step": 40520 + }, + { + "epoch": 2.7534311727136838, + "grad_norm": 0.20734523236751556, + "learning_rate": 6.559909634461204e-05, + "loss": 3.8681, + "step": 40525 + }, + { + "epoch": 2.753770892784346, + "grad_norm": 0.16610004007816315, + "learning_rate": 6.559484984372876e-05, + "loss": 3.8994, + "step": 40530 + }, + { + "epoch": 2.7541106128550075, + "grad_norm": 0.12072641402482986, + "learning_rate": 6.55906033428455e-05, + "loss": 3.9937, + "step": 40535 + }, + { + "epoch": 2.754450332925669, + "grad_norm": 0.1845729500055313, + "learning_rate": 6.558635684196222e-05, + "loss": 3.8817, + "step": 40540 + }, + { + "epoch": 2.754790052996331, + "grad_norm": 0.1920740157365799, + "learning_rate": 6.558211034107895e-05, + "loss": 3.8501, + "step": 40545 + }, + { + "epoch": 2.755129773066993, + "grad_norm": 0.19515390694141388, + "learning_rate": 6.557786384019569e-05, + "loss": 4.0756, + "step": 40550 + }, + { + "epoch": 2.7554694931376544, + "grad_norm": 0.23563705384731293, + "learning_rate": 6.55736173393124e-05, + "loss": 3.8236, + "step": 40555 + }, + { + "epoch": 2.7558092132083165, + "grad_norm": 0.20250003039836884, + "learning_rate": 6.556937083842913e-05, + "loss": 3.8441, + "step": 40560 + }, + { + "epoch": 2.756148933278978, + "grad_norm": 0.14336341619491577, + "learning_rate": 6.556512433754587e-05, + "loss": 3.7142, + "step": 40565 + }, + { + "epoch": 2.7564886533496398, + "grad_norm": 0.15162959694862366, + "learning_rate": 6.556087783666259e-05, + "loss": 3.6701, + "step": 40570 + }, + { + "epoch": 2.7568283734203014, + "grad_norm": 0.2673413157463074, + "learning_rate": 6.555663133577932e-05, + "loss": 3.8417, + "step": 40575 + }, + { + "epoch": 2.7571680934909635, + "grad_norm": 0.21343950927257538, + "learning_rate": 6.555238483489606e-05, + "loss": 3.9766, + "step": 40580 + }, + { + "epoch": 2.757507813561625, + "grad_norm": 0.24644283950328827, + "learning_rate": 6.554813833401277e-05, + "loss": 3.7705, + "step": 40585 + }, + { + "epoch": 2.7578475336322867, + "grad_norm": 0.1672067940235138, + "learning_rate": 6.55438918331295e-05, + "loss": 3.9074, + "step": 40590 + }, + { + "epoch": 2.758187253702949, + "grad_norm": 0.20365940034389496, + "learning_rate": 6.553964533224623e-05, + "loss": 3.8728, + "step": 40595 + }, + { + "epoch": 2.7585269737736104, + "grad_norm": 0.2053121030330658, + "learning_rate": 6.553539883136296e-05, + "loss": 3.8873, + "step": 40600 + }, + { + "epoch": 2.758866693844272, + "grad_norm": 0.13314799964427948, + "learning_rate": 6.553115233047968e-05, + "loss": 3.8571, + "step": 40605 + }, + { + "epoch": 2.759206413914934, + "grad_norm": 0.24367006123065948, + "learning_rate": 6.552690582959641e-05, + "loss": 3.9959, + "step": 40610 + }, + { + "epoch": 2.759546133985596, + "grad_norm": 0.16490034759044647, + "learning_rate": 6.552265932871314e-05, + "loss": 3.4418, + "step": 40615 + }, + { + "epoch": 2.7598858540562574, + "grad_norm": 0.19236184656620026, + "learning_rate": 6.551841282782987e-05, + "loss": 4.1053, + "step": 40620 + }, + { + "epoch": 2.7602255741269195, + "grad_norm": 0.25260236859321594, + "learning_rate": 6.55141663269466e-05, + "loss": 3.7332, + "step": 40625 + }, + { + "epoch": 2.760565294197581, + "grad_norm": 0.2055732160806656, + "learning_rate": 6.550991982606332e-05, + "loss": 3.6278, + "step": 40630 + }, + { + "epoch": 2.7609050142682428, + "grad_norm": 0.16176554560661316, + "learning_rate": 6.550567332518005e-05, + "loss": 3.8164, + "step": 40635 + }, + { + "epoch": 2.761244734338905, + "grad_norm": 0.14101217687129974, + "learning_rate": 6.550142682429678e-05, + "loss": 3.4658, + "step": 40640 + }, + { + "epoch": 2.7615844544095665, + "grad_norm": 0.15700431168079376, + "learning_rate": 6.549718032341351e-05, + "loss": 3.956, + "step": 40645 + }, + { + "epoch": 2.761924174480228, + "grad_norm": 6.293766975402832, + "learning_rate": 6.549293382253024e-05, + "loss": 3.792, + "step": 40650 + }, + { + "epoch": 2.76226389455089, + "grad_norm": 0.17117133736610413, + "learning_rate": 6.548868732164696e-05, + "loss": 3.8466, + "step": 40655 + }, + { + "epoch": 2.762603614621552, + "grad_norm": 0.1971290558576584, + "learning_rate": 6.548444082076369e-05, + "loss": 3.9142, + "step": 40660 + }, + { + "epoch": 2.7629433346922134, + "grad_norm": 0.1511804163455963, + "learning_rate": 6.548019431988042e-05, + "loss": 3.7522, + "step": 40665 + }, + { + "epoch": 2.7632830547628755, + "grad_norm": 0.164160817861557, + "learning_rate": 6.547594781899715e-05, + "loss": 3.7902, + "step": 40670 + }, + { + "epoch": 2.763622774833537, + "grad_norm": 0.2801668047904968, + "learning_rate": 6.547170131811388e-05, + "loss": 3.7794, + "step": 40675 + }, + { + "epoch": 2.7639624949041988, + "grad_norm": 0.30183061957359314, + "learning_rate": 6.54674548172306e-05, + "loss": 3.6964, + "step": 40680 + }, + { + "epoch": 2.764302214974861, + "grad_norm": 0.22361962497234344, + "learning_rate": 6.546320831634733e-05, + "loss": 4.0271, + "step": 40685 + }, + { + "epoch": 2.7646419350455225, + "grad_norm": 0.2893846035003662, + "learning_rate": 6.545896181546406e-05, + "loss": 3.9466, + "step": 40690 + }, + { + "epoch": 2.764981655116184, + "grad_norm": 0.2566791772842407, + "learning_rate": 6.545471531458079e-05, + "loss": 3.8167, + "step": 40695 + }, + { + "epoch": 2.765321375186846, + "grad_norm": 0.1849353015422821, + "learning_rate": 6.545046881369752e-05, + "loss": 3.7959, + "step": 40700 + }, + { + "epoch": 2.765661095257508, + "grad_norm": 0.16301564872264862, + "learning_rate": 6.544622231281425e-05, + "loss": 3.9234, + "step": 40705 + }, + { + "epoch": 2.7660008153281694, + "grad_norm": 0.17490994930267334, + "learning_rate": 6.544197581193097e-05, + "loss": 3.7892, + "step": 40710 + }, + { + "epoch": 2.7663405353988315, + "grad_norm": 0.24019478261470795, + "learning_rate": 6.54377293110477e-05, + "loss": 3.7502, + "step": 40715 + }, + { + "epoch": 2.766680255469493, + "grad_norm": 0.2988351881504059, + "learning_rate": 6.543348281016443e-05, + "loss": 3.8241, + "step": 40720 + }, + { + "epoch": 2.7670199755401548, + "grad_norm": 0.1780611276626587, + "learning_rate": 6.542923630928114e-05, + "loss": 3.796, + "step": 40725 + }, + { + "epoch": 2.767359695610817, + "grad_norm": 0.17531028389930725, + "learning_rate": 6.542498980839789e-05, + "loss": 3.8742, + "step": 40730 + }, + { + "epoch": 2.7676994156814785, + "grad_norm": 0.17385904490947723, + "learning_rate": 6.542074330751461e-05, + "loss": 3.8863, + "step": 40735 + }, + { + "epoch": 2.76803913575214, + "grad_norm": 0.18227802217006683, + "learning_rate": 6.541649680663134e-05, + "loss": 3.8406, + "step": 40740 + }, + { + "epoch": 2.768378855822802, + "grad_norm": 0.15609627962112427, + "learning_rate": 6.541225030574807e-05, + "loss": 3.8898, + "step": 40745 + }, + { + "epoch": 2.768718575893464, + "grad_norm": 0.17390155792236328, + "learning_rate": 6.54080038048648e-05, + "loss": 4.0274, + "step": 40750 + }, + { + "epoch": 2.7690582959641254, + "grad_norm": 0.4534071981906891, + "learning_rate": 6.540375730398153e-05, + "loss": 3.8479, + "step": 40755 + }, + { + "epoch": 2.7693980160347875, + "grad_norm": 0.15489593148231506, + "learning_rate": 6.539951080309825e-05, + "loss": 3.8787, + "step": 40760 + }, + { + "epoch": 2.769737736105449, + "grad_norm": 0.40056535601615906, + "learning_rate": 6.539526430221498e-05, + "loss": 3.9708, + "step": 40765 + }, + { + "epoch": 2.770077456176111, + "grad_norm": 0.48168665170669556, + "learning_rate": 6.539101780133171e-05, + "loss": 3.9334, + "step": 40770 + }, + { + "epoch": 2.770417176246773, + "grad_norm": 0.6222829222679138, + "learning_rate": 6.538677130044844e-05, + "loss": 4.0796, + "step": 40775 + }, + { + "epoch": 2.7707568963174345, + "grad_norm": 0.1953052133321762, + "learning_rate": 6.538252479956517e-05, + "loss": 3.9812, + "step": 40780 + }, + { + "epoch": 2.771096616388096, + "grad_norm": 0.20719599723815918, + "learning_rate": 6.537827829868189e-05, + "loss": 3.7529, + "step": 40785 + }, + { + "epoch": 2.771436336458758, + "grad_norm": 0.18487991392612457, + "learning_rate": 6.537403179779862e-05, + "loss": 3.9684, + "step": 40790 + }, + { + "epoch": 2.77177605652942, + "grad_norm": 0.20364443957805634, + "learning_rate": 6.536978529691534e-05, + "loss": 3.9681, + "step": 40795 + }, + { + "epoch": 2.7721157766000815, + "grad_norm": 0.15516269207000732, + "learning_rate": 6.536553879603208e-05, + "loss": 3.8158, + "step": 40800 + }, + { + "epoch": 2.7724554966707435, + "grad_norm": 0.189484640955925, + "learning_rate": 6.53612922951488e-05, + "loss": 3.8218, + "step": 40805 + }, + { + "epoch": 2.772795216741405, + "grad_norm": 0.17149944603443146, + "learning_rate": 6.535704579426552e-05, + "loss": 3.7559, + "step": 40810 + }, + { + "epoch": 2.773134936812067, + "grad_norm": 0.18185140192508698, + "learning_rate": 6.535279929338226e-05, + "loss": 3.9403, + "step": 40815 + }, + { + "epoch": 2.773474656882729, + "grad_norm": 0.8734667897224426, + "learning_rate": 6.534855279249899e-05, + "loss": 3.9484, + "step": 40820 + }, + { + "epoch": 2.7738143769533905, + "grad_norm": 0.19922973215579987, + "learning_rate": 6.53443062916157e-05, + "loss": 3.8057, + "step": 40825 + }, + { + "epoch": 2.774154097024052, + "grad_norm": 0.28164979815483093, + "learning_rate": 6.534005979073245e-05, + "loss": 3.7921, + "step": 40830 + }, + { + "epoch": 2.774493817094714, + "grad_norm": 0.15335780382156372, + "learning_rate": 6.533581328984917e-05, + "loss": 3.9275, + "step": 40835 + }, + { + "epoch": 2.774833537165376, + "grad_norm": 0.13211368024349213, + "learning_rate": 6.533156678896589e-05, + "loss": 3.9888, + "step": 40840 + }, + { + "epoch": 2.7751732572360375, + "grad_norm": 15.844518661499023, + "learning_rate": 6.532732028808263e-05, + "loss": 4.0903, + "step": 40845 + }, + { + "epoch": 2.7755129773066995, + "grad_norm": 0.20744608342647552, + "learning_rate": 6.532307378719936e-05, + "loss": 3.7659, + "step": 40850 + }, + { + "epoch": 2.775852697377361, + "grad_norm": 1.0868453979492188, + "learning_rate": 6.531882728631607e-05, + "loss": 3.8305, + "step": 40855 + }, + { + "epoch": 2.776192417448023, + "grad_norm": 0.18724147975444794, + "learning_rate": 6.531458078543281e-05, + "loss": 3.9264, + "step": 40860 + }, + { + "epoch": 2.7765321375186844, + "grad_norm": 0.1826217919588089, + "learning_rate": 6.531033428454954e-05, + "loss": 4.0771, + "step": 40865 + }, + { + "epoch": 2.7768718575893465, + "grad_norm": 0.16010646522045135, + "learning_rate": 6.530608778366626e-05, + "loss": 3.9139, + "step": 40870 + }, + { + "epoch": 2.777211577660008, + "grad_norm": 1.7174044847488403, + "learning_rate": 6.5301841282783e-05, + "loss": 3.8596, + "step": 40875 + }, + { + "epoch": 2.7775512977306698, + "grad_norm": 0.19700784981250763, + "learning_rate": 6.529759478189971e-05, + "loss": 3.885, + "step": 40880 + }, + { + "epoch": 2.777891017801332, + "grad_norm": 0.24785584211349487, + "learning_rate": 6.529334828101644e-05, + "loss": 3.7237, + "step": 40885 + }, + { + "epoch": 2.7782307378719935, + "grad_norm": 0.22405485808849335, + "learning_rate": 6.528910178013318e-05, + "loss": 4.1671, + "step": 40890 + }, + { + "epoch": 2.778570457942655, + "grad_norm": 0.15462757647037506, + "learning_rate": 6.52848552792499e-05, + "loss": 4.0742, + "step": 40895 + }, + { + "epoch": 2.778910178013317, + "grad_norm": 0.2006295770406723, + "learning_rate": 6.528060877836662e-05, + "loss": 3.5563, + "step": 40900 + }, + { + "epoch": 2.779249898083979, + "grad_norm": 0.21120570600032806, + "learning_rate": 6.527636227748337e-05, + "loss": 3.987, + "step": 40905 + }, + { + "epoch": 2.7795896181546405, + "grad_norm": 0.14334307610988617, + "learning_rate": 6.527211577660008e-05, + "loss": 3.7961, + "step": 40910 + }, + { + "epoch": 2.779929338225302, + "grad_norm": 0.19448339939117432, + "learning_rate": 6.526786927571681e-05, + "loss": 4.1253, + "step": 40915 + }, + { + "epoch": 2.780269058295964, + "grad_norm": 0.2445646971464157, + "learning_rate": 6.526362277483355e-05, + "loss": 3.6482, + "step": 40920 + }, + { + "epoch": 2.780608778366626, + "grad_norm": 0.15304355323314667, + "learning_rate": 6.525937627395026e-05, + "loss": 3.9824, + "step": 40925 + }, + { + "epoch": 2.7809484984372874, + "grad_norm": 0.16407133638858795, + "learning_rate": 6.525512977306699e-05, + "loss": 3.8816, + "step": 40930 + }, + { + "epoch": 2.7812882185079495, + "grad_norm": 0.14814817905426025, + "learning_rate": 6.525088327218373e-05, + "loss": 3.9156, + "step": 40935 + }, + { + "epoch": 2.781627938578611, + "grad_norm": 0.14403614401817322, + "learning_rate": 6.524663677130045e-05, + "loss": 3.8874, + "step": 40940 + }, + { + "epoch": 2.7819676586492728, + "grad_norm": 0.15912380814552307, + "learning_rate": 6.524239027041718e-05, + "loss": 3.7142, + "step": 40945 + }, + { + "epoch": 2.782307378719935, + "grad_norm": 0.22021538019180298, + "learning_rate": 6.52381437695339e-05, + "loss": 3.9825, + "step": 40950 + }, + { + "epoch": 2.7826470987905965, + "grad_norm": 0.14810875058174133, + "learning_rate": 6.523389726865063e-05, + "loss": 3.9609, + "step": 40955 + }, + { + "epoch": 2.782986818861258, + "grad_norm": 0.5330456495285034, + "learning_rate": 6.522965076776736e-05, + "loss": 3.848, + "step": 40960 + }, + { + "epoch": 2.78332653893192, + "grad_norm": 0.19385021924972534, + "learning_rate": 6.522540426688409e-05, + "loss": 3.893, + "step": 40965 + }, + { + "epoch": 2.783666259002582, + "grad_norm": 0.12946945428848267, + "learning_rate": 6.522115776600082e-05, + "loss": 3.8084, + "step": 40970 + }, + { + "epoch": 2.7840059790732434, + "grad_norm": 0.18485036492347717, + "learning_rate": 6.521691126511754e-05, + "loss": 3.9232, + "step": 40975 + }, + { + "epoch": 2.7843456991439055, + "grad_norm": 0.25200945138931274, + "learning_rate": 6.521266476423427e-05, + "loss": 3.867, + "step": 40980 + }, + { + "epoch": 2.784685419214567, + "grad_norm": 0.1531386524438858, + "learning_rate": 6.5208418263351e-05, + "loss": 3.8966, + "step": 40985 + }, + { + "epoch": 2.7850251392852288, + "grad_norm": 0.1922338455915451, + "learning_rate": 6.520417176246773e-05, + "loss": 3.8965, + "step": 40990 + }, + { + "epoch": 2.785364859355891, + "grad_norm": 3.7345402240753174, + "learning_rate": 6.519992526158446e-05, + "loss": 4.0669, + "step": 40995 + }, + { + "epoch": 2.7857045794265525, + "grad_norm": 0.21678699553012848, + "learning_rate": 6.519567876070118e-05, + "loss": 3.5907, + "step": 41000 + }, + { + "epoch": 2.786044299497214, + "grad_norm": 0.1547134667634964, + "learning_rate": 6.519143225981791e-05, + "loss": 4.13, + "step": 41005 + }, + { + "epoch": 2.786384019567876, + "grad_norm": 0.6922367215156555, + "learning_rate": 6.518718575893464e-05, + "loss": 3.9959, + "step": 41010 + }, + { + "epoch": 2.786723739638538, + "grad_norm": 0.16160574555397034, + "learning_rate": 6.518293925805137e-05, + "loss": 3.985, + "step": 41015 + }, + { + "epoch": 2.7870634597091994, + "grad_norm": 0.1784139722585678, + "learning_rate": 6.51786927571681e-05, + "loss": 3.7978, + "step": 41020 + }, + { + "epoch": 2.7874031797798615, + "grad_norm": 0.18071062862873077, + "learning_rate": 6.517444625628482e-05, + "loss": 3.9709, + "step": 41025 + }, + { + "epoch": 2.787742899850523, + "grad_norm": 0.24022626876831055, + "learning_rate": 6.517019975540155e-05, + "loss": 3.6027, + "step": 41030 + }, + { + "epoch": 2.788082619921185, + "grad_norm": 0.16732239723205566, + "learning_rate": 6.516595325451828e-05, + "loss": 4.0605, + "step": 41035 + }, + { + "epoch": 2.788422339991847, + "grad_norm": 0.1918901950120926, + "learning_rate": 6.516170675363501e-05, + "loss": 3.8375, + "step": 41040 + }, + { + "epoch": 2.7887620600625085, + "grad_norm": 0.15840958058834076, + "learning_rate": 6.515746025275174e-05, + "loss": 3.4777, + "step": 41045 + }, + { + "epoch": 2.78910178013317, + "grad_norm": 0.1924312859773636, + "learning_rate": 6.515321375186846e-05, + "loss": 3.5551, + "step": 41050 + }, + { + "epoch": 2.789441500203832, + "grad_norm": 0.13963235914707184, + "learning_rate": 6.514896725098519e-05, + "loss": 3.9492, + "step": 41055 + }, + { + "epoch": 2.789781220274494, + "grad_norm": 0.14598742127418518, + "learning_rate": 6.514472075010192e-05, + "loss": 4.0527, + "step": 41060 + }, + { + "epoch": 2.7901209403451555, + "grad_norm": 0.1717139333486557, + "learning_rate": 6.514047424921865e-05, + "loss": 4.0187, + "step": 41065 + }, + { + "epoch": 2.7904606604158175, + "grad_norm": 0.14419305324554443, + "learning_rate": 6.513622774833538e-05, + "loss": 3.7046, + "step": 41070 + }, + { + "epoch": 2.790800380486479, + "grad_norm": 0.2421928197145462, + "learning_rate": 6.51319812474521e-05, + "loss": 3.7576, + "step": 41075 + }, + { + "epoch": 2.791140100557141, + "grad_norm": 0.18298254907131195, + "learning_rate": 6.512773474656883e-05, + "loss": 3.8814, + "step": 41080 + }, + { + "epoch": 2.791479820627803, + "grad_norm": 0.746887743473053, + "learning_rate": 6.512348824568556e-05, + "loss": 3.8394, + "step": 41085 + }, + { + "epoch": 2.7918195406984645, + "grad_norm": 0.7226248383522034, + "learning_rate": 6.511924174480229e-05, + "loss": 3.9921, + "step": 41090 + }, + { + "epoch": 2.792159260769126, + "grad_norm": 0.19559799134731293, + "learning_rate": 6.511499524391902e-05, + "loss": 3.923, + "step": 41095 + }, + { + "epoch": 2.792498980839788, + "grad_norm": 1.128818154335022, + "learning_rate": 6.511074874303574e-05, + "loss": 3.8201, + "step": 41100 + }, + { + "epoch": 2.79283870091045, + "grad_norm": 0.1605989933013916, + "learning_rate": 6.510650224215247e-05, + "loss": 3.8919, + "step": 41105 + }, + { + "epoch": 2.7931784209811115, + "grad_norm": 0.15949590504169464, + "learning_rate": 6.51022557412692e-05, + "loss": 3.9362, + "step": 41110 + }, + { + "epoch": 2.7935181410517735, + "grad_norm": 0.15392351150512695, + "learning_rate": 6.509800924038593e-05, + "loss": 3.8842, + "step": 41115 + }, + { + "epoch": 2.793857861122435, + "grad_norm": 0.16439351439476013, + "learning_rate": 6.509376273950266e-05, + "loss": 3.8287, + "step": 41120 + }, + { + "epoch": 2.794197581193097, + "grad_norm": 0.15173426270484924, + "learning_rate": 6.508951623861938e-05, + "loss": 3.8672, + "step": 41125 + }, + { + "epoch": 2.794537301263759, + "grad_norm": 0.16619549691677094, + "learning_rate": 6.508526973773611e-05, + "loss": 3.3878, + "step": 41130 + }, + { + "epoch": 2.7948770213344205, + "grad_norm": 0.1406264752149582, + "learning_rate": 6.508102323685284e-05, + "loss": 3.7017, + "step": 41135 + }, + { + "epoch": 2.795216741405082, + "grad_norm": 0.18603791296482086, + "learning_rate": 6.507677673596957e-05, + "loss": 3.8158, + "step": 41140 + }, + { + "epoch": 2.795556461475744, + "grad_norm": 0.349643737077713, + "learning_rate": 6.50725302350863e-05, + "loss": 3.6355, + "step": 41145 + }, + { + "epoch": 2.795896181546406, + "grad_norm": 0.1721067875623703, + "learning_rate": 6.506828373420301e-05, + "loss": 3.8922, + "step": 41150 + }, + { + "epoch": 2.7962359016170675, + "grad_norm": 0.29107335209846497, + "learning_rate": 6.506403723331975e-05, + "loss": 3.7053, + "step": 41155 + }, + { + "epoch": 2.7965756216877296, + "grad_norm": 0.13501383364200592, + "learning_rate": 6.505979073243648e-05, + "loss": 3.7116, + "step": 41160 + }, + { + "epoch": 2.796915341758391, + "grad_norm": 0.1845615804195404, + "learning_rate": 6.50555442315532e-05, + "loss": 3.8741, + "step": 41165 + }, + { + "epoch": 2.797255061829053, + "grad_norm": 0.15640591084957123, + "learning_rate": 6.505129773066994e-05, + "loss": 3.8217, + "step": 41170 + }, + { + "epoch": 2.797594781899715, + "grad_norm": 0.15319417417049408, + "learning_rate": 6.504705122978666e-05, + "loss": 3.6994, + "step": 41175 + }, + { + "epoch": 2.7979345019703765, + "grad_norm": 2.5420334339141846, + "learning_rate": 6.504280472890338e-05, + "loss": 4.04, + "step": 41180 + }, + { + "epoch": 2.798274222041038, + "grad_norm": 0.18297173082828522, + "learning_rate": 6.503855822802012e-05, + "loss": 3.6977, + "step": 41185 + }, + { + "epoch": 2.7986139421117002, + "grad_norm": 0.14367422461509705, + "learning_rate": 6.503431172713685e-05, + "loss": 3.8995, + "step": 41190 + }, + { + "epoch": 2.798953662182362, + "grad_norm": 0.23620112240314484, + "learning_rate": 6.503006522625356e-05, + "loss": 4.1066, + "step": 41195 + }, + { + "epoch": 2.7992933822530235, + "grad_norm": 0.49447867274284363, + "learning_rate": 6.50258187253703e-05, + "loss": 3.8112, + "step": 41200 + }, + { + "epoch": 2.799633102323685, + "grad_norm": 0.41531985998153687, + "learning_rate": 6.502157222448703e-05, + "loss": 3.8624, + "step": 41205 + }, + { + "epoch": 2.799972822394347, + "grad_norm": 0.15813003480434418, + "learning_rate": 6.501732572360375e-05, + "loss": 3.9642, + "step": 41210 + }, + { + "epoch": 2.800312542465009, + "grad_norm": 0.1622614711523056, + "learning_rate": 6.501307922272049e-05, + "loss": 3.6271, + "step": 41215 + }, + { + "epoch": 2.8006522625356705, + "grad_norm": 0.20669063925743103, + "learning_rate": 6.50088327218372e-05, + "loss": 3.757, + "step": 41220 + }, + { + "epoch": 2.8009919826063325, + "grad_norm": 0.16186979413032532, + "learning_rate": 6.500458622095393e-05, + "loss": 3.7677, + "step": 41225 + }, + { + "epoch": 2.801331702676994, + "grad_norm": 3.8010668754577637, + "learning_rate": 6.500033972007067e-05, + "loss": 3.994, + "step": 41230 + }, + { + "epoch": 2.801671422747656, + "grad_norm": 0.20890618860721588, + "learning_rate": 6.499609321918739e-05, + "loss": 3.8109, + "step": 41235 + }, + { + "epoch": 2.802011142818318, + "grad_norm": 0.1956794559955597, + "learning_rate": 6.499184671830411e-05, + "loss": 4.0174, + "step": 41240 + }, + { + "epoch": 2.8023508628889795, + "grad_norm": 0.17511625587940216, + "learning_rate": 6.498760021742086e-05, + "loss": 3.7991, + "step": 41245 + }, + { + "epoch": 2.802690582959641, + "grad_norm": 1.0300540924072266, + "learning_rate": 6.498335371653757e-05, + "loss": 3.6908, + "step": 41250 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.16171813011169434, + "learning_rate": 6.49791072156543e-05, + "loss": 3.8953, + "step": 41255 + }, + { + "epoch": 2.803370023100965, + "grad_norm": 0.2153616100549698, + "learning_rate": 6.497486071477104e-05, + "loss": 4.1771, + "step": 41260 + }, + { + "epoch": 2.8037097431716265, + "grad_norm": 0.15998482704162598, + "learning_rate": 6.497061421388775e-05, + "loss": 3.7686, + "step": 41265 + }, + { + "epoch": 2.804049463242288, + "grad_norm": 0.16211186349391937, + "learning_rate": 6.496636771300448e-05, + "loss": 3.7561, + "step": 41270 + }, + { + "epoch": 2.80438918331295, + "grad_norm": 0.17042408883571625, + "learning_rate": 6.496212121212122e-05, + "loss": 3.7859, + "step": 41275 + }, + { + "epoch": 2.804728903383612, + "grad_norm": 0.2037298083305359, + "learning_rate": 6.495787471123794e-05, + "loss": 3.8346, + "step": 41280 + }, + { + "epoch": 2.8050686234542734, + "grad_norm": 0.1659078598022461, + "learning_rate": 6.495362821035467e-05, + "loss": 3.7392, + "step": 41285 + }, + { + "epoch": 2.8054083435249355, + "grad_norm": 0.1965019851922989, + "learning_rate": 6.494938170947141e-05, + "loss": 3.8944, + "step": 41290 + }, + { + "epoch": 2.805748063595597, + "grad_norm": 0.24360430240631104, + "learning_rate": 6.494513520858812e-05, + "loss": 3.6558, + "step": 41295 + }, + { + "epoch": 2.8060877836662588, + "grad_norm": 0.15412777662277222, + "learning_rate": 6.494088870770485e-05, + "loss": 3.8314, + "step": 41300 + }, + { + "epoch": 2.806427503736921, + "grad_norm": 0.1599520742893219, + "learning_rate": 6.493664220682158e-05, + "loss": 3.926, + "step": 41305 + }, + { + "epoch": 2.8067672238075825, + "grad_norm": 0.552309513092041, + "learning_rate": 6.493239570593831e-05, + "loss": 3.8085, + "step": 41310 + }, + { + "epoch": 2.807106943878244, + "grad_norm": 0.1810910552740097, + "learning_rate": 6.492814920505503e-05, + "loss": 3.8541, + "step": 41315 + }, + { + "epoch": 2.807446663948906, + "grad_norm": 0.20247700810432434, + "learning_rate": 6.492390270417176e-05, + "loss": 3.7718, + "step": 41320 + }, + { + "epoch": 2.807786384019568, + "grad_norm": 0.1534716933965683, + "learning_rate": 6.491965620328849e-05, + "loss": 3.9538, + "step": 41325 + }, + { + "epoch": 2.8081261040902294, + "grad_norm": 0.19690610468387604, + "learning_rate": 6.491540970240522e-05, + "loss": 3.822, + "step": 41330 + }, + { + "epoch": 2.8084658241608915, + "grad_norm": 0.1752772182226181, + "learning_rate": 6.491116320152195e-05, + "loss": 3.7997, + "step": 41335 + }, + { + "epoch": 2.808805544231553, + "grad_norm": 0.1663365215063095, + "learning_rate": 6.490691670063867e-05, + "loss": 3.7501, + "step": 41340 + }, + { + "epoch": 2.809145264302215, + "grad_norm": 0.1593974530696869, + "learning_rate": 6.49026701997554e-05, + "loss": 3.93, + "step": 41345 + }, + { + "epoch": 2.809484984372877, + "grad_norm": 0.18721196055412292, + "learning_rate": 6.489842369887213e-05, + "loss": 3.6859, + "step": 41350 + }, + { + "epoch": 2.8098247044435385, + "grad_norm": 0.5420804619789124, + "learning_rate": 6.489417719798886e-05, + "loss": 3.9435, + "step": 41355 + }, + { + "epoch": 2.8101644245142, + "grad_norm": 0.1839836686849594, + "learning_rate": 6.488993069710559e-05, + "loss": 3.773, + "step": 41360 + }, + { + "epoch": 2.810504144584862, + "grad_norm": 0.20135970413684845, + "learning_rate": 6.488568419622231e-05, + "loss": 3.7704, + "step": 41365 + }, + { + "epoch": 2.810843864655524, + "grad_norm": 0.18822507560253143, + "learning_rate": 6.488143769533904e-05, + "loss": 3.8609, + "step": 41370 + }, + { + "epoch": 2.8111835847261855, + "grad_norm": 0.17303383350372314, + "learning_rate": 6.487719119445577e-05, + "loss": 3.9032, + "step": 41375 + }, + { + "epoch": 2.8115233047968475, + "grad_norm": 0.2072618007659912, + "learning_rate": 6.48729446935725e-05, + "loss": 3.9224, + "step": 41380 + }, + { + "epoch": 2.811863024867509, + "grad_norm": 0.17974853515625, + "learning_rate": 6.486869819268923e-05, + "loss": 3.8552, + "step": 41385 + }, + { + "epoch": 2.812202744938171, + "grad_norm": 0.1534213274717331, + "learning_rate": 6.486445169180596e-05, + "loss": 3.9951, + "step": 41390 + }, + { + "epoch": 2.812542465008833, + "grad_norm": 0.5085853934288025, + "learning_rate": 6.486020519092268e-05, + "loss": 3.9674, + "step": 41395 + }, + { + "epoch": 2.8128821850794945, + "grad_norm": 0.16392463445663452, + "learning_rate": 6.485595869003941e-05, + "loss": 3.9521, + "step": 41400 + }, + { + "epoch": 2.813221905150156, + "grad_norm": 0.171642005443573, + "learning_rate": 6.485171218915614e-05, + "loss": 3.9398, + "step": 41405 + }, + { + "epoch": 2.813561625220818, + "grad_norm": 2.704983711242676, + "learning_rate": 6.484746568827287e-05, + "loss": 3.9119, + "step": 41410 + }, + { + "epoch": 2.81390134529148, + "grad_norm": 0.20032308995723724, + "learning_rate": 6.48432191873896e-05, + "loss": 3.888, + "step": 41415 + }, + { + "epoch": 2.8142410653621415, + "grad_norm": 0.22273291647434235, + "learning_rate": 6.483897268650632e-05, + "loss": 4.188, + "step": 41420 + }, + { + "epoch": 2.8145807854328035, + "grad_norm": 0.18513278663158417, + "learning_rate": 6.483472618562305e-05, + "loss": 3.8375, + "step": 41425 + }, + { + "epoch": 2.814920505503465, + "grad_norm": 0.22609840333461761, + "learning_rate": 6.483047968473978e-05, + "loss": 3.8236, + "step": 41430 + }, + { + "epoch": 2.815260225574127, + "grad_norm": 0.6248467564582825, + "learning_rate": 6.482623318385651e-05, + "loss": 3.6887, + "step": 41435 + }, + { + "epoch": 2.815599945644789, + "grad_norm": 0.1971576064825058, + "learning_rate": 6.482198668297324e-05, + "loss": 3.7866, + "step": 41440 + }, + { + "epoch": 2.8159396657154505, + "grad_norm": 0.17702823877334595, + "learning_rate": 6.481774018208996e-05, + "loss": 3.8909, + "step": 41445 + }, + { + "epoch": 2.816279385786112, + "grad_norm": 0.1586950272321701, + "learning_rate": 6.481349368120669e-05, + "loss": 4.0461, + "step": 41450 + }, + { + "epoch": 2.816619105856774, + "grad_norm": 0.18079429864883423, + "learning_rate": 6.480924718032342e-05, + "loss": 3.7912, + "step": 41455 + }, + { + "epoch": 2.816958825927436, + "grad_norm": 0.29580822587013245, + "learning_rate": 6.480500067944015e-05, + "loss": 4.1335, + "step": 41460 + }, + { + "epoch": 2.8172985459980975, + "grad_norm": 0.17357346415519714, + "learning_rate": 6.480075417855688e-05, + "loss": 3.7029, + "step": 41465 + }, + { + "epoch": 2.8176382660687596, + "grad_norm": 0.1547984629869461, + "learning_rate": 6.47965076776736e-05, + "loss": 3.9138, + "step": 41470 + }, + { + "epoch": 2.817977986139421, + "grad_norm": 0.16758109629154205, + "learning_rate": 6.479226117679033e-05, + "loss": 4.0362, + "step": 41475 + }, + { + "epoch": 2.818317706210083, + "grad_norm": 0.19988007843494415, + "learning_rate": 6.478801467590706e-05, + "loss": 3.958, + "step": 41480 + }, + { + "epoch": 2.818657426280745, + "grad_norm": 0.16817273199558258, + "learning_rate": 6.478376817502379e-05, + "loss": 4.0078, + "step": 41485 + }, + { + "epoch": 2.8189971463514065, + "grad_norm": 0.1641170084476471, + "learning_rate": 6.477952167414052e-05, + "loss": 3.8716, + "step": 41490 + }, + { + "epoch": 2.819336866422068, + "grad_norm": 0.6144582629203796, + "learning_rate": 6.477527517325724e-05, + "loss": 4.0337, + "step": 41495 + }, + { + "epoch": 2.8196765864927302, + "grad_norm": 0.18522556126117706, + "learning_rate": 6.477102867237397e-05, + "loss": 3.69, + "step": 41500 + }, + { + "epoch": 2.820016306563392, + "grad_norm": 0.19464810192584991, + "learning_rate": 6.476678217149069e-05, + "loss": 3.9693, + "step": 41505 + }, + { + "epoch": 2.8203560266340535, + "grad_norm": 0.20989884436130524, + "learning_rate": 6.476253567060743e-05, + "loss": 3.915, + "step": 41510 + }, + { + "epoch": 2.8206957467047156, + "grad_norm": 0.19784732162952423, + "learning_rate": 6.475828916972416e-05, + "loss": 3.8199, + "step": 41515 + }, + { + "epoch": 2.821035466775377, + "grad_norm": 0.2373245805501938, + "learning_rate": 6.475404266884087e-05, + "loss": 4.1357, + "step": 41520 + }, + { + "epoch": 2.821375186846039, + "grad_norm": 0.14210820198059082, + "learning_rate": 6.474979616795761e-05, + "loss": 3.9537, + "step": 41525 + }, + { + "epoch": 2.821714906916701, + "grad_norm": 0.14911404252052307, + "learning_rate": 6.474554966707434e-05, + "loss": 3.9034, + "step": 41530 + }, + { + "epoch": 2.8220546269873625, + "grad_norm": 0.14990229904651642, + "learning_rate": 6.474130316619105e-05, + "loss": 4.0344, + "step": 41535 + }, + { + "epoch": 2.822394347058024, + "grad_norm": 0.18152545392513275, + "learning_rate": 6.47370566653078e-05, + "loss": 3.8561, + "step": 41540 + }, + { + "epoch": 2.822734067128686, + "grad_norm": 0.15199759602546692, + "learning_rate": 6.473281016442452e-05, + "loss": 3.6584, + "step": 41545 + }, + { + "epoch": 2.823073787199348, + "grad_norm": 0.14673444628715515, + "learning_rate": 6.472856366354124e-05, + "loss": 3.8908, + "step": 41550 + }, + { + "epoch": 2.8234135072700095, + "grad_norm": 0.23379407823085785, + "learning_rate": 6.472431716265798e-05, + "loss": 3.9446, + "step": 41555 + }, + { + "epoch": 2.823753227340671, + "grad_norm": 0.17057104408740997, + "learning_rate": 6.472007066177471e-05, + "loss": 3.8995, + "step": 41560 + }, + { + "epoch": 2.824092947411333, + "grad_norm": 0.23348014056682587, + "learning_rate": 6.471582416089142e-05, + "loss": 4.1712, + "step": 41565 + }, + { + "epoch": 2.824432667481995, + "grad_norm": 0.17367498576641083, + "learning_rate": 6.471157766000816e-05, + "loss": 3.9233, + "step": 41570 + }, + { + "epoch": 2.8247723875526565, + "grad_norm": 0.20234039425849915, + "learning_rate": 6.470733115912488e-05, + "loss": 3.8701, + "step": 41575 + }, + { + "epoch": 2.8251121076233185, + "grad_norm": 0.1373794972896576, + "learning_rate": 6.47030846582416e-05, + "loss": 3.8006, + "step": 41580 + }, + { + "epoch": 2.82545182769398, + "grad_norm": 0.18909406661987305, + "learning_rate": 6.469883815735835e-05, + "loss": 3.9656, + "step": 41585 + }, + { + "epoch": 2.825791547764642, + "grad_norm": 0.2158580869436264, + "learning_rate": 6.469459165647506e-05, + "loss": 3.9047, + "step": 41590 + }, + { + "epoch": 2.8261312678353034, + "grad_norm": 0.17780296504497528, + "learning_rate": 6.469034515559179e-05, + "loss": 3.7597, + "step": 41595 + }, + { + "epoch": 2.8264709879059655, + "grad_norm": 0.1791076958179474, + "learning_rate": 6.468609865470853e-05, + "loss": 4.029, + "step": 41600 + }, + { + "epoch": 2.826810707976627, + "grad_norm": 0.19211722910404205, + "learning_rate": 6.468185215382525e-05, + "loss": 3.9342, + "step": 41605 + }, + { + "epoch": 2.8271504280472888, + "grad_norm": 0.14656294882297516, + "learning_rate": 6.467760565294197e-05, + "loss": 3.8511, + "step": 41610 + }, + { + "epoch": 2.827490148117951, + "grad_norm": 0.16629575192928314, + "learning_rate": 6.467335915205872e-05, + "loss": 3.9632, + "step": 41615 + }, + { + "epoch": 2.8278298681886125, + "grad_norm": 0.22225984930992126, + "learning_rate": 6.466911265117543e-05, + "loss": 3.8049, + "step": 41620 + }, + { + "epoch": 2.828169588259274, + "grad_norm": 0.18341514468193054, + "learning_rate": 6.466486615029216e-05, + "loss": 3.8876, + "step": 41625 + }, + { + "epoch": 2.828509308329936, + "grad_norm": 0.1515032798051834, + "learning_rate": 6.46606196494089e-05, + "loss": 4.0422, + "step": 41630 + }, + { + "epoch": 2.828849028400598, + "grad_norm": 0.14735938608646393, + "learning_rate": 6.465637314852561e-05, + "loss": 3.8096, + "step": 41635 + }, + { + "epoch": 2.8291887484712595, + "grad_norm": 0.15819674730300903, + "learning_rate": 6.465212664764234e-05, + "loss": 3.8052, + "step": 41640 + }, + { + "epoch": 2.8295284685419215, + "grad_norm": 0.1965230405330658, + "learning_rate": 6.464788014675908e-05, + "loss": 4.0262, + "step": 41645 + }, + { + "epoch": 2.829868188612583, + "grad_norm": 0.23465140163898468, + "learning_rate": 6.46436336458758e-05, + "loss": 4.2026, + "step": 41650 + }, + { + "epoch": 2.830207908683245, + "grad_norm": 0.24363066256046295, + "learning_rate": 6.463938714499253e-05, + "loss": 3.8457, + "step": 41655 + }, + { + "epoch": 2.830547628753907, + "grad_norm": 0.3880245089530945, + "learning_rate": 6.463514064410925e-05, + "loss": 3.7877, + "step": 41660 + }, + { + "epoch": 2.8308873488245685, + "grad_norm": 0.1979319155216217, + "learning_rate": 6.463089414322598e-05, + "loss": 3.5927, + "step": 41665 + }, + { + "epoch": 2.83122706889523, + "grad_norm": 0.1822020411491394, + "learning_rate": 6.462664764234271e-05, + "loss": 3.9392, + "step": 41670 + }, + { + "epoch": 2.831566788965892, + "grad_norm": 0.20386859774589539, + "learning_rate": 6.462240114145944e-05, + "loss": 3.781, + "step": 41675 + }, + { + "epoch": 2.831906509036554, + "grad_norm": 0.16421228647232056, + "learning_rate": 6.461815464057617e-05, + "loss": 4.0333, + "step": 41680 + }, + { + "epoch": 2.8322462291072155, + "grad_norm": 0.16811542212963104, + "learning_rate": 6.46139081396929e-05, + "loss": 3.8708, + "step": 41685 + }, + { + "epoch": 2.8325859491778775, + "grad_norm": 0.15304513275623322, + "learning_rate": 6.460966163880962e-05, + "loss": 3.9881, + "step": 41690 + }, + { + "epoch": 2.832925669248539, + "grad_norm": 0.16346578299999237, + "learning_rate": 6.460541513792635e-05, + "loss": 3.981, + "step": 41695 + }, + { + "epoch": 2.833265389319201, + "grad_norm": 0.18916243314743042, + "learning_rate": 6.460116863704308e-05, + "loss": 3.8132, + "step": 41700 + }, + { + "epoch": 2.833605109389863, + "grad_norm": 0.1673441380262375, + "learning_rate": 6.45969221361598e-05, + "loss": 4.0049, + "step": 41705 + }, + { + "epoch": 2.8339448294605245, + "grad_norm": 0.23513874411582947, + "learning_rate": 6.459267563527653e-05, + "loss": 3.9599, + "step": 41710 + }, + { + "epoch": 2.834284549531186, + "grad_norm": 0.16638311743736267, + "learning_rate": 6.458842913439326e-05, + "loss": 3.8081, + "step": 41715 + }, + { + "epoch": 2.834624269601848, + "grad_norm": 0.17449292540550232, + "learning_rate": 6.458418263350999e-05, + "loss": 3.8388, + "step": 41720 + }, + { + "epoch": 2.83496398967251, + "grad_norm": 0.15125104784965515, + "learning_rate": 6.457993613262672e-05, + "loss": 3.9316, + "step": 41725 + }, + { + "epoch": 2.8353037097431715, + "grad_norm": 0.24074159562587738, + "learning_rate": 6.457568963174345e-05, + "loss": 4.1253, + "step": 41730 + }, + { + "epoch": 2.8356434298138335, + "grad_norm": 1.4013129472732544, + "learning_rate": 6.457144313086017e-05, + "loss": 3.6838, + "step": 41735 + }, + { + "epoch": 2.835983149884495, + "grad_norm": 3.0308330059051514, + "learning_rate": 6.45671966299769e-05, + "loss": 3.8747, + "step": 41740 + }, + { + "epoch": 2.836322869955157, + "grad_norm": 0.20832952857017517, + "learning_rate": 6.456295012909363e-05, + "loss": 4.2344, + "step": 41745 + }, + { + "epoch": 2.836662590025819, + "grad_norm": 1.0027023553848267, + "learning_rate": 6.455870362821036e-05, + "loss": 3.8669, + "step": 41750 + }, + { + "epoch": 2.8370023100964805, + "grad_norm": 0.16818536818027496, + "learning_rate": 6.455445712732709e-05, + "loss": 3.7574, + "step": 41755 + }, + { + "epoch": 2.837342030167142, + "grad_norm": 0.2488076090812683, + "learning_rate": 6.455021062644381e-05, + "loss": 4.0951, + "step": 41760 + }, + { + "epoch": 2.8376817502378042, + "grad_norm": 0.14138442277908325, + "learning_rate": 6.454596412556054e-05, + "loss": 3.784, + "step": 41765 + }, + { + "epoch": 2.838021470308466, + "grad_norm": 0.19765150547027588, + "learning_rate": 6.454171762467727e-05, + "loss": 4.1213, + "step": 41770 + }, + { + "epoch": 2.8383611903791275, + "grad_norm": 0.26301220059394836, + "learning_rate": 6.4537471123794e-05, + "loss": 3.9507, + "step": 41775 + }, + { + "epoch": 2.8387009104497896, + "grad_norm": 0.14241743087768555, + "learning_rate": 6.453322462291073e-05, + "loss": 3.9638, + "step": 41780 + }, + { + "epoch": 2.839040630520451, + "grad_norm": 0.14540265500545502, + "learning_rate": 6.452897812202745e-05, + "loss": 3.933, + "step": 41785 + }, + { + "epoch": 2.839380350591113, + "grad_norm": 0.265365332365036, + "learning_rate": 6.452473162114418e-05, + "loss": 3.8782, + "step": 41790 + }, + { + "epoch": 2.839720070661775, + "grad_norm": 0.1783810704946518, + "learning_rate": 6.452048512026091e-05, + "loss": 3.9336, + "step": 41795 + }, + { + "epoch": 2.8400597907324365, + "grad_norm": 0.17750555276870728, + "learning_rate": 6.451623861937764e-05, + "loss": 3.6393, + "step": 41800 + }, + { + "epoch": 2.840399510803098, + "grad_norm": 0.18140757083892822, + "learning_rate": 6.451199211849437e-05, + "loss": 4.0408, + "step": 41805 + }, + { + "epoch": 2.8407392308737602, + "grad_norm": 8.337628364562988, + "learning_rate": 6.45077456176111e-05, + "loss": 4.0642, + "step": 41810 + }, + { + "epoch": 2.841078950944422, + "grad_norm": 0.1826048493385315, + "learning_rate": 6.450349911672782e-05, + "loss": 4.0822, + "step": 41815 + }, + { + "epoch": 2.8414186710150835, + "grad_norm": 0.18497850000858307, + "learning_rate": 6.449925261584455e-05, + "loss": 3.7605, + "step": 41820 + }, + { + "epoch": 2.8417583910857456, + "grad_norm": 0.18052971363067627, + "learning_rate": 6.449500611496128e-05, + "loss": 3.9752, + "step": 41825 + }, + { + "epoch": 2.842098111156407, + "grad_norm": 0.17055514454841614, + "learning_rate": 6.4490759614078e-05, + "loss": 3.7121, + "step": 41830 + }, + { + "epoch": 2.842437831227069, + "grad_norm": 0.1576690822839737, + "learning_rate": 6.448651311319473e-05, + "loss": 3.7505, + "step": 41835 + }, + { + "epoch": 2.842777551297731, + "grad_norm": 0.1596190631389618, + "learning_rate": 6.448226661231146e-05, + "loss": 3.8928, + "step": 41840 + }, + { + "epoch": 2.8431172713683925, + "grad_norm": 0.22157254815101624, + "learning_rate": 6.447802011142819e-05, + "loss": 4.0872, + "step": 41845 + }, + { + "epoch": 2.843456991439054, + "grad_norm": 0.594726026058197, + "learning_rate": 6.447377361054492e-05, + "loss": 3.8086, + "step": 41850 + }, + { + "epoch": 2.8437967115097162, + "grad_norm": 0.19427527487277985, + "learning_rate": 6.446952710966165e-05, + "loss": 3.8739, + "step": 41855 + }, + { + "epoch": 2.844136431580378, + "grad_norm": 0.18121425807476044, + "learning_rate": 6.446528060877836e-05, + "loss": 3.523, + "step": 41860 + }, + { + "epoch": 2.8444761516510395, + "grad_norm": 0.17691777646541595, + "learning_rate": 6.44610341078951e-05, + "loss": 3.6667, + "step": 41865 + }, + { + "epoch": 2.8448158717217016, + "grad_norm": 0.14795510470867157, + "learning_rate": 6.445678760701183e-05, + "loss": 3.8062, + "step": 41870 + }, + { + "epoch": 2.845155591792363, + "grad_norm": 0.19895142316818237, + "learning_rate": 6.445254110612854e-05, + "loss": 3.8102, + "step": 41875 + }, + { + "epoch": 2.845495311863025, + "grad_norm": 0.18458786606788635, + "learning_rate": 6.444829460524529e-05, + "loss": 3.678, + "step": 41880 + }, + { + "epoch": 2.8458350319336865, + "grad_norm": 0.16723418235778809, + "learning_rate": 6.444404810436201e-05, + "loss": 3.7472, + "step": 41885 + }, + { + "epoch": 2.8461747520043486, + "grad_norm": 0.16482022404670715, + "learning_rate": 6.443980160347873e-05, + "loss": 3.9511, + "step": 41890 + }, + { + "epoch": 2.84651447207501, + "grad_norm": 0.19728286564350128, + "learning_rate": 6.443555510259547e-05, + "loss": 3.8606, + "step": 41895 + }, + { + "epoch": 2.846854192145672, + "grad_norm": 0.17994661629199982, + "learning_rate": 6.44313086017122e-05, + "loss": 3.8449, + "step": 41900 + }, + { + "epoch": 2.847193912216334, + "grad_norm": 0.15408973395824432, + "learning_rate": 6.442706210082891e-05, + "loss": 3.7296, + "step": 41905 + }, + { + "epoch": 2.8475336322869955, + "grad_norm": 0.16546086966991425, + "learning_rate": 6.442281559994565e-05, + "loss": 3.8788, + "step": 41910 + }, + { + "epoch": 2.847873352357657, + "grad_norm": 0.16529527306556702, + "learning_rate": 6.441856909906238e-05, + "loss": 4.0269, + "step": 41915 + }, + { + "epoch": 2.8482130724283192, + "grad_norm": 0.15758846700191498, + "learning_rate": 6.44143225981791e-05, + "loss": 4.1565, + "step": 41920 + }, + { + "epoch": 2.848552792498981, + "grad_norm": 0.1718040406703949, + "learning_rate": 6.441007609729584e-05, + "loss": 3.8584, + "step": 41925 + }, + { + "epoch": 2.8488925125696425, + "grad_norm": 0.5630946159362793, + "learning_rate": 6.440582959641255e-05, + "loss": 3.8185, + "step": 41930 + }, + { + "epoch": 2.849232232640304, + "grad_norm": 0.20286032557487488, + "learning_rate": 6.440158309552928e-05, + "loss": 3.7879, + "step": 41935 + }, + { + "epoch": 2.849571952710966, + "grad_norm": 0.19650696218013763, + "learning_rate": 6.439733659464602e-05, + "loss": 3.7752, + "step": 41940 + }, + { + "epoch": 2.849911672781628, + "grad_norm": 0.18683508038520813, + "learning_rate": 6.439309009376274e-05, + "loss": 3.8515, + "step": 41945 + }, + { + "epoch": 2.8502513928522895, + "grad_norm": 1.660178542137146, + "learning_rate": 6.438884359287946e-05, + "loss": 3.6935, + "step": 41950 + }, + { + "epoch": 2.8505911129229515, + "grad_norm": 0.15184149146080017, + "learning_rate": 6.43845970919962e-05, + "loss": 3.9034, + "step": 41955 + }, + { + "epoch": 2.850930832993613, + "grad_norm": 0.16441893577575684, + "learning_rate": 6.438035059111292e-05, + "loss": 3.9521, + "step": 41960 + }, + { + "epoch": 2.851270553064275, + "grad_norm": 0.207831010222435, + "learning_rate": 6.437610409022965e-05, + "loss": 3.8958, + "step": 41965 + }, + { + "epoch": 2.851610273134937, + "grad_norm": 0.12400612980127335, + "learning_rate": 6.437185758934639e-05, + "loss": 3.9067, + "step": 41970 + }, + { + "epoch": 2.8519499932055985, + "grad_norm": 0.16660481691360474, + "learning_rate": 6.43676110884631e-05, + "loss": 3.8738, + "step": 41975 + }, + { + "epoch": 2.85228971327626, + "grad_norm": 0.1526259183883667, + "learning_rate": 6.436336458757983e-05, + "loss": 3.689, + "step": 41980 + }, + { + "epoch": 2.852629433346922, + "grad_norm": 0.19673822820186615, + "learning_rate": 6.435911808669657e-05, + "loss": 4.0663, + "step": 41985 + }, + { + "epoch": 2.852969153417584, + "grad_norm": 0.27934375405311584, + "learning_rate": 6.435487158581329e-05, + "loss": 3.7898, + "step": 41990 + }, + { + "epoch": 2.8533088734882455, + "grad_norm": 0.19643957912921906, + "learning_rate": 6.435062508493002e-05, + "loss": 3.8598, + "step": 41995 + }, + { + "epoch": 2.8536485935589075, + "grad_norm": 1.8453552722930908, + "learning_rate": 6.434637858404674e-05, + "loss": 3.9517, + "step": 42000 + }, + { + "epoch": 2.853988313629569, + "grad_norm": 0.20167630910873413, + "learning_rate": 6.434213208316347e-05, + "loss": 3.8256, + "step": 42005 + }, + { + "epoch": 2.854328033700231, + "grad_norm": 0.17376552522182465, + "learning_rate": 6.43378855822802e-05, + "loss": 3.5834, + "step": 42010 + }, + { + "epoch": 2.854667753770893, + "grad_norm": 0.17216382920742035, + "learning_rate": 6.433363908139693e-05, + "loss": 4.055, + "step": 42015 + }, + { + "epoch": 2.8550074738415545, + "grad_norm": 0.1691320389509201, + "learning_rate": 6.432939258051366e-05, + "loss": 3.9343, + "step": 42020 + }, + { + "epoch": 2.855347193912216, + "grad_norm": 0.2066277712583542, + "learning_rate": 6.432514607963038e-05, + "loss": 4.202, + "step": 42025 + }, + { + "epoch": 2.855686913982878, + "grad_norm": 0.17372705042362213, + "learning_rate": 6.432089957874711e-05, + "loss": 3.8177, + "step": 42030 + }, + { + "epoch": 2.85602663405354, + "grad_norm": 0.22175659239292145, + "learning_rate": 6.431665307786384e-05, + "loss": 3.7414, + "step": 42035 + }, + { + "epoch": 2.8563663541242015, + "grad_norm": 0.18435749411582947, + "learning_rate": 6.431240657698057e-05, + "loss": 3.8698, + "step": 42040 + }, + { + "epoch": 2.8567060741948636, + "grad_norm": 0.17553505301475525, + "learning_rate": 6.43081600760973e-05, + "loss": 4.0391, + "step": 42045 + }, + { + "epoch": 2.857045794265525, + "grad_norm": 0.2689996063709259, + "learning_rate": 6.430391357521402e-05, + "loss": 3.8991, + "step": 42050 + }, + { + "epoch": 2.857385514336187, + "grad_norm": 0.18406876921653748, + "learning_rate": 6.429966707433075e-05, + "loss": 4.0041, + "step": 42055 + }, + { + "epoch": 2.857725234406849, + "grad_norm": 0.2252962589263916, + "learning_rate": 6.429542057344748e-05, + "loss": 3.7086, + "step": 42060 + }, + { + "epoch": 2.8580649544775105, + "grad_norm": 0.22810856997966766, + "learning_rate": 6.429117407256421e-05, + "loss": 4.0802, + "step": 42065 + }, + { + "epoch": 2.858404674548172, + "grad_norm": 0.16532470285892487, + "learning_rate": 6.428692757168094e-05, + "loss": 3.9036, + "step": 42070 + }, + { + "epoch": 2.8587443946188342, + "grad_norm": 0.2874560058116913, + "learning_rate": 6.428268107079766e-05, + "loss": 3.8, + "step": 42075 + }, + { + "epoch": 2.859084114689496, + "grad_norm": 0.2186034917831421, + "learning_rate": 6.427843456991439e-05, + "loss": 3.7611, + "step": 42080 + }, + { + "epoch": 2.8594238347601575, + "grad_norm": 0.13377465307712555, + "learning_rate": 6.427418806903112e-05, + "loss": 3.7712, + "step": 42085 + }, + { + "epoch": 2.8597635548308196, + "grad_norm": 0.1450299322605133, + "learning_rate": 6.426994156814785e-05, + "loss": 3.959, + "step": 42090 + }, + { + "epoch": 2.860103274901481, + "grad_norm": 0.16593030095100403, + "learning_rate": 6.426569506726458e-05, + "loss": 3.8831, + "step": 42095 + }, + { + "epoch": 2.860442994972143, + "grad_norm": 0.2205934375524521, + "learning_rate": 6.42614485663813e-05, + "loss": 3.8915, + "step": 42100 + }, + { + "epoch": 2.860782715042805, + "grad_norm": 0.1499638557434082, + "learning_rate": 6.425720206549803e-05, + "loss": 3.5557, + "step": 42105 + }, + { + "epoch": 2.8611224351134665, + "grad_norm": 0.17486800253391266, + "learning_rate": 6.425295556461476e-05, + "loss": 4.1332, + "step": 42110 + }, + { + "epoch": 2.861462155184128, + "grad_norm": 0.19709868729114532, + "learning_rate": 6.424870906373149e-05, + "loss": 3.7774, + "step": 42115 + }, + { + "epoch": 2.8618018752547902, + "grad_norm": 0.17719465494155884, + "learning_rate": 6.424446256284822e-05, + "loss": 3.923, + "step": 42120 + }, + { + "epoch": 2.862141595325452, + "grad_norm": 0.48698195815086365, + "learning_rate": 6.424021606196495e-05, + "loss": 3.9313, + "step": 42125 + }, + { + "epoch": 2.8624813153961135, + "grad_norm": 0.17625924944877625, + "learning_rate": 6.423596956108167e-05, + "loss": 3.8164, + "step": 42130 + }, + { + "epoch": 2.8628210354667756, + "grad_norm": 0.24776925146579742, + "learning_rate": 6.42317230601984e-05, + "loss": 3.9193, + "step": 42135 + }, + { + "epoch": 2.863160755537437, + "grad_norm": 0.33162304759025574, + "learning_rate": 6.422747655931513e-05, + "loss": 3.7893, + "step": 42140 + }, + { + "epoch": 2.863500475608099, + "grad_norm": 0.22492516040802002, + "learning_rate": 6.422323005843186e-05, + "loss": 3.8992, + "step": 42145 + }, + { + "epoch": 2.863840195678761, + "grad_norm": 0.17717601358890533, + "learning_rate": 6.421898355754859e-05, + "loss": 3.8885, + "step": 42150 + }, + { + "epoch": 2.8641799157494225, + "grad_norm": 0.1738208383321762, + "learning_rate": 6.421473705666531e-05, + "loss": 3.9815, + "step": 42155 + }, + { + "epoch": 2.864519635820084, + "grad_norm": 0.14243340492248535, + "learning_rate": 6.421049055578204e-05, + "loss": 3.808, + "step": 42160 + }, + { + "epoch": 2.8648593558907463, + "grad_norm": 0.16131152212619781, + "learning_rate": 6.420624405489877e-05, + "loss": 3.5494, + "step": 42165 + }, + { + "epoch": 2.865199075961408, + "grad_norm": 0.18918795883655548, + "learning_rate": 6.42019975540155e-05, + "loss": 3.916, + "step": 42170 + }, + { + "epoch": 2.8655387960320695, + "grad_norm": 0.15183673799037933, + "learning_rate": 6.419775105313223e-05, + "loss": 3.7503, + "step": 42175 + }, + { + "epoch": 2.8658785161027316, + "grad_norm": 0.17062175273895264, + "learning_rate": 6.419350455224895e-05, + "loss": 3.9167, + "step": 42180 + }, + { + "epoch": 2.866218236173393, + "grad_norm": 0.18888349831104279, + "learning_rate": 6.418925805136568e-05, + "loss": 3.648, + "step": 42185 + }, + { + "epoch": 2.866557956244055, + "grad_norm": 0.1258123815059662, + "learning_rate": 6.418501155048241e-05, + "loss": 3.8722, + "step": 42190 + }, + { + "epoch": 2.866897676314717, + "grad_norm": 0.3555353581905365, + "learning_rate": 6.418076504959914e-05, + "loss": 3.8411, + "step": 42195 + }, + { + "epoch": 2.8672373963853786, + "grad_norm": 0.17677804827690125, + "learning_rate": 6.417651854871585e-05, + "loss": 3.9151, + "step": 42200 + }, + { + "epoch": 2.86757711645604, + "grad_norm": 0.17301495373249054, + "learning_rate": 6.417227204783259e-05, + "loss": 3.8565, + "step": 42205 + }, + { + "epoch": 2.8679168365267023, + "grad_norm": 0.19141343235969543, + "learning_rate": 6.416802554694932e-05, + "loss": 3.976, + "step": 42210 + }, + { + "epoch": 2.868256556597364, + "grad_norm": 0.12724436819553375, + "learning_rate": 6.416377904606604e-05, + "loss": 3.8381, + "step": 42215 + }, + { + "epoch": 2.8685962766680255, + "grad_norm": 0.1666043996810913, + "learning_rate": 6.415953254518278e-05, + "loss": 3.5208, + "step": 42220 + }, + { + "epoch": 2.868935996738687, + "grad_norm": 0.1593688577413559, + "learning_rate": 6.41552860442995e-05, + "loss": 4.0267, + "step": 42225 + }, + { + "epoch": 2.8692757168093492, + "grad_norm": 0.2110791951417923, + "learning_rate": 6.415103954341622e-05, + "loss": 3.7888, + "step": 42230 + }, + { + "epoch": 2.869615436880011, + "grad_norm": 0.14945685863494873, + "learning_rate": 6.414679304253296e-05, + "loss": 3.9696, + "step": 42235 + }, + { + "epoch": 2.8699551569506725, + "grad_norm": 0.4051283001899719, + "learning_rate": 6.414254654164969e-05, + "loss": 3.9562, + "step": 42240 + }, + { + "epoch": 2.8702948770213346, + "grad_norm": 0.18692390620708466, + "learning_rate": 6.41383000407664e-05, + "loss": 3.9384, + "step": 42245 + }, + { + "epoch": 2.870634597091996, + "grad_norm": 0.17489540576934814, + "learning_rate": 6.413405353988315e-05, + "loss": 3.8595, + "step": 42250 + }, + { + "epoch": 2.870974317162658, + "grad_norm": 0.17667339742183685, + "learning_rate": 6.412980703899987e-05, + "loss": 3.9852, + "step": 42255 + }, + { + "epoch": 2.87131403723332, + "grad_norm": 0.13982945680618286, + "learning_rate": 6.412556053811659e-05, + "loss": 3.6952, + "step": 42260 + }, + { + "epoch": 2.8716537573039815, + "grad_norm": 0.1887921243906021, + "learning_rate": 6.412131403723333e-05, + "loss": 3.767, + "step": 42265 + }, + { + "epoch": 2.871993477374643, + "grad_norm": 0.4578472375869751, + "learning_rate": 6.411706753635006e-05, + "loss": 3.8185, + "step": 42270 + }, + { + "epoch": 2.872333197445305, + "grad_norm": 0.38538646697998047, + "learning_rate": 6.411282103546677e-05, + "loss": 4.0265, + "step": 42275 + }, + { + "epoch": 2.872672917515967, + "grad_norm": 0.18965809047222137, + "learning_rate": 6.410857453458351e-05, + "loss": 3.8978, + "step": 42280 + }, + { + "epoch": 2.8730126375866285, + "grad_norm": 0.2590252757072449, + "learning_rate": 6.410432803370023e-05, + "loss": 3.6757, + "step": 42285 + }, + { + "epoch": 2.87335235765729, + "grad_norm": 0.19417735934257507, + "learning_rate": 6.410008153281696e-05, + "loss": 4.0084, + "step": 42290 + }, + { + "epoch": 2.873692077727952, + "grad_norm": 0.17100122570991516, + "learning_rate": 6.40958350319337e-05, + "loss": 3.7615, + "step": 42295 + }, + { + "epoch": 2.874031797798614, + "grad_norm": 0.15806971490383148, + "learning_rate": 6.409158853105041e-05, + "loss": 3.9481, + "step": 42300 + }, + { + "epoch": 2.8743715178692755, + "grad_norm": 0.1755252182483673, + "learning_rate": 6.408734203016714e-05, + "loss": 3.9165, + "step": 42305 + }, + { + "epoch": 2.8747112379399375, + "grad_norm": 0.1627357006072998, + "learning_rate": 6.408309552928388e-05, + "loss": 3.8542, + "step": 42310 + }, + { + "epoch": 2.875050958010599, + "grad_norm": 0.18649645149707794, + "learning_rate": 6.40788490284006e-05, + "loss": 3.9288, + "step": 42315 + }, + { + "epoch": 2.875390678081261, + "grad_norm": 0.19286015629768372, + "learning_rate": 6.407460252751732e-05, + "loss": 3.8515, + "step": 42320 + }, + { + "epoch": 2.875730398151923, + "grad_norm": 0.16702726483345032, + "learning_rate": 6.407035602663407e-05, + "loss": 3.8326, + "step": 42325 + }, + { + "epoch": 2.8760701182225845, + "grad_norm": 0.15657317638397217, + "learning_rate": 6.406610952575078e-05, + "loss": 3.8286, + "step": 42330 + }, + { + "epoch": 2.876409838293246, + "grad_norm": 0.16709597408771515, + "learning_rate": 6.406186302486751e-05, + "loss": 3.8067, + "step": 42335 + }, + { + "epoch": 2.876749558363908, + "grad_norm": 0.17246735095977783, + "learning_rate": 6.405761652398425e-05, + "loss": 3.9482, + "step": 42340 + }, + { + "epoch": 2.87708927843457, + "grad_norm": 0.5548410415649414, + "learning_rate": 6.405337002310096e-05, + "loss": 3.7621, + "step": 42345 + }, + { + "epoch": 2.8774289985052315, + "grad_norm": 0.15623624622821808, + "learning_rate": 6.404912352221769e-05, + "loss": 3.9932, + "step": 42350 + }, + { + "epoch": 2.8777687185758936, + "grad_norm": 0.15263019502162933, + "learning_rate": 6.404487702133442e-05, + "loss": 3.8241, + "step": 42355 + }, + { + "epoch": 2.878108438646555, + "grad_norm": 2.804988384246826, + "learning_rate": 6.404063052045115e-05, + "loss": 3.9546, + "step": 42360 + }, + { + "epoch": 2.878448158717217, + "grad_norm": 0.17220371961593628, + "learning_rate": 6.403638401956788e-05, + "loss": 3.8028, + "step": 42365 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.1591564118862152, + "learning_rate": 6.40321375186846e-05, + "loss": 4.0205, + "step": 42370 + }, + { + "epoch": 2.8791275988585405, + "grad_norm": 0.18920592963695526, + "learning_rate": 6.402789101780133e-05, + "loss": 4.1813, + "step": 42375 + }, + { + "epoch": 2.879467318929202, + "grad_norm": 0.18130503594875336, + "learning_rate": 6.402364451691806e-05, + "loss": 3.9358, + "step": 42380 + }, + { + "epoch": 2.8798070389998642, + "grad_norm": 0.14371179044246674, + "learning_rate": 6.401939801603479e-05, + "loss": 3.7596, + "step": 42385 + }, + { + "epoch": 2.880146759070526, + "grad_norm": 0.16091729700565338, + "learning_rate": 6.401515151515152e-05, + "loss": 3.845, + "step": 42390 + }, + { + "epoch": 2.8804864791411875, + "grad_norm": 0.1795780211687088, + "learning_rate": 6.401090501426824e-05, + "loss": 3.91, + "step": 42395 + }, + { + "epoch": 2.8808261992118496, + "grad_norm": 0.18679945170879364, + "learning_rate": 6.400665851338497e-05, + "loss": 4.113, + "step": 42400 + }, + { + "epoch": 2.881165919282511, + "grad_norm": 0.15946432948112488, + "learning_rate": 6.40024120125017e-05, + "loss": 3.8561, + "step": 42405 + }, + { + "epoch": 2.881505639353173, + "grad_norm": 0.17645789682865143, + "learning_rate": 6.399816551161843e-05, + "loss": 3.8758, + "step": 42410 + }, + { + "epoch": 2.881845359423835, + "grad_norm": 0.1663370430469513, + "learning_rate": 6.399391901073516e-05, + "loss": 3.8797, + "step": 42415 + }, + { + "epoch": 2.8821850794944965, + "grad_norm": 0.22705203294754028, + "learning_rate": 6.398967250985188e-05, + "loss": 4.1325, + "step": 42420 + }, + { + "epoch": 2.882524799565158, + "grad_norm": 0.2427009791135788, + "learning_rate": 6.398542600896861e-05, + "loss": 3.7304, + "step": 42425 + }, + { + "epoch": 2.8828645196358202, + "grad_norm": 0.21676065027713776, + "learning_rate": 6.398117950808534e-05, + "loss": 3.8914, + "step": 42430 + }, + { + "epoch": 2.883204239706482, + "grad_norm": 0.1618509143590927, + "learning_rate": 6.397693300720207e-05, + "loss": 3.8244, + "step": 42435 + }, + { + "epoch": 2.8835439597771435, + "grad_norm": 0.33497121930122375, + "learning_rate": 6.39726865063188e-05, + "loss": 4.0203, + "step": 42440 + }, + { + "epoch": 2.8838836798478056, + "grad_norm": 0.15725912153720856, + "learning_rate": 6.396844000543552e-05, + "loss": 4.1853, + "step": 42445 + }, + { + "epoch": 2.884223399918467, + "grad_norm": 0.20438267290592194, + "learning_rate": 6.396419350455225e-05, + "loss": 3.7241, + "step": 42450 + }, + { + "epoch": 2.884563119989129, + "grad_norm": 0.1585283875465393, + "learning_rate": 6.395994700366898e-05, + "loss": 3.8935, + "step": 42455 + }, + { + "epoch": 2.884902840059791, + "grad_norm": 0.16543687880039215, + "learning_rate": 6.395570050278571e-05, + "loss": 3.9513, + "step": 42460 + }, + { + "epoch": 2.8852425601304525, + "grad_norm": 0.18440183997154236, + "learning_rate": 6.395145400190244e-05, + "loss": 3.8476, + "step": 42465 + }, + { + "epoch": 2.885582280201114, + "grad_norm": 0.17790398001670837, + "learning_rate": 6.394720750101916e-05, + "loss": 3.7925, + "step": 42470 + }, + { + "epoch": 2.8859220002717763, + "grad_norm": 0.21540559828281403, + "learning_rate": 6.394296100013589e-05, + "loss": 4.0347, + "step": 42475 + }, + { + "epoch": 2.886261720342438, + "grad_norm": 0.1780301332473755, + "learning_rate": 6.393871449925262e-05, + "loss": 3.9848, + "step": 42480 + }, + { + "epoch": 2.8866014404130995, + "grad_norm": 0.151422917842865, + "learning_rate": 6.393446799836935e-05, + "loss": 3.7972, + "step": 42485 + }, + { + "epoch": 2.8869411604837616, + "grad_norm": 0.1677219718694687, + "learning_rate": 6.393022149748608e-05, + "loss": 3.8178, + "step": 42490 + }, + { + "epoch": 2.8872808805544232, + "grad_norm": 0.18405354022979736, + "learning_rate": 6.39259749966028e-05, + "loss": 3.9142, + "step": 42495 + }, + { + "epoch": 2.887620600625085, + "grad_norm": 1.5120917558670044, + "learning_rate": 6.392172849571953e-05, + "loss": 3.8231, + "step": 42500 + }, + { + "epoch": 2.887960320695747, + "grad_norm": 0.6204794645309448, + "learning_rate": 6.391748199483626e-05, + "loss": 3.908, + "step": 42505 + }, + { + "epoch": 2.8883000407664086, + "grad_norm": 0.21885772049427032, + "learning_rate": 6.391323549395299e-05, + "loss": 3.7794, + "step": 42510 + }, + { + "epoch": 2.88863976083707, + "grad_norm": 0.16650491952896118, + "learning_rate": 6.390898899306972e-05, + "loss": 3.7201, + "step": 42515 + }, + { + "epoch": 2.8889794809077323, + "grad_norm": 0.42478442192077637, + "learning_rate": 6.390474249218644e-05, + "loss": 4.089, + "step": 42520 + }, + { + "epoch": 2.889319200978394, + "grad_norm": 0.2032255381345749, + "learning_rate": 6.390049599130317e-05, + "loss": 3.9173, + "step": 42525 + }, + { + "epoch": 2.8896589210490555, + "grad_norm": 0.23365187644958496, + "learning_rate": 6.38962494904199e-05, + "loss": 3.9498, + "step": 42530 + }, + { + "epoch": 2.8899986411197176, + "grad_norm": 0.1686418056488037, + "learning_rate": 6.389200298953663e-05, + "loss": 3.5511, + "step": 42535 + }, + { + "epoch": 2.8903383611903792, + "grad_norm": 0.26483574509620667, + "learning_rate": 6.388775648865336e-05, + "loss": 4.0474, + "step": 42540 + }, + { + "epoch": 2.890678081261041, + "grad_norm": 0.1734277456998825, + "learning_rate": 6.388350998777008e-05, + "loss": 3.7372, + "step": 42545 + }, + { + "epoch": 2.891017801331703, + "grad_norm": 0.15698841214179993, + "learning_rate": 6.387926348688681e-05, + "loss": 3.6325, + "step": 42550 + }, + { + "epoch": 2.8913575214023646, + "grad_norm": 0.35786861181259155, + "learning_rate": 6.387501698600353e-05, + "loss": 3.6851, + "step": 42555 + }, + { + "epoch": 2.891697241473026, + "grad_norm": 0.16560539603233337, + "learning_rate": 6.387077048512027e-05, + "loss": 3.9133, + "step": 42560 + }, + { + "epoch": 2.8920369615436883, + "grad_norm": 0.22604666650295258, + "learning_rate": 6.3866523984237e-05, + "loss": 4.0316, + "step": 42565 + }, + { + "epoch": 2.89237668161435, + "grad_norm": 0.14985089004039764, + "learning_rate": 6.386227748335371e-05, + "loss": 3.923, + "step": 42570 + }, + { + "epoch": 2.8927164016850115, + "grad_norm": 0.16401958465576172, + "learning_rate": 6.385803098247045e-05, + "loss": 3.8322, + "step": 42575 + }, + { + "epoch": 2.893056121755673, + "grad_norm": 0.16869987547397614, + "learning_rate": 6.385378448158718e-05, + "loss": 3.7692, + "step": 42580 + }, + { + "epoch": 2.8933958418263352, + "grad_norm": 0.16792012751102448, + "learning_rate": 6.38495379807039e-05, + "loss": 3.8384, + "step": 42585 + }, + { + "epoch": 2.893735561896997, + "grad_norm": 0.2551441192626953, + "learning_rate": 6.384529147982064e-05, + "loss": 3.9565, + "step": 42590 + }, + { + "epoch": 2.8940752819676585, + "grad_norm": 0.17610546946525574, + "learning_rate": 6.384104497893736e-05, + "loss": 3.9393, + "step": 42595 + }, + { + "epoch": 2.8944150020383206, + "grad_norm": 0.2164703756570816, + "learning_rate": 6.383679847805408e-05, + "loss": 3.994, + "step": 42600 + }, + { + "epoch": 2.894754722108982, + "grad_norm": 0.18191246688365936, + "learning_rate": 6.383255197717082e-05, + "loss": 3.6214, + "step": 42605 + }, + { + "epoch": 2.895094442179644, + "grad_norm": 0.15292660892009735, + "learning_rate": 6.382830547628755e-05, + "loss": 3.7773, + "step": 42610 + }, + { + "epoch": 2.8954341622503055, + "grad_norm": 0.1744583249092102, + "learning_rate": 6.382405897540426e-05, + "loss": 3.6723, + "step": 42615 + }, + { + "epoch": 2.8957738823209676, + "grad_norm": 0.14193004369735718, + "learning_rate": 6.3819812474521e-05, + "loss": 3.9445, + "step": 42620 + }, + { + "epoch": 2.896113602391629, + "grad_norm": 0.18737244606018066, + "learning_rate": 6.381556597363772e-05, + "loss": 3.8572, + "step": 42625 + }, + { + "epoch": 2.896453322462291, + "grad_norm": 1.0304700136184692, + "learning_rate": 6.381131947275445e-05, + "loss": 3.8017, + "step": 42630 + }, + { + "epoch": 2.896793042532953, + "grad_norm": 0.1628858745098114, + "learning_rate": 6.380707297187119e-05, + "loss": 3.9315, + "step": 42635 + }, + { + "epoch": 2.8971327626036145, + "grad_norm": 0.1872030794620514, + "learning_rate": 6.38028264709879e-05, + "loss": 3.9, + "step": 42640 + }, + { + "epoch": 2.897472482674276, + "grad_norm": 0.22762072086334229, + "learning_rate": 6.379857997010463e-05, + "loss": 4.1173, + "step": 42645 + }, + { + "epoch": 2.8978122027449382, + "grad_norm": 0.12068489193916321, + "learning_rate": 6.379433346922137e-05, + "loss": 3.9659, + "step": 42650 + }, + { + "epoch": 2.8981519228156, + "grad_norm": 0.1595483124256134, + "learning_rate": 6.379008696833809e-05, + "loss": 3.82, + "step": 42655 + }, + { + "epoch": 2.8984916428862615, + "grad_norm": 0.18977203965187073, + "learning_rate": 6.378584046745481e-05, + "loss": 3.8263, + "step": 42660 + }, + { + "epoch": 2.8988313629569236, + "grad_norm": 0.16803434491157532, + "learning_rate": 6.378159396657156e-05, + "loss": 3.848, + "step": 42665 + }, + { + "epoch": 2.899171083027585, + "grad_norm": 0.20173317193984985, + "learning_rate": 6.377734746568827e-05, + "loss": 3.7731, + "step": 42670 + }, + { + "epoch": 2.899510803098247, + "grad_norm": 0.39329642057418823, + "learning_rate": 6.3773100964805e-05, + "loss": 3.8812, + "step": 42675 + }, + { + "epoch": 2.899850523168909, + "grad_norm": 0.18608032166957855, + "learning_rate": 6.376885446392174e-05, + "loss": 3.7029, + "step": 42680 + }, + { + "epoch": 2.9001902432395705, + "grad_norm": 0.18728159368038177, + "learning_rate": 6.376460796303845e-05, + "loss": 3.7947, + "step": 42685 + }, + { + "epoch": 2.900529963310232, + "grad_norm": 0.21135330200195312, + "learning_rate": 6.376036146215518e-05, + "loss": 3.7946, + "step": 42690 + }, + { + "epoch": 2.9008696833808942, + "grad_norm": 0.16531476378440857, + "learning_rate": 6.375611496127192e-05, + "loss": 3.796, + "step": 42695 + }, + { + "epoch": 2.901209403451556, + "grad_norm": 0.362165629863739, + "learning_rate": 6.375186846038864e-05, + "loss": 3.9909, + "step": 42700 + }, + { + "epoch": 2.9015491235222175, + "grad_norm": 0.19605541229248047, + "learning_rate": 6.374762195950537e-05, + "loss": 4.1568, + "step": 42705 + }, + { + "epoch": 2.9018888435928796, + "grad_norm": 0.18867088854312897, + "learning_rate": 6.37433754586221e-05, + "loss": 3.9416, + "step": 42710 + }, + { + "epoch": 2.902228563663541, + "grad_norm": 0.1788673847913742, + "learning_rate": 6.373912895773882e-05, + "loss": 3.9564, + "step": 42715 + }, + { + "epoch": 2.902568283734203, + "grad_norm": 0.15481312572956085, + "learning_rate": 6.373488245685555e-05, + "loss": 3.7531, + "step": 42720 + }, + { + "epoch": 2.902908003804865, + "grad_norm": 0.18201923370361328, + "learning_rate": 6.373063595597228e-05, + "loss": 3.8525, + "step": 42725 + }, + { + "epoch": 2.9032477238755265, + "grad_norm": 0.19387592375278473, + "learning_rate": 6.372638945508901e-05, + "loss": 3.9778, + "step": 42730 + }, + { + "epoch": 2.903587443946188, + "grad_norm": 0.14957237243652344, + "learning_rate": 6.372214295420573e-05, + "loss": 3.9223, + "step": 42735 + }, + { + "epoch": 2.9039271640168502, + "grad_norm": 0.207451730966568, + "learning_rate": 6.371789645332246e-05, + "loss": 3.8615, + "step": 42740 + }, + { + "epoch": 2.904266884087512, + "grad_norm": 0.1526460498571396, + "learning_rate": 6.371364995243919e-05, + "loss": 3.6656, + "step": 42745 + }, + { + "epoch": 2.9046066041581735, + "grad_norm": 0.17573630809783936, + "learning_rate": 6.370940345155592e-05, + "loss": 3.7132, + "step": 42750 + }, + { + "epoch": 2.9049463242288356, + "grad_norm": 0.17000098526477814, + "learning_rate": 6.370515695067265e-05, + "loss": 3.8876, + "step": 42755 + }, + { + "epoch": 2.905286044299497, + "grad_norm": 0.17001508176326752, + "learning_rate": 6.370091044978937e-05, + "loss": 3.8239, + "step": 42760 + }, + { + "epoch": 2.905625764370159, + "grad_norm": 0.18162497878074646, + "learning_rate": 6.36966639489061e-05, + "loss": 3.8375, + "step": 42765 + }, + { + "epoch": 2.905965484440821, + "grad_norm": 0.16857455670833588, + "learning_rate": 6.369241744802283e-05, + "loss": 3.8933, + "step": 42770 + }, + { + "epoch": 2.9063052045114826, + "grad_norm": 0.3110954463481903, + "learning_rate": 6.368817094713956e-05, + "loss": 4.0816, + "step": 42775 + }, + { + "epoch": 2.906644924582144, + "grad_norm": 0.16310806572437286, + "learning_rate": 6.368392444625629e-05, + "loss": 3.994, + "step": 42780 + }, + { + "epoch": 2.9069846446528063, + "grad_norm": 0.19577211141586304, + "learning_rate": 6.367967794537301e-05, + "loss": 3.6067, + "step": 42785 + }, + { + "epoch": 2.907324364723468, + "grad_norm": 0.35532045364379883, + "learning_rate": 6.367543144448974e-05, + "loss": 3.9602, + "step": 42790 + }, + { + "epoch": 2.9076640847941295, + "grad_norm": 0.16420099139213562, + "learning_rate": 6.367118494360647e-05, + "loss": 3.6505, + "step": 42795 + }, + { + "epoch": 2.9080038048647916, + "grad_norm": 0.15293315052986145, + "learning_rate": 6.36669384427232e-05, + "loss": 3.9104, + "step": 42800 + }, + { + "epoch": 2.9083435249354532, + "grad_norm": 0.22291739284992218, + "learning_rate": 6.366269194183993e-05, + "loss": 3.8022, + "step": 42805 + }, + { + "epoch": 2.908683245006115, + "grad_norm": 0.14896437525749207, + "learning_rate": 6.365844544095666e-05, + "loss": 3.9733, + "step": 42810 + }, + { + "epoch": 2.909022965076777, + "grad_norm": 0.31311655044555664, + "learning_rate": 6.365419894007338e-05, + "loss": 3.9189, + "step": 42815 + }, + { + "epoch": 2.9093626851474386, + "grad_norm": 0.1982249617576599, + "learning_rate": 6.364995243919011e-05, + "loss": 3.9281, + "step": 42820 + }, + { + "epoch": 2.9097024052181, + "grad_norm": 0.16015976667404175, + "learning_rate": 6.364570593830684e-05, + "loss": 3.9626, + "step": 42825 + }, + { + "epoch": 2.9100421252887623, + "grad_norm": 0.13656026124954224, + "learning_rate": 6.364145943742357e-05, + "loss": 3.665, + "step": 42830 + }, + { + "epoch": 2.910381845359424, + "grad_norm": 3.387693166732788, + "learning_rate": 6.36372129365403e-05, + "loss": 4.1043, + "step": 42835 + }, + { + "epoch": 2.9107215654300855, + "grad_norm": 0.3883932828903198, + "learning_rate": 6.363296643565702e-05, + "loss": 3.9375, + "step": 42840 + }, + { + "epoch": 2.9110612855007476, + "grad_norm": 0.43694281578063965, + "learning_rate": 6.362871993477375e-05, + "loss": 3.8004, + "step": 42845 + }, + { + "epoch": 2.9114010055714092, + "grad_norm": 0.16275537014007568, + "learning_rate": 6.362447343389048e-05, + "loss": 3.8712, + "step": 42850 + }, + { + "epoch": 2.911740725642071, + "grad_norm": 0.19572852551937103, + "learning_rate": 6.362022693300721e-05, + "loss": 3.8899, + "step": 42855 + }, + { + "epoch": 2.912080445712733, + "grad_norm": 0.1832486093044281, + "learning_rate": 6.361598043212394e-05, + "loss": 4.0243, + "step": 42860 + }, + { + "epoch": 2.9124201657833946, + "grad_norm": 0.5265936851501465, + "learning_rate": 6.361173393124066e-05, + "loss": 3.8377, + "step": 42865 + }, + { + "epoch": 2.912759885854056, + "grad_norm": 0.17102496325969696, + "learning_rate": 6.360748743035739e-05, + "loss": 3.9294, + "step": 42870 + }, + { + "epoch": 2.9130996059247183, + "grad_norm": 0.17768944799900055, + "learning_rate": 6.360324092947412e-05, + "loss": 3.9309, + "step": 42875 + }, + { + "epoch": 2.91343932599538, + "grad_norm": 0.1541418582201004, + "learning_rate": 6.359899442859085e-05, + "loss": 3.9878, + "step": 42880 + }, + { + "epoch": 2.9137790460660415, + "grad_norm": 0.14836028218269348, + "learning_rate": 6.359474792770758e-05, + "loss": 4.1365, + "step": 42885 + }, + { + "epoch": 2.9141187661367036, + "grad_norm": 0.5424960255622864, + "learning_rate": 6.35905014268243e-05, + "loss": 3.8559, + "step": 42890 + }, + { + "epoch": 2.9144584862073653, + "grad_norm": 0.1550600528717041, + "learning_rate": 6.358625492594103e-05, + "loss": 3.9085, + "step": 42895 + }, + { + "epoch": 2.914798206278027, + "grad_norm": 0.44373661279678345, + "learning_rate": 6.358200842505776e-05, + "loss": 3.7388, + "step": 42900 + }, + { + "epoch": 2.915137926348689, + "grad_norm": 0.14855138957500458, + "learning_rate": 6.357776192417449e-05, + "loss": 4.0906, + "step": 42905 + }, + { + "epoch": 2.9154776464193506, + "grad_norm": 0.1742040067911148, + "learning_rate": 6.35735154232912e-05, + "loss": 3.7644, + "step": 42910 + }, + { + "epoch": 2.915817366490012, + "grad_norm": 0.16304340958595276, + "learning_rate": 6.356926892240794e-05, + "loss": 3.8708, + "step": 42915 + }, + { + "epoch": 2.916157086560674, + "grad_norm": 0.19485962390899658, + "learning_rate": 6.356502242152467e-05, + "loss": 3.786, + "step": 42920 + }, + { + "epoch": 2.916496806631336, + "grad_norm": 0.1684819906949997, + "learning_rate": 6.356077592064139e-05, + "loss": 3.9847, + "step": 42925 + }, + { + "epoch": 2.9168365267019976, + "grad_norm": 0.16969476640224457, + "learning_rate": 6.355652941975813e-05, + "loss": 3.8146, + "step": 42930 + }, + { + "epoch": 2.917176246772659, + "grad_norm": 0.20562702417373657, + "learning_rate": 6.355228291887486e-05, + "loss": 3.9352, + "step": 42935 + }, + { + "epoch": 2.9175159668433213, + "grad_norm": 0.1757957935333252, + "learning_rate": 6.354803641799157e-05, + "loss": 4.0389, + "step": 42940 + }, + { + "epoch": 2.917855686913983, + "grad_norm": 0.18166053295135498, + "learning_rate": 6.354378991710831e-05, + "loss": 4.1048, + "step": 42945 + }, + { + "epoch": 2.9181954069846445, + "grad_norm": 0.15788763761520386, + "learning_rate": 6.353954341622504e-05, + "loss": 3.8161, + "step": 42950 + }, + { + "epoch": 2.918535127055306, + "grad_norm": 0.2247292697429657, + "learning_rate": 6.353529691534175e-05, + "loss": 3.6966, + "step": 42955 + }, + { + "epoch": 2.9188748471259682, + "grad_norm": 0.16797727346420288, + "learning_rate": 6.35310504144585e-05, + "loss": 3.8459, + "step": 42960 + }, + { + "epoch": 2.91921456719663, + "grad_norm": 0.15992844104766846, + "learning_rate": 6.352680391357522e-05, + "loss": 3.9426, + "step": 42965 + }, + { + "epoch": 2.9195542872672915, + "grad_norm": 0.1927058845758438, + "learning_rate": 6.352255741269194e-05, + "loss": 3.994, + "step": 42970 + }, + { + "epoch": 2.9198940073379536, + "grad_norm": 0.14589177072048187, + "learning_rate": 6.351831091180868e-05, + "loss": 3.8798, + "step": 42975 + }, + { + "epoch": 2.920233727408615, + "grad_norm": 0.16557128727436066, + "learning_rate": 6.35140644109254e-05, + "loss": 4.0168, + "step": 42980 + }, + { + "epoch": 2.920573447479277, + "grad_norm": 0.2846653163433075, + "learning_rate": 6.350981791004212e-05, + "loss": 3.8516, + "step": 42985 + }, + { + "epoch": 2.920913167549939, + "grad_norm": 0.18468447029590607, + "learning_rate": 6.350557140915886e-05, + "loss": 3.7446, + "step": 42990 + }, + { + "epoch": 2.9212528876206005, + "grad_norm": 0.1465940922498703, + "learning_rate": 6.350132490827558e-05, + "loss": 3.7988, + "step": 42995 + }, + { + "epoch": 2.921592607691262, + "grad_norm": 0.17860719561576843, + "learning_rate": 6.34970784073923e-05, + "loss": 3.7632, + "step": 43000 + }, + { + "epoch": 2.9219323277619242, + "grad_norm": 0.16077549755573273, + "learning_rate": 6.349283190650905e-05, + "loss": 4.0716, + "step": 43005 + }, + { + "epoch": 2.922272047832586, + "grad_norm": 0.13944898545742035, + "learning_rate": 6.348858540562576e-05, + "loss": 3.721, + "step": 43010 + }, + { + "epoch": 2.9226117679032475, + "grad_norm": 0.22094428539276123, + "learning_rate": 6.348433890474249e-05, + "loss": 3.6526, + "step": 43015 + }, + { + "epoch": 2.9229514879739096, + "grad_norm": 0.18647481501102448, + "learning_rate": 6.348009240385923e-05, + "loss": 3.9857, + "step": 43020 + }, + { + "epoch": 2.923291208044571, + "grad_norm": 0.156856968998909, + "learning_rate": 6.347584590297595e-05, + "loss": 4.0521, + "step": 43025 + }, + { + "epoch": 2.923630928115233, + "grad_norm": 0.17692551016807556, + "learning_rate": 6.347159940209267e-05, + "loss": 3.9752, + "step": 43030 + }, + { + "epoch": 2.923970648185895, + "grad_norm": 0.21487170457839966, + "learning_rate": 6.346735290120942e-05, + "loss": 3.9121, + "step": 43035 + }, + { + "epoch": 2.9243103682565565, + "grad_norm": 0.16553984582424164, + "learning_rate": 6.346395570050278e-05, + "loss": 3.7717, + "step": 43040 + }, + { + "epoch": 2.924650088327218, + "grad_norm": 0.15427201986312866, + "learning_rate": 6.345970919961953e-05, + "loss": 3.9907, + "step": 43045 + }, + { + "epoch": 2.9249898083978803, + "grad_norm": 0.1493140012025833, + "learning_rate": 6.345546269873624e-05, + "loss": 4.042, + "step": 43050 + }, + { + "epoch": 2.925329528468542, + "grad_norm": 0.18489785492420197, + "learning_rate": 6.345121619785297e-05, + "loss": 3.7536, + "step": 43055 + }, + { + "epoch": 2.9256692485392035, + "grad_norm": 0.7359727621078491, + "learning_rate": 6.344696969696971e-05, + "loss": 3.8603, + "step": 43060 + }, + { + "epoch": 2.9260089686098656, + "grad_norm": 0.15980049967765808, + "learning_rate": 6.344272319608642e-05, + "loss": 3.9984, + "step": 43065 + }, + { + "epoch": 2.926348688680527, + "grad_norm": 0.1875642091035843, + "learning_rate": 6.343847669520315e-05, + "loss": 3.7832, + "step": 43070 + }, + { + "epoch": 2.926688408751189, + "grad_norm": 0.1783336102962494, + "learning_rate": 6.34342301943199e-05, + "loss": 3.8518, + "step": 43075 + }, + { + "epoch": 2.927028128821851, + "grad_norm": 0.23493844270706177, + "learning_rate": 6.342998369343661e-05, + "loss": 3.9384, + "step": 43080 + }, + { + "epoch": 2.9273678488925126, + "grad_norm": 0.19445784389972687, + "learning_rate": 6.342573719255334e-05, + "loss": 3.9123, + "step": 43085 + }, + { + "epoch": 2.927707568963174, + "grad_norm": 0.15529657900333405, + "learning_rate": 6.342149069167008e-05, + "loss": 3.8052, + "step": 43090 + }, + { + "epoch": 2.9280472890338363, + "grad_norm": 0.14504680037498474, + "learning_rate": 6.341724419078679e-05, + "loss": 3.6482, + "step": 43095 + }, + { + "epoch": 2.928387009104498, + "grad_norm": 0.17275138199329376, + "learning_rate": 6.341299768990352e-05, + "loss": 3.6529, + "step": 43100 + }, + { + "epoch": 2.9287267291751595, + "grad_norm": 0.23878975212574005, + "learning_rate": 6.340875118902025e-05, + "loss": 3.7002, + "step": 43105 + }, + { + "epoch": 2.9290664492458216, + "grad_norm": 0.15313144028186798, + "learning_rate": 6.340450468813698e-05, + "loss": 3.8054, + "step": 43110 + }, + { + "epoch": 2.9294061693164832, + "grad_norm": 0.15265750885009766, + "learning_rate": 6.34002581872537e-05, + "loss": 3.9078, + "step": 43115 + }, + { + "epoch": 2.929745889387145, + "grad_norm": 0.19179074466228485, + "learning_rate": 6.339601168637043e-05, + "loss": 3.6998, + "step": 43120 + }, + { + "epoch": 2.930085609457807, + "grad_norm": 0.15456953644752502, + "learning_rate": 6.339176518548716e-05, + "loss": 3.9972, + "step": 43125 + }, + { + "epoch": 2.9304253295284686, + "grad_norm": 0.5207456946372986, + "learning_rate": 6.338751868460389e-05, + "loss": 3.8836, + "step": 43130 + }, + { + "epoch": 2.93076504959913, + "grad_norm": 0.2181585282087326, + "learning_rate": 6.338327218372062e-05, + "loss": 3.5738, + "step": 43135 + }, + { + "epoch": 2.9311047696697923, + "grad_norm": 0.16719487309455872, + "learning_rate": 6.337902568283734e-05, + "loss": 3.823, + "step": 43140 + }, + { + "epoch": 2.931444489740454, + "grad_norm": 0.13165536522865295, + "learning_rate": 6.337477918195407e-05, + "loss": 4.092, + "step": 43145 + }, + { + "epoch": 2.9317842098111155, + "grad_norm": 0.20519907772541046, + "learning_rate": 6.33705326810708e-05, + "loss": 4.1099, + "step": 43150 + }, + { + "epoch": 2.9321239298817776, + "grad_norm": 0.20120161771774292, + "learning_rate": 6.336628618018753e-05, + "loss": 3.8922, + "step": 43155 + }, + { + "epoch": 2.9324636499524392, + "grad_norm": 0.15434595942497253, + "learning_rate": 6.336203967930426e-05, + "loss": 3.7531, + "step": 43160 + }, + { + "epoch": 2.932803370023101, + "grad_norm": 0.13259872794151306, + "learning_rate": 6.335779317842098e-05, + "loss": 3.8577, + "step": 43165 + }, + { + "epoch": 2.933143090093763, + "grad_norm": 0.21577732264995575, + "learning_rate": 6.335354667753771e-05, + "loss": 3.5892, + "step": 43170 + }, + { + "epoch": 2.9334828101644246, + "grad_norm": 0.23784121870994568, + "learning_rate": 6.334930017665444e-05, + "loss": 3.7453, + "step": 43175 + }, + { + "epoch": 2.933822530235086, + "grad_norm": 0.21086563169956207, + "learning_rate": 6.334505367577117e-05, + "loss": 3.9252, + "step": 43180 + }, + { + "epoch": 2.9341622503057483, + "grad_norm": 0.13637295365333557, + "learning_rate": 6.33408071748879e-05, + "loss": 3.6456, + "step": 43185 + }, + { + "epoch": 2.93450197037641, + "grad_norm": 0.17646025121212006, + "learning_rate": 6.333656067400462e-05, + "loss": 3.8409, + "step": 43190 + }, + { + "epoch": 2.9348416904470715, + "grad_norm": 0.22606299817562103, + "learning_rate": 6.333231417312135e-05, + "loss": 3.9637, + "step": 43195 + }, + { + "epoch": 2.9351814105177336, + "grad_norm": 0.2286187708377838, + "learning_rate": 6.332806767223808e-05, + "loss": 4.0262, + "step": 43200 + }, + { + "epoch": 2.9355211305883953, + "grad_norm": 0.22526122629642487, + "learning_rate": 6.332382117135481e-05, + "loss": 3.9211, + "step": 43205 + }, + { + "epoch": 2.935860850659057, + "grad_norm": 0.1503668576478958, + "learning_rate": 6.331957467047154e-05, + "loss": 3.9246, + "step": 43210 + }, + { + "epoch": 2.936200570729719, + "grad_norm": 0.20249786972999573, + "learning_rate": 6.331532816958826e-05, + "loss": 3.8096, + "step": 43215 + }, + { + "epoch": 2.9365402908003806, + "grad_norm": 0.1656864583492279, + "learning_rate": 6.331108166870498e-05, + "loss": 3.783, + "step": 43220 + }, + { + "epoch": 2.9368800108710422, + "grad_norm": 0.19238093495368958, + "learning_rate": 6.330683516782172e-05, + "loss": 3.7623, + "step": 43225 + }, + { + "epoch": 2.9372197309417043, + "grad_norm": 0.17821846902370453, + "learning_rate": 6.330258866693845e-05, + "loss": 3.8911, + "step": 43230 + }, + { + "epoch": 2.937559451012366, + "grad_norm": 0.9674791693687439, + "learning_rate": 6.329834216605516e-05, + "loss": 3.7268, + "step": 43235 + }, + { + "epoch": 2.9378991710830276, + "grad_norm": 0.18059037625789642, + "learning_rate": 6.32940956651719e-05, + "loss": 3.8783, + "step": 43240 + }, + { + "epoch": 2.9382388911536896, + "grad_norm": 0.17589902877807617, + "learning_rate": 6.328984916428863e-05, + "loss": 3.7788, + "step": 43245 + }, + { + "epoch": 2.9385786112243513, + "grad_norm": 0.16261602938175201, + "learning_rate": 6.328560266340535e-05, + "loss": 3.8946, + "step": 43250 + }, + { + "epoch": 2.938918331295013, + "grad_norm": 0.5624969005584717, + "learning_rate": 6.328135616252209e-05, + "loss": 3.9924, + "step": 43255 + }, + { + "epoch": 2.9392580513656745, + "grad_norm": 0.20482897758483887, + "learning_rate": 6.327710966163882e-05, + "loss": 3.9012, + "step": 43260 + }, + { + "epoch": 2.9395977714363366, + "grad_norm": 0.20270642638206482, + "learning_rate": 6.327286316075553e-05, + "loss": 3.7166, + "step": 43265 + }, + { + "epoch": 2.9399374915069982, + "grad_norm": 0.14434543251991272, + "learning_rate": 6.326861665987227e-05, + "loss": 3.7692, + "step": 43270 + }, + { + "epoch": 2.94027721157766, + "grad_norm": 0.1591363400220871, + "learning_rate": 6.3264370158989e-05, + "loss": 3.6205, + "step": 43275 + }, + { + "epoch": 2.940616931648322, + "grad_norm": 0.2531801164150238, + "learning_rate": 6.326012365810572e-05, + "loss": 3.8635, + "step": 43280 + }, + { + "epoch": 2.9409566517189836, + "grad_norm": 0.1809386909008026, + "learning_rate": 6.325587715722246e-05, + "loss": 3.9401, + "step": 43285 + }, + { + "epoch": 2.941296371789645, + "grad_norm": 0.14723245799541473, + "learning_rate": 6.325163065633918e-05, + "loss": 3.8615, + "step": 43290 + }, + { + "epoch": 2.941636091860307, + "grad_norm": 0.17376591265201569, + "learning_rate": 6.32473841554559e-05, + "loss": 4.058, + "step": 43295 + }, + { + "epoch": 2.941975811930969, + "grad_norm": 0.18208543956279755, + "learning_rate": 6.324313765457264e-05, + "loss": 3.976, + "step": 43300 + }, + { + "epoch": 2.9423155320016305, + "grad_norm": 0.16823796927928925, + "learning_rate": 6.323889115368936e-05, + "loss": 3.6039, + "step": 43305 + }, + { + "epoch": 2.942655252072292, + "grad_norm": 0.1642821878194809, + "learning_rate": 6.323464465280608e-05, + "loss": 3.8337, + "step": 43310 + }, + { + "epoch": 2.9429949721429542, + "grad_norm": 0.23384080827236176, + "learning_rate": 6.323039815192282e-05, + "loss": 3.6058, + "step": 43315 + }, + { + "epoch": 2.943334692213616, + "grad_norm": 0.27146849036216736, + "learning_rate": 6.322615165103954e-05, + "loss": 4.0242, + "step": 43320 + }, + { + "epoch": 2.9436744122842775, + "grad_norm": 0.20478108525276184, + "learning_rate": 6.322190515015628e-05, + "loss": 3.8272, + "step": 43325 + }, + { + "epoch": 2.9440141323549396, + "grad_norm": 0.17840644717216492, + "learning_rate": 6.321765864927301e-05, + "loss": 3.807, + "step": 43330 + }, + { + "epoch": 2.944353852425601, + "grad_norm": 0.1565498262643814, + "learning_rate": 6.321341214838972e-05, + "loss": 3.8129, + "step": 43335 + }, + { + "epoch": 2.944693572496263, + "grad_norm": 0.13239286839962006, + "learning_rate": 6.320916564750646e-05, + "loss": 4.0959, + "step": 43340 + }, + { + "epoch": 2.945033292566925, + "grad_norm": 0.20403295755386353, + "learning_rate": 6.320491914662319e-05, + "loss": 3.9601, + "step": 43345 + }, + { + "epoch": 2.9453730126375866, + "grad_norm": 1.9986076354980469, + "learning_rate": 6.320067264573991e-05, + "loss": 3.9698, + "step": 43350 + }, + { + "epoch": 2.945712732708248, + "grad_norm": 0.1705055832862854, + "learning_rate": 6.319642614485665e-05, + "loss": 4.0769, + "step": 43355 + }, + { + "epoch": 2.9460524527789103, + "grad_norm": 0.16654908657073975, + "learning_rate": 6.319217964397338e-05, + "loss": 3.8314, + "step": 43360 + }, + { + "epoch": 2.946392172849572, + "grad_norm": 0.18547603487968445, + "learning_rate": 6.318793314309009e-05, + "loss": 3.7541, + "step": 43365 + }, + { + "epoch": 2.9467318929202335, + "grad_norm": 0.24635308980941772, + "learning_rate": 6.318368664220683e-05, + "loss": 4.0945, + "step": 43370 + }, + { + "epoch": 2.9470716129908956, + "grad_norm": 0.14928443729877472, + "learning_rate": 6.317944014132355e-05, + "loss": 4.053, + "step": 43375 + }, + { + "epoch": 2.9474113330615572, + "grad_norm": 0.17643344402313232, + "learning_rate": 6.317519364044028e-05, + "loss": 3.6792, + "step": 43380 + }, + { + "epoch": 2.947751053132219, + "grad_norm": 0.6682527661323547, + "learning_rate": 6.317094713955702e-05, + "loss": 3.6809, + "step": 43385 + }, + { + "epoch": 2.948090773202881, + "grad_norm": 0.19629082083702087, + "learning_rate": 6.316670063867373e-05, + "loss": 4.0587, + "step": 43390 + }, + { + "epoch": 2.9484304932735426, + "grad_norm": 0.3993152678012848, + "learning_rate": 6.316245413779046e-05, + "loss": 3.8789, + "step": 43395 + }, + { + "epoch": 2.948770213344204, + "grad_norm": 0.17035536468029022, + "learning_rate": 6.31582076369072e-05, + "loss": 3.8334, + "step": 43400 + }, + { + "epoch": 2.9491099334148663, + "grad_norm": 0.18269148468971252, + "learning_rate": 6.315396113602392e-05, + "loss": 3.6527, + "step": 43405 + }, + { + "epoch": 2.949449653485528, + "grad_norm": 0.5065781474113464, + "learning_rate": 6.314971463514064e-05, + "loss": 3.587, + "step": 43410 + }, + { + "epoch": 2.9497893735561895, + "grad_norm": 0.19522784650325775, + "learning_rate": 6.314546813425738e-05, + "loss": 3.8447, + "step": 43415 + }, + { + "epoch": 2.9501290936268516, + "grad_norm": 0.20893366634845734, + "learning_rate": 6.31412216333741e-05, + "loss": 3.6919, + "step": 43420 + }, + { + "epoch": 2.9504688136975132, + "grad_norm": 0.16121025383472443, + "learning_rate": 6.313697513249083e-05, + "loss": 4.088, + "step": 43425 + }, + { + "epoch": 2.950808533768175, + "grad_norm": 0.30044227838516235, + "learning_rate": 6.313272863160757e-05, + "loss": 4.145, + "step": 43430 + }, + { + "epoch": 2.951148253838837, + "grad_norm": 0.25418877601623535, + "learning_rate": 6.312848213072428e-05, + "loss": 4.1403, + "step": 43435 + }, + { + "epoch": 2.9514879739094986, + "grad_norm": 0.180855393409729, + "learning_rate": 6.312423562984101e-05, + "loss": 3.7691, + "step": 43440 + }, + { + "epoch": 2.95182769398016, + "grad_norm": 0.19479963183403015, + "learning_rate": 6.311998912895775e-05, + "loss": 4.0384, + "step": 43445 + }, + { + "epoch": 2.9521674140508223, + "grad_norm": 0.21131008863449097, + "learning_rate": 6.311574262807447e-05, + "loss": 3.8013, + "step": 43450 + }, + { + "epoch": 2.952507134121484, + "grad_norm": 0.21060165762901306, + "learning_rate": 6.31114961271912e-05, + "loss": 3.735, + "step": 43455 + }, + { + "epoch": 2.9528468541921455, + "grad_norm": 0.14968647062778473, + "learning_rate": 6.310724962630792e-05, + "loss": 3.7392, + "step": 43460 + }, + { + "epoch": 2.9531865742628076, + "grad_norm": 0.16066908836364746, + "learning_rate": 6.310300312542465e-05, + "loss": 3.7895, + "step": 43465 + }, + { + "epoch": 2.9535262943334692, + "grad_norm": 0.17307181656360626, + "learning_rate": 6.309875662454138e-05, + "loss": 4.1163, + "step": 43470 + }, + { + "epoch": 2.953866014404131, + "grad_norm": 0.18590393662452698, + "learning_rate": 6.309451012365811e-05, + "loss": 3.726, + "step": 43475 + }, + { + "epoch": 2.954205734474793, + "grad_norm": 0.20651349425315857, + "learning_rate": 6.309026362277484e-05, + "loss": 4.0227, + "step": 43480 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.12349123507738113, + "learning_rate": 6.308601712189156e-05, + "loss": 3.9602, + "step": 43485 + }, + { + "epoch": 2.954885174616116, + "grad_norm": 0.1655462682247162, + "learning_rate": 6.308177062100829e-05, + "loss": 3.9979, + "step": 43490 + }, + { + "epoch": 2.9552248946867783, + "grad_norm": 1.8286194801330566, + "learning_rate": 6.307752412012502e-05, + "loss": 3.715, + "step": 43495 + }, + { + "epoch": 2.95556461475744, + "grad_norm": 1.154184103012085, + "learning_rate": 6.307327761924175e-05, + "loss": 3.7997, + "step": 43500 + }, + { + "epoch": 2.9559043348281016, + "grad_norm": 0.18022382259368896, + "learning_rate": 6.306903111835848e-05, + "loss": 3.6951, + "step": 43505 + }, + { + "epoch": 2.9562440548987636, + "grad_norm": 0.5330966711044312, + "learning_rate": 6.30647846174752e-05, + "loss": 3.9466, + "step": 43510 + }, + { + "epoch": 2.9565837749694253, + "grad_norm": 0.32436859607696533, + "learning_rate": 6.306053811659193e-05, + "loss": 3.6513, + "step": 43515 + }, + { + "epoch": 2.956923495040087, + "grad_norm": 0.14062456786632538, + "learning_rate": 6.305629161570866e-05, + "loss": 3.8913, + "step": 43520 + }, + { + "epoch": 2.957263215110749, + "grad_norm": 0.16726896166801453, + "learning_rate": 6.305204511482539e-05, + "loss": 3.9659, + "step": 43525 + }, + { + "epoch": 2.9576029351814106, + "grad_norm": 0.20263756811618805, + "learning_rate": 6.304779861394212e-05, + "loss": 3.9404, + "step": 43530 + }, + { + "epoch": 2.9579426552520722, + "grad_norm": 0.1615334153175354, + "learning_rate": 6.304355211305884e-05, + "loss": 3.7408, + "step": 43535 + }, + { + "epoch": 2.9582823753227343, + "grad_norm": 0.23241275548934937, + "learning_rate": 6.303930561217557e-05, + "loss": 3.6304, + "step": 43540 + }, + { + "epoch": 2.958622095393396, + "grad_norm": 0.16309992969036102, + "learning_rate": 6.30350591112923e-05, + "loss": 3.8404, + "step": 43545 + }, + { + "epoch": 2.9589618154640576, + "grad_norm": 0.19634264707565308, + "learning_rate": 6.303081261040903e-05, + "loss": 3.9086, + "step": 43550 + }, + { + "epoch": 2.9593015355347196, + "grad_norm": 0.36743587255477905, + "learning_rate": 6.302656610952576e-05, + "loss": 3.8047, + "step": 43555 + }, + { + "epoch": 2.9596412556053813, + "grad_norm": 0.17164462804794312, + "learning_rate": 6.302231960864248e-05, + "loss": 3.8717, + "step": 43560 + }, + { + "epoch": 2.959980975676043, + "grad_norm": 0.43131428956985474, + "learning_rate": 6.301807310775921e-05, + "loss": 3.6683, + "step": 43565 + }, + { + "epoch": 2.960320695746705, + "grad_norm": 0.36243727803230286, + "learning_rate": 6.301382660687594e-05, + "loss": 3.6032, + "step": 43570 + }, + { + "epoch": 2.9606604158173666, + "grad_norm": 0.16992788016796112, + "learning_rate": 6.300958010599265e-05, + "loss": 3.8909, + "step": 43575 + }, + { + "epoch": 2.9610001358880282, + "grad_norm": 0.16279467940330505, + "learning_rate": 6.30053336051094e-05, + "loss": 3.8637, + "step": 43580 + }, + { + "epoch": 2.9613398559586903, + "grad_norm": 0.17922942340373993, + "learning_rate": 6.300108710422612e-05, + "loss": 3.9334, + "step": 43585 + }, + { + "epoch": 2.961679576029352, + "grad_norm": 1.0467777252197266, + "learning_rate": 6.299684060334284e-05, + "loss": 4.032, + "step": 43590 + }, + { + "epoch": 2.9620192961000136, + "grad_norm": 0.17994962632656097, + "learning_rate": 6.299259410245958e-05, + "loss": 3.7107, + "step": 43595 + }, + { + "epoch": 2.962359016170675, + "grad_norm": 0.16418002545833588, + "learning_rate": 6.298834760157631e-05, + "loss": 3.9271, + "step": 43600 + }, + { + "epoch": 2.9626987362413373, + "grad_norm": 0.14179669320583344, + "learning_rate": 6.298410110069302e-05, + "loss": 3.8605, + "step": 43605 + }, + { + "epoch": 2.963038456311999, + "grad_norm": 0.19066330790519714, + "learning_rate": 6.297985459980976e-05, + "loss": 3.8843, + "step": 43610 + }, + { + "epoch": 2.9633781763826605, + "grad_norm": 0.1537814736366272, + "learning_rate": 6.297560809892649e-05, + "loss": 3.6217, + "step": 43615 + }, + { + "epoch": 2.9637178964533226, + "grad_norm": 0.15620434284210205, + "learning_rate": 6.29713615980432e-05, + "loss": 3.9486, + "step": 43620 + }, + { + "epoch": 2.9640576165239843, + "grad_norm": 0.18316452205181122, + "learning_rate": 6.296711509715995e-05, + "loss": 3.9258, + "step": 43625 + }, + { + "epoch": 2.964397336594646, + "grad_norm": 0.24229392409324646, + "learning_rate": 6.296286859627668e-05, + "loss": 3.7352, + "step": 43630 + }, + { + "epoch": 2.9647370566653075, + "grad_norm": 0.2125084549188614, + "learning_rate": 6.295862209539339e-05, + "loss": 3.9246, + "step": 43635 + }, + { + "epoch": 2.9650767767359696, + "grad_norm": 0.1955076903104782, + "learning_rate": 6.295437559451013e-05, + "loss": 3.9465, + "step": 43640 + }, + { + "epoch": 2.965416496806631, + "grad_norm": 0.15660282969474792, + "learning_rate": 6.295012909362686e-05, + "loss": 3.907, + "step": 43645 + }, + { + "epoch": 2.965756216877293, + "grad_norm": 0.2951078414916992, + "learning_rate": 6.294588259274357e-05, + "loss": 3.9581, + "step": 43650 + }, + { + "epoch": 2.966095936947955, + "grad_norm": 4.251708030700684, + "learning_rate": 6.294163609186032e-05, + "loss": 4.0309, + "step": 43655 + }, + { + "epoch": 2.9664356570186166, + "grad_norm": 0.17603333294391632, + "learning_rate": 6.293738959097703e-05, + "loss": 3.9795, + "step": 43660 + }, + { + "epoch": 2.966775377089278, + "grad_norm": 0.17622163891792297, + "learning_rate": 6.293314309009377e-05, + "loss": 3.8104, + "step": 43665 + }, + { + "epoch": 2.9671150971599403, + "grad_norm": 0.16364796459674835, + "learning_rate": 6.29288965892105e-05, + "loss": 4.0277, + "step": 43670 + }, + { + "epoch": 2.967454817230602, + "grad_norm": 0.1348246932029724, + "learning_rate": 6.292465008832721e-05, + "loss": 3.815, + "step": 43675 + }, + { + "epoch": 2.9677945373012635, + "grad_norm": 0.191290944814682, + "learning_rate": 6.292040358744396e-05, + "loss": 4.0041, + "step": 43680 + }, + { + "epoch": 2.9681342573719256, + "grad_norm": 0.15270498394966125, + "learning_rate": 6.291615708656068e-05, + "loss": 4.0224, + "step": 43685 + }, + { + "epoch": 2.9684739774425872, + "grad_norm": 0.14189204573631287, + "learning_rate": 6.29119105856774e-05, + "loss": 3.7108, + "step": 43690 + }, + { + "epoch": 2.968813697513249, + "grad_norm": 0.1789444386959076, + "learning_rate": 6.290766408479414e-05, + "loss": 4.1085, + "step": 43695 + }, + { + "epoch": 2.969153417583911, + "grad_norm": 0.19486597180366516, + "learning_rate": 6.290341758391087e-05, + "loss": 3.8149, + "step": 43700 + }, + { + "epoch": 2.9694931376545726, + "grad_norm": 0.20657891035079956, + "learning_rate": 6.289917108302758e-05, + "loss": 3.9472, + "step": 43705 + }, + { + "epoch": 2.969832857725234, + "grad_norm": 0.16749605536460876, + "learning_rate": 6.289492458214432e-05, + "loss": 3.7886, + "step": 43710 + }, + { + "epoch": 2.9701725777958963, + "grad_norm": 0.19317512214183807, + "learning_rate": 6.289067808126105e-05, + "loss": 3.705, + "step": 43715 + }, + { + "epoch": 2.970512297866558, + "grad_norm": 0.14873214066028595, + "learning_rate": 6.288643158037777e-05, + "loss": 3.6573, + "step": 43720 + }, + { + "epoch": 2.9708520179372195, + "grad_norm": 0.2057671993970871, + "learning_rate": 6.288218507949451e-05, + "loss": 3.7893, + "step": 43725 + }, + { + "epoch": 2.9711917380078816, + "grad_norm": 0.2171197235584259, + "learning_rate": 6.287793857861122e-05, + "loss": 4.0177, + "step": 43730 + }, + { + "epoch": 2.9715314580785432, + "grad_norm": 0.17153143882751465, + "learning_rate": 6.287369207772795e-05, + "loss": 3.9598, + "step": 43735 + }, + { + "epoch": 2.971871178149205, + "grad_norm": 0.19196929037570953, + "learning_rate": 6.286944557684469e-05, + "loss": 3.7864, + "step": 43740 + }, + { + "epoch": 2.972210898219867, + "grad_norm": 0.16618981957435608, + "learning_rate": 6.28651990759614e-05, + "loss": 4.026, + "step": 43745 + }, + { + "epoch": 2.9725506182905286, + "grad_norm": 0.2652030289173126, + "learning_rate": 6.286095257507813e-05, + "loss": 3.8921, + "step": 43750 + }, + { + "epoch": 2.97289033836119, + "grad_norm": 0.17412860691547394, + "learning_rate": 6.285670607419488e-05, + "loss": 3.806, + "step": 43755 + }, + { + "epoch": 2.9732300584318523, + "grad_norm": 0.23198889195919037, + "learning_rate": 6.285245957331159e-05, + "loss": 4.1742, + "step": 43760 + }, + { + "epoch": 2.973569778502514, + "grad_norm": 0.22028906643390656, + "learning_rate": 6.284821307242832e-05, + "loss": 3.8894, + "step": 43765 + }, + { + "epoch": 2.9739094985731755, + "grad_norm": 0.13870121538639069, + "learning_rate": 6.284396657154506e-05, + "loss": 3.9152, + "step": 43770 + }, + { + "epoch": 2.9742492186438376, + "grad_norm": 1.1242376565933228, + "learning_rate": 6.283972007066177e-05, + "loss": 3.6634, + "step": 43775 + }, + { + "epoch": 2.9745889387144993, + "grad_norm": 0.16807390749454498, + "learning_rate": 6.28354735697785e-05, + "loss": 3.943, + "step": 43780 + }, + { + "epoch": 2.974928658785161, + "grad_norm": 0.17908912897109985, + "learning_rate": 6.283122706889524e-05, + "loss": 3.7565, + "step": 43785 + }, + { + "epoch": 2.975268378855823, + "grad_norm": 0.15372201800346375, + "learning_rate": 6.282698056801196e-05, + "loss": 3.9512, + "step": 43790 + }, + { + "epoch": 2.9756080989264846, + "grad_norm": 0.21235322952270508, + "learning_rate": 6.282273406712869e-05, + "loss": 3.9256, + "step": 43795 + }, + { + "epoch": 2.975947818997146, + "grad_norm": 0.16537973284721375, + "learning_rate": 6.281848756624541e-05, + "loss": 3.7884, + "step": 43800 + }, + { + "epoch": 2.9762875390678083, + "grad_norm": 0.1616590917110443, + "learning_rate": 6.281424106536214e-05, + "loss": 3.9136, + "step": 43805 + }, + { + "epoch": 2.97662725913847, + "grad_norm": 0.8023096919059753, + "learning_rate": 6.280999456447887e-05, + "loss": 3.757, + "step": 43810 + }, + { + "epoch": 2.9769669792091316, + "grad_norm": 0.1604900360107422, + "learning_rate": 6.28057480635956e-05, + "loss": 4.1112, + "step": 43815 + }, + { + "epoch": 2.9773066992797936, + "grad_norm": 0.19210541248321533, + "learning_rate": 6.280150156271233e-05, + "loss": 4.0952, + "step": 43820 + }, + { + "epoch": 2.9776464193504553, + "grad_norm": 0.4921315014362335, + "learning_rate": 6.279725506182905e-05, + "loss": 3.8529, + "step": 43825 + }, + { + "epoch": 2.977986139421117, + "grad_norm": 0.18518052995204926, + "learning_rate": 6.279300856094578e-05, + "loss": 3.7679, + "step": 43830 + }, + { + "epoch": 2.978325859491779, + "grad_norm": 0.19248434901237488, + "learning_rate": 6.278876206006251e-05, + "loss": 3.5629, + "step": 43835 + }, + { + "epoch": 2.9786655795624406, + "grad_norm": 0.15910610556602478, + "learning_rate": 6.278451555917924e-05, + "loss": 3.8569, + "step": 43840 + }, + { + "epoch": 2.9790052996331022, + "grad_norm": 0.2547973394393921, + "learning_rate": 6.278026905829597e-05, + "loss": 3.8826, + "step": 43845 + }, + { + "epoch": 2.9793450197037643, + "grad_norm": 0.9047517776489258, + "learning_rate": 6.27760225574127e-05, + "loss": 3.9552, + "step": 43850 + }, + { + "epoch": 2.979684739774426, + "grad_norm": 0.17821423709392548, + "learning_rate": 6.277177605652942e-05, + "loss": 3.7151, + "step": 43855 + }, + { + "epoch": 2.9800244598450876, + "grad_norm": 0.20938356220722198, + "learning_rate": 6.276752955564615e-05, + "loss": 3.9205, + "step": 43860 + }, + { + "epoch": 2.9803641799157496, + "grad_norm": 0.16868583858013153, + "learning_rate": 6.276328305476288e-05, + "loss": 3.7303, + "step": 43865 + }, + { + "epoch": 2.9807038999864113, + "grad_norm": 0.21416650712490082, + "learning_rate": 6.27590365538796e-05, + "loss": 3.8688, + "step": 43870 + }, + { + "epoch": 2.981043620057073, + "grad_norm": 1.11167311668396, + "learning_rate": 6.275479005299633e-05, + "loss": 3.7217, + "step": 43875 + }, + { + "epoch": 2.981383340127735, + "grad_norm": 0.15236958861351013, + "learning_rate": 6.275054355211306e-05, + "loss": 3.9347, + "step": 43880 + }, + { + "epoch": 2.9817230601983966, + "grad_norm": 0.22965151071548462, + "learning_rate": 6.274629705122979e-05, + "loss": 3.7695, + "step": 43885 + }, + { + "epoch": 2.9820627802690582, + "grad_norm": 0.46913158893585205, + "learning_rate": 6.274205055034652e-05, + "loss": 3.646, + "step": 43890 + }, + { + "epoch": 2.9824025003397203, + "grad_norm": 0.20789067447185516, + "learning_rate": 6.273780404946325e-05, + "loss": 4.2169, + "step": 43895 + }, + { + "epoch": 2.982742220410382, + "grad_norm": 0.20799103379249573, + "learning_rate": 6.273355754857997e-05, + "loss": 3.8693, + "step": 43900 + }, + { + "epoch": 2.9830819404810436, + "grad_norm": 0.2187012881040573, + "learning_rate": 6.27293110476967e-05, + "loss": 3.9123, + "step": 43905 + }, + { + "epoch": 2.9834216605517057, + "grad_norm": 0.1937045156955719, + "learning_rate": 6.272506454681343e-05, + "loss": 3.8888, + "step": 43910 + }, + { + "epoch": 2.9837613806223673, + "grad_norm": 0.1719880998134613, + "learning_rate": 6.272081804593016e-05, + "loss": 3.8685, + "step": 43915 + }, + { + "epoch": 2.984101100693029, + "grad_norm": 0.1539873480796814, + "learning_rate": 6.271657154504689e-05, + "loss": 3.7746, + "step": 43920 + }, + { + "epoch": 2.984440820763691, + "grad_norm": 0.17230618000030518, + "learning_rate": 6.271232504416361e-05, + "loss": 3.9073, + "step": 43925 + }, + { + "epoch": 2.9847805408343526, + "grad_norm": 0.20375201106071472, + "learning_rate": 6.270807854328033e-05, + "loss": 3.6585, + "step": 43930 + }, + { + "epoch": 2.9851202609050143, + "grad_norm": 0.1988213211297989, + "learning_rate": 6.270383204239707e-05, + "loss": 3.5163, + "step": 43935 + }, + { + "epoch": 2.985459980975676, + "grad_norm": 0.1964956670999527, + "learning_rate": 6.26995855415138e-05, + "loss": 3.8465, + "step": 43940 + }, + { + "epoch": 2.985799701046338, + "grad_norm": 0.18335653841495514, + "learning_rate": 6.269533904063051e-05, + "loss": 3.7615, + "step": 43945 + }, + { + "epoch": 2.9861394211169996, + "grad_norm": 0.18092626333236694, + "learning_rate": 6.269109253974725e-05, + "loss": 3.9329, + "step": 43950 + }, + { + "epoch": 2.9864791411876612, + "grad_norm": 0.18411394953727722, + "learning_rate": 6.268684603886398e-05, + "loss": 3.8538, + "step": 43955 + }, + { + "epoch": 2.9868188612583233, + "grad_norm": 0.646848738193512, + "learning_rate": 6.26825995379807e-05, + "loss": 3.9862, + "step": 43960 + }, + { + "epoch": 2.987158581328985, + "grad_norm": 0.17919863760471344, + "learning_rate": 6.267835303709744e-05, + "loss": 3.9045, + "step": 43965 + }, + { + "epoch": 2.9874983013996466, + "grad_norm": 0.21215775609016418, + "learning_rate": 6.267410653621417e-05, + "loss": 4.0202, + "step": 43970 + }, + { + "epoch": 2.987838021470308, + "grad_norm": 0.15435653924942017, + "learning_rate": 6.266986003533088e-05, + "loss": 3.7103, + "step": 43975 + }, + { + "epoch": 2.9881777415409703, + "grad_norm": 0.1699531525373459, + "learning_rate": 6.266561353444762e-05, + "loss": 3.863, + "step": 43980 + }, + { + "epoch": 2.988517461611632, + "grad_norm": 0.13874255120754242, + "learning_rate": 6.266136703356435e-05, + "loss": 3.7413, + "step": 43985 + }, + { + "epoch": 2.9888571816822935, + "grad_norm": 0.21314023435115814, + "learning_rate": 6.265712053268107e-05, + "loss": 4.0948, + "step": 43990 + }, + { + "epoch": 2.9891969017529556, + "grad_norm": 1.6078605651855469, + "learning_rate": 6.26528740317978e-05, + "loss": 4.1001, + "step": 43995 + }, + { + "epoch": 2.9895366218236172, + "grad_norm": 0.267627090215683, + "learning_rate": 6.264862753091452e-05, + "loss": 3.8289, + "step": 44000 + }, + { + "epoch": 2.989876341894279, + "grad_norm": 0.1901550590991974, + "learning_rate": 6.264438103003126e-05, + "loss": 3.982, + "step": 44005 + }, + { + "epoch": 2.990216061964941, + "grad_norm": 0.241401806473732, + "learning_rate": 6.264013452914799e-05, + "loss": 3.6269, + "step": 44010 + }, + { + "epoch": 2.9905557820356026, + "grad_norm": 0.18021120131015778, + "learning_rate": 6.26358880282647e-05, + "loss": 4.0023, + "step": 44015 + }, + { + "epoch": 2.990895502106264, + "grad_norm": 0.2309008091688156, + "learning_rate": 6.263164152738145e-05, + "loss": 3.9046, + "step": 44020 + }, + { + "epoch": 2.9912352221769263, + "grad_norm": 0.17707498371601105, + "learning_rate": 6.262739502649817e-05, + "loss": 3.808, + "step": 44025 + }, + { + "epoch": 2.991574942247588, + "grad_norm": 0.22348742187023163, + "learning_rate": 6.262314852561489e-05, + "loss": 4.3089, + "step": 44030 + }, + { + "epoch": 2.9919146623182495, + "grad_norm": 0.1725098043680191, + "learning_rate": 6.261890202473163e-05, + "loss": 3.8744, + "step": 44035 + }, + { + "epoch": 2.9922543823889116, + "grad_norm": 0.21064795553684235, + "learning_rate": 6.261465552384836e-05, + "loss": 3.7689, + "step": 44040 + }, + { + "epoch": 2.9925941024595732, + "grad_norm": 0.18274621665477753, + "learning_rate": 6.261040902296507e-05, + "loss": 3.9965, + "step": 44045 + }, + { + "epoch": 2.992933822530235, + "grad_norm": 0.18376219272613525, + "learning_rate": 6.260616252208181e-05, + "loss": 4.0195, + "step": 44050 + }, + { + "epoch": 2.993273542600897, + "grad_norm": 0.19093045592308044, + "learning_rate": 6.260191602119854e-05, + "loss": 4.1007, + "step": 44055 + }, + { + "epoch": 2.9936132626715586, + "grad_norm": 0.18170276284217834, + "learning_rate": 6.259766952031526e-05, + "loss": 4.0363, + "step": 44060 + }, + { + "epoch": 2.99395298274222, + "grad_norm": 0.19281305372714996, + "learning_rate": 6.2593423019432e-05, + "loss": 3.7665, + "step": 44065 + }, + { + "epoch": 2.9942927028128823, + "grad_norm": 0.1853039711713791, + "learning_rate": 6.258917651854873e-05, + "loss": 3.9352, + "step": 44070 + }, + { + "epoch": 2.994632422883544, + "grad_norm": 0.20089535415172577, + "learning_rate": 6.258493001766544e-05, + "loss": 3.8714, + "step": 44075 + }, + { + "epoch": 2.9949721429542056, + "grad_norm": 0.19948695600032806, + "learning_rate": 6.258068351678218e-05, + "loss": 3.7586, + "step": 44080 + }, + { + "epoch": 2.9953118630248676, + "grad_norm": 0.2634957432746887, + "learning_rate": 6.25764370158989e-05, + "loss": 3.9141, + "step": 44085 + }, + { + "epoch": 2.9956515830955293, + "grad_norm": 0.24538551270961761, + "learning_rate": 6.257219051501563e-05, + "loss": 3.779, + "step": 44090 + }, + { + "epoch": 2.995991303166191, + "grad_norm": 0.1748637855052948, + "learning_rate": 6.256794401413237e-05, + "loss": 3.9612, + "step": 44095 + }, + { + "epoch": 2.996331023236853, + "grad_norm": 0.45539844036102295, + "learning_rate": 6.256369751324908e-05, + "loss": 3.9101, + "step": 44100 + }, + { + "epoch": 2.9966707433075146, + "grad_norm": 0.8879687786102295, + "learning_rate": 6.255945101236581e-05, + "loss": 3.9688, + "step": 44105 + }, + { + "epoch": 2.9970104633781762, + "grad_norm": 0.17951299250125885, + "learning_rate": 6.255520451148255e-05, + "loss": 3.8212, + "step": 44110 + }, + { + "epoch": 2.9973501834488383, + "grad_norm": 0.24572037160396576, + "learning_rate": 6.255095801059927e-05, + "loss": 3.6434, + "step": 44115 + }, + { + "epoch": 2.9976899035195, + "grad_norm": 0.18831472098827362, + "learning_rate": 6.254671150971599e-05, + "loss": 3.9689, + "step": 44120 + }, + { + "epoch": 2.9980296235901616, + "grad_norm": 0.15480273962020874, + "learning_rate": 6.254246500883273e-05, + "loss": 4.0318, + "step": 44125 + }, + { + "epoch": 2.9983693436608236, + "grad_norm": 0.2160441279411316, + "learning_rate": 6.253821850794945e-05, + "loss": 3.9586, + "step": 44130 + }, + { + "epoch": 2.9987090637314853, + "grad_norm": 0.866939902305603, + "learning_rate": 6.253397200706618e-05, + "loss": 3.8636, + "step": 44135 + }, + { + "epoch": 2.999048783802147, + "grad_norm": 0.14561940729618073, + "learning_rate": 6.252972550618292e-05, + "loss": 3.8484, + "step": 44140 + }, + { + "epoch": 2.999388503872809, + "grad_norm": 0.15115301311016083, + "learning_rate": 6.252547900529963e-05, + "loss": 3.8011, + "step": 44145 + }, + { + "epoch": 2.9997282239434706, + "grad_norm": 1.7283555269241333, + "learning_rate": 6.252123250441636e-05, + "loss": 3.8227, + "step": 44150 + }, + { + "epoch": 3.0, + "eval_bertscore": { + "f1": 0.8527401463619694, + "precision": 0.8768615470178391, + "recall": 0.8302379138121506 + }, + "eval_bleu_4": 0.0017275862649653035, + "eval_exact_match": 0.0, + "eval_loss": 3.6739563941955566, + "eval_meteor": 0.074907350266259, + "eval_rouge": { + "rouge1": 0.122962562806164, + "rouge2": 0.015046829919232098, + "rougeL": 0.10858722748962665, + "rougeLsum": 0.10861272732313015 + }, + "eval_runtime": 363.2564, + "eval_samples_per_second": 28.407, + "eval_steps_per_second": 3.551, + "step": 44154 + }, + { + "epoch": 3.0000679440141322, + "grad_norm": 0.154000923037529, + "learning_rate": 6.251698600353309e-05, + "loss": 3.8421, + "step": 44155 + }, + { + "epoch": 3.0004076640847943, + "grad_norm": 0.1786402016878128, + "learning_rate": 6.251273950264982e-05, + "loss": 3.907, + "step": 44160 + }, + { + "epoch": 3.000747384155456, + "grad_norm": 0.4940347671508789, + "learning_rate": 6.250849300176655e-05, + "loss": 3.8744, + "step": 44165 + }, + { + "epoch": 3.0010871042261176, + "grad_norm": 0.21371343731880188, + "learning_rate": 6.250424650088327e-05, + "loss": 3.7141, + "step": 44170 + }, + { + "epoch": 3.0014268242967796, + "grad_norm": 0.18918779492378235, + "learning_rate": 6.25e-05, + "loss": 3.9109, + "step": 44175 + }, + { + "epoch": 3.0017665443674413, + "grad_norm": 0.2152406871318817, + "learning_rate": 6.249575349911673e-05, + "loss": 3.9004, + "step": 44180 + }, + { + "epoch": 3.002106264438103, + "grad_norm": 0.19928883016109467, + "learning_rate": 6.249150699823346e-05, + "loss": 3.6001, + "step": 44185 + }, + { + "epoch": 3.002445984508765, + "grad_norm": 0.13658899068832397, + "learning_rate": 6.248726049735019e-05, + "loss": 4.0968, + "step": 44190 + }, + { + "epoch": 3.0027857045794266, + "grad_norm": 0.17396563291549683, + "learning_rate": 6.248301399646691e-05, + "loss": 3.6771, + "step": 44195 + }, + { + "epoch": 3.0031254246500882, + "grad_norm": 0.16514383256435394, + "learning_rate": 6.247876749558364e-05, + "loss": 3.7361, + "step": 44200 + }, + { + "epoch": 3.0034651447207503, + "grad_norm": 0.18629255890846252, + "learning_rate": 6.247452099470037e-05, + "loss": 3.9618, + "step": 44205 + }, + { + "epoch": 3.003804864791412, + "grad_norm": 0.16549646854400635, + "learning_rate": 6.24702744938171e-05, + "loss": 3.8794, + "step": 44210 + }, + { + "epoch": 3.0041445848620736, + "grad_norm": 0.15086673200130463, + "learning_rate": 6.246602799293383e-05, + "loss": 3.9472, + "step": 44215 + }, + { + "epoch": 3.004484304932735, + "grad_norm": 0.19215624034404755, + "learning_rate": 6.246178149205055e-05, + "loss": 3.803, + "step": 44220 + }, + { + "epoch": 3.0048240250033973, + "grad_norm": 0.24128364026546478, + "learning_rate": 6.245753499116728e-05, + "loss": 3.909, + "step": 44225 + }, + { + "epoch": 3.005163745074059, + "grad_norm": 0.1911817044019699, + "learning_rate": 6.245328849028401e-05, + "loss": 3.7894, + "step": 44230 + }, + { + "epoch": 3.0055034651447206, + "grad_norm": 0.4445135295391083, + "learning_rate": 6.244904198940074e-05, + "loss": 3.7856, + "step": 44235 + }, + { + "epoch": 3.0058431852153826, + "grad_norm": 0.1516428142786026, + "learning_rate": 6.244479548851747e-05, + "loss": 3.9329, + "step": 44240 + }, + { + "epoch": 3.0061829052860443, + "grad_norm": 0.18789459764957428, + "learning_rate": 6.24405489876342e-05, + "loss": 3.9044, + "step": 44245 + }, + { + "epoch": 3.006522625356706, + "grad_norm": 0.19202770292758942, + "learning_rate": 6.243630248675092e-05, + "loss": 3.8764, + "step": 44250 + }, + { + "epoch": 3.006862345427368, + "grad_norm": 0.1698179543018341, + "learning_rate": 6.243205598586765e-05, + "loss": 3.8345, + "step": 44255 + }, + { + "epoch": 3.0072020654980296, + "grad_norm": 0.18703432381153107, + "learning_rate": 6.242780948498438e-05, + "loss": 4.1764, + "step": 44260 + }, + { + "epoch": 3.0075417855686912, + "grad_norm": 0.15342819690704346, + "learning_rate": 6.24235629841011e-05, + "loss": 4.0494, + "step": 44265 + }, + { + "epoch": 3.0078815056393533, + "grad_norm": 0.19301213324069977, + "learning_rate": 6.241931648321783e-05, + "loss": 3.6173, + "step": 44270 + }, + { + "epoch": 3.008221225710015, + "grad_norm": 0.1800895482301712, + "learning_rate": 6.241506998233456e-05, + "loss": 3.8001, + "step": 44275 + }, + { + "epoch": 3.0085609457806766, + "grad_norm": 0.16103455424308777, + "learning_rate": 6.241082348145129e-05, + "loss": 4.0194, + "step": 44280 + }, + { + "epoch": 3.0089006658513386, + "grad_norm": 0.2092173844575882, + "learning_rate": 6.2406576980568e-05, + "loss": 3.7312, + "step": 44285 + }, + { + "epoch": 3.0092403859220003, + "grad_norm": 0.19738946855068207, + "learning_rate": 6.240233047968475e-05, + "loss": 3.7154, + "step": 44290 + }, + { + "epoch": 3.009580105992662, + "grad_norm": 0.1473068743944168, + "learning_rate": 6.239808397880147e-05, + "loss": 3.8988, + "step": 44295 + }, + { + "epoch": 3.009919826063324, + "grad_norm": 0.17140065133571625, + "learning_rate": 6.239383747791819e-05, + "loss": 3.7725, + "step": 44300 + }, + { + "epoch": 3.0102595461339856, + "grad_norm": 0.173176571726799, + "learning_rate": 6.238959097703493e-05, + "loss": 3.7921, + "step": 44305 + }, + { + "epoch": 3.0105992662046472, + "grad_norm": 0.1627722531557083, + "learning_rate": 6.238534447615166e-05, + "loss": 4.0589, + "step": 44310 + }, + { + "epoch": 3.0109389862753093, + "grad_norm": 0.22152982652187347, + "learning_rate": 6.238109797526837e-05, + "loss": 3.9136, + "step": 44315 + }, + { + "epoch": 3.011278706345971, + "grad_norm": 0.2256784737110138, + "learning_rate": 6.237685147438511e-05, + "loss": 3.8036, + "step": 44320 + }, + { + "epoch": 3.0116184264166326, + "grad_norm": 0.1884443461894989, + "learning_rate": 6.237260497350184e-05, + "loss": 3.9211, + "step": 44325 + }, + { + "epoch": 3.0119581464872947, + "grad_norm": 0.16700834035873413, + "learning_rate": 6.236835847261856e-05, + "loss": 4.0796, + "step": 44330 + }, + { + "epoch": 3.0122978665579563, + "grad_norm": 0.1844736635684967, + "learning_rate": 6.23641119717353e-05, + "loss": 3.8286, + "step": 44335 + }, + { + "epoch": 3.012637586628618, + "grad_norm": 0.9240265488624573, + "learning_rate": 6.235986547085203e-05, + "loss": 3.7549, + "step": 44340 + }, + { + "epoch": 3.01297730669928, + "grad_norm": 0.2103656381368637, + "learning_rate": 6.235561896996874e-05, + "loss": 4.0295, + "step": 44345 + }, + { + "epoch": 3.0133170267699416, + "grad_norm": 0.17974020540714264, + "learning_rate": 6.235137246908548e-05, + "loss": 3.766, + "step": 44350 + }, + { + "epoch": 3.0136567468406033, + "grad_norm": 0.18281976878643036, + "learning_rate": 6.23471259682022e-05, + "loss": 3.9141, + "step": 44355 + }, + { + "epoch": 3.0139964669112653, + "grad_norm": 0.14310936629772186, + "learning_rate": 6.234287946731894e-05, + "loss": 3.9146, + "step": 44360 + }, + { + "epoch": 3.014336186981927, + "grad_norm": 0.17828884720802307, + "learning_rate": 6.233863296643567e-05, + "loss": 3.7484, + "step": 44365 + }, + { + "epoch": 3.0146759070525886, + "grad_norm": 0.16465747356414795, + "learning_rate": 6.233438646555238e-05, + "loss": 3.7801, + "step": 44370 + }, + { + "epoch": 3.01501562712325, + "grad_norm": 0.21067115664482117, + "learning_rate": 6.233013996466912e-05, + "loss": 4.0107, + "step": 44375 + }, + { + "epoch": 3.0153553471939123, + "grad_norm": 0.16687677800655365, + "learning_rate": 6.232589346378585e-05, + "loss": 3.709, + "step": 44380 + }, + { + "epoch": 3.015695067264574, + "grad_norm": 0.20705267786979675, + "learning_rate": 6.232164696290256e-05, + "loss": 3.8939, + "step": 44385 + }, + { + "epoch": 3.0160347873352356, + "grad_norm": 0.19143223762512207, + "learning_rate": 6.23174004620193e-05, + "loss": 3.9941, + "step": 44390 + }, + { + "epoch": 3.0163745074058976, + "grad_norm": 0.16204476356506348, + "learning_rate": 6.231315396113603e-05, + "loss": 3.8164, + "step": 44395 + }, + { + "epoch": 3.0167142274765593, + "grad_norm": 0.18840202689170837, + "learning_rate": 6.230890746025275e-05, + "loss": 4.1806, + "step": 44400 + }, + { + "epoch": 3.017053947547221, + "grad_norm": 0.16344767808914185, + "learning_rate": 6.230466095936949e-05, + "loss": 3.677, + "step": 44405 + }, + { + "epoch": 3.017393667617883, + "grad_norm": 0.18322747945785522, + "learning_rate": 6.230041445848622e-05, + "loss": 3.7563, + "step": 44410 + }, + { + "epoch": 3.0177333876885446, + "grad_norm": 1.0964486598968506, + "learning_rate": 6.229616795760293e-05, + "loss": 4.0735, + "step": 44415 + }, + { + "epoch": 3.0180731077592062, + "grad_norm": 0.24958722293376923, + "learning_rate": 6.229192145671967e-05, + "loss": 4.0205, + "step": 44420 + }, + { + "epoch": 3.0184128278298683, + "grad_norm": 0.1721007227897644, + "learning_rate": 6.228767495583639e-05, + "loss": 4.1049, + "step": 44425 + }, + { + "epoch": 3.01875254790053, + "grad_norm": 0.13701309263706207, + "learning_rate": 6.228342845495312e-05, + "loss": 3.9409, + "step": 44430 + }, + { + "epoch": 3.0190922679711916, + "grad_norm": 0.1645641028881073, + "learning_rate": 6.227918195406986e-05, + "loss": 3.7342, + "step": 44435 + }, + { + "epoch": 3.0194319880418536, + "grad_norm": 0.18376778066158295, + "learning_rate": 6.227493545318657e-05, + "loss": 4.1207, + "step": 44440 + }, + { + "epoch": 3.0197717081125153, + "grad_norm": 0.18016070127487183, + "learning_rate": 6.22706889523033e-05, + "loss": 3.9262, + "step": 44445 + }, + { + "epoch": 3.020111428183177, + "grad_norm": 0.1890650987625122, + "learning_rate": 6.226644245142004e-05, + "loss": 4.0267, + "step": 44450 + }, + { + "epoch": 3.020451148253839, + "grad_norm": 0.24187599122524261, + "learning_rate": 6.226219595053676e-05, + "loss": 3.7762, + "step": 44455 + }, + { + "epoch": 3.0207908683245006, + "grad_norm": 0.17917272448539734, + "learning_rate": 6.225794944965348e-05, + "loss": 3.776, + "step": 44460 + }, + { + "epoch": 3.0211305883951622, + "grad_norm": 0.20570288598537445, + "learning_rate": 6.225370294877023e-05, + "loss": 3.9837, + "step": 44465 + }, + { + "epoch": 3.0214703084658243, + "grad_norm": 0.19695299863815308, + "learning_rate": 6.224945644788694e-05, + "loss": 3.8811, + "step": 44470 + }, + { + "epoch": 3.021810028536486, + "grad_norm": 0.18974174559116364, + "learning_rate": 6.224520994700367e-05, + "loss": 3.9037, + "step": 44475 + }, + { + "epoch": 3.0221497486071476, + "grad_norm": 0.32905083894729614, + "learning_rate": 6.224096344612041e-05, + "loss": 3.9332, + "step": 44480 + }, + { + "epoch": 3.0224894686778097, + "grad_norm": 0.18083637952804565, + "learning_rate": 6.223671694523712e-05, + "loss": 4.176, + "step": 44485 + }, + { + "epoch": 3.0228291887484713, + "grad_norm": 0.141565203666687, + "learning_rate": 6.223247044435385e-05, + "loss": 3.5561, + "step": 44490 + }, + { + "epoch": 3.023168908819133, + "grad_norm": 0.17139850556850433, + "learning_rate": 6.22282239434706e-05, + "loss": 3.9121, + "step": 44495 + }, + { + "epoch": 3.023508628889795, + "grad_norm": 0.2626049816608429, + "learning_rate": 6.222397744258731e-05, + "loss": 3.8764, + "step": 44500 + }, + { + "epoch": 3.0238483489604566, + "grad_norm": 0.277527779340744, + "learning_rate": 6.221973094170404e-05, + "loss": 3.7366, + "step": 44505 + }, + { + "epoch": 3.0241880690311183, + "grad_norm": 0.1734590232372284, + "learning_rate": 6.221548444082076e-05, + "loss": 3.9063, + "step": 44510 + }, + { + "epoch": 3.0245277891017803, + "grad_norm": 0.17087484896183014, + "learning_rate": 6.221123793993749e-05, + "loss": 3.7924, + "step": 44515 + }, + { + "epoch": 3.024867509172442, + "grad_norm": 0.1724262833595276, + "learning_rate": 6.220699143905422e-05, + "loss": 3.7638, + "step": 44520 + }, + { + "epoch": 3.0252072292431036, + "grad_norm": 0.22005967795848846, + "learning_rate": 6.220274493817095e-05, + "loss": 3.8515, + "step": 44525 + }, + { + "epoch": 3.0255469493137657, + "grad_norm": 0.16660957038402557, + "learning_rate": 6.219849843728768e-05, + "loss": 3.982, + "step": 44530 + }, + { + "epoch": 3.0258866693844273, + "grad_norm": 0.16902515292167664, + "learning_rate": 6.21942519364044e-05, + "loss": 3.7365, + "step": 44535 + }, + { + "epoch": 3.026226389455089, + "grad_norm": 0.13985224068164825, + "learning_rate": 6.219000543552113e-05, + "loss": 4.3157, + "step": 44540 + }, + { + "epoch": 3.026566109525751, + "grad_norm": 0.16689558327198029, + "learning_rate": 6.218575893463786e-05, + "loss": 3.6909, + "step": 44545 + }, + { + "epoch": 3.0269058295964126, + "grad_norm": 0.5939047336578369, + "learning_rate": 6.218151243375459e-05, + "loss": 3.8015, + "step": 44550 + }, + { + "epoch": 3.0272455496670743, + "grad_norm": 0.13799281418323517, + "learning_rate": 6.217726593287132e-05, + "loss": 3.8312, + "step": 44555 + }, + { + "epoch": 3.027585269737736, + "grad_norm": 0.1994619071483612, + "learning_rate": 6.217301943198804e-05, + "loss": 3.8414, + "step": 44560 + }, + { + "epoch": 3.027924989808398, + "grad_norm": 0.18506985902786255, + "learning_rate": 6.216877293110477e-05, + "loss": 3.9189, + "step": 44565 + }, + { + "epoch": 3.0282647098790596, + "grad_norm": 0.1713983565568924, + "learning_rate": 6.21645264302215e-05, + "loss": 3.902, + "step": 44570 + }, + { + "epoch": 3.0286044299497212, + "grad_norm": 0.22224852442741394, + "learning_rate": 6.216027992933823e-05, + "loss": 3.8314, + "step": 44575 + }, + { + "epoch": 3.0289441500203833, + "grad_norm": 0.4167514741420746, + "learning_rate": 6.215603342845496e-05, + "loss": 3.6955, + "step": 44580 + }, + { + "epoch": 3.029283870091045, + "grad_norm": 0.19789160788059235, + "learning_rate": 6.215178692757168e-05, + "loss": 3.8731, + "step": 44585 + }, + { + "epoch": 3.0296235901617066, + "grad_norm": 0.2072342336177826, + "learning_rate": 6.214754042668841e-05, + "loss": 3.72, + "step": 44590 + }, + { + "epoch": 3.0299633102323686, + "grad_norm": 0.20719921588897705, + "learning_rate": 6.214329392580514e-05, + "loss": 3.8223, + "step": 44595 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.48561426997184753, + "learning_rate": 6.213904742492187e-05, + "loss": 3.8356, + "step": 44600 + }, + { + "epoch": 3.030642750373692, + "grad_norm": 0.20463380217552185, + "learning_rate": 6.21348009240386e-05, + "loss": 3.7925, + "step": 44605 + }, + { + "epoch": 3.030982470444354, + "grad_norm": 0.1438651978969574, + "learning_rate": 6.213055442315532e-05, + "loss": 3.7933, + "step": 44610 + }, + { + "epoch": 3.0313221905150156, + "grad_norm": 0.14481580257415771, + "learning_rate": 6.212630792227205e-05, + "loss": 3.9116, + "step": 44615 + }, + { + "epoch": 3.0316619105856772, + "grad_norm": 0.15381166338920593, + "learning_rate": 6.212206142138878e-05, + "loss": 3.7105, + "step": 44620 + }, + { + "epoch": 3.0320016306563393, + "grad_norm": 0.23638960719108582, + "learning_rate": 6.21178149205055e-05, + "loss": 3.9647, + "step": 44625 + }, + { + "epoch": 3.032341350727001, + "grad_norm": 0.1780596822500229, + "learning_rate": 6.211356841962224e-05, + "loss": 3.9982, + "step": 44630 + }, + { + "epoch": 3.0326810707976626, + "grad_norm": 0.3678363561630249, + "learning_rate": 6.210932191873896e-05, + "loss": 3.7988, + "step": 44635 + }, + { + "epoch": 3.0330207908683247, + "grad_norm": 0.1440468728542328, + "learning_rate": 6.210507541785568e-05, + "loss": 3.5812, + "step": 44640 + }, + { + "epoch": 3.0333605109389863, + "grad_norm": 0.17629508674144745, + "learning_rate": 6.210082891697242e-05, + "loss": 3.9719, + "step": 44645 + }, + { + "epoch": 3.033700231009648, + "grad_norm": 0.15180718898773193, + "learning_rate": 6.209658241608915e-05, + "loss": 3.4452, + "step": 44650 + }, + { + "epoch": 3.03403995108031, + "grad_norm": 0.16143923997879028, + "learning_rate": 6.209233591520586e-05, + "loss": 3.8914, + "step": 44655 + }, + { + "epoch": 3.0343796711509716, + "grad_norm": 0.21574288606643677, + "learning_rate": 6.20880894143226e-05, + "loss": 3.863, + "step": 44660 + }, + { + "epoch": 3.0347193912216333, + "grad_norm": 0.1763511449098587, + "learning_rate": 6.208384291343933e-05, + "loss": 4.0651, + "step": 44665 + }, + { + "epoch": 3.0350591112922953, + "grad_norm": 0.21441873908042908, + "learning_rate": 6.207959641255605e-05, + "loss": 3.5405, + "step": 44670 + }, + { + "epoch": 3.035398831362957, + "grad_norm": 0.1832277774810791, + "learning_rate": 6.207534991167279e-05, + "loss": 3.7705, + "step": 44675 + }, + { + "epoch": 3.0357385514336186, + "grad_norm": 0.23157991468906403, + "learning_rate": 6.207110341078952e-05, + "loss": 3.8122, + "step": 44680 + }, + { + "epoch": 3.0360782715042807, + "grad_norm": 0.18154944479465485, + "learning_rate": 6.206685690990623e-05, + "loss": 3.8668, + "step": 44685 + }, + { + "epoch": 3.0364179915749423, + "grad_norm": 0.14337635040283203, + "learning_rate": 6.206261040902297e-05, + "loss": 3.7712, + "step": 44690 + }, + { + "epoch": 3.036757711645604, + "grad_norm": 0.21612627804279327, + "learning_rate": 6.20583639081397e-05, + "loss": 3.5962, + "step": 44695 + }, + { + "epoch": 3.037097431716266, + "grad_norm": 0.1484072208404541, + "learning_rate": 6.205411740725643e-05, + "loss": 3.8307, + "step": 44700 + }, + { + "epoch": 3.0374371517869276, + "grad_norm": 0.21081511676311493, + "learning_rate": 6.204987090637316e-05, + "loss": 3.7058, + "step": 44705 + }, + { + "epoch": 3.0377768718575893, + "grad_norm": 0.1794106811285019, + "learning_rate": 6.204562440548987e-05, + "loss": 4.0277, + "step": 44710 + }, + { + "epoch": 3.038116591928251, + "grad_norm": 0.20427468419075012, + "learning_rate": 6.204137790460661e-05, + "loss": 3.5899, + "step": 44715 + }, + { + "epoch": 3.038456311998913, + "grad_norm": 0.1483902633190155, + "learning_rate": 6.203713140372334e-05, + "loss": 3.7785, + "step": 44720 + }, + { + "epoch": 3.0387960320695746, + "grad_norm": 0.20699742436408997, + "learning_rate": 6.203288490284006e-05, + "loss": 3.6592, + "step": 44725 + }, + { + "epoch": 3.0391357521402362, + "grad_norm": 0.26335182785987854, + "learning_rate": 6.20286384019568e-05, + "loss": 3.8302, + "step": 44730 + }, + { + "epoch": 3.0394754722108983, + "grad_norm": 0.2021624892950058, + "learning_rate": 6.202439190107352e-05, + "loss": 4.0598, + "step": 44735 + }, + { + "epoch": 3.03981519228156, + "grad_norm": 0.1765749305486679, + "learning_rate": 6.202014540019024e-05, + "loss": 3.67, + "step": 44740 + }, + { + "epoch": 3.0401549123522216, + "grad_norm": 0.1829794943332672, + "learning_rate": 6.201589889930698e-05, + "loss": 4.0, + "step": 44745 + }, + { + "epoch": 3.0404946324228836, + "grad_norm": 0.1436256766319275, + "learning_rate": 6.201165239842371e-05, + "loss": 3.8584, + "step": 44750 + }, + { + "epoch": 3.0408343524935453, + "grad_norm": 0.3845714330673218, + "learning_rate": 6.200740589754042e-05, + "loss": 3.4474, + "step": 44755 + }, + { + "epoch": 3.041174072564207, + "grad_norm": 0.19842003285884857, + "learning_rate": 6.200315939665716e-05, + "loss": 3.7456, + "step": 44760 + }, + { + "epoch": 3.041513792634869, + "grad_norm": 0.17419981956481934, + "learning_rate": 6.199891289577389e-05, + "loss": 3.7403, + "step": 44765 + }, + { + "epoch": 3.0418535127055306, + "grad_norm": 0.19041088223457336, + "learning_rate": 6.199466639489061e-05, + "loss": 3.6245, + "step": 44770 + }, + { + "epoch": 3.0421932327761922, + "grad_norm": 0.20401667058467865, + "learning_rate": 6.199041989400735e-05, + "loss": 3.8606, + "step": 44775 + }, + { + "epoch": 3.0425329528468543, + "grad_norm": 0.20037704706192017, + "learning_rate": 6.198617339312406e-05, + "loss": 3.6886, + "step": 44780 + }, + { + "epoch": 3.042872672917516, + "grad_norm": 0.1652826964855194, + "learning_rate": 6.198192689224079e-05, + "loss": 3.7462, + "step": 44785 + }, + { + "epoch": 3.0432123929881776, + "grad_norm": 0.1945980042219162, + "learning_rate": 6.197768039135753e-05, + "loss": 3.5256, + "step": 44790 + }, + { + "epoch": 3.0435521130588397, + "grad_norm": 0.21055617928504944, + "learning_rate": 6.197343389047425e-05, + "loss": 3.8398, + "step": 44795 + }, + { + "epoch": 3.0438918331295013, + "grad_norm": 1.0296701192855835, + "learning_rate": 6.196918738959098e-05, + "loss": 4.0175, + "step": 44800 + }, + { + "epoch": 3.044231553200163, + "grad_norm": 0.1970560997724533, + "learning_rate": 6.196494088870772e-05, + "loss": 3.7976, + "step": 44805 + }, + { + "epoch": 3.044571273270825, + "grad_norm": 0.17494112253189087, + "learning_rate": 6.196069438782443e-05, + "loss": 3.9406, + "step": 44810 + }, + { + "epoch": 3.0449109933414866, + "grad_norm": 0.17323166131973267, + "learning_rate": 6.195644788694116e-05, + "loss": 4.0578, + "step": 44815 + }, + { + "epoch": 3.0452507134121483, + "grad_norm": 0.1803709864616394, + "learning_rate": 6.19522013860579e-05, + "loss": 3.7496, + "step": 44820 + }, + { + "epoch": 3.0455904334828103, + "grad_norm": 0.15378986299037933, + "learning_rate": 6.194795488517462e-05, + "loss": 3.8543, + "step": 44825 + }, + { + "epoch": 3.045930153553472, + "grad_norm": 0.1517726331949234, + "learning_rate": 6.194370838429134e-05, + "loss": 4.0343, + "step": 44830 + }, + { + "epoch": 3.0462698736241336, + "grad_norm": 0.19912581145763397, + "learning_rate": 6.193946188340808e-05, + "loss": 3.7898, + "step": 44835 + }, + { + "epoch": 3.0466095936947957, + "grad_norm": 0.17660365998744965, + "learning_rate": 6.19352153825248e-05, + "loss": 3.7117, + "step": 44840 + }, + { + "epoch": 3.0469493137654573, + "grad_norm": 0.2126893699169159, + "learning_rate": 6.193096888164153e-05, + "loss": 3.6056, + "step": 44845 + }, + { + "epoch": 3.047289033836119, + "grad_norm": 0.20564617216587067, + "learning_rate": 6.192672238075826e-05, + "loss": 3.9108, + "step": 44850 + }, + { + "epoch": 3.047628753906781, + "grad_norm": 0.31688234210014343, + "learning_rate": 6.192247587987498e-05, + "loss": 3.7729, + "step": 44855 + }, + { + "epoch": 3.0479684739774426, + "grad_norm": 0.13487662374973297, + "learning_rate": 6.191822937899171e-05, + "loss": 3.6705, + "step": 44860 + }, + { + "epoch": 3.0483081940481043, + "grad_norm": 0.23807121813297272, + "learning_rate": 6.191398287810844e-05, + "loss": 3.923, + "step": 44865 + }, + { + "epoch": 3.0486479141187663, + "grad_norm": 0.149857759475708, + "learning_rate": 6.190973637722517e-05, + "loss": 3.7274, + "step": 44870 + }, + { + "epoch": 3.048987634189428, + "grad_norm": 0.189708411693573, + "learning_rate": 6.19054898763419e-05, + "loss": 3.767, + "step": 44875 + }, + { + "epoch": 3.0493273542600896, + "grad_norm": 0.2067829668521881, + "learning_rate": 6.190124337545862e-05, + "loss": 3.7626, + "step": 44880 + }, + { + "epoch": 3.0496670743307517, + "grad_norm": 0.18564137816429138, + "learning_rate": 6.189699687457535e-05, + "loss": 3.8312, + "step": 44885 + }, + { + "epoch": 3.0500067944014133, + "grad_norm": 0.15766900777816772, + "learning_rate": 6.189275037369208e-05, + "loss": 4.0065, + "step": 44890 + }, + { + "epoch": 3.050346514472075, + "grad_norm": 0.30150651931762695, + "learning_rate": 6.188850387280881e-05, + "loss": 3.7622, + "step": 44895 + }, + { + "epoch": 3.0506862345427366, + "grad_norm": 0.19464680552482605, + "learning_rate": 6.188425737192554e-05, + "loss": 3.9088, + "step": 44900 + }, + { + "epoch": 3.0510259546133986, + "grad_norm": 0.16579242050647736, + "learning_rate": 6.188001087104226e-05, + "loss": 3.6339, + "step": 44905 + }, + { + "epoch": 3.0513656746840603, + "grad_norm": 0.3937108814716339, + "learning_rate": 6.187576437015899e-05, + "loss": 3.7814, + "step": 44910 + }, + { + "epoch": 3.051705394754722, + "grad_norm": 0.14685596525669098, + "learning_rate": 6.187151786927572e-05, + "loss": 3.9808, + "step": 44915 + }, + { + "epoch": 3.052045114825384, + "grad_norm": 0.9442539811134338, + "learning_rate": 6.186727136839245e-05, + "loss": 3.9256, + "step": 44920 + }, + { + "epoch": 3.0523848348960456, + "grad_norm": 0.16799969971179962, + "learning_rate": 6.186302486750918e-05, + "loss": 3.9549, + "step": 44925 + }, + { + "epoch": 3.0527245549667072, + "grad_norm": 0.20515625178813934, + "learning_rate": 6.18587783666259e-05, + "loss": 3.9245, + "step": 44930 + }, + { + "epoch": 3.0530642750373693, + "grad_norm": 0.38664257526397705, + "learning_rate": 6.185453186574263e-05, + "loss": 3.6649, + "step": 44935 + }, + { + "epoch": 3.053403995108031, + "grad_norm": 0.2069699913263321, + "learning_rate": 6.185028536485936e-05, + "loss": 3.7097, + "step": 44940 + }, + { + "epoch": 3.0537437151786926, + "grad_norm": 0.14607669413089752, + "learning_rate": 6.184603886397609e-05, + "loss": 3.9691, + "step": 44945 + }, + { + "epoch": 3.0540834352493547, + "grad_norm": 0.1558600813150406, + "learning_rate": 6.184179236309282e-05, + "loss": 3.8785, + "step": 44950 + }, + { + "epoch": 3.0544231553200163, + "grad_norm": 0.14685370028018951, + "learning_rate": 6.183754586220954e-05, + "loss": 3.8212, + "step": 44955 + }, + { + "epoch": 3.054762875390678, + "grad_norm": 0.12365331500768661, + "learning_rate": 6.183329936132627e-05, + "loss": 3.8752, + "step": 44960 + }, + { + "epoch": 3.05510259546134, + "grad_norm": 0.19910332560539246, + "learning_rate": 6.1829052860443e-05, + "loss": 3.8548, + "step": 44965 + }, + { + "epoch": 3.0554423155320016, + "grad_norm": 0.15883074700832367, + "learning_rate": 6.182480635955973e-05, + "loss": 3.709, + "step": 44970 + }, + { + "epoch": 3.0557820356026633, + "grad_norm": 0.27186527848243713, + "learning_rate": 6.182055985867646e-05, + "loss": 3.919, + "step": 44975 + }, + { + "epoch": 3.0561217556733253, + "grad_norm": 0.17667201161384583, + "learning_rate": 6.181631335779317e-05, + "loss": 3.9303, + "step": 44980 + }, + { + "epoch": 3.056461475743987, + "grad_norm": 0.15786124765872955, + "learning_rate": 6.181206685690991e-05, + "loss": 3.8828, + "step": 44985 + }, + { + "epoch": 3.0568011958146486, + "grad_norm": 0.1669166386127472, + "learning_rate": 6.180782035602664e-05, + "loss": 3.9928, + "step": 44990 + }, + { + "epoch": 3.0571409158853107, + "grad_norm": 0.14781691133975983, + "learning_rate": 6.180357385514335e-05, + "loss": 3.6314, + "step": 44995 + }, + { + "epoch": 3.0574806359559723, + "grad_norm": 0.15897110104560852, + "learning_rate": 6.17993273542601e-05, + "loss": 4.0126, + "step": 45000 + }, + { + "epoch": 3.057820356026634, + "grad_norm": 0.5570805668830872, + "learning_rate": 6.179508085337682e-05, + "loss": 3.634, + "step": 45005 + }, + { + "epoch": 3.058160076097296, + "grad_norm": 0.16373145580291748, + "learning_rate": 6.179083435249354e-05, + "loss": 3.5577, + "step": 45010 + }, + { + "epoch": 3.0584997961679576, + "grad_norm": 0.17821292579174042, + "learning_rate": 6.178658785161028e-05, + "loss": 3.993, + "step": 45015 + }, + { + "epoch": 3.0588395162386193, + "grad_norm": 0.9587756991386414, + "learning_rate": 6.178234135072701e-05, + "loss": 3.6503, + "step": 45020 + }, + { + "epoch": 3.0591792363092813, + "grad_norm": 0.15463097393512726, + "learning_rate": 6.177809484984372e-05, + "loss": 3.9149, + "step": 45025 + }, + { + "epoch": 3.059518956379943, + "grad_norm": 0.1540638506412506, + "learning_rate": 6.177384834896046e-05, + "loss": 4.0331, + "step": 45030 + }, + { + "epoch": 3.0598586764506046, + "grad_norm": 0.20067791640758514, + "learning_rate": 6.176960184807719e-05, + "loss": 3.837, + "step": 45035 + }, + { + "epoch": 3.0601983965212667, + "grad_norm": 0.16559767723083496, + "learning_rate": 6.176535534719392e-05, + "loss": 3.9021, + "step": 45040 + }, + { + "epoch": 3.0605381165919283, + "grad_norm": 0.2065168172121048, + "learning_rate": 6.176110884631065e-05, + "loss": 3.5765, + "step": 45045 + }, + { + "epoch": 3.06087783666259, + "grad_norm": 0.18789055943489075, + "learning_rate": 6.175686234542736e-05, + "loss": 3.8933, + "step": 45050 + }, + { + "epoch": 3.0612175567332516, + "grad_norm": 0.1603345274925232, + "learning_rate": 6.17526158445441e-05, + "loss": 3.7817, + "step": 45055 + }, + { + "epoch": 3.0615572768039137, + "grad_norm": 0.14979276061058044, + "learning_rate": 6.174836934366083e-05, + "loss": 3.8776, + "step": 45060 + }, + { + "epoch": 3.0618969968745753, + "grad_norm": 0.17844237387180328, + "learning_rate": 6.174412284277755e-05, + "loss": 3.87, + "step": 45065 + }, + { + "epoch": 3.062236716945237, + "grad_norm": 0.23523779213428497, + "learning_rate": 6.173987634189429e-05, + "loss": 3.8965, + "step": 45070 + }, + { + "epoch": 3.062576437015899, + "grad_norm": 0.15219895541667938, + "learning_rate": 6.173562984101102e-05, + "loss": 4.1006, + "step": 45075 + }, + { + "epoch": 3.0629161570865606, + "grad_norm": 0.15267744660377502, + "learning_rate": 6.173138334012773e-05, + "loss": 3.8428, + "step": 45080 + }, + { + "epoch": 3.0632558771572223, + "grad_norm": 0.1841878443956375, + "learning_rate": 6.172713683924447e-05, + "loss": 4.1187, + "step": 45085 + }, + { + "epoch": 3.0635955972278843, + "grad_norm": 0.1551981419324875, + "learning_rate": 6.17228903383612e-05, + "loss": 3.8385, + "step": 45090 + }, + { + "epoch": 3.063935317298546, + "grad_norm": 0.16399239003658295, + "learning_rate": 6.171864383747791e-05, + "loss": 3.8441, + "step": 45095 + }, + { + "epoch": 3.0642750373692076, + "grad_norm": 0.19808648526668549, + "learning_rate": 6.171439733659466e-05, + "loss": 3.9621, + "step": 45100 + }, + { + "epoch": 3.0646147574398697, + "grad_norm": 0.41280752420425415, + "learning_rate": 6.171015083571138e-05, + "loss": 3.9477, + "step": 45105 + }, + { + "epoch": 3.0649544775105313, + "grad_norm": 0.16643968224525452, + "learning_rate": 6.17059043348281e-05, + "loss": 4.2439, + "step": 45110 + }, + { + "epoch": 3.065294197581193, + "grad_norm": 0.2174854576587677, + "learning_rate": 6.170165783394484e-05, + "loss": 3.8289, + "step": 45115 + }, + { + "epoch": 3.065633917651855, + "grad_norm": 0.15786845982074738, + "learning_rate": 6.169741133306157e-05, + "loss": 3.9102, + "step": 45120 + }, + { + "epoch": 3.0659736377225166, + "grad_norm": 0.22801831364631653, + "learning_rate": 6.169316483217828e-05, + "loss": 3.7456, + "step": 45125 + }, + { + "epoch": 3.0663133577931783, + "grad_norm": 0.173265278339386, + "learning_rate": 6.168891833129502e-05, + "loss": 3.8379, + "step": 45130 + }, + { + "epoch": 3.0666530778638403, + "grad_norm": 0.16044017672538757, + "learning_rate": 6.168467183041174e-05, + "loss": 3.7991, + "step": 45135 + }, + { + "epoch": 3.066992797934502, + "grad_norm": 0.21971255540847778, + "learning_rate": 6.168042532952847e-05, + "loss": 3.971, + "step": 45140 + }, + { + "epoch": 3.0673325180051636, + "grad_norm": 0.3641834855079651, + "learning_rate": 6.167617882864521e-05, + "loss": 3.7174, + "step": 45145 + }, + { + "epoch": 3.0676722380758257, + "grad_norm": 0.15041132271289825, + "learning_rate": 6.167193232776192e-05, + "loss": 3.8947, + "step": 45150 + }, + { + "epoch": 3.0680119581464873, + "grad_norm": 0.17550837993621826, + "learning_rate": 6.166768582687865e-05, + "loss": 3.8909, + "step": 45155 + }, + { + "epoch": 3.068351678217149, + "grad_norm": 0.1674947440624237, + "learning_rate": 6.166343932599539e-05, + "loss": 3.7963, + "step": 45160 + }, + { + "epoch": 3.068691398287811, + "grad_norm": 0.16375893354415894, + "learning_rate": 6.16591928251121e-05, + "loss": 3.948, + "step": 45165 + }, + { + "epoch": 3.0690311183584726, + "grad_norm": 0.1741797775030136, + "learning_rate": 6.165494632422883e-05, + "loss": 3.9718, + "step": 45170 + }, + { + "epoch": 3.0693708384291343, + "grad_norm": 0.2155780792236328, + "learning_rate": 6.165069982334558e-05, + "loss": 3.8592, + "step": 45175 + }, + { + "epoch": 3.0697105584997963, + "grad_norm": 0.44251418113708496, + "learning_rate": 6.164645332246229e-05, + "loss": 3.8822, + "step": 45180 + }, + { + "epoch": 3.070050278570458, + "grad_norm": 0.20434877276420593, + "learning_rate": 6.164220682157902e-05, + "loss": 4.0411, + "step": 45185 + }, + { + "epoch": 3.0703899986411196, + "grad_norm": 0.17867624759674072, + "learning_rate": 6.163796032069576e-05, + "loss": 3.9778, + "step": 45190 + }, + { + "epoch": 3.0707297187117817, + "grad_norm": 0.17402374744415283, + "learning_rate": 6.163371381981247e-05, + "loss": 3.686, + "step": 45195 + }, + { + "epoch": 3.0710694387824433, + "grad_norm": 0.1938955932855606, + "learning_rate": 6.16294673189292e-05, + "loss": 3.7773, + "step": 45200 + }, + { + "epoch": 3.071409158853105, + "grad_norm": 0.2674334943294525, + "learning_rate": 6.162522081804593e-05, + "loss": 3.6275, + "step": 45205 + }, + { + "epoch": 3.071748878923767, + "grad_norm": 0.1805531233549118, + "learning_rate": 6.162097431716266e-05, + "loss": 3.8836, + "step": 45210 + }, + { + "epoch": 3.0720885989944287, + "grad_norm": 0.21022143959999084, + "learning_rate": 6.161672781627939e-05, + "loss": 3.9695, + "step": 45215 + }, + { + "epoch": 3.0724283190650903, + "grad_norm": 0.1937449872493744, + "learning_rate": 6.161248131539611e-05, + "loss": 3.8625, + "step": 45220 + }, + { + "epoch": 3.0727680391357524, + "grad_norm": 0.19553688168525696, + "learning_rate": 6.160823481451284e-05, + "loss": 3.9615, + "step": 45225 + }, + { + "epoch": 3.073107759206414, + "grad_norm": 0.24457654356956482, + "learning_rate": 6.160398831362957e-05, + "loss": 3.7919, + "step": 45230 + }, + { + "epoch": 3.0734474792770756, + "grad_norm": 0.18695639073848724, + "learning_rate": 6.15997418127463e-05, + "loss": 3.4177, + "step": 45235 + }, + { + "epoch": 3.0737871993477373, + "grad_norm": 0.14263592660427094, + "learning_rate": 6.159549531186303e-05, + "loss": 3.7823, + "step": 45240 + }, + { + "epoch": 3.0741269194183993, + "grad_norm": 0.2332691252231598, + "learning_rate": 6.159124881097975e-05, + "loss": 4.0384, + "step": 45245 + }, + { + "epoch": 3.074466639489061, + "grad_norm": 0.21952632069587708, + "learning_rate": 6.158700231009648e-05, + "loss": 4.1943, + "step": 45250 + }, + { + "epoch": 3.0748063595597226, + "grad_norm": 0.1554601937532425, + "learning_rate": 6.158275580921321e-05, + "loss": 3.8416, + "step": 45255 + }, + { + "epoch": 3.0751460796303847, + "grad_norm": 0.2444729506969452, + "learning_rate": 6.157850930832994e-05, + "loss": 3.8315, + "step": 45260 + }, + { + "epoch": 3.0754857997010463, + "grad_norm": 0.16371092200279236, + "learning_rate": 6.157426280744667e-05, + "loss": 3.7286, + "step": 45265 + }, + { + "epoch": 3.075825519771708, + "grad_norm": 0.20897838473320007, + "learning_rate": 6.15700163065634e-05, + "loss": 3.7402, + "step": 45270 + }, + { + "epoch": 3.07616523984237, + "grad_norm": 0.17533066868782043, + "learning_rate": 6.156576980568012e-05, + "loss": 3.8308, + "step": 45275 + }, + { + "epoch": 3.0765049599130316, + "grad_norm": 0.1432885378599167, + "learning_rate": 6.156152330479685e-05, + "loss": 3.759, + "step": 45280 + }, + { + "epoch": 3.0768446799836933, + "grad_norm": 0.14274607598781586, + "learning_rate": 6.155727680391358e-05, + "loss": 4.0224, + "step": 45285 + }, + { + "epoch": 3.0771844000543553, + "grad_norm": 0.13568773865699768, + "learning_rate": 6.15530303030303e-05, + "loss": 3.8814, + "step": 45290 + }, + { + "epoch": 3.077524120125017, + "grad_norm": 0.19247247278690338, + "learning_rate": 6.154878380214703e-05, + "loss": 3.9163, + "step": 45295 + }, + { + "epoch": 3.0778638401956786, + "grad_norm": 0.2847353518009186, + "learning_rate": 6.154453730126376e-05, + "loss": 4.0573, + "step": 45300 + }, + { + "epoch": 3.0782035602663407, + "grad_norm": 0.15240103006362915, + "learning_rate": 6.154029080038049e-05, + "loss": 3.7297, + "step": 45305 + }, + { + "epoch": 3.0785432803370023, + "grad_norm": 0.23664872348308563, + "learning_rate": 6.153604429949722e-05, + "loss": 3.7927, + "step": 45310 + }, + { + "epoch": 3.078883000407664, + "grad_norm": 0.1438993513584137, + "learning_rate": 6.153179779861395e-05, + "loss": 3.9089, + "step": 45315 + }, + { + "epoch": 3.079222720478326, + "grad_norm": 0.17089420557022095, + "learning_rate": 6.152755129773067e-05, + "loss": 3.7725, + "step": 45320 + }, + { + "epoch": 3.0795624405489876, + "grad_norm": 0.1621735692024231, + "learning_rate": 6.15233047968474e-05, + "loss": 3.7865, + "step": 45325 + }, + { + "epoch": 3.0799021606196493, + "grad_norm": 0.16643552482128143, + "learning_rate": 6.151905829596413e-05, + "loss": 3.8649, + "step": 45330 + }, + { + "epoch": 3.0802418806903114, + "grad_norm": 0.3527011573314667, + "learning_rate": 6.151481179508084e-05, + "loss": 3.9737, + "step": 45335 + }, + { + "epoch": 3.080581600760973, + "grad_norm": 0.18432366847991943, + "learning_rate": 6.151056529419759e-05, + "loss": 3.816, + "step": 45340 + }, + { + "epoch": 3.0809213208316346, + "grad_norm": 0.20179103314876556, + "learning_rate": 6.150631879331431e-05, + "loss": 4.1707, + "step": 45345 + }, + { + "epoch": 3.0812610409022967, + "grad_norm": 0.23122641444206238, + "learning_rate": 6.150207229243103e-05, + "loss": 3.8123, + "step": 45350 + }, + { + "epoch": 3.0816007609729583, + "grad_norm": 1.1607489585876465, + "learning_rate": 6.149782579154777e-05, + "loss": 3.8484, + "step": 45355 + }, + { + "epoch": 3.08194048104362, + "grad_norm": 0.28258442878723145, + "learning_rate": 6.14935792906645e-05, + "loss": 3.8653, + "step": 45360 + }, + { + "epoch": 3.082280201114282, + "grad_norm": 0.4477655589580536, + "learning_rate": 6.148933278978121e-05, + "loss": 3.8569, + "step": 45365 + }, + { + "epoch": 3.0826199211849437, + "grad_norm": 0.20822592079639435, + "learning_rate": 6.148508628889795e-05, + "loss": 3.8975, + "step": 45370 + }, + { + "epoch": 3.0829596412556053, + "grad_norm": 0.17573915421962738, + "learning_rate": 6.148083978801468e-05, + "loss": 3.5332, + "step": 45375 + }, + { + "epoch": 3.0832993613262674, + "grad_norm": 0.15093550086021423, + "learning_rate": 6.147659328713141e-05, + "loss": 3.7782, + "step": 45380 + }, + { + "epoch": 3.083639081396929, + "grad_norm": 0.17635661363601685, + "learning_rate": 6.147234678624814e-05, + "loss": 3.8911, + "step": 45385 + }, + { + "epoch": 3.0839788014675906, + "grad_norm": 0.1870010644197464, + "learning_rate": 6.146810028536487e-05, + "loss": 3.938, + "step": 45390 + }, + { + "epoch": 3.0843185215382523, + "grad_norm": 5.095501899719238, + "learning_rate": 6.14638537844816e-05, + "loss": 3.922, + "step": 45395 + }, + { + "epoch": 3.0846582416089143, + "grad_norm": 0.16958463191986084, + "learning_rate": 6.145960728359832e-05, + "loss": 3.8997, + "step": 45400 + }, + { + "epoch": 3.084997961679576, + "grad_norm": 0.3157839775085449, + "learning_rate": 6.145536078271504e-05, + "loss": 3.7757, + "step": 45405 + }, + { + "epoch": 3.0853376817502376, + "grad_norm": 0.2030276507139206, + "learning_rate": 6.145111428183178e-05, + "loss": 3.8159, + "step": 45410 + }, + { + "epoch": 3.0856774018208997, + "grad_norm": 0.21927107870578766, + "learning_rate": 6.14468677809485e-05, + "loss": 3.8658, + "step": 45415 + }, + { + "epoch": 3.0860171218915613, + "grad_norm": 0.19810087978839874, + "learning_rate": 6.144262128006522e-05, + "loss": 3.7345, + "step": 45420 + }, + { + "epoch": 3.086356841962223, + "grad_norm": 0.18275055289268494, + "learning_rate": 6.143837477918196e-05, + "loss": 3.8667, + "step": 45425 + }, + { + "epoch": 3.086696562032885, + "grad_norm": 0.17432090640068054, + "learning_rate": 6.143412827829869e-05, + "loss": 3.9306, + "step": 45430 + }, + { + "epoch": 3.0870362821035466, + "grad_norm": 0.1457238644361496, + "learning_rate": 6.14298817774154e-05, + "loss": 4.1026, + "step": 45435 + }, + { + "epoch": 3.0873760021742083, + "grad_norm": 0.42990460991859436, + "learning_rate": 6.142563527653215e-05, + "loss": 3.7247, + "step": 45440 + }, + { + "epoch": 3.0877157222448703, + "grad_norm": 0.20443543791770935, + "learning_rate": 6.142138877564887e-05, + "loss": 3.7588, + "step": 45445 + }, + { + "epoch": 3.088055442315532, + "grad_norm": 0.17893573641777039, + "learning_rate": 6.141714227476559e-05, + "loss": 4.1528, + "step": 45450 + }, + { + "epoch": 3.0883951623861936, + "grad_norm": 0.2672066390514374, + "learning_rate": 6.141289577388233e-05, + "loss": 3.7861, + "step": 45455 + }, + { + "epoch": 3.0887348824568557, + "grad_norm": 0.19342923164367676, + "learning_rate": 6.140864927299906e-05, + "loss": 3.8525, + "step": 45460 + }, + { + "epoch": 3.0890746025275173, + "grad_norm": 0.2398359328508377, + "learning_rate": 6.140440277211577e-05, + "loss": 3.8332, + "step": 45465 + }, + { + "epoch": 3.089414322598179, + "grad_norm": 0.22629719972610474, + "learning_rate": 6.140015627123251e-05, + "loss": 3.7982, + "step": 45470 + }, + { + "epoch": 3.089754042668841, + "grad_norm": 0.20426872372627258, + "learning_rate": 6.139590977034923e-05, + "loss": 3.5846, + "step": 45475 + }, + { + "epoch": 3.0900937627395026, + "grad_norm": 0.18261438608169556, + "learning_rate": 6.139166326946596e-05, + "loss": 3.6964, + "step": 45480 + }, + { + "epoch": 3.0904334828101643, + "grad_norm": 0.15464341640472412, + "learning_rate": 6.13874167685827e-05, + "loss": 3.8619, + "step": 45485 + }, + { + "epoch": 3.0907732028808264, + "grad_norm": 0.19600285589694977, + "learning_rate": 6.138317026769941e-05, + "loss": 3.8763, + "step": 45490 + }, + { + "epoch": 3.091112922951488, + "grad_norm": 0.1899159997701645, + "learning_rate": 6.137892376681614e-05, + "loss": 3.8217, + "step": 45495 + }, + { + "epoch": 3.0914526430221496, + "grad_norm": 0.540661633014679, + "learning_rate": 6.137467726593288e-05, + "loss": 3.8451, + "step": 45500 + }, + { + "epoch": 3.0917923630928117, + "grad_norm": 0.21007588505744934, + "learning_rate": 6.13704307650496e-05, + "loss": 3.7255, + "step": 45505 + }, + { + "epoch": 3.0921320831634733, + "grad_norm": 0.1882549673318863, + "learning_rate": 6.136618426416633e-05, + "loss": 4.0184, + "step": 45510 + }, + { + "epoch": 3.092471803234135, + "grad_norm": 0.16149993240833282, + "learning_rate": 6.136193776328307e-05, + "loss": 3.9226, + "step": 45515 + }, + { + "epoch": 3.092811523304797, + "grad_norm": 1.2902884483337402, + "learning_rate": 6.135769126239978e-05, + "loss": 3.8983, + "step": 45520 + }, + { + "epoch": 3.0931512433754587, + "grad_norm": 0.2001941055059433, + "learning_rate": 6.135344476151651e-05, + "loss": 3.9656, + "step": 45525 + }, + { + "epoch": 3.0934909634461203, + "grad_norm": 0.20168434083461761, + "learning_rate": 6.134919826063325e-05, + "loss": 3.8797, + "step": 45530 + }, + { + "epoch": 3.0938306835167824, + "grad_norm": 0.231950044631958, + "learning_rate": 6.134495175974997e-05, + "loss": 3.8275, + "step": 45535 + }, + { + "epoch": 3.094170403587444, + "grad_norm": 0.14356110990047455, + "learning_rate": 6.134070525886669e-05, + "loss": 4.0216, + "step": 45540 + }, + { + "epoch": 3.0945101236581056, + "grad_norm": 0.17069151997566223, + "learning_rate": 6.133645875798343e-05, + "loss": 4.1323, + "step": 45545 + }, + { + "epoch": 3.0948498437287677, + "grad_norm": 0.1711096614599228, + "learning_rate": 6.133221225710015e-05, + "loss": 3.8208, + "step": 45550 + }, + { + "epoch": 3.0951895637994293, + "grad_norm": 0.16932369768619537, + "learning_rate": 6.132796575621688e-05, + "loss": 3.9529, + "step": 45555 + }, + { + "epoch": 3.095529283870091, + "grad_norm": 14.634061813354492, + "learning_rate": 6.13237192553336e-05, + "loss": 3.9402, + "step": 45560 + }, + { + "epoch": 3.095869003940753, + "grad_norm": 0.48429635167121887, + "learning_rate": 6.131947275445033e-05, + "loss": 3.7815, + "step": 45565 + }, + { + "epoch": 3.0962087240114147, + "grad_norm": 0.18284346163272858, + "learning_rate": 6.131522625356706e-05, + "loss": 3.924, + "step": 45570 + }, + { + "epoch": 3.0965484440820763, + "grad_norm": 0.14939463138580322, + "learning_rate": 6.131097975268379e-05, + "loss": 3.7859, + "step": 45575 + }, + { + "epoch": 3.096888164152738, + "grad_norm": 0.16179980337619781, + "learning_rate": 6.130673325180052e-05, + "loss": 3.7077, + "step": 45580 + }, + { + "epoch": 3.0972278842234, + "grad_norm": 0.15182848274707794, + "learning_rate": 6.130248675091725e-05, + "loss": 3.9438, + "step": 45585 + }, + { + "epoch": 3.0975676042940616, + "grad_norm": 0.16136273741722107, + "learning_rate": 6.129824025003397e-05, + "loss": 3.8927, + "step": 45590 + }, + { + "epoch": 3.0979073243647233, + "grad_norm": 0.15744389593601227, + "learning_rate": 6.12939937491507e-05, + "loss": 3.7094, + "step": 45595 + }, + { + "epoch": 3.0982470444353853, + "grad_norm": 0.21967150270938873, + "learning_rate": 6.128974724826743e-05, + "loss": 3.7834, + "step": 45600 + }, + { + "epoch": 3.098586764506047, + "grad_norm": 0.12954267859458923, + "learning_rate": 6.128550074738416e-05, + "loss": 3.7184, + "step": 45605 + }, + { + "epoch": 3.0989264845767086, + "grad_norm": 0.15431639552116394, + "learning_rate": 6.128125424650089e-05, + "loss": 3.6747, + "step": 45610 + }, + { + "epoch": 3.0992662046473707, + "grad_norm": 0.18705612421035767, + "learning_rate": 6.127700774561761e-05, + "loss": 3.6205, + "step": 45615 + }, + { + "epoch": 3.0996059247180323, + "grad_norm": 0.15095067024230957, + "learning_rate": 6.127276124473434e-05, + "loss": 3.8958, + "step": 45620 + }, + { + "epoch": 3.099945644788694, + "grad_norm": 0.22270743548870087, + "learning_rate": 6.126851474385107e-05, + "loss": 3.9371, + "step": 45625 + }, + { + "epoch": 3.100285364859356, + "grad_norm": 0.18691834807395935, + "learning_rate": 6.12642682429678e-05, + "loss": 3.8614, + "step": 45630 + }, + { + "epoch": 3.1006250849300176, + "grad_norm": 0.24415922164916992, + "learning_rate": 6.126002174208453e-05, + "loss": 3.9109, + "step": 45635 + }, + { + "epoch": 3.1009648050006793, + "grad_norm": 0.15593013167381287, + "learning_rate": 6.125577524120125e-05, + "loss": 3.5764, + "step": 45640 + }, + { + "epoch": 3.1013045250713414, + "grad_norm": 3.9405698776245117, + "learning_rate": 6.125152874031798e-05, + "loss": 3.9495, + "step": 45645 + }, + { + "epoch": 3.101644245142003, + "grad_norm": 0.21433869004249573, + "learning_rate": 6.124728223943471e-05, + "loss": 3.7143, + "step": 45650 + }, + { + "epoch": 3.1019839652126646, + "grad_norm": 0.17934012413024902, + "learning_rate": 6.124303573855144e-05, + "loss": 3.7803, + "step": 45655 + }, + { + "epoch": 3.1023236852833267, + "grad_norm": 0.20450513064861298, + "learning_rate": 6.123878923766817e-05, + "loss": 4.0203, + "step": 45660 + }, + { + "epoch": 3.1026634053539883, + "grad_norm": 0.21227334439754486, + "learning_rate": 6.12345427367849e-05, + "loss": 3.7067, + "step": 45665 + }, + { + "epoch": 3.10300312542465, + "grad_norm": 0.201822891831398, + "learning_rate": 6.123029623590162e-05, + "loss": 3.8061, + "step": 45670 + }, + { + "epoch": 3.103342845495312, + "grad_norm": 0.7099252343177795, + "learning_rate": 6.122604973501834e-05, + "loss": 3.8406, + "step": 45675 + }, + { + "epoch": 3.1036825655659737, + "grad_norm": 0.7003065943717957, + "learning_rate": 6.122180323413508e-05, + "loss": 3.9051, + "step": 45680 + }, + { + "epoch": 3.1040222856366353, + "grad_norm": 0.14836476743221283, + "learning_rate": 6.12175567332518e-05, + "loss": 3.9858, + "step": 45685 + }, + { + "epoch": 3.1043620057072974, + "grad_norm": 0.17751696705818176, + "learning_rate": 6.121331023236852e-05, + "loss": 3.9852, + "step": 45690 + }, + { + "epoch": 3.104701725777959, + "grad_norm": 0.497363418340683, + "learning_rate": 6.120906373148526e-05, + "loss": 3.8519, + "step": 45695 + }, + { + "epoch": 3.1050414458486206, + "grad_norm": 0.1935739517211914, + "learning_rate": 6.120481723060199e-05, + "loss": 3.9523, + "step": 45700 + }, + { + "epoch": 3.1053811659192827, + "grad_norm": 0.22234618663787842, + "learning_rate": 6.12005707297187e-05, + "loss": 3.9671, + "step": 45705 + }, + { + "epoch": 3.1057208859899443, + "grad_norm": 0.2033732384443283, + "learning_rate": 6.119632422883545e-05, + "loss": 4.0857, + "step": 45710 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.19384436309337616, + "learning_rate": 6.119207772795217e-05, + "loss": 3.7108, + "step": 45715 + }, + { + "epoch": 3.106400326131268, + "grad_norm": 0.18930798768997192, + "learning_rate": 6.11878312270689e-05, + "loss": 3.8242, + "step": 45720 + }, + { + "epoch": 3.1067400462019297, + "grad_norm": 0.14957687258720398, + "learning_rate": 6.118358472618563e-05, + "loss": 3.9804, + "step": 45725 + }, + { + "epoch": 3.1070797662725913, + "grad_norm": 0.18163970112800598, + "learning_rate": 6.117933822530236e-05, + "loss": 3.6998, + "step": 45730 + }, + { + "epoch": 3.107419486343253, + "grad_norm": 0.18365037441253662, + "learning_rate": 6.117509172441909e-05, + "loss": 3.8219, + "step": 45735 + }, + { + "epoch": 3.107759206413915, + "grad_norm": 0.20103605091571808, + "learning_rate": 6.117084522353581e-05, + "loss": 3.8032, + "step": 45740 + }, + { + "epoch": 3.1080989264845766, + "grad_norm": 0.21138662099838257, + "learning_rate": 6.116659872265254e-05, + "loss": 4.0205, + "step": 45745 + }, + { + "epoch": 3.1084386465552383, + "grad_norm": 0.26066505908966064, + "learning_rate": 6.116235222176927e-05, + "loss": 4.0847, + "step": 45750 + }, + { + "epoch": 3.1087783666259003, + "grad_norm": 0.19285771250724792, + "learning_rate": 6.1158105720886e-05, + "loss": 3.7201, + "step": 45755 + }, + { + "epoch": 3.109118086696562, + "grad_norm": 0.19929175078868866, + "learning_rate": 6.115385922000271e-05, + "loss": 3.8155, + "step": 45760 + }, + { + "epoch": 3.1094578067672236, + "grad_norm": 0.17307516932487488, + "learning_rate": 6.114961271911945e-05, + "loss": 3.6407, + "step": 45765 + }, + { + "epoch": 3.1097975268378857, + "grad_norm": 0.2074047029018402, + "learning_rate": 6.114536621823618e-05, + "loss": 3.7797, + "step": 45770 + }, + { + "epoch": 3.1101372469085473, + "grad_norm": 0.1515599638223648, + "learning_rate": 6.11411197173529e-05, + "loss": 3.8747, + "step": 45775 + }, + { + "epoch": 3.110476966979209, + "grad_norm": 0.12526777386665344, + "learning_rate": 6.113687321646964e-05, + "loss": 3.8968, + "step": 45780 + }, + { + "epoch": 3.110816687049871, + "grad_norm": 0.1662093847990036, + "learning_rate": 6.113262671558637e-05, + "loss": 3.9521, + "step": 45785 + }, + { + "epoch": 3.1111564071205327, + "grad_norm": 0.16280855238437653, + "learning_rate": 6.112838021470308e-05, + "loss": 4.0735, + "step": 45790 + }, + { + "epoch": 3.1114961271911943, + "grad_norm": 0.21871864795684814, + "learning_rate": 6.112413371381982e-05, + "loss": 3.9148, + "step": 45795 + }, + { + "epoch": 3.1118358472618564, + "grad_norm": 2.3022239208221436, + "learning_rate": 6.111988721293655e-05, + "loss": 4.0979, + "step": 45800 + }, + { + "epoch": 3.112175567332518, + "grad_norm": 0.14905503392219543, + "learning_rate": 6.111564071205326e-05, + "loss": 4.0014, + "step": 45805 + }, + { + "epoch": 3.1125152874031796, + "grad_norm": 0.1823827624320984, + "learning_rate": 6.111139421117e-05, + "loss": 3.756, + "step": 45810 + }, + { + "epoch": 3.1128550074738417, + "grad_norm": 0.13824822008609772, + "learning_rate": 6.110714771028673e-05, + "loss": 3.8532, + "step": 45815 + }, + { + "epoch": 3.1131947275445033, + "grad_norm": 0.19300805032253265, + "learning_rate": 6.110290120940345e-05, + "loss": 4.0273, + "step": 45820 + }, + { + "epoch": 3.113534447615165, + "grad_norm": 0.16554708778858185, + "learning_rate": 6.109865470852019e-05, + "loss": 3.744, + "step": 45825 + }, + { + "epoch": 3.113874167685827, + "grad_norm": 0.12764473259449005, + "learning_rate": 6.10944082076369e-05, + "loss": 3.9633, + "step": 45830 + }, + { + "epoch": 3.1142138877564887, + "grad_norm": 0.17186856269836426, + "learning_rate": 6.109016170675363e-05, + "loss": 3.8875, + "step": 45835 + }, + { + "epoch": 3.1145536078271503, + "grad_norm": 0.3542034924030304, + "learning_rate": 6.108591520587037e-05, + "loss": 4.1037, + "step": 45840 + }, + { + "epoch": 3.1148933278978124, + "grad_norm": 0.16155828535556793, + "learning_rate": 6.108166870498709e-05, + "loss": 3.9055, + "step": 45845 + }, + { + "epoch": 3.115233047968474, + "grad_norm": 0.1654869168996811, + "learning_rate": 6.107742220410382e-05, + "loss": 3.8496, + "step": 45850 + }, + { + "epoch": 3.1155727680391356, + "grad_norm": 0.20794400572776794, + "learning_rate": 6.107317570322056e-05, + "loss": 3.9545, + "step": 45855 + }, + { + "epoch": 3.1159124881097977, + "grad_norm": 0.16935937106609344, + "learning_rate": 6.106892920233727e-05, + "loss": 3.699, + "step": 45860 + }, + { + "epoch": 3.1162522081804593, + "grad_norm": 0.17864425480365753, + "learning_rate": 6.1064682701454e-05, + "loss": 3.9539, + "step": 45865 + }, + { + "epoch": 3.116591928251121, + "grad_norm": 0.22802141308784485, + "learning_rate": 6.106043620057074e-05, + "loss": 3.8253, + "step": 45870 + }, + { + "epoch": 3.116931648321783, + "grad_norm": 0.15662673115730286, + "learning_rate": 6.105618969968746e-05, + "loss": 3.8194, + "step": 45875 + }, + { + "epoch": 3.1172713683924447, + "grad_norm": 0.16057011485099792, + "learning_rate": 6.105194319880418e-05, + "loss": 4.0469, + "step": 45880 + }, + { + "epoch": 3.1176110884631063, + "grad_norm": 0.18097703158855438, + "learning_rate": 6.104769669792093e-05, + "loss": 3.8716, + "step": 45885 + }, + { + "epoch": 3.1179508085337684, + "grad_norm": 0.22844620048999786, + "learning_rate": 6.104345019703764e-05, + "loss": 3.9312, + "step": 45890 + }, + { + "epoch": 3.11829052860443, + "grad_norm": 1.6416170597076416, + "learning_rate": 6.103920369615437e-05, + "loss": 3.6706, + "step": 45895 + }, + { + "epoch": 3.1186302486750916, + "grad_norm": 0.23407304286956787, + "learning_rate": 6.10349571952711e-05, + "loss": 4.0511, + "step": 45900 + }, + { + "epoch": 3.1189699687457537, + "grad_norm": 0.1369846910238266, + "learning_rate": 6.1030710694387824e-05, + "loss": 4.0405, + "step": 45905 + }, + { + "epoch": 3.1193096888164153, + "grad_norm": 0.2106025516986847, + "learning_rate": 6.102646419350455e-05, + "loss": 4.0785, + "step": 45910 + }, + { + "epoch": 3.119649408887077, + "grad_norm": 0.17910678684711456, + "learning_rate": 6.102221769262129e-05, + "loss": 3.8638, + "step": 45915 + }, + { + "epoch": 3.1199891289577386, + "grad_norm": 0.27095186710357666, + "learning_rate": 6.101797119173801e-05, + "loss": 3.8833, + "step": 45920 + }, + { + "epoch": 3.1203288490284007, + "grad_norm": 0.1660093069076538, + "learning_rate": 6.1013724690854736e-05, + "loss": 3.6873, + "step": 45925 + }, + { + "epoch": 3.1206685690990623, + "grad_norm": 0.21589335799217224, + "learning_rate": 6.100947818997147e-05, + "loss": 3.8165, + "step": 45930 + }, + { + "epoch": 3.121008289169724, + "grad_norm": 0.17619362473487854, + "learning_rate": 6.100523168908819e-05, + "loss": 3.7799, + "step": 45935 + }, + { + "epoch": 3.121348009240386, + "grad_norm": 0.15682102739810944, + "learning_rate": 6.100098518820492e-05, + "loss": 3.7431, + "step": 45940 + }, + { + "epoch": 3.1216877293110477, + "grad_norm": 0.20546703040599823, + "learning_rate": 6.0996738687321655e-05, + "loss": 3.8992, + "step": 45945 + }, + { + "epoch": 3.1220274493817093, + "grad_norm": 0.17678344249725342, + "learning_rate": 6.0992492186438376e-05, + "loss": 3.8788, + "step": 45950 + }, + { + "epoch": 3.1223671694523714, + "grad_norm": 0.15994738042354584, + "learning_rate": 6.0988245685555104e-05, + "loss": 3.6632, + "step": 45955 + }, + { + "epoch": 3.122706889523033, + "grad_norm": 0.17248672246932983, + "learning_rate": 6.098399918467184e-05, + "loss": 3.9855, + "step": 45960 + }, + { + "epoch": 3.1230466095936946, + "grad_norm": 0.13948290050029755, + "learning_rate": 6.097975268378856e-05, + "loss": 3.7669, + "step": 45965 + }, + { + "epoch": 3.1233863296643567, + "grad_norm": 0.21535979211330414, + "learning_rate": 6.097550618290528e-05, + "loss": 3.8619, + "step": 45970 + }, + { + "epoch": 3.1237260497350183, + "grad_norm": 0.34696629643440247, + "learning_rate": 6.0971259682022016e-05, + "loss": 4.0387, + "step": 45975 + }, + { + "epoch": 3.12406576980568, + "grad_norm": 0.15804658830165863, + "learning_rate": 6.0967013181138744e-05, + "loss": 3.6065, + "step": 45980 + }, + { + "epoch": 3.124405489876342, + "grad_norm": 0.8990033864974976, + "learning_rate": 6.0962766680255466e-05, + "loss": 3.9762, + "step": 45985 + }, + { + "epoch": 3.1247452099470037, + "grad_norm": 0.15661799907684326, + "learning_rate": 6.09585201793722e-05, + "loss": 3.464, + "step": 45990 + }, + { + "epoch": 3.1250849300176653, + "grad_norm": 0.17161500453948975, + "learning_rate": 6.095427367848893e-05, + "loss": 3.8953, + "step": 45995 + }, + { + "epoch": 3.1254246500883274, + "grad_norm": 0.15444707870483398, + "learning_rate": 6.095002717760565e-05, + "loss": 4.0773, + "step": 46000 + }, + { + "epoch": 3.125764370158989, + "grad_norm": 0.29127150774002075, + "learning_rate": 6.0945780676722384e-05, + "loss": 3.8771, + "step": 46005 + }, + { + "epoch": 3.1261040902296506, + "grad_norm": 0.15628138184547424, + "learning_rate": 6.094153417583911e-05, + "loss": 3.8436, + "step": 46010 + }, + { + "epoch": 3.1264438103003127, + "grad_norm": 0.18056851625442505, + "learning_rate": 6.0937287674955834e-05, + "loss": 3.7685, + "step": 46015 + }, + { + "epoch": 3.1267835303709743, + "grad_norm": 0.19241659343242645, + "learning_rate": 6.093304117407257e-05, + "loss": 3.8295, + "step": 46020 + }, + { + "epoch": 3.127123250441636, + "grad_norm": 0.1384664624929428, + "learning_rate": 6.0928794673189296e-05, + "loss": 3.8376, + "step": 46025 + }, + { + "epoch": 3.127462970512298, + "grad_norm": 0.16761912405490875, + "learning_rate": 6.092454817230602e-05, + "loss": 3.8211, + "step": 46030 + }, + { + "epoch": 3.1278026905829597, + "grad_norm": 0.1912979632616043, + "learning_rate": 6.092030167142275e-05, + "loss": 3.7082, + "step": 46035 + }, + { + "epoch": 3.1281424106536213, + "grad_norm": 0.19558820128440857, + "learning_rate": 6.0916055170539474e-05, + "loss": 3.9049, + "step": 46040 + }, + { + "epoch": 3.1284821307242834, + "grad_norm": 0.17280243337154388, + "learning_rate": 6.09118086696562e-05, + "loss": 3.5782, + "step": 46045 + }, + { + "epoch": 3.128821850794945, + "grad_norm": 0.1906227171421051, + "learning_rate": 6.0907562168772936e-05, + "loss": 3.7885, + "step": 46050 + }, + { + "epoch": 3.1291615708656066, + "grad_norm": 0.16029924154281616, + "learning_rate": 6.090331566788966e-05, + "loss": 3.7534, + "step": 46055 + }, + { + "epoch": 3.1295012909362687, + "grad_norm": 0.1602870523929596, + "learning_rate": 6.089906916700639e-05, + "loss": 3.8099, + "step": 46060 + }, + { + "epoch": 3.1298410110069304, + "grad_norm": 0.2944587171077728, + "learning_rate": 6.089482266612312e-05, + "loss": 3.8491, + "step": 46065 + }, + { + "epoch": 3.130180731077592, + "grad_norm": 0.2543397843837738, + "learning_rate": 6.089057616523984e-05, + "loss": 3.9977, + "step": 46070 + }, + { + "epoch": 3.1305204511482536, + "grad_norm": 0.21412688493728638, + "learning_rate": 6.0886329664356576e-05, + "loss": 3.7913, + "step": 46075 + }, + { + "epoch": 3.1308601712189157, + "grad_norm": 0.20356574654579163, + "learning_rate": 6.0882083163473304e-05, + "loss": 3.8816, + "step": 46080 + }, + { + "epoch": 3.1311998912895773, + "grad_norm": 0.16215229034423828, + "learning_rate": 6.0877836662590026e-05, + "loss": 3.6779, + "step": 46085 + }, + { + "epoch": 3.131539611360239, + "grad_norm": 0.15662148594856262, + "learning_rate": 6.087359016170676e-05, + "loss": 4.0252, + "step": 46090 + }, + { + "epoch": 3.131879331430901, + "grad_norm": 0.16205047070980072, + "learning_rate": 6.086934366082349e-05, + "loss": 3.997, + "step": 46095 + }, + { + "epoch": 3.1322190515015627, + "grad_norm": 0.1489594280719757, + "learning_rate": 6.086509715994021e-05, + "loss": 3.9278, + "step": 46100 + }, + { + "epoch": 3.1325587715722243, + "grad_norm": 2.2615127563476562, + "learning_rate": 6.0860850659056945e-05, + "loss": 4.1278, + "step": 46105 + }, + { + "epoch": 3.1328984916428864, + "grad_norm": 0.19721737504005432, + "learning_rate": 6.0856604158173666e-05, + "loss": 3.9516, + "step": 46110 + }, + { + "epoch": 3.133238211713548, + "grad_norm": 0.3248071074485779, + "learning_rate": 6.0852357657290394e-05, + "loss": 3.8585, + "step": 46115 + }, + { + "epoch": 3.1335779317842096, + "grad_norm": 0.16760356724262238, + "learning_rate": 6.084811115640713e-05, + "loss": 3.7366, + "step": 46120 + }, + { + "epoch": 3.1339176518548717, + "grad_norm": 0.20555424690246582, + "learning_rate": 6.084386465552385e-05, + "loss": 4.0538, + "step": 46125 + }, + { + "epoch": 3.1342573719255333, + "grad_norm": 0.1974673569202423, + "learning_rate": 6.083961815464058e-05, + "loss": 3.5644, + "step": 46130 + }, + { + "epoch": 3.134597091996195, + "grad_norm": 0.2662750780582428, + "learning_rate": 6.083537165375731e-05, + "loss": 3.6681, + "step": 46135 + }, + { + "epoch": 3.134936812066857, + "grad_norm": 0.15258590877056122, + "learning_rate": 6.0831125152874034e-05, + "loss": 3.6967, + "step": 46140 + }, + { + "epoch": 3.1352765321375187, + "grad_norm": 0.1854752153158188, + "learning_rate": 6.082687865199076e-05, + "loss": 3.8317, + "step": 46145 + }, + { + "epoch": 3.1356162522081803, + "grad_norm": 0.18728064000606537, + "learning_rate": 6.0822632151107497e-05, + "loss": 3.7926, + "step": 46150 + }, + { + "epoch": 3.1359559722788424, + "grad_norm": 0.14833280444145203, + "learning_rate": 6.081838565022422e-05, + "loss": 3.9006, + "step": 46155 + }, + { + "epoch": 3.136295692349504, + "grad_norm": 0.1828518509864807, + "learning_rate": 6.0814139149340946e-05, + "loss": 3.7018, + "step": 46160 + }, + { + "epoch": 3.1366354124201656, + "grad_norm": 0.2163403332233429, + "learning_rate": 6.080989264845768e-05, + "loss": 4.0001, + "step": 46165 + }, + { + "epoch": 3.1369751324908277, + "grad_norm": 0.16392198204994202, + "learning_rate": 6.08056461475744e-05, + "loss": 3.902, + "step": 46170 + }, + { + "epoch": 3.1373148525614893, + "grad_norm": 0.17933325469493866, + "learning_rate": 6.080139964669112e-05, + "loss": 3.945, + "step": 46175 + }, + { + "epoch": 3.137654572632151, + "grad_norm": 0.1654706597328186, + "learning_rate": 6.0797153145807865e-05, + "loss": 3.8177, + "step": 46180 + }, + { + "epoch": 3.137994292702813, + "grad_norm": 0.3586430549621582, + "learning_rate": 6.0792906644924586e-05, + "loss": 3.7952, + "step": 46185 + }, + { + "epoch": 3.1383340127734747, + "grad_norm": 0.1939568966627121, + "learning_rate": 6.078866014404131e-05, + "loss": 4.0063, + "step": 46190 + }, + { + "epoch": 3.1386737328441363, + "grad_norm": 0.19874542951583862, + "learning_rate": 6.078441364315804e-05, + "loss": 4.0122, + "step": 46195 + }, + { + "epoch": 3.1390134529147984, + "grad_norm": 0.2023492306470871, + "learning_rate": 6.078016714227477e-05, + "loss": 3.8426, + "step": 46200 + }, + { + "epoch": 3.13935317298546, + "grad_norm": 0.18147656321525574, + "learning_rate": 6.077592064139149e-05, + "loss": 3.7607, + "step": 46205 + }, + { + "epoch": 3.1396928930561216, + "grad_norm": 0.18856093287467957, + "learning_rate": 6.0771674140508226e-05, + "loss": 3.8011, + "step": 46210 + }, + { + "epoch": 3.1400326131267837, + "grad_norm": 0.1714790165424347, + "learning_rate": 6.0767427639624954e-05, + "loss": 4.1352, + "step": 46215 + }, + { + "epoch": 3.1403723331974454, + "grad_norm": 0.2508147358894348, + "learning_rate": 6.0763181138741675e-05, + "loss": 3.9605, + "step": 46220 + }, + { + "epoch": 3.140712053268107, + "grad_norm": 0.8711825609207153, + "learning_rate": 6.075893463785841e-05, + "loss": 3.8253, + "step": 46225 + }, + { + "epoch": 3.141051773338769, + "grad_norm": 0.1999233514070511, + "learning_rate": 6.075468813697514e-05, + "loss": 3.6723, + "step": 46230 + }, + { + "epoch": 3.1413914934094307, + "grad_norm": 0.22401782870292664, + "learning_rate": 6.075044163609186e-05, + "loss": 3.8749, + "step": 46235 + }, + { + "epoch": 3.1417312134800923, + "grad_norm": 0.14051398634910583, + "learning_rate": 6.0746195135208594e-05, + "loss": 3.9389, + "step": 46240 + }, + { + "epoch": 3.1420709335507544, + "grad_norm": 0.21467408537864685, + "learning_rate": 6.0741948634325315e-05, + "loss": 3.9146, + "step": 46245 + }, + { + "epoch": 3.142410653621416, + "grad_norm": 0.16628398001194, + "learning_rate": 6.073770213344204e-05, + "loss": 3.9369, + "step": 46250 + }, + { + "epoch": 3.1427503736920777, + "grad_norm": 0.23486211895942688, + "learning_rate": 6.073345563255878e-05, + "loss": 3.8921, + "step": 46255 + }, + { + "epoch": 3.1430900937627397, + "grad_norm": 0.25209662318229675, + "learning_rate": 6.07292091316755e-05, + "loss": 3.8929, + "step": 46260 + }, + { + "epoch": 3.1434298138334014, + "grad_norm": 0.29209011793136597, + "learning_rate": 6.072496263079223e-05, + "loss": 3.9878, + "step": 46265 + }, + { + "epoch": 3.143769533904063, + "grad_norm": 0.20982465147972107, + "learning_rate": 6.072071612990896e-05, + "loss": 3.9821, + "step": 46270 + }, + { + "epoch": 3.1441092539747246, + "grad_norm": 0.18760858476161957, + "learning_rate": 6.071646962902568e-05, + "loss": 3.9821, + "step": 46275 + }, + { + "epoch": 3.1444489740453867, + "grad_norm": 0.19803674519062042, + "learning_rate": 6.071222312814241e-05, + "loss": 3.885, + "step": 46280 + }, + { + "epoch": 3.1447886941160483, + "grad_norm": 0.17279039323329926, + "learning_rate": 6.0707976627259146e-05, + "loss": 3.8519, + "step": 46285 + }, + { + "epoch": 3.14512841418671, + "grad_norm": 0.25255799293518066, + "learning_rate": 6.070373012637587e-05, + "loss": 3.9047, + "step": 46290 + }, + { + "epoch": 3.145468134257372, + "grad_norm": 0.18226775527000427, + "learning_rate": 6.0699483625492595e-05, + "loss": 3.7695, + "step": 46295 + }, + { + "epoch": 3.1458078543280337, + "grad_norm": 0.23876775801181793, + "learning_rate": 6.069523712460933e-05, + "loss": 4.0338, + "step": 46300 + }, + { + "epoch": 3.1461475743986953, + "grad_norm": 0.20568789541721344, + "learning_rate": 6.069099062372605e-05, + "loss": 4.1628, + "step": 46305 + }, + { + "epoch": 3.1464872944693574, + "grad_norm": 0.20138223469257355, + "learning_rate": 6.068674412284277e-05, + "loss": 3.8365, + "step": 46310 + }, + { + "epoch": 3.146827014540019, + "grad_norm": 0.6262094378471375, + "learning_rate": 6.0682497621959514e-05, + "loss": 3.6669, + "step": 46315 + }, + { + "epoch": 3.1471667346106806, + "grad_norm": 0.20711763203144073, + "learning_rate": 6.0678251121076235e-05, + "loss": 3.7663, + "step": 46320 + }, + { + "epoch": 3.1475064546813427, + "grad_norm": 0.22064736485481262, + "learning_rate": 6.0674004620192957e-05, + "loss": 4.102, + "step": 46325 + }, + { + "epoch": 3.1478461747520043, + "grad_norm": 0.20069460570812225, + "learning_rate": 6.066975811930969e-05, + "loss": 4.007, + "step": 46330 + }, + { + "epoch": 3.148185894822666, + "grad_norm": 0.20364077389240265, + "learning_rate": 6.066551161842642e-05, + "loss": 3.9698, + "step": 46335 + }, + { + "epoch": 3.148525614893328, + "grad_norm": 0.2298687994480133, + "learning_rate": 6.066126511754314e-05, + "loss": 3.8276, + "step": 46340 + }, + { + "epoch": 3.1488653349639897, + "grad_norm": 0.17157933115959167, + "learning_rate": 6.0657018616659875e-05, + "loss": 4.0739, + "step": 46345 + }, + { + "epoch": 3.1492050550346513, + "grad_norm": 0.15822574496269226, + "learning_rate": 6.06527721157766e-05, + "loss": 3.7491, + "step": 46350 + }, + { + "epoch": 3.1495447751053134, + "grad_norm": 0.2818049490451813, + "learning_rate": 6.0648525614893325e-05, + "loss": 3.6844, + "step": 46355 + }, + { + "epoch": 3.149884495175975, + "grad_norm": 0.6117631793022156, + "learning_rate": 6.064427911401006e-05, + "loss": 3.9682, + "step": 46360 + }, + { + "epoch": 3.1502242152466366, + "grad_norm": 0.1498817652463913, + "learning_rate": 6.064003261312679e-05, + "loss": 3.9948, + "step": 46365 + }, + { + "epoch": 3.1505639353172987, + "grad_norm": 0.38892248272895813, + "learning_rate": 6.063578611224351e-05, + "loss": 3.742, + "step": 46370 + }, + { + "epoch": 3.1509036553879604, + "grad_norm": 0.14578676223754883, + "learning_rate": 6.063153961136024e-05, + "loss": 3.6502, + "step": 46375 + }, + { + "epoch": 3.151243375458622, + "grad_norm": 0.27669209241867065, + "learning_rate": 6.062729311047697e-05, + "loss": 3.7193, + "step": 46380 + }, + { + "epoch": 3.151583095529284, + "grad_norm": 0.1663466989994049, + "learning_rate": 6.062304660959369e-05, + "loss": 3.894, + "step": 46385 + }, + { + "epoch": 3.1519228155999457, + "grad_norm": 0.14263509213924408, + "learning_rate": 6.061880010871043e-05, + "loss": 3.7899, + "step": 46390 + }, + { + "epoch": 3.1522625356706073, + "grad_norm": 0.17689332365989685, + "learning_rate": 6.061455360782715e-05, + "loss": 4.028, + "step": 46395 + }, + { + "epoch": 3.1526022557412694, + "grad_norm": 0.2425488382577896, + "learning_rate": 6.0610307106943883e-05, + "loss": 3.9534, + "step": 46400 + }, + { + "epoch": 3.152941975811931, + "grad_norm": 0.9053425788879395, + "learning_rate": 6.060606060606061e-05, + "loss": 3.8107, + "step": 46405 + }, + { + "epoch": 3.1532816958825927, + "grad_norm": 0.15014897286891937, + "learning_rate": 6.060181410517733e-05, + "loss": 3.8702, + "step": 46410 + }, + { + "epoch": 3.1536214159532543, + "grad_norm": 1.3441578149795532, + "learning_rate": 6.059756760429407e-05, + "loss": 3.7263, + "step": 46415 + }, + { + "epoch": 3.1539611360239164, + "grad_norm": 0.18820995092391968, + "learning_rate": 6.0593321103410795e-05, + "loss": 3.8132, + "step": 46420 + }, + { + "epoch": 3.154300856094578, + "grad_norm": 0.18813863396644592, + "learning_rate": 6.058907460252752e-05, + "loss": 4.0157, + "step": 46425 + }, + { + "epoch": 3.1546405761652396, + "grad_norm": 0.30119702219963074, + "learning_rate": 6.058482810164425e-05, + "loss": 3.8944, + "step": 46430 + }, + { + "epoch": 3.1549802962359017, + "grad_norm": 0.18195825815200806, + "learning_rate": 6.058058160076098e-05, + "loss": 3.8334, + "step": 46435 + }, + { + "epoch": 3.1553200163065633, + "grad_norm": 0.19605255126953125, + "learning_rate": 6.05763350998777e-05, + "loss": 3.8999, + "step": 46440 + }, + { + "epoch": 3.155659736377225, + "grad_norm": 0.16176848113536835, + "learning_rate": 6.0572088598994435e-05, + "loss": 3.9488, + "step": 46445 + }, + { + "epoch": 3.155999456447887, + "grad_norm": 0.14732131361961365, + "learning_rate": 6.0567842098111163e-05, + "loss": 3.906, + "step": 46450 + }, + { + "epoch": 3.1563391765185487, + "grad_norm": 0.1761723905801773, + "learning_rate": 6.0563595597227885e-05, + "loss": 4.0255, + "step": 46455 + }, + { + "epoch": 3.1566788965892103, + "grad_norm": 0.24605126678943634, + "learning_rate": 6.055934909634462e-05, + "loss": 3.7692, + "step": 46460 + }, + { + "epoch": 3.1570186166598724, + "grad_norm": 0.13849298655986786, + "learning_rate": 6.055510259546134e-05, + "loss": 3.6757, + "step": 46465 + }, + { + "epoch": 3.157358336730534, + "grad_norm": 0.19153012335300446, + "learning_rate": 6.055085609457807e-05, + "loss": 3.9687, + "step": 46470 + }, + { + "epoch": 3.1576980568011956, + "grad_norm": 0.25169944763183594, + "learning_rate": 6.0546609593694804e-05, + "loss": 3.6387, + "step": 46475 + }, + { + "epoch": 3.1580377768718577, + "grad_norm": 0.16287636756896973, + "learning_rate": 6.0542363092811525e-05, + "loss": 3.797, + "step": 46480 + }, + { + "epoch": 3.1583774969425193, + "grad_norm": 1.352262020111084, + "learning_rate": 6.053811659192825e-05, + "loss": 3.8743, + "step": 46485 + }, + { + "epoch": 3.158717217013181, + "grad_norm": 0.3478766679763794, + "learning_rate": 6.053387009104499e-05, + "loss": 3.867, + "step": 46490 + }, + { + "epoch": 3.159056937083843, + "grad_norm": 0.18540571630001068, + "learning_rate": 6.052962359016171e-05, + "loss": 3.9611, + "step": 46495 + }, + { + "epoch": 3.1593966571545047, + "grad_norm": 0.16001708805561066, + "learning_rate": 6.052537708927844e-05, + "loss": 3.4919, + "step": 46500 + }, + { + "epoch": 3.1597363772251663, + "grad_norm": 0.18536561727523804, + "learning_rate": 6.052113058839517e-05, + "loss": 3.8557, + "step": 46505 + }, + { + "epoch": 3.1600760972958284, + "grad_norm": 0.19796468317508698, + "learning_rate": 6.051688408751189e-05, + "loss": 3.9758, + "step": 46510 + }, + { + "epoch": 3.16041581736649, + "grad_norm": 0.19946232438087463, + "learning_rate": 6.051263758662862e-05, + "loss": 4.1308, + "step": 46515 + }, + { + "epoch": 3.1607555374371517, + "grad_norm": 0.9891839623451233, + "learning_rate": 6.0508391085745356e-05, + "loss": 4.0983, + "step": 46520 + }, + { + "epoch": 3.1610952575078137, + "grad_norm": 1.588503122329712, + "learning_rate": 6.050414458486208e-05, + "loss": 3.8308, + "step": 46525 + }, + { + "epoch": 3.1614349775784754, + "grad_norm": 0.18981239199638367, + "learning_rate": 6.04998980839788e-05, + "loss": 3.8358, + "step": 46530 + }, + { + "epoch": 3.161774697649137, + "grad_norm": 0.1424698829650879, + "learning_rate": 6.049565158309553e-05, + "loss": 3.9198, + "step": 46535 + }, + { + "epoch": 3.162114417719799, + "grad_norm": 0.19869297742843628, + "learning_rate": 6.049140508221226e-05, + "loss": 3.8243, + "step": 46540 + }, + { + "epoch": 3.1624541377904607, + "grad_norm": 0.19553746283054352, + "learning_rate": 6.048715858132898e-05, + "loss": 3.7903, + "step": 46545 + }, + { + "epoch": 3.1627938578611223, + "grad_norm": 0.12855280935764313, + "learning_rate": 6.048291208044572e-05, + "loss": 3.9513, + "step": 46550 + }, + { + "epoch": 3.1631335779317844, + "grad_norm": 0.19559723138809204, + "learning_rate": 6.0478665579562445e-05, + "loss": 3.9659, + "step": 46555 + }, + { + "epoch": 3.163473298002446, + "grad_norm": 0.19129206240177155, + "learning_rate": 6.0474419078679166e-05, + "loss": 3.8436, + "step": 46560 + }, + { + "epoch": 3.1638130180731077, + "grad_norm": 0.12804149091243744, + "learning_rate": 6.04701725777959e-05, + "loss": 3.9521, + "step": 46565 + }, + { + "epoch": 3.1641527381437697, + "grad_norm": 0.18636059761047363, + "learning_rate": 6.046592607691263e-05, + "loss": 3.7283, + "step": 46570 + }, + { + "epoch": 3.1644924582144314, + "grad_norm": 0.1786118596792221, + "learning_rate": 6.046167957602935e-05, + "loss": 4.0983, + "step": 46575 + }, + { + "epoch": 3.164832178285093, + "grad_norm": 0.1529618203639984, + "learning_rate": 6.0457433075146085e-05, + "loss": 4.0361, + "step": 46580 + }, + { + "epoch": 3.165171898355755, + "grad_norm": 8.61949348449707, + "learning_rate": 6.045318657426281e-05, + "loss": 3.7996, + "step": 46585 + }, + { + "epoch": 3.1655116184264167, + "grad_norm": 0.1800478994846344, + "learning_rate": 6.0448940073379534e-05, + "loss": 3.8076, + "step": 46590 + }, + { + "epoch": 3.1658513384970783, + "grad_norm": 0.45816534757614136, + "learning_rate": 6.044469357249627e-05, + "loss": 4.0302, + "step": 46595 + }, + { + "epoch": 3.1661910585677404, + "grad_norm": 0.36418381333351135, + "learning_rate": 6.044044707161299e-05, + "loss": 3.8934, + "step": 46600 + }, + { + "epoch": 3.166530778638402, + "grad_norm": 0.4913425147533417, + "learning_rate": 6.043620057072972e-05, + "loss": 3.9689, + "step": 46605 + }, + { + "epoch": 3.1668704987090637, + "grad_norm": 0.17846544086933136, + "learning_rate": 6.043195406984645e-05, + "loss": 3.7251, + "step": 46610 + }, + { + "epoch": 3.1672102187797253, + "grad_norm": 0.3014601469039917, + "learning_rate": 6.0427707568963174e-05, + "loss": 3.7721, + "step": 46615 + }, + { + "epoch": 3.1675499388503874, + "grad_norm": 0.1897277981042862, + "learning_rate": 6.04234610680799e-05, + "loss": 3.9169, + "step": 46620 + }, + { + "epoch": 3.167889658921049, + "grad_norm": 0.19393809139728546, + "learning_rate": 6.041921456719664e-05, + "loss": 4.0632, + "step": 46625 + }, + { + "epoch": 3.1682293789917106, + "grad_norm": 0.16108451783657074, + "learning_rate": 6.041496806631336e-05, + "loss": 3.7458, + "step": 46630 + }, + { + "epoch": 3.1685690990623727, + "grad_norm": 0.1651889979839325, + "learning_rate": 6.0410721565430086e-05, + "loss": 3.8832, + "step": 46635 + }, + { + "epoch": 3.1689088191330343, + "grad_norm": 0.14826831221580505, + "learning_rate": 6.040647506454682e-05, + "loss": 3.867, + "step": 46640 + }, + { + "epoch": 3.169248539203696, + "grad_norm": 0.171342670917511, + "learning_rate": 6.040222856366354e-05, + "loss": 3.9477, + "step": 46645 + }, + { + "epoch": 3.169588259274358, + "grad_norm": 0.26389262080192566, + "learning_rate": 6.039798206278027e-05, + "loss": 4.0791, + "step": 46650 + }, + { + "epoch": 3.1699279793450197, + "grad_norm": 0.18787340819835663, + "learning_rate": 6.0393735561897005e-05, + "loss": 3.853, + "step": 46655 + }, + { + "epoch": 3.1702676994156813, + "grad_norm": 0.19471627473831177, + "learning_rate": 6.0389489061013726e-05, + "loss": 4.0425, + "step": 46660 + }, + { + "epoch": 3.1706074194863434, + "grad_norm": 0.144736185669899, + "learning_rate": 6.038524256013045e-05, + "loss": 3.8194, + "step": 46665 + }, + { + "epoch": 3.170947139557005, + "grad_norm": 0.15944325923919678, + "learning_rate": 6.038099605924719e-05, + "loss": 3.8823, + "step": 46670 + }, + { + "epoch": 3.1712868596276667, + "grad_norm": 0.19803696870803833, + "learning_rate": 6.037674955836391e-05, + "loss": 3.9989, + "step": 46675 + }, + { + "epoch": 3.1716265796983287, + "grad_norm": 0.15869851410388947, + "learning_rate": 6.037250305748063e-05, + "loss": 4.045, + "step": 46680 + }, + { + "epoch": 3.1719662997689904, + "grad_norm": 0.191684752702713, + "learning_rate": 6.0368256556597366e-05, + "loss": 3.7108, + "step": 46685 + }, + { + "epoch": 3.172306019839652, + "grad_norm": 0.254269003868103, + "learning_rate": 6.0364010055714094e-05, + "loss": 4.1728, + "step": 46690 + }, + { + "epoch": 3.172645739910314, + "grad_norm": 0.2690134048461914, + "learning_rate": 6.0359763554830816e-05, + "loss": 3.9848, + "step": 46695 + }, + { + "epoch": 3.1729854599809757, + "grad_norm": 0.16707609593868256, + "learning_rate": 6.035551705394755e-05, + "loss": 3.6459, + "step": 46700 + }, + { + "epoch": 3.1733251800516373, + "grad_norm": 0.19034649431705475, + "learning_rate": 6.035127055306428e-05, + "loss": 3.7666, + "step": 46705 + }, + { + "epoch": 3.1736649001222994, + "grad_norm": 0.23304328322410583, + "learning_rate": 6.0347024052181e-05, + "loss": 3.8096, + "step": 46710 + }, + { + "epoch": 3.174004620192961, + "grad_norm": 0.24243797361850739, + "learning_rate": 6.0342777551297734e-05, + "loss": 4.1169, + "step": 46715 + }, + { + "epoch": 3.1743443402636227, + "grad_norm": 0.17370180785655975, + "learning_rate": 6.033853105041446e-05, + "loss": 3.851, + "step": 46720 + }, + { + "epoch": 3.1746840603342847, + "grad_norm": 0.2228429764509201, + "learning_rate": 6.0334284549531184e-05, + "loss": 3.7407, + "step": 46725 + }, + { + "epoch": 3.1750237804049464, + "grad_norm": 0.1567288488149643, + "learning_rate": 6.033003804864792e-05, + "loss": 3.9077, + "step": 46730 + }, + { + "epoch": 3.175363500475608, + "grad_norm": 0.20375464856624603, + "learning_rate": 6.032579154776464e-05, + "loss": 3.7864, + "step": 46735 + }, + { + "epoch": 3.17570322054627, + "grad_norm": 0.18152743577957153, + "learning_rate": 6.032154504688138e-05, + "loss": 4.0019, + "step": 46740 + }, + { + "epoch": 3.1760429406169317, + "grad_norm": 0.1917732208967209, + "learning_rate": 6.03172985459981e-05, + "loss": 4.0679, + "step": 46745 + }, + { + "epoch": 3.1763826606875933, + "grad_norm": 0.17521944642066956, + "learning_rate": 6.0313052045114824e-05, + "loss": 3.7699, + "step": 46750 + }, + { + "epoch": 3.176722380758255, + "grad_norm": 0.8083236217498779, + "learning_rate": 6.030880554423156e-05, + "loss": 3.4267, + "step": 46755 + }, + { + "epoch": 3.177062100828917, + "grad_norm": 0.16031138598918915, + "learning_rate": 6.0304559043348286e-05, + "loss": 3.9566, + "step": 46760 + }, + { + "epoch": 3.1774018208995787, + "grad_norm": 0.1497601866722107, + "learning_rate": 6.030031254246501e-05, + "loss": 3.7667, + "step": 46765 + }, + { + "epoch": 3.1777415409702403, + "grad_norm": 0.16868209838867188, + "learning_rate": 6.029606604158174e-05, + "loss": 3.8535, + "step": 46770 + }, + { + "epoch": 3.1780812610409024, + "grad_norm": 0.20110774040222168, + "learning_rate": 6.029181954069847e-05, + "loss": 3.7325, + "step": 46775 + }, + { + "epoch": 3.178420981111564, + "grad_norm": 0.18309153616428375, + "learning_rate": 6.028757303981519e-05, + "loss": 3.841, + "step": 46780 + }, + { + "epoch": 3.1787607011822256, + "grad_norm": 0.1642385572195053, + "learning_rate": 6.0283326538931926e-05, + "loss": 3.7081, + "step": 46785 + }, + { + "epoch": 3.1791004212528877, + "grad_norm": 0.1729748398065567, + "learning_rate": 6.0279080038048654e-05, + "loss": 4.0843, + "step": 46790 + }, + { + "epoch": 3.1794401413235494, + "grad_norm": 0.15167739987373352, + "learning_rate": 6.0274833537165376e-05, + "loss": 3.9201, + "step": 46795 + }, + { + "epoch": 3.179779861394211, + "grad_norm": 0.2404259294271469, + "learning_rate": 6.027058703628211e-05, + "loss": 3.6082, + "step": 46800 + }, + { + "epoch": 3.180119581464873, + "grad_norm": 0.1430004984140396, + "learning_rate": 6.026634053539884e-05, + "loss": 3.6203, + "step": 46805 + }, + { + "epoch": 3.1804593015355347, + "grad_norm": 4.9924750328063965, + "learning_rate": 6.026209403451556e-05, + "loss": 4.0099, + "step": 46810 + }, + { + "epoch": 3.1807990216061963, + "grad_norm": 0.19866305589675903, + "learning_rate": 6.0257847533632295e-05, + "loss": 3.6963, + "step": 46815 + }, + { + "epoch": 3.1811387416768584, + "grad_norm": 0.6613665223121643, + "learning_rate": 6.0253601032749016e-05, + "loss": 3.7971, + "step": 46820 + }, + { + "epoch": 3.18147846174752, + "grad_norm": 0.14300362765789032, + "learning_rate": 6.0249354531865744e-05, + "loss": 3.8919, + "step": 46825 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.1552605777978897, + "learning_rate": 6.024510803098248e-05, + "loss": 3.8682, + "step": 46830 + }, + { + "epoch": 3.1821579018888437, + "grad_norm": 0.14464783668518066, + "learning_rate": 6.02408615300992e-05, + "loss": 3.8882, + "step": 46835 + }, + { + "epoch": 3.1824976219595054, + "grad_norm": 0.20616678893566132, + "learning_rate": 6.023661502921593e-05, + "loss": 3.7009, + "step": 46840 + }, + { + "epoch": 3.182837342030167, + "grad_norm": NaN, + "learning_rate": 6.023321782850932e-05, + "loss": 3.8764, + "step": 46845 + }, + { + "epoch": 3.183177062100829, + "grad_norm": 0.1649390161037445, + "learning_rate": 6.022897132762604e-05, + "loss": 3.9329, + "step": 46850 + }, + { + "epoch": 3.1835167821714907, + "grad_norm": 0.1623104065656662, + "learning_rate": 6.022472482674276e-05, + "loss": 3.8889, + "step": 46855 + }, + { + "epoch": 3.1838565022421523, + "grad_norm": 0.19490088522434235, + "learning_rate": 6.0220478325859494e-05, + "loss": 4.0714, + "step": 46860 + }, + { + "epoch": 3.1841962223128144, + "grad_norm": 0.37540262937545776, + "learning_rate": 6.021623182497622e-05, + "loss": 3.8803, + "step": 46865 + }, + { + "epoch": 3.184535942383476, + "grad_norm": 0.1514524221420288, + "learning_rate": 6.0211985324092943e-05, + "loss": 4.0608, + "step": 46870 + }, + { + "epoch": 3.1848756624541377, + "grad_norm": 0.24546101689338684, + "learning_rate": 6.020773882320968e-05, + "loss": 4.014, + "step": 46875 + }, + { + "epoch": 3.1852153825247997, + "grad_norm": 0.18002192676067352, + "learning_rate": 6.0203492322326406e-05, + "loss": 3.8214, + "step": 46880 + }, + { + "epoch": 3.1855551025954614, + "grad_norm": 0.17743612825870514, + "learning_rate": 6.019924582144313e-05, + "loss": 4.054, + "step": 46885 + }, + { + "epoch": 3.185894822666123, + "grad_norm": 0.16999486088752747, + "learning_rate": 6.019499932055986e-05, + "loss": 3.7812, + "step": 46890 + }, + { + "epoch": 3.186234542736785, + "grad_norm": 0.1847606748342514, + "learning_rate": 6.019075281967659e-05, + "loss": 3.9212, + "step": 46895 + }, + { + "epoch": 3.1865742628074467, + "grad_norm": 0.17828582227230072, + "learning_rate": 6.018650631879331e-05, + "loss": 3.9847, + "step": 46900 + }, + { + "epoch": 3.1869139828781083, + "grad_norm": 0.1990036964416504, + "learning_rate": 6.0182259817910046e-05, + "loss": 3.7448, + "step": 46905 + }, + { + "epoch": 3.1872537029487704, + "grad_norm": 0.17744888365268707, + "learning_rate": 6.0178013317026774e-05, + "loss": 3.9783, + "step": 46910 + }, + { + "epoch": 3.187593423019432, + "grad_norm": 0.1577865034341812, + "learning_rate": 6.0173766816143496e-05, + "loss": 3.8284, + "step": 46915 + }, + { + "epoch": 3.1879331430900937, + "grad_norm": 0.23191750049591064, + "learning_rate": 6.016952031526023e-05, + "loss": 3.9226, + "step": 46920 + }, + { + "epoch": 3.1882728631607558, + "grad_norm": 0.18722142279148102, + "learning_rate": 6.016527381437695e-05, + "loss": 3.9798, + "step": 46925 + }, + { + "epoch": 3.1886125832314174, + "grad_norm": 0.1592922806739807, + "learning_rate": 6.016102731349368e-05, + "loss": 3.9508, + "step": 46930 + }, + { + "epoch": 3.188952303302079, + "grad_norm": 0.3110353350639343, + "learning_rate": 6.0156780812610414e-05, + "loss": 3.766, + "step": 46935 + }, + { + "epoch": 3.189292023372741, + "grad_norm": 0.2620554268360138, + "learning_rate": 6.0152534311727136e-05, + "loss": 3.8622, + "step": 46940 + }, + { + "epoch": 3.1896317434434027, + "grad_norm": 0.28354236483573914, + "learning_rate": 6.014828781084387e-05, + "loss": 3.8019, + "step": 46945 + }, + { + "epoch": 3.1899714635140644, + "grad_norm": 0.15796807408332825, + "learning_rate": 6.01440413099606e-05, + "loss": 3.9246, + "step": 46950 + }, + { + "epoch": 3.190311183584726, + "grad_norm": 0.2801980674266815, + "learning_rate": 6.013979480907732e-05, + "loss": 3.7215, + "step": 46955 + }, + { + "epoch": 3.190650903655388, + "grad_norm": 0.14773382246494293, + "learning_rate": 6.0135548308194054e-05, + "loss": 3.647, + "step": 46960 + }, + { + "epoch": 3.1909906237260497, + "grad_norm": 0.20466236770153046, + "learning_rate": 6.013130180731078e-05, + "loss": 3.854, + "step": 46965 + }, + { + "epoch": 3.1913303437967113, + "grad_norm": 0.21673281490802765, + "learning_rate": 6.0127055306427504e-05, + "loss": 3.6727, + "step": 46970 + }, + { + "epoch": 3.1916700638673734, + "grad_norm": 0.5030932426452637, + "learning_rate": 6.012280880554424e-05, + "loss": 3.9152, + "step": 46975 + }, + { + "epoch": 3.192009783938035, + "grad_norm": 0.18105365335941315, + "learning_rate": 6.0118562304660966e-05, + "loss": 3.7741, + "step": 46980 + }, + { + "epoch": 3.1923495040086967, + "grad_norm": 0.16228382289409637, + "learning_rate": 6.011431580377769e-05, + "loss": 3.6671, + "step": 46985 + }, + { + "epoch": 3.1926892240793587, + "grad_norm": 0.21830199658870697, + "learning_rate": 6.011006930289442e-05, + "loss": 3.7458, + "step": 46990 + }, + { + "epoch": 3.1930289441500204, + "grad_norm": 0.16545408964157104, + "learning_rate": 6.0105822802011144e-05, + "loss": 3.7497, + "step": 46995 + }, + { + "epoch": 3.193368664220682, + "grad_norm": 0.24206796288490295, + "learning_rate": 6.010157630112787e-05, + "loss": 3.7023, + "step": 47000 + }, + { + "epoch": 3.193708384291344, + "grad_norm": 0.1937483847141266, + "learning_rate": 6.0097329800244606e-05, + "loss": 3.9317, + "step": 47005 + }, + { + "epoch": 3.1940481043620057, + "grad_norm": 0.16856926679611206, + "learning_rate": 6.009308329936133e-05, + "loss": 3.6629, + "step": 47010 + }, + { + "epoch": 3.1943878244326673, + "grad_norm": 0.18739618360996246, + "learning_rate": 6.0088836798478056e-05, + "loss": 3.8029, + "step": 47015 + }, + { + "epoch": 3.1947275445033294, + "grad_norm": 0.15599533915519714, + "learning_rate": 6.008459029759479e-05, + "loss": 3.9435, + "step": 47020 + }, + { + "epoch": 3.195067264573991, + "grad_norm": 0.20468167960643768, + "learning_rate": 6.008034379671151e-05, + "loss": 3.9315, + "step": 47025 + }, + { + "epoch": 3.1954069846446527, + "grad_norm": 0.17289571464061737, + "learning_rate": 6.007609729582824e-05, + "loss": 3.6703, + "step": 47030 + }, + { + "epoch": 3.1957467047153147, + "grad_norm": 0.1426009088754654, + "learning_rate": 6.0071850794944974e-05, + "loss": 3.7982, + "step": 47035 + }, + { + "epoch": 3.1960864247859764, + "grad_norm": 0.21332815289497375, + "learning_rate": 6.0067604294061696e-05, + "loss": 3.9482, + "step": 47040 + }, + { + "epoch": 3.196426144856638, + "grad_norm": 0.3126266300678253, + "learning_rate": 6.0063357793178424e-05, + "loss": 3.7129, + "step": 47045 + }, + { + "epoch": 3.1967658649273, + "grad_norm": 0.18854723870754242, + "learning_rate": 6.005911129229516e-05, + "loss": 3.7325, + "step": 47050 + }, + { + "epoch": 3.1971055849979617, + "grad_norm": 0.1468878835439682, + "learning_rate": 6.005486479141188e-05, + "loss": 3.8998, + "step": 47055 + }, + { + "epoch": 3.1974453050686233, + "grad_norm": 0.22508853673934937, + "learning_rate": 6.00506182905286e-05, + "loss": 3.7552, + "step": 47060 + }, + { + "epoch": 3.1977850251392854, + "grad_norm": 0.19393949210643768, + "learning_rate": 6.004637178964534e-05, + "loss": 4.0961, + "step": 47065 + }, + { + "epoch": 3.198124745209947, + "grad_norm": 0.23349837958812714, + "learning_rate": 6.0042125288762064e-05, + "loss": 3.8337, + "step": 47070 + }, + { + "epoch": 3.1984644652806087, + "grad_norm": 0.16992177069187164, + "learning_rate": 6.0037878787878785e-05, + "loss": 4.0899, + "step": 47075 + }, + { + "epoch": 3.1988041853512708, + "grad_norm": 0.21336254477500916, + "learning_rate": 6.003363228699552e-05, + "loss": 3.7297, + "step": 47080 + }, + { + "epoch": 3.1991439054219324, + "grad_norm": 0.42011114954948425, + "learning_rate": 6.002938578611225e-05, + "loss": 3.8878, + "step": 47085 + }, + { + "epoch": 3.199483625492594, + "grad_norm": 0.20629285275936127, + "learning_rate": 6.002513928522897e-05, + "loss": 3.9592, + "step": 47090 + }, + { + "epoch": 3.1998233455632556, + "grad_norm": 0.16210563480854034, + "learning_rate": 6.0020892784345704e-05, + "loss": 3.8222, + "step": 47095 + }, + { + "epoch": 3.2001630656339177, + "grad_norm": 0.2816537618637085, + "learning_rate": 6.001664628346243e-05, + "loss": 3.7253, + "step": 47100 + }, + { + "epoch": 3.2005027857045794, + "grad_norm": 0.18682362139225006, + "learning_rate": 6.001239978257915e-05, + "loss": 3.7558, + "step": 47105 + }, + { + "epoch": 3.200842505775241, + "grad_norm": 0.15972919762134552, + "learning_rate": 6.000815328169589e-05, + "loss": 4.2649, + "step": 47110 + }, + { + "epoch": 3.201182225845903, + "grad_norm": 0.18903936445713043, + "learning_rate": 6.0003906780812616e-05, + "loss": 3.8686, + "step": 47115 + }, + { + "epoch": 3.2015219459165647, + "grad_norm": 0.1666589230298996, + "learning_rate": 5.999966027992934e-05, + "loss": 3.9636, + "step": 47120 + }, + { + "epoch": 3.2018616659872263, + "grad_norm": 0.14552029967308044, + "learning_rate": 5.999541377904607e-05, + "loss": 3.5386, + "step": 47125 + }, + { + "epoch": 3.2022013860578884, + "grad_norm": 0.6466304063796997, + "learning_rate": 5.99911672781628e-05, + "loss": 4.0052, + "step": 47130 + }, + { + "epoch": 3.20254110612855, + "grad_norm": 0.20036780834197998, + "learning_rate": 5.998692077727952e-05, + "loss": 4.0364, + "step": 47135 + }, + { + "epoch": 3.2028808261992117, + "grad_norm": 0.17272372543811798, + "learning_rate": 5.9982674276396256e-05, + "loss": 3.8482, + "step": 47140 + }, + { + "epoch": 3.2032205462698737, + "grad_norm": 0.14123594760894775, + "learning_rate": 5.997842777551298e-05, + "loss": 3.8372, + "step": 47145 + }, + { + "epoch": 3.2035602663405354, + "grad_norm": 0.16468963027000427, + "learning_rate": 5.9974181274629705e-05, + "loss": 3.7864, + "step": 47150 + }, + { + "epoch": 3.203899986411197, + "grad_norm": 0.14044152200222015, + "learning_rate": 5.996993477374644e-05, + "loss": 4.0315, + "step": 47155 + }, + { + "epoch": 3.204239706481859, + "grad_norm": 0.3264308273792267, + "learning_rate": 5.996568827286316e-05, + "loss": 3.9228, + "step": 47160 + }, + { + "epoch": 3.2045794265525207, + "grad_norm": 0.2367425411939621, + "learning_rate": 5.996144177197989e-05, + "loss": 3.8192, + "step": 47165 + }, + { + "epoch": 3.2049191466231823, + "grad_norm": 0.18556901812553406, + "learning_rate": 5.9957195271096624e-05, + "loss": 4.0316, + "step": 47170 + }, + { + "epoch": 3.2052588666938444, + "grad_norm": 0.1371951550245285, + "learning_rate": 5.9952948770213345e-05, + "loss": 3.7365, + "step": 47175 + }, + { + "epoch": 3.205598586764506, + "grad_norm": 0.16801877319812775, + "learning_rate": 5.994870226933007e-05, + "loss": 3.9243, + "step": 47180 + }, + { + "epoch": 3.2059383068351677, + "grad_norm": 0.1700340062379837, + "learning_rate": 5.994445576844681e-05, + "loss": 3.6816, + "step": 47185 + }, + { + "epoch": 3.2062780269058297, + "grad_norm": 0.17648866772651672, + "learning_rate": 5.994020926756353e-05, + "loss": 3.9058, + "step": 47190 + }, + { + "epoch": 3.2066177469764914, + "grad_norm": 0.301749050617218, + "learning_rate": 5.993596276668025e-05, + "loss": 3.7582, + "step": 47195 + }, + { + "epoch": 3.206957467047153, + "grad_norm": 0.20606037974357605, + "learning_rate": 5.993171626579699e-05, + "loss": 3.6766, + "step": 47200 + }, + { + "epoch": 3.207297187117815, + "grad_norm": 0.2192257046699524, + "learning_rate": 5.992746976491371e-05, + "loss": 3.7902, + "step": 47205 + }, + { + "epoch": 3.2076369071884767, + "grad_norm": 0.14834310114383698, + "learning_rate": 5.9923223264030434e-05, + "loss": 3.999, + "step": 47210 + }, + { + "epoch": 3.2079766272591383, + "grad_norm": 0.16724276542663574, + "learning_rate": 5.991897676314717e-05, + "loss": 3.645, + "step": 47215 + }, + { + "epoch": 3.2083163473298004, + "grad_norm": 0.2269490659236908, + "learning_rate": 5.99147302622639e-05, + "loss": 3.7144, + "step": 47220 + }, + { + "epoch": 3.208656067400462, + "grad_norm": 0.7690162658691406, + "learning_rate": 5.991048376138062e-05, + "loss": 3.9277, + "step": 47225 + }, + { + "epoch": 3.2089957874711237, + "grad_norm": 0.19216027855873108, + "learning_rate": 5.990623726049735e-05, + "loss": 3.6236, + "step": 47230 + }, + { + "epoch": 3.2093355075417858, + "grad_norm": 0.19952143728733063, + "learning_rate": 5.990199075961408e-05, + "loss": 4.1183, + "step": 47235 + }, + { + "epoch": 3.2096752276124474, + "grad_norm": 0.1711655855178833, + "learning_rate": 5.98977442587308e-05, + "loss": 3.8987, + "step": 47240 + }, + { + "epoch": 3.210014947683109, + "grad_norm": 0.16087694466114044, + "learning_rate": 5.989349775784754e-05, + "loss": 3.9449, + "step": 47245 + }, + { + "epoch": 3.210354667753771, + "grad_norm": 0.19812385737895966, + "learning_rate": 5.9889251256964265e-05, + "loss": 4.0045, + "step": 47250 + }, + { + "epoch": 3.2106943878244327, + "grad_norm": 0.1898788958787918, + "learning_rate": 5.9885004756080986e-05, + "loss": 3.7969, + "step": 47255 + }, + { + "epoch": 3.2110341078950944, + "grad_norm": 0.17915144562721252, + "learning_rate": 5.988075825519772e-05, + "loss": 3.963, + "step": 47260 + }, + { + "epoch": 3.2113738279657564, + "grad_norm": 0.1971270889043808, + "learning_rate": 5.987651175431445e-05, + "loss": 3.8014, + "step": 47265 + }, + { + "epoch": 3.211713548036418, + "grad_norm": 0.13854359090328217, + "learning_rate": 5.987226525343117e-05, + "loss": 3.8996, + "step": 47270 + }, + { + "epoch": 3.2120532681070797, + "grad_norm": 0.21675635874271393, + "learning_rate": 5.9868018752547905e-05, + "loss": 3.9834, + "step": 47275 + }, + { + "epoch": 3.2123929881777418, + "grad_norm": 0.1900162249803543, + "learning_rate": 5.9863772251664627e-05, + "loss": 3.9926, + "step": 47280 + }, + { + "epoch": 3.2127327082484034, + "grad_norm": 0.17641477286815643, + "learning_rate": 5.985952575078136e-05, + "loss": 3.7508, + "step": 47285 + }, + { + "epoch": 3.213072428319065, + "grad_norm": 0.19892704486846924, + "learning_rate": 5.985527924989809e-05, + "loss": 3.8825, + "step": 47290 + }, + { + "epoch": 3.2134121483897267, + "grad_norm": 0.15533363819122314, + "learning_rate": 5.985103274901481e-05, + "loss": 3.7491, + "step": 47295 + }, + { + "epoch": 3.2137518684603887, + "grad_norm": 0.23414346575737, + "learning_rate": 5.9846786248131545e-05, + "loss": 4.0257, + "step": 47300 + }, + { + "epoch": 3.2140915885310504, + "grad_norm": 0.14785437285900116, + "learning_rate": 5.984253974724827e-05, + "loss": 3.5726, + "step": 47305 + }, + { + "epoch": 3.214431308601712, + "grad_norm": 0.16865816712379456, + "learning_rate": 5.9838293246364995e-05, + "loss": 3.5593, + "step": 47310 + }, + { + "epoch": 3.214771028672374, + "grad_norm": 1.496704339981079, + "learning_rate": 5.983404674548173e-05, + "loss": 3.9104, + "step": 47315 + }, + { + "epoch": 3.2151107487430357, + "grad_norm": 0.17531566321849823, + "learning_rate": 5.982980024459846e-05, + "loss": 3.9634, + "step": 47320 + }, + { + "epoch": 3.2154504688136973, + "grad_norm": 0.18621721863746643, + "learning_rate": 5.982555374371518e-05, + "loss": 3.9826, + "step": 47325 + }, + { + "epoch": 3.2157901888843594, + "grad_norm": 0.23189616203308105, + "learning_rate": 5.982130724283191e-05, + "loss": 3.6534, + "step": 47330 + }, + { + "epoch": 3.216129908955021, + "grad_norm": 0.1752248853445053, + "learning_rate": 5.981706074194864e-05, + "loss": 3.7674, + "step": 47335 + }, + { + "epoch": 3.2164696290256827, + "grad_norm": 0.35219869017601013, + "learning_rate": 5.981281424106536e-05, + "loss": 3.7565, + "step": 47340 + }, + { + "epoch": 3.2168093490963448, + "grad_norm": 0.21947355568408966, + "learning_rate": 5.98085677401821e-05, + "loss": 3.6801, + "step": 47345 + }, + { + "epoch": 3.2171490691670064, + "grad_norm": 0.17522940039634705, + "learning_rate": 5.980432123929882e-05, + "loss": 3.821, + "step": 47350 + }, + { + "epoch": 3.217488789237668, + "grad_norm": 0.14637401700019836, + "learning_rate": 5.980007473841555e-05, + "loss": 3.7665, + "step": 47355 + }, + { + "epoch": 3.21782850930833, + "grad_norm": 0.15536507964134216, + "learning_rate": 5.979582823753228e-05, + "loss": 4.0717, + "step": 47360 + }, + { + "epoch": 3.2181682293789917, + "grad_norm": 0.19388175010681152, + "learning_rate": 5.9791581736649e-05, + "loss": 3.6606, + "step": 47365 + }, + { + "epoch": 3.2185079494496533, + "grad_norm": 0.1526719182729721, + "learning_rate": 5.978733523576573e-05, + "loss": 3.9143, + "step": 47370 + }, + { + "epoch": 3.2188476695203154, + "grad_norm": 0.16199298202991486, + "learning_rate": 5.9783088734882465e-05, + "loss": 3.7191, + "step": 47375 + }, + { + "epoch": 3.219187389590977, + "grad_norm": 0.1307421773672104, + "learning_rate": 5.977884223399919e-05, + "loss": 4.0299, + "step": 47380 + }, + { + "epoch": 3.2195271096616387, + "grad_norm": 0.20905861258506775, + "learning_rate": 5.9774595733115915e-05, + "loss": 3.5075, + "step": 47385 + }, + { + "epoch": 3.2198668297323008, + "grad_norm": 0.12996767461299896, + "learning_rate": 5.977034923223265e-05, + "loss": 4.0493, + "step": 47390 + }, + { + "epoch": 3.2202065498029624, + "grad_norm": 0.20528572797775269, + "learning_rate": 5.976610273134937e-05, + "loss": 3.9621, + "step": 47395 + }, + { + "epoch": 3.220546269873624, + "grad_norm": 0.15554717183113098, + "learning_rate": 5.97618562304661e-05, + "loss": 3.7642, + "step": 47400 + }, + { + "epoch": 3.220885989944286, + "grad_norm": 0.2637736201286316, + "learning_rate": 5.9757609729582833e-05, + "loss": 3.7858, + "step": 47405 + }, + { + "epoch": 3.2212257100149477, + "grad_norm": 0.17289984226226807, + "learning_rate": 5.9753363228699555e-05, + "loss": 3.8941, + "step": 47410 + }, + { + "epoch": 3.2215654300856094, + "grad_norm": 0.14722007513046265, + "learning_rate": 5.9749116727816276e-05, + "loss": 3.9039, + "step": 47415 + }, + { + "epoch": 3.2219051501562714, + "grad_norm": 0.1958538293838501, + "learning_rate": 5.974487022693301e-05, + "loss": 3.7545, + "step": 47420 + }, + { + "epoch": 3.222244870226933, + "grad_norm": 0.1842440366744995, + "learning_rate": 5.974062372604974e-05, + "loss": 3.6962, + "step": 47425 + }, + { + "epoch": 3.2225845902975947, + "grad_norm": 0.24751146137714386, + "learning_rate": 5.973637722516646e-05, + "loss": 3.7984, + "step": 47430 + }, + { + "epoch": 3.2229243103682563, + "grad_norm": 0.2066965401172638, + "learning_rate": 5.9732130724283195e-05, + "loss": 3.8646, + "step": 47435 + }, + { + "epoch": 3.2232640304389184, + "grad_norm": 0.22982840240001678, + "learning_rate": 5.972788422339992e-05, + "loss": 3.7538, + "step": 47440 + }, + { + "epoch": 3.22360375050958, + "grad_norm": 0.15331166982650757, + "learning_rate": 5.9723637722516644e-05, + "loss": 3.9558, + "step": 47445 + }, + { + "epoch": 3.2239434705802417, + "grad_norm": 1.0078892707824707, + "learning_rate": 5.971939122163338e-05, + "loss": 3.7792, + "step": 47450 + }, + { + "epoch": 3.2242831906509037, + "grad_norm": 0.15630826354026794, + "learning_rate": 5.971514472075011e-05, + "loss": 3.8825, + "step": 47455 + }, + { + "epoch": 3.2246229107215654, + "grad_norm": 0.20803815126419067, + "learning_rate": 5.971089821986683e-05, + "loss": 3.9679, + "step": 47460 + }, + { + "epoch": 3.224962630792227, + "grad_norm": 0.16960996389389038, + "learning_rate": 5.970665171898356e-05, + "loss": 3.9302, + "step": 47465 + }, + { + "epoch": 3.225302350862889, + "grad_norm": 0.16232216358184814, + "learning_rate": 5.970240521810029e-05, + "loss": 3.812, + "step": 47470 + }, + { + "epoch": 3.2256420709335507, + "grad_norm": 1.3846189975738525, + "learning_rate": 5.969815871721701e-05, + "loss": 3.8019, + "step": 47475 + }, + { + "epoch": 3.2259817910042123, + "grad_norm": 0.19805188477039337, + "learning_rate": 5.969391221633375e-05, + "loss": 3.9586, + "step": 47480 + }, + { + "epoch": 3.2263215110748744, + "grad_norm": 0.16832207143306732, + "learning_rate": 5.968966571545047e-05, + "loss": 4.0661, + "step": 47485 + }, + { + "epoch": 3.226661231145536, + "grad_norm": 0.1786240041255951, + "learning_rate": 5.9685419214567196e-05, + "loss": 3.9147, + "step": 47490 + }, + { + "epoch": 3.2270009512161977, + "grad_norm": 0.2033698409795761, + "learning_rate": 5.968117271368393e-05, + "loss": 3.6194, + "step": 47495 + }, + { + "epoch": 3.2273406712868598, + "grad_norm": 0.22637829184532166, + "learning_rate": 5.967692621280065e-05, + "loss": 3.481, + "step": 47500 + }, + { + "epoch": 3.2276803913575214, + "grad_norm": 0.16686411201953888, + "learning_rate": 5.967267971191738e-05, + "loss": 3.9995, + "step": 47505 + }, + { + "epoch": 3.228020111428183, + "grad_norm": 2.1442155838012695, + "learning_rate": 5.9668433211034115e-05, + "loss": 3.8519, + "step": 47510 + }, + { + "epoch": 3.228359831498845, + "grad_norm": 0.1880173534154892, + "learning_rate": 5.9664186710150836e-05, + "loss": 4.1078, + "step": 47515 + }, + { + "epoch": 3.2286995515695067, + "grad_norm": 0.19720159471035004, + "learning_rate": 5.9659940209267564e-05, + "loss": 3.5293, + "step": 47520 + }, + { + "epoch": 3.2290392716401684, + "grad_norm": 0.19619116187095642, + "learning_rate": 5.96556937083843e-05, + "loss": 3.8598, + "step": 47525 + }, + { + "epoch": 3.2293789917108304, + "grad_norm": 0.21220353245735168, + "learning_rate": 5.965144720750102e-05, + "loss": 3.866, + "step": 47530 + }, + { + "epoch": 3.229718711781492, + "grad_norm": 0.17942388355731964, + "learning_rate": 5.964720070661775e-05, + "loss": 3.5291, + "step": 47535 + }, + { + "epoch": 3.2300584318521537, + "grad_norm": 0.21897025406360626, + "learning_rate": 5.964295420573448e-05, + "loss": 3.8226, + "step": 47540 + }, + { + "epoch": 3.2303981519228158, + "grad_norm": 0.15525367856025696, + "learning_rate": 5.9638707704851204e-05, + "loss": 3.818, + "step": 47545 + }, + { + "epoch": 3.2307378719934774, + "grad_norm": 0.38602396845817566, + "learning_rate": 5.9634461203967925e-05, + "loss": 3.6532, + "step": 47550 + }, + { + "epoch": 3.231077592064139, + "grad_norm": 5.078938007354736, + "learning_rate": 5.963021470308467e-05, + "loss": 4.059, + "step": 47555 + }, + { + "epoch": 3.231417312134801, + "grad_norm": 0.18466627597808838, + "learning_rate": 5.962596820220139e-05, + "loss": 3.7124, + "step": 47560 + }, + { + "epoch": 3.2317570322054627, + "grad_norm": 0.17687830328941345, + "learning_rate": 5.962172170131811e-05, + "loss": 3.7801, + "step": 47565 + }, + { + "epoch": 3.2320967522761244, + "grad_norm": 0.14797689020633698, + "learning_rate": 5.9617475200434844e-05, + "loss": 3.8563, + "step": 47570 + }, + { + "epoch": 3.2324364723467864, + "grad_norm": 0.17272256314754486, + "learning_rate": 5.961322869955157e-05, + "loss": 3.8921, + "step": 47575 + }, + { + "epoch": 3.232776192417448, + "grad_norm": 0.1795654445886612, + "learning_rate": 5.9608982198668293e-05, + "loss": 3.9615, + "step": 47580 + }, + { + "epoch": 3.2331159124881097, + "grad_norm": 0.1819065511226654, + "learning_rate": 5.960473569778503e-05, + "loss": 3.9875, + "step": 47585 + }, + { + "epoch": 3.2334556325587718, + "grad_norm": 0.2791818678379059, + "learning_rate": 5.9600489196901756e-05, + "loss": 4.0749, + "step": 47590 + }, + { + "epoch": 3.2337953526294334, + "grad_norm": 0.6176039576530457, + "learning_rate": 5.959624269601848e-05, + "loss": 3.8471, + "step": 47595 + }, + { + "epoch": 3.234135072700095, + "grad_norm": 0.15958864986896515, + "learning_rate": 5.959199619513521e-05, + "loss": 4.1653, + "step": 47600 + }, + { + "epoch": 3.234474792770757, + "grad_norm": 0.2551652491092682, + "learning_rate": 5.958774969425194e-05, + "loss": 3.9431, + "step": 47605 + }, + { + "epoch": 3.2348145128414187, + "grad_norm": 0.1629408746957779, + "learning_rate": 5.958350319336866e-05, + "loss": 3.7522, + "step": 47610 + }, + { + "epoch": 3.2351542329120804, + "grad_norm": 0.2595336437225342, + "learning_rate": 5.9579256692485396e-05, + "loss": 3.8661, + "step": 47615 + }, + { + "epoch": 3.2354939529827424, + "grad_norm": 0.13118018209934235, + "learning_rate": 5.957501019160212e-05, + "loss": 3.933, + "step": 47620 + }, + { + "epoch": 3.235833673053404, + "grad_norm": 0.21010389924049377, + "learning_rate": 5.957076369071886e-05, + "loss": 3.6986, + "step": 47625 + }, + { + "epoch": 3.2361733931240657, + "grad_norm": 0.14555995166301727, + "learning_rate": 5.956651718983558e-05, + "loss": 3.7536, + "step": 47630 + }, + { + "epoch": 3.2365131131947273, + "grad_norm": 0.6246963739395142, + "learning_rate": 5.95622706889523e-05, + "loss": 3.886, + "step": 47635 + }, + { + "epoch": 3.2368528332653894, + "grad_norm": 0.14268851280212402, + "learning_rate": 5.9558024188069036e-05, + "loss": 3.8211, + "step": 47640 + }, + { + "epoch": 3.237192553336051, + "grad_norm": 0.16929416358470917, + "learning_rate": 5.9553777687185764e-05, + "loss": 3.9118, + "step": 47645 + }, + { + "epoch": 3.2375322734067127, + "grad_norm": 0.1686500906944275, + "learning_rate": 5.9549531186302486e-05, + "loss": 3.7962, + "step": 47650 + }, + { + "epoch": 3.2378719934773748, + "grad_norm": 0.21716561913490295, + "learning_rate": 5.954528468541922e-05, + "loss": 3.8232, + "step": 47655 + }, + { + "epoch": 3.2382117135480364, + "grad_norm": 0.17851249873638153, + "learning_rate": 5.954103818453595e-05, + "loss": 4.0324, + "step": 47660 + }, + { + "epoch": 3.238551433618698, + "grad_norm": 0.2312621921300888, + "learning_rate": 5.953679168365267e-05, + "loss": 3.929, + "step": 47665 + }, + { + "epoch": 3.23889115368936, + "grad_norm": 0.14643417298793793, + "learning_rate": 5.9532545182769404e-05, + "loss": 3.8799, + "step": 47670 + }, + { + "epoch": 3.2392308737600217, + "grad_norm": 0.20119041204452515, + "learning_rate": 5.952829868188613e-05, + "loss": 3.8917, + "step": 47675 + }, + { + "epoch": 3.2395705938306834, + "grad_norm": 0.16836419701576233, + "learning_rate": 5.9524052181002854e-05, + "loss": 3.8439, + "step": 47680 + }, + { + "epoch": 3.2399103139013454, + "grad_norm": 0.18801723420619965, + "learning_rate": 5.951980568011959e-05, + "loss": 4.1075, + "step": 47685 + }, + { + "epoch": 3.240250033972007, + "grad_norm": 0.18953293561935425, + "learning_rate": 5.9515559179236316e-05, + "loss": 3.7808, + "step": 47690 + }, + { + "epoch": 3.2405897540426687, + "grad_norm": 0.2024488002061844, + "learning_rate": 5.951131267835304e-05, + "loss": 4.2118, + "step": 47695 + }, + { + "epoch": 3.2409294741133308, + "grad_norm": 0.18040552735328674, + "learning_rate": 5.950706617746977e-05, + "loss": 3.7861, + "step": 47700 + }, + { + "epoch": 3.2412691941839924, + "grad_norm": 0.1534203141927719, + "learning_rate": 5.9502819676586494e-05, + "loss": 3.9203, + "step": 47705 + }, + { + "epoch": 3.241608914254654, + "grad_norm": 0.17509417235851288, + "learning_rate": 5.949857317570322e-05, + "loss": 4.0528, + "step": 47710 + }, + { + "epoch": 3.241948634325316, + "grad_norm": 0.17901167273521423, + "learning_rate": 5.9494326674819956e-05, + "loss": 3.7922, + "step": 47715 + }, + { + "epoch": 3.2422883543959777, + "grad_norm": 0.22663116455078125, + "learning_rate": 5.949008017393668e-05, + "loss": 3.9561, + "step": 47720 + }, + { + "epoch": 3.2426280744666394, + "grad_norm": 0.1736420840024948, + "learning_rate": 5.9485833673053406e-05, + "loss": 3.7984, + "step": 47725 + }, + { + "epoch": 3.2429677945373014, + "grad_norm": 0.1787383109331131, + "learning_rate": 5.948158717217014e-05, + "loss": 3.781, + "step": 47730 + }, + { + "epoch": 3.243307514607963, + "grad_norm": 0.19541236758232117, + "learning_rate": 5.947734067128686e-05, + "loss": 3.8964, + "step": 47735 + }, + { + "epoch": 3.2436472346786247, + "grad_norm": 0.17943920195102692, + "learning_rate": 5.947309417040359e-05, + "loss": 3.9547, + "step": 47740 + }, + { + "epoch": 3.2439869547492868, + "grad_norm": 0.22944623231887817, + "learning_rate": 5.9468847669520324e-05, + "loss": 3.9069, + "step": 47745 + }, + { + "epoch": 3.2443266748199484, + "grad_norm": 0.3070072531700134, + "learning_rate": 5.9464601168637046e-05, + "loss": 3.7507, + "step": 47750 + }, + { + "epoch": 3.24466639489061, + "grad_norm": 0.21330197155475616, + "learning_rate": 5.9460354667753774e-05, + "loss": 4.0456, + "step": 47755 + }, + { + "epoch": 3.245006114961272, + "grad_norm": 0.21161994338035583, + "learning_rate": 5.945610816687051e-05, + "loss": 3.8679, + "step": 47760 + }, + { + "epoch": 3.2453458350319337, + "grad_norm": 1.0202349424362183, + "learning_rate": 5.945186166598723e-05, + "loss": 3.949, + "step": 47765 + }, + { + "epoch": 3.2456855551025954, + "grad_norm": 0.15466104447841644, + "learning_rate": 5.944761516510395e-05, + "loss": 3.9334, + "step": 47770 + }, + { + "epoch": 3.246025275173257, + "grad_norm": 0.16482563316822052, + "learning_rate": 5.9443368664220686e-05, + "loss": 3.8361, + "step": 47775 + }, + { + "epoch": 3.246364995243919, + "grad_norm": 0.16672585904598236, + "learning_rate": 5.9439122163337414e-05, + "loss": 3.8604, + "step": 47780 + }, + { + "epoch": 3.2467047153145807, + "grad_norm": 0.16584858298301697, + "learning_rate": 5.9434875662454135e-05, + "loss": 3.6071, + "step": 47785 + }, + { + "epoch": 3.2470444353852423, + "grad_norm": 0.17524319887161255, + "learning_rate": 5.943062916157087e-05, + "loss": 3.8052, + "step": 47790 + }, + { + "epoch": 3.2473841554559044, + "grad_norm": 0.19086621701717377, + "learning_rate": 5.94263826606876e-05, + "loss": 3.7625, + "step": 47795 + }, + { + "epoch": 3.247723875526566, + "grad_norm": 0.3336789011955261, + "learning_rate": 5.942213615980432e-05, + "loss": 3.8688, + "step": 47800 + }, + { + "epoch": 3.2480635955972277, + "grad_norm": 0.2135232388973236, + "learning_rate": 5.9417889658921054e-05, + "loss": 3.7292, + "step": 47805 + }, + { + "epoch": 3.2484033156678898, + "grad_norm": 0.19523490965366364, + "learning_rate": 5.941364315803778e-05, + "loss": 3.9863, + "step": 47810 + }, + { + "epoch": 3.2487430357385514, + "grad_norm": 0.17495489120483398, + "learning_rate": 5.94093966571545e-05, + "loss": 3.7933, + "step": 47815 + }, + { + "epoch": 3.249082755809213, + "grad_norm": 0.9625658988952637, + "learning_rate": 5.940515015627124e-05, + "loss": 4.2075, + "step": 47820 + }, + { + "epoch": 3.249422475879875, + "grad_norm": 0.2258571982383728, + "learning_rate": 5.9400903655387966e-05, + "loss": 3.7537, + "step": 47825 + }, + { + "epoch": 3.2497621959505367, + "grad_norm": 0.183134987950325, + "learning_rate": 5.939665715450469e-05, + "loss": 3.7875, + "step": 47830 + }, + { + "epoch": 3.2501019160211984, + "grad_norm": 0.19675828516483307, + "learning_rate": 5.939241065362142e-05, + "loss": 3.8727, + "step": 47835 + }, + { + "epoch": 3.2504416360918604, + "grad_norm": 0.1834293007850647, + "learning_rate": 5.938816415273814e-05, + "loss": 3.9594, + "step": 47840 + }, + { + "epoch": 3.250781356162522, + "grad_norm": 0.15169797837734222, + "learning_rate": 5.938391765185487e-05, + "loss": 3.8457, + "step": 47845 + }, + { + "epoch": 3.2511210762331837, + "grad_norm": 0.16990143060684204, + "learning_rate": 5.9379671150971606e-05, + "loss": 4.0442, + "step": 47850 + }, + { + "epoch": 3.2514607963038458, + "grad_norm": 0.24196046590805054, + "learning_rate": 5.937542465008833e-05, + "loss": 4.0098, + "step": 47855 + }, + { + "epoch": 3.2518005163745074, + "grad_norm": 0.1206011101603508, + "learning_rate": 5.9371178149205055e-05, + "loss": 3.8739, + "step": 47860 + }, + { + "epoch": 3.252140236445169, + "grad_norm": 0.1716797947883606, + "learning_rate": 5.936693164832179e-05, + "loss": 3.5741, + "step": 47865 + }, + { + "epoch": 3.252479956515831, + "grad_norm": 0.1576320379972458, + "learning_rate": 5.936268514743851e-05, + "loss": 3.798, + "step": 47870 + }, + { + "epoch": 3.2528196765864927, + "grad_norm": 0.20539210736751556, + "learning_rate": 5.935843864655524e-05, + "loss": 3.6988, + "step": 47875 + }, + { + "epoch": 3.2531593966571544, + "grad_norm": 0.16741874814033508, + "learning_rate": 5.9354192145671974e-05, + "loss": 3.9836, + "step": 47880 + }, + { + "epoch": 3.2534991167278164, + "grad_norm": 0.15438419580459595, + "learning_rate": 5.9349945644788695e-05, + "loss": 3.8407, + "step": 47885 + }, + { + "epoch": 3.253838836798478, + "grad_norm": 6.924149513244629, + "learning_rate": 5.934569914390542e-05, + "loss": 4.0007, + "step": 47890 + }, + { + "epoch": 3.2541785568691397, + "grad_norm": 0.18766526877880096, + "learning_rate": 5.934145264302216e-05, + "loss": 3.8833, + "step": 47895 + }, + { + "epoch": 3.254518276939802, + "grad_norm": 0.6869513988494873, + "learning_rate": 5.933720614213888e-05, + "loss": 3.7407, + "step": 47900 + }, + { + "epoch": 3.2548579970104634, + "grad_norm": 0.1960199624300003, + "learning_rate": 5.93329596412556e-05, + "loss": 3.8307, + "step": 47905 + }, + { + "epoch": 3.255197717081125, + "grad_norm": 0.18483267724514008, + "learning_rate": 5.9328713140372335e-05, + "loss": 3.7033, + "step": 47910 + }, + { + "epoch": 3.255537437151787, + "grad_norm": 0.1719682216644287, + "learning_rate": 5.932446663948906e-05, + "loss": 3.6636, + "step": 47915 + }, + { + "epoch": 3.2558771572224487, + "grad_norm": 0.17647510766983032, + "learning_rate": 5.9320220138605784e-05, + "loss": 4.2266, + "step": 47920 + }, + { + "epoch": 3.2562168772931104, + "grad_norm": 0.12417463958263397, + "learning_rate": 5.931597363772252e-05, + "loss": 3.8482, + "step": 47925 + }, + { + "epoch": 3.2565565973637725, + "grad_norm": 1.1339267492294312, + "learning_rate": 5.931172713683925e-05, + "loss": 4.0308, + "step": 47930 + }, + { + "epoch": 3.256896317434434, + "grad_norm": 0.3298845887184143, + "learning_rate": 5.930748063595597e-05, + "loss": 4.1716, + "step": 47935 + }, + { + "epoch": 3.2572360375050957, + "grad_norm": 0.18021966516971588, + "learning_rate": 5.93032341350727e-05, + "loss": 3.7737, + "step": 47940 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.16773675382137299, + "learning_rate": 5.929898763418943e-05, + "loss": 3.9121, + "step": 47945 + }, + { + "epoch": 3.2579154776464194, + "grad_norm": 0.16995100677013397, + "learning_rate": 5.929474113330615e-05, + "loss": 3.7898, + "step": 47950 + }, + { + "epoch": 3.258255197717081, + "grad_norm": 0.18475671112537384, + "learning_rate": 5.929049463242289e-05, + "loss": 3.8991, + "step": 47955 + }, + { + "epoch": 3.258594917787743, + "grad_norm": 0.19107544422149658, + "learning_rate": 5.9286248131539615e-05, + "loss": 4.0713, + "step": 47960 + }, + { + "epoch": 3.2589346378584048, + "grad_norm": 0.22932985424995422, + "learning_rate": 5.928200163065635e-05, + "loss": 3.7581, + "step": 47965 + }, + { + "epoch": 3.2592743579290664, + "grad_norm": 0.1867833435535431, + "learning_rate": 5.927775512977307e-05, + "loss": 3.9223, + "step": 47970 + }, + { + "epoch": 3.2596140779997285, + "grad_norm": 0.20802976191043854, + "learning_rate": 5.927350862888979e-05, + "loss": 3.7459, + "step": 47975 + }, + { + "epoch": 3.25995379807039, + "grad_norm": 0.16453808546066284, + "learning_rate": 5.9269262128006534e-05, + "loss": 3.6945, + "step": 47980 + }, + { + "epoch": 3.2602935181410517, + "grad_norm": 0.15995284914970398, + "learning_rate": 5.9265015627123255e-05, + "loss": 3.9799, + "step": 47985 + }, + { + "epoch": 3.2606332382117134, + "grad_norm": 0.1814463883638382, + "learning_rate": 5.9260769126239977e-05, + "loss": 3.8305, + "step": 47990 + }, + { + "epoch": 3.2609729582823754, + "grad_norm": 2.2995293140411377, + "learning_rate": 5.925652262535671e-05, + "loss": 4.0067, + "step": 47995 + }, + { + "epoch": 3.261312678353037, + "grad_norm": 0.1282157003879547, + "learning_rate": 5.925227612447344e-05, + "loss": 3.9621, + "step": 48000 + }, + { + "epoch": 3.2616523984236987, + "grad_norm": 0.17838600277900696, + "learning_rate": 5.924802962359016e-05, + "loss": 3.847, + "step": 48005 + }, + { + "epoch": 3.2619921184943608, + "grad_norm": 0.1556093692779541, + "learning_rate": 5.9243783122706895e-05, + "loss": 4.0238, + "step": 48010 + }, + { + "epoch": 3.2623318385650224, + "grad_norm": 0.21217413246631622, + "learning_rate": 5.923953662182362e-05, + "loss": 3.9096, + "step": 48015 + }, + { + "epoch": 3.262671558635684, + "grad_norm": 0.18045905232429504, + "learning_rate": 5.9235290120940345e-05, + "loss": 3.7832, + "step": 48020 + }, + { + "epoch": 3.263011278706346, + "grad_norm": 0.15328434109687805, + "learning_rate": 5.923104362005708e-05, + "loss": 4.0015, + "step": 48025 + }, + { + "epoch": 3.2633509987770077, + "grad_norm": 0.1397327482700348, + "learning_rate": 5.922679711917381e-05, + "loss": 3.9312, + "step": 48030 + }, + { + "epoch": 3.2636907188476694, + "grad_norm": 0.16765519976615906, + "learning_rate": 5.922255061829053e-05, + "loss": 3.8725, + "step": 48035 + }, + { + "epoch": 3.2640304389183314, + "grad_norm": 0.13093815743923187, + "learning_rate": 5.921830411740726e-05, + "loss": 3.9161, + "step": 48040 + }, + { + "epoch": 3.264370158988993, + "grad_norm": 0.17823366820812225, + "learning_rate": 5.9214057616523985e-05, + "loss": 3.7382, + "step": 48045 + }, + { + "epoch": 3.2647098790596547, + "grad_norm": 0.1945357322692871, + "learning_rate": 5.920981111564071e-05, + "loss": 4.0837, + "step": 48050 + }, + { + "epoch": 3.265049599130317, + "grad_norm": 0.19992974400520325, + "learning_rate": 5.920556461475745e-05, + "loss": 3.723, + "step": 48055 + }, + { + "epoch": 3.2653893192009784, + "grad_norm": 0.22778362035751343, + "learning_rate": 5.920131811387417e-05, + "loss": 3.717, + "step": 48060 + }, + { + "epoch": 3.26572903927164, + "grad_norm": 0.368291974067688, + "learning_rate": 5.91970716129909e-05, + "loss": 3.8497, + "step": 48065 + }, + { + "epoch": 3.266068759342302, + "grad_norm": 0.1769287884235382, + "learning_rate": 5.919282511210763e-05, + "loss": 3.9083, + "step": 48070 + }, + { + "epoch": 3.2664084794129638, + "grad_norm": 0.21705959737300873, + "learning_rate": 5.918857861122435e-05, + "loss": 3.8137, + "step": 48075 + }, + { + "epoch": 3.2667481994836254, + "grad_norm": 0.9549431204795837, + "learning_rate": 5.918433211034108e-05, + "loss": 3.7231, + "step": 48080 + }, + { + "epoch": 3.2670879195542875, + "grad_norm": 0.263150155544281, + "learning_rate": 5.9180085609457815e-05, + "loss": 3.7489, + "step": 48085 + }, + { + "epoch": 3.267427639624949, + "grad_norm": 0.19250573217868805, + "learning_rate": 5.917583910857454e-05, + "loss": 3.8863, + "step": 48090 + }, + { + "epoch": 3.2677673596956107, + "grad_norm": 0.19313500821590424, + "learning_rate": 5.9171592607691265e-05, + "loss": 3.8984, + "step": 48095 + }, + { + "epoch": 3.2681070797662723, + "grad_norm": 0.2035088986158371, + "learning_rate": 5.9167346106808e-05, + "loss": 3.9013, + "step": 48100 + }, + { + "epoch": 3.2684467998369344, + "grad_norm": 0.152478888630867, + "learning_rate": 5.916309960592472e-05, + "loss": 3.8774, + "step": 48105 + }, + { + "epoch": 3.268786519907596, + "grad_norm": 0.20528197288513184, + "learning_rate": 5.915885310504144e-05, + "loss": 3.8152, + "step": 48110 + }, + { + "epoch": 3.2691262399782577, + "grad_norm": 0.21950477361679077, + "learning_rate": 5.9154606604158183e-05, + "loss": 3.9557, + "step": 48115 + }, + { + "epoch": 3.2694659600489198, + "grad_norm": 0.13746026158332825, + "learning_rate": 5.9150360103274905e-05, + "loss": 3.9887, + "step": 48120 + }, + { + "epoch": 3.2698056801195814, + "grad_norm": 0.20348402857780457, + "learning_rate": 5.9146113602391626e-05, + "loss": 4.1266, + "step": 48125 + }, + { + "epoch": 3.270145400190243, + "grad_norm": 0.17019358277320862, + "learning_rate": 5.914186710150836e-05, + "loss": 3.5832, + "step": 48130 + }, + { + "epoch": 3.270485120260905, + "grad_norm": 0.765153169631958, + "learning_rate": 5.913762060062509e-05, + "loss": 3.7882, + "step": 48135 + }, + { + "epoch": 3.2708248403315667, + "grad_norm": 0.18737910687923431, + "learning_rate": 5.913337409974181e-05, + "loss": 3.8513, + "step": 48140 + }, + { + "epoch": 3.2711645604022284, + "grad_norm": 0.1730422079563141, + "learning_rate": 5.9129127598858545e-05, + "loss": 3.8998, + "step": 48145 + }, + { + "epoch": 3.2715042804728904, + "grad_norm": 0.3533150553703308, + "learning_rate": 5.912488109797527e-05, + "loss": 4.0171, + "step": 48150 + }, + { + "epoch": 3.271844000543552, + "grad_norm": 0.21554383635520935, + "learning_rate": 5.9120634597091994e-05, + "loss": 3.9812, + "step": 48155 + }, + { + "epoch": 3.2721837206142137, + "grad_norm": 0.16953045129776, + "learning_rate": 5.911638809620873e-05, + "loss": 3.6187, + "step": 48160 + }, + { + "epoch": 3.2725234406848758, + "grad_norm": 0.19496658444404602, + "learning_rate": 5.911214159532546e-05, + "loss": 3.9051, + "step": 48165 + }, + { + "epoch": 3.2728631607555374, + "grad_norm": 0.8683009743690491, + "learning_rate": 5.910789509444218e-05, + "loss": 3.6141, + "step": 48170 + }, + { + "epoch": 3.273202880826199, + "grad_norm": 0.21562959253787994, + "learning_rate": 5.910364859355891e-05, + "loss": 3.7055, + "step": 48175 + }, + { + "epoch": 3.273542600896861, + "grad_norm": 0.15049827098846436, + "learning_rate": 5.909940209267564e-05, + "loss": 3.7871, + "step": 48180 + }, + { + "epoch": 3.2738823209675227, + "grad_norm": 0.18523414433002472, + "learning_rate": 5.909515559179236e-05, + "loss": 3.8537, + "step": 48185 + }, + { + "epoch": 3.2742220410381844, + "grad_norm": 0.29626357555389404, + "learning_rate": 5.90909090909091e-05, + "loss": 3.8622, + "step": 48190 + }, + { + "epoch": 3.2745617611088464, + "grad_norm": 0.20052017271518707, + "learning_rate": 5.908666259002582e-05, + "loss": 3.9868, + "step": 48195 + }, + { + "epoch": 3.274901481179508, + "grad_norm": 0.21487222611904144, + "learning_rate": 5.9082416089142546e-05, + "loss": 3.8756, + "step": 48200 + }, + { + "epoch": 3.2752412012501697, + "grad_norm": 0.20672807097434998, + "learning_rate": 5.907816958825928e-05, + "loss": 3.7728, + "step": 48205 + }, + { + "epoch": 3.275580921320832, + "grad_norm": 0.20186084508895874, + "learning_rate": 5.9073923087376e-05, + "loss": 3.9407, + "step": 48210 + }, + { + "epoch": 3.2759206413914934, + "grad_norm": 0.18851883709430695, + "learning_rate": 5.906967658649273e-05, + "loss": 3.7711, + "step": 48215 + }, + { + "epoch": 3.276260361462155, + "grad_norm": 0.16818739473819733, + "learning_rate": 5.9065430085609465e-05, + "loss": 3.8901, + "step": 48220 + }, + { + "epoch": 3.276600081532817, + "grad_norm": 0.16532184183597565, + "learning_rate": 5.9061183584726186e-05, + "loss": 3.866, + "step": 48225 + }, + { + "epoch": 3.2769398016034788, + "grad_norm": 0.21322299540042877, + "learning_rate": 5.9056937083842914e-05, + "loss": 3.8193, + "step": 48230 + }, + { + "epoch": 3.2772795216741404, + "grad_norm": 0.2671998143196106, + "learning_rate": 5.905269058295965e-05, + "loss": 3.9498, + "step": 48235 + }, + { + "epoch": 3.2776192417448025, + "grad_norm": 0.20630984008312225, + "learning_rate": 5.904844408207637e-05, + "loss": 3.8797, + "step": 48240 + }, + { + "epoch": 3.277958961815464, + "grad_norm": 0.14929001033306122, + "learning_rate": 5.90441975811931e-05, + "loss": 4.1523, + "step": 48245 + }, + { + "epoch": 3.2782986818861257, + "grad_norm": 0.1633896678686142, + "learning_rate": 5.903995108030983e-05, + "loss": 3.8506, + "step": 48250 + }, + { + "epoch": 3.278638401956788, + "grad_norm": 0.17337633669376373, + "learning_rate": 5.9035704579426554e-05, + "loss": 3.7666, + "step": 48255 + }, + { + "epoch": 3.2789781220274494, + "grad_norm": 0.25487929582595825, + "learning_rate": 5.9031458078543275e-05, + "loss": 3.8077, + "step": 48260 + }, + { + "epoch": 3.279317842098111, + "grad_norm": 0.16817790269851685, + "learning_rate": 5.902721157766001e-05, + "loss": 3.7975, + "step": 48265 + }, + { + "epoch": 3.279657562168773, + "grad_norm": 0.168153777718544, + "learning_rate": 5.902296507677674e-05, + "loss": 3.8139, + "step": 48270 + }, + { + "epoch": 3.2799972822394348, + "grad_norm": 0.6060583591461182, + "learning_rate": 5.901871857589346e-05, + "loss": 3.7037, + "step": 48275 + }, + { + "epoch": 3.2803370023100964, + "grad_norm": 0.15782544016838074, + "learning_rate": 5.9014472075010194e-05, + "loss": 3.9518, + "step": 48280 + }, + { + "epoch": 3.2806767223807585, + "grad_norm": 0.17507416009902954, + "learning_rate": 5.901022557412692e-05, + "loss": 3.8413, + "step": 48285 + }, + { + "epoch": 3.28101644245142, + "grad_norm": 0.18312038481235504, + "learning_rate": 5.9005979073243643e-05, + "loss": 3.6588, + "step": 48290 + }, + { + "epoch": 3.2813561625220817, + "grad_norm": 0.3553847372531891, + "learning_rate": 5.900173257236038e-05, + "loss": 3.9446, + "step": 48295 + }, + { + "epoch": 3.281695882592744, + "grad_norm": 0.1961686611175537, + "learning_rate": 5.8997486071477106e-05, + "loss": 3.7414, + "step": 48300 + }, + { + "epoch": 3.2820356026634054, + "grad_norm": 0.20853859186172485, + "learning_rate": 5.899323957059384e-05, + "loss": 3.8202, + "step": 48305 + }, + { + "epoch": 3.282375322734067, + "grad_norm": 0.18239545822143555, + "learning_rate": 5.898899306971056e-05, + "loss": 3.9811, + "step": 48310 + }, + { + "epoch": 3.282715042804729, + "grad_norm": 0.2690362334251404, + "learning_rate": 5.898474656882729e-05, + "loss": 3.9181, + "step": 48315 + }, + { + "epoch": 3.2830547628753908, + "grad_norm": 0.2055128663778305, + "learning_rate": 5.8980500067944025e-05, + "loss": 3.9582, + "step": 48320 + }, + { + "epoch": 3.2833944829460524, + "grad_norm": 0.19470731914043427, + "learning_rate": 5.8976253567060746e-05, + "loss": 3.8684, + "step": 48325 + }, + { + "epoch": 3.283734203016714, + "grad_norm": 0.16889357566833496, + "learning_rate": 5.897200706617747e-05, + "loss": 3.9066, + "step": 48330 + }, + { + "epoch": 3.284073923087376, + "grad_norm": 0.19849993288516998, + "learning_rate": 5.89677605652942e-05, + "loss": 3.819, + "step": 48335 + }, + { + "epoch": 3.2844136431580377, + "grad_norm": 0.19542619585990906, + "learning_rate": 5.896351406441093e-05, + "loss": 3.7785, + "step": 48340 + }, + { + "epoch": 3.2847533632286994, + "grad_norm": 0.15089237689971924, + "learning_rate": 5.895926756352765e-05, + "loss": 3.5967, + "step": 48345 + }, + { + "epoch": 3.2850930832993614, + "grad_norm": 0.1676180064678192, + "learning_rate": 5.8955021062644386e-05, + "loss": 3.9103, + "step": 48350 + }, + { + "epoch": 3.285432803370023, + "grad_norm": 0.17944885790348053, + "learning_rate": 5.8950774561761114e-05, + "loss": 3.777, + "step": 48355 + }, + { + "epoch": 3.2857725234406847, + "grad_norm": 0.17238572239875793, + "learning_rate": 5.8946528060877836e-05, + "loss": 3.808, + "step": 48360 + }, + { + "epoch": 3.286112243511347, + "grad_norm": 0.17745323479175568, + "learning_rate": 5.894228155999457e-05, + "loss": 3.8596, + "step": 48365 + }, + { + "epoch": 3.2864519635820084, + "grad_norm": 0.21909092366695404, + "learning_rate": 5.89380350591113e-05, + "loss": 3.873, + "step": 48370 + }, + { + "epoch": 3.28679168365267, + "grad_norm": 0.48958706855773926, + "learning_rate": 5.893378855822802e-05, + "loss": 3.9444, + "step": 48375 + }, + { + "epoch": 3.287131403723332, + "grad_norm": 0.18372277915477753, + "learning_rate": 5.8929542057344754e-05, + "loss": 3.9205, + "step": 48380 + }, + { + "epoch": 3.2874711237939938, + "grad_norm": 0.19455502927303314, + "learning_rate": 5.892529555646148e-05, + "loss": 3.929, + "step": 48385 + }, + { + "epoch": 3.2878108438646554, + "grad_norm": 0.22653725743293762, + "learning_rate": 5.8921049055578204e-05, + "loss": 3.832, + "step": 48390 + }, + { + "epoch": 3.2881505639353175, + "grad_norm": 0.13628943264484406, + "learning_rate": 5.891680255469494e-05, + "loss": 3.8249, + "step": 48395 + }, + { + "epoch": 3.288490284005979, + "grad_norm": 0.1806837022304535, + "learning_rate": 5.891255605381166e-05, + "loss": 3.8565, + "step": 48400 + }, + { + "epoch": 3.2888300040766407, + "grad_norm": 0.15982255339622498, + "learning_rate": 5.890830955292839e-05, + "loss": 4.2296, + "step": 48405 + }, + { + "epoch": 3.289169724147303, + "grad_norm": 0.14639416337013245, + "learning_rate": 5.890406305204512e-05, + "loss": 3.9163, + "step": 48410 + }, + { + "epoch": 3.2895094442179644, + "grad_norm": 1.6268407106399536, + "learning_rate": 5.8899816551161844e-05, + "loss": 3.7319, + "step": 48415 + }, + { + "epoch": 3.289849164288626, + "grad_norm": 0.24444080889225006, + "learning_rate": 5.889557005027857e-05, + "loss": 3.7232, + "step": 48420 + }, + { + "epoch": 3.290188884359288, + "grad_norm": 0.160862535238266, + "learning_rate": 5.8891323549395306e-05, + "loss": 3.8076, + "step": 48425 + }, + { + "epoch": 3.2905286044299498, + "grad_norm": 0.18145693838596344, + "learning_rate": 5.888707704851203e-05, + "loss": 3.9235, + "step": 48430 + }, + { + "epoch": 3.2908683245006114, + "grad_norm": 0.16448165476322174, + "learning_rate": 5.8882830547628756e-05, + "loss": 3.7562, + "step": 48435 + }, + { + "epoch": 3.291208044571273, + "grad_norm": 1.7213935852050781, + "learning_rate": 5.887858404674549e-05, + "loss": 4.1587, + "step": 48440 + }, + { + "epoch": 3.291547764641935, + "grad_norm": 0.20659470558166504, + "learning_rate": 5.887433754586221e-05, + "loss": 4.169, + "step": 48445 + }, + { + "epoch": 3.2918874847125967, + "grad_norm": 0.17514508962631226, + "learning_rate": 5.887009104497894e-05, + "loss": 4.1866, + "step": 48450 + }, + { + "epoch": 3.2922272047832584, + "grad_norm": 0.23320895433425903, + "learning_rate": 5.8865844544095674e-05, + "loss": 3.4819, + "step": 48455 + }, + { + "epoch": 3.2925669248539204, + "grad_norm": 0.42951980233192444, + "learning_rate": 5.8861598043212396e-05, + "loss": 3.6829, + "step": 48460 + }, + { + "epoch": 3.292906644924582, + "grad_norm": 0.18157869577407837, + "learning_rate": 5.885735154232912e-05, + "loss": 3.832, + "step": 48465 + }, + { + "epoch": 3.2932463649952437, + "grad_norm": 0.44201937317848206, + "learning_rate": 5.885310504144586e-05, + "loss": 3.9956, + "step": 48470 + }, + { + "epoch": 3.2935860850659058, + "grad_norm": 0.16690756380558014, + "learning_rate": 5.884885854056258e-05, + "loss": 3.752, + "step": 48475 + }, + { + "epoch": 3.2939258051365674, + "grad_norm": 0.13377083837985992, + "learning_rate": 5.88446120396793e-05, + "loss": 3.918, + "step": 48480 + }, + { + "epoch": 3.294265525207229, + "grad_norm": 0.18971729278564453, + "learning_rate": 5.8840365538796036e-05, + "loss": 3.8599, + "step": 48485 + }, + { + "epoch": 3.294605245277891, + "grad_norm": 0.21699655055999756, + "learning_rate": 5.8836119037912764e-05, + "loss": 3.9329, + "step": 48490 + }, + { + "epoch": 3.2949449653485527, + "grad_norm": 0.1793157011270523, + "learning_rate": 5.8831872537029485e-05, + "loss": 3.7887, + "step": 48495 + }, + { + "epoch": 3.2952846854192144, + "grad_norm": 0.1656971275806427, + "learning_rate": 5.882762603614622e-05, + "loss": 3.7488, + "step": 48500 + }, + { + "epoch": 3.2956244054898765, + "grad_norm": 0.14456713199615479, + "learning_rate": 5.882337953526295e-05, + "loss": 3.9154, + "step": 48505 + }, + { + "epoch": 3.295964125560538, + "grad_norm": 0.2069406807422638, + "learning_rate": 5.881913303437967e-05, + "loss": 3.7578, + "step": 48510 + }, + { + "epoch": 3.2963038456311997, + "grad_norm": 0.268075555562973, + "learning_rate": 5.8814886533496404e-05, + "loss": 3.8541, + "step": 48515 + }, + { + "epoch": 3.296643565701862, + "grad_norm": 0.36184608936309814, + "learning_rate": 5.881064003261313e-05, + "loss": 3.6765, + "step": 48520 + }, + { + "epoch": 3.2969832857725234, + "grad_norm": 0.28893178701400757, + "learning_rate": 5.880639353172985e-05, + "loss": 3.6764, + "step": 48525 + }, + { + "epoch": 3.297323005843185, + "grad_norm": 0.22858348488807678, + "learning_rate": 5.880214703084659e-05, + "loss": 3.809, + "step": 48530 + }, + { + "epoch": 3.297662725913847, + "grad_norm": 0.1771659106016159, + "learning_rate": 5.879790052996331e-05, + "loss": 3.6168, + "step": 48535 + }, + { + "epoch": 3.2980024459845088, + "grad_norm": 0.17974600195884705, + "learning_rate": 5.879365402908004e-05, + "loss": 3.6568, + "step": 48540 + }, + { + "epoch": 3.2983421660551704, + "grad_norm": 0.24489247798919678, + "learning_rate": 5.878940752819677e-05, + "loss": 3.9402, + "step": 48545 + }, + { + "epoch": 3.2986818861258325, + "grad_norm": 0.2406006157398224, + "learning_rate": 5.878516102731349e-05, + "loss": 3.8366, + "step": 48550 + }, + { + "epoch": 3.299021606196494, + "grad_norm": 0.18410374224185944, + "learning_rate": 5.878091452643022e-05, + "loss": 3.9841, + "step": 48555 + }, + { + "epoch": 3.2993613262671557, + "grad_norm": 0.24973322451114655, + "learning_rate": 5.8776668025546956e-05, + "loss": 3.887, + "step": 48560 + }, + { + "epoch": 3.299701046337818, + "grad_norm": 0.15340076386928558, + "learning_rate": 5.877242152466368e-05, + "loss": 3.5575, + "step": 48565 + }, + { + "epoch": 3.3000407664084794, + "grad_norm": 0.16904686391353607, + "learning_rate": 5.8768175023780405e-05, + "loss": 3.9021, + "step": 48570 + }, + { + "epoch": 3.300380486479141, + "grad_norm": 0.20278184115886688, + "learning_rate": 5.876392852289714e-05, + "loss": 3.9403, + "step": 48575 + }, + { + "epoch": 3.300720206549803, + "grad_norm": 0.15489602088928223, + "learning_rate": 5.875968202201386e-05, + "loss": 3.8679, + "step": 48580 + }, + { + "epoch": 3.3010599266204648, + "grad_norm": 0.8360319137573242, + "learning_rate": 5.875543552113059e-05, + "loss": 3.8992, + "step": 48585 + }, + { + "epoch": 3.3013996466911264, + "grad_norm": 0.16348101198673248, + "learning_rate": 5.8751189020247324e-05, + "loss": 3.7201, + "step": 48590 + }, + { + "epoch": 3.3017393667617885, + "grad_norm": 0.16657091677188873, + "learning_rate": 5.8746942519364045e-05, + "loss": 3.8822, + "step": 48595 + }, + { + "epoch": 3.30207908683245, + "grad_norm": 0.17518039047718048, + "learning_rate": 5.8742696018480766e-05, + "loss": 3.7826, + "step": 48600 + }, + { + "epoch": 3.3024188069031117, + "grad_norm": 0.21338728070259094, + "learning_rate": 5.873844951759751e-05, + "loss": 3.8345, + "step": 48605 + }, + { + "epoch": 3.302758526973774, + "grad_norm": 0.22749367356300354, + "learning_rate": 5.873420301671423e-05, + "loss": 4.004, + "step": 48610 + }, + { + "epoch": 3.3030982470444354, + "grad_norm": 0.2270691692829132, + "learning_rate": 5.872995651583095e-05, + "loss": 4.0881, + "step": 48615 + }, + { + "epoch": 3.303437967115097, + "grad_norm": 0.18117700517177582, + "learning_rate": 5.8725710014947685e-05, + "loss": 3.9353, + "step": 48620 + }, + { + "epoch": 3.303777687185759, + "grad_norm": 0.1767934262752533, + "learning_rate": 5.872146351406441e-05, + "loss": 3.7166, + "step": 48625 + }, + { + "epoch": 3.304117407256421, + "grad_norm": 0.19818511605262756, + "learning_rate": 5.8717217013181134e-05, + "loss": 3.7982, + "step": 48630 + }, + { + "epoch": 3.3044571273270824, + "grad_norm": 0.17169497907161713, + "learning_rate": 5.871297051229787e-05, + "loss": 4.0191, + "step": 48635 + }, + { + "epoch": 3.3047968473977445, + "grad_norm": 0.20609350502490997, + "learning_rate": 5.87087240114146e-05, + "loss": 3.9824, + "step": 48640 + }, + { + "epoch": 3.305136567468406, + "grad_norm": 0.3904407024383545, + "learning_rate": 5.870447751053133e-05, + "loss": 3.9562, + "step": 48645 + }, + { + "epoch": 3.3054762875390677, + "grad_norm": 0.14562131464481354, + "learning_rate": 5.870023100964805e-05, + "loss": 3.9998, + "step": 48650 + }, + { + "epoch": 3.30581600760973, + "grad_norm": 0.14543545246124268, + "learning_rate": 5.869598450876478e-05, + "loss": 3.7569, + "step": 48655 + }, + { + "epoch": 3.3061557276803915, + "grad_norm": 0.1636316031217575, + "learning_rate": 5.8691738007881516e-05, + "loss": 3.9714, + "step": 48660 + }, + { + "epoch": 3.306495447751053, + "grad_norm": 0.1377502679824829, + "learning_rate": 5.868749150699824e-05, + "loss": 3.7379, + "step": 48665 + }, + { + "epoch": 3.3068351678217147, + "grad_norm": 0.2177603393793106, + "learning_rate": 5.8683245006114965e-05, + "loss": 3.8399, + "step": 48670 + }, + { + "epoch": 3.307174887892377, + "grad_norm": 0.14752893149852753, + "learning_rate": 5.86789985052317e-05, + "loss": 3.7245, + "step": 48675 + }, + { + "epoch": 3.3075146079630384, + "grad_norm": 0.1964385062456131, + "learning_rate": 5.867475200434842e-05, + "loss": 3.5975, + "step": 48680 + }, + { + "epoch": 3.3078543280337, + "grad_norm": 0.18387159705162048, + "learning_rate": 5.867050550346514e-05, + "loss": 3.7538, + "step": 48685 + }, + { + "epoch": 3.308194048104362, + "grad_norm": 0.15395745635032654, + "learning_rate": 5.866625900258188e-05, + "loss": 3.7806, + "step": 48690 + }, + { + "epoch": 3.3085337681750238, + "grad_norm": 0.1815858632326126, + "learning_rate": 5.8662012501698605e-05, + "loss": 3.905, + "step": 48695 + }, + { + "epoch": 3.3088734882456854, + "grad_norm": 0.1892780363559723, + "learning_rate": 5.8657766000815327e-05, + "loss": 3.9628, + "step": 48700 + }, + { + "epoch": 3.3092132083163475, + "grad_norm": 0.9456372261047363, + "learning_rate": 5.865351949993206e-05, + "loss": 3.6849, + "step": 48705 + }, + { + "epoch": 3.309552928387009, + "grad_norm": 0.18249820172786713, + "learning_rate": 5.864927299904879e-05, + "loss": 3.7721, + "step": 48710 + }, + { + "epoch": 3.3098926484576707, + "grad_norm": 0.1662641167640686, + "learning_rate": 5.864502649816551e-05, + "loss": 3.789, + "step": 48715 + }, + { + "epoch": 3.310232368528333, + "grad_norm": 0.14231382310390472, + "learning_rate": 5.8640779997282245e-05, + "loss": 3.6066, + "step": 48720 + }, + { + "epoch": 3.3105720885989944, + "grad_norm": 0.21637526154518127, + "learning_rate": 5.863653349639897e-05, + "loss": 4.0119, + "step": 48725 + }, + { + "epoch": 3.310911808669656, + "grad_norm": 0.22731351852416992, + "learning_rate": 5.8632286995515695e-05, + "loss": 3.841, + "step": 48730 + }, + { + "epoch": 3.311251528740318, + "grad_norm": 0.5546835660934448, + "learning_rate": 5.862804049463243e-05, + "loss": 3.8653, + "step": 48735 + }, + { + "epoch": 3.3115912488109798, + "grad_norm": 0.20072679221630096, + "learning_rate": 5.862379399374916e-05, + "loss": 3.9446, + "step": 48740 + }, + { + "epoch": 3.3119309688816414, + "grad_norm": 0.18616725504398346, + "learning_rate": 5.861954749286588e-05, + "loss": 3.7906, + "step": 48745 + }, + { + "epoch": 3.3122706889523035, + "grad_norm": 0.16799381375312805, + "learning_rate": 5.861530099198261e-05, + "loss": 4.0294, + "step": 48750 + }, + { + "epoch": 3.312610409022965, + "grad_norm": 0.32267695665359497, + "learning_rate": 5.8611054491099335e-05, + "loss": 3.6247, + "step": 48755 + }, + { + "epoch": 3.3129501290936267, + "grad_norm": 0.1754310578107834, + "learning_rate": 5.860680799021606e-05, + "loss": 3.8684, + "step": 48760 + }, + { + "epoch": 3.313289849164289, + "grad_norm": 0.17697443068027496, + "learning_rate": 5.86025614893328e-05, + "loss": 3.8268, + "step": 48765 + }, + { + "epoch": 3.3136295692349504, + "grad_norm": 0.2608926296234131, + "learning_rate": 5.859831498844952e-05, + "loss": 3.7382, + "step": 48770 + }, + { + "epoch": 3.313969289305612, + "grad_norm": 0.1371144950389862, + "learning_rate": 5.859406848756625e-05, + "loss": 3.8244, + "step": 48775 + }, + { + "epoch": 3.3143090093762737, + "grad_norm": 0.15089091658592224, + "learning_rate": 5.858982198668298e-05, + "loss": 3.8426, + "step": 48780 + }, + { + "epoch": 3.314648729446936, + "grad_norm": 0.23714905977249146, + "learning_rate": 5.85855754857997e-05, + "loss": 3.9298, + "step": 48785 + }, + { + "epoch": 3.3149884495175974, + "grad_norm": 0.1626071333885193, + "learning_rate": 5.858132898491643e-05, + "loss": 3.8667, + "step": 48790 + }, + { + "epoch": 3.315328169588259, + "grad_norm": 0.19087626039981842, + "learning_rate": 5.8577082484033165e-05, + "loss": 4.0233, + "step": 48795 + }, + { + "epoch": 3.315667889658921, + "grad_norm": 0.1761803925037384, + "learning_rate": 5.857283598314989e-05, + "loss": 3.771, + "step": 48800 + }, + { + "epoch": 3.3160076097295828, + "grad_norm": 0.20478491485118866, + "learning_rate": 5.8568589482266615e-05, + "loss": 3.9236, + "step": 48805 + }, + { + "epoch": 3.3163473298002444, + "grad_norm": 0.14276863634586334, + "learning_rate": 5.856434298138335e-05, + "loss": 3.8081, + "step": 48810 + }, + { + "epoch": 3.3166870498709065, + "grad_norm": 0.2859087288379669, + "learning_rate": 5.856009648050007e-05, + "loss": 3.8357, + "step": 48815 + }, + { + "epoch": 3.317026769941568, + "grad_norm": 0.1998935341835022, + "learning_rate": 5.855584997961679e-05, + "loss": 3.8595, + "step": 48820 + }, + { + "epoch": 3.3173664900122297, + "grad_norm": 0.20849105715751648, + "learning_rate": 5.855160347873353e-05, + "loss": 3.9926, + "step": 48825 + }, + { + "epoch": 3.317706210082892, + "grad_norm": 0.1619783490896225, + "learning_rate": 5.8547356977850255e-05, + "loss": 3.6778, + "step": 48830 + }, + { + "epoch": 3.3180459301535534, + "grad_norm": 0.14118485152721405, + "learning_rate": 5.8543110476966976e-05, + "loss": 4.118, + "step": 48835 + }, + { + "epoch": 3.318385650224215, + "grad_norm": 0.15780098736286163, + "learning_rate": 5.853886397608371e-05, + "loss": 4.0157, + "step": 48840 + }, + { + "epoch": 3.318725370294877, + "grad_norm": 0.9457326531410217, + "learning_rate": 5.853461747520044e-05, + "loss": 4.1035, + "step": 48845 + }, + { + "epoch": 3.3190650903655388, + "grad_norm": 0.169319748878479, + "learning_rate": 5.853037097431716e-05, + "loss": 3.5173, + "step": 48850 + }, + { + "epoch": 3.3194048104362004, + "grad_norm": 0.16876006126403809, + "learning_rate": 5.8526124473433895e-05, + "loss": 4.0118, + "step": 48855 + }, + { + "epoch": 3.3197445305068625, + "grad_norm": 0.2009626179933548, + "learning_rate": 5.852187797255062e-05, + "loss": 3.6099, + "step": 48860 + }, + { + "epoch": 3.320084250577524, + "grad_norm": 0.20830988883972168, + "learning_rate": 5.8517631471667344e-05, + "loss": 3.7746, + "step": 48865 + }, + { + "epoch": 3.3204239706481857, + "grad_norm": 0.15427206456661224, + "learning_rate": 5.851338497078408e-05, + "loss": 4.0904, + "step": 48870 + }, + { + "epoch": 3.320763690718848, + "grad_norm": 0.15760260820388794, + "learning_rate": 5.850913846990081e-05, + "loss": 3.8324, + "step": 48875 + }, + { + "epoch": 3.3211034107895094, + "grad_norm": 0.28320449590682983, + "learning_rate": 5.850489196901753e-05, + "loss": 3.785, + "step": 48880 + }, + { + "epoch": 3.321443130860171, + "grad_norm": 0.1755431741476059, + "learning_rate": 5.850064546813426e-05, + "loss": 3.847, + "step": 48885 + }, + { + "epoch": 3.321782850930833, + "grad_norm": 0.19847001135349274, + "learning_rate": 5.8496398967250984e-05, + "loss": 3.847, + "step": 48890 + }, + { + "epoch": 3.3221225710014948, + "grad_norm": 0.17585712671279907, + "learning_rate": 5.849215246636771e-05, + "loss": 3.8718, + "step": 48895 + }, + { + "epoch": 3.3224622910721564, + "grad_norm": 0.17470811307430267, + "learning_rate": 5.848790596548445e-05, + "loss": 3.8735, + "step": 48900 + }, + { + "epoch": 3.3228020111428185, + "grad_norm": 0.20870710909366608, + "learning_rate": 5.848365946460117e-05, + "loss": 3.8573, + "step": 48905 + }, + { + "epoch": 3.32314173121348, + "grad_norm": 0.15830712020397186, + "learning_rate": 5.8479412963717896e-05, + "loss": 4.1454, + "step": 48910 + }, + { + "epoch": 3.3234814512841417, + "grad_norm": 0.20893092453479767, + "learning_rate": 5.847516646283463e-05, + "loss": 3.936, + "step": 48915 + }, + { + "epoch": 3.323821171354804, + "grad_norm": 0.2507949769496918, + "learning_rate": 5.847091996195135e-05, + "loss": 3.8191, + "step": 48920 + }, + { + "epoch": 3.3241608914254654, + "grad_norm": 0.17696422338485718, + "learning_rate": 5.846667346106808e-05, + "loss": 3.9641, + "step": 48925 + }, + { + "epoch": 3.324500611496127, + "grad_norm": 0.3240679204463959, + "learning_rate": 5.8462426960184815e-05, + "loss": 3.8574, + "step": 48930 + }, + { + "epoch": 3.324840331566789, + "grad_norm": 0.22197964787483215, + "learning_rate": 5.8458180459301536e-05, + "loss": 3.8339, + "step": 48935 + }, + { + "epoch": 3.325180051637451, + "grad_norm": 0.19736348092556, + "learning_rate": 5.8453933958418264e-05, + "loss": 3.8834, + "step": 48940 + }, + { + "epoch": 3.3255197717081124, + "grad_norm": 0.32521557807922363, + "learning_rate": 5.8449687457535e-05, + "loss": 3.9745, + "step": 48945 + }, + { + "epoch": 3.3258594917787745, + "grad_norm": 0.20140758156776428, + "learning_rate": 5.844544095665172e-05, + "loss": 3.9131, + "step": 48950 + }, + { + "epoch": 3.326199211849436, + "grad_norm": 0.18191692233085632, + "learning_rate": 5.844119445576844e-05, + "loss": 4.0513, + "step": 48955 + }, + { + "epoch": 3.3265389319200978, + "grad_norm": 0.2359256148338318, + "learning_rate": 5.8436947954885176e-05, + "loss": 3.837, + "step": 48960 + }, + { + "epoch": 3.32687865199076, + "grad_norm": 0.3561190068721771, + "learning_rate": 5.8432701454001904e-05, + "loss": 4.0575, + "step": 48965 + }, + { + "epoch": 3.3272183720614215, + "grad_norm": 0.22406898438930511, + "learning_rate": 5.8428454953118625e-05, + "loss": 3.7518, + "step": 48970 + }, + { + "epoch": 3.327558092132083, + "grad_norm": 0.16110779345035553, + "learning_rate": 5.842420845223536e-05, + "loss": 3.5004, + "step": 48975 + }, + { + "epoch": 3.327897812202745, + "grad_norm": 0.1724957525730133, + "learning_rate": 5.841996195135209e-05, + "loss": 3.9292, + "step": 48980 + }, + { + "epoch": 3.328237532273407, + "grad_norm": 0.1752115935087204, + "learning_rate": 5.841571545046882e-05, + "loss": 3.823, + "step": 48985 + }, + { + "epoch": 3.3285772523440684, + "grad_norm": 0.19076082110404968, + "learning_rate": 5.8411468949585544e-05, + "loss": 3.7872, + "step": 48990 + }, + { + "epoch": 3.3289169724147305, + "grad_norm": 0.19559650123119354, + "learning_rate": 5.840722244870227e-05, + "loss": 3.8664, + "step": 48995 + }, + { + "epoch": 3.329256692485392, + "grad_norm": 0.20220321416854858, + "learning_rate": 5.840297594781901e-05, + "loss": 3.9437, + "step": 49000 + }, + { + "epoch": 3.3295964125560538, + "grad_norm": 0.24245603382587433, + "learning_rate": 5.839872944693573e-05, + "loss": 3.9129, + "step": 49005 + }, + { + "epoch": 3.3299361326267154, + "grad_norm": 0.18982231616973877, + "learning_rate": 5.8394482946052456e-05, + "loss": 3.9055, + "step": 49010 + }, + { + "epoch": 3.3302758526973775, + "grad_norm": 0.2805820405483246, + "learning_rate": 5.839023644516919e-05, + "loss": 3.7745, + "step": 49015 + }, + { + "epoch": 3.330615572768039, + "grad_norm": 0.26735639572143555, + "learning_rate": 5.838598994428591e-05, + "loss": 3.8611, + "step": 49020 + }, + { + "epoch": 3.3309552928387007, + "grad_norm": 0.22693628072738647, + "learning_rate": 5.8381743443402633e-05, + "loss": 4.0148, + "step": 49025 + }, + { + "epoch": 3.331295012909363, + "grad_norm": 0.17543505132198334, + "learning_rate": 5.8377496942519375e-05, + "loss": 4.0336, + "step": 49030 + }, + { + "epoch": 3.3316347329800244, + "grad_norm": 0.1580287516117096, + "learning_rate": 5.8373250441636096e-05, + "loss": 3.9533, + "step": 49035 + }, + { + "epoch": 3.331974453050686, + "grad_norm": 0.24122679233551025, + "learning_rate": 5.836900394075282e-05, + "loss": 3.944, + "step": 49040 + }, + { + "epoch": 3.332314173121348, + "grad_norm": 0.17511437833309174, + "learning_rate": 5.836475743986955e-05, + "loss": 3.8313, + "step": 49045 + }, + { + "epoch": 3.3326538931920098, + "grad_norm": 0.2220337837934494, + "learning_rate": 5.836051093898628e-05, + "loss": 3.9435, + "step": 49050 + }, + { + "epoch": 3.3329936132626714, + "grad_norm": 0.15457378327846527, + "learning_rate": 5.8356264438103e-05, + "loss": 3.9019, + "step": 49055 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.1562301516532898, + "learning_rate": 5.8352017937219736e-05, + "loss": 4.015, + "step": 49060 + }, + { + "epoch": 3.333673053403995, + "grad_norm": 0.22620204091072083, + "learning_rate": 5.8347771436336464e-05, + "loss": 3.5245, + "step": 49065 + }, + { + "epoch": 3.3340127734746567, + "grad_norm": 0.21638774871826172, + "learning_rate": 5.8343524935453186e-05, + "loss": 3.5946, + "step": 49070 + }, + { + "epoch": 3.334352493545319, + "grad_norm": 0.7048006653785706, + "learning_rate": 5.833927843456992e-05, + "loss": 3.8891, + "step": 49075 + }, + { + "epoch": 3.3346922136159804, + "grad_norm": 0.1832842230796814, + "learning_rate": 5.833503193368665e-05, + "loss": 3.6893, + "step": 49080 + }, + { + "epoch": 3.335031933686642, + "grad_norm": 0.17107200622558594, + "learning_rate": 5.833078543280337e-05, + "loss": 3.8858, + "step": 49085 + }, + { + "epoch": 3.335371653757304, + "grad_norm": 0.20355045795440674, + "learning_rate": 5.8326538931920104e-05, + "loss": 3.6305, + "step": 49090 + }, + { + "epoch": 3.335711373827966, + "grad_norm": 0.1755518764257431, + "learning_rate": 5.832229243103683e-05, + "loss": 4.2184, + "step": 49095 + }, + { + "epoch": 3.3360510938986274, + "grad_norm": 0.15636040270328522, + "learning_rate": 5.8318045930153554e-05, + "loss": 3.8891, + "step": 49100 + }, + { + "epoch": 3.3363908139692895, + "grad_norm": 0.2508396804332733, + "learning_rate": 5.831379942927029e-05, + "loss": 3.7518, + "step": 49105 + }, + { + "epoch": 3.336730534039951, + "grad_norm": 0.1633147895336151, + "learning_rate": 5.830955292838701e-05, + "loss": 4.036, + "step": 49110 + }, + { + "epoch": 3.3370702541106128, + "grad_norm": 0.19743700325489044, + "learning_rate": 5.830530642750374e-05, + "loss": 4.1466, + "step": 49115 + }, + { + "epoch": 3.3374099741812744, + "grad_norm": 0.1687348186969757, + "learning_rate": 5.830105992662047e-05, + "loss": 3.7624, + "step": 49120 + }, + { + "epoch": 3.3377496942519365, + "grad_norm": 0.20514234900474548, + "learning_rate": 5.8296813425737194e-05, + "loss": 3.9034, + "step": 49125 + }, + { + "epoch": 3.338089414322598, + "grad_norm": 0.17389459908008575, + "learning_rate": 5.829256692485392e-05, + "loss": 3.9429, + "step": 49130 + }, + { + "epoch": 3.3384291343932597, + "grad_norm": 0.15748430788516998, + "learning_rate": 5.8288320423970656e-05, + "loss": 3.9206, + "step": 49135 + }, + { + "epoch": 3.338768854463922, + "grad_norm": 0.20798282325267792, + "learning_rate": 5.828407392308738e-05, + "loss": 4.1018, + "step": 49140 + }, + { + "epoch": 3.3391085745345834, + "grad_norm": 0.2024689018726349, + "learning_rate": 5.8279827422204106e-05, + "loss": 3.9575, + "step": 49145 + }, + { + "epoch": 3.339448294605245, + "grad_norm": 0.13747993111610413, + "learning_rate": 5.827558092132084e-05, + "loss": 3.8104, + "step": 49150 + }, + { + "epoch": 3.339788014675907, + "grad_norm": 0.22515684366226196, + "learning_rate": 5.827133442043756e-05, + "loss": 3.8136, + "step": 49155 + }, + { + "epoch": 3.3401277347465688, + "grad_norm": 0.5448517203330994, + "learning_rate": 5.826708791955428e-05, + "loss": 3.7691, + "step": 49160 + }, + { + "epoch": 3.3404674548172304, + "grad_norm": 0.24313214421272278, + "learning_rate": 5.8262841418671024e-05, + "loss": 3.8022, + "step": 49165 + }, + { + "epoch": 3.3408071748878925, + "grad_norm": 0.1981213241815567, + "learning_rate": 5.8258594917787746e-05, + "loss": 3.9486, + "step": 49170 + }, + { + "epoch": 3.341146894958554, + "grad_norm": 0.19663435220718384, + "learning_rate": 5.825434841690447e-05, + "loss": 3.8987, + "step": 49175 + }, + { + "epoch": 3.3414866150292157, + "grad_norm": 0.20607861876487732, + "learning_rate": 5.82501019160212e-05, + "loss": 3.6908, + "step": 49180 + }, + { + "epoch": 3.341826335099878, + "grad_norm": 0.18743032217025757, + "learning_rate": 5.824585541513793e-05, + "loss": 3.5477, + "step": 49185 + }, + { + "epoch": 3.3421660551705394, + "grad_norm": 0.5089203119277954, + "learning_rate": 5.824160891425465e-05, + "loss": 3.7747, + "step": 49190 + }, + { + "epoch": 3.342505775241201, + "grad_norm": 0.15451069176197052, + "learning_rate": 5.8237362413371386e-05, + "loss": 3.8092, + "step": 49195 + }, + { + "epoch": 3.342845495311863, + "grad_norm": 0.19103282690048218, + "learning_rate": 5.8233115912488114e-05, + "loss": 3.8792, + "step": 49200 + }, + { + "epoch": 3.3431852153825248, + "grad_norm": 0.1622224897146225, + "learning_rate": 5.8228869411604835e-05, + "loss": 4.0371, + "step": 49205 + }, + { + "epoch": 3.3435249354531864, + "grad_norm": 0.20664851367473602, + "learning_rate": 5.822462291072157e-05, + "loss": 4.0131, + "step": 49210 + }, + { + "epoch": 3.3438646555238485, + "grad_norm": 0.2033693939447403, + "learning_rate": 5.82203764098383e-05, + "loss": 3.7785, + "step": 49215 + }, + { + "epoch": 3.34420437559451, + "grad_norm": 0.22048243880271912, + "learning_rate": 5.821612990895502e-05, + "loss": 3.8344, + "step": 49220 + }, + { + "epoch": 3.3445440956651717, + "grad_norm": 0.19002391397953033, + "learning_rate": 5.8211883408071754e-05, + "loss": 3.867, + "step": 49225 + }, + { + "epoch": 3.344883815735834, + "grad_norm": 0.1948995292186737, + "learning_rate": 5.820763690718848e-05, + "loss": 3.9599, + "step": 49230 + }, + { + "epoch": 3.3452235358064955, + "grad_norm": 0.1264421045780182, + "learning_rate": 5.82033904063052e-05, + "loss": 3.4973, + "step": 49235 + }, + { + "epoch": 3.345563255877157, + "grad_norm": 0.14613482356071472, + "learning_rate": 5.819914390542194e-05, + "loss": 3.8629, + "step": 49240 + }, + { + "epoch": 3.345902975947819, + "grad_norm": 0.2828845679759979, + "learning_rate": 5.819489740453866e-05, + "loss": 3.9601, + "step": 49245 + }, + { + "epoch": 3.346242696018481, + "grad_norm": 1.02541983127594, + "learning_rate": 5.819065090365539e-05, + "loss": 3.8857, + "step": 49250 + }, + { + "epoch": 3.3465824160891424, + "grad_norm": 0.39386871457099915, + "learning_rate": 5.818640440277212e-05, + "loss": 3.7818, + "step": 49255 + }, + { + "epoch": 3.3469221361598045, + "grad_norm": 0.17527727782726288, + "learning_rate": 5.818215790188884e-05, + "loss": 3.8929, + "step": 49260 + }, + { + "epoch": 3.347261856230466, + "grad_norm": 0.18057332932949066, + "learning_rate": 5.817791140100557e-05, + "loss": 4.0587, + "step": 49265 + }, + { + "epoch": 3.3476015763011278, + "grad_norm": 0.18676058948040009, + "learning_rate": 5.8173664900122306e-05, + "loss": 4.0266, + "step": 49270 + }, + { + "epoch": 3.34794129637179, + "grad_norm": 0.44402068853378296, + "learning_rate": 5.816941839923903e-05, + "loss": 3.915, + "step": 49275 + }, + { + "epoch": 3.3482810164424515, + "grad_norm": 0.5271340012550354, + "learning_rate": 5.8165171898355755e-05, + "loss": 4.0396, + "step": 49280 + }, + { + "epoch": 3.348620736513113, + "grad_norm": 0.1545877307653427, + "learning_rate": 5.816092539747249e-05, + "loss": 3.9138, + "step": 49285 + }, + { + "epoch": 3.348960456583775, + "grad_norm": 0.16362611949443817, + "learning_rate": 5.815667889658921e-05, + "loss": 3.85, + "step": 49290 + }, + { + "epoch": 3.349300176654437, + "grad_norm": 0.16326245665550232, + "learning_rate": 5.815243239570594e-05, + "loss": 3.869, + "step": 49295 + }, + { + "epoch": 3.3496398967250984, + "grad_norm": 0.19443871080875397, + "learning_rate": 5.8148185894822674e-05, + "loss": 3.8336, + "step": 49300 + }, + { + "epoch": 3.3499796167957605, + "grad_norm": 0.29023364186286926, + "learning_rate": 5.8143939393939395e-05, + "loss": 3.7959, + "step": 49305 + }, + { + "epoch": 3.350319336866422, + "grad_norm": 0.17274728417396545, + "learning_rate": 5.8139692893056116e-05, + "loss": 3.783, + "step": 49310 + }, + { + "epoch": 3.3506590569370838, + "grad_norm": 0.14036820828914642, + "learning_rate": 5.813544639217285e-05, + "loss": 4.0612, + "step": 49315 + }, + { + "epoch": 3.350998777007746, + "grad_norm": 0.17516738176345825, + "learning_rate": 5.813119989128958e-05, + "loss": 3.8583, + "step": 49320 + }, + { + "epoch": 3.3513384970784075, + "grad_norm": 0.17349465191364288, + "learning_rate": 5.8126953390406314e-05, + "loss": 3.6763, + "step": 49325 + }, + { + "epoch": 3.351678217149069, + "grad_norm": 0.17167656123638153, + "learning_rate": 5.8122706889523035e-05, + "loss": 3.588, + "step": 49330 + }, + { + "epoch": 3.352017937219731, + "grad_norm": 0.24849042296409607, + "learning_rate": 5.811846038863976e-05, + "loss": 3.808, + "step": 49335 + }, + { + "epoch": 3.352357657290393, + "grad_norm": 0.22229307889938354, + "learning_rate": 5.81142138877565e-05, + "loss": 3.8262, + "step": 49340 + }, + { + "epoch": 3.3526973773610544, + "grad_norm": 0.17290367186069489, + "learning_rate": 5.810996738687322e-05, + "loss": 3.7472, + "step": 49345 + }, + { + "epoch": 3.353037097431716, + "grad_norm": 0.25160419940948486, + "learning_rate": 5.810572088598995e-05, + "loss": 3.6906, + "step": 49350 + }, + { + "epoch": 3.353376817502378, + "grad_norm": 0.7149028778076172, + "learning_rate": 5.810147438510668e-05, + "loss": 3.5382, + "step": 49355 + }, + { + "epoch": 3.35371653757304, + "grad_norm": 0.1877002865076065, + "learning_rate": 5.80972278842234e-05, + "loss": 3.789, + "step": 49360 + }, + { + "epoch": 3.3540562576437014, + "grad_norm": 0.1858672797679901, + "learning_rate": 5.809298138334013e-05, + "loss": 3.8381, + "step": 49365 + }, + { + "epoch": 3.3543959777143635, + "grad_norm": 0.1492408961057663, + "learning_rate": 5.8088734882456866e-05, + "loss": 3.8513, + "step": 49370 + }, + { + "epoch": 3.354735697785025, + "grad_norm": 0.1480036824941635, + "learning_rate": 5.808448838157359e-05, + "loss": 3.7135, + "step": 49375 + }, + { + "epoch": 3.3550754178556867, + "grad_norm": 0.1579311192035675, + "learning_rate": 5.808024188069031e-05, + "loss": 3.8692, + "step": 49380 + }, + { + "epoch": 3.355415137926349, + "grad_norm": 0.18596196174621582, + "learning_rate": 5.807599537980705e-05, + "loss": 3.8249, + "step": 49385 + }, + { + "epoch": 3.3557548579970105, + "grad_norm": 0.14642859995365143, + "learning_rate": 5.807174887892377e-05, + "loss": 3.5886, + "step": 49390 + }, + { + "epoch": 3.356094578067672, + "grad_norm": 0.19345717132091522, + "learning_rate": 5.806750237804049e-05, + "loss": 4.0403, + "step": 49395 + }, + { + "epoch": 3.356434298138334, + "grad_norm": 0.2567938566207886, + "learning_rate": 5.806325587715723e-05, + "loss": 3.9033, + "step": 49400 + }, + { + "epoch": 3.356774018208996, + "grad_norm": 0.1644248515367508, + "learning_rate": 5.8059009376273955e-05, + "loss": 3.703, + "step": 49405 + }, + { + "epoch": 3.3571137382796574, + "grad_norm": 0.18119315803050995, + "learning_rate": 5.8054762875390676e-05, + "loss": 4.1313, + "step": 49410 + }, + { + "epoch": 3.3574534583503195, + "grad_norm": 0.16627828776836395, + "learning_rate": 5.805051637450741e-05, + "loss": 3.8516, + "step": 49415 + }, + { + "epoch": 3.357793178420981, + "grad_norm": 0.2638174295425415, + "learning_rate": 5.804626987362414e-05, + "loss": 3.8389, + "step": 49420 + }, + { + "epoch": 3.3581328984916428, + "grad_norm": 0.17993474006652832, + "learning_rate": 5.804202337274086e-05, + "loss": 3.8647, + "step": 49425 + }, + { + "epoch": 3.358472618562305, + "grad_norm": 0.15596164762973785, + "learning_rate": 5.8037776871857595e-05, + "loss": 3.876, + "step": 49430 + }, + { + "epoch": 3.3588123386329665, + "grad_norm": 0.1522545963525772, + "learning_rate": 5.803353037097432e-05, + "loss": 3.5782, + "step": 49435 + }, + { + "epoch": 3.359152058703628, + "grad_norm": 0.241245836019516, + "learning_rate": 5.8029283870091045e-05, + "loss": 3.5861, + "step": 49440 + }, + { + "epoch": 3.35949177877429, + "grad_norm": 0.18106292188167572, + "learning_rate": 5.802503736920778e-05, + "loss": 3.6629, + "step": 49445 + }, + { + "epoch": 3.359831498844952, + "grad_norm": 0.19429607689380646, + "learning_rate": 5.80207908683245e-05, + "loss": 3.7725, + "step": 49450 + }, + { + "epoch": 3.3601712189156134, + "grad_norm": 0.2821449935436249, + "learning_rate": 5.801654436744123e-05, + "loss": 4.1324, + "step": 49455 + }, + { + "epoch": 3.360510938986275, + "grad_norm": 0.19257615506649017, + "learning_rate": 5.801229786655796e-05, + "loss": 3.8943, + "step": 49460 + }, + { + "epoch": 3.360850659056937, + "grad_norm": 0.1801515370607376, + "learning_rate": 5.8008051365674685e-05, + "loss": 3.7287, + "step": 49465 + }, + { + "epoch": 3.3611903791275988, + "grad_norm": 0.1572543829679489, + "learning_rate": 5.800380486479141e-05, + "loss": 3.8951, + "step": 49470 + }, + { + "epoch": 3.3615300991982604, + "grad_norm": 0.1781485378742218, + "learning_rate": 5.799955836390815e-05, + "loss": 3.7831, + "step": 49475 + }, + { + "epoch": 3.3618698192689225, + "grad_norm": 0.28676238656044006, + "learning_rate": 5.799531186302487e-05, + "loss": 3.8475, + "step": 49480 + }, + { + "epoch": 3.362209539339584, + "grad_norm": 0.19557753205299377, + "learning_rate": 5.7991065362141597e-05, + "loss": 3.7868, + "step": 49485 + }, + { + "epoch": 3.3625492594102457, + "grad_norm": 1.2168467044830322, + "learning_rate": 5.798681886125833e-05, + "loss": 3.8881, + "step": 49490 + }, + { + "epoch": 3.362888979480908, + "grad_norm": 0.291702002286911, + "learning_rate": 5.798257236037505e-05, + "loss": 3.9673, + "step": 49495 + }, + { + "epoch": 3.3632286995515694, + "grad_norm": 0.2055753767490387, + "learning_rate": 5.797832585949178e-05, + "loss": 3.887, + "step": 49500 + }, + { + "epoch": 3.363568419622231, + "grad_norm": 0.19394570589065552, + "learning_rate": 5.7974079358608515e-05, + "loss": 3.6444, + "step": 49505 + }, + { + "epoch": 3.363908139692893, + "grad_norm": 0.1686762571334839, + "learning_rate": 5.796983285772524e-05, + "loss": 3.9095, + "step": 49510 + }, + { + "epoch": 3.364247859763555, + "grad_norm": 0.18297745287418365, + "learning_rate": 5.796558635684196e-05, + "loss": 4.1154, + "step": 49515 + }, + { + "epoch": 3.3645875798342164, + "grad_norm": 0.18318217992782593, + "learning_rate": 5.79613398559587e-05, + "loss": 3.7944, + "step": 49520 + }, + { + "epoch": 3.3649272999048785, + "grad_norm": 0.1902020424604416, + "learning_rate": 5.795709335507542e-05, + "loss": 3.847, + "step": 49525 + }, + { + "epoch": 3.36526701997554, + "grad_norm": 0.1985507309436798, + "learning_rate": 5.795284685419214e-05, + "loss": 3.9408, + "step": 49530 + }, + { + "epoch": 3.3656067400462018, + "grad_norm": 0.4056830108165741, + "learning_rate": 5.794860035330888e-05, + "loss": 4.1164, + "step": 49535 + }, + { + "epoch": 3.365946460116864, + "grad_norm": 0.18235285580158234, + "learning_rate": 5.7944353852425605e-05, + "loss": 3.9556, + "step": 49540 + }, + { + "epoch": 3.3662861801875255, + "grad_norm": 0.1716195046901703, + "learning_rate": 5.7940107351542326e-05, + "loss": 3.8357, + "step": 49545 + }, + { + "epoch": 3.366625900258187, + "grad_norm": 0.16193009912967682, + "learning_rate": 5.793586085065906e-05, + "loss": 3.9597, + "step": 49550 + }, + { + "epoch": 3.366965620328849, + "grad_norm": 0.24614748358726501, + "learning_rate": 5.793161434977579e-05, + "loss": 3.9352, + "step": 49555 + }, + { + "epoch": 3.367305340399511, + "grad_norm": 0.16309203207492828, + "learning_rate": 5.792736784889251e-05, + "loss": 3.707, + "step": 49560 + }, + { + "epoch": 3.3676450604701724, + "grad_norm": 0.22889623045921326, + "learning_rate": 5.7923121348009245e-05, + "loss": 3.8996, + "step": 49565 + }, + { + "epoch": 3.3679847805408345, + "grad_norm": 0.24828866124153137, + "learning_rate": 5.791887484712597e-05, + "loss": 3.7995, + "step": 49570 + }, + { + "epoch": 3.368324500611496, + "grad_norm": 0.16575439274311066, + "learning_rate": 5.7914628346242694e-05, + "loss": 3.8668, + "step": 49575 + }, + { + "epoch": 3.3686642206821578, + "grad_norm": 0.20185503363609314, + "learning_rate": 5.791038184535943e-05, + "loss": 4.0562, + "step": 49580 + }, + { + "epoch": 3.36900394075282, + "grad_norm": 0.21475493907928467, + "learning_rate": 5.790613534447616e-05, + "loss": 3.6523, + "step": 49585 + }, + { + "epoch": 3.3693436608234815, + "grad_norm": 0.14778587222099304, + "learning_rate": 5.790188884359288e-05, + "loss": 3.793, + "step": 49590 + }, + { + "epoch": 3.369683380894143, + "grad_norm": 0.20446182787418365, + "learning_rate": 5.789764234270961e-05, + "loss": 3.8706, + "step": 49595 + }, + { + "epoch": 3.370023100964805, + "grad_norm": 0.19186294078826904, + "learning_rate": 5.7893395841826334e-05, + "loss": 3.9025, + "step": 49600 + }, + { + "epoch": 3.370362821035467, + "grad_norm": 0.4731720983982086, + "learning_rate": 5.788914934094306e-05, + "loss": 3.8553, + "step": 49605 + }, + { + "epoch": 3.3707025411061284, + "grad_norm": 0.17656008899211884, + "learning_rate": 5.78849028400598e-05, + "loss": 3.884, + "step": 49610 + }, + { + "epoch": 3.3710422611767905, + "grad_norm": 0.272748202085495, + "learning_rate": 5.788065633917652e-05, + "loss": 3.6097, + "step": 49615 + }, + { + "epoch": 3.371381981247452, + "grad_norm": 0.17102935910224915, + "learning_rate": 5.7876409838293246e-05, + "loss": 3.9605, + "step": 49620 + }, + { + "epoch": 3.3717217013181138, + "grad_norm": 0.19078493118286133, + "learning_rate": 5.787216333740998e-05, + "loss": 3.7708, + "step": 49625 + }, + { + "epoch": 3.372061421388776, + "grad_norm": 0.19155755639076233, + "learning_rate": 5.78679168365267e-05, + "loss": 3.962, + "step": 49630 + }, + { + "epoch": 3.3724011414594375, + "grad_norm": 0.1807924211025238, + "learning_rate": 5.786367033564343e-05, + "loss": 3.7901, + "step": 49635 + }, + { + "epoch": 3.372740861530099, + "grad_norm": 0.15575598180294037, + "learning_rate": 5.7859423834760165e-05, + "loss": 3.7909, + "step": 49640 + }, + { + "epoch": 3.373080581600761, + "grad_norm": 0.17230182886123657, + "learning_rate": 5.7855177333876886e-05, + "loss": 3.8359, + "step": 49645 + }, + { + "epoch": 3.373420301671423, + "grad_norm": 0.22879505157470703, + "learning_rate": 5.785093083299361e-05, + "loss": 3.9095, + "step": 49650 + }, + { + "epoch": 3.3737600217420844, + "grad_norm": 0.17476153373718262, + "learning_rate": 5.784668433211035e-05, + "loss": 3.8915, + "step": 49655 + }, + { + "epoch": 3.3740997418127465, + "grad_norm": 0.18915842473506927, + "learning_rate": 5.784243783122707e-05, + "loss": 3.7901, + "step": 49660 + }, + { + "epoch": 3.374439461883408, + "grad_norm": 0.5060005187988281, + "learning_rate": 5.7838191330343805e-05, + "loss": 3.8744, + "step": 49665 + }, + { + "epoch": 3.37477918195407, + "grad_norm": 0.1452927589416504, + "learning_rate": 5.7833944829460526e-05, + "loss": 3.9284, + "step": 49670 + }, + { + "epoch": 3.375118902024732, + "grad_norm": 2.4498445987701416, + "learning_rate": 5.7829698328577254e-05, + "loss": 3.8203, + "step": 49675 + }, + { + "epoch": 3.3754586220953935, + "grad_norm": 0.1924775242805481, + "learning_rate": 5.782545182769399e-05, + "loss": 3.7534, + "step": 49680 + }, + { + "epoch": 3.375798342166055, + "grad_norm": 0.1964481920003891, + "learning_rate": 5.782120532681071e-05, + "loss": 3.8958, + "step": 49685 + }, + { + "epoch": 3.3761380622367168, + "grad_norm": 0.21244072914123535, + "learning_rate": 5.781695882592744e-05, + "loss": 3.8043, + "step": 49690 + }, + { + "epoch": 3.376477782307379, + "grad_norm": 0.2541563808917999, + "learning_rate": 5.781271232504417e-05, + "loss": 3.9266, + "step": 49695 + }, + { + "epoch": 3.3768175023780405, + "grad_norm": 0.16596662998199463, + "learning_rate": 5.7808465824160894e-05, + "loss": 3.8367, + "step": 49700 + }, + { + "epoch": 3.377157222448702, + "grad_norm": 0.48147085309028625, + "learning_rate": 5.780421932327762e-05, + "loss": 3.7475, + "step": 49705 + }, + { + "epoch": 3.377496942519364, + "grad_norm": 0.3108479678630829, + "learning_rate": 5.779997282239436e-05, + "loss": 3.8581, + "step": 49710 + }, + { + "epoch": 3.377836662590026, + "grad_norm": 0.6282466650009155, + "learning_rate": 5.779572632151108e-05, + "loss": 3.6606, + "step": 49715 + }, + { + "epoch": 3.3781763826606874, + "grad_norm": 0.1700732260942459, + "learning_rate": 5.7791479820627806e-05, + "loss": 4.0303, + "step": 49720 + }, + { + "epoch": 3.3785161027313495, + "grad_norm": 0.17434553802013397, + "learning_rate": 5.778723331974454e-05, + "loss": 3.6768, + "step": 49725 + }, + { + "epoch": 3.378855822802011, + "grad_norm": 0.22202889621257782, + "learning_rate": 5.778298681886126e-05, + "loss": 3.8633, + "step": 49730 + }, + { + "epoch": 3.3791955428726728, + "grad_norm": 0.19459518790245056, + "learning_rate": 5.7778740317977983e-05, + "loss": 3.9639, + "step": 49735 + }, + { + "epoch": 3.379535262943335, + "grad_norm": 0.19807425141334534, + "learning_rate": 5.777449381709472e-05, + "loss": 3.9503, + "step": 49740 + }, + { + "epoch": 3.3798749830139965, + "grad_norm": 0.22826167941093445, + "learning_rate": 5.7770247316211446e-05, + "loss": 3.7928, + "step": 49745 + }, + { + "epoch": 3.380214703084658, + "grad_norm": 0.17605693638324738, + "learning_rate": 5.776600081532817e-05, + "loss": 3.7754, + "step": 49750 + }, + { + "epoch": 3.38055442315532, + "grad_norm": 1.9544306993484497, + "learning_rate": 5.77617543144449e-05, + "loss": 3.8862, + "step": 49755 + }, + { + "epoch": 3.380894143225982, + "grad_norm": 0.18515658378601074, + "learning_rate": 5.775750781356163e-05, + "loss": 3.7117, + "step": 49760 + }, + { + "epoch": 3.3812338632966434, + "grad_norm": 0.3088854253292084, + "learning_rate": 5.775326131267835e-05, + "loss": 3.9242, + "step": 49765 + }, + { + "epoch": 3.3815735833673055, + "grad_norm": 0.14294342696666718, + "learning_rate": 5.7749014811795086e-05, + "loss": 3.7706, + "step": 49770 + }, + { + "epoch": 3.381913303437967, + "grad_norm": 0.14654849469661713, + "learning_rate": 5.7744768310911814e-05, + "loss": 3.9458, + "step": 49775 + }, + { + "epoch": 3.3822530235086288, + "grad_norm": 0.2985582947731018, + "learning_rate": 5.7740521810028536e-05, + "loss": 3.9902, + "step": 49780 + }, + { + "epoch": 3.382592743579291, + "grad_norm": 0.137990340590477, + "learning_rate": 5.773627530914527e-05, + "loss": 3.9575, + "step": 49785 + }, + { + "epoch": 3.3829324636499525, + "grad_norm": 0.1455017775297165, + "learning_rate": 5.7732028808262e-05, + "loss": 4.0259, + "step": 49790 + }, + { + "epoch": 3.383272183720614, + "grad_norm": 0.1637449413537979, + "learning_rate": 5.772778230737872e-05, + "loss": 3.9002, + "step": 49795 + }, + { + "epoch": 3.3836119037912757, + "grad_norm": 0.18716129660606384, + "learning_rate": 5.7723535806495454e-05, + "loss": 3.9084, + "step": 49800 + }, + { + "epoch": 3.383951623861938, + "grad_norm": 0.13529199361801147, + "learning_rate": 5.7719289305612176e-05, + "loss": 3.5041, + "step": 49805 + }, + { + "epoch": 3.3842913439325994, + "grad_norm": 0.3589683473110199, + "learning_rate": 5.7715042804728904e-05, + "loss": 3.8283, + "step": 49810 + }, + { + "epoch": 3.384631064003261, + "grad_norm": 0.1914283037185669, + "learning_rate": 5.771079630384564e-05, + "loss": 3.8679, + "step": 49815 + }, + { + "epoch": 3.384970784073923, + "grad_norm": 0.17982442677021027, + "learning_rate": 5.770654980296236e-05, + "loss": 3.7002, + "step": 49820 + }, + { + "epoch": 3.385310504144585, + "grad_norm": 0.14826224744319916, + "learning_rate": 5.770230330207909e-05, + "loss": 3.8614, + "step": 49825 + }, + { + "epoch": 3.3856502242152464, + "grad_norm": 0.24278973042964935, + "learning_rate": 5.769805680119582e-05, + "loss": 4.1005, + "step": 49830 + }, + { + "epoch": 3.3859899442859085, + "grad_norm": 0.2086831033229828, + "learning_rate": 5.7693810300312544e-05, + "loss": 4.1775, + "step": 49835 + }, + { + "epoch": 3.38632966435657, + "grad_norm": 0.16117705404758453, + "learning_rate": 5.768956379942927e-05, + "loss": 3.6033, + "step": 49840 + }, + { + "epoch": 3.3866693844272318, + "grad_norm": 0.18374337255954742, + "learning_rate": 5.7685317298546006e-05, + "loss": 3.738, + "step": 49845 + }, + { + "epoch": 3.387009104497894, + "grad_norm": 0.1591971516609192, + "learning_rate": 5.768107079766273e-05, + "loss": 3.9472, + "step": 49850 + }, + { + "epoch": 3.3873488245685555, + "grad_norm": 0.16386906802654266, + "learning_rate": 5.7676824296779456e-05, + "loss": 3.8176, + "step": 49855 + }, + { + "epoch": 3.387688544639217, + "grad_norm": 0.18602421879768372, + "learning_rate": 5.767257779589619e-05, + "loss": 3.8413, + "step": 49860 + }, + { + "epoch": 3.388028264709879, + "grad_norm": 0.18388091027736664, + "learning_rate": 5.766833129501291e-05, + "loss": 3.8776, + "step": 49865 + }, + { + "epoch": 3.388367984780541, + "grad_norm": 0.17803554236888885, + "learning_rate": 5.766408479412963e-05, + "loss": 3.9355, + "step": 49870 + }, + { + "epoch": 3.3887077048512024, + "grad_norm": 0.21800722181797028, + "learning_rate": 5.765983829324637e-05, + "loss": 3.9463, + "step": 49875 + }, + { + "epoch": 3.3890474249218645, + "grad_norm": 0.26025596261024475, + "learning_rate": 5.7655591792363096e-05, + "loss": 3.9003, + "step": 49880 + }, + { + "epoch": 3.389387144992526, + "grad_norm": 0.19634124636650085, + "learning_rate": 5.765134529147982e-05, + "loss": 3.6928, + "step": 49885 + }, + { + "epoch": 3.3897268650631878, + "grad_norm": 0.48831239342689514, + "learning_rate": 5.764709879059655e-05, + "loss": 3.7067, + "step": 49890 + }, + { + "epoch": 3.39006658513385, + "grad_norm": 0.18114054203033447, + "learning_rate": 5.764285228971328e-05, + "loss": 3.8972, + "step": 49895 + }, + { + "epoch": 3.3904063052045115, + "grad_norm": 0.23285385966300964, + "learning_rate": 5.763860578883e-05, + "loss": 4.0276, + "step": 49900 + }, + { + "epoch": 3.390746025275173, + "grad_norm": 0.20155449211597443, + "learning_rate": 5.7634359287946736e-05, + "loss": 3.7587, + "step": 49905 + }, + { + "epoch": 3.391085745345835, + "grad_norm": 0.2094687521457672, + "learning_rate": 5.7630112787063464e-05, + "loss": 3.7912, + "step": 49910 + }, + { + "epoch": 3.391425465416497, + "grad_norm": 0.2121734917163849, + "learning_rate": 5.7625866286180185e-05, + "loss": 3.729, + "step": 49915 + }, + { + "epoch": 3.3917651854871584, + "grad_norm": 0.22478076815605164, + "learning_rate": 5.762161978529692e-05, + "loss": 3.72, + "step": 49920 + }, + { + "epoch": 3.3921049055578205, + "grad_norm": 0.820720374584198, + "learning_rate": 5.761737328441365e-05, + "loss": 3.9716, + "step": 49925 + }, + { + "epoch": 3.392444625628482, + "grad_norm": 0.13404689729213715, + "learning_rate": 5.761312678353037e-05, + "loss": 3.9077, + "step": 49930 + }, + { + "epoch": 3.3927843456991438, + "grad_norm": 0.206651970744133, + "learning_rate": 5.7608880282647104e-05, + "loss": 3.894, + "step": 49935 + }, + { + "epoch": 3.393124065769806, + "grad_norm": 0.20711947977542877, + "learning_rate": 5.7604633781763825e-05, + "loss": 3.739, + "step": 49940 + }, + { + "epoch": 3.3934637858404675, + "grad_norm": 0.16130110621452332, + "learning_rate": 5.760038728088055e-05, + "loss": 3.8269, + "step": 49945 + }, + { + "epoch": 3.393803505911129, + "grad_norm": 0.15791141986846924, + "learning_rate": 5.759614077999729e-05, + "loss": 3.7537, + "step": 49950 + }, + { + "epoch": 3.394143225981791, + "grad_norm": 0.13859841227531433, + "learning_rate": 5.759189427911401e-05, + "loss": 3.9732, + "step": 49955 + }, + { + "epoch": 3.394482946052453, + "grad_norm": 0.15453819930553436, + "learning_rate": 5.758764777823074e-05, + "loss": 3.7648, + "step": 49960 + }, + { + "epoch": 3.3948226661231145, + "grad_norm": 0.1826217770576477, + "learning_rate": 5.758340127734747e-05, + "loss": 3.8729, + "step": 49965 + }, + { + "epoch": 3.3951623861937765, + "grad_norm": 0.16932222247123718, + "learning_rate": 5.757915477646419e-05, + "loss": 3.6912, + "step": 49970 + }, + { + "epoch": 3.395502106264438, + "grad_norm": 0.12790346145629883, + "learning_rate": 5.757490827558092e-05, + "loss": 3.9569, + "step": 49975 + }, + { + "epoch": 3.3958418263351, + "grad_norm": 0.20459428429603577, + "learning_rate": 5.7570661774697656e-05, + "loss": 3.957, + "step": 49980 + }, + { + "epoch": 3.396181546405762, + "grad_norm": 0.18582980334758759, + "learning_rate": 5.756641527381438e-05, + "loss": 3.866, + "step": 49985 + }, + { + "epoch": 3.3965212664764235, + "grad_norm": 0.1469029039144516, + "learning_rate": 5.7562168772931105e-05, + "loss": 4.0087, + "step": 49990 + }, + { + "epoch": 3.396860986547085, + "grad_norm": 0.14740651845932007, + "learning_rate": 5.755792227204784e-05, + "loss": 3.6529, + "step": 49995 + }, + { + "epoch": 3.397200706617747, + "grad_norm": 0.1668945550918579, + "learning_rate": 5.755367577116456e-05, + "loss": 3.8872, + "step": 50000 + }, + { + "epoch": 3.397540426688409, + "grad_norm": 0.1876494586467743, + "learning_rate": 5.7549429270281296e-05, + "loss": 3.9339, + "step": 50005 + }, + { + "epoch": 3.3978801467590705, + "grad_norm": 0.17528970539569855, + "learning_rate": 5.7545182769398024e-05, + "loss": 3.9498, + "step": 50010 + }, + { + "epoch": 3.3982198668297325, + "grad_norm": 0.1630924940109253, + "learning_rate": 5.7540936268514745e-05, + "loss": 3.8761, + "step": 50015 + }, + { + "epoch": 3.398559586900394, + "grad_norm": 0.17757366597652435, + "learning_rate": 5.753668976763148e-05, + "loss": 3.8038, + "step": 50020 + }, + { + "epoch": 3.398899306971056, + "grad_norm": 0.2182961404323578, + "learning_rate": 5.75324432667482e-05, + "loss": 3.9563, + "step": 50025 + }, + { + "epoch": 3.3992390270417174, + "grad_norm": 0.1434420645236969, + "learning_rate": 5.752819676586493e-05, + "loss": 3.6285, + "step": 50030 + }, + { + "epoch": 3.3995787471123795, + "grad_norm": 0.219981387257576, + "learning_rate": 5.7523950264981664e-05, + "loss": 3.8282, + "step": 50035 + }, + { + "epoch": 3.399918467183041, + "grad_norm": 0.17177312076091766, + "learning_rate": 5.7519703764098385e-05, + "loss": 3.9674, + "step": 50040 + }, + { + "epoch": 3.4002581872537028, + "grad_norm": 0.2494460493326187, + "learning_rate": 5.751545726321511e-05, + "loss": 3.9119, + "step": 50045 + }, + { + "epoch": 3.400597907324365, + "grad_norm": 0.16354647278785706, + "learning_rate": 5.751121076233185e-05, + "loss": 3.889, + "step": 50050 + }, + { + "epoch": 3.4009376273950265, + "grad_norm": 0.16987432539463043, + "learning_rate": 5.750696426144857e-05, + "loss": 3.8723, + "step": 50055 + }, + { + "epoch": 3.401277347465688, + "grad_norm": 0.13636238873004913, + "learning_rate": 5.75027177605653e-05, + "loss": 3.6311, + "step": 50060 + }, + { + "epoch": 3.40161706753635, + "grad_norm": 0.18479889631271362, + "learning_rate": 5.749847125968203e-05, + "loss": 3.838, + "step": 50065 + }, + { + "epoch": 3.401956787607012, + "grad_norm": 0.23996903002262115, + "learning_rate": 5.749422475879875e-05, + "loss": 3.896, + "step": 50070 + }, + { + "epoch": 3.4022965076776734, + "grad_norm": 0.20357120037078857, + "learning_rate": 5.7489978257915474e-05, + "loss": 3.8247, + "step": 50075 + }, + { + "epoch": 3.4026362277483355, + "grad_norm": 0.17629007995128632, + "learning_rate": 5.7485731757032216e-05, + "loss": 3.6963, + "step": 50080 + }, + { + "epoch": 3.402975947818997, + "grad_norm": 0.16284435987472534, + "learning_rate": 5.748148525614894e-05, + "loss": 4.0939, + "step": 50085 + }, + { + "epoch": 3.403315667889659, + "grad_norm": 0.17357605695724487, + "learning_rate": 5.747723875526566e-05, + "loss": 3.8672, + "step": 50090 + }, + { + "epoch": 3.403655387960321, + "grad_norm": 0.23521779477596283, + "learning_rate": 5.747299225438239e-05, + "loss": 4.0045, + "step": 50095 + }, + { + "epoch": 3.4039951080309825, + "grad_norm": 0.16350512206554413, + "learning_rate": 5.746874575349912e-05, + "loss": 3.8759, + "step": 50100 + }, + { + "epoch": 3.404334828101644, + "grad_norm": 0.21591079235076904, + "learning_rate": 5.746449925261584e-05, + "loss": 3.8951, + "step": 50105 + }, + { + "epoch": 3.404674548172306, + "grad_norm": 0.22493374347686768, + "learning_rate": 5.746025275173258e-05, + "loss": 4.0677, + "step": 50110 + }, + { + "epoch": 3.405014268242968, + "grad_norm": 0.18463654816150665, + "learning_rate": 5.7456006250849305e-05, + "loss": 3.9444, + "step": 50115 + }, + { + "epoch": 3.4053539883136295, + "grad_norm": 0.1707422435283661, + "learning_rate": 5.7451759749966026e-05, + "loss": 3.9404, + "step": 50120 + }, + { + "epoch": 3.4056937083842915, + "grad_norm": 0.21783705055713654, + "learning_rate": 5.744751324908276e-05, + "loss": 3.9989, + "step": 50125 + }, + { + "epoch": 3.406033428454953, + "grad_norm": 0.17109525203704834, + "learning_rate": 5.744326674819949e-05, + "loss": 4.0377, + "step": 50130 + }, + { + "epoch": 3.406373148525615, + "grad_norm": 0.1585858017206192, + "learning_rate": 5.743902024731621e-05, + "loss": 3.7287, + "step": 50135 + }, + { + "epoch": 3.4067128685962764, + "grad_norm": 0.293866902589798, + "learning_rate": 5.7434773746432945e-05, + "loss": 3.8989, + "step": 50140 + }, + { + "epoch": 3.4070525886669385, + "grad_norm": 0.1725238710641861, + "learning_rate": 5.743052724554967e-05, + "loss": 3.8379, + "step": 50145 + }, + { + "epoch": 3.4073923087376, + "grad_norm": 0.4434596598148346, + "learning_rate": 5.7426280744666395e-05, + "loss": 3.7609, + "step": 50150 + }, + { + "epoch": 3.4077320288082618, + "grad_norm": 0.39861389994621277, + "learning_rate": 5.742203424378313e-05, + "loss": 4.0707, + "step": 50155 + }, + { + "epoch": 3.408071748878924, + "grad_norm": 0.1605692207813263, + "learning_rate": 5.741778774289985e-05, + "loss": 3.9267, + "step": 50160 + }, + { + "epoch": 3.4084114689495855, + "grad_norm": 0.28519171476364136, + "learning_rate": 5.741354124201658e-05, + "loss": 4.0777, + "step": 50165 + }, + { + "epoch": 3.408751189020247, + "grad_norm": 0.1673634648323059, + "learning_rate": 5.740929474113331e-05, + "loss": 3.7916, + "step": 50170 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.15022151172161102, + "learning_rate": 5.7405048240250035e-05, + "loss": 3.7482, + "step": 50175 + }, + { + "epoch": 3.409430629161571, + "grad_norm": 0.14391452074050903, + "learning_rate": 5.740080173936676e-05, + "loss": 3.8677, + "step": 50180 + }, + { + "epoch": 3.4097703492322324, + "grad_norm": 0.19426649808883667, + "learning_rate": 5.73965552384835e-05, + "loss": 3.8054, + "step": 50185 + }, + { + "epoch": 3.4101100693028945, + "grad_norm": 0.20319123566150665, + "learning_rate": 5.739230873760022e-05, + "loss": 3.7611, + "step": 50190 + }, + { + "epoch": 3.410449789373556, + "grad_norm": 0.1622397005558014, + "learning_rate": 5.7388062236716947e-05, + "loss": 3.5959, + "step": 50195 + }, + { + "epoch": 3.4107895094442178, + "grad_norm": 0.2131076157093048, + "learning_rate": 5.738381573583368e-05, + "loss": 4.1362, + "step": 50200 + }, + { + "epoch": 3.41112922951488, + "grad_norm": 0.23322345316410065, + "learning_rate": 5.73795692349504e-05, + "loss": 4.0861, + "step": 50205 + }, + { + "epoch": 3.4114689495855415, + "grad_norm": 0.18236006796360016, + "learning_rate": 5.737532273406713e-05, + "loss": 3.5612, + "step": 50210 + }, + { + "epoch": 3.411808669656203, + "grad_norm": 0.18351051211357117, + "learning_rate": 5.7371076233183865e-05, + "loss": 4.0614, + "step": 50215 + }, + { + "epoch": 3.412148389726865, + "grad_norm": 0.1520097404718399, + "learning_rate": 5.736682973230059e-05, + "loss": 3.9119, + "step": 50220 + }, + { + "epoch": 3.412488109797527, + "grad_norm": 0.21517086029052734, + "learning_rate": 5.736258323141731e-05, + "loss": 3.8038, + "step": 50225 + }, + { + "epoch": 3.4128278298681884, + "grad_norm": 0.19565719366073608, + "learning_rate": 5.735833673053404e-05, + "loss": 3.7167, + "step": 50230 + }, + { + "epoch": 3.4131675499388505, + "grad_norm": 0.1871274709701538, + "learning_rate": 5.735409022965077e-05, + "loss": 3.8104, + "step": 50235 + }, + { + "epoch": 3.413507270009512, + "grad_norm": 0.18842029571533203, + "learning_rate": 5.734984372876749e-05, + "loss": 3.9516, + "step": 50240 + }, + { + "epoch": 3.413846990080174, + "grad_norm": 0.19898445904254913, + "learning_rate": 5.734559722788423e-05, + "loss": 3.7092, + "step": 50245 + }, + { + "epoch": 3.414186710150836, + "grad_norm": 0.16372983157634735, + "learning_rate": 5.7341350727000955e-05, + "loss": 3.9828, + "step": 50250 + }, + { + "epoch": 3.4145264302214975, + "grad_norm": 0.1541195660829544, + "learning_rate": 5.7337104226117676e-05, + "loss": 3.6549, + "step": 50255 + }, + { + "epoch": 3.414866150292159, + "grad_norm": 0.6102018356323242, + "learning_rate": 5.733285772523441e-05, + "loss": 3.8705, + "step": 50260 + }, + { + "epoch": 3.415205870362821, + "grad_norm": 0.1699112355709076, + "learning_rate": 5.732861122435114e-05, + "loss": 3.9974, + "step": 50265 + }, + { + "epoch": 3.415545590433483, + "grad_norm": 0.23253917694091797, + "learning_rate": 5.732436472346786e-05, + "loss": 3.9996, + "step": 50270 + }, + { + "epoch": 3.4158853105041445, + "grad_norm": 0.2230161428451538, + "learning_rate": 5.7320118222584595e-05, + "loss": 3.9511, + "step": 50275 + }, + { + "epoch": 3.4162250305748065, + "grad_norm": 0.1521247923374176, + "learning_rate": 5.731587172170132e-05, + "loss": 3.9831, + "step": 50280 + }, + { + "epoch": 3.416564750645468, + "grad_norm": 0.16063512861728668, + "learning_rate": 5.7311625220818044e-05, + "loss": 3.7806, + "step": 50285 + }, + { + "epoch": 3.41690447071613, + "grad_norm": 0.15882308781147003, + "learning_rate": 5.730737871993478e-05, + "loss": 4.1132, + "step": 50290 + }, + { + "epoch": 3.417244190786792, + "grad_norm": 0.1645219773054123, + "learning_rate": 5.73031322190515e-05, + "loss": 3.8797, + "step": 50295 + }, + { + "epoch": 3.4175839108574535, + "grad_norm": 0.1836872696876526, + "learning_rate": 5.729888571816823e-05, + "loss": 3.7309, + "step": 50300 + }, + { + "epoch": 3.417923630928115, + "grad_norm": 0.7316274046897888, + "learning_rate": 5.729463921728496e-05, + "loss": 3.6888, + "step": 50305 + }, + { + "epoch": 3.418263350998777, + "grad_norm": 0.1630537211894989, + "learning_rate": 5.7290392716401684e-05, + "loss": 3.94, + "step": 50310 + }, + { + "epoch": 3.418603071069439, + "grad_norm": 0.1958102136850357, + "learning_rate": 5.728614621551841e-05, + "loss": 3.7489, + "step": 50315 + }, + { + "epoch": 3.4189427911401005, + "grad_norm": 0.17456775903701782, + "learning_rate": 5.728189971463515e-05, + "loss": 4.1133, + "step": 50320 + }, + { + "epoch": 3.4192825112107625, + "grad_norm": 0.20321038365364075, + "learning_rate": 5.727765321375187e-05, + "loss": 3.76, + "step": 50325 + }, + { + "epoch": 3.419622231281424, + "grad_norm": 0.22666886448860168, + "learning_rate": 5.7273406712868596e-05, + "loss": 4.0384, + "step": 50330 + }, + { + "epoch": 3.419961951352086, + "grad_norm": 0.15173448622226715, + "learning_rate": 5.726916021198533e-05, + "loss": 3.7543, + "step": 50335 + }, + { + "epoch": 3.420301671422748, + "grad_norm": 0.37855324149131775, + "learning_rate": 5.726491371110205e-05, + "loss": 3.947, + "step": 50340 + }, + { + "epoch": 3.4206413914934095, + "grad_norm": 0.33177125453948975, + "learning_rate": 5.726066721021879e-05, + "loss": 3.872, + "step": 50345 + }, + { + "epoch": 3.420981111564071, + "grad_norm": 0.16278928518295288, + "learning_rate": 5.7256420709335515e-05, + "loss": 3.8775, + "step": 50350 + }, + { + "epoch": 3.421320831634733, + "grad_norm": 0.20268943905830383, + "learning_rate": 5.7252174208452236e-05, + "loss": 4.0228, + "step": 50355 + }, + { + "epoch": 3.421660551705395, + "grad_norm": 0.2066456377506256, + "learning_rate": 5.724792770756897e-05, + "loss": 3.6793, + "step": 50360 + }, + { + "epoch": 3.4220002717760565, + "grad_norm": 0.19004304707050323, + "learning_rate": 5.724368120668569e-05, + "loss": 3.9784, + "step": 50365 + }, + { + "epoch": 3.422339991846718, + "grad_norm": 0.17768940329551697, + "learning_rate": 5.723943470580242e-05, + "loss": 3.7539, + "step": 50370 + }, + { + "epoch": 3.42267971191738, + "grad_norm": 1.4952070713043213, + "learning_rate": 5.7235188204919155e-05, + "loss": 4.0016, + "step": 50375 + }, + { + "epoch": 3.423019431988042, + "grad_norm": 0.1951870173215866, + "learning_rate": 5.7230941704035876e-05, + "loss": 3.8605, + "step": 50380 + }, + { + "epoch": 3.4233591520587034, + "grad_norm": 0.2833017110824585, + "learning_rate": 5.7226695203152604e-05, + "loss": 4.0496, + "step": 50385 + }, + { + "epoch": 3.4236988721293655, + "grad_norm": 0.1950385868549347, + "learning_rate": 5.722244870226934e-05, + "loss": 3.8417, + "step": 50390 + }, + { + "epoch": 3.424038592200027, + "grad_norm": 0.14277978241443634, + "learning_rate": 5.721820220138606e-05, + "loss": 3.8697, + "step": 50395 + }, + { + "epoch": 3.424378312270689, + "grad_norm": 0.16210749745368958, + "learning_rate": 5.721395570050279e-05, + "loss": 3.7529, + "step": 50400 + }, + { + "epoch": 3.424718032341351, + "grad_norm": 0.17770814895629883, + "learning_rate": 5.720970919961952e-05, + "loss": 3.6858, + "step": 50405 + }, + { + "epoch": 3.4250577524120125, + "grad_norm": 0.306385338306427, + "learning_rate": 5.7205462698736244e-05, + "loss": 3.7964, + "step": 50410 + }, + { + "epoch": 3.425397472482674, + "grad_norm": 0.19514276087284088, + "learning_rate": 5.720121619785297e-05, + "loss": 3.7786, + "step": 50415 + }, + { + "epoch": 3.425737192553336, + "grad_norm": 0.2245137244462967, + "learning_rate": 5.719696969696971e-05, + "loss": 3.854, + "step": 50420 + }, + { + "epoch": 3.426076912623998, + "grad_norm": 0.30379971861839294, + "learning_rate": 5.719272319608643e-05, + "loss": 3.9701, + "step": 50425 + }, + { + "epoch": 3.4264166326946595, + "grad_norm": 0.17782220244407654, + "learning_rate": 5.718847669520315e-05, + "loss": 4.0481, + "step": 50430 + }, + { + "epoch": 3.4267563527653215, + "grad_norm": 0.4921320378780365, + "learning_rate": 5.718507949449654e-05, + "loss": 3.6557, + "step": 50435 + }, + { + "epoch": 3.427096072835983, + "grad_norm": 0.19522233307361603, + "learning_rate": 5.7180832993613267e-05, + "loss": 3.7229, + "step": 50440 + }, + { + "epoch": 3.427435792906645, + "grad_norm": 0.3113357722759247, + "learning_rate": 5.717658649272999e-05, + "loss": 3.8661, + "step": 50445 + }, + { + "epoch": 3.427775512977307, + "grad_norm": 0.2196245640516281, + "learning_rate": 5.717233999184672e-05, + "loss": 3.7243, + "step": 50450 + }, + { + "epoch": 3.4281152330479685, + "grad_norm": 0.17013245820999146, + "learning_rate": 5.716809349096345e-05, + "loss": 3.791, + "step": 50455 + }, + { + "epoch": 3.42845495311863, + "grad_norm": 0.9744792580604553, + "learning_rate": 5.716384699008017e-05, + "loss": 4.0473, + "step": 50460 + }, + { + "epoch": 3.428794673189292, + "grad_norm": 0.16372571885585785, + "learning_rate": 5.715960048919691e-05, + "loss": 3.5341, + "step": 50465 + }, + { + "epoch": 3.429134393259954, + "grad_norm": 0.15207946300506592, + "learning_rate": 5.7155353988313635e-05, + "loss": 3.8268, + "step": 50470 + }, + { + "epoch": 3.4294741133306155, + "grad_norm": 0.18095897138118744, + "learning_rate": 5.7151107487430356e-05, + "loss": 3.9342, + "step": 50475 + }, + { + "epoch": 3.429813833401277, + "grad_norm": 0.19602788984775543, + "learning_rate": 5.714686098654709e-05, + "loss": 3.9033, + "step": 50480 + }, + { + "epoch": 3.430153553471939, + "grad_norm": 0.1758178323507309, + "learning_rate": 5.714261448566381e-05, + "loss": 3.872, + "step": 50485 + }, + { + "epoch": 3.430493273542601, + "grad_norm": 0.2070838361978531, + "learning_rate": 5.713836798478054e-05, + "loss": 3.7743, + "step": 50490 + }, + { + "epoch": 3.4308329936132624, + "grad_norm": 0.19097934663295746, + "learning_rate": 5.7134121483897275e-05, + "loss": 3.6234, + "step": 50495 + }, + { + "epoch": 3.4311727136839245, + "grad_norm": 0.15083128213882446, + "learning_rate": 5.7129874983013996e-05, + "loss": 4.0525, + "step": 50500 + }, + { + "epoch": 3.431512433754586, + "grad_norm": 0.1402638703584671, + "learning_rate": 5.7125628482130724e-05, + "loss": 3.8015, + "step": 50505 + }, + { + "epoch": 3.4318521538252478, + "grad_norm": 0.1781376451253891, + "learning_rate": 5.712138198124746e-05, + "loss": 3.6446, + "step": 50510 + }, + { + "epoch": 3.43219187389591, + "grad_norm": 0.18857908248901367, + "learning_rate": 5.711713548036418e-05, + "loss": 3.8141, + "step": 50515 + }, + { + "epoch": 3.4325315939665715, + "grad_norm": 0.20831844210624695, + "learning_rate": 5.711288897948091e-05, + "loss": 3.9042, + "step": 50520 + }, + { + "epoch": 3.432871314037233, + "grad_norm": 0.14842207729816437, + "learning_rate": 5.710864247859764e-05, + "loss": 3.6264, + "step": 50525 + }, + { + "epoch": 3.433211034107895, + "grad_norm": 0.15348203480243683, + "learning_rate": 5.7104395977714364e-05, + "loss": 3.8508, + "step": 50530 + }, + { + "epoch": 3.433550754178557, + "grad_norm": 0.1745956540107727, + "learning_rate": 5.7100149476831085e-05, + "loss": 3.5943, + "step": 50535 + }, + { + "epoch": 3.4338904742492184, + "grad_norm": 0.17859789729118347, + "learning_rate": 5.709590297594783e-05, + "loss": 4.0233, + "step": 50540 + }, + { + "epoch": 3.4342301943198805, + "grad_norm": 0.16170591115951538, + "learning_rate": 5.709165647506455e-05, + "loss": 3.7122, + "step": 50545 + }, + { + "epoch": 3.434569914390542, + "grad_norm": 0.17002134025096893, + "learning_rate": 5.708740997418128e-05, + "loss": 3.9199, + "step": 50550 + }, + { + "epoch": 3.434909634461204, + "grad_norm": 0.17276832461357117, + "learning_rate": 5.7083163473298004e-05, + "loss": 3.7352, + "step": 50555 + }, + { + "epoch": 3.435249354531866, + "grad_norm": 0.17350131273269653, + "learning_rate": 5.707891697241473e-05, + "loss": 3.9822, + "step": 50560 + }, + { + "epoch": 3.4355890746025275, + "grad_norm": 0.1728920340538025, + "learning_rate": 5.707467047153147e-05, + "loss": 3.7813, + "step": 50565 + }, + { + "epoch": 3.435928794673189, + "grad_norm": 0.1427105814218521, + "learning_rate": 5.707042397064819e-05, + "loss": 3.7634, + "step": 50570 + }, + { + "epoch": 3.436268514743851, + "grad_norm": 0.1916550248861313, + "learning_rate": 5.7066177469764916e-05, + "loss": 3.6888, + "step": 50575 + }, + { + "epoch": 3.436608234814513, + "grad_norm": 0.21725593507289886, + "learning_rate": 5.706193096888165e-05, + "loss": 3.8472, + "step": 50580 + }, + { + "epoch": 3.4369479548851745, + "grad_norm": 0.23503750562667847, + "learning_rate": 5.705768446799837e-05, + "loss": 3.9258, + "step": 50585 + }, + { + "epoch": 3.4372876749558365, + "grad_norm": 0.44103607535362244, + "learning_rate": 5.70534379671151e-05, + "loss": 3.8419, + "step": 50590 + }, + { + "epoch": 3.437627395026498, + "grad_norm": 0.208944633603096, + "learning_rate": 5.7049191466231835e-05, + "loss": 3.9771, + "step": 50595 + }, + { + "epoch": 3.43796711509716, + "grad_norm": 0.3873946964740753, + "learning_rate": 5.7044944965348556e-05, + "loss": 3.9204, + "step": 50600 + }, + { + "epoch": 3.438306835167822, + "grad_norm": 0.1472112536430359, + "learning_rate": 5.7040698464465284e-05, + "loss": 3.8072, + "step": 50605 + }, + { + "epoch": 3.4386465552384835, + "grad_norm": 0.4396369159221649, + "learning_rate": 5.703645196358202e-05, + "loss": 3.8668, + "step": 50610 + }, + { + "epoch": 3.438986275309145, + "grad_norm": 0.16379033029079437, + "learning_rate": 5.703220546269874e-05, + "loss": 3.5626, + "step": 50615 + }, + { + "epoch": 3.439325995379807, + "grad_norm": 0.23837964236736298, + "learning_rate": 5.702795896181546e-05, + "loss": 3.9895, + "step": 50620 + }, + { + "epoch": 3.439665715450469, + "grad_norm": 0.1649424135684967, + "learning_rate": 5.7023712460932196e-05, + "loss": 3.8649, + "step": 50625 + }, + { + "epoch": 3.4400054355211305, + "grad_norm": 0.1722540557384491, + "learning_rate": 5.7019465960048924e-05, + "loss": 4.0529, + "step": 50630 + }, + { + "epoch": 3.4403451555917925, + "grad_norm": 0.16426509618759155, + "learning_rate": 5.7015219459165645e-05, + "loss": 3.7329, + "step": 50635 + }, + { + "epoch": 3.440684875662454, + "grad_norm": 0.15454897284507751, + "learning_rate": 5.701097295828238e-05, + "loss": 3.8292, + "step": 50640 + }, + { + "epoch": 3.441024595733116, + "grad_norm": 0.22827592492103577, + "learning_rate": 5.700672645739911e-05, + "loss": 4.161, + "step": 50645 + }, + { + "epoch": 3.441364315803778, + "grad_norm": 0.34796395897865295, + "learning_rate": 5.700247995651583e-05, + "loss": 3.7455, + "step": 50650 + }, + { + "epoch": 3.4417040358744395, + "grad_norm": 0.23057803511619568, + "learning_rate": 5.6998233455632564e-05, + "loss": 4.0852, + "step": 50655 + }, + { + "epoch": 3.442043755945101, + "grad_norm": 0.16297201812267303, + "learning_rate": 5.699398695474929e-05, + "loss": 3.8387, + "step": 50660 + }, + { + "epoch": 3.442383476015763, + "grad_norm": 0.12953819334506989, + "learning_rate": 5.6989740453866013e-05, + "loss": 4.0437, + "step": 50665 + }, + { + "epoch": 3.442723196086425, + "grad_norm": 0.16081936657428741, + "learning_rate": 5.698549395298275e-05, + "loss": 3.7671, + "step": 50670 + }, + { + "epoch": 3.4430629161570865, + "grad_norm": 0.19327346980571747, + "learning_rate": 5.6981247452099476e-05, + "loss": 3.7101, + "step": 50675 + }, + { + "epoch": 3.4434026362277486, + "grad_norm": 0.1644895225763321, + "learning_rate": 5.69770009512162e-05, + "loss": 4.0055, + "step": 50680 + }, + { + "epoch": 3.44374235629841, + "grad_norm": 0.1702028512954712, + "learning_rate": 5.697275445033293e-05, + "loss": 3.8573, + "step": 50685 + }, + { + "epoch": 3.444082076369072, + "grad_norm": 0.20017503201961517, + "learning_rate": 5.6968507949449653e-05, + "loss": 3.8989, + "step": 50690 + }, + { + "epoch": 3.444421796439734, + "grad_norm": 0.16312803328037262, + "learning_rate": 5.696426144856638e-05, + "loss": 3.9289, + "step": 50695 + }, + { + "epoch": 3.4447615165103955, + "grad_norm": 0.14879225194454193, + "learning_rate": 5.6960014947683116e-05, + "loss": 3.7058, + "step": 50700 + }, + { + "epoch": 3.445101236581057, + "grad_norm": 0.1593855619430542, + "learning_rate": 5.695576844679984e-05, + "loss": 3.8228, + "step": 50705 + }, + { + "epoch": 3.4454409566517192, + "grad_norm": 0.16922366619110107, + "learning_rate": 5.6951521945916565e-05, + "loss": 4.0055, + "step": 50710 + }, + { + "epoch": 3.445780676722381, + "grad_norm": 0.21071060001850128, + "learning_rate": 5.69472754450333e-05, + "loss": 4.0548, + "step": 50715 + }, + { + "epoch": 3.4461203967930425, + "grad_norm": 0.18246731162071228, + "learning_rate": 5.694302894415002e-05, + "loss": 3.9026, + "step": 50720 + }, + { + "epoch": 3.446460116863704, + "grad_norm": 0.14010906219482422, + "learning_rate": 5.693878244326675e-05, + "loss": 4.0302, + "step": 50725 + }, + { + "epoch": 3.446799836934366, + "grad_norm": 0.16948167979717255, + "learning_rate": 5.6934535942383484e-05, + "loss": 4.1951, + "step": 50730 + }, + { + "epoch": 3.447139557005028, + "grad_norm": 0.3583422601222992, + "learning_rate": 5.6930289441500206e-05, + "loss": 3.7176, + "step": 50735 + }, + { + "epoch": 3.4474792770756895, + "grad_norm": 0.3888673484325409, + "learning_rate": 5.6926042940616934e-05, + "loss": 3.9331, + "step": 50740 + }, + { + "epoch": 3.4478189971463515, + "grad_norm": 0.18831472098827362, + "learning_rate": 5.692179643973367e-05, + "loss": 3.6226, + "step": 50745 + }, + { + "epoch": 3.448158717217013, + "grad_norm": 0.20164638757705688, + "learning_rate": 5.691754993885039e-05, + "loss": 3.8862, + "step": 50750 + }, + { + "epoch": 3.448498437287675, + "grad_norm": 0.18326711654663086, + "learning_rate": 5.691330343796711e-05, + "loss": 3.8543, + "step": 50755 + }, + { + "epoch": 3.448838157358337, + "grad_norm": 0.29193902015686035, + "learning_rate": 5.6909056937083846e-05, + "loss": 3.6901, + "step": 50760 + }, + { + "epoch": 3.4491778774289985, + "grad_norm": 0.15398164093494415, + "learning_rate": 5.6904810436200574e-05, + "loss": 3.5455, + "step": 50765 + }, + { + "epoch": 3.44951759749966, + "grad_norm": 0.2214033454656601, + "learning_rate": 5.6900563935317295e-05, + "loss": 3.8512, + "step": 50770 + }, + { + "epoch": 3.449857317570322, + "grad_norm": 0.17472109198570251, + "learning_rate": 5.689631743443403e-05, + "loss": 3.6658, + "step": 50775 + }, + { + "epoch": 3.450197037640984, + "grad_norm": 0.15443973243236542, + "learning_rate": 5.689207093355076e-05, + "loss": 3.6273, + "step": 50780 + }, + { + "epoch": 3.4505367577116455, + "grad_norm": 0.286526083946228, + "learning_rate": 5.688782443266748e-05, + "loss": 3.8016, + "step": 50785 + }, + { + "epoch": 3.4508764777823075, + "grad_norm": 0.16279369592666626, + "learning_rate": 5.6883577931784214e-05, + "loss": 4.1258, + "step": 50790 + }, + { + "epoch": 3.451216197852969, + "grad_norm": 0.2883913815021515, + "learning_rate": 5.687933143090094e-05, + "loss": 3.848, + "step": 50795 + }, + { + "epoch": 3.451555917923631, + "grad_norm": 0.16961175203323364, + "learning_rate": 5.687508493001766e-05, + "loss": 3.9477, + "step": 50800 + }, + { + "epoch": 3.451895637994293, + "grad_norm": 0.18720394372940063, + "learning_rate": 5.68708384291344e-05, + "loss": 4.0025, + "step": 50805 + }, + { + "epoch": 3.4522353580649545, + "grad_norm": 0.17147165536880493, + "learning_rate": 5.6866591928251126e-05, + "loss": 3.8412, + "step": 50810 + }, + { + "epoch": 3.452575078135616, + "grad_norm": 0.13493773341178894, + "learning_rate": 5.686234542736785e-05, + "loss": 4.0162, + "step": 50815 + }, + { + "epoch": 3.452914798206278, + "grad_norm": 0.1685657650232315, + "learning_rate": 5.685809892648458e-05, + "loss": 3.9371, + "step": 50820 + }, + { + "epoch": 3.45325451827694, + "grad_norm": 0.14345064759254456, + "learning_rate": 5.68538524256013e-05, + "loss": 3.8312, + "step": 50825 + }, + { + "epoch": 3.4535942383476015, + "grad_norm": 0.17455488443374634, + "learning_rate": 5.684960592471803e-05, + "loss": 3.8447, + "step": 50830 + }, + { + "epoch": 3.453933958418263, + "grad_norm": 0.31074461340904236, + "learning_rate": 5.6845359423834766e-05, + "loss": 3.7109, + "step": 50835 + }, + { + "epoch": 3.454273678488925, + "grad_norm": 0.5363773703575134, + "learning_rate": 5.684111292295149e-05, + "loss": 3.7755, + "step": 50840 + }, + { + "epoch": 3.454613398559587, + "grad_norm": 0.17672210931777954, + "learning_rate": 5.6836866422068215e-05, + "loss": 3.798, + "step": 50845 + }, + { + "epoch": 3.4549531186302485, + "grad_norm": 0.19740170240402222, + "learning_rate": 5.683261992118495e-05, + "loss": 3.8874, + "step": 50850 + }, + { + "epoch": 3.4552928387009105, + "grad_norm": 0.5966119766235352, + "learning_rate": 5.682837342030167e-05, + "loss": 3.9173, + "step": 50855 + }, + { + "epoch": 3.455632558771572, + "grad_norm": 0.15851639211177826, + "learning_rate": 5.68241269194184e-05, + "loss": 3.6488, + "step": 50860 + }, + { + "epoch": 3.455972278842234, + "grad_norm": 0.1866205334663391, + "learning_rate": 5.6819880418535134e-05, + "loss": 3.7908, + "step": 50865 + }, + { + "epoch": 3.456311998912896, + "grad_norm": 0.9292512536048889, + "learning_rate": 5.6815633917651855e-05, + "loss": 3.8325, + "step": 50870 + }, + { + "epoch": 3.4566517189835575, + "grad_norm": 0.26545166969299316, + "learning_rate": 5.681138741676858e-05, + "loss": 3.9281, + "step": 50875 + }, + { + "epoch": 3.456991439054219, + "grad_norm": 0.1735617071390152, + "learning_rate": 5.680714091588532e-05, + "loss": 3.8968, + "step": 50880 + }, + { + "epoch": 3.457331159124881, + "grad_norm": 0.38744786381721497, + "learning_rate": 5.680289441500204e-05, + "loss": 3.7146, + "step": 50885 + }, + { + "epoch": 3.457670879195543, + "grad_norm": 0.21654121577739716, + "learning_rate": 5.6798647914118774e-05, + "loss": 3.9277, + "step": 50890 + }, + { + "epoch": 3.4580105992662045, + "grad_norm": 0.20550063252449036, + "learning_rate": 5.67944014132355e-05, + "loss": 3.5856, + "step": 50895 + }, + { + "epoch": 3.4583503193368665, + "grad_norm": 0.16668295860290527, + "learning_rate": 5.679015491235222e-05, + "loss": 3.8759, + "step": 50900 + }, + { + "epoch": 3.458690039407528, + "grad_norm": 0.8337353467941284, + "learning_rate": 5.678590841146896e-05, + "loss": 3.9452, + "step": 50905 + }, + { + "epoch": 3.45902975947819, + "grad_norm": 0.17122790217399597, + "learning_rate": 5.678166191058568e-05, + "loss": 3.8674, + "step": 50910 + }, + { + "epoch": 3.459369479548852, + "grad_norm": 0.8265745043754578, + "learning_rate": 5.677741540970241e-05, + "loss": 3.9728, + "step": 50915 + }, + { + "epoch": 3.4597091996195135, + "grad_norm": 0.24280282855033875, + "learning_rate": 5.677316890881914e-05, + "loss": 3.8364, + "step": 50920 + }, + { + "epoch": 3.460048919690175, + "grad_norm": 0.2025902271270752, + "learning_rate": 5.676892240793586e-05, + "loss": 3.6923, + "step": 50925 + }, + { + "epoch": 3.460388639760837, + "grad_norm": 0.18667976558208466, + "learning_rate": 5.676467590705259e-05, + "loss": 3.6419, + "step": 50930 + }, + { + "epoch": 3.460728359831499, + "grad_norm": 0.19246147572994232, + "learning_rate": 5.6760429406169326e-05, + "loss": 3.9462, + "step": 50935 + }, + { + "epoch": 3.4610680799021605, + "grad_norm": 0.31164073944091797, + "learning_rate": 5.675618290528605e-05, + "loss": 3.846, + "step": 50940 + }, + { + "epoch": 3.4614077999728226, + "grad_norm": 0.252176970243454, + "learning_rate": 5.6751936404402775e-05, + "loss": 3.9081, + "step": 50945 + }, + { + "epoch": 3.461747520043484, + "grad_norm": 0.29147058725357056, + "learning_rate": 5.674768990351951e-05, + "loss": 3.8963, + "step": 50950 + }, + { + "epoch": 3.462087240114146, + "grad_norm": 0.15461945533752441, + "learning_rate": 5.674344340263623e-05, + "loss": 3.871, + "step": 50955 + }, + { + "epoch": 3.462426960184808, + "grad_norm": 0.17010459303855896, + "learning_rate": 5.673919690175295e-05, + "loss": 4.0905, + "step": 50960 + }, + { + "epoch": 3.4627666802554695, + "grad_norm": 0.18153132498264313, + "learning_rate": 5.6734950400869694e-05, + "loss": 3.7819, + "step": 50965 + }, + { + "epoch": 3.463106400326131, + "grad_norm": 0.1867242455482483, + "learning_rate": 5.6730703899986415e-05, + "loss": 3.8024, + "step": 50970 + }, + { + "epoch": 3.4634461203967932, + "grad_norm": 0.19520160555839539, + "learning_rate": 5.6726457399103136e-05, + "loss": 3.9258, + "step": 50975 + }, + { + "epoch": 3.463785840467455, + "grad_norm": 0.389701783657074, + "learning_rate": 5.672221089821987e-05, + "loss": 3.9094, + "step": 50980 + }, + { + "epoch": 3.4641255605381165, + "grad_norm": 0.1563778817653656, + "learning_rate": 5.67179643973366e-05, + "loss": 3.8402, + "step": 50985 + }, + { + "epoch": 3.4644652806087786, + "grad_norm": 0.17867696285247803, + "learning_rate": 5.671371789645332e-05, + "loss": 3.6157, + "step": 50990 + }, + { + "epoch": 3.46480500067944, + "grad_norm": 0.14557412266731262, + "learning_rate": 5.6709471395570055e-05, + "loss": 3.8779, + "step": 50995 + }, + { + "epoch": 3.465144720750102, + "grad_norm": 0.1324138194322586, + "learning_rate": 5.670522489468678e-05, + "loss": 4.1318, + "step": 51000 + }, + { + "epoch": 3.465484440820764, + "grad_norm": 0.15205024182796478, + "learning_rate": 5.6700978393803504e-05, + "loss": 4.1389, + "step": 51005 + }, + { + "epoch": 3.4658241608914255, + "grad_norm": 0.20460255444049835, + "learning_rate": 5.669673189292024e-05, + "loss": 3.8909, + "step": 51010 + }, + { + "epoch": 3.466163880962087, + "grad_norm": 0.16737571358680725, + "learning_rate": 5.669248539203697e-05, + "loss": 3.8415, + "step": 51015 + }, + { + "epoch": 3.4665036010327492, + "grad_norm": 0.19060438871383667, + "learning_rate": 5.668823889115369e-05, + "loss": 3.6893, + "step": 51020 + }, + { + "epoch": 3.466843321103411, + "grad_norm": 0.2372555434703827, + "learning_rate": 5.668399239027042e-05, + "loss": 3.836, + "step": 51025 + }, + { + "epoch": 3.4671830411740725, + "grad_norm": 0.14141546189785004, + "learning_rate": 5.667974588938715e-05, + "loss": 3.9209, + "step": 51030 + }, + { + "epoch": 3.4675227612447346, + "grad_norm": 0.17095765471458435, + "learning_rate": 5.667549938850387e-05, + "loss": 4.1173, + "step": 51035 + }, + { + "epoch": 3.467862481315396, + "grad_norm": 0.1676894724369049, + "learning_rate": 5.667125288762061e-05, + "loss": 3.8965, + "step": 51040 + }, + { + "epoch": 3.468202201386058, + "grad_norm": 0.17684686183929443, + "learning_rate": 5.666700638673733e-05, + "loss": 3.8458, + "step": 51045 + }, + { + "epoch": 3.46854192145672, + "grad_norm": 1.8697739839553833, + "learning_rate": 5.6662759885854056e-05, + "loss": 3.9079, + "step": 51050 + }, + { + "epoch": 3.4688816415273815, + "grad_norm": 0.16103564202785492, + "learning_rate": 5.665851338497079e-05, + "loss": 4.0013, + "step": 51055 + }, + { + "epoch": 3.469221361598043, + "grad_norm": 0.2323448359966278, + "learning_rate": 5.665426688408751e-05, + "loss": 3.7526, + "step": 51060 + }, + { + "epoch": 3.469561081668705, + "grad_norm": 0.17759501934051514, + "learning_rate": 5.665002038320424e-05, + "loss": 3.7177, + "step": 51065 + }, + { + "epoch": 3.469900801739367, + "grad_norm": 0.19264541566371918, + "learning_rate": 5.6645773882320975e-05, + "loss": 3.8693, + "step": 51070 + }, + { + "epoch": 3.4702405218100285, + "grad_norm": 0.15088053047657013, + "learning_rate": 5.6641527381437696e-05, + "loss": 3.9041, + "step": 51075 + }, + { + "epoch": 3.47058024188069, + "grad_norm": 0.19996589422225952, + "learning_rate": 5.6637280880554424e-05, + "loss": 3.9923, + "step": 51080 + }, + { + "epoch": 3.470919961951352, + "grad_norm": 0.8266063928604126, + "learning_rate": 5.663303437967116e-05, + "loss": 3.872, + "step": 51085 + }, + { + "epoch": 3.471259682022014, + "grad_norm": 0.40308690071105957, + "learning_rate": 5.662878787878788e-05, + "loss": 4.0015, + "step": 51090 + }, + { + "epoch": 3.4715994020926755, + "grad_norm": 0.390697181224823, + "learning_rate": 5.662454137790461e-05, + "loss": 3.8609, + "step": 51095 + }, + { + "epoch": 3.4719391221633376, + "grad_norm": 0.19475075602531433, + "learning_rate": 5.662029487702134e-05, + "loss": 3.8786, + "step": 51100 + }, + { + "epoch": 3.472278842233999, + "grad_norm": 0.17073191702365875, + "learning_rate": 5.6616048376138065e-05, + "loss": 3.6999, + "step": 51105 + }, + { + "epoch": 3.472618562304661, + "grad_norm": 0.398194283246994, + "learning_rate": 5.6611801875254786e-05, + "loss": 3.7508, + "step": 51110 + }, + { + "epoch": 3.472958282375323, + "grad_norm": 0.14254090189933777, + "learning_rate": 5.660755537437152e-05, + "loss": 3.9627, + "step": 51115 + }, + { + "epoch": 3.4732980024459845, + "grad_norm": 0.17468488216400146, + "learning_rate": 5.660330887348825e-05, + "loss": 3.9425, + "step": 51120 + }, + { + "epoch": 3.473637722516646, + "grad_norm": 0.2073851227760315, + "learning_rate": 5.659906237260497e-05, + "loss": 3.9459, + "step": 51125 + }, + { + "epoch": 3.4739774425873082, + "grad_norm": 0.198454350233078, + "learning_rate": 5.6594815871721705e-05, + "loss": 3.7749, + "step": 51130 + }, + { + "epoch": 3.47431716265797, + "grad_norm": 0.14299440383911133, + "learning_rate": 5.659056937083843e-05, + "loss": 4.0278, + "step": 51135 + }, + { + "epoch": 3.4746568827286315, + "grad_norm": 0.19239072501659393, + "learning_rate": 5.6586322869955154e-05, + "loss": 3.6987, + "step": 51140 + }, + { + "epoch": 3.4749966027992936, + "grad_norm": 0.5720560550689697, + "learning_rate": 5.658207636907189e-05, + "loss": 3.8165, + "step": 51145 + }, + { + "epoch": 3.475336322869955, + "grad_norm": 0.13450749218463898, + "learning_rate": 5.6577829868188617e-05, + "loss": 4.0305, + "step": 51150 + }, + { + "epoch": 3.475676042940617, + "grad_norm": 0.20314346253871918, + "learning_rate": 5.657358336730534e-05, + "loss": 3.813, + "step": 51155 + }, + { + "epoch": 3.4760157630112785, + "grad_norm": 0.42837411165237427, + "learning_rate": 5.656933686642207e-05, + "loss": 3.8684, + "step": 51160 + }, + { + "epoch": 3.4763554830819405, + "grad_norm": 0.18791988492012024, + "learning_rate": 5.65650903655388e-05, + "loss": 3.7002, + "step": 51165 + }, + { + "epoch": 3.476695203152602, + "grad_norm": 0.20549309253692627, + "learning_rate": 5.656084386465552e-05, + "loss": 3.8255, + "step": 51170 + }, + { + "epoch": 3.477034923223264, + "grad_norm": 0.17532499134540558, + "learning_rate": 5.655659736377226e-05, + "loss": 3.5783, + "step": 51175 + }, + { + "epoch": 3.477374643293926, + "grad_norm": 0.1891949623823166, + "learning_rate": 5.655235086288898e-05, + "loss": 3.9367, + "step": 51180 + }, + { + "epoch": 3.4777143633645875, + "grad_norm": 0.2024717777967453, + "learning_rate": 5.6548104362005706e-05, + "loss": 3.98, + "step": 51185 + }, + { + "epoch": 3.478054083435249, + "grad_norm": 0.16581511497497559, + "learning_rate": 5.654385786112244e-05, + "loss": 3.9143, + "step": 51190 + }, + { + "epoch": 3.478393803505911, + "grad_norm": 0.18390938639640808, + "learning_rate": 5.653961136023916e-05, + "loss": 3.8113, + "step": 51195 + }, + { + "epoch": 3.478733523576573, + "grad_norm": 0.16751545667648315, + "learning_rate": 5.653536485935589e-05, + "loss": 3.6935, + "step": 51200 + }, + { + "epoch": 3.4790732436472345, + "grad_norm": 2.060521125793457, + "learning_rate": 5.6531118358472625e-05, + "loss": 3.7688, + "step": 51205 + }, + { + "epoch": 3.4794129637178965, + "grad_norm": 0.19273576140403748, + "learning_rate": 5.6526871857589346e-05, + "loss": 3.6786, + "step": 51210 + }, + { + "epoch": 3.479752683788558, + "grad_norm": 1.9175763130187988, + "learning_rate": 5.6522625356706074e-05, + "loss": 4.0463, + "step": 51215 + }, + { + "epoch": 3.48009240385922, + "grad_norm": 0.18326237797737122, + "learning_rate": 5.651837885582281e-05, + "loss": 3.9441, + "step": 51220 + }, + { + "epoch": 3.480432123929882, + "grad_norm": 0.17601372301578522, + "learning_rate": 5.651413235493953e-05, + "loss": 3.9293, + "step": 51225 + }, + { + "epoch": 3.4807718440005435, + "grad_norm": 0.22446584701538086, + "learning_rate": 5.6509885854056265e-05, + "loss": 3.923, + "step": 51230 + }, + { + "epoch": 3.481111564071205, + "grad_norm": 0.19178104400634766, + "learning_rate": 5.650563935317299e-05, + "loss": 4.0313, + "step": 51235 + }, + { + "epoch": 3.481451284141867, + "grad_norm": 0.6696075201034546, + "learning_rate": 5.6501392852289714e-05, + "loss": 3.9027, + "step": 51240 + }, + { + "epoch": 3.481791004212529, + "grad_norm": 0.2597949206829071, + "learning_rate": 5.649714635140645e-05, + "loss": 3.9204, + "step": 51245 + }, + { + "epoch": 3.4821307242831905, + "grad_norm": 0.18346749246120453, + "learning_rate": 5.649289985052317e-05, + "loss": 3.8038, + "step": 51250 + }, + { + "epoch": 3.4824704443538526, + "grad_norm": 0.1528492569923401, + "learning_rate": 5.64886533496399e-05, + "loss": 4.0342, + "step": 51255 + }, + { + "epoch": 3.482810164424514, + "grad_norm": 0.26290249824523926, + "learning_rate": 5.648440684875663e-05, + "loss": 3.8046, + "step": 51260 + }, + { + "epoch": 3.483149884495176, + "grad_norm": 0.17634709179401398, + "learning_rate": 5.6480160347873354e-05, + "loss": 3.7895, + "step": 51265 + }, + { + "epoch": 3.483489604565838, + "grad_norm": 0.16377200186252594, + "learning_rate": 5.647591384699008e-05, + "loss": 4.0321, + "step": 51270 + }, + { + "epoch": 3.4838293246364995, + "grad_norm": 0.1672896146774292, + "learning_rate": 5.647166734610682e-05, + "loss": 3.7503, + "step": 51275 + }, + { + "epoch": 3.484169044707161, + "grad_norm": 0.179831862449646, + "learning_rate": 5.646742084522354e-05, + "loss": 4.0763, + "step": 51280 + }, + { + "epoch": 3.4845087647778232, + "grad_norm": 0.1851520985364914, + "learning_rate": 5.6463174344340266e-05, + "loss": 3.7972, + "step": 51285 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.19457001984119415, + "learning_rate": 5.6458927843457e-05, + "loss": 3.8644, + "step": 51290 + }, + { + "epoch": 3.4851882049191465, + "grad_norm": 0.17094357311725616, + "learning_rate": 5.645468134257372e-05, + "loss": 3.6858, + "step": 51295 + }, + { + "epoch": 3.4855279249898086, + "grad_norm": 0.15412908792495728, + "learning_rate": 5.645043484169045e-05, + "loss": 3.7756, + "step": 51300 + }, + { + "epoch": 3.48586764506047, + "grad_norm": 0.16148753464221954, + "learning_rate": 5.6446188340807185e-05, + "loss": 4.0734, + "step": 51305 + }, + { + "epoch": 3.486207365131132, + "grad_norm": 0.1736500859260559, + "learning_rate": 5.6441941839923906e-05, + "loss": 3.8025, + "step": 51310 + }, + { + "epoch": 3.486547085201794, + "grad_norm": 0.16197380423545837, + "learning_rate": 5.643769533904063e-05, + "loss": 3.8935, + "step": 51315 + }, + { + "epoch": 3.4868868052724555, + "grad_norm": 0.17328782379627228, + "learning_rate": 5.643344883815737e-05, + "loss": 3.556, + "step": 51320 + }, + { + "epoch": 3.487226525343117, + "grad_norm": 0.201485738158226, + "learning_rate": 5.642920233727409e-05, + "loss": 3.9873, + "step": 51325 + }, + { + "epoch": 3.4875662454137792, + "grad_norm": 0.18328461050987244, + "learning_rate": 5.642495583639081e-05, + "loss": 3.9642, + "step": 51330 + }, + { + "epoch": 3.487905965484441, + "grad_norm": 0.16416440904140472, + "learning_rate": 5.6420709335507546e-05, + "loss": 3.9591, + "step": 51335 + }, + { + "epoch": 3.4882456855551025, + "grad_norm": 0.1717408001422882, + "learning_rate": 5.6416462834624274e-05, + "loss": 3.8237, + "step": 51340 + }, + { + "epoch": 3.4885854056257646, + "grad_norm": 0.1720111221075058, + "learning_rate": 5.6412216333740995e-05, + "loss": 3.9691, + "step": 51345 + }, + { + "epoch": 3.488925125696426, + "grad_norm": 0.15580852329730988, + "learning_rate": 5.640796983285773e-05, + "loss": 3.9477, + "step": 51350 + }, + { + "epoch": 3.489264845767088, + "grad_norm": 0.16661953926086426, + "learning_rate": 5.640372333197446e-05, + "loss": 3.8466, + "step": 51355 + }, + { + "epoch": 3.48960456583775, + "grad_norm": 0.24187205731868744, + "learning_rate": 5.639947683109118e-05, + "loss": 3.5775, + "step": 51360 + }, + { + "epoch": 3.4899442859084115, + "grad_norm": 0.22040478885173798, + "learning_rate": 5.6395230330207914e-05, + "loss": 3.9104, + "step": 51365 + }, + { + "epoch": 3.490284005979073, + "grad_norm": 0.24337036907672882, + "learning_rate": 5.639098382932464e-05, + "loss": 3.8547, + "step": 51370 + }, + { + "epoch": 3.4906237260497353, + "grad_norm": 0.15239880979061127, + "learning_rate": 5.638673732844136e-05, + "loss": 3.9649, + "step": 51375 + }, + { + "epoch": 3.490963446120397, + "grad_norm": 0.15889626741409302, + "learning_rate": 5.63824908275581e-05, + "loss": 3.8951, + "step": 51380 + }, + { + "epoch": 3.4913031661910585, + "grad_norm": 0.1428847759962082, + "learning_rate": 5.6378244326674826e-05, + "loss": 4.0056, + "step": 51385 + }, + { + "epoch": 3.4916428862617206, + "grad_norm": 0.20022182166576385, + "learning_rate": 5.637399782579155e-05, + "loss": 3.5964, + "step": 51390 + }, + { + "epoch": 3.491982606332382, + "grad_norm": 0.15997961163520813, + "learning_rate": 5.636975132490828e-05, + "loss": 3.8913, + "step": 51395 + }, + { + "epoch": 3.492322326403044, + "grad_norm": 0.17310373485088348, + "learning_rate": 5.6365504824025003e-05, + "loss": 3.905, + "step": 51400 + }, + { + "epoch": 3.4926620464737055, + "grad_norm": 0.1658376157283783, + "learning_rate": 5.636125832314173e-05, + "loss": 3.8797, + "step": 51405 + }, + { + "epoch": 3.4930017665443676, + "grad_norm": 0.20854471623897552, + "learning_rate": 5.6357011822258466e-05, + "loss": 3.8091, + "step": 51410 + }, + { + "epoch": 3.493341486615029, + "grad_norm": 0.17974410951137543, + "learning_rate": 5.635276532137519e-05, + "loss": 3.6863, + "step": 51415 + }, + { + "epoch": 3.493681206685691, + "grad_norm": 1.357182502746582, + "learning_rate": 5.6348518820491915e-05, + "loss": 4.2249, + "step": 51420 + }, + { + "epoch": 3.494020926756353, + "grad_norm": 0.20704151690006256, + "learning_rate": 5.634427231960865e-05, + "loss": 3.9021, + "step": 51425 + }, + { + "epoch": 3.4943606468270145, + "grad_norm": 0.17964375019073486, + "learning_rate": 5.634002581872537e-05, + "loss": 3.8557, + "step": 51430 + }, + { + "epoch": 3.494700366897676, + "grad_norm": 0.14172060787677765, + "learning_rate": 5.63357793178421e-05, + "loss": 4.0097, + "step": 51435 + }, + { + "epoch": 3.4950400869683382, + "grad_norm": 0.1559130996465683, + "learning_rate": 5.6331532816958834e-05, + "loss": 3.6976, + "step": 51440 + }, + { + "epoch": 3.495379807039, + "grad_norm": 0.2030852884054184, + "learning_rate": 5.6327286316075555e-05, + "loss": 3.7582, + "step": 51445 + }, + { + "epoch": 3.4957195271096615, + "grad_norm": 0.22257760167121887, + "learning_rate": 5.632303981519228e-05, + "loss": 3.662, + "step": 51450 + }, + { + "epoch": 3.4960592471803236, + "grad_norm": 0.22917628288269043, + "learning_rate": 5.631879331430902e-05, + "loss": 4.0839, + "step": 51455 + }, + { + "epoch": 3.496398967250985, + "grad_norm": 0.15756887197494507, + "learning_rate": 5.631454681342574e-05, + "loss": 3.9485, + "step": 51460 + }, + { + "epoch": 3.496738687321647, + "grad_norm": 0.15691208839416504, + "learning_rate": 5.631030031254246e-05, + "loss": 3.765, + "step": 51465 + }, + { + "epoch": 3.497078407392309, + "grad_norm": 0.3394335210323334, + "learning_rate": 5.6306053811659196e-05, + "loss": 3.7854, + "step": 51470 + }, + { + "epoch": 3.4974181274629705, + "grad_norm": 1.5606539249420166, + "learning_rate": 5.6301807310775924e-05, + "loss": 3.7035, + "step": 51475 + }, + { + "epoch": 3.497757847533632, + "grad_norm": 0.16440264880657196, + "learning_rate": 5.6297560809892645e-05, + "loss": 3.9608, + "step": 51480 + }, + { + "epoch": 3.4980975676042942, + "grad_norm": 0.16806013882160187, + "learning_rate": 5.629331430900938e-05, + "loss": 3.6619, + "step": 51485 + }, + { + "epoch": 3.498437287674956, + "grad_norm": 0.15954738855361938, + "learning_rate": 5.628906780812611e-05, + "loss": 3.8921, + "step": 51490 + }, + { + "epoch": 3.4987770077456175, + "grad_norm": 0.1666649580001831, + "learning_rate": 5.628482130724283e-05, + "loss": 3.8072, + "step": 51495 + }, + { + "epoch": 3.499116727816279, + "grad_norm": 0.1613190472126007, + "learning_rate": 5.6280574806359564e-05, + "loss": 3.6038, + "step": 51500 + }, + { + "epoch": 3.499456447886941, + "grad_norm": 0.1976238489151001, + "learning_rate": 5.627632830547629e-05, + "loss": 3.7493, + "step": 51505 + }, + { + "epoch": 3.499796167957603, + "grad_norm": 0.15098457038402557, + "learning_rate": 5.627208180459301e-05, + "loss": 4.0099, + "step": 51510 + }, + { + "epoch": 3.5001358880282645, + "grad_norm": 0.8178439140319824, + "learning_rate": 5.626783530370975e-05, + "loss": 3.9042, + "step": 51515 + }, + { + "epoch": 3.5004756080989265, + "grad_norm": 0.15970268845558167, + "learning_rate": 5.6263588802826476e-05, + "loss": 3.4884, + "step": 51520 + }, + { + "epoch": 3.500815328169588, + "grad_norm": 0.15412874519824982, + "learning_rate": 5.62593423019432e-05, + "loss": 3.8225, + "step": 51525 + }, + { + "epoch": 3.50115504824025, + "grad_norm": 0.180281400680542, + "learning_rate": 5.625509580105993e-05, + "loss": 3.8366, + "step": 51530 + }, + { + "epoch": 3.501494768310912, + "grad_norm": 0.25936710834503174, + "learning_rate": 5.625084930017665e-05, + "loss": 3.6146, + "step": 51535 + }, + { + "epoch": 3.5018344883815735, + "grad_norm": 0.17485526204109192, + "learning_rate": 5.624660279929338e-05, + "loss": 3.9025, + "step": 51540 + }, + { + "epoch": 3.502174208452235, + "grad_norm": 0.21269117295742035, + "learning_rate": 5.6242356298410116e-05, + "loss": 3.8603, + "step": 51545 + }, + { + "epoch": 3.5025139285228972, + "grad_norm": 0.2439471334218979, + "learning_rate": 5.623810979752684e-05, + "loss": 3.9694, + "step": 51550 + }, + { + "epoch": 3.502853648593559, + "grad_norm": 0.1618831753730774, + "learning_rate": 5.6233863296643565e-05, + "loss": 3.8485, + "step": 51555 + }, + { + "epoch": 3.5031933686642205, + "grad_norm": 0.1418406367301941, + "learning_rate": 5.62296167957603e-05, + "loss": 3.6462, + "step": 51560 + }, + { + "epoch": 3.5035330887348826, + "grad_norm": 0.1669120192527771, + "learning_rate": 5.622537029487702e-05, + "loss": 3.9176, + "step": 51565 + }, + { + "epoch": 3.503872808805544, + "grad_norm": 0.1498599350452423, + "learning_rate": 5.622112379399375e-05, + "loss": 3.5775, + "step": 51570 + }, + { + "epoch": 3.504212528876206, + "grad_norm": 0.23196718096733093, + "learning_rate": 5.6216877293110484e-05, + "loss": 4.1292, + "step": 51575 + }, + { + "epoch": 3.504552248946868, + "grad_norm": 0.26878872513771057, + "learning_rate": 5.6212630792227205e-05, + "loss": 3.8189, + "step": 51580 + }, + { + "epoch": 3.5048919690175295, + "grad_norm": 0.23174692690372467, + "learning_rate": 5.620838429134394e-05, + "loss": 3.7617, + "step": 51585 + }, + { + "epoch": 3.505231689088191, + "grad_norm": 0.1713816076517105, + "learning_rate": 5.620413779046067e-05, + "loss": 3.6484, + "step": 51590 + }, + { + "epoch": 3.5055714091588532, + "grad_norm": 0.1952316164970398, + "learning_rate": 5.619989128957739e-05, + "loss": 4.1522, + "step": 51595 + }, + { + "epoch": 3.505911129229515, + "grad_norm": 0.5778675079345703, + "learning_rate": 5.6195644788694124e-05, + "loss": 3.5336, + "step": 51600 + }, + { + "epoch": 3.5062508493001765, + "grad_norm": 0.167790949344635, + "learning_rate": 5.6191398287810845e-05, + "loss": 3.5372, + "step": 51605 + }, + { + "epoch": 3.5065905693708386, + "grad_norm": 0.16954532265663147, + "learning_rate": 5.618715178692757e-05, + "loss": 3.6335, + "step": 51610 + }, + { + "epoch": 3.5069302894415, + "grad_norm": 0.15507403016090393, + "learning_rate": 5.618290528604431e-05, + "loss": 3.4446, + "step": 51615 + }, + { + "epoch": 3.507270009512162, + "grad_norm": 0.19764494895935059, + "learning_rate": 5.617865878516103e-05, + "loss": 4.0453, + "step": 51620 + }, + { + "epoch": 3.507609729582824, + "grad_norm": 0.25819310545921326, + "learning_rate": 5.617441228427776e-05, + "loss": 4.101, + "step": 51625 + }, + { + "epoch": 3.5079494496534855, + "grad_norm": 0.17068545520305634, + "learning_rate": 5.617016578339449e-05, + "loss": 3.795, + "step": 51630 + }, + { + "epoch": 3.508289169724147, + "grad_norm": 0.16675299406051636, + "learning_rate": 5.616591928251121e-05, + "loss": 3.7625, + "step": 51635 + }, + { + "epoch": 3.5086288897948092, + "grad_norm": 0.19400376081466675, + "learning_rate": 5.616167278162794e-05, + "loss": 4.1157, + "step": 51640 + }, + { + "epoch": 3.508968609865471, + "grad_norm": 0.8194607496261597, + "learning_rate": 5.6157426280744676e-05, + "loss": 3.9955, + "step": 51645 + }, + { + "epoch": 3.5093083299361325, + "grad_norm": 0.18930478394031525, + "learning_rate": 5.61531797798614e-05, + "loss": 4.047, + "step": 51650 + }, + { + "epoch": 3.5096480500067946, + "grad_norm": 0.14377401769161224, + "learning_rate": 5.6148933278978125e-05, + "loss": 3.7158, + "step": 51655 + }, + { + "epoch": 3.509987770077456, + "grad_norm": 0.16586898267269135, + "learning_rate": 5.614468677809486e-05, + "loss": 3.7737, + "step": 51660 + }, + { + "epoch": 3.510327490148118, + "grad_norm": 0.18540744483470917, + "learning_rate": 5.614044027721158e-05, + "loss": 3.6893, + "step": 51665 + }, + { + "epoch": 3.51066721021878, + "grad_norm": 0.8170579671859741, + "learning_rate": 5.61361937763283e-05, + "loss": 3.9413, + "step": 51670 + }, + { + "epoch": 3.5110069302894416, + "grad_norm": 0.15367090702056885, + "learning_rate": 5.613194727544504e-05, + "loss": 3.9729, + "step": 51675 + }, + { + "epoch": 3.511346650360103, + "grad_norm": 0.1718626767396927, + "learning_rate": 5.6127700774561765e-05, + "loss": 3.776, + "step": 51680 + }, + { + "epoch": 3.5116863704307653, + "grad_norm": 0.14041517674922943, + "learning_rate": 5.6123454273678486e-05, + "loss": 3.8582, + "step": 51685 + }, + { + "epoch": 3.512026090501427, + "grad_norm": 0.41119974851608276, + "learning_rate": 5.611920777279522e-05, + "loss": 3.7529, + "step": 51690 + }, + { + "epoch": 3.5123658105720885, + "grad_norm": 0.14799998700618744, + "learning_rate": 5.611496127191195e-05, + "loss": 4.0208, + "step": 51695 + }, + { + "epoch": 3.5127055306427506, + "grad_norm": 0.4601191580295563, + "learning_rate": 5.611071477102867e-05, + "loss": 3.9832, + "step": 51700 + }, + { + "epoch": 3.5130452507134122, + "grad_norm": 0.16077138483524323, + "learning_rate": 5.6106468270145405e-05, + "loss": 3.6077, + "step": 51705 + }, + { + "epoch": 3.513384970784074, + "grad_norm": 0.2917288541793823, + "learning_rate": 5.610222176926213e-05, + "loss": 4.0108, + "step": 51710 + }, + { + "epoch": 3.513724690854736, + "grad_norm": 0.15853305160999298, + "learning_rate": 5.6097975268378854e-05, + "loss": 3.7358, + "step": 51715 + }, + { + "epoch": 3.5140644109253976, + "grad_norm": 0.1844368875026703, + "learning_rate": 5.609372876749559e-05, + "loss": 3.9909, + "step": 51720 + }, + { + "epoch": 3.514404130996059, + "grad_norm": 0.13545849919319153, + "learning_rate": 5.608948226661232e-05, + "loss": 3.6784, + "step": 51725 + }, + { + "epoch": 3.5147438510667213, + "grad_norm": 0.18203890323638916, + "learning_rate": 5.608523576572904e-05, + "loss": 3.6638, + "step": 51730 + }, + { + "epoch": 3.515083571137383, + "grad_norm": 0.21825523674488068, + "learning_rate": 5.608098926484577e-05, + "loss": 3.8369, + "step": 51735 + }, + { + "epoch": 3.5154232912080445, + "grad_norm": 0.17766614258289337, + "learning_rate": 5.6076742763962494e-05, + "loss": 3.8006, + "step": 51740 + }, + { + "epoch": 3.5157630112787066, + "grad_norm": 0.22705502808094025, + "learning_rate": 5.607249626307922e-05, + "loss": 4.0127, + "step": 51745 + }, + { + "epoch": 3.5161027313493682, + "grad_norm": 0.41350436210632324, + "learning_rate": 5.606824976219596e-05, + "loss": 4.0178, + "step": 51750 + }, + { + "epoch": 3.51644245142003, + "grad_norm": 0.15920375287532806, + "learning_rate": 5.606400326131268e-05, + "loss": 3.8545, + "step": 51755 + }, + { + "epoch": 3.516782171490692, + "grad_norm": 0.15305782854557037, + "learning_rate": 5.6059756760429406e-05, + "loss": 3.9324, + "step": 51760 + }, + { + "epoch": 3.5171218915613536, + "grad_norm": 0.17225515842437744, + "learning_rate": 5.605551025954614e-05, + "loss": 3.8503, + "step": 51765 + }, + { + "epoch": 3.517461611632015, + "grad_norm": 0.18635383248329163, + "learning_rate": 5.605126375866286e-05, + "loss": 3.8143, + "step": 51770 + }, + { + "epoch": 3.517801331702677, + "grad_norm": 0.17625826597213745, + "learning_rate": 5.604701725777959e-05, + "loss": 3.8677, + "step": 51775 + }, + { + "epoch": 3.518141051773339, + "grad_norm": 0.18920139968395233, + "learning_rate": 5.6042770756896325e-05, + "loss": 3.8888, + "step": 51780 + }, + { + "epoch": 3.5184807718440005, + "grad_norm": 0.3009730875492096, + "learning_rate": 5.6038524256013046e-05, + "loss": 3.7242, + "step": 51785 + }, + { + "epoch": 3.518820491914662, + "grad_norm": 0.19381940364837646, + "learning_rate": 5.6034277755129774e-05, + "loss": 4.0011, + "step": 51790 + }, + { + "epoch": 3.5191602119853242, + "grad_norm": 0.31642165780067444, + "learning_rate": 5.603003125424651e-05, + "loss": 3.6893, + "step": 51795 + }, + { + "epoch": 3.519499932055986, + "grad_norm": 0.177153080701828, + "learning_rate": 5.602578475336323e-05, + "loss": 3.6982, + "step": 51800 + }, + { + "epoch": 3.5198396521266475, + "grad_norm": 0.16047848761081696, + "learning_rate": 5.602153825247995e-05, + "loss": 4.0759, + "step": 51805 + }, + { + "epoch": 3.520179372197309, + "grad_norm": 0.14991790056228638, + "learning_rate": 5.601729175159669e-05, + "loss": 4.0202, + "step": 51810 + }, + { + "epoch": 3.520519092267971, + "grad_norm": 0.18080776929855347, + "learning_rate": 5.6013045250713415e-05, + "loss": 3.8297, + "step": 51815 + }, + { + "epoch": 3.520858812338633, + "grad_norm": 0.17793801426887512, + "learning_rate": 5.6008798749830136e-05, + "loss": 3.6627, + "step": 51820 + }, + { + "epoch": 3.5211985324092945, + "grad_norm": 0.1763465851545334, + "learning_rate": 5.600455224894687e-05, + "loss": 3.7519, + "step": 51825 + }, + { + "epoch": 3.5215382524799566, + "grad_norm": 0.23076628148555756, + "learning_rate": 5.60003057480636e-05, + "loss": 3.9102, + "step": 51830 + }, + { + "epoch": 3.521877972550618, + "grad_norm": 0.15563467144966125, + "learning_rate": 5.599605924718032e-05, + "loss": 3.9893, + "step": 51835 + }, + { + "epoch": 3.52221769262128, + "grad_norm": 0.21223947405815125, + "learning_rate": 5.5991812746297055e-05, + "loss": 3.6164, + "step": 51840 + }, + { + "epoch": 3.522557412691942, + "grad_norm": 0.20159593224525452, + "learning_rate": 5.598756624541378e-05, + "loss": 3.7736, + "step": 51845 + }, + { + "epoch": 3.5228971327626035, + "grad_norm": 1.9499975442886353, + "learning_rate": 5.5983319744530504e-05, + "loss": 3.8636, + "step": 51850 + }, + { + "epoch": 3.523236852833265, + "grad_norm": 0.17690607905387878, + "learning_rate": 5.597907324364724e-05, + "loss": 3.7461, + "step": 51855 + }, + { + "epoch": 3.5235765729039272, + "grad_norm": 0.19060643017292023, + "learning_rate": 5.5974826742763967e-05, + "loss": 3.9397, + "step": 51860 + }, + { + "epoch": 3.523916292974589, + "grad_norm": 0.1727701723575592, + "learning_rate": 5.597058024188069e-05, + "loss": 3.8918, + "step": 51865 + }, + { + "epoch": 3.5242560130452505, + "grad_norm": 0.1628095656633377, + "learning_rate": 5.596633374099742e-05, + "loss": 3.636, + "step": 51870 + }, + { + "epoch": 3.5245957331159126, + "grad_norm": 0.15602366626262665, + "learning_rate": 5.5962087240114144e-05, + "loss": 3.8992, + "step": 51875 + }, + { + "epoch": 3.524935453186574, + "grad_norm": 0.2031722515821457, + "learning_rate": 5.595784073923087e-05, + "loss": 3.6085, + "step": 51880 + }, + { + "epoch": 3.525275173257236, + "grad_norm": 0.33309125900268555, + "learning_rate": 5.595359423834761e-05, + "loss": 3.7356, + "step": 51885 + }, + { + "epoch": 3.525614893327898, + "grad_norm": 0.15568780899047852, + "learning_rate": 5.594934773746433e-05, + "loss": 3.9826, + "step": 51890 + }, + { + "epoch": 3.5259546133985595, + "grad_norm": 0.20518861711025238, + "learning_rate": 5.5945101236581056e-05, + "loss": 3.8792, + "step": 51895 + }, + { + "epoch": 3.526294333469221, + "grad_norm": 0.34832391142845154, + "learning_rate": 5.594085473569779e-05, + "loss": 3.8201, + "step": 51900 + }, + { + "epoch": 3.5266340535398832, + "grad_norm": 0.1776844561100006, + "learning_rate": 5.593660823481451e-05, + "loss": 3.9134, + "step": 51905 + }, + { + "epoch": 3.526973773610545, + "grad_norm": 0.16373398900032043, + "learning_rate": 5.593236173393124e-05, + "loss": 3.9196, + "step": 51910 + }, + { + "epoch": 3.5273134936812065, + "grad_norm": 0.7757550477981567, + "learning_rate": 5.5928115233047975e-05, + "loss": 4.0964, + "step": 51915 + }, + { + "epoch": 3.5276532137518686, + "grad_norm": 0.13431750237941742, + "learning_rate": 5.5923868732164696e-05, + "loss": 3.6388, + "step": 51920 + }, + { + "epoch": 3.52799293382253, + "grad_norm": 0.20551088452339172, + "learning_rate": 5.591962223128143e-05, + "loss": 3.9045, + "step": 51925 + }, + { + "epoch": 3.528332653893192, + "grad_norm": 0.16802501678466797, + "learning_rate": 5.591537573039816e-05, + "loss": 4.0819, + "step": 51930 + }, + { + "epoch": 3.528672373963854, + "grad_norm": 0.1754651665687561, + "learning_rate": 5.591112922951488e-05, + "loss": 3.7228, + "step": 51935 + }, + { + "epoch": 3.5290120940345155, + "grad_norm": 0.18006788194179535, + "learning_rate": 5.5906882728631615e-05, + "loss": 3.7794, + "step": 51940 + }, + { + "epoch": 3.529351814105177, + "grad_norm": 0.21341414749622345, + "learning_rate": 5.590263622774834e-05, + "loss": 3.9604, + "step": 51945 + }, + { + "epoch": 3.5296915341758393, + "grad_norm": 0.2207183986902237, + "learning_rate": 5.5898389726865064e-05, + "loss": 3.7108, + "step": 51950 + }, + { + "epoch": 3.530031254246501, + "grad_norm": 0.20541371405124664, + "learning_rate": 5.58941432259818e-05, + "loss": 3.7229, + "step": 51955 + }, + { + "epoch": 3.5303709743171625, + "grad_norm": 0.17476500570774078, + "learning_rate": 5.588989672509852e-05, + "loss": 3.9139, + "step": 51960 + }, + { + "epoch": 3.5307106943878246, + "grad_norm": 0.17656251788139343, + "learning_rate": 5.588565022421525e-05, + "loss": 3.7836, + "step": 51965 + }, + { + "epoch": 3.531050414458486, + "grad_norm": 0.1381717473268509, + "learning_rate": 5.588140372333198e-05, + "loss": 3.9538, + "step": 51970 + }, + { + "epoch": 3.531390134529148, + "grad_norm": 0.23163440823554993, + "learning_rate": 5.5877157222448704e-05, + "loss": 3.5392, + "step": 51975 + }, + { + "epoch": 3.53172985459981, + "grad_norm": 0.1856715977191925, + "learning_rate": 5.587291072156543e-05, + "loss": 3.6674, + "step": 51980 + }, + { + "epoch": 3.5320695746704716, + "grad_norm": 0.1628432273864746, + "learning_rate": 5.586866422068217e-05, + "loss": 3.9881, + "step": 51985 + }, + { + "epoch": 3.532409294741133, + "grad_norm": 0.20968997478485107, + "learning_rate": 5.586441771979889e-05, + "loss": 3.8582, + "step": 51990 + }, + { + "epoch": 3.5327490148117953, + "grad_norm": 0.4512220621109009, + "learning_rate": 5.5860171218915616e-05, + "loss": 3.8823, + "step": 51995 + }, + { + "epoch": 3.533088734882457, + "grad_norm": 0.22229443490505219, + "learning_rate": 5.585592471803235e-05, + "loss": 3.9639, + "step": 52000 + }, + { + "epoch": 3.5334284549531185, + "grad_norm": 0.16925707459449768, + "learning_rate": 5.585167821714907e-05, + "loss": 4.1029, + "step": 52005 + }, + { + "epoch": 3.5337681750237806, + "grad_norm": 0.20194460451602936, + "learning_rate": 5.58474317162658e-05, + "loss": 3.569, + "step": 52010 + }, + { + "epoch": 3.5341078950944422, + "grad_norm": 0.20574769377708435, + "learning_rate": 5.5843185215382535e-05, + "loss": 3.9264, + "step": 52015 + }, + { + "epoch": 3.534447615165104, + "grad_norm": 0.1998988687992096, + "learning_rate": 5.5838938714499256e-05, + "loss": 3.8928, + "step": 52020 + }, + { + "epoch": 3.534787335235766, + "grad_norm": 0.1648060828447342, + "learning_rate": 5.583469221361598e-05, + "loss": 3.9115, + "step": 52025 + }, + { + "epoch": 3.5351270553064276, + "grad_norm": 0.15645796060562134, + "learning_rate": 5.583044571273271e-05, + "loss": 4.0973, + "step": 52030 + }, + { + "epoch": 3.535466775377089, + "grad_norm": 0.21993142366409302, + "learning_rate": 5.582619921184944e-05, + "loss": 4.0548, + "step": 52035 + }, + { + "epoch": 3.5358064954477513, + "grad_norm": 0.24246451258659363, + "learning_rate": 5.582195271096616e-05, + "loss": 3.9056, + "step": 52040 + }, + { + "epoch": 3.536146215518413, + "grad_norm": 0.1732710748910904, + "learning_rate": 5.5817706210082896e-05, + "loss": 4.0541, + "step": 52045 + }, + { + "epoch": 3.5364859355890745, + "grad_norm": 0.17015594244003296, + "learning_rate": 5.5813459709199624e-05, + "loss": 4.0223, + "step": 52050 + }, + { + "epoch": 3.5368256556597366, + "grad_norm": 2.9471046924591064, + "learning_rate": 5.5809213208316345e-05, + "loss": 3.9196, + "step": 52055 + }, + { + "epoch": 3.5371653757303982, + "grad_norm": 0.19881628453731537, + "learning_rate": 5.580496670743308e-05, + "loss": 3.8606, + "step": 52060 + }, + { + "epoch": 3.53750509580106, + "grad_norm": 0.2667635381221771, + "learning_rate": 5.580072020654981e-05, + "loss": 3.8143, + "step": 52065 + }, + { + "epoch": 3.537844815871722, + "grad_norm": 0.4430239796638489, + "learning_rate": 5.579647370566653e-05, + "loss": 3.6697, + "step": 52070 + }, + { + "epoch": 3.5381845359423836, + "grad_norm": 0.1758584827184677, + "learning_rate": 5.5792227204783264e-05, + "loss": 3.9561, + "step": 52075 + }, + { + "epoch": 3.538524256013045, + "grad_norm": 0.1650838553905487, + "learning_rate": 5.578798070389999e-05, + "loss": 3.9808, + "step": 52080 + }, + { + "epoch": 3.5388639760837073, + "grad_norm": 0.5874785780906677, + "learning_rate": 5.578373420301671e-05, + "loss": 3.7423, + "step": 52085 + }, + { + "epoch": 3.539203696154369, + "grad_norm": 0.21253137290477753, + "learning_rate": 5.577948770213345e-05, + "loss": 3.7552, + "step": 52090 + }, + { + "epoch": 3.5395434162250305, + "grad_norm": 0.21686896681785583, + "learning_rate": 5.577524120125017e-05, + "loss": 4.1198, + "step": 52095 + }, + { + "epoch": 3.5398831362956926, + "grad_norm": 0.15620170533657074, + "learning_rate": 5.57709947003669e-05, + "loss": 3.8919, + "step": 52100 + }, + { + "epoch": 3.5402228563663543, + "grad_norm": 0.12956871092319489, + "learning_rate": 5.576674819948363e-05, + "loss": 3.9663, + "step": 52105 + }, + { + "epoch": 3.540562576437016, + "grad_norm": 0.15539605915546417, + "learning_rate": 5.5762501698600353e-05, + "loss": 3.7254, + "step": 52110 + }, + { + "epoch": 3.5409022965076775, + "grad_norm": 0.17913620173931122, + "learning_rate": 5.575825519771708e-05, + "loss": 4.0724, + "step": 52115 + }, + { + "epoch": 3.5412420165783396, + "grad_norm": 5.171310901641846, + "learning_rate": 5.5754008696833816e-05, + "loss": 3.6572, + "step": 52120 + }, + { + "epoch": 3.541581736649001, + "grad_norm": 0.1637914478778839, + "learning_rate": 5.574976219595054e-05, + "loss": 3.7751, + "step": 52125 + }, + { + "epoch": 3.541921456719663, + "grad_norm": 0.1795250028371811, + "learning_rate": 5.5745515695067265e-05, + "loss": 3.9534, + "step": 52130 + }, + { + "epoch": 3.542261176790325, + "grad_norm": 0.1815946400165558, + "learning_rate": 5.5741269194184e-05, + "loss": 3.9305, + "step": 52135 + }, + { + "epoch": 3.5426008968609866, + "grad_norm": 0.21541321277618408, + "learning_rate": 5.573702269330072e-05, + "loss": 3.7688, + "step": 52140 + }, + { + "epoch": 3.542940616931648, + "grad_norm": 0.1670183688402176, + "learning_rate": 5.573277619241745e-05, + "loss": 3.9036, + "step": 52145 + }, + { + "epoch": 3.54328033700231, + "grad_norm": 0.15355919301509857, + "learning_rate": 5.5728529691534184e-05, + "loss": 3.7964, + "step": 52150 + }, + { + "epoch": 3.543620057072972, + "grad_norm": 0.15925423800945282, + "learning_rate": 5.572513249082756e-05, + "loss": 4.0777, + "step": 52155 + }, + { + "epoch": 3.5439597771436335, + "grad_norm": 0.1928146630525589, + "learning_rate": 5.5720885989944295e-05, + "loss": 3.9977, + "step": 52160 + }, + { + "epoch": 3.544299497214295, + "grad_norm": 0.17630620300769806, + "learning_rate": 5.5716639489061016e-05, + "loss": 3.8161, + "step": 52165 + }, + { + "epoch": 3.5446392172849572, + "grad_norm": 0.21646162867546082, + "learning_rate": 5.5712392988177744e-05, + "loss": 3.7393, + "step": 52170 + }, + { + "epoch": 3.544978937355619, + "grad_norm": 0.19888626039028168, + "learning_rate": 5.570814648729448e-05, + "loss": 3.6712, + "step": 52175 + }, + { + "epoch": 3.5453186574262805, + "grad_norm": 0.5497890114784241, + "learning_rate": 5.57038999864112e-05, + "loss": 3.8526, + "step": 52180 + }, + { + "epoch": 3.5456583774969426, + "grad_norm": 0.15010856091976166, + "learning_rate": 5.569965348552793e-05, + "loss": 3.4625, + "step": 52185 + }, + { + "epoch": 3.545998097567604, + "grad_norm": 0.16280561685562134, + "learning_rate": 5.569540698464466e-05, + "loss": 3.8958, + "step": 52190 + }, + { + "epoch": 3.546337817638266, + "grad_norm": 0.15331284701824188, + "learning_rate": 5.5691160483761384e-05, + "loss": 3.7471, + "step": 52195 + }, + { + "epoch": 3.546677537708928, + "grad_norm": 0.27299603819847107, + "learning_rate": 5.5686913982878105e-05, + "loss": 3.7483, + "step": 52200 + }, + { + "epoch": 3.5470172577795895, + "grad_norm": 0.24320168793201447, + "learning_rate": 5.568266748199485e-05, + "loss": 3.7863, + "step": 52205 + }, + { + "epoch": 3.547356977850251, + "grad_norm": 0.25218337774276733, + "learning_rate": 5.567842098111157e-05, + "loss": 3.8466, + "step": 52210 + }, + { + "epoch": 3.5476966979209132, + "grad_norm": 0.1837984025478363, + "learning_rate": 5.567417448022829e-05, + "loss": 3.9633, + "step": 52215 + }, + { + "epoch": 3.548036417991575, + "grad_norm": 0.1824839860200882, + "learning_rate": 5.5669927979345024e-05, + "loss": 3.7699, + "step": 52220 + }, + { + "epoch": 3.5483761380622365, + "grad_norm": 0.20133045315742493, + "learning_rate": 5.566568147846175e-05, + "loss": 3.6918, + "step": 52225 + }, + { + "epoch": 3.5487158581328986, + "grad_norm": 0.1793437898159027, + "learning_rate": 5.566143497757847e-05, + "loss": 3.8255, + "step": 52230 + }, + { + "epoch": 3.54905557820356, + "grad_norm": 0.15268026292324066, + "learning_rate": 5.565718847669521e-05, + "loss": 3.8705, + "step": 52235 + }, + { + "epoch": 3.549395298274222, + "grad_norm": 0.20451825857162476, + "learning_rate": 5.5652941975811936e-05, + "loss": 3.802, + "step": 52240 + }, + { + "epoch": 3.549735018344884, + "grad_norm": 0.22261816263198853, + "learning_rate": 5.564869547492866e-05, + "loss": 3.9299, + "step": 52245 + }, + { + "epoch": 3.5500747384155455, + "grad_norm": 0.17490920424461365, + "learning_rate": 5.564444897404539e-05, + "loss": 3.852, + "step": 52250 + }, + { + "epoch": 3.550414458486207, + "grad_norm": 0.1490868330001831, + "learning_rate": 5.564020247316212e-05, + "loss": 3.7731, + "step": 52255 + }, + { + "epoch": 3.5507541785568693, + "grad_norm": 0.21903176605701447, + "learning_rate": 5.563595597227884e-05, + "loss": 3.63, + "step": 52260 + }, + { + "epoch": 3.551093898627531, + "grad_norm": 0.17967626452445984, + "learning_rate": 5.5631709471395576e-05, + "loss": 3.8828, + "step": 52265 + }, + { + "epoch": 3.5514336186981925, + "grad_norm": 0.41057491302490234, + "learning_rate": 5.5627462970512304e-05, + "loss": 3.7982, + "step": 52270 + }, + { + "epoch": 3.5517733387688546, + "grad_norm": 0.1434888392686844, + "learning_rate": 5.5623216469629025e-05, + "loss": 3.9413, + "step": 52275 + }, + { + "epoch": 3.5521130588395162, + "grad_norm": 0.20448878407478333, + "learning_rate": 5.561896996874576e-05, + "loss": 3.8173, + "step": 52280 + }, + { + "epoch": 3.552452778910178, + "grad_norm": 0.21833164989948273, + "learning_rate": 5.561472346786248e-05, + "loss": 3.9003, + "step": 52285 + }, + { + "epoch": 3.55279249898084, + "grad_norm": 0.18172873556613922, + "learning_rate": 5.561047696697921e-05, + "loss": 3.937, + "step": 52290 + }, + { + "epoch": 3.5531322190515016, + "grad_norm": 0.19501811265945435, + "learning_rate": 5.5606230466095944e-05, + "loss": 4.147, + "step": 52295 + }, + { + "epoch": 3.553471939122163, + "grad_norm": 0.16923728585243225, + "learning_rate": 5.5601983965212665e-05, + "loss": 3.5491, + "step": 52300 + }, + { + "epoch": 3.5538116591928253, + "grad_norm": 0.1850777417421341, + "learning_rate": 5.559773746432939e-05, + "loss": 4.0406, + "step": 52305 + }, + { + "epoch": 3.554151379263487, + "grad_norm": 0.7712956666946411, + "learning_rate": 5.559349096344613e-05, + "loss": 3.6961, + "step": 52310 + }, + { + "epoch": 3.5544910993341485, + "grad_norm": 0.14933891594409943, + "learning_rate": 5.558924446256285e-05, + "loss": 3.7154, + "step": 52315 + }, + { + "epoch": 3.5548308194048106, + "grad_norm": 0.17732787132263184, + "learning_rate": 5.558499796167958e-05, + "loss": 3.8449, + "step": 52320 + }, + { + "epoch": 3.5551705394754722, + "grad_norm": 0.6687011122703552, + "learning_rate": 5.558075146079631e-05, + "loss": 3.9869, + "step": 52325 + }, + { + "epoch": 3.555510259546134, + "grad_norm": 0.20380020141601562, + "learning_rate": 5.557650495991303e-05, + "loss": 3.9074, + "step": 52330 + }, + { + "epoch": 3.555849979616796, + "grad_norm": 0.20987163484096527, + "learning_rate": 5.5572258459029755e-05, + "loss": 3.9733, + "step": 52335 + }, + { + "epoch": 3.5561896996874576, + "grad_norm": 0.2821095585823059, + "learning_rate": 5.5568011958146496e-05, + "loss": 3.7461, + "step": 52340 + }, + { + "epoch": 3.556529419758119, + "grad_norm": 0.2076195627450943, + "learning_rate": 5.556376545726322e-05, + "loss": 3.5994, + "step": 52345 + }, + { + "epoch": 3.5568691398287813, + "grad_norm": 0.17863300442695618, + "learning_rate": 5.555951895637994e-05, + "loss": 3.78, + "step": 52350 + }, + { + "epoch": 3.557208859899443, + "grad_norm": 0.22603364288806915, + "learning_rate": 5.5555272455496673e-05, + "loss": 3.8904, + "step": 52355 + }, + { + "epoch": 3.5575485799701045, + "grad_norm": 0.15186859667301178, + "learning_rate": 5.55510259546134e-05, + "loss": 3.9502, + "step": 52360 + }, + { + "epoch": 3.5578883000407666, + "grad_norm": 0.13966453075408936, + "learning_rate": 5.554677945373012e-05, + "loss": 3.7771, + "step": 52365 + }, + { + "epoch": 3.5582280201114282, + "grad_norm": 0.19145803153514862, + "learning_rate": 5.554253295284686e-05, + "loss": 3.7615, + "step": 52370 + }, + { + "epoch": 3.55856774018209, + "grad_norm": 0.18369613587856293, + "learning_rate": 5.5538286451963585e-05, + "loss": 3.7628, + "step": 52375 + }, + { + "epoch": 3.558907460252752, + "grad_norm": 0.231706440448761, + "learning_rate": 5.553403995108031e-05, + "loss": 3.9089, + "step": 52380 + }, + { + "epoch": 3.5592471803234136, + "grad_norm": 0.15969930589199066, + "learning_rate": 5.552979345019704e-05, + "loss": 3.8627, + "step": 52385 + }, + { + "epoch": 3.559586900394075, + "grad_norm": 0.2355836182832718, + "learning_rate": 5.552554694931377e-05, + "loss": 3.8361, + "step": 52390 + }, + { + "epoch": 3.5599266204647373, + "grad_norm": 0.22990074753761292, + "learning_rate": 5.552130044843049e-05, + "loss": 3.7935, + "step": 52395 + }, + { + "epoch": 3.560266340535399, + "grad_norm": 0.24566608667373657, + "learning_rate": 5.5517053947547225e-05, + "loss": 3.8424, + "step": 52400 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.1807066947221756, + "learning_rate": 5.5512807446663953e-05, + "loss": 3.7998, + "step": 52405 + }, + { + "epoch": 3.5609457806767226, + "grad_norm": 0.16097185015678406, + "learning_rate": 5.5508560945780675e-05, + "loss": 4.0271, + "step": 52410 + }, + { + "epoch": 3.5612855007473843, + "grad_norm": 0.17211374640464783, + "learning_rate": 5.550431444489741e-05, + "loss": 3.8114, + "step": 52415 + }, + { + "epoch": 3.561625220818046, + "grad_norm": 0.1502368003129959, + "learning_rate": 5.550006794401413e-05, + "loss": 3.9128, + "step": 52420 + }, + { + "epoch": 3.561964940888708, + "grad_norm": 0.15664361417293549, + "learning_rate": 5.549582144313086e-05, + "loss": 3.9293, + "step": 52425 + }, + { + "epoch": 3.5623046609593696, + "grad_norm": 0.163893923163414, + "learning_rate": 5.5491574942247594e-05, + "loss": 3.8282, + "step": 52430 + }, + { + "epoch": 3.5626443810300312, + "grad_norm": 0.19202589988708496, + "learning_rate": 5.5487328441364315e-05, + "loss": 4.1594, + "step": 52435 + }, + { + "epoch": 3.5629841011006933, + "grad_norm": 0.21701721847057343, + "learning_rate": 5.548308194048104e-05, + "loss": 3.5492, + "step": 52440 + }, + { + "epoch": 3.563323821171355, + "grad_norm": 0.20184285938739777, + "learning_rate": 5.547883543959778e-05, + "loss": 3.892, + "step": 52445 + }, + { + "epoch": 3.5636635412420166, + "grad_norm": 0.24523420631885529, + "learning_rate": 5.54745889387145e-05, + "loss": 3.7367, + "step": 52450 + }, + { + "epoch": 3.564003261312678, + "grad_norm": 0.2559781074523926, + "learning_rate": 5.547034243783123e-05, + "loss": 3.801, + "step": 52455 + }, + { + "epoch": 3.5643429813833403, + "grad_norm": 0.1561535894870758, + "learning_rate": 5.546609593694796e-05, + "loss": 3.887, + "step": 52460 + }, + { + "epoch": 3.564682701454002, + "grad_norm": 0.18009065091609955, + "learning_rate": 5.546184943606468e-05, + "loss": 3.7972, + "step": 52465 + }, + { + "epoch": 3.5650224215246635, + "grad_norm": 0.23871923983097076, + "learning_rate": 5.545760293518142e-05, + "loss": 3.7366, + "step": 52470 + }, + { + "epoch": 3.5653621415953256, + "grad_norm": 0.17841434478759766, + "learning_rate": 5.5453356434298146e-05, + "loss": 3.8526, + "step": 52475 + }, + { + "epoch": 3.5657018616659872, + "grad_norm": 0.18217985332012177, + "learning_rate": 5.544910993341487e-05, + "loss": 3.84, + "step": 52480 + }, + { + "epoch": 3.566041581736649, + "grad_norm": 0.164532870054245, + "learning_rate": 5.54448634325316e-05, + "loss": 3.9508, + "step": 52485 + }, + { + "epoch": 3.5663813018073105, + "grad_norm": 0.21327534317970276, + "learning_rate": 5.544061693164832e-05, + "loss": 3.8696, + "step": 52490 + }, + { + "epoch": 3.5667210218779726, + "grad_norm": 0.21465985476970673, + "learning_rate": 5.543637043076505e-05, + "loss": 3.9628, + "step": 52495 + }, + { + "epoch": 3.567060741948634, + "grad_norm": 0.2286617010831833, + "learning_rate": 5.5432123929881786e-05, + "loss": 3.7021, + "step": 52500 + }, + { + "epoch": 3.567400462019296, + "grad_norm": 0.15360532701015472, + "learning_rate": 5.542787742899851e-05, + "loss": 3.707, + "step": 52505 + }, + { + "epoch": 3.567740182089958, + "grad_norm": 0.14227953553199768, + "learning_rate": 5.5423630928115235e-05, + "loss": 3.7552, + "step": 52510 + }, + { + "epoch": 3.5680799021606195, + "grad_norm": 0.2221529483795166, + "learning_rate": 5.541938442723197e-05, + "loss": 3.5821, + "step": 52515 + }, + { + "epoch": 3.568419622231281, + "grad_norm": 0.1607007086277008, + "learning_rate": 5.541513792634869e-05, + "loss": 3.9291, + "step": 52520 + }, + { + "epoch": 3.5687593423019432, + "grad_norm": 0.38363534212112427, + "learning_rate": 5.541089142546542e-05, + "loss": 4.1729, + "step": 52525 + }, + { + "epoch": 3.569099062372605, + "grad_norm": 0.2266181856393814, + "learning_rate": 5.5406644924582154e-05, + "loss": 3.8573, + "step": 52530 + }, + { + "epoch": 3.5694387824432665, + "grad_norm": 0.2262057214975357, + "learning_rate": 5.5402398423698875e-05, + "loss": 3.5916, + "step": 52535 + }, + { + "epoch": 3.5697785025139286, + "grad_norm": 0.8949726223945618, + "learning_rate": 5.53981519228156e-05, + "loss": 4.1287, + "step": 52540 + }, + { + "epoch": 3.57011822258459, + "grad_norm": 3.279271125793457, + "learning_rate": 5.539390542193234e-05, + "loss": 3.8386, + "step": 52545 + }, + { + "epoch": 3.570457942655252, + "grad_norm": 0.17035193741321564, + "learning_rate": 5.538965892104906e-05, + "loss": 3.828, + "step": 52550 + }, + { + "epoch": 3.570797662725914, + "grad_norm": 0.19277377426624298, + "learning_rate": 5.538541242016578e-05, + "loss": 3.8784, + "step": 52555 + }, + { + "epoch": 3.5711373827965756, + "grad_norm": 0.1801813393831253, + "learning_rate": 5.5381165919282515e-05, + "loss": 3.7691, + "step": 52560 + }, + { + "epoch": 3.571477102867237, + "grad_norm": 0.15036074817180634, + "learning_rate": 5.537691941839924e-05, + "loss": 3.6842, + "step": 52565 + }, + { + "epoch": 3.5718168229378993, + "grad_norm": 0.14409419894218445, + "learning_rate": 5.5372672917515964e-05, + "loss": 3.7983, + "step": 52570 + }, + { + "epoch": 3.572156543008561, + "grad_norm": 0.28238558769226074, + "learning_rate": 5.53684264166327e-05, + "loss": 4.0761, + "step": 52575 + }, + { + "epoch": 3.5724962630792225, + "grad_norm": 0.1694146692752838, + "learning_rate": 5.536417991574943e-05, + "loss": 3.9705, + "step": 52580 + }, + { + "epoch": 3.5728359831498846, + "grad_norm": 0.18311430513858795, + "learning_rate": 5.535993341486615e-05, + "loss": 4.0496, + "step": 52585 + }, + { + "epoch": 3.5731757032205462, + "grad_norm": 0.552855908870697, + "learning_rate": 5.535568691398288e-05, + "loss": 3.9689, + "step": 52590 + }, + { + "epoch": 3.573515423291208, + "grad_norm": 0.21972528100013733, + "learning_rate": 5.535144041309961e-05, + "loss": 3.6679, + "step": 52595 + }, + { + "epoch": 3.57385514336187, + "grad_norm": 0.23290085792541504, + "learning_rate": 5.534719391221633e-05, + "loss": 3.9349, + "step": 52600 + }, + { + "epoch": 3.5741948634325316, + "grad_norm": 0.14714351296424866, + "learning_rate": 5.534294741133307e-05, + "loss": 3.7513, + "step": 52605 + }, + { + "epoch": 3.574534583503193, + "grad_norm": 0.2670232355594635, + "learning_rate": 5.5338700910449795e-05, + "loss": 4.0065, + "step": 52610 + }, + { + "epoch": 3.5748743035738553, + "grad_norm": 0.23238149285316467, + "learning_rate": 5.5334454409566516e-05, + "loss": 3.9328, + "step": 52615 + }, + { + "epoch": 3.575214023644517, + "grad_norm": 0.16267217695713043, + "learning_rate": 5.533020790868325e-05, + "loss": 3.8945, + "step": 52620 + }, + { + "epoch": 3.5755537437151785, + "grad_norm": 0.23338037729263306, + "learning_rate": 5.532596140779997e-05, + "loss": 3.9641, + "step": 52625 + }, + { + "epoch": 3.5758934637858406, + "grad_norm": 0.19484928250312805, + "learning_rate": 5.53217149069167e-05, + "loss": 4.0639, + "step": 52630 + }, + { + "epoch": 3.5762331838565022, + "grad_norm": 0.2051248550415039, + "learning_rate": 5.5317468406033435e-05, + "loss": 3.813, + "step": 52635 + }, + { + "epoch": 3.576572903927164, + "grad_norm": 0.17050917446613312, + "learning_rate": 5.5313221905150156e-05, + "loss": 3.7346, + "step": 52640 + }, + { + "epoch": 3.576912623997826, + "grad_norm": 0.19374027848243713, + "learning_rate": 5.5308975404266884e-05, + "loss": 3.6798, + "step": 52645 + }, + { + "epoch": 3.5772523440684876, + "grad_norm": 0.18451227247714996, + "learning_rate": 5.530472890338362e-05, + "loss": 4.1415, + "step": 52650 + }, + { + "epoch": 3.577592064139149, + "grad_norm": 0.2062918096780777, + "learning_rate": 5.530048240250034e-05, + "loss": 3.9055, + "step": 52655 + }, + { + "epoch": 3.5779317842098113, + "grad_norm": 0.18978002667427063, + "learning_rate": 5.529623590161707e-05, + "loss": 3.9097, + "step": 52660 + }, + { + "epoch": 3.578271504280473, + "grad_norm": 0.1882885843515396, + "learning_rate": 5.52919894007338e-05, + "loss": 3.6224, + "step": 52665 + }, + { + "epoch": 3.5786112243511345, + "grad_norm": 0.15890266001224518, + "learning_rate": 5.5287742899850524e-05, + "loss": 3.7969, + "step": 52670 + }, + { + "epoch": 3.5789509444217966, + "grad_norm": 0.23028595745563507, + "learning_rate": 5.528349639896725e-05, + "loss": 3.8312, + "step": 52675 + }, + { + "epoch": 3.5792906644924583, + "grad_norm": 0.14182989299297333, + "learning_rate": 5.527924989808399e-05, + "loss": 3.9176, + "step": 52680 + }, + { + "epoch": 3.57963038456312, + "grad_norm": 0.16338498890399933, + "learning_rate": 5.527500339720071e-05, + "loss": 3.8771, + "step": 52685 + }, + { + "epoch": 3.579970104633782, + "grad_norm": 0.176205113530159, + "learning_rate": 5.527075689631743e-05, + "loss": 3.9335, + "step": 52690 + }, + { + "epoch": 3.5803098247044436, + "grad_norm": 0.17354744672775269, + "learning_rate": 5.526651039543417e-05, + "loss": 3.9118, + "step": 52695 + }, + { + "epoch": 3.580649544775105, + "grad_norm": 0.14755932986736298, + "learning_rate": 5.526226389455089e-05, + "loss": 4.0763, + "step": 52700 + }, + { + "epoch": 3.5809892648457673, + "grad_norm": 0.18269991874694824, + "learning_rate": 5.5258017393667614e-05, + "loss": 3.7652, + "step": 52705 + }, + { + "epoch": 3.581328984916429, + "grad_norm": 0.1768389791250229, + "learning_rate": 5.525377089278435e-05, + "loss": 3.9411, + "step": 52710 + }, + { + "epoch": 3.5816687049870906, + "grad_norm": 0.15557627379894257, + "learning_rate": 5.5249524391901076e-05, + "loss": 3.9078, + "step": 52715 + }, + { + "epoch": 3.5820084250577526, + "grad_norm": 0.21192865073680878, + "learning_rate": 5.52452778910178e-05, + "loss": 3.9115, + "step": 52720 + }, + { + "epoch": 3.5823481451284143, + "grad_norm": 0.1718427836894989, + "learning_rate": 5.524103139013453e-05, + "loss": 3.8462, + "step": 52725 + }, + { + "epoch": 3.582687865199076, + "grad_norm": 0.16324599087238312, + "learning_rate": 5.523678488925126e-05, + "loss": 3.7025, + "step": 52730 + }, + { + "epoch": 3.583027585269738, + "grad_norm": 0.16907474398612976, + "learning_rate": 5.523253838836798e-05, + "loss": 3.9715, + "step": 52735 + }, + { + "epoch": 3.5833673053403996, + "grad_norm": 0.18899960815906525, + "learning_rate": 5.5228291887484716e-05, + "loss": 3.7289, + "step": 52740 + }, + { + "epoch": 3.5837070254110612, + "grad_norm": 0.16097725927829742, + "learning_rate": 5.5224045386601444e-05, + "loss": 3.7366, + "step": 52745 + }, + { + "epoch": 3.5840467454817233, + "grad_norm": 0.15161509811878204, + "learning_rate": 5.5219798885718166e-05, + "loss": 3.8335, + "step": 52750 + }, + { + "epoch": 3.584386465552385, + "grad_norm": 0.1514062136411667, + "learning_rate": 5.52155523848349e-05, + "loss": 3.8696, + "step": 52755 + }, + { + "epoch": 3.5847261856230466, + "grad_norm": 0.2065105438232422, + "learning_rate": 5.521130588395162e-05, + "loss": 4.0701, + "step": 52760 + }, + { + "epoch": 3.5850659056937086, + "grad_norm": 0.1764591634273529, + "learning_rate": 5.520705938306835e-05, + "loss": 3.7568, + "step": 52765 + }, + { + "epoch": 3.5854056257643703, + "grad_norm": 0.16367694735527039, + "learning_rate": 5.5202812882185085e-05, + "loss": 3.7888, + "step": 52770 + }, + { + "epoch": 3.585745345835032, + "grad_norm": 0.1915961354970932, + "learning_rate": 5.5198566381301806e-05, + "loss": 3.8703, + "step": 52775 + }, + { + "epoch": 3.586085065905694, + "grad_norm": 0.4823121726512909, + "learning_rate": 5.5194319880418534e-05, + "loss": 3.8376, + "step": 52780 + }, + { + "epoch": 3.5864247859763556, + "grad_norm": 0.20243681967258453, + "learning_rate": 5.519007337953527e-05, + "loss": 3.6063, + "step": 52785 + }, + { + "epoch": 3.5867645060470172, + "grad_norm": 0.14381764829158783, + "learning_rate": 5.518582687865199e-05, + "loss": 4.0659, + "step": 52790 + }, + { + "epoch": 3.587104226117679, + "grad_norm": 0.1819092333316803, + "learning_rate": 5.518158037776872e-05, + "loss": 3.8694, + "step": 52795 + }, + { + "epoch": 3.587443946188341, + "grad_norm": 0.13981111347675323, + "learning_rate": 5.517733387688545e-05, + "loss": 4.1384, + "step": 52800 + }, + { + "epoch": 3.5877836662590026, + "grad_norm": 0.18244700133800507, + "learning_rate": 5.5173087376002174e-05, + "loss": 3.7508, + "step": 52805 + }, + { + "epoch": 3.588123386329664, + "grad_norm": 0.17885202169418335, + "learning_rate": 5.516884087511891e-05, + "loss": 4.2428, + "step": 52810 + }, + { + "epoch": 3.5884631064003263, + "grad_norm": 0.18187908828258514, + "learning_rate": 5.5164594374235637e-05, + "loss": 3.9266, + "step": 52815 + }, + { + "epoch": 3.588802826470988, + "grad_norm": 0.21009112894535065, + "learning_rate": 5.516034787335236e-05, + "loss": 3.936, + "step": 52820 + }, + { + "epoch": 3.5891425465416495, + "grad_norm": 0.2077493965625763, + "learning_rate": 5.515610137246909e-05, + "loss": 3.9139, + "step": 52825 + }, + { + "epoch": 3.589482266612311, + "grad_norm": 0.23095422983169556, + "learning_rate": 5.515185487158582e-05, + "loss": 3.9692, + "step": 52830 + }, + { + "epoch": 3.5898219866829733, + "grad_norm": 0.19100074470043182, + "learning_rate": 5.514760837070254e-05, + "loss": 3.8405, + "step": 52835 + }, + { + "epoch": 3.590161706753635, + "grad_norm": 0.14535021781921387, + "learning_rate": 5.514336186981928e-05, + "loss": 4.0587, + "step": 52840 + }, + { + "epoch": 3.5905014268242965, + "grad_norm": 0.188767671585083, + "learning_rate": 5.5139115368936e-05, + "loss": 3.8139, + "step": 52845 + }, + { + "epoch": 3.5908411468949586, + "grad_norm": 0.17796753346920013, + "learning_rate": 5.5134868868052726e-05, + "loss": 3.8389, + "step": 52850 + }, + { + "epoch": 3.59118086696562, + "grad_norm": 0.21067515015602112, + "learning_rate": 5.513062236716946e-05, + "loss": 3.8417, + "step": 52855 + }, + { + "epoch": 3.591520587036282, + "grad_norm": 0.13971304893493652, + "learning_rate": 5.512637586628618e-05, + "loss": 3.7807, + "step": 52860 + }, + { + "epoch": 3.591860307106944, + "grad_norm": 0.40385541319847107, + "learning_rate": 5.512212936540291e-05, + "loss": 3.8293, + "step": 52865 + }, + { + "epoch": 3.5922000271776056, + "grad_norm": 0.3759704530239105, + "learning_rate": 5.5117882864519645e-05, + "loss": 3.8939, + "step": 52870 + }, + { + "epoch": 3.592539747248267, + "grad_norm": 0.804699182510376, + "learning_rate": 5.5113636363636366e-05, + "loss": 3.7374, + "step": 52875 + }, + { + "epoch": 3.5928794673189293, + "grad_norm": 0.17676286399364471, + "learning_rate": 5.5109389862753094e-05, + "loss": 3.8641, + "step": 52880 + }, + { + "epoch": 3.593219187389591, + "grad_norm": 0.1965838223695755, + "learning_rate": 5.510514336186983e-05, + "loss": 3.8618, + "step": 52885 + }, + { + "epoch": 3.5935589074602525, + "grad_norm": 0.1375679224729538, + "learning_rate": 5.510089686098655e-05, + "loss": 3.8444, + "step": 52890 + }, + { + "epoch": 3.5938986275309146, + "grad_norm": 0.16696564853191376, + "learning_rate": 5.509665036010328e-05, + "loss": 3.8311, + "step": 52895 + }, + { + "epoch": 3.5942383476015762, + "grad_norm": 0.9773825407028198, + "learning_rate": 5.509240385922001e-05, + "loss": 3.8642, + "step": 52900 + }, + { + "epoch": 3.594578067672238, + "grad_norm": 0.20245347917079926, + "learning_rate": 5.5088157358336734e-05, + "loss": 3.8775, + "step": 52905 + }, + { + "epoch": 3.5949177877429, + "grad_norm": 0.16926254332065582, + "learning_rate": 5.5083910857453455e-05, + "loss": 3.8739, + "step": 52910 + }, + { + "epoch": 3.5952575078135616, + "grad_norm": 0.17277945578098297, + "learning_rate": 5.507966435657019e-05, + "loss": 4.0237, + "step": 52915 + }, + { + "epoch": 3.595597227884223, + "grad_norm": 0.14194636046886444, + "learning_rate": 5.507541785568692e-05, + "loss": 3.8417, + "step": 52920 + }, + { + "epoch": 3.5959369479548853, + "grad_norm": 0.17879709601402283, + "learning_rate": 5.507117135480364e-05, + "loss": 3.5055, + "step": 52925 + }, + { + "epoch": 3.596276668025547, + "grad_norm": 0.2828560769557953, + "learning_rate": 5.5066924853920374e-05, + "loss": 3.8497, + "step": 52930 + }, + { + "epoch": 3.5966163880962085, + "grad_norm": 0.15865641832351685, + "learning_rate": 5.50626783530371e-05, + "loss": 3.7423, + "step": 52935 + }, + { + "epoch": 3.5969561081668706, + "grad_norm": 0.16915567219257355, + "learning_rate": 5.505843185215382e-05, + "loss": 3.8205, + "step": 52940 + }, + { + "epoch": 3.5972958282375322, + "grad_norm": 0.15816420316696167, + "learning_rate": 5.505418535127056e-05, + "loss": 3.7288, + "step": 52945 + }, + { + "epoch": 3.597635548308194, + "grad_norm": 0.19275885820388794, + "learning_rate": 5.5049938850387286e-05, + "loss": 3.8992, + "step": 52950 + }, + { + "epoch": 3.597975268378856, + "grad_norm": 0.21550561487674713, + "learning_rate": 5.504569234950401e-05, + "loss": 3.7555, + "step": 52955 + }, + { + "epoch": 3.5983149884495176, + "grad_norm": 0.15912853181362152, + "learning_rate": 5.504144584862074e-05, + "loss": 3.9714, + "step": 52960 + }, + { + "epoch": 3.598654708520179, + "grad_norm": 0.15486666560173035, + "learning_rate": 5.503719934773747e-05, + "loss": 3.8555, + "step": 52965 + }, + { + "epoch": 3.5989944285908413, + "grad_norm": 0.2237868756055832, + "learning_rate": 5.503295284685419e-05, + "loss": 3.78, + "step": 52970 + }, + { + "epoch": 3.599334148661503, + "grad_norm": 0.13689598441123962, + "learning_rate": 5.5028706345970926e-05, + "loss": 3.9667, + "step": 52975 + }, + { + "epoch": 3.5996738687321646, + "grad_norm": 0.2247312366962433, + "learning_rate": 5.502445984508765e-05, + "loss": 3.8623, + "step": 52980 + }, + { + "epoch": 3.6000135888028266, + "grad_norm": 0.20287591218948364, + "learning_rate": 5.5020213344204375e-05, + "loss": 3.8188, + "step": 52985 + }, + { + "epoch": 3.6003533088734883, + "grad_norm": 0.2245674580335617, + "learning_rate": 5.501596684332111e-05, + "loss": 3.8995, + "step": 52990 + }, + { + "epoch": 3.60069302894415, + "grad_norm": 0.2188260406255722, + "learning_rate": 5.501172034243783e-05, + "loss": 4.2828, + "step": 52995 + }, + { + "epoch": 3.601032749014812, + "grad_norm": 0.14717519283294678, + "learning_rate": 5.500747384155456e-05, + "loss": 3.7433, + "step": 53000 + }, + { + "epoch": 3.6013724690854736, + "grad_norm": 0.1633640080690384, + "learning_rate": 5.5003227340671294e-05, + "loss": 3.9486, + "step": 53005 + }, + { + "epoch": 3.6017121891561352, + "grad_norm": 0.24562521278858185, + "learning_rate": 5.4998980839788015e-05, + "loss": 3.7993, + "step": 53010 + }, + { + "epoch": 3.6020519092267973, + "grad_norm": 0.1465391367673874, + "learning_rate": 5.499473433890474e-05, + "loss": 3.8011, + "step": 53015 + }, + { + "epoch": 3.602391629297459, + "grad_norm": 0.1750236302614212, + "learning_rate": 5.499048783802148e-05, + "loss": 3.7883, + "step": 53020 + }, + { + "epoch": 3.6027313493681206, + "grad_norm": 0.22068630158901215, + "learning_rate": 5.49862413371382e-05, + "loss": 3.6765, + "step": 53025 + }, + { + "epoch": 3.6030710694387826, + "grad_norm": 0.16479621827602386, + "learning_rate": 5.498199483625493e-05, + "loss": 3.7919, + "step": 53030 + }, + { + "epoch": 3.6034107895094443, + "grad_norm": 0.1517455130815506, + "learning_rate": 5.497774833537166e-05, + "loss": 3.7732, + "step": 53035 + }, + { + "epoch": 3.603750509580106, + "grad_norm": 0.17006687819957733, + "learning_rate": 5.497350183448838e-05, + "loss": 3.7365, + "step": 53040 + }, + { + "epoch": 3.604090229650768, + "grad_norm": 0.1899043768644333, + "learning_rate": 5.4969255333605105e-05, + "loss": 3.8088, + "step": 53045 + }, + { + "epoch": 3.6044299497214296, + "grad_norm": 0.2220163494348526, + "learning_rate": 5.496500883272184e-05, + "loss": 3.7445, + "step": 53050 + }, + { + "epoch": 3.6047696697920912, + "grad_norm": 0.1409776359796524, + "learning_rate": 5.496076233183857e-05, + "loss": 3.8093, + "step": 53055 + }, + { + "epoch": 3.6051093898627533, + "grad_norm": 0.15579648315906525, + "learning_rate": 5.495651583095529e-05, + "loss": 3.9541, + "step": 53060 + }, + { + "epoch": 3.605449109933415, + "grad_norm": 0.19357378780841827, + "learning_rate": 5.4952269330072023e-05, + "loss": 3.8678, + "step": 53065 + }, + { + "epoch": 3.6057888300040766, + "grad_norm": 0.13277387619018555, + "learning_rate": 5.494802282918875e-05, + "loss": 3.8712, + "step": 53070 + }, + { + "epoch": 3.6061285500747386, + "grad_norm": 0.18622173368930817, + "learning_rate": 5.494377632830547e-05, + "loss": 3.7588, + "step": 53075 + }, + { + "epoch": 3.6064682701454003, + "grad_norm": 0.17674000561237335, + "learning_rate": 5.493952982742221e-05, + "loss": 3.9016, + "step": 53080 + }, + { + "epoch": 3.606807990216062, + "grad_norm": 0.19338220357894897, + "learning_rate": 5.4935283326538935e-05, + "loss": 3.5578, + "step": 53085 + }, + { + "epoch": 3.607147710286724, + "grad_norm": 0.18867281079292297, + "learning_rate": 5.493103682565566e-05, + "loss": 3.76, + "step": 53090 + }, + { + "epoch": 3.6074874303573856, + "grad_norm": 0.1716698706150055, + "learning_rate": 5.492679032477239e-05, + "loss": 3.7822, + "step": 53095 + }, + { + "epoch": 3.6078271504280472, + "grad_norm": 0.4901680052280426, + "learning_rate": 5.492254382388912e-05, + "loss": 3.8911, + "step": 53100 + }, + { + "epoch": 3.6081668704987093, + "grad_norm": 0.13680079579353333, + "learning_rate": 5.491829732300584e-05, + "loss": 3.9677, + "step": 53105 + }, + { + "epoch": 3.608506590569371, + "grad_norm": 0.17386099696159363, + "learning_rate": 5.4914050822122575e-05, + "loss": 3.6212, + "step": 53110 + }, + { + "epoch": 3.6088463106400326, + "grad_norm": 0.16124454140663147, + "learning_rate": 5.49098043212393e-05, + "loss": 3.8436, + "step": 53115 + }, + { + "epoch": 3.6091860307106947, + "grad_norm": 0.35365647077560425, + "learning_rate": 5.4905557820356025e-05, + "loss": 3.8636, + "step": 53120 + }, + { + "epoch": 3.6095257507813563, + "grad_norm": 0.19149108231067657, + "learning_rate": 5.490131131947276e-05, + "loss": 4.0008, + "step": 53125 + }, + { + "epoch": 3.609865470852018, + "grad_norm": 0.3435831367969513, + "learning_rate": 5.489706481858948e-05, + "loss": 3.9574, + "step": 53130 + }, + { + "epoch": 3.6102051909226796, + "grad_norm": 0.19927749037742615, + "learning_rate": 5.489281831770621e-05, + "loss": 4.0597, + "step": 53135 + }, + { + "epoch": 3.6105449109933416, + "grad_norm": 0.1535813808441162, + "learning_rate": 5.4888571816822944e-05, + "loss": 3.6481, + "step": 53140 + }, + { + "epoch": 3.6108846310640033, + "grad_norm": 0.17694830894470215, + "learning_rate": 5.4884325315939665e-05, + "loss": 3.868, + "step": 53145 + }, + { + "epoch": 3.611224351134665, + "grad_norm": 0.16773131489753723, + "learning_rate": 5.48800788150564e-05, + "loss": 3.9708, + "step": 53150 + }, + { + "epoch": 3.611564071205327, + "grad_norm": 0.14397118985652924, + "learning_rate": 5.487583231417313e-05, + "loss": 3.8624, + "step": 53155 + }, + { + "epoch": 3.6119037912759886, + "grad_norm": 0.1881939023733139, + "learning_rate": 5.487158581328985e-05, + "loss": 3.7471, + "step": 53160 + }, + { + "epoch": 3.6122435113466502, + "grad_norm": 0.21574093401432037, + "learning_rate": 5.4867339312406584e-05, + "loss": 3.9779, + "step": 53165 + }, + { + "epoch": 3.612583231417312, + "grad_norm": 0.15244489908218384, + "learning_rate": 5.486309281152331e-05, + "loss": 3.7255, + "step": 53170 + }, + { + "epoch": 3.612922951487974, + "grad_norm": 0.3308011293411255, + "learning_rate": 5.485884631064003e-05, + "loss": 3.8279, + "step": 53175 + }, + { + "epoch": 3.6132626715586356, + "grad_norm": 0.23461315035820007, + "learning_rate": 5.485459980975677e-05, + "loss": 3.7463, + "step": 53180 + }, + { + "epoch": 3.613602391629297, + "grad_norm": 0.2006291300058365, + "learning_rate": 5.4850353308873496e-05, + "loss": 3.8011, + "step": 53185 + }, + { + "epoch": 3.6139421116999593, + "grad_norm": 15.722777366638184, + "learning_rate": 5.484610680799022e-05, + "loss": 3.9429, + "step": 53190 + }, + { + "epoch": 3.614281831770621, + "grad_norm": 0.22877506911754608, + "learning_rate": 5.484186030710695e-05, + "loss": 3.879, + "step": 53195 + }, + { + "epoch": 3.6146215518412825, + "grad_norm": 0.18687395751476288, + "learning_rate": 5.483761380622367e-05, + "loss": 3.8564, + "step": 53200 + }, + { + "epoch": 3.6149612719119446, + "grad_norm": 0.18945661187171936, + "learning_rate": 5.48333673053404e-05, + "loss": 3.7359, + "step": 53205 + }, + { + "epoch": 3.6153009919826062, + "grad_norm": 0.16470825672149658, + "learning_rate": 5.4829120804457136e-05, + "loss": 4.0506, + "step": 53210 + }, + { + "epoch": 3.615640712053268, + "grad_norm": 0.21996566653251648, + "learning_rate": 5.482487430357386e-05, + "loss": 4.0589, + "step": 53215 + }, + { + "epoch": 3.61598043212393, + "grad_norm": 0.15825536847114563, + "learning_rate": 5.4820627802690585e-05, + "loss": 3.904, + "step": 53220 + }, + { + "epoch": 3.6163201521945916, + "grad_norm": 0.16995446383953094, + "learning_rate": 5.481638130180732e-05, + "loss": 3.6963, + "step": 53225 + }, + { + "epoch": 3.616659872265253, + "grad_norm": 0.1461051106452942, + "learning_rate": 5.481213480092404e-05, + "loss": 3.8151, + "step": 53230 + }, + { + "epoch": 3.6169995923359153, + "grad_norm": 0.1739353984594345, + "learning_rate": 5.480788830004077e-05, + "loss": 3.8829, + "step": 53235 + }, + { + "epoch": 3.617339312406577, + "grad_norm": 0.18751443922519684, + "learning_rate": 5.4803641799157504e-05, + "loss": 4.0659, + "step": 53240 + }, + { + "epoch": 3.6176790324772385, + "grad_norm": 0.18842779099941254, + "learning_rate": 5.4799395298274225e-05, + "loss": 3.8605, + "step": 53245 + }, + { + "epoch": 3.6180187525479006, + "grad_norm": 2.9120399951934814, + "learning_rate": 5.4795148797390946e-05, + "loss": 3.8503, + "step": 53250 + }, + { + "epoch": 3.6183584726185622, + "grad_norm": 0.1973992884159088, + "learning_rate": 5.479090229650769e-05, + "loss": 3.7809, + "step": 53255 + }, + { + "epoch": 3.618698192689224, + "grad_norm": 0.17836697399616241, + "learning_rate": 5.478665579562441e-05, + "loss": 3.7105, + "step": 53260 + }, + { + "epoch": 3.619037912759886, + "grad_norm": 0.20385819673538208, + "learning_rate": 5.478240929474113e-05, + "loss": 3.7424, + "step": 53265 + }, + { + "epoch": 3.6193776328305476, + "grad_norm": 0.20713219046592712, + "learning_rate": 5.4778162793857865e-05, + "loss": 3.6583, + "step": 53270 + }, + { + "epoch": 3.619717352901209, + "grad_norm": 0.13691633939743042, + "learning_rate": 5.477391629297459e-05, + "loss": 3.8125, + "step": 53275 + }, + { + "epoch": 3.6200570729718713, + "grad_norm": 0.17771436274051666, + "learning_rate": 5.4769669792091314e-05, + "loss": 3.9904, + "step": 53280 + }, + { + "epoch": 3.620396793042533, + "grad_norm": 0.20848064124584198, + "learning_rate": 5.476542329120805e-05, + "loss": 3.7939, + "step": 53285 + }, + { + "epoch": 3.6207365131131946, + "grad_norm": 0.17830125987529755, + "learning_rate": 5.476117679032478e-05, + "loss": 3.8569, + "step": 53290 + }, + { + "epoch": 3.6210762331838566, + "grad_norm": 0.41695114970207214, + "learning_rate": 5.47569302894415e-05, + "loss": 3.8201, + "step": 53295 + }, + { + "epoch": 3.6214159532545183, + "grad_norm": 0.21204999089241028, + "learning_rate": 5.475268378855823e-05, + "loss": 3.8755, + "step": 53300 + }, + { + "epoch": 3.62175567332518, + "grad_norm": 0.7295950651168823, + "learning_rate": 5.474843728767496e-05, + "loss": 3.8719, + "step": 53305 + }, + { + "epoch": 3.622095393395842, + "grad_norm": 0.557945966720581, + "learning_rate": 5.474419078679168e-05, + "loss": 3.8476, + "step": 53310 + }, + { + "epoch": 3.6224351134665036, + "grad_norm": 0.16306234896183014, + "learning_rate": 5.473994428590842e-05, + "loss": 3.8794, + "step": 53315 + }, + { + "epoch": 3.6227748335371652, + "grad_norm": 0.21068337559700012, + "learning_rate": 5.4735697785025145e-05, + "loss": 3.71, + "step": 53320 + }, + { + "epoch": 3.6231145536078273, + "grad_norm": 0.13559205830097198, + "learning_rate": 5.4731451284141866e-05, + "loss": 3.7813, + "step": 53325 + }, + { + "epoch": 3.623454273678489, + "grad_norm": 0.20322847366333008, + "learning_rate": 5.47272047832586e-05, + "loss": 3.9037, + "step": 53330 + }, + { + "epoch": 3.6237939937491506, + "grad_norm": 0.16416633129119873, + "learning_rate": 5.472295828237532e-05, + "loss": 3.9309, + "step": 53335 + }, + { + "epoch": 3.6241337138198126, + "grad_norm": 0.17706198990345, + "learning_rate": 5.471871178149205e-05, + "loss": 3.9995, + "step": 53340 + }, + { + "epoch": 3.6244734338904743, + "grad_norm": 0.21690092980861664, + "learning_rate": 5.4714465280608785e-05, + "loss": 3.6835, + "step": 53345 + }, + { + "epoch": 3.624813153961136, + "grad_norm": 0.18041233718395233, + "learning_rate": 5.4710218779725506e-05, + "loss": 3.7465, + "step": 53350 + }, + { + "epoch": 3.625152874031798, + "grad_norm": 0.1776193380355835, + "learning_rate": 5.4705972278842234e-05, + "loss": 3.9077, + "step": 53355 + }, + { + "epoch": 3.6254925941024596, + "grad_norm": 0.2704446315765381, + "learning_rate": 5.470172577795897e-05, + "loss": 4.1097, + "step": 53360 + }, + { + "epoch": 3.6258323141731212, + "grad_norm": 0.14352895319461823, + "learning_rate": 5.469747927707569e-05, + "loss": 3.7788, + "step": 53365 + }, + { + "epoch": 3.6261720342437833, + "grad_norm": 0.13832662999629974, + "learning_rate": 5.469323277619242e-05, + "loss": 3.9636, + "step": 53370 + }, + { + "epoch": 3.626511754314445, + "grad_norm": 0.1621926724910736, + "learning_rate": 5.468898627530915e-05, + "loss": 3.7501, + "step": 53375 + }, + { + "epoch": 3.6268514743851066, + "grad_norm": 0.2596416175365448, + "learning_rate": 5.4684739774425874e-05, + "loss": 4.0276, + "step": 53380 + }, + { + "epoch": 3.6271911944557687, + "grad_norm": 0.16022711992263794, + "learning_rate": 5.46804932735426e-05, + "loss": 3.9194, + "step": 53385 + }, + { + "epoch": 3.6275309145264303, + "grad_norm": 0.7027639746665955, + "learning_rate": 5.467624677265934e-05, + "loss": 3.7247, + "step": 53390 + }, + { + "epoch": 3.627870634597092, + "grad_norm": 0.1851588934659958, + "learning_rate": 5.467200027177606e-05, + "loss": 3.7975, + "step": 53395 + }, + { + "epoch": 3.628210354667754, + "grad_norm": 0.17818938195705414, + "learning_rate": 5.466775377089278e-05, + "loss": 3.8071, + "step": 53400 + }, + { + "epoch": 3.6285500747384156, + "grad_norm": 0.2020588219165802, + "learning_rate": 5.4663507270009514e-05, + "loss": 3.9111, + "step": 53405 + }, + { + "epoch": 3.6288897948090773, + "grad_norm": 0.17835554480552673, + "learning_rate": 5.465926076912624e-05, + "loss": 4.0566, + "step": 53410 + }, + { + "epoch": 3.6292295148797393, + "grad_norm": 0.15103869140148163, + "learning_rate": 5.4655014268242964e-05, + "loss": 3.7502, + "step": 53415 + }, + { + "epoch": 3.629569234950401, + "grad_norm": 0.17501679062843323, + "learning_rate": 5.46507677673597e-05, + "loss": 3.6411, + "step": 53420 + }, + { + "epoch": 3.6299089550210626, + "grad_norm": 0.17275667190551758, + "learning_rate": 5.4646521266476426e-05, + "loss": 3.9125, + "step": 53425 + }, + { + "epoch": 3.6302486750917247, + "grad_norm": 0.17078359425067902, + "learning_rate": 5.464227476559315e-05, + "loss": 3.8454, + "step": 53430 + }, + { + "epoch": 3.6305883951623863, + "grad_norm": 0.24012914299964905, + "learning_rate": 5.463802826470988e-05, + "loss": 3.9938, + "step": 53435 + }, + { + "epoch": 3.630928115233048, + "grad_norm": 0.17749667167663574, + "learning_rate": 5.463378176382661e-05, + "loss": 3.8879, + "step": 53440 + }, + { + "epoch": 3.63126783530371, + "grad_norm": 0.1613859385251999, + "learning_rate": 5.462953526294333e-05, + "loss": 3.7478, + "step": 53445 + }, + { + "epoch": 3.6316075553743716, + "grad_norm": 0.136374369263649, + "learning_rate": 5.4625288762060066e-05, + "loss": 3.8163, + "step": 53450 + }, + { + "epoch": 3.6319472754450333, + "grad_norm": 0.13820502161979675, + "learning_rate": 5.4621042261176794e-05, + "loss": 3.951, + "step": 53455 + }, + { + "epoch": 3.6322869955156953, + "grad_norm": 0.19987478852272034, + "learning_rate": 5.4616795760293516e-05, + "loss": 3.5371, + "step": 53460 + }, + { + "epoch": 3.632626715586357, + "grad_norm": 0.17264708876609802, + "learning_rate": 5.461254925941025e-05, + "loss": 3.9013, + "step": 53465 + }, + { + "epoch": 3.6329664356570186, + "grad_norm": 0.15404009819030762, + "learning_rate": 5.460830275852697e-05, + "loss": 3.7849, + "step": 53470 + }, + { + "epoch": 3.6333061557276802, + "grad_norm": 0.17730286717414856, + "learning_rate": 5.46040562576437e-05, + "loss": 3.8502, + "step": 53475 + }, + { + "epoch": 3.6336458757983423, + "grad_norm": 3.03957200050354, + "learning_rate": 5.4599809756760434e-05, + "loss": 4.0624, + "step": 53480 + }, + { + "epoch": 3.633985595869004, + "grad_norm": 0.20740418136119843, + "learning_rate": 5.4595563255877156e-05, + "loss": 3.8505, + "step": 53485 + }, + { + "epoch": 3.6343253159396656, + "grad_norm": 0.20924700796604156, + "learning_rate": 5.459131675499389e-05, + "loss": 3.8346, + "step": 53490 + }, + { + "epoch": 3.6346650360103276, + "grad_norm": 3.1965620517730713, + "learning_rate": 5.458707025411062e-05, + "loss": 3.748, + "step": 53495 + }, + { + "epoch": 3.6350047560809893, + "grad_norm": 0.1917291134595871, + "learning_rate": 5.458282375322734e-05, + "loss": 3.886, + "step": 53500 + }, + { + "epoch": 3.635344476151651, + "grad_norm": 0.1695198118686676, + "learning_rate": 5.4578577252344075e-05, + "loss": 3.8704, + "step": 53505 + }, + { + "epoch": 3.635684196222313, + "grad_norm": 0.1654720902442932, + "learning_rate": 5.45743307514608e-05, + "loss": 3.9031, + "step": 53510 + }, + { + "epoch": 3.6360239162929746, + "grad_norm": 0.15513601899147034, + "learning_rate": 5.4570084250577524e-05, + "loss": 3.8157, + "step": 53515 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.14845319092273712, + "learning_rate": 5.456583774969426e-05, + "loss": 4.0089, + "step": 53520 + }, + { + "epoch": 3.636703356434298, + "grad_norm": 0.7912161350250244, + "learning_rate": 5.4561591248810987e-05, + "loss": 3.8136, + "step": 53525 + }, + { + "epoch": 3.63704307650496, + "grad_norm": 0.17868579924106598, + "learning_rate": 5.455734474792771e-05, + "loss": 4.0597, + "step": 53530 + }, + { + "epoch": 3.6373827965756216, + "grad_norm": 0.14451605081558228, + "learning_rate": 5.455309824704444e-05, + "loss": 4.0766, + "step": 53535 + }, + { + "epoch": 3.637722516646283, + "grad_norm": 0.243995800614357, + "learning_rate": 5.4548851746161164e-05, + "loss": 3.8467, + "step": 53540 + }, + { + "epoch": 3.6380622367169453, + "grad_norm": 0.15714585781097412, + "learning_rate": 5.454460524527789e-05, + "loss": 3.8713, + "step": 53545 + }, + { + "epoch": 3.638401956787607, + "grad_norm": 0.18673552572727203, + "learning_rate": 5.4540358744394627e-05, + "loss": 3.8441, + "step": 53550 + }, + { + "epoch": 3.6387416768582685, + "grad_norm": 0.6612139940261841, + "learning_rate": 5.453611224351135e-05, + "loss": 4.0936, + "step": 53555 + }, + { + "epoch": 3.6390813969289306, + "grad_norm": 0.18575239181518555, + "learning_rate": 5.4531865742628076e-05, + "loss": 3.7577, + "step": 53560 + }, + { + "epoch": 3.6394211169995923, + "grad_norm": 0.19134023785591125, + "learning_rate": 5.452761924174481e-05, + "loss": 3.9686, + "step": 53565 + }, + { + "epoch": 3.639760837070254, + "grad_norm": 0.17432920634746552, + "learning_rate": 5.452337274086153e-05, + "loss": 3.7912, + "step": 53570 + }, + { + "epoch": 3.640100557140916, + "grad_norm": 0.17178045213222504, + "learning_rate": 5.451912623997826e-05, + "loss": 4.0742, + "step": 53575 + }, + { + "epoch": 3.6404402772115776, + "grad_norm": 0.1689111888408661, + "learning_rate": 5.4514879739094995e-05, + "loss": 3.9972, + "step": 53580 + }, + { + "epoch": 3.6407799972822392, + "grad_norm": 0.1474733203649521, + "learning_rate": 5.4510633238211716e-05, + "loss": 3.7009, + "step": 53585 + }, + { + "epoch": 3.6411197173529013, + "grad_norm": 0.2106524258852005, + "learning_rate": 5.4506386737328444e-05, + "loss": 4.0183, + "step": 53590 + }, + { + "epoch": 3.641459437423563, + "grad_norm": 0.18541274964809418, + "learning_rate": 5.450214023644518e-05, + "loss": 3.7786, + "step": 53595 + }, + { + "epoch": 3.6417991574942246, + "grad_norm": 0.19837115705013275, + "learning_rate": 5.44978937355619e-05, + "loss": 3.9599, + "step": 53600 + }, + { + "epoch": 3.6421388775648866, + "grad_norm": 0.16361767053604126, + "learning_rate": 5.449364723467862e-05, + "loss": 3.7697, + "step": 53605 + }, + { + "epoch": 3.6424785976355483, + "grad_norm": 0.24250344932079315, + "learning_rate": 5.448940073379536e-05, + "loss": 3.686, + "step": 53610 + }, + { + "epoch": 3.64281831770621, + "grad_norm": 0.19632937014102936, + "learning_rate": 5.4485154232912084e-05, + "loss": 3.8379, + "step": 53615 + }, + { + "epoch": 3.643158037776872, + "grad_norm": 0.16889715194702148, + "learning_rate": 5.4480907732028805e-05, + "loss": 3.8688, + "step": 53620 + }, + { + "epoch": 3.6434977578475336, + "grad_norm": 0.20054301619529724, + "learning_rate": 5.447666123114554e-05, + "loss": 3.9249, + "step": 53625 + }, + { + "epoch": 3.6438374779181952, + "grad_norm": 0.18881727755069733, + "learning_rate": 5.447241473026227e-05, + "loss": 4.11, + "step": 53630 + }, + { + "epoch": 3.6441771979888573, + "grad_norm": 0.19404512643814087, + "learning_rate": 5.446816822937899e-05, + "loss": 3.6579, + "step": 53635 + }, + { + "epoch": 3.644516918059519, + "grad_norm": 0.18992070853710175, + "learning_rate": 5.4463921728495724e-05, + "loss": 3.9774, + "step": 53640 + }, + { + "epoch": 3.6448566381301806, + "grad_norm": 1.1140317916870117, + "learning_rate": 5.445967522761245e-05, + "loss": 3.6992, + "step": 53645 + }, + { + "epoch": 3.6451963582008426, + "grad_norm": 0.1608489751815796, + "learning_rate": 5.445542872672917e-05, + "loss": 3.8321, + "step": 53650 + }, + { + "epoch": 3.6455360782715043, + "grad_norm": 0.1881723403930664, + "learning_rate": 5.445118222584591e-05, + "loss": 3.8157, + "step": 53655 + }, + { + "epoch": 3.645875798342166, + "grad_norm": 0.19664938747882843, + "learning_rate": 5.4446935724962636e-05, + "loss": 3.9063, + "step": 53660 + }, + { + "epoch": 3.646215518412828, + "grad_norm": 0.24842000007629395, + "learning_rate": 5.444268922407936e-05, + "loss": 3.9349, + "step": 53665 + }, + { + "epoch": 3.6465552384834896, + "grad_norm": 0.15980298817157745, + "learning_rate": 5.443844272319609e-05, + "loss": 3.8773, + "step": 53670 + }, + { + "epoch": 3.6468949585541512, + "grad_norm": 0.2729089558124542, + "learning_rate": 5.443419622231281e-05, + "loss": 3.7183, + "step": 53675 + }, + { + "epoch": 3.6472346786248133, + "grad_norm": 0.21230573952198029, + "learning_rate": 5.442994972142954e-05, + "loss": 3.7723, + "step": 53680 + }, + { + "epoch": 3.647574398695475, + "grad_norm": 0.1543203890323639, + "learning_rate": 5.4425703220546276e-05, + "loss": 3.9068, + "step": 53685 + }, + { + "epoch": 3.6479141187661366, + "grad_norm": 0.16679586470127106, + "learning_rate": 5.4421456719663e-05, + "loss": 4.1193, + "step": 53690 + }, + { + "epoch": 3.6482538388367987, + "grad_norm": 0.2216525822877884, + "learning_rate": 5.4417210218779725e-05, + "loss": 3.9356, + "step": 53695 + }, + { + "epoch": 3.6485935589074603, + "grad_norm": 0.14524191617965698, + "learning_rate": 5.441296371789646e-05, + "loss": 3.8096, + "step": 53700 + }, + { + "epoch": 3.648933278978122, + "grad_norm": 0.24722878634929657, + "learning_rate": 5.440871721701318e-05, + "loss": 3.8734, + "step": 53705 + }, + { + "epoch": 3.649272999048784, + "grad_norm": 0.19928143918514252, + "learning_rate": 5.440447071612991e-05, + "loss": 4.1548, + "step": 53710 + }, + { + "epoch": 3.6496127191194456, + "grad_norm": 0.18013446033000946, + "learning_rate": 5.4400224215246644e-05, + "loss": 4.0663, + "step": 53715 + }, + { + "epoch": 3.6499524391901073, + "grad_norm": 0.17797164618968964, + "learning_rate": 5.4395977714363365e-05, + "loss": 3.7921, + "step": 53720 + }, + { + "epoch": 3.6502921592607693, + "grad_norm": 0.36499902606010437, + "learning_rate": 5.439173121348009e-05, + "loss": 3.9141, + "step": 53725 + }, + { + "epoch": 3.650631879331431, + "grad_norm": 0.19872577488422394, + "learning_rate": 5.438748471259683e-05, + "loss": 3.8713, + "step": 53730 + }, + { + "epoch": 3.6509715994020926, + "grad_norm": 0.1635497510433197, + "learning_rate": 5.438323821171355e-05, + "loss": 3.7166, + "step": 53735 + }, + { + "epoch": 3.6513113194727547, + "grad_norm": 0.21929974853992462, + "learning_rate": 5.437899171083027e-05, + "loss": 3.8006, + "step": 53740 + }, + { + "epoch": 3.6516510395434163, + "grad_norm": 0.1998845785856247, + "learning_rate": 5.437474520994701e-05, + "loss": 3.6277, + "step": 53745 + }, + { + "epoch": 3.651990759614078, + "grad_norm": 0.23206093907356262, + "learning_rate": 5.437049870906373e-05, + "loss": 3.7953, + "step": 53750 + }, + { + "epoch": 3.65233047968474, + "grad_norm": 0.1526026874780655, + "learning_rate": 5.4366252208180455e-05, + "loss": 4.0333, + "step": 53755 + }, + { + "epoch": 3.6526701997554016, + "grad_norm": 0.18531298637390137, + "learning_rate": 5.436200570729719e-05, + "loss": 3.8548, + "step": 53760 + }, + { + "epoch": 3.6530099198260633, + "grad_norm": 0.16789911687374115, + "learning_rate": 5.435775920641392e-05, + "loss": 3.8764, + "step": 53765 + }, + { + "epoch": 3.6533496398967253, + "grad_norm": 1.0547538995742798, + "learning_rate": 5.435351270553064e-05, + "loss": 3.6507, + "step": 53770 + }, + { + "epoch": 3.653689359967387, + "grad_norm": 0.16637037694454193, + "learning_rate": 5.4349266204647373e-05, + "loss": 3.9671, + "step": 53775 + }, + { + "epoch": 3.6540290800380486, + "grad_norm": 0.19226260483264923, + "learning_rate": 5.43450197037641e-05, + "loss": 3.913, + "step": 53780 + }, + { + "epoch": 3.6543688001087107, + "grad_norm": 0.1520862728357315, + "learning_rate": 5.434077320288082e-05, + "loss": 3.8899, + "step": 53785 + }, + { + "epoch": 3.6547085201793723, + "grad_norm": 0.16632919013500214, + "learning_rate": 5.433652670199756e-05, + "loss": 3.7832, + "step": 53790 + }, + { + "epoch": 3.655048240250034, + "grad_norm": 0.17284896969795227, + "learning_rate": 5.4332280201114285e-05, + "loss": 3.901, + "step": 53795 + }, + { + "epoch": 3.655387960320696, + "grad_norm": 0.15158723294734955, + "learning_rate": 5.432803370023101e-05, + "loss": 3.9024, + "step": 53800 + }, + { + "epoch": 3.6557276803913576, + "grad_norm": 0.2857232093811035, + "learning_rate": 5.432378719934774e-05, + "loss": 3.9473, + "step": 53805 + }, + { + "epoch": 3.6560674004620193, + "grad_norm": 0.1888241171836853, + "learning_rate": 5.431954069846447e-05, + "loss": 3.6985, + "step": 53810 + }, + { + "epoch": 3.656407120532681, + "grad_norm": 0.15342895686626434, + "learning_rate": 5.431529419758119e-05, + "loss": 3.857, + "step": 53815 + }, + { + "epoch": 3.656746840603343, + "grad_norm": 0.1787101924419403, + "learning_rate": 5.4311047696697925e-05, + "loss": 3.9836, + "step": 53820 + }, + { + "epoch": 3.6570865606740046, + "grad_norm": 0.17252062261104584, + "learning_rate": 5.430680119581465e-05, + "loss": 3.9079, + "step": 53825 + }, + { + "epoch": 3.6574262807446662, + "grad_norm": 0.23086628317832947, + "learning_rate": 5.430255469493138e-05, + "loss": 3.6506, + "step": 53830 + }, + { + "epoch": 3.6577660008153283, + "grad_norm": 0.18231266736984253, + "learning_rate": 5.429830819404811e-05, + "loss": 3.562, + "step": 53835 + }, + { + "epoch": 3.65810572088599, + "grad_norm": 0.22487370669841766, + "learning_rate": 5.429406169316483e-05, + "loss": 3.7904, + "step": 53840 + }, + { + "epoch": 3.6584454409566516, + "grad_norm": 0.20701223611831665, + "learning_rate": 5.4289815192281566e-05, + "loss": 3.7563, + "step": 53845 + }, + { + "epoch": 3.6587851610273137, + "grad_norm": 0.1652178019285202, + "learning_rate": 5.4285568691398294e-05, + "loss": 3.7383, + "step": 53850 + }, + { + "epoch": 3.6591248810979753, + "grad_norm": 0.2523956298828125, + "learning_rate": 5.4281322190515015e-05, + "loss": 4.012, + "step": 53855 + }, + { + "epoch": 3.659464601168637, + "grad_norm": 0.20848678052425385, + "learning_rate": 5.427707568963175e-05, + "loss": 3.8018, + "step": 53860 + }, + { + "epoch": 3.6598043212392986, + "grad_norm": 0.13177768886089325, + "learning_rate": 5.427282918874848e-05, + "loss": 3.6925, + "step": 53865 + }, + { + "epoch": 3.6601440413099606, + "grad_norm": 0.16428637504577637, + "learning_rate": 5.42685826878652e-05, + "loss": 3.8442, + "step": 53870 + }, + { + "epoch": 3.6604837613806223, + "grad_norm": 0.14670051634311676, + "learning_rate": 5.4264336186981934e-05, + "loss": 3.8924, + "step": 53875 + }, + { + "epoch": 3.660823481451284, + "grad_norm": 0.15171413123607635, + "learning_rate": 5.426008968609866e-05, + "loss": 3.7461, + "step": 53880 + }, + { + "epoch": 3.661163201521946, + "grad_norm": 0.1686738282442093, + "learning_rate": 5.425584318521538e-05, + "loss": 3.5849, + "step": 53885 + }, + { + "epoch": 3.6615029215926076, + "grad_norm": 0.15648332238197327, + "learning_rate": 5.425159668433212e-05, + "loss": 3.8854, + "step": 53890 + }, + { + "epoch": 3.6618426416632692, + "grad_norm": 0.17942926287651062, + "learning_rate": 5.424735018344884e-05, + "loss": 3.987, + "step": 53895 + }, + { + "epoch": 3.6621823617339313, + "grad_norm": 0.19281691312789917, + "learning_rate": 5.424310368256557e-05, + "loss": 3.8483, + "step": 53900 + }, + { + "epoch": 3.662522081804593, + "grad_norm": 0.12751764059066772, + "learning_rate": 5.42388571816823e-05, + "loss": 4.0677, + "step": 53905 + }, + { + "epoch": 3.6628618018752546, + "grad_norm": 0.16132669150829315, + "learning_rate": 5.423461068079902e-05, + "loss": 3.7538, + "step": 53910 + }, + { + "epoch": 3.6632015219459166, + "grad_norm": 0.22159145772457123, + "learning_rate": 5.423036417991575e-05, + "loss": 3.7331, + "step": 53915 + }, + { + "epoch": 3.6635412420165783, + "grad_norm": 0.1790739744901657, + "learning_rate": 5.4226117679032486e-05, + "loss": 3.8467, + "step": 53920 + }, + { + "epoch": 3.66388096208724, + "grad_norm": 0.1658005267381668, + "learning_rate": 5.422187117814921e-05, + "loss": 3.9203, + "step": 53925 + }, + { + "epoch": 3.664220682157902, + "grad_norm": 0.14301788806915283, + "learning_rate": 5.4217624677265935e-05, + "loss": 4.0711, + "step": 53930 + }, + { + "epoch": 3.6645604022285636, + "grad_norm": 0.1631091684103012, + "learning_rate": 5.421337817638267e-05, + "loss": 3.8007, + "step": 53935 + }, + { + "epoch": 3.6649001222992252, + "grad_norm": 0.45039474964141846, + "learning_rate": 5.420913167549939e-05, + "loss": 3.9008, + "step": 53940 + }, + { + "epoch": 3.6652398423698873, + "grad_norm": 0.1951817125082016, + "learning_rate": 5.420488517461612e-05, + "loss": 3.7723, + "step": 53945 + }, + { + "epoch": 3.665579562440549, + "grad_norm": 0.16297541558742523, + "learning_rate": 5.4200638673732854e-05, + "loss": 3.6696, + "step": 53950 + }, + { + "epoch": 3.6659192825112106, + "grad_norm": 0.20179042220115662, + "learning_rate": 5.4196392172849575e-05, + "loss": 4.0881, + "step": 53955 + }, + { + "epoch": 3.6662590025818727, + "grad_norm": 0.1476924866437912, + "learning_rate": 5.4192145671966296e-05, + "loss": 3.8461, + "step": 53960 + }, + { + "epoch": 3.6665987226525343, + "grad_norm": 0.18204739689826965, + "learning_rate": 5.418789917108303e-05, + "loss": 4.0072, + "step": 53965 + }, + { + "epoch": 3.666938442723196, + "grad_norm": 0.22443489730358124, + "learning_rate": 5.418365267019976e-05, + "loss": 3.94, + "step": 53970 + }, + { + "epoch": 3.667278162793858, + "grad_norm": 0.18618525564670563, + "learning_rate": 5.417940616931648e-05, + "loss": 4.2005, + "step": 53975 + }, + { + "epoch": 3.6676178828645196, + "grad_norm": 0.1639946550130844, + "learning_rate": 5.4175159668433215e-05, + "loss": 4.0653, + "step": 53980 + }, + { + "epoch": 3.6679576029351812, + "grad_norm": 0.18391722440719604, + "learning_rate": 5.417091316754994e-05, + "loss": 3.9665, + "step": 53985 + }, + { + "epoch": 3.6682973230058433, + "grad_norm": 0.3020731210708618, + "learning_rate": 5.4166666666666664e-05, + "loss": 3.6472, + "step": 53990 + }, + { + "epoch": 3.668637043076505, + "grad_norm": 0.15434810519218445, + "learning_rate": 5.41624201657834e-05, + "loss": 3.8707, + "step": 53995 + }, + { + "epoch": 3.6689767631471666, + "grad_norm": 0.17081181704998016, + "learning_rate": 5.415817366490013e-05, + "loss": 3.8847, + "step": 54000 + }, + { + "epoch": 3.6693164832178287, + "grad_norm": 0.2401028871536255, + "learning_rate": 5.415392716401685e-05, + "loss": 3.7387, + "step": 54005 + }, + { + "epoch": 3.6696562032884903, + "grad_norm": 0.26890116930007935, + "learning_rate": 5.414968066313358e-05, + "loss": 3.8434, + "step": 54010 + }, + { + "epoch": 3.669995923359152, + "grad_norm": 0.15471559762954712, + "learning_rate": 5.414543416225031e-05, + "loss": 3.9458, + "step": 54015 + }, + { + "epoch": 3.670335643429814, + "grad_norm": 0.1434662789106369, + "learning_rate": 5.414118766136703e-05, + "loss": 3.7612, + "step": 54020 + }, + { + "epoch": 3.6706753635004756, + "grad_norm": 0.1713419109582901, + "learning_rate": 5.413694116048377e-05, + "loss": 3.7913, + "step": 54025 + }, + { + "epoch": 3.6710150835711373, + "grad_norm": 0.18837794661521912, + "learning_rate": 5.413269465960049e-05, + "loss": 3.9253, + "step": 54030 + }, + { + "epoch": 3.6713548036417993, + "grad_norm": 0.15133683383464813, + "learning_rate": 5.4128448158717216e-05, + "loss": 3.8682, + "step": 54035 + }, + { + "epoch": 3.671694523712461, + "grad_norm": 0.16795842349529266, + "learning_rate": 5.412420165783395e-05, + "loss": 4.0056, + "step": 54040 + }, + { + "epoch": 3.6720342437831226, + "grad_norm": 0.17439600825309753, + "learning_rate": 5.411995515695067e-05, + "loss": 3.9842, + "step": 54045 + }, + { + "epoch": 3.6723739638537847, + "grad_norm": 0.14478695392608643, + "learning_rate": 5.41157086560674e-05, + "loss": 3.7701, + "step": 54050 + }, + { + "epoch": 3.6727136839244463, + "grad_norm": 0.31622782349586487, + "learning_rate": 5.4111462155184135e-05, + "loss": 3.942, + "step": 54055 + }, + { + "epoch": 3.673053403995108, + "grad_norm": 0.19608189165592194, + "learning_rate": 5.4107215654300856e-05, + "loss": 3.9635, + "step": 54060 + }, + { + "epoch": 3.67339312406577, + "grad_norm": 0.2104063481092453, + "learning_rate": 5.4102969153417584e-05, + "loss": 3.9855, + "step": 54065 + }, + { + "epoch": 3.6737328441364316, + "grad_norm": 0.30416738986968994, + "learning_rate": 5.409872265253432e-05, + "loss": 3.9665, + "step": 54070 + }, + { + "epoch": 3.6740725642070933, + "grad_norm": 0.1552334576845169, + "learning_rate": 5.409447615165104e-05, + "loss": 4.1119, + "step": 54075 + }, + { + "epoch": 3.6744122842777553, + "grad_norm": 0.15313346683979034, + "learning_rate": 5.409022965076777e-05, + "loss": 3.9659, + "step": 54080 + }, + { + "epoch": 3.674752004348417, + "grad_norm": 0.1928713321685791, + "learning_rate": 5.40859831498845e-05, + "loss": 3.9933, + "step": 54085 + }, + { + "epoch": 3.6750917244190786, + "grad_norm": 0.18001501262187958, + "learning_rate": 5.4081736649001224e-05, + "loss": 3.8447, + "step": 54090 + }, + { + "epoch": 3.6754314444897407, + "grad_norm": 0.17507657408714294, + "learning_rate": 5.4077490148117946e-05, + "loss": 3.6862, + "step": 54095 + }, + { + "epoch": 3.6757711645604023, + "grad_norm": 0.16668660938739777, + "learning_rate": 5.407324364723468e-05, + "loss": 4.0734, + "step": 54100 + }, + { + "epoch": 3.676110884631064, + "grad_norm": 1.6766818761825562, + "learning_rate": 5.406899714635141e-05, + "loss": 3.6409, + "step": 54105 + }, + { + "epoch": 3.676450604701726, + "grad_norm": 0.38470107316970825, + "learning_rate": 5.406475064546813e-05, + "loss": 3.8461, + "step": 54110 + }, + { + "epoch": 3.6767903247723877, + "grad_norm": 0.20049867033958435, + "learning_rate": 5.4060504144584864e-05, + "loss": 3.8196, + "step": 54115 + }, + { + "epoch": 3.6771300448430493, + "grad_norm": 0.7227198481559753, + "learning_rate": 5.405625764370159e-05, + "loss": 3.8907, + "step": 54120 + }, + { + "epoch": 3.6774697649137114, + "grad_norm": 0.16650983691215515, + "learning_rate": 5.4052011142818314e-05, + "loss": 3.9321, + "step": 54125 + }, + { + "epoch": 3.677809484984373, + "grad_norm": 0.142271026968956, + "learning_rate": 5.404776464193505e-05, + "loss": 3.7283, + "step": 54130 + }, + { + "epoch": 3.6781492050550346, + "grad_norm": 0.17631122469902039, + "learning_rate": 5.4043518141051776e-05, + "loss": 4.0615, + "step": 54135 + }, + { + "epoch": 3.6784889251256967, + "grad_norm": 0.194586381316185, + "learning_rate": 5.40392716401685e-05, + "loss": 3.913, + "step": 54140 + }, + { + "epoch": 3.6788286451963583, + "grad_norm": 0.24859975278377533, + "learning_rate": 5.403502513928523e-05, + "loss": 4.0435, + "step": 54145 + }, + { + "epoch": 3.67916836526702, + "grad_norm": 0.16335038840770721, + "learning_rate": 5.403077863840196e-05, + "loss": 3.8881, + "step": 54150 + }, + { + "epoch": 3.6795080853376816, + "grad_norm": 0.1834166944026947, + "learning_rate": 5.402653213751868e-05, + "loss": 3.8051, + "step": 54155 + }, + { + "epoch": 3.6798478054083437, + "grad_norm": 0.16382423043251038, + "learning_rate": 5.4022285636635416e-05, + "loss": 3.9165, + "step": 54160 + }, + { + "epoch": 3.6801875254790053, + "grad_norm": 0.23359762132167816, + "learning_rate": 5.401803913575214e-05, + "loss": 3.9661, + "step": 54165 + }, + { + "epoch": 3.680527245549667, + "grad_norm": 0.16781438887119293, + "learning_rate": 5.401379263486888e-05, + "loss": 3.6514, + "step": 54170 + }, + { + "epoch": 3.680866965620329, + "grad_norm": 0.37734290957450867, + "learning_rate": 5.40095461339856e-05, + "loss": 4.1256, + "step": 54175 + }, + { + "epoch": 3.6812066856909906, + "grad_norm": 0.1446843445301056, + "learning_rate": 5.400529963310232e-05, + "loss": 3.7099, + "step": 54180 + }, + { + "epoch": 3.6815464057616523, + "grad_norm": 0.1586170494556427, + "learning_rate": 5.4001053132219056e-05, + "loss": 3.8836, + "step": 54185 + }, + { + "epoch": 3.6818861258323143, + "grad_norm": 0.14854545891284943, + "learning_rate": 5.3996806631335784e-05, + "loss": 3.8674, + "step": 54190 + }, + { + "epoch": 3.682225845902976, + "grad_norm": 0.5733267664909363, + "learning_rate": 5.3992560130452506e-05, + "loss": 3.7908, + "step": 54195 + }, + { + "epoch": 3.6825655659736376, + "grad_norm": 10.82076358795166, + "learning_rate": 5.398831362956924e-05, + "loss": 3.7919, + "step": 54200 + }, + { + "epoch": 3.6829052860442992, + "grad_norm": 0.16535434126853943, + "learning_rate": 5.398406712868597e-05, + "loss": 3.7915, + "step": 54205 + }, + { + "epoch": 3.6832450061149613, + "grad_norm": 0.31783604621887207, + "learning_rate": 5.397982062780269e-05, + "loss": 3.9033, + "step": 54210 + }, + { + "epoch": 3.683584726185623, + "grad_norm": 0.14836977422237396, + "learning_rate": 5.3975574126919425e-05, + "loss": 3.8041, + "step": 54215 + }, + { + "epoch": 3.6839244462562846, + "grad_norm": 0.14449314773082733, + "learning_rate": 5.397132762603615e-05, + "loss": 3.6857, + "step": 54220 + }, + { + "epoch": 3.6842641663269466, + "grad_norm": 1.7662967443466187, + "learning_rate": 5.3967081125152874e-05, + "loss": 3.8978, + "step": 54225 + }, + { + "epoch": 3.6846038863976083, + "grad_norm": 0.20856428146362305, + "learning_rate": 5.396283462426961e-05, + "loss": 3.8834, + "step": 54230 + }, + { + "epoch": 3.68494360646827, + "grad_norm": 0.47596248984336853, + "learning_rate": 5.3958588123386337e-05, + "loss": 3.9323, + "step": 54235 + }, + { + "epoch": 3.685283326538932, + "grad_norm": 0.16372434794902802, + "learning_rate": 5.395434162250306e-05, + "loss": 3.6551, + "step": 54240 + }, + { + "epoch": 3.6856230466095936, + "grad_norm": 0.1921422928571701, + "learning_rate": 5.395009512161979e-05, + "loss": 3.9372, + "step": 54245 + }, + { + "epoch": 3.6859627666802552, + "grad_norm": 0.18808916211128235, + "learning_rate": 5.3945848620736514e-05, + "loss": 3.9737, + "step": 54250 + }, + { + "epoch": 3.6863024867509173, + "grad_norm": 0.1841004490852356, + "learning_rate": 5.394160211985324e-05, + "loss": 4.0359, + "step": 54255 + }, + { + "epoch": 3.686642206821579, + "grad_norm": 0.18339361250400543, + "learning_rate": 5.3937355618969977e-05, + "loss": 3.7263, + "step": 54260 + }, + { + "epoch": 3.6869819268922406, + "grad_norm": 0.27360326051712036, + "learning_rate": 5.39331091180867e-05, + "loss": 3.9377, + "step": 54265 + }, + { + "epoch": 3.6873216469629027, + "grad_norm": 0.13622187077999115, + "learning_rate": 5.3928862617203426e-05, + "loss": 3.8969, + "step": 54270 + }, + { + "epoch": 3.6876613670335643, + "grad_norm": 0.17364594340324402, + "learning_rate": 5.392461611632016e-05, + "loss": 3.8433, + "step": 54275 + }, + { + "epoch": 3.688001087104226, + "grad_norm": 0.13824450969696045, + "learning_rate": 5.392036961543688e-05, + "loss": 3.9918, + "step": 54280 + }, + { + "epoch": 3.688340807174888, + "grad_norm": 0.14133194088935852, + "learning_rate": 5.391612311455361e-05, + "loss": 3.9131, + "step": 54285 + }, + { + "epoch": 3.6886805272455496, + "grad_norm": 0.16144238412380219, + "learning_rate": 5.3911876613670345e-05, + "loss": 3.8191, + "step": 54290 + }, + { + "epoch": 3.6890202473162113, + "grad_norm": 0.1881900131702423, + "learning_rate": 5.3907630112787066e-05, + "loss": 3.7608, + "step": 54295 + }, + { + "epoch": 3.6893599673868733, + "grad_norm": 0.1768079251050949, + "learning_rate": 5.390338361190379e-05, + "loss": 3.772, + "step": 54300 + }, + { + "epoch": 3.689699687457535, + "grad_norm": 0.21079255640506744, + "learning_rate": 5.389913711102053e-05, + "loss": 3.9382, + "step": 54305 + }, + { + "epoch": 3.6900394075281966, + "grad_norm": 0.7654997110366821, + "learning_rate": 5.389489061013725e-05, + "loss": 3.5485, + "step": 54310 + }, + { + "epoch": 3.6903791275988587, + "grad_norm": 0.18216899037361145, + "learning_rate": 5.389064410925397e-05, + "loss": 3.9356, + "step": 54315 + }, + { + "epoch": 3.6907188476695203, + "grad_norm": 0.15291625261306763, + "learning_rate": 5.3886397608370706e-05, + "loss": 3.9824, + "step": 54320 + }, + { + "epoch": 3.691058567740182, + "grad_norm": 0.16544273495674133, + "learning_rate": 5.3882151107487434e-05, + "loss": 3.6761, + "step": 54325 + }, + { + "epoch": 3.691398287810844, + "grad_norm": 1.0025889873504639, + "learning_rate": 5.3877904606604155e-05, + "loss": 3.6687, + "step": 54330 + }, + { + "epoch": 3.6917380078815056, + "grad_norm": 0.18820862472057343, + "learning_rate": 5.387365810572089e-05, + "loss": 3.8077, + "step": 54335 + }, + { + "epoch": 3.6920777279521673, + "grad_norm": 0.16587795317173004, + "learning_rate": 5.386941160483762e-05, + "loss": 3.7105, + "step": 54340 + }, + { + "epoch": 3.6924174480228293, + "grad_norm": 0.15358169376850128, + "learning_rate": 5.386516510395434e-05, + "loss": 3.7947, + "step": 54345 + }, + { + "epoch": 3.692757168093491, + "grad_norm": 0.27041128277778625, + "learning_rate": 5.3860918603071074e-05, + "loss": 4.0474, + "step": 54350 + }, + { + "epoch": 3.6930968881641526, + "grad_norm": 0.41432541608810425, + "learning_rate": 5.38566721021878e-05, + "loss": 4.0703, + "step": 54355 + }, + { + "epoch": 3.6934366082348147, + "grad_norm": 0.15454243123531342, + "learning_rate": 5.385242560130452e-05, + "loss": 3.8329, + "step": 54360 + }, + { + "epoch": 3.6937763283054763, + "grad_norm": 0.158067524433136, + "learning_rate": 5.384817910042126e-05, + "loss": 3.7813, + "step": 54365 + }, + { + "epoch": 3.694116048376138, + "grad_norm": 1.4372774362564087, + "learning_rate": 5.3843932599537986e-05, + "loss": 3.708, + "step": 54370 + }, + { + "epoch": 3.6944557684468, + "grad_norm": 0.31454363465309143, + "learning_rate": 5.383968609865471e-05, + "loss": 3.9468, + "step": 54375 + }, + { + "epoch": 3.6947954885174616, + "grad_norm": 0.7633704543113708, + "learning_rate": 5.383543959777144e-05, + "loss": 3.5532, + "step": 54380 + }, + { + "epoch": 3.6951352085881233, + "grad_norm": 0.23525074124336243, + "learning_rate": 5.383119309688816e-05, + "loss": 3.7336, + "step": 54385 + }, + { + "epoch": 3.6954749286587854, + "grad_norm": 0.17696428298950195, + "learning_rate": 5.382694659600489e-05, + "loss": 3.9441, + "step": 54390 + }, + { + "epoch": 3.695814648729447, + "grad_norm": 0.2084655910730362, + "learning_rate": 5.3822700095121626e-05, + "loss": 3.6948, + "step": 54395 + }, + { + "epoch": 3.6961543688001086, + "grad_norm": 0.21950455009937286, + "learning_rate": 5.381845359423835e-05, + "loss": 3.8607, + "step": 54400 + }, + { + "epoch": 3.6964940888707707, + "grad_norm": 0.16001754999160767, + "learning_rate": 5.3814207093355075e-05, + "loss": 4.0364, + "step": 54405 + }, + { + "epoch": 3.6968338089414323, + "grad_norm": 0.157990962266922, + "learning_rate": 5.380996059247181e-05, + "loss": 3.9527, + "step": 54410 + }, + { + "epoch": 3.697173529012094, + "grad_norm": 0.23826847970485687, + "learning_rate": 5.380571409158853e-05, + "loss": 3.9979, + "step": 54415 + }, + { + "epoch": 3.697513249082756, + "grad_norm": 0.5521834492683411, + "learning_rate": 5.380146759070526e-05, + "loss": 3.8831, + "step": 54420 + }, + { + "epoch": 3.6978529691534177, + "grad_norm": 0.18277256190776825, + "learning_rate": 5.3797221089821994e-05, + "loss": 3.7679, + "step": 54425 + }, + { + "epoch": 3.6981926892240793, + "grad_norm": 0.20550104975700378, + "learning_rate": 5.3792974588938715e-05, + "loss": 3.8826, + "step": 54430 + }, + { + "epoch": 3.6985324092947414, + "grad_norm": 0.18162012100219727, + "learning_rate": 5.378872808805544e-05, + "loss": 3.9602, + "step": 54435 + }, + { + "epoch": 3.698872129365403, + "grad_norm": 0.19590826332569122, + "learning_rate": 5.378448158717218e-05, + "loss": 3.7405, + "step": 54440 + }, + { + "epoch": 3.6992118494360646, + "grad_norm": 0.16547256708145142, + "learning_rate": 5.37802350862889e-05, + "loss": 3.8186, + "step": 54445 + }, + { + "epoch": 3.6995515695067267, + "grad_norm": 0.21119490265846252, + "learning_rate": 5.377598858540562e-05, + "loss": 3.7522, + "step": 54450 + }, + { + "epoch": 3.6998912895773883, + "grad_norm": 0.20394624769687653, + "learning_rate": 5.3771742084522355e-05, + "loss": 3.963, + "step": 54455 + }, + { + "epoch": 3.70023100964805, + "grad_norm": 0.16071276366710663, + "learning_rate": 5.376749558363908e-05, + "loss": 3.8392, + "step": 54460 + }, + { + "epoch": 3.700570729718712, + "grad_norm": 0.181685671210289, + "learning_rate": 5.3763249082755805e-05, + "loss": 3.7027, + "step": 54465 + }, + { + "epoch": 3.7009104497893737, + "grad_norm": 0.15174078941345215, + "learning_rate": 5.375900258187254e-05, + "loss": 3.8262, + "step": 54470 + }, + { + "epoch": 3.7012501698600353, + "grad_norm": 0.13491427898406982, + "learning_rate": 5.375475608098927e-05, + "loss": 3.9257, + "step": 54475 + }, + { + "epoch": 3.7015898899306974, + "grad_norm": 0.19168813526630402, + "learning_rate": 5.375050958010599e-05, + "loss": 3.8527, + "step": 54480 + }, + { + "epoch": 3.701929610001359, + "grad_norm": 3.4407641887664795, + "learning_rate": 5.3746263079222723e-05, + "loss": 3.922, + "step": 54485 + }, + { + "epoch": 3.7022693300720206, + "grad_norm": 0.16923385858535767, + "learning_rate": 5.374201657833945e-05, + "loss": 3.8938, + "step": 54490 + }, + { + "epoch": 3.7026090501426823, + "grad_norm": 0.642430305480957, + "learning_rate": 5.373777007745617e-05, + "loss": 3.799, + "step": 54495 + }, + { + "epoch": 3.7029487702133443, + "grad_norm": 0.15403221547603607, + "learning_rate": 5.373352357657291e-05, + "loss": 3.8448, + "step": 54500 + }, + { + "epoch": 3.703288490284006, + "grad_norm": 0.20942261815071106, + "learning_rate": 5.3729277075689635e-05, + "loss": 3.7557, + "step": 54505 + }, + { + "epoch": 3.7036282103546676, + "grad_norm": 0.22001929581165314, + "learning_rate": 5.372503057480637e-05, + "loss": 3.9972, + "step": 54510 + }, + { + "epoch": 3.7039679304253297, + "grad_norm": 0.16077733039855957, + "learning_rate": 5.372078407392309e-05, + "loss": 3.5117, + "step": 54515 + }, + { + "epoch": 3.7043076504959913, + "grad_norm": 0.2372587025165558, + "learning_rate": 5.371653757303981e-05, + "loss": 3.6797, + "step": 54520 + }, + { + "epoch": 3.704647370566653, + "grad_norm": 0.14904741942882538, + "learning_rate": 5.3712291072156554e-05, + "loss": 4.09, + "step": 54525 + }, + { + "epoch": 3.704987090637315, + "grad_norm": 0.16994987428188324, + "learning_rate": 5.3708044571273275e-05, + "loss": 3.8307, + "step": 54530 + }, + { + "epoch": 3.7053268107079766, + "grad_norm": 0.23290444910526276, + "learning_rate": 5.370379807039e-05, + "loss": 3.9143, + "step": 54535 + }, + { + "epoch": 3.7056665307786383, + "grad_norm": 0.16590899229049683, + "learning_rate": 5.369955156950673e-05, + "loss": 3.9681, + "step": 54540 + }, + { + "epoch": 3.7060062508493, + "grad_norm": 0.1422811895608902, + "learning_rate": 5.369530506862346e-05, + "loss": 3.8572, + "step": 54545 + }, + { + "epoch": 3.706345970919962, + "grad_norm": 0.1856376826763153, + "learning_rate": 5.369105856774018e-05, + "loss": 3.8261, + "step": 54550 + }, + { + "epoch": 3.7066856909906236, + "grad_norm": 0.19114461541175842, + "learning_rate": 5.3686812066856916e-05, + "loss": 3.6983, + "step": 54555 + }, + { + "epoch": 3.7070254110612852, + "grad_norm": 0.22482821345329285, + "learning_rate": 5.3682565565973644e-05, + "loss": 3.8093, + "step": 54560 + }, + { + "epoch": 3.7073651311319473, + "grad_norm": 0.3421097993850708, + "learning_rate": 5.3678319065090365e-05, + "loss": 4.0303, + "step": 54565 + }, + { + "epoch": 3.707704851202609, + "grad_norm": 0.177371546626091, + "learning_rate": 5.36740725642071e-05, + "loss": 3.7458, + "step": 54570 + }, + { + "epoch": 3.7080445712732706, + "grad_norm": 0.15388713777065277, + "learning_rate": 5.366982606332383e-05, + "loss": 3.772, + "step": 54575 + }, + { + "epoch": 3.7083842913439327, + "grad_norm": 0.1646626740694046, + "learning_rate": 5.366557956244055e-05, + "loss": 3.7852, + "step": 54580 + }, + { + "epoch": 3.7087240114145943, + "grad_norm": 0.19151291251182556, + "learning_rate": 5.3661333061557284e-05, + "loss": 3.6274, + "step": 54585 + }, + { + "epoch": 3.709063731485256, + "grad_norm": 0.20556052029132843, + "learning_rate": 5.3657086560674005e-05, + "loss": 4.0317, + "step": 54590 + }, + { + "epoch": 3.709403451555918, + "grad_norm": 0.15782852470874786, + "learning_rate": 5.365284005979073e-05, + "loss": 3.8405, + "step": 54595 + }, + { + "epoch": 3.7097431716265796, + "grad_norm": 0.16837243735790253, + "learning_rate": 5.364859355890747e-05, + "loss": 3.7601, + "step": 54600 + }, + { + "epoch": 3.7100828916972413, + "grad_norm": 2.33550763130188, + "learning_rate": 5.364434705802419e-05, + "loss": 3.8755, + "step": 54605 + }, + { + "epoch": 3.7104226117679033, + "grad_norm": 0.18532146513462067, + "learning_rate": 5.364010055714092e-05, + "loss": 3.7552, + "step": 54610 + }, + { + "epoch": 3.710762331838565, + "grad_norm": 0.19522055983543396, + "learning_rate": 5.363585405625765e-05, + "loss": 3.5703, + "step": 54615 + }, + { + "epoch": 3.7111020519092266, + "grad_norm": 0.15527057647705078, + "learning_rate": 5.363160755537437e-05, + "loss": 3.6809, + "step": 54620 + }, + { + "epoch": 3.7114417719798887, + "grad_norm": 0.31311002373695374, + "learning_rate": 5.36273610544911e-05, + "loss": 3.8695, + "step": 54625 + }, + { + "epoch": 3.7117814920505503, + "grad_norm": 0.18923743069171906, + "learning_rate": 5.3623114553607836e-05, + "loss": 4.1208, + "step": 54630 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.1540749967098236, + "learning_rate": 5.361886805272456e-05, + "loss": 3.9279, + "step": 54635 + }, + { + "epoch": 3.712460932191874, + "grad_norm": 0.1669609099626541, + "learning_rate": 5.3614621551841285e-05, + "loss": 3.7992, + "step": 54640 + }, + { + "epoch": 3.7128006522625356, + "grad_norm": 0.1765584945678711, + "learning_rate": 5.361037505095802e-05, + "loss": 3.835, + "step": 54645 + }, + { + "epoch": 3.7131403723331973, + "grad_norm": 0.2195112556219101, + "learning_rate": 5.360612855007474e-05, + "loss": 3.8042, + "step": 54650 + }, + { + "epoch": 3.7134800924038593, + "grad_norm": 0.2149093896150589, + "learning_rate": 5.360188204919146e-05, + "loss": 3.8255, + "step": 54655 + }, + { + "epoch": 3.713819812474521, + "grad_norm": 0.1463499665260315, + "learning_rate": 5.3597635548308204e-05, + "loss": 3.7519, + "step": 54660 + }, + { + "epoch": 3.7141595325451826, + "grad_norm": 0.5023859739303589, + "learning_rate": 5.3593389047424925e-05, + "loss": 3.795, + "step": 54665 + }, + { + "epoch": 3.7144992526158447, + "grad_norm": 0.16322174668312073, + "learning_rate": 5.3589142546541646e-05, + "loss": 3.9008, + "step": 54670 + }, + { + "epoch": 3.7148389726865063, + "grad_norm": 0.18559323251247406, + "learning_rate": 5.358489604565838e-05, + "loss": 3.9011, + "step": 54675 + }, + { + "epoch": 3.715178692757168, + "grad_norm": 0.28513669967651367, + "learning_rate": 5.358064954477511e-05, + "loss": 3.6976, + "step": 54680 + }, + { + "epoch": 3.71551841282783, + "grad_norm": 0.21119125187397003, + "learning_rate": 5.357640304389183e-05, + "loss": 3.9212, + "step": 54685 + }, + { + "epoch": 3.7158581328984917, + "grad_norm": 0.19121411442756653, + "learning_rate": 5.3572156543008565e-05, + "loss": 3.9879, + "step": 54690 + }, + { + "epoch": 3.7161978529691533, + "grad_norm": 0.2450162023305893, + "learning_rate": 5.356791004212529e-05, + "loss": 3.8479, + "step": 54695 + }, + { + "epoch": 3.7165375730398154, + "grad_norm": 0.1755053699016571, + "learning_rate": 5.3563663541242014e-05, + "loss": 3.9491, + "step": 54700 + }, + { + "epoch": 3.716877293110477, + "grad_norm": 0.19460994005203247, + "learning_rate": 5.355941704035875e-05, + "loss": 3.9738, + "step": 54705 + }, + { + "epoch": 3.7172170131811386, + "grad_norm": 0.18441598117351532, + "learning_rate": 5.355517053947548e-05, + "loss": 3.8481, + "step": 54710 + }, + { + "epoch": 3.7175567332518007, + "grad_norm": 0.15110288560390472, + "learning_rate": 5.35509240385922e-05, + "loss": 3.8647, + "step": 54715 + }, + { + "epoch": 3.7178964533224623, + "grad_norm": 0.1639733761548996, + "learning_rate": 5.354667753770893e-05, + "loss": 3.7237, + "step": 54720 + }, + { + "epoch": 3.718236173393124, + "grad_norm": 0.22737644612789154, + "learning_rate": 5.354243103682566e-05, + "loss": 3.7196, + "step": 54725 + }, + { + "epoch": 3.718575893463786, + "grad_norm": 0.16324110329151154, + "learning_rate": 5.353818453594238e-05, + "loss": 3.8895, + "step": 54730 + }, + { + "epoch": 3.7189156135344477, + "grad_norm": 0.1527376025915146, + "learning_rate": 5.353393803505912e-05, + "loss": 3.6967, + "step": 54735 + }, + { + "epoch": 3.7192553336051093, + "grad_norm": 0.19493798911571503, + "learning_rate": 5.352969153417584e-05, + "loss": 4.0517, + "step": 54740 + }, + { + "epoch": 3.7195950536757714, + "grad_norm": 0.2418801486492157, + "learning_rate": 5.3525445033292566e-05, + "loss": 3.8872, + "step": 54745 + }, + { + "epoch": 3.719934773746433, + "grad_norm": 0.407288134098053, + "learning_rate": 5.35211985324093e-05, + "loss": 3.9734, + "step": 54750 + }, + { + "epoch": 3.7202744938170946, + "grad_norm": 0.2111273854970932, + "learning_rate": 5.351695203152602e-05, + "loss": 3.8975, + "step": 54755 + }, + { + "epoch": 3.7206142138877567, + "grad_norm": 0.13118596374988556, + "learning_rate": 5.351270553064275e-05, + "loss": 3.8041, + "step": 54760 + }, + { + "epoch": 3.7209539339584183, + "grad_norm": 0.13895033299922943, + "learning_rate": 5.3508459029759485e-05, + "loss": 3.778, + "step": 54765 + }, + { + "epoch": 3.72129365402908, + "grad_norm": 0.17260143160820007, + "learning_rate": 5.3504212528876206e-05, + "loss": 3.8907, + "step": 54770 + }, + { + "epoch": 3.721633374099742, + "grad_norm": 0.7838390469551086, + "learning_rate": 5.3499966027992934e-05, + "loss": 3.8368, + "step": 54775 + }, + { + "epoch": 3.7219730941704037, + "grad_norm": 0.19653744995594025, + "learning_rate": 5.349571952710967e-05, + "loss": 4.0089, + "step": 54780 + }, + { + "epoch": 3.7223128142410653, + "grad_norm": 0.15003643929958344, + "learning_rate": 5.349147302622639e-05, + "loss": 3.633, + "step": 54785 + }, + { + "epoch": 3.7226525343117274, + "grad_norm": 0.31457364559173584, + "learning_rate": 5.348722652534311e-05, + "loss": 4.0836, + "step": 54790 + }, + { + "epoch": 3.722992254382389, + "grad_norm": 0.18633118271827698, + "learning_rate": 5.348298002445985e-05, + "loss": 3.8576, + "step": 54795 + }, + { + "epoch": 3.7233319744530506, + "grad_norm": 0.1582077443599701, + "learning_rate": 5.3478733523576574e-05, + "loss": 3.6276, + "step": 54800 + }, + { + "epoch": 3.7236716945237127, + "grad_norm": 0.1651228815317154, + "learning_rate": 5.3474487022693296e-05, + "loss": 3.9046, + "step": 54805 + }, + { + "epoch": 3.7240114145943743, + "grad_norm": 0.17742221057415009, + "learning_rate": 5.347024052181003e-05, + "loss": 3.7422, + "step": 54810 + }, + { + "epoch": 3.724351134665036, + "grad_norm": 0.1641363501548767, + "learning_rate": 5.346599402092676e-05, + "loss": 3.6586, + "step": 54815 + }, + { + "epoch": 3.724690854735698, + "grad_norm": 0.16196759045124054, + "learning_rate": 5.346174752004348e-05, + "loss": 4.0165, + "step": 54820 + }, + { + "epoch": 3.7250305748063597, + "grad_norm": 0.20630352199077606, + "learning_rate": 5.3457501019160214e-05, + "loss": 3.7426, + "step": 54825 + }, + { + "epoch": 3.7253702948770213, + "grad_norm": 0.21857041120529175, + "learning_rate": 5.345325451827694e-05, + "loss": 3.6812, + "step": 54830 + }, + { + "epoch": 3.725710014947683, + "grad_norm": 0.18420086801052094, + "learning_rate": 5.3449008017393664e-05, + "loss": 3.8399, + "step": 54835 + }, + { + "epoch": 3.726049735018345, + "grad_norm": 0.1700654774904251, + "learning_rate": 5.34447615165104e-05, + "loss": 3.7127, + "step": 54840 + }, + { + "epoch": 3.7263894550890067, + "grad_norm": 0.1421424299478531, + "learning_rate": 5.3440515015627126e-05, + "loss": 3.8473, + "step": 54845 + }, + { + "epoch": 3.7267291751596683, + "grad_norm": 0.16328953206539154, + "learning_rate": 5.343626851474386e-05, + "loss": 3.6528, + "step": 54850 + }, + { + "epoch": 3.7270688952303304, + "grad_norm": 0.16983409225940704, + "learning_rate": 5.343202201386058e-05, + "loss": 3.8439, + "step": 54855 + }, + { + "epoch": 3.727408615300992, + "grad_norm": 0.1555110216140747, + "learning_rate": 5.342777551297731e-05, + "loss": 3.9128, + "step": 54860 + }, + { + "epoch": 3.7277483353716536, + "grad_norm": 0.15485262870788574, + "learning_rate": 5.3423529012094045e-05, + "loss": 4.1925, + "step": 54865 + }, + { + "epoch": 3.7280880554423157, + "grad_norm": 0.24680978059768677, + "learning_rate": 5.3419282511210766e-05, + "loss": 3.8945, + "step": 54870 + }, + { + "epoch": 3.7284277755129773, + "grad_norm": 0.16345177590847015, + "learning_rate": 5.341503601032749e-05, + "loss": 3.7637, + "step": 54875 + }, + { + "epoch": 3.728767495583639, + "grad_norm": 0.16417330503463745, + "learning_rate": 5.341078950944422e-05, + "loss": 3.7334, + "step": 54880 + }, + { + "epoch": 3.7291072156543006, + "grad_norm": 0.18936435878276825, + "learning_rate": 5.340654300856095e-05, + "loss": 3.8686, + "step": 54885 + }, + { + "epoch": 3.7294469357249627, + "grad_norm": 0.227220818400383, + "learning_rate": 5.340229650767767e-05, + "loss": 3.9021, + "step": 54890 + }, + { + "epoch": 3.7297866557956243, + "grad_norm": 0.170537069439888, + "learning_rate": 5.3398050006794406e-05, + "loss": 3.9536, + "step": 54895 + }, + { + "epoch": 3.730126375866286, + "grad_norm": 0.2753655016422272, + "learning_rate": 5.3393803505911134e-05, + "loss": 3.8874, + "step": 54900 + }, + { + "epoch": 3.730466095936948, + "grad_norm": 0.43100211024284363, + "learning_rate": 5.3389557005027856e-05, + "loss": 3.8663, + "step": 54905 + }, + { + "epoch": 3.7308058160076096, + "grad_norm": 0.1786329746246338, + "learning_rate": 5.338531050414459e-05, + "loss": 3.8187, + "step": 54910 + }, + { + "epoch": 3.7311455360782713, + "grad_norm": 0.6781576871871948, + "learning_rate": 5.338106400326132e-05, + "loss": 3.8451, + "step": 54915 + }, + { + "epoch": 3.7314852561489333, + "grad_norm": 0.1552000492811203, + "learning_rate": 5.337681750237804e-05, + "loss": 3.7272, + "step": 54920 + }, + { + "epoch": 3.731824976219595, + "grad_norm": 0.43678176403045654, + "learning_rate": 5.3372571001494775e-05, + "loss": 3.8259, + "step": 54925 + }, + { + "epoch": 3.7321646962902566, + "grad_norm": 0.25679171085357666, + "learning_rate": 5.33683245006115e-05, + "loss": 3.7748, + "step": 54930 + }, + { + "epoch": 3.7325044163609187, + "grad_norm": 0.19687364995479584, + "learning_rate": 5.3364077999728224e-05, + "loss": 3.8997, + "step": 54935 + }, + { + "epoch": 3.7328441364315803, + "grad_norm": 0.26240479946136475, + "learning_rate": 5.335983149884496e-05, + "loss": 3.7613, + "step": 54940 + }, + { + "epoch": 3.733183856502242, + "grad_norm": 0.20396898686885834, + "learning_rate": 5.335558499796168e-05, + "loss": 3.6938, + "step": 54945 + }, + { + "epoch": 3.733523576572904, + "grad_norm": 0.17912574112415314, + "learning_rate": 5.335133849707841e-05, + "loss": 3.705, + "step": 54950 + }, + { + "epoch": 3.7338632966435656, + "grad_norm": 0.17284777760505676, + "learning_rate": 5.334709199619514e-05, + "loss": 3.9808, + "step": 54955 + }, + { + "epoch": 3.7342030167142273, + "grad_norm": 0.16964764893054962, + "learning_rate": 5.3342845495311864e-05, + "loss": 3.8879, + "step": 54960 + }, + { + "epoch": 3.7345427367848893, + "grad_norm": 0.3668191134929657, + "learning_rate": 5.333859899442859e-05, + "loss": 3.8715, + "step": 54965 + }, + { + "epoch": 3.734882456855551, + "grad_norm": 0.18343141674995422, + "learning_rate": 5.3334352493545327e-05, + "loss": 3.8024, + "step": 54970 + }, + { + "epoch": 3.7352221769262126, + "grad_norm": 0.17176654934883118, + "learning_rate": 5.333010599266205e-05, + "loss": 4.0783, + "step": 54975 + }, + { + "epoch": 3.7355618969968747, + "grad_norm": 0.18658925592899323, + "learning_rate": 5.3325859491778776e-05, + "loss": 3.9162, + "step": 54980 + }, + { + "epoch": 3.7359016170675363, + "grad_norm": 0.2994347810745239, + "learning_rate": 5.332161299089551e-05, + "loss": 3.9047, + "step": 54985 + }, + { + "epoch": 3.736241337138198, + "grad_norm": 0.1526467353105545, + "learning_rate": 5.331736649001223e-05, + "loss": 3.7518, + "step": 54990 + }, + { + "epoch": 3.73658105720886, + "grad_norm": 0.1704474836587906, + "learning_rate": 5.331311998912896e-05, + "loss": 3.9979, + "step": 54995 + }, + { + "epoch": 3.7369207772795217, + "grad_norm": 0.18266302347183228, + "learning_rate": 5.3308873488245695e-05, + "loss": 3.9044, + "step": 55000 + }, + { + "epoch": 3.7372604973501833, + "grad_norm": 0.15341095626354218, + "learning_rate": 5.3304626987362416e-05, + "loss": 3.724, + "step": 55005 + }, + { + "epoch": 3.7376002174208454, + "grad_norm": 0.1669658124446869, + "learning_rate": 5.330038048647914e-05, + "loss": 3.867, + "step": 55010 + }, + { + "epoch": 3.737939937491507, + "grad_norm": 0.17114798724651337, + "learning_rate": 5.329613398559587e-05, + "loss": 3.9549, + "step": 55015 + }, + { + "epoch": 3.7382796575621686, + "grad_norm": 0.4225921630859375, + "learning_rate": 5.32918874847126e-05, + "loss": 4.0475, + "step": 55020 + }, + { + "epoch": 3.7386193776328307, + "grad_norm": 0.266684353351593, + "learning_rate": 5.328764098382932e-05, + "loss": 3.6634, + "step": 55025 + }, + { + "epoch": 3.7389590977034923, + "grad_norm": 0.20630110800266266, + "learning_rate": 5.3283394482946056e-05, + "loss": 3.8536, + "step": 55030 + }, + { + "epoch": 3.739298817774154, + "grad_norm": 0.2505786120891571, + "learning_rate": 5.3279147982062784e-05, + "loss": 3.7737, + "step": 55035 + }, + { + "epoch": 3.739638537844816, + "grad_norm": 0.2360670417547226, + "learning_rate": 5.3274901481179505e-05, + "loss": 3.9278, + "step": 55040 + }, + { + "epoch": 3.7399782579154777, + "grad_norm": 0.2487296462059021, + "learning_rate": 5.327065498029624e-05, + "loss": 3.6265, + "step": 55045 + }, + { + "epoch": 3.7403179779861393, + "grad_norm": 0.20983529090881348, + "learning_rate": 5.326640847941297e-05, + "loss": 3.6996, + "step": 55050 + }, + { + "epoch": 3.7406576980568014, + "grad_norm": 0.18219152092933655, + "learning_rate": 5.326216197852969e-05, + "loss": 3.878, + "step": 55055 + }, + { + "epoch": 3.740997418127463, + "grad_norm": 0.20400743186473846, + "learning_rate": 5.3257915477646424e-05, + "loss": 3.9267, + "step": 55060 + }, + { + "epoch": 3.7413371381981246, + "grad_norm": 0.15166419744491577, + "learning_rate": 5.325366897676315e-05, + "loss": 3.9068, + "step": 55065 + }, + { + "epoch": 3.7416768582687867, + "grad_norm": 0.2305295169353485, + "learning_rate": 5.324942247587987e-05, + "loss": 3.9937, + "step": 55070 + }, + { + "epoch": 3.7420165783394483, + "grad_norm": 0.15620006620883942, + "learning_rate": 5.324517597499661e-05, + "loss": 4.0943, + "step": 55075 + }, + { + "epoch": 3.74235629841011, + "grad_norm": 0.347829133272171, + "learning_rate": 5.324092947411333e-05, + "loss": 3.6425, + "step": 55080 + }, + { + "epoch": 3.742696018480772, + "grad_norm": 0.16688081622123718, + "learning_rate": 5.323668297323006e-05, + "loss": 3.7067, + "step": 55085 + }, + { + "epoch": 3.7430357385514337, + "grad_norm": 0.24458149075508118, + "learning_rate": 5.323243647234679e-05, + "loss": 3.7644, + "step": 55090 + }, + { + "epoch": 3.7433754586220953, + "grad_norm": 0.17140530049800873, + "learning_rate": 5.322818997146351e-05, + "loss": 3.7231, + "step": 55095 + }, + { + "epoch": 3.7437151786927574, + "grad_norm": 0.47019001841545105, + "learning_rate": 5.322394347058024e-05, + "loss": 3.7548, + "step": 55100 + }, + { + "epoch": 3.744054898763419, + "grad_norm": 0.708949625492096, + "learning_rate": 5.3219696969696976e-05, + "loss": 3.6481, + "step": 55105 + }, + { + "epoch": 3.7443946188340806, + "grad_norm": 0.20329508185386658, + "learning_rate": 5.32154504688137e-05, + "loss": 3.7883, + "step": 55110 + }, + { + "epoch": 3.7447343389047427, + "grad_norm": 0.1841443032026291, + "learning_rate": 5.3211203967930425e-05, + "loss": 3.8819, + "step": 55115 + }, + { + "epoch": 3.7450740589754044, + "grad_norm": 0.14728079736232758, + "learning_rate": 5.320695746704716e-05, + "loss": 3.6028, + "step": 55120 + }, + { + "epoch": 3.745413779046066, + "grad_norm": 0.20763161778450012, + "learning_rate": 5.320271096616388e-05, + "loss": 3.7157, + "step": 55125 + }, + { + "epoch": 3.745753499116728, + "grad_norm": 0.17005477845668793, + "learning_rate": 5.319846446528061e-05, + "loss": 3.9199, + "step": 55130 + }, + { + "epoch": 3.7460932191873897, + "grad_norm": 0.19450800120830536, + "learning_rate": 5.3194217964397344e-05, + "loss": 3.8294, + "step": 55135 + }, + { + "epoch": 3.7464329392580513, + "grad_norm": 0.12303080409765244, + "learning_rate": 5.3189971463514065e-05, + "loss": 3.8203, + "step": 55140 + }, + { + "epoch": 3.7467726593287134, + "grad_norm": 0.16420066356658936, + "learning_rate": 5.3185724962630787e-05, + "loss": 3.5792, + "step": 55145 + }, + { + "epoch": 3.747112379399375, + "grad_norm": 0.16644832491874695, + "learning_rate": 5.318147846174753e-05, + "loss": 3.8639, + "step": 55150 + }, + { + "epoch": 3.7474520994700367, + "grad_norm": 0.1829243302345276, + "learning_rate": 5.317723196086425e-05, + "loss": 3.8821, + "step": 55155 + }, + { + "epoch": 3.7477918195406987, + "grad_norm": 0.21252234280109406, + "learning_rate": 5.317298545998097e-05, + "loss": 3.7747, + "step": 55160 + }, + { + "epoch": 3.7481315396113604, + "grad_norm": 0.1782771646976471, + "learning_rate": 5.3168738959097705e-05, + "loss": 3.9758, + "step": 55165 + }, + { + "epoch": 3.748471259682022, + "grad_norm": 0.29350048303604126, + "learning_rate": 5.316449245821443e-05, + "loss": 3.7073, + "step": 55170 + }, + { + "epoch": 3.7488109797526836, + "grad_norm": 0.17899377644062042, + "learning_rate": 5.3160245957331155e-05, + "loss": 3.8498, + "step": 55175 + }, + { + "epoch": 3.7491506998233457, + "grad_norm": 0.19060856103897095, + "learning_rate": 5.315599945644789e-05, + "loss": 3.9065, + "step": 55180 + }, + { + "epoch": 3.7494904198940073, + "grad_norm": 0.17208032310009003, + "learning_rate": 5.315175295556462e-05, + "loss": 3.7259, + "step": 55185 + }, + { + "epoch": 3.749830139964669, + "grad_norm": 0.17067605257034302, + "learning_rate": 5.314750645468135e-05, + "loss": 3.9228, + "step": 55190 + }, + { + "epoch": 3.750169860035331, + "grad_norm": 0.20528258383274078, + "learning_rate": 5.314325995379807e-05, + "loss": 3.5183, + "step": 55195 + }, + { + "epoch": 3.7505095801059927, + "grad_norm": 0.18072623014450073, + "learning_rate": 5.31390134529148e-05, + "loss": 3.7863, + "step": 55200 + }, + { + "epoch": 3.7508493001766543, + "grad_norm": 0.15364746749401093, + "learning_rate": 5.3134766952031536e-05, + "loss": 3.8516, + "step": 55205 + }, + { + "epoch": 3.7511890202473164, + "grad_norm": 0.21010026335716248, + "learning_rate": 5.313052045114826e-05, + "loss": 3.9045, + "step": 55210 + }, + { + "epoch": 3.751528740317978, + "grad_norm": 0.13997632265090942, + "learning_rate": 5.312627395026498e-05, + "loss": 3.8382, + "step": 55215 + }, + { + "epoch": 3.7518684603886396, + "grad_norm": 0.17245763540267944, + "learning_rate": 5.312202744938172e-05, + "loss": 3.514, + "step": 55220 + }, + { + "epoch": 3.7522081804593013, + "grad_norm": 0.17626187205314636, + "learning_rate": 5.311778094849844e-05, + "loss": 3.7565, + "step": 55225 + }, + { + "epoch": 3.7525479005299633, + "grad_norm": 0.20749278366565704, + "learning_rate": 5.311353444761516e-05, + "loss": 4.0772, + "step": 55230 + }, + { + "epoch": 3.752887620600625, + "grad_norm": 0.3356064558029175, + "learning_rate": 5.31092879467319e-05, + "loss": 3.8123, + "step": 55235 + }, + { + "epoch": 3.7532273406712866, + "grad_norm": 0.15603268146514893, + "learning_rate": 5.3105041445848625e-05, + "loss": 3.718, + "step": 55240 + }, + { + "epoch": 3.7535670607419487, + "grad_norm": 0.2522687017917633, + "learning_rate": 5.310079494496535e-05, + "loss": 4.275, + "step": 55245 + }, + { + "epoch": 3.7539067808126103, + "grad_norm": 2.69635009765625, + "learning_rate": 5.309654844408208e-05, + "loss": 3.9404, + "step": 55250 + }, + { + "epoch": 3.754246500883272, + "grad_norm": 0.14192385971546173, + "learning_rate": 5.309230194319881e-05, + "loss": 3.7535, + "step": 55255 + }, + { + "epoch": 3.754586220953934, + "grad_norm": 0.16163189709186554, + "learning_rate": 5.308805544231553e-05, + "loss": 3.8724, + "step": 55260 + }, + { + "epoch": 3.7549259410245956, + "grad_norm": 0.17323645949363708, + "learning_rate": 5.3083808941432265e-05, + "loss": 3.8737, + "step": 55265 + }, + { + "epoch": 3.7552656610952573, + "grad_norm": 0.17242002487182617, + "learning_rate": 5.3079562440548993e-05, + "loss": 3.9199, + "step": 55270 + }, + { + "epoch": 3.7556053811659194, + "grad_norm": 0.15656650066375732, + "learning_rate": 5.3075315939665715e-05, + "loss": 3.7663, + "step": 55275 + }, + { + "epoch": 3.755945101236581, + "grad_norm": 0.18540650606155396, + "learning_rate": 5.307106943878245e-05, + "loss": 3.878, + "step": 55280 + }, + { + "epoch": 3.7562848213072426, + "grad_norm": 0.1786990910768509, + "learning_rate": 5.306682293789918e-05, + "loss": 3.8332, + "step": 55285 + }, + { + "epoch": 3.7566245413779047, + "grad_norm": 0.17556491494178772, + "learning_rate": 5.30625764370159e-05, + "loss": 3.7656, + "step": 55290 + }, + { + "epoch": 3.7569642614485663, + "grad_norm": 0.18430744111537933, + "learning_rate": 5.3058329936132634e-05, + "loss": 3.8269, + "step": 55295 + }, + { + "epoch": 3.757303981519228, + "grad_norm": 0.18686634302139282, + "learning_rate": 5.3054083435249355e-05, + "loss": 3.8462, + "step": 55300 + }, + { + "epoch": 3.75764370158989, + "grad_norm": 0.18873995542526245, + "learning_rate": 5.304983693436608e-05, + "loss": 3.8801, + "step": 55305 + }, + { + "epoch": 3.7579834216605517, + "grad_norm": 0.24906115233898163, + "learning_rate": 5.304559043348282e-05, + "loss": 4.0453, + "step": 55310 + }, + { + "epoch": 3.7583231417312133, + "grad_norm": 0.3638461232185364, + "learning_rate": 5.304134393259954e-05, + "loss": 4.0552, + "step": 55315 + }, + { + "epoch": 3.7586628618018754, + "grad_norm": 0.20625407993793488, + "learning_rate": 5.303709743171627e-05, + "loss": 3.9242, + "step": 55320 + }, + { + "epoch": 3.759002581872537, + "grad_norm": 0.13064785301685333, + "learning_rate": 5.3032850930833e-05, + "loss": 3.9818, + "step": 55325 + }, + { + "epoch": 3.7593423019431986, + "grad_norm": 0.18064109981060028, + "learning_rate": 5.302860442994972e-05, + "loss": 3.9208, + "step": 55330 + }, + { + "epoch": 3.7596820220138607, + "grad_norm": 0.3187396228313446, + "learning_rate": 5.302435792906645e-05, + "loss": 3.8407, + "step": 55335 + }, + { + "epoch": 3.7600217420845223, + "grad_norm": 0.15981797873973846, + "learning_rate": 5.3020111428183186e-05, + "loss": 3.9373, + "step": 55340 + }, + { + "epoch": 3.760361462155184, + "grad_norm": 0.18683911859989166, + "learning_rate": 5.301586492729991e-05, + "loss": 4.0132, + "step": 55345 + }, + { + "epoch": 3.760701182225846, + "grad_norm": 0.18659505248069763, + "learning_rate": 5.3011618426416635e-05, + "loss": 3.7099, + "step": 55350 + }, + { + "epoch": 3.7610409022965077, + "grad_norm": 0.18017451465129852, + "learning_rate": 5.300737192553337e-05, + "loss": 4.1154, + "step": 55355 + }, + { + "epoch": 3.7613806223671693, + "grad_norm": 0.16095490753650665, + "learning_rate": 5.300312542465009e-05, + "loss": 4.0298, + "step": 55360 + }, + { + "epoch": 3.7617203424378314, + "grad_norm": 0.19258655607700348, + "learning_rate": 5.299887892376681e-05, + "loss": 3.9271, + "step": 55365 + }, + { + "epoch": 3.762060062508493, + "grad_norm": 0.29585206508636475, + "learning_rate": 5.299463242288355e-05, + "loss": 3.6829, + "step": 55370 + }, + { + "epoch": 3.7623997825791546, + "grad_norm": 0.17744605243206024, + "learning_rate": 5.2990385922000275e-05, + "loss": 3.7728, + "step": 55375 + }, + { + "epoch": 3.7627395026498167, + "grad_norm": 0.1790464222431183, + "learning_rate": 5.2986139421116996e-05, + "loss": 4.0491, + "step": 55380 + }, + { + "epoch": 3.7630792227204783, + "grad_norm": 0.14450562000274658, + "learning_rate": 5.298189292023373e-05, + "loss": 3.7732, + "step": 55385 + }, + { + "epoch": 3.76341894279114, + "grad_norm": 0.1400325447320938, + "learning_rate": 5.297764641935046e-05, + "loss": 3.913, + "step": 55390 + }, + { + "epoch": 3.763758662861802, + "grad_norm": 0.19305400550365448, + "learning_rate": 5.297339991846718e-05, + "loss": 4.3486, + "step": 55395 + }, + { + "epoch": 3.7640983829324637, + "grad_norm": 0.18157987296581268, + "learning_rate": 5.2969153417583915e-05, + "loss": 3.8785, + "step": 55400 + }, + { + "epoch": 3.7644381030031253, + "grad_norm": 0.7673696875572205, + "learning_rate": 5.296490691670064e-05, + "loss": 3.677, + "step": 55405 + }, + { + "epoch": 3.7647778230737874, + "grad_norm": 0.13603289425373077, + "learning_rate": 5.2960660415817364e-05, + "loss": 3.9147, + "step": 55410 + }, + { + "epoch": 3.765117543144449, + "grad_norm": 0.16300298273563385, + "learning_rate": 5.29564139149341e-05, + "loss": 3.6765, + "step": 55415 + }, + { + "epoch": 3.7654572632151107, + "grad_norm": 3.33271861076355, + "learning_rate": 5.295216741405083e-05, + "loss": 4.0078, + "step": 55420 + }, + { + "epoch": 3.7657969832857727, + "grad_norm": 0.2159530073404312, + "learning_rate": 5.294792091316755e-05, + "loss": 3.9166, + "step": 55425 + }, + { + "epoch": 3.7661367033564344, + "grad_norm": 0.3632737994194031, + "learning_rate": 5.294367441228428e-05, + "loss": 3.7868, + "step": 55430 + }, + { + "epoch": 3.766476423427096, + "grad_norm": 0.18553131818771362, + "learning_rate": 5.2939427911401004e-05, + "loss": 3.9828, + "step": 55435 + }, + { + "epoch": 3.766816143497758, + "grad_norm": 0.16561922430992126, + "learning_rate": 5.293518141051773e-05, + "loss": 3.8423, + "step": 55440 + }, + { + "epoch": 3.7671558635684197, + "grad_norm": 0.20506992936134338, + "learning_rate": 5.293093490963447e-05, + "loss": 3.8633, + "step": 55445 + }, + { + "epoch": 3.7674955836390813, + "grad_norm": 0.1678115725517273, + "learning_rate": 5.292668840875119e-05, + "loss": 3.9842, + "step": 55450 + }, + { + "epoch": 3.7678353037097434, + "grad_norm": 0.14407965540885925, + "learning_rate": 5.2922441907867916e-05, + "loss": 3.9946, + "step": 55455 + }, + { + "epoch": 3.768175023780405, + "grad_norm": 0.7661156058311462, + "learning_rate": 5.291819540698465e-05, + "loss": 3.7219, + "step": 55460 + }, + { + "epoch": 3.7685147438510667, + "grad_norm": 0.1738481968641281, + "learning_rate": 5.291394890610137e-05, + "loss": 3.651, + "step": 55465 + }, + { + "epoch": 3.7688544639217287, + "grad_norm": 0.18494157493114471, + "learning_rate": 5.29097024052181e-05, + "loss": 3.7302, + "step": 55470 + }, + { + "epoch": 3.7691941839923904, + "grad_norm": 0.22232648730278015, + "learning_rate": 5.2905455904334835e-05, + "loss": 3.7349, + "step": 55475 + }, + { + "epoch": 3.769533904063052, + "grad_norm": 0.16903795301914215, + "learning_rate": 5.2901209403451556e-05, + "loss": 3.8303, + "step": 55480 + }, + { + "epoch": 3.769873624133714, + "grad_norm": 3.252129077911377, + "learning_rate": 5.2896962902568284e-05, + "loss": 3.8835, + "step": 55485 + }, + { + "epoch": 3.7702133442043757, + "grad_norm": 0.15115036070346832, + "learning_rate": 5.289271640168502e-05, + "loss": 3.8797, + "step": 55490 + }, + { + "epoch": 3.7705530642750373, + "grad_norm": 0.16231173276901245, + "learning_rate": 5.288846990080174e-05, + "loss": 4.0388, + "step": 55495 + }, + { + "epoch": 3.7708927843456994, + "grad_norm": 0.18620726466178894, + "learning_rate": 5.288422339991846e-05, + "loss": 3.9035, + "step": 55500 + }, + { + "epoch": 3.771232504416361, + "grad_norm": 0.284469872713089, + "learning_rate": 5.2879976899035196e-05, + "loss": 3.852, + "step": 55505 + }, + { + "epoch": 3.7715722244870227, + "grad_norm": 0.18127675354480743, + "learning_rate": 5.2875730398151924e-05, + "loss": 3.7441, + "step": 55510 + }, + { + "epoch": 3.7719119445576843, + "grad_norm": 0.18187925219535828, + "learning_rate": 5.2871483897268646e-05, + "loss": 4.0552, + "step": 55515 + }, + { + "epoch": 3.7722516646283464, + "grad_norm": 0.17232629656791687, + "learning_rate": 5.286723739638538e-05, + "loss": 3.9852, + "step": 55520 + }, + { + "epoch": 3.772591384699008, + "grad_norm": 0.2188505083322525, + "learning_rate": 5.286299089550211e-05, + "loss": 4.0142, + "step": 55525 + }, + { + "epoch": 3.7729311047696696, + "grad_norm": 0.17639464139938354, + "learning_rate": 5.285874439461884e-05, + "loss": 3.6923, + "step": 55530 + }, + { + "epoch": 3.7732708248403317, + "grad_norm": 0.15545068681240082, + "learning_rate": 5.2854497893735564e-05, + "loss": 3.6147, + "step": 55535 + }, + { + "epoch": 3.7736105449109933, + "grad_norm": 0.20305943489074707, + "learning_rate": 5.285025139285229e-05, + "loss": 3.936, + "step": 55540 + }, + { + "epoch": 3.773950264981655, + "grad_norm": 0.1738622486591339, + "learning_rate": 5.284600489196903e-05, + "loss": 3.9243, + "step": 55545 + }, + { + "epoch": 3.774289985052317, + "grad_norm": 0.17089128494262695, + "learning_rate": 5.284175839108575e-05, + "loss": 3.8491, + "step": 55550 + }, + { + "epoch": 3.7746297051229787, + "grad_norm": 0.1663990020751953, + "learning_rate": 5.2837511890202476e-05, + "loss": 3.79, + "step": 55555 + }, + { + "epoch": 3.7749694251936403, + "grad_norm": 0.18014578521251678, + "learning_rate": 5.283326538931921e-05, + "loss": 3.8712, + "step": 55560 + }, + { + "epoch": 3.775309145264302, + "grad_norm": 0.18822501599788666, + "learning_rate": 5.282901888843593e-05, + "loss": 3.7229, + "step": 55565 + }, + { + "epoch": 3.775648865334964, + "grad_norm": 0.1950160712003708, + "learning_rate": 5.2824772387552654e-05, + "loss": 3.7469, + "step": 55570 + }, + { + "epoch": 3.7759885854056257, + "grad_norm": 0.17318443953990936, + "learning_rate": 5.2820525886669395e-05, + "loss": 4.1496, + "step": 55575 + }, + { + "epoch": 3.7763283054762873, + "grad_norm": 0.3998160660266876, + "learning_rate": 5.2816279385786116e-05, + "loss": 3.8204, + "step": 55580 + }, + { + "epoch": 3.7766680255469494, + "grad_norm": 0.2002759873867035, + "learning_rate": 5.281203288490284e-05, + "loss": 3.855, + "step": 55585 + }, + { + "epoch": 3.777007745617611, + "grad_norm": 0.18122287094593048, + "learning_rate": 5.280778638401957e-05, + "loss": 3.9119, + "step": 55590 + }, + { + "epoch": 3.7773474656882726, + "grad_norm": 1.4403594732284546, + "learning_rate": 5.28035398831363e-05, + "loss": 4.0449, + "step": 55595 + }, + { + "epoch": 3.7776871857589347, + "grad_norm": 0.19008515775203705, + "learning_rate": 5.279929338225302e-05, + "loss": 3.8721, + "step": 55600 + }, + { + "epoch": 3.7780269058295963, + "grad_norm": 0.18266141414642334, + "learning_rate": 5.2795046881369756e-05, + "loss": 3.8903, + "step": 55605 + }, + { + "epoch": 3.778366625900258, + "grad_norm": 0.18557509779930115, + "learning_rate": 5.2790800380486484e-05, + "loss": 3.861, + "step": 55610 + }, + { + "epoch": 3.77870634597092, + "grad_norm": 0.12748335301876068, + "learning_rate": 5.2786553879603206e-05, + "loss": 4.0678, + "step": 55615 + }, + { + "epoch": 3.7790460660415817, + "grad_norm": 0.1673027127981186, + "learning_rate": 5.278230737871994e-05, + "loss": 3.9451, + "step": 55620 + }, + { + "epoch": 3.7793857861122433, + "grad_norm": 0.39543765783309937, + "learning_rate": 5.277806087783667e-05, + "loss": 3.83, + "step": 55625 + }, + { + "epoch": 3.7797255061829054, + "grad_norm": 0.20111827552318573, + "learning_rate": 5.277381437695339e-05, + "loss": 4.1758, + "step": 55630 + }, + { + "epoch": 3.780065226253567, + "grad_norm": 0.16260385513305664, + "learning_rate": 5.2769567876070125e-05, + "loss": 4.0439, + "step": 55635 + }, + { + "epoch": 3.7804049463242286, + "grad_norm": 0.17440226674079895, + "learning_rate": 5.276532137518685e-05, + "loss": 4.04, + "step": 55640 + }, + { + "epoch": 3.7807446663948907, + "grad_norm": 0.1363568902015686, + "learning_rate": 5.2761074874303574e-05, + "loss": 3.8809, + "step": 55645 + }, + { + "epoch": 3.7810843864655523, + "grad_norm": 0.17300380766391754, + "learning_rate": 5.275682837342031e-05, + "loss": 3.8549, + "step": 55650 + }, + { + "epoch": 3.781424106536214, + "grad_norm": 0.14393079280853271, + "learning_rate": 5.275258187253703e-05, + "loss": 3.8392, + "step": 55655 + }, + { + "epoch": 3.781763826606876, + "grad_norm": 0.2382488250732422, + "learning_rate": 5.274833537165376e-05, + "loss": 4.0766, + "step": 55660 + }, + { + "epoch": 3.7821035466775377, + "grad_norm": 0.2407142072916031, + "learning_rate": 5.274408887077049e-05, + "loss": 3.7771, + "step": 55665 + }, + { + "epoch": 3.7824432667481993, + "grad_norm": 0.1505570113658905, + "learning_rate": 5.2739842369887214e-05, + "loss": 3.9093, + "step": 55670 + }, + { + "epoch": 3.7827829868188614, + "grad_norm": 0.21772384643554688, + "learning_rate": 5.273559586900394e-05, + "loss": 3.7806, + "step": 55675 + }, + { + "epoch": 3.783122706889523, + "grad_norm": 0.18133580684661865, + "learning_rate": 5.2731349368120677e-05, + "loss": 3.8452, + "step": 55680 + }, + { + "epoch": 3.7834624269601846, + "grad_norm": 0.3962034583091736, + "learning_rate": 5.27271028672374e-05, + "loss": 3.9155, + "step": 55685 + }, + { + "epoch": 3.7838021470308467, + "grad_norm": 0.24342872202396393, + "learning_rate": 5.2722856366354126e-05, + "loss": 3.585, + "step": 55690 + }, + { + "epoch": 3.7841418671015083, + "grad_norm": 0.1921684741973877, + "learning_rate": 5.271860986547086e-05, + "loss": 4.13, + "step": 55695 + }, + { + "epoch": 3.78448158717217, + "grad_norm": 0.1619451344013214, + "learning_rate": 5.271436336458758e-05, + "loss": 3.6223, + "step": 55700 + }, + { + "epoch": 3.784821307242832, + "grad_norm": 0.20507632195949554, + "learning_rate": 5.27101168637043e-05, + "loss": 3.9324, + "step": 55705 + }, + { + "epoch": 3.7851610273134937, + "grad_norm": 0.16034886240959167, + "learning_rate": 5.2705870362821045e-05, + "loss": 4.0452, + "step": 55710 + }, + { + "epoch": 3.7855007473841553, + "grad_norm": 0.16090719401836395, + "learning_rate": 5.2701623861937766e-05, + "loss": 3.8692, + "step": 55715 + }, + { + "epoch": 3.7858404674548174, + "grad_norm": 0.19565342366695404, + "learning_rate": 5.269737736105449e-05, + "loss": 4.044, + "step": 55720 + }, + { + "epoch": 3.786180187525479, + "grad_norm": 0.35492053627967834, + "learning_rate": 5.269313086017122e-05, + "loss": 3.8105, + "step": 55725 + }, + { + "epoch": 3.7865199075961407, + "grad_norm": 0.1898210048675537, + "learning_rate": 5.268888435928795e-05, + "loss": 4.0421, + "step": 55730 + }, + { + "epoch": 3.7868596276668027, + "grad_norm": 0.2422453761100769, + "learning_rate": 5.268463785840467e-05, + "loss": 3.7278, + "step": 55735 + }, + { + "epoch": 3.7871993477374644, + "grad_norm": 0.18510450422763824, + "learning_rate": 5.2680391357521406e-05, + "loss": 4.122, + "step": 55740 + }, + { + "epoch": 3.787539067808126, + "grad_norm": 0.1485801339149475, + "learning_rate": 5.2676144856638134e-05, + "loss": 3.9031, + "step": 55745 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.15895892679691315, + "learning_rate": 5.2671898355754855e-05, + "loss": 3.805, + "step": 55750 + }, + { + "epoch": 3.7882185079494497, + "grad_norm": 0.8124158978462219, + "learning_rate": 5.266765185487159e-05, + "loss": 4.0196, + "step": 55755 + }, + { + "epoch": 3.7885582280201113, + "grad_norm": 0.14777342975139618, + "learning_rate": 5.266340535398832e-05, + "loss": 3.7378, + "step": 55760 + }, + { + "epoch": 3.7888979480907734, + "grad_norm": 0.18557493388652802, + "learning_rate": 5.265915885310504e-05, + "loss": 3.842, + "step": 55765 + }, + { + "epoch": 3.789237668161435, + "grad_norm": 0.18471960723400116, + "learning_rate": 5.2654912352221774e-05, + "loss": 3.91, + "step": 55770 + }, + { + "epoch": 3.7895773882320967, + "grad_norm": 0.2676074504852295, + "learning_rate": 5.26506658513385e-05, + "loss": 3.8285, + "step": 55775 + }, + { + "epoch": 3.7899171083027587, + "grad_norm": 0.1874377578496933, + "learning_rate": 5.264641935045522e-05, + "loss": 3.8386, + "step": 55780 + }, + { + "epoch": 3.7902568283734204, + "grad_norm": 0.1578744649887085, + "learning_rate": 5.264217284957196e-05, + "loss": 4.11, + "step": 55785 + }, + { + "epoch": 3.790596548444082, + "grad_norm": 0.1899978369474411, + "learning_rate": 5.263792634868868e-05, + "loss": 3.9367, + "step": 55790 + }, + { + "epoch": 3.790936268514744, + "grad_norm": 0.20752395689487457, + "learning_rate": 5.263367984780541e-05, + "loss": 3.9676, + "step": 55795 + }, + { + "epoch": 3.7912759885854057, + "grad_norm": 0.29877209663391113, + "learning_rate": 5.262943334692214e-05, + "loss": 3.7008, + "step": 55800 + }, + { + "epoch": 3.7916157086560673, + "grad_norm": 0.6128703951835632, + "learning_rate": 5.262518684603886e-05, + "loss": 3.8426, + "step": 55805 + }, + { + "epoch": 3.7919554287267294, + "grad_norm": 0.14412547647953033, + "learning_rate": 5.262094034515559e-05, + "loss": 3.6999, + "step": 55810 + }, + { + "epoch": 3.792295148797391, + "grad_norm": 0.19150377810001373, + "learning_rate": 5.2616693844272326e-05, + "loss": 3.6932, + "step": 55815 + }, + { + "epoch": 3.7926348688680527, + "grad_norm": 0.15082785487174988, + "learning_rate": 5.261244734338905e-05, + "loss": 4.02, + "step": 55820 + }, + { + "epoch": 3.7929745889387148, + "grad_norm": 0.4346161186695099, + "learning_rate": 5.2608200842505775e-05, + "loss": 3.8131, + "step": 55825 + }, + { + "epoch": 3.7933143090093764, + "grad_norm": 0.13969950377941132, + "learning_rate": 5.260395434162251e-05, + "loss": 3.8874, + "step": 55830 + }, + { + "epoch": 3.793654029080038, + "grad_norm": 0.19049084186553955, + "learning_rate": 5.259970784073923e-05, + "loss": 3.9711, + "step": 55835 + }, + { + "epoch": 3.7939937491507, + "grad_norm": 0.20480291545391083, + "learning_rate": 5.259546133985596e-05, + "loss": 3.8741, + "step": 55840 + }, + { + "epoch": 3.7943334692213617, + "grad_norm": 0.16015422344207764, + "learning_rate": 5.2591214838972694e-05, + "loss": 3.7806, + "step": 55845 + }, + { + "epoch": 3.7946731892920234, + "grad_norm": 0.19905789196491241, + "learning_rate": 5.2586968338089415e-05, + "loss": 3.7414, + "step": 55850 + }, + { + "epoch": 3.795012909362685, + "grad_norm": 0.22250054776668549, + "learning_rate": 5.2582721837206137e-05, + "loss": 3.8083, + "step": 55855 + }, + { + "epoch": 3.795352629433347, + "grad_norm": 0.15355657041072845, + "learning_rate": 5.257847533632287e-05, + "loss": 3.7938, + "step": 55860 + }, + { + "epoch": 3.7956923495040087, + "grad_norm": 0.1572331041097641, + "learning_rate": 5.25742288354396e-05, + "loss": 3.8673, + "step": 55865 + }, + { + "epoch": 3.7960320695746703, + "grad_norm": 0.17831963300704956, + "learning_rate": 5.2569982334556334e-05, + "loss": 3.8699, + "step": 55870 + }, + { + "epoch": 3.7963717896453324, + "grad_norm": 0.22766411304473877, + "learning_rate": 5.2565735833673055e-05, + "loss": 3.7912, + "step": 55875 + }, + { + "epoch": 3.796711509715994, + "grad_norm": 0.3665694296360016, + "learning_rate": 5.256148933278978e-05, + "loss": 3.9965, + "step": 55880 + }, + { + "epoch": 3.7970512297866557, + "grad_norm": 0.1755969226360321, + "learning_rate": 5.255724283190652e-05, + "loss": 3.944, + "step": 55885 + }, + { + "epoch": 3.7973909498573177, + "grad_norm": 0.18862667679786682, + "learning_rate": 5.255299633102324e-05, + "loss": 3.9737, + "step": 55890 + }, + { + "epoch": 3.7977306699279794, + "grad_norm": 0.19490747153759003, + "learning_rate": 5.254874983013997e-05, + "loss": 3.7189, + "step": 55895 + }, + { + "epoch": 3.798070389998641, + "grad_norm": 0.17787081003189087, + "learning_rate": 5.25445033292567e-05, + "loss": 3.9992, + "step": 55900 + }, + { + "epoch": 3.7984101100693026, + "grad_norm": 2.0330827236175537, + "learning_rate": 5.254025682837342e-05, + "loss": 4.0709, + "step": 55905 + }, + { + "epoch": 3.7987498301399647, + "grad_norm": 0.23355749249458313, + "learning_rate": 5.253601032749015e-05, + "loss": 3.5789, + "step": 55910 + }, + { + "epoch": 3.7990895502106263, + "grad_norm": 0.1480219066143036, + "learning_rate": 5.2531763826606886e-05, + "loss": 3.8184, + "step": 55915 + }, + { + "epoch": 3.799429270281288, + "grad_norm": 0.25599318742752075, + "learning_rate": 5.252751732572361e-05, + "loss": 3.9816, + "step": 55920 + }, + { + "epoch": 3.79976899035195, + "grad_norm": 0.1602974683046341, + "learning_rate": 5.252327082484033e-05, + "loss": 3.9775, + "step": 55925 + }, + { + "epoch": 3.8001087104226117, + "grad_norm": 0.2231777459383011, + "learning_rate": 5.2519024323957063e-05, + "loss": 3.9095, + "step": 55930 + }, + { + "epoch": 3.8004484304932733, + "grad_norm": 0.13835212588310242, + "learning_rate": 5.251477782307379e-05, + "loss": 4.083, + "step": 55935 + }, + { + "epoch": 3.8007881505639354, + "grad_norm": 0.16750769317150116, + "learning_rate": 5.251053132219051e-05, + "loss": 3.6227, + "step": 55940 + }, + { + "epoch": 3.801127870634597, + "grad_norm": 0.19293145835399628, + "learning_rate": 5.250628482130725e-05, + "loss": 3.864, + "step": 55945 + }, + { + "epoch": 3.8014675907052586, + "grad_norm": 0.2123650163412094, + "learning_rate": 5.2502038320423975e-05, + "loss": 3.8276, + "step": 55950 + }, + { + "epoch": 3.8018073107759207, + "grad_norm": 0.16674572229385376, + "learning_rate": 5.24977918195407e-05, + "loss": 3.5666, + "step": 55955 + }, + { + "epoch": 3.8021470308465823, + "grad_norm": 0.15358296036720276, + "learning_rate": 5.249354531865743e-05, + "loss": 3.8821, + "step": 55960 + }, + { + "epoch": 3.802486750917244, + "grad_norm": 0.16285553574562073, + "learning_rate": 5.248929881777416e-05, + "loss": 3.7736, + "step": 55965 + }, + { + "epoch": 3.802826470987906, + "grad_norm": 0.17398697137832642, + "learning_rate": 5.248505231689088e-05, + "loss": 3.9266, + "step": 55970 + }, + { + "epoch": 3.8031661910585677, + "grad_norm": 0.18570901453495026, + "learning_rate": 5.2480805816007615e-05, + "loss": 3.9775, + "step": 55975 + }, + { + "epoch": 3.8035059111292293, + "grad_norm": 0.1750597357749939, + "learning_rate": 5.2476559315124343e-05, + "loss": 3.699, + "step": 55980 + }, + { + "epoch": 3.8038456311998914, + "grad_norm": 0.17195889353752136, + "learning_rate": 5.2472312814241065e-05, + "loss": 3.8781, + "step": 55985 + }, + { + "epoch": 3.804185351270553, + "grad_norm": 0.17779338359832764, + "learning_rate": 5.24680663133578e-05, + "loss": 3.7661, + "step": 55990 + }, + { + "epoch": 3.8045250713412146, + "grad_norm": 0.17362385988235474, + "learning_rate": 5.246381981247452e-05, + "loss": 3.9007, + "step": 55995 + }, + { + "epoch": 3.8048647914118767, + "grad_norm": 0.16971328854560852, + "learning_rate": 5.245957331159125e-05, + "loss": 3.7623, + "step": 56000 + }, + { + "epoch": 3.8052045114825384, + "grad_norm": 0.19768264889717102, + "learning_rate": 5.2455326810707984e-05, + "loss": 3.8202, + "step": 56005 + }, + { + "epoch": 3.8055442315532, + "grad_norm": 0.36208009719848633, + "learning_rate": 5.2451080309824705e-05, + "loss": 3.8214, + "step": 56010 + }, + { + "epoch": 3.805883951623862, + "grad_norm": 0.13439857959747314, + "learning_rate": 5.244683380894143e-05, + "loss": 3.7463, + "step": 56015 + }, + { + "epoch": 3.8062236716945237, + "grad_norm": 0.15458625555038452, + "learning_rate": 5.244258730805817e-05, + "loss": 3.9305, + "step": 56020 + }, + { + "epoch": 3.8065633917651853, + "grad_norm": 0.18363763391971588, + "learning_rate": 5.243834080717489e-05, + "loss": 3.7535, + "step": 56025 + }, + { + "epoch": 3.8069031118358474, + "grad_norm": 0.1623394787311554, + "learning_rate": 5.243409430629162e-05, + "loss": 3.9832, + "step": 56030 + }, + { + "epoch": 3.807242831906509, + "grad_norm": 0.19808904826641083, + "learning_rate": 5.242984780540835e-05, + "loss": 3.8225, + "step": 56035 + }, + { + "epoch": 3.8075825519771707, + "grad_norm": 0.1416313201189041, + "learning_rate": 5.242560130452507e-05, + "loss": 3.7517, + "step": 56040 + }, + { + "epoch": 3.8079222720478327, + "grad_norm": 0.15091124176979065, + "learning_rate": 5.24213548036418e-05, + "loss": 3.6153, + "step": 56045 + }, + { + "epoch": 3.8082619921184944, + "grad_norm": 0.30366528034210205, + "learning_rate": 5.2417108302758536e-05, + "loss": 3.8083, + "step": 56050 + }, + { + "epoch": 3.808601712189156, + "grad_norm": 0.20857824385166168, + "learning_rate": 5.241286180187526e-05, + "loss": 4.0481, + "step": 56055 + }, + { + "epoch": 3.808941432259818, + "grad_norm": 0.22568905353546143, + "learning_rate": 5.240861530099198e-05, + "loss": 3.8649, + "step": 56060 + }, + { + "epoch": 3.8092811523304797, + "grad_norm": 0.1462930291891098, + "learning_rate": 5.240436880010872e-05, + "loss": 3.9485, + "step": 56065 + }, + { + "epoch": 3.8096208724011413, + "grad_norm": 0.1796797662973404, + "learning_rate": 5.240012229922544e-05, + "loss": 3.8424, + "step": 56070 + }, + { + "epoch": 3.8099605924718034, + "grad_norm": 0.18089596927165985, + "learning_rate": 5.239587579834216e-05, + "loss": 4.0864, + "step": 56075 + }, + { + "epoch": 3.810300312542465, + "grad_norm": 2.278374433517456, + "learning_rate": 5.23916292974589e-05, + "loss": 3.925, + "step": 56080 + }, + { + "epoch": 3.8106400326131267, + "grad_norm": 0.14809536933898926, + "learning_rate": 5.2387382796575625e-05, + "loss": 3.8429, + "step": 56085 + }, + { + "epoch": 3.8109797526837887, + "grad_norm": 0.2197229266166687, + "learning_rate": 5.2383136295692346e-05, + "loss": 3.7323, + "step": 56090 + }, + { + "epoch": 3.8113194727544504, + "grad_norm": 0.21095940470695496, + "learning_rate": 5.237888979480908e-05, + "loss": 3.9614, + "step": 56095 + }, + { + "epoch": 3.811659192825112, + "grad_norm": 0.16238394379615784, + "learning_rate": 5.237464329392581e-05, + "loss": 3.8284, + "step": 56100 + }, + { + "epoch": 3.811998912895774, + "grad_norm": 0.16249534487724304, + "learning_rate": 5.237039679304253e-05, + "loss": 3.9369, + "step": 56105 + }, + { + "epoch": 3.8123386329664357, + "grad_norm": 0.2346377670764923, + "learning_rate": 5.2366150292159265e-05, + "loss": 4.2975, + "step": 56110 + }, + { + "epoch": 3.8126783530370973, + "grad_norm": 0.16597016155719757, + "learning_rate": 5.236190379127599e-05, + "loss": 4.0639, + "step": 56115 + }, + { + "epoch": 3.8130180731077594, + "grad_norm": 0.15957343578338623, + "learning_rate": 5.2357657290392714e-05, + "loss": 3.8357, + "step": 56120 + }, + { + "epoch": 3.813357793178421, + "grad_norm": 0.23467355966567993, + "learning_rate": 5.235341078950945e-05, + "loss": 3.9013, + "step": 56125 + }, + { + "epoch": 3.8136975132490827, + "grad_norm": 0.22039511799812317, + "learning_rate": 5.234916428862617e-05, + "loss": 4.002, + "step": 56130 + }, + { + "epoch": 3.8140372333197448, + "grad_norm": 0.16759024560451508, + "learning_rate": 5.23449177877429e-05, + "loss": 3.6428, + "step": 56135 + }, + { + "epoch": 3.8143769533904064, + "grad_norm": 0.18927519023418427, + "learning_rate": 5.234067128685963e-05, + "loss": 3.6747, + "step": 56140 + }, + { + "epoch": 3.814716673461068, + "grad_norm": 0.20165936648845673, + "learning_rate": 5.2336424785976354e-05, + "loss": 3.8545, + "step": 56145 + }, + { + "epoch": 3.81505639353173, + "grad_norm": 0.16744045913219452, + "learning_rate": 5.233217828509308e-05, + "loss": 3.9986, + "step": 56150 + }, + { + "epoch": 3.8153961136023917, + "grad_norm": 0.25208285450935364, + "learning_rate": 5.232793178420982e-05, + "loss": 3.9479, + "step": 56155 + }, + { + "epoch": 3.8157358336730534, + "grad_norm": 0.3434893786907196, + "learning_rate": 5.232368528332654e-05, + "loss": 3.8133, + "step": 56160 + }, + { + "epoch": 3.8160755537437154, + "grad_norm": 0.2031378448009491, + "learning_rate": 5.2319438782443266e-05, + "loss": 4.0653, + "step": 56165 + }, + { + "epoch": 3.816415273814377, + "grad_norm": 0.18794365227222443, + "learning_rate": 5.231519228156e-05, + "loss": 3.937, + "step": 56170 + }, + { + "epoch": 3.8167549938850387, + "grad_norm": 0.1432580053806305, + "learning_rate": 5.231094578067672e-05, + "loss": 3.8549, + "step": 56175 + }, + { + "epoch": 3.8170947139557008, + "grad_norm": 0.15446342527866364, + "learning_rate": 5.230669927979345e-05, + "loss": 3.8549, + "step": 56180 + }, + { + "epoch": 3.8174344340263624, + "grad_norm": 0.1691153347492218, + "learning_rate": 5.2302452778910185e-05, + "loss": 3.9661, + "step": 56185 + }, + { + "epoch": 3.817774154097024, + "grad_norm": 0.3122189939022064, + "learning_rate": 5.2298206278026906e-05, + "loss": 3.8464, + "step": 56190 + }, + { + "epoch": 3.8181138741676857, + "grad_norm": 0.1983286738395691, + "learning_rate": 5.229395977714363e-05, + "loss": 4.1032, + "step": 56195 + }, + { + "epoch": 3.8184535942383477, + "grad_norm": 0.32352304458618164, + "learning_rate": 5.228971327626037e-05, + "loss": 3.9246, + "step": 56200 + }, + { + "epoch": 3.8187933143090094, + "grad_norm": 0.2223135232925415, + "learning_rate": 5.228546677537709e-05, + "loss": 3.5511, + "step": 56205 + }, + { + "epoch": 3.819133034379671, + "grad_norm": 0.1757047027349472, + "learning_rate": 5.2281220274493825e-05, + "loss": 3.8925, + "step": 56210 + }, + { + "epoch": 3.819472754450333, + "grad_norm": 0.21896229684352875, + "learning_rate": 5.2276973773610546e-05, + "loss": 3.879, + "step": 56215 + }, + { + "epoch": 3.8198124745209947, + "grad_norm": 0.25108492374420166, + "learning_rate": 5.2272727272727274e-05, + "loss": 4.0132, + "step": 56220 + }, + { + "epoch": 3.8201521945916563, + "grad_norm": 0.20608103275299072, + "learning_rate": 5.226848077184401e-05, + "loss": 3.7011, + "step": 56225 + }, + { + "epoch": 3.8204919146623184, + "grad_norm": 0.9110084176063538, + "learning_rate": 5.226423427096073e-05, + "loss": 3.8122, + "step": 56230 + }, + { + "epoch": 3.82083163473298, + "grad_norm": 0.15675584971904755, + "learning_rate": 5.225998777007746e-05, + "loss": 4.039, + "step": 56235 + }, + { + "epoch": 3.8211713548036417, + "grad_norm": 0.23751936852931976, + "learning_rate": 5.225574126919419e-05, + "loss": 3.9481, + "step": 56240 + }, + { + "epoch": 3.8215110748743033, + "grad_norm": 0.19325311481952667, + "learning_rate": 5.2251494768310914e-05, + "loss": 3.7621, + "step": 56245 + }, + { + "epoch": 3.8218507949449654, + "grad_norm": 0.19288089871406555, + "learning_rate": 5.224724826742764e-05, + "loss": 3.6865, + "step": 56250 + }, + { + "epoch": 3.822190515015627, + "grad_norm": 0.19434630870819092, + "learning_rate": 5.224300176654438e-05, + "loss": 4.0194, + "step": 56255 + }, + { + "epoch": 3.8225302350862886, + "grad_norm": 0.16369673609733582, + "learning_rate": 5.22387552656611e-05, + "loss": 3.8207, + "step": 56260 + }, + { + "epoch": 3.8228699551569507, + "grad_norm": 0.13042280077934265, + "learning_rate": 5.2234508764777826e-05, + "loss": 3.8468, + "step": 56265 + }, + { + "epoch": 3.8232096752276123, + "grad_norm": 0.1498783379793167, + "learning_rate": 5.223026226389456e-05, + "loss": 3.7078, + "step": 56270 + }, + { + "epoch": 3.823549395298274, + "grad_norm": 0.14498203992843628, + "learning_rate": 5.222601576301128e-05, + "loss": 3.805, + "step": 56275 + }, + { + "epoch": 3.823889115368936, + "grad_norm": 0.19472132623195648, + "learning_rate": 5.2221769262128004e-05, + "loss": 3.8855, + "step": 56280 + }, + { + "epoch": 3.8242288354395977, + "grad_norm": 0.20624731481075287, + "learning_rate": 5.221752276124474e-05, + "loss": 3.9941, + "step": 56285 + }, + { + "epoch": 3.8245685555102593, + "grad_norm": 0.38788557052612305, + "learning_rate": 5.2213276260361466e-05, + "loss": 3.9904, + "step": 56290 + }, + { + "epoch": 3.8249082755809214, + "grad_norm": 0.1479925662279129, + "learning_rate": 5.220902975947819e-05, + "loss": 3.9801, + "step": 56295 + }, + { + "epoch": 3.825247995651583, + "grad_norm": 0.18082477152347565, + "learning_rate": 5.220478325859492e-05, + "loss": 3.8787, + "step": 56300 + }, + { + "epoch": 3.8255877157222447, + "grad_norm": 0.17302250862121582, + "learning_rate": 5.220053675771165e-05, + "loss": 3.8892, + "step": 56305 + }, + { + "epoch": 3.8259274357929067, + "grad_norm": 0.3458196222782135, + "learning_rate": 5.219629025682837e-05, + "loss": 4.0283, + "step": 56310 + }, + { + "epoch": 3.8262671558635684, + "grad_norm": 0.13456599414348602, + "learning_rate": 5.2192043755945106e-05, + "loss": 3.8737, + "step": 56315 + }, + { + "epoch": 3.82660687593423, + "grad_norm": 0.1723032146692276, + "learning_rate": 5.2187797255061834e-05, + "loss": 3.5646, + "step": 56320 + }, + { + "epoch": 3.826946596004892, + "grad_norm": 0.16572722792625427, + "learning_rate": 5.2183550754178556e-05, + "loss": 3.7474, + "step": 56325 + }, + { + "epoch": 3.8272863160755537, + "grad_norm": 0.15741446614265442, + "learning_rate": 5.217930425329529e-05, + "loss": 3.719, + "step": 56330 + }, + { + "epoch": 3.8276260361462153, + "grad_norm": 0.19118505716323853, + "learning_rate": 5.217505775241202e-05, + "loss": 3.8924, + "step": 56335 + }, + { + "epoch": 3.8279657562168774, + "grad_norm": 0.25509241223335266, + "learning_rate": 5.217081125152874e-05, + "loss": 3.9594, + "step": 56340 + }, + { + "epoch": 3.828305476287539, + "grad_norm": 0.16151385009288788, + "learning_rate": 5.2166564750645475e-05, + "loss": 3.9051, + "step": 56345 + }, + { + "epoch": 3.8286451963582007, + "grad_norm": 0.15767768025398254, + "learning_rate": 5.2162318249762196e-05, + "loss": 3.991, + "step": 56350 + }, + { + "epoch": 3.8289849164288627, + "grad_norm": 0.2285168319940567, + "learning_rate": 5.2158071748878924e-05, + "loss": 3.5011, + "step": 56355 + }, + { + "epoch": 3.8293246364995244, + "grad_norm": 0.18667559325695038, + "learning_rate": 5.215382524799566e-05, + "loss": 3.8668, + "step": 56360 + }, + { + "epoch": 3.829664356570186, + "grad_norm": 0.26294034719467163, + "learning_rate": 5.214957874711238e-05, + "loss": 4.0484, + "step": 56365 + }, + { + "epoch": 3.830004076640848, + "grad_norm": 0.24298393726348877, + "learning_rate": 5.214533224622911e-05, + "loss": 3.7914, + "step": 56370 + }, + { + "epoch": 3.8303437967115097, + "grad_norm": 1.5468722581863403, + "learning_rate": 5.214108574534584e-05, + "loss": 3.7597, + "step": 56375 + }, + { + "epoch": 3.8306835167821713, + "grad_norm": 0.2054738849401474, + "learning_rate": 5.2136839244462564e-05, + "loss": 3.8053, + "step": 56380 + }, + { + "epoch": 3.8310232368528334, + "grad_norm": 0.16630220413208008, + "learning_rate": 5.213259274357929e-05, + "loss": 3.6805, + "step": 56385 + }, + { + "epoch": 3.831362956923495, + "grad_norm": 0.2935085594654083, + "learning_rate": 5.2128346242696027e-05, + "loss": 3.7229, + "step": 56390 + }, + { + "epoch": 3.8317026769941567, + "grad_norm": 0.20395386219024658, + "learning_rate": 5.212409974181275e-05, + "loss": 3.9438, + "step": 56395 + }, + { + "epoch": 3.8320423970648188, + "grad_norm": 0.14227476716041565, + "learning_rate": 5.2119853240929476e-05, + "loss": 3.9018, + "step": 56400 + }, + { + "epoch": 3.8323821171354804, + "grad_norm": 0.20508508384227753, + "learning_rate": 5.211560674004621e-05, + "loss": 3.849, + "step": 56405 + }, + { + "epoch": 3.832721837206142, + "grad_norm": 0.15068021416664124, + "learning_rate": 5.211136023916293e-05, + "loss": 3.9412, + "step": 56410 + }, + { + "epoch": 3.833061557276804, + "grad_norm": 0.18331514298915863, + "learning_rate": 5.210711373827965e-05, + "loss": 3.5534, + "step": 56415 + }, + { + "epoch": 3.8334012773474657, + "grad_norm": 0.16073474287986755, + "learning_rate": 5.210286723739639e-05, + "loss": 3.7246, + "step": 56420 + }, + { + "epoch": 3.8337409974181273, + "grad_norm": 0.1987936645746231, + "learning_rate": 5.2098620736513116e-05, + "loss": 3.735, + "step": 56425 + }, + { + "epoch": 3.8340807174887894, + "grad_norm": 0.17174044251441956, + "learning_rate": 5.209437423562984e-05, + "loss": 3.7239, + "step": 56430 + }, + { + "epoch": 3.834420437559451, + "grad_norm": 0.2183615267276764, + "learning_rate": 5.209012773474657e-05, + "loss": 3.8192, + "step": 56435 + }, + { + "epoch": 3.8347601576301127, + "grad_norm": 0.2338802069425583, + "learning_rate": 5.20858812338633e-05, + "loss": 3.6271, + "step": 56440 + }, + { + "epoch": 3.8350998777007748, + "grad_norm": 0.1718846708536148, + "learning_rate": 5.208163473298002e-05, + "loss": 3.772, + "step": 56445 + }, + { + "epoch": 3.8354395977714364, + "grad_norm": 0.1746426373720169, + "learning_rate": 5.2077388232096756e-05, + "loss": 3.8639, + "step": 56450 + }, + { + "epoch": 3.835779317842098, + "grad_norm": 0.189620703458786, + "learning_rate": 5.2073141731213484e-05, + "loss": 3.9355, + "step": 56455 + }, + { + "epoch": 3.83611903791276, + "grad_norm": 0.1423482745885849, + "learning_rate": 5.2068895230330205e-05, + "loss": 3.9838, + "step": 56460 + }, + { + "epoch": 3.8364587579834217, + "grad_norm": 0.18786486983299255, + "learning_rate": 5.206464872944694e-05, + "loss": 3.814, + "step": 56465 + }, + { + "epoch": 3.8367984780540834, + "grad_norm": 0.15675394237041473, + "learning_rate": 5.206040222856367e-05, + "loss": 3.8719, + "step": 56470 + }, + { + "epoch": 3.8371381981247454, + "grad_norm": 0.1907215267419815, + "learning_rate": 5.205615572768039e-05, + "loss": 3.9394, + "step": 56475 + }, + { + "epoch": 3.837477918195407, + "grad_norm": 0.42722463607788086, + "learning_rate": 5.2051909226797124e-05, + "loss": 3.82, + "step": 56480 + }, + { + "epoch": 3.8378176382660687, + "grad_norm": 0.1793125867843628, + "learning_rate": 5.2047662725913845e-05, + "loss": 3.9665, + "step": 56485 + }, + { + "epoch": 3.8381573583367308, + "grad_norm": 0.2343786209821701, + "learning_rate": 5.204341622503057e-05, + "loss": 3.8466, + "step": 56490 + }, + { + "epoch": 3.8384970784073924, + "grad_norm": 0.15175631642341614, + "learning_rate": 5.203916972414731e-05, + "loss": 4.0759, + "step": 56495 + }, + { + "epoch": 3.838836798478054, + "grad_norm": 0.18024607002735138, + "learning_rate": 5.203492322326403e-05, + "loss": 3.7181, + "step": 56500 + }, + { + "epoch": 3.839176518548716, + "grad_norm": 1.112662672996521, + "learning_rate": 5.203067672238076e-05, + "loss": 3.8132, + "step": 56505 + }, + { + "epoch": 3.8395162386193777, + "grad_norm": 0.18911224603652954, + "learning_rate": 5.202643022149749e-05, + "loss": 3.9837, + "step": 56510 + }, + { + "epoch": 3.8398559586900394, + "grad_norm": 0.15480050444602966, + "learning_rate": 5.202218372061421e-05, + "loss": 3.8394, + "step": 56515 + }, + { + "epoch": 3.8401956787607014, + "grad_norm": 0.22326619923114777, + "learning_rate": 5.201793721973094e-05, + "loss": 3.9136, + "step": 56520 + }, + { + "epoch": 3.840535398831363, + "grad_norm": 0.16010262072086334, + "learning_rate": 5.2013690718847676e-05, + "loss": 3.6922, + "step": 56525 + }, + { + "epoch": 3.8408751189020247, + "grad_norm": 0.19983206689357758, + "learning_rate": 5.20094442179644e-05, + "loss": 3.8509, + "step": 56530 + }, + { + "epoch": 3.8412148389726863, + "grad_norm": 0.23479333519935608, + "learning_rate": 5.2005197717081125e-05, + "loss": 3.6231, + "step": 56535 + }, + { + "epoch": 3.8415545590433484, + "grad_norm": 0.16877028346061707, + "learning_rate": 5.200095121619786e-05, + "loss": 4.0733, + "step": 56540 + }, + { + "epoch": 3.84189427911401, + "grad_norm": 0.2047101855278015, + "learning_rate": 5.199670471531458e-05, + "loss": 4.046, + "step": 56545 + }, + { + "epoch": 3.8422339991846717, + "grad_norm": 0.17738741636276245, + "learning_rate": 5.1992458214431316e-05, + "loss": 3.8944, + "step": 56550 + }, + { + "epoch": 3.8425737192553338, + "grad_norm": 0.25009995698928833, + "learning_rate": 5.198821171354804e-05, + "loss": 3.9585, + "step": 56555 + }, + { + "epoch": 3.8429134393259954, + "grad_norm": 0.16621074080467224, + "learning_rate": 5.1983965212664765e-05, + "loss": 4.0844, + "step": 56560 + }, + { + "epoch": 3.843253159396657, + "grad_norm": 0.2913565933704376, + "learning_rate": 5.19797187117815e-05, + "loss": 3.7898, + "step": 56565 + }, + { + "epoch": 3.843592879467319, + "grad_norm": 0.4064353108406067, + "learning_rate": 5.197547221089822e-05, + "loss": 4.0575, + "step": 56570 + }, + { + "epoch": 3.8439325995379807, + "grad_norm": 0.19331315159797668, + "learning_rate": 5.197122571001495e-05, + "loss": 3.7329, + "step": 56575 + }, + { + "epoch": 3.8442723196086424, + "grad_norm": 0.1402387171983719, + "learning_rate": 5.1966979209131684e-05, + "loss": 3.8697, + "step": 56580 + }, + { + "epoch": 3.844612039679304, + "grad_norm": 4.552797317504883, + "learning_rate": 5.1962732708248405e-05, + "loss": 4.0575, + "step": 56585 + }, + { + "epoch": 3.844951759749966, + "grad_norm": 0.17146533727645874, + "learning_rate": 5.195848620736513e-05, + "loss": 3.6944, + "step": 56590 + }, + { + "epoch": 3.8452914798206277, + "grad_norm": 0.31354832649230957, + "learning_rate": 5.195423970648187e-05, + "loss": 3.7247, + "step": 56595 + }, + { + "epoch": 3.8456311998912893, + "grad_norm": 0.1828717142343521, + "learning_rate": 5.194999320559859e-05, + "loss": 3.9051, + "step": 56600 + }, + { + "epoch": 3.8459709199619514, + "grad_norm": 0.18931065499782562, + "learning_rate": 5.194574670471532e-05, + "loss": 3.7033, + "step": 56605 + }, + { + "epoch": 3.846310640032613, + "grad_norm": 0.14763759076595306, + "learning_rate": 5.194150020383205e-05, + "loss": 3.8085, + "step": 56610 + }, + { + "epoch": 3.8466503601032747, + "grad_norm": 0.24264496564865112, + "learning_rate": 5.193725370294877e-05, + "loss": 4.0929, + "step": 56615 + }, + { + "epoch": 3.8469900801739367, + "grad_norm": 0.16434338688850403, + "learning_rate": 5.1933007202065495e-05, + "loss": 3.7373, + "step": 56620 + }, + { + "epoch": 3.8473298002445984, + "grad_norm": 0.14436782896518707, + "learning_rate": 5.1928760701182236e-05, + "loss": 3.7749, + "step": 56625 + }, + { + "epoch": 3.84766952031526, + "grad_norm": 0.19937369227409363, + "learning_rate": 5.192451420029896e-05, + "loss": 4.0696, + "step": 56630 + }, + { + "epoch": 3.848009240385922, + "grad_norm": 0.3498150408267975, + "learning_rate": 5.192026769941568e-05, + "loss": 3.9381, + "step": 56635 + }, + { + "epoch": 3.8483489604565837, + "grad_norm": 0.1380477398633957, + "learning_rate": 5.1916021198532413e-05, + "loss": 3.7359, + "step": 56640 + }, + { + "epoch": 3.8486886805272453, + "grad_norm": 0.1670013964176178, + "learning_rate": 5.191177469764914e-05, + "loss": 3.7866, + "step": 56645 + }, + { + "epoch": 3.8490284005979074, + "grad_norm": 0.13536621630191803, + "learning_rate": 5.190752819676586e-05, + "loss": 3.7662, + "step": 56650 + }, + { + "epoch": 3.849368120668569, + "grad_norm": 0.15077048540115356, + "learning_rate": 5.19032816958826e-05, + "loss": 3.6968, + "step": 56655 + }, + { + "epoch": 3.8497078407392307, + "grad_norm": 0.17201730608940125, + "learning_rate": 5.1899035194999325e-05, + "loss": 3.8657, + "step": 56660 + }, + { + "epoch": 3.8500475608098927, + "grad_norm": 0.18541152775287628, + "learning_rate": 5.189478869411605e-05, + "loss": 3.7399, + "step": 56665 + }, + { + "epoch": 3.8503872808805544, + "grad_norm": 0.16339439153671265, + "learning_rate": 5.189054219323278e-05, + "loss": 3.6112, + "step": 56670 + }, + { + "epoch": 3.850727000951216, + "grad_norm": 0.16024823486804962, + "learning_rate": 5.188629569234951e-05, + "loss": 3.9891, + "step": 56675 + }, + { + "epoch": 3.851066721021878, + "grad_norm": 0.2737288475036621, + "learning_rate": 5.188204919146623e-05, + "loss": 3.7675, + "step": 56680 + }, + { + "epoch": 3.8514064410925397, + "grad_norm": 1.1090972423553467, + "learning_rate": 5.1877802690582965e-05, + "loss": 3.7678, + "step": 56685 + }, + { + "epoch": 3.8517461611632013, + "grad_norm": 0.17001968622207642, + "learning_rate": 5.1873556189699693e-05, + "loss": 4.0222, + "step": 56690 + }, + { + "epoch": 3.8520858812338634, + "grad_norm": 0.20721621811389923, + "learning_rate": 5.1869309688816415e-05, + "loss": 4.0043, + "step": 56695 + }, + { + "epoch": 3.852425601304525, + "grad_norm": 0.17564621567726135, + "learning_rate": 5.186506318793315e-05, + "loss": 3.9108, + "step": 56700 + }, + { + "epoch": 3.8527653213751867, + "grad_norm": 0.17003929615020752, + "learning_rate": 5.186081668704987e-05, + "loss": 3.6458, + "step": 56705 + }, + { + "epoch": 3.8531050414458488, + "grad_norm": 0.17504076659679413, + "learning_rate": 5.18565701861666e-05, + "loss": 3.7632, + "step": 56710 + }, + { + "epoch": 3.8534447615165104, + "grad_norm": 0.1705876886844635, + "learning_rate": 5.1852323685283334e-05, + "loss": 3.7511, + "step": 56715 + }, + { + "epoch": 3.853784481587172, + "grad_norm": 0.13529399037361145, + "learning_rate": 5.1848077184400055e-05, + "loss": 3.7124, + "step": 56720 + }, + { + "epoch": 3.854124201657834, + "grad_norm": 0.24699173867702484, + "learning_rate": 5.184383068351678e-05, + "loss": 3.8867, + "step": 56725 + }, + { + "epoch": 3.8544639217284957, + "grad_norm": 0.18554702401161194, + "learning_rate": 5.183958418263352e-05, + "loss": 3.9157, + "step": 56730 + }, + { + "epoch": 3.8548036417991574, + "grad_norm": 0.2306605875492096, + "learning_rate": 5.183533768175024e-05, + "loss": 3.9154, + "step": 56735 + }, + { + "epoch": 3.8551433618698194, + "grad_norm": 0.2491772472858429, + "learning_rate": 5.183109118086697e-05, + "loss": 3.8694, + "step": 56740 + }, + { + "epoch": 3.855483081940481, + "grad_norm": 0.17819567024707794, + "learning_rate": 5.18268446799837e-05, + "loss": 3.5812, + "step": 56745 + }, + { + "epoch": 3.8558228020111427, + "grad_norm": 0.1518687903881073, + "learning_rate": 5.182259817910042e-05, + "loss": 3.9277, + "step": 56750 + }, + { + "epoch": 3.8561625220818048, + "grad_norm": 0.15245920419692993, + "learning_rate": 5.181835167821715e-05, + "loss": 3.9573, + "step": 56755 + }, + { + "epoch": 3.8565022421524664, + "grad_norm": 0.478177011013031, + "learning_rate": 5.1814105177333886e-05, + "loss": 3.9992, + "step": 56760 + }, + { + "epoch": 3.856841962223128, + "grad_norm": 0.1496283859014511, + "learning_rate": 5.180985867645061e-05, + "loss": 3.7102, + "step": 56765 + }, + { + "epoch": 3.85718168229379, + "grad_norm": 0.20211458206176758, + "learning_rate": 5.180561217556733e-05, + "loss": 4.0532, + "step": 56770 + }, + { + "epoch": 3.8575214023644517, + "grad_norm": 0.15146484971046448, + "learning_rate": 5.180136567468406e-05, + "loss": 3.9089, + "step": 56775 + }, + { + "epoch": 3.8578611224351134, + "grad_norm": 0.1889554262161255, + "learning_rate": 5.179711917380079e-05, + "loss": 3.7809, + "step": 56780 + }, + { + "epoch": 3.8582008425057754, + "grad_norm": 0.19777657091617584, + "learning_rate": 5.179287267291751e-05, + "loss": 3.786, + "step": 56785 + }, + { + "epoch": 3.858540562576437, + "grad_norm": 0.17517367005348206, + "learning_rate": 5.178862617203425e-05, + "loss": 3.6725, + "step": 56790 + }, + { + "epoch": 3.8588802826470987, + "grad_norm": 0.12790268659591675, + "learning_rate": 5.1784379671150975e-05, + "loss": 3.8921, + "step": 56795 + }, + { + "epoch": 3.8592200027177608, + "grad_norm": 0.19326192140579224, + "learning_rate": 5.1780133170267696e-05, + "loss": 3.7529, + "step": 56800 + }, + { + "epoch": 3.8595597227884224, + "grad_norm": 0.3588302731513977, + "learning_rate": 5.177588666938443e-05, + "loss": 3.6797, + "step": 56805 + }, + { + "epoch": 3.859899442859084, + "grad_norm": 0.1432654857635498, + "learning_rate": 5.177164016850116e-05, + "loss": 3.7682, + "step": 56810 + }, + { + "epoch": 3.860239162929746, + "grad_norm": 0.17104682326316833, + "learning_rate": 5.176739366761788e-05, + "loss": 3.7307, + "step": 56815 + }, + { + "epoch": 3.8605788830004077, + "grad_norm": 0.2591935694217682, + "learning_rate": 5.1763147166734615e-05, + "loss": 3.8962, + "step": 56820 + }, + { + "epoch": 3.8609186030710694, + "grad_norm": 0.15304864943027496, + "learning_rate": 5.175890066585134e-05, + "loss": 3.8806, + "step": 56825 + }, + { + "epoch": 3.8612583231417315, + "grad_norm": 0.15372131764888763, + "learning_rate": 5.1754654164968064e-05, + "loss": 3.7136, + "step": 56830 + }, + { + "epoch": 3.861598043212393, + "grad_norm": 0.17020538449287415, + "learning_rate": 5.17504076640848e-05, + "loss": 3.8691, + "step": 56835 + }, + { + "epoch": 3.8619377632830547, + "grad_norm": 0.1554422378540039, + "learning_rate": 5.174616116320152e-05, + "loss": 3.7645, + "step": 56840 + }, + { + "epoch": 3.862277483353717, + "grad_norm": 0.247858926653862, + "learning_rate": 5.174191466231825e-05, + "loss": 3.8665, + "step": 56845 + }, + { + "epoch": 3.8626172034243784, + "grad_norm": 0.18013402819633484, + "learning_rate": 5.173766816143498e-05, + "loss": 3.5484, + "step": 56850 + }, + { + "epoch": 3.86295692349504, + "grad_norm": 0.1731395572423935, + "learning_rate": 5.1733421660551704e-05, + "loss": 3.8279, + "step": 56855 + }, + { + "epoch": 3.863296643565702, + "grad_norm": 0.1384403258562088, + "learning_rate": 5.172917515966843e-05, + "loss": 3.5825, + "step": 56860 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.25678059458732605, + "learning_rate": 5.172492865878517e-05, + "loss": 3.6622, + "step": 56865 + }, + { + "epoch": 3.8639760837070254, + "grad_norm": 0.16558076441287994, + "learning_rate": 5.172068215790189e-05, + "loss": 3.783, + "step": 56870 + }, + { + "epoch": 3.864315803777687, + "grad_norm": 0.15336336195468903, + "learning_rate": 5.1716435657018616e-05, + "loss": 4.0664, + "step": 56875 + }, + { + "epoch": 3.864655523848349, + "grad_norm": 0.17050452530384064, + "learning_rate": 5.171218915613535e-05, + "loss": 4.0764, + "step": 56880 + }, + { + "epoch": 3.8649952439190107, + "grad_norm": 0.21986062824726105, + "learning_rate": 5.170794265525207e-05, + "loss": 3.6915, + "step": 56885 + }, + { + "epoch": 3.8653349639896724, + "grad_norm": 0.16500328481197357, + "learning_rate": 5.170369615436881e-05, + "loss": 3.8632, + "step": 56890 + }, + { + "epoch": 3.8656746840603344, + "grad_norm": 0.18160033226013184, + "learning_rate": 5.1699449653485535e-05, + "loss": 3.7823, + "step": 56895 + }, + { + "epoch": 3.866014404130996, + "grad_norm": 0.26966726779937744, + "learning_rate": 5.1695203152602256e-05, + "loss": 3.765, + "step": 56900 + }, + { + "epoch": 3.8663541242016577, + "grad_norm": 4.106831073760986, + "learning_rate": 5.169095665171899e-05, + "loss": 3.8648, + "step": 56905 + }, + { + "epoch": 3.8666938442723198, + "grad_norm": 0.27492815256118774, + "learning_rate": 5.168671015083571e-05, + "loss": 4.04, + "step": 56910 + }, + { + "epoch": 3.8670335643429814, + "grad_norm": 0.20339074730873108, + "learning_rate": 5.168246364995244e-05, + "loss": 3.9808, + "step": 56915 + }, + { + "epoch": 3.867373284413643, + "grad_norm": 0.15919485688209534, + "learning_rate": 5.1678217149069175e-05, + "loss": 3.8715, + "step": 56920 + }, + { + "epoch": 3.8677130044843047, + "grad_norm": 0.16808795928955078, + "learning_rate": 5.1673970648185896e-05, + "loss": 3.693, + "step": 56925 + }, + { + "epoch": 3.8680527245549667, + "grad_norm": 0.21802183985710144, + "learning_rate": 5.1669724147302624e-05, + "loss": 3.8218, + "step": 56930 + }, + { + "epoch": 3.8683924446256284, + "grad_norm": 0.17039600014686584, + "learning_rate": 5.166547764641936e-05, + "loss": 3.7319, + "step": 56935 + }, + { + "epoch": 3.86873216469629, + "grad_norm": 0.1750439554452896, + "learning_rate": 5.166123114553608e-05, + "loss": 4.0004, + "step": 56940 + }, + { + "epoch": 3.869071884766952, + "grad_norm": 0.18471430242061615, + "learning_rate": 5.165698464465281e-05, + "loss": 4.0659, + "step": 56945 + }, + { + "epoch": 3.8694116048376137, + "grad_norm": 0.21666231751441956, + "learning_rate": 5.165273814376954e-05, + "loss": 3.7288, + "step": 56950 + }, + { + "epoch": 3.8697513249082753, + "grad_norm": 0.15054336190223694, + "learning_rate": 5.1648491642886264e-05, + "loss": 3.8518, + "step": 56955 + }, + { + "epoch": 3.8700910449789374, + "grad_norm": 0.18611297011375427, + "learning_rate": 5.164424514200299e-05, + "loss": 4.1542, + "step": 56960 + }, + { + "epoch": 3.870430765049599, + "grad_norm": 0.15179599821567535, + "learning_rate": 5.163999864111973e-05, + "loss": 4.0107, + "step": 56965 + }, + { + "epoch": 3.8707704851202607, + "grad_norm": 0.18747158348560333, + "learning_rate": 5.163575214023645e-05, + "loss": 3.8854, + "step": 56970 + }, + { + "epoch": 3.8711102051909227, + "grad_norm": 0.18233518302440643, + "learning_rate": 5.163150563935317e-05, + "loss": 3.8223, + "step": 56975 + }, + { + "epoch": 3.8714499252615844, + "grad_norm": 0.14415320754051208, + "learning_rate": 5.162725913846991e-05, + "loss": 3.8636, + "step": 56980 + }, + { + "epoch": 3.871789645332246, + "grad_norm": 0.1719120293855667, + "learning_rate": 5.162301263758663e-05, + "loss": 3.7231, + "step": 56985 + }, + { + "epoch": 3.872129365402908, + "grad_norm": 0.2014671117067337, + "learning_rate": 5.1618766136703354e-05, + "loss": 3.9889, + "step": 56990 + }, + { + "epoch": 3.8724690854735697, + "grad_norm": 0.17274004220962524, + "learning_rate": 5.161451963582009e-05, + "loss": 3.7323, + "step": 56995 + }, + { + "epoch": 3.8728088055442313, + "grad_norm": 0.9479042887687683, + "learning_rate": 5.1610273134936816e-05, + "loss": 3.9551, + "step": 57000 + }, + { + "epoch": 3.8731485256148934, + "grad_norm": 1.3088493347167969, + "learning_rate": 5.160602663405354e-05, + "loss": 3.6624, + "step": 57005 + }, + { + "epoch": 3.873488245685555, + "grad_norm": 0.20490247011184692, + "learning_rate": 5.160178013317027e-05, + "loss": 3.8029, + "step": 57010 + }, + { + "epoch": 3.8738279657562167, + "grad_norm": 0.2375367283821106, + "learning_rate": 5.1597533632287e-05, + "loss": 3.7799, + "step": 57015 + }, + { + "epoch": 3.8741676858268788, + "grad_norm": 0.163449227809906, + "learning_rate": 5.159328713140372e-05, + "loss": 4.2848, + "step": 57020 + }, + { + "epoch": 3.8745074058975404, + "grad_norm": 0.6925029754638672, + "learning_rate": 5.1589040630520456e-05, + "loss": 3.8293, + "step": 57025 + }, + { + "epoch": 3.874847125968202, + "grad_norm": 0.5494389533996582, + "learning_rate": 5.1584794129637184e-05, + "loss": 3.687, + "step": 57030 + }, + { + "epoch": 3.875186846038864, + "grad_norm": 0.17327435314655304, + "learning_rate": 5.1580547628753906e-05, + "loss": 4.0312, + "step": 57035 + }, + { + "epoch": 3.8755265661095257, + "grad_norm": 0.1482311487197876, + "learning_rate": 5.157630112787064e-05, + "loss": 3.6855, + "step": 57040 + }, + { + "epoch": 3.8758662861801874, + "grad_norm": 0.16423040628433228, + "learning_rate": 5.157205462698736e-05, + "loss": 3.8577, + "step": 57045 + }, + { + "epoch": 3.8762060062508494, + "grad_norm": 0.19540822505950928, + "learning_rate": 5.156780812610409e-05, + "loss": 3.6771, + "step": 57050 + }, + { + "epoch": 3.876545726321511, + "grad_norm": 0.19499632716178894, + "learning_rate": 5.1563561625220824e-05, + "loss": 3.6704, + "step": 57055 + }, + { + "epoch": 3.8768854463921727, + "grad_norm": 0.14309532940387726, + "learning_rate": 5.1559315124337546e-05, + "loss": 3.8455, + "step": 57060 + }, + { + "epoch": 3.8772251664628348, + "grad_norm": 0.20768986642360687, + "learning_rate": 5.1555068623454274e-05, + "loss": 3.7433, + "step": 57065 + }, + { + "epoch": 3.8775648865334964, + "grad_norm": 0.1670025736093521, + "learning_rate": 5.155082212257101e-05, + "loss": 3.7149, + "step": 57070 + }, + { + "epoch": 3.877904606604158, + "grad_norm": 0.16255128383636475, + "learning_rate": 5.154657562168773e-05, + "loss": 3.9302, + "step": 57075 + }, + { + "epoch": 3.87824432667482, + "grad_norm": 0.2225804179906845, + "learning_rate": 5.154232912080446e-05, + "loss": 3.7819, + "step": 57080 + }, + { + "epoch": 3.8785840467454817, + "grad_norm": 0.1898791790008545, + "learning_rate": 5.153808261992119e-05, + "loss": 4.1466, + "step": 57085 + }, + { + "epoch": 3.8789237668161434, + "grad_norm": 0.21695642173290253, + "learning_rate": 5.1533836119037914e-05, + "loss": 4.0009, + "step": 57090 + }, + { + "epoch": 3.8792634868868054, + "grad_norm": 0.26677635312080383, + "learning_rate": 5.152958961815464e-05, + "loss": 3.8981, + "step": 57095 + }, + { + "epoch": 3.879603206957467, + "grad_norm": 0.19712157547473907, + "learning_rate": 5.1525343117271377e-05, + "loss": 3.7432, + "step": 57100 + }, + { + "epoch": 3.8799429270281287, + "grad_norm": 0.31186339259147644, + "learning_rate": 5.15210966163881e-05, + "loss": 3.9257, + "step": 57105 + }, + { + "epoch": 3.880282647098791, + "grad_norm": 0.2034907341003418, + "learning_rate": 5.151685011550482e-05, + "loss": 3.8926, + "step": 57110 + }, + { + "epoch": 3.8806223671694524, + "grad_norm": 0.14156045019626617, + "learning_rate": 5.151260361462156e-05, + "loss": 3.8656, + "step": 57115 + }, + { + "epoch": 3.880962087240114, + "grad_norm": 0.19904427230358124, + "learning_rate": 5.150835711373828e-05, + "loss": 3.7565, + "step": 57120 + }, + { + "epoch": 3.881301807310776, + "grad_norm": 0.15051332116127014, + "learning_rate": 5.1504110612855e-05, + "loss": 3.8375, + "step": 57125 + }, + { + "epoch": 3.8816415273814378, + "grad_norm": 0.16448727250099182, + "learning_rate": 5.149986411197174e-05, + "loss": 4.0285, + "step": 57130 + }, + { + "epoch": 3.8819812474520994, + "grad_norm": 0.18020229041576385, + "learning_rate": 5.1495617611088466e-05, + "loss": 3.7466, + "step": 57135 + }, + { + "epoch": 3.8823209675227615, + "grad_norm": 0.16687443852424622, + "learning_rate": 5.149137111020519e-05, + "loss": 3.7936, + "step": 57140 + }, + { + "epoch": 3.882660687593423, + "grad_norm": 0.15788069367408752, + "learning_rate": 5.148712460932192e-05, + "loss": 3.7252, + "step": 57145 + }, + { + "epoch": 3.8830004076640847, + "grad_norm": 0.39360103011131287, + "learning_rate": 5.148287810843865e-05, + "loss": 3.8696, + "step": 57150 + }, + { + "epoch": 3.883340127734747, + "grad_norm": 0.16723382472991943, + "learning_rate": 5.147863160755537e-05, + "loss": 3.6964, + "step": 57155 + }, + { + "epoch": 3.8836798478054084, + "grad_norm": 0.16598710417747498, + "learning_rate": 5.1474385106672106e-05, + "loss": 4.1727, + "step": 57160 + }, + { + "epoch": 3.88401956787607, + "grad_norm": 0.18326513469219208, + "learning_rate": 5.1470138605788834e-05, + "loss": 3.9255, + "step": 57165 + }, + { + "epoch": 3.884359287946732, + "grad_norm": 0.17371882498264313, + "learning_rate": 5.1465892104905555e-05, + "loss": 3.9013, + "step": 57170 + }, + { + "epoch": 3.8846990080173938, + "grad_norm": 0.29082900285720825, + "learning_rate": 5.146164560402229e-05, + "loss": 3.692, + "step": 57175 + }, + { + "epoch": 3.8850387280880554, + "grad_norm": 0.27872511744499207, + "learning_rate": 5.145739910313902e-05, + "loss": 3.5845, + "step": 57180 + }, + { + "epoch": 3.8853784481587175, + "grad_norm": 0.15703286230564117, + "learning_rate": 5.145315260225574e-05, + "loss": 3.9017, + "step": 57185 + }, + { + "epoch": 3.885718168229379, + "grad_norm": 0.16282635927200317, + "learning_rate": 5.1448906101372474e-05, + "loss": 3.7952, + "step": 57190 + }, + { + "epoch": 3.8860578883000407, + "grad_norm": 0.16056351363658905, + "learning_rate": 5.1444659600489195e-05, + "loss": 3.8262, + "step": 57195 + }, + { + "epoch": 3.886397608370703, + "grad_norm": 0.14684738218784332, + "learning_rate": 5.144041309960592e-05, + "loss": 3.5158, + "step": 57200 + }, + { + "epoch": 3.8867373284413644, + "grad_norm": 0.16840459406375885, + "learning_rate": 5.143616659872266e-05, + "loss": 3.8691, + "step": 57205 + }, + { + "epoch": 3.887077048512026, + "grad_norm": 0.16174231469631195, + "learning_rate": 5.143192009783938e-05, + "loss": 3.901, + "step": 57210 + }, + { + "epoch": 3.887416768582688, + "grad_norm": 0.24424801766872406, + "learning_rate": 5.142767359695611e-05, + "loss": 3.9882, + "step": 57215 + }, + { + "epoch": 3.8877564886533498, + "grad_norm": 0.17598097026348114, + "learning_rate": 5.142342709607284e-05, + "loss": 3.9069, + "step": 57220 + }, + { + "epoch": 3.8880962087240114, + "grad_norm": 0.4472482204437256, + "learning_rate": 5.141918059518956e-05, + "loss": 3.7552, + "step": 57225 + }, + { + "epoch": 3.888435928794673, + "grad_norm": 0.187988743185997, + "learning_rate": 5.14149340943063e-05, + "loss": 3.7665, + "step": 57230 + }, + { + "epoch": 3.888775648865335, + "grad_norm": 1.6261543035507202, + "learning_rate": 5.1410687593423026e-05, + "loss": 3.8825, + "step": 57235 + }, + { + "epoch": 3.8891153689359967, + "grad_norm": 0.1270320564508438, + "learning_rate": 5.140644109253975e-05, + "loss": 3.6804, + "step": 57240 + }, + { + "epoch": 3.8894550890066584, + "grad_norm": 0.19813579320907593, + "learning_rate": 5.140219459165648e-05, + "loss": 3.7904, + "step": 57245 + }, + { + "epoch": 3.8897948090773204, + "grad_norm": 0.14800305664539337, + "learning_rate": 5.139794809077321e-05, + "loss": 3.7996, + "step": 57250 + }, + { + "epoch": 3.890134529147982, + "grad_norm": 0.25260233879089355, + "learning_rate": 5.139370158988993e-05, + "loss": 3.8883, + "step": 57255 + }, + { + "epoch": 3.8904742492186437, + "grad_norm": 0.4159911572933197, + "learning_rate": 5.1389455089006666e-05, + "loss": 3.9598, + "step": 57260 + }, + { + "epoch": 3.8908139692893053, + "grad_norm": 0.35298725962638855, + "learning_rate": 5.138520858812339e-05, + "loss": 3.8156, + "step": 57265 + }, + { + "epoch": 3.8911536893599674, + "grad_norm": 0.18672379851341248, + "learning_rate": 5.1380962087240115e-05, + "loss": 3.8076, + "step": 57270 + }, + { + "epoch": 3.891493409430629, + "grad_norm": 0.21621067821979523, + "learning_rate": 5.137671558635685e-05, + "loss": 3.9435, + "step": 57275 + }, + { + "epoch": 3.8918331295012907, + "grad_norm": 0.1539844423532486, + "learning_rate": 5.137246908547357e-05, + "loss": 3.7303, + "step": 57280 + }, + { + "epoch": 3.8921728495719528, + "grad_norm": 0.23631827533245087, + "learning_rate": 5.13682225845903e-05, + "loss": 3.8209, + "step": 57285 + }, + { + "epoch": 3.8925125696426144, + "grad_norm": 0.2376357614994049, + "learning_rate": 5.1363976083707034e-05, + "loss": 3.9303, + "step": 57290 + }, + { + "epoch": 3.892852289713276, + "grad_norm": 0.1685994565486908, + "learning_rate": 5.1359729582823755e-05, + "loss": 3.844, + "step": 57295 + }, + { + "epoch": 3.893192009783938, + "grad_norm": 0.22950570285320282, + "learning_rate": 5.135548308194048e-05, + "loss": 3.9406, + "step": 57300 + }, + { + "epoch": 3.8935317298545997, + "grad_norm": 0.5842491984367371, + "learning_rate": 5.135123658105722e-05, + "loss": 4.0666, + "step": 57305 + }, + { + "epoch": 3.8938714499252614, + "grad_norm": 0.18241816759109497, + "learning_rate": 5.134699008017394e-05, + "loss": 3.9269, + "step": 57310 + }, + { + "epoch": 3.8942111699959234, + "grad_norm": 0.22437357902526855, + "learning_rate": 5.134274357929067e-05, + "loss": 3.885, + "step": 57315 + }, + { + "epoch": 3.894550890066585, + "grad_norm": 0.21862755715847015, + "learning_rate": 5.13384970784074e-05, + "loss": 4.0233, + "step": 57320 + }, + { + "epoch": 3.8948906101372467, + "grad_norm": 0.15074072778224945, + "learning_rate": 5.133425057752412e-05, + "loss": 3.8406, + "step": 57325 + }, + { + "epoch": 3.8952303302079088, + "grad_norm": 0.17388972640037537, + "learning_rate": 5.1330004076640845e-05, + "loss": 3.9293, + "step": 57330 + }, + { + "epoch": 3.8955700502785704, + "grad_norm": 0.1843828409910202, + "learning_rate": 5.132575757575758e-05, + "loss": 3.8095, + "step": 57335 + }, + { + "epoch": 3.895909770349232, + "grad_norm": 0.1671920269727707, + "learning_rate": 5.132151107487431e-05, + "loss": 3.7047, + "step": 57340 + }, + { + "epoch": 3.896249490419894, + "grad_norm": 0.18904148042201996, + "learning_rate": 5.131726457399103e-05, + "loss": 3.8728, + "step": 57345 + }, + { + "epoch": 3.8965892104905557, + "grad_norm": 0.11969590932130814, + "learning_rate": 5.1313018073107763e-05, + "loss": 4.1511, + "step": 57350 + }, + { + "epoch": 3.8969289305612174, + "grad_norm": 0.16141685843467712, + "learning_rate": 5.130877157222449e-05, + "loss": 3.8838, + "step": 57355 + }, + { + "epoch": 3.8972686506318794, + "grad_norm": 0.1704491227865219, + "learning_rate": 5.130452507134121e-05, + "loss": 3.8635, + "step": 57360 + }, + { + "epoch": 3.897608370702541, + "grad_norm": 0.17206022143363953, + "learning_rate": 5.130027857045795e-05, + "loss": 3.899, + "step": 57365 + }, + { + "epoch": 3.8979480907732027, + "grad_norm": 0.1889084279537201, + "learning_rate": 5.1296032069574675e-05, + "loss": 3.8222, + "step": 57370 + }, + { + "epoch": 3.8982878108438648, + "grad_norm": 0.2519523799419403, + "learning_rate": 5.12917855686914e-05, + "loss": 3.9692, + "step": 57375 + }, + { + "epoch": 3.8986275309145264, + "grad_norm": 0.20520026981830597, + "learning_rate": 5.128753906780813e-05, + "loss": 4.008, + "step": 57380 + }, + { + "epoch": 3.898967250985188, + "grad_norm": 0.14104333519935608, + "learning_rate": 5.128329256692486e-05, + "loss": 3.8494, + "step": 57385 + }, + { + "epoch": 3.89930697105585, + "grad_norm": 0.1617654412984848, + "learning_rate": 5.127904606604158e-05, + "loss": 3.9141, + "step": 57390 + }, + { + "epoch": 3.8996466911265117, + "grad_norm": 0.3753851652145386, + "learning_rate": 5.1274799565158315e-05, + "loss": 3.7918, + "step": 57395 + }, + { + "epoch": 3.8999864111971734, + "grad_norm": 0.1937483698129654, + "learning_rate": 5.127055306427504e-05, + "loss": 3.6769, + "step": 57400 + }, + { + "epoch": 3.9003261312678354, + "grad_norm": 0.13448819518089294, + "learning_rate": 5.1266306563391765e-05, + "loss": 3.7639, + "step": 57405 + }, + { + "epoch": 3.900665851338497, + "grad_norm": 0.16268853843212128, + "learning_rate": 5.12620600625085e-05, + "loss": 3.6408, + "step": 57410 + }, + { + "epoch": 3.9010055714091587, + "grad_norm": 0.17617006599903107, + "learning_rate": 5.125781356162522e-05, + "loss": 3.8002, + "step": 57415 + }, + { + "epoch": 3.901345291479821, + "grad_norm": 0.17442694306373596, + "learning_rate": 5.125356706074195e-05, + "loss": 3.7661, + "step": 57420 + }, + { + "epoch": 3.9016850115504824, + "grad_norm": 0.22264830768108368, + "learning_rate": 5.1249320559858684e-05, + "loss": 3.8653, + "step": 57425 + }, + { + "epoch": 3.902024731621144, + "grad_norm": 0.1596376746892929, + "learning_rate": 5.1245074058975405e-05, + "loss": 3.8823, + "step": 57430 + }, + { + "epoch": 3.902364451691806, + "grad_norm": 0.21571606397628784, + "learning_rate": 5.124082755809213e-05, + "loss": 3.7558, + "step": 57435 + }, + { + "epoch": 3.9027041717624678, + "grad_norm": 0.15017355978488922, + "learning_rate": 5.123658105720887e-05, + "loss": 3.8688, + "step": 57440 + }, + { + "epoch": 3.9030438918331294, + "grad_norm": 0.21599052846431732, + "learning_rate": 5.123233455632559e-05, + "loss": 3.5924, + "step": 57445 + }, + { + "epoch": 3.9033836119037915, + "grad_norm": 0.144163578748703, + "learning_rate": 5.122808805544232e-05, + "loss": 3.7228, + "step": 57450 + }, + { + "epoch": 3.903723331974453, + "grad_norm": 0.15805256366729736, + "learning_rate": 5.122384155455905e-05, + "loss": 3.9705, + "step": 57455 + }, + { + "epoch": 3.9040630520451147, + "grad_norm": 0.2042490392923355, + "learning_rate": 5.121959505367577e-05, + "loss": 4.145, + "step": 57460 + }, + { + "epoch": 3.904402772115777, + "grad_norm": 0.180549755692482, + "learning_rate": 5.1215348552792494e-05, + "loss": 3.8804, + "step": 57465 + }, + { + "epoch": 3.9047424921864384, + "grad_norm": 0.2031937837600708, + "learning_rate": 5.121110205190923e-05, + "loss": 3.8447, + "step": 57470 + }, + { + "epoch": 3.9050822122571, + "grad_norm": 0.17012080550193787, + "learning_rate": 5.120685555102596e-05, + "loss": 3.6657, + "step": 57475 + }, + { + "epoch": 3.905421932327762, + "grad_norm": 0.18431362509727478, + "learning_rate": 5.120260905014268e-05, + "loss": 3.9358, + "step": 57480 + }, + { + "epoch": 3.9057616523984238, + "grad_norm": 0.20700666308403015, + "learning_rate": 5.119836254925941e-05, + "loss": 3.7199, + "step": 57485 + }, + { + "epoch": 3.9061013724690854, + "grad_norm": 0.9901573061943054, + "learning_rate": 5.119411604837614e-05, + "loss": 3.6755, + "step": 57490 + }, + { + "epoch": 3.9064410925397475, + "grad_norm": 0.1829257309436798, + "learning_rate": 5.118986954749286e-05, + "loss": 3.8841, + "step": 57495 + }, + { + "epoch": 3.906780812610409, + "grad_norm": 0.1604820340871811, + "learning_rate": 5.11856230466096e-05, + "loss": 3.8752, + "step": 57500 + }, + { + "epoch": 3.9071205326810707, + "grad_norm": 0.1398690640926361, + "learning_rate": 5.1181376545726325e-05, + "loss": 3.9695, + "step": 57505 + }, + { + "epoch": 3.907460252751733, + "grad_norm": 0.18243831396102905, + "learning_rate": 5.1177130044843046e-05, + "loss": 3.7963, + "step": 57510 + }, + { + "epoch": 3.9077999728223944, + "grad_norm": 0.177028089761734, + "learning_rate": 5.117288354395978e-05, + "loss": 3.8607, + "step": 57515 + }, + { + "epoch": 3.908139692893056, + "grad_norm": 0.17708784341812134, + "learning_rate": 5.116863704307651e-05, + "loss": 3.8853, + "step": 57520 + }, + { + "epoch": 3.908479412963718, + "grad_norm": 0.20856188237667084, + "learning_rate": 5.116439054219323e-05, + "loss": 3.5466, + "step": 57525 + }, + { + "epoch": 3.90881913303438, + "grad_norm": 0.1817222535610199, + "learning_rate": 5.1160144041309965e-05, + "loss": 3.8228, + "step": 57530 + }, + { + "epoch": 3.9091588531050414, + "grad_norm": 0.20481278002262115, + "learning_rate": 5.1155897540426686e-05, + "loss": 3.8769, + "step": 57535 + }, + { + "epoch": 3.9094985731757035, + "grad_norm": 0.1714891791343689, + "learning_rate": 5.1151651039543414e-05, + "loss": 3.7815, + "step": 57540 + }, + { + "epoch": 3.909838293246365, + "grad_norm": 0.18889304995536804, + "learning_rate": 5.114740453866015e-05, + "loss": 3.7967, + "step": 57545 + }, + { + "epoch": 3.9101780133170267, + "grad_norm": 0.4772092401981354, + "learning_rate": 5.114315803777687e-05, + "loss": 3.8845, + "step": 57550 + }, + { + "epoch": 3.910517733387689, + "grad_norm": 0.19037283957004547, + "learning_rate": 5.11389115368936e-05, + "loss": 3.7236, + "step": 57555 + }, + { + "epoch": 3.9108574534583505, + "grad_norm": 0.20335127413272858, + "learning_rate": 5.113466503601033e-05, + "loss": 3.9045, + "step": 57560 + }, + { + "epoch": 3.911197173529012, + "grad_norm": 0.14213579893112183, + "learning_rate": 5.1130418535127054e-05, + "loss": 3.8564, + "step": 57565 + }, + { + "epoch": 3.9115368935996737, + "grad_norm": 0.13484323024749756, + "learning_rate": 5.112617203424379e-05, + "loss": 3.7001, + "step": 57570 + }, + { + "epoch": 3.911876613670336, + "grad_norm": 0.18935082852840424, + "learning_rate": 5.112192553336052e-05, + "loss": 3.8656, + "step": 57575 + }, + { + "epoch": 3.9122163337409974, + "grad_norm": 0.24803271889686584, + "learning_rate": 5.111767903247724e-05, + "loss": 3.8182, + "step": 57580 + }, + { + "epoch": 3.912556053811659, + "grad_norm": 0.21885640919208527, + "learning_rate": 5.111343253159397e-05, + "loss": 3.767, + "step": 57585 + }, + { + "epoch": 3.912895773882321, + "grad_norm": 0.5284019708633423, + "learning_rate": 5.11091860307107e-05, + "loss": 3.9917, + "step": 57590 + }, + { + "epoch": 3.9132354939529828, + "grad_norm": 0.17888370156288147, + "learning_rate": 5.110493952982742e-05, + "loss": 3.6802, + "step": 57595 + }, + { + "epoch": 3.9135752140236444, + "grad_norm": 0.38928312063217163, + "learning_rate": 5.110069302894416e-05, + "loss": 3.9137, + "step": 57600 + }, + { + "epoch": 3.913914934094306, + "grad_norm": 0.15627136826515198, + "learning_rate": 5.1096446528060885e-05, + "loss": 3.8157, + "step": 57605 + }, + { + "epoch": 3.914254654164968, + "grad_norm": 0.1794806271791458, + "learning_rate": 5.1092200027177606e-05, + "loss": 3.7977, + "step": 57610 + }, + { + "epoch": 3.9145943742356297, + "grad_norm": 0.26469478011131287, + "learning_rate": 5.108795352629434e-05, + "loss": 3.8109, + "step": 57615 + }, + { + "epoch": 3.9149340943062914, + "grad_norm": 0.15944640338420868, + "learning_rate": 5.108370702541106e-05, + "loss": 3.9687, + "step": 57620 + }, + { + "epoch": 3.9152738143769534, + "grad_norm": 0.15384891629219055, + "learning_rate": 5.107946052452779e-05, + "loss": 3.9701, + "step": 57625 + }, + { + "epoch": 3.915613534447615, + "grad_norm": 0.19482894241809845, + "learning_rate": 5.1075214023644525e-05, + "loss": 3.7359, + "step": 57630 + }, + { + "epoch": 3.9159532545182767, + "grad_norm": 0.2069990336894989, + "learning_rate": 5.1070967522761246e-05, + "loss": 4.0167, + "step": 57635 + }, + { + "epoch": 3.9162929745889388, + "grad_norm": 0.1779794991016388, + "learning_rate": 5.1066721021877974e-05, + "loss": 3.8595, + "step": 57640 + }, + { + "epoch": 3.9166326946596004, + "grad_norm": 0.17222458124160767, + "learning_rate": 5.106247452099471e-05, + "loss": 3.9861, + "step": 57645 + }, + { + "epoch": 3.916972414730262, + "grad_norm": 0.1412455290555954, + "learning_rate": 5.105822802011143e-05, + "loss": 3.7119, + "step": 57650 + }, + { + "epoch": 3.917312134800924, + "grad_norm": 0.1850082278251648, + "learning_rate": 5.105398151922816e-05, + "loss": 3.8409, + "step": 57655 + }, + { + "epoch": 3.9176518548715857, + "grad_norm": 0.17904694378376007, + "learning_rate": 5.104973501834489e-05, + "loss": 3.6324, + "step": 57660 + }, + { + "epoch": 3.9179915749422474, + "grad_norm": 0.13063256442546844, + "learning_rate": 5.1045488517461614e-05, + "loss": 3.9342, + "step": 57665 + }, + { + "epoch": 3.9183312950129094, + "grad_norm": 0.19366100430488586, + "learning_rate": 5.1041242016578336e-05, + "loss": 3.7486, + "step": 57670 + }, + { + "epoch": 3.918671015083571, + "grad_norm": 0.14452488720417023, + "learning_rate": 5.103699551569508e-05, + "loss": 3.9984, + "step": 57675 + }, + { + "epoch": 3.9190107351542327, + "grad_norm": 0.17058835923671722, + "learning_rate": 5.10327490148118e-05, + "loss": 3.7852, + "step": 57680 + }, + { + "epoch": 3.919350455224895, + "grad_norm": 0.15491226315498352, + "learning_rate": 5.102850251392852e-05, + "loss": 3.8773, + "step": 57685 + }, + { + "epoch": 3.9196901752955564, + "grad_norm": 0.12960325181484222, + "learning_rate": 5.1024256013045254e-05, + "loss": 3.7977, + "step": 57690 + }, + { + "epoch": 3.920029895366218, + "grad_norm": 0.14580044150352478, + "learning_rate": 5.102000951216198e-05, + "loss": 4.1061, + "step": 57695 + }, + { + "epoch": 3.92036961543688, + "grad_norm": 0.15027481317520142, + "learning_rate": 5.1015763011278704e-05, + "loss": 4.0708, + "step": 57700 + }, + { + "epoch": 3.9207093355075417, + "grad_norm": 0.12781736254692078, + "learning_rate": 5.101151651039544e-05, + "loss": 4.1354, + "step": 57705 + }, + { + "epoch": 3.9210490555782034, + "grad_norm": 0.23059435188770294, + "learning_rate": 5.1007270009512166e-05, + "loss": 3.9502, + "step": 57710 + }, + { + "epoch": 3.9213887756488655, + "grad_norm": 0.3741428256034851, + "learning_rate": 5.100302350862889e-05, + "loss": 3.6501, + "step": 57715 + }, + { + "epoch": 3.921728495719527, + "grad_norm": 0.2115805745124817, + "learning_rate": 5.099877700774562e-05, + "loss": 3.8016, + "step": 57720 + }, + { + "epoch": 3.9220682157901887, + "grad_norm": 0.1666104942560196, + "learning_rate": 5.099453050686235e-05, + "loss": 3.7323, + "step": 57725 + }, + { + "epoch": 3.922407935860851, + "grad_norm": 0.46550700068473816, + "learning_rate": 5.099028400597907e-05, + "loss": 3.853, + "step": 57730 + }, + { + "epoch": 3.9227476559315124, + "grad_norm": 0.14953647553920746, + "learning_rate": 5.0986037505095806e-05, + "loss": 3.8954, + "step": 57735 + }, + { + "epoch": 3.923087376002174, + "grad_norm": 0.20810037851333618, + "learning_rate": 5.0981791004212534e-05, + "loss": 3.6863, + "step": 57740 + }, + { + "epoch": 3.923427096072836, + "grad_norm": 0.18309549987316132, + "learning_rate": 5.0977544503329256e-05, + "loss": 3.8127, + "step": 57745 + }, + { + "epoch": 3.9237668161434978, + "grad_norm": 0.18375414609909058, + "learning_rate": 5.097329800244599e-05, + "loss": 3.9779, + "step": 57750 + }, + { + "epoch": 3.9241065362141594, + "grad_norm": 0.7060991525650024, + "learning_rate": 5.096905150156271e-05, + "loss": 3.7761, + "step": 57755 + }, + { + "epoch": 3.9244462562848215, + "grad_norm": 0.1357262134552002, + "learning_rate": 5.096480500067944e-05, + "loss": 3.8625, + "step": 57760 + }, + { + "epoch": 3.924785976355483, + "grad_norm": 0.14648056030273438, + "learning_rate": 5.0960558499796174e-05, + "loss": 3.9001, + "step": 57765 + }, + { + "epoch": 3.9251256964261447, + "grad_norm": 0.3801839351654053, + "learning_rate": 5.0956311998912896e-05, + "loss": 3.7085, + "step": 57770 + }, + { + "epoch": 3.925465416496807, + "grad_norm": 0.21589265763759613, + "learning_rate": 5.0952065498029624e-05, + "loss": 4.0547, + "step": 57775 + }, + { + "epoch": 3.9258051365674684, + "grad_norm": 0.5176482796669006, + "learning_rate": 5.094781899714636e-05, + "loss": 3.5956, + "step": 57780 + }, + { + "epoch": 3.92614485663813, + "grad_norm": 0.24929644167423248, + "learning_rate": 5.094357249626308e-05, + "loss": 3.9518, + "step": 57785 + }, + { + "epoch": 3.926484576708792, + "grad_norm": 0.1614309549331665, + "learning_rate": 5.093932599537981e-05, + "loss": 3.8214, + "step": 57790 + }, + { + "epoch": 3.9268242967794538, + "grad_norm": 0.16105933487415314, + "learning_rate": 5.093507949449654e-05, + "loss": 3.5529, + "step": 57795 + }, + { + "epoch": 3.9271640168501154, + "grad_norm": 0.2043214589357376, + "learning_rate": 5.0930832993613264e-05, + "loss": 3.9527, + "step": 57800 + }, + { + "epoch": 3.9275037369207775, + "grad_norm": 0.1787746548652649, + "learning_rate": 5.092658649272999e-05, + "loss": 3.944, + "step": 57805 + }, + { + "epoch": 3.927843456991439, + "grad_norm": 0.5169968605041504, + "learning_rate": 5.0922339991846727e-05, + "loss": 3.7795, + "step": 57810 + }, + { + "epoch": 3.9281831770621007, + "grad_norm": 0.23999468982219696, + "learning_rate": 5.091809349096345e-05, + "loss": 3.7049, + "step": 57815 + }, + { + "epoch": 3.928522897132763, + "grad_norm": 0.18311268091201782, + "learning_rate": 5.091384699008017e-05, + "loss": 3.7288, + "step": 57820 + }, + { + "epoch": 3.9288626172034244, + "grad_norm": 0.22423771023750305, + "learning_rate": 5.0909600489196904e-05, + "loss": 3.9396, + "step": 57825 + }, + { + "epoch": 3.929202337274086, + "grad_norm": 0.1515480875968933, + "learning_rate": 5.090535398831363e-05, + "loss": 3.8027, + "step": 57830 + }, + { + "epoch": 3.929542057344748, + "grad_norm": 0.204025000333786, + "learning_rate": 5.090110748743035e-05, + "loss": 3.6376, + "step": 57835 + }, + { + "epoch": 3.92988177741541, + "grad_norm": 0.16156849265098572, + "learning_rate": 5.089686098654709e-05, + "loss": 3.6644, + "step": 57840 + }, + { + "epoch": 3.9302214974860714, + "grad_norm": 0.17358428239822388, + "learning_rate": 5.0892614485663816e-05, + "loss": 3.8893, + "step": 57845 + }, + { + "epoch": 3.9305612175567335, + "grad_norm": 0.14807042479515076, + "learning_rate": 5.088836798478054e-05, + "loss": 3.7932, + "step": 57850 + }, + { + "epoch": 3.930900937627395, + "grad_norm": 0.15639176964759827, + "learning_rate": 5.088412148389727e-05, + "loss": 3.8896, + "step": 57855 + }, + { + "epoch": 3.9312406576980568, + "grad_norm": 0.17641016840934753, + "learning_rate": 5.0879874983014e-05, + "loss": 3.9364, + "step": 57860 + }, + { + "epoch": 3.931580377768719, + "grad_norm": 0.24837060272693634, + "learning_rate": 5.087562848213072e-05, + "loss": 3.736, + "step": 57865 + }, + { + "epoch": 3.9319200978393805, + "grad_norm": 0.16470028460025787, + "learning_rate": 5.0871381981247456e-05, + "loss": 3.7507, + "step": 57870 + }, + { + "epoch": 3.932259817910042, + "grad_norm": 0.24329549074172974, + "learning_rate": 5.0867135480364184e-05, + "loss": 3.8938, + "step": 57875 + }, + { + "epoch": 3.932599537980704, + "grad_norm": 0.21637293696403503, + "learning_rate": 5.0862888979480905e-05, + "loss": 3.7374, + "step": 57880 + }, + { + "epoch": 3.932939258051366, + "grad_norm": 0.18141064047813416, + "learning_rate": 5.085864247859764e-05, + "loss": 4.0213, + "step": 57885 + }, + { + "epoch": 3.9332789781220274, + "grad_norm": 0.14482468366622925, + "learning_rate": 5.085439597771436e-05, + "loss": 3.992, + "step": 57890 + }, + { + "epoch": 3.9336186981926895, + "grad_norm": 0.14134596288204193, + "learning_rate": 5.085014947683109e-05, + "loss": 3.9751, + "step": 57895 + }, + { + "epoch": 3.933958418263351, + "grad_norm": 0.1301964819431305, + "learning_rate": 5.0845902975947824e-05, + "loss": 3.6501, + "step": 57900 + }, + { + "epoch": 3.9342981383340128, + "grad_norm": 0.19141700863838196, + "learning_rate": 5.0841656475064545e-05, + "loss": 3.9216, + "step": 57905 + }, + { + "epoch": 3.9346378584046744, + "grad_norm": 0.15552227199077606, + "learning_rate": 5.083740997418128e-05, + "loss": 3.9603, + "step": 57910 + }, + { + "epoch": 3.9349775784753365, + "grad_norm": 0.16160663962364197, + "learning_rate": 5.083316347329801e-05, + "loss": 4.0457, + "step": 57915 + }, + { + "epoch": 3.935317298545998, + "grad_norm": 0.2949580252170563, + "learning_rate": 5.082891697241473e-05, + "loss": 3.7834, + "step": 57920 + }, + { + "epoch": 3.9356570186166597, + "grad_norm": 0.2479844093322754, + "learning_rate": 5.0824670471531464e-05, + "loss": 4.0351, + "step": 57925 + }, + { + "epoch": 3.935996738687322, + "grad_norm": 0.20096826553344727, + "learning_rate": 5.082042397064819e-05, + "loss": 3.6699, + "step": 57930 + }, + { + "epoch": 3.9363364587579834, + "grad_norm": 0.18883611261844635, + "learning_rate": 5.081617746976491e-05, + "loss": 3.9306, + "step": 57935 + }, + { + "epoch": 3.936676178828645, + "grad_norm": 0.2491828054189682, + "learning_rate": 5.081193096888165e-05, + "loss": 3.728, + "step": 57940 + }, + { + "epoch": 3.9370158988993067, + "grad_norm": 0.1591535359621048, + "learning_rate": 5.0807684467998376e-05, + "loss": 3.6391, + "step": 57945 + }, + { + "epoch": 3.9373556189699688, + "grad_norm": 0.2418534755706787, + "learning_rate": 5.08034379671151e-05, + "loss": 3.8671, + "step": 57950 + }, + { + "epoch": 3.9376953390406304, + "grad_norm": 0.18228910863399506, + "learning_rate": 5.079919146623183e-05, + "loss": 4.0138, + "step": 57955 + }, + { + "epoch": 3.938035059111292, + "grad_norm": 0.6733600497245789, + "learning_rate": 5.079494496534855e-05, + "loss": 3.7794, + "step": 57960 + }, + { + "epoch": 3.938374779181954, + "grad_norm": 0.3895410895347595, + "learning_rate": 5.079069846446528e-05, + "loss": 3.9538, + "step": 57965 + }, + { + "epoch": 3.9387144992526157, + "grad_norm": 2.397674083709717, + "learning_rate": 5.0786451963582016e-05, + "loss": 3.9813, + "step": 57970 + }, + { + "epoch": 3.9390542193232774, + "grad_norm": 0.1641194075345993, + "learning_rate": 5.078220546269874e-05, + "loss": 3.776, + "step": 57975 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.16521841287612915, + "learning_rate": 5.0777958961815465e-05, + "loss": 3.7073, + "step": 57980 + }, + { + "epoch": 3.939733659464601, + "grad_norm": 0.17825865745544434, + "learning_rate": 5.07737124609322e-05, + "loss": 3.8466, + "step": 57985 + }, + { + "epoch": 3.9400733795352627, + "grad_norm": 0.20976771414279938, + "learning_rate": 5.076946596004892e-05, + "loss": 3.7545, + "step": 57990 + }, + { + "epoch": 3.940413099605925, + "grad_norm": 0.184937983751297, + "learning_rate": 5.076521945916565e-05, + "loss": 3.722, + "step": 57995 + }, + { + "epoch": 3.9407528196765864, + "grad_norm": 0.22033217549324036, + "learning_rate": 5.0760972958282384e-05, + "loss": 3.6623, + "step": 58000 + }, + { + "epoch": 3.941092539747248, + "grad_norm": 0.17887519299983978, + "learning_rate": 5.0756726457399105e-05, + "loss": 3.5536, + "step": 58005 + }, + { + "epoch": 3.94143225981791, + "grad_norm": 0.18218696117401123, + "learning_rate": 5.075247995651583e-05, + "loss": 3.8262, + "step": 58010 + }, + { + "epoch": 3.9417719798885718, + "grad_norm": 0.16191956400871277, + "learning_rate": 5.074823345563257e-05, + "loss": 3.7628, + "step": 58015 + }, + { + "epoch": 3.9421116999592334, + "grad_norm": 0.16512049734592438, + "learning_rate": 5.074398695474929e-05, + "loss": 3.8712, + "step": 58020 + }, + { + "epoch": 3.9424514200298955, + "grad_norm": 0.17542946338653564, + "learning_rate": 5.073974045386601e-05, + "loss": 3.7024, + "step": 58025 + }, + { + "epoch": 3.942791140100557, + "grad_norm": 1.0706050395965576, + "learning_rate": 5.073549395298275e-05, + "loss": 3.8526, + "step": 58030 + }, + { + "epoch": 3.9431308601712187, + "grad_norm": 0.1441994607448578, + "learning_rate": 5.073124745209947e-05, + "loss": 3.7807, + "step": 58035 + }, + { + "epoch": 3.943470580241881, + "grad_norm": 0.16684125363826752, + "learning_rate": 5.0727000951216195e-05, + "loss": 3.8291, + "step": 58040 + }, + { + "epoch": 3.9438103003125424, + "grad_norm": 0.1524353325366974, + "learning_rate": 5.072275445033293e-05, + "loss": 3.9333, + "step": 58045 + }, + { + "epoch": 3.944150020383204, + "grad_norm": 0.22557751834392548, + "learning_rate": 5.071850794944966e-05, + "loss": 3.8859, + "step": 58050 + }, + { + "epoch": 3.944489740453866, + "grad_norm": 0.1527438908815384, + "learning_rate": 5.071426144856638e-05, + "loss": 4.0062, + "step": 58055 + }, + { + "epoch": 3.9448294605245278, + "grad_norm": 0.1854691058397293, + "learning_rate": 5.071001494768311e-05, + "loss": 3.8471, + "step": 58060 + }, + { + "epoch": 3.9451691805951894, + "grad_norm": 0.2058502584695816, + "learning_rate": 5.070576844679984e-05, + "loss": 3.7986, + "step": 58065 + }, + { + "epoch": 3.9455089006658515, + "grad_norm": 0.13655072450637817, + "learning_rate": 5.070152194591656e-05, + "loss": 3.8868, + "step": 58070 + }, + { + "epoch": 3.945848620736513, + "grad_norm": 0.1824241578578949, + "learning_rate": 5.06972754450333e-05, + "loss": 3.9055, + "step": 58075 + }, + { + "epoch": 3.9461883408071747, + "grad_norm": 0.14530250430107117, + "learning_rate": 5.0693028944150025e-05, + "loss": 3.9602, + "step": 58080 + }, + { + "epoch": 3.946528060877837, + "grad_norm": 0.3668321669101715, + "learning_rate": 5.068878244326675e-05, + "loss": 3.8625, + "step": 58085 + }, + { + "epoch": 3.9468677809484984, + "grad_norm": 0.19082443416118622, + "learning_rate": 5.068453594238348e-05, + "loss": 3.8223, + "step": 58090 + }, + { + "epoch": 3.94720750101916, + "grad_norm": 0.15664999186992645, + "learning_rate": 5.068028944150021e-05, + "loss": 3.7819, + "step": 58095 + }, + { + "epoch": 3.947547221089822, + "grad_norm": 0.1604963093996048, + "learning_rate": 5.067604294061693e-05, + "loss": 4.0632, + "step": 58100 + }, + { + "epoch": 3.9478869411604838, + "grad_norm": 0.22966524958610535, + "learning_rate": 5.0671796439733665e-05, + "loss": 3.9736, + "step": 58105 + }, + { + "epoch": 3.9482266612311454, + "grad_norm": 0.19551196694374084, + "learning_rate": 5.066754993885039e-05, + "loss": 3.6297, + "step": 58110 + }, + { + "epoch": 3.9485663813018075, + "grad_norm": 1.3087072372436523, + "learning_rate": 5.0663303437967115e-05, + "loss": 3.8351, + "step": 58115 + }, + { + "epoch": 3.948906101372469, + "grad_norm": 0.3184093236923218, + "learning_rate": 5.065905693708385e-05, + "loss": 3.7368, + "step": 58120 + }, + { + "epoch": 3.9492458214431307, + "grad_norm": 0.928909182548523, + "learning_rate": 5.065481043620057e-05, + "loss": 3.6322, + "step": 58125 + }, + { + "epoch": 3.949585541513793, + "grad_norm": 0.75217205286026, + "learning_rate": 5.06505639353173e-05, + "loss": 4.094, + "step": 58130 + }, + { + "epoch": 3.9499252615844545, + "grad_norm": 0.16419604420661926, + "learning_rate": 5.0646317434434034e-05, + "loss": 3.8522, + "step": 58135 + }, + { + "epoch": 3.950264981655116, + "grad_norm": 0.19089119136333466, + "learning_rate": 5.0642070933550755e-05, + "loss": 3.9115, + "step": 58140 + }, + { + "epoch": 3.950604701725778, + "grad_norm": 0.1611787974834442, + "learning_rate": 5.063782443266748e-05, + "loss": 3.9176, + "step": 58145 + }, + { + "epoch": 3.95094442179644, + "grad_norm": 0.1655425727367401, + "learning_rate": 5.063357793178422e-05, + "loss": 3.8892, + "step": 58150 + }, + { + "epoch": 3.9512841418671014, + "grad_norm": 0.15748895704746246, + "learning_rate": 5.062933143090094e-05, + "loss": 4.0063, + "step": 58155 + }, + { + "epoch": 3.9516238619377635, + "grad_norm": 0.19590981304645538, + "learning_rate": 5.062508493001766e-05, + "loss": 4.0098, + "step": 58160 + }, + { + "epoch": 3.951963582008425, + "grad_norm": 0.20488980412483215, + "learning_rate": 5.06208384291344e-05, + "loss": 4.2015, + "step": 58165 + }, + { + "epoch": 3.9523033020790868, + "grad_norm": 0.18874071538448334, + "learning_rate": 5.061659192825112e-05, + "loss": 3.7641, + "step": 58170 + }, + { + "epoch": 3.952643022149749, + "grad_norm": 0.4109078645706177, + "learning_rate": 5.0612345427367844e-05, + "loss": 3.719, + "step": 58175 + }, + { + "epoch": 3.9529827422204105, + "grad_norm": 0.6373121738433838, + "learning_rate": 5.060809892648458e-05, + "loss": 3.8161, + "step": 58180 + }, + { + "epoch": 3.953322462291072, + "grad_norm": 0.19689665734767914, + "learning_rate": 5.060385242560131e-05, + "loss": 3.8705, + "step": 58185 + }, + { + "epoch": 3.953662182361734, + "grad_norm": 0.12970386445522308, + "learning_rate": 5.059960592471803e-05, + "loss": 3.8011, + "step": 58190 + }, + { + "epoch": 3.954001902432396, + "grad_norm": 0.2488405853509903, + "learning_rate": 5.059535942383476e-05, + "loss": 3.8168, + "step": 58195 + }, + { + "epoch": 3.9543416225030574, + "grad_norm": 0.1712496280670166, + "learning_rate": 5.059111292295149e-05, + "loss": 3.8137, + "step": 58200 + }, + { + "epoch": 3.9546813425737195, + "grad_norm": 0.17488044500350952, + "learning_rate": 5.058686642206821e-05, + "loss": 3.9592, + "step": 58205 + }, + { + "epoch": 3.955021062644381, + "grad_norm": 0.18150053918361664, + "learning_rate": 5.058261992118495e-05, + "loss": 3.5427, + "step": 58210 + }, + { + "epoch": 3.9553607827150428, + "grad_norm": 0.16462060809135437, + "learning_rate": 5.0578373420301675e-05, + "loss": 3.6738, + "step": 58215 + }, + { + "epoch": 3.955700502785705, + "grad_norm": 4.366456985473633, + "learning_rate": 5.0574126919418396e-05, + "loss": 4.001, + "step": 58220 + }, + { + "epoch": 3.9560402228563665, + "grad_norm": 0.1975218802690506, + "learning_rate": 5.056988041853513e-05, + "loss": 3.7164, + "step": 58225 + }, + { + "epoch": 3.956379942927028, + "grad_norm": 0.19399023056030273, + "learning_rate": 5.056563391765186e-05, + "loss": 3.7518, + "step": 58230 + }, + { + "epoch": 3.95671966299769, + "grad_norm": 0.6008965969085693, + "learning_rate": 5.056138741676858e-05, + "loss": 4.0212, + "step": 58235 + }, + { + "epoch": 3.957059383068352, + "grad_norm": 0.16631518304347992, + "learning_rate": 5.0557140915885315e-05, + "loss": 3.8981, + "step": 58240 + }, + { + "epoch": 3.9573991031390134, + "grad_norm": 0.1731974333524704, + "learning_rate": 5.0552894415002036e-05, + "loss": 3.9825, + "step": 58245 + }, + { + "epoch": 3.957738823209675, + "grad_norm": 0.1502828299999237, + "learning_rate": 5.054864791411877e-05, + "loss": 3.8531, + "step": 58250 + }, + { + "epoch": 3.958078543280337, + "grad_norm": 0.17497454583644867, + "learning_rate": 5.05444014132355e-05, + "loss": 3.5233, + "step": 58255 + }, + { + "epoch": 3.958418263350999, + "grad_norm": 0.17214643955230713, + "learning_rate": 5.054015491235222e-05, + "loss": 3.7012, + "step": 58260 + }, + { + "epoch": 3.9587579834216604, + "grad_norm": 0.1628088504076004, + "learning_rate": 5.0535908411468955e-05, + "loss": 3.9131, + "step": 58265 + }, + { + "epoch": 3.9590977034923225, + "grad_norm": 0.17483054101467133, + "learning_rate": 5.053166191058568e-05, + "loss": 3.9005, + "step": 58270 + }, + { + "epoch": 3.959437423562984, + "grad_norm": 0.18805645406246185, + "learning_rate": 5.0527415409702404e-05, + "loss": 3.643, + "step": 58275 + }, + { + "epoch": 3.9597771436336457, + "grad_norm": 0.16807018220424652, + "learning_rate": 5.052316890881914e-05, + "loss": 3.8337, + "step": 58280 + }, + { + "epoch": 3.9601168637043074, + "grad_norm": 0.2239527851343155, + "learning_rate": 5.051892240793587e-05, + "loss": 3.8576, + "step": 58285 + }, + { + "epoch": 3.9604565837749695, + "grad_norm": 0.14480891823768616, + "learning_rate": 5.051467590705259e-05, + "loss": 4.1319, + "step": 58290 + }, + { + "epoch": 3.960796303845631, + "grad_norm": 0.13822710514068604, + "learning_rate": 5.051042940616932e-05, + "loss": 3.7949, + "step": 58295 + }, + { + "epoch": 3.9611360239162927, + "grad_norm": 0.24001508951187134, + "learning_rate": 5.050618290528605e-05, + "loss": 3.7992, + "step": 58300 + }, + { + "epoch": 3.961475743986955, + "grad_norm": 0.2069455087184906, + "learning_rate": 5.050193640440277e-05, + "loss": 3.8832, + "step": 58305 + }, + { + "epoch": 3.9618154640576164, + "grad_norm": 0.15955792367458344, + "learning_rate": 5.049768990351951e-05, + "loss": 3.6966, + "step": 58310 + }, + { + "epoch": 3.962155184128278, + "grad_norm": 0.1789204180240631, + "learning_rate": 5.049344340263623e-05, + "loss": 4.1342, + "step": 58315 + }, + { + "epoch": 3.96249490419894, + "grad_norm": 0.1916450709104538, + "learning_rate": 5.0489196901752956e-05, + "loss": 3.8248, + "step": 58320 + }, + { + "epoch": 3.9628346242696018, + "grad_norm": 0.1537584811449051, + "learning_rate": 5.048495040086969e-05, + "loss": 3.7187, + "step": 58325 + }, + { + "epoch": 3.9631743443402634, + "grad_norm": 1.1312006711959839, + "learning_rate": 5.048070389998641e-05, + "loss": 3.6617, + "step": 58330 + }, + { + "epoch": 3.9635140644109255, + "grad_norm": 2.647061824798584, + "learning_rate": 5.047645739910314e-05, + "loss": 3.8792, + "step": 58335 + }, + { + "epoch": 3.963853784481587, + "grad_norm": 0.1332806795835495, + "learning_rate": 5.0472210898219875e-05, + "loss": 3.8365, + "step": 58340 + }, + { + "epoch": 3.9641935045522487, + "grad_norm": 0.20037269592285156, + "learning_rate": 5.0467964397336596e-05, + "loss": 3.584, + "step": 58345 + }, + { + "epoch": 3.964533224622911, + "grad_norm": 0.22947564721107483, + "learning_rate": 5.0463717896453324e-05, + "loss": 4.1314, + "step": 58350 + }, + { + "epoch": 3.9648729446935724, + "grad_norm": 0.19128064811229706, + "learning_rate": 5.045947139557006e-05, + "loss": 3.4992, + "step": 58355 + }, + { + "epoch": 3.965212664764234, + "grad_norm": 0.165038600564003, + "learning_rate": 5.045522489468678e-05, + "loss": 3.6887, + "step": 58360 + }, + { + "epoch": 3.965552384834896, + "grad_norm": 0.4046389162540436, + "learning_rate": 5.045097839380351e-05, + "loss": 4.0358, + "step": 58365 + }, + { + "epoch": 3.9658921049055578, + "grad_norm": 0.1963997781276703, + "learning_rate": 5.044673189292024e-05, + "loss": 3.8024, + "step": 58370 + }, + { + "epoch": 3.9662318249762194, + "grad_norm": 1.151802659034729, + "learning_rate": 5.0442485392036964e-05, + "loss": 3.7629, + "step": 58375 + }, + { + "epoch": 3.9665715450468815, + "grad_norm": 0.1758662313222885, + "learning_rate": 5.0438238891153686e-05, + "loss": 3.8395, + "step": 58380 + }, + { + "epoch": 3.966911265117543, + "grad_norm": 0.18605123460292816, + "learning_rate": 5.043399239027042e-05, + "loss": 3.9211, + "step": 58385 + }, + { + "epoch": 3.9672509851882047, + "grad_norm": 0.17568737268447876, + "learning_rate": 5.042974588938715e-05, + "loss": 3.952, + "step": 58390 + }, + { + "epoch": 3.967590705258867, + "grad_norm": 0.17109186947345734, + "learning_rate": 5.042549938850387e-05, + "loss": 3.9707, + "step": 58395 + }, + { + "epoch": 3.9679304253295284, + "grad_norm": 0.14805428683757782, + "learning_rate": 5.0421252887620604e-05, + "loss": 3.6518, + "step": 58400 + }, + { + "epoch": 3.96827014540019, + "grad_norm": 0.22641950845718384, + "learning_rate": 5.041700638673733e-05, + "loss": 3.7828, + "step": 58405 + }, + { + "epoch": 3.968609865470852, + "grad_norm": 0.15636736154556274, + "learning_rate": 5.0412759885854054e-05, + "loss": 3.8271, + "step": 58410 + }, + { + "epoch": 3.968949585541514, + "grad_norm": 0.18020835518836975, + "learning_rate": 5.040851338497079e-05, + "loss": 3.6007, + "step": 58415 + }, + { + "epoch": 3.9692893056121754, + "grad_norm": 0.3707665503025055, + "learning_rate": 5.0404266884087516e-05, + "loss": 3.7226, + "step": 58420 + }, + { + "epoch": 3.9696290256828375, + "grad_norm": 0.20445334911346436, + "learning_rate": 5.040086968338089e-05, + "loss": 4.1161, + "step": 58425 + }, + { + "epoch": 3.969968745753499, + "grad_norm": 0.21595533192157745, + "learning_rate": 5.039662318249763e-05, + "loss": 3.9649, + "step": 58430 + }, + { + "epoch": 3.9703084658241607, + "grad_norm": 0.1692022681236267, + "learning_rate": 5.039237668161435e-05, + "loss": 3.6647, + "step": 58435 + }, + { + "epoch": 3.970648185894823, + "grad_norm": 0.16394400596618652, + "learning_rate": 5.0388130180731076e-05, + "loss": 3.8758, + "step": 58440 + }, + { + "epoch": 3.9709879059654845, + "grad_norm": 0.2083907425403595, + "learning_rate": 5.038388367984781e-05, + "loss": 3.7886, + "step": 58445 + }, + { + "epoch": 3.971327626036146, + "grad_norm": 0.19362513720989227, + "learning_rate": 5.037963717896453e-05, + "loss": 3.6316, + "step": 58450 + }, + { + "epoch": 3.971667346106808, + "grad_norm": 0.1844906061887741, + "learning_rate": 5.037539067808127e-05, + "loss": 4.0692, + "step": 58455 + }, + { + "epoch": 3.97200706617747, + "grad_norm": 0.16851547360420227, + "learning_rate": 5.0371144177197995e-05, + "loss": 3.8643, + "step": 58460 + }, + { + "epoch": 3.9723467862481314, + "grad_norm": 0.20549781620502472, + "learning_rate": 5.0366897676314716e-05, + "loss": 3.7875, + "step": 58465 + }, + { + "epoch": 3.9726865063187935, + "grad_norm": 0.20725449919700623, + "learning_rate": 5.036265117543145e-05, + "loss": 3.6846, + "step": 58470 + }, + { + "epoch": 3.973026226389455, + "grad_norm": 0.6084370017051697, + "learning_rate": 5.035840467454818e-05, + "loss": 3.7555, + "step": 58475 + }, + { + "epoch": 3.9733659464601168, + "grad_norm": 0.19661790132522583, + "learning_rate": 5.03541581736649e-05, + "loss": 3.8586, + "step": 58480 + }, + { + "epoch": 3.973705666530779, + "grad_norm": 0.15667492151260376, + "learning_rate": 5.0349911672781635e-05, + "loss": 3.7534, + "step": 58485 + }, + { + "epoch": 3.9740453866014405, + "grad_norm": 0.17166268825531006, + "learning_rate": 5.034566517189836e-05, + "loss": 3.9917, + "step": 58490 + }, + { + "epoch": 3.974385106672102, + "grad_norm": 0.17300529778003693, + "learning_rate": 5.0341418671015084e-05, + "loss": 4.0506, + "step": 58495 + }, + { + "epoch": 3.974724826742764, + "grad_norm": 0.3302803635597229, + "learning_rate": 5.033717217013182e-05, + "loss": 3.7439, + "step": 58500 + }, + { + "epoch": 3.975064546813426, + "grad_norm": 1.2848992347717285, + "learning_rate": 5.033292566924854e-05, + "loss": 3.8831, + "step": 58505 + }, + { + "epoch": 3.9754042668840874, + "grad_norm": 0.15228107571601868, + "learning_rate": 5.032867916836527e-05, + "loss": 3.7998, + "step": 58510 + }, + { + "epoch": 3.9757439869547495, + "grad_norm": 0.17852406203746796, + "learning_rate": 5.0324432667482e-05, + "loss": 4.0324, + "step": 58515 + }, + { + "epoch": 3.976083707025411, + "grad_norm": 0.24271060526371002, + "learning_rate": 5.0320186166598724e-05, + "loss": 3.8863, + "step": 58520 + }, + { + "epoch": 3.9764234270960728, + "grad_norm": 0.20318448543548584, + "learning_rate": 5.031593966571545e-05, + "loss": 3.9232, + "step": 58525 + }, + { + "epoch": 3.976763147166735, + "grad_norm": 0.18552082777023315, + "learning_rate": 5.031169316483219e-05, + "loss": 3.7839, + "step": 58530 + }, + { + "epoch": 3.9771028672373965, + "grad_norm": 0.21139173209667206, + "learning_rate": 5.030744666394891e-05, + "loss": 3.9195, + "step": 58535 + }, + { + "epoch": 3.977442587308058, + "grad_norm": 0.1639527678489685, + "learning_rate": 5.0303200163065636e-05, + "loss": 3.7527, + "step": 58540 + }, + { + "epoch": 3.97778230737872, + "grad_norm": 0.25587624311447144, + "learning_rate": 5.029895366218237e-05, + "loss": 3.9836, + "step": 58545 + }, + { + "epoch": 3.978122027449382, + "grad_norm": 0.1928192675113678, + "learning_rate": 5.029470716129909e-05, + "loss": 4.0151, + "step": 58550 + }, + { + "epoch": 3.9784617475200434, + "grad_norm": 0.19073210656642914, + "learning_rate": 5.0290460660415813e-05, + "loss": 4.0005, + "step": 58555 + }, + { + "epoch": 3.9788014675907055, + "grad_norm": 0.1744067668914795, + "learning_rate": 5.0286214159532555e-05, + "loss": 3.9229, + "step": 58560 + }, + { + "epoch": 3.979141187661367, + "grad_norm": 0.1475725769996643, + "learning_rate": 5.0281967658649276e-05, + "loss": 3.8403, + "step": 58565 + }, + { + "epoch": 3.979480907732029, + "grad_norm": 0.18871602416038513, + "learning_rate": 5.0277721157766e-05, + "loss": 3.9315, + "step": 58570 + }, + { + "epoch": 3.979820627802691, + "grad_norm": 0.6208199262619019, + "learning_rate": 5.027347465688273e-05, + "loss": 3.8072, + "step": 58575 + }, + { + "epoch": 3.9801603478733525, + "grad_norm": 0.16219539940357208, + "learning_rate": 5.026922815599946e-05, + "loss": 3.852, + "step": 58580 + }, + { + "epoch": 3.980500067944014, + "grad_norm": 1.1675351858139038, + "learning_rate": 5.026498165511618e-05, + "loss": 3.8569, + "step": 58585 + }, + { + "epoch": 3.9808397880146758, + "grad_norm": 0.16082051396369934, + "learning_rate": 5.0260735154232916e-05, + "loss": 3.8136, + "step": 58590 + }, + { + "epoch": 3.981179508085338, + "grad_norm": 0.21875548362731934, + "learning_rate": 5.0256488653349644e-05, + "loss": 3.8017, + "step": 58595 + }, + { + "epoch": 3.9815192281559995, + "grad_norm": 0.2560725212097168, + "learning_rate": 5.0252242152466366e-05, + "loss": 4.0521, + "step": 58600 + }, + { + "epoch": 3.981858948226661, + "grad_norm": 0.14233845472335815, + "learning_rate": 5.02479956515831e-05, + "loss": 3.8029, + "step": 58605 + }, + { + "epoch": 3.982198668297323, + "grad_norm": 0.18992263078689575, + "learning_rate": 5.024374915069983e-05, + "loss": 3.94, + "step": 58610 + }, + { + "epoch": 3.982538388367985, + "grad_norm": 0.28213536739349365, + "learning_rate": 5.023950264981655e-05, + "loss": 3.6416, + "step": 58615 + }, + { + "epoch": 3.9828781084386464, + "grad_norm": 0.1601865291595459, + "learning_rate": 5.0235256148933284e-05, + "loss": 4.0403, + "step": 58620 + }, + { + "epoch": 3.983217828509308, + "grad_norm": 0.14604465663433075, + "learning_rate": 5.023100964805001e-05, + "loss": 3.7275, + "step": 58625 + }, + { + "epoch": 3.98355754857997, + "grad_norm": 0.14919552206993103, + "learning_rate": 5.0226763147166734e-05, + "loss": 3.8316, + "step": 58630 + }, + { + "epoch": 3.9838972686506318, + "grad_norm": 0.19058912992477417, + "learning_rate": 5.022251664628347e-05, + "loss": 3.6312, + "step": 58635 + }, + { + "epoch": 3.9842369887212934, + "grad_norm": 0.21338985860347748, + "learning_rate": 5.021827014540019e-05, + "loss": 3.9457, + "step": 58640 + }, + { + "epoch": 3.9845767087919555, + "grad_norm": 0.20273464918136597, + "learning_rate": 5.021402364451692e-05, + "loss": 3.8171, + "step": 58645 + }, + { + "epoch": 3.984916428862617, + "grad_norm": 0.15299154818058014, + "learning_rate": 5.020977714363365e-05, + "loss": 3.9896, + "step": 58650 + }, + { + "epoch": 3.9852561489332787, + "grad_norm": 0.14319995045661926, + "learning_rate": 5.0205530642750374e-05, + "loss": 3.8516, + "step": 58655 + }, + { + "epoch": 3.985595869003941, + "grad_norm": 0.16296766698360443, + "learning_rate": 5.02012841418671e-05, + "loss": 3.7803, + "step": 58660 + }, + { + "epoch": 3.9859355890746024, + "grad_norm": 0.16991741955280304, + "learning_rate": 5.0197037640983836e-05, + "loss": 3.9436, + "step": 58665 + }, + { + "epoch": 3.986275309145264, + "grad_norm": 0.35606053471565247, + "learning_rate": 5.019279114010056e-05, + "loss": 3.7107, + "step": 58670 + }, + { + "epoch": 3.986615029215926, + "grad_norm": 0.1802905946969986, + "learning_rate": 5.0188544639217286e-05, + "loss": 3.8669, + "step": 58675 + }, + { + "epoch": 3.9869547492865878, + "grad_norm": 0.21026048064231873, + "learning_rate": 5.018429813833402e-05, + "loss": 3.7553, + "step": 58680 + }, + { + "epoch": 3.9872944693572494, + "grad_norm": 0.21038822829723358, + "learning_rate": 5.018005163745074e-05, + "loss": 3.5983, + "step": 58685 + }, + { + "epoch": 3.9876341894279115, + "grad_norm": 0.17418810725212097, + "learning_rate": 5.017580513656747e-05, + "loss": 3.955, + "step": 58690 + }, + { + "epoch": 3.987973909498573, + "grad_norm": 0.1159779354929924, + "learning_rate": 5.0171558635684204e-05, + "loss": 3.8809, + "step": 58695 + }, + { + "epoch": 3.9883136295692347, + "grad_norm": 0.14761552214622498, + "learning_rate": 5.0167312134800926e-05, + "loss": 3.6657, + "step": 58700 + }, + { + "epoch": 3.988653349639897, + "grad_norm": 0.13861523568630219, + "learning_rate": 5.016306563391765e-05, + "loss": 3.8866, + "step": 58705 + }, + { + "epoch": 3.9889930697105584, + "grad_norm": 0.1843661367893219, + "learning_rate": 5.015881913303438e-05, + "loss": 3.804, + "step": 58710 + }, + { + "epoch": 3.98933278978122, + "grad_norm": 0.1516420692205429, + "learning_rate": 5.015457263215111e-05, + "loss": 3.925, + "step": 58715 + }, + { + "epoch": 3.989672509851882, + "grad_norm": 0.18322625756263733, + "learning_rate": 5.015032613126783e-05, + "loss": 4.0793, + "step": 58720 + }, + { + "epoch": 3.990012229922544, + "grad_norm": 0.21290969848632812, + "learning_rate": 5.0146079630384566e-05, + "loss": 3.5531, + "step": 58725 + }, + { + "epoch": 3.9903519499932054, + "grad_norm": 0.17416241765022278, + "learning_rate": 5.0141833129501294e-05, + "loss": 3.6092, + "step": 58730 + }, + { + "epoch": 3.9906916700638675, + "grad_norm": 0.20266221463680267, + "learning_rate": 5.0137586628618015e-05, + "loss": 3.6445, + "step": 58735 + }, + { + "epoch": 3.991031390134529, + "grad_norm": 0.17679984867572784, + "learning_rate": 5.013334012773475e-05, + "loss": 3.8726, + "step": 58740 + }, + { + "epoch": 3.9913711102051908, + "grad_norm": 0.1757776439189911, + "learning_rate": 5.012909362685148e-05, + "loss": 3.7783, + "step": 58745 + }, + { + "epoch": 3.991710830275853, + "grad_norm": 0.1433846652507782, + "learning_rate": 5.01248471259682e-05, + "loss": 3.6664, + "step": 58750 + }, + { + "epoch": 3.9920505503465145, + "grad_norm": 0.17945587635040283, + "learning_rate": 5.0120600625084934e-05, + "loss": 3.814, + "step": 58755 + }, + { + "epoch": 3.992390270417176, + "grad_norm": 0.7377082705497742, + "learning_rate": 5.011635412420166e-05, + "loss": 3.713, + "step": 58760 + }, + { + "epoch": 3.992729990487838, + "grad_norm": 0.19541621208190918, + "learning_rate": 5.011210762331838e-05, + "loss": 4.0146, + "step": 58765 + }, + { + "epoch": 3.9930697105585, + "grad_norm": 0.27222272753715515, + "learning_rate": 5.010786112243512e-05, + "loss": 3.7276, + "step": 58770 + }, + { + "epoch": 3.9934094306291614, + "grad_norm": 0.19280044734477997, + "learning_rate": 5.010361462155184e-05, + "loss": 3.5795, + "step": 58775 + }, + { + "epoch": 3.9937491506998235, + "grad_norm": 0.5082488059997559, + "learning_rate": 5.009936812066857e-05, + "loss": 3.6743, + "step": 58780 + }, + { + "epoch": 3.994088870770485, + "grad_norm": 0.310148149728775, + "learning_rate": 5.00951216197853e-05, + "loss": 3.7752, + "step": 58785 + }, + { + "epoch": 3.9944285908411468, + "grad_norm": 0.17796210944652557, + "learning_rate": 5.009087511890202e-05, + "loss": 3.8461, + "step": 58790 + }, + { + "epoch": 3.994768310911809, + "grad_norm": 0.17581085860729218, + "learning_rate": 5.008662861801876e-05, + "loss": 3.875, + "step": 58795 + }, + { + "epoch": 3.9951080309824705, + "grad_norm": 0.1939670741558075, + "learning_rate": 5.0082382117135486e-05, + "loss": 3.8743, + "step": 58800 + }, + { + "epoch": 3.995447751053132, + "grad_norm": 0.21142704784870148, + "learning_rate": 5.007813561625221e-05, + "loss": 4.1714, + "step": 58805 + }, + { + "epoch": 3.995787471123794, + "grad_norm": 0.1747029572725296, + "learning_rate": 5.007388911536894e-05, + "loss": 3.796, + "step": 58810 + }, + { + "epoch": 3.996127191194456, + "grad_norm": 0.2000393569469452, + "learning_rate": 5.006964261448567e-05, + "loss": 3.6185, + "step": 58815 + }, + { + "epoch": 3.9964669112651174, + "grad_norm": 0.1846715807914734, + "learning_rate": 5.006539611360239e-05, + "loss": 4.0956, + "step": 58820 + }, + { + "epoch": 3.9968066313357795, + "grad_norm": 0.15960972011089325, + "learning_rate": 5.0061149612719126e-05, + "loss": 3.8955, + "step": 58825 + }, + { + "epoch": 3.997146351406441, + "grad_norm": 0.21877214312553406, + "learning_rate": 5.0056903111835854e-05, + "loss": 3.8488, + "step": 58830 + }, + { + "epoch": 3.9974860714771028, + "grad_norm": 0.5683044791221619, + "learning_rate": 5.0052656610952575e-05, + "loss": 3.5833, + "step": 58835 + }, + { + "epoch": 3.997825791547765, + "grad_norm": 0.22725051641464233, + "learning_rate": 5.004841011006931e-05, + "loss": 3.9212, + "step": 58840 + }, + { + "epoch": 3.9981655116184265, + "grad_norm": 0.13704515993595123, + "learning_rate": 5.004416360918603e-05, + "loss": 3.8947, + "step": 58845 + }, + { + "epoch": 3.998505231689088, + "grad_norm": 0.19673940539360046, + "learning_rate": 5.003991710830276e-05, + "loss": 3.7694, + "step": 58850 + }, + { + "epoch": 3.99884495175975, + "grad_norm": 0.1492345631122589, + "learning_rate": 5.0035670607419494e-05, + "loss": 3.7982, + "step": 58855 + }, + { + "epoch": 3.999184671830412, + "grad_norm": 0.17284490168094635, + "learning_rate": 5.0031424106536215e-05, + "loss": 4.0409, + "step": 58860 + }, + { + "epoch": 3.9995243919010735, + "grad_norm": 0.18352960050106049, + "learning_rate": 5.002717760565294e-05, + "loss": 3.9834, + "step": 58865 + }, + { + "epoch": 3.9998641119717355, + "grad_norm": 0.27080050110816956, + "learning_rate": 5.002293110476968e-05, + "loss": 3.8097, + "step": 58870 + }, + { + "epoch": 4.0, + "eval_bertscore": { + "f1": 0.8525581484282706, + "precision": 0.8766079274463682, + "recall": 0.8301242098568923 + }, + "eval_bleu_4": 0.00181414384457063, + "eval_exact_match": 0.0, + "eval_loss": 3.6596784591674805, + "eval_meteor": 0.07472455239973799, + "eval_rouge": { + "rouge1": 0.12265682810266193, + "rouge2": 0.01509586917024809, + "rougeL": 0.10848370902925666, + "rougeLsum": 0.10849115945071275 + }, + "eval_runtime": 364.5974, + "eval_samples_per_second": 28.302, + "eval_steps_per_second": 3.538, + "step": 58872 + }, + { + "epoch": 4.000203832042397, + "grad_norm": 0.16956445574760437, + "learning_rate": 5.00186846038864e-05, + "loss": 3.9599, + "step": 58875 + }, + { + "epoch": 4.000543552113059, + "grad_norm": 0.5823156833648682, + "learning_rate": 5.001443810300313e-05, + "loss": 3.8077, + "step": 58880 + }, + { + "epoch": 4.000883272183721, + "grad_norm": 0.19785510003566742, + "learning_rate": 5.001019160211986e-05, + "loss": 3.7422, + "step": 58885 + }, + { + "epoch": 4.001222992254382, + "grad_norm": 0.21469318866729736, + "learning_rate": 5.000594510123658e-05, + "loss": 3.6264, + "step": 58890 + }, + { + "epoch": 4.001562712325044, + "grad_norm": 0.19370906054973602, + "learning_rate": 5.000169860035331e-05, + "loss": 3.7582, + "step": 58895 + }, + { + "epoch": 4.001902432395706, + "grad_norm": 0.1800762265920639, + "learning_rate": 4.999745209947004e-05, + "loss": 3.9108, + "step": 58900 + }, + { + "epoch": 4.002242152466367, + "grad_norm": 0.15316610038280487, + "learning_rate": 4.999320559858677e-05, + "loss": 3.5996, + "step": 58905 + }, + { + "epoch": 4.0025818725370295, + "grad_norm": 0.1281016618013382, + "learning_rate": 4.9988959097703495e-05, + "loss": 3.6931, + "step": 58910 + }, + { + "epoch": 4.0029215926076915, + "grad_norm": 0.5330086946487427, + "learning_rate": 4.998471259682022e-05, + "loss": 3.8492, + "step": 58915 + }, + { + "epoch": 4.003261312678353, + "grad_norm": 0.17565494775772095, + "learning_rate": 4.998046609593695e-05, + "loss": 3.9615, + "step": 58920 + }, + { + "epoch": 4.003601032749015, + "grad_norm": 0.34312376379966736, + "learning_rate": 4.997621959505368e-05, + "loss": 3.7863, + "step": 58925 + }, + { + "epoch": 4.003940752819677, + "grad_norm": 0.1818292886018753, + "learning_rate": 4.997197309417041e-05, + "loss": 3.9772, + "step": 58930 + }, + { + "epoch": 4.004280472890338, + "grad_norm": 0.20075489580631256, + "learning_rate": 4.9967726593287135e-05, + "loss": 3.7491, + "step": 58935 + }, + { + "epoch": 4.004620192961, + "grad_norm": 0.15806801617145538, + "learning_rate": 4.996348009240386e-05, + "loss": 3.8947, + "step": 58940 + }, + { + "epoch": 4.004959913031662, + "grad_norm": 0.7243024706840515, + "learning_rate": 4.9959233591520584e-05, + "loss": 3.844, + "step": 58945 + }, + { + "epoch": 4.005299633102323, + "grad_norm": 0.14481616020202637, + "learning_rate": 4.995498709063732e-05, + "loss": 3.8068, + "step": 58950 + }, + { + "epoch": 4.0056393531729855, + "grad_norm": 0.361086905002594, + "learning_rate": 4.995074058975405e-05, + "loss": 3.8142, + "step": 58955 + }, + { + "epoch": 4.0059790732436475, + "grad_norm": 0.5446694493293762, + "learning_rate": 4.994649408887077e-05, + "loss": 4.0164, + "step": 58960 + }, + { + "epoch": 4.006318793314309, + "grad_norm": 0.1736304610967636, + "learning_rate": 4.99422475879875e-05, + "loss": 3.8226, + "step": 58965 + }, + { + "epoch": 4.006658513384971, + "grad_norm": 0.16007249057292938, + "learning_rate": 4.993800108710423e-05, + "loss": 3.8051, + "step": 58970 + }, + { + "epoch": 4.006998233455633, + "grad_norm": 0.17339901626110077, + "learning_rate": 4.993375458622095e-05, + "loss": 4.008, + "step": 58975 + }, + { + "epoch": 4.007337953526294, + "grad_norm": 0.15776443481445312, + "learning_rate": 4.992950808533769e-05, + "loss": 3.8476, + "step": 58980 + }, + { + "epoch": 4.007677673596956, + "grad_norm": 0.7581840753555298, + "learning_rate": 4.9925261584454415e-05, + "loss": 4.0886, + "step": 58985 + }, + { + "epoch": 4.008017393667618, + "grad_norm": 0.15481506288051605, + "learning_rate": 4.9921015083571137e-05, + "loss": 3.8278, + "step": 58990 + }, + { + "epoch": 4.008357113738279, + "grad_norm": 1.1289478540420532, + "learning_rate": 4.9916768582687865e-05, + "loss": 3.9031, + "step": 58995 + }, + { + "epoch": 4.0086968338089415, + "grad_norm": 0.16259406507015228, + "learning_rate": 4.99125220818046e-05, + "loss": 3.9815, + "step": 59000 + }, + { + "epoch": 4.009036553879604, + "grad_norm": 0.19028763473033905, + "learning_rate": 4.990827558092132e-05, + "loss": 3.7146, + "step": 59005 + }, + { + "epoch": 4.009376273950265, + "grad_norm": 0.2163374423980713, + "learning_rate": 4.990402908003805e-05, + "loss": 3.8986, + "step": 59010 + }, + { + "epoch": 4.009715994020927, + "grad_norm": 0.22266799211502075, + "learning_rate": 4.989978257915478e-05, + "loss": 3.8054, + "step": 59015 + }, + { + "epoch": 4.010055714091589, + "grad_norm": 0.14388792216777802, + "learning_rate": 4.9895536078271505e-05, + "loss": 4.1818, + "step": 59020 + }, + { + "epoch": 4.01039543416225, + "grad_norm": 0.17766091227531433, + "learning_rate": 4.989128957738823e-05, + "loss": 3.9928, + "step": 59025 + }, + { + "epoch": 4.010735154232912, + "grad_norm": 0.15500864386558533, + "learning_rate": 4.988704307650496e-05, + "loss": 3.6784, + "step": 59030 + }, + { + "epoch": 4.011074874303574, + "grad_norm": 0.1616431474685669, + "learning_rate": 4.988279657562169e-05, + "loss": 3.7359, + "step": 59035 + }, + { + "epoch": 4.011414594374235, + "grad_norm": 0.18289917707443237, + "learning_rate": 4.987855007473842e-05, + "loss": 3.7096, + "step": 59040 + }, + { + "epoch": 4.0117543144448975, + "grad_norm": 0.1666620969772339, + "learning_rate": 4.9874303573855145e-05, + "loss": 3.924, + "step": 59045 + }, + { + "epoch": 4.01209403451556, + "grad_norm": 0.15615001320838928, + "learning_rate": 4.987005707297187e-05, + "loss": 3.8168, + "step": 59050 + }, + { + "epoch": 4.012433754586221, + "grad_norm": 0.18870574235916138, + "learning_rate": 4.98658105720886e-05, + "loss": 3.7716, + "step": 59055 + }, + { + "epoch": 4.012773474656883, + "grad_norm": 0.1600351184606552, + "learning_rate": 4.986156407120533e-05, + "loss": 3.8673, + "step": 59060 + }, + { + "epoch": 4.013113194727545, + "grad_norm": 0.16558194160461426, + "learning_rate": 4.985731757032206e-05, + "loss": 4.0019, + "step": 59065 + }, + { + "epoch": 4.013452914798206, + "grad_norm": 0.16873030364513397, + "learning_rate": 4.9853071069438785e-05, + "loss": 3.7445, + "step": 59070 + }, + { + "epoch": 4.013792634868868, + "grad_norm": 0.17627686262130737, + "learning_rate": 4.984882456855551e-05, + "loss": 3.6465, + "step": 59075 + }, + { + "epoch": 4.01413235493953, + "grad_norm": 0.18710507452487946, + "learning_rate": 4.984457806767224e-05, + "loss": 3.864, + "step": 59080 + }, + { + "epoch": 4.014472075010191, + "grad_norm": 0.2114461064338684, + "learning_rate": 4.984033156678897e-05, + "loss": 3.9279, + "step": 59085 + }, + { + "epoch": 4.0148117950808535, + "grad_norm": 0.17347390949726105, + "learning_rate": 4.98360850659057e-05, + "loss": 3.8828, + "step": 59090 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.1845157891511917, + "learning_rate": 4.9831838565022425e-05, + "loss": 3.869, + "step": 59095 + }, + { + "epoch": 4.015491235222177, + "grad_norm": 0.1586078703403473, + "learning_rate": 4.982759206413915e-05, + "loss": 4.0204, + "step": 59100 + }, + { + "epoch": 4.015830955292839, + "grad_norm": 0.18145281076431274, + "learning_rate": 4.982334556325588e-05, + "loss": 3.7016, + "step": 59105 + }, + { + "epoch": 4.016170675363501, + "grad_norm": 0.1808101236820221, + "learning_rate": 4.981909906237261e-05, + "loss": 3.98, + "step": 59110 + }, + { + "epoch": 4.016510395434162, + "grad_norm": 1.875012993812561, + "learning_rate": 4.981485256148934e-05, + "loss": 3.6984, + "step": 59115 + }, + { + "epoch": 4.016850115504824, + "grad_norm": 0.20098362863063812, + "learning_rate": 4.9810606060606065e-05, + "loss": 3.8073, + "step": 59120 + }, + { + "epoch": 4.017189835575485, + "grad_norm": 0.18970395624637604, + "learning_rate": 4.980635955972279e-05, + "loss": 3.8676, + "step": 59125 + }, + { + "epoch": 4.017529555646147, + "grad_norm": 0.15255382657051086, + "learning_rate": 4.9802113058839514e-05, + "loss": 3.9934, + "step": 59130 + }, + { + "epoch": 4.0178692757168095, + "grad_norm": 0.19978079199790955, + "learning_rate": 4.979786655795625e-05, + "loss": 3.9488, + "step": 59135 + }, + { + "epoch": 4.018208995787471, + "grad_norm": 0.22141171991825104, + "learning_rate": 4.979362005707298e-05, + "loss": 3.9699, + "step": 59140 + }, + { + "epoch": 4.018548715858133, + "grad_norm": 0.16204099357128143, + "learning_rate": 4.97893735561897e-05, + "loss": 3.8197, + "step": 59145 + }, + { + "epoch": 4.018888435928795, + "grad_norm": 0.1800290048122406, + "learning_rate": 4.978512705530643e-05, + "loss": 3.7941, + "step": 59150 + }, + { + "epoch": 4.019228155999456, + "grad_norm": 0.11704519391059875, + "learning_rate": 4.978088055442316e-05, + "loss": 4.0613, + "step": 59155 + }, + { + "epoch": 4.019567876070118, + "grad_norm": 0.16586509346961975, + "learning_rate": 4.977663405353988e-05, + "loss": 3.6458, + "step": 59160 + }, + { + "epoch": 4.01990759614078, + "grad_norm": 0.15744830667972565, + "learning_rate": 4.977238755265661e-05, + "loss": 3.8907, + "step": 59165 + }, + { + "epoch": 4.020247316211441, + "grad_norm": 0.14398132264614105, + "learning_rate": 4.9768141051773345e-05, + "loss": 3.8194, + "step": 59170 + }, + { + "epoch": 4.0205870362821035, + "grad_norm": 0.15997682511806488, + "learning_rate": 4.9763894550890066e-05, + "loss": 3.9589, + "step": 59175 + }, + { + "epoch": 4.0209267563527655, + "grad_norm": 0.18344305455684662, + "learning_rate": 4.9759648050006794e-05, + "loss": 3.5722, + "step": 59180 + }, + { + "epoch": 4.021266476423427, + "grad_norm": 0.20392026007175446, + "learning_rate": 4.975540154912353e-05, + "loss": 4.0628, + "step": 59185 + }, + { + "epoch": 4.021606196494089, + "grad_norm": 2.4105477333068848, + "learning_rate": 4.975115504824025e-05, + "loss": 4.171, + "step": 59190 + }, + { + "epoch": 4.021945916564751, + "grad_norm": 0.1458497792482376, + "learning_rate": 4.974690854735698e-05, + "loss": 3.9394, + "step": 59195 + }, + { + "epoch": 4.022285636635412, + "grad_norm": 1.075713872909546, + "learning_rate": 4.9742662046473706e-05, + "loss": 3.8875, + "step": 59200 + }, + { + "epoch": 4.022625356706074, + "grad_norm": 0.19188059866428375, + "learning_rate": 4.9738415545590434e-05, + "loss": 3.9047, + "step": 59205 + }, + { + "epoch": 4.022965076776736, + "grad_norm": 0.14460740983486176, + "learning_rate": 4.973416904470716e-05, + "loss": 3.9705, + "step": 59210 + }, + { + "epoch": 4.023304796847397, + "grad_norm": 0.2275361865758896, + "learning_rate": 4.972992254382389e-05, + "loss": 4.008, + "step": 59215 + }, + { + "epoch": 4.0236445169180595, + "grad_norm": 0.15463371574878693, + "learning_rate": 4.972567604294062e-05, + "loss": 3.8584, + "step": 59220 + }, + { + "epoch": 4.0239842369887215, + "grad_norm": 0.34425005316734314, + "learning_rate": 4.9721429542057346e-05, + "loss": 3.9025, + "step": 59225 + }, + { + "epoch": 4.024323957059383, + "grad_norm": 0.12763836979866028, + "learning_rate": 4.9717183041174074e-05, + "loss": 4.0291, + "step": 59230 + }, + { + "epoch": 4.024663677130045, + "grad_norm": 0.14085246622562408, + "learning_rate": 4.97129365402908e-05, + "loss": 3.9382, + "step": 59235 + }, + { + "epoch": 4.025003397200707, + "grad_norm": 0.14459680020809174, + "learning_rate": 4.970869003940753e-05, + "loss": 3.843, + "step": 59240 + }, + { + "epoch": 4.025343117271368, + "grad_norm": 0.1785246729850769, + "learning_rate": 4.970444353852426e-05, + "loss": 4.0036, + "step": 59245 + }, + { + "epoch": 4.02568283734203, + "grad_norm": 1.2985061407089233, + "learning_rate": 4.9700197037640986e-05, + "loss": 3.9581, + "step": 59250 + }, + { + "epoch": 4.026022557412692, + "grad_norm": 0.19062712788581848, + "learning_rate": 4.9695950536757714e-05, + "loss": 3.841, + "step": 59255 + }, + { + "epoch": 4.026362277483353, + "grad_norm": 0.154824361205101, + "learning_rate": 4.969170403587444e-05, + "loss": 3.7746, + "step": 59260 + }, + { + "epoch": 4.0267019975540155, + "grad_norm": 0.16371692717075348, + "learning_rate": 4.968745753499117e-05, + "loss": 4.0989, + "step": 59265 + }, + { + "epoch": 4.0270417176246776, + "grad_norm": 0.17377406358718872, + "learning_rate": 4.96832110341079e-05, + "loss": 3.96, + "step": 59270 + }, + { + "epoch": 4.027381437695339, + "grad_norm": 0.19272848963737488, + "learning_rate": 4.9678964533224626e-05, + "loss": 3.9942, + "step": 59275 + }, + { + "epoch": 4.027721157766001, + "grad_norm": 0.1610245555639267, + "learning_rate": 4.9674718032341354e-05, + "loss": 3.645, + "step": 59280 + }, + { + "epoch": 4.028060877836663, + "grad_norm": 0.22347910702228546, + "learning_rate": 4.967047153145808e-05, + "loss": 3.9666, + "step": 59285 + }, + { + "epoch": 4.028400597907324, + "grad_norm": 0.18998447060585022, + "learning_rate": 4.966622503057481e-05, + "loss": 3.7006, + "step": 59290 + }, + { + "epoch": 4.028740317977986, + "grad_norm": 0.695253312587738, + "learning_rate": 4.966197852969154e-05, + "loss": 3.9233, + "step": 59295 + }, + { + "epoch": 4.029080038048648, + "grad_norm": 0.11901706457138062, + "learning_rate": 4.965773202880826e-05, + "loss": 3.7889, + "step": 59300 + }, + { + "epoch": 4.029419758119309, + "grad_norm": 0.14980019629001617, + "learning_rate": 4.9653485527924994e-05, + "loss": 3.8429, + "step": 59305 + }, + { + "epoch": 4.0297594781899715, + "grad_norm": 0.1807236224412918, + "learning_rate": 4.964923902704172e-05, + "loss": 3.7165, + "step": 59310 + }, + { + "epoch": 4.030099198260634, + "grad_norm": 0.17069785296916962, + "learning_rate": 4.9644992526158444e-05, + "loss": 3.7196, + "step": 59315 + }, + { + "epoch": 4.030438918331295, + "grad_norm": 0.8789368867874146, + "learning_rate": 4.964074602527518e-05, + "loss": 3.8654, + "step": 59320 + }, + { + "epoch": 4.030778638401957, + "grad_norm": 0.16854645311832428, + "learning_rate": 4.9636499524391906e-05, + "loss": 3.8411, + "step": 59325 + }, + { + "epoch": 4.031118358472619, + "grad_norm": 0.13457143306732178, + "learning_rate": 4.963225302350863e-05, + "loss": 4.0123, + "step": 59330 + }, + { + "epoch": 4.03145807854328, + "grad_norm": 0.893911600112915, + "learning_rate": 4.9628006522625356e-05, + "loss": 3.9127, + "step": 59335 + }, + { + "epoch": 4.031797798613942, + "grad_norm": 0.1736336499452591, + "learning_rate": 4.962376002174209e-05, + "loss": 3.7831, + "step": 59340 + }, + { + "epoch": 4.032137518684604, + "grad_norm": 0.2088216096162796, + "learning_rate": 4.961951352085881e-05, + "loss": 3.7745, + "step": 59345 + }, + { + "epoch": 4.032477238755265, + "grad_norm": 0.17551039159297943, + "learning_rate": 4.961526701997554e-05, + "loss": 3.8483, + "step": 59350 + }, + { + "epoch": 4.0328169588259275, + "grad_norm": 0.1834854781627655, + "learning_rate": 4.9611020519092274e-05, + "loss": 4.0023, + "step": 59355 + }, + { + "epoch": 4.03315667889659, + "grad_norm": 0.21100212633609772, + "learning_rate": 4.9606774018208996e-05, + "loss": 3.7801, + "step": 59360 + }, + { + "epoch": 4.033496398967251, + "grad_norm": 0.15856339037418365, + "learning_rate": 4.9602527517325724e-05, + "loss": 3.751, + "step": 59365 + }, + { + "epoch": 4.033836119037913, + "grad_norm": 0.30154186487197876, + "learning_rate": 4.959828101644245e-05, + "loss": 3.7628, + "step": 59370 + }, + { + "epoch": 4.034175839108575, + "grad_norm": 0.18000911176204681, + "learning_rate": 4.959403451555918e-05, + "loss": 3.7146, + "step": 59375 + }, + { + "epoch": 4.034515559179236, + "grad_norm": 0.15943491458892822, + "learning_rate": 4.958978801467591e-05, + "loss": 4.0178, + "step": 59380 + }, + { + "epoch": 4.034855279249898, + "grad_norm": 2.6631176471710205, + "learning_rate": 4.9585541513792636e-05, + "loss": 3.8879, + "step": 59385 + }, + { + "epoch": 4.03519499932056, + "grad_norm": 0.19201050698757172, + "learning_rate": 4.9581295012909364e-05, + "loss": 3.8597, + "step": 59390 + }, + { + "epoch": 4.035534719391221, + "grad_norm": 0.21255561709403992, + "learning_rate": 4.957704851202609e-05, + "loss": 3.8926, + "step": 59395 + }, + { + "epoch": 4.0358744394618835, + "grad_norm": 0.19218756258487701, + "learning_rate": 4.957280201114282e-05, + "loss": 3.7118, + "step": 59400 + }, + { + "epoch": 4.036214159532546, + "grad_norm": 0.23555731773376465, + "learning_rate": 4.9568555510259554e-05, + "loss": 3.8279, + "step": 59405 + }, + { + "epoch": 4.036553879603207, + "grad_norm": 0.17254109680652618, + "learning_rate": 4.9564309009376276e-05, + "loss": 3.7372, + "step": 59410 + }, + { + "epoch": 4.036893599673869, + "grad_norm": 0.1663997918367386, + "learning_rate": 4.9560062508493004e-05, + "loss": 3.6655, + "step": 59415 + }, + { + "epoch": 4.037233319744531, + "grad_norm": 0.16300125420093536, + "learning_rate": 4.955581600760973e-05, + "loss": 3.6328, + "step": 59420 + }, + { + "epoch": 4.037573039815192, + "grad_norm": 0.17639829218387604, + "learning_rate": 4.955156950672646e-05, + "loss": 3.8984, + "step": 59425 + }, + { + "epoch": 4.037912759885854, + "grad_norm": 0.24691808223724365, + "learning_rate": 4.954732300584319e-05, + "loss": 3.8151, + "step": 59430 + }, + { + "epoch": 4.038252479956516, + "grad_norm": 0.16134050488471985, + "learning_rate": 4.9543076504959916e-05, + "loss": 3.7086, + "step": 59435 + }, + { + "epoch": 4.0385922000271774, + "grad_norm": 0.16809335350990295, + "learning_rate": 4.9538830004076644e-05, + "loss": 3.8624, + "step": 59440 + }, + { + "epoch": 4.0389319200978395, + "grad_norm": 0.2673164904117584, + "learning_rate": 4.953458350319337e-05, + "loss": 3.7782, + "step": 59445 + }, + { + "epoch": 4.039271640168501, + "grad_norm": 0.21511362493038177, + "learning_rate": 4.95303370023101e-05, + "loss": 3.7212, + "step": 59450 + }, + { + "epoch": 4.039611360239163, + "grad_norm": 0.19382034242153168, + "learning_rate": 4.952609050142683e-05, + "loss": 4.2082, + "step": 59455 + }, + { + "epoch": 4.039951080309825, + "grad_norm": 0.16970065236091614, + "learning_rate": 4.9521844000543556e-05, + "loss": 3.9201, + "step": 59460 + }, + { + "epoch": 4.040290800380486, + "grad_norm": 0.20606915652751923, + "learning_rate": 4.9517597499660284e-05, + "loss": 3.9789, + "step": 59465 + }, + { + "epoch": 4.040630520451148, + "grad_norm": 0.13715654611587524, + "learning_rate": 4.9513350998777005e-05, + "loss": 3.9723, + "step": 59470 + }, + { + "epoch": 4.04097024052181, + "grad_norm": 0.16207550466060638, + "learning_rate": 4.950910449789374e-05, + "loss": 3.942, + "step": 59475 + }, + { + "epoch": 4.041309960592471, + "grad_norm": 0.1698717623949051, + "learning_rate": 4.950485799701047e-05, + "loss": 3.8092, + "step": 59480 + }, + { + "epoch": 4.0416496806631335, + "grad_norm": 0.21979787945747375, + "learning_rate": 4.950061149612719e-05, + "loss": 3.8026, + "step": 59485 + }, + { + "epoch": 4.0419894007337955, + "grad_norm": 0.18521176278591156, + "learning_rate": 4.9496364995243924e-05, + "loss": 3.9355, + "step": 59490 + }, + { + "epoch": 4.042329120804457, + "grad_norm": 0.1905571073293686, + "learning_rate": 4.949211849436065e-05, + "loss": 4.1007, + "step": 59495 + }, + { + "epoch": 4.042668840875119, + "grad_norm": 0.15764449536800385, + "learning_rate": 4.948787199347737e-05, + "loss": 3.8368, + "step": 59500 + }, + { + "epoch": 4.043008560945781, + "grad_norm": 0.16582076251506805, + "learning_rate": 4.948362549259411e-05, + "loss": 4.0009, + "step": 59505 + }, + { + "epoch": 4.043348281016442, + "grad_norm": 0.21746884286403656, + "learning_rate": 4.9479378991710836e-05, + "loss": 3.8357, + "step": 59510 + }, + { + "epoch": 4.043688001087104, + "grad_norm": 0.960581362247467, + "learning_rate": 4.947513249082756e-05, + "loss": 3.7626, + "step": 59515 + }, + { + "epoch": 4.044027721157766, + "grad_norm": 0.16583073139190674, + "learning_rate": 4.9470885989944285e-05, + "loss": 3.9061, + "step": 59520 + }, + { + "epoch": 4.044367441228427, + "grad_norm": 0.1473916620016098, + "learning_rate": 4.946663948906102e-05, + "loss": 3.6698, + "step": 59525 + }, + { + "epoch": 4.0447071612990895, + "grad_norm": 1.0331732034683228, + "learning_rate": 4.946239298817774e-05, + "loss": 3.7792, + "step": 59530 + }, + { + "epoch": 4.0450468813697515, + "grad_norm": 0.166097491979599, + "learning_rate": 4.945814648729447e-05, + "loss": 4.007, + "step": 59535 + }, + { + "epoch": 4.045386601440413, + "grad_norm": 0.13835129141807556, + "learning_rate": 4.9453899986411204e-05, + "loss": 4.0404, + "step": 59540 + }, + { + "epoch": 4.045726321511075, + "grad_norm": 0.16485567390918732, + "learning_rate": 4.9449653485527925e-05, + "loss": 3.7345, + "step": 59545 + }, + { + "epoch": 4.046066041581737, + "grad_norm": 0.167918398976326, + "learning_rate": 4.944540698464465e-05, + "loss": 3.8531, + "step": 59550 + }, + { + "epoch": 4.046405761652398, + "grad_norm": 0.19358882308006287, + "learning_rate": 4.944116048376138e-05, + "loss": 3.7892, + "step": 59555 + }, + { + "epoch": 4.04674548172306, + "grad_norm": 0.15315860509872437, + "learning_rate": 4.943691398287811e-05, + "loss": 3.9987, + "step": 59560 + }, + { + "epoch": 4.047085201793722, + "grad_norm": 0.22976939380168915, + "learning_rate": 4.943266748199484e-05, + "loss": 3.9225, + "step": 59565 + }, + { + "epoch": 4.047424921864383, + "grad_norm": 0.17650151252746582, + "learning_rate": 4.9428420981111565e-05, + "loss": 3.8833, + "step": 59570 + }, + { + "epoch": 4.0477646419350455, + "grad_norm": 0.17587676644325256, + "learning_rate": 4.94241744802283e-05, + "loss": 3.5681, + "step": 59575 + }, + { + "epoch": 4.048104362005708, + "grad_norm": 0.1875745952129364, + "learning_rate": 4.941992797934502e-05, + "loss": 3.8903, + "step": 59580 + }, + { + "epoch": 4.048444082076369, + "grad_norm": 0.18330898880958557, + "learning_rate": 4.941568147846175e-05, + "loss": 3.7926, + "step": 59585 + }, + { + "epoch": 4.048783802147031, + "grad_norm": 0.18213504552841187, + "learning_rate": 4.941143497757848e-05, + "loss": 3.7641, + "step": 59590 + }, + { + "epoch": 4.049123522217693, + "grad_norm": 0.6581819653511047, + "learning_rate": 4.9407188476695205e-05, + "loss": 3.6784, + "step": 59595 + }, + { + "epoch": 4.049463242288354, + "grad_norm": 0.1876242309808731, + "learning_rate": 4.940294197581193e-05, + "loss": 3.9968, + "step": 59600 + }, + { + "epoch": 4.049802962359016, + "grad_norm": 0.5199191570281982, + "learning_rate": 4.939869547492866e-05, + "loss": 3.8584, + "step": 59605 + }, + { + "epoch": 4.050142682429678, + "grad_norm": 0.15144532918930054, + "learning_rate": 4.939444897404539e-05, + "loss": 3.8974, + "step": 59610 + }, + { + "epoch": 4.050482402500339, + "grad_norm": 0.5455876588821411, + "learning_rate": 4.939020247316212e-05, + "loss": 3.8382, + "step": 59615 + }, + { + "epoch": 4.0508221225710015, + "grad_norm": 0.16013203561306, + "learning_rate": 4.9385955972278845e-05, + "loss": 3.874, + "step": 59620 + }, + { + "epoch": 4.051161842641664, + "grad_norm": 0.15977779030799866, + "learning_rate": 4.938170947139557e-05, + "loss": 3.7256, + "step": 59625 + }, + { + "epoch": 4.051501562712325, + "grad_norm": 0.15192380547523499, + "learning_rate": 4.93774629705123e-05, + "loss": 3.9181, + "step": 59630 + }, + { + "epoch": 4.051841282782987, + "grad_norm": 0.18502256274223328, + "learning_rate": 4.937321646962903e-05, + "loss": 4.0755, + "step": 59635 + }, + { + "epoch": 4.052181002853649, + "grad_norm": 0.17276185750961304, + "learning_rate": 4.936896996874576e-05, + "loss": 3.7106, + "step": 59640 + }, + { + "epoch": 4.05252072292431, + "grad_norm": 0.13399717211723328, + "learning_rate": 4.9364723467862485e-05, + "loss": 3.9413, + "step": 59645 + }, + { + "epoch": 4.052860442994972, + "grad_norm": 0.278813898563385, + "learning_rate": 4.936047696697921e-05, + "loss": 4.1127, + "step": 59650 + }, + { + "epoch": 4.053200163065634, + "grad_norm": 0.1733526587486267, + "learning_rate": 4.9356230466095934e-05, + "loss": 3.888, + "step": 59655 + }, + { + "epoch": 4.053539883136295, + "grad_norm": 0.1618412733078003, + "learning_rate": 4.935198396521267e-05, + "loss": 3.7247, + "step": 59660 + }, + { + "epoch": 4.0538796032069575, + "grad_norm": 0.2072465419769287, + "learning_rate": 4.93477374643294e-05, + "loss": 3.8678, + "step": 59665 + }, + { + "epoch": 4.05421932327762, + "grad_norm": 0.1620374321937561, + "learning_rate": 4.934349096344612e-05, + "loss": 4.1078, + "step": 59670 + }, + { + "epoch": 4.054559043348281, + "grad_norm": 0.21880760788917542, + "learning_rate": 4.933924446256285e-05, + "loss": 3.8732, + "step": 59675 + }, + { + "epoch": 4.054898763418943, + "grad_norm": 0.14428122341632843, + "learning_rate": 4.933499796167958e-05, + "loss": 3.9129, + "step": 59680 + }, + { + "epoch": 4.055238483489605, + "grad_norm": 0.17447559535503387, + "learning_rate": 4.93307514607963e-05, + "loss": 3.8875, + "step": 59685 + }, + { + "epoch": 4.055578203560266, + "grad_norm": 0.17551618814468384, + "learning_rate": 4.932650495991303e-05, + "loss": 3.7145, + "step": 59690 + }, + { + "epoch": 4.055917923630928, + "grad_norm": 0.21220634877681732, + "learning_rate": 4.9322258459029765e-05, + "loss": 3.8877, + "step": 59695 + }, + { + "epoch": 4.05625764370159, + "grad_norm": 0.1705874651670456, + "learning_rate": 4.9318011958146487e-05, + "loss": 3.8341, + "step": 59700 + }, + { + "epoch": 4.056597363772251, + "grad_norm": 0.2475590854883194, + "learning_rate": 4.9313765457263215e-05, + "loss": 3.8275, + "step": 59705 + }, + { + "epoch": 4.0569370838429135, + "grad_norm": 0.1978604942560196, + "learning_rate": 4.930951895637995e-05, + "loss": 3.9242, + "step": 59710 + }, + { + "epoch": 4.057276803913576, + "grad_norm": 0.2069787085056305, + "learning_rate": 4.930527245549667e-05, + "loss": 4.146, + "step": 59715 + }, + { + "epoch": 4.057616523984237, + "grad_norm": 0.13616520166397095, + "learning_rate": 4.93010259546134e-05, + "loss": 3.8938, + "step": 59720 + }, + { + "epoch": 4.057956244054899, + "grad_norm": 0.19515344500541687, + "learning_rate": 4.9296779453730127e-05, + "loss": 3.9691, + "step": 59725 + }, + { + "epoch": 4.058295964125561, + "grad_norm": 0.2055058628320694, + "learning_rate": 4.9292532952846855e-05, + "loss": 4.0386, + "step": 59730 + }, + { + "epoch": 4.058635684196222, + "grad_norm": 0.15520767867565155, + "learning_rate": 4.928828645196358e-05, + "loss": 3.8033, + "step": 59735 + }, + { + "epoch": 4.058975404266884, + "grad_norm": 0.3176758885383606, + "learning_rate": 4.928403995108031e-05, + "loss": 3.8396, + "step": 59740 + }, + { + "epoch": 4.059315124337546, + "grad_norm": 0.14352796971797943, + "learning_rate": 4.9279793450197045e-05, + "loss": 3.9043, + "step": 59745 + }, + { + "epoch": 4.0596548444082075, + "grad_norm": 0.22275061905384064, + "learning_rate": 4.927554694931377e-05, + "loss": 3.9075, + "step": 59750 + }, + { + "epoch": 4.0599945644788695, + "grad_norm": 0.6037282943725586, + "learning_rate": 4.9271300448430495e-05, + "loss": 3.9818, + "step": 59755 + }, + { + "epoch": 4.060334284549532, + "grad_norm": 0.16461439430713654, + "learning_rate": 4.926705394754722e-05, + "loss": 3.8684, + "step": 59760 + }, + { + "epoch": 4.060674004620193, + "grad_norm": 0.20143482089042664, + "learning_rate": 4.926280744666395e-05, + "loss": 3.7967, + "step": 59765 + }, + { + "epoch": 4.061013724690855, + "grad_norm": 1.4006673097610474, + "learning_rate": 4.925856094578068e-05, + "loss": 3.7097, + "step": 59770 + }, + { + "epoch": 4.061353444761517, + "grad_norm": 0.1577783226966858, + "learning_rate": 4.925431444489741e-05, + "loss": 3.9214, + "step": 59775 + }, + { + "epoch": 4.061693164832178, + "grad_norm": 0.32129788398742676, + "learning_rate": 4.9250067944014135e-05, + "loss": 3.9021, + "step": 59780 + }, + { + "epoch": 4.06203288490284, + "grad_norm": 0.21169033646583557, + "learning_rate": 4.924582144313086e-05, + "loss": 3.6797, + "step": 59785 + }, + { + "epoch": 4.062372604973502, + "grad_norm": 0.1406114399433136, + "learning_rate": 4.924157494224759e-05, + "loss": 3.9382, + "step": 59790 + }, + { + "epoch": 4.0627123250441635, + "grad_norm": 0.2534515857696533, + "learning_rate": 4.923732844136432e-05, + "loss": 4.0115, + "step": 59795 + }, + { + "epoch": 4.0630520451148255, + "grad_norm": 0.1817983090877533, + "learning_rate": 4.923308194048105e-05, + "loss": 3.8985, + "step": 59800 + }, + { + "epoch": 4.063391765185487, + "grad_norm": 0.1322777271270752, + "learning_rate": 4.9228835439597775e-05, + "loss": 4.0113, + "step": 59805 + }, + { + "epoch": 4.063731485256149, + "grad_norm": 0.15463407337665558, + "learning_rate": 4.92245889387145e-05, + "loss": 3.9137, + "step": 59810 + }, + { + "epoch": 4.064071205326811, + "grad_norm": 0.17445607483386993, + "learning_rate": 4.922034243783123e-05, + "loss": 3.7152, + "step": 59815 + }, + { + "epoch": 4.064410925397472, + "grad_norm": 0.1283748894929886, + "learning_rate": 4.921609593694796e-05, + "loss": 3.957, + "step": 59820 + }, + { + "epoch": 4.064750645468134, + "grad_norm": 0.18115770816802979, + "learning_rate": 4.921184943606468e-05, + "loss": 3.8458, + "step": 59825 + }, + { + "epoch": 4.065090365538796, + "grad_norm": 0.1552322953939438, + "learning_rate": 4.9207602935181415e-05, + "loss": 3.7299, + "step": 59830 + }, + { + "epoch": 4.065430085609457, + "grad_norm": 0.16396987438201904, + "learning_rate": 4.920335643429814e-05, + "loss": 3.7959, + "step": 59835 + }, + { + "epoch": 4.0657698056801195, + "grad_norm": 0.1916516274213791, + "learning_rate": 4.9199109933414864e-05, + "loss": 3.9731, + "step": 59840 + }, + { + "epoch": 4.0661095257507816, + "grad_norm": 0.3742567002773285, + "learning_rate": 4.91948634325316e-05, + "loss": 4.0579, + "step": 59845 + }, + { + "epoch": 4.066449245821443, + "grad_norm": 0.16575993597507477, + "learning_rate": 4.919061693164833e-05, + "loss": 3.8452, + "step": 59850 + }, + { + "epoch": 4.066788965892105, + "grad_norm": 0.20291389524936676, + "learning_rate": 4.918637043076505e-05, + "loss": 3.621, + "step": 59855 + }, + { + "epoch": 4.067128685962767, + "grad_norm": 0.15035437047481537, + "learning_rate": 4.9182123929881776e-05, + "loss": 3.8103, + "step": 59860 + }, + { + "epoch": 4.067468406033428, + "grad_norm": 0.14876806735992432, + "learning_rate": 4.917787742899851e-05, + "loss": 3.787, + "step": 59865 + }, + { + "epoch": 4.06780812610409, + "grad_norm": 0.1424962282180786, + "learning_rate": 4.917363092811523e-05, + "loss": 3.9046, + "step": 59870 + }, + { + "epoch": 4.068147846174752, + "grad_norm": 0.18697421252727509, + "learning_rate": 4.916938442723196e-05, + "loss": 3.9499, + "step": 59875 + }, + { + "epoch": 4.068487566245413, + "grad_norm": 0.16986753046512604, + "learning_rate": 4.9165137926348695e-05, + "loss": 3.9651, + "step": 59880 + }, + { + "epoch": 4.0688272863160755, + "grad_norm": 0.278665691614151, + "learning_rate": 4.9160891425465416e-05, + "loss": 3.8111, + "step": 59885 + }, + { + "epoch": 4.069167006386738, + "grad_norm": 0.172211691737175, + "learning_rate": 4.9156644924582144e-05, + "loss": 3.8868, + "step": 59890 + }, + { + "epoch": 4.069506726457399, + "grad_norm": 0.20207776129245758, + "learning_rate": 4.915239842369888e-05, + "loss": 3.7426, + "step": 59895 + }, + { + "epoch": 4.069846446528061, + "grad_norm": 0.16442541778087616, + "learning_rate": 4.91481519228156e-05, + "loss": 3.9226, + "step": 59900 + }, + { + "epoch": 4.070186166598723, + "grad_norm": 0.16047483682632446, + "learning_rate": 4.914390542193233e-05, + "loss": 4.1272, + "step": 59905 + }, + { + "epoch": 4.070525886669384, + "grad_norm": 0.19243796169757843, + "learning_rate": 4.9139658921049056e-05, + "loss": 3.7431, + "step": 59910 + }, + { + "epoch": 4.070865606740046, + "grad_norm": 0.17353549599647522, + "learning_rate": 4.913541242016579e-05, + "loss": 3.943, + "step": 59915 + }, + { + "epoch": 4.071205326810708, + "grad_norm": 0.2363561987876892, + "learning_rate": 4.913116591928251e-05, + "loss": 3.865, + "step": 59920 + }, + { + "epoch": 4.071545046881369, + "grad_norm": 0.2253856658935547, + "learning_rate": 4.912691941839924e-05, + "loss": 3.828, + "step": 59925 + }, + { + "epoch": 4.0718847669520315, + "grad_norm": 0.19178803265094757, + "learning_rate": 4.9122672917515975e-05, + "loss": 3.9185, + "step": 59930 + }, + { + "epoch": 4.072224487022694, + "grad_norm": 0.25162363052368164, + "learning_rate": 4.9118426416632696e-05, + "loss": 3.8861, + "step": 59935 + }, + { + "epoch": 4.072564207093355, + "grad_norm": 0.16112536191940308, + "learning_rate": 4.9114179915749424e-05, + "loss": 4.07, + "step": 59940 + }, + { + "epoch": 4.072903927164017, + "grad_norm": 0.16035431623458862, + "learning_rate": 4.910993341486615e-05, + "loss": 4.1091, + "step": 59945 + }, + { + "epoch": 4.073243647234679, + "grad_norm": 0.19542190432548523, + "learning_rate": 4.910568691398288e-05, + "loss": 3.8088, + "step": 59950 + }, + { + "epoch": 4.07358336730534, + "grad_norm": 0.22935976088047028, + "learning_rate": 4.910144041309961e-05, + "loss": 3.7364, + "step": 59955 + }, + { + "epoch": 4.073923087376002, + "grad_norm": 0.16776950657367706, + "learning_rate": 4.9097193912216336e-05, + "loss": 3.7613, + "step": 59960 + }, + { + "epoch": 4.074262807446664, + "grad_norm": 0.35495418310165405, + "learning_rate": 4.9092947411333064e-05, + "loss": 3.7667, + "step": 59965 + }, + { + "epoch": 4.074602527517325, + "grad_norm": 0.3122372329235077, + "learning_rate": 4.908870091044979e-05, + "loss": 3.8364, + "step": 59970 + }, + { + "epoch": 4.0749422475879875, + "grad_norm": 0.16568343341350555, + "learning_rate": 4.908445440956652e-05, + "loss": 4.0196, + "step": 59975 + }, + { + "epoch": 4.07528196765865, + "grad_norm": 0.49118778109550476, + "learning_rate": 4.908020790868325e-05, + "loss": 3.8144, + "step": 59980 + }, + { + "epoch": 4.075621687729311, + "grad_norm": 0.1595471054315567, + "learning_rate": 4.9075961407799976e-05, + "loss": 3.6283, + "step": 59985 + }, + { + "epoch": 4.075961407799973, + "grad_norm": 0.16588644683361053, + "learning_rate": 4.9071714906916704e-05, + "loss": 3.7251, + "step": 59990 + }, + { + "epoch": 4.076301127870635, + "grad_norm": 0.20282848179340363, + "learning_rate": 4.906746840603343e-05, + "loss": 3.7805, + "step": 59995 + }, + { + "epoch": 4.076640847941296, + "grad_norm": 0.19667819142341614, + "learning_rate": 4.906322190515016e-05, + "loss": 3.9795, + "step": 60000 + }, + { + "epoch": 4.076980568011958, + "grad_norm": 0.16493451595306396, + "learning_rate": 4.905897540426689e-05, + "loss": 3.831, + "step": 60005 + }, + { + "epoch": 4.07732028808262, + "grad_norm": 0.16637501120567322, + "learning_rate": 4.905472890338361e-05, + "loss": 3.726, + "step": 60010 + }, + { + "epoch": 4.0776600081532814, + "grad_norm": 0.2363310158252716, + "learning_rate": 4.9050482402500344e-05, + "loss": 3.6155, + "step": 60015 + }, + { + "epoch": 4.0779997282239435, + "grad_norm": 0.31818968057632446, + "learning_rate": 4.904623590161707e-05, + "loss": 3.8611, + "step": 60020 + }, + { + "epoch": 4.078339448294606, + "grad_norm": 0.16522148251533508, + "learning_rate": 4.9041989400733794e-05, + "loss": 3.8325, + "step": 60025 + }, + { + "epoch": 4.078679168365267, + "grad_norm": 0.2325717955827713, + "learning_rate": 4.903774289985053e-05, + "loss": 3.8036, + "step": 60030 + }, + { + "epoch": 4.079018888435929, + "grad_norm": 0.16281536221504211, + "learning_rate": 4.9033496398967256e-05, + "loss": 3.8669, + "step": 60035 + }, + { + "epoch": 4.079358608506591, + "grad_norm": 0.1848555952310562, + "learning_rate": 4.902924989808398e-05, + "loss": 3.8695, + "step": 60040 + }, + { + "epoch": 4.079698328577252, + "grad_norm": 0.1550171971321106, + "learning_rate": 4.9025003397200706e-05, + "loss": 3.7699, + "step": 60045 + }, + { + "epoch": 4.080038048647914, + "grad_norm": 0.1436447650194168, + "learning_rate": 4.902075689631744e-05, + "loss": 3.9048, + "step": 60050 + }, + { + "epoch": 4.080377768718576, + "grad_norm": 0.3168264627456665, + "learning_rate": 4.901651039543416e-05, + "loss": 3.9086, + "step": 60055 + }, + { + "epoch": 4.0807174887892375, + "grad_norm": 0.15674617886543274, + "learning_rate": 4.901226389455089e-05, + "loss": 3.9072, + "step": 60060 + }, + { + "epoch": 4.0810572088598995, + "grad_norm": 0.17878569662570953, + "learning_rate": 4.9008017393667624e-05, + "loss": 3.6594, + "step": 60065 + }, + { + "epoch": 4.081396928930562, + "grad_norm": 0.2028338611125946, + "learning_rate": 4.9003770892784346e-05, + "loss": 3.6911, + "step": 60070 + }, + { + "epoch": 4.081736649001223, + "grad_norm": 0.21032464504241943, + "learning_rate": 4.8999524391901074e-05, + "loss": 3.8249, + "step": 60075 + }, + { + "epoch": 4.082076369071885, + "grad_norm": 0.22803694009780884, + "learning_rate": 4.89952778910178e-05, + "loss": 3.9955, + "step": 60080 + }, + { + "epoch": 4.082416089142547, + "grad_norm": 0.15814776718616486, + "learning_rate": 4.8991031390134536e-05, + "loss": 3.8816, + "step": 60085 + }, + { + "epoch": 4.082755809213208, + "grad_norm": 0.1684780716896057, + "learning_rate": 4.898678488925126e-05, + "loss": 3.9772, + "step": 60090 + }, + { + "epoch": 4.08309552928387, + "grad_norm": 0.1957729458808899, + "learning_rate": 4.8982538388367986e-05, + "loss": 4.1539, + "step": 60095 + }, + { + "epoch": 4.083435249354532, + "grad_norm": 0.16557063162326813, + "learning_rate": 4.897829188748472e-05, + "loss": 3.7732, + "step": 60100 + }, + { + "epoch": 4.0837749694251935, + "grad_norm": 0.1668914407491684, + "learning_rate": 4.897404538660144e-05, + "loss": 3.696, + "step": 60105 + }, + { + "epoch": 4.0841146894958555, + "grad_norm": 0.1643957942724228, + "learning_rate": 4.896979888571817e-05, + "loss": 3.9145, + "step": 60110 + }, + { + "epoch": 4.084454409566518, + "grad_norm": 0.1661723405122757, + "learning_rate": 4.89655523848349e-05, + "loss": 3.9663, + "step": 60115 + }, + { + "epoch": 4.084794129637179, + "grad_norm": 0.1414141058921814, + "learning_rate": 4.8961305883951626e-05, + "loss": 3.8044, + "step": 60120 + }, + { + "epoch": 4.085133849707841, + "grad_norm": 0.18800048530101776, + "learning_rate": 4.8957059383068354e-05, + "loss": 3.9715, + "step": 60125 + }, + { + "epoch": 4.085473569778502, + "grad_norm": 0.19552327692508698, + "learning_rate": 4.895281288218508e-05, + "loss": 3.7663, + "step": 60130 + }, + { + "epoch": 4.085813289849164, + "grad_norm": 0.8070520758628845, + "learning_rate": 4.894856638130181e-05, + "loss": 4.0394, + "step": 60135 + }, + { + "epoch": 4.086153009919826, + "grad_norm": 0.16358524560928345, + "learning_rate": 4.894431988041854e-05, + "loss": 3.6819, + "step": 60140 + }, + { + "epoch": 4.086492729990487, + "grad_norm": 0.1602434515953064, + "learning_rate": 4.8940073379535266e-05, + "loss": 3.9282, + "step": 60145 + }, + { + "epoch": 4.0868324500611495, + "grad_norm": 0.1701774001121521, + "learning_rate": 4.8935826878651994e-05, + "loss": 3.9351, + "step": 60150 + }, + { + "epoch": 4.0871721701318116, + "grad_norm": 0.1515931338071823, + "learning_rate": 4.893158037776872e-05, + "loss": 4.1009, + "step": 60155 + }, + { + "epoch": 4.087511890202473, + "grad_norm": 0.1969285011291504, + "learning_rate": 4.892733387688545e-05, + "loss": 3.8737, + "step": 60160 + }, + { + "epoch": 4.087851610273135, + "grad_norm": 0.15617863833904266, + "learning_rate": 4.892308737600218e-05, + "loss": 3.9213, + "step": 60165 + }, + { + "epoch": 4.088191330343797, + "grad_norm": 0.16214385628700256, + "learning_rate": 4.8918840875118906e-05, + "loss": 3.5605, + "step": 60170 + }, + { + "epoch": 4.088531050414458, + "grad_norm": 0.17648205161094666, + "learning_rate": 4.8914594374235634e-05, + "loss": 3.981, + "step": 60175 + }, + { + "epoch": 4.08887077048512, + "grad_norm": 0.16015322506427765, + "learning_rate": 4.8910347873352355e-05, + "loss": 3.9672, + "step": 60180 + }, + { + "epoch": 4.089210490555782, + "grad_norm": 0.8466540575027466, + "learning_rate": 4.890610137246909e-05, + "loss": 3.568, + "step": 60185 + }, + { + "epoch": 4.089550210626443, + "grad_norm": 0.20394614338874817, + "learning_rate": 4.890185487158582e-05, + "loss": 3.8038, + "step": 60190 + }, + { + "epoch": 4.0898899306971055, + "grad_norm": 0.1819581836462021, + "learning_rate": 4.889760837070254e-05, + "loss": 4.2119, + "step": 60195 + }, + { + "epoch": 4.090229650767768, + "grad_norm": 0.16956952214241028, + "learning_rate": 4.8893361869819274e-05, + "loss": 3.7843, + "step": 60200 + }, + { + "epoch": 4.090569370838429, + "grad_norm": 2.0702810287475586, + "learning_rate": 4.8889115368936e-05, + "loss": 4.0419, + "step": 60205 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.33989793062210083, + "learning_rate": 4.888486886805272e-05, + "loss": 3.8261, + "step": 60210 + }, + { + "epoch": 4.091248810979753, + "grad_norm": 0.16726569831371307, + "learning_rate": 4.888062236716945e-05, + "loss": 3.9459, + "step": 60215 + }, + { + "epoch": 4.091588531050414, + "grad_norm": 0.1684996336698532, + "learning_rate": 4.8876375866286186e-05, + "loss": 3.852, + "step": 60220 + }, + { + "epoch": 4.091928251121076, + "grad_norm": 0.15668606758117676, + "learning_rate": 4.887212936540291e-05, + "loss": 3.9441, + "step": 60225 + }, + { + "epoch": 4.092267971191738, + "grad_norm": 0.17645476758480072, + "learning_rate": 4.8867882864519635e-05, + "loss": 3.9394, + "step": 60230 + }, + { + "epoch": 4.092607691262399, + "grad_norm": 6.901217460632324, + "learning_rate": 4.886363636363637e-05, + "loss": 3.7442, + "step": 60235 + }, + { + "epoch": 4.0929474113330615, + "grad_norm": 0.20358319580554962, + "learning_rate": 4.885938986275309e-05, + "loss": 3.5886, + "step": 60240 + }, + { + "epoch": 4.093287131403724, + "grad_norm": 0.17243839800357819, + "learning_rate": 4.885514336186982e-05, + "loss": 3.803, + "step": 60245 + }, + { + "epoch": 4.093626851474385, + "grad_norm": 0.17614853382110596, + "learning_rate": 4.885089686098655e-05, + "loss": 3.7466, + "step": 60250 + }, + { + "epoch": 4.093966571545047, + "grad_norm": 0.19124281406402588, + "learning_rate": 4.884665036010328e-05, + "loss": 3.7903, + "step": 60255 + }, + { + "epoch": 4.094306291615709, + "grad_norm": 0.1868988573551178, + "learning_rate": 4.884240385922e-05, + "loss": 3.9857, + "step": 60260 + }, + { + "epoch": 4.09464601168637, + "grad_norm": 0.17541000247001648, + "learning_rate": 4.883815735833673e-05, + "loss": 3.9694, + "step": 60265 + }, + { + "epoch": 4.094985731757032, + "grad_norm": 0.21044181287288666, + "learning_rate": 4.8833910857453466e-05, + "loss": 3.7912, + "step": 60270 + }, + { + "epoch": 4.095325451827694, + "grad_norm": 0.16002431511878967, + "learning_rate": 4.882966435657019e-05, + "loss": 3.7829, + "step": 60275 + }, + { + "epoch": 4.095665171898355, + "grad_norm": 0.16788969933986664, + "learning_rate": 4.8825417855686915e-05, + "loss": 3.7544, + "step": 60280 + }, + { + "epoch": 4.0960048919690175, + "grad_norm": 0.12943480908870697, + "learning_rate": 4.882117135480364e-05, + "loss": 3.7334, + "step": 60285 + }, + { + "epoch": 4.09634461203968, + "grad_norm": 0.15464317798614502, + "learning_rate": 4.881692485392037e-05, + "loss": 3.699, + "step": 60290 + }, + { + "epoch": 4.096684332110341, + "grad_norm": 0.1972687989473343, + "learning_rate": 4.88126783530371e-05, + "loss": 3.6925, + "step": 60295 + }, + { + "epoch": 4.097024052181003, + "grad_norm": 0.20398402214050293, + "learning_rate": 4.880843185215383e-05, + "loss": 3.8529, + "step": 60300 + }, + { + "epoch": 4.097363772251665, + "grad_norm": 0.1567905843257904, + "learning_rate": 4.8804185351270555e-05, + "loss": 3.8201, + "step": 60305 + }, + { + "epoch": 4.097703492322326, + "grad_norm": 0.15602858364582062, + "learning_rate": 4.879993885038728e-05, + "loss": 3.619, + "step": 60310 + }, + { + "epoch": 4.098043212392988, + "grad_norm": 0.6307495832443237, + "learning_rate": 4.879569234950401e-05, + "loss": 4.0401, + "step": 60315 + }, + { + "epoch": 4.09838293246365, + "grad_norm": 0.1515219807624817, + "learning_rate": 4.879144584862074e-05, + "loss": 3.6373, + "step": 60320 + }, + { + "epoch": 4.0987226525343115, + "grad_norm": 0.2736189365386963, + "learning_rate": 4.878719934773747e-05, + "loss": 4.055, + "step": 60325 + }, + { + "epoch": 4.0990623726049735, + "grad_norm": 0.4689928889274597, + "learning_rate": 4.8782952846854195e-05, + "loss": 3.8667, + "step": 60330 + }, + { + "epoch": 4.099402092675636, + "grad_norm": 0.43476757407188416, + "learning_rate": 4.877870634597092e-05, + "loss": 3.8874, + "step": 60335 + }, + { + "epoch": 4.099741812746297, + "grad_norm": 0.17491091787815094, + "learning_rate": 4.877445984508765e-05, + "loss": 3.835, + "step": 60340 + }, + { + "epoch": 4.100081532816959, + "grad_norm": 0.8897982835769653, + "learning_rate": 4.877021334420438e-05, + "loss": 4.0024, + "step": 60345 + }, + { + "epoch": 4.100421252887621, + "grad_norm": 0.13936395943164825, + "learning_rate": 4.87659668433211e-05, + "loss": 3.9605, + "step": 60350 + }, + { + "epoch": 4.100760972958282, + "grad_norm": 0.1540706902742386, + "learning_rate": 4.8761720342437835e-05, + "loss": 3.8612, + "step": 60355 + }, + { + "epoch": 4.101100693028944, + "grad_norm": 0.16927112638950348, + "learning_rate": 4.875747384155456e-05, + "loss": 4.0411, + "step": 60360 + }, + { + "epoch": 4.101440413099606, + "grad_norm": 0.16329650580883026, + "learning_rate": 4.8753227340671284e-05, + "loss": 3.8254, + "step": 60365 + }, + { + "epoch": 4.1017801331702675, + "grad_norm": 0.1854979544878006, + "learning_rate": 4.874898083978802e-05, + "loss": 3.8441, + "step": 60370 + }, + { + "epoch": 4.1021198532409295, + "grad_norm": 0.18712282180786133, + "learning_rate": 4.874473433890475e-05, + "loss": 3.761, + "step": 60375 + }, + { + "epoch": 4.102459573311592, + "grad_norm": 0.18338875472545624, + "learning_rate": 4.874048783802147e-05, + "loss": 4.0936, + "step": 60380 + }, + { + "epoch": 4.102799293382253, + "grad_norm": 0.1484263390302658, + "learning_rate": 4.8736241337138197e-05, + "loss": 3.8089, + "step": 60385 + }, + { + "epoch": 4.103139013452915, + "grad_norm": 0.15827910602092743, + "learning_rate": 4.873199483625493e-05, + "loss": 3.7113, + "step": 60390 + }, + { + "epoch": 4.103478733523577, + "grad_norm": 0.14215780794620514, + "learning_rate": 4.872774833537165e-05, + "loss": 3.7974, + "step": 60395 + }, + { + "epoch": 4.103818453594238, + "grad_norm": 0.3939377963542938, + "learning_rate": 4.872350183448838e-05, + "loss": 3.6268, + "step": 60400 + }, + { + "epoch": 4.1041581736649, + "grad_norm": 0.19978098571300507, + "learning_rate": 4.8719255333605115e-05, + "loss": 3.7202, + "step": 60405 + }, + { + "epoch": 4.104497893735562, + "grad_norm": 0.1953551471233368, + "learning_rate": 4.8715008832721837e-05, + "loss": 4.0015, + "step": 60410 + }, + { + "epoch": 4.1048376138062235, + "grad_norm": 0.31518152356147766, + "learning_rate": 4.8710762331838565e-05, + "loss": 3.7821, + "step": 60415 + }, + { + "epoch": 4.1051773338768855, + "grad_norm": 0.22752614319324493, + "learning_rate": 4.87065158309553e-05, + "loss": 3.9435, + "step": 60420 + }, + { + "epoch": 4.105517053947548, + "grad_norm": 0.21861033141613007, + "learning_rate": 4.870226933007203e-05, + "loss": 3.8721, + "step": 60425 + }, + { + "epoch": 4.105856774018209, + "grad_norm": 0.15616445243358612, + "learning_rate": 4.869802282918875e-05, + "loss": 3.7493, + "step": 60430 + }, + { + "epoch": 4.106196494088871, + "grad_norm": 0.16000080108642578, + "learning_rate": 4.8693776328305477e-05, + "loss": 3.65, + "step": 60435 + }, + { + "epoch": 4.106536214159533, + "grad_norm": 0.1784227341413498, + "learning_rate": 4.868952982742221e-05, + "loss": 3.9528, + "step": 60440 + }, + { + "epoch": 4.106875934230194, + "grad_norm": 0.15936937928199768, + "learning_rate": 4.868528332653893e-05, + "loss": 3.8478, + "step": 60445 + }, + { + "epoch": 4.107215654300856, + "grad_norm": 2.2587215900421143, + "learning_rate": 4.868103682565566e-05, + "loss": 3.9271, + "step": 60450 + }, + { + "epoch": 4.107555374371518, + "grad_norm": 0.17379115521907806, + "learning_rate": 4.8676790324772395e-05, + "loss": 3.8716, + "step": 60455 + }, + { + "epoch": 4.1078950944421795, + "grad_norm": 0.15550942718982697, + "learning_rate": 4.867254382388912e-05, + "loss": 3.7208, + "step": 60460 + }, + { + "epoch": 4.108234814512842, + "grad_norm": 0.17348241806030273, + "learning_rate": 4.8668297323005845e-05, + "loss": 3.8617, + "step": 60465 + }, + { + "epoch": 4.108574534583504, + "grad_norm": 0.18874108791351318, + "learning_rate": 4.866405082212257e-05, + "loss": 3.7991, + "step": 60470 + }, + { + "epoch": 4.108914254654165, + "grad_norm": 0.18416136503219604, + "learning_rate": 4.86598043212393e-05, + "loss": 3.9517, + "step": 60475 + }, + { + "epoch": 4.109253974724827, + "grad_norm": 0.13954566419124603, + "learning_rate": 4.865555782035603e-05, + "loss": 3.8113, + "step": 60480 + }, + { + "epoch": 4.109593694795488, + "grad_norm": 0.13173267245292664, + "learning_rate": 4.865131131947276e-05, + "loss": 3.8883, + "step": 60485 + }, + { + "epoch": 4.10993341486615, + "grad_norm": 0.15676403045654297, + "learning_rate": 4.8647064818589485e-05, + "loss": 4.0882, + "step": 60490 + }, + { + "epoch": 4.110273134936812, + "grad_norm": 0.18559077382087708, + "learning_rate": 4.864281831770621e-05, + "loss": 3.8618, + "step": 60495 + }, + { + "epoch": 4.110612855007473, + "grad_norm": 0.19611907005310059, + "learning_rate": 4.863857181682294e-05, + "loss": 3.4719, + "step": 60500 + }, + { + "epoch": 4.1109525750781355, + "grad_norm": 0.16242669522762299, + "learning_rate": 4.863432531593967e-05, + "loss": 3.8252, + "step": 60505 + }, + { + "epoch": 4.111292295148798, + "grad_norm": 0.19293203949928284, + "learning_rate": 4.86300788150564e-05, + "loss": 3.8176, + "step": 60510 + }, + { + "epoch": 4.111632015219459, + "grad_norm": 0.13509102165699005, + "learning_rate": 4.8625832314173125e-05, + "loss": 3.7855, + "step": 60515 + }, + { + "epoch": 4.111971735290121, + "grad_norm": 0.1759876310825348, + "learning_rate": 4.862158581328985e-05, + "loss": 3.8014, + "step": 60520 + }, + { + "epoch": 4.112311455360783, + "grad_norm": 0.18377643823623657, + "learning_rate": 4.861733931240658e-05, + "loss": 3.786, + "step": 60525 + }, + { + "epoch": 4.112651175431444, + "grad_norm": 0.18725310266017914, + "learning_rate": 4.861309281152331e-05, + "loss": 3.6931, + "step": 60530 + }, + { + "epoch": 4.112990895502106, + "grad_norm": 0.27178627252578735, + "learning_rate": 4.860884631064003e-05, + "loss": 3.6412, + "step": 60535 + }, + { + "epoch": 4.113330615572768, + "grad_norm": 0.2939038872718811, + "learning_rate": 4.8604599809756765e-05, + "loss": 3.7653, + "step": 60540 + }, + { + "epoch": 4.113670335643429, + "grad_norm": 0.1692841500043869, + "learning_rate": 4.860035330887349e-05, + "loss": 3.93, + "step": 60545 + }, + { + "epoch": 4.1140100557140915, + "grad_norm": 0.1564483344554901, + "learning_rate": 4.8596106807990214e-05, + "loss": 3.7051, + "step": 60550 + }, + { + "epoch": 4.114349775784754, + "grad_norm": 0.1500711590051651, + "learning_rate": 4.859186030710695e-05, + "loss": 3.689, + "step": 60555 + }, + { + "epoch": 4.114689495855415, + "grad_norm": 0.2274036854505539, + "learning_rate": 4.858761380622368e-05, + "loss": 3.8698, + "step": 60560 + }, + { + "epoch": 4.115029215926077, + "grad_norm": 0.2795615494251251, + "learning_rate": 4.85833673053404e-05, + "loss": 3.8872, + "step": 60565 + }, + { + "epoch": 4.115368935996739, + "grad_norm": 0.190473273396492, + "learning_rate": 4.8579120804457126e-05, + "loss": 3.7282, + "step": 60570 + }, + { + "epoch": 4.1157086560674, + "grad_norm": 0.16898594796657562, + "learning_rate": 4.857487430357386e-05, + "loss": 3.6021, + "step": 60575 + }, + { + "epoch": 4.116048376138062, + "grad_norm": 0.35757339000701904, + "learning_rate": 4.857062780269058e-05, + "loss": 4.024, + "step": 60580 + }, + { + "epoch": 4.116388096208724, + "grad_norm": 0.14144366979599, + "learning_rate": 4.856638130180731e-05, + "loss": 3.6172, + "step": 60585 + }, + { + "epoch": 4.116727816279385, + "grad_norm": 0.19866211712360382, + "learning_rate": 4.8562134800924045e-05, + "loss": 3.736, + "step": 60590 + }, + { + "epoch": 4.1170675363500475, + "grad_norm": 0.18755559623241425, + "learning_rate": 4.855788830004077e-05, + "loss": 3.9138, + "step": 60595 + }, + { + "epoch": 4.11740725642071, + "grad_norm": 0.15131178498268127, + "learning_rate": 4.8553641799157494e-05, + "loss": 4.0185, + "step": 60600 + }, + { + "epoch": 4.117746976491371, + "grad_norm": 0.19385775923728943, + "learning_rate": 4.854939529827422e-05, + "loss": 3.7905, + "step": 60605 + }, + { + "epoch": 4.118086696562033, + "grad_norm": 0.2256048172712326, + "learning_rate": 4.854514879739096e-05, + "loss": 3.6062, + "step": 60610 + }, + { + "epoch": 4.118426416632695, + "grad_norm": 0.355817049741745, + "learning_rate": 4.854090229650768e-05, + "loss": 3.723, + "step": 60615 + }, + { + "epoch": 4.118766136703356, + "grad_norm": 0.4213181138038635, + "learning_rate": 4.8536655795624406e-05, + "loss": 3.9478, + "step": 60620 + }, + { + "epoch": 4.119105856774018, + "grad_norm": 0.1507284939289093, + "learning_rate": 4.853240929474114e-05, + "loss": 3.8998, + "step": 60625 + }, + { + "epoch": 4.11944557684468, + "grad_norm": 1.4331426620483398, + "learning_rate": 4.852816279385786e-05, + "loss": 3.6611, + "step": 60630 + }, + { + "epoch": 4.1197852969153415, + "grad_norm": 0.19143249094486237, + "learning_rate": 4.852391629297459e-05, + "loss": 4.0549, + "step": 60635 + }, + { + "epoch": 4.1201250169860035, + "grad_norm": 0.14812159538269043, + "learning_rate": 4.851966979209132e-05, + "loss": 3.7083, + "step": 60640 + }, + { + "epoch": 4.120464737056666, + "grad_norm": 0.17800916731357574, + "learning_rate": 4.8515423291208046e-05, + "loss": 3.8671, + "step": 60645 + }, + { + "epoch": 4.120804457127327, + "grad_norm": 0.22621239721775055, + "learning_rate": 4.8511176790324774e-05, + "loss": 3.8939, + "step": 60650 + }, + { + "epoch": 4.121144177197989, + "grad_norm": 0.18635708093643188, + "learning_rate": 4.85069302894415e-05, + "loss": 3.8278, + "step": 60655 + }, + { + "epoch": 4.121483897268651, + "grad_norm": 0.16071341931819916, + "learning_rate": 4.850268378855823e-05, + "loss": 3.6427, + "step": 60660 + }, + { + "epoch": 4.121823617339312, + "grad_norm": 0.16269809007644653, + "learning_rate": 4.849843728767496e-05, + "loss": 3.8349, + "step": 60665 + }, + { + "epoch": 4.122163337409974, + "grad_norm": 0.2047763168811798, + "learning_rate": 4.8494190786791686e-05, + "loss": 3.7806, + "step": 60670 + }, + { + "epoch": 4.122503057480636, + "grad_norm": 0.8947213292121887, + "learning_rate": 4.8489944285908414e-05, + "loss": 3.7318, + "step": 60675 + }, + { + "epoch": 4.1228427775512975, + "grad_norm": 0.2046634703874588, + "learning_rate": 4.848569778502514e-05, + "loss": 3.8771, + "step": 60680 + }, + { + "epoch": 4.1231824976219595, + "grad_norm": 0.23676329851150513, + "learning_rate": 4.848145128414187e-05, + "loss": 3.8388, + "step": 60685 + }, + { + "epoch": 4.123522217692622, + "grad_norm": 0.1502244919538498, + "learning_rate": 4.84772047832586e-05, + "loss": 4.1666, + "step": 60690 + }, + { + "epoch": 4.123861937763283, + "grad_norm": 0.1520308554172516, + "learning_rate": 4.8472958282375326e-05, + "loss": 3.4778, + "step": 60695 + }, + { + "epoch": 4.124201657833945, + "grad_norm": 0.2501307725906372, + "learning_rate": 4.8468711781492054e-05, + "loss": 3.5181, + "step": 60700 + }, + { + "epoch": 4.124541377904607, + "grad_norm": 0.17092923820018768, + "learning_rate": 4.8464465280608775e-05, + "loss": 3.909, + "step": 60705 + }, + { + "epoch": 4.124881097975268, + "grad_norm": 0.18762214481830597, + "learning_rate": 4.846021877972551e-05, + "loss": 3.7678, + "step": 60710 + }, + { + "epoch": 4.12522081804593, + "grad_norm": 0.1910400390625, + "learning_rate": 4.845597227884224e-05, + "loss": 3.8359, + "step": 60715 + }, + { + "epoch": 4.125560538116592, + "grad_norm": 0.1513231247663498, + "learning_rate": 4.845172577795896e-05, + "loss": 3.7905, + "step": 60720 + }, + { + "epoch": 4.1259002581872535, + "grad_norm": 0.1500212848186493, + "learning_rate": 4.8447479277075694e-05, + "loss": 3.6903, + "step": 60725 + }, + { + "epoch": 4.1262399782579156, + "grad_norm": 0.20746612548828125, + "learning_rate": 4.844323277619242e-05, + "loss": 3.8521, + "step": 60730 + }, + { + "epoch": 4.126579698328578, + "grad_norm": 0.19458556175231934, + "learning_rate": 4.8438986275309143e-05, + "loss": 3.921, + "step": 60735 + }, + { + "epoch": 4.126919418399239, + "grad_norm": 0.16537727415561676, + "learning_rate": 4.843473977442587e-05, + "loss": 3.9258, + "step": 60740 + }, + { + "epoch": 4.127259138469901, + "grad_norm": 0.2156006544828415, + "learning_rate": 4.8430493273542606e-05, + "loss": 3.625, + "step": 60745 + }, + { + "epoch": 4.127598858540563, + "grad_norm": 0.17929258942604065, + "learning_rate": 4.842624677265933e-05, + "loss": 3.9871, + "step": 60750 + }, + { + "epoch": 4.127938578611224, + "grad_norm": 0.1475498229265213, + "learning_rate": 4.8422000271776056e-05, + "loss": 3.9349, + "step": 60755 + }, + { + "epoch": 4.128278298681886, + "grad_norm": 0.16118809580802917, + "learning_rate": 4.841775377089279e-05, + "loss": 3.9209, + "step": 60760 + }, + { + "epoch": 4.128618018752548, + "grad_norm": 0.18507975339889526, + "learning_rate": 4.841350727000952e-05, + "loss": 3.7153, + "step": 60765 + }, + { + "epoch": 4.1289577388232095, + "grad_norm": 0.20611004531383514, + "learning_rate": 4.840926076912624e-05, + "loss": 3.8189, + "step": 60770 + }, + { + "epoch": 4.129297458893872, + "grad_norm": 0.1818615347146988, + "learning_rate": 4.840501426824297e-05, + "loss": 3.7809, + "step": 60775 + }, + { + "epoch": 4.129637178964534, + "grad_norm": 0.22401116788387299, + "learning_rate": 4.84007677673597e-05, + "loss": 3.8113, + "step": 60780 + }, + { + "epoch": 4.129976899035195, + "grad_norm": 0.14226001501083374, + "learning_rate": 4.8396521266476424e-05, + "loss": 3.915, + "step": 60785 + }, + { + "epoch": 4.130316619105857, + "grad_norm": 0.42122262716293335, + "learning_rate": 4.839227476559315e-05, + "loss": 3.8162, + "step": 60790 + }, + { + "epoch": 4.130656339176519, + "grad_norm": 0.167398601770401, + "learning_rate": 4.8388028264709886e-05, + "loss": 3.7378, + "step": 60795 + }, + { + "epoch": 4.13099605924718, + "grad_norm": 0.1558995246887207, + "learning_rate": 4.838378176382661e-05, + "loss": 3.7209, + "step": 60800 + }, + { + "epoch": 4.131335779317842, + "grad_norm": 0.1538161039352417, + "learning_rate": 4.8379535262943336e-05, + "loss": 3.6509, + "step": 60805 + }, + { + "epoch": 4.131675499388503, + "grad_norm": 0.24335341155529022, + "learning_rate": 4.8375288762060064e-05, + "loss": 3.9434, + "step": 60810 + }, + { + "epoch": 4.1320152194591655, + "grad_norm": 0.18857795000076294, + "learning_rate": 4.837104226117679e-05, + "loss": 3.7676, + "step": 60815 + }, + { + "epoch": 4.132354939529828, + "grad_norm": 0.1804884523153305, + "learning_rate": 4.836679576029352e-05, + "loss": 3.8528, + "step": 60820 + }, + { + "epoch": 4.132694659600489, + "grad_norm": 1.3229633569717407, + "learning_rate": 4.836254925941025e-05, + "loss": 3.7706, + "step": 60825 + }, + { + "epoch": 4.133034379671151, + "grad_norm": 0.19357599318027496, + "learning_rate": 4.8358302758526976e-05, + "loss": 3.8941, + "step": 60830 + }, + { + "epoch": 4.133374099741813, + "grad_norm": 0.4906562566757202, + "learning_rate": 4.8354056257643704e-05, + "loss": 3.6629, + "step": 60835 + }, + { + "epoch": 4.133713819812474, + "grad_norm": 0.49393290281295776, + "learning_rate": 4.834980975676043e-05, + "loss": 3.8576, + "step": 60840 + }, + { + "epoch": 4.134053539883136, + "grad_norm": 0.2289818376302719, + "learning_rate": 4.834556325587716e-05, + "loss": 3.6546, + "step": 60845 + }, + { + "epoch": 4.134393259953798, + "grad_norm": 0.16517220437526703, + "learning_rate": 4.834131675499389e-05, + "loss": 3.9027, + "step": 60850 + }, + { + "epoch": 4.134732980024459, + "grad_norm": 0.16476339101791382, + "learning_rate": 4.8337070254110616e-05, + "loss": 3.8491, + "step": 60855 + }, + { + "epoch": 4.1350727000951215, + "grad_norm": 0.17488226294517517, + "learning_rate": 4.8332823753227344e-05, + "loss": 3.8436, + "step": 60860 + }, + { + "epoch": 4.135412420165784, + "grad_norm": 0.18494610488414764, + "learning_rate": 4.832857725234407e-05, + "loss": 3.9062, + "step": 60865 + }, + { + "epoch": 4.135752140236445, + "grad_norm": 0.2040848284959793, + "learning_rate": 4.83243307514608e-05, + "loss": 3.7068, + "step": 60870 + }, + { + "epoch": 4.136091860307107, + "grad_norm": 0.17708569765090942, + "learning_rate": 4.832008425057752e-05, + "loss": 3.7896, + "step": 60875 + }, + { + "epoch": 4.136431580377769, + "grad_norm": 0.2025512307882309, + "learning_rate": 4.8315837749694256e-05, + "loss": 3.8018, + "step": 60880 + }, + { + "epoch": 4.13677130044843, + "grad_norm": 0.23474712669849396, + "learning_rate": 4.8311591248810984e-05, + "loss": 3.8197, + "step": 60885 + }, + { + "epoch": 4.137111020519092, + "grad_norm": 0.24074973165988922, + "learning_rate": 4.8307344747927705e-05, + "loss": 3.9908, + "step": 60890 + }, + { + "epoch": 4.137450740589754, + "grad_norm": 0.20705141127109528, + "learning_rate": 4.830309824704444e-05, + "loss": 3.8093, + "step": 60895 + }, + { + "epoch": 4.1377904606604154, + "grad_norm": 0.15846942365169525, + "learning_rate": 4.829885174616117e-05, + "loss": 3.9205, + "step": 60900 + }, + { + "epoch": 4.1381301807310775, + "grad_norm": 0.17072567343711853, + "learning_rate": 4.829460524527789e-05, + "loss": 3.7782, + "step": 60905 + }, + { + "epoch": 4.13846990080174, + "grad_norm": 0.5706697106361389, + "learning_rate": 4.829035874439462e-05, + "loss": 3.9349, + "step": 60910 + }, + { + "epoch": 4.138809620872401, + "grad_norm": 0.159454807639122, + "learning_rate": 4.828611224351135e-05, + "loss": 3.9783, + "step": 60915 + }, + { + "epoch": 4.139149340943063, + "grad_norm": 0.2559989094734192, + "learning_rate": 4.828186574262807e-05, + "loss": 3.847, + "step": 60920 + }, + { + "epoch": 4.139489061013725, + "grad_norm": 0.1718353033065796, + "learning_rate": 4.82776192417448e-05, + "loss": 3.8753, + "step": 60925 + }, + { + "epoch": 4.139828781084386, + "grad_norm": 0.2059721052646637, + "learning_rate": 4.8273372740861536e-05, + "loss": 4.1857, + "step": 60930 + }, + { + "epoch": 4.140168501155048, + "grad_norm": 0.20880594849586487, + "learning_rate": 4.8269126239978264e-05, + "loss": 3.9715, + "step": 60935 + }, + { + "epoch": 4.14050822122571, + "grad_norm": 0.15680095553398132, + "learning_rate": 4.8264879739094985e-05, + "loss": 3.7972, + "step": 60940 + }, + { + "epoch": 4.1408479412963715, + "grad_norm": 0.18809954822063446, + "learning_rate": 4.826063323821172e-05, + "loss": 4.0651, + "step": 60945 + }, + { + "epoch": 4.1411876613670335, + "grad_norm": 0.16355156898498535, + "learning_rate": 4.825638673732845e-05, + "loss": 4.0456, + "step": 60950 + }, + { + "epoch": 4.141527381437696, + "grad_norm": 0.16882385313510895, + "learning_rate": 4.825214023644517e-05, + "loss": 3.4831, + "step": 60955 + }, + { + "epoch": 4.141867101508357, + "grad_norm": 0.16210518777370453, + "learning_rate": 4.82478937355619e-05, + "loss": 3.6924, + "step": 60960 + }, + { + "epoch": 4.142206821579019, + "grad_norm": 0.17673514783382416, + "learning_rate": 4.824364723467863e-05, + "loss": 3.8341, + "step": 60965 + }, + { + "epoch": 4.142546541649681, + "grad_norm": 0.20580872893333435, + "learning_rate": 4.823940073379535e-05, + "loss": 3.9087, + "step": 60970 + }, + { + "epoch": 4.142886261720342, + "grad_norm": 0.1845122128725052, + "learning_rate": 4.823515423291208e-05, + "loss": 3.7751, + "step": 60975 + }, + { + "epoch": 4.143225981791004, + "grad_norm": 0.1711554080247879, + "learning_rate": 4.8230907732028816e-05, + "loss": 3.7214, + "step": 60980 + }, + { + "epoch": 4.143565701861666, + "grad_norm": 0.18267951905727386, + "learning_rate": 4.822666123114554e-05, + "loss": 4.0899, + "step": 60985 + }, + { + "epoch": 4.1439054219323275, + "grad_norm": 0.5974140167236328, + "learning_rate": 4.8222414730262265e-05, + "loss": 3.7435, + "step": 60990 + }, + { + "epoch": 4.1442451420029895, + "grad_norm": 0.24413073062896729, + "learning_rate": 4.821816822937899e-05, + "loss": 3.6113, + "step": 60995 + }, + { + "epoch": 4.144584862073652, + "grad_norm": 0.17333820462226868, + "learning_rate": 4.821392172849572e-05, + "loss": 3.8934, + "step": 61000 + }, + { + "epoch": 4.144924582144313, + "grad_norm": 0.1508939266204834, + "learning_rate": 4.820967522761245e-05, + "loss": 3.634, + "step": 61005 + }, + { + "epoch": 4.145264302214975, + "grad_norm": 0.15205229818820953, + "learning_rate": 4.820542872672918e-05, + "loss": 4.0042, + "step": 61010 + }, + { + "epoch": 4.145604022285637, + "grad_norm": 0.19530753791332245, + "learning_rate": 4.8201182225845905e-05, + "loss": 3.8308, + "step": 61015 + }, + { + "epoch": 4.145943742356298, + "grad_norm": 0.1538011133670807, + "learning_rate": 4.819693572496263e-05, + "loss": 3.7648, + "step": 61020 + }, + { + "epoch": 4.14628346242696, + "grad_norm": 0.17523209750652313, + "learning_rate": 4.819268922407936e-05, + "loss": 3.8881, + "step": 61025 + }, + { + "epoch": 4.146623182497622, + "grad_norm": 0.15102636814117432, + "learning_rate": 4.818844272319609e-05, + "loss": 3.8217, + "step": 61030 + }, + { + "epoch": 4.1469629025682835, + "grad_norm": 0.15192745625972748, + "learning_rate": 4.818419622231282e-05, + "loss": 3.8481, + "step": 61035 + }, + { + "epoch": 4.147302622638946, + "grad_norm": 0.13622544705867767, + "learning_rate": 4.8179949721429545e-05, + "loss": 3.8706, + "step": 61040 + }, + { + "epoch": 4.147642342709608, + "grad_norm": 0.21065647900104523, + "learning_rate": 4.817570322054627e-05, + "loss": 4.1399, + "step": 61045 + }, + { + "epoch": 4.147982062780269, + "grad_norm": 0.19087152183055878, + "learning_rate": 4.8171456719663e-05, + "loss": 3.7475, + "step": 61050 + }, + { + "epoch": 4.148321782850931, + "grad_norm": 0.185089111328125, + "learning_rate": 4.816721021877973e-05, + "loss": 3.948, + "step": 61055 + }, + { + "epoch": 4.148661502921593, + "grad_norm": 0.15301495790481567, + "learning_rate": 4.816296371789645e-05, + "loss": 3.7495, + "step": 61060 + }, + { + "epoch": 4.149001222992254, + "grad_norm": 0.15505938231945038, + "learning_rate": 4.8158717217013185e-05, + "loss": 3.6813, + "step": 61065 + }, + { + "epoch": 4.149340943062916, + "grad_norm": 0.3566035032272339, + "learning_rate": 4.815447071612991e-05, + "loss": 3.9018, + "step": 61070 + }, + { + "epoch": 4.149680663133578, + "grad_norm": 0.15354067087173462, + "learning_rate": 4.8150224215246634e-05, + "loss": 3.8231, + "step": 61075 + }, + { + "epoch": 4.1500203832042395, + "grad_norm": 0.20772980153560638, + "learning_rate": 4.814597771436337e-05, + "loss": 3.7211, + "step": 61080 + }, + { + "epoch": 4.150360103274902, + "grad_norm": 0.20893257856369019, + "learning_rate": 4.81417312134801e-05, + "loss": 3.7489, + "step": 61085 + }, + { + "epoch": 4.150699823345564, + "grad_norm": 0.2796502411365509, + "learning_rate": 4.813748471259682e-05, + "loss": 3.8344, + "step": 61090 + }, + { + "epoch": 4.151039543416225, + "grad_norm": 0.32401442527770996, + "learning_rate": 4.8133238211713546e-05, + "loss": 4.0048, + "step": 61095 + }, + { + "epoch": 4.151379263486887, + "grad_norm": 0.17346273362636566, + "learning_rate": 4.812899171083028e-05, + "loss": 3.7966, + "step": 61100 + }, + { + "epoch": 4.151718983557549, + "grad_norm": 0.1541288048028946, + "learning_rate": 4.812474520994701e-05, + "loss": 3.7879, + "step": 61105 + }, + { + "epoch": 4.15205870362821, + "grad_norm": 0.9018494486808777, + "learning_rate": 4.812049870906373e-05, + "loss": 3.9597, + "step": 61110 + }, + { + "epoch": 4.152398423698872, + "grad_norm": 0.1963191032409668, + "learning_rate": 4.8116252208180465e-05, + "loss": 3.9307, + "step": 61115 + }, + { + "epoch": 4.152738143769534, + "grad_norm": 0.15498125553131104, + "learning_rate": 4.811200570729719e-05, + "loss": 3.8068, + "step": 61120 + }, + { + "epoch": 4.1530778638401955, + "grad_norm": 0.1608375906944275, + "learning_rate": 4.8107759206413915e-05, + "loss": 3.846, + "step": 61125 + }, + { + "epoch": 4.153417583910858, + "grad_norm": 0.19254621863365173, + "learning_rate": 4.810351270553064e-05, + "loss": 3.6624, + "step": 61130 + }, + { + "epoch": 4.15375730398152, + "grad_norm": 0.18618111312389374, + "learning_rate": 4.809926620464738e-05, + "loss": 3.8924, + "step": 61135 + }, + { + "epoch": 4.154097024052181, + "grad_norm": 0.30597123503685, + "learning_rate": 4.80950197037641e-05, + "loss": 3.9626, + "step": 61140 + }, + { + "epoch": 4.154436744122843, + "grad_norm": 0.2026231437921524, + "learning_rate": 4.8090773202880827e-05, + "loss": 3.8836, + "step": 61145 + }, + { + "epoch": 4.154776464193505, + "grad_norm": 0.19142422080039978, + "learning_rate": 4.808652670199756e-05, + "loss": 3.8858, + "step": 61150 + }, + { + "epoch": 4.155116184264166, + "grad_norm": 0.16282658278942108, + "learning_rate": 4.808228020111428e-05, + "loss": 3.9873, + "step": 61155 + }, + { + "epoch": 4.155455904334828, + "grad_norm": 0.5644433498382568, + "learning_rate": 4.807803370023101e-05, + "loss": 3.8741, + "step": 61160 + }, + { + "epoch": 4.15579562440549, + "grad_norm": 0.1796637326478958, + "learning_rate": 4.807378719934774e-05, + "loss": 3.9284, + "step": 61165 + }, + { + "epoch": 4.1561353444761515, + "grad_norm": 0.4338162839412689, + "learning_rate": 4.806954069846447e-05, + "loss": 4.1357, + "step": 61170 + }, + { + "epoch": 4.156475064546814, + "grad_norm": 0.20940658450126648, + "learning_rate": 4.8065294197581195e-05, + "loss": 3.7526, + "step": 61175 + }, + { + "epoch": 4.156814784617475, + "grad_norm": 0.2084924280643463, + "learning_rate": 4.806104769669792e-05, + "loss": 3.8402, + "step": 61180 + }, + { + "epoch": 4.157154504688137, + "grad_norm": 0.20433282852172852, + "learning_rate": 4.805680119581465e-05, + "loss": 3.8669, + "step": 61185 + }, + { + "epoch": 4.157494224758799, + "grad_norm": 0.20703504979610443, + "learning_rate": 4.805255469493138e-05, + "loss": 3.8032, + "step": 61190 + }, + { + "epoch": 4.15783394482946, + "grad_norm": 0.29097500443458557, + "learning_rate": 4.804830819404811e-05, + "loss": 3.9899, + "step": 61195 + }, + { + "epoch": 4.158173664900122, + "grad_norm": 0.2764449119567871, + "learning_rate": 4.8044061693164835e-05, + "loss": 3.8074, + "step": 61200 + }, + { + "epoch": 4.158513384970784, + "grad_norm": 0.18821975588798523, + "learning_rate": 4.803981519228156e-05, + "loss": 3.5743, + "step": 61205 + }, + { + "epoch": 4.1588531050414455, + "grad_norm": 0.23342402279376984, + "learning_rate": 4.803556869139829e-05, + "loss": 4.1062, + "step": 61210 + }, + { + "epoch": 4.1591928251121075, + "grad_norm": 0.15510833263397217, + "learning_rate": 4.803132219051502e-05, + "loss": 3.7495, + "step": 61215 + }, + { + "epoch": 4.15953254518277, + "grad_norm": 0.19636675715446472, + "learning_rate": 4.802707568963175e-05, + "loss": 3.9443, + "step": 61220 + }, + { + "epoch": 4.159872265253431, + "grad_norm": 0.18509846925735474, + "learning_rate": 4.8022829188748475e-05, + "loss": 3.9377, + "step": 61225 + }, + { + "epoch": 4.160211985324093, + "grad_norm": 0.35797354578971863, + "learning_rate": 4.8018582687865196e-05, + "loss": 3.8818, + "step": 61230 + }, + { + "epoch": 4.160551705394755, + "grad_norm": 0.17382043600082397, + "learning_rate": 4.801433618698193e-05, + "loss": 3.8762, + "step": 61235 + }, + { + "epoch": 4.160891425465416, + "grad_norm": 0.16951046884059906, + "learning_rate": 4.801008968609866e-05, + "loss": 3.8811, + "step": 61240 + }, + { + "epoch": 4.161231145536078, + "grad_norm": 0.17726878821849823, + "learning_rate": 4.800584318521538e-05, + "loss": 3.7734, + "step": 61245 + }, + { + "epoch": 4.16157086560674, + "grad_norm": 0.28720635175704956, + "learning_rate": 4.8001596684332115e-05, + "loss": 3.69, + "step": 61250 + }, + { + "epoch": 4.1619105856774015, + "grad_norm": 0.6119344234466553, + "learning_rate": 4.799735018344884e-05, + "loss": 3.903, + "step": 61255 + }, + { + "epoch": 4.1622503057480635, + "grad_norm": 0.16081440448760986, + "learning_rate": 4.7993103682565564e-05, + "loss": 4.047, + "step": 61260 + }, + { + "epoch": 4.162590025818726, + "grad_norm": 0.19331221282482147, + "learning_rate": 4.798885718168229e-05, + "loss": 3.8707, + "step": 61265 + }, + { + "epoch": 4.162929745889387, + "grad_norm": 0.19734948873519897, + "learning_rate": 4.798461068079903e-05, + "loss": 3.6869, + "step": 61270 + }, + { + "epoch": 4.163269465960049, + "grad_norm": 0.17007769644260406, + "learning_rate": 4.7980364179915755e-05, + "loss": 3.887, + "step": 61275 + }, + { + "epoch": 4.163609186030711, + "grad_norm": 0.167160764336586, + "learning_rate": 4.7976117679032476e-05, + "loss": 3.9408, + "step": 61280 + }, + { + "epoch": 4.163948906101372, + "grad_norm": 0.13192404806613922, + "learning_rate": 4.797187117814921e-05, + "loss": 3.6854, + "step": 61285 + }, + { + "epoch": 4.164288626172034, + "grad_norm": 0.1647675484418869, + "learning_rate": 4.796762467726594e-05, + "loss": 3.872, + "step": 61290 + }, + { + "epoch": 4.164628346242696, + "grad_norm": 0.15331216156482697, + "learning_rate": 4.796337817638266e-05, + "loss": 3.8526, + "step": 61295 + }, + { + "epoch": 4.1649680663133575, + "grad_norm": 0.16840487718582153, + "learning_rate": 4.795913167549939e-05, + "loss": 3.8575, + "step": 61300 + }, + { + "epoch": 4.1653077863840196, + "grad_norm": 0.16295307874679565, + "learning_rate": 4.795488517461612e-05, + "loss": 4.02, + "step": 61305 + }, + { + "epoch": 4.165647506454682, + "grad_norm": 0.1593450903892517, + "learning_rate": 4.7950638673732844e-05, + "loss": 3.7183, + "step": 61310 + }, + { + "epoch": 4.165987226525343, + "grad_norm": 0.1932189017534256, + "learning_rate": 4.794639217284957e-05, + "loss": 3.9406, + "step": 61315 + }, + { + "epoch": 4.166326946596005, + "grad_norm": 0.18838194012641907, + "learning_rate": 4.794214567196631e-05, + "loss": 3.9519, + "step": 61320 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.16873487830162048, + "learning_rate": 4.793789917108303e-05, + "loss": 3.7086, + "step": 61325 + }, + { + "epoch": 4.167006386737328, + "grad_norm": 0.6832773685455322, + "learning_rate": 4.7933652670199756e-05, + "loss": 3.9074, + "step": 61330 + }, + { + "epoch": 4.16734610680799, + "grad_norm": 0.16457097232341766, + "learning_rate": 4.792940616931649e-05, + "loss": 4.0206, + "step": 61335 + }, + { + "epoch": 4.167685826878652, + "grad_norm": 0.20946356654167175, + "learning_rate": 4.792515966843321e-05, + "loss": 4.1857, + "step": 61340 + }, + { + "epoch": 4.1680255469493135, + "grad_norm": 0.17008452117443085, + "learning_rate": 4.792091316754994e-05, + "loss": 3.6128, + "step": 61345 + }, + { + "epoch": 4.168365267019976, + "grad_norm": 0.24897801876068115, + "learning_rate": 4.791666666666667e-05, + "loss": 3.7654, + "step": 61350 + }, + { + "epoch": 4.168704987090638, + "grad_norm": 0.17641283571720123, + "learning_rate": 4.7912420165783396e-05, + "loss": 4.0814, + "step": 61355 + }, + { + "epoch": 4.169044707161299, + "grad_norm": 0.1716942936182022, + "learning_rate": 4.7908173664900124e-05, + "loss": 4.0162, + "step": 61360 + }, + { + "epoch": 4.169384427231961, + "grad_norm": 0.1825617402791977, + "learning_rate": 4.790392716401685e-05, + "loss": 3.7178, + "step": 61365 + }, + { + "epoch": 4.169724147302623, + "grad_norm": 0.18398766219615936, + "learning_rate": 4.789968066313358e-05, + "loss": 3.8349, + "step": 61370 + }, + { + "epoch": 4.170063867373284, + "grad_norm": 0.1704431176185608, + "learning_rate": 4.789543416225031e-05, + "loss": 3.7082, + "step": 61375 + }, + { + "epoch": 4.170403587443946, + "grad_norm": 0.28240904211997986, + "learning_rate": 4.7891187661367036e-05, + "loss": 3.9106, + "step": 61380 + }, + { + "epoch": 4.170743307514608, + "grad_norm": 0.18329085409641266, + "learning_rate": 4.7886941160483764e-05, + "loss": 3.8144, + "step": 61385 + }, + { + "epoch": 4.1710830275852695, + "grad_norm": 0.20632261037826538, + "learning_rate": 4.788269465960049e-05, + "loss": 3.6753, + "step": 61390 + }, + { + "epoch": 4.171422747655932, + "grad_norm": 0.18436968326568604, + "learning_rate": 4.787844815871722e-05, + "loss": 4.0971, + "step": 61395 + }, + { + "epoch": 4.171762467726594, + "grad_norm": 0.2513860762119293, + "learning_rate": 4.787420165783394e-05, + "loss": 3.7166, + "step": 61400 + }, + { + "epoch": 4.172102187797255, + "grad_norm": 0.15367351472377777, + "learning_rate": 4.7869955156950676e-05, + "loss": 3.7871, + "step": 61405 + }, + { + "epoch": 4.172441907867917, + "grad_norm": 0.47171053290367126, + "learning_rate": 4.7865708656067404e-05, + "loss": 3.9172, + "step": 61410 + }, + { + "epoch": 4.172781627938579, + "grad_norm": 0.18326455354690552, + "learning_rate": 4.7861462155184125e-05, + "loss": 3.8231, + "step": 61415 + }, + { + "epoch": 4.17312134800924, + "grad_norm": 0.22947897017002106, + "learning_rate": 4.785721565430086e-05, + "loss": 3.8408, + "step": 61420 + }, + { + "epoch": 4.173461068079902, + "grad_norm": 0.22907447814941406, + "learning_rate": 4.785296915341759e-05, + "loss": 3.8735, + "step": 61425 + }, + { + "epoch": 4.173800788150564, + "grad_norm": 0.1841355264186859, + "learning_rate": 4.784872265253431e-05, + "loss": 3.8667, + "step": 61430 + }, + { + "epoch": 4.1741405082212255, + "grad_norm": 0.1929919719696045, + "learning_rate": 4.7844476151651044e-05, + "loss": 4.043, + "step": 61435 + }, + { + "epoch": 4.174480228291888, + "grad_norm": 0.3499118685722351, + "learning_rate": 4.784022965076777e-05, + "loss": 4.0176, + "step": 61440 + }, + { + "epoch": 4.17481994836255, + "grad_norm": 0.17869006097316742, + "learning_rate": 4.78359831498845e-05, + "loss": 3.9609, + "step": 61445 + }, + { + "epoch": 4.175159668433211, + "grad_norm": 0.12915737926959991, + "learning_rate": 4.783173664900122e-05, + "loss": 3.9815, + "step": 61450 + }, + { + "epoch": 4.175499388503873, + "grad_norm": 0.16791872680187225, + "learning_rate": 4.7827490148117956e-05, + "loss": 3.9208, + "step": 61455 + }, + { + "epoch": 4.175839108574535, + "grad_norm": 0.14561475813388824, + "learning_rate": 4.7823243647234684e-05, + "loss": 3.7511, + "step": 61460 + }, + { + "epoch": 4.176178828645196, + "grad_norm": 0.3753831684589386, + "learning_rate": 4.7818997146351406e-05, + "loss": 3.7484, + "step": 61465 + }, + { + "epoch": 4.176518548715858, + "grad_norm": 0.2310917228460312, + "learning_rate": 4.781475064546814e-05, + "loss": 3.8466, + "step": 61470 + }, + { + "epoch": 4.17685826878652, + "grad_norm": 0.16298983991146088, + "learning_rate": 4.781050414458487e-05, + "loss": 3.9026, + "step": 61475 + }, + { + "epoch": 4.1771979888571815, + "grad_norm": 0.17694547772407532, + "learning_rate": 4.780625764370159e-05, + "loss": 4.0166, + "step": 61480 + }, + { + "epoch": 4.177537708927844, + "grad_norm": 0.23933355510234833, + "learning_rate": 4.780201114281832e-05, + "loss": 3.6961, + "step": 61485 + }, + { + "epoch": 4.177877428998505, + "grad_norm": 0.18903790414333344, + "learning_rate": 4.779776464193505e-05, + "loss": 3.8646, + "step": 61490 + }, + { + "epoch": 4.178217149069167, + "grad_norm": 4.762542247772217, + "learning_rate": 4.7793518141051774e-05, + "loss": 3.8866, + "step": 61495 + }, + { + "epoch": 4.178556869139829, + "grad_norm": 0.18044304847717285, + "learning_rate": 4.77892716401685e-05, + "loss": 3.7417, + "step": 61500 + }, + { + "epoch": 4.17889658921049, + "grad_norm": 0.2473597526550293, + "learning_rate": 4.7785025139285236e-05, + "loss": 3.816, + "step": 61505 + }, + { + "epoch": 4.179236309281152, + "grad_norm": 0.29933854937553406, + "learning_rate": 4.778077863840196e-05, + "loss": 3.7656, + "step": 61510 + }, + { + "epoch": 4.179576029351814, + "grad_norm": 0.17325834929943085, + "learning_rate": 4.7776532137518686e-05, + "loss": 3.7118, + "step": 61515 + }, + { + "epoch": 4.1799157494224755, + "grad_norm": 0.1584872603416443, + "learning_rate": 4.7772285636635414e-05, + "loss": 3.8655, + "step": 61520 + }, + { + "epoch": 4.1802554694931375, + "grad_norm": 0.2086939662694931, + "learning_rate": 4.776803913575214e-05, + "loss": 3.4945, + "step": 61525 + }, + { + "epoch": 4.1805951895638, + "grad_norm": 0.13347193598747253, + "learning_rate": 4.776379263486887e-05, + "loss": 4.0086, + "step": 61530 + }, + { + "epoch": 4.180934909634461, + "grad_norm": 0.18121525645256042, + "learning_rate": 4.77595461339856e-05, + "loss": 4.0803, + "step": 61535 + }, + { + "epoch": 4.181274629705123, + "grad_norm": 0.18682043254375458, + "learning_rate": 4.7755299633102326e-05, + "loss": 3.9019, + "step": 61540 + }, + { + "epoch": 4.181614349775785, + "grad_norm": 0.16878369450569153, + "learning_rate": 4.7751053132219054e-05, + "loss": 3.8888, + "step": 61545 + }, + { + "epoch": 4.181954069846446, + "grad_norm": 0.19410906732082367, + "learning_rate": 4.774680663133578e-05, + "loss": 3.8265, + "step": 61550 + }, + { + "epoch": 4.182293789917108, + "grad_norm": 0.20323066413402557, + "learning_rate": 4.774256013045251e-05, + "loss": 3.8432, + "step": 61555 + }, + { + "epoch": 4.18263350998777, + "grad_norm": 0.5980629920959473, + "learning_rate": 4.773831362956924e-05, + "loss": 3.6491, + "step": 61560 + }, + { + "epoch": 4.1829732300584315, + "grad_norm": 0.16356074810028076, + "learning_rate": 4.7734067128685966e-05, + "loss": 3.9598, + "step": 61565 + }, + { + "epoch": 4.1833129501290935, + "grad_norm": 0.24189883470535278, + "learning_rate": 4.7729820627802694e-05, + "loss": 3.8127, + "step": 61570 + }, + { + "epoch": 4.183652670199756, + "grad_norm": 0.16287033259868622, + "learning_rate": 4.772557412691942e-05, + "loss": 3.9199, + "step": 61575 + }, + { + "epoch": 4.183992390270417, + "grad_norm": 0.16445736587047577, + "learning_rate": 4.772132762603615e-05, + "loss": 4.0079, + "step": 61580 + }, + { + "epoch": 4.184332110341079, + "grad_norm": 0.13613535463809967, + "learning_rate": 4.771708112515287e-05, + "loss": 3.8717, + "step": 61585 + }, + { + "epoch": 4.184671830411741, + "grad_norm": 0.1328444927930832, + "learning_rate": 4.7712834624269606e-05, + "loss": 3.7837, + "step": 61590 + }, + { + "epoch": 4.185011550482402, + "grad_norm": 0.17061926424503326, + "learning_rate": 4.7708588123386334e-05, + "loss": 3.9421, + "step": 61595 + }, + { + "epoch": 4.185351270553064, + "grad_norm": 0.1572217494249344, + "learning_rate": 4.7704341622503055e-05, + "loss": 4.0384, + "step": 61600 + }, + { + "epoch": 4.185690990623726, + "grad_norm": 0.22308242321014404, + "learning_rate": 4.770009512161979e-05, + "loss": 3.8946, + "step": 61605 + }, + { + "epoch": 4.1860307106943875, + "grad_norm": 0.32672014832496643, + "learning_rate": 4.769584862073652e-05, + "loss": 3.9244, + "step": 61610 + }, + { + "epoch": 4.1863704307650496, + "grad_norm": 0.20076912641525269, + "learning_rate": 4.7691602119853246e-05, + "loss": 3.7718, + "step": 61615 + }, + { + "epoch": 4.186710150835712, + "grad_norm": 0.1966089904308319, + "learning_rate": 4.768735561896997e-05, + "loss": 3.8197, + "step": 61620 + }, + { + "epoch": 4.187049870906373, + "grad_norm": 0.1615491807460785, + "learning_rate": 4.76831091180867e-05, + "loss": 3.8912, + "step": 61625 + }, + { + "epoch": 4.187389590977035, + "grad_norm": 0.1824936419725418, + "learning_rate": 4.767886261720343e-05, + "loss": 3.965, + "step": 61630 + }, + { + "epoch": 4.187729311047697, + "grad_norm": 0.21087419986724854, + "learning_rate": 4.767461611632015e-05, + "loss": 3.6911, + "step": 61635 + }, + { + "epoch": 4.188069031118358, + "grad_norm": 0.1503649652004242, + "learning_rate": 4.7670369615436886e-05, + "loss": 3.9396, + "step": 61640 + }, + { + "epoch": 4.18840875118902, + "grad_norm": 0.1574837565422058, + "learning_rate": 4.7666123114553614e-05, + "loss": 3.71, + "step": 61645 + }, + { + "epoch": 4.188748471259682, + "grad_norm": 0.21799184381961823, + "learning_rate": 4.7661876613670335e-05, + "loss": 3.6971, + "step": 61650 + }, + { + "epoch": 4.1890881913303435, + "grad_norm": 0.17843301594257355, + "learning_rate": 4.765763011278706e-05, + "loss": 3.8561, + "step": 61655 + }, + { + "epoch": 4.189427911401006, + "grad_norm": 0.22304584085941315, + "learning_rate": 4.76533836119038e-05, + "loss": 3.712, + "step": 61660 + }, + { + "epoch": 4.189767631471668, + "grad_norm": 0.16032296419143677, + "learning_rate": 4.764913711102052e-05, + "loss": 3.9027, + "step": 61665 + }, + { + "epoch": 4.190107351542329, + "grad_norm": 0.1590634435415268, + "learning_rate": 4.764489061013725e-05, + "loss": 3.7643, + "step": 61670 + }, + { + "epoch": 4.190447071612991, + "grad_norm": 0.3241075277328491, + "learning_rate": 4.764064410925398e-05, + "loss": 3.7715, + "step": 61675 + }, + { + "epoch": 4.190786791683653, + "grad_norm": 0.16560499370098114, + "learning_rate": 4.76363976083707e-05, + "loss": 3.6336, + "step": 61680 + }, + { + "epoch": 4.191126511754314, + "grad_norm": 0.3603944480419159, + "learning_rate": 4.763215110748743e-05, + "loss": 3.6732, + "step": 61685 + }, + { + "epoch": 4.191466231824976, + "grad_norm": 0.22060255706310272, + "learning_rate": 4.762790460660416e-05, + "loss": 3.8659, + "step": 61690 + }, + { + "epoch": 4.191805951895638, + "grad_norm": 0.2813130021095276, + "learning_rate": 4.762365810572089e-05, + "loss": 4.0022, + "step": 61695 + }, + { + "epoch": 4.1921456719662995, + "grad_norm": 0.1749025136232376, + "learning_rate": 4.7619411604837615e-05, + "loss": 4.0028, + "step": 61700 + }, + { + "epoch": 4.192485392036962, + "grad_norm": 0.46385231614112854, + "learning_rate": 4.761516510395434e-05, + "loss": 3.7634, + "step": 61705 + }, + { + "epoch": 4.192825112107624, + "grad_norm": 0.15739236772060394, + "learning_rate": 4.761091860307107e-05, + "loss": 3.7208, + "step": 61710 + }, + { + "epoch": 4.193164832178285, + "grad_norm": 0.1684381663799286, + "learning_rate": 4.76066721021878e-05, + "loss": 3.6377, + "step": 61715 + }, + { + "epoch": 4.193504552248947, + "grad_norm": 0.15655721724033356, + "learning_rate": 4.760242560130453e-05, + "loss": 3.6486, + "step": 61720 + }, + { + "epoch": 4.193844272319609, + "grad_norm": 0.1637568324804306, + "learning_rate": 4.7598179100421255e-05, + "loss": 3.8324, + "step": 61725 + }, + { + "epoch": 4.19418399239027, + "grad_norm": 0.14091481268405914, + "learning_rate": 4.759393259953798e-05, + "loss": 3.9034, + "step": 61730 + }, + { + "epoch": 4.194523712460932, + "grad_norm": 0.18841229379177094, + "learning_rate": 4.758968609865471e-05, + "loss": 3.8472, + "step": 61735 + }, + { + "epoch": 4.194863432531594, + "grad_norm": 0.16842369735240936, + "learning_rate": 4.758543959777144e-05, + "loss": 3.7504, + "step": 61740 + }, + { + "epoch": 4.1952031526022555, + "grad_norm": 0.3986678123474121, + "learning_rate": 4.758119309688817e-05, + "loss": 3.8629, + "step": 61745 + }, + { + "epoch": 4.195542872672918, + "grad_norm": 0.13265667855739594, + "learning_rate": 4.7576946596004895e-05, + "loss": 4.0509, + "step": 61750 + }, + { + "epoch": 4.19588259274358, + "grad_norm": 0.47683849930763245, + "learning_rate": 4.7572700095121616e-05, + "loss": 4.0083, + "step": 61755 + }, + { + "epoch": 4.196222312814241, + "grad_norm": 0.20046815276145935, + "learning_rate": 4.756845359423835e-05, + "loss": 3.8223, + "step": 61760 + }, + { + "epoch": 4.196562032884903, + "grad_norm": 0.14805994927883148, + "learning_rate": 4.756420709335508e-05, + "loss": 3.8943, + "step": 61765 + }, + { + "epoch": 4.196901752955565, + "grad_norm": 1.2474452257156372, + "learning_rate": 4.75599605924718e-05, + "loss": 3.9732, + "step": 61770 + }, + { + "epoch": 4.197241473026226, + "grad_norm": 0.41101813316345215, + "learning_rate": 4.7555714091588535e-05, + "loss": 3.8055, + "step": 61775 + }, + { + "epoch": 4.197581193096888, + "grad_norm": 0.1393548995256424, + "learning_rate": 4.755146759070526e-05, + "loss": 3.8377, + "step": 61780 + }, + { + "epoch": 4.19792091316755, + "grad_norm": 0.2136000245809555, + "learning_rate": 4.754722108982199e-05, + "loss": 3.7531, + "step": 61785 + }, + { + "epoch": 4.1982606332382115, + "grad_norm": 0.15094716846942902, + "learning_rate": 4.754297458893871e-05, + "loss": 3.981, + "step": 61790 + }, + { + "epoch": 4.198600353308874, + "grad_norm": 0.20076553523540497, + "learning_rate": 4.753872808805545e-05, + "loss": 3.7339, + "step": 61795 + }, + { + "epoch": 4.198940073379536, + "grad_norm": 0.18177369236946106, + "learning_rate": 4.7534481587172175e-05, + "loss": 3.9875, + "step": 61800 + }, + { + "epoch": 4.199279793450197, + "grad_norm": 0.2378981113433838, + "learning_rate": 4.7530235086288896e-05, + "loss": 4.0487, + "step": 61805 + }, + { + "epoch": 4.199619513520859, + "grad_norm": 0.20396308600902557, + "learning_rate": 4.752598858540563e-05, + "loss": 3.8739, + "step": 61810 + }, + { + "epoch": 4.199959233591521, + "grad_norm": 0.15608173608779907, + "learning_rate": 4.752174208452236e-05, + "loss": 3.9531, + "step": 61815 + }, + { + "epoch": 4.200298953662182, + "grad_norm": 0.22123640775680542, + "learning_rate": 4.751749558363908e-05, + "loss": 3.9592, + "step": 61820 + }, + { + "epoch": 4.200638673732844, + "grad_norm": 0.17888742685317993, + "learning_rate": 4.751324908275581e-05, + "loss": 3.8866, + "step": 61825 + }, + { + "epoch": 4.200978393803506, + "grad_norm": 0.23540030419826508, + "learning_rate": 4.750900258187254e-05, + "loss": 4.0275, + "step": 61830 + }, + { + "epoch": 4.2013181138741675, + "grad_norm": 0.24781765043735504, + "learning_rate": 4.7504756080989265e-05, + "loss": 3.8627, + "step": 61835 + }, + { + "epoch": 4.20165783394483, + "grad_norm": 0.17360106110572815, + "learning_rate": 4.750135888028265e-05, + "loss": 3.7178, + "step": 61840 + }, + { + "epoch": 4.201997554015492, + "grad_norm": 0.3955538868904114, + "learning_rate": 4.7497112379399375e-05, + "loss": 3.782, + "step": 61845 + }, + { + "epoch": 4.202337274086153, + "grad_norm": 0.15550316870212555, + "learning_rate": 4.749286587851611e-05, + "loss": 3.7741, + "step": 61850 + }, + { + "epoch": 4.202676994156815, + "grad_norm": 0.3993319869041443, + "learning_rate": 4.748861937763283e-05, + "loss": 3.6771, + "step": 61855 + }, + { + "epoch": 4.203016714227476, + "grad_norm": 0.19497795403003693, + "learning_rate": 4.748437287674956e-05, + "loss": 3.8086, + "step": 61860 + }, + { + "epoch": 4.203356434298138, + "grad_norm": 0.1535976231098175, + "learning_rate": 4.7480126375866294e-05, + "loss": 3.7399, + "step": 61865 + }, + { + "epoch": 4.2036961543688, + "grad_norm": 0.1946563720703125, + "learning_rate": 4.7475879874983015e-05, + "loss": 3.7271, + "step": 61870 + }, + { + "epoch": 4.2040358744394615, + "grad_norm": 0.31946462392807007, + "learning_rate": 4.747163337409974e-05, + "loss": 3.5886, + "step": 61875 + }, + { + "epoch": 4.2043755945101235, + "grad_norm": 0.21349681913852692, + "learning_rate": 4.746738687321647e-05, + "loss": 3.9618, + "step": 61880 + }, + { + "epoch": 4.204715314580786, + "grad_norm": 0.14349541068077087, + "learning_rate": 4.74631403723332e-05, + "loss": 3.8998, + "step": 61885 + }, + { + "epoch": 4.205055034651447, + "grad_norm": 0.18453708291053772, + "learning_rate": 4.745889387144993e-05, + "loss": 3.875, + "step": 61890 + }, + { + "epoch": 4.205394754722109, + "grad_norm": 0.21414145827293396, + "learning_rate": 4.7454647370566655e-05, + "loss": 3.8367, + "step": 61895 + }, + { + "epoch": 4.205734474792771, + "grad_norm": 0.9592486619949341, + "learning_rate": 4.745040086968338e-05, + "loss": 3.8618, + "step": 61900 + }, + { + "epoch": 4.206074194863432, + "grad_norm": 0.5487968921661377, + "learning_rate": 4.744615436880011e-05, + "loss": 3.5772, + "step": 61905 + }, + { + "epoch": 4.206413914934094, + "grad_norm": 0.164751335978508, + "learning_rate": 4.744190786791684e-05, + "loss": 3.8226, + "step": 61910 + }, + { + "epoch": 4.206753635004756, + "grad_norm": 0.1445409208536148, + "learning_rate": 4.743766136703357e-05, + "loss": 3.9404, + "step": 61915 + }, + { + "epoch": 4.2070933550754175, + "grad_norm": 0.21032489836215973, + "learning_rate": 4.7433414866150295e-05, + "loss": 4.0013, + "step": 61920 + }, + { + "epoch": 4.20743307514608, + "grad_norm": 0.2139468491077423, + "learning_rate": 4.742916836526702e-05, + "loss": 3.8323, + "step": 61925 + }, + { + "epoch": 4.207772795216742, + "grad_norm": 1.1671611070632935, + "learning_rate": 4.742492186438375e-05, + "loss": 3.6777, + "step": 61930 + }, + { + "epoch": 4.208112515287403, + "grad_norm": 1.1310328245162964, + "learning_rate": 4.742067536350048e-05, + "loss": 3.8867, + "step": 61935 + }, + { + "epoch": 4.208452235358065, + "grad_norm": 0.1875639408826828, + "learning_rate": 4.741642886261721e-05, + "loss": 3.8463, + "step": 61940 + }, + { + "epoch": 4.208791955428727, + "grad_norm": 0.14576447010040283, + "learning_rate": 4.741218236173393e-05, + "loss": 3.7145, + "step": 61945 + }, + { + "epoch": 4.209131675499388, + "grad_norm": 0.1649259328842163, + "learning_rate": 4.740793586085066e-05, + "loss": 4.0161, + "step": 61950 + }, + { + "epoch": 4.20947139557005, + "grad_norm": 0.19794239103794098, + "learning_rate": 4.740368935996739e-05, + "loss": 3.8876, + "step": 61955 + }, + { + "epoch": 4.209811115640712, + "grad_norm": 0.17640502750873566, + "learning_rate": 4.739944285908411e-05, + "loss": 3.8157, + "step": 61960 + }, + { + "epoch": 4.2101508357113735, + "grad_norm": 0.2392888218164444, + "learning_rate": 4.739519635820085e-05, + "loss": 3.996, + "step": 61965 + }, + { + "epoch": 4.210490555782036, + "grad_norm": 0.25152403116226196, + "learning_rate": 4.7390949857317575e-05, + "loss": 3.6862, + "step": 61970 + }, + { + "epoch": 4.210830275852698, + "grad_norm": 0.14200939238071442, + "learning_rate": 4.7386703356434296e-05, + "loss": 3.8105, + "step": 61975 + }, + { + "epoch": 4.211169995923359, + "grad_norm": 0.188751220703125, + "learning_rate": 4.7382456855551024e-05, + "loss": 3.7089, + "step": 61980 + }, + { + "epoch": 4.211509715994021, + "grad_norm": 3.452327013015747, + "learning_rate": 4.737821035466776e-05, + "loss": 3.8383, + "step": 61985 + }, + { + "epoch": 4.211849436064683, + "grad_norm": 0.2065172642469406, + "learning_rate": 4.737396385378449e-05, + "loss": 3.81, + "step": 61990 + }, + { + "epoch": 4.212189156135344, + "grad_norm": 0.20874954760074615, + "learning_rate": 4.736971735290121e-05, + "loss": 3.8431, + "step": 61995 + }, + { + "epoch": 4.212528876206006, + "grad_norm": 0.26623907685279846, + "learning_rate": 4.736547085201794e-05, + "loss": 3.8714, + "step": 62000 + }, + { + "epoch": 4.212868596276668, + "grad_norm": 0.19300086796283722, + "learning_rate": 4.736122435113467e-05, + "loss": 3.942, + "step": 62005 + }, + { + "epoch": 4.2132083163473295, + "grad_norm": 0.2731533944606781, + "learning_rate": 4.735697785025139e-05, + "loss": 3.7768, + "step": 62010 + }, + { + "epoch": 4.213548036417992, + "grad_norm": 0.1258469671010971, + "learning_rate": 4.735273134936812e-05, + "loss": 3.8569, + "step": 62015 + }, + { + "epoch": 4.213887756488654, + "grad_norm": 0.22985780239105225, + "learning_rate": 4.7348484848484855e-05, + "loss": 3.9869, + "step": 62020 + }, + { + "epoch": 4.214227476559315, + "grad_norm": 0.18809469044208527, + "learning_rate": 4.7344238347601576e-05, + "loss": 3.9922, + "step": 62025 + }, + { + "epoch": 4.214567196629977, + "grad_norm": 0.1605890542268753, + "learning_rate": 4.7339991846718304e-05, + "loss": 3.8025, + "step": 62030 + }, + { + "epoch": 4.214906916700639, + "grad_norm": 0.22718292474746704, + "learning_rate": 4.733574534583504e-05, + "loss": 3.8839, + "step": 62035 + }, + { + "epoch": 4.2152466367713, + "grad_norm": 0.1682772934436798, + "learning_rate": 4.733149884495176e-05, + "loss": 3.5292, + "step": 62040 + }, + { + "epoch": 4.215586356841962, + "grad_norm": 0.17261968553066254, + "learning_rate": 4.732725234406849e-05, + "loss": 3.9666, + "step": 62045 + }, + { + "epoch": 4.215926076912624, + "grad_norm": 0.14531999826431274, + "learning_rate": 4.7323005843185216e-05, + "loss": 3.5818, + "step": 62050 + }, + { + "epoch": 4.2162657969832855, + "grad_norm": 0.24773533642292023, + "learning_rate": 4.7318759342301944e-05, + "loss": 3.583, + "step": 62055 + }, + { + "epoch": 4.216605517053948, + "grad_norm": 0.18722078204154968, + "learning_rate": 4.731451284141867e-05, + "loss": 3.8899, + "step": 62060 + }, + { + "epoch": 4.21694523712461, + "grad_norm": 0.18182751536369324, + "learning_rate": 4.73102663405354e-05, + "loss": 3.8277, + "step": 62065 + }, + { + "epoch": 4.217284957195271, + "grad_norm": 0.21273858845233917, + "learning_rate": 4.730601983965213e-05, + "loss": 4.0262, + "step": 62070 + }, + { + "epoch": 4.217624677265933, + "grad_norm": 0.16179652512073517, + "learning_rate": 4.7301773338768857e-05, + "loss": 3.9474, + "step": 62075 + }, + { + "epoch": 4.217964397336595, + "grad_norm": 0.2029334157705307, + "learning_rate": 4.7297526837885585e-05, + "loss": 3.6218, + "step": 62080 + }, + { + "epoch": 4.218304117407256, + "grad_norm": 0.19851376116275787, + "learning_rate": 4.729328033700231e-05, + "loss": 3.9831, + "step": 62085 + }, + { + "epoch": 4.218643837477918, + "grad_norm": 0.1958344578742981, + "learning_rate": 4.728903383611904e-05, + "loss": 3.7931, + "step": 62090 + }, + { + "epoch": 4.21898355754858, + "grad_norm": 0.1738799512386322, + "learning_rate": 4.728478733523577e-05, + "loss": 3.8039, + "step": 62095 + }, + { + "epoch": 4.2193232776192415, + "grad_norm": 0.1781737208366394, + "learning_rate": 4.7280540834352497e-05, + "loss": 3.8418, + "step": 62100 + }, + { + "epoch": 4.219662997689904, + "grad_norm": 0.13164588809013367, + "learning_rate": 4.7276294333469225e-05, + "loss": 3.9365, + "step": 62105 + }, + { + "epoch": 4.220002717760566, + "grad_norm": 0.17090697586536407, + "learning_rate": 4.727204783258595e-05, + "loss": 3.6824, + "step": 62110 + }, + { + "epoch": 4.220342437831227, + "grad_norm": 0.4364991784095764, + "learning_rate": 4.7267801331702674e-05, + "loss": 3.7352, + "step": 62115 + }, + { + "epoch": 4.220682157901889, + "grad_norm": 0.15069134533405304, + "learning_rate": 4.726355483081941e-05, + "loss": 3.9379, + "step": 62120 + }, + { + "epoch": 4.221021877972551, + "grad_norm": 0.1697666347026825, + "learning_rate": 4.7259308329936137e-05, + "loss": 3.9415, + "step": 62125 + }, + { + "epoch": 4.221361598043212, + "grad_norm": 0.17641274631023407, + "learning_rate": 4.725506182905286e-05, + "loss": 3.9485, + "step": 62130 + }, + { + "epoch": 4.221701318113874, + "grad_norm": 0.17579427361488342, + "learning_rate": 4.725081532816959e-05, + "loss": 3.6457, + "step": 62135 + }, + { + "epoch": 4.222041038184536, + "grad_norm": 0.2235265076160431, + "learning_rate": 4.724656882728632e-05, + "loss": 3.6655, + "step": 62140 + }, + { + "epoch": 4.2223807582551975, + "grad_norm": 0.15498751401901245, + "learning_rate": 4.724232232640304e-05, + "loss": 3.8362, + "step": 62145 + }, + { + "epoch": 4.22272047832586, + "grad_norm": 0.20893952250480652, + "learning_rate": 4.723807582551977e-05, + "loss": 4.0037, + "step": 62150 + }, + { + "epoch": 4.223060198396522, + "grad_norm": 0.1653960794210434, + "learning_rate": 4.7233829324636505e-05, + "loss": 3.9037, + "step": 62155 + }, + { + "epoch": 4.223399918467183, + "grad_norm": 0.6773374080657959, + "learning_rate": 4.722958282375323e-05, + "loss": 3.7605, + "step": 62160 + }, + { + "epoch": 4.223739638537845, + "grad_norm": 0.17761127650737762, + "learning_rate": 4.7225336322869954e-05, + "loss": 3.7052, + "step": 62165 + }, + { + "epoch": 4.224079358608506, + "grad_norm": 0.28900855779647827, + "learning_rate": 4.722108982198669e-05, + "loss": 3.6457, + "step": 62170 + }, + { + "epoch": 4.224419078679168, + "grad_norm": 0.1371610015630722, + "learning_rate": 4.721684332110342e-05, + "loss": 3.6555, + "step": 62175 + }, + { + "epoch": 4.22475879874983, + "grad_norm": 0.1954209804534912, + "learning_rate": 4.721259682022014e-05, + "loss": 3.9708, + "step": 62180 + }, + { + "epoch": 4.2250985188204915, + "grad_norm": 0.32753077149391174, + "learning_rate": 4.7208350319336866e-05, + "loss": 3.6772, + "step": 62185 + }, + { + "epoch": 4.2254382388911536, + "grad_norm": 0.22855016589164734, + "learning_rate": 4.72041038184536e-05, + "loss": 4.0036, + "step": 62190 + }, + { + "epoch": 4.225777958961816, + "grad_norm": 0.16955219209194183, + "learning_rate": 4.719985731757032e-05, + "loss": 3.7419, + "step": 62195 + }, + { + "epoch": 4.226117679032477, + "grad_norm": 0.1656356006860733, + "learning_rate": 4.719561081668705e-05, + "loss": 3.9637, + "step": 62200 + }, + { + "epoch": 4.226457399103139, + "grad_norm": 0.184284046292305, + "learning_rate": 4.7191364315803785e-05, + "loss": 3.8735, + "step": 62205 + }, + { + "epoch": 4.226797119173801, + "grad_norm": 0.17346160113811493, + "learning_rate": 4.7187117814920506e-05, + "loss": 3.627, + "step": 62210 + }, + { + "epoch": 4.227136839244462, + "grad_norm": 0.5812570452690125, + "learning_rate": 4.7182871314037234e-05, + "loss": 3.6903, + "step": 62215 + }, + { + "epoch": 4.227476559315124, + "grad_norm": 0.16624999046325684, + "learning_rate": 4.717862481315397e-05, + "loss": 3.9052, + "step": 62220 + }, + { + "epoch": 4.227816279385786, + "grad_norm": 0.16588030755519867, + "learning_rate": 4.717437831227069e-05, + "loss": 3.6972, + "step": 62225 + }, + { + "epoch": 4.2281559994564475, + "grad_norm": 0.6359578967094421, + "learning_rate": 4.717013181138742e-05, + "loss": 3.7733, + "step": 62230 + }, + { + "epoch": 4.22849571952711, + "grad_norm": 0.17906685173511505, + "learning_rate": 4.7165885310504146e-05, + "loss": 3.969, + "step": 62235 + }, + { + "epoch": 4.228835439597772, + "grad_norm": 0.3018489480018616, + "learning_rate": 4.7161638809620874e-05, + "loss": 3.85, + "step": 62240 + }, + { + "epoch": 4.229175159668433, + "grad_norm": 0.16917048394680023, + "learning_rate": 4.71573923087376e-05, + "loss": 4.1687, + "step": 62245 + }, + { + "epoch": 4.229514879739095, + "grad_norm": 0.16355890035629272, + "learning_rate": 4.715314580785433e-05, + "loss": 3.8067, + "step": 62250 + }, + { + "epoch": 4.229854599809757, + "grad_norm": 0.21995963156223297, + "learning_rate": 4.714889930697106e-05, + "loss": 3.6827, + "step": 62255 + }, + { + "epoch": 4.230194319880418, + "grad_norm": 0.14400887489318848, + "learning_rate": 4.7144652806087786e-05, + "loss": 4.0993, + "step": 62260 + }, + { + "epoch": 4.23053403995108, + "grad_norm": 0.20318056643009186, + "learning_rate": 4.7140406305204514e-05, + "loss": 3.7543, + "step": 62265 + }, + { + "epoch": 4.230873760021742, + "grad_norm": 0.20148865878582, + "learning_rate": 4.713615980432124e-05, + "loss": 3.7509, + "step": 62270 + }, + { + "epoch": 4.2312134800924035, + "grad_norm": 0.16811344027519226, + "learning_rate": 4.713191330343797e-05, + "loss": 3.9649, + "step": 62275 + }, + { + "epoch": 4.231553200163066, + "grad_norm": 0.19745102524757385, + "learning_rate": 4.71276668025547e-05, + "loss": 4.0638, + "step": 62280 + }, + { + "epoch": 4.231892920233728, + "grad_norm": 0.14338520169258118, + "learning_rate": 4.712342030167142e-05, + "loss": 4.0039, + "step": 62285 + }, + { + "epoch": 4.232232640304389, + "grad_norm": 0.5526770949363708, + "learning_rate": 4.7119173800788154e-05, + "loss": 3.9511, + "step": 62290 + }, + { + "epoch": 4.232572360375051, + "grad_norm": 0.2230692058801651, + "learning_rate": 4.711492729990488e-05, + "loss": 3.9685, + "step": 62295 + }, + { + "epoch": 4.232912080445713, + "grad_norm": 0.19238248467445374, + "learning_rate": 4.71106807990216e-05, + "loss": 3.7519, + "step": 62300 + }, + { + "epoch": 4.233251800516374, + "grad_norm": 0.1957177221775055, + "learning_rate": 4.710643429813834e-05, + "loss": 3.6939, + "step": 62305 + }, + { + "epoch": 4.233591520587036, + "grad_norm": 0.18333721160888672, + "learning_rate": 4.7102187797255066e-05, + "loss": 3.5906, + "step": 62310 + }, + { + "epoch": 4.233931240657698, + "grad_norm": 0.7019594311714172, + "learning_rate": 4.709794129637179e-05, + "loss": 3.871, + "step": 62315 + }, + { + "epoch": 4.2342709607283595, + "grad_norm": 0.17249338328838348, + "learning_rate": 4.709369479548852e-05, + "loss": 3.624, + "step": 62320 + }, + { + "epoch": 4.234610680799022, + "grad_norm": 0.2126917690038681, + "learning_rate": 4.708944829460525e-05, + "loss": 3.6685, + "step": 62325 + }, + { + "epoch": 4.234950400869684, + "grad_norm": 0.17723995447158813, + "learning_rate": 4.708520179372198e-05, + "loss": 3.7318, + "step": 62330 + }, + { + "epoch": 4.235290120940345, + "grad_norm": 0.14405709505081177, + "learning_rate": 4.70809552928387e-05, + "loss": 3.9599, + "step": 62335 + }, + { + "epoch": 4.235629841011007, + "grad_norm": 0.1555628627538681, + "learning_rate": 4.7076708791955434e-05, + "loss": 3.9109, + "step": 62340 + }, + { + "epoch": 4.235969561081669, + "grad_norm": 0.1893998235464096, + "learning_rate": 4.707246229107216e-05, + "loss": 3.6197, + "step": 62345 + }, + { + "epoch": 4.23630928115233, + "grad_norm": 0.32623258233070374, + "learning_rate": 4.7068215790188883e-05, + "loss": 3.7753, + "step": 62350 + }, + { + "epoch": 4.236649001222992, + "grad_norm": 0.1451745331287384, + "learning_rate": 4.706396928930562e-05, + "loss": 3.7676, + "step": 62355 + }, + { + "epoch": 4.236988721293654, + "grad_norm": 0.19803819060325623, + "learning_rate": 4.7059722788422346e-05, + "loss": 3.7308, + "step": 62360 + }, + { + "epoch": 4.2373284413643155, + "grad_norm": 0.1513693630695343, + "learning_rate": 4.705547628753907e-05, + "loss": 3.8706, + "step": 62365 + }, + { + "epoch": 4.237668161434978, + "grad_norm": 0.2173575907945633, + "learning_rate": 4.7051229786655795e-05, + "loss": 3.811, + "step": 62370 + }, + { + "epoch": 4.23800788150564, + "grad_norm": 0.20374470949172974, + "learning_rate": 4.704698328577253e-05, + "loss": 3.7727, + "step": 62375 + }, + { + "epoch": 4.238347601576301, + "grad_norm": 0.18278978765010834, + "learning_rate": 4.704273678488925e-05, + "loss": 3.8353, + "step": 62380 + }, + { + "epoch": 4.238687321646963, + "grad_norm": 0.17803283035755157, + "learning_rate": 4.703849028400598e-05, + "loss": 3.7591, + "step": 62385 + }, + { + "epoch": 4.239027041717625, + "grad_norm": 0.18127094209194183, + "learning_rate": 4.7034243783122714e-05, + "loss": 3.7972, + "step": 62390 + }, + { + "epoch": 4.239366761788286, + "grad_norm": 0.1761433631181717, + "learning_rate": 4.7029997282239435e-05, + "loss": 3.9405, + "step": 62395 + }, + { + "epoch": 4.239706481858948, + "grad_norm": 0.22721855342388153, + "learning_rate": 4.7025750781356163e-05, + "loss": 3.7911, + "step": 62400 + }, + { + "epoch": 4.24004620192961, + "grad_norm": 0.20756718516349792, + "learning_rate": 4.702150428047289e-05, + "loss": 3.8867, + "step": 62405 + }, + { + "epoch": 4.2403859220002715, + "grad_norm": 0.18347430229187012, + "learning_rate": 4.701725777958962e-05, + "loss": 3.9637, + "step": 62410 + }, + { + "epoch": 4.240725642070934, + "grad_norm": 0.17065037786960602, + "learning_rate": 4.701301127870635e-05, + "loss": 3.9028, + "step": 62415 + }, + { + "epoch": 4.241065362141596, + "grad_norm": 0.15850839018821716, + "learning_rate": 4.7008764777823076e-05, + "loss": 3.8962, + "step": 62420 + }, + { + "epoch": 4.241405082212257, + "grad_norm": 0.20607580244541168, + "learning_rate": 4.7004518276939804e-05, + "loss": 3.8933, + "step": 62425 + }, + { + "epoch": 4.241744802282919, + "grad_norm": 0.16016454994678497, + "learning_rate": 4.700027177605653e-05, + "loss": 3.6662, + "step": 62430 + }, + { + "epoch": 4.242084522353581, + "grad_norm": 0.2108292430639267, + "learning_rate": 4.699602527517326e-05, + "loss": 3.6842, + "step": 62435 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.17018334567546844, + "learning_rate": 4.699177877428999e-05, + "loss": 3.8601, + "step": 62440 + }, + { + "epoch": 4.242763962494904, + "grad_norm": 0.16266648471355438, + "learning_rate": 4.6987532273406716e-05, + "loss": 4.0589, + "step": 62445 + }, + { + "epoch": 4.243103682565566, + "grad_norm": 0.17320841550827026, + "learning_rate": 4.6983285772523444e-05, + "loss": 3.8102, + "step": 62450 + }, + { + "epoch": 4.2434434026362275, + "grad_norm": 0.1982216238975525, + "learning_rate": 4.697903927164017e-05, + "loss": 3.9462, + "step": 62455 + }, + { + "epoch": 4.24378312270689, + "grad_norm": 0.1328074336051941, + "learning_rate": 4.69747927707569e-05, + "loss": 3.8361, + "step": 62460 + }, + { + "epoch": 4.244122842777552, + "grad_norm": 0.37775862216949463, + "learning_rate": 4.697054626987363e-05, + "loss": 3.8817, + "step": 62465 + }, + { + "epoch": 4.244462562848213, + "grad_norm": 0.18837128579616547, + "learning_rate": 4.696629976899035e-05, + "loss": 3.6257, + "step": 62470 + }, + { + "epoch": 4.244802282918875, + "grad_norm": 0.16906042397022247, + "learning_rate": 4.6962053268107084e-05, + "loss": 3.7975, + "step": 62475 + }, + { + "epoch": 4.245142002989537, + "grad_norm": 0.17511536180973053, + "learning_rate": 4.695780676722381e-05, + "loss": 3.7596, + "step": 62480 + }, + { + "epoch": 4.245481723060198, + "grad_norm": 0.21341583132743835, + "learning_rate": 4.695356026634053e-05, + "loss": 3.6266, + "step": 62485 + }, + { + "epoch": 4.24582144313086, + "grad_norm": 0.2111874520778656, + "learning_rate": 4.694931376545727e-05, + "loss": 3.5787, + "step": 62490 + }, + { + "epoch": 4.246161163201522, + "grad_norm": 0.17370326817035675, + "learning_rate": 4.6945067264573996e-05, + "loss": 3.7474, + "step": 62495 + }, + { + "epoch": 4.246500883272184, + "grad_norm": 0.18821144104003906, + "learning_rate": 4.6940820763690724e-05, + "loss": 3.773, + "step": 62500 + }, + { + "epoch": 4.246840603342846, + "grad_norm": 0.1887567788362503, + "learning_rate": 4.6936574262807445e-05, + "loss": 3.8061, + "step": 62505 + }, + { + "epoch": 4.247180323413508, + "grad_norm": 0.24751615524291992, + "learning_rate": 4.693232776192418e-05, + "loss": 3.9784, + "step": 62510 + }, + { + "epoch": 4.247520043484169, + "grad_norm": 0.5497080683708191, + "learning_rate": 4.692808126104091e-05, + "loss": 3.8591, + "step": 62515 + }, + { + "epoch": 4.247859763554831, + "grad_norm": 0.18040022253990173, + "learning_rate": 4.692383476015763e-05, + "loss": 3.65, + "step": 62520 + }, + { + "epoch": 4.248199483625493, + "grad_norm": 0.22720059752464294, + "learning_rate": 4.6919588259274364e-05, + "loss": 3.6535, + "step": 62525 + }, + { + "epoch": 4.248539203696154, + "grad_norm": 0.21973945200443268, + "learning_rate": 4.691534175839109e-05, + "loss": 3.7221, + "step": 62530 + }, + { + "epoch": 4.248878923766816, + "grad_norm": 0.4171205759048462, + "learning_rate": 4.691109525750781e-05, + "loss": 3.704, + "step": 62535 + }, + { + "epoch": 4.2492186438374775, + "grad_norm": 0.1827021986246109, + "learning_rate": 4.690684875662454e-05, + "loss": 3.8928, + "step": 62540 + }, + { + "epoch": 4.24955836390814, + "grad_norm": 0.16609370708465576, + "learning_rate": 4.6902602255741276e-05, + "loss": 3.7738, + "step": 62545 + }, + { + "epoch": 4.249898083978802, + "grad_norm": 0.16231779754161835, + "learning_rate": 4.6898355754858e-05, + "loss": 3.9043, + "step": 62550 + }, + { + "epoch": 4.250237804049463, + "grad_norm": 0.15706367790699005, + "learning_rate": 4.6894109253974725e-05, + "loss": 3.6348, + "step": 62555 + }, + { + "epoch": 4.250577524120125, + "grad_norm": 0.1450299769639969, + "learning_rate": 4.688986275309146e-05, + "loss": 3.5035, + "step": 62560 + }, + { + "epoch": 4.250917244190787, + "grad_norm": 0.2677094638347626, + "learning_rate": 4.688561625220818e-05, + "loss": 3.9802, + "step": 62565 + }, + { + "epoch": 4.251256964261448, + "grad_norm": 0.1719151735305786, + "learning_rate": 4.688136975132491e-05, + "loss": 3.8999, + "step": 62570 + }, + { + "epoch": 4.25159668433211, + "grad_norm": 0.1722109615802765, + "learning_rate": 4.687712325044164e-05, + "loss": 3.8894, + "step": 62575 + }, + { + "epoch": 4.251936404402772, + "grad_norm": 0.16963207721710205, + "learning_rate": 4.6872876749558365e-05, + "loss": 3.6972, + "step": 62580 + }, + { + "epoch": 4.2522761244734335, + "grad_norm": 0.16342155635356903, + "learning_rate": 4.686863024867509e-05, + "loss": 3.935, + "step": 62585 + }, + { + "epoch": 4.252615844544096, + "grad_norm": 0.2113378643989563, + "learning_rate": 4.686438374779182e-05, + "loss": 3.9299, + "step": 62590 + }, + { + "epoch": 4.252955564614758, + "grad_norm": 0.21905502676963806, + "learning_rate": 4.686013724690855e-05, + "loss": 3.6837, + "step": 62595 + }, + { + "epoch": 4.253295284685419, + "grad_norm": 0.17267091572284698, + "learning_rate": 4.685589074602528e-05, + "loss": 3.6463, + "step": 62600 + }, + { + "epoch": 4.253635004756081, + "grad_norm": 0.22168384492397308, + "learning_rate": 4.6851644245142005e-05, + "loss": 3.6554, + "step": 62605 + }, + { + "epoch": 4.253974724826743, + "grad_norm": 0.2090248018503189, + "learning_rate": 4.684739774425873e-05, + "loss": 3.8103, + "step": 62610 + }, + { + "epoch": 4.254314444897404, + "grad_norm": 0.5489887595176697, + "learning_rate": 4.684315124337546e-05, + "loss": 3.7764, + "step": 62615 + }, + { + "epoch": 4.254654164968066, + "grad_norm": 0.2024175226688385, + "learning_rate": 4.683890474249219e-05, + "loss": 3.8041, + "step": 62620 + }, + { + "epoch": 4.254993885038728, + "grad_norm": 0.17926116287708282, + "learning_rate": 4.683465824160892e-05, + "loss": 3.848, + "step": 62625 + }, + { + "epoch": 4.2553336051093895, + "grad_norm": 0.15931348502635956, + "learning_rate": 4.6830411740725645e-05, + "loss": 3.9931, + "step": 62630 + }, + { + "epoch": 4.255673325180052, + "grad_norm": 0.18655861914157867, + "learning_rate": 4.682616523984237e-05, + "loss": 3.8528, + "step": 62635 + }, + { + "epoch": 4.256013045250714, + "grad_norm": 0.24003875255584717, + "learning_rate": 4.6821918738959094e-05, + "loss": 3.6153, + "step": 62640 + }, + { + "epoch": 4.256352765321375, + "grad_norm": 0.17049041390419006, + "learning_rate": 4.681767223807583e-05, + "loss": 3.7951, + "step": 62645 + }, + { + "epoch": 4.256692485392037, + "grad_norm": 0.1837928146123886, + "learning_rate": 4.681342573719256e-05, + "loss": 3.8495, + "step": 62650 + }, + { + "epoch": 4.257032205462699, + "grad_norm": 0.1553886979818344, + "learning_rate": 4.680917923630928e-05, + "loss": 3.8763, + "step": 62655 + }, + { + "epoch": 4.25737192553336, + "grad_norm": 0.1428597867488861, + "learning_rate": 4.680493273542601e-05, + "loss": 3.5557, + "step": 62660 + }, + { + "epoch": 4.257711645604022, + "grad_norm": 0.2664981484413147, + "learning_rate": 4.680068623454274e-05, + "loss": 3.9269, + "step": 62665 + }, + { + "epoch": 4.258051365674684, + "grad_norm": 0.2130032181739807, + "learning_rate": 4.679643973365947e-05, + "loss": 4.0903, + "step": 62670 + }, + { + "epoch": 4.2583910857453455, + "grad_norm": 0.17132769525051117, + "learning_rate": 4.679219323277619e-05, + "loss": 3.7568, + "step": 62675 + }, + { + "epoch": 4.258730805816008, + "grad_norm": 0.23723794519901276, + "learning_rate": 4.6787946731892925e-05, + "loss": 3.7647, + "step": 62680 + }, + { + "epoch": 4.25907052588667, + "grad_norm": 0.16796308755874634, + "learning_rate": 4.678370023100965e-05, + "loss": 3.669, + "step": 62685 + }, + { + "epoch": 4.259410245957331, + "grad_norm": 0.2084510773420334, + "learning_rate": 4.6779453730126374e-05, + "loss": 3.6552, + "step": 62690 + }, + { + "epoch": 4.259749966027993, + "grad_norm": 0.170313760638237, + "learning_rate": 4.677520722924311e-05, + "loss": 3.6631, + "step": 62695 + }, + { + "epoch": 4.260089686098655, + "grad_norm": 0.14304374158382416, + "learning_rate": 4.677096072835984e-05, + "loss": 3.9327, + "step": 62700 + }, + { + "epoch": 4.260429406169316, + "grad_norm": 0.2532387971878052, + "learning_rate": 4.676671422747656e-05, + "loss": 3.9881, + "step": 62705 + }, + { + "epoch": 4.260769126239978, + "grad_norm": 0.15590612590312958, + "learning_rate": 4.6762467726593286e-05, + "loss": 3.9693, + "step": 62710 + }, + { + "epoch": 4.26110884631064, + "grad_norm": 0.1429593563079834, + "learning_rate": 4.675822122571002e-05, + "loss": 3.9476, + "step": 62715 + }, + { + "epoch": 4.2614485663813015, + "grad_norm": 0.17707568407058716, + "learning_rate": 4.675397472482674e-05, + "loss": 3.7071, + "step": 62720 + }, + { + "epoch": 4.261788286451964, + "grad_norm": 0.1818859577178955, + "learning_rate": 4.674972822394347e-05, + "loss": 3.9247, + "step": 62725 + }, + { + "epoch": 4.262128006522626, + "grad_norm": 0.1946825236082077, + "learning_rate": 4.6745481723060205e-05, + "loss": 3.9412, + "step": 62730 + }, + { + "epoch": 4.262467726593287, + "grad_norm": 0.167491614818573, + "learning_rate": 4.6741235222176926e-05, + "loss": 3.8543, + "step": 62735 + }, + { + "epoch": 4.262807446663949, + "grad_norm": 0.16796615719795227, + "learning_rate": 4.6736988721293654e-05, + "loss": 3.8109, + "step": 62740 + }, + { + "epoch": 4.263147166734611, + "grad_norm": 0.24775059521198273, + "learning_rate": 4.673274222041039e-05, + "loss": 3.9179, + "step": 62745 + }, + { + "epoch": 4.263486886805272, + "grad_norm": 0.37277451157569885, + "learning_rate": 4.672849571952711e-05, + "loss": 3.733, + "step": 62750 + }, + { + "epoch": 4.263826606875934, + "grad_norm": 0.22241750359535217, + "learning_rate": 4.672424921864384e-05, + "loss": 3.7408, + "step": 62755 + }, + { + "epoch": 4.264166326946596, + "grad_norm": 0.16076400876045227, + "learning_rate": 4.6720002717760566e-05, + "loss": 3.9416, + "step": 62760 + }, + { + "epoch": 4.2645060470172576, + "grad_norm": 0.26041677594184875, + "learning_rate": 4.6715756216877294e-05, + "loss": 3.7064, + "step": 62765 + }, + { + "epoch": 4.26484576708792, + "grad_norm": 0.20381152629852295, + "learning_rate": 4.671150971599402e-05, + "loss": 3.8549, + "step": 62770 + }, + { + "epoch": 4.265185487158582, + "grad_norm": 0.17826233804225922, + "learning_rate": 4.670726321511075e-05, + "loss": 3.8702, + "step": 62775 + }, + { + "epoch": 4.265525207229243, + "grad_norm": 0.20833607017993927, + "learning_rate": 4.670301671422748e-05, + "loss": 3.8251, + "step": 62780 + }, + { + "epoch": 4.265864927299905, + "grad_norm": 0.16309136152267456, + "learning_rate": 4.6698770213344207e-05, + "loss": 3.7472, + "step": 62785 + }, + { + "epoch": 4.266204647370567, + "grad_norm": 0.17853376269340515, + "learning_rate": 4.6694523712460935e-05, + "loss": 3.9632, + "step": 62790 + }, + { + "epoch": 4.266544367441228, + "grad_norm": 0.19833488762378693, + "learning_rate": 4.669027721157766e-05, + "loss": 3.867, + "step": 62795 + }, + { + "epoch": 4.26688408751189, + "grad_norm": 0.1650567650794983, + "learning_rate": 4.668603071069439e-05, + "loss": 3.7703, + "step": 62800 + }, + { + "epoch": 4.267223807582552, + "grad_norm": 0.18126273155212402, + "learning_rate": 4.668178420981112e-05, + "loss": 3.7536, + "step": 62805 + }, + { + "epoch": 4.267563527653214, + "grad_norm": 0.20199839770793915, + "learning_rate": 4.667753770892784e-05, + "loss": 3.6095, + "step": 62810 + }, + { + "epoch": 4.267903247723876, + "grad_norm": 0.16384850442409515, + "learning_rate": 4.6673291208044575e-05, + "loss": 4.0831, + "step": 62815 + }, + { + "epoch": 4.268242967794538, + "grad_norm": 0.20679645240306854, + "learning_rate": 4.66690447071613e-05, + "loss": 3.7718, + "step": 62820 + }, + { + "epoch": 4.268582687865199, + "grad_norm": 0.1817120909690857, + "learning_rate": 4.6664798206278024e-05, + "loss": 3.9448, + "step": 62825 + }, + { + "epoch": 4.268922407935861, + "grad_norm": 0.32348892092704773, + "learning_rate": 4.666055170539476e-05, + "loss": 4.2113, + "step": 62830 + }, + { + "epoch": 4.269262128006522, + "grad_norm": 0.14709654450416565, + "learning_rate": 4.6656305204511487e-05, + "loss": 3.858, + "step": 62835 + }, + { + "epoch": 4.269601848077184, + "grad_norm": 0.16311001777648926, + "learning_rate": 4.6652058703628215e-05, + "loss": 4.0427, + "step": 62840 + }, + { + "epoch": 4.269941568147846, + "grad_norm": 0.5200947523117065, + "learning_rate": 4.664781220274494e-05, + "loss": 4.0319, + "step": 62845 + }, + { + "epoch": 4.2702812882185075, + "grad_norm": 0.14115512371063232, + "learning_rate": 4.664356570186167e-05, + "loss": 3.7857, + "step": 62850 + }, + { + "epoch": 4.27062100828917, + "grad_norm": 0.7975289225578308, + "learning_rate": 4.66393192009784e-05, + "loss": 3.9091, + "step": 62855 + }, + { + "epoch": 4.270960728359832, + "grad_norm": 0.2808198630809784, + "learning_rate": 4.663507270009512e-05, + "loss": 3.8664, + "step": 62860 + }, + { + "epoch": 4.271300448430493, + "grad_norm": 0.17220887541770935, + "learning_rate": 4.6630826199211855e-05, + "loss": 4.0671, + "step": 62865 + }, + { + "epoch": 4.271640168501155, + "grad_norm": 0.13886015117168427, + "learning_rate": 4.662657969832858e-05, + "loss": 3.8782, + "step": 62870 + }, + { + "epoch": 4.271979888571817, + "grad_norm": 0.1337057501077652, + "learning_rate": 4.6622333197445304e-05, + "loss": 3.8315, + "step": 62875 + }, + { + "epoch": 4.272319608642478, + "grad_norm": 0.17753668129444122, + "learning_rate": 4.661808669656204e-05, + "loss": 3.5944, + "step": 62880 + }, + { + "epoch": 4.27265932871314, + "grad_norm": 0.19171187281608582, + "learning_rate": 4.661384019567877e-05, + "loss": 3.7796, + "step": 62885 + }, + { + "epoch": 4.272999048783802, + "grad_norm": 0.30037519335746765, + "learning_rate": 4.660959369479549e-05, + "loss": 4.031, + "step": 62890 + }, + { + "epoch": 4.2733387688544635, + "grad_norm": 0.19859695434570312, + "learning_rate": 4.6605347193912216e-05, + "loss": 3.8552, + "step": 62895 + }, + { + "epoch": 4.273678488925126, + "grad_norm": 0.6891469359397888, + "learning_rate": 4.660110069302895e-05, + "loss": 3.5467, + "step": 62900 + }, + { + "epoch": 4.274018208995788, + "grad_norm": 0.19839340448379517, + "learning_rate": 4.659685419214567e-05, + "loss": 3.8848, + "step": 62905 + }, + { + "epoch": 4.274357929066449, + "grad_norm": 0.15869292616844177, + "learning_rate": 4.65926076912624e-05, + "loss": 3.7376, + "step": 62910 + }, + { + "epoch": 4.274697649137111, + "grad_norm": 0.35756173729896545, + "learning_rate": 4.6588361190379135e-05, + "loss": 3.778, + "step": 62915 + }, + { + "epoch": 4.275037369207773, + "grad_norm": 0.14277823269367218, + "learning_rate": 4.6584114689495856e-05, + "loss": 3.9927, + "step": 62920 + }, + { + "epoch": 4.275377089278434, + "grad_norm": 0.19820913672447205, + "learning_rate": 4.6579868188612584e-05, + "loss": 3.9238, + "step": 62925 + }, + { + "epoch": 4.275716809349096, + "grad_norm": 0.1699657142162323, + "learning_rate": 4.657562168772931e-05, + "loss": 3.9132, + "step": 62930 + }, + { + "epoch": 4.276056529419758, + "grad_norm": 0.14960968494415283, + "learning_rate": 4.657137518684604e-05, + "loss": 3.7466, + "step": 62935 + }, + { + "epoch": 4.2763962494904195, + "grad_norm": 0.17746305465698242, + "learning_rate": 4.656712868596277e-05, + "loss": 3.9436, + "step": 62940 + }, + { + "epoch": 4.276735969561082, + "grad_norm": 0.21788044273853302, + "learning_rate": 4.6562882185079496e-05, + "loss": 3.7335, + "step": 62945 + }, + { + "epoch": 4.277075689631744, + "grad_norm": 0.14513437449932098, + "learning_rate": 4.6558635684196224e-05, + "loss": 3.9051, + "step": 62950 + }, + { + "epoch": 4.277415409702405, + "grad_norm": 0.19149436056613922, + "learning_rate": 4.655438918331295e-05, + "loss": 3.8549, + "step": 62955 + }, + { + "epoch": 4.277755129773067, + "grad_norm": 0.21081602573394775, + "learning_rate": 4.655014268242968e-05, + "loss": 3.7417, + "step": 62960 + }, + { + "epoch": 4.278094849843729, + "grad_norm": 0.27684956789016724, + "learning_rate": 4.654589618154641e-05, + "loss": 3.9782, + "step": 62965 + }, + { + "epoch": 4.27843456991439, + "grad_norm": 0.6707547903060913, + "learning_rate": 4.6541649680663136e-05, + "loss": 3.7975, + "step": 62970 + }, + { + "epoch": 4.278774289985052, + "grad_norm": 0.3630595803260803, + "learning_rate": 4.6537403179779864e-05, + "loss": 3.7699, + "step": 62975 + }, + { + "epoch": 4.279114010055714, + "grad_norm": 0.18608856201171875, + "learning_rate": 4.653315667889659e-05, + "loss": 4.069, + "step": 62980 + }, + { + "epoch": 4.2794537301263755, + "grad_norm": 0.13571205735206604, + "learning_rate": 4.652891017801332e-05, + "loss": 3.8479, + "step": 62985 + }, + { + "epoch": 4.279793450197038, + "grad_norm": 0.5351913571357727, + "learning_rate": 4.652466367713005e-05, + "loss": 3.6846, + "step": 62990 + }, + { + "epoch": 4.2801331702677, + "grad_norm": 0.1532939374446869, + "learning_rate": 4.652041717624677e-05, + "loss": 3.6014, + "step": 62995 + }, + { + "epoch": 4.280472890338361, + "grad_norm": 0.18226642906665802, + "learning_rate": 4.6516170675363504e-05, + "loss": 3.8062, + "step": 63000 + }, + { + "epoch": 4.280812610409023, + "grad_norm": 0.18418705463409424, + "learning_rate": 4.651192417448023e-05, + "loss": 4.0017, + "step": 63005 + }, + { + "epoch": 4.281152330479685, + "grad_norm": 0.207549586892128, + "learning_rate": 4.650767767359696e-05, + "loss": 3.9706, + "step": 63010 + }, + { + "epoch": 4.281492050550346, + "grad_norm": 0.180976003408432, + "learning_rate": 4.650343117271369e-05, + "loss": 3.8699, + "step": 63015 + }, + { + "epoch": 4.281831770621008, + "grad_norm": 0.23175211250782013, + "learning_rate": 4.6499184671830416e-05, + "loss": 4.0929, + "step": 63020 + }, + { + "epoch": 4.28217149069167, + "grad_norm": 0.1614413857460022, + "learning_rate": 4.6494938170947144e-05, + "loss": 3.9233, + "step": 63025 + }, + { + "epoch": 4.2825112107623315, + "grad_norm": 0.17967116832733154, + "learning_rate": 4.6490691670063865e-05, + "loss": 3.8261, + "step": 63030 + }, + { + "epoch": 4.282850930832994, + "grad_norm": 0.15327274799346924, + "learning_rate": 4.64864451691806e-05, + "loss": 3.9388, + "step": 63035 + }, + { + "epoch": 4.283190650903656, + "grad_norm": 0.17634837329387665, + "learning_rate": 4.648219866829733e-05, + "loss": 3.9075, + "step": 63040 + }, + { + "epoch": 4.283530370974317, + "grad_norm": 0.17371675372123718, + "learning_rate": 4.647795216741405e-05, + "loss": 3.9795, + "step": 63045 + }, + { + "epoch": 4.283870091044979, + "grad_norm": 0.16827701032161713, + "learning_rate": 4.6473705666530784e-05, + "loss": 3.7339, + "step": 63050 + }, + { + "epoch": 4.284209811115641, + "grad_norm": 0.23059602081775665, + "learning_rate": 4.646945916564751e-05, + "loss": 4.0135, + "step": 63055 + }, + { + "epoch": 4.284549531186302, + "grad_norm": 0.15465697646141052, + "learning_rate": 4.6465212664764233e-05, + "loss": 3.8518, + "step": 63060 + }, + { + "epoch": 4.284889251256964, + "grad_norm": NaN, + "learning_rate": 4.6461815464057616e-05, + "loss": 3.9988, + "step": 63065 + }, + { + "epoch": 4.285228971327626, + "grad_norm": 0.1480311155319214, + "learning_rate": 4.6457568963174344e-05, + "loss": 3.838, + "step": 63070 + }, + { + "epoch": 4.2855686913982876, + "grad_norm": 0.4018503427505493, + "learning_rate": 4.645332246229108e-05, + "loss": 4.086, + "step": 63075 + }, + { + "epoch": 4.28590841146895, + "grad_norm": 0.2242083102464676, + "learning_rate": 4.64490759614078e-05, + "loss": 3.9389, + "step": 63080 + }, + { + "epoch": 4.286248131539612, + "grad_norm": 0.17406480014324188, + "learning_rate": 4.644482946052453e-05, + "loss": 3.7199, + "step": 63085 + }, + { + "epoch": 4.286587851610273, + "grad_norm": 0.16599644720554352, + "learning_rate": 4.644058295964126e-05, + "loss": 3.8186, + "step": 63090 + }, + { + "epoch": 4.286927571680935, + "grad_norm": 0.1728777140378952, + "learning_rate": 4.6436336458757984e-05, + "loss": 3.3349, + "step": 63095 + }, + { + "epoch": 4.287267291751597, + "grad_norm": 0.1674376279115677, + "learning_rate": 4.643208995787471e-05, + "loss": 3.8871, + "step": 63100 + }, + { + "epoch": 4.287607011822258, + "grad_norm": 0.1715657263994217, + "learning_rate": 4.642784345699145e-05, + "loss": 3.5947, + "step": 63105 + }, + { + "epoch": 4.28794673189292, + "grad_norm": 0.21367207169532776, + "learning_rate": 4.642359695610817e-05, + "loss": 3.8869, + "step": 63110 + }, + { + "epoch": 4.288286451963582, + "grad_norm": 0.1603970229625702, + "learning_rate": 4.6419350455224896e-05, + "loss": 3.7941, + "step": 63115 + }, + { + "epoch": 4.288626172034244, + "grad_norm": 2.1505348682403564, + "learning_rate": 4.6415103954341624e-05, + "loss": 3.9386, + "step": 63120 + }, + { + "epoch": 4.288965892104906, + "grad_norm": 0.17021149396896362, + "learning_rate": 4.641085745345835e-05, + "loss": 4.1202, + "step": 63125 + }, + { + "epoch": 4.289305612175568, + "grad_norm": 0.412357896566391, + "learning_rate": 4.640661095257508e-05, + "loss": 3.7928, + "step": 63130 + }, + { + "epoch": 4.289645332246229, + "grad_norm": 0.16097980737686157, + "learning_rate": 4.640236445169181e-05, + "loss": 3.9464, + "step": 63135 + }, + { + "epoch": 4.289985052316891, + "grad_norm": 0.1959889680147171, + "learning_rate": 4.6398117950808536e-05, + "loss": 3.7114, + "step": 63140 + }, + { + "epoch": 4.290324772387553, + "grad_norm": 0.16795776784420013, + "learning_rate": 4.6393871449925264e-05, + "loss": 3.9068, + "step": 63145 + }, + { + "epoch": 4.290664492458214, + "grad_norm": 0.17716744542121887, + "learning_rate": 4.638962494904199e-05, + "loss": 4.0352, + "step": 63150 + }, + { + "epoch": 4.291004212528876, + "grad_norm": 0.19940905272960663, + "learning_rate": 4.638537844815872e-05, + "loss": 3.9876, + "step": 63155 + }, + { + "epoch": 4.291343932599538, + "grad_norm": 0.2446894347667694, + "learning_rate": 4.638113194727545e-05, + "loss": 3.9194, + "step": 63160 + }, + { + "epoch": 4.2916836526702, + "grad_norm": 0.21648229658603668, + "learning_rate": 4.6376885446392176e-05, + "loss": 3.5903, + "step": 63165 + }, + { + "epoch": 4.292023372740862, + "grad_norm": 0.2463763803243637, + "learning_rate": 4.63726389455089e-05, + "loss": 3.8179, + "step": 63170 + }, + { + "epoch": 4.292363092811524, + "grad_norm": 0.15486228466033936, + "learning_rate": 4.636839244462563e-05, + "loss": 3.8705, + "step": 63175 + }, + { + "epoch": 4.292702812882185, + "grad_norm": 0.1959986388683319, + "learning_rate": 4.636414594374236e-05, + "loss": 3.7819, + "step": 63180 + }, + { + "epoch": 4.293042532952847, + "grad_norm": 0.22568809986114502, + "learning_rate": 4.635989944285908e-05, + "loss": 3.9406, + "step": 63185 + }, + { + "epoch": 4.293382253023509, + "grad_norm": 0.21781140565872192, + "learning_rate": 4.6355652941975816e-05, + "loss": 3.8196, + "step": 63190 + }, + { + "epoch": 4.29372197309417, + "grad_norm": 0.1521628499031067, + "learning_rate": 4.6351406441092544e-05, + "loss": 4.063, + "step": 63195 + }, + { + "epoch": 4.294061693164832, + "grad_norm": 0.20232230424880981, + "learning_rate": 4.6347159940209265e-05, + "loss": 3.7562, + "step": 63200 + }, + { + "epoch": 4.294401413235494, + "grad_norm": 0.18259699642658234, + "learning_rate": 4.6342913439326e-05, + "loss": 3.8842, + "step": 63205 + }, + { + "epoch": 4.294741133306156, + "grad_norm": 0.1597355157136917, + "learning_rate": 4.633866693844273e-05, + "loss": 3.7534, + "step": 63210 + }, + { + "epoch": 4.295080853376818, + "grad_norm": 0.28750497102737427, + "learning_rate": 4.6334420437559456e-05, + "loss": 3.8644, + "step": 63215 + }, + { + "epoch": 4.29542057344748, + "grad_norm": 0.16453567147254944, + "learning_rate": 4.633017393667618e-05, + "loss": 3.8978, + "step": 63220 + }, + { + "epoch": 4.295760293518141, + "grad_norm": 0.305908203125, + "learning_rate": 4.632592743579291e-05, + "loss": 3.8355, + "step": 63225 + }, + { + "epoch": 4.296100013588803, + "grad_norm": 0.18026627600193024, + "learning_rate": 4.632168093490964e-05, + "loss": 3.819, + "step": 63230 + }, + { + "epoch": 4.296439733659464, + "grad_norm": 0.19824960827827454, + "learning_rate": 4.631743443402636e-05, + "loss": 3.6851, + "step": 63235 + }, + { + "epoch": 4.296779453730126, + "grad_norm": 0.2310958355665207, + "learning_rate": 4.6313187933143096e-05, + "loss": 3.6757, + "step": 63240 + }, + { + "epoch": 4.297119173800788, + "grad_norm": 0.15282149612903595, + "learning_rate": 4.6308941432259824e-05, + "loss": 3.8883, + "step": 63245 + }, + { + "epoch": 4.2974588938714495, + "grad_norm": 0.22351707518100739, + "learning_rate": 4.6304694931376545e-05, + "loss": 3.9405, + "step": 63250 + }, + { + "epoch": 4.297798613942112, + "grad_norm": 0.17095312476158142, + "learning_rate": 4.630044843049327e-05, + "loss": 3.8436, + "step": 63255 + }, + { + "epoch": 4.298138334012774, + "grad_norm": 0.1734844297170639, + "learning_rate": 4.629620192961001e-05, + "loss": 3.7033, + "step": 63260 + }, + { + "epoch": 4.298478054083435, + "grad_norm": 0.17161594331264496, + "learning_rate": 4.629195542872673e-05, + "loss": 4.0295, + "step": 63265 + }, + { + "epoch": 4.298817774154097, + "grad_norm": 0.1725378781557083, + "learning_rate": 4.628770892784346e-05, + "loss": 3.7101, + "step": 63270 + }, + { + "epoch": 4.299157494224759, + "grad_norm": 0.18083584308624268, + "learning_rate": 4.628346242696019e-05, + "loss": 4.0524, + "step": 63275 + }, + { + "epoch": 4.29949721429542, + "grad_norm": 0.15342216193675995, + "learning_rate": 4.627921592607691e-05, + "loss": 3.931, + "step": 63280 + }, + { + "epoch": 4.299836934366082, + "grad_norm": 0.20553691685199738, + "learning_rate": 4.627496942519364e-05, + "loss": 3.8353, + "step": 63285 + }, + { + "epoch": 4.300176654436744, + "grad_norm": 0.14752131700515747, + "learning_rate": 4.627072292431037e-05, + "loss": 3.9583, + "step": 63290 + }, + { + "epoch": 4.3005163745074055, + "grad_norm": 0.1785702407360077, + "learning_rate": 4.62664764234271e-05, + "loss": 3.9273, + "step": 63295 + }, + { + "epoch": 4.300856094578068, + "grad_norm": 0.1788877695798874, + "learning_rate": 4.6262229922543825e-05, + "loss": 3.8021, + "step": 63300 + }, + { + "epoch": 4.30119581464873, + "grad_norm": 0.23798976838588715, + "learning_rate": 4.6257983421660553e-05, + "loss": 3.9349, + "step": 63305 + }, + { + "epoch": 4.301535534719391, + "grad_norm": 0.22758616507053375, + "learning_rate": 4.625373692077728e-05, + "loss": 3.6228, + "step": 63310 + }, + { + "epoch": 4.301875254790053, + "grad_norm": 0.23729611933231354, + "learning_rate": 4.624949041989401e-05, + "loss": 3.8406, + "step": 63315 + }, + { + "epoch": 4.302214974860715, + "grad_norm": 0.1607276052236557, + "learning_rate": 4.624524391901074e-05, + "loss": 3.9649, + "step": 63320 + }, + { + "epoch": 4.302554694931376, + "grad_norm": 0.14800600707530975, + "learning_rate": 4.6240997418127465e-05, + "loss": 3.8396, + "step": 63325 + }, + { + "epoch": 4.302894415002038, + "grad_norm": 0.15823623538017273, + "learning_rate": 4.6236750917244193e-05, + "loss": 3.9302, + "step": 63330 + }, + { + "epoch": 4.3032341350727, + "grad_norm": 2.887266159057617, + "learning_rate": 4.623250441636092e-05, + "loss": 3.7334, + "step": 63335 + }, + { + "epoch": 4.3035738551433615, + "grad_norm": 0.3563077747821808, + "learning_rate": 4.622825791547765e-05, + "loss": 3.7912, + "step": 63340 + }, + { + "epoch": 4.303913575214024, + "grad_norm": 0.2210267037153244, + "learning_rate": 4.622401141459438e-05, + "loss": 3.7445, + "step": 63345 + }, + { + "epoch": 4.304253295284686, + "grad_norm": 0.15054301917552948, + "learning_rate": 4.6219764913711105e-05, + "loss": 3.7947, + "step": 63350 + }, + { + "epoch": 4.304593015355347, + "grad_norm": 0.14819568395614624, + "learning_rate": 4.621551841282783e-05, + "loss": 3.6149, + "step": 63355 + }, + { + "epoch": 4.304932735426009, + "grad_norm": 0.14594979584217072, + "learning_rate": 4.621127191194456e-05, + "loss": 4.1179, + "step": 63360 + }, + { + "epoch": 4.305272455496671, + "grad_norm": 0.2976338863372803, + "learning_rate": 4.620702541106129e-05, + "loss": 3.9742, + "step": 63365 + }, + { + "epoch": 4.305612175567332, + "grad_norm": 2.2326555252075195, + "learning_rate": 4.620277891017801e-05, + "loss": 3.8355, + "step": 63370 + }, + { + "epoch": 4.305951895637994, + "grad_norm": 0.16370725631713867, + "learning_rate": 4.6198532409294746e-05, + "loss": 3.9876, + "step": 63375 + }, + { + "epoch": 4.306291615708656, + "grad_norm": 0.26201820373535156, + "learning_rate": 4.6194285908411474e-05, + "loss": 3.6069, + "step": 63380 + }, + { + "epoch": 4.306631335779318, + "grad_norm": 0.17490442097187042, + "learning_rate": 4.61900394075282e-05, + "loss": 3.7149, + "step": 63385 + }, + { + "epoch": 4.30697105584998, + "grad_norm": 0.18104314804077148, + "learning_rate": 4.618579290664492e-05, + "loss": 3.8521, + "step": 63390 + }, + { + "epoch": 4.307310775920642, + "grad_norm": 0.16280505061149597, + "learning_rate": 4.618154640576166e-05, + "loss": 3.7766, + "step": 63395 + }, + { + "epoch": 4.307650495991303, + "grad_norm": 0.37150052189826965, + "learning_rate": 4.6177299904878386e-05, + "loss": 3.8868, + "step": 63400 + }, + { + "epoch": 4.307990216061965, + "grad_norm": 0.14724378287792206, + "learning_rate": 4.617305340399511e-05, + "loss": 3.7046, + "step": 63405 + }, + { + "epoch": 4.308329936132627, + "grad_norm": 0.1579444259405136, + "learning_rate": 4.616880690311184e-05, + "loss": 3.8025, + "step": 63410 + }, + { + "epoch": 4.308669656203288, + "grad_norm": 0.16019746661186218, + "learning_rate": 4.616456040222857e-05, + "loss": 4.0112, + "step": 63415 + }, + { + "epoch": 4.30900937627395, + "grad_norm": 0.2052755355834961, + "learning_rate": 4.616031390134529e-05, + "loss": 3.9012, + "step": 63420 + }, + { + "epoch": 4.309349096344612, + "grad_norm": 0.16707901656627655, + "learning_rate": 4.615606740046202e-05, + "loss": 3.9194, + "step": 63425 + }, + { + "epoch": 4.309688816415274, + "grad_norm": 0.25230705738067627, + "learning_rate": 4.6151820899578754e-05, + "loss": 4.009, + "step": 63430 + }, + { + "epoch": 4.310028536485936, + "grad_norm": 0.8645108342170715, + "learning_rate": 4.6147574398695475e-05, + "loss": 3.8405, + "step": 63435 + }, + { + "epoch": 4.310368256556598, + "grad_norm": 0.16243097186088562, + "learning_rate": 4.61433278978122e-05, + "loss": 3.8111, + "step": 63440 + }, + { + "epoch": 4.310707976627259, + "grad_norm": 0.19411765038967133, + "learning_rate": 4.613908139692894e-05, + "loss": 3.7857, + "step": 63445 + }, + { + "epoch": 4.311047696697921, + "grad_norm": 0.16293099522590637, + "learning_rate": 4.613483489604566e-05, + "loss": 3.9525, + "step": 63450 + }, + { + "epoch": 4.311387416768583, + "grad_norm": 0.16824540495872498, + "learning_rate": 4.613058839516239e-05, + "loss": 4.125, + "step": 63455 + }, + { + "epoch": 4.311727136839244, + "grad_norm": 0.20362181961536407, + "learning_rate": 4.6126341894279115e-05, + "loss": 4.0084, + "step": 63460 + }, + { + "epoch": 4.312066856909906, + "grad_norm": 2.0406477451324463, + "learning_rate": 4.612209539339584e-05, + "loss": 3.8522, + "step": 63465 + }, + { + "epoch": 4.312406576980568, + "grad_norm": 0.17292986810207367, + "learning_rate": 4.611784889251257e-05, + "loss": 3.8678, + "step": 63470 + }, + { + "epoch": 4.31274629705123, + "grad_norm": 0.19866026937961578, + "learning_rate": 4.61136023916293e-05, + "loss": 3.9448, + "step": 63475 + }, + { + "epoch": 4.313086017121892, + "grad_norm": 0.20077066123485565, + "learning_rate": 4.610935589074603e-05, + "loss": 3.824, + "step": 63480 + }, + { + "epoch": 4.313425737192554, + "grad_norm": 0.1745264083147049, + "learning_rate": 4.6105109389862755e-05, + "loss": 3.8668, + "step": 63485 + }, + { + "epoch": 4.313765457263215, + "grad_norm": 0.9944829940795898, + "learning_rate": 4.610086288897948e-05, + "loss": 3.8409, + "step": 63490 + }, + { + "epoch": 4.314105177333877, + "grad_norm": 0.19766132533550262, + "learning_rate": 4.609661638809621e-05, + "loss": 3.8133, + "step": 63495 + }, + { + "epoch": 4.314444897404539, + "grad_norm": 0.15149450302124023, + "learning_rate": 4.609236988721294e-05, + "loss": 3.8948, + "step": 63500 + }, + { + "epoch": 4.3147846174752, + "grad_norm": 0.16350021958351135, + "learning_rate": 4.608812338632967e-05, + "loss": 3.8814, + "step": 63505 + }, + { + "epoch": 4.315124337545862, + "grad_norm": 1.265871524810791, + "learning_rate": 4.6083876885446395e-05, + "loss": 3.8742, + "step": 63510 + }, + { + "epoch": 4.3154640576165235, + "grad_norm": 0.2242605984210968, + "learning_rate": 4.607963038456312e-05, + "loss": 4.0533, + "step": 63515 + }, + { + "epoch": 4.315803777687186, + "grad_norm": 0.1766650229692459, + "learning_rate": 4.607538388367985e-05, + "loss": 3.6435, + "step": 63520 + }, + { + "epoch": 4.316143497757848, + "grad_norm": 0.14207495748996735, + "learning_rate": 4.607113738279657e-05, + "loss": 3.8184, + "step": 63525 + }, + { + "epoch": 4.316483217828509, + "grad_norm": 0.1677604764699936, + "learning_rate": 4.606689088191331e-05, + "loss": 3.6378, + "step": 63530 + }, + { + "epoch": 4.316822937899171, + "grad_norm": 0.1876164674758911, + "learning_rate": 4.6062644381030035e-05, + "loss": 3.9209, + "step": 63535 + }, + { + "epoch": 4.317162657969833, + "grad_norm": 0.1549244374036789, + "learning_rate": 4.6058397880146756e-05, + "loss": 3.8262, + "step": 63540 + }, + { + "epoch": 4.317502378040494, + "grad_norm": 0.1774241328239441, + "learning_rate": 4.605415137926349e-05, + "loss": 3.8995, + "step": 63545 + }, + { + "epoch": 4.317842098111156, + "grad_norm": 0.14202061295509338, + "learning_rate": 4.604990487838022e-05, + "loss": 3.7398, + "step": 63550 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.19014453887939453, + "learning_rate": 4.604565837749695e-05, + "loss": 3.9973, + "step": 63555 + }, + { + "epoch": 4.3185215382524795, + "grad_norm": 0.15967941284179688, + "learning_rate": 4.604141187661367e-05, + "loss": 3.9858, + "step": 63560 + }, + { + "epoch": 4.318861258323142, + "grad_norm": 0.17261824011802673, + "learning_rate": 4.60371653757304e-05, + "loss": 3.7889, + "step": 63565 + }, + { + "epoch": 4.319200978393804, + "grad_norm": 0.20073771476745605, + "learning_rate": 4.603291887484713e-05, + "loss": 3.7832, + "step": 63570 + }, + { + "epoch": 4.319540698464465, + "grad_norm": 0.1681959331035614, + "learning_rate": 4.602867237396385e-05, + "loss": 3.7275, + "step": 63575 + }, + { + "epoch": 4.319880418535127, + "grad_norm": 0.1447710394859314, + "learning_rate": 4.602442587308059e-05, + "loss": 4.0554, + "step": 63580 + }, + { + "epoch": 4.320220138605789, + "grad_norm": 0.14619040489196777, + "learning_rate": 4.6020179372197315e-05, + "loss": 3.9961, + "step": 63585 + }, + { + "epoch": 4.32055985867645, + "grad_norm": 0.20262694358825684, + "learning_rate": 4.6015932871314036e-05, + "loss": 3.6716, + "step": 63590 + }, + { + "epoch": 4.320899578747112, + "grad_norm": 0.15113988518714905, + "learning_rate": 4.601168637043077e-05, + "loss": 3.7813, + "step": 63595 + }, + { + "epoch": 4.321239298817774, + "grad_norm": 0.151357039809227, + "learning_rate": 4.60074398695475e-05, + "loss": 3.829, + "step": 63600 + }, + { + "epoch": 4.3215790188884355, + "grad_norm": 0.1863708645105362, + "learning_rate": 4.600319336866422e-05, + "loss": 4.0556, + "step": 63605 + }, + { + "epoch": 4.321918738959098, + "grad_norm": 0.18440689146518707, + "learning_rate": 4.599894686778095e-05, + "loss": 3.8132, + "step": 63610 + }, + { + "epoch": 4.32225845902976, + "grad_norm": 0.1555318385362625, + "learning_rate": 4.599470036689768e-05, + "loss": 3.8155, + "step": 63615 + }, + { + "epoch": 4.322598179100421, + "grad_norm": 0.19813111424446106, + "learning_rate": 4.5990453866014404e-05, + "loss": 3.8712, + "step": 63620 + }, + { + "epoch": 4.322937899171083, + "grad_norm": 0.1811678558588028, + "learning_rate": 4.598620736513113e-05, + "loss": 3.867, + "step": 63625 + }, + { + "epoch": 4.323277619241745, + "grad_norm": 0.16366715729236603, + "learning_rate": 4.598196086424787e-05, + "loss": 4.0141, + "step": 63630 + }, + { + "epoch": 4.323617339312406, + "grad_norm": 0.3262331485748291, + "learning_rate": 4.597771436336459e-05, + "loss": 3.5301, + "step": 63635 + }, + { + "epoch": 4.323957059383068, + "grad_norm": 0.14625713229179382, + "learning_rate": 4.5973467862481316e-05, + "loss": 4.1853, + "step": 63640 + }, + { + "epoch": 4.32429677945373, + "grad_norm": 0.20618830621242523, + "learning_rate": 4.5969221361598044e-05, + "loss": 3.9832, + "step": 63645 + }, + { + "epoch": 4.3246364995243916, + "grad_norm": 0.17638272047042847, + "learning_rate": 4.596497486071477e-05, + "loss": 3.9091, + "step": 63650 + }, + { + "epoch": 4.324976219595054, + "grad_norm": 0.1940934807062149, + "learning_rate": 4.59607283598315e-05, + "loss": 3.7106, + "step": 63655 + }, + { + "epoch": 4.325315939665716, + "grad_norm": 0.21973031759262085, + "learning_rate": 4.595648185894823e-05, + "loss": 3.6432, + "step": 63660 + }, + { + "epoch": 4.325655659736377, + "grad_norm": 0.22234146296977997, + "learning_rate": 4.5952235358064956e-05, + "loss": 4.012, + "step": 63665 + }, + { + "epoch": 4.325995379807039, + "grad_norm": 0.17524608969688416, + "learning_rate": 4.5947988857181684e-05, + "loss": 3.801, + "step": 63670 + }, + { + "epoch": 4.326335099877701, + "grad_norm": 0.14060825109481812, + "learning_rate": 4.594374235629841e-05, + "loss": 3.904, + "step": 63675 + }, + { + "epoch": 4.326674819948362, + "grad_norm": 0.19052928686141968, + "learning_rate": 4.593949585541514e-05, + "loss": 4.2191, + "step": 63680 + }, + { + "epoch": 4.327014540019024, + "grad_norm": 0.16074340045452118, + "learning_rate": 4.593524935453187e-05, + "loss": 3.9904, + "step": 63685 + }, + { + "epoch": 4.327354260089686, + "grad_norm": 0.14800509810447693, + "learning_rate": 4.5931002853648596e-05, + "loss": 4.0435, + "step": 63690 + }, + { + "epoch": 4.327693980160348, + "grad_norm": 0.28730639815330505, + "learning_rate": 4.5926756352765324e-05, + "loss": 3.8314, + "step": 63695 + }, + { + "epoch": 4.32803370023101, + "grad_norm": 0.15997226536273956, + "learning_rate": 4.592250985188205e-05, + "loss": 3.5659, + "step": 63700 + }, + { + "epoch": 4.328373420301672, + "grad_norm": 0.2798592150211334, + "learning_rate": 4.591826335099878e-05, + "loss": 3.7717, + "step": 63705 + }, + { + "epoch": 4.328713140372333, + "grad_norm": 0.47454991936683655, + "learning_rate": 4.59140168501155e-05, + "loss": 3.84, + "step": 63710 + }, + { + "epoch": 4.329052860442995, + "grad_norm": 0.17363101243972778, + "learning_rate": 4.5909770349232236e-05, + "loss": 3.7693, + "step": 63715 + }, + { + "epoch": 4.329392580513657, + "grad_norm": 0.14338423311710358, + "learning_rate": 4.5905523848348964e-05, + "loss": 3.7446, + "step": 63720 + }, + { + "epoch": 4.329732300584318, + "grad_norm": 0.7774453163146973, + "learning_rate": 4.590127734746569e-05, + "loss": 3.6054, + "step": 63725 + }, + { + "epoch": 4.33007202065498, + "grad_norm": 0.2090260535478592, + "learning_rate": 4.589703084658242e-05, + "loss": 4.0707, + "step": 63730 + }, + { + "epoch": 4.330411740725642, + "grad_norm": 0.1930515617132187, + "learning_rate": 4.589278434569915e-05, + "loss": 3.4458, + "step": 63735 + }, + { + "epoch": 4.330751460796304, + "grad_norm": 0.15212392807006836, + "learning_rate": 4.5888537844815877e-05, + "loss": 3.6791, + "step": 63740 + }, + { + "epoch": 4.331091180866966, + "grad_norm": 0.23284250497817993, + "learning_rate": 4.58842913439326e-05, + "loss": 3.5445, + "step": 63745 + }, + { + "epoch": 4.331430900937628, + "grad_norm": 0.17393609881401062, + "learning_rate": 4.588004484304933e-05, + "loss": 4.0175, + "step": 63750 + }, + { + "epoch": 4.331770621008289, + "grad_norm": 0.20152530074119568, + "learning_rate": 4.587579834216606e-05, + "loss": 3.8587, + "step": 63755 + }, + { + "epoch": 4.332110341078951, + "grad_norm": 0.16409137845039368, + "learning_rate": 4.587155184128278e-05, + "loss": 3.8624, + "step": 63760 + }, + { + "epoch": 4.332450061149613, + "grad_norm": 0.25751394033432007, + "learning_rate": 4.5867305340399517e-05, + "loss": 3.5473, + "step": 63765 + }, + { + "epoch": 4.332789781220274, + "grad_norm": 0.13622596859931946, + "learning_rate": 4.5863058839516245e-05, + "loss": 3.8007, + "step": 63770 + }, + { + "epoch": 4.333129501290936, + "grad_norm": 0.23081129789352417, + "learning_rate": 4.5858812338632966e-05, + "loss": 3.7185, + "step": 63775 + }, + { + "epoch": 4.333469221361598, + "grad_norm": 0.1792636662721634, + "learning_rate": 4.5854565837749694e-05, + "loss": 3.5724, + "step": 63780 + }, + { + "epoch": 4.33380894143226, + "grad_norm": 0.15758883953094482, + "learning_rate": 4.585031933686643e-05, + "loss": 4.1955, + "step": 63785 + }, + { + "epoch": 4.334148661502922, + "grad_norm": 0.2559954524040222, + "learning_rate": 4.584607283598315e-05, + "loss": 3.7186, + "step": 63790 + }, + { + "epoch": 4.334488381573584, + "grad_norm": 0.3094991445541382, + "learning_rate": 4.584182633509988e-05, + "loss": 3.8487, + "step": 63795 + }, + { + "epoch": 4.334828101644245, + "grad_norm": 0.22587963938713074, + "learning_rate": 4.583757983421661e-05, + "loss": 3.4909, + "step": 63800 + }, + { + "epoch": 4.335167821714907, + "grad_norm": 0.12787368893623352, + "learning_rate": 4.5833333333333334e-05, + "loss": 3.8713, + "step": 63805 + }, + { + "epoch": 4.335507541785569, + "grad_norm": 0.22917738556861877, + "learning_rate": 4.582908683245006e-05, + "loss": 3.5777, + "step": 63810 + }, + { + "epoch": 4.33584726185623, + "grad_norm": 0.20362511277198792, + "learning_rate": 4.582484033156679e-05, + "loss": 4.0415, + "step": 63815 + }, + { + "epoch": 4.336186981926892, + "grad_norm": 0.18614953756332397, + "learning_rate": 4.582059383068352e-05, + "loss": 3.8097, + "step": 63820 + }, + { + "epoch": 4.336526701997554, + "grad_norm": 0.16959021985530853, + "learning_rate": 4.5816347329800246e-05, + "loss": 3.9996, + "step": 63825 + }, + { + "epoch": 4.336866422068216, + "grad_norm": 0.18160788714885712, + "learning_rate": 4.5812100828916974e-05, + "loss": 3.9293, + "step": 63830 + }, + { + "epoch": 4.337206142138878, + "grad_norm": 0.1760733723640442, + "learning_rate": 4.58078543280337e-05, + "loss": 3.7253, + "step": 63835 + }, + { + "epoch": 4.33754586220954, + "grad_norm": 0.15905718505382538, + "learning_rate": 4.580360782715043e-05, + "loss": 3.7873, + "step": 63840 + }, + { + "epoch": 4.337885582280201, + "grad_norm": 0.19683510065078735, + "learning_rate": 4.579936132626716e-05, + "loss": 3.8564, + "step": 63845 + }, + { + "epoch": 4.338225302350863, + "grad_norm": 0.18918819725513458, + "learning_rate": 4.5795114825383886e-05, + "loss": 3.7391, + "step": 63850 + }, + { + "epoch": 4.338565022421525, + "grad_norm": 0.12217436730861664, + "learning_rate": 4.5790868324500614e-05, + "loss": 3.7794, + "step": 63855 + }, + { + "epoch": 4.338904742492186, + "grad_norm": 0.15358325839042664, + "learning_rate": 4.578662182361734e-05, + "loss": 3.9811, + "step": 63860 + }, + { + "epoch": 4.339244462562848, + "grad_norm": 0.20919835567474365, + "learning_rate": 4.578237532273407e-05, + "loss": 4.0686, + "step": 63865 + }, + { + "epoch": 4.33958418263351, + "grad_norm": 0.16558368504047394, + "learning_rate": 4.57781288218508e-05, + "loss": 3.9198, + "step": 63870 + }, + { + "epoch": 4.339923902704172, + "grad_norm": 0.18679428100585938, + "learning_rate": 4.5773882320967526e-05, + "loss": 3.8605, + "step": 63875 + }, + { + "epoch": 4.340263622774834, + "grad_norm": 0.15042416751384735, + "learning_rate": 4.576963582008425e-05, + "loss": 4.0309, + "step": 63880 + }, + { + "epoch": 4.340603342845496, + "grad_norm": 0.20875012874603271, + "learning_rate": 4.576538931920098e-05, + "loss": 3.9813, + "step": 63885 + }, + { + "epoch": 4.340943062916157, + "grad_norm": 0.16035504639148712, + "learning_rate": 4.576114281831771e-05, + "loss": 3.8624, + "step": 63890 + }, + { + "epoch": 4.341282782986819, + "grad_norm": 0.14612863957881927, + "learning_rate": 4.575689631743444e-05, + "loss": 3.7849, + "step": 63895 + }, + { + "epoch": 4.341622503057481, + "grad_norm": 0.2685539126396179, + "learning_rate": 4.5752649816551166e-05, + "loss": 3.8761, + "step": 63900 + }, + { + "epoch": 4.341962223128142, + "grad_norm": 0.17221400141716003, + "learning_rate": 4.5748403315667894e-05, + "loss": 3.6422, + "step": 63905 + }, + { + "epoch": 4.342301943198804, + "grad_norm": 0.16837981343269348, + "learning_rate": 4.574415681478462e-05, + "loss": 3.7173, + "step": 63910 + }, + { + "epoch": 4.3426416632694655, + "grad_norm": 0.7297783493995667, + "learning_rate": 4.573991031390134e-05, + "loss": 3.8524, + "step": 63915 + }, + { + "epoch": 4.342981383340128, + "grad_norm": 0.19390429556369781, + "learning_rate": 4.573566381301808e-05, + "loss": 3.8167, + "step": 63920 + }, + { + "epoch": 4.34332110341079, + "grad_norm": 0.1582280546426773, + "learning_rate": 4.5731417312134806e-05, + "loss": 4.0779, + "step": 63925 + }, + { + "epoch": 4.343660823481451, + "grad_norm": 0.17105568945407867, + "learning_rate": 4.572717081125153e-05, + "loss": 4.0889, + "step": 63930 + }, + { + "epoch": 4.344000543552113, + "grad_norm": 0.15874437987804413, + "learning_rate": 4.572292431036826e-05, + "loss": 4.0561, + "step": 63935 + }, + { + "epoch": 4.344340263622775, + "grad_norm": 0.163116917014122, + "learning_rate": 4.571867780948499e-05, + "loss": 3.8432, + "step": 63940 + }, + { + "epoch": 4.344679983693436, + "grad_norm": 0.19146184623241425, + "learning_rate": 4.571443130860171e-05, + "loss": 3.6359, + "step": 63945 + }, + { + "epoch": 4.345019703764098, + "grad_norm": 0.14527836441993713, + "learning_rate": 4.571018480771844e-05, + "loss": 3.9242, + "step": 63950 + }, + { + "epoch": 4.34535942383476, + "grad_norm": 0.18127940595149994, + "learning_rate": 4.5705938306835174e-05, + "loss": 3.6817, + "step": 63955 + }, + { + "epoch": 4.345699143905422, + "grad_norm": 0.17376649379730225, + "learning_rate": 4.5701691805951895e-05, + "loss": 3.7895, + "step": 63960 + }, + { + "epoch": 4.346038863976084, + "grad_norm": 0.25556138157844543, + "learning_rate": 4.569744530506862e-05, + "loss": 3.9318, + "step": 63965 + }, + { + "epoch": 4.346378584046746, + "grad_norm": 0.1620141565799713, + "learning_rate": 4.569319880418536e-05, + "loss": 3.6407, + "step": 63970 + }, + { + "epoch": 4.346718304117407, + "grad_norm": 0.1635180413722992, + "learning_rate": 4.568895230330208e-05, + "loss": 3.9337, + "step": 63975 + }, + { + "epoch": 4.347058024188069, + "grad_norm": 0.19376567006111145, + "learning_rate": 4.568470580241881e-05, + "loss": 3.6613, + "step": 63980 + }, + { + "epoch": 4.347397744258731, + "grad_norm": 0.15808331966400146, + "learning_rate": 4.5680459301535535e-05, + "loss": 3.9464, + "step": 63985 + }, + { + "epoch": 4.347737464329392, + "grad_norm": 0.646141767501831, + "learning_rate": 4.567621280065226e-05, + "loss": 3.8899, + "step": 63990 + }, + { + "epoch": 4.348077184400054, + "grad_norm": 0.1767219603061676, + "learning_rate": 4.567196629976899e-05, + "loss": 3.8486, + "step": 63995 + }, + { + "epoch": 4.348416904470716, + "grad_norm": 0.15177005529403687, + "learning_rate": 4.566771979888572e-05, + "loss": 3.6231, + "step": 64000 + }, + { + "epoch": 4.348756624541378, + "grad_norm": 0.22540892660617828, + "learning_rate": 4.566347329800245e-05, + "loss": 4.0195, + "step": 64005 + }, + { + "epoch": 4.34909634461204, + "grad_norm": 0.25314924120903015, + "learning_rate": 4.5659226797119175e-05, + "loss": 3.9611, + "step": 64010 + }, + { + "epoch": 4.349436064682702, + "grad_norm": 0.21801048517227173, + "learning_rate": 4.56549802962359e-05, + "loss": 3.5723, + "step": 64015 + }, + { + "epoch": 4.349775784753363, + "grad_norm": 0.18115465342998505, + "learning_rate": 4.565073379535263e-05, + "loss": 3.9358, + "step": 64020 + }, + { + "epoch": 4.350115504824025, + "grad_norm": 0.20119410753250122, + "learning_rate": 4.564648729446936e-05, + "loss": 3.5925, + "step": 64025 + }, + { + "epoch": 4.350455224894687, + "grad_norm": 3.9351370334625244, + "learning_rate": 4.564224079358609e-05, + "loss": 3.9593, + "step": 64030 + }, + { + "epoch": 4.350794944965348, + "grad_norm": 0.159758523106575, + "learning_rate": 4.5637994292702815e-05, + "loss": 3.7722, + "step": 64035 + }, + { + "epoch": 4.35113466503601, + "grad_norm": 0.19994066655635834, + "learning_rate": 4.5633747791819543e-05, + "loss": 3.9896, + "step": 64040 + }, + { + "epoch": 4.351474385106672, + "grad_norm": 0.14030279219150543, + "learning_rate": 4.562950129093627e-05, + "loss": 3.7975, + "step": 64045 + }, + { + "epoch": 4.351814105177334, + "grad_norm": 0.20052173733711243, + "learning_rate": 4.562525479005299e-05, + "loss": 3.6001, + "step": 64050 + }, + { + "epoch": 4.352153825247996, + "grad_norm": 0.8588405847549438, + "learning_rate": 4.562100828916973e-05, + "loss": 3.6223, + "step": 64055 + }, + { + "epoch": 4.352493545318658, + "grad_norm": 0.22395169734954834, + "learning_rate": 4.5616761788286455e-05, + "loss": 3.8581, + "step": 64060 + }, + { + "epoch": 4.352833265389319, + "grad_norm": 0.16941790282726288, + "learning_rate": 4.5612515287403183e-05, + "loss": 4.0306, + "step": 64065 + }, + { + "epoch": 4.353172985459981, + "grad_norm": 0.17366696894168854, + "learning_rate": 4.560826878651991e-05, + "loss": 3.8316, + "step": 64070 + }, + { + "epoch": 4.353512705530643, + "grad_norm": 0.19453853368759155, + "learning_rate": 4.560402228563664e-05, + "loss": 3.9846, + "step": 64075 + }, + { + "epoch": 4.353852425601304, + "grad_norm": 0.2727784812450409, + "learning_rate": 4.559977578475337e-05, + "loss": 3.9124, + "step": 64080 + }, + { + "epoch": 4.354192145671966, + "grad_norm": 0.20728488266468048, + "learning_rate": 4.559552928387009e-05, + "loss": 3.7257, + "step": 64085 + }, + { + "epoch": 4.354531865742628, + "grad_norm": 0.1529950499534607, + "learning_rate": 4.5591282782986823e-05, + "loss": 3.795, + "step": 64090 + }, + { + "epoch": 4.35487158581329, + "grad_norm": 0.24017342925071716, + "learning_rate": 4.558703628210355e-05, + "loss": 3.9786, + "step": 64095 + }, + { + "epoch": 4.355211305883952, + "grad_norm": 0.14396032691001892, + "learning_rate": 4.558278978122027e-05, + "loss": 3.7792, + "step": 64100 + }, + { + "epoch": 4.355551025954614, + "grad_norm": 0.17691649496555328, + "learning_rate": 4.557854328033701e-05, + "loss": 3.9131, + "step": 64105 + }, + { + "epoch": 4.355890746025275, + "grad_norm": 0.1648763120174408, + "learning_rate": 4.5574296779453736e-05, + "loss": 3.7783, + "step": 64110 + }, + { + "epoch": 4.356230466095937, + "grad_norm": 0.15980006754398346, + "learning_rate": 4.557005027857046e-05, + "loss": 3.5274, + "step": 64115 + }, + { + "epoch": 4.356570186166599, + "grad_norm": 0.18957164883613586, + "learning_rate": 4.556580377768719e-05, + "loss": 4.1795, + "step": 64120 + }, + { + "epoch": 4.35690990623726, + "grad_norm": 0.19925875961780548, + "learning_rate": 4.556155727680392e-05, + "loss": 3.9712, + "step": 64125 + }, + { + "epoch": 4.357249626307922, + "grad_norm": 0.14380085468292236, + "learning_rate": 4.555731077592064e-05, + "loss": 3.805, + "step": 64130 + }, + { + "epoch": 4.357589346378584, + "grad_norm": 0.23041877150535583, + "learning_rate": 4.555306427503737e-05, + "loss": 3.8178, + "step": 64135 + }, + { + "epoch": 4.357929066449246, + "grad_norm": 0.16288995742797852, + "learning_rate": 4.5548817774154104e-05, + "loss": 3.7259, + "step": 64140 + }, + { + "epoch": 4.358268786519908, + "grad_norm": 0.1691959649324417, + "learning_rate": 4.5544571273270825e-05, + "loss": 3.9216, + "step": 64145 + }, + { + "epoch": 4.35860850659057, + "grad_norm": 0.2012767642736435, + "learning_rate": 4.554032477238755e-05, + "loss": 3.7159, + "step": 64150 + }, + { + "epoch": 4.358948226661231, + "grad_norm": 0.15766717493534088, + "learning_rate": 4.553607827150429e-05, + "loss": 3.983, + "step": 64155 + }, + { + "epoch": 4.359287946731893, + "grad_norm": 0.21132418513298035, + "learning_rate": 4.553183177062101e-05, + "loss": 3.8235, + "step": 64160 + }, + { + "epoch": 4.359627666802555, + "grad_norm": 0.1843334436416626, + "learning_rate": 4.552758526973774e-05, + "loss": 3.8557, + "step": 64165 + }, + { + "epoch": 4.359967386873216, + "grad_norm": 0.5657386779785156, + "learning_rate": 4.5523338768854465e-05, + "loss": 3.8454, + "step": 64170 + }, + { + "epoch": 4.360307106943878, + "grad_norm": 0.17125488817691803, + "learning_rate": 4.551909226797119e-05, + "loss": 3.9213, + "step": 64175 + }, + { + "epoch": 4.36064682701454, + "grad_norm": 0.3142131567001343, + "learning_rate": 4.551484576708792e-05, + "loss": 3.7432, + "step": 64180 + }, + { + "epoch": 4.360986547085202, + "grad_norm": 0.2702673673629761, + "learning_rate": 4.551059926620465e-05, + "loss": 4.1052, + "step": 64185 + }, + { + "epoch": 4.361326267155864, + "grad_norm": 0.15570057928562164, + "learning_rate": 4.550635276532138e-05, + "loss": 3.821, + "step": 64190 + }, + { + "epoch": 4.361665987226525, + "grad_norm": 0.18238455057144165, + "learning_rate": 4.5502106264438105e-05, + "loss": 3.8545, + "step": 64195 + }, + { + "epoch": 4.362005707297187, + "grad_norm": 0.197293221950531, + "learning_rate": 4.549785976355483e-05, + "loss": 3.7328, + "step": 64200 + }, + { + "epoch": 4.362345427367849, + "grad_norm": 0.18594692647457123, + "learning_rate": 4.549361326267156e-05, + "loss": 3.9144, + "step": 64205 + }, + { + "epoch": 4.36268514743851, + "grad_norm": 0.7438086271286011, + "learning_rate": 4.548936676178829e-05, + "loss": 3.7586, + "step": 64210 + }, + { + "epoch": 4.363024867509172, + "grad_norm": 0.270454078912735, + "learning_rate": 4.548512026090502e-05, + "loss": 3.8609, + "step": 64215 + }, + { + "epoch": 4.363364587579834, + "grad_norm": 0.14717921614646912, + "learning_rate": 4.5480873760021745e-05, + "loss": 3.864, + "step": 64220 + }, + { + "epoch": 4.3637043076504956, + "grad_norm": 0.14051476120948792, + "learning_rate": 4.547662725913847e-05, + "loss": 3.7909, + "step": 64225 + }, + { + "epoch": 4.364044027721158, + "grad_norm": 0.1887817531824112, + "learning_rate": 4.54723807582552e-05, + "loss": 3.9166, + "step": 64230 + }, + { + "epoch": 4.36438374779182, + "grad_norm": 0.18706774711608887, + "learning_rate": 4.546813425737193e-05, + "loss": 3.7542, + "step": 64235 + }, + { + "epoch": 4.364723467862481, + "grad_norm": 0.159074604511261, + "learning_rate": 4.546388775648866e-05, + "loss": 3.9486, + "step": 64240 + }, + { + "epoch": 4.365063187933143, + "grad_norm": 0.14874404668807983, + "learning_rate": 4.5459641255605385e-05, + "loss": 3.9413, + "step": 64245 + }, + { + "epoch": 4.365402908003805, + "grad_norm": 0.19862762093544006, + "learning_rate": 4.545539475472211e-05, + "loss": 3.9009, + "step": 64250 + }, + { + "epoch": 4.365742628074466, + "grad_norm": 0.29107943177223206, + "learning_rate": 4.545114825383884e-05, + "loss": 3.6863, + "step": 64255 + }, + { + "epoch": 4.366082348145128, + "grad_norm": 0.1958242803812027, + "learning_rate": 4.544690175295557e-05, + "loss": 3.7101, + "step": 64260 + }, + { + "epoch": 4.36642206821579, + "grad_norm": 0.16646061837673187, + "learning_rate": 4.54426552520723e-05, + "loss": 3.7688, + "step": 64265 + }, + { + "epoch": 4.366761788286452, + "grad_norm": 0.2431248128414154, + "learning_rate": 4.543840875118902e-05, + "loss": 3.6708, + "step": 64270 + }, + { + "epoch": 4.367101508357114, + "grad_norm": 0.1635742336511612, + "learning_rate": 4.543416225030575e-05, + "loss": 4.0592, + "step": 64275 + }, + { + "epoch": 4.367441228427776, + "grad_norm": 0.20767326653003693, + "learning_rate": 4.542991574942248e-05, + "loss": 3.8213, + "step": 64280 + }, + { + "epoch": 4.367780948498437, + "grad_norm": 0.1486440896987915, + "learning_rate": 4.54256692485392e-05, + "loss": 3.7811, + "step": 64285 + }, + { + "epoch": 4.368120668569099, + "grad_norm": 0.2754456102848053, + "learning_rate": 4.542142274765594e-05, + "loss": 3.8635, + "step": 64290 + }, + { + "epoch": 4.368460388639761, + "grad_norm": 0.18012739717960358, + "learning_rate": 4.5417176246772665e-05, + "loss": 3.7864, + "step": 64295 + }, + { + "epoch": 4.368800108710422, + "grad_norm": 0.18072409927845, + "learning_rate": 4.5412929745889386e-05, + "loss": 4.0243, + "step": 64300 + }, + { + "epoch": 4.369139828781084, + "grad_norm": 0.17919200658798218, + "learning_rate": 4.5408683245006114e-05, + "loss": 4.1947, + "step": 64305 + }, + { + "epoch": 4.369479548851746, + "grad_norm": 0.32311105728149414, + "learning_rate": 4.540443674412285e-05, + "loss": 3.833, + "step": 64310 + }, + { + "epoch": 4.369819268922408, + "grad_norm": 0.15201586484909058, + "learning_rate": 4.540019024323957e-05, + "loss": 3.5609, + "step": 64315 + }, + { + "epoch": 4.37015898899307, + "grad_norm": 0.17027153074741364, + "learning_rate": 4.53959437423563e-05, + "loss": 4.0612, + "step": 64320 + }, + { + "epoch": 4.370498709063732, + "grad_norm": 0.15223252773284912, + "learning_rate": 4.539169724147303e-05, + "loss": 3.7527, + "step": 64325 + }, + { + "epoch": 4.370838429134393, + "grad_norm": 0.1284339725971222, + "learning_rate": 4.5387450740589754e-05, + "loss": 4.0242, + "step": 64330 + }, + { + "epoch": 4.371178149205055, + "grad_norm": 0.1987733542919159, + "learning_rate": 4.538320423970648e-05, + "loss": 3.9939, + "step": 64335 + }, + { + "epoch": 4.371517869275717, + "grad_norm": 0.2073649913072586, + "learning_rate": 4.537895773882321e-05, + "loss": 3.678, + "step": 64340 + }, + { + "epoch": 4.371857589346378, + "grad_norm": 0.20182853937149048, + "learning_rate": 4.537471123793994e-05, + "loss": 4.2932, + "step": 64345 + }, + { + "epoch": 4.37219730941704, + "grad_norm": 0.1644362062215805, + "learning_rate": 4.5370464737056666e-05, + "loss": 3.8035, + "step": 64350 + }, + { + "epoch": 4.372537029487702, + "grad_norm": 0.15812651813030243, + "learning_rate": 4.5366218236173394e-05, + "loss": 3.9485, + "step": 64355 + }, + { + "epoch": 4.372876749558364, + "grad_norm": 0.19249136745929718, + "learning_rate": 4.536197173529012e-05, + "loss": 3.8178, + "step": 64360 + }, + { + "epoch": 4.373216469629026, + "grad_norm": 0.1895420402288437, + "learning_rate": 4.535772523440685e-05, + "loss": 4.0099, + "step": 64365 + }, + { + "epoch": 4.373556189699688, + "grad_norm": 0.1484731137752533, + "learning_rate": 4.535347873352358e-05, + "loss": 3.7264, + "step": 64370 + }, + { + "epoch": 4.373895909770349, + "grad_norm": 0.17307431995868683, + "learning_rate": 4.5349232232640306e-05, + "loss": 4.0683, + "step": 64375 + }, + { + "epoch": 4.374235629841011, + "grad_norm": 0.30109527707099915, + "learning_rate": 4.5344985731757034e-05, + "loss": 4.0216, + "step": 64380 + }, + { + "epoch": 4.374575349911673, + "grad_norm": 0.41063690185546875, + "learning_rate": 4.534073923087376e-05, + "loss": 3.5936, + "step": 64385 + }, + { + "epoch": 4.374915069982334, + "grad_norm": 0.20717643201351166, + "learning_rate": 4.533649272999049e-05, + "loss": 3.8926, + "step": 64390 + }, + { + "epoch": 4.375254790052996, + "grad_norm": 0.2417864352464676, + "learning_rate": 4.533224622910722e-05, + "loss": 4.0263, + "step": 64395 + }, + { + "epoch": 4.375594510123658, + "grad_norm": 0.17724061012268066, + "learning_rate": 4.5327999728223946e-05, + "loss": 3.8276, + "step": 64400 + }, + { + "epoch": 4.37593423019432, + "grad_norm": 0.17355632781982422, + "learning_rate": 4.5323753227340674e-05, + "loss": 3.8688, + "step": 64405 + }, + { + "epoch": 4.376273950264982, + "grad_norm": 0.15952162444591522, + "learning_rate": 4.53195067264574e-05, + "loss": 3.6036, + "step": 64410 + }, + { + "epoch": 4.376613670335644, + "grad_norm": 0.16404397785663605, + "learning_rate": 4.531526022557413e-05, + "loss": 3.8305, + "step": 64415 + }, + { + "epoch": 4.376953390406305, + "grad_norm": 0.16990377008914948, + "learning_rate": 4.531101372469086e-05, + "loss": 3.9468, + "step": 64420 + }, + { + "epoch": 4.377293110476967, + "grad_norm": 0.22693264484405518, + "learning_rate": 4.5306767223807586e-05, + "loss": 3.988, + "step": 64425 + }, + { + "epoch": 4.377632830547629, + "grad_norm": 0.19846023619174957, + "learning_rate": 4.5302520722924314e-05, + "loss": 4.0957, + "step": 64430 + }, + { + "epoch": 4.37797255061829, + "grad_norm": 0.26753196120262146, + "learning_rate": 4.529827422204104e-05, + "loss": 3.9611, + "step": 64435 + }, + { + "epoch": 4.378312270688952, + "grad_norm": 0.1850920021533966, + "learning_rate": 4.5294027721157764e-05, + "loss": 3.7974, + "step": 64440 + }, + { + "epoch": 4.378651990759614, + "grad_norm": 0.154728502035141, + "learning_rate": 4.52897812202745e-05, + "loss": 3.6763, + "step": 64445 + }, + { + "epoch": 4.378991710830276, + "grad_norm": 0.1750163435935974, + "learning_rate": 4.5285534719391227e-05, + "loss": 4.1498, + "step": 64450 + }, + { + "epoch": 4.379331430900938, + "grad_norm": 0.14763686060905457, + "learning_rate": 4.528128821850795e-05, + "loss": 3.6207, + "step": 64455 + }, + { + "epoch": 4.3796711509716, + "grad_norm": 0.13829396665096283, + "learning_rate": 4.527704171762468e-05, + "loss": 3.7742, + "step": 64460 + }, + { + "epoch": 4.380010871042261, + "grad_norm": 0.2863059937953949, + "learning_rate": 4.527279521674141e-05, + "loss": 4.0033, + "step": 64465 + }, + { + "epoch": 4.380350591112923, + "grad_norm": 0.1643550843000412, + "learning_rate": 4.526854871585813e-05, + "loss": 3.6671, + "step": 64470 + }, + { + "epoch": 4.380690311183585, + "grad_norm": 0.15951402485370636, + "learning_rate": 4.526430221497486e-05, + "loss": 3.712, + "step": 64475 + }, + { + "epoch": 4.381030031254246, + "grad_norm": 0.20147660374641418, + "learning_rate": 4.5260055714091595e-05, + "loss": 3.8488, + "step": 64480 + }, + { + "epoch": 4.381369751324908, + "grad_norm": 0.1724211871623993, + "learning_rate": 4.5255809213208316e-05, + "loss": 3.8612, + "step": 64485 + }, + { + "epoch": 4.38170947139557, + "grad_norm": 0.19846458733081818, + "learning_rate": 4.5251562712325044e-05, + "loss": 3.926, + "step": 64490 + }, + { + "epoch": 4.382049191466232, + "grad_norm": 0.23736421763896942, + "learning_rate": 4.524731621144178e-05, + "loss": 3.8129, + "step": 64495 + }, + { + "epoch": 4.382388911536894, + "grad_norm": 0.18045726418495178, + "learning_rate": 4.52430697105585e-05, + "loss": 3.6966, + "step": 64500 + }, + { + "epoch": 4.382728631607556, + "grad_norm": 0.17242898046970367, + "learning_rate": 4.523882320967523e-05, + "loss": 3.9266, + "step": 64505 + }, + { + "epoch": 4.383068351678217, + "grad_norm": 0.4657038450241089, + "learning_rate": 4.5234576708791956e-05, + "loss": 3.7327, + "step": 64510 + }, + { + "epoch": 4.383408071748879, + "grad_norm": 0.21524818241596222, + "learning_rate": 4.5230330207908684e-05, + "loss": 3.8889, + "step": 64515 + }, + { + "epoch": 4.383747791819541, + "grad_norm": 0.4002414643764496, + "learning_rate": 4.522608370702541e-05, + "loss": 3.8478, + "step": 64520 + }, + { + "epoch": 4.384087511890202, + "grad_norm": 0.2000495046377182, + "learning_rate": 4.522183720614214e-05, + "loss": 4.1403, + "step": 64525 + }, + { + "epoch": 4.384427231960864, + "grad_norm": 0.1385493278503418, + "learning_rate": 4.521759070525887e-05, + "loss": 4.1329, + "step": 64530 + }, + { + "epoch": 4.384766952031526, + "grad_norm": 0.19468308985233307, + "learning_rate": 4.5213344204375596e-05, + "loss": 3.759, + "step": 64535 + }, + { + "epoch": 4.385106672102188, + "grad_norm": 0.1774033159017563, + "learning_rate": 4.5209097703492324e-05, + "loss": 4.0345, + "step": 64540 + }, + { + "epoch": 4.38544639217285, + "grad_norm": 0.21716150641441345, + "learning_rate": 4.520485120260905e-05, + "loss": 3.7512, + "step": 64545 + }, + { + "epoch": 4.385786112243512, + "grad_norm": 0.221731498837471, + "learning_rate": 4.520060470172578e-05, + "loss": 3.9595, + "step": 64550 + }, + { + "epoch": 4.386125832314173, + "grad_norm": 0.19579023122787476, + "learning_rate": 4.519635820084251e-05, + "loss": 3.7619, + "step": 64555 + }, + { + "epoch": 4.386465552384835, + "grad_norm": 0.149275004863739, + "learning_rate": 4.5192111699959236e-05, + "loss": 3.7579, + "step": 64560 + }, + { + "epoch": 4.386805272455497, + "grad_norm": 0.17280708253383636, + "learning_rate": 4.5187865199075964e-05, + "loss": 4.0535, + "step": 64565 + }, + { + "epoch": 4.387144992526158, + "grad_norm": 0.18074041604995728, + "learning_rate": 4.518361869819269e-05, + "loss": 3.8532, + "step": 64570 + }, + { + "epoch": 4.38748471259682, + "grad_norm": 0.17666473984718323, + "learning_rate": 4.517937219730942e-05, + "loss": 3.7997, + "step": 64575 + }, + { + "epoch": 4.3878244326674825, + "grad_norm": 0.1811911016702652, + "learning_rate": 4.517512569642615e-05, + "loss": 3.9316, + "step": 64580 + }, + { + "epoch": 4.388164152738144, + "grad_norm": 0.161920964717865, + "learning_rate": 4.5170879195542876e-05, + "loss": 3.9103, + "step": 64585 + }, + { + "epoch": 4.388503872808806, + "grad_norm": 0.17277222871780396, + "learning_rate": 4.5166632694659604e-05, + "loss": 3.9296, + "step": 64590 + }, + { + "epoch": 4.388843592879467, + "grad_norm": 0.18627125024795532, + "learning_rate": 4.516238619377633e-05, + "loss": 4.0831, + "step": 64595 + }, + { + "epoch": 4.389183312950129, + "grad_norm": 0.14685912430286407, + "learning_rate": 4.515813969289306e-05, + "loss": 3.6876, + "step": 64600 + }, + { + "epoch": 4.389523033020791, + "grad_norm": 0.19572308659553528, + "learning_rate": 4.515389319200979e-05, + "loss": 3.9849, + "step": 64605 + }, + { + "epoch": 4.389862753091452, + "grad_norm": 0.437159925699234, + "learning_rate": 4.514964669112651e-05, + "loss": 3.9571, + "step": 64610 + }, + { + "epoch": 4.390202473162114, + "grad_norm": 0.22666522860527039, + "learning_rate": 4.5145400190243244e-05, + "loss": 4.0623, + "step": 64615 + }, + { + "epoch": 4.390542193232776, + "grad_norm": 0.15031573176383972, + "learning_rate": 4.514115368935997e-05, + "loss": 3.8022, + "step": 64620 + }, + { + "epoch": 4.390881913303438, + "grad_norm": 0.1723521649837494, + "learning_rate": 4.513690718847669e-05, + "loss": 3.9047, + "step": 64625 + }, + { + "epoch": 4.3912216333741, + "grad_norm": 0.15648914873600006, + "learning_rate": 4.513266068759343e-05, + "loss": 3.7477, + "step": 64630 + }, + { + "epoch": 4.391561353444762, + "grad_norm": 0.1580948680639267, + "learning_rate": 4.5128414186710156e-05, + "loss": 3.7173, + "step": 64635 + }, + { + "epoch": 4.391901073515423, + "grad_norm": 0.16181515157222748, + "learning_rate": 4.512416768582688e-05, + "loss": 3.8539, + "step": 64640 + }, + { + "epoch": 4.392240793586085, + "grad_norm": 0.16096416115760803, + "learning_rate": 4.511992118494361e-05, + "loss": 3.7522, + "step": 64645 + }, + { + "epoch": 4.392580513656747, + "grad_norm": 0.2171659618616104, + "learning_rate": 4.511567468406034e-05, + "loss": 3.7442, + "step": 64650 + }, + { + "epoch": 4.392920233727408, + "grad_norm": 0.1828765720129013, + "learning_rate": 4.511142818317706e-05, + "loss": 3.791, + "step": 64655 + }, + { + "epoch": 4.39325995379807, + "grad_norm": 0.13939329981803894, + "learning_rate": 4.510718168229379e-05, + "loss": 3.968, + "step": 64660 + }, + { + "epoch": 4.393599673868732, + "grad_norm": 0.19411924481391907, + "learning_rate": 4.5102935181410524e-05, + "loss": 3.7325, + "step": 64665 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.15508447587490082, + "learning_rate": 4.5098688680527245e-05, + "loss": 3.8606, + "step": 64670 + }, + { + "epoch": 4.394279114010056, + "grad_norm": 0.8154310584068298, + "learning_rate": 4.509444217964397e-05, + "loss": 3.6635, + "step": 64675 + }, + { + "epoch": 4.394618834080718, + "grad_norm": 0.15054085850715637, + "learning_rate": 4.509019567876071e-05, + "loss": 3.9251, + "step": 64680 + }, + { + "epoch": 4.394958554151379, + "grad_norm": 0.15700902044773102, + "learning_rate": 4.508594917787743e-05, + "loss": 4.0806, + "step": 64685 + }, + { + "epoch": 4.395298274222041, + "grad_norm": 0.18396888673305511, + "learning_rate": 4.508170267699416e-05, + "loss": 3.8404, + "step": 64690 + }, + { + "epoch": 4.395637994292703, + "grad_norm": 0.25716859102249146, + "learning_rate": 4.5077456176110885e-05, + "loss": 3.647, + "step": 64695 + }, + { + "epoch": 4.395977714363364, + "grad_norm": 0.25038009881973267, + "learning_rate": 4.507320967522761e-05, + "loss": 3.5693, + "step": 64700 + }, + { + "epoch": 4.396317434434026, + "grad_norm": 0.3093158006668091, + "learning_rate": 4.506896317434434e-05, + "loss": 3.9922, + "step": 64705 + }, + { + "epoch": 4.396657154504688, + "grad_norm": 0.1688445657491684, + "learning_rate": 4.506471667346107e-05, + "loss": 3.7342, + "step": 64710 + }, + { + "epoch": 4.39699687457535, + "grad_norm": 0.9172671437263489, + "learning_rate": 4.50604701725778e-05, + "loss": 3.754, + "step": 64715 + }, + { + "epoch": 4.397336594646012, + "grad_norm": 0.20140312612056732, + "learning_rate": 4.5056223671694525e-05, + "loss": 3.6602, + "step": 64720 + }, + { + "epoch": 4.397676314716674, + "grad_norm": 0.16853733360767365, + "learning_rate": 4.505197717081125e-05, + "loss": 3.7413, + "step": 64725 + }, + { + "epoch": 4.398016034787335, + "grad_norm": 0.15895788371562958, + "learning_rate": 4.504773066992798e-05, + "loss": 3.7106, + "step": 64730 + }, + { + "epoch": 4.398355754857997, + "grad_norm": 0.13511987030506134, + "learning_rate": 4.504348416904471e-05, + "loss": 3.9361, + "step": 64735 + }, + { + "epoch": 4.398695474928659, + "grad_norm": 0.18637563288211823, + "learning_rate": 4.503923766816144e-05, + "loss": 3.6343, + "step": 64740 + }, + { + "epoch": 4.39903519499932, + "grad_norm": 0.20393410325050354, + "learning_rate": 4.5034991167278165e-05, + "loss": 3.8192, + "step": 64745 + }, + { + "epoch": 4.399374915069982, + "grad_norm": 0.14903008937835693, + "learning_rate": 4.5030744666394893e-05, + "loss": 3.7653, + "step": 64750 + }, + { + "epoch": 4.399714635140644, + "grad_norm": 0.21228095889091492, + "learning_rate": 4.502649816551162e-05, + "loss": 3.7838, + "step": 64755 + }, + { + "epoch": 4.400054355211306, + "grad_norm": 0.3368909955024719, + "learning_rate": 4.502225166462835e-05, + "loss": 3.6842, + "step": 64760 + }, + { + "epoch": 4.400394075281968, + "grad_norm": 0.32899773120880127, + "learning_rate": 4.501800516374508e-05, + "loss": 3.7405, + "step": 64765 + }, + { + "epoch": 4.40073379535263, + "grad_norm": 0.1552075892686844, + "learning_rate": 4.5013758662861805e-05, + "loss": 3.6243, + "step": 64770 + }, + { + "epoch": 4.401073515423291, + "grad_norm": 0.1541706770658493, + "learning_rate": 4.5009512161978533e-05, + "loss": 3.7929, + "step": 64775 + }, + { + "epoch": 4.401413235493953, + "grad_norm": 0.1703803539276123, + "learning_rate": 4.500526566109526e-05, + "loss": 3.6033, + "step": 64780 + }, + { + "epoch": 4.401752955564615, + "grad_norm": 0.1957656741142273, + "learning_rate": 4.500101916021199e-05, + "loss": 3.6301, + "step": 64785 + }, + { + "epoch": 4.402092675635276, + "grad_norm": 0.2028890997171402, + "learning_rate": 4.499677265932872e-05, + "loss": 3.8117, + "step": 64790 + }, + { + "epoch": 4.402432395705938, + "grad_norm": 0.171350359916687, + "learning_rate": 4.499252615844544e-05, + "loss": 3.8289, + "step": 64795 + }, + { + "epoch": 4.4027721157766, + "grad_norm": 0.1487458199262619, + "learning_rate": 4.4988279657562173e-05, + "loss": 3.495, + "step": 64800 + }, + { + "epoch": 4.403111835847262, + "grad_norm": 0.1987506002187729, + "learning_rate": 4.49840331566789e-05, + "loss": 3.8265, + "step": 64805 + }, + { + "epoch": 4.403451555917924, + "grad_norm": 0.19535768032073975, + "learning_rate": 4.497978665579562e-05, + "loss": 3.7447, + "step": 64810 + }, + { + "epoch": 4.403791275988586, + "grad_norm": 0.18333536386489868, + "learning_rate": 4.497554015491236e-05, + "loss": 3.821, + "step": 64815 + }, + { + "epoch": 4.404130996059247, + "grad_norm": 0.19042187929153442, + "learning_rate": 4.4971293654029086e-05, + "loss": 3.7412, + "step": 64820 + }, + { + "epoch": 4.404470716129909, + "grad_norm": 0.18661202490329742, + "learning_rate": 4.496704715314581e-05, + "loss": 3.6919, + "step": 64825 + }, + { + "epoch": 4.404810436200571, + "grad_norm": 0.17916837334632874, + "learning_rate": 4.4962800652262535e-05, + "loss": 4.1615, + "step": 64830 + }, + { + "epoch": 4.405150156271232, + "grad_norm": 0.16420435905456543, + "learning_rate": 4.495855415137927e-05, + "loss": 4.0223, + "step": 64835 + }, + { + "epoch": 4.405489876341894, + "grad_norm": 0.20922479033470154, + "learning_rate": 4.495430765049599e-05, + "loss": 3.7499, + "step": 64840 + }, + { + "epoch": 4.4058295964125564, + "grad_norm": 0.21600212156772614, + "learning_rate": 4.495006114961272e-05, + "loss": 3.6992, + "step": 64845 + }, + { + "epoch": 4.406169316483218, + "grad_norm": 0.16446994245052338, + "learning_rate": 4.4945814648729454e-05, + "loss": 3.819, + "step": 64850 + }, + { + "epoch": 4.40650903655388, + "grad_norm": 0.16553792357444763, + "learning_rate": 4.4941568147846175e-05, + "loss": 3.8079, + "step": 64855 + }, + { + "epoch": 4.406848756624542, + "grad_norm": 0.19232071936130524, + "learning_rate": 4.49373216469629e-05, + "loss": 3.6107, + "step": 64860 + }, + { + "epoch": 4.407188476695203, + "grad_norm": 0.7041946053504944, + "learning_rate": 4.493307514607963e-05, + "loss": 3.9399, + "step": 64865 + }, + { + "epoch": 4.407528196765865, + "grad_norm": 0.8754706978797913, + "learning_rate": 4.492882864519636e-05, + "loss": 3.725, + "step": 64870 + }, + { + "epoch": 4.407867916836526, + "grad_norm": 0.18787099421024323, + "learning_rate": 4.492458214431309e-05, + "loss": 3.914, + "step": 64875 + }, + { + "epoch": 4.408207636907188, + "grad_norm": 0.1618921011686325, + "learning_rate": 4.4920335643429815e-05, + "loss": 4.0142, + "step": 64880 + }, + { + "epoch": 4.40854735697785, + "grad_norm": 0.3151029944419861, + "learning_rate": 4.491608914254654e-05, + "loss": 3.9493, + "step": 64885 + }, + { + "epoch": 4.408887077048512, + "grad_norm": 0.30035340785980225, + "learning_rate": 4.491184264166327e-05, + "loss": 3.939, + "step": 64890 + }, + { + "epoch": 4.409226797119174, + "grad_norm": 0.1628868579864502, + "learning_rate": 4.490759614078e-05, + "loss": 3.7689, + "step": 64895 + }, + { + "epoch": 4.409566517189836, + "grad_norm": 0.16398346424102783, + "learning_rate": 4.490334963989673e-05, + "loss": 3.6352, + "step": 64900 + }, + { + "epoch": 4.409906237260497, + "grad_norm": 0.16188940405845642, + "learning_rate": 4.4899103139013455e-05, + "loss": 3.8134, + "step": 64905 + }, + { + "epoch": 4.410245957331159, + "grad_norm": 0.16533192992210388, + "learning_rate": 4.489485663813018e-05, + "loss": 3.8483, + "step": 64910 + }, + { + "epoch": 4.410585677401821, + "grad_norm": 0.18712477385997772, + "learning_rate": 4.489061013724691e-05, + "loss": 3.848, + "step": 64915 + }, + { + "epoch": 4.410925397472482, + "grad_norm": 0.820154070854187, + "learning_rate": 4.488636363636364e-05, + "loss": 3.7864, + "step": 64920 + }, + { + "epoch": 4.411265117543144, + "grad_norm": 0.18852341175079346, + "learning_rate": 4.488211713548037e-05, + "loss": 3.6944, + "step": 64925 + }, + { + "epoch": 4.411604837613806, + "grad_norm": 0.1534544676542282, + "learning_rate": 4.4877870634597095e-05, + "loss": 3.7055, + "step": 64930 + }, + { + "epoch": 4.411944557684468, + "grad_norm": 0.17902405560016632, + "learning_rate": 4.487362413371382e-05, + "loss": 4.0727, + "step": 64935 + }, + { + "epoch": 4.41228427775513, + "grad_norm": 0.17230699956417084, + "learning_rate": 4.486937763283055e-05, + "loss": 3.8884, + "step": 64940 + }, + { + "epoch": 4.412623997825792, + "grad_norm": 7.96125602722168, + "learning_rate": 4.486513113194728e-05, + "loss": 3.6816, + "step": 64945 + }, + { + "epoch": 4.412963717896453, + "grad_norm": 0.4738782048225403, + "learning_rate": 4.486088463106401e-05, + "loss": 3.8148, + "step": 64950 + }, + { + "epoch": 4.413303437967115, + "grad_norm": 0.4344748258590698, + "learning_rate": 4.4856638130180735e-05, + "loss": 3.8273, + "step": 64955 + }, + { + "epoch": 4.413643158037777, + "grad_norm": 0.1851280778646469, + "learning_rate": 4.485239162929746e-05, + "loss": 4.0127, + "step": 64960 + }, + { + "epoch": 4.413982878108438, + "grad_norm": 0.13650447130203247, + "learning_rate": 4.4848145128414184e-05, + "loss": 3.8271, + "step": 64965 + }, + { + "epoch": 4.4143225981791, + "grad_norm": 0.2216905802488327, + "learning_rate": 4.484389862753092e-05, + "loss": 4.0085, + "step": 64970 + }, + { + "epoch": 4.414662318249762, + "grad_norm": 0.14735177159309387, + "learning_rate": 4.483965212664765e-05, + "loss": 3.7373, + "step": 64975 + }, + { + "epoch": 4.415002038320424, + "grad_norm": 0.1465175598859787, + "learning_rate": 4.483540562576437e-05, + "loss": 3.9222, + "step": 64980 + }, + { + "epoch": 4.415341758391086, + "grad_norm": 0.20595934987068176, + "learning_rate": 4.48311591248811e-05, + "loss": 3.9479, + "step": 64985 + }, + { + "epoch": 4.415681478461748, + "grad_norm": 0.19248893857002258, + "learning_rate": 4.482691262399783e-05, + "loss": 4.0748, + "step": 64990 + }, + { + "epoch": 4.416021198532409, + "grad_norm": 0.16699986159801483, + "learning_rate": 4.482266612311455e-05, + "loss": 3.917, + "step": 64995 + }, + { + "epoch": 4.416360918603071, + "grad_norm": 0.7807731032371521, + "learning_rate": 4.481841962223128e-05, + "loss": 3.6242, + "step": 65000 + }, + { + "epoch": 4.416700638673733, + "grad_norm": 0.22778531908988953, + "learning_rate": 4.4814173121348015e-05, + "loss": 3.9186, + "step": 65005 + }, + { + "epoch": 4.417040358744394, + "grad_norm": 0.18270239233970642, + "learning_rate": 4.4809926620464736e-05, + "loss": 3.731, + "step": 65010 + }, + { + "epoch": 4.417380078815056, + "grad_norm": 0.1799401193857193, + "learning_rate": 4.4805680119581464e-05, + "loss": 3.9865, + "step": 65015 + }, + { + "epoch": 4.417719798885718, + "grad_norm": 0.18545390665531158, + "learning_rate": 4.48014336186982e-05, + "loss": 3.9449, + "step": 65020 + }, + { + "epoch": 4.41805951895638, + "grad_norm": 0.16816480457782745, + "learning_rate": 4.479718711781492e-05, + "loss": 3.9513, + "step": 65025 + }, + { + "epoch": 4.418399239027042, + "grad_norm": 3.299384117126465, + "learning_rate": 4.479294061693165e-05, + "loss": 3.8431, + "step": 65030 + }, + { + "epoch": 4.418738959097704, + "grad_norm": 0.15210339426994324, + "learning_rate": 4.478869411604838e-05, + "loss": 3.7141, + "step": 65035 + }, + { + "epoch": 4.419078679168365, + "grad_norm": 1.343281865119934, + "learning_rate": 4.4784447615165104e-05, + "loss": 3.6512, + "step": 65040 + }, + { + "epoch": 4.419418399239027, + "grad_norm": 0.21212099492549896, + "learning_rate": 4.478020111428183e-05, + "loss": 3.6128, + "step": 65045 + }, + { + "epoch": 4.419758119309689, + "grad_norm": 0.19508983194828033, + "learning_rate": 4.477595461339856e-05, + "loss": 3.8556, + "step": 65050 + }, + { + "epoch": 4.42009783938035, + "grad_norm": 0.19622667133808136, + "learning_rate": 4.477170811251529e-05, + "loss": 4.0225, + "step": 65055 + }, + { + "epoch": 4.420437559451012, + "grad_norm": 0.1657835692167282, + "learning_rate": 4.4767461611632016e-05, + "loss": 3.9506, + "step": 65060 + }, + { + "epoch": 4.420777279521674, + "grad_norm": 0.31122350692749023, + "learning_rate": 4.4763215110748744e-05, + "loss": 3.865, + "step": 65065 + }, + { + "epoch": 4.421116999592336, + "grad_norm": 0.16528479754924774, + "learning_rate": 4.475896860986547e-05, + "loss": 3.7086, + "step": 65070 + }, + { + "epoch": 4.421456719662998, + "grad_norm": 0.19680266082286835, + "learning_rate": 4.47547221089822e-05, + "loss": 3.8099, + "step": 65075 + }, + { + "epoch": 4.42179643973366, + "grad_norm": 0.19509656727313995, + "learning_rate": 4.475047560809893e-05, + "loss": 3.7216, + "step": 65080 + }, + { + "epoch": 4.422136159804321, + "grad_norm": 0.2005830556154251, + "learning_rate": 4.4746229107215656e-05, + "loss": 3.9132, + "step": 65085 + }, + { + "epoch": 4.422475879874983, + "grad_norm": 0.1498839557170868, + "learning_rate": 4.4741982606332384e-05, + "loss": 3.3755, + "step": 65090 + }, + { + "epoch": 4.422815599945645, + "grad_norm": 0.1914321929216385, + "learning_rate": 4.473773610544911e-05, + "loss": 3.9034, + "step": 65095 + }, + { + "epoch": 4.423155320016306, + "grad_norm": 0.16493846476078033, + "learning_rate": 4.473348960456584e-05, + "loss": 3.9537, + "step": 65100 + }, + { + "epoch": 4.423495040086968, + "grad_norm": 0.1951073408126831, + "learning_rate": 4.472924310368257e-05, + "loss": 3.8322, + "step": 65105 + }, + { + "epoch": 4.42383476015763, + "grad_norm": 0.14953859150409698, + "learning_rate": 4.4724996602799296e-05, + "loss": 3.9072, + "step": 65110 + }, + { + "epoch": 4.424174480228292, + "grad_norm": 0.20496632158756256, + "learning_rate": 4.4720750101916024e-05, + "loss": 3.9206, + "step": 65115 + }, + { + "epoch": 4.424514200298954, + "grad_norm": 1.2122220993041992, + "learning_rate": 4.471650360103275e-05, + "loss": 3.6445, + "step": 65120 + }, + { + "epoch": 4.424853920369616, + "grad_norm": 0.21890828013420105, + "learning_rate": 4.471225710014948e-05, + "loss": 3.6128, + "step": 65125 + }, + { + "epoch": 4.425193640440277, + "grad_norm": 0.16107819974422455, + "learning_rate": 4.470801059926621e-05, + "loss": 3.701, + "step": 65130 + }, + { + "epoch": 4.425533360510939, + "grad_norm": 0.18812116980552673, + "learning_rate": 4.4703764098382936e-05, + "loss": 3.7493, + "step": 65135 + }, + { + "epoch": 4.425873080581601, + "grad_norm": 0.1989978849887848, + "learning_rate": 4.4699517597499664e-05, + "loss": 3.8067, + "step": 65140 + }, + { + "epoch": 4.426212800652262, + "grad_norm": 0.292980432510376, + "learning_rate": 4.469527109661639e-05, + "loss": 3.9436, + "step": 65145 + }, + { + "epoch": 4.426552520722924, + "grad_norm": 0.18063224852085114, + "learning_rate": 4.4691024595733114e-05, + "loss": 3.7857, + "step": 65150 + }, + { + "epoch": 4.4268922407935865, + "grad_norm": 0.1678568571805954, + "learning_rate": 4.468677809484985e-05, + "loss": 3.8107, + "step": 65155 + }, + { + "epoch": 4.427231960864248, + "grad_norm": 0.1741386204957962, + "learning_rate": 4.4682531593966576e-05, + "loss": 4.2276, + "step": 65160 + }, + { + "epoch": 4.42757168093491, + "grad_norm": 0.17265067994594574, + "learning_rate": 4.46782850930833e-05, + "loss": 4.2265, + "step": 65165 + }, + { + "epoch": 4.427911401005572, + "grad_norm": 0.18832285702228546, + "learning_rate": 4.467403859220003e-05, + "loss": 4.0297, + "step": 65170 + }, + { + "epoch": 4.428251121076233, + "grad_norm": 0.21103499829769135, + "learning_rate": 4.466979209131676e-05, + "loss": 3.6371, + "step": 65175 + }, + { + "epoch": 4.428590841146895, + "grad_norm": 0.16439288854599, + "learning_rate": 4.466554559043348e-05, + "loss": 3.8685, + "step": 65180 + }, + { + "epoch": 4.428930561217557, + "grad_norm": 0.15815530717372894, + "learning_rate": 4.466129908955021e-05, + "loss": 3.9842, + "step": 65185 + }, + { + "epoch": 4.429270281288218, + "grad_norm": 0.17832379043102264, + "learning_rate": 4.4657052588666945e-05, + "loss": 3.9519, + "step": 65190 + }, + { + "epoch": 4.42961000135888, + "grad_norm": 0.15843147039413452, + "learning_rate": 4.4652806087783666e-05, + "loss": 3.6328, + "step": 65195 + }, + { + "epoch": 4.4299497214295425, + "grad_norm": 0.1635437309741974, + "learning_rate": 4.4648559586900394e-05, + "loss": 4.0572, + "step": 65200 + }, + { + "epoch": 4.430289441500204, + "grad_norm": 0.4721279442310333, + "learning_rate": 4.464431308601713e-05, + "loss": 3.8671, + "step": 65205 + }, + { + "epoch": 4.430629161570866, + "grad_norm": 0.4514354467391968, + "learning_rate": 4.464006658513385e-05, + "loss": 3.6499, + "step": 65210 + }, + { + "epoch": 4.430968881641528, + "grad_norm": 0.238425150513649, + "learning_rate": 4.463582008425058e-05, + "loss": 4.1825, + "step": 65215 + }, + { + "epoch": 4.431308601712189, + "grad_norm": 0.15588052570819855, + "learning_rate": 4.4631573583367306e-05, + "loss": 3.9449, + "step": 65220 + }, + { + "epoch": 4.431648321782851, + "grad_norm": 0.1698663830757141, + "learning_rate": 4.4627327082484034e-05, + "loss": 3.8584, + "step": 65225 + }, + { + "epoch": 4.431988041853513, + "grad_norm": 0.27624863386154175, + "learning_rate": 4.462308058160076e-05, + "loss": 3.8831, + "step": 65230 + }, + { + "epoch": 4.432327761924174, + "grad_norm": 0.19847244024276733, + "learning_rate": 4.461883408071749e-05, + "loss": 4.0822, + "step": 65235 + }, + { + "epoch": 4.432667481994836, + "grad_norm": 0.20625749230384827, + "learning_rate": 4.461458757983422e-05, + "loss": 3.8881, + "step": 65240 + }, + { + "epoch": 4.4330072020654985, + "grad_norm": 0.19253915548324585, + "learning_rate": 4.4610341078950946e-05, + "loss": 3.8613, + "step": 65245 + }, + { + "epoch": 4.43334692213616, + "grad_norm": 0.16986192762851715, + "learning_rate": 4.4606094578067674e-05, + "loss": 4.2534, + "step": 65250 + }, + { + "epoch": 4.433686642206822, + "grad_norm": 0.1833469718694687, + "learning_rate": 4.46018480771844e-05, + "loss": 3.9019, + "step": 65255 + }, + { + "epoch": 4.434026362277484, + "grad_norm": 0.20575647056102753, + "learning_rate": 4.459760157630113e-05, + "loss": 3.6804, + "step": 65260 + }, + { + "epoch": 4.434366082348145, + "grad_norm": 0.21043989062309265, + "learning_rate": 4.459335507541786e-05, + "loss": 4.0751, + "step": 65265 + }, + { + "epoch": 4.434705802418807, + "grad_norm": 0.15085384249687195, + "learning_rate": 4.4589108574534586e-05, + "loss": 3.9651, + "step": 65270 + }, + { + "epoch": 4.435045522489468, + "grad_norm": 0.19896212220191956, + "learning_rate": 4.4584862073651314e-05, + "loss": 3.7713, + "step": 65275 + }, + { + "epoch": 4.43538524256013, + "grad_norm": 0.20560237765312195, + "learning_rate": 4.458061557276804e-05, + "loss": 3.7291, + "step": 65280 + }, + { + "epoch": 4.435724962630792, + "grad_norm": 0.21134543418884277, + "learning_rate": 4.457636907188477e-05, + "loss": 4.301, + "step": 65285 + }, + { + "epoch": 4.436064682701454, + "grad_norm": 0.19460853934288025, + "learning_rate": 4.45721225710015e-05, + "loss": 3.6024, + "step": 65290 + }, + { + "epoch": 4.436404402772116, + "grad_norm": 0.22148054838180542, + "learning_rate": 4.4567876070118226e-05, + "loss": 3.8395, + "step": 65295 + }, + { + "epoch": 4.436744122842778, + "grad_norm": 0.16041770577430725, + "learning_rate": 4.4563629569234954e-05, + "loss": 3.7933, + "step": 65300 + }, + { + "epoch": 4.437083842913439, + "grad_norm": 0.17476055026054382, + "learning_rate": 4.455938306835168e-05, + "loss": 3.9709, + "step": 65305 + }, + { + "epoch": 4.437423562984101, + "grad_norm": 0.28541669249534607, + "learning_rate": 4.455513656746841e-05, + "loss": 3.8956, + "step": 65310 + }, + { + "epoch": 4.437763283054763, + "grad_norm": 0.23686617612838745, + "learning_rate": 4.455089006658514e-05, + "loss": 3.772, + "step": 65315 + }, + { + "epoch": 4.438103003125424, + "grad_norm": 0.15846168994903564, + "learning_rate": 4.454664356570186e-05, + "loss": 4.0185, + "step": 65320 + }, + { + "epoch": 4.438442723196086, + "grad_norm": 3.512010097503662, + "learning_rate": 4.4542397064818594e-05, + "loss": 3.6826, + "step": 65325 + }, + { + "epoch": 4.438782443266748, + "grad_norm": 0.29208171367645264, + "learning_rate": 4.453815056393532e-05, + "loss": 3.7723, + "step": 65330 + }, + { + "epoch": 4.43912216333741, + "grad_norm": 0.1725578010082245, + "learning_rate": 4.453390406305204e-05, + "loss": 4.0508, + "step": 65335 + }, + { + "epoch": 4.439461883408072, + "grad_norm": 0.3495625853538513, + "learning_rate": 4.452965756216878e-05, + "loss": 3.6918, + "step": 65340 + }, + { + "epoch": 4.439801603478734, + "grad_norm": 0.17628683149814606, + "learning_rate": 4.4525411061285506e-05, + "loss": 3.7795, + "step": 65345 + }, + { + "epoch": 4.440141323549395, + "grad_norm": 0.14918209612369537, + "learning_rate": 4.452116456040223e-05, + "loss": 3.9294, + "step": 65350 + }, + { + "epoch": 4.440481043620057, + "grad_norm": 0.17443886399269104, + "learning_rate": 4.4516918059518955e-05, + "loss": 3.8253, + "step": 65355 + }, + { + "epoch": 4.440820763690719, + "grad_norm": 0.18254737555980682, + "learning_rate": 4.451267155863569e-05, + "loss": 3.7924, + "step": 65360 + }, + { + "epoch": 4.44116048376138, + "grad_norm": 0.1656346172094345, + "learning_rate": 4.450842505775241e-05, + "loss": 3.9569, + "step": 65365 + }, + { + "epoch": 4.441500203832042, + "grad_norm": 0.17800576984882355, + "learning_rate": 4.450417855686914e-05, + "loss": 3.6966, + "step": 65370 + }, + { + "epoch": 4.441839923902704, + "grad_norm": 0.20836572349071503, + "learning_rate": 4.4499932055985874e-05, + "loss": 3.6788, + "step": 65375 + }, + { + "epoch": 4.442179643973366, + "grad_norm": 0.1775585114955902, + "learning_rate": 4.4495685555102595e-05, + "loss": 3.8693, + "step": 65380 + }, + { + "epoch": 4.442519364044028, + "grad_norm": 0.16763387620449066, + "learning_rate": 4.449143905421932e-05, + "loss": 3.6963, + "step": 65385 + }, + { + "epoch": 4.44285908411469, + "grad_norm": 0.16370724141597748, + "learning_rate": 4.448719255333605e-05, + "loss": 3.9399, + "step": 65390 + }, + { + "epoch": 4.443198804185351, + "grad_norm": 0.1865420937538147, + "learning_rate": 4.448294605245278e-05, + "loss": 3.9664, + "step": 65395 + }, + { + "epoch": 4.443538524256013, + "grad_norm": 2.4574074745178223, + "learning_rate": 4.447869955156951e-05, + "loss": 3.9012, + "step": 65400 + }, + { + "epoch": 4.443878244326675, + "grad_norm": 0.6160339713096619, + "learning_rate": 4.4474453050686235e-05, + "loss": 3.7709, + "step": 65405 + }, + { + "epoch": 4.444217964397336, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.447020654980296e-05, + "loss": 3.7412, + "step": 65410 + }, + { + "epoch": 4.444557684467998, + "grad_norm": 0.18710096180438995, + "learning_rate": 4.446596004891969e-05, + "loss": 3.8211, + "step": 65415 + }, + { + "epoch": 4.4448974045386604, + "grad_norm": 0.2858206629753113, + "learning_rate": 4.446171354803642e-05, + "loss": 3.8632, + "step": 65420 + }, + { + "epoch": 4.445237124609322, + "grad_norm": 0.1773625612258911, + "learning_rate": 4.445746704715315e-05, + "loss": 4.02, + "step": 65425 + }, + { + "epoch": 4.445576844679984, + "grad_norm": 0.16711844503879547, + "learning_rate": 4.4453220546269875e-05, + "loss": 3.8704, + "step": 65430 + }, + { + "epoch": 4.445916564750646, + "grad_norm": 0.1956503838300705, + "learning_rate": 4.44489740453866e-05, + "loss": 3.8933, + "step": 65435 + }, + { + "epoch": 4.446256284821307, + "grad_norm": 0.13019658625125885, + "learning_rate": 4.444472754450333e-05, + "loss": 3.9195, + "step": 65440 + }, + { + "epoch": 4.446596004891969, + "grad_norm": 0.18112193048000336, + "learning_rate": 4.444048104362006e-05, + "loss": 3.9825, + "step": 65445 + }, + { + "epoch": 4.446935724962631, + "grad_norm": 0.1556452065706253, + "learning_rate": 4.443623454273679e-05, + "loss": 3.8367, + "step": 65450 + }, + { + "epoch": 4.447275445033292, + "grad_norm": 0.1670980453491211, + "learning_rate": 4.4431988041853515e-05, + "loss": 3.9833, + "step": 65455 + }, + { + "epoch": 4.447615165103954, + "grad_norm": 0.15709826350212097, + "learning_rate": 4.4427741540970243e-05, + "loss": 3.5275, + "step": 65460 + }, + { + "epoch": 4.4479548851746165, + "grad_norm": 0.20001456141471863, + "learning_rate": 4.442349504008697e-05, + "loss": 4.0942, + "step": 65465 + }, + { + "epoch": 4.448294605245278, + "grad_norm": 0.37681952118873596, + "learning_rate": 4.44192485392037e-05, + "loss": 3.8069, + "step": 65470 + }, + { + "epoch": 4.44863432531594, + "grad_norm": 0.4273237884044647, + "learning_rate": 4.441500203832043e-05, + "loss": 3.7972, + "step": 65475 + }, + { + "epoch": 4.448974045386602, + "grad_norm": 0.29301390051841736, + "learning_rate": 4.4410755537437155e-05, + "loss": 3.8729, + "step": 65480 + }, + { + "epoch": 4.449313765457263, + "grad_norm": 0.14922615885734558, + "learning_rate": 4.4406509036553883e-05, + "loss": 3.5267, + "step": 65485 + }, + { + "epoch": 4.449653485527925, + "grad_norm": 1.388907551765442, + "learning_rate": 4.4402262535670605e-05, + "loss": 3.9067, + "step": 65490 + }, + { + "epoch": 4.449993205598587, + "grad_norm": 0.1586804836988449, + "learning_rate": 4.439801603478734e-05, + "loss": 3.9281, + "step": 65495 + }, + { + "epoch": 4.450332925669248, + "grad_norm": 0.13560786843299866, + "learning_rate": 4.439376953390407e-05, + "loss": 3.8065, + "step": 65500 + }, + { + "epoch": 4.45067264573991, + "grad_norm": 0.1426897495985031, + "learning_rate": 4.438952303302079e-05, + "loss": 3.8128, + "step": 65505 + }, + { + "epoch": 4.4510123658105725, + "grad_norm": 0.2951332628726959, + "learning_rate": 4.4385276532137523e-05, + "loss": 3.7182, + "step": 65510 + }, + { + "epoch": 4.451352085881234, + "grad_norm": 0.18212130665779114, + "learning_rate": 4.438103003125425e-05, + "loss": 4.1137, + "step": 65515 + }, + { + "epoch": 4.451691805951896, + "grad_norm": 0.20194345712661743, + "learning_rate": 4.437678353037097e-05, + "loss": 3.8715, + "step": 65520 + }, + { + "epoch": 4.452031526022558, + "grad_norm": 0.20093926787376404, + "learning_rate": 4.43725370294877e-05, + "loss": 3.9094, + "step": 65525 + }, + { + "epoch": 4.452371246093219, + "grad_norm": 0.13732150197029114, + "learning_rate": 4.4368290528604436e-05, + "loss": 3.3742, + "step": 65530 + }, + { + "epoch": 4.452710966163881, + "grad_norm": 0.1585165560245514, + "learning_rate": 4.436404402772116e-05, + "loss": 3.8026, + "step": 65535 + }, + { + "epoch": 4.453050686234543, + "grad_norm": 0.1760946810245514, + "learning_rate": 4.4359797526837885e-05, + "loss": 3.8719, + "step": 65540 + }, + { + "epoch": 4.453390406305204, + "grad_norm": 0.15713360905647278, + "learning_rate": 4.435555102595462e-05, + "loss": 3.9282, + "step": 65545 + }, + { + "epoch": 4.453730126375866, + "grad_norm": 0.15381067991256714, + "learning_rate": 4.435130452507134e-05, + "loss": 3.8508, + "step": 65550 + }, + { + "epoch": 4.454069846446528, + "grad_norm": 1.6171308755874634, + "learning_rate": 4.434705802418807e-05, + "loss": 3.9135, + "step": 65555 + }, + { + "epoch": 4.45440956651719, + "grad_norm": 0.13650082051753998, + "learning_rate": 4.4342811523304804e-05, + "loss": 3.8359, + "step": 65560 + }, + { + "epoch": 4.454749286587852, + "grad_norm": 0.18038137257099152, + "learning_rate": 4.4338565022421525e-05, + "loss": 3.8268, + "step": 65565 + }, + { + "epoch": 4.455089006658513, + "grad_norm": 0.3654062747955322, + "learning_rate": 4.433431852153825e-05, + "loss": 3.9605, + "step": 65570 + }, + { + "epoch": 4.455428726729175, + "grad_norm": 0.4691430628299713, + "learning_rate": 4.433007202065498e-05, + "loss": 3.9801, + "step": 65575 + }, + { + "epoch": 4.455768446799837, + "grad_norm": 0.16862504184246063, + "learning_rate": 4.432582551977171e-05, + "loss": 3.6178, + "step": 65580 + }, + { + "epoch": 4.456108166870498, + "grad_norm": 0.165950208902359, + "learning_rate": 4.432157901888844e-05, + "loss": 3.7753, + "step": 65585 + }, + { + "epoch": 4.45644788694116, + "grad_norm": 0.20840133726596832, + "learning_rate": 4.4317332518005165e-05, + "loss": 3.7861, + "step": 65590 + }, + { + "epoch": 4.456787607011822, + "grad_norm": 0.15816885232925415, + "learning_rate": 4.43130860171219e-05, + "loss": 3.9421, + "step": 65595 + }, + { + "epoch": 4.457127327082484, + "grad_norm": 0.17659921944141388, + "learning_rate": 4.430883951623862e-05, + "loss": 3.9107, + "step": 65600 + }, + { + "epoch": 4.457467047153146, + "grad_norm": 0.9563364386558533, + "learning_rate": 4.430459301535535e-05, + "loss": 3.7854, + "step": 65605 + }, + { + "epoch": 4.457806767223808, + "grad_norm": 0.45992085337638855, + "learning_rate": 4.430034651447208e-05, + "loss": 4.0036, + "step": 65610 + }, + { + "epoch": 4.458146487294469, + "grad_norm": 0.19678451120853424, + "learning_rate": 4.4296100013588805e-05, + "loss": 3.8989, + "step": 65615 + }, + { + "epoch": 4.458486207365131, + "grad_norm": 0.1515466272830963, + "learning_rate": 4.429185351270553e-05, + "loss": 4.01, + "step": 65620 + }, + { + "epoch": 4.458825927435793, + "grad_norm": 0.17139749228954315, + "learning_rate": 4.428760701182226e-05, + "loss": 3.5857, + "step": 65625 + }, + { + "epoch": 4.459165647506454, + "grad_norm": 0.20009830594062805, + "learning_rate": 4.428336051093899e-05, + "loss": 3.899, + "step": 65630 + }, + { + "epoch": 4.459505367577116, + "grad_norm": 0.36213770508766174, + "learning_rate": 4.427911401005572e-05, + "loss": 3.866, + "step": 65635 + }, + { + "epoch": 4.459845087647778, + "grad_norm": 0.14845360815525055, + "learning_rate": 4.4274867509172445e-05, + "loss": 4.011, + "step": 65640 + }, + { + "epoch": 4.46018480771844, + "grad_norm": 0.20403790473937988, + "learning_rate": 4.427062100828917e-05, + "loss": 3.7646, + "step": 65645 + }, + { + "epoch": 4.460524527789102, + "grad_norm": 0.1625368893146515, + "learning_rate": 4.42663745074059e-05, + "loss": 3.7877, + "step": 65650 + }, + { + "epoch": 4.460864247859764, + "grad_norm": 0.3886447548866272, + "learning_rate": 4.426212800652263e-05, + "loss": 3.7451, + "step": 65655 + }, + { + "epoch": 4.461203967930425, + "grad_norm": 0.18566125631332397, + "learning_rate": 4.425788150563936e-05, + "loss": 3.6121, + "step": 65660 + }, + { + "epoch": 4.461543688001087, + "grad_norm": 0.14178207516670227, + "learning_rate": 4.4253635004756085e-05, + "loss": 3.8352, + "step": 65665 + }, + { + "epoch": 4.461883408071749, + "grad_norm": 0.21357586979866028, + "learning_rate": 4.424938850387281e-05, + "loss": 3.8513, + "step": 65670 + }, + { + "epoch": 4.46222312814241, + "grad_norm": 0.1828068345785141, + "learning_rate": 4.4245142002989534e-05, + "loss": 4.0099, + "step": 65675 + }, + { + "epoch": 4.462562848213072, + "grad_norm": 0.13089358806610107, + "learning_rate": 4.424089550210627e-05, + "loss": 3.9493, + "step": 65680 + }, + { + "epoch": 4.462902568283734, + "grad_norm": 0.18007341027259827, + "learning_rate": 4.4236649001223e-05, + "loss": 3.8657, + "step": 65685 + }, + { + "epoch": 4.463242288354396, + "grad_norm": 0.1825505495071411, + "learning_rate": 4.423240250033972e-05, + "loss": 3.8435, + "step": 65690 + }, + { + "epoch": 4.463582008425058, + "grad_norm": 0.6036990880966187, + "learning_rate": 4.422815599945645e-05, + "loss": 3.6229, + "step": 65695 + }, + { + "epoch": 4.46392172849572, + "grad_norm": 0.17641127109527588, + "learning_rate": 4.422390949857318e-05, + "loss": 3.7999, + "step": 65700 + }, + { + "epoch": 4.464261448566381, + "grad_norm": 0.17606143653392792, + "learning_rate": 4.42196629976899e-05, + "loss": 3.7672, + "step": 65705 + }, + { + "epoch": 4.464601168637043, + "grad_norm": 0.1640663892030716, + "learning_rate": 4.421541649680663e-05, + "loss": 3.734, + "step": 65710 + }, + { + "epoch": 4.464940888707705, + "grad_norm": 0.1748036891222, + "learning_rate": 4.4211169995923365e-05, + "loss": 3.8258, + "step": 65715 + }, + { + "epoch": 4.465280608778366, + "grad_norm": 0.19850200414657593, + "learning_rate": 4.4206923495040086e-05, + "loss": 3.9158, + "step": 65720 + }, + { + "epoch": 4.465620328849028, + "grad_norm": 0.21560223400592804, + "learning_rate": 4.4202676994156814e-05, + "loss": 3.7883, + "step": 65725 + }, + { + "epoch": 4.4659600489196905, + "grad_norm": 1.0902297496795654, + "learning_rate": 4.419843049327355e-05, + "loss": 3.7458, + "step": 65730 + }, + { + "epoch": 4.466299768990352, + "grad_norm": 0.19531333446502686, + "learning_rate": 4.419418399239027e-05, + "loss": 3.7291, + "step": 65735 + }, + { + "epoch": 4.466639489061014, + "grad_norm": 0.2026926875114441, + "learning_rate": 4.4189937491507e-05, + "loss": 3.5215, + "step": 65740 + }, + { + "epoch": 4.466979209131676, + "grad_norm": 0.19726797938346863, + "learning_rate": 4.4185690990623726e-05, + "loss": 3.6497, + "step": 65745 + }, + { + "epoch": 4.467318929202337, + "grad_norm": 0.15433305501937866, + "learning_rate": 4.4181444489740454e-05, + "loss": 3.7332, + "step": 65750 + }, + { + "epoch": 4.467658649272999, + "grad_norm": 0.15501590073108673, + "learning_rate": 4.417719798885718e-05, + "loss": 3.7602, + "step": 65755 + }, + { + "epoch": 4.467998369343661, + "grad_norm": 0.16593541204929352, + "learning_rate": 4.417295148797391e-05, + "loss": 4.0404, + "step": 65760 + }, + { + "epoch": 4.468338089414322, + "grad_norm": 0.17606590688228607, + "learning_rate": 4.4168704987090645e-05, + "loss": 3.9526, + "step": 65765 + }, + { + "epoch": 4.468677809484984, + "grad_norm": 0.16877909004688263, + "learning_rate": 4.4164458486207366e-05, + "loss": 3.9508, + "step": 65770 + }, + { + "epoch": 4.4690175295556465, + "grad_norm": 0.2916603982448578, + "learning_rate": 4.4160211985324094e-05, + "loss": 3.8086, + "step": 65775 + }, + { + "epoch": 4.469357249626308, + "grad_norm": 0.15770120918750763, + "learning_rate": 4.415596548444082e-05, + "loss": 3.665, + "step": 65780 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.15767793357372284, + "learning_rate": 4.415171898355755e-05, + "loss": 3.7073, + "step": 65785 + }, + { + "epoch": 4.470036689767632, + "grad_norm": 0.15709777176380157, + "learning_rate": 4.414747248267428e-05, + "loss": 3.7155, + "step": 65790 + }, + { + "epoch": 4.470376409838293, + "grad_norm": 0.1666347235441208, + "learning_rate": 4.4143225981791006e-05, + "loss": 4.0625, + "step": 65795 + }, + { + "epoch": 4.470716129908955, + "grad_norm": 1.657636284828186, + "learning_rate": 4.4138979480907734e-05, + "loss": 3.8271, + "step": 65800 + }, + { + "epoch": 4.471055849979617, + "grad_norm": 0.19187846779823303, + "learning_rate": 4.413473298002446e-05, + "loss": 3.8495, + "step": 65805 + }, + { + "epoch": 4.471395570050278, + "grad_norm": 0.17561307549476624, + "learning_rate": 4.413048647914119e-05, + "loss": 3.8541, + "step": 65810 + }, + { + "epoch": 4.47173529012094, + "grad_norm": 0.1428394615650177, + "learning_rate": 4.412623997825792e-05, + "loss": 3.7309, + "step": 65815 + }, + { + "epoch": 4.4720750101916025, + "grad_norm": 0.21997463703155518, + "learning_rate": 4.4121993477374646e-05, + "loss": 3.7759, + "step": 65820 + }, + { + "epoch": 4.472414730262264, + "grad_norm": 0.17234601080417633, + "learning_rate": 4.4117746976491374e-05, + "loss": 3.7333, + "step": 65825 + }, + { + "epoch": 4.472754450332926, + "grad_norm": 0.1722484976053238, + "learning_rate": 4.41135004756081e-05, + "loss": 3.8674, + "step": 65830 + }, + { + "epoch": 4.473094170403588, + "grad_norm": 0.16585060954093933, + "learning_rate": 4.410925397472483e-05, + "loss": 3.7435, + "step": 65835 + }, + { + "epoch": 4.473433890474249, + "grad_norm": 0.1630508005619049, + "learning_rate": 4.410500747384156e-05, + "loss": 4.0065, + "step": 65840 + }, + { + "epoch": 4.473773610544911, + "grad_norm": 0.2807488739490509, + "learning_rate": 4.410076097295828e-05, + "loss": 3.6927, + "step": 65845 + }, + { + "epoch": 4.474113330615573, + "grad_norm": 0.18997886776924133, + "learning_rate": 4.4096514472075014e-05, + "loss": 3.6211, + "step": 65850 + }, + { + "epoch": 4.474453050686234, + "grad_norm": 0.2843765914440155, + "learning_rate": 4.409226797119174e-05, + "loss": 3.7869, + "step": 65855 + }, + { + "epoch": 4.474792770756896, + "grad_norm": 0.1659715324640274, + "learning_rate": 4.4088021470308464e-05, + "loss": 3.6613, + "step": 65860 + }, + { + "epoch": 4.4751324908275585, + "grad_norm": 0.18929651379585266, + "learning_rate": 4.40837749694252e-05, + "loss": 3.8795, + "step": 65865 + }, + { + "epoch": 4.47547221089822, + "grad_norm": 0.45803359150886536, + "learning_rate": 4.4079528468541926e-05, + "loss": 3.8859, + "step": 65870 + }, + { + "epoch": 4.475811930968882, + "grad_norm": 0.24420157074928284, + "learning_rate": 4.407528196765865e-05, + "loss": 3.9157, + "step": 65875 + }, + { + "epoch": 4.476151651039544, + "grad_norm": 0.15120546519756317, + "learning_rate": 4.4071035466775376e-05, + "loss": 3.9375, + "step": 65880 + }, + { + "epoch": 4.476491371110205, + "grad_norm": 0.18532390892505646, + "learning_rate": 4.406678896589211e-05, + "loss": 3.6495, + "step": 65885 + }, + { + "epoch": 4.476831091180867, + "grad_norm": 0.40922221541404724, + "learning_rate": 4.406254246500883e-05, + "loss": 4.1068, + "step": 65890 + }, + { + "epoch": 4.477170811251529, + "grad_norm": 0.18404367566108704, + "learning_rate": 4.405829596412556e-05, + "loss": 3.8217, + "step": 65895 + }, + { + "epoch": 4.47751053132219, + "grad_norm": 0.19832220673561096, + "learning_rate": 4.4054049463242295e-05, + "loss": 3.7199, + "step": 65900 + }, + { + "epoch": 4.477850251392852, + "grad_norm": 0.19052526354789734, + "learning_rate": 4.4049802962359016e-05, + "loss": 3.7519, + "step": 65905 + }, + { + "epoch": 4.4781899714635145, + "grad_norm": 0.20104871690273285, + "learning_rate": 4.4045556461475744e-05, + "loss": 3.5996, + "step": 65910 + }, + { + "epoch": 4.478529691534176, + "grad_norm": 0.16214823722839355, + "learning_rate": 4.404130996059247e-05, + "loss": 3.9853, + "step": 65915 + }, + { + "epoch": 4.478869411604838, + "grad_norm": 0.19526754319667816, + "learning_rate": 4.40370634597092e-05, + "loss": 3.5843, + "step": 65920 + }, + { + "epoch": 4.4792091316755, + "grad_norm": 0.20356783270835876, + "learning_rate": 4.403281695882593e-05, + "loss": 3.8336, + "step": 65925 + }, + { + "epoch": 4.479548851746161, + "grad_norm": 0.1641458123922348, + "learning_rate": 4.4028570457942656e-05, + "loss": 3.9752, + "step": 65930 + }, + { + "epoch": 4.479888571816823, + "grad_norm": 0.26142510771751404, + "learning_rate": 4.402432395705939e-05, + "loss": 3.782, + "step": 65935 + }, + { + "epoch": 4.480228291887485, + "grad_norm": 0.175393208861351, + "learning_rate": 4.402007745617611e-05, + "loss": 3.7235, + "step": 65940 + }, + { + "epoch": 4.480568011958146, + "grad_norm": 0.18272614479064941, + "learning_rate": 4.401583095529284e-05, + "loss": 3.8196, + "step": 65945 + }, + { + "epoch": 4.480907732028808, + "grad_norm": 0.17925551533699036, + "learning_rate": 4.4011584454409575e-05, + "loss": 3.8131, + "step": 65950 + }, + { + "epoch": 4.4812474520994705, + "grad_norm": 0.21646460890769958, + "learning_rate": 4.4007337953526296e-05, + "loss": 3.8289, + "step": 65955 + }, + { + "epoch": 4.481587172170132, + "grad_norm": 0.21419692039489746, + "learning_rate": 4.4003091452643024e-05, + "loss": 3.8769, + "step": 65960 + }, + { + "epoch": 4.481926892240794, + "grad_norm": 0.16379916667938232, + "learning_rate": 4.399884495175975e-05, + "loss": 3.7871, + "step": 65965 + }, + { + "epoch": 4.482266612311455, + "grad_norm": 0.17601513862609863, + "learning_rate": 4.399459845087648e-05, + "loss": 3.818, + "step": 65970 + }, + { + "epoch": 4.482606332382117, + "grad_norm": 0.17989441752433777, + "learning_rate": 4.399035194999321e-05, + "loss": 3.8833, + "step": 65975 + }, + { + "epoch": 4.482946052452779, + "grad_norm": 0.1750950664281845, + "learning_rate": 4.3986105449109936e-05, + "loss": 3.9224, + "step": 65980 + }, + { + "epoch": 4.48328577252344, + "grad_norm": 1.0322773456573486, + "learning_rate": 4.3981858948226664e-05, + "loss": 3.9324, + "step": 65985 + }, + { + "epoch": 4.483625492594102, + "grad_norm": 0.14345745742321014, + "learning_rate": 4.397761244734339e-05, + "loss": 3.7676, + "step": 65990 + }, + { + "epoch": 4.483965212664764, + "grad_norm": 0.1726580262184143, + "learning_rate": 4.397336594646012e-05, + "loss": 3.7565, + "step": 65995 + }, + { + "epoch": 4.484304932735426, + "grad_norm": 0.19310405850410461, + "learning_rate": 4.396911944557685e-05, + "loss": 3.8467, + "step": 66000 + }, + { + "epoch": 4.484644652806088, + "grad_norm": 0.19398830831050873, + "learning_rate": 4.3964872944693576e-05, + "loss": 3.9307, + "step": 66005 + }, + { + "epoch": 4.48498437287675, + "grad_norm": 0.18298979103565216, + "learning_rate": 4.3960626443810304e-05, + "loss": 3.6994, + "step": 66010 + }, + { + "epoch": 4.485324092947411, + "grad_norm": 0.17128446698188782, + "learning_rate": 4.3956379942927025e-05, + "loss": 3.8519, + "step": 66015 + }, + { + "epoch": 4.485663813018073, + "grad_norm": 0.16013814508914948, + "learning_rate": 4.395213344204376e-05, + "loss": 3.8926, + "step": 66020 + }, + { + "epoch": 4.486003533088735, + "grad_norm": 0.16367492079734802, + "learning_rate": 4.394788694116049e-05, + "loss": 3.6256, + "step": 66025 + }, + { + "epoch": 4.486343253159396, + "grad_norm": 0.3461039960384369, + "learning_rate": 4.394364044027721e-05, + "loss": 3.9424, + "step": 66030 + }, + { + "epoch": 4.486682973230058, + "grad_norm": 0.19321785867214203, + "learning_rate": 4.3939393939393944e-05, + "loss": 3.7224, + "step": 66035 + }, + { + "epoch": 4.4870226933007205, + "grad_norm": 0.2017548382282257, + "learning_rate": 4.393514743851067e-05, + "loss": 3.9537, + "step": 66040 + }, + { + "epoch": 4.487362413371382, + "grad_norm": 0.21928055584430695, + "learning_rate": 4.393090093762739e-05, + "loss": 4.073, + "step": 66045 + }, + { + "epoch": 4.487702133442044, + "grad_norm": 0.19563937187194824, + "learning_rate": 4.392665443674413e-05, + "loss": 3.8271, + "step": 66050 + }, + { + "epoch": 4.488041853512706, + "grad_norm": 0.16597911715507507, + "learning_rate": 4.3922407935860856e-05, + "loss": 3.6397, + "step": 66055 + }, + { + "epoch": 4.488381573583367, + "grad_norm": 0.22918321192264557, + "learning_rate": 4.391816143497758e-05, + "loss": 3.7425, + "step": 66060 + }, + { + "epoch": 4.488721293654029, + "grad_norm": 0.17657753825187683, + "learning_rate": 4.3913914934094305e-05, + "loss": 3.9663, + "step": 66065 + }, + { + "epoch": 4.489061013724691, + "grad_norm": 0.22994957864284515, + "learning_rate": 4.390966843321104e-05, + "loss": 3.9839, + "step": 66070 + }, + { + "epoch": 4.489400733795352, + "grad_norm": 0.18835769593715668, + "learning_rate": 4.390542193232776e-05, + "loss": 3.8752, + "step": 66075 + }, + { + "epoch": 4.489740453866014, + "grad_norm": 0.18590204417705536, + "learning_rate": 4.390117543144449e-05, + "loss": 3.7613, + "step": 66080 + }, + { + "epoch": 4.4900801739366765, + "grad_norm": 0.45114895701408386, + "learning_rate": 4.3896928930561224e-05, + "loss": 3.9179, + "step": 66085 + }, + { + "epoch": 4.490419894007338, + "grad_norm": 0.23692721128463745, + "learning_rate": 4.3892682429677945e-05, + "loss": 3.886, + "step": 66090 + }, + { + "epoch": 4.490759614078, + "grad_norm": 0.17877337336540222, + "learning_rate": 4.388843592879467e-05, + "loss": 3.5172, + "step": 66095 + }, + { + "epoch": 4.491099334148662, + "grad_norm": 0.18668803572654724, + "learning_rate": 4.38841894279114e-05, + "loss": 3.7139, + "step": 66100 + }, + { + "epoch": 4.491439054219323, + "grad_norm": 0.1516217440366745, + "learning_rate": 4.3879942927028136e-05, + "loss": 3.7263, + "step": 66105 + }, + { + "epoch": 4.491778774289985, + "grad_norm": 0.17892104387283325, + "learning_rate": 4.387569642614486e-05, + "loss": 3.7655, + "step": 66110 + }, + { + "epoch": 4.492118494360647, + "grad_norm": 0.1693672239780426, + "learning_rate": 4.3871449925261585e-05, + "loss": 3.8115, + "step": 66115 + }, + { + "epoch": 4.492458214431308, + "grad_norm": 0.15287695825099945, + "learning_rate": 4.386720342437832e-05, + "loss": 4.2317, + "step": 66120 + }, + { + "epoch": 4.49279793450197, + "grad_norm": 0.4040757417678833, + "learning_rate": 4.386295692349504e-05, + "loss": 3.7242, + "step": 66125 + }, + { + "epoch": 4.4931376545726325, + "grad_norm": 0.19549527764320374, + "learning_rate": 4.385871042261177e-05, + "loss": 3.7603, + "step": 66130 + }, + { + "epoch": 4.493477374643294, + "grad_norm": 0.1893242448568344, + "learning_rate": 4.38544639217285e-05, + "loss": 4.0278, + "step": 66135 + }, + { + "epoch": 4.493817094713956, + "grad_norm": 0.17244218289852142, + "learning_rate": 4.3850217420845225e-05, + "loss": 3.886, + "step": 66140 + }, + { + "epoch": 4.494156814784618, + "grad_norm": 0.17113344371318817, + "learning_rate": 4.384597091996195e-05, + "loss": 3.8688, + "step": 66145 + }, + { + "epoch": 4.494496534855279, + "grad_norm": 0.1811506301164627, + "learning_rate": 4.384172441907868e-05, + "loss": 3.8807, + "step": 66150 + }, + { + "epoch": 4.494836254925941, + "grad_norm": 0.13587166368961334, + "learning_rate": 4.383747791819541e-05, + "loss": 3.6887, + "step": 66155 + }, + { + "epoch": 4.495175974996603, + "grad_norm": 0.19294604659080505, + "learning_rate": 4.383323141731214e-05, + "loss": 3.9796, + "step": 66160 + }, + { + "epoch": 4.495515695067264, + "grad_norm": 0.21250830590724945, + "learning_rate": 4.3828984916428865e-05, + "loss": 3.9107, + "step": 66165 + }, + { + "epoch": 4.495855415137926, + "grad_norm": 0.2667093873023987, + "learning_rate": 4.3824738415545593e-05, + "loss": 3.7744, + "step": 66170 + }, + { + "epoch": 4.4961951352085885, + "grad_norm": 0.20284591615200043, + "learning_rate": 4.382049191466232e-05, + "loss": 3.9867, + "step": 66175 + }, + { + "epoch": 4.49653485527925, + "grad_norm": 0.1636538803577423, + "learning_rate": 4.381624541377905e-05, + "loss": 3.8264, + "step": 66180 + }, + { + "epoch": 4.496874575349912, + "grad_norm": 0.19116079807281494, + "learning_rate": 4.381199891289578e-05, + "loss": 3.6688, + "step": 66185 + }, + { + "epoch": 4.497214295420574, + "grad_norm": 0.1961974948644638, + "learning_rate": 4.3807752412012505e-05, + "loss": 4.1123, + "step": 66190 + }, + { + "epoch": 4.497554015491235, + "grad_norm": 0.1579834371805191, + "learning_rate": 4.3803505911129233e-05, + "loss": 3.8662, + "step": 66195 + }, + { + "epoch": 4.497893735561897, + "grad_norm": 1.1306627988815308, + "learning_rate": 4.3799259410245955e-05, + "loss": 3.8963, + "step": 66200 + }, + { + "epoch": 4.498233455632559, + "grad_norm": 0.2089107185602188, + "learning_rate": 4.379501290936269e-05, + "loss": 3.5966, + "step": 66205 + }, + { + "epoch": 4.49857317570322, + "grad_norm": 0.15892164409160614, + "learning_rate": 4.379076640847942e-05, + "loss": 3.9436, + "step": 66210 + }, + { + "epoch": 4.498912895773882, + "grad_norm": 0.15142278373241425, + "learning_rate": 4.378651990759614e-05, + "loss": 3.7156, + "step": 66215 + }, + { + "epoch": 4.4992526158445445, + "grad_norm": 0.1697821319103241, + "learning_rate": 4.3782273406712873e-05, + "loss": 3.8629, + "step": 66220 + }, + { + "epoch": 4.499592335915206, + "grad_norm": 0.23515728116035461, + "learning_rate": 4.37780269058296e-05, + "loss": 4.0038, + "step": 66225 + }, + { + "epoch": 4.499932055985868, + "grad_norm": 0.15634207427501678, + "learning_rate": 4.377378040494632e-05, + "loss": 3.8249, + "step": 66230 + }, + { + "epoch": 4.500271776056529, + "grad_norm": 0.15609455108642578, + "learning_rate": 4.376953390406305e-05, + "loss": 3.7195, + "step": 66235 + }, + { + "epoch": 4.500611496127191, + "grad_norm": 0.24408911168575287, + "learning_rate": 4.3765287403179786e-05, + "loss": 3.6144, + "step": 66240 + }, + { + "epoch": 4.500951216197853, + "grad_norm": 0.1946553736925125, + "learning_rate": 4.376104090229651e-05, + "loss": 3.9223, + "step": 66245 + }, + { + "epoch": 4.501290936268514, + "grad_norm": 0.2983834743499756, + "learning_rate": 4.3756794401413235e-05, + "loss": 3.8358, + "step": 66250 + }, + { + "epoch": 4.501630656339176, + "grad_norm": 0.16022226214408875, + "learning_rate": 4.375254790052997e-05, + "loss": 3.7084, + "step": 66255 + }, + { + "epoch": 4.501970376409838, + "grad_norm": 0.13946005702018738, + "learning_rate": 4.374830139964669e-05, + "loss": 3.8268, + "step": 66260 + }, + { + "epoch": 4.5023100964805, + "grad_norm": 0.17228712141513824, + "learning_rate": 4.374405489876342e-05, + "loss": 3.6719, + "step": 66265 + }, + { + "epoch": 4.502649816551162, + "grad_norm": 0.17888176441192627, + "learning_rate": 4.373980839788015e-05, + "loss": 3.8217, + "step": 66270 + }, + { + "epoch": 4.502989536621824, + "grad_norm": 0.17487025260925293, + "learning_rate": 4.3735561896996875e-05, + "loss": 3.8282, + "step": 66275 + }, + { + "epoch": 4.503329256692485, + "grad_norm": 0.19783534109592438, + "learning_rate": 4.37313153961136e-05, + "loss": 3.9235, + "step": 66280 + }, + { + "epoch": 4.503668976763147, + "grad_norm": 0.16453787684440613, + "learning_rate": 4.372706889523033e-05, + "loss": 3.8579, + "step": 66285 + }, + { + "epoch": 4.504008696833809, + "grad_norm": 0.18161481618881226, + "learning_rate": 4.3722822394347066e-05, + "loss": 3.9408, + "step": 66290 + }, + { + "epoch": 4.50434841690447, + "grad_norm": 0.18898679316043854, + "learning_rate": 4.371857589346379e-05, + "loss": 3.8358, + "step": 66295 + }, + { + "epoch": 4.504688136975132, + "grad_norm": 0.2656009793281555, + "learning_rate": 4.3714329392580515e-05, + "loss": 3.7852, + "step": 66300 + }, + { + "epoch": 4.5050278570457944, + "grad_norm": 0.14816045761108398, + "learning_rate": 4.371008289169724e-05, + "loss": 3.8188, + "step": 66305 + }, + { + "epoch": 4.505367577116456, + "grad_norm": 0.40849265456199646, + "learning_rate": 4.370583639081397e-05, + "loss": 3.5751, + "step": 66310 + }, + { + "epoch": 4.505707297187118, + "grad_norm": 0.18086351454257965, + "learning_rate": 4.37015898899307e-05, + "loss": 3.7425, + "step": 66315 + }, + { + "epoch": 4.50604701725778, + "grad_norm": 0.1983030140399933, + "learning_rate": 4.369734338904743e-05, + "loss": 3.8396, + "step": 66320 + }, + { + "epoch": 4.506386737328441, + "grad_norm": 0.17220385372638702, + "learning_rate": 4.3693096888164155e-05, + "loss": 3.9646, + "step": 66325 + }, + { + "epoch": 4.506726457399103, + "grad_norm": 0.17671333253383636, + "learning_rate": 4.368885038728088e-05, + "loss": 3.8174, + "step": 66330 + }, + { + "epoch": 4.507066177469765, + "grad_norm": 0.14681637287139893, + "learning_rate": 4.368460388639761e-05, + "loss": 3.8921, + "step": 66335 + }, + { + "epoch": 4.507405897540426, + "grad_norm": 2.129880905151367, + "learning_rate": 4.368035738551434e-05, + "loss": 3.4758, + "step": 66340 + }, + { + "epoch": 4.507745617611088, + "grad_norm": 0.7710688710212708, + "learning_rate": 4.367611088463107e-05, + "loss": 4.0811, + "step": 66345 + }, + { + "epoch": 4.5080853376817505, + "grad_norm": 0.1700017750263214, + "learning_rate": 4.3671864383747795e-05, + "loss": 3.7731, + "step": 66350 + }, + { + "epoch": 4.508425057752412, + "grad_norm": 0.17708295583724976, + "learning_rate": 4.366761788286452e-05, + "loss": 3.849, + "step": 66355 + }, + { + "epoch": 4.508764777823074, + "grad_norm": 0.1428188532590866, + "learning_rate": 4.366337138198125e-05, + "loss": 3.8754, + "step": 66360 + }, + { + "epoch": 4.509104497893736, + "grad_norm": 0.17910178005695343, + "learning_rate": 4.365912488109798e-05, + "loss": 4.1015, + "step": 66365 + }, + { + "epoch": 4.509444217964397, + "grad_norm": 0.21361130475997925, + "learning_rate": 4.36548783802147e-05, + "loss": 3.8771, + "step": 66370 + }, + { + "epoch": 4.509783938035059, + "grad_norm": 0.14907681941986084, + "learning_rate": 4.3650631879331435e-05, + "loss": 3.6274, + "step": 66375 + }, + { + "epoch": 4.510123658105721, + "grad_norm": 0.16573791205883026, + "learning_rate": 4.364638537844816e-05, + "loss": 3.9395, + "step": 66380 + }, + { + "epoch": 4.510463378176382, + "grad_norm": 0.31108054518699646, + "learning_rate": 4.3642138877564884e-05, + "loss": 4.04, + "step": 66385 + }, + { + "epoch": 4.510803098247044, + "grad_norm": 4.884077548980713, + "learning_rate": 4.363789237668162e-05, + "loss": 3.8892, + "step": 66390 + }, + { + "epoch": 4.5111428183177065, + "grad_norm": 0.15047478675842285, + "learning_rate": 4.363364587579835e-05, + "loss": 3.6691, + "step": 66395 + }, + { + "epoch": 4.511482538388368, + "grad_norm": 0.23479796946048737, + "learning_rate": 4.362939937491507e-05, + "loss": 3.6977, + "step": 66400 + }, + { + "epoch": 4.51182225845903, + "grad_norm": 0.20352335274219513, + "learning_rate": 4.3625152874031796e-05, + "loss": 3.9957, + "step": 66405 + }, + { + "epoch": 4.512161978529692, + "grad_norm": 0.25108522176742554, + "learning_rate": 4.362090637314853e-05, + "loss": 3.7844, + "step": 66410 + }, + { + "epoch": 4.512501698600353, + "grad_norm": 0.15113617479801178, + "learning_rate": 4.361665987226525e-05, + "loss": 3.7594, + "step": 66415 + }, + { + "epoch": 4.512841418671015, + "grad_norm": 0.17836464941501617, + "learning_rate": 4.361241337138198e-05, + "loss": 3.8132, + "step": 66420 + }, + { + "epoch": 4.513181138741677, + "grad_norm": 0.16560591757297516, + "learning_rate": 4.3608166870498715e-05, + "loss": 3.8256, + "step": 66425 + }, + { + "epoch": 4.513520858812338, + "grad_norm": 0.3931291103363037, + "learning_rate": 4.3603920369615436e-05, + "loss": 3.6417, + "step": 66430 + }, + { + "epoch": 4.513860578883, + "grad_norm": 0.18201929330825806, + "learning_rate": 4.3599673868732164e-05, + "loss": 3.924, + "step": 66435 + }, + { + "epoch": 4.5142002989536625, + "grad_norm": 0.17822378873825073, + "learning_rate": 4.359542736784889e-05, + "loss": 3.9345, + "step": 66440 + }, + { + "epoch": 4.514540019024324, + "grad_norm": 0.2000465989112854, + "learning_rate": 4.359118086696562e-05, + "loss": 3.8305, + "step": 66445 + }, + { + "epoch": 4.514879739094986, + "grad_norm": 0.162429079413414, + "learning_rate": 4.358693436608235e-05, + "loss": 3.8468, + "step": 66450 + }, + { + "epoch": 4.515219459165648, + "grad_norm": 0.17170092463493347, + "learning_rate": 4.3582687865199076e-05, + "loss": 3.6423, + "step": 66455 + }, + { + "epoch": 4.515559179236309, + "grad_norm": 0.24091225862503052, + "learning_rate": 4.357844136431581e-05, + "loss": 3.7484, + "step": 66460 + }, + { + "epoch": 4.515898899306971, + "grad_norm": 0.1623532921075821, + "learning_rate": 4.357419486343253e-05, + "loss": 3.8594, + "step": 66465 + }, + { + "epoch": 4.516238619377633, + "grad_norm": 0.36337751150131226, + "learning_rate": 4.356994836254926e-05, + "loss": 3.7998, + "step": 66470 + }, + { + "epoch": 4.516578339448294, + "grad_norm": 0.3967737853527069, + "learning_rate": 4.3565701861665995e-05, + "loss": 3.67, + "step": 66475 + }, + { + "epoch": 4.516918059518956, + "grad_norm": 0.6891553997993469, + "learning_rate": 4.3561455360782716e-05, + "loss": 3.5128, + "step": 66480 + }, + { + "epoch": 4.5172577795896185, + "grad_norm": 0.17395681142807007, + "learning_rate": 4.3557208859899444e-05, + "loss": 3.917, + "step": 66485 + }, + { + "epoch": 4.51759749966028, + "grad_norm": 0.16237765550613403, + "learning_rate": 4.355296235901617e-05, + "loss": 3.8243, + "step": 66490 + }, + { + "epoch": 4.517937219730942, + "grad_norm": 0.6028052568435669, + "learning_rate": 4.35487158581329e-05, + "loss": 3.8898, + "step": 66495 + }, + { + "epoch": 4.518276939801604, + "grad_norm": 0.19467605650424957, + "learning_rate": 4.354446935724963e-05, + "loss": 3.8562, + "step": 66500 + }, + { + "epoch": 4.518616659872265, + "grad_norm": 2.3942320346832275, + "learning_rate": 4.3540222856366356e-05, + "loss": 3.7629, + "step": 66505 + }, + { + "epoch": 4.518956379942927, + "grad_norm": 0.2700802981853485, + "learning_rate": 4.3535976355483084e-05, + "loss": 3.8326, + "step": 66510 + }, + { + "epoch": 4.519296100013589, + "grad_norm": 0.20327623188495636, + "learning_rate": 4.353172985459981e-05, + "loss": 3.7374, + "step": 66515 + }, + { + "epoch": 4.51963582008425, + "grad_norm": 0.15633021295070648, + "learning_rate": 4.352748335371654e-05, + "loss": 3.8999, + "step": 66520 + }, + { + "epoch": 4.519975540154912, + "grad_norm": 0.17158165574073792, + "learning_rate": 4.352323685283327e-05, + "loss": 3.9506, + "step": 66525 + }, + { + "epoch": 4.5203152602255745, + "grad_norm": 0.7019169330596924, + "learning_rate": 4.3518990351949996e-05, + "loss": 3.8317, + "step": 66530 + }, + { + "epoch": 4.520654980296236, + "grad_norm": 0.13677150011062622, + "learning_rate": 4.3514743851066724e-05, + "loss": 3.7593, + "step": 66535 + }, + { + "epoch": 4.520994700366898, + "grad_norm": 0.16027306020259857, + "learning_rate": 4.3510497350183446e-05, + "loss": 3.9211, + "step": 66540 + }, + { + "epoch": 4.52133442043756, + "grad_norm": 0.16278758645057678, + "learning_rate": 4.350625084930018e-05, + "loss": 3.8111, + "step": 66545 + }, + { + "epoch": 4.521674140508221, + "grad_norm": 0.16683721542358398, + "learning_rate": 4.350200434841691e-05, + "loss": 3.9559, + "step": 66550 + }, + { + "epoch": 4.522013860578883, + "grad_norm": 0.18657737970352173, + "learning_rate": 4.349775784753363e-05, + "loss": 3.7563, + "step": 66555 + }, + { + "epoch": 4.522353580649545, + "grad_norm": 0.4317859411239624, + "learning_rate": 4.3493511346650364e-05, + "loss": 3.779, + "step": 66560 + }, + { + "epoch": 4.522693300720206, + "grad_norm": 0.15197806060314178, + "learning_rate": 4.348926484576709e-05, + "loss": 3.7325, + "step": 66565 + }, + { + "epoch": 4.523033020790868, + "grad_norm": 0.1577656865119934, + "learning_rate": 4.3485018344883814e-05, + "loss": 4.1096, + "step": 66570 + }, + { + "epoch": 4.5233727408615305, + "grad_norm": 1.2244980335235596, + "learning_rate": 4.348077184400055e-05, + "loss": 3.6926, + "step": 66575 + }, + { + "epoch": 4.523712460932192, + "grad_norm": 0.14851614832878113, + "learning_rate": 4.3476525343117276e-05, + "loss": 3.8113, + "step": 66580 + }, + { + "epoch": 4.524052181002854, + "grad_norm": 0.1837940365076065, + "learning_rate": 4.3472278842234e-05, + "loss": 3.9117, + "step": 66585 + }, + { + "epoch": 4.524391901073516, + "grad_norm": 0.16731196641921997, + "learning_rate": 4.3468032341350726e-05, + "loss": 3.9304, + "step": 66590 + }, + { + "epoch": 4.524731621144177, + "grad_norm": 0.13774411380290985, + "learning_rate": 4.346378584046746e-05, + "loss": 3.6865, + "step": 66595 + }, + { + "epoch": 4.525071341214839, + "grad_norm": 0.4680967628955841, + "learning_rate": 4.345953933958418e-05, + "loss": 3.9518, + "step": 66600 + }, + { + "epoch": 4.525411061285501, + "grad_norm": 0.20024672150611877, + "learning_rate": 4.345529283870091e-05, + "loss": 3.909, + "step": 66605 + }, + { + "epoch": 4.525750781356162, + "grad_norm": 0.2099156379699707, + "learning_rate": 4.3451046337817645e-05, + "loss": 3.7762, + "step": 66610 + }, + { + "epoch": 4.5260905014268245, + "grad_norm": 0.1780921220779419, + "learning_rate": 4.3446799836934366e-05, + "loss": 3.7776, + "step": 66615 + }, + { + "epoch": 4.5264302214974865, + "grad_norm": 0.16867947578430176, + "learning_rate": 4.3442553336051094e-05, + "loss": 3.6425, + "step": 66620 + }, + { + "epoch": 4.526769941568148, + "grad_norm": 0.14317825436592102, + "learning_rate": 4.343830683516782e-05, + "loss": 3.6866, + "step": 66625 + }, + { + "epoch": 4.52710966163881, + "grad_norm": 0.2319672852754593, + "learning_rate": 4.3434060334284557e-05, + "loss": 3.7898, + "step": 66630 + }, + { + "epoch": 4.527449381709472, + "grad_norm": 0.16300535202026367, + "learning_rate": 4.342981383340128e-05, + "loss": 3.8485, + "step": 66635 + }, + { + "epoch": 4.527789101780133, + "grad_norm": 0.15184389054775238, + "learning_rate": 4.3425567332518006e-05, + "loss": 3.7326, + "step": 66640 + }, + { + "epoch": 4.528128821850795, + "grad_norm": 0.15866106748580933, + "learning_rate": 4.342132083163474e-05, + "loss": 3.88, + "step": 66645 + }, + { + "epoch": 4.528468541921457, + "grad_norm": 0.1671493947505951, + "learning_rate": 4.341707433075146e-05, + "loss": 3.882, + "step": 66650 + }, + { + "epoch": 4.528808261992118, + "grad_norm": 0.1364038735628128, + "learning_rate": 4.341282782986819e-05, + "loss": 3.8518, + "step": 66655 + }, + { + "epoch": 4.5291479820627805, + "grad_norm": 0.13510963320732117, + "learning_rate": 4.340858132898492e-05, + "loss": 3.7349, + "step": 66660 + }, + { + "epoch": 4.5294877021334425, + "grad_norm": 0.1733224242925644, + "learning_rate": 4.3404334828101646e-05, + "loss": 3.8481, + "step": 66665 + }, + { + "epoch": 4.529827422204104, + "grad_norm": 0.17901065945625305, + "learning_rate": 4.3400088327218374e-05, + "loss": 3.93, + "step": 66670 + }, + { + "epoch": 4.530167142274766, + "grad_norm": 0.191208615899086, + "learning_rate": 4.33958418263351e-05, + "loss": 4.0092, + "step": 66675 + }, + { + "epoch": 4.530506862345427, + "grad_norm": 0.18844181299209595, + "learning_rate": 4.339159532545183e-05, + "loss": 4.0851, + "step": 66680 + }, + { + "epoch": 4.530846582416089, + "grad_norm": 0.15238583087921143, + "learning_rate": 4.338734882456856e-05, + "loss": 3.856, + "step": 66685 + }, + { + "epoch": 4.531186302486751, + "grad_norm": 0.18412385880947113, + "learning_rate": 4.3383102323685286e-05, + "loss": 3.7445, + "step": 66690 + }, + { + "epoch": 4.531526022557412, + "grad_norm": 0.19655081629753113, + "learning_rate": 4.3378855822802014e-05, + "loss": 3.7734, + "step": 66695 + }, + { + "epoch": 4.531865742628074, + "grad_norm": 0.14235015213489532, + "learning_rate": 4.337460932191874e-05, + "loss": 3.7648, + "step": 66700 + }, + { + "epoch": 4.5322054626987365, + "grad_norm": 0.22430703043937683, + "learning_rate": 4.337036282103547e-05, + "loss": 3.8329, + "step": 66705 + }, + { + "epoch": 4.532545182769398, + "grad_norm": 0.18646112084388733, + "learning_rate": 4.33661163201522e-05, + "loss": 3.9617, + "step": 66710 + }, + { + "epoch": 4.53288490284006, + "grad_norm": 0.2381315529346466, + "learning_rate": 4.3361869819268926e-05, + "loss": 3.7743, + "step": 66715 + }, + { + "epoch": 4.533224622910722, + "grad_norm": 0.2173253893852234, + "learning_rate": 4.3357623318385654e-05, + "loss": 4.0372, + "step": 66720 + }, + { + "epoch": 4.533564342981383, + "grad_norm": 0.20885716378688812, + "learning_rate": 4.3353376817502375e-05, + "loss": 3.8811, + "step": 66725 + }, + { + "epoch": 4.533904063052045, + "grad_norm": 0.21366645395755768, + "learning_rate": 4.334913031661911e-05, + "loss": 3.769, + "step": 66730 + }, + { + "epoch": 4.534243783122707, + "grad_norm": 0.15482962131500244, + "learning_rate": 4.334488381573584e-05, + "loss": 3.7441, + "step": 66735 + }, + { + "epoch": 4.534583503193368, + "grad_norm": 0.1516907662153244, + "learning_rate": 4.334063731485256e-05, + "loss": 3.9514, + "step": 66740 + }, + { + "epoch": 4.53492322326403, + "grad_norm": 0.16419488191604614, + "learning_rate": 4.3336390813969294e-05, + "loss": 4.0415, + "step": 66745 + }, + { + "epoch": 4.5352629433346925, + "grad_norm": 1.6460779905319214, + "learning_rate": 4.333214431308602e-05, + "loss": 3.8857, + "step": 66750 + }, + { + "epoch": 4.535602663405354, + "grad_norm": 0.1684107780456543, + "learning_rate": 4.332789781220274e-05, + "loss": 3.7018, + "step": 66755 + }, + { + "epoch": 4.535942383476016, + "grad_norm": 0.13829496502876282, + "learning_rate": 4.332365131131947e-05, + "loss": 3.876, + "step": 66760 + }, + { + "epoch": 4.536282103546678, + "grad_norm": 0.32694563269615173, + "learning_rate": 4.3319404810436206e-05, + "loss": 3.9454, + "step": 66765 + }, + { + "epoch": 4.536621823617339, + "grad_norm": 0.17074470221996307, + "learning_rate": 4.331515830955293e-05, + "loss": 3.6554, + "step": 66770 + }, + { + "epoch": 4.536961543688001, + "grad_norm": 0.18677549064159393, + "learning_rate": 4.3310911808669655e-05, + "loss": 3.98, + "step": 66775 + }, + { + "epoch": 4.537301263758663, + "grad_norm": 0.23219315707683563, + "learning_rate": 4.330666530778639e-05, + "loss": 3.5796, + "step": 66780 + }, + { + "epoch": 4.537640983829324, + "grad_norm": 0.1941671222448349, + "learning_rate": 4.330241880690311e-05, + "loss": 3.9347, + "step": 66785 + }, + { + "epoch": 4.537980703899986, + "grad_norm": 0.19383281469345093, + "learning_rate": 4.329817230601984e-05, + "loss": 3.8766, + "step": 66790 + }, + { + "epoch": 4.5383204239706485, + "grad_norm": 0.2025817185640335, + "learning_rate": 4.329392580513657e-05, + "loss": 3.9959, + "step": 66795 + }, + { + "epoch": 4.53866014404131, + "grad_norm": 0.17378726601600647, + "learning_rate": 4.32896793042533e-05, + "loss": 3.9348, + "step": 66800 + }, + { + "epoch": 4.538999864111972, + "grad_norm": 0.19569675624370575, + "learning_rate": 4.328543280337002e-05, + "loss": 3.9323, + "step": 66805 + }, + { + "epoch": 4.539339584182634, + "grad_norm": 0.153779074549675, + "learning_rate": 4.328118630248675e-05, + "loss": 4.1874, + "step": 66810 + }, + { + "epoch": 4.539679304253295, + "grad_norm": 0.19938898086547852, + "learning_rate": 4.3276939801603486e-05, + "loss": 3.9999, + "step": 66815 + }, + { + "epoch": 4.540019024323957, + "grad_norm": 0.33270880579948425, + "learning_rate": 4.327269330072021e-05, + "loss": 3.8587, + "step": 66820 + }, + { + "epoch": 4.540358744394619, + "grad_norm": 0.15677425265312195, + "learning_rate": 4.3268446799836935e-05, + "loss": 3.933, + "step": 66825 + }, + { + "epoch": 4.54069846446528, + "grad_norm": 0.14124439656734467, + "learning_rate": 4.326420029895366e-05, + "loss": 3.7747, + "step": 66830 + }, + { + "epoch": 4.541038184535942, + "grad_norm": 0.20131509006023407, + "learning_rate": 4.325995379807039e-05, + "loss": 3.856, + "step": 66835 + }, + { + "epoch": 4.5413779046066045, + "grad_norm": 0.6258596181869507, + "learning_rate": 4.325570729718712e-05, + "loss": 3.7073, + "step": 66840 + }, + { + "epoch": 4.541717624677266, + "grad_norm": 0.19011451303958893, + "learning_rate": 4.325146079630385e-05, + "loss": 3.8285, + "step": 66845 + }, + { + "epoch": 4.542057344747928, + "grad_norm": 0.17629219591617584, + "learning_rate": 4.3247214295420575e-05, + "loss": 3.942, + "step": 66850 + }, + { + "epoch": 4.54239706481859, + "grad_norm": 0.17037361860275269, + "learning_rate": 4.32429677945373e-05, + "loss": 3.7767, + "step": 66855 + }, + { + "epoch": 4.542736784889251, + "grad_norm": 0.13894705474376678, + "learning_rate": 4.323872129365403e-05, + "loss": 3.843, + "step": 66860 + }, + { + "epoch": 4.543076504959913, + "grad_norm": 0.20245319604873657, + "learning_rate": 4.323447479277076e-05, + "loss": 3.9511, + "step": 66865 + }, + { + "epoch": 4.543416225030575, + "grad_norm": 0.22549490630626678, + "learning_rate": 4.323022829188749e-05, + "loss": 3.6948, + "step": 66870 + }, + { + "epoch": 4.543755945101236, + "grad_norm": 0.15884602069854736, + "learning_rate": 4.3225981791004215e-05, + "loss": 3.9448, + "step": 66875 + }, + { + "epoch": 4.5440956651718984, + "grad_norm": 0.3254254162311554, + "learning_rate": 4.322173529012094e-05, + "loss": 3.9116, + "step": 66880 + }, + { + "epoch": 4.54443538524256, + "grad_norm": 0.21904847025871277, + "learning_rate": 4.321748878923767e-05, + "loss": 3.8225, + "step": 66885 + }, + { + "epoch": 4.544775105313222, + "grad_norm": 0.20074228942394257, + "learning_rate": 4.32132422883544e-05, + "loss": 3.6956, + "step": 66890 + }, + { + "epoch": 4.545114825383884, + "grad_norm": 0.24060900509357452, + "learning_rate": 4.320899578747112e-05, + "loss": 3.8558, + "step": 66895 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.1897147297859192, + "learning_rate": 4.3204749286587855e-05, + "loss": 3.7321, + "step": 66900 + }, + { + "epoch": 4.545794265525207, + "grad_norm": 0.23603686690330505, + "learning_rate": 4.3200502785704583e-05, + "loss": 3.7661, + "step": 66905 + }, + { + "epoch": 4.546133985595869, + "grad_norm": 0.32463082671165466, + "learning_rate": 4.3196256284821305e-05, + "loss": 3.9739, + "step": 66910 + }, + { + "epoch": 4.54647370566653, + "grad_norm": 0.13173942267894745, + "learning_rate": 4.319200978393804e-05, + "loss": 3.9518, + "step": 66915 + }, + { + "epoch": 4.546813425737192, + "grad_norm": 0.30086782574653625, + "learning_rate": 4.318776328305477e-05, + "loss": 3.8539, + "step": 66920 + }, + { + "epoch": 4.5471531458078545, + "grad_norm": 0.18376000225543976, + "learning_rate": 4.318351678217149e-05, + "loss": 3.9103, + "step": 66925 + }, + { + "epoch": 4.547492865878516, + "grad_norm": 0.282317578792572, + "learning_rate": 4.317927028128822e-05, + "loss": 3.9778, + "step": 66930 + }, + { + "epoch": 4.547832585949178, + "grad_norm": 0.15512804687023163, + "learning_rate": 4.317502378040495e-05, + "loss": 3.633, + "step": 66935 + }, + { + "epoch": 4.54817230601984, + "grad_norm": 0.937390923500061, + "learning_rate": 4.317077727952167e-05, + "loss": 3.6163, + "step": 66940 + }, + { + "epoch": 4.548512026090501, + "grad_norm": 0.26532894372940063, + "learning_rate": 4.31665307786384e-05, + "loss": 3.9081, + "step": 66945 + }, + { + "epoch": 4.548851746161163, + "grad_norm": 0.19369056820869446, + "learning_rate": 4.3162284277755135e-05, + "loss": 4.1301, + "step": 66950 + }, + { + "epoch": 4.549191466231825, + "grad_norm": 0.19918641448020935, + "learning_rate": 4.315803777687186e-05, + "loss": 4.007, + "step": 66955 + }, + { + "epoch": 4.549531186302486, + "grad_norm": 0.19149282574653625, + "learning_rate": 4.3153791275988585e-05, + "loss": 3.8506, + "step": 66960 + }, + { + "epoch": 4.549870906373148, + "grad_norm": 0.20846086740493774, + "learning_rate": 4.314954477510531e-05, + "loss": 3.7459, + "step": 66965 + }, + { + "epoch": 4.5502106264438105, + "grad_norm": 0.18673259019851685, + "learning_rate": 4.314529827422205e-05, + "loss": 3.7312, + "step": 66970 + }, + { + "epoch": 4.550550346514472, + "grad_norm": 0.1901957094669342, + "learning_rate": 4.314105177333877e-05, + "loss": 3.6661, + "step": 66975 + }, + { + "epoch": 4.550890066585134, + "grad_norm": 0.7560785412788391, + "learning_rate": 4.31368052724555e-05, + "loss": 3.7128, + "step": 66980 + }, + { + "epoch": 4.551229786655796, + "grad_norm": 0.1597515493631363, + "learning_rate": 4.313255877157223e-05, + "loss": 3.8642, + "step": 66985 + }, + { + "epoch": 4.551569506726457, + "grad_norm": 0.19493375718593597, + "learning_rate": 4.312831227068895e-05, + "loss": 3.8472, + "step": 66990 + }, + { + "epoch": 4.551909226797119, + "grad_norm": 0.1842745840549469, + "learning_rate": 4.312406576980568e-05, + "loss": 4.0267, + "step": 66995 + }, + { + "epoch": 4.552248946867781, + "grad_norm": 0.19605843722820282, + "learning_rate": 4.3119819268922416e-05, + "loss": 3.6644, + "step": 67000 + }, + { + "epoch": 4.552588666938442, + "grad_norm": 1.7521640062332153, + "learning_rate": 4.311557276803914e-05, + "loss": 3.7276, + "step": 67005 + }, + { + "epoch": 4.552928387009104, + "grad_norm": 0.1659940630197525, + "learning_rate": 4.3111326267155865e-05, + "loss": 3.7173, + "step": 67010 + }, + { + "epoch": 4.5532681070797665, + "grad_norm": 0.15878458321094513, + "learning_rate": 4.310707976627259e-05, + "loss": 3.8424, + "step": 67015 + }, + { + "epoch": 4.553607827150428, + "grad_norm": 0.2545137107372284, + "learning_rate": 4.310283326538932e-05, + "loss": 3.7263, + "step": 67020 + }, + { + "epoch": 4.55394754722109, + "grad_norm": 0.15717390179634094, + "learning_rate": 4.309858676450605e-05, + "loss": 3.8828, + "step": 67025 + }, + { + "epoch": 4.554287267291752, + "grad_norm": 0.17399102449417114, + "learning_rate": 4.309434026362278e-05, + "loss": 3.8932, + "step": 67030 + }, + { + "epoch": 4.554626987362413, + "grad_norm": 0.17631858587265015, + "learning_rate": 4.3090093762739505e-05, + "loss": 3.6967, + "step": 67035 + }, + { + "epoch": 4.554966707433075, + "grad_norm": 0.1541067361831665, + "learning_rate": 4.308584726185623e-05, + "loss": 3.9049, + "step": 67040 + }, + { + "epoch": 4.555306427503737, + "grad_norm": 1.1298143863677979, + "learning_rate": 4.308160076097296e-05, + "loss": 3.8267, + "step": 67045 + }, + { + "epoch": 4.555646147574398, + "grad_norm": 0.8288149237632751, + "learning_rate": 4.307735426008969e-05, + "loss": 3.6573, + "step": 67050 + }, + { + "epoch": 4.55598586764506, + "grad_norm": 0.1785118281841278, + "learning_rate": 4.307310775920642e-05, + "loss": 3.8427, + "step": 67055 + }, + { + "epoch": 4.5563255877157225, + "grad_norm": 0.17020677030086517, + "learning_rate": 4.3068861258323145e-05, + "loss": 3.9212, + "step": 67060 + }, + { + "epoch": 4.556665307786384, + "grad_norm": 0.26098281145095825, + "learning_rate": 4.3064614757439866e-05, + "loss": 4.0612, + "step": 67065 + }, + { + "epoch": 4.557005027857046, + "grad_norm": 0.18334205448627472, + "learning_rate": 4.30603682565566e-05, + "loss": 3.8692, + "step": 67070 + }, + { + "epoch": 4.557344747927708, + "grad_norm": 0.16985784471035004, + "learning_rate": 4.305612175567333e-05, + "loss": 4.0327, + "step": 67075 + }, + { + "epoch": 4.557684467998369, + "grad_norm": 0.34007030725479126, + "learning_rate": 4.305187525479005e-05, + "loss": 3.7898, + "step": 67080 + }, + { + "epoch": 4.558024188069031, + "grad_norm": 0.2728826105594635, + "learning_rate": 4.3047628753906785e-05, + "loss": 4.0313, + "step": 67085 + }, + { + "epoch": 4.558363908139693, + "grad_norm": 0.3200872242450714, + "learning_rate": 4.304338225302351e-05, + "loss": 3.8657, + "step": 67090 + }, + { + "epoch": 4.558703628210354, + "grad_norm": 0.1464628428220749, + "learning_rate": 4.3039135752140234e-05, + "loss": 3.8486, + "step": 67095 + }, + { + "epoch": 4.559043348281016, + "grad_norm": 0.20587748289108276, + "learning_rate": 4.303488925125697e-05, + "loss": 3.6284, + "step": 67100 + }, + { + "epoch": 4.5593830683516785, + "grad_norm": 0.19044819474220276, + "learning_rate": 4.30306427503737e-05, + "loss": 3.9984, + "step": 67105 + }, + { + "epoch": 4.55972278842234, + "grad_norm": 0.1958109587430954, + "learning_rate": 4.302639624949042e-05, + "loss": 3.7903, + "step": 67110 + }, + { + "epoch": 4.560062508493002, + "grad_norm": 0.16120518743991852, + "learning_rate": 4.3022149748607146e-05, + "loss": 3.7524, + "step": 67115 + }, + { + "epoch": 4.560402228563664, + "grad_norm": 0.18519896268844604, + "learning_rate": 4.301790324772388e-05, + "loss": 3.8977, + "step": 67120 + }, + { + "epoch": 4.560741948634325, + "grad_norm": 0.18859973549842834, + "learning_rate": 4.30136567468406e-05, + "loss": 3.6697, + "step": 67125 + }, + { + "epoch": 4.561081668704987, + "grad_norm": 0.1427718847990036, + "learning_rate": 4.300941024595733e-05, + "loss": 3.8283, + "step": 67130 + }, + { + "epoch": 4.561421388775649, + "grad_norm": 1.3711402416229248, + "learning_rate": 4.3005163745074065e-05, + "loss": 3.6503, + "step": 67135 + }, + { + "epoch": 4.56176110884631, + "grad_norm": 0.14689849317073822, + "learning_rate": 4.300091724419079e-05, + "loss": 3.9039, + "step": 67140 + }, + { + "epoch": 4.562100828916972, + "grad_norm": 0.1810089498758316, + "learning_rate": 4.2996670743307514e-05, + "loss": 3.8826, + "step": 67145 + }, + { + "epoch": 4.5624405489876345, + "grad_norm": 0.18959300220012665, + "learning_rate": 4.299242424242424e-05, + "loss": 3.8638, + "step": 67150 + }, + { + "epoch": 4.562780269058296, + "grad_norm": 0.1585749089717865, + "learning_rate": 4.298817774154098e-05, + "loss": 3.8583, + "step": 67155 + }, + { + "epoch": 4.563119989128958, + "grad_norm": 0.21467354893684387, + "learning_rate": 4.29839312406577e-05, + "loss": 3.8389, + "step": 67160 + }, + { + "epoch": 4.56345970919962, + "grad_norm": 0.20664116740226746, + "learning_rate": 4.2979684739774426e-05, + "loss": 3.8748, + "step": 67165 + }, + { + "epoch": 4.563799429270281, + "grad_norm": 0.14842858910560608, + "learning_rate": 4.297543823889116e-05, + "loss": 3.9123, + "step": 67170 + }, + { + "epoch": 4.564139149340943, + "grad_norm": 0.16339023411273956, + "learning_rate": 4.297119173800788e-05, + "loss": 3.8349, + "step": 67175 + }, + { + "epoch": 4.564478869411605, + "grad_norm": 0.6594675183296204, + "learning_rate": 4.296694523712461e-05, + "loss": 3.8237, + "step": 67180 + }, + { + "epoch": 4.564818589482266, + "grad_norm": 0.23580250144004822, + "learning_rate": 4.296269873624134e-05, + "loss": 3.5102, + "step": 67185 + }, + { + "epoch": 4.5651583095529285, + "grad_norm": 0.23885726928710938, + "learning_rate": 4.2958452235358066e-05, + "loss": 3.9473, + "step": 67190 + }, + { + "epoch": 4.5654980296235905, + "grad_norm": 0.18057678639888763, + "learning_rate": 4.2954205734474794e-05, + "loss": 3.7463, + "step": 67195 + }, + { + "epoch": 4.565837749694252, + "grad_norm": 0.22671443223953247, + "learning_rate": 4.294995923359152e-05, + "loss": 3.8264, + "step": 67200 + }, + { + "epoch": 4.566177469764914, + "grad_norm": 0.19832739233970642, + "learning_rate": 4.294571273270825e-05, + "loss": 3.9382, + "step": 67205 + }, + { + "epoch": 4.566517189835576, + "grad_norm": 0.19254615902900696, + "learning_rate": 4.294146623182498e-05, + "loss": 3.8521, + "step": 67210 + }, + { + "epoch": 4.566856909906237, + "grad_norm": 0.1657274216413498, + "learning_rate": 4.2937219730941706e-05, + "loss": 3.5752, + "step": 67215 + }, + { + "epoch": 4.567196629976899, + "grad_norm": 0.8950977921485901, + "learning_rate": 4.2932973230058434e-05, + "loss": 3.6184, + "step": 67220 + }, + { + "epoch": 4.567536350047561, + "grad_norm": 0.17828279733657837, + "learning_rate": 4.292872672917516e-05, + "loss": 3.7127, + "step": 67225 + }, + { + "epoch": 4.567876070118222, + "grad_norm": 0.1761537492275238, + "learning_rate": 4.292448022829189e-05, + "loss": 3.5973, + "step": 67230 + }, + { + "epoch": 4.5682157901888845, + "grad_norm": 0.14465920627117157, + "learning_rate": 4.292023372740862e-05, + "loss": 4.0969, + "step": 67235 + }, + { + "epoch": 4.5685555102595465, + "grad_norm": 5.371665954589844, + "learning_rate": 4.2915987226525346e-05, + "loss": 3.8481, + "step": 67240 + }, + { + "epoch": 4.568895230330208, + "grad_norm": 0.12768326699733734, + "learning_rate": 4.2911740725642074e-05, + "loss": 3.8481, + "step": 67245 + }, + { + "epoch": 4.56923495040087, + "grad_norm": 0.218317911028862, + "learning_rate": 4.2907494224758796e-05, + "loss": 4.0031, + "step": 67250 + }, + { + "epoch": 4.569574670471532, + "grad_norm": 0.16858653724193573, + "learning_rate": 4.290324772387553e-05, + "loss": 3.6665, + "step": 67255 + }, + { + "epoch": 4.569914390542193, + "grad_norm": 0.19297415018081665, + "learning_rate": 4.289900122299226e-05, + "loss": 3.8217, + "step": 67260 + }, + { + "epoch": 4.570254110612855, + "grad_norm": 0.19859309494495392, + "learning_rate": 4.289475472210898e-05, + "loss": 3.7222, + "step": 67265 + }, + { + "epoch": 4.570593830683517, + "grad_norm": 0.32772985100746155, + "learning_rate": 4.2890508221225714e-05, + "loss": 3.6829, + "step": 67270 + }, + { + "epoch": 4.570933550754178, + "grad_norm": 0.17757856845855713, + "learning_rate": 4.288626172034244e-05, + "loss": 3.736, + "step": 67275 + }, + { + "epoch": 4.5712732708248405, + "grad_norm": 0.13106317818164825, + "learning_rate": 4.2882015219459164e-05, + "loss": 3.5828, + "step": 67280 + }, + { + "epoch": 4.5716129908955025, + "grad_norm": 0.2104351669549942, + "learning_rate": 4.287776871857589e-05, + "loss": 3.7252, + "step": 67285 + }, + { + "epoch": 4.571952710966164, + "grad_norm": 0.16991277039051056, + "learning_rate": 4.2873522217692626e-05, + "loss": 3.9338, + "step": 67290 + }, + { + "epoch": 4.572292431036826, + "grad_norm": 0.20952656865119934, + "learning_rate": 4.286927571680935e-05, + "loss": 3.6011, + "step": 67295 + }, + { + "epoch": 4.572632151107488, + "grad_norm": 0.21609662473201752, + "learning_rate": 4.2865029215926076e-05, + "loss": 4.0276, + "step": 67300 + }, + { + "epoch": 4.572971871178149, + "grad_norm": 0.24324767291545868, + "learning_rate": 4.286078271504281e-05, + "loss": 3.6485, + "step": 67305 + }, + { + "epoch": 4.573311591248811, + "grad_norm": 0.23018811643123627, + "learning_rate": 4.285653621415954e-05, + "loss": 4.1475, + "step": 67310 + }, + { + "epoch": 4.573651311319473, + "grad_norm": 0.15722854435443878, + "learning_rate": 4.285228971327626e-05, + "loss": 3.9107, + "step": 67315 + }, + { + "epoch": 4.573991031390134, + "grad_norm": 0.16119833290576935, + "learning_rate": 4.284804321239299e-05, + "loss": 3.795, + "step": 67320 + }, + { + "epoch": 4.5743307514607965, + "grad_norm": 0.22534950077533722, + "learning_rate": 4.284379671150972e-05, + "loss": 3.8577, + "step": 67325 + }, + { + "epoch": 4.574670471531459, + "grad_norm": 0.15914112329483032, + "learning_rate": 4.2839550210626444e-05, + "loss": 3.8235, + "step": 67330 + }, + { + "epoch": 4.57501019160212, + "grad_norm": 0.17644886672496796, + "learning_rate": 4.283530370974317e-05, + "loss": 3.9156, + "step": 67335 + }, + { + "epoch": 4.575349911672782, + "grad_norm": 0.13233159482479095, + "learning_rate": 4.2831057208859907e-05, + "loss": 3.3933, + "step": 67340 + }, + { + "epoch": 4.575689631743444, + "grad_norm": 0.1643592119216919, + "learning_rate": 4.282681070797663e-05, + "loss": 3.888, + "step": 67345 + }, + { + "epoch": 4.576029351814105, + "grad_norm": 0.18105170130729675, + "learning_rate": 4.2822564207093356e-05, + "loss": 3.8351, + "step": 67350 + }, + { + "epoch": 4.576369071884767, + "grad_norm": 0.20120655000209808, + "learning_rate": 4.2818317706210084e-05, + "loss": 4.0264, + "step": 67355 + }, + { + "epoch": 4.576708791955428, + "grad_norm": 0.19715102016925812, + "learning_rate": 4.281407120532681e-05, + "loss": 3.8802, + "step": 67360 + }, + { + "epoch": 4.57704851202609, + "grad_norm": 0.15299047529697418, + "learning_rate": 4.280982470444354e-05, + "loss": 3.8366, + "step": 67365 + }, + { + "epoch": 4.5773882320967525, + "grad_norm": 0.20394769310951233, + "learning_rate": 4.280557820356027e-05, + "loss": 3.7477, + "step": 67370 + }, + { + "epoch": 4.577727952167414, + "grad_norm": 0.3406042158603668, + "learning_rate": 4.2801331702676996e-05, + "loss": 3.8166, + "step": 67375 + }, + { + "epoch": 4.578067672238076, + "grad_norm": 0.3086174428462982, + "learning_rate": 4.2797085201793724e-05, + "loss": 3.9742, + "step": 67380 + }, + { + "epoch": 4.578407392308738, + "grad_norm": 0.18105478584766388, + "learning_rate": 4.279283870091045e-05, + "loss": 3.9812, + "step": 67385 + }, + { + "epoch": 4.578747112379399, + "grad_norm": 0.18846826255321503, + "learning_rate": 4.278859220002718e-05, + "loss": 3.7943, + "step": 67390 + }, + { + "epoch": 4.579086832450061, + "grad_norm": 0.13758453726768494, + "learning_rate": 4.278434569914391e-05, + "loss": 3.8821, + "step": 67395 + }, + { + "epoch": 4.579426552520723, + "grad_norm": 0.3711819648742676, + "learning_rate": 4.2780099198260636e-05, + "loss": 3.898, + "step": 67400 + }, + { + "epoch": 4.579766272591384, + "grad_norm": 0.18513895571231842, + "learning_rate": 4.2775852697377364e-05, + "loss": 3.9676, + "step": 67405 + }, + { + "epoch": 4.580105992662046, + "grad_norm": 0.1534903645515442, + "learning_rate": 4.277160619649409e-05, + "loss": 3.9776, + "step": 67410 + }, + { + "epoch": 4.5804457127327085, + "grad_norm": 0.14586308598518372, + "learning_rate": 4.276735969561082e-05, + "loss": 3.8326, + "step": 67415 + }, + { + "epoch": 4.58078543280337, + "grad_norm": 0.16619174182415009, + "learning_rate": 4.276311319472754e-05, + "loss": 3.8098, + "step": 67420 + }, + { + "epoch": 4.581125152874032, + "grad_norm": 0.14280235767364502, + "learning_rate": 4.2758866693844276e-05, + "loss": 3.7212, + "step": 67425 + }, + { + "epoch": 4.581464872944694, + "grad_norm": 0.43513223528862, + "learning_rate": 4.2754620192961004e-05, + "loss": 3.7352, + "step": 67430 + }, + { + "epoch": 4.581804593015355, + "grad_norm": 0.1770252287387848, + "learning_rate": 4.2750373692077725e-05, + "loss": 4.0109, + "step": 67435 + }, + { + "epoch": 4.582144313086017, + "grad_norm": 0.7277382016181946, + "learning_rate": 4.274612719119446e-05, + "loss": 3.6408, + "step": 67440 + }, + { + "epoch": 4.582484033156679, + "grad_norm": 0.30086657404899597, + "learning_rate": 4.274188069031119e-05, + "loss": 3.7389, + "step": 67445 + }, + { + "epoch": 4.58282375322734, + "grad_norm": 0.18747907876968384, + "learning_rate": 4.273763418942791e-05, + "loss": 3.6821, + "step": 67450 + }, + { + "epoch": 4.583163473298002, + "grad_norm": 0.13620683550834656, + "learning_rate": 4.273338768854464e-05, + "loss": 3.64, + "step": 67455 + }, + { + "epoch": 4.5835031933686645, + "grad_norm": 0.21932657063007355, + "learning_rate": 4.272914118766137e-05, + "loss": 4.0844, + "step": 67460 + }, + { + "epoch": 4.583842913439326, + "grad_norm": 0.9739589095115662, + "learning_rate": 4.272489468677809e-05, + "loss": 3.707, + "step": 67465 + }, + { + "epoch": 4.584182633509988, + "grad_norm": 0.22257500886917114, + "learning_rate": 4.272064818589482e-05, + "loss": 3.7751, + "step": 67470 + }, + { + "epoch": 4.58452235358065, + "grad_norm": 0.20170500874519348, + "learning_rate": 4.2716401685011556e-05, + "loss": 3.8863, + "step": 67475 + }, + { + "epoch": 4.584862073651311, + "grad_norm": 0.2104494869709015, + "learning_rate": 4.2712155184128284e-05, + "loss": 3.7357, + "step": 67480 + }, + { + "epoch": 4.585201793721973, + "grad_norm": 0.13592462241649628, + "learning_rate": 4.2707908683245005e-05, + "loss": 3.6716, + "step": 67485 + }, + { + "epoch": 4.585541513792635, + "grad_norm": 0.5490448474884033, + "learning_rate": 4.270366218236174e-05, + "loss": 3.9319, + "step": 67490 + }, + { + "epoch": 4.585881233863296, + "grad_norm": 0.1665668785572052, + "learning_rate": 4.269941568147847e-05, + "loss": 3.6853, + "step": 67495 + }, + { + "epoch": 4.5862209539339585, + "grad_norm": 0.15551747381687164, + "learning_rate": 4.269516918059519e-05, + "loss": 4.097, + "step": 67500 + }, + { + "epoch": 4.5865606740046205, + "grad_norm": 0.17126336693763733, + "learning_rate": 4.269092267971192e-05, + "loss": 3.9102, + "step": 67505 + }, + { + "epoch": 4.586900394075282, + "grad_norm": 0.17959335446357727, + "learning_rate": 4.268667617882865e-05, + "loss": 3.8017, + "step": 67510 + }, + { + "epoch": 4.587240114145944, + "grad_norm": 0.15016528964042664, + "learning_rate": 4.268242967794537e-05, + "loss": 3.9085, + "step": 67515 + }, + { + "epoch": 4.587579834216606, + "grad_norm": 0.3971374034881592, + "learning_rate": 4.26781831770621e-05, + "loss": 3.9361, + "step": 67520 + }, + { + "epoch": 4.587919554287267, + "grad_norm": 0.1853644847869873, + "learning_rate": 4.2673936676178836e-05, + "loss": 3.9853, + "step": 67525 + }, + { + "epoch": 4.588259274357929, + "grad_norm": 0.19436350464820862, + "learning_rate": 4.266969017529556e-05, + "loss": 3.9174, + "step": 67530 + }, + { + "epoch": 4.588598994428591, + "grad_norm": 0.16673597693443298, + "learning_rate": 4.2665443674412285e-05, + "loss": 3.6828, + "step": 67535 + }, + { + "epoch": 4.588938714499252, + "grad_norm": 0.2305067777633667, + "learning_rate": 4.266119717352901e-05, + "loss": 3.9053, + "step": 67540 + }, + { + "epoch": 4.5892784345699145, + "grad_norm": 0.22541849315166473, + "learning_rate": 4.265695067264574e-05, + "loss": 3.9681, + "step": 67545 + }, + { + "epoch": 4.5896181546405765, + "grad_norm": 0.4553817808628082, + "learning_rate": 4.265270417176247e-05, + "loss": 3.7264, + "step": 67550 + }, + { + "epoch": 4.589957874711238, + "grad_norm": 0.4332711398601532, + "learning_rate": 4.26484576708792e-05, + "loss": 3.788, + "step": 67555 + }, + { + "epoch": 4.5902975947819, + "grad_norm": 0.15252548456192017, + "learning_rate": 4.2644211169995925e-05, + "loss": 4.1383, + "step": 67560 + }, + { + "epoch": 4.590637314852561, + "grad_norm": 0.1961861550807953, + "learning_rate": 4.263996466911265e-05, + "loss": 3.589, + "step": 67565 + }, + { + "epoch": 4.590977034923223, + "grad_norm": 0.1985137015581131, + "learning_rate": 4.263571816822938e-05, + "loss": 3.7358, + "step": 67570 + }, + { + "epoch": 4.591316754993885, + "grad_norm": 0.14835670590400696, + "learning_rate": 4.263147166734611e-05, + "loss": 3.9494, + "step": 67575 + }, + { + "epoch": 4.591656475064546, + "grad_norm": 0.2429930418729782, + "learning_rate": 4.262722516646284e-05, + "loss": 3.7778, + "step": 67580 + }, + { + "epoch": 4.591996195135208, + "grad_norm": 0.1474510282278061, + "learning_rate": 4.2622978665579565e-05, + "loss": 3.8432, + "step": 67585 + }, + { + "epoch": 4.5923359152058705, + "grad_norm": 0.17883218824863434, + "learning_rate": 4.261873216469629e-05, + "loss": 3.767, + "step": 67590 + }, + { + "epoch": 4.592675635276532, + "grad_norm": 0.1730176955461502, + "learning_rate": 4.261448566381302e-05, + "loss": 3.5548, + "step": 67595 + }, + { + "epoch": 4.593015355347194, + "grad_norm": 0.18520912528038025, + "learning_rate": 4.261023916292975e-05, + "loss": 3.821, + "step": 67600 + }, + { + "epoch": 4.593355075417856, + "grad_norm": 0.18324445188045502, + "learning_rate": 4.260599266204647e-05, + "loss": 3.7873, + "step": 67605 + }, + { + "epoch": 4.593694795488517, + "grad_norm": 0.17885354161262512, + "learning_rate": 4.2601746161163205e-05, + "loss": 3.7043, + "step": 67610 + }, + { + "epoch": 4.594034515559179, + "grad_norm": 0.4021769165992737, + "learning_rate": 4.2597499660279933e-05, + "loss": 3.7077, + "step": 67615 + }, + { + "epoch": 4.594374235629841, + "grad_norm": 0.18938715755939484, + "learning_rate": 4.2593253159396655e-05, + "loss": 3.8283, + "step": 67620 + }, + { + "epoch": 4.594713955700502, + "grad_norm": 0.18185393512248993, + "learning_rate": 4.258900665851339e-05, + "loss": 3.7167, + "step": 67625 + }, + { + "epoch": 4.595053675771164, + "grad_norm": 0.6385628581047058, + "learning_rate": 4.258476015763012e-05, + "loss": 3.8555, + "step": 67630 + }, + { + "epoch": 4.5953933958418265, + "grad_norm": 0.14296332001686096, + "learning_rate": 4.258051365674684e-05, + "loss": 3.6568, + "step": 67635 + }, + { + "epoch": 4.595733115912488, + "grad_norm": 0.2143794447183609, + "learning_rate": 4.257626715586357e-05, + "loss": 3.4718, + "step": 67640 + }, + { + "epoch": 4.59607283598315, + "grad_norm": 0.19242040812969208, + "learning_rate": 4.25720206549803e-05, + "loss": 3.968, + "step": 67645 + }, + { + "epoch": 4.596412556053812, + "grad_norm": 0.2191416174173355, + "learning_rate": 4.256777415409703e-05, + "loss": 3.881, + "step": 67650 + }, + { + "epoch": 4.596752276124473, + "grad_norm": 0.17096306383609772, + "learning_rate": 4.256352765321375e-05, + "loss": 4.0047, + "step": 67655 + }, + { + "epoch": 4.597091996195135, + "grad_norm": 0.20575201511383057, + "learning_rate": 4.2559281152330485e-05, + "loss": 3.649, + "step": 67660 + }, + { + "epoch": 4.597431716265797, + "grad_norm": 0.24478013813495636, + "learning_rate": 4.2555034651447213e-05, + "loss": 3.8591, + "step": 67665 + }, + { + "epoch": 4.597771436336458, + "grad_norm": 0.29316961765289307, + "learning_rate": 4.2550788150563935e-05, + "loss": 3.9184, + "step": 67670 + }, + { + "epoch": 4.59811115640712, + "grad_norm": 0.20992760360240936, + "learning_rate": 4.254654164968066e-05, + "loss": 3.7424, + "step": 67675 + }, + { + "epoch": 4.5984508764777825, + "grad_norm": 0.15399391949176788, + "learning_rate": 4.25422951487974e-05, + "loss": 3.7318, + "step": 67680 + }, + { + "epoch": 4.598790596548444, + "grad_norm": 0.20803824067115784, + "learning_rate": 4.253804864791412e-05, + "loss": 3.8504, + "step": 67685 + }, + { + "epoch": 4.599130316619106, + "grad_norm": 0.1804465800523758, + "learning_rate": 4.253380214703085e-05, + "loss": 3.7993, + "step": 67690 + }, + { + "epoch": 4.599470036689768, + "grad_norm": 0.6272724270820618, + "learning_rate": 4.252955564614758e-05, + "loss": 3.9214, + "step": 67695 + }, + { + "epoch": 4.599809756760429, + "grad_norm": 0.1520487368106842, + "learning_rate": 4.25253091452643e-05, + "loss": 3.8902, + "step": 67700 + }, + { + "epoch": 4.600149476831091, + "grad_norm": 0.19803626835346222, + "learning_rate": 4.252106264438103e-05, + "loss": 4.1091, + "step": 67705 + }, + { + "epoch": 4.600489196901753, + "grad_norm": 0.2514456808567047, + "learning_rate": 4.251681614349776e-05, + "loss": 3.5653, + "step": 67710 + }, + { + "epoch": 4.600828916972414, + "grad_norm": 0.17900964617729187, + "learning_rate": 4.251256964261449e-05, + "loss": 3.6615, + "step": 67715 + }, + { + "epoch": 4.601168637043076, + "grad_norm": 0.19256974756717682, + "learning_rate": 4.2508323141731215e-05, + "loss": 3.7849, + "step": 67720 + }, + { + "epoch": 4.6015083571137385, + "grad_norm": 0.14848706126213074, + "learning_rate": 4.250407664084794e-05, + "loss": 3.928, + "step": 67725 + }, + { + "epoch": 4.6018480771844, + "grad_norm": 0.17623162269592285, + "learning_rate": 4.249983013996467e-05, + "loss": 3.7586, + "step": 67730 + }, + { + "epoch": 4.602187797255062, + "grad_norm": 0.17929331958293915, + "learning_rate": 4.24955836390814e-05, + "loss": 3.7957, + "step": 67735 + }, + { + "epoch": 4.602527517325724, + "grad_norm": 0.20167165994644165, + "learning_rate": 4.249133713819813e-05, + "loss": 3.9641, + "step": 67740 + }, + { + "epoch": 4.602867237396385, + "grad_norm": 0.30083292722702026, + "learning_rate": 4.2487090637314855e-05, + "loss": 3.8673, + "step": 67745 + }, + { + "epoch": 4.603206957467047, + "grad_norm": 0.19522345066070557, + "learning_rate": 4.248284413643158e-05, + "loss": 3.8654, + "step": 67750 + }, + { + "epoch": 4.603546677537709, + "grad_norm": 0.4230428636074066, + "learning_rate": 4.247859763554831e-05, + "loss": 3.6728, + "step": 67755 + }, + { + "epoch": 4.60388639760837, + "grad_norm": 0.16984979808330536, + "learning_rate": 4.247435113466504e-05, + "loss": 3.9651, + "step": 67760 + }, + { + "epoch": 4.6042261176790324, + "grad_norm": 0.18225133419036865, + "learning_rate": 4.247010463378177e-05, + "loss": 3.7841, + "step": 67765 + }, + { + "epoch": 4.6045658377496945, + "grad_norm": 0.3221570551395416, + "learning_rate": 4.2465858132898495e-05, + "loss": 3.9795, + "step": 67770 + }, + { + "epoch": 4.604905557820356, + "grad_norm": 0.2098774015903473, + "learning_rate": 4.2461611632015216e-05, + "loss": 3.9265, + "step": 67775 + }, + { + "epoch": 4.605245277891018, + "grad_norm": 0.9245663285255432, + "learning_rate": 4.245736513113195e-05, + "loss": 3.9244, + "step": 67780 + }, + { + "epoch": 4.60558499796168, + "grad_norm": 0.18121393024921417, + "learning_rate": 4.245311863024868e-05, + "loss": 3.8573, + "step": 67785 + }, + { + "epoch": 4.605924718032341, + "grad_norm": 0.18289311230182648, + "learning_rate": 4.24488721293654e-05, + "loss": 3.9733, + "step": 67790 + }, + { + "epoch": 4.606264438103003, + "grad_norm": 0.12868796288967133, + "learning_rate": 4.2444625628482135e-05, + "loss": 3.7792, + "step": 67795 + }, + { + "epoch": 4.606604158173665, + "grad_norm": 0.1756097376346588, + "learning_rate": 4.244037912759886e-05, + "loss": 3.7657, + "step": 67800 + }, + { + "epoch": 4.606943878244326, + "grad_norm": 0.19917458295822144, + "learning_rate": 4.2436132626715584e-05, + "loss": 3.8398, + "step": 67805 + }, + { + "epoch": 4.6072835983149885, + "grad_norm": 0.18026548624038696, + "learning_rate": 4.243188612583231e-05, + "loss": 3.7815, + "step": 67810 + }, + { + "epoch": 4.6076233183856505, + "grad_norm": 0.15365087985992432, + "learning_rate": 4.242763962494905e-05, + "loss": 3.8143, + "step": 67815 + }, + { + "epoch": 4.607963038456312, + "grad_norm": 0.14292585849761963, + "learning_rate": 4.2423393124065775e-05, + "loss": 3.567, + "step": 67820 + }, + { + "epoch": 4.608302758526974, + "grad_norm": 0.18324902653694153, + "learning_rate": 4.2419146623182496e-05, + "loss": 4.0412, + "step": 67825 + }, + { + "epoch": 4.608642478597636, + "grad_norm": 0.17143481969833374, + "learning_rate": 4.241490012229923e-05, + "loss": 4.077, + "step": 67830 + }, + { + "epoch": 4.608982198668297, + "grad_norm": 0.2604639232158661, + "learning_rate": 4.241065362141596e-05, + "loss": 3.8375, + "step": 67835 + }, + { + "epoch": 4.609321918738959, + "grad_norm": 0.23124414682388306, + "learning_rate": 4.240640712053268e-05, + "loss": 3.6663, + "step": 67840 + }, + { + "epoch": 4.609661638809621, + "grad_norm": 0.18478786945343018, + "learning_rate": 4.240216061964941e-05, + "loss": 3.9764, + "step": 67845 + }, + { + "epoch": 4.610001358880282, + "grad_norm": 0.17159883677959442, + "learning_rate": 4.239791411876614e-05, + "loss": 3.9334, + "step": 67850 + }, + { + "epoch": 4.6103410789509445, + "grad_norm": 0.5762389302253723, + "learning_rate": 4.2393667617882864e-05, + "loss": 3.9678, + "step": 67855 + }, + { + "epoch": 4.6106807990216065, + "grad_norm": 0.179331973195076, + "learning_rate": 4.238942111699959e-05, + "loss": 3.7524, + "step": 67860 + }, + { + "epoch": 4.611020519092268, + "grad_norm": 0.15337836742401123, + "learning_rate": 4.238517461611633e-05, + "loss": 3.7075, + "step": 67865 + }, + { + "epoch": 4.61136023916293, + "grad_norm": 0.1778443455696106, + "learning_rate": 4.238092811523305e-05, + "loss": 3.7368, + "step": 67870 + }, + { + "epoch": 4.611699959233592, + "grad_norm": 1.1061655282974243, + "learning_rate": 4.2376681614349776e-05, + "loss": 3.8826, + "step": 67875 + }, + { + "epoch": 4.612039679304253, + "grad_norm": 0.30403628945350647, + "learning_rate": 4.2372435113466504e-05, + "loss": 3.9447, + "step": 67880 + }, + { + "epoch": 4.612379399374915, + "grad_norm": 0.12039197981357574, + "learning_rate": 4.236818861258323e-05, + "loss": 3.8808, + "step": 67885 + }, + { + "epoch": 4.612719119445577, + "grad_norm": 0.15981820225715637, + "learning_rate": 4.236394211169996e-05, + "loss": 3.9882, + "step": 67890 + }, + { + "epoch": 4.613058839516238, + "grad_norm": 0.19996973872184753, + "learning_rate": 4.235969561081669e-05, + "loss": 3.6293, + "step": 67895 + }, + { + "epoch": 4.6133985595869005, + "grad_norm": 0.44730961322784424, + "learning_rate": 4.2355449109933416e-05, + "loss": 4.0104, + "step": 67900 + }, + { + "epoch": 4.613738279657563, + "grad_norm": 0.16779673099517822, + "learning_rate": 4.2351202609050144e-05, + "loss": 3.963, + "step": 67905 + }, + { + "epoch": 4.614077999728224, + "grad_norm": 0.24031256139278412, + "learning_rate": 4.234695610816687e-05, + "loss": 3.5458, + "step": 67910 + }, + { + "epoch": 4.614417719798886, + "grad_norm": 0.7530183792114258, + "learning_rate": 4.23427096072836e-05, + "loss": 3.7414, + "step": 67915 + }, + { + "epoch": 4.614757439869548, + "grad_norm": 0.1741802841424942, + "learning_rate": 4.233846310640033e-05, + "loss": 3.8892, + "step": 67920 + }, + { + "epoch": 4.615097159940209, + "grad_norm": 0.18569494783878326, + "learning_rate": 4.2334216605517056e-05, + "loss": 4.0918, + "step": 67925 + }, + { + "epoch": 4.615436880010871, + "grad_norm": 0.21192747354507446, + "learning_rate": 4.2329970104633784e-05, + "loss": 3.6769, + "step": 67930 + }, + { + "epoch": 4.615776600081533, + "grad_norm": 0.1824348419904709, + "learning_rate": 4.232572360375051e-05, + "loss": 3.8207, + "step": 67935 + }, + { + "epoch": 4.616116320152194, + "grad_norm": 0.321607768535614, + "learning_rate": 4.232147710286724e-05, + "loss": 3.6905, + "step": 67940 + }, + { + "epoch": 4.6164560402228565, + "grad_norm": 0.15842141211032867, + "learning_rate": 4.231723060198396e-05, + "loss": 3.9336, + "step": 67945 + }, + { + "epoch": 4.616795760293519, + "grad_norm": 0.13680067658424377, + "learning_rate": 4.2312984101100696e-05, + "loss": 3.8934, + "step": 67950 + }, + { + "epoch": 4.61713548036418, + "grad_norm": 0.16522593796253204, + "learning_rate": 4.2308737600217424e-05, + "loss": 4.0063, + "step": 67955 + }, + { + "epoch": 4.617475200434842, + "grad_norm": 0.18114633858203888, + "learning_rate": 4.2304491099334146e-05, + "loss": 3.636, + "step": 67960 + }, + { + "epoch": 4.617814920505504, + "grad_norm": 0.15476509928703308, + "learning_rate": 4.230024459845088e-05, + "loss": 3.9745, + "step": 67965 + }, + { + "epoch": 4.618154640576165, + "grad_norm": 0.30897390842437744, + "learning_rate": 4.229599809756761e-05, + "loss": 3.7321, + "step": 67970 + }, + { + "epoch": 4.618494360646827, + "grad_norm": 0.24067522585391998, + "learning_rate": 4.229175159668433e-05, + "loss": 3.8791, + "step": 67975 + }, + { + "epoch": 4.618834080717489, + "grad_norm": 0.16195808351039886, + "learning_rate": 4.228750509580106e-05, + "loss": 3.6349, + "step": 67980 + }, + { + "epoch": 4.61917380078815, + "grad_norm": 0.22354227304458618, + "learning_rate": 4.228325859491779e-05, + "loss": 3.6942, + "step": 67985 + }, + { + "epoch": 4.6195135208588125, + "grad_norm": 0.5730610489845276, + "learning_rate": 4.227901209403452e-05, + "loss": 3.9281, + "step": 67990 + }, + { + "epoch": 4.619853240929475, + "grad_norm": 0.196171835064888, + "learning_rate": 4.227476559315124e-05, + "loss": 4.0428, + "step": 67995 + }, + { + "epoch": 4.620192961000136, + "grad_norm": 0.3413428068161011, + "learning_rate": 4.2270519092267976e-05, + "loss": 3.822, + "step": 68000 + }, + { + "epoch": 4.620532681070798, + "grad_norm": 0.20223082602024078, + "learning_rate": 4.2266272591384704e-05, + "loss": 3.9293, + "step": 68005 + }, + { + "epoch": 4.62087240114146, + "grad_norm": 0.18322432041168213, + "learning_rate": 4.2262026090501426e-05, + "loss": 3.7892, + "step": 68010 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.14900264143943787, + "learning_rate": 4.225777958961816e-05, + "loss": 3.7338, + "step": 68015 + }, + { + "epoch": 4.621551841282783, + "grad_norm": 0.29820555448532104, + "learning_rate": 4.225353308873489e-05, + "loss": 3.7798, + "step": 68020 + }, + { + "epoch": 4.621891561353445, + "grad_norm": 0.16546878218650818, + "learning_rate": 4.224928658785161e-05, + "loss": 3.6102, + "step": 68025 + }, + { + "epoch": 4.622231281424106, + "grad_norm": 0.20630255341529846, + "learning_rate": 4.224504008696834e-05, + "loss": 3.7948, + "step": 68030 + }, + { + "epoch": 4.6225710014947685, + "grad_norm": 0.17330925166606903, + "learning_rate": 4.224079358608507e-05, + "loss": 3.8435, + "step": 68035 + }, + { + "epoch": 4.62291072156543, + "grad_norm": 0.22484590113162994, + "learning_rate": 4.2236547085201794e-05, + "loss": 3.757, + "step": 68040 + }, + { + "epoch": 4.623250441636092, + "grad_norm": 0.18577666580677032, + "learning_rate": 4.223230058431852e-05, + "loss": 3.5149, + "step": 68045 + }, + { + "epoch": 4.623590161706754, + "grad_norm": 0.15874066948890686, + "learning_rate": 4.2228054083435257e-05, + "loss": 3.9862, + "step": 68050 + }, + { + "epoch": 4.623929881777415, + "grad_norm": 0.16075082123279572, + "learning_rate": 4.222380758255198e-05, + "loss": 3.6803, + "step": 68055 + }, + { + "epoch": 4.624269601848077, + "grad_norm": 0.1666797697544098, + "learning_rate": 4.2219561081668706e-05, + "loss": 3.9408, + "step": 68060 + }, + { + "epoch": 4.624609321918739, + "grad_norm": 0.19325454533100128, + "learning_rate": 4.2215314580785434e-05, + "loss": 4.0418, + "step": 68065 + }, + { + "epoch": 4.6249490419894, + "grad_norm": 0.7579836249351501, + "learning_rate": 4.221106807990216e-05, + "loss": 3.6501, + "step": 68070 + }, + { + "epoch": 4.6252887620600625, + "grad_norm": 0.14872662723064423, + "learning_rate": 4.220682157901889e-05, + "loss": 3.7818, + "step": 68075 + }, + { + "epoch": 4.6256284821307245, + "grad_norm": 1.10542631149292, + "learning_rate": 4.220257507813562e-05, + "loss": 3.8363, + "step": 68080 + }, + { + "epoch": 4.625968202201386, + "grad_norm": 0.1380252242088318, + "learning_rate": 4.2198328577252346e-05, + "loss": 3.9305, + "step": 68085 + }, + { + "epoch": 4.626307922272048, + "grad_norm": 0.1791084259748459, + "learning_rate": 4.2194082076369074e-05, + "loss": 3.7885, + "step": 68090 + }, + { + "epoch": 4.62664764234271, + "grad_norm": 0.1647564023733139, + "learning_rate": 4.21898355754858e-05, + "loss": 3.842, + "step": 68095 + }, + { + "epoch": 4.626987362413371, + "grad_norm": 0.3895025849342346, + "learning_rate": 4.218558907460253e-05, + "loss": 3.9255, + "step": 68100 + }, + { + "epoch": 4.627327082484033, + "grad_norm": 0.176923468708992, + "learning_rate": 4.218134257371926e-05, + "loss": 3.6541, + "step": 68105 + }, + { + "epoch": 4.627666802554695, + "grad_norm": 0.1514744609594345, + "learning_rate": 4.2177096072835986e-05, + "loss": 3.7691, + "step": 68110 + }, + { + "epoch": 4.628006522625356, + "grad_norm": 0.2833189368247986, + "learning_rate": 4.2172849571952714e-05, + "loss": 3.9058, + "step": 68115 + }, + { + "epoch": 4.6283462426960185, + "grad_norm": 0.20636914670467377, + "learning_rate": 4.216860307106944e-05, + "loss": 3.8371, + "step": 68120 + }, + { + "epoch": 4.6286859627666805, + "grad_norm": 0.2141110897064209, + "learning_rate": 4.216435657018617e-05, + "loss": 3.8758, + "step": 68125 + }, + { + "epoch": 4.629025682837342, + "grad_norm": 0.17259836196899414, + "learning_rate": 4.216011006930289e-05, + "loss": 3.9238, + "step": 68130 + }, + { + "epoch": 4.629365402908004, + "grad_norm": 0.1702612191438675, + "learning_rate": 4.2155863568419626e-05, + "loss": 3.7565, + "step": 68135 + }, + { + "epoch": 4.629705122978666, + "grad_norm": 0.6059342622756958, + "learning_rate": 4.2151617067536354e-05, + "loss": 3.8306, + "step": 68140 + }, + { + "epoch": 4.630044843049327, + "grad_norm": 0.15153522789478302, + "learning_rate": 4.2147370566653075e-05, + "loss": 3.8697, + "step": 68145 + }, + { + "epoch": 4.630384563119989, + "grad_norm": 0.18780411779880524, + "learning_rate": 4.214312406576981e-05, + "loss": 3.7857, + "step": 68150 + }, + { + "epoch": 4.630724283190651, + "grad_norm": 0.18265928328037262, + "learning_rate": 4.213887756488654e-05, + "loss": 3.9509, + "step": 68155 + }, + { + "epoch": 4.631064003261312, + "grad_norm": 0.1687014102935791, + "learning_rate": 4.2134631064003266e-05, + "loss": 3.8409, + "step": 68160 + }, + { + "epoch": 4.6314037233319745, + "grad_norm": 0.22622106969356537, + "learning_rate": 4.213038456311999e-05, + "loss": 3.9814, + "step": 68165 + }, + { + "epoch": 4.6317434434026366, + "grad_norm": 0.3558909595012665, + "learning_rate": 4.212613806223672e-05, + "loss": 3.8516, + "step": 68170 + }, + { + "epoch": 4.632083163473298, + "grad_norm": 0.26481249928474426, + "learning_rate": 4.212189156135345e-05, + "loss": 3.8967, + "step": 68175 + }, + { + "epoch": 4.63242288354396, + "grad_norm": 0.14586719870567322, + "learning_rate": 4.211764506047017e-05, + "loss": 3.7741, + "step": 68180 + }, + { + "epoch": 4.632762603614622, + "grad_norm": 0.19265222549438477, + "learning_rate": 4.2113398559586906e-05, + "loss": 3.764, + "step": 68185 + }, + { + "epoch": 4.633102323685283, + "grad_norm": 0.18824563920497894, + "learning_rate": 4.2109152058703634e-05, + "loss": 3.7842, + "step": 68190 + }, + { + "epoch": 4.633442043755945, + "grad_norm": 0.1561487913131714, + "learning_rate": 4.2104905557820355e-05, + "loss": 3.9968, + "step": 68195 + }, + { + "epoch": 4.633781763826607, + "grad_norm": 0.15808716416358948, + "learning_rate": 4.210065905693708e-05, + "loss": 3.727, + "step": 68200 + }, + { + "epoch": 4.634121483897268, + "grad_norm": 0.5167113542556763, + "learning_rate": 4.209641255605382e-05, + "loss": 3.9226, + "step": 68205 + }, + { + "epoch": 4.6344612039679305, + "grad_norm": 0.19829237461090088, + "learning_rate": 4.209216605517054e-05, + "loss": 3.5789, + "step": 68210 + }, + { + "epoch": 4.634800924038593, + "grad_norm": 0.20148324966430664, + "learning_rate": 4.208791955428727e-05, + "loss": 4.162, + "step": 68215 + }, + { + "epoch": 4.635140644109254, + "grad_norm": 0.17271050810813904, + "learning_rate": 4.2083673053404e-05, + "loss": 4.0319, + "step": 68220 + }, + { + "epoch": 4.635480364179916, + "grad_norm": 0.3469448387622833, + "learning_rate": 4.207942655252072e-05, + "loss": 4.0315, + "step": 68225 + }, + { + "epoch": 4.635820084250578, + "grad_norm": 0.20361623167991638, + "learning_rate": 4.207518005163745e-05, + "loss": 3.7619, + "step": 68230 + }, + { + "epoch": 4.636159804321239, + "grad_norm": 0.20639410614967346, + "learning_rate": 4.207093355075418e-05, + "loss": 3.8288, + "step": 68235 + }, + { + "epoch": 4.636499524391901, + "grad_norm": 0.7320982217788696, + "learning_rate": 4.206668704987091e-05, + "loss": 3.8286, + "step": 68240 + }, + { + "epoch": 4.636839244462563, + "grad_norm": 0.15213309228420258, + "learning_rate": 4.2062440548987635e-05, + "loss": 3.8832, + "step": 68245 + }, + { + "epoch": 4.637178964533224, + "grad_norm": 0.3042663335800171, + "learning_rate": 4.205819404810436e-05, + "loss": 3.8673, + "step": 68250 + }, + { + "epoch": 4.6375186846038865, + "grad_norm": 0.1501568704843521, + "learning_rate": 4.205394754722109e-05, + "loss": 3.7389, + "step": 68255 + }, + { + "epoch": 4.637858404674548, + "grad_norm": 0.14601919054985046, + "learning_rate": 4.204970104633782e-05, + "loss": 3.923, + "step": 68260 + }, + { + "epoch": 4.63819812474521, + "grad_norm": 0.14824987947940826, + "learning_rate": 4.204545454545455e-05, + "loss": 3.7811, + "step": 68265 + }, + { + "epoch": 4.638537844815872, + "grad_norm": 1.0809499025344849, + "learning_rate": 4.2041208044571275e-05, + "loss": 3.5815, + "step": 68270 + }, + { + "epoch": 4.638877564886533, + "grad_norm": 0.1522216796875, + "learning_rate": 4.2036961543688e-05, + "loss": 3.8996, + "step": 68275 + }, + { + "epoch": 4.639217284957195, + "grad_norm": 0.19097968935966492, + "learning_rate": 4.203271504280473e-05, + "loss": 3.9927, + "step": 68280 + }, + { + "epoch": 4.639557005027857, + "grad_norm": 0.19840562343597412, + "learning_rate": 4.202846854192146e-05, + "loss": 3.6597, + "step": 68285 + }, + { + "epoch": 4.639896725098518, + "grad_norm": 0.1563262641429901, + "learning_rate": 4.202422204103819e-05, + "loss": 3.6073, + "step": 68290 + }, + { + "epoch": 4.64023644516918, + "grad_norm": 0.21896524727344513, + "learning_rate": 4.2019975540154915e-05, + "loss": 4.0807, + "step": 68295 + }, + { + "epoch": 4.6405761652398425, + "grad_norm": 0.1788419783115387, + "learning_rate": 4.2015729039271637e-05, + "loss": 3.6236, + "step": 68300 + }, + { + "epoch": 4.640915885310504, + "grad_norm": 0.15648634731769562, + "learning_rate": 4.201148253838837e-05, + "loss": 3.5673, + "step": 68305 + }, + { + "epoch": 4.641255605381166, + "grad_norm": 0.18030019104480743, + "learning_rate": 4.20072360375051e-05, + "loss": 3.7116, + "step": 68310 + }, + { + "epoch": 4.641595325451828, + "grad_norm": 0.5064977407455444, + "learning_rate": 4.200298953662182e-05, + "loss": 3.7874, + "step": 68315 + }, + { + "epoch": 4.641935045522489, + "grad_norm": 0.15379804372787476, + "learning_rate": 4.1998743035738555e-05, + "loss": 3.7849, + "step": 68320 + }, + { + "epoch": 4.642274765593151, + "grad_norm": 0.17650803923606873, + "learning_rate": 4.1994496534855283e-05, + "loss": 3.9155, + "step": 68325 + }, + { + "epoch": 4.642614485663813, + "grad_norm": 0.2467309981584549, + "learning_rate": 4.199025003397201e-05, + "loss": 3.921, + "step": 68330 + }, + { + "epoch": 4.642954205734474, + "grad_norm": 0.1499156951904297, + "learning_rate": 4.198600353308873e-05, + "loss": 3.8493, + "step": 68335 + }, + { + "epoch": 4.6432939258051364, + "grad_norm": 0.22314903140068054, + "learning_rate": 4.198175703220547e-05, + "loss": 3.8105, + "step": 68340 + }, + { + "epoch": 4.6436336458757985, + "grad_norm": 1.2103030681610107, + "learning_rate": 4.1977510531322195e-05, + "loss": 3.7051, + "step": 68345 + }, + { + "epoch": 4.64397336594646, + "grad_norm": 0.1789325475692749, + "learning_rate": 4.197326403043892e-05, + "loss": 3.8141, + "step": 68350 + }, + { + "epoch": 4.644313086017122, + "grad_norm": 0.20247028768062592, + "learning_rate": 4.196901752955565e-05, + "loss": 3.8897, + "step": 68355 + }, + { + "epoch": 4.644652806087784, + "grad_norm": 0.26225537061691284, + "learning_rate": 4.196477102867238e-05, + "loss": 3.9131, + "step": 68360 + }, + { + "epoch": 4.644992526158445, + "grad_norm": 0.1919069141149521, + "learning_rate": 4.19605245277891e-05, + "loss": 3.7017, + "step": 68365 + }, + { + "epoch": 4.645332246229107, + "grad_norm": 0.1797487586736679, + "learning_rate": 4.195627802690583e-05, + "loss": 3.9613, + "step": 68370 + }, + { + "epoch": 4.645671966299769, + "grad_norm": 0.1894199103116989, + "learning_rate": 4.1952031526022563e-05, + "loss": 3.9553, + "step": 68375 + }, + { + "epoch": 4.64601168637043, + "grad_norm": 0.16636182367801666, + "learning_rate": 4.1947785025139285e-05, + "loss": 3.7149, + "step": 68380 + }, + { + "epoch": 4.6463514064410925, + "grad_norm": 0.17865613102912903, + "learning_rate": 4.194353852425601e-05, + "loss": 3.8182, + "step": 68385 + }, + { + "epoch": 4.6466911265117545, + "grad_norm": 0.16002888977527618, + "learning_rate": 4.193929202337275e-05, + "loss": 3.7241, + "step": 68390 + }, + { + "epoch": 4.647030846582416, + "grad_norm": 0.22040903568267822, + "learning_rate": 4.193504552248947e-05, + "loss": 3.8545, + "step": 68395 + }, + { + "epoch": 4.647370566653078, + "grad_norm": 0.48894357681274414, + "learning_rate": 4.19307990216062e-05, + "loss": 3.9968, + "step": 68400 + }, + { + "epoch": 4.64771028672374, + "grad_norm": 0.3262284994125366, + "learning_rate": 4.192655252072293e-05, + "loss": 3.7322, + "step": 68405 + }, + { + "epoch": 4.648050006794401, + "grad_norm": 0.18816804885864258, + "learning_rate": 4.192230601983965e-05, + "loss": 3.7385, + "step": 68410 + }, + { + "epoch": 4.648389726865063, + "grad_norm": 0.35360637307167053, + "learning_rate": 4.191805951895638e-05, + "loss": 3.7053, + "step": 68415 + }, + { + "epoch": 4.648729446935725, + "grad_norm": 0.17185059189796448, + "learning_rate": 4.191381301807311e-05, + "loss": 3.6735, + "step": 68420 + }, + { + "epoch": 4.649069167006386, + "grad_norm": 0.1509062498807907, + "learning_rate": 4.190956651718984e-05, + "loss": 3.6538, + "step": 68425 + }, + { + "epoch": 4.6494088870770485, + "grad_norm": 0.16673624515533447, + "learning_rate": 4.1905320016306565e-05, + "loss": 3.766, + "step": 68430 + }, + { + "epoch": 4.6497486071477105, + "grad_norm": 0.19565734267234802, + "learning_rate": 4.190107351542329e-05, + "loss": 3.8497, + "step": 68435 + }, + { + "epoch": 4.650088327218372, + "grad_norm": 0.26451340317726135, + "learning_rate": 4.189682701454002e-05, + "loss": 4.114, + "step": 68440 + }, + { + "epoch": 4.650428047289034, + "grad_norm": 0.8062251210212708, + "learning_rate": 4.189258051365675e-05, + "loss": 3.8657, + "step": 68445 + }, + { + "epoch": 4.650767767359696, + "grad_norm": 0.24221168458461761, + "learning_rate": 4.188833401277348e-05, + "loss": 3.8274, + "step": 68450 + }, + { + "epoch": 4.651107487430357, + "grad_norm": 0.15242592990398407, + "learning_rate": 4.1884087511890205e-05, + "loss": 3.8612, + "step": 68455 + }, + { + "epoch": 4.651447207501019, + "grad_norm": 0.15455441176891327, + "learning_rate": 4.187984101100693e-05, + "loss": 4.147, + "step": 68460 + }, + { + "epoch": 4.651786927571681, + "grad_norm": 0.14765112102031708, + "learning_rate": 4.187559451012366e-05, + "loss": 3.7079, + "step": 68465 + }, + { + "epoch": 4.652126647642342, + "grad_norm": 0.1643567979335785, + "learning_rate": 4.187134800924038e-05, + "loss": 4.1273, + "step": 68470 + }, + { + "epoch": 4.6524663677130045, + "grad_norm": 0.23158827424049377, + "learning_rate": 4.186710150835712e-05, + "loss": 3.7579, + "step": 68475 + }, + { + "epoch": 4.6528060877836666, + "grad_norm": 0.13821285963058472, + "learning_rate": 4.1862855007473845e-05, + "loss": 3.5296, + "step": 68480 + }, + { + "epoch": 4.653145807854328, + "grad_norm": 0.19568775594234467, + "learning_rate": 4.1858608506590566e-05, + "loss": 3.7982, + "step": 68485 + }, + { + "epoch": 4.65348552792499, + "grad_norm": 0.1908196210861206, + "learning_rate": 4.18543620057073e-05, + "loss": 3.9163, + "step": 68490 + }, + { + "epoch": 4.653825247995652, + "grad_norm": 0.49894392490386963, + "learning_rate": 4.185011550482403e-05, + "loss": 3.7831, + "step": 68495 + }, + { + "epoch": 4.654164968066313, + "grad_norm": 0.1846211701631546, + "learning_rate": 4.184586900394076e-05, + "loss": 3.8564, + "step": 68500 + }, + { + "epoch": 4.654504688136975, + "grad_norm": 0.16595642268657684, + "learning_rate": 4.1841622503057485e-05, + "loss": 3.9616, + "step": 68505 + }, + { + "epoch": 4.654844408207637, + "grad_norm": 0.19694316387176514, + "learning_rate": 4.183737600217421e-05, + "loss": 3.8031, + "step": 68510 + }, + { + "epoch": 4.655184128278298, + "grad_norm": 0.16045209765434265, + "learning_rate": 4.183312950129094e-05, + "loss": 3.6225, + "step": 68515 + }, + { + "epoch": 4.6555238483489605, + "grad_norm": 0.15459367632865906, + "learning_rate": 4.182888300040766e-05, + "loss": 3.7729, + "step": 68520 + }, + { + "epoch": 4.655863568419623, + "grad_norm": 0.17601895332336426, + "learning_rate": 4.18246364995244e-05, + "loss": 3.7772, + "step": 68525 + }, + { + "epoch": 4.656203288490284, + "grad_norm": 0.1789489984512329, + "learning_rate": 4.1820389998641125e-05, + "loss": 3.8997, + "step": 68530 + }, + { + "epoch": 4.656543008560946, + "grad_norm": 0.20182225108146667, + "learning_rate": 4.1816143497757846e-05, + "loss": 3.8936, + "step": 68535 + }, + { + "epoch": 4.656882728631608, + "grad_norm": 0.5025500655174255, + "learning_rate": 4.181189699687458e-05, + "loss": 3.8172, + "step": 68540 + }, + { + "epoch": 4.657222448702269, + "grad_norm": 0.28325700759887695, + "learning_rate": 4.180765049599131e-05, + "loss": 3.875, + "step": 68545 + }, + { + "epoch": 4.657562168772931, + "grad_norm": 0.215296670794487, + "learning_rate": 4.180340399510803e-05, + "loss": 3.8759, + "step": 68550 + }, + { + "epoch": 4.657901888843593, + "grad_norm": 0.153108149766922, + "learning_rate": 4.179915749422476e-05, + "loss": 3.5634, + "step": 68555 + }, + { + "epoch": 4.658241608914254, + "grad_norm": 0.3744702637195587, + "learning_rate": 4.179491099334149e-05, + "loss": 3.8082, + "step": 68560 + }, + { + "epoch": 4.6585813289849165, + "grad_norm": 0.4385930299758911, + "learning_rate": 4.1790664492458214e-05, + "loss": 3.8904, + "step": 68565 + }, + { + "epoch": 4.658921049055579, + "grad_norm": 0.23928719758987427, + "learning_rate": 4.178641799157494e-05, + "loss": 4.0084, + "step": 68570 + }, + { + "epoch": 4.65926076912624, + "grad_norm": 0.18486149609088898, + "learning_rate": 4.178217149069168e-05, + "loss": 3.7382, + "step": 68575 + }, + { + "epoch": 4.659600489196902, + "grad_norm": 0.19583159685134888, + "learning_rate": 4.17779249898084e-05, + "loss": 4.0757, + "step": 68580 + }, + { + "epoch": 4.659940209267564, + "grad_norm": 0.16731701791286469, + "learning_rate": 4.1773678488925126e-05, + "loss": 3.9565, + "step": 68585 + }, + { + "epoch": 4.660279929338225, + "grad_norm": 0.16313588619232178, + "learning_rate": 4.1769431988041854e-05, + "loss": 3.8856, + "step": 68590 + }, + { + "epoch": 4.660619649408887, + "grad_norm": 0.17274528741836548, + "learning_rate": 4.176518548715858e-05, + "loss": 3.8961, + "step": 68595 + }, + { + "epoch": 4.660959369479549, + "grad_norm": 0.14766234159469604, + "learning_rate": 4.176093898627531e-05, + "loss": 3.5486, + "step": 68600 + }, + { + "epoch": 4.66129908955021, + "grad_norm": 0.15812993049621582, + "learning_rate": 4.175669248539204e-05, + "loss": 3.7664, + "step": 68605 + }, + { + "epoch": 4.6616388096208725, + "grad_norm": 0.1941656619310379, + "learning_rate": 4.1752445984508766e-05, + "loss": 3.8687, + "step": 68610 + }, + { + "epoch": 4.661978529691535, + "grad_norm": 0.16856634616851807, + "learning_rate": 4.1748199483625494e-05, + "loss": 3.7513, + "step": 68615 + }, + { + "epoch": 4.662318249762196, + "grad_norm": 0.17885614931583405, + "learning_rate": 4.174395298274222e-05, + "loss": 3.7387, + "step": 68620 + }, + { + "epoch": 4.662657969832858, + "grad_norm": 0.17044313251972198, + "learning_rate": 4.173970648185895e-05, + "loss": 3.729, + "step": 68625 + }, + { + "epoch": 4.66299768990352, + "grad_norm": 0.17307260632514954, + "learning_rate": 4.173545998097568e-05, + "loss": 3.9667, + "step": 68630 + }, + { + "epoch": 4.663337409974181, + "grad_norm": 0.24773229658603668, + "learning_rate": 4.1731213480092406e-05, + "loss": 3.9001, + "step": 68635 + }, + { + "epoch": 4.663677130044843, + "grad_norm": 0.2012760192155838, + "learning_rate": 4.1726966979209134e-05, + "loss": 3.8207, + "step": 68640 + }, + { + "epoch": 4.664016850115505, + "grad_norm": 0.16266851127147675, + "learning_rate": 4.172272047832586e-05, + "loss": 3.7814, + "step": 68645 + }, + { + "epoch": 4.6643565701861665, + "grad_norm": 0.2889043092727661, + "learning_rate": 4.171847397744259e-05, + "loss": 3.7244, + "step": 68650 + }, + { + "epoch": 4.6646962902568285, + "grad_norm": 0.17196325957775116, + "learning_rate": 4.171422747655931e-05, + "loss": 3.8498, + "step": 68655 + }, + { + "epoch": 4.665036010327491, + "grad_norm": 0.172272726893425, + "learning_rate": 4.1709980975676046e-05, + "loss": 3.8935, + "step": 68660 + }, + { + "epoch": 4.665375730398152, + "grad_norm": 0.15466797351837158, + "learning_rate": 4.1705734474792774e-05, + "loss": 3.7442, + "step": 68665 + }, + { + "epoch": 4.665715450468814, + "grad_norm": 0.17263439297676086, + "learning_rate": 4.17014879739095e-05, + "loss": 3.8681, + "step": 68670 + }, + { + "epoch": 4.666055170539476, + "grad_norm": 0.1758696585893631, + "learning_rate": 4.169724147302623e-05, + "loss": 3.8892, + "step": 68675 + }, + { + "epoch": 4.666394890610137, + "grad_norm": 0.1925162523984909, + "learning_rate": 4.169299497214296e-05, + "loss": 3.6182, + "step": 68680 + }, + { + "epoch": 4.666734610680799, + "grad_norm": 0.21162870526313782, + "learning_rate": 4.1688748471259686e-05, + "loss": 3.9125, + "step": 68685 + }, + { + "epoch": 4.667074330751461, + "grad_norm": 0.2221028059720993, + "learning_rate": 4.168450197037641e-05, + "loss": 3.9177, + "step": 68690 + }, + { + "epoch": 4.6674140508221225, + "grad_norm": 0.18915507197380066, + "learning_rate": 4.168025546949314e-05, + "loss": 3.7381, + "step": 68695 + }, + { + "epoch": 4.6677537708927845, + "grad_norm": 0.18051481246948242, + "learning_rate": 4.167600896860987e-05, + "loss": 3.8834, + "step": 68700 + }, + { + "epoch": 4.668093490963447, + "grad_norm": 0.19264240562915802, + "learning_rate": 4.167176246772659e-05, + "loss": 3.8062, + "step": 68705 + }, + { + "epoch": 4.668433211034108, + "grad_norm": 0.17635060846805573, + "learning_rate": 4.1667515966843326e-05, + "loss": 3.8199, + "step": 68710 + }, + { + "epoch": 4.66877293110477, + "grad_norm": 0.16745080053806305, + "learning_rate": 4.1663269465960054e-05, + "loss": 3.8101, + "step": 68715 + }, + { + "epoch": 4.669112651175431, + "grad_norm": 0.24310070276260376, + "learning_rate": 4.1659022965076776e-05, + "loss": 3.8616, + "step": 68720 + }, + { + "epoch": 4.669452371246093, + "grad_norm": 0.18546044826507568, + "learning_rate": 4.1654776464193504e-05, + "loss": 4.0237, + "step": 68725 + }, + { + "epoch": 4.669792091316755, + "grad_norm": 0.2103537917137146, + "learning_rate": 4.165052996331024e-05, + "loss": 3.9751, + "step": 68730 + }, + { + "epoch": 4.670131811387416, + "grad_norm": 2.098684549331665, + "learning_rate": 4.164628346242696e-05, + "loss": 3.8476, + "step": 68735 + }, + { + "epoch": 4.6704715314580785, + "grad_norm": 0.1540943682193756, + "learning_rate": 4.164203696154369e-05, + "loss": 3.8387, + "step": 68740 + }, + { + "epoch": 4.6708112515287405, + "grad_norm": 0.19278714060783386, + "learning_rate": 4.163779046066042e-05, + "loss": 3.6942, + "step": 68745 + }, + { + "epoch": 4.671150971599402, + "grad_norm": 1.0868399143218994, + "learning_rate": 4.1633543959777144e-05, + "loss": 3.9957, + "step": 68750 + }, + { + "epoch": 4.671490691670064, + "grad_norm": 0.2361050695180893, + "learning_rate": 4.162929745889387e-05, + "loss": 3.842, + "step": 68755 + }, + { + "epoch": 4.671830411740726, + "grad_norm": 0.2030894011259079, + "learning_rate": 4.16250509580106e-05, + "loss": 3.8666, + "step": 68760 + }, + { + "epoch": 4.672170131811387, + "grad_norm": 0.1619316041469574, + "learning_rate": 4.162080445712733e-05, + "loss": 4.0026, + "step": 68765 + }, + { + "epoch": 4.672509851882049, + "grad_norm": 0.20540930330753326, + "learning_rate": 4.1616557956244056e-05, + "loss": 3.6949, + "step": 68770 + }, + { + "epoch": 4.672849571952711, + "grad_norm": 0.12846918404102325, + "learning_rate": 4.1612311455360784e-05, + "loss": 3.9891, + "step": 68775 + }, + { + "epoch": 4.673189292023372, + "grad_norm": 0.17641690373420715, + "learning_rate": 4.160806495447751e-05, + "loss": 3.9361, + "step": 68780 + }, + { + "epoch": 4.6735290120940345, + "grad_norm": 0.18398045003414154, + "learning_rate": 4.160381845359424e-05, + "loss": 3.904, + "step": 68785 + }, + { + "epoch": 4.673868732164697, + "grad_norm": 0.23215536773204803, + "learning_rate": 4.159957195271097e-05, + "loss": 3.7337, + "step": 68790 + }, + { + "epoch": 4.674208452235358, + "grad_norm": 0.15833885967731476, + "learning_rate": 4.1595325451827696e-05, + "loss": 3.674, + "step": 68795 + }, + { + "epoch": 4.67454817230602, + "grad_norm": 0.17487215995788574, + "learning_rate": 4.1591078950944424e-05, + "loss": 3.9058, + "step": 68800 + }, + { + "epoch": 4.674887892376682, + "grad_norm": 0.9540194869041443, + "learning_rate": 4.158683245006115e-05, + "loss": 3.6594, + "step": 68805 + }, + { + "epoch": 4.675227612447343, + "grad_norm": 0.15823917090892792, + "learning_rate": 4.158258594917788e-05, + "loss": 4.0222, + "step": 68810 + }, + { + "epoch": 4.675567332518005, + "grad_norm": 0.2661254107952118, + "learning_rate": 4.157833944829461e-05, + "loss": 3.8508, + "step": 68815 + }, + { + "epoch": 4.675907052588667, + "grad_norm": 0.2501099109649658, + "learning_rate": 4.1574092947411336e-05, + "loss": 3.9446, + "step": 68820 + }, + { + "epoch": 4.676246772659328, + "grad_norm": 0.17102719843387604, + "learning_rate": 4.156984644652806e-05, + "loss": 3.8182, + "step": 68825 + }, + { + "epoch": 4.6765864927299905, + "grad_norm": 0.28470659255981445, + "learning_rate": 4.156559994564479e-05, + "loss": 3.6269, + "step": 68830 + }, + { + "epoch": 4.676926212800653, + "grad_norm": 0.21534529328346252, + "learning_rate": 4.156135344476152e-05, + "loss": 3.9316, + "step": 68835 + }, + { + "epoch": 4.677265932871314, + "grad_norm": 0.19470536708831787, + "learning_rate": 4.155710694387825e-05, + "loss": 3.6262, + "step": 68840 + }, + { + "epoch": 4.677605652941976, + "grad_norm": 0.26461270451545715, + "learning_rate": 4.1552860442994976e-05, + "loss": 3.7323, + "step": 68845 + }, + { + "epoch": 4.677945373012638, + "grad_norm": 0.2353813648223877, + "learning_rate": 4.1548613942111704e-05, + "loss": 3.6853, + "step": 68850 + }, + { + "epoch": 4.678285093083299, + "grad_norm": 0.1428670585155487, + "learning_rate": 4.154436744122843e-05, + "loss": 3.7587, + "step": 68855 + }, + { + "epoch": 4.678624813153961, + "grad_norm": 0.21942399442195892, + "learning_rate": 4.154012094034515e-05, + "loss": 3.7134, + "step": 68860 + }, + { + "epoch": 4.678964533224623, + "grad_norm": 0.20228396356105804, + "learning_rate": 4.153587443946189e-05, + "loss": 3.8048, + "step": 68865 + }, + { + "epoch": 4.679304253295284, + "grad_norm": 0.15177245438098907, + "learning_rate": 4.1531627938578616e-05, + "loss": 3.8301, + "step": 68870 + }, + { + "epoch": 4.6796439733659465, + "grad_norm": 1.9536741971969604, + "learning_rate": 4.152738143769534e-05, + "loss": 3.8165, + "step": 68875 + }, + { + "epoch": 4.679983693436609, + "grad_norm": 0.15168094635009766, + "learning_rate": 4.152313493681207e-05, + "loss": 3.6674, + "step": 68880 + }, + { + "epoch": 4.68032341350727, + "grad_norm": 0.16528354585170746, + "learning_rate": 4.15188884359288e-05, + "loss": 3.7892, + "step": 68885 + }, + { + "epoch": 4.680663133577932, + "grad_norm": 0.20943722128868103, + "learning_rate": 4.151464193504552e-05, + "loss": 3.8694, + "step": 68890 + }, + { + "epoch": 4.681002853648594, + "grad_norm": 0.18656902015209198, + "learning_rate": 4.151039543416225e-05, + "loss": 3.852, + "step": 68895 + }, + { + "epoch": 4.681342573719255, + "grad_norm": 1.6915940046310425, + "learning_rate": 4.1506148933278984e-05, + "loss": 3.7445, + "step": 68900 + }, + { + "epoch": 4.681682293789917, + "grad_norm": 0.7540450692176819, + "learning_rate": 4.1501902432395705e-05, + "loss": 3.8773, + "step": 68905 + }, + { + "epoch": 4.682022013860579, + "grad_norm": 0.1617620587348938, + "learning_rate": 4.149765593151243e-05, + "loss": 3.6545, + "step": 68910 + }, + { + "epoch": 4.68236173393124, + "grad_norm": 0.20387063920497894, + "learning_rate": 4.149340943062917e-05, + "loss": 3.8506, + "step": 68915 + }, + { + "epoch": 4.6827014540019025, + "grad_norm": 0.2659585773944855, + "learning_rate": 4.148916292974589e-05, + "loss": 4.0698, + "step": 68920 + }, + { + "epoch": 4.683041174072565, + "grad_norm": 0.23171235620975494, + "learning_rate": 4.148491642886262e-05, + "loss": 3.8787, + "step": 68925 + }, + { + "epoch": 4.683380894143226, + "grad_norm": 0.1719885617494583, + "learning_rate": 4.148066992797935e-05, + "loss": 3.7812, + "step": 68930 + }, + { + "epoch": 4.683720614213888, + "grad_norm": 0.19112372398376465, + "learning_rate": 4.147642342709607e-05, + "loss": 3.445, + "step": 68935 + }, + { + "epoch": 4.684060334284549, + "grad_norm": 0.1557842344045639, + "learning_rate": 4.14721769262128e-05, + "loss": 4.0398, + "step": 68940 + }, + { + "epoch": 4.684400054355211, + "grad_norm": 0.14968185126781464, + "learning_rate": 4.146793042532953e-05, + "loss": 3.647, + "step": 68945 + }, + { + "epoch": 4.684739774425873, + "grad_norm": 0.16811871528625488, + "learning_rate": 4.146368392444626e-05, + "loss": 3.7352, + "step": 68950 + }, + { + "epoch": 4.685079494496534, + "grad_norm": 0.19865262508392334, + "learning_rate": 4.1459437423562985e-05, + "loss": 3.9672, + "step": 68955 + }, + { + "epoch": 4.6854192145671965, + "grad_norm": 0.25746721029281616, + "learning_rate": 4.145519092267971e-05, + "loss": 3.9415, + "step": 68960 + }, + { + "epoch": 4.6857589346378585, + "grad_norm": 0.16040000319480896, + "learning_rate": 4.145094442179644e-05, + "loss": 3.7074, + "step": 68965 + }, + { + "epoch": 4.68609865470852, + "grad_norm": 0.23114292323589325, + "learning_rate": 4.144669792091317e-05, + "loss": 3.6639, + "step": 68970 + }, + { + "epoch": 4.686438374779182, + "grad_norm": 0.18062016367912292, + "learning_rate": 4.14424514200299e-05, + "loss": 3.8362, + "step": 68975 + }, + { + "epoch": 4.686778094849844, + "grad_norm": 0.144982248544693, + "learning_rate": 4.1438204919146625e-05, + "loss": 3.9207, + "step": 68980 + }, + { + "epoch": 4.687117814920505, + "grad_norm": 0.18716265261173248, + "learning_rate": 4.143395841826335e-05, + "loss": 3.5398, + "step": 68985 + }, + { + "epoch": 4.687457534991167, + "grad_norm": 0.17122168838977814, + "learning_rate": 4.142971191738008e-05, + "loss": 3.8307, + "step": 68990 + }, + { + "epoch": 4.687797255061829, + "grad_norm": 0.23706969618797302, + "learning_rate": 4.14254654164968e-05, + "loss": 4.006, + "step": 68995 + }, + { + "epoch": 4.68813697513249, + "grad_norm": 0.22724047303199768, + "learning_rate": 4.142121891561354e-05, + "loss": 3.6819, + "step": 69000 + }, + { + "epoch": 4.6884766952031525, + "grad_norm": 0.18845215439796448, + "learning_rate": 4.1416972414730265e-05, + "loss": 4.0081, + "step": 69005 + }, + { + "epoch": 4.6888164152738145, + "grad_norm": 0.31157004833221436, + "learning_rate": 4.141272591384699e-05, + "loss": 3.9494, + "step": 69010 + }, + { + "epoch": 4.689156135344476, + "grad_norm": 0.2290927916765213, + "learning_rate": 4.140847941296372e-05, + "loss": 4.0869, + "step": 69015 + }, + { + "epoch": 4.689495855415138, + "grad_norm": 0.1970871239900589, + "learning_rate": 4.140423291208045e-05, + "loss": 3.8275, + "step": 69020 + }, + { + "epoch": 4.6898355754858, + "grad_norm": 0.22394989430904388, + "learning_rate": 4.139998641119718e-05, + "loss": 3.7963, + "step": 69025 + }, + { + "epoch": 4.690175295556461, + "grad_norm": 0.16817708313465118, + "learning_rate": 4.1395739910313905e-05, + "loss": 4.1638, + "step": 69030 + }, + { + "epoch": 4.690515015627123, + "grad_norm": 0.15104642510414124, + "learning_rate": 4.1391493409430633e-05, + "loss": 3.6713, + "step": 69035 + }, + { + "epoch": 4.690854735697785, + "grad_norm": 0.1783243864774704, + "learning_rate": 4.138724690854736e-05, + "loss": 3.7215, + "step": 69040 + }, + { + "epoch": 4.691194455768446, + "grad_norm": 0.24835321307182312, + "learning_rate": 4.138300040766408e-05, + "loss": 3.8175, + "step": 69045 + }, + { + "epoch": 4.6915341758391085, + "grad_norm": 0.21932777762413025, + "learning_rate": 4.137875390678082e-05, + "loss": 3.8894, + "step": 69050 + }, + { + "epoch": 4.6918738959097706, + "grad_norm": 0.1886502057313919, + "learning_rate": 4.1374507405897545e-05, + "loss": 3.8955, + "step": 69055 + }, + { + "epoch": 4.692213615980432, + "grad_norm": 0.21375896036624908, + "learning_rate": 4.137026090501427e-05, + "loss": 4.0959, + "step": 69060 + }, + { + "epoch": 4.692553336051094, + "grad_norm": 0.2200506031513214, + "learning_rate": 4.1366014404131e-05, + "loss": 3.8023, + "step": 69065 + }, + { + "epoch": 4.692893056121756, + "grad_norm": 0.17064562439918518, + "learning_rate": 4.136176790324773e-05, + "loss": 3.9648, + "step": 69070 + }, + { + "epoch": 4.693232776192417, + "grad_norm": 0.1845453828573227, + "learning_rate": 4.135752140236445e-05, + "loss": 4.1325, + "step": 69075 + }, + { + "epoch": 4.693572496263079, + "grad_norm": 0.14457561075687408, + "learning_rate": 4.135327490148118e-05, + "loss": 3.8567, + "step": 69080 + }, + { + "epoch": 4.693912216333741, + "grad_norm": 0.17372408509254456, + "learning_rate": 4.1349028400597913e-05, + "loss": 4.0674, + "step": 69085 + }, + { + "epoch": 4.694251936404402, + "grad_norm": 0.19677944481372833, + "learning_rate": 4.1344781899714635e-05, + "loss": 3.504, + "step": 69090 + }, + { + "epoch": 4.6945916564750645, + "grad_norm": 0.17962506413459778, + "learning_rate": 4.134053539883136e-05, + "loss": 3.9532, + "step": 69095 + }, + { + "epoch": 4.694931376545727, + "grad_norm": 0.2578170597553253, + "learning_rate": 4.13362888979481e-05, + "loss": 3.8129, + "step": 69100 + }, + { + "epoch": 4.695271096616388, + "grad_norm": 0.15059420466423035, + "learning_rate": 4.133204239706482e-05, + "loss": 3.982, + "step": 69105 + }, + { + "epoch": 4.69561081668705, + "grad_norm": 0.21051950752735138, + "learning_rate": 4.13286451963582e-05, + "loss": 3.8252, + "step": 69110 + }, + { + "epoch": 4.695950536757712, + "grad_norm": 0.18638600409030914, + "learning_rate": 4.132439869547493e-05, + "loss": 3.8863, + "step": 69115 + }, + { + "epoch": 4.696290256828373, + "grad_norm": 0.19402095675468445, + "learning_rate": 4.132015219459166e-05, + "loss": 4.0237, + "step": 69120 + }, + { + "epoch": 4.696629976899035, + "grad_norm": 0.1798579841852188, + "learning_rate": 4.1315905693708385e-05, + "loss": 3.8489, + "step": 69125 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.21458902955055237, + "learning_rate": 4.131165919282511e-05, + "loss": 3.7123, + "step": 69130 + }, + { + "epoch": 4.697309417040358, + "grad_norm": 0.154670849442482, + "learning_rate": 4.130741269194184e-05, + "loss": 3.8952, + "step": 69135 + }, + { + "epoch": 4.6976491371110205, + "grad_norm": 0.17300044000148773, + "learning_rate": 4.130316619105857e-05, + "loss": 3.739, + "step": 69140 + }, + { + "epoch": 4.697988857181683, + "grad_norm": 0.23539379239082336, + "learning_rate": 4.12989196901753e-05, + "loss": 3.8085, + "step": 69145 + }, + { + "epoch": 4.698328577252344, + "grad_norm": 0.15806522965431213, + "learning_rate": 4.1294673189292025e-05, + "loss": 3.8967, + "step": 69150 + }, + { + "epoch": 4.698668297323006, + "grad_norm": 0.19915927946567535, + "learning_rate": 4.129042668840875e-05, + "loss": 3.9584, + "step": 69155 + }, + { + "epoch": 4.699008017393668, + "grad_norm": 0.16972148418426514, + "learning_rate": 4.128618018752548e-05, + "loss": 3.9968, + "step": 69160 + }, + { + "epoch": 4.699347737464329, + "grad_norm": 0.16426129639148712, + "learning_rate": 4.128193368664221e-05, + "loss": 3.6815, + "step": 69165 + }, + { + "epoch": 4.699687457534991, + "grad_norm": 0.1827964186668396, + "learning_rate": 4.127768718575894e-05, + "loss": 3.7608, + "step": 69170 + }, + { + "epoch": 4.700027177605653, + "grad_norm": 0.17591789364814758, + "learning_rate": 4.1273440684875665e-05, + "loss": 3.8384, + "step": 69175 + }, + { + "epoch": 4.700366897676314, + "grad_norm": 0.22538793087005615, + "learning_rate": 4.126919418399239e-05, + "loss": 3.6981, + "step": 69180 + }, + { + "epoch": 4.7007066177469765, + "grad_norm": 0.19878025352954865, + "learning_rate": 4.1264947683109114e-05, + "loss": 3.7862, + "step": 69185 + }, + { + "epoch": 4.701046337817639, + "grad_norm": 0.19421455264091492, + "learning_rate": 4.126070118222585e-05, + "loss": 3.6836, + "step": 69190 + }, + { + "epoch": 4.7013860578883, + "grad_norm": 0.15080638229846954, + "learning_rate": 4.125645468134258e-05, + "loss": 3.9756, + "step": 69195 + }, + { + "epoch": 4.701725777958962, + "grad_norm": 0.16699759662151337, + "learning_rate": 4.12522081804593e-05, + "loss": 3.8499, + "step": 69200 + }, + { + "epoch": 4.702065498029624, + "grad_norm": 0.17238005995750427, + "learning_rate": 4.124796167957603e-05, + "loss": 3.8619, + "step": 69205 + }, + { + "epoch": 4.702405218100285, + "grad_norm": 0.18902797996997833, + "learning_rate": 4.124371517869276e-05, + "loss": 3.863, + "step": 69210 + }, + { + "epoch": 4.702744938170947, + "grad_norm": 0.2204180508852005, + "learning_rate": 4.123946867780949e-05, + "loss": 3.7888, + "step": 69215 + }, + { + "epoch": 4.703084658241609, + "grad_norm": 0.4899549186229706, + "learning_rate": 4.123522217692621e-05, + "loss": 3.9311, + "step": 69220 + }, + { + "epoch": 4.7034243783122704, + "grad_norm": 0.17617270350456238, + "learning_rate": 4.1230975676042945e-05, + "loss": 3.7843, + "step": 69225 + }, + { + "epoch": 4.7037640983829325, + "grad_norm": 0.13436172902584076, + "learning_rate": 4.122672917515967e-05, + "loss": 3.9666, + "step": 69230 + }, + { + "epoch": 4.704103818453595, + "grad_norm": 0.2940150201320648, + "learning_rate": 4.1222482674276395e-05, + "loss": 3.9044, + "step": 69235 + }, + { + "epoch": 4.704443538524256, + "grad_norm": 0.14469091594219208, + "learning_rate": 4.121823617339313e-05, + "loss": 3.8079, + "step": 69240 + }, + { + "epoch": 4.704783258594918, + "grad_norm": 0.17805533111095428, + "learning_rate": 4.121398967250986e-05, + "loss": 3.9598, + "step": 69245 + }, + { + "epoch": 4.70512297866558, + "grad_norm": 0.18061675131320953, + "learning_rate": 4.120974317162658e-05, + "loss": 3.8874, + "step": 69250 + }, + { + "epoch": 4.705462698736241, + "grad_norm": 0.2197183221578598, + "learning_rate": 4.1205496670743307e-05, + "loss": 4.0136, + "step": 69255 + }, + { + "epoch": 4.705802418806903, + "grad_norm": 0.21018420159816742, + "learning_rate": 4.120125016986004e-05, + "loss": 3.8368, + "step": 69260 + }, + { + "epoch": 4.706142138877565, + "grad_norm": 0.2447056770324707, + "learning_rate": 4.119700366897676e-05, + "loss": 3.6573, + "step": 69265 + }, + { + "epoch": 4.7064818589482265, + "grad_norm": 0.14191265404224396, + "learning_rate": 4.119275716809349e-05, + "loss": 3.8002, + "step": 69270 + }, + { + "epoch": 4.7068215790188885, + "grad_norm": 0.284763365983963, + "learning_rate": 4.1188510667210225e-05, + "loss": 3.644, + "step": 69275 + }, + { + "epoch": 4.707161299089551, + "grad_norm": 0.1641412377357483, + "learning_rate": 4.118426416632695e-05, + "loss": 3.6745, + "step": 69280 + }, + { + "epoch": 4.707501019160212, + "grad_norm": 0.19483138620853424, + "learning_rate": 4.1180017665443675e-05, + "loss": 3.8674, + "step": 69285 + }, + { + "epoch": 4.707840739230874, + "grad_norm": 0.18290047347545624, + "learning_rate": 4.117577116456041e-05, + "loss": 3.7756, + "step": 69290 + }, + { + "epoch": 4.708180459301536, + "grad_norm": 0.2891411781311035, + "learning_rate": 4.117152466367713e-05, + "loss": 3.7339, + "step": 69295 + }, + { + "epoch": 4.708520179372197, + "grad_norm": 0.1736142486333847, + "learning_rate": 4.116727816279386e-05, + "loss": 4.016, + "step": 69300 + }, + { + "epoch": 4.708859899442859, + "grad_norm": 0.16773273050785065, + "learning_rate": 4.116303166191059e-05, + "loss": 3.9622, + "step": 69305 + }, + { + "epoch": 4.709199619513521, + "grad_norm": 0.20583973824977875, + "learning_rate": 4.1158785161027315e-05, + "loss": 3.9889, + "step": 69310 + }, + { + "epoch": 4.7095393395841825, + "grad_norm": 0.1305687129497528, + "learning_rate": 4.115453866014404e-05, + "loss": 3.8567, + "step": 69315 + }, + { + "epoch": 4.7098790596548445, + "grad_norm": 0.3400142788887024, + "learning_rate": 4.115029215926077e-05, + "loss": 3.8595, + "step": 69320 + }, + { + "epoch": 4.710218779725507, + "grad_norm": 0.15412600338459015, + "learning_rate": 4.11460456583775e-05, + "loss": 3.8962, + "step": 69325 + }, + { + "epoch": 4.710558499796168, + "grad_norm": 0.16530482470989227, + "learning_rate": 4.114179915749423e-05, + "loss": 3.8934, + "step": 69330 + }, + { + "epoch": 4.71089821986683, + "grad_norm": 0.16969513893127441, + "learning_rate": 4.1137552656610955e-05, + "loss": 3.5709, + "step": 69335 + }, + { + "epoch": 4.711237939937492, + "grad_norm": 0.15744464099407196, + "learning_rate": 4.113330615572768e-05, + "loss": 4.164, + "step": 69340 + }, + { + "epoch": 4.711577660008153, + "grad_norm": 1.691825032234192, + "learning_rate": 4.112905965484441e-05, + "loss": 3.9242, + "step": 69345 + }, + { + "epoch": 4.711917380078815, + "grad_norm": 0.2085082083940506, + "learning_rate": 4.112481315396114e-05, + "loss": 3.7842, + "step": 69350 + }, + { + "epoch": 4.712257100149477, + "grad_norm": 0.23048847913742065, + "learning_rate": 4.112056665307786e-05, + "loss": 3.6893, + "step": 69355 + }, + { + "epoch": 4.7125968202201385, + "grad_norm": 0.15444685518741608, + "learning_rate": 4.1116320152194595e-05, + "loss": 3.8615, + "step": 69360 + }, + { + "epoch": 4.712936540290801, + "grad_norm": 0.13372081518173218, + "learning_rate": 4.111207365131132e-05, + "loss": 4.0665, + "step": 69365 + }, + { + "epoch": 4.713276260361463, + "grad_norm": 0.20089714229106903, + "learning_rate": 4.1107827150428044e-05, + "loss": 3.7269, + "step": 69370 + }, + { + "epoch": 4.713615980432124, + "grad_norm": 0.3148059546947479, + "learning_rate": 4.110358064954478e-05, + "loss": 3.7942, + "step": 69375 + }, + { + "epoch": 4.713955700502786, + "grad_norm": 0.2796807885169983, + "learning_rate": 4.109933414866151e-05, + "loss": 3.6944, + "step": 69380 + }, + { + "epoch": 4.714295420573448, + "grad_norm": 0.15527468919754028, + "learning_rate": 4.1095087647778235e-05, + "loss": 3.6775, + "step": 69385 + }, + { + "epoch": 4.714635140644109, + "grad_norm": 0.16474659740924835, + "learning_rate": 4.109084114689496e-05, + "loss": 3.7216, + "step": 69390 + }, + { + "epoch": 4.714974860714771, + "grad_norm": 0.17870217561721802, + "learning_rate": 4.108659464601169e-05, + "loss": 3.936, + "step": 69395 + }, + { + "epoch": 4.715314580785432, + "grad_norm": 0.17638325691223145, + "learning_rate": 4.108234814512842e-05, + "loss": 3.7944, + "step": 69400 + }, + { + "epoch": 4.7156543008560945, + "grad_norm": 0.2378946989774704, + "learning_rate": 4.107810164424514e-05, + "loss": 3.8994, + "step": 69405 + }, + { + "epoch": 4.715994020926757, + "grad_norm": 0.35973191261291504, + "learning_rate": 4.1073855143361875e-05, + "loss": 3.9946, + "step": 69410 + }, + { + "epoch": 4.716333740997418, + "grad_norm": 0.22645257413387299, + "learning_rate": 4.10696086424786e-05, + "loss": 3.7015, + "step": 69415 + }, + { + "epoch": 4.71667346106808, + "grad_norm": 0.2059125304222107, + "learning_rate": 4.1065362141595324e-05, + "loss": 3.658, + "step": 69420 + }, + { + "epoch": 4.717013181138742, + "grad_norm": 0.166192427277565, + "learning_rate": 4.106111564071206e-05, + "loss": 3.9486, + "step": 69425 + }, + { + "epoch": 4.717352901209403, + "grad_norm": 0.15830735862255096, + "learning_rate": 4.105686913982879e-05, + "loss": 3.7581, + "step": 69430 + }, + { + "epoch": 4.717692621280065, + "grad_norm": 0.13799981772899628, + "learning_rate": 4.105262263894551e-05, + "loss": 3.8644, + "step": 69435 + }, + { + "epoch": 4.718032341350727, + "grad_norm": 0.15006180107593536, + "learning_rate": 4.1048376138062236e-05, + "loss": 3.7654, + "step": 69440 + }, + { + "epoch": 4.718372061421388, + "grad_norm": 0.23511913418769836, + "learning_rate": 4.104412963717897e-05, + "loss": 3.8173, + "step": 69445 + }, + { + "epoch": 4.7187117814920505, + "grad_norm": 0.15157245099544525, + "learning_rate": 4.103988313629569e-05, + "loss": 3.8553, + "step": 69450 + }, + { + "epoch": 4.719051501562713, + "grad_norm": 0.17323000729084015, + "learning_rate": 4.103563663541242e-05, + "loss": 3.526, + "step": 69455 + }, + { + "epoch": 4.719391221633374, + "grad_norm": 0.18783295154571533, + "learning_rate": 4.1031390134529155e-05, + "loss": 3.7776, + "step": 69460 + }, + { + "epoch": 4.719730941704036, + "grad_norm": 0.17829188704490662, + "learning_rate": 4.1027143633645876e-05, + "loss": 3.9574, + "step": 69465 + }, + { + "epoch": 4.720070661774698, + "grad_norm": 1.978267788887024, + "learning_rate": 4.1022897132762604e-05, + "loss": 3.9175, + "step": 69470 + }, + { + "epoch": 4.720410381845359, + "grad_norm": 0.19898955523967743, + "learning_rate": 4.101865063187933e-05, + "loss": 3.7326, + "step": 69475 + }, + { + "epoch": 4.720750101916021, + "grad_norm": 0.1506434977054596, + "learning_rate": 4.101440413099606e-05, + "loss": 3.7467, + "step": 69480 + }, + { + "epoch": 4.721089821986683, + "grad_norm": 0.1947958767414093, + "learning_rate": 4.101015763011279e-05, + "loss": 3.538, + "step": 69485 + }, + { + "epoch": 4.721429542057344, + "grad_norm": 0.1462666392326355, + "learning_rate": 4.1005911129229516e-05, + "loss": 3.8386, + "step": 69490 + }, + { + "epoch": 4.7217692621280065, + "grad_norm": 0.23556935787200928, + "learning_rate": 4.1001664628346244e-05, + "loss": 4.0248, + "step": 69495 + }, + { + "epoch": 4.722108982198669, + "grad_norm": 0.1764063835144043, + "learning_rate": 4.099741812746297e-05, + "loss": 4.0288, + "step": 69500 + }, + { + "epoch": 4.72244870226933, + "grad_norm": 0.15105944871902466, + "learning_rate": 4.09931716265797e-05, + "loss": 4.0196, + "step": 69505 + }, + { + "epoch": 4.722788422339992, + "grad_norm": 0.18123696744441986, + "learning_rate": 4.098892512569643e-05, + "loss": 3.8543, + "step": 69510 + }, + { + "epoch": 4.723128142410654, + "grad_norm": 0.1824360489845276, + "learning_rate": 4.0984678624813156e-05, + "loss": 3.814, + "step": 69515 + }, + { + "epoch": 4.723467862481315, + "grad_norm": 1.2512435913085938, + "learning_rate": 4.0980432123929884e-05, + "loss": 3.8399, + "step": 69520 + }, + { + "epoch": 4.723807582551977, + "grad_norm": 0.14935144782066345, + "learning_rate": 4.097618562304661e-05, + "loss": 3.7841, + "step": 69525 + }, + { + "epoch": 4.724147302622639, + "grad_norm": 0.18348996341228485, + "learning_rate": 4.097193912216334e-05, + "loss": 3.8398, + "step": 69530 + }, + { + "epoch": 4.7244870226933005, + "grad_norm": 0.4618473947048187, + "learning_rate": 4.096769262128007e-05, + "loss": 3.6894, + "step": 69535 + }, + { + "epoch": 4.7248267427639625, + "grad_norm": 0.1990317851305008, + "learning_rate": 4.096344612039679e-05, + "loss": 3.8028, + "step": 69540 + }, + { + "epoch": 4.725166462834625, + "grad_norm": 0.17833879590034485, + "learning_rate": 4.0959199619513524e-05, + "loss": 3.9001, + "step": 69545 + }, + { + "epoch": 4.725506182905286, + "grad_norm": 0.25276055932044983, + "learning_rate": 4.095495311863025e-05, + "loss": 4.2191, + "step": 69550 + }, + { + "epoch": 4.725845902975948, + "grad_norm": 0.1616729199886322, + "learning_rate": 4.095070661774698e-05, + "loss": 3.5922, + "step": 69555 + }, + { + "epoch": 4.72618562304661, + "grad_norm": 0.18881896138191223, + "learning_rate": 4.094646011686371e-05, + "loss": 3.6686, + "step": 69560 + }, + { + "epoch": 4.726525343117271, + "grad_norm": 0.31957945227622986, + "learning_rate": 4.0942213615980436e-05, + "loss": 3.8781, + "step": 69565 + }, + { + "epoch": 4.726865063187933, + "grad_norm": 0.16654276847839355, + "learning_rate": 4.0937967115097164e-05, + "loss": 3.9193, + "step": 69570 + }, + { + "epoch": 4.727204783258595, + "grad_norm": 0.3706800043582916, + "learning_rate": 4.0933720614213886e-05, + "loss": 3.6709, + "step": 69575 + }, + { + "epoch": 4.7275445033292565, + "grad_norm": 0.7701665759086609, + "learning_rate": 4.092947411333062e-05, + "loss": 3.8012, + "step": 69580 + }, + { + "epoch": 4.7278842233999185, + "grad_norm": 0.19634996354579926, + "learning_rate": 4.092522761244735e-05, + "loss": 3.8037, + "step": 69585 + }, + { + "epoch": 4.728223943470581, + "grad_norm": 0.18045006692409515, + "learning_rate": 4.092098111156407e-05, + "loss": 3.9607, + "step": 69590 + }, + { + "epoch": 4.728563663541242, + "grad_norm": 0.16828277707099915, + "learning_rate": 4.0916734610680804e-05, + "loss": 3.9053, + "step": 69595 + }, + { + "epoch": 4.728903383611904, + "grad_norm": 0.16096670925617218, + "learning_rate": 4.091248810979753e-05, + "loss": 3.7437, + "step": 69600 + }, + { + "epoch": 4.729243103682566, + "grad_norm": 1.1955251693725586, + "learning_rate": 4.0908241608914254e-05, + "loss": 3.8053, + "step": 69605 + }, + { + "epoch": 4.729582823753227, + "grad_norm": 0.16202643513679504, + "learning_rate": 4.090399510803098e-05, + "loss": 4.0149, + "step": 69610 + }, + { + "epoch": 4.729922543823889, + "grad_norm": 0.21476508677005768, + "learning_rate": 4.0899748607147716e-05, + "loss": 3.6605, + "step": 69615 + }, + { + "epoch": 4.73026226389455, + "grad_norm": 0.31852227449417114, + "learning_rate": 4.089550210626444e-05, + "loss": 3.7557, + "step": 69620 + }, + { + "epoch": 4.7306019839652125, + "grad_norm": 0.19120609760284424, + "learning_rate": 4.0891255605381166e-05, + "loss": 3.9486, + "step": 69625 + }, + { + "epoch": 4.7309417040358746, + "grad_norm": 0.17255400121212006, + "learning_rate": 4.08870091044979e-05, + "loss": 3.9631, + "step": 69630 + }, + { + "epoch": 4.731281424106536, + "grad_norm": 0.1756787896156311, + "learning_rate": 4.088276260361462e-05, + "loss": 3.7773, + "step": 69635 + }, + { + "epoch": 4.731621144177198, + "grad_norm": 0.25166985392570496, + "learning_rate": 4.087851610273135e-05, + "loss": 3.9887, + "step": 69640 + }, + { + "epoch": 4.73196086424786, + "grad_norm": 0.2576724886894226, + "learning_rate": 4.087426960184808e-05, + "loss": 3.8013, + "step": 69645 + }, + { + "epoch": 4.732300584318521, + "grad_norm": 0.16472159326076508, + "learning_rate": 4.0870023100964806e-05, + "loss": 3.8582, + "step": 69650 + }, + { + "epoch": 4.732640304389183, + "grad_norm": 0.14868058264255524, + "learning_rate": 4.0865776600081534e-05, + "loss": 3.9008, + "step": 69655 + }, + { + "epoch": 4.732980024459845, + "grad_norm": 0.1692001223564148, + "learning_rate": 4.086153009919826e-05, + "loss": 3.7887, + "step": 69660 + }, + { + "epoch": 4.733319744530506, + "grad_norm": 0.17286135256290436, + "learning_rate": 4.085728359831499e-05, + "loss": 3.5849, + "step": 69665 + }, + { + "epoch": 4.7336594646011685, + "grad_norm": 0.17861060798168182, + "learning_rate": 4.085303709743172e-05, + "loss": 3.896, + "step": 69670 + }, + { + "epoch": 4.733999184671831, + "grad_norm": 0.19478176534175873, + "learning_rate": 4.0848790596548446e-05, + "loss": 3.8733, + "step": 69675 + }, + { + "epoch": 4.734338904742492, + "grad_norm": 0.16372986137866974, + "learning_rate": 4.0844544095665174e-05, + "loss": 3.8863, + "step": 69680 + }, + { + "epoch": 4.734678624813154, + "grad_norm": 0.156174436211586, + "learning_rate": 4.08402975947819e-05, + "loss": 3.7821, + "step": 69685 + }, + { + "epoch": 4.735018344883816, + "grad_norm": 0.15807633101940155, + "learning_rate": 4.083605109389863e-05, + "loss": 3.9932, + "step": 69690 + }, + { + "epoch": 4.735358064954477, + "grad_norm": 0.16950355470180511, + "learning_rate": 4.083180459301536e-05, + "loss": 3.7357, + "step": 69695 + }, + { + "epoch": 4.735697785025139, + "grad_norm": 0.424819678068161, + "learning_rate": 4.0827558092132086e-05, + "loss": 3.6844, + "step": 69700 + }, + { + "epoch": 4.736037505095801, + "grad_norm": 0.17918434739112854, + "learning_rate": 4.0823311591248814e-05, + "loss": 3.6596, + "step": 69705 + }, + { + "epoch": 4.736377225166462, + "grad_norm": 0.6783207654953003, + "learning_rate": 4.0819065090365535e-05, + "loss": 3.9667, + "step": 69710 + }, + { + "epoch": 4.7367169452371245, + "grad_norm": 0.17301073670387268, + "learning_rate": 4.081481858948227e-05, + "loss": 4.0757, + "step": 69715 + }, + { + "epoch": 4.737056665307787, + "grad_norm": 0.17099791765213013, + "learning_rate": 4.0810572088599e-05, + "loss": 3.5899, + "step": 69720 + }, + { + "epoch": 4.737396385378448, + "grad_norm": 0.16362299025058746, + "learning_rate": 4.0806325587715726e-05, + "loss": 4.0764, + "step": 69725 + }, + { + "epoch": 4.73773610544911, + "grad_norm": 0.4556816816329956, + "learning_rate": 4.0802079086832454e-05, + "loss": 3.8372, + "step": 69730 + }, + { + "epoch": 4.738075825519772, + "grad_norm": 0.17399826645851135, + "learning_rate": 4.079783258594918e-05, + "loss": 3.7186, + "step": 69735 + }, + { + "epoch": 4.738415545590433, + "grad_norm": 0.16243551671504974, + "learning_rate": 4.079358608506591e-05, + "loss": 4.0517, + "step": 69740 + }, + { + "epoch": 4.738755265661095, + "grad_norm": 0.24760769307613373, + "learning_rate": 4.078933958418263e-05, + "loss": 3.8783, + "step": 69745 + }, + { + "epoch": 4.739094985731757, + "grad_norm": 0.15923722088336945, + "learning_rate": 4.0785093083299366e-05, + "loss": 4.0326, + "step": 69750 + }, + { + "epoch": 4.739434705802418, + "grad_norm": 0.2309940606355667, + "learning_rate": 4.0780846582416094e-05, + "loss": 4.0727, + "step": 69755 + }, + { + "epoch": 4.7397744258730805, + "grad_norm": NaN, + "learning_rate": 4.0777449381709476e-05, + "loss": 3.8926, + "step": 69760 + }, + { + "epoch": 4.740114145943743, + "grad_norm": 0.15402713418006897, + "learning_rate": 4.07732028808262e-05, + "loss": 3.935, + "step": 69765 + }, + { + "epoch": 4.740453866014404, + "grad_norm": 0.14560341835021973, + "learning_rate": 4.076895637994293e-05, + "loss": 3.695, + "step": 69770 + }, + { + "epoch": 4.740793586085066, + "grad_norm": 0.20930422842502594, + "learning_rate": 4.076470987905966e-05, + "loss": 3.7512, + "step": 69775 + }, + { + "epoch": 4.741133306155728, + "grad_norm": 0.23412485420703888, + "learning_rate": 4.076046337817638e-05, + "loss": 3.8312, + "step": 69780 + }, + { + "epoch": 4.741473026226389, + "grad_norm": 0.2179725021123886, + "learning_rate": 4.0756216877293116e-05, + "loss": 3.8572, + "step": 69785 + }, + { + "epoch": 4.741812746297051, + "grad_norm": 0.20012648403644562, + "learning_rate": 4.0751970376409844e-05, + "loss": 3.8211, + "step": 69790 + }, + { + "epoch": 4.742152466367713, + "grad_norm": 0.1763702929019928, + "learning_rate": 4.0747723875526565e-05, + "loss": 3.9961, + "step": 69795 + }, + { + "epoch": 4.7424921864383744, + "grad_norm": 1.2338171005249023, + "learning_rate": 4.0743477374643293e-05, + "loss": 3.7468, + "step": 69800 + }, + { + "epoch": 4.7428319065090365, + "grad_norm": 0.16476747393608093, + "learning_rate": 4.073923087376003e-05, + "loss": 3.736, + "step": 69805 + }, + { + "epoch": 4.743171626579699, + "grad_norm": 0.24632465839385986, + "learning_rate": 4.073498437287675e-05, + "loss": 3.8137, + "step": 69810 + }, + { + "epoch": 4.74351134665036, + "grad_norm": 0.2062283158302307, + "learning_rate": 4.073073787199348e-05, + "loss": 3.5054, + "step": 69815 + }, + { + "epoch": 4.743851066721022, + "grad_norm": 0.1725446879863739, + "learning_rate": 4.072649137111021e-05, + "loss": 3.8308, + "step": 69820 + }, + { + "epoch": 4.744190786791684, + "grad_norm": 0.164071723818779, + "learning_rate": 4.0722244870226934e-05, + "loss": 3.84, + "step": 69825 + }, + { + "epoch": 4.744530506862345, + "grad_norm": 0.8423631191253662, + "learning_rate": 4.071799836934366e-05, + "loss": 3.8655, + "step": 69830 + }, + { + "epoch": 4.744870226933007, + "grad_norm": 0.30336758494377136, + "learning_rate": 4.071375186846039e-05, + "loss": 4.0803, + "step": 69835 + }, + { + "epoch": 4.745209947003669, + "grad_norm": 0.18273644149303436, + "learning_rate": 4.070950536757712e-05, + "loss": 3.8099, + "step": 69840 + }, + { + "epoch": 4.7455496670743305, + "grad_norm": 0.19877342879772186, + "learning_rate": 4.0705258866693846e-05, + "loss": 3.7705, + "step": 69845 + }, + { + "epoch": 4.7458893871449925, + "grad_norm": 0.1690056324005127, + "learning_rate": 4.0701012365810574e-05, + "loss": 4.0475, + "step": 69850 + }, + { + "epoch": 4.746229107215655, + "grad_norm": 0.18984735012054443, + "learning_rate": 4.06967658649273e-05, + "loss": 4.1258, + "step": 69855 + }, + { + "epoch": 4.746568827286316, + "grad_norm": 0.6469388008117676, + "learning_rate": 4.069251936404403e-05, + "loss": 3.4505, + "step": 69860 + }, + { + "epoch": 4.746908547356978, + "grad_norm": 0.17644253373146057, + "learning_rate": 4.068827286316076e-05, + "loss": 3.9331, + "step": 69865 + }, + { + "epoch": 4.74724826742764, + "grad_norm": 0.2042463719844818, + "learning_rate": 4.0684026362277486e-05, + "loss": 3.6338, + "step": 69870 + }, + { + "epoch": 4.747587987498301, + "grad_norm": 0.20460852980613708, + "learning_rate": 4.0679779861394214e-05, + "loss": 3.8882, + "step": 69875 + }, + { + "epoch": 4.747927707568963, + "grad_norm": 0.1906699538230896, + "learning_rate": 4.067553336051094e-05, + "loss": 4.0, + "step": 69880 + }, + { + "epoch": 4.748267427639625, + "grad_norm": 0.15022976696491241, + "learning_rate": 4.067128685962767e-05, + "loss": 3.8826, + "step": 69885 + }, + { + "epoch": 4.7486071477102865, + "grad_norm": 0.17956718802452087, + "learning_rate": 4.06670403587444e-05, + "loss": 3.9894, + "step": 69890 + }, + { + "epoch": 4.7489468677809485, + "grad_norm": 0.4011254906654358, + "learning_rate": 4.0662793857861126e-05, + "loss": 3.9898, + "step": 69895 + }, + { + "epoch": 4.749286587851611, + "grad_norm": 0.18429632484912872, + "learning_rate": 4.065854735697785e-05, + "loss": 3.9162, + "step": 69900 + }, + { + "epoch": 4.749626307922272, + "grad_norm": 0.18880563974380493, + "learning_rate": 4.065430085609458e-05, + "loss": 3.8665, + "step": 69905 + }, + { + "epoch": 4.749966027992934, + "grad_norm": 0.1594928354024887, + "learning_rate": 4.065005435521131e-05, + "loss": 3.8737, + "step": 69910 + }, + { + "epoch": 4.750305748063596, + "grad_norm": 0.1408892273902893, + "learning_rate": 4.064580785432803e-05, + "loss": 3.9353, + "step": 69915 + }, + { + "epoch": 4.750645468134257, + "grad_norm": 0.19530045986175537, + "learning_rate": 4.0641561353444766e-05, + "loss": 3.8789, + "step": 69920 + }, + { + "epoch": 4.750985188204919, + "grad_norm": 0.16247494518756866, + "learning_rate": 4.0637314852561494e-05, + "loss": 3.7065, + "step": 69925 + }, + { + "epoch": 4.751324908275581, + "grad_norm": 1.183523178100586, + "learning_rate": 4.063306835167822e-05, + "loss": 3.8645, + "step": 69930 + }, + { + "epoch": 4.7516646283462425, + "grad_norm": 0.1833140254020691, + "learning_rate": 4.062882185079494e-05, + "loss": 3.5089, + "step": 69935 + }, + { + "epoch": 4.7520043484169046, + "grad_norm": 0.2577720880508423, + "learning_rate": 4.062457534991168e-05, + "loss": 3.8681, + "step": 69940 + }, + { + "epoch": 4.752344068487567, + "grad_norm": 0.1840575784444809, + "learning_rate": 4.0620328849028406e-05, + "loss": 4.1098, + "step": 69945 + }, + { + "epoch": 4.752683788558228, + "grad_norm": 0.18959972262382507, + "learning_rate": 4.061608234814513e-05, + "loss": 3.7145, + "step": 69950 + }, + { + "epoch": 4.75302350862889, + "grad_norm": 0.3657325506210327, + "learning_rate": 4.061183584726186e-05, + "loss": 3.9191, + "step": 69955 + }, + { + "epoch": 4.753363228699552, + "grad_norm": 0.19438382983207703, + "learning_rate": 4.060758934637859e-05, + "loss": 3.9185, + "step": 69960 + }, + { + "epoch": 4.753702948770213, + "grad_norm": 0.16566719114780426, + "learning_rate": 4.060334284549531e-05, + "loss": 3.9646, + "step": 69965 + }, + { + "epoch": 4.754042668840875, + "grad_norm": 0.16172707080841064, + "learning_rate": 4.059909634461204e-05, + "loss": 3.8689, + "step": 69970 + }, + { + "epoch": 4.754382388911537, + "grad_norm": 0.20658664405345917, + "learning_rate": 4.0594849843728774e-05, + "loss": 3.6, + "step": 69975 + }, + { + "epoch": 4.7547221089821985, + "grad_norm": 0.1913328617811203, + "learning_rate": 4.0590603342845495e-05, + "loss": 3.8182, + "step": 69980 + }, + { + "epoch": 4.755061829052861, + "grad_norm": 0.23965522646903992, + "learning_rate": 4.058635684196222e-05, + "loss": 3.8171, + "step": 69985 + }, + { + "epoch": 4.755401549123523, + "grad_norm": 0.18489758670330048, + "learning_rate": 4.058211034107896e-05, + "loss": 3.7673, + "step": 69990 + }, + { + "epoch": 4.755741269194184, + "grad_norm": 0.21190279722213745, + "learning_rate": 4.057786384019568e-05, + "loss": 3.7581, + "step": 69995 + }, + { + "epoch": 4.756080989264846, + "grad_norm": 0.2444145530462265, + "learning_rate": 4.057361733931241e-05, + "loss": 4.0498, + "step": 70000 + }, + { + "epoch": 4.756420709335508, + "grad_norm": 0.1906590610742569, + "learning_rate": 4.0569370838429135e-05, + "loss": 4.0763, + "step": 70005 + }, + { + "epoch": 4.756760429406169, + "grad_norm": 0.17800237238407135, + "learning_rate": 4.056512433754586e-05, + "loss": 3.8029, + "step": 70010 + }, + { + "epoch": 4.757100149476831, + "grad_norm": 0.18547658622264862, + "learning_rate": 4.056087783666259e-05, + "loss": 3.8002, + "step": 70015 + }, + { + "epoch": 4.757439869547493, + "grad_norm": 0.15208452939987183, + "learning_rate": 4.055663133577932e-05, + "loss": 3.7482, + "step": 70020 + }, + { + "epoch": 4.7577795896181545, + "grad_norm": 0.3495774567127228, + "learning_rate": 4.055238483489605e-05, + "loss": 3.8196, + "step": 70025 + }, + { + "epoch": 4.758119309688817, + "grad_norm": 0.18723821640014648, + "learning_rate": 4.0548138334012775e-05, + "loss": 3.6882, + "step": 70030 + }, + { + "epoch": 4.758459029759479, + "grad_norm": 0.18608799576759338, + "learning_rate": 4.05438918331295e-05, + "loss": 4.0643, + "step": 70035 + }, + { + "epoch": 4.75879874983014, + "grad_norm": 0.13979026675224304, + "learning_rate": 4.053964533224623e-05, + "loss": 3.862, + "step": 70040 + }, + { + "epoch": 4.759138469900802, + "grad_norm": 0.17282691597938538, + "learning_rate": 4.053539883136296e-05, + "loss": 3.9806, + "step": 70045 + }, + { + "epoch": 4.759478189971464, + "grad_norm": 0.13941727578639984, + "learning_rate": 4.053115233047969e-05, + "loss": 3.4337, + "step": 70050 + }, + { + "epoch": 4.759817910042125, + "grad_norm": 0.16799797117710114, + "learning_rate": 4.0526905829596415e-05, + "loss": 3.9052, + "step": 70055 + }, + { + "epoch": 4.760157630112787, + "grad_norm": 0.1636986881494522, + "learning_rate": 4.052265932871314e-05, + "loss": 3.6665, + "step": 70060 + }, + { + "epoch": 4.760497350183449, + "grad_norm": 0.301016628742218, + "learning_rate": 4.051841282782987e-05, + "loss": 3.6937, + "step": 70065 + }, + { + "epoch": 4.7608370702541105, + "grad_norm": 0.19389694929122925, + "learning_rate": 4.051416632694659e-05, + "loss": 4.0233, + "step": 70070 + }, + { + "epoch": 4.761176790324773, + "grad_norm": 0.20152461528778076, + "learning_rate": 4.050991982606333e-05, + "loss": 3.7118, + "step": 70075 + }, + { + "epoch": 4.761516510395434, + "grad_norm": 0.16900832951068878, + "learning_rate": 4.0505673325180055e-05, + "loss": 3.9523, + "step": 70080 + }, + { + "epoch": 4.761856230466096, + "grad_norm": 0.1718425452709198, + "learning_rate": 4.0501426824296776e-05, + "loss": 3.8291, + "step": 70085 + }, + { + "epoch": 4.762195950536758, + "grad_norm": 0.15219321846961975, + "learning_rate": 4.049718032341351e-05, + "loss": 3.4584, + "step": 70090 + }, + { + "epoch": 4.762535670607419, + "grad_norm": 0.18168671429157257, + "learning_rate": 4.049293382253024e-05, + "loss": 3.7778, + "step": 70095 + }, + { + "epoch": 4.762875390678081, + "grad_norm": 0.1786816567182541, + "learning_rate": 4.048868732164697e-05, + "loss": 3.874, + "step": 70100 + }, + { + "epoch": 4.763215110748743, + "grad_norm": 0.17988331615924835, + "learning_rate": 4.048444082076369e-05, + "loss": 3.7645, + "step": 70105 + }, + { + "epoch": 4.7635548308194045, + "grad_norm": 0.16081885993480682, + "learning_rate": 4.048019431988042e-05, + "loss": 3.5704, + "step": 70110 + }, + { + "epoch": 4.7638945508900665, + "grad_norm": 0.19640228152275085, + "learning_rate": 4.047594781899715e-05, + "loss": 3.7127, + "step": 70115 + }, + { + "epoch": 4.764234270960729, + "grad_norm": 0.1826646476984024, + "learning_rate": 4.047170131811387e-05, + "loss": 3.9423, + "step": 70120 + }, + { + "epoch": 4.76457399103139, + "grad_norm": 0.14988286793231964, + "learning_rate": 4.046745481723061e-05, + "loss": 3.9324, + "step": 70125 + }, + { + "epoch": 4.764913711102052, + "grad_norm": 0.18272116780281067, + "learning_rate": 4.0463208316347335e-05, + "loss": 3.8105, + "step": 70130 + }, + { + "epoch": 4.765253431172714, + "grad_norm": 0.14168314635753632, + "learning_rate": 4.0458961815464056e-05, + "loss": 3.9333, + "step": 70135 + }, + { + "epoch": 4.765593151243375, + "grad_norm": 0.16271282732486725, + "learning_rate": 4.0454715314580784e-05, + "loss": 3.9069, + "step": 70140 + }, + { + "epoch": 4.765932871314037, + "grad_norm": 0.2875271141529083, + "learning_rate": 4.045046881369752e-05, + "loss": 3.827, + "step": 70145 + }, + { + "epoch": 4.766272591384699, + "grad_norm": 0.2442161738872528, + "learning_rate": 4.044622231281424e-05, + "loss": 3.5783, + "step": 70150 + }, + { + "epoch": 4.7666123114553605, + "grad_norm": 0.15008407831192017, + "learning_rate": 4.044197581193097e-05, + "loss": 3.7357, + "step": 70155 + }, + { + "epoch": 4.7669520315260225, + "grad_norm": 0.19489526748657227, + "learning_rate": 4.04377293110477e-05, + "loss": 3.951, + "step": 70160 + }, + { + "epoch": 4.767291751596685, + "grad_norm": 0.1656467467546463, + "learning_rate": 4.0433482810164425e-05, + "loss": 3.8675, + "step": 70165 + }, + { + "epoch": 4.767631471667346, + "grad_norm": 0.21791242063045502, + "learning_rate": 4.042923630928115e-05, + "loss": 3.9217, + "step": 70170 + }, + { + "epoch": 4.767971191738008, + "grad_norm": 0.15061452984809875, + "learning_rate": 4.042498980839789e-05, + "loss": 3.876, + "step": 70175 + }, + { + "epoch": 4.76831091180867, + "grad_norm": 0.17571209371089935, + "learning_rate": 4.042074330751461e-05, + "loss": 4.0121, + "step": 70180 + }, + { + "epoch": 4.768650631879331, + "grad_norm": 0.16216976940631866, + "learning_rate": 4.0416496806631337e-05, + "loss": 3.863, + "step": 70185 + }, + { + "epoch": 4.768990351949993, + "grad_norm": 0.17098921537399292, + "learning_rate": 4.0412250305748065e-05, + "loss": 3.9332, + "step": 70190 + }, + { + "epoch": 4.769330072020655, + "grad_norm": 0.16320253908634186, + "learning_rate": 4.040800380486479e-05, + "loss": 3.8068, + "step": 70195 + }, + { + "epoch": 4.7696697920913165, + "grad_norm": 0.16424715518951416, + "learning_rate": 4.040375730398152e-05, + "loss": 3.7862, + "step": 70200 + }, + { + "epoch": 4.7700095121619785, + "grad_norm": 0.14779143035411835, + "learning_rate": 4.039951080309825e-05, + "loss": 3.8429, + "step": 70205 + }, + { + "epoch": 4.770349232232641, + "grad_norm": 0.1453419327735901, + "learning_rate": 4.0395264302214977e-05, + "loss": 3.8102, + "step": 70210 + }, + { + "epoch": 4.770688952303302, + "grad_norm": 0.20256055891513824, + "learning_rate": 4.0391017801331705e-05, + "loss": 3.7606, + "step": 70215 + }, + { + "epoch": 4.771028672373964, + "grad_norm": 0.1886395961046219, + "learning_rate": 4.038677130044843e-05, + "loss": 3.8624, + "step": 70220 + }, + { + "epoch": 4.771368392444626, + "grad_norm": 0.31019240617752075, + "learning_rate": 4.038252479956516e-05, + "loss": 3.7973, + "step": 70225 + }, + { + "epoch": 4.771708112515287, + "grad_norm": 0.16573776304721832, + "learning_rate": 4.037827829868189e-05, + "loss": 3.893, + "step": 70230 + }, + { + "epoch": 4.772047832585949, + "grad_norm": 0.21324880421161652, + "learning_rate": 4.037403179779862e-05, + "loss": 3.6409, + "step": 70235 + }, + { + "epoch": 4.772387552656611, + "grad_norm": 1.1343713998794556, + "learning_rate": 4.036978529691534e-05, + "loss": 3.7157, + "step": 70240 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.17512373626232147, + "learning_rate": 4.036638809620873e-05, + "loss": 3.7586, + "step": 70245 + }, + { + "epoch": 4.773066992797935, + "grad_norm": 0.15631107985973358, + "learning_rate": 4.0362141595325455e-05, + "loss": 3.8288, + "step": 70250 + }, + { + "epoch": 4.773406712868597, + "grad_norm": 0.15690039098262787, + "learning_rate": 4.035789509444218e-05, + "loss": 3.8763, + "step": 70255 + }, + { + "epoch": 4.773746432939258, + "grad_norm": 0.18320022523403168, + "learning_rate": 4.0353648593558904e-05, + "loss": 3.8824, + "step": 70260 + }, + { + "epoch": 4.77408615300992, + "grad_norm": 0.22214210033416748, + "learning_rate": 4.034940209267564e-05, + "loss": 3.7715, + "step": 70265 + }, + { + "epoch": 4.774425873080582, + "grad_norm": 0.27380889654159546, + "learning_rate": 4.034515559179237e-05, + "loss": 3.9644, + "step": 70270 + }, + { + "epoch": 4.774765593151243, + "grad_norm": 0.14206765592098236, + "learning_rate": 4.034090909090909e-05, + "loss": 4.0155, + "step": 70275 + }, + { + "epoch": 4.775105313221905, + "grad_norm": 0.16638942062854767, + "learning_rate": 4.033666259002582e-05, + "loss": 3.706, + "step": 70280 + }, + { + "epoch": 4.775445033292567, + "grad_norm": 0.15552154183387756, + "learning_rate": 4.033241608914255e-05, + "loss": 4.1314, + "step": 70285 + }, + { + "epoch": 4.7757847533632285, + "grad_norm": 0.19169943034648895, + "learning_rate": 4.032816958825927e-05, + "loss": 3.8714, + "step": 70290 + }, + { + "epoch": 4.776124473433891, + "grad_norm": 0.18543344736099243, + "learning_rate": 4.0323923087376e-05, + "loss": 3.9107, + "step": 70295 + }, + { + "epoch": 4.776464193504552, + "grad_norm": 0.21812017261981964, + "learning_rate": 4.0319676586492735e-05, + "loss": 4.0278, + "step": 70300 + }, + { + "epoch": 4.776803913575214, + "grad_norm": 0.19433890283107758, + "learning_rate": 4.031543008560946e-05, + "loss": 4.0279, + "step": 70305 + }, + { + "epoch": 4.777143633645876, + "grad_norm": 0.26344507932662964, + "learning_rate": 4.0311183584726184e-05, + "loss": 3.8857, + "step": 70310 + }, + { + "epoch": 4.777483353716537, + "grad_norm": 0.5038392543792725, + "learning_rate": 4.030693708384292e-05, + "loss": 3.868, + "step": 70315 + }, + { + "epoch": 4.777823073787199, + "grad_norm": 0.1774202436208725, + "learning_rate": 4.030269058295965e-05, + "loss": 3.8885, + "step": 70320 + }, + { + "epoch": 4.778162793857861, + "grad_norm": 1.922248125076294, + "learning_rate": 4.029844408207637e-05, + "loss": 3.77, + "step": 70325 + }, + { + "epoch": 4.778502513928522, + "grad_norm": 0.3580746650695801, + "learning_rate": 4.0294197581193096e-05, + "loss": 3.7468, + "step": 70330 + }, + { + "epoch": 4.7788422339991845, + "grad_norm": 0.1800621598958969, + "learning_rate": 4.028995108030983e-05, + "loss": 3.8317, + "step": 70335 + }, + { + "epoch": 4.779181954069847, + "grad_norm": 0.20851300656795502, + "learning_rate": 4.028570457942655e-05, + "loss": 3.7605, + "step": 70340 + }, + { + "epoch": 4.779521674140508, + "grad_norm": 0.25747150182724, + "learning_rate": 4.028145807854328e-05, + "loss": 3.7408, + "step": 70345 + }, + { + "epoch": 4.77986139421117, + "grad_norm": 0.15010255575180054, + "learning_rate": 4.0277211577660015e-05, + "loss": 3.665, + "step": 70350 + }, + { + "epoch": 4.780201114281832, + "grad_norm": 0.2740277945995331, + "learning_rate": 4.0272965076776736e-05, + "loss": 3.9789, + "step": 70355 + }, + { + "epoch": 4.780540834352493, + "grad_norm": 0.22729888558387756, + "learning_rate": 4.0268718575893464e-05, + "loss": 3.897, + "step": 70360 + }, + { + "epoch": 4.780880554423155, + "grad_norm": 0.2547963261604309, + "learning_rate": 4.026447207501019e-05, + "loss": 3.5773, + "step": 70365 + }, + { + "epoch": 4.781220274493817, + "grad_norm": 0.18675345182418823, + "learning_rate": 4.026022557412692e-05, + "loss": 3.968, + "step": 70370 + }, + { + "epoch": 4.7815599945644784, + "grad_norm": 0.2608765661716461, + "learning_rate": 4.025597907324365e-05, + "loss": 3.9736, + "step": 70375 + }, + { + "epoch": 4.7818997146351405, + "grad_norm": 0.1744004637002945, + "learning_rate": 4.0251732572360376e-05, + "loss": 4.0152, + "step": 70380 + }, + { + "epoch": 4.782239434705803, + "grad_norm": 0.18451452255249023, + "learning_rate": 4.0247486071477104e-05, + "loss": 3.9811, + "step": 70385 + }, + { + "epoch": 4.782579154776464, + "grad_norm": 0.191804900765419, + "learning_rate": 4.024323957059383e-05, + "loss": 3.7858, + "step": 70390 + }, + { + "epoch": 4.782918874847126, + "grad_norm": 0.3443194329738617, + "learning_rate": 4.023899306971056e-05, + "loss": 3.9014, + "step": 70395 + }, + { + "epoch": 4.783258594917788, + "grad_norm": 0.1750718355178833, + "learning_rate": 4.023474656882729e-05, + "loss": 3.6665, + "step": 70400 + }, + { + "epoch": 4.783598314988449, + "grad_norm": 0.19320803880691528, + "learning_rate": 4.0230500067944016e-05, + "loss": 3.9071, + "step": 70405 + }, + { + "epoch": 4.783938035059111, + "grad_norm": 0.13967904448509216, + "learning_rate": 4.0226253567060745e-05, + "loss": 3.9798, + "step": 70410 + }, + { + "epoch": 4.784277755129773, + "grad_norm": 0.16601674258708954, + "learning_rate": 4.022200706617747e-05, + "loss": 4.0154, + "step": 70415 + }, + { + "epoch": 4.7846174752004345, + "grad_norm": 0.16849763691425323, + "learning_rate": 4.02177605652942e-05, + "loss": 3.7738, + "step": 70420 + }, + { + "epoch": 4.7849571952710965, + "grad_norm": 0.26976439356803894, + "learning_rate": 4.021351406441093e-05, + "loss": 3.6345, + "step": 70425 + }, + { + "epoch": 4.785296915341759, + "grad_norm": 0.422234445810318, + "learning_rate": 4.020926756352765e-05, + "loss": 3.9331, + "step": 70430 + }, + { + "epoch": 4.78563663541242, + "grad_norm": 0.2785547971725464, + "learning_rate": 4.0205021062644385e-05, + "loss": 3.8189, + "step": 70435 + }, + { + "epoch": 4.785976355483082, + "grad_norm": 0.24007798731327057, + "learning_rate": 4.020077456176111e-05, + "loss": 3.592, + "step": 70440 + }, + { + "epoch": 4.786316075553744, + "grad_norm": 0.20291811227798462, + "learning_rate": 4.0196528060877834e-05, + "loss": 4.0507, + "step": 70445 + }, + { + "epoch": 4.786655795624405, + "grad_norm": 0.1562073677778244, + "learning_rate": 4.019228155999457e-05, + "loss": 3.8632, + "step": 70450 + }, + { + "epoch": 4.786995515695067, + "grad_norm": 0.18006905913352966, + "learning_rate": 4.0188035059111297e-05, + "loss": 3.6692, + "step": 70455 + }, + { + "epoch": 4.787335235765729, + "grad_norm": 0.3399786353111267, + "learning_rate": 4.018378855822802e-05, + "loss": 3.7474, + "step": 70460 + }, + { + "epoch": 4.7876749558363905, + "grad_norm": 0.18158859014511108, + "learning_rate": 4.0179542057344746e-05, + "loss": 3.8693, + "step": 70465 + }, + { + "epoch": 4.7880146759070525, + "grad_norm": 0.20378580689430237, + "learning_rate": 4.017529555646148e-05, + "loss": 3.9735, + "step": 70470 + }, + { + "epoch": 4.788354395977715, + "grad_norm": 0.19618898630142212, + "learning_rate": 4.017104905557821e-05, + "loss": 3.8996, + "step": 70475 + }, + { + "epoch": 4.788694116048376, + "grad_norm": 0.2298976182937622, + "learning_rate": 4.016680255469493e-05, + "loss": 3.7173, + "step": 70480 + }, + { + "epoch": 4.789033836119038, + "grad_norm": 0.17214913666248322, + "learning_rate": 4.0162556053811665e-05, + "loss": 3.936, + "step": 70485 + }, + { + "epoch": 4.7893735561897, + "grad_norm": 0.17404712736606598, + "learning_rate": 4.015830955292839e-05, + "loss": 3.6805, + "step": 70490 + }, + { + "epoch": 4.789713276260361, + "grad_norm": 0.24903635680675507, + "learning_rate": 4.0154063052045114e-05, + "loss": 3.7241, + "step": 70495 + }, + { + "epoch": 4.790052996331023, + "grad_norm": 0.17742547392845154, + "learning_rate": 4.014981655116184e-05, + "loss": 4.0611, + "step": 70500 + }, + { + "epoch": 4.790392716401685, + "grad_norm": 0.16356362402439117, + "learning_rate": 4.014557005027858e-05, + "loss": 3.6656, + "step": 70505 + }, + { + "epoch": 4.7907324364723465, + "grad_norm": 0.14877626299858093, + "learning_rate": 4.01413235493953e-05, + "loss": 3.843, + "step": 70510 + }, + { + "epoch": 4.7910721565430086, + "grad_norm": 0.23482497036457062, + "learning_rate": 4.0137077048512026e-05, + "loss": 3.7025, + "step": 70515 + }, + { + "epoch": 4.791411876613671, + "grad_norm": 0.1619347184896469, + "learning_rate": 4.013283054762876e-05, + "loss": 4.0481, + "step": 70520 + }, + { + "epoch": 4.791751596684332, + "grad_norm": 0.18392819166183472, + "learning_rate": 4.012858404674548e-05, + "loss": 3.8705, + "step": 70525 + }, + { + "epoch": 4.792091316754994, + "grad_norm": 0.21138662099838257, + "learning_rate": 4.012433754586221e-05, + "loss": 3.7639, + "step": 70530 + }, + { + "epoch": 4.792431036825656, + "grad_norm": 0.22237525880336761, + "learning_rate": 4.0120091044978945e-05, + "loss": 3.8765, + "step": 70535 + }, + { + "epoch": 4.792770756896317, + "grad_norm": 0.5308010578155518, + "learning_rate": 4.0115844544095666e-05, + "loss": 3.6717, + "step": 70540 + }, + { + "epoch": 4.793110476966979, + "grad_norm": 0.42754417657852173, + "learning_rate": 4.0111598043212394e-05, + "loss": 3.6965, + "step": 70545 + }, + { + "epoch": 4.793450197037641, + "grad_norm": 0.270583838224411, + "learning_rate": 4.010735154232912e-05, + "loss": 3.7979, + "step": 70550 + }, + { + "epoch": 4.7937899171083025, + "grad_norm": 0.19766126573085785, + "learning_rate": 4.010310504144585e-05, + "loss": 4.0644, + "step": 70555 + }, + { + "epoch": 4.794129637178965, + "grad_norm": 0.15553219616413116, + "learning_rate": 4.009885854056258e-05, + "loss": 4.1012, + "step": 70560 + }, + { + "epoch": 4.794469357249627, + "grad_norm": 0.23311534523963928, + "learning_rate": 4.0094612039679306e-05, + "loss": 3.9477, + "step": 70565 + }, + { + "epoch": 4.794809077320288, + "grad_norm": 0.19616493582725525, + "learning_rate": 4.0090365538796034e-05, + "loss": 4.0355, + "step": 70570 + }, + { + "epoch": 4.79514879739095, + "grad_norm": 0.1902386099100113, + "learning_rate": 4.008611903791276e-05, + "loss": 3.7437, + "step": 70575 + }, + { + "epoch": 4.795488517461612, + "grad_norm": 0.21905486285686493, + "learning_rate": 4.008187253702949e-05, + "loss": 3.8059, + "step": 70580 + }, + { + "epoch": 4.795828237532273, + "grad_norm": 0.1549251824617386, + "learning_rate": 4.007762603614622e-05, + "loss": 3.7766, + "step": 70585 + }, + { + "epoch": 4.796167957602935, + "grad_norm": 0.17087768018245697, + "learning_rate": 4.0073379535262946e-05, + "loss": 3.8907, + "step": 70590 + }, + { + "epoch": 4.796507677673597, + "grad_norm": 0.18521907925605774, + "learning_rate": 4.0069133034379674e-05, + "loss": 3.7531, + "step": 70595 + }, + { + "epoch": 4.7968473977442585, + "grad_norm": 0.16606605052947998, + "learning_rate": 4.0064886533496395e-05, + "loss": 3.8344, + "step": 70600 + }, + { + "epoch": 4.797187117814921, + "grad_norm": 0.14402595162391663, + "learning_rate": 4.006064003261313e-05, + "loss": 3.8628, + "step": 70605 + }, + { + "epoch": 4.797526837885583, + "grad_norm": 0.14410436153411865, + "learning_rate": 4.005639353172986e-05, + "loss": 3.8891, + "step": 70610 + }, + { + "epoch": 4.797866557956244, + "grad_norm": 0.1696784794330597, + "learning_rate": 4.005214703084658e-05, + "loss": 3.7442, + "step": 70615 + }, + { + "epoch": 4.798206278026906, + "grad_norm": 0.1492958664894104, + "learning_rate": 4.0047900529963314e-05, + "loss": 4.019, + "step": 70620 + }, + { + "epoch": 4.798545998097568, + "grad_norm": 0.22667580842971802, + "learning_rate": 4.004365402908004e-05, + "loss": 3.7693, + "step": 70625 + }, + { + "epoch": 4.798885718168229, + "grad_norm": 0.1889474093914032, + "learning_rate": 4.003940752819676e-05, + "loss": 3.5224, + "step": 70630 + }, + { + "epoch": 4.799225438238891, + "grad_norm": 0.534538984298706, + "learning_rate": 4.00351610273135e-05, + "loss": 3.8229, + "step": 70635 + }, + { + "epoch": 4.799565158309553, + "grad_norm": 0.4319285452365875, + "learning_rate": 4.0030914526430226e-05, + "loss": 3.8617, + "step": 70640 + }, + { + "epoch": 4.7999048783802145, + "grad_norm": 0.1749068647623062, + "learning_rate": 4.0026668025546954e-05, + "loss": 3.6875, + "step": 70645 + }, + { + "epoch": 4.800244598450877, + "grad_norm": 0.18013118207454681, + "learning_rate": 4.0022421524663675e-05, + "loss": 3.7751, + "step": 70650 + }, + { + "epoch": 4.800584318521539, + "grad_norm": 0.16189618408679962, + "learning_rate": 4.001817502378041e-05, + "loss": 3.8202, + "step": 70655 + }, + { + "epoch": 4.8009240385922, + "grad_norm": 0.19582967460155487, + "learning_rate": 4.001392852289714e-05, + "loss": 3.9822, + "step": 70660 + }, + { + "epoch": 4.801263758662862, + "grad_norm": 0.15338417887687683, + "learning_rate": 4.000968202201386e-05, + "loss": 3.97, + "step": 70665 + }, + { + "epoch": 4.801603478733524, + "grad_norm": 0.2086905837059021, + "learning_rate": 4.0005435521130594e-05, + "loss": 3.6846, + "step": 70670 + }, + { + "epoch": 4.801943198804185, + "grad_norm": 0.17371195554733276, + "learning_rate": 4.000118902024732e-05, + "loss": 4.1241, + "step": 70675 + }, + { + "epoch": 4.802282918874847, + "grad_norm": 0.15363606810569763, + "learning_rate": 3.999694251936404e-05, + "loss": 3.8735, + "step": 70680 + }, + { + "epoch": 4.802622638945509, + "grad_norm": 0.1576196551322937, + "learning_rate": 3.999269601848077e-05, + "loss": 3.7462, + "step": 70685 + }, + { + "epoch": 4.8029623590161705, + "grad_norm": 0.18340179324150085, + "learning_rate": 3.9988449517597506e-05, + "loss": 3.8064, + "step": 70690 + }, + { + "epoch": 4.803302079086833, + "grad_norm": 0.1817067265510559, + "learning_rate": 3.998420301671423e-05, + "loss": 3.971, + "step": 70695 + }, + { + "epoch": 4.803641799157495, + "grad_norm": 6.228034973144531, + "learning_rate": 3.9979956515830955e-05, + "loss": 3.7568, + "step": 70700 + }, + { + "epoch": 4.803981519228156, + "grad_norm": 0.24893935024738312, + "learning_rate": 3.997571001494769e-05, + "loss": 3.6946, + "step": 70705 + }, + { + "epoch": 4.804321239298818, + "grad_norm": 0.33734169602394104, + "learning_rate": 3.997146351406441e-05, + "loss": 3.8371, + "step": 70710 + }, + { + "epoch": 4.80466095936948, + "grad_norm": 0.1758011281490326, + "learning_rate": 3.996721701318114e-05, + "loss": 3.8734, + "step": 70715 + }, + { + "epoch": 4.805000679440141, + "grad_norm": 0.1980191320180893, + "learning_rate": 3.996297051229787e-05, + "loss": 3.7779, + "step": 70720 + }, + { + "epoch": 4.805340399510803, + "grad_norm": 0.14684177935123444, + "learning_rate": 3.9958724011414595e-05, + "loss": 3.744, + "step": 70725 + }, + { + "epoch": 4.805680119581465, + "grad_norm": 0.16563493013381958, + "learning_rate": 3.9954477510531323e-05, + "loss": 3.8617, + "step": 70730 + }, + { + "epoch": 4.8060198396521265, + "grad_norm": 0.16085009276866913, + "learning_rate": 3.995023100964805e-05, + "loss": 3.8614, + "step": 70735 + }, + { + "epoch": 4.806359559722789, + "grad_norm": 0.1620466411113739, + "learning_rate": 3.994598450876478e-05, + "loss": 3.943, + "step": 70740 + }, + { + "epoch": 4.806699279793451, + "grad_norm": 0.1930084079504013, + "learning_rate": 3.994173800788151e-05, + "loss": 3.9878, + "step": 70745 + }, + { + "epoch": 4.807038999864112, + "grad_norm": 0.19328877329826355, + "learning_rate": 3.9937491506998235e-05, + "loss": 3.8855, + "step": 70750 + }, + { + "epoch": 4.807378719934774, + "grad_norm": 0.14949461817741394, + "learning_rate": 3.9933245006114963e-05, + "loss": 3.8091, + "step": 70755 + }, + { + "epoch": 4.807718440005435, + "grad_norm": 0.7955849766731262, + "learning_rate": 3.992899850523169e-05, + "loss": 3.719, + "step": 70760 + }, + { + "epoch": 4.808058160076097, + "grad_norm": 0.17894978821277618, + "learning_rate": 3.992475200434842e-05, + "loss": 3.9162, + "step": 70765 + }, + { + "epoch": 4.808397880146759, + "grad_norm": 0.19001273810863495, + "learning_rate": 3.992050550346515e-05, + "loss": 3.8381, + "step": 70770 + }, + { + "epoch": 4.8087376002174205, + "grad_norm": 0.141219362616539, + "learning_rate": 3.9916259002581876e-05, + "loss": 3.932, + "step": 70775 + }, + { + "epoch": 4.8090773202880825, + "grad_norm": 0.29186564683914185, + "learning_rate": 3.9912012501698604e-05, + "loss": 3.7187, + "step": 70780 + }, + { + "epoch": 4.809417040358745, + "grad_norm": 0.14125792682170868, + "learning_rate": 3.9907766000815325e-05, + "loss": 3.9955, + "step": 70785 + }, + { + "epoch": 4.809756760429406, + "grad_norm": 0.17533020675182343, + "learning_rate": 3.990351949993206e-05, + "loss": 3.735, + "step": 70790 + }, + { + "epoch": 4.810096480500068, + "grad_norm": 0.20233184099197388, + "learning_rate": 3.989927299904879e-05, + "loss": 3.722, + "step": 70795 + }, + { + "epoch": 4.81043620057073, + "grad_norm": 0.2939732074737549, + "learning_rate": 3.989502649816551e-05, + "loss": 3.7249, + "step": 70800 + }, + { + "epoch": 4.810775920641391, + "grad_norm": 0.17930886149406433, + "learning_rate": 3.9890779997282244e-05, + "loss": 3.8161, + "step": 70805 + }, + { + "epoch": 4.811115640712053, + "grad_norm": 0.18865668773651123, + "learning_rate": 3.988653349639897e-05, + "loss": 4.171, + "step": 70810 + }, + { + "epoch": 4.811455360782715, + "grad_norm": 0.18321113288402557, + "learning_rate": 3.98822869955157e-05, + "loss": 4.0204, + "step": 70815 + }, + { + "epoch": 4.8117950808533765, + "grad_norm": 0.1728058010339737, + "learning_rate": 3.987804049463242e-05, + "loss": 3.7647, + "step": 70820 + }, + { + "epoch": 4.812134800924039, + "grad_norm": 0.17273841798305511, + "learning_rate": 3.9873793993749156e-05, + "loss": 3.9206, + "step": 70825 + }, + { + "epoch": 4.812474520994701, + "grad_norm": 0.17520803213119507, + "learning_rate": 3.9869547492865884e-05, + "loss": 3.851, + "step": 70830 + }, + { + "epoch": 4.812814241065362, + "grad_norm": 3.7939860820770264, + "learning_rate": 3.9865300991982605e-05, + "loss": 3.9677, + "step": 70835 + }, + { + "epoch": 4.813153961136024, + "grad_norm": 0.19608311355113983, + "learning_rate": 3.986105449109934e-05, + "loss": 3.8041, + "step": 70840 + }, + { + "epoch": 4.813493681206686, + "grad_norm": 0.20329950749874115, + "learning_rate": 3.985680799021607e-05, + "loss": 4.2882, + "step": 70845 + }, + { + "epoch": 4.813833401277347, + "grad_norm": 0.20713748037815094, + "learning_rate": 3.985256148933279e-05, + "loss": 3.8191, + "step": 70850 + }, + { + "epoch": 4.814173121348009, + "grad_norm": 0.28105273842811584, + "learning_rate": 3.984831498844952e-05, + "loss": 4.1234, + "step": 70855 + }, + { + "epoch": 4.814512841418671, + "grad_norm": 0.19352732598781586, + "learning_rate": 3.984406848756625e-05, + "loss": 3.7172, + "step": 70860 + }, + { + "epoch": 4.8148525614893325, + "grad_norm": 0.18857350945472717, + "learning_rate": 3.983982198668297e-05, + "loss": 3.9109, + "step": 70865 + }, + { + "epoch": 4.815192281559995, + "grad_norm": 0.32114383578300476, + "learning_rate": 3.98355754857997e-05, + "loss": 3.793, + "step": 70870 + }, + { + "epoch": 4.815532001630657, + "grad_norm": 0.21420887112617493, + "learning_rate": 3.9831328984916436e-05, + "loss": 3.722, + "step": 70875 + }, + { + "epoch": 4.815871721701318, + "grad_norm": 0.603622555732727, + "learning_rate": 3.982708248403316e-05, + "loss": 4.0355, + "step": 70880 + }, + { + "epoch": 4.81621144177198, + "grad_norm": 0.14547137916088104, + "learning_rate": 3.9822835983149885e-05, + "loss": 3.7902, + "step": 70885 + }, + { + "epoch": 4.816551161842642, + "grad_norm": 0.1905893236398697, + "learning_rate": 3.981858948226661e-05, + "loss": 3.8483, + "step": 70890 + }, + { + "epoch": 4.816890881913303, + "grad_norm": 0.20935702323913574, + "learning_rate": 3.981434298138334e-05, + "loss": 3.9851, + "step": 70895 + }, + { + "epoch": 4.817230601983965, + "grad_norm": 0.19301308691501617, + "learning_rate": 3.981009648050007e-05, + "loss": 4.044, + "step": 70900 + }, + { + "epoch": 4.817570322054627, + "grad_norm": 0.2002219408750534, + "learning_rate": 3.98058499796168e-05, + "loss": 4.0655, + "step": 70905 + }, + { + "epoch": 4.8179100421252885, + "grad_norm": 0.2143668532371521, + "learning_rate": 3.9801603478733525e-05, + "loss": 4.1221, + "step": 70910 + }, + { + "epoch": 4.818249762195951, + "grad_norm": 0.31640109419822693, + "learning_rate": 3.979735697785025e-05, + "loss": 3.8651, + "step": 70915 + }, + { + "epoch": 4.818589482266613, + "grad_norm": 0.1701314002275467, + "learning_rate": 3.979311047696698e-05, + "loss": 3.6697, + "step": 70920 + }, + { + "epoch": 4.818929202337274, + "grad_norm": 0.22839003801345825, + "learning_rate": 3.978886397608371e-05, + "loss": 3.93, + "step": 70925 + }, + { + "epoch": 4.819268922407936, + "grad_norm": 0.16804993152618408, + "learning_rate": 3.978461747520044e-05, + "loss": 3.8984, + "step": 70930 + }, + { + "epoch": 4.819608642478598, + "grad_norm": 0.17863088846206665, + "learning_rate": 3.9780370974317165e-05, + "loss": 3.7933, + "step": 70935 + }, + { + "epoch": 4.819948362549259, + "grad_norm": 0.150114044547081, + "learning_rate": 3.977612447343389e-05, + "loss": 3.93, + "step": 70940 + }, + { + "epoch": 4.820288082619921, + "grad_norm": 0.16207502782344818, + "learning_rate": 3.977187797255062e-05, + "loss": 4.1115, + "step": 70945 + }, + { + "epoch": 4.820627802690583, + "grad_norm": 0.22246865928173065, + "learning_rate": 3.976763147166735e-05, + "loss": 3.8983, + "step": 70950 + }, + { + "epoch": 4.8209675227612445, + "grad_norm": 0.16276921331882477, + "learning_rate": 3.976338497078407e-05, + "loss": 3.7748, + "step": 70955 + }, + { + "epoch": 4.821307242831907, + "grad_norm": 0.15599238872528076, + "learning_rate": 3.9759138469900805e-05, + "loss": 3.8718, + "step": 70960 + }, + { + "epoch": 4.821646962902569, + "grad_norm": 0.19799159467220306, + "learning_rate": 3.975489196901753e-05, + "loss": 3.975, + "step": 70965 + }, + { + "epoch": 4.82198668297323, + "grad_norm": 0.1458619236946106, + "learning_rate": 3.9750645468134254e-05, + "loss": 3.9302, + "step": 70970 + }, + { + "epoch": 4.822326403043892, + "grad_norm": 0.15563273429870605, + "learning_rate": 3.974639896725099e-05, + "loss": 3.6874, + "step": 70975 + }, + { + "epoch": 4.822666123114553, + "grad_norm": 0.5896859765052795, + "learning_rate": 3.974215246636772e-05, + "loss": 3.6487, + "step": 70980 + }, + { + "epoch": 4.823005843185215, + "grad_norm": 0.17913825809955597, + "learning_rate": 3.9737905965484445e-05, + "loss": 3.8523, + "step": 70985 + }, + { + "epoch": 4.823345563255877, + "grad_norm": 0.1759783923625946, + "learning_rate": 3.9733659464601166e-05, + "loss": 3.7383, + "step": 70990 + }, + { + "epoch": 4.8236852833265385, + "grad_norm": 0.21404030919075012, + "learning_rate": 3.97294129637179e-05, + "loss": 3.9132, + "step": 70995 + }, + { + "epoch": 4.8240250033972005, + "grad_norm": 0.18146388232707977, + "learning_rate": 3.972516646283463e-05, + "loss": 3.9443, + "step": 71000 + }, + { + "epoch": 4.824364723467863, + "grad_norm": 0.15883387625217438, + "learning_rate": 3.972091996195135e-05, + "loss": 3.7865, + "step": 71005 + }, + { + "epoch": 4.824704443538524, + "grad_norm": 0.23232479393482208, + "learning_rate": 3.9716673461068085e-05, + "loss": 3.9903, + "step": 71010 + }, + { + "epoch": 4.825044163609186, + "grad_norm": 0.19015835225582123, + "learning_rate": 3.971242696018481e-05, + "loss": 3.5709, + "step": 71015 + }, + { + "epoch": 4.825383883679848, + "grad_norm": 0.1940743327140808, + "learning_rate": 3.9708180459301534e-05, + "loss": 3.7637, + "step": 71020 + }, + { + "epoch": 4.825723603750509, + "grad_norm": 0.6698678731918335, + "learning_rate": 3.970393395841826e-05, + "loss": 3.8939, + "step": 71025 + }, + { + "epoch": 4.826063323821171, + "grad_norm": 0.16673365235328674, + "learning_rate": 3.9699687457535e-05, + "loss": 3.9741, + "step": 71030 + }, + { + "epoch": 4.826403043891833, + "grad_norm": 0.31971198320388794, + "learning_rate": 3.969544095665172e-05, + "loss": 3.839, + "step": 71035 + }, + { + "epoch": 4.8267427639624945, + "grad_norm": 0.39376217126846313, + "learning_rate": 3.9691194455768446e-05, + "loss": 3.7034, + "step": 71040 + }, + { + "epoch": 4.8270824840331565, + "grad_norm": 0.20318931341171265, + "learning_rate": 3.968694795488518e-05, + "loss": 3.6291, + "step": 71045 + }, + { + "epoch": 4.827422204103819, + "grad_norm": 0.20043489336967468, + "learning_rate": 3.96827014540019e-05, + "loss": 3.9296, + "step": 71050 + }, + { + "epoch": 4.82776192417448, + "grad_norm": 0.17584364116191864, + "learning_rate": 3.967845495311863e-05, + "loss": 3.8433, + "step": 71055 + }, + { + "epoch": 4.828101644245142, + "grad_norm": 0.15978723764419556, + "learning_rate": 3.9674208452235365e-05, + "loss": 3.8419, + "step": 71060 + }, + { + "epoch": 4.828441364315804, + "grad_norm": 0.1846192479133606, + "learning_rate": 3.9669961951352086e-05, + "loss": 4.0289, + "step": 71065 + }, + { + "epoch": 4.828781084386465, + "grad_norm": 0.16744832694530487, + "learning_rate": 3.9665715450468814e-05, + "loss": 3.9504, + "step": 71070 + }, + { + "epoch": 4.829120804457127, + "grad_norm": 0.18834184110164642, + "learning_rate": 3.966146894958554e-05, + "loss": 3.7992, + "step": 71075 + }, + { + "epoch": 4.829460524527789, + "grad_norm": 0.1689615249633789, + "learning_rate": 3.965722244870227e-05, + "loss": 3.8632, + "step": 71080 + }, + { + "epoch": 4.8298002445984505, + "grad_norm": 5.793513298034668, + "learning_rate": 3.9652975947819e-05, + "loss": 3.9165, + "step": 71085 + }, + { + "epoch": 4.8301399646691126, + "grad_norm": 0.17210134863853455, + "learning_rate": 3.9648729446935726e-05, + "loss": 3.9604, + "step": 71090 + }, + { + "epoch": 4.830479684739775, + "grad_norm": 0.20700283348560333, + "learning_rate": 3.9644482946052454e-05, + "loss": 3.7954, + "step": 71095 + }, + { + "epoch": 4.830819404810436, + "grad_norm": 0.34806761145591736, + "learning_rate": 3.964023644516918e-05, + "loss": 3.8963, + "step": 71100 + }, + { + "epoch": 4.831159124881098, + "grad_norm": 0.4884701371192932, + "learning_rate": 3.963598994428591e-05, + "loss": 3.9617, + "step": 71105 + }, + { + "epoch": 4.83149884495176, + "grad_norm": 0.16322609782218933, + "learning_rate": 3.963174344340264e-05, + "loss": 4.031, + "step": 71110 + }, + { + "epoch": 4.831838565022421, + "grad_norm": 0.15877823531627655, + "learning_rate": 3.9627496942519366e-05, + "loss": 3.8917, + "step": 71115 + }, + { + "epoch": 4.832178285093083, + "grad_norm": 0.14246685802936554, + "learning_rate": 3.9623250441636094e-05, + "loss": 3.734, + "step": 71120 + }, + { + "epoch": 4.832518005163745, + "grad_norm": 0.16838420927524567, + "learning_rate": 3.9619003940752816e-05, + "loss": 3.9337, + "step": 71125 + }, + { + "epoch": 4.8328577252344065, + "grad_norm": 0.18788465857505798, + "learning_rate": 3.961475743986955e-05, + "loss": 3.6825, + "step": 71130 + }, + { + "epoch": 4.833197445305069, + "grad_norm": 0.13681015372276306, + "learning_rate": 3.961051093898628e-05, + "loss": 4.0171, + "step": 71135 + }, + { + "epoch": 4.833537165375731, + "grad_norm": 0.1751546710729599, + "learning_rate": 3.9606264438103e-05, + "loss": 3.5845, + "step": 71140 + }, + { + "epoch": 4.833876885446392, + "grad_norm": 0.1601065695285797, + "learning_rate": 3.9602017937219735e-05, + "loss": 3.7778, + "step": 71145 + }, + { + "epoch": 4.834216605517054, + "grad_norm": 0.1418411284685135, + "learning_rate": 3.959777143633646e-05, + "loss": 3.8776, + "step": 71150 + }, + { + "epoch": 4.834556325587716, + "grad_norm": 0.19416990876197815, + "learning_rate": 3.959352493545319e-05, + "loss": 3.8062, + "step": 71155 + }, + { + "epoch": 4.834896045658377, + "grad_norm": 0.2143838256597519, + "learning_rate": 3.958927843456992e-05, + "loss": 3.6951, + "step": 71160 + }, + { + "epoch": 4.835235765729039, + "grad_norm": 0.17289510369300842, + "learning_rate": 3.9585031933686647e-05, + "loss": 4.0108, + "step": 71165 + }, + { + "epoch": 4.835575485799701, + "grad_norm": 0.1487758308649063, + "learning_rate": 3.9580785432803375e-05, + "loss": 3.7837, + "step": 71170 + }, + { + "epoch": 4.8359152058703625, + "grad_norm": 0.21937382221221924, + "learning_rate": 3.9576538931920096e-05, + "loss": 3.7454, + "step": 71175 + }, + { + "epoch": 4.836254925941025, + "grad_norm": 0.12883210182189941, + "learning_rate": 3.957229243103683e-05, + "loss": 3.7777, + "step": 71180 + }, + { + "epoch": 4.836594646011687, + "grad_norm": 0.21273402869701385, + "learning_rate": 3.956804593015356e-05, + "loss": 3.9382, + "step": 71185 + }, + { + "epoch": 4.836934366082348, + "grad_norm": 1.2457605600357056, + "learning_rate": 3.956379942927028e-05, + "loss": 3.8724, + "step": 71190 + }, + { + "epoch": 4.83727408615301, + "grad_norm": 0.17323438823223114, + "learning_rate": 3.9559552928387015e-05, + "loss": 3.7312, + "step": 71195 + }, + { + "epoch": 4.837613806223672, + "grad_norm": 0.1738622486591339, + "learning_rate": 3.955530642750374e-05, + "loss": 3.9132, + "step": 71200 + }, + { + "epoch": 4.837953526294333, + "grad_norm": 0.19093722105026245, + "learning_rate": 3.9551059926620464e-05, + "loss": 3.7879, + "step": 71205 + }, + { + "epoch": 4.838293246364995, + "grad_norm": 0.22542700171470642, + "learning_rate": 3.954681342573719e-05, + "loss": 3.8527, + "step": 71210 + }, + { + "epoch": 4.838632966435657, + "grad_norm": 0.18545043468475342, + "learning_rate": 3.954256692485393e-05, + "loss": 3.8131, + "step": 71215 + }, + { + "epoch": 4.8389726865063185, + "grad_norm": 0.17290754616260529, + "learning_rate": 3.953832042397065e-05, + "loss": 3.7718, + "step": 71220 + }, + { + "epoch": 4.839312406576981, + "grad_norm": 0.1475800722837448, + "learning_rate": 3.9534073923087376e-05, + "loss": 3.6517, + "step": 71225 + }, + { + "epoch": 4.839652126647643, + "grad_norm": 0.17972594499588013, + "learning_rate": 3.952982742220411e-05, + "loss": 3.6018, + "step": 71230 + }, + { + "epoch": 4.839991846718304, + "grad_norm": 0.35980239510536194, + "learning_rate": 3.952558092132083e-05, + "loss": 3.9155, + "step": 71235 + }, + { + "epoch": 4.840331566788966, + "grad_norm": 0.1523229032754898, + "learning_rate": 3.952133442043756e-05, + "loss": 3.8862, + "step": 71240 + }, + { + "epoch": 4.840671286859628, + "grad_norm": 0.19236735999584198, + "learning_rate": 3.951708791955429e-05, + "loss": 3.7525, + "step": 71245 + }, + { + "epoch": 4.841011006930289, + "grad_norm": 0.15903498232364655, + "learning_rate": 3.9512841418671016e-05, + "loss": 3.8248, + "step": 71250 + }, + { + "epoch": 4.841350727000951, + "grad_norm": 0.20833736658096313, + "learning_rate": 3.9508594917787744e-05, + "loss": 3.9574, + "step": 71255 + }, + { + "epoch": 4.841690447071613, + "grad_norm": 0.16920152306556702, + "learning_rate": 3.950434841690447e-05, + "loss": 3.7305, + "step": 71260 + }, + { + "epoch": 4.8420301671422745, + "grad_norm": 0.34563642740249634, + "learning_rate": 3.95001019160212e-05, + "loss": 3.9269, + "step": 71265 + }, + { + "epoch": 4.842369887212937, + "grad_norm": 0.25207144021987915, + "learning_rate": 3.949585541513793e-05, + "loss": 3.7584, + "step": 71270 + }, + { + "epoch": 4.842709607283599, + "grad_norm": 0.15484748780727386, + "learning_rate": 3.9491608914254656e-05, + "loss": 3.8496, + "step": 71275 + }, + { + "epoch": 4.84304932735426, + "grad_norm": 0.16015157103538513, + "learning_rate": 3.9487362413371384e-05, + "loss": 3.8422, + "step": 71280 + }, + { + "epoch": 4.843389047424922, + "grad_norm": 0.1835055947303772, + "learning_rate": 3.948311591248811e-05, + "loss": 3.7968, + "step": 71285 + }, + { + "epoch": 4.843728767495584, + "grad_norm": 0.183495432138443, + "learning_rate": 3.947886941160484e-05, + "loss": 3.8342, + "step": 71290 + }, + { + "epoch": 4.844068487566245, + "grad_norm": 0.3453519344329834, + "learning_rate": 3.947462291072157e-05, + "loss": 3.8836, + "step": 71295 + }, + { + "epoch": 4.844408207636907, + "grad_norm": 0.14416027069091797, + "learning_rate": 3.9470376409838296e-05, + "loss": 3.7225, + "step": 71300 + }, + { + "epoch": 4.844747927707569, + "grad_norm": 0.12950444221496582, + "learning_rate": 3.9466129908955024e-05, + "loss": 3.9612, + "step": 71305 + }, + { + "epoch": 4.8450876477782305, + "grad_norm": 0.37392815947532654, + "learning_rate": 3.9461883408071745e-05, + "loss": 3.7574, + "step": 71310 + }, + { + "epoch": 4.845427367848893, + "grad_norm": 0.19077719748020172, + "learning_rate": 3.945763690718848e-05, + "loss": 3.8844, + "step": 71315 + }, + { + "epoch": 4.845767087919555, + "grad_norm": 0.15403759479522705, + "learning_rate": 3.945339040630521e-05, + "loss": 4.1217, + "step": 71320 + }, + { + "epoch": 4.846106807990216, + "grad_norm": 0.18348601460456848, + "learning_rate": 3.9449143905421936e-05, + "loss": 3.8718, + "step": 71325 + }, + { + "epoch": 4.846446528060878, + "grad_norm": 0.48156988620758057, + "learning_rate": 3.9444897404538664e-05, + "loss": 4.0138, + "step": 71330 + }, + { + "epoch": 4.84678624813154, + "grad_norm": 0.2995598316192627, + "learning_rate": 3.944065090365539e-05, + "loss": 3.8825, + "step": 71335 + }, + { + "epoch": 4.847125968202201, + "grad_norm": 0.19915789365768433, + "learning_rate": 3.943640440277212e-05, + "loss": 3.6617, + "step": 71340 + }, + { + "epoch": 4.847465688272863, + "grad_norm": 0.15429261326789856, + "learning_rate": 3.943215790188884e-05, + "loss": 3.9207, + "step": 71345 + }, + { + "epoch": 4.847805408343525, + "grad_norm": 0.16353724896907806, + "learning_rate": 3.9427911401005576e-05, + "loss": 3.8355, + "step": 71350 + }, + { + "epoch": 4.8481451284141865, + "grad_norm": 0.21488487720489502, + "learning_rate": 3.9423664900122304e-05, + "loss": 4.0142, + "step": 71355 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.15639717876911163, + "learning_rate": 3.9419418399239025e-05, + "loss": 3.6734, + "step": 71360 + }, + { + "epoch": 4.848824568555511, + "grad_norm": 0.19682663679122925, + "learning_rate": 3.941517189835576e-05, + "loss": 3.9322, + "step": 71365 + }, + { + "epoch": 4.849164288626172, + "grad_norm": 0.18979890644550323, + "learning_rate": 3.941092539747249e-05, + "loss": 3.6112, + "step": 71370 + }, + { + "epoch": 4.849504008696834, + "grad_norm": 0.16964897513389587, + "learning_rate": 3.940667889658921e-05, + "loss": 3.8375, + "step": 71375 + }, + { + "epoch": 4.849843728767496, + "grad_norm": 0.24961262941360474, + "learning_rate": 3.940243239570594e-05, + "loss": 3.8505, + "step": 71380 + }, + { + "epoch": 4.850183448838157, + "grad_norm": 0.21332430839538574, + "learning_rate": 3.939818589482267e-05, + "loss": 3.7045, + "step": 71385 + }, + { + "epoch": 4.850523168908819, + "grad_norm": 0.1582641303539276, + "learning_rate": 3.939393939393939e-05, + "loss": 3.8325, + "step": 71390 + }, + { + "epoch": 4.850862888979481, + "grad_norm": 0.1914045363664627, + "learning_rate": 3.938969289305612e-05, + "loss": 3.6822, + "step": 71395 + }, + { + "epoch": 4.8512026090501426, + "grad_norm": 0.35428231954574585, + "learning_rate": 3.9385446392172856e-05, + "loss": 3.8777, + "step": 71400 + }, + { + "epoch": 4.851542329120805, + "grad_norm": 0.20301322638988495, + "learning_rate": 3.938119989128958e-05, + "loss": 3.876, + "step": 71405 + }, + { + "epoch": 4.851882049191467, + "grad_norm": 0.17997293174266815, + "learning_rate": 3.9376953390406305e-05, + "loss": 3.6664, + "step": 71410 + }, + { + "epoch": 4.852221769262128, + "grad_norm": 0.16341596841812134, + "learning_rate": 3.9372706889523033e-05, + "loss": 3.97, + "step": 71415 + }, + { + "epoch": 4.85256148933279, + "grad_norm": 0.16666074097156525, + "learning_rate": 3.936846038863976e-05, + "loss": 3.8046, + "step": 71420 + }, + { + "epoch": 4.852901209403452, + "grad_norm": 0.1853763610124588, + "learning_rate": 3.936421388775649e-05, + "loss": 3.7754, + "step": 71425 + }, + { + "epoch": 4.853240929474113, + "grad_norm": 0.3835453391075134, + "learning_rate": 3.935996738687322e-05, + "loss": 3.9104, + "step": 71430 + }, + { + "epoch": 4.853580649544775, + "grad_norm": 0.1850874423980713, + "learning_rate": 3.9355720885989945e-05, + "loss": 3.9389, + "step": 71435 + }, + { + "epoch": 4.8539203696154365, + "grad_norm": 0.16295692324638367, + "learning_rate": 3.9351474385106673e-05, + "loss": 4.0476, + "step": 71440 + }, + { + "epoch": 4.854260089686099, + "grad_norm": 0.25401267409324646, + "learning_rate": 3.93472278842234e-05, + "loss": 3.5509, + "step": 71445 + }, + { + "epoch": 4.854599809756761, + "grad_norm": 0.14591529965400696, + "learning_rate": 3.934298138334013e-05, + "loss": 3.8305, + "step": 71450 + }, + { + "epoch": 4.854939529827422, + "grad_norm": 0.20695893466472626, + "learning_rate": 3.933873488245686e-05, + "loss": 3.7187, + "step": 71455 + }, + { + "epoch": 4.855279249898084, + "grad_norm": 0.19385109841823578, + "learning_rate": 3.9334488381573585e-05, + "loss": 3.8258, + "step": 71460 + }, + { + "epoch": 4.855618969968746, + "grad_norm": 0.18784688413143158, + "learning_rate": 3.9330241880690313e-05, + "loss": 3.7585, + "step": 71465 + }, + { + "epoch": 4.855958690039407, + "grad_norm": 0.18612565100193024, + "learning_rate": 3.932599537980704e-05, + "loss": 3.7994, + "step": 71470 + }, + { + "epoch": 4.856298410110069, + "grad_norm": 0.19574600458145142, + "learning_rate": 3.932174887892377e-05, + "loss": 4.0222, + "step": 71475 + }, + { + "epoch": 4.856638130180731, + "grad_norm": 0.35519087314605713, + "learning_rate": 3.931750237804049e-05, + "loss": 3.671, + "step": 71480 + }, + { + "epoch": 4.8569778502513925, + "grad_norm": 0.1502504050731659, + "learning_rate": 3.9313255877157226e-05, + "loss": 4.0541, + "step": 71485 + }, + { + "epoch": 4.857317570322055, + "grad_norm": 0.2530680298805237, + "learning_rate": 3.9309009376273954e-05, + "loss": 3.9975, + "step": 71490 + }, + { + "epoch": 4.857657290392717, + "grad_norm": 0.21507017314434052, + "learning_rate": 3.930476287539068e-05, + "loss": 3.7022, + "step": 71495 + }, + { + "epoch": 4.857997010463378, + "grad_norm": 0.16143576800823212, + "learning_rate": 3.930051637450741e-05, + "loss": 4.0035, + "step": 71500 + }, + { + "epoch": 4.85833673053404, + "grad_norm": 0.224837064743042, + "learning_rate": 3.929626987362414e-05, + "loss": 3.5768, + "step": 71505 + }, + { + "epoch": 4.858676450604702, + "grad_norm": 0.157994344830513, + "learning_rate": 3.9292023372740866e-05, + "loss": 3.6519, + "step": 71510 + }, + { + "epoch": 4.859016170675363, + "grad_norm": 0.2110944241285324, + "learning_rate": 3.928777687185759e-05, + "loss": 3.6312, + "step": 71515 + }, + { + "epoch": 4.859355890746025, + "grad_norm": 0.15841077268123627, + "learning_rate": 3.928353037097432e-05, + "loss": 3.717, + "step": 71520 + }, + { + "epoch": 4.859695610816687, + "grad_norm": 0.18490679562091827, + "learning_rate": 3.927928387009105e-05, + "loss": 3.8667, + "step": 71525 + }, + { + "epoch": 4.8600353308873485, + "grad_norm": 0.17699240148067474, + "learning_rate": 3.927503736920777e-05, + "loss": 3.7001, + "step": 71530 + }, + { + "epoch": 4.860375050958011, + "grad_norm": 0.1927429884672165, + "learning_rate": 3.9270790868324506e-05, + "loss": 3.9777, + "step": 71535 + }, + { + "epoch": 4.860714771028673, + "grad_norm": 0.18697163462638855, + "learning_rate": 3.9266544367441234e-05, + "loss": 3.8539, + "step": 71540 + }, + { + "epoch": 4.861054491099334, + "grad_norm": 0.31720447540283203, + "learning_rate": 3.9262297866557955e-05, + "loss": 4.0437, + "step": 71545 + }, + { + "epoch": 4.861394211169996, + "grad_norm": 0.16700398921966553, + "learning_rate": 3.925805136567469e-05, + "loss": 3.7492, + "step": 71550 + }, + { + "epoch": 4.861733931240658, + "grad_norm": 3.7433621883392334, + "learning_rate": 3.925380486479142e-05, + "loss": 3.9871, + "step": 71555 + }, + { + "epoch": 4.862073651311319, + "grad_norm": 0.17547494173049927, + "learning_rate": 3.924955836390814e-05, + "loss": 3.6762, + "step": 71560 + }, + { + "epoch": 4.862413371381981, + "grad_norm": 0.2941679060459137, + "learning_rate": 3.924531186302487e-05, + "loss": 3.8129, + "step": 71565 + }, + { + "epoch": 4.862753091452643, + "grad_norm": 0.2661932110786438, + "learning_rate": 3.92410653621416e-05, + "loss": 3.731, + "step": 71570 + }, + { + "epoch": 4.8630928115233045, + "grad_norm": 0.4022728204727173, + "learning_rate": 3.923681886125832e-05, + "loss": 3.886, + "step": 71575 + }, + { + "epoch": 4.863432531593967, + "grad_norm": 0.8318540453910828, + "learning_rate": 3.923257236037505e-05, + "loss": 3.9047, + "step": 71580 + }, + { + "epoch": 4.863772251664629, + "grad_norm": 0.2714719772338867, + "learning_rate": 3.9228325859491786e-05, + "loss": 4.0421, + "step": 71585 + }, + { + "epoch": 4.86411197173529, + "grad_norm": 0.15605966746807098, + "learning_rate": 3.922407935860851e-05, + "loss": 3.7195, + "step": 71590 + }, + { + "epoch": 4.864451691805952, + "grad_norm": 0.8867831826210022, + "learning_rate": 3.9219832857725235e-05, + "loss": 3.9013, + "step": 71595 + }, + { + "epoch": 4.864791411876614, + "grad_norm": 0.13971871137619019, + "learning_rate": 3.921558635684196e-05, + "loss": 3.6385, + "step": 71600 + }, + { + "epoch": 4.865131131947275, + "grad_norm": 0.22282348573207855, + "learning_rate": 3.921133985595869e-05, + "loss": 3.7864, + "step": 71605 + }, + { + "epoch": 4.865470852017937, + "grad_norm": 0.1967276930809021, + "learning_rate": 3.920709335507542e-05, + "loss": 3.6101, + "step": 71610 + }, + { + "epoch": 4.865810572088599, + "grad_norm": 0.1834419071674347, + "learning_rate": 3.920284685419215e-05, + "loss": 3.6492, + "step": 71615 + }, + { + "epoch": 4.8661502921592605, + "grad_norm": 0.21107906103134155, + "learning_rate": 3.9198600353308875e-05, + "loss": 3.7885, + "step": 71620 + }, + { + "epoch": 4.866490012229923, + "grad_norm": 0.1584479808807373, + "learning_rate": 3.91943538524256e-05, + "loss": 4.0317, + "step": 71625 + }, + { + "epoch": 4.866829732300585, + "grad_norm": 0.16822496056556702, + "learning_rate": 3.919010735154233e-05, + "loss": 3.8268, + "step": 71630 + }, + { + "epoch": 4.867169452371246, + "grad_norm": 3.4916763305664062, + "learning_rate": 3.918586085065906e-05, + "loss": 3.853, + "step": 71635 + }, + { + "epoch": 4.867509172441908, + "grad_norm": 0.1466940939426422, + "learning_rate": 3.918161434977579e-05, + "loss": 3.833, + "step": 71640 + }, + { + "epoch": 4.86784889251257, + "grad_norm": 0.16847209632396698, + "learning_rate": 3.9177367848892515e-05, + "loss": 4.0029, + "step": 71645 + }, + { + "epoch": 4.868188612583231, + "grad_norm": 0.14663957059383392, + "learning_rate": 3.917312134800924e-05, + "loss": 3.6785, + "step": 71650 + }, + { + "epoch": 4.868528332653893, + "grad_norm": 0.5030674934387207, + "learning_rate": 3.916887484712597e-05, + "loss": 3.9132, + "step": 71655 + }, + { + "epoch": 4.8688680527245545, + "grad_norm": 0.21490123867988586, + "learning_rate": 3.91646283462427e-05, + "loss": 3.9921, + "step": 71660 + }, + { + "epoch": 4.8692077727952165, + "grad_norm": 0.15729966759681702, + "learning_rate": 3.916038184535943e-05, + "loss": 3.8764, + "step": 71665 + }, + { + "epoch": 4.869547492865879, + "grad_norm": 0.13385900855064392, + "learning_rate": 3.9156135344476155e-05, + "loss": 4.1075, + "step": 71670 + }, + { + "epoch": 4.86988721293654, + "grad_norm": 0.18434187769889832, + "learning_rate": 3.915188884359288e-05, + "loss": 3.8643, + "step": 71675 + }, + { + "epoch": 4.870226933007202, + "grad_norm": 0.2765215039253235, + "learning_rate": 3.914764234270961e-05, + "loss": 3.8519, + "step": 71680 + }, + { + "epoch": 4.870566653077864, + "grad_norm": 0.32007715106010437, + "learning_rate": 3.914339584182634e-05, + "loss": 3.5926, + "step": 71685 + }, + { + "epoch": 4.870906373148525, + "grad_norm": 0.1658862829208374, + "learning_rate": 3.913914934094307e-05, + "loss": 3.8146, + "step": 71690 + }, + { + "epoch": 4.871246093219187, + "grad_norm": 0.1479843705892563, + "learning_rate": 3.9134902840059795e-05, + "loss": 3.7913, + "step": 71695 + }, + { + "epoch": 4.871585813289849, + "grad_norm": 0.17142441868782043, + "learning_rate": 3.9130656339176516e-05, + "loss": 3.7739, + "step": 71700 + }, + { + "epoch": 4.8719255333605105, + "grad_norm": 0.27052438259124756, + "learning_rate": 3.912640983829325e-05, + "loss": 3.8632, + "step": 71705 + }, + { + "epoch": 4.872265253431173, + "grad_norm": 0.3098181486129761, + "learning_rate": 3.912216333740998e-05, + "loss": 3.5542, + "step": 71710 + }, + { + "epoch": 4.872604973501835, + "grad_norm": 0.18041430413722992, + "learning_rate": 3.91179168365267e-05, + "loss": 3.6639, + "step": 71715 + }, + { + "epoch": 4.872944693572496, + "grad_norm": 0.16053056716918945, + "learning_rate": 3.9113670335643435e-05, + "loss": 4.0997, + "step": 71720 + }, + { + "epoch": 4.873284413643158, + "grad_norm": 0.23430930078029633, + "learning_rate": 3.910942383476016e-05, + "loss": 3.9242, + "step": 71725 + }, + { + "epoch": 4.87362413371382, + "grad_norm": 0.15047092735767365, + "learning_rate": 3.9105177333876884e-05, + "loss": 4.0142, + "step": 71730 + }, + { + "epoch": 4.873963853784481, + "grad_norm": 0.18273769319057465, + "learning_rate": 3.910093083299361e-05, + "loss": 3.8795, + "step": 71735 + }, + { + "epoch": 4.874303573855143, + "grad_norm": 0.15623581409454346, + "learning_rate": 3.909668433211035e-05, + "loss": 3.9166, + "step": 71740 + }, + { + "epoch": 4.874643293925805, + "grad_norm": 0.16733743250370026, + "learning_rate": 3.909243783122707e-05, + "loss": 3.9511, + "step": 71745 + }, + { + "epoch": 4.8749830139964665, + "grad_norm": 0.148252934217453, + "learning_rate": 3.9088191330343796e-05, + "loss": 3.7977, + "step": 71750 + }, + { + "epoch": 4.875322734067129, + "grad_norm": 0.16256879270076752, + "learning_rate": 3.908394482946053e-05, + "loss": 3.6372, + "step": 71755 + }, + { + "epoch": 4.875662454137791, + "grad_norm": 0.37533286213874817, + "learning_rate": 3.907969832857725e-05, + "loss": 3.7312, + "step": 71760 + }, + { + "epoch": 4.876002174208452, + "grad_norm": 0.17444011569023132, + "learning_rate": 3.907545182769398e-05, + "loss": 3.9902, + "step": 71765 + }, + { + "epoch": 4.876341894279114, + "grad_norm": 0.19403117895126343, + "learning_rate": 3.907120532681071e-05, + "loss": 3.6546, + "step": 71770 + }, + { + "epoch": 4.876681614349776, + "grad_norm": 0.1689661294221878, + "learning_rate": 3.9066958825927436e-05, + "loss": 3.916, + "step": 71775 + }, + { + "epoch": 4.877021334420437, + "grad_norm": 0.14284206926822662, + "learning_rate": 3.9062712325044164e-05, + "loss": 4.0835, + "step": 71780 + }, + { + "epoch": 4.877361054491099, + "grad_norm": 0.22097435593605042, + "learning_rate": 3.905931512433755e-05, + "loss": 3.8279, + "step": 71785 + }, + { + "epoch": 4.877700774561761, + "grad_norm": 0.1865473836660385, + "learning_rate": 3.9055068623454275e-05, + "loss": 3.8887, + "step": 71790 + }, + { + "epoch": 4.8780404946324225, + "grad_norm": 0.13834941387176514, + "learning_rate": 3.9050822122571e-05, + "loss": 3.6881, + "step": 71795 + }, + { + "epoch": 4.878380214703085, + "grad_norm": 0.15298141539096832, + "learning_rate": 3.904657562168773e-05, + "loss": 3.8249, + "step": 71800 + }, + { + "epoch": 4.878719934773747, + "grad_norm": 0.2141285091638565, + "learning_rate": 3.904232912080446e-05, + "loss": 3.8188, + "step": 71805 + }, + { + "epoch": 4.879059654844408, + "grad_norm": 7.253503322601318, + "learning_rate": 3.903808261992119e-05, + "loss": 4.0264, + "step": 71810 + }, + { + "epoch": 4.87939937491507, + "grad_norm": 0.19867679476737976, + "learning_rate": 3.9033836119037915e-05, + "loss": 3.9342, + "step": 71815 + }, + { + "epoch": 4.879739094985732, + "grad_norm": 0.2104354351758957, + "learning_rate": 3.902958961815464e-05, + "loss": 3.8173, + "step": 71820 + }, + { + "epoch": 4.880078815056393, + "grad_norm": 0.12295512855052948, + "learning_rate": 3.902534311727137e-05, + "loss": 3.7663, + "step": 71825 + }, + { + "epoch": 4.880418535127055, + "grad_norm": 0.19260717928409576, + "learning_rate": 3.90210966163881e-05, + "loss": 3.9024, + "step": 71830 + }, + { + "epoch": 4.880758255197717, + "grad_norm": 0.20099477469921112, + "learning_rate": 3.901685011550483e-05, + "loss": 4.0416, + "step": 71835 + }, + { + "epoch": 4.8810979752683785, + "grad_norm": 0.1658513844013214, + "learning_rate": 3.901260361462155e-05, + "loss": 3.8693, + "step": 71840 + }, + { + "epoch": 4.881437695339041, + "grad_norm": 0.22231899201869965, + "learning_rate": 3.900835711373828e-05, + "loss": 3.9364, + "step": 71845 + }, + { + "epoch": 4.881777415409703, + "grad_norm": 0.2105216383934021, + "learning_rate": 3.900411061285501e-05, + "loss": 3.7883, + "step": 71850 + }, + { + "epoch": 4.882117135480364, + "grad_norm": 0.15595297515392303, + "learning_rate": 3.899986411197173e-05, + "loss": 3.7448, + "step": 71855 + }, + { + "epoch": 4.882456855551026, + "grad_norm": 0.20417992770671844, + "learning_rate": 3.899561761108847e-05, + "loss": 3.9296, + "step": 71860 + }, + { + "epoch": 4.882796575621688, + "grad_norm": 2.268004894256592, + "learning_rate": 3.8991371110205195e-05, + "loss": 3.9939, + "step": 71865 + }, + { + "epoch": 4.883136295692349, + "grad_norm": 0.2435683012008667, + "learning_rate": 3.898712460932192e-05, + "loss": 4.1047, + "step": 71870 + }, + { + "epoch": 4.883476015763011, + "grad_norm": 0.22493667900562286, + "learning_rate": 3.8982878108438644e-05, + "loss": 3.8376, + "step": 71875 + }, + { + "epoch": 4.883815735833673, + "grad_norm": 0.35133954882621765, + "learning_rate": 3.897863160755538e-05, + "loss": 4.0296, + "step": 71880 + }, + { + "epoch": 4.8841554559043345, + "grad_norm": 0.24889473617076874, + "learning_rate": 3.897438510667211e-05, + "loss": 3.8983, + "step": 71885 + }, + { + "epoch": 4.884495175974997, + "grad_norm": 0.21176810562610626, + "learning_rate": 3.897013860578883e-05, + "loss": 3.7554, + "step": 71890 + }, + { + "epoch": 4.884834896045659, + "grad_norm": 0.2369702160358429, + "learning_rate": 3.896589210490556e-05, + "loss": 3.7555, + "step": 71895 + }, + { + "epoch": 4.88517461611632, + "grad_norm": 0.2244737595319748, + "learning_rate": 3.896164560402229e-05, + "loss": 3.7425, + "step": 71900 + }, + { + "epoch": 4.885514336186982, + "grad_norm": 0.3552957773208618, + "learning_rate": 3.895739910313901e-05, + "loss": 3.8896, + "step": 71905 + }, + { + "epoch": 4.885854056257644, + "grad_norm": 0.2196578085422516, + "learning_rate": 3.895315260225574e-05, + "loss": 3.7102, + "step": 71910 + }, + { + "epoch": 4.886193776328305, + "grad_norm": 0.1441468745470047, + "learning_rate": 3.8948906101372475e-05, + "loss": 3.8331, + "step": 71915 + }, + { + "epoch": 4.886533496398967, + "grad_norm": 0.17527687549591064, + "learning_rate": 3.8944659600489196e-05, + "loss": 3.8485, + "step": 71920 + }, + { + "epoch": 4.886873216469629, + "grad_norm": 0.2507105767726898, + "learning_rate": 3.8940413099605924e-05, + "loss": 3.7839, + "step": 71925 + }, + { + "epoch": 4.8872129365402905, + "grad_norm": 0.3216758370399475, + "learning_rate": 3.893616659872266e-05, + "loss": 3.924, + "step": 71930 + }, + { + "epoch": 4.887552656610953, + "grad_norm": 0.17979775369167328, + "learning_rate": 3.893192009783938e-05, + "loss": 3.8379, + "step": 71935 + }, + { + "epoch": 4.887892376681615, + "grad_norm": 0.27178728580474854, + "learning_rate": 3.892767359695611e-05, + "loss": 4.0922, + "step": 71940 + }, + { + "epoch": 4.888232096752276, + "grad_norm": 0.1975814700126648, + "learning_rate": 3.892342709607284e-05, + "loss": 3.8592, + "step": 71945 + }, + { + "epoch": 4.888571816822938, + "grad_norm": 0.14762239158153534, + "learning_rate": 3.8919180595189564e-05, + "loss": 3.91, + "step": 71950 + }, + { + "epoch": 4.8889115368936, + "grad_norm": 0.18023093044757843, + "learning_rate": 3.891493409430629e-05, + "loss": 4.046, + "step": 71955 + }, + { + "epoch": 4.889251256964261, + "grad_norm": 0.2042824923992157, + "learning_rate": 3.891068759342302e-05, + "loss": 3.55, + "step": 71960 + }, + { + "epoch": 4.889590977034923, + "grad_norm": 0.16152100265026093, + "learning_rate": 3.890644109253975e-05, + "loss": 3.7869, + "step": 71965 + }, + { + "epoch": 4.889930697105585, + "grad_norm": 0.1672319769859314, + "learning_rate": 3.8902194591656476e-05, + "loss": 3.617, + "step": 71970 + }, + { + "epoch": 4.8902704171762466, + "grad_norm": 0.182560995221138, + "learning_rate": 3.8897948090773204e-05, + "loss": 4.011, + "step": 71975 + }, + { + "epoch": 4.890610137246909, + "grad_norm": 0.18675725162029266, + "learning_rate": 3.889370158988993e-05, + "loss": 3.9351, + "step": 71980 + }, + { + "epoch": 4.890949857317571, + "grad_norm": 0.1519428789615631, + "learning_rate": 3.888945508900666e-05, + "loss": 3.9953, + "step": 71985 + }, + { + "epoch": 4.891289577388232, + "grad_norm": 0.19888585805892944, + "learning_rate": 3.888520858812339e-05, + "loss": 4.1182, + "step": 71990 + }, + { + "epoch": 4.891629297458894, + "grad_norm": 0.3212395906448364, + "learning_rate": 3.8880962087240116e-05, + "loss": 4.0123, + "step": 71995 + }, + { + "epoch": 4.891969017529556, + "grad_norm": 0.31388548016548157, + "learning_rate": 3.8876715586356844e-05, + "loss": 3.8778, + "step": 72000 + }, + { + "epoch": 4.892308737600217, + "grad_norm": 0.17823609709739685, + "learning_rate": 3.887246908547357e-05, + "loss": 3.9263, + "step": 72005 + }, + { + "epoch": 4.892648457670879, + "grad_norm": 0.1934792846441269, + "learning_rate": 3.8868222584590294e-05, + "loss": 3.8255, + "step": 72010 + }, + { + "epoch": 4.892988177741541, + "grad_norm": 0.21585750579833984, + "learning_rate": 3.886397608370703e-05, + "loss": 3.9614, + "step": 72015 + }, + { + "epoch": 4.893327897812203, + "grad_norm": 0.20794019103050232, + "learning_rate": 3.8859729582823756e-05, + "loss": 4.0204, + "step": 72020 + }, + { + "epoch": 4.893667617882865, + "grad_norm": 0.17856602370738983, + "learning_rate": 3.885548308194048e-05, + "loss": 3.7973, + "step": 72025 + }, + { + "epoch": 4.894007337953527, + "grad_norm": 0.21771477162837982, + "learning_rate": 3.885123658105721e-05, + "loss": 4.027, + "step": 72030 + }, + { + "epoch": 4.894347058024188, + "grad_norm": 0.1301996409893036, + "learning_rate": 3.884699008017394e-05, + "loss": 3.4542, + "step": 72035 + }, + { + "epoch": 4.89468677809485, + "grad_norm": 3.036611557006836, + "learning_rate": 3.884274357929067e-05, + "loss": 4.05, + "step": 72040 + }, + { + "epoch": 4.895026498165512, + "grad_norm": 0.21658095717430115, + "learning_rate": 3.8838497078407396e-05, + "loss": 3.9674, + "step": 72045 + }, + { + "epoch": 4.895366218236173, + "grad_norm": 0.18447330594062805, + "learning_rate": 3.8834250577524124e-05, + "loss": 3.8112, + "step": 72050 + }, + { + "epoch": 4.895705938306835, + "grad_norm": 0.1676894873380661, + "learning_rate": 3.883000407664085e-05, + "loss": 3.905, + "step": 72055 + }, + { + "epoch": 4.896045658377497, + "grad_norm": 0.21201524138450623, + "learning_rate": 3.8825757575757574e-05, + "loss": 3.8551, + "step": 72060 + }, + { + "epoch": 4.896385378448159, + "grad_norm": 0.17701676487922668, + "learning_rate": 3.882151107487431e-05, + "loss": 3.913, + "step": 72065 + }, + { + "epoch": 4.896725098518821, + "grad_norm": 0.22461919486522675, + "learning_rate": 3.8817264573991036e-05, + "loss": 3.9333, + "step": 72070 + }, + { + "epoch": 4.897064818589483, + "grad_norm": 0.13804891705513, + "learning_rate": 3.881301807310776e-05, + "loss": 3.8006, + "step": 72075 + }, + { + "epoch": 4.897404538660144, + "grad_norm": 0.19338636100292206, + "learning_rate": 3.880877157222449e-05, + "loss": 3.7728, + "step": 72080 + }, + { + "epoch": 4.897744258730806, + "grad_norm": 0.14880546927452087, + "learning_rate": 3.880452507134122e-05, + "loss": 4.0198, + "step": 72085 + }, + { + "epoch": 4.898083978801468, + "grad_norm": 0.24771542847156525, + "learning_rate": 3.880027857045794e-05, + "loss": 4.0426, + "step": 72090 + }, + { + "epoch": 4.898423698872129, + "grad_norm": 0.21423077583312988, + "learning_rate": 3.879603206957467e-05, + "loss": 3.8637, + "step": 72095 + }, + { + "epoch": 4.898763418942791, + "grad_norm": 0.16023032367229462, + "learning_rate": 3.8791785568691405e-05, + "loss": 3.7747, + "step": 72100 + }, + { + "epoch": 4.899103139013453, + "grad_norm": 0.16227012872695923, + "learning_rate": 3.8787539067808126e-05, + "loss": 3.9476, + "step": 72105 + }, + { + "epoch": 4.899442859084115, + "grad_norm": 4.171177387237549, + "learning_rate": 3.8783292566924854e-05, + "loss": 3.8418, + "step": 72110 + }, + { + "epoch": 4.899782579154777, + "grad_norm": 0.229423388838768, + "learning_rate": 3.877904606604159e-05, + "loss": 3.6997, + "step": 72115 + }, + { + "epoch": 4.900122299225439, + "grad_norm": 0.15706223249435425, + "learning_rate": 3.877479956515831e-05, + "loss": 3.7952, + "step": 72120 + }, + { + "epoch": 4.9004620192961, + "grad_norm": 0.4445186257362366, + "learning_rate": 3.877055306427504e-05, + "loss": 3.585, + "step": 72125 + }, + { + "epoch": 4.900801739366762, + "grad_norm": 0.31066882610321045, + "learning_rate": 3.8766306563391766e-05, + "loss": 3.9685, + "step": 72130 + }, + { + "epoch": 4.901141459437423, + "grad_norm": 3.4435689449310303, + "learning_rate": 3.8762060062508494e-05, + "loss": 3.8483, + "step": 72135 + }, + { + "epoch": 4.901481179508085, + "grad_norm": 0.5134146809577942, + "learning_rate": 3.875781356162522e-05, + "loss": 3.7508, + "step": 72140 + }, + { + "epoch": 4.901820899578747, + "grad_norm": 0.16822563111782074, + "learning_rate": 3.875356706074195e-05, + "loss": 3.8882, + "step": 72145 + }, + { + "epoch": 4.9021606196494085, + "grad_norm": 0.15515491366386414, + "learning_rate": 3.874932055985868e-05, + "loss": 3.992, + "step": 72150 + }, + { + "epoch": 4.902500339720071, + "grad_norm": 0.16266097128391266, + "learning_rate": 3.8745074058975406e-05, + "loss": 3.7982, + "step": 72155 + }, + { + "epoch": 4.902840059790733, + "grad_norm": 0.15677626430988312, + "learning_rate": 3.8740827558092134e-05, + "loss": 3.8562, + "step": 72160 + }, + { + "epoch": 4.903179779861394, + "grad_norm": 0.18253152072429657, + "learning_rate": 3.873658105720886e-05, + "loss": 3.7941, + "step": 72165 + }, + { + "epoch": 4.903519499932056, + "grad_norm": 0.1920611560344696, + "learning_rate": 3.873233455632559e-05, + "loss": 3.8615, + "step": 72170 + }, + { + "epoch": 4.903859220002718, + "grad_norm": 0.18373823165893555, + "learning_rate": 3.872808805544232e-05, + "loss": 3.8143, + "step": 72175 + }, + { + "epoch": 4.904198940073379, + "grad_norm": 0.14026641845703125, + "learning_rate": 3.8723841554559046e-05, + "loss": 3.9798, + "step": 72180 + }, + { + "epoch": 4.904538660144041, + "grad_norm": 0.19379866123199463, + "learning_rate": 3.8719595053675774e-05, + "loss": 4.0293, + "step": 72185 + }, + { + "epoch": 4.904878380214703, + "grad_norm": 0.1379636824131012, + "learning_rate": 3.87153485527925e-05, + "loss": 3.9953, + "step": 72190 + }, + { + "epoch": 4.9052181002853645, + "grad_norm": 0.19233964383602142, + "learning_rate": 3.871110205190922e-05, + "loss": 3.921, + "step": 72195 + }, + { + "epoch": 4.905557820356027, + "grad_norm": 0.1828608363866806, + "learning_rate": 3.870685555102596e-05, + "loss": 3.8556, + "step": 72200 + }, + { + "epoch": 4.905897540426689, + "grad_norm": 0.17232359945774078, + "learning_rate": 3.8702609050142686e-05, + "loss": 3.8416, + "step": 72205 + }, + { + "epoch": 4.90623726049735, + "grad_norm": 0.17605946958065033, + "learning_rate": 3.8698362549259414e-05, + "loss": 3.8502, + "step": 72210 + }, + { + "epoch": 4.906576980568012, + "grad_norm": 0.17292532324790955, + "learning_rate": 3.869411604837614e-05, + "loss": 3.8171, + "step": 72215 + }, + { + "epoch": 4.906916700638674, + "grad_norm": 0.22274336218833923, + "learning_rate": 3.868986954749287e-05, + "loss": 3.6762, + "step": 72220 + }, + { + "epoch": 4.907256420709335, + "grad_norm": 0.21605151891708374, + "learning_rate": 3.86856230466096e-05, + "loss": 3.8975, + "step": 72225 + }, + { + "epoch": 4.907596140779997, + "grad_norm": 0.1994885951280594, + "learning_rate": 3.868137654572632e-05, + "loss": 3.6714, + "step": 72230 + }, + { + "epoch": 4.907935860850659, + "grad_norm": 0.18399763107299805, + "learning_rate": 3.8677130044843054e-05, + "loss": 3.7122, + "step": 72235 + }, + { + "epoch": 4.9082755809213205, + "grad_norm": 0.16343262791633606, + "learning_rate": 3.867288354395978e-05, + "loss": 3.9736, + "step": 72240 + }, + { + "epoch": 4.908615300991983, + "grad_norm": 0.15261657536029816, + "learning_rate": 3.86686370430765e-05, + "loss": 3.8899, + "step": 72245 + }, + { + "epoch": 4.908955021062645, + "grad_norm": 0.17948250472545624, + "learning_rate": 3.866439054219324e-05, + "loss": 3.7081, + "step": 72250 + }, + { + "epoch": 4.909294741133306, + "grad_norm": 0.23126499354839325, + "learning_rate": 3.8660144041309966e-05, + "loss": 3.9144, + "step": 72255 + }, + { + "epoch": 4.909634461203968, + "grad_norm": 0.18646451830863953, + "learning_rate": 3.865589754042669e-05, + "loss": 3.9102, + "step": 72260 + }, + { + "epoch": 4.90997418127463, + "grad_norm": 0.17950347065925598, + "learning_rate": 3.8651651039543415e-05, + "loss": 3.8449, + "step": 72265 + }, + { + "epoch": 4.910313901345291, + "grad_norm": 0.17824238538742065, + "learning_rate": 3.864740453866015e-05, + "loss": 3.6865, + "step": 72270 + }, + { + "epoch": 4.910653621415953, + "grad_norm": 0.24262277781963348, + "learning_rate": 3.864315803777687e-05, + "loss": 3.7333, + "step": 72275 + }, + { + "epoch": 4.910993341486615, + "grad_norm": 0.16049058735370636, + "learning_rate": 3.86389115368936e-05, + "loss": 3.7558, + "step": 72280 + }, + { + "epoch": 4.911333061557277, + "grad_norm": 0.2145972102880478, + "learning_rate": 3.8634665036010334e-05, + "loss": 3.8185, + "step": 72285 + }, + { + "epoch": 4.911672781627939, + "grad_norm": 0.3061845004558563, + "learning_rate": 3.8630418535127055e-05, + "loss": 3.6126, + "step": 72290 + }, + { + "epoch": 4.912012501698601, + "grad_norm": 0.19022175669670105, + "learning_rate": 3.862617203424378e-05, + "loss": 3.6495, + "step": 72295 + }, + { + "epoch": 4.912352221769262, + "grad_norm": 0.40585681796073914, + "learning_rate": 3.862192553336051e-05, + "loss": 3.6899, + "step": 72300 + }, + { + "epoch": 4.912691941839924, + "grad_norm": 0.16769440472126007, + "learning_rate": 3.861767903247724e-05, + "loss": 3.9072, + "step": 72305 + }, + { + "epoch": 4.913031661910586, + "grad_norm": 0.21183528006076813, + "learning_rate": 3.861343253159397e-05, + "loss": 3.9364, + "step": 72310 + }, + { + "epoch": 4.913371381981247, + "grad_norm": 0.22043752670288086, + "learning_rate": 3.8609186030710695e-05, + "loss": 3.665, + "step": 72315 + }, + { + "epoch": 4.913711102051909, + "grad_norm": 0.1631031036376953, + "learning_rate": 3.860493952982742e-05, + "loss": 3.98, + "step": 72320 + }, + { + "epoch": 4.914050822122571, + "grad_norm": 4.991234302520752, + "learning_rate": 3.860069302894415e-05, + "loss": 3.8213, + "step": 72325 + }, + { + "epoch": 4.914390542193233, + "grad_norm": 0.1674109548330307, + "learning_rate": 3.859644652806088e-05, + "loss": 3.5736, + "step": 72330 + }, + { + "epoch": 4.914730262263895, + "grad_norm": 0.17067851126194, + "learning_rate": 3.859220002717761e-05, + "loss": 3.8308, + "step": 72335 + }, + { + "epoch": 4.915069982334556, + "grad_norm": 0.1667201817035675, + "learning_rate": 3.8587953526294335e-05, + "loss": 3.9228, + "step": 72340 + }, + { + "epoch": 4.915409702405218, + "grad_norm": 0.21495026350021362, + "learning_rate": 3.858370702541106e-05, + "loss": 3.825, + "step": 72345 + }, + { + "epoch": 4.91574942247588, + "grad_norm": 0.19253632426261902, + "learning_rate": 3.857946052452779e-05, + "loss": 3.8892, + "step": 72350 + }, + { + "epoch": 4.916089142546541, + "grad_norm": 0.15088139474391937, + "learning_rate": 3.857521402364452e-05, + "loss": 3.869, + "step": 72355 + }, + { + "epoch": 4.916428862617203, + "grad_norm": 0.15251076221466064, + "learning_rate": 3.857096752276125e-05, + "loss": 3.8746, + "step": 72360 + }, + { + "epoch": 4.916768582687865, + "grad_norm": 0.24500082433223724, + "learning_rate": 3.856672102187797e-05, + "loss": 3.9544, + "step": 72365 + }, + { + "epoch": 4.9171083027585265, + "grad_norm": 0.1682807356119156, + "learning_rate": 3.8562474520994703e-05, + "loss": 3.7186, + "step": 72370 + }, + { + "epoch": 4.917448022829189, + "grad_norm": 0.3451423943042755, + "learning_rate": 3.855822802011143e-05, + "loss": 3.9544, + "step": 72375 + }, + { + "epoch": 4.917787742899851, + "grad_norm": 0.30700841546058655, + "learning_rate": 3.855398151922816e-05, + "loss": 3.839, + "step": 72380 + }, + { + "epoch": 4.918127462970512, + "grad_norm": 0.2921653389930725, + "learning_rate": 3.854973501834489e-05, + "loss": 3.9336, + "step": 72385 + }, + { + "epoch": 4.918467183041174, + "grad_norm": 0.1795329749584198, + "learning_rate": 3.8545488517461615e-05, + "loss": 3.914, + "step": 72390 + }, + { + "epoch": 4.918806903111836, + "grad_norm": 0.211915522813797, + "learning_rate": 3.8541242016578343e-05, + "loss": 3.919, + "step": 72395 + }, + { + "epoch": 4.919146623182497, + "grad_norm": 0.1388920247554779, + "learning_rate": 3.8536995515695065e-05, + "loss": 3.9417, + "step": 72400 + }, + { + "epoch": 4.919486343253159, + "grad_norm": 0.22770802676677704, + "learning_rate": 3.85327490148118e-05, + "loss": 3.7833, + "step": 72405 + }, + { + "epoch": 4.919826063323821, + "grad_norm": 0.2512055039405823, + "learning_rate": 3.852850251392853e-05, + "loss": 3.8526, + "step": 72410 + }, + { + "epoch": 4.9201657833944825, + "grad_norm": 0.2240126132965088, + "learning_rate": 3.852425601304525e-05, + "loss": 3.7387, + "step": 72415 + }, + { + "epoch": 4.920505503465145, + "grad_norm": 0.18338549137115479, + "learning_rate": 3.8520009512161983e-05, + "loss": 3.842, + "step": 72420 + }, + { + "epoch": 4.920845223535807, + "grad_norm": 0.6848808526992798, + "learning_rate": 3.851576301127871e-05, + "loss": 3.7074, + "step": 72425 + }, + { + "epoch": 4.921184943606468, + "grad_norm": 0.21325811743736267, + "learning_rate": 3.851151651039543e-05, + "loss": 3.6922, + "step": 72430 + }, + { + "epoch": 4.92152466367713, + "grad_norm": 0.19025640189647675, + "learning_rate": 3.850727000951217e-05, + "loss": 3.7772, + "step": 72435 + }, + { + "epoch": 4.921864383747792, + "grad_norm": 0.1612997204065323, + "learning_rate": 3.8503023508628896e-05, + "loss": 3.8084, + "step": 72440 + }, + { + "epoch": 4.922204103818453, + "grad_norm": 0.16416847705841064, + "learning_rate": 3.849877700774562e-05, + "loss": 3.8707, + "step": 72445 + }, + { + "epoch": 4.922543823889115, + "grad_norm": 0.2123931646347046, + "learning_rate": 3.8494530506862345e-05, + "loss": 3.6246, + "step": 72450 + }, + { + "epoch": 4.922883543959777, + "grad_norm": 0.1730690896511078, + "learning_rate": 3.849028400597908e-05, + "loss": 3.8224, + "step": 72455 + }, + { + "epoch": 4.9232232640304385, + "grad_norm": 0.151531383395195, + "learning_rate": 3.84860375050958e-05, + "loss": 3.9202, + "step": 72460 + }, + { + "epoch": 4.923562984101101, + "grad_norm": 0.15297257900238037, + "learning_rate": 3.848179100421253e-05, + "loss": 3.9036, + "step": 72465 + }, + { + "epoch": 4.923902704171763, + "grad_norm": 0.14339210093021393, + "learning_rate": 3.8477544503329264e-05, + "loss": 3.9623, + "step": 72470 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.15311217308044434, + "learning_rate": 3.8473298002445985e-05, + "loss": 4.0038, + "step": 72475 + }, + { + "epoch": 4.924582144313086, + "grad_norm": 0.13712939620018005, + "learning_rate": 3.846905150156271e-05, + "loss": 3.667, + "step": 72480 + }, + { + "epoch": 4.924921864383748, + "grad_norm": 0.18885189294815063, + "learning_rate": 3.846480500067944e-05, + "loss": 3.7213, + "step": 72485 + }, + { + "epoch": 4.925261584454409, + "grad_norm": 0.1700606644153595, + "learning_rate": 3.846055849979617e-05, + "loss": 3.7997, + "step": 72490 + }, + { + "epoch": 4.925601304525071, + "grad_norm": 0.2763736844062805, + "learning_rate": 3.84563119989129e-05, + "loss": 3.701, + "step": 72495 + }, + { + "epoch": 4.925941024595733, + "grad_norm": 0.18644395470619202, + "learning_rate": 3.8452065498029625e-05, + "loss": 3.7864, + "step": 72500 + }, + { + "epoch": 4.9262807446663945, + "grad_norm": 0.19987325370311737, + "learning_rate": 3.844781899714635e-05, + "loss": 3.7797, + "step": 72505 + }, + { + "epoch": 4.926620464737057, + "grad_norm": 0.2858733832836151, + "learning_rate": 3.844357249626308e-05, + "loss": 3.9917, + "step": 72510 + }, + { + "epoch": 4.926960184807719, + "grad_norm": 0.964931309223175, + "learning_rate": 3.843932599537981e-05, + "loss": 4.1138, + "step": 72515 + }, + { + "epoch": 4.92729990487838, + "grad_norm": 0.17748647928237915, + "learning_rate": 3.843507949449654e-05, + "loss": 3.9141, + "step": 72520 + }, + { + "epoch": 4.927639624949042, + "grad_norm": 0.2683764696121216, + "learning_rate": 3.8430832993613265e-05, + "loss": 3.6883, + "step": 72525 + }, + { + "epoch": 4.927979345019704, + "grad_norm": 0.15615569055080414, + "learning_rate": 3.842658649272999e-05, + "loss": 3.7405, + "step": 72530 + }, + { + "epoch": 4.928319065090365, + "grad_norm": 0.24683670699596405, + "learning_rate": 3.842233999184672e-05, + "loss": 3.7398, + "step": 72535 + }, + { + "epoch": 4.928658785161027, + "grad_norm": 0.16532504558563232, + "learning_rate": 3.841809349096345e-05, + "loss": 3.9842, + "step": 72540 + }, + { + "epoch": 4.928998505231689, + "grad_norm": 0.16461913287639618, + "learning_rate": 3.841384699008018e-05, + "loss": 3.7261, + "step": 72545 + }, + { + "epoch": 4.9293382253023506, + "grad_norm": 0.22772705554962158, + "learning_rate": 3.8409600489196905e-05, + "loss": 3.7936, + "step": 72550 + }, + { + "epoch": 4.929677945373013, + "grad_norm": 0.1348925530910492, + "learning_rate": 3.840535398831363e-05, + "loss": 3.877, + "step": 72555 + }, + { + "epoch": 4.930017665443675, + "grad_norm": 0.15962335467338562, + "learning_rate": 3.840110748743036e-05, + "loss": 3.6239, + "step": 72560 + }, + { + "epoch": 4.930357385514336, + "grad_norm": 0.1853727251291275, + "learning_rate": 3.839686098654709e-05, + "loss": 3.8897, + "step": 72565 + }, + { + "epoch": 4.930697105584998, + "grad_norm": 0.2208378165960312, + "learning_rate": 3.839261448566382e-05, + "loss": 3.9479, + "step": 72570 + }, + { + "epoch": 4.93103682565566, + "grad_norm": 0.2194095253944397, + "learning_rate": 3.8388367984780545e-05, + "loss": 4.0019, + "step": 72575 + }, + { + "epoch": 4.931376545726321, + "grad_norm": 0.15925417840480804, + "learning_rate": 3.838412148389727e-05, + "loss": 3.9518, + "step": 72580 + }, + { + "epoch": 4.931716265796983, + "grad_norm": 0.5062684416770935, + "learning_rate": 3.8379874983013994e-05, + "loss": 3.9125, + "step": 72585 + }, + { + "epoch": 4.932055985867645, + "grad_norm": 0.33637744188308716, + "learning_rate": 3.837562848213073e-05, + "loss": 3.8659, + "step": 72590 + }, + { + "epoch": 4.932395705938307, + "grad_norm": 0.19923919439315796, + "learning_rate": 3.837138198124746e-05, + "loss": 3.9241, + "step": 72595 + }, + { + "epoch": 4.932735426008969, + "grad_norm": 0.18717125058174133, + "learning_rate": 3.836713548036418e-05, + "loss": 3.9925, + "step": 72600 + }, + { + "epoch": 4.933075146079631, + "grad_norm": 0.14551572501659393, + "learning_rate": 3.836288897948091e-05, + "loss": 4.0619, + "step": 72605 + }, + { + "epoch": 4.933414866150292, + "grad_norm": 0.15943504869937897, + "learning_rate": 3.835864247859764e-05, + "loss": 3.9348, + "step": 72610 + }, + { + "epoch": 4.933754586220954, + "grad_norm": 0.525718092918396, + "learning_rate": 3.835439597771436e-05, + "loss": 3.7771, + "step": 72615 + }, + { + "epoch": 4.934094306291616, + "grad_norm": 0.14558233320713043, + "learning_rate": 3.835014947683109e-05, + "loss": 4.009, + "step": 72620 + }, + { + "epoch": 4.934434026362277, + "grad_norm": 0.27223721146583557, + "learning_rate": 3.8345902975947825e-05, + "loss": 3.622, + "step": 72625 + }, + { + "epoch": 4.934773746432939, + "grad_norm": 0.15596209466457367, + "learning_rate": 3.8341656475064546e-05, + "loss": 3.8456, + "step": 72630 + }, + { + "epoch": 4.935113466503601, + "grad_norm": 0.15175634622573853, + "learning_rate": 3.8337409974181274e-05, + "loss": 3.5968, + "step": 72635 + }, + { + "epoch": 4.935453186574263, + "grad_norm": 0.19105464220046997, + "learning_rate": 3.833316347329801e-05, + "loss": 3.8963, + "step": 72640 + }, + { + "epoch": 4.935792906644925, + "grad_norm": 0.17130893468856812, + "learning_rate": 3.832891697241473e-05, + "loss": 3.8356, + "step": 72645 + }, + { + "epoch": 4.936132626715587, + "grad_norm": 0.17226870357990265, + "learning_rate": 3.832467047153146e-05, + "loss": 3.8682, + "step": 72650 + }, + { + "epoch": 4.936472346786248, + "grad_norm": 0.16385149955749512, + "learning_rate": 3.8320423970648186e-05, + "loss": 3.6309, + "step": 72655 + }, + { + "epoch": 4.93681206685691, + "grad_norm": 0.19694902002811432, + "learning_rate": 3.8316177469764914e-05, + "loss": 3.9459, + "step": 72660 + }, + { + "epoch": 4.937151786927572, + "grad_norm": 0.211028590798378, + "learning_rate": 3.831193096888164e-05, + "loss": 3.9728, + "step": 72665 + }, + { + "epoch": 4.937491506998233, + "grad_norm": 0.13970905542373657, + "learning_rate": 3.830768446799837e-05, + "loss": 3.7798, + "step": 72670 + }, + { + "epoch": 4.937831227068895, + "grad_norm": 0.19130490720272064, + "learning_rate": 3.83034379671151e-05, + "loss": 3.6972, + "step": 72675 + }, + { + "epoch": 4.938170947139557, + "grad_norm": 0.20966161787509918, + "learning_rate": 3.8299191466231826e-05, + "loss": 3.7273, + "step": 72680 + }, + { + "epoch": 4.938510667210219, + "grad_norm": 0.15497541427612305, + "learning_rate": 3.8294944965348554e-05, + "loss": 3.5833, + "step": 72685 + }, + { + "epoch": 4.938850387280881, + "grad_norm": 0.871463418006897, + "learning_rate": 3.829069846446528e-05, + "loss": 3.8095, + "step": 72690 + }, + { + "epoch": 4.939190107351543, + "grad_norm": 0.2503446340560913, + "learning_rate": 3.828645196358201e-05, + "loss": 3.6339, + "step": 72695 + }, + { + "epoch": 4.939529827422204, + "grad_norm": 0.4368094801902771, + "learning_rate": 3.828220546269874e-05, + "loss": 3.845, + "step": 72700 + }, + { + "epoch": 4.939869547492866, + "grad_norm": 0.14862334728240967, + "learning_rate": 3.8277958961815466e-05, + "loss": 3.8093, + "step": 72705 + }, + { + "epoch": 4.940209267563528, + "grad_norm": 0.18597061932086945, + "learning_rate": 3.8273712460932194e-05, + "loss": 3.6754, + "step": 72710 + }, + { + "epoch": 4.940548987634189, + "grad_norm": 0.3323424160480499, + "learning_rate": 3.826946596004892e-05, + "loss": 3.8929, + "step": 72715 + }, + { + "epoch": 4.940888707704851, + "grad_norm": 0.17050494253635406, + "learning_rate": 3.826521945916565e-05, + "loss": 4.0781, + "step": 72720 + }, + { + "epoch": 4.941228427775513, + "grad_norm": 0.13279303908348083, + "learning_rate": 3.826097295828238e-05, + "loss": 3.879, + "step": 72725 + }, + { + "epoch": 4.941568147846175, + "grad_norm": 0.3872393071651459, + "learning_rate": 3.8256726457399106e-05, + "loss": 3.6142, + "step": 72730 + }, + { + "epoch": 4.941907867916837, + "grad_norm": 0.18439556658267975, + "learning_rate": 3.8252479956515834e-05, + "loss": 3.8455, + "step": 72735 + }, + { + "epoch": 4.942247587987499, + "grad_norm": 0.17509707808494568, + "learning_rate": 3.824823345563256e-05, + "loss": 3.8687, + "step": 72740 + }, + { + "epoch": 4.94258730805816, + "grad_norm": 0.2523326277732849, + "learning_rate": 3.824398695474929e-05, + "loss": 3.87, + "step": 72745 + }, + { + "epoch": 4.942927028128822, + "grad_norm": 0.1991262137889862, + "learning_rate": 3.823974045386602e-05, + "loss": 3.8903, + "step": 72750 + }, + { + "epoch": 4.943266748199484, + "grad_norm": 0.22341668605804443, + "learning_rate": 3.823549395298274e-05, + "loss": 3.9623, + "step": 72755 + }, + { + "epoch": 4.943606468270145, + "grad_norm": 0.17470380663871765, + "learning_rate": 3.8231247452099474e-05, + "loss": 3.8316, + "step": 72760 + }, + { + "epoch": 4.943946188340807, + "grad_norm": 0.18429365754127502, + "learning_rate": 3.82270009512162e-05, + "loss": 3.9166, + "step": 72765 + }, + { + "epoch": 4.944285908411469, + "grad_norm": 1.4978739023208618, + "learning_rate": 3.8222754450332924e-05, + "loss": 3.8065, + "step": 72770 + }, + { + "epoch": 4.944625628482131, + "grad_norm": 0.17947335541248322, + "learning_rate": 3.821850794944966e-05, + "loss": 3.9388, + "step": 72775 + }, + { + "epoch": 4.944965348552793, + "grad_norm": 0.1714787483215332, + "learning_rate": 3.8214261448566386e-05, + "loss": 4.0758, + "step": 72780 + }, + { + "epoch": 4.945305068623455, + "grad_norm": 0.1597059667110443, + "learning_rate": 3.821001494768311e-05, + "loss": 3.8248, + "step": 72785 + }, + { + "epoch": 4.945644788694116, + "grad_norm": 0.1765446662902832, + "learning_rate": 3.8205768446799836e-05, + "loss": 3.8896, + "step": 72790 + }, + { + "epoch": 4.945984508764778, + "grad_norm": 0.14699159562587738, + "learning_rate": 3.820152194591657e-05, + "loss": 3.8219, + "step": 72795 + }, + { + "epoch": 4.94632422883544, + "grad_norm": 0.17608724534511566, + "learning_rate": 3.819727544503329e-05, + "loss": 3.9945, + "step": 72800 + }, + { + "epoch": 4.946663948906101, + "grad_norm": 0.14378570020198822, + "learning_rate": 3.819302894415002e-05, + "loss": 3.8179, + "step": 72805 + }, + { + "epoch": 4.947003668976763, + "grad_norm": 0.18977637588977814, + "learning_rate": 3.8188782443266755e-05, + "loss": 3.6947, + "step": 72810 + }, + { + "epoch": 4.9473433890474245, + "grad_norm": 0.15665970742702484, + "learning_rate": 3.8184535942383476e-05, + "loss": 3.8922, + "step": 72815 + }, + { + "epoch": 4.947683109118087, + "grad_norm": 0.14871209859848022, + "learning_rate": 3.8180289441500204e-05, + "loss": 3.4914, + "step": 72820 + }, + { + "epoch": 4.948022829188749, + "grad_norm": 0.18140652775764465, + "learning_rate": 3.817604294061693e-05, + "loss": 3.8254, + "step": 72825 + }, + { + "epoch": 4.94836254925941, + "grad_norm": 0.2401859313249588, + "learning_rate": 3.817179643973366e-05, + "loss": 3.7305, + "step": 72830 + }, + { + "epoch": 4.948702269330072, + "grad_norm": 0.20289787650108337, + "learning_rate": 3.816754993885039e-05, + "loss": 4.0477, + "step": 72835 + }, + { + "epoch": 4.949041989400734, + "grad_norm": 0.4909050166606903, + "learning_rate": 3.8163303437967116e-05, + "loss": 3.8293, + "step": 72840 + }, + { + "epoch": 4.949381709471395, + "grad_norm": 0.21425026655197144, + "learning_rate": 3.8159056937083844e-05, + "loss": 3.9043, + "step": 72845 + }, + { + "epoch": 4.949721429542057, + "grad_norm": 0.15952950716018677, + "learning_rate": 3.815481043620057e-05, + "loss": 3.8635, + "step": 72850 + }, + { + "epoch": 4.950061149612719, + "grad_norm": 0.22536997497081757, + "learning_rate": 3.81505639353173e-05, + "loss": 3.421, + "step": 72855 + }, + { + "epoch": 4.9504008696833806, + "grad_norm": 0.3664668798446655, + "learning_rate": 3.814631743443403e-05, + "loss": 3.821, + "step": 72860 + }, + { + "epoch": 4.950740589754043, + "grad_norm": 0.17207922041416168, + "learning_rate": 3.8142070933550756e-05, + "loss": 3.943, + "step": 72865 + }, + { + "epoch": 4.951080309824705, + "grad_norm": 0.2138483226299286, + "learning_rate": 3.8137824432667484e-05, + "loss": 3.9872, + "step": 72870 + }, + { + "epoch": 4.951420029895366, + "grad_norm": 0.9588253498077393, + "learning_rate": 3.813357793178421e-05, + "loss": 3.9267, + "step": 72875 + }, + { + "epoch": 4.951759749966028, + "grad_norm": 1.1777498722076416, + "learning_rate": 3.812933143090094e-05, + "loss": 3.9194, + "step": 72880 + }, + { + "epoch": 4.95209947003669, + "grad_norm": 0.17436912655830383, + "learning_rate": 3.812508493001767e-05, + "loss": 3.8797, + "step": 72885 + }, + { + "epoch": 4.952439190107351, + "grad_norm": 0.15836142003536224, + "learning_rate": 3.8120838429134396e-05, + "loss": 3.7962, + "step": 72890 + }, + { + "epoch": 4.952778910178013, + "grad_norm": 0.16100765764713287, + "learning_rate": 3.8116591928251124e-05, + "loss": 4.0156, + "step": 72895 + }, + { + "epoch": 4.953118630248675, + "grad_norm": 0.20194533467292786, + "learning_rate": 3.811234542736785e-05, + "loss": 3.6002, + "step": 72900 + }, + { + "epoch": 4.953458350319337, + "grad_norm": 0.32131901383399963, + "learning_rate": 3.810809892648458e-05, + "loss": 3.7348, + "step": 72905 + }, + { + "epoch": 4.953798070389999, + "grad_norm": 0.14308445155620575, + "learning_rate": 3.810385242560131e-05, + "loss": 3.845, + "step": 72910 + }, + { + "epoch": 4.954137790460661, + "grad_norm": 0.14491215348243713, + "learning_rate": 3.8099605924718036e-05, + "loss": 4.0078, + "step": 72915 + }, + { + "epoch": 4.954477510531322, + "grad_norm": 0.9796831607818604, + "learning_rate": 3.8095359423834764e-05, + "loss": 3.8255, + "step": 72920 + }, + { + "epoch": 4.954817230601984, + "grad_norm": 0.17626185715198517, + "learning_rate": 3.8091112922951485e-05, + "loss": 3.8791, + "step": 72925 + }, + { + "epoch": 4.955156950672646, + "grad_norm": 0.14583995938301086, + "learning_rate": 3.808686642206822e-05, + "loss": 3.9501, + "step": 72930 + }, + { + "epoch": 4.955496670743307, + "grad_norm": 0.29210302233695984, + "learning_rate": 3.808261992118495e-05, + "loss": 3.684, + "step": 72935 + }, + { + "epoch": 4.955836390813969, + "grad_norm": 0.19049431383609772, + "learning_rate": 3.807837342030167e-05, + "loss": 4.0167, + "step": 72940 + }, + { + "epoch": 4.956176110884631, + "grad_norm": 0.21780745685100555, + "learning_rate": 3.8074126919418404e-05, + "loss": 3.713, + "step": 72945 + }, + { + "epoch": 4.956515830955293, + "grad_norm": 0.35459524393081665, + "learning_rate": 3.806988041853513e-05, + "loss": 4.1169, + "step": 72950 + }, + { + "epoch": 4.956855551025955, + "grad_norm": 0.1902635097503662, + "learning_rate": 3.806563391765185e-05, + "loss": 3.7737, + "step": 72955 + }, + { + "epoch": 4.957195271096617, + "grad_norm": 0.13330325484275818, + "learning_rate": 3.806138741676859e-05, + "loss": 3.8745, + "step": 72960 + }, + { + "epoch": 4.957534991167278, + "grad_norm": 0.1713196486234665, + "learning_rate": 3.8057140915885316e-05, + "loss": 4.0333, + "step": 72965 + }, + { + "epoch": 4.95787471123794, + "grad_norm": 0.17896679043769836, + "learning_rate": 3.805289441500204e-05, + "loss": 3.8566, + "step": 72970 + }, + { + "epoch": 4.958214431308602, + "grad_norm": 0.14654766023159027, + "learning_rate": 3.8048647914118765e-05, + "loss": 3.9581, + "step": 72975 + }, + { + "epoch": 4.958554151379263, + "grad_norm": 0.1854006052017212, + "learning_rate": 3.80444014132355e-05, + "loss": 3.9178, + "step": 72980 + }, + { + "epoch": 4.958893871449925, + "grad_norm": 0.18307912349700928, + "learning_rate": 3.804015491235222e-05, + "loss": 3.8526, + "step": 72985 + }, + { + "epoch": 4.959233591520587, + "grad_norm": 0.17346002161502838, + "learning_rate": 3.803590841146895e-05, + "loss": 3.6112, + "step": 72990 + }, + { + "epoch": 4.959573311591249, + "grad_norm": 0.1637948602437973, + "learning_rate": 3.8031661910585684e-05, + "loss": 3.7532, + "step": 72995 + }, + { + "epoch": 4.959913031661911, + "grad_norm": 0.19840997457504272, + "learning_rate": 3.8027415409702405e-05, + "loss": 3.7459, + "step": 73000 + }, + { + "epoch": 4.960252751732573, + "grad_norm": 0.14397381246089935, + "learning_rate": 3.802316890881913e-05, + "loss": 3.6859, + "step": 73005 + }, + { + "epoch": 4.960592471803234, + "grad_norm": 2.7672104835510254, + "learning_rate": 3.801892240793586e-05, + "loss": 3.8947, + "step": 73010 + }, + { + "epoch": 4.960932191873896, + "grad_norm": 0.1492377668619156, + "learning_rate": 3.801467590705259e-05, + "loss": 3.6059, + "step": 73015 + }, + { + "epoch": 4.961271911944557, + "grad_norm": 0.16968050599098206, + "learning_rate": 3.801042940616932e-05, + "loss": 4.2012, + "step": 73020 + }, + { + "epoch": 4.961611632015219, + "grad_norm": 0.19430822134017944, + "learning_rate": 3.8006182905286045e-05, + "loss": 3.8443, + "step": 73025 + }, + { + "epoch": 4.961951352085881, + "grad_norm": 0.18654389679431915, + "learning_rate": 3.800193640440277e-05, + "loss": 3.9131, + "step": 73030 + }, + { + "epoch": 4.9622910721565425, + "grad_norm": 0.17621877789497375, + "learning_rate": 3.79976899035195e-05, + "loss": 4.0009, + "step": 73035 + }, + { + "epoch": 4.962630792227205, + "grad_norm": 0.16501088440418243, + "learning_rate": 3.799344340263623e-05, + "loss": 3.9028, + "step": 73040 + }, + { + "epoch": 4.962970512297867, + "grad_norm": 0.17476044595241547, + "learning_rate": 3.798919690175296e-05, + "loss": 4.0467, + "step": 73045 + }, + { + "epoch": 4.963310232368528, + "grad_norm": 0.17401151359081268, + "learning_rate": 3.7984950400869685e-05, + "loss": 3.9215, + "step": 73050 + }, + { + "epoch": 4.96364995243919, + "grad_norm": 0.16259242594242096, + "learning_rate": 3.798070389998641e-05, + "loss": 3.6472, + "step": 73055 + }, + { + "epoch": 4.963989672509852, + "grad_norm": 0.20109012722969055, + "learning_rate": 3.797645739910314e-05, + "loss": 3.7418, + "step": 73060 + }, + { + "epoch": 4.964329392580513, + "grad_norm": 0.17849819362163544, + "learning_rate": 3.797221089821987e-05, + "loss": 3.6351, + "step": 73065 + }, + { + "epoch": 4.964669112651175, + "grad_norm": 0.28850772976875305, + "learning_rate": 3.79679643973366e-05, + "loss": 3.9224, + "step": 73070 + }, + { + "epoch": 4.965008832721837, + "grad_norm": 0.27350443601608276, + "learning_rate": 3.7963717896453325e-05, + "loss": 3.8834, + "step": 73075 + }, + { + "epoch": 4.9653485527924985, + "grad_norm": 0.17288970947265625, + "learning_rate": 3.795947139557005e-05, + "loss": 3.7792, + "step": 73080 + }, + { + "epoch": 4.965688272863161, + "grad_norm": 0.1457190364599228, + "learning_rate": 3.795522489468678e-05, + "loss": 3.9784, + "step": 73085 + }, + { + "epoch": 4.966027992933823, + "grad_norm": 0.1372903287410736, + "learning_rate": 3.795097839380351e-05, + "loss": 3.9033, + "step": 73090 + }, + { + "epoch": 4.966367713004484, + "grad_norm": 0.17152155935764313, + "learning_rate": 3.794673189292024e-05, + "loss": 3.7645, + "step": 73095 + }, + { + "epoch": 4.966707433075146, + "grad_norm": 0.18131668865680695, + "learning_rate": 3.7942485392036965e-05, + "loss": 3.6291, + "step": 73100 + }, + { + "epoch": 4.967047153145808, + "grad_norm": 0.128218412399292, + "learning_rate": 3.7938238891153693e-05, + "loss": 3.9061, + "step": 73105 + }, + { + "epoch": 4.967386873216469, + "grad_norm": 0.18748581409454346, + "learning_rate": 3.7933992390270415e-05, + "loss": 3.7048, + "step": 73110 + }, + { + "epoch": 4.967726593287131, + "grad_norm": 0.14502209424972534, + "learning_rate": 3.792974588938715e-05, + "loss": 3.6755, + "step": 73115 + }, + { + "epoch": 4.968066313357793, + "grad_norm": 0.15724512934684753, + "learning_rate": 3.792549938850388e-05, + "loss": 4.0929, + "step": 73120 + }, + { + "epoch": 4.9684060334284545, + "grad_norm": 0.18259523808956146, + "learning_rate": 3.79212528876206e-05, + "loss": 3.8987, + "step": 73125 + }, + { + "epoch": 4.968745753499117, + "grad_norm": 1.6442091464996338, + "learning_rate": 3.7917006386737333e-05, + "loss": 3.8482, + "step": 73130 + }, + { + "epoch": 4.969085473569779, + "grad_norm": 0.17759019136428833, + "learning_rate": 3.791275988585406e-05, + "loss": 3.7101, + "step": 73135 + }, + { + "epoch": 4.96942519364044, + "grad_norm": 0.1867794394493103, + "learning_rate": 3.790851338497078e-05, + "loss": 3.8617, + "step": 73140 + }, + { + "epoch": 4.969764913711102, + "grad_norm": 1.0496702194213867, + "learning_rate": 3.790426688408751e-05, + "loss": 3.6218, + "step": 73145 + }, + { + "epoch": 4.970104633781764, + "grad_norm": 0.15936222672462463, + "learning_rate": 3.7900020383204245e-05, + "loss": 4.0627, + "step": 73150 + }, + { + "epoch": 4.970444353852425, + "grad_norm": 0.16374096274375916, + "learning_rate": 3.789577388232097e-05, + "loss": 3.7783, + "step": 73155 + }, + { + "epoch": 4.970784073923087, + "grad_norm": 0.1593206226825714, + "learning_rate": 3.7891527381437695e-05, + "loss": 4.0045, + "step": 73160 + }, + { + "epoch": 4.971123793993749, + "grad_norm": 0.27642911672592163, + "learning_rate": 3.788728088055443e-05, + "loss": 3.9364, + "step": 73165 + }, + { + "epoch": 4.971463514064411, + "grad_norm": 0.16122214496135712, + "learning_rate": 3.788303437967115e-05, + "loss": 3.7618, + "step": 73170 + }, + { + "epoch": 4.971803234135073, + "grad_norm": 0.16393746435642242, + "learning_rate": 3.787878787878788e-05, + "loss": 4.0369, + "step": 73175 + }, + { + "epoch": 4.972142954205735, + "grad_norm": 0.21102623641490936, + "learning_rate": 3.787454137790461e-05, + "loss": 3.8364, + "step": 73180 + }, + { + "epoch": 4.972482674276396, + "grad_norm": 0.15392769873142242, + "learning_rate": 3.7870294877021335e-05, + "loss": 3.7103, + "step": 73185 + }, + { + "epoch": 4.972822394347058, + "grad_norm": 0.1566586196422577, + "learning_rate": 3.786604837613806e-05, + "loss": 3.9687, + "step": 73190 + }, + { + "epoch": 4.97316211441772, + "grad_norm": 0.1804712414741516, + "learning_rate": 3.786180187525479e-05, + "loss": 3.9881, + "step": 73195 + }, + { + "epoch": 4.973501834488381, + "grad_norm": 0.198869526386261, + "learning_rate": 3.785755537437152e-05, + "loss": 3.9309, + "step": 73200 + }, + { + "epoch": 4.973841554559043, + "grad_norm": 0.16704609990119934, + "learning_rate": 3.785330887348825e-05, + "loss": 3.9161, + "step": 73205 + }, + { + "epoch": 4.974181274629705, + "grad_norm": 0.15416893362998962, + "learning_rate": 3.7849062372604975e-05, + "loss": 3.9327, + "step": 73210 + }, + { + "epoch": 4.974520994700367, + "grad_norm": 0.20722287893295288, + "learning_rate": 3.78448158717217e-05, + "loss": 3.7564, + "step": 73215 + }, + { + "epoch": 4.974860714771029, + "grad_norm": 0.1741323322057724, + "learning_rate": 3.784056937083843e-05, + "loss": 3.8004, + "step": 73220 + }, + { + "epoch": 4.975200434841691, + "grad_norm": 0.17054928839206696, + "learning_rate": 3.783632286995516e-05, + "loss": 3.8401, + "step": 73225 + }, + { + "epoch": 4.975540154912352, + "grad_norm": 0.16359272599220276, + "learning_rate": 3.783207636907189e-05, + "loss": 3.5267, + "step": 73230 + }, + { + "epoch": 4.975879874983014, + "grad_norm": 0.15090644359588623, + "learning_rate": 3.7827829868188615e-05, + "loss": 3.7644, + "step": 73235 + }, + { + "epoch": 4.976219595053676, + "grad_norm": 0.1673756092786789, + "learning_rate": 3.782358336730534e-05, + "loss": 3.5017, + "step": 73240 + }, + { + "epoch": 4.976559315124337, + "grad_norm": 0.2254389375448227, + "learning_rate": 3.781933686642207e-05, + "loss": 4.063, + "step": 73245 + }, + { + "epoch": 4.976899035194999, + "grad_norm": 0.9024821519851685, + "learning_rate": 3.78150903655388e-05, + "loss": 3.9931, + "step": 73250 + }, + { + "epoch": 4.977238755265661, + "grad_norm": 0.16383139789104462, + "learning_rate": 3.781084386465553e-05, + "loss": 3.9572, + "step": 73255 + }, + { + "epoch": 4.977578475336323, + "grad_norm": 0.152607724070549, + "learning_rate": 3.7806597363772255e-05, + "loss": 3.9341, + "step": 73260 + }, + { + "epoch": 4.977918195406985, + "grad_norm": 0.15796545147895813, + "learning_rate": 3.780235086288898e-05, + "loss": 3.9717, + "step": 73265 + }, + { + "epoch": 4.978257915477647, + "grad_norm": 0.19621147215366364, + "learning_rate": 3.779810436200571e-05, + "loss": 3.8525, + "step": 73270 + }, + { + "epoch": 4.978597635548308, + "grad_norm": 0.30139777064323425, + "learning_rate": 3.779385786112244e-05, + "loss": 3.8378, + "step": 73275 + }, + { + "epoch": 4.97893735561897, + "grad_norm": 0.2028236836194992, + "learning_rate": 3.778961136023916e-05, + "loss": 3.7174, + "step": 73280 + }, + { + "epoch": 4.979277075689632, + "grad_norm": 0.14865407347679138, + "learning_rate": 3.7785364859355895e-05, + "loss": 3.7494, + "step": 73285 + }, + { + "epoch": 4.979616795760293, + "grad_norm": 0.18534505367279053, + "learning_rate": 3.778111835847262e-05, + "loss": 3.5142, + "step": 73290 + }, + { + "epoch": 4.979956515830955, + "grad_norm": 0.24840617179870605, + "learning_rate": 3.7776871857589344e-05, + "loss": 3.796, + "step": 73295 + }, + { + "epoch": 4.980296235901617, + "grad_norm": 0.18927432596683502, + "learning_rate": 3.777262535670608e-05, + "loss": 3.8846, + "step": 73300 + }, + { + "epoch": 4.980635955972279, + "grad_norm": 0.18085920810699463, + "learning_rate": 3.776837885582281e-05, + "loss": 3.8521, + "step": 73305 + }, + { + "epoch": 4.980975676042941, + "grad_norm": 0.14980871975421906, + "learning_rate": 3.776413235493953e-05, + "loss": 4.0239, + "step": 73310 + }, + { + "epoch": 4.981315396113603, + "grad_norm": 0.17901892960071564, + "learning_rate": 3.7759885854056256e-05, + "loss": 3.8865, + "step": 73315 + }, + { + "epoch": 4.981655116184264, + "grad_norm": 0.1791582554578781, + "learning_rate": 3.775563935317299e-05, + "loss": 3.79, + "step": 73320 + }, + { + "epoch": 4.981994836254926, + "grad_norm": 0.14958973228931427, + "learning_rate": 3.775139285228971e-05, + "loss": 3.7861, + "step": 73325 + }, + { + "epoch": 4.982334556325588, + "grad_norm": 0.1522700935602188, + "learning_rate": 3.774714635140644e-05, + "loss": 3.5833, + "step": 73330 + }, + { + "epoch": 4.982674276396249, + "grad_norm": 0.15463295578956604, + "learning_rate": 3.7742899850523175e-05, + "loss": 3.8875, + "step": 73335 + }, + { + "epoch": 4.983013996466911, + "grad_norm": 0.18100816011428833, + "learning_rate": 3.7738653349639896e-05, + "loss": 3.9636, + "step": 73340 + }, + { + "epoch": 4.983353716537573, + "grad_norm": 0.28586259484291077, + "learning_rate": 3.7734406848756624e-05, + "loss": 3.6141, + "step": 73345 + }, + { + "epoch": 4.983693436608235, + "grad_norm": 0.18140774965286255, + "learning_rate": 3.773016034787335e-05, + "loss": 3.9088, + "step": 73350 + }, + { + "epoch": 4.984033156678897, + "grad_norm": 0.1723954975605011, + "learning_rate": 3.772591384699008e-05, + "loss": 3.7181, + "step": 73355 + }, + { + "epoch": 4.984372876749559, + "grad_norm": 0.40212103724479675, + "learning_rate": 3.772166734610681e-05, + "loss": 3.7357, + "step": 73360 + }, + { + "epoch": 4.98471259682022, + "grad_norm": 0.155445396900177, + "learning_rate": 3.7717420845223536e-05, + "loss": 3.9999, + "step": 73365 + }, + { + "epoch": 4.985052316890882, + "grad_norm": 0.23320448398590088, + "learning_rate": 3.7713174344340264e-05, + "loss": 3.8531, + "step": 73370 + }, + { + "epoch": 4.985392036961544, + "grad_norm": 0.1543717235326767, + "learning_rate": 3.770892784345699e-05, + "loss": 3.9978, + "step": 73375 + }, + { + "epoch": 4.985731757032205, + "grad_norm": 0.1913418471813202, + "learning_rate": 3.770468134257372e-05, + "loss": 3.7651, + "step": 73380 + }, + { + "epoch": 4.986071477102867, + "grad_norm": 0.21073082089424133, + "learning_rate": 3.770043484169045e-05, + "loss": 3.9119, + "step": 73385 + }, + { + "epoch": 4.986411197173529, + "grad_norm": 0.16307273507118225, + "learning_rate": 3.7696188340807176e-05, + "loss": 3.795, + "step": 73390 + }, + { + "epoch": 4.986750917244191, + "grad_norm": 0.17390097677707672, + "learning_rate": 3.7691941839923904e-05, + "loss": 3.7155, + "step": 73395 + }, + { + "epoch": 4.987090637314853, + "grad_norm": 0.15349958837032318, + "learning_rate": 3.768769533904063e-05, + "loss": 3.7853, + "step": 73400 + }, + { + "epoch": 4.987430357385515, + "grad_norm": 0.1655370146036148, + "learning_rate": 3.768344883815736e-05, + "loss": 3.8197, + "step": 73405 + }, + { + "epoch": 4.987770077456176, + "grad_norm": 0.17647910118103027, + "learning_rate": 3.767920233727409e-05, + "loss": 3.9744, + "step": 73410 + }, + { + "epoch": 4.988109797526838, + "grad_norm": 0.18231528997421265, + "learning_rate": 3.7674955836390816e-05, + "loss": 3.7436, + "step": 73415 + }, + { + "epoch": 4.9884495175975, + "grad_norm": 0.16543737053871155, + "learning_rate": 3.7670709335507544e-05, + "loss": 3.9551, + "step": 73420 + }, + { + "epoch": 4.988789237668161, + "grad_norm": 0.2112790048122406, + "learning_rate": 3.766646283462427e-05, + "loss": 3.915, + "step": 73425 + }, + { + "epoch": 4.989128957738823, + "grad_norm": 0.23133718967437744, + "learning_rate": 3.7662216333741e-05, + "loss": 3.863, + "step": 73430 + }, + { + "epoch": 4.989468677809485, + "grad_norm": 0.2636198401451111, + "learning_rate": 3.765796983285773e-05, + "loss": 3.991, + "step": 73435 + }, + { + "epoch": 4.989808397880147, + "grad_norm": 0.2076880782842636, + "learning_rate": 3.7653723331974456e-05, + "loss": 3.8321, + "step": 73440 + }, + { + "epoch": 4.990148117950809, + "grad_norm": 0.17880651354789734, + "learning_rate": 3.7649476831091184e-05, + "loss": 3.9522, + "step": 73445 + }, + { + "epoch": 4.990487838021471, + "grad_norm": 0.15003204345703125, + "learning_rate": 3.7645230330207906e-05, + "loss": 3.7454, + "step": 73450 + }, + { + "epoch": 4.990827558092132, + "grad_norm": 0.2050533890724182, + "learning_rate": 3.764098382932464e-05, + "loss": 3.9315, + "step": 73455 + }, + { + "epoch": 4.991167278162794, + "grad_norm": 0.23206956684589386, + "learning_rate": 3.763673732844137e-05, + "loss": 3.9846, + "step": 73460 + }, + { + "epoch": 4.991506998233456, + "grad_norm": 1.3720537424087524, + "learning_rate": 3.763249082755809e-05, + "loss": 3.6377, + "step": 73465 + }, + { + "epoch": 4.991846718304117, + "grad_norm": 0.18134665489196777, + "learning_rate": 3.7628244326674824e-05, + "loss": 3.7882, + "step": 73470 + }, + { + "epoch": 4.992186438374779, + "grad_norm": 0.1607312560081482, + "learning_rate": 3.762399782579155e-05, + "loss": 3.8857, + "step": 73475 + }, + { + "epoch": 4.9925261584454415, + "grad_norm": 0.2587389349937439, + "learning_rate": 3.7619751324908274e-05, + "loss": 3.8479, + "step": 73480 + }, + { + "epoch": 4.992865878516103, + "grad_norm": 0.1452445089817047, + "learning_rate": 3.761550482402501e-05, + "loss": 3.919, + "step": 73485 + }, + { + "epoch": 4.993205598586765, + "grad_norm": 0.22634607553482056, + "learning_rate": 3.7611258323141736e-05, + "loss": 3.8416, + "step": 73490 + }, + { + "epoch": 4.993545318657426, + "grad_norm": 0.13264033198356628, + "learning_rate": 3.760701182225846e-05, + "loss": 3.9133, + "step": 73495 + }, + { + "epoch": 4.993885038728088, + "grad_norm": 0.22413451969623566, + "learning_rate": 3.7602765321375186e-05, + "loss": 3.6948, + "step": 73500 + }, + { + "epoch": 4.99422475879875, + "grad_norm": 0.235796257853508, + "learning_rate": 3.759851882049192e-05, + "loss": 3.827, + "step": 73505 + }, + { + "epoch": 4.994564478869411, + "grad_norm": 0.18621617555618286, + "learning_rate": 3.759427231960864e-05, + "loss": 4.132, + "step": 73510 + }, + { + "epoch": 4.994904198940073, + "grad_norm": 0.19351904094219208, + "learning_rate": 3.759002581872537e-05, + "loss": 3.9434, + "step": 73515 + }, + { + "epoch": 4.995243919010735, + "grad_norm": 0.2548784017562866, + "learning_rate": 3.7585779317842105e-05, + "loss": 3.8861, + "step": 73520 + }, + { + "epoch": 4.995583639081397, + "grad_norm": 0.16307811439037323, + "learning_rate": 3.7581532816958826e-05, + "loss": 3.7953, + "step": 73525 + }, + { + "epoch": 4.995923359152059, + "grad_norm": 0.18241102993488312, + "learning_rate": 3.7577286316075554e-05, + "loss": 3.9303, + "step": 73530 + }, + { + "epoch": 4.996263079222721, + "grad_norm": 0.16856317222118378, + "learning_rate": 3.757303981519228e-05, + "loss": 3.7854, + "step": 73535 + }, + { + "epoch": 4.996602799293382, + "grad_norm": 0.17295238375663757, + "learning_rate": 3.756879331430901e-05, + "loss": 3.8178, + "step": 73540 + }, + { + "epoch": 4.996942519364044, + "grad_norm": 0.15982286632061005, + "learning_rate": 3.756454681342574e-05, + "loss": 3.9407, + "step": 73545 + }, + { + "epoch": 4.997282239434706, + "grad_norm": 0.19909541308879852, + "learning_rate": 3.7560300312542466e-05, + "loss": 3.9011, + "step": 73550 + }, + { + "epoch": 4.997621959505367, + "grad_norm": 0.23099543154239655, + "learning_rate": 3.7556053811659194e-05, + "loss": 3.8166, + "step": 73555 + }, + { + "epoch": 4.997961679576029, + "grad_norm": 0.1653001755475998, + "learning_rate": 3.755180731077592e-05, + "loss": 3.8926, + "step": 73560 + }, + { + "epoch": 4.998301399646691, + "grad_norm": 0.2073383331298828, + "learning_rate": 3.754756080989265e-05, + "loss": 3.6599, + "step": 73565 + }, + { + "epoch": 4.998641119717353, + "grad_norm": 0.16120849549770355, + "learning_rate": 3.754331430900938e-05, + "loss": 3.8686, + "step": 73570 + }, + { + "epoch": 4.998980839788015, + "grad_norm": 0.16576911509037018, + "learning_rate": 3.7539067808126106e-05, + "loss": 3.7297, + "step": 73575 + }, + { + "epoch": 4.999320559858677, + "grad_norm": 0.14384324848651886, + "learning_rate": 3.7534821307242834e-05, + "loss": 3.8063, + "step": 73580 + }, + { + "epoch": 4.999660279929338, + "grad_norm": 0.27213865518569946, + "learning_rate": 3.753057480635956e-05, + "loss": 4.0913, + "step": 73585 + }, + { + "epoch": 5.0, + "grad_norm": 0.3969910740852356, + "learning_rate": 3.752632830547629e-05, + "loss": 3.7912, + "step": 73590 + }, + { + "epoch": 5.0, + "eval_bertscore": { + "f1": 0.8522649204930844, + "precision": 0.8758532806807843, + "recall": 0.8302511239608645 + }, + "eval_bleu_4": 0.00189309412622561, + "eval_exact_match": 0.0, + "eval_loss": 3.6507654190063477, + "eval_meteor": 0.07520140981501999, + "eval_rouge": { + "rouge1": 0.1227311522338148, + "rouge2": 0.015189764566048387, + "rougeL": 0.10843903065165204, + "rougeLsum": 0.10843888146821376 + }, + "eval_runtime": 372.0203, + "eval_samples_per_second": 27.738, + "eval_steps_per_second": 3.468, + "step": 73590 + }, + { + "epoch": 5.000339720070662, + "grad_norm": 0.3425697684288025, + "learning_rate": 3.752208180459302e-05, + "loss": 3.8128, + "step": 73595 + }, + { + "epoch": 5.000679440141323, + "grad_norm": 0.1635667383670807, + "learning_rate": 3.7517835303709746e-05, + "loss": 3.7702, + "step": 73600 + }, + { + "epoch": 5.001019160211985, + "grad_norm": 0.15373270213603973, + "learning_rate": 3.7513588802826474e-05, + "loss": 4.0077, + "step": 73605 + }, + { + "epoch": 5.001358880282647, + "grad_norm": 0.2135203778743744, + "learning_rate": 3.75093423019432e-05, + "loss": 4.0199, + "step": 73610 + }, + { + "epoch": 5.001698600353309, + "grad_norm": 0.16302569210529327, + "learning_rate": 3.750509580105993e-05, + "loss": 3.9685, + "step": 73615 + }, + { + "epoch": 5.002038320423971, + "grad_norm": 0.5853749513626099, + "learning_rate": 3.750084930017666e-05, + "loss": 3.7865, + "step": 73620 + }, + { + "epoch": 5.002378040494633, + "grad_norm": 0.2929422855377197, + "learning_rate": 3.7496602799293386e-05, + "loss": 3.9434, + "step": 73625 + }, + { + "epoch": 5.002717760565294, + "grad_norm": 0.15631763637065887, + "learning_rate": 3.7492356298410114e-05, + "loss": 3.6557, + "step": 73630 + }, + { + "epoch": 5.003057480635956, + "grad_norm": 0.1787060648202896, + "learning_rate": 3.7488109797526835e-05, + "loss": 3.9076, + "step": 73635 + }, + { + "epoch": 5.003397200706618, + "grad_norm": 0.1612972766160965, + "learning_rate": 3.748386329664357e-05, + "loss": 3.8041, + "step": 73640 + }, + { + "epoch": 5.003736920777279, + "grad_norm": 0.15453140437602997, + "learning_rate": 3.74796167957603e-05, + "loss": 3.8103, + "step": 73645 + }, + { + "epoch": 5.004076640847941, + "grad_norm": 0.16041933000087738, + "learning_rate": 3.747537029487702e-05, + "loss": 3.8686, + "step": 73650 + }, + { + "epoch": 5.004416360918603, + "grad_norm": 0.18937794864177704, + "learning_rate": 3.7471123793993754e-05, + "loss": 3.7992, + "step": 73655 + }, + { + "epoch": 5.004756080989265, + "grad_norm": 0.21818138659000397, + "learning_rate": 3.746687729311048e-05, + "loss": 3.8994, + "step": 73660 + }, + { + "epoch": 5.005095801059927, + "grad_norm": 0.1677446961402893, + "learning_rate": 3.74626307922272e-05, + "loss": 3.8784, + "step": 73665 + }, + { + "epoch": 5.005435521130589, + "grad_norm": 0.19235160946846008, + "learning_rate": 3.745838429134393e-05, + "loss": 3.5373, + "step": 73670 + }, + { + "epoch": 5.00577524120125, + "grad_norm": 0.13205835223197937, + "learning_rate": 3.7454137790460666e-05, + "loss": 3.8916, + "step": 73675 + }, + { + "epoch": 5.006114961271912, + "grad_norm": 0.17811952531337738, + "learning_rate": 3.744989128957739e-05, + "loss": 3.8266, + "step": 73680 + }, + { + "epoch": 5.006454681342574, + "grad_norm": 0.16677333414554596, + "learning_rate": 3.7445644788694115e-05, + "loss": 3.952, + "step": 73685 + }, + { + "epoch": 5.006794401413235, + "grad_norm": 1.1334128379821777, + "learning_rate": 3.744139828781085e-05, + "loss": 3.6754, + "step": 73690 + }, + { + "epoch": 5.007134121483897, + "grad_norm": 0.4852638840675354, + "learning_rate": 3.743715178692757e-05, + "loss": 3.7995, + "step": 73695 + }, + { + "epoch": 5.007473841554559, + "grad_norm": 0.17762160301208496, + "learning_rate": 3.74329052860443e-05, + "loss": 3.6925, + "step": 73700 + }, + { + "epoch": 5.007813561625221, + "grad_norm": 0.30711042881011963, + "learning_rate": 3.742865878516103e-05, + "loss": 3.8415, + "step": 73705 + }, + { + "epoch": 5.008153281695883, + "grad_norm": 0.200289785861969, + "learning_rate": 3.7424412284277755e-05, + "loss": 3.8894, + "step": 73710 + }, + { + "epoch": 5.008493001766545, + "grad_norm": 0.3504563868045807, + "learning_rate": 3.742016578339448e-05, + "loss": 3.9794, + "step": 73715 + }, + { + "epoch": 5.008832721837206, + "grad_norm": 0.14665651321411133, + "learning_rate": 3.741591928251121e-05, + "loss": 3.992, + "step": 73720 + }, + { + "epoch": 5.009172441907868, + "grad_norm": 0.15855441987514496, + "learning_rate": 3.741167278162794e-05, + "loss": 3.7199, + "step": 73725 + }, + { + "epoch": 5.00951216197853, + "grad_norm": 0.1873980015516281, + "learning_rate": 3.740742628074467e-05, + "loss": 3.6921, + "step": 73730 + }, + { + "epoch": 5.009851882049191, + "grad_norm": 0.16187137365341187, + "learning_rate": 3.7403179779861395e-05, + "loss": 3.8371, + "step": 73735 + }, + { + "epoch": 5.010191602119853, + "grad_norm": 0.16337794065475464, + "learning_rate": 3.739893327897812e-05, + "loss": 3.7633, + "step": 73740 + }, + { + "epoch": 5.0105313221905154, + "grad_norm": 0.6356304883956909, + "learning_rate": 3.739468677809485e-05, + "loss": 3.6161, + "step": 73745 + }, + { + "epoch": 5.010871042261177, + "grad_norm": 0.17166079580783844, + "learning_rate": 3.739044027721158e-05, + "loss": 3.5851, + "step": 73750 + }, + { + "epoch": 5.011210762331839, + "grad_norm": 0.17309032380580902, + "learning_rate": 3.738619377632831e-05, + "loss": 3.8754, + "step": 73755 + }, + { + "epoch": 5.0115504824025, + "grad_norm": 0.15769124031066895, + "learning_rate": 3.7381947275445035e-05, + "loss": 3.8001, + "step": 73760 + }, + { + "epoch": 5.011890202473162, + "grad_norm": 0.6094582676887512, + "learning_rate": 3.737770077456176e-05, + "loss": 3.8232, + "step": 73765 + }, + { + "epoch": 5.012229922543824, + "grad_norm": 0.6577433347702026, + "learning_rate": 3.737345427367849e-05, + "loss": 3.6352, + "step": 73770 + }, + { + "epoch": 5.012569642614485, + "grad_norm": 0.15925635397434235, + "learning_rate": 3.736920777279522e-05, + "loss": 3.6076, + "step": 73775 + }, + { + "epoch": 5.012909362685147, + "grad_norm": 0.40896672010421753, + "learning_rate": 3.736496127191195e-05, + "loss": 3.9319, + "step": 73780 + }, + { + "epoch": 5.013249082755809, + "grad_norm": 0.17249377071857452, + "learning_rate": 3.7360714771028675e-05, + "loss": 3.7916, + "step": 73785 + }, + { + "epoch": 5.013588802826471, + "grad_norm": 0.15262827277183533, + "learning_rate": 3.73564682701454e-05, + "loss": 3.7709, + "step": 73790 + }, + { + "epoch": 5.013928522897133, + "grad_norm": 0.3623233735561371, + "learning_rate": 3.735222176926213e-05, + "loss": 3.9199, + "step": 73795 + }, + { + "epoch": 5.014268242967795, + "grad_norm": 0.22339282929897308, + "learning_rate": 3.734797526837886e-05, + "loss": 3.8396, + "step": 73800 + }, + { + "epoch": 5.014607963038456, + "grad_norm": 0.14591488242149353, + "learning_rate": 3.734372876749558e-05, + "loss": 3.6086, + "step": 73805 + }, + { + "epoch": 5.014947683109118, + "grad_norm": 0.14162768423557281, + "learning_rate": 3.7339482266612315e-05, + "loss": 3.8529, + "step": 73810 + }, + { + "epoch": 5.01528740317978, + "grad_norm": 0.2891632616519928, + "learning_rate": 3.7335235765729043e-05, + "loss": 3.762, + "step": 73815 + }, + { + "epoch": 5.015627123250441, + "grad_norm": 0.8727288246154785, + "learning_rate": 3.7330989264845765e-05, + "loss": 3.9194, + "step": 73820 + }, + { + "epoch": 5.015966843321103, + "grad_norm": 0.14084090292453766, + "learning_rate": 3.73267427639625e-05, + "loss": 3.5967, + "step": 73825 + }, + { + "epoch": 5.016306563391765, + "grad_norm": 0.1549486517906189, + "learning_rate": 3.732249626307923e-05, + "loss": 3.7808, + "step": 73830 + }, + { + "epoch": 5.016646283462427, + "grad_norm": 0.18128840625286102, + "learning_rate": 3.731824976219595e-05, + "loss": 4.0277, + "step": 73835 + }, + { + "epoch": 5.016986003533089, + "grad_norm": 0.15622355043888092, + "learning_rate": 3.731400326131268e-05, + "loss": 3.8204, + "step": 73840 + }, + { + "epoch": 5.017325723603751, + "grad_norm": 0.1690511256456375, + "learning_rate": 3.730975676042941e-05, + "loss": 3.5559, + "step": 73845 + }, + { + "epoch": 5.017665443674412, + "grad_norm": 0.15138967335224152, + "learning_rate": 3.730551025954613e-05, + "loss": 3.7534, + "step": 73850 + }, + { + "epoch": 5.018005163745074, + "grad_norm": 0.1742001473903656, + "learning_rate": 3.730126375866286e-05, + "loss": 3.8066, + "step": 73855 + }, + { + "epoch": 5.018344883815736, + "grad_norm": 0.1481122225522995, + "learning_rate": 3.7297017257779595e-05, + "loss": 3.6325, + "step": 73860 + }, + { + "epoch": 5.018684603886397, + "grad_norm": 0.18320320546627045, + "learning_rate": 3.729277075689632e-05, + "loss": 3.8747, + "step": 73865 + }, + { + "epoch": 5.019024323957059, + "grad_norm": 0.14993011951446533, + "learning_rate": 3.7288524256013045e-05, + "loss": 3.8994, + "step": 73870 + }, + { + "epoch": 5.019364044027721, + "grad_norm": 0.17244958877563477, + "learning_rate": 3.728427775512978e-05, + "loss": 3.8159, + "step": 73875 + }, + { + "epoch": 5.019703764098383, + "grad_norm": 0.1650383025407791, + "learning_rate": 3.72800312542465e-05, + "loss": 3.7145, + "step": 73880 + }, + { + "epoch": 5.020043484169045, + "grad_norm": 0.3499361276626587, + "learning_rate": 3.727578475336323e-05, + "loss": 3.7388, + "step": 73885 + }, + { + "epoch": 5.020383204239707, + "grad_norm": 0.33063629269599915, + "learning_rate": 3.727153825247996e-05, + "loss": 3.8857, + "step": 73890 + }, + { + "epoch": 5.020722924310368, + "grad_norm": 0.18468926846981049, + "learning_rate": 3.7267291751596685e-05, + "loss": 3.6821, + "step": 73895 + }, + { + "epoch": 5.02106264438103, + "grad_norm": 0.24532055854797363, + "learning_rate": 3.726304525071341e-05, + "loss": 3.9602, + "step": 73900 + }, + { + "epoch": 5.021402364451692, + "grad_norm": 0.15962134301662445, + "learning_rate": 3.725879874983014e-05, + "loss": 3.9379, + "step": 73905 + }, + { + "epoch": 5.021742084522353, + "grad_norm": 0.19365417957305908, + "learning_rate": 3.725455224894687e-05, + "loss": 3.9159, + "step": 73910 + }, + { + "epoch": 5.022081804593015, + "grad_norm": 0.2342374324798584, + "learning_rate": 3.72503057480636e-05, + "loss": 3.9691, + "step": 73915 + }, + { + "epoch": 5.022421524663677, + "grad_norm": 0.16900810599327087, + "learning_rate": 3.7246059247180325e-05, + "loss": 3.9095, + "step": 73920 + }, + { + "epoch": 5.022761244734339, + "grad_norm": 0.28762802481651306, + "learning_rate": 3.724181274629705e-05, + "loss": 3.8196, + "step": 73925 + }, + { + "epoch": 5.023100964805001, + "grad_norm": 0.13474752008914948, + "learning_rate": 3.723756624541378e-05, + "loss": 3.7344, + "step": 73930 + }, + { + "epoch": 5.023440684875663, + "grad_norm": 0.17789025604724884, + "learning_rate": 3.723331974453051e-05, + "loss": 3.9799, + "step": 73935 + }, + { + "epoch": 5.023780404946324, + "grad_norm": 0.17396701872348785, + "learning_rate": 3.722907324364724e-05, + "loss": 3.6982, + "step": 73940 + }, + { + "epoch": 5.024120125016986, + "grad_norm": 0.15686282515525818, + "learning_rate": 3.7224826742763965e-05, + "loss": 3.5933, + "step": 73945 + }, + { + "epoch": 5.024459845087648, + "grad_norm": 0.15027587115764618, + "learning_rate": 3.722058024188069e-05, + "loss": 3.7964, + "step": 73950 + }, + { + "epoch": 5.024799565158309, + "grad_norm": 0.1691741794347763, + "learning_rate": 3.721633374099742e-05, + "loss": 3.6683, + "step": 73955 + }, + { + "epoch": 5.025139285228971, + "grad_norm": 0.1418629288673401, + "learning_rate": 3.721208724011415e-05, + "loss": 3.8677, + "step": 73960 + }, + { + "epoch": 5.025479005299633, + "grad_norm": 0.1686144471168518, + "learning_rate": 3.720784073923088e-05, + "loss": 3.9224, + "step": 73965 + }, + { + "epoch": 5.025818725370295, + "grad_norm": 0.17413684725761414, + "learning_rate": 3.7203594238347605e-05, + "loss": 3.8326, + "step": 73970 + }, + { + "epoch": 5.026158445440957, + "grad_norm": 0.1493951827287674, + "learning_rate": 3.719934773746433e-05, + "loss": 3.8613, + "step": 73975 + }, + { + "epoch": 5.026498165511619, + "grad_norm": 0.16825906932353973, + "learning_rate": 3.719510123658106e-05, + "loss": 3.9026, + "step": 73980 + }, + { + "epoch": 5.02683788558228, + "grad_norm": 0.1493830680847168, + "learning_rate": 3.719085473569779e-05, + "loss": 3.7594, + "step": 73985 + }, + { + "epoch": 5.027177605652942, + "grad_norm": 0.15878033638000488, + "learning_rate": 3.718660823481451e-05, + "loss": 3.7608, + "step": 73990 + }, + { + "epoch": 5.027517325723604, + "grad_norm": 0.1567566990852356, + "learning_rate": 3.7182361733931245e-05, + "loss": 3.7922, + "step": 73995 + }, + { + "epoch": 5.027857045794265, + "grad_norm": 0.15164053440093994, + "learning_rate": 3.717811523304797e-05, + "loss": 3.6864, + "step": 74000 + }, + { + "epoch": 5.028196765864927, + "grad_norm": 0.17174498736858368, + "learning_rate": 3.7173868732164694e-05, + "loss": 3.9303, + "step": 74005 + }, + { + "epoch": 5.028536485935589, + "grad_norm": 0.2763015031814575, + "learning_rate": 3.716962223128143e-05, + "loss": 3.7531, + "step": 74010 + }, + { + "epoch": 5.028876206006251, + "grad_norm": 0.20831725001335144, + "learning_rate": 3.716537573039816e-05, + "loss": 3.7102, + "step": 74015 + }, + { + "epoch": 5.029215926076913, + "grad_norm": 0.7764050960540771, + "learning_rate": 3.716112922951488e-05, + "loss": 3.7075, + "step": 74020 + }, + { + "epoch": 5.029555646147575, + "grad_norm": 0.1829523891210556, + "learning_rate": 3.7156882728631606e-05, + "loss": 4.086, + "step": 74025 + }, + { + "epoch": 5.029895366218236, + "grad_norm": 0.18603549897670746, + "learning_rate": 3.715263622774834e-05, + "loss": 3.8297, + "step": 74030 + }, + { + "epoch": 5.030235086288898, + "grad_norm": 0.1878942847251892, + "learning_rate": 3.714838972686506e-05, + "loss": 3.7654, + "step": 74035 + }, + { + "epoch": 5.03057480635956, + "grad_norm": 0.19605833292007446, + "learning_rate": 3.714414322598179e-05, + "loss": 3.6966, + "step": 74040 + }, + { + "epoch": 5.030914526430221, + "grad_norm": 0.18928863108158112, + "learning_rate": 3.7139896725098525e-05, + "loss": 3.5456, + "step": 74045 + }, + { + "epoch": 5.031254246500883, + "grad_norm": 0.2015385627746582, + "learning_rate": 3.7135650224215246e-05, + "loss": 3.9844, + "step": 74050 + }, + { + "epoch": 5.0315939665715455, + "grad_norm": 0.3595483899116516, + "learning_rate": 3.7131403723331974e-05, + "loss": 3.7203, + "step": 74055 + }, + { + "epoch": 5.031933686642207, + "grad_norm": 0.2005746215581894, + "learning_rate": 3.71271572224487e-05, + "loss": 3.6724, + "step": 74060 + }, + { + "epoch": 5.032273406712869, + "grad_norm": 0.1713731288909912, + "learning_rate": 3.712291072156543e-05, + "loss": 3.9254, + "step": 74065 + }, + { + "epoch": 5.032613126783531, + "grad_norm": 4.285981178283691, + "learning_rate": 3.711866422068216e-05, + "loss": 3.6461, + "step": 74070 + }, + { + "epoch": 5.032952846854192, + "grad_norm": 0.24645353853702545, + "learning_rate": 3.7114417719798886e-05, + "loss": 3.737, + "step": 74075 + }, + { + "epoch": 5.033292566924854, + "grad_norm": 0.23591183125972748, + "learning_rate": 3.7110171218915614e-05, + "loss": 3.7912, + "step": 74080 + }, + { + "epoch": 5.033632286995516, + "grad_norm": 0.17060942947864532, + "learning_rate": 3.710592471803234e-05, + "loss": 3.8519, + "step": 74085 + }, + { + "epoch": 5.033972007066177, + "grad_norm": 0.18405279517173767, + "learning_rate": 3.710167821714907e-05, + "loss": 3.7451, + "step": 74090 + }, + { + "epoch": 5.034311727136839, + "grad_norm": 0.1534123569726944, + "learning_rate": 3.70974317162658e-05, + "loss": 4.115, + "step": 74095 + }, + { + "epoch": 5.0346514472075015, + "grad_norm": 0.21419304609298706, + "learning_rate": 3.7093185215382526e-05, + "loss": 3.9641, + "step": 74100 + }, + { + "epoch": 5.034991167278163, + "grad_norm": 0.14346282184123993, + "learning_rate": 3.7088938714499254e-05, + "loss": 3.7564, + "step": 74105 + }, + { + "epoch": 5.035330887348825, + "grad_norm": 0.1887427270412445, + "learning_rate": 3.708469221361598e-05, + "loss": 3.9067, + "step": 74110 + }, + { + "epoch": 5.035670607419486, + "grad_norm": 0.15996813774108887, + "learning_rate": 3.708044571273271e-05, + "loss": 4.1097, + "step": 74115 + }, + { + "epoch": 5.036010327490148, + "grad_norm": 0.17281943559646606, + "learning_rate": 3.707619921184944e-05, + "loss": 3.57, + "step": 74120 + }, + { + "epoch": 5.03635004756081, + "grad_norm": 0.16365009546279907, + "learning_rate": 3.7071952710966166e-05, + "loss": 3.9078, + "step": 74125 + }, + { + "epoch": 5.036689767631471, + "grad_norm": 0.17021900415420532, + "learning_rate": 3.7067706210082894e-05, + "loss": 3.9409, + "step": 74130 + }, + { + "epoch": 5.037029487702133, + "grad_norm": 0.22381317615509033, + "learning_rate": 3.706345970919962e-05, + "loss": 3.8087, + "step": 74135 + }, + { + "epoch": 5.037369207772795, + "grad_norm": 0.2866438329219818, + "learning_rate": 3.705921320831635e-05, + "loss": 3.853, + "step": 74140 + }, + { + "epoch": 5.037708927843457, + "grad_norm": 0.1574110984802246, + "learning_rate": 3.705496670743308e-05, + "loss": 3.8922, + "step": 74145 + }, + { + "epoch": 5.038048647914119, + "grad_norm": 0.142888605594635, + "learning_rate": 3.7050720206549806e-05, + "loss": 3.7815, + "step": 74150 + }, + { + "epoch": 5.038388367984781, + "grad_norm": 0.164756640791893, + "learning_rate": 3.7046473705666534e-05, + "loss": 3.8999, + "step": 74155 + }, + { + "epoch": 5.038728088055442, + "grad_norm": 0.31339937448501587, + "learning_rate": 3.7042227204783256e-05, + "loss": 3.67, + "step": 74160 + }, + { + "epoch": 5.039067808126104, + "grad_norm": 0.2499346137046814, + "learning_rate": 3.703798070389999e-05, + "loss": 4.173, + "step": 74165 + }, + { + "epoch": 5.039407528196766, + "grad_norm": 0.21323613822460175, + "learning_rate": 3.703373420301672e-05, + "loss": 3.9051, + "step": 74170 + }, + { + "epoch": 5.039747248267427, + "grad_norm": 0.3724031448364258, + "learning_rate": 3.702948770213344e-05, + "loss": 4.1983, + "step": 74175 + }, + { + "epoch": 5.040086968338089, + "grad_norm": 0.27175965905189514, + "learning_rate": 3.7025241201250174e-05, + "loss": 3.7032, + "step": 74180 + }, + { + "epoch": 5.040426688408751, + "grad_norm": 0.18447384238243103, + "learning_rate": 3.70209947003669e-05, + "loss": 3.875, + "step": 74185 + }, + { + "epoch": 5.040766408479413, + "grad_norm": 0.16432438790798187, + "learning_rate": 3.7016748199483624e-05, + "loss": 3.8544, + "step": 74190 + }, + { + "epoch": 5.041106128550075, + "grad_norm": 0.18884146213531494, + "learning_rate": 3.701250169860035e-05, + "loss": 3.8281, + "step": 74195 + }, + { + "epoch": 5.041445848620737, + "grad_norm": 0.19399921596050262, + "learning_rate": 3.7008255197717086e-05, + "loss": 4.0045, + "step": 74200 + }, + { + "epoch": 5.041785568691398, + "grad_norm": 0.1619730442762375, + "learning_rate": 3.700400869683381e-05, + "loss": 3.6514, + "step": 74205 + }, + { + "epoch": 5.04212528876206, + "grad_norm": 0.2012683004140854, + "learning_rate": 3.6999762195950536e-05, + "loss": 4.1196, + "step": 74210 + }, + { + "epoch": 5.042465008832722, + "grad_norm": 0.8247766494750977, + "learning_rate": 3.699551569506727e-05, + "loss": 3.7907, + "step": 74215 + }, + { + "epoch": 5.042804728903383, + "grad_norm": 0.1630246490240097, + "learning_rate": 3.699126919418399e-05, + "loss": 4.0289, + "step": 74220 + }, + { + "epoch": 5.043144448974045, + "grad_norm": 0.17740099132061005, + "learning_rate": 3.698702269330072e-05, + "loss": 3.8948, + "step": 74225 + }, + { + "epoch": 5.043484169044707, + "grad_norm": 0.21360312402248383, + "learning_rate": 3.698277619241745e-05, + "loss": 3.9597, + "step": 74230 + }, + { + "epoch": 5.043823889115369, + "grad_norm": 4.468254089355469, + "learning_rate": 3.6978529691534176e-05, + "loss": 3.6471, + "step": 74235 + }, + { + "epoch": 5.044163609186031, + "grad_norm": 0.4313063621520996, + "learning_rate": 3.6974283190650904e-05, + "loss": 3.5436, + "step": 74240 + }, + { + "epoch": 5.044503329256693, + "grad_norm": 0.20848535001277924, + "learning_rate": 3.697003668976763e-05, + "loss": 3.8267, + "step": 74245 + }, + { + "epoch": 5.044843049327354, + "grad_norm": 0.23327480256557465, + "learning_rate": 3.696579018888436e-05, + "loss": 3.7606, + "step": 74250 + }, + { + "epoch": 5.045182769398016, + "grad_norm": 0.719753086566925, + "learning_rate": 3.696154368800109e-05, + "loss": 3.9159, + "step": 74255 + }, + { + "epoch": 5.045522489468678, + "grad_norm": 0.40987929701805115, + "learning_rate": 3.6957297187117816e-05, + "loss": 3.8744, + "step": 74260 + }, + { + "epoch": 5.045862209539339, + "grad_norm": 0.1857329159975052, + "learning_rate": 3.6953050686234544e-05, + "loss": 3.9346, + "step": 74265 + }, + { + "epoch": 5.046201929610001, + "grad_norm": 0.14926402270793915, + "learning_rate": 3.694880418535127e-05, + "loss": 3.836, + "step": 74270 + }, + { + "epoch": 5.046541649680663, + "grad_norm": 0.2241513729095459, + "learning_rate": 3.6944557684468e-05, + "loss": 3.8368, + "step": 74275 + }, + { + "epoch": 5.046881369751325, + "grad_norm": 0.2728757858276367, + "learning_rate": 3.694031118358473e-05, + "loss": 3.6247, + "step": 74280 + }, + { + "epoch": 5.047221089821987, + "grad_norm": 0.16027338802814484, + "learning_rate": 3.6936064682701456e-05, + "loss": 3.7537, + "step": 74285 + }, + { + "epoch": 5.047560809892649, + "grad_norm": 0.16886956989765167, + "learning_rate": 3.6931818181818184e-05, + "loss": 3.8546, + "step": 74290 + }, + { + "epoch": 5.04790052996331, + "grad_norm": 0.28612229228019714, + "learning_rate": 3.692757168093491e-05, + "loss": 3.6449, + "step": 74295 + }, + { + "epoch": 5.048240250033972, + "grad_norm": 0.2129206359386444, + "learning_rate": 3.692332518005164e-05, + "loss": 4.0421, + "step": 74300 + }, + { + "epoch": 5.048579970104634, + "grad_norm": 0.2131652981042862, + "learning_rate": 3.691907867916837e-05, + "loss": 3.9187, + "step": 74305 + }, + { + "epoch": 5.048919690175295, + "grad_norm": 0.17033381760120392, + "learning_rate": 3.6914832178285096e-05, + "loss": 4.2775, + "step": 74310 + }, + { + "epoch": 5.049259410245957, + "grad_norm": 0.19506843388080597, + "learning_rate": 3.6910585677401824e-05, + "loss": 3.9417, + "step": 74315 + }, + { + "epoch": 5.0495991303166194, + "grad_norm": 0.15703622996807098, + "learning_rate": 3.690633917651855e-05, + "loss": 4.0172, + "step": 74320 + }, + { + "epoch": 5.049938850387281, + "grad_norm": 0.14131657779216766, + "learning_rate": 3.690209267563528e-05, + "loss": 3.8017, + "step": 74325 + }, + { + "epoch": 5.050278570457943, + "grad_norm": 0.15495119988918304, + "learning_rate": 3.6897846174752e-05, + "loss": 3.6312, + "step": 74330 + }, + { + "epoch": 5.050618290528605, + "grad_norm": 0.18291564285755157, + "learning_rate": 3.6893599673868736e-05, + "loss": 3.9576, + "step": 74335 + }, + { + "epoch": 5.050958010599266, + "grad_norm": 0.2967507839202881, + "learning_rate": 3.6889353172985464e-05, + "loss": 3.8298, + "step": 74340 + }, + { + "epoch": 5.051297730669928, + "grad_norm": 0.17335836589336395, + "learning_rate": 3.6885106672102185e-05, + "loss": 3.5346, + "step": 74345 + }, + { + "epoch": 5.05163745074059, + "grad_norm": 0.15357853472232819, + "learning_rate": 3.688086017121892e-05, + "loss": 3.9105, + "step": 74350 + }, + { + "epoch": 5.051977170811251, + "grad_norm": 0.14244180917739868, + "learning_rate": 3.687661367033565e-05, + "loss": 3.9064, + "step": 74355 + }, + { + "epoch": 5.052316890881913, + "grad_norm": 0.17836163938045502, + "learning_rate": 3.687236716945237e-05, + "loss": 3.7689, + "step": 74360 + }, + { + "epoch": 5.0526566109525755, + "grad_norm": 0.1572222262620926, + "learning_rate": 3.68681206685691e-05, + "loss": 3.6992, + "step": 74365 + }, + { + "epoch": 5.052996331023237, + "grad_norm": 0.19291211664676666, + "learning_rate": 3.686387416768583e-05, + "loss": 3.9019, + "step": 74370 + }, + { + "epoch": 5.053336051093899, + "grad_norm": 0.18459506332874298, + "learning_rate": 3.685962766680255e-05, + "loss": 3.7291, + "step": 74375 + }, + { + "epoch": 5.053675771164561, + "grad_norm": 0.1702875941991806, + "learning_rate": 3.685538116591928e-05, + "loss": 3.6508, + "step": 74380 + }, + { + "epoch": 5.054015491235222, + "grad_norm": 0.2219318002462387, + "learning_rate": 3.6851134665036016e-05, + "loss": 3.866, + "step": 74385 + }, + { + "epoch": 5.054355211305884, + "grad_norm": 0.4823443293571472, + "learning_rate": 3.684688816415274e-05, + "loss": 4.1204, + "step": 74390 + }, + { + "epoch": 5.054694931376546, + "grad_norm": 0.1463116556406021, + "learning_rate": 3.6842641663269465e-05, + "loss": 3.6827, + "step": 74395 + }, + { + "epoch": 5.055034651447207, + "grad_norm": 0.19482439756393433, + "learning_rate": 3.68383951623862e-05, + "loss": 3.7267, + "step": 74400 + }, + { + "epoch": 5.055374371517869, + "grad_norm": 0.17142251133918762, + "learning_rate": 3.683414866150292e-05, + "loss": 3.9073, + "step": 74405 + }, + { + "epoch": 5.0557140915885315, + "grad_norm": 0.16788409650325775, + "learning_rate": 3.682990216061965e-05, + "loss": 3.6678, + "step": 74410 + }, + { + "epoch": 5.056053811659193, + "grad_norm": 0.16760596632957458, + "learning_rate": 3.682565565973638e-05, + "loss": 3.9957, + "step": 74415 + }, + { + "epoch": 5.056393531729855, + "grad_norm": 0.18272174894809723, + "learning_rate": 3.6821409158853105e-05, + "loss": 3.9319, + "step": 74420 + }, + { + "epoch": 5.056733251800517, + "grad_norm": 0.14605538547039032, + "learning_rate": 3.681716265796983e-05, + "loss": 4.1651, + "step": 74425 + }, + { + "epoch": 5.057072971871178, + "grad_norm": 0.15154455602169037, + "learning_rate": 3.681291615708656e-05, + "loss": 3.8646, + "step": 74430 + }, + { + "epoch": 5.05741269194184, + "grad_norm": 0.22951047122478485, + "learning_rate": 3.6808669656203296e-05, + "loss": 3.7402, + "step": 74435 + }, + { + "epoch": 5.057752412012501, + "grad_norm": 0.15791112184524536, + "learning_rate": 3.680442315532002e-05, + "loss": 3.9176, + "step": 74440 + }, + { + "epoch": 5.058092132083163, + "grad_norm": 0.21939300000667572, + "learning_rate": 3.6800176654436745e-05, + "loss": 3.9057, + "step": 74445 + }, + { + "epoch": 5.058431852153825, + "grad_norm": 0.1943541318178177, + "learning_rate": 3.679593015355347e-05, + "loss": 3.8046, + "step": 74450 + }, + { + "epoch": 5.058771572224487, + "grad_norm": 0.17041495442390442, + "learning_rate": 3.67916836526702e-05, + "loss": 3.8173, + "step": 74455 + }, + { + "epoch": 5.059111292295149, + "grad_norm": 0.18261657655239105, + "learning_rate": 3.678743715178693e-05, + "loss": 3.7963, + "step": 74460 + }, + { + "epoch": 5.059451012365811, + "grad_norm": 0.21512623131275177, + "learning_rate": 3.678319065090366e-05, + "loss": 3.8194, + "step": 74465 + }, + { + "epoch": 5.059790732436472, + "grad_norm": 0.2205275446176529, + "learning_rate": 3.6778944150020385e-05, + "loss": 3.5939, + "step": 74470 + }, + { + "epoch": 5.060130452507134, + "grad_norm": 0.17345255613327026, + "learning_rate": 3.677469764913711e-05, + "loss": 3.8032, + "step": 74475 + }, + { + "epoch": 5.060470172577796, + "grad_norm": 0.2933901846408844, + "learning_rate": 3.677045114825384e-05, + "loss": 4.0298, + "step": 74480 + }, + { + "epoch": 5.060809892648457, + "grad_norm": 0.16910618543624878, + "learning_rate": 3.676620464737057e-05, + "loss": 3.935, + "step": 74485 + }, + { + "epoch": 5.061149612719119, + "grad_norm": 0.19691352546215057, + "learning_rate": 3.67619581464873e-05, + "loss": 3.7788, + "step": 74490 + }, + { + "epoch": 5.061489332789781, + "grad_norm": 0.2606130838394165, + "learning_rate": 3.6757711645604025e-05, + "loss": 3.9213, + "step": 74495 + }, + { + "epoch": 5.061829052860443, + "grad_norm": 0.20132921636104584, + "learning_rate": 3.675346514472075e-05, + "loss": 3.9805, + "step": 74500 + }, + { + "epoch": 5.062168772931105, + "grad_norm": 0.1478571593761444, + "learning_rate": 3.674921864383748e-05, + "loss": 3.7141, + "step": 74505 + }, + { + "epoch": 5.062508493001767, + "grad_norm": 0.18990351259708405, + "learning_rate": 3.674497214295421e-05, + "loss": 3.8389, + "step": 74510 + }, + { + "epoch": 5.062848213072428, + "grad_norm": 0.1656920313835144, + "learning_rate": 3.674072564207093e-05, + "loss": 3.8491, + "step": 74515 + }, + { + "epoch": 5.06318793314309, + "grad_norm": 0.17712509632110596, + "learning_rate": 3.6736479141187665e-05, + "loss": 3.8301, + "step": 74520 + }, + { + "epoch": 5.063527653213752, + "grad_norm": 0.1642158180475235, + "learning_rate": 3.6732232640304393e-05, + "loss": 3.6241, + "step": 74525 + }, + { + "epoch": 5.063867373284413, + "grad_norm": 0.1901341825723648, + "learning_rate": 3.6727986139421115e-05, + "loss": 4.0789, + "step": 74530 + }, + { + "epoch": 5.064207093355075, + "grad_norm": 0.1732434183359146, + "learning_rate": 3.672373963853785e-05, + "loss": 3.8394, + "step": 74535 + }, + { + "epoch": 5.064546813425737, + "grad_norm": 3.0560808181762695, + "learning_rate": 3.671949313765458e-05, + "loss": 3.8498, + "step": 74540 + }, + { + "epoch": 5.064886533496399, + "grad_norm": 0.18047146499156952, + "learning_rate": 3.67152466367713e-05, + "loss": 3.8772, + "step": 74545 + }, + { + "epoch": 5.065226253567061, + "grad_norm": 0.1697842925786972, + "learning_rate": 3.671100013588803e-05, + "loss": 3.6914, + "step": 74550 + }, + { + "epoch": 5.065565973637723, + "grad_norm": 0.24948149919509888, + "learning_rate": 3.670675363500476e-05, + "loss": 3.8512, + "step": 74555 + }, + { + "epoch": 5.065905693708384, + "grad_norm": 0.16301825642585754, + "learning_rate": 3.670250713412148e-05, + "loss": 3.7608, + "step": 74560 + }, + { + "epoch": 5.066245413779046, + "grad_norm": 0.7514474987983704, + "learning_rate": 3.669826063323821e-05, + "loss": 3.8748, + "step": 74565 + }, + { + "epoch": 5.066585133849708, + "grad_norm": 0.1969538778066635, + "learning_rate": 3.6694014132354945e-05, + "loss": 3.8751, + "step": 74570 + }, + { + "epoch": 5.066924853920369, + "grad_norm": 0.19219446182250977, + "learning_rate": 3.668976763147167e-05, + "loss": 4.1067, + "step": 74575 + }, + { + "epoch": 5.067264573991031, + "grad_norm": 0.6246183514595032, + "learning_rate": 3.6685521130588395e-05, + "loss": 3.6549, + "step": 74580 + }, + { + "epoch": 5.067604294061693, + "grad_norm": 0.19268469512462616, + "learning_rate": 3.668127462970512e-05, + "loss": 3.9944, + "step": 74585 + }, + { + "epoch": 5.067944014132355, + "grad_norm": 0.18279139697551727, + "learning_rate": 3.667702812882185e-05, + "loss": 3.866, + "step": 74590 + }, + { + "epoch": 5.068283734203017, + "grad_norm": 0.43641969561576843, + "learning_rate": 3.667278162793858e-05, + "loss": 3.9675, + "step": 74595 + }, + { + "epoch": 5.068623454273679, + "grad_norm": 0.19936789572238922, + "learning_rate": 3.666853512705531e-05, + "loss": 3.9759, + "step": 74600 + }, + { + "epoch": 5.06896317434434, + "grad_norm": 0.15813802182674408, + "learning_rate": 3.666428862617204e-05, + "loss": 3.9184, + "step": 74605 + }, + { + "epoch": 5.069302894415002, + "grad_norm": 0.1922391951084137, + "learning_rate": 3.666004212528876e-05, + "loss": 4.1118, + "step": 74610 + }, + { + "epoch": 5.069642614485664, + "grad_norm": 0.1573927402496338, + "learning_rate": 3.665579562440549e-05, + "loss": 3.8978, + "step": 74615 + }, + { + "epoch": 5.069982334556325, + "grad_norm": 0.1476246416568756, + "learning_rate": 3.665154912352222e-05, + "loss": 3.9436, + "step": 74620 + }, + { + "epoch": 5.070322054626987, + "grad_norm": 0.21032865345478058, + "learning_rate": 3.664730262263895e-05, + "loss": 3.6288, + "step": 74625 + }, + { + "epoch": 5.0706617746976494, + "grad_norm": 0.21663835644721985, + "learning_rate": 3.6643056121755675e-05, + "loss": 3.7202, + "step": 74630 + }, + { + "epoch": 5.071001494768311, + "grad_norm": 0.24193193018436432, + "learning_rate": 3.66388096208724e-05, + "loss": 3.9216, + "step": 74635 + }, + { + "epoch": 5.071341214838973, + "grad_norm": 0.16079215705394745, + "learning_rate": 3.663456311998913e-05, + "loss": 3.9166, + "step": 74640 + }, + { + "epoch": 5.071680934909635, + "grad_norm": 0.19334721565246582, + "learning_rate": 3.663031661910586e-05, + "loss": 3.7461, + "step": 74645 + }, + { + "epoch": 5.072020654980296, + "grad_norm": 0.2703067362308502, + "learning_rate": 3.662607011822259e-05, + "loss": 3.6928, + "step": 74650 + }, + { + "epoch": 5.072360375050958, + "grad_norm": 0.15479625761508942, + "learning_rate": 3.6621823617339315e-05, + "loss": 3.7916, + "step": 74655 + }, + { + "epoch": 5.07270009512162, + "grad_norm": 0.28362515568733215, + "learning_rate": 3.661757711645604e-05, + "loss": 3.8433, + "step": 74660 + }, + { + "epoch": 5.073039815192281, + "grad_norm": 0.13643264770507812, + "learning_rate": 3.661333061557277e-05, + "loss": 4.0033, + "step": 74665 + }, + { + "epoch": 5.073379535262943, + "grad_norm": 0.31266701221466064, + "learning_rate": 3.66090841146895e-05, + "loss": 4.1987, + "step": 74670 + }, + { + "epoch": 5.0737192553336055, + "grad_norm": 0.20111921429634094, + "learning_rate": 3.660483761380623e-05, + "loss": 3.6489, + "step": 74675 + }, + { + "epoch": 5.074058975404267, + "grad_norm": 0.2700046896934509, + "learning_rate": 3.6600591112922955e-05, + "loss": 3.7144, + "step": 74680 + }, + { + "epoch": 5.074398695474929, + "grad_norm": 0.16820628941059113, + "learning_rate": 3.6596344612039676e-05, + "loss": 4.0164, + "step": 74685 + }, + { + "epoch": 5.074738415545591, + "grad_norm": 0.6190727949142456, + "learning_rate": 3.659209811115641e-05, + "loss": 3.5643, + "step": 74690 + }, + { + "epoch": 5.075078135616252, + "grad_norm": 0.4745712876319885, + "learning_rate": 3.658785161027314e-05, + "loss": 3.8221, + "step": 74695 + }, + { + "epoch": 5.075417855686914, + "grad_norm": 0.14082489907741547, + "learning_rate": 3.658360510938986e-05, + "loss": 3.686, + "step": 74700 + }, + { + "epoch": 5.075757575757576, + "grad_norm": 1.458511471748352, + "learning_rate": 3.6579358608506595e-05, + "loss": 3.721, + "step": 74705 + }, + { + "epoch": 5.076097295828237, + "grad_norm": 0.19208507239818573, + "learning_rate": 3.657511210762332e-05, + "loss": 3.8737, + "step": 74710 + }, + { + "epoch": 5.076437015898899, + "grad_norm": 0.39479607343673706, + "learning_rate": 3.6570865606740044e-05, + "loss": 4.0061, + "step": 74715 + }, + { + "epoch": 5.0767767359695615, + "grad_norm": 0.18022549152374268, + "learning_rate": 3.656661910585677e-05, + "loss": 3.8598, + "step": 74720 + }, + { + "epoch": 5.077116456040223, + "grad_norm": 0.20691053569316864, + "learning_rate": 3.656237260497351e-05, + "loss": 3.8983, + "step": 74725 + }, + { + "epoch": 5.077456176110885, + "grad_norm": 0.17227615416049957, + "learning_rate": 3.655812610409023e-05, + "loss": 4.0217, + "step": 74730 + }, + { + "epoch": 5.077795896181547, + "grad_norm": 0.17249394953250885, + "learning_rate": 3.6553879603206956e-05, + "loss": 3.7752, + "step": 74735 + }, + { + "epoch": 5.078135616252208, + "grad_norm": 0.3049830198287964, + "learning_rate": 3.654963310232369e-05, + "loss": 3.8458, + "step": 74740 + }, + { + "epoch": 5.07847533632287, + "grad_norm": 0.1760198175907135, + "learning_rate": 3.654538660144041e-05, + "loss": 3.8168, + "step": 74745 + }, + { + "epoch": 5.078815056393532, + "grad_norm": 0.16991157829761505, + "learning_rate": 3.654114010055714e-05, + "loss": 3.8804, + "step": 74750 + }, + { + "epoch": 5.079154776464193, + "grad_norm": 0.153843492269516, + "learning_rate": 3.653689359967387e-05, + "loss": 3.9504, + "step": 74755 + }, + { + "epoch": 5.079494496534855, + "grad_norm": 0.4298514425754547, + "learning_rate": 3.6532647098790596e-05, + "loss": 3.8545, + "step": 74760 + }, + { + "epoch": 5.0798342166055175, + "grad_norm": 2.8060085773468018, + "learning_rate": 3.6528400597907324e-05, + "loss": 3.9315, + "step": 74765 + }, + { + "epoch": 5.080173936676179, + "grad_norm": 0.17041195929050446, + "learning_rate": 3.652415409702405e-05, + "loss": 3.946, + "step": 74770 + }, + { + "epoch": 5.080513656746841, + "grad_norm": 0.20859216153621674, + "learning_rate": 3.651990759614079e-05, + "loss": 3.9844, + "step": 74775 + }, + { + "epoch": 5.080853376817503, + "grad_norm": 0.17544704675674438, + "learning_rate": 3.651566109525751e-05, + "loss": 3.7815, + "step": 74780 + }, + { + "epoch": 5.081193096888164, + "grad_norm": 0.3323117792606354, + "learning_rate": 3.6511414594374236e-05, + "loss": 3.7786, + "step": 74785 + }, + { + "epoch": 5.081532816958826, + "grad_norm": 0.20409691333770752, + "learning_rate": 3.650716809349097e-05, + "loss": 3.7824, + "step": 74790 + }, + { + "epoch": 5.081872537029487, + "grad_norm": 0.1868184357881546, + "learning_rate": 3.650292159260769e-05, + "loss": 3.7151, + "step": 74795 + }, + { + "epoch": 5.082212257100149, + "grad_norm": 0.15394817292690277, + "learning_rate": 3.649867509172442e-05, + "loss": 3.7614, + "step": 74800 + }, + { + "epoch": 5.082551977170811, + "grad_norm": 0.15545962750911713, + "learning_rate": 3.649442859084115e-05, + "loss": 3.7134, + "step": 74805 + }, + { + "epoch": 5.082891697241473, + "grad_norm": 0.18421779572963715, + "learning_rate": 3.6490182089957876e-05, + "loss": 3.8233, + "step": 74810 + }, + { + "epoch": 5.083231417312135, + "grad_norm": 0.19194859266281128, + "learning_rate": 3.6485935589074604e-05, + "loss": 3.978, + "step": 74815 + }, + { + "epoch": 5.083571137382797, + "grad_norm": 0.1557818204164505, + "learning_rate": 3.648168908819133e-05, + "loss": 3.8016, + "step": 74820 + }, + { + "epoch": 5.083910857453458, + "grad_norm": 0.17065152525901794, + "learning_rate": 3.647744258730806e-05, + "loss": 4.072, + "step": 74825 + }, + { + "epoch": 5.08425057752412, + "grad_norm": 0.411484032869339, + "learning_rate": 3.647319608642479e-05, + "loss": 3.8541, + "step": 74830 + }, + { + "epoch": 5.084590297594782, + "grad_norm": 0.16425442695617676, + "learning_rate": 3.6468949585541516e-05, + "loss": 3.858, + "step": 74835 + }, + { + "epoch": 5.084930017665443, + "grad_norm": 0.20631146430969238, + "learning_rate": 3.6464703084658244e-05, + "loss": 3.7614, + "step": 74840 + }, + { + "epoch": 5.085269737736105, + "grad_norm": 0.2399131804704666, + "learning_rate": 3.646045658377497e-05, + "loss": 4.0954, + "step": 74845 + }, + { + "epoch": 5.085609457806767, + "grad_norm": 0.1993013322353363, + "learning_rate": 3.64562100828917e-05, + "loss": 3.9695, + "step": 74850 + }, + { + "epoch": 5.085949177877429, + "grad_norm": 0.18416887521743774, + "learning_rate": 3.645196358200842e-05, + "loss": 3.5985, + "step": 74855 + }, + { + "epoch": 5.086288897948091, + "grad_norm": 0.1553763896226883, + "learning_rate": 3.6447717081125156e-05, + "loss": 3.8137, + "step": 74860 + }, + { + "epoch": 5.086628618018753, + "grad_norm": 0.15636225044727325, + "learning_rate": 3.6443470580241884e-05, + "loss": 3.8087, + "step": 74865 + }, + { + "epoch": 5.086968338089414, + "grad_norm": 0.21097925305366516, + "learning_rate": 3.6439224079358606e-05, + "loss": 3.6538, + "step": 74870 + }, + { + "epoch": 5.087308058160076, + "grad_norm": 0.2780452072620392, + "learning_rate": 3.643497757847534e-05, + "loss": 3.7144, + "step": 74875 + }, + { + "epoch": 5.087647778230738, + "grad_norm": 0.13984538614749908, + "learning_rate": 3.643073107759207e-05, + "loss": 4.0441, + "step": 74880 + }, + { + "epoch": 5.087987498301399, + "grad_norm": 0.17615655064582825, + "learning_rate": 3.642648457670879e-05, + "loss": 3.7214, + "step": 74885 + }, + { + "epoch": 5.088327218372061, + "grad_norm": 0.18656645715236664, + "learning_rate": 3.6422238075825524e-05, + "loss": 3.7824, + "step": 74890 + }, + { + "epoch": 5.088666938442723, + "grad_norm": 3.942915439605713, + "learning_rate": 3.641799157494225e-05, + "loss": 3.6985, + "step": 74895 + }, + { + "epoch": 5.089006658513385, + "grad_norm": 0.6066412925720215, + "learning_rate": 3.6413745074058974e-05, + "loss": 3.9753, + "step": 74900 + }, + { + "epoch": 5.089346378584047, + "grad_norm": 0.15031856298446655, + "learning_rate": 3.64094985731757e-05, + "loss": 3.7513, + "step": 74905 + }, + { + "epoch": 5.089686098654709, + "grad_norm": 0.17403019964694977, + "learning_rate": 3.6405252072292436e-05, + "loss": 3.7175, + "step": 74910 + }, + { + "epoch": 5.09002581872537, + "grad_norm": 0.14771047234535217, + "learning_rate": 3.640100557140916e-05, + "loss": 3.6841, + "step": 74915 + }, + { + "epoch": 5.090365538796032, + "grad_norm": 1.0823311805725098, + "learning_rate": 3.6396759070525886e-05, + "loss": 3.8387, + "step": 74920 + }, + { + "epoch": 5.090705258866694, + "grad_norm": 0.179420605301857, + "learning_rate": 3.639251256964262e-05, + "loss": 3.7722, + "step": 74925 + }, + { + "epoch": 5.091044978937355, + "grad_norm": 0.20004193484783173, + "learning_rate": 3.638826606875934e-05, + "loss": 3.657, + "step": 74930 + }, + { + "epoch": 5.091384699008017, + "grad_norm": 0.480800986289978, + "learning_rate": 3.638401956787607e-05, + "loss": 3.8549, + "step": 74935 + }, + { + "epoch": 5.0917244190786795, + "grad_norm": 0.1750502586364746, + "learning_rate": 3.63797730669928e-05, + "loss": 4.0158, + "step": 74940 + }, + { + "epoch": 5.092064139149341, + "grad_norm": 0.4537420868873596, + "learning_rate": 3.637552656610953e-05, + "loss": 3.8253, + "step": 74945 + }, + { + "epoch": 5.092403859220003, + "grad_norm": 0.33049917221069336, + "learning_rate": 3.6371280065226254e-05, + "loss": 3.6034, + "step": 74950 + }, + { + "epoch": 5.092743579290665, + "grad_norm": 0.17674916982650757, + "learning_rate": 3.636703356434298e-05, + "loss": 3.937, + "step": 74955 + }, + { + "epoch": 5.093083299361326, + "grad_norm": 0.15661711990833282, + "learning_rate": 3.6362787063459717e-05, + "loss": 3.888, + "step": 74960 + }, + { + "epoch": 5.093423019431988, + "grad_norm": 0.1327143758535385, + "learning_rate": 3.635854056257644e-05, + "loss": 3.9844, + "step": 74965 + }, + { + "epoch": 5.09376273950265, + "grad_norm": 0.151447594165802, + "learning_rate": 3.6354294061693166e-05, + "loss": 3.965, + "step": 74970 + }, + { + "epoch": 5.094102459573311, + "grad_norm": 0.20247291028499603, + "learning_rate": 3.6350047560809894e-05, + "loss": 3.9556, + "step": 74975 + }, + { + "epoch": 5.094442179643973, + "grad_norm": 0.17961382865905762, + "learning_rate": 3.634580105992662e-05, + "loss": 3.6326, + "step": 74980 + }, + { + "epoch": 5.0947818997146355, + "grad_norm": 0.2008001208305359, + "learning_rate": 3.634155455904335e-05, + "loss": 3.8459, + "step": 74985 + }, + { + "epoch": 5.095121619785297, + "grad_norm": 2.2962453365325928, + "learning_rate": 3.633730805816008e-05, + "loss": 3.9165, + "step": 74990 + }, + { + "epoch": 5.095461339855959, + "grad_norm": 0.12677687406539917, + "learning_rate": 3.6333061557276806e-05, + "loss": 4.0698, + "step": 74995 + }, + { + "epoch": 5.095801059926621, + "grad_norm": 0.2067876160144806, + "learning_rate": 3.6328815056393534e-05, + "loss": 3.9116, + "step": 75000 + }, + { + "epoch": 5.096140779997282, + "grad_norm": 0.16215375065803528, + "learning_rate": 3.632456855551026e-05, + "loss": 3.7016, + "step": 75005 + }, + { + "epoch": 5.096480500067944, + "grad_norm": 0.3892051577568054, + "learning_rate": 3.632032205462699e-05, + "loss": 3.9101, + "step": 75010 + }, + { + "epoch": 5.096820220138606, + "grad_norm": 0.15051206946372986, + "learning_rate": 3.631607555374372e-05, + "loss": 3.9909, + "step": 75015 + }, + { + "epoch": 5.097159940209267, + "grad_norm": 0.25411224365234375, + "learning_rate": 3.6311829052860446e-05, + "loss": 3.9055, + "step": 75020 + }, + { + "epoch": 5.097499660279929, + "grad_norm": 0.18646010756492615, + "learning_rate": 3.6307582551977174e-05, + "loss": 3.884, + "step": 75025 + }, + { + "epoch": 5.0978393803505915, + "grad_norm": 1.696060299873352, + "learning_rate": 3.63033360510939e-05, + "loss": 3.8224, + "step": 75030 + }, + { + "epoch": 5.098179100421253, + "grad_norm": 0.5283030867576599, + "learning_rate": 3.629908955021063e-05, + "loss": 3.6597, + "step": 75035 + }, + { + "epoch": 5.098518820491915, + "grad_norm": 0.8336268663406372, + "learning_rate": 3.629484304932735e-05, + "loss": 3.7942, + "step": 75040 + }, + { + "epoch": 5.098858540562577, + "grad_norm": 0.14903289079666138, + "learning_rate": 3.6290596548444086e-05, + "loss": 3.8778, + "step": 75045 + }, + { + "epoch": 5.099198260633238, + "grad_norm": 0.20728042721748352, + "learning_rate": 3.6286350047560814e-05, + "loss": 3.7277, + "step": 75050 + }, + { + "epoch": 5.0995379807039, + "grad_norm": 0.21031130850315094, + "learning_rate": 3.6282103546677535e-05, + "loss": 3.7849, + "step": 75055 + }, + { + "epoch": 5.099877700774562, + "grad_norm": 0.16190290451049805, + "learning_rate": 3.627785704579427e-05, + "loss": 3.8146, + "step": 75060 + }, + { + "epoch": 5.100217420845223, + "grad_norm": 0.12857739627361298, + "learning_rate": 3.6273610544911e-05, + "loss": 3.7929, + "step": 75065 + }, + { + "epoch": 5.100557140915885, + "grad_norm": 0.16348440945148468, + "learning_rate": 3.626936404402772e-05, + "loss": 3.8758, + "step": 75070 + }, + { + "epoch": 5.1008968609865475, + "grad_norm": 0.2998165190219879, + "learning_rate": 3.626511754314445e-05, + "loss": 3.8712, + "step": 75075 + }, + { + "epoch": 5.101236581057209, + "grad_norm": 0.16245336830615997, + "learning_rate": 3.626087104226118e-05, + "loss": 3.8276, + "step": 75080 + }, + { + "epoch": 5.101576301127871, + "grad_norm": 0.2013658583164215, + "learning_rate": 3.62566245413779e-05, + "loss": 3.7775, + "step": 75085 + }, + { + "epoch": 5.101916021198533, + "grad_norm": 0.16703109443187714, + "learning_rate": 3.625237804049463e-05, + "loss": 3.9192, + "step": 75090 + }, + { + "epoch": 5.102255741269194, + "grad_norm": 0.22125765681266785, + "learning_rate": 3.6248131539611366e-05, + "loss": 3.8546, + "step": 75095 + }, + { + "epoch": 5.102595461339856, + "grad_norm": 0.16265571117401123, + "learning_rate": 3.624388503872809e-05, + "loss": 3.8482, + "step": 75100 + }, + { + "epoch": 5.102935181410518, + "grad_norm": 0.1915149688720703, + "learning_rate": 3.6239638537844815e-05, + "loss": 3.7979, + "step": 75105 + }, + { + "epoch": 5.103274901481179, + "grad_norm": 0.2088291198015213, + "learning_rate": 3.623539203696154e-05, + "loss": 3.7448, + "step": 75110 + }, + { + "epoch": 5.103614621551841, + "grad_norm": 0.1594468653202057, + "learning_rate": 3.623114553607828e-05, + "loss": 3.8462, + "step": 75115 + }, + { + "epoch": 5.103954341622503, + "grad_norm": 0.19216491281986237, + "learning_rate": 3.6226899035195e-05, + "loss": 3.7323, + "step": 75120 + }, + { + "epoch": 5.104294061693165, + "grad_norm": 0.1668006032705307, + "learning_rate": 3.622265253431173e-05, + "loss": 3.7567, + "step": 75125 + }, + { + "epoch": 5.104633781763827, + "grad_norm": 0.17570485174655914, + "learning_rate": 3.621840603342846e-05, + "loss": 3.8721, + "step": 75130 + }, + { + "epoch": 5.104973501834488, + "grad_norm": 0.21203091740608215, + "learning_rate": 3.621415953254518e-05, + "loss": 3.5761, + "step": 75135 + }, + { + "epoch": 5.10531322190515, + "grad_norm": 0.18535377085208893, + "learning_rate": 3.620991303166191e-05, + "loss": 3.5657, + "step": 75140 + }, + { + "epoch": 5.105652941975812, + "grad_norm": 0.23189227283000946, + "learning_rate": 3.620566653077864e-05, + "loss": 3.7063, + "step": 75145 + }, + { + "epoch": 5.105992662046473, + "grad_norm": 0.19548676908016205, + "learning_rate": 3.620142002989537e-05, + "loss": 3.8138, + "step": 75150 + }, + { + "epoch": 5.106332382117135, + "grad_norm": 0.17338328063488007, + "learning_rate": 3.6197173529012095e-05, + "loss": 3.8204, + "step": 75155 + }, + { + "epoch": 5.106672102187797, + "grad_norm": 0.19722719490528107, + "learning_rate": 3.619292702812882e-05, + "loss": 3.8463, + "step": 75160 + }, + { + "epoch": 5.107011822258459, + "grad_norm": 0.1828557401895523, + "learning_rate": 3.618868052724555e-05, + "loss": 4.0702, + "step": 75165 + }, + { + "epoch": 5.107351542329121, + "grad_norm": 0.15538664162158966, + "learning_rate": 3.618443402636228e-05, + "loss": 3.5114, + "step": 75170 + }, + { + "epoch": 5.107691262399783, + "grad_norm": 0.14318713545799255, + "learning_rate": 3.618018752547901e-05, + "loss": 3.8782, + "step": 75175 + }, + { + "epoch": 5.108030982470444, + "grad_norm": 0.2457353174686432, + "learning_rate": 3.6175941024595735e-05, + "loss": 3.6694, + "step": 75180 + }, + { + "epoch": 5.108370702541106, + "grad_norm": 0.19286318123340607, + "learning_rate": 3.617169452371246e-05, + "loss": 3.8728, + "step": 75185 + }, + { + "epoch": 5.108710422611768, + "grad_norm": 0.15769898891448975, + "learning_rate": 3.616744802282919e-05, + "loss": 3.7189, + "step": 75190 + }, + { + "epoch": 5.109050142682429, + "grad_norm": 0.19764024019241333, + "learning_rate": 3.616320152194592e-05, + "loss": 3.7707, + "step": 75195 + }, + { + "epoch": 5.109389862753091, + "grad_norm": 0.17613540589809418, + "learning_rate": 3.615895502106265e-05, + "loss": 3.7629, + "step": 75200 + }, + { + "epoch": 5.1097295828237534, + "grad_norm": 0.13867001235485077, + "learning_rate": 3.6154708520179375e-05, + "loss": 3.7445, + "step": 75205 + }, + { + "epoch": 5.110069302894415, + "grad_norm": 0.15916100144386292, + "learning_rate": 3.6150462019296097e-05, + "loss": 4.0874, + "step": 75210 + }, + { + "epoch": 5.110409022965077, + "grad_norm": 0.193494513630867, + "learning_rate": 3.614621551841283e-05, + "loss": 3.821, + "step": 75215 + }, + { + "epoch": 5.110748743035739, + "grad_norm": 0.17930090427398682, + "learning_rate": 3.614196901752956e-05, + "loss": 3.7986, + "step": 75220 + }, + { + "epoch": 5.1110884631064, + "grad_norm": 0.13676497340202332, + "learning_rate": 3.613772251664628e-05, + "loss": 3.9402, + "step": 75225 + }, + { + "epoch": 5.111428183177062, + "grad_norm": 0.20974698662757874, + "learning_rate": 3.6133476015763015e-05, + "loss": 3.9565, + "step": 75230 + }, + { + "epoch": 5.111767903247724, + "grad_norm": 0.42481887340545654, + "learning_rate": 3.6129229514879743e-05, + "loss": 3.8207, + "step": 75235 + }, + { + "epoch": 5.112107623318385, + "grad_norm": 0.19001005589962006, + "learning_rate": 3.6124983013996465e-05, + "loss": 3.8113, + "step": 75240 + }, + { + "epoch": 5.112447343389047, + "grad_norm": 0.1926811784505844, + "learning_rate": 3.612073651311319e-05, + "loss": 3.6875, + "step": 75245 + }, + { + "epoch": 5.1127870634597095, + "grad_norm": 0.2685335874557495, + "learning_rate": 3.611649001222993e-05, + "loss": 4.0692, + "step": 75250 + }, + { + "epoch": 5.113126783530371, + "grad_norm": 0.20354872941970825, + "learning_rate": 3.611224351134665e-05, + "loss": 3.6222, + "step": 75255 + }, + { + "epoch": 5.113466503601033, + "grad_norm": 0.17416515946388245, + "learning_rate": 3.610799701046338e-05, + "loss": 3.7595, + "step": 75260 + }, + { + "epoch": 5.113806223671695, + "grad_norm": 0.1659388691186905, + "learning_rate": 3.610375050958011e-05, + "loss": 3.8525, + "step": 75265 + }, + { + "epoch": 5.114145943742356, + "grad_norm": 0.14865557849407196, + "learning_rate": 3.609950400869683e-05, + "loss": 3.948, + "step": 75270 + }, + { + "epoch": 5.114485663813018, + "grad_norm": 0.27683812379837036, + "learning_rate": 3.609525750781356e-05, + "loss": 3.8396, + "step": 75275 + }, + { + "epoch": 5.11482538388368, + "grad_norm": 0.15704825520515442, + "learning_rate": 3.609101100693029e-05, + "loss": 3.8357, + "step": 75280 + }, + { + "epoch": 5.115165103954341, + "grad_norm": 0.19517675042152405, + "learning_rate": 3.6086764506047023e-05, + "loss": 3.9468, + "step": 75285 + }, + { + "epoch": 5.115504824025003, + "grad_norm": 0.19743052124977112, + "learning_rate": 3.6082518005163745e-05, + "loss": 3.6585, + "step": 75290 + }, + { + "epoch": 5.1158445440956655, + "grad_norm": 0.21763266623020172, + "learning_rate": 3.607827150428047e-05, + "loss": 3.8997, + "step": 75295 + }, + { + "epoch": 5.116184264166327, + "grad_norm": 0.2472202181816101, + "learning_rate": 3.607402500339721e-05, + "loss": 3.9868, + "step": 75300 + }, + { + "epoch": 5.116523984236989, + "grad_norm": 0.21556927263736725, + "learning_rate": 3.606977850251393e-05, + "loss": 3.6113, + "step": 75305 + }, + { + "epoch": 5.116863704307651, + "grad_norm": 0.19645507633686066, + "learning_rate": 3.606553200163066e-05, + "loss": 4.0251, + "step": 75310 + }, + { + "epoch": 5.117203424378312, + "grad_norm": 0.1936354786157608, + "learning_rate": 3.606128550074739e-05, + "loss": 3.8022, + "step": 75315 + }, + { + "epoch": 5.117543144448974, + "grad_norm": 0.14386625587940216, + "learning_rate": 3.605703899986411e-05, + "loss": 3.7681, + "step": 75320 + }, + { + "epoch": 5.117882864519636, + "grad_norm": 0.17091012001037598, + "learning_rate": 3.605279249898084e-05, + "loss": 3.8436, + "step": 75325 + }, + { + "epoch": 5.118222584590297, + "grad_norm": 0.2943861782550812, + "learning_rate": 3.604854599809757e-05, + "loss": 3.8305, + "step": 75330 + }, + { + "epoch": 5.118562304660959, + "grad_norm": 0.19922053813934326, + "learning_rate": 3.60442994972143e-05, + "loss": 3.8243, + "step": 75335 + }, + { + "epoch": 5.1189020247316215, + "grad_norm": 0.7355474233627319, + "learning_rate": 3.6040052996331025e-05, + "loss": 3.8285, + "step": 75340 + }, + { + "epoch": 5.119241744802283, + "grad_norm": 0.19379498064517975, + "learning_rate": 3.603580649544775e-05, + "loss": 3.751, + "step": 75345 + }, + { + "epoch": 5.119581464872945, + "grad_norm": 0.17716427147388458, + "learning_rate": 3.603155999456448e-05, + "loss": 4.1231, + "step": 75350 + }, + { + "epoch": 5.119921184943607, + "grad_norm": 0.1865861564874649, + "learning_rate": 3.602731349368121e-05, + "loss": 3.822, + "step": 75355 + }, + { + "epoch": 5.120260905014268, + "grad_norm": 0.15185818076133728, + "learning_rate": 3.602306699279794e-05, + "loss": 3.5972, + "step": 75360 + }, + { + "epoch": 5.12060062508493, + "grad_norm": 0.15347836911678314, + "learning_rate": 3.6018820491914665e-05, + "loss": 3.8441, + "step": 75365 + }, + { + "epoch": 5.120940345155592, + "grad_norm": 0.2982116937637329, + "learning_rate": 3.601457399103139e-05, + "loss": 3.9818, + "step": 75370 + }, + { + "epoch": 5.121280065226253, + "grad_norm": 0.20216324925422668, + "learning_rate": 3.601032749014812e-05, + "loss": 3.925, + "step": 75375 + }, + { + "epoch": 5.121619785296915, + "grad_norm": 0.17548026144504547, + "learning_rate": 3.600608098926484e-05, + "loss": 3.786, + "step": 75380 + }, + { + "epoch": 5.1219595053675775, + "grad_norm": 0.3202870786190033, + "learning_rate": 3.600183448838158e-05, + "loss": 3.5879, + "step": 75385 + }, + { + "epoch": 5.122299225438239, + "grad_norm": 0.13984636962413788, + "learning_rate": 3.5997587987498305e-05, + "loss": 3.5899, + "step": 75390 + }, + { + "epoch": 5.122638945508901, + "grad_norm": 0.17118193209171295, + "learning_rate": 3.5993341486615026e-05, + "loss": 3.7325, + "step": 75395 + }, + { + "epoch": 5.122978665579563, + "grad_norm": 0.1706131398677826, + "learning_rate": 3.598909498573176e-05, + "loss": 3.9167, + "step": 75400 + }, + { + "epoch": 5.123318385650224, + "grad_norm": 0.16265293955802917, + "learning_rate": 3.598484848484849e-05, + "loss": 3.8364, + "step": 75405 + }, + { + "epoch": 5.123658105720886, + "grad_norm": 0.1930200606584549, + "learning_rate": 3.598060198396521e-05, + "loss": 3.9487, + "step": 75410 + }, + { + "epoch": 5.123997825791548, + "grad_norm": 0.19035619497299194, + "learning_rate": 3.5976355483081945e-05, + "loss": 3.7253, + "step": 75415 + }, + { + "epoch": 5.124337545862209, + "grad_norm": 0.15790440142154694, + "learning_rate": 3.597210898219867e-05, + "loss": 3.7386, + "step": 75420 + }, + { + "epoch": 5.124677265932871, + "grad_norm": 0.2876945436000824, + "learning_rate": 3.5967862481315394e-05, + "loss": 3.6362, + "step": 75425 + }, + { + "epoch": 5.1250169860035335, + "grad_norm": 0.1559147834777832, + "learning_rate": 3.596361598043212e-05, + "loss": 3.9256, + "step": 75430 + }, + { + "epoch": 5.125356706074195, + "grad_norm": 0.1872844696044922, + "learning_rate": 3.595936947954886e-05, + "loss": 3.8282, + "step": 75435 + }, + { + "epoch": 5.125696426144857, + "grad_norm": 1.630262851715088, + "learning_rate": 3.595512297866558e-05, + "loss": 3.8938, + "step": 75440 + }, + { + "epoch": 5.126036146215519, + "grad_norm": 0.1559811234474182, + "learning_rate": 3.5950876477782306e-05, + "loss": 3.9745, + "step": 75445 + }, + { + "epoch": 5.12637586628618, + "grad_norm": 0.7061172127723694, + "learning_rate": 3.594662997689904e-05, + "loss": 3.9614, + "step": 75450 + }, + { + "epoch": 5.126715586356842, + "grad_norm": 0.1715691238641739, + "learning_rate": 3.594238347601577e-05, + "loss": 3.9508, + "step": 75455 + }, + { + "epoch": 5.127055306427504, + "grad_norm": 1.029101848602295, + "learning_rate": 3.593813697513249e-05, + "loss": 3.5798, + "step": 75460 + }, + { + "epoch": 5.127395026498165, + "grad_norm": 0.3052433729171753, + "learning_rate": 3.593389047424922e-05, + "loss": 3.7674, + "step": 75465 + }, + { + "epoch": 5.127734746568827, + "grad_norm": 0.17301377654075623, + "learning_rate": 3.592964397336595e-05, + "loss": 3.6693, + "step": 75470 + }, + { + "epoch": 5.1280744666394895, + "grad_norm": 0.17608892917633057, + "learning_rate": 3.5925397472482674e-05, + "loss": 3.4383, + "step": 75475 + }, + { + "epoch": 5.128414186710151, + "grad_norm": 0.1906675398349762, + "learning_rate": 3.59211509715994e-05, + "loss": 3.7472, + "step": 75480 + }, + { + "epoch": 5.128753906780813, + "grad_norm": 0.20825603604316711, + "learning_rate": 3.591690447071614e-05, + "loss": 3.6769, + "step": 75485 + }, + { + "epoch": 5.129093626851474, + "grad_norm": 0.1490364670753479, + "learning_rate": 3.591265796983286e-05, + "loss": 3.7843, + "step": 75490 + }, + { + "epoch": 5.129433346922136, + "grad_norm": 0.17603912949562073, + "learning_rate": 3.5908411468949586e-05, + "loss": 3.9656, + "step": 75495 + }, + { + "epoch": 5.129773066992798, + "grad_norm": 0.15724420547485352, + "learning_rate": 3.5904164968066314e-05, + "loss": 3.6451, + "step": 75500 + }, + { + "epoch": 5.130112787063459, + "grad_norm": 0.16166731715202332, + "learning_rate": 3.589991846718304e-05, + "loss": 3.798, + "step": 75505 + }, + { + "epoch": 5.130452507134121, + "grad_norm": 0.19806358218193054, + "learning_rate": 3.589567196629977e-05, + "loss": 3.8133, + "step": 75510 + }, + { + "epoch": 5.1307922272047835, + "grad_norm": 0.18270081281661987, + "learning_rate": 3.58914254654165e-05, + "loss": 3.9863, + "step": 75515 + }, + { + "epoch": 5.131131947275445, + "grad_norm": 0.18880075216293335, + "learning_rate": 3.5887178964533226e-05, + "loss": 3.7771, + "step": 75520 + }, + { + "epoch": 5.131471667346107, + "grad_norm": 0.23617017269134521, + "learning_rate": 3.5882932463649954e-05, + "loss": 3.607, + "step": 75525 + }, + { + "epoch": 5.131811387416769, + "grad_norm": 0.15048997104167938, + "learning_rate": 3.587868596276668e-05, + "loss": 3.7607, + "step": 75530 + }, + { + "epoch": 5.13215110748743, + "grad_norm": 0.15708687901496887, + "learning_rate": 3.587443946188341e-05, + "loss": 3.705, + "step": 75535 + }, + { + "epoch": 5.132490827558092, + "grad_norm": 0.18876004219055176, + "learning_rate": 3.587019296100014e-05, + "loss": 3.7854, + "step": 75540 + }, + { + "epoch": 5.132830547628754, + "grad_norm": 0.13901753723621368, + "learning_rate": 3.5865946460116866e-05, + "loss": 4.1707, + "step": 75545 + }, + { + "epoch": 5.133170267699415, + "grad_norm": 0.18575267493724823, + "learning_rate": 3.5861699959233594e-05, + "loss": 3.6161, + "step": 75550 + }, + { + "epoch": 5.133509987770077, + "grad_norm": 0.2589053809642792, + "learning_rate": 3.585745345835032e-05, + "loss": 3.7541, + "step": 75555 + }, + { + "epoch": 5.1338497078407395, + "grad_norm": 0.19501842558383942, + "learning_rate": 3.585320695746705e-05, + "loss": 3.9475, + "step": 75560 + }, + { + "epoch": 5.134189427911401, + "grad_norm": 0.15458190441131592, + "learning_rate": 3.584896045658377e-05, + "loss": 3.9435, + "step": 75565 + }, + { + "epoch": 5.134529147982063, + "grad_norm": 0.1732843965291977, + "learning_rate": 3.5844713955700506e-05, + "loss": 3.8365, + "step": 75570 + }, + { + "epoch": 5.134868868052725, + "grad_norm": 0.17991629242897034, + "learning_rate": 3.5840467454817234e-05, + "loss": 3.774, + "step": 75575 + }, + { + "epoch": 5.135208588123386, + "grad_norm": 0.21657030284404755, + "learning_rate": 3.5836220953933956e-05, + "loss": 3.799, + "step": 75580 + }, + { + "epoch": 5.135548308194048, + "grad_norm": 0.16050943732261658, + "learning_rate": 3.583197445305069e-05, + "loss": 4.0235, + "step": 75585 + }, + { + "epoch": 5.13588802826471, + "grad_norm": 0.1994692087173462, + "learning_rate": 3.582772795216742e-05, + "loss": 3.5228, + "step": 75590 + }, + { + "epoch": 5.136227748335371, + "grad_norm": 0.19538679718971252, + "learning_rate": 3.582348145128414e-05, + "loss": 3.8257, + "step": 75595 + }, + { + "epoch": 5.136567468406033, + "grad_norm": 0.24150121212005615, + "learning_rate": 3.581923495040087e-05, + "loss": 3.8158, + "step": 75600 + }, + { + "epoch": 5.1369071884766955, + "grad_norm": 0.18128183484077454, + "learning_rate": 3.58149884495176e-05, + "loss": 3.9922, + "step": 75605 + }, + { + "epoch": 5.137246908547357, + "grad_norm": 0.3602626919746399, + "learning_rate": 3.5810741948634324e-05, + "loss": 3.9068, + "step": 75610 + }, + { + "epoch": 5.137586628618019, + "grad_norm": 0.14568868279457092, + "learning_rate": 3.580649544775105e-05, + "loss": 3.8702, + "step": 75615 + }, + { + "epoch": 5.137926348688681, + "grad_norm": 0.38482627272605896, + "learning_rate": 3.5802248946867786e-05, + "loss": 4.1586, + "step": 75620 + }, + { + "epoch": 5.138266068759342, + "grad_norm": 0.19837358593940735, + "learning_rate": 3.5798002445984514e-05, + "loss": 4.0235, + "step": 75625 + }, + { + "epoch": 5.138605788830004, + "grad_norm": 0.2589569389820099, + "learning_rate": 3.5793755945101236e-05, + "loss": 3.7796, + "step": 75630 + }, + { + "epoch": 5.138945508900666, + "grad_norm": 0.5230412483215332, + "learning_rate": 3.5789509444217964e-05, + "loss": 3.9431, + "step": 75635 + }, + { + "epoch": 5.139285228971327, + "grad_norm": 0.3933567702770233, + "learning_rate": 3.57852629433347e-05, + "loss": 3.5746, + "step": 75640 + }, + { + "epoch": 5.139624949041989, + "grad_norm": 0.14658166468143463, + "learning_rate": 3.578101644245142e-05, + "loss": 3.7494, + "step": 75645 + }, + { + "epoch": 5.1399646691126515, + "grad_norm": 4.625521183013916, + "learning_rate": 3.577676994156815e-05, + "loss": 4.0023, + "step": 75650 + }, + { + "epoch": 5.140304389183313, + "grad_norm": 0.19661255180835724, + "learning_rate": 3.577252344068488e-05, + "loss": 3.9721, + "step": 75655 + }, + { + "epoch": 5.140644109253975, + "grad_norm": 0.15368515253067017, + "learning_rate": 3.5768276939801604e-05, + "loss": 3.7329, + "step": 75660 + }, + { + "epoch": 5.140983829324637, + "grad_norm": 0.15623313188552856, + "learning_rate": 3.576403043891833e-05, + "loss": 3.9479, + "step": 75665 + }, + { + "epoch": 5.141323549395298, + "grad_norm": 0.16874508559703827, + "learning_rate": 3.575978393803506e-05, + "loss": 3.743, + "step": 75670 + }, + { + "epoch": 5.14166326946596, + "grad_norm": 0.16583041846752167, + "learning_rate": 3.575553743715179e-05, + "loss": 4.0579, + "step": 75675 + }, + { + "epoch": 5.142002989536622, + "grad_norm": 0.1382959634065628, + "learning_rate": 3.5751290936268516e-05, + "loss": 3.7974, + "step": 75680 + }, + { + "epoch": 5.142342709607283, + "grad_norm": 0.15771742165088654, + "learning_rate": 3.5747044435385244e-05, + "loss": 3.7891, + "step": 75685 + }, + { + "epoch": 5.142682429677945, + "grad_norm": 0.15719452500343323, + "learning_rate": 3.574279793450197e-05, + "loss": 3.8551, + "step": 75690 + }, + { + "epoch": 5.1430221497486075, + "grad_norm": 0.1702520251274109, + "learning_rate": 3.57385514336187e-05, + "loss": 3.9089, + "step": 75695 + }, + { + "epoch": 5.143361869819269, + "grad_norm": 0.18656383454799652, + "learning_rate": 3.573430493273543e-05, + "loss": 3.8248, + "step": 75700 + }, + { + "epoch": 5.143701589889931, + "grad_norm": 0.15877750515937805, + "learning_rate": 3.5730058431852156e-05, + "loss": 4.032, + "step": 75705 + }, + { + "epoch": 5.144041309960593, + "grad_norm": 0.19007669389247894, + "learning_rate": 3.5725811930968884e-05, + "loss": 3.9002, + "step": 75710 + }, + { + "epoch": 5.144381030031254, + "grad_norm": 0.17701877653598785, + "learning_rate": 3.572156543008561e-05, + "loss": 3.9283, + "step": 75715 + }, + { + "epoch": 5.144720750101916, + "grad_norm": 0.16616986691951752, + "learning_rate": 3.571731892920234e-05, + "loss": 3.9167, + "step": 75720 + }, + { + "epoch": 5.145060470172578, + "grad_norm": 0.4652247428894043, + "learning_rate": 3.571307242831907e-05, + "loss": 4.0016, + "step": 75725 + }, + { + "epoch": 5.145400190243239, + "grad_norm": 0.3998810648918152, + "learning_rate": 3.5708825927435796e-05, + "loss": 3.9642, + "step": 75730 + }, + { + "epoch": 5.145739910313901, + "grad_norm": 0.19554707407951355, + "learning_rate": 3.570457942655252e-05, + "loss": 3.6871, + "step": 75735 + }, + { + "epoch": 5.1460796303845635, + "grad_norm": 0.34778067469596863, + "learning_rate": 3.570033292566925e-05, + "loss": 3.653, + "step": 75740 + }, + { + "epoch": 5.146419350455225, + "grad_norm": 0.4481565058231354, + "learning_rate": 3.569608642478598e-05, + "loss": 3.8989, + "step": 75745 + }, + { + "epoch": 5.146759070525887, + "grad_norm": 0.16127227246761322, + "learning_rate": 3.56918399239027e-05, + "loss": 4.0235, + "step": 75750 + }, + { + "epoch": 5.147098790596549, + "grad_norm": 0.20076633989810944, + "learning_rate": 3.5687593423019436e-05, + "loss": 3.7433, + "step": 75755 + }, + { + "epoch": 5.14743851066721, + "grad_norm": 0.22601033747196198, + "learning_rate": 3.5683346922136164e-05, + "loss": 3.7235, + "step": 75760 + }, + { + "epoch": 5.147778230737872, + "grad_norm": 0.18119709193706512, + "learning_rate": 3.5679100421252885e-05, + "loss": 4.1762, + "step": 75765 + }, + { + "epoch": 5.148117950808534, + "grad_norm": 0.16480059921741486, + "learning_rate": 3.567485392036961e-05, + "loss": 3.8705, + "step": 75770 + }, + { + "epoch": 5.148457670879195, + "grad_norm": 0.25469374656677246, + "learning_rate": 3.567060741948635e-05, + "loss": 3.854, + "step": 75775 + }, + { + "epoch": 5.1487973909498574, + "grad_norm": 0.1597108393907547, + "learning_rate": 3.566636091860307e-05, + "loss": 3.7485, + "step": 75780 + }, + { + "epoch": 5.1491371110205195, + "grad_norm": 0.2503375709056854, + "learning_rate": 3.56621144177198e-05, + "loss": 3.7522, + "step": 75785 + }, + { + "epoch": 5.149476831091181, + "grad_norm": 0.8692466020584106, + "learning_rate": 3.565786791683653e-05, + "loss": 3.7421, + "step": 75790 + }, + { + "epoch": 5.149816551161843, + "grad_norm": 0.1612158715724945, + "learning_rate": 3.565362141595326e-05, + "loss": 3.9334, + "step": 75795 + }, + { + "epoch": 5.150156271232504, + "grad_norm": 0.16672027111053467, + "learning_rate": 3.564937491506998e-05, + "loss": 3.9771, + "step": 75800 + }, + { + "epoch": 5.150495991303166, + "grad_norm": 0.1995498687028885, + "learning_rate": 3.5645128414186716e-05, + "loss": 3.5647, + "step": 75805 + }, + { + "epoch": 5.150835711373828, + "grad_norm": 0.345832884311676, + "learning_rate": 3.5640881913303444e-05, + "loss": 3.9616, + "step": 75810 + }, + { + "epoch": 5.151175431444489, + "grad_norm": 0.15702450275421143, + "learning_rate": 3.5636635412420165e-05, + "loss": 3.7019, + "step": 75815 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 0.1746472418308258, + "learning_rate": 3.563238891153689e-05, + "loss": 3.9214, + "step": 75820 + }, + { + "epoch": 5.1518548715858135, + "grad_norm": 0.17718957364559174, + "learning_rate": 3.562814241065363e-05, + "loss": 3.735, + "step": 75825 + }, + { + "epoch": 5.152194591656475, + "grad_norm": 0.1846630573272705, + "learning_rate": 3.562389590977035e-05, + "loss": 4.1453, + "step": 75830 + }, + { + "epoch": 5.152534311727137, + "grad_norm": 0.1705903857946396, + "learning_rate": 3.561964940888708e-05, + "loss": 3.9071, + "step": 75835 + }, + { + "epoch": 5.152874031797799, + "grad_norm": 0.24608305096626282, + "learning_rate": 3.561540290800381e-05, + "loss": 3.877, + "step": 75840 + }, + { + "epoch": 5.15321375186846, + "grad_norm": 0.13022121787071228, + "learning_rate": 3.561115640712053e-05, + "loss": 3.6173, + "step": 75845 + }, + { + "epoch": 5.153553471939122, + "grad_norm": 0.16331031918525696, + "learning_rate": 3.560690990623726e-05, + "loss": 4.0117, + "step": 75850 + }, + { + "epoch": 5.153893192009784, + "grad_norm": 0.21629738807678223, + "learning_rate": 3.560266340535399e-05, + "loss": 3.8692, + "step": 75855 + }, + { + "epoch": 5.154232912080445, + "grad_norm": 0.16222237050533295, + "learning_rate": 3.559841690447072e-05, + "loss": 3.9086, + "step": 75860 + }, + { + "epoch": 5.154572632151107, + "grad_norm": 0.6127928495407104, + "learning_rate": 3.5594170403587445e-05, + "loss": 3.9089, + "step": 75865 + }, + { + "epoch": 5.1549123522217695, + "grad_norm": 0.1653042435646057, + "learning_rate": 3.558992390270417e-05, + "loss": 3.6748, + "step": 75870 + }, + { + "epoch": 5.155252072292431, + "grad_norm": 0.18411491811275482, + "learning_rate": 3.55856774018209e-05, + "loss": 3.9052, + "step": 75875 + }, + { + "epoch": 5.155591792363093, + "grad_norm": 0.1837218552827835, + "learning_rate": 3.558143090093763e-05, + "loss": 3.7535, + "step": 75880 + }, + { + "epoch": 5.155931512433755, + "grad_norm": 0.21512247622013092, + "learning_rate": 3.557718440005436e-05, + "loss": 3.8687, + "step": 75885 + }, + { + "epoch": 5.156271232504416, + "grad_norm": 3.1169345378875732, + "learning_rate": 3.5572937899171085e-05, + "loss": 3.7686, + "step": 75890 + }, + { + "epoch": 5.156610952575078, + "grad_norm": 0.18322649598121643, + "learning_rate": 3.556869139828781e-05, + "loss": 3.6982, + "step": 75895 + }, + { + "epoch": 5.15695067264574, + "grad_norm": 0.14953534305095673, + "learning_rate": 3.556444489740454e-05, + "loss": 3.7244, + "step": 75900 + }, + { + "epoch": 5.157290392716401, + "grad_norm": 0.15176056325435638, + "learning_rate": 3.556019839652127e-05, + "loss": 3.6104, + "step": 75905 + }, + { + "epoch": 5.157630112787063, + "grad_norm": 0.1950654238462448, + "learning_rate": 3.5555951895638e-05, + "loss": 3.9991, + "step": 75910 + }, + { + "epoch": 5.1579698328577255, + "grad_norm": 0.16653960943222046, + "learning_rate": 3.5551705394754725e-05, + "loss": 3.3694, + "step": 75915 + }, + { + "epoch": 5.158309552928387, + "grad_norm": 0.1727018505334854, + "learning_rate": 3.5547458893871447e-05, + "loss": 3.8359, + "step": 75920 + }, + { + "epoch": 5.158649272999049, + "grad_norm": 0.1552991271018982, + "learning_rate": 3.554321239298818e-05, + "loss": 3.9134, + "step": 75925 + }, + { + "epoch": 5.158988993069711, + "grad_norm": 0.2534259855747223, + "learning_rate": 3.553896589210491e-05, + "loss": 3.6945, + "step": 75930 + }, + { + "epoch": 5.159328713140372, + "grad_norm": 0.2981986701488495, + "learning_rate": 3.553471939122163e-05, + "loss": 3.9004, + "step": 75935 + }, + { + "epoch": 5.159668433211034, + "grad_norm": 0.2552013695240021, + "learning_rate": 3.5530472890338365e-05, + "loss": 3.86, + "step": 75940 + }, + { + "epoch": 5.160008153281696, + "grad_norm": 0.5942903161048889, + "learning_rate": 3.552622638945509e-05, + "loss": 3.8089, + "step": 75945 + }, + { + "epoch": 5.160347873352357, + "grad_norm": 0.17702311277389526, + "learning_rate": 3.5521979888571815e-05, + "loss": 3.7853, + "step": 75950 + }, + { + "epoch": 5.160687593423019, + "grad_norm": 0.6955286860466003, + "learning_rate": 3.551773338768854e-05, + "loss": 3.5669, + "step": 75955 + }, + { + "epoch": 5.1610273134936815, + "grad_norm": 0.18049651384353638, + "learning_rate": 3.551348688680528e-05, + "loss": 3.9268, + "step": 75960 + }, + { + "epoch": 5.161367033564343, + "grad_norm": 0.21171438694000244, + "learning_rate": 3.5509240385922005e-05, + "loss": 3.9553, + "step": 75965 + }, + { + "epoch": 5.161706753635005, + "grad_norm": 0.16457563638687134, + "learning_rate": 3.550499388503873e-05, + "loss": 3.7466, + "step": 75970 + }, + { + "epoch": 5.162046473705667, + "grad_norm": 0.1857668161392212, + "learning_rate": 3.550074738415546e-05, + "loss": 3.8768, + "step": 75975 + }, + { + "epoch": 5.162386193776328, + "grad_norm": 0.14510829746723175, + "learning_rate": 3.549650088327219e-05, + "loss": 4.0594, + "step": 75980 + }, + { + "epoch": 5.16272591384699, + "grad_norm": 0.15226610004901886, + "learning_rate": 3.549225438238891e-05, + "loss": 3.6552, + "step": 75985 + }, + { + "epoch": 5.163065633917652, + "grad_norm": 0.22878363728523254, + "learning_rate": 3.548800788150564e-05, + "loss": 3.8906, + "step": 75990 + }, + { + "epoch": 5.163405353988313, + "grad_norm": 0.21305251121520996, + "learning_rate": 3.5483761380622373e-05, + "loss": 4.0159, + "step": 75995 + }, + { + "epoch": 5.163745074058975, + "grad_norm": 0.12753081321716309, + "learning_rate": 3.5479514879739095e-05, + "loss": 4.0952, + "step": 76000 + }, + { + "epoch": 5.1640847941296375, + "grad_norm": 0.14023925364017487, + "learning_rate": 3.547526837885582e-05, + "loss": 3.8584, + "step": 76005 + }, + { + "epoch": 5.164424514200299, + "grad_norm": 0.20250611007213593, + "learning_rate": 3.547102187797256e-05, + "loss": 3.7385, + "step": 76010 + }, + { + "epoch": 5.164764234270961, + "grad_norm": 0.15044714510440826, + "learning_rate": 3.546677537708928e-05, + "loss": 3.8083, + "step": 76015 + }, + { + "epoch": 5.165103954341623, + "grad_norm": 0.19987404346466064, + "learning_rate": 3.546252887620601e-05, + "loss": 3.9027, + "step": 76020 + }, + { + "epoch": 5.165443674412284, + "grad_norm": 0.1803848296403885, + "learning_rate": 3.5458282375322735e-05, + "loss": 3.7286, + "step": 76025 + }, + { + "epoch": 5.165783394482946, + "grad_norm": 0.7138747572898865, + "learning_rate": 3.545403587443946e-05, + "loss": 3.6518, + "step": 76030 + }, + { + "epoch": 5.166123114553608, + "grad_norm": 0.2961859107017517, + "learning_rate": 3.544978937355619e-05, + "loss": 3.9942, + "step": 76035 + }, + { + "epoch": 5.166462834624269, + "grad_norm": 0.1976793259382248, + "learning_rate": 3.544554287267292e-05, + "loss": 3.8551, + "step": 76040 + }, + { + "epoch": 5.166802554694931, + "grad_norm": 0.1686517596244812, + "learning_rate": 3.544129637178965e-05, + "loss": 3.834, + "step": 76045 + }, + { + "epoch": 5.1671422747655935, + "grad_norm": 0.33329206705093384, + "learning_rate": 3.5437049870906375e-05, + "loss": 3.907, + "step": 76050 + }, + { + "epoch": 5.167481994836255, + "grad_norm": 0.19259528815746307, + "learning_rate": 3.54328033700231e-05, + "loss": 3.8169, + "step": 76055 + }, + { + "epoch": 5.167821714906917, + "grad_norm": 0.1620955616235733, + "learning_rate": 3.542855686913983e-05, + "loss": 3.6722, + "step": 76060 + }, + { + "epoch": 5.168161434977579, + "grad_norm": 0.12867248058319092, + "learning_rate": 3.542431036825656e-05, + "loss": 3.9684, + "step": 76065 + }, + { + "epoch": 5.16850115504824, + "grad_norm": 0.1478930115699768, + "learning_rate": 3.542006386737329e-05, + "loss": 4.1724, + "step": 76070 + }, + { + "epoch": 5.168840875118902, + "grad_norm": 0.20157282054424286, + "learning_rate": 3.5415817366490015e-05, + "loss": 3.6968, + "step": 76075 + }, + { + "epoch": 5.169180595189564, + "grad_norm": 0.1982841193675995, + "learning_rate": 3.541157086560674e-05, + "loss": 3.8347, + "step": 76080 + }, + { + "epoch": 5.169520315260225, + "grad_norm": 0.22110243141651154, + "learning_rate": 3.540732436472347e-05, + "loss": 3.9592, + "step": 76085 + }, + { + "epoch": 5.1698600353308874, + "grad_norm": 0.17452390491962433, + "learning_rate": 3.540307786384019e-05, + "loss": 3.9892, + "step": 76090 + }, + { + "epoch": 5.1701997554015495, + "grad_norm": 0.1775202602148056, + "learning_rate": 3.539883136295693e-05, + "loss": 3.9118, + "step": 76095 + }, + { + "epoch": 5.170539475472211, + "grad_norm": 0.17763695120811462, + "learning_rate": 3.5394584862073655e-05, + "loss": 3.8285, + "step": 76100 + }, + { + "epoch": 5.170879195542873, + "grad_norm": 0.1635710597038269, + "learning_rate": 3.5390338361190376e-05, + "loss": 3.7265, + "step": 76105 + }, + { + "epoch": 5.171218915613535, + "grad_norm": 0.18438829481601715, + "learning_rate": 3.538609186030711e-05, + "loss": 3.7406, + "step": 76110 + }, + { + "epoch": 5.171558635684196, + "grad_norm": 0.17665264010429382, + "learning_rate": 3.538184535942384e-05, + "loss": 4.0038, + "step": 76115 + }, + { + "epoch": 5.171898355754858, + "grad_norm": 0.18124179542064667, + "learning_rate": 3.537759885854056e-05, + "loss": 3.9211, + "step": 76120 + }, + { + "epoch": 5.17223807582552, + "grad_norm": 0.16025550663471222, + "learning_rate": 3.537335235765729e-05, + "loss": 3.5924, + "step": 76125 + }, + { + "epoch": 5.172577795896181, + "grad_norm": 0.2358534336090088, + "learning_rate": 3.536910585677402e-05, + "loss": 3.9805, + "step": 76130 + }, + { + "epoch": 5.1729175159668435, + "grad_norm": 0.20470421016216278, + "learning_rate": 3.536485935589075e-05, + "loss": 3.9763, + "step": 76135 + }, + { + "epoch": 5.1732572360375055, + "grad_norm": 0.22414207458496094, + "learning_rate": 3.536061285500747e-05, + "loss": 3.6718, + "step": 76140 + }, + { + "epoch": 5.173596956108167, + "grad_norm": 0.16123934090137482, + "learning_rate": 3.535636635412421e-05, + "loss": 3.7822, + "step": 76145 + }, + { + "epoch": 5.173936676178829, + "grad_norm": 0.16203604638576508, + "learning_rate": 3.5352119853240935e-05, + "loss": 3.7385, + "step": 76150 + }, + { + "epoch": 5.174276396249491, + "grad_norm": 0.18963126838207245, + "learning_rate": 3.5347873352357656e-05, + "loss": 3.9447, + "step": 76155 + }, + { + "epoch": 5.174616116320152, + "grad_norm": 0.18642771244049072, + "learning_rate": 3.5343626851474384e-05, + "loss": 3.9553, + "step": 76160 + }, + { + "epoch": 5.174955836390814, + "grad_norm": 0.3460775315761566, + "learning_rate": 3.533938035059112e-05, + "loss": 3.9063, + "step": 76165 + }, + { + "epoch": 5.175295556461475, + "grad_norm": 0.2183711975812912, + "learning_rate": 3.533513384970784e-05, + "loss": 3.5215, + "step": 76170 + }, + { + "epoch": 5.175635276532137, + "grad_norm": 0.22713828086853027, + "learning_rate": 3.533088734882457e-05, + "loss": 4.0034, + "step": 76175 + }, + { + "epoch": 5.1759749966027995, + "grad_norm": 0.1610482782125473, + "learning_rate": 3.53266408479413e-05, + "loss": 4.0138, + "step": 76180 + }, + { + "epoch": 5.176314716673461, + "grad_norm": 0.16606508195400238, + "learning_rate": 3.5322394347058024e-05, + "loss": 3.9517, + "step": 76185 + }, + { + "epoch": 5.176654436744123, + "grad_norm": 0.19859014451503754, + "learning_rate": 3.531814784617475e-05, + "loss": 3.7744, + "step": 76190 + }, + { + "epoch": 5.176994156814785, + "grad_norm": 0.15870003402233124, + "learning_rate": 3.531390134529148e-05, + "loss": 4.024, + "step": 76195 + }, + { + "epoch": 5.177333876885446, + "grad_norm": 0.19019979238510132, + "learning_rate": 3.530965484440821e-05, + "loss": 4.0958, + "step": 76200 + }, + { + "epoch": 5.177673596956108, + "grad_norm": 0.2077631950378418, + "learning_rate": 3.5305408343524936e-05, + "loss": 3.9039, + "step": 76205 + }, + { + "epoch": 5.17801331702677, + "grad_norm": 0.16710861027240753, + "learning_rate": 3.5301161842641664e-05, + "loss": 3.8579, + "step": 76210 + }, + { + "epoch": 5.178353037097431, + "grad_norm": 0.9456725120544434, + "learning_rate": 3.529691534175839e-05, + "loss": 3.9064, + "step": 76215 + }, + { + "epoch": 5.178692757168093, + "grad_norm": 0.1510010063648224, + "learning_rate": 3.529266884087512e-05, + "loss": 4.2181, + "step": 76220 + }, + { + "epoch": 5.1790324772387555, + "grad_norm": 0.14579258859157562, + "learning_rate": 3.528842233999185e-05, + "loss": 4.1642, + "step": 76225 + }, + { + "epoch": 5.179372197309417, + "grad_norm": 0.2890262007713318, + "learning_rate": 3.5284175839108576e-05, + "loss": 3.7855, + "step": 76230 + }, + { + "epoch": 5.179711917380079, + "grad_norm": 0.1696670800447464, + "learning_rate": 3.5279929338225304e-05, + "loss": 3.6711, + "step": 76235 + }, + { + "epoch": 5.180051637450741, + "grad_norm": 0.3154539167881012, + "learning_rate": 3.527568283734203e-05, + "loss": 3.8183, + "step": 76240 + }, + { + "epoch": 5.180391357521402, + "grad_norm": 0.14716582000255585, + "learning_rate": 3.527143633645876e-05, + "loss": 3.9075, + "step": 76245 + }, + { + "epoch": 5.180731077592064, + "grad_norm": 0.15156897902488708, + "learning_rate": 3.526718983557549e-05, + "loss": 3.8922, + "step": 76250 + }, + { + "epoch": 5.181070797662726, + "grad_norm": 0.1918926239013672, + "learning_rate": 3.5262943334692216e-05, + "loss": 3.7849, + "step": 76255 + }, + { + "epoch": 5.181410517733387, + "grad_norm": 0.15952162444591522, + "learning_rate": 3.525869683380894e-05, + "loss": 3.794, + "step": 76260 + }, + { + "epoch": 5.181750237804049, + "grad_norm": 0.20428678393363953, + "learning_rate": 3.525445033292567e-05, + "loss": 3.9437, + "step": 76265 + }, + { + "epoch": 5.1820899578747115, + "grad_norm": 0.20373961329460144, + "learning_rate": 3.52502038320424e-05, + "loss": 3.7662, + "step": 76270 + }, + { + "epoch": 5.182429677945373, + "grad_norm": 0.16718947887420654, + "learning_rate": 3.524595733115912e-05, + "loss": 3.9025, + "step": 76275 + }, + { + "epoch": 5.182769398016035, + "grad_norm": 1.0705702304840088, + "learning_rate": 3.5241710830275856e-05, + "loss": 3.7769, + "step": 76280 + }, + { + "epoch": 5.183109118086697, + "grad_norm": 0.20315049588680267, + "learning_rate": 3.5237464329392584e-05, + "loss": 3.7912, + "step": 76285 + }, + { + "epoch": 5.183448838157358, + "grad_norm": 0.1313156932592392, + "learning_rate": 3.5233217828509306e-05, + "loss": 4.0076, + "step": 76290 + }, + { + "epoch": 5.18378855822802, + "grad_norm": 0.18024377524852753, + "learning_rate": 3.5228971327626034e-05, + "loss": 3.5391, + "step": 76295 + }, + { + "epoch": 5.184128278298682, + "grad_norm": 0.27838951349258423, + "learning_rate": 3.522472482674277e-05, + "loss": 3.7497, + "step": 76300 + }, + { + "epoch": 5.184467998369343, + "grad_norm": 0.432295024394989, + "learning_rate": 3.5220478325859496e-05, + "loss": 4.0469, + "step": 76305 + }, + { + "epoch": 5.184807718440005, + "grad_norm": 0.7719891667366028, + "learning_rate": 3.521623182497622e-05, + "loss": 3.8678, + "step": 76310 + }, + { + "epoch": 5.1851474385106675, + "grad_norm": 0.17177189886569977, + "learning_rate": 3.521198532409295e-05, + "loss": 3.8449, + "step": 76315 + }, + { + "epoch": 5.185487158581329, + "grad_norm": 0.19275976717472076, + "learning_rate": 3.520773882320968e-05, + "loss": 3.9138, + "step": 76320 + }, + { + "epoch": 5.185826878651991, + "grad_norm": 0.1928538680076599, + "learning_rate": 3.52034923223264e-05, + "loss": 3.799, + "step": 76325 + }, + { + "epoch": 5.186166598722653, + "grad_norm": 0.19716942310333252, + "learning_rate": 3.5199245821443136e-05, + "loss": 3.8529, + "step": 76330 + }, + { + "epoch": 5.186506318793314, + "grad_norm": 0.14552508294582367, + "learning_rate": 3.5194999320559864e-05, + "loss": 3.8309, + "step": 76335 + }, + { + "epoch": 5.186846038863976, + "grad_norm": 0.18480795621871948, + "learning_rate": 3.5190752819676586e-05, + "loss": 4.0994, + "step": 76340 + }, + { + "epoch": 5.187185758934638, + "grad_norm": 0.16760815680027008, + "learning_rate": 3.5186506318793314e-05, + "loss": 3.8078, + "step": 76345 + }, + { + "epoch": 5.187525479005299, + "grad_norm": 0.16774241626262665, + "learning_rate": 3.518225981791005e-05, + "loss": 3.8591, + "step": 76350 + }, + { + "epoch": 5.187865199075961, + "grad_norm": 0.1981646716594696, + "learning_rate": 3.517801331702677e-05, + "loss": 4.027, + "step": 76355 + }, + { + "epoch": 5.1882049191466235, + "grad_norm": 0.20961284637451172, + "learning_rate": 3.51737668161435e-05, + "loss": 3.7142, + "step": 76360 + }, + { + "epoch": 5.188544639217285, + "grad_norm": 0.15293070673942566, + "learning_rate": 3.516952031526023e-05, + "loss": 3.8434, + "step": 76365 + }, + { + "epoch": 5.188884359287947, + "grad_norm": 0.7590661644935608, + "learning_rate": 3.5165273814376954e-05, + "loss": 4.0231, + "step": 76370 + }, + { + "epoch": 5.189224079358609, + "grad_norm": 0.15990963578224182, + "learning_rate": 3.516102731349368e-05, + "loss": 3.7562, + "step": 76375 + }, + { + "epoch": 5.18956379942927, + "grad_norm": 0.15794247388839722, + "learning_rate": 3.515678081261041e-05, + "loss": 3.5446, + "step": 76380 + }, + { + "epoch": 5.189903519499932, + "grad_norm": 0.16357429325580597, + "learning_rate": 3.515253431172714e-05, + "loss": 3.9036, + "step": 76385 + }, + { + "epoch": 5.190243239570594, + "grad_norm": 0.19575189054012299, + "learning_rate": 3.5148287810843866e-05, + "loss": 3.8928, + "step": 76390 + }, + { + "epoch": 5.190582959641255, + "grad_norm": 0.19120360910892487, + "learning_rate": 3.5144041309960594e-05, + "loss": 4.1406, + "step": 76395 + }, + { + "epoch": 5.1909226797119175, + "grad_norm": 0.1742716282606125, + "learning_rate": 3.513979480907732e-05, + "loss": 3.8019, + "step": 76400 + }, + { + "epoch": 5.1912623997825795, + "grad_norm": 0.15355615317821503, + "learning_rate": 3.513554830819405e-05, + "loss": 3.9528, + "step": 76405 + }, + { + "epoch": 5.191602119853241, + "grad_norm": 0.1613611876964569, + "learning_rate": 3.513130180731078e-05, + "loss": 4.0725, + "step": 76410 + }, + { + "epoch": 5.191941839923903, + "grad_norm": 0.15472346544265747, + "learning_rate": 3.5127055306427506e-05, + "loss": 3.5247, + "step": 76415 + }, + { + "epoch": 5.192281559994565, + "grad_norm": 0.17555692791938782, + "learning_rate": 3.5122808805544234e-05, + "loss": 3.5858, + "step": 76420 + }, + { + "epoch": 5.192621280065226, + "grad_norm": 0.13358406722545624, + "learning_rate": 3.511856230466096e-05, + "loss": 4.0893, + "step": 76425 + }, + { + "epoch": 5.192961000135888, + "grad_norm": 0.15859119594097137, + "learning_rate": 3.511431580377769e-05, + "loss": 3.6903, + "step": 76430 + }, + { + "epoch": 5.19330072020655, + "grad_norm": 0.14932358264923096, + "learning_rate": 3.511006930289442e-05, + "loss": 3.7687, + "step": 76435 + }, + { + "epoch": 5.193640440277211, + "grad_norm": 0.15768598020076752, + "learning_rate": 3.5105822802011146e-05, + "loss": 3.7497, + "step": 76440 + }, + { + "epoch": 5.1939801603478735, + "grad_norm": 0.1749202162027359, + "learning_rate": 3.510157630112787e-05, + "loss": 4.0626, + "step": 76445 + }, + { + "epoch": 5.1943198804185355, + "grad_norm": 0.14248928427696228, + "learning_rate": 3.50973298002446e-05, + "loss": 4.0162, + "step": 76450 + }, + { + "epoch": 5.194659600489197, + "grad_norm": 0.17038555443286896, + "learning_rate": 3.509308329936133e-05, + "loss": 3.8052, + "step": 76455 + }, + { + "epoch": 5.194999320559859, + "grad_norm": 0.16140158474445343, + "learning_rate": 3.508883679847805e-05, + "loss": 3.9671, + "step": 76460 + }, + { + "epoch": 5.195339040630521, + "grad_norm": 0.16233456134796143, + "learning_rate": 3.5084590297594786e-05, + "loss": 3.8886, + "step": 76465 + }, + { + "epoch": 5.195678760701182, + "grad_norm": 0.15207916498184204, + "learning_rate": 3.5080343796711514e-05, + "loss": 3.9367, + "step": 76470 + }, + { + "epoch": 5.196018480771844, + "grad_norm": 0.20620837807655334, + "learning_rate": 3.507609729582824e-05, + "loss": 3.8404, + "step": 76475 + }, + { + "epoch": 5.196358200842505, + "grad_norm": 0.15070591866970062, + "learning_rate": 3.507185079494496e-05, + "loss": 3.811, + "step": 76480 + }, + { + "epoch": 5.196697920913167, + "grad_norm": 0.20676717162132263, + "learning_rate": 3.50676042940617e-05, + "loss": 3.9232, + "step": 76485 + }, + { + "epoch": 5.1970376409838295, + "grad_norm": 0.1634403020143509, + "learning_rate": 3.5063357793178426e-05, + "loss": 3.9567, + "step": 76490 + }, + { + "epoch": 5.197377361054491, + "grad_norm": 0.24257057905197144, + "learning_rate": 3.505911129229515e-05, + "loss": 3.7632, + "step": 76495 + }, + { + "epoch": 5.197717081125153, + "grad_norm": 0.20982994139194489, + "learning_rate": 3.505486479141188e-05, + "loss": 3.7488, + "step": 76500 + }, + { + "epoch": 5.198056801195815, + "grad_norm": 0.23186399042606354, + "learning_rate": 3.505061829052861e-05, + "loss": 3.7651, + "step": 76505 + }, + { + "epoch": 5.198396521266476, + "grad_norm": 0.17649096250534058, + "learning_rate": 3.504637178964533e-05, + "loss": 3.8626, + "step": 76510 + }, + { + "epoch": 5.198736241337138, + "grad_norm": 0.22847354412078857, + "learning_rate": 3.504212528876206e-05, + "loss": 4.0466, + "step": 76515 + }, + { + "epoch": 5.1990759614078, + "grad_norm": 0.15405665338039398, + "learning_rate": 3.5037878787878794e-05, + "loss": 3.6864, + "step": 76520 + }, + { + "epoch": 5.199415681478461, + "grad_norm": 0.23805993795394897, + "learning_rate": 3.5033632286995515e-05, + "loss": 3.8332, + "step": 76525 + }, + { + "epoch": 5.199755401549123, + "grad_norm": 0.1816500574350357, + "learning_rate": 3.502938578611224e-05, + "loss": 3.8672, + "step": 76530 + }, + { + "epoch": 5.2000951216197855, + "grad_norm": 0.23680847883224487, + "learning_rate": 3.502513928522898e-05, + "loss": 4.0473, + "step": 76535 + }, + { + "epoch": 5.200434841690447, + "grad_norm": 0.18125393986701965, + "learning_rate": 3.50208927843457e-05, + "loss": 3.9471, + "step": 76540 + }, + { + "epoch": 5.200774561761109, + "grad_norm": 0.18419873714447021, + "learning_rate": 3.501664628346243e-05, + "loss": 3.8784, + "step": 76545 + }, + { + "epoch": 5.201114281831771, + "grad_norm": 0.24623821675777435, + "learning_rate": 3.5012399782579155e-05, + "loss": 3.9093, + "step": 76550 + }, + { + "epoch": 5.201454001902432, + "grad_norm": 0.13685406744480133, + "learning_rate": 3.500815328169588e-05, + "loss": 3.7799, + "step": 76555 + }, + { + "epoch": 5.201793721973094, + "grad_norm": 0.17210082709789276, + "learning_rate": 3.500390678081261e-05, + "loss": 3.8366, + "step": 76560 + }, + { + "epoch": 5.202133442043756, + "grad_norm": 0.18637488782405853, + "learning_rate": 3.499966027992934e-05, + "loss": 3.8292, + "step": 76565 + }, + { + "epoch": 5.202473162114417, + "grad_norm": 0.2552926540374756, + "learning_rate": 3.499541377904607e-05, + "loss": 3.7671, + "step": 76570 + }, + { + "epoch": 5.202812882185079, + "grad_norm": 0.17315618693828583, + "learning_rate": 3.4991167278162795e-05, + "loss": 3.8593, + "step": 76575 + }, + { + "epoch": 5.2031526022557415, + "grad_norm": 0.15687893331050873, + "learning_rate": 3.498692077727952e-05, + "loss": 3.9806, + "step": 76580 + }, + { + "epoch": 5.203492322326403, + "grad_norm": 0.1412847340106964, + "learning_rate": 3.498267427639625e-05, + "loss": 3.6729, + "step": 76585 + }, + { + "epoch": 5.203832042397065, + "grad_norm": 0.2469664067029953, + "learning_rate": 3.497842777551298e-05, + "loss": 3.7918, + "step": 76590 + }, + { + "epoch": 5.204171762467727, + "grad_norm": 0.16428865492343903, + "learning_rate": 3.497418127462971e-05, + "loss": 3.7651, + "step": 76595 + }, + { + "epoch": 5.204511482538388, + "grad_norm": 0.15114112198352814, + "learning_rate": 3.4969934773746435e-05, + "loss": 3.9307, + "step": 76600 + }, + { + "epoch": 5.20485120260905, + "grad_norm": 0.2671988606452942, + "learning_rate": 3.496568827286316e-05, + "loss": 3.5829, + "step": 76605 + }, + { + "epoch": 5.205190922679712, + "grad_norm": 0.1641000658273697, + "learning_rate": 3.496144177197989e-05, + "loss": 3.7317, + "step": 76610 + }, + { + "epoch": 5.205530642750373, + "grad_norm": 0.1808282732963562, + "learning_rate": 3.495719527109661e-05, + "loss": 3.8746, + "step": 76615 + }, + { + "epoch": 5.205870362821035, + "grad_norm": 0.15032050013542175, + "learning_rate": 3.495294877021335e-05, + "loss": 3.8845, + "step": 76620 + }, + { + "epoch": 5.2062100828916975, + "grad_norm": 0.19923387467861176, + "learning_rate": 3.4948702269330075e-05, + "loss": 4.0856, + "step": 76625 + }, + { + "epoch": 5.206549802962359, + "grad_norm": 0.1863435059785843, + "learning_rate": 3.4944455768446797e-05, + "loss": 3.7567, + "step": 76630 + }, + { + "epoch": 5.206889523033021, + "grad_norm": 0.20794802904129028, + "learning_rate": 3.494020926756353e-05, + "loss": 4.0172, + "step": 76635 + }, + { + "epoch": 5.207229243103683, + "grad_norm": 0.15499462187290192, + "learning_rate": 3.493596276668026e-05, + "loss": 3.8502, + "step": 76640 + }, + { + "epoch": 5.207568963174344, + "grad_norm": 0.19164419174194336, + "learning_rate": 3.493171626579699e-05, + "loss": 3.605, + "step": 76645 + }, + { + "epoch": 5.207908683245006, + "grad_norm": 0.3698290288448334, + "learning_rate": 3.492746976491371e-05, + "loss": 3.7548, + "step": 76650 + }, + { + "epoch": 5.208248403315668, + "grad_norm": 0.6963704824447632, + "learning_rate": 3.492322326403044e-05, + "loss": 3.9747, + "step": 76655 + }, + { + "epoch": 5.208588123386329, + "grad_norm": 0.16199728846549988, + "learning_rate": 3.491897676314717e-05, + "loss": 3.7522, + "step": 76660 + }, + { + "epoch": 5.2089278434569914, + "grad_norm": 0.16147984564304352, + "learning_rate": 3.491473026226389e-05, + "loss": 3.8681, + "step": 76665 + }, + { + "epoch": 5.2092675635276535, + "grad_norm": 0.16456644237041473, + "learning_rate": 3.491048376138063e-05, + "loss": 3.6981, + "step": 76670 + }, + { + "epoch": 5.209607283598315, + "grad_norm": 0.1881704032421112, + "learning_rate": 3.4906237260497355e-05, + "loss": 4.0004, + "step": 76675 + }, + { + "epoch": 5.209947003668977, + "grad_norm": 0.14231747388839722, + "learning_rate": 3.490199075961408e-05, + "loss": 3.7552, + "step": 76680 + }, + { + "epoch": 5.210286723739639, + "grad_norm": 0.1840914785861969, + "learning_rate": 3.4897744258730805e-05, + "loss": 3.6933, + "step": 76685 + }, + { + "epoch": 5.2106264438103, + "grad_norm": 0.15890036523342133, + "learning_rate": 3.489349775784754e-05, + "loss": 3.6735, + "step": 76690 + }, + { + "epoch": 5.210966163880962, + "grad_norm": 0.16237032413482666, + "learning_rate": 3.488925125696426e-05, + "loss": 3.8299, + "step": 76695 + }, + { + "epoch": 5.211305883951624, + "grad_norm": 0.1388593167066574, + "learning_rate": 3.488500475608099e-05, + "loss": 3.8306, + "step": 76700 + }, + { + "epoch": 5.211645604022285, + "grad_norm": 0.17997956275939941, + "learning_rate": 3.4880758255197723e-05, + "loss": 3.7024, + "step": 76705 + }, + { + "epoch": 5.2119853240929475, + "grad_norm": 0.21751919388771057, + "learning_rate": 3.4876511754314445e-05, + "loss": 3.8984, + "step": 76710 + }, + { + "epoch": 5.2123250441636095, + "grad_norm": 0.2124270498752594, + "learning_rate": 3.487226525343117e-05, + "loss": 3.7131, + "step": 76715 + }, + { + "epoch": 5.212664764234271, + "grad_norm": 0.1656089723110199, + "learning_rate": 3.48680187525479e-05, + "loss": 4.0081, + "step": 76720 + }, + { + "epoch": 5.213004484304933, + "grad_norm": 0.17345252633094788, + "learning_rate": 3.486377225166463e-05, + "loss": 3.8219, + "step": 76725 + }, + { + "epoch": 5.213344204375595, + "grad_norm": 0.2037106454372406, + "learning_rate": 3.485952575078136e-05, + "loss": 3.8166, + "step": 76730 + }, + { + "epoch": 5.213683924446256, + "grad_norm": 0.16658326983451843, + "learning_rate": 3.4855279249898085e-05, + "loss": 3.9372, + "step": 76735 + }, + { + "epoch": 5.214023644516918, + "grad_norm": 0.2612440586090088, + "learning_rate": 3.485103274901481e-05, + "loss": 3.9185, + "step": 76740 + }, + { + "epoch": 5.21436336458758, + "grad_norm": 0.18783611059188843, + "learning_rate": 3.484678624813154e-05, + "loss": 3.8039, + "step": 76745 + }, + { + "epoch": 5.214703084658241, + "grad_norm": 0.2616298496723175, + "learning_rate": 3.484253974724827e-05, + "loss": 3.6503, + "step": 76750 + }, + { + "epoch": 5.2150428047289035, + "grad_norm": 0.23000392317771912, + "learning_rate": 3.4838293246365e-05, + "loss": 3.7359, + "step": 76755 + }, + { + "epoch": 5.2153825247995655, + "grad_norm": 0.2626405954360962, + "learning_rate": 3.4834046745481725e-05, + "loss": 3.7828, + "step": 76760 + }, + { + "epoch": 5.215722244870227, + "grad_norm": 0.22675982117652893, + "learning_rate": 3.482980024459845e-05, + "loss": 3.7176, + "step": 76765 + }, + { + "epoch": 5.216061964940889, + "grad_norm": 0.427351176738739, + "learning_rate": 3.482555374371518e-05, + "loss": 3.7069, + "step": 76770 + }, + { + "epoch": 5.216401685011551, + "grad_norm": 0.1693708449602127, + "learning_rate": 3.482130724283191e-05, + "loss": 3.8803, + "step": 76775 + }, + { + "epoch": 5.216741405082212, + "grad_norm": 0.2071123570203781, + "learning_rate": 3.481706074194864e-05, + "loss": 3.7373, + "step": 76780 + }, + { + "epoch": 5.217081125152874, + "grad_norm": 0.1825796216726303, + "learning_rate": 3.481281424106536e-05, + "loss": 3.9931, + "step": 76785 + }, + { + "epoch": 5.217420845223536, + "grad_norm": 0.18623118102550507, + "learning_rate": 3.480856774018209e-05, + "loss": 4.0694, + "step": 76790 + }, + { + "epoch": 5.217760565294197, + "grad_norm": 0.20325864851474762, + "learning_rate": 3.480432123929882e-05, + "loss": 3.6248, + "step": 76795 + }, + { + "epoch": 5.2181002853648595, + "grad_norm": 0.192596435546875, + "learning_rate": 3.480007473841554e-05, + "loss": 4.0295, + "step": 76800 + }, + { + "epoch": 5.2184400054355216, + "grad_norm": 0.13355214893817902, + "learning_rate": 3.479582823753228e-05, + "loss": 3.9757, + "step": 76805 + }, + { + "epoch": 5.218779725506183, + "grad_norm": 0.15469057857990265, + "learning_rate": 3.4791581736649005e-05, + "loss": 3.7499, + "step": 76810 + }, + { + "epoch": 5.219119445576845, + "grad_norm": 0.1507500261068344, + "learning_rate": 3.478733523576573e-05, + "loss": 3.7892, + "step": 76815 + }, + { + "epoch": 5.219459165647507, + "grad_norm": 0.21054142713546753, + "learning_rate": 3.4783088734882454e-05, + "loss": 3.7183, + "step": 76820 + }, + { + "epoch": 5.219798885718168, + "grad_norm": 2.3249850273132324, + "learning_rate": 3.477884223399919e-05, + "loss": 3.8556, + "step": 76825 + }, + { + "epoch": 5.22013860578883, + "grad_norm": 0.1758790761232376, + "learning_rate": 3.477459573311592e-05, + "loss": 3.7208, + "step": 76830 + }, + { + "epoch": 5.220478325859492, + "grad_norm": 0.14724154770374298, + "learning_rate": 3.477034923223264e-05, + "loss": 3.5191, + "step": 76835 + }, + { + "epoch": 5.220818045930153, + "grad_norm": 0.6302258968353271, + "learning_rate": 3.476610273134937e-05, + "loss": 3.8027, + "step": 76840 + }, + { + "epoch": 5.2211577660008155, + "grad_norm": 0.14122697710990906, + "learning_rate": 3.47618562304661e-05, + "loss": 3.627, + "step": 76845 + }, + { + "epoch": 5.221497486071477, + "grad_norm": 0.18288594484329224, + "learning_rate": 3.475760972958282e-05, + "loss": 3.9246, + "step": 76850 + }, + { + "epoch": 5.221837206142139, + "grad_norm": 0.1557043343782425, + "learning_rate": 3.475336322869956e-05, + "loss": 3.7047, + "step": 76855 + }, + { + "epoch": 5.222176926212801, + "grad_norm": 0.18393880128860474, + "learning_rate": 3.4749116727816285e-05, + "loss": 3.9851, + "step": 76860 + }, + { + "epoch": 5.222516646283462, + "grad_norm": 0.220116525888443, + "learning_rate": 3.4744870226933006e-05, + "loss": 3.8369, + "step": 76865 + }, + { + "epoch": 5.222856366354124, + "grad_norm": 0.17638717591762543, + "learning_rate": 3.4740623726049734e-05, + "loss": 3.9272, + "step": 76870 + }, + { + "epoch": 5.223196086424786, + "grad_norm": 0.18294048309326172, + "learning_rate": 3.473637722516647e-05, + "loss": 3.8572, + "step": 76875 + }, + { + "epoch": 5.223535806495447, + "grad_norm": 0.1948380321264267, + "learning_rate": 3.473213072428319e-05, + "loss": 3.7046, + "step": 76880 + }, + { + "epoch": 5.223875526566109, + "grad_norm": 0.2044287472963333, + "learning_rate": 3.472788422339992e-05, + "loss": 3.7215, + "step": 76885 + }, + { + "epoch": 5.2242152466367715, + "grad_norm": 0.1330912858247757, + "learning_rate": 3.472363772251665e-05, + "loss": 3.929, + "step": 76890 + }, + { + "epoch": 5.224554966707433, + "grad_norm": 0.1944095641374588, + "learning_rate": 3.4719391221633374e-05, + "loss": 3.5433, + "step": 76895 + }, + { + "epoch": 5.224894686778095, + "grad_norm": 0.468144953250885, + "learning_rate": 3.47151447207501e-05, + "loss": 3.9567, + "step": 76900 + }, + { + "epoch": 5.225234406848757, + "grad_norm": 0.17860011756420135, + "learning_rate": 3.471089821986683e-05, + "loss": 4.0699, + "step": 76905 + }, + { + "epoch": 5.225574126919418, + "grad_norm": 0.28115054965019226, + "learning_rate": 3.470665171898356e-05, + "loss": 3.8032, + "step": 76910 + }, + { + "epoch": 5.22591384699008, + "grad_norm": 0.1724470853805542, + "learning_rate": 3.4702405218100286e-05, + "loss": 3.9823, + "step": 76915 + }, + { + "epoch": 5.226253567060742, + "grad_norm": 0.18758787214756012, + "learning_rate": 3.4698158717217014e-05, + "loss": 3.8374, + "step": 76920 + }, + { + "epoch": 5.226593287131403, + "grad_norm": 0.16332359611988068, + "learning_rate": 3.469391221633374e-05, + "loss": 3.9599, + "step": 76925 + }, + { + "epoch": 5.226933007202065, + "grad_norm": 0.13952995836734772, + "learning_rate": 3.468966571545047e-05, + "loss": 3.9219, + "step": 76930 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 0.200929194688797, + "learning_rate": 3.46854192145672e-05, + "loss": 3.6138, + "step": 76935 + }, + { + "epoch": 5.227612447343389, + "grad_norm": 0.1401822417974472, + "learning_rate": 3.4681172713683926e-05, + "loss": 3.692, + "step": 76940 + }, + { + "epoch": 5.227952167414051, + "grad_norm": 0.1576835811138153, + "learning_rate": 3.4676926212800654e-05, + "loss": 3.8127, + "step": 76945 + }, + { + "epoch": 5.228291887484713, + "grad_norm": 0.1626899540424347, + "learning_rate": 3.467267971191738e-05, + "loss": 3.805, + "step": 76950 + }, + { + "epoch": 5.228631607555374, + "grad_norm": 0.15556840598583221, + "learning_rate": 3.466843321103411e-05, + "loss": 3.8823, + "step": 76955 + }, + { + "epoch": 5.228971327626036, + "grad_norm": 0.1953974962234497, + "learning_rate": 3.466418671015084e-05, + "loss": 3.8521, + "step": 76960 + }, + { + "epoch": 5.229311047696698, + "grad_norm": 0.1725507229566574, + "learning_rate": 3.4659940209267566e-05, + "loss": 3.8318, + "step": 76965 + }, + { + "epoch": 5.229650767767359, + "grad_norm": 0.17979075014591217, + "learning_rate": 3.465569370838429e-05, + "loss": 3.8784, + "step": 76970 + }, + { + "epoch": 5.2299904878380215, + "grad_norm": 0.16544340550899506, + "learning_rate": 3.465144720750102e-05, + "loss": 4.0953, + "step": 76975 + }, + { + "epoch": 5.2303302079086835, + "grad_norm": 0.15788163244724274, + "learning_rate": 3.464720070661775e-05, + "loss": 3.876, + "step": 76980 + }, + { + "epoch": 5.230669927979345, + "grad_norm": 0.16534511744976044, + "learning_rate": 3.464295420573448e-05, + "loss": 3.8278, + "step": 76985 + }, + { + "epoch": 5.231009648050007, + "grad_norm": 0.16812217235565186, + "learning_rate": 3.4638707704851206e-05, + "loss": 3.8063, + "step": 76990 + }, + { + "epoch": 5.231349368120669, + "grad_norm": 0.15627816319465637, + "learning_rate": 3.4634461203967934e-05, + "loss": 3.893, + "step": 76995 + }, + { + "epoch": 5.23168908819133, + "grad_norm": 0.16236557066440582, + "learning_rate": 3.463021470308466e-05, + "loss": 3.7735, + "step": 77000 + }, + { + "epoch": 5.232028808261992, + "grad_norm": 0.2617053687572479, + "learning_rate": 3.4625968202201384e-05, + "loss": 3.8074, + "step": 77005 + }, + { + "epoch": 5.232368528332654, + "grad_norm": 0.16378064453601837, + "learning_rate": 3.462172170131812e-05, + "loss": 3.6768, + "step": 77010 + }, + { + "epoch": 5.232708248403315, + "grad_norm": 0.15055197477340698, + "learning_rate": 3.4617475200434846e-05, + "loss": 3.7963, + "step": 77015 + }, + { + "epoch": 5.2330479684739775, + "grad_norm": 0.26713094115257263, + "learning_rate": 3.461322869955157e-05, + "loss": 3.8075, + "step": 77020 + }, + { + "epoch": 5.2333876885446395, + "grad_norm": 0.1967001110315323, + "learning_rate": 3.46089821986683e-05, + "loss": 3.9296, + "step": 77025 + }, + { + "epoch": 5.233727408615301, + "grad_norm": 0.15944981575012207, + "learning_rate": 3.460473569778503e-05, + "loss": 3.8403, + "step": 77030 + }, + { + "epoch": 5.234067128685963, + "grad_norm": 0.25548484921455383, + "learning_rate": 3.460048919690175e-05, + "loss": 3.9305, + "step": 77035 + }, + { + "epoch": 5.234406848756625, + "grad_norm": 0.21213826537132263, + "learning_rate": 3.459624269601848e-05, + "loss": 3.9764, + "step": 77040 + }, + { + "epoch": 5.234746568827286, + "grad_norm": 0.1698743999004364, + "learning_rate": 3.4591996195135214e-05, + "loss": 3.7658, + "step": 77045 + }, + { + "epoch": 5.235086288897948, + "grad_norm": 0.3048624098300934, + "learning_rate": 3.4587749694251936e-05, + "loss": 3.9021, + "step": 77050 + }, + { + "epoch": 5.23542600896861, + "grad_norm": 0.1592402160167694, + "learning_rate": 3.4583503193368664e-05, + "loss": 3.7807, + "step": 77055 + }, + { + "epoch": 5.235765729039271, + "grad_norm": 0.15117482841014862, + "learning_rate": 3.45792566924854e-05, + "loss": 3.6879, + "step": 77060 + }, + { + "epoch": 5.2361054491099335, + "grad_norm": 0.20542295277118683, + "learning_rate": 3.457501019160212e-05, + "loss": 3.8647, + "step": 77065 + }, + { + "epoch": 5.2364451691805955, + "grad_norm": 0.731503963470459, + "learning_rate": 3.457076369071885e-05, + "loss": 3.8745, + "step": 77070 + }, + { + "epoch": 5.236784889251257, + "grad_norm": 0.3569260835647583, + "learning_rate": 3.4566517189835576e-05, + "loss": 3.8809, + "step": 77075 + }, + { + "epoch": 5.237124609321919, + "grad_norm": 0.16571536660194397, + "learning_rate": 3.4562270688952304e-05, + "loss": 3.7851, + "step": 77080 + }, + { + "epoch": 5.237464329392581, + "grad_norm": 0.18802513182163239, + "learning_rate": 3.455802418806903e-05, + "loss": 3.9567, + "step": 77085 + }, + { + "epoch": 5.237804049463242, + "grad_norm": 0.16824793815612793, + "learning_rate": 3.455377768718576e-05, + "loss": 3.7287, + "step": 77090 + }, + { + "epoch": 5.238143769533904, + "grad_norm": 0.248779758810997, + "learning_rate": 3.454953118630249e-05, + "loss": 3.9744, + "step": 77095 + }, + { + "epoch": 5.238483489604566, + "grad_norm": 0.15702518820762634, + "learning_rate": 3.4545284685419216e-05, + "loss": 3.9399, + "step": 77100 + }, + { + "epoch": 5.238823209675227, + "grad_norm": 0.17366403341293335, + "learning_rate": 3.4541038184535944e-05, + "loss": 3.7349, + "step": 77105 + }, + { + "epoch": 5.2391629297458895, + "grad_norm": 0.19520549476146698, + "learning_rate": 3.453679168365267e-05, + "loss": 3.7864, + "step": 77110 + }, + { + "epoch": 5.239502649816552, + "grad_norm": 0.2442643791437149, + "learning_rate": 3.45325451827694e-05, + "loss": 3.7816, + "step": 77115 + }, + { + "epoch": 5.239842369887213, + "grad_norm": 0.16436287760734558, + "learning_rate": 3.452829868188613e-05, + "loss": 3.8555, + "step": 77120 + }, + { + "epoch": 5.240182089957875, + "grad_norm": 0.18243467807769775, + "learning_rate": 3.4524052181002856e-05, + "loss": 3.6959, + "step": 77125 + }, + { + "epoch": 5.240521810028537, + "grad_norm": 0.1682080626487732, + "learning_rate": 3.4519805680119584e-05, + "loss": 3.9858, + "step": 77130 + }, + { + "epoch": 5.240861530099198, + "grad_norm": 0.2471064180135727, + "learning_rate": 3.451555917923631e-05, + "loss": 3.8237, + "step": 77135 + }, + { + "epoch": 5.24120125016986, + "grad_norm": 0.2308158576488495, + "learning_rate": 3.451131267835303e-05, + "loss": 3.9806, + "step": 77140 + }, + { + "epoch": 5.241540970240522, + "grad_norm": 0.16688327491283417, + "learning_rate": 3.450706617746977e-05, + "loss": 3.6601, + "step": 77145 + }, + { + "epoch": 5.241880690311183, + "grad_norm": 0.16071143746376038, + "learning_rate": 3.4502819676586496e-05, + "loss": 3.9542, + "step": 77150 + }, + { + "epoch": 5.2422204103818455, + "grad_norm": 0.14978723227977753, + "learning_rate": 3.4498573175703224e-05, + "loss": 3.7773, + "step": 77155 + }, + { + "epoch": 5.242560130452507, + "grad_norm": 0.21692399680614471, + "learning_rate": 3.449432667481995e-05, + "loss": 3.8059, + "step": 77160 + }, + { + "epoch": 5.242899850523169, + "grad_norm": 0.173975870013237, + "learning_rate": 3.449008017393668e-05, + "loss": 3.9315, + "step": 77165 + }, + { + "epoch": 5.243239570593831, + "grad_norm": 0.3427883982658386, + "learning_rate": 3.448583367305341e-05, + "loss": 3.8844, + "step": 77170 + }, + { + "epoch": 5.243579290664492, + "grad_norm": 0.219336599111557, + "learning_rate": 3.448158717217013e-05, + "loss": 3.8655, + "step": 77175 + }, + { + "epoch": 5.243919010735154, + "grad_norm": 0.22137680649757385, + "learning_rate": 3.4477340671286864e-05, + "loss": 3.8172, + "step": 77180 + }, + { + "epoch": 5.244258730805816, + "grad_norm": 0.18532072007656097, + "learning_rate": 3.447309417040359e-05, + "loss": 3.7559, + "step": 77185 + }, + { + "epoch": 5.244598450876477, + "grad_norm": 0.18990473449230194, + "learning_rate": 3.446884766952031e-05, + "loss": 3.3786, + "step": 77190 + }, + { + "epoch": 5.244938170947139, + "grad_norm": 0.18918319046497345, + "learning_rate": 3.446460116863705e-05, + "loss": 3.9927, + "step": 77195 + }, + { + "epoch": 5.2452778910178015, + "grad_norm": 0.18597018718719482, + "learning_rate": 3.4460354667753776e-05, + "loss": 3.8381, + "step": 77200 + }, + { + "epoch": 5.245617611088463, + "grad_norm": 0.1506301462650299, + "learning_rate": 3.44561081668705e-05, + "loss": 3.8777, + "step": 77205 + }, + { + "epoch": 5.245957331159125, + "grad_norm": 0.18262450397014618, + "learning_rate": 3.4451861665987225e-05, + "loss": 3.6198, + "step": 77210 + }, + { + "epoch": 5.246297051229787, + "grad_norm": 0.1829984188079834, + "learning_rate": 3.444761516510396e-05, + "loss": 3.7578, + "step": 77215 + }, + { + "epoch": 5.246636771300448, + "grad_norm": 0.20995570719242096, + "learning_rate": 3.444336866422068e-05, + "loss": 4.0847, + "step": 77220 + }, + { + "epoch": 5.24697649137111, + "grad_norm": 0.1757945865392685, + "learning_rate": 3.443912216333741e-05, + "loss": 3.7587, + "step": 77225 + }, + { + "epoch": 5.247316211441772, + "grad_norm": 0.23161688446998596, + "learning_rate": 3.4434875662454144e-05, + "loss": 3.9638, + "step": 77230 + }, + { + "epoch": 5.247655931512433, + "grad_norm": 0.15667515993118286, + "learning_rate": 3.4430629161570865e-05, + "loss": 3.7119, + "step": 77235 + }, + { + "epoch": 5.2479956515830954, + "grad_norm": 0.20693142712116241, + "learning_rate": 3.442638266068759e-05, + "loss": 3.9088, + "step": 77240 + }, + { + "epoch": 5.2483353716537575, + "grad_norm": 0.15759113430976868, + "learning_rate": 3.442213615980433e-05, + "loss": 3.8939, + "step": 77245 + }, + { + "epoch": 5.248675091724419, + "grad_norm": 0.15983137488365173, + "learning_rate": 3.441788965892105e-05, + "loss": 3.7879, + "step": 77250 + }, + { + "epoch": 5.249014811795081, + "grad_norm": 0.21912378072738647, + "learning_rate": 3.441364315803778e-05, + "loss": 3.6127, + "step": 77255 + }, + { + "epoch": 5.249354531865743, + "grad_norm": 0.16627460718154907, + "learning_rate": 3.4409396657154505e-05, + "loss": 3.8864, + "step": 77260 + }, + { + "epoch": 5.249694251936404, + "grad_norm": 0.21033988893032074, + "learning_rate": 3.440515015627123e-05, + "loss": 3.9152, + "step": 77265 + }, + { + "epoch": 5.250033972007066, + "grad_norm": 0.17334045469760895, + "learning_rate": 3.440090365538796e-05, + "loss": 3.7641, + "step": 77270 + }, + { + "epoch": 5.250373692077728, + "grad_norm": 0.23485590517520905, + "learning_rate": 3.439665715450469e-05, + "loss": 3.8456, + "step": 77275 + }, + { + "epoch": 5.250713412148389, + "grad_norm": 0.2380029410123825, + "learning_rate": 3.439241065362142e-05, + "loss": 3.9299, + "step": 77280 + }, + { + "epoch": 5.2510531322190515, + "grad_norm": 0.5499261021614075, + "learning_rate": 3.4388164152738145e-05, + "loss": 3.9276, + "step": 77285 + }, + { + "epoch": 5.2513928522897135, + "grad_norm": 0.14162175357341766, + "learning_rate": 3.438391765185487e-05, + "loss": 3.7943, + "step": 77290 + }, + { + "epoch": 5.251732572360375, + "grad_norm": 1.4046413898468018, + "learning_rate": 3.43796711509716e-05, + "loss": 3.8028, + "step": 77295 + }, + { + "epoch": 5.252072292431037, + "grad_norm": 0.19227235019207, + "learning_rate": 3.437542465008833e-05, + "loss": 3.8203, + "step": 77300 + }, + { + "epoch": 5.252412012501699, + "grad_norm": 0.16258375346660614, + "learning_rate": 3.437117814920506e-05, + "loss": 3.9792, + "step": 77305 + }, + { + "epoch": 5.25275173257236, + "grad_norm": 0.2502858340740204, + "learning_rate": 3.436693164832178e-05, + "loss": 3.92, + "step": 77310 + }, + { + "epoch": 5.253091452643022, + "grad_norm": 0.15980912744998932, + "learning_rate": 3.436268514743851e-05, + "loss": 4.0015, + "step": 77315 + }, + { + "epoch": 5.253431172713684, + "grad_norm": 0.16410334408283234, + "learning_rate": 3.435843864655524e-05, + "loss": 3.7289, + "step": 77320 + }, + { + "epoch": 5.253770892784345, + "grad_norm": 0.2290796935558319, + "learning_rate": 3.435419214567197e-05, + "loss": 3.9735, + "step": 77325 + }, + { + "epoch": 5.2541106128550075, + "grad_norm": 0.17726589739322662, + "learning_rate": 3.43499456447887e-05, + "loss": 3.797, + "step": 77330 + }, + { + "epoch": 5.2544503329256695, + "grad_norm": 0.15424564480781555, + "learning_rate": 3.4345699143905425e-05, + "loss": 3.8302, + "step": 77335 + }, + { + "epoch": 5.254790052996331, + "grad_norm": 0.1256771981716156, + "learning_rate": 3.434145264302215e-05, + "loss": 3.9477, + "step": 77340 + }, + { + "epoch": 5.255129773066993, + "grad_norm": 0.882987916469574, + "learning_rate": 3.433720614213888e-05, + "loss": 3.9331, + "step": 77345 + }, + { + "epoch": 5.255469493137655, + "grad_norm": 0.18188773095607758, + "learning_rate": 3.433295964125561e-05, + "loss": 3.8067, + "step": 77350 + }, + { + "epoch": 5.255809213208316, + "grad_norm": 0.24554823338985443, + "learning_rate": 3.432871314037234e-05, + "loss": 3.8022, + "step": 77355 + }, + { + "epoch": 5.256148933278978, + "grad_norm": 0.15611612796783447, + "learning_rate": 3.432446663948906e-05, + "loss": 3.7889, + "step": 77360 + }, + { + "epoch": 5.25648865334964, + "grad_norm": 0.1603335291147232, + "learning_rate": 3.432022013860579e-05, + "loss": 3.8438, + "step": 77365 + }, + { + "epoch": 5.256828373420301, + "grad_norm": 0.15471529960632324, + "learning_rate": 3.431597363772252e-05, + "loss": 4.0547, + "step": 77370 + }, + { + "epoch": 5.2571680934909635, + "grad_norm": 0.1755596548318863, + "learning_rate": 3.431172713683924e-05, + "loss": 3.827, + "step": 77375 + }, + { + "epoch": 5.2575078135616256, + "grad_norm": 0.21786054968833923, + "learning_rate": 3.430748063595598e-05, + "loss": 3.8363, + "step": 77380 + }, + { + "epoch": 5.257847533632287, + "grad_norm": 0.42398956418037415, + "learning_rate": 3.4303234135072705e-05, + "loss": 3.9143, + "step": 77385 + }, + { + "epoch": 5.258187253702949, + "grad_norm": 0.20250919461250305, + "learning_rate": 3.429898763418943e-05, + "loss": 3.9829, + "step": 77390 + }, + { + "epoch": 5.258526973773611, + "grad_norm": 0.17798525094985962, + "learning_rate": 3.4294741133306155e-05, + "loss": 3.9136, + "step": 77395 + }, + { + "epoch": 5.258866693844272, + "grad_norm": 0.14708758890628815, + "learning_rate": 3.429049463242289e-05, + "loss": 3.5709, + "step": 77400 + }, + { + "epoch": 5.259206413914934, + "grad_norm": 0.162577286362648, + "learning_rate": 3.428624813153961e-05, + "loss": 3.9967, + "step": 77405 + }, + { + "epoch": 5.259546133985596, + "grad_norm": 0.21142642199993134, + "learning_rate": 3.428200163065634e-05, + "loss": 4.0099, + "step": 77410 + }, + { + "epoch": 5.259885854056257, + "grad_norm": 0.7800741195678711, + "learning_rate": 3.4277755129773073e-05, + "loss": 3.8945, + "step": 77415 + }, + { + "epoch": 5.2602255741269195, + "grad_norm": 0.17223407328128815, + "learning_rate": 3.4273508628889795e-05, + "loss": 4.1273, + "step": 77420 + }, + { + "epoch": 5.260565294197582, + "grad_norm": 0.21928806602954865, + "learning_rate": 3.426926212800652e-05, + "loss": 3.6634, + "step": 77425 + }, + { + "epoch": 5.260905014268243, + "grad_norm": 0.19181841611862183, + "learning_rate": 3.426501562712325e-05, + "loss": 3.9222, + "step": 77430 + }, + { + "epoch": 5.261244734338905, + "grad_norm": 0.2190755158662796, + "learning_rate": 3.426076912623998e-05, + "loss": 3.9393, + "step": 77435 + }, + { + "epoch": 5.261584454409567, + "grad_norm": 0.16672611236572266, + "learning_rate": 3.425652262535671e-05, + "loss": 4.0636, + "step": 77440 + }, + { + "epoch": 5.261924174480228, + "grad_norm": 0.1654195338487625, + "learning_rate": 3.4252276124473435e-05, + "loss": 3.8703, + "step": 77445 + }, + { + "epoch": 5.26226389455089, + "grad_norm": 0.18828140199184418, + "learning_rate": 3.424802962359016e-05, + "loss": 3.9218, + "step": 77450 + }, + { + "epoch": 5.262603614621552, + "grad_norm": 0.2898576855659485, + "learning_rate": 3.424378312270689e-05, + "loss": 3.7665, + "step": 77455 + }, + { + "epoch": 5.262943334692213, + "grad_norm": 0.17064248025417328, + "learning_rate": 3.423953662182362e-05, + "loss": 3.9521, + "step": 77460 + }, + { + "epoch": 5.2632830547628755, + "grad_norm": 0.1436094045639038, + "learning_rate": 3.423529012094035e-05, + "loss": 4.0382, + "step": 77465 + }, + { + "epoch": 5.263622774833538, + "grad_norm": 0.19483333826065063, + "learning_rate": 3.4231043620057075e-05, + "loss": 3.7674, + "step": 77470 + }, + { + "epoch": 5.263962494904199, + "grad_norm": 0.2874303460121155, + "learning_rate": 3.42267971191738e-05, + "loss": 4.073, + "step": 77475 + }, + { + "epoch": 5.264302214974861, + "grad_norm": 0.26459217071533203, + "learning_rate": 3.422255061829053e-05, + "loss": 3.9194, + "step": 77480 + }, + { + "epoch": 5.264641935045523, + "grad_norm": 0.1502094715833664, + "learning_rate": 3.421830411740726e-05, + "loss": 3.71, + "step": 77485 + }, + { + "epoch": 5.264981655116184, + "grad_norm": 0.15481269359588623, + "learning_rate": 3.421405761652399e-05, + "loss": 3.7116, + "step": 77490 + }, + { + "epoch": 5.265321375186846, + "grad_norm": 0.1784498393535614, + "learning_rate": 3.4209811115640715e-05, + "loss": 3.7193, + "step": 77495 + }, + { + "epoch": 5.265661095257508, + "grad_norm": 0.17766378819942474, + "learning_rate": 3.420556461475744e-05, + "loss": 3.843, + "step": 77500 + }, + { + "epoch": 5.266000815328169, + "grad_norm": 0.3850246071815491, + "learning_rate": 3.420131811387417e-05, + "loss": 3.9227, + "step": 77505 + }, + { + "epoch": 5.2663405353988315, + "grad_norm": 0.1625785231590271, + "learning_rate": 3.41970716129909e-05, + "loss": 3.7436, + "step": 77510 + }, + { + "epoch": 5.266680255469494, + "grad_norm": 0.1555401235818863, + "learning_rate": 3.419282511210763e-05, + "loss": 3.8858, + "step": 77515 + }, + { + "epoch": 5.267019975540155, + "grad_norm": 0.21906313300132751, + "learning_rate": 3.4188578611224355e-05, + "loss": 3.8826, + "step": 77520 + }, + { + "epoch": 5.267359695610817, + "grad_norm": 0.13489973545074463, + "learning_rate": 3.418433211034108e-05, + "loss": 3.8532, + "step": 77525 + }, + { + "epoch": 5.267699415681479, + "grad_norm": 0.4335535764694214, + "learning_rate": 3.4180085609457804e-05, + "loss": 3.7238, + "step": 77530 + }, + { + "epoch": 5.26803913575214, + "grad_norm": 0.21564944088459015, + "learning_rate": 3.417583910857454e-05, + "loss": 3.718, + "step": 77535 + }, + { + "epoch": 5.268378855822802, + "grad_norm": 0.18783412873744965, + "learning_rate": 3.417159260769127e-05, + "loss": 3.8623, + "step": 77540 + }, + { + "epoch": 5.268718575893463, + "grad_norm": 0.1633332073688507, + "learning_rate": 3.416734610680799e-05, + "loss": 3.9213, + "step": 77545 + }, + { + "epoch": 5.2690582959641254, + "grad_norm": 0.5200182199478149, + "learning_rate": 3.416309960592472e-05, + "loss": 3.8028, + "step": 77550 + }, + { + "epoch": 5.2693980160347875, + "grad_norm": 0.17106004059314728, + "learning_rate": 3.415885310504145e-05, + "loss": 3.6837, + "step": 77555 + }, + { + "epoch": 5.269737736105449, + "grad_norm": 0.1936454027891159, + "learning_rate": 3.415460660415817e-05, + "loss": 3.8324, + "step": 77560 + }, + { + "epoch": 5.270077456176111, + "grad_norm": 0.1956002563238144, + "learning_rate": 3.41503601032749e-05, + "loss": 3.8804, + "step": 77565 + }, + { + "epoch": 5.270417176246773, + "grad_norm": 0.28375208377838135, + "learning_rate": 3.4146113602391635e-05, + "loss": 3.9437, + "step": 77570 + }, + { + "epoch": 5.270756896317434, + "grad_norm": 0.18236908316612244, + "learning_rate": 3.4141867101508356e-05, + "loss": 4.149, + "step": 77575 + }, + { + "epoch": 5.271096616388096, + "grad_norm": 0.21307137608528137, + "learning_rate": 3.4137620600625084e-05, + "loss": 3.7975, + "step": 77580 + }, + { + "epoch": 5.271436336458758, + "grad_norm": 0.18040554225444794, + "learning_rate": 3.413337409974182e-05, + "loss": 3.8915, + "step": 77585 + }, + { + "epoch": 5.271776056529419, + "grad_norm": 0.15905189514160156, + "learning_rate": 3.412912759885854e-05, + "loss": 3.7273, + "step": 77590 + }, + { + "epoch": 5.2721157766000815, + "grad_norm": 0.27827510237693787, + "learning_rate": 3.412488109797527e-05, + "loss": 4.011, + "step": 77595 + }, + { + "epoch": 5.2724554966707435, + "grad_norm": 0.19551241397857666, + "learning_rate": 3.4120634597091996e-05, + "loss": 3.8146, + "step": 77600 + }, + { + "epoch": 5.272795216741405, + "grad_norm": 0.16477283835411072, + "learning_rate": 3.4116388096208724e-05, + "loss": 3.7538, + "step": 77605 + }, + { + "epoch": 5.273134936812067, + "grad_norm": 0.16521626710891724, + "learning_rate": 3.411214159532545e-05, + "loss": 3.7264, + "step": 77610 + }, + { + "epoch": 5.273474656882729, + "grad_norm": 0.17286387085914612, + "learning_rate": 3.410789509444218e-05, + "loss": 3.9639, + "step": 77615 + }, + { + "epoch": 5.27381437695339, + "grad_norm": 0.19181574881076813, + "learning_rate": 3.410364859355891e-05, + "loss": 3.8928, + "step": 77620 + }, + { + "epoch": 5.274154097024052, + "grad_norm": 0.508616030216217, + "learning_rate": 3.4099402092675636e-05, + "loss": 3.8565, + "step": 77625 + }, + { + "epoch": 5.274493817094714, + "grad_norm": 0.17906910181045532, + "learning_rate": 3.4095155591792364e-05, + "loss": 3.9758, + "step": 77630 + }, + { + "epoch": 5.274833537165375, + "grad_norm": 0.1920541375875473, + "learning_rate": 3.409090909090909e-05, + "loss": 3.9197, + "step": 77635 + }, + { + "epoch": 5.2751732572360375, + "grad_norm": 0.16346272826194763, + "learning_rate": 3.408666259002582e-05, + "loss": 4.06, + "step": 77640 + }, + { + "epoch": 5.2755129773066995, + "grad_norm": 0.16247427463531494, + "learning_rate": 3.408241608914255e-05, + "loss": 3.7221, + "step": 77645 + }, + { + "epoch": 5.275852697377361, + "grad_norm": 0.17004233598709106, + "learning_rate": 3.4078169588259276e-05, + "loss": 3.8102, + "step": 77650 + }, + { + "epoch": 5.276192417448023, + "grad_norm": 0.1952553689479828, + "learning_rate": 3.4073923087376004e-05, + "loss": 3.8401, + "step": 77655 + }, + { + "epoch": 5.276532137518685, + "grad_norm": 0.8224812746047974, + "learning_rate": 3.406967658649273e-05, + "loss": 3.9421, + "step": 77660 + }, + { + "epoch": 5.276871857589346, + "grad_norm": 0.5184217095375061, + "learning_rate": 3.406543008560946e-05, + "loss": 3.7039, + "step": 77665 + }, + { + "epoch": 5.277211577660008, + "grad_norm": 0.6061617732048035, + "learning_rate": 3.406118358472619e-05, + "loss": 4.0317, + "step": 77670 + }, + { + "epoch": 5.27755129773067, + "grad_norm": 0.1754300594329834, + "learning_rate": 3.4056937083842916e-05, + "loss": 3.6859, + "step": 77675 + }, + { + "epoch": 5.277891017801331, + "grad_norm": 0.22637790441513062, + "learning_rate": 3.4052690582959644e-05, + "loss": 3.5217, + "step": 77680 + }, + { + "epoch": 5.2782307378719935, + "grad_norm": 0.1929861605167389, + "learning_rate": 3.404844408207637e-05, + "loss": 3.9042, + "step": 77685 + }, + { + "epoch": 5.278570457942656, + "grad_norm": 0.20925524830818176, + "learning_rate": 3.40441975811931e-05, + "loss": 3.8005, + "step": 77690 + }, + { + "epoch": 5.278910178013317, + "grad_norm": 0.19714713096618652, + "learning_rate": 3.403995108030983e-05, + "loss": 3.9413, + "step": 77695 + }, + { + "epoch": 5.279249898083979, + "grad_norm": 2.1855223178863525, + "learning_rate": 3.403570457942655e-05, + "loss": 3.7067, + "step": 77700 + }, + { + "epoch": 5.279589618154641, + "grad_norm": 0.1674615442752838, + "learning_rate": 3.4031458078543284e-05, + "loss": 4.0152, + "step": 77705 + }, + { + "epoch": 5.279929338225302, + "grad_norm": 1.883317470550537, + "learning_rate": 3.402721157766001e-05, + "loss": 3.562, + "step": 77710 + }, + { + "epoch": 5.280269058295964, + "grad_norm": 0.2116689383983612, + "learning_rate": 3.4022965076776734e-05, + "loss": 3.8393, + "step": 77715 + }, + { + "epoch": 5.280608778366626, + "grad_norm": 0.19270747900009155, + "learning_rate": 3.401871857589347e-05, + "loss": 3.9351, + "step": 77720 + }, + { + "epoch": 5.280948498437287, + "grad_norm": 0.2189575433731079, + "learning_rate": 3.4014472075010196e-05, + "loss": 3.8977, + "step": 77725 + }, + { + "epoch": 5.2812882185079495, + "grad_norm": 0.5378872156143188, + "learning_rate": 3.401022557412692e-05, + "loss": 3.7749, + "step": 77730 + }, + { + "epoch": 5.281627938578612, + "grad_norm": 0.1858975738286972, + "learning_rate": 3.4005979073243646e-05, + "loss": 4.0121, + "step": 77735 + }, + { + "epoch": 5.281967658649273, + "grad_norm": 0.19305533170700073, + "learning_rate": 3.400173257236038e-05, + "loss": 3.7686, + "step": 77740 + }, + { + "epoch": 5.282307378719935, + "grad_norm": 0.243051677942276, + "learning_rate": 3.39974860714771e-05, + "loss": 3.7701, + "step": 77745 + }, + { + "epoch": 5.282647098790597, + "grad_norm": 0.1685151308774948, + "learning_rate": 3.399323957059383e-05, + "loss": 3.6124, + "step": 77750 + }, + { + "epoch": 5.282986818861258, + "grad_norm": 0.1469566375017166, + "learning_rate": 3.3988993069710564e-05, + "loss": 3.9168, + "step": 77755 + }, + { + "epoch": 5.28332653893192, + "grad_norm": 0.1956639140844345, + "learning_rate": 3.3984746568827286e-05, + "loss": 4.0386, + "step": 77760 + }, + { + "epoch": 5.283666259002582, + "grad_norm": 0.159184068441391, + "learning_rate": 3.3980500067944014e-05, + "loss": 3.4333, + "step": 77765 + }, + { + "epoch": 5.284005979073243, + "grad_norm": 0.16274495422840118, + "learning_rate": 3.397625356706075e-05, + "loss": 3.7612, + "step": 77770 + }, + { + "epoch": 5.2843456991439055, + "grad_norm": 0.26503175497055054, + "learning_rate": 3.397200706617747e-05, + "loss": 3.813, + "step": 77775 + }, + { + "epoch": 5.284685419214568, + "grad_norm": 0.6275253295898438, + "learning_rate": 3.39677605652942e-05, + "loss": 3.6375, + "step": 77780 + }, + { + "epoch": 5.285025139285229, + "grad_norm": 0.3370520770549774, + "learning_rate": 3.3963514064410926e-05, + "loss": 3.8241, + "step": 77785 + }, + { + "epoch": 5.285364859355891, + "grad_norm": 0.20223239064216614, + "learning_rate": 3.3959267563527654e-05, + "loss": 3.9751, + "step": 77790 + }, + { + "epoch": 5.285704579426553, + "grad_norm": 0.20120130479335785, + "learning_rate": 3.395502106264438e-05, + "loss": 3.7039, + "step": 77795 + }, + { + "epoch": 5.286044299497214, + "grad_norm": 0.3322441875934601, + "learning_rate": 3.395077456176111e-05, + "loss": 3.7139, + "step": 77800 + }, + { + "epoch": 5.286384019567876, + "grad_norm": 0.14379292726516724, + "learning_rate": 3.394652806087784e-05, + "loss": 3.7398, + "step": 77805 + }, + { + "epoch": 5.286723739638538, + "grad_norm": 0.18198442459106445, + "learning_rate": 3.3942281559994566e-05, + "loss": 3.9078, + "step": 77810 + }, + { + "epoch": 5.287063459709199, + "grad_norm": 0.20342986285686493, + "learning_rate": 3.3938035059111294e-05, + "loss": 3.9298, + "step": 77815 + }, + { + "epoch": 5.2874031797798615, + "grad_norm": 0.29173532128334045, + "learning_rate": 3.393378855822802e-05, + "loss": 3.875, + "step": 77820 + }, + { + "epoch": 5.287742899850523, + "grad_norm": 0.1688195914030075, + "learning_rate": 3.392954205734475e-05, + "loss": 3.8181, + "step": 77825 + }, + { + "epoch": 5.288082619921185, + "grad_norm": 0.169798344373703, + "learning_rate": 3.392529555646148e-05, + "loss": 3.5761, + "step": 77830 + }, + { + "epoch": 5.288422339991847, + "grad_norm": 0.1738365888595581, + "learning_rate": 3.3921049055578206e-05, + "loss": 3.8282, + "step": 77835 + }, + { + "epoch": 5.288762060062508, + "grad_norm": 0.14575590193271637, + "learning_rate": 3.3916802554694934e-05, + "loss": 4.0823, + "step": 77840 + }, + { + "epoch": 5.28910178013317, + "grad_norm": 0.21894916892051697, + "learning_rate": 3.391255605381166e-05, + "loss": 3.802, + "step": 77845 + }, + { + "epoch": 5.289441500203832, + "grad_norm": 0.21534687280654907, + "learning_rate": 3.390830955292839e-05, + "loss": 3.8393, + "step": 77850 + }, + { + "epoch": 5.289781220274493, + "grad_norm": 0.5183839201927185, + "learning_rate": 3.390406305204512e-05, + "loss": 4.0894, + "step": 77855 + }, + { + "epoch": 5.2901209403451555, + "grad_norm": 0.42704081535339355, + "learning_rate": 3.3899816551161846e-05, + "loss": 3.9178, + "step": 77860 + }, + { + "epoch": 5.2904606604158175, + "grad_norm": 0.1737382709980011, + "learning_rate": 3.3895570050278574e-05, + "loss": 3.7694, + "step": 77865 + }, + { + "epoch": 5.290800380486479, + "grad_norm": 0.14227400720119476, + "learning_rate": 3.38913235493953e-05, + "loss": 3.7009, + "step": 77870 + }, + { + "epoch": 5.291140100557141, + "grad_norm": 0.16333593428134918, + "learning_rate": 3.388707704851203e-05, + "loss": 4.0873, + "step": 77875 + }, + { + "epoch": 5.291479820627803, + "grad_norm": 0.18710501492023468, + "learning_rate": 3.388283054762876e-05, + "loss": 3.6769, + "step": 77880 + }, + { + "epoch": 5.291819540698464, + "grad_norm": 0.17924757301807404, + "learning_rate": 3.387858404674548e-05, + "loss": 3.7612, + "step": 77885 + }, + { + "epoch": 5.292159260769126, + "grad_norm": 0.32857242226600647, + "learning_rate": 3.3874337545862214e-05, + "loss": 3.7548, + "step": 77890 + }, + { + "epoch": 5.292498980839788, + "grad_norm": 0.19908997416496277, + "learning_rate": 3.387009104497894e-05, + "loss": 3.7659, + "step": 77895 + }, + { + "epoch": 5.292838700910449, + "grad_norm": 0.1939539611339569, + "learning_rate": 3.386584454409566e-05, + "loss": 3.8332, + "step": 77900 + }, + { + "epoch": 5.2931784209811115, + "grad_norm": 0.19186708331108093, + "learning_rate": 3.38615980432124e-05, + "loss": 3.8112, + "step": 77905 + }, + { + "epoch": 5.2935181410517735, + "grad_norm": 0.18705803155899048, + "learning_rate": 3.3857351542329126e-05, + "loss": 3.6667, + "step": 77910 + }, + { + "epoch": 5.293857861122435, + "grad_norm": 0.1939193606376648, + "learning_rate": 3.385310504144585e-05, + "loss": 3.8096, + "step": 77915 + }, + { + "epoch": 5.294197581193097, + "grad_norm": 0.21993248164653778, + "learning_rate": 3.3848858540562575e-05, + "loss": 3.729, + "step": 77920 + }, + { + "epoch": 5.294537301263759, + "grad_norm": 0.16458119451999664, + "learning_rate": 3.384461203967931e-05, + "loss": 3.7876, + "step": 77925 + }, + { + "epoch": 5.29487702133442, + "grad_norm": 0.17448128759860992, + "learning_rate": 3.384036553879603e-05, + "loss": 3.7176, + "step": 77930 + }, + { + "epoch": 5.295216741405082, + "grad_norm": 0.14946907758712769, + "learning_rate": 3.383611903791276e-05, + "loss": 4.0056, + "step": 77935 + }, + { + "epoch": 5.295556461475744, + "grad_norm": 0.21576790511608124, + "learning_rate": 3.3831872537029494e-05, + "loss": 3.7349, + "step": 77940 + }, + { + "epoch": 5.295896181546405, + "grad_norm": 0.30705171823501587, + "learning_rate": 3.3827626036146215e-05, + "loss": 3.9565, + "step": 77945 + }, + { + "epoch": 5.2962359016170675, + "grad_norm": 0.15112265944480896, + "learning_rate": 3.382337953526294e-05, + "loss": 3.9275, + "step": 77950 + }, + { + "epoch": 5.2965756216877296, + "grad_norm": 0.18178242444992065, + "learning_rate": 3.381913303437967e-05, + "loss": 3.9515, + "step": 77955 + }, + { + "epoch": 5.296915341758391, + "grad_norm": 0.1593107134103775, + "learning_rate": 3.38148865334964e-05, + "loss": 3.6658, + "step": 77960 + }, + { + "epoch": 5.297255061829053, + "grad_norm": 0.2883705496788025, + "learning_rate": 3.381064003261313e-05, + "loss": 4.0634, + "step": 77965 + }, + { + "epoch": 5.297594781899715, + "grad_norm": 0.1456449329853058, + "learning_rate": 3.3806393531729855e-05, + "loss": 3.8653, + "step": 77970 + }, + { + "epoch": 5.297934501970376, + "grad_norm": 0.24563072621822357, + "learning_rate": 3.380214703084658e-05, + "loss": 3.6872, + "step": 77975 + }, + { + "epoch": 5.298274222041038, + "grad_norm": 0.26526308059692383, + "learning_rate": 3.379790052996331e-05, + "loss": 4.1012, + "step": 77980 + }, + { + "epoch": 5.2986139421117, + "grad_norm": 0.22993141412734985, + "learning_rate": 3.379365402908004e-05, + "loss": 3.7535, + "step": 77985 + }, + { + "epoch": 5.298953662182361, + "grad_norm": 0.15708300471305847, + "learning_rate": 3.378940752819677e-05, + "loss": 3.76, + "step": 77990 + }, + { + "epoch": 5.2992933822530235, + "grad_norm": 0.2075941264629364, + "learning_rate": 3.3785161027313495e-05, + "loss": 3.7077, + "step": 77995 + }, + { + "epoch": 5.299633102323686, + "grad_norm": 0.16292382776737213, + "learning_rate": 3.378091452643022e-05, + "loss": 3.8636, + "step": 78000 + }, + { + "epoch": 5.299972822394347, + "grad_norm": 0.3111187815666199, + "learning_rate": 3.377666802554695e-05, + "loss": 3.8966, + "step": 78005 + }, + { + "epoch": 5.300312542465009, + "grad_norm": 0.21505124866962433, + "learning_rate": 3.377242152466368e-05, + "loss": 3.5591, + "step": 78010 + }, + { + "epoch": 5.300652262535671, + "grad_norm": 0.258320689201355, + "learning_rate": 3.376817502378041e-05, + "loss": 3.8816, + "step": 78015 + }, + { + "epoch": 5.300991982606332, + "grad_norm": 0.22108185291290283, + "learning_rate": 3.3763928522897135e-05, + "loss": 3.6859, + "step": 78020 + }, + { + "epoch": 5.301331702676994, + "grad_norm": 0.18567340075969696, + "learning_rate": 3.375968202201386e-05, + "loss": 3.8499, + "step": 78025 + }, + { + "epoch": 5.301671422747656, + "grad_norm": 0.9342329502105713, + "learning_rate": 3.375543552113059e-05, + "loss": 3.7684, + "step": 78030 + }, + { + "epoch": 5.302011142818317, + "grad_norm": 2.5925076007843018, + "learning_rate": 3.375118902024732e-05, + "loss": 3.982, + "step": 78035 + }, + { + "epoch": 5.3023508628889795, + "grad_norm": 0.1649339348077774, + "learning_rate": 3.374694251936405e-05, + "loss": 3.8911, + "step": 78040 + }, + { + "epoch": 5.302690582959642, + "grad_norm": 0.12965840101242065, + "learning_rate": 3.3742696018480775e-05, + "loss": 3.826, + "step": 78045 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 0.24850276112556458, + "learning_rate": 3.37384495175975e-05, + "loss": 3.5872, + "step": 78050 + }, + { + "epoch": 5.303370023100965, + "grad_norm": 0.2284393608570099, + "learning_rate": 3.3734203016714225e-05, + "loss": 3.9796, + "step": 78055 + }, + { + "epoch": 5.303709743171627, + "grad_norm": 0.20682981610298157, + "learning_rate": 3.372995651583096e-05, + "loss": 3.7715, + "step": 78060 + }, + { + "epoch": 5.304049463242288, + "grad_norm": 0.22698011994361877, + "learning_rate": 3.372571001494769e-05, + "loss": 3.9636, + "step": 78065 + }, + { + "epoch": 5.30438918331295, + "grad_norm": 0.16829948127269745, + "learning_rate": 3.372146351406441e-05, + "loss": 3.8503, + "step": 78070 + }, + { + "epoch": 5.304728903383612, + "grad_norm": 0.19901463389396667, + "learning_rate": 3.371721701318114e-05, + "loss": 3.6733, + "step": 78075 + }, + { + "epoch": 5.305068623454273, + "grad_norm": 0.21850620210170746, + "learning_rate": 3.371297051229787e-05, + "loss": 3.9246, + "step": 78080 + }, + { + "epoch": 5.3054083435249355, + "grad_norm": 0.1835915595293045, + "learning_rate": 3.370872401141459e-05, + "loss": 3.66, + "step": 78085 + }, + { + "epoch": 5.305748063595598, + "grad_norm": 0.21755553781986237, + "learning_rate": 3.370447751053132e-05, + "loss": 3.9279, + "step": 78090 + }, + { + "epoch": 5.306087783666259, + "grad_norm": 0.1675761640071869, + "learning_rate": 3.3700231009648055e-05, + "loss": 3.8029, + "step": 78095 + }, + { + "epoch": 5.306427503736921, + "grad_norm": 0.19387777149677277, + "learning_rate": 3.369598450876478e-05, + "loss": 3.6971, + "step": 78100 + }, + { + "epoch": 5.306767223807583, + "grad_norm": 0.1405857354402542, + "learning_rate": 3.3691738007881505e-05, + "loss": 3.9856, + "step": 78105 + }, + { + "epoch": 5.307106943878244, + "grad_norm": 0.19893084466457367, + "learning_rate": 3.368749150699824e-05, + "loss": 4.2273, + "step": 78110 + }, + { + "epoch": 5.307446663948906, + "grad_norm": 0.14402100443840027, + "learning_rate": 3.368324500611496e-05, + "loss": 3.9691, + "step": 78115 + }, + { + "epoch": 5.307786384019568, + "grad_norm": 0.15755531191825867, + "learning_rate": 3.367899850523169e-05, + "loss": 3.7569, + "step": 78120 + }, + { + "epoch": 5.3081261040902294, + "grad_norm": 0.18521583080291748, + "learning_rate": 3.367475200434842e-05, + "loss": 3.7932, + "step": 78125 + }, + { + "epoch": 5.3084658241608915, + "grad_norm": 0.25103792548179626, + "learning_rate": 3.3670505503465145e-05, + "loss": 3.947, + "step": 78130 + }, + { + "epoch": 5.308805544231554, + "grad_norm": 0.12529806792736053, + "learning_rate": 3.366625900258187e-05, + "loss": 3.612, + "step": 78135 + }, + { + "epoch": 5.309145264302215, + "grad_norm": 0.205535426735878, + "learning_rate": 3.36620125016986e-05, + "loss": 3.6995, + "step": 78140 + }, + { + "epoch": 5.309484984372877, + "grad_norm": 0.20645464956760406, + "learning_rate": 3.365776600081533e-05, + "loss": 3.6699, + "step": 78145 + }, + { + "epoch": 5.309824704443539, + "grad_norm": 0.1879822313785553, + "learning_rate": 3.365351949993206e-05, + "loss": 3.8636, + "step": 78150 + }, + { + "epoch": 5.3101644245142, + "grad_norm": 0.14624054729938507, + "learning_rate": 3.3649272999048785e-05, + "loss": 3.9075, + "step": 78155 + }, + { + "epoch": 5.310504144584862, + "grad_norm": 0.3122122585773468, + "learning_rate": 3.364502649816551e-05, + "loss": 4.0146, + "step": 78160 + }, + { + "epoch": 5.310843864655524, + "grad_norm": 0.15490111708641052, + "learning_rate": 3.364077999728224e-05, + "loss": 4.1317, + "step": 78165 + }, + { + "epoch": 5.3111835847261855, + "grad_norm": 0.15961489081382751, + "learning_rate": 3.363653349639897e-05, + "loss": 3.8143, + "step": 78170 + }, + { + "epoch": 5.3115233047968475, + "grad_norm": 0.17580898106098175, + "learning_rate": 3.36322869955157e-05, + "loss": 3.9268, + "step": 78175 + }, + { + "epoch": 5.31186302486751, + "grad_norm": 0.2540402114391327, + "learning_rate": 3.3628040494632425e-05, + "loss": 4.2563, + "step": 78180 + }, + { + "epoch": 5.312202744938171, + "grad_norm": 0.16905078291893005, + "learning_rate": 3.362379399374915e-05, + "loss": 3.7094, + "step": 78185 + }, + { + "epoch": 5.312542465008833, + "grad_norm": 4.325218677520752, + "learning_rate": 3.361954749286588e-05, + "loss": 3.678, + "step": 78190 + }, + { + "epoch": 5.312882185079495, + "grad_norm": 0.1543290615081787, + "learning_rate": 3.3616150292159256e-05, + "loss": 3.6984, + "step": 78195 + }, + { + "epoch": 5.313221905150156, + "grad_norm": 0.14473621547222137, + "learning_rate": 3.361190379127599e-05, + "loss": 3.8883, + "step": 78200 + }, + { + "epoch": 5.313561625220818, + "grad_norm": 0.18343010544776917, + "learning_rate": 3.360765729039272e-05, + "loss": 4.2025, + "step": 78205 + }, + { + "epoch": 5.31390134529148, + "grad_norm": 0.2644944190979004, + "learning_rate": 3.360341078950945e-05, + "loss": 3.982, + "step": 78210 + }, + { + "epoch": 5.3142410653621415, + "grad_norm": 0.5299903154373169, + "learning_rate": 3.3599164288626175e-05, + "loss": 3.7982, + "step": 78215 + }, + { + "epoch": 5.3145807854328035, + "grad_norm": 0.19014719128608704, + "learning_rate": 3.35949177877429e-05, + "loss": 3.9072, + "step": 78220 + }, + { + "epoch": 5.314920505503465, + "grad_norm": 0.20344357192516327, + "learning_rate": 3.359067128685963e-05, + "loss": 3.9978, + "step": 78225 + }, + { + "epoch": 5.315260225574127, + "grad_norm": 0.19392387568950653, + "learning_rate": 3.358642478597636e-05, + "loss": 3.8694, + "step": 78230 + }, + { + "epoch": 5.315599945644789, + "grad_norm": 0.16153086721897125, + "learning_rate": 3.358217828509309e-05, + "loss": 3.6898, + "step": 78235 + }, + { + "epoch": 5.31593966571545, + "grad_norm": 0.15462496876716614, + "learning_rate": 3.3577931784209815e-05, + "loss": 3.9673, + "step": 78240 + }, + { + "epoch": 5.316279385786112, + "grad_norm": 0.7955722808837891, + "learning_rate": 3.3573685283326536e-05, + "loss": 3.8156, + "step": 78245 + }, + { + "epoch": 5.316619105856774, + "grad_norm": 0.16972851753234863, + "learning_rate": 3.356943878244327e-05, + "loss": 3.8291, + "step": 78250 + }, + { + "epoch": 5.316958825927435, + "grad_norm": 0.18692512810230255, + "learning_rate": 3.356519228156e-05, + "loss": 3.9258, + "step": 78255 + }, + { + "epoch": 5.3172985459980975, + "grad_norm": 0.23974823951721191, + "learning_rate": 3.356094578067672e-05, + "loss": 3.8506, + "step": 78260 + }, + { + "epoch": 5.31763826606876, + "grad_norm": 0.5348193049430847, + "learning_rate": 3.3556699279793455e-05, + "loss": 3.7256, + "step": 78265 + }, + { + "epoch": 5.317977986139421, + "grad_norm": 0.17749613523483276, + "learning_rate": 3.355245277891018e-05, + "loss": 3.8477, + "step": 78270 + }, + { + "epoch": 5.318317706210083, + "grad_norm": 0.17043296992778778, + "learning_rate": 3.3548206278026905e-05, + "loss": 3.779, + "step": 78275 + }, + { + "epoch": 5.318657426280745, + "grad_norm": 0.17311955988407135, + "learning_rate": 3.354395977714363e-05, + "loss": 3.6205, + "step": 78280 + }, + { + "epoch": 5.318997146351406, + "grad_norm": 0.22165866196155548, + "learning_rate": 3.353971327626037e-05, + "loss": 3.8362, + "step": 78285 + }, + { + "epoch": 5.319336866422068, + "grad_norm": 0.14432962238788605, + "learning_rate": 3.353546677537709e-05, + "loss": 3.7915, + "step": 78290 + }, + { + "epoch": 5.31967658649273, + "grad_norm": 0.16798697412014008, + "learning_rate": 3.3531220274493817e-05, + "loss": 4.0107, + "step": 78295 + }, + { + "epoch": 5.320016306563391, + "grad_norm": 0.16353824734687805, + "learning_rate": 3.352697377361055e-05, + "loss": 3.6935, + "step": 78300 + }, + { + "epoch": 5.3203560266340535, + "grad_norm": 0.8591974973678589, + "learning_rate": 3.352272727272727e-05, + "loss": 3.7927, + "step": 78305 + }, + { + "epoch": 5.320695746704716, + "grad_norm": 0.736920177936554, + "learning_rate": 3.3518480771844e-05, + "loss": 3.6509, + "step": 78310 + }, + { + "epoch": 5.321035466775377, + "grad_norm": 1.0239330530166626, + "learning_rate": 3.351423427096073e-05, + "loss": 4.0086, + "step": 78315 + }, + { + "epoch": 5.321375186846039, + "grad_norm": 0.17457452416419983, + "learning_rate": 3.3509987770077457e-05, + "loss": 3.6834, + "step": 78320 + }, + { + "epoch": 5.321714906916701, + "grad_norm": 0.17188073694705963, + "learning_rate": 3.3505741269194185e-05, + "loss": 3.864, + "step": 78325 + }, + { + "epoch": 5.322054626987362, + "grad_norm": 1.695752501487732, + "learning_rate": 3.350149476831091e-05, + "loss": 3.8854, + "step": 78330 + }, + { + "epoch": 5.322394347058024, + "grad_norm": 0.7915775179862976, + "learning_rate": 3.349724826742764e-05, + "loss": 3.8087, + "step": 78335 + }, + { + "epoch": 5.322734067128686, + "grad_norm": 0.20750416815280914, + "learning_rate": 3.349300176654437e-05, + "loss": 3.916, + "step": 78340 + }, + { + "epoch": 5.323073787199347, + "grad_norm": 0.3975570797920227, + "learning_rate": 3.34887552656611e-05, + "loss": 3.8923, + "step": 78345 + }, + { + "epoch": 5.3234135072700095, + "grad_norm": 0.15842880308628082, + "learning_rate": 3.3484508764777825e-05, + "loss": 3.828, + "step": 78350 + }, + { + "epoch": 5.323753227340672, + "grad_norm": 0.21678757667541504, + "learning_rate": 3.348026226389455e-05, + "loss": 3.8436, + "step": 78355 + }, + { + "epoch": 5.324092947411333, + "grad_norm": 0.8385623097419739, + "learning_rate": 3.347601576301128e-05, + "loss": 3.9093, + "step": 78360 + }, + { + "epoch": 5.324432667481995, + "grad_norm": 0.17850320041179657, + "learning_rate": 3.347176926212801e-05, + "loss": 3.7528, + "step": 78365 + }, + { + "epoch": 5.324772387552657, + "grad_norm": 2.1984987258911133, + "learning_rate": 3.346752276124474e-05, + "loss": 3.8979, + "step": 78370 + }, + { + "epoch": 5.325112107623318, + "grad_norm": 0.1846625953912735, + "learning_rate": 3.3463276260361465e-05, + "loss": 3.8479, + "step": 78375 + }, + { + "epoch": 5.32545182769398, + "grad_norm": 0.1598161906003952, + "learning_rate": 3.345902975947819e-05, + "loss": 4.0075, + "step": 78380 + }, + { + "epoch": 5.325791547764642, + "grad_norm": 0.1843433529138565, + "learning_rate": 3.345478325859492e-05, + "loss": 3.9698, + "step": 78385 + }, + { + "epoch": 5.326131267835303, + "grad_norm": 0.16876652836799622, + "learning_rate": 3.345053675771165e-05, + "loss": 3.7286, + "step": 78390 + }, + { + "epoch": 5.3264709879059655, + "grad_norm": 0.19993099570274353, + "learning_rate": 3.344629025682838e-05, + "loss": 3.997, + "step": 78395 + }, + { + "epoch": 5.326810707976628, + "grad_norm": 0.21351933479309082, + "learning_rate": 3.3442043755945105e-05, + "loss": 3.6843, + "step": 78400 + }, + { + "epoch": 5.327150428047289, + "grad_norm": 0.16609473526477814, + "learning_rate": 3.343779725506183e-05, + "loss": 3.8325, + "step": 78405 + }, + { + "epoch": 5.327490148117951, + "grad_norm": 0.13396939635276794, + "learning_rate": 3.343355075417856e-05, + "loss": 3.7657, + "step": 78410 + }, + { + "epoch": 5.327829868188613, + "grad_norm": 0.18067754805088043, + "learning_rate": 3.342930425329528e-05, + "loss": 3.8393, + "step": 78415 + }, + { + "epoch": 5.328169588259274, + "grad_norm": 0.18209421634674072, + "learning_rate": 3.342505775241202e-05, + "loss": 3.592, + "step": 78420 + }, + { + "epoch": 5.328509308329936, + "grad_norm": 0.16601021587848663, + "learning_rate": 3.3420811251528745e-05, + "loss": 3.8425, + "step": 78425 + }, + { + "epoch": 5.328849028400598, + "grad_norm": 0.1596093475818634, + "learning_rate": 3.3416564750645466e-05, + "loss": 3.743, + "step": 78430 + }, + { + "epoch": 5.3291887484712595, + "grad_norm": 0.15947091579437256, + "learning_rate": 3.34123182497622e-05, + "loss": 3.6081, + "step": 78435 + }, + { + "epoch": 5.3295284685419215, + "grad_norm": 0.19339273869991302, + "learning_rate": 3.340807174887893e-05, + "loss": 3.9859, + "step": 78440 + }, + { + "epoch": 5.329868188612584, + "grad_norm": 0.16530556976795197, + "learning_rate": 3.340382524799565e-05, + "loss": 3.8185, + "step": 78445 + }, + { + "epoch": 5.330207908683245, + "grad_norm": 0.20082996785640717, + "learning_rate": 3.339957874711238e-05, + "loss": 3.8758, + "step": 78450 + }, + { + "epoch": 5.330547628753907, + "grad_norm": 0.3808183968067169, + "learning_rate": 3.339533224622911e-05, + "loss": 4.0949, + "step": 78455 + }, + { + "epoch": 5.330887348824569, + "grad_norm": 0.7443317174911499, + "learning_rate": 3.3391085745345834e-05, + "loss": 3.9918, + "step": 78460 + }, + { + "epoch": 5.33122706889523, + "grad_norm": 0.27781912684440613, + "learning_rate": 3.338683924446256e-05, + "loss": 4.0045, + "step": 78465 + }, + { + "epoch": 5.331566788965892, + "grad_norm": 0.4595417380332947, + "learning_rate": 3.33825927435793e-05, + "loss": 3.6269, + "step": 78470 + }, + { + "epoch": 5.331906509036554, + "grad_norm": 0.3026861846446991, + "learning_rate": 3.337834624269602e-05, + "loss": 3.7908, + "step": 78475 + }, + { + "epoch": 5.3322462291072155, + "grad_norm": 0.3631027042865753, + "learning_rate": 3.3374099741812746e-05, + "loss": 4.1302, + "step": 78480 + }, + { + "epoch": 5.3325859491778775, + "grad_norm": 0.26730671525001526, + "learning_rate": 3.3369853240929474e-05, + "loss": 3.813, + "step": 78485 + }, + { + "epoch": 5.33292566924854, + "grad_norm": 0.21708308160305023, + "learning_rate": 3.33656067400462e-05, + "loss": 3.8166, + "step": 78490 + }, + { + "epoch": 5.333265389319201, + "grad_norm": 0.28427156805992126, + "learning_rate": 3.336136023916293e-05, + "loss": 3.6347, + "step": 78495 + }, + { + "epoch": 5.333605109389863, + "grad_norm": 0.20935532450675964, + "learning_rate": 3.335711373827966e-05, + "loss": 3.7026, + "step": 78500 + }, + { + "epoch": 5.333944829460524, + "grad_norm": 0.15116003155708313, + "learning_rate": 3.3352867237396386e-05, + "loss": 4.032, + "step": 78505 + }, + { + "epoch": 5.334284549531186, + "grad_norm": 0.18993665277957916, + "learning_rate": 3.3348620736513114e-05, + "loss": 3.6005, + "step": 78510 + }, + { + "epoch": 5.334624269601848, + "grad_norm": 0.19323158264160156, + "learning_rate": 3.334437423562984e-05, + "loss": 3.6649, + "step": 78515 + }, + { + "epoch": 5.334963989672509, + "grad_norm": 0.17507930099964142, + "learning_rate": 3.334012773474657e-05, + "loss": 3.9678, + "step": 78520 + }, + { + "epoch": 5.3353037097431715, + "grad_norm": 0.15689338743686676, + "learning_rate": 3.33358812338633e-05, + "loss": 4.1205, + "step": 78525 + }, + { + "epoch": 5.3356434298138335, + "grad_norm": 0.15372046828269958, + "learning_rate": 3.3331634732980026e-05, + "loss": 3.9179, + "step": 78530 + }, + { + "epoch": 5.335983149884495, + "grad_norm": 0.1540265679359436, + "learning_rate": 3.3327388232096754e-05, + "loss": 3.8658, + "step": 78535 + }, + { + "epoch": 5.336322869955157, + "grad_norm": 0.16989223659038544, + "learning_rate": 3.332314173121348e-05, + "loss": 3.6386, + "step": 78540 + }, + { + "epoch": 5.336662590025819, + "grad_norm": 0.1722949892282486, + "learning_rate": 3.331889523033021e-05, + "loss": 3.8811, + "step": 78545 + }, + { + "epoch": 5.33700231009648, + "grad_norm": 0.15728029608726501, + "learning_rate": 3.331464872944694e-05, + "loss": 3.6907, + "step": 78550 + }, + { + "epoch": 5.337342030167142, + "grad_norm": 0.1566518098115921, + "learning_rate": 3.3310402228563666e-05, + "loss": 3.8228, + "step": 78555 + }, + { + "epoch": 5.337681750237804, + "grad_norm": 0.16814720630645752, + "learning_rate": 3.3306155727680394e-05, + "loss": 4.0677, + "step": 78560 + }, + { + "epoch": 5.338021470308465, + "grad_norm": 0.1936332732439041, + "learning_rate": 3.330190922679712e-05, + "loss": 3.8156, + "step": 78565 + }, + { + "epoch": 5.3383611903791275, + "grad_norm": 0.1889888197183609, + "learning_rate": 3.329766272591385e-05, + "loss": 3.6744, + "step": 78570 + }, + { + "epoch": 5.33870091044979, + "grad_norm": 0.20537924766540527, + "learning_rate": 3.329341622503058e-05, + "loss": 3.6593, + "step": 78575 + }, + { + "epoch": 5.339040630520451, + "grad_norm": 0.16923171281814575, + "learning_rate": 3.3289169724147306e-05, + "loss": 3.7275, + "step": 78580 + }, + { + "epoch": 5.339380350591113, + "grad_norm": 0.20887666940689087, + "learning_rate": 3.328492322326403e-05, + "loss": 3.7193, + "step": 78585 + }, + { + "epoch": 5.339720070661775, + "grad_norm": 0.1894235908985138, + "learning_rate": 3.328067672238076e-05, + "loss": 3.6974, + "step": 78590 + }, + { + "epoch": 5.340059790732436, + "grad_norm": 0.18793444335460663, + "learning_rate": 3.327643022149749e-05, + "loss": 4.0015, + "step": 78595 + }, + { + "epoch": 5.340399510803098, + "grad_norm": 0.21965977549552917, + "learning_rate": 3.327218372061421e-05, + "loss": 3.8941, + "step": 78600 + }, + { + "epoch": 5.34073923087376, + "grad_norm": 0.157006174325943, + "learning_rate": 3.3267937219730946e-05, + "loss": 3.9015, + "step": 78605 + }, + { + "epoch": 5.341078950944421, + "grad_norm": 1.341359257698059, + "learning_rate": 3.3263690718847674e-05, + "loss": 3.8925, + "step": 78610 + }, + { + "epoch": 5.3414186710150835, + "grad_norm": 0.1891997903585434, + "learning_rate": 3.3259444217964395e-05, + "loss": 3.7862, + "step": 78615 + }, + { + "epoch": 5.341758391085746, + "grad_norm": 0.1862678825855255, + "learning_rate": 3.3255197717081123e-05, + "loss": 3.8923, + "step": 78620 + }, + { + "epoch": 5.342098111156407, + "grad_norm": 0.2232414335012436, + "learning_rate": 3.325095121619786e-05, + "loss": 3.8299, + "step": 78625 + }, + { + "epoch": 5.342437831227069, + "grad_norm": 0.22188115119934082, + "learning_rate": 3.324670471531458e-05, + "loss": 4.0438, + "step": 78630 + }, + { + "epoch": 5.342777551297731, + "grad_norm": 2.85922908782959, + "learning_rate": 3.324245821443131e-05, + "loss": 3.9116, + "step": 78635 + }, + { + "epoch": 5.343117271368392, + "grad_norm": 0.13931353390216827, + "learning_rate": 3.323821171354804e-05, + "loss": 3.6332, + "step": 78640 + }, + { + "epoch": 5.343456991439054, + "grad_norm": 0.32716819643974304, + "learning_rate": 3.3233965212664764e-05, + "loss": 3.9112, + "step": 78645 + }, + { + "epoch": 5.343796711509716, + "grad_norm": 0.4860353171825409, + "learning_rate": 3.322971871178149e-05, + "loss": 3.7029, + "step": 78650 + }, + { + "epoch": 5.344136431580377, + "grad_norm": 0.19240939617156982, + "learning_rate": 3.3225472210898226e-05, + "loss": 3.9286, + "step": 78655 + }, + { + "epoch": 5.3444761516510395, + "grad_norm": 0.22828567028045654, + "learning_rate": 3.322122571001495e-05, + "loss": 4.0041, + "step": 78660 + }, + { + "epoch": 5.344815871721702, + "grad_norm": 0.22677023708820343, + "learning_rate": 3.3216979209131676e-05, + "loss": 3.8946, + "step": 78665 + }, + { + "epoch": 5.345155591792363, + "grad_norm": 0.19907517731189728, + "learning_rate": 3.3212732708248404e-05, + "loss": 3.9634, + "step": 78670 + }, + { + "epoch": 5.345495311863025, + "grad_norm": 0.1688217967748642, + "learning_rate": 3.320848620736513e-05, + "loss": 3.7422, + "step": 78675 + }, + { + "epoch": 5.345835031933687, + "grad_norm": 0.18821828067302704, + "learning_rate": 3.320423970648186e-05, + "loss": 3.762, + "step": 78680 + }, + { + "epoch": 5.346174752004348, + "grad_norm": 0.16222217679023743, + "learning_rate": 3.319999320559859e-05, + "loss": 3.7346, + "step": 78685 + }, + { + "epoch": 5.34651447207501, + "grad_norm": 0.1714888960123062, + "learning_rate": 3.3195746704715316e-05, + "loss": 3.966, + "step": 78690 + }, + { + "epoch": 5.346854192145672, + "grad_norm": 0.14590846002101898, + "learning_rate": 3.3191500203832044e-05, + "loss": 3.9439, + "step": 78695 + }, + { + "epoch": 5.3471939122163334, + "grad_norm": 0.15456528961658478, + "learning_rate": 3.318725370294877e-05, + "loss": 3.9454, + "step": 78700 + }, + { + "epoch": 5.3475336322869955, + "grad_norm": 0.1410723626613617, + "learning_rate": 3.31830072020655e-05, + "loss": 3.934, + "step": 78705 + }, + { + "epoch": 5.347873352357658, + "grad_norm": 0.15879923105239868, + "learning_rate": 3.317876070118223e-05, + "loss": 3.4778, + "step": 78710 + }, + { + "epoch": 5.348213072428319, + "grad_norm": 0.5707345008850098, + "learning_rate": 3.3174514200298956e-05, + "loss": 3.7179, + "step": 78715 + }, + { + "epoch": 5.348552792498981, + "grad_norm": 0.15858086943626404, + "learning_rate": 3.3170267699415684e-05, + "loss": 3.76, + "step": 78720 + }, + { + "epoch": 5.348892512569643, + "grad_norm": 0.16071730852127075, + "learning_rate": 3.316602119853241e-05, + "loss": 4.0079, + "step": 78725 + }, + { + "epoch": 5.349232232640304, + "grad_norm": 0.20222589373588562, + "learning_rate": 3.316177469764914e-05, + "loss": 3.8886, + "step": 78730 + }, + { + "epoch": 5.349571952710966, + "grad_norm": 1.6246392726898193, + "learning_rate": 3.315752819676587e-05, + "loss": 3.8952, + "step": 78735 + }, + { + "epoch": 5.349911672781628, + "grad_norm": 0.23815932869911194, + "learning_rate": 3.3153281695882596e-05, + "loss": 3.8112, + "step": 78740 + }, + { + "epoch": 5.3502513928522895, + "grad_norm": 0.3704369068145752, + "learning_rate": 3.3149035194999324e-05, + "loss": 3.6733, + "step": 78745 + }, + { + "epoch": 5.3505911129229515, + "grad_norm": 0.18404585123062134, + "learning_rate": 3.314478869411605e-05, + "loss": 3.9499, + "step": 78750 + }, + { + "epoch": 5.350930832993614, + "grad_norm": 0.1665336638689041, + "learning_rate": 3.314054219323278e-05, + "loss": 3.9738, + "step": 78755 + }, + { + "epoch": 5.351270553064275, + "grad_norm": 0.17683961987495422, + "learning_rate": 3.313629569234951e-05, + "loss": 3.863, + "step": 78760 + }, + { + "epoch": 5.351610273134937, + "grad_norm": 0.21115641295909882, + "learning_rate": 3.3132049191466236e-05, + "loss": 4.0774, + "step": 78765 + }, + { + "epoch": 5.351949993205599, + "grad_norm": 0.20260709524154663, + "learning_rate": 3.312780269058296e-05, + "loss": 3.8119, + "step": 78770 + }, + { + "epoch": 5.35228971327626, + "grad_norm": 0.16675864160060883, + "learning_rate": 3.312355618969969e-05, + "loss": 3.6609, + "step": 78775 + }, + { + "epoch": 5.352629433346922, + "grad_norm": 0.17001298069953918, + "learning_rate": 3.311930968881642e-05, + "loss": 3.9185, + "step": 78780 + }, + { + "epoch": 5.352969153417584, + "grad_norm": 0.21594761312007904, + "learning_rate": 3.311506318793314e-05, + "loss": 3.5301, + "step": 78785 + }, + { + "epoch": 5.3533088734882455, + "grad_norm": 0.15234020352363586, + "learning_rate": 3.3110816687049876e-05, + "loss": 3.889, + "step": 78790 + }, + { + "epoch": 5.3536485935589075, + "grad_norm": 0.19672144949436188, + "learning_rate": 3.3106570186166604e-05, + "loss": 3.6256, + "step": 78795 + }, + { + "epoch": 5.35398831362957, + "grad_norm": 0.2191658467054367, + "learning_rate": 3.3102323685283325e-05, + "loss": 3.479, + "step": 78800 + }, + { + "epoch": 5.354328033700231, + "grad_norm": 0.16446107625961304, + "learning_rate": 3.309807718440005e-05, + "loss": 3.842, + "step": 78805 + }, + { + "epoch": 5.354667753770893, + "grad_norm": 0.2176152467727661, + "learning_rate": 3.309383068351679e-05, + "loss": 3.8063, + "step": 78810 + }, + { + "epoch": 5.355007473841555, + "grad_norm": 0.13734328746795654, + "learning_rate": 3.308958418263351e-05, + "loss": 3.7042, + "step": 78815 + }, + { + "epoch": 5.355347193912216, + "grad_norm": 0.1694459170103073, + "learning_rate": 3.308533768175024e-05, + "loss": 4.0701, + "step": 78820 + }, + { + "epoch": 5.355686913982878, + "grad_norm": 0.1840628832578659, + "learning_rate": 3.308109118086697e-05, + "loss": 3.7688, + "step": 78825 + }, + { + "epoch": 5.35602663405354, + "grad_norm": 0.1589602679014206, + "learning_rate": 3.307684467998369e-05, + "loss": 3.6739, + "step": 78830 + }, + { + "epoch": 5.3563663541242015, + "grad_norm": 0.22284093499183655, + "learning_rate": 3.307259817910042e-05, + "loss": 3.6848, + "step": 78835 + }, + { + "epoch": 5.3567060741948636, + "grad_norm": 0.18583092093467712, + "learning_rate": 3.306835167821715e-05, + "loss": 3.9508, + "step": 78840 + }, + { + "epoch": 5.357045794265526, + "grad_norm": 0.2202303558588028, + "learning_rate": 3.306410517733388e-05, + "loss": 3.8367, + "step": 78845 + }, + { + "epoch": 5.357385514336187, + "grad_norm": 0.15471115708351135, + "learning_rate": 3.3059858676450605e-05, + "loss": 3.7673, + "step": 78850 + }, + { + "epoch": 5.357725234406849, + "grad_norm": 0.22440287470817566, + "learning_rate": 3.305561217556733e-05, + "loss": 3.9118, + "step": 78855 + }, + { + "epoch": 5.358064954477511, + "grad_norm": 0.3078247904777527, + "learning_rate": 3.305136567468406e-05, + "loss": 3.8147, + "step": 78860 + }, + { + "epoch": 5.358404674548172, + "grad_norm": 0.1514856517314911, + "learning_rate": 3.304711917380079e-05, + "loss": 3.9058, + "step": 78865 + }, + { + "epoch": 5.358744394618834, + "grad_norm": 0.13395674526691437, + "learning_rate": 3.304287267291752e-05, + "loss": 3.6838, + "step": 78870 + }, + { + "epoch": 5.359084114689496, + "grad_norm": 0.17283886671066284, + "learning_rate": 3.3038626172034245e-05, + "loss": 3.874, + "step": 78875 + }, + { + "epoch": 5.3594238347601575, + "grad_norm": 0.16552622616291046, + "learning_rate": 3.303437967115097e-05, + "loss": 3.906, + "step": 78880 + }, + { + "epoch": 5.35976355483082, + "grad_norm": 0.17149868607521057, + "learning_rate": 3.30301331702677e-05, + "loss": 3.7955, + "step": 78885 + }, + { + "epoch": 5.360103274901482, + "grad_norm": 0.20818723738193512, + "learning_rate": 3.302588666938443e-05, + "loss": 4.0235, + "step": 78890 + }, + { + "epoch": 5.360442994972143, + "grad_norm": 0.1499912291765213, + "learning_rate": 3.302164016850116e-05, + "loss": 3.7567, + "step": 78895 + }, + { + "epoch": 5.360782715042805, + "grad_norm": 0.1781933754682541, + "learning_rate": 3.3017393667617885e-05, + "loss": 3.6441, + "step": 78900 + }, + { + "epoch": 5.361122435113466, + "grad_norm": 0.16883128881454468, + "learning_rate": 3.301314716673461e-05, + "loss": 3.7645, + "step": 78905 + }, + { + "epoch": 5.361462155184128, + "grad_norm": 0.16596703231334686, + "learning_rate": 3.300890066585134e-05, + "loss": 3.7362, + "step": 78910 + }, + { + "epoch": 5.36180187525479, + "grad_norm": 0.2056170403957367, + "learning_rate": 3.300465416496807e-05, + "loss": 3.8257, + "step": 78915 + }, + { + "epoch": 5.362141595325451, + "grad_norm": 0.17513734102249146, + "learning_rate": 3.30004076640848e-05, + "loss": 3.8896, + "step": 78920 + }, + { + "epoch": 5.3624813153961135, + "grad_norm": 0.1864723414182663, + "learning_rate": 3.2996161163201525e-05, + "loss": 3.8895, + "step": 78925 + }, + { + "epoch": 5.362821035466776, + "grad_norm": 0.13717010617256165, + "learning_rate": 3.299191466231825e-05, + "loss": 3.7001, + "step": 78930 + }, + { + "epoch": 5.363160755537437, + "grad_norm": 0.1955285668373108, + "learning_rate": 3.298766816143498e-05, + "loss": 3.8764, + "step": 78935 + }, + { + "epoch": 5.363500475608099, + "grad_norm": 0.1981886774301529, + "learning_rate": 3.29834216605517e-05, + "loss": 3.8456, + "step": 78940 + }, + { + "epoch": 5.363840195678761, + "grad_norm": 0.20413127541542053, + "learning_rate": 3.297917515966844e-05, + "loss": 3.7624, + "step": 78945 + }, + { + "epoch": 5.364179915749422, + "grad_norm": 0.1924290657043457, + "learning_rate": 3.2974928658785165e-05, + "loss": 3.9579, + "step": 78950 + }, + { + "epoch": 5.364519635820084, + "grad_norm": 0.2896873950958252, + "learning_rate": 3.2970682157901886e-05, + "loss": 3.9274, + "step": 78955 + }, + { + "epoch": 5.364859355890746, + "grad_norm": 0.17789454758167267, + "learning_rate": 3.296643565701862e-05, + "loss": 3.6421, + "step": 78960 + }, + { + "epoch": 5.365199075961407, + "grad_norm": 0.21748241782188416, + "learning_rate": 3.296218915613535e-05, + "loss": 3.7718, + "step": 78965 + }, + { + "epoch": 5.3655387960320695, + "grad_norm": 0.17550747096538544, + "learning_rate": 3.295794265525207e-05, + "loss": 3.6932, + "step": 78970 + }, + { + "epoch": 5.365878516102732, + "grad_norm": 0.14771859347820282, + "learning_rate": 3.29536961543688e-05, + "loss": 4.1095, + "step": 78975 + }, + { + "epoch": 5.366218236173393, + "grad_norm": 0.18587031960487366, + "learning_rate": 3.294944965348553e-05, + "loss": 3.8741, + "step": 78980 + }, + { + "epoch": 5.366557956244055, + "grad_norm": 0.1654374897480011, + "learning_rate": 3.2945203152602255e-05, + "loss": 3.9381, + "step": 78985 + }, + { + "epoch": 5.366897676314717, + "grad_norm": 0.1774025708436966, + "learning_rate": 3.294095665171898e-05, + "loss": 3.6617, + "step": 78990 + }, + { + "epoch": 5.367237396385378, + "grad_norm": 0.1582125723361969, + "learning_rate": 3.293671015083572e-05, + "loss": 3.9196, + "step": 78995 + }, + { + "epoch": 5.36757711645604, + "grad_norm": 0.2117973268032074, + "learning_rate": 3.293246364995244e-05, + "loss": 3.7302, + "step": 79000 + }, + { + "epoch": 5.367916836526702, + "grad_norm": 0.2164502739906311, + "learning_rate": 3.2928217149069167e-05, + "loss": 3.732, + "step": 79005 + }, + { + "epoch": 5.3682565565973634, + "grad_norm": 0.14938774704933167, + "learning_rate": 3.2923970648185895e-05, + "loss": 4.0208, + "step": 79010 + }, + { + "epoch": 5.3685962766680255, + "grad_norm": 0.19555138051509857, + "learning_rate": 3.291972414730262e-05, + "loss": 3.7096, + "step": 79015 + }, + { + "epoch": 5.368935996738688, + "grad_norm": 0.2373962551355362, + "learning_rate": 3.291547764641935e-05, + "loss": 4.0089, + "step": 79020 + }, + { + "epoch": 5.369275716809349, + "grad_norm": 0.1779651641845703, + "learning_rate": 3.291123114553608e-05, + "loss": 3.8545, + "step": 79025 + }, + { + "epoch": 5.369615436880011, + "grad_norm": 0.22254382073879242, + "learning_rate": 3.2906984644652807e-05, + "loss": 3.885, + "step": 79030 + }, + { + "epoch": 5.369955156950673, + "grad_norm": 0.1515815705060959, + "learning_rate": 3.2902738143769535e-05, + "loss": 3.8676, + "step": 79035 + }, + { + "epoch": 5.370294877021334, + "grad_norm": 0.2112218737602234, + "learning_rate": 3.289849164288626e-05, + "loss": 3.8012, + "step": 79040 + }, + { + "epoch": 5.370634597091996, + "grad_norm": 0.1696973592042923, + "learning_rate": 3.289424514200299e-05, + "loss": 3.7005, + "step": 79045 + }, + { + "epoch": 5.370974317162658, + "grad_norm": 0.31813564896583557, + "learning_rate": 3.288999864111972e-05, + "loss": 3.6253, + "step": 79050 + }, + { + "epoch": 5.3713140372333195, + "grad_norm": 0.1923508644104004, + "learning_rate": 3.288575214023645e-05, + "loss": 3.9464, + "step": 79055 + }, + { + "epoch": 5.3716537573039815, + "grad_norm": 0.17163892090320587, + "learning_rate": 3.2881505639353175e-05, + "loss": 3.7698, + "step": 79060 + }, + { + "epoch": 5.371993477374644, + "grad_norm": 0.18580520153045654, + "learning_rate": 3.28772591384699e-05, + "loss": 3.9308, + "step": 79065 + }, + { + "epoch": 5.372333197445305, + "grad_norm": 0.18746915459632874, + "learning_rate": 3.287301263758663e-05, + "loss": 3.702, + "step": 79070 + }, + { + "epoch": 5.372672917515967, + "grad_norm": 0.1502191573381424, + "learning_rate": 3.286876613670336e-05, + "loss": 3.7028, + "step": 79075 + }, + { + "epoch": 5.373012637586629, + "grad_norm": 0.15321102738380432, + "learning_rate": 3.286451963582009e-05, + "loss": 3.5748, + "step": 79080 + }, + { + "epoch": 5.37335235765729, + "grad_norm": 0.1713200807571411, + "learning_rate": 3.2860273134936815e-05, + "loss": 3.7802, + "step": 79085 + }, + { + "epoch": 5.373692077727952, + "grad_norm": 0.17856085300445557, + "learning_rate": 3.285602663405354e-05, + "loss": 3.799, + "step": 79090 + }, + { + "epoch": 5.374031797798614, + "grad_norm": 0.1666310578584671, + "learning_rate": 3.285178013317027e-05, + "loss": 3.6821, + "step": 79095 + }, + { + "epoch": 5.3743715178692755, + "grad_norm": 0.15420110523700714, + "learning_rate": 3.2847533632287e-05, + "loss": 3.7039, + "step": 79100 + }, + { + "epoch": 5.3747112379399375, + "grad_norm": 0.1623835265636444, + "learning_rate": 3.284328713140373e-05, + "loss": 3.9024, + "step": 79105 + }, + { + "epoch": 5.3750509580106, + "grad_norm": 0.16062341630458832, + "learning_rate": 3.283904063052045e-05, + "loss": 3.7481, + "step": 79110 + }, + { + "epoch": 5.375390678081261, + "grad_norm": 0.49303731322288513, + "learning_rate": 3.283479412963718e-05, + "loss": 3.64, + "step": 79115 + }, + { + "epoch": 5.375730398151923, + "grad_norm": 0.19324558973312378, + "learning_rate": 3.283054762875391e-05, + "loss": 3.9379, + "step": 79120 + }, + { + "epoch": 5.376070118222585, + "grad_norm": 0.33126088976860046, + "learning_rate": 3.282630112787063e-05, + "loss": 3.6923, + "step": 79125 + }, + { + "epoch": 5.376409838293246, + "grad_norm": 0.1621760129928589, + "learning_rate": 3.282205462698737e-05, + "loss": 3.9592, + "step": 79130 + }, + { + "epoch": 5.376749558363908, + "grad_norm": 0.18886853754520416, + "learning_rate": 3.2817808126104095e-05, + "loss": 3.6764, + "step": 79135 + }, + { + "epoch": 5.37708927843457, + "grad_norm": 0.17446720600128174, + "learning_rate": 3.2813561625220816e-05, + "loss": 3.9289, + "step": 79140 + }, + { + "epoch": 5.3774289985052315, + "grad_norm": 0.18839240074157715, + "learning_rate": 3.280931512433755e-05, + "loss": 3.9897, + "step": 79145 + }, + { + "epoch": 5.377768718575894, + "grad_norm": 0.18011228740215302, + "learning_rate": 3.280506862345428e-05, + "loss": 3.7514, + "step": 79150 + }, + { + "epoch": 5.378108438646556, + "grad_norm": 0.19468119740486145, + "learning_rate": 3.2800822122571e-05, + "loss": 3.9393, + "step": 79155 + }, + { + "epoch": 5.378448158717217, + "grad_norm": 0.16395357251167297, + "learning_rate": 3.279657562168773e-05, + "loss": 3.8444, + "step": 79160 + }, + { + "epoch": 5.378787878787879, + "grad_norm": 0.1736222356557846, + "learning_rate": 3.279232912080446e-05, + "loss": 3.7888, + "step": 79165 + }, + { + "epoch": 5.379127598858541, + "grad_norm": 0.2042563408613205, + "learning_rate": 3.2788082619921184e-05, + "loss": 4.0356, + "step": 79170 + }, + { + "epoch": 5.379467318929202, + "grad_norm": 0.21696536242961884, + "learning_rate": 3.278383611903791e-05, + "loss": 3.7495, + "step": 79175 + }, + { + "epoch": 5.379807038999864, + "grad_norm": 0.1781844198703766, + "learning_rate": 3.277958961815465e-05, + "loss": 4.1292, + "step": 79180 + }, + { + "epoch": 5.380146759070525, + "grad_norm": 0.1546722799539566, + "learning_rate": 3.277534311727137e-05, + "loss": 3.8427, + "step": 79185 + }, + { + "epoch": 5.3804864791411875, + "grad_norm": 0.13770751655101776, + "learning_rate": 3.2771096616388096e-05, + "loss": 3.879, + "step": 79190 + }, + { + "epoch": 5.38082619921185, + "grad_norm": 0.13709178566932678, + "learning_rate": 3.2766850115504824e-05, + "loss": 3.8616, + "step": 79195 + }, + { + "epoch": 5.381165919282511, + "grad_norm": 0.44175606966018677, + "learning_rate": 3.276260361462155e-05, + "loss": 3.8972, + "step": 79200 + }, + { + "epoch": 5.381505639353173, + "grad_norm": 0.13228893280029297, + "learning_rate": 3.275835711373828e-05, + "loss": 3.825, + "step": 79205 + }, + { + "epoch": 5.381845359423835, + "grad_norm": 0.18145689368247986, + "learning_rate": 3.275411061285501e-05, + "loss": 3.7474, + "step": 79210 + }, + { + "epoch": 5.382185079494496, + "grad_norm": 0.19022256135940552, + "learning_rate": 3.2749864111971736e-05, + "loss": 3.7956, + "step": 79215 + }, + { + "epoch": 5.382524799565158, + "grad_norm": 0.14601309597492218, + "learning_rate": 3.2745617611088464e-05, + "loss": 3.7465, + "step": 79220 + }, + { + "epoch": 5.38286451963582, + "grad_norm": 0.7265409231185913, + "learning_rate": 3.274137111020519e-05, + "loss": 3.7591, + "step": 79225 + }, + { + "epoch": 5.383204239706481, + "grad_norm": 0.2917368710041046, + "learning_rate": 3.273712460932192e-05, + "loss": 4.015, + "step": 79230 + }, + { + "epoch": 5.3835439597771435, + "grad_norm": 0.19611510634422302, + "learning_rate": 3.273287810843865e-05, + "loss": 3.9903, + "step": 79235 + }, + { + "epoch": 5.383883679847806, + "grad_norm": 0.15760505199432373, + "learning_rate": 3.2728631607555376e-05, + "loss": 3.8935, + "step": 79240 + }, + { + "epoch": 5.384223399918467, + "grad_norm": 0.21960046887397766, + "learning_rate": 3.2724385106672104e-05, + "loss": 3.7119, + "step": 79245 + }, + { + "epoch": 5.384563119989129, + "grad_norm": 0.19863992929458618, + "learning_rate": 3.272013860578883e-05, + "loss": 3.5887, + "step": 79250 + }, + { + "epoch": 5.384902840059791, + "grad_norm": 0.3332905173301697, + "learning_rate": 3.271589210490556e-05, + "loss": 3.6291, + "step": 79255 + }, + { + "epoch": 5.385242560130452, + "grad_norm": 0.14395463466644287, + "learning_rate": 3.271164560402229e-05, + "loss": 3.9212, + "step": 79260 + }, + { + "epoch": 5.385582280201114, + "grad_norm": 0.7579538822174072, + "learning_rate": 3.2707399103139016e-05, + "loss": 3.872, + "step": 79265 + }, + { + "epoch": 5.385922000271776, + "grad_norm": 0.1707170158624649, + "learning_rate": 3.2703152602255744e-05, + "loss": 3.6741, + "step": 79270 + }, + { + "epoch": 5.386261720342437, + "grad_norm": 0.1657305508852005, + "learning_rate": 3.269890610137247e-05, + "loss": 3.7484, + "step": 79275 + }, + { + "epoch": 5.3866014404130995, + "grad_norm": 0.20356836915016174, + "learning_rate": 3.26946596004892e-05, + "loss": 3.9228, + "step": 79280 + }, + { + "epoch": 5.386941160483762, + "grad_norm": 0.1703435629606247, + "learning_rate": 3.269041309960593e-05, + "loss": 3.6973, + "step": 79285 + }, + { + "epoch": 5.387280880554423, + "grad_norm": 0.1939012110233307, + "learning_rate": 3.2686166598722656e-05, + "loss": 3.7335, + "step": 79290 + }, + { + "epoch": 5.387620600625085, + "grad_norm": 0.15730541944503784, + "learning_rate": 3.268192009783938e-05, + "loss": 3.8131, + "step": 79295 + }, + { + "epoch": 5.387960320695747, + "grad_norm": 0.1629168838262558, + "learning_rate": 3.267767359695611e-05, + "loss": 3.6587, + "step": 79300 + }, + { + "epoch": 5.388300040766408, + "grad_norm": 0.21429817378520966, + "learning_rate": 3.267342709607284e-05, + "loss": 3.7125, + "step": 79305 + }, + { + "epoch": 5.38863976083707, + "grad_norm": 0.1420091688632965, + "learning_rate": 3.266918059518956e-05, + "loss": 3.6959, + "step": 79310 + }, + { + "epoch": 5.388979480907732, + "grad_norm": 0.18316836655139923, + "learning_rate": 3.2664934094306296e-05, + "loss": 3.7786, + "step": 79315 + }, + { + "epoch": 5.3893192009783935, + "grad_norm": 0.15360911190509796, + "learning_rate": 3.2660687593423024e-05, + "loss": 3.7862, + "step": 79320 + }, + { + "epoch": 5.3896589210490555, + "grad_norm": 0.1711878776550293, + "learning_rate": 3.2656441092539745e-05, + "loss": 4.067, + "step": 79325 + }, + { + "epoch": 5.389998641119718, + "grad_norm": 0.4511939287185669, + "learning_rate": 3.2652194591656473e-05, + "loss": 3.849, + "step": 79330 + }, + { + "epoch": 5.390338361190379, + "grad_norm": 0.1757393777370453, + "learning_rate": 3.264794809077321e-05, + "loss": 3.8003, + "step": 79335 + }, + { + "epoch": 5.390678081261041, + "grad_norm": 0.13083235919475555, + "learning_rate": 3.264370158988993e-05, + "loss": 3.8611, + "step": 79340 + }, + { + "epoch": 5.391017801331703, + "grad_norm": 0.2071414440870285, + "learning_rate": 3.263945508900666e-05, + "loss": 3.7575, + "step": 79345 + }, + { + "epoch": 5.391357521402364, + "grad_norm": 0.2157520204782486, + "learning_rate": 3.263520858812339e-05, + "loss": 3.9893, + "step": 79350 + }, + { + "epoch": 5.391697241473026, + "grad_norm": 0.15461251139640808, + "learning_rate": 3.2630962087240114e-05, + "loss": 3.7446, + "step": 79355 + }, + { + "epoch": 5.392036961543688, + "grad_norm": 0.18800309300422668, + "learning_rate": 3.262671558635684e-05, + "loss": 3.692, + "step": 79360 + }, + { + "epoch": 5.3923766816143495, + "grad_norm": 0.19682587683200836, + "learning_rate": 3.262246908547357e-05, + "loss": 3.8088, + "step": 79365 + }, + { + "epoch": 5.3927164016850115, + "grad_norm": 0.16800472140312195, + "learning_rate": 3.26182225845903e-05, + "loss": 3.5455, + "step": 79370 + }, + { + "epoch": 5.393056121755674, + "grad_norm": 0.21050967276096344, + "learning_rate": 3.2613976083707026e-05, + "loss": 3.7268, + "step": 79375 + }, + { + "epoch": 5.393395841826335, + "grad_norm": 0.14617502689361572, + "learning_rate": 3.2609729582823754e-05, + "loss": 3.5947, + "step": 79380 + }, + { + "epoch": 5.393735561896997, + "grad_norm": 0.17983637750148773, + "learning_rate": 3.260548308194048e-05, + "loss": 3.8078, + "step": 79385 + }, + { + "epoch": 5.394075281967659, + "grad_norm": 0.20252448320388794, + "learning_rate": 3.260123658105721e-05, + "loss": 3.943, + "step": 79390 + }, + { + "epoch": 5.39441500203832, + "grad_norm": 0.1624324768781662, + "learning_rate": 3.259699008017394e-05, + "loss": 3.772, + "step": 79395 + }, + { + "epoch": 5.394754722108982, + "grad_norm": 0.17700302600860596, + "learning_rate": 3.2592743579290666e-05, + "loss": 3.8286, + "step": 79400 + }, + { + "epoch": 5.395094442179644, + "grad_norm": 0.16283296048641205, + "learning_rate": 3.2588497078407394e-05, + "loss": 3.6652, + "step": 79405 + }, + { + "epoch": 5.3954341622503055, + "grad_norm": 0.25906285643577576, + "learning_rate": 3.258425057752412e-05, + "loss": 4.1206, + "step": 79410 + }, + { + "epoch": 5.3957738823209676, + "grad_norm": 0.18042545020580292, + "learning_rate": 3.258000407664085e-05, + "loss": 3.9666, + "step": 79415 + }, + { + "epoch": 5.39611360239163, + "grad_norm": 0.13783800601959229, + "learning_rate": 3.257575757575758e-05, + "loss": 3.9326, + "step": 79420 + }, + { + "epoch": 5.396453322462291, + "grad_norm": 0.14267022907733917, + "learning_rate": 3.2571511074874306e-05, + "loss": 3.7644, + "step": 79425 + }, + { + "epoch": 5.396793042532953, + "grad_norm": 0.15059047937393188, + "learning_rate": 3.2567264573991034e-05, + "loss": 3.7607, + "step": 79430 + }, + { + "epoch": 5.397132762603615, + "grad_norm": 0.184220090508461, + "learning_rate": 3.256301807310776e-05, + "loss": 3.8497, + "step": 79435 + }, + { + "epoch": 5.397472482674276, + "grad_norm": 0.12601450085639954, + "learning_rate": 3.255877157222449e-05, + "loss": 4.0785, + "step": 79440 + }, + { + "epoch": 5.397812202744938, + "grad_norm": 0.14315101504325867, + "learning_rate": 3.255452507134122e-05, + "loss": 3.8931, + "step": 79445 + }, + { + "epoch": 5.3981519228156, + "grad_norm": 0.18067404627799988, + "learning_rate": 3.2550278570457946e-05, + "loss": 3.6733, + "step": 79450 + }, + { + "epoch": 5.3984916428862615, + "grad_norm": 0.2330625206232071, + "learning_rate": 3.2546032069574674e-05, + "loss": 3.6685, + "step": 79455 + }, + { + "epoch": 5.398831362956924, + "grad_norm": 0.7886677384376526, + "learning_rate": 3.25417855686914e-05, + "loss": 4.0333, + "step": 79460 + }, + { + "epoch": 5.399171083027586, + "grad_norm": 0.22936318814754486, + "learning_rate": 3.253753906780812e-05, + "loss": 3.6909, + "step": 79465 + }, + { + "epoch": 5.399510803098247, + "grad_norm": 0.16908279061317444, + "learning_rate": 3.253329256692486e-05, + "loss": 3.791, + "step": 79470 + }, + { + "epoch": 5.399850523168909, + "grad_norm": 0.18552030622959137, + "learning_rate": 3.2529046066041586e-05, + "loss": 3.9148, + "step": 79475 + }, + { + "epoch": 5.400190243239571, + "grad_norm": 0.16188359260559082, + "learning_rate": 3.252479956515831e-05, + "loss": 3.8795, + "step": 79480 + }, + { + "epoch": 5.400529963310232, + "grad_norm": 0.17595775425434113, + "learning_rate": 3.252055306427504e-05, + "loss": 3.6867, + "step": 79485 + }, + { + "epoch": 5.400869683380894, + "grad_norm": 0.18285565078258514, + "learning_rate": 3.251630656339177e-05, + "loss": 3.863, + "step": 79490 + }, + { + "epoch": 5.401209403451556, + "grad_norm": 0.17966735363006592, + "learning_rate": 3.251206006250849e-05, + "loss": 3.8017, + "step": 79495 + }, + { + "epoch": 5.4015491235222175, + "grad_norm": 0.18371722102165222, + "learning_rate": 3.250781356162522e-05, + "loss": 3.8708, + "step": 79500 + }, + { + "epoch": 5.40188884359288, + "grad_norm": 0.18569478392601013, + "learning_rate": 3.2503567060741954e-05, + "loss": 3.6474, + "step": 79505 + }, + { + "epoch": 5.402228563663542, + "grad_norm": 0.15464742481708527, + "learning_rate": 3.2499320559858675e-05, + "loss": 3.694, + "step": 79510 + }, + { + "epoch": 5.402568283734203, + "grad_norm": 0.18397365510463715, + "learning_rate": 3.24950740589754e-05, + "loss": 3.9428, + "step": 79515 + }, + { + "epoch": 5.402908003804865, + "grad_norm": 0.1763172447681427, + "learning_rate": 3.249082755809214e-05, + "loss": 3.8527, + "step": 79520 + }, + { + "epoch": 5.403247723875527, + "grad_norm": 0.1627524495124817, + "learning_rate": 3.248658105720886e-05, + "loss": 3.9524, + "step": 79525 + }, + { + "epoch": 5.403587443946188, + "grad_norm": 0.1843963861465454, + "learning_rate": 3.248233455632559e-05, + "loss": 3.8083, + "step": 79530 + }, + { + "epoch": 5.40392716401685, + "grad_norm": 0.2670363485813141, + "learning_rate": 3.2478088055442315e-05, + "loss": 3.5014, + "step": 79535 + }, + { + "epoch": 5.404266884087512, + "grad_norm": 0.17850543558597565, + "learning_rate": 3.247384155455904e-05, + "loss": 3.7628, + "step": 79540 + }, + { + "epoch": 5.4046066041581735, + "grad_norm": 0.19806641340255737, + "learning_rate": 3.246959505367577e-05, + "loss": 3.8228, + "step": 79545 + }, + { + "epoch": 5.404946324228836, + "grad_norm": 0.1641513556241989, + "learning_rate": 3.24653485527925e-05, + "loss": 3.9073, + "step": 79550 + }, + { + "epoch": 5.405286044299498, + "grad_norm": 0.23233045637607574, + "learning_rate": 3.246110205190923e-05, + "loss": 3.9511, + "step": 79555 + }, + { + "epoch": 5.405625764370159, + "grad_norm": 0.1657295972108841, + "learning_rate": 3.2456855551025955e-05, + "loss": 4.0078, + "step": 79560 + }, + { + "epoch": 5.405965484440821, + "grad_norm": 0.17866955697536469, + "learning_rate": 3.245260905014268e-05, + "loss": 3.75, + "step": 79565 + }, + { + "epoch": 5.406305204511483, + "grad_norm": 0.1647479087114334, + "learning_rate": 3.244836254925942e-05, + "loss": 4.0178, + "step": 79570 + }, + { + "epoch": 5.406644924582144, + "grad_norm": 0.1946965456008911, + "learning_rate": 3.244411604837614e-05, + "loss": 3.8191, + "step": 79575 + }, + { + "epoch": 5.406984644652806, + "grad_norm": 0.2379240244626999, + "learning_rate": 3.243986954749287e-05, + "loss": 3.917, + "step": 79580 + }, + { + "epoch": 5.4073243647234674, + "grad_norm": 0.1640145182609558, + "learning_rate": 3.2435623046609595e-05, + "loss": 3.5907, + "step": 79585 + }, + { + "epoch": 5.4076640847941295, + "grad_norm": 0.17081619799137115, + "learning_rate": 3.243137654572632e-05, + "loss": 3.7738, + "step": 79590 + }, + { + "epoch": 5.408003804864792, + "grad_norm": 0.17133744060993195, + "learning_rate": 3.242713004484305e-05, + "loss": 3.5532, + "step": 79595 + }, + { + "epoch": 5.408343524935453, + "grad_norm": 0.12736313045024872, + "learning_rate": 3.242288354395978e-05, + "loss": 4.1798, + "step": 79600 + }, + { + "epoch": 5.408683245006115, + "grad_norm": 0.14382004737854004, + "learning_rate": 3.241863704307651e-05, + "loss": 3.6327, + "step": 79605 + }, + { + "epoch": 5.409022965076777, + "grad_norm": 0.19826005399227142, + "learning_rate": 3.2414390542193235e-05, + "loss": 3.8742, + "step": 79610 + }, + { + "epoch": 5.409362685147438, + "grad_norm": 0.16698022186756134, + "learning_rate": 3.241014404130996e-05, + "loss": 3.6239, + "step": 79615 + }, + { + "epoch": 5.4097024052181, + "grad_norm": 0.15823400020599365, + "learning_rate": 3.240589754042669e-05, + "loss": 3.6309, + "step": 79620 + }, + { + "epoch": 5.410042125288762, + "grad_norm": 0.35590875148773193, + "learning_rate": 3.240165103954342e-05, + "loss": 3.9557, + "step": 79625 + }, + { + "epoch": 5.4103818453594235, + "grad_norm": 0.15963716804981232, + "learning_rate": 3.239740453866015e-05, + "loss": 3.7682, + "step": 79630 + }, + { + "epoch": 5.4107215654300855, + "grad_norm": 0.38677850365638733, + "learning_rate": 3.239315803777687e-05, + "loss": 3.895, + "step": 79635 + }, + { + "epoch": 5.411061285500748, + "grad_norm": 0.2027491182088852, + "learning_rate": 3.23889115368936e-05, + "loss": 3.7341, + "step": 79640 + }, + { + "epoch": 5.411401005571409, + "grad_norm": 0.15673789381980896, + "learning_rate": 3.238466503601033e-05, + "loss": 3.8036, + "step": 79645 + }, + { + "epoch": 5.411740725642071, + "grad_norm": 1.0447702407836914, + "learning_rate": 3.238041853512705e-05, + "loss": 3.5889, + "step": 79650 + }, + { + "epoch": 5.412080445712733, + "grad_norm": 0.13681600987911224, + "learning_rate": 3.237617203424379e-05, + "loss": 3.9152, + "step": 79655 + }, + { + "epoch": 5.412420165783394, + "grad_norm": 0.1710919439792633, + "learning_rate": 3.2371925533360515e-05, + "loss": 3.6559, + "step": 79660 + }, + { + "epoch": 5.412759885854056, + "grad_norm": 0.2567235827445984, + "learning_rate": 3.2367679032477236e-05, + "loss": 4.0509, + "step": 79665 + }, + { + "epoch": 5.413099605924718, + "grad_norm": 0.18982622027397156, + "learning_rate": 3.236343253159397e-05, + "loss": 3.8501, + "step": 79670 + }, + { + "epoch": 5.4134393259953795, + "grad_norm": 0.18981008231639862, + "learning_rate": 3.23591860307107e-05, + "loss": 3.9434, + "step": 79675 + }, + { + "epoch": 5.4137790460660415, + "grad_norm": 0.15583068132400513, + "learning_rate": 3.235493952982742e-05, + "loss": 3.6716, + "step": 79680 + }, + { + "epoch": 5.414118766136704, + "grad_norm": 0.14592771232128143, + "learning_rate": 3.235069302894415e-05, + "loss": 3.7011, + "step": 79685 + }, + { + "epoch": 5.414458486207365, + "grad_norm": 0.2672421932220459, + "learning_rate": 3.234644652806088e-05, + "loss": 3.7513, + "step": 79690 + }, + { + "epoch": 5.414798206278027, + "grad_norm": 0.16572140157222748, + "learning_rate": 3.2342200027177604e-05, + "loss": 3.9084, + "step": 79695 + }, + { + "epoch": 5.415137926348689, + "grad_norm": 0.19854941964149475, + "learning_rate": 3.233795352629433e-05, + "loss": 3.6854, + "step": 79700 + }, + { + "epoch": 5.41547764641935, + "grad_norm": 0.1327054500579834, + "learning_rate": 3.233370702541107e-05, + "loss": 3.9555, + "step": 79705 + }, + { + "epoch": 5.415817366490012, + "grad_norm": 0.18888671696186066, + "learning_rate": 3.232946052452779e-05, + "loss": 3.7879, + "step": 79710 + }, + { + "epoch": 5.416157086560674, + "grad_norm": 0.2222934514284134, + "learning_rate": 3.2325214023644517e-05, + "loss": 3.7377, + "step": 79715 + }, + { + "epoch": 5.4164968066313355, + "grad_norm": 0.18450793623924255, + "learning_rate": 3.2320967522761245e-05, + "loss": 3.8507, + "step": 79720 + }, + { + "epoch": 5.416836526701998, + "grad_norm": 0.5587729215621948, + "learning_rate": 3.231672102187797e-05, + "loss": 3.9306, + "step": 79725 + }, + { + "epoch": 5.41717624677266, + "grad_norm": 0.17385166883468628, + "learning_rate": 3.23124745209947e-05, + "loss": 4.0845, + "step": 79730 + }, + { + "epoch": 5.417515966843321, + "grad_norm": 0.1630699783563614, + "learning_rate": 3.230822802011143e-05, + "loss": 3.8942, + "step": 79735 + }, + { + "epoch": 5.417855686913983, + "grad_norm": 0.24811118841171265, + "learning_rate": 3.230398151922816e-05, + "loss": 3.745, + "step": 79740 + }, + { + "epoch": 5.418195406984645, + "grad_norm": 0.23634278774261475, + "learning_rate": 3.2299735018344885e-05, + "loss": 3.7882, + "step": 79745 + }, + { + "epoch": 5.418535127055306, + "grad_norm": 0.21200667321681976, + "learning_rate": 3.229548851746161e-05, + "loss": 3.8766, + "step": 79750 + }, + { + "epoch": 5.418874847125968, + "grad_norm": 0.20425555109977722, + "learning_rate": 3.229124201657834e-05, + "loss": 4.0503, + "step": 79755 + }, + { + "epoch": 5.41921456719663, + "grad_norm": 0.8028151392936707, + "learning_rate": 3.228699551569507e-05, + "loss": 3.692, + "step": 79760 + }, + { + "epoch": 5.4195542872672915, + "grad_norm": 0.18870127201080322, + "learning_rate": 3.2282749014811797e-05, + "loss": 3.7644, + "step": 79765 + }, + { + "epoch": 5.419894007337954, + "grad_norm": 0.2119903415441513, + "learning_rate": 3.2278502513928525e-05, + "loss": 3.8323, + "step": 79770 + }, + { + "epoch": 5.420233727408616, + "grad_norm": 0.15935124456882477, + "learning_rate": 3.227425601304525e-05, + "loss": 3.9383, + "step": 79775 + }, + { + "epoch": 5.420573447479277, + "grad_norm": 0.1785641461610794, + "learning_rate": 3.227000951216198e-05, + "loss": 4.0224, + "step": 79780 + }, + { + "epoch": 5.420913167549939, + "grad_norm": 0.14069734513759613, + "learning_rate": 3.226576301127871e-05, + "loss": 3.8936, + "step": 79785 + }, + { + "epoch": 5.421252887620601, + "grad_norm": 0.18465681374073029, + "learning_rate": 3.226151651039544e-05, + "loss": 3.8492, + "step": 79790 + }, + { + "epoch": 5.421592607691262, + "grad_norm": 0.17206759750843048, + "learning_rate": 3.2257270009512165e-05, + "loss": 3.6319, + "step": 79795 + }, + { + "epoch": 5.421932327761924, + "grad_norm": 0.19365786015987396, + "learning_rate": 3.225302350862889e-05, + "loss": 3.6806, + "step": 79800 + }, + { + "epoch": 5.422272047832586, + "grad_norm": 0.14529888331890106, + "learning_rate": 3.224877700774562e-05, + "loss": 3.5864, + "step": 79805 + }, + { + "epoch": 5.4226117679032475, + "grad_norm": 0.1902463734149933, + "learning_rate": 3.224453050686235e-05, + "loss": 3.8178, + "step": 79810 + }, + { + "epoch": 5.42295148797391, + "grad_norm": 0.31129857897758484, + "learning_rate": 3.224028400597908e-05, + "loss": 4.0603, + "step": 79815 + }, + { + "epoch": 5.423291208044572, + "grad_norm": 0.1808587908744812, + "learning_rate": 3.22360375050958e-05, + "loss": 3.8859, + "step": 79820 + }, + { + "epoch": 5.423630928115233, + "grad_norm": 1.952811360359192, + "learning_rate": 3.223179100421253e-05, + "loss": 3.981, + "step": 79825 + }, + { + "epoch": 5.423970648185895, + "grad_norm": 0.19094648957252502, + "learning_rate": 3.222754450332926e-05, + "loss": 3.9608, + "step": 79830 + }, + { + "epoch": 5.424310368256557, + "grad_norm": 0.6742556691169739, + "learning_rate": 3.222329800244598e-05, + "loss": 3.9912, + "step": 79835 + }, + { + "epoch": 5.424650088327218, + "grad_norm": 0.1757609248161316, + "learning_rate": 3.221905150156272e-05, + "loss": 3.8236, + "step": 79840 + }, + { + "epoch": 5.42498980839788, + "grad_norm": 0.15642528235912323, + "learning_rate": 3.2214805000679445e-05, + "loss": 3.9653, + "step": 79845 + }, + { + "epoch": 5.425329528468542, + "grad_norm": 0.14554481208324432, + "learning_rate": 3.2210558499796166e-05, + "loss": 3.8089, + "step": 79850 + }, + { + "epoch": 5.4256692485392035, + "grad_norm": 0.17531916499137878, + "learning_rate": 3.2206311998912894e-05, + "loss": 3.9578, + "step": 79855 + }, + { + "epoch": 5.426008968609866, + "grad_norm": 0.26923689246177673, + "learning_rate": 3.220206549802963e-05, + "loss": 3.7463, + "step": 79860 + }, + { + "epoch": 5.426348688680527, + "grad_norm": 0.19851543009281158, + "learning_rate": 3.219781899714635e-05, + "loss": 3.9195, + "step": 79865 + }, + { + "epoch": 5.426688408751189, + "grad_norm": 0.18351586163043976, + "learning_rate": 3.219357249626308e-05, + "loss": 3.6161, + "step": 79870 + }, + { + "epoch": 5.427028128821851, + "grad_norm": 0.25163936614990234, + "learning_rate": 3.218932599537981e-05, + "loss": 3.7322, + "step": 79875 + }, + { + "epoch": 5.427367848892512, + "grad_norm": 0.1410733014345169, + "learning_rate": 3.2185079494496534e-05, + "loss": 3.627, + "step": 79880 + }, + { + "epoch": 5.427707568963174, + "grad_norm": 0.1434079259634018, + "learning_rate": 3.218083299361326e-05, + "loss": 3.5833, + "step": 79885 + }, + { + "epoch": 5.428047289033836, + "grad_norm": 0.1562288999557495, + "learning_rate": 3.217658649272999e-05, + "loss": 3.872, + "step": 79890 + }, + { + "epoch": 5.4283870091044975, + "grad_norm": 0.14615990221500397, + "learning_rate": 3.217233999184672e-05, + "loss": 3.9558, + "step": 79895 + }, + { + "epoch": 5.4287267291751595, + "grad_norm": 0.2261238694190979, + "learning_rate": 3.2168093490963446e-05, + "loss": 3.9404, + "step": 79900 + }, + { + "epoch": 5.429066449245822, + "grad_norm": 0.16859221458435059, + "learning_rate": 3.2163846990080174e-05, + "loss": 3.8115, + "step": 79905 + }, + { + "epoch": 5.429406169316483, + "grad_norm": 0.18288512527942657, + "learning_rate": 3.215960048919691e-05, + "loss": 3.8931, + "step": 79910 + }, + { + "epoch": 5.429745889387145, + "grad_norm": 0.1757325828075409, + "learning_rate": 3.215535398831363e-05, + "loss": 3.8668, + "step": 79915 + }, + { + "epoch": 5.430085609457807, + "grad_norm": 0.440574049949646, + "learning_rate": 3.215110748743036e-05, + "loss": 3.4643, + "step": 79920 + }, + { + "epoch": 5.430425329528468, + "grad_norm": 0.12507915496826172, + "learning_rate": 3.2146860986547086e-05, + "loss": 3.7752, + "step": 79925 + }, + { + "epoch": 5.43076504959913, + "grad_norm": 0.1801176518201828, + "learning_rate": 3.2142614485663814e-05, + "loss": 3.7914, + "step": 79930 + }, + { + "epoch": 5.431104769669792, + "grad_norm": 0.21496392786502838, + "learning_rate": 3.213836798478054e-05, + "loss": 3.5016, + "step": 79935 + }, + { + "epoch": 5.4314444897404535, + "grad_norm": 0.16571664810180664, + "learning_rate": 3.213412148389727e-05, + "loss": 3.9097, + "step": 79940 + }, + { + "epoch": 5.4317842098111155, + "grad_norm": 0.1481715887784958, + "learning_rate": 3.2129874983014e-05, + "loss": 3.8109, + "step": 79945 + }, + { + "epoch": 5.432123929881778, + "grad_norm": 0.16897155344486237, + "learning_rate": 3.2125628482130726e-05, + "loss": 3.7091, + "step": 79950 + }, + { + "epoch": 5.432463649952439, + "grad_norm": 0.1882983297109604, + "learning_rate": 3.2121381981247454e-05, + "loss": 3.731, + "step": 79955 + }, + { + "epoch": 5.432803370023101, + "grad_norm": 0.20038874447345734, + "learning_rate": 3.211713548036418e-05, + "loss": 3.811, + "step": 79960 + }, + { + "epoch": 5.433143090093763, + "grad_norm": 0.20720914006233215, + "learning_rate": 3.211288897948091e-05, + "loss": 3.8553, + "step": 79965 + }, + { + "epoch": 5.433482810164424, + "grad_norm": 0.1620280146598816, + "learning_rate": 3.210864247859764e-05, + "loss": 3.8508, + "step": 79970 + }, + { + "epoch": 5.433822530235086, + "grad_norm": 0.17471687495708466, + "learning_rate": 3.2104395977714366e-05, + "loss": 3.7658, + "step": 79975 + }, + { + "epoch": 5.434162250305748, + "grad_norm": 0.6942095756530762, + "learning_rate": 3.2100149476831094e-05, + "loss": 3.6696, + "step": 79980 + }, + { + "epoch": 5.4345019703764095, + "grad_norm": 0.18256282806396484, + "learning_rate": 3.209590297594782e-05, + "loss": 3.7144, + "step": 79985 + }, + { + "epoch": 5.4348416904470715, + "grad_norm": 0.16818535327911377, + "learning_rate": 3.2091656475064543e-05, + "loss": 3.9275, + "step": 79990 + }, + { + "epoch": 5.435181410517734, + "grad_norm": 0.14287760853767395, + "learning_rate": 3.208740997418128e-05, + "loss": 3.8864, + "step": 79995 + }, + { + "epoch": 5.435521130588395, + "grad_norm": 0.17204219102859497, + "learning_rate": 3.2083163473298006e-05, + "loss": 3.7915, + "step": 80000 + }, + { + "epoch": 5.435860850659057, + "grad_norm": 0.19254201650619507, + "learning_rate": 3.207891697241473e-05, + "loss": 3.835, + "step": 80005 + }, + { + "epoch": 5.436200570729719, + "grad_norm": 0.19496574997901917, + "learning_rate": 3.207467047153146e-05, + "loss": 3.8539, + "step": 80010 + }, + { + "epoch": 5.43654029080038, + "grad_norm": 0.16853144764900208, + "learning_rate": 3.207042397064819e-05, + "loss": 3.8108, + "step": 80015 + }, + { + "epoch": 5.436880010871042, + "grad_norm": 0.2735777199268341, + "learning_rate": 3.206617746976491e-05, + "loss": 3.9679, + "step": 80020 + }, + { + "epoch": 5.437219730941704, + "grad_norm": 0.16341255605220795, + "learning_rate": 3.206193096888164e-05, + "loss": 3.604, + "step": 80025 + }, + { + "epoch": 5.4375594510123655, + "grad_norm": 0.18927742540836334, + "learning_rate": 3.2057684467998374e-05, + "loss": 4.0655, + "step": 80030 + }, + { + "epoch": 5.437899171083028, + "grad_norm": 0.233808234333992, + "learning_rate": 3.2053437967115095e-05, + "loss": 3.9514, + "step": 80035 + }, + { + "epoch": 5.43823889115369, + "grad_norm": 0.15252526104450226, + "learning_rate": 3.2049191466231823e-05, + "loss": 3.8787, + "step": 80040 + }, + { + "epoch": 5.438578611224351, + "grad_norm": 0.1544608175754547, + "learning_rate": 3.204494496534856e-05, + "loss": 3.7043, + "step": 80045 + }, + { + "epoch": 5.438918331295013, + "grad_norm": 0.19545888900756836, + "learning_rate": 3.204069846446528e-05, + "loss": 3.9453, + "step": 80050 + }, + { + "epoch": 5.439258051365675, + "grad_norm": 0.1726359874010086, + "learning_rate": 3.203645196358201e-05, + "loss": 3.9088, + "step": 80055 + }, + { + "epoch": 5.439597771436336, + "grad_norm": 0.15515652298927307, + "learning_rate": 3.203220546269874e-05, + "loss": 3.704, + "step": 80060 + }, + { + "epoch": 5.439937491506998, + "grad_norm": 0.22499053180217743, + "learning_rate": 3.2027958961815464e-05, + "loss": 3.6527, + "step": 80065 + }, + { + "epoch": 5.44027721157766, + "grad_norm": 0.22473299503326416, + "learning_rate": 3.202371246093219e-05, + "loss": 3.6711, + "step": 80070 + }, + { + "epoch": 5.4406169316483215, + "grad_norm": 0.24064020812511444, + "learning_rate": 3.201946596004892e-05, + "loss": 3.7715, + "step": 80075 + }, + { + "epoch": 5.440956651718984, + "grad_norm": 0.17424243688583374, + "learning_rate": 3.2015219459165654e-05, + "loss": 3.8681, + "step": 80080 + }, + { + "epoch": 5.441296371789646, + "grad_norm": 0.15491236746311188, + "learning_rate": 3.2010972958282376e-05, + "loss": 3.9136, + "step": 80085 + }, + { + "epoch": 5.441636091860307, + "grad_norm": 0.1728084534406662, + "learning_rate": 3.2006726457399104e-05, + "loss": 3.6217, + "step": 80090 + }, + { + "epoch": 5.441975811930969, + "grad_norm": 0.18615740537643433, + "learning_rate": 3.200247995651584e-05, + "loss": 3.8735, + "step": 80095 + }, + { + "epoch": 5.442315532001631, + "grad_norm": 0.20120859146118164, + "learning_rate": 3.199823345563256e-05, + "loss": 3.808, + "step": 80100 + }, + { + "epoch": 5.442655252072292, + "grad_norm": 0.18454337120056152, + "learning_rate": 3.199398695474929e-05, + "loss": 3.8273, + "step": 80105 + }, + { + "epoch": 5.442994972142954, + "grad_norm": 0.3310857117176056, + "learning_rate": 3.1989740453866016e-05, + "loss": 3.7433, + "step": 80110 + }, + { + "epoch": 5.443334692213616, + "grad_norm": 0.15258854627609253, + "learning_rate": 3.1985493952982744e-05, + "loss": 4.0072, + "step": 80115 + }, + { + "epoch": 5.4436744122842775, + "grad_norm": 0.18185366690158844, + "learning_rate": 3.198124745209947e-05, + "loss": 3.7789, + "step": 80120 + }, + { + "epoch": 5.44401413235494, + "grad_norm": 0.17741073668003082, + "learning_rate": 3.19770009512162e-05, + "loss": 3.6676, + "step": 80125 + }, + { + "epoch": 5.444353852425602, + "grad_norm": 0.2524588704109192, + "learning_rate": 3.197275445033293e-05, + "loss": 4.1054, + "step": 80130 + }, + { + "epoch": 5.444693572496263, + "grad_norm": 0.1754857897758484, + "learning_rate": 3.1968507949449656e-05, + "loss": 3.7962, + "step": 80135 + }, + { + "epoch": 5.445033292566925, + "grad_norm": 0.18196798861026764, + "learning_rate": 3.1964261448566384e-05, + "loss": 4.0843, + "step": 80140 + }, + { + "epoch": 5.445373012637587, + "grad_norm": 2.244976758956909, + "learning_rate": 3.196001494768311e-05, + "loss": 4.1249, + "step": 80145 + }, + { + "epoch": 5.445712732708248, + "grad_norm": 0.19757023453712463, + "learning_rate": 3.195576844679984e-05, + "loss": 3.9621, + "step": 80150 + }, + { + "epoch": 5.44605245277891, + "grad_norm": 0.15122811496257782, + "learning_rate": 3.195152194591657e-05, + "loss": 3.7718, + "step": 80155 + }, + { + "epoch": 5.446392172849572, + "grad_norm": 0.16282089054584503, + "learning_rate": 3.1947275445033296e-05, + "loss": 3.6728, + "step": 80160 + }, + { + "epoch": 5.4467318929202335, + "grad_norm": 0.15849503874778748, + "learning_rate": 3.1943028944150024e-05, + "loss": 3.9872, + "step": 80165 + }, + { + "epoch": 5.447071612990896, + "grad_norm": 0.25671887397766113, + "learning_rate": 3.193878244326675e-05, + "loss": 3.8984, + "step": 80170 + }, + { + "epoch": 5.447411333061558, + "grad_norm": 3.349982738494873, + "learning_rate": 3.193453594238347e-05, + "loss": 3.9259, + "step": 80175 + }, + { + "epoch": 5.447751053132219, + "grad_norm": 0.21122919023036957, + "learning_rate": 3.193028944150021e-05, + "loss": 3.6265, + "step": 80180 + }, + { + "epoch": 5.448090773202881, + "grad_norm": 0.18742012977600098, + "learning_rate": 3.1926042940616936e-05, + "loss": 3.987, + "step": 80185 + }, + { + "epoch": 5.448430493273543, + "grad_norm": 0.2027900516986847, + "learning_rate": 3.192179643973366e-05, + "loss": 3.8175, + "step": 80190 + }, + { + "epoch": 5.448770213344204, + "grad_norm": 0.2156335413455963, + "learning_rate": 3.191754993885039e-05, + "loss": 3.793, + "step": 80195 + }, + { + "epoch": 5.449109933414866, + "grad_norm": 0.18028564751148224, + "learning_rate": 3.191330343796712e-05, + "loss": 3.9318, + "step": 80200 + }, + { + "epoch": 5.449449653485528, + "grad_norm": 0.14577333629131317, + "learning_rate": 3.190905693708384e-05, + "loss": 3.7363, + "step": 80205 + }, + { + "epoch": 5.4497893735561895, + "grad_norm": 0.19668522477149963, + "learning_rate": 3.190481043620057e-05, + "loss": 3.7785, + "step": 80210 + }, + { + "epoch": 5.450129093626852, + "grad_norm": 0.17533652484416962, + "learning_rate": 3.1900563935317304e-05, + "loss": 3.989, + "step": 80215 + }, + { + "epoch": 5.450468813697514, + "grad_norm": 0.16296793520450592, + "learning_rate": 3.1896317434434025e-05, + "loss": 3.636, + "step": 80220 + }, + { + "epoch": 5.450808533768175, + "grad_norm": 0.203862264752388, + "learning_rate": 3.189207093355075e-05, + "loss": 3.6943, + "step": 80225 + }, + { + "epoch": 5.451148253838837, + "grad_norm": 0.17958325147628784, + "learning_rate": 3.188782443266749e-05, + "loss": 3.7484, + "step": 80230 + }, + { + "epoch": 5.451487973909499, + "grad_norm": 0.1832636594772339, + "learning_rate": 3.188357793178421e-05, + "loss": 3.9248, + "step": 80235 + }, + { + "epoch": 5.45182769398016, + "grad_norm": 0.15195263922214508, + "learning_rate": 3.187933143090094e-05, + "loss": 3.7345, + "step": 80240 + }, + { + "epoch": 5.452167414050822, + "grad_norm": 0.17796608805656433, + "learning_rate": 3.1875084930017665e-05, + "loss": 3.821, + "step": 80245 + }, + { + "epoch": 5.452507134121484, + "grad_norm": 0.18232746422290802, + "learning_rate": 3.18708384291344e-05, + "loss": 3.8315, + "step": 80250 + }, + { + "epoch": 5.4528468541921455, + "grad_norm": 0.14148512482643127, + "learning_rate": 3.186659192825112e-05, + "loss": 4.0479, + "step": 80255 + }, + { + "epoch": 5.453186574262808, + "grad_norm": 0.21863386034965515, + "learning_rate": 3.186234542736785e-05, + "loss": 4.0363, + "step": 80260 + }, + { + "epoch": 5.45352629433347, + "grad_norm": 0.20086972415447235, + "learning_rate": 3.1858098926484584e-05, + "loss": 3.8127, + "step": 80265 + }, + { + "epoch": 5.453866014404131, + "grad_norm": 0.16098016500473022, + "learning_rate": 3.1853852425601305e-05, + "loss": 3.7917, + "step": 80270 + }, + { + "epoch": 5.454205734474793, + "grad_norm": 0.4044160544872284, + "learning_rate": 3.184960592471803e-05, + "loss": 3.7093, + "step": 80275 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.18842720985412598, + "learning_rate": 3.184535942383476e-05, + "loss": 3.8255, + "step": 80280 + }, + { + "epoch": 5.454885174616116, + "grad_norm": 0.21712076663970947, + "learning_rate": 3.184111292295149e-05, + "loss": 3.7695, + "step": 80285 + }, + { + "epoch": 5.455224894686778, + "grad_norm": 0.19547218084335327, + "learning_rate": 3.183686642206822e-05, + "loss": 3.7519, + "step": 80290 + }, + { + "epoch": 5.4555646147574395, + "grad_norm": 0.16262416541576385, + "learning_rate": 3.1832619921184945e-05, + "loss": 3.9818, + "step": 80295 + }, + { + "epoch": 5.4559043348281016, + "grad_norm": 0.16848327219486237, + "learning_rate": 3.182837342030167e-05, + "loss": 3.9231, + "step": 80300 + }, + { + "epoch": 5.456244054898764, + "grad_norm": 0.20311862230300903, + "learning_rate": 3.18241269194184e-05, + "loss": 3.8442, + "step": 80305 + }, + { + "epoch": 5.456583774969425, + "grad_norm": 0.1642778515815735, + "learning_rate": 3.181988041853513e-05, + "loss": 3.8853, + "step": 80310 + }, + { + "epoch": 5.456923495040087, + "grad_norm": 0.18602240085601807, + "learning_rate": 3.181563391765186e-05, + "loss": 3.7445, + "step": 80315 + }, + { + "epoch": 5.457263215110749, + "grad_norm": 0.17096027731895447, + "learning_rate": 3.1811387416768585e-05, + "loss": 3.7474, + "step": 80320 + }, + { + "epoch": 5.45760293518141, + "grad_norm": 0.2783992290496826, + "learning_rate": 3.180714091588531e-05, + "loss": 3.796, + "step": 80325 + }, + { + "epoch": 5.457942655252072, + "grad_norm": 5.738150596618652, + "learning_rate": 3.180289441500204e-05, + "loss": 3.996, + "step": 80330 + }, + { + "epoch": 5.458282375322734, + "grad_norm": 0.14110110700130463, + "learning_rate": 3.179864791411877e-05, + "loss": 4.0735, + "step": 80335 + }, + { + "epoch": 5.4586220953933955, + "grad_norm": 0.19594646990299225, + "learning_rate": 3.17944014132355e-05, + "loss": 3.8277, + "step": 80340 + }, + { + "epoch": 5.458961815464058, + "grad_norm": 0.3030344247817993, + "learning_rate": 3.179015491235222e-05, + "loss": 3.9137, + "step": 80345 + }, + { + "epoch": 5.45930153553472, + "grad_norm": 0.19026967883110046, + "learning_rate": 3.178590841146895e-05, + "loss": 3.7656, + "step": 80350 + }, + { + "epoch": 5.459641255605381, + "grad_norm": 0.1844765543937683, + "learning_rate": 3.178166191058568e-05, + "loss": 3.9215, + "step": 80355 + }, + { + "epoch": 5.459980975676043, + "grad_norm": 0.22024138271808624, + "learning_rate": 3.17774154097024e-05, + "loss": 3.6433, + "step": 80360 + }, + { + "epoch": 5.460320695746705, + "grad_norm": 0.2770407795906067, + "learning_rate": 3.177316890881914e-05, + "loss": 3.9491, + "step": 80365 + }, + { + "epoch": 5.460660415817366, + "grad_norm": 3.4200549125671387, + "learning_rate": 3.1768922407935865e-05, + "loss": 3.8915, + "step": 80370 + }, + { + "epoch": 5.461000135888028, + "grad_norm": 0.15833914279937744, + "learning_rate": 3.1764675907052586e-05, + "loss": 3.9398, + "step": 80375 + }, + { + "epoch": 5.46133985595869, + "grad_norm": 0.19402173161506653, + "learning_rate": 3.1760429406169314e-05, + "loss": 3.8327, + "step": 80380 + }, + { + "epoch": 5.4616795760293515, + "grad_norm": 0.159050852060318, + "learning_rate": 3.175618290528605e-05, + "loss": 3.7372, + "step": 80385 + }, + { + "epoch": 5.462019296100014, + "grad_norm": 0.2337605506181717, + "learning_rate": 3.175193640440277e-05, + "loss": 4.1105, + "step": 80390 + }, + { + "epoch": 5.462359016170676, + "grad_norm": 0.1710256189107895, + "learning_rate": 3.17476899035195e-05, + "loss": 3.8196, + "step": 80395 + }, + { + "epoch": 5.462698736241337, + "grad_norm": 0.17211690545082092, + "learning_rate": 3.174344340263623e-05, + "loss": 3.7917, + "step": 80400 + }, + { + "epoch": 5.463038456311999, + "grad_norm": 0.19851897656917572, + "learning_rate": 3.1739196901752954e-05, + "loss": 3.7375, + "step": 80405 + }, + { + "epoch": 5.463378176382661, + "grad_norm": 0.14509786665439606, + "learning_rate": 3.173495040086968e-05, + "loss": 3.8796, + "step": 80410 + }, + { + "epoch": 5.463717896453322, + "grad_norm": 0.20330168306827545, + "learning_rate": 3.173070389998641e-05, + "loss": 3.9754, + "step": 80415 + }, + { + "epoch": 5.464057616523984, + "grad_norm": 1.0605262517929077, + "learning_rate": 3.1726457399103145e-05, + "loss": 3.8706, + "step": 80420 + }, + { + "epoch": 5.464397336594646, + "grad_norm": 0.17508433759212494, + "learning_rate": 3.1722210898219867e-05, + "loss": 3.5774, + "step": 80425 + }, + { + "epoch": 5.4647370566653075, + "grad_norm": 0.19446636736392975, + "learning_rate": 3.1717964397336595e-05, + "loss": 3.8127, + "step": 80430 + }, + { + "epoch": 5.46507677673597, + "grad_norm": 0.1521492451429367, + "learning_rate": 3.171371789645333e-05, + "loss": 3.873, + "step": 80435 + }, + { + "epoch": 5.465416496806632, + "grad_norm": 0.15934795141220093, + "learning_rate": 3.170947139557005e-05, + "loss": 3.9512, + "step": 80440 + }, + { + "epoch": 5.465756216877293, + "grad_norm": 0.17766204476356506, + "learning_rate": 3.170522489468678e-05, + "loss": 4.1411, + "step": 80445 + }, + { + "epoch": 5.466095936947955, + "grad_norm": 0.1854570209980011, + "learning_rate": 3.1700978393803507e-05, + "loss": 4.1105, + "step": 80450 + }, + { + "epoch": 5.466435657018617, + "grad_norm": 0.20606279373168945, + "learning_rate": 3.1696731892920235e-05, + "loss": 3.8978, + "step": 80455 + }, + { + "epoch": 5.466775377089278, + "grad_norm": 0.16568805277347565, + "learning_rate": 3.169248539203696e-05, + "loss": 3.8842, + "step": 80460 + }, + { + "epoch": 5.46711509715994, + "grad_norm": 0.2605275511741638, + "learning_rate": 3.168823889115369e-05, + "loss": 4.177, + "step": 80465 + }, + { + "epoch": 5.467454817230602, + "grad_norm": 0.20496851205825806, + "learning_rate": 3.168399239027042e-05, + "loss": 3.7145, + "step": 80470 + }, + { + "epoch": 5.4677945373012635, + "grad_norm": 0.17611026763916016, + "learning_rate": 3.1679745889387147e-05, + "loss": 3.7645, + "step": 80475 + }, + { + "epoch": 5.468134257371926, + "grad_norm": 0.2045975774526596, + "learning_rate": 3.1675499388503875e-05, + "loss": 3.7455, + "step": 80480 + }, + { + "epoch": 5.468473977442588, + "grad_norm": 0.14766474068164825, + "learning_rate": 3.16712528876206e-05, + "loss": 3.8306, + "step": 80485 + }, + { + "epoch": 5.468813697513249, + "grad_norm": 0.1999780684709549, + "learning_rate": 3.166700638673733e-05, + "loss": 3.8284, + "step": 80490 + }, + { + "epoch": 5.469153417583911, + "grad_norm": 0.18146085739135742, + "learning_rate": 3.166275988585406e-05, + "loss": 3.6658, + "step": 80495 + }, + { + "epoch": 5.469493137654573, + "grad_norm": 0.1776416301727295, + "learning_rate": 3.165851338497079e-05, + "loss": 3.6554, + "step": 80500 + }, + { + "epoch": 5.469832857725234, + "grad_norm": 0.25652819871902466, + "learning_rate": 3.1654266884087515e-05, + "loss": 3.8548, + "step": 80505 + }, + { + "epoch": 5.470172577795896, + "grad_norm": 0.16607581079006195, + "learning_rate": 3.165002038320424e-05, + "loss": 3.9902, + "step": 80510 + }, + { + "epoch": 5.470512297866558, + "grad_norm": 0.20577920973300934, + "learning_rate": 3.1645773882320964e-05, + "loss": 3.9268, + "step": 80515 + }, + { + "epoch": 5.4708520179372195, + "grad_norm": 0.2793298363685608, + "learning_rate": 3.16415273814377e-05, + "loss": 3.611, + "step": 80520 + }, + { + "epoch": 5.471191738007882, + "grad_norm": 0.27206525206565857, + "learning_rate": 3.163728088055443e-05, + "loss": 3.7059, + "step": 80525 + }, + { + "epoch": 5.471531458078544, + "grad_norm": 0.2917449176311493, + "learning_rate": 3.163303437967115e-05, + "loss": 3.8433, + "step": 80530 + }, + { + "epoch": 5.471871178149205, + "grad_norm": 0.12605543434619904, + "learning_rate": 3.162878787878788e-05, + "loss": 3.6829, + "step": 80535 + }, + { + "epoch": 5.472210898219867, + "grad_norm": 0.22873009741306305, + "learning_rate": 3.162454137790461e-05, + "loss": 3.8588, + "step": 80540 + }, + { + "epoch": 5.472550618290528, + "grad_norm": 0.21775925159454346, + "learning_rate": 3.162029487702133e-05, + "loss": 4.0448, + "step": 80545 + }, + { + "epoch": 5.47289033836119, + "grad_norm": 0.19571734964847565, + "learning_rate": 3.161604837613806e-05, + "loss": 3.8019, + "step": 80550 + }, + { + "epoch": 5.473230058431852, + "grad_norm": 0.5738860964775085, + "learning_rate": 3.1611801875254795e-05, + "loss": 3.8706, + "step": 80555 + }, + { + "epoch": 5.4735697785025135, + "grad_norm": 0.23034265637397766, + "learning_rate": 3.1607555374371516e-05, + "loss": 3.7578, + "step": 80560 + }, + { + "epoch": 5.4739094985731755, + "grad_norm": 0.20841403305530548, + "learning_rate": 3.1603308873488244e-05, + "loss": 3.6396, + "step": 80565 + }, + { + "epoch": 5.474249218643838, + "grad_norm": 0.13665440678596497, + "learning_rate": 3.159906237260498e-05, + "loss": 3.8998, + "step": 80570 + }, + { + "epoch": 5.474588938714499, + "grad_norm": 0.17599818110466003, + "learning_rate": 3.15948158717217e-05, + "loss": 3.7678, + "step": 80575 + }, + { + "epoch": 5.474928658785161, + "grad_norm": 0.27249014377593994, + "learning_rate": 3.159056937083843e-05, + "loss": 4.0133, + "step": 80580 + }, + { + "epoch": 5.475268378855823, + "grad_norm": 0.20820753276348114, + "learning_rate": 3.158632286995516e-05, + "loss": 3.9934, + "step": 80585 + }, + { + "epoch": 5.475608098926484, + "grad_norm": 0.16732287406921387, + "learning_rate": 3.158207636907189e-05, + "loss": 3.9887, + "step": 80590 + }, + { + "epoch": 5.475947818997146, + "grad_norm": 0.21850113570690155, + "learning_rate": 3.157782986818861e-05, + "loss": 3.9096, + "step": 80595 + }, + { + "epoch": 5.476287539067808, + "grad_norm": 0.16365312039852142, + "learning_rate": 3.157358336730534e-05, + "loss": 3.9461, + "step": 80600 + }, + { + "epoch": 5.4766272591384695, + "grad_norm": 0.17188577353954315, + "learning_rate": 3.1569336866422075e-05, + "loss": 3.6406, + "step": 80605 + }, + { + "epoch": 5.476966979209132, + "grad_norm": 0.44409143924713135, + "learning_rate": 3.1565090365538796e-05, + "loss": 3.6656, + "step": 80610 + }, + { + "epoch": 5.477306699279794, + "grad_norm": 0.2037411779165268, + "learning_rate": 3.1560843864655524e-05, + "loss": 4.0875, + "step": 80615 + }, + { + "epoch": 5.477646419350455, + "grad_norm": 0.13998043537139893, + "learning_rate": 3.155659736377226e-05, + "loss": 4.062, + "step": 80620 + }, + { + "epoch": 5.477986139421117, + "grad_norm": 0.30469024181365967, + "learning_rate": 3.155235086288898e-05, + "loss": 4.0343, + "step": 80625 + }, + { + "epoch": 5.478325859491779, + "grad_norm": 0.22612877190113068, + "learning_rate": 3.154810436200571e-05, + "loss": 3.7391, + "step": 80630 + }, + { + "epoch": 5.47866557956244, + "grad_norm": 0.7040181159973145, + "learning_rate": 3.1543857861122436e-05, + "loss": 3.7289, + "step": 80635 + }, + { + "epoch": 5.479005299633102, + "grad_norm": 0.1633150726556778, + "learning_rate": 3.1539611360239164e-05, + "loss": 3.99, + "step": 80640 + }, + { + "epoch": 5.479345019703764, + "grad_norm": 0.16932553052902222, + "learning_rate": 3.153536485935589e-05, + "loss": 3.7355, + "step": 80645 + }, + { + "epoch": 5.4796847397744255, + "grad_norm": 0.1400771588087082, + "learning_rate": 3.153111835847262e-05, + "loss": 3.8145, + "step": 80650 + }, + { + "epoch": 5.480024459845088, + "grad_norm": 0.18197770416736603, + "learning_rate": 3.152687185758935e-05, + "loss": 3.9, + "step": 80655 + }, + { + "epoch": 5.48036417991575, + "grad_norm": 0.3232249319553375, + "learning_rate": 3.1522625356706076e-05, + "loss": 3.714, + "step": 80660 + }, + { + "epoch": 5.480703899986411, + "grad_norm": 0.4196282625198364, + "learning_rate": 3.1518378855822804e-05, + "loss": 3.6702, + "step": 80665 + }, + { + "epoch": 5.481043620057073, + "grad_norm": 0.1960006058216095, + "learning_rate": 3.151413235493953e-05, + "loss": 3.8017, + "step": 80670 + }, + { + "epoch": 5.481383340127735, + "grad_norm": 0.189263716340065, + "learning_rate": 3.150988585405626e-05, + "loss": 4.0253, + "step": 80675 + }, + { + "epoch": 5.481723060198396, + "grad_norm": 0.6533451080322266, + "learning_rate": 3.150563935317299e-05, + "loss": 3.8338, + "step": 80680 + }, + { + "epoch": 5.482062780269058, + "grad_norm": 0.20143435895442963, + "learning_rate": 3.1501392852289716e-05, + "loss": 3.9964, + "step": 80685 + }, + { + "epoch": 5.48240250033972, + "grad_norm": 0.2255447953939438, + "learning_rate": 3.1497146351406444e-05, + "loss": 3.7345, + "step": 80690 + }, + { + "epoch": 5.4827422204103815, + "grad_norm": 0.2231578677892685, + "learning_rate": 3.149289985052317e-05, + "loss": 4.0502, + "step": 80695 + }, + { + "epoch": 5.483081940481044, + "grad_norm": 1.7166736125946045, + "learning_rate": 3.1488653349639893e-05, + "loss": 3.8812, + "step": 80700 + }, + { + "epoch": 5.483421660551706, + "grad_norm": 0.14742223918437958, + "learning_rate": 3.148440684875663e-05, + "loss": 3.5876, + "step": 80705 + }, + { + "epoch": 5.483761380622367, + "grad_norm": 0.15072737634181976, + "learning_rate": 3.1480160347873356e-05, + "loss": 3.8128, + "step": 80710 + }, + { + "epoch": 5.484101100693029, + "grad_norm": 0.15742193162441254, + "learning_rate": 3.147591384699008e-05, + "loss": 3.8521, + "step": 80715 + }, + { + "epoch": 5.484440820763691, + "grad_norm": 0.19703374803066254, + "learning_rate": 3.147166734610681e-05, + "loss": 4.0858, + "step": 80720 + }, + { + "epoch": 5.484780540834352, + "grad_norm": 1.1392018795013428, + "learning_rate": 3.146742084522354e-05, + "loss": 3.6741, + "step": 80725 + }, + { + "epoch": 5.485120260905014, + "grad_norm": 0.15538303554058075, + "learning_rate": 3.146317434434026e-05, + "loss": 4.0528, + "step": 80730 + }, + { + "epoch": 5.485459980975676, + "grad_norm": 0.24474163353443146, + "learning_rate": 3.145892784345699e-05, + "loss": 4.0505, + "step": 80735 + }, + { + "epoch": 5.4857997010463375, + "grad_norm": 0.14897383749485016, + "learning_rate": 3.1454681342573724e-05, + "loss": 3.7517, + "step": 80740 + }, + { + "epoch": 5.486139421117, + "grad_norm": 0.16642116010189056, + "learning_rate": 3.1450434841690445e-05, + "loss": 3.6637, + "step": 80745 + }, + { + "epoch": 5.486479141187662, + "grad_norm": 0.15768806636333466, + "learning_rate": 3.1446188340807173e-05, + "loss": 3.979, + "step": 80750 + }, + { + "epoch": 5.486818861258323, + "grad_norm": 0.3567536473274231, + "learning_rate": 3.144194183992391e-05, + "loss": 3.8633, + "step": 80755 + }, + { + "epoch": 5.487158581328985, + "grad_norm": 0.16210481524467468, + "learning_rate": 3.1437695339040636e-05, + "loss": 3.6152, + "step": 80760 + }, + { + "epoch": 5.487498301399647, + "grad_norm": 0.16385731101036072, + "learning_rate": 3.143344883815736e-05, + "loss": 3.5995, + "step": 80765 + }, + { + "epoch": 5.487838021470308, + "grad_norm": 0.17812654376029968, + "learning_rate": 3.1429202337274086e-05, + "loss": 4.0577, + "step": 80770 + }, + { + "epoch": 5.48817774154097, + "grad_norm": 0.15438498556613922, + "learning_rate": 3.142495583639082e-05, + "loss": 3.4085, + "step": 80775 + }, + { + "epoch": 5.488517461611632, + "grad_norm": 0.25050851702690125, + "learning_rate": 3.142070933550754e-05, + "loss": 3.7586, + "step": 80780 + }, + { + "epoch": 5.4888571816822935, + "grad_norm": 0.17735537886619568, + "learning_rate": 3.141646283462427e-05, + "loss": 3.8071, + "step": 80785 + }, + { + "epoch": 5.489196901752956, + "grad_norm": 0.15045498311519623, + "learning_rate": 3.1412216333741004e-05, + "loss": 3.6724, + "step": 80790 + }, + { + "epoch": 5.489536621823618, + "grad_norm": 0.13702377676963806, + "learning_rate": 3.1407969832857726e-05, + "loss": 3.8198, + "step": 80795 + }, + { + "epoch": 5.489876341894279, + "grad_norm": 1.1740434169769287, + "learning_rate": 3.1403723331974454e-05, + "loss": 3.6681, + "step": 80800 + }, + { + "epoch": 5.490216061964941, + "grad_norm": 0.16066433489322662, + "learning_rate": 3.139947683109118e-05, + "loss": 3.8387, + "step": 80805 + }, + { + "epoch": 5.490555782035603, + "grad_norm": 0.23656095564365387, + "learning_rate": 3.139523033020791e-05, + "loss": 4.0285, + "step": 80810 + }, + { + "epoch": 5.490895502106264, + "grad_norm": 0.15252859890460968, + "learning_rate": 3.139098382932464e-05, + "loss": 4.0288, + "step": 80815 + }, + { + "epoch": 5.491235222176926, + "grad_norm": 0.21970215439796448, + "learning_rate": 3.1386737328441366e-05, + "loss": 3.8284, + "step": 80820 + }, + { + "epoch": 5.491574942247588, + "grad_norm": 0.18072745203971863, + "learning_rate": 3.1382490827558094e-05, + "loss": 3.8502, + "step": 80825 + }, + { + "epoch": 5.4919146623182495, + "grad_norm": 0.25675755739212036, + "learning_rate": 3.137824432667482e-05, + "loss": 3.8885, + "step": 80830 + }, + { + "epoch": 5.492254382388912, + "grad_norm": 0.18794545531272888, + "learning_rate": 3.137399782579155e-05, + "loss": 3.7486, + "step": 80835 + }, + { + "epoch": 5.492594102459574, + "grad_norm": 0.17575719952583313, + "learning_rate": 3.136975132490828e-05, + "loss": 4.0185, + "step": 80840 + }, + { + "epoch": 5.492933822530235, + "grad_norm": 0.2574876546859741, + "learning_rate": 3.1365504824025006e-05, + "loss": 3.5145, + "step": 80845 + }, + { + "epoch": 5.493273542600897, + "grad_norm": 0.14948774874210358, + "learning_rate": 3.1361258323141734e-05, + "loss": 3.8628, + "step": 80850 + }, + { + "epoch": 5.493613262671559, + "grad_norm": 0.17718417942523956, + "learning_rate": 3.135701182225846e-05, + "loss": 3.7031, + "step": 80855 + }, + { + "epoch": 5.49395298274222, + "grad_norm": 0.1631971299648285, + "learning_rate": 3.135276532137519e-05, + "loss": 4.0606, + "step": 80860 + }, + { + "epoch": 5.494292702812882, + "grad_norm": 0.23127147555351257, + "learning_rate": 3.134851882049192e-05, + "loss": 3.7888, + "step": 80865 + }, + { + "epoch": 5.494632422883544, + "grad_norm": 0.18816682696342468, + "learning_rate": 3.134427231960864e-05, + "loss": 4.0296, + "step": 80870 + }, + { + "epoch": 5.4949721429542056, + "grad_norm": 0.2587229907512665, + "learning_rate": 3.1340025818725374e-05, + "loss": 3.8468, + "step": 80875 + }, + { + "epoch": 5.495311863024868, + "grad_norm": 0.4172057509422302, + "learning_rate": 3.13357793178421e-05, + "loss": 4.0353, + "step": 80880 + }, + { + "epoch": 5.49565158309553, + "grad_norm": 0.15734660625457764, + "learning_rate": 3.133153281695882e-05, + "loss": 3.702, + "step": 80885 + }, + { + "epoch": 5.495991303166191, + "grad_norm": 0.17020264267921448, + "learning_rate": 3.132728631607556e-05, + "loss": 3.8067, + "step": 80890 + }, + { + "epoch": 5.496331023236853, + "grad_norm": 1.1143834590911865, + "learning_rate": 3.1323039815192286e-05, + "loss": 3.7573, + "step": 80895 + }, + { + "epoch": 5.496670743307515, + "grad_norm": 0.24211202561855316, + "learning_rate": 3.131879331430901e-05, + "loss": 3.583, + "step": 80900 + }, + { + "epoch": 5.497010463378176, + "grad_norm": 0.19653604924678802, + "learning_rate": 3.1314546813425735e-05, + "loss": 4.0393, + "step": 80905 + }, + { + "epoch": 5.497350183448838, + "grad_norm": 0.17225325107574463, + "learning_rate": 3.131030031254247e-05, + "loss": 3.7396, + "step": 80910 + }, + { + "epoch": 5.4976899035195, + "grad_norm": 0.1542322188615799, + "learning_rate": 3.130605381165919e-05, + "loss": 4.0604, + "step": 80915 + }, + { + "epoch": 5.498029623590162, + "grad_norm": 0.15241634845733643, + "learning_rate": 3.130180731077592e-05, + "loss": 3.8104, + "step": 80920 + }, + { + "epoch": 5.498369343660824, + "grad_norm": 0.1886514276266098, + "learning_rate": 3.1297560809892654e-05, + "loss": 3.8669, + "step": 80925 + }, + { + "epoch": 5.498709063731486, + "grad_norm": 0.2630189061164856, + "learning_rate": 3.129331430900938e-05, + "loss": 4.1063, + "step": 80930 + }, + { + "epoch": 5.499048783802147, + "grad_norm": 0.18863964080810547, + "learning_rate": 3.12890678081261e-05, + "loss": 3.9553, + "step": 80935 + }, + { + "epoch": 5.499388503872809, + "grad_norm": 0.13631416857242584, + "learning_rate": 3.128482130724283e-05, + "loss": 3.6676, + "step": 80940 + }, + { + "epoch": 5.499728223943471, + "grad_norm": 0.14183717966079712, + "learning_rate": 3.1280574806359566e-05, + "loss": 3.823, + "step": 80945 + }, + { + "epoch": 5.500067944014132, + "grad_norm": 0.1690707802772522, + "learning_rate": 3.127632830547629e-05, + "loss": 3.7544, + "step": 80950 + }, + { + "epoch": 5.500407664084794, + "grad_norm": 0.20826919376850128, + "learning_rate": 3.1272081804593015e-05, + "loss": 3.8669, + "step": 80955 + }, + { + "epoch": 5.500747384155456, + "grad_norm": 0.21441827714443207, + "learning_rate": 3.126783530370975e-05, + "loss": 3.9291, + "step": 80960 + }, + { + "epoch": 5.501087104226118, + "grad_norm": 0.1538984179496765, + "learning_rate": 3.126358880282647e-05, + "loss": 3.5571, + "step": 80965 + }, + { + "epoch": 5.50142682429678, + "grad_norm": 0.15257692337036133, + "learning_rate": 3.12593423019432e-05, + "loss": 3.8629, + "step": 80970 + }, + { + "epoch": 5.501766544367442, + "grad_norm": 0.1553221493959427, + "learning_rate": 3.125509580105993e-05, + "loss": 3.8566, + "step": 80975 + }, + { + "epoch": 5.502106264438103, + "grad_norm": 0.1922660768032074, + "learning_rate": 3.1250849300176655e-05, + "loss": 3.9806, + "step": 80980 + }, + { + "epoch": 5.502445984508765, + "grad_norm": 0.15935853123664856, + "learning_rate": 3.124660279929338e-05, + "loss": 3.7796, + "step": 80985 + }, + { + "epoch": 5.502785704579426, + "grad_norm": 0.2078651636838913, + "learning_rate": 3.124235629841011e-05, + "loss": 3.6454, + "step": 80990 + }, + { + "epoch": 5.503125424650088, + "grad_norm": 0.1705245077610016, + "learning_rate": 3.123810979752684e-05, + "loss": 3.4608, + "step": 80995 + }, + { + "epoch": 5.50346514472075, + "grad_norm": 0.15975774824619293, + "learning_rate": 3.123386329664357e-05, + "loss": 3.8249, + "step": 81000 + }, + { + "epoch": 5.5038048647914115, + "grad_norm": 0.20426993072032928, + "learning_rate": 3.1229616795760295e-05, + "loss": 3.8439, + "step": 81005 + }, + { + "epoch": 5.504144584862074, + "grad_norm": 0.15694975852966309, + "learning_rate": 3.122537029487702e-05, + "loss": 3.707, + "step": 81010 + }, + { + "epoch": 5.504484304932736, + "grad_norm": 0.17525538802146912, + "learning_rate": 3.122112379399375e-05, + "loss": 3.8502, + "step": 81015 + }, + { + "epoch": 5.504824025003397, + "grad_norm": 1.9521129131317139, + "learning_rate": 3.121687729311048e-05, + "loss": 3.8954, + "step": 81020 + }, + { + "epoch": 5.505163745074059, + "grad_norm": 0.1687362790107727, + "learning_rate": 3.121263079222721e-05, + "loss": 3.6653, + "step": 81025 + }, + { + "epoch": 5.505503465144721, + "grad_norm": 0.16366982460021973, + "learning_rate": 3.1208384291343935e-05, + "loss": 3.6813, + "step": 81030 + }, + { + "epoch": 5.505843185215382, + "grad_norm": 0.16483892500400543, + "learning_rate": 3.120413779046066e-05, + "loss": 3.7352, + "step": 81035 + }, + { + "epoch": 5.506182905286044, + "grad_norm": 0.21966113150119781, + "learning_rate": 3.1199891289577384e-05, + "loss": 3.837, + "step": 81040 + }, + { + "epoch": 5.506522625356706, + "grad_norm": 0.18512950837612152, + "learning_rate": 3.119564478869412e-05, + "loss": 3.8937, + "step": 81045 + }, + { + "epoch": 5.5068623454273675, + "grad_norm": 0.13986870646476746, + "learning_rate": 3.119139828781085e-05, + "loss": 3.7506, + "step": 81050 + }, + { + "epoch": 5.50720206549803, + "grad_norm": 0.1850927472114563, + "learning_rate": 3.118715178692757e-05, + "loss": 4.04, + "step": 81055 + }, + { + "epoch": 5.507541785568692, + "grad_norm": 0.19160546362400055, + "learning_rate": 3.11829052860443e-05, + "loss": 3.767, + "step": 81060 + }, + { + "epoch": 5.507881505639353, + "grad_norm": 0.1426953822374344, + "learning_rate": 3.117865878516103e-05, + "loss": 3.8478, + "step": 81065 + }, + { + "epoch": 5.508221225710015, + "grad_norm": 0.17485101521015167, + "learning_rate": 3.117441228427775e-05, + "loss": 3.8383, + "step": 81070 + }, + { + "epoch": 5.508560945780677, + "grad_norm": 0.19775567948818207, + "learning_rate": 3.117016578339448e-05, + "loss": 3.7595, + "step": 81075 + }, + { + "epoch": 5.508900665851338, + "grad_norm": 0.1870385855436325, + "learning_rate": 3.1165919282511215e-05, + "loss": 3.6871, + "step": 81080 + }, + { + "epoch": 5.509240385922, + "grad_norm": 0.19936835765838623, + "learning_rate": 3.1161672781627936e-05, + "loss": 3.8241, + "step": 81085 + }, + { + "epoch": 5.509580105992662, + "grad_norm": 0.1494005173444748, + "learning_rate": 3.1157426280744664e-05, + "loss": 3.7629, + "step": 81090 + }, + { + "epoch": 5.5099198260633235, + "grad_norm": 0.17371711134910583, + "learning_rate": 3.11531797798614e-05, + "loss": 3.8641, + "step": 81095 + }, + { + "epoch": 5.510259546133986, + "grad_norm": 0.18645983934402466, + "learning_rate": 3.114893327897812e-05, + "loss": 3.935, + "step": 81100 + }, + { + "epoch": 5.510599266204648, + "grad_norm": 0.23737789690494537, + "learning_rate": 3.114468677809485e-05, + "loss": 3.9923, + "step": 81105 + }, + { + "epoch": 5.510938986275309, + "grad_norm": 0.21275632083415985, + "learning_rate": 3.114044027721158e-05, + "loss": 3.9569, + "step": 81110 + }, + { + "epoch": 5.511278706345971, + "grad_norm": 0.21458417177200317, + "learning_rate": 3.113619377632831e-05, + "loss": 3.677, + "step": 81115 + }, + { + "epoch": 5.511618426416633, + "grad_norm": 0.16218794882297516, + "learning_rate": 3.113194727544503e-05, + "loss": 3.8474, + "step": 81120 + }, + { + "epoch": 5.511958146487294, + "grad_norm": 5.7459917068481445, + "learning_rate": 3.112770077456176e-05, + "loss": 3.5994, + "step": 81125 + }, + { + "epoch": 5.512297866557956, + "grad_norm": 0.1870754063129425, + "learning_rate": 3.1123454273678495e-05, + "loss": 3.8584, + "step": 81130 + }, + { + "epoch": 5.512637586628618, + "grad_norm": 0.16010788083076477, + "learning_rate": 3.1119207772795217e-05, + "loss": 3.9633, + "step": 81135 + }, + { + "epoch": 5.5129773066992795, + "grad_norm": 0.15126349031925201, + "learning_rate": 3.1114961271911945e-05, + "loss": 3.7363, + "step": 81140 + }, + { + "epoch": 5.513317026769942, + "grad_norm": 0.22179435193538666, + "learning_rate": 3.111071477102868e-05, + "loss": 3.9357, + "step": 81145 + }, + { + "epoch": 5.513656746840604, + "grad_norm": 0.19462722539901733, + "learning_rate": 3.11064682701454e-05, + "loss": 3.9669, + "step": 81150 + }, + { + "epoch": 5.513996466911265, + "grad_norm": 0.1441238671541214, + "learning_rate": 3.110222176926213e-05, + "loss": 3.6688, + "step": 81155 + }, + { + "epoch": 5.514336186981927, + "grad_norm": 0.19776692986488342, + "learning_rate": 3.1097975268378857e-05, + "loss": 4.0816, + "step": 81160 + }, + { + "epoch": 5.514675907052589, + "grad_norm": 1.7075257301330566, + "learning_rate": 3.1093728767495585e-05, + "loss": 3.9222, + "step": 81165 + }, + { + "epoch": 5.51501562712325, + "grad_norm": 0.20068126916885376, + "learning_rate": 3.108948226661231e-05, + "loss": 3.8994, + "step": 81170 + }, + { + "epoch": 5.515355347193912, + "grad_norm": 0.19757117331027985, + "learning_rate": 3.108523576572904e-05, + "loss": 3.7662, + "step": 81175 + }, + { + "epoch": 5.515695067264574, + "grad_norm": 0.281073659658432, + "learning_rate": 3.108098926484577e-05, + "loss": 3.8071, + "step": 81180 + }, + { + "epoch": 5.516034787335236, + "grad_norm": 0.20165108144283295, + "learning_rate": 3.1076742763962497e-05, + "loss": 4.0068, + "step": 81185 + }, + { + "epoch": 5.516374507405898, + "grad_norm": 0.26076292991638184, + "learning_rate": 3.1072496263079225e-05, + "loss": 3.9409, + "step": 81190 + }, + { + "epoch": 5.516714227476559, + "grad_norm": 0.29394155740737915, + "learning_rate": 3.106824976219595e-05, + "loss": 3.8163, + "step": 81195 + }, + { + "epoch": 5.517053947547221, + "grad_norm": 0.1749008148908615, + "learning_rate": 3.106400326131268e-05, + "loss": 3.8717, + "step": 81200 + }, + { + "epoch": 5.517393667617883, + "grad_norm": 0.24170446395874023, + "learning_rate": 3.105975676042941e-05, + "loss": 3.7893, + "step": 81205 + }, + { + "epoch": 5.517733387688544, + "grad_norm": 0.194504514336586, + "learning_rate": 3.105551025954614e-05, + "loss": 4.0527, + "step": 81210 + }, + { + "epoch": 5.518073107759206, + "grad_norm": 0.1496128886938095, + "learning_rate": 3.1051263758662865e-05, + "loss": 3.9199, + "step": 81215 + }, + { + "epoch": 5.518412827829868, + "grad_norm": 0.16189271211624146, + "learning_rate": 3.104701725777959e-05, + "loss": 3.9695, + "step": 81220 + }, + { + "epoch": 5.5187525479005295, + "grad_norm": 0.16774708032608032, + "learning_rate": 3.1042770756896314e-05, + "loss": 4.0464, + "step": 81225 + }, + { + "epoch": 5.519092267971192, + "grad_norm": 0.28198930621147156, + "learning_rate": 3.103852425601305e-05, + "loss": 3.7001, + "step": 81230 + }, + { + "epoch": 5.519431988041854, + "grad_norm": 0.16410338878631592, + "learning_rate": 3.103427775512978e-05, + "loss": 4.057, + "step": 81235 + }, + { + "epoch": 5.519771708112515, + "grad_norm": 0.20193526148796082, + "learning_rate": 3.10300312542465e-05, + "loss": 3.9305, + "step": 81240 + }, + { + "epoch": 5.520111428183177, + "grad_norm": 0.1662224978208542, + "learning_rate": 3.102578475336323e-05, + "loss": 4.0017, + "step": 81245 + }, + { + "epoch": 5.520451148253839, + "grad_norm": 0.2576456367969513, + "learning_rate": 3.102153825247996e-05, + "loss": 4.1293, + "step": 81250 + }, + { + "epoch": 5.5207908683245, + "grad_norm": 0.16784271597862244, + "learning_rate": 3.101729175159668e-05, + "loss": 3.8035, + "step": 81255 + }, + { + "epoch": 5.521130588395162, + "grad_norm": 0.180461585521698, + "learning_rate": 3.101304525071341e-05, + "loss": 4.0609, + "step": 81260 + }, + { + "epoch": 5.521470308465824, + "grad_norm": 0.7546388506889343, + "learning_rate": 3.1008798749830145e-05, + "loss": 3.7785, + "step": 81265 + }, + { + "epoch": 5.5218100285364855, + "grad_norm": 0.21349039673805237, + "learning_rate": 3.1004552248946866e-05, + "loss": 3.7066, + "step": 81270 + }, + { + "epoch": 5.522149748607148, + "grad_norm": 0.7678186297416687, + "learning_rate": 3.1000305748063594e-05, + "loss": 3.8051, + "step": 81275 + }, + { + "epoch": 5.52248946867781, + "grad_norm": 0.17874088883399963, + "learning_rate": 3.099605924718033e-05, + "loss": 3.994, + "step": 81280 + }, + { + "epoch": 5.522829188748471, + "grad_norm": 0.3185825049877167, + "learning_rate": 3.099181274629706e-05, + "loss": 3.98, + "step": 81285 + }, + { + "epoch": 5.523168908819133, + "grad_norm": 0.2035703808069229, + "learning_rate": 3.098756624541378e-05, + "loss": 4.0174, + "step": 81290 + }, + { + "epoch": 5.523508628889795, + "grad_norm": 0.5630665421485901, + "learning_rate": 3.0983319744530506e-05, + "loss": 3.7441, + "step": 81295 + }, + { + "epoch": 5.523848348960456, + "grad_norm": NaN, + "learning_rate": 3.097992254382389e-05, + "loss": 3.9575, + "step": 81300 + }, + { + "epoch": 5.524188069031118, + "grad_norm": 0.14095546305179596, + "learning_rate": 3.0975676042940616e-05, + "loss": 3.8683, + "step": 81305 + }, + { + "epoch": 5.52452778910178, + "grad_norm": 0.19644500315189362, + "learning_rate": 3.0971429542057344e-05, + "loss": 3.9485, + "step": 81310 + }, + { + "epoch": 5.5248675091724415, + "grad_norm": 0.24307231605052948, + "learning_rate": 3.096718304117407e-05, + "loss": 3.7183, + "step": 81315 + }, + { + "epoch": 5.525207229243104, + "grad_norm": 0.18433475494384766, + "learning_rate": 3.096293654029081e-05, + "loss": 3.6027, + "step": 81320 + }, + { + "epoch": 5.525546949313766, + "grad_norm": 0.16512466967105865, + "learning_rate": 3.095869003940753e-05, + "loss": 3.7465, + "step": 81325 + }, + { + "epoch": 5.525886669384427, + "grad_norm": 0.1537017524242401, + "learning_rate": 3.0954443538524256e-05, + "loss": 3.7673, + "step": 81330 + }, + { + "epoch": 5.526226389455089, + "grad_norm": 0.16838300228118896, + "learning_rate": 3.0950197037640984e-05, + "loss": 4.1028, + "step": 81335 + }, + { + "epoch": 5.526566109525751, + "grad_norm": 0.15104937553405762, + "learning_rate": 3.094595053675771e-05, + "loss": 3.9762, + "step": 81340 + }, + { + "epoch": 5.526905829596412, + "grad_norm": 0.19699615240097046, + "learning_rate": 3.094170403587444e-05, + "loss": 3.8726, + "step": 81345 + }, + { + "epoch": 5.527245549667074, + "grad_norm": 0.1671372354030609, + "learning_rate": 3.093745753499117e-05, + "loss": 3.9689, + "step": 81350 + }, + { + "epoch": 5.527585269737736, + "grad_norm": 0.1653023660182953, + "learning_rate": 3.0933211034107896e-05, + "loss": 3.9595, + "step": 81355 + }, + { + "epoch": 5.5279249898083975, + "grad_norm": 0.16357927024364471, + "learning_rate": 3.0928964533224624e-05, + "loss": 3.6786, + "step": 81360 + }, + { + "epoch": 5.52826470987906, + "grad_norm": 0.16529305279254913, + "learning_rate": 3.092471803234135e-05, + "loss": 3.8441, + "step": 81365 + }, + { + "epoch": 5.528604429949722, + "grad_norm": 0.173111692070961, + "learning_rate": 3.092047153145808e-05, + "loss": 3.6613, + "step": 81370 + }, + { + "epoch": 5.528944150020383, + "grad_norm": 0.19385845959186554, + "learning_rate": 3.091622503057481e-05, + "loss": 3.7555, + "step": 81375 + }, + { + "epoch": 5.529283870091045, + "grad_norm": 0.18106134235858917, + "learning_rate": 3.0911978529691537e-05, + "loss": 3.7973, + "step": 81380 + }, + { + "epoch": 5.529623590161707, + "grad_norm": 0.3330727517604828, + "learning_rate": 3.0907732028808265e-05, + "loss": 3.9976, + "step": 81385 + }, + { + "epoch": 5.529963310232368, + "grad_norm": 0.14420408010482788, + "learning_rate": 3.090348552792499e-05, + "loss": 3.8203, + "step": 81390 + }, + { + "epoch": 5.53030303030303, + "grad_norm": 0.16649675369262695, + "learning_rate": 3.089923902704172e-05, + "loss": 3.7188, + "step": 81395 + }, + { + "epoch": 5.530642750373692, + "grad_norm": 0.17056670784950256, + "learning_rate": 3.089499252615844e-05, + "loss": 3.8273, + "step": 81400 + }, + { + "epoch": 5.5309824704443535, + "grad_norm": 0.14086897671222687, + "learning_rate": 3.0890746025275177e-05, + "loss": 3.8767, + "step": 81405 + }, + { + "epoch": 5.531322190515016, + "grad_norm": 2.2310280799865723, + "learning_rate": 3.0886499524391905e-05, + "loss": 3.9073, + "step": 81410 + }, + { + "epoch": 5.531661910585678, + "grad_norm": 0.5060164928436279, + "learning_rate": 3.0882253023508626e-05, + "loss": 3.9566, + "step": 81415 + }, + { + "epoch": 5.532001630656339, + "grad_norm": 0.14128533005714417, + "learning_rate": 3.087800652262536e-05, + "loss": 3.9726, + "step": 81420 + }, + { + "epoch": 5.532341350727001, + "grad_norm": 0.26332157850265503, + "learning_rate": 3.087376002174209e-05, + "loss": 3.8158, + "step": 81425 + }, + { + "epoch": 5.532681070797663, + "grad_norm": 0.13729609549045563, + "learning_rate": 3.086951352085881e-05, + "loss": 3.7926, + "step": 81430 + }, + { + "epoch": 5.533020790868324, + "grad_norm": 0.16047881543636322, + "learning_rate": 3.086526701997554e-05, + "loss": 3.8803, + "step": 81435 + }, + { + "epoch": 5.533360510938986, + "grad_norm": 0.2637616991996765, + "learning_rate": 3.086102051909227e-05, + "loss": 3.9126, + "step": 81440 + }, + { + "epoch": 5.533700231009648, + "grad_norm": 0.16653411090373993, + "learning_rate": 3.0856774018208994e-05, + "loss": 3.9078, + "step": 81445 + }, + { + "epoch": 5.5340399510803095, + "grad_norm": 0.47850528359413147, + "learning_rate": 3.085252751732572e-05, + "loss": 3.5962, + "step": 81450 + }, + { + "epoch": 5.534379671150972, + "grad_norm": 0.1759483814239502, + "learning_rate": 3.084828101644246e-05, + "loss": 3.8669, + "step": 81455 + }, + { + "epoch": 5.534719391221634, + "grad_norm": 0.16972942650318146, + "learning_rate": 3.084403451555918e-05, + "loss": 3.7879, + "step": 81460 + }, + { + "epoch": 5.535059111292295, + "grad_norm": 0.1790671944618225, + "learning_rate": 3.0839788014675906e-05, + "loss": 3.8667, + "step": 81465 + }, + { + "epoch": 5.535398831362957, + "grad_norm": 0.17062056064605713, + "learning_rate": 3.083554151379264e-05, + "loss": 3.709, + "step": 81470 + }, + { + "epoch": 5.535738551433619, + "grad_norm": 0.1693548709154129, + "learning_rate": 3.083129501290936e-05, + "loss": 3.9119, + "step": 81475 + }, + { + "epoch": 5.53607827150428, + "grad_norm": 0.20063459873199463, + "learning_rate": 3.082704851202609e-05, + "loss": 3.938, + "step": 81480 + }, + { + "epoch": 5.536417991574942, + "grad_norm": 0.17220914363861084, + "learning_rate": 3.082280201114282e-05, + "loss": 3.8161, + "step": 81485 + }, + { + "epoch": 5.536757711645604, + "grad_norm": 0.1510418951511383, + "learning_rate": 3.081855551025955e-05, + "loss": 3.8647, + "step": 81490 + }, + { + "epoch": 5.537097431716266, + "grad_norm": 0.15366268157958984, + "learning_rate": 3.0814309009376274e-05, + "loss": 4.07, + "step": 81495 + }, + { + "epoch": 5.537437151786928, + "grad_norm": 0.14910703897476196, + "learning_rate": 3.0810062508493e-05, + "loss": 3.6991, + "step": 81500 + }, + { + "epoch": 5.53777687185759, + "grad_norm": 0.15684837102890015, + "learning_rate": 3.080581600760974e-05, + "loss": 4.0225, + "step": 81505 + }, + { + "epoch": 5.538116591928251, + "grad_norm": 0.24209070205688477, + "learning_rate": 3.080156950672646e-05, + "loss": 3.7988, + "step": 81510 + }, + { + "epoch": 5.538456311998913, + "grad_norm": 0.21142520010471344, + "learning_rate": 3.0797323005843186e-05, + "loss": 3.9167, + "step": 81515 + }, + { + "epoch": 5.538796032069575, + "grad_norm": 0.294050931930542, + "learning_rate": 3.0793076504959914e-05, + "loss": 3.8479, + "step": 81520 + }, + { + "epoch": 5.539135752140236, + "grad_norm": 0.15432986617088318, + "learning_rate": 3.078883000407664e-05, + "loss": 3.9529, + "step": 81525 + }, + { + "epoch": 5.539475472210898, + "grad_norm": 0.18285198509693146, + "learning_rate": 3.078458350319337e-05, + "loss": 3.9344, + "step": 81530 + }, + { + "epoch": 5.53981519228156, + "grad_norm": 0.16931743919849396, + "learning_rate": 3.07803370023101e-05, + "loss": 3.9913, + "step": 81535 + }, + { + "epoch": 5.540154912352222, + "grad_norm": 0.17888060212135315, + "learning_rate": 3.0776090501426826e-05, + "loss": 3.9791, + "step": 81540 + }, + { + "epoch": 5.540494632422884, + "grad_norm": 0.5679081678390503, + "learning_rate": 3.0771844000543554e-05, + "loss": 3.9295, + "step": 81545 + }, + { + "epoch": 5.540834352493546, + "grad_norm": 0.17467157542705536, + "learning_rate": 3.076759749966028e-05, + "loss": 3.8959, + "step": 81550 + }, + { + "epoch": 5.541174072564207, + "grad_norm": 0.2786978483200073, + "learning_rate": 3.076335099877701e-05, + "loss": 3.8521, + "step": 81555 + }, + { + "epoch": 5.541513792634869, + "grad_norm": 0.1408616006374359, + "learning_rate": 3.075910449789374e-05, + "loss": 4.0697, + "step": 81560 + }, + { + "epoch": 5.541853512705531, + "grad_norm": 0.17078553140163422, + "learning_rate": 3.0754857997010466e-05, + "loss": 3.6133, + "step": 81565 + }, + { + "epoch": 5.542193232776192, + "grad_norm": 0.25223737955093384, + "learning_rate": 3.0750611496127194e-05, + "loss": 3.865, + "step": 81570 + }, + { + "epoch": 5.542532952846854, + "grad_norm": 0.8302135467529297, + "learning_rate": 3.074636499524392e-05, + "loss": 3.7317, + "step": 81575 + }, + { + "epoch": 5.542872672917516, + "grad_norm": 0.15629158914089203, + "learning_rate": 3.074211849436065e-05, + "loss": 3.9782, + "step": 81580 + }, + { + "epoch": 5.543212392988178, + "grad_norm": 0.22512730956077576, + "learning_rate": 3.073787199347737e-05, + "loss": 3.9261, + "step": 81585 + }, + { + "epoch": 5.54355211305884, + "grad_norm": 2.000666618347168, + "learning_rate": 3.0733625492594106e-05, + "loss": 3.7358, + "step": 81590 + }, + { + "epoch": 5.543891833129502, + "grad_norm": 0.15946362912654877, + "learning_rate": 3.0729378991710834e-05, + "loss": 3.8446, + "step": 81595 + }, + { + "epoch": 5.544231553200163, + "grad_norm": 0.15322712063789368, + "learning_rate": 3.0725132490827555e-05, + "loss": 3.7828, + "step": 81600 + }, + { + "epoch": 5.544571273270825, + "grad_norm": 0.14586353302001953, + "learning_rate": 3.072088598994429e-05, + "loss": 3.9301, + "step": 81605 + }, + { + "epoch": 5.544910993341487, + "grad_norm": 0.17497970163822174, + "learning_rate": 3.071663948906102e-05, + "loss": 3.8887, + "step": 81610 + }, + { + "epoch": 5.545250713412148, + "grad_norm": 0.1994054913520813, + "learning_rate": 3.071239298817774e-05, + "loss": 3.8379, + "step": 81615 + }, + { + "epoch": 5.54559043348281, + "grad_norm": 0.19989024102687836, + "learning_rate": 3.070814648729447e-05, + "loss": 3.6734, + "step": 81620 + }, + { + "epoch": 5.545930153553472, + "grad_norm": 0.16376633942127228, + "learning_rate": 3.07038999864112e-05, + "loss": 3.9033, + "step": 81625 + }, + { + "epoch": 5.546269873624134, + "grad_norm": 0.2459750920534134, + "learning_rate": 3.069965348552792e-05, + "loss": 3.8145, + "step": 81630 + }, + { + "epoch": 5.546609593694796, + "grad_norm": 0.6487008929252625, + "learning_rate": 3.069540698464465e-05, + "loss": 3.781, + "step": 81635 + }, + { + "epoch": 5.546949313765458, + "grad_norm": 2.4842140674591064, + "learning_rate": 3.0691160483761386e-05, + "loss": 3.4301, + "step": 81640 + }, + { + "epoch": 5.547289033836119, + "grad_norm": 0.1632111519575119, + "learning_rate": 3.068691398287811e-05, + "loss": 3.75, + "step": 81645 + }, + { + "epoch": 5.547628753906781, + "grad_norm": 0.14283151924610138, + "learning_rate": 3.0682667481994835e-05, + "loss": 4.0583, + "step": 81650 + }, + { + "epoch": 5.547968473977443, + "grad_norm": 0.18503661453723907, + "learning_rate": 3.067842098111156e-05, + "loss": 3.6863, + "step": 81655 + }, + { + "epoch": 5.548308194048104, + "grad_norm": 0.17740540206432343, + "learning_rate": 3.06741744802283e-05, + "loss": 3.8499, + "step": 81660 + }, + { + "epoch": 5.548647914118766, + "grad_norm": 0.17743150889873505, + "learning_rate": 3.066992797934502e-05, + "loss": 4.043, + "step": 81665 + }, + { + "epoch": 5.5489876341894275, + "grad_norm": 0.19470317661762238, + "learning_rate": 3.066568147846175e-05, + "loss": 4.0504, + "step": 81670 + }, + { + "epoch": 5.54932735426009, + "grad_norm": 0.2024279534816742, + "learning_rate": 3.066143497757848e-05, + "loss": 3.6904, + "step": 81675 + }, + { + "epoch": 5.549667074330752, + "grad_norm": 3.7573142051696777, + "learning_rate": 3.0657188476695203e-05, + "loss": 3.84, + "step": 81680 + }, + { + "epoch": 5.550006794401413, + "grad_norm": 0.1405433863401413, + "learning_rate": 3.065294197581193e-05, + "loss": 3.7545, + "step": 81685 + }, + { + "epoch": 5.550346514472075, + "grad_norm": 0.1828056275844574, + "learning_rate": 3.064869547492866e-05, + "loss": 3.9292, + "step": 81690 + }, + { + "epoch": 5.550686234542737, + "grad_norm": 0.18673890829086304, + "learning_rate": 3.064444897404539e-05, + "loss": 3.861, + "step": 81695 + }, + { + "epoch": 5.551025954613398, + "grad_norm": 0.16688351333141327, + "learning_rate": 3.0640202473162115e-05, + "loss": 3.9437, + "step": 81700 + }, + { + "epoch": 5.55136567468406, + "grad_norm": 0.16560524702072144, + "learning_rate": 3.0635955972278843e-05, + "loss": 3.7378, + "step": 81705 + }, + { + "epoch": 5.551705394754722, + "grad_norm": 0.32834839820861816, + "learning_rate": 3.063170947139557e-05, + "loss": 3.4872, + "step": 81710 + }, + { + "epoch": 5.5520451148253835, + "grad_norm": 0.20362018048763275, + "learning_rate": 3.06274629705123e-05, + "loss": 3.831, + "step": 81715 + }, + { + "epoch": 5.552384834896046, + "grad_norm": 0.23003099858760834, + "learning_rate": 3.062321646962903e-05, + "loss": 3.868, + "step": 81720 + }, + { + "epoch": 5.552724554966708, + "grad_norm": 0.22754468023777008, + "learning_rate": 3.0618969968745755e-05, + "loss": 4.0763, + "step": 81725 + }, + { + "epoch": 5.553064275037369, + "grad_norm": 0.17381420731544495, + "learning_rate": 3.0614723467862484e-05, + "loss": 3.5876, + "step": 81730 + }, + { + "epoch": 5.553403995108031, + "grad_norm": 0.21171103417873383, + "learning_rate": 3.061047696697921e-05, + "loss": 3.8671, + "step": 81735 + }, + { + "epoch": 5.553743715178693, + "grad_norm": 0.17588861286640167, + "learning_rate": 3.060623046609594e-05, + "loss": 4.0917, + "step": 81740 + }, + { + "epoch": 5.554083435249354, + "grad_norm": 0.14538976550102234, + "learning_rate": 3.060198396521267e-05, + "loss": 3.9094, + "step": 81745 + }, + { + "epoch": 5.554423155320016, + "grad_norm": 0.18214978277683258, + "learning_rate": 3.0597737464329396e-05, + "loss": 3.6422, + "step": 81750 + }, + { + "epoch": 5.554762875390678, + "grad_norm": 0.1332785189151764, + "learning_rate": 3.059349096344612e-05, + "loss": 3.7489, + "step": 81755 + }, + { + "epoch": 5.5551025954613396, + "grad_norm": 0.2288784682750702, + "learning_rate": 3.058924446256285e-05, + "loss": 3.7808, + "step": 81760 + }, + { + "epoch": 5.555442315532002, + "grad_norm": 0.22287602722644806, + "learning_rate": 3.058499796167958e-05, + "loss": 3.7826, + "step": 81765 + }, + { + "epoch": 5.555782035602664, + "grad_norm": 0.17758381366729736, + "learning_rate": 3.05807514607963e-05, + "loss": 3.6335, + "step": 81770 + }, + { + "epoch": 5.556121755673325, + "grad_norm": 0.4980243444442749, + "learning_rate": 3.0576504959913036e-05, + "loss": 4.0197, + "step": 81775 + }, + { + "epoch": 5.556461475743987, + "grad_norm": 0.18645763397216797, + "learning_rate": 3.0572258459029764e-05, + "loss": 3.8976, + "step": 81780 + }, + { + "epoch": 5.556801195814649, + "grad_norm": 0.17370808124542236, + "learning_rate": 3.0568011958146485e-05, + "loss": 4.0382, + "step": 81785 + }, + { + "epoch": 5.55714091588531, + "grad_norm": 0.21996285021305084, + "learning_rate": 3.056376545726321e-05, + "loss": 3.6633, + "step": 81790 + }, + { + "epoch": 5.557480635955972, + "grad_norm": 0.2564495801925659, + "learning_rate": 3.055951895637995e-05, + "loss": 3.6283, + "step": 81795 + }, + { + "epoch": 5.557820356026634, + "grad_norm": 0.6760494709014893, + "learning_rate": 3.055527245549667e-05, + "loss": 3.7837, + "step": 81800 + }, + { + "epoch": 5.558160076097296, + "grad_norm": 0.16857339441776276, + "learning_rate": 3.05510259546134e-05, + "loss": 4.0387, + "step": 81805 + }, + { + "epoch": 5.558499796167958, + "grad_norm": 1.9918453693389893, + "learning_rate": 3.054677945373013e-05, + "loss": 3.7416, + "step": 81810 + }, + { + "epoch": 5.55883951623862, + "grad_norm": 0.16101793944835663, + "learning_rate": 3.054253295284685e-05, + "loss": 3.9482, + "step": 81815 + }, + { + "epoch": 5.559179236309281, + "grad_norm": 0.19351916015148163, + "learning_rate": 3.053828645196358e-05, + "loss": 3.9479, + "step": 81820 + }, + { + "epoch": 5.559518956379943, + "grad_norm": 0.19234497845172882, + "learning_rate": 3.053403995108031e-05, + "loss": 3.8053, + "step": 81825 + }, + { + "epoch": 5.559858676450605, + "grad_norm": 0.16150343418121338, + "learning_rate": 3.0529793450197044e-05, + "loss": 3.9229, + "step": 81830 + }, + { + "epoch": 5.560198396521266, + "grad_norm": 0.1658748835325241, + "learning_rate": 3.0525546949313765e-05, + "loss": 3.897, + "step": 81835 + }, + { + "epoch": 5.560538116591928, + "grad_norm": 0.17750553786754608, + "learning_rate": 3.052130044843049e-05, + "loss": 3.8394, + "step": 81840 + }, + { + "epoch": 5.56087783666259, + "grad_norm": 0.18472731113433838, + "learning_rate": 3.0517053947547224e-05, + "loss": 3.811, + "step": 81845 + }, + { + "epoch": 5.561217556733252, + "grad_norm": 0.23359128832817078, + "learning_rate": 3.051280744666395e-05, + "loss": 3.7751, + "step": 81850 + }, + { + "epoch": 5.561557276803914, + "grad_norm": 0.20514735579490662, + "learning_rate": 3.050856094578068e-05, + "loss": 3.9522, + "step": 81855 + }, + { + "epoch": 5.561896996874576, + "grad_norm": 0.18528594076633453, + "learning_rate": 3.050431444489741e-05, + "loss": 3.8111, + "step": 81860 + }, + { + "epoch": 5.562236716945237, + "grad_norm": 0.5640978813171387, + "learning_rate": 3.0500067944014133e-05, + "loss": 3.5243, + "step": 81865 + }, + { + "epoch": 5.562576437015899, + "grad_norm": 0.20625053346157074, + "learning_rate": 3.049582144313086e-05, + "loss": 3.8971, + "step": 81870 + }, + { + "epoch": 5.56291615708656, + "grad_norm": 0.15877626836299896, + "learning_rate": 3.0491574942247592e-05, + "loss": 3.6259, + "step": 81875 + }, + { + "epoch": 5.563255877157222, + "grad_norm": 0.4757607579231262, + "learning_rate": 3.0487328441364317e-05, + "loss": 3.5769, + "step": 81880 + }, + { + "epoch": 5.563595597227884, + "grad_norm": 0.1851503700017929, + "learning_rate": 3.0483081940481045e-05, + "loss": 3.6677, + "step": 81885 + }, + { + "epoch": 5.5639353172985455, + "grad_norm": 0.1485123634338379, + "learning_rate": 3.0478835439597776e-05, + "loss": 3.9509, + "step": 81890 + }, + { + "epoch": 5.564275037369208, + "grad_norm": 0.15262974798679352, + "learning_rate": 3.0474588938714498e-05, + "loss": 3.6314, + "step": 81895 + }, + { + "epoch": 5.56461475743987, + "grad_norm": 0.2583438456058502, + "learning_rate": 3.047034243783123e-05, + "loss": 3.7169, + "step": 81900 + }, + { + "epoch": 5.564954477510531, + "grad_norm": 0.19377371668815613, + "learning_rate": 3.0466095936947957e-05, + "loss": 3.8722, + "step": 81905 + }, + { + "epoch": 5.565294197581193, + "grad_norm": 0.9350605607032776, + "learning_rate": 3.046184943606468e-05, + "loss": 3.8732, + "step": 81910 + }, + { + "epoch": 5.565633917651855, + "grad_norm": 0.16068941354751587, + "learning_rate": 3.0457602935181413e-05, + "loss": 3.8747, + "step": 81915 + }, + { + "epoch": 5.565973637722516, + "grad_norm": 0.2182384729385376, + "learning_rate": 3.045335643429814e-05, + "loss": 3.4592, + "step": 81920 + }, + { + "epoch": 5.566313357793178, + "grad_norm": 0.19426925480365753, + "learning_rate": 3.0449109933414866e-05, + "loss": 3.9416, + "step": 81925 + }, + { + "epoch": 5.56665307786384, + "grad_norm": 0.14541393518447876, + "learning_rate": 3.0444863432531594e-05, + "loss": 3.8315, + "step": 81930 + }, + { + "epoch": 5.5669927979345015, + "grad_norm": 0.38417696952819824, + "learning_rate": 3.0440616931648325e-05, + "loss": 3.8863, + "step": 81935 + }, + { + "epoch": 5.567332518005164, + "grad_norm": 0.1482798159122467, + "learning_rate": 3.043637043076505e-05, + "loss": 4.1055, + "step": 81940 + }, + { + "epoch": 5.567672238075826, + "grad_norm": 0.19762344658374786, + "learning_rate": 3.0432123929881778e-05, + "loss": 3.8193, + "step": 81945 + }, + { + "epoch": 5.568011958146487, + "grad_norm": 0.17268428206443787, + "learning_rate": 3.042787742899851e-05, + "loss": 4.026, + "step": 81950 + }, + { + "epoch": 5.568351678217149, + "grad_norm": 0.15472745895385742, + "learning_rate": 3.0423630928115234e-05, + "loss": 3.6874, + "step": 81955 + }, + { + "epoch": 5.568691398287811, + "grad_norm": 0.6118285059928894, + "learning_rate": 3.041938442723196e-05, + "loss": 3.9357, + "step": 81960 + }, + { + "epoch": 5.569031118358472, + "grad_norm": 0.19233542680740356, + "learning_rate": 3.041513792634869e-05, + "loss": 3.8315, + "step": 81965 + }, + { + "epoch": 5.569370838429134, + "grad_norm": 0.9463678598403931, + "learning_rate": 3.0410891425465414e-05, + "loss": 4.0189, + "step": 81970 + }, + { + "epoch": 5.569710558499796, + "grad_norm": 0.38689059019088745, + "learning_rate": 3.0406644924582146e-05, + "loss": 3.8736, + "step": 81975 + }, + { + "epoch": 5.5700502785704575, + "grad_norm": 0.18904191255569458, + "learning_rate": 3.0402398423698874e-05, + "loss": 3.8345, + "step": 81980 + }, + { + "epoch": 5.57038999864112, + "grad_norm": 0.1473395973443985, + "learning_rate": 3.03981519228156e-05, + "loss": 3.9177, + "step": 81985 + }, + { + "epoch": 5.570729718711782, + "grad_norm": 0.19594942033290863, + "learning_rate": 3.039390542193233e-05, + "loss": 3.6007, + "step": 81990 + }, + { + "epoch": 5.571069438782443, + "grad_norm": 0.19317926466464996, + "learning_rate": 3.0389658921049058e-05, + "loss": 3.9449, + "step": 81995 + }, + { + "epoch": 5.571409158853105, + "grad_norm": 0.17288358509540558, + "learning_rate": 3.038541242016579e-05, + "loss": 3.8626, + "step": 82000 + }, + { + "epoch": 5.571748878923767, + "grad_norm": 0.16523785889148712, + "learning_rate": 3.038116591928251e-05, + "loss": 3.6767, + "step": 82005 + }, + { + "epoch": 5.572088598994428, + "grad_norm": 0.17416781187057495, + "learning_rate": 3.0376919418399242e-05, + "loss": 3.797, + "step": 82010 + }, + { + "epoch": 5.57242831906509, + "grad_norm": 0.38498345017433167, + "learning_rate": 3.037267291751597e-05, + "loss": 3.9694, + "step": 82015 + }, + { + "epoch": 5.572768039135752, + "grad_norm": 0.15925562381744385, + "learning_rate": 3.0368426416632694e-05, + "loss": 3.7774, + "step": 82020 + }, + { + "epoch": 5.5731077592064135, + "grad_norm": 0.1558508574962616, + "learning_rate": 3.0364179915749426e-05, + "loss": 4.0174, + "step": 82025 + }, + { + "epoch": 5.573447479277076, + "grad_norm": 0.15586058795452118, + "learning_rate": 3.0359933414866154e-05, + "loss": 3.9613, + "step": 82030 + }, + { + "epoch": 5.573787199347738, + "grad_norm": 0.16582483053207397, + "learning_rate": 3.035568691398288e-05, + "loss": 4.1021, + "step": 82035 + }, + { + "epoch": 5.574126919418399, + "grad_norm": 0.15828722715377808, + "learning_rate": 3.0351440413099606e-05, + "loss": 3.9667, + "step": 82040 + }, + { + "epoch": 5.574466639489061, + "grad_norm": 0.14186915755271912, + "learning_rate": 3.0347193912216338e-05, + "loss": 3.7875, + "step": 82045 + }, + { + "epoch": 5.574806359559723, + "grad_norm": 0.1729414016008377, + "learning_rate": 3.0342947411333062e-05, + "loss": 3.935, + "step": 82050 + }, + { + "epoch": 5.575146079630384, + "grad_norm": 0.15303368866443634, + "learning_rate": 3.033870091044979e-05, + "loss": 3.9211, + "step": 82055 + }, + { + "epoch": 5.575485799701046, + "grad_norm": 0.1827246993780136, + "learning_rate": 3.0334454409566522e-05, + "loss": 3.8895, + "step": 82060 + }, + { + "epoch": 5.575825519771708, + "grad_norm": 0.20462541282176971, + "learning_rate": 3.0330207908683243e-05, + "loss": 3.7895, + "step": 82065 + }, + { + "epoch": 5.57616523984237, + "grad_norm": 0.15010149776935577, + "learning_rate": 3.0325961407799974e-05, + "loss": 3.8407, + "step": 82070 + }, + { + "epoch": 5.576504959913032, + "grad_norm": 0.1891358643770218, + "learning_rate": 3.0321714906916702e-05, + "loss": 3.7685, + "step": 82075 + }, + { + "epoch": 5.576844679983694, + "grad_norm": 0.5295328497886658, + "learning_rate": 3.0317468406033427e-05, + "loss": 3.8062, + "step": 82080 + }, + { + "epoch": 5.577184400054355, + "grad_norm": 0.1996038258075714, + "learning_rate": 3.031322190515016e-05, + "loss": 3.8889, + "step": 82085 + }, + { + "epoch": 5.577524120125017, + "grad_norm": 0.146746426820755, + "learning_rate": 3.0308975404266887e-05, + "loss": 3.9741, + "step": 82090 + }, + { + "epoch": 5.577863840195679, + "grad_norm": 0.4884132742881775, + "learning_rate": 3.030472890338361e-05, + "loss": 3.769, + "step": 82095 + }, + { + "epoch": 5.57820356026634, + "grad_norm": 0.19802352786064148, + "learning_rate": 3.0300482402500343e-05, + "loss": 4.0257, + "step": 82100 + }, + { + "epoch": 5.578543280337002, + "grad_norm": 0.15108583867549896, + "learning_rate": 3.029623590161707e-05, + "loss": 3.8884, + "step": 82105 + }, + { + "epoch": 5.578883000407664, + "grad_norm": 0.1800481081008911, + "learning_rate": 3.0291989400733795e-05, + "loss": 3.7795, + "step": 82110 + }, + { + "epoch": 5.579222720478326, + "grad_norm": 0.2015244960784912, + "learning_rate": 3.0287742899850523e-05, + "loss": 3.5822, + "step": 82115 + }, + { + "epoch": 5.579562440548988, + "grad_norm": 0.18994687497615814, + "learning_rate": 3.0283496398967255e-05, + "loss": 3.8837, + "step": 82120 + }, + { + "epoch": 5.57990216061965, + "grad_norm": 0.1678653210401535, + "learning_rate": 3.027924989808398e-05, + "loss": 3.8478, + "step": 82125 + }, + { + "epoch": 5.580241880690311, + "grad_norm": 0.15726657211780548, + "learning_rate": 3.0275003397200707e-05, + "loss": 3.7187, + "step": 82130 + }, + { + "epoch": 5.580581600760973, + "grad_norm": 0.19016210734844208, + "learning_rate": 3.027075689631744e-05, + "loss": 3.8172, + "step": 82135 + }, + { + "epoch": 5.580921320831635, + "grad_norm": 0.4128619134426117, + "learning_rate": 3.026651039543416e-05, + "loss": 3.7957, + "step": 82140 + }, + { + "epoch": 5.581261040902296, + "grad_norm": 0.15834257006645203, + "learning_rate": 3.026226389455089e-05, + "loss": 4.1085, + "step": 82145 + }, + { + "epoch": 5.581600760972958, + "grad_norm": 0.4873870015144348, + "learning_rate": 3.025801739366762e-05, + "loss": 3.9599, + "step": 82150 + }, + { + "epoch": 5.58194048104362, + "grad_norm": 0.18855516612529755, + "learning_rate": 3.0253770892784344e-05, + "loss": 3.8631, + "step": 82155 + }, + { + "epoch": 5.582280201114282, + "grad_norm": 0.6642122268676758, + "learning_rate": 3.0249524391901075e-05, + "loss": 4.0414, + "step": 82160 + }, + { + "epoch": 5.582619921184944, + "grad_norm": 0.14236178994178772, + "learning_rate": 3.0245277891017803e-05, + "loss": 3.6768, + "step": 82165 + }, + { + "epoch": 5.582959641255606, + "grad_norm": 0.2906593978404999, + "learning_rate": 3.0241031390134535e-05, + "loss": 3.9858, + "step": 82170 + }, + { + "epoch": 5.583299361326267, + "grad_norm": 0.1629696637392044, + "learning_rate": 3.0236784889251256e-05, + "loss": 3.9073, + "step": 82175 + }, + { + "epoch": 5.583639081396929, + "grad_norm": 0.1777247190475464, + "learning_rate": 3.0232538388367987e-05, + "loss": 3.7094, + "step": 82180 + }, + { + "epoch": 5.583978801467591, + "grad_norm": 0.1522219330072403, + "learning_rate": 3.0228291887484715e-05, + "loss": 3.6685, + "step": 82185 + }, + { + "epoch": 5.584318521538252, + "grad_norm": 0.21116867661476135, + "learning_rate": 3.022404538660144e-05, + "loss": 3.9282, + "step": 82190 + }, + { + "epoch": 5.584658241608914, + "grad_norm": 0.16873234510421753, + "learning_rate": 3.021979888571817e-05, + "loss": 3.6708, + "step": 82195 + }, + { + "epoch": 5.584997961679576, + "grad_norm": 0.14131031930446625, + "learning_rate": 3.02155523848349e-05, + "loss": 4.1418, + "step": 82200 + }, + { + "epoch": 5.585337681750238, + "grad_norm": 0.1617916375398636, + "learning_rate": 3.0211305883951624e-05, + "loss": 3.7899, + "step": 82205 + }, + { + "epoch": 5.5856774018209, + "grad_norm": 0.1755700558423996, + "learning_rate": 3.0207059383068352e-05, + "loss": 3.5829, + "step": 82210 + }, + { + "epoch": 5.586017121891562, + "grad_norm": 0.2080787718296051, + "learning_rate": 3.0202812882185083e-05, + "loss": 3.9065, + "step": 82215 + }, + { + "epoch": 5.586356841962223, + "grad_norm": 0.1549527794122696, + "learning_rate": 3.0198566381301808e-05, + "loss": 3.8498, + "step": 82220 + }, + { + "epoch": 5.586696562032885, + "grad_norm": 0.17821574211120605, + "learning_rate": 3.0194319880418536e-05, + "loss": 4.0511, + "step": 82225 + }, + { + "epoch": 5.587036282103547, + "grad_norm": 0.18997375667095184, + "learning_rate": 3.0190073379535267e-05, + "loss": 3.7937, + "step": 82230 + }, + { + "epoch": 5.587376002174208, + "grad_norm": 0.18084697425365448, + "learning_rate": 3.0185826878651992e-05, + "loss": 3.9392, + "step": 82235 + }, + { + "epoch": 5.58771572224487, + "grad_norm": 0.24189448356628418, + "learning_rate": 3.018158037776872e-05, + "loss": 3.5546, + "step": 82240 + }, + { + "epoch": 5.588055442315532, + "grad_norm": 0.24605026841163635, + "learning_rate": 3.017733387688545e-05, + "loss": 3.9328, + "step": 82245 + }, + { + "epoch": 5.588395162386194, + "grad_norm": 0.1784420609474182, + "learning_rate": 3.0173087376002173e-05, + "loss": 3.7887, + "step": 82250 + }, + { + "epoch": 5.588734882456856, + "grad_norm": 0.15638482570648193, + "learning_rate": 3.0168840875118904e-05, + "loss": 3.9795, + "step": 82255 + }, + { + "epoch": 5.589074602527518, + "grad_norm": 0.13985693454742432, + "learning_rate": 3.0164594374235632e-05, + "loss": 3.9737, + "step": 82260 + }, + { + "epoch": 5.589414322598179, + "grad_norm": 0.15736538171768188, + "learning_rate": 3.0160347873352357e-05, + "loss": 3.7588, + "step": 82265 + }, + { + "epoch": 5.589754042668841, + "grad_norm": 1.4867839813232422, + "learning_rate": 3.0156101372469088e-05, + "loss": 3.9133, + "step": 82270 + }, + { + "epoch": 5.590093762739503, + "grad_norm": 0.28309959173202515, + "learning_rate": 3.0151854871585816e-05, + "loss": 3.6846, + "step": 82275 + }, + { + "epoch": 5.590433482810164, + "grad_norm": 0.20693622529506683, + "learning_rate": 3.014760837070254e-05, + "loss": 3.811, + "step": 82280 + }, + { + "epoch": 5.590773202880826, + "grad_norm": 0.17478980123996735, + "learning_rate": 3.014336186981927e-05, + "loss": 3.6686, + "step": 82285 + }, + { + "epoch": 5.591112922951488, + "grad_norm": 0.1980154663324356, + "learning_rate": 3.0139115368936e-05, + "loss": 3.7858, + "step": 82290 + }, + { + "epoch": 5.59145264302215, + "grad_norm": 0.2073349505662918, + "learning_rate": 3.0134868868052725e-05, + "loss": 3.7073, + "step": 82295 + }, + { + "epoch": 5.591792363092812, + "grad_norm": 0.1437574177980423, + "learning_rate": 3.0130622367169453e-05, + "loss": 4.0627, + "step": 82300 + }, + { + "epoch": 5.592132083163474, + "grad_norm": 0.17231929302215576, + "learning_rate": 3.0126375866286184e-05, + "loss": 3.742, + "step": 82305 + }, + { + "epoch": 5.592471803234135, + "grad_norm": 0.15876956284046173, + "learning_rate": 3.0122129365402905e-05, + "loss": 3.6613, + "step": 82310 + }, + { + "epoch": 5.592811523304797, + "grad_norm": 0.5331560969352722, + "learning_rate": 3.0117882864519637e-05, + "loss": 3.9332, + "step": 82315 + }, + { + "epoch": 5.593151243375459, + "grad_norm": 0.19072096049785614, + "learning_rate": 3.0113636363636365e-05, + "loss": 3.7589, + "step": 82320 + }, + { + "epoch": 5.59349096344612, + "grad_norm": 0.4658890962600708, + "learning_rate": 3.010938986275309e-05, + "loss": 3.7848, + "step": 82325 + }, + { + "epoch": 5.593830683516782, + "grad_norm": 0.17641057074069977, + "learning_rate": 3.010514336186982e-05, + "loss": 4.0121, + "step": 82330 + }, + { + "epoch": 5.594170403587444, + "grad_norm": 0.17842243611812592, + "learning_rate": 3.010089686098655e-05, + "loss": 3.7735, + "step": 82335 + }, + { + "epoch": 5.594510123658106, + "grad_norm": 0.25073543190956116, + "learning_rate": 3.009665036010328e-05, + "loss": 3.9695, + "step": 82340 + }, + { + "epoch": 5.594849843728768, + "grad_norm": 0.4585062861442566, + "learning_rate": 3.0092403859220005e-05, + "loss": 3.8459, + "step": 82345 + }, + { + "epoch": 5.595189563799429, + "grad_norm": 0.16030824184417725, + "learning_rate": 3.0088157358336733e-05, + "loss": 3.441, + "step": 82350 + }, + { + "epoch": 5.595529283870091, + "grad_norm": 0.17370393872261047, + "learning_rate": 3.008391085745346e-05, + "loss": 4.2192, + "step": 82355 + }, + { + "epoch": 5.595869003940753, + "grad_norm": 0.24351754784584045, + "learning_rate": 3.0079664356570185e-05, + "loss": 4.0208, + "step": 82360 + }, + { + "epoch": 5.596208724011414, + "grad_norm": 0.16962938010692596, + "learning_rate": 3.0075417855686917e-05, + "loss": 4.0931, + "step": 82365 + }, + { + "epoch": 5.596548444082076, + "grad_norm": 0.22111186385154724, + "learning_rate": 3.0071171354803645e-05, + "loss": 3.9638, + "step": 82370 + }, + { + "epoch": 5.596888164152738, + "grad_norm": 0.15407758951187134, + "learning_rate": 3.006692485392037e-05, + "loss": 3.6247, + "step": 82375 + }, + { + "epoch": 5.5972278842234, + "grad_norm": 0.2143179327249527, + "learning_rate": 3.00626783530371e-05, + "loss": 3.9718, + "step": 82380 + }, + { + "epoch": 5.597567604294062, + "grad_norm": 0.1998262107372284, + "learning_rate": 3.005843185215383e-05, + "loss": 3.6398, + "step": 82385 + }, + { + "epoch": 5.597907324364724, + "grad_norm": 0.2341223657131195, + "learning_rate": 3.0054185351270553e-05, + "loss": 3.8472, + "step": 82390 + }, + { + "epoch": 5.598247044435385, + "grad_norm": 0.23044291138648987, + "learning_rate": 3.004993885038728e-05, + "loss": 3.8887, + "step": 82395 + }, + { + "epoch": 5.598586764506047, + "grad_norm": 0.19260019063949585, + "learning_rate": 3.0045692349504013e-05, + "loss": 3.8035, + "step": 82400 + }, + { + "epoch": 5.598926484576709, + "grad_norm": 0.17749805748462677, + "learning_rate": 3.0041445848620737e-05, + "loss": 3.6717, + "step": 82405 + }, + { + "epoch": 5.59926620464737, + "grad_norm": 0.14455080032348633, + "learning_rate": 3.0037199347737465e-05, + "loss": 3.7881, + "step": 82410 + }, + { + "epoch": 5.599605924718032, + "grad_norm": 0.22736646234989166, + "learning_rate": 3.0032952846854197e-05, + "loss": 4.0067, + "step": 82415 + }, + { + "epoch": 5.599945644788694, + "grad_norm": 0.19641803205013275, + "learning_rate": 3.0028706345970918e-05, + "loss": 3.9325, + "step": 82420 + }, + { + "epoch": 5.600285364859356, + "grad_norm": 0.15355715155601501, + "learning_rate": 3.002445984508765e-05, + "loss": 3.8358, + "step": 82425 + }, + { + "epoch": 5.600625084930018, + "grad_norm": 0.17923451960086823, + "learning_rate": 3.0020213344204377e-05, + "loss": 3.8463, + "step": 82430 + }, + { + "epoch": 5.60096480500068, + "grad_norm": 0.17459066212177277, + "learning_rate": 3.0015966843321102e-05, + "loss": 3.7283, + "step": 82435 + }, + { + "epoch": 5.601304525071341, + "grad_norm": 0.15866383910179138, + "learning_rate": 3.0011720342437833e-05, + "loss": 3.7517, + "step": 82440 + }, + { + "epoch": 5.601644245142003, + "grad_norm": 0.15196825563907623, + "learning_rate": 3.000747384155456e-05, + "loss": 3.9777, + "step": 82445 + }, + { + "epoch": 5.601983965212665, + "grad_norm": 0.882273256778717, + "learning_rate": 3.0003227340671286e-05, + "loss": 3.9722, + "step": 82450 + }, + { + "epoch": 5.602323685283326, + "grad_norm": 0.30651891231536865, + "learning_rate": 2.9998980839788014e-05, + "loss": 3.9717, + "step": 82455 + }, + { + "epoch": 5.602663405353988, + "grad_norm": 0.17984507977962494, + "learning_rate": 2.9994734338904746e-05, + "loss": 3.7738, + "step": 82460 + }, + { + "epoch": 5.60300312542465, + "grad_norm": 0.19980385899543762, + "learning_rate": 2.999048783802147e-05, + "loss": 3.81, + "step": 82465 + }, + { + "epoch": 5.603342845495312, + "grad_norm": 0.1954261064529419, + "learning_rate": 2.9986241337138198e-05, + "loss": 4.0138, + "step": 82470 + }, + { + "epoch": 5.603682565565974, + "grad_norm": 0.18460245430469513, + "learning_rate": 2.998199483625493e-05, + "loss": 3.8519, + "step": 82475 + }, + { + "epoch": 5.604022285636636, + "grad_norm": 0.18465425074100494, + "learning_rate": 2.9977748335371654e-05, + "loss": 3.7776, + "step": 82480 + }, + { + "epoch": 5.604362005707297, + "grad_norm": 0.1625368893146515, + "learning_rate": 2.9973501834488382e-05, + "loss": 3.6359, + "step": 82485 + }, + { + "epoch": 5.604701725777959, + "grad_norm": 0.25090643763542175, + "learning_rate": 2.996925533360511e-05, + "loss": 3.7065, + "step": 82490 + }, + { + "epoch": 5.605041445848621, + "grad_norm": 0.19810669124126434, + "learning_rate": 2.9965008832721835e-05, + "loss": 3.7666, + "step": 82495 + }, + { + "epoch": 5.605381165919282, + "grad_norm": 0.25169751048088074, + "learning_rate": 2.9960762331838566e-05, + "loss": 3.7891, + "step": 82500 + }, + { + "epoch": 5.605720885989944, + "grad_norm": 0.28292807936668396, + "learning_rate": 2.9956515830955294e-05, + "loss": 4.0279, + "step": 82505 + }, + { + "epoch": 5.606060606060606, + "grad_norm": 0.14667725563049316, + "learning_rate": 2.9952269330072026e-05, + "loss": 3.8259, + "step": 82510 + }, + { + "epoch": 5.606400326131268, + "grad_norm": 0.2368813157081604, + "learning_rate": 2.994802282918875e-05, + "loss": 3.8874, + "step": 82515 + }, + { + "epoch": 5.60674004620193, + "grad_norm": 0.13973504304885864, + "learning_rate": 2.9943776328305478e-05, + "loss": 3.8907, + "step": 82520 + }, + { + "epoch": 5.607079766272592, + "grad_norm": 0.2029159665107727, + "learning_rate": 2.993952982742221e-05, + "loss": 3.7832, + "step": 82525 + }, + { + "epoch": 5.607419486343253, + "grad_norm": 0.2147703766822815, + "learning_rate": 2.993528332653893e-05, + "loss": 3.9437, + "step": 82530 + }, + { + "epoch": 5.607759206413915, + "grad_norm": 0.20807504653930664, + "learning_rate": 2.9931036825655662e-05, + "loss": 3.8262, + "step": 82535 + }, + { + "epoch": 5.608098926484577, + "grad_norm": 0.20454636216163635, + "learning_rate": 2.992679032477239e-05, + "loss": 3.8248, + "step": 82540 + }, + { + "epoch": 5.608438646555238, + "grad_norm": 0.15979348123073578, + "learning_rate": 2.9922543823889115e-05, + "loss": 3.8901, + "step": 82545 + }, + { + "epoch": 5.6087783666259, + "grad_norm": 0.21585074067115784, + "learning_rate": 2.9918297323005846e-05, + "loss": 3.9882, + "step": 82550 + }, + { + "epoch": 5.6091180866965615, + "grad_norm": 0.8582046627998352, + "learning_rate": 2.9914050822122574e-05, + "loss": 3.7857, + "step": 82555 + }, + { + "epoch": 5.609457806767224, + "grad_norm": 0.14493413269519806, + "learning_rate": 2.99098043212393e-05, + "loss": 3.6377, + "step": 82560 + }, + { + "epoch": 5.609797526837886, + "grad_norm": 0.17354066669940948, + "learning_rate": 2.9905557820356027e-05, + "loss": 3.8935, + "step": 82565 + }, + { + "epoch": 5.610137246908547, + "grad_norm": 0.19135574996471405, + "learning_rate": 2.9901311319472758e-05, + "loss": 3.8797, + "step": 82570 + }, + { + "epoch": 5.610476966979209, + "grad_norm": 0.23143132030963898, + "learning_rate": 2.9897064818589483e-05, + "loss": 3.614, + "step": 82575 + }, + { + "epoch": 5.610816687049871, + "grad_norm": 0.16584111750125885, + "learning_rate": 2.989281831770621e-05, + "loss": 3.9018, + "step": 82580 + }, + { + "epoch": 5.611156407120532, + "grad_norm": 0.15388284623622894, + "learning_rate": 2.9888571816822942e-05, + "loss": 3.9312, + "step": 82585 + }, + { + "epoch": 5.611496127191194, + "grad_norm": 0.17572695016860962, + "learning_rate": 2.9884325315939664e-05, + "loss": 3.8716, + "step": 82590 + }, + { + "epoch": 5.611835847261856, + "grad_norm": 0.16335953772068024, + "learning_rate": 2.9880078815056395e-05, + "loss": 3.8507, + "step": 82595 + }, + { + "epoch": 5.6121755673325175, + "grad_norm": 0.19435101747512817, + "learning_rate": 2.9875832314173123e-05, + "loss": 3.7684, + "step": 82600 + }, + { + "epoch": 5.61251528740318, + "grad_norm": 0.18041963875293732, + "learning_rate": 2.9871585813289848e-05, + "loss": 3.797, + "step": 82605 + }, + { + "epoch": 5.612855007473842, + "grad_norm": 0.2330271303653717, + "learning_rate": 2.986733931240658e-05, + "loss": 3.7284, + "step": 82610 + }, + { + "epoch": 5.613194727544503, + "grad_norm": 0.2310645878314972, + "learning_rate": 2.9863092811523307e-05, + "loss": 3.8565, + "step": 82615 + }, + { + "epoch": 5.613534447615165, + "grad_norm": 0.20739080011844635, + "learning_rate": 2.985884631064003e-05, + "loss": 4.139, + "step": 82620 + }, + { + "epoch": 5.613874167685827, + "grad_norm": 0.21432912349700928, + "learning_rate": 2.9854599809756763e-05, + "loss": 4.028, + "step": 82625 + }, + { + "epoch": 5.614213887756488, + "grad_norm": 0.20527642965316772, + "learning_rate": 2.985035330887349e-05, + "loss": 3.8818, + "step": 82630 + }, + { + "epoch": 5.61455360782715, + "grad_norm": 1.720093011856079, + "learning_rate": 2.9846106807990216e-05, + "loss": 3.7216, + "step": 82635 + }, + { + "epoch": 5.614893327897812, + "grad_norm": 0.1754552721977234, + "learning_rate": 2.9841860307106944e-05, + "loss": 3.5769, + "step": 82640 + }, + { + "epoch": 5.615233047968474, + "grad_norm": 0.17456410825252533, + "learning_rate": 2.9837613806223675e-05, + "loss": 3.7359, + "step": 82645 + }, + { + "epoch": 5.615572768039136, + "grad_norm": 0.16162070631980896, + "learning_rate": 2.98333673053404e-05, + "loss": 3.8574, + "step": 82650 + }, + { + "epoch": 5.615912488109798, + "grad_norm": 0.1746436506509781, + "learning_rate": 2.9829120804457128e-05, + "loss": 3.8257, + "step": 82655 + }, + { + "epoch": 5.616252208180459, + "grad_norm": 0.14188537001609802, + "learning_rate": 2.982487430357386e-05, + "loss": 3.987, + "step": 82660 + }, + { + "epoch": 5.616591928251121, + "grad_norm": 0.18658281862735748, + "learning_rate": 2.982062780269058e-05, + "loss": 3.8812, + "step": 82665 + }, + { + "epoch": 5.616931648321783, + "grad_norm": 0.16296513378620148, + "learning_rate": 2.981638130180731e-05, + "loss": 3.6105, + "step": 82670 + }, + { + "epoch": 5.617271368392444, + "grad_norm": 2.2678797245025635, + "learning_rate": 2.981213480092404e-05, + "loss": 3.911, + "step": 82675 + }, + { + "epoch": 5.617611088463106, + "grad_norm": 0.20699474215507507, + "learning_rate": 2.980788830004077e-05, + "loss": 3.8902, + "step": 82680 + }, + { + "epoch": 5.617950808533768, + "grad_norm": 0.20868180692195892, + "learning_rate": 2.9803641799157496e-05, + "loss": 3.8639, + "step": 82685 + }, + { + "epoch": 5.61829052860443, + "grad_norm": 0.14910058677196503, + "learning_rate": 2.9799395298274224e-05, + "loss": 3.743, + "step": 82690 + }, + { + "epoch": 5.618630248675092, + "grad_norm": 0.14784462749958038, + "learning_rate": 2.9795148797390955e-05, + "loss": 3.87, + "step": 82695 + }, + { + "epoch": 5.618969968745754, + "grad_norm": 0.22617174685001373, + "learning_rate": 2.9790902296507676e-05, + "loss": 4.0723, + "step": 82700 + }, + { + "epoch": 5.619309688816415, + "grad_norm": 0.20545676350593567, + "learning_rate": 2.9786655795624408e-05, + "loss": 4.0139, + "step": 82705 + }, + { + "epoch": 5.619649408887077, + "grad_norm": 0.20552705228328705, + "learning_rate": 2.9782409294741136e-05, + "loss": 3.9516, + "step": 82710 + }, + { + "epoch": 5.619989128957739, + "grad_norm": 0.1624002605676651, + "learning_rate": 2.977816279385786e-05, + "loss": 3.7981, + "step": 82715 + }, + { + "epoch": 5.6203288490284, + "grad_norm": 0.1631079912185669, + "learning_rate": 2.9773916292974592e-05, + "loss": 3.5105, + "step": 82720 + }, + { + "epoch": 5.620668569099062, + "grad_norm": 0.3614894151687622, + "learning_rate": 2.976966979209132e-05, + "loss": 3.9667, + "step": 82725 + }, + { + "epoch": 5.621008289169724, + "grad_norm": 0.2261628657579422, + "learning_rate": 2.9765423291208044e-05, + "loss": 3.6792, + "step": 82730 + }, + { + "epoch": 5.621348009240386, + "grad_norm": 0.15438348054885864, + "learning_rate": 2.9761176790324772e-05, + "loss": 3.8758, + "step": 82735 + }, + { + "epoch": 5.621687729311048, + "grad_norm": 0.14057917892932892, + "learning_rate": 2.9756930289441504e-05, + "loss": 3.9295, + "step": 82740 + }, + { + "epoch": 5.62202744938171, + "grad_norm": 0.7438631653785706, + "learning_rate": 2.975268378855823e-05, + "loss": 3.8079, + "step": 82745 + }, + { + "epoch": 5.622367169452371, + "grad_norm": 0.15957044064998627, + "learning_rate": 2.9748437287674956e-05, + "loss": 3.9166, + "step": 82750 + }, + { + "epoch": 5.622706889523033, + "grad_norm": 0.19719696044921875, + "learning_rate": 2.9744190786791688e-05, + "loss": 4.0012, + "step": 82755 + }, + { + "epoch": 5.623046609593695, + "grad_norm": 0.19405896961688995, + "learning_rate": 2.9739944285908412e-05, + "loss": 3.8675, + "step": 82760 + }, + { + "epoch": 5.623386329664356, + "grad_norm": 0.20440062880516052, + "learning_rate": 2.973569778502514e-05, + "loss": 3.8443, + "step": 82765 + }, + { + "epoch": 5.623726049735018, + "grad_norm": 0.3118343949317932, + "learning_rate": 2.9731451284141872e-05, + "loss": 3.8375, + "step": 82770 + }, + { + "epoch": 5.62406576980568, + "grad_norm": 0.20552361011505127, + "learning_rate": 2.9727204783258593e-05, + "loss": 3.8779, + "step": 82775 + }, + { + "epoch": 5.624405489876342, + "grad_norm": 0.3233601152896881, + "learning_rate": 2.9722958282375324e-05, + "loss": 4.0797, + "step": 82780 + }, + { + "epoch": 5.624745209947004, + "grad_norm": 0.16933190822601318, + "learning_rate": 2.9718711781492052e-05, + "loss": 4.1167, + "step": 82785 + }, + { + "epoch": 5.625084930017666, + "grad_norm": 0.1581960916519165, + "learning_rate": 2.9714465280608777e-05, + "loss": 3.8079, + "step": 82790 + }, + { + "epoch": 5.625424650088327, + "grad_norm": 0.17676833271980286, + "learning_rate": 2.971021877972551e-05, + "loss": 3.9744, + "step": 82795 + }, + { + "epoch": 5.625764370158989, + "grad_norm": 0.47396740317344666, + "learning_rate": 2.9705972278842236e-05, + "loss": 4.016, + "step": 82800 + }, + { + "epoch": 5.626104090229651, + "grad_norm": 1.046812653541565, + "learning_rate": 2.970172577795896e-05, + "loss": 3.8758, + "step": 82805 + }, + { + "epoch": 5.626443810300312, + "grad_norm": 0.17691585421562195, + "learning_rate": 2.969747927707569e-05, + "loss": 3.6351, + "step": 82810 + }, + { + "epoch": 5.626783530370974, + "grad_norm": 0.2339118868112564, + "learning_rate": 2.969323277619242e-05, + "loss": 3.7714, + "step": 82815 + }, + { + "epoch": 5.627123250441636, + "grad_norm": 0.2011939138174057, + "learning_rate": 2.9688986275309145e-05, + "loss": 3.7704, + "step": 82820 + }, + { + "epoch": 5.627462970512298, + "grad_norm": 0.2193947583436966, + "learning_rate": 2.9684739774425873e-05, + "loss": 3.7913, + "step": 82825 + }, + { + "epoch": 5.62780269058296, + "grad_norm": 0.18837307393550873, + "learning_rate": 2.9680493273542605e-05, + "loss": 3.7754, + "step": 82830 + }, + { + "epoch": 5.628142410653622, + "grad_norm": 0.36517849564552307, + "learning_rate": 2.9676246772659326e-05, + "loss": 3.567, + "step": 82835 + }, + { + "epoch": 5.628482130724283, + "grad_norm": 7.440813064575195, + "learning_rate": 2.9672000271776057e-05, + "loss": 4.0265, + "step": 82840 + }, + { + "epoch": 5.628821850794945, + "grad_norm": 0.15979696810245514, + "learning_rate": 2.9667753770892785e-05, + "loss": 3.8785, + "step": 82845 + }, + { + "epoch": 5.629161570865607, + "grad_norm": 0.1485421061515808, + "learning_rate": 2.9663507270009517e-05, + "loss": 3.6698, + "step": 82850 + }, + { + "epoch": 5.629501290936268, + "grad_norm": 0.1886965036392212, + "learning_rate": 2.965926076912624e-05, + "loss": 4.2296, + "step": 82855 + }, + { + "epoch": 5.62984101100693, + "grad_norm": 0.5104491710662842, + "learning_rate": 2.965501426824297e-05, + "loss": 3.9022, + "step": 82860 + }, + { + "epoch": 5.630180731077592, + "grad_norm": 0.13451987504959106, + "learning_rate": 2.96507677673597e-05, + "loss": 3.5424, + "step": 82865 + }, + { + "epoch": 5.630520451148254, + "grad_norm": 0.1675197035074234, + "learning_rate": 2.9646521266476425e-05, + "loss": 3.8301, + "step": 82870 + }, + { + "epoch": 5.630860171218916, + "grad_norm": 0.1793145090341568, + "learning_rate": 2.9642274765593153e-05, + "loss": 3.622, + "step": 82875 + }, + { + "epoch": 5.631199891289578, + "grad_norm": 0.1878010779619217, + "learning_rate": 2.963802826470988e-05, + "loss": 3.8552, + "step": 82880 + }, + { + "epoch": 5.631539611360239, + "grad_norm": 0.272343248128891, + "learning_rate": 2.9633781763826606e-05, + "loss": 3.7576, + "step": 82885 + }, + { + "epoch": 5.631879331430901, + "grad_norm": 0.2593172490596771, + "learning_rate": 2.9629535262943337e-05, + "loss": 3.9481, + "step": 82890 + }, + { + "epoch": 5.632219051501563, + "grad_norm": 0.23741741478443146, + "learning_rate": 2.9625288762060065e-05, + "loss": 3.9148, + "step": 82895 + }, + { + "epoch": 5.632558771572224, + "grad_norm": 0.17792044579982758, + "learning_rate": 2.962104226117679e-05, + "loss": 3.8031, + "step": 82900 + }, + { + "epoch": 5.632898491642886, + "grad_norm": 0.12251744419336319, + "learning_rate": 2.961679576029352e-05, + "loss": 4.0173, + "step": 82905 + }, + { + "epoch": 5.633238211713548, + "grad_norm": 0.1594085991382599, + "learning_rate": 2.961254925941025e-05, + "loss": 3.9063, + "step": 82910 + }, + { + "epoch": 5.63357793178421, + "grad_norm": 0.1436924785375595, + "learning_rate": 2.9608302758526974e-05, + "loss": 3.9463, + "step": 82915 + }, + { + "epoch": 5.633917651854872, + "grad_norm": 0.1802213042974472, + "learning_rate": 2.9604056257643702e-05, + "loss": 3.8359, + "step": 82920 + }, + { + "epoch": 5.634257371925534, + "grad_norm": 0.181314155459404, + "learning_rate": 2.9599809756760433e-05, + "loss": 3.7043, + "step": 82925 + }, + { + "epoch": 5.634597091996195, + "grad_norm": 0.2112465798854828, + "learning_rate": 2.9595563255877158e-05, + "loss": 3.9029, + "step": 82930 + }, + { + "epoch": 5.634936812066857, + "grad_norm": 0.33204445242881775, + "learning_rate": 2.9591316754993886e-05, + "loss": 3.825, + "step": 82935 + }, + { + "epoch": 5.635276532137519, + "grad_norm": 0.13935662806034088, + "learning_rate": 2.9587070254110617e-05, + "loss": 3.4793, + "step": 82940 + }, + { + "epoch": 5.63561625220818, + "grad_norm": 0.3159787654876709, + "learning_rate": 2.958282375322734e-05, + "loss": 3.8541, + "step": 82945 + }, + { + "epoch": 5.635955972278842, + "grad_norm": 13.053560256958008, + "learning_rate": 2.957857725234407e-05, + "loss": 3.7833, + "step": 82950 + }, + { + "epoch": 5.6362956923495044, + "grad_norm": 0.1360938996076584, + "learning_rate": 2.9574330751460798e-05, + "loss": 3.8262, + "step": 82955 + }, + { + "epoch": 5.636635412420166, + "grad_norm": 0.1997787356376648, + "learning_rate": 2.9570084250577523e-05, + "loss": 3.9884, + "step": 82960 + }, + { + "epoch": 5.636975132490828, + "grad_norm": 0.13816256821155548, + "learning_rate": 2.9565837749694254e-05, + "loss": 3.7671, + "step": 82965 + }, + { + "epoch": 5.63731485256149, + "grad_norm": 0.15220661461353302, + "learning_rate": 2.9561591248810982e-05, + "loss": 3.7228, + "step": 82970 + }, + { + "epoch": 5.637654572632151, + "grad_norm": 0.15705506503582, + "learning_rate": 2.9557344747927707e-05, + "loss": 4.1978, + "step": 82975 + }, + { + "epoch": 5.637994292702813, + "grad_norm": 0.17424376308918, + "learning_rate": 2.9553098247044435e-05, + "loss": 3.6576, + "step": 82980 + }, + { + "epoch": 5.638334012773475, + "grad_norm": 0.2298591583967209, + "learning_rate": 2.9548851746161166e-05, + "loss": 3.8744, + "step": 82985 + }, + { + "epoch": 5.638673732844136, + "grad_norm": 0.18835166096687317, + "learning_rate": 2.954460524527789e-05, + "loss": 3.9623, + "step": 82990 + }, + { + "epoch": 5.639013452914798, + "grad_norm": 0.14675386250019073, + "learning_rate": 2.954035874439462e-05, + "loss": 3.7058, + "step": 82995 + }, + { + "epoch": 5.6393531729854605, + "grad_norm": 0.14376573264598846, + "learning_rate": 2.953611224351135e-05, + "loss": 3.9587, + "step": 83000 + }, + { + "epoch": 5.639692893056122, + "grad_norm": 0.15324139595031738, + "learning_rate": 2.9531865742628075e-05, + "loss": 3.6694, + "step": 83005 + }, + { + "epoch": 5.640032613126784, + "grad_norm": 0.22318464517593384, + "learning_rate": 2.9527619241744803e-05, + "loss": 4.0287, + "step": 83010 + }, + { + "epoch": 5.640372333197446, + "grad_norm": 0.2266695201396942, + "learning_rate": 2.9523372740861534e-05, + "loss": 3.8871, + "step": 83015 + }, + { + "epoch": 5.640712053268107, + "grad_norm": 0.21675071120262146, + "learning_rate": 2.9519126239978262e-05, + "loss": 3.6472, + "step": 83020 + }, + { + "epoch": 5.641051773338769, + "grad_norm": 0.19038142263889313, + "learning_rate": 2.9514879739094987e-05, + "loss": 3.9748, + "step": 83025 + }, + { + "epoch": 5.64139149340943, + "grad_norm": 0.1761888712644577, + "learning_rate": 2.9510633238211715e-05, + "loss": 3.7723, + "step": 83030 + }, + { + "epoch": 5.641731213480092, + "grad_norm": 0.156538724899292, + "learning_rate": 2.9506386737328446e-05, + "loss": 3.8202, + "step": 83035 + }, + { + "epoch": 5.642070933550754, + "grad_norm": 0.17837630212306976, + "learning_rate": 2.950214023644517e-05, + "loss": 3.9524, + "step": 83040 + }, + { + "epoch": 5.642410653621416, + "grad_norm": 0.22098803520202637, + "learning_rate": 2.94978937355619e-05, + "loss": 3.6583, + "step": 83045 + }, + { + "epoch": 5.642750373692078, + "grad_norm": 0.17674271762371063, + "learning_rate": 2.949364723467863e-05, + "loss": 3.8467, + "step": 83050 + }, + { + "epoch": 5.64309009376274, + "grad_norm": 0.18341729044914246, + "learning_rate": 2.948940073379535e-05, + "loss": 3.89, + "step": 83055 + }, + { + "epoch": 5.643429813833401, + "grad_norm": 0.5191129446029663, + "learning_rate": 2.9485154232912083e-05, + "loss": 3.9397, + "step": 83060 + }, + { + "epoch": 5.643769533904063, + "grad_norm": 0.15145061910152435, + "learning_rate": 2.948090773202881e-05, + "loss": 3.894, + "step": 83065 + }, + { + "epoch": 5.644109253974725, + "grad_norm": 0.1573423147201538, + "learning_rate": 2.9476661231145535e-05, + "loss": 3.833, + "step": 83070 + }, + { + "epoch": 5.644448974045386, + "grad_norm": 0.15245652198791504, + "learning_rate": 2.9472414730262267e-05, + "loss": 3.7706, + "step": 83075 + }, + { + "epoch": 5.644788694116048, + "grad_norm": 0.14686430990695953, + "learning_rate": 2.9468168229378995e-05, + "loss": 3.6349, + "step": 83080 + }, + { + "epoch": 5.64512841418671, + "grad_norm": 0.6079075336456299, + "learning_rate": 2.946392172849572e-05, + "loss": 3.7963, + "step": 83085 + }, + { + "epoch": 5.645468134257372, + "grad_norm": 0.1893862783908844, + "learning_rate": 2.9459675227612447e-05, + "loss": 3.7826, + "step": 83090 + }, + { + "epoch": 5.645807854328034, + "grad_norm": 0.1622413545846939, + "learning_rate": 2.945542872672918e-05, + "loss": 3.9504, + "step": 83095 + }, + { + "epoch": 5.646147574398696, + "grad_norm": 0.17958110570907593, + "learning_rate": 2.9451182225845903e-05, + "loss": 3.9588, + "step": 83100 + }, + { + "epoch": 5.646487294469357, + "grad_norm": 0.23479604721069336, + "learning_rate": 2.944693572496263e-05, + "loss": 3.774, + "step": 83105 + }, + { + "epoch": 5.646827014540019, + "grad_norm": 0.1440928429365158, + "learning_rate": 2.9442689224079363e-05, + "loss": 3.6769, + "step": 83110 + }, + { + "epoch": 5.647166734610681, + "grad_norm": 0.21812096238136292, + "learning_rate": 2.9438442723196087e-05, + "loss": 3.9926, + "step": 83115 + }, + { + "epoch": 5.647506454681342, + "grad_norm": 0.34516340494155884, + "learning_rate": 2.9434196222312815e-05, + "loss": 3.9206, + "step": 83120 + }, + { + "epoch": 5.647846174752004, + "grad_norm": 0.1782693862915039, + "learning_rate": 2.9429949721429543e-05, + "loss": 3.7076, + "step": 83125 + }, + { + "epoch": 5.648185894822666, + "grad_norm": 0.19672977924346924, + "learning_rate": 2.9425703220546268e-05, + "loss": 4.0108, + "step": 83130 + }, + { + "epoch": 5.648525614893328, + "grad_norm": 0.17492203414440155, + "learning_rate": 2.9421456719663e-05, + "loss": 3.7808, + "step": 83135 + }, + { + "epoch": 5.64886533496399, + "grad_norm": 0.9353084564208984, + "learning_rate": 2.9417210218779727e-05, + "loss": 3.7872, + "step": 83140 + }, + { + "epoch": 5.649205055034652, + "grad_norm": 1.3190656900405884, + "learning_rate": 2.9412963717896452e-05, + "loss": 3.7023, + "step": 83145 + }, + { + "epoch": 5.649544775105313, + "grad_norm": 0.16222213208675385, + "learning_rate": 2.9408717217013183e-05, + "loss": 3.8738, + "step": 83150 + }, + { + "epoch": 5.649884495175975, + "grad_norm": 0.17499615252017975, + "learning_rate": 2.940447071612991e-05, + "loss": 3.8073, + "step": 83155 + }, + { + "epoch": 5.650224215246637, + "grad_norm": 0.18653513491153717, + "learning_rate": 2.9400224215246636e-05, + "loss": 4.0649, + "step": 83160 + }, + { + "epoch": 5.650563935317298, + "grad_norm": 0.20725283026695251, + "learning_rate": 2.9395977714363364e-05, + "loss": 3.9893, + "step": 83165 + }, + { + "epoch": 5.65090365538796, + "grad_norm": 0.18083949387073517, + "learning_rate": 2.9391731213480096e-05, + "loss": 3.9518, + "step": 83170 + }, + { + "epoch": 5.651243375458622, + "grad_norm": 0.20022207498550415, + "learning_rate": 2.938748471259682e-05, + "loss": 3.8959, + "step": 83175 + }, + { + "epoch": 5.651583095529284, + "grad_norm": 0.1842350959777832, + "learning_rate": 2.9383238211713548e-05, + "loss": 3.7787, + "step": 83180 + }, + { + "epoch": 5.651922815599946, + "grad_norm": 0.21778813004493713, + "learning_rate": 2.937899171083028e-05, + "loss": 3.7427, + "step": 83185 + }, + { + "epoch": 5.652262535670608, + "grad_norm": 0.3394336998462677, + "learning_rate": 2.9374745209947008e-05, + "loss": 3.6329, + "step": 83190 + }, + { + "epoch": 5.652602255741269, + "grad_norm": 0.15596291422843933, + "learning_rate": 2.9370498709063732e-05, + "loss": 3.8696, + "step": 83195 + }, + { + "epoch": 5.652941975811931, + "grad_norm": 0.12118133157491684, + "learning_rate": 2.936625220818046e-05, + "loss": 3.8626, + "step": 83200 + }, + { + "epoch": 5.653281695882593, + "grad_norm": 0.15132737159729004, + "learning_rate": 2.936200570729719e-05, + "loss": 3.7914, + "step": 83205 + }, + { + "epoch": 5.653621415953254, + "grad_norm": 0.1768975555896759, + "learning_rate": 2.9357759206413916e-05, + "loss": 3.9966, + "step": 83210 + }, + { + "epoch": 5.653961136023916, + "grad_norm": 0.1995272934436798, + "learning_rate": 2.9353512705530644e-05, + "loss": 3.7708, + "step": 83215 + }, + { + "epoch": 5.654300856094578, + "grad_norm": 0.15967482328414917, + "learning_rate": 2.9349266204647376e-05, + "loss": 3.8902, + "step": 83220 + }, + { + "epoch": 5.65464057616524, + "grad_norm": 0.18351605534553528, + "learning_rate": 2.9345019703764097e-05, + "loss": 3.9123, + "step": 83225 + }, + { + "epoch": 5.654980296235902, + "grad_norm": 0.1536710262298584, + "learning_rate": 2.9340773202880828e-05, + "loss": 3.6259, + "step": 83230 + }, + { + "epoch": 5.655320016306564, + "grad_norm": 0.18604178726673126, + "learning_rate": 2.9336526701997556e-05, + "loss": 3.7119, + "step": 83235 + }, + { + "epoch": 5.655659736377225, + "grad_norm": 0.1918763369321823, + "learning_rate": 2.933228020111428e-05, + "loss": 3.8023, + "step": 83240 + }, + { + "epoch": 5.655999456447887, + "grad_norm": 0.16560418903827667, + "learning_rate": 2.9328033700231012e-05, + "loss": 3.7638, + "step": 83245 + }, + { + "epoch": 5.656339176518548, + "grad_norm": 0.14637179672718048, + "learning_rate": 2.932378719934774e-05, + "loss": 3.6471, + "step": 83250 + }, + { + "epoch": 5.65667889658921, + "grad_norm": 0.17613770067691803, + "learning_rate": 2.9319540698464465e-05, + "loss": 4.0844, + "step": 83255 + }, + { + "epoch": 5.657018616659872, + "grad_norm": 0.20106077194213867, + "learning_rate": 2.9315294197581193e-05, + "loss": 3.6809, + "step": 83260 + }, + { + "epoch": 5.657358336730534, + "grad_norm": 0.17073707282543182, + "learning_rate": 2.9311047696697924e-05, + "loss": 3.8857, + "step": 83265 + }, + { + "epoch": 5.657698056801196, + "grad_norm": 0.12879781424999237, + "learning_rate": 2.930680119581465e-05, + "loss": 4.1076, + "step": 83270 + }, + { + "epoch": 5.658037776871858, + "grad_norm": 0.1763564497232437, + "learning_rate": 2.9302554694931377e-05, + "loss": 3.517, + "step": 83275 + }, + { + "epoch": 5.658377496942519, + "grad_norm": 0.20969310402870178, + "learning_rate": 2.9298308194048108e-05, + "loss": 3.6931, + "step": 83280 + }, + { + "epoch": 5.658717217013181, + "grad_norm": 0.1711921989917755, + "learning_rate": 2.9294061693164833e-05, + "loss": 3.7181, + "step": 83285 + }, + { + "epoch": 5.659056937083843, + "grad_norm": 0.17298489809036255, + "learning_rate": 2.928981519228156e-05, + "loss": 4.0389, + "step": 83290 + }, + { + "epoch": 5.659396657154504, + "grad_norm": 0.18086650967597961, + "learning_rate": 2.9285568691398292e-05, + "loss": 3.3915, + "step": 83295 + }, + { + "epoch": 5.659736377225166, + "grad_norm": 0.16978931427001953, + "learning_rate": 2.9281322190515014e-05, + "loss": 3.6245, + "step": 83300 + }, + { + "epoch": 5.660076097295828, + "grad_norm": 0.17924635112285614, + "learning_rate": 2.9277924989808396e-05, + "loss": 3.7128, + "step": 83305 + }, + { + "epoch": 5.66041581736649, + "grad_norm": 0.19551976025104523, + "learning_rate": 2.9273678488925127e-05, + "loss": 3.6357, + "step": 83310 + }, + { + "epoch": 5.660755537437152, + "grad_norm": 0.16476601362228394, + "learning_rate": 2.9269431988041855e-05, + "loss": 3.7055, + "step": 83315 + }, + { + "epoch": 5.661095257507814, + "grad_norm": 0.18846797943115234, + "learning_rate": 2.926518548715858e-05, + "loss": 3.8245, + "step": 83320 + }, + { + "epoch": 5.661434977578475, + "grad_norm": 0.1919729858636856, + "learning_rate": 2.926093898627531e-05, + "loss": 3.8384, + "step": 83325 + }, + { + "epoch": 5.661774697649137, + "grad_norm": 0.18664862215518951, + "learning_rate": 2.925669248539204e-05, + "loss": 4.025, + "step": 83330 + }, + { + "epoch": 5.662114417719799, + "grad_norm": 0.13782340288162231, + "learning_rate": 2.9252445984508764e-05, + "loss": 3.68, + "step": 83335 + }, + { + "epoch": 5.66245413779046, + "grad_norm": 0.16646914184093475, + "learning_rate": 2.9248199483625492e-05, + "loss": 3.8793, + "step": 83340 + }, + { + "epoch": 5.662793857861122, + "grad_norm": 3.15120792388916, + "learning_rate": 2.9243952982742223e-05, + "loss": 3.8462, + "step": 83345 + }, + { + "epoch": 5.663133577931784, + "grad_norm": 0.21389178931713104, + "learning_rate": 2.9239706481858948e-05, + "loss": 3.7504, + "step": 83350 + }, + { + "epoch": 5.663473298002446, + "grad_norm": 0.20090673863887787, + "learning_rate": 2.9235459980975676e-05, + "loss": 3.8691, + "step": 83355 + }, + { + "epoch": 5.663813018073108, + "grad_norm": 0.18039435148239136, + "learning_rate": 2.9231213480092407e-05, + "loss": 4.0466, + "step": 83360 + }, + { + "epoch": 5.66415273814377, + "grad_norm": 0.20697617530822754, + "learning_rate": 2.9226966979209132e-05, + "loss": 3.7371, + "step": 83365 + }, + { + "epoch": 5.664492458214431, + "grad_norm": 0.16424135863780975, + "learning_rate": 2.922272047832586e-05, + "loss": 3.6603, + "step": 83370 + }, + { + "epoch": 5.664832178285093, + "grad_norm": 0.14735214412212372, + "learning_rate": 2.9218473977442588e-05, + "loss": 3.7108, + "step": 83375 + }, + { + "epoch": 5.665171898355755, + "grad_norm": 0.14734973013401031, + "learning_rate": 2.9214227476559313e-05, + "loss": 3.768, + "step": 83380 + }, + { + "epoch": 5.665511618426416, + "grad_norm": 0.4915100336074829, + "learning_rate": 2.9209980975676044e-05, + "loss": 3.5962, + "step": 83385 + }, + { + "epoch": 5.665851338497078, + "grad_norm": 0.15549039840698242, + "learning_rate": 2.9205734474792772e-05, + "loss": 3.9682, + "step": 83390 + }, + { + "epoch": 5.66619105856774, + "grad_norm": 0.1675119698047638, + "learning_rate": 2.9201487973909503e-05, + "loss": 3.8175, + "step": 83395 + }, + { + "epoch": 5.666530778638402, + "grad_norm": 0.19907167553901672, + "learning_rate": 2.9197241473026228e-05, + "loss": 3.7112, + "step": 83400 + }, + { + "epoch": 5.666870498709064, + "grad_norm": 0.1949111968278885, + "learning_rate": 2.9192994972142956e-05, + "loss": 3.9654, + "step": 83405 + }, + { + "epoch": 5.667210218779726, + "grad_norm": 0.15814541280269623, + "learning_rate": 2.9188748471259688e-05, + "loss": 3.6161, + "step": 83410 + }, + { + "epoch": 5.667549938850387, + "grad_norm": 0.20836730301380157, + "learning_rate": 2.918450197037641e-05, + "loss": 3.9131, + "step": 83415 + }, + { + "epoch": 5.667889658921049, + "grad_norm": 0.16399677097797394, + "learning_rate": 2.918025546949314e-05, + "loss": 3.8682, + "step": 83420 + }, + { + "epoch": 5.668229378991711, + "grad_norm": 0.13990828394889832, + "learning_rate": 2.9176008968609868e-05, + "loss": 3.6735, + "step": 83425 + }, + { + "epoch": 5.668569099062372, + "grad_norm": 0.17374281585216522, + "learning_rate": 2.9171762467726593e-05, + "loss": 3.7512, + "step": 83430 + }, + { + "epoch": 5.668908819133034, + "grad_norm": 0.14784905314445496, + "learning_rate": 2.9167515966843324e-05, + "loss": 3.9684, + "step": 83435 + }, + { + "epoch": 5.669248539203696, + "grad_norm": 0.21313919126987457, + "learning_rate": 2.9163269465960052e-05, + "loss": 3.7956, + "step": 83440 + }, + { + "epoch": 5.669588259274358, + "grad_norm": 0.16831237077713013, + "learning_rate": 2.9159022965076777e-05, + "loss": 4.1581, + "step": 83445 + }, + { + "epoch": 5.66992797934502, + "grad_norm": 0.23878076672554016, + "learning_rate": 2.9154776464193505e-05, + "loss": 3.7602, + "step": 83450 + }, + { + "epoch": 5.670267699415682, + "grad_norm": 0.16425523161888123, + "learning_rate": 2.9150529963310236e-05, + "loss": 3.86, + "step": 83455 + }, + { + "epoch": 5.670607419486343, + "grad_norm": 0.19822731614112854, + "learning_rate": 2.914628346242696e-05, + "loss": 3.9242, + "step": 83460 + }, + { + "epoch": 5.670947139557005, + "grad_norm": 0.16095668077468872, + "learning_rate": 2.914203696154369e-05, + "loss": 4.0133, + "step": 83465 + }, + { + "epoch": 5.671286859627667, + "grad_norm": 0.269173264503479, + "learning_rate": 2.913779046066042e-05, + "loss": 3.8335, + "step": 83470 + }, + { + "epoch": 5.671626579698328, + "grad_norm": 0.16792616248130798, + "learning_rate": 2.913354395977714e-05, + "loss": 3.6026, + "step": 83475 + }, + { + "epoch": 5.67196629976899, + "grad_norm": 0.21158432960510254, + "learning_rate": 2.9129297458893873e-05, + "loss": 3.7504, + "step": 83480 + }, + { + "epoch": 5.672306019839652, + "grad_norm": 0.37885284423828125, + "learning_rate": 2.91250509580106e-05, + "loss": 3.8698, + "step": 83485 + }, + { + "epoch": 5.672645739910314, + "grad_norm": 0.15224353969097137, + "learning_rate": 2.9120804457127325e-05, + "loss": 4.0176, + "step": 83490 + }, + { + "epoch": 5.672985459980976, + "grad_norm": 0.2516126036643982, + "learning_rate": 2.9116557956244057e-05, + "loss": 3.8132, + "step": 83495 + }, + { + "epoch": 5.673325180051638, + "grad_norm": 0.14538632333278656, + "learning_rate": 2.9112311455360785e-05, + "loss": 3.9685, + "step": 83500 + }, + { + "epoch": 5.673664900122299, + "grad_norm": 0.17919300496578217, + "learning_rate": 2.910806495447751e-05, + "loss": 3.8122, + "step": 83505 + }, + { + "epoch": 5.674004620192961, + "grad_norm": 0.24166817963123322, + "learning_rate": 2.910381845359424e-05, + "loss": 3.8619, + "step": 83510 + }, + { + "epoch": 5.674344340263623, + "grad_norm": 0.24824847280979156, + "learning_rate": 2.909957195271097e-05, + "loss": 3.8763, + "step": 83515 + }, + { + "epoch": 5.674684060334284, + "grad_norm": 0.13492818176746368, + "learning_rate": 2.9095325451827694e-05, + "loss": 3.9027, + "step": 83520 + }, + { + "epoch": 5.675023780404946, + "grad_norm": 0.15003974735736847, + "learning_rate": 2.909107895094442e-05, + "loss": 3.6359, + "step": 83525 + }, + { + "epoch": 5.6753635004756084, + "grad_norm": 1.2369078397750854, + "learning_rate": 2.9086832450061153e-05, + "loss": 3.9725, + "step": 83530 + }, + { + "epoch": 5.67570322054627, + "grad_norm": 0.19180165231227875, + "learning_rate": 2.9082585949177878e-05, + "loss": 3.7762, + "step": 83535 + }, + { + "epoch": 5.676042940616932, + "grad_norm": 0.19645798206329346, + "learning_rate": 2.9078339448294606e-05, + "loss": 3.92, + "step": 83540 + }, + { + "epoch": 5.676382660687594, + "grad_norm": 0.2066870778799057, + "learning_rate": 2.9074092947411337e-05, + "loss": 3.7721, + "step": 83545 + }, + { + "epoch": 5.676722380758255, + "grad_norm": 0.1598387360572815, + "learning_rate": 2.9069846446528058e-05, + "loss": 3.5383, + "step": 83550 + }, + { + "epoch": 5.677062100828917, + "grad_norm": 0.1623648703098297, + "learning_rate": 2.906559994564479e-05, + "loss": 3.8033, + "step": 83555 + }, + { + "epoch": 5.677401820899579, + "grad_norm": 0.16692954301834106, + "learning_rate": 2.9061353444761518e-05, + "loss": 3.8621, + "step": 83560 + }, + { + "epoch": 5.67774154097024, + "grad_norm": 0.17898179590702057, + "learning_rate": 2.905710694387825e-05, + "loss": 3.8667, + "step": 83565 + }, + { + "epoch": 5.678081261040902, + "grad_norm": 0.5346291065216064, + "learning_rate": 2.9052860442994974e-05, + "loss": 3.8695, + "step": 83570 + }, + { + "epoch": 5.6784209811115645, + "grad_norm": 0.16245298087596893, + "learning_rate": 2.90486139421117e-05, + "loss": 3.7007, + "step": 83575 + }, + { + "epoch": 5.678760701182226, + "grad_norm": 0.151051864027977, + "learning_rate": 2.9044367441228433e-05, + "loss": 3.7604, + "step": 83580 + }, + { + "epoch": 5.679100421252888, + "grad_norm": 0.20730581879615784, + "learning_rate": 2.9040120940345154e-05, + "loss": 3.9031, + "step": 83585 + }, + { + "epoch": 5.67944014132355, + "grad_norm": 0.15818962454795837, + "learning_rate": 2.9035874439461886e-05, + "loss": 3.835, + "step": 83590 + }, + { + "epoch": 5.679779861394211, + "grad_norm": 0.17004868388175964, + "learning_rate": 2.9031627938578614e-05, + "loss": 3.7627, + "step": 83595 + }, + { + "epoch": 5.680119581464873, + "grad_norm": 0.1765996366739273, + "learning_rate": 2.9027381437695338e-05, + "loss": 3.8956, + "step": 83600 + }, + { + "epoch": 5.680459301535535, + "grad_norm": 0.1949256807565689, + "learning_rate": 2.902313493681207e-05, + "loss": 3.8497, + "step": 83605 + }, + { + "epoch": 5.680799021606196, + "grad_norm": 0.14569425582885742, + "learning_rate": 2.9018888435928798e-05, + "loss": 3.908, + "step": 83610 + }, + { + "epoch": 5.681138741676858, + "grad_norm": 0.14103281497955322, + "learning_rate": 2.9014641935045522e-05, + "loss": 3.9549, + "step": 83615 + }, + { + "epoch": 5.6814784617475205, + "grad_norm": 0.1651393324136734, + "learning_rate": 2.901039543416225e-05, + "loss": 3.6542, + "step": 83620 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 0.15967866778373718, + "learning_rate": 2.900614893327898e-05, + "loss": 3.9562, + "step": 83625 + }, + { + "epoch": 5.682157901888844, + "grad_norm": 0.18891659379005432, + "learning_rate": 2.9001902432395706e-05, + "loss": 3.8575, + "step": 83630 + }, + { + "epoch": 5.682497621959506, + "grad_norm": 0.15611638128757477, + "learning_rate": 2.8997655931512434e-05, + "loss": 3.7305, + "step": 83635 + }, + { + "epoch": 5.682837342030167, + "grad_norm": 0.276801198720932, + "learning_rate": 2.8993409430629166e-05, + "loss": 4.1191, + "step": 83640 + }, + { + "epoch": 5.683177062100829, + "grad_norm": 0.15906265377998352, + "learning_rate": 2.898916292974589e-05, + "loss": 3.8293, + "step": 83645 + }, + { + "epoch": 5.683516782171491, + "grad_norm": 0.18043501675128937, + "learning_rate": 2.898491642886262e-05, + "loss": 3.5084, + "step": 83650 + }, + { + "epoch": 5.683856502242152, + "grad_norm": 0.23929733037948608, + "learning_rate": 2.898066992797935e-05, + "loss": 3.4741, + "step": 83655 + }, + { + "epoch": 5.684196222312814, + "grad_norm": 0.11779074370861053, + "learning_rate": 2.897642342709607e-05, + "loss": 3.9385, + "step": 83660 + }, + { + "epoch": 5.6845359423834765, + "grad_norm": 0.16833360493183136, + "learning_rate": 2.8972176926212802e-05, + "loss": 3.7121, + "step": 83665 + }, + { + "epoch": 5.684875662454138, + "grad_norm": 0.2755311131477356, + "learning_rate": 2.896793042532953e-05, + "loss": 3.8742, + "step": 83670 + }, + { + "epoch": 5.6852153825248, + "grad_norm": 0.1551896184682846, + "learning_rate": 2.8963683924446255e-05, + "loss": 3.7275, + "step": 83675 + }, + { + "epoch": 5.685555102595462, + "grad_norm": 0.23808611929416656, + "learning_rate": 2.8959437423562986e-05, + "loss": 4.0943, + "step": 83680 + }, + { + "epoch": 5.685894822666123, + "grad_norm": 1.3335087299346924, + "learning_rate": 2.8955190922679714e-05, + "loss": 3.9424, + "step": 83685 + }, + { + "epoch": 5.686234542736785, + "grad_norm": 0.1901344656944275, + "learning_rate": 2.895094442179644e-05, + "loss": 3.8412, + "step": 83690 + }, + { + "epoch": 5.686574262807447, + "grad_norm": 0.2618623375892639, + "learning_rate": 2.8946697920913167e-05, + "loss": 3.78, + "step": 83695 + }, + { + "epoch": 5.686913982878108, + "grad_norm": 0.17195852100849152, + "learning_rate": 2.89424514200299e-05, + "loss": 3.8022, + "step": 83700 + }, + { + "epoch": 5.68725370294877, + "grad_norm": 0.16855421662330627, + "learning_rate": 2.8938204919146623e-05, + "loss": 3.6619, + "step": 83705 + }, + { + "epoch": 5.687593423019432, + "grad_norm": 3.029052495956421, + "learning_rate": 2.893395841826335e-05, + "loss": 3.4726, + "step": 83710 + }, + { + "epoch": 5.687933143090094, + "grad_norm": 0.18403294682502747, + "learning_rate": 2.8929711917380082e-05, + "loss": 3.8356, + "step": 83715 + }, + { + "epoch": 5.688272863160756, + "grad_norm": 0.14702080190181732, + "learning_rate": 2.8925465416496804e-05, + "loss": 3.793, + "step": 83720 + }, + { + "epoch": 5.688612583231417, + "grad_norm": 0.1990588754415512, + "learning_rate": 2.8921218915613535e-05, + "loss": 3.8297, + "step": 83725 + }, + { + "epoch": 5.688952303302079, + "grad_norm": 0.17446213960647583, + "learning_rate": 2.8916972414730263e-05, + "loss": 3.9965, + "step": 83730 + }, + { + "epoch": 5.689292023372741, + "grad_norm": 0.36991599202156067, + "learning_rate": 2.8912725913846994e-05, + "loss": 3.7948, + "step": 83735 + }, + { + "epoch": 5.689631743443402, + "grad_norm": 0.16051064431667328, + "learning_rate": 2.890847941296372e-05, + "loss": 3.8485, + "step": 83740 + }, + { + "epoch": 5.689971463514064, + "grad_norm": 0.15767550468444824, + "learning_rate": 2.8904232912080447e-05, + "loss": 3.8543, + "step": 83745 + }, + { + "epoch": 5.690311183584726, + "grad_norm": 0.15114298462867737, + "learning_rate": 2.889998641119718e-05, + "loss": 3.8555, + "step": 83750 + }, + { + "epoch": 5.690650903655388, + "grad_norm": 0.1777687817811966, + "learning_rate": 2.8895739910313903e-05, + "loss": 3.9421, + "step": 83755 + }, + { + "epoch": 5.69099062372605, + "grad_norm": 0.14862403273582458, + "learning_rate": 2.889149340943063e-05, + "loss": 3.834, + "step": 83760 + }, + { + "epoch": 5.691330343796712, + "grad_norm": 0.4670039415359497, + "learning_rate": 2.888724690854736e-05, + "loss": 4.0427, + "step": 83765 + }, + { + "epoch": 5.691670063867373, + "grad_norm": 0.18925829231739044, + "learning_rate": 2.8883000407664084e-05, + "loss": 3.8268, + "step": 83770 + }, + { + "epoch": 5.692009783938035, + "grad_norm": 0.19880196452140808, + "learning_rate": 2.8878753906780815e-05, + "loss": 3.8235, + "step": 83775 + }, + { + "epoch": 5.692349504008697, + "grad_norm": 0.15736696124076843, + "learning_rate": 2.8874507405897543e-05, + "loss": 3.9986, + "step": 83780 + }, + { + "epoch": 5.692689224079358, + "grad_norm": 0.17113469541072845, + "learning_rate": 2.8870260905014268e-05, + "loss": 3.9197, + "step": 83785 + }, + { + "epoch": 5.69302894415002, + "grad_norm": 0.20354008674621582, + "learning_rate": 2.8866014404131e-05, + "loss": 3.8406, + "step": 83790 + }, + { + "epoch": 5.693368664220682, + "grad_norm": 0.17924487590789795, + "learning_rate": 2.8861767903247727e-05, + "loss": 3.9416, + "step": 83795 + }, + { + "epoch": 5.693708384291344, + "grad_norm": 2.320963144302368, + "learning_rate": 2.8857521402364452e-05, + "loss": 3.9549, + "step": 83800 + }, + { + "epoch": 5.694048104362006, + "grad_norm": 0.12954330444335938, + "learning_rate": 2.885327490148118e-05, + "loss": 3.6349, + "step": 83805 + }, + { + "epoch": 5.694387824432668, + "grad_norm": 0.29036691784858704, + "learning_rate": 2.884902840059791e-05, + "loss": 3.8184, + "step": 83810 + }, + { + "epoch": 5.694727544503329, + "grad_norm": 0.20923592150211334, + "learning_rate": 2.8844781899714636e-05, + "loss": 3.8828, + "step": 83815 + }, + { + "epoch": 5.695067264573991, + "grad_norm": 0.18361559510231018, + "learning_rate": 2.8840535398831364e-05, + "loss": 3.7846, + "step": 83820 + }, + { + "epoch": 5.695406984644653, + "grad_norm": 0.754600465297699, + "learning_rate": 2.8836288897948095e-05, + "loss": 3.8166, + "step": 83825 + }, + { + "epoch": 5.695746704715314, + "grad_norm": 0.2741512358188629, + "learning_rate": 2.8832042397064816e-05, + "loss": 3.8738, + "step": 83830 + }, + { + "epoch": 5.696086424785976, + "grad_norm": 0.24275408685207367, + "learning_rate": 2.8827795896181548e-05, + "loss": 4.0946, + "step": 83835 + }, + { + "epoch": 5.6964261448566385, + "grad_norm": 0.17275561392307281, + "learning_rate": 2.8823549395298276e-05, + "loss": 3.7004, + "step": 83840 + }, + { + "epoch": 5.6967658649273, + "grad_norm": 0.18428467214107513, + "learning_rate": 2.8819302894415e-05, + "loss": 3.807, + "step": 83845 + }, + { + "epoch": 5.697105584997962, + "grad_norm": 0.1729269176721573, + "learning_rate": 2.8815056393531732e-05, + "loss": 3.8232, + "step": 83850 + }, + { + "epoch": 5.697445305068624, + "grad_norm": 0.19697290658950806, + "learning_rate": 2.881080989264846e-05, + "loss": 3.7969, + "step": 83855 + }, + { + "epoch": 5.697785025139285, + "grad_norm": 0.22765350341796875, + "learning_rate": 2.8806563391765184e-05, + "loss": 3.9311, + "step": 83860 + }, + { + "epoch": 5.698124745209947, + "grad_norm": 0.19849392771720886, + "learning_rate": 2.8802316890881912e-05, + "loss": 3.7848, + "step": 83865 + }, + { + "epoch": 5.698464465280609, + "grad_norm": 0.1669212281703949, + "learning_rate": 2.8798070389998644e-05, + "loss": 3.9406, + "step": 83870 + }, + { + "epoch": 5.69880418535127, + "grad_norm": 0.1646357625722885, + "learning_rate": 2.879382388911537e-05, + "loss": 3.8197, + "step": 83875 + }, + { + "epoch": 5.699143905421932, + "grad_norm": 0.18608656525611877, + "learning_rate": 2.8789577388232097e-05, + "loss": 4.0027, + "step": 83880 + }, + { + "epoch": 5.6994836254925945, + "grad_norm": 0.2082747220993042, + "learning_rate": 2.8785330887348828e-05, + "loss": 3.7118, + "step": 83885 + }, + { + "epoch": 5.699823345563256, + "grad_norm": 0.15497073531150818, + "learning_rate": 2.8781084386465553e-05, + "loss": 3.6747, + "step": 83890 + }, + { + "epoch": 5.700163065633918, + "grad_norm": 0.1216728687286377, + "learning_rate": 2.877683788558228e-05, + "loss": 3.6945, + "step": 83895 + }, + { + "epoch": 5.70050278570458, + "grad_norm": 0.17473770678043365, + "learning_rate": 2.8772591384699012e-05, + "loss": 3.7062, + "step": 83900 + }, + { + "epoch": 5.700842505775241, + "grad_norm": 0.7188725471496582, + "learning_rate": 2.876834488381574e-05, + "loss": 3.9374, + "step": 83905 + }, + { + "epoch": 5.701182225845903, + "grad_norm": 0.3282308280467987, + "learning_rate": 2.8764098382932465e-05, + "loss": 3.9814, + "step": 83910 + }, + { + "epoch": 5.701521945916565, + "grad_norm": 0.1766708344221115, + "learning_rate": 2.8759851882049193e-05, + "loss": 3.9186, + "step": 83915 + }, + { + "epoch": 5.701861665987226, + "grad_norm": 0.2034737765789032, + "learning_rate": 2.8755605381165924e-05, + "loss": 3.7948, + "step": 83920 + }, + { + "epoch": 5.702201386057888, + "grad_norm": 0.1717519313097, + "learning_rate": 2.875135888028265e-05, + "loss": 3.7183, + "step": 83925 + }, + { + "epoch": 5.70254110612855, + "grad_norm": 0.1982448548078537, + "learning_rate": 2.8747112379399377e-05, + "loss": 3.7574, + "step": 83930 + }, + { + "epoch": 5.702880826199212, + "grad_norm": 0.15597577393054962, + "learning_rate": 2.8742865878516108e-05, + "loss": 3.8084, + "step": 83935 + }, + { + "epoch": 5.703220546269874, + "grad_norm": 0.1550191193819046, + "learning_rate": 2.873861937763283e-05, + "loss": 3.6262, + "step": 83940 + }, + { + "epoch": 5.703560266340535, + "grad_norm": 0.16655591130256653, + "learning_rate": 2.873437287674956e-05, + "loss": 4.0337, + "step": 83945 + }, + { + "epoch": 5.703899986411197, + "grad_norm": 1.652599573135376, + "learning_rate": 2.873012637586629e-05, + "loss": 3.7309, + "step": 83950 + }, + { + "epoch": 5.704239706481859, + "grad_norm": 0.16760341823101044, + "learning_rate": 2.8725879874983013e-05, + "loss": 3.8065, + "step": 83955 + }, + { + "epoch": 5.70457942655252, + "grad_norm": 0.21091844141483307, + "learning_rate": 2.8721633374099745e-05, + "loss": 3.7843, + "step": 83960 + }, + { + "epoch": 5.704919146623182, + "grad_norm": 0.17003361880779266, + "learning_rate": 2.8717386873216473e-05, + "loss": 3.7869, + "step": 83965 + }, + { + "epoch": 5.705258866693844, + "grad_norm": 0.18138974905014038, + "learning_rate": 2.8713140372333197e-05, + "loss": 3.8885, + "step": 83970 + }, + { + "epoch": 5.705598586764506, + "grad_norm": 0.1581900417804718, + "learning_rate": 2.8708893871449925e-05, + "loss": 3.9529, + "step": 83975 + }, + { + "epoch": 5.705938306835168, + "grad_norm": 0.1648029386997223, + "learning_rate": 2.8704647370566657e-05, + "loss": 3.9021, + "step": 83980 + }, + { + "epoch": 5.70627802690583, + "grad_norm": 0.8751697540283203, + "learning_rate": 2.870040086968338e-05, + "loss": 3.8481, + "step": 83985 + }, + { + "epoch": 5.706617746976491, + "grad_norm": 0.1688155233860016, + "learning_rate": 2.869615436880011e-05, + "loss": 3.6211, + "step": 83990 + }, + { + "epoch": 5.706957467047153, + "grad_norm": 0.15785467624664307, + "learning_rate": 2.869190786791684e-05, + "loss": 3.8426, + "step": 83995 + }, + { + "epoch": 5.707297187117815, + "grad_norm": 0.1995263695716858, + "learning_rate": 2.8687661367033565e-05, + "loss": 3.8122, + "step": 84000 + }, + { + "epoch": 5.707636907188476, + "grad_norm": 0.19291599094867706, + "learning_rate": 2.8683414866150293e-05, + "loss": 3.8649, + "step": 84005 + }, + { + "epoch": 5.707976627259138, + "grad_norm": 0.13972537219524384, + "learning_rate": 2.867916836526702e-05, + "loss": 3.8534, + "step": 84010 + }, + { + "epoch": 5.7083163473298, + "grad_norm": 0.22105510532855988, + "learning_rate": 2.8674921864383746e-05, + "loss": 3.6267, + "step": 84015 + }, + { + "epoch": 5.708656067400462, + "grad_norm": 0.18316251039505005, + "learning_rate": 2.8670675363500477e-05, + "loss": 3.8411, + "step": 84020 + }, + { + "epoch": 5.708995787471124, + "grad_norm": 0.15393269062042236, + "learning_rate": 2.8666428862617205e-05, + "loss": 3.6259, + "step": 84025 + }, + { + "epoch": 5.709335507541786, + "grad_norm": 0.18272697925567627, + "learning_rate": 2.866218236173393e-05, + "loss": 3.6601, + "step": 84030 + }, + { + "epoch": 5.709675227612447, + "grad_norm": 0.1992858648300171, + "learning_rate": 2.865793586085066e-05, + "loss": 3.8797, + "step": 84035 + }, + { + "epoch": 5.710014947683109, + "grad_norm": 0.2807686924934387, + "learning_rate": 2.865368935996739e-05, + "loss": 3.9608, + "step": 84040 + }, + { + "epoch": 5.710354667753771, + "grad_norm": 0.17115215957164764, + "learning_rate": 2.8649442859084114e-05, + "loss": 3.9544, + "step": 84045 + }, + { + "epoch": 5.710694387824432, + "grad_norm": 0.2829534113407135, + "learning_rate": 2.8645196358200842e-05, + "loss": 4.1406, + "step": 84050 + }, + { + "epoch": 5.711034107895094, + "grad_norm": 0.22288502752780914, + "learning_rate": 2.8640949857317573e-05, + "loss": 3.935, + "step": 84055 + }, + { + "epoch": 5.711373827965756, + "grad_norm": 0.17138591408729553, + "learning_rate": 2.8636703356434298e-05, + "loss": 3.8362, + "step": 84060 + }, + { + "epoch": 5.711713548036418, + "grad_norm": 0.1970357596874237, + "learning_rate": 2.8632456855551026e-05, + "loss": 3.6888, + "step": 84065 + }, + { + "epoch": 5.71205326810708, + "grad_norm": 0.1706417351961136, + "learning_rate": 2.8628210354667757e-05, + "loss": 4.0501, + "step": 84070 + }, + { + "epoch": 5.712392988177742, + "grad_norm": 0.2037607580423355, + "learning_rate": 2.8623963853784485e-05, + "loss": 4.0146, + "step": 84075 + }, + { + "epoch": 5.712732708248403, + "grad_norm": 0.17065711319446564, + "learning_rate": 2.861971735290121e-05, + "loss": 4.1319, + "step": 84080 + }, + { + "epoch": 5.713072428319065, + "grad_norm": 0.15186430513858795, + "learning_rate": 2.8615470852017938e-05, + "loss": 3.6832, + "step": 84085 + }, + { + "epoch": 5.713412148389727, + "grad_norm": 0.21831029653549194, + "learning_rate": 2.861122435113467e-05, + "loss": 3.5922, + "step": 84090 + }, + { + "epoch": 5.713751868460388, + "grad_norm": 0.12921680510044098, + "learning_rate": 2.8606977850251394e-05, + "loss": 3.7103, + "step": 84095 + }, + { + "epoch": 5.71409158853105, + "grad_norm": 0.14213402569293976, + "learning_rate": 2.8602731349368122e-05, + "loss": 3.6993, + "step": 84100 + }, + { + "epoch": 5.7144313086017124, + "grad_norm": 0.43350332975387573, + "learning_rate": 2.8598484848484853e-05, + "loss": 3.8289, + "step": 84105 + }, + { + "epoch": 5.714771028672374, + "grad_norm": 0.15845626592636108, + "learning_rate": 2.8594238347601575e-05, + "loss": 3.8931, + "step": 84110 + }, + { + "epoch": 5.715110748743036, + "grad_norm": 0.20084062218666077, + "learning_rate": 2.8589991846718306e-05, + "loss": 3.802, + "step": 84115 + }, + { + "epoch": 5.715450468813698, + "grad_norm": 0.1646522432565689, + "learning_rate": 2.8585745345835034e-05, + "loss": 3.8908, + "step": 84120 + }, + { + "epoch": 5.715790188884359, + "grad_norm": 0.2390480488538742, + "learning_rate": 2.858149884495176e-05, + "loss": 4.0157, + "step": 84125 + }, + { + "epoch": 5.716129908955021, + "grad_norm": 0.21871823072433472, + "learning_rate": 2.857725234406849e-05, + "loss": 3.805, + "step": 84130 + }, + { + "epoch": 5.716469629025683, + "grad_norm": 0.19886848330497742, + "learning_rate": 2.8573005843185218e-05, + "loss": 3.8819, + "step": 84135 + }, + { + "epoch": 5.716809349096344, + "grad_norm": 0.1616610735654831, + "learning_rate": 2.8568759342301943e-05, + "loss": 4.2244, + "step": 84140 + }, + { + "epoch": 5.717149069167006, + "grad_norm": 0.22112472355365753, + "learning_rate": 2.8564512841418674e-05, + "loss": 3.9623, + "step": 84145 + }, + { + "epoch": 5.7174887892376685, + "grad_norm": 0.1288512498140335, + "learning_rate": 2.8560266340535402e-05, + "loss": 3.8086, + "step": 84150 + }, + { + "epoch": 5.71782850930833, + "grad_norm": 0.16054606437683105, + "learning_rate": 2.8556019839652127e-05, + "loss": 4.1863, + "step": 84155 + }, + { + "epoch": 5.718168229378992, + "grad_norm": 0.19785811007022858, + "learning_rate": 2.8551773338768855e-05, + "loss": 3.8457, + "step": 84160 + }, + { + "epoch": 5.718507949449654, + "grad_norm": 0.15443524718284607, + "learning_rate": 2.8547526837885586e-05, + "loss": 3.9416, + "step": 84165 + }, + { + "epoch": 5.718847669520315, + "grad_norm": 0.8517979383468628, + "learning_rate": 2.854328033700231e-05, + "loss": 3.996, + "step": 84170 + }, + { + "epoch": 5.719187389590977, + "grad_norm": 0.25512734055519104, + "learning_rate": 2.853903383611904e-05, + "loss": 3.6762, + "step": 84175 + }, + { + "epoch": 5.719527109661639, + "grad_norm": 0.16859646141529083, + "learning_rate": 2.853478733523577e-05, + "loss": 3.789, + "step": 84180 + }, + { + "epoch": 5.7198668297323, + "grad_norm": 0.16310900449752808, + "learning_rate": 2.853054083435249e-05, + "loss": 3.8883, + "step": 84185 + }, + { + "epoch": 5.720206549802962, + "grad_norm": 0.13819266855716705, + "learning_rate": 2.8526294333469223e-05, + "loss": 3.7486, + "step": 84190 + }, + { + "epoch": 5.7205462698736245, + "grad_norm": 0.1847442090511322, + "learning_rate": 2.852204783258595e-05, + "loss": 3.8637, + "step": 84195 + }, + { + "epoch": 5.720885989944286, + "grad_norm": 0.18871153891086578, + "learning_rate": 2.8517801331702675e-05, + "loss": 3.852, + "step": 84200 + }, + { + "epoch": 5.721225710014948, + "grad_norm": 0.14878308773040771, + "learning_rate": 2.8513554830819407e-05, + "loss": 3.8727, + "step": 84205 + }, + { + "epoch": 5.72156543008561, + "grad_norm": 0.15848085284233093, + "learning_rate": 2.8509308329936135e-05, + "loss": 3.8379, + "step": 84210 + }, + { + "epoch": 5.721905150156271, + "grad_norm": 0.15675479173660278, + "learning_rate": 2.850506182905286e-05, + "loss": 3.9258, + "step": 84215 + }, + { + "epoch": 5.722244870226933, + "grad_norm": 0.2276291698217392, + "learning_rate": 2.8500815328169587e-05, + "loss": 3.9805, + "step": 84220 + }, + { + "epoch": 5.722584590297595, + "grad_norm": 0.16095496714115143, + "learning_rate": 2.849656882728632e-05, + "loss": 3.7491, + "step": 84225 + }, + { + "epoch": 5.722924310368256, + "grad_norm": 0.18302349746227264, + "learning_rate": 2.8492322326403044e-05, + "loss": 3.6805, + "step": 84230 + }, + { + "epoch": 5.723264030438918, + "grad_norm": 0.16763104498386383, + "learning_rate": 2.848807582551977e-05, + "loss": 3.6415, + "step": 84235 + }, + { + "epoch": 5.7236037505095805, + "grad_norm": 0.18177607655525208, + "learning_rate": 2.8483829324636503e-05, + "loss": 4.089, + "step": 84240 + }, + { + "epoch": 5.723943470580242, + "grad_norm": 0.17339642345905304, + "learning_rate": 2.847958282375323e-05, + "loss": 3.7909, + "step": 84245 + }, + { + "epoch": 5.724283190650904, + "grad_norm": 0.17382997274398804, + "learning_rate": 2.8475336322869956e-05, + "loss": 3.8133, + "step": 84250 + }, + { + "epoch": 5.724622910721566, + "grad_norm": 0.17943423986434937, + "learning_rate": 2.8471089821986684e-05, + "loss": 3.6571, + "step": 84255 + }, + { + "epoch": 5.724962630792227, + "grad_norm": 0.16299636662006378, + "learning_rate": 2.8466843321103415e-05, + "loss": 3.5353, + "step": 84260 + }, + { + "epoch": 5.725302350862889, + "grad_norm": 0.16162973642349243, + "learning_rate": 2.846259682022014e-05, + "loss": 3.963, + "step": 84265 + }, + { + "epoch": 5.725642070933551, + "grad_norm": 0.17693373560905457, + "learning_rate": 2.8458350319336868e-05, + "loss": 3.7235, + "step": 84270 + }, + { + "epoch": 5.725981791004212, + "grad_norm": 0.1721968650817871, + "learning_rate": 2.84541038184536e-05, + "loss": 3.8208, + "step": 84275 + }, + { + "epoch": 5.726321511074874, + "grad_norm": 0.16787490248680115, + "learning_rate": 2.8449857317570324e-05, + "loss": 3.8813, + "step": 84280 + }, + { + "epoch": 5.7266612311455365, + "grad_norm": 0.8830936551094055, + "learning_rate": 2.844561081668705e-05, + "loss": 3.625, + "step": 84285 + }, + { + "epoch": 5.727000951216198, + "grad_norm": 0.14381183683872223, + "learning_rate": 2.844136431580378e-05, + "loss": 3.7739, + "step": 84290 + }, + { + "epoch": 5.72734067128686, + "grad_norm": 0.17257672548294067, + "learning_rate": 2.8437117814920504e-05, + "loss": 3.571, + "step": 84295 + }, + { + "epoch": 5.727680391357522, + "grad_norm": 0.1419108510017395, + "learning_rate": 2.8432871314037236e-05, + "loss": 3.9062, + "step": 84300 + }, + { + "epoch": 5.728020111428183, + "grad_norm": 0.16995063424110413, + "learning_rate": 2.8428624813153964e-05, + "loss": 3.769, + "step": 84305 + }, + { + "epoch": 5.728359831498845, + "grad_norm": 0.17657850682735443, + "learning_rate": 2.8424378312270688e-05, + "loss": 3.7235, + "step": 84310 + }, + { + "epoch": 5.728699551569507, + "grad_norm": 0.18855217099189758, + "learning_rate": 2.842013181138742e-05, + "loss": 3.8537, + "step": 84315 + }, + { + "epoch": 5.729039271640168, + "grad_norm": 0.18494533002376556, + "learning_rate": 2.8415885310504148e-05, + "loss": 3.6648, + "step": 84320 + }, + { + "epoch": 5.72937899171083, + "grad_norm": 0.1814785748720169, + "learning_rate": 2.8411638809620872e-05, + "loss": 3.6033, + "step": 84325 + }, + { + "epoch": 5.7297187117814925, + "grad_norm": 0.1698867678642273, + "learning_rate": 2.84073923087376e-05, + "loss": 3.6308, + "step": 84330 + }, + { + "epoch": 5.730058431852154, + "grad_norm": 0.21539077162742615, + "learning_rate": 2.840314580785433e-05, + "loss": 3.7273, + "step": 84335 + }, + { + "epoch": 5.730398151922816, + "grad_norm": 0.14693494141101837, + "learning_rate": 2.8398899306971056e-05, + "loss": 3.7732, + "step": 84340 + }, + { + "epoch": 5.730737871993478, + "grad_norm": 1.6342934370040894, + "learning_rate": 2.8394652806087784e-05, + "loss": 3.9008, + "step": 84345 + }, + { + "epoch": 5.731077592064139, + "grad_norm": 0.16297627985477448, + "learning_rate": 2.8390406305204516e-05, + "loss": 3.7444, + "step": 84350 + }, + { + "epoch": 5.731417312134801, + "grad_norm": 0.15844391286373138, + "learning_rate": 2.8386159804321237e-05, + "loss": 3.9846, + "step": 84355 + }, + { + "epoch": 5.731757032205463, + "grad_norm": 0.16376753151416779, + "learning_rate": 2.838191330343797e-05, + "loss": 3.8589, + "step": 84360 + }, + { + "epoch": 5.732096752276124, + "grad_norm": 0.19383500516414642, + "learning_rate": 2.8377666802554696e-05, + "loss": 3.7295, + "step": 84365 + }, + { + "epoch": 5.732436472346786, + "grad_norm": 0.1928442269563675, + "learning_rate": 2.837342030167142e-05, + "loss": 3.734, + "step": 84370 + }, + { + "epoch": 5.7327761924174485, + "grad_norm": 1.2924537658691406, + "learning_rate": 2.8369173800788152e-05, + "loss": 3.7558, + "step": 84375 + }, + { + "epoch": 5.73311591248811, + "grad_norm": 0.2199750393629074, + "learning_rate": 2.836492729990488e-05, + "loss": 3.886, + "step": 84380 + }, + { + "epoch": 5.733455632558772, + "grad_norm": 0.15915720164775848, + "learning_rate": 2.8360680799021605e-05, + "loss": 3.7247, + "step": 84385 + }, + { + "epoch": 5.733795352629433, + "grad_norm": 0.1847950518131256, + "learning_rate": 2.8356434298138333e-05, + "loss": 4.0335, + "step": 84390 + }, + { + "epoch": 5.734135072700095, + "grad_norm": 0.19790425896644592, + "learning_rate": 2.8352187797255064e-05, + "loss": 4.1639, + "step": 84395 + }, + { + "epoch": 5.734474792770757, + "grad_norm": 0.15156157314777374, + "learning_rate": 2.834794129637179e-05, + "loss": 3.6471, + "step": 84400 + }, + { + "epoch": 5.734814512841418, + "grad_norm": 0.1940418928861618, + "learning_rate": 2.8343694795488517e-05, + "loss": 3.6593, + "step": 84405 + }, + { + "epoch": 5.73515423291208, + "grad_norm": 0.16451658308506012, + "learning_rate": 2.833944829460525e-05, + "loss": 4.0579, + "step": 84410 + }, + { + "epoch": 5.7354939529827424, + "grad_norm": 0.20154651999473572, + "learning_rate": 2.8335201793721976e-05, + "loss": 3.7203, + "step": 84415 + }, + { + "epoch": 5.735833673053404, + "grad_norm": 0.17576172947883606, + "learning_rate": 2.83309552928387e-05, + "loss": 4.0157, + "step": 84420 + }, + { + "epoch": 5.736173393124066, + "grad_norm": 0.14807777106761932, + "learning_rate": 2.8326708791955432e-05, + "loss": 3.895, + "step": 84425 + }, + { + "epoch": 5.736513113194728, + "grad_norm": 0.16156822443008423, + "learning_rate": 2.832246229107216e-05, + "loss": 3.7501, + "step": 84430 + }, + { + "epoch": 5.736852833265389, + "grad_norm": 0.189786896109581, + "learning_rate": 2.8318215790188885e-05, + "loss": 3.7693, + "step": 84435 + }, + { + "epoch": 5.737192553336051, + "grad_norm": 0.13681000471115112, + "learning_rate": 2.8313969289305613e-05, + "loss": 3.7864, + "step": 84440 + }, + { + "epoch": 5.737532273406713, + "grad_norm": 0.20690418779850006, + "learning_rate": 2.8309722788422344e-05, + "loss": 3.4488, + "step": 84445 + }, + { + "epoch": 5.737871993477374, + "grad_norm": 0.1671135425567627, + "learning_rate": 2.830547628753907e-05, + "loss": 4.0006, + "step": 84450 + }, + { + "epoch": 5.738211713548036, + "grad_norm": 0.4124707877635956, + "learning_rate": 2.8301229786655797e-05, + "loss": 3.8647, + "step": 84455 + }, + { + "epoch": 5.7385514336186985, + "grad_norm": 0.16540631651878357, + "learning_rate": 2.829698328577253e-05, + "loss": 3.6893, + "step": 84460 + }, + { + "epoch": 5.73889115368936, + "grad_norm": 0.18046435713768005, + "learning_rate": 2.829273678488925e-05, + "loss": 3.7755, + "step": 84465 + }, + { + "epoch": 5.739230873760022, + "grad_norm": 0.19030538201332092, + "learning_rate": 2.828849028400598e-05, + "loss": 4.0461, + "step": 84470 + }, + { + "epoch": 5.739570593830684, + "grad_norm": 0.5149437785148621, + "learning_rate": 2.828424378312271e-05, + "loss": 3.9692, + "step": 84475 + }, + { + "epoch": 5.739910313901345, + "grad_norm": 0.34006330370903015, + "learning_rate": 2.8279997282239434e-05, + "loss": 3.7265, + "step": 84480 + }, + { + "epoch": 5.740250033972007, + "grad_norm": 0.18922938406467438, + "learning_rate": 2.8275750781356165e-05, + "loss": 3.8387, + "step": 84485 + }, + { + "epoch": 5.740589754042669, + "grad_norm": 0.28339576721191406, + "learning_rate": 2.8271504280472893e-05, + "loss": 3.8585, + "step": 84490 + }, + { + "epoch": 5.74092947411333, + "grad_norm": 0.12746278941631317, + "learning_rate": 2.8267257779589618e-05, + "loss": 4.0033, + "step": 84495 + }, + { + "epoch": 5.741269194183992, + "grad_norm": 0.32636842131614685, + "learning_rate": 2.8263011278706346e-05, + "loss": 3.7851, + "step": 84500 + }, + { + "epoch": 5.7416089142546545, + "grad_norm": 0.20219780504703522, + "learning_rate": 2.8258764777823077e-05, + "loss": 3.5434, + "step": 84505 + }, + { + "epoch": 5.741948634325316, + "grad_norm": 0.1583326905965805, + "learning_rate": 2.8254518276939802e-05, + "loss": 3.5531, + "step": 84510 + }, + { + "epoch": 5.742288354395978, + "grad_norm": 0.16701556742191315, + "learning_rate": 2.825027177605653e-05, + "loss": 3.8651, + "step": 84515 + }, + { + "epoch": 5.74262807446664, + "grad_norm": 0.21894414722919464, + "learning_rate": 2.824602527517326e-05, + "loss": 3.9741, + "step": 84520 + }, + { + "epoch": 5.742967794537301, + "grad_norm": 0.17777284979820251, + "learning_rate": 2.8241778774289986e-05, + "loss": 3.9269, + "step": 84525 + }, + { + "epoch": 5.743307514607963, + "grad_norm": 0.1445787400007248, + "learning_rate": 2.8237532273406714e-05, + "loss": 3.987, + "step": 84530 + }, + { + "epoch": 5.743647234678625, + "grad_norm": 0.18643580377101898, + "learning_rate": 2.8233285772523442e-05, + "loss": 3.9588, + "step": 84535 + }, + { + "epoch": 5.743986954749286, + "grad_norm": 0.14663001894950867, + "learning_rate": 2.8229039271640166e-05, + "loss": 3.9921, + "step": 84540 + }, + { + "epoch": 5.744326674819948, + "grad_norm": 0.5163230895996094, + "learning_rate": 2.8224792770756898e-05, + "loss": 3.8387, + "step": 84545 + }, + { + "epoch": 5.7446663948906105, + "grad_norm": 0.1589977890253067, + "learning_rate": 2.8220546269873626e-05, + "loss": 3.7227, + "step": 84550 + }, + { + "epoch": 5.745006114961272, + "grad_norm": 0.2141970545053482, + "learning_rate": 2.821629976899035e-05, + "loss": 3.8296, + "step": 84555 + }, + { + "epoch": 5.745345835031934, + "grad_norm": 0.32184547185897827, + "learning_rate": 2.8212053268107082e-05, + "loss": 3.8931, + "step": 84560 + }, + { + "epoch": 5.745685555102596, + "grad_norm": 1.4347431659698486, + "learning_rate": 2.820780676722381e-05, + "loss": 3.7685, + "step": 84565 + }, + { + "epoch": 5.746025275173257, + "grad_norm": 0.16629013419151306, + "learning_rate": 2.8203560266340534e-05, + "loss": 3.9668, + "step": 84570 + }, + { + "epoch": 5.746364995243919, + "grad_norm": 0.16699907183647156, + "learning_rate": 2.8199313765457262e-05, + "loss": 3.8691, + "step": 84575 + }, + { + "epoch": 5.746704715314581, + "grad_norm": 0.28668075799942017, + "learning_rate": 2.8195067264573994e-05, + "loss": 3.7428, + "step": 84580 + }, + { + "epoch": 5.747044435385242, + "grad_norm": 0.19142426550388336, + "learning_rate": 2.8190820763690722e-05, + "loss": 3.9468, + "step": 84585 + }, + { + "epoch": 5.747384155455904, + "grad_norm": 0.17680834233760834, + "learning_rate": 2.8186574262807447e-05, + "loss": 4.0126, + "step": 84590 + }, + { + "epoch": 5.7477238755265665, + "grad_norm": 0.18493545055389404, + "learning_rate": 2.8182327761924178e-05, + "loss": 3.7973, + "step": 84595 + }, + { + "epoch": 5.748063595597228, + "grad_norm": 0.16582919657230377, + "learning_rate": 2.8178081261040906e-05, + "loss": 3.6235, + "step": 84600 + }, + { + "epoch": 5.74840331566789, + "grad_norm": 0.14612148702144623, + "learning_rate": 2.817383476015763e-05, + "loss": 3.9522, + "step": 84605 + }, + { + "epoch": 5.748743035738551, + "grad_norm": 0.1952006220817566, + "learning_rate": 2.816958825927436e-05, + "loss": 3.923, + "step": 84610 + }, + { + "epoch": 5.749082755809213, + "grad_norm": 0.16045796871185303, + "learning_rate": 2.816534175839109e-05, + "loss": 3.7315, + "step": 84615 + }, + { + "epoch": 5.749422475879875, + "grad_norm": 1.3243160247802734, + "learning_rate": 2.8161095257507815e-05, + "loss": 3.7409, + "step": 84620 + }, + { + "epoch": 5.749762195950536, + "grad_norm": 0.7889221906661987, + "learning_rate": 2.8156848756624543e-05, + "loss": 3.7989, + "step": 84625 + }, + { + "epoch": 5.750101916021198, + "grad_norm": 0.1923588067293167, + "learning_rate": 2.8152602255741274e-05, + "loss": 3.6979, + "step": 84630 + }, + { + "epoch": 5.75044163609186, + "grad_norm": 0.14769572019577026, + "learning_rate": 2.8148355754857995e-05, + "loss": 3.6868, + "step": 84635 + }, + { + "epoch": 5.750781356162522, + "grad_norm": 0.14476566016674042, + "learning_rate": 2.8144109253974727e-05, + "loss": 3.8172, + "step": 84640 + }, + { + "epoch": 5.751121076233184, + "grad_norm": 0.13889585435390472, + "learning_rate": 2.8139862753091455e-05, + "loss": 3.7715, + "step": 84645 + }, + { + "epoch": 5.751460796303846, + "grad_norm": 0.2442198246717453, + "learning_rate": 2.813561625220818e-05, + "loss": 3.7316, + "step": 84650 + }, + { + "epoch": 5.751800516374507, + "grad_norm": 0.1697923243045807, + "learning_rate": 2.813136975132491e-05, + "loss": 3.8059, + "step": 84655 + }, + { + "epoch": 5.752140236445169, + "grad_norm": 0.15497879683971405, + "learning_rate": 2.812712325044164e-05, + "loss": 3.9397, + "step": 84660 + }, + { + "epoch": 5.752479956515831, + "grad_norm": 0.18171866238117218, + "learning_rate": 2.8122876749558363e-05, + "loss": 3.8046, + "step": 84665 + }, + { + "epoch": 5.752819676586492, + "grad_norm": 0.22138479351997375, + "learning_rate": 2.8118630248675095e-05, + "loss": 3.8601, + "step": 84670 + }, + { + "epoch": 5.753159396657154, + "grad_norm": 0.2522282302379608, + "learning_rate": 2.8114383747791823e-05, + "loss": 3.649, + "step": 84675 + }, + { + "epoch": 5.753499116727816, + "grad_norm": 0.6027456521987915, + "learning_rate": 2.8110137246908547e-05, + "loss": 4.0153, + "step": 84680 + }, + { + "epoch": 5.753838836798478, + "grad_norm": 0.16744588315486908, + "learning_rate": 2.8105890746025275e-05, + "loss": 3.716, + "step": 84685 + }, + { + "epoch": 5.75417855686914, + "grad_norm": 0.19689105451107025, + "learning_rate": 2.8101644245142007e-05, + "loss": 3.8246, + "step": 84690 + }, + { + "epoch": 5.754518276939802, + "grad_norm": 0.32462790608406067, + "learning_rate": 2.809739774425873e-05, + "loss": 3.877, + "step": 84695 + }, + { + "epoch": 5.754857997010463, + "grad_norm": 0.19031301140785217, + "learning_rate": 2.809315124337546e-05, + "loss": 3.5828, + "step": 84700 + }, + { + "epoch": 5.755197717081125, + "grad_norm": 0.2095639407634735, + "learning_rate": 2.808890474249219e-05, + "loss": 3.8233, + "step": 84705 + }, + { + "epoch": 5.755537437151787, + "grad_norm": 0.204901322722435, + "learning_rate": 2.8084658241608912e-05, + "loss": 3.7782, + "step": 84710 + }, + { + "epoch": 5.755877157222448, + "grad_norm": 0.19507405161857605, + "learning_rate": 2.8080411740725643e-05, + "loss": 3.3606, + "step": 84715 + }, + { + "epoch": 5.75621687729311, + "grad_norm": 0.17809341847896576, + "learning_rate": 2.807616523984237e-05, + "loss": 3.7602, + "step": 84720 + }, + { + "epoch": 5.7565565973637725, + "grad_norm": 0.16225934028625488, + "learning_rate": 2.8071918738959096e-05, + "loss": 3.6653, + "step": 84725 + }, + { + "epoch": 5.756896317434434, + "grad_norm": 0.19570264220237732, + "learning_rate": 2.8067672238075827e-05, + "loss": 3.6867, + "step": 84730 + }, + { + "epoch": 5.757236037505096, + "grad_norm": 0.1904711127281189, + "learning_rate": 2.8063425737192555e-05, + "loss": 3.6966, + "step": 84735 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 0.1777360886335373, + "learning_rate": 2.805917923630928e-05, + "loss": 3.8035, + "step": 84740 + }, + { + "epoch": 5.757915477646419, + "grad_norm": 0.23377631604671478, + "learning_rate": 2.8054932735426008e-05, + "loss": 3.8726, + "step": 84745 + }, + { + "epoch": 5.758255197717081, + "grad_norm": 0.2188449501991272, + "learning_rate": 2.805068623454274e-05, + "loss": 3.8737, + "step": 84750 + }, + { + "epoch": 5.758594917787743, + "grad_norm": 0.16581086814403534, + "learning_rate": 2.8046439733659467e-05, + "loss": 4.0453, + "step": 84755 + }, + { + "epoch": 5.758934637858404, + "grad_norm": 0.1899517923593521, + "learning_rate": 2.8042193232776192e-05, + "loss": 3.7387, + "step": 84760 + }, + { + "epoch": 5.759274357929066, + "grad_norm": 0.1796630620956421, + "learning_rate": 2.8037946731892923e-05, + "loss": 4.0786, + "step": 84765 + }, + { + "epoch": 5.7596140779997285, + "grad_norm": 0.16960595548152924, + "learning_rate": 2.803370023100965e-05, + "loss": 3.8152, + "step": 84770 + }, + { + "epoch": 5.75995379807039, + "grad_norm": 0.23614150285720825, + "learning_rate": 2.8029453730126376e-05, + "loss": 3.8567, + "step": 84775 + }, + { + "epoch": 5.760293518141052, + "grad_norm": 0.1941513568162918, + "learning_rate": 2.8025207229243104e-05, + "loss": 4.0143, + "step": 84780 + }, + { + "epoch": 5.760633238211714, + "grad_norm": 0.1944657862186432, + "learning_rate": 2.8020960728359835e-05, + "loss": 3.8455, + "step": 84785 + }, + { + "epoch": 5.760972958282375, + "grad_norm": 0.2604605257511139, + "learning_rate": 2.801671422747656e-05, + "loss": 3.6816, + "step": 84790 + }, + { + "epoch": 5.761312678353037, + "grad_norm": 0.22314804792404175, + "learning_rate": 2.8012467726593288e-05, + "loss": 3.7924, + "step": 84795 + }, + { + "epoch": 5.761652398423699, + "grad_norm": 0.1947893649339676, + "learning_rate": 2.800822122571002e-05, + "loss": 3.7387, + "step": 84800 + }, + { + "epoch": 5.76199211849436, + "grad_norm": 0.21884416043758392, + "learning_rate": 2.8003974724826744e-05, + "loss": 3.3833, + "step": 84805 + }, + { + "epoch": 5.762331838565022, + "grad_norm": 0.1771090179681778, + "learning_rate": 2.7999728223943472e-05, + "loss": 3.7333, + "step": 84810 + }, + { + "epoch": 5.7626715586356845, + "grad_norm": 0.2054833024740219, + "learning_rate": 2.7995481723060203e-05, + "loss": 4.026, + "step": 84815 + }, + { + "epoch": 5.763011278706346, + "grad_norm": 0.6642948389053345, + "learning_rate": 2.7991235222176925e-05, + "loss": 3.9361, + "step": 84820 + }, + { + "epoch": 5.763350998777008, + "grad_norm": 0.15206870436668396, + "learning_rate": 2.7986988721293656e-05, + "loss": 3.7772, + "step": 84825 + }, + { + "epoch": 5.76369071884767, + "grad_norm": 0.19731278717517853, + "learning_rate": 2.7982742220410384e-05, + "loss": 3.8896, + "step": 84830 + }, + { + "epoch": 5.764030438918331, + "grad_norm": 0.16728836297988892, + "learning_rate": 2.797849571952711e-05, + "loss": 3.967, + "step": 84835 + }, + { + "epoch": 5.764370158988993, + "grad_norm": 0.1630062609910965, + "learning_rate": 2.797424921864384e-05, + "loss": 3.6874, + "step": 84840 + }, + { + "epoch": 5.764709879059655, + "grad_norm": 0.1959209442138672, + "learning_rate": 2.7970002717760568e-05, + "loss": 3.8182, + "step": 84845 + }, + { + "epoch": 5.765049599130316, + "grad_norm": 0.16736268997192383, + "learning_rate": 2.7965756216877293e-05, + "loss": 3.4906, + "step": 84850 + }, + { + "epoch": 5.765389319200978, + "grad_norm": 0.17490389943122864, + "learning_rate": 2.796150971599402e-05, + "loss": 3.8009, + "step": 84855 + }, + { + "epoch": 5.7657290392716405, + "grad_norm": 0.2759133279323578, + "learning_rate": 2.7957263215110752e-05, + "loss": 3.8765, + "step": 84860 + }, + { + "epoch": 5.766068759342302, + "grad_norm": 0.15340422093868256, + "learning_rate": 2.7953016714227477e-05, + "loss": 3.8721, + "step": 84865 + }, + { + "epoch": 5.766408479412964, + "grad_norm": 0.15968607366085052, + "learning_rate": 2.7948770213344205e-05, + "loss": 3.9121, + "step": 84870 + }, + { + "epoch": 5.766748199483626, + "grad_norm": 0.17116519808769226, + "learning_rate": 2.7944523712460936e-05, + "loss": 3.8246, + "step": 84875 + }, + { + "epoch": 5.767087919554287, + "grad_norm": 0.7180834412574768, + "learning_rate": 2.7940277211577657e-05, + "loss": 3.691, + "step": 84880 + }, + { + "epoch": 5.767427639624949, + "grad_norm": 0.22146299481391907, + "learning_rate": 2.793603071069439e-05, + "loss": 3.8364, + "step": 84885 + }, + { + "epoch": 5.767767359695611, + "grad_norm": 0.15477828681468964, + "learning_rate": 2.7931784209811117e-05, + "loss": 3.8794, + "step": 84890 + }, + { + "epoch": 5.768107079766272, + "grad_norm": 0.2636602222919464, + "learning_rate": 2.792753770892784e-05, + "loss": 3.9111, + "step": 84895 + }, + { + "epoch": 5.768446799836934, + "grad_norm": 0.148532897233963, + "learning_rate": 2.7923291208044573e-05, + "loss": 4.0268, + "step": 84900 + }, + { + "epoch": 5.7687865199075965, + "grad_norm": 0.13636602461338043, + "learning_rate": 2.79190447071613e-05, + "loss": 4.014, + "step": 84905 + }, + { + "epoch": 5.769126239978258, + "grad_norm": 0.2007436603307724, + "learning_rate": 2.7914798206278025e-05, + "loss": 4.0184, + "step": 84910 + }, + { + "epoch": 5.76946596004892, + "grad_norm": 0.2493779957294464, + "learning_rate": 2.7910551705394757e-05, + "loss": 3.8544, + "step": 84915 + }, + { + "epoch": 5.769805680119582, + "grad_norm": 0.17890872061252594, + "learning_rate": 2.7906305204511485e-05, + "loss": 3.7082, + "step": 84920 + }, + { + "epoch": 5.770145400190243, + "grad_norm": 0.7159873247146606, + "learning_rate": 2.7902058703628213e-05, + "loss": 3.9708, + "step": 84925 + }, + { + "epoch": 5.770485120260905, + "grad_norm": 0.16856501996517181, + "learning_rate": 2.7897812202744937e-05, + "loss": 3.908, + "step": 84930 + }, + { + "epoch": 5.770824840331567, + "grad_norm": 0.15371879935264587, + "learning_rate": 2.789356570186167e-05, + "loss": 4.0249, + "step": 84935 + }, + { + "epoch": 5.771164560402228, + "grad_norm": 0.22174157202243805, + "learning_rate": 2.7889319200978397e-05, + "loss": 3.8902, + "step": 84940 + }, + { + "epoch": 5.77150428047289, + "grad_norm": 0.15192489326000214, + "learning_rate": 2.788507270009512e-05, + "loss": 3.7351, + "step": 84945 + }, + { + "epoch": 5.7718440005435525, + "grad_norm": 0.19538931548595428, + "learning_rate": 2.7880826199211853e-05, + "loss": 3.9234, + "step": 84950 + }, + { + "epoch": 5.772183720614214, + "grad_norm": 0.13009971380233765, + "learning_rate": 2.787657969832858e-05, + "loss": 3.9885, + "step": 84955 + }, + { + "epoch": 5.772523440684876, + "grad_norm": 0.15457060933113098, + "learning_rate": 2.7872333197445306e-05, + "loss": 3.838, + "step": 84960 + }, + { + "epoch": 5.772863160755538, + "grad_norm": 0.17919133603572845, + "learning_rate": 2.7868086696562034e-05, + "loss": 3.918, + "step": 84965 + }, + { + "epoch": 5.773202880826199, + "grad_norm": 0.13446646928787231, + "learning_rate": 2.7863840195678765e-05, + "loss": 3.7752, + "step": 84970 + }, + { + "epoch": 5.773542600896861, + "grad_norm": 2.880589246749878, + "learning_rate": 2.785959369479549e-05, + "loss": 3.7667, + "step": 84975 + }, + { + "epoch": 5.773882320967523, + "grad_norm": 0.17534463107585907, + "learning_rate": 2.7855347193912218e-05, + "loss": 3.6879, + "step": 84980 + }, + { + "epoch": 5.774222041038184, + "grad_norm": 0.1612652987241745, + "learning_rate": 2.785110069302895e-05, + "loss": 3.5643, + "step": 84985 + }, + { + "epoch": 5.7745617611088464, + "grad_norm": 0.22270454466342926, + "learning_rate": 2.784685419214567e-05, + "loss": 3.9776, + "step": 84990 + }, + { + "epoch": 5.7749014811795085, + "grad_norm": 0.17824393510818481, + "learning_rate": 2.78426076912624e-05, + "loss": 3.845, + "step": 84995 + }, + { + "epoch": 5.77524120125017, + "grad_norm": 0.13579563796520233, + "learning_rate": 2.783836119037913e-05, + "loss": 3.7887, + "step": 85000 + }, + { + "epoch": 5.775580921320832, + "grad_norm": 0.14184294641017914, + "learning_rate": 2.7834114689495854e-05, + "loss": 3.8039, + "step": 85005 + }, + { + "epoch": 5.775920641391494, + "grad_norm": 0.18552064895629883, + "learning_rate": 2.7829868188612586e-05, + "loss": 3.8875, + "step": 85010 + }, + { + "epoch": 5.776260361462155, + "grad_norm": 0.22224929928779602, + "learning_rate": 2.7825621687729314e-05, + "loss": 3.9334, + "step": 85015 + }, + { + "epoch": 5.776600081532817, + "grad_norm": 0.19812746345996857, + "learning_rate": 2.7821375186846038e-05, + "loss": 3.7919, + "step": 85020 + }, + { + "epoch": 5.776939801603479, + "grad_norm": 0.16625913977622986, + "learning_rate": 2.7817128685962766e-05, + "loss": 3.6723, + "step": 85025 + }, + { + "epoch": 5.77727952167414, + "grad_norm": 0.1434660106897354, + "learning_rate": 2.7812882185079498e-05, + "loss": 3.7931, + "step": 85030 + }, + { + "epoch": 5.7776192417448025, + "grad_norm": 0.20544016361236572, + "learning_rate": 2.7808635684196222e-05, + "loss": 3.9676, + "step": 85035 + }, + { + "epoch": 5.7779589618154645, + "grad_norm": 0.23629680275917053, + "learning_rate": 2.780438918331295e-05, + "loss": 3.6584, + "step": 85040 + }, + { + "epoch": 5.778298681886126, + "grad_norm": 0.15275464951992035, + "learning_rate": 2.780014268242968e-05, + "loss": 4.1232, + "step": 85045 + }, + { + "epoch": 5.778638401956788, + "grad_norm": 0.2240334004163742, + "learning_rate": 2.7795896181546406e-05, + "loss": 3.8143, + "step": 85050 + }, + { + "epoch": 5.77897812202745, + "grad_norm": 0.1479082852602005, + "learning_rate": 2.7791649680663134e-05, + "loss": 3.9467, + "step": 85055 + }, + { + "epoch": 5.779317842098111, + "grad_norm": 0.226506307721138, + "learning_rate": 2.7787403179779862e-05, + "loss": 3.7569, + "step": 85060 + }, + { + "epoch": 5.779657562168773, + "grad_norm": 0.25415709614753723, + "learning_rate": 2.7783156678896587e-05, + "loss": 3.9226, + "step": 85065 + }, + { + "epoch": 5.779997282239434, + "grad_norm": 0.14565913379192352, + "learning_rate": 2.777891017801332e-05, + "loss": 3.6123, + "step": 85070 + }, + { + "epoch": 5.780337002310096, + "grad_norm": 0.21185939013957977, + "learning_rate": 2.7774663677130046e-05, + "loss": 3.8428, + "step": 85075 + }, + { + "epoch": 5.7806767223807585, + "grad_norm": 0.4333938956260681, + "learning_rate": 2.777041717624677e-05, + "loss": 3.8799, + "step": 85080 + }, + { + "epoch": 5.78101644245142, + "grad_norm": 0.18197162449359894, + "learning_rate": 2.7766170675363502e-05, + "loss": 3.74, + "step": 85085 + }, + { + "epoch": 5.781356162522082, + "grad_norm": 0.1973731368780136, + "learning_rate": 2.776192417448023e-05, + "loss": 3.9662, + "step": 85090 + }, + { + "epoch": 5.781695882592744, + "grad_norm": 0.17966140806674957, + "learning_rate": 2.7757677673596962e-05, + "loss": 3.9334, + "step": 85095 + }, + { + "epoch": 5.782035602663405, + "grad_norm": 0.24521958827972412, + "learning_rate": 2.7753431172713683e-05, + "loss": 3.6556, + "step": 85100 + }, + { + "epoch": 5.782375322734067, + "grad_norm": 0.19508689641952515, + "learning_rate": 2.7749184671830414e-05, + "loss": 3.7855, + "step": 85105 + }, + { + "epoch": 5.782715042804729, + "grad_norm": 0.12988927960395813, + "learning_rate": 2.7744938170947142e-05, + "loss": 3.9604, + "step": 85110 + }, + { + "epoch": 5.78305476287539, + "grad_norm": 0.35568001866340637, + "learning_rate": 2.7740691670063867e-05, + "loss": 3.9353, + "step": 85115 + }, + { + "epoch": 5.783394482946052, + "grad_norm": 0.22554442286491394, + "learning_rate": 2.77364451691806e-05, + "loss": 3.6286, + "step": 85120 + }, + { + "epoch": 5.7837342030167145, + "grad_norm": 0.13741014897823334, + "learning_rate": 2.7732198668297326e-05, + "loss": 3.7099, + "step": 85125 + }, + { + "epoch": 5.784073923087376, + "grad_norm": 0.1825506091117859, + "learning_rate": 2.772795216741405e-05, + "loss": 3.8481, + "step": 85130 + }, + { + "epoch": 5.784413643158038, + "grad_norm": 0.16970893740653992, + "learning_rate": 2.772370566653078e-05, + "loss": 4.019, + "step": 85135 + }, + { + "epoch": 5.7847533632287, + "grad_norm": 0.16900958120822906, + "learning_rate": 2.771945916564751e-05, + "loss": 3.7901, + "step": 85140 + }, + { + "epoch": 5.785093083299361, + "grad_norm": 0.19175855815410614, + "learning_rate": 2.7715212664764235e-05, + "loss": 3.8629, + "step": 85145 + }, + { + "epoch": 5.785432803370023, + "grad_norm": 0.15192754566669464, + "learning_rate": 2.7710966163880963e-05, + "loss": 3.7466, + "step": 85150 + }, + { + "epoch": 5.785772523440685, + "grad_norm": 0.19335132837295532, + "learning_rate": 2.7706719662997694e-05, + "loss": 3.7257, + "step": 85155 + }, + { + "epoch": 5.786112243511346, + "grad_norm": 0.3011012375354767, + "learning_rate": 2.7702473162114416e-05, + "loss": 3.8298, + "step": 85160 + }, + { + "epoch": 5.786451963582008, + "grad_norm": 0.20540723204612732, + "learning_rate": 2.7698226661231147e-05, + "loss": 3.8466, + "step": 85165 + }, + { + "epoch": 5.7867916836526705, + "grad_norm": 0.2795126438140869, + "learning_rate": 2.7693980160347875e-05, + "loss": 3.8019, + "step": 85170 + }, + { + "epoch": 5.787131403723332, + "grad_norm": 0.16934449970722198, + "learning_rate": 2.76897336594646e-05, + "loss": 3.7716, + "step": 85175 + }, + { + "epoch": 5.787471123793994, + "grad_norm": 0.20382246375083923, + "learning_rate": 2.768548715858133e-05, + "loss": 4.1826, + "step": 85180 + }, + { + "epoch": 5.787810843864656, + "grad_norm": 0.1591188907623291, + "learning_rate": 2.768124065769806e-05, + "loss": 3.7873, + "step": 85185 + }, + { + "epoch": 5.788150563935317, + "grad_norm": 0.166305273771286, + "learning_rate": 2.7676994156814784e-05, + "loss": 4.0864, + "step": 85190 + }, + { + "epoch": 5.788490284005979, + "grad_norm": 0.3067665696144104, + "learning_rate": 2.7672747655931515e-05, + "loss": 3.9191, + "step": 85195 + }, + { + "epoch": 5.788830004076641, + "grad_norm": 0.21449536085128784, + "learning_rate": 2.7668501155048243e-05, + "loss": 3.6969, + "step": 85200 + }, + { + "epoch": 5.789169724147302, + "grad_norm": 0.18140697479248047, + "learning_rate": 2.7664254654164968e-05, + "loss": 3.8711, + "step": 85205 + }, + { + "epoch": 5.789509444217964, + "grad_norm": 0.1730380654335022, + "learning_rate": 2.7660008153281696e-05, + "loss": 3.8594, + "step": 85210 + }, + { + "epoch": 5.7898491642886265, + "grad_norm": 0.20750854909420013, + "learning_rate": 2.7655761652398427e-05, + "loss": 3.6352, + "step": 85215 + }, + { + "epoch": 5.790188884359288, + "grad_norm": 0.17075470089912415, + "learning_rate": 2.7651515151515152e-05, + "loss": 3.8549, + "step": 85220 + }, + { + "epoch": 5.79052860442995, + "grad_norm": 0.19804823398590088, + "learning_rate": 2.764726865063188e-05, + "loss": 3.7904, + "step": 85225 + }, + { + "epoch": 5.790868324500612, + "grad_norm": 0.17436349391937256, + "learning_rate": 2.764302214974861e-05, + "loss": 3.9702, + "step": 85230 + }, + { + "epoch": 5.791208044571273, + "grad_norm": 0.18615072965621948, + "learning_rate": 2.7638775648865332e-05, + "loss": 3.7552, + "step": 85235 + }, + { + "epoch": 5.791547764641935, + "grad_norm": 0.8328690528869629, + "learning_rate": 2.7634529147982064e-05, + "loss": 3.7537, + "step": 85240 + }, + { + "epoch": 5.791887484712597, + "grad_norm": 0.1880655139684677, + "learning_rate": 2.7630282647098792e-05, + "loss": 3.9509, + "step": 85245 + }, + { + "epoch": 5.792227204783258, + "grad_norm": 0.19577354192733765, + "learning_rate": 2.7626036146215516e-05, + "loss": 3.9251, + "step": 85250 + }, + { + "epoch": 5.79256692485392, + "grad_norm": 0.17240308225154877, + "learning_rate": 2.7621789645332248e-05, + "loss": 3.7615, + "step": 85255 + }, + { + "epoch": 5.7929066449245825, + "grad_norm": 0.19239458441734314, + "learning_rate": 2.7617543144448976e-05, + "loss": 4.0249, + "step": 85260 + }, + { + "epoch": 5.793246364995244, + "grad_norm": 0.18498311936855316, + "learning_rate": 2.7613296643565707e-05, + "loss": 3.7871, + "step": 85265 + }, + { + "epoch": 5.793586085065906, + "grad_norm": 0.28513726592063904, + "learning_rate": 2.760905014268243e-05, + "loss": 3.5444, + "step": 85270 + }, + { + "epoch": 5.793925805136568, + "grad_norm": 0.2392939031124115, + "learning_rate": 2.760480364179916e-05, + "loss": 3.8691, + "step": 85275 + }, + { + "epoch": 5.794265525207229, + "grad_norm": 0.16273820400238037, + "learning_rate": 2.7600557140915888e-05, + "loss": 3.9672, + "step": 85280 + }, + { + "epoch": 5.794605245277891, + "grad_norm": 0.1591058075428009, + "learning_rate": 2.7596310640032612e-05, + "loss": 3.9254, + "step": 85285 + }, + { + "epoch": 5.794944965348552, + "grad_norm": 0.20973989367485046, + "learning_rate": 2.7592064139149344e-05, + "loss": 3.7257, + "step": 85290 + }, + { + "epoch": 5.795284685419214, + "grad_norm": 0.35086849331855774, + "learning_rate": 2.7587817638266072e-05, + "loss": 3.8019, + "step": 85295 + }, + { + "epoch": 5.7956244054898765, + "grad_norm": 0.23872925341129303, + "learning_rate": 2.7583571137382796e-05, + "loss": 3.9967, + "step": 85300 + }, + { + "epoch": 5.795964125560538, + "grad_norm": 0.19861769676208496, + "learning_rate": 2.7579324636499525e-05, + "loss": 3.9476, + "step": 85305 + }, + { + "epoch": 5.7963038456312, + "grad_norm": 0.16952215135097504, + "learning_rate": 2.7575078135616256e-05, + "loss": 3.8991, + "step": 85310 + }, + { + "epoch": 5.796643565701862, + "grad_norm": 0.143427774310112, + "learning_rate": 2.757083163473298e-05, + "loss": 3.7052, + "step": 85315 + }, + { + "epoch": 5.796983285772523, + "grad_norm": 0.18873697519302368, + "learning_rate": 2.756658513384971e-05, + "loss": 3.5407, + "step": 85320 + }, + { + "epoch": 5.797323005843185, + "grad_norm": 0.15188519656658173, + "learning_rate": 2.756233863296644e-05, + "loss": 3.65, + "step": 85325 + }, + { + "epoch": 5.797662725913847, + "grad_norm": 0.18363139033317566, + "learning_rate": 2.7558941432259822e-05, + "loss": 3.912, + "step": 85330 + }, + { + "epoch": 5.798002445984508, + "grad_norm": 0.126181960105896, + "learning_rate": 2.7554694931376547e-05, + "loss": 3.8708, + "step": 85335 + }, + { + "epoch": 5.79834216605517, + "grad_norm": 0.2805311977863312, + "learning_rate": 2.7550448430493275e-05, + "loss": 3.9113, + "step": 85340 + }, + { + "epoch": 5.7986818861258325, + "grad_norm": 0.22967787086963654, + "learning_rate": 2.7546201929610006e-05, + "loss": 3.7881, + "step": 85345 + }, + { + "epoch": 5.799021606196494, + "grad_norm": 0.2112668752670288, + "learning_rate": 2.7541955428726728e-05, + "loss": 4.0595, + "step": 85350 + }, + { + "epoch": 5.799361326267156, + "grad_norm": 0.18705296516418457, + "learning_rate": 2.753770892784346e-05, + "loss": 3.7994, + "step": 85355 + }, + { + "epoch": 5.799701046337818, + "grad_norm": 0.19296622276306152, + "learning_rate": 2.7533462426960187e-05, + "loss": 3.8068, + "step": 85360 + }, + { + "epoch": 5.800040766408479, + "grad_norm": 0.1745738536119461, + "learning_rate": 2.752921592607691e-05, + "loss": 3.7704, + "step": 85365 + }, + { + "epoch": 5.800380486479141, + "grad_norm": 0.2124253362417221, + "learning_rate": 2.7524969425193643e-05, + "loss": 3.6063, + "step": 85370 + }, + { + "epoch": 5.800720206549803, + "grad_norm": 0.19484104216098785, + "learning_rate": 2.752072292431037e-05, + "loss": 3.6254, + "step": 85375 + }, + { + "epoch": 5.801059926620464, + "grad_norm": 0.17232736945152283, + "learning_rate": 2.7516476423427096e-05, + "loss": 3.7165, + "step": 85380 + }, + { + "epoch": 5.801399646691126, + "grad_norm": 0.1409953087568283, + "learning_rate": 2.7512229922543824e-05, + "loss": 3.7036, + "step": 85385 + }, + { + "epoch": 5.8017393667617885, + "grad_norm": 0.15335877239704132, + "learning_rate": 2.7507983421660555e-05, + "loss": 3.6495, + "step": 85390 + }, + { + "epoch": 5.80207908683245, + "grad_norm": 0.5891558527946472, + "learning_rate": 2.750373692077728e-05, + "loss": 4.084, + "step": 85395 + }, + { + "epoch": 5.802418806903112, + "grad_norm": 0.1869095414876938, + "learning_rate": 2.7499490419894008e-05, + "loss": 3.9299, + "step": 85400 + }, + { + "epoch": 5.802758526973774, + "grad_norm": 0.19159606099128723, + "learning_rate": 2.749524391901074e-05, + "loss": 3.9201, + "step": 85405 + }, + { + "epoch": 5.803098247044435, + "grad_norm": 0.17312322556972504, + "learning_rate": 2.7490997418127464e-05, + "loss": 3.8407, + "step": 85410 + }, + { + "epoch": 5.803437967115097, + "grad_norm": 0.1639661341905594, + "learning_rate": 2.748675091724419e-05, + "loss": 3.6877, + "step": 85415 + }, + { + "epoch": 5.803777687185759, + "grad_norm": 0.13688752055168152, + "learning_rate": 2.748250441636092e-05, + "loss": 3.6487, + "step": 85420 + }, + { + "epoch": 5.80411740725642, + "grad_norm": 0.23320676386356354, + "learning_rate": 2.7478257915477644e-05, + "loss": 3.7505, + "step": 85425 + }, + { + "epoch": 5.804457127327082, + "grad_norm": 0.23288728296756744, + "learning_rate": 2.7474011414594376e-05, + "loss": 3.4798, + "step": 85430 + }, + { + "epoch": 5.8047968473977445, + "grad_norm": 0.21874108910560608, + "learning_rate": 2.7469764913711104e-05, + "loss": 4.0363, + "step": 85435 + }, + { + "epoch": 5.805136567468406, + "grad_norm": 0.3710234761238098, + "learning_rate": 2.746551841282783e-05, + "loss": 3.6697, + "step": 85440 + }, + { + "epoch": 5.805476287539068, + "grad_norm": 0.15904732048511505, + "learning_rate": 2.746127191194456e-05, + "loss": 3.8142, + "step": 85445 + }, + { + "epoch": 5.80581600760973, + "grad_norm": 0.2799580693244934, + "learning_rate": 2.7457025411061288e-05, + "loss": 3.7778, + "step": 85450 + }, + { + "epoch": 5.806155727680391, + "grad_norm": 0.18776309490203857, + "learning_rate": 2.7452778910178012e-05, + "loss": 3.9149, + "step": 85455 + }, + { + "epoch": 5.806495447751053, + "grad_norm": 0.15575073659420013, + "learning_rate": 2.744853240929474e-05, + "loss": 3.7566, + "step": 85460 + }, + { + "epoch": 5.806835167821715, + "grad_norm": 0.15537913143634796, + "learning_rate": 2.7444285908411472e-05, + "loss": 3.9221, + "step": 85465 + }, + { + "epoch": 5.807174887892376, + "grad_norm": 0.3012561798095703, + "learning_rate": 2.74400394075282e-05, + "loss": 3.8385, + "step": 85470 + }, + { + "epoch": 5.807514607963038, + "grad_norm": 0.1702168732881546, + "learning_rate": 2.7435792906644924e-05, + "loss": 3.7813, + "step": 85475 + }, + { + "epoch": 5.8078543280337005, + "grad_norm": 0.15743644535541534, + "learning_rate": 2.7431546405761656e-05, + "loss": 3.7743, + "step": 85480 + }, + { + "epoch": 5.808194048104362, + "grad_norm": 0.20480042695999146, + "learning_rate": 2.7427299904878384e-05, + "loss": 3.8652, + "step": 85485 + }, + { + "epoch": 5.808533768175024, + "grad_norm": 0.21687261760234833, + "learning_rate": 2.742305340399511e-05, + "loss": 3.9108, + "step": 85490 + }, + { + "epoch": 5.808873488245686, + "grad_norm": 0.33032605051994324, + "learning_rate": 2.7418806903111836e-05, + "loss": 3.7849, + "step": 85495 + }, + { + "epoch": 5.809213208316347, + "grad_norm": 0.1776474267244339, + "learning_rate": 2.7414560402228568e-05, + "loss": 3.8615, + "step": 85500 + }, + { + "epoch": 5.809552928387009, + "grad_norm": 0.2139628529548645, + "learning_rate": 2.7410313901345292e-05, + "loss": 3.837, + "step": 85505 + }, + { + "epoch": 5.809892648457671, + "grad_norm": 0.15731468796730042, + "learning_rate": 2.740606740046202e-05, + "loss": 3.7509, + "step": 85510 + }, + { + "epoch": 5.810232368528332, + "grad_norm": 0.21741612255573273, + "learning_rate": 2.7401820899578752e-05, + "loss": 3.8135, + "step": 85515 + }, + { + "epoch": 5.810572088598994, + "grad_norm": 0.15353895723819733, + "learning_rate": 2.7397574398695473e-05, + "loss": 3.8617, + "step": 85520 + }, + { + "epoch": 5.8109118086696565, + "grad_norm": 0.18901678919792175, + "learning_rate": 2.7393327897812204e-05, + "loss": 4.1815, + "step": 85525 + }, + { + "epoch": 5.811251528740318, + "grad_norm": 0.1715831458568573, + "learning_rate": 2.7389081396928932e-05, + "loss": 3.9611, + "step": 85530 + }, + { + "epoch": 5.81159124881098, + "grad_norm": 0.28715717792510986, + "learning_rate": 2.7384834896045657e-05, + "loss": 4.0137, + "step": 85535 + }, + { + "epoch": 5.811930968881642, + "grad_norm": 0.5632097125053406, + "learning_rate": 2.738058839516239e-05, + "loss": 3.7676, + "step": 85540 + }, + { + "epoch": 5.812270688952303, + "grad_norm": 0.16851402819156647, + "learning_rate": 2.7376341894279116e-05, + "loss": 3.7891, + "step": 85545 + }, + { + "epoch": 5.812610409022965, + "grad_norm": 0.1773560494184494, + "learning_rate": 2.737209539339584e-05, + "loss": 3.8757, + "step": 85550 + }, + { + "epoch": 5.812950129093627, + "grad_norm": 0.18698063492774963, + "learning_rate": 2.7367848892512573e-05, + "loss": 3.8047, + "step": 85555 + }, + { + "epoch": 5.813289849164288, + "grad_norm": 0.15276198089122772, + "learning_rate": 2.73636023916293e-05, + "loss": 3.8857, + "step": 85560 + }, + { + "epoch": 5.8136295692349504, + "grad_norm": 0.12368792295455933, + "learning_rate": 2.7359355890746025e-05, + "loss": 3.8307, + "step": 85565 + }, + { + "epoch": 5.8139692893056125, + "grad_norm": 0.16029351949691772, + "learning_rate": 2.7355109389862753e-05, + "loss": 3.8011, + "step": 85570 + }, + { + "epoch": 5.814309009376274, + "grad_norm": 0.16878169775009155, + "learning_rate": 2.7350862888979485e-05, + "loss": 3.8172, + "step": 85575 + }, + { + "epoch": 5.814648729446936, + "grad_norm": 0.32560163736343384, + "learning_rate": 2.734661638809621e-05, + "loss": 4.0436, + "step": 85580 + }, + { + "epoch": 5.814988449517598, + "grad_norm": 0.19081655144691467, + "learning_rate": 2.7342369887212937e-05, + "loss": 3.8044, + "step": 85585 + }, + { + "epoch": 5.815328169588259, + "grad_norm": 0.19229234755039215, + "learning_rate": 2.733812338632967e-05, + "loss": 3.5413, + "step": 85590 + }, + { + "epoch": 5.815667889658921, + "grad_norm": 0.17090603709220886, + "learning_rate": 2.733387688544639e-05, + "loss": 3.9569, + "step": 85595 + }, + { + "epoch": 5.816007609729583, + "grad_norm": 0.21244052052497864, + "learning_rate": 2.732963038456312e-05, + "loss": 3.8505, + "step": 85600 + }, + { + "epoch": 5.816347329800244, + "grad_norm": 0.1701357364654541, + "learning_rate": 2.732538388367985e-05, + "loss": 3.7532, + "step": 85605 + }, + { + "epoch": 5.8166870498709065, + "grad_norm": 0.22630909085273743, + "learning_rate": 2.7321137382796574e-05, + "loss": 3.6727, + "step": 85610 + }, + { + "epoch": 5.8170267699415685, + "grad_norm": 0.17914755642414093, + "learning_rate": 2.7316890881913305e-05, + "loss": 3.8028, + "step": 85615 + }, + { + "epoch": 5.81736649001223, + "grad_norm": 0.15875737369060516, + "learning_rate": 2.7312644381030033e-05, + "loss": 3.8787, + "step": 85620 + }, + { + "epoch": 5.817706210082892, + "grad_norm": 0.21747766435146332, + "learning_rate": 2.7308397880146758e-05, + "loss": 3.6986, + "step": 85625 + }, + { + "epoch": 5.818045930153554, + "grad_norm": 0.16756759583950043, + "learning_rate": 2.7304151379263486e-05, + "loss": 3.7948, + "step": 85630 + }, + { + "epoch": 5.818385650224215, + "grad_norm": 0.23364901542663574, + "learning_rate": 2.7299904878380217e-05, + "loss": 3.9174, + "step": 85635 + }, + { + "epoch": 5.818725370294877, + "grad_norm": 0.16944213211536407, + "learning_rate": 2.7295658377496945e-05, + "loss": 3.8744, + "step": 85640 + }, + { + "epoch": 5.819065090365539, + "grad_norm": 0.30442383885383606, + "learning_rate": 2.729141187661367e-05, + "loss": 3.7272, + "step": 85645 + }, + { + "epoch": 5.8194048104362, + "grad_norm": 0.1990485042333603, + "learning_rate": 2.72871653757304e-05, + "loss": 3.8824, + "step": 85650 + }, + { + "epoch": 5.8197445305068625, + "grad_norm": 0.12334448099136353, + "learning_rate": 2.728291887484713e-05, + "loss": 3.8332, + "step": 85655 + }, + { + "epoch": 5.8200842505775245, + "grad_norm": 0.19946257770061493, + "learning_rate": 2.7278672373963854e-05, + "loss": 3.989, + "step": 85660 + }, + { + "epoch": 5.820423970648186, + "grad_norm": 0.1605609506368637, + "learning_rate": 2.7274425873080582e-05, + "loss": 3.8754, + "step": 85665 + }, + { + "epoch": 5.820763690718848, + "grad_norm": 0.18615762889385223, + "learning_rate": 2.7270179372197313e-05, + "loss": 3.9702, + "step": 85670 + }, + { + "epoch": 5.82110341078951, + "grad_norm": 0.15320368111133575, + "learning_rate": 2.7265932871314038e-05, + "loss": 3.9067, + "step": 85675 + }, + { + "epoch": 5.821443130860171, + "grad_norm": 0.18667060136795044, + "learning_rate": 2.7261686370430766e-05, + "loss": 3.8836, + "step": 85680 + }, + { + "epoch": 5.821782850930833, + "grad_norm": 0.30539873242378235, + "learning_rate": 2.7257439869547497e-05, + "loss": 3.594, + "step": 85685 + }, + { + "epoch": 5.822122571001495, + "grad_norm": 0.16464465856552124, + "learning_rate": 2.7253193368664222e-05, + "loss": 3.6895, + "step": 85690 + }, + { + "epoch": 5.822462291072156, + "grad_norm": 0.16852517426013947, + "learning_rate": 2.724894686778095e-05, + "loss": 3.9347, + "step": 85695 + }, + { + "epoch": 5.8228020111428185, + "grad_norm": 0.802299439907074, + "learning_rate": 2.724470036689768e-05, + "loss": 3.7265, + "step": 85700 + }, + { + "epoch": 5.8231417312134806, + "grad_norm": 0.18265049159526825, + "learning_rate": 2.7240453866014403e-05, + "loss": 3.8225, + "step": 85705 + }, + { + "epoch": 5.823481451284142, + "grad_norm": 0.20388175547122955, + "learning_rate": 2.7236207365131134e-05, + "loss": 3.6471, + "step": 85710 + }, + { + "epoch": 5.823821171354804, + "grad_norm": 0.22656406462192535, + "learning_rate": 2.7231960864247862e-05, + "loss": 3.6616, + "step": 85715 + }, + { + "epoch": 5.824160891425466, + "grad_norm": 0.22337134182453156, + "learning_rate": 2.7227714363364587e-05, + "loss": 3.9431, + "step": 85720 + }, + { + "epoch": 5.824500611496127, + "grad_norm": 0.1988542228937149, + "learning_rate": 2.7223467862481318e-05, + "loss": 3.8622, + "step": 85725 + }, + { + "epoch": 5.824840331566789, + "grad_norm": 0.21512649953365326, + "learning_rate": 2.7219221361598046e-05, + "loss": 3.8565, + "step": 85730 + }, + { + "epoch": 5.825180051637451, + "grad_norm": 0.21047759056091309, + "learning_rate": 2.721497486071477e-05, + "loss": 3.527, + "step": 85735 + }, + { + "epoch": 5.825519771708112, + "grad_norm": 0.1897142231464386, + "learning_rate": 2.72107283598315e-05, + "loss": 3.7142, + "step": 85740 + }, + { + "epoch": 5.8258594917787745, + "grad_norm": 0.7756871581077576, + "learning_rate": 2.720648185894823e-05, + "loss": 3.9154, + "step": 85745 + }, + { + "epoch": 5.826199211849436, + "grad_norm": 0.3337222933769226, + "learning_rate": 2.7202235358064955e-05, + "loss": 3.8905, + "step": 85750 + }, + { + "epoch": 5.826538931920098, + "grad_norm": 0.21127568185329437, + "learning_rate": 2.7197988857181683e-05, + "loss": 3.6782, + "step": 85755 + }, + { + "epoch": 5.82687865199076, + "grad_norm": 0.1446215808391571, + "learning_rate": 2.7193742356298414e-05, + "loss": 3.709, + "step": 85760 + }, + { + "epoch": 5.827218372061421, + "grad_norm": 0.360937237739563, + "learning_rate": 2.7189495855415135e-05, + "loss": 3.9409, + "step": 85765 + }, + { + "epoch": 5.827558092132083, + "grad_norm": 0.2195359468460083, + "learning_rate": 2.7185249354531867e-05, + "loss": 3.6914, + "step": 85770 + }, + { + "epoch": 5.827897812202745, + "grad_norm": 0.1900080442428589, + "learning_rate": 2.7181002853648595e-05, + "loss": 3.7097, + "step": 85775 + }, + { + "epoch": 5.828237532273406, + "grad_norm": 0.19466380774974823, + "learning_rate": 2.717675635276532e-05, + "loss": 3.7817, + "step": 85780 + }, + { + "epoch": 5.828577252344068, + "grad_norm": 0.1570495069026947, + "learning_rate": 2.717250985188205e-05, + "loss": 3.8535, + "step": 85785 + }, + { + "epoch": 5.8289169724147305, + "grad_norm": 0.18056011199951172, + "learning_rate": 2.716826335099878e-05, + "loss": 3.8884, + "step": 85790 + }, + { + "epoch": 5.829256692485392, + "grad_norm": 0.12634776532649994, + "learning_rate": 2.7164016850115503e-05, + "loss": 3.8962, + "step": 85795 + }, + { + "epoch": 5.829596412556054, + "grad_norm": 0.1716940850019455, + "learning_rate": 2.7159770349232235e-05, + "loss": 3.7219, + "step": 85800 + }, + { + "epoch": 5.829936132626716, + "grad_norm": 0.24769437313079834, + "learning_rate": 2.7155523848348963e-05, + "loss": 3.7916, + "step": 85805 + }, + { + "epoch": 5.830275852697377, + "grad_norm": 0.1582348793745041, + "learning_rate": 2.715127734746569e-05, + "loss": 3.9529, + "step": 85810 + }, + { + "epoch": 5.830615572768039, + "grad_norm": 0.18998056650161743, + "learning_rate": 2.7147030846582415e-05, + "loss": 3.6497, + "step": 85815 + }, + { + "epoch": 5.830955292838701, + "grad_norm": 0.18606114387512207, + "learning_rate": 2.7142784345699147e-05, + "loss": 3.8455, + "step": 85820 + }, + { + "epoch": 5.831295012909362, + "grad_norm": 0.18265022337436676, + "learning_rate": 2.7138537844815875e-05, + "loss": 3.8696, + "step": 85825 + }, + { + "epoch": 5.831634732980024, + "grad_norm": 0.5466532111167908, + "learning_rate": 2.71342913439326e-05, + "loss": 3.9323, + "step": 85830 + }, + { + "epoch": 5.8319744530506865, + "grad_norm": 3.0889503955841064, + "learning_rate": 2.713004484304933e-05, + "loss": 4.0597, + "step": 85835 + }, + { + "epoch": 5.832314173121348, + "grad_norm": 0.1945502907037735, + "learning_rate": 2.712579834216606e-05, + "loss": 3.9399, + "step": 85840 + }, + { + "epoch": 5.83265389319201, + "grad_norm": 0.18493704497814178, + "learning_rate": 2.7121551841282783e-05, + "loss": 3.8539, + "step": 85845 + }, + { + "epoch": 5.832993613262672, + "grad_norm": 0.20355366170406342, + "learning_rate": 2.711730534039951e-05, + "loss": 3.8163, + "step": 85850 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.16129052639007568, + "learning_rate": 2.7113058839516243e-05, + "loss": 4.0064, + "step": 85855 + }, + { + "epoch": 5.833673053403995, + "grad_norm": 1.4826200008392334, + "learning_rate": 2.7108812338632967e-05, + "loss": 3.9654, + "step": 85860 + }, + { + "epoch": 5.834012773474657, + "grad_norm": 0.17972390353679657, + "learning_rate": 2.7104565837749695e-05, + "loss": 3.8194, + "step": 85865 + }, + { + "epoch": 5.834352493545318, + "grad_norm": 0.16790738701820374, + "learning_rate": 2.7100319336866427e-05, + "loss": 3.8735, + "step": 85870 + }, + { + "epoch": 5.8346922136159804, + "grad_norm": 0.17138949036598206, + "learning_rate": 2.7096072835983148e-05, + "loss": 3.7272, + "step": 85875 + }, + { + "epoch": 5.8350319336866425, + "grad_norm": 0.16943199932575226, + "learning_rate": 2.709182633509988e-05, + "loss": 3.7259, + "step": 85880 + }, + { + "epoch": 5.835371653757304, + "grad_norm": 0.14261753857135773, + "learning_rate": 2.7087579834216607e-05, + "loss": 3.708, + "step": 85885 + }, + { + "epoch": 5.835711373827966, + "grad_norm": 0.61725252866745, + "learning_rate": 2.7083333333333332e-05, + "loss": 3.9235, + "step": 85890 + }, + { + "epoch": 5.836051093898628, + "grad_norm": 0.17391644418239594, + "learning_rate": 2.7079086832450063e-05, + "loss": 3.7388, + "step": 85895 + }, + { + "epoch": 5.836390813969289, + "grad_norm": 0.1770203858613968, + "learning_rate": 2.707484033156679e-05, + "loss": 3.702, + "step": 85900 + }, + { + "epoch": 5.836730534039951, + "grad_norm": 0.18225792050361633, + "learning_rate": 2.7070593830683516e-05, + "loss": 3.7788, + "step": 85905 + }, + { + "epoch": 5.837070254110613, + "grad_norm": 0.18126124143600464, + "learning_rate": 2.7066347329800244e-05, + "loss": 3.8767, + "step": 85910 + }, + { + "epoch": 5.837409974181274, + "grad_norm": 0.14765402674674988, + "learning_rate": 2.7062100828916976e-05, + "loss": 3.7247, + "step": 85915 + }, + { + "epoch": 5.8377496942519365, + "grad_norm": 0.17235895991325378, + "learning_rate": 2.70578543280337e-05, + "loss": 3.947, + "step": 85920 + }, + { + "epoch": 5.8380894143225985, + "grad_norm": 0.17129522562026978, + "learning_rate": 2.7053607827150428e-05, + "loss": 3.8582, + "step": 85925 + }, + { + "epoch": 5.83842913439326, + "grad_norm": 0.15529416501522064, + "learning_rate": 2.704936132626716e-05, + "loss": 3.9947, + "step": 85930 + }, + { + "epoch": 5.838768854463922, + "grad_norm": 0.24690331518650055, + "learning_rate": 2.7045114825383884e-05, + "loss": 3.5825, + "step": 85935 + }, + { + "epoch": 5.839108574534584, + "grad_norm": 0.14335855841636658, + "learning_rate": 2.7040868324500612e-05, + "loss": 3.606, + "step": 85940 + }, + { + "epoch": 5.839448294605245, + "grad_norm": 0.1645924150943756, + "learning_rate": 2.703662182361734e-05, + "loss": 3.7857, + "step": 85945 + }, + { + "epoch": 5.839788014675907, + "grad_norm": 0.18148915469646454, + "learning_rate": 2.7032375322734065e-05, + "loss": 3.9942, + "step": 85950 + }, + { + "epoch": 5.840127734746569, + "grad_norm": 0.4099326729774475, + "learning_rate": 2.7028128821850796e-05, + "loss": 3.9473, + "step": 85955 + }, + { + "epoch": 5.84046745481723, + "grad_norm": 0.18575522303581238, + "learning_rate": 2.7023882320967524e-05, + "loss": 3.6911, + "step": 85960 + }, + { + "epoch": 5.8408071748878925, + "grad_norm": 0.17342300713062286, + "learning_rate": 2.701963582008425e-05, + "loss": 3.7088, + "step": 85965 + }, + { + "epoch": 5.841146894958554, + "grad_norm": 0.20476987957954407, + "learning_rate": 2.701538931920098e-05, + "loss": 3.6186, + "step": 85970 + }, + { + "epoch": 5.841486615029216, + "grad_norm": 0.16050031781196594, + "learning_rate": 2.7011142818317708e-05, + "loss": 3.8602, + "step": 85975 + }, + { + "epoch": 5.841826335099878, + "grad_norm": 0.1550615280866623, + "learning_rate": 2.700689631743444e-05, + "loss": 3.6944, + "step": 85980 + }, + { + "epoch": 5.842166055170539, + "grad_norm": 0.18298496305942535, + "learning_rate": 2.700264981655116e-05, + "loss": 4.2614, + "step": 85985 + }, + { + "epoch": 5.842505775241201, + "grad_norm": 0.2223220020532608, + "learning_rate": 2.6998403315667892e-05, + "loss": 3.9849, + "step": 85990 + }, + { + "epoch": 5.842845495311863, + "grad_norm": 0.1385788470506668, + "learning_rate": 2.699415681478462e-05, + "loss": 3.8081, + "step": 85995 + }, + { + "epoch": 5.843185215382524, + "grad_norm": 0.221363827586174, + "learning_rate": 2.6989910313901345e-05, + "loss": 3.7253, + "step": 86000 + }, + { + "epoch": 5.843524935453186, + "grad_norm": 0.1786593347787857, + "learning_rate": 2.6985663813018076e-05, + "loss": 3.7715, + "step": 86005 + }, + { + "epoch": 5.8438646555238485, + "grad_norm": 0.20420637726783752, + "learning_rate": 2.6981417312134804e-05, + "loss": 3.8461, + "step": 86010 + }, + { + "epoch": 5.84420437559451, + "grad_norm": 0.1929236203432083, + "learning_rate": 2.697717081125153e-05, + "loss": 3.9105, + "step": 86015 + }, + { + "epoch": 5.844544095665172, + "grad_norm": 0.2291848063468933, + "learning_rate": 2.6972924310368257e-05, + "loss": 3.723, + "step": 86020 + }, + { + "epoch": 5.844883815735834, + "grad_norm": 0.20692448318004608, + "learning_rate": 2.6968677809484988e-05, + "loss": 3.9896, + "step": 86025 + }, + { + "epoch": 5.845223535806495, + "grad_norm": 0.2099495828151703, + "learning_rate": 2.6964431308601713e-05, + "loss": 4.1392, + "step": 86030 + }, + { + "epoch": 5.845563255877157, + "grad_norm": 0.18211278319358826, + "learning_rate": 2.696018480771844e-05, + "loss": 3.8252, + "step": 86035 + }, + { + "epoch": 5.845902975947819, + "grad_norm": 0.23627498745918274, + "learning_rate": 2.6955938306835172e-05, + "loss": 3.8835, + "step": 86040 + }, + { + "epoch": 5.84624269601848, + "grad_norm": 0.24722984433174133, + "learning_rate": 2.6951691805951894e-05, + "loss": 3.7631, + "step": 86045 + }, + { + "epoch": 5.846582416089142, + "grad_norm": 0.19254888594150543, + "learning_rate": 2.6947445305068625e-05, + "loss": 3.7723, + "step": 86050 + }, + { + "epoch": 5.8469221361598045, + "grad_norm": 0.16459116339683533, + "learning_rate": 2.6943198804185353e-05, + "loss": 3.6914, + "step": 86055 + }, + { + "epoch": 5.847261856230466, + "grad_norm": 0.16597925126552582, + "learning_rate": 2.6938952303302078e-05, + "loss": 3.8789, + "step": 86060 + }, + { + "epoch": 5.847601576301128, + "grad_norm": 0.16691188514232635, + "learning_rate": 2.693470580241881e-05, + "loss": 3.6693, + "step": 86065 + }, + { + "epoch": 5.84794129637179, + "grad_norm": 0.17839497327804565, + "learning_rate": 2.6930459301535537e-05, + "loss": 3.6197, + "step": 86070 + }, + { + "epoch": 5.848281016442451, + "grad_norm": 0.204807847738266, + "learning_rate": 2.692621280065226e-05, + "loss": 3.8423, + "step": 86075 + }, + { + "epoch": 5.848620736513113, + "grad_norm": 0.15518276393413544, + "learning_rate": 2.6921966299768993e-05, + "loss": 3.656, + "step": 86080 + }, + { + "epoch": 5.848960456583775, + "grad_norm": 0.16258761286735535, + "learning_rate": 2.691771979888572e-05, + "loss": 3.8464, + "step": 86085 + }, + { + "epoch": 5.849300176654436, + "grad_norm": 0.15729723870754242, + "learning_rate": 2.6913473298002446e-05, + "loss": 3.8583, + "step": 86090 + }, + { + "epoch": 5.849639896725098, + "grad_norm": 0.20391075313091278, + "learning_rate": 2.6909226797119174e-05, + "loss": 3.7147, + "step": 86095 + }, + { + "epoch": 5.8499796167957605, + "grad_norm": 0.23973993957042694, + "learning_rate": 2.6904980296235905e-05, + "loss": 3.6452, + "step": 86100 + }, + { + "epoch": 5.850319336866422, + "grad_norm": 0.14759467542171478, + "learning_rate": 2.690073379535263e-05, + "loss": 3.7891, + "step": 86105 + }, + { + "epoch": 5.850659056937084, + "grad_norm": 0.1691329926252365, + "learning_rate": 2.6896487294469358e-05, + "loss": 3.6818, + "step": 86110 + }, + { + "epoch": 5.850998777007746, + "grad_norm": 0.15408280491828918, + "learning_rate": 2.689224079358609e-05, + "loss": 3.6304, + "step": 86115 + }, + { + "epoch": 5.851338497078407, + "grad_norm": 0.20940853655338287, + "learning_rate": 2.688799429270281e-05, + "loss": 3.7961, + "step": 86120 + }, + { + "epoch": 5.851678217149069, + "grad_norm": 0.5085872411727905, + "learning_rate": 2.688374779181954e-05, + "loss": 3.6084, + "step": 86125 + }, + { + "epoch": 5.852017937219731, + "grad_norm": 0.1626625955104828, + "learning_rate": 2.687950129093627e-05, + "loss": 3.9963, + "step": 86130 + }, + { + "epoch": 5.852357657290392, + "grad_norm": 0.19384551048278809, + "learning_rate": 2.6875254790052994e-05, + "loss": 3.7767, + "step": 86135 + }, + { + "epoch": 5.852697377361054, + "grad_norm": 0.24090291559696198, + "learning_rate": 2.6871008289169726e-05, + "loss": 3.7941, + "step": 86140 + }, + { + "epoch": 5.8530370974317165, + "grad_norm": 0.1349765807390213, + "learning_rate": 2.6866761788286454e-05, + "loss": 3.8841, + "step": 86145 + }, + { + "epoch": 5.853376817502378, + "grad_norm": 0.14648309350013733, + "learning_rate": 2.6862515287403185e-05, + "loss": 3.8465, + "step": 86150 + }, + { + "epoch": 5.85371653757304, + "grad_norm": 0.17657168209552765, + "learning_rate": 2.6858268786519906e-05, + "loss": 3.8577, + "step": 86155 + }, + { + "epoch": 5.854056257643702, + "grad_norm": 0.4061316251754761, + "learning_rate": 2.6854022285636638e-05, + "loss": 3.6067, + "step": 86160 + }, + { + "epoch": 5.854395977714363, + "grad_norm": 0.15924684703350067, + "learning_rate": 2.6849775784753366e-05, + "loss": 3.9479, + "step": 86165 + }, + { + "epoch": 5.854735697785025, + "grad_norm": 0.445734441280365, + "learning_rate": 2.684552928387009e-05, + "loss": 3.8432, + "step": 86170 + }, + { + "epoch": 5.855075417855687, + "grad_norm": 0.1919630616903305, + "learning_rate": 2.6841282782986822e-05, + "loss": 3.8132, + "step": 86175 + }, + { + "epoch": 5.855415137926348, + "grad_norm": 0.8721786737442017, + "learning_rate": 2.683703628210355e-05, + "loss": 3.8902, + "step": 86180 + }, + { + "epoch": 5.8557548579970105, + "grad_norm": 0.17866311967372894, + "learning_rate": 2.6832789781220274e-05, + "loss": 3.8595, + "step": 86185 + }, + { + "epoch": 5.8560945780676725, + "grad_norm": 0.2041396051645279, + "learning_rate": 2.6828543280337002e-05, + "loss": 3.8953, + "step": 86190 + }, + { + "epoch": 5.856434298138334, + "grad_norm": 0.1275649219751358, + "learning_rate": 2.6824296779453734e-05, + "loss": 3.859, + "step": 86195 + }, + { + "epoch": 5.856774018208996, + "grad_norm": 0.1573132574558258, + "learning_rate": 2.682005027857046e-05, + "loss": 3.8704, + "step": 86200 + }, + { + "epoch": 5.857113738279658, + "grad_norm": 0.20413312315940857, + "learning_rate": 2.6815803777687186e-05, + "loss": 3.8409, + "step": 86205 + }, + { + "epoch": 5.857453458350319, + "grad_norm": 0.2606233060359955, + "learning_rate": 2.6811557276803918e-05, + "loss": 4.037, + "step": 86210 + }, + { + "epoch": 5.857793178420981, + "grad_norm": 0.18563184142112732, + "learning_rate": 2.6807310775920642e-05, + "loss": 3.8178, + "step": 86215 + }, + { + "epoch": 5.858132898491643, + "grad_norm": 0.1484459638595581, + "learning_rate": 2.680306427503737e-05, + "loss": 3.8041, + "step": 86220 + }, + { + "epoch": 5.858472618562304, + "grad_norm": 0.15661384165287018, + "learning_rate": 2.6798817774154102e-05, + "loss": 3.6758, + "step": 86225 + }, + { + "epoch": 5.8588123386329665, + "grad_norm": 0.2010364681482315, + "learning_rate": 2.6794571273270823e-05, + "loss": 3.939, + "step": 86230 + }, + { + "epoch": 5.8591520587036285, + "grad_norm": 0.163077712059021, + "learning_rate": 2.6790324772387554e-05, + "loss": 3.5902, + "step": 86235 + }, + { + "epoch": 5.85949177877429, + "grad_norm": 0.2126501500606537, + "learning_rate": 2.6786078271504282e-05, + "loss": 3.7306, + "step": 86240 + }, + { + "epoch": 5.859831498844952, + "grad_norm": 0.16564001142978668, + "learning_rate": 2.6781831770621007e-05, + "loss": 3.9127, + "step": 86245 + }, + { + "epoch": 5.860171218915614, + "grad_norm": 0.21080917119979858, + "learning_rate": 2.677758526973774e-05, + "loss": 3.8219, + "step": 86250 + }, + { + "epoch": 5.860510938986275, + "grad_norm": 0.2869647443294525, + "learning_rate": 2.6773338768854466e-05, + "loss": 3.6474, + "step": 86255 + }, + { + "epoch": 5.860850659056937, + "grad_norm": 0.18856950104236603, + "learning_rate": 2.676909226797119e-05, + "loss": 3.681, + "step": 86260 + }, + { + "epoch": 5.861190379127599, + "grad_norm": 0.13774839043617249, + "learning_rate": 2.676484576708792e-05, + "loss": 3.8462, + "step": 86265 + }, + { + "epoch": 5.86153009919826, + "grad_norm": 0.617302656173706, + "learning_rate": 2.676059926620465e-05, + "loss": 3.852, + "step": 86270 + }, + { + "epoch": 5.8618698192689225, + "grad_norm": 0.1693350225687027, + "learning_rate": 2.6756352765321375e-05, + "loss": 3.8064, + "step": 86275 + }, + { + "epoch": 5.8622095393395846, + "grad_norm": 0.14237913489341736, + "learning_rate": 2.6752106264438103e-05, + "loss": 3.8967, + "step": 86280 + }, + { + "epoch": 5.862549259410246, + "grad_norm": 0.1793428510427475, + "learning_rate": 2.6747859763554835e-05, + "loss": 3.3674, + "step": 86285 + }, + { + "epoch": 5.862888979480908, + "grad_norm": 0.14303651452064514, + "learning_rate": 2.6743613262671556e-05, + "loss": 3.6113, + "step": 86290 + }, + { + "epoch": 5.86322869955157, + "grad_norm": 0.15545544028282166, + "learning_rate": 2.6739366761788287e-05, + "loss": 3.9736, + "step": 86295 + }, + { + "epoch": 5.863568419622231, + "grad_norm": 0.14474375545978546, + "learning_rate": 2.6735120260905015e-05, + "loss": 3.5469, + "step": 86300 + }, + { + "epoch": 5.863908139692893, + "grad_norm": 0.18190449476242065, + "learning_rate": 2.673087376002174e-05, + "loss": 3.9103, + "step": 86305 + }, + { + "epoch": 5.864247859763555, + "grad_norm": 0.18208901584148407, + "learning_rate": 2.672662725913847e-05, + "loss": 4.1245, + "step": 86310 + }, + { + "epoch": 5.864587579834216, + "grad_norm": 0.15767896175384521, + "learning_rate": 2.67223807582552e-05, + "loss": 4.077, + "step": 86315 + }, + { + "epoch": 5.8649272999048785, + "grad_norm": 0.43518757820129395, + "learning_rate": 2.671813425737193e-05, + "loss": 3.9688, + "step": 86320 + }, + { + "epoch": 5.865267019975541, + "grad_norm": 0.16792204976081848, + "learning_rate": 2.6713887756488655e-05, + "loss": 3.652, + "step": 86325 + }, + { + "epoch": 5.865606740046202, + "grad_norm": 0.1610514223575592, + "learning_rate": 2.6709641255605383e-05, + "loss": 3.916, + "step": 86330 + }, + { + "epoch": 5.865946460116864, + "grad_norm": 0.18029950559139252, + "learning_rate": 2.670539475472211e-05, + "loss": 3.7556, + "step": 86335 + }, + { + "epoch": 5.866286180187526, + "grad_norm": 0.1986013799905777, + "learning_rate": 2.6701148253838836e-05, + "loss": 3.96, + "step": 86340 + }, + { + "epoch": 5.866625900258187, + "grad_norm": 0.29944953322410583, + "learning_rate": 2.6696901752955567e-05, + "loss": 3.4673, + "step": 86345 + }, + { + "epoch": 5.866965620328849, + "grad_norm": 0.18132317066192627, + "learning_rate": 2.6692655252072295e-05, + "loss": 3.6828, + "step": 86350 + }, + { + "epoch": 5.867305340399511, + "grad_norm": 0.2417098432779312, + "learning_rate": 2.668840875118902e-05, + "loss": 3.7108, + "step": 86355 + }, + { + "epoch": 5.867645060470172, + "grad_norm": 0.1663130223751068, + "learning_rate": 2.668416225030575e-05, + "loss": 3.8147, + "step": 86360 + }, + { + "epoch": 5.8679847805408345, + "grad_norm": 0.18155092000961304, + "learning_rate": 2.667991574942248e-05, + "loss": 3.9282, + "step": 86365 + }, + { + "epoch": 5.868324500611497, + "grad_norm": 0.12931491434574127, + "learning_rate": 2.6675669248539204e-05, + "loss": 3.794, + "step": 86370 + }, + { + "epoch": 5.868664220682158, + "grad_norm": 0.15801021456718445, + "learning_rate": 2.6671422747655932e-05, + "loss": 4.0852, + "step": 86375 + }, + { + "epoch": 5.86900394075282, + "grad_norm": 0.19838188588619232, + "learning_rate": 2.6667176246772663e-05, + "loss": 3.7719, + "step": 86380 + }, + { + "epoch": 5.869343660823482, + "grad_norm": 0.1754210740327835, + "learning_rate": 2.6662929745889388e-05, + "loss": 3.7283, + "step": 86385 + }, + { + "epoch": 5.869683380894143, + "grad_norm": 0.1711631566286087, + "learning_rate": 2.6658683245006116e-05, + "loss": 3.9559, + "step": 86390 + }, + { + "epoch": 5.870023100964805, + "grad_norm": 0.28646618127822876, + "learning_rate": 2.6654436744122847e-05, + "loss": 3.7555, + "step": 86395 + }, + { + "epoch": 5.870362821035467, + "grad_norm": 0.19164219498634338, + "learning_rate": 2.665019024323957e-05, + "loss": 3.8516, + "step": 86400 + }, + { + "epoch": 5.870702541106128, + "grad_norm": 0.1788719743490219, + "learning_rate": 2.66459437423563e-05, + "loss": 3.7013, + "step": 86405 + }, + { + "epoch": 5.8710422611767905, + "grad_norm": 0.14959374070167542, + "learning_rate": 2.6641697241473028e-05, + "loss": 3.8642, + "step": 86410 + }, + { + "epoch": 5.871381981247453, + "grad_norm": 0.1406276375055313, + "learning_rate": 2.663830004076641e-05, + "loss": 3.8868, + "step": 86415 + }, + { + "epoch": 5.871721701318114, + "grad_norm": 0.25170308351516724, + "learning_rate": 2.6634053539883135e-05, + "loss": 3.8436, + "step": 86420 + }, + { + "epoch": 5.872061421388776, + "grad_norm": 0.19637854397296906, + "learning_rate": 2.6629807038999866e-05, + "loss": 3.6442, + "step": 86425 + }, + { + "epoch": 5.872401141459437, + "grad_norm": 1.8160128593444824, + "learning_rate": 2.6625560538116594e-05, + "loss": 3.8675, + "step": 86430 + }, + { + "epoch": 5.872740861530099, + "grad_norm": 0.15775367617607117, + "learning_rate": 2.662131403723332e-05, + "loss": 3.9279, + "step": 86435 + }, + { + "epoch": 5.873080581600761, + "grad_norm": 0.19587969779968262, + "learning_rate": 2.661706753635005e-05, + "loss": 3.5808, + "step": 86440 + }, + { + "epoch": 5.873420301671422, + "grad_norm": 0.24452145397663116, + "learning_rate": 2.661282103546678e-05, + "loss": 3.8604, + "step": 86445 + }, + { + "epoch": 5.8737600217420844, + "grad_norm": 0.2037481814622879, + "learning_rate": 2.6608574534583503e-05, + "loss": 4.1428, + "step": 86450 + }, + { + "epoch": 5.8740997418127465, + "grad_norm": 0.211351677775383, + "learning_rate": 2.660432803370023e-05, + "loss": 3.7384, + "step": 86455 + }, + { + "epoch": 5.874439461883408, + "grad_norm": 0.16104501485824585, + "learning_rate": 2.6600081532816962e-05, + "loss": 3.8992, + "step": 86460 + }, + { + "epoch": 5.87477918195407, + "grad_norm": 0.17681393027305603, + "learning_rate": 2.6595835031933687e-05, + "loss": 3.7817, + "step": 86465 + }, + { + "epoch": 5.875118902024732, + "grad_norm": 0.1691887080669403, + "learning_rate": 2.6591588531050415e-05, + "loss": 3.8737, + "step": 86470 + }, + { + "epoch": 5.875458622095393, + "grad_norm": 0.144912987947464, + "learning_rate": 2.6587342030167146e-05, + "loss": 3.9276, + "step": 86475 + }, + { + "epoch": 5.875798342166055, + "grad_norm": 0.1256876289844513, + "learning_rate": 2.6583095529283868e-05, + "loss": 3.7671, + "step": 86480 + }, + { + "epoch": 5.876138062236717, + "grad_norm": 0.15306523442268372, + "learning_rate": 2.65788490284006e-05, + "loss": 3.7435, + "step": 86485 + }, + { + "epoch": 5.876477782307378, + "grad_norm": 0.1833777278661728, + "learning_rate": 2.6574602527517327e-05, + "loss": 3.7191, + "step": 86490 + }, + { + "epoch": 5.8768175023780405, + "grad_norm": 0.17016828060150146, + "learning_rate": 2.6570356026634052e-05, + "loss": 3.6893, + "step": 86495 + }, + { + "epoch": 5.8771572224487025, + "grad_norm": 0.541545033454895, + "learning_rate": 2.6566109525750783e-05, + "loss": 3.9438, + "step": 86500 + }, + { + "epoch": 5.877496942519364, + "grad_norm": 0.2509545385837555, + "learning_rate": 2.656186302486751e-05, + "loss": 3.6892, + "step": 86505 + }, + { + "epoch": 5.877836662590026, + "grad_norm": 0.16653013229370117, + "learning_rate": 2.6557616523984236e-05, + "loss": 3.7278, + "step": 86510 + }, + { + "epoch": 5.878176382660688, + "grad_norm": 0.17900952696800232, + "learning_rate": 2.6553370023100964e-05, + "loss": 3.8633, + "step": 86515 + }, + { + "epoch": 5.878516102731349, + "grad_norm": 0.22760224342346191, + "learning_rate": 2.6549123522217695e-05, + "loss": 4.0419, + "step": 86520 + }, + { + "epoch": 5.878855822802011, + "grad_norm": 0.1725781261920929, + "learning_rate": 2.6544877021334423e-05, + "loss": 3.8823, + "step": 86525 + }, + { + "epoch": 5.879195542872673, + "grad_norm": 0.16393448412418365, + "learning_rate": 2.6540630520451148e-05, + "loss": 3.5535, + "step": 86530 + }, + { + "epoch": 5.879535262943334, + "grad_norm": 0.16440841555595398, + "learning_rate": 2.653638401956788e-05, + "loss": 3.8322, + "step": 86535 + }, + { + "epoch": 5.8798749830139965, + "grad_norm": 0.188571497797966, + "learning_rate": 2.6532137518684607e-05, + "loss": 3.9124, + "step": 86540 + }, + { + "epoch": 5.8802147030846585, + "grad_norm": 0.1590401530265808, + "learning_rate": 2.6527891017801332e-05, + "loss": 3.6561, + "step": 86545 + }, + { + "epoch": 5.88055442315532, + "grad_norm": 0.17461270093917847, + "learning_rate": 2.652364451691806e-05, + "loss": 3.8923, + "step": 86550 + }, + { + "epoch": 5.880894143225982, + "grad_norm": 0.21817077696323395, + "learning_rate": 2.651939801603479e-05, + "loss": 3.8346, + "step": 86555 + }, + { + "epoch": 5.881233863296644, + "grad_norm": 0.1836916208267212, + "learning_rate": 2.6515151515151516e-05, + "loss": 3.8633, + "step": 86560 + }, + { + "epoch": 5.881573583367305, + "grad_norm": 0.1989801973104477, + "learning_rate": 2.6510905014268244e-05, + "loss": 3.8529, + "step": 86565 + }, + { + "epoch": 5.881913303437967, + "grad_norm": 0.15225712954998016, + "learning_rate": 2.6506658513384975e-05, + "loss": 3.8962, + "step": 86570 + }, + { + "epoch": 5.882253023508629, + "grad_norm": 0.1929301619529724, + "learning_rate": 2.65024120125017e-05, + "loss": 3.798, + "step": 86575 + }, + { + "epoch": 5.88259274357929, + "grad_norm": 0.1608058512210846, + "learning_rate": 2.6498165511618428e-05, + "loss": 3.8134, + "step": 86580 + }, + { + "epoch": 5.8829324636499525, + "grad_norm": 0.16699087619781494, + "learning_rate": 2.649391901073516e-05, + "loss": 4.0504, + "step": 86585 + }, + { + "epoch": 5.883272183720615, + "grad_norm": 0.2959393560886383, + "learning_rate": 2.648967250985188e-05, + "loss": 3.7976, + "step": 86590 + }, + { + "epoch": 5.883611903791276, + "grad_norm": 0.21304723620414734, + "learning_rate": 2.6485426008968612e-05, + "loss": 3.621, + "step": 86595 + }, + { + "epoch": 5.883951623861938, + "grad_norm": 0.17125318944454193, + "learning_rate": 2.648117950808534e-05, + "loss": 3.8265, + "step": 86600 + }, + { + "epoch": 5.8842913439326, + "grad_norm": 0.2076544463634491, + "learning_rate": 2.6476933007202064e-05, + "loss": 3.7544, + "step": 86605 + }, + { + "epoch": 5.884631064003261, + "grad_norm": 0.8181946873664856, + "learning_rate": 2.6472686506318796e-05, + "loss": 3.5519, + "step": 86610 + }, + { + "epoch": 5.884970784073923, + "grad_norm": 0.31852057576179504, + "learning_rate": 2.6468440005435524e-05, + "loss": 4.014, + "step": 86615 + }, + { + "epoch": 5.885310504144585, + "grad_norm": 0.2323046326637268, + "learning_rate": 2.646419350455225e-05, + "loss": 3.8227, + "step": 86620 + }, + { + "epoch": 5.885650224215246, + "grad_norm": 0.15708331763744354, + "learning_rate": 2.6459947003668977e-05, + "loss": 3.7741, + "step": 86625 + }, + { + "epoch": 5.8859899442859085, + "grad_norm": 0.1735091358423233, + "learning_rate": 2.6455700502785708e-05, + "loss": 3.8688, + "step": 86630 + }, + { + "epoch": 5.886329664356571, + "grad_norm": 0.6020544171333313, + "learning_rate": 2.6451454001902433e-05, + "loss": 3.6255, + "step": 86635 + }, + { + "epoch": 5.886669384427232, + "grad_norm": 0.17922312021255493, + "learning_rate": 2.644720750101916e-05, + "loss": 3.8034, + "step": 86640 + }, + { + "epoch": 5.887009104497894, + "grad_norm": 0.2131078988313675, + "learning_rate": 2.6442961000135892e-05, + "loss": 3.7969, + "step": 86645 + }, + { + "epoch": 5.887348824568555, + "grad_norm": 0.1590917557477951, + "learning_rate": 2.6438714499252613e-05, + "loss": 3.9876, + "step": 86650 + }, + { + "epoch": 5.887688544639217, + "grad_norm": 0.1616903841495514, + "learning_rate": 2.6434467998369345e-05, + "loss": 3.9094, + "step": 86655 + }, + { + "epoch": 5.888028264709879, + "grad_norm": 0.29358530044555664, + "learning_rate": 2.6430221497486073e-05, + "loss": 3.7678, + "step": 86660 + }, + { + "epoch": 5.88836798478054, + "grad_norm": 0.1754750907421112, + "learning_rate": 2.6425974996602797e-05, + "loss": 3.9888, + "step": 86665 + }, + { + "epoch": 5.888707704851202, + "grad_norm": 0.20568838715553284, + "learning_rate": 2.642172849571953e-05, + "loss": 3.7733, + "step": 86670 + }, + { + "epoch": 5.8890474249218645, + "grad_norm": 0.1885572224855423, + "learning_rate": 2.6417481994836257e-05, + "loss": 3.7936, + "step": 86675 + }, + { + "epoch": 5.889387144992526, + "grad_norm": 0.18719083070755005, + "learning_rate": 2.641323549395298e-05, + "loss": 3.6053, + "step": 86680 + }, + { + "epoch": 5.889726865063188, + "grad_norm": 0.17178823053836823, + "learning_rate": 2.6408988993069713e-05, + "loss": 3.9192, + "step": 86685 + }, + { + "epoch": 5.89006658513385, + "grad_norm": 0.15103763341903687, + "learning_rate": 2.640474249218644e-05, + "loss": 3.7713, + "step": 86690 + }, + { + "epoch": 5.890406305204511, + "grad_norm": 0.15557880699634552, + "learning_rate": 2.640049599130317e-05, + "loss": 3.7845, + "step": 86695 + }, + { + "epoch": 5.890746025275173, + "grad_norm": 0.176952064037323, + "learning_rate": 2.6396249490419893e-05, + "loss": 3.7023, + "step": 86700 + }, + { + "epoch": 5.891085745345835, + "grad_norm": 0.3113168179988861, + "learning_rate": 2.6392002989536625e-05, + "loss": 3.8503, + "step": 86705 + }, + { + "epoch": 5.891425465416496, + "grad_norm": 0.2545350193977356, + "learning_rate": 2.6387756488653353e-05, + "loss": 3.9034, + "step": 86710 + }, + { + "epoch": 5.891765185487158, + "grad_norm": 0.19711191952228546, + "learning_rate": 2.6383509987770077e-05, + "loss": 3.6781, + "step": 86715 + }, + { + "epoch": 5.8921049055578205, + "grad_norm": 0.17488013207912445, + "learning_rate": 2.637926348688681e-05, + "loss": 3.7429, + "step": 86720 + }, + { + "epoch": 5.892444625628482, + "grad_norm": 0.1874558925628662, + "learning_rate": 2.6375016986003537e-05, + "loss": 4.1493, + "step": 86725 + }, + { + "epoch": 5.892784345699144, + "grad_norm": 0.15737944841384888, + "learning_rate": 2.637077048512026e-05, + "loss": 3.8788, + "step": 86730 + }, + { + "epoch": 5.893124065769806, + "grad_norm": 0.16414709389209747, + "learning_rate": 2.636652398423699e-05, + "loss": 3.6891, + "step": 86735 + }, + { + "epoch": 5.893463785840467, + "grad_norm": 0.16441762447357178, + "learning_rate": 2.636227748335372e-05, + "loss": 4.0435, + "step": 86740 + }, + { + "epoch": 5.893803505911129, + "grad_norm": 0.2225388139486313, + "learning_rate": 2.6358030982470445e-05, + "loss": 3.6332, + "step": 86745 + }, + { + "epoch": 5.894143225981791, + "grad_norm": 0.3299490511417389, + "learning_rate": 2.6353784481587173e-05, + "loss": 3.8712, + "step": 86750 + }, + { + "epoch": 5.894482946052452, + "grad_norm": 0.16615936160087585, + "learning_rate": 2.6349537980703905e-05, + "loss": 3.8695, + "step": 86755 + }, + { + "epoch": 5.8948226661231145, + "grad_norm": 0.1374998688697815, + "learning_rate": 2.6345291479820626e-05, + "loss": 3.9316, + "step": 86760 + }, + { + "epoch": 5.8951623861937765, + "grad_norm": 0.21304123103618622, + "learning_rate": 2.6341044978937357e-05, + "loss": 3.5508, + "step": 86765 + }, + { + "epoch": 5.895502106264438, + "grad_norm": 0.1492086797952652, + "learning_rate": 2.6336798478054085e-05, + "loss": 3.8972, + "step": 86770 + }, + { + "epoch": 5.8958418263351, + "grad_norm": 0.22229550778865814, + "learning_rate": 2.633255197717081e-05, + "loss": 3.8405, + "step": 86775 + }, + { + "epoch": 5.896181546405762, + "grad_norm": 0.17416226863861084, + "learning_rate": 2.632830547628754e-05, + "loss": 3.8668, + "step": 86780 + }, + { + "epoch": 5.896521266476423, + "grad_norm": 0.21257896721363068, + "learning_rate": 2.632405897540427e-05, + "loss": 3.8961, + "step": 86785 + }, + { + "epoch": 5.896860986547085, + "grad_norm": 0.19858266413211823, + "learning_rate": 2.6319812474520994e-05, + "loss": 3.8429, + "step": 86790 + }, + { + "epoch": 5.897200706617747, + "grad_norm": 0.18171623349189758, + "learning_rate": 2.6315565973637722e-05, + "loss": 3.6761, + "step": 86795 + }, + { + "epoch": 5.897540426688408, + "grad_norm": 0.17758311331272125, + "learning_rate": 2.6311319472754453e-05, + "loss": 3.8372, + "step": 86800 + }, + { + "epoch": 5.8978801467590705, + "grad_norm": 0.22095882892608643, + "learning_rate": 2.6307072971871178e-05, + "loss": 3.9479, + "step": 86805 + }, + { + "epoch": 5.8982198668297325, + "grad_norm": 0.20379185676574707, + "learning_rate": 2.6302826470987906e-05, + "loss": 3.8714, + "step": 86810 + }, + { + "epoch": 5.898559586900394, + "grad_norm": 0.1484268754720688, + "learning_rate": 2.6298579970104637e-05, + "loss": 3.809, + "step": 86815 + }, + { + "epoch": 5.898899306971056, + "grad_norm": 0.1465691775083542, + "learning_rate": 2.6294333469221362e-05, + "loss": 3.9657, + "step": 86820 + }, + { + "epoch": 5.899239027041718, + "grad_norm": 0.27397364377975464, + "learning_rate": 2.629008696833809e-05, + "loss": 3.9438, + "step": 86825 + }, + { + "epoch": 5.899578747112379, + "grad_norm": 0.1861005276441574, + "learning_rate": 2.628584046745482e-05, + "loss": 3.7582, + "step": 86830 + }, + { + "epoch": 5.899918467183041, + "grad_norm": 0.154975026845932, + "learning_rate": 2.6281593966571543e-05, + "loss": 3.6717, + "step": 86835 + }, + { + "epoch": 5.900258187253703, + "grad_norm": 0.1981004923582077, + "learning_rate": 2.6277347465688274e-05, + "loss": 3.7983, + "step": 86840 + }, + { + "epoch": 5.900597907324364, + "grad_norm": 0.19562003016471863, + "learning_rate": 2.6273100964805002e-05, + "loss": 3.9566, + "step": 86845 + }, + { + "epoch": 5.9009376273950265, + "grad_norm": 0.15287138521671295, + "learning_rate": 2.6268854463921727e-05, + "loss": 3.7485, + "step": 86850 + }, + { + "epoch": 5.9012773474656885, + "grad_norm": 0.22392642498016357, + "learning_rate": 2.6264607963038458e-05, + "loss": 3.5912, + "step": 86855 + }, + { + "epoch": 5.90161706753635, + "grad_norm": 0.15184621512889862, + "learning_rate": 2.6260361462155186e-05, + "loss": 4.0522, + "step": 86860 + }, + { + "epoch": 5.901956787607012, + "grad_norm": 0.15918275713920593, + "learning_rate": 2.6256114961271918e-05, + "loss": 3.7559, + "step": 86865 + }, + { + "epoch": 5.902296507677674, + "grad_norm": 0.22497855126857758, + "learning_rate": 2.625186846038864e-05, + "loss": 3.8729, + "step": 86870 + }, + { + "epoch": 5.902636227748335, + "grad_norm": 0.4487234950065613, + "learning_rate": 2.624762195950537e-05, + "loss": 3.8135, + "step": 86875 + }, + { + "epoch": 5.902975947818997, + "grad_norm": 0.15800198912620544, + "learning_rate": 2.6243375458622098e-05, + "loss": 3.8699, + "step": 86880 + }, + { + "epoch": 5.903315667889659, + "grad_norm": 0.18704868853092194, + "learning_rate": 2.6239128957738823e-05, + "loss": 3.857, + "step": 86885 + }, + { + "epoch": 5.90365538796032, + "grad_norm": 0.1713220477104187, + "learning_rate": 2.6234882456855554e-05, + "loss": 3.9874, + "step": 86890 + }, + { + "epoch": 5.9039951080309825, + "grad_norm": 0.15653640031814575, + "learning_rate": 2.6230635955972282e-05, + "loss": 3.8658, + "step": 86895 + }, + { + "epoch": 5.904334828101645, + "grad_norm": 0.23844507336616516, + "learning_rate": 2.6226389455089007e-05, + "loss": 3.8249, + "step": 86900 + }, + { + "epoch": 5.904674548172306, + "grad_norm": 0.1605812907218933, + "learning_rate": 2.6222142954205735e-05, + "loss": 3.7877, + "step": 86905 + }, + { + "epoch": 5.905014268242968, + "grad_norm": 0.16053177416324615, + "learning_rate": 2.6217896453322466e-05, + "loss": 3.6607, + "step": 86910 + }, + { + "epoch": 5.90535398831363, + "grad_norm": 1.1742806434631348, + "learning_rate": 2.621364995243919e-05, + "loss": 3.7379, + "step": 86915 + }, + { + "epoch": 5.905693708384291, + "grad_norm": 0.2446203976869583, + "learning_rate": 2.620940345155592e-05, + "loss": 3.7518, + "step": 86920 + }, + { + "epoch": 5.906033428454953, + "grad_norm": 0.371307909488678, + "learning_rate": 2.620515695067265e-05, + "loss": 3.886, + "step": 86925 + }, + { + "epoch": 5.906373148525615, + "grad_norm": 0.22556458413600922, + "learning_rate": 2.6200910449789375e-05, + "loss": 3.7577, + "step": 86930 + }, + { + "epoch": 5.906712868596276, + "grad_norm": 0.16150718927383423, + "learning_rate": 2.6196663948906103e-05, + "loss": 3.4791, + "step": 86935 + }, + { + "epoch": 5.9070525886669385, + "grad_norm": 0.1804715394973755, + "learning_rate": 2.619241744802283e-05, + "loss": 3.7026, + "step": 86940 + }, + { + "epoch": 5.907392308737601, + "grad_norm": 0.16493795812129974, + "learning_rate": 2.6188170947139555e-05, + "loss": 3.7284, + "step": 86945 + }, + { + "epoch": 5.907732028808262, + "grad_norm": 0.7045275568962097, + "learning_rate": 2.6183924446256287e-05, + "loss": 4.0015, + "step": 86950 + }, + { + "epoch": 5.908071748878924, + "grad_norm": 0.151090607047081, + "learning_rate": 2.6179677945373015e-05, + "loss": 3.8039, + "step": 86955 + }, + { + "epoch": 5.908411468949586, + "grad_norm": 0.199724018573761, + "learning_rate": 2.617543144448974e-05, + "loss": 3.8494, + "step": 86960 + }, + { + "epoch": 5.908751189020247, + "grad_norm": 0.1710241287946701, + "learning_rate": 2.617118494360647e-05, + "loss": 3.9811, + "step": 86965 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 0.2119830846786499, + "learning_rate": 2.61669384427232e-05, + "loss": 3.7901, + "step": 86970 + }, + { + "epoch": 5.909430629161571, + "grad_norm": 0.15752369165420532, + "learning_rate": 2.6162691941839924e-05, + "loss": 4.1905, + "step": 86975 + }, + { + "epoch": 5.909770349232232, + "grad_norm": 0.1556861847639084, + "learning_rate": 2.615844544095665e-05, + "loss": 3.8003, + "step": 86980 + }, + { + "epoch": 5.9101100693028945, + "grad_norm": 0.16448156535625458, + "learning_rate": 2.6154198940073383e-05, + "loss": 3.6925, + "step": 86985 + }, + { + "epoch": 5.910449789373557, + "grad_norm": 0.16137494146823883, + "learning_rate": 2.6149952439190108e-05, + "loss": 3.8636, + "step": 86990 + }, + { + "epoch": 5.910789509444218, + "grad_norm": 0.14145848155021667, + "learning_rate": 2.6145705938306836e-05, + "loss": 3.8867, + "step": 86995 + }, + { + "epoch": 5.91112922951488, + "grad_norm": 0.18158754706382751, + "learning_rate": 2.6141459437423567e-05, + "loss": 3.9428, + "step": 87000 + }, + { + "epoch": 5.911468949585542, + "grad_norm": 0.18415029346942902, + "learning_rate": 2.6137212936540288e-05, + "loss": 3.9019, + "step": 87005 + }, + { + "epoch": 5.911808669656203, + "grad_norm": 1.3746601343154907, + "learning_rate": 2.613296643565702e-05, + "loss": 3.5001, + "step": 87010 + }, + { + "epoch": 5.912148389726865, + "grad_norm": 0.1807202398777008, + "learning_rate": 2.6128719934773748e-05, + "loss": 3.7324, + "step": 87015 + }, + { + "epoch": 5.912488109797527, + "grad_norm": 0.1697731763124466, + "learning_rate": 2.6124473433890472e-05, + "loss": 3.7545, + "step": 87020 + }, + { + "epoch": 5.9128278298681884, + "grad_norm": 0.263271301984787, + "learning_rate": 2.6120226933007204e-05, + "loss": 4.016, + "step": 87025 + }, + { + "epoch": 5.9131675499388505, + "grad_norm": 0.17300978302955627, + "learning_rate": 2.611598043212393e-05, + "loss": 4.1361, + "step": 87030 + }, + { + "epoch": 5.913507270009513, + "grad_norm": 0.4372500777244568, + "learning_rate": 2.6111733931240663e-05, + "loss": 3.9765, + "step": 87035 + }, + { + "epoch": 5.913846990080174, + "grad_norm": 0.1729850023984909, + "learning_rate": 2.6107487430357384e-05, + "loss": 3.915, + "step": 87040 + }, + { + "epoch": 5.914186710150836, + "grad_norm": 0.15116126835346222, + "learning_rate": 2.6103240929474116e-05, + "loss": 3.6453, + "step": 87045 + }, + { + "epoch": 5.914526430221498, + "grad_norm": 0.18720878660678864, + "learning_rate": 2.6098994428590844e-05, + "loss": 3.8556, + "step": 87050 + }, + { + "epoch": 5.914866150292159, + "grad_norm": 0.2684553563594818, + "learning_rate": 2.6094747927707568e-05, + "loss": 3.7453, + "step": 87055 + }, + { + "epoch": 5.915205870362821, + "grad_norm": 0.2110915631055832, + "learning_rate": 2.60905014268243e-05, + "loss": 3.6388, + "step": 87060 + }, + { + "epoch": 5.915545590433483, + "grad_norm": 0.17684879899024963, + "learning_rate": 2.6086254925941028e-05, + "loss": 3.8376, + "step": 87065 + }, + { + "epoch": 5.9158853105041445, + "grad_norm": 0.18826881051063538, + "learning_rate": 2.6082008425057752e-05, + "loss": 3.7191, + "step": 87070 + }, + { + "epoch": 5.9162250305748065, + "grad_norm": 0.14945833384990692, + "learning_rate": 2.607776192417448e-05, + "loss": 3.8831, + "step": 87075 + }, + { + "epoch": 5.916564750645469, + "grad_norm": 0.16991694271564484, + "learning_rate": 2.607351542329121e-05, + "loss": 4.0428, + "step": 87080 + }, + { + "epoch": 5.91690447071613, + "grad_norm": 0.1898922324180603, + "learning_rate": 2.6069268922407936e-05, + "loss": 3.7158, + "step": 87085 + }, + { + "epoch": 5.917244190786792, + "grad_norm": 0.14403964579105377, + "learning_rate": 2.6065022421524664e-05, + "loss": 4.0349, + "step": 87090 + }, + { + "epoch": 5.917583910857454, + "grad_norm": 0.1968710571527481, + "learning_rate": 2.6060775920641396e-05, + "loss": 3.6675, + "step": 87095 + }, + { + "epoch": 5.917923630928115, + "grad_norm": 0.17394380271434784, + "learning_rate": 2.605652941975812e-05, + "loss": 3.8134, + "step": 87100 + }, + { + "epoch": 5.918263350998777, + "grad_norm": 0.18292567133903503, + "learning_rate": 2.605228291887485e-05, + "loss": 4.0204, + "step": 87105 + }, + { + "epoch": 5.918603071069439, + "grad_norm": 0.19365081191062927, + "learning_rate": 2.604803641799158e-05, + "loss": 3.8197, + "step": 87110 + }, + { + "epoch": 5.9189427911401005, + "grad_norm": 0.15811066329479218, + "learning_rate": 2.60437899171083e-05, + "loss": 3.9665, + "step": 87115 + }, + { + "epoch": 5.9192825112107625, + "grad_norm": 0.14747053384780884, + "learning_rate": 2.6039543416225032e-05, + "loss": 3.6443, + "step": 87120 + }, + { + "epoch": 5.919622231281424, + "grad_norm": 0.1506301760673523, + "learning_rate": 2.603529691534176e-05, + "loss": 3.8648, + "step": 87125 + }, + { + "epoch": 5.919961951352086, + "grad_norm": 0.16446493566036224, + "learning_rate": 2.6031050414458485e-05, + "loss": 3.5598, + "step": 87130 + }, + { + "epoch": 5.920301671422748, + "grad_norm": 0.1849096566438675, + "learning_rate": 2.6026803913575216e-05, + "loss": 3.8918, + "step": 87135 + }, + { + "epoch": 5.920641391493409, + "grad_norm": 0.35056272149086, + "learning_rate": 2.6022557412691944e-05, + "loss": 3.8659, + "step": 87140 + }, + { + "epoch": 5.920981111564071, + "grad_norm": 0.21119652688503265, + "learning_rate": 2.601831091180867e-05, + "loss": 4.0136, + "step": 87145 + }, + { + "epoch": 5.921320831634733, + "grad_norm": 0.16310729086399078, + "learning_rate": 2.6014064410925397e-05, + "loss": 3.8612, + "step": 87150 + }, + { + "epoch": 5.921660551705394, + "grad_norm": 0.18789516389369965, + "learning_rate": 2.600981791004213e-05, + "loss": 3.8399, + "step": 87155 + }, + { + "epoch": 5.9220002717760565, + "grad_norm": 0.15192370116710663, + "learning_rate": 2.6005571409158853e-05, + "loss": 3.7563, + "step": 87160 + }, + { + "epoch": 5.9223399918467186, + "grad_norm": 0.16144251823425293, + "learning_rate": 2.600132490827558e-05, + "loss": 4.2069, + "step": 87165 + }, + { + "epoch": 5.92267971191738, + "grad_norm": 0.20076251029968262, + "learning_rate": 2.5997078407392312e-05, + "loss": 3.8563, + "step": 87170 + }, + { + "epoch": 5.923019431988042, + "grad_norm": 0.15641935169696808, + "learning_rate": 2.5992831906509034e-05, + "loss": 3.847, + "step": 87175 + }, + { + "epoch": 5.923359152058704, + "grad_norm": 0.23714417219161987, + "learning_rate": 2.5988585405625765e-05, + "loss": 3.8798, + "step": 87180 + }, + { + "epoch": 5.923698872129365, + "grad_norm": 0.5878047347068787, + "learning_rate": 2.5984338904742493e-05, + "loss": 3.7715, + "step": 87185 + }, + { + "epoch": 5.924038592200027, + "grad_norm": 0.18179471790790558, + "learning_rate": 2.5980092403859218e-05, + "loss": 3.7847, + "step": 87190 + }, + { + "epoch": 5.924378312270689, + "grad_norm": 0.17172928154468536, + "learning_rate": 2.597584590297595e-05, + "loss": 4.143, + "step": 87195 + }, + { + "epoch": 5.92471803234135, + "grad_norm": 0.8586941361427307, + "learning_rate": 2.5971599402092677e-05, + "loss": 4.0854, + "step": 87200 + }, + { + "epoch": 5.9250577524120125, + "grad_norm": 0.16550570726394653, + "learning_rate": 2.596735290120941e-05, + "loss": 3.7376, + "step": 87205 + }, + { + "epoch": 5.925397472482675, + "grad_norm": 0.18934908509254456, + "learning_rate": 2.5963106400326133e-05, + "loss": 3.7317, + "step": 87210 + }, + { + "epoch": 5.925737192553336, + "grad_norm": 0.18892066180706024, + "learning_rate": 2.595885989944286e-05, + "loss": 3.8884, + "step": 87215 + }, + { + "epoch": 5.926076912623998, + "grad_norm": 0.1681206077337265, + "learning_rate": 2.595461339855959e-05, + "loss": 3.8644, + "step": 87220 + }, + { + "epoch": 5.92641663269466, + "grad_norm": 0.1786937564611435, + "learning_rate": 2.5950366897676314e-05, + "loss": 3.8398, + "step": 87225 + }, + { + "epoch": 5.926756352765321, + "grad_norm": 0.19082345068454742, + "learning_rate": 2.5946120396793045e-05, + "loss": 3.7511, + "step": 87230 + }, + { + "epoch": 5.927096072835983, + "grad_norm": 0.1404760181903839, + "learning_rate": 2.5941873895909773e-05, + "loss": 3.9937, + "step": 87235 + }, + { + "epoch": 5.927435792906645, + "grad_norm": 0.1652848869562149, + "learning_rate": 2.5937627395026498e-05, + "loss": 3.7434, + "step": 87240 + }, + { + "epoch": 5.927775512977306, + "grad_norm": 0.7942866086959839, + "learning_rate": 2.593338089414323e-05, + "loss": 3.7542, + "step": 87245 + }, + { + "epoch": 5.9281152330479685, + "grad_norm": 0.16530370712280273, + "learning_rate": 2.5929134393259957e-05, + "loss": 3.9818, + "step": 87250 + }, + { + "epoch": 5.928454953118631, + "grad_norm": 0.23240630328655243, + "learning_rate": 2.5924887892376682e-05, + "loss": 3.6598, + "step": 87255 + }, + { + "epoch": 5.928794673189292, + "grad_norm": 0.3544514775276184, + "learning_rate": 2.592064139149341e-05, + "loss": 3.8614, + "step": 87260 + }, + { + "epoch": 5.929134393259954, + "grad_norm": 0.20342355966567993, + "learning_rate": 2.591639489061014e-05, + "loss": 3.6288, + "step": 87265 + }, + { + "epoch": 5.929474113330616, + "grad_norm": 0.20095524191856384, + "learning_rate": 2.5912148389726866e-05, + "loss": 4.041, + "step": 87270 + }, + { + "epoch": 5.929813833401277, + "grad_norm": 0.31775811314582825, + "learning_rate": 2.5907901888843594e-05, + "loss": 3.7312, + "step": 87275 + }, + { + "epoch": 5.930153553471939, + "grad_norm": 0.15352202951908112, + "learning_rate": 2.5903655387960325e-05, + "loss": 3.6484, + "step": 87280 + }, + { + "epoch": 5.930493273542601, + "grad_norm": 0.17390449345111847, + "learning_rate": 2.5899408887077046e-05, + "loss": 3.7002, + "step": 87285 + }, + { + "epoch": 5.930832993613262, + "grad_norm": 0.9997602105140686, + "learning_rate": 2.5895162386193778e-05, + "loss": 3.8455, + "step": 87290 + }, + { + "epoch": 5.9311727136839245, + "grad_norm": 0.33538755774497986, + "learning_rate": 2.5890915885310506e-05, + "loss": 3.8303, + "step": 87295 + }, + { + "epoch": 5.931512433754587, + "grad_norm": 0.20382006466388702, + "learning_rate": 2.588666938442723e-05, + "loss": 3.8641, + "step": 87300 + }, + { + "epoch": 5.931852153825248, + "grad_norm": 0.21510623395442963, + "learning_rate": 2.5882422883543962e-05, + "loss": 3.8028, + "step": 87305 + }, + { + "epoch": 5.93219187389591, + "grad_norm": 0.15119364857673645, + "learning_rate": 2.587817638266069e-05, + "loss": 3.7916, + "step": 87310 + }, + { + "epoch": 5.932531593966572, + "grad_norm": 0.1584136188030243, + "learning_rate": 2.5873929881777414e-05, + "loss": 3.9279, + "step": 87315 + }, + { + "epoch": 5.932871314037233, + "grad_norm": 0.13243713974952698, + "learning_rate": 2.5869683380894142e-05, + "loss": 3.7105, + "step": 87320 + }, + { + "epoch": 5.933211034107895, + "grad_norm": 0.1352556347846985, + "learning_rate": 2.5865436880010874e-05, + "loss": 3.8971, + "step": 87325 + }, + { + "epoch": 5.933550754178556, + "grad_norm": 0.18496033549308777, + "learning_rate": 2.58611903791276e-05, + "loss": 3.791, + "step": 87330 + }, + { + "epoch": 5.9338904742492184, + "grad_norm": 0.46022358536720276, + "learning_rate": 2.5856943878244327e-05, + "loss": 3.8731, + "step": 87335 + }, + { + "epoch": 5.9342301943198805, + "grad_norm": 0.22510740160942078, + "learning_rate": 2.5852697377361058e-05, + "loss": 4.1864, + "step": 87340 + }, + { + "epoch": 5.934569914390542, + "grad_norm": 0.15688778460025787, + "learning_rate": 2.5848450876477783e-05, + "loss": 3.7116, + "step": 87345 + }, + { + "epoch": 5.934909634461204, + "grad_norm": 0.15472480654716492, + "learning_rate": 2.584420437559451e-05, + "loss": 3.8027, + "step": 87350 + }, + { + "epoch": 5.935249354531866, + "grad_norm": 0.1818024069070816, + "learning_rate": 2.5839957874711242e-05, + "loss": 3.7786, + "step": 87355 + }, + { + "epoch": 5.935589074602527, + "grad_norm": 0.1832912117242813, + "learning_rate": 2.5835711373827963e-05, + "loss": 3.8286, + "step": 87360 + }, + { + "epoch": 5.935928794673189, + "grad_norm": 0.1584431529045105, + "learning_rate": 2.5831464872944695e-05, + "loss": 3.7233, + "step": 87365 + }, + { + "epoch": 5.936268514743851, + "grad_norm": 0.15959309041500092, + "learning_rate": 2.5827218372061423e-05, + "loss": 3.7759, + "step": 87370 + }, + { + "epoch": 5.936608234814512, + "grad_norm": 0.18132786452770233, + "learning_rate": 2.5822971871178154e-05, + "loss": 3.6572, + "step": 87375 + }, + { + "epoch": 5.9369479548851745, + "grad_norm": 0.15917985141277313, + "learning_rate": 2.581872537029488e-05, + "loss": 3.7692, + "step": 87380 + }, + { + "epoch": 5.9372876749558365, + "grad_norm": 0.1860395073890686, + "learning_rate": 2.5814478869411607e-05, + "loss": 3.8296, + "step": 87385 + }, + { + "epoch": 5.937627395026498, + "grad_norm": 0.17592692375183105, + "learning_rate": 2.5810232368528338e-05, + "loss": 3.6645, + "step": 87390 + }, + { + "epoch": 5.93796711509716, + "grad_norm": 0.17467673122882843, + "learning_rate": 2.580598586764506e-05, + "loss": 3.7113, + "step": 87395 + }, + { + "epoch": 5.938306835167822, + "grad_norm": 0.16891255974769592, + "learning_rate": 2.580173936676179e-05, + "loss": 3.5132, + "step": 87400 + }, + { + "epoch": 5.938646555238483, + "grad_norm": 0.2548465430736542, + "learning_rate": 2.579749286587852e-05, + "loss": 3.7227, + "step": 87405 + }, + { + "epoch": 5.938986275309145, + "grad_norm": 0.17914913594722748, + "learning_rate": 2.5793246364995243e-05, + "loss": 3.9714, + "step": 87410 + }, + { + "epoch": 5.939325995379807, + "grad_norm": 0.16329318284988403, + "learning_rate": 2.5788999864111975e-05, + "loss": 3.6432, + "step": 87415 + }, + { + "epoch": 5.939665715450468, + "grad_norm": 0.15169881284236908, + "learning_rate": 2.5784753363228703e-05, + "loss": 3.8021, + "step": 87420 + }, + { + "epoch": 5.9400054355211305, + "grad_norm": 0.14052148163318634, + "learning_rate": 2.5780506862345427e-05, + "loss": 3.5871, + "step": 87425 + }, + { + "epoch": 5.9403451555917925, + "grad_norm": 0.12996700406074524, + "learning_rate": 2.5776260361462155e-05, + "loss": 3.527, + "step": 87430 + }, + { + "epoch": 5.940684875662454, + "grad_norm": 0.18696683645248413, + "learning_rate": 2.5772013860578887e-05, + "loss": 3.9565, + "step": 87435 + }, + { + "epoch": 5.941024595733116, + "grad_norm": 0.19638891518115997, + "learning_rate": 2.576776735969561e-05, + "loss": 3.7088, + "step": 87440 + }, + { + "epoch": 5.941364315803778, + "grad_norm": 0.1575406938791275, + "learning_rate": 2.576352085881234e-05, + "loss": 3.664, + "step": 87445 + }, + { + "epoch": 5.941704035874439, + "grad_norm": 0.20760484039783478, + "learning_rate": 2.575927435792907e-05, + "loss": 3.8597, + "step": 87450 + }, + { + "epoch": 5.942043755945101, + "grad_norm": 0.1762494593858719, + "learning_rate": 2.5755027857045795e-05, + "loss": 3.9087, + "step": 87455 + }, + { + "epoch": 5.942383476015763, + "grad_norm": 0.15965071320533752, + "learning_rate": 2.5750781356162523e-05, + "loss": 3.8472, + "step": 87460 + }, + { + "epoch": 5.942723196086424, + "grad_norm": 0.21379190683364868, + "learning_rate": 2.574653485527925e-05, + "loss": 3.7468, + "step": 87465 + }, + { + "epoch": 5.9430629161570865, + "grad_norm": 0.13872337341308594, + "learning_rate": 2.5742288354395976e-05, + "loss": 3.7808, + "step": 87470 + }, + { + "epoch": 5.943402636227749, + "grad_norm": 0.20250597596168518, + "learning_rate": 2.5738041853512707e-05, + "loss": 3.9017, + "step": 87475 + }, + { + "epoch": 5.94374235629841, + "grad_norm": 0.16309544444084167, + "learning_rate": 2.5733795352629435e-05, + "loss": 3.898, + "step": 87480 + }, + { + "epoch": 5.944082076369072, + "grad_norm": 0.4247414767742157, + "learning_rate": 2.572954885174616e-05, + "loss": 3.8735, + "step": 87485 + }, + { + "epoch": 5.944421796439734, + "grad_norm": 0.15481147170066833, + "learning_rate": 2.572530235086289e-05, + "loss": 3.665, + "step": 87490 + }, + { + "epoch": 5.944761516510395, + "grad_norm": 0.17599348723888397, + "learning_rate": 2.572105584997962e-05, + "loss": 3.9047, + "step": 87495 + }, + { + "epoch": 5.945101236581057, + "grad_norm": 0.2939627170562744, + "learning_rate": 2.5716809349096344e-05, + "loss": 4.0309, + "step": 87500 + }, + { + "epoch": 5.945440956651719, + "grad_norm": 0.6440281867980957, + "learning_rate": 2.5712562848213072e-05, + "loss": 3.834, + "step": 87505 + }, + { + "epoch": 5.94578067672238, + "grad_norm": 0.20011460781097412, + "learning_rate": 2.5708316347329803e-05, + "loss": 3.6833, + "step": 87510 + }, + { + "epoch": 5.9461203967930425, + "grad_norm": 0.1646520048379898, + "learning_rate": 2.5704069846446528e-05, + "loss": 3.8968, + "step": 87515 + }, + { + "epoch": 5.946460116863705, + "grad_norm": 0.20819607377052307, + "learning_rate": 2.5699823345563256e-05, + "loss": 4.0411, + "step": 87520 + }, + { + "epoch": 5.946799836934366, + "grad_norm": 0.18085899949073792, + "learning_rate": 2.5695576844679987e-05, + "loss": 4.001, + "step": 87525 + }, + { + "epoch": 5.947139557005028, + "grad_norm": 0.17523178458213806, + "learning_rate": 2.569133034379671e-05, + "loss": 3.6105, + "step": 87530 + }, + { + "epoch": 5.94747927707569, + "grad_norm": 0.17491155862808228, + "learning_rate": 2.568708384291344e-05, + "loss": 3.9702, + "step": 87535 + }, + { + "epoch": 5.947818997146351, + "grad_norm": 0.28524070978164673, + "learning_rate": 2.5682837342030168e-05, + "loss": 3.967, + "step": 87540 + }, + { + "epoch": 5.948158717217013, + "grad_norm": 0.202602356672287, + "learning_rate": 2.56785908411469e-05, + "loss": 3.8102, + "step": 87545 + }, + { + "epoch": 5.948498437287675, + "grad_norm": 0.1777697056531906, + "learning_rate": 2.5674344340263624e-05, + "loss": 3.6557, + "step": 87550 + }, + { + "epoch": 5.948838157358336, + "grad_norm": 0.20807908475399017, + "learning_rate": 2.5670097839380352e-05, + "loss": 4.1368, + "step": 87555 + }, + { + "epoch": 5.9491778774289985, + "grad_norm": 0.15789148211479187, + "learning_rate": 2.5665851338497083e-05, + "loss": 3.5301, + "step": 87560 + }, + { + "epoch": 5.949517597499661, + "grad_norm": 0.18020161986351013, + "learning_rate": 2.5661604837613805e-05, + "loss": 3.6583, + "step": 87565 + }, + { + "epoch": 5.949857317570322, + "grad_norm": 0.2549792230129242, + "learning_rate": 2.5657358336730536e-05, + "loss": 3.9069, + "step": 87570 + }, + { + "epoch": 5.950197037640984, + "grad_norm": 0.16306766867637634, + "learning_rate": 2.5653111835847264e-05, + "loss": 3.7369, + "step": 87575 + }, + { + "epoch": 5.950536757711646, + "grad_norm": 0.18196450173854828, + "learning_rate": 2.564886533496399e-05, + "loss": 3.9156, + "step": 87580 + }, + { + "epoch": 5.950876477782307, + "grad_norm": 0.6512978076934814, + "learning_rate": 2.564461883408072e-05, + "loss": 3.9173, + "step": 87585 + }, + { + "epoch": 5.951216197852969, + "grad_norm": 0.1937416046857834, + "learning_rate": 2.5640372333197448e-05, + "loss": 3.6507, + "step": 87590 + }, + { + "epoch": 5.951555917923631, + "grad_norm": 0.23907408118247986, + "learning_rate": 2.5636125832314173e-05, + "loss": 3.8429, + "step": 87595 + }, + { + "epoch": 5.951895637994292, + "grad_norm": 0.1608644723892212, + "learning_rate": 2.5631879331430904e-05, + "loss": 3.7335, + "step": 87600 + }, + { + "epoch": 5.9522353580649545, + "grad_norm": 0.18861982226371765, + "learning_rate": 2.5627632830547632e-05, + "loss": 3.9248, + "step": 87605 + }, + { + "epoch": 5.952575078135617, + "grad_norm": 0.31147390604019165, + "learning_rate": 2.5623386329664357e-05, + "loss": 3.9585, + "step": 87610 + }, + { + "epoch": 5.952914798206278, + "grad_norm": 0.48977431654930115, + "learning_rate": 2.5619139828781085e-05, + "loss": 3.9886, + "step": 87615 + }, + { + "epoch": 5.95325451827694, + "grad_norm": 0.17341752350330353, + "learning_rate": 2.5614893327897816e-05, + "loss": 3.6684, + "step": 87620 + }, + { + "epoch": 5.953594238347602, + "grad_norm": 0.21212458610534668, + "learning_rate": 2.561064682701454e-05, + "loss": 3.6399, + "step": 87625 + }, + { + "epoch": 5.953933958418263, + "grad_norm": 0.17468979954719543, + "learning_rate": 2.560640032613127e-05, + "loss": 3.9729, + "step": 87630 + }, + { + "epoch": 5.954273678488925, + "grad_norm": 0.18068231642246246, + "learning_rate": 2.5602153825248e-05, + "loss": 3.8276, + "step": 87635 + }, + { + "epoch": 5.954613398559587, + "grad_norm": 0.16172155737876892, + "learning_rate": 2.559790732436472e-05, + "loss": 3.64, + "step": 87640 + }, + { + "epoch": 5.9549531186302485, + "grad_norm": 1.682381510734558, + "learning_rate": 2.5593660823481453e-05, + "loss": 3.8264, + "step": 87645 + }, + { + "epoch": 5.9552928387009105, + "grad_norm": 0.18775391578674316, + "learning_rate": 2.558941432259818e-05, + "loss": 3.8259, + "step": 87650 + }, + { + "epoch": 5.955632558771573, + "grad_norm": 1.9867573976516724, + "learning_rate": 2.5585167821714905e-05, + "loss": 3.8044, + "step": 87655 + }, + { + "epoch": 5.955972278842234, + "grad_norm": 3.8646140098571777, + "learning_rate": 2.5580921320831637e-05, + "loss": 3.8202, + "step": 87660 + }, + { + "epoch": 5.956311998912896, + "grad_norm": 0.25662678480148315, + "learning_rate": 2.5576674819948365e-05, + "loss": 3.9775, + "step": 87665 + }, + { + "epoch": 5.956651718983558, + "grad_norm": 0.16967715322971344, + "learning_rate": 2.557242831906509e-05, + "loss": 3.7243, + "step": 87670 + }, + { + "epoch": 5.956991439054219, + "grad_norm": 0.18503783643245697, + "learning_rate": 2.5568181818181817e-05, + "loss": 3.9149, + "step": 87675 + }, + { + "epoch": 5.957331159124881, + "grad_norm": 0.18904000520706177, + "learning_rate": 2.556393531729855e-05, + "loss": 4.0093, + "step": 87680 + }, + { + "epoch": 5.957670879195543, + "grad_norm": 1.8983327150344849, + "learning_rate": 2.5559688816415273e-05, + "loss": 4.0102, + "step": 87685 + }, + { + "epoch": 5.9580105992662045, + "grad_norm": 0.14067243039608002, + "learning_rate": 2.5555442315532e-05, + "loss": 3.7492, + "step": 87690 + }, + { + "epoch": 5.9583503193368665, + "grad_norm": 0.15186284482479095, + "learning_rate": 2.5551195814648733e-05, + "loss": 3.7567, + "step": 87695 + }, + { + "epoch": 5.958690039407529, + "grad_norm": 0.16683097183704376, + "learning_rate": 2.5546949313765458e-05, + "loss": 4.0941, + "step": 87700 + }, + { + "epoch": 5.95902975947819, + "grad_norm": 1.7078295946121216, + "learning_rate": 2.5542702812882186e-05, + "loss": 4.0207, + "step": 87705 + }, + { + "epoch": 5.959369479548852, + "grad_norm": 0.23955190181732178, + "learning_rate": 2.5538456311998914e-05, + "loss": 4.1246, + "step": 87710 + }, + { + "epoch": 5.959709199619514, + "grad_norm": 0.1384066790342331, + "learning_rate": 2.5534209811115645e-05, + "loss": 3.6775, + "step": 87715 + }, + { + "epoch": 5.960048919690175, + "grad_norm": 0.2559462785720825, + "learning_rate": 2.552996331023237e-05, + "loss": 4.025, + "step": 87720 + }, + { + "epoch": 5.960388639760837, + "grad_norm": 0.1621944159269333, + "learning_rate": 2.5525716809349098e-05, + "loss": 3.865, + "step": 87725 + }, + { + "epoch": 5.960728359831499, + "grad_norm": 0.21233732998371124, + "learning_rate": 2.552147030846583e-05, + "loss": 4.1303, + "step": 87730 + }, + { + "epoch": 5.9610680799021605, + "grad_norm": 0.16882926225662231, + "learning_rate": 2.5517223807582554e-05, + "loss": 3.8389, + "step": 87735 + }, + { + "epoch": 5.9614077999728226, + "grad_norm": 0.27132198214530945, + "learning_rate": 2.551297730669928e-05, + "loss": 3.8895, + "step": 87740 + }, + { + "epoch": 5.961747520043485, + "grad_norm": 0.3637780249118805, + "learning_rate": 2.550873080581601e-05, + "loss": 3.6015, + "step": 87745 + }, + { + "epoch": 5.962087240114146, + "grad_norm": 0.26511719822883606, + "learning_rate": 2.5504484304932734e-05, + "loss": 4.1324, + "step": 87750 + }, + { + "epoch": 5.962426960184808, + "grad_norm": 0.16835150122642517, + "learning_rate": 2.5500237804049466e-05, + "loss": 3.8598, + "step": 87755 + }, + { + "epoch": 5.96276668025547, + "grad_norm": 0.20800817012786865, + "learning_rate": 2.5495991303166194e-05, + "loss": 4.0037, + "step": 87760 + }, + { + "epoch": 5.963106400326131, + "grad_norm": 0.13813483715057373, + "learning_rate": 2.5491744802282918e-05, + "loss": 4.105, + "step": 87765 + }, + { + "epoch": 5.963446120396793, + "grad_norm": 0.395033597946167, + "learning_rate": 2.548749830139965e-05, + "loss": 3.9456, + "step": 87770 + }, + { + "epoch": 5.963785840467455, + "grad_norm": 0.17108134925365448, + "learning_rate": 2.5483251800516378e-05, + "loss": 3.7833, + "step": 87775 + }, + { + "epoch": 5.9641255605381165, + "grad_norm": 0.24395863711833954, + "learning_rate": 2.5479005299633102e-05, + "loss": 3.8445, + "step": 87780 + }, + { + "epoch": 5.964465280608779, + "grad_norm": 0.18847553431987762, + "learning_rate": 2.547475879874983e-05, + "loss": 4.0838, + "step": 87785 + }, + { + "epoch": 5.964805000679441, + "grad_norm": 0.17531229555606842, + "learning_rate": 2.547051229786656e-05, + "loss": 3.7764, + "step": 87790 + }, + { + "epoch": 5.965144720750102, + "grad_norm": 0.1584831029176712, + "learning_rate": 2.5466265796983286e-05, + "loss": 3.7039, + "step": 87795 + }, + { + "epoch": 5.965484440820764, + "grad_norm": 0.17061544954776764, + "learning_rate": 2.5462019296100014e-05, + "loss": 3.8524, + "step": 87800 + }, + { + "epoch": 5.965824160891425, + "grad_norm": 0.2096887230873108, + "learning_rate": 2.5457772795216746e-05, + "loss": 3.7569, + "step": 87805 + }, + { + "epoch": 5.966163880962087, + "grad_norm": 0.47146177291870117, + "learning_rate": 2.5453526294333467e-05, + "loss": 3.9796, + "step": 87810 + }, + { + "epoch": 5.966503601032749, + "grad_norm": 0.1943608671426773, + "learning_rate": 2.54492797934502e-05, + "loss": 4.1227, + "step": 87815 + }, + { + "epoch": 5.96684332110341, + "grad_norm": 0.17657628655433655, + "learning_rate": 2.5445033292566926e-05, + "loss": 3.7188, + "step": 87820 + }, + { + "epoch": 5.9671830411740725, + "grad_norm": 0.14725595712661743, + "learning_rate": 2.544078679168365e-05, + "loss": 3.8013, + "step": 87825 + }, + { + "epoch": 5.967522761244735, + "grad_norm": 0.17469839751720428, + "learning_rate": 2.5436540290800382e-05, + "loss": 4.0015, + "step": 87830 + }, + { + "epoch": 5.967862481315396, + "grad_norm": 0.1716184765100479, + "learning_rate": 2.543229378991711e-05, + "loss": 3.8167, + "step": 87835 + }, + { + "epoch": 5.968202201386058, + "grad_norm": 0.20152880251407623, + "learning_rate": 2.5428047289033835e-05, + "loss": 3.8816, + "step": 87840 + }, + { + "epoch": 5.96854192145672, + "grad_norm": 0.14258965849876404, + "learning_rate": 2.5423800788150563e-05, + "loss": 4.0521, + "step": 87845 + }, + { + "epoch": 5.968881641527381, + "grad_norm": 0.4130111634731293, + "learning_rate": 2.5419554287267294e-05, + "loss": 3.8691, + "step": 87850 + }, + { + "epoch": 5.969221361598043, + "grad_norm": 0.19019849598407745, + "learning_rate": 2.541530778638402e-05, + "loss": 3.8505, + "step": 87855 + }, + { + "epoch": 5.969561081668705, + "grad_norm": 0.17057418823242188, + "learning_rate": 2.5411061285500747e-05, + "loss": 3.9228, + "step": 87860 + }, + { + "epoch": 5.969900801739366, + "grad_norm": 0.15414078533649445, + "learning_rate": 2.540681478461748e-05, + "loss": 4.0086, + "step": 87865 + }, + { + "epoch": 5.9702405218100285, + "grad_norm": 0.16627973318099976, + "learning_rate": 2.5402568283734203e-05, + "loss": 3.7861, + "step": 87870 + }, + { + "epoch": 5.970580241880691, + "grad_norm": 0.2122168242931366, + "learning_rate": 2.539832178285093e-05, + "loss": 3.8644, + "step": 87875 + }, + { + "epoch": 5.970919961951352, + "grad_norm": 0.21458381414413452, + "learning_rate": 2.5394075281967662e-05, + "loss": 3.7804, + "step": 87880 + }, + { + "epoch": 5.971259682022014, + "grad_norm": 0.13927409052848816, + "learning_rate": 2.538982878108439e-05, + "loss": 3.9889, + "step": 87885 + }, + { + "epoch": 5.971599402092676, + "grad_norm": 0.21342912316322327, + "learning_rate": 2.5385582280201115e-05, + "loss": 3.9371, + "step": 87890 + }, + { + "epoch": 5.971939122163337, + "grad_norm": 0.3389163017272949, + "learning_rate": 2.5381335779317843e-05, + "loss": 3.9358, + "step": 87895 + }, + { + "epoch": 5.972278842233999, + "grad_norm": 0.18179841339588165, + "learning_rate": 2.5377089278434574e-05, + "loss": 3.6052, + "step": 87900 + }, + { + "epoch": 5.972618562304661, + "grad_norm": 0.17171815037727356, + "learning_rate": 2.53728427775513e-05, + "loss": 3.7552, + "step": 87905 + }, + { + "epoch": 5.9729582823753224, + "grad_norm": 0.6072494387626648, + "learning_rate": 2.5368596276668027e-05, + "loss": 3.9947, + "step": 87910 + }, + { + "epoch": 5.9732980024459845, + "grad_norm": 0.14123603701591492, + "learning_rate": 2.536434977578476e-05, + "loss": 3.8589, + "step": 87915 + }, + { + "epoch": 5.973637722516647, + "grad_norm": 0.17933064699172974, + "learning_rate": 2.536010327490148e-05, + "loss": 3.8428, + "step": 87920 + }, + { + "epoch": 5.973977442587308, + "grad_norm": 0.15991370379924774, + "learning_rate": 2.535585677401821e-05, + "loss": 3.7981, + "step": 87925 + }, + { + "epoch": 5.97431716265797, + "grad_norm": 0.1464116871356964, + "learning_rate": 2.535161027313494e-05, + "loss": 3.9015, + "step": 87930 + }, + { + "epoch": 5.974656882728632, + "grad_norm": 0.16472263634204865, + "learning_rate": 2.5347363772251664e-05, + "loss": 3.9469, + "step": 87935 + }, + { + "epoch": 5.974996602799293, + "grad_norm": 0.14497308433055878, + "learning_rate": 2.5343117271368395e-05, + "loss": 3.5905, + "step": 87940 + }, + { + "epoch": 5.975336322869955, + "grad_norm": 0.15256886184215546, + "learning_rate": 2.5338870770485123e-05, + "loss": 3.9991, + "step": 87945 + }, + { + "epoch": 5.975676042940617, + "grad_norm": 0.5023109912872314, + "learning_rate": 2.5334624269601848e-05, + "loss": 3.8838, + "step": 87950 + }, + { + "epoch": 5.9760157630112785, + "grad_norm": 0.15378092229366302, + "learning_rate": 2.5330377768718576e-05, + "loss": 3.5732, + "step": 87955 + }, + { + "epoch": 5.9763554830819405, + "grad_norm": 0.26889488101005554, + "learning_rate": 2.5326131267835307e-05, + "loss": 3.9609, + "step": 87960 + }, + { + "epoch": 5.976695203152603, + "grad_norm": 0.18030884861946106, + "learning_rate": 2.5321884766952032e-05, + "loss": 3.8119, + "step": 87965 + }, + { + "epoch": 5.977034923223264, + "grad_norm": 0.15802635252475739, + "learning_rate": 2.531763826606876e-05, + "loss": 3.8978, + "step": 87970 + }, + { + "epoch": 5.977374643293926, + "grad_norm": 0.1373254358768463, + "learning_rate": 2.531339176518549e-05, + "loss": 4.0583, + "step": 87975 + }, + { + "epoch": 5.977714363364588, + "grad_norm": 0.16162167489528656, + "learning_rate": 2.5309145264302216e-05, + "loss": 3.8014, + "step": 87980 + }, + { + "epoch": 5.978054083435249, + "grad_norm": 0.1369083672761917, + "learning_rate": 2.5304898763418944e-05, + "loss": 4.0124, + "step": 87985 + }, + { + "epoch": 5.978393803505911, + "grad_norm": 0.1707378327846527, + "learning_rate": 2.5300652262535672e-05, + "loss": 3.5849, + "step": 87990 + }, + { + "epoch": 5.978733523576573, + "grad_norm": 0.18192249536514282, + "learning_rate": 2.5296405761652396e-05, + "loss": 3.9303, + "step": 87995 + }, + { + "epoch": 5.9790732436472345, + "grad_norm": 0.22192558646202087, + "learning_rate": 2.5292159260769128e-05, + "loss": 3.7972, + "step": 88000 + }, + { + "epoch": 5.9794129637178965, + "grad_norm": 0.20244620740413666, + "learning_rate": 2.5287912759885856e-05, + "loss": 3.7811, + "step": 88005 + }, + { + "epoch": 5.979752683788558, + "grad_norm": 0.15418276190757751, + "learning_rate": 2.528366625900258e-05, + "loss": 3.6797, + "step": 88010 + }, + { + "epoch": 5.98009240385922, + "grad_norm": 0.20800280570983887, + "learning_rate": 2.5279419758119312e-05, + "loss": 3.8567, + "step": 88015 + }, + { + "epoch": 5.980432123929882, + "grad_norm": 0.17682956159114838, + "learning_rate": 2.527517325723604e-05, + "loss": 3.8935, + "step": 88020 + }, + { + "epoch": 5.980771844000543, + "grad_norm": 0.19309362769126892, + "learning_rate": 2.5270926756352764e-05, + "loss": 3.6357, + "step": 88025 + }, + { + "epoch": 5.981111564071205, + "grad_norm": 0.15270277857780457, + "learning_rate": 2.5266680255469492e-05, + "loss": 3.8525, + "step": 88030 + }, + { + "epoch": 5.981451284141867, + "grad_norm": 0.2965698540210724, + "learning_rate": 2.5262433754586224e-05, + "loss": 3.9835, + "step": 88035 + }, + { + "epoch": 5.981791004212528, + "grad_norm": 0.62654709815979, + "learning_rate": 2.525818725370295e-05, + "loss": 3.9342, + "step": 88040 + }, + { + "epoch": 5.9821307242831905, + "grad_norm": 0.15270152688026428, + "learning_rate": 2.5253940752819677e-05, + "loss": 3.8479, + "step": 88045 + }, + { + "epoch": 5.982470444353853, + "grad_norm": 0.15525111556053162, + "learning_rate": 2.5249694251936408e-05, + "loss": 3.8192, + "step": 88050 + }, + { + "epoch": 5.982810164424514, + "grad_norm": 0.16966918110847473, + "learning_rate": 2.5245447751053136e-05, + "loss": 3.8133, + "step": 88055 + }, + { + "epoch": 5.983149884495176, + "grad_norm": 0.1558348387479782, + "learning_rate": 2.524120125016986e-05, + "loss": 3.8934, + "step": 88060 + }, + { + "epoch": 5.983489604565838, + "grad_norm": 0.1711210012435913, + "learning_rate": 2.523695474928659e-05, + "loss": 3.9042, + "step": 88065 + }, + { + "epoch": 5.983829324636499, + "grad_norm": 0.24424013495445251, + "learning_rate": 2.523270824840332e-05, + "loss": 4.1994, + "step": 88070 + }, + { + "epoch": 5.984169044707161, + "grad_norm": 0.1578197330236435, + "learning_rate": 2.5228461747520045e-05, + "loss": 3.6675, + "step": 88075 + }, + { + "epoch": 5.984508764777823, + "grad_norm": 0.22591985762119293, + "learning_rate": 2.5224215246636773e-05, + "loss": 3.9969, + "step": 88080 + }, + { + "epoch": 5.984848484848484, + "grad_norm": 0.2162635177373886, + "learning_rate": 2.5219968745753504e-05, + "loss": 3.7373, + "step": 88085 + }, + { + "epoch": 5.9851882049191465, + "grad_norm": 0.19775955379009247, + "learning_rate": 2.5215722244870225e-05, + "loss": 3.8293, + "step": 88090 + }, + { + "epoch": 5.985527924989809, + "grad_norm": 0.270691841840744, + "learning_rate": 2.5211475743986957e-05, + "loss": 3.5802, + "step": 88095 + }, + { + "epoch": 5.98586764506047, + "grad_norm": 0.1708485633134842, + "learning_rate": 2.5207229243103685e-05, + "loss": 3.9036, + "step": 88100 + }, + { + "epoch": 5.986207365131132, + "grad_norm": 0.1460791528224945, + "learning_rate": 2.520298274222041e-05, + "loss": 3.7057, + "step": 88105 + }, + { + "epoch": 5.986547085201794, + "grad_norm": 0.21730846166610718, + "learning_rate": 2.519873624133714e-05, + "loss": 3.9265, + "step": 88110 + }, + { + "epoch": 5.986886805272455, + "grad_norm": 0.15005196630954742, + "learning_rate": 2.519448974045387e-05, + "loss": 3.8796, + "step": 88115 + }, + { + "epoch": 5.987226525343117, + "grad_norm": 0.13597354292869568, + "learning_rate": 2.5190243239570593e-05, + "loss": 3.6734, + "step": 88120 + }, + { + "epoch": 5.987566245413779, + "grad_norm": 0.1809493452310562, + "learning_rate": 2.5185996738687325e-05, + "loss": 3.7226, + "step": 88125 + }, + { + "epoch": 5.98790596548444, + "grad_norm": 0.18407543003559113, + "learning_rate": 2.5181750237804053e-05, + "loss": 3.5772, + "step": 88130 + }, + { + "epoch": 5.9882456855551025, + "grad_norm": 0.20748290419578552, + "learning_rate": 2.5177503736920777e-05, + "loss": 3.5702, + "step": 88135 + }, + { + "epoch": 5.988585405625765, + "grad_norm": 0.17321763932704926, + "learning_rate": 2.5173257236037505e-05, + "loss": 3.817, + "step": 88140 + }, + { + "epoch": 5.988925125696426, + "grad_norm": 0.18901018798351288, + "learning_rate": 2.5169010735154237e-05, + "loss": 3.7309, + "step": 88145 + }, + { + "epoch": 5.989264845767088, + "grad_norm": 4.35134220123291, + "learning_rate": 2.516476423427096e-05, + "loss": 3.6755, + "step": 88150 + }, + { + "epoch": 5.98960456583775, + "grad_norm": 0.17760680615901947, + "learning_rate": 2.516051773338769e-05, + "loss": 3.9918, + "step": 88155 + }, + { + "epoch": 5.989944285908411, + "grad_norm": 0.1771373599767685, + "learning_rate": 2.515627123250442e-05, + "loss": 3.5779, + "step": 88160 + }, + { + "epoch": 5.990284005979073, + "grad_norm": 0.22196683287620544, + "learning_rate": 2.5152024731621142e-05, + "loss": 3.699, + "step": 88165 + }, + { + "epoch": 5.990623726049735, + "grad_norm": 0.1994863599538803, + "learning_rate": 2.5147778230737873e-05, + "loss": 4.0568, + "step": 88170 + }, + { + "epoch": 5.990963446120396, + "grad_norm": 0.19126400351524353, + "learning_rate": 2.51435317298546e-05, + "loss": 3.7844, + "step": 88175 + }, + { + "epoch": 5.9913031661910585, + "grad_norm": 0.24470463395118713, + "learning_rate": 2.5139285228971326e-05, + "loss": 4.1199, + "step": 88180 + }, + { + "epoch": 5.991642886261721, + "grad_norm": 0.1639489233493805, + "learning_rate": 2.5135038728088057e-05, + "loss": 3.848, + "step": 88185 + }, + { + "epoch": 5.991982606332382, + "grad_norm": 0.17713968455791473, + "learning_rate": 2.5130792227204785e-05, + "loss": 3.6378, + "step": 88190 + }, + { + "epoch": 5.992322326403044, + "grad_norm": 2.1178128719329834, + "learning_rate": 2.512654572632151e-05, + "loss": 3.6613, + "step": 88195 + }, + { + "epoch": 5.992662046473706, + "grad_norm": 0.16026709973812103, + "learning_rate": 2.5122299225438238e-05, + "loss": 3.912, + "step": 88200 + }, + { + "epoch": 5.993001766544367, + "grad_norm": 0.18087953329086304, + "learning_rate": 2.511805272455497e-05, + "loss": 3.8272, + "step": 88205 + }, + { + "epoch": 5.993341486615029, + "grad_norm": 0.6028064489364624, + "learning_rate": 2.5113806223671694e-05, + "loss": 3.7683, + "step": 88210 + }, + { + "epoch": 5.993681206685691, + "grad_norm": 0.1638663411140442, + "learning_rate": 2.5109559722788422e-05, + "loss": 3.5718, + "step": 88215 + }, + { + "epoch": 5.9940209267563525, + "grad_norm": 0.5797380805015564, + "learning_rate": 2.5105313221905153e-05, + "loss": 3.7581, + "step": 88220 + }, + { + "epoch": 5.9943606468270145, + "grad_norm": 0.3090953230857849, + "learning_rate": 2.510106672102188e-05, + "loss": 3.8527, + "step": 88225 + }, + { + "epoch": 5.994700366897677, + "grad_norm": 0.21213455498218536, + "learning_rate": 2.5096820220138606e-05, + "loss": 3.9659, + "step": 88230 + }, + { + "epoch": 5.995040086968338, + "grad_norm": 0.8605678081512451, + "learning_rate": 2.5092573719255334e-05, + "loss": 4.016, + "step": 88235 + }, + { + "epoch": 5.995379807039, + "grad_norm": 0.17479883134365082, + "learning_rate": 2.5088327218372065e-05, + "loss": 3.8574, + "step": 88240 + }, + { + "epoch": 5.995719527109662, + "grad_norm": 0.1702878475189209, + "learning_rate": 2.508408071748879e-05, + "loss": 3.7845, + "step": 88245 + }, + { + "epoch": 5.996059247180323, + "grad_norm": 0.21793755888938904, + "learning_rate": 2.5079834216605518e-05, + "loss": 3.957, + "step": 88250 + }, + { + "epoch": 5.996398967250985, + "grad_norm": 0.1505189687013626, + "learning_rate": 2.507558771572225e-05, + "loss": 3.6482, + "step": 88255 + }, + { + "epoch": 5.996738687321647, + "grad_norm": 0.6737164258956909, + "learning_rate": 2.5071341214838974e-05, + "loss": 3.9004, + "step": 88260 + }, + { + "epoch": 5.9970784073923085, + "grad_norm": 0.3836795687675476, + "learning_rate": 2.5067094713955702e-05, + "loss": 3.9438, + "step": 88265 + }, + { + "epoch": 5.9974181274629705, + "grad_norm": 0.7036447525024414, + "learning_rate": 2.5062848213072433e-05, + "loss": 3.7911, + "step": 88270 + }, + { + "epoch": 5.997757847533633, + "grad_norm": 0.15709033608436584, + "learning_rate": 2.5058601712189155e-05, + "loss": 3.8187, + "step": 88275 + }, + { + "epoch": 5.998097567604294, + "grad_norm": 0.15967978537082672, + "learning_rate": 2.5054355211305886e-05, + "loss": 3.836, + "step": 88280 + }, + { + "epoch": 5.998437287674956, + "grad_norm": 0.1633913367986679, + "learning_rate": 2.5050108710422614e-05, + "loss": 3.8224, + "step": 88285 + }, + { + "epoch": 5.998777007745618, + "grad_norm": 0.192756786942482, + "learning_rate": 2.504586220953934e-05, + "loss": 3.8452, + "step": 88290 + }, + { + "epoch": 5.999116727816279, + "grad_norm": 0.15225054323673248, + "learning_rate": 2.504161570865607e-05, + "loss": 3.749, + "step": 88295 + }, + { + "epoch": 5.999456447886941, + "grad_norm": 0.24956953525543213, + "learning_rate": 2.5037369207772798e-05, + "loss": 3.8444, + "step": 88300 + }, + { + "epoch": 5.999796167957603, + "grad_norm": 0.21167489886283875, + "learning_rate": 2.5033122706889523e-05, + "loss": 3.8819, + "step": 88305 + }, + { + "epoch": 6.0, + "eval_bertscore": { + "f1": 0.8522983806079436, + "precision": 0.8758887708828262, + "recall": 0.8302821247357478 + }, + "eval_bleu_4": 0.001877025915167185, + "eval_exact_match": 0.0, + "eval_loss": 3.6452934741973877, + "eval_meteor": 0.07523836707878802, + "eval_rouge": { + "rouge1": 0.12277253172519767, + "rouge2": 0.015237856538580855, + "rougeL": 0.10860551406961803, + "rougeLsum": 0.10861594625767944 + }, + "eval_runtime": 370.9169, + "eval_samples_per_second": 27.82, + "eval_steps_per_second": 3.478, + "step": 88308 + }, + { + "epoch": 6.0001358880282645, + "grad_norm": 0.21177932620048523, + "learning_rate": 2.502887620600625e-05, + "loss": 3.7291, + "step": 88310 + }, + { + "epoch": 6.0004756080989265, + "grad_norm": 0.18662048876285553, + "learning_rate": 2.5024629705122982e-05, + "loss": 3.6866, + "step": 88315 + }, + { + "epoch": 6.000815328169589, + "grad_norm": 0.21127019822597504, + "learning_rate": 2.5020383204239707e-05, + "loss": 3.8494, + "step": 88320 + }, + { + "epoch": 6.00115504824025, + "grad_norm": 0.20294392108917236, + "learning_rate": 2.5016136703356435e-05, + "loss": 3.8762, + "step": 88325 + }, + { + "epoch": 6.001494768310912, + "grad_norm": 0.2144780308008194, + "learning_rate": 2.5011890202473166e-05, + "loss": 3.7869, + "step": 88330 + }, + { + "epoch": 6.001834488381574, + "grad_norm": 0.1536242812871933, + "learning_rate": 2.5007643701589887e-05, + "loss": 3.8874, + "step": 88335 + }, + { + "epoch": 6.002174208452235, + "grad_norm": 0.21745771169662476, + "learning_rate": 2.500339720070662e-05, + "loss": 3.8959, + "step": 88340 + }, + { + "epoch": 6.002513928522897, + "grad_norm": 0.1690192073583603, + "learning_rate": 2.4999150699823347e-05, + "loss": 3.8764, + "step": 88345 + }, + { + "epoch": 6.002853648593559, + "grad_norm": 0.15318702161312103, + "learning_rate": 2.4994904198940075e-05, + "loss": 3.8666, + "step": 88350 + }, + { + "epoch": 6.0031933686642205, + "grad_norm": 0.23923543095588684, + "learning_rate": 2.4990657698056803e-05, + "loss": 3.8687, + "step": 88355 + }, + { + "epoch": 6.003533088734883, + "grad_norm": 0.21125124394893646, + "learning_rate": 2.498641119717353e-05, + "loss": 3.6032, + "step": 88360 + }, + { + "epoch": 6.003872808805545, + "grad_norm": 0.18487413227558136, + "learning_rate": 2.498216469629026e-05, + "loss": 3.4643, + "step": 88365 + }, + { + "epoch": 6.004212528876206, + "grad_norm": 0.17493277788162231, + "learning_rate": 2.4977918195406987e-05, + "loss": 3.4643, + "step": 88370 + }, + { + "epoch": 6.004552248946868, + "grad_norm": 0.23339276015758514, + "learning_rate": 2.4973671694523715e-05, + "loss": 3.8403, + "step": 88375 + }, + { + "epoch": 6.00489196901753, + "grad_norm": 0.20602121949195862, + "learning_rate": 2.4969425193640443e-05, + "loss": 3.8874, + "step": 88380 + }, + { + "epoch": 6.005231689088191, + "grad_norm": 0.18371155858039856, + "learning_rate": 2.4965178692757167e-05, + "loss": 3.9646, + "step": 88385 + }, + { + "epoch": 6.005571409158853, + "grad_norm": 0.22509869933128357, + "learning_rate": 2.49609321918739e-05, + "loss": 4.0126, + "step": 88390 + }, + { + "epoch": 6.005911129229515, + "grad_norm": 0.22131295502185822, + "learning_rate": 2.4956685690990623e-05, + "loss": 3.9592, + "step": 88395 + }, + { + "epoch": 6.0062508493001765, + "grad_norm": 0.1965228021144867, + "learning_rate": 2.495243919010735e-05, + "loss": 3.8752, + "step": 88400 + }, + { + "epoch": 6.006590569370839, + "grad_norm": 0.19128096103668213, + "learning_rate": 2.4948192689224083e-05, + "loss": 4.1738, + "step": 88405 + }, + { + "epoch": 6.006930289441501, + "grad_norm": 0.15303070843219757, + "learning_rate": 2.4943946188340808e-05, + "loss": 3.8655, + "step": 88410 + }, + { + "epoch": 6.007270009512162, + "grad_norm": 0.1768026500940323, + "learning_rate": 2.4939699687457536e-05, + "loss": 3.854, + "step": 88415 + }, + { + "epoch": 6.007609729582824, + "grad_norm": 0.18505296111106873, + "learning_rate": 2.4935453186574264e-05, + "loss": 3.9782, + "step": 88420 + }, + { + "epoch": 6.007949449653485, + "grad_norm": 0.2167542725801468, + "learning_rate": 2.493120668569099e-05, + "loss": 3.8961, + "step": 88425 + }, + { + "epoch": 6.008289169724147, + "grad_norm": 0.2257390320301056, + "learning_rate": 2.492696018480772e-05, + "loss": 3.9157, + "step": 88430 + }, + { + "epoch": 6.008628889794809, + "grad_norm": 0.17882804572582245, + "learning_rate": 2.4922713683924448e-05, + "loss": 3.6378, + "step": 88435 + }, + { + "epoch": 6.00896860986547, + "grad_norm": 0.2507639527320862, + "learning_rate": 2.4918467183041176e-05, + "loss": 3.7761, + "step": 88440 + }, + { + "epoch": 6.0093083299361325, + "grad_norm": 0.16430532932281494, + "learning_rate": 2.4914220682157904e-05, + "loss": 3.9402, + "step": 88445 + }, + { + "epoch": 6.009648050006795, + "grad_norm": 0.8925593495368958, + "learning_rate": 2.490997418127463e-05, + "loss": 3.8337, + "step": 88450 + }, + { + "epoch": 6.009987770077456, + "grad_norm": 0.18754784762859344, + "learning_rate": 2.490572768039136e-05, + "loss": 3.7648, + "step": 88455 + }, + { + "epoch": 6.010327490148118, + "grad_norm": 0.5204706788063049, + "learning_rate": 2.4901481179508088e-05, + "loss": 3.6481, + "step": 88460 + }, + { + "epoch": 6.01066721021878, + "grad_norm": 0.2087002843618393, + "learning_rate": 2.4897234678624816e-05, + "loss": 3.8115, + "step": 88465 + }, + { + "epoch": 6.011006930289441, + "grad_norm": 0.16794030368328094, + "learning_rate": 2.489298817774154e-05, + "loss": 3.9572, + "step": 88470 + }, + { + "epoch": 6.011346650360103, + "grad_norm": 0.13649730384349823, + "learning_rate": 2.488874167685827e-05, + "loss": 4.0053, + "step": 88475 + }, + { + "epoch": 6.011686370430765, + "grad_norm": 0.21843582391738892, + "learning_rate": 2.4884495175974996e-05, + "loss": 3.7374, + "step": 88480 + }, + { + "epoch": 6.0120260905014264, + "grad_norm": 0.8665614724159241, + "learning_rate": 2.4880248675091724e-05, + "loss": 3.8167, + "step": 88485 + }, + { + "epoch": 6.0123658105720885, + "grad_norm": 0.2085527777671814, + "learning_rate": 2.4876002174208456e-05, + "loss": 3.6928, + "step": 88490 + }, + { + "epoch": 6.012705530642751, + "grad_norm": 0.1784515678882599, + "learning_rate": 2.487175567332518e-05, + "loss": 3.9164, + "step": 88495 + }, + { + "epoch": 6.013045250713412, + "grad_norm": 0.18934592604637146, + "learning_rate": 2.4867509172441908e-05, + "loss": 3.8659, + "step": 88500 + }, + { + "epoch": 6.013384970784074, + "grad_norm": 0.19530123472213745, + "learning_rate": 2.4863262671558636e-05, + "loss": 3.8242, + "step": 88505 + }, + { + "epoch": 6.013724690854736, + "grad_norm": 0.299832284450531, + "learning_rate": 2.4859016170675364e-05, + "loss": 3.7511, + "step": 88510 + }, + { + "epoch": 6.014064410925397, + "grad_norm": 0.20527534186840057, + "learning_rate": 2.4854769669792092e-05, + "loss": 3.7464, + "step": 88515 + }, + { + "epoch": 6.014404130996059, + "grad_norm": 0.16686655580997467, + "learning_rate": 2.485052316890882e-05, + "loss": 3.8823, + "step": 88520 + }, + { + "epoch": 6.014743851066721, + "grad_norm": 0.199691042304039, + "learning_rate": 2.4846276668025548e-05, + "loss": 3.8665, + "step": 88525 + }, + { + "epoch": 6.0150835711373825, + "grad_norm": 0.24624280631542206, + "learning_rate": 2.4842030167142276e-05, + "loss": 3.812, + "step": 88530 + }, + { + "epoch": 6.0154232912080445, + "grad_norm": 0.2083972990512848, + "learning_rate": 2.4837783666259004e-05, + "loss": 3.8383, + "step": 88535 + }, + { + "epoch": 6.015763011278707, + "grad_norm": 0.17676231265068054, + "learning_rate": 2.4833537165375732e-05, + "loss": 3.6572, + "step": 88540 + }, + { + "epoch": 6.016102731349368, + "grad_norm": 0.18028351664543152, + "learning_rate": 2.482929066449246e-05, + "loss": 3.8348, + "step": 88545 + }, + { + "epoch": 6.01644245142003, + "grad_norm": 0.1835000216960907, + "learning_rate": 2.482504416360919e-05, + "loss": 4.0105, + "step": 88550 + }, + { + "epoch": 6.016782171490692, + "grad_norm": 0.22100220620632172, + "learning_rate": 2.4820797662725913e-05, + "loss": 3.6786, + "step": 88555 + }, + { + "epoch": 6.017121891561353, + "grad_norm": 0.18769507110118866, + "learning_rate": 2.4816551161842644e-05, + "loss": 3.535, + "step": 88560 + }, + { + "epoch": 6.017461611632015, + "grad_norm": 0.1531793177127838, + "learning_rate": 2.4812304660959372e-05, + "loss": 3.8558, + "step": 88565 + }, + { + "epoch": 6.017801331702677, + "grad_norm": 0.1868162453174591, + "learning_rate": 2.4808058160076097e-05, + "loss": 3.8657, + "step": 88570 + }, + { + "epoch": 6.0181410517733385, + "grad_norm": 0.14480575919151306, + "learning_rate": 2.480381165919283e-05, + "loss": 3.8946, + "step": 88575 + }, + { + "epoch": 6.0184807718440005, + "grad_norm": 0.2325258105993271, + "learning_rate": 2.4799565158309553e-05, + "loss": 4.0012, + "step": 88580 + }, + { + "epoch": 6.018820491914663, + "grad_norm": 0.13523073494434357, + "learning_rate": 2.479531865742628e-05, + "loss": 3.8009, + "step": 88585 + }, + { + "epoch": 6.019160211985324, + "grad_norm": 0.7730312347412109, + "learning_rate": 2.479107215654301e-05, + "loss": 3.8803, + "step": 88590 + }, + { + "epoch": 6.019499932055986, + "grad_norm": 0.20273755490779877, + "learning_rate": 2.4786825655659737e-05, + "loss": 3.6728, + "step": 88595 + }, + { + "epoch": 6.019839652126648, + "grad_norm": 0.23541715741157532, + "learning_rate": 2.4782579154776465e-05, + "loss": 3.7825, + "step": 88600 + }, + { + "epoch": 6.020179372197309, + "grad_norm": 0.181257963180542, + "learning_rate": 2.4778332653893193e-05, + "loss": 3.9775, + "step": 88605 + }, + { + "epoch": 6.020519092267971, + "grad_norm": 0.18461644649505615, + "learning_rate": 2.477408615300992e-05, + "loss": 3.7577, + "step": 88610 + }, + { + "epoch": 6.020858812338633, + "grad_norm": 9.930700302124023, + "learning_rate": 2.476983965212665e-05, + "loss": 3.9715, + "step": 88615 + }, + { + "epoch": 6.0211985324092945, + "grad_norm": 0.16601328551769257, + "learning_rate": 2.4765593151243377e-05, + "loss": 3.7774, + "step": 88620 + }, + { + "epoch": 6.0215382524799566, + "grad_norm": 0.2117709219455719, + "learning_rate": 2.4761346650360105e-05, + "loss": 3.7013, + "step": 88625 + }, + { + "epoch": 6.021877972550619, + "grad_norm": 0.20543916523456573, + "learning_rate": 2.4757100149476833e-05, + "loss": 3.927, + "step": 88630 + }, + { + "epoch": 6.02221769262128, + "grad_norm": 0.24819360673427582, + "learning_rate": 2.475285364859356e-05, + "loss": 3.752, + "step": 88635 + }, + { + "epoch": 6.022557412691942, + "grad_norm": 0.2178351879119873, + "learning_rate": 2.4748607147710286e-05, + "loss": 3.8744, + "step": 88640 + }, + { + "epoch": 6.022897132762604, + "grad_norm": 0.15564312040805817, + "learning_rate": 2.4744360646827017e-05, + "loss": 3.8108, + "step": 88645 + }, + { + "epoch": 6.023236852833265, + "grad_norm": 0.22226494550704956, + "learning_rate": 2.4740114145943745e-05, + "loss": 3.6884, + "step": 88650 + }, + { + "epoch": 6.023576572903927, + "grad_norm": 0.16782939434051514, + "learning_rate": 2.473586764506047e-05, + "loss": 3.5864, + "step": 88655 + }, + { + "epoch": 6.023916292974589, + "grad_norm": 0.18459592759609222, + "learning_rate": 2.47316211441772e-05, + "loss": 3.7749, + "step": 88660 + }, + { + "epoch": 6.0242560130452505, + "grad_norm": 0.14734330773353577, + "learning_rate": 2.4727374643293926e-05, + "loss": 3.531, + "step": 88665 + }, + { + "epoch": 6.024595733115913, + "grad_norm": 0.16865232586860657, + "learning_rate": 2.4723128142410654e-05, + "loss": 4.0215, + "step": 88670 + }, + { + "epoch": 6.024935453186575, + "grad_norm": 0.15350030362606049, + "learning_rate": 2.4718881641527382e-05, + "loss": 3.7311, + "step": 88675 + }, + { + "epoch": 6.025275173257236, + "grad_norm": 0.1832588165998459, + "learning_rate": 2.471463514064411e-05, + "loss": 3.8481, + "step": 88680 + }, + { + "epoch": 6.025614893327898, + "grad_norm": 1.688464879989624, + "learning_rate": 2.4710388639760838e-05, + "loss": 3.7003, + "step": 88685 + }, + { + "epoch": 6.02595461339856, + "grad_norm": 0.1932218372821808, + "learning_rate": 2.4706142138877566e-05, + "loss": 3.845, + "step": 88690 + }, + { + "epoch": 6.026294333469221, + "grad_norm": 0.13740123808383942, + "learning_rate": 2.4701895637994294e-05, + "loss": 3.5275, + "step": 88695 + }, + { + "epoch": 6.026634053539883, + "grad_norm": 0.15006835758686066, + "learning_rate": 2.4697649137111022e-05, + "loss": 4.1243, + "step": 88700 + }, + { + "epoch": 6.026973773610545, + "grad_norm": 0.29665106534957886, + "learning_rate": 2.469340263622775e-05, + "loss": 3.6925, + "step": 88705 + }, + { + "epoch": 6.0273134936812065, + "grad_norm": 0.2130628526210785, + "learning_rate": 2.4689156135344478e-05, + "loss": 3.6733, + "step": 88710 + }, + { + "epoch": 6.027653213751869, + "grad_norm": 0.19061098992824554, + "learning_rate": 2.4684909634461206e-05, + "loss": 3.9367, + "step": 88715 + }, + { + "epoch": 6.027992933822531, + "grad_norm": 0.34877702593803406, + "learning_rate": 2.4680663133577934e-05, + "loss": 3.9324, + "step": 88720 + }, + { + "epoch": 6.028332653893192, + "grad_norm": 0.16084209084510803, + "learning_rate": 2.467641663269466e-05, + "loss": 3.8806, + "step": 88725 + }, + { + "epoch": 6.028672373963854, + "grad_norm": 0.1873062700033188, + "learning_rate": 2.467217013181139e-05, + "loss": 3.9693, + "step": 88730 + }, + { + "epoch": 6.029012094034516, + "grad_norm": 0.21399915218353271, + "learning_rate": 2.4667923630928118e-05, + "loss": 3.8211, + "step": 88735 + }, + { + "epoch": 6.029351814105177, + "grad_norm": 0.2036156803369522, + "learning_rate": 2.4663677130044842e-05, + "loss": 4.1322, + "step": 88740 + }, + { + "epoch": 6.029691534175839, + "grad_norm": 0.4657822847366333, + "learning_rate": 2.4659430629161574e-05, + "loss": 3.8925, + "step": 88745 + }, + { + "epoch": 6.0300312542465, + "grad_norm": 0.22546829283237457, + "learning_rate": 2.46551841282783e-05, + "loss": 3.7863, + "step": 88750 + }, + { + "epoch": 6.0303709743171625, + "grad_norm": 0.19521036744117737, + "learning_rate": 2.4650937627395026e-05, + "loss": 4.0603, + "step": 88755 + }, + { + "epoch": 6.030710694387825, + "grad_norm": 0.16000720858573914, + "learning_rate": 2.4646691126511754e-05, + "loss": 3.7513, + "step": 88760 + }, + { + "epoch": 6.031050414458486, + "grad_norm": 0.20664943754673004, + "learning_rate": 2.4642444625628483e-05, + "loss": 3.7654, + "step": 88765 + }, + { + "epoch": 6.031390134529148, + "grad_norm": 0.20301562547683716, + "learning_rate": 2.463819812474521e-05, + "loss": 3.6583, + "step": 88770 + }, + { + "epoch": 6.03172985459981, + "grad_norm": 0.1446334421634674, + "learning_rate": 2.463395162386194e-05, + "loss": 3.8547, + "step": 88775 + }, + { + "epoch": 6.032069574670471, + "grad_norm": 0.37464675307273865, + "learning_rate": 2.4629705122978667e-05, + "loss": 3.8105, + "step": 88780 + }, + { + "epoch": 6.032409294741133, + "grad_norm": 0.25557804107666016, + "learning_rate": 2.4625458622095395e-05, + "loss": 3.7573, + "step": 88785 + }, + { + "epoch": 6.032749014811795, + "grad_norm": 0.16993987560272217, + "learning_rate": 2.4621212121212123e-05, + "loss": 3.7893, + "step": 88790 + }, + { + "epoch": 6.0330887348824564, + "grad_norm": 0.16200228035449982, + "learning_rate": 2.461696562032885e-05, + "loss": 3.9401, + "step": 88795 + }, + { + "epoch": 6.0334284549531185, + "grad_norm": 0.22022701799869537, + "learning_rate": 2.461271911944558e-05, + "loss": 3.8839, + "step": 88800 + }, + { + "epoch": 6.033768175023781, + "grad_norm": 0.17201821506023407, + "learning_rate": 2.4608472618562307e-05, + "loss": 3.9209, + "step": 88805 + }, + { + "epoch": 6.034107895094442, + "grad_norm": 0.16385890543460846, + "learning_rate": 2.460422611767903e-05, + "loss": 3.8075, + "step": 88810 + }, + { + "epoch": 6.034447615165104, + "grad_norm": 0.2853863537311554, + "learning_rate": 2.4599979616795763e-05, + "loss": 3.7645, + "step": 88815 + }, + { + "epoch": 6.034787335235766, + "grad_norm": 0.18430835008621216, + "learning_rate": 2.459573311591249e-05, + "loss": 4.1578, + "step": 88820 + }, + { + "epoch": 6.035127055306427, + "grad_norm": 0.1418386995792389, + "learning_rate": 2.4591486615029215e-05, + "loss": 3.8794, + "step": 88825 + }, + { + "epoch": 6.035466775377089, + "grad_norm": 0.19755493104457855, + "learning_rate": 2.4587240114145947e-05, + "loss": 3.906, + "step": 88830 + }, + { + "epoch": 6.035806495447751, + "grad_norm": 0.15876945853233337, + "learning_rate": 2.458299361326267e-05, + "loss": 3.7415, + "step": 88835 + }, + { + "epoch": 6.0361462155184125, + "grad_norm": 0.18128232657909393, + "learning_rate": 2.45787471123794e-05, + "loss": 3.7685, + "step": 88840 + }, + { + "epoch": 6.0364859355890745, + "grad_norm": 0.27148985862731934, + "learning_rate": 2.457450061149613e-05, + "loss": 3.68, + "step": 88845 + }, + { + "epoch": 6.036825655659737, + "grad_norm": 0.17085154354572296, + "learning_rate": 2.4570254110612855e-05, + "loss": 3.9833, + "step": 88850 + }, + { + "epoch": 6.037165375730398, + "grad_norm": 0.15329430997371674, + "learning_rate": 2.4566007609729583e-05, + "loss": 3.9202, + "step": 88855 + }, + { + "epoch": 6.03750509580106, + "grad_norm": 0.1436380296945572, + "learning_rate": 2.456176110884631e-05, + "loss": 4.0724, + "step": 88860 + }, + { + "epoch": 6.037844815871722, + "grad_norm": 0.21794702112674713, + "learning_rate": 2.455751460796304e-05, + "loss": 3.6169, + "step": 88865 + }, + { + "epoch": 6.038184535942383, + "grad_norm": 0.215349942445755, + "learning_rate": 2.4553268107079767e-05, + "loss": 3.8937, + "step": 88870 + }, + { + "epoch": 6.038524256013045, + "grad_norm": 1.3969995975494385, + "learning_rate": 2.4549021606196495e-05, + "loss": 3.8345, + "step": 88875 + }, + { + "epoch": 6.038863976083707, + "grad_norm": 0.15279865264892578, + "learning_rate": 2.4544775105313223e-05, + "loss": 3.9439, + "step": 88880 + }, + { + "epoch": 6.0392036961543685, + "grad_norm": 0.16989754140377045, + "learning_rate": 2.454052860442995e-05, + "loss": 3.7422, + "step": 88885 + }, + { + "epoch": 6.0395434162250305, + "grad_norm": 0.18451401591300964, + "learning_rate": 2.453628210354668e-05, + "loss": 3.6025, + "step": 88890 + }, + { + "epoch": 6.039883136295693, + "grad_norm": 0.23892010748386383, + "learning_rate": 2.4532035602663407e-05, + "loss": 3.8303, + "step": 88895 + }, + { + "epoch": 6.040222856366354, + "grad_norm": 0.23327895998954773, + "learning_rate": 2.4527789101780135e-05, + "loss": 3.9387, + "step": 88900 + }, + { + "epoch": 6.040562576437016, + "grad_norm": 0.21851515769958496, + "learning_rate": 2.4523542600896863e-05, + "loss": 4.0228, + "step": 88905 + }, + { + "epoch": 6.040902296507678, + "grad_norm": 0.27634531259536743, + "learning_rate": 2.4519296100013588e-05, + "loss": 3.7622, + "step": 88910 + }, + { + "epoch": 6.041242016578339, + "grad_norm": 0.2618471086025238, + "learning_rate": 2.451504959913032e-05, + "loss": 3.8345, + "step": 88915 + }, + { + "epoch": 6.041581736649001, + "grad_norm": 0.21901296079158783, + "learning_rate": 2.4510803098247044e-05, + "loss": 3.7935, + "step": 88920 + }, + { + "epoch": 6.041921456719663, + "grad_norm": 0.17258380353450775, + "learning_rate": 2.4506556597363772e-05, + "loss": 3.6382, + "step": 88925 + }, + { + "epoch": 6.0422611767903245, + "grad_norm": 0.16824398934841156, + "learning_rate": 2.4502310096480503e-05, + "loss": 3.8494, + "step": 88930 + }, + { + "epoch": 6.042600896860987, + "grad_norm": 0.25254026055336, + "learning_rate": 2.4498063595597228e-05, + "loss": 3.9929, + "step": 88935 + }, + { + "epoch": 6.042940616931649, + "grad_norm": 0.1934623122215271, + "learning_rate": 2.4493817094713956e-05, + "loss": 3.677, + "step": 88940 + }, + { + "epoch": 6.04328033700231, + "grad_norm": 1.85792875289917, + "learning_rate": 2.4489570593830684e-05, + "loss": 3.8218, + "step": 88945 + }, + { + "epoch": 6.043620057072972, + "grad_norm": 0.24609965085983276, + "learning_rate": 2.4485324092947412e-05, + "loss": 4.1007, + "step": 88950 + }, + { + "epoch": 6.043959777143634, + "grad_norm": 0.23275503516197205, + "learning_rate": 2.448107759206414e-05, + "loss": 3.8643, + "step": 88955 + }, + { + "epoch": 6.044299497214295, + "grad_norm": 0.31832197308540344, + "learning_rate": 2.4476831091180868e-05, + "loss": 4.0627, + "step": 88960 + }, + { + "epoch": 6.044639217284957, + "grad_norm": 0.2144773006439209, + "learning_rate": 2.4472584590297596e-05, + "loss": 3.9374, + "step": 88965 + }, + { + "epoch": 6.044978937355619, + "grad_norm": 0.2248833328485489, + "learning_rate": 2.4468338089414324e-05, + "loss": 4.0384, + "step": 88970 + }, + { + "epoch": 6.0453186574262805, + "grad_norm": 0.16910918056964874, + "learning_rate": 2.4464091588531052e-05, + "loss": 3.7311, + "step": 88975 + }, + { + "epoch": 6.045658377496943, + "grad_norm": 0.16838407516479492, + "learning_rate": 2.445984508764778e-05, + "loss": 4.0097, + "step": 88980 + }, + { + "epoch": 6.045998097567605, + "grad_norm": 0.2269686758518219, + "learning_rate": 2.4455598586764508e-05, + "loss": 3.8073, + "step": 88985 + }, + { + "epoch": 6.046337817638266, + "grad_norm": 0.2175985723733902, + "learning_rate": 2.4451352085881236e-05, + "loss": 3.5731, + "step": 88990 + }, + { + "epoch": 6.046677537708928, + "grad_norm": 0.48604339361190796, + "learning_rate": 2.444710558499796e-05, + "loss": 3.8878, + "step": 88995 + }, + { + "epoch": 6.04701725777959, + "grad_norm": 0.16577395796775818, + "learning_rate": 2.4442859084114692e-05, + "loss": 3.5541, + "step": 89000 + }, + { + "epoch": 6.047356977850251, + "grad_norm": 0.20737752318382263, + "learning_rate": 2.4438612583231417e-05, + "loss": 3.9141, + "step": 89005 + }, + { + "epoch": 6.047696697920913, + "grad_norm": 0.21135520935058594, + "learning_rate": 2.4434366082348145e-05, + "loss": 3.8159, + "step": 89010 + }, + { + "epoch": 6.048036417991575, + "grad_norm": 0.19439083337783813, + "learning_rate": 2.4430119581464876e-05, + "loss": 3.7517, + "step": 89015 + }, + { + "epoch": 6.0483761380622365, + "grad_norm": 0.1445995718240738, + "learning_rate": 2.44258730805816e-05, + "loss": 3.6061, + "step": 89020 + }, + { + "epoch": 6.048715858132899, + "grad_norm": 0.29242753982543945, + "learning_rate": 2.442162657969833e-05, + "loss": 3.7691, + "step": 89025 + }, + { + "epoch": 6.049055578203561, + "grad_norm": 0.15006835758686066, + "learning_rate": 2.4417380078815057e-05, + "loss": 3.7581, + "step": 89030 + }, + { + "epoch": 6.049395298274222, + "grad_norm": 0.3652605712413788, + "learning_rate": 2.4413133577931785e-05, + "loss": 3.8114, + "step": 89035 + }, + { + "epoch": 6.049735018344884, + "grad_norm": 0.2304953932762146, + "learning_rate": 2.4408887077048516e-05, + "loss": 3.9255, + "step": 89040 + }, + { + "epoch": 6.050074738415546, + "grad_norm": 0.18763265013694763, + "learning_rate": 2.440464057616524e-05, + "loss": 3.8244, + "step": 89045 + }, + { + "epoch": 6.050414458486207, + "grad_norm": 0.14828293025493622, + "learning_rate": 2.440039407528197e-05, + "loss": 3.6767, + "step": 89050 + }, + { + "epoch": 6.050754178556869, + "grad_norm": 1.5288492441177368, + "learning_rate": 2.4396147574398697e-05, + "loss": 3.848, + "step": 89055 + }, + { + "epoch": 6.051093898627531, + "grad_norm": 0.2147945910692215, + "learning_rate": 2.4391901073515425e-05, + "loss": 3.7846, + "step": 89060 + }, + { + "epoch": 6.0514336186981925, + "grad_norm": 2.302523136138916, + "learning_rate": 2.4387654572632153e-05, + "loss": 3.7717, + "step": 89065 + }, + { + "epoch": 6.051773338768855, + "grad_norm": 0.20742860436439514, + "learning_rate": 2.438340807174888e-05, + "loss": 3.935, + "step": 89070 + }, + { + "epoch": 6.052113058839517, + "grad_norm": 0.21734404563903809, + "learning_rate": 2.437916157086561e-05, + "loss": 3.6442, + "step": 89075 + }, + { + "epoch": 6.052452778910178, + "grad_norm": 0.1783326268196106, + "learning_rate": 2.4374915069982333e-05, + "loss": 3.8647, + "step": 89080 + }, + { + "epoch": 6.05279249898084, + "grad_norm": 0.18296633660793304, + "learning_rate": 2.4370668569099065e-05, + "loss": 3.8057, + "step": 89085 + }, + { + "epoch": 6.053132219051502, + "grad_norm": 0.14456725120544434, + "learning_rate": 2.4366422068215793e-05, + "loss": 3.8594, + "step": 89090 + }, + { + "epoch": 6.053471939122163, + "grad_norm": 0.16386494040489197, + "learning_rate": 2.4362175567332517e-05, + "loss": 3.7484, + "step": 89095 + }, + { + "epoch": 6.053811659192825, + "grad_norm": 0.17106950283050537, + "learning_rate": 2.435792906644925e-05, + "loss": 3.8988, + "step": 89100 + }, + { + "epoch": 6.0541513792634865, + "grad_norm": 0.13463446497917175, + "learning_rate": 2.4353682565565973e-05, + "loss": 3.8336, + "step": 89105 + }, + { + "epoch": 6.0544910993341485, + "grad_norm": 0.765954852104187, + "learning_rate": 2.43494360646827e-05, + "loss": 3.8123, + "step": 89110 + }, + { + "epoch": 6.054830819404811, + "grad_norm": 0.27592089772224426, + "learning_rate": 2.434518956379943e-05, + "loss": 3.3864, + "step": 89115 + }, + { + "epoch": 6.055170539475472, + "grad_norm": 0.1466210037469864, + "learning_rate": 2.4340943062916158e-05, + "loss": 3.9388, + "step": 89120 + }, + { + "epoch": 6.055510259546134, + "grad_norm": 0.18732990324497223, + "learning_rate": 2.433669656203289e-05, + "loss": 3.87, + "step": 89125 + }, + { + "epoch": 6.055849979616796, + "grad_norm": 0.16487744450569153, + "learning_rate": 2.4332450061149614e-05, + "loss": 3.5341, + "step": 89130 + }, + { + "epoch": 6.056189699687457, + "grad_norm": 0.14847196638584137, + "learning_rate": 2.432820356026634e-05, + "loss": 4.0006, + "step": 89135 + }, + { + "epoch": 6.056529419758119, + "grad_norm": 0.8282633423805237, + "learning_rate": 2.432395705938307e-05, + "loss": 3.7042, + "step": 89140 + }, + { + "epoch": 6.056869139828781, + "grad_norm": 0.20449768006801605, + "learning_rate": 2.4319710558499798e-05, + "loss": 3.8526, + "step": 89145 + }, + { + "epoch": 6.0572088598994425, + "grad_norm": 0.1627482920885086, + "learning_rate": 2.4315464057616526e-05, + "loss": 3.7521, + "step": 89150 + }, + { + "epoch": 6.0575485799701045, + "grad_norm": 0.17148491740226746, + "learning_rate": 2.4311217556733254e-05, + "loss": 3.8215, + "step": 89155 + }, + { + "epoch": 6.057888300040767, + "grad_norm": 0.17577508091926575, + "learning_rate": 2.430697105584998e-05, + "loss": 3.6693, + "step": 89160 + }, + { + "epoch": 6.058228020111428, + "grad_norm": 0.29566389322280884, + "learning_rate": 2.4302724554966706e-05, + "loss": 3.9162, + "step": 89165 + }, + { + "epoch": 6.05856774018209, + "grad_norm": 0.20496052503585815, + "learning_rate": 2.4298478054083438e-05, + "loss": 3.7889, + "step": 89170 + }, + { + "epoch": 6.058907460252752, + "grad_norm": 0.15963220596313477, + "learning_rate": 2.4294231553200166e-05, + "loss": 4.1223, + "step": 89175 + }, + { + "epoch": 6.059247180323413, + "grad_norm": 0.4274572432041168, + "learning_rate": 2.428998505231689e-05, + "loss": 3.8484, + "step": 89180 + }, + { + "epoch": 6.059586900394075, + "grad_norm": 0.18242141604423523, + "learning_rate": 2.428573855143362e-05, + "loss": 3.9406, + "step": 89185 + }, + { + "epoch": 6.059926620464737, + "grad_norm": 0.17306458950042725, + "learning_rate": 2.4281492050550346e-05, + "loss": 3.8214, + "step": 89190 + }, + { + "epoch": 6.0602663405353985, + "grad_norm": 0.15839892625808716, + "learning_rate": 2.4277245549667074e-05, + "loss": 3.7069, + "step": 89195 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 0.6020665764808655, + "learning_rate": 2.4272999048783802e-05, + "loss": 3.9582, + "step": 89200 + }, + { + "epoch": 6.060945780676723, + "grad_norm": 0.15659955143928528, + "learning_rate": 2.426875254790053e-05, + "loss": 3.7823, + "step": 89205 + }, + { + "epoch": 6.061285500747384, + "grad_norm": 0.20657777786254883, + "learning_rate": 2.426450604701726e-05, + "loss": 3.7233, + "step": 89210 + }, + { + "epoch": 6.061625220818046, + "grad_norm": 0.15863776206970215, + "learning_rate": 2.4260259546133986e-05, + "loss": 3.5079, + "step": 89215 + }, + { + "epoch": 6.061964940888708, + "grad_norm": 0.2800079584121704, + "learning_rate": 2.4256013045250714e-05, + "loss": 3.8373, + "step": 89220 + }, + { + "epoch": 6.062304660959369, + "grad_norm": 0.1831672042608261, + "learning_rate": 2.4251766544367442e-05, + "loss": 3.7062, + "step": 89225 + }, + { + "epoch": 6.062644381030031, + "grad_norm": 0.44519782066345215, + "learning_rate": 2.424752004348417e-05, + "loss": 3.9425, + "step": 89230 + }, + { + "epoch": 6.062984101100693, + "grad_norm": 0.17647263407707214, + "learning_rate": 2.4243273542600898e-05, + "loss": 4.1012, + "step": 89235 + }, + { + "epoch": 6.0633238211713545, + "grad_norm": 0.22938580811023712, + "learning_rate": 2.4239027041717626e-05, + "loss": 3.718, + "step": 89240 + }, + { + "epoch": 6.063663541242017, + "grad_norm": 0.1733253300189972, + "learning_rate": 2.4234780540834354e-05, + "loss": 3.8671, + "step": 89245 + }, + { + "epoch": 6.064003261312679, + "grad_norm": 0.14304296672344208, + "learning_rate": 2.423053403995108e-05, + "loss": 3.9664, + "step": 89250 + }, + { + "epoch": 6.06434298138334, + "grad_norm": 1.766561508178711, + "learning_rate": 2.422628753906781e-05, + "loss": 3.917, + "step": 89255 + }, + { + "epoch": 6.064682701454002, + "grad_norm": 0.154021218419075, + "learning_rate": 2.422204103818454e-05, + "loss": 3.7543, + "step": 89260 + }, + { + "epoch": 6.065022421524664, + "grad_norm": 0.13302730023860931, + "learning_rate": 2.4217794537301263e-05, + "loss": 3.9001, + "step": 89265 + }, + { + "epoch": 6.065362141595325, + "grad_norm": 0.16150397062301636, + "learning_rate": 2.4213548036417994e-05, + "loss": 3.9608, + "step": 89270 + }, + { + "epoch": 6.065701861665987, + "grad_norm": 0.3454391360282898, + "learning_rate": 2.420930153553472e-05, + "loss": 3.8816, + "step": 89275 + }, + { + "epoch": 6.066041581736649, + "grad_norm": 0.7212494611740112, + "learning_rate": 2.4205055034651447e-05, + "loss": 3.6609, + "step": 89280 + }, + { + "epoch": 6.0663813018073105, + "grad_norm": 0.19949479401111603, + "learning_rate": 2.420080853376818e-05, + "loss": 3.8906, + "step": 89285 + }, + { + "epoch": 6.066721021877973, + "grad_norm": 0.23371274769306183, + "learning_rate": 2.4196562032884903e-05, + "loss": 4.0488, + "step": 89290 + }, + { + "epoch": 6.067060741948635, + "grad_norm": 0.16535060107707977, + "learning_rate": 2.4192315532001634e-05, + "loss": 4.134, + "step": 89295 + }, + { + "epoch": 6.067400462019296, + "grad_norm": 0.13421116769313812, + "learning_rate": 2.418806903111836e-05, + "loss": 3.8479, + "step": 89300 + }, + { + "epoch": 6.067740182089958, + "grad_norm": 0.21399077773094177, + "learning_rate": 2.4183822530235087e-05, + "loss": 3.6808, + "step": 89305 + }, + { + "epoch": 6.06807990216062, + "grad_norm": 0.20870853960514069, + "learning_rate": 2.4179576029351815e-05, + "loss": 3.8165, + "step": 89310 + }, + { + "epoch": 6.068419622231281, + "grad_norm": 0.23541882634162903, + "learning_rate": 2.4175329528468543e-05, + "loss": 3.8796, + "step": 89315 + }, + { + "epoch": 6.068759342301943, + "grad_norm": 0.24850043654441833, + "learning_rate": 2.417108302758527e-05, + "loss": 3.6846, + "step": 89320 + }, + { + "epoch": 6.069099062372605, + "grad_norm": 0.13661794364452362, + "learning_rate": 2.4166836526702e-05, + "loss": 4.0573, + "step": 89325 + }, + { + "epoch": 6.0694387824432665, + "grad_norm": 0.1563296616077423, + "learning_rate": 2.4162590025818727e-05, + "loss": 3.9028, + "step": 89330 + }, + { + "epoch": 6.069778502513929, + "grad_norm": 0.16597191989421844, + "learning_rate": 2.4158343524935455e-05, + "loss": 3.9974, + "step": 89335 + }, + { + "epoch": 6.070118222584591, + "grad_norm": 0.5771569609642029, + "learning_rate": 2.4154097024052183e-05, + "loss": 4.0442, + "step": 89340 + }, + { + "epoch": 6.070457942655252, + "grad_norm": 0.3842753469944, + "learning_rate": 2.414985052316891e-05, + "loss": 3.7259, + "step": 89345 + }, + { + "epoch": 6.070797662725914, + "grad_norm": 0.17972233891487122, + "learning_rate": 2.4145604022285636e-05, + "loss": 3.8931, + "step": 89350 + }, + { + "epoch": 6.071137382796576, + "grad_norm": 0.2929549515247345, + "learning_rate": 2.4141357521402367e-05, + "loss": 3.9033, + "step": 89355 + }, + { + "epoch": 6.071477102867237, + "grad_norm": 0.20192459225654602, + "learning_rate": 2.4137111020519092e-05, + "loss": 3.8165, + "step": 89360 + }, + { + "epoch": 6.071816822937899, + "grad_norm": 0.17204369604587555, + "learning_rate": 2.413286451963582e-05, + "loss": 3.9384, + "step": 89365 + }, + { + "epoch": 6.072156543008561, + "grad_norm": 0.15395426750183105, + "learning_rate": 2.412861801875255e-05, + "loss": 3.9046, + "step": 89370 + }, + { + "epoch": 6.0724962630792225, + "grad_norm": 0.1721627116203308, + "learning_rate": 2.4124371517869276e-05, + "loss": 3.6645, + "step": 89375 + }, + { + "epoch": 6.072835983149885, + "grad_norm": 0.20758745074272156, + "learning_rate": 2.4120125016986007e-05, + "loss": 3.7347, + "step": 89380 + }, + { + "epoch": 6.073175703220547, + "grad_norm": 0.17571240663528442, + "learning_rate": 2.4115878516102732e-05, + "loss": 3.8581, + "step": 89385 + }, + { + "epoch": 6.073515423291208, + "grad_norm": 0.14244906604290009, + "learning_rate": 2.411163201521946e-05, + "loss": 4.056, + "step": 89390 + }, + { + "epoch": 6.07385514336187, + "grad_norm": 0.23501212894916534, + "learning_rate": 2.4107385514336188e-05, + "loss": 3.9044, + "step": 89395 + }, + { + "epoch": 6.074194863432532, + "grad_norm": 0.22503703832626343, + "learning_rate": 2.4103139013452916e-05, + "loss": 3.5709, + "step": 89400 + }, + { + "epoch": 6.074534583503193, + "grad_norm": 0.21758879721164703, + "learning_rate": 2.4098892512569644e-05, + "loss": 3.8737, + "step": 89405 + }, + { + "epoch": 6.074874303573855, + "grad_norm": 0.1999451369047165, + "learning_rate": 2.4094646011686372e-05, + "loss": 4.0897, + "step": 89410 + }, + { + "epoch": 6.075214023644517, + "grad_norm": 2.1164112091064453, + "learning_rate": 2.40903995108031e-05, + "loss": 3.7326, + "step": 89415 + }, + { + "epoch": 6.0755537437151785, + "grad_norm": 0.15366551280021667, + "learning_rate": 2.4086153009919828e-05, + "loss": 3.648, + "step": 89420 + }, + { + "epoch": 6.075893463785841, + "grad_norm": 0.16424506902694702, + "learning_rate": 2.4081906509036556e-05, + "loss": 3.9412, + "step": 89425 + }, + { + "epoch": 6.076233183856502, + "grad_norm": 0.1820128858089447, + "learning_rate": 2.4077660008153284e-05, + "loss": 4.0194, + "step": 89430 + }, + { + "epoch": 6.076572903927164, + "grad_norm": 0.17011329531669617, + "learning_rate": 2.407341350727001e-05, + "loss": 3.9761, + "step": 89435 + }, + { + "epoch": 6.076912623997826, + "grad_norm": 0.23292195796966553, + "learning_rate": 2.406916700638674e-05, + "loss": 3.9202, + "step": 89440 + }, + { + "epoch": 6.077252344068487, + "grad_norm": 0.18762147426605225, + "learning_rate": 2.4064920505503464e-05, + "loss": 3.9871, + "step": 89445 + }, + { + "epoch": 6.077592064139149, + "grad_norm": 0.20510806143283844, + "learning_rate": 2.4060674004620192e-05, + "loss": 3.7669, + "step": 89450 + }, + { + "epoch": 6.077931784209811, + "grad_norm": 0.19392411410808563, + "learning_rate": 2.4056427503736924e-05, + "loss": 3.8347, + "step": 89455 + }, + { + "epoch": 6.0782715042804725, + "grad_norm": 0.2223082333803177, + "learning_rate": 2.405218100285365e-05, + "loss": 3.9195, + "step": 89460 + }, + { + "epoch": 6.0786112243511345, + "grad_norm": 0.1988038867712021, + "learning_rate": 2.404793450197038e-05, + "loss": 3.6633, + "step": 89465 + }, + { + "epoch": 6.078950944421797, + "grad_norm": 0.21140992641448975, + "learning_rate": 2.4043688001087104e-05, + "loss": 4.149, + "step": 89470 + }, + { + "epoch": 6.079290664492458, + "grad_norm": 0.17588473856449127, + "learning_rate": 2.4039441500203832e-05, + "loss": 3.6985, + "step": 89475 + }, + { + "epoch": 6.07963038456312, + "grad_norm": 0.32829657196998596, + "learning_rate": 2.403519499932056e-05, + "loss": 3.9093, + "step": 89480 + }, + { + "epoch": 6.079970104633782, + "grad_norm": 0.17577292025089264, + "learning_rate": 2.403094849843729e-05, + "loss": 3.8925, + "step": 89485 + }, + { + "epoch": 6.080309824704443, + "grad_norm": 0.15224018692970276, + "learning_rate": 2.4026701997554017e-05, + "loss": 3.795, + "step": 89490 + }, + { + "epoch": 6.080649544775105, + "grad_norm": 0.18634025752544403, + "learning_rate": 2.4022455496670745e-05, + "loss": 3.7861, + "step": 89495 + }, + { + "epoch": 6.080989264845767, + "grad_norm": 0.6744875311851501, + "learning_rate": 2.4018208995787473e-05, + "loss": 3.7618, + "step": 89500 + }, + { + "epoch": 6.0813289849164285, + "grad_norm": 0.19116482138633728, + "learning_rate": 2.40139624949042e-05, + "loss": 4.0117, + "step": 89505 + }, + { + "epoch": 6.081668704987091, + "grad_norm": 0.3253881633281708, + "learning_rate": 2.400971599402093e-05, + "loss": 4.0475, + "step": 89510 + }, + { + "epoch": 6.082008425057753, + "grad_norm": 0.34601157903671265, + "learning_rate": 2.4005469493137657e-05, + "loss": 3.8469, + "step": 89515 + }, + { + "epoch": 6.082348145128414, + "grad_norm": 0.14116442203521729, + "learning_rate": 2.400122299225438e-05, + "loss": 3.8568, + "step": 89520 + }, + { + "epoch": 6.082687865199076, + "grad_norm": 0.17492856085300446, + "learning_rate": 2.3996976491371113e-05, + "loss": 3.8084, + "step": 89525 + }, + { + "epoch": 6.083027585269738, + "grad_norm": 0.2613787055015564, + "learning_rate": 2.3992729990487837e-05, + "loss": 3.8682, + "step": 89530 + }, + { + "epoch": 6.083367305340399, + "grad_norm": 0.19556622207164764, + "learning_rate": 2.3988483489604565e-05, + "loss": 3.9064, + "step": 89535 + }, + { + "epoch": 6.083707025411061, + "grad_norm": 0.17828193306922913, + "learning_rate": 2.3984236988721297e-05, + "loss": 3.6106, + "step": 89540 + }, + { + "epoch": 6.084046745481723, + "grad_norm": 0.8359688520431519, + "learning_rate": 2.397999048783802e-05, + "loss": 3.9277, + "step": 89545 + }, + { + "epoch": 6.0843864655523845, + "grad_norm": 0.20118239521980286, + "learning_rate": 2.3975743986954753e-05, + "loss": 3.7634, + "step": 89550 + }, + { + "epoch": 6.084726185623047, + "grad_norm": 0.30769917368888855, + "learning_rate": 2.3971497486071477e-05, + "loss": 3.6277, + "step": 89555 + }, + { + "epoch": 6.085065905693709, + "grad_norm": 0.24274997413158417, + "learning_rate": 2.3967250985188205e-05, + "loss": 3.7091, + "step": 89560 + }, + { + "epoch": 6.08540562576437, + "grad_norm": 0.22222843766212463, + "learning_rate": 2.3963004484304937e-05, + "loss": 3.6596, + "step": 89565 + }, + { + "epoch": 6.085745345835032, + "grad_norm": 0.16801835596561432, + "learning_rate": 2.395875798342166e-05, + "loss": 3.8846, + "step": 89570 + }, + { + "epoch": 6.086085065905694, + "grad_norm": 0.1743101328611374, + "learning_rate": 2.395451148253839e-05, + "loss": 3.7884, + "step": 89575 + }, + { + "epoch": 6.086424785976355, + "grad_norm": 0.1875547617673874, + "learning_rate": 2.3950264981655117e-05, + "loss": 3.821, + "step": 89580 + }, + { + "epoch": 6.086764506047017, + "grad_norm": 0.16493965685367584, + "learning_rate": 2.3946018480771845e-05, + "loss": 3.6439, + "step": 89585 + }, + { + "epoch": 6.087104226117679, + "grad_norm": 0.16902610659599304, + "learning_rate": 2.3941771979888573e-05, + "loss": 3.6632, + "step": 89590 + }, + { + "epoch": 6.0874439461883405, + "grad_norm": 0.6319149136543274, + "learning_rate": 2.39375254790053e-05, + "loss": 3.9739, + "step": 89595 + }, + { + "epoch": 6.087783666259003, + "grad_norm": 0.6970657110214233, + "learning_rate": 2.393327897812203e-05, + "loss": 3.9463, + "step": 89600 + }, + { + "epoch": 6.088123386329665, + "grad_norm": 0.18336404860019684, + "learning_rate": 2.3929032477238754e-05, + "loss": 3.8724, + "step": 89605 + }, + { + "epoch": 6.088463106400326, + "grad_norm": 0.21528595685958862, + "learning_rate": 2.3924785976355485e-05, + "loss": 3.9073, + "step": 89610 + }, + { + "epoch": 6.088802826470988, + "grad_norm": 0.20170007646083832, + "learning_rate": 2.3920539475472213e-05, + "loss": 3.9006, + "step": 89615 + }, + { + "epoch": 6.08914254654165, + "grad_norm": 0.16446396708488464, + "learning_rate": 2.3916292974588938e-05, + "loss": 4.0002, + "step": 89620 + }, + { + "epoch": 6.089482266612311, + "grad_norm": 0.24195784330368042, + "learning_rate": 2.391204647370567e-05, + "loss": 4.0379, + "step": 89625 + }, + { + "epoch": 6.089821986682973, + "grad_norm": 0.21902063488960266, + "learning_rate": 2.3907799972822394e-05, + "loss": 3.8843, + "step": 89630 + }, + { + "epoch": 6.090161706753635, + "grad_norm": 0.14118167757987976, + "learning_rate": 2.3903553471939125e-05, + "loss": 3.4538, + "step": 89635 + }, + { + "epoch": 6.0905014268242965, + "grad_norm": 0.14633503556251526, + "learning_rate": 2.389930697105585e-05, + "loss": 3.7068, + "step": 89640 + }, + { + "epoch": 6.090841146894959, + "grad_norm": 0.16056084632873535, + "learning_rate": 2.3895060470172578e-05, + "loss": 3.6283, + "step": 89645 + }, + { + "epoch": 6.091180866965621, + "grad_norm": 0.18068210780620575, + "learning_rate": 2.389081396928931e-05, + "loss": 3.6566, + "step": 89650 + }, + { + "epoch": 6.091520587036282, + "grad_norm": 0.160360649228096, + "learning_rate": 2.3886567468406034e-05, + "loss": 3.86, + "step": 89655 + }, + { + "epoch": 6.091860307106944, + "grad_norm": 0.28811976313591003, + "learning_rate": 2.3882320967522762e-05, + "loss": 3.9575, + "step": 89660 + }, + { + "epoch": 6.092200027177606, + "grad_norm": 0.1771298348903656, + "learning_rate": 2.387807446663949e-05, + "loss": 3.6585, + "step": 89665 + }, + { + "epoch": 6.092539747248267, + "grad_norm": 0.29003220796585083, + "learning_rate": 2.3873827965756218e-05, + "loss": 3.9071, + "step": 89670 + }, + { + "epoch": 6.092879467318929, + "grad_norm": 0.18299955129623413, + "learning_rate": 2.3869581464872946e-05, + "loss": 3.7475, + "step": 89675 + }, + { + "epoch": 6.093219187389591, + "grad_norm": 0.18496166169643402, + "learning_rate": 2.3865334963989674e-05, + "loss": 3.7966, + "step": 89680 + }, + { + "epoch": 6.0935589074602525, + "grad_norm": 0.18638254702091217, + "learning_rate": 2.3861088463106402e-05, + "loss": 3.8245, + "step": 89685 + }, + { + "epoch": 6.093898627530915, + "grad_norm": 0.13409848511219025, + "learning_rate": 2.3856841962223127e-05, + "loss": 3.7514, + "step": 89690 + }, + { + "epoch": 6.094238347601577, + "grad_norm": 0.19300125539302826, + "learning_rate": 2.3852595461339858e-05, + "loss": 3.9966, + "step": 89695 + }, + { + "epoch": 6.094578067672238, + "grad_norm": 0.16265136003494263, + "learning_rate": 2.3848348960456586e-05, + "loss": 3.7216, + "step": 89700 + }, + { + "epoch": 6.0949177877429, + "grad_norm": 0.20418918132781982, + "learning_rate": 2.384410245957331e-05, + "loss": 3.7832, + "step": 89705 + }, + { + "epoch": 6.095257507813562, + "grad_norm": 0.180721253156662, + "learning_rate": 2.3839855958690042e-05, + "loss": 3.8895, + "step": 89710 + }, + { + "epoch": 6.095597227884223, + "grad_norm": 0.15899233520030975, + "learning_rate": 2.3835609457806767e-05, + "loss": 3.8041, + "step": 89715 + }, + { + "epoch": 6.095936947954885, + "grad_norm": 0.1571827083826065, + "learning_rate": 2.3831362956923498e-05, + "loss": 4.032, + "step": 89720 + }, + { + "epoch": 6.096276668025547, + "grad_norm": 0.21257805824279785, + "learning_rate": 2.3827116456040223e-05, + "loss": 3.8911, + "step": 89725 + }, + { + "epoch": 6.0966163880962085, + "grad_norm": 0.1619468331336975, + "learning_rate": 2.382286995515695e-05, + "loss": 3.6226, + "step": 89730 + }, + { + "epoch": 6.096956108166871, + "grad_norm": 0.18019132316112518, + "learning_rate": 2.3818623454273682e-05, + "loss": 3.5253, + "step": 89735 + }, + { + "epoch": 6.097295828237533, + "grad_norm": 0.3262346386909485, + "learning_rate": 2.3814376953390407e-05, + "loss": 3.8523, + "step": 89740 + }, + { + "epoch": 6.097635548308194, + "grad_norm": 0.2001989632844925, + "learning_rate": 2.3810130452507135e-05, + "loss": 3.9087, + "step": 89745 + }, + { + "epoch": 6.097975268378856, + "grad_norm": 0.1707494854927063, + "learning_rate": 2.3805883951623863e-05, + "loss": 3.7096, + "step": 89750 + }, + { + "epoch": 6.098314988449518, + "grad_norm": 0.19867810606956482, + "learning_rate": 2.380163745074059e-05, + "loss": 3.7108, + "step": 89755 + }, + { + "epoch": 6.098654708520179, + "grad_norm": 0.17178043723106384, + "learning_rate": 2.379739094985732e-05, + "loss": 3.8189, + "step": 89760 + }, + { + "epoch": 6.098994428590841, + "grad_norm": 0.154733806848526, + "learning_rate": 2.3793144448974047e-05, + "loss": 4.0059, + "step": 89765 + }, + { + "epoch": 6.099334148661503, + "grad_norm": 0.189328134059906, + "learning_rate": 2.3788897948090775e-05, + "loss": 3.8377, + "step": 89770 + }, + { + "epoch": 6.0996738687321646, + "grad_norm": 0.19084738194942474, + "learning_rate": 2.37846514472075e-05, + "loss": 3.9829, + "step": 89775 + }, + { + "epoch": 6.100013588802827, + "grad_norm": 0.1815895289182663, + "learning_rate": 2.378040494632423e-05, + "loss": 3.9834, + "step": 89780 + }, + { + "epoch": 6.100353308873488, + "grad_norm": 0.2437085658311844, + "learning_rate": 2.377615844544096e-05, + "loss": 3.8256, + "step": 89785 + }, + { + "epoch": 6.10069302894415, + "grad_norm": 0.16918303072452545, + "learning_rate": 2.3771911944557683e-05, + "loss": 3.8385, + "step": 89790 + }, + { + "epoch": 6.101032749014812, + "grad_norm": 0.17072144150733948, + "learning_rate": 2.3767665443674415e-05, + "loss": 3.8234, + "step": 89795 + }, + { + "epoch": 6.101372469085473, + "grad_norm": 0.20295260846614838, + "learning_rate": 2.376341894279114e-05, + "loss": 3.8465, + "step": 89800 + }, + { + "epoch": 6.101712189156135, + "grad_norm": 0.1809159219264984, + "learning_rate": 2.375917244190787e-05, + "loss": 3.9148, + "step": 89805 + }, + { + "epoch": 6.102051909226797, + "grad_norm": 0.1937183141708374, + "learning_rate": 2.37549259410246e-05, + "loss": 3.6349, + "step": 89810 + }, + { + "epoch": 6.1023916292974585, + "grad_norm": 1.8313419818878174, + "learning_rate": 2.3750679440141323e-05, + "loss": 3.7035, + "step": 89815 + }, + { + "epoch": 6.102731349368121, + "grad_norm": 0.3563876748085022, + "learning_rate": 2.3746432939258055e-05, + "loss": 3.9355, + "step": 89820 + }, + { + "epoch": 6.103071069438783, + "grad_norm": 0.24259351193904877, + "learning_rate": 2.374218643837478e-05, + "loss": 4.0953, + "step": 89825 + }, + { + "epoch": 6.103410789509444, + "grad_norm": 0.19502823054790497, + "learning_rate": 2.3737939937491507e-05, + "loss": 3.7338, + "step": 89830 + }, + { + "epoch": 6.103750509580106, + "grad_norm": 0.16094641387462616, + "learning_rate": 2.3733693436608236e-05, + "loss": 3.8698, + "step": 89835 + }, + { + "epoch": 6.104090229650768, + "grad_norm": 0.15558458864688873, + "learning_rate": 2.3729446935724964e-05, + "loss": 3.6889, + "step": 89840 + }, + { + "epoch": 6.104429949721429, + "grad_norm": 0.1782703548669815, + "learning_rate": 2.372520043484169e-05, + "loss": 3.9714, + "step": 89845 + }, + { + "epoch": 6.104769669792091, + "grad_norm": 0.1700865477323532, + "learning_rate": 2.372095393395842e-05, + "loss": 3.7529, + "step": 89850 + }, + { + "epoch": 6.105109389862753, + "grad_norm": 0.23705576360225677, + "learning_rate": 2.3716707433075148e-05, + "loss": 3.8607, + "step": 89855 + }, + { + "epoch": 6.1054491099334145, + "grad_norm": 0.19059449434280396, + "learning_rate": 2.3712460932191876e-05, + "loss": 3.8593, + "step": 89860 + }, + { + "epoch": 6.105788830004077, + "grad_norm": 0.2424628585577011, + "learning_rate": 2.3708214431308604e-05, + "loss": 3.841, + "step": 89865 + }, + { + "epoch": 6.106128550074739, + "grad_norm": 0.2781767249107361, + "learning_rate": 2.370396793042533e-05, + "loss": 3.954, + "step": 89870 + }, + { + "epoch": 6.1064682701454, + "grad_norm": 0.15095040202140808, + "learning_rate": 2.3699721429542056e-05, + "loss": 3.9592, + "step": 89875 + }, + { + "epoch": 6.106807990216062, + "grad_norm": 0.21780112385749817, + "learning_rate": 2.3695474928658788e-05, + "loss": 3.7963, + "step": 89880 + }, + { + "epoch": 6.107147710286724, + "grad_norm": 0.17924462258815765, + "learning_rate": 2.3691228427775512e-05, + "loss": 3.9943, + "step": 89885 + }, + { + "epoch": 6.107487430357385, + "grad_norm": 0.15859660506248474, + "learning_rate": 2.3686981926892244e-05, + "loss": 3.9178, + "step": 89890 + }, + { + "epoch": 6.107827150428047, + "grad_norm": 0.16286471486091614, + "learning_rate": 2.368273542600897e-05, + "loss": 3.7901, + "step": 89895 + }, + { + "epoch": 6.108166870498709, + "grad_norm": 0.4396839737892151, + "learning_rate": 2.3678488925125696e-05, + "loss": 3.948, + "step": 89900 + }, + { + "epoch": 6.1085065905693705, + "grad_norm": 0.19544890522956848, + "learning_rate": 2.3674242424242428e-05, + "loss": 3.907, + "step": 89905 + }, + { + "epoch": 6.108846310640033, + "grad_norm": 0.15642529726028442, + "learning_rate": 2.3669995923359152e-05, + "loss": 3.8102, + "step": 89910 + }, + { + "epoch": 6.109186030710695, + "grad_norm": 0.24804821610450745, + "learning_rate": 2.366574942247588e-05, + "loss": 3.8087, + "step": 89915 + }, + { + "epoch": 6.109525750781356, + "grad_norm": 0.20439894497394562, + "learning_rate": 2.3661502921592608e-05, + "loss": 3.8525, + "step": 89920 + }, + { + "epoch": 6.109865470852018, + "grad_norm": 0.2493523210287094, + "learning_rate": 2.3657256420709336e-05, + "loss": 3.7159, + "step": 89925 + }, + { + "epoch": 6.11020519092268, + "grad_norm": 0.14850068092346191, + "learning_rate": 2.3653009919826064e-05, + "loss": 3.741, + "step": 89930 + }, + { + "epoch": 6.110544910993341, + "grad_norm": 0.13657855987548828, + "learning_rate": 2.3648763418942792e-05, + "loss": 3.626, + "step": 89935 + }, + { + "epoch": 6.110884631064003, + "grad_norm": 0.1701803207397461, + "learning_rate": 2.364451691805952e-05, + "loss": 3.9681, + "step": 89940 + }, + { + "epoch": 6.111224351134665, + "grad_norm": 2.337164878845215, + "learning_rate": 2.3640270417176248e-05, + "loss": 3.8427, + "step": 89945 + }, + { + "epoch": 6.1115640712053265, + "grad_norm": 0.17057490348815918, + "learning_rate": 2.3636023916292976e-05, + "loss": 3.8801, + "step": 89950 + }, + { + "epoch": 6.111903791275989, + "grad_norm": 0.19834552705287933, + "learning_rate": 2.3631777415409704e-05, + "loss": 3.7361, + "step": 89955 + }, + { + "epoch": 6.112243511346651, + "grad_norm": 0.2800852358341217, + "learning_rate": 2.362753091452643e-05, + "loss": 3.7852, + "step": 89960 + }, + { + "epoch": 6.112583231417312, + "grad_norm": 0.1717090606689453, + "learning_rate": 2.362328441364316e-05, + "loss": 3.7956, + "step": 89965 + }, + { + "epoch": 6.112922951487974, + "grad_norm": 0.14634396135807037, + "learning_rate": 2.3619037912759885e-05, + "loss": 4.0892, + "step": 89970 + }, + { + "epoch": 6.113262671558636, + "grad_norm": 0.16140586137771606, + "learning_rate": 2.3614791411876616e-05, + "loss": 3.9696, + "step": 89975 + }, + { + "epoch": 6.113602391629297, + "grad_norm": 0.1490987241268158, + "learning_rate": 2.3610544910993344e-05, + "loss": 3.8354, + "step": 89980 + }, + { + "epoch": 6.113942111699959, + "grad_norm": 2.1104490756988525, + "learning_rate": 2.360629841011007e-05, + "loss": 3.8589, + "step": 89985 + }, + { + "epoch": 6.114281831770621, + "grad_norm": 0.1783444583415985, + "learning_rate": 2.36020519092268e-05, + "loss": 3.7547, + "step": 89990 + }, + { + "epoch": 6.1146215518412825, + "grad_norm": 0.8564414978027344, + "learning_rate": 2.3597805408343525e-05, + "loss": 3.9457, + "step": 89995 + }, + { + "epoch": 6.114961271911945, + "grad_norm": 0.18040026724338531, + "learning_rate": 2.3593558907460253e-05, + "loss": 3.9367, + "step": 90000 + }, + { + "epoch": 6.115300991982607, + "grad_norm": 0.14354996383190155, + "learning_rate": 2.3589312406576984e-05, + "loss": 3.9871, + "step": 90005 + }, + { + "epoch": 6.115640712053268, + "grad_norm": 0.17981179058551788, + "learning_rate": 2.358506590569371e-05, + "loss": 3.657, + "step": 90010 + }, + { + "epoch": 6.11598043212393, + "grad_norm": 0.14466294646263123, + "learning_rate": 2.3580819404810437e-05, + "loss": 4.0343, + "step": 90015 + }, + { + "epoch": 6.116320152194592, + "grad_norm": 1.167648196220398, + "learning_rate": 2.3576572903927165e-05, + "loss": 3.74, + "step": 90020 + }, + { + "epoch": 6.116659872265253, + "grad_norm": 0.12277612090110779, + "learning_rate": 2.3572326403043893e-05, + "loss": 3.996, + "step": 90025 + }, + { + "epoch": 6.116999592335915, + "grad_norm": 0.15933609008789062, + "learning_rate": 2.356807990216062e-05, + "loss": 3.5527, + "step": 90030 + }, + { + "epoch": 6.117339312406577, + "grad_norm": 0.19039779901504517, + "learning_rate": 2.356383340127735e-05, + "loss": 3.9289, + "step": 90035 + }, + { + "epoch": 6.1176790324772385, + "grad_norm": 0.40890204906463623, + "learning_rate": 2.3559586900394077e-05, + "loss": 3.8801, + "step": 90040 + }, + { + "epoch": 6.118018752547901, + "grad_norm": 0.17160969972610474, + "learning_rate": 2.35553403995108e-05, + "loss": 4.0237, + "step": 90045 + }, + { + "epoch": 6.118358472618563, + "grad_norm": 0.21254509687423706, + "learning_rate": 2.3551093898627533e-05, + "loss": 4.1749, + "step": 90050 + }, + { + "epoch": 6.118698192689224, + "grad_norm": 0.15204432606697083, + "learning_rate": 2.354684739774426e-05, + "loss": 3.9739, + "step": 90055 + }, + { + "epoch": 6.119037912759886, + "grad_norm": 0.4585149586200714, + "learning_rate": 2.354260089686099e-05, + "loss": 3.8871, + "step": 90060 + }, + { + "epoch": 6.119377632830548, + "grad_norm": 0.15308047831058502, + "learning_rate": 2.3538354395977717e-05, + "loss": 3.8133, + "step": 90065 + }, + { + "epoch": 6.119717352901209, + "grad_norm": 0.17166469991207123, + "learning_rate": 2.3534107895094442e-05, + "loss": 3.7525, + "step": 90070 + }, + { + "epoch": 6.120057072971871, + "grad_norm": 0.16457153856754303, + "learning_rate": 2.3529861394211173e-05, + "loss": 3.9182, + "step": 90075 + }, + { + "epoch": 6.120396793042533, + "grad_norm": 0.7849352359771729, + "learning_rate": 2.3525614893327898e-05, + "loss": 3.9123, + "step": 90080 + }, + { + "epoch": 6.1207365131131946, + "grad_norm": 0.24403327703475952, + "learning_rate": 2.3521368392444626e-05, + "loss": 3.7046, + "step": 90085 + }, + { + "epoch": 6.121076233183857, + "grad_norm": 0.1389245241880417, + "learning_rate": 2.3517121891561357e-05, + "loss": 3.6911, + "step": 90090 + }, + { + "epoch": 6.121415953254519, + "grad_norm": 0.1521902084350586, + "learning_rate": 2.3512875390678082e-05, + "loss": 3.9721, + "step": 90095 + }, + { + "epoch": 6.12175567332518, + "grad_norm": 0.18143728375434875, + "learning_rate": 2.350862888979481e-05, + "loss": 3.7886, + "step": 90100 + }, + { + "epoch": 6.122095393395842, + "grad_norm": 0.4437861442565918, + "learning_rate": 2.3504382388911538e-05, + "loss": 3.8038, + "step": 90105 + }, + { + "epoch": 6.122435113466503, + "grad_norm": 0.1698966771364212, + "learning_rate": 2.3500135888028266e-05, + "loss": 3.598, + "step": 90110 + }, + { + "epoch": 6.122774833537165, + "grad_norm": 0.1661263257265091, + "learning_rate": 2.3495889387144994e-05, + "loss": 3.8531, + "step": 90115 + }, + { + "epoch": 6.123114553607827, + "grad_norm": 0.2171361893415451, + "learning_rate": 2.3491642886261722e-05, + "loss": 3.8274, + "step": 90120 + }, + { + "epoch": 6.1234542736784885, + "grad_norm": 0.1610659658908844, + "learning_rate": 2.348739638537845e-05, + "loss": 3.6545, + "step": 90125 + }, + { + "epoch": 6.123793993749151, + "grad_norm": 0.1837674379348755, + "learning_rate": 2.3483149884495174e-05, + "loss": 3.8383, + "step": 90130 + }, + { + "epoch": 6.124133713819813, + "grad_norm": 0.20044703781604767, + "learning_rate": 2.3478903383611906e-05, + "loss": 3.7063, + "step": 90135 + }, + { + "epoch": 6.124473433890474, + "grad_norm": 0.8011187314987183, + "learning_rate": 2.3474656882728634e-05, + "loss": 3.7996, + "step": 90140 + }, + { + "epoch": 6.124813153961136, + "grad_norm": 0.1572798192501068, + "learning_rate": 2.3470410381845362e-05, + "loss": 3.7905, + "step": 90145 + }, + { + "epoch": 6.125152874031798, + "grad_norm": 0.20399987697601318, + "learning_rate": 2.346616388096209e-05, + "loss": 3.916, + "step": 90150 + }, + { + "epoch": 6.125492594102459, + "grad_norm": 0.21938298642635345, + "learning_rate": 2.3461917380078814e-05, + "loss": 3.955, + "step": 90155 + }, + { + "epoch": 6.125832314173121, + "grad_norm": 0.15998469293117523, + "learning_rate": 2.3457670879195546e-05, + "loss": 3.7998, + "step": 90160 + }, + { + "epoch": 6.126172034243783, + "grad_norm": 0.1633158028125763, + "learning_rate": 2.345342437831227e-05, + "loss": 4.0347, + "step": 90165 + }, + { + "epoch": 6.1265117543144445, + "grad_norm": 0.20470020174980164, + "learning_rate": 2.3449177877429e-05, + "loss": 3.8834, + "step": 90170 + }, + { + "epoch": 6.126851474385107, + "grad_norm": 1.403978705406189, + "learning_rate": 2.344493137654573e-05, + "loss": 3.8144, + "step": 90175 + }, + { + "epoch": 6.127191194455769, + "grad_norm": 0.17324481904506683, + "learning_rate": 2.3440684875662454e-05, + "loss": 3.6942, + "step": 90180 + }, + { + "epoch": 6.12753091452643, + "grad_norm": 0.194215327501297, + "learning_rate": 2.3436438374779182e-05, + "loss": 3.8925, + "step": 90185 + }, + { + "epoch": 6.127870634597092, + "grad_norm": 0.1652088463306427, + "learning_rate": 2.343219187389591e-05, + "loss": 3.9729, + "step": 90190 + }, + { + "epoch": 6.128210354667754, + "grad_norm": 0.2041509449481964, + "learning_rate": 2.342794537301264e-05, + "loss": 4.0546, + "step": 90195 + }, + { + "epoch": 6.128550074738415, + "grad_norm": 0.15137319266796112, + "learning_rate": 2.3423698872129367e-05, + "loss": 4.0894, + "step": 90200 + }, + { + "epoch": 6.128889794809077, + "grad_norm": 0.33307766914367676, + "learning_rate": 2.3419452371246095e-05, + "loss": 4.0174, + "step": 90205 + }, + { + "epoch": 6.129229514879739, + "grad_norm": 0.9525396227836609, + "learning_rate": 2.3415205870362823e-05, + "loss": 3.889, + "step": 90210 + }, + { + "epoch": 6.1295692349504005, + "grad_norm": 0.16405674815177917, + "learning_rate": 2.3410959369479547e-05, + "loss": 3.7606, + "step": 90215 + }, + { + "epoch": 6.129908955021063, + "grad_norm": 0.23416176438331604, + "learning_rate": 2.340671286859628e-05, + "loss": 3.7004, + "step": 90220 + }, + { + "epoch": 6.130248675091725, + "grad_norm": 0.13902559876441956, + "learning_rate": 2.3402466367713007e-05, + "loss": 3.7214, + "step": 90225 + }, + { + "epoch": 6.130588395162386, + "grad_norm": 0.21475577354431152, + "learning_rate": 2.3398219866829735e-05, + "loss": 3.7452, + "step": 90230 + }, + { + "epoch": 6.130928115233048, + "grad_norm": 0.19747984409332275, + "learning_rate": 2.3393973365946463e-05, + "loss": 3.7098, + "step": 90235 + }, + { + "epoch": 6.13126783530371, + "grad_norm": 2.3155529499053955, + "learning_rate": 2.3389726865063187e-05, + "loss": 3.9526, + "step": 90240 + }, + { + "epoch": 6.131607555374371, + "grad_norm": 0.1552920639514923, + "learning_rate": 2.338548036417992e-05, + "loss": 3.8032, + "step": 90245 + }, + { + "epoch": 6.131947275445033, + "grad_norm": 0.20210032165050507, + "learning_rate": 2.3381233863296643e-05, + "loss": 3.6976, + "step": 90250 + }, + { + "epoch": 6.132286995515695, + "grad_norm": 0.4865095615386963, + "learning_rate": 2.337698736241337e-05, + "loss": 3.7908, + "step": 90255 + }, + { + "epoch": 6.1326267155863565, + "grad_norm": 0.16912436485290527, + "learning_rate": 2.3372740861530103e-05, + "loss": 3.9714, + "step": 90260 + }, + { + "epoch": 6.132966435657019, + "grad_norm": 0.20097488164901733, + "learning_rate": 2.3368494360646827e-05, + "loss": 3.8073, + "step": 90265 + }, + { + "epoch": 6.133306155727681, + "grad_norm": 0.16574783623218536, + "learning_rate": 2.3364247859763555e-05, + "loss": 3.7349, + "step": 90270 + }, + { + "epoch": 6.133645875798342, + "grad_norm": 0.26805973052978516, + "learning_rate": 2.3360001358880283e-05, + "loss": 3.9437, + "step": 90275 + }, + { + "epoch": 6.133985595869004, + "grad_norm": 0.15236057341098785, + "learning_rate": 2.335575485799701e-05, + "loss": 3.9698, + "step": 90280 + }, + { + "epoch": 6.134325315939666, + "grad_norm": 0.23317939043045044, + "learning_rate": 2.335150835711374e-05, + "loss": 4.0321, + "step": 90285 + }, + { + "epoch": 6.134665036010327, + "grad_norm": 0.19008201360702515, + "learning_rate": 2.3347261856230467e-05, + "loss": 3.838, + "step": 90290 + }, + { + "epoch": 6.135004756080989, + "grad_norm": 0.1755993664264679, + "learning_rate": 2.3343015355347195e-05, + "loss": 3.9123, + "step": 90295 + }, + { + "epoch": 6.135344476151651, + "grad_norm": 0.15975339710712433, + "learning_rate": 2.333876885446392e-05, + "loss": 3.7402, + "step": 90300 + }, + { + "epoch": 6.1356841962223125, + "grad_norm": 0.14227360486984253, + "learning_rate": 2.333452235358065e-05, + "loss": 3.8992, + "step": 90305 + }, + { + "epoch": 6.136023916292975, + "grad_norm": 0.22948119044303894, + "learning_rate": 2.333027585269738e-05, + "loss": 3.8668, + "step": 90310 + }, + { + "epoch": 6.136363636363637, + "grad_norm": 0.17830179631710052, + "learning_rate": 2.3326029351814107e-05, + "loss": 3.8122, + "step": 90315 + }, + { + "epoch": 6.136703356434298, + "grad_norm": 0.14543390274047852, + "learning_rate": 2.3321782850930835e-05, + "loss": 3.6493, + "step": 90320 + }, + { + "epoch": 6.13704307650496, + "grad_norm": 0.17100423574447632, + "learning_rate": 2.331753635004756e-05, + "loss": 3.6854, + "step": 90325 + }, + { + "epoch": 6.137382796575622, + "grad_norm": 0.2042299509048462, + "learning_rate": 2.331328984916429e-05, + "loss": 3.8896, + "step": 90330 + }, + { + "epoch": 6.137722516646283, + "grad_norm": 0.1812141090631485, + "learning_rate": 2.330904334828102e-05, + "loss": 3.9657, + "step": 90335 + }, + { + "epoch": 6.138062236716945, + "grad_norm": 0.2732195258140564, + "learning_rate": 2.3304796847397744e-05, + "loss": 3.7924, + "step": 90340 + }, + { + "epoch": 6.138401956787607, + "grad_norm": 0.2034429907798767, + "learning_rate": 2.3300550346514475e-05, + "loss": 4.0545, + "step": 90345 + }, + { + "epoch": 6.1387416768582685, + "grad_norm": 0.1696600764989853, + "learning_rate": 2.32963038456312e-05, + "loss": 4.0188, + "step": 90350 + }, + { + "epoch": 6.139081396928931, + "grad_norm": 0.1898268312215805, + "learning_rate": 2.3292057344747928e-05, + "loss": 3.9312, + "step": 90355 + }, + { + "epoch": 6.139421116999593, + "grad_norm": 0.1887490451335907, + "learning_rate": 2.3287810843864656e-05, + "loss": 3.9577, + "step": 90360 + }, + { + "epoch": 6.139760837070254, + "grad_norm": 0.21092501282691956, + "learning_rate": 2.3283564342981384e-05, + "loss": 3.7916, + "step": 90365 + }, + { + "epoch": 6.140100557140916, + "grad_norm": 0.15258954465389252, + "learning_rate": 2.3279317842098112e-05, + "loss": 3.7364, + "step": 90370 + }, + { + "epoch": 6.140440277211578, + "grad_norm": 0.21406444907188416, + "learning_rate": 2.327507134121484e-05, + "loss": 3.7348, + "step": 90375 + }, + { + "epoch": 6.140779997282239, + "grad_norm": 0.2040134221315384, + "learning_rate": 2.3270824840331568e-05, + "loss": 4.1553, + "step": 90380 + }, + { + "epoch": 6.141119717352901, + "grad_norm": 0.20070333778858185, + "learning_rate": 2.3266578339448296e-05, + "loss": 4.0618, + "step": 90385 + }, + { + "epoch": 6.141459437423563, + "grad_norm": 0.2173217087984085, + "learning_rate": 2.3262331838565024e-05, + "loss": 3.775, + "step": 90390 + }, + { + "epoch": 6.141799157494225, + "grad_norm": 0.28653740882873535, + "learning_rate": 2.3258085337681752e-05, + "loss": 3.7431, + "step": 90395 + }, + { + "epoch": 6.142138877564887, + "grad_norm": 0.2121233344078064, + "learning_rate": 2.325383883679848e-05, + "loss": 3.9652, + "step": 90400 + }, + { + "epoch": 6.142478597635549, + "grad_norm": 0.13138656318187714, + "learning_rate": 2.3249592335915208e-05, + "loss": 3.9745, + "step": 90405 + }, + { + "epoch": 6.14281831770621, + "grad_norm": 0.17053891718387604, + "learning_rate": 2.3245345835031933e-05, + "loss": 3.8846, + "step": 90410 + }, + { + "epoch": 6.143158037776872, + "grad_norm": 1.7140570878982544, + "learning_rate": 2.3241099334148664e-05, + "loss": 3.9802, + "step": 90415 + }, + { + "epoch": 6.143497757847534, + "grad_norm": 0.1509867012500763, + "learning_rate": 2.3236852833265392e-05, + "loss": 3.903, + "step": 90420 + }, + { + "epoch": 6.143837477918195, + "grad_norm": 0.1850818395614624, + "learning_rate": 2.3232606332382117e-05, + "loss": 4.0368, + "step": 90425 + }, + { + "epoch": 6.144177197988857, + "grad_norm": 0.16172578930854797, + "learning_rate": 2.3228359831498848e-05, + "loss": 3.7295, + "step": 90430 + }, + { + "epoch": 6.144516918059519, + "grad_norm": 0.14592576026916504, + "learning_rate": 2.3224113330615573e-05, + "loss": 3.8802, + "step": 90435 + }, + { + "epoch": 6.144856638130181, + "grad_norm": 0.1790267527103424, + "learning_rate": 2.32198668297323e-05, + "loss": 4.1293, + "step": 90440 + }, + { + "epoch": 6.145196358200843, + "grad_norm": 0.15481576323509216, + "learning_rate": 2.321562032884903e-05, + "loss": 3.8054, + "step": 90445 + }, + { + "epoch": 6.145536078271505, + "grad_norm": 0.18155477941036224, + "learning_rate": 2.3211373827965757e-05, + "loss": 3.7902, + "step": 90450 + }, + { + "epoch": 6.145875798342166, + "grad_norm": 0.15967020392417908, + "learning_rate": 2.3207127327082485e-05, + "loss": 4.0405, + "step": 90455 + }, + { + "epoch": 6.146215518412828, + "grad_norm": 0.19536645710468292, + "learning_rate": 2.3202880826199213e-05, + "loss": 3.9059, + "step": 90460 + }, + { + "epoch": 6.14655523848349, + "grad_norm": 0.12561596930027008, + "learning_rate": 2.319863432531594e-05, + "loss": 4.0073, + "step": 90465 + }, + { + "epoch": 6.146894958554151, + "grad_norm": 0.22172412276268005, + "learning_rate": 2.319438782443267e-05, + "loss": 3.7903, + "step": 90470 + }, + { + "epoch": 6.147234678624813, + "grad_norm": 0.22562259435653687, + "learning_rate": 2.3190141323549397e-05, + "loss": 3.7606, + "step": 90475 + }, + { + "epoch": 6.1475743986954745, + "grad_norm": 0.4704419672489166, + "learning_rate": 2.3185894822666125e-05, + "loss": 3.8523, + "step": 90480 + }, + { + "epoch": 6.147914118766137, + "grad_norm": 0.19719858467578888, + "learning_rate": 2.3181648321782853e-05, + "loss": 3.958, + "step": 90485 + }, + { + "epoch": 6.148253838836799, + "grad_norm": 0.2050846517086029, + "learning_rate": 2.317740182089958e-05, + "loss": 3.7408, + "step": 90490 + }, + { + "epoch": 6.14859355890746, + "grad_norm": 0.15316925942897797, + "learning_rate": 2.3173155320016305e-05, + "loss": 3.8112, + "step": 90495 + }, + { + "epoch": 6.148933278978122, + "grad_norm": 0.17997966706752777, + "learning_rate": 2.3168908819133037e-05, + "loss": 3.6778, + "step": 90500 + }, + { + "epoch": 6.149272999048784, + "grad_norm": 0.18824680149555206, + "learning_rate": 2.3164662318249765e-05, + "loss": 3.9427, + "step": 90505 + }, + { + "epoch": 6.149612719119445, + "grad_norm": 0.15166133642196655, + "learning_rate": 2.316041581736649e-05, + "loss": 3.6375, + "step": 90510 + }, + { + "epoch": 6.149952439190107, + "grad_norm": 0.24639767408370972, + "learning_rate": 2.315616931648322e-05, + "loss": 3.9707, + "step": 90515 + }, + { + "epoch": 6.150292159260769, + "grad_norm": 0.18164177238941193, + "learning_rate": 2.3151922815599945e-05, + "loss": 3.7986, + "step": 90520 + }, + { + "epoch": 6.1506318793314305, + "grad_norm": 0.17106015980243683, + "learning_rate": 2.3147676314716673e-05, + "loss": 3.9537, + "step": 90525 + }, + { + "epoch": 6.150971599402093, + "grad_norm": 0.30852335691452026, + "learning_rate": 2.3143429813833405e-05, + "loss": 4.0044, + "step": 90530 + }, + { + "epoch": 6.151311319472755, + "grad_norm": 0.27992749214172363, + "learning_rate": 2.313918331295013e-05, + "loss": 3.9487, + "step": 90535 + }, + { + "epoch": 6.151651039543416, + "grad_norm": 0.21037891507148743, + "learning_rate": 2.3134936812066857e-05, + "loss": 3.9888, + "step": 90540 + }, + { + "epoch": 6.151990759614078, + "grad_norm": 0.1778789758682251, + "learning_rate": 2.3130690311183585e-05, + "loss": 3.8656, + "step": 90545 + }, + { + "epoch": 6.15233047968474, + "grad_norm": 0.20948058366775513, + "learning_rate": 2.3126443810300314e-05, + "loss": 3.8253, + "step": 90550 + }, + { + "epoch": 6.152670199755401, + "grad_norm": 0.2057427018880844, + "learning_rate": 2.312219730941704e-05, + "loss": 3.9688, + "step": 90555 + }, + { + "epoch": 6.153009919826063, + "grad_norm": 0.15829239785671234, + "learning_rate": 2.311795080853377e-05, + "loss": 3.8491, + "step": 90560 + }, + { + "epoch": 6.153349639896725, + "grad_norm": 0.15682564675807953, + "learning_rate": 2.3113704307650498e-05, + "loss": 3.8348, + "step": 90565 + }, + { + "epoch": 6.1536893599673865, + "grad_norm": 0.25296837091445923, + "learning_rate": 2.3109457806767226e-05, + "loss": 3.9558, + "step": 90570 + }, + { + "epoch": 6.154029080038049, + "grad_norm": 0.20016974210739136, + "learning_rate": 2.3105211305883954e-05, + "loss": 3.804, + "step": 90575 + }, + { + "epoch": 6.154368800108711, + "grad_norm": 0.28638502955436707, + "learning_rate": 2.310096480500068e-05, + "loss": 3.8382, + "step": 90580 + }, + { + "epoch": 6.154708520179372, + "grad_norm": 0.180460587143898, + "learning_rate": 2.309671830411741e-05, + "loss": 3.5077, + "step": 90585 + }, + { + "epoch": 6.155048240250034, + "grad_norm": 0.4887554347515106, + "learning_rate": 2.3092471803234138e-05, + "loss": 3.7998, + "step": 90590 + }, + { + "epoch": 6.155387960320696, + "grad_norm": 0.3852337598800659, + "learning_rate": 2.3088225302350862e-05, + "loss": 3.6003, + "step": 90595 + }, + { + "epoch": 6.155727680391357, + "grad_norm": 0.214670330286026, + "learning_rate": 2.3083978801467594e-05, + "loss": 3.9869, + "step": 90600 + }, + { + "epoch": 6.156067400462019, + "grad_norm": 0.17704257369041443, + "learning_rate": 2.3079732300584318e-05, + "loss": 4.0317, + "step": 90605 + }, + { + "epoch": 6.156407120532681, + "grad_norm": 0.1941511183977127, + "learning_rate": 2.3075485799701046e-05, + "loss": 3.9725, + "step": 90610 + }, + { + "epoch": 6.1567468406033425, + "grad_norm": 0.15076762437820435, + "learning_rate": 2.3071239298817778e-05, + "loss": 3.811, + "step": 90615 + }, + { + "epoch": 6.157086560674005, + "grad_norm": 0.17977851629257202, + "learning_rate": 2.3066992797934502e-05, + "loss": 4.0339, + "step": 90620 + }, + { + "epoch": 6.157426280744667, + "grad_norm": 0.26872727274894714, + "learning_rate": 2.306274629705123e-05, + "loss": 3.8974, + "step": 90625 + }, + { + "epoch": 6.157766000815328, + "grad_norm": 0.17853465676307678, + "learning_rate": 2.3058499796167958e-05, + "loss": 3.7793, + "step": 90630 + }, + { + "epoch": 6.15810572088599, + "grad_norm": 0.24657569825649261, + "learning_rate": 2.3054253295284686e-05, + "loss": 3.8605, + "step": 90635 + }, + { + "epoch": 6.158445440956652, + "grad_norm": 0.31609585881233215, + "learning_rate": 2.3050006794401414e-05, + "loss": 3.8208, + "step": 90640 + }, + { + "epoch": 6.158785161027313, + "grad_norm": 0.1715116798877716, + "learning_rate": 2.3045760293518142e-05, + "loss": 3.7792, + "step": 90645 + }, + { + "epoch": 6.159124881097975, + "grad_norm": 0.1433178037405014, + "learning_rate": 2.304151379263487e-05, + "loss": 3.8897, + "step": 90650 + }, + { + "epoch": 6.159464601168637, + "grad_norm": 0.13397347927093506, + "learning_rate": 2.3037267291751598e-05, + "loss": 3.8551, + "step": 90655 + }, + { + "epoch": 6.1598043212392986, + "grad_norm": 0.14669784903526306, + "learning_rate": 2.3033020790868326e-05, + "loss": 3.979, + "step": 90660 + }, + { + "epoch": 6.160144041309961, + "grad_norm": 0.2055118829011917, + "learning_rate": 2.3028774289985054e-05, + "loss": 3.5984, + "step": 90665 + }, + { + "epoch": 6.160483761380623, + "grad_norm": 0.17650596797466278, + "learning_rate": 2.3024527789101782e-05, + "loss": 3.96, + "step": 90670 + }, + { + "epoch": 6.160823481451284, + "grad_norm": 0.1635303497314453, + "learning_rate": 2.302028128821851e-05, + "loss": 3.7604, + "step": 90675 + }, + { + "epoch": 6.161163201521946, + "grad_norm": 0.17079037427902222, + "learning_rate": 2.3016034787335235e-05, + "loss": 4.0906, + "step": 90680 + }, + { + "epoch": 6.161502921592608, + "grad_norm": 0.2289309799671173, + "learning_rate": 2.3011788286451966e-05, + "loss": 3.8946, + "step": 90685 + }, + { + "epoch": 6.161842641663269, + "grad_norm": 0.19628922641277313, + "learning_rate": 2.300754178556869e-05, + "loss": 3.7937, + "step": 90690 + }, + { + "epoch": 6.162182361733931, + "grad_norm": 0.17865335941314697, + "learning_rate": 2.300329528468542e-05, + "loss": 3.9432, + "step": 90695 + }, + { + "epoch": 6.162522081804593, + "grad_norm": 0.2447793185710907, + "learning_rate": 2.299904878380215e-05, + "loss": 3.8055, + "step": 90700 + }, + { + "epoch": 6.162861801875255, + "grad_norm": 0.15187962353229523, + "learning_rate": 2.2994802282918875e-05, + "loss": 4.0086, + "step": 90705 + }, + { + "epoch": 6.163201521945917, + "grad_norm": 0.2069731056690216, + "learning_rate": 2.2990555782035603e-05, + "loss": 3.8192, + "step": 90710 + }, + { + "epoch": 6.163541242016579, + "grad_norm": 0.16453705728054047, + "learning_rate": 2.298630928115233e-05, + "loss": 3.9031, + "step": 90715 + }, + { + "epoch": 6.16388096208724, + "grad_norm": 0.1795751452445984, + "learning_rate": 2.298206278026906e-05, + "loss": 3.7074, + "step": 90720 + }, + { + "epoch": 6.164220682157902, + "grad_norm": 0.26364609599113464, + "learning_rate": 2.2977816279385787e-05, + "loss": 3.9859, + "step": 90725 + }, + { + "epoch": 6.164560402228564, + "grad_norm": 0.24534925818443298, + "learning_rate": 2.2973569778502515e-05, + "loss": 3.8623, + "step": 90730 + }, + { + "epoch": 6.164900122299225, + "grad_norm": 0.14594632387161255, + "learning_rate": 2.2969323277619243e-05, + "loss": 3.9468, + "step": 90735 + }, + { + "epoch": 6.165239842369887, + "grad_norm": 0.1992996782064438, + "learning_rate": 2.296507677673597e-05, + "loss": 3.845, + "step": 90740 + }, + { + "epoch": 6.165579562440549, + "grad_norm": 0.2302580177783966, + "learning_rate": 2.29608302758527e-05, + "loss": 3.7494, + "step": 90745 + }, + { + "epoch": 6.165919282511211, + "grad_norm": 0.19885903596878052, + "learning_rate": 2.2956583774969427e-05, + "loss": 3.9898, + "step": 90750 + }, + { + "epoch": 6.166259002581873, + "grad_norm": 0.15202538669109344, + "learning_rate": 2.2952337274086155e-05, + "loss": 3.7737, + "step": 90755 + }, + { + "epoch": 6.166598722652535, + "grad_norm": 0.1776427924633026, + "learning_rate": 2.2948090773202883e-05, + "loss": 3.7732, + "step": 90760 + }, + { + "epoch": 6.166938442723196, + "grad_norm": 0.1784524768590927, + "learning_rate": 2.2943844272319608e-05, + "loss": 3.7217, + "step": 90765 + }, + { + "epoch": 6.167278162793858, + "grad_norm": 0.18139231204986572, + "learning_rate": 2.293959777143634e-05, + "loss": 3.8315, + "step": 90770 + }, + { + "epoch": 6.16761788286452, + "grad_norm": 0.21333612501621246, + "learning_rate": 2.2935351270553067e-05, + "loss": 3.7405, + "step": 90775 + }, + { + "epoch": 6.167957602935181, + "grad_norm": 0.18974708020687103, + "learning_rate": 2.293110476966979e-05, + "loss": 4.079, + "step": 90780 + }, + { + "epoch": 6.168297323005843, + "grad_norm": 0.12318399548530579, + "learning_rate": 2.2926858268786523e-05, + "loss": 3.8125, + "step": 90785 + }, + { + "epoch": 6.1686370430765045, + "grad_norm": 0.5449991226196289, + "learning_rate": 2.2922611767903248e-05, + "loss": 4.0649, + "step": 90790 + }, + { + "epoch": 6.168976763147167, + "grad_norm": 0.28715139627456665, + "learning_rate": 2.2918365267019976e-05, + "loss": 3.8006, + "step": 90795 + }, + { + "epoch": 6.169316483217829, + "grad_norm": 0.16772569715976715, + "learning_rate": 2.2914118766136704e-05, + "loss": 3.9102, + "step": 90800 + }, + { + "epoch": 6.16965620328849, + "grad_norm": 0.11861063539981842, + "learning_rate": 2.2909872265253432e-05, + "loss": 3.9068, + "step": 90805 + }, + { + "epoch": 6.169995923359152, + "grad_norm": 0.19501358270645142, + "learning_rate": 2.290562576437016e-05, + "loss": 3.9846, + "step": 90810 + }, + { + "epoch": 6.170335643429814, + "grad_norm": 0.19632300734519958, + "learning_rate": 2.2901379263486888e-05, + "loss": 4.0857, + "step": 90815 + }, + { + "epoch": 6.170675363500475, + "grad_norm": 0.18873177468776703, + "learning_rate": 2.2897132762603616e-05, + "loss": 3.8261, + "step": 90820 + }, + { + "epoch": 6.171015083571137, + "grad_norm": 0.13626554608345032, + "learning_rate": 2.2892886261720344e-05, + "loss": 3.7147, + "step": 90825 + }, + { + "epoch": 6.171354803641799, + "grad_norm": 0.1865636110305786, + "learning_rate": 2.2888639760837072e-05, + "loss": 3.7954, + "step": 90830 + }, + { + "epoch": 6.1716945237124605, + "grad_norm": 0.41043949127197266, + "learning_rate": 2.28843932599538e-05, + "loss": 3.8133, + "step": 90835 + }, + { + "epoch": 6.172034243783123, + "grad_norm": 0.21169723570346832, + "learning_rate": 2.2880146759070528e-05, + "loss": 3.7699, + "step": 90840 + }, + { + "epoch": 6.172373963853785, + "grad_norm": 0.1845240741968155, + "learning_rate": 2.2875900258187256e-05, + "loss": 3.8004, + "step": 90845 + }, + { + "epoch": 6.172713683924446, + "grad_norm": 0.15434053540229797, + "learning_rate": 2.287165375730398e-05, + "loss": 3.7921, + "step": 90850 + }, + { + "epoch": 6.173053403995108, + "grad_norm": 0.5521658658981323, + "learning_rate": 2.2867407256420712e-05, + "loss": 3.8111, + "step": 90855 + }, + { + "epoch": 6.17339312406577, + "grad_norm": 0.33162131905555725, + "learning_rate": 2.286316075553744e-05, + "loss": 3.5533, + "step": 90860 + }, + { + "epoch": 6.173732844136431, + "grad_norm": 0.17033912241458893, + "learning_rate": 2.2858914254654164e-05, + "loss": 3.7832, + "step": 90865 + }, + { + "epoch": 6.174072564207093, + "grad_norm": 0.21014170348644257, + "learning_rate": 2.2854667753770896e-05, + "loss": 3.6791, + "step": 90870 + }, + { + "epoch": 6.174412284277755, + "grad_norm": 0.784229576587677, + "learning_rate": 2.285042125288762e-05, + "loss": 3.5685, + "step": 90875 + }, + { + "epoch": 6.1747520043484165, + "grad_norm": 0.19244329631328583, + "learning_rate": 2.284617475200435e-05, + "loss": 3.929, + "step": 90880 + }, + { + "epoch": 6.175091724419079, + "grad_norm": 0.2091214954853058, + "learning_rate": 2.2841928251121076e-05, + "loss": 3.8665, + "step": 90885 + }, + { + "epoch": 6.175431444489741, + "grad_norm": 0.2223050445318222, + "learning_rate": 2.2837681750237804e-05, + "loss": 3.7579, + "step": 90890 + }, + { + "epoch": 6.175771164560402, + "grad_norm": 0.19781635701656342, + "learning_rate": 2.2833435249354532e-05, + "loss": 3.7233, + "step": 90895 + }, + { + "epoch": 6.176110884631064, + "grad_norm": 0.2545914947986603, + "learning_rate": 2.282918874847126e-05, + "loss": 3.728, + "step": 90900 + }, + { + "epoch": 6.176450604701726, + "grad_norm": 0.21774694323539734, + "learning_rate": 2.282494224758799e-05, + "loss": 3.6736, + "step": 90905 + }, + { + "epoch": 6.176790324772387, + "grad_norm": 0.15725462138652802, + "learning_rate": 2.2820695746704717e-05, + "loss": 3.8457, + "step": 90910 + }, + { + "epoch": 6.177130044843049, + "grad_norm": 0.21222305297851562, + "learning_rate": 2.2816449245821445e-05, + "loss": 3.7043, + "step": 90915 + }, + { + "epoch": 6.177469764913711, + "grad_norm": 0.31563112139701843, + "learning_rate": 2.2812202744938173e-05, + "loss": 3.8375, + "step": 90920 + }, + { + "epoch": 6.1778094849843725, + "grad_norm": 0.17445409297943115, + "learning_rate": 2.28079562440549e-05, + "loss": 3.7776, + "step": 90925 + }, + { + "epoch": 6.178149205055035, + "grad_norm": 0.1578754037618637, + "learning_rate": 2.280370974317163e-05, + "loss": 3.9193, + "step": 90930 + }, + { + "epoch": 6.178488925125697, + "grad_norm": 0.17557090520858765, + "learning_rate": 2.2799463242288353e-05, + "loss": 3.9438, + "step": 90935 + }, + { + "epoch": 6.178828645196358, + "grad_norm": 0.13992321491241455, + "learning_rate": 2.2795216741405085e-05, + "loss": 3.8154, + "step": 90940 + }, + { + "epoch": 6.17916836526702, + "grad_norm": 0.1911877989768982, + "learning_rate": 2.2790970240521813e-05, + "loss": 3.6639, + "step": 90945 + }, + { + "epoch": 6.179508085337682, + "grad_norm": 0.23735688626766205, + "learning_rate": 2.2786723739638537e-05, + "loss": 3.635, + "step": 90950 + }, + { + "epoch": 6.179847805408343, + "grad_norm": 0.2489350289106369, + "learning_rate": 2.278247723875527e-05, + "loss": 3.6676, + "step": 90955 + }, + { + "epoch": 6.180187525479005, + "grad_norm": 0.18209855258464813, + "learning_rate": 2.2778230737871993e-05, + "loss": 3.8789, + "step": 90960 + }, + { + "epoch": 6.180527245549667, + "grad_norm": 0.23269039392471313, + "learning_rate": 2.277398423698872e-05, + "loss": 3.7815, + "step": 90965 + }, + { + "epoch": 6.180866965620329, + "grad_norm": 0.2832420766353607, + "learning_rate": 2.2769737736105453e-05, + "loss": 3.9576, + "step": 90970 + }, + { + "epoch": 6.181206685690991, + "grad_norm": 0.1397460252046585, + "learning_rate": 2.2765491235222177e-05, + "loss": 3.8189, + "step": 90975 + }, + { + "epoch": 6.181546405761653, + "grad_norm": 0.26659226417541504, + "learning_rate": 2.2761244734338905e-05, + "loss": 4.0468, + "step": 90980 + }, + { + "epoch": 6.181886125832314, + "grad_norm": 0.17154335975646973, + "learning_rate": 2.2756998233455633e-05, + "loss": 3.5004, + "step": 90985 + }, + { + "epoch": 6.182225845902976, + "grad_norm": 0.14976948499679565, + "learning_rate": 2.275275173257236e-05, + "loss": 3.9355, + "step": 90990 + }, + { + "epoch": 6.182565565973638, + "grad_norm": 0.3200986683368683, + "learning_rate": 2.274850523168909e-05, + "loss": 3.8715, + "step": 90995 + }, + { + "epoch": 6.182905286044299, + "grad_norm": 0.19928912818431854, + "learning_rate": 2.2744258730805817e-05, + "loss": 4.0556, + "step": 91000 + }, + { + "epoch": 6.183245006114961, + "grad_norm": 0.1624697893857956, + "learning_rate": 2.2740012229922545e-05, + "loss": 3.6913, + "step": 91005 + }, + { + "epoch": 6.183584726185623, + "grad_norm": 0.16321218013763428, + "learning_rate": 2.2735765729039273e-05, + "loss": 3.8564, + "step": 91010 + }, + { + "epoch": 6.183924446256285, + "grad_norm": 0.16139301657676697, + "learning_rate": 2.2731519228156e-05, + "loss": 3.9335, + "step": 91015 + }, + { + "epoch": 6.184264166326947, + "grad_norm": 0.16272205114364624, + "learning_rate": 2.272727272727273e-05, + "loss": 3.6958, + "step": 91020 + }, + { + "epoch": 6.184603886397609, + "grad_norm": 0.27992022037506104, + "learning_rate": 2.2723026226389457e-05, + "loss": 4.021, + "step": 91025 + }, + { + "epoch": 6.18494360646827, + "grad_norm": 0.19009357690811157, + "learning_rate": 2.2718779725506185e-05, + "loss": 3.7636, + "step": 91030 + }, + { + "epoch": 6.185283326538932, + "grad_norm": 0.16977468132972717, + "learning_rate": 2.271453322462291e-05, + "loss": 3.7577, + "step": 91035 + }, + { + "epoch": 6.185623046609594, + "grad_norm": 0.17407643795013428, + "learning_rate": 2.271028672373964e-05, + "loss": 3.5569, + "step": 91040 + }, + { + "epoch": 6.185962766680255, + "grad_norm": 0.1764884889125824, + "learning_rate": 2.2706040222856366e-05, + "loss": 3.8571, + "step": 91045 + }, + { + "epoch": 6.186302486750917, + "grad_norm": 0.13281844556331635, + "learning_rate": 2.2701793721973094e-05, + "loss": 3.9755, + "step": 91050 + }, + { + "epoch": 6.186642206821579, + "grad_norm": 0.18154263496398926, + "learning_rate": 2.2697547221089825e-05, + "loss": 4.0843, + "step": 91055 + }, + { + "epoch": 6.186981926892241, + "grad_norm": 0.1661376804113388, + "learning_rate": 2.269330072020655e-05, + "loss": 3.4838, + "step": 91060 + }, + { + "epoch": 6.187321646962903, + "grad_norm": 0.20416289567947388, + "learning_rate": 2.2689054219323278e-05, + "loss": 3.9925, + "step": 91065 + }, + { + "epoch": 6.187661367033565, + "grad_norm": 0.15448413789272308, + "learning_rate": 2.2684807718440006e-05, + "loss": 3.8206, + "step": 91070 + }, + { + "epoch": 6.188001087104226, + "grad_norm": 0.19770507514476776, + "learning_rate": 2.2680561217556734e-05, + "loss": 3.6853, + "step": 91075 + }, + { + "epoch": 6.188340807174888, + "grad_norm": 0.14626584947109222, + "learning_rate": 2.2676314716673462e-05, + "loss": 3.8809, + "step": 91080 + }, + { + "epoch": 6.18868052724555, + "grad_norm": 0.24230697751045227, + "learning_rate": 2.267206821579019e-05, + "loss": 3.783, + "step": 91085 + }, + { + "epoch": 6.189020247316211, + "grad_norm": 0.17014895379543304, + "learning_rate": 2.2667821714906918e-05, + "loss": 3.8351, + "step": 91090 + }, + { + "epoch": 6.189359967386873, + "grad_norm": 0.1448322981595993, + "learning_rate": 2.2663575214023646e-05, + "loss": 3.9619, + "step": 91095 + }, + { + "epoch": 6.189699687457535, + "grad_norm": 0.23219627141952515, + "learning_rate": 2.2659328713140374e-05, + "loss": 3.7179, + "step": 91100 + }, + { + "epoch": 6.190039407528197, + "grad_norm": 0.3375060558319092, + "learning_rate": 2.2655082212257102e-05, + "loss": 3.8242, + "step": 91105 + }, + { + "epoch": 6.190379127598859, + "grad_norm": 0.16003888845443726, + "learning_rate": 2.265083571137383e-05, + "loss": 4.1025, + "step": 91110 + }, + { + "epoch": 6.190718847669521, + "grad_norm": 0.20238979160785675, + "learning_rate": 2.2646589210490558e-05, + "loss": 4.0299, + "step": 91115 + }, + { + "epoch": 6.191058567740182, + "grad_norm": 0.22134140133857727, + "learning_rate": 2.2642342709607283e-05, + "loss": 3.8373, + "step": 91120 + }, + { + "epoch": 6.191398287810844, + "grad_norm": 0.22709591686725616, + "learning_rate": 2.2638096208724014e-05, + "loss": 3.9218, + "step": 91125 + }, + { + "epoch": 6.191738007881506, + "grad_norm": 0.19159747660160065, + "learning_rate": 2.263384970784074e-05, + "loss": 3.6284, + "step": 91130 + }, + { + "epoch": 6.192077727952167, + "grad_norm": 0.6620166897773743, + "learning_rate": 2.2629603206957467e-05, + "loss": 4.1301, + "step": 91135 + }, + { + "epoch": 6.192417448022829, + "grad_norm": 0.20923663675785065, + "learning_rate": 2.2625356706074198e-05, + "loss": 4.0686, + "step": 91140 + }, + { + "epoch": 6.192757168093491, + "grad_norm": 0.22522884607315063, + "learning_rate": 2.2621110205190923e-05, + "loss": 3.4424, + "step": 91145 + }, + { + "epoch": 6.193096888164153, + "grad_norm": 0.136847585439682, + "learning_rate": 2.261686370430765e-05, + "loss": 3.6151, + "step": 91150 + }, + { + "epoch": 6.193436608234815, + "grad_norm": 0.15926715731620789, + "learning_rate": 2.261261720342438e-05, + "loss": 3.9375, + "step": 91155 + }, + { + "epoch": 6.193776328305476, + "grad_norm": 0.17486275732517242, + "learning_rate": 2.2608370702541107e-05, + "loss": 3.8046, + "step": 91160 + }, + { + "epoch": 6.194116048376138, + "grad_norm": 0.16275328397750854, + "learning_rate": 2.2604124201657835e-05, + "loss": 3.839, + "step": 91165 + }, + { + "epoch": 6.1944557684468, + "grad_norm": 0.1667303889989853, + "learning_rate": 2.2599877700774563e-05, + "loss": 3.9177, + "step": 91170 + }, + { + "epoch": 6.194795488517461, + "grad_norm": 0.1553685963153839, + "learning_rate": 2.259563119989129e-05, + "loss": 3.8164, + "step": 91175 + }, + { + "epoch": 6.195135208588123, + "grad_norm": 0.43145790696144104, + "learning_rate": 2.259138469900802e-05, + "loss": 4.0474, + "step": 91180 + }, + { + "epoch": 6.195474928658785, + "grad_norm": 0.5369970798492432, + "learning_rate": 2.2587138198124747e-05, + "loss": 3.8209, + "step": 91185 + }, + { + "epoch": 6.1958146487294465, + "grad_norm": 0.17754265666007996, + "learning_rate": 2.2582891697241475e-05, + "loss": 3.8949, + "step": 91190 + }, + { + "epoch": 6.196154368800109, + "grad_norm": 0.13550178706645966, + "learning_rate": 2.2578645196358203e-05, + "loss": 3.7384, + "step": 91195 + }, + { + "epoch": 6.196494088870771, + "grad_norm": 0.16653786599636078, + "learning_rate": 2.257439869547493e-05, + "loss": 4.1676, + "step": 91200 + }, + { + "epoch": 6.196833808941432, + "grad_norm": 0.16747266054153442, + "learning_rate": 2.2570152194591655e-05, + "loss": 3.7018, + "step": 91205 + }, + { + "epoch": 6.197173529012094, + "grad_norm": 0.21033941209316254, + "learning_rate": 2.2565905693708387e-05, + "loss": 3.7125, + "step": 91210 + }, + { + "epoch": 6.197513249082756, + "grad_norm": 2.3351316452026367, + "learning_rate": 2.256165919282511e-05, + "loss": 3.907, + "step": 91215 + }, + { + "epoch": 6.197852969153417, + "grad_norm": 0.15243157744407654, + "learning_rate": 2.255741269194184e-05, + "loss": 3.7185, + "step": 91220 + }, + { + "epoch": 6.198192689224079, + "grad_norm": 0.20694108307361603, + "learning_rate": 2.255316619105857e-05, + "loss": 3.9184, + "step": 91225 + }, + { + "epoch": 6.198532409294741, + "grad_norm": 0.18854749202728271, + "learning_rate": 2.2548919690175295e-05, + "loss": 3.8022, + "step": 91230 + }, + { + "epoch": 6.1988721293654026, + "grad_norm": 1.489006757736206, + "learning_rate": 2.2544673189292023e-05, + "loss": 3.726, + "step": 91235 + }, + { + "epoch": 6.199211849436065, + "grad_norm": 0.17175523936748505, + "learning_rate": 2.254042668840875e-05, + "loss": 3.8122, + "step": 91240 + }, + { + "epoch": 6.199551569506727, + "grad_norm": 0.24330474436283112, + "learning_rate": 2.253618018752548e-05, + "loss": 3.8626, + "step": 91245 + }, + { + "epoch": 6.199891289577388, + "grad_norm": 0.17603611946105957, + "learning_rate": 2.253193368664221e-05, + "loss": 3.8361, + "step": 91250 + }, + { + "epoch": 6.20023100964805, + "grad_norm": 0.16212791204452515, + "learning_rate": 2.2527687185758935e-05, + "loss": 3.5885, + "step": 91255 + }, + { + "epoch": 6.200570729718712, + "grad_norm": 0.19552116096019745, + "learning_rate": 2.2523440684875663e-05, + "loss": 3.8458, + "step": 91260 + }, + { + "epoch": 6.200910449789373, + "grad_norm": 0.15467749536037445, + "learning_rate": 2.251919418399239e-05, + "loss": 3.8389, + "step": 91265 + }, + { + "epoch": 6.201250169860035, + "grad_norm": 0.3717688322067261, + "learning_rate": 2.251494768310912e-05, + "loss": 3.907, + "step": 91270 + }, + { + "epoch": 6.201589889930697, + "grad_norm": 0.14259472489356995, + "learning_rate": 2.2510701182225848e-05, + "loss": 3.7493, + "step": 91275 + }, + { + "epoch": 6.201929610001359, + "grad_norm": 0.28487709164619446, + "learning_rate": 2.2506454681342576e-05, + "loss": 3.9249, + "step": 91280 + }, + { + "epoch": 6.202269330072021, + "grad_norm": 0.20413346588611603, + "learning_rate": 2.2502208180459304e-05, + "loss": 3.5044, + "step": 91285 + }, + { + "epoch": 6.202609050142683, + "grad_norm": 0.16351218521595, + "learning_rate": 2.2497961679576028e-05, + "loss": 3.8411, + "step": 91290 + }, + { + "epoch": 6.202948770213344, + "grad_norm": 0.18901322782039642, + "learning_rate": 2.249371517869276e-05, + "loss": 4.0262, + "step": 91295 + }, + { + "epoch": 6.203288490284006, + "grad_norm": 0.19756068289279938, + "learning_rate": 2.2489468677809488e-05, + "loss": 3.4209, + "step": 91300 + }, + { + "epoch": 6.203628210354668, + "grad_norm": 0.13492557406425476, + "learning_rate": 2.2485222176926212e-05, + "loss": 3.7359, + "step": 91305 + }, + { + "epoch": 6.203967930425329, + "grad_norm": 0.18905851244926453, + "learning_rate": 2.2480975676042944e-05, + "loss": 3.7412, + "step": 91310 + }, + { + "epoch": 6.204307650495991, + "grad_norm": 0.21422743797302246, + "learning_rate": 2.2476729175159668e-05, + "loss": 3.6742, + "step": 91315 + }, + { + "epoch": 6.204647370566653, + "grad_norm": 0.28671762347221375, + "learning_rate": 2.2472482674276396e-05, + "loss": 3.8826, + "step": 91320 + }, + { + "epoch": 6.204987090637315, + "grad_norm": 0.19353626668453217, + "learning_rate": 2.2468236173393124e-05, + "loss": 3.6552, + "step": 91325 + }, + { + "epoch": 6.205326810707977, + "grad_norm": 0.18762050569057465, + "learning_rate": 2.2463989672509852e-05, + "loss": 3.9025, + "step": 91330 + }, + { + "epoch": 6.205666530778639, + "grad_norm": 0.15341956913471222, + "learning_rate": 2.2459743171626584e-05, + "loss": 3.7788, + "step": 91335 + }, + { + "epoch": 6.2060062508493, + "grad_norm": 0.15695177018642426, + "learning_rate": 2.2455496670743308e-05, + "loss": 3.6974, + "step": 91340 + }, + { + "epoch": 6.206345970919962, + "grad_norm": 0.2274375706911087, + "learning_rate": 2.2451250169860036e-05, + "loss": 4.0898, + "step": 91345 + }, + { + "epoch": 6.206685690990624, + "grad_norm": 0.182235985994339, + "learning_rate": 2.2447003668976764e-05, + "loss": 3.6247, + "step": 91350 + }, + { + "epoch": 6.207025411061285, + "grad_norm": 0.5122133493423462, + "learning_rate": 2.2442757168093492e-05, + "loss": 3.7703, + "step": 91355 + }, + { + "epoch": 6.207365131131947, + "grad_norm": 0.1762898713350296, + "learning_rate": 2.243851066721022e-05, + "loss": 3.836, + "step": 91360 + }, + { + "epoch": 6.207704851202609, + "grad_norm": 0.2324950248003006, + "learning_rate": 2.2434264166326948e-05, + "loss": 3.7355, + "step": 91365 + }, + { + "epoch": 6.208044571273271, + "grad_norm": 0.19097653031349182, + "learning_rate": 2.2430017665443676e-05, + "loss": 3.9889, + "step": 91370 + }, + { + "epoch": 6.208384291343933, + "grad_norm": 0.14378732442855835, + "learning_rate": 2.24257711645604e-05, + "loss": 3.9371, + "step": 91375 + }, + { + "epoch": 6.208724011414595, + "grad_norm": 0.2512596845626831, + "learning_rate": 2.2421524663677132e-05, + "loss": 3.8867, + "step": 91380 + }, + { + "epoch": 6.209063731485256, + "grad_norm": 0.19716347754001617, + "learning_rate": 2.241727816279386e-05, + "loss": 3.7188, + "step": 91385 + }, + { + "epoch": 6.209403451555918, + "grad_norm": 0.16130442917346954, + "learning_rate": 2.2413031661910585e-05, + "loss": 3.7055, + "step": 91390 + }, + { + "epoch": 6.20974317162658, + "grad_norm": 0.19353069365024567, + "learning_rate": 2.2408785161027316e-05, + "loss": 3.7704, + "step": 91395 + }, + { + "epoch": 6.210082891697241, + "grad_norm": 0.15483920276165009, + "learning_rate": 2.240453866014404e-05, + "loss": 3.79, + "step": 91400 + }, + { + "epoch": 6.210422611767903, + "grad_norm": 0.1507573276758194, + "learning_rate": 2.240029215926077e-05, + "loss": 3.7603, + "step": 91405 + }, + { + "epoch": 6.210762331838565, + "grad_norm": 0.14489209651947021, + "learning_rate": 2.2396045658377497e-05, + "loss": 3.8096, + "step": 91410 + }, + { + "epoch": 6.211102051909227, + "grad_norm": 0.20147626101970673, + "learning_rate": 2.2391799157494225e-05, + "loss": 3.8935, + "step": 91415 + }, + { + "epoch": 6.211441771979889, + "grad_norm": 0.16306383907794952, + "learning_rate": 2.2387552656610956e-05, + "loss": 3.8203, + "step": 91420 + }, + { + "epoch": 6.211781492050551, + "grad_norm": 0.16803325712680817, + "learning_rate": 2.238330615572768e-05, + "loss": 3.8988, + "step": 91425 + }, + { + "epoch": 6.212121212121212, + "grad_norm": 0.27075132727622986, + "learning_rate": 2.237905965484441e-05, + "loss": 3.7447, + "step": 91430 + }, + { + "epoch": 6.212460932191874, + "grad_norm": 0.5833024382591248, + "learning_rate": 2.2374813153961137e-05, + "loss": 4.0217, + "step": 91435 + }, + { + "epoch": 6.212800652262536, + "grad_norm": 0.19433709979057312, + "learning_rate": 2.2370566653077865e-05, + "loss": 4.059, + "step": 91440 + }, + { + "epoch": 6.213140372333197, + "grad_norm": 0.14211232960224152, + "learning_rate": 2.2366320152194593e-05, + "loss": 3.7139, + "step": 91445 + }, + { + "epoch": 6.213480092403859, + "grad_norm": 0.6864272356033325, + "learning_rate": 2.236207365131132e-05, + "loss": 3.8318, + "step": 91450 + }, + { + "epoch": 6.213819812474521, + "grad_norm": 0.19123108685016632, + "learning_rate": 2.235782715042805e-05, + "loss": 4.0934, + "step": 91455 + }, + { + "epoch": 6.214159532545183, + "grad_norm": 0.6766995787620544, + "learning_rate": 2.2353580649544774e-05, + "loss": 4.0342, + "step": 91460 + }, + { + "epoch": 6.214499252615845, + "grad_norm": 0.18025338649749756, + "learning_rate": 2.2349334148661505e-05, + "loss": 3.8652, + "step": 91465 + }, + { + "epoch": 6.214838972686506, + "grad_norm": 0.22389470040798187, + "learning_rate": 2.2345087647778233e-05, + "loss": 3.811, + "step": 91470 + }, + { + "epoch": 6.215178692757168, + "grad_norm": 0.6484856605529785, + "learning_rate": 2.2340841146894958e-05, + "loss": 3.6625, + "step": 91475 + }, + { + "epoch": 6.21551841282783, + "grad_norm": 0.3774573504924774, + "learning_rate": 2.233659464601169e-05, + "loss": 3.6968, + "step": 91480 + }, + { + "epoch": 6.215858132898491, + "grad_norm": 0.18017613887786865, + "learning_rate": 2.2332348145128414e-05, + "loss": 3.7895, + "step": 91485 + }, + { + "epoch": 6.216197852969153, + "grad_norm": 0.1562880575656891, + "learning_rate": 2.232810164424514e-05, + "loss": 3.9915, + "step": 91490 + }, + { + "epoch": 6.216537573039815, + "grad_norm": 0.15656809508800507, + "learning_rate": 2.2323855143361873e-05, + "loss": 4.0195, + "step": 91495 + }, + { + "epoch": 6.2168772931104765, + "grad_norm": 0.17447158694267273, + "learning_rate": 2.2319608642478598e-05, + "loss": 3.9529, + "step": 91500 + }, + { + "epoch": 6.217217013181139, + "grad_norm": 0.13873453438282013, + "learning_rate": 2.231536214159533e-05, + "loss": 3.8472, + "step": 91505 + }, + { + "epoch": 6.217556733251801, + "grad_norm": 0.19340817630290985, + "learning_rate": 2.2311115640712054e-05, + "loss": 3.636, + "step": 91510 + }, + { + "epoch": 6.217896453322462, + "grad_norm": 0.22526778280735016, + "learning_rate": 2.2306869139828782e-05, + "loss": 3.6285, + "step": 91515 + }, + { + "epoch": 6.218236173393124, + "grad_norm": 0.1988825500011444, + "learning_rate": 2.230262263894551e-05, + "loss": 3.5657, + "step": 91520 + }, + { + "epoch": 6.218575893463786, + "grad_norm": 0.2025974988937378, + "learning_rate": 2.2298376138062238e-05, + "loss": 4.0043, + "step": 91525 + }, + { + "epoch": 6.218915613534447, + "grad_norm": 0.17677170038223267, + "learning_rate": 2.2294129637178966e-05, + "loss": 3.8964, + "step": 91530 + }, + { + "epoch": 6.219255333605109, + "grad_norm": 0.1829923391342163, + "learning_rate": 2.2289883136295694e-05, + "loss": 3.6725, + "step": 91535 + }, + { + "epoch": 6.219595053675771, + "grad_norm": 0.19735178351402283, + "learning_rate": 2.2285636635412422e-05, + "loss": 3.8833, + "step": 91540 + }, + { + "epoch": 6.2199347737464326, + "grad_norm": 0.564212441444397, + "learning_rate": 2.228139013452915e-05, + "loss": 3.8825, + "step": 91545 + }, + { + "epoch": 6.220274493817095, + "grad_norm": 0.7258668541908264, + "learning_rate": 2.2277143633645878e-05, + "loss": 3.7773, + "step": 91550 + }, + { + "epoch": 6.220614213887757, + "grad_norm": 0.33867499232292175, + "learning_rate": 2.2272897132762606e-05, + "loss": 3.9341, + "step": 91555 + }, + { + "epoch": 6.220953933958418, + "grad_norm": 0.1522631198167801, + "learning_rate": 2.226865063187933e-05, + "loss": 3.8835, + "step": 91560 + }, + { + "epoch": 6.22129365402908, + "grad_norm": 0.1511303037405014, + "learning_rate": 2.2264404130996062e-05, + "loss": 3.7722, + "step": 91565 + }, + { + "epoch": 6.221633374099742, + "grad_norm": 0.6543501615524292, + "learning_rate": 2.2260157630112786e-05, + "loss": 3.6636, + "step": 91570 + }, + { + "epoch": 6.221973094170403, + "grad_norm": 0.1561490148305893, + "learning_rate": 2.2255911129229514e-05, + "loss": 3.7373, + "step": 91575 + }, + { + "epoch": 6.222312814241065, + "grad_norm": 0.19424530863761902, + "learning_rate": 2.2251664628346246e-05, + "loss": 3.9178, + "step": 91580 + }, + { + "epoch": 6.222652534311727, + "grad_norm": 0.1668897569179535, + "learning_rate": 2.224741812746297e-05, + "loss": 3.6944, + "step": 91585 + }, + { + "epoch": 6.222992254382389, + "grad_norm": 0.3236995041370392, + "learning_rate": 2.2243171626579702e-05, + "loss": 3.7183, + "step": 91590 + }, + { + "epoch": 6.223331974453051, + "grad_norm": 0.14484699070453644, + "learning_rate": 2.2238925125696426e-05, + "loss": 3.7517, + "step": 91595 + }, + { + "epoch": 6.223671694523713, + "grad_norm": 1.2785429954528809, + "learning_rate": 2.2234678624813154e-05, + "loss": 3.7882, + "step": 91600 + }, + { + "epoch": 6.224011414594374, + "grad_norm": 0.16087444126605988, + "learning_rate": 2.2230432123929882e-05, + "loss": 3.7638, + "step": 91605 + }, + { + "epoch": 6.224351134665036, + "grad_norm": 0.18272896111011505, + "learning_rate": 2.222618562304661e-05, + "loss": 3.718, + "step": 91610 + }, + { + "epoch": 6.224690854735698, + "grad_norm": 0.17911240458488464, + "learning_rate": 2.222193912216334e-05, + "loss": 3.9615, + "step": 91615 + }, + { + "epoch": 6.225030574806359, + "grad_norm": 0.20218504965305328, + "learning_rate": 2.2217692621280066e-05, + "loss": 3.8242, + "step": 91620 + }, + { + "epoch": 6.225370294877021, + "grad_norm": 0.1792442500591278, + "learning_rate": 2.2213446120396795e-05, + "loss": 4.0354, + "step": 91625 + }, + { + "epoch": 6.225710014947683, + "grad_norm": 0.16551265120506287, + "learning_rate": 2.2209199619513523e-05, + "loss": 3.6725, + "step": 91630 + }, + { + "epoch": 6.226049735018345, + "grad_norm": 0.13556607067584991, + "learning_rate": 2.220495311863025e-05, + "loss": 3.897, + "step": 91635 + }, + { + "epoch": 6.226389455089007, + "grad_norm": 0.17483769357204437, + "learning_rate": 2.220070661774698e-05, + "loss": 3.8527, + "step": 91640 + }, + { + "epoch": 6.226729175159669, + "grad_norm": 0.18284811079502106, + "learning_rate": 2.2196460116863703e-05, + "loss": 4.1263, + "step": 91645 + }, + { + "epoch": 6.22706889523033, + "grad_norm": 0.14841492474079132, + "learning_rate": 2.2192213615980435e-05, + "loss": 3.6115, + "step": 91650 + }, + { + "epoch": 6.227408615300992, + "grad_norm": 0.2439740151166916, + "learning_rate": 2.218796711509716e-05, + "loss": 3.6257, + "step": 91655 + }, + { + "epoch": 6.227748335371654, + "grad_norm": 0.20454415678977966, + "learning_rate": 2.2183720614213887e-05, + "loss": 3.7694, + "step": 91660 + }, + { + "epoch": 6.228088055442315, + "grad_norm": 0.1369970142841339, + "learning_rate": 2.217947411333062e-05, + "loss": 3.8291, + "step": 91665 + }, + { + "epoch": 6.228427775512977, + "grad_norm": 0.16895458102226257, + "learning_rate": 2.2175227612447343e-05, + "loss": 3.9851, + "step": 91670 + }, + { + "epoch": 6.228767495583639, + "grad_norm": 0.21212927997112274, + "learning_rate": 2.2170981111564075e-05, + "loss": 3.8393, + "step": 91675 + }, + { + "epoch": 6.229107215654301, + "grad_norm": 0.17939889430999756, + "learning_rate": 2.21667346106808e-05, + "loss": 3.7227, + "step": 91680 + }, + { + "epoch": 6.229446935724963, + "grad_norm": 0.16728919744491577, + "learning_rate": 2.2162488109797527e-05, + "loss": 3.8701, + "step": 91685 + }, + { + "epoch": 6.229786655795625, + "grad_norm": 0.1686837524175644, + "learning_rate": 2.215824160891426e-05, + "loss": 3.8713, + "step": 91690 + }, + { + "epoch": 6.230126375866286, + "grad_norm": 0.30426159501075745, + "learning_rate": 2.2153995108030983e-05, + "loss": 3.7206, + "step": 91695 + }, + { + "epoch": 6.230466095936948, + "grad_norm": 1.7182775735855103, + "learning_rate": 2.214974860714771e-05, + "loss": 3.6898, + "step": 91700 + }, + { + "epoch": 6.23080581600761, + "grad_norm": 0.1585504114627838, + "learning_rate": 2.214550210626444e-05, + "loss": 3.7653, + "step": 91705 + }, + { + "epoch": 6.231145536078271, + "grad_norm": 0.29131725430488586, + "learning_rate": 2.2141255605381167e-05, + "loss": 3.8388, + "step": 91710 + }, + { + "epoch": 6.231485256148933, + "grad_norm": 0.16977441310882568, + "learning_rate": 2.2137009104497895e-05, + "loss": 4.221, + "step": 91715 + }, + { + "epoch": 6.231824976219595, + "grad_norm": 0.22793392837047577, + "learning_rate": 2.2132762603614623e-05, + "loss": 3.7736, + "step": 91720 + }, + { + "epoch": 6.232164696290257, + "grad_norm": 0.42076948285102844, + "learning_rate": 2.212851610273135e-05, + "loss": 3.703, + "step": 91725 + }, + { + "epoch": 6.232504416360919, + "grad_norm": 0.14421528577804565, + "learning_rate": 2.2124269601848076e-05, + "loss": 3.6688, + "step": 91730 + }, + { + "epoch": 6.232844136431581, + "grad_norm": 0.15739600360393524, + "learning_rate": 2.2120023100964807e-05, + "loss": 3.9964, + "step": 91735 + }, + { + "epoch": 6.233183856502242, + "grad_norm": 0.22669321298599243, + "learning_rate": 2.2115776600081535e-05, + "loss": 4.0132, + "step": 91740 + }, + { + "epoch": 6.233523576572904, + "grad_norm": 0.17945371568202972, + "learning_rate": 2.211153009919826e-05, + "loss": 3.7814, + "step": 91745 + }, + { + "epoch": 6.233863296643566, + "grad_norm": 0.22795993089675903, + "learning_rate": 2.210728359831499e-05, + "loss": 3.8447, + "step": 91750 + }, + { + "epoch": 6.234203016714227, + "grad_norm": 0.14999301731586456, + "learning_rate": 2.2103037097431716e-05, + "loss": 3.716, + "step": 91755 + }, + { + "epoch": 6.234542736784889, + "grad_norm": 0.17025311291217804, + "learning_rate": 2.2098790596548447e-05, + "loss": 3.7936, + "step": 91760 + }, + { + "epoch": 6.234882456855551, + "grad_norm": 0.43397969007492065, + "learning_rate": 2.2094544095665172e-05, + "loss": 3.7305, + "step": 91765 + }, + { + "epoch": 6.235222176926213, + "grad_norm": 0.19024918973445892, + "learning_rate": 2.20902975947819e-05, + "loss": 3.8178, + "step": 91770 + }, + { + "epoch": 6.235561896996875, + "grad_norm": 0.21309156715869904, + "learning_rate": 2.208605109389863e-05, + "loss": 3.9282, + "step": 91775 + }, + { + "epoch": 6.235901617067537, + "grad_norm": 0.19574545323848724, + "learning_rate": 2.2081804593015356e-05, + "loss": 4.1441, + "step": 91780 + }, + { + "epoch": 6.236241337138198, + "grad_norm": 0.1944281905889511, + "learning_rate": 2.2077558092132084e-05, + "loss": 3.7469, + "step": 91785 + }, + { + "epoch": 6.23658105720886, + "grad_norm": 0.16940881311893463, + "learning_rate": 2.2073311591248812e-05, + "loss": 3.8286, + "step": 91790 + }, + { + "epoch": 6.236920777279522, + "grad_norm": 0.4282350242137909, + "learning_rate": 2.206906509036554e-05, + "loss": 3.6823, + "step": 91795 + }, + { + "epoch": 6.237260497350183, + "grad_norm": 0.1631837785243988, + "learning_rate": 2.2064818589482268e-05, + "loss": 4.1206, + "step": 91800 + }, + { + "epoch": 6.237600217420845, + "grad_norm": 0.1705278903245926, + "learning_rate": 2.2060572088598996e-05, + "loss": 4.0389, + "step": 91805 + }, + { + "epoch": 6.237939937491507, + "grad_norm": 0.23594802618026733, + "learning_rate": 2.2056325587715724e-05, + "loss": 3.7595, + "step": 91810 + }, + { + "epoch": 6.238279657562169, + "grad_norm": 0.23722241818904877, + "learning_rate": 2.205207908683245e-05, + "loss": 3.8473, + "step": 91815 + }, + { + "epoch": 6.238619377632831, + "grad_norm": 0.201886385679245, + "learning_rate": 2.204783258594918e-05, + "loss": 3.8493, + "step": 91820 + }, + { + "epoch": 6.238959097703493, + "grad_norm": 0.13675042986869812, + "learning_rate": 2.2043586085065908e-05, + "loss": 3.7732, + "step": 91825 + }, + { + "epoch": 6.239298817774154, + "grad_norm": 0.14114511013031006, + "learning_rate": 2.2039339584182633e-05, + "loss": 3.4781, + "step": 91830 + }, + { + "epoch": 6.239638537844816, + "grad_norm": 0.17411932349205017, + "learning_rate": 2.2035093083299364e-05, + "loss": 3.7003, + "step": 91835 + }, + { + "epoch": 6.239978257915477, + "grad_norm": 0.17824184894561768, + "learning_rate": 2.203084658241609e-05, + "loss": 3.7166, + "step": 91840 + }, + { + "epoch": 6.240317977986139, + "grad_norm": 0.16549712419509888, + "learning_rate": 2.202660008153282e-05, + "loss": 3.9109, + "step": 91845 + }, + { + "epoch": 6.240657698056801, + "grad_norm": 0.15540184080600739, + "learning_rate": 2.2022353580649545e-05, + "loss": 3.8907, + "step": 91850 + }, + { + "epoch": 6.240997418127463, + "grad_norm": 0.17289121448993683, + "learning_rate": 2.2018107079766273e-05, + "loss": 3.7937, + "step": 91855 + }, + { + "epoch": 6.241337138198125, + "grad_norm": 0.18066810071468353, + "learning_rate": 2.2013860578883004e-05, + "loss": 3.9969, + "step": 91860 + }, + { + "epoch": 6.241676858268787, + "grad_norm": 0.5820386409759521, + "learning_rate": 2.200961407799973e-05, + "loss": 3.865, + "step": 91865 + }, + { + "epoch": 6.242016578339448, + "grad_norm": 0.18077296018600464, + "learning_rate": 2.2005367577116457e-05, + "loss": 3.6833, + "step": 91870 + }, + { + "epoch": 6.24235629841011, + "grad_norm": 0.22779567539691925, + "learning_rate": 2.2001121076233185e-05, + "loss": 3.7521, + "step": 91875 + }, + { + "epoch": 6.242696018480772, + "grad_norm": 0.14610590040683746, + "learning_rate": 2.1996874575349913e-05, + "loss": 3.7759, + "step": 91880 + }, + { + "epoch": 6.243035738551433, + "grad_norm": 0.19488425552845, + "learning_rate": 2.199262807446664e-05, + "loss": 3.7541, + "step": 91885 + }, + { + "epoch": 6.243375458622095, + "grad_norm": 0.1607968509197235, + "learning_rate": 2.198838157358337e-05, + "loss": 3.7637, + "step": 91890 + }, + { + "epoch": 6.243715178692757, + "grad_norm": 0.1718495786190033, + "learning_rate": 2.1984135072700097e-05, + "loss": 3.8256, + "step": 91895 + }, + { + "epoch": 6.244054898763419, + "grad_norm": 0.22052112221717834, + "learning_rate": 2.197988857181682e-05, + "loss": 3.8803, + "step": 91900 + }, + { + "epoch": 6.244394618834081, + "grad_norm": 0.18888553977012634, + "learning_rate": 2.1975642070933553e-05, + "loss": 3.9452, + "step": 91905 + }, + { + "epoch": 6.244734338904743, + "grad_norm": 0.2174227386713028, + "learning_rate": 2.197139557005028e-05, + "loss": 3.9166, + "step": 91910 + }, + { + "epoch": 6.245074058975404, + "grad_norm": 0.17493383586406708, + "learning_rate": 2.1967149069167005e-05, + "loss": 3.7715, + "step": 91915 + }, + { + "epoch": 6.245413779046066, + "grad_norm": 0.20926158130168915, + "learning_rate": 2.1962902568283737e-05, + "loss": 3.8727, + "step": 91920 + }, + { + "epoch": 6.245753499116728, + "grad_norm": 0.18952026963233948, + "learning_rate": 2.195865606740046e-05, + "loss": 3.9146, + "step": 91925 + }, + { + "epoch": 6.246093219187389, + "grad_norm": 6.61220121383667, + "learning_rate": 2.1954409566517193e-05, + "loss": 3.9598, + "step": 91930 + }, + { + "epoch": 6.246432939258051, + "grad_norm": 0.17106132209300995, + "learning_rate": 2.1950163065633917e-05, + "loss": 3.5855, + "step": 91935 + }, + { + "epoch": 6.246772659328713, + "grad_norm": 0.18339617550373077, + "learning_rate": 2.1945916564750645e-05, + "loss": 3.8952, + "step": 91940 + }, + { + "epoch": 6.247112379399375, + "grad_norm": 0.18846623599529266, + "learning_rate": 2.1941670063867377e-05, + "loss": 4.0752, + "step": 91945 + }, + { + "epoch": 6.247452099470037, + "grad_norm": 0.2251916527748108, + "learning_rate": 2.19374235629841e-05, + "loss": 3.6819, + "step": 91950 + }, + { + "epoch": 6.247791819540699, + "grad_norm": 0.17860695719718933, + "learning_rate": 2.193317706210083e-05, + "loss": 3.8506, + "step": 91955 + }, + { + "epoch": 6.24813153961136, + "grad_norm": 0.16570664942264557, + "learning_rate": 2.1928930561217557e-05, + "loss": 3.8108, + "step": 91960 + }, + { + "epoch": 6.248471259682022, + "grad_norm": 0.19174708425998688, + "learning_rate": 2.1924684060334285e-05, + "loss": 3.7585, + "step": 91965 + }, + { + "epoch": 6.248810979752684, + "grad_norm": 0.16106292605400085, + "learning_rate": 2.1920437559451013e-05, + "loss": 3.7017, + "step": 91970 + }, + { + "epoch": 6.249150699823345, + "grad_norm": 0.3432653844356537, + "learning_rate": 2.191619105856774e-05, + "loss": 3.9051, + "step": 91975 + }, + { + "epoch": 6.249490419894007, + "grad_norm": 0.19663722813129425, + "learning_rate": 2.191194455768447e-05, + "loss": 3.7799, + "step": 91980 + }, + { + "epoch": 6.249830139964669, + "grad_norm": 0.1563407927751541, + "learning_rate": 2.1907698056801194e-05, + "loss": 3.5351, + "step": 91985 + }, + { + "epoch": 6.250169860035331, + "grad_norm": 0.1799822747707367, + "learning_rate": 2.1903451555917926e-05, + "loss": 3.8426, + "step": 91990 + }, + { + "epoch": 6.250509580105993, + "grad_norm": 0.14982828497886658, + "learning_rate": 2.1899205055034654e-05, + "loss": 3.9232, + "step": 91995 + }, + { + "epoch": 6.250849300176655, + "grad_norm": 0.1815965324640274, + "learning_rate": 2.1894958554151378e-05, + "loss": 3.8023, + "step": 92000 + }, + { + "epoch": 6.251189020247316, + "grad_norm": 0.12175703793764114, + "learning_rate": 2.189071205326811e-05, + "loss": 4.0403, + "step": 92005 + }, + { + "epoch": 6.251528740317978, + "grad_norm": 0.14380133152008057, + "learning_rate": 2.1886465552384834e-05, + "loss": 3.7724, + "step": 92010 + }, + { + "epoch": 6.25186846038864, + "grad_norm": 0.1303611844778061, + "learning_rate": 2.1882219051501566e-05, + "loss": 3.4966, + "step": 92015 + }, + { + "epoch": 6.252208180459301, + "grad_norm": 0.2294054925441742, + "learning_rate": 2.1877972550618294e-05, + "loss": 3.7328, + "step": 92020 + }, + { + "epoch": 6.252547900529963, + "grad_norm": 0.18238383531570435, + "learning_rate": 2.1873726049735018e-05, + "loss": 3.7363, + "step": 92025 + }, + { + "epoch": 6.252887620600625, + "grad_norm": 0.15636427700519562, + "learning_rate": 2.186947954885175e-05, + "loss": 3.76, + "step": 92030 + }, + { + "epoch": 6.253227340671287, + "grad_norm": 0.1394205391407013, + "learning_rate": 2.1865233047968474e-05, + "loss": 3.7947, + "step": 92035 + }, + { + "epoch": 6.253567060741949, + "grad_norm": 0.3113560974597931, + "learning_rate": 2.1860986547085202e-05, + "loss": 3.7627, + "step": 92040 + }, + { + "epoch": 6.253906780812611, + "grad_norm": 0.14998143911361694, + "learning_rate": 2.185674004620193e-05, + "loss": 3.8796, + "step": 92045 + }, + { + "epoch": 6.254246500883272, + "grad_norm": 0.13111554086208344, + "learning_rate": 2.1852493545318658e-05, + "loss": 3.9145, + "step": 92050 + }, + { + "epoch": 6.254586220953934, + "grad_norm": 0.18656262755393982, + "learning_rate": 2.1848247044435386e-05, + "loss": 3.726, + "step": 92055 + }, + { + "epoch": 6.254925941024596, + "grad_norm": 0.17059458792209625, + "learning_rate": 2.1844000543552114e-05, + "loss": 3.92, + "step": 92060 + }, + { + "epoch": 6.255265661095257, + "grad_norm": 0.18127626180648804, + "learning_rate": 2.1839754042668842e-05, + "loss": 3.8832, + "step": 92065 + }, + { + "epoch": 6.255605381165919, + "grad_norm": 0.30667954683303833, + "learning_rate": 2.183550754178557e-05, + "loss": 3.5803, + "step": 92070 + }, + { + "epoch": 6.255945101236581, + "grad_norm": 0.20058542490005493, + "learning_rate": 2.1831261040902298e-05, + "loss": 4.0304, + "step": 92075 + }, + { + "epoch": 6.256284821307243, + "grad_norm": 0.15654811263084412, + "learning_rate": 2.1827014540019026e-05, + "loss": 3.7926, + "step": 92080 + }, + { + "epoch": 6.256624541377905, + "grad_norm": 0.2335447371006012, + "learning_rate": 2.182276803913575e-05, + "loss": 3.8088, + "step": 92085 + }, + { + "epoch": 6.256964261448567, + "grad_norm": 0.12686800956726074, + "learning_rate": 2.1818521538252482e-05, + "loss": 3.795, + "step": 92090 + }, + { + "epoch": 6.257303981519228, + "grad_norm": 0.17993281781673431, + "learning_rate": 2.1814275037369207e-05, + "loss": 3.63, + "step": 92095 + }, + { + "epoch": 6.25764370158989, + "grad_norm": 0.1497471034526825, + "learning_rate": 2.1810028536485935e-05, + "loss": 3.7955, + "step": 92100 + }, + { + "epoch": 6.257983421660552, + "grad_norm": 0.16911368072032928, + "learning_rate": 2.1805782035602666e-05, + "loss": 3.736, + "step": 92105 + }, + { + "epoch": 6.258323141731213, + "grad_norm": 0.21212679147720337, + "learning_rate": 2.180153553471939e-05, + "loss": 3.8561, + "step": 92110 + }, + { + "epoch": 6.258662861801875, + "grad_norm": 0.19837422668933868, + "learning_rate": 2.1797289033836122e-05, + "loss": 3.9316, + "step": 92115 + }, + { + "epoch": 6.259002581872537, + "grad_norm": 0.5575704574584961, + "learning_rate": 2.1793042532952847e-05, + "loss": 3.7994, + "step": 92120 + }, + { + "epoch": 6.259342301943199, + "grad_norm": 0.2756849527359009, + "learning_rate": 2.1788796032069575e-05, + "loss": 3.6893, + "step": 92125 + }, + { + "epoch": 6.259682022013861, + "grad_norm": 0.164479598402977, + "learning_rate": 2.1784549531186303e-05, + "loss": 4.0829, + "step": 92130 + }, + { + "epoch": 6.260021742084522, + "grad_norm": 0.13817806541919708, + "learning_rate": 2.178030303030303e-05, + "loss": 3.7733, + "step": 92135 + }, + { + "epoch": 6.260361462155184, + "grad_norm": 0.15935437381267548, + "learning_rate": 2.177605652941976e-05, + "loss": 3.9055, + "step": 92140 + }, + { + "epoch": 6.260701182225846, + "grad_norm": 0.21256782114505768, + "learning_rate": 2.1771810028536487e-05, + "loss": 3.7631, + "step": 92145 + }, + { + "epoch": 6.261040902296507, + "grad_norm": 0.15897972881793976, + "learning_rate": 2.1767563527653215e-05, + "loss": 3.9008, + "step": 92150 + }, + { + "epoch": 6.261380622367169, + "grad_norm": 0.19116206467151642, + "learning_rate": 2.1763317026769943e-05, + "loss": 3.7852, + "step": 92155 + }, + { + "epoch": 6.261720342437831, + "grad_norm": 0.20485517382621765, + "learning_rate": 2.175907052588667e-05, + "loss": 3.9538, + "step": 92160 + }, + { + "epoch": 6.262060062508493, + "grad_norm": 0.3108145296573639, + "learning_rate": 2.17548240250034e-05, + "loss": 3.7722, + "step": 92165 + }, + { + "epoch": 6.262399782579155, + "grad_norm": 0.23916803300380707, + "learning_rate": 2.1750577524120124e-05, + "loss": 3.8436, + "step": 92170 + }, + { + "epoch": 6.262739502649817, + "grad_norm": 0.16691002249717712, + "learning_rate": 2.1746331023236855e-05, + "loss": 3.9998, + "step": 92175 + }, + { + "epoch": 6.263079222720478, + "grad_norm": 0.20875370502471924, + "learning_rate": 2.174208452235358e-05, + "loss": 3.7089, + "step": 92180 + }, + { + "epoch": 6.26341894279114, + "grad_norm": 0.16069336235523224, + "learning_rate": 2.1737838021470308e-05, + "loss": 3.9138, + "step": 92185 + }, + { + "epoch": 6.263758662861802, + "grad_norm": 0.18857505917549133, + "learning_rate": 2.173359152058704e-05, + "loss": 3.872, + "step": 92190 + }, + { + "epoch": 6.264098382932463, + "grad_norm": 0.18962492048740387, + "learning_rate": 2.1729345019703764e-05, + "loss": 3.9224, + "step": 92195 + }, + { + "epoch": 6.264438103003125, + "grad_norm": 0.19352123141288757, + "learning_rate": 2.1725098518820495e-05, + "loss": 3.8475, + "step": 92200 + }, + { + "epoch": 6.264777823073787, + "grad_norm": 0.17661869525909424, + "learning_rate": 2.172085201793722e-05, + "loss": 3.6061, + "step": 92205 + }, + { + "epoch": 6.265117543144449, + "grad_norm": 0.20869502425193787, + "learning_rate": 2.1716605517053948e-05, + "loss": 3.7469, + "step": 92210 + }, + { + "epoch": 6.265457263215111, + "grad_norm": 0.1800180971622467, + "learning_rate": 2.171235901617068e-05, + "loss": 3.9705, + "step": 92215 + }, + { + "epoch": 6.265796983285773, + "grad_norm": 0.2670784890651703, + "learning_rate": 2.1708112515287404e-05, + "loss": 3.8877, + "step": 92220 + }, + { + "epoch": 6.266136703356434, + "grad_norm": 0.17867405712604523, + "learning_rate": 2.1703866014404132e-05, + "loss": 3.5669, + "step": 92225 + }, + { + "epoch": 6.266476423427096, + "grad_norm": 0.20327401161193848, + "learning_rate": 2.169961951352086e-05, + "loss": 3.5587, + "step": 92230 + }, + { + "epoch": 6.266816143497758, + "grad_norm": 0.2386675626039505, + "learning_rate": 2.1695373012637588e-05, + "loss": 3.7297, + "step": 92235 + }, + { + "epoch": 6.267155863568419, + "grad_norm": 0.17239496111869812, + "learning_rate": 2.1691126511754316e-05, + "loss": 4.0197, + "step": 92240 + }, + { + "epoch": 6.267495583639081, + "grad_norm": 0.1776680201292038, + "learning_rate": 2.1686880010871044e-05, + "loss": 3.9812, + "step": 92245 + }, + { + "epoch": 6.267835303709743, + "grad_norm": 0.2424042522907257, + "learning_rate": 2.1682633509987772e-05, + "loss": 3.956, + "step": 92250 + }, + { + "epoch": 6.268175023780405, + "grad_norm": 0.40005743503570557, + "learning_rate": 2.1678387009104496e-05, + "loss": 3.9524, + "step": 92255 + }, + { + "epoch": 6.268514743851067, + "grad_norm": 0.1360200196504593, + "learning_rate": 2.1674140508221228e-05, + "loss": 4.1281, + "step": 92260 + }, + { + "epoch": 6.268854463921729, + "grad_norm": 0.15886828303337097, + "learning_rate": 2.1669894007337956e-05, + "loss": 3.8806, + "step": 92265 + }, + { + "epoch": 6.26919418399239, + "grad_norm": 0.2016608864068985, + "learning_rate": 2.166564750645468e-05, + "loss": 3.7209, + "step": 92270 + }, + { + "epoch": 6.269533904063052, + "grad_norm": 0.17939412593841553, + "learning_rate": 2.1661401005571412e-05, + "loss": 3.7635, + "step": 92275 + }, + { + "epoch": 6.269873624133714, + "grad_norm": 0.20037396252155304, + "learning_rate": 2.1657154504688136e-05, + "loss": 3.6915, + "step": 92280 + }, + { + "epoch": 6.270213344204375, + "grad_norm": 0.18229761719703674, + "learning_rate": 2.1652908003804868e-05, + "loss": 3.886, + "step": 92285 + }, + { + "epoch": 6.270553064275037, + "grad_norm": 0.45804235339164734, + "learning_rate": 2.1648661502921592e-05, + "loss": 4.0564, + "step": 92290 + }, + { + "epoch": 6.270892784345699, + "grad_norm": 0.18537871539592743, + "learning_rate": 2.164441500203832e-05, + "loss": 3.8029, + "step": 92295 + }, + { + "epoch": 6.271232504416361, + "grad_norm": 0.1606541872024536, + "learning_rate": 2.1640168501155052e-05, + "loss": 3.8949, + "step": 92300 + }, + { + "epoch": 6.271572224487023, + "grad_norm": 0.18457704782485962, + "learning_rate": 2.1635922000271776e-05, + "loss": 3.7603, + "step": 92305 + }, + { + "epoch": 6.271911944557685, + "grad_norm": 0.3436674475669861, + "learning_rate": 2.1631675499388504e-05, + "loss": 4.2088, + "step": 92310 + }, + { + "epoch": 6.272251664628346, + "grad_norm": 0.1754087507724762, + "learning_rate": 2.1627428998505232e-05, + "loss": 3.8678, + "step": 92315 + }, + { + "epoch": 6.272591384699008, + "grad_norm": 0.16619354486465454, + "learning_rate": 2.162318249762196e-05, + "loss": 3.9659, + "step": 92320 + }, + { + "epoch": 6.27293110476967, + "grad_norm": 0.1625409573316574, + "learning_rate": 2.161893599673869e-05, + "loss": 3.9038, + "step": 92325 + }, + { + "epoch": 6.273270824840331, + "grad_norm": 0.2011565864086151, + "learning_rate": 2.1614689495855416e-05, + "loss": 3.8988, + "step": 92330 + }, + { + "epoch": 6.273610544910993, + "grad_norm": 0.2706249952316284, + "learning_rate": 2.1610442994972144e-05, + "loss": 3.857, + "step": 92335 + }, + { + "epoch": 6.273950264981655, + "grad_norm": 0.19370079040527344, + "learning_rate": 2.160619649408887e-05, + "loss": 3.9687, + "step": 92340 + }, + { + "epoch": 6.274289985052317, + "grad_norm": 0.15662437677383423, + "learning_rate": 2.16019499932056e-05, + "loss": 3.6116, + "step": 92345 + }, + { + "epoch": 6.274629705122979, + "grad_norm": 0.17663884162902832, + "learning_rate": 2.159770349232233e-05, + "loss": 3.9036, + "step": 92350 + }, + { + "epoch": 6.274969425193641, + "grad_norm": 0.3757394552230835, + "learning_rate": 2.1593456991439053e-05, + "loss": 3.5841, + "step": 92355 + }, + { + "epoch": 6.275309145264302, + "grad_norm": 0.20479609072208405, + "learning_rate": 2.1589210490555785e-05, + "loss": 3.7171, + "step": 92360 + }, + { + "epoch": 6.275648865334964, + "grad_norm": 0.1603420525789261, + "learning_rate": 2.158496398967251e-05, + "loss": 3.6849, + "step": 92365 + }, + { + "epoch": 6.275988585405626, + "grad_norm": 0.19604015350341797, + "learning_rate": 2.158071748878924e-05, + "loss": 3.6384, + "step": 92370 + }, + { + "epoch": 6.276328305476287, + "grad_norm": 0.19241827726364136, + "learning_rate": 2.1576470987905965e-05, + "loss": 3.8061, + "step": 92375 + }, + { + "epoch": 6.276668025546949, + "grad_norm": 0.16258816421031952, + "learning_rate": 2.1572224487022693e-05, + "loss": 3.6433, + "step": 92380 + }, + { + "epoch": 6.277007745617611, + "grad_norm": 0.21255293488502502, + "learning_rate": 2.1567977986139425e-05, + "loss": 3.8816, + "step": 92385 + }, + { + "epoch": 6.277347465688273, + "grad_norm": 0.21936267614364624, + "learning_rate": 2.156373148525615e-05, + "loss": 3.7755, + "step": 92390 + }, + { + "epoch": 6.277687185758935, + "grad_norm": 0.146976038813591, + "learning_rate": 2.1559484984372877e-05, + "loss": 3.6711, + "step": 92395 + }, + { + "epoch": 6.278026905829597, + "grad_norm": 0.3296806216239929, + "learning_rate": 2.1555238483489605e-05, + "loss": 3.755, + "step": 92400 + }, + { + "epoch": 6.278366625900258, + "grad_norm": 0.17094814777374268, + "learning_rate": 2.1550991982606333e-05, + "loss": 3.8939, + "step": 92405 + }, + { + "epoch": 6.27870634597092, + "grad_norm": 0.17458221316337585, + "learning_rate": 2.154674548172306e-05, + "loss": 3.7238, + "step": 92410 + }, + { + "epoch": 6.279046066041582, + "grad_norm": 0.21329353749752045, + "learning_rate": 2.154249898083979e-05, + "loss": 3.8442, + "step": 92415 + }, + { + "epoch": 6.279385786112243, + "grad_norm": 0.17239722609519958, + "learning_rate": 2.1538252479956517e-05, + "loss": 3.7641, + "step": 92420 + }, + { + "epoch": 6.279725506182905, + "grad_norm": 0.1447230875492096, + "learning_rate": 2.1534005979073242e-05, + "loss": 3.9694, + "step": 92425 + }, + { + "epoch": 6.2800652262535674, + "grad_norm": 0.2515372931957245, + "learning_rate": 2.1529759478189973e-05, + "loss": 3.6838, + "step": 92430 + }, + { + "epoch": 6.280404946324229, + "grad_norm": 0.19982239603996277, + "learning_rate": 2.15255129773067e-05, + "loss": 4.0759, + "step": 92435 + }, + { + "epoch": 6.280744666394891, + "grad_norm": 0.18230773508548737, + "learning_rate": 2.1521266476423426e-05, + "loss": 3.7656, + "step": 92440 + }, + { + "epoch": 6.281084386465553, + "grad_norm": 0.5063667297363281, + "learning_rate": 2.1517019975540157e-05, + "loss": 4.0052, + "step": 92445 + }, + { + "epoch": 6.281424106536214, + "grad_norm": 0.23078224062919617, + "learning_rate": 2.1512773474656882e-05, + "loss": 3.7265, + "step": 92450 + }, + { + "epoch": 6.281763826606876, + "grad_norm": 0.16173362731933594, + "learning_rate": 2.1508526973773613e-05, + "loss": 3.8297, + "step": 92455 + }, + { + "epoch": 6.282103546677538, + "grad_norm": 0.16080129146575928, + "learning_rate": 2.150428047289034e-05, + "loss": 4.0717, + "step": 92460 + }, + { + "epoch": 6.282443266748199, + "grad_norm": 0.2509477436542511, + "learning_rate": 2.1500033972007066e-05, + "loss": 3.7281, + "step": 92465 + }, + { + "epoch": 6.282782986818861, + "grad_norm": 0.3058795630931854, + "learning_rate": 2.1495787471123797e-05, + "loss": 3.9489, + "step": 92470 + }, + { + "epoch": 6.2831227068895235, + "grad_norm": 0.2125069946050644, + "learning_rate": 2.1491540970240522e-05, + "loss": 3.9336, + "step": 92475 + }, + { + "epoch": 6.283462426960185, + "grad_norm": 0.17323100566864014, + "learning_rate": 2.148729446935725e-05, + "loss": 3.9058, + "step": 92480 + }, + { + "epoch": 6.283802147030847, + "grad_norm": 0.16257569193840027, + "learning_rate": 2.1483047968473978e-05, + "loss": 3.7858, + "step": 92485 + }, + { + "epoch": 6.284141867101509, + "grad_norm": 0.22857972979545593, + "learning_rate": 2.1478801467590706e-05, + "loss": 3.5659, + "step": 92490 + }, + { + "epoch": 6.28448158717217, + "grad_norm": 0.18311195075511932, + "learning_rate": 2.1474554966707434e-05, + "loss": 3.7871, + "step": 92495 + }, + { + "epoch": 6.284821307242832, + "grad_norm": 0.2680303454399109, + "learning_rate": 2.1470308465824162e-05, + "loss": 3.8028, + "step": 92500 + }, + { + "epoch": 6.285161027313494, + "grad_norm": 0.1814599484205246, + "learning_rate": 2.146606196494089e-05, + "loss": 3.9403, + "step": 92505 + }, + { + "epoch": 6.285500747384155, + "grad_norm": 0.17880423367023468, + "learning_rate": 2.1461815464057618e-05, + "loss": 3.709, + "step": 92510 + }, + { + "epoch": 6.285840467454817, + "grad_norm": 0.19357553124427795, + "learning_rate": 2.1457568963174346e-05, + "loss": 3.6721, + "step": 92515 + }, + { + "epoch": 6.2861801875254795, + "grad_norm": 0.3977069854736328, + "learning_rate": 2.1453322462291074e-05, + "loss": 3.6087, + "step": 92520 + }, + { + "epoch": 6.286519907596141, + "grad_norm": 0.20398126542568207, + "learning_rate": 2.14490759614078e-05, + "loss": 3.9137, + "step": 92525 + }, + { + "epoch": 6.286859627666803, + "grad_norm": 0.17811337113380432, + "learning_rate": 2.144482946052453e-05, + "loss": 3.9771, + "step": 92530 + }, + { + "epoch": 6.287199347737464, + "grad_norm": 0.1610080450773239, + "learning_rate": 2.1440582959641255e-05, + "loss": 3.8497, + "step": 92535 + }, + { + "epoch": 6.287539067808126, + "grad_norm": 0.20508401095867157, + "learning_rate": 2.1436336458757986e-05, + "loss": 4.0356, + "step": 92540 + }, + { + "epoch": 6.287878787878788, + "grad_norm": 0.22044043242931366, + "learning_rate": 2.1432089957874714e-05, + "loss": 3.8759, + "step": 92545 + }, + { + "epoch": 6.288218507949449, + "grad_norm": 0.14300701022148132, + "learning_rate": 2.142784345699144e-05, + "loss": 3.4817, + "step": 92550 + }, + { + "epoch": 6.288558228020111, + "grad_norm": 0.16060246527194977, + "learning_rate": 2.142359695610817e-05, + "loss": 3.8655, + "step": 92555 + }, + { + "epoch": 6.288897948090773, + "grad_norm": 0.1850554496049881, + "learning_rate": 2.1419350455224895e-05, + "loss": 3.8928, + "step": 92560 + }, + { + "epoch": 6.289237668161435, + "grad_norm": 0.15955454111099243, + "learning_rate": 2.1415103954341623e-05, + "loss": 3.7447, + "step": 92565 + }, + { + "epoch": 6.289577388232097, + "grad_norm": 0.19388926029205322, + "learning_rate": 2.141085745345835e-05, + "loss": 3.9121, + "step": 92570 + }, + { + "epoch": 6.289917108302759, + "grad_norm": 0.23300068080425262, + "learning_rate": 2.140661095257508e-05, + "loss": 3.9199, + "step": 92575 + }, + { + "epoch": 6.29025682837342, + "grad_norm": 0.23576337099075317, + "learning_rate": 2.1402364451691807e-05, + "loss": 3.7345, + "step": 92580 + }, + { + "epoch": 6.290596548444082, + "grad_norm": 0.15475693345069885, + "learning_rate": 2.139896725098519e-05, + "loss": 3.8106, + "step": 92585 + }, + { + "epoch": 6.290936268514744, + "grad_norm": 0.16336338222026825, + "learning_rate": 2.1394720750101917e-05, + "loss": 3.9574, + "step": 92590 + }, + { + "epoch": 6.291275988585405, + "grad_norm": 0.1967071294784546, + "learning_rate": 2.1390474249218645e-05, + "loss": 3.7301, + "step": 92595 + }, + { + "epoch": 6.291615708656067, + "grad_norm": 0.3520631194114685, + "learning_rate": 2.1386227748335373e-05, + "loss": 3.9592, + "step": 92600 + }, + { + "epoch": 6.291955428726729, + "grad_norm": 0.1749131828546524, + "learning_rate": 2.13819812474521e-05, + "loss": 4.0154, + "step": 92605 + }, + { + "epoch": 6.292295148797391, + "grad_norm": 0.17725972831249237, + "learning_rate": 2.137773474656883e-05, + "loss": 3.5896, + "step": 92610 + }, + { + "epoch": 6.292634868868053, + "grad_norm": 0.2538590431213379, + "learning_rate": 2.1373488245685554e-05, + "loss": 3.8637, + "step": 92615 + }, + { + "epoch": 6.292974588938715, + "grad_norm": 0.19767387211322784, + "learning_rate": 2.1369241744802285e-05, + "loss": 3.8463, + "step": 92620 + }, + { + "epoch": 6.293314309009376, + "grad_norm": 0.22050873935222626, + "learning_rate": 2.1364995243919013e-05, + "loss": 3.727, + "step": 92625 + }, + { + "epoch": 6.293654029080038, + "grad_norm": 0.19821837544441223, + "learning_rate": 2.1360748743035738e-05, + "loss": 3.8631, + "step": 92630 + }, + { + "epoch": 6.2939937491507, + "grad_norm": 0.18449337780475616, + "learning_rate": 2.135650224215247e-05, + "loss": 3.9881, + "step": 92635 + }, + { + "epoch": 6.294333469221361, + "grad_norm": 0.3471803367137909, + "learning_rate": 2.1352255741269194e-05, + "loss": 3.8554, + "step": 92640 + }, + { + "epoch": 6.294673189292023, + "grad_norm": 0.16115449368953705, + "learning_rate": 2.1348009240385922e-05, + "loss": 3.7434, + "step": 92645 + }, + { + "epoch": 6.295012909362685, + "grad_norm": 0.18648894131183624, + "learning_rate": 2.134376273950265e-05, + "loss": 3.9154, + "step": 92650 + }, + { + "epoch": 6.295352629433347, + "grad_norm": 0.1486605554819107, + "learning_rate": 2.1339516238619378e-05, + "loss": 3.9757, + "step": 92655 + }, + { + "epoch": 6.295692349504009, + "grad_norm": 0.15827982127666473, + "learning_rate": 2.133526973773611e-05, + "loss": 3.7959, + "step": 92660 + }, + { + "epoch": 6.296032069574671, + "grad_norm": 0.4180581271648407, + "learning_rate": 2.1331023236852834e-05, + "loss": 3.8796, + "step": 92665 + }, + { + "epoch": 6.296371789645332, + "grad_norm": 0.145726278424263, + "learning_rate": 2.1326776735969562e-05, + "loss": 3.7892, + "step": 92670 + }, + { + "epoch": 6.296711509715994, + "grad_norm": 0.13101723790168762, + "learning_rate": 2.132253023508629e-05, + "loss": 3.7477, + "step": 92675 + }, + { + "epoch": 6.297051229786656, + "grad_norm": 0.16594110429286957, + "learning_rate": 2.1318283734203018e-05, + "loss": 4.0085, + "step": 92680 + }, + { + "epoch": 6.297390949857317, + "grad_norm": 0.2001670002937317, + "learning_rate": 2.1314037233319746e-05, + "loss": 3.7931, + "step": 92685 + }, + { + "epoch": 6.297730669927979, + "grad_norm": 0.1712753027677536, + "learning_rate": 2.1309790732436474e-05, + "loss": 3.8894, + "step": 92690 + }, + { + "epoch": 6.298070389998641, + "grad_norm": 0.2238607406616211, + "learning_rate": 2.1305544231553202e-05, + "loss": 3.8826, + "step": 92695 + }, + { + "epoch": 6.298410110069303, + "grad_norm": 0.20939885079860687, + "learning_rate": 2.1301297730669927e-05, + "loss": 3.7705, + "step": 92700 + }, + { + "epoch": 6.298749830139965, + "grad_norm": 0.1835762858390808, + "learning_rate": 2.1297051229786658e-05, + "loss": 3.6386, + "step": 92705 + }, + { + "epoch": 6.299089550210627, + "grad_norm": 0.1833743005990982, + "learning_rate": 2.1292804728903386e-05, + "loss": 3.7771, + "step": 92710 + }, + { + "epoch": 6.299429270281288, + "grad_norm": 0.29207557439804077, + "learning_rate": 2.128855822802011e-05, + "loss": 4.149, + "step": 92715 + }, + { + "epoch": 6.29976899035195, + "grad_norm": 0.2634355127811432, + "learning_rate": 2.1284311727136842e-05, + "loss": 4.1642, + "step": 92720 + }, + { + "epoch": 6.300108710422612, + "grad_norm": 0.16283224523067474, + "learning_rate": 2.1280065226253567e-05, + "loss": 3.7892, + "step": 92725 + }, + { + "epoch": 6.300448430493273, + "grad_norm": 0.15610933303833008, + "learning_rate": 2.127666802554695e-05, + "loss": 4.0187, + "step": 92730 + }, + { + "epoch": 6.300788150563935, + "grad_norm": 0.15086901187896729, + "learning_rate": 2.1272421524663677e-05, + "loss": 3.8515, + "step": 92735 + }, + { + "epoch": 6.3011278706345974, + "grad_norm": 0.19434110820293427, + "learning_rate": 2.126817502378041e-05, + "loss": 3.8211, + "step": 92740 + }, + { + "epoch": 6.301467590705259, + "grad_norm": 3.0459423065185547, + "learning_rate": 2.1263928522897133e-05, + "loss": 3.7495, + "step": 92745 + }, + { + "epoch": 6.301807310775921, + "grad_norm": 0.14196397364139557, + "learning_rate": 2.125968202201386e-05, + "loss": 3.6103, + "step": 92750 + }, + { + "epoch": 6.302147030846583, + "grad_norm": 0.15513953566551208, + "learning_rate": 2.125543552113059e-05, + "loss": 3.4822, + "step": 92755 + }, + { + "epoch": 6.302486750917244, + "grad_norm": 1.6527396440505981, + "learning_rate": 2.1251189020247317e-05, + "loss": 3.7257, + "step": 92760 + }, + { + "epoch": 6.302826470987906, + "grad_norm": 0.1676974594593048, + "learning_rate": 2.1246942519364045e-05, + "loss": 3.6551, + "step": 92765 + }, + { + "epoch": 6.303166191058568, + "grad_norm": 0.15296611189842224, + "learning_rate": 2.1242696018480773e-05, + "loss": 3.962, + "step": 92770 + }, + { + "epoch": 6.303505911129229, + "grad_norm": 0.1793835610151291, + "learning_rate": 2.12384495175975e-05, + "loss": 3.8714, + "step": 92775 + }, + { + "epoch": 6.303845631199891, + "grad_norm": 0.17269133031368256, + "learning_rate": 2.123420301671423e-05, + "loss": 3.8287, + "step": 92780 + }, + { + "epoch": 6.3041853512705535, + "grad_norm": 0.17909084260463715, + "learning_rate": 2.1229956515830957e-05, + "loss": 3.8706, + "step": 92785 + }, + { + "epoch": 6.304525071341215, + "grad_norm": 0.16879765689373016, + "learning_rate": 2.1225710014947685e-05, + "loss": 3.8686, + "step": 92790 + }, + { + "epoch": 6.304864791411877, + "grad_norm": 0.2325674146413803, + "learning_rate": 2.1221463514064413e-05, + "loss": 3.7958, + "step": 92795 + }, + { + "epoch": 6.305204511482539, + "grad_norm": 0.1967020034790039, + "learning_rate": 2.121721701318114e-05, + "loss": 3.8754, + "step": 92800 + }, + { + "epoch": 6.3055442315532, + "grad_norm": 0.26287758350372314, + "learning_rate": 2.1212970512297866e-05, + "loss": 3.8695, + "step": 92805 + }, + { + "epoch": 6.305883951623862, + "grad_norm": 0.18480893969535828, + "learning_rate": 2.1208724011414597e-05, + "loss": 3.8255, + "step": 92810 + }, + { + "epoch": 6.306223671694523, + "grad_norm": 0.21533599495887756, + "learning_rate": 2.1204477510531322e-05, + "loss": 3.8129, + "step": 92815 + }, + { + "epoch": 6.306563391765185, + "grad_norm": 0.16683191061019897, + "learning_rate": 2.120023100964805e-05, + "loss": 3.6254, + "step": 92820 + }, + { + "epoch": 6.306903111835847, + "grad_norm": 0.1253999024629593, + "learning_rate": 2.119598450876478e-05, + "loss": 3.9416, + "step": 92825 + }, + { + "epoch": 6.307242831906509, + "grad_norm": 0.16347837448120117, + "learning_rate": 2.1191738007881506e-05, + "loss": 3.8907, + "step": 92830 + }, + { + "epoch": 6.307582551977171, + "grad_norm": 0.16917867958545685, + "learning_rate": 2.1187491506998234e-05, + "loss": 3.9185, + "step": 92835 + }, + { + "epoch": 6.307922272047833, + "grad_norm": 0.39185577630996704, + "learning_rate": 2.1183245006114962e-05, + "loss": 3.94, + "step": 92840 + }, + { + "epoch": 6.308261992118494, + "grad_norm": 0.2404075264930725, + "learning_rate": 2.117899850523169e-05, + "loss": 3.8665, + "step": 92845 + }, + { + "epoch": 6.308601712189156, + "grad_norm": 0.25637054443359375, + "learning_rate": 2.1174752004348418e-05, + "loss": 3.8155, + "step": 92850 + }, + { + "epoch": 6.308941432259818, + "grad_norm": 0.25543728470802307, + "learning_rate": 2.1170505503465146e-05, + "loss": 3.8227, + "step": 92855 + }, + { + "epoch": 6.309281152330479, + "grad_norm": 0.19051969051361084, + "learning_rate": 2.1166259002581874e-05, + "loss": 3.817, + "step": 92860 + }, + { + "epoch": 6.309620872401141, + "grad_norm": 0.20049796998500824, + "learning_rate": 2.1162012501698602e-05, + "loss": 4.043, + "step": 92865 + }, + { + "epoch": 6.309960592471803, + "grad_norm": 0.16858385503292084, + "learning_rate": 2.115776600081533e-05, + "loss": 3.7108, + "step": 92870 + }, + { + "epoch": 6.310300312542465, + "grad_norm": 0.18796555697917938, + "learning_rate": 2.1153519499932058e-05, + "loss": 3.8117, + "step": 92875 + }, + { + "epoch": 6.310640032613127, + "grad_norm": 0.17552374303340912, + "learning_rate": 2.1149272999048786e-05, + "loss": 3.7052, + "step": 92880 + }, + { + "epoch": 6.310979752683789, + "grad_norm": 0.1638011336326599, + "learning_rate": 2.1145026498165514e-05, + "loss": 3.9174, + "step": 92885 + }, + { + "epoch": 6.31131947275445, + "grad_norm": 0.1592295616865158, + "learning_rate": 2.114077999728224e-05, + "loss": 3.736, + "step": 92890 + }, + { + "epoch": 6.311659192825112, + "grad_norm": 0.2705751955509186, + "learning_rate": 2.113653349639897e-05, + "loss": 3.6566, + "step": 92895 + }, + { + "epoch": 6.311998912895774, + "grad_norm": 0.15790016949176788, + "learning_rate": 2.1132286995515694e-05, + "loss": 3.9483, + "step": 92900 + }, + { + "epoch": 6.312338632966435, + "grad_norm": 0.16298818588256836, + "learning_rate": 2.1128040494632422e-05, + "loss": 3.7959, + "step": 92905 + }, + { + "epoch": 6.312678353037097, + "grad_norm": 0.17871885001659393, + "learning_rate": 2.1123793993749154e-05, + "loss": 3.7819, + "step": 92910 + }, + { + "epoch": 6.313018073107759, + "grad_norm": 0.16784217953681946, + "learning_rate": 2.111954749286588e-05, + "loss": 3.9177, + "step": 92915 + }, + { + "epoch": 6.313357793178421, + "grad_norm": 0.16115981340408325, + "learning_rate": 2.1115300991982606e-05, + "loss": 3.7475, + "step": 92920 + }, + { + "epoch": 6.313697513249083, + "grad_norm": 0.18721474707126617, + "learning_rate": 2.1111054491099334e-05, + "loss": 3.7499, + "step": 92925 + }, + { + "epoch": 6.314037233319745, + "grad_norm": 0.3325623571872711, + "learning_rate": 2.1106807990216062e-05, + "loss": 3.9106, + "step": 92930 + }, + { + "epoch": 6.314376953390406, + "grad_norm": 0.18666696548461914, + "learning_rate": 2.110256148933279e-05, + "loss": 3.7039, + "step": 92935 + }, + { + "epoch": 6.314716673461068, + "grad_norm": 0.15721793472766876, + "learning_rate": 2.109831498844952e-05, + "loss": 3.7002, + "step": 92940 + }, + { + "epoch": 6.31505639353173, + "grad_norm": 0.17834657430648804, + "learning_rate": 2.1094068487566247e-05, + "loss": 3.8087, + "step": 92945 + }, + { + "epoch": 6.315396113602391, + "grad_norm": 0.1932317316532135, + "learning_rate": 2.1089821986682975e-05, + "loss": 3.8278, + "step": 92950 + }, + { + "epoch": 6.315735833673053, + "grad_norm": 0.1472780406475067, + "learning_rate": 2.1085575485799703e-05, + "loss": 3.9177, + "step": 92955 + }, + { + "epoch": 6.316075553743715, + "grad_norm": 0.16521063446998596, + "learning_rate": 2.108132898491643e-05, + "loss": 4.0872, + "step": 92960 + }, + { + "epoch": 6.316415273814377, + "grad_norm": 0.17418909072875977, + "learning_rate": 2.107708248403316e-05, + "loss": 3.6819, + "step": 92965 + }, + { + "epoch": 6.316754993885039, + "grad_norm": 0.17898032069206238, + "learning_rate": 2.1072835983149887e-05, + "loss": 3.6706, + "step": 92970 + }, + { + "epoch": 6.317094713955701, + "grad_norm": 0.1589546799659729, + "learning_rate": 2.106858948226661e-05, + "loss": 3.8451, + "step": 92975 + }, + { + "epoch": 6.317434434026362, + "grad_norm": 0.18630637228488922, + "learning_rate": 2.1064342981383343e-05, + "loss": 3.847, + "step": 92980 + }, + { + "epoch": 6.317774154097024, + "grad_norm": 0.1496521234512329, + "learning_rate": 2.106009648050007e-05, + "loss": 3.769, + "step": 92985 + }, + { + "epoch": 6.318113874167686, + "grad_norm": 0.1352892816066742, + "learning_rate": 2.1055849979616795e-05, + "loss": 4.0516, + "step": 92990 + }, + { + "epoch": 6.318453594238347, + "grad_norm": 0.21550744771957397, + "learning_rate": 2.1051603478733527e-05, + "loss": 3.8448, + "step": 92995 + }, + { + "epoch": 6.318793314309009, + "grad_norm": 0.20123359560966492, + "learning_rate": 2.104735697785025e-05, + "loss": 3.6334, + "step": 93000 + }, + { + "epoch": 6.319133034379671, + "grad_norm": 0.21185745298862457, + "learning_rate": 2.104311047696698e-05, + "loss": 3.9168, + "step": 93005 + }, + { + "epoch": 6.319472754450333, + "grad_norm": 0.14667297899723053, + "learning_rate": 2.1038863976083707e-05, + "loss": 3.9062, + "step": 93010 + }, + { + "epoch": 6.319812474520995, + "grad_norm": 0.2082216590642929, + "learning_rate": 2.1034617475200435e-05, + "loss": 3.7789, + "step": 93015 + }, + { + "epoch": 6.320152194591657, + "grad_norm": 0.27057021856307983, + "learning_rate": 2.1030370974317163e-05, + "loss": 3.6851, + "step": 93020 + }, + { + "epoch": 6.320491914662318, + "grad_norm": 0.1707758903503418, + "learning_rate": 2.102612447343389e-05, + "loss": 3.7822, + "step": 93025 + }, + { + "epoch": 6.32083163473298, + "grad_norm": 0.6476504802703857, + "learning_rate": 2.102187797255062e-05, + "loss": 3.7137, + "step": 93030 + }, + { + "epoch": 6.321171354803642, + "grad_norm": 0.1759043186903, + "learning_rate": 2.1017631471667347e-05, + "loss": 3.8049, + "step": 93035 + }, + { + "epoch": 6.321511074874303, + "grad_norm": 0.16091684997081757, + "learning_rate": 2.1013384970784075e-05, + "loss": 3.6347, + "step": 93040 + }, + { + "epoch": 6.321850794944965, + "grad_norm": 0.16753102838993073, + "learning_rate": 2.1009138469900803e-05, + "loss": 3.6143, + "step": 93045 + }, + { + "epoch": 6.3221905150156275, + "grad_norm": 0.34772101044654846, + "learning_rate": 2.100489196901753e-05, + "loss": 3.8109, + "step": 93050 + }, + { + "epoch": 6.322530235086289, + "grad_norm": 0.19310444593429565, + "learning_rate": 2.100064546813426e-05, + "loss": 3.6611, + "step": 93055 + }, + { + "epoch": 6.322869955156951, + "grad_norm": 0.22940704226493835, + "learning_rate": 2.0996398967250984e-05, + "loss": 3.6892, + "step": 93060 + }, + { + "epoch": 6.323209675227613, + "grad_norm": 0.20270699262619019, + "learning_rate": 2.0992152466367715e-05, + "loss": 3.8074, + "step": 93065 + }, + { + "epoch": 6.323549395298274, + "grad_norm": 0.18451695144176483, + "learning_rate": 2.0987905965484443e-05, + "loss": 3.8884, + "step": 93070 + }, + { + "epoch": 6.323889115368936, + "grad_norm": 0.20495273172855377, + "learning_rate": 2.0983659464601168e-05, + "loss": 3.9487, + "step": 93075 + }, + { + "epoch": 6.324228835439598, + "grad_norm": 0.17708325386047363, + "learning_rate": 2.09794129637179e-05, + "loss": 3.8587, + "step": 93080 + }, + { + "epoch": 6.324568555510259, + "grad_norm": 0.15595072507858276, + "learning_rate": 2.0975166462834624e-05, + "loss": 3.8484, + "step": 93085 + }, + { + "epoch": 6.324908275580921, + "grad_norm": 0.21839387714862823, + "learning_rate": 2.0970919961951352e-05, + "loss": 4.125, + "step": 93090 + }, + { + "epoch": 6.3252479956515835, + "grad_norm": 0.19075319170951843, + "learning_rate": 2.096667346106808e-05, + "loss": 3.78, + "step": 93095 + }, + { + "epoch": 6.325587715722245, + "grad_norm": 0.6413862705230713, + "learning_rate": 2.0962426960184808e-05, + "loss": 3.5917, + "step": 93100 + }, + { + "epoch": 6.325927435792907, + "grad_norm": 0.15698827803134918, + "learning_rate": 2.0958180459301536e-05, + "loss": 3.8489, + "step": 93105 + }, + { + "epoch": 6.326267155863569, + "grad_norm": 0.17114724218845367, + "learning_rate": 2.0953933958418264e-05, + "loss": 3.8195, + "step": 93110 + }, + { + "epoch": 6.32660687593423, + "grad_norm": 0.17340317368507385, + "learning_rate": 2.0949687457534992e-05, + "loss": 3.7185, + "step": 93115 + }, + { + "epoch": 6.326946596004892, + "grad_norm": 0.16639629006385803, + "learning_rate": 2.094544095665172e-05, + "loss": 3.8556, + "step": 93120 + }, + { + "epoch": 6.327286316075554, + "grad_norm": 0.14914298057556152, + "learning_rate": 2.0941194455768448e-05, + "loss": 3.7056, + "step": 93125 + }, + { + "epoch": 6.327626036146215, + "grad_norm": 0.17210771143436432, + "learning_rate": 2.0936947954885176e-05, + "loss": 3.9114, + "step": 93130 + }, + { + "epoch": 6.327965756216877, + "grad_norm": 0.23801067471504211, + "learning_rate": 2.0932701454001904e-05, + "loss": 3.8486, + "step": 93135 + }, + { + "epoch": 6.3283054762875395, + "grad_norm": 0.18147093057632446, + "learning_rate": 2.0928454953118632e-05, + "loss": 3.8492, + "step": 93140 + }, + { + "epoch": 6.328645196358201, + "grad_norm": 0.2699384093284607, + "learning_rate": 2.0924208452235357e-05, + "loss": 3.5978, + "step": 93145 + }, + { + "epoch": 6.328984916428863, + "grad_norm": 0.16000263392925262, + "learning_rate": 2.0919961951352088e-05, + "loss": 3.7747, + "step": 93150 + }, + { + "epoch": 6.329324636499525, + "grad_norm": 0.14269880950450897, + "learning_rate": 2.0915715450468816e-05, + "loss": 3.8889, + "step": 93155 + }, + { + "epoch": 6.329664356570186, + "grad_norm": 0.1762872338294983, + "learning_rate": 2.091146894958554e-05, + "loss": 3.8868, + "step": 93160 + }, + { + "epoch": 6.330004076640848, + "grad_norm": 0.3366875648498535, + "learning_rate": 2.0907222448702272e-05, + "loss": 3.8222, + "step": 93165 + }, + { + "epoch": 6.33034379671151, + "grad_norm": 0.17152121663093567, + "learning_rate": 2.0902975947818997e-05, + "loss": 3.6535, + "step": 93170 + }, + { + "epoch": 6.330683516782171, + "grad_norm": 0.21971353888511658, + "learning_rate": 2.0898729446935725e-05, + "loss": 3.6442, + "step": 93175 + }, + { + "epoch": 6.331023236852833, + "grad_norm": 0.5804756283760071, + "learning_rate": 2.0894482946052453e-05, + "loss": 3.9161, + "step": 93180 + }, + { + "epoch": 6.3313629569234955, + "grad_norm": 0.1792113035917282, + "learning_rate": 2.089023644516918e-05, + "loss": 3.7682, + "step": 93185 + }, + { + "epoch": 6.331702676994157, + "grad_norm": 0.16494400799274445, + "learning_rate": 2.088598994428591e-05, + "loss": 3.7704, + "step": 93190 + }, + { + "epoch": 6.332042397064819, + "grad_norm": 0.1563011109828949, + "learning_rate": 2.0881743443402637e-05, + "loss": 4.0126, + "step": 93195 + }, + { + "epoch": 6.332382117135481, + "grad_norm": 0.18816617131233215, + "learning_rate": 2.0877496942519365e-05, + "loss": 3.7795, + "step": 93200 + }, + { + "epoch": 6.332721837206142, + "grad_norm": 1.0028417110443115, + "learning_rate": 2.0873250441636093e-05, + "loss": 3.7683, + "step": 93205 + }, + { + "epoch": 6.333061557276804, + "grad_norm": 0.15868203341960907, + "learning_rate": 2.086900394075282e-05, + "loss": 3.7146, + "step": 93210 + }, + { + "epoch": 6.333401277347465, + "grad_norm": 0.5786229968070984, + "learning_rate": 2.086475743986955e-05, + "loss": 3.8427, + "step": 93215 + }, + { + "epoch": 6.333740997418127, + "grad_norm": 0.17346426844596863, + "learning_rate": 2.0860510938986277e-05, + "loss": 4.0435, + "step": 93220 + }, + { + "epoch": 6.334080717488789, + "grad_norm": 0.1811761111021042, + "learning_rate": 2.0856264438103005e-05, + "loss": 3.7776, + "step": 93225 + }, + { + "epoch": 6.334420437559451, + "grad_norm": 0.1482975035905838, + "learning_rate": 2.085201793721973e-05, + "loss": 3.6904, + "step": 93230 + }, + { + "epoch": 6.334760157630113, + "grad_norm": 1.2514753341674805, + "learning_rate": 2.084777143633646e-05, + "loss": 4.0448, + "step": 93235 + }, + { + "epoch": 6.335099877700775, + "grad_norm": 0.17755615711212158, + "learning_rate": 2.084352493545319e-05, + "loss": 3.686, + "step": 93240 + }, + { + "epoch": 6.335439597771436, + "grad_norm": 0.1783488243818283, + "learning_rate": 2.0839278434569913e-05, + "loss": 3.9286, + "step": 93245 + }, + { + "epoch": 6.335779317842098, + "grad_norm": 0.3962506651878357, + "learning_rate": 2.0835031933686645e-05, + "loss": 3.9521, + "step": 93250 + }, + { + "epoch": 6.33611903791276, + "grad_norm": 0.17580141127109528, + "learning_rate": 2.083078543280337e-05, + "loss": 3.855, + "step": 93255 + }, + { + "epoch": 6.336458757983421, + "grad_norm": 0.17069968581199646, + "learning_rate": 2.0826538931920097e-05, + "loss": 3.8728, + "step": 93260 + }, + { + "epoch": 6.336798478054083, + "grad_norm": 0.15484431385993958, + "learning_rate": 2.082229243103683e-05, + "loss": 3.8756, + "step": 93265 + }, + { + "epoch": 6.337138198124745, + "grad_norm": 0.2013581395149231, + "learning_rate": 2.0818045930153553e-05, + "loss": 3.7176, + "step": 93270 + }, + { + "epoch": 6.337477918195407, + "grad_norm": 0.16980920732021332, + "learning_rate": 2.081379942927028e-05, + "loss": 3.9986, + "step": 93275 + }, + { + "epoch": 6.337817638266069, + "grad_norm": 0.2386217713356018, + "learning_rate": 2.080955292838701e-05, + "loss": 3.996, + "step": 93280 + }, + { + "epoch": 6.338157358336731, + "grad_norm": 0.2075965255498886, + "learning_rate": 2.0805306427503737e-05, + "loss": 3.8047, + "step": 93285 + }, + { + "epoch": 6.338497078407392, + "grad_norm": 0.16665850579738617, + "learning_rate": 2.0801059926620465e-05, + "loss": 3.8934, + "step": 93290 + }, + { + "epoch": 6.338836798478054, + "grad_norm": 0.15777848660945892, + "learning_rate": 2.0796813425737194e-05, + "loss": 3.839, + "step": 93295 + }, + { + "epoch": 6.339176518548716, + "grad_norm": 0.15201319754123688, + "learning_rate": 2.079256692485392e-05, + "loss": 3.8932, + "step": 93300 + }, + { + "epoch": 6.339516238619377, + "grad_norm": 0.22624239325523376, + "learning_rate": 2.078832042397065e-05, + "loss": 3.9152, + "step": 93305 + }, + { + "epoch": 6.339855958690039, + "grad_norm": 0.43990975618362427, + "learning_rate": 2.0784073923087378e-05, + "loss": 3.7107, + "step": 93310 + }, + { + "epoch": 6.3401956787607014, + "grad_norm": 0.18585766851902008, + "learning_rate": 2.0779827422204106e-05, + "loss": 3.5981, + "step": 93315 + }, + { + "epoch": 6.340535398831363, + "grad_norm": 0.20350873470306396, + "learning_rate": 2.0775580921320834e-05, + "loss": 3.8946, + "step": 93320 + }, + { + "epoch": 6.340875118902025, + "grad_norm": 0.191247820854187, + "learning_rate": 2.077133442043756e-05, + "loss": 3.8274, + "step": 93325 + }, + { + "epoch": 6.341214838972687, + "grad_norm": 0.3627893626689911, + "learning_rate": 2.0767087919554286e-05, + "loss": 3.8394, + "step": 93330 + }, + { + "epoch": 6.341554559043348, + "grad_norm": 0.15542460978031158, + "learning_rate": 2.0762841418671018e-05, + "loss": 3.5802, + "step": 93335 + }, + { + "epoch": 6.34189427911401, + "grad_norm": 0.19081629812717438, + "learning_rate": 2.0758594917787742e-05, + "loss": 3.7314, + "step": 93340 + }, + { + "epoch": 6.342233999184672, + "grad_norm": 1.3195796012878418, + "learning_rate": 2.075434841690447e-05, + "loss": 3.8287, + "step": 93345 + }, + { + "epoch": 6.342573719255333, + "grad_norm": 0.14389610290527344, + "learning_rate": 2.07501019160212e-05, + "loss": 3.565, + "step": 93350 + }, + { + "epoch": 6.342913439325995, + "grad_norm": 0.16145886480808258, + "learning_rate": 2.0745855415137926e-05, + "loss": 3.886, + "step": 93355 + }, + { + "epoch": 6.3432531593966575, + "grad_norm": 0.21880941092967987, + "learning_rate": 2.0741608914254654e-05, + "loss": 3.6471, + "step": 93360 + }, + { + "epoch": 6.343592879467319, + "grad_norm": 0.1783679574728012, + "learning_rate": 2.0737362413371382e-05, + "loss": 3.9154, + "step": 93365 + }, + { + "epoch": 6.343932599537981, + "grad_norm": 0.18894942104816437, + "learning_rate": 2.073311591248811e-05, + "loss": 3.9442, + "step": 93370 + }, + { + "epoch": 6.344272319608643, + "grad_norm": 0.15838104486465454, + "learning_rate": 2.0728869411604838e-05, + "loss": 3.8768, + "step": 93375 + }, + { + "epoch": 6.344612039679304, + "grad_norm": 0.1690409630537033, + "learning_rate": 2.0724622910721566e-05, + "loss": 3.838, + "step": 93380 + }, + { + "epoch": 6.344951759749966, + "grad_norm": 0.21835367381572723, + "learning_rate": 2.0720376409838294e-05, + "loss": 3.9203, + "step": 93385 + }, + { + "epoch": 6.345291479820628, + "grad_norm": 0.4410611093044281, + "learning_rate": 2.0716129908955022e-05, + "loss": 3.8893, + "step": 93390 + }, + { + "epoch": 6.345631199891289, + "grad_norm": 0.1347021758556366, + "learning_rate": 2.071188340807175e-05, + "loss": 3.8548, + "step": 93395 + }, + { + "epoch": 6.345970919961951, + "grad_norm": 0.1973429024219513, + "learning_rate": 2.0707636907188478e-05, + "loss": 3.6564, + "step": 93400 + }, + { + "epoch": 6.3463106400326135, + "grad_norm": 0.29636678099632263, + "learning_rate": 2.0703390406305206e-05, + "loss": 3.805, + "step": 93405 + }, + { + "epoch": 6.346650360103275, + "grad_norm": 0.18024815618991852, + "learning_rate": 2.0699143905421934e-05, + "loss": 3.8373, + "step": 93410 + }, + { + "epoch": 6.346990080173937, + "grad_norm": 2.342334032058716, + "learning_rate": 2.069489740453866e-05, + "loss": 3.7733, + "step": 93415 + }, + { + "epoch": 6.347329800244599, + "grad_norm": 2.903961420059204, + "learning_rate": 2.069065090365539e-05, + "loss": 3.97, + "step": 93420 + }, + { + "epoch": 6.34766952031526, + "grad_norm": 0.18983909487724304, + "learning_rate": 2.0686404402772115e-05, + "loss": 3.8629, + "step": 93425 + }, + { + "epoch": 6.348009240385922, + "grad_norm": 0.19818751513957977, + "learning_rate": 2.0682157901888843e-05, + "loss": 3.653, + "step": 93430 + }, + { + "epoch": 6.348348960456584, + "grad_norm": 0.2977749705314636, + "learning_rate": 2.0677911401005574e-05, + "loss": 3.6106, + "step": 93435 + }, + { + "epoch": 6.348688680527245, + "grad_norm": 0.1366225630044937, + "learning_rate": 2.06736649001223e-05, + "loss": 3.7566, + "step": 93440 + }, + { + "epoch": 6.349028400597907, + "grad_norm": 0.24300380051136017, + "learning_rate": 2.0669418399239027e-05, + "loss": 3.9191, + "step": 93445 + }, + { + "epoch": 6.3493681206685695, + "grad_norm": 0.18688496947288513, + "learning_rate": 2.0665171898355755e-05, + "loss": 4.0217, + "step": 93450 + }, + { + "epoch": 6.349707840739231, + "grad_norm": 0.15836270153522491, + "learning_rate": 2.0660925397472483e-05, + "loss": 3.9595, + "step": 93455 + }, + { + "epoch": 6.350047560809893, + "grad_norm": 0.24505719542503357, + "learning_rate": 2.0656678896589214e-05, + "loss": 3.7405, + "step": 93460 + }, + { + "epoch": 6.350387280880555, + "grad_norm": 0.9197686314582825, + "learning_rate": 2.065243239570594e-05, + "loss": 3.9645, + "step": 93465 + }, + { + "epoch": 6.350727000951216, + "grad_norm": 0.7961097359657288, + "learning_rate": 2.0648185894822667e-05, + "loss": 3.8994, + "step": 93470 + }, + { + "epoch": 6.351066721021878, + "grad_norm": 0.1711624264717102, + "learning_rate": 2.0643939393939395e-05, + "loss": 4.1058, + "step": 93475 + }, + { + "epoch": 6.35140644109254, + "grad_norm": 0.6723896265029907, + "learning_rate": 2.0639692893056123e-05, + "loss": 3.9585, + "step": 93480 + }, + { + "epoch": 6.351746161163201, + "grad_norm": 0.16179655492305756, + "learning_rate": 2.063544639217285e-05, + "loss": 3.8401, + "step": 93485 + }, + { + "epoch": 6.352085881233863, + "grad_norm": 0.14615383744239807, + "learning_rate": 2.063119989128958e-05, + "loss": 3.4632, + "step": 93490 + }, + { + "epoch": 6.352425601304525, + "grad_norm": 0.16185232996940613, + "learning_rate": 2.0626953390406307e-05, + "loss": 3.682, + "step": 93495 + }, + { + "epoch": 6.352765321375187, + "grad_norm": 0.15050356090068817, + "learning_rate": 2.062270688952303e-05, + "loss": 3.8365, + "step": 93500 + }, + { + "epoch": 6.353105041445849, + "grad_norm": 0.16865983605384827, + "learning_rate": 2.0618460388639763e-05, + "loss": 3.6692, + "step": 93505 + }, + { + "epoch": 6.35344476151651, + "grad_norm": 1.656991958618164, + "learning_rate": 2.061421388775649e-05, + "loss": 3.797, + "step": 93510 + }, + { + "epoch": 6.353784481587172, + "grad_norm": 0.16034665703773499, + "learning_rate": 2.0609967386873216e-05, + "loss": 3.9339, + "step": 93515 + }, + { + "epoch": 6.354124201657834, + "grad_norm": 0.17373359203338623, + "learning_rate": 2.0605720885989947e-05, + "loss": 3.5819, + "step": 93520 + }, + { + "epoch": 6.354463921728495, + "grad_norm": 0.12907245755195618, + "learning_rate": 2.0601474385106672e-05, + "loss": 3.9137, + "step": 93525 + }, + { + "epoch": 6.354803641799157, + "grad_norm": 0.2226761281490326, + "learning_rate": 2.05972278842234e-05, + "loss": 3.9406, + "step": 93530 + }, + { + "epoch": 6.355143361869819, + "grad_norm": 0.19289909303188324, + "learning_rate": 2.0592981383340128e-05, + "loss": 3.9243, + "step": 93535 + }, + { + "epoch": 6.355483081940481, + "grad_norm": 0.16070854663848877, + "learning_rate": 2.0588734882456856e-05, + "loss": 3.767, + "step": 93540 + }, + { + "epoch": 6.355822802011143, + "grad_norm": 0.22012686729431152, + "learning_rate": 2.0584488381573587e-05, + "loss": 4.1818, + "step": 93545 + }, + { + "epoch": 6.356162522081805, + "grad_norm": 0.19300492107868195, + "learning_rate": 2.0580241880690312e-05, + "loss": 3.78, + "step": 93550 + }, + { + "epoch": 6.356502242152466, + "grad_norm": 0.9170252084732056, + "learning_rate": 2.057599537980704e-05, + "loss": 3.9119, + "step": 93555 + }, + { + "epoch": 6.356841962223128, + "grad_norm": 0.16892117261886597, + "learning_rate": 2.0571748878923768e-05, + "loss": 3.6136, + "step": 93560 + }, + { + "epoch": 6.35718168229379, + "grad_norm": 0.15750204026699066, + "learning_rate": 2.0567502378040496e-05, + "loss": 3.5534, + "step": 93565 + }, + { + "epoch": 6.357521402364451, + "grad_norm": 0.13947711884975433, + "learning_rate": 2.0563255877157224e-05, + "loss": 3.6712, + "step": 93570 + }, + { + "epoch": 6.357861122435113, + "grad_norm": 0.1618821769952774, + "learning_rate": 2.0559009376273952e-05, + "loss": 3.89, + "step": 93575 + }, + { + "epoch": 6.358200842505775, + "grad_norm": 0.20807036757469177, + "learning_rate": 2.055476287539068e-05, + "loss": 4.0661, + "step": 93580 + }, + { + "epoch": 6.358540562576437, + "grad_norm": 0.17378517985343933, + "learning_rate": 2.0550516374507404e-05, + "loss": 3.8495, + "step": 93585 + }, + { + "epoch": 6.358880282647099, + "grad_norm": 0.203337162733078, + "learning_rate": 2.0546269873624136e-05, + "loss": 3.829, + "step": 93590 + }, + { + "epoch": 6.359220002717761, + "grad_norm": 0.17317265272140503, + "learning_rate": 2.0542023372740864e-05, + "loss": 3.7599, + "step": 93595 + }, + { + "epoch": 6.359559722788422, + "grad_norm": 0.14698316156864166, + "learning_rate": 2.053777687185759e-05, + "loss": 4.008, + "step": 93600 + }, + { + "epoch": 6.359899442859084, + "grad_norm": 0.18557733297348022, + "learning_rate": 2.053353037097432e-05, + "loss": 3.6404, + "step": 93605 + }, + { + "epoch": 6.360239162929746, + "grad_norm": 0.163859024643898, + "learning_rate": 2.0529283870091044e-05, + "loss": 4.0622, + "step": 93610 + }, + { + "epoch": 6.360578883000407, + "grad_norm": 0.15953022241592407, + "learning_rate": 2.0525037369207772e-05, + "loss": 3.9014, + "step": 93615 + }, + { + "epoch": 6.360918603071069, + "grad_norm": 0.14676758646965027, + "learning_rate": 2.05207908683245e-05, + "loss": 3.8385, + "step": 93620 + }, + { + "epoch": 6.3612583231417315, + "grad_norm": 0.19414517283439636, + "learning_rate": 2.051654436744123e-05, + "loss": 4.0325, + "step": 93625 + }, + { + "epoch": 6.361598043212393, + "grad_norm": 0.14429019391536713, + "learning_rate": 2.051229786655796e-05, + "loss": 3.74, + "step": 93630 + }, + { + "epoch": 6.361937763283055, + "grad_norm": 0.14292268455028534, + "learning_rate": 2.0508051365674684e-05, + "loss": 3.8636, + "step": 93635 + }, + { + "epoch": 6.362277483353717, + "grad_norm": 0.16222938895225525, + "learning_rate": 2.0503804864791412e-05, + "loss": 3.7702, + "step": 93640 + }, + { + "epoch": 6.362617203424378, + "grad_norm": 0.1632986068725586, + "learning_rate": 2.049955836390814e-05, + "loss": 4.025, + "step": 93645 + }, + { + "epoch": 6.36295692349504, + "grad_norm": 0.6562703251838684, + "learning_rate": 2.049531186302487e-05, + "loss": 3.7441, + "step": 93650 + }, + { + "epoch": 6.363296643565702, + "grad_norm": 0.17899711430072784, + "learning_rate": 2.0491065362141597e-05, + "loss": 3.7135, + "step": 93655 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 0.1911403387784958, + "learning_rate": 2.0486818861258325e-05, + "loss": 3.6041, + "step": 93660 + }, + { + "epoch": 6.363976083707025, + "grad_norm": 0.16378383338451385, + "learning_rate": 2.0482572360375053e-05, + "loss": 3.7194, + "step": 93665 + }, + { + "epoch": 6.3643158037776875, + "grad_norm": 0.29136937856674194, + "learning_rate": 2.0478325859491777e-05, + "loss": 3.8618, + "step": 93670 + }, + { + "epoch": 6.364655523848349, + "grad_norm": 0.15178757905960083, + "learning_rate": 2.047407935860851e-05, + "loss": 3.8743, + "step": 93675 + }, + { + "epoch": 6.364995243919011, + "grad_norm": 0.20802608132362366, + "learning_rate": 2.0469832857725237e-05, + "loss": 3.5634, + "step": 93680 + }, + { + "epoch": 6.365334963989673, + "grad_norm": 0.1904127597808838, + "learning_rate": 2.046558635684196e-05, + "loss": 4.0446, + "step": 93685 + }, + { + "epoch": 6.365674684060334, + "grad_norm": 0.15657652914524078, + "learning_rate": 2.0461339855958693e-05, + "loss": 3.9773, + "step": 93690 + }, + { + "epoch": 6.366014404130996, + "grad_norm": 0.17308904230594635, + "learning_rate": 2.0457093355075417e-05, + "loss": 3.7471, + "step": 93695 + }, + { + "epoch": 6.366354124201658, + "grad_norm": 0.7549744844436646, + "learning_rate": 2.0452846854192145e-05, + "loss": 3.8108, + "step": 93700 + }, + { + "epoch": 6.366693844272319, + "grad_norm": 0.1662915199995041, + "learning_rate": 2.0448600353308877e-05, + "loss": 3.4783, + "step": 93705 + }, + { + "epoch": 6.367033564342981, + "grad_norm": 0.1780969649553299, + "learning_rate": 2.04443538524256e-05, + "loss": 3.9212, + "step": 93710 + }, + { + "epoch": 6.3673732844136435, + "grad_norm": 0.17247231304645538, + "learning_rate": 2.0440107351542333e-05, + "loss": 3.7928, + "step": 93715 + }, + { + "epoch": 6.367713004484305, + "grad_norm": 0.15032309293746948, + "learning_rate": 2.0435860850659057e-05, + "loss": 3.6388, + "step": 93720 + }, + { + "epoch": 6.368052724554967, + "grad_norm": 0.20697569847106934, + "learning_rate": 2.0431614349775785e-05, + "loss": 3.9103, + "step": 93725 + }, + { + "epoch": 6.368392444625629, + "grad_norm": 0.16229620575904846, + "learning_rate": 2.0427367848892513e-05, + "loss": 3.8487, + "step": 93730 + }, + { + "epoch": 6.36873216469629, + "grad_norm": 0.1674824208021164, + "learning_rate": 2.042312134800924e-05, + "loss": 3.7001, + "step": 93735 + }, + { + "epoch": 6.369071884766952, + "grad_norm": 0.1690962016582489, + "learning_rate": 2.041887484712597e-05, + "loss": 3.6975, + "step": 93740 + }, + { + "epoch": 6.369411604837614, + "grad_norm": 0.23227423429489136, + "learning_rate": 2.0414628346242697e-05, + "loss": 4.0467, + "step": 93745 + }, + { + "epoch": 6.369751324908275, + "grad_norm": 1.239059567451477, + "learning_rate": 2.0410381845359425e-05, + "loss": 3.9244, + "step": 93750 + }, + { + "epoch": 6.370091044978937, + "grad_norm": 0.2883090078830719, + "learning_rate": 2.0406135344476153e-05, + "loss": 3.8649, + "step": 93755 + }, + { + "epoch": 6.3704307650495995, + "grad_norm": 0.16177846491336823, + "learning_rate": 2.040188884359288e-05, + "loss": 3.8699, + "step": 93760 + }, + { + "epoch": 6.370770485120261, + "grad_norm": 0.24733330309391022, + "learning_rate": 2.039764234270961e-05, + "loss": 3.8631, + "step": 93765 + }, + { + "epoch": 6.371110205190923, + "grad_norm": 0.12736348807811737, + "learning_rate": 2.0393395841826334e-05, + "loss": 3.9381, + "step": 93770 + }, + { + "epoch": 6.371449925261585, + "grad_norm": 0.1589558720588684, + "learning_rate": 2.0389149340943065e-05, + "loss": 3.7461, + "step": 93775 + }, + { + "epoch": 6.371789645332246, + "grad_norm": 0.20020249485969543, + "learning_rate": 2.038490284005979e-05, + "loss": 3.8723, + "step": 93780 + }, + { + "epoch": 6.372129365402908, + "grad_norm": 0.17673641443252563, + "learning_rate": 2.0380656339176518e-05, + "loss": 3.8712, + "step": 93785 + }, + { + "epoch": 6.37246908547357, + "grad_norm": 0.15371675789356232, + "learning_rate": 2.037640983829325e-05, + "loss": 3.7741, + "step": 93790 + }, + { + "epoch": 6.372808805544231, + "grad_norm": 0.14156298339366913, + "learning_rate": 2.0372163337409974e-05, + "loss": 3.9009, + "step": 93795 + }, + { + "epoch": 6.373148525614893, + "grad_norm": 0.1546567678451538, + "learning_rate": 2.0367916836526705e-05, + "loss": 3.6447, + "step": 93800 + }, + { + "epoch": 6.3734882456855555, + "grad_norm": 0.16576029360294342, + "learning_rate": 2.036367033564343e-05, + "loss": 3.8836, + "step": 93805 + }, + { + "epoch": 6.373827965756217, + "grad_norm": 0.17701461911201477, + "learning_rate": 2.0359423834760158e-05, + "loss": 3.9652, + "step": 93810 + }, + { + "epoch": 6.374167685826879, + "grad_norm": 0.20557238161563873, + "learning_rate": 2.0355177333876886e-05, + "loss": 3.7977, + "step": 93815 + }, + { + "epoch": 6.374507405897541, + "grad_norm": 0.143763467669487, + "learning_rate": 2.0350930832993614e-05, + "loss": 3.7514, + "step": 93820 + }, + { + "epoch": 6.374847125968202, + "grad_norm": 0.15330757200717926, + "learning_rate": 2.0346684332110342e-05, + "loss": 3.7229, + "step": 93825 + }, + { + "epoch": 6.375186846038864, + "grad_norm": 0.18000127375125885, + "learning_rate": 2.034243783122707e-05, + "loss": 3.8778, + "step": 93830 + }, + { + "epoch": 6.375526566109526, + "grad_norm": 0.2806156575679779, + "learning_rate": 2.0338191330343798e-05, + "loss": 3.7237, + "step": 93835 + }, + { + "epoch": 6.375866286180187, + "grad_norm": 0.14646591246128082, + "learning_rate": 2.0333944829460526e-05, + "loss": 3.7984, + "step": 93840 + }, + { + "epoch": 6.376206006250849, + "grad_norm": 1.1159279346466064, + "learning_rate": 2.0329698328577254e-05, + "loss": 3.8422, + "step": 93845 + }, + { + "epoch": 6.3765457263215115, + "grad_norm": 0.16675415635108948, + "learning_rate": 2.0325451827693982e-05, + "loss": 3.7049, + "step": 93850 + }, + { + "epoch": 6.376885446392173, + "grad_norm": 0.22381210327148438, + "learning_rate": 2.0321205326810707e-05, + "loss": 3.5996, + "step": 93855 + }, + { + "epoch": 6.377225166462835, + "grad_norm": 0.1293320208787918, + "learning_rate": 2.0316958825927438e-05, + "loss": 4.0206, + "step": 93860 + }, + { + "epoch": 6.377564886533497, + "grad_norm": 0.233649343252182, + "learning_rate": 2.0312712325044163e-05, + "loss": 3.785, + "step": 93865 + }, + { + "epoch": 6.377904606604158, + "grad_norm": 0.17538411915302277, + "learning_rate": 2.030846582416089e-05, + "loss": 3.7921, + "step": 93870 + }, + { + "epoch": 6.37824432667482, + "grad_norm": 0.1714346557855606, + "learning_rate": 2.0304219323277622e-05, + "loss": 3.8759, + "step": 93875 + }, + { + "epoch": 6.378584046745482, + "grad_norm": 0.22131070494651794, + "learning_rate": 2.0299972822394347e-05, + "loss": 3.7554, + "step": 93880 + }, + { + "epoch": 6.378923766816143, + "grad_norm": 0.21927958726882935, + "learning_rate": 2.0295726321511078e-05, + "loss": 3.5737, + "step": 93885 + }, + { + "epoch": 6.3792634868868054, + "grad_norm": 0.24479690194129944, + "learning_rate": 2.0291479820627803e-05, + "loss": 3.7135, + "step": 93890 + }, + { + "epoch": 6.379603206957467, + "grad_norm": 0.15170659124851227, + "learning_rate": 2.028723331974453e-05, + "loss": 3.4505, + "step": 93895 + }, + { + "epoch": 6.379942927028129, + "grad_norm": 0.18836767971515656, + "learning_rate": 2.028298681886126e-05, + "loss": 3.8296, + "step": 93900 + }, + { + "epoch": 6.380282647098791, + "grad_norm": 0.17013457417488098, + "learning_rate": 2.0278740317977987e-05, + "loss": 3.7739, + "step": 93905 + }, + { + "epoch": 6.380622367169452, + "grad_norm": 0.4306323528289795, + "learning_rate": 2.0274493817094715e-05, + "loss": 3.5507, + "step": 93910 + }, + { + "epoch": 6.380962087240114, + "grad_norm": 0.1666158139705658, + "learning_rate": 2.0270247316211443e-05, + "loss": 4.0184, + "step": 93915 + }, + { + "epoch": 6.381301807310776, + "grad_norm": 0.27248504757881165, + "learning_rate": 2.026600081532817e-05, + "loss": 3.3646, + "step": 93920 + }, + { + "epoch": 6.381641527381437, + "grad_norm": 0.149455264210701, + "learning_rate": 2.02617543144449e-05, + "loss": 3.764, + "step": 93925 + }, + { + "epoch": 6.381981247452099, + "grad_norm": 0.2074078917503357, + "learning_rate": 2.0257507813561627e-05, + "loss": 4.0275, + "step": 93930 + }, + { + "epoch": 6.3823209675227615, + "grad_norm": 0.16697411239147186, + "learning_rate": 2.0253261312678355e-05, + "loss": 3.7807, + "step": 93935 + }, + { + "epoch": 6.382660687593423, + "grad_norm": 0.19630372524261475, + "learning_rate": 2.024901481179508e-05, + "loss": 3.8452, + "step": 93940 + }, + { + "epoch": 6.383000407664085, + "grad_norm": 0.3378649652004242, + "learning_rate": 2.024476831091181e-05, + "loss": 3.8719, + "step": 93945 + }, + { + "epoch": 6.383340127734747, + "grad_norm": 0.18867038190364838, + "learning_rate": 2.0240521810028535e-05, + "loss": 3.716, + "step": 93950 + }, + { + "epoch": 6.383679847805408, + "grad_norm": 0.1601042002439499, + "learning_rate": 2.0236275309145263e-05, + "loss": 3.9403, + "step": 93955 + }, + { + "epoch": 6.38401956787607, + "grad_norm": 0.17121060192584991, + "learning_rate": 2.0232028808261995e-05, + "loss": 3.7441, + "step": 93960 + }, + { + "epoch": 6.384359287946732, + "grad_norm": 0.17221184074878693, + "learning_rate": 2.022778230737872e-05, + "loss": 3.8186, + "step": 93965 + }, + { + "epoch": 6.384699008017393, + "grad_norm": 0.18146660923957825, + "learning_rate": 2.022353580649545e-05, + "loss": 4.051, + "step": 93970 + }, + { + "epoch": 6.385038728088055, + "grad_norm": 0.1321161985397339, + "learning_rate": 2.0219289305612175e-05, + "loss": 3.7415, + "step": 93975 + }, + { + "epoch": 6.3853784481587175, + "grad_norm": 0.21278013288974762, + "learning_rate": 2.0215042804728903e-05, + "loss": 3.8746, + "step": 93980 + }, + { + "epoch": 6.385718168229379, + "grad_norm": 0.2109779715538025, + "learning_rate": 2.0210796303845635e-05, + "loss": 3.7719, + "step": 93985 + }, + { + "epoch": 6.386057888300041, + "grad_norm": 0.6093006730079651, + "learning_rate": 2.020654980296236e-05, + "loss": 3.7279, + "step": 93990 + }, + { + "epoch": 6.386397608370703, + "grad_norm": 0.17884519696235657, + "learning_rate": 2.0202303302079087e-05, + "loss": 3.6855, + "step": 93995 + }, + { + "epoch": 6.386737328441364, + "grad_norm": 0.17958299815654755, + "learning_rate": 2.0198056801195815e-05, + "loss": 3.699, + "step": 94000 + }, + { + "epoch": 6.387077048512026, + "grad_norm": 0.20259468257427216, + "learning_rate": 2.0193810300312543e-05, + "loss": 3.7568, + "step": 94005 + }, + { + "epoch": 6.387416768582688, + "grad_norm": 0.16577792167663574, + "learning_rate": 2.018956379942927e-05, + "loss": 4.051, + "step": 94010 + }, + { + "epoch": 6.387756488653349, + "grad_norm": 0.32488396763801575, + "learning_rate": 2.0185317298546e-05, + "loss": 3.9968, + "step": 94015 + }, + { + "epoch": 6.388096208724011, + "grad_norm": 0.23457103967666626, + "learning_rate": 2.0181070797662728e-05, + "loss": 3.8711, + "step": 94020 + }, + { + "epoch": 6.3884359287946735, + "grad_norm": 0.41553983092308044, + "learning_rate": 2.0176824296779452e-05, + "loss": 3.9462, + "step": 94025 + }, + { + "epoch": 6.388775648865335, + "grad_norm": 0.18553166091442108, + "learning_rate": 2.0172577795896184e-05, + "loss": 3.8416, + "step": 94030 + }, + { + "epoch": 6.389115368935997, + "grad_norm": 0.1959758698940277, + "learning_rate": 2.016833129501291e-05, + "loss": 3.906, + "step": 94035 + }, + { + "epoch": 6.389455089006659, + "grad_norm": 0.7565122246742249, + "learning_rate": 2.0164084794129636e-05, + "loss": 3.9554, + "step": 94040 + }, + { + "epoch": 6.38979480907732, + "grad_norm": 0.47012507915496826, + "learning_rate": 2.0159838293246368e-05, + "loss": 3.9355, + "step": 94045 + }, + { + "epoch": 6.390134529147982, + "grad_norm": 0.14803801476955414, + "learning_rate": 2.0155591792363092e-05, + "loss": 4.0066, + "step": 94050 + }, + { + "epoch": 6.390474249218644, + "grad_norm": 0.16375558078289032, + "learning_rate": 2.0151345291479824e-05, + "loss": 3.5828, + "step": 94055 + }, + { + "epoch": 6.390813969289305, + "grad_norm": 0.18857257068157196, + "learning_rate": 2.0147098790596548e-05, + "loss": 3.6549, + "step": 94060 + }, + { + "epoch": 6.391153689359967, + "grad_norm": 0.155385822057724, + "learning_rate": 2.0142852289713276e-05, + "loss": 3.8192, + "step": 94065 + }, + { + "epoch": 6.3914934094306295, + "grad_norm": 0.17805777490139008, + "learning_rate": 2.0138605788830008e-05, + "loss": 4.0292, + "step": 94070 + }, + { + "epoch": 6.391833129501291, + "grad_norm": 0.6078669428825378, + "learning_rate": 2.0134359287946732e-05, + "loss": 4.0917, + "step": 94075 + }, + { + "epoch": 6.392172849571953, + "grad_norm": 0.14908762276172638, + "learning_rate": 2.013011278706346e-05, + "loss": 3.8853, + "step": 94080 + }, + { + "epoch": 6.392512569642615, + "grad_norm": 0.16254371404647827, + "learning_rate": 2.0125866286180188e-05, + "loss": 3.8679, + "step": 94085 + }, + { + "epoch": 6.392852289713276, + "grad_norm": 0.37427693605422974, + "learning_rate": 2.0121619785296916e-05, + "loss": 4.0248, + "step": 94090 + }, + { + "epoch": 6.393192009783938, + "grad_norm": 0.1592235565185547, + "learning_rate": 2.0117373284413644e-05, + "loss": 3.7283, + "step": 94095 + }, + { + "epoch": 6.3935317298546, + "grad_norm": 0.13413192331790924, + "learning_rate": 2.0113126783530372e-05, + "loss": 3.6829, + "step": 94100 + }, + { + "epoch": 6.393871449925261, + "grad_norm": 0.18988245725631714, + "learning_rate": 2.01088802826471e-05, + "loss": 3.8772, + "step": 94105 + }, + { + "epoch": 6.394211169995923, + "grad_norm": 0.47704777121543884, + "learning_rate": 2.0104633781763825e-05, + "loss": 3.96, + "step": 94110 + }, + { + "epoch": 6.3945508900665855, + "grad_norm": 0.8911278247833252, + "learning_rate": 2.0100387280880556e-05, + "loss": 3.6011, + "step": 94115 + }, + { + "epoch": 6.394890610137247, + "grad_norm": 0.5950703620910645, + "learning_rate": 2.0096140779997284e-05, + "loss": 3.6952, + "step": 94120 + }, + { + "epoch": 6.395230330207909, + "grad_norm": 0.18955165147781372, + "learning_rate": 2.009189427911401e-05, + "loss": 3.7814, + "step": 94125 + }, + { + "epoch": 6.395570050278571, + "grad_norm": 0.2374432235956192, + "learning_rate": 2.008764777823074e-05, + "loss": 3.9633, + "step": 94130 + }, + { + "epoch": 6.395909770349232, + "grad_norm": 0.17013992369174957, + "learning_rate": 2.0083401277347465e-05, + "loss": 3.8801, + "step": 94135 + }, + { + "epoch": 6.396249490419894, + "grad_norm": 0.18071505427360535, + "learning_rate": 2.0079154776464196e-05, + "loss": 3.8657, + "step": 94140 + }, + { + "epoch": 6.396589210490556, + "grad_norm": 0.21857291460037231, + "learning_rate": 2.007490827558092e-05, + "loss": 3.7305, + "step": 94145 + }, + { + "epoch": 6.396928930561217, + "grad_norm": 0.17915460467338562, + "learning_rate": 2.007066177469765e-05, + "loss": 3.7983, + "step": 94150 + }, + { + "epoch": 6.397268650631879, + "grad_norm": 0.47910189628601074, + "learning_rate": 2.006641527381438e-05, + "loss": 3.7971, + "step": 94155 + }, + { + "epoch": 6.3976083707025415, + "grad_norm": 0.16851811110973358, + "learning_rate": 2.0062168772931105e-05, + "loss": 3.7972, + "step": 94160 + }, + { + "epoch": 6.397948090773203, + "grad_norm": 0.15973402559757233, + "learning_rate": 2.0057922272047833e-05, + "loss": 3.6363, + "step": 94165 + }, + { + "epoch": 6.398287810843865, + "grad_norm": 0.16174617409706116, + "learning_rate": 2.005367577116456e-05, + "loss": 3.8032, + "step": 94170 + }, + { + "epoch": 6.398627530914526, + "grad_norm": 0.16624128818511963, + "learning_rate": 2.004942927028129e-05, + "loss": 3.7894, + "step": 94175 + }, + { + "epoch": 6.398967250985188, + "grad_norm": 0.16226549446582794, + "learning_rate": 2.0045182769398017e-05, + "loss": 3.8015, + "step": 94180 + }, + { + "epoch": 6.39930697105585, + "grad_norm": 0.15950463712215424, + "learning_rate": 2.0040936268514745e-05, + "loss": 3.7201, + "step": 94185 + }, + { + "epoch": 6.399646691126511, + "grad_norm": 0.2234213650226593, + "learning_rate": 2.0036689767631473e-05, + "loss": 4.0891, + "step": 94190 + }, + { + "epoch": 6.399986411197173, + "grad_norm": 0.16955754160881042, + "learning_rate": 2.0032443266748198e-05, + "loss": 3.9512, + "step": 94195 + }, + { + "epoch": 6.4003261312678354, + "grad_norm": 0.16633157432079315, + "learning_rate": 2.002819676586493e-05, + "loss": 3.7215, + "step": 94200 + }, + { + "epoch": 6.400665851338497, + "grad_norm": 0.18419601023197174, + "learning_rate": 2.0023950264981657e-05, + "loss": 3.9186, + "step": 94205 + }, + { + "epoch": 6.401005571409159, + "grad_norm": 0.18082791566848755, + "learning_rate": 2.001970376409838e-05, + "loss": 3.6583, + "step": 94210 + }, + { + "epoch": 6.401345291479821, + "grad_norm": 0.24155521392822266, + "learning_rate": 2.0015457263215113e-05, + "loss": 3.8376, + "step": 94215 + }, + { + "epoch": 6.401685011550482, + "grad_norm": 0.14909671247005463, + "learning_rate": 2.0011210762331838e-05, + "loss": 3.9461, + "step": 94220 + }, + { + "epoch": 6.402024731621144, + "grad_norm": 0.18988147377967834, + "learning_rate": 2.000696426144857e-05, + "loss": 3.7541, + "step": 94225 + }, + { + "epoch": 6.402364451691806, + "grad_norm": 0.1978045552968979, + "learning_rate": 2.0002717760565297e-05, + "loss": 3.6393, + "step": 94230 + }, + { + "epoch": 6.402704171762467, + "grad_norm": 0.19074298441410065, + "learning_rate": 1.999847125968202e-05, + "loss": 3.8867, + "step": 94235 + }, + { + "epoch": 6.403043891833129, + "grad_norm": 0.15271252393722534, + "learning_rate": 1.9994224758798753e-05, + "loss": 3.63, + "step": 94240 + }, + { + "epoch": 6.4033836119037915, + "grad_norm": 0.1621190309524536, + "learning_rate": 1.9989978257915478e-05, + "loss": 3.6581, + "step": 94245 + }, + { + "epoch": 6.403723331974453, + "grad_norm": 0.199523463845253, + "learning_rate": 1.9985731757032206e-05, + "loss": 4.0168, + "step": 94250 + }, + { + "epoch": 6.404063052045115, + "grad_norm": 0.1807786375284195, + "learning_rate": 1.9981485256148934e-05, + "loss": 3.8066, + "step": 94255 + }, + { + "epoch": 6.404402772115777, + "grad_norm": 0.1806679219007492, + "learning_rate": 1.9977238755265662e-05, + "loss": 3.7041, + "step": 94260 + }, + { + "epoch": 6.404742492186438, + "grad_norm": 0.19120700657367706, + "learning_rate": 1.997299225438239e-05, + "loss": 3.8255, + "step": 94265 + }, + { + "epoch": 6.4050822122571, + "grad_norm": 0.3603769838809967, + "learning_rate": 1.9968745753499118e-05, + "loss": 3.9195, + "step": 94270 + }, + { + "epoch": 6.405421932327762, + "grad_norm": 0.16037240624427795, + "learning_rate": 1.9964499252615846e-05, + "loss": 3.6108, + "step": 94275 + }, + { + "epoch": 6.405761652398423, + "grad_norm": 0.1649843007326126, + "learning_rate": 1.9960252751732574e-05, + "loss": 3.7679, + "step": 94280 + }, + { + "epoch": 6.406101372469085, + "grad_norm": 0.16899271309375763, + "learning_rate": 1.9956006250849302e-05, + "loss": 3.808, + "step": 94285 + }, + { + "epoch": 6.4064410925397475, + "grad_norm": 0.26538291573524475, + "learning_rate": 1.995175974996603e-05, + "loss": 3.9098, + "step": 94290 + }, + { + "epoch": 6.406780812610409, + "grad_norm": 0.2031443864107132, + "learning_rate": 1.9947513249082754e-05, + "loss": 3.6991, + "step": 94295 + }, + { + "epoch": 6.407120532681071, + "grad_norm": 0.2447075992822647, + "learning_rate": 1.9943266748199486e-05, + "loss": 3.8141, + "step": 94300 + }, + { + "epoch": 6.407460252751733, + "grad_norm": 0.182987779378891, + "learning_rate": 1.993902024731621e-05, + "loss": 3.9279, + "step": 94305 + }, + { + "epoch": 6.407799972822394, + "grad_norm": 0.2064349353313446, + "learning_rate": 1.9934773746432942e-05, + "loss": 3.7176, + "step": 94310 + }, + { + "epoch": 6.408139692893056, + "grad_norm": 0.5879525542259216, + "learning_rate": 1.993052724554967e-05, + "loss": 3.6793, + "step": 94315 + }, + { + "epoch": 6.408479412963718, + "grad_norm": 0.22087162733078003, + "learning_rate": 1.9926280744666394e-05, + "loss": 3.9421, + "step": 94320 + }, + { + "epoch": 6.408819133034379, + "grad_norm": 0.47221481800079346, + "learning_rate": 1.9922034243783126e-05, + "loss": 3.7105, + "step": 94325 + }, + { + "epoch": 6.409158853105041, + "grad_norm": 0.3584645688533783, + "learning_rate": 1.991778774289985e-05, + "loss": 3.927, + "step": 94330 + }, + { + "epoch": 6.4094985731757035, + "grad_norm": 0.20147185027599335, + "learning_rate": 1.991354124201658e-05, + "loss": 3.6581, + "step": 94335 + }, + { + "epoch": 6.409838293246365, + "grad_norm": 0.20127229392528534, + "learning_rate": 1.9909294741133306e-05, + "loss": 3.8092, + "step": 94340 + }, + { + "epoch": 6.410178013317027, + "grad_norm": 0.15167777240276337, + "learning_rate": 1.9905048240250034e-05, + "loss": 3.9107, + "step": 94345 + }, + { + "epoch": 6.410517733387689, + "grad_norm": 0.1615748107433319, + "learning_rate": 1.9900801739366762e-05, + "loss": 3.9388, + "step": 94350 + }, + { + "epoch": 6.41085745345835, + "grad_norm": 1.4838316440582275, + "learning_rate": 1.989655523848349e-05, + "loss": 3.7113, + "step": 94355 + }, + { + "epoch": 6.411197173529012, + "grad_norm": 0.15927980840206146, + "learning_rate": 1.989230873760022e-05, + "loss": 3.8267, + "step": 94360 + }, + { + "epoch": 6.411536893599674, + "grad_norm": 0.20851682126522064, + "learning_rate": 1.9888062236716946e-05, + "loss": 3.8015, + "step": 94365 + }, + { + "epoch": 6.411876613670335, + "grad_norm": 0.16425128281116486, + "learning_rate": 1.9883815735833675e-05, + "loss": 3.9245, + "step": 94370 + }, + { + "epoch": 6.412216333740997, + "grad_norm": 0.2014336735010147, + "learning_rate": 1.9879569234950403e-05, + "loss": 3.8243, + "step": 94375 + }, + { + "epoch": 6.4125560538116595, + "grad_norm": 0.18249155580997467, + "learning_rate": 1.9875322734067127e-05, + "loss": 3.8299, + "step": 94380 + }, + { + "epoch": 6.412895773882321, + "grad_norm": 0.17161835730075836, + "learning_rate": 1.987107623318386e-05, + "loss": 3.8051, + "step": 94385 + }, + { + "epoch": 6.413235493952983, + "grad_norm": 0.6981828212738037, + "learning_rate": 1.9866829732300583e-05, + "loss": 3.8876, + "step": 94390 + }, + { + "epoch": 6.413575214023645, + "grad_norm": 0.19857537746429443, + "learning_rate": 1.9862583231417315e-05, + "loss": 3.8065, + "step": 94395 + }, + { + "epoch": 6.413914934094306, + "grad_norm": 0.15314361453056335, + "learning_rate": 1.9858336730534043e-05, + "loss": 3.7663, + "step": 94400 + }, + { + "epoch": 6.414254654164968, + "grad_norm": 0.29938918352127075, + "learning_rate": 1.9854090229650767e-05, + "loss": 3.8337, + "step": 94405 + }, + { + "epoch": 6.41459437423563, + "grad_norm": 0.13133397698402405, + "learning_rate": 1.98498437287675e-05, + "loss": 3.9044, + "step": 94410 + }, + { + "epoch": 6.414934094306291, + "grad_norm": 0.17853358387947083, + "learning_rate": 1.9845597227884223e-05, + "loss": 3.8294, + "step": 94415 + }, + { + "epoch": 6.415273814376953, + "grad_norm": 0.21341180801391602, + "learning_rate": 1.984135072700095e-05, + "loss": 3.7937, + "step": 94420 + }, + { + "epoch": 6.4156135344476155, + "grad_norm": 0.16572332382202148, + "learning_rate": 1.9837104226117683e-05, + "loss": 3.7395, + "step": 94425 + }, + { + "epoch": 6.415953254518277, + "grad_norm": 0.7484102845191956, + "learning_rate": 1.9832857725234407e-05, + "loss": 4.0358, + "step": 94430 + }, + { + "epoch": 6.416292974588939, + "grad_norm": 0.27752485871315, + "learning_rate": 1.9828611224351135e-05, + "loss": 3.7088, + "step": 94435 + }, + { + "epoch": 6.416632694659601, + "grad_norm": 0.19530721008777618, + "learning_rate": 1.9824364723467863e-05, + "loss": 3.7493, + "step": 94440 + }, + { + "epoch": 6.416972414730262, + "grad_norm": 0.17243048548698425, + "learning_rate": 1.982011822258459e-05, + "loss": 3.895, + "step": 94445 + }, + { + "epoch": 6.417312134800924, + "grad_norm": 0.18074990808963776, + "learning_rate": 1.981587172170132e-05, + "loss": 3.7246, + "step": 94450 + }, + { + "epoch": 6.417651854871586, + "grad_norm": 0.8269655108451843, + "learning_rate": 1.9811625220818047e-05, + "loss": 3.7748, + "step": 94455 + }, + { + "epoch": 6.417991574942247, + "grad_norm": 0.1847144067287445, + "learning_rate": 1.9807378719934775e-05, + "loss": 4.0043, + "step": 94460 + }, + { + "epoch": 6.418331295012909, + "grad_norm": 0.23400312662124634, + "learning_rate": 1.98031322190515e-05, + "loss": 3.8404, + "step": 94465 + }, + { + "epoch": 6.4186710150835715, + "grad_norm": 0.16192786395549774, + "learning_rate": 1.979888571816823e-05, + "loss": 3.8528, + "step": 94470 + }, + { + "epoch": 6.419010735154233, + "grad_norm": 0.16521576046943665, + "learning_rate": 1.979463921728496e-05, + "loss": 3.6864, + "step": 94475 + }, + { + "epoch": 6.419350455224895, + "grad_norm": 0.18308793008327484, + "learning_rate": 1.9790392716401687e-05, + "loss": 4.014, + "step": 94480 + }, + { + "epoch": 6.419690175295557, + "grad_norm": 0.17813825607299805, + "learning_rate": 1.9786146215518415e-05, + "loss": 3.8127, + "step": 94485 + }, + { + "epoch": 6.420029895366218, + "grad_norm": 0.18898223340511322, + "learning_rate": 1.978189971463514e-05, + "loss": 3.7428, + "step": 94490 + }, + { + "epoch": 6.42036961543688, + "grad_norm": 0.18485620617866516, + "learning_rate": 1.977765321375187e-05, + "loss": 3.7499, + "step": 94495 + }, + { + "epoch": 6.420709335507542, + "grad_norm": 0.1643616408109665, + "learning_rate": 1.9773406712868596e-05, + "loss": 3.8949, + "step": 94500 + }, + { + "epoch": 6.421049055578203, + "grad_norm": 0.1616450399160385, + "learning_rate": 1.9769160211985324e-05, + "loss": 3.7906, + "step": 94505 + }, + { + "epoch": 6.4213887756488655, + "grad_norm": 0.17966367304325104, + "learning_rate": 1.9764913711102055e-05, + "loss": 3.5698, + "step": 94510 + }, + { + "epoch": 6.4217284957195275, + "grad_norm": 0.4820009171962738, + "learning_rate": 1.976066721021878e-05, + "loss": 3.9387, + "step": 94515 + }, + { + "epoch": 6.422068215790189, + "grad_norm": 0.16034886240959167, + "learning_rate": 1.9756420709335508e-05, + "loss": 3.6028, + "step": 94520 + }, + { + "epoch": 6.422407935860851, + "grad_norm": 1.6006866693496704, + "learning_rate": 1.9752174208452236e-05, + "loss": 3.4423, + "step": 94525 + }, + { + "epoch": 6.422747655931513, + "grad_norm": 0.1451512724161148, + "learning_rate": 1.9747927707568964e-05, + "loss": 3.8058, + "step": 94530 + }, + { + "epoch": 6.423087376002174, + "grad_norm": 0.16092203557491302, + "learning_rate": 1.9743681206685692e-05, + "loss": 3.8419, + "step": 94535 + }, + { + "epoch": 6.423427096072836, + "grad_norm": 0.17286492884159088, + "learning_rate": 1.973943470580242e-05, + "loss": 3.9577, + "step": 94540 + }, + { + "epoch": 6.423766816143498, + "grad_norm": 0.16494876146316528, + "learning_rate": 1.9735188204919148e-05, + "loss": 3.7882, + "step": 94545 + }, + { + "epoch": 6.424106536214159, + "grad_norm": 0.16122230887413025, + "learning_rate": 1.9730941704035873e-05, + "loss": 3.707, + "step": 94550 + }, + { + "epoch": 6.4244462562848215, + "grad_norm": 0.19997456669807434, + "learning_rate": 1.9726695203152604e-05, + "loss": 3.943, + "step": 94555 + }, + { + "epoch": 6.4247859763554835, + "grad_norm": 0.1482875794172287, + "learning_rate": 1.9722448702269332e-05, + "loss": 4.08, + "step": 94560 + }, + { + "epoch": 6.425125696426145, + "grad_norm": 0.23736368119716644, + "learning_rate": 1.971820220138606e-05, + "loss": 3.8281, + "step": 94565 + }, + { + "epoch": 6.425465416496807, + "grad_norm": 0.21135734021663666, + "learning_rate": 1.9713955700502788e-05, + "loss": 3.8508, + "step": 94570 + }, + { + "epoch": 6.425805136567468, + "grad_norm": 0.1595461219549179, + "learning_rate": 1.9709709199619513e-05, + "loss": 3.4771, + "step": 94575 + }, + { + "epoch": 6.42614485663813, + "grad_norm": 0.1792944073677063, + "learning_rate": 1.9705462698736244e-05, + "loss": 3.826, + "step": 94580 + }, + { + "epoch": 6.426484576708792, + "grad_norm": 0.15235395729541779, + "learning_rate": 1.970121619785297e-05, + "loss": 3.8519, + "step": 94585 + }, + { + "epoch": 6.426824296779453, + "grad_norm": 0.19349220395088196, + "learning_rate": 1.9696969696969697e-05, + "loss": 3.8492, + "step": 94590 + }, + { + "epoch": 6.427164016850115, + "grad_norm": 0.4379744231700897, + "learning_rate": 1.9692723196086428e-05, + "loss": 4.1304, + "step": 94595 + }, + { + "epoch": 6.4275037369207775, + "grad_norm": 0.18391357362270355, + "learning_rate": 1.9688476695203153e-05, + "loss": 3.8896, + "step": 94600 + }, + { + "epoch": 6.427843456991439, + "grad_norm": 0.16063760221004486, + "learning_rate": 1.968423019431988e-05, + "loss": 3.8468, + "step": 94605 + }, + { + "epoch": 6.428183177062101, + "grad_norm": 0.21530215442180634, + "learning_rate": 1.967998369343661e-05, + "loss": 3.9007, + "step": 94610 + }, + { + "epoch": 6.428522897132763, + "grad_norm": 0.17114877700805664, + "learning_rate": 1.9675737192553337e-05, + "loss": 3.7234, + "step": 94615 + }, + { + "epoch": 6.428862617203424, + "grad_norm": 0.15378916263580322, + "learning_rate": 1.9671490691670065e-05, + "loss": 3.9709, + "step": 94620 + }, + { + "epoch": 6.429202337274086, + "grad_norm": 0.5180607438087463, + "learning_rate": 1.9667244190786793e-05, + "loss": 3.7629, + "step": 94625 + }, + { + "epoch": 6.429542057344748, + "grad_norm": 0.17945657670497894, + "learning_rate": 1.966299768990352e-05, + "loss": 4.0574, + "step": 94630 + }, + { + "epoch": 6.429881777415409, + "grad_norm": 0.21680903434753418, + "learning_rate": 1.9658751189020245e-05, + "loss": 3.8624, + "step": 94635 + }, + { + "epoch": 6.430221497486071, + "grad_norm": 0.15904487669467926, + "learning_rate": 1.9654504688136977e-05, + "loss": 3.8491, + "step": 94640 + }, + { + "epoch": 6.4305612175567335, + "grad_norm": 0.21898962557315826, + "learning_rate": 1.9650258187253705e-05, + "loss": 4.046, + "step": 94645 + }, + { + "epoch": 6.430900937627395, + "grad_norm": 0.1976468414068222, + "learning_rate": 1.9646011686370433e-05, + "loss": 4.0349, + "step": 94650 + }, + { + "epoch": 6.431240657698057, + "grad_norm": 0.17204639315605164, + "learning_rate": 1.964176518548716e-05, + "loss": 3.9366, + "step": 94655 + }, + { + "epoch": 6.431580377768719, + "grad_norm": 0.1633344441652298, + "learning_rate": 1.9637518684603885e-05, + "loss": 3.8091, + "step": 94660 + }, + { + "epoch": 6.43192009783938, + "grad_norm": 0.18450893461704254, + "learning_rate": 1.9633272183720617e-05, + "loss": 3.907, + "step": 94665 + }, + { + "epoch": 6.432259817910042, + "grad_norm": 0.15952067077159882, + "learning_rate": 1.9629025682837345e-05, + "loss": 3.7755, + "step": 94670 + }, + { + "epoch": 6.432599537980704, + "grad_norm": 0.144451305270195, + "learning_rate": 1.962477918195407e-05, + "loss": 4.0, + "step": 94675 + }, + { + "epoch": 6.432939258051365, + "grad_norm": 0.1641199290752411, + "learning_rate": 1.96205326810708e-05, + "loss": 3.7709, + "step": 94680 + }, + { + "epoch": 6.433278978122027, + "grad_norm": 0.1966599076986313, + "learning_rate": 1.9616286180187525e-05, + "loss": 3.8906, + "step": 94685 + }, + { + "epoch": 6.4336186981926895, + "grad_norm": 0.19079115986824036, + "learning_rate": 1.9612039679304253e-05, + "loss": 3.8986, + "step": 94690 + }, + { + "epoch": 6.433958418263351, + "grad_norm": 0.16946569085121155, + "learning_rate": 1.960779317842098e-05, + "loss": 3.8356, + "step": 94695 + }, + { + "epoch": 6.434298138334013, + "grad_norm": 0.16434423625469208, + "learning_rate": 1.960354667753771e-05, + "loss": 3.7244, + "step": 94700 + }, + { + "epoch": 6.434637858404675, + "grad_norm": 0.8967655897140503, + "learning_rate": 1.9599300176654437e-05, + "loss": 3.9067, + "step": 94705 + }, + { + "epoch": 6.434977578475336, + "grad_norm": 0.2812124788761139, + "learning_rate": 1.9595053675771165e-05, + "loss": 3.8088, + "step": 94710 + }, + { + "epoch": 6.435317298545998, + "grad_norm": 0.5903648734092712, + "learning_rate": 1.9590807174887893e-05, + "loss": 3.7315, + "step": 94715 + }, + { + "epoch": 6.43565701861666, + "grad_norm": 0.1747802346944809, + "learning_rate": 1.958656067400462e-05, + "loss": 3.9612, + "step": 94720 + }, + { + "epoch": 6.435996738687321, + "grad_norm": 0.15303362905979156, + "learning_rate": 1.958231417312135e-05, + "loss": 3.7276, + "step": 94725 + }, + { + "epoch": 6.436336458757983, + "grad_norm": 0.24412274360656738, + "learning_rate": 1.9578067672238078e-05, + "loss": 3.7087, + "step": 94730 + }, + { + "epoch": 6.4366761788286455, + "grad_norm": 0.14353051781654358, + "learning_rate": 1.9573821171354806e-05, + "loss": 3.7187, + "step": 94735 + }, + { + "epoch": 6.437015898899307, + "grad_norm": 0.15636488795280457, + "learning_rate": 1.9569574670471534e-05, + "loss": 3.8113, + "step": 94740 + }, + { + "epoch": 6.437355618969969, + "grad_norm": 0.3342701494693756, + "learning_rate": 1.9565328169588258e-05, + "loss": 3.976, + "step": 94745 + }, + { + "epoch": 6.437695339040631, + "grad_norm": 0.15478962659835815, + "learning_rate": 1.956108166870499e-05, + "loss": 3.7868, + "step": 94750 + }, + { + "epoch": 6.438035059111292, + "grad_norm": 0.1993449330329895, + "learning_rate": 1.9556835167821718e-05, + "loss": 3.8345, + "step": 94755 + }, + { + "epoch": 6.438374779181954, + "grad_norm": 0.14557424187660217, + "learning_rate": 1.9552588666938442e-05, + "loss": 4.0109, + "step": 94760 + }, + { + "epoch": 6.438714499252616, + "grad_norm": 0.23240961134433746, + "learning_rate": 1.9548342166055174e-05, + "loss": 3.8175, + "step": 94765 + }, + { + "epoch": 6.439054219323277, + "grad_norm": 0.3118521273136139, + "learning_rate": 1.9544095665171898e-05, + "loss": 3.7156, + "step": 94770 + }, + { + "epoch": 6.4393939393939394, + "grad_norm": 0.7728208303451538, + "learning_rate": 1.9539849164288626e-05, + "loss": 3.8057, + "step": 94775 + }, + { + "epoch": 6.4397336594646015, + "grad_norm": 0.17533740401268005, + "learning_rate": 1.9535602663405354e-05, + "loss": 3.8169, + "step": 94780 + }, + { + "epoch": 6.440073379535263, + "grad_norm": 0.21538406610488892, + "learning_rate": 1.9531356162522082e-05, + "loss": 3.9156, + "step": 94785 + }, + { + "epoch": 6.440413099605925, + "grad_norm": 0.21879979968070984, + "learning_rate": 1.952710966163881e-05, + "loss": 3.6952, + "step": 94790 + }, + { + "epoch": 6.440752819676587, + "grad_norm": 0.21257780492305756, + "learning_rate": 1.9522863160755538e-05, + "loss": 3.6744, + "step": 94795 + }, + { + "epoch": 6.441092539747248, + "grad_norm": 0.16733311116695404, + "learning_rate": 1.9518616659872266e-05, + "loss": 3.8743, + "step": 94800 + }, + { + "epoch": 6.44143225981791, + "grad_norm": 0.1971929520368576, + "learning_rate": 1.9514370158988994e-05, + "loss": 3.8158, + "step": 94805 + }, + { + "epoch": 6.441771979888572, + "grad_norm": 0.13848042488098145, + "learning_rate": 1.9510123658105722e-05, + "loss": 3.702, + "step": 94810 + }, + { + "epoch": 6.442111699959233, + "grad_norm": 0.2081320434808731, + "learning_rate": 1.950587715722245e-05, + "loss": 3.9276, + "step": 94815 + }, + { + "epoch": 6.4424514200298955, + "grad_norm": 0.173781156539917, + "learning_rate": 1.9501630656339178e-05, + "loss": 3.7562, + "step": 94820 + }, + { + "epoch": 6.4427911401005575, + "grad_norm": 0.2103375643491745, + "learning_rate": 1.9497384155455906e-05, + "loss": 3.6384, + "step": 94825 + }, + { + "epoch": 6.443130860171219, + "grad_norm": 0.18931706249713898, + "learning_rate": 1.949313765457263e-05, + "loss": 3.9955, + "step": 94830 + }, + { + "epoch": 6.443470580241881, + "grad_norm": 0.22972577810287476, + "learning_rate": 1.9488891153689362e-05, + "loss": 3.7276, + "step": 94835 + }, + { + "epoch": 6.443810300312543, + "grad_norm": 0.21707673370838165, + "learning_rate": 1.948464465280609e-05, + "loss": 3.9884, + "step": 94840 + }, + { + "epoch": 6.444150020383204, + "grad_norm": 0.20033499598503113, + "learning_rate": 1.9480398151922815e-05, + "loss": 3.7377, + "step": 94845 + }, + { + "epoch": 6.444489740453866, + "grad_norm": 0.16364963352680206, + "learning_rate": 1.9476151651039546e-05, + "loss": 3.7019, + "step": 94850 + }, + { + "epoch": 6.444829460524527, + "grad_norm": 0.1804109513759613, + "learning_rate": 1.947190515015627e-05, + "loss": 3.6525, + "step": 94855 + }, + { + "epoch": 6.445169180595189, + "grad_norm": 0.1701570600271225, + "learning_rate": 1.9467658649273e-05, + "loss": 3.7954, + "step": 94860 + }, + { + "epoch": 6.4455089006658515, + "grad_norm": 2.174837827682495, + "learning_rate": 1.9463412148389727e-05, + "loss": 3.8743, + "step": 94865 + }, + { + "epoch": 6.445848620736513, + "grad_norm": 0.14109894633293152, + "learning_rate": 1.9459165647506455e-05, + "loss": 3.958, + "step": 94870 + }, + { + "epoch": 6.446188340807175, + "grad_norm": 0.1681746393442154, + "learning_rate": 1.9454919146623183e-05, + "loss": 3.835, + "step": 94875 + }, + { + "epoch": 6.446528060877837, + "grad_norm": 0.14377769827842712, + "learning_rate": 1.945067264573991e-05, + "loss": 3.8178, + "step": 94880 + }, + { + "epoch": 6.446867780948498, + "grad_norm": 0.18057739734649658, + "learning_rate": 1.944642614485664e-05, + "loss": 3.9835, + "step": 94885 + }, + { + "epoch": 6.44720750101916, + "grad_norm": 0.20656302571296692, + "learning_rate": 1.9442179643973367e-05, + "loss": 3.7412, + "step": 94890 + }, + { + "epoch": 6.447547221089822, + "grad_norm": 0.26332664489746094, + "learning_rate": 1.9437933143090095e-05, + "loss": 3.7342, + "step": 94895 + }, + { + "epoch": 6.447886941160483, + "grad_norm": 0.21166333556175232, + "learning_rate": 1.9433686642206823e-05, + "loss": 3.7028, + "step": 94900 + }, + { + "epoch": 6.448226661231145, + "grad_norm": 0.16825126111507416, + "learning_rate": 1.942944014132355e-05, + "loss": 3.8644, + "step": 94905 + }, + { + "epoch": 6.4485663813018075, + "grad_norm": 0.17131434381008148, + "learning_rate": 1.942519364044028e-05, + "loss": 3.8287, + "step": 94910 + }, + { + "epoch": 6.448906101372469, + "grad_norm": 0.1914704144001007, + "learning_rate": 1.9420947139557004e-05, + "loss": 3.6387, + "step": 94915 + }, + { + "epoch": 6.449245821443131, + "grad_norm": 0.1611143797636032, + "learning_rate": 1.9416700638673735e-05, + "loss": 3.9278, + "step": 94920 + }, + { + "epoch": 6.449585541513793, + "grad_norm": 0.15229283273220062, + "learning_rate": 1.9412454137790463e-05, + "loss": 3.6037, + "step": 94925 + }, + { + "epoch": 6.449925261584454, + "grad_norm": 0.19872057437896729, + "learning_rate": 1.9408207636907188e-05, + "loss": 3.678, + "step": 94930 + }, + { + "epoch": 6.450264981655116, + "grad_norm": 0.1562841832637787, + "learning_rate": 1.940396113602392e-05, + "loss": 3.8218, + "step": 94935 + }, + { + "epoch": 6.450604701725778, + "grad_norm": 0.1928742676973343, + "learning_rate": 1.9399714635140644e-05, + "loss": 3.8798, + "step": 94940 + }, + { + "epoch": 6.450944421796439, + "grad_norm": 0.16229557991027832, + "learning_rate": 1.939546813425737e-05, + "loss": 3.8413, + "step": 94945 + }, + { + "epoch": 6.451284141867101, + "grad_norm": 0.17446039617061615, + "learning_rate": 1.9391221633374103e-05, + "loss": 3.7909, + "step": 94950 + }, + { + "epoch": 6.4516238619377635, + "grad_norm": 0.18464498221874237, + "learning_rate": 1.9386975132490828e-05, + "loss": 3.9299, + "step": 94955 + }, + { + "epoch": 6.451963582008425, + "grad_norm": 0.18503916263580322, + "learning_rate": 1.9382728631607556e-05, + "loss": 3.9345, + "step": 94960 + }, + { + "epoch": 6.452303302079087, + "grad_norm": 0.17949753999710083, + "learning_rate": 1.9378482130724284e-05, + "loss": 3.858, + "step": 94965 + }, + { + "epoch": 6.452643022149749, + "grad_norm": 0.23239198327064514, + "learning_rate": 1.9374235629841012e-05, + "loss": 4.1174, + "step": 94970 + }, + { + "epoch": 6.45298274222041, + "grad_norm": 0.17709119617938995, + "learning_rate": 1.936998912895774e-05, + "loss": 3.813, + "step": 94975 + }, + { + "epoch": 6.453322462291072, + "grad_norm": 0.24766169488430023, + "learning_rate": 1.9365742628074468e-05, + "loss": 3.5356, + "step": 94980 + }, + { + "epoch": 6.453662182361734, + "grad_norm": 0.21855652332305908, + "learning_rate": 1.9361496127191196e-05, + "loss": 3.9299, + "step": 94985 + }, + { + "epoch": 6.454001902432395, + "grad_norm": 0.15754030644893646, + "learning_rate": 1.9357249626307924e-05, + "loss": 3.9155, + "step": 94990 + }, + { + "epoch": 6.454341622503057, + "grad_norm": 0.18430912494659424, + "learning_rate": 1.9353003125424652e-05, + "loss": 3.8099, + "step": 94995 + }, + { + "epoch": 6.4546813425737195, + "grad_norm": 0.15014930069446564, + "learning_rate": 1.934875662454138e-05, + "loss": 3.563, + "step": 95000 + }, + { + "epoch": 6.455021062644381, + "grad_norm": 0.16197004914283752, + "learning_rate": 1.9344510123658108e-05, + "loss": 3.8839, + "step": 95005 + }, + { + "epoch": 6.455360782715043, + "grad_norm": 1.1334991455078125, + "learning_rate": 1.9340263622774836e-05, + "loss": 3.835, + "step": 95010 + }, + { + "epoch": 6.455700502785705, + "grad_norm": 0.14469031989574432, + "learning_rate": 1.933601712189156e-05, + "loss": 3.6588, + "step": 95015 + }, + { + "epoch": 6.456040222856366, + "grad_norm": 0.35555803775787354, + "learning_rate": 1.9331770621008292e-05, + "loss": 3.9182, + "step": 95020 + }, + { + "epoch": 6.456379942927028, + "grad_norm": 0.1321752518415451, + "learning_rate": 1.9327524120125016e-05, + "loss": 4.0043, + "step": 95025 + }, + { + "epoch": 6.45671966299769, + "grad_norm": 0.1754714995622635, + "learning_rate": 1.9323277619241744e-05, + "loss": 3.8409, + "step": 95030 + }, + { + "epoch": 6.457059383068351, + "grad_norm": 0.17848198115825653, + "learning_rate": 1.9319031118358476e-05, + "loss": 3.684, + "step": 95035 + }, + { + "epoch": 6.457399103139013, + "grad_norm": 0.21680285036563873, + "learning_rate": 1.93147846174752e-05, + "loss": 3.7727, + "step": 95040 + }, + { + "epoch": 6.4577388232096755, + "grad_norm": 0.1816585212945938, + "learning_rate": 1.931053811659193e-05, + "loss": 3.7115, + "step": 95045 + }, + { + "epoch": 6.458078543280337, + "grad_norm": 0.14301733672618866, + "learning_rate": 1.9306291615708656e-05, + "loss": 3.8552, + "step": 95050 + }, + { + "epoch": 6.458418263350999, + "grad_norm": 0.741206169128418, + "learning_rate": 1.9302045114825384e-05, + "loss": 3.6489, + "step": 95055 + }, + { + "epoch": 6.458757983421661, + "grad_norm": 0.35114026069641113, + "learning_rate": 1.9297798613942112e-05, + "loss": 3.9846, + "step": 95060 + }, + { + "epoch": 6.459097703492322, + "grad_norm": 0.1596360206604004, + "learning_rate": 1.929355211305884e-05, + "loss": 3.864, + "step": 95065 + }, + { + "epoch": 6.459437423562984, + "grad_norm": 0.1704375296831131, + "learning_rate": 1.928930561217557e-05, + "loss": 3.6836, + "step": 95070 + }, + { + "epoch": 6.459777143633646, + "grad_norm": 0.6792884469032288, + "learning_rate": 1.9285059111292296e-05, + "loss": 3.9783, + "step": 95075 + }, + { + "epoch": 6.460116863704307, + "grad_norm": 0.17647932469844818, + "learning_rate": 1.9280812610409024e-05, + "loss": 3.9352, + "step": 95080 + }, + { + "epoch": 6.4604565837749695, + "grad_norm": 0.2144434005022049, + "learning_rate": 1.9276566109525753e-05, + "loss": 3.8878, + "step": 95085 + }, + { + "epoch": 6.4607963038456315, + "grad_norm": 0.6760505437850952, + "learning_rate": 1.927231960864248e-05, + "loss": 3.9455, + "step": 95090 + }, + { + "epoch": 6.461136023916293, + "grad_norm": 0.21656639873981476, + "learning_rate": 1.926807310775921e-05, + "loss": 3.8445, + "step": 95095 + }, + { + "epoch": 6.461475743986955, + "grad_norm": 0.16552108526229858, + "learning_rate": 1.9263826606875933e-05, + "loss": 3.7046, + "step": 95100 + }, + { + "epoch": 6.461815464057617, + "grad_norm": 0.1726793348789215, + "learning_rate": 1.9259580105992665e-05, + "loss": 3.6955, + "step": 95105 + }, + { + "epoch": 6.462155184128278, + "grad_norm": 0.15237362682819366, + "learning_rate": 1.925533360510939e-05, + "loss": 3.671, + "step": 95110 + }, + { + "epoch": 6.46249490419894, + "grad_norm": 0.14197947084903717, + "learning_rate": 1.9251087104226117e-05, + "loss": 3.9548, + "step": 95115 + }, + { + "epoch": 6.462834624269602, + "grad_norm": 0.17724302411079407, + "learning_rate": 1.924684060334285e-05, + "loss": 3.6942, + "step": 95120 + }, + { + "epoch": 6.463174344340263, + "grad_norm": 0.1666419506072998, + "learning_rate": 1.9242594102459573e-05, + "loss": 3.9135, + "step": 95125 + }, + { + "epoch": 6.4635140644109255, + "grad_norm": 0.2034805864095688, + "learning_rate": 1.92383476015763e-05, + "loss": 3.8183, + "step": 95130 + }, + { + "epoch": 6.4638537844815875, + "grad_norm": 0.2836878299713135, + "learning_rate": 1.923410110069303e-05, + "loss": 3.5606, + "step": 95135 + }, + { + "epoch": 6.464193504552249, + "grad_norm": 0.17685705423355103, + "learning_rate": 1.9229854599809757e-05, + "loss": 3.9572, + "step": 95140 + }, + { + "epoch": 6.464533224622911, + "grad_norm": 0.14388372004032135, + "learning_rate": 1.9225608098926485e-05, + "loss": 4.0434, + "step": 95145 + }, + { + "epoch": 6.464872944693573, + "grad_norm": 0.1437414437532425, + "learning_rate": 1.9221361598043213e-05, + "loss": 3.8592, + "step": 95150 + }, + { + "epoch": 6.465212664764234, + "grad_norm": 0.15261447429656982, + "learning_rate": 1.921711509715994e-05, + "loss": 3.7907, + "step": 95155 + }, + { + "epoch": 6.465552384834896, + "grad_norm": 0.17143431305885315, + "learning_rate": 1.921286859627667e-05, + "loss": 3.9356, + "step": 95160 + }, + { + "epoch": 6.465892104905558, + "grad_norm": 0.17507657408714294, + "learning_rate": 1.9208622095393397e-05, + "loss": 3.8031, + "step": 95165 + }, + { + "epoch": 6.466231824976219, + "grad_norm": 0.15691806375980377, + "learning_rate": 1.9204375594510125e-05, + "loss": 3.7976, + "step": 95170 + }, + { + "epoch": 6.4665715450468815, + "grad_norm": 0.15721222758293152, + "learning_rate": 1.9200129093626853e-05, + "loss": 3.8434, + "step": 95175 + }, + { + "epoch": 6.4669112651175436, + "grad_norm": 0.2731049954891205, + "learning_rate": 1.919588259274358e-05, + "loss": 3.7873, + "step": 95180 + }, + { + "epoch": 6.467250985188205, + "grad_norm": 0.17607524991035461, + "learning_rate": 1.9191636091860306e-05, + "loss": 3.7396, + "step": 95185 + }, + { + "epoch": 6.467590705258867, + "grad_norm": 0.19639894366264343, + "learning_rate": 1.9187389590977037e-05, + "loss": 3.8503, + "step": 95190 + }, + { + "epoch": 6.467930425329529, + "grad_norm": 0.18665237724781036, + "learning_rate": 1.9183143090093765e-05, + "loss": 3.8347, + "step": 95195 + }, + { + "epoch": 6.46827014540019, + "grad_norm": 0.20462659001350403, + "learning_rate": 1.917889658921049e-05, + "loss": 4.0045, + "step": 95200 + }, + { + "epoch": 6.468609865470852, + "grad_norm": 0.1833561509847641, + "learning_rate": 1.917465008832722e-05, + "loss": 3.9355, + "step": 95205 + }, + { + "epoch": 6.468949585541514, + "grad_norm": 0.20426708459854126, + "learning_rate": 1.9170403587443946e-05, + "loss": 3.7878, + "step": 95210 + }, + { + "epoch": 6.469289305612175, + "grad_norm": 0.1795668601989746, + "learning_rate": 1.9166157086560674e-05, + "loss": 3.7898, + "step": 95215 + }, + { + "epoch": 6.4696290256828375, + "grad_norm": 0.19559244811534882, + "learning_rate": 1.9161910585677402e-05, + "loss": 3.7427, + "step": 95220 + }, + { + "epoch": 6.4699687457535, + "grad_norm": 0.2768562436103821, + "learning_rate": 1.915766408479413e-05, + "loss": 3.7248, + "step": 95225 + }, + { + "epoch": 6.470308465824161, + "grad_norm": 0.16812682151794434, + "learning_rate": 1.9153417583910858e-05, + "loss": 3.9927, + "step": 95230 + }, + { + "epoch": 6.470648185894823, + "grad_norm": 0.17482145130634308, + "learning_rate": 1.9149171083027586e-05, + "loss": 4.1889, + "step": 95235 + }, + { + "epoch": 6.470987905965485, + "grad_norm": 0.6912363171577454, + "learning_rate": 1.9144924582144314e-05, + "loss": 4.0556, + "step": 95240 + }, + { + "epoch": 6.471327626036146, + "grad_norm": 0.1981084793806076, + "learning_rate": 1.9140678081261042e-05, + "loss": 4.1259, + "step": 95245 + }, + { + "epoch": 6.471667346106808, + "grad_norm": 0.14665362238883972, + "learning_rate": 1.913643158037777e-05, + "loss": 3.7855, + "step": 95250 + }, + { + "epoch": 6.47200706617747, + "grad_norm": 0.7556307911872864, + "learning_rate": 1.9132185079494498e-05, + "loss": 3.7525, + "step": 95255 + }, + { + "epoch": 6.472346786248131, + "grad_norm": 0.2543773055076599, + "learning_rate": 1.9127938578611226e-05, + "loss": 3.9707, + "step": 95260 + }, + { + "epoch": 6.4726865063187935, + "grad_norm": 0.17282381653785706, + "learning_rate": 1.9123692077727954e-05, + "loss": 3.676, + "step": 95265 + }, + { + "epoch": 6.473026226389455, + "grad_norm": 0.49774739146232605, + "learning_rate": 1.911944557684468e-05, + "loss": 3.7603, + "step": 95270 + }, + { + "epoch": 6.473365946460117, + "grad_norm": 0.1878042072057724, + "learning_rate": 1.911519907596141e-05, + "loss": 3.6703, + "step": 95275 + }, + { + "epoch": 6.473705666530779, + "grad_norm": 0.16073647141456604, + "learning_rate": 1.9110952575078138e-05, + "loss": 4.0092, + "step": 95280 + }, + { + "epoch": 6.47404538660144, + "grad_norm": 0.16339470446109772, + "learning_rate": 1.9106706074194863e-05, + "loss": 3.6876, + "step": 95285 + }, + { + "epoch": 6.474385106672102, + "grad_norm": 0.15883249044418335, + "learning_rate": 1.9103308873488245e-05, + "loss": 3.9116, + "step": 95290 + }, + { + "epoch": 6.474724826742764, + "grad_norm": 0.19064290821552277, + "learning_rate": 1.9099062372604976e-05, + "loss": 3.7974, + "step": 95295 + }, + { + "epoch": 6.475064546813425, + "grad_norm": 0.1770259290933609, + "learning_rate": 1.90948158717217e-05, + "loss": 3.8882, + "step": 95300 + }, + { + "epoch": 6.475404266884087, + "grad_norm": 0.15925613045692444, + "learning_rate": 1.909056937083843e-05, + "loss": 3.9749, + "step": 95305 + }, + { + "epoch": 6.4757439869547495, + "grad_norm": 0.1585651934146881, + "learning_rate": 1.908632286995516e-05, + "loss": 4.0278, + "step": 95310 + }, + { + "epoch": 6.476083707025411, + "grad_norm": 0.15542763471603394, + "learning_rate": 1.9082076369071885e-05, + "loss": 3.7536, + "step": 95315 + }, + { + "epoch": 6.476423427096073, + "grad_norm": 0.15497998893260956, + "learning_rate": 1.9077829868188613e-05, + "loss": 3.6455, + "step": 95320 + }, + { + "epoch": 6.476763147166735, + "grad_norm": 0.19440686702728271, + "learning_rate": 1.907358336730534e-05, + "loss": 3.7938, + "step": 95325 + }, + { + "epoch": 6.477102867237396, + "grad_norm": 0.15516908466815948, + "learning_rate": 1.906933686642207e-05, + "loss": 3.7947, + "step": 95330 + }, + { + "epoch": 6.477442587308058, + "grad_norm": 0.20578458905220032, + "learning_rate": 1.9065090365538797e-05, + "loss": 3.9179, + "step": 95335 + }, + { + "epoch": 6.47778230737872, + "grad_norm": 0.22588518261909485, + "learning_rate": 1.9060843864655525e-05, + "loss": 3.7796, + "step": 95340 + }, + { + "epoch": 6.478122027449381, + "grad_norm": 0.2256390005350113, + "learning_rate": 1.9056597363772253e-05, + "loss": 3.8011, + "step": 95345 + }, + { + "epoch": 6.4784617475200434, + "grad_norm": 0.14315061271190643, + "learning_rate": 1.9052350862888978e-05, + "loss": 3.806, + "step": 95350 + }, + { + "epoch": 6.4788014675907055, + "grad_norm": 0.26267707347869873, + "learning_rate": 1.904810436200571e-05, + "loss": 3.6396, + "step": 95355 + }, + { + "epoch": 6.479141187661367, + "grad_norm": 0.196270152926445, + "learning_rate": 1.9043857861122437e-05, + "loss": 3.7356, + "step": 95360 + }, + { + "epoch": 6.479480907732029, + "grad_norm": 0.16051068902015686, + "learning_rate": 1.9039611360239165e-05, + "loss": 3.666, + "step": 95365 + }, + { + "epoch": 6.479820627802691, + "grad_norm": 0.19250601530075073, + "learning_rate": 1.9035364859355893e-05, + "loss": 3.8486, + "step": 95370 + }, + { + "epoch": 6.480160347873352, + "grad_norm": 0.209642693400383, + "learning_rate": 1.9031118358472618e-05, + "loss": 3.4755, + "step": 95375 + }, + { + "epoch": 6.480500067944014, + "grad_norm": 0.160110741853714, + "learning_rate": 1.902687185758935e-05, + "loss": 3.9276, + "step": 95380 + }, + { + "epoch": 6.480839788014676, + "grad_norm": 0.15937493741512299, + "learning_rate": 1.9022625356706074e-05, + "loss": 3.8614, + "step": 95385 + }, + { + "epoch": 6.481179508085337, + "grad_norm": 0.24334272742271423, + "learning_rate": 1.9018378855822802e-05, + "loss": 3.7877, + "step": 95390 + }, + { + "epoch": 6.4815192281559995, + "grad_norm": 0.1686638593673706, + "learning_rate": 1.9014132354939533e-05, + "loss": 3.6958, + "step": 95395 + }, + { + "epoch": 6.4818589482266615, + "grad_norm": 0.22418466210365295, + "learning_rate": 1.9009885854056258e-05, + "loss": 3.8713, + "step": 95400 + }, + { + "epoch": 6.482198668297323, + "grad_norm": 0.16450029611587524, + "learning_rate": 1.9005639353172986e-05, + "loss": 3.8278, + "step": 95405 + }, + { + "epoch": 6.482538388367985, + "grad_norm": 0.1796611100435257, + "learning_rate": 1.9001392852289714e-05, + "loss": 3.8256, + "step": 95410 + }, + { + "epoch": 6.482878108438647, + "grad_norm": 0.15522010624408722, + "learning_rate": 1.8997146351406442e-05, + "loss": 4.0415, + "step": 95415 + }, + { + "epoch": 6.483217828509308, + "grad_norm": 0.1684180498123169, + "learning_rate": 1.899289985052317e-05, + "loss": 3.9334, + "step": 95420 + }, + { + "epoch": 6.48355754857997, + "grad_norm": 0.14132539927959442, + "learning_rate": 1.8988653349639898e-05, + "loss": 3.6287, + "step": 95425 + }, + { + "epoch": 6.483897268650632, + "grad_norm": 0.18234090507030487, + "learning_rate": 1.8984406848756626e-05, + "loss": 3.7248, + "step": 95430 + }, + { + "epoch": 6.484236988721293, + "grad_norm": 0.1765775829553604, + "learning_rate": 1.898016034787335e-05, + "loss": 3.801, + "step": 95435 + }, + { + "epoch": 6.4845767087919555, + "grad_norm": 0.2762952148914337, + "learning_rate": 1.8975913846990082e-05, + "loss": 3.7634, + "step": 95440 + }, + { + "epoch": 6.4849164288626175, + "grad_norm": 0.16662971675395966, + "learning_rate": 1.897166734610681e-05, + "loss": 3.8808, + "step": 95445 + }, + { + "epoch": 6.485256148933279, + "grad_norm": 0.1618972271680832, + "learning_rate": 1.8967420845223538e-05, + "loss": 4.067, + "step": 95450 + }, + { + "epoch": 6.485595869003941, + "grad_norm": 0.15239430963993073, + "learning_rate": 1.8963174344340266e-05, + "loss": 3.8221, + "step": 95455 + }, + { + "epoch": 6.485935589074603, + "grad_norm": 0.15194299817085266, + "learning_rate": 1.895892784345699e-05, + "loss": 3.9346, + "step": 95460 + }, + { + "epoch": 6.486275309145264, + "grad_norm": 0.20861107110977173, + "learning_rate": 1.8954681342573722e-05, + "loss": 4.0423, + "step": 95465 + }, + { + "epoch": 6.486615029215926, + "grad_norm": 0.14997538924217224, + "learning_rate": 1.8950434841690447e-05, + "loss": 4.1692, + "step": 95470 + }, + { + "epoch": 6.486954749286588, + "grad_norm": 0.32769548892974854, + "learning_rate": 1.8946188340807175e-05, + "loss": 4.115, + "step": 95475 + }, + { + "epoch": 6.487294469357249, + "grad_norm": 0.157618448138237, + "learning_rate": 1.8941941839923906e-05, + "loss": 3.7181, + "step": 95480 + }, + { + "epoch": 6.4876341894279115, + "grad_norm": 0.1826610267162323, + "learning_rate": 1.893769533904063e-05, + "loss": 4.0039, + "step": 95485 + }, + { + "epoch": 6.4879739094985736, + "grad_norm": 0.16506855189800262, + "learning_rate": 1.893344883815736e-05, + "loss": 3.7856, + "step": 95490 + }, + { + "epoch": 6.488313629569235, + "grad_norm": 0.13932234048843384, + "learning_rate": 1.8929202337274087e-05, + "loss": 3.7621, + "step": 95495 + }, + { + "epoch": 6.488653349639897, + "grad_norm": 0.21610838174819946, + "learning_rate": 1.8924955836390815e-05, + "loss": 3.8838, + "step": 95500 + }, + { + "epoch": 6.488993069710559, + "grad_norm": 0.18289341032505035, + "learning_rate": 1.8920709335507543e-05, + "loss": 3.916, + "step": 95505 + }, + { + "epoch": 6.48933278978122, + "grad_norm": 0.1324312537908554, + "learning_rate": 1.891646283462427e-05, + "loss": 3.6529, + "step": 95510 + }, + { + "epoch": 6.489672509851882, + "grad_norm": 0.30381715297698975, + "learning_rate": 1.8912216333741e-05, + "loss": 3.9855, + "step": 95515 + }, + { + "epoch": 6.490012229922544, + "grad_norm": 0.15529781579971313, + "learning_rate": 1.8907969832857723e-05, + "loss": 4.0835, + "step": 95520 + }, + { + "epoch": 6.490351949993205, + "grad_norm": 0.19985775649547577, + "learning_rate": 1.8903723331974455e-05, + "loss": 3.5087, + "step": 95525 + }, + { + "epoch": 6.4906916700638675, + "grad_norm": 0.1506633758544922, + "learning_rate": 1.8899476831091183e-05, + "loss": 3.7995, + "step": 95530 + }, + { + "epoch": 6.491031390134529, + "grad_norm": 0.1824040561914444, + "learning_rate": 1.889523033020791e-05, + "loss": 3.8365, + "step": 95535 + }, + { + "epoch": 6.491371110205191, + "grad_norm": 1.934745192527771, + "learning_rate": 1.889098382932464e-05, + "loss": 3.9143, + "step": 95540 + }, + { + "epoch": 6.491710830275853, + "grad_norm": 0.7387930154800415, + "learning_rate": 1.8886737328441363e-05, + "loss": 3.8398, + "step": 95545 + }, + { + "epoch": 6.492050550346514, + "grad_norm": 0.17539241909980774, + "learning_rate": 1.8882490827558095e-05, + "loss": 3.7048, + "step": 95550 + }, + { + "epoch": 6.492390270417176, + "grad_norm": 0.15693652629852295, + "learning_rate": 1.8878244326674823e-05, + "loss": 3.7969, + "step": 95555 + }, + { + "epoch": 6.492729990487838, + "grad_norm": 0.17988130450248718, + "learning_rate": 1.8873997825791547e-05, + "loss": 3.8614, + "step": 95560 + }, + { + "epoch": 6.493069710558499, + "grad_norm": 0.2956083118915558, + "learning_rate": 1.886975132490828e-05, + "loss": 3.921, + "step": 95565 + }, + { + "epoch": 6.493409430629161, + "grad_norm": 0.16046001017093658, + "learning_rate": 1.8865504824025003e-05, + "loss": 3.7229, + "step": 95570 + }, + { + "epoch": 6.4937491506998235, + "grad_norm": 0.1700402796268463, + "learning_rate": 1.886125832314173e-05, + "loss": 3.9942, + "step": 95575 + }, + { + "epoch": 6.494088870770485, + "grad_norm": 0.19069141149520874, + "learning_rate": 1.885701182225846e-05, + "loss": 3.733, + "step": 95580 + }, + { + "epoch": 6.494428590841147, + "grad_norm": 0.15849915146827698, + "learning_rate": 1.8852765321375187e-05, + "loss": 3.8666, + "step": 95585 + }, + { + "epoch": 6.494768310911809, + "grad_norm": 0.21301564574241638, + "learning_rate": 1.8848518820491915e-05, + "loss": 3.7295, + "step": 95590 + }, + { + "epoch": 6.49510803098247, + "grad_norm": 0.15385816991329193, + "learning_rate": 1.8844272319608643e-05, + "loss": 3.74, + "step": 95595 + }, + { + "epoch": 6.495447751053132, + "grad_norm": 0.15013951063156128, + "learning_rate": 1.884002581872537e-05, + "loss": 3.6845, + "step": 95600 + }, + { + "epoch": 6.495787471123794, + "grad_norm": 0.2107752561569214, + "learning_rate": 1.88357793178421e-05, + "loss": 3.7343, + "step": 95605 + }, + { + "epoch": 6.496127191194455, + "grad_norm": 0.19248619675636292, + "learning_rate": 1.8831532816958827e-05, + "loss": 3.6611, + "step": 95610 + }, + { + "epoch": 6.496466911265117, + "grad_norm": 0.4626934826374054, + "learning_rate": 1.8827286316075555e-05, + "loss": 3.8932, + "step": 95615 + }, + { + "epoch": 6.4968066313357795, + "grad_norm": 0.2741275429725647, + "learning_rate": 1.8823039815192283e-05, + "loss": 3.8772, + "step": 95620 + }, + { + "epoch": 6.497146351406441, + "grad_norm": 0.15830780565738678, + "learning_rate": 1.881879331430901e-05, + "loss": 3.8779, + "step": 95625 + }, + { + "epoch": 6.497486071477103, + "grad_norm": 0.1704079508781433, + "learning_rate": 1.8814546813425736e-05, + "loss": 3.9672, + "step": 95630 + }, + { + "epoch": 6.497825791547765, + "grad_norm": 0.1479315459728241, + "learning_rate": 1.8810300312542467e-05, + "loss": 4.0258, + "step": 95635 + }, + { + "epoch": 6.498165511618426, + "grad_norm": 0.1666712760925293, + "learning_rate": 1.8806053811659195e-05, + "loss": 3.7133, + "step": 95640 + }, + { + "epoch": 6.498505231689088, + "grad_norm": 0.15956467390060425, + "learning_rate": 1.880180731077592e-05, + "loss": 3.8454, + "step": 95645 + }, + { + "epoch": 6.49884495175975, + "grad_norm": 0.4004557430744171, + "learning_rate": 1.879756080989265e-05, + "loss": 4.027, + "step": 95650 + }, + { + "epoch": 6.499184671830411, + "grad_norm": 0.18122249841690063, + "learning_rate": 1.8793314309009376e-05, + "loss": 3.9567, + "step": 95655 + }, + { + "epoch": 6.4995243919010735, + "grad_norm": 0.20528922975063324, + "learning_rate": 1.8789067808126104e-05, + "loss": 3.7817, + "step": 95660 + }, + { + "epoch": 6.4998641119717355, + "grad_norm": 0.1925683468580246, + "learning_rate": 1.8784821307242832e-05, + "loss": 4.044, + "step": 95665 + }, + { + "epoch": 6.500203832042397, + "grad_norm": 0.3275878131389618, + "learning_rate": 1.878057480635956e-05, + "loss": 3.9057, + "step": 95670 + }, + { + "epoch": 6.500543552113059, + "grad_norm": 0.8828839063644409, + "learning_rate": 1.8776328305476288e-05, + "loss": 3.8704, + "step": 95675 + }, + { + "epoch": 6.500883272183721, + "grad_norm": 0.30123358964920044, + "learning_rate": 1.8772081804593016e-05, + "loss": 3.6622, + "step": 95680 + }, + { + "epoch": 6.501222992254382, + "grad_norm": 1.1880029439926147, + "learning_rate": 1.8767835303709744e-05, + "loss": 3.6409, + "step": 95685 + }, + { + "epoch": 6.501562712325044, + "grad_norm": 0.18887469172477722, + "learning_rate": 1.8763588802826472e-05, + "loss": 3.9548, + "step": 95690 + }, + { + "epoch": 6.501902432395706, + "grad_norm": 0.1540897786617279, + "learning_rate": 1.87593423019432e-05, + "loss": 3.9085, + "step": 95695 + }, + { + "epoch": 6.502242152466367, + "grad_norm": 0.1641250103712082, + "learning_rate": 1.8755095801059928e-05, + "loss": 3.864, + "step": 95700 + }, + { + "epoch": 6.5025818725370295, + "grad_norm": 0.8485110402107239, + "learning_rate": 1.8750849300176656e-05, + "loss": 3.8513, + "step": 95705 + }, + { + "epoch": 6.5029215926076915, + "grad_norm": 0.1782754808664322, + "learning_rate": 1.8746602799293384e-05, + "loss": 3.8305, + "step": 95710 + }, + { + "epoch": 6.503261312678353, + "grad_norm": 0.21701624989509583, + "learning_rate": 1.8743205598586767e-05, + "loss": 3.736, + "step": 95715 + }, + { + "epoch": 6.503601032749015, + "grad_norm": 0.20825089514255524, + "learning_rate": 1.8738959097703495e-05, + "loss": 3.7909, + "step": 95720 + }, + { + "epoch": 6.503940752819677, + "grad_norm": 0.29183247685432434, + "learning_rate": 1.873471259682022e-05, + "loss": 3.9841, + "step": 95725 + }, + { + "epoch": 6.504280472890338, + "grad_norm": 0.21175511181354523, + "learning_rate": 1.873046609593695e-05, + "loss": 3.8781, + "step": 95730 + }, + { + "epoch": 6.504620192961, + "grad_norm": 0.13604533672332764, + "learning_rate": 1.8726219595053675e-05, + "loss": 3.7161, + "step": 95735 + }, + { + "epoch": 6.504959913031662, + "grad_norm": 0.1663418710231781, + "learning_rate": 1.8721973094170407e-05, + "loss": 3.8834, + "step": 95740 + }, + { + "epoch": 6.505299633102323, + "grad_norm": 0.13429677486419678, + "learning_rate": 1.871772659328713e-05, + "loss": 3.8583, + "step": 95745 + }, + { + "epoch": 6.5056393531729855, + "grad_norm": 0.22769783437252045, + "learning_rate": 1.871348009240386e-05, + "loss": 3.741, + "step": 95750 + }, + { + "epoch": 6.5059790732436475, + "grad_norm": 0.2089589387178421, + "learning_rate": 1.870923359152059e-05, + "loss": 3.8246, + "step": 95755 + }, + { + "epoch": 6.506318793314309, + "grad_norm": 0.15722361207008362, + "learning_rate": 1.8704987090637315e-05, + "loss": 3.8317, + "step": 95760 + }, + { + "epoch": 6.506658513384971, + "grad_norm": 0.1667608767747879, + "learning_rate": 1.8700740589754043e-05, + "loss": 3.7777, + "step": 95765 + }, + { + "epoch": 6.506998233455633, + "grad_norm": 0.15930497646331787, + "learning_rate": 1.869649408887077e-05, + "loss": 4.029, + "step": 95770 + }, + { + "epoch": 6.507337953526294, + "grad_norm": 0.17493510246276855, + "learning_rate": 1.86922475879875e-05, + "loss": 3.8018, + "step": 95775 + }, + { + "epoch": 6.507677673596956, + "grad_norm": 0.2053455114364624, + "learning_rate": 1.8688001087104227e-05, + "loss": 4.0037, + "step": 95780 + }, + { + "epoch": 6.508017393667618, + "grad_norm": 0.5102881193161011, + "learning_rate": 1.8683754586220955e-05, + "loss": 4.0349, + "step": 95785 + }, + { + "epoch": 6.508357113738279, + "grad_norm": 0.36412572860717773, + "learning_rate": 1.8679508085337683e-05, + "loss": 3.9433, + "step": 95790 + }, + { + "epoch": 6.5086968338089415, + "grad_norm": 0.236044242978096, + "learning_rate": 1.8675261584454408e-05, + "loss": 3.8397, + "step": 95795 + }, + { + "epoch": 6.509036553879604, + "grad_norm": 0.1589801013469696, + "learning_rate": 1.867101508357114e-05, + "loss": 3.8062, + "step": 95800 + }, + { + "epoch": 6.509376273950265, + "grad_norm": 0.18341264128684998, + "learning_rate": 1.8666768582687867e-05, + "loss": 3.7665, + "step": 95805 + }, + { + "epoch": 6.509715994020927, + "grad_norm": 0.19275611639022827, + "learning_rate": 1.8662522081804592e-05, + "loss": 3.876, + "step": 95810 + }, + { + "epoch": 6.510055714091589, + "grad_norm": 0.1430159956216812, + "learning_rate": 1.8658275580921323e-05, + "loss": 3.8001, + "step": 95815 + }, + { + "epoch": 6.51039543416225, + "grad_norm": 0.18196040391921997, + "learning_rate": 1.8654029080038048e-05, + "loss": 4.0117, + "step": 95820 + }, + { + "epoch": 6.510735154232912, + "grad_norm": 0.4896487593650818, + "learning_rate": 1.864978257915478e-05, + "loss": 3.6883, + "step": 95825 + }, + { + "epoch": 6.511074874303574, + "grad_norm": 2.869795322418213, + "learning_rate": 1.8645536078271504e-05, + "loss": 3.8163, + "step": 95830 + }, + { + "epoch": 6.511414594374235, + "grad_norm": 0.1474589705467224, + "learning_rate": 1.8641289577388232e-05, + "loss": 3.6775, + "step": 95835 + }, + { + "epoch": 6.5117543144448975, + "grad_norm": 0.21876725554466248, + "learning_rate": 1.8637043076504963e-05, + "loss": 3.7169, + "step": 95840 + }, + { + "epoch": 6.51209403451556, + "grad_norm": 0.13106997311115265, + "learning_rate": 1.8632796575621688e-05, + "loss": 3.8514, + "step": 95845 + }, + { + "epoch": 6.512433754586221, + "grad_norm": 0.25837814807891846, + "learning_rate": 1.8628550074738416e-05, + "loss": 4.0131, + "step": 95850 + }, + { + "epoch": 6.512773474656883, + "grad_norm": 0.1991213709115982, + "learning_rate": 1.8624303573855144e-05, + "loss": 3.8511, + "step": 95855 + }, + { + "epoch": 6.513113194727545, + "grad_norm": 0.30214524269104004, + "learning_rate": 1.8620057072971872e-05, + "loss": 3.8253, + "step": 95860 + }, + { + "epoch": 6.513452914798206, + "grad_norm": 0.4728465974330902, + "learning_rate": 1.86158105720886e-05, + "loss": 3.7007, + "step": 95865 + }, + { + "epoch": 6.513792634868868, + "grad_norm": 0.20678496360778809, + "learning_rate": 1.8611564071205328e-05, + "loss": 3.8578, + "step": 95870 + }, + { + "epoch": 6.51413235493953, + "grad_norm": 0.18153290450572968, + "learning_rate": 1.8607317570322056e-05, + "loss": 3.746, + "step": 95875 + }, + { + "epoch": 6.514472075010191, + "grad_norm": 0.22687487304210663, + "learning_rate": 1.860307106943878e-05, + "loss": 3.9336, + "step": 95880 + }, + { + "epoch": 6.5148117950808535, + "grad_norm": 0.414934903383255, + "learning_rate": 1.8598824568555512e-05, + "loss": 3.6041, + "step": 95885 + }, + { + "epoch": 6.515151515151516, + "grad_norm": 0.17601460218429565, + "learning_rate": 1.859457806767224e-05, + "loss": 3.9119, + "step": 95890 + }, + { + "epoch": 6.515491235222177, + "grad_norm": 0.21162767708301544, + "learning_rate": 1.8590331566788965e-05, + "loss": 3.76, + "step": 95895 + }, + { + "epoch": 6.515830955292839, + "grad_norm": 0.16137662529945374, + "learning_rate": 1.8586085065905696e-05, + "loss": 3.9511, + "step": 95900 + }, + { + "epoch": 6.516170675363501, + "grad_norm": 0.1944965422153473, + "learning_rate": 1.858183856502242e-05, + "loss": 3.8504, + "step": 95905 + }, + { + "epoch": 6.516510395434162, + "grad_norm": 0.21177659928798676, + "learning_rate": 1.8577592064139152e-05, + "loss": 4.0242, + "step": 95910 + }, + { + "epoch": 6.516850115504824, + "grad_norm": 0.1606612205505371, + "learning_rate": 1.8573345563255877e-05, + "loss": 4.0523, + "step": 95915 + }, + { + "epoch": 6.517189835575486, + "grad_norm": 0.1607521027326584, + "learning_rate": 1.8569099062372605e-05, + "loss": 3.9597, + "step": 95920 + }, + { + "epoch": 6.517529555646147, + "grad_norm": 0.16791512072086334, + "learning_rate": 1.8564852561489336e-05, + "loss": 3.7457, + "step": 95925 + }, + { + "epoch": 6.5178692757168095, + "grad_norm": 0.168924942612648, + "learning_rate": 1.856060606060606e-05, + "loss": 3.7008, + "step": 95930 + }, + { + "epoch": 6.518208995787472, + "grad_norm": 0.5294444561004639, + "learning_rate": 1.855635955972279e-05, + "loss": 3.9742, + "step": 95935 + }, + { + "epoch": 6.518548715858133, + "grad_norm": 0.22340723872184753, + "learning_rate": 1.8552113058839517e-05, + "loss": 3.919, + "step": 95940 + }, + { + "epoch": 6.518888435928795, + "grad_norm": 0.16849008202552795, + "learning_rate": 1.8547866557956245e-05, + "loss": 4.1028, + "step": 95945 + }, + { + "epoch": 6.519228155999457, + "grad_norm": 0.2639058530330658, + "learning_rate": 1.8543620057072973e-05, + "loss": 3.9109, + "step": 95950 + }, + { + "epoch": 6.519567876070118, + "grad_norm": 0.16760332882404327, + "learning_rate": 1.85393735561897e-05, + "loss": 3.8328, + "step": 95955 + }, + { + "epoch": 6.51990759614078, + "grad_norm": 0.1692647486925125, + "learning_rate": 1.853512705530643e-05, + "loss": 3.7115, + "step": 95960 + }, + { + "epoch": 6.520247316211442, + "grad_norm": 0.1909148395061493, + "learning_rate": 1.8530880554423153e-05, + "loss": 3.5259, + "step": 95965 + }, + { + "epoch": 6.5205870362821035, + "grad_norm": 0.1737062931060791, + "learning_rate": 1.8526634053539885e-05, + "loss": 3.9838, + "step": 95970 + }, + { + "epoch": 6.5209267563527655, + "grad_norm": 0.2069055438041687, + "learning_rate": 1.8522387552656613e-05, + "loss": 3.6835, + "step": 95975 + }, + { + "epoch": 6.521266476423427, + "grad_norm": 0.20388922095298767, + "learning_rate": 1.8518141051773337e-05, + "loss": 3.8017, + "step": 95980 + }, + { + "epoch": 6.521606196494089, + "grad_norm": 0.13851481676101685, + "learning_rate": 1.851389455089007e-05, + "loss": 3.7249, + "step": 95985 + }, + { + "epoch": 6.521945916564751, + "grad_norm": 0.17637024819850922, + "learning_rate": 1.8509648050006793e-05, + "loss": 3.6492, + "step": 95990 + }, + { + "epoch": 6.522285636635412, + "grad_norm": 0.1835785061120987, + "learning_rate": 1.8505401549123525e-05, + "loss": 3.9739, + "step": 95995 + }, + { + "epoch": 6.522625356706074, + "grad_norm": 0.139165461063385, + "learning_rate": 1.8501155048240253e-05, + "loss": 3.8426, + "step": 96000 + }, + { + "epoch": 6.522965076776736, + "grad_norm": 0.13621032238006592, + "learning_rate": 1.8496908547356977e-05, + "loss": 3.6655, + "step": 96005 + }, + { + "epoch": 6.523304796847397, + "grad_norm": 0.16164997220039368, + "learning_rate": 1.849266204647371e-05, + "loss": 3.7167, + "step": 96010 + }, + { + "epoch": 6.5236445169180595, + "grad_norm": 0.16861897706985474, + "learning_rate": 1.8488415545590433e-05, + "loss": 3.8995, + "step": 96015 + }, + { + "epoch": 6.5239842369887215, + "grad_norm": 1.4083784818649292, + "learning_rate": 1.848416904470716e-05, + "loss": 3.775, + "step": 96020 + }, + { + "epoch": 6.524323957059383, + "grad_norm": 0.18547719717025757, + "learning_rate": 1.847992254382389e-05, + "loss": 4.0302, + "step": 96025 + }, + { + "epoch": 6.524663677130045, + "grad_norm": 0.18466775119304657, + "learning_rate": 1.8475676042940617e-05, + "loss": 3.8506, + "step": 96030 + }, + { + "epoch": 6.525003397200707, + "grad_norm": 0.19464446604251862, + "learning_rate": 1.8471429542057345e-05, + "loss": 3.7497, + "step": 96035 + }, + { + "epoch": 6.525343117271368, + "grad_norm": 0.18643411993980408, + "learning_rate": 1.8467183041174074e-05, + "loss": 3.8043, + "step": 96040 + }, + { + "epoch": 6.52568283734203, + "grad_norm": 0.1941070407629013, + "learning_rate": 1.84629365402908e-05, + "loss": 4.0413, + "step": 96045 + }, + { + "epoch": 6.526022557412692, + "grad_norm": 0.20546768605709076, + "learning_rate": 1.845869003940753e-05, + "loss": 3.8506, + "step": 96050 + }, + { + "epoch": 6.526362277483353, + "grad_norm": 0.18811355531215668, + "learning_rate": 1.8454443538524258e-05, + "loss": 3.9657, + "step": 96055 + }, + { + "epoch": 6.5267019975540155, + "grad_norm": 0.17089968919754028, + "learning_rate": 1.8450197037640986e-05, + "loss": 3.8819, + "step": 96060 + }, + { + "epoch": 6.5270417176246776, + "grad_norm": 0.16905099153518677, + "learning_rate": 1.844595053675771e-05, + "loss": 3.7042, + "step": 96065 + }, + { + "epoch": 6.527381437695339, + "grad_norm": 0.17243491113185883, + "learning_rate": 1.844170403587444e-05, + "loss": 3.8618, + "step": 96070 + }, + { + "epoch": 6.527721157766001, + "grad_norm": 0.1504015028476715, + "learning_rate": 1.8437457534991166e-05, + "loss": 3.7391, + "step": 96075 + }, + { + "epoch": 6.528060877836663, + "grad_norm": 0.2222045511007309, + "learning_rate": 1.8433211034107898e-05, + "loss": 4.1475, + "step": 96080 + }, + { + "epoch": 6.528400597907324, + "grad_norm": 0.14192596077919006, + "learning_rate": 1.8428964533224626e-05, + "loss": 3.9679, + "step": 96085 + }, + { + "epoch": 6.528740317977986, + "grad_norm": 0.1480465531349182, + "learning_rate": 1.842471803234135e-05, + "loss": 3.6978, + "step": 96090 + }, + { + "epoch": 6.529080038048648, + "grad_norm": 0.18209514021873474, + "learning_rate": 1.842047153145808e-05, + "loss": 3.8195, + "step": 96095 + }, + { + "epoch": 6.529419758119309, + "grad_norm": 0.1791825294494629, + "learning_rate": 1.8416225030574806e-05, + "loss": 3.7785, + "step": 96100 + }, + { + "epoch": 6.5297594781899715, + "grad_norm": 0.1438511312007904, + "learning_rate": 1.8411978529691534e-05, + "loss": 3.7554, + "step": 96105 + }, + { + "epoch": 6.530099198260634, + "grad_norm": 0.22955839335918427, + "learning_rate": 1.8407732028808262e-05, + "loss": 4.0654, + "step": 96110 + }, + { + "epoch": 6.530438918331295, + "grad_norm": 0.17777115106582642, + "learning_rate": 1.840348552792499e-05, + "loss": 3.9531, + "step": 96115 + }, + { + "epoch": 6.530778638401957, + "grad_norm": 0.1475006639957428, + "learning_rate": 1.8399239027041718e-05, + "loss": 3.8193, + "step": 96120 + }, + { + "epoch": 6.531118358472619, + "grad_norm": 0.1806480586528778, + "learning_rate": 1.8394992526158446e-05, + "loss": 3.8446, + "step": 96125 + }, + { + "epoch": 6.53145807854328, + "grad_norm": 0.17443044483661652, + "learning_rate": 1.8390746025275174e-05, + "loss": 3.8487, + "step": 96130 + }, + { + "epoch": 6.531797798613942, + "grad_norm": 0.37798863649368286, + "learning_rate": 1.8386499524391902e-05, + "loss": 4.1346, + "step": 96135 + }, + { + "epoch": 6.532137518684604, + "grad_norm": 0.3693869411945343, + "learning_rate": 1.838225302350863e-05, + "loss": 3.5984, + "step": 96140 + }, + { + "epoch": 6.532477238755265, + "grad_norm": 0.19610044360160828, + "learning_rate": 1.8378006522625358e-05, + "loss": 3.938, + "step": 96145 + }, + { + "epoch": 6.5328169588259275, + "grad_norm": 0.1771986037492752, + "learning_rate": 1.8373760021742083e-05, + "loss": 3.8064, + "step": 96150 + }, + { + "epoch": 6.53315667889659, + "grad_norm": 1.0706857442855835, + "learning_rate": 1.8369513520858814e-05, + "loss": 3.7103, + "step": 96155 + }, + { + "epoch": 6.533496398967251, + "grad_norm": 0.14584733545780182, + "learning_rate": 1.836526701997554e-05, + "loss": 3.7942, + "step": 96160 + }, + { + "epoch": 6.533836119037913, + "grad_norm": 0.13310891389846802, + "learning_rate": 1.836102051909227e-05, + "loss": 3.8971, + "step": 96165 + }, + { + "epoch": 6.534175839108575, + "grad_norm": 0.1784142106771469, + "learning_rate": 1.8356774018209e-05, + "loss": 3.7479, + "step": 96170 + }, + { + "epoch": 6.534515559179236, + "grad_norm": 0.13709574937820435, + "learning_rate": 1.8352527517325723e-05, + "loss": 4.0257, + "step": 96175 + }, + { + "epoch": 6.534855279249898, + "grad_norm": 0.16840232908725739, + "learning_rate": 1.8348281016442454e-05, + "loss": 3.7194, + "step": 96180 + }, + { + "epoch": 6.535194999320559, + "grad_norm": 0.1617039293050766, + "learning_rate": 1.834403451555918e-05, + "loss": 4.0015, + "step": 96185 + }, + { + "epoch": 6.535534719391221, + "grad_norm": 0.17104431986808777, + "learning_rate": 1.8339788014675907e-05, + "loss": 3.9259, + "step": 96190 + }, + { + "epoch": 6.5358744394618835, + "grad_norm": 0.19889232516288757, + "learning_rate": 1.833554151379264e-05, + "loss": 3.7245, + "step": 96195 + }, + { + "epoch": 6.536214159532545, + "grad_norm": 0.21487395465373993, + "learning_rate": 1.8331295012909363e-05, + "loss": 3.868, + "step": 96200 + }, + { + "epoch": 6.536553879603207, + "grad_norm": 0.26261651515960693, + "learning_rate": 1.832704851202609e-05, + "loss": 4.0099, + "step": 96205 + }, + { + "epoch": 6.536893599673869, + "grad_norm": 0.18112820386886597, + "learning_rate": 1.832280201114282e-05, + "loss": 3.8555, + "step": 96210 + }, + { + "epoch": 6.53723331974453, + "grad_norm": 0.18381178379058838, + "learning_rate": 1.8318555510259547e-05, + "loss": 3.8679, + "step": 96215 + }, + { + "epoch": 6.537573039815192, + "grad_norm": 0.1898992359638214, + "learning_rate": 1.8314309009376275e-05, + "loss": 3.8037, + "step": 96220 + }, + { + "epoch": 6.537912759885854, + "grad_norm": 0.27148932218551636, + "learning_rate": 1.8310062508493003e-05, + "loss": 3.6579, + "step": 96225 + }, + { + "epoch": 6.538252479956515, + "grad_norm": 0.18093156814575195, + "learning_rate": 1.830581600760973e-05, + "loss": 3.5416, + "step": 96230 + }, + { + "epoch": 6.5385922000271774, + "grad_norm": 0.1780756115913391, + "learning_rate": 1.8301569506726456e-05, + "loss": 3.8061, + "step": 96235 + }, + { + "epoch": 6.5389319200978395, + "grad_norm": 0.17538075149059296, + "learning_rate": 1.8297323005843187e-05, + "loss": 3.7543, + "step": 96240 + }, + { + "epoch": 6.539271640168501, + "grad_norm": 0.15785425901412964, + "learning_rate": 1.8293076504959915e-05, + "loss": 3.9501, + "step": 96245 + }, + { + "epoch": 6.539611360239163, + "grad_norm": 0.1517554372549057, + "learning_rate": 1.8288830004076643e-05, + "loss": 3.6432, + "step": 96250 + }, + { + "epoch": 6.539951080309825, + "grad_norm": 0.600372314453125, + "learning_rate": 1.828458350319337e-05, + "loss": 3.6967, + "step": 96255 + }, + { + "epoch": 6.540290800380486, + "grad_norm": 0.17378616333007812, + "learning_rate": 1.8280337002310096e-05, + "loss": 3.8914, + "step": 96260 + }, + { + "epoch": 6.540630520451148, + "grad_norm": 0.15895268321037292, + "learning_rate": 1.8276090501426827e-05, + "loss": 3.901, + "step": 96265 + }, + { + "epoch": 6.54097024052181, + "grad_norm": 0.14486855268478394, + "learning_rate": 1.8271844000543552e-05, + "loss": 3.8864, + "step": 96270 + }, + { + "epoch": 6.541309960592471, + "grad_norm": 0.17978811264038086, + "learning_rate": 1.826759749966028e-05, + "loss": 3.8935, + "step": 96275 + }, + { + "epoch": 6.5416496806631335, + "grad_norm": 0.3788725733757019, + "learning_rate": 1.826335099877701e-05, + "loss": 3.5185, + "step": 96280 + }, + { + "epoch": 6.5419894007337955, + "grad_norm": 0.141926571726799, + "learning_rate": 1.8259104497893736e-05, + "loss": 3.7066, + "step": 96285 + }, + { + "epoch": 6.542329120804457, + "grad_norm": 0.15537838637828827, + "learning_rate": 1.8254857997010464e-05, + "loss": 3.9546, + "step": 96290 + }, + { + "epoch": 6.542668840875119, + "grad_norm": 0.6585256457328796, + "learning_rate": 1.8250611496127192e-05, + "loss": 3.9725, + "step": 96295 + }, + { + "epoch": 6.543008560945781, + "grad_norm": 0.14467379450798035, + "learning_rate": 1.824636499524392e-05, + "loss": 3.8393, + "step": 96300 + }, + { + "epoch": 6.543348281016442, + "grad_norm": 0.1690262407064438, + "learning_rate": 1.8242118494360648e-05, + "loss": 3.7936, + "step": 96305 + }, + { + "epoch": 6.543688001087104, + "grad_norm": 0.26143091917037964, + "learning_rate": 1.8237871993477376e-05, + "loss": 3.6816, + "step": 96310 + }, + { + "epoch": 6.544027721157766, + "grad_norm": 0.30441492795944214, + "learning_rate": 1.8233625492594104e-05, + "loss": 4.1816, + "step": 96315 + }, + { + "epoch": 6.544367441228427, + "grad_norm": 0.2044161856174469, + "learning_rate": 1.822937899171083e-05, + "loss": 3.6994, + "step": 96320 + }, + { + "epoch": 6.5447071612990895, + "grad_norm": 0.23724329471588135, + "learning_rate": 1.822513249082756e-05, + "loss": 3.748, + "step": 96325 + }, + { + "epoch": 6.5450468813697515, + "grad_norm": 0.16412898898124695, + "learning_rate": 1.8220885989944288e-05, + "loss": 3.606, + "step": 96330 + }, + { + "epoch": 6.545386601440413, + "grad_norm": 0.17068737745285034, + "learning_rate": 1.8216639489061016e-05, + "loss": 3.9744, + "step": 96335 + }, + { + "epoch": 6.545726321511075, + "grad_norm": 0.1352180540561676, + "learning_rate": 1.8212392988177744e-05, + "loss": 3.9992, + "step": 96340 + }, + { + "epoch": 6.546066041581737, + "grad_norm": 0.794555127620697, + "learning_rate": 1.820814648729447e-05, + "loss": 3.7398, + "step": 96345 + }, + { + "epoch": 6.546405761652398, + "grad_norm": 0.1683708131313324, + "learning_rate": 1.82038999864112e-05, + "loss": 3.7309, + "step": 96350 + }, + { + "epoch": 6.54674548172306, + "grad_norm": 0.18030716478824615, + "learning_rate": 1.8199653485527924e-05, + "loss": 3.8048, + "step": 96355 + }, + { + "epoch": 6.547085201793722, + "grad_norm": 0.22083798050880432, + "learning_rate": 1.8195406984644652e-05, + "loss": 3.7158, + "step": 96360 + }, + { + "epoch": 6.547424921864383, + "grad_norm": 0.150190070271492, + "learning_rate": 1.8191160483761384e-05, + "loss": 3.8309, + "step": 96365 + }, + { + "epoch": 6.5477646419350455, + "grad_norm": 0.15507127344608307, + "learning_rate": 1.818691398287811e-05, + "loss": 3.8922, + "step": 96370 + }, + { + "epoch": 6.548104362005708, + "grad_norm": 0.21915094554424286, + "learning_rate": 1.8182667481994836e-05, + "loss": 3.7062, + "step": 96375 + }, + { + "epoch": 6.548444082076369, + "grad_norm": 0.20653775334358215, + "learning_rate": 1.8178420981111564e-05, + "loss": 3.6938, + "step": 96380 + }, + { + "epoch": 6.548783802147031, + "grad_norm": 0.18785478174686432, + "learning_rate": 1.8174174480228292e-05, + "loss": 3.8445, + "step": 96385 + }, + { + "epoch": 6.549123522217693, + "grad_norm": 0.17811495065689087, + "learning_rate": 1.816992797934502e-05, + "loss": 3.8088, + "step": 96390 + }, + { + "epoch": 6.549463242288354, + "grad_norm": 0.1534394919872284, + "learning_rate": 1.816568147846175e-05, + "loss": 3.9017, + "step": 96395 + }, + { + "epoch": 6.549802962359016, + "grad_norm": 0.15611062943935394, + "learning_rate": 1.8161434977578477e-05, + "loss": 3.8726, + "step": 96400 + }, + { + "epoch": 6.550142682429678, + "grad_norm": 0.3975841701030731, + "learning_rate": 1.81571884766952e-05, + "loss": 3.7114, + "step": 96405 + }, + { + "epoch": 6.550482402500339, + "grad_norm": 0.1694961041212082, + "learning_rate": 1.8152941975811933e-05, + "loss": 3.8839, + "step": 96410 + }, + { + "epoch": 6.5508221225710015, + "grad_norm": 0.1759660542011261, + "learning_rate": 1.814869547492866e-05, + "loss": 3.9257, + "step": 96415 + }, + { + "epoch": 6.551161842641664, + "grad_norm": 0.16331449151039124, + "learning_rate": 1.814444897404539e-05, + "loss": 3.7295, + "step": 96420 + }, + { + "epoch": 6.551501562712325, + "grad_norm": 0.178498312830925, + "learning_rate": 1.8140202473162117e-05, + "loss": 4.0138, + "step": 96425 + }, + { + "epoch": 6.551841282782987, + "grad_norm": 0.1370563805103302, + "learning_rate": 1.813595597227884e-05, + "loss": 3.9356, + "step": 96430 + }, + { + "epoch": 6.552181002853649, + "grad_norm": 0.18313068151474, + "learning_rate": 1.8131709471395573e-05, + "loss": 3.8691, + "step": 96435 + }, + { + "epoch": 6.55252072292431, + "grad_norm": 0.1343451589345932, + "learning_rate": 1.81274629705123e-05, + "loss": 3.7037, + "step": 96440 + }, + { + "epoch": 6.552860442994972, + "grad_norm": 0.21508856117725372, + "learning_rate": 1.8123216469629025e-05, + "loss": 3.737, + "step": 96445 + }, + { + "epoch": 6.553200163065634, + "grad_norm": 0.3519876003265381, + "learning_rate": 1.8118969968745757e-05, + "loss": 3.9202, + "step": 96450 + }, + { + "epoch": 6.553539883136295, + "grad_norm": 0.20559538900852203, + "learning_rate": 1.811472346786248e-05, + "loss": 3.8348, + "step": 96455 + }, + { + "epoch": 6.5538796032069575, + "grad_norm": 0.14597859978675842, + "learning_rate": 1.811047696697921e-05, + "loss": 3.6234, + "step": 96460 + }, + { + "epoch": 6.55421932327762, + "grad_norm": 0.15007546544075012, + "learning_rate": 1.8106230466095937e-05, + "loss": 3.6484, + "step": 96465 + }, + { + "epoch": 6.554559043348281, + "grad_norm": 0.13037945330142975, + "learning_rate": 1.8101983965212665e-05, + "loss": 3.992, + "step": 96470 + }, + { + "epoch": 6.554898763418943, + "grad_norm": 0.19441309571266174, + "learning_rate": 1.8097737464329393e-05, + "loss": 3.9163, + "step": 96475 + }, + { + "epoch": 6.555238483489605, + "grad_norm": 0.19569353759288788, + "learning_rate": 1.809349096344612e-05, + "loss": 3.7562, + "step": 96480 + }, + { + "epoch": 6.555578203560266, + "grad_norm": 0.15467879176139832, + "learning_rate": 1.808924446256285e-05, + "loss": 3.7371, + "step": 96485 + }, + { + "epoch": 6.555917923630928, + "grad_norm": 0.172559455037117, + "learning_rate": 1.8084997961679577e-05, + "loss": 3.8497, + "step": 96490 + }, + { + "epoch": 6.55625764370159, + "grad_norm": 0.23012076318264008, + "learning_rate": 1.8080751460796305e-05, + "loss": 3.4471, + "step": 96495 + }, + { + "epoch": 6.556597363772251, + "grad_norm": 0.19678345322608948, + "learning_rate": 1.8076504959913033e-05, + "loss": 3.8742, + "step": 96500 + }, + { + "epoch": 6.5569370838429135, + "grad_norm": 0.1513092964887619, + "learning_rate": 1.807225845902976e-05, + "loss": 3.5571, + "step": 96505 + }, + { + "epoch": 6.557276803913576, + "grad_norm": 0.17520038783550262, + "learning_rate": 1.806801195814649e-05, + "loss": 3.8302, + "step": 96510 + }, + { + "epoch": 6.557616523984237, + "grad_norm": 0.15582233667373657, + "learning_rate": 1.8063765457263214e-05, + "loss": 4.0205, + "step": 96515 + }, + { + "epoch": 6.557956244054899, + "grad_norm": 0.17414963245391846, + "learning_rate": 1.8059518956379945e-05, + "loss": 3.7778, + "step": 96520 + }, + { + "epoch": 6.558295964125561, + "grad_norm": 0.21993929147720337, + "learning_rate": 1.8055272455496673e-05, + "loss": 3.8173, + "step": 96525 + }, + { + "epoch": 6.558635684196222, + "grad_norm": 0.22587159276008606, + "learning_rate": 1.8051025954613398e-05, + "loss": 3.9814, + "step": 96530 + }, + { + "epoch": 6.558975404266884, + "grad_norm": 0.21768635511398315, + "learning_rate": 1.804677945373013e-05, + "loss": 4.037, + "step": 96535 + }, + { + "epoch": 6.559315124337546, + "grad_norm": 0.15064972639083862, + "learning_rate": 1.8042532952846854e-05, + "loss": 3.7118, + "step": 96540 + }, + { + "epoch": 6.5596548444082075, + "grad_norm": 0.3494754433631897, + "learning_rate": 1.8038286451963582e-05, + "loss": 3.8292, + "step": 96545 + }, + { + "epoch": 6.5599945644788695, + "grad_norm": 0.18573643267154694, + "learning_rate": 1.803403995108031e-05, + "loss": 3.9834, + "step": 96550 + }, + { + "epoch": 6.560334284549532, + "grad_norm": 0.22362171113491058, + "learning_rate": 1.8029793450197038e-05, + "loss": 3.6681, + "step": 96555 + }, + { + "epoch": 6.560674004620193, + "grad_norm": 0.16717174649238586, + "learning_rate": 1.8025546949313766e-05, + "loss": 3.8914, + "step": 96560 + }, + { + "epoch": 6.561013724690855, + "grad_norm": 0.16959096491336823, + "learning_rate": 1.8021300448430494e-05, + "loss": 3.7808, + "step": 96565 + }, + { + "epoch": 6.561353444761517, + "grad_norm": 0.2919604778289795, + "learning_rate": 1.8017053947547222e-05, + "loss": 3.5517, + "step": 96570 + }, + { + "epoch": 6.561693164832178, + "grad_norm": 0.2547197937965393, + "learning_rate": 1.801280744666395e-05, + "loss": 3.7442, + "step": 96575 + }, + { + "epoch": 6.56203288490284, + "grad_norm": 0.13746333122253418, + "learning_rate": 1.8008560945780678e-05, + "loss": 3.8338, + "step": 96580 + }, + { + "epoch": 6.562372604973502, + "grad_norm": 0.21307919919490814, + "learning_rate": 1.8004314444897406e-05, + "loss": 3.7712, + "step": 96585 + }, + { + "epoch": 6.5627123250441635, + "grad_norm": 0.1500844955444336, + "learning_rate": 1.8000067944014134e-05, + "loss": 3.6142, + "step": 96590 + }, + { + "epoch": 6.5630520451148255, + "grad_norm": 0.1576290726661682, + "learning_rate": 1.7995821443130862e-05, + "loss": 3.744, + "step": 96595 + }, + { + "epoch": 6.563391765185488, + "grad_norm": 0.18999429047107697, + "learning_rate": 1.7991574942247587e-05, + "loss": 3.8233, + "step": 96600 + }, + { + "epoch": 6.563731485256149, + "grad_norm": 0.17300406098365784, + "learning_rate": 1.7987328441364318e-05, + "loss": 3.8496, + "step": 96605 + }, + { + "epoch": 6.564071205326811, + "grad_norm": 0.14762334525585175, + "learning_rate": 1.7983081940481046e-05, + "loss": 3.886, + "step": 96610 + }, + { + "epoch": 6.564410925397473, + "grad_norm": 0.15950334072113037, + "learning_rate": 1.797883543959777e-05, + "loss": 3.9699, + "step": 96615 + }, + { + "epoch": 6.564750645468134, + "grad_norm": 0.15003702044487, + "learning_rate": 1.7974588938714502e-05, + "loss": 3.6817, + "step": 96620 + }, + { + "epoch": 6.565090365538796, + "grad_norm": 0.20657651126384735, + "learning_rate": 1.7970342437831227e-05, + "loss": 3.8447, + "step": 96625 + }, + { + "epoch": 6.565430085609458, + "grad_norm": 0.14195133745670319, + "learning_rate": 1.7966095936947955e-05, + "loss": 3.6846, + "step": 96630 + }, + { + "epoch": 6.5657698056801195, + "grad_norm": 0.16313178837299347, + "learning_rate": 1.7961849436064683e-05, + "loss": 3.7995, + "step": 96635 + }, + { + "epoch": 6.5661095257507816, + "grad_norm": 0.1819685846567154, + "learning_rate": 1.795760293518141e-05, + "loss": 3.5376, + "step": 96640 + }, + { + "epoch": 6.566449245821444, + "grad_norm": 0.19530221819877625, + "learning_rate": 1.795335643429814e-05, + "loss": 3.7765, + "step": 96645 + }, + { + "epoch": 6.566788965892105, + "grad_norm": 0.18244881927967072, + "learning_rate": 1.7949109933414867e-05, + "loss": 3.9901, + "step": 96650 + }, + { + "epoch": 6.567128685962767, + "grad_norm": 0.18964256346225739, + "learning_rate": 1.7944863432531595e-05, + "loss": 3.9865, + "step": 96655 + }, + { + "epoch": 6.567468406033428, + "grad_norm": 0.16840697824954987, + "learning_rate": 1.7940616931648323e-05, + "loss": 3.9005, + "step": 96660 + }, + { + "epoch": 6.56780812610409, + "grad_norm": 0.20791028439998627, + "learning_rate": 1.793637043076505e-05, + "loss": 4.0182, + "step": 96665 + }, + { + "epoch": 6.568147846174752, + "grad_norm": 0.18306739628314972, + "learning_rate": 1.793212392988178e-05, + "loss": 3.9191, + "step": 96670 + }, + { + "epoch": 6.568487566245413, + "grad_norm": 0.1688109040260315, + "learning_rate": 1.7927877428998507e-05, + "loss": 3.9467, + "step": 96675 + }, + { + "epoch": 6.5688272863160755, + "grad_norm": 0.6638104319572449, + "learning_rate": 1.7923630928115235e-05, + "loss": 3.7672, + "step": 96680 + }, + { + "epoch": 6.569167006386738, + "grad_norm": 0.21644841134548187, + "learning_rate": 1.791938442723196e-05, + "loss": 3.8519, + "step": 96685 + }, + { + "epoch": 6.569506726457399, + "grad_norm": 0.19499237835407257, + "learning_rate": 1.791513792634869e-05, + "loss": 3.6397, + "step": 96690 + }, + { + "epoch": 6.569846446528061, + "grad_norm": 0.20346611738204956, + "learning_rate": 1.791089142546542e-05, + "loss": 3.696, + "step": 96695 + }, + { + "epoch": 6.570186166598723, + "grad_norm": 0.1688411384820938, + "learning_rate": 1.7906644924582143e-05, + "loss": 3.7003, + "step": 96700 + }, + { + "epoch": 6.570525886669384, + "grad_norm": 0.2418552041053772, + "learning_rate": 1.7902398423698875e-05, + "loss": 3.4972, + "step": 96705 + }, + { + "epoch": 6.570865606740046, + "grad_norm": 0.1568688601255417, + "learning_rate": 1.78981519228156e-05, + "loss": 3.6843, + "step": 96710 + }, + { + "epoch": 6.571205326810708, + "grad_norm": 0.17011483013629913, + "learning_rate": 1.7893905421932327e-05, + "loss": 3.7351, + "step": 96715 + }, + { + "epoch": 6.571545046881369, + "grad_norm": 0.3453958332538605, + "learning_rate": 1.788965892104906e-05, + "loss": 3.8492, + "step": 96720 + }, + { + "epoch": 6.5718847669520315, + "grad_norm": 0.37151822447776794, + "learning_rate": 1.7885412420165783e-05, + "loss": 3.7609, + "step": 96725 + }, + { + "epoch": 6.572224487022694, + "grad_norm": 0.24275356531143188, + "learning_rate": 1.788116591928251e-05, + "loss": 3.7035, + "step": 96730 + }, + { + "epoch": 6.572564207093355, + "grad_norm": 0.20542843639850616, + "learning_rate": 1.787691941839924e-05, + "loss": 3.7616, + "step": 96735 + }, + { + "epoch": 6.572903927164017, + "grad_norm": 0.15554912388324738, + "learning_rate": 1.7872672917515967e-05, + "loss": 3.7413, + "step": 96740 + }, + { + "epoch": 6.573243647234679, + "grad_norm": 0.20183953642845154, + "learning_rate": 1.7868426416632695e-05, + "loss": 3.7438, + "step": 96745 + }, + { + "epoch": 6.57358336730534, + "grad_norm": 0.20525625348091125, + "learning_rate": 1.7864179915749423e-05, + "loss": 4.1778, + "step": 96750 + }, + { + "epoch": 6.573923087376002, + "grad_norm": 0.16244551539421082, + "learning_rate": 1.785993341486615e-05, + "loss": 3.7756, + "step": 96755 + }, + { + "epoch": 6.574262807446664, + "grad_norm": 0.2129857987165451, + "learning_rate": 1.785568691398288e-05, + "loss": 3.8417, + "step": 96760 + }, + { + "epoch": 6.574602527517325, + "grad_norm": 0.15805776417255402, + "learning_rate": 1.7851440413099608e-05, + "loss": 3.9802, + "step": 96765 + }, + { + "epoch": 6.5749422475879875, + "grad_norm": 0.1967563033103943, + "learning_rate": 1.7847193912216336e-05, + "loss": 3.865, + "step": 96770 + }, + { + "epoch": 6.57528196765865, + "grad_norm": 0.1750117540359497, + "learning_rate": 1.7842947411333064e-05, + "loss": 3.8295, + "step": 96775 + }, + { + "epoch": 6.575621687729311, + "grad_norm": 0.15995429456233978, + "learning_rate": 1.783870091044979e-05, + "loss": 3.8983, + "step": 96780 + }, + { + "epoch": 6.575961407799973, + "grad_norm": 0.1783713698387146, + "learning_rate": 1.7834454409566516e-05, + "loss": 3.7041, + "step": 96785 + }, + { + "epoch": 6.576301127870635, + "grad_norm": 0.173465758562088, + "learning_rate": 1.7830207908683248e-05, + "loss": 3.7705, + "step": 96790 + }, + { + "epoch": 6.576640847941296, + "grad_norm": 0.16965597867965698, + "learning_rate": 1.7825961407799972e-05, + "loss": 3.8833, + "step": 96795 + }, + { + "epoch": 6.576980568011958, + "grad_norm": 0.848264217376709, + "learning_rate": 1.78217149069167e-05, + "loss": 3.8491, + "step": 96800 + }, + { + "epoch": 6.57732028808262, + "grad_norm": 1.2913285493850708, + "learning_rate": 1.781746840603343e-05, + "loss": 4.0635, + "step": 96805 + }, + { + "epoch": 6.5776600081532814, + "grad_norm": 0.12865281105041504, + "learning_rate": 1.7813221905150156e-05, + "loss": 4.0842, + "step": 96810 + }, + { + "epoch": 6.5779997282239435, + "grad_norm": 0.18095411360263824, + "learning_rate": 1.7808975404266884e-05, + "loss": 3.9442, + "step": 96815 + }, + { + "epoch": 6.578339448294606, + "grad_norm": 0.17441198229789734, + "learning_rate": 1.7804728903383612e-05, + "loss": 3.8295, + "step": 96820 + }, + { + "epoch": 6.578679168365267, + "grad_norm": 0.3856109082698822, + "learning_rate": 1.780048240250034e-05, + "loss": 4.0019, + "step": 96825 + }, + { + "epoch": 6.579018888435929, + "grad_norm": 0.2132854163646698, + "learning_rate": 1.7796235901617068e-05, + "loss": 3.6204, + "step": 96830 + }, + { + "epoch": 6.579358608506591, + "grad_norm": 0.21207018196582794, + "learning_rate": 1.7791989400733796e-05, + "loss": 3.9036, + "step": 96835 + }, + { + "epoch": 6.579698328577252, + "grad_norm": 0.16274866461753845, + "learning_rate": 1.7787742899850524e-05, + "loss": 3.9774, + "step": 96840 + }, + { + "epoch": 6.580038048647914, + "grad_norm": 0.2154558002948761, + "learning_rate": 1.7783496398967252e-05, + "loss": 3.7022, + "step": 96845 + }, + { + "epoch": 6.580377768718576, + "grad_norm": 0.12903261184692383, + "learning_rate": 1.777924989808398e-05, + "loss": 3.8515, + "step": 96850 + }, + { + "epoch": 6.5807174887892375, + "grad_norm": 0.18518497049808502, + "learning_rate": 1.7775003397200708e-05, + "loss": 3.8146, + "step": 96855 + }, + { + "epoch": 6.5810572088598995, + "grad_norm": 0.1704203188419342, + "learning_rate": 1.7770756896317436e-05, + "loss": 4.0677, + "step": 96860 + }, + { + "epoch": 6.581396928930561, + "grad_norm": 0.175320103764534, + "learning_rate": 1.7766510395434164e-05, + "loss": 3.9385, + "step": 96865 + }, + { + "epoch": 6.581736649001223, + "grad_norm": 1.1256805658340454, + "learning_rate": 1.776226389455089e-05, + "loss": 3.6168, + "step": 96870 + }, + { + "epoch": 6.582076369071885, + "grad_norm": 0.3765079379081726, + "learning_rate": 1.775801739366762e-05, + "loss": 4.0503, + "step": 96875 + }, + { + "epoch": 6.582416089142546, + "grad_norm": 0.1496044546365738, + "learning_rate": 1.7753770892784345e-05, + "loss": 3.8373, + "step": 96880 + }, + { + "epoch": 6.582755809213208, + "grad_norm": 0.17852377891540527, + "learning_rate": 1.7749524391901073e-05, + "loss": 4.1203, + "step": 96885 + }, + { + "epoch": 6.58309552928387, + "grad_norm": 0.23746508359909058, + "learning_rate": 1.7745277891017804e-05, + "loss": 3.8378, + "step": 96890 + }, + { + "epoch": 6.583435249354531, + "grad_norm": 0.1849028617143631, + "learning_rate": 1.774103139013453e-05, + "loss": 3.8947, + "step": 96895 + }, + { + "epoch": 6.5837749694251935, + "grad_norm": 0.21006730198860168, + "learning_rate": 1.7736784889251257e-05, + "loss": 3.8102, + "step": 96900 + }, + { + "epoch": 6.5841146894958555, + "grad_norm": 0.13814093172550201, + "learning_rate": 1.7732538388367985e-05, + "loss": 3.9009, + "step": 96905 + }, + { + "epoch": 6.584454409566517, + "grad_norm": 1.6910053491592407, + "learning_rate": 1.7728291887484713e-05, + "loss": 3.8564, + "step": 96910 + }, + { + "epoch": 6.584794129637179, + "grad_norm": 1.3710941076278687, + "learning_rate": 1.772404538660144e-05, + "loss": 3.7496, + "step": 96915 + }, + { + "epoch": 6.585133849707841, + "grad_norm": 0.20973902940750122, + "learning_rate": 1.771979888571817e-05, + "loss": 3.9862, + "step": 96920 + }, + { + "epoch": 6.585473569778502, + "grad_norm": 0.1773037314414978, + "learning_rate": 1.7715552384834897e-05, + "loss": 3.7053, + "step": 96925 + }, + { + "epoch": 6.585813289849164, + "grad_norm": 0.170777827501297, + "learning_rate": 1.7711305883951625e-05, + "loss": 3.863, + "step": 96930 + }, + { + "epoch": 6.586153009919826, + "grad_norm": 0.17636772990226746, + "learning_rate": 1.7707059383068353e-05, + "loss": 3.6418, + "step": 96935 + }, + { + "epoch": 6.586492729990487, + "grad_norm": 0.17642001807689667, + "learning_rate": 1.770281288218508e-05, + "loss": 3.9367, + "step": 96940 + }, + { + "epoch": 6.5868324500611495, + "grad_norm": 0.17037492990493774, + "learning_rate": 1.769856638130181e-05, + "loss": 3.6938, + "step": 96945 + }, + { + "epoch": 6.5871721701318116, + "grad_norm": 0.1742451786994934, + "learning_rate": 1.7694319880418537e-05, + "loss": 3.6895, + "step": 96950 + }, + { + "epoch": 6.587511890202473, + "grad_norm": 0.20984330773353577, + "learning_rate": 1.769007337953526e-05, + "loss": 3.6468, + "step": 96955 + }, + { + "epoch": 6.587851610273135, + "grad_norm": 0.216226726770401, + "learning_rate": 1.7685826878651993e-05, + "loss": 4.0949, + "step": 96960 + }, + { + "epoch": 6.588191330343797, + "grad_norm": 0.1620084047317505, + "learning_rate": 1.768158037776872e-05, + "loss": 3.7438, + "step": 96965 + }, + { + "epoch": 6.588531050414458, + "grad_norm": 0.15805354714393616, + "learning_rate": 1.7677333876885446e-05, + "loss": 4.0846, + "step": 96970 + }, + { + "epoch": 6.58887077048512, + "grad_norm": 0.17543061077594757, + "learning_rate": 1.7673087376002177e-05, + "loss": 3.5297, + "step": 96975 + }, + { + "epoch": 6.589210490555782, + "grad_norm": 0.1977647840976715, + "learning_rate": 1.76688408751189e-05, + "loss": 3.7266, + "step": 96980 + }, + { + "epoch": 6.589550210626443, + "grad_norm": 0.1837480068206787, + "learning_rate": 1.766459437423563e-05, + "loss": 3.7956, + "step": 96985 + }, + { + "epoch": 6.5898899306971055, + "grad_norm": 0.6687168478965759, + "learning_rate": 1.7660347873352358e-05, + "loss": 3.8937, + "step": 96990 + }, + { + "epoch": 6.590229650767768, + "grad_norm": 0.1713707000017166, + "learning_rate": 1.7656101372469086e-05, + "loss": 3.9379, + "step": 96995 + }, + { + "epoch": 6.590569370838429, + "grad_norm": 1.2046457529067993, + "learning_rate": 1.7651854871585814e-05, + "loss": 3.9116, + "step": 97000 + }, + { + "epoch": 6.590909090909091, + "grad_norm": 0.16251863539218903, + "learning_rate": 1.7647608370702542e-05, + "loss": 3.8983, + "step": 97005 + }, + { + "epoch": 6.591248810979753, + "grad_norm": 0.14763599634170532, + "learning_rate": 1.764336186981927e-05, + "loss": 3.6737, + "step": 97010 + }, + { + "epoch": 6.591588531050414, + "grad_norm": 0.17866063117980957, + "learning_rate": 1.7639115368935998e-05, + "loss": 3.8152, + "step": 97015 + }, + { + "epoch": 6.591928251121076, + "grad_norm": 0.1697966605424881, + "learning_rate": 1.7634868868052726e-05, + "loss": 3.7054, + "step": 97020 + }, + { + "epoch": 6.592267971191738, + "grad_norm": 0.20378080010414124, + "learning_rate": 1.7630622367169454e-05, + "loss": 3.75, + "step": 97025 + }, + { + "epoch": 6.592607691262399, + "grad_norm": 0.23516753315925598, + "learning_rate": 1.7626375866286182e-05, + "loss": 3.8221, + "step": 97030 + }, + { + "epoch": 6.5929474113330615, + "grad_norm": 0.20523181557655334, + "learning_rate": 1.762212936540291e-05, + "loss": 3.6441, + "step": 97035 + }, + { + "epoch": 6.593287131403724, + "grad_norm": 0.18792536854743958, + "learning_rate": 1.7617882864519634e-05, + "loss": 3.6663, + "step": 97040 + }, + { + "epoch": 6.593626851474385, + "grad_norm": 0.1959187239408493, + "learning_rate": 1.7613636363636366e-05, + "loss": 3.7711, + "step": 97045 + }, + { + "epoch": 6.593966571545047, + "grad_norm": 0.21029970049858093, + "learning_rate": 1.7609389862753094e-05, + "loss": 3.7633, + "step": 97050 + }, + { + "epoch": 6.594306291615709, + "grad_norm": 0.20345981419086456, + "learning_rate": 1.760514336186982e-05, + "loss": 3.8897, + "step": 97055 + }, + { + "epoch": 6.59464601168637, + "grad_norm": 0.29355236887931824, + "learning_rate": 1.760089686098655e-05, + "loss": 4.0076, + "step": 97060 + }, + { + "epoch": 6.594985731757032, + "grad_norm": 0.1746012568473816, + "learning_rate": 1.7596650360103274e-05, + "loss": 3.7405, + "step": 97065 + }, + { + "epoch": 6.595325451827694, + "grad_norm": 0.4867667853832245, + "learning_rate": 1.7592403859220002e-05, + "loss": 3.7663, + "step": 97070 + }, + { + "epoch": 6.595665171898355, + "grad_norm": 0.23598122596740723, + "learning_rate": 1.758815735833673e-05, + "loss": 3.9834, + "step": 97075 + }, + { + "epoch": 6.5960048919690175, + "grad_norm": 0.20043690502643585, + "learning_rate": 1.758391085745346e-05, + "loss": 3.6489, + "step": 97080 + }, + { + "epoch": 6.59634461203968, + "grad_norm": 0.1834181249141693, + "learning_rate": 1.7579664356570186e-05, + "loss": 3.7985, + "step": 97085 + }, + { + "epoch": 6.596684332110341, + "grad_norm": 0.13951896131038666, + "learning_rate": 1.7575417855686914e-05, + "loss": 3.9349, + "step": 97090 + }, + { + "epoch": 6.597024052181003, + "grad_norm": 0.17428509891033173, + "learning_rate": 1.7571171354803642e-05, + "loss": 3.8713, + "step": 97095 + }, + { + "epoch": 6.597363772251665, + "grad_norm": 0.19422730803489685, + "learning_rate": 1.756692485392037e-05, + "loss": 3.767, + "step": 97100 + }, + { + "epoch": 6.597703492322326, + "grad_norm": 0.22323769330978394, + "learning_rate": 1.75626783530371e-05, + "loss": 3.8199, + "step": 97105 + }, + { + "epoch": 6.598043212392988, + "grad_norm": 0.17323540151119232, + "learning_rate": 1.7558431852153827e-05, + "loss": 4.0079, + "step": 97110 + }, + { + "epoch": 6.59838293246365, + "grad_norm": 0.2376486212015152, + "learning_rate": 1.7554185351270555e-05, + "loss": 3.8568, + "step": 97115 + }, + { + "epoch": 6.5987226525343115, + "grad_norm": 0.1751522421836853, + "learning_rate": 1.7549938850387283e-05, + "loss": 3.7824, + "step": 97120 + }, + { + "epoch": 6.5990623726049735, + "grad_norm": 0.162657231092453, + "learning_rate": 1.7545692349504007e-05, + "loss": 3.7411, + "step": 97125 + }, + { + "epoch": 6.599402092675636, + "grad_norm": 0.16614018380641937, + "learning_rate": 1.754144584862074e-05, + "loss": 3.8024, + "step": 97130 + }, + { + "epoch": 6.599741812746297, + "grad_norm": 0.13967856764793396, + "learning_rate": 1.7537199347737467e-05, + "loss": 3.6916, + "step": 97135 + }, + { + "epoch": 6.600081532816959, + "grad_norm": 0.18511256575584412, + "learning_rate": 1.753295284685419e-05, + "loss": 3.8205, + "step": 97140 + }, + { + "epoch": 6.600421252887621, + "grad_norm": 0.1982847899198532, + "learning_rate": 1.7528706345970923e-05, + "loss": 3.8522, + "step": 97145 + }, + { + "epoch": 6.600760972958282, + "grad_norm": 0.19399426877498627, + "learning_rate": 1.7524459845087647e-05, + "loss": 3.8045, + "step": 97150 + }, + { + "epoch": 6.601100693028944, + "grad_norm": 0.13964912295341492, + "learning_rate": 1.7520213344204375e-05, + "loss": 3.7726, + "step": 97155 + }, + { + "epoch": 6.601440413099606, + "grad_norm": 0.1555028259754181, + "learning_rate": 1.7515966843321107e-05, + "loss": 4.1287, + "step": 97160 + }, + { + "epoch": 6.6017801331702675, + "grad_norm": 0.18295077979564667, + "learning_rate": 1.751172034243783e-05, + "loss": 4.0435, + "step": 97165 + }, + { + "epoch": 6.6021198532409295, + "grad_norm": 0.19984567165374756, + "learning_rate": 1.750747384155456e-05, + "loss": 3.7919, + "step": 97170 + }, + { + "epoch": 6.602459573311592, + "grad_norm": 0.1903802901506424, + "learning_rate": 1.7503227340671287e-05, + "loss": 3.722, + "step": 97175 + }, + { + "epoch": 6.602799293382253, + "grad_norm": 0.1873559206724167, + "learning_rate": 1.7498980839788015e-05, + "loss": 3.8567, + "step": 97180 + }, + { + "epoch": 6.603139013452915, + "grad_norm": 0.14595159888267517, + "learning_rate": 1.7494734338904743e-05, + "loss": 4.0636, + "step": 97185 + }, + { + "epoch": 6.603478733523577, + "grad_norm": 0.14051498472690582, + "learning_rate": 1.749048783802147e-05, + "loss": 3.8744, + "step": 97190 + }, + { + "epoch": 6.603818453594238, + "grad_norm": 0.17537330090999603, + "learning_rate": 1.74862413371382e-05, + "loss": 3.9544, + "step": 97195 + }, + { + "epoch": 6.6041581736649, + "grad_norm": 0.2062283307313919, + "learning_rate": 1.7481994836254927e-05, + "loss": 4.034, + "step": 97200 + }, + { + "epoch": 6.604497893735562, + "grad_norm": 0.1648673266172409, + "learning_rate": 1.7477748335371655e-05, + "loss": 3.7524, + "step": 97205 + }, + { + "epoch": 6.6048376138062235, + "grad_norm": 0.1940527856349945, + "learning_rate": 1.7473501834488383e-05, + "loss": 3.5734, + "step": 97210 + }, + { + "epoch": 6.6051773338768855, + "grad_norm": 0.17060501873493195, + "learning_rate": 1.746925533360511e-05, + "loss": 3.8864, + "step": 97215 + }, + { + "epoch": 6.605517053947548, + "grad_norm": 0.25032153725624084, + "learning_rate": 1.746500883272184e-05, + "loss": 3.96, + "step": 97220 + }, + { + "epoch": 6.605856774018209, + "grad_norm": 0.17735445499420166, + "learning_rate": 1.7460762331838564e-05, + "loss": 3.7328, + "step": 97225 + }, + { + "epoch": 6.606196494088871, + "grad_norm": 0.18215537071228027, + "learning_rate": 1.7456515830955295e-05, + "loss": 3.8031, + "step": 97230 + }, + { + "epoch": 6.606536214159533, + "grad_norm": 0.1378353387117386, + "learning_rate": 1.745226933007202e-05, + "loss": 3.8679, + "step": 97235 + }, + { + "epoch": 6.606875934230194, + "grad_norm": 0.15663553774356842, + "learning_rate": 1.7448022829188748e-05, + "loss": 3.9316, + "step": 97240 + }, + { + "epoch": 6.607215654300856, + "grad_norm": 0.1665482521057129, + "learning_rate": 1.744377632830548e-05, + "loss": 3.7743, + "step": 97245 + }, + { + "epoch": 6.607555374371518, + "grad_norm": 0.14767953753471375, + "learning_rate": 1.7439529827422204e-05, + "loss": 3.6449, + "step": 97250 + }, + { + "epoch": 6.6078950944421795, + "grad_norm": 0.16111941635608673, + "learning_rate": 1.7435283326538932e-05, + "loss": 3.5653, + "step": 97255 + }, + { + "epoch": 6.608234814512842, + "grad_norm": 0.15866750478744507, + "learning_rate": 1.743103682565566e-05, + "loss": 3.9238, + "step": 97260 + }, + { + "epoch": 6.608574534583504, + "grad_norm": 0.13227681815624237, + "learning_rate": 1.7426790324772388e-05, + "loss": 3.7898, + "step": 97265 + }, + { + "epoch": 6.608914254654165, + "grad_norm": 0.1895110309123993, + "learning_rate": 1.7422543823889116e-05, + "loss": 3.6031, + "step": 97270 + }, + { + "epoch": 6.609253974724827, + "grad_norm": 0.20326471328735352, + "learning_rate": 1.7418297323005844e-05, + "loss": 4.0508, + "step": 97275 + }, + { + "epoch": 6.609593694795489, + "grad_norm": 0.49812594056129456, + "learning_rate": 1.7414050822122572e-05, + "loss": 4.0262, + "step": 97280 + }, + { + "epoch": 6.60993341486615, + "grad_norm": 0.35791993141174316, + "learning_rate": 1.74098043212393e-05, + "loss": 4.0661, + "step": 97285 + }, + { + "epoch": 6.610273134936812, + "grad_norm": 0.16448646783828735, + "learning_rate": 1.7405557820356028e-05, + "loss": 3.7571, + "step": 97290 + }, + { + "epoch": 6.610612855007474, + "grad_norm": 0.15996277332305908, + "learning_rate": 1.7401311319472756e-05, + "loss": 3.7116, + "step": 97295 + }, + { + "epoch": 6.6109525750781355, + "grad_norm": 0.16339412331581116, + "learning_rate": 1.7397064818589484e-05, + "loss": 4.038, + "step": 97300 + }, + { + "epoch": 6.611292295148798, + "grad_norm": 0.19682057201862335, + "learning_rate": 1.7392818317706212e-05, + "loss": 3.6029, + "step": 97305 + }, + { + "epoch": 6.61163201521946, + "grad_norm": 0.1920398324728012, + "learning_rate": 1.7388571816822937e-05, + "loss": 3.7194, + "step": 97310 + }, + { + "epoch": 6.611971735290121, + "grad_norm": 0.43165382742881775, + "learning_rate": 1.7384325315939668e-05, + "loss": 3.9456, + "step": 97315 + }, + { + "epoch": 6.612311455360783, + "grad_norm": 0.21755550801753998, + "learning_rate": 1.7380078815056393e-05, + "loss": 3.9045, + "step": 97320 + }, + { + "epoch": 6.612651175431445, + "grad_norm": 0.15839506685733795, + "learning_rate": 1.737583231417312e-05, + "loss": 3.8969, + "step": 97325 + }, + { + "epoch": 6.612990895502106, + "grad_norm": 0.16762372851371765, + "learning_rate": 1.7371585813289852e-05, + "loss": 3.7144, + "step": 97330 + }, + { + "epoch": 6.613330615572768, + "grad_norm": 0.16222813725471497, + "learning_rate": 1.7367339312406577e-05, + "loss": 3.7111, + "step": 97335 + }, + { + "epoch": 6.613670335643429, + "grad_norm": 0.18657591938972473, + "learning_rate": 1.7363092811523305e-05, + "loss": 3.8521, + "step": 97340 + }, + { + "epoch": 6.6140100557140915, + "grad_norm": 0.39917096495628357, + "learning_rate": 1.7358846310640033e-05, + "loss": 3.9244, + "step": 97345 + }, + { + "epoch": 6.614349775784754, + "grad_norm": 0.3736250698566437, + "learning_rate": 1.735459980975676e-05, + "loss": 3.6826, + "step": 97350 + }, + { + "epoch": 6.614689495855415, + "grad_norm": 0.22206293046474457, + "learning_rate": 1.7350353308873492e-05, + "loss": 3.5965, + "step": 97355 + }, + { + "epoch": 6.615029215926077, + "grad_norm": 0.2048659473657608, + "learning_rate": 1.7346106807990217e-05, + "loss": 3.4977, + "step": 97360 + }, + { + "epoch": 6.615368935996739, + "grad_norm": 0.18768222630023956, + "learning_rate": 1.7341860307106945e-05, + "loss": 3.8125, + "step": 97365 + }, + { + "epoch": 6.6157086560674, + "grad_norm": 0.31805920600891113, + "learning_rate": 1.7337613806223673e-05, + "loss": 3.897, + "step": 97370 + }, + { + "epoch": 6.616048376138062, + "grad_norm": 0.23006649315357208, + "learning_rate": 1.73333673053404e-05, + "loss": 3.5301, + "step": 97375 + }, + { + "epoch": 6.616388096208724, + "grad_norm": 0.16018559038639069, + "learning_rate": 1.732912080445713e-05, + "loss": 3.8073, + "step": 97380 + }, + { + "epoch": 6.616727816279385, + "grad_norm": 0.1881377249956131, + "learning_rate": 1.7324874303573857e-05, + "loss": 3.9672, + "step": 97385 + }, + { + "epoch": 6.6170675363500475, + "grad_norm": 0.20850728452205658, + "learning_rate": 1.7320627802690585e-05, + "loss": 3.522, + "step": 97390 + }, + { + "epoch": 6.61740725642071, + "grad_norm": 0.2923467457294464, + "learning_rate": 1.731638130180731e-05, + "loss": 3.7086, + "step": 97395 + }, + { + "epoch": 6.617746976491371, + "grad_norm": 0.17369860410690308, + "learning_rate": 1.731213480092404e-05, + "loss": 3.8521, + "step": 97400 + }, + { + "epoch": 6.618086696562033, + "grad_norm": 0.15984612703323364, + "learning_rate": 1.730788830004077e-05, + "loss": 3.991, + "step": 97405 + }, + { + "epoch": 6.618426416632695, + "grad_norm": 0.19181999564170837, + "learning_rate": 1.7303641799157493e-05, + "loss": 4.0878, + "step": 97410 + }, + { + "epoch": 6.618766136703356, + "grad_norm": 0.165174663066864, + "learning_rate": 1.7299395298274225e-05, + "loss": 3.6818, + "step": 97415 + }, + { + "epoch": 6.619105856774018, + "grad_norm": 0.20356659591197968, + "learning_rate": 1.729514879739095e-05, + "loss": 3.7412, + "step": 97420 + }, + { + "epoch": 6.61944557684468, + "grad_norm": 0.17369699478149414, + "learning_rate": 1.7290902296507677e-05, + "loss": 3.9597, + "step": 97425 + }, + { + "epoch": 6.6197852969153415, + "grad_norm": 0.20833931863307953, + "learning_rate": 1.7286655795624405e-05, + "loss": 3.8489, + "step": 97430 + }, + { + "epoch": 6.6201250169860035, + "grad_norm": 0.15766564011573792, + "learning_rate": 1.7282409294741133e-05, + "loss": 3.5354, + "step": 97435 + }, + { + "epoch": 6.620464737056666, + "grad_norm": 0.2013990432024002, + "learning_rate": 1.7278162793857865e-05, + "loss": 3.8517, + "step": 97440 + }, + { + "epoch": 6.620804457127327, + "grad_norm": 0.15361113846302032, + "learning_rate": 1.727391629297459e-05, + "loss": 3.5583, + "step": 97445 + }, + { + "epoch": 6.621144177197989, + "grad_norm": 0.19320739805698395, + "learning_rate": 1.7269669792091317e-05, + "loss": 4.077, + "step": 97450 + }, + { + "epoch": 6.621483897268651, + "grad_norm": 0.22444918751716614, + "learning_rate": 1.7265423291208045e-05, + "loss": 3.8962, + "step": 97455 + }, + { + "epoch": 6.621823617339312, + "grad_norm": 0.1980927735567093, + "learning_rate": 1.7261176790324773e-05, + "loss": 4.0385, + "step": 97460 + }, + { + "epoch": 6.622163337409974, + "grad_norm": 0.17566989362239838, + "learning_rate": 1.72569302894415e-05, + "loss": 3.659, + "step": 97465 + }, + { + "epoch": 6.622503057480636, + "grad_norm": 0.16323181986808777, + "learning_rate": 1.725268378855823e-05, + "loss": 3.8563, + "step": 97470 + }, + { + "epoch": 6.6228427775512975, + "grad_norm": 0.6830105185508728, + "learning_rate": 1.7248437287674958e-05, + "loss": 3.8536, + "step": 97475 + }, + { + "epoch": 6.6231824976219595, + "grad_norm": 0.11072591692209244, + "learning_rate": 1.7244190786791682e-05, + "loss": 3.7107, + "step": 97480 + }, + { + "epoch": 6.623522217692622, + "grad_norm": 0.1418125033378601, + "learning_rate": 1.7239944285908414e-05, + "loss": 4.1258, + "step": 97485 + }, + { + "epoch": 6.623861937763283, + "grad_norm": 1.0678057670593262, + "learning_rate": 1.723569778502514e-05, + "loss": 3.9027, + "step": 97490 + }, + { + "epoch": 6.624201657833945, + "grad_norm": 0.2342217117547989, + "learning_rate": 1.7231451284141866e-05, + "loss": 3.8648, + "step": 97495 + }, + { + "epoch": 6.624541377904607, + "grad_norm": 0.21167419850826263, + "learning_rate": 1.7227204783258598e-05, + "loss": 3.6705, + "step": 97500 + }, + { + "epoch": 6.624881097975268, + "grad_norm": 0.2551633417606354, + "learning_rate": 1.7222958282375322e-05, + "loss": 4.044, + "step": 97505 + }, + { + "epoch": 6.62522081804593, + "grad_norm": 0.6568472981452942, + "learning_rate": 1.721871178149205e-05, + "loss": 3.862, + "step": 97510 + }, + { + "epoch": 6.625560538116592, + "grad_norm": 0.18729111552238464, + "learning_rate": 1.7214465280608778e-05, + "loss": 3.9424, + "step": 97515 + }, + { + "epoch": 6.6259002581872535, + "grad_norm": 0.20793190598487854, + "learning_rate": 1.7210218779725506e-05, + "loss": 4.1286, + "step": 97520 + }, + { + "epoch": 6.6262399782579156, + "grad_norm": 0.20649176836013794, + "learning_rate": 1.7205972278842238e-05, + "loss": 3.5247, + "step": 97525 + }, + { + "epoch": 6.626579698328578, + "grad_norm": 0.19791588187217712, + "learning_rate": 1.7201725777958962e-05, + "loss": 3.9823, + "step": 97530 + }, + { + "epoch": 6.626919418399239, + "grad_norm": 0.18290741741657257, + "learning_rate": 1.719747927707569e-05, + "loss": 3.9327, + "step": 97535 + }, + { + "epoch": 6.627259138469901, + "grad_norm": 0.25536882877349854, + "learning_rate": 1.7193232776192418e-05, + "loss": 3.8953, + "step": 97540 + }, + { + "epoch": 6.627598858540563, + "grad_norm": 0.16846534609794617, + "learning_rate": 1.7188986275309146e-05, + "loss": 3.7231, + "step": 97545 + }, + { + "epoch": 6.627938578611224, + "grad_norm": 0.18040680885314941, + "learning_rate": 1.7184739774425874e-05, + "loss": 3.7741, + "step": 97550 + }, + { + "epoch": 6.628278298681886, + "grad_norm": 0.27331334352493286, + "learning_rate": 1.7180493273542602e-05, + "loss": 3.9021, + "step": 97555 + }, + { + "epoch": 6.628618018752547, + "grad_norm": 0.15491890907287598, + "learning_rate": 1.717624677265933e-05, + "loss": 3.7616, + "step": 97560 + }, + { + "epoch": 6.6289577388232095, + "grad_norm": 0.2164619415998459, + "learning_rate": 1.7172000271776055e-05, + "loss": 3.7605, + "step": 97565 + }, + { + "epoch": 6.629297458893872, + "grad_norm": 0.6357191801071167, + "learning_rate": 1.7167753770892786e-05, + "loss": 3.7025, + "step": 97570 + }, + { + "epoch": 6.629637178964533, + "grad_norm": 0.18849031627178192, + "learning_rate": 1.7163507270009514e-05, + "loss": 3.9178, + "step": 97575 + }, + { + "epoch": 6.629976899035195, + "grad_norm": 0.20572225749492645, + "learning_rate": 1.715926076912624e-05, + "loss": 3.8546, + "step": 97580 + }, + { + "epoch": 6.630316619105857, + "grad_norm": 0.19543586671352386, + "learning_rate": 1.715501426824297e-05, + "loss": 3.5979, + "step": 97585 + }, + { + "epoch": 6.630656339176518, + "grad_norm": 0.18541297316551208, + "learning_rate": 1.7150767767359695e-05, + "loss": 3.7955, + "step": 97590 + }, + { + "epoch": 6.63099605924718, + "grad_norm": 0.14897888898849487, + "learning_rate": 1.7146521266476423e-05, + "loss": 3.8378, + "step": 97595 + }, + { + "epoch": 6.631335779317842, + "grad_norm": 0.1847577840089798, + "learning_rate": 1.714227476559315e-05, + "loss": 3.8041, + "step": 97600 + }, + { + "epoch": 6.631675499388503, + "grad_norm": 0.17902661859989166, + "learning_rate": 1.713802826470988e-05, + "loss": 4.0983, + "step": 97605 + }, + { + "epoch": 6.6320152194591655, + "grad_norm": 0.24407774209976196, + "learning_rate": 1.713378176382661e-05, + "loss": 4.0031, + "step": 97610 + }, + { + "epoch": 6.632354939529828, + "grad_norm": 0.1620427817106247, + "learning_rate": 1.7129535262943335e-05, + "loss": 3.7806, + "step": 97615 + }, + { + "epoch": 6.632694659600489, + "grad_norm": 0.24517247080802917, + "learning_rate": 1.7125288762060063e-05, + "loss": 3.6986, + "step": 97620 + }, + { + "epoch": 6.633034379671151, + "grad_norm": 0.2130793035030365, + "learning_rate": 1.712104226117679e-05, + "loss": 3.8534, + "step": 97625 + }, + { + "epoch": 6.633374099741813, + "grad_norm": 0.14351189136505127, + "learning_rate": 1.711679576029352e-05, + "loss": 3.6998, + "step": 97630 + }, + { + "epoch": 6.633713819812474, + "grad_norm": 0.1463000476360321, + "learning_rate": 1.7112549259410247e-05, + "loss": 3.7156, + "step": 97635 + }, + { + "epoch": 6.634053539883136, + "grad_norm": 0.20852304995059967, + "learning_rate": 1.7108302758526975e-05, + "loss": 3.8822, + "step": 97640 + }, + { + "epoch": 6.634393259953798, + "grad_norm": 0.13566233217716217, + "learning_rate": 1.7104056257643703e-05, + "loss": 4.0404, + "step": 97645 + }, + { + "epoch": 6.634732980024459, + "grad_norm": 0.19342859089374542, + "learning_rate": 1.7099809756760428e-05, + "loss": 3.934, + "step": 97650 + }, + { + "epoch": 6.6350727000951215, + "grad_norm": 0.44151952862739563, + "learning_rate": 1.709556325587716e-05, + "loss": 3.9244, + "step": 97655 + }, + { + "epoch": 6.635412420165784, + "grad_norm": 0.15374526381492615, + "learning_rate": 1.7091316754993887e-05, + "loss": 4.0212, + "step": 97660 + }, + { + "epoch": 6.635752140236445, + "grad_norm": 0.14995573461055756, + "learning_rate": 1.708707025411061e-05, + "loss": 3.7198, + "step": 97665 + }, + { + "epoch": 6.636091860307107, + "grad_norm": 0.19548943638801575, + "learning_rate": 1.7082823753227343e-05, + "loss": 3.9578, + "step": 97670 + }, + { + "epoch": 6.636431580377769, + "grad_norm": 0.18515579402446747, + "learning_rate": 1.7078577252344068e-05, + "loss": 4.1089, + "step": 97675 + }, + { + "epoch": 6.63677130044843, + "grad_norm": 0.14090080559253693, + "learning_rate": 1.7074330751460796e-05, + "loss": 3.7602, + "step": 97680 + }, + { + "epoch": 6.637111020519092, + "grad_norm": 0.14392350614070892, + "learning_rate": 1.7070084250577527e-05, + "loss": 3.5787, + "step": 97685 + }, + { + "epoch": 6.637450740589754, + "grad_norm": 0.17690740525722504, + "learning_rate": 1.706583774969425e-05, + "loss": 3.9845, + "step": 97690 + }, + { + "epoch": 6.6377904606604154, + "grad_norm": 0.1691780686378479, + "learning_rate": 1.7061591248810983e-05, + "loss": 4.0121, + "step": 97695 + }, + { + "epoch": 6.6381301807310775, + "grad_norm": 0.18805325031280518, + "learning_rate": 1.7057344747927708e-05, + "loss": 3.7303, + "step": 97700 + }, + { + "epoch": 6.63846990080174, + "grad_norm": 0.1585899144411087, + "learning_rate": 1.7053098247044436e-05, + "loss": 3.7389, + "step": 97705 + }, + { + "epoch": 6.638809620872401, + "grad_norm": 0.14704574644565582, + "learning_rate": 1.7048851746161164e-05, + "loss": 3.6712, + "step": 97710 + }, + { + "epoch": 6.639149340943063, + "grad_norm": 0.14275768399238586, + "learning_rate": 1.7044605245277892e-05, + "loss": 3.9266, + "step": 97715 + }, + { + "epoch": 6.639489061013725, + "grad_norm": 0.15118442475795746, + "learning_rate": 1.704035874439462e-05, + "loss": 3.9114, + "step": 97720 + }, + { + "epoch": 6.639828781084386, + "grad_norm": 0.1642536222934723, + "learning_rate": 1.7036112243511348e-05, + "loss": 3.574, + "step": 97725 + }, + { + "epoch": 6.640168501155048, + "grad_norm": 0.15129393339157104, + "learning_rate": 1.7031865742628076e-05, + "loss": 3.9541, + "step": 97730 + }, + { + "epoch": 6.64050822122571, + "grad_norm": 0.24814210832118988, + "learning_rate": 1.7027619241744804e-05, + "loss": 3.8436, + "step": 97735 + }, + { + "epoch": 6.6408479412963715, + "grad_norm": 0.2365351766347885, + "learning_rate": 1.7023372740861532e-05, + "loss": 3.8583, + "step": 97740 + }, + { + "epoch": 6.6411876613670335, + "grad_norm": 0.16146203875541687, + "learning_rate": 1.701912623997826e-05, + "loss": 3.9078, + "step": 97745 + }, + { + "epoch": 6.641527381437696, + "grad_norm": 0.21953164041042328, + "learning_rate": 1.7014879739094984e-05, + "loss": 3.9244, + "step": 97750 + }, + { + "epoch": 6.641867101508357, + "grad_norm": 0.18930694460868835, + "learning_rate": 1.7010633238211716e-05, + "loss": 3.7048, + "step": 97755 + }, + { + "epoch": 6.642206821579019, + "grad_norm": 0.21554237604141235, + "learning_rate": 1.700638673732844e-05, + "loss": 3.8629, + "step": 97760 + }, + { + "epoch": 6.642546541649681, + "grad_norm": 0.2113679200410843, + "learning_rate": 1.700214023644517e-05, + "loss": 3.9385, + "step": 97765 + }, + { + "epoch": 6.642886261720342, + "grad_norm": 0.17852193117141724, + "learning_rate": 1.69978937355619e-05, + "loss": 3.7415, + "step": 97770 + }, + { + "epoch": 6.643225981791004, + "grad_norm": 0.17534983158111572, + "learning_rate": 1.6993647234678624e-05, + "loss": 3.6893, + "step": 97775 + }, + { + "epoch": 6.643565701861666, + "grad_norm": 0.1896272897720337, + "learning_rate": 1.6989400733795356e-05, + "loss": 3.6278, + "step": 97780 + }, + { + "epoch": 6.6439054219323275, + "grad_norm": 5.100830554962158, + "learning_rate": 1.698515423291208e-05, + "loss": 4.0784, + "step": 97785 + }, + { + "epoch": 6.6442451420029895, + "grad_norm": 0.23590253293514252, + "learning_rate": 1.698090773202881e-05, + "loss": 3.8354, + "step": 97790 + }, + { + "epoch": 6.644584862073652, + "grad_norm": 0.16447857022285461, + "learning_rate": 1.6976661231145536e-05, + "loss": 3.7456, + "step": 97795 + }, + { + "epoch": 6.644924582144313, + "grad_norm": 0.17567718029022217, + "learning_rate": 1.6972414730262264e-05, + "loss": 3.7109, + "step": 97800 + }, + { + "epoch": 6.645264302214975, + "grad_norm": 0.229747474193573, + "learning_rate": 1.6968168229378992e-05, + "loss": 3.8169, + "step": 97805 + }, + { + "epoch": 6.645604022285637, + "grad_norm": 0.23789457976818085, + "learning_rate": 1.696392172849572e-05, + "loss": 3.6775, + "step": 97810 + }, + { + "epoch": 6.645943742356298, + "grad_norm": 0.17319083213806152, + "learning_rate": 1.695967522761245e-05, + "loss": 3.7608, + "step": 97815 + }, + { + "epoch": 6.64628346242696, + "grad_norm": 0.15928010642528534, + "learning_rate": 1.6955428726729176e-05, + "loss": 3.689, + "step": 97820 + }, + { + "epoch": 6.646623182497622, + "grad_norm": 0.20257951319217682, + "learning_rate": 1.6951182225845905e-05, + "loss": 3.8864, + "step": 97825 + }, + { + "epoch": 6.6469629025682835, + "grad_norm": 0.20065456628799438, + "learning_rate": 1.6946935724962633e-05, + "loss": 3.8585, + "step": 97830 + }, + { + "epoch": 6.647302622638946, + "grad_norm": 0.18552932143211365, + "learning_rate": 1.6942689224079357e-05, + "loss": 3.9008, + "step": 97835 + }, + { + "epoch": 6.647642342709608, + "grad_norm": 0.1457749605178833, + "learning_rate": 1.693844272319609e-05, + "loss": 4.0215, + "step": 97840 + }, + { + "epoch": 6.647982062780269, + "grad_norm": 0.2033795416355133, + "learning_rate": 1.6934196222312813e-05, + "loss": 3.8398, + "step": 97845 + }, + { + "epoch": 6.648321782850931, + "grad_norm": 0.19135594367980957, + "learning_rate": 1.692994972142954e-05, + "loss": 3.8339, + "step": 97850 + }, + { + "epoch": 6.648661502921593, + "grad_norm": 0.19987422227859497, + "learning_rate": 1.6925703220546273e-05, + "loss": 3.8432, + "step": 97855 + }, + { + "epoch": 6.649001222992254, + "grad_norm": 0.18778854608535767, + "learning_rate": 1.6921456719662997e-05, + "loss": 3.898, + "step": 97860 + }, + { + "epoch": 6.649340943062916, + "grad_norm": 0.21111935377120972, + "learning_rate": 1.691721021877973e-05, + "loss": 4.0328, + "step": 97865 + }, + { + "epoch": 6.649680663133578, + "grad_norm": 0.16396117210388184, + "learning_rate": 1.6912963717896453e-05, + "loss": 3.6089, + "step": 97870 + }, + { + "epoch": 6.6500203832042395, + "grad_norm": 0.16041991114616394, + "learning_rate": 1.690871721701318e-05, + "loss": 3.7266, + "step": 97875 + }, + { + "epoch": 6.650360103274902, + "grad_norm": 0.18684260547161102, + "learning_rate": 1.6904470716129913e-05, + "loss": 3.7376, + "step": 97880 + }, + { + "epoch": 6.650699823345564, + "grad_norm": 0.16788507997989655, + "learning_rate": 1.6900224215246637e-05, + "loss": 3.9468, + "step": 97885 + }, + { + "epoch": 6.651039543416225, + "grad_norm": 0.16576717793941498, + "learning_rate": 1.6895977714363365e-05, + "loss": 4.0861, + "step": 97890 + }, + { + "epoch": 6.651379263486887, + "grad_norm": 0.1717865914106369, + "learning_rate": 1.6891731213480093e-05, + "loss": 3.9044, + "step": 97895 + }, + { + "epoch": 6.651718983557549, + "grad_norm": 0.26112881302833557, + "learning_rate": 1.688748471259682e-05, + "loss": 3.6713, + "step": 97900 + }, + { + "epoch": 6.65205870362821, + "grad_norm": 0.2097044736146927, + "learning_rate": 1.688323821171355e-05, + "loss": 3.8364, + "step": 97905 + }, + { + "epoch": 6.652398423698872, + "grad_norm": 0.155910924077034, + "learning_rate": 1.6878991710830277e-05, + "loss": 3.9029, + "step": 97910 + }, + { + "epoch": 6.652738143769534, + "grad_norm": 0.167001873254776, + "learning_rate": 1.6874745209947005e-05, + "loss": 3.683, + "step": 97915 + }, + { + "epoch": 6.6530778638401955, + "grad_norm": 14.121210098266602, + "learning_rate": 1.687049870906373e-05, + "loss": 3.815, + "step": 97920 + }, + { + "epoch": 6.653417583910858, + "grad_norm": 0.20107313990592957, + "learning_rate": 1.686625220818046e-05, + "loss": 3.879, + "step": 97925 + }, + { + "epoch": 6.65375730398152, + "grad_norm": 0.14842045307159424, + "learning_rate": 1.686200570729719e-05, + "loss": 3.6826, + "step": 97930 + }, + { + "epoch": 6.654097024052181, + "grad_norm": 0.168861523270607, + "learning_rate": 1.6857759206413914e-05, + "loss": 3.7802, + "step": 97935 + }, + { + "epoch": 6.654436744122843, + "grad_norm": 0.1748935729265213, + "learning_rate": 1.6853512705530645e-05, + "loss": 3.7843, + "step": 97940 + }, + { + "epoch": 6.654776464193505, + "grad_norm": 0.2488655000925064, + "learning_rate": 1.684926620464737e-05, + "loss": 3.8142, + "step": 97945 + }, + { + "epoch": 6.655116184264166, + "grad_norm": 0.18403126299381256, + "learning_rate": 1.68450197037641e-05, + "loss": 3.961, + "step": 97950 + }, + { + "epoch": 6.655455904334828, + "grad_norm": 0.16995422542095184, + "learning_rate": 1.6840773202880826e-05, + "loss": 3.9466, + "step": 97955 + }, + { + "epoch": 6.65579562440549, + "grad_norm": 0.1613229513168335, + "learning_rate": 1.6836526701997554e-05, + "loss": 3.4817, + "step": 97960 + }, + { + "epoch": 6.6561353444761515, + "grad_norm": 0.26403844356536865, + "learning_rate": 1.6832280201114285e-05, + "loss": 3.8389, + "step": 97965 + }, + { + "epoch": 6.656475064546814, + "grad_norm": 0.5185433626174927, + "learning_rate": 1.682803370023101e-05, + "loss": 3.8621, + "step": 97970 + }, + { + "epoch": 6.656814784617476, + "grad_norm": 0.29152199625968933, + "learning_rate": 1.6823787199347738e-05, + "loss": 4.0053, + "step": 97975 + }, + { + "epoch": 6.657154504688137, + "grad_norm": 0.151423841714859, + "learning_rate": 1.6819540698464466e-05, + "loss": 3.7888, + "step": 97980 + }, + { + "epoch": 6.657494224758799, + "grad_norm": 0.14695465564727783, + "learning_rate": 1.6815294197581194e-05, + "loss": 3.8281, + "step": 97985 + }, + { + "epoch": 6.657833944829461, + "grad_norm": 0.16746075451374054, + "learning_rate": 1.6811047696697922e-05, + "loss": 3.8426, + "step": 97990 + }, + { + "epoch": 6.658173664900122, + "grad_norm": 0.16513259708881378, + "learning_rate": 1.680680119581465e-05, + "loss": 3.8023, + "step": 97995 + }, + { + "epoch": 6.658513384970784, + "grad_norm": 0.1906750351190567, + "learning_rate": 1.6802554694931378e-05, + "loss": 3.7799, + "step": 98000 + }, + { + "epoch": 6.658853105041446, + "grad_norm": 0.16646528244018555, + "learning_rate": 1.6798308194048103e-05, + "loss": 3.9103, + "step": 98005 + }, + { + "epoch": 6.6591928251121075, + "grad_norm": 0.17902129888534546, + "learning_rate": 1.6794061693164834e-05, + "loss": 3.6622, + "step": 98010 + }, + { + "epoch": 6.65953254518277, + "grad_norm": 0.15466973185539246, + "learning_rate": 1.6789815192281562e-05, + "loss": 4.0725, + "step": 98015 + }, + { + "epoch": 6.659872265253431, + "grad_norm": 0.28222909569740295, + "learning_rate": 1.6785568691398287e-05, + "loss": 3.8189, + "step": 98020 + }, + { + "epoch": 6.660211985324093, + "grad_norm": 0.15269222855567932, + "learning_rate": 1.6781322190515018e-05, + "loss": 3.9005, + "step": 98025 + }, + { + "epoch": 6.660551705394755, + "grad_norm": 0.20711030066013336, + "learning_rate": 1.6777075689631743e-05, + "loss": 3.9224, + "step": 98030 + }, + { + "epoch": 6.660891425465416, + "grad_norm": 0.20753814280033112, + "learning_rate": 1.6772829188748474e-05, + "loss": 3.9887, + "step": 98035 + }, + { + "epoch": 6.661231145536078, + "grad_norm": 0.18924148380756378, + "learning_rate": 1.67685826878652e-05, + "loss": 3.7618, + "step": 98040 + }, + { + "epoch": 6.66157086560674, + "grad_norm": 0.17481616139411926, + "learning_rate": 1.6764336186981927e-05, + "loss": 3.8749, + "step": 98045 + }, + { + "epoch": 6.6619105856774015, + "grad_norm": 0.13442890346050262, + "learning_rate": 1.6760089686098658e-05, + "loss": 3.7319, + "step": 98050 + }, + { + "epoch": 6.6622503057480635, + "grad_norm": 0.6438812017440796, + "learning_rate": 1.6755843185215383e-05, + "loss": 3.8632, + "step": 98055 + }, + { + "epoch": 6.662590025818726, + "grad_norm": 0.16961880028247833, + "learning_rate": 1.675159668433211e-05, + "loss": 3.8818, + "step": 98060 + }, + { + "epoch": 6.662929745889387, + "grad_norm": 0.20309574902057648, + "learning_rate": 1.674735018344884e-05, + "loss": 3.7176, + "step": 98065 + }, + { + "epoch": 6.663269465960049, + "grad_norm": 0.20626607537269592, + "learning_rate": 1.6743103682565567e-05, + "loss": 3.8065, + "step": 98070 + }, + { + "epoch": 6.663609186030711, + "grad_norm": 0.2709059417247772, + "learning_rate": 1.6738857181682295e-05, + "loss": 3.9529, + "step": 98075 + }, + { + "epoch": 6.663948906101372, + "grad_norm": 0.19461442530155182, + "learning_rate": 1.6734610680799023e-05, + "loss": 3.6982, + "step": 98080 + }, + { + "epoch": 6.664288626172034, + "grad_norm": 0.234874427318573, + "learning_rate": 1.673036417991575e-05, + "loss": 3.8713, + "step": 98085 + }, + { + "epoch": 6.664628346242696, + "grad_norm": 0.14531412720680237, + "learning_rate": 1.6726117679032475e-05, + "loss": 3.8092, + "step": 98090 + }, + { + "epoch": 6.6649680663133575, + "grad_norm": 0.3363882601261139, + "learning_rate": 1.6721871178149207e-05, + "loss": 3.7163, + "step": 98095 + }, + { + "epoch": 6.6653077863840196, + "grad_norm": 0.1499057114124298, + "learning_rate": 1.6717624677265935e-05, + "loss": 3.6551, + "step": 98100 + }, + { + "epoch": 6.665647506454682, + "grad_norm": 0.38333258032798767, + "learning_rate": 1.671337817638266e-05, + "loss": 3.7812, + "step": 98105 + }, + { + "epoch": 6.665987226525343, + "grad_norm": 0.1776815503835678, + "learning_rate": 1.670913167549939e-05, + "loss": 3.6571, + "step": 98110 + }, + { + "epoch": 6.666326946596005, + "grad_norm": 0.1557912975549698, + "learning_rate": 1.6704885174616115e-05, + "loss": 3.9323, + "step": 98115 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0258861780166626, + "learning_rate": 1.6700638673732847e-05, + "loss": 3.9535, + "step": 98120 + }, + { + "epoch": 6.667006386737328, + "grad_norm": 0.13886287808418274, + "learning_rate": 1.6696392172849575e-05, + "loss": 3.6973, + "step": 98125 + }, + { + "epoch": 6.66734610680799, + "grad_norm": 0.14667785167694092, + "learning_rate": 1.66921456719663e-05, + "loss": 3.9743, + "step": 98130 + }, + { + "epoch": 6.667685826878652, + "grad_norm": 0.275775671005249, + "learning_rate": 1.668789917108303e-05, + "loss": 3.7319, + "step": 98135 + }, + { + "epoch": 6.6680255469493135, + "grad_norm": 0.18896277248859406, + "learning_rate": 1.6683652670199755e-05, + "loss": 3.7013, + "step": 98140 + }, + { + "epoch": 6.668365267019976, + "grad_norm": 0.2870987355709076, + "learning_rate": 1.6679406169316483e-05, + "loss": 3.793, + "step": 98145 + }, + { + "epoch": 6.668704987090638, + "grad_norm": 0.2010570615530014, + "learning_rate": 1.667515966843321e-05, + "loss": 4.0552, + "step": 98150 + }, + { + "epoch": 6.669044707161299, + "grad_norm": 0.19120670855045319, + "learning_rate": 1.667091316754994e-05, + "loss": 3.7905, + "step": 98155 + }, + { + "epoch": 6.669384427231961, + "grad_norm": 0.23384545743465424, + "learning_rate": 1.6666666666666667e-05, + "loss": 3.9172, + "step": 98160 + }, + { + "epoch": 6.669724147302623, + "grad_norm": 0.17290112376213074, + "learning_rate": 1.6662420165783395e-05, + "loss": 3.6126, + "step": 98165 + }, + { + "epoch": 6.670063867373284, + "grad_norm": 0.17672953009605408, + "learning_rate": 1.6658173664900123e-05, + "loss": 3.8338, + "step": 98170 + }, + { + "epoch": 6.670403587443946, + "grad_norm": 0.19455528259277344, + "learning_rate": 1.665392716401685e-05, + "loss": 3.927, + "step": 98175 + }, + { + "epoch": 6.670743307514608, + "grad_norm": 0.19774173200130463, + "learning_rate": 1.664968066313358e-05, + "loss": 4.2089, + "step": 98180 + }, + { + "epoch": 6.6710830275852695, + "grad_norm": 0.22456969320774078, + "learning_rate": 1.6645434162250308e-05, + "loss": 3.8443, + "step": 98185 + }, + { + "epoch": 6.671422747655932, + "grad_norm": 5.130001544952393, + "learning_rate": 1.6641187661367032e-05, + "loss": 3.9414, + "step": 98190 + }, + { + "epoch": 6.671762467726594, + "grad_norm": 0.15741762518882751, + "learning_rate": 1.6636941160483764e-05, + "loss": 3.7728, + "step": 98195 + }, + { + "epoch": 6.672102187797255, + "grad_norm": 0.16530942916870117, + "learning_rate": 1.6632694659600488e-05, + "loss": 3.7921, + "step": 98200 + }, + { + "epoch": 6.672441907867917, + "grad_norm": 0.17111986875534058, + "learning_rate": 1.662844815871722e-05, + "loss": 3.7364, + "step": 98205 + }, + { + "epoch": 6.672781627938579, + "grad_norm": 0.2390504628419876, + "learning_rate": 1.6624201657833948e-05, + "loss": 4.039, + "step": 98210 + }, + { + "epoch": 6.67312134800924, + "grad_norm": 0.16763822734355927, + "learning_rate": 1.6619955156950672e-05, + "loss": 3.7603, + "step": 98215 + }, + { + "epoch": 6.673461068079902, + "grad_norm": 0.17900164425373077, + "learning_rate": 1.6615708656067404e-05, + "loss": 3.9581, + "step": 98220 + }, + { + "epoch": 6.673800788150564, + "grad_norm": 0.1574012190103531, + "learning_rate": 1.6611462155184128e-05, + "loss": 3.596, + "step": 98225 + }, + { + "epoch": 6.6741405082212255, + "grad_norm": 0.18963418900966644, + "learning_rate": 1.6607215654300856e-05, + "loss": 3.6808, + "step": 98230 + }, + { + "epoch": 6.674480228291888, + "grad_norm": 0.13888901472091675, + "learning_rate": 1.6602969153417584e-05, + "loss": 4.1215, + "step": 98235 + }, + { + "epoch": 6.674819948362549, + "grad_norm": 0.2003876268863678, + "learning_rate": 1.6598722652534312e-05, + "loss": 3.7705, + "step": 98240 + }, + { + "epoch": 6.675159668433211, + "grad_norm": 0.13179218769073486, + "learning_rate": 1.659447615165104e-05, + "loss": 3.9479, + "step": 98245 + }, + { + "epoch": 6.675499388503873, + "grad_norm": 0.180047407746315, + "learning_rate": 1.6590229650767768e-05, + "loss": 3.9542, + "step": 98250 + }, + { + "epoch": 6.675839108574534, + "grad_norm": 0.4793492555618286, + "learning_rate": 1.6585983149884496e-05, + "loss": 3.53, + "step": 98255 + }, + { + "epoch": 6.676178828645196, + "grad_norm": 0.1805434226989746, + "learning_rate": 1.6581736649001224e-05, + "loss": 3.7736, + "step": 98260 + }, + { + "epoch": 6.676518548715858, + "grad_norm": 0.17798875272274017, + "learning_rate": 1.6577490148117952e-05, + "loss": 3.792, + "step": 98265 + }, + { + "epoch": 6.6768582687865194, + "grad_norm": 0.17498989403247833, + "learning_rate": 1.657324364723468e-05, + "loss": 3.6656, + "step": 98270 + }, + { + "epoch": 6.6771979888571815, + "grad_norm": 0.2513429820537567, + "learning_rate": 1.6568997146351405e-05, + "loss": 3.5976, + "step": 98275 + }, + { + "epoch": 6.677537708927844, + "grad_norm": 0.14138750731945038, + "learning_rate": 1.6564750645468136e-05, + "loss": 3.8585, + "step": 98280 + }, + { + "epoch": 6.677877428998505, + "grad_norm": 0.14396996796131134, + "learning_rate": 1.656050414458486e-05, + "loss": 3.921, + "step": 98285 + }, + { + "epoch": 6.678217149069167, + "grad_norm": 0.18028613924980164, + "learning_rate": 1.6556257643701592e-05, + "loss": 3.8947, + "step": 98290 + }, + { + "epoch": 6.678556869139829, + "grad_norm": 0.17879055440425873, + "learning_rate": 1.655201114281832e-05, + "loss": 3.7001, + "step": 98295 + }, + { + "epoch": 6.67889658921049, + "grad_norm": 0.25114530324935913, + "learning_rate": 1.6547764641935045e-05, + "loss": 3.843, + "step": 98300 + }, + { + "epoch": 6.679236309281152, + "grad_norm": 0.17328087985515594, + "learning_rate": 1.6543518141051776e-05, + "loss": 3.8743, + "step": 98305 + }, + { + "epoch": 6.679576029351814, + "grad_norm": 0.14980317652225494, + "learning_rate": 1.65392716401685e-05, + "loss": 3.6835, + "step": 98310 + }, + { + "epoch": 6.6799157494224755, + "grad_norm": 0.13088329136371613, + "learning_rate": 1.653502513928523e-05, + "loss": 3.9469, + "step": 98315 + }, + { + "epoch": 6.6802554694931375, + "grad_norm": 1.1719409227371216, + "learning_rate": 1.6530778638401957e-05, + "loss": 3.9838, + "step": 98320 + }, + { + "epoch": 6.6805951895638, + "grad_norm": 0.18541766703128815, + "learning_rate": 1.6526532137518685e-05, + "loss": 3.787, + "step": 98325 + }, + { + "epoch": 6.680934909634461, + "grad_norm": 0.16214284300804138, + "learning_rate": 1.6522285636635413e-05, + "loss": 3.9063, + "step": 98330 + }, + { + "epoch": 6.681274629705123, + "grad_norm": 0.1836778223514557, + "learning_rate": 1.651803913575214e-05, + "loss": 3.6475, + "step": 98335 + }, + { + "epoch": 6.681614349775785, + "grad_norm": 0.15890644490718842, + "learning_rate": 1.651379263486887e-05, + "loss": 3.9388, + "step": 98340 + }, + { + "epoch": 6.681954069846446, + "grad_norm": 0.3115396499633789, + "learning_rate": 1.6509546133985597e-05, + "loss": 3.971, + "step": 98345 + }, + { + "epoch": 6.682293789917108, + "grad_norm": 0.17291703820228577, + "learning_rate": 1.6505299633102325e-05, + "loss": 3.8925, + "step": 98350 + }, + { + "epoch": 6.68263350998777, + "grad_norm": 0.22654500603675842, + "learning_rate": 1.6501053132219053e-05, + "loss": 3.9488, + "step": 98355 + }, + { + "epoch": 6.6829732300584315, + "grad_norm": 0.22150906920433044, + "learning_rate": 1.6496806631335778e-05, + "loss": 3.9231, + "step": 98360 + }, + { + "epoch": 6.6833129501290935, + "grad_norm": 0.23671981692314148, + "learning_rate": 1.649256013045251e-05, + "loss": 4.0157, + "step": 98365 + }, + { + "epoch": 6.683652670199756, + "grad_norm": 0.1471484750509262, + "learning_rate": 1.6488313629569234e-05, + "loss": 3.4989, + "step": 98370 + }, + { + "epoch": 6.683992390270417, + "grad_norm": 0.25421252846717834, + "learning_rate": 1.6484067128685965e-05, + "loss": 3.7604, + "step": 98375 + }, + { + "epoch": 6.684332110341079, + "grad_norm": 0.42949414253234863, + "learning_rate": 1.6479820627802693e-05, + "loss": 3.8441, + "step": 98380 + }, + { + "epoch": 6.684671830411741, + "grad_norm": 0.1904306858778, + "learning_rate": 1.6475574126919418e-05, + "loss": 4.0657, + "step": 98385 + }, + { + "epoch": 6.685011550482402, + "grad_norm": 0.19492800533771515, + "learning_rate": 1.647132762603615e-05, + "loss": 3.7991, + "step": 98390 + }, + { + "epoch": 6.685351270553064, + "grad_norm": 0.16607053577899933, + "learning_rate": 1.6467081125152874e-05, + "loss": 4.0704, + "step": 98395 + }, + { + "epoch": 6.685690990623726, + "grad_norm": 0.28082793951034546, + "learning_rate": 1.64628346242696e-05, + "loss": 3.7934, + "step": 98400 + }, + { + "epoch": 6.6860307106943875, + "grad_norm": 0.1882369965314865, + "learning_rate": 1.6458588123386333e-05, + "loss": 3.9885, + "step": 98405 + }, + { + "epoch": 6.6863704307650496, + "grad_norm": 0.14946770668029785, + "learning_rate": 1.6454341622503058e-05, + "loss": 3.666, + "step": 98410 + }, + { + "epoch": 6.686710150835712, + "grad_norm": 0.19830875098705292, + "learning_rate": 1.6450095121619786e-05, + "loss": 3.8077, + "step": 98415 + }, + { + "epoch": 6.687049870906373, + "grad_norm": 0.16056491434574127, + "learning_rate": 1.6445848620736514e-05, + "loss": 3.6775, + "step": 98420 + }, + { + "epoch": 6.687389590977035, + "grad_norm": 0.16085097193717957, + "learning_rate": 1.6441602119853242e-05, + "loss": 3.7974, + "step": 98425 + }, + { + "epoch": 6.687729311047697, + "grad_norm": 0.17273736000061035, + "learning_rate": 1.643735561896997e-05, + "loss": 3.6249, + "step": 98430 + }, + { + "epoch": 6.688069031118358, + "grad_norm": 0.2691991925239563, + "learning_rate": 1.6433109118086698e-05, + "loss": 3.7873, + "step": 98435 + }, + { + "epoch": 6.68840875118902, + "grad_norm": 0.16120530664920807, + "learning_rate": 1.6428862617203426e-05, + "loss": 3.7922, + "step": 98440 + }, + { + "epoch": 6.688748471259682, + "grad_norm": 0.18705761432647705, + "learning_rate": 1.642461611632015e-05, + "loss": 3.8333, + "step": 98445 + }, + { + "epoch": 6.6890881913303435, + "grad_norm": 0.13492579758167267, + "learning_rate": 1.6420369615436882e-05, + "loss": 4.2134, + "step": 98450 + }, + { + "epoch": 6.689427911401006, + "grad_norm": 0.16114817559719086, + "learning_rate": 1.641612311455361e-05, + "loss": 3.8408, + "step": 98455 + }, + { + "epoch": 6.689767631471668, + "grad_norm": 0.16160941123962402, + "learning_rate": 1.6411876613670338e-05, + "loss": 3.9276, + "step": 98460 + }, + { + "epoch": 6.690107351542329, + "grad_norm": 0.17079779505729675, + "learning_rate": 1.6407630112787066e-05, + "loss": 3.9335, + "step": 98465 + }, + { + "epoch": 6.690447071612991, + "grad_norm": 0.14601749181747437, + "learning_rate": 1.640338361190379e-05, + "loss": 3.5294, + "step": 98470 + }, + { + "epoch": 6.690786791683653, + "grad_norm": 0.13062416017055511, + "learning_rate": 1.6399137111020522e-05, + "loss": 3.8232, + "step": 98475 + }, + { + "epoch": 6.691126511754314, + "grad_norm": 0.163959339261055, + "learning_rate": 1.6394890610137246e-05, + "loss": 3.7921, + "step": 98480 + }, + { + "epoch": 6.691466231824976, + "grad_norm": 0.7196491956710815, + "learning_rate": 1.6390644109253974e-05, + "loss": 3.9713, + "step": 98485 + }, + { + "epoch": 6.691805951895638, + "grad_norm": 0.18440058827400208, + "learning_rate": 1.6386397608370706e-05, + "loss": 3.6702, + "step": 98490 + }, + { + "epoch": 6.6921456719662995, + "grad_norm": 0.1460983157157898, + "learning_rate": 1.638215110748743e-05, + "loss": 3.7455, + "step": 98495 + }, + { + "epoch": 6.692485392036962, + "grad_norm": 0.21249976754188538, + "learning_rate": 1.637790460660416e-05, + "loss": 3.8781, + "step": 98500 + }, + { + "epoch": 6.692825112107624, + "grad_norm": 0.20766420662403107, + "learning_rate": 1.6373658105720886e-05, + "loss": 3.7742, + "step": 98505 + }, + { + "epoch": 6.693164832178285, + "grad_norm": 0.33264607191085815, + "learning_rate": 1.6369411604837614e-05, + "loss": 3.8774, + "step": 98510 + }, + { + "epoch": 6.693504552248947, + "grad_norm": 0.23635530471801758, + "learning_rate": 1.6365165103954342e-05, + "loss": 3.8414, + "step": 98515 + }, + { + "epoch": 6.693844272319609, + "grad_norm": 0.18688587844371796, + "learning_rate": 1.636091860307107e-05, + "loss": 3.6715, + "step": 98520 + }, + { + "epoch": 6.69418399239027, + "grad_norm": 0.7956392765045166, + "learning_rate": 1.63566721021878e-05, + "loss": 3.63, + "step": 98525 + }, + { + "epoch": 6.694523712460932, + "grad_norm": 0.22569504380226135, + "learning_rate": 1.6352425601304523e-05, + "loss": 4.0211, + "step": 98530 + }, + { + "epoch": 6.694863432531594, + "grad_norm": 0.1724555343389511, + "learning_rate": 1.6348179100421254e-05, + "loss": 3.7492, + "step": 98535 + }, + { + "epoch": 6.6952031526022555, + "grad_norm": 0.20071309804916382, + "learning_rate": 1.6343932599537982e-05, + "loss": 3.7826, + "step": 98540 + }, + { + "epoch": 6.695542872672918, + "grad_norm": 0.13057146966457367, + "learning_rate": 1.633968609865471e-05, + "loss": 3.9718, + "step": 98545 + }, + { + "epoch": 6.69588259274358, + "grad_norm": 0.20095194876194, + "learning_rate": 1.633543959777144e-05, + "loss": 3.9215, + "step": 98550 + }, + { + "epoch": 6.696222312814241, + "grad_norm": 0.20940545201301575, + "learning_rate": 1.6331193096888163e-05, + "loss": 3.7303, + "step": 98555 + }, + { + "epoch": 6.696562032884903, + "grad_norm": 0.16892896592617035, + "learning_rate": 1.6326946596004895e-05, + "loss": 3.7975, + "step": 98560 + }, + { + "epoch": 6.696901752955565, + "grad_norm": 0.457302063703537, + "learning_rate": 1.632270009512162e-05, + "loss": 3.7651, + "step": 98565 + }, + { + "epoch": 6.697241473026226, + "grad_norm": 0.1660194844007492, + "learning_rate": 1.6318453594238347e-05, + "loss": 3.8184, + "step": 98570 + }, + { + "epoch": 6.697581193096888, + "grad_norm": 0.16730770468711853, + "learning_rate": 1.631420709335508e-05, + "loss": 3.5999, + "step": 98575 + }, + { + "epoch": 6.69792091316755, + "grad_norm": 0.6469573378562927, + "learning_rate": 1.6309960592471803e-05, + "loss": 3.9194, + "step": 98580 + }, + { + "epoch": 6.6982606332382115, + "grad_norm": 0.17927148938179016, + "learning_rate": 1.630571409158853e-05, + "loss": 3.5593, + "step": 98585 + }, + { + "epoch": 6.698600353308874, + "grad_norm": 0.17910949885845184, + "learning_rate": 1.630146759070526e-05, + "loss": 3.7621, + "step": 98590 + }, + { + "epoch": 6.698940073379536, + "grad_norm": 0.15708699822425842, + "learning_rate": 1.6297221089821987e-05, + "loss": 3.6052, + "step": 98595 + }, + { + "epoch": 6.699279793450197, + "grad_norm": 0.175208181142807, + "learning_rate": 1.6292974588938715e-05, + "loss": 3.8483, + "step": 98600 + }, + { + "epoch": 6.699619513520859, + "grad_norm": 0.17385853826999664, + "learning_rate": 1.6288728088055443e-05, + "loss": 3.902, + "step": 98605 + }, + { + "epoch": 6.699959233591521, + "grad_norm": 0.5756554007530212, + "learning_rate": 1.628448158717217e-05, + "loss": 4.0038, + "step": 98610 + }, + { + "epoch": 6.700298953662182, + "grad_norm": 0.15272359549999237, + "learning_rate": 1.6280235086288896e-05, + "loss": 3.639, + "step": 98615 + }, + { + "epoch": 6.700638673732844, + "grad_norm": 0.17996317148208618, + "learning_rate": 1.6275988585405627e-05, + "loss": 3.7727, + "step": 98620 + }, + { + "epoch": 6.700978393803506, + "grad_norm": 0.1686866283416748, + "learning_rate": 1.6271742084522355e-05, + "loss": 3.8313, + "step": 98625 + }, + { + "epoch": 6.7013181138741675, + "grad_norm": 0.15680605173110962, + "learning_rate": 1.6267495583639083e-05, + "loss": 3.7282, + "step": 98630 + }, + { + "epoch": 6.70165783394483, + "grad_norm": 0.17983326315879822, + "learning_rate": 1.626324908275581e-05, + "loss": 3.7038, + "step": 98635 + }, + { + "epoch": 6.701997554015492, + "grad_norm": 0.489094614982605, + "learning_rate": 1.6259002581872536e-05, + "loss": 3.9977, + "step": 98640 + }, + { + "epoch": 6.702337274086153, + "grad_norm": 0.17080527544021606, + "learning_rate": 1.6254756080989267e-05, + "loss": 3.7813, + "step": 98645 + }, + { + "epoch": 6.702676994156815, + "grad_norm": 0.1680312156677246, + "learning_rate": 1.6250509580105995e-05, + "loss": 3.7065, + "step": 98650 + }, + { + "epoch": 6.703016714227477, + "grad_norm": 0.16404949128627777, + "learning_rate": 1.624626307922272e-05, + "loss": 3.7277, + "step": 98655 + }, + { + "epoch": 6.703356434298138, + "grad_norm": 0.1866694539785385, + "learning_rate": 1.624201657833945e-05, + "loss": 3.8806, + "step": 98660 + }, + { + "epoch": 6.7036961543688, + "grad_norm": 3.1008100509643555, + "learning_rate": 1.6237770077456176e-05, + "loss": 3.8504, + "step": 98665 + }, + { + "epoch": 6.704035874439462, + "grad_norm": 0.629716157913208, + "learning_rate": 1.6233523576572904e-05, + "loss": 3.788, + "step": 98670 + }, + { + "epoch": 6.7043755945101235, + "grad_norm": 0.2788126766681671, + "learning_rate": 1.6229277075689632e-05, + "loss": 3.6449, + "step": 98675 + }, + { + "epoch": 6.704715314580786, + "grad_norm": 0.1557047814130783, + "learning_rate": 1.622503057480636e-05, + "loss": 3.691, + "step": 98680 + }, + { + "epoch": 6.705055034651448, + "grad_norm": 0.1621730774641037, + "learning_rate": 1.6220784073923088e-05, + "loss": 3.9578, + "step": 98685 + }, + { + "epoch": 6.705394754722109, + "grad_norm": 0.1856575459241867, + "learning_rate": 1.6216537573039816e-05, + "loss": 3.8064, + "step": 98690 + }, + { + "epoch": 6.705734474792771, + "grad_norm": 0.19335247576236725, + "learning_rate": 1.6212291072156544e-05, + "loss": 3.9115, + "step": 98695 + }, + { + "epoch": 6.706074194863432, + "grad_norm": 0.20928283035755157, + "learning_rate": 1.6208044571273272e-05, + "loss": 3.874, + "step": 98700 + }, + { + "epoch": 6.706413914934094, + "grad_norm": 0.20524418354034424, + "learning_rate": 1.620379807039e-05, + "loss": 3.9376, + "step": 98705 + }, + { + "epoch": 6.706753635004756, + "grad_norm": 0.13492488861083984, + "learning_rate": 1.6199551569506728e-05, + "loss": 3.8376, + "step": 98710 + }, + { + "epoch": 6.7070933550754175, + "grad_norm": 0.18164819478988647, + "learning_rate": 1.6195305068623456e-05, + "loss": 3.8985, + "step": 98715 + }, + { + "epoch": 6.70743307514608, + "grad_norm": 0.18826881051063538, + "learning_rate": 1.6191058567740184e-05, + "loss": 3.8474, + "step": 98720 + }, + { + "epoch": 6.707772795216742, + "grad_norm": 0.19228388369083405, + "learning_rate": 1.618681206685691e-05, + "loss": 3.8761, + "step": 98725 + }, + { + "epoch": 6.708112515287403, + "grad_norm": 0.16982296109199524, + "learning_rate": 1.618256556597364e-05, + "loss": 3.8309, + "step": 98730 + }, + { + "epoch": 6.708452235358065, + "grad_norm": 0.16605034470558167, + "learning_rate": 1.6178319065090368e-05, + "loss": 3.6461, + "step": 98735 + }, + { + "epoch": 6.708791955428727, + "grad_norm": 1.4809061288833618, + "learning_rate": 1.6174072564207093e-05, + "loss": 3.6429, + "step": 98740 + }, + { + "epoch": 6.709131675499388, + "grad_norm": 0.13534021377563477, + "learning_rate": 1.6169826063323824e-05, + "loss": 3.769, + "step": 98745 + }, + { + "epoch": 6.70947139557005, + "grad_norm": 0.16820576786994934, + "learning_rate": 1.616557956244055e-05, + "loss": 3.8803, + "step": 98750 + }, + { + "epoch": 6.709811115640712, + "grad_norm": 0.157270148396492, + "learning_rate": 1.6161333061557277e-05, + "loss": 4.1104, + "step": 98755 + }, + { + "epoch": 6.7101508357113735, + "grad_norm": 0.28846243023872375, + "learning_rate": 1.6157086560674005e-05, + "loss": 3.9412, + "step": 98760 + }, + { + "epoch": 6.710490555782036, + "grad_norm": 0.14579714834690094, + "learning_rate": 1.6152840059790733e-05, + "loss": 4.0562, + "step": 98765 + }, + { + "epoch": 6.710830275852698, + "grad_norm": 0.1546497344970703, + "learning_rate": 1.614859355890746e-05, + "loss": 3.9647, + "step": 98770 + }, + { + "epoch": 6.711169995923359, + "grad_norm": 0.2842707335948944, + "learning_rate": 1.614434705802419e-05, + "loss": 3.7919, + "step": 98775 + }, + { + "epoch": 6.711509715994021, + "grad_norm": 2.675126552581787, + "learning_rate": 1.6140100557140917e-05, + "loss": 3.9711, + "step": 98780 + }, + { + "epoch": 6.711849436064683, + "grad_norm": 0.22742068767547607, + "learning_rate": 1.6135854056257645e-05, + "loss": 3.7315, + "step": 98785 + }, + { + "epoch": 6.712189156135344, + "grad_norm": 0.19312886893749237, + "learning_rate": 1.6131607555374373e-05, + "loss": 3.771, + "step": 98790 + }, + { + "epoch": 6.712528876206006, + "grad_norm": 0.17437608540058136, + "learning_rate": 1.61273610544911e-05, + "loss": 3.8954, + "step": 98795 + }, + { + "epoch": 6.712868596276668, + "grad_norm": 0.19070729613304138, + "learning_rate": 1.612311455360783e-05, + "loss": 3.7454, + "step": 98800 + }, + { + "epoch": 6.7132083163473295, + "grad_norm": 0.4969911575317383, + "learning_rate": 1.6118868052724557e-05, + "loss": 3.7173, + "step": 98805 + }, + { + "epoch": 6.713548036417992, + "grad_norm": 0.6132063269615173, + "learning_rate": 1.611462155184128e-05, + "loss": 3.8235, + "step": 98810 + }, + { + "epoch": 6.713887756488654, + "grad_norm": 0.18753117322921753, + "learning_rate": 1.6110375050958013e-05, + "loss": 3.8769, + "step": 98815 + }, + { + "epoch": 6.714227476559315, + "grad_norm": 0.16667622327804565, + "learning_rate": 1.610612855007474e-05, + "loss": 3.8961, + "step": 98820 + }, + { + "epoch": 6.714567196629977, + "grad_norm": 0.29447177052497864, + "learning_rate": 1.6101882049191465e-05, + "loss": 3.7694, + "step": 98825 + }, + { + "epoch": 6.714906916700639, + "grad_norm": 0.15447412431240082, + "learning_rate": 1.6097635548308197e-05, + "loss": 3.6386, + "step": 98830 + }, + { + "epoch": 6.7152466367713, + "grad_norm": 0.17791493237018585, + "learning_rate": 1.609338904742492e-05, + "loss": 3.9442, + "step": 98835 + }, + { + "epoch": 6.715586356841962, + "grad_norm": 0.16637644171714783, + "learning_rate": 1.608914254654165e-05, + "loss": 3.9244, + "step": 98840 + }, + { + "epoch": 6.715926076912624, + "grad_norm": 0.2400028556585312, + "learning_rate": 1.608489604565838e-05, + "loss": 3.6789, + "step": 98845 + }, + { + "epoch": 6.7162657969832855, + "grad_norm": 0.18333710730075836, + "learning_rate": 1.6080649544775105e-05, + "loss": 3.7013, + "step": 98850 + }, + { + "epoch": 6.716605517053948, + "grad_norm": 0.23217234015464783, + "learning_rate": 1.6076403043891833e-05, + "loss": 3.8399, + "step": 98855 + }, + { + "epoch": 6.71694523712461, + "grad_norm": 0.15747149288654327, + "learning_rate": 1.607215654300856e-05, + "loss": 3.6785, + "step": 98860 + }, + { + "epoch": 6.717284957195271, + "grad_norm": 0.15794406831264496, + "learning_rate": 1.606791004212529e-05, + "loss": 4.0277, + "step": 98865 + }, + { + "epoch": 6.717624677265933, + "grad_norm": 0.1457022875547409, + "learning_rate": 1.6063663541242017e-05, + "loss": 3.8861, + "step": 98870 + }, + { + "epoch": 6.717964397336595, + "grad_norm": 0.2076546847820282, + "learning_rate": 1.6059417040358745e-05, + "loss": 3.781, + "step": 98875 + }, + { + "epoch": 6.718304117407256, + "grad_norm": 0.19274134933948517, + "learning_rate": 1.6055170539475473e-05, + "loss": 4.1358, + "step": 98880 + }, + { + "epoch": 6.718643837477918, + "grad_norm": 0.22280625998973846, + "learning_rate": 1.60509240385922e-05, + "loss": 3.6653, + "step": 98885 + }, + { + "epoch": 6.71898355754858, + "grad_norm": 0.9054977893829346, + "learning_rate": 1.604667753770893e-05, + "loss": 3.7759, + "step": 98890 + }, + { + "epoch": 6.7193232776192415, + "grad_norm": 0.341184139251709, + "learning_rate": 1.6042431036825657e-05, + "loss": 3.6219, + "step": 98895 + }, + { + "epoch": 6.719662997689904, + "grad_norm": 0.15851044654846191, + "learning_rate": 1.6038184535942386e-05, + "loss": 3.6521, + "step": 98900 + }, + { + "epoch": 6.720002717760566, + "grad_norm": 0.15840181708335876, + "learning_rate": 1.6033938035059114e-05, + "loss": 3.7286, + "step": 98905 + }, + { + "epoch": 6.720342437831227, + "grad_norm": 0.15422289073467255, + "learning_rate": 1.6029691534175838e-05, + "loss": 3.908, + "step": 98910 + }, + { + "epoch": 6.720682157901889, + "grad_norm": 0.21011418104171753, + "learning_rate": 1.602544503329257e-05, + "loss": 3.7289, + "step": 98915 + }, + { + "epoch": 6.72102187797255, + "grad_norm": 0.3595767617225647, + "learning_rate": 1.6021198532409294e-05, + "loss": 3.9835, + "step": 98920 + }, + { + "epoch": 6.721361598043212, + "grad_norm": 0.15673963725566864, + "learning_rate": 1.6016952031526022e-05, + "loss": 3.9069, + "step": 98925 + }, + { + "epoch": 6.721701318113874, + "grad_norm": 0.13962621986865997, + "learning_rate": 1.6012705530642754e-05, + "loss": 3.9704, + "step": 98930 + }, + { + "epoch": 6.7220410381845355, + "grad_norm": 0.20340417325496674, + "learning_rate": 1.6008459029759478e-05, + "loss": 4.0033, + "step": 98935 + }, + { + "epoch": 6.7223807582551975, + "grad_norm": 0.1638823300600052, + "learning_rate": 1.6004212528876206e-05, + "loss": 3.8265, + "step": 98940 + }, + { + "epoch": 6.72272047832586, + "grad_norm": 0.14229202270507812, + "learning_rate": 1.5999966027992934e-05, + "loss": 3.695, + "step": 98945 + }, + { + "epoch": 6.723060198396521, + "grad_norm": 2.620544910430908, + "learning_rate": 1.5995719527109662e-05, + "loss": 3.9173, + "step": 98950 + }, + { + "epoch": 6.723399918467183, + "grad_norm": 0.33965572714805603, + "learning_rate": 1.599147302622639e-05, + "loss": 3.9229, + "step": 98955 + }, + { + "epoch": 6.723739638537845, + "grad_norm": 0.15481655299663544, + "learning_rate": 1.5987226525343118e-05, + "loss": 3.7691, + "step": 98960 + }, + { + "epoch": 6.724079358608506, + "grad_norm": 0.14432194828987122, + "learning_rate": 1.5982980024459846e-05, + "loss": 4.0632, + "step": 98965 + }, + { + "epoch": 6.724419078679168, + "grad_norm": 0.2106795758008957, + "learning_rate": 1.5978733523576574e-05, + "loss": 3.8667, + "step": 98970 + }, + { + "epoch": 6.72475879874983, + "grad_norm": 0.36858662962913513, + "learning_rate": 1.5974487022693302e-05, + "loss": 3.7409, + "step": 98975 + }, + { + "epoch": 6.7250985188204915, + "grad_norm": 0.16509468853473663, + "learning_rate": 1.597024052181003e-05, + "loss": 3.8374, + "step": 98980 + }, + { + "epoch": 6.7254382388911536, + "grad_norm": 0.3186328411102295, + "learning_rate": 1.5965994020926758e-05, + "loss": 3.6966, + "step": 98985 + }, + { + "epoch": 6.725777958961816, + "grad_norm": 0.4708883762359619, + "learning_rate": 1.5961747520043486e-05, + "loss": 4.0558, + "step": 98990 + }, + { + "epoch": 6.726117679032477, + "grad_norm": 0.15792927145957947, + "learning_rate": 1.595750101916021e-05, + "loss": 3.9166, + "step": 98995 + }, + { + "epoch": 6.726457399103139, + "grad_norm": 0.26420798897743225, + "learning_rate": 1.5953254518276942e-05, + "loss": 3.8253, + "step": 99000 + }, + { + "epoch": 6.726797119173801, + "grad_norm": 0.1636495292186737, + "learning_rate": 1.5949008017393667e-05, + "loss": 4.0563, + "step": 99005 + }, + { + "epoch": 6.727136839244462, + "grad_norm": 0.40778428316116333, + "learning_rate": 1.5944761516510395e-05, + "loss": 3.6754, + "step": 99010 + }, + { + "epoch": 6.727476559315124, + "grad_norm": 0.15900716185569763, + "learning_rate": 1.5940515015627126e-05, + "loss": 3.9005, + "step": 99015 + }, + { + "epoch": 6.727816279385786, + "grad_norm": 0.18749745190143585, + "learning_rate": 1.593626851474385e-05, + "loss": 3.8042, + "step": 99020 + }, + { + "epoch": 6.7281559994564475, + "grad_norm": 0.18166732788085938, + "learning_rate": 1.593202201386058e-05, + "loss": 4.1368, + "step": 99025 + }, + { + "epoch": 6.72849571952711, + "grad_norm": 0.20500335097312927, + "learning_rate": 1.5927775512977307e-05, + "loss": 3.9744, + "step": 99030 + }, + { + "epoch": 6.728835439597772, + "grad_norm": 0.21016615629196167, + "learning_rate": 1.5923529012094035e-05, + "loss": 3.8905, + "step": 99035 + }, + { + "epoch": 6.729175159668433, + "grad_norm": 0.16305550932884216, + "learning_rate": 1.5919282511210763e-05, + "loss": 3.8717, + "step": 99040 + }, + { + "epoch": 6.729514879739095, + "grad_norm": 0.17022882401943207, + "learning_rate": 1.591503601032749e-05, + "loss": 3.8415, + "step": 99045 + }, + { + "epoch": 6.729854599809757, + "grad_norm": 0.7480267286300659, + "learning_rate": 1.591078950944422e-05, + "loss": 3.8266, + "step": 99050 + }, + { + "epoch": 6.730194319880418, + "grad_norm": 0.2969494163990021, + "learning_rate": 1.5906543008560947e-05, + "loss": 3.77, + "step": 99055 + }, + { + "epoch": 6.73053403995108, + "grad_norm": 0.2005760222673416, + "learning_rate": 1.5902296507677675e-05, + "loss": 3.8428, + "step": 99060 + }, + { + "epoch": 6.730873760021742, + "grad_norm": 0.186527818441391, + "learning_rate": 1.5898050006794403e-05, + "loss": 3.5548, + "step": 99065 + }, + { + "epoch": 6.7312134800924035, + "grad_norm": 0.7967578172683716, + "learning_rate": 1.589380350591113e-05, + "loss": 4.1308, + "step": 99070 + }, + { + "epoch": 6.731553200163066, + "grad_norm": 0.20304535329341888, + "learning_rate": 1.588955700502786e-05, + "loss": 3.9165, + "step": 99075 + }, + { + "epoch": 6.731892920233728, + "grad_norm": 0.14525052905082703, + "learning_rate": 1.5885310504144584e-05, + "loss": 3.8004, + "step": 99080 + }, + { + "epoch": 6.732232640304389, + "grad_norm": 0.14860835671424866, + "learning_rate": 1.5881064003261315e-05, + "loss": 3.8243, + "step": 99085 + }, + { + "epoch": 6.732572360375051, + "grad_norm": 0.16230911016464233, + "learning_rate": 1.5876817502378043e-05, + "loss": 3.712, + "step": 99090 + }, + { + "epoch": 6.732912080445713, + "grad_norm": 0.1794179528951645, + "learning_rate": 1.5872571001494768e-05, + "loss": 3.6994, + "step": 99095 + }, + { + "epoch": 6.733251800516374, + "grad_norm": 0.1492609977722168, + "learning_rate": 1.58683245006115e-05, + "loss": 3.7895, + "step": 99100 + }, + { + "epoch": 6.733591520587036, + "grad_norm": 0.18041987717151642, + "learning_rate": 1.5864077999728224e-05, + "loss": 3.6966, + "step": 99105 + }, + { + "epoch": 6.733931240657698, + "grad_norm": 0.13441193103790283, + "learning_rate": 1.585983149884495e-05, + "loss": 3.7036, + "step": 99110 + }, + { + "epoch": 6.7342709607283595, + "grad_norm": 0.19330310821533203, + "learning_rate": 1.585558499796168e-05, + "loss": 3.803, + "step": 99115 + }, + { + "epoch": 6.734610680799022, + "grad_norm": 0.16887061297893524, + "learning_rate": 1.5851338497078408e-05, + "loss": 3.6892, + "step": 99120 + }, + { + "epoch": 6.734950400869684, + "grad_norm": 0.2238435298204422, + "learning_rate": 1.5847091996195136e-05, + "loss": 3.6408, + "step": 99125 + }, + { + "epoch": 6.735290120940345, + "grad_norm": 1.4089698791503906, + "learning_rate": 1.5842845495311864e-05, + "loss": 4.0662, + "step": 99130 + }, + { + "epoch": 6.735629841011007, + "grad_norm": 0.16329938173294067, + "learning_rate": 1.5838598994428592e-05, + "loss": 3.6133, + "step": 99135 + }, + { + "epoch": 6.735969561081669, + "grad_norm": 0.15337024629116058, + "learning_rate": 1.583435249354532e-05, + "loss": 3.8556, + "step": 99140 + }, + { + "epoch": 6.73630928115233, + "grad_norm": 0.8897867202758789, + "learning_rate": 1.5830105992662048e-05, + "loss": 3.8098, + "step": 99145 + }, + { + "epoch": 6.736649001222992, + "grad_norm": 0.14756858348846436, + "learning_rate": 1.5825859491778776e-05, + "loss": 3.838, + "step": 99150 + }, + { + "epoch": 6.736988721293654, + "grad_norm": 0.2005806714296341, + "learning_rate": 1.5821612990895504e-05, + "loss": 3.7531, + "step": 99155 + }, + { + "epoch": 6.7373284413643155, + "grad_norm": 0.155650332570076, + "learning_rate": 1.5817366490012232e-05, + "loss": 3.6757, + "step": 99160 + }, + { + "epoch": 6.737668161434978, + "grad_norm": 0.17244596779346466, + "learning_rate": 1.5813119989128956e-05, + "loss": 4.0516, + "step": 99165 + }, + { + "epoch": 6.73800788150564, + "grad_norm": 0.19129084050655365, + "learning_rate": 1.5808873488245688e-05, + "loss": 3.7673, + "step": 99170 + }, + { + "epoch": 6.738347601576301, + "grad_norm": 0.16407275199890137, + "learning_rate": 1.5804626987362416e-05, + "loss": 3.837, + "step": 99175 + }, + { + "epoch": 6.738687321646963, + "grad_norm": 0.17693626880645752, + "learning_rate": 1.580038048647914e-05, + "loss": 3.8403, + "step": 99180 + }, + { + "epoch": 6.739027041717625, + "grad_norm": 0.30385351181030273, + "learning_rate": 1.5796133985595872e-05, + "loss": 3.8237, + "step": 99185 + }, + { + "epoch": 6.739366761788286, + "grad_norm": 0.21968534588813782, + "learning_rate": 1.5791887484712596e-05, + "loss": 3.9073, + "step": 99190 + }, + { + "epoch": 6.739706481858948, + "grad_norm": 0.18497291207313538, + "learning_rate": 1.5787640983829324e-05, + "loss": 3.8653, + "step": 99195 + }, + { + "epoch": 6.74004620192961, + "grad_norm": 0.1455434411764145, + "learning_rate": 1.5783394482946052e-05, + "loss": 3.9252, + "step": 99200 + }, + { + "epoch": 6.7403859220002715, + "grad_norm": 0.18357117474079132, + "learning_rate": 1.577914798206278e-05, + "loss": 4.1069, + "step": 99205 + }, + { + "epoch": 6.740725642070934, + "grad_norm": 0.17552082240581512, + "learning_rate": 1.577490148117951e-05, + "loss": 3.9311, + "step": 99210 + }, + { + "epoch": 6.741065362141596, + "grad_norm": 0.1963225156068802, + "learning_rate": 1.5770654980296236e-05, + "loss": 3.7175, + "step": 99215 + }, + { + "epoch": 6.741405082212257, + "grad_norm": 0.16584010422229767, + "learning_rate": 1.5766408479412964e-05, + "loss": 3.7263, + "step": 99220 + }, + { + "epoch": 6.741744802282919, + "grad_norm": 0.5905805826187134, + "learning_rate": 1.5762161978529692e-05, + "loss": 3.6842, + "step": 99225 + }, + { + "epoch": 6.742084522353581, + "grad_norm": 0.3583389222621918, + "learning_rate": 1.575791547764642e-05, + "loss": 3.848, + "step": 99230 + }, + { + "epoch": 6.742424242424242, + "grad_norm": 0.16317042708396912, + "learning_rate": 1.575366897676315e-05, + "loss": 3.8209, + "step": 99235 + }, + { + "epoch": 6.742763962494904, + "grad_norm": 0.6123111844062805, + "learning_rate": 1.5749422475879876e-05, + "loss": 3.8626, + "step": 99240 + }, + { + "epoch": 6.743103682565566, + "grad_norm": 0.18899281322956085, + "learning_rate": 1.5745175974996604e-05, + "loss": 3.7203, + "step": 99245 + }, + { + "epoch": 6.7434434026362275, + "grad_norm": 0.19115746021270752, + "learning_rate": 1.574092947411333e-05, + "loss": 3.7081, + "step": 99250 + }, + { + "epoch": 6.74378312270689, + "grad_norm": 0.17630000412464142, + "learning_rate": 1.573668297323006e-05, + "loss": 4.1006, + "step": 99255 + }, + { + "epoch": 6.744122842777552, + "grad_norm": 0.20932407677173615, + "learning_rate": 1.573243647234679e-05, + "loss": 3.8195, + "step": 99260 + }, + { + "epoch": 6.744462562848213, + "grad_norm": 0.17179648578166962, + "learning_rate": 1.5728189971463513e-05, + "loss": 3.6952, + "step": 99265 + }, + { + "epoch": 6.744802282918875, + "grad_norm": 0.20413215458393097, + "learning_rate": 1.5723943470580245e-05, + "loss": 3.7316, + "step": 99270 + }, + { + "epoch": 6.745142002989537, + "grad_norm": 0.203007772564888, + "learning_rate": 1.571969696969697e-05, + "loss": 3.7169, + "step": 99275 + }, + { + "epoch": 6.745481723060198, + "grad_norm": 0.17100243270397186, + "learning_rate": 1.5715450468813697e-05, + "loss": 3.7805, + "step": 99280 + }, + { + "epoch": 6.74582144313086, + "grad_norm": 0.19130848348140717, + "learning_rate": 1.5711203967930425e-05, + "loss": 3.8924, + "step": 99285 + }, + { + "epoch": 6.746161163201522, + "grad_norm": 0.14143812656402588, + "learning_rate": 1.5706957467047153e-05, + "loss": 3.481, + "step": 99290 + }, + { + "epoch": 6.746500883272184, + "grad_norm": 0.18114535510540009, + "learning_rate": 1.570271096616388e-05, + "loss": 3.6017, + "step": 99295 + }, + { + "epoch": 6.746840603342846, + "grad_norm": 0.20598743855953217, + "learning_rate": 1.569846446528061e-05, + "loss": 3.7503, + "step": 99300 + }, + { + "epoch": 6.747180323413508, + "grad_norm": 0.1592658907175064, + "learning_rate": 1.5694217964397337e-05, + "loss": 3.8408, + "step": 99305 + }, + { + "epoch": 6.747520043484169, + "grad_norm": 0.4102640151977539, + "learning_rate": 1.5689971463514065e-05, + "loss": 3.803, + "step": 99310 + }, + { + "epoch": 6.747859763554831, + "grad_norm": 0.17728661000728607, + "learning_rate": 1.5685724962630793e-05, + "loss": 3.7397, + "step": 99315 + }, + { + "epoch": 6.748199483625493, + "grad_norm": 0.17146536707878113, + "learning_rate": 1.568147846174752e-05, + "loss": 3.8057, + "step": 99320 + }, + { + "epoch": 6.748539203696154, + "grad_norm": 0.16775557398796082, + "learning_rate": 1.567723196086425e-05, + "loss": 3.988, + "step": 99325 + }, + { + "epoch": 6.748878923766816, + "grad_norm": 0.9599456191062927, + "learning_rate": 1.5672985459980977e-05, + "loss": 3.9306, + "step": 99330 + }, + { + "epoch": 6.749218643837478, + "grad_norm": 0.19238056242465973, + "learning_rate": 1.5668738959097702e-05, + "loss": 3.7577, + "step": 99335 + }, + { + "epoch": 6.74955836390814, + "grad_norm": 0.19118660688400269, + "learning_rate": 1.5664492458214433e-05, + "loss": 3.6002, + "step": 99340 + }, + { + "epoch": 6.749898083978802, + "grad_norm": 0.21073226630687714, + "learning_rate": 1.566024595733116e-05, + "loss": 4.0709, + "step": 99345 + }, + { + "epoch": 6.750237804049464, + "grad_norm": 0.224894717335701, + "learning_rate": 1.5655999456447886e-05, + "loss": 3.7849, + "step": 99350 + }, + { + "epoch": 6.750577524120125, + "grad_norm": 0.1549072563648224, + "learning_rate": 1.5651752955564617e-05, + "loss": 3.802, + "step": 99355 + }, + { + "epoch": 6.750917244190787, + "grad_norm": 0.18164633214473724, + "learning_rate": 1.5647506454681342e-05, + "loss": 3.8496, + "step": 99360 + }, + { + "epoch": 6.751256964261449, + "grad_norm": 0.16482189297676086, + "learning_rate": 1.564325995379807e-05, + "loss": 3.7996, + "step": 99365 + }, + { + "epoch": 6.75159668433211, + "grad_norm": 0.17861343920230865, + "learning_rate": 1.56390134529148e-05, + "loss": 3.8492, + "step": 99370 + }, + { + "epoch": 6.751936404402772, + "grad_norm": 0.5578711032867432, + "learning_rate": 1.5634766952031526e-05, + "loss": 3.8077, + "step": 99375 + }, + { + "epoch": 6.7522761244734335, + "grad_norm": 0.29464852809906006, + "learning_rate": 1.5630520451148254e-05, + "loss": 3.8837, + "step": 99380 + }, + { + "epoch": 6.752615844544096, + "grad_norm": 0.1503230780363083, + "learning_rate": 1.5626273950264982e-05, + "loss": 3.8837, + "step": 99385 + }, + { + "epoch": 6.752955564614758, + "grad_norm": 0.42031341791152954, + "learning_rate": 1.562202744938171e-05, + "loss": 3.8957, + "step": 99390 + }, + { + "epoch": 6.753295284685419, + "grad_norm": 0.2377912551164627, + "learning_rate": 1.5617780948498438e-05, + "loss": 3.753, + "step": 99395 + }, + { + "epoch": 6.753635004756081, + "grad_norm": 0.17122551798820496, + "learning_rate": 1.5613534447615166e-05, + "loss": 3.8536, + "step": 99400 + }, + { + "epoch": 6.753974724826743, + "grad_norm": 0.15795104205608368, + "learning_rate": 1.5609287946731894e-05, + "loss": 3.9435, + "step": 99405 + }, + { + "epoch": 6.754314444897404, + "grad_norm": 0.9472194314002991, + "learning_rate": 1.5605041445848622e-05, + "loss": 3.8063, + "step": 99410 + }, + { + "epoch": 6.754654164968066, + "grad_norm": 0.15639935433864594, + "learning_rate": 1.560079494496535e-05, + "loss": 3.8864, + "step": 99415 + }, + { + "epoch": 6.754993885038728, + "grad_norm": 0.2939939498901367, + "learning_rate": 1.5596548444082078e-05, + "loss": 3.7175, + "step": 99420 + }, + { + "epoch": 6.7553336051093895, + "grad_norm": 0.20922251045703888, + "learning_rate": 1.5592301943198806e-05, + "loss": 4.1024, + "step": 99425 + }, + { + "epoch": 6.755673325180052, + "grad_norm": 0.2398269921541214, + "learning_rate": 1.5588055442315534e-05, + "loss": 3.9725, + "step": 99430 + }, + { + "epoch": 6.756013045250714, + "grad_norm": 0.1464240401983261, + "learning_rate": 1.558380894143226e-05, + "loss": 3.7991, + "step": 99435 + }, + { + "epoch": 6.756352765321375, + "grad_norm": 0.16659164428710938, + "learning_rate": 1.557956244054899e-05, + "loss": 3.8368, + "step": 99440 + }, + { + "epoch": 6.756692485392037, + "grad_norm": 0.16991129517555237, + "learning_rate": 1.5575315939665715e-05, + "loss": 3.9923, + "step": 99445 + }, + { + "epoch": 6.757032205462699, + "grad_norm": 0.1300753504037857, + "learning_rate": 1.5571069438782443e-05, + "loss": 3.9376, + "step": 99450 + }, + { + "epoch": 6.75737192553336, + "grad_norm": 0.1343783587217331, + "learning_rate": 1.5566822937899174e-05, + "loss": 3.8849, + "step": 99455 + }, + { + "epoch": 6.757711645604022, + "grad_norm": 0.13906288146972656, + "learning_rate": 1.55625764370159e-05, + "loss": 3.816, + "step": 99460 + }, + { + "epoch": 6.758051365674684, + "grad_norm": 0.20680849254131317, + "learning_rate": 1.5558329936132627e-05, + "loss": 3.7815, + "step": 99465 + }, + { + "epoch": 6.7583910857453455, + "grad_norm": 0.17339491844177246, + "learning_rate": 1.5554083435249355e-05, + "loss": 3.6564, + "step": 99470 + }, + { + "epoch": 6.758730805816008, + "grad_norm": 0.20651590824127197, + "learning_rate": 1.5549836934366083e-05, + "loss": 3.6992, + "step": 99475 + }, + { + "epoch": 6.75907052588667, + "grad_norm": 0.13642749190330505, + "learning_rate": 1.554559043348281e-05, + "loss": 3.7762, + "step": 99480 + }, + { + "epoch": 6.759410245957331, + "grad_norm": 0.14098015427589417, + "learning_rate": 1.554134393259954e-05, + "loss": 4.0221, + "step": 99485 + }, + { + "epoch": 6.759749966027993, + "grad_norm": 0.2170168161392212, + "learning_rate": 1.5537097431716267e-05, + "loss": 4.018, + "step": 99490 + }, + { + "epoch": 6.760089686098655, + "grad_norm": 0.21817249059677124, + "learning_rate": 1.5532850930832995e-05, + "loss": 3.7407, + "step": 99495 + }, + { + "epoch": 6.760429406169316, + "grad_norm": 0.2384442836046219, + "learning_rate": 1.5528604429949723e-05, + "loss": 3.8441, + "step": 99500 + }, + { + "epoch": 6.760769126239978, + "grad_norm": 0.1321878284215927, + "learning_rate": 1.552435792906645e-05, + "loss": 4.0284, + "step": 99505 + }, + { + "epoch": 6.76110884631064, + "grad_norm": 0.1445275843143463, + "learning_rate": 1.552011142818318e-05, + "loss": 3.7375, + "step": 99510 + }, + { + "epoch": 6.7614485663813015, + "grad_norm": 0.17224375903606415, + "learning_rate": 1.5515864927299907e-05, + "loss": 3.8591, + "step": 99515 + }, + { + "epoch": 6.761788286451964, + "grad_norm": 0.17702747881412506, + "learning_rate": 1.551161842641663e-05, + "loss": 3.7753, + "step": 99520 + }, + { + "epoch": 6.762128006522626, + "grad_norm": 0.1735495775938034, + "learning_rate": 1.5507371925533363e-05, + "loss": 3.9575, + "step": 99525 + }, + { + "epoch": 6.762467726593287, + "grad_norm": 0.1652914583683014, + "learning_rate": 1.5503125424650087e-05, + "loss": 3.8751, + "step": 99530 + }, + { + "epoch": 6.762807446663949, + "grad_norm": 0.20578716695308685, + "learning_rate": 1.5498878923766815e-05, + "loss": 4.0768, + "step": 99535 + }, + { + "epoch": 6.763147166734611, + "grad_norm": 0.17286482453346252, + "learning_rate": 1.5494632422883547e-05, + "loss": 3.9139, + "step": 99540 + }, + { + "epoch": 6.763486886805272, + "grad_norm": 0.13666576147079468, + "learning_rate": 1.549038592200027e-05, + "loss": 3.9205, + "step": 99545 + }, + { + "epoch": 6.763826606875934, + "grad_norm": 0.1791817545890808, + "learning_rate": 1.5486139421117e-05, + "loss": 3.6201, + "step": 99550 + }, + { + "epoch": 6.764166326946596, + "grad_norm": 0.16799646615982056, + "learning_rate": 1.5481892920233727e-05, + "loss": 3.811, + "step": 99555 + }, + { + "epoch": 6.7645060470172576, + "grad_norm": 0.1834472119808197, + "learning_rate": 1.5477646419350455e-05, + "loss": 3.5956, + "step": 99560 + }, + { + "epoch": 6.76484576708792, + "grad_norm": 0.14259366691112518, + "learning_rate": 1.5473399918467183e-05, + "loss": 3.9648, + "step": 99565 + }, + { + "epoch": 6.765185487158582, + "grad_norm": 0.17988409101963043, + "learning_rate": 1.546915341758391e-05, + "loss": 3.708, + "step": 99570 + }, + { + "epoch": 6.765525207229243, + "grad_norm": 0.13514147698879242, + "learning_rate": 1.546490691670064e-05, + "loss": 3.8114, + "step": 99575 + }, + { + "epoch": 6.765864927299905, + "grad_norm": 0.16409842669963837, + "learning_rate": 1.5460660415817367e-05, + "loss": 3.7467, + "step": 99580 + }, + { + "epoch": 6.766204647370567, + "grad_norm": 0.19552627205848694, + "learning_rate": 1.5456413914934095e-05, + "loss": 3.6177, + "step": 99585 + }, + { + "epoch": 6.766544367441228, + "grad_norm": 0.18730081617832184, + "learning_rate": 1.5452167414050823e-05, + "loss": 3.7875, + "step": 99590 + }, + { + "epoch": 6.76688408751189, + "grad_norm": 0.19704125821590424, + "learning_rate": 1.544792091316755e-05, + "loss": 3.8545, + "step": 99595 + }, + { + "epoch": 6.7672238075825515, + "grad_norm": 0.9278639554977417, + "learning_rate": 1.544367441228428e-05, + "loss": 4.039, + "step": 99600 + }, + { + "epoch": 6.767563527653214, + "grad_norm": 0.13678021728992462, + "learning_rate": 1.5439427911401004e-05, + "loss": 3.7531, + "step": 99605 + }, + { + "epoch": 6.767903247723876, + "grad_norm": 0.2150360494852066, + "learning_rate": 1.5435181410517735e-05, + "loss": 3.7729, + "step": 99610 + }, + { + "epoch": 6.768242967794537, + "grad_norm": 0.321768194437027, + "learning_rate": 1.5430934909634464e-05, + "loss": 3.7947, + "step": 99615 + }, + { + "epoch": 6.768582687865199, + "grad_norm": 0.18979543447494507, + "learning_rate": 1.5426688408751188e-05, + "loss": 3.7389, + "step": 99620 + }, + { + "epoch": 6.768922407935861, + "grad_norm": 0.1373489946126938, + "learning_rate": 1.542244190786792e-05, + "loss": 3.6005, + "step": 99625 + }, + { + "epoch": 6.769262128006522, + "grad_norm": 0.1770937591791153, + "learning_rate": 1.5418195406984644e-05, + "loss": 3.8754, + "step": 99630 + }, + { + "epoch": 6.769601848077184, + "grad_norm": 0.17807547748088837, + "learning_rate": 1.5413948906101372e-05, + "loss": 3.7484, + "step": 99635 + }, + { + "epoch": 6.769941568147846, + "grad_norm": 0.2043023407459259, + "learning_rate": 1.54097024052181e-05, + "loss": 3.9264, + "step": 99640 + }, + { + "epoch": 6.7702812882185075, + "grad_norm": 0.18668407201766968, + "learning_rate": 1.5405455904334828e-05, + "loss": 3.7721, + "step": 99645 + }, + { + "epoch": 6.77062100828917, + "grad_norm": 0.8033211827278137, + "learning_rate": 1.5401209403451556e-05, + "loss": 3.8921, + "step": 99650 + }, + { + "epoch": 6.770960728359832, + "grad_norm": 0.16032284498214722, + "learning_rate": 1.5396962902568284e-05, + "loss": 3.9942, + "step": 99655 + }, + { + "epoch": 6.771300448430493, + "grad_norm": 0.19838571548461914, + "learning_rate": 1.5392716401685012e-05, + "loss": 3.8691, + "step": 99660 + }, + { + "epoch": 6.771640168501155, + "grad_norm": 0.13602416217327118, + "learning_rate": 1.538846990080174e-05, + "loss": 3.6779, + "step": 99665 + }, + { + "epoch": 6.771979888571817, + "grad_norm": 0.2204105257987976, + "learning_rate": 1.5384223399918468e-05, + "loss": 3.9055, + "step": 99670 + }, + { + "epoch": 6.772319608642478, + "grad_norm": 0.21510808169841766, + "learning_rate": 1.5379976899035196e-05, + "loss": 3.8893, + "step": 99675 + }, + { + "epoch": 6.77265932871314, + "grad_norm": 1.8367851972579956, + "learning_rate": 1.5375730398151924e-05, + "loss": 3.7355, + "step": 99680 + }, + { + "epoch": 6.772999048783802, + "grad_norm": 0.1611761599779129, + "learning_rate": 1.5371483897268652e-05, + "loss": 3.7466, + "step": 99685 + }, + { + "epoch": 6.7733387688544635, + "grad_norm": 0.1741069257259369, + "learning_rate": 1.5367237396385377e-05, + "loss": 3.9082, + "step": 99690 + }, + { + "epoch": 6.773678488925126, + "grad_norm": 0.1882006824016571, + "learning_rate": 1.5362990895502108e-05, + "loss": 3.864, + "step": 99695 + }, + { + "epoch": 6.774018208995788, + "grad_norm": 0.1410403996706009, + "learning_rate": 1.5358744394618836e-05, + "loss": 3.8348, + "step": 99700 + }, + { + "epoch": 6.774357929066449, + "grad_norm": 0.48793700337409973, + "learning_rate": 1.535449789373556e-05, + "loss": 3.8981, + "step": 99705 + }, + { + "epoch": 6.774697649137111, + "grad_norm": 0.2006985992193222, + "learning_rate": 1.5350251392852292e-05, + "loss": 3.7008, + "step": 99710 + }, + { + "epoch": 6.775037369207773, + "grad_norm": 0.17787562310695648, + "learning_rate": 1.5346004891969017e-05, + "loss": 3.7588, + "step": 99715 + }, + { + "epoch": 6.775377089278434, + "grad_norm": 0.22067373991012573, + "learning_rate": 1.5341758391085745e-05, + "loss": 3.7672, + "step": 99720 + }, + { + "epoch": 6.775716809349096, + "grad_norm": 0.18414729833602905, + "learning_rate": 1.5337511890202473e-05, + "loss": 3.9756, + "step": 99725 + }, + { + "epoch": 6.776056529419758, + "grad_norm": 0.20872275531291962, + "learning_rate": 1.53332653893192e-05, + "loss": 3.9528, + "step": 99730 + }, + { + "epoch": 6.7763962494904195, + "grad_norm": 0.1862086057662964, + "learning_rate": 1.532901888843593e-05, + "loss": 3.8243, + "step": 99735 + }, + { + "epoch": 6.776735969561082, + "grad_norm": 0.17717359960079193, + "learning_rate": 1.5324772387552657e-05, + "loss": 3.5457, + "step": 99740 + }, + { + "epoch": 6.777075689631744, + "grad_norm": 0.175391286611557, + "learning_rate": 1.5320525886669385e-05, + "loss": 3.678, + "step": 99745 + }, + { + "epoch": 6.777415409702405, + "grad_norm": 0.25414058566093445, + "learning_rate": 1.5316279385786113e-05, + "loss": 3.5182, + "step": 99750 + }, + { + "epoch": 6.777755129773067, + "grad_norm": 0.13219085335731506, + "learning_rate": 1.531203288490284e-05, + "loss": 3.7946, + "step": 99755 + }, + { + "epoch": 6.778094849843729, + "grad_norm": 0.1893610656261444, + "learning_rate": 1.530778638401957e-05, + "loss": 3.8073, + "step": 99760 + }, + { + "epoch": 6.77843456991439, + "grad_norm": 0.13915525376796722, + "learning_rate": 1.5303539883136297e-05, + "loss": 3.6331, + "step": 99765 + }, + { + "epoch": 6.778774289985052, + "grad_norm": 0.19608382880687714, + "learning_rate": 1.5299293382253025e-05, + "loss": 3.763, + "step": 99770 + }, + { + "epoch": 6.779114010055714, + "grad_norm": 0.19289462268352509, + "learning_rate": 1.529504688136975e-05, + "loss": 3.8339, + "step": 99775 + }, + { + "epoch": 6.7794537301263755, + "grad_norm": 0.1694881170988083, + "learning_rate": 1.529080038048648e-05, + "loss": 3.6972, + "step": 99780 + }, + { + "epoch": 6.779793450197038, + "grad_norm": 0.16850000619888306, + "learning_rate": 1.528655387960321e-05, + "loss": 3.6641, + "step": 99785 + }, + { + "epoch": 6.7801331702677, + "grad_norm": 0.14747680723667145, + "learning_rate": 1.5282307378719934e-05, + "loss": 3.7701, + "step": 99790 + }, + { + "epoch": 6.780472890338361, + "grad_norm": 0.17803433537483215, + "learning_rate": 1.5278060877836665e-05, + "loss": 4.1104, + "step": 99795 + }, + { + "epoch": 6.780812610409023, + "grad_norm": 0.18141305446624756, + "learning_rate": 1.527381437695339e-05, + "loss": 3.8824, + "step": 99800 + }, + { + "epoch": 6.781152330479685, + "grad_norm": 0.259168803691864, + "learning_rate": 1.5269567876070118e-05, + "loss": 3.8679, + "step": 99805 + }, + { + "epoch": 6.781492050550346, + "grad_norm": 0.1855980008840561, + "learning_rate": 1.526532137518685e-05, + "loss": 3.6542, + "step": 99810 + }, + { + "epoch": 6.781831770621008, + "grad_norm": 0.2100203037261963, + "learning_rate": 1.5261074874303574e-05, + "loss": 3.5303, + "step": 99815 + }, + { + "epoch": 6.78217149069167, + "grad_norm": 0.15658536553382874, + "learning_rate": 1.5256828373420302e-05, + "loss": 3.8774, + "step": 99820 + }, + { + "epoch": 6.7825112107623315, + "grad_norm": 0.18347293138504028, + "learning_rate": 1.5252581872537031e-05, + "loss": 3.9487, + "step": 99825 + }, + { + "epoch": 6.782850930832994, + "grad_norm": 0.18282799422740936, + "learning_rate": 1.5248335371653758e-05, + "loss": 3.764, + "step": 99830 + }, + { + "epoch": 6.783190650903656, + "grad_norm": 0.1521012783050537, + "learning_rate": 1.5244088870770487e-05, + "loss": 3.7341, + "step": 99835 + }, + { + "epoch": 6.783530370974317, + "grad_norm": 0.1896529495716095, + "learning_rate": 1.5239842369887214e-05, + "loss": 3.8344, + "step": 99840 + }, + { + "epoch": 6.783870091044979, + "grad_norm": 0.20881612598896027, + "learning_rate": 1.523559586900394e-05, + "loss": 3.768, + "step": 99845 + }, + { + "epoch": 6.784209811115641, + "grad_norm": 0.4907752275466919, + "learning_rate": 1.523134936812067e-05, + "loss": 3.9525, + "step": 99850 + }, + { + "epoch": 6.784549531186302, + "grad_norm": 0.13692878186702728, + "learning_rate": 1.5227102867237398e-05, + "loss": 3.791, + "step": 99855 + }, + { + "epoch": 6.784889251256964, + "grad_norm": 0.18330861628055573, + "learning_rate": 1.5222856366354124e-05, + "loss": 3.6629, + "step": 99860 + }, + { + "epoch": 6.785228971327626, + "grad_norm": 0.16280415654182434, + "learning_rate": 1.5218609865470854e-05, + "loss": 3.946, + "step": 99865 + }, + { + "epoch": 6.7855686913982876, + "grad_norm": 0.31358978152275085, + "learning_rate": 1.521436336458758e-05, + "loss": 3.8571, + "step": 99870 + }, + { + "epoch": 6.78590841146895, + "grad_norm": 0.16816696524620056, + "learning_rate": 1.5210116863704308e-05, + "loss": 4.0104, + "step": 99875 + }, + { + "epoch": 6.786248131539612, + "grad_norm": 0.16997647285461426, + "learning_rate": 1.5205870362821038e-05, + "loss": 3.8583, + "step": 99880 + }, + { + "epoch": 6.786587851610273, + "grad_norm": 0.1895233392715454, + "learning_rate": 1.5201623861937764e-05, + "loss": 3.7349, + "step": 99885 + }, + { + "epoch": 6.786927571680935, + "grad_norm": 0.15296311676502228, + "learning_rate": 1.519737736105449e-05, + "loss": 3.6945, + "step": 99890 + }, + { + "epoch": 6.787267291751597, + "grad_norm": 2.1724905967712402, + "learning_rate": 1.519313086017122e-05, + "loss": 4.0706, + "step": 99895 + }, + { + "epoch": 6.787607011822258, + "grad_norm": 0.1744777411222458, + "learning_rate": 1.5188884359287946e-05, + "loss": 3.8573, + "step": 99900 + }, + { + "epoch": 6.78794673189292, + "grad_norm": 0.18315675854682922, + "learning_rate": 1.5184637858404674e-05, + "loss": 3.7014, + "step": 99905 + }, + { + "epoch": 6.788286451963582, + "grad_norm": 0.17119143903255463, + "learning_rate": 1.5180391357521404e-05, + "loss": 4.0438, + "step": 99910 + }, + { + "epoch": 6.788626172034244, + "grad_norm": 0.17802254855632782, + "learning_rate": 1.517614485663813e-05, + "loss": 3.6228, + "step": 99915 + }, + { + "epoch": 6.788965892104906, + "grad_norm": 0.16191288828849792, + "learning_rate": 1.517189835575486e-05, + "loss": 3.7806, + "step": 99920 + }, + { + "epoch": 6.789305612175568, + "grad_norm": 0.19567444920539856, + "learning_rate": 1.5167651854871586e-05, + "loss": 3.5457, + "step": 99925 + }, + { + "epoch": 6.789645332246229, + "grad_norm": 0.13988038897514343, + "learning_rate": 1.5163405353988314e-05, + "loss": 3.8151, + "step": 99930 + }, + { + "epoch": 6.789985052316891, + "grad_norm": 0.1789853870868683, + "learning_rate": 1.5159158853105042e-05, + "loss": 3.7565, + "step": 99935 + }, + { + "epoch": 6.790324772387553, + "grad_norm": 0.189373180270195, + "learning_rate": 1.515491235222177e-05, + "loss": 3.9974, + "step": 99940 + }, + { + "epoch": 6.790664492458214, + "grad_norm": 0.23295937478542328, + "learning_rate": 1.5150665851338497e-05, + "loss": 3.6255, + "step": 99945 + }, + { + "epoch": 6.791004212528876, + "grad_norm": 0.17897607386112213, + "learning_rate": 1.5146419350455226e-05, + "loss": 3.6089, + "step": 99950 + }, + { + "epoch": 6.791343932599538, + "grad_norm": 5.207253932952881, + "learning_rate": 1.5142172849571953e-05, + "loss": 3.9173, + "step": 99955 + }, + { + "epoch": 6.7916836526702, + "grad_norm": 0.5581674575805664, + "learning_rate": 1.513792634868868e-05, + "loss": 3.7457, + "step": 99960 + }, + { + "epoch": 6.792023372740862, + "grad_norm": 0.1666606068611145, + "learning_rate": 1.513367984780541e-05, + "loss": 3.7329, + "step": 99965 + }, + { + "epoch": 6.792363092811524, + "grad_norm": 0.29188984632492065, + "learning_rate": 1.5129433346922137e-05, + "loss": 3.839, + "step": 99970 + }, + { + "epoch": 6.792702812882185, + "grad_norm": 0.2657040059566498, + "learning_rate": 1.5125186846038863e-05, + "loss": 3.8241, + "step": 99975 + }, + { + "epoch": 6.793042532952847, + "grad_norm": 0.2168460190296173, + "learning_rate": 1.5120940345155593e-05, + "loss": 4.0071, + "step": 99980 + }, + { + "epoch": 6.793382253023509, + "grad_norm": 0.16686248779296875, + "learning_rate": 1.5116693844272319e-05, + "loss": 3.7701, + "step": 99985 + }, + { + "epoch": 6.79372197309417, + "grad_norm": 0.1689540296792984, + "learning_rate": 1.5112447343389047e-05, + "loss": 3.8485, + "step": 99990 + }, + { + "epoch": 6.794061693164832, + "grad_norm": 0.18186405301094055, + "learning_rate": 1.5108200842505777e-05, + "loss": 3.5134, + "step": 99995 + }, + { + "epoch": 6.794401413235494, + "grad_norm": 0.15508490800857544, + "learning_rate": 1.5103954341622503e-05, + "loss": 3.8837, + "step": 100000 + }, + { + "epoch": 6.794741133306156, + "grad_norm": 0.15095804631710052, + "learning_rate": 1.5099707840739233e-05, + "loss": 3.9342, + "step": 100005 + }, + { + "epoch": 6.795080853376818, + "grad_norm": 0.425925076007843, + "learning_rate": 1.509546133985596e-05, + "loss": 3.7155, + "step": 100010 + }, + { + "epoch": 6.79542057344748, + "grad_norm": 0.1629812866449356, + "learning_rate": 1.5091214838972687e-05, + "loss": 3.8745, + "step": 100015 + }, + { + "epoch": 6.795760293518141, + "grad_norm": 1.270575761795044, + "learning_rate": 1.5086968338089417e-05, + "loss": 3.7868, + "step": 100020 + }, + { + "epoch": 6.796100013588803, + "grad_norm": 0.17752182483673096, + "learning_rate": 1.5082721837206143e-05, + "loss": 3.8809, + "step": 100025 + }, + { + "epoch": 6.796439733659465, + "grad_norm": 0.3367801010608673, + "learning_rate": 1.507847533632287e-05, + "loss": 3.6886, + "step": 100030 + }, + { + "epoch": 6.796779453730126, + "grad_norm": 0.1784307211637497, + "learning_rate": 1.50742288354396e-05, + "loss": 3.8192, + "step": 100035 + }, + { + "epoch": 6.797119173800788, + "grad_norm": 0.18228934705257416, + "learning_rate": 1.5069982334556326e-05, + "loss": 3.6589, + "step": 100040 + }, + { + "epoch": 6.79745889387145, + "grad_norm": 0.16044758260250092, + "learning_rate": 1.5065735833673054e-05, + "loss": 3.8695, + "step": 100045 + }, + { + "epoch": 6.797798613942112, + "grad_norm": 0.15448640286922455, + "learning_rate": 1.5061489332789783e-05, + "loss": 3.8134, + "step": 100050 + }, + { + "epoch": 6.798138334012774, + "grad_norm": 0.20968611538410187, + "learning_rate": 1.505724283190651e-05, + "loss": 3.8793, + "step": 100055 + }, + { + "epoch": 6.798478054083435, + "grad_norm": 0.17628276348114014, + "learning_rate": 1.5052996331023236e-05, + "loss": 3.9195, + "step": 100060 + }, + { + "epoch": 6.798817774154097, + "grad_norm": 0.15591458976268768, + "learning_rate": 1.5048749830139966e-05, + "loss": 4.0025, + "step": 100065 + }, + { + "epoch": 6.799157494224759, + "grad_norm": 0.1735885739326477, + "learning_rate": 1.5044503329256694e-05, + "loss": 3.8863, + "step": 100070 + }, + { + "epoch": 6.79949721429542, + "grad_norm": 0.21148023009300232, + "learning_rate": 1.504025682837342e-05, + "loss": 3.7009, + "step": 100075 + }, + { + "epoch": 6.799836934366082, + "grad_norm": 0.4503028094768524, + "learning_rate": 1.503601032749015e-05, + "loss": 3.7565, + "step": 100080 + }, + { + "epoch": 6.800176654436744, + "grad_norm": 0.24058611690998077, + "learning_rate": 1.5031763826606876e-05, + "loss": 3.7411, + "step": 100085 + }, + { + "epoch": 6.8005163745074055, + "grad_norm": 0.15139147639274597, + "learning_rate": 1.5027517325723606e-05, + "loss": 3.8456, + "step": 100090 + }, + { + "epoch": 6.800856094578068, + "grad_norm": 0.13511867821216583, + "learning_rate": 1.5023270824840332e-05, + "loss": 3.7898, + "step": 100095 + }, + { + "epoch": 6.80119581464873, + "grad_norm": 0.16970306634902954, + "learning_rate": 1.501902432395706e-05, + "loss": 3.7491, + "step": 100100 + }, + { + "epoch": 6.801535534719391, + "grad_norm": 0.16163241863250732, + "learning_rate": 1.501477782307379e-05, + "loss": 4.0385, + "step": 100105 + }, + { + "epoch": 6.801875254790053, + "grad_norm": 0.1705814003944397, + "learning_rate": 1.5010531322190516e-05, + "loss": 3.7139, + "step": 100110 + }, + { + "epoch": 6.802214974860715, + "grad_norm": 0.164427250623703, + "learning_rate": 1.5006284821307242e-05, + "loss": 3.8509, + "step": 100115 + }, + { + "epoch": 6.802554694931376, + "grad_norm": 0.1594003587961197, + "learning_rate": 1.5002038320423972e-05, + "loss": 3.8712, + "step": 100120 + }, + { + "epoch": 6.802894415002038, + "grad_norm": 0.18269167840480804, + "learning_rate": 1.49977918195407e-05, + "loss": 4.0898, + "step": 100125 + }, + { + "epoch": 6.8032341350727, + "grad_norm": 0.1397692710161209, + "learning_rate": 1.4993545318657426e-05, + "loss": 3.7771, + "step": 100130 + }, + { + "epoch": 6.8035738551433615, + "grad_norm": 0.1849794089794159, + "learning_rate": 1.4989298817774156e-05, + "loss": 3.7542, + "step": 100135 + }, + { + "epoch": 6.803913575214024, + "grad_norm": 0.15315359830856323, + "learning_rate": 1.4985052316890882e-05, + "loss": 3.932, + "step": 100140 + }, + { + "epoch": 6.804253295284686, + "grad_norm": 0.15422825515270233, + "learning_rate": 1.4980805816007609e-05, + "loss": 3.8406, + "step": 100145 + }, + { + "epoch": 6.804593015355347, + "grad_norm": 0.34496739506721497, + "learning_rate": 1.4976559315124338e-05, + "loss": 3.7879, + "step": 100150 + }, + { + "epoch": 6.804932735426009, + "grad_norm": 0.17114733159542084, + "learning_rate": 1.4972312814241066e-05, + "loss": 3.9775, + "step": 100155 + }, + { + "epoch": 6.805272455496671, + "grad_norm": 0.22503839433193207, + "learning_rate": 1.4968066313357793e-05, + "loss": 3.5884, + "step": 100160 + }, + { + "epoch": 6.805612175567332, + "grad_norm": 1.7950762510299683, + "learning_rate": 1.4963819812474522e-05, + "loss": 4.1571, + "step": 100165 + }, + { + "epoch": 6.805951895637994, + "grad_norm": 0.24554480612277985, + "learning_rate": 1.4959573311591249e-05, + "loss": 3.5828, + "step": 100170 + }, + { + "epoch": 6.806291615708656, + "grad_norm": 0.1734873354434967, + "learning_rate": 1.4955326810707978e-05, + "loss": 3.7753, + "step": 100175 + }, + { + "epoch": 6.806631335779318, + "grad_norm": 0.21086643636226654, + "learning_rate": 1.4951080309824705e-05, + "loss": 3.9101, + "step": 100180 + }, + { + "epoch": 6.80697105584998, + "grad_norm": 0.3298192620277405, + "learning_rate": 1.4946833808941433e-05, + "loss": 3.9863, + "step": 100185 + }, + { + "epoch": 6.807310775920642, + "grad_norm": 0.19643527269363403, + "learning_rate": 1.4942587308058162e-05, + "loss": 3.7856, + "step": 100190 + }, + { + "epoch": 6.807650495991303, + "grad_norm": 0.2210150808095932, + "learning_rate": 1.4938340807174889e-05, + "loss": 4.0363, + "step": 100195 + }, + { + "epoch": 6.807990216061965, + "grad_norm": 0.7265141010284424, + "learning_rate": 1.4934094306291615e-05, + "loss": 3.7806, + "step": 100200 + }, + { + "epoch": 6.808329936132627, + "grad_norm": 0.239792138338089, + "learning_rate": 1.4929847805408345e-05, + "loss": 3.8059, + "step": 100205 + }, + { + "epoch": 6.808669656203288, + "grad_norm": 0.1801663637161255, + "learning_rate": 1.4925601304525073e-05, + "loss": 3.7785, + "step": 100210 + }, + { + "epoch": 6.80900937627395, + "grad_norm": 0.353352814912796, + "learning_rate": 1.4921354803641799e-05, + "loss": 3.8136, + "step": 100215 + }, + { + "epoch": 6.809349096344612, + "grad_norm": 0.17407596111297607, + "learning_rate": 1.4917108302758529e-05, + "loss": 3.8265, + "step": 100220 + }, + { + "epoch": 6.809688816415274, + "grad_norm": 0.31464284658432007, + "learning_rate": 1.4912861801875255e-05, + "loss": 3.536, + "step": 100225 + }, + { + "epoch": 6.810028536485936, + "grad_norm": 0.5988680124282837, + "learning_rate": 1.4908615300991981e-05, + "loss": 3.8506, + "step": 100230 + }, + { + "epoch": 6.810368256556598, + "grad_norm": 0.31979435682296753, + "learning_rate": 1.4904368800108711e-05, + "loss": 3.8336, + "step": 100235 + }, + { + "epoch": 6.810707976627259, + "grad_norm": 0.17885243892669678, + "learning_rate": 1.4900122299225439e-05, + "loss": 3.8677, + "step": 100240 + }, + { + "epoch": 6.811047696697921, + "grad_norm": 0.16501808166503906, + "learning_rate": 1.4895875798342165e-05, + "loss": 4.0395, + "step": 100245 + }, + { + "epoch": 6.811387416768583, + "grad_norm": 0.22560477256774902, + "learning_rate": 1.4891629297458895e-05, + "loss": 3.5606, + "step": 100250 + }, + { + "epoch": 6.811727136839244, + "grad_norm": 0.7228521108627319, + "learning_rate": 1.4887382796575621e-05, + "loss": 3.7129, + "step": 100255 + }, + { + "epoch": 6.812066856909906, + "grad_norm": 0.24650482833385468, + "learning_rate": 1.4883136295692351e-05, + "loss": 3.634, + "step": 100260 + }, + { + "epoch": 6.812406576980568, + "grad_norm": 0.16914883255958557, + "learning_rate": 1.4878889794809079e-05, + "loss": 3.8659, + "step": 100265 + }, + { + "epoch": 6.81274629705123, + "grad_norm": 0.17134128510951996, + "learning_rate": 1.4874643293925805e-05, + "loss": 3.7698, + "step": 100270 + }, + { + "epoch": 6.813086017121892, + "grad_norm": 0.19407261908054352, + "learning_rate": 1.4870396793042535e-05, + "loss": 3.8919, + "step": 100275 + }, + { + "epoch": 6.813425737192553, + "grad_norm": 0.21764661371707916, + "learning_rate": 1.4866150292159261e-05, + "loss": 3.7651, + "step": 100280 + }, + { + "epoch": 6.813765457263215, + "grad_norm": 0.18720199167728424, + "learning_rate": 1.4861903791275988e-05, + "loss": 3.5609, + "step": 100285 + }, + { + "epoch": 6.814105177333877, + "grad_norm": 0.17676183581352234, + "learning_rate": 1.4857657290392717e-05, + "loss": 3.6172, + "step": 100290 + }, + { + "epoch": 6.814444897404538, + "grad_norm": 0.3161899149417877, + "learning_rate": 1.4853410789509445e-05, + "loss": 3.8443, + "step": 100295 + }, + { + "epoch": 6.8147846174752, + "grad_norm": 0.19954852759838104, + "learning_rate": 1.4849164288626172e-05, + "loss": 4.1417, + "step": 100300 + }, + { + "epoch": 6.815124337545862, + "grad_norm": 0.465690553188324, + "learning_rate": 1.4844917787742901e-05, + "loss": 3.3182, + "step": 100305 + }, + { + "epoch": 6.8154640576165235, + "grad_norm": 0.1598772257566452, + "learning_rate": 1.4840671286859628e-05, + "loss": 3.6448, + "step": 100310 + }, + { + "epoch": 6.815803777687186, + "grad_norm": 0.2593238949775696, + "learning_rate": 1.4836424785976356e-05, + "loss": 3.8339, + "step": 100315 + }, + { + "epoch": 6.816143497757848, + "grad_norm": 0.1975974589586258, + "learning_rate": 1.4832178285093084e-05, + "loss": 3.7514, + "step": 100320 + }, + { + "epoch": 6.816483217828509, + "grad_norm": 0.1786472201347351, + "learning_rate": 1.4827931784209812e-05, + "loss": 3.684, + "step": 100325 + }, + { + "epoch": 6.816822937899171, + "grad_norm": 0.1598704755306244, + "learning_rate": 1.4823685283326538e-05, + "loss": 3.6945, + "step": 100330 + }, + { + "epoch": 6.817162657969833, + "grad_norm": 0.19325639307498932, + "learning_rate": 1.4819438782443268e-05, + "loss": 3.9579, + "step": 100335 + }, + { + "epoch": 6.817502378040494, + "grad_norm": 1.000757098197937, + "learning_rate": 1.4815192281559994e-05, + "loss": 3.8583, + "step": 100340 + }, + { + "epoch": 6.817842098111156, + "grad_norm": 0.19787804782390594, + "learning_rate": 1.4810945780676724e-05, + "loss": 3.808, + "step": 100345 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 5.210257530212402, + "learning_rate": 1.4806699279793452e-05, + "loss": 3.9791, + "step": 100350 + }, + { + "epoch": 6.8185215382524795, + "grad_norm": 0.1817236840724945, + "learning_rate": 1.4802452778910178e-05, + "loss": 3.6588, + "step": 100355 + }, + { + "epoch": 6.818861258323142, + "grad_norm": 0.6889113783836365, + "learning_rate": 1.4798206278026908e-05, + "loss": 3.9812, + "step": 100360 + }, + { + "epoch": 6.819200978393804, + "grad_norm": 0.16960862278938293, + "learning_rate": 1.4793959777143634e-05, + "loss": 3.942, + "step": 100365 + }, + { + "epoch": 6.819540698464465, + "grad_norm": 0.20436523854732513, + "learning_rate": 1.478971327626036e-05, + "loss": 3.9559, + "step": 100370 + }, + { + "epoch": 6.819880418535127, + "grad_norm": 1.3228806257247925, + "learning_rate": 1.478546677537709e-05, + "loss": 3.6297, + "step": 100375 + }, + { + "epoch": 6.820220138605789, + "grad_norm": 0.17770178616046906, + "learning_rate": 1.4781220274493818e-05, + "loss": 4.0366, + "step": 100380 + }, + { + "epoch": 6.82055985867645, + "grad_norm": 0.1557973027229309, + "learning_rate": 1.4776973773610545e-05, + "loss": 3.8397, + "step": 100385 + }, + { + "epoch": 6.820899578747112, + "grad_norm": 0.16855615377426147, + "learning_rate": 1.4772727272727274e-05, + "loss": 3.9666, + "step": 100390 + }, + { + "epoch": 6.821239298817774, + "grad_norm": 0.3305589258670807, + "learning_rate": 1.4768480771844e-05, + "loss": 3.9965, + "step": 100395 + }, + { + "epoch": 6.8215790188884355, + "grad_norm": 0.16853998601436615, + "learning_rate": 1.4764234270960729e-05, + "loss": 3.8438, + "step": 100400 + }, + { + "epoch": 6.821918738959098, + "grad_norm": 0.2438361495733261, + "learning_rate": 1.4759987770077458e-05, + "loss": 3.7749, + "step": 100405 + }, + { + "epoch": 6.82225845902976, + "grad_norm": 0.1808704137802124, + "learning_rate": 1.4755741269194185e-05, + "loss": 4.0599, + "step": 100410 + }, + { + "epoch": 6.822598179100421, + "grad_norm": 0.144701287150383, + "learning_rate": 1.4751494768310911e-05, + "loss": 3.9298, + "step": 100415 + }, + { + "epoch": 6.822937899171083, + "grad_norm": 0.18443424999713898, + "learning_rate": 1.474724826742764e-05, + "loss": 3.837, + "step": 100420 + }, + { + "epoch": 6.823277619241745, + "grad_norm": 0.20491211116313934, + "learning_rate": 1.4743001766544367e-05, + "loss": 3.6915, + "step": 100425 + }, + { + "epoch": 6.823617339312406, + "grad_norm": 0.20622727274894714, + "learning_rate": 1.4738755265661097e-05, + "loss": 3.9481, + "step": 100430 + }, + { + "epoch": 6.823957059383068, + "grad_norm": 0.13479164242744446, + "learning_rate": 1.4734508764777825e-05, + "loss": 3.7614, + "step": 100435 + }, + { + "epoch": 6.82429677945373, + "grad_norm": 0.1817297339439392, + "learning_rate": 1.4730262263894551e-05, + "loss": 3.6817, + "step": 100440 + }, + { + "epoch": 6.8246364995243916, + "grad_norm": 0.17233990132808685, + "learning_rate": 1.472601576301128e-05, + "loss": 3.9501, + "step": 100445 + }, + { + "epoch": 6.824976219595054, + "grad_norm": 0.18469230830669403, + "learning_rate": 1.4721769262128007e-05, + "loss": 3.8084, + "step": 100450 + }, + { + "epoch": 6.825315939665716, + "grad_norm": 0.309127539396286, + "learning_rate": 1.4717522761244735e-05, + "loss": 3.9495, + "step": 100455 + }, + { + "epoch": 6.825655659736377, + "grad_norm": 0.1418897807598114, + "learning_rate": 1.4713276260361465e-05, + "loss": 3.8947, + "step": 100460 + }, + { + "epoch": 6.825995379807039, + "grad_norm": 0.17462879419326782, + "learning_rate": 1.4709029759478191e-05, + "loss": 4.0009, + "step": 100465 + }, + { + "epoch": 6.826335099877701, + "grad_norm": 0.21685564517974854, + "learning_rate": 1.4704783258594917e-05, + "loss": 3.8007, + "step": 100470 + }, + { + "epoch": 6.826674819948362, + "grad_norm": 0.19603030383586884, + "learning_rate": 1.4700536757711647e-05, + "loss": 3.9167, + "step": 100475 + }, + { + "epoch": 6.827014540019024, + "grad_norm": 0.22058790922164917, + "learning_rate": 1.4696290256828373e-05, + "loss": 3.7868, + "step": 100480 + }, + { + "epoch": 6.827354260089686, + "grad_norm": 0.2059292495250702, + "learning_rate": 1.4692043755945101e-05, + "loss": 3.5809, + "step": 100485 + }, + { + "epoch": 6.827693980160348, + "grad_norm": 0.1927296221256256, + "learning_rate": 1.4687797255061831e-05, + "loss": 3.8305, + "step": 100490 + }, + { + "epoch": 6.82803370023101, + "grad_norm": 0.15170997381210327, + "learning_rate": 1.4683550754178557e-05, + "loss": 3.8772, + "step": 100495 + }, + { + "epoch": 6.828373420301672, + "grad_norm": 0.18977142870426178, + "learning_rate": 1.4679304253295284e-05, + "loss": 3.8247, + "step": 100500 + }, + { + "epoch": 6.828713140372333, + "grad_norm": 0.19058038294315338, + "learning_rate": 1.4675057752412013e-05, + "loss": 3.9929, + "step": 100505 + }, + { + "epoch": 6.829052860442995, + "grad_norm": 0.24708320200443268, + "learning_rate": 1.4670811251528741e-05, + "loss": 3.9027, + "step": 100510 + }, + { + "epoch": 6.829392580513657, + "grad_norm": 0.17096440494060516, + "learning_rate": 1.466656475064547e-05, + "loss": 3.8569, + "step": 100515 + }, + { + "epoch": 6.829732300584318, + "grad_norm": 0.18703924119472504, + "learning_rate": 1.4662318249762197e-05, + "loss": 3.8836, + "step": 100520 + }, + { + "epoch": 6.83007202065498, + "grad_norm": 0.2253386527299881, + "learning_rate": 1.4658071748878924e-05, + "loss": 3.8421, + "step": 100525 + }, + { + "epoch": 6.830411740725642, + "grad_norm": 1.4072222709655762, + "learning_rate": 1.4653825247995653e-05, + "loss": 3.7696, + "step": 100530 + }, + { + "epoch": 6.830751460796304, + "grad_norm": 0.17197264730930328, + "learning_rate": 1.464957874711238e-05, + "loss": 3.8273, + "step": 100535 + }, + { + "epoch": 6.831091180866966, + "grad_norm": 0.15678252279758453, + "learning_rate": 1.4645332246229108e-05, + "loss": 3.8512, + "step": 100540 + }, + { + "epoch": 6.831430900937628, + "grad_norm": 0.25228893756866455, + "learning_rate": 1.4641085745345837e-05, + "loss": 3.889, + "step": 100545 + }, + { + "epoch": 6.831770621008289, + "grad_norm": 0.21160399913787842, + "learning_rate": 1.4636839244462564e-05, + "loss": 3.9687, + "step": 100550 + }, + { + "epoch": 6.832110341078951, + "grad_norm": 0.18450488150119781, + "learning_rate": 1.463259274357929e-05, + "loss": 3.9474, + "step": 100555 + }, + { + "epoch": 6.832450061149613, + "grad_norm": 0.31299763917922974, + "learning_rate": 1.462834624269602e-05, + "loss": 3.6165, + "step": 100560 + }, + { + "epoch": 6.832789781220274, + "grad_norm": 0.18657588958740234, + "learning_rate": 1.4624099741812746e-05, + "loss": 3.9669, + "step": 100565 + }, + { + "epoch": 6.833129501290936, + "grad_norm": 0.1671879142522812, + "learning_rate": 1.4619853240929474e-05, + "loss": 3.8082, + "step": 100570 + }, + { + "epoch": 6.833469221361598, + "grad_norm": 0.18413874506950378, + "learning_rate": 1.4615606740046204e-05, + "loss": 3.9785, + "step": 100575 + }, + { + "epoch": 6.83380894143226, + "grad_norm": 0.21810969710350037, + "learning_rate": 1.461136023916293e-05, + "loss": 3.7009, + "step": 100580 + }, + { + "epoch": 6.834148661502922, + "grad_norm": 0.19151462614536285, + "learning_rate": 1.4607113738279656e-05, + "loss": 3.7966, + "step": 100585 + }, + { + "epoch": 6.834488381573584, + "grad_norm": 0.21127881109714508, + "learning_rate": 1.4602867237396386e-05, + "loss": 4.0272, + "step": 100590 + }, + { + "epoch": 6.834828101644245, + "grad_norm": 0.1574220508337021, + "learning_rate": 1.4598620736513114e-05, + "loss": 3.9355, + "step": 100595 + }, + { + "epoch": 6.835167821714907, + "grad_norm": 0.2106497585773468, + "learning_rate": 1.4594374235629844e-05, + "loss": 3.8077, + "step": 100600 + }, + { + "epoch": 6.835507541785569, + "grad_norm": 0.16509200632572174, + "learning_rate": 1.459012773474657e-05, + "loss": 3.8313, + "step": 100605 + }, + { + "epoch": 6.83584726185623, + "grad_norm": 0.15550273656845093, + "learning_rate": 1.4585881233863296e-05, + "loss": 3.6691, + "step": 100610 + }, + { + "epoch": 6.836186981926892, + "grad_norm": 0.1532837152481079, + "learning_rate": 1.4581634732980026e-05, + "loss": 3.8523, + "step": 100615 + }, + { + "epoch": 6.836526701997554, + "grad_norm": 0.17596875131130219, + "learning_rate": 1.4577388232096752e-05, + "loss": 3.7482, + "step": 100620 + }, + { + "epoch": 6.836866422068216, + "grad_norm": 0.16806963086128235, + "learning_rate": 1.457314173121348e-05, + "loss": 3.7364, + "step": 100625 + }, + { + "epoch": 6.837206142138878, + "grad_norm": 0.2941804528236389, + "learning_rate": 1.456889523033021e-05, + "loss": 3.6425, + "step": 100630 + }, + { + "epoch": 6.83754586220954, + "grad_norm": 0.19161662459373474, + "learning_rate": 1.4564648729446936e-05, + "loss": 3.7492, + "step": 100635 + }, + { + "epoch": 6.837885582280201, + "grad_norm": 0.15089842677116394, + "learning_rate": 1.4560402228563663e-05, + "loss": 3.7672, + "step": 100640 + }, + { + "epoch": 6.838225302350863, + "grad_norm": 0.20070597529411316, + "learning_rate": 1.4556155727680392e-05, + "loss": 3.569, + "step": 100645 + }, + { + "epoch": 6.838565022421525, + "grad_norm": 0.2016981691122055, + "learning_rate": 1.455190922679712e-05, + "loss": 3.8533, + "step": 100650 + }, + { + "epoch": 6.838904742492186, + "grad_norm": 0.43952569365501404, + "learning_rate": 1.4547662725913847e-05, + "loss": 3.7827, + "step": 100655 + }, + { + "epoch": 6.839244462562848, + "grad_norm": 0.15660612285137177, + "learning_rate": 1.4543416225030576e-05, + "loss": 3.9126, + "step": 100660 + }, + { + "epoch": 6.83958418263351, + "grad_norm": 0.14145012199878693, + "learning_rate": 1.4539169724147303e-05, + "loss": 3.8381, + "step": 100665 + }, + { + "epoch": 6.839923902704172, + "grad_norm": 0.17236116528511047, + "learning_rate": 1.4534923223264029e-05, + "loss": 3.9636, + "step": 100670 + }, + { + "epoch": 6.840263622774834, + "grad_norm": 0.19959333539009094, + "learning_rate": 1.4530676722380759e-05, + "loss": 3.7884, + "step": 100675 + }, + { + "epoch": 6.840603342845496, + "grad_norm": 0.13437584042549133, + "learning_rate": 1.4526430221497487e-05, + "loss": 3.6789, + "step": 100680 + }, + { + "epoch": 6.840943062916157, + "grad_norm": 0.24284635484218597, + "learning_rate": 1.4522183720614216e-05, + "loss": 3.8539, + "step": 100685 + }, + { + "epoch": 6.841282782986819, + "grad_norm": 0.1523945927619934, + "learning_rate": 1.4517937219730943e-05, + "loss": 3.9965, + "step": 100690 + }, + { + "epoch": 6.841622503057481, + "grad_norm": 0.23875777423381805, + "learning_rate": 1.4513690718847669e-05, + "loss": 3.9666, + "step": 100695 + }, + { + "epoch": 6.841962223128142, + "grad_norm": 0.17457567155361176, + "learning_rate": 1.4509444217964399e-05, + "loss": 3.7797, + "step": 100700 + }, + { + "epoch": 6.842301943198804, + "grad_norm": 0.1689138561487198, + "learning_rate": 1.4505197717081125e-05, + "loss": 3.9276, + "step": 100705 + }, + { + "epoch": 6.842641663269466, + "grad_norm": 0.18781103193759918, + "learning_rate": 1.4500951216197853e-05, + "loss": 3.7801, + "step": 100710 + }, + { + "epoch": 6.842981383340128, + "grad_norm": 0.16779746115207672, + "learning_rate": 1.4496704715314583e-05, + "loss": 3.7835, + "step": 100715 + }, + { + "epoch": 6.84332110341079, + "grad_norm": 0.8740018606185913, + "learning_rate": 1.449245821443131e-05, + "loss": 3.9194, + "step": 100720 + }, + { + "epoch": 6.843660823481452, + "grad_norm": 0.18931663036346436, + "learning_rate": 1.4488211713548035e-05, + "loss": 3.7367, + "step": 100725 + }, + { + "epoch": 6.844000543552113, + "grad_norm": 0.2088724821805954, + "learning_rate": 1.4483965212664765e-05, + "loss": 3.8177, + "step": 100730 + }, + { + "epoch": 6.844340263622775, + "grad_norm": 0.19380441308021545, + "learning_rate": 1.4479718711781493e-05, + "loss": 3.7156, + "step": 100735 + }, + { + "epoch": 6.844679983693436, + "grad_norm": 0.17712831497192383, + "learning_rate": 1.4476321511074876e-05, + "loss": 3.7703, + "step": 100740 + }, + { + "epoch": 6.845019703764098, + "grad_norm": 0.18007735908031464, + "learning_rate": 1.4472075010191602e-05, + "loss": 3.6347, + "step": 100745 + }, + { + "epoch": 6.84535942383476, + "grad_norm": 0.16026043891906738, + "learning_rate": 1.4467828509308332e-05, + "loss": 4.0306, + "step": 100750 + }, + { + "epoch": 6.845699143905422, + "grad_norm": 0.1769755631685257, + "learning_rate": 1.4463582008425058e-05, + "loss": 3.7665, + "step": 100755 + }, + { + "epoch": 6.846038863976084, + "grad_norm": 0.15712285041809082, + "learning_rate": 1.4459335507541786e-05, + "loss": 3.755, + "step": 100760 + }, + { + "epoch": 6.846378584046746, + "grad_norm": 0.2620411515235901, + "learning_rate": 1.4455089006658516e-05, + "loss": 4.044, + "step": 100765 + }, + { + "epoch": 6.846718304117407, + "grad_norm": 0.1472305804491043, + "learning_rate": 1.4450842505775242e-05, + "loss": 3.7817, + "step": 100770 + }, + { + "epoch": 6.847058024188069, + "grad_norm": 0.24699163436889648, + "learning_rate": 1.4446596004891968e-05, + "loss": 3.7168, + "step": 100775 + }, + { + "epoch": 6.847397744258731, + "grad_norm": 0.19376793503761292, + "learning_rate": 1.4442349504008698e-05, + "loss": 3.9664, + "step": 100780 + }, + { + "epoch": 6.847737464329392, + "grad_norm": 0.17360717058181763, + "learning_rate": 1.4438103003125424e-05, + "loss": 4.0718, + "step": 100785 + }, + { + "epoch": 6.848077184400054, + "grad_norm": 0.3635743260383606, + "learning_rate": 1.4433856502242152e-05, + "loss": 3.6371, + "step": 100790 + }, + { + "epoch": 6.848416904470716, + "grad_norm": 0.14338965713977814, + "learning_rate": 1.4429610001358882e-05, + "loss": 3.7216, + "step": 100795 + }, + { + "epoch": 6.848756624541378, + "grad_norm": 0.16821177303791046, + "learning_rate": 1.4425363500475608e-05, + "loss": 3.5768, + "step": 100800 + }, + { + "epoch": 6.84909634461204, + "grad_norm": 0.273875892162323, + "learning_rate": 1.4421116999592338e-05, + "loss": 3.7365, + "step": 100805 + }, + { + "epoch": 6.849436064682702, + "grad_norm": 0.16969595849514008, + "learning_rate": 1.4416870498709064e-05, + "loss": 3.8363, + "step": 100810 + }, + { + "epoch": 6.849775784753363, + "grad_norm": 0.17154079675674438, + "learning_rate": 1.4412623997825792e-05, + "loss": 4.0237, + "step": 100815 + }, + { + "epoch": 6.850115504824025, + "grad_norm": 0.22662289440631866, + "learning_rate": 1.440837749694252e-05, + "loss": 3.8334, + "step": 100820 + }, + { + "epoch": 6.850455224894687, + "grad_norm": 0.21405275166034698, + "learning_rate": 1.4404130996059248e-05, + "loss": 3.747, + "step": 100825 + }, + { + "epoch": 6.850794944965348, + "grad_norm": 0.1864372044801712, + "learning_rate": 1.4399884495175975e-05, + "loss": 3.9283, + "step": 100830 + }, + { + "epoch": 6.85113466503601, + "grad_norm": 0.16974452137947083, + "learning_rate": 1.4395637994292704e-05, + "loss": 3.9169, + "step": 100835 + }, + { + "epoch": 6.851474385106672, + "grad_norm": 0.16260957717895508, + "learning_rate": 1.439139149340943e-05, + "loss": 3.8602, + "step": 100840 + }, + { + "epoch": 6.851814105177334, + "grad_norm": 0.14552301168441772, + "learning_rate": 1.4387144992526159e-05, + "loss": 3.7099, + "step": 100845 + }, + { + "epoch": 6.852153825247996, + "grad_norm": 0.19135743379592896, + "learning_rate": 1.4382898491642888e-05, + "loss": 3.699, + "step": 100850 + }, + { + "epoch": 6.852493545318658, + "grad_norm": 0.16606742143630981, + "learning_rate": 1.4378651990759615e-05, + "loss": 3.8874, + "step": 100855 + }, + { + "epoch": 6.852833265389319, + "grad_norm": 0.7147949934005737, + "learning_rate": 1.4374405489876341e-05, + "loss": 3.9037, + "step": 100860 + }, + { + "epoch": 6.853172985459981, + "grad_norm": 0.16888514161109924, + "learning_rate": 1.437015898899307e-05, + "loss": 3.7625, + "step": 100865 + }, + { + "epoch": 6.853512705530643, + "grad_norm": 0.17795655131340027, + "learning_rate": 1.4365912488109797e-05, + "loss": 3.5976, + "step": 100870 + }, + { + "epoch": 6.853852425601304, + "grad_norm": 0.21300551295280457, + "learning_rate": 1.4361665987226525e-05, + "loss": 4.0446, + "step": 100875 + }, + { + "epoch": 6.854192145671966, + "grad_norm": 0.15497153997421265, + "learning_rate": 1.4357419486343255e-05, + "loss": 3.8353, + "step": 100880 + }, + { + "epoch": 6.854531865742628, + "grad_norm": 0.3193296194076538, + "learning_rate": 1.4353172985459981e-05, + "loss": 3.7936, + "step": 100885 + }, + { + "epoch": 6.85487158581329, + "grad_norm": 0.47321656346321106, + "learning_rate": 1.434892648457671e-05, + "loss": 3.832, + "step": 100890 + }, + { + "epoch": 6.855211305883952, + "grad_norm": 0.1912907063961029, + "learning_rate": 1.4344679983693437e-05, + "loss": 3.7215, + "step": 100895 + }, + { + "epoch": 6.855551025954614, + "grad_norm": 0.19328343868255615, + "learning_rate": 1.4340433482810165e-05, + "loss": 3.9781, + "step": 100900 + }, + { + "epoch": 6.855890746025275, + "grad_norm": 0.17915260791778564, + "learning_rate": 1.4336186981926895e-05, + "loss": 3.6334, + "step": 100905 + }, + { + "epoch": 6.856230466095937, + "grad_norm": 0.19222411513328552, + "learning_rate": 1.4331940481043621e-05, + "loss": 3.7127, + "step": 100910 + }, + { + "epoch": 6.856570186166599, + "grad_norm": 0.14629651606082916, + "learning_rate": 1.4327693980160347e-05, + "loss": 3.7651, + "step": 100915 + }, + { + "epoch": 6.85690990623726, + "grad_norm": 0.21834352612495422, + "learning_rate": 1.4323447479277077e-05, + "loss": 4.0047, + "step": 100920 + }, + { + "epoch": 6.857249626307922, + "grad_norm": 0.13723504543304443, + "learning_rate": 1.4319200978393803e-05, + "loss": 3.9497, + "step": 100925 + }, + { + "epoch": 6.857589346378584, + "grad_norm": 0.145138218998909, + "learning_rate": 1.4314954477510531e-05, + "loss": 3.8267, + "step": 100930 + }, + { + "epoch": 6.857929066449246, + "grad_norm": 1.9989402294158936, + "learning_rate": 1.4310707976627261e-05, + "loss": 4.086, + "step": 100935 + }, + { + "epoch": 6.858268786519908, + "grad_norm": 0.20529089868068695, + "learning_rate": 1.4306461475743987e-05, + "loss": 3.6844, + "step": 100940 + }, + { + "epoch": 6.85860850659057, + "grad_norm": 0.34567907452583313, + "learning_rate": 1.4302214974860714e-05, + "loss": 4.08, + "step": 100945 + }, + { + "epoch": 6.858948226661231, + "grad_norm": 0.15381954610347748, + "learning_rate": 1.4297968473977443e-05, + "loss": 3.6598, + "step": 100950 + }, + { + "epoch": 6.859287946731893, + "grad_norm": 0.1388966590166092, + "learning_rate": 1.4293721973094171e-05, + "loss": 3.5832, + "step": 100955 + }, + { + "epoch": 6.859627666802554, + "grad_norm": 0.45109522342681885, + "learning_rate": 1.4289475472210898e-05, + "loss": 3.6694, + "step": 100960 + }, + { + "epoch": 6.859967386873216, + "grad_norm": 0.2847703993320465, + "learning_rate": 1.4285228971327627e-05, + "loss": 3.909, + "step": 100965 + }, + { + "epoch": 6.860307106943878, + "grad_norm": 0.18789248168468475, + "learning_rate": 1.4280982470444354e-05, + "loss": 3.7487, + "step": 100970 + }, + { + "epoch": 6.8606468270145395, + "grad_norm": 0.18668043613433838, + "learning_rate": 1.4276735969561083e-05, + "loss": 3.7615, + "step": 100975 + }, + { + "epoch": 6.860986547085202, + "grad_norm": 0.18063805997371674, + "learning_rate": 1.427248946867781e-05, + "loss": 3.7352, + "step": 100980 + }, + { + "epoch": 6.861326267155864, + "grad_norm": 0.21311834454536438, + "learning_rate": 1.4268242967794538e-05, + "loss": 3.6899, + "step": 100985 + }, + { + "epoch": 6.861665987226525, + "grad_norm": 0.19147662818431854, + "learning_rate": 1.4263996466911268e-05, + "loss": 3.7984, + "step": 100990 + }, + { + "epoch": 6.862005707297187, + "grad_norm": 0.1827281415462494, + "learning_rate": 1.4259749966027994e-05, + "loss": 4.0692, + "step": 100995 + }, + { + "epoch": 6.862345427367849, + "grad_norm": 0.3273231089115143, + "learning_rate": 1.425550346514472e-05, + "loss": 3.9129, + "step": 101000 + }, + { + "epoch": 6.86268514743851, + "grad_norm": 0.178438201546669, + "learning_rate": 1.425125696426145e-05, + "loss": 3.8353, + "step": 101005 + }, + { + "epoch": 6.863024867509172, + "grad_norm": 0.16153278946876526, + "learning_rate": 1.4247010463378178e-05, + "loss": 3.7577, + "step": 101010 + }, + { + "epoch": 6.863364587579834, + "grad_norm": 0.1596720814704895, + "learning_rate": 1.4242763962494904e-05, + "loss": 3.6281, + "step": 101015 + }, + { + "epoch": 6.8637043076504956, + "grad_norm": 0.1597408950328827, + "learning_rate": 1.4238517461611634e-05, + "loss": 3.9923, + "step": 101020 + }, + { + "epoch": 6.864044027721158, + "grad_norm": 0.6262490153312683, + "learning_rate": 1.423427096072836e-05, + "loss": 3.8558, + "step": 101025 + }, + { + "epoch": 6.86438374779182, + "grad_norm": 0.21023300290107727, + "learning_rate": 1.4230024459845086e-05, + "loss": 3.8541, + "step": 101030 + }, + { + "epoch": 6.864723467862481, + "grad_norm": 0.36940351128578186, + "learning_rate": 1.4225777958961816e-05, + "loss": 4.1381, + "step": 101035 + }, + { + "epoch": 6.865063187933143, + "grad_norm": 0.20451977849006653, + "learning_rate": 1.4221531458078544e-05, + "loss": 3.6888, + "step": 101040 + }, + { + "epoch": 6.865402908003805, + "grad_norm": 0.4257952868938446, + "learning_rate": 1.421728495719527e-05, + "loss": 3.9548, + "step": 101045 + }, + { + "epoch": 6.865742628074466, + "grad_norm": 0.19645555317401886, + "learning_rate": 1.4213038456312e-05, + "loss": 3.8273, + "step": 101050 + }, + { + "epoch": 6.866082348145128, + "grad_norm": 0.6779542565345764, + "learning_rate": 1.4208791955428727e-05, + "loss": 3.6731, + "step": 101055 + }, + { + "epoch": 6.86642206821579, + "grad_norm": 0.15641844272613525, + "learning_rate": 1.4204545454545456e-05, + "loss": 3.9115, + "step": 101060 + }, + { + "epoch": 6.866761788286452, + "grad_norm": 0.2600044310092926, + "learning_rate": 1.4200298953662183e-05, + "loss": 3.7626, + "step": 101065 + }, + { + "epoch": 6.867101508357114, + "grad_norm": 0.15280681848526, + "learning_rate": 1.419605245277891e-05, + "loss": 3.8493, + "step": 101070 + }, + { + "epoch": 6.867441228427776, + "grad_norm": 0.18508568406105042, + "learning_rate": 1.419180595189564e-05, + "loss": 3.7105, + "step": 101075 + }, + { + "epoch": 6.867780948498437, + "grad_norm": 0.1663421094417572, + "learning_rate": 1.4187559451012367e-05, + "loss": 3.9952, + "step": 101080 + }, + { + "epoch": 6.868120668569099, + "grad_norm": 0.23156046867370605, + "learning_rate": 1.4183312950129093e-05, + "loss": 3.7796, + "step": 101085 + }, + { + "epoch": 6.868460388639761, + "grad_norm": 0.15634563565254211, + "learning_rate": 1.4179066449245823e-05, + "loss": 3.8149, + "step": 101090 + }, + { + "epoch": 6.868800108710422, + "grad_norm": 0.2771627604961395, + "learning_rate": 1.417481994836255e-05, + "loss": 3.7665, + "step": 101095 + }, + { + "epoch": 6.869139828781084, + "grad_norm": 0.16183021664619446, + "learning_rate": 1.4170573447479277e-05, + "loss": 3.7724, + "step": 101100 + }, + { + "epoch": 6.869479548851746, + "grad_norm": 0.17956897616386414, + "learning_rate": 1.4166326946596007e-05, + "loss": 3.7516, + "step": 101105 + }, + { + "epoch": 6.869819268922408, + "grad_norm": 0.2568677067756653, + "learning_rate": 1.4162080445712733e-05, + "loss": 4.039, + "step": 101110 + }, + { + "epoch": 6.87015898899307, + "grad_norm": 0.13081929087638855, + "learning_rate": 1.415783394482946e-05, + "loss": 3.583, + "step": 101115 + }, + { + "epoch": 6.870498709063732, + "grad_norm": 0.48012691736221313, + "learning_rate": 1.4153587443946189e-05, + "loss": 3.8483, + "step": 101120 + }, + { + "epoch": 6.870838429134393, + "grad_norm": 0.18312221765518188, + "learning_rate": 1.4149340943062917e-05, + "loss": 3.7885, + "step": 101125 + }, + { + "epoch": 6.871178149205055, + "grad_norm": 0.1918025016784668, + "learning_rate": 1.4145094442179643e-05, + "loss": 3.8291, + "step": 101130 + }, + { + "epoch": 6.871517869275717, + "grad_norm": 0.16295726597309113, + "learning_rate": 1.4140847941296373e-05, + "loss": 3.8668, + "step": 101135 + }, + { + "epoch": 6.871857589346378, + "grad_norm": 0.21576985716819763, + "learning_rate": 1.41366014404131e-05, + "loss": 3.9974, + "step": 101140 + }, + { + "epoch": 6.87219730941704, + "grad_norm": 0.14791247248649597, + "learning_rate": 1.4132354939529829e-05, + "loss": 4.1016, + "step": 101145 + }, + { + "epoch": 6.872537029487702, + "grad_norm": 0.5803276896476746, + "learning_rate": 1.4128108438646557e-05, + "loss": 4.0166, + "step": 101150 + }, + { + "epoch": 6.872876749558364, + "grad_norm": 0.16229471564292908, + "learning_rate": 1.4123861937763283e-05, + "loss": 3.8962, + "step": 101155 + }, + { + "epoch": 6.873216469629026, + "grad_norm": 0.16415297985076904, + "learning_rate": 1.4119615436880013e-05, + "loss": 3.7551, + "step": 101160 + }, + { + "epoch": 6.873556189699688, + "grad_norm": 0.2298216074705124, + "learning_rate": 1.411536893599674e-05, + "loss": 3.7799, + "step": 101165 + }, + { + "epoch": 6.873895909770349, + "grad_norm": 0.2565567195415497, + "learning_rate": 1.4111122435113466e-05, + "loss": 3.9462, + "step": 101170 + }, + { + "epoch": 6.874235629841011, + "grad_norm": 0.1875787228345871, + "learning_rate": 1.4106875934230195e-05, + "loss": 3.8132, + "step": 101175 + }, + { + "epoch": 6.874575349911673, + "grad_norm": 0.16722895205020905, + "learning_rate": 1.4102629433346923e-05, + "loss": 3.8734, + "step": 101180 + }, + { + "epoch": 6.874915069982334, + "grad_norm": 0.19482897222042084, + "learning_rate": 1.409838293246365e-05, + "loss": 3.9763, + "step": 101185 + }, + { + "epoch": 6.875254790052996, + "grad_norm": 0.5482123494148254, + "learning_rate": 1.409413643158038e-05, + "loss": 3.7481, + "step": 101190 + }, + { + "epoch": 6.875594510123658, + "grad_norm": 0.1699097603559494, + "learning_rate": 1.4089889930697106e-05, + "loss": 3.6567, + "step": 101195 + }, + { + "epoch": 6.87593423019432, + "grad_norm": 0.2410283386707306, + "learning_rate": 1.4085643429813834e-05, + "loss": 3.6391, + "step": 101200 + }, + { + "epoch": 6.876273950264982, + "grad_norm": 1.4865270853042603, + "learning_rate": 1.4081396928930562e-05, + "loss": 3.7978, + "step": 101205 + }, + { + "epoch": 6.876613670335644, + "grad_norm": 0.33893686532974243, + "learning_rate": 1.407715042804729e-05, + "loss": 3.6933, + "step": 101210 + }, + { + "epoch": 6.876953390406305, + "grad_norm": 0.22554044425487518, + "learning_rate": 1.4072903927164016e-05, + "loss": 4.0861, + "step": 101215 + }, + { + "epoch": 6.877293110476967, + "grad_norm": 0.5019211769104004, + "learning_rate": 1.4068657426280746e-05, + "loss": 3.7048, + "step": 101220 + }, + { + "epoch": 6.877632830547629, + "grad_norm": 0.9783873558044434, + "learning_rate": 1.4064410925397472e-05, + "loss": 3.915, + "step": 101225 + }, + { + "epoch": 6.87797255061829, + "grad_norm": 0.18176259100437164, + "learning_rate": 1.4060164424514202e-05, + "loss": 4.1444, + "step": 101230 + }, + { + "epoch": 6.878312270688952, + "grad_norm": 0.1945154219865799, + "learning_rate": 1.405591792363093e-05, + "loss": 4.0002, + "step": 101235 + }, + { + "epoch": 6.878651990759614, + "grad_norm": 2.9349114894866943, + "learning_rate": 1.4051671422747656e-05, + "loss": 3.7176, + "step": 101240 + }, + { + "epoch": 6.878991710830276, + "grad_norm": 0.19011950492858887, + "learning_rate": 1.4047424921864386e-05, + "loss": 3.9005, + "step": 101245 + }, + { + "epoch": 6.879331430900938, + "grad_norm": 0.17015987634658813, + "learning_rate": 1.4043178420981112e-05, + "loss": 3.7522, + "step": 101250 + }, + { + "epoch": 6.8796711509716, + "grad_norm": 0.15732342004776, + "learning_rate": 1.4038931920097838e-05, + "loss": 3.924, + "step": 101255 + }, + { + "epoch": 6.880010871042261, + "grad_norm": 0.13102783262729645, + "learning_rate": 1.4034685419214568e-05, + "loss": 3.7512, + "step": 101260 + }, + { + "epoch": 6.880350591112923, + "grad_norm": 0.20421913266181946, + "learning_rate": 1.4030438918331296e-05, + "loss": 3.9052, + "step": 101265 + }, + { + "epoch": 6.880690311183585, + "grad_norm": 0.1611640453338623, + "learning_rate": 1.4026192417448022e-05, + "loss": 3.6881, + "step": 101270 + }, + { + "epoch": 6.881030031254246, + "grad_norm": 0.16657277941703796, + "learning_rate": 1.4021945916564752e-05, + "loss": 4.0766, + "step": 101275 + }, + { + "epoch": 6.881369751324908, + "grad_norm": 0.18465200066566467, + "learning_rate": 1.4017699415681478e-05, + "loss": 3.8245, + "step": 101280 + }, + { + "epoch": 6.88170947139557, + "grad_norm": 0.16284692287445068, + "learning_rate": 1.4013452914798206e-05, + "loss": 3.7506, + "step": 101285 + }, + { + "epoch": 6.882049191466232, + "grad_norm": 0.14974816143512726, + "learning_rate": 1.4009206413914936e-05, + "loss": 3.7971, + "step": 101290 + }, + { + "epoch": 6.882388911536894, + "grad_norm": 0.7529256939888, + "learning_rate": 1.4004959913031662e-05, + "loss": 3.7719, + "step": 101295 + }, + { + "epoch": 6.882728631607556, + "grad_norm": 0.145399272441864, + "learning_rate": 1.4000713412148389e-05, + "loss": 3.8328, + "step": 101300 + }, + { + "epoch": 6.883068351678217, + "grad_norm": 0.9596526026725769, + "learning_rate": 1.3996466911265118e-05, + "loss": 3.676, + "step": 101305 + }, + { + "epoch": 6.883408071748879, + "grad_norm": 0.13784228265285492, + "learning_rate": 1.3992220410381845e-05, + "loss": 3.7353, + "step": 101310 + }, + { + "epoch": 6.883747791819541, + "grad_norm": 0.2072809636592865, + "learning_rate": 1.3987973909498574e-05, + "loss": 3.8446, + "step": 101315 + }, + { + "epoch": 6.884087511890202, + "grad_norm": 1.1360085010528564, + "learning_rate": 1.3983727408615302e-05, + "loss": 3.7696, + "step": 101320 + }, + { + "epoch": 6.884427231960864, + "grad_norm": 0.18207381665706635, + "learning_rate": 1.3979480907732029e-05, + "loss": 4.0372, + "step": 101325 + }, + { + "epoch": 6.884766952031526, + "grad_norm": 0.1530565619468689, + "learning_rate": 1.3975234406848758e-05, + "loss": 3.8057, + "step": 101330 + }, + { + "epoch": 6.885106672102188, + "grad_norm": 0.156253382563591, + "learning_rate": 1.3970987905965485e-05, + "loss": 3.859, + "step": 101335 + }, + { + "epoch": 6.88544639217285, + "grad_norm": 0.15642184019088745, + "learning_rate": 1.3966741405082213e-05, + "loss": 3.6391, + "step": 101340 + }, + { + "epoch": 6.885786112243512, + "grad_norm": 0.22440394759178162, + "learning_rate": 1.3962494904198943e-05, + "loss": 3.9962, + "step": 101345 + }, + { + "epoch": 6.886125832314173, + "grad_norm": 0.16417251527309418, + "learning_rate": 1.3958248403315669e-05, + "loss": 4.0091, + "step": 101350 + }, + { + "epoch": 6.886465552384835, + "grad_norm": 0.17915791273117065, + "learning_rate": 1.3954001902432395e-05, + "loss": 3.8362, + "step": 101355 + }, + { + "epoch": 6.886805272455497, + "grad_norm": 0.16133573651313782, + "learning_rate": 1.3949755401549125e-05, + "loss": 3.6642, + "step": 101360 + }, + { + "epoch": 6.887144992526158, + "grad_norm": 0.25333496928215027, + "learning_rate": 1.3945508900665851e-05, + "loss": 3.9098, + "step": 101365 + }, + { + "epoch": 6.88748471259682, + "grad_norm": 0.26268523931503296, + "learning_rate": 1.394126239978258e-05, + "loss": 3.7892, + "step": 101370 + }, + { + "epoch": 6.8878244326674825, + "grad_norm": 0.2081320434808731, + "learning_rate": 1.3937015898899309e-05, + "loss": 3.7649, + "step": 101375 + }, + { + "epoch": 6.888164152738144, + "grad_norm": 0.2186337262392044, + "learning_rate": 1.3932769398016035e-05, + "loss": 3.79, + "step": 101380 + }, + { + "epoch": 6.888503872808806, + "grad_norm": 0.1687869429588318, + "learning_rate": 1.3928522897132761e-05, + "loss": 3.9147, + "step": 101385 + }, + { + "epoch": 6.888843592879468, + "grad_norm": 0.16882283985614777, + "learning_rate": 1.3924276396249491e-05, + "loss": 3.9867, + "step": 101390 + }, + { + "epoch": 6.889183312950129, + "grad_norm": 0.2076367288827896, + "learning_rate": 1.392002989536622e-05, + "loss": 3.9641, + "step": 101395 + }, + { + "epoch": 6.889523033020791, + "grad_norm": 0.13019302487373352, + "learning_rate": 1.3915783394482947e-05, + "loss": 3.7809, + "step": 101400 + }, + { + "epoch": 6.889862753091453, + "grad_norm": 0.17185328900814056, + "learning_rate": 1.3911536893599675e-05, + "loss": 3.7759, + "step": 101405 + }, + { + "epoch": 6.890202473162114, + "grad_norm": 0.2204868495464325, + "learning_rate": 1.3907290392716402e-05, + "loss": 3.7915, + "step": 101410 + }, + { + "epoch": 6.890542193232776, + "grad_norm": 0.157804936170578, + "learning_rate": 1.3903043891833131e-05, + "loss": 3.7596, + "step": 101415 + }, + { + "epoch": 6.8908819133034385, + "grad_norm": 0.19258728623390198, + "learning_rate": 1.3898797390949858e-05, + "loss": 3.4938, + "step": 101420 + }, + { + "epoch": 6.8912216333741, + "grad_norm": 0.3684903681278229, + "learning_rate": 1.3894550890066586e-05, + "loss": 3.8835, + "step": 101425 + }, + { + "epoch": 6.891561353444762, + "grad_norm": 0.16483204066753387, + "learning_rate": 1.3890304389183315e-05, + "loss": 3.8773, + "step": 101430 + }, + { + "epoch": 6.891901073515423, + "grad_norm": 0.18797549605369568, + "learning_rate": 1.3886057888300042e-05, + "loss": 3.7607, + "step": 101435 + }, + { + "epoch": 6.892240793586085, + "grad_norm": 0.3311615586280823, + "learning_rate": 1.3881811387416768e-05, + "loss": 4.2694, + "step": 101440 + }, + { + "epoch": 6.892580513656747, + "grad_norm": 0.14239655435085297, + "learning_rate": 1.3877564886533498e-05, + "loss": 3.8244, + "step": 101445 + }, + { + "epoch": 6.892920233727408, + "grad_norm": 0.15357817709445953, + "learning_rate": 1.3873318385650224e-05, + "loss": 3.7381, + "step": 101450 + }, + { + "epoch": 6.89325995379807, + "grad_norm": 0.16955776512622833, + "learning_rate": 1.3869071884766952e-05, + "loss": 3.8126, + "step": 101455 + }, + { + "epoch": 6.893599673868732, + "grad_norm": 0.20924074947834015, + "learning_rate": 1.3864825383883682e-05, + "loss": 3.7787, + "step": 101460 + }, + { + "epoch": 6.893939393939394, + "grad_norm": 0.17765292525291443, + "learning_rate": 1.3860578883000408e-05, + "loss": 3.6963, + "step": 101465 + }, + { + "epoch": 6.894279114010056, + "grad_norm": 0.14950235188007355, + "learning_rate": 1.3856332382117134e-05, + "loss": 4.0569, + "step": 101470 + }, + { + "epoch": 6.894618834080718, + "grad_norm": 0.1991308629512787, + "learning_rate": 1.3852085881233864e-05, + "loss": 3.8747, + "step": 101475 + }, + { + "epoch": 6.894958554151379, + "grad_norm": 0.17425762116909027, + "learning_rate": 1.3847839380350592e-05, + "loss": 3.6104, + "step": 101480 + }, + { + "epoch": 6.895298274222041, + "grad_norm": 0.1693657487630844, + "learning_rate": 1.3843592879467322e-05, + "loss": 3.6186, + "step": 101485 + }, + { + "epoch": 6.895637994292703, + "grad_norm": 0.15466590225696564, + "learning_rate": 1.3839346378584048e-05, + "loss": 3.717, + "step": 101490 + }, + { + "epoch": 6.895977714363364, + "grad_norm": 0.19161398708820343, + "learning_rate": 1.3835099877700774e-05, + "loss": 3.7135, + "step": 101495 + }, + { + "epoch": 6.896317434434026, + "grad_norm": 0.14465220272541046, + "learning_rate": 1.3830853376817504e-05, + "loss": 3.8179, + "step": 101500 + }, + { + "epoch": 6.896657154504688, + "grad_norm": 0.16886045038700104, + "learning_rate": 1.382660687593423e-05, + "loss": 3.7785, + "step": 101505 + }, + { + "epoch": 6.89699687457535, + "grad_norm": 0.3667759597301483, + "learning_rate": 1.3822360375050958e-05, + "loss": 3.7967, + "step": 101510 + }, + { + "epoch": 6.897336594646012, + "grad_norm": 0.29372265934944153, + "learning_rate": 1.3818113874167688e-05, + "loss": 3.8673, + "step": 101515 + }, + { + "epoch": 6.897676314716674, + "grad_norm": 0.18132810294628143, + "learning_rate": 1.3813867373284414e-05, + "loss": 3.4813, + "step": 101520 + }, + { + "epoch": 6.898016034787335, + "grad_norm": 0.4306515157222748, + "learning_rate": 1.380962087240114e-05, + "loss": 3.9015, + "step": 101525 + }, + { + "epoch": 6.898355754857997, + "grad_norm": 0.12441864609718323, + "learning_rate": 1.380537437151787e-05, + "loss": 3.724, + "step": 101530 + }, + { + "epoch": 6.898695474928659, + "grad_norm": 0.5353036522865295, + "learning_rate": 1.3801127870634598e-05, + "loss": 3.7368, + "step": 101535 + }, + { + "epoch": 6.89903519499932, + "grad_norm": 0.19551131129264832, + "learning_rate": 1.3796881369751325e-05, + "loss": 3.7712, + "step": 101540 + }, + { + "epoch": 6.899374915069982, + "grad_norm": 0.13812129199504852, + "learning_rate": 1.3792634868868054e-05, + "loss": 3.5493, + "step": 101545 + }, + { + "epoch": 6.899714635140644, + "grad_norm": 0.1559155136346817, + "learning_rate": 1.378838836798478e-05, + "loss": 3.8835, + "step": 101550 + }, + { + "epoch": 6.900054355211306, + "grad_norm": 0.1802847683429718, + "learning_rate": 1.3784141867101507e-05, + "loss": 3.849, + "step": 101555 + }, + { + "epoch": 6.900394075281968, + "grad_norm": 0.27313557267189026, + "learning_rate": 1.3779895366218237e-05, + "loss": 3.9449, + "step": 101560 + }, + { + "epoch": 6.90073379535263, + "grad_norm": 0.1433781087398529, + "learning_rate": 1.3775648865334965e-05, + "loss": 3.852, + "step": 101565 + }, + { + "epoch": 6.901073515423291, + "grad_norm": 0.25645971298217773, + "learning_rate": 1.3771402364451694e-05, + "loss": 3.7723, + "step": 101570 + }, + { + "epoch": 6.901413235493953, + "grad_norm": 0.1960073858499527, + "learning_rate": 1.376715586356842e-05, + "loss": 3.7391, + "step": 101575 + }, + { + "epoch": 6.901752955564615, + "grad_norm": NaN, + "learning_rate": 1.3763758662861803e-05, + "loss": 3.6289, + "step": 101580 + }, + { + "epoch": 6.902092675635276, + "grad_norm": 0.2029721587896347, + "learning_rate": 1.375951216197853e-05, + "loss": 3.8899, + "step": 101585 + }, + { + "epoch": 6.902432395705938, + "grad_norm": 0.3185579180717468, + "learning_rate": 1.3755265661095257e-05, + "loss": 3.7169, + "step": 101590 + }, + { + "epoch": 6.9027721157766, + "grad_norm": 0.1714593768119812, + "learning_rate": 1.3751019160211987e-05, + "loss": 3.5763, + "step": 101595 + }, + { + "epoch": 6.903111835847262, + "grad_norm": 0.20990680158138275, + "learning_rate": 1.3746772659328713e-05, + "loss": 3.6803, + "step": 101600 + }, + { + "epoch": 6.903451555917924, + "grad_norm": 0.13670532405376434, + "learning_rate": 1.3742526158445443e-05, + "loss": 3.9666, + "step": 101605 + }, + { + "epoch": 6.903791275988586, + "grad_norm": 0.2297099381685257, + "learning_rate": 1.373827965756217e-05, + "loss": 3.6759, + "step": 101610 + }, + { + "epoch": 6.904130996059247, + "grad_norm": 0.15192948281764984, + "learning_rate": 1.3734033156678896e-05, + "loss": 3.8485, + "step": 101615 + }, + { + "epoch": 6.904470716129909, + "grad_norm": 0.15539103746414185, + "learning_rate": 1.3729786655795625e-05, + "loss": 4.0223, + "step": 101620 + }, + { + "epoch": 6.904810436200571, + "grad_norm": 0.15955793857574463, + "learning_rate": 1.3725540154912353e-05, + "loss": 3.9807, + "step": 101625 + }, + { + "epoch": 6.905150156271232, + "grad_norm": 2.866286039352417, + "learning_rate": 1.372129365402908e-05, + "loss": 3.8025, + "step": 101630 + }, + { + "epoch": 6.905489876341894, + "grad_norm": 0.17689469456672668, + "learning_rate": 1.371704715314581e-05, + "loss": 3.8621, + "step": 101635 + }, + { + "epoch": 6.905829596412556, + "grad_norm": 0.18169094622135162, + "learning_rate": 1.3712800652262536e-05, + "loss": 3.8881, + "step": 101640 + }, + { + "epoch": 6.906169316483218, + "grad_norm": 0.3835512101650238, + "learning_rate": 1.3708554151379264e-05, + "loss": 3.5673, + "step": 101645 + }, + { + "epoch": 6.90650903655388, + "grad_norm": 0.13971047103405, + "learning_rate": 1.3704307650495994e-05, + "loss": 3.9786, + "step": 101650 + }, + { + "epoch": 6.906848756624541, + "grad_norm": 0.3903498351573944, + "learning_rate": 1.370006114961272e-05, + "loss": 4.0121, + "step": 101655 + }, + { + "epoch": 6.907188476695203, + "grad_norm": 0.1469683200120926, + "learning_rate": 1.3695814648729446e-05, + "loss": 3.8985, + "step": 101660 + }, + { + "epoch": 6.907528196765865, + "grad_norm": 0.24693119525909424, + "learning_rate": 1.3691568147846176e-05, + "loss": 3.8601, + "step": 101665 + }, + { + "epoch": 6.907867916836526, + "grad_norm": 0.15701471269130707, + "learning_rate": 1.3687321646962902e-05, + "loss": 3.8968, + "step": 101670 + }, + { + "epoch": 6.908207636907188, + "grad_norm": 0.1828821748495102, + "learning_rate": 1.368307514607963e-05, + "loss": 3.7494, + "step": 101675 + }, + { + "epoch": 6.90854735697785, + "grad_norm": 0.17927102744579315, + "learning_rate": 1.367882864519636e-05, + "loss": 4.064, + "step": 101680 + }, + { + "epoch": 6.908887077048512, + "grad_norm": 0.16793090105056763, + "learning_rate": 1.3674582144313086e-05, + "loss": 3.8758, + "step": 101685 + }, + { + "epoch": 6.909226797119174, + "grad_norm": 0.15918691456317902, + "learning_rate": 1.3670335643429816e-05, + "loss": 3.8108, + "step": 101690 + }, + { + "epoch": 6.909566517189836, + "grad_norm": 0.16825301945209503, + "learning_rate": 1.3666089142546542e-05, + "loss": 3.5116, + "step": 101695 + }, + { + "epoch": 6.909906237260497, + "grad_norm": 0.2271578013896942, + "learning_rate": 1.366184264166327e-05, + "loss": 3.7267, + "step": 101700 + }, + { + "epoch": 6.910245957331159, + "grad_norm": 0.17777347564697266, + "learning_rate": 1.3657596140779998e-05, + "loss": 3.7578, + "step": 101705 + }, + { + "epoch": 6.910585677401821, + "grad_norm": 0.1789349466562271, + "learning_rate": 1.3653349639896726e-05, + "loss": 4.0267, + "step": 101710 + }, + { + "epoch": 6.910925397472482, + "grad_norm": 0.12974780797958374, + "learning_rate": 1.3649103139013453e-05, + "loss": 3.9512, + "step": 101715 + }, + { + "epoch": 6.911265117543144, + "grad_norm": 0.15156713128089905, + "learning_rate": 1.3644856638130182e-05, + "loss": 3.8507, + "step": 101720 + }, + { + "epoch": 6.911604837613806, + "grad_norm": 0.20277437567710876, + "learning_rate": 1.3640610137246909e-05, + "loss": 3.9783, + "step": 101725 + }, + { + "epoch": 6.911944557684468, + "grad_norm": 0.23303256928920746, + "learning_rate": 1.3636363636363637e-05, + "loss": 3.7498, + "step": 101730 + }, + { + "epoch": 6.91228427775513, + "grad_norm": 0.14504192769527435, + "learning_rate": 1.3632117135480366e-05, + "loss": 3.9657, + "step": 101735 + }, + { + "epoch": 6.912623997825792, + "grad_norm": 0.2512455880641937, + "learning_rate": 1.3627870634597093e-05, + "loss": 3.8542, + "step": 101740 + }, + { + "epoch": 6.912963717896453, + "grad_norm": 0.13919799029827118, + "learning_rate": 1.3623624133713819e-05, + "loss": 3.8115, + "step": 101745 + }, + { + "epoch": 6.913303437967115, + "grad_norm": 0.19540554285049438, + "learning_rate": 1.3619377632830549e-05, + "loss": 3.6092, + "step": 101750 + }, + { + "epoch": 6.913643158037777, + "grad_norm": 0.18478326499462128, + "learning_rate": 1.3615131131947275e-05, + "loss": 3.9107, + "step": 101755 + }, + { + "epoch": 6.913982878108438, + "grad_norm": 0.1926322877407074, + "learning_rate": 1.3610884631064003e-05, + "loss": 3.8587, + "step": 101760 + }, + { + "epoch": 6.9143225981791, + "grad_norm": 0.1578955054283142, + "learning_rate": 1.3606638130180733e-05, + "loss": 3.7131, + "step": 101765 + }, + { + "epoch": 6.914662318249762, + "grad_norm": 0.15633626282215118, + "learning_rate": 1.3602391629297459e-05, + "loss": 3.6674, + "step": 101770 + }, + { + "epoch": 6.915002038320424, + "grad_norm": 0.20679229497909546, + "learning_rate": 1.3598145128414189e-05, + "loss": 3.8486, + "step": 101775 + }, + { + "epoch": 6.915341758391086, + "grad_norm": 0.6239503622055054, + "learning_rate": 1.3593898627530915e-05, + "loss": 4.0062, + "step": 101780 + }, + { + "epoch": 6.915681478461748, + "grad_norm": 0.22309477627277374, + "learning_rate": 1.3589652126647643e-05, + "loss": 3.6054, + "step": 101785 + }, + { + "epoch": 6.916021198532409, + "grad_norm": 0.17247800529003143, + "learning_rate": 1.3585405625764373e-05, + "loss": 3.8887, + "step": 101790 + }, + { + "epoch": 6.916360918603071, + "grad_norm": 0.17038939893245697, + "learning_rate": 1.3581159124881099e-05, + "loss": 3.7241, + "step": 101795 + }, + { + "epoch": 6.916700638673733, + "grad_norm": 1.1830967664718628, + "learning_rate": 1.3576912623997825e-05, + "loss": 3.7357, + "step": 101800 + }, + { + "epoch": 6.917040358744394, + "grad_norm": 0.3752461075782776, + "learning_rate": 1.3572666123114555e-05, + "loss": 3.6242, + "step": 101805 + }, + { + "epoch": 6.917380078815056, + "grad_norm": 0.19596697390079498, + "learning_rate": 1.3568419622231281e-05, + "loss": 3.9175, + "step": 101810 + }, + { + "epoch": 6.917719798885718, + "grad_norm": 0.14987795054912567, + "learning_rate": 1.356417312134801e-05, + "loss": 3.6931, + "step": 101815 + }, + { + "epoch": 6.91805951895638, + "grad_norm": 0.16108523309230804, + "learning_rate": 1.3559926620464739e-05, + "loss": 3.9078, + "step": 101820 + }, + { + "epoch": 6.918399239027042, + "grad_norm": 0.16760188341140747, + "learning_rate": 1.3555680119581465e-05, + "loss": 3.8589, + "step": 101825 + }, + { + "epoch": 6.918738959097704, + "grad_norm": 0.14883403480052948, + "learning_rate": 1.3551433618698192e-05, + "loss": 3.7989, + "step": 101830 + }, + { + "epoch": 6.919078679168365, + "grad_norm": 0.3376505374908447, + "learning_rate": 1.3547187117814921e-05, + "loss": 3.7683, + "step": 101835 + }, + { + "epoch": 6.919418399239027, + "grad_norm": 0.816076934337616, + "learning_rate": 1.354294061693165e-05, + "loss": 3.9992, + "step": 101840 + }, + { + "epoch": 6.919758119309689, + "grad_norm": 0.2361312359571457, + "learning_rate": 1.3538694116048376e-05, + "loss": 3.8491, + "step": 101845 + }, + { + "epoch": 6.92009783938035, + "grad_norm": 0.39727163314819336, + "learning_rate": 1.3534447615165105e-05, + "loss": 3.8708, + "step": 101850 + }, + { + "epoch": 6.920437559451012, + "grad_norm": 0.17973726987838745, + "learning_rate": 1.3530201114281832e-05, + "loss": 3.9535, + "step": 101855 + }, + { + "epoch": 6.920777279521674, + "grad_norm": 0.16511224210262299, + "learning_rate": 1.3525954613398561e-05, + "loss": 3.8751, + "step": 101860 + }, + { + "epoch": 6.921116999592336, + "grad_norm": 0.1804663985967636, + "learning_rate": 1.3521708112515288e-05, + "loss": 3.8252, + "step": 101865 + }, + { + "epoch": 6.921456719662998, + "grad_norm": 0.13855068385601044, + "learning_rate": 1.3517461611632016e-05, + "loss": 3.6923, + "step": 101870 + }, + { + "epoch": 6.92179643973366, + "grad_norm": 0.1728561520576477, + "learning_rate": 1.3513215110748745e-05, + "loss": 3.627, + "step": 101875 + }, + { + "epoch": 6.922136159804321, + "grad_norm": 0.14533951878547668, + "learning_rate": 1.3508968609865472e-05, + "loss": 3.7576, + "step": 101880 + }, + { + "epoch": 6.922475879874983, + "grad_norm": 0.16485312581062317, + "learning_rate": 1.3504722108982198e-05, + "loss": 3.706, + "step": 101885 + }, + { + "epoch": 6.922815599945645, + "grad_norm": 0.17296023666858673, + "learning_rate": 1.3500475608098928e-05, + "loss": 3.792, + "step": 101890 + }, + { + "epoch": 6.923155320016306, + "grad_norm": 0.21830002963542938, + "learning_rate": 1.3496229107215656e-05, + "loss": 3.9708, + "step": 101895 + }, + { + "epoch": 6.923495040086968, + "grad_norm": 0.16463764011859894, + "learning_rate": 1.3491982606332382e-05, + "loss": 3.717, + "step": 101900 + }, + { + "epoch": 6.92383476015763, + "grad_norm": 0.44559553265571594, + "learning_rate": 1.3487736105449112e-05, + "loss": 3.7345, + "step": 101905 + }, + { + "epoch": 6.924174480228292, + "grad_norm": 0.1611296534538269, + "learning_rate": 1.3483489604565838e-05, + "loss": 3.6223, + "step": 101910 + }, + { + "epoch": 6.924514200298954, + "grad_norm": 0.18525013327598572, + "learning_rate": 1.3479243103682564e-05, + "loss": 3.7435, + "step": 101915 + }, + { + "epoch": 6.924853920369616, + "grad_norm": 0.21498288214206696, + "learning_rate": 1.3474996602799294e-05, + "loss": 3.8493, + "step": 101920 + }, + { + "epoch": 6.925193640440277, + "grad_norm": 0.19032034277915955, + "learning_rate": 1.3470750101916022e-05, + "loss": 3.9604, + "step": 101925 + }, + { + "epoch": 6.925533360510939, + "grad_norm": 0.1852356195449829, + "learning_rate": 1.3466503601032748e-05, + "loss": 3.6462, + "step": 101930 + }, + { + "epoch": 6.925873080581601, + "grad_norm": 1.3524082899093628, + "learning_rate": 1.3462257100149478e-05, + "loss": 3.7273, + "step": 101935 + }, + { + "epoch": 6.926212800652262, + "grad_norm": 0.36532914638519287, + "learning_rate": 1.3458010599266204e-05, + "loss": 3.8344, + "step": 101940 + }, + { + "epoch": 6.926552520722924, + "grad_norm": 0.1842917650938034, + "learning_rate": 1.3453764098382934e-05, + "loss": 3.8535, + "step": 101945 + }, + { + "epoch": 6.9268922407935865, + "grad_norm": 0.19255533814430237, + "learning_rate": 1.344951759749966e-05, + "loss": 3.8071, + "step": 101950 + }, + { + "epoch": 6.927231960864248, + "grad_norm": 0.16603496670722961, + "learning_rate": 1.3445271096616388e-05, + "loss": 3.6599, + "step": 101955 + }, + { + "epoch": 6.92757168093491, + "grad_norm": 0.4308161437511444, + "learning_rate": 1.3441024595733118e-05, + "loss": 3.9673, + "step": 101960 + }, + { + "epoch": 6.927911401005572, + "grad_norm": 0.46327322721481323, + "learning_rate": 1.3436778094849844e-05, + "loss": 3.9905, + "step": 101965 + }, + { + "epoch": 6.928251121076233, + "grad_norm": 0.16502508521080017, + "learning_rate": 1.343253159396657e-05, + "loss": 4.0932, + "step": 101970 + }, + { + "epoch": 6.928590841146895, + "grad_norm": 0.4417829215526581, + "learning_rate": 1.34282850930833e-05, + "loss": 4.0065, + "step": 101975 + }, + { + "epoch": 6.928930561217557, + "grad_norm": 0.16391998529434204, + "learning_rate": 1.3424038592200028e-05, + "loss": 3.8271, + "step": 101980 + }, + { + "epoch": 6.929270281288218, + "grad_norm": 0.20046702027320862, + "learning_rate": 1.3419792091316755e-05, + "loss": 4.0286, + "step": 101985 + }, + { + "epoch": 6.92961000135888, + "grad_norm": 0.2271331250667572, + "learning_rate": 1.3415545590433484e-05, + "loss": 3.6478, + "step": 101990 + }, + { + "epoch": 6.9299497214295425, + "grad_norm": 0.29651591181755066, + "learning_rate": 1.341129908955021e-05, + "loss": 3.7081, + "step": 101995 + }, + { + "epoch": 6.930289441500204, + "grad_norm": 0.17665253579616547, + "learning_rate": 1.3407052588666937e-05, + "loss": 4.1201, + "step": 102000 + }, + { + "epoch": 6.930629161570866, + "grad_norm": 1.1288182735443115, + "learning_rate": 1.3402806087783667e-05, + "loss": 3.8714, + "step": 102005 + }, + { + "epoch": 6.930968881641528, + "grad_norm": 0.2122366577386856, + "learning_rate": 1.3398559586900395e-05, + "loss": 3.9189, + "step": 102010 + }, + { + "epoch": 6.931308601712189, + "grad_norm": 0.17385758459568024, + "learning_rate": 1.3394313086017121e-05, + "loss": 3.9049, + "step": 102015 + }, + { + "epoch": 6.931648321782851, + "grad_norm": 0.13418154418468475, + "learning_rate": 1.339006658513385e-05, + "loss": 3.8029, + "step": 102020 + }, + { + "epoch": 6.931988041853513, + "grad_norm": 0.26238173246383667, + "learning_rate": 1.3385820084250577e-05, + "loss": 3.8433, + "step": 102025 + }, + { + "epoch": 6.932327761924174, + "grad_norm": 0.18239416182041168, + "learning_rate": 1.3381573583367307e-05, + "loss": 3.6387, + "step": 102030 + }, + { + "epoch": 6.932667481994836, + "grad_norm": 0.22318068146705627, + "learning_rate": 1.3377327082484035e-05, + "loss": 3.8985, + "step": 102035 + }, + { + "epoch": 6.9330072020654985, + "grad_norm": 0.21511252224445343, + "learning_rate": 1.3373080581600761e-05, + "loss": 3.9078, + "step": 102040 + }, + { + "epoch": 6.93334692213616, + "grad_norm": 0.17631690204143524, + "learning_rate": 1.3368834080717491e-05, + "loss": 3.798, + "step": 102045 + }, + { + "epoch": 6.933686642206822, + "grad_norm": 0.18817991018295288, + "learning_rate": 1.3364587579834217e-05, + "loss": 3.8312, + "step": 102050 + }, + { + "epoch": 6.934026362277484, + "grad_norm": 0.22091947495937347, + "learning_rate": 1.3360341078950944e-05, + "loss": 3.8687, + "step": 102055 + }, + { + "epoch": 6.934366082348145, + "grad_norm": 0.20010094344615936, + "learning_rate": 1.3356094578067673e-05, + "loss": 3.7832, + "step": 102060 + }, + { + "epoch": 6.934705802418807, + "grad_norm": 0.24460989236831665, + "learning_rate": 1.3351848077184401e-05, + "loss": 3.9119, + "step": 102065 + }, + { + "epoch": 6.935045522489469, + "grad_norm": 0.1971132755279541, + "learning_rate": 1.3347601576301128e-05, + "loss": 3.8534, + "step": 102070 + }, + { + "epoch": 6.93538524256013, + "grad_norm": 0.1915401667356491, + "learning_rate": 1.3343355075417857e-05, + "loss": 3.7292, + "step": 102075 + }, + { + "epoch": 6.935724962630792, + "grad_norm": 2.169985055923462, + "learning_rate": 1.3339108574534584e-05, + "loss": 3.8445, + "step": 102080 + }, + { + "epoch": 6.9360646827014545, + "grad_norm": 0.24871955811977386, + "learning_rate": 1.3334862073651312e-05, + "loss": 3.8773, + "step": 102085 + }, + { + "epoch": 6.936404402772116, + "grad_norm": 0.21507732570171356, + "learning_rate": 1.3330615572768041e-05, + "loss": 3.7607, + "step": 102090 + }, + { + "epoch": 6.936744122842778, + "grad_norm": 0.16345417499542236, + "learning_rate": 1.3326369071884768e-05, + "loss": 3.7738, + "step": 102095 + }, + { + "epoch": 6.93708384291344, + "grad_norm": 0.179193913936615, + "learning_rate": 1.3322122571001494e-05, + "loss": 3.5348, + "step": 102100 + }, + { + "epoch": 6.937423562984101, + "grad_norm": 0.2748263478279114, + "learning_rate": 1.3317876070118224e-05, + "loss": 3.6222, + "step": 102105 + }, + { + "epoch": 6.937763283054763, + "grad_norm": 0.1767999231815338, + "learning_rate": 1.331362956923495e-05, + "loss": 3.7792, + "step": 102110 + }, + { + "epoch": 6.938103003125424, + "grad_norm": 0.17983956634998322, + "learning_rate": 1.330938306835168e-05, + "loss": 3.6856, + "step": 102115 + }, + { + "epoch": 6.938442723196086, + "grad_norm": 0.15070010721683502, + "learning_rate": 1.3305136567468408e-05, + "loss": 3.9347, + "step": 102120 + }, + { + "epoch": 6.938782443266748, + "grad_norm": 0.18513505160808563, + "learning_rate": 1.3300890066585134e-05, + "loss": 3.7366, + "step": 102125 + }, + { + "epoch": 6.93912216333741, + "grad_norm": 0.14600205421447754, + "learning_rate": 1.3296643565701864e-05, + "loss": 3.923, + "step": 102130 + }, + { + "epoch": 6.939461883408072, + "grad_norm": 0.16951721906661987, + "learning_rate": 1.329239706481859e-05, + "loss": 3.9311, + "step": 102135 + }, + { + "epoch": 6.939801603478734, + "grad_norm": 0.15734103322029114, + "learning_rate": 1.3288150563935318e-05, + "loss": 3.8612, + "step": 102140 + }, + { + "epoch": 6.940141323549395, + "grad_norm": 0.2392294853925705, + "learning_rate": 1.3283904063052046e-05, + "loss": 3.8768, + "step": 102145 + }, + { + "epoch": 6.940481043620057, + "grad_norm": 0.16528818011283875, + "learning_rate": 1.3279657562168774e-05, + "loss": 3.919, + "step": 102150 + }, + { + "epoch": 6.940820763690719, + "grad_norm": 0.16950231790542603, + "learning_rate": 1.32754110612855e-05, + "loss": 3.8495, + "step": 102155 + }, + { + "epoch": 6.94116048376138, + "grad_norm": 0.1901455670595169, + "learning_rate": 1.327116456040223e-05, + "loss": 3.6687, + "step": 102160 + }, + { + "epoch": 6.941500203832042, + "grad_norm": 0.19468238949775696, + "learning_rate": 1.3266918059518956e-05, + "loss": 3.8442, + "step": 102165 + }, + { + "epoch": 6.941839923902704, + "grad_norm": 0.15663394331932068, + "learning_rate": 1.3262671558635684e-05, + "loss": 3.7007, + "step": 102170 + }, + { + "epoch": 6.942179643973366, + "grad_norm": 0.5229566097259521, + "learning_rate": 1.3258425057752414e-05, + "loss": 4.0792, + "step": 102175 + }, + { + "epoch": 6.942519364044028, + "grad_norm": 0.13422071933746338, + "learning_rate": 1.325417855686914e-05, + "loss": 3.7327, + "step": 102180 + }, + { + "epoch": 6.94285908411469, + "grad_norm": 0.1836128681898117, + "learning_rate": 1.3249932055985867e-05, + "loss": 3.9133, + "step": 102185 + }, + { + "epoch": 6.943198804185351, + "grad_norm": 0.15082482993602753, + "learning_rate": 1.3245685555102596e-05, + "loss": 3.7047, + "step": 102190 + }, + { + "epoch": 6.943538524256013, + "grad_norm": 0.2601660192012787, + "learning_rate": 1.3241439054219323e-05, + "loss": 3.7696, + "step": 102195 + }, + { + "epoch": 6.943878244326675, + "grad_norm": 0.2697978615760803, + "learning_rate": 1.3237192553336052e-05, + "loss": 3.7683, + "step": 102200 + }, + { + "epoch": 6.944217964397336, + "grad_norm": 0.1757582128047943, + "learning_rate": 1.323294605245278e-05, + "loss": 4.0115, + "step": 102205 + }, + { + "epoch": 6.944557684467998, + "grad_norm": 0.19751165807247162, + "learning_rate": 1.3228699551569507e-05, + "loss": 3.75, + "step": 102210 + }, + { + "epoch": 6.9448974045386604, + "grad_norm": 2.627371072769165, + "learning_rate": 1.3224453050686236e-05, + "loss": 3.9084, + "step": 102215 + }, + { + "epoch": 6.945237124609322, + "grad_norm": 0.15464161336421967, + "learning_rate": 1.3220206549802963e-05, + "loss": 3.7761, + "step": 102220 + }, + { + "epoch": 6.945576844679984, + "grad_norm": 0.1611202210187912, + "learning_rate": 1.321596004891969e-05, + "loss": 4.0709, + "step": 102225 + }, + { + "epoch": 6.945916564750646, + "grad_norm": 0.13850367069244385, + "learning_rate": 1.321171354803642e-05, + "loss": 3.9517, + "step": 102230 + }, + { + "epoch": 6.946256284821307, + "grad_norm": 0.16002623736858368, + "learning_rate": 1.3207467047153147e-05, + "loss": 3.8977, + "step": 102235 + }, + { + "epoch": 6.946596004891969, + "grad_norm": 0.1633204072713852, + "learning_rate": 1.3203220546269873e-05, + "loss": 3.9206, + "step": 102240 + }, + { + "epoch": 6.946935724962631, + "grad_norm": 0.13805823028087616, + "learning_rate": 1.3198974045386603e-05, + "loss": 3.6384, + "step": 102245 + }, + { + "epoch": 6.947275445033292, + "grad_norm": 0.7401549220085144, + "learning_rate": 1.3194727544503329e-05, + "loss": 3.9953, + "step": 102250 + }, + { + "epoch": 6.947615165103954, + "grad_norm": 1.9124748706817627, + "learning_rate": 1.3190481043620057e-05, + "loss": 4.005, + "step": 102255 + }, + { + "epoch": 6.9479548851746165, + "grad_norm": 0.15873458981513977, + "learning_rate": 1.3186234542736787e-05, + "loss": 3.8628, + "step": 102260 + }, + { + "epoch": 6.948294605245278, + "grad_norm": 0.1730349212884903, + "learning_rate": 1.3181988041853513e-05, + "loss": 3.8363, + "step": 102265 + }, + { + "epoch": 6.94863432531594, + "grad_norm": 0.1719636768102646, + "learning_rate": 1.317774154097024e-05, + "loss": 3.6582, + "step": 102270 + }, + { + "epoch": 6.948974045386602, + "grad_norm": 0.25570523738861084, + "learning_rate": 1.3173495040086969e-05, + "loss": 3.694, + "step": 102275 + }, + { + "epoch": 6.949313765457263, + "grad_norm": 0.14040975272655487, + "learning_rate": 1.3169248539203697e-05, + "loss": 3.9161, + "step": 102280 + }, + { + "epoch": 6.949653485527925, + "grad_norm": 0.16738612949848175, + "learning_rate": 1.3165002038320425e-05, + "loss": 3.7034, + "step": 102285 + }, + { + "epoch": 6.949993205598587, + "grad_norm": 0.1713159680366516, + "learning_rate": 1.3160755537437153e-05, + "loss": 3.7516, + "step": 102290 + }, + { + "epoch": 6.950332925669248, + "grad_norm": 0.17008282244205475, + "learning_rate": 1.315650903655388e-05, + "loss": 3.9267, + "step": 102295 + }, + { + "epoch": 6.95067264573991, + "grad_norm": 0.25813111662864685, + "learning_rate": 1.3152262535670609e-05, + "loss": 3.6, + "step": 102300 + }, + { + "epoch": 6.9510123658105725, + "grad_norm": 0.15394656360149384, + "learning_rate": 1.3148016034787335e-05, + "loss": 3.8647, + "step": 102305 + }, + { + "epoch": 6.951352085881234, + "grad_norm": 0.3749302923679352, + "learning_rate": 1.3143769533904063e-05, + "loss": 3.8911, + "step": 102310 + }, + { + "epoch": 6.951691805951896, + "grad_norm": 0.5580869913101196, + "learning_rate": 1.3139523033020793e-05, + "loss": 3.6885, + "step": 102315 + }, + { + "epoch": 6.952031526022557, + "grad_norm": 0.13772480189800262, + "learning_rate": 1.313527653213752e-05, + "loss": 3.8487, + "step": 102320 + }, + { + "epoch": 6.952371246093219, + "grad_norm": 0.1574840098619461, + "learning_rate": 1.3131030031254246e-05, + "loss": 3.8229, + "step": 102325 + }, + { + "epoch": 6.952710966163881, + "grad_norm": 0.17072561383247375, + "learning_rate": 1.3126783530370975e-05, + "loss": 3.851, + "step": 102330 + }, + { + "epoch": 6.953050686234542, + "grad_norm": 0.1888493150472641, + "learning_rate": 1.3122537029487702e-05, + "loss": 3.7653, + "step": 102335 + }, + { + "epoch": 6.953390406305204, + "grad_norm": 0.19034376740455627, + "learning_rate": 1.311829052860443e-05, + "loss": 3.8552, + "step": 102340 + }, + { + "epoch": 6.953730126375866, + "grad_norm": 0.24733632802963257, + "learning_rate": 1.311404402772116e-05, + "loss": 4.1153, + "step": 102345 + }, + { + "epoch": 6.954069846446528, + "grad_norm": 0.18456394970417023, + "learning_rate": 1.3109797526837886e-05, + "loss": 4.0077, + "step": 102350 + }, + { + "epoch": 6.95440956651719, + "grad_norm": 0.1541786789894104, + "learning_rate": 1.3105551025954612e-05, + "loss": 3.8288, + "step": 102355 + }, + { + "epoch": 6.954749286587852, + "grad_norm": 0.1624055802822113, + "learning_rate": 1.3101304525071342e-05, + "loss": 3.6018, + "step": 102360 + }, + { + "epoch": 6.955089006658513, + "grad_norm": 0.1978023499250412, + "learning_rate": 1.309705802418807e-05, + "loss": 3.9053, + "step": 102365 + }, + { + "epoch": 6.955428726729175, + "grad_norm": 0.23290222883224487, + "learning_rate": 1.30928115233048e-05, + "loss": 3.8847, + "step": 102370 + }, + { + "epoch": 6.955768446799837, + "grad_norm": 0.20844241976737976, + "learning_rate": 1.3088565022421526e-05, + "loss": 3.9231, + "step": 102375 + }, + { + "epoch": 6.956108166870498, + "grad_norm": 0.36049091815948486, + "learning_rate": 1.3084318521538252e-05, + "loss": 3.8417, + "step": 102380 + }, + { + "epoch": 6.95644788694116, + "grad_norm": 0.1394587755203247, + "learning_rate": 1.3080072020654982e-05, + "loss": 3.8618, + "step": 102385 + }, + { + "epoch": 6.956787607011822, + "grad_norm": 0.20294734835624695, + "learning_rate": 1.3075825519771708e-05, + "loss": 3.8198, + "step": 102390 + }, + { + "epoch": 6.957127327082484, + "grad_norm": 0.15417815744876862, + "learning_rate": 1.3071579018888436e-05, + "loss": 3.6934, + "step": 102395 + }, + { + "epoch": 6.957467047153146, + "grad_norm": 0.17474478483200073, + "learning_rate": 1.3067332518005166e-05, + "loss": 3.9006, + "step": 102400 + }, + { + "epoch": 6.957806767223808, + "grad_norm": 0.16099432110786438, + "learning_rate": 1.3063086017121892e-05, + "loss": 3.9772, + "step": 102405 + }, + { + "epoch": 6.958146487294469, + "grad_norm": 0.21112067997455597, + "learning_rate": 1.3058839516238618e-05, + "loss": 3.898, + "step": 102410 + }, + { + "epoch": 6.958486207365131, + "grad_norm": 0.29713794589042664, + "learning_rate": 1.3054593015355348e-05, + "loss": 3.6664, + "step": 102415 + }, + { + "epoch": 6.958825927435793, + "grad_norm": 0.17210105061531067, + "learning_rate": 1.3050346514472076e-05, + "loss": 3.7854, + "step": 102420 + }, + { + "epoch": 6.959165647506454, + "grad_norm": 0.16191557049751282, + "learning_rate": 1.3046100013588803e-05, + "loss": 3.9276, + "step": 102425 + }, + { + "epoch": 6.959505367577116, + "grad_norm": 0.1587989181280136, + "learning_rate": 1.3041853512705532e-05, + "loss": 3.8951, + "step": 102430 + }, + { + "epoch": 6.959845087647778, + "grad_norm": 0.2648528218269348, + "learning_rate": 1.3037607011822259e-05, + "loss": 3.884, + "step": 102435 + }, + { + "epoch": 6.96018480771844, + "grad_norm": 0.18769583106040955, + "learning_rate": 1.3033360510938985e-05, + "loss": 3.9402, + "step": 102440 + }, + { + "epoch": 6.960524527789102, + "grad_norm": 0.14795422554016113, + "learning_rate": 1.3029114010055715e-05, + "loss": 3.9491, + "step": 102445 + }, + { + "epoch": 6.960864247859764, + "grad_norm": 0.16330094635486603, + "learning_rate": 1.3024867509172443e-05, + "loss": 3.7307, + "step": 102450 + }, + { + "epoch": 6.961203967930425, + "grad_norm": 0.1879597157239914, + "learning_rate": 1.3020621008289172e-05, + "loss": 3.6476, + "step": 102455 + }, + { + "epoch": 6.961543688001087, + "grad_norm": 0.1615927666425705, + "learning_rate": 1.3016374507405899e-05, + "loss": 3.978, + "step": 102460 + }, + { + "epoch": 6.961883408071749, + "grad_norm": 0.3575038015842438, + "learning_rate": 1.3012128006522625e-05, + "loss": 3.8793, + "step": 102465 + }, + { + "epoch": 6.96222312814241, + "grad_norm": 0.1546420007944107, + "learning_rate": 1.3007881505639355e-05, + "loss": 3.5859, + "step": 102470 + }, + { + "epoch": 6.962562848213072, + "grad_norm": 0.15043097734451294, + "learning_rate": 1.3003635004756083e-05, + "loss": 3.6618, + "step": 102475 + }, + { + "epoch": 6.962902568283734, + "grad_norm": 0.22827880084514618, + "learning_rate": 1.2999388503872809e-05, + "loss": 3.6586, + "step": 102480 + }, + { + "epoch": 6.963242288354396, + "grad_norm": 0.13030578196048737, + "learning_rate": 1.2995142002989539e-05, + "loss": 3.9179, + "step": 102485 + }, + { + "epoch": 6.963582008425058, + "grad_norm": 0.18653558194637299, + "learning_rate": 1.2990895502106265e-05, + "loss": 3.8264, + "step": 102490 + }, + { + "epoch": 6.96392172849572, + "grad_norm": 0.1597052365541458, + "learning_rate": 1.2986649001222991e-05, + "loss": 4.0331, + "step": 102495 + }, + { + "epoch": 6.964261448566381, + "grad_norm": 0.202974334359169, + "learning_rate": 1.2982402500339721e-05, + "loss": 3.6315, + "step": 102500 + }, + { + "epoch": 6.964601168637043, + "grad_norm": 0.19012580811977386, + "learning_rate": 1.2978155999456449e-05, + "loss": 3.8271, + "step": 102505 + }, + { + "epoch": 6.964940888707705, + "grad_norm": 0.14242780208587646, + "learning_rate": 1.2973909498573175e-05, + "loss": 3.7775, + "step": 102510 + }, + { + "epoch": 6.965280608778366, + "grad_norm": 0.1901465803384781, + "learning_rate": 1.2969662997689905e-05, + "loss": 3.8197, + "step": 102515 + }, + { + "epoch": 6.965620328849028, + "grad_norm": 0.13892491161823273, + "learning_rate": 1.2965416496806631e-05, + "loss": 3.7379, + "step": 102520 + }, + { + "epoch": 6.9659600489196905, + "grad_norm": 0.1943463683128357, + "learning_rate": 1.296116999592336e-05, + "loss": 3.9572, + "step": 102525 + }, + { + "epoch": 6.966299768990352, + "grad_norm": 0.1770196259021759, + "learning_rate": 1.2956923495040087e-05, + "loss": 3.777, + "step": 102530 + }, + { + "epoch": 6.966639489061014, + "grad_norm": 0.18846552073955536, + "learning_rate": 1.2952676994156815e-05, + "loss": 3.8897, + "step": 102535 + }, + { + "epoch": 6.966979209131676, + "grad_norm": 0.15634439885616302, + "learning_rate": 1.2948430493273545e-05, + "loss": 4.038, + "step": 102540 + }, + { + "epoch": 6.967318929202337, + "grad_norm": 0.1842755824327469, + "learning_rate": 1.2944183992390271e-05, + "loss": 3.938, + "step": 102545 + }, + { + "epoch": 6.967658649272999, + "grad_norm": 0.254686564207077, + "learning_rate": 1.2939937491506998e-05, + "loss": 3.9088, + "step": 102550 + }, + { + "epoch": 6.967998369343661, + "grad_norm": 0.14828364551067352, + "learning_rate": 1.2935690990623727e-05, + "loss": 3.8467, + "step": 102555 + }, + { + "epoch": 6.968338089414322, + "grad_norm": 0.23513072729110718, + "learning_rate": 1.2931444489740455e-05, + "loss": 3.8613, + "step": 102560 + }, + { + "epoch": 6.968677809484984, + "grad_norm": 0.17502349615097046, + "learning_rate": 1.2927197988857182e-05, + "loss": 3.9684, + "step": 102565 + }, + { + "epoch": 6.9690175295556465, + "grad_norm": 0.21856984496116638, + "learning_rate": 1.2922951487973911e-05, + "loss": 3.8585, + "step": 102570 + }, + { + "epoch": 6.969357249626308, + "grad_norm": 0.19097043573856354, + "learning_rate": 1.2918704987090638e-05, + "loss": 3.7047, + "step": 102575 + }, + { + "epoch": 6.96969696969697, + "grad_norm": 0.16223180294036865, + "learning_rate": 1.2914458486207364e-05, + "loss": 3.8572, + "step": 102580 + }, + { + "epoch": 6.970036689767632, + "grad_norm": 0.2519015669822693, + "learning_rate": 1.2910211985324094e-05, + "loss": 3.9544, + "step": 102585 + }, + { + "epoch": 6.970376409838293, + "grad_norm": 0.20242473483085632, + "learning_rate": 1.2905965484440822e-05, + "loss": 3.5906, + "step": 102590 + }, + { + "epoch": 6.970716129908955, + "grad_norm": 0.24166321754455566, + "learning_rate": 1.2901718983557548e-05, + "loss": 3.6963, + "step": 102595 + }, + { + "epoch": 6.971055849979617, + "grad_norm": 0.20001651346683502, + "learning_rate": 1.2897472482674278e-05, + "loss": 3.7584, + "step": 102600 + }, + { + "epoch": 6.971395570050278, + "grad_norm": 0.23148514330387115, + "learning_rate": 1.2893225981791004e-05, + "loss": 3.98, + "step": 102605 + }, + { + "epoch": 6.97173529012094, + "grad_norm": 0.2625155448913574, + "learning_rate": 1.2888979480907732e-05, + "loss": 3.5262, + "step": 102610 + }, + { + "epoch": 6.9720750101916025, + "grad_norm": 0.17699868977069855, + "learning_rate": 1.2884732980024462e-05, + "loss": 3.8051, + "step": 102615 + }, + { + "epoch": 6.972414730262264, + "grad_norm": 0.15838012099266052, + "learning_rate": 1.2880486479141188e-05, + "loss": 4.0402, + "step": 102620 + }, + { + "epoch": 6.972754450332926, + "grad_norm": 0.16933996975421906, + "learning_rate": 1.2876239978257918e-05, + "loss": 3.8263, + "step": 102625 + }, + { + "epoch": 6.973094170403588, + "grad_norm": 0.210008904337883, + "learning_rate": 1.2871993477374644e-05, + "loss": 3.8139, + "step": 102630 + }, + { + "epoch": 6.973433890474249, + "grad_norm": 0.19213086366653442, + "learning_rate": 1.286774697649137e-05, + "loss": 3.7876, + "step": 102635 + }, + { + "epoch": 6.973773610544911, + "grad_norm": 0.20805661380290985, + "learning_rate": 1.28635004756081e-05, + "loss": 3.7928, + "step": 102640 + }, + { + "epoch": 6.974113330615573, + "grad_norm": 0.19587168097496033, + "learning_rate": 1.2859253974724828e-05, + "loss": 3.9841, + "step": 102645 + }, + { + "epoch": 6.974453050686234, + "grad_norm": 0.3091214597225189, + "learning_rate": 1.2855007473841554e-05, + "loss": 3.6729, + "step": 102650 + }, + { + "epoch": 6.974792770756896, + "grad_norm": 0.45923539996147156, + "learning_rate": 1.2850760972958284e-05, + "loss": 3.7331, + "step": 102655 + }, + { + "epoch": 6.9751324908275585, + "grad_norm": 0.22412021458148956, + "learning_rate": 1.284651447207501e-05, + "loss": 3.525, + "step": 102660 + }, + { + "epoch": 6.97547221089822, + "grad_norm": 0.16173331439495087, + "learning_rate": 1.2842267971191738e-05, + "loss": 3.729, + "step": 102665 + }, + { + "epoch": 6.975811930968882, + "grad_norm": 0.19364891946315765, + "learning_rate": 1.2838021470308466e-05, + "loss": 3.8844, + "step": 102670 + }, + { + "epoch": 6.976151651039544, + "grad_norm": 0.1732490360736847, + "learning_rate": 1.2833774969425194e-05, + "loss": 3.8756, + "step": 102675 + }, + { + "epoch": 6.976491371110205, + "grad_norm": 0.16973495483398438, + "learning_rate": 1.282952846854192e-05, + "loss": 3.7401, + "step": 102680 + }, + { + "epoch": 6.976831091180867, + "grad_norm": 0.18181170523166656, + "learning_rate": 1.282528196765865e-05, + "loss": 3.8638, + "step": 102685 + }, + { + "epoch": 6.977170811251529, + "grad_norm": 0.1925974041223526, + "learning_rate": 1.2821035466775377e-05, + "loss": 3.71, + "step": 102690 + }, + { + "epoch": 6.97751053132219, + "grad_norm": 1.2192015647888184, + "learning_rate": 1.2816788965892105e-05, + "loss": 3.8457, + "step": 102695 + }, + { + "epoch": 6.977850251392852, + "grad_norm": 0.17740970849990845, + "learning_rate": 1.2812542465008834e-05, + "loss": 3.6715, + "step": 102700 + }, + { + "epoch": 6.9781899714635145, + "grad_norm": 0.1495693475008011, + "learning_rate": 1.280829596412556e-05, + "loss": 3.8055, + "step": 102705 + }, + { + "epoch": 6.978529691534176, + "grad_norm": 0.1631336212158203, + "learning_rate": 1.280404946324229e-05, + "loss": 3.6913, + "step": 102710 + }, + { + "epoch": 6.978869411604838, + "grad_norm": 0.20166777074337006, + "learning_rate": 1.2799802962359017e-05, + "loss": 3.7469, + "step": 102715 + }, + { + "epoch": 6.9792091316755, + "grad_norm": 0.27848297357559204, + "learning_rate": 1.2795556461475743e-05, + "loss": 3.7336, + "step": 102720 + }, + { + "epoch": 6.979548851746161, + "grad_norm": 0.16607390344142914, + "learning_rate": 1.2791309960592473e-05, + "loss": 3.779, + "step": 102725 + }, + { + "epoch": 6.979888571816823, + "grad_norm": 0.18363402783870697, + "learning_rate": 1.27870634597092e-05, + "loss": 3.9061, + "step": 102730 + }, + { + "epoch": 6.980228291887485, + "grad_norm": 0.15466059744358063, + "learning_rate": 1.2782816958825927e-05, + "loss": 3.7684, + "step": 102735 + }, + { + "epoch": 6.980568011958146, + "grad_norm": 0.20058554410934448, + "learning_rate": 1.2778570457942657e-05, + "loss": 3.7539, + "step": 102740 + }, + { + "epoch": 6.980907732028808, + "grad_norm": 0.2866654396057129, + "learning_rate": 1.2774323957059383e-05, + "loss": 4.0047, + "step": 102745 + }, + { + "epoch": 6.9812474520994705, + "grad_norm": 0.1809672862291336, + "learning_rate": 1.2770077456176111e-05, + "loss": 3.9538, + "step": 102750 + }, + { + "epoch": 6.981587172170132, + "grad_norm": 0.2011355608701706, + "learning_rate": 1.2765830955292841e-05, + "loss": 3.9086, + "step": 102755 + }, + { + "epoch": 6.981926892240794, + "grad_norm": 0.19804848730564117, + "learning_rate": 1.2761584454409567e-05, + "loss": 3.6539, + "step": 102760 + }, + { + "epoch": 6.982266612311456, + "grad_norm": 0.16944608092308044, + "learning_rate": 1.2757337953526293e-05, + "loss": 3.8288, + "step": 102765 + }, + { + "epoch": 6.982606332382117, + "grad_norm": 0.15334542095661163, + "learning_rate": 1.2753091452643023e-05, + "loss": 3.8865, + "step": 102770 + }, + { + "epoch": 6.982946052452779, + "grad_norm": 0.19304445385932922, + "learning_rate": 1.274884495175975e-05, + "loss": 3.7218, + "step": 102775 + }, + { + "epoch": 6.983285772523441, + "grad_norm": 0.17188850045204163, + "learning_rate": 1.2744598450876478e-05, + "loss": 3.8732, + "step": 102780 + }, + { + "epoch": 6.983625492594102, + "grad_norm": 0.11268184334039688, + "learning_rate": 1.2740351949993207e-05, + "loss": 3.7391, + "step": 102785 + }, + { + "epoch": 6.983965212664764, + "grad_norm": 0.17559368908405304, + "learning_rate": 1.2736105449109934e-05, + "loss": 3.564, + "step": 102790 + }, + { + "epoch": 6.984304932735426, + "grad_norm": 0.1843075156211853, + "learning_rate": 1.2731858948226663e-05, + "loss": 3.8682, + "step": 102795 + }, + { + "epoch": 6.984644652806088, + "grad_norm": 0.15194900333881378, + "learning_rate": 1.272761244734339e-05, + "loss": 3.961, + "step": 102800 + }, + { + "epoch": 6.98498437287675, + "grad_norm": 0.1717453896999359, + "learning_rate": 1.2723365946460118e-05, + "loss": 3.8093, + "step": 102805 + }, + { + "epoch": 6.985324092947411, + "grad_norm": 0.1439923197031021, + "learning_rate": 1.2719119445576847e-05, + "loss": 3.9107, + "step": 102810 + }, + { + "epoch": 6.985663813018073, + "grad_norm": 0.18208830058574677, + "learning_rate": 1.2714872944693574e-05, + "loss": 3.8114, + "step": 102815 + }, + { + "epoch": 6.986003533088735, + "grad_norm": 0.22709141671657562, + "learning_rate": 1.27106264438103e-05, + "loss": 3.783, + "step": 102820 + }, + { + "epoch": 6.986343253159396, + "grad_norm": 0.3070448040962219, + "learning_rate": 1.270637994292703e-05, + "loss": 3.8303, + "step": 102825 + }, + { + "epoch": 6.986682973230058, + "grad_norm": 0.18988469243049622, + "learning_rate": 1.2702133442043756e-05, + "loss": 4.0049, + "step": 102830 + }, + { + "epoch": 6.9870226933007205, + "grad_norm": 0.23066161572933197, + "learning_rate": 1.2697886941160484e-05, + "loss": 3.8916, + "step": 102835 + }, + { + "epoch": 6.987362413371382, + "grad_norm": 0.19694529473781586, + "learning_rate": 1.2693640440277214e-05, + "loss": 3.8496, + "step": 102840 + }, + { + "epoch": 6.987702133442044, + "grad_norm": 0.29330530762672424, + "learning_rate": 1.268939393939394e-05, + "loss": 3.9288, + "step": 102845 + }, + { + "epoch": 6.988041853512706, + "grad_norm": 0.1508011668920517, + "learning_rate": 1.2685147438510666e-05, + "loss": 3.5791, + "step": 102850 + }, + { + "epoch": 6.988381573583367, + "grad_norm": 0.18975386023521423, + "learning_rate": 1.2680900937627396e-05, + "loss": 3.5647, + "step": 102855 + }, + { + "epoch": 6.988721293654029, + "grad_norm": 0.1891380399465561, + "learning_rate": 1.2676654436744124e-05, + "loss": 3.768, + "step": 102860 + }, + { + "epoch": 6.989061013724691, + "grad_norm": 0.15943272411823273, + "learning_rate": 1.267240793586085e-05, + "loss": 3.6021, + "step": 102865 + }, + { + "epoch": 6.989400733795352, + "grad_norm": 0.2677859961986542, + "learning_rate": 1.266816143497758e-05, + "loss": 3.7436, + "step": 102870 + }, + { + "epoch": 6.989740453866014, + "grad_norm": 0.1854143738746643, + "learning_rate": 1.2663914934094306e-05, + "loss": 3.6101, + "step": 102875 + }, + { + "epoch": 6.9900801739366765, + "grad_norm": 0.1149207130074501, + "learning_rate": 1.2659668433211036e-05, + "loss": 3.813, + "step": 102880 + }, + { + "epoch": 6.990419894007338, + "grad_norm": 0.3378344774246216, + "learning_rate": 1.2655421932327762e-05, + "loss": 3.9013, + "step": 102885 + }, + { + "epoch": 6.990759614078, + "grad_norm": 0.14309751987457275, + "learning_rate": 1.265117543144449e-05, + "loss": 3.7537, + "step": 102890 + }, + { + "epoch": 6.991099334148662, + "grad_norm": 0.1682460457086563, + "learning_rate": 1.264692893056122e-05, + "loss": 3.7068, + "step": 102895 + }, + { + "epoch": 6.991439054219323, + "grad_norm": 0.18525506556034088, + "learning_rate": 1.2642682429677946e-05, + "loss": 3.9102, + "step": 102900 + }, + { + "epoch": 6.991778774289985, + "grad_norm": 0.19433623552322388, + "learning_rate": 1.2638435928794673e-05, + "loss": 3.729, + "step": 102905 + }, + { + "epoch": 6.992118494360647, + "grad_norm": 0.17130152881145477, + "learning_rate": 1.2634189427911402e-05, + "loss": 4.1646, + "step": 102910 + }, + { + "epoch": 6.992458214431308, + "grad_norm": 0.18504507839679718, + "learning_rate": 1.2629942927028129e-05, + "loss": 4.142, + "step": 102915 + }, + { + "epoch": 6.99279793450197, + "grad_norm": 0.2120939940214157, + "learning_rate": 1.2625696426144857e-05, + "loss": 3.9126, + "step": 102920 + }, + { + "epoch": 6.9931376545726325, + "grad_norm": 0.20866519212722778, + "learning_rate": 1.2621449925261586e-05, + "loss": 3.9805, + "step": 102925 + }, + { + "epoch": 6.993477374643294, + "grad_norm": 0.1797594428062439, + "learning_rate": 1.2617203424378313e-05, + "loss": 3.9252, + "step": 102930 + }, + { + "epoch": 6.993817094713956, + "grad_norm": 0.19182339310646057, + "learning_rate": 1.2612956923495039e-05, + "loss": 4.0155, + "step": 102935 + }, + { + "epoch": 6.994156814784618, + "grad_norm": 0.21029838919639587, + "learning_rate": 1.2608710422611769e-05, + "loss": 3.7639, + "step": 102940 + }, + { + "epoch": 6.994496534855279, + "grad_norm": 0.1551222801208496, + "learning_rate": 1.2604463921728497e-05, + "loss": 4.0712, + "step": 102945 + }, + { + "epoch": 6.994836254925941, + "grad_norm": 0.2000609189271927, + "learning_rate": 1.2600217420845223e-05, + "loss": 3.8007, + "step": 102950 + }, + { + "epoch": 6.995175974996603, + "grad_norm": 0.1596136838197708, + "learning_rate": 1.2595970919961953e-05, + "loss": 3.7656, + "step": 102955 + }, + { + "epoch": 6.995515695067264, + "grad_norm": 0.3635205924510956, + "learning_rate": 1.2591724419078679e-05, + "loss": 3.8769, + "step": 102960 + }, + { + "epoch": 6.995855415137926, + "grad_norm": 0.15603014826774597, + "learning_rate": 1.2587477918195409e-05, + "loss": 3.9132, + "step": 102965 + }, + { + "epoch": 6.9961951352085885, + "grad_norm": 0.15893873572349548, + "learning_rate": 1.2583231417312135e-05, + "loss": 3.8166, + "step": 102970 + }, + { + "epoch": 6.99653485527925, + "grad_norm": 0.18160206079483032, + "learning_rate": 1.2578984916428863e-05, + "loss": 3.9808, + "step": 102975 + }, + { + "epoch": 6.996874575349912, + "grad_norm": 0.13561676442623138, + "learning_rate": 1.2574738415545593e-05, + "loss": 3.9991, + "step": 102980 + }, + { + "epoch": 6.997214295420574, + "grad_norm": 0.18926411867141724, + "learning_rate": 1.2570491914662319e-05, + "loss": 4.0326, + "step": 102985 + }, + { + "epoch": 6.997554015491235, + "grad_norm": 0.1498519629240036, + "learning_rate": 1.2566245413779045e-05, + "loss": 3.819, + "step": 102990 + }, + { + "epoch": 6.997893735561897, + "grad_norm": 0.1619308441877365, + "learning_rate": 1.2561998912895775e-05, + "loss": 3.7546, + "step": 102995 + }, + { + "epoch": 6.998233455632558, + "grad_norm": 0.20492412149906158, + "learning_rate": 1.2557752412012503e-05, + "loss": 4.0496, + "step": 103000 + }, + { + "epoch": 6.99857317570322, + "grad_norm": 0.6886380910873413, + "learning_rate": 1.255350591112923e-05, + "loss": 3.7604, + "step": 103005 + }, + { + "epoch": 6.998912895773882, + "grad_norm": 0.1772388070821762, + "learning_rate": 1.2549259410245959e-05, + "loss": 3.8132, + "step": 103010 + }, + { + "epoch": 6.999252615844544, + "grad_norm": 0.20301702618598938, + "learning_rate": 1.2545012909362685e-05, + "loss": 3.8879, + "step": 103015 + }, + { + "epoch": 6.999592335915206, + "grad_norm": 0.3568095862865448, + "learning_rate": 1.2540766408479412e-05, + "loss": 3.9247, + "step": 103020 + }, + { + "epoch": 6.999932055985868, + "grad_norm": 0.1485377997159958, + "learning_rate": 1.2536519907596141e-05, + "loss": 3.6976, + "step": 103025 + }, + { + "epoch": 7.0, + "eval_bertscore": { + "f1": 0.8522935561089914, + "precision": 0.8757816989739073, + "recall": 0.8303734835052065 + }, + "eval_bleu_4": 0.0019262513522988476, + "eval_exact_match": 0.0, + "eval_loss": 3.642313241958618, + "eval_meteor": 0.07540000873040434, + "eval_rouge": { + "rouge1": 0.12305378099419627, + "rouge2": 0.015327620255008351, + "rougeL": 0.10881049035978002, + "rougeLsum": 0.1088447698622804 + }, + "eval_runtime": 373.0938, + "eval_samples_per_second": 27.658, + "eval_steps_per_second": 3.458, + "step": 103026 + }, + { + "epoch": 7.00027177605653, + "grad_norm": 0.15122146904468536, + "learning_rate": 1.253227340671287e-05, + "loss": 3.8464, + "step": 103030 + }, + { + "epoch": 7.000611496127191, + "grad_norm": 0.23279964923858643, + "learning_rate": 1.2528026905829596e-05, + "loss": 3.8203, + "step": 103035 + }, + { + "epoch": 7.000951216197853, + "grad_norm": 0.1994272917509079, + "learning_rate": 1.2523780404946325e-05, + "loss": 3.8785, + "step": 103040 + }, + { + "epoch": 7.001290936268515, + "grad_norm": 0.17985416948795319, + "learning_rate": 1.2519533904063052e-05, + "loss": 3.82, + "step": 103045 + }, + { + "epoch": 7.001630656339176, + "grad_norm": 0.18433435261249542, + "learning_rate": 1.2515287403179781e-05, + "loss": 3.856, + "step": 103050 + }, + { + "epoch": 7.001970376409838, + "grad_norm": 0.19044512510299683, + "learning_rate": 1.2511040902296508e-05, + "loss": 4.0034, + "step": 103055 + }, + { + "epoch": 7.0023100964805, + "grad_norm": 0.17574350535869598, + "learning_rate": 1.2506794401413236e-05, + "loss": 3.7429, + "step": 103060 + }, + { + "epoch": 7.002649816551162, + "grad_norm": 0.21661590039730072, + "learning_rate": 1.2502547900529965e-05, + "loss": 3.9023, + "step": 103065 + }, + { + "epoch": 7.002989536621824, + "grad_norm": 0.1802130937576294, + "learning_rate": 1.2498301399646692e-05, + "loss": 3.6167, + "step": 103070 + }, + { + "epoch": 7.003329256692485, + "grad_norm": 0.18026600778102875, + "learning_rate": 1.249405489876342e-05, + "loss": 3.681, + "step": 103075 + }, + { + "epoch": 7.003668976763147, + "grad_norm": 0.1868506520986557, + "learning_rate": 1.2489808397880146e-05, + "loss": 3.7127, + "step": 103080 + }, + { + "epoch": 7.004008696833809, + "grad_norm": 0.24221523106098175, + "learning_rate": 1.2485561896996876e-05, + "loss": 4.0969, + "step": 103085 + }, + { + "epoch": 7.00434841690447, + "grad_norm": 0.219639390707016, + "learning_rate": 1.2481315396113604e-05, + "loss": 3.8913, + "step": 103090 + }, + { + "epoch": 7.004688136975132, + "grad_norm": 0.1436060667037964, + "learning_rate": 1.247706889523033e-05, + "loss": 3.4811, + "step": 103095 + }, + { + "epoch": 7.0050278570457944, + "grad_norm": 0.7689054608345032, + "learning_rate": 1.2472822394347058e-05, + "loss": 3.9187, + "step": 103100 + }, + { + "epoch": 7.005367577116456, + "grad_norm": 0.15715767443180084, + "learning_rate": 1.2468575893463786e-05, + "loss": 3.8198, + "step": 103105 + }, + { + "epoch": 7.005707297187118, + "grad_norm": 0.1779133379459381, + "learning_rate": 1.2464329392580514e-05, + "loss": 3.6828, + "step": 103110 + }, + { + "epoch": 7.00604701725778, + "grad_norm": 0.2263088971376419, + "learning_rate": 1.2460082891697242e-05, + "loss": 3.8908, + "step": 103115 + }, + { + "epoch": 7.006386737328441, + "grad_norm": 0.23675735294818878, + "learning_rate": 1.245583639081397e-05, + "loss": 3.7914, + "step": 103120 + }, + { + "epoch": 7.006726457399103, + "grad_norm": 0.2522970736026764, + "learning_rate": 1.2451589889930698e-05, + "loss": 3.7339, + "step": 103125 + }, + { + "epoch": 7.007066177469765, + "grad_norm": 0.19308921694755554, + "learning_rate": 1.2447343389047425e-05, + "loss": 3.9749, + "step": 103130 + }, + { + "epoch": 7.007405897540426, + "grad_norm": 1.3060708045959473, + "learning_rate": 1.2443096888164153e-05, + "loss": 3.7941, + "step": 103135 + }, + { + "epoch": 7.007745617611088, + "grad_norm": 0.17767837643623352, + "learning_rate": 1.2438850387280882e-05, + "loss": 3.7185, + "step": 103140 + }, + { + "epoch": 7.0080853376817505, + "grad_norm": 0.23698288202285767, + "learning_rate": 1.2434603886397609e-05, + "loss": 3.967, + "step": 103145 + }, + { + "epoch": 7.008425057752412, + "grad_norm": 0.20010590553283691, + "learning_rate": 1.2430357385514337e-05, + "loss": 3.7025, + "step": 103150 + }, + { + "epoch": 7.008764777823074, + "grad_norm": 0.214913472533226, + "learning_rate": 1.2426110884631065e-05, + "loss": 3.9224, + "step": 103155 + }, + { + "epoch": 7.009104497893736, + "grad_norm": 0.18161365389823914, + "learning_rate": 1.2421864383747793e-05, + "loss": 3.7807, + "step": 103160 + }, + { + "epoch": 7.009444217964397, + "grad_norm": 0.2522965669631958, + "learning_rate": 1.241761788286452e-05, + "loss": 3.7508, + "step": 103165 + }, + { + "epoch": 7.009783938035059, + "grad_norm": 0.16726846992969513, + "learning_rate": 1.2413371381981249e-05, + "loss": 3.8481, + "step": 103170 + }, + { + "epoch": 7.010123658105721, + "grad_norm": 0.16622821986675262, + "learning_rate": 1.2409124881097977e-05, + "loss": 3.4546, + "step": 103175 + }, + { + "epoch": 7.010463378176382, + "grad_norm": 0.3607472777366638, + "learning_rate": 1.2404878380214703e-05, + "loss": 3.9795, + "step": 103180 + }, + { + "epoch": 7.010803098247044, + "grad_norm": 0.29092705249786377, + "learning_rate": 1.2400631879331431e-05, + "loss": 3.804, + "step": 103185 + }, + { + "epoch": 7.0111428183177065, + "grad_norm": 0.22476507723331451, + "learning_rate": 1.2396385378448159e-05, + "loss": 3.9005, + "step": 103190 + }, + { + "epoch": 7.011482538388368, + "grad_norm": 0.14697514474391937, + "learning_rate": 1.2392138877564889e-05, + "loss": 3.8093, + "step": 103195 + }, + { + "epoch": 7.01182225845903, + "grad_norm": 0.1591273546218872, + "learning_rate": 1.2387892376681615e-05, + "loss": 4.0434, + "step": 103200 + }, + { + "epoch": 7.012161978529692, + "grad_norm": 0.15220585465431213, + "learning_rate": 1.2383645875798343e-05, + "loss": 3.8537, + "step": 103205 + }, + { + "epoch": 7.012501698600353, + "grad_norm": 0.13574472069740295, + "learning_rate": 1.2379399374915071e-05, + "loss": 3.7313, + "step": 103210 + }, + { + "epoch": 7.012841418671015, + "grad_norm": 0.6598378419876099, + "learning_rate": 1.2375152874031797e-05, + "loss": 3.7635, + "step": 103215 + }, + { + "epoch": 7.013181138741677, + "grad_norm": 0.17059387266635895, + "learning_rate": 1.2370906373148527e-05, + "loss": 3.5201, + "step": 103220 + }, + { + "epoch": 7.013520858812338, + "grad_norm": 0.1539117842912674, + "learning_rate": 1.2366659872265255e-05, + "loss": 3.8697, + "step": 103225 + }, + { + "epoch": 7.013860578883, + "grad_norm": 9.408555030822754, + "learning_rate": 1.2362413371381981e-05, + "loss": 3.6956, + "step": 103230 + }, + { + "epoch": 7.0142002989536625, + "grad_norm": 1.6300021409988403, + "learning_rate": 1.235816687049871e-05, + "loss": 3.8237, + "step": 103235 + }, + { + "epoch": 7.014540019024324, + "grad_norm": 0.16441649198532104, + "learning_rate": 1.2353920369615437e-05, + "loss": 3.7861, + "step": 103240 + }, + { + "epoch": 7.014879739094986, + "grad_norm": 2.699077606201172, + "learning_rate": 1.2349673868732165e-05, + "loss": 3.9374, + "step": 103245 + }, + { + "epoch": 7.015219459165648, + "grad_norm": 0.20791803300380707, + "learning_rate": 1.2345427367848893e-05, + "loss": 3.7622, + "step": 103250 + }, + { + "epoch": 7.015559179236309, + "grad_norm": 0.1972043663263321, + "learning_rate": 1.2341180866965621e-05, + "loss": 3.903, + "step": 103255 + }, + { + "epoch": 7.015898899306971, + "grad_norm": 0.15285180509090424, + "learning_rate": 1.233693436608235e-05, + "loss": 4.1309, + "step": 103260 + }, + { + "epoch": 7.016238619377633, + "grad_norm": 0.18965089321136475, + "learning_rate": 1.2332687865199076e-05, + "loss": 3.8092, + "step": 103265 + }, + { + "epoch": 7.016578339448294, + "grad_norm": 0.200117826461792, + "learning_rate": 1.2328441364315804e-05, + "loss": 3.7626, + "step": 103270 + }, + { + "epoch": 7.016918059518956, + "grad_norm": 0.17087186872959137, + "learning_rate": 1.2324194863432532e-05, + "loss": 3.7864, + "step": 103275 + }, + { + "epoch": 7.0172577795896185, + "grad_norm": 0.19536085426807404, + "learning_rate": 1.2319948362549261e-05, + "loss": 3.8819, + "step": 103280 + }, + { + "epoch": 7.01759749966028, + "grad_norm": 0.17225128412246704, + "learning_rate": 1.2315701861665988e-05, + "loss": 3.7301, + "step": 103285 + }, + { + "epoch": 7.017937219730942, + "grad_norm": 0.19976790249347687, + "learning_rate": 1.2311455360782716e-05, + "loss": 3.9574, + "step": 103290 + }, + { + "epoch": 7.018276939801604, + "grad_norm": 0.1405845731496811, + "learning_rate": 1.2307208859899444e-05, + "loss": 3.5933, + "step": 103295 + }, + { + "epoch": 7.018616659872265, + "grad_norm": 0.16376478970050812, + "learning_rate": 1.230296235901617e-05, + "loss": 3.5304, + "step": 103300 + }, + { + "epoch": 7.018956379942927, + "grad_norm": 0.17087191343307495, + "learning_rate": 1.22987158581329e-05, + "loss": 3.8797, + "step": 103305 + }, + { + "epoch": 7.019296100013589, + "grad_norm": 0.49114879965782166, + "learning_rate": 1.2294469357249628e-05, + "loss": 3.7044, + "step": 103310 + }, + { + "epoch": 7.01963582008425, + "grad_norm": 0.2003157138824463, + "learning_rate": 1.2290222856366354e-05, + "loss": 3.6405, + "step": 103315 + }, + { + "epoch": 7.019975540154912, + "grad_norm": 0.18145227432250977, + "learning_rate": 1.2285976355483082e-05, + "loss": 3.6429, + "step": 103320 + }, + { + "epoch": 7.0203152602255745, + "grad_norm": 0.1608065813779831, + "learning_rate": 1.228172985459981e-05, + "loss": 4.0887, + "step": 103325 + }, + { + "epoch": 7.020654980296236, + "grad_norm": 0.20594750344753265, + "learning_rate": 1.2277483353716538e-05, + "loss": 4.1257, + "step": 103330 + }, + { + "epoch": 7.020994700366898, + "grad_norm": 0.2863234877586365, + "learning_rate": 1.2273236852833266e-05, + "loss": 3.7958, + "step": 103335 + }, + { + "epoch": 7.02133442043756, + "grad_norm": 0.2471047043800354, + "learning_rate": 1.2268990351949994e-05, + "loss": 3.8701, + "step": 103340 + }, + { + "epoch": 7.021674140508221, + "grad_norm": 0.16556525230407715, + "learning_rate": 1.2264743851066722e-05, + "loss": 3.7775, + "step": 103345 + }, + { + "epoch": 7.022013860578883, + "grad_norm": 0.23831486701965332, + "learning_rate": 1.2260497350183448e-05, + "loss": 3.9302, + "step": 103350 + }, + { + "epoch": 7.022353580649545, + "grad_norm": 0.20987054705619812, + "learning_rate": 1.2256250849300176e-05, + "loss": 3.6963, + "step": 103355 + }, + { + "epoch": 7.022693300720206, + "grad_norm": 0.4124690890312195, + "learning_rate": 1.2252004348416906e-05, + "loss": 3.8815, + "step": 103360 + }, + { + "epoch": 7.023033020790868, + "grad_norm": 0.14928168058395386, + "learning_rate": 1.2247757847533634e-05, + "loss": 3.7284, + "step": 103365 + }, + { + "epoch": 7.0233727408615305, + "grad_norm": 0.17902329564094543, + "learning_rate": 1.224351134665036e-05, + "loss": 4.0451, + "step": 103370 + }, + { + "epoch": 7.023712460932192, + "grad_norm": 0.15806178748607635, + "learning_rate": 1.2239264845767088e-05, + "loss": 3.8284, + "step": 103375 + }, + { + "epoch": 7.024052181002854, + "grad_norm": 0.17242112755775452, + "learning_rate": 1.2235018344883816e-05, + "loss": 3.6998, + "step": 103380 + }, + { + "epoch": 7.024391901073516, + "grad_norm": 0.18118461966514587, + "learning_rate": 1.2230771844000544e-05, + "loss": 4.009, + "step": 103385 + }, + { + "epoch": 7.024731621144177, + "grad_norm": 0.1287001222372055, + "learning_rate": 1.2226525343117272e-05, + "loss": 3.7809, + "step": 103390 + }, + { + "epoch": 7.025071341214839, + "grad_norm": 0.15584129095077515, + "learning_rate": 1.2222278842234e-05, + "loss": 3.5862, + "step": 103395 + }, + { + "epoch": 7.025411061285501, + "grad_norm": 0.16834822297096252, + "learning_rate": 1.2218032341350727e-05, + "loss": 3.7405, + "step": 103400 + }, + { + "epoch": 7.025750781356162, + "grad_norm": 0.15048909187316895, + "learning_rate": 1.2213785840467455e-05, + "loss": 3.8518, + "step": 103405 + }, + { + "epoch": 7.0260905014268245, + "grad_norm": 0.24462050199508667, + "learning_rate": 1.2209539339584183e-05, + "loss": 3.894, + "step": 103410 + }, + { + "epoch": 7.026430221497486, + "grad_norm": 0.1751563996076584, + "learning_rate": 1.220529283870091e-05, + "loss": 3.9251, + "step": 103415 + }, + { + "epoch": 7.026769941568148, + "grad_norm": 0.15738394856452942, + "learning_rate": 1.2201046337817639e-05, + "loss": 3.9088, + "step": 103420 + }, + { + "epoch": 7.02710966163881, + "grad_norm": 1.614125370979309, + "learning_rate": 1.2196799836934367e-05, + "loss": 3.78, + "step": 103425 + }, + { + "epoch": 7.027449381709471, + "grad_norm": 0.16846589744091034, + "learning_rate": 1.2192553336051095e-05, + "loss": 3.8825, + "step": 103430 + }, + { + "epoch": 7.027789101780133, + "grad_norm": 0.1756429821252823, + "learning_rate": 1.2188306835167821e-05, + "loss": 3.9519, + "step": 103435 + }, + { + "epoch": 7.028128821850795, + "grad_norm": 0.18350806832313538, + "learning_rate": 1.2184060334284549e-05, + "loss": 3.8476, + "step": 103440 + }, + { + "epoch": 7.028468541921456, + "grad_norm": 0.1464817374944687, + "learning_rate": 1.2179813833401279e-05, + "loss": 3.9717, + "step": 103445 + }, + { + "epoch": 7.028808261992118, + "grad_norm": 0.5214900374412537, + "learning_rate": 1.2175567332518007e-05, + "loss": 3.7084, + "step": 103450 + }, + { + "epoch": 7.0291479820627805, + "grad_norm": 0.2156631350517273, + "learning_rate": 1.2171320831634733e-05, + "loss": 3.7525, + "step": 103455 + }, + { + "epoch": 7.029487702133442, + "grad_norm": 0.17423556745052338, + "learning_rate": 1.2167074330751461e-05, + "loss": 3.869, + "step": 103460 + }, + { + "epoch": 7.029827422204104, + "grad_norm": 0.1993287056684494, + "learning_rate": 1.216282782986819e-05, + "loss": 3.9984, + "step": 103465 + }, + { + "epoch": 7.030167142274766, + "grad_norm": 0.16624197363853455, + "learning_rate": 1.2158581328984917e-05, + "loss": 3.8849, + "step": 103470 + }, + { + "epoch": 7.030506862345427, + "grad_norm": 0.1610860377550125, + "learning_rate": 1.2154334828101645e-05, + "loss": 3.6876, + "step": 103475 + }, + { + "epoch": 7.030846582416089, + "grad_norm": 0.19634099304676056, + "learning_rate": 1.2150088327218373e-05, + "loss": 3.8456, + "step": 103480 + }, + { + "epoch": 7.031186302486751, + "grad_norm": 0.1929878145456314, + "learning_rate": 1.21458418263351e-05, + "loss": 3.7943, + "step": 103485 + }, + { + "epoch": 7.031526022557412, + "grad_norm": 0.23606809973716736, + "learning_rate": 1.2141595325451828e-05, + "loss": 3.7828, + "step": 103490 + }, + { + "epoch": 7.031865742628074, + "grad_norm": 0.19855456054210663, + "learning_rate": 1.2137348824568556e-05, + "loss": 3.7822, + "step": 103495 + }, + { + "epoch": 7.0322054626987365, + "grad_norm": 0.17548514902591705, + "learning_rate": 1.2133102323685285e-05, + "loss": 3.8067, + "step": 103500 + }, + { + "epoch": 7.032545182769398, + "grad_norm": 0.15323026478290558, + "learning_rate": 1.2128855822802012e-05, + "loss": 3.7067, + "step": 103505 + }, + { + "epoch": 7.03288490284006, + "grad_norm": 0.210654154419899, + "learning_rate": 1.212460932191874e-05, + "loss": 3.9929, + "step": 103510 + }, + { + "epoch": 7.033224622910722, + "grad_norm": 0.12667056918144226, + "learning_rate": 1.2120362821035468e-05, + "loss": 3.8626, + "step": 103515 + }, + { + "epoch": 7.033564342981383, + "grad_norm": 0.17316144704818726, + "learning_rate": 1.2116116320152194e-05, + "loss": 3.9064, + "step": 103520 + }, + { + "epoch": 7.033904063052045, + "grad_norm": 0.14828899502754211, + "learning_rate": 1.2111869819268924e-05, + "loss": 3.8484, + "step": 103525 + }, + { + "epoch": 7.034243783122707, + "grad_norm": 0.17271696031093597, + "learning_rate": 1.2107623318385652e-05, + "loss": 3.9687, + "step": 103530 + }, + { + "epoch": 7.034583503193368, + "grad_norm": 0.15803246200084686, + "learning_rate": 1.210337681750238e-05, + "loss": 3.8133, + "step": 103535 + }, + { + "epoch": 7.03492322326403, + "grad_norm": 0.18296605348587036, + "learning_rate": 1.2099130316619106e-05, + "loss": 4.044, + "step": 103540 + }, + { + "epoch": 7.0352629433346925, + "grad_norm": 0.20176784694194794, + "learning_rate": 1.2094883815735834e-05, + "loss": 3.729, + "step": 103545 + }, + { + "epoch": 7.035602663405354, + "grad_norm": 0.2109869420528412, + "learning_rate": 1.2090637314852562e-05, + "loss": 3.6877, + "step": 103550 + }, + { + "epoch": 7.035942383476016, + "grad_norm": 0.13939827680587769, + "learning_rate": 1.208639081396929e-05, + "loss": 4.0169, + "step": 103555 + }, + { + "epoch": 7.036282103546678, + "grad_norm": 0.15721596777439117, + "learning_rate": 1.2082144313086018e-05, + "loss": 4.1735, + "step": 103560 + }, + { + "epoch": 7.036621823617339, + "grad_norm": 0.3376748263835907, + "learning_rate": 1.2077897812202746e-05, + "loss": 3.8622, + "step": 103565 + }, + { + "epoch": 7.036961543688001, + "grad_norm": 0.2544378936290741, + "learning_rate": 1.2073651311319472e-05, + "loss": 3.8541, + "step": 103570 + }, + { + "epoch": 7.037301263758663, + "grad_norm": 0.14071199297904968, + "learning_rate": 1.20694048104362e-05, + "loss": 3.8529, + "step": 103575 + }, + { + "epoch": 7.037640983829324, + "grad_norm": 0.16832652688026428, + "learning_rate": 1.206515830955293e-05, + "loss": 3.6407, + "step": 103580 + }, + { + "epoch": 7.037980703899986, + "grad_norm": 0.15208809077739716, + "learning_rate": 1.2060911808669658e-05, + "loss": 3.8691, + "step": 103585 + }, + { + "epoch": 7.0383204239706485, + "grad_norm": 0.1626616269350052, + "learning_rate": 1.2056665307786384e-05, + "loss": 3.7526, + "step": 103590 + }, + { + "epoch": 7.03866014404131, + "grad_norm": 0.1510438621044159, + "learning_rate": 1.2052418806903112e-05, + "loss": 4.0609, + "step": 103595 + }, + { + "epoch": 7.038999864111972, + "grad_norm": 0.1931094527244568, + "learning_rate": 1.204817230601984e-05, + "loss": 3.6834, + "step": 103600 + }, + { + "epoch": 7.039339584182634, + "grad_norm": 0.1492992639541626, + "learning_rate": 1.2043925805136568e-05, + "loss": 3.8832, + "step": 103605 + }, + { + "epoch": 7.039679304253295, + "grad_norm": 0.1931026130914688, + "learning_rate": 1.2039679304253296e-05, + "loss": 4.0558, + "step": 103610 + }, + { + "epoch": 7.040019024323957, + "grad_norm": 0.18124432861804962, + "learning_rate": 1.2035432803370024e-05, + "loss": 3.8918, + "step": 103615 + }, + { + "epoch": 7.040358744394619, + "grad_norm": 0.1527661234140396, + "learning_rate": 1.2031186302486752e-05, + "loss": 3.9192, + "step": 103620 + }, + { + "epoch": 7.04069846446528, + "grad_norm": 0.1654752492904663, + "learning_rate": 1.2026939801603479e-05, + "loss": 3.8755, + "step": 103625 + }, + { + "epoch": 7.041038184535942, + "grad_norm": 1.0776766538619995, + "learning_rate": 1.2022693300720207e-05, + "loss": 3.7358, + "step": 103630 + }, + { + "epoch": 7.0413779046066045, + "grad_norm": 0.13783755898475647, + "learning_rate": 1.2018446799836935e-05, + "loss": 3.8681, + "step": 103635 + }, + { + "epoch": 7.041717624677266, + "grad_norm": 0.21193517744541168, + "learning_rate": 1.2014200298953663e-05, + "loss": 3.6967, + "step": 103640 + }, + { + "epoch": 7.042057344747928, + "grad_norm": 0.17916466295719147, + "learning_rate": 1.200995379807039e-05, + "loss": 3.8636, + "step": 103645 + }, + { + "epoch": 7.04239706481859, + "grad_norm": 0.22921901941299438, + "learning_rate": 1.2005707297187119e-05, + "loss": 3.9458, + "step": 103650 + }, + { + "epoch": 7.042736784889251, + "grad_norm": 0.22286280989646912, + "learning_rate": 1.2001460796303845e-05, + "loss": 4.2543, + "step": 103655 + }, + { + "epoch": 7.043076504959913, + "grad_norm": 0.28493836522102356, + "learning_rate": 1.1997214295420573e-05, + "loss": 3.8872, + "step": 103660 + }, + { + "epoch": 7.043416225030575, + "grad_norm": 0.1745987832546234, + "learning_rate": 1.1992967794537303e-05, + "loss": 3.8487, + "step": 103665 + }, + { + "epoch": 7.043755945101236, + "grad_norm": 0.1824847310781479, + "learning_rate": 1.198872129365403e-05, + "loss": 4.0323, + "step": 103670 + }, + { + "epoch": 7.0440956651718984, + "grad_norm": 0.2179403007030487, + "learning_rate": 1.1984474792770757e-05, + "loss": 3.9, + "step": 103675 + }, + { + "epoch": 7.0444353852425605, + "grad_norm": 0.2826710343360901, + "learning_rate": 1.1980228291887485e-05, + "loss": 3.9116, + "step": 103680 + }, + { + "epoch": 7.044775105313222, + "grad_norm": 0.17377163469791412, + "learning_rate": 1.1975981791004213e-05, + "loss": 3.9362, + "step": 103685 + }, + { + "epoch": 7.045114825383884, + "grad_norm": 0.23376885056495667, + "learning_rate": 1.1971735290120941e-05, + "loss": 3.7256, + "step": 103690 + }, + { + "epoch": 7.045454545454546, + "grad_norm": 0.15055504441261292, + "learning_rate": 1.1967488789237669e-05, + "loss": 3.644, + "step": 103695 + }, + { + "epoch": 7.045794265525207, + "grad_norm": 0.18894630670547485, + "learning_rate": 1.1963242288354397e-05, + "loss": 3.813, + "step": 103700 + }, + { + "epoch": 7.046133985595869, + "grad_norm": 0.2159714698791504, + "learning_rate": 1.1958995787471125e-05, + "loss": 3.9257, + "step": 103705 + }, + { + "epoch": 7.046473705666531, + "grad_norm": 0.1702186018228531, + "learning_rate": 1.1954749286587851e-05, + "loss": 4.1684, + "step": 103710 + }, + { + "epoch": 7.046813425737192, + "grad_norm": 0.6492427587509155, + "learning_rate": 1.195050278570458e-05, + "loss": 3.7256, + "step": 103715 + }, + { + "epoch": 7.0471531458078545, + "grad_norm": 0.18282176554203033, + "learning_rate": 1.1946256284821309e-05, + "loss": 3.8075, + "step": 103720 + }, + { + "epoch": 7.0474928658785165, + "grad_norm": 0.18667519092559814, + "learning_rate": 1.1942009783938035e-05, + "loss": 4.0308, + "step": 103725 + }, + { + "epoch": 7.047832585949178, + "grad_norm": 0.17139315605163574, + "learning_rate": 1.1937763283054763e-05, + "loss": 3.9572, + "step": 103730 + }, + { + "epoch": 7.04817230601984, + "grad_norm": 0.33821237087249756, + "learning_rate": 1.1933516782171491e-05, + "loss": 3.8462, + "step": 103735 + }, + { + "epoch": 7.048512026090501, + "grad_norm": 1.8654192686080933, + "learning_rate": 1.1929270281288218e-05, + "loss": 4.0558, + "step": 103740 + }, + { + "epoch": 7.048851746161163, + "grad_norm": 0.20716477930545807, + "learning_rate": 1.1925023780404947e-05, + "loss": 3.985, + "step": 103745 + }, + { + "epoch": 7.049191466231825, + "grad_norm": 0.17376655340194702, + "learning_rate": 1.1920777279521675e-05, + "loss": 3.6728, + "step": 103750 + }, + { + "epoch": 7.049531186302486, + "grad_norm": 0.21682380139827728, + "learning_rate": 1.1916530778638403e-05, + "loss": 4.0299, + "step": 103755 + }, + { + "epoch": 7.049870906373148, + "grad_norm": 1.3769866228103638, + "learning_rate": 1.191228427775513e-05, + "loss": 3.6359, + "step": 103760 + }, + { + "epoch": 7.0502106264438105, + "grad_norm": 0.2148658186197281, + "learning_rate": 1.1908037776871858e-05, + "loss": 3.6979, + "step": 103765 + }, + { + "epoch": 7.050550346514472, + "grad_norm": 0.16415001451969147, + "learning_rate": 1.1903791275988586e-05, + "loss": 3.9246, + "step": 103770 + }, + { + "epoch": 7.050890066585134, + "grad_norm": 0.13583940267562866, + "learning_rate": 1.1899544775105314e-05, + "loss": 3.6208, + "step": 103775 + }, + { + "epoch": 7.051229786655796, + "grad_norm": 0.1806793063879013, + "learning_rate": 1.1895298274222042e-05, + "loss": 3.6965, + "step": 103780 + }, + { + "epoch": 7.051569506726457, + "grad_norm": 0.1629088819026947, + "learning_rate": 1.189105177333877e-05, + "loss": 4.0217, + "step": 103785 + }, + { + "epoch": 7.051909226797119, + "grad_norm": 0.15852989256381989, + "learning_rate": 1.1886805272455498e-05, + "loss": 3.7983, + "step": 103790 + }, + { + "epoch": 7.052248946867781, + "grad_norm": 0.15649880468845367, + "learning_rate": 1.1882558771572224e-05, + "loss": 3.8283, + "step": 103795 + }, + { + "epoch": 7.052588666938442, + "grad_norm": 0.23410923779010773, + "learning_rate": 1.1878312270688952e-05, + "loss": 3.8602, + "step": 103800 + }, + { + "epoch": 7.052928387009104, + "grad_norm": 0.17560435831546783, + "learning_rate": 1.1874065769805682e-05, + "loss": 3.6448, + "step": 103805 + }, + { + "epoch": 7.0532681070797665, + "grad_norm": 0.16987958550453186, + "learning_rate": 1.1869819268922408e-05, + "loss": 3.6365, + "step": 103810 + }, + { + "epoch": 7.053607827150428, + "grad_norm": 0.18603475391864777, + "learning_rate": 1.1865572768039136e-05, + "loss": 3.9813, + "step": 103815 + }, + { + "epoch": 7.05394754722109, + "grad_norm": 0.17373715341091156, + "learning_rate": 1.1861326267155864e-05, + "loss": 3.6055, + "step": 103820 + }, + { + "epoch": 7.054287267291752, + "grad_norm": 0.22599482536315918, + "learning_rate": 1.185707976627259e-05, + "loss": 3.7368, + "step": 103825 + }, + { + "epoch": 7.054626987362413, + "grad_norm": 0.20843859016895294, + "learning_rate": 1.185283326538932e-05, + "loss": 3.8347, + "step": 103830 + }, + { + "epoch": 7.054966707433075, + "grad_norm": 0.21342667937278748, + "learning_rate": 1.1848586764506048e-05, + "loss": 3.8446, + "step": 103835 + }, + { + "epoch": 7.055306427503737, + "grad_norm": 0.2532491981983185, + "learning_rate": 1.1844340263622776e-05, + "loss": 3.7138, + "step": 103840 + }, + { + "epoch": 7.055646147574398, + "grad_norm": 0.17683789134025574, + "learning_rate": 1.1840093762739503e-05, + "loss": 3.9093, + "step": 103845 + }, + { + "epoch": 7.05598586764506, + "grad_norm": 0.2333788424730301, + "learning_rate": 1.183584726185623e-05, + "loss": 3.9172, + "step": 103850 + }, + { + "epoch": 7.0563255877157225, + "grad_norm": 0.17051604390144348, + "learning_rate": 1.1831600760972959e-05, + "loss": 4.1037, + "step": 103855 + }, + { + "epoch": 7.056665307786384, + "grad_norm": 0.7044515013694763, + "learning_rate": 1.1827354260089687e-05, + "loss": 3.9479, + "step": 103860 + }, + { + "epoch": 7.057005027857046, + "grad_norm": 0.1546582728624344, + "learning_rate": 1.1823107759206415e-05, + "loss": 3.7955, + "step": 103865 + }, + { + "epoch": 7.057344747927708, + "grad_norm": 0.191785529255867, + "learning_rate": 1.1818861258323143e-05, + "loss": 3.9227, + "step": 103870 + }, + { + "epoch": 7.057684467998369, + "grad_norm": 0.8334313631057739, + "learning_rate": 1.181461475743987e-05, + "loss": 3.8446, + "step": 103875 + }, + { + "epoch": 7.058024188069031, + "grad_norm": 0.21410679817199707, + "learning_rate": 1.1810368256556597e-05, + "loss": 3.5363, + "step": 103880 + }, + { + "epoch": 7.058363908139693, + "grad_norm": 0.1649196743965149, + "learning_rate": 1.1806121755673327e-05, + "loss": 3.8559, + "step": 103885 + }, + { + "epoch": 7.058703628210354, + "grad_norm": 0.15693967044353485, + "learning_rate": 1.1801875254790055e-05, + "loss": 4.0206, + "step": 103890 + }, + { + "epoch": 7.059043348281016, + "grad_norm": 0.15794327855110168, + "learning_rate": 1.1797628753906781e-05, + "loss": 3.9493, + "step": 103895 + }, + { + "epoch": 7.0593830683516785, + "grad_norm": 0.21008466184139252, + "learning_rate": 1.1793382253023509e-05, + "loss": 3.8778, + "step": 103900 + }, + { + "epoch": 7.05972278842234, + "grad_norm": 0.18490613996982574, + "learning_rate": 1.1789135752140237e-05, + "loss": 4.0485, + "step": 103905 + }, + { + "epoch": 7.060062508493002, + "grad_norm": 0.16491283476352692, + "learning_rate": 1.1784889251256965e-05, + "loss": 4.0054, + "step": 103910 + }, + { + "epoch": 7.060402228563664, + "grad_norm": 0.14623312652111053, + "learning_rate": 1.1780642750373693e-05, + "loss": 3.5468, + "step": 103915 + }, + { + "epoch": 7.060741948634325, + "grad_norm": 2.575374126434326, + "learning_rate": 1.1776396249490421e-05, + "loss": 3.7302, + "step": 103920 + }, + { + "epoch": 7.061081668704987, + "grad_norm": 0.17983214557170868, + "learning_rate": 1.1772149748607149e-05, + "loss": 3.8873, + "step": 103925 + }, + { + "epoch": 7.061421388775649, + "grad_norm": 0.23101770877838135, + "learning_rate": 1.1767903247723875e-05, + "loss": 3.8149, + "step": 103930 + }, + { + "epoch": 7.06176110884631, + "grad_norm": 1.4241352081298828, + "learning_rate": 1.1763656746840603e-05, + "loss": 4.0011, + "step": 103935 + }, + { + "epoch": 7.062100828916972, + "grad_norm": 0.18284589052200317, + "learning_rate": 1.1759410245957333e-05, + "loss": 3.8724, + "step": 103940 + }, + { + "epoch": 7.0624405489876345, + "grad_norm": 0.15187393128871918, + "learning_rate": 1.175516374507406e-05, + "loss": 3.9041, + "step": 103945 + }, + { + "epoch": 7.062780269058296, + "grad_norm": 0.1922498494386673, + "learning_rate": 1.1750917244190787e-05, + "loss": 3.6881, + "step": 103950 + }, + { + "epoch": 7.063119989128958, + "grad_norm": 0.15635821223258972, + "learning_rate": 1.1746670743307515e-05, + "loss": 3.9321, + "step": 103955 + }, + { + "epoch": 7.06345970919962, + "grad_norm": 0.20459149777889252, + "learning_rate": 1.1742424242424243e-05, + "loss": 3.7274, + "step": 103960 + }, + { + "epoch": 7.063799429270281, + "grad_norm": 0.1857956051826477, + "learning_rate": 1.1738177741540971e-05, + "loss": 3.9838, + "step": 103965 + }, + { + "epoch": 7.064139149340943, + "grad_norm": 0.2308182269334793, + "learning_rate": 1.17339312406577e-05, + "loss": 3.9748, + "step": 103970 + }, + { + "epoch": 7.064478869411605, + "grad_norm": 0.322933554649353, + "learning_rate": 1.1729684739774427e-05, + "loss": 3.9786, + "step": 103975 + }, + { + "epoch": 7.064818589482266, + "grad_norm": 0.19468417763710022, + "learning_rate": 1.1725438238891154e-05, + "loss": 3.7019, + "step": 103980 + }, + { + "epoch": 7.0651583095529285, + "grad_norm": 0.18768903613090515, + "learning_rate": 1.1721191738007882e-05, + "loss": 3.5941, + "step": 103985 + }, + { + "epoch": 7.0654980296235905, + "grad_norm": 0.1614757925271988, + "learning_rate": 1.171694523712461e-05, + "loss": 3.5941, + "step": 103990 + }, + { + "epoch": 7.065837749694252, + "grad_norm": 0.18864654004573822, + "learning_rate": 1.1712698736241338e-05, + "loss": 3.7604, + "step": 103995 + }, + { + "epoch": 7.066177469764914, + "grad_norm": 0.19424603879451752, + "learning_rate": 1.1708452235358066e-05, + "loss": 3.9023, + "step": 104000 + }, + { + "epoch": 7.066517189835576, + "grad_norm": 0.816429078578949, + "learning_rate": 1.1704205734474794e-05, + "loss": 3.7281, + "step": 104005 + }, + { + "epoch": 7.066856909906237, + "grad_norm": 0.328862726688385, + "learning_rate": 1.1699959233591522e-05, + "loss": 3.7131, + "step": 104010 + }, + { + "epoch": 7.067196629976899, + "grad_norm": 0.8097062706947327, + "learning_rate": 1.1695712732708248e-05, + "loss": 3.8096, + "step": 104015 + }, + { + "epoch": 7.067536350047561, + "grad_norm": 0.5915808081626892, + "learning_rate": 1.1691466231824976e-05, + "loss": 4.0505, + "step": 104020 + }, + { + "epoch": 7.067876070118222, + "grad_norm": 2.8791704177856445, + "learning_rate": 1.1687219730941706e-05, + "loss": 3.7167, + "step": 104025 + }, + { + "epoch": 7.0682157901888845, + "grad_norm": 0.14641284942626953, + "learning_rate": 1.1682973230058432e-05, + "loss": 3.8944, + "step": 104030 + }, + { + "epoch": 7.0685555102595465, + "grad_norm": 0.14459015429019928, + "learning_rate": 1.167872672917516e-05, + "loss": 3.7854, + "step": 104035 + }, + { + "epoch": 7.068895230330208, + "grad_norm": 0.14201216399669647, + "learning_rate": 1.1674480228291888e-05, + "loss": 3.9692, + "step": 104040 + }, + { + "epoch": 7.06923495040087, + "grad_norm": 0.16119733452796936, + "learning_rate": 1.1670233727408616e-05, + "loss": 3.9552, + "step": 104045 + }, + { + "epoch": 7.069574670471532, + "grad_norm": 0.3940170407295227, + "learning_rate": 1.1665987226525344e-05, + "loss": 3.84, + "step": 104050 + }, + { + "epoch": 7.069914390542193, + "grad_norm": 0.19971080124378204, + "learning_rate": 1.1661740725642072e-05, + "loss": 3.6026, + "step": 104055 + }, + { + "epoch": 7.070254110612855, + "grad_norm": 0.1433447003364563, + "learning_rate": 1.16574942247588e-05, + "loss": 4.0896, + "step": 104060 + }, + { + "epoch": 7.070593830683517, + "grad_norm": 0.1554250568151474, + "learning_rate": 1.1653247723875526e-05, + "loss": 3.7381, + "step": 104065 + }, + { + "epoch": 7.070933550754178, + "grad_norm": 0.23831689357757568, + "learning_rate": 1.1649001222992254e-05, + "loss": 3.8277, + "step": 104070 + }, + { + "epoch": 7.0712732708248405, + "grad_norm": 0.1716773360967636, + "learning_rate": 1.1644754722108982e-05, + "loss": 3.7652, + "step": 104075 + }, + { + "epoch": 7.0716129908955025, + "grad_norm": 0.21195322275161743, + "learning_rate": 1.164050822122571e-05, + "loss": 3.5808, + "step": 104080 + }, + { + "epoch": 7.071952710966164, + "grad_norm": 0.28694337606430054, + "learning_rate": 1.1636261720342438e-05, + "loss": 3.8816, + "step": 104085 + }, + { + "epoch": 7.072292431036826, + "grad_norm": 0.34097832441329956, + "learning_rate": 1.1632015219459166e-05, + "loss": 3.809, + "step": 104090 + }, + { + "epoch": 7.072632151107487, + "grad_norm": 0.19331112504005432, + "learning_rate": 1.1627768718575894e-05, + "loss": 3.7991, + "step": 104095 + }, + { + "epoch": 7.072971871178149, + "grad_norm": 0.2724429965019226, + "learning_rate": 1.162352221769262e-05, + "loss": 3.7393, + "step": 104100 + }, + { + "epoch": 7.073311591248811, + "grad_norm": 0.18461303412914276, + "learning_rate": 1.161927571680935e-05, + "loss": 3.8633, + "step": 104105 + }, + { + "epoch": 7.073651311319472, + "grad_norm": 0.17374825477600098, + "learning_rate": 1.1615029215926078e-05, + "loss": 3.8643, + "step": 104110 + }, + { + "epoch": 7.073991031390134, + "grad_norm": 0.15824122726917267, + "learning_rate": 1.1610782715042805e-05, + "loss": 3.8824, + "step": 104115 + }, + { + "epoch": 7.0743307514607965, + "grad_norm": 0.16847267746925354, + "learning_rate": 1.1606536214159533e-05, + "loss": 3.8274, + "step": 104120 + }, + { + "epoch": 7.074670471531458, + "grad_norm": 0.23621132969856262, + "learning_rate": 1.160228971327626e-05, + "loss": 3.8412, + "step": 104125 + }, + { + "epoch": 7.07501019160212, + "grad_norm": 0.16040997207164764, + "learning_rate": 1.1598043212392989e-05, + "loss": 3.8496, + "step": 104130 + }, + { + "epoch": 7.075349911672782, + "grad_norm": 0.2934716045856476, + "learning_rate": 1.1593796711509717e-05, + "loss": 3.8472, + "step": 104135 + }, + { + "epoch": 7.075689631743443, + "grad_norm": 0.13834212720394135, + "learning_rate": 1.1589550210626445e-05, + "loss": 3.6694, + "step": 104140 + }, + { + "epoch": 7.076029351814105, + "grad_norm": 0.1321132481098175, + "learning_rate": 1.1585303709743173e-05, + "loss": 3.7851, + "step": 104145 + }, + { + "epoch": 7.076369071884767, + "grad_norm": 0.17452764511108398, + "learning_rate": 1.1581057208859899e-05, + "loss": 3.7617, + "step": 104150 + }, + { + "epoch": 7.076708791955428, + "grad_norm": 0.18106946349143982, + "learning_rate": 1.1576810707976627e-05, + "loss": 3.7949, + "step": 104155 + }, + { + "epoch": 7.07704851202609, + "grad_norm": 0.20177777111530304, + "learning_rate": 1.1572564207093357e-05, + "loss": 3.8876, + "step": 104160 + }, + { + "epoch": 7.0773882320967525, + "grad_norm": 0.24132926762104034, + "learning_rate": 1.1568317706210083e-05, + "loss": 3.6359, + "step": 104165 + }, + { + "epoch": 7.077727952167414, + "grad_norm": 0.19044776260852814, + "learning_rate": 1.1564071205326811e-05, + "loss": 3.7409, + "step": 104170 + }, + { + "epoch": 7.078067672238076, + "grad_norm": 0.2045491486787796, + "learning_rate": 1.155982470444354e-05, + "loss": 3.9204, + "step": 104175 + }, + { + "epoch": 7.078407392308738, + "grad_norm": 0.1754423975944519, + "learning_rate": 1.1555578203560267e-05, + "loss": 3.8602, + "step": 104180 + }, + { + "epoch": 7.078747112379399, + "grad_norm": 0.1592836230993271, + "learning_rate": 1.1551331702676995e-05, + "loss": 3.9446, + "step": 104185 + }, + { + "epoch": 7.079086832450061, + "grad_norm": 0.1655048280954361, + "learning_rate": 1.1547085201793723e-05, + "loss": 3.7022, + "step": 104190 + }, + { + "epoch": 7.079426552520723, + "grad_norm": 0.2512984573841095, + "learning_rate": 1.1542838700910451e-05, + "loss": 3.9793, + "step": 104195 + }, + { + "epoch": 7.079766272591384, + "grad_norm": 0.16780924797058105, + "learning_rate": 1.1538592200027178e-05, + "loss": 3.7253, + "step": 104200 + }, + { + "epoch": 7.080105992662046, + "grad_norm": 0.20188209414482117, + "learning_rate": 1.1534345699143906e-05, + "loss": 3.5697, + "step": 104205 + }, + { + "epoch": 7.0804457127327085, + "grad_norm": 0.21369780600070953, + "learning_rate": 1.1530099198260634e-05, + "loss": 3.9795, + "step": 104210 + }, + { + "epoch": 7.08078543280337, + "grad_norm": 0.17122268676757812, + "learning_rate": 1.1525852697377362e-05, + "loss": 3.9029, + "step": 104215 + }, + { + "epoch": 7.081125152874032, + "grad_norm": 0.1928296685218811, + "learning_rate": 1.152160619649409e-05, + "loss": 3.787, + "step": 104220 + }, + { + "epoch": 7.081464872944694, + "grad_norm": 0.1463260054588318, + "learning_rate": 1.1517359695610818e-05, + "loss": 3.6199, + "step": 104225 + }, + { + "epoch": 7.081804593015355, + "grad_norm": 0.22407186031341553, + "learning_rate": 1.1513113194727546e-05, + "loss": 3.6838, + "step": 104230 + }, + { + "epoch": 7.082144313086017, + "grad_norm": 0.20646509528160095, + "learning_rate": 1.1508866693844272e-05, + "loss": 3.6639, + "step": 104235 + }, + { + "epoch": 7.082484033156679, + "grad_norm": 0.1948574185371399, + "learning_rate": 1.1504620192961e-05, + "loss": 3.5673, + "step": 104240 + }, + { + "epoch": 7.08282375322734, + "grad_norm": 0.15806111693382263, + "learning_rate": 1.150037369207773e-05, + "loss": 3.9032, + "step": 104245 + }, + { + "epoch": 7.083163473298002, + "grad_norm": 0.22653643786907196, + "learning_rate": 1.1496127191194456e-05, + "loss": 3.9957, + "step": 104250 + }, + { + "epoch": 7.0835031933686645, + "grad_norm": 0.16655656695365906, + "learning_rate": 1.1491880690311184e-05, + "loss": 3.7443, + "step": 104255 + }, + { + "epoch": 7.083842913439326, + "grad_norm": 0.16469185054302216, + "learning_rate": 1.1487634189427912e-05, + "loss": 4.0253, + "step": 104260 + }, + { + "epoch": 7.084182633509988, + "grad_norm": 0.2523594796657562, + "learning_rate": 1.148338768854464e-05, + "loss": 3.7436, + "step": 104265 + }, + { + "epoch": 7.08452235358065, + "grad_norm": 0.1263079047203064, + "learning_rate": 1.1479141187661368e-05, + "loss": 3.709, + "step": 104270 + }, + { + "epoch": 7.084862073651311, + "grad_norm": 0.1818646937608719, + "learning_rate": 1.1474894686778096e-05, + "loss": 3.8914, + "step": 104275 + }, + { + "epoch": 7.085201793721973, + "grad_norm": 0.15938915312290192, + "learning_rate": 1.1470648185894824e-05, + "loss": 3.9928, + "step": 104280 + }, + { + "epoch": 7.085541513792635, + "grad_norm": 0.18974654376506805, + "learning_rate": 1.146640168501155e-05, + "loss": 3.6644, + "step": 104285 + }, + { + "epoch": 7.085881233863296, + "grad_norm": 0.19097229838371277, + "learning_rate": 1.1462155184128278e-05, + "loss": 3.9015, + "step": 104290 + }, + { + "epoch": 7.0862209539339585, + "grad_norm": 0.1661965548992157, + "learning_rate": 1.1457908683245006e-05, + "loss": 4.0487, + "step": 104295 + }, + { + "epoch": 7.0865606740046205, + "grad_norm": 0.16528593003749847, + "learning_rate": 1.1453662182361736e-05, + "loss": 3.8534, + "step": 104300 + }, + { + "epoch": 7.086900394075282, + "grad_norm": 0.14853410422801971, + "learning_rate": 1.1449415681478462e-05, + "loss": 3.7199, + "step": 104305 + }, + { + "epoch": 7.087240114145944, + "grad_norm": 0.2550297677516937, + "learning_rate": 1.144516918059519e-05, + "loss": 3.6627, + "step": 104310 + }, + { + "epoch": 7.087579834216606, + "grad_norm": 0.15852445363998413, + "learning_rate": 1.1440922679711918e-05, + "loss": 3.7873, + "step": 104315 + }, + { + "epoch": 7.087919554287267, + "grad_norm": 0.1725594848394394, + "learning_rate": 1.1436676178828645e-05, + "loss": 3.731, + "step": 104320 + }, + { + "epoch": 7.088259274357929, + "grad_norm": 0.16740961372852325, + "learning_rate": 1.1432429677945374e-05, + "loss": 4.0547, + "step": 104325 + }, + { + "epoch": 7.088598994428591, + "grad_norm": 0.1941356658935547, + "learning_rate": 1.1428183177062102e-05, + "loss": 3.8884, + "step": 104330 + }, + { + "epoch": 7.088938714499252, + "grad_norm": 0.18443980813026428, + "learning_rate": 1.1423936676178829e-05, + "loss": 3.7539, + "step": 104335 + }, + { + "epoch": 7.0892784345699145, + "grad_norm": 0.1626851111650467, + "learning_rate": 1.1419690175295557e-05, + "loss": 3.5725, + "step": 104340 + }, + { + "epoch": 7.0896181546405765, + "grad_norm": 0.20319509506225586, + "learning_rate": 1.1415443674412285e-05, + "loss": 3.8747, + "step": 104345 + }, + { + "epoch": 7.089957874711238, + "grad_norm": 0.17025060951709747, + "learning_rate": 1.1411197173529013e-05, + "loss": 3.6103, + "step": 104350 + }, + { + "epoch": 7.0902975947819, + "grad_norm": 0.19794677197933197, + "learning_rate": 1.140695067264574e-05, + "loss": 3.8375, + "step": 104355 + }, + { + "epoch": 7.090637314852562, + "grad_norm": 0.2339634746313095, + "learning_rate": 1.1402704171762469e-05, + "loss": 3.8852, + "step": 104360 + }, + { + "epoch": 7.090977034923223, + "grad_norm": 0.19779041409492493, + "learning_rate": 1.1398457670879197e-05, + "loss": 3.8501, + "step": 104365 + }, + { + "epoch": 7.091316754993885, + "grad_norm": 0.1605483442544937, + "learning_rate": 1.1394211169995923e-05, + "loss": 3.8268, + "step": 104370 + }, + { + "epoch": 7.091656475064547, + "grad_norm": 0.144349604845047, + "learning_rate": 1.1389964669112651e-05, + "loss": 3.8315, + "step": 104375 + }, + { + "epoch": 7.091996195135208, + "grad_norm": 0.18739968538284302, + "learning_rate": 1.1385718168229379e-05, + "loss": 3.6609, + "step": 104380 + }, + { + "epoch": 7.0923359152058705, + "grad_norm": 0.17131876945495605, + "learning_rate": 1.1381471667346109e-05, + "loss": 3.6546, + "step": 104385 + }, + { + "epoch": 7.0926756352765326, + "grad_norm": 0.16712938249111176, + "learning_rate": 1.1377225166462835e-05, + "loss": 4.0376, + "step": 104390 + }, + { + "epoch": 7.093015355347194, + "grad_norm": 0.1748022735118866, + "learning_rate": 1.1372978665579563e-05, + "loss": 3.8391, + "step": 104395 + }, + { + "epoch": 7.093355075417856, + "grad_norm": 0.14181646704673767, + "learning_rate": 1.1368732164696291e-05, + "loss": 3.7603, + "step": 104400 + }, + { + "epoch": 7.093694795488518, + "grad_norm": 0.17136293649673462, + "learning_rate": 1.1364485663813017e-05, + "loss": 3.9817, + "step": 104405 + }, + { + "epoch": 7.094034515559179, + "grad_norm": 0.18561920523643494, + "learning_rate": 1.1360239162929747e-05, + "loss": 3.9278, + "step": 104410 + }, + { + "epoch": 7.094374235629841, + "grad_norm": 0.17694222927093506, + "learning_rate": 1.1355992662046475e-05, + "loss": 3.8348, + "step": 104415 + }, + { + "epoch": 7.094713955700502, + "grad_norm": 0.1996557116508484, + "learning_rate": 1.1351746161163201e-05, + "loss": 3.64, + "step": 104420 + }, + { + "epoch": 7.095053675771164, + "grad_norm": 1.7916903495788574, + "learning_rate": 1.134749966027993e-05, + "loss": 3.7643, + "step": 104425 + }, + { + "epoch": 7.0953933958418265, + "grad_norm": 0.15420018136501312, + "learning_rate": 1.1343253159396657e-05, + "loss": 3.6473, + "step": 104430 + }, + { + "epoch": 7.095733115912488, + "grad_norm": 0.413754940032959, + "learning_rate": 1.1339006658513385e-05, + "loss": 3.8586, + "step": 104435 + }, + { + "epoch": 7.09607283598315, + "grad_norm": 0.24466478824615479, + "learning_rate": 1.1334760157630113e-05, + "loss": 3.7702, + "step": 104440 + }, + { + "epoch": 7.096412556053812, + "grad_norm": 0.14955195784568787, + "learning_rate": 1.1330513656746841e-05, + "loss": 3.7913, + "step": 104445 + }, + { + "epoch": 7.096752276124473, + "grad_norm": 0.3648693561553955, + "learning_rate": 1.132626715586357e-05, + "loss": 3.6473, + "step": 104450 + }, + { + "epoch": 7.097091996195135, + "grad_norm": 0.16962885856628418, + "learning_rate": 1.1322020654980296e-05, + "loss": 3.7454, + "step": 104455 + }, + { + "epoch": 7.097431716265797, + "grad_norm": 0.2309577465057373, + "learning_rate": 1.1317774154097024e-05, + "loss": 3.8991, + "step": 104460 + }, + { + "epoch": 7.097771436336458, + "grad_norm": 0.1839534193277359, + "learning_rate": 1.1313527653213753e-05, + "loss": 3.8809, + "step": 104465 + }, + { + "epoch": 7.09811115640712, + "grad_norm": 0.14870281517505646, + "learning_rate": 1.1309281152330481e-05, + "loss": 3.4813, + "step": 104470 + }, + { + "epoch": 7.0984508764777825, + "grad_norm": 0.1651225984096527, + "learning_rate": 1.1305034651447208e-05, + "loss": 3.8197, + "step": 104475 + }, + { + "epoch": 7.098790596548444, + "grad_norm": 0.1706346869468689, + "learning_rate": 1.1300788150563936e-05, + "loss": 4.0067, + "step": 104480 + }, + { + "epoch": 7.099130316619106, + "grad_norm": 0.1941969245672226, + "learning_rate": 1.1296541649680664e-05, + "loss": 3.6632, + "step": 104485 + }, + { + "epoch": 7.099470036689768, + "grad_norm": 0.15257634222507477, + "learning_rate": 1.1292295148797392e-05, + "loss": 3.9429, + "step": 104490 + }, + { + "epoch": 7.099809756760429, + "grad_norm": 0.407008558511734, + "learning_rate": 1.128804864791412e-05, + "loss": 3.7439, + "step": 104495 + }, + { + "epoch": 7.100149476831091, + "grad_norm": 0.18182195723056793, + "learning_rate": 1.1283802147030848e-05, + "loss": 3.9225, + "step": 104500 + }, + { + "epoch": 7.100489196901753, + "grad_norm": 0.17422528564929962, + "learning_rate": 1.1279555646147574e-05, + "loss": 3.7322, + "step": 104505 + }, + { + "epoch": 7.100828916972414, + "grad_norm": 0.1511542946100235, + "learning_rate": 1.1275309145264302e-05, + "loss": 3.8737, + "step": 104510 + }, + { + "epoch": 7.101168637043076, + "grad_norm": 0.18308791518211365, + "learning_rate": 1.127106264438103e-05, + "loss": 3.9157, + "step": 104515 + }, + { + "epoch": 7.1015083571137385, + "grad_norm": 0.18581263720989227, + "learning_rate": 1.126681614349776e-05, + "loss": 3.6585, + "step": 104520 + }, + { + "epoch": 7.1018480771844, + "grad_norm": 0.2141989767551422, + "learning_rate": 1.1262569642614486e-05, + "loss": 3.5808, + "step": 104525 + }, + { + "epoch": 7.102187797255062, + "grad_norm": 0.23066237568855286, + "learning_rate": 1.1258323141731214e-05, + "loss": 3.8157, + "step": 104530 + }, + { + "epoch": 7.102527517325724, + "grad_norm": 0.15966343879699707, + "learning_rate": 1.1254076640847942e-05, + "loss": 3.6591, + "step": 104535 + }, + { + "epoch": 7.102867237396385, + "grad_norm": 0.8596388101577759, + "learning_rate": 1.1249830139964668e-05, + "loss": 3.8871, + "step": 104540 + }, + { + "epoch": 7.103206957467047, + "grad_norm": 0.18364794552326202, + "learning_rate": 1.1245583639081398e-05, + "loss": 3.9162, + "step": 104545 + }, + { + "epoch": 7.103546677537709, + "grad_norm": 0.22304630279541016, + "learning_rate": 1.1241337138198126e-05, + "loss": 3.8088, + "step": 104550 + }, + { + "epoch": 7.10388639760837, + "grad_norm": 0.19832056760787964, + "learning_rate": 1.1237090637314854e-05, + "loss": 3.7368, + "step": 104555 + }, + { + "epoch": 7.1042261176790324, + "grad_norm": 0.16129183769226074, + "learning_rate": 1.123284413643158e-05, + "loss": 3.7746, + "step": 104560 + }, + { + "epoch": 7.1045658377496945, + "grad_norm": 0.18916098773479462, + "learning_rate": 1.1228597635548309e-05, + "loss": 3.5498, + "step": 104565 + }, + { + "epoch": 7.104905557820356, + "grad_norm": 0.19050560891628265, + "learning_rate": 1.1224351134665037e-05, + "loss": 3.7544, + "step": 104570 + }, + { + "epoch": 7.105245277891018, + "grad_norm": 0.20852869749069214, + "learning_rate": 1.1220104633781765e-05, + "loss": 3.7646, + "step": 104575 + }, + { + "epoch": 7.10558499796168, + "grad_norm": 0.6670964956283569, + "learning_rate": 1.1215858132898493e-05, + "loss": 3.9023, + "step": 104580 + }, + { + "epoch": 7.105924718032341, + "grad_norm": 0.2268618494272232, + "learning_rate": 1.121161163201522e-05, + "loss": 3.8438, + "step": 104585 + }, + { + "epoch": 7.106264438103003, + "grad_norm": 0.15454654395580292, + "learning_rate": 1.1207365131131947e-05, + "loss": 3.8432, + "step": 104590 + }, + { + "epoch": 7.106604158173665, + "grad_norm": 0.25724732875823975, + "learning_rate": 1.1203118630248675e-05, + "loss": 3.755, + "step": 104595 + }, + { + "epoch": 7.106943878244326, + "grad_norm": 0.16956573724746704, + "learning_rate": 1.1198872129365403e-05, + "loss": 3.717, + "step": 104600 + }, + { + "epoch": 7.1072835983149885, + "grad_norm": 0.2345227301120758, + "learning_rate": 1.1194625628482133e-05, + "loss": 4.0872, + "step": 104605 + }, + { + "epoch": 7.1076233183856505, + "grad_norm": 0.2397422343492508, + "learning_rate": 1.1190379127598859e-05, + "loss": 3.5593, + "step": 104610 + }, + { + "epoch": 7.107963038456312, + "grad_norm": 0.1532098650932312, + "learning_rate": 1.1186132626715587e-05, + "loss": 3.9663, + "step": 104615 + }, + { + "epoch": 7.108302758526974, + "grad_norm": 0.18696464598178864, + "learning_rate": 1.1181886125832315e-05, + "loss": 3.8548, + "step": 104620 + }, + { + "epoch": 7.108642478597636, + "grad_norm": 0.1647881120443344, + "learning_rate": 1.1177639624949041e-05, + "loss": 3.7352, + "step": 104625 + }, + { + "epoch": 7.108982198668297, + "grad_norm": 0.17847613990306854, + "learning_rate": 1.1173393124065771e-05, + "loss": 3.9032, + "step": 104630 + }, + { + "epoch": 7.109321918738959, + "grad_norm": 0.598897397518158, + "learning_rate": 1.1169146623182499e-05, + "loss": 3.7402, + "step": 104635 + }, + { + "epoch": 7.109661638809621, + "grad_norm": 0.1741994321346283, + "learning_rate": 1.1164900122299227e-05, + "loss": 3.9098, + "step": 104640 + }, + { + "epoch": 7.110001358880282, + "grad_norm": 0.8816014528274536, + "learning_rate": 1.1160653621415953e-05, + "loss": 3.6763, + "step": 104645 + }, + { + "epoch": 7.1103410789509445, + "grad_norm": 0.18108199536800385, + "learning_rate": 1.1156407120532681e-05, + "loss": 3.9266, + "step": 104650 + }, + { + "epoch": 7.1106807990216065, + "grad_norm": 0.1861584633588791, + "learning_rate": 1.115216061964941e-05, + "loss": 3.739, + "step": 104655 + }, + { + "epoch": 7.111020519092268, + "grad_norm": 0.1680506467819214, + "learning_rate": 1.1147914118766137e-05, + "loss": 3.8587, + "step": 104660 + }, + { + "epoch": 7.11136023916293, + "grad_norm": 0.22341154515743256, + "learning_rate": 1.1143667617882865e-05, + "loss": 3.9208, + "step": 104665 + }, + { + "epoch": 7.111699959233592, + "grad_norm": 0.15925583243370056, + "learning_rate": 1.1139421116999593e-05, + "loss": 4.0815, + "step": 104670 + }, + { + "epoch": 7.112039679304253, + "grad_norm": 0.2795785665512085, + "learning_rate": 1.113517461611632e-05, + "loss": 3.9918, + "step": 104675 + }, + { + "epoch": 7.112379399374915, + "grad_norm": 0.17400087416172028, + "learning_rate": 1.1130928115233048e-05, + "loss": 3.9658, + "step": 104680 + }, + { + "epoch": 7.112719119445577, + "grad_norm": 0.49358871579170227, + "learning_rate": 1.1126681614349777e-05, + "loss": 3.9237, + "step": 104685 + }, + { + "epoch": 7.113058839516238, + "grad_norm": 0.16662295162677765, + "learning_rate": 1.1122435113466505e-05, + "loss": 3.7922, + "step": 104690 + }, + { + "epoch": 7.1133985595869005, + "grad_norm": 0.1850639283657074, + "learning_rate": 1.1118188612583232e-05, + "loss": 3.9321, + "step": 104695 + }, + { + "epoch": 7.113738279657563, + "grad_norm": 0.16997836530208588, + "learning_rate": 1.111394211169996e-05, + "loss": 3.7054, + "step": 104700 + }, + { + "epoch": 7.114077999728224, + "grad_norm": 0.18537187576293945, + "learning_rate": 1.1109695610816688e-05, + "loss": 3.9129, + "step": 104705 + }, + { + "epoch": 7.114417719798886, + "grad_norm": 0.6107091903686523, + "learning_rate": 1.1105449109933416e-05, + "loss": 3.8952, + "step": 104710 + }, + { + "epoch": 7.114757439869548, + "grad_norm": 0.14223439991474152, + "learning_rate": 1.1101202609050144e-05, + "loss": 3.955, + "step": 104715 + }, + { + "epoch": 7.115097159940209, + "grad_norm": 0.15296873450279236, + "learning_rate": 1.1096956108166872e-05, + "loss": 3.7624, + "step": 104720 + }, + { + "epoch": 7.115436880010871, + "grad_norm": 0.1875135749578476, + "learning_rate": 1.10927096072836e-05, + "loss": 3.6958, + "step": 104725 + }, + { + "epoch": 7.115776600081533, + "grad_norm": 0.16378141939640045, + "learning_rate": 1.1088463106400326e-05, + "loss": 3.8963, + "step": 104730 + }, + { + "epoch": 7.116116320152194, + "grad_norm": 0.18460461497306824, + "learning_rate": 1.1084216605517054e-05, + "loss": 3.7871, + "step": 104735 + }, + { + "epoch": 7.1164560402228565, + "grad_norm": 0.20359627902507782, + "learning_rate": 1.1079970104633782e-05, + "loss": 3.8924, + "step": 104740 + }, + { + "epoch": 7.116795760293519, + "grad_norm": 0.2503351867198944, + "learning_rate": 1.107572360375051e-05, + "loss": 3.7718, + "step": 104745 + }, + { + "epoch": 7.11713548036418, + "grad_norm": 0.18961785733699799, + "learning_rate": 1.1071477102867238e-05, + "loss": 3.9141, + "step": 104750 + }, + { + "epoch": 7.117475200434842, + "grad_norm": 0.14485454559326172, + "learning_rate": 1.1067230601983966e-05, + "loss": 3.8517, + "step": 104755 + }, + { + "epoch": 7.117814920505504, + "grad_norm": 0.8076295256614685, + "learning_rate": 1.1062984101100692e-05, + "loss": 3.749, + "step": 104760 + }, + { + "epoch": 7.118154640576165, + "grad_norm": 0.13640567660331726, + "learning_rate": 1.105873760021742e-05, + "loss": 3.8354, + "step": 104765 + }, + { + "epoch": 7.118494360646827, + "grad_norm": 0.14845503866672516, + "learning_rate": 1.105449109933415e-05, + "loss": 3.9395, + "step": 104770 + }, + { + "epoch": 7.118834080717488, + "grad_norm": 0.14844129979610443, + "learning_rate": 1.1050244598450878e-05, + "loss": 3.9418, + "step": 104775 + }, + { + "epoch": 7.11917380078815, + "grad_norm": 0.16658082604408264, + "learning_rate": 1.1045998097567604e-05, + "loss": 3.9154, + "step": 104780 + }, + { + "epoch": 7.1195135208588125, + "grad_norm": 0.16532079875469208, + "learning_rate": 1.1041751596684332e-05, + "loss": 3.8049, + "step": 104785 + }, + { + "epoch": 7.119853240929474, + "grad_norm": 0.15857280790805817, + "learning_rate": 1.103750509580106e-05, + "loss": 3.6988, + "step": 104790 + }, + { + "epoch": 7.120192961000136, + "grad_norm": 0.1435684859752655, + "learning_rate": 1.1033258594917788e-05, + "loss": 3.7829, + "step": 104795 + }, + { + "epoch": 7.120532681070798, + "grad_norm": 0.18879742920398712, + "learning_rate": 1.1029012094034516e-05, + "loss": 3.8642, + "step": 104800 + }, + { + "epoch": 7.120872401141459, + "grad_norm": 0.14973875880241394, + "learning_rate": 1.1024765593151244e-05, + "loss": 4.0129, + "step": 104805 + }, + { + "epoch": 7.121212121212121, + "grad_norm": 0.40718159079551697, + "learning_rate": 1.1020519092267972e-05, + "loss": 3.6529, + "step": 104810 + }, + { + "epoch": 7.121551841282783, + "grad_norm": 0.17917302250862122, + "learning_rate": 1.1016272591384699e-05, + "loss": 4.0672, + "step": 104815 + }, + { + "epoch": 7.121891561353444, + "grad_norm": 0.20989301800727844, + "learning_rate": 1.1012026090501427e-05, + "loss": 3.6943, + "step": 104820 + }, + { + "epoch": 7.122231281424106, + "grad_norm": 0.18580807745456696, + "learning_rate": 1.1007779589618156e-05, + "loss": 3.747, + "step": 104825 + }, + { + "epoch": 7.1225710014947685, + "grad_norm": 0.18297401070594788, + "learning_rate": 1.1003533088734883e-05, + "loss": 3.8289, + "step": 104830 + }, + { + "epoch": 7.12291072156543, + "grad_norm": 0.17501866817474365, + "learning_rate": 1.099928658785161e-05, + "loss": 4.1576, + "step": 104835 + }, + { + "epoch": 7.123250441636092, + "grad_norm": 0.17979498207569122, + "learning_rate": 1.0995040086968339e-05, + "loss": 3.7996, + "step": 104840 + }, + { + "epoch": 7.123590161706754, + "grad_norm": 0.15652774274349213, + "learning_rate": 1.0990793586085065e-05, + "loss": 3.7751, + "step": 104845 + }, + { + "epoch": 7.123929881777415, + "grad_norm": 0.14763563871383667, + "learning_rate": 1.0986547085201795e-05, + "loss": 3.9134, + "step": 104850 + }, + { + "epoch": 7.124269601848077, + "grad_norm": 0.17088522017002106, + "learning_rate": 1.0982300584318523e-05, + "loss": 3.7663, + "step": 104855 + }, + { + "epoch": 7.124609321918739, + "grad_norm": 0.1983380764722824, + "learning_rate": 1.097805408343525e-05, + "loss": 3.719, + "step": 104860 + }, + { + "epoch": 7.1249490419894, + "grad_norm": 0.1573651283979416, + "learning_rate": 1.0973807582551977e-05, + "loss": 3.7906, + "step": 104865 + }, + { + "epoch": 7.1252887620600625, + "grad_norm": 0.1985694020986557, + "learning_rate": 1.0969561081668705e-05, + "loss": 3.825, + "step": 104870 + }, + { + "epoch": 7.1256284821307245, + "grad_norm": 0.44047272205352783, + "learning_rate": 1.0965314580785433e-05, + "loss": 3.6692, + "step": 104875 + }, + { + "epoch": 7.125968202201386, + "grad_norm": 0.1695411652326584, + "learning_rate": 1.0961068079902161e-05, + "loss": 3.7841, + "step": 104880 + }, + { + "epoch": 7.126307922272048, + "grad_norm": 0.19676071405410767, + "learning_rate": 1.0956821579018889e-05, + "loss": 4.0298, + "step": 104885 + }, + { + "epoch": 7.12664764234271, + "grad_norm": 0.17684786021709442, + "learning_rate": 1.0952575078135617e-05, + "loss": 3.9686, + "step": 104890 + }, + { + "epoch": 7.126987362413371, + "grad_norm": 0.1653537005186081, + "learning_rate": 1.0948328577252345e-05, + "loss": 3.5703, + "step": 104895 + }, + { + "epoch": 7.127327082484033, + "grad_norm": 0.18006892502307892, + "learning_rate": 1.0944082076369071e-05, + "loss": 3.876, + "step": 104900 + }, + { + "epoch": 7.127666802554695, + "grad_norm": 0.17584924399852753, + "learning_rate": 1.0939835575485801e-05, + "loss": 4.0225, + "step": 104905 + }, + { + "epoch": 7.128006522625356, + "grad_norm": 0.1559402346611023, + "learning_rate": 1.093558907460253e-05, + "loss": 3.7508, + "step": 104910 + }, + { + "epoch": 7.1283462426960185, + "grad_norm": 0.9431583285331726, + "learning_rate": 1.0931342573719256e-05, + "loss": 3.9115, + "step": 104915 + }, + { + "epoch": 7.1286859627666805, + "grad_norm": 0.17803248763084412, + "learning_rate": 1.0927096072835984e-05, + "loss": 4.0093, + "step": 104920 + }, + { + "epoch": 7.129025682837342, + "grad_norm": 0.17776356637477875, + "learning_rate": 1.0922849571952712e-05, + "loss": 3.8447, + "step": 104925 + }, + { + "epoch": 7.129365402908004, + "grad_norm": 0.1683243364095688, + "learning_rate": 1.091860307106944e-05, + "loss": 3.6448, + "step": 104930 + }, + { + "epoch": 7.129705122978666, + "grad_norm": 0.1745576411485672, + "learning_rate": 1.0914356570186168e-05, + "loss": 3.8249, + "step": 104935 + }, + { + "epoch": 7.130044843049327, + "grad_norm": 0.17276518046855927, + "learning_rate": 1.0910110069302896e-05, + "loss": 3.8156, + "step": 104940 + }, + { + "epoch": 7.130384563119989, + "grad_norm": 0.24654018878936768, + "learning_rate": 1.0905863568419624e-05, + "loss": 3.9585, + "step": 104945 + }, + { + "epoch": 7.130724283190651, + "grad_norm": 0.18602538108825684, + "learning_rate": 1.090161706753635e-05, + "loss": 3.7335, + "step": 104950 + }, + { + "epoch": 7.131064003261312, + "grad_norm": 0.22763322293758392, + "learning_rate": 1.0897370566653078e-05, + "loss": 3.8536, + "step": 104955 + }, + { + "epoch": 7.1314037233319745, + "grad_norm": 0.16051426529884338, + "learning_rate": 1.0893124065769806e-05, + "loss": 3.7325, + "step": 104960 + }, + { + "epoch": 7.1317434434026366, + "grad_norm": 0.4836886525154114, + "learning_rate": 1.0888877564886534e-05, + "loss": 3.8546, + "step": 104965 + }, + { + "epoch": 7.132083163473298, + "grad_norm": 0.156094491481781, + "learning_rate": 1.0884631064003262e-05, + "loss": 3.7589, + "step": 104970 + }, + { + "epoch": 7.13242288354396, + "grad_norm": 0.17427973449230194, + "learning_rate": 1.088038456311999e-05, + "loss": 3.9387, + "step": 104975 + }, + { + "epoch": 7.132762603614622, + "grad_norm": 0.12840476632118225, + "learning_rate": 1.0876138062236716e-05, + "loss": 3.8469, + "step": 104980 + }, + { + "epoch": 7.133102323685283, + "grad_norm": 0.17204159498214722, + "learning_rate": 1.0871891561353444e-05, + "loss": 3.7069, + "step": 104985 + }, + { + "epoch": 7.133442043755945, + "grad_norm": 0.3980136811733246, + "learning_rate": 1.0867645060470174e-05, + "loss": 3.8692, + "step": 104990 + }, + { + "epoch": 7.133781763826607, + "grad_norm": 0.131425142288208, + "learning_rate": 1.0863398559586902e-05, + "loss": 3.8192, + "step": 104995 + }, + { + "epoch": 7.134121483897268, + "grad_norm": 0.21519066393375397, + "learning_rate": 1.0859152058703628e-05, + "loss": 3.768, + "step": 105000 + }, + { + "epoch": 7.1344612039679305, + "grad_norm": 0.13629478216171265, + "learning_rate": 1.0854905557820356e-05, + "loss": 3.7739, + "step": 105005 + }, + { + "epoch": 7.134800924038593, + "grad_norm": 0.21654270589351654, + "learning_rate": 1.0850659056937084e-05, + "loss": 4.0051, + "step": 105010 + }, + { + "epoch": 7.135140644109254, + "grad_norm": 0.12195278704166412, + "learning_rate": 1.0846412556053812e-05, + "loss": 3.8124, + "step": 105015 + }, + { + "epoch": 7.135480364179916, + "grad_norm": 0.178483784198761, + "learning_rate": 1.084216605517054e-05, + "loss": 3.7024, + "step": 105020 + }, + { + "epoch": 7.135820084250578, + "grad_norm": 0.17340338230133057, + "learning_rate": 1.0837919554287268e-05, + "loss": 3.7906, + "step": 105025 + }, + { + "epoch": 7.136159804321239, + "grad_norm": 0.2116081863641739, + "learning_rate": 1.0833673053403996e-05, + "loss": 4.0014, + "step": 105030 + }, + { + "epoch": 7.136499524391901, + "grad_norm": 0.1383853703737259, + "learning_rate": 1.0829426552520723e-05, + "loss": 3.7404, + "step": 105035 + }, + { + "epoch": 7.136839244462563, + "grad_norm": 0.21644334495067596, + "learning_rate": 1.082518005163745e-05, + "loss": 3.9044, + "step": 105040 + }, + { + "epoch": 7.137178964533224, + "grad_norm": 0.13124394416809082, + "learning_rate": 1.082093355075418e-05, + "loss": 3.9074, + "step": 105045 + }, + { + "epoch": 7.1375186846038865, + "grad_norm": 0.1625797599554062, + "learning_rate": 1.0816687049870907e-05, + "loss": 4.0805, + "step": 105050 + }, + { + "epoch": 7.137858404674549, + "grad_norm": 0.18100491166114807, + "learning_rate": 1.0812440548987635e-05, + "loss": 3.9006, + "step": 105055 + }, + { + "epoch": 7.13819812474521, + "grad_norm": 0.5069060325622559, + "learning_rate": 1.0808194048104363e-05, + "loss": 3.9305, + "step": 105060 + }, + { + "epoch": 7.138537844815872, + "grad_norm": 0.180201917886734, + "learning_rate": 1.0803947547221089e-05, + "loss": 3.726, + "step": 105065 + }, + { + "epoch": 7.138877564886534, + "grad_norm": 0.23087596893310547, + "learning_rate": 1.0799701046337819e-05, + "loss": 3.7141, + "step": 105070 + }, + { + "epoch": 7.139217284957195, + "grad_norm": 0.41485393047332764, + "learning_rate": 1.0795454545454547e-05, + "loss": 3.9398, + "step": 105075 + }, + { + "epoch": 7.139557005027857, + "grad_norm": 0.18099859356880188, + "learning_rate": 1.0791208044571275e-05, + "loss": 3.5691, + "step": 105080 + }, + { + "epoch": 7.139896725098519, + "grad_norm": 0.2162114828824997, + "learning_rate": 1.0786961543688001e-05, + "loss": 3.5194, + "step": 105085 + }, + { + "epoch": 7.14023644516918, + "grad_norm": 0.19367723166942596, + "learning_rate": 1.0782715042804729e-05, + "loss": 3.9313, + "step": 105090 + }, + { + "epoch": 7.1405761652398425, + "grad_norm": 0.7791758179664612, + "learning_rate": 1.0778468541921457e-05, + "loss": 3.6443, + "step": 105095 + }, + { + "epoch": 7.140915885310504, + "grad_norm": 0.16970881819725037, + "learning_rate": 1.0774222041038185e-05, + "loss": 4.2537, + "step": 105100 + }, + { + "epoch": 7.141255605381166, + "grad_norm": 0.23475424945354462, + "learning_rate": 1.0769975540154913e-05, + "loss": 3.8304, + "step": 105105 + }, + { + "epoch": 7.141595325451828, + "grad_norm": 0.16099248826503754, + "learning_rate": 1.0765729039271641e-05, + "loss": 3.8918, + "step": 105110 + }, + { + "epoch": 7.141935045522489, + "grad_norm": 0.17025332152843475, + "learning_rate": 1.0761482538388369e-05, + "loss": 3.731, + "step": 105115 + }, + { + "epoch": 7.142274765593151, + "grad_norm": 0.17315180599689484, + "learning_rate": 1.0757236037505095e-05, + "loss": 3.8543, + "step": 105120 + }, + { + "epoch": 7.142614485663813, + "grad_norm": 0.1392555981874466, + "learning_rate": 1.0752989536621823e-05, + "loss": 3.8737, + "step": 105125 + }, + { + "epoch": 7.142954205734474, + "grad_norm": 0.17633004486560822, + "learning_rate": 1.0748743035738553e-05, + "loss": 3.8596, + "step": 105130 + }, + { + "epoch": 7.1432939258051364, + "grad_norm": 0.2233087718486786, + "learning_rate": 1.074449653485528e-05, + "loss": 3.7097, + "step": 105135 + }, + { + "epoch": 7.1436336458757985, + "grad_norm": 0.1938527524471283, + "learning_rate": 1.0740250033972007e-05, + "loss": 3.8644, + "step": 105140 + }, + { + "epoch": 7.14397336594646, + "grad_norm": 0.21272389590740204, + "learning_rate": 1.0736003533088735e-05, + "loss": 3.6489, + "step": 105145 + }, + { + "epoch": 7.144313086017122, + "grad_norm": 0.42192864418029785, + "learning_rate": 1.0731757032205462e-05, + "loss": 3.6917, + "step": 105150 + }, + { + "epoch": 7.144652806087784, + "grad_norm": 0.15946421027183533, + "learning_rate": 1.0727510531322191e-05, + "loss": 3.9659, + "step": 105155 + }, + { + "epoch": 7.144992526158445, + "grad_norm": 0.18751901388168335, + "learning_rate": 1.072326403043892e-05, + "loss": 3.9161, + "step": 105160 + }, + { + "epoch": 7.145332246229107, + "grad_norm": 0.1689910739660263, + "learning_rate": 1.0719017529555647e-05, + "loss": 3.8713, + "step": 105165 + }, + { + "epoch": 7.145671966299769, + "grad_norm": 0.127254918217659, + "learning_rate": 1.0714771028672374e-05, + "loss": 3.9627, + "step": 105170 + }, + { + "epoch": 7.14601168637043, + "grad_norm": 0.16376721858978271, + "learning_rate": 1.0710524527789102e-05, + "loss": 3.9249, + "step": 105175 + }, + { + "epoch": 7.1463514064410925, + "grad_norm": 0.17732569575309753, + "learning_rate": 1.070627802690583e-05, + "loss": 3.8043, + "step": 105180 + }, + { + "epoch": 7.1466911265117545, + "grad_norm": 0.2089211791753769, + "learning_rate": 1.0702031526022558e-05, + "loss": 3.7178, + "step": 105185 + }, + { + "epoch": 7.147030846582416, + "grad_norm": 0.1776828020811081, + "learning_rate": 1.0697785025139286e-05, + "loss": 3.9385, + "step": 105190 + }, + { + "epoch": 7.147370566653078, + "grad_norm": 0.1811666190624237, + "learning_rate": 1.0693538524256014e-05, + "loss": 3.9339, + "step": 105195 + }, + { + "epoch": 7.14771028672374, + "grad_norm": 0.2117975354194641, + "learning_rate": 1.0689292023372742e-05, + "loss": 3.6805, + "step": 105200 + }, + { + "epoch": 7.148050006794401, + "grad_norm": 0.15430670976638794, + "learning_rate": 1.0685045522489468e-05, + "loss": 3.7023, + "step": 105205 + }, + { + "epoch": 7.148389726865063, + "grad_norm": 0.18929587304592133, + "learning_rate": 1.0680799021606198e-05, + "loss": 3.8156, + "step": 105210 + }, + { + "epoch": 7.148729446935725, + "grad_norm": 0.15399067103862762, + "learning_rate": 1.0676552520722926e-05, + "loss": 3.4468, + "step": 105215 + }, + { + "epoch": 7.149069167006386, + "grad_norm": 0.1733962595462799, + "learning_rate": 1.0672306019839652e-05, + "loss": 3.661, + "step": 105220 + }, + { + "epoch": 7.1494088870770485, + "grad_norm": 0.22749577462673187, + "learning_rate": 1.066805951895638e-05, + "loss": 3.9841, + "step": 105225 + }, + { + "epoch": 7.1497486071477105, + "grad_norm": 0.2201705425977707, + "learning_rate": 1.0663813018073108e-05, + "loss": 3.7709, + "step": 105230 + }, + { + "epoch": 7.150088327218372, + "grad_norm": 0.20788118243217468, + "learning_rate": 1.0659566517189836e-05, + "loss": 3.8832, + "step": 105235 + }, + { + "epoch": 7.150428047289034, + "grad_norm": 0.18605753779411316, + "learning_rate": 1.0655320016306564e-05, + "loss": 3.749, + "step": 105240 + }, + { + "epoch": 7.150767767359696, + "grad_norm": 0.1514264941215515, + "learning_rate": 1.0651073515423292e-05, + "loss": 4.0195, + "step": 105245 + }, + { + "epoch": 7.151107487430357, + "grad_norm": 0.13432341814041138, + "learning_rate": 1.064682701454002e-05, + "loss": 3.8641, + "step": 105250 + }, + { + "epoch": 7.151447207501019, + "grad_norm": 0.16402581334114075, + "learning_rate": 1.0642580513656746e-05, + "loss": 3.9724, + "step": 105255 + }, + { + "epoch": 7.151786927571681, + "grad_norm": 0.18911077082157135, + "learning_rate": 1.0638334012773474e-05, + "loss": 3.7894, + "step": 105260 + }, + { + "epoch": 7.152126647642342, + "grad_norm": 0.17676974833011627, + "learning_rate": 1.0634087511890204e-05, + "loss": 3.7831, + "step": 105265 + }, + { + "epoch": 7.1524663677130045, + "grad_norm": 0.13748499751091003, + "learning_rate": 1.062984101100693e-05, + "loss": 3.9905, + "step": 105270 + }, + { + "epoch": 7.1528060877836666, + "grad_norm": 0.19060803949832916, + "learning_rate": 1.0625594510123659e-05, + "loss": 3.866, + "step": 105275 + }, + { + "epoch": 7.153145807854328, + "grad_norm": 0.1867983490228653, + "learning_rate": 1.0621348009240387e-05, + "loss": 3.7256, + "step": 105280 + }, + { + "epoch": 7.15348552792499, + "grad_norm": 0.22965988516807556, + "learning_rate": 1.0617101508357115e-05, + "loss": 3.8218, + "step": 105285 + }, + { + "epoch": 7.153825247995652, + "grad_norm": 0.2521573305130005, + "learning_rate": 1.0612855007473843e-05, + "loss": 3.8825, + "step": 105290 + }, + { + "epoch": 7.154164968066313, + "grad_norm": 0.1756366640329361, + "learning_rate": 1.060860850659057e-05, + "loss": 3.8716, + "step": 105295 + }, + { + "epoch": 7.154504688136975, + "grad_norm": 0.137146458029747, + "learning_rate": 1.0604362005707299e-05, + "loss": 3.9662, + "step": 105300 + }, + { + "epoch": 7.154844408207637, + "grad_norm": 1.0301926136016846, + "learning_rate": 1.0600115504824025e-05, + "loss": 3.7643, + "step": 105305 + }, + { + "epoch": 7.155184128278298, + "grad_norm": 0.19828900694847107, + "learning_rate": 1.0595869003940753e-05, + "loss": 4.0452, + "step": 105310 + }, + { + "epoch": 7.1555238483489605, + "grad_norm": 0.2739354372024536, + "learning_rate": 1.0591622503057481e-05, + "loss": 3.8449, + "step": 105315 + }, + { + "epoch": 7.155863568419623, + "grad_norm": 0.17197668552398682, + "learning_rate": 1.0587376002174209e-05, + "loss": 3.8953, + "step": 105320 + }, + { + "epoch": 7.156203288490284, + "grad_norm": 0.15392367541790009, + "learning_rate": 1.0583129501290937e-05, + "loss": 3.8542, + "step": 105325 + }, + { + "epoch": 7.156543008560946, + "grad_norm": 0.2557527422904968, + "learning_rate": 1.0578883000407665e-05, + "loss": 3.8323, + "step": 105330 + }, + { + "epoch": 7.156882728631608, + "grad_norm": 0.15209369361400604, + "learning_rate": 1.0574636499524393e-05, + "loss": 3.8237, + "step": 105335 + }, + { + "epoch": 7.157222448702269, + "grad_norm": 0.1660553365945816, + "learning_rate": 1.057038999864112e-05, + "loss": 3.6392, + "step": 105340 + }, + { + "epoch": 7.157562168772931, + "grad_norm": 4.310255527496338, + "learning_rate": 1.0566143497757847e-05, + "loss": 3.7752, + "step": 105345 + }, + { + "epoch": 7.157901888843593, + "grad_norm": 0.1602918654680252, + "learning_rate": 1.0561896996874577e-05, + "loss": 3.826, + "step": 105350 + }, + { + "epoch": 7.158241608914254, + "grad_norm": 0.4569954574108124, + "learning_rate": 1.0557650495991303e-05, + "loss": 3.8407, + "step": 105355 + }, + { + "epoch": 7.1585813289849165, + "grad_norm": 0.18978922069072723, + "learning_rate": 1.0553403995108031e-05, + "loss": 3.8632, + "step": 105360 + }, + { + "epoch": 7.158921049055579, + "grad_norm": 0.13927701115608215, + "learning_rate": 1.054915749422476e-05, + "loss": 3.7809, + "step": 105365 + }, + { + "epoch": 7.15926076912624, + "grad_norm": 0.18773385882377625, + "learning_rate": 1.0544910993341487e-05, + "loss": 3.8249, + "step": 105370 + }, + { + "epoch": 7.159600489196902, + "grad_norm": 0.1812036633491516, + "learning_rate": 1.0540664492458215e-05, + "loss": 3.8937, + "step": 105375 + }, + { + "epoch": 7.159940209267564, + "grad_norm": 0.32784679532051086, + "learning_rate": 1.0536417991574943e-05, + "loss": 4.1018, + "step": 105380 + }, + { + "epoch": 7.160279929338225, + "grad_norm": 0.24117816984653473, + "learning_rate": 1.0532171490691671e-05, + "loss": 4.0474, + "step": 105385 + }, + { + "epoch": 7.160619649408887, + "grad_norm": 0.1523396074771881, + "learning_rate": 1.0527924989808398e-05, + "loss": 3.9382, + "step": 105390 + }, + { + "epoch": 7.160959369479549, + "grad_norm": 0.2034813016653061, + "learning_rate": 1.0523678488925126e-05, + "loss": 3.7921, + "step": 105395 + }, + { + "epoch": 7.16129908955021, + "grad_norm": 0.1879977136850357, + "learning_rate": 1.0519431988041854e-05, + "loss": 3.7568, + "step": 105400 + }, + { + "epoch": 7.1616388096208725, + "grad_norm": 1.4475798606872559, + "learning_rate": 1.0515185487158582e-05, + "loss": 3.6611, + "step": 105405 + }, + { + "epoch": 7.161978529691535, + "grad_norm": 0.22410406172275543, + "learning_rate": 1.051093898627531e-05, + "loss": 3.9033, + "step": 105410 + }, + { + "epoch": 7.162318249762196, + "grad_norm": 0.21385084092617035, + "learning_rate": 1.0506692485392038e-05, + "loss": 3.7552, + "step": 105415 + }, + { + "epoch": 7.162657969832858, + "grad_norm": 0.2432468980550766, + "learning_rate": 1.0502445984508766e-05, + "loss": 3.7167, + "step": 105420 + }, + { + "epoch": 7.16299768990352, + "grad_norm": 0.2533477544784546, + "learning_rate": 1.0498199483625492e-05, + "loss": 3.8338, + "step": 105425 + }, + { + "epoch": 7.163337409974181, + "grad_norm": 0.17815421521663666, + "learning_rate": 1.0493952982742222e-05, + "loss": 3.9824, + "step": 105430 + }, + { + "epoch": 7.163677130044843, + "grad_norm": 0.1565713733434677, + "learning_rate": 1.048970648185895e-05, + "loss": 3.7125, + "step": 105435 + }, + { + "epoch": 7.164016850115505, + "grad_norm": 0.19995199143886566, + "learning_rate": 1.0485459980975676e-05, + "loss": 4.1344, + "step": 105440 + }, + { + "epoch": 7.1643565701861665, + "grad_norm": 0.1945396363735199, + "learning_rate": 1.0481213480092404e-05, + "loss": 3.7473, + "step": 105445 + }, + { + "epoch": 7.1646962902568285, + "grad_norm": 0.16158965229988098, + "learning_rate": 1.0476966979209132e-05, + "loss": 3.7976, + "step": 105450 + }, + { + "epoch": 7.165036010327491, + "grad_norm": 0.1690213531255722, + "learning_rate": 1.047272047832586e-05, + "loss": 3.9677, + "step": 105455 + }, + { + "epoch": 7.165375730398152, + "grad_norm": 0.1676163673400879, + "learning_rate": 1.0468473977442588e-05, + "loss": 3.7396, + "step": 105460 + }, + { + "epoch": 7.165715450468814, + "grad_norm": 0.18150442838668823, + "learning_rate": 1.0464227476559316e-05, + "loss": 3.5439, + "step": 105465 + }, + { + "epoch": 7.166055170539475, + "grad_norm": 0.20614469051361084, + "learning_rate": 1.0459980975676044e-05, + "loss": 3.7688, + "step": 105470 + }, + { + "epoch": 7.166394890610137, + "grad_norm": 0.6823341250419617, + "learning_rate": 1.045573447479277e-05, + "loss": 3.853, + "step": 105475 + }, + { + "epoch": 7.166734610680799, + "grad_norm": 0.18142069876194, + "learning_rate": 1.0451487973909498e-05, + "loss": 3.7981, + "step": 105480 + }, + { + "epoch": 7.16707433075146, + "grad_norm": 0.1793074607849121, + "learning_rate": 1.0447241473026226e-05, + "loss": 3.681, + "step": 105485 + }, + { + "epoch": 7.1674140508221225, + "grad_norm": 0.15593750774860382, + "learning_rate": 1.0442994972142954e-05, + "loss": 3.6832, + "step": 105490 + }, + { + "epoch": 7.1677537708927845, + "grad_norm": 0.160322904586792, + "learning_rate": 1.0438748471259682e-05, + "loss": 4.0308, + "step": 105495 + }, + { + "epoch": 7.168093490963446, + "grad_norm": 7.487135887145996, + "learning_rate": 1.043450197037641e-05, + "loss": 4.0341, + "step": 105500 + }, + { + "epoch": 7.168433211034108, + "grad_norm": 0.2266218066215515, + "learning_rate": 1.0430255469493138e-05, + "loss": 3.7987, + "step": 105505 + }, + { + "epoch": 7.16877293110477, + "grad_norm": 0.21291887760162354, + "learning_rate": 1.0426008968609865e-05, + "loss": 3.7232, + "step": 105510 + }, + { + "epoch": 7.169112651175431, + "grad_norm": 0.19459211826324463, + "learning_rate": 1.0421762467726594e-05, + "loss": 3.9042, + "step": 105515 + }, + { + "epoch": 7.169452371246093, + "grad_norm": 0.17308743298053741, + "learning_rate": 1.0417515966843322e-05, + "loss": 3.6957, + "step": 105520 + }, + { + "epoch": 7.169792091316755, + "grad_norm": 0.20165714621543884, + "learning_rate": 1.0413269465960049e-05, + "loss": 4.0003, + "step": 105525 + }, + { + "epoch": 7.170131811387416, + "grad_norm": 0.18009023368358612, + "learning_rate": 1.0409022965076777e-05, + "loss": 3.8013, + "step": 105530 + }, + { + "epoch": 7.1704715314580785, + "grad_norm": 0.14799931645393372, + "learning_rate": 1.0404776464193505e-05, + "loss": 3.7473, + "step": 105535 + }, + { + "epoch": 7.1708112515287405, + "grad_norm": 0.19906052947044373, + "learning_rate": 1.0400529963310233e-05, + "loss": 3.8108, + "step": 105540 + }, + { + "epoch": 7.171150971599402, + "grad_norm": 0.2006111592054367, + "learning_rate": 1.039628346242696e-05, + "loss": 4.0051, + "step": 105545 + }, + { + "epoch": 7.171490691670064, + "grad_norm": 0.16491276025772095, + "learning_rate": 1.0392036961543689e-05, + "loss": 3.9407, + "step": 105550 + }, + { + "epoch": 7.171830411740726, + "grad_norm": 0.3006743788719177, + "learning_rate": 1.0387790460660417e-05, + "loss": 3.7057, + "step": 105555 + }, + { + "epoch": 7.172170131811387, + "grad_norm": 0.300719290971756, + "learning_rate": 1.0383543959777143e-05, + "loss": 3.8842, + "step": 105560 + }, + { + "epoch": 7.172509851882049, + "grad_norm": 0.18009167909622192, + "learning_rate": 1.0379297458893871e-05, + "loss": 3.8911, + "step": 105565 + }, + { + "epoch": 7.172849571952711, + "grad_norm": 0.15522456169128418, + "learning_rate": 1.03750509580106e-05, + "loss": 3.8677, + "step": 105570 + }, + { + "epoch": 7.173189292023372, + "grad_norm": 0.17518427968025208, + "learning_rate": 1.0370804457127327e-05, + "loss": 3.6277, + "step": 105575 + }, + { + "epoch": 7.1735290120940345, + "grad_norm": 0.9680127501487732, + "learning_rate": 1.0366557956244055e-05, + "loss": 3.7106, + "step": 105580 + }, + { + "epoch": 7.173868732164697, + "grad_norm": 0.24620042741298676, + "learning_rate": 1.0362311455360783e-05, + "loss": 4.0083, + "step": 105585 + }, + { + "epoch": 7.174208452235358, + "grad_norm": 0.40174633264541626, + "learning_rate": 1.0358064954477511e-05, + "loss": 3.7434, + "step": 105590 + }, + { + "epoch": 7.17454817230602, + "grad_norm": 0.21484537422657013, + "learning_rate": 1.0353818453594239e-05, + "loss": 3.963, + "step": 105595 + }, + { + "epoch": 7.174887892376682, + "grad_norm": 0.1843382716178894, + "learning_rate": 1.0349571952710967e-05, + "loss": 3.8093, + "step": 105600 + }, + { + "epoch": 7.175227612447343, + "grad_norm": 0.1731215864419937, + "learning_rate": 1.0345325451827695e-05, + "loss": 4.1503, + "step": 105605 + }, + { + "epoch": 7.175567332518005, + "grad_norm": 0.1783362478017807, + "learning_rate": 1.0341078950944421e-05, + "loss": 3.9617, + "step": 105610 + }, + { + "epoch": 7.175907052588667, + "grad_norm": 0.16517256200313568, + "learning_rate": 1.033683245006115e-05, + "loss": 3.8313, + "step": 105615 + }, + { + "epoch": 7.176246772659328, + "grad_norm": 0.1424359530210495, + "learning_rate": 1.0332585949177877e-05, + "loss": 3.6992, + "step": 105620 + }, + { + "epoch": 7.1765864927299905, + "grad_norm": 0.16656075417995453, + "learning_rate": 1.0328339448294607e-05, + "loss": 3.5112, + "step": 105625 + }, + { + "epoch": 7.176926212800653, + "grad_norm": 0.15514439344406128, + "learning_rate": 1.0324092947411333e-05, + "loss": 3.9051, + "step": 105630 + }, + { + "epoch": 7.177265932871314, + "grad_norm": 0.20790766179561615, + "learning_rate": 1.0319846446528062e-05, + "loss": 3.8827, + "step": 105635 + }, + { + "epoch": 7.177605652941976, + "grad_norm": 0.23827502131462097, + "learning_rate": 1.031559994564479e-05, + "loss": 3.928, + "step": 105640 + }, + { + "epoch": 7.177945373012638, + "grad_norm": 0.2039938122034073, + "learning_rate": 1.0311353444761516e-05, + "loss": 3.66, + "step": 105645 + }, + { + "epoch": 7.178285093083299, + "grad_norm": 0.207246333360672, + "learning_rate": 1.0307106943878246e-05, + "loss": 4.27, + "step": 105650 + }, + { + "epoch": 7.178624813153961, + "grad_norm": 0.13681672513484955, + "learning_rate": 1.0302860442994974e-05, + "loss": 3.7769, + "step": 105655 + }, + { + "epoch": 7.178964533224623, + "grad_norm": 0.1352943480014801, + "learning_rate": 1.02986139421117e-05, + "loss": 3.7954, + "step": 105660 + }, + { + "epoch": 7.179304253295284, + "grad_norm": 0.1808667629957199, + "learning_rate": 1.0294367441228428e-05, + "loss": 3.811, + "step": 105665 + }, + { + "epoch": 7.1796439733659465, + "grad_norm": 0.16035109758377075, + "learning_rate": 1.0290120940345156e-05, + "loss": 3.7032, + "step": 105670 + }, + { + "epoch": 7.179983693436609, + "grad_norm": 0.17603592574596405, + "learning_rate": 1.0285874439461884e-05, + "loss": 3.8752, + "step": 105675 + }, + { + "epoch": 7.18032341350727, + "grad_norm": 0.13096971809864044, + "learning_rate": 1.0281627938578612e-05, + "loss": 3.8071, + "step": 105680 + }, + { + "epoch": 7.180663133577932, + "grad_norm": 0.14562731981277466, + "learning_rate": 1.027738143769534e-05, + "loss": 3.9744, + "step": 105685 + }, + { + "epoch": 7.181002853648594, + "grad_norm": 0.200050488114357, + "learning_rate": 1.0273134936812068e-05, + "loss": 4.2508, + "step": 105690 + }, + { + "epoch": 7.181342573719255, + "grad_norm": 0.5283013582229614, + "learning_rate": 1.0268888435928794e-05, + "loss": 3.742, + "step": 105695 + }, + { + "epoch": 7.181682293789917, + "grad_norm": 0.2630375623703003, + "learning_rate": 1.0264641935045522e-05, + "loss": 4.016, + "step": 105700 + }, + { + "epoch": 7.182022013860579, + "grad_norm": 0.3464825451374054, + "learning_rate": 1.026039543416225e-05, + "loss": 3.5678, + "step": 105705 + }, + { + "epoch": 7.18236173393124, + "grad_norm": 0.23744118213653564, + "learning_rate": 1.025614893327898e-05, + "loss": 3.7146, + "step": 105710 + }, + { + "epoch": 7.1827014540019025, + "grad_norm": 0.20988459885120392, + "learning_rate": 1.0251902432395706e-05, + "loss": 3.9044, + "step": 105715 + }, + { + "epoch": 7.183041174072565, + "grad_norm": 0.20676498115062714, + "learning_rate": 1.0247655931512434e-05, + "loss": 3.8072, + "step": 105720 + }, + { + "epoch": 7.183380894143226, + "grad_norm": 0.16856448352336884, + "learning_rate": 1.0243409430629162e-05, + "loss": 3.78, + "step": 105725 + }, + { + "epoch": 7.183720614213888, + "grad_norm": 0.16840185225009918, + "learning_rate": 1.0239162929745889e-05, + "loss": 3.7707, + "step": 105730 + }, + { + "epoch": 7.18406033428455, + "grad_norm": 0.18321971595287323, + "learning_rate": 1.0234916428862618e-05, + "loss": 3.6541, + "step": 105735 + }, + { + "epoch": 7.184400054355211, + "grad_norm": 0.15949507057666779, + "learning_rate": 1.0230669927979346e-05, + "loss": 3.8456, + "step": 105740 + }, + { + "epoch": 7.184739774425873, + "grad_norm": 0.65687495470047, + "learning_rate": 1.0226423427096073e-05, + "loss": 3.5806, + "step": 105745 + }, + { + "epoch": 7.185079494496535, + "grad_norm": 0.2142629623413086, + "learning_rate": 1.02221769262128e-05, + "loss": 3.6911, + "step": 105750 + }, + { + "epoch": 7.1854192145671965, + "grad_norm": 0.14859774708747864, + "learning_rate": 1.0217930425329529e-05, + "loss": 3.9489, + "step": 105755 + }, + { + "epoch": 7.1857589346378585, + "grad_norm": 0.2215563803911209, + "learning_rate": 1.0213683924446257e-05, + "loss": 3.9803, + "step": 105760 + }, + { + "epoch": 7.186098654708521, + "grad_norm": 0.20065420866012573, + "learning_rate": 1.0209437423562985e-05, + "loss": 3.8413, + "step": 105765 + }, + { + "epoch": 7.186438374779182, + "grad_norm": 0.17923907935619354, + "learning_rate": 1.0205190922679713e-05, + "loss": 3.6789, + "step": 105770 + }, + { + "epoch": 7.186778094849844, + "grad_norm": 0.16097313165664673, + "learning_rate": 1.020094442179644e-05, + "loss": 3.7754, + "step": 105775 + }, + { + "epoch": 7.187117814920505, + "grad_norm": 0.18150635063648224, + "learning_rate": 1.0196697920913167e-05, + "loss": 3.7581, + "step": 105780 + }, + { + "epoch": 7.187457534991167, + "grad_norm": 0.16062183678150177, + "learning_rate": 1.0192451420029895e-05, + "loss": 3.8178, + "step": 105785 + }, + { + "epoch": 7.187797255061829, + "grad_norm": 0.3625733554363251, + "learning_rate": 1.0188204919146625e-05, + "loss": 3.965, + "step": 105790 + }, + { + "epoch": 7.18813697513249, + "grad_norm": 0.20888420939445496, + "learning_rate": 1.0183958418263353e-05, + "loss": 3.8633, + "step": 105795 + }, + { + "epoch": 7.1884766952031525, + "grad_norm": 0.16123875975608826, + "learning_rate": 1.0179711917380079e-05, + "loss": 3.6259, + "step": 105800 + }, + { + "epoch": 7.1888164152738145, + "grad_norm": 0.18501903116703033, + "learning_rate": 1.0175465416496807e-05, + "loss": 3.7581, + "step": 105805 + }, + { + "epoch": 7.189156135344476, + "grad_norm": 0.2694545090198517, + "learning_rate": 1.0171218915613535e-05, + "loss": 3.5306, + "step": 105810 + }, + { + "epoch": 7.189495855415138, + "grad_norm": 0.16628852486610413, + "learning_rate": 1.0166972414730263e-05, + "loss": 3.934, + "step": 105815 + }, + { + "epoch": 7.1898355754858, + "grad_norm": 0.1722017526626587, + "learning_rate": 1.0162725913846991e-05, + "loss": 3.8306, + "step": 105820 + }, + { + "epoch": 7.190175295556461, + "grad_norm": 0.15979450941085815, + "learning_rate": 1.0158479412963719e-05, + "loss": 4.0, + "step": 105825 + }, + { + "epoch": 7.190515015627123, + "grad_norm": 0.16897998750209808, + "learning_rate": 1.0154232912080445e-05, + "loss": 3.9524, + "step": 105830 + }, + { + "epoch": 7.190854735697785, + "grad_norm": 0.22653385996818542, + "learning_rate": 1.0149986411197173e-05, + "loss": 3.8007, + "step": 105835 + }, + { + "epoch": 7.191194455768446, + "grad_norm": 0.1805434674024582, + "learning_rate": 1.0145739910313901e-05, + "loss": 3.7577, + "step": 105840 + }, + { + "epoch": 7.1915341758391085, + "grad_norm": 0.2387198954820633, + "learning_rate": 1.014149340943063e-05, + "loss": 3.993, + "step": 105845 + }, + { + "epoch": 7.1918738959097706, + "grad_norm": 0.8900126814842224, + "learning_rate": 1.0137246908547357e-05, + "loss": 3.8865, + "step": 105850 + }, + { + "epoch": 7.192213615980432, + "grad_norm": 0.20907674729824066, + "learning_rate": 1.0133000407664085e-05, + "loss": 3.9425, + "step": 105855 + }, + { + "epoch": 7.192553336051094, + "grad_norm": 0.1627131551504135, + "learning_rate": 1.0128753906780813e-05, + "loss": 3.7399, + "step": 105860 + }, + { + "epoch": 7.192893056121756, + "grad_norm": 0.30013400316238403, + "learning_rate": 1.012450740589754e-05, + "loss": 4.1335, + "step": 105865 + }, + { + "epoch": 7.193232776192417, + "grad_norm": 0.5024619102478027, + "learning_rate": 1.0120260905014268e-05, + "loss": 3.5907, + "step": 105870 + }, + { + "epoch": 7.193572496263079, + "grad_norm": 0.18304383754730225, + "learning_rate": 1.0116014404130997e-05, + "loss": 3.6573, + "step": 105875 + }, + { + "epoch": 7.193912216333741, + "grad_norm": 0.18100009858608246, + "learning_rate": 1.0111767903247725e-05, + "loss": 3.8199, + "step": 105880 + }, + { + "epoch": 7.194251936404402, + "grad_norm": 0.20072126388549805, + "learning_rate": 1.0107521402364452e-05, + "loss": 3.871, + "step": 105885 + }, + { + "epoch": 7.1945916564750645, + "grad_norm": 0.1979658007621765, + "learning_rate": 1.010327490148118e-05, + "loss": 3.8262, + "step": 105890 + }, + { + "epoch": 7.194931376545727, + "grad_norm": 0.20816154778003693, + "learning_rate": 1.0099028400597908e-05, + "loss": 4.0454, + "step": 105895 + }, + { + "epoch": 7.195271096616388, + "grad_norm": 0.3563940227031708, + "learning_rate": 1.0094781899714636e-05, + "loss": 4.0184, + "step": 105900 + }, + { + "epoch": 7.19561081668705, + "grad_norm": 0.3370251953601837, + "learning_rate": 1.0090535398831364e-05, + "loss": 3.7618, + "step": 105905 + }, + { + "epoch": 7.195950536757712, + "grad_norm": 0.16540458798408508, + "learning_rate": 1.0086288897948092e-05, + "loss": 3.8432, + "step": 105910 + }, + { + "epoch": 7.196290256828373, + "grad_norm": 0.1559658795595169, + "learning_rate": 1.0082042397064818e-05, + "loss": 3.8291, + "step": 105915 + }, + { + "epoch": 7.196629976899035, + "grad_norm": 0.3133120536804199, + "learning_rate": 1.0077795896181546e-05, + "loss": 3.8654, + "step": 105920 + }, + { + "epoch": 7.196969696969697, + "grad_norm": 0.7428118586540222, + "learning_rate": 1.0073549395298274e-05, + "loss": 3.8424, + "step": 105925 + }, + { + "epoch": 7.197309417040358, + "grad_norm": 0.22266428172588348, + "learning_rate": 1.0069302894415004e-05, + "loss": 3.7831, + "step": 105930 + }, + { + "epoch": 7.1976491371110205, + "grad_norm": 0.31628552079200745, + "learning_rate": 1.006505639353173e-05, + "loss": 3.8568, + "step": 105935 + }, + { + "epoch": 7.197988857181683, + "grad_norm": 0.1856027990579605, + "learning_rate": 1.0060809892648458e-05, + "loss": 4.1075, + "step": 105940 + }, + { + "epoch": 7.198328577252344, + "grad_norm": 0.16538792848587036, + "learning_rate": 1.0056563391765186e-05, + "loss": 3.7315, + "step": 105945 + }, + { + "epoch": 7.198668297323006, + "grad_norm": 0.1544959992170334, + "learning_rate": 1.0052316890881912e-05, + "loss": 4.1379, + "step": 105950 + }, + { + "epoch": 7.199008017393668, + "grad_norm": 0.18066589534282684, + "learning_rate": 1.0048070389998642e-05, + "loss": 3.7149, + "step": 105955 + }, + { + "epoch": 7.199347737464329, + "grad_norm": 0.1679314821958542, + "learning_rate": 1.004382388911537e-05, + "loss": 3.8044, + "step": 105960 + }, + { + "epoch": 7.199687457534991, + "grad_norm": 0.19709515571594238, + "learning_rate": 1.0039577388232098e-05, + "loss": 3.7215, + "step": 105965 + }, + { + "epoch": 7.200027177605653, + "grad_norm": 0.16323690116405487, + "learning_rate": 1.0035330887348824e-05, + "loss": 3.913, + "step": 105970 + }, + { + "epoch": 7.200366897676314, + "grad_norm": 0.16538505256175995, + "learning_rate": 1.0031084386465552e-05, + "loss": 3.7626, + "step": 105975 + }, + { + "epoch": 7.2007066177469765, + "grad_norm": 0.2040734440088272, + "learning_rate": 1.002683788558228e-05, + "loss": 3.8536, + "step": 105980 + }, + { + "epoch": 7.201046337817639, + "grad_norm": 0.19778338074684143, + "learning_rate": 1.0022591384699008e-05, + "loss": 3.6355, + "step": 105985 + }, + { + "epoch": 7.2013860578883, + "grad_norm": 0.22022417187690735, + "learning_rate": 1.0018344883815737e-05, + "loss": 3.8612, + "step": 105990 + }, + { + "epoch": 7.201725777958962, + "grad_norm": 0.14583712816238403, + "learning_rate": 1.0014098382932465e-05, + "loss": 3.819, + "step": 105995 + }, + { + "epoch": 7.202065498029624, + "grad_norm": 0.18466675281524658, + "learning_rate": 1.000985188204919e-05, + "loss": 3.9393, + "step": 106000 + }, + { + "epoch": 7.202405218100285, + "grad_norm": 0.1480039358139038, + "learning_rate": 1.0005605381165919e-05, + "loss": 3.8751, + "step": 106005 + }, + { + "epoch": 7.202744938170947, + "grad_norm": 0.18611964583396912, + "learning_rate": 1.0001358880282649e-05, + "loss": 3.7609, + "step": 106010 + }, + { + "epoch": 7.203084658241609, + "grad_norm": 0.17085784673690796, + "learning_rate": 9.997112379399377e-06, + "loss": 3.9871, + "step": 106015 + }, + { + "epoch": 7.2034243783122704, + "grad_norm": 0.17896397411823273, + "learning_rate": 9.992865878516103e-06, + "loss": 3.8405, + "step": 106020 + }, + { + "epoch": 7.2037640983829325, + "grad_norm": 0.16278192400932312, + "learning_rate": 9.988619377632831e-06, + "loss": 3.8087, + "step": 106025 + }, + { + "epoch": 7.204103818453595, + "grad_norm": 0.2523459196090698, + "learning_rate": 9.984372876749559e-06, + "loss": 3.8989, + "step": 106030 + }, + { + "epoch": 7.204443538524256, + "grad_norm": 0.27982792258262634, + "learning_rate": 9.980126375866287e-06, + "loss": 4.0592, + "step": 106035 + }, + { + "epoch": 7.204783258594918, + "grad_norm": 0.24408277869224548, + "learning_rate": 9.975879874983015e-06, + "loss": 3.6615, + "step": 106040 + }, + { + "epoch": 7.20512297866558, + "grad_norm": 1.0700608491897583, + "learning_rate": 9.971633374099743e-06, + "loss": 3.9186, + "step": 106045 + }, + { + "epoch": 7.205462698736241, + "grad_norm": 0.3329193890094757, + "learning_rate": 9.967386873216471e-06, + "loss": 3.9395, + "step": 106050 + }, + { + "epoch": 7.205802418806903, + "grad_norm": 0.1425352245569229, + "learning_rate": 9.963140372333197e-06, + "loss": 3.9496, + "step": 106055 + }, + { + "epoch": 7.206142138877565, + "grad_norm": 0.49062788486480713, + "learning_rate": 9.958893871449925e-06, + "loss": 4.0247, + "step": 106060 + }, + { + "epoch": 7.2064818589482265, + "grad_norm": 0.21025535464286804, + "learning_rate": 9.954647370566653e-06, + "loss": 3.8576, + "step": 106065 + }, + { + "epoch": 7.2068215790188885, + "grad_norm": 0.144011452794075, + "learning_rate": 9.950400869683381e-06, + "loss": 3.9678, + "step": 106070 + }, + { + "epoch": 7.207161299089551, + "grad_norm": 0.15760357677936554, + "learning_rate": 9.94615436880011e-06, + "loss": 3.7442, + "step": 106075 + }, + { + "epoch": 7.207501019160212, + "grad_norm": 0.14224450290203094, + "learning_rate": 9.941907867916837e-06, + "loss": 3.7056, + "step": 106080 + }, + { + "epoch": 7.207840739230874, + "grad_norm": 0.19625242054462433, + "learning_rate": 9.937661367033564e-06, + "loss": 3.9234, + "step": 106085 + }, + { + "epoch": 7.208180459301536, + "grad_norm": 0.16138982772827148, + "learning_rate": 9.933414866150292e-06, + "loss": 3.6324, + "step": 106090 + }, + { + "epoch": 7.208520179372197, + "grad_norm": 0.2011641412973404, + "learning_rate": 9.929168365267021e-06, + "loss": 3.9009, + "step": 106095 + }, + { + "epoch": 7.208859899442859, + "grad_norm": 0.1729941964149475, + "learning_rate": 9.92492186438375e-06, + "loss": 4.0615, + "step": 106100 + }, + { + "epoch": 7.209199619513521, + "grad_norm": 0.17509616911411285, + "learning_rate": 9.920675363500476e-06, + "loss": 4.031, + "step": 106105 + }, + { + "epoch": 7.2095393395841825, + "grad_norm": 0.45859643816947937, + "learning_rate": 9.916428862617204e-06, + "loss": 3.8181, + "step": 106110 + }, + { + "epoch": 7.2098790596548445, + "grad_norm": 0.1678939312696457, + "learning_rate": 9.912182361733932e-06, + "loss": 3.6821, + "step": 106115 + }, + { + "epoch": 7.210218779725507, + "grad_norm": 0.20383325219154358, + "learning_rate": 9.90793586085066e-06, + "loss": 3.982, + "step": 106120 + }, + { + "epoch": 7.210558499796168, + "grad_norm": 0.1881791353225708, + "learning_rate": 9.903689359967388e-06, + "loss": 4.0491, + "step": 106125 + }, + { + "epoch": 7.21089821986683, + "grad_norm": 0.16440913081169128, + "learning_rate": 9.899442859084116e-06, + "loss": 3.8062, + "step": 106130 + }, + { + "epoch": 7.211237939937492, + "grad_norm": 0.1900634616613388, + "learning_rate": 9.895196358200844e-06, + "loss": 3.7154, + "step": 106135 + }, + { + "epoch": 7.211577660008153, + "grad_norm": 0.1700432151556015, + "learning_rate": 9.89094985731757e-06, + "loss": 3.7363, + "step": 106140 + }, + { + "epoch": 7.211917380078815, + "grad_norm": 0.17099183797836304, + "learning_rate": 9.886703356434298e-06, + "loss": 3.6953, + "step": 106145 + }, + { + "epoch": 7.212257100149476, + "grad_norm": 0.1401364654302597, + "learning_rate": 9.882456855551028e-06, + "loss": 3.6861, + "step": 106150 + }, + { + "epoch": 7.2125968202201385, + "grad_norm": 0.14366954565048218, + "learning_rate": 9.878210354667754e-06, + "loss": 3.7382, + "step": 106155 + }, + { + "epoch": 7.212936540290801, + "grad_norm": 0.19158560037612915, + "learning_rate": 9.873963853784482e-06, + "loss": 3.8743, + "step": 106160 + }, + { + "epoch": 7.213276260361462, + "grad_norm": 0.15892544388771057, + "learning_rate": 9.86971735290121e-06, + "loss": 3.7864, + "step": 106165 + }, + { + "epoch": 7.213615980432124, + "grad_norm": 0.17146950960159302, + "learning_rate": 9.865470852017936e-06, + "loss": 3.8289, + "step": 106170 + }, + { + "epoch": 7.213955700502786, + "grad_norm": 0.18968012928962708, + "learning_rate": 9.861224351134666e-06, + "loss": 3.8327, + "step": 106175 + }, + { + "epoch": 7.214295420573447, + "grad_norm": 0.17894570529460907, + "learning_rate": 9.856977850251394e-06, + "loss": 3.8142, + "step": 106180 + }, + { + "epoch": 7.214635140644109, + "grad_norm": 0.1798490583896637, + "learning_rate": 9.852731349368122e-06, + "loss": 3.9421, + "step": 106185 + }, + { + "epoch": 7.214974860714771, + "grad_norm": 0.15840807557106018, + "learning_rate": 9.848484848484848e-06, + "loss": 3.4215, + "step": 106190 + }, + { + "epoch": 7.215314580785432, + "grad_norm": 0.14254482090473175, + "learning_rate": 9.844238347601576e-06, + "loss": 3.7664, + "step": 106195 + }, + { + "epoch": 7.2156543008560945, + "grad_norm": 0.17783696949481964, + "learning_rate": 9.839991846718304e-06, + "loss": 3.7379, + "step": 106200 + }, + { + "epoch": 7.215994020926757, + "grad_norm": 0.15739257633686066, + "learning_rate": 9.835745345835032e-06, + "loss": 4.1466, + "step": 106205 + }, + { + "epoch": 7.216333740997418, + "grad_norm": 0.16010627150535583, + "learning_rate": 9.83149884495176e-06, + "loss": 3.8198, + "step": 106210 + }, + { + "epoch": 7.21667346106808, + "grad_norm": 0.1645490527153015, + "learning_rate": 9.827252344068488e-06, + "loss": 3.7171, + "step": 106215 + }, + { + "epoch": 7.217013181138742, + "grad_norm": 0.2286226600408554, + "learning_rate": 9.823005843185216e-06, + "loss": 3.6182, + "step": 106220 + }, + { + "epoch": 7.217352901209403, + "grad_norm": 0.19254088401794434, + "learning_rate": 9.818759342301943e-06, + "loss": 3.7249, + "step": 106225 + }, + { + "epoch": 7.217692621280065, + "grad_norm": 0.8020777702331543, + "learning_rate": 9.814512841418672e-06, + "loss": 3.9568, + "step": 106230 + }, + { + "epoch": 7.218032341350727, + "grad_norm": 0.20589227974414825, + "learning_rate": 9.8102663405354e-06, + "loss": 4.0151, + "step": 106235 + }, + { + "epoch": 7.218372061421388, + "grad_norm": 0.19219723343849182, + "learning_rate": 9.806019839652127e-06, + "loss": 3.7882, + "step": 106240 + }, + { + "epoch": 7.2187117814920505, + "grad_norm": 0.2634992003440857, + "learning_rate": 9.801773338768855e-06, + "loss": 3.4137, + "step": 106245 + }, + { + "epoch": 7.219051501562713, + "grad_norm": 0.2200208157300949, + "learning_rate": 9.797526837885583e-06, + "loss": 3.8, + "step": 106250 + }, + { + "epoch": 7.219391221633374, + "grad_norm": 0.22348396480083466, + "learning_rate": 9.79328033700231e-06, + "loss": 3.7715, + "step": 106255 + }, + { + "epoch": 7.219730941704036, + "grad_norm": 0.5917363166809082, + "learning_rate": 9.789033836119039e-06, + "loss": 3.8397, + "step": 106260 + }, + { + "epoch": 7.220070661774698, + "grad_norm": 0.3275803029537201, + "learning_rate": 9.784787335235767e-06, + "loss": 4.1102, + "step": 106265 + }, + { + "epoch": 7.220410381845359, + "grad_norm": 0.16098210215568542, + "learning_rate": 9.780540834352495e-06, + "loss": 4.0172, + "step": 106270 + }, + { + "epoch": 7.220750101916021, + "grad_norm": 0.46915799379348755, + "learning_rate": 9.776294333469221e-06, + "loss": 3.9761, + "step": 106275 + }, + { + "epoch": 7.221089821986683, + "grad_norm": 0.20506533980369568, + "learning_rate": 9.772047832585949e-06, + "loss": 3.9137, + "step": 106280 + }, + { + "epoch": 7.221429542057344, + "grad_norm": 0.2838280498981476, + "learning_rate": 9.767801331702677e-06, + "loss": 3.8597, + "step": 106285 + }, + { + "epoch": 7.2217692621280065, + "grad_norm": 0.25782161951065063, + "learning_rate": 9.763554830819405e-06, + "loss": 3.7374, + "step": 106290 + }, + { + "epoch": 7.222108982198669, + "grad_norm": 0.18050192296504974, + "learning_rate": 9.759308329936133e-06, + "loss": 3.8832, + "step": 106295 + }, + { + "epoch": 7.22244870226933, + "grad_norm": 0.14420916140079498, + "learning_rate": 9.755061829052861e-06, + "loss": 3.6321, + "step": 106300 + }, + { + "epoch": 7.222788422339992, + "grad_norm": 0.17149938642978668, + "learning_rate": 9.750815328169589e-06, + "loss": 3.83, + "step": 106305 + }, + { + "epoch": 7.223128142410654, + "grad_norm": 0.14778558909893036, + "learning_rate": 9.746568827286315e-06, + "loss": 3.9844, + "step": 106310 + }, + { + "epoch": 7.223467862481315, + "grad_norm": 0.4953763782978058, + "learning_rate": 9.742322326403045e-06, + "loss": 3.8058, + "step": 106315 + }, + { + "epoch": 7.223807582551977, + "grad_norm": 0.2373729646205902, + "learning_rate": 9.738075825519773e-06, + "loss": 4.0415, + "step": 106320 + }, + { + "epoch": 7.224147302622639, + "grad_norm": 0.16663897037506104, + "learning_rate": 9.7338293246365e-06, + "loss": 3.8378, + "step": 106325 + }, + { + "epoch": 7.2244870226933005, + "grad_norm": 0.2274041622877121, + "learning_rate": 9.729582823753227e-06, + "loss": 3.6886, + "step": 106330 + }, + { + "epoch": 7.2248267427639625, + "grad_norm": 0.2961041331291199, + "learning_rate": 9.725336322869955e-06, + "loss": 3.8225, + "step": 106335 + }, + { + "epoch": 7.225166462834625, + "grad_norm": 0.17412376403808594, + "learning_rate": 9.721089821986683e-06, + "loss": 3.8575, + "step": 106340 + }, + { + "epoch": 7.225506182905286, + "grad_norm": 0.17900125682353973, + "learning_rate": 9.716843321103411e-06, + "loss": 4.0841, + "step": 106345 + }, + { + "epoch": 7.225845902975948, + "grad_norm": 0.14147508144378662, + "learning_rate": 9.71259682022014e-06, + "loss": 3.8506, + "step": 106350 + }, + { + "epoch": 7.22618562304661, + "grad_norm": 0.1967446208000183, + "learning_rate": 9.708350319336868e-06, + "loss": 3.6434, + "step": 106355 + }, + { + "epoch": 7.226525343117271, + "grad_norm": 0.17864163219928741, + "learning_rate": 9.704103818453594e-06, + "loss": 3.7551, + "step": 106360 + }, + { + "epoch": 7.226865063187933, + "grad_norm": 0.17319990694522858, + "learning_rate": 9.699857317570322e-06, + "loss": 3.9313, + "step": 106365 + }, + { + "epoch": 7.227204783258595, + "grad_norm": 0.17828837037086487, + "learning_rate": 9.695610816687052e-06, + "loss": 4.2174, + "step": 106370 + }, + { + "epoch": 7.2275445033292565, + "grad_norm": 0.5078281164169312, + "learning_rate": 9.691364315803778e-06, + "loss": 3.7528, + "step": 106375 + }, + { + "epoch": 7.2278842233999185, + "grad_norm": 0.17885389924049377, + "learning_rate": 9.687117814920506e-06, + "loss": 4.087, + "step": 106380 + }, + { + "epoch": 7.228223943470581, + "grad_norm": 0.2437509000301361, + "learning_rate": 9.682871314037234e-06, + "loss": 3.5802, + "step": 106385 + }, + { + "epoch": 7.228563663541242, + "grad_norm": 0.5174271464347839, + "learning_rate": 9.678624813153962e-06, + "loss": 3.7972, + "step": 106390 + }, + { + "epoch": 7.228903383611904, + "grad_norm": 0.5182918310165405, + "learning_rate": 9.67437831227069e-06, + "loss": 3.8999, + "step": 106395 + }, + { + "epoch": 7.229243103682566, + "grad_norm": 0.2388562262058258, + "learning_rate": 9.670131811387418e-06, + "loss": 3.728, + "step": 106400 + }, + { + "epoch": 7.229582823753227, + "grad_norm": 0.2459823489189148, + "learning_rate": 9.665885310504146e-06, + "loss": 3.7487, + "step": 106405 + }, + { + "epoch": 7.229922543823889, + "grad_norm": 0.1823384165763855, + "learning_rate": 9.661638809620872e-06, + "loss": 3.742, + "step": 106410 + }, + { + "epoch": 7.230262263894551, + "grad_norm": 0.1461399346590042, + "learning_rate": 9.6573923087376e-06, + "loss": 3.8124, + "step": 106415 + }, + { + "epoch": 7.2306019839652125, + "grad_norm": 0.19701659679412842, + "learning_rate": 9.653145807854328e-06, + "loss": 3.8346, + "step": 106420 + }, + { + "epoch": 7.2309417040358746, + "grad_norm": 0.21999353170394897, + "learning_rate": 9.648899306971056e-06, + "loss": 3.789, + "step": 106425 + }, + { + "epoch": 7.231281424106537, + "grad_norm": 0.1834852248430252, + "learning_rate": 9.644652806087784e-06, + "loss": 3.8002, + "step": 106430 + }, + { + "epoch": 7.231621144177198, + "grad_norm": 0.13658834993839264, + "learning_rate": 9.640406305204512e-06, + "loss": 3.7201, + "step": 106435 + }, + { + "epoch": 7.23196086424786, + "grad_norm": 0.20270952582359314, + "learning_rate": 9.63615980432124e-06, + "loss": 4.1167, + "step": 106440 + }, + { + "epoch": 7.232300584318522, + "grad_norm": 0.18146108090877533, + "learning_rate": 9.631913303437967e-06, + "loss": 3.7744, + "step": 106445 + }, + { + "epoch": 7.232640304389183, + "grad_norm": 0.1795981079339981, + "learning_rate": 9.627666802554695e-06, + "loss": 3.9793, + "step": 106450 + }, + { + "epoch": 7.232980024459845, + "grad_norm": 0.6257718801498413, + "learning_rate": 9.623420301671424e-06, + "loss": 3.5763, + "step": 106455 + }, + { + "epoch": 7.233319744530506, + "grad_norm": 0.23305992782115936, + "learning_rate": 9.61917380078815e-06, + "loss": 3.8741, + "step": 106460 + }, + { + "epoch": 7.2336594646011685, + "grad_norm": 0.17772214114665985, + "learning_rate": 9.614927299904879e-06, + "loss": 3.9195, + "step": 106465 + }, + { + "epoch": 7.233999184671831, + "grad_norm": 0.19055825471878052, + "learning_rate": 9.610680799021607e-06, + "loss": 3.8156, + "step": 106470 + }, + { + "epoch": 7.234338904742492, + "grad_norm": 0.18272975087165833, + "learning_rate": 9.606434298138335e-06, + "loss": 3.7771, + "step": 106475 + }, + { + "epoch": 7.234678624813154, + "grad_norm": 0.1500982791185379, + "learning_rate": 9.602187797255063e-06, + "loss": 3.7364, + "step": 106480 + }, + { + "epoch": 7.235018344883816, + "grad_norm": 0.1619262993335724, + "learning_rate": 9.59794129637179e-06, + "loss": 3.6581, + "step": 106485 + }, + { + "epoch": 7.235358064954477, + "grad_norm": 0.16897796094417572, + "learning_rate": 9.593694795488519e-06, + "loss": 3.7728, + "step": 106490 + }, + { + "epoch": 7.235697785025139, + "grad_norm": 0.20258159935474396, + "learning_rate": 9.589448294605245e-06, + "loss": 4.1066, + "step": 106495 + }, + { + "epoch": 7.236037505095801, + "grad_norm": 0.1549985557794571, + "learning_rate": 9.585201793721973e-06, + "loss": 4.0052, + "step": 106500 + }, + { + "epoch": 7.236377225166462, + "grad_norm": 0.2017681896686554, + "learning_rate": 9.580955292838701e-06, + "loss": 3.7292, + "step": 106505 + }, + { + "epoch": 7.2367169452371245, + "grad_norm": 0.21812036633491516, + "learning_rate": 9.576708791955429e-06, + "loss": 3.7011, + "step": 106510 + }, + { + "epoch": 7.237056665307787, + "grad_norm": 0.21906466782093048, + "learning_rate": 9.572462291072157e-06, + "loss": 3.73, + "step": 106515 + }, + { + "epoch": 7.237396385378448, + "grad_norm": 0.20021162927150726, + "learning_rate": 9.568215790188885e-06, + "loss": 3.8931, + "step": 106520 + }, + { + "epoch": 7.23773610544911, + "grad_norm": 0.8064388036727905, + "learning_rate": 9.563969289305613e-06, + "loss": 3.6642, + "step": 106525 + }, + { + "epoch": 7.238075825519772, + "grad_norm": 0.2001275271177292, + "learning_rate": 9.55972278842234e-06, + "loss": 3.6908, + "step": 106530 + }, + { + "epoch": 7.238415545590433, + "grad_norm": 0.1453620195388794, + "learning_rate": 9.555476287539069e-06, + "loss": 3.7898, + "step": 106535 + }, + { + "epoch": 7.238755265661095, + "grad_norm": 0.1580914556980133, + "learning_rate": 9.551229786655797e-06, + "loss": 3.8686, + "step": 106540 + }, + { + "epoch": 7.239094985731757, + "grad_norm": 0.18858331441879272, + "learning_rate": 9.546983285772523e-06, + "loss": 4.0932, + "step": 106545 + }, + { + "epoch": 7.239434705802418, + "grad_norm": 0.1857316493988037, + "learning_rate": 9.542736784889251e-06, + "loss": 3.6493, + "step": 106550 + }, + { + "epoch": 7.2397744258730805, + "grad_norm": 0.17804810404777527, + "learning_rate": 9.53849028400598e-06, + "loss": 3.9539, + "step": 106555 + }, + { + "epoch": 7.240114145943743, + "grad_norm": 0.17907565832138062, + "learning_rate": 9.534243783122707e-06, + "loss": 3.8111, + "step": 106560 + }, + { + "epoch": 7.240453866014404, + "grad_norm": 0.19713543355464935, + "learning_rate": 9.529997282239435e-06, + "loss": 3.7636, + "step": 106565 + }, + { + "epoch": 7.240793586085066, + "grad_norm": 0.9161598682403564, + "learning_rate": 9.525750781356163e-06, + "loss": 3.9529, + "step": 106570 + }, + { + "epoch": 7.241133306155728, + "grad_norm": 0.1981162130832672, + "learning_rate": 9.521504280472891e-06, + "loss": 3.9451, + "step": 106575 + }, + { + "epoch": 7.241473026226389, + "grad_norm": 0.17682109773159027, + "learning_rate": 9.517257779589618e-06, + "loss": 3.7712, + "step": 106580 + }, + { + "epoch": 7.241812746297051, + "grad_norm": 0.33180272579193115, + "learning_rate": 9.513011278706346e-06, + "loss": 3.915, + "step": 106585 + }, + { + "epoch": 7.242152466367713, + "grad_norm": 0.1345345377922058, + "learning_rate": 9.508764777823075e-06, + "loss": 3.8892, + "step": 106590 + }, + { + "epoch": 7.2424921864383744, + "grad_norm": 0.17434433102607727, + "learning_rate": 9.504518276939802e-06, + "loss": 3.6987, + "step": 106595 + }, + { + "epoch": 7.2428319065090365, + "grad_norm": 0.17290525138378143, + "learning_rate": 9.50027177605653e-06, + "loss": 3.9701, + "step": 106600 + }, + { + "epoch": 7.243171626579699, + "grad_norm": 0.19121600687503815, + "learning_rate": 9.496025275173258e-06, + "loss": 3.7658, + "step": 106605 + }, + { + "epoch": 7.24351134665036, + "grad_norm": 0.20800167322158813, + "learning_rate": 9.491778774289986e-06, + "loss": 3.7623, + "step": 106610 + }, + { + "epoch": 7.243851066721022, + "grad_norm": 0.3306618332862854, + "learning_rate": 9.487532273406714e-06, + "loss": 3.9048, + "step": 106615 + }, + { + "epoch": 7.244190786791684, + "grad_norm": 0.17179201543331146, + "learning_rate": 9.483285772523442e-06, + "loss": 3.7595, + "step": 106620 + }, + { + "epoch": 7.244530506862345, + "grad_norm": 0.16630761325359344, + "learning_rate": 9.47903927164017e-06, + "loss": 3.6335, + "step": 106625 + }, + { + "epoch": 7.244870226933007, + "grad_norm": 0.7644411325454712, + "learning_rate": 9.474792770756896e-06, + "loss": 3.8086, + "step": 106630 + }, + { + "epoch": 7.245209947003669, + "grad_norm": 0.17777661979198456, + "learning_rate": 9.470546269873624e-06, + "loss": 3.934, + "step": 106635 + }, + { + "epoch": 7.2455496670743305, + "grad_norm": 0.15018057823181152, + "learning_rate": 9.466299768990352e-06, + "loss": 3.8771, + "step": 106640 + }, + { + "epoch": 7.2458893871449925, + "grad_norm": 0.15477721393108368, + "learning_rate": 9.46205326810708e-06, + "loss": 3.7582, + "step": 106645 + }, + { + "epoch": 7.246229107215655, + "grad_norm": 0.2095431536436081, + "learning_rate": 9.457806767223808e-06, + "loss": 4.0144, + "step": 106650 + }, + { + "epoch": 7.246568827286316, + "grad_norm": 0.18707531690597534, + "learning_rate": 9.453560266340536e-06, + "loss": 3.6051, + "step": 106655 + }, + { + "epoch": 7.246908547356978, + "grad_norm": 0.12104516476392746, + "learning_rate": 9.449313765457264e-06, + "loss": 3.7692, + "step": 106660 + }, + { + "epoch": 7.24724826742764, + "grad_norm": 0.15661132335662842, + "learning_rate": 9.44506726457399e-06, + "loss": 3.8158, + "step": 106665 + }, + { + "epoch": 7.247587987498301, + "grad_norm": 0.17015288770198822, + "learning_rate": 9.440820763690718e-06, + "loss": 3.8356, + "step": 106670 + }, + { + "epoch": 7.247927707568963, + "grad_norm": 0.1499588042497635, + "learning_rate": 9.437423562984103e-06, + "loss": 3.886, + "step": 106675 + }, + { + "epoch": 7.248267427639625, + "grad_norm": 0.5362591743469238, + "learning_rate": 9.43317706210083e-06, + "loss": 3.9316, + "step": 106680 + }, + { + "epoch": 7.2486071477102865, + "grad_norm": 4.086087226867676, + "learning_rate": 9.428930561217557e-06, + "loss": 3.9274, + "step": 106685 + }, + { + "epoch": 7.2489468677809485, + "grad_norm": 6.59607458114624, + "learning_rate": 9.424684060334285e-06, + "loss": 3.7334, + "step": 106690 + }, + { + "epoch": 7.249286587851611, + "grad_norm": 0.1582345813512802, + "learning_rate": 9.420437559451013e-06, + "loss": 3.9318, + "step": 106695 + }, + { + "epoch": 7.249626307922272, + "grad_norm": 0.13995233178138733, + "learning_rate": 9.416191058567741e-06, + "loss": 3.7228, + "step": 106700 + }, + { + "epoch": 7.249966027992934, + "grad_norm": 0.16095557808876038, + "learning_rate": 9.411944557684469e-06, + "loss": 4.0542, + "step": 106705 + }, + { + "epoch": 7.250305748063596, + "grad_norm": 0.14679847657680511, + "learning_rate": 9.407698056801197e-06, + "loss": 3.9738, + "step": 106710 + }, + { + "epoch": 7.250645468134257, + "grad_norm": 0.37966784834861755, + "learning_rate": 9.403451555917923e-06, + "loss": 3.8823, + "step": 106715 + }, + { + "epoch": 7.250985188204919, + "grad_norm": 0.8957479000091553, + "learning_rate": 9.399205055034651e-06, + "loss": 3.8146, + "step": 106720 + }, + { + "epoch": 7.251324908275581, + "grad_norm": 0.17454937100410461, + "learning_rate": 9.39495855415138e-06, + "loss": 3.9174, + "step": 106725 + }, + { + "epoch": 7.2516646283462425, + "grad_norm": 0.1516372412443161, + "learning_rate": 9.390712053268109e-06, + "loss": 3.7218, + "step": 106730 + }, + { + "epoch": 7.2520043484169046, + "grad_norm": 0.9381284117698669, + "learning_rate": 9.386465552384835e-06, + "loss": 3.868, + "step": 106735 + }, + { + "epoch": 7.252344068487567, + "grad_norm": 0.17732509970664978, + "learning_rate": 9.382219051501563e-06, + "loss": 3.7909, + "step": 106740 + }, + { + "epoch": 7.252683788558228, + "grad_norm": 0.21020089089870453, + "learning_rate": 9.377972550618291e-06, + "loss": 3.8557, + "step": 106745 + }, + { + "epoch": 7.25302350862889, + "grad_norm": 0.22802072763442993, + "learning_rate": 9.373726049735018e-06, + "loss": 3.5903, + "step": 106750 + }, + { + "epoch": 7.253363228699552, + "grad_norm": 0.21617920696735382, + "learning_rate": 9.369479548851747e-06, + "loss": 3.6981, + "step": 106755 + }, + { + "epoch": 7.253702948770213, + "grad_norm": 0.15866200625896454, + "learning_rate": 9.365233047968475e-06, + "loss": 3.8572, + "step": 106760 + }, + { + "epoch": 7.254042668840875, + "grad_norm": 0.2674309313297272, + "learning_rate": 9.360986547085203e-06, + "loss": 3.6659, + "step": 106765 + }, + { + "epoch": 7.254382388911537, + "grad_norm": 0.18313942849636078, + "learning_rate": 9.35674004620193e-06, + "loss": 3.8083, + "step": 106770 + }, + { + "epoch": 7.2547221089821985, + "grad_norm": 0.19258053600788116, + "learning_rate": 9.352493545318658e-06, + "loss": 3.7878, + "step": 106775 + }, + { + "epoch": 7.255061829052861, + "grad_norm": 0.43343308568000793, + "learning_rate": 9.348247044435386e-06, + "loss": 3.8104, + "step": 106780 + }, + { + "epoch": 7.255401549123523, + "grad_norm": 0.17707739770412445, + "learning_rate": 9.344000543552114e-06, + "loss": 3.8069, + "step": 106785 + }, + { + "epoch": 7.255741269194184, + "grad_norm": 0.26142436265945435, + "learning_rate": 9.339754042668842e-06, + "loss": 3.8223, + "step": 106790 + }, + { + "epoch": 7.256080989264846, + "grad_norm": 0.18021699786186218, + "learning_rate": 9.33550754178557e-06, + "loss": 3.8219, + "step": 106795 + }, + { + "epoch": 7.256420709335508, + "grad_norm": 0.1467004418373108, + "learning_rate": 9.331261040902296e-06, + "loss": 3.8775, + "step": 106800 + }, + { + "epoch": 7.256760429406169, + "grad_norm": 0.19004522264003754, + "learning_rate": 9.327014540019024e-06, + "loss": 3.8096, + "step": 106805 + }, + { + "epoch": 7.257100149476831, + "grad_norm": 1.8165072202682495, + "learning_rate": 9.322768039135752e-06, + "loss": 3.5591, + "step": 106810 + }, + { + "epoch": 7.257439869547493, + "grad_norm": 0.23188236355781555, + "learning_rate": 9.318521538252482e-06, + "loss": 4.0015, + "step": 106815 + }, + { + "epoch": 7.2577795896181545, + "grad_norm": 0.18529091775417328, + "learning_rate": 9.314275037369208e-06, + "loss": 3.9144, + "step": 106820 + }, + { + "epoch": 7.258119309688817, + "grad_norm": 0.3216480612754822, + "learning_rate": 9.310028536485936e-06, + "loss": 3.9924, + "step": 106825 + }, + { + "epoch": 7.258459029759479, + "grad_norm": 0.13760699331760406, + "learning_rate": 9.305782035602664e-06, + "loss": 3.9427, + "step": 106830 + }, + { + "epoch": 7.25879874983014, + "grad_norm": 0.2862265706062317, + "learning_rate": 9.30153553471939e-06, + "loss": 4.0257, + "step": 106835 + }, + { + "epoch": 7.259138469900802, + "grad_norm": 0.21403717994689941, + "learning_rate": 9.29728903383612e-06, + "loss": 3.8414, + "step": 106840 + }, + { + "epoch": 7.259478189971463, + "grad_norm": 0.19005103409290314, + "learning_rate": 9.293042532952848e-06, + "loss": 4.1067, + "step": 106845 + }, + { + "epoch": 7.259817910042125, + "grad_norm": 1.1713244915008545, + "learning_rate": 9.288796032069576e-06, + "loss": 3.7756, + "step": 106850 + }, + { + "epoch": 7.260157630112787, + "grad_norm": 0.846667468547821, + "learning_rate": 9.284549531186302e-06, + "loss": 3.8098, + "step": 106855 + }, + { + "epoch": 7.260497350183448, + "grad_norm": 0.17170731723308563, + "learning_rate": 9.28030303030303e-06, + "loss": 3.7211, + "step": 106860 + }, + { + "epoch": 7.2608370702541105, + "grad_norm": 0.30100494623184204, + "learning_rate": 9.276056529419758e-06, + "loss": 3.6875, + "step": 106865 + }, + { + "epoch": 7.261176790324773, + "grad_norm": 0.44360095262527466, + "learning_rate": 9.271810028536486e-06, + "loss": 3.864, + "step": 106870 + }, + { + "epoch": 7.261516510395434, + "grad_norm": 0.133025124669075, + "learning_rate": 9.267563527653214e-06, + "loss": 3.5982, + "step": 106875 + }, + { + "epoch": 7.261856230466096, + "grad_norm": 0.1760183572769165, + "learning_rate": 9.263317026769942e-06, + "loss": 3.8337, + "step": 106880 + }, + { + "epoch": 7.262195950536758, + "grad_norm": 0.15575312077999115, + "learning_rate": 9.259070525886669e-06, + "loss": 4.1677, + "step": 106885 + }, + { + "epoch": 7.262535670607419, + "grad_norm": 0.21953925490379333, + "learning_rate": 9.254824025003397e-06, + "loss": 3.7252, + "step": 106890 + }, + { + "epoch": 7.262875390678081, + "grad_norm": 0.1388680636882782, + "learning_rate": 9.250577524120126e-06, + "loss": 3.7092, + "step": 106895 + }, + { + "epoch": 7.263215110748743, + "grad_norm": 0.25392836332321167, + "learning_rate": 9.246331023236854e-06, + "loss": 3.8073, + "step": 106900 + }, + { + "epoch": 7.2635548308194045, + "grad_norm": 0.48912209272384644, + "learning_rate": 9.24208452235358e-06, + "loss": 3.9024, + "step": 106905 + }, + { + "epoch": 7.2638945508900665, + "grad_norm": 0.4340122938156128, + "learning_rate": 9.237838021470309e-06, + "loss": 3.832, + "step": 106910 + }, + { + "epoch": 7.264234270960729, + "grad_norm": 0.13902099430561066, + "learning_rate": 9.233591520587037e-06, + "loss": 3.9129, + "step": 106915 + }, + { + "epoch": 7.26457399103139, + "grad_norm": 0.1810179501771927, + "learning_rate": 9.229345019703765e-06, + "loss": 3.9145, + "step": 106920 + }, + { + "epoch": 7.264913711102052, + "grad_norm": 0.22656749188899994, + "learning_rate": 9.225098518820493e-06, + "loss": 3.8123, + "step": 106925 + }, + { + "epoch": 7.265253431172714, + "grad_norm": 0.1731712967157364, + "learning_rate": 9.22085201793722e-06, + "loss": 3.6944, + "step": 106930 + }, + { + "epoch": 7.265593151243375, + "grad_norm": 0.2347310483455658, + "learning_rate": 9.216605517053949e-06, + "loss": 3.9176, + "step": 106935 + }, + { + "epoch": 7.265932871314037, + "grad_norm": 0.19648002088069916, + "learning_rate": 9.212359016170675e-06, + "loss": 3.9053, + "step": 106940 + }, + { + "epoch": 7.266272591384699, + "grad_norm": 0.16543418169021606, + "learning_rate": 9.208112515287403e-06, + "loss": 3.8443, + "step": 106945 + }, + { + "epoch": 7.2666123114553605, + "grad_norm": 0.3098776936531067, + "learning_rate": 9.203866014404131e-06, + "loss": 3.8931, + "step": 106950 + }, + { + "epoch": 7.2669520315260225, + "grad_norm": 0.20325087010860443, + "learning_rate": 9.199619513520859e-06, + "loss": 3.8161, + "step": 106955 + }, + { + "epoch": 7.267291751596685, + "grad_norm": 0.16973423957824707, + "learning_rate": 9.195373012637587e-06, + "loss": 3.9294, + "step": 106960 + }, + { + "epoch": 7.267631471667346, + "grad_norm": 0.30174586176872253, + "learning_rate": 9.191126511754315e-06, + "loss": 3.9307, + "step": 106965 + }, + { + "epoch": 7.267971191738008, + "grad_norm": 0.17773662507534027, + "learning_rate": 9.186880010871041e-06, + "loss": 3.6169, + "step": 106970 + }, + { + "epoch": 7.26831091180867, + "grad_norm": 0.19220107793807983, + "learning_rate": 9.18263350998777e-06, + "loss": 3.8903, + "step": 106975 + }, + { + "epoch": 7.268650631879331, + "grad_norm": 0.1948050558567047, + "learning_rate": 9.1783870091045e-06, + "loss": 3.8626, + "step": 106980 + }, + { + "epoch": 7.268990351949993, + "grad_norm": 0.3027926981449127, + "learning_rate": 9.174140508221227e-06, + "loss": 3.8208, + "step": 106985 + }, + { + "epoch": 7.269330072020655, + "grad_norm": 0.16290181875228882, + "learning_rate": 9.169894007337953e-06, + "loss": 3.9095, + "step": 106990 + }, + { + "epoch": 7.2696697920913165, + "grad_norm": 0.17552107572555542, + "learning_rate": 9.165647506454681e-06, + "loss": 3.923, + "step": 106995 + }, + { + "epoch": 7.2700095121619785, + "grad_norm": 0.18723128736019135, + "learning_rate": 9.16140100557141e-06, + "loss": 3.9407, + "step": 107000 + }, + { + "epoch": 7.270349232232641, + "grad_norm": 0.14294032752513885, + "learning_rate": 9.157154504688138e-06, + "loss": 3.8165, + "step": 107005 + }, + { + "epoch": 7.270688952303302, + "grad_norm": 0.16279499232769012, + "learning_rate": 9.152908003804866e-06, + "loss": 3.7997, + "step": 107010 + }, + { + "epoch": 7.271028672373964, + "grad_norm": 0.7521360516548157, + "learning_rate": 9.148661502921594e-06, + "loss": 3.7861, + "step": 107015 + }, + { + "epoch": 7.271368392444626, + "grad_norm": 0.18876294791698456, + "learning_rate": 9.144415002038322e-06, + "loss": 3.8169, + "step": 107020 + }, + { + "epoch": 7.271708112515287, + "grad_norm": 0.17212562263011932, + "learning_rate": 9.140168501155048e-06, + "loss": 4.0605, + "step": 107025 + }, + { + "epoch": 7.272047832585949, + "grad_norm": 0.1786048412322998, + "learning_rate": 9.135922000271776e-06, + "loss": 3.6662, + "step": 107030 + }, + { + "epoch": 7.272387552656611, + "grad_norm": 0.17177772521972656, + "learning_rate": 9.131675499388506e-06, + "loss": 3.8207, + "step": 107035 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.18341132998466492, + "learning_rate": 9.127428998505232e-06, + "loss": 3.8022, + "step": 107040 + }, + { + "epoch": 7.273066992797935, + "grad_norm": 0.1651938110589981, + "learning_rate": 9.12318249762196e-06, + "loss": 3.777, + "step": 107045 + }, + { + "epoch": 7.273406712868597, + "grad_norm": 0.14203909039497375, + "learning_rate": 9.118935996738688e-06, + "loss": 3.5585, + "step": 107050 + }, + { + "epoch": 7.273746432939258, + "grad_norm": 1.008270025253296, + "learning_rate": 9.114689495855414e-06, + "loss": 3.7062, + "step": 107055 + }, + { + "epoch": 7.27408615300992, + "grad_norm": 0.17730818688869476, + "learning_rate": 9.110442994972144e-06, + "loss": 4.0075, + "step": 107060 + }, + { + "epoch": 7.274425873080582, + "grad_norm": 0.17288687825202942, + "learning_rate": 9.106196494088872e-06, + "loss": 3.8476, + "step": 107065 + }, + { + "epoch": 7.274765593151243, + "grad_norm": 0.15273357927799225, + "learning_rate": 9.1019499932056e-06, + "loss": 3.7059, + "step": 107070 + }, + { + "epoch": 7.275105313221905, + "grad_norm": 0.1435949206352234, + "learning_rate": 9.097703492322326e-06, + "loss": 3.881, + "step": 107075 + }, + { + "epoch": 7.275445033292567, + "grad_norm": 0.15523464977741241, + "learning_rate": 9.093456991439054e-06, + "loss": 3.8679, + "step": 107080 + }, + { + "epoch": 7.2757847533632285, + "grad_norm": 0.13459092378616333, + "learning_rate": 9.089210490555782e-06, + "loss": 3.8583, + "step": 107085 + }, + { + "epoch": 7.276124473433891, + "grad_norm": 0.1681976318359375, + "learning_rate": 9.08496398967251e-06, + "loss": 3.7486, + "step": 107090 + }, + { + "epoch": 7.276464193504553, + "grad_norm": 0.20908276736736298, + "learning_rate": 9.080717488789238e-06, + "loss": 3.7533, + "step": 107095 + }, + { + "epoch": 7.276803913575214, + "grad_norm": 0.18221858143806458, + "learning_rate": 9.076470987905966e-06, + "loss": 3.7491, + "step": 107100 + }, + { + "epoch": 7.277143633645876, + "grad_norm": 0.24291937053203583, + "learning_rate": 9.072224487022694e-06, + "loss": 4.0192, + "step": 107105 + }, + { + "epoch": 7.277483353716538, + "grad_norm": 0.1414584517478943, + "learning_rate": 9.06797798613942e-06, + "loss": 3.8258, + "step": 107110 + }, + { + "epoch": 7.277823073787199, + "grad_norm": 0.25838762521743774, + "learning_rate": 9.06373148525615e-06, + "loss": 4.1137, + "step": 107115 + }, + { + "epoch": 7.278162793857861, + "grad_norm": 0.1788569688796997, + "learning_rate": 9.059484984372878e-06, + "loss": 3.7819, + "step": 107120 + }, + { + "epoch": 7.278502513928522, + "grad_norm": 0.1627272218465805, + "learning_rate": 9.055238483489605e-06, + "loss": 3.7862, + "step": 107125 + }, + { + "epoch": 7.2788422339991845, + "grad_norm": 0.18772301077842712, + "learning_rate": 9.050991982606333e-06, + "loss": 3.9466, + "step": 107130 + }, + { + "epoch": 7.279181954069847, + "grad_norm": 0.18194879591464996, + "learning_rate": 9.04674548172306e-06, + "loss": 3.6127, + "step": 107135 + }, + { + "epoch": 7.279521674140508, + "grad_norm": 0.2717924118041992, + "learning_rate": 9.042498980839789e-06, + "loss": 3.6354, + "step": 107140 + }, + { + "epoch": 7.27986139421117, + "grad_norm": 0.1730601042509079, + "learning_rate": 9.038252479956517e-06, + "loss": 3.869, + "step": 107145 + }, + { + "epoch": 7.280201114281832, + "grad_norm": 0.1577412337064743, + "learning_rate": 9.034005979073245e-06, + "loss": 3.6639, + "step": 107150 + }, + { + "epoch": 7.280540834352493, + "grad_norm": 0.16313111782073975, + "learning_rate": 9.029759478189973e-06, + "loss": 3.7457, + "step": 107155 + }, + { + "epoch": 7.280880554423155, + "grad_norm": 0.15667392313480377, + "learning_rate": 9.025512977306699e-06, + "loss": 3.7414, + "step": 107160 + }, + { + "epoch": 7.281220274493817, + "grad_norm": 0.19825147092342377, + "learning_rate": 9.021266476423427e-06, + "loss": 3.6967, + "step": 107165 + }, + { + "epoch": 7.2815599945644784, + "grad_norm": 0.2179238498210907, + "learning_rate": 9.017019975540155e-06, + "loss": 3.6609, + "step": 107170 + }, + { + "epoch": 7.2818997146351405, + "grad_norm": 0.17657923698425293, + "learning_rate": 9.012773474656883e-06, + "loss": 3.8797, + "step": 107175 + }, + { + "epoch": 7.282239434705803, + "grad_norm": 0.1754498928785324, + "learning_rate": 9.008526973773611e-06, + "loss": 3.745, + "step": 107180 + }, + { + "epoch": 7.282579154776464, + "grad_norm": 0.1551327258348465, + "learning_rate": 9.004280472890339e-06, + "loss": 3.8241, + "step": 107185 + }, + { + "epoch": 7.282918874847126, + "grad_norm": 0.24933144450187683, + "learning_rate": 9.000033972007067e-06, + "loss": 3.8618, + "step": 107190 + }, + { + "epoch": 7.283258594917788, + "grad_norm": 0.20034943521022797, + "learning_rate": 8.995787471123793e-06, + "loss": 3.6419, + "step": 107195 + }, + { + "epoch": 7.283598314988449, + "grad_norm": 0.15386924147605896, + "learning_rate": 8.991540970240523e-06, + "loss": 3.8828, + "step": 107200 + }, + { + "epoch": 7.283938035059111, + "grad_norm": 0.21942266821861267, + "learning_rate": 8.987294469357251e-06, + "loss": 3.5867, + "step": 107205 + }, + { + "epoch": 7.284277755129773, + "grad_norm": 0.13553664088249207, + "learning_rate": 8.983047968473977e-06, + "loss": 3.7414, + "step": 107210 + }, + { + "epoch": 7.2846174752004345, + "grad_norm": 0.17963849008083344, + "learning_rate": 8.978801467590705e-06, + "loss": 3.6359, + "step": 107215 + }, + { + "epoch": 7.2849571952710965, + "grad_norm": 0.3162815272808075, + "learning_rate": 8.974554966707433e-06, + "loss": 3.8375, + "step": 107220 + }, + { + "epoch": 7.285296915341759, + "grad_norm": 0.19730260968208313, + "learning_rate": 8.970308465824161e-06, + "loss": 3.7725, + "step": 107225 + }, + { + "epoch": 7.28563663541242, + "grad_norm": 1.2079159021377563, + "learning_rate": 8.96606196494089e-06, + "loss": 4.2387, + "step": 107230 + }, + { + "epoch": 7.285976355483082, + "grad_norm": 0.2474302053451538, + "learning_rate": 8.961815464057617e-06, + "loss": 3.7069, + "step": 107235 + }, + { + "epoch": 7.286316075553744, + "grad_norm": 2.0732784271240234, + "learning_rate": 8.957568963174345e-06, + "loss": 3.6338, + "step": 107240 + }, + { + "epoch": 7.286655795624405, + "grad_norm": 0.24798686802387238, + "learning_rate": 8.953322462291072e-06, + "loss": 3.7271, + "step": 107245 + }, + { + "epoch": 7.286995515695067, + "grad_norm": 0.2121630162000656, + "learning_rate": 8.9490759614078e-06, + "loss": 3.6182, + "step": 107250 + }, + { + "epoch": 7.287335235765729, + "grad_norm": 0.202107235789299, + "learning_rate": 8.94482946052453e-06, + "loss": 3.9706, + "step": 107255 + }, + { + "epoch": 7.2876749558363905, + "grad_norm": 0.2077157199382782, + "learning_rate": 8.940582959641256e-06, + "loss": 3.849, + "step": 107260 + }, + { + "epoch": 7.2880146759070525, + "grad_norm": 0.1743665635585785, + "learning_rate": 8.936336458757984e-06, + "loss": 3.7639, + "step": 107265 + }, + { + "epoch": 7.288354395977715, + "grad_norm": 0.15266163647174835, + "learning_rate": 8.932089957874712e-06, + "loss": 3.7815, + "step": 107270 + }, + { + "epoch": 7.288694116048376, + "grad_norm": 0.2656671106815338, + "learning_rate": 8.92784345699144e-06, + "loss": 3.7878, + "step": 107275 + }, + { + "epoch": 7.289033836119038, + "grad_norm": 0.1991790533065796, + "learning_rate": 8.923596956108168e-06, + "loss": 3.4786, + "step": 107280 + }, + { + "epoch": 7.2893735561897, + "grad_norm": 0.1578037589788437, + "learning_rate": 8.919350455224896e-06, + "loss": 3.8783, + "step": 107285 + }, + { + "epoch": 7.289713276260361, + "grad_norm": 0.2941928207874298, + "learning_rate": 8.915103954341624e-06, + "loss": 3.8469, + "step": 107290 + }, + { + "epoch": 7.290052996331023, + "grad_norm": 0.17845311760902405, + "learning_rate": 8.91085745345835e-06, + "loss": 3.8739, + "step": 107295 + }, + { + "epoch": 7.290392716401685, + "grad_norm": 0.28691089153289795, + "learning_rate": 8.906610952575078e-06, + "loss": 4.1464, + "step": 107300 + }, + { + "epoch": 7.2907324364723465, + "grad_norm": 3.171241283416748, + "learning_rate": 8.902364451691806e-06, + "loss": 3.7414, + "step": 107305 + }, + { + "epoch": 7.2910721565430086, + "grad_norm": 0.7398142218589783, + "learning_rate": 8.898117950808534e-06, + "loss": 3.7933, + "step": 107310 + }, + { + "epoch": 7.291411876613671, + "grad_norm": 0.2706781029701233, + "learning_rate": 8.893871449925262e-06, + "loss": 3.8961, + "step": 107315 + }, + { + "epoch": 7.291751596684332, + "grad_norm": 0.1859157532453537, + "learning_rate": 8.88962494904199e-06, + "loss": 3.795, + "step": 107320 + }, + { + "epoch": 7.292091316754994, + "grad_norm": 0.505018949508667, + "learning_rate": 8.885378448158718e-06, + "loss": 3.8714, + "step": 107325 + }, + { + "epoch": 7.292431036825656, + "grad_norm": 2.527341604232788, + "learning_rate": 8.881131947275444e-06, + "loss": 4.0568, + "step": 107330 + }, + { + "epoch": 7.292770756896317, + "grad_norm": 0.17645128071308136, + "learning_rate": 8.876885446392172e-06, + "loss": 3.7025, + "step": 107335 + }, + { + "epoch": 7.293110476966979, + "grad_norm": 0.13302908837795258, + "learning_rate": 8.872638945508902e-06, + "loss": 3.9629, + "step": 107340 + }, + { + "epoch": 7.293450197037641, + "grad_norm": 0.1558966189622879, + "learning_rate": 8.868392444625628e-06, + "loss": 4.1375, + "step": 107345 + }, + { + "epoch": 7.2937899171083025, + "grad_norm": 0.1804826259613037, + "learning_rate": 8.864145943742356e-06, + "loss": 3.922, + "step": 107350 + }, + { + "epoch": 7.294129637178965, + "grad_norm": 0.26597148180007935, + "learning_rate": 8.859899442859084e-06, + "loss": 3.7232, + "step": 107355 + }, + { + "epoch": 7.294469357249627, + "grad_norm": 0.14696961641311646, + "learning_rate": 8.855652941975813e-06, + "loss": 3.7302, + "step": 107360 + }, + { + "epoch": 7.294809077320288, + "grad_norm": 0.16145189106464386, + "learning_rate": 8.85140644109254e-06, + "loss": 3.9719, + "step": 107365 + }, + { + "epoch": 7.29514879739095, + "grad_norm": 0.23564639687538147, + "learning_rate": 8.847159940209269e-06, + "loss": 4.035, + "step": 107370 + }, + { + "epoch": 7.295488517461612, + "grad_norm": 0.17241869866847992, + "learning_rate": 8.842913439325997e-06, + "loss": 3.7057, + "step": 107375 + }, + { + "epoch": 7.295828237532273, + "grad_norm": 0.19623598456382751, + "learning_rate": 8.838666938442723e-06, + "loss": 3.9146, + "step": 107380 + }, + { + "epoch": 7.296167957602935, + "grad_norm": 0.1829255372285843, + "learning_rate": 8.83442043755945e-06, + "loss": 3.9846, + "step": 107385 + }, + { + "epoch": 7.296507677673597, + "grad_norm": 0.16362528502941132, + "learning_rate": 8.830173936676179e-06, + "loss": 3.9447, + "step": 107390 + }, + { + "epoch": 7.2968473977442585, + "grad_norm": 0.29655978083610535, + "learning_rate": 8.825927435792907e-06, + "loss": 3.866, + "step": 107395 + }, + { + "epoch": 7.297187117814921, + "grad_norm": 0.3250157833099365, + "learning_rate": 8.821680934909635e-06, + "loss": 4.0155, + "step": 107400 + }, + { + "epoch": 7.297526837885583, + "grad_norm": 0.1357765942811966, + "learning_rate": 8.817434434026363e-06, + "loss": 3.7442, + "step": 107405 + }, + { + "epoch": 7.297866557956244, + "grad_norm": 0.20688477158546448, + "learning_rate": 8.813187933143091e-06, + "loss": 3.9458, + "step": 107410 + }, + { + "epoch": 7.298206278026906, + "grad_norm": 0.24890577793121338, + "learning_rate": 8.808941432259817e-06, + "loss": 3.7717, + "step": 107415 + }, + { + "epoch": 7.298545998097568, + "grad_norm": 0.1727716028690338, + "learning_rate": 8.804694931376547e-06, + "loss": 3.9467, + "step": 107420 + }, + { + "epoch": 7.298885718168229, + "grad_norm": 0.1806073933839798, + "learning_rate": 8.800448430493275e-06, + "loss": 3.8605, + "step": 107425 + }, + { + "epoch": 7.299225438238891, + "grad_norm": 0.14626602828502655, + "learning_rate": 8.796201929610001e-06, + "loss": 3.886, + "step": 107430 + }, + { + "epoch": 7.299565158309553, + "grad_norm": 0.17405296862125397, + "learning_rate": 8.79195542872673e-06, + "loss": 3.6932, + "step": 107435 + }, + { + "epoch": 7.2999048783802145, + "grad_norm": 0.13999539613723755, + "learning_rate": 8.787708927843457e-06, + "loss": 4.0062, + "step": 107440 + }, + { + "epoch": 7.300244598450877, + "grad_norm": 0.17110669612884521, + "learning_rate": 8.783462426960185e-06, + "loss": 3.5005, + "step": 107445 + }, + { + "epoch": 7.300584318521539, + "grad_norm": 0.1849653124809265, + "learning_rate": 8.779215926076913e-06, + "loss": 3.8668, + "step": 107450 + }, + { + "epoch": 7.3009240385922, + "grad_norm": 0.1586304008960724, + "learning_rate": 8.774969425193641e-06, + "loss": 3.8011, + "step": 107455 + }, + { + "epoch": 7.301263758662862, + "grad_norm": 0.17487601935863495, + "learning_rate": 8.77072292431037e-06, + "loss": 4.0174, + "step": 107460 + }, + { + "epoch": 7.301603478733524, + "grad_norm": 0.1563822478055954, + "learning_rate": 8.766476423427096e-06, + "loss": 3.7236, + "step": 107465 + }, + { + "epoch": 7.301943198804185, + "grad_norm": 0.6776418685913086, + "learning_rate": 8.762229922543824e-06, + "loss": 3.9356, + "step": 107470 + }, + { + "epoch": 7.302282918874847, + "grad_norm": 0.23824405670166016, + "learning_rate": 8.757983421660553e-06, + "loss": 3.6819, + "step": 107475 + }, + { + "epoch": 7.302622638945509, + "grad_norm": 0.15601377189159393, + "learning_rate": 8.75373692077728e-06, + "loss": 4.0311, + "step": 107480 + }, + { + "epoch": 7.3029623590161705, + "grad_norm": 0.26263752579689026, + "learning_rate": 8.749490419894008e-06, + "loss": 3.7785, + "step": 107485 + }, + { + "epoch": 7.303302079086833, + "grad_norm": 0.16778995096683502, + "learning_rate": 8.745243919010736e-06, + "loss": 3.7081, + "step": 107490 + }, + { + "epoch": 7.303641799157495, + "grad_norm": 0.18807077407836914, + "learning_rate": 8.740997418127464e-06, + "loss": 3.7497, + "step": 107495 + }, + { + "epoch": 7.303981519228156, + "grad_norm": 0.19120453298091888, + "learning_rate": 8.736750917244192e-06, + "loss": 3.6448, + "step": 107500 + }, + { + "epoch": 7.304321239298818, + "grad_norm": 0.17485946416854858, + "learning_rate": 8.73250441636092e-06, + "loss": 3.9761, + "step": 107505 + }, + { + "epoch": 7.30466095936948, + "grad_norm": 0.2005399614572525, + "learning_rate": 8.728257915477648e-06, + "loss": 3.5126, + "step": 107510 + }, + { + "epoch": 7.305000679440141, + "grad_norm": 0.18326766788959503, + "learning_rate": 8.724011414594374e-06, + "loss": 3.8546, + "step": 107515 + }, + { + "epoch": 7.305340399510803, + "grad_norm": 0.16299766302108765, + "learning_rate": 8.719764913711102e-06, + "loss": 3.974, + "step": 107520 + }, + { + "epoch": 7.3056801195814645, + "grad_norm": 0.1868872046470642, + "learning_rate": 8.71551841282783e-06, + "loss": 3.7068, + "step": 107525 + }, + { + "epoch": 7.3060198396521265, + "grad_norm": 0.16371487081050873, + "learning_rate": 8.711271911944558e-06, + "loss": 3.7715, + "step": 107530 + }, + { + "epoch": 7.306359559722789, + "grad_norm": 0.21083194017410278, + "learning_rate": 8.707025411061286e-06, + "loss": 4.0198, + "step": 107535 + }, + { + "epoch": 7.30669927979345, + "grad_norm": 0.3540940582752228, + "learning_rate": 8.702778910178014e-06, + "loss": 3.7051, + "step": 107540 + }, + { + "epoch": 7.307038999864112, + "grad_norm": 0.1616075038909912, + "learning_rate": 8.698532409294742e-06, + "loss": 3.7499, + "step": 107545 + }, + { + "epoch": 7.307378719934774, + "grad_norm": 0.6311487555503845, + "learning_rate": 8.694285908411468e-06, + "loss": 3.8775, + "step": 107550 + }, + { + "epoch": 7.307718440005435, + "grad_norm": 0.20392143726348877, + "learning_rate": 8.690039407528196e-06, + "loss": 4.1299, + "step": 107555 + }, + { + "epoch": 7.308058160076097, + "grad_norm": 0.16530351340770721, + "learning_rate": 8.685792906644926e-06, + "loss": 3.9828, + "step": 107560 + }, + { + "epoch": 7.308397880146759, + "grad_norm": 0.3407825231552124, + "learning_rate": 8.681546405761652e-06, + "loss": 3.77, + "step": 107565 + }, + { + "epoch": 7.3087376002174205, + "grad_norm": 0.5041821599006653, + "learning_rate": 8.67729990487838e-06, + "loss": 4.0765, + "step": 107570 + }, + { + "epoch": 7.3090773202880825, + "grad_norm": 0.17555730044841766, + "learning_rate": 8.673053403995108e-06, + "loss": 3.6678, + "step": 107575 + }, + { + "epoch": 7.309417040358745, + "grad_norm": 0.19526495039463043, + "learning_rate": 8.668806903111836e-06, + "loss": 3.8449, + "step": 107580 + }, + { + "epoch": 7.309756760429406, + "grad_norm": 0.1723722666501999, + "learning_rate": 8.664560402228564e-06, + "loss": 3.8898, + "step": 107585 + }, + { + "epoch": 7.310096480500068, + "grad_norm": 0.1570865362882614, + "learning_rate": 8.660313901345292e-06, + "loss": 3.7798, + "step": 107590 + }, + { + "epoch": 7.31043620057073, + "grad_norm": 0.6412787437438965, + "learning_rate": 8.65606740046202e-06, + "loss": 3.8649, + "step": 107595 + }, + { + "epoch": 7.310775920641391, + "grad_norm": 0.2056066393852234, + "learning_rate": 8.651820899578747e-06, + "loss": 3.8646, + "step": 107600 + }, + { + "epoch": 7.311115640712053, + "grad_norm": 0.2432146817445755, + "learning_rate": 8.647574398695475e-06, + "loss": 3.808, + "step": 107605 + }, + { + "epoch": 7.311455360782715, + "grad_norm": 0.8702138662338257, + "learning_rate": 8.643327897812203e-06, + "loss": 3.9286, + "step": 107610 + }, + { + "epoch": 7.3117950808533765, + "grad_norm": 0.20233777165412903, + "learning_rate": 8.639081396928932e-06, + "loss": 3.8287, + "step": 107615 + }, + { + "epoch": 7.312134800924039, + "grad_norm": 0.17163412272930145, + "learning_rate": 8.634834896045659e-06, + "loss": 3.7739, + "step": 107620 + }, + { + "epoch": 7.312474520994701, + "grad_norm": 0.15947814285755157, + "learning_rate": 8.630588395162387e-06, + "loss": 3.6785, + "step": 107625 + }, + { + "epoch": 7.312814241065362, + "grad_norm": 0.17247028648853302, + "learning_rate": 8.626341894279115e-06, + "loss": 3.9311, + "step": 107630 + }, + { + "epoch": 7.313153961136024, + "grad_norm": 0.15215569734573364, + "learning_rate": 8.622095393395841e-06, + "loss": 3.726, + "step": 107635 + }, + { + "epoch": 7.313493681206686, + "grad_norm": 0.27706605195999146, + "learning_rate": 8.61784889251257e-06, + "loss": 3.7133, + "step": 107640 + }, + { + "epoch": 7.313833401277347, + "grad_norm": 0.38356372714042664, + "learning_rate": 8.613602391629299e-06, + "loss": 3.9494, + "step": 107645 + }, + { + "epoch": 7.314173121348009, + "grad_norm": 0.18103563785552979, + "learning_rate": 8.609355890746025e-06, + "loss": 3.762, + "step": 107650 + }, + { + "epoch": 7.314512841418671, + "grad_norm": 0.4278948903083801, + "learning_rate": 8.605109389862753e-06, + "loss": 3.681, + "step": 107655 + }, + { + "epoch": 7.3148525614893325, + "grad_norm": 0.19335047900676727, + "learning_rate": 8.600862888979481e-06, + "loss": 3.562, + "step": 107660 + }, + { + "epoch": 7.315192281559995, + "grad_norm": 0.22074398398399353, + "learning_rate": 8.596616388096209e-06, + "loss": 3.8691, + "step": 107665 + }, + { + "epoch": 7.315532001630657, + "grad_norm": 0.14973321557044983, + "learning_rate": 8.592369887212937e-06, + "loss": 4.0373, + "step": 107670 + }, + { + "epoch": 7.315871721701318, + "grad_norm": 0.16589465737342834, + "learning_rate": 8.588123386329665e-06, + "loss": 4.059, + "step": 107675 + }, + { + "epoch": 7.31621144177198, + "grad_norm": 0.20607790350914001, + "learning_rate": 8.583876885446393e-06, + "loss": 3.8748, + "step": 107680 + }, + { + "epoch": 7.316551161842642, + "grad_norm": 0.19622597098350525, + "learning_rate": 8.57963038456312e-06, + "loss": 3.7194, + "step": 107685 + }, + { + "epoch": 7.316890881913303, + "grad_norm": 0.23347243666648865, + "learning_rate": 8.575383883679847e-06, + "loss": 3.9553, + "step": 107690 + }, + { + "epoch": 7.317230601983965, + "grad_norm": 0.1819451004266739, + "learning_rate": 8.571137382796575e-06, + "loss": 3.7066, + "step": 107695 + }, + { + "epoch": 7.317570322054627, + "grad_norm": 0.3156582713127136, + "learning_rate": 8.566890881913305e-06, + "loss": 3.7487, + "step": 107700 + }, + { + "epoch": 7.3179100421252885, + "grad_norm": 0.24029746651649475, + "learning_rate": 8.562644381030031e-06, + "loss": 4.136, + "step": 107705 + }, + { + "epoch": 7.318249762195951, + "grad_norm": 0.2002977430820465, + "learning_rate": 8.55839788014676e-06, + "loss": 3.8671, + "step": 107710 + }, + { + "epoch": 7.318589482266613, + "grad_norm": 0.19133514165878296, + "learning_rate": 8.554151379263487e-06, + "loss": 4.0722, + "step": 107715 + }, + { + "epoch": 7.318929202337274, + "grad_norm": 0.23230986297130585, + "learning_rate": 8.549904878380214e-06, + "loss": 4.0766, + "step": 107720 + }, + { + "epoch": 7.319268922407936, + "grad_norm": 0.1751737743616104, + "learning_rate": 8.545658377496944e-06, + "loss": 3.9282, + "step": 107725 + }, + { + "epoch": 7.319608642478598, + "grad_norm": 0.23105765879154205, + "learning_rate": 8.541411876613672e-06, + "loss": 3.8957, + "step": 107730 + }, + { + "epoch": 7.319948362549259, + "grad_norm": 0.19609323143959045, + "learning_rate": 8.537165375730398e-06, + "loss": 3.7947, + "step": 107735 + }, + { + "epoch": 7.320288082619921, + "grad_norm": 0.23834647238254547, + "learning_rate": 8.532918874847126e-06, + "loss": 3.8069, + "step": 107740 + }, + { + "epoch": 7.320627802690583, + "grad_norm": 0.14837568998336792, + "learning_rate": 8.528672373963854e-06, + "loss": 3.7072, + "step": 107745 + }, + { + "epoch": 7.3209675227612445, + "grad_norm": 0.18439270555973053, + "learning_rate": 8.524425873080582e-06, + "loss": 3.9011, + "step": 107750 + }, + { + "epoch": 7.321307242831907, + "grad_norm": 0.5424985289573669, + "learning_rate": 8.52017937219731e-06, + "loss": 3.5934, + "step": 107755 + }, + { + "epoch": 7.321646962902569, + "grad_norm": 0.20301909744739532, + "learning_rate": 8.515932871314038e-06, + "loss": 3.8267, + "step": 107760 + }, + { + "epoch": 7.32198668297323, + "grad_norm": 0.16596661508083344, + "learning_rate": 8.511686370430766e-06, + "loss": 3.7835, + "step": 107765 + }, + { + "epoch": 7.322326403043892, + "grad_norm": 0.19107195734977722, + "learning_rate": 8.507439869547492e-06, + "loss": 3.7133, + "step": 107770 + }, + { + "epoch": 7.322666123114554, + "grad_norm": 0.31618592143058777, + "learning_rate": 8.50319336866422e-06, + "loss": 3.8101, + "step": 107775 + }, + { + "epoch": 7.323005843185215, + "grad_norm": 0.1522805392742157, + "learning_rate": 8.49894686778095e-06, + "loss": 3.995, + "step": 107780 + }, + { + "epoch": 7.323345563255877, + "grad_norm": 0.18167628347873688, + "learning_rate": 8.494700366897678e-06, + "loss": 3.8757, + "step": 107785 + }, + { + "epoch": 7.323685283326539, + "grad_norm": 0.1597878336906433, + "learning_rate": 8.490453866014404e-06, + "loss": 3.6483, + "step": 107790 + }, + { + "epoch": 7.3240250033972005, + "grad_norm": 0.1683827042579651, + "learning_rate": 8.486207365131132e-06, + "loss": 3.8933, + "step": 107795 + }, + { + "epoch": 7.324364723467863, + "grad_norm": 0.1751611828804016, + "learning_rate": 8.48196086424786e-06, + "loss": 3.9444, + "step": 107800 + }, + { + "epoch": 7.324704443538524, + "grad_norm": 0.19201968610286713, + "learning_rate": 8.477714363364588e-06, + "loss": 3.8083, + "step": 107805 + }, + { + "epoch": 7.325044163609186, + "grad_norm": 0.18647661805152893, + "learning_rate": 8.473467862481316e-06, + "loss": 3.9343, + "step": 107810 + }, + { + "epoch": 7.325383883679848, + "grad_norm": 0.1607722043991089, + "learning_rate": 8.469221361598044e-06, + "loss": 3.8903, + "step": 107815 + }, + { + "epoch": 7.325723603750509, + "grad_norm": 0.223912313580513, + "learning_rate": 8.46497486071477e-06, + "loss": 3.9149, + "step": 107820 + }, + { + "epoch": 7.326063323821171, + "grad_norm": 0.14886409044265747, + "learning_rate": 8.460728359831499e-06, + "loss": 3.6537, + "step": 107825 + }, + { + "epoch": 7.326403043891833, + "grad_norm": 0.1703827977180481, + "learning_rate": 8.456481858948227e-06, + "loss": 3.9012, + "step": 107830 + }, + { + "epoch": 7.3267427639624945, + "grad_norm": 0.4273234009742737, + "learning_rate": 8.452235358064956e-06, + "loss": 3.9721, + "step": 107835 + }, + { + "epoch": 7.3270824840331565, + "grad_norm": 0.27335163950920105, + "learning_rate": 8.447988857181683e-06, + "loss": 3.9521, + "step": 107840 + }, + { + "epoch": 7.327422204103819, + "grad_norm": 0.16412664949893951, + "learning_rate": 8.44374235629841e-06, + "loss": 3.805, + "step": 107845 + }, + { + "epoch": 7.32776192417448, + "grad_norm": 0.1849745512008667, + "learning_rate": 8.439495855415139e-06, + "loss": 3.9557, + "step": 107850 + }, + { + "epoch": 7.328101644245142, + "grad_norm": 0.1939927041530609, + "learning_rate": 8.435249354531865e-06, + "loss": 3.8603, + "step": 107855 + }, + { + "epoch": 7.328441364315804, + "grad_norm": 0.13960106670856476, + "learning_rate": 8.431002853648595e-06, + "loss": 3.9251, + "step": 107860 + }, + { + "epoch": 7.328781084386465, + "grad_norm": 0.39026322960853577, + "learning_rate": 8.426756352765323e-06, + "loss": 3.8816, + "step": 107865 + }, + { + "epoch": 7.329120804457127, + "grad_norm": 0.42249009013175964, + "learning_rate": 8.42250985188205e-06, + "loss": 3.9561, + "step": 107870 + }, + { + "epoch": 7.329460524527789, + "grad_norm": 0.1603161245584488, + "learning_rate": 8.418263350998777e-06, + "loss": 3.8454, + "step": 107875 + }, + { + "epoch": 7.3298002445984505, + "grad_norm": 0.18263494968414307, + "learning_rate": 8.414016850115505e-06, + "loss": 3.9343, + "step": 107880 + }, + { + "epoch": 7.3301399646691126, + "grad_norm": 0.2341553121805191, + "learning_rate": 8.409770349232233e-06, + "loss": 3.9662, + "step": 107885 + }, + { + "epoch": 7.330479684739775, + "grad_norm": 0.20894405245780945, + "learning_rate": 8.405523848348961e-06, + "loss": 4.0367, + "step": 107890 + }, + { + "epoch": 7.330819404810436, + "grad_norm": 0.19286547601222992, + "learning_rate": 8.401277347465689e-06, + "loss": 3.8217, + "step": 107895 + }, + { + "epoch": 7.331159124881098, + "grad_norm": 0.20866671204566956, + "learning_rate": 8.397030846582417e-06, + "loss": 4.0212, + "step": 107900 + }, + { + "epoch": 7.33149884495176, + "grad_norm": 0.1757178157567978, + "learning_rate": 8.392784345699143e-06, + "loss": 3.7575, + "step": 107905 + }, + { + "epoch": 7.331838565022421, + "grad_norm": 0.18487712740898132, + "learning_rate": 8.388537844815871e-06, + "loss": 3.8831, + "step": 107910 + }, + { + "epoch": 7.332178285093083, + "grad_norm": 0.1810256838798523, + "learning_rate": 8.3842913439326e-06, + "loss": 3.8076, + "step": 107915 + }, + { + "epoch": 7.332518005163745, + "grad_norm": 0.1847206950187683, + "learning_rate": 8.380044843049329e-06, + "loss": 3.7936, + "step": 107920 + }, + { + "epoch": 7.3328577252344065, + "grad_norm": 0.18426653742790222, + "learning_rate": 8.375798342166055e-06, + "loss": 3.6626, + "step": 107925 + }, + { + "epoch": 7.333197445305069, + "grad_norm": 0.17291440069675446, + "learning_rate": 8.371551841282783e-06, + "loss": 3.6714, + "step": 107930 + }, + { + "epoch": 7.333537165375731, + "grad_norm": 0.1564497947692871, + "learning_rate": 8.367305340399511e-06, + "loss": 4.0796, + "step": 107935 + }, + { + "epoch": 7.333876885446392, + "grad_norm": 0.169969841837883, + "learning_rate": 8.363058839516238e-06, + "loss": 3.9492, + "step": 107940 + }, + { + "epoch": 7.334216605517054, + "grad_norm": 0.1844281405210495, + "learning_rate": 8.358812338632967e-06, + "loss": 3.9718, + "step": 107945 + }, + { + "epoch": 7.334556325587716, + "grad_norm": 0.18549422919750214, + "learning_rate": 8.354565837749695e-06, + "loss": 3.7795, + "step": 107950 + }, + { + "epoch": 7.334896045658377, + "grad_norm": 0.132721945643425, + "learning_rate": 8.350319336866423e-06, + "loss": 3.9137, + "step": 107955 + }, + { + "epoch": 7.335235765729039, + "grad_norm": 0.17554627358913422, + "learning_rate": 8.34607283598315e-06, + "loss": 3.6658, + "step": 107960 + }, + { + "epoch": 7.335575485799701, + "grad_norm": 0.17165358364582062, + "learning_rate": 8.341826335099878e-06, + "loss": 3.6642, + "step": 107965 + }, + { + "epoch": 7.3359152058703625, + "grad_norm": 0.16945745050907135, + "learning_rate": 8.337579834216606e-06, + "loss": 3.4195, + "step": 107970 + }, + { + "epoch": 7.336254925941025, + "grad_norm": 0.5275987386703491, + "learning_rate": 8.333333333333334e-06, + "loss": 3.8707, + "step": 107975 + }, + { + "epoch": 7.336594646011687, + "grad_norm": 0.2596190869808197, + "learning_rate": 8.329086832450062e-06, + "loss": 3.7326, + "step": 107980 + }, + { + "epoch": 7.336934366082348, + "grad_norm": 0.15066921710968018, + "learning_rate": 8.32484033156679e-06, + "loss": 4.1186, + "step": 107985 + }, + { + "epoch": 7.33727408615301, + "grad_norm": 0.2105061113834381, + "learning_rate": 8.320593830683516e-06, + "loss": 3.6672, + "step": 107990 + }, + { + "epoch": 7.337613806223672, + "grad_norm": 0.2163403183221817, + "learning_rate": 8.316347329800244e-06, + "loss": 3.9057, + "step": 107995 + }, + { + "epoch": 7.337953526294333, + "grad_norm": 0.16604188084602356, + "learning_rate": 8.312100828916974e-06, + "loss": 3.8134, + "step": 108000 + }, + { + "epoch": 7.338293246364995, + "grad_norm": 0.20128028094768524, + "learning_rate": 8.307854328033702e-06, + "loss": 4.04, + "step": 108005 + }, + { + "epoch": 7.338632966435657, + "grad_norm": 0.15485036373138428, + "learning_rate": 8.303607827150428e-06, + "loss": 3.8992, + "step": 108010 + }, + { + "epoch": 7.3389726865063185, + "grad_norm": 0.1683167815208435, + "learning_rate": 8.299361326267156e-06, + "loss": 3.9761, + "step": 108015 + }, + { + "epoch": 7.339312406576981, + "grad_norm": 0.2283181995153427, + "learning_rate": 8.295114825383884e-06, + "loss": 3.8604, + "step": 108020 + }, + { + "epoch": 7.339652126647643, + "grad_norm": 0.15910549461841583, + "learning_rate": 8.290868324500612e-06, + "loss": 3.8339, + "step": 108025 + }, + { + "epoch": 7.339991846718304, + "grad_norm": 0.17723943293094635, + "learning_rate": 8.28662182361734e-06, + "loss": 3.9985, + "step": 108030 + }, + { + "epoch": 7.340331566788966, + "grad_norm": 0.2146940976381302, + "learning_rate": 8.282375322734068e-06, + "loss": 3.8386, + "step": 108035 + }, + { + "epoch": 7.340671286859628, + "grad_norm": 0.17512372136116028, + "learning_rate": 8.278128821850796e-06, + "loss": 3.8577, + "step": 108040 + }, + { + "epoch": 7.341011006930289, + "grad_norm": 0.19845172762870789, + "learning_rate": 8.273882320967522e-06, + "loss": 4.1435, + "step": 108045 + }, + { + "epoch": 7.341350727000951, + "grad_norm": 0.20064064860343933, + "learning_rate": 8.26963582008425e-06, + "loss": 3.7158, + "step": 108050 + }, + { + "epoch": 7.341690447071613, + "grad_norm": 0.1760084480047226, + "learning_rate": 8.265389319200978e-06, + "loss": 3.7792, + "step": 108055 + }, + { + "epoch": 7.3420301671422745, + "grad_norm": 0.18345767259597778, + "learning_rate": 8.261142818317706e-06, + "loss": 3.84, + "step": 108060 + }, + { + "epoch": 7.342369887212937, + "grad_norm": 0.1386946439743042, + "learning_rate": 8.256896317434434e-06, + "loss": 3.9299, + "step": 108065 + }, + { + "epoch": 7.342709607283599, + "grad_norm": 0.1406395137310028, + "learning_rate": 8.252649816551162e-06, + "loss": 3.9723, + "step": 108070 + }, + { + "epoch": 7.34304932735426, + "grad_norm": 0.15392529964447021, + "learning_rate": 8.248403315667889e-06, + "loss": 3.8646, + "step": 108075 + }, + { + "epoch": 7.343389047424922, + "grad_norm": 0.1437893658876419, + "learning_rate": 8.244156814784617e-06, + "loss": 3.8511, + "step": 108080 + }, + { + "epoch": 7.343728767495584, + "grad_norm": 0.13719965517520905, + "learning_rate": 8.239910313901347e-06, + "loss": 3.838, + "step": 108085 + }, + { + "epoch": 7.344068487566245, + "grad_norm": 0.17664378881454468, + "learning_rate": 8.235663813018075e-06, + "loss": 3.7363, + "step": 108090 + }, + { + "epoch": 7.344408207636907, + "grad_norm": 0.1450982242822647, + "learning_rate": 8.2314173121348e-06, + "loss": 3.6605, + "step": 108095 + }, + { + "epoch": 7.344747927707569, + "grad_norm": 0.36115846037864685, + "learning_rate": 8.227170811251529e-06, + "loss": 3.5298, + "step": 108100 + }, + { + "epoch": 7.3450876477782305, + "grad_norm": 0.16428931057453156, + "learning_rate": 8.222924310368257e-06, + "loss": 3.9518, + "step": 108105 + }, + { + "epoch": 7.345427367848893, + "grad_norm": 0.13730302453041077, + "learning_rate": 8.218677809484985e-06, + "loss": 4.1693, + "step": 108110 + }, + { + "epoch": 7.345767087919555, + "grad_norm": 0.17691826820373535, + "learning_rate": 8.214431308601713e-06, + "loss": 3.8727, + "step": 108115 + }, + { + "epoch": 7.346106807990216, + "grad_norm": 0.1954871565103531, + "learning_rate": 8.210184807718441e-06, + "loss": 3.458, + "step": 108120 + }, + { + "epoch": 7.346446528060878, + "grad_norm": 0.2067486047744751, + "learning_rate": 8.205938306835169e-06, + "loss": 3.935, + "step": 108125 + }, + { + "epoch": 7.34678624813154, + "grad_norm": 0.19440864026546478, + "learning_rate": 8.201691805951895e-06, + "loss": 3.8856, + "step": 108130 + }, + { + "epoch": 7.347125968202201, + "grad_norm": 0.24049051105976105, + "learning_rate": 8.197445305068623e-06, + "loss": 3.6647, + "step": 108135 + }, + { + "epoch": 7.347465688272863, + "grad_norm": 0.5484030246734619, + "learning_rate": 8.193198804185353e-06, + "loss": 3.6596, + "step": 108140 + }, + { + "epoch": 7.347805408343525, + "grad_norm": 0.16562148928642273, + "learning_rate": 8.18895230330208e-06, + "loss": 3.4836, + "step": 108145 + }, + { + "epoch": 7.3481451284141865, + "grad_norm": 0.17930351197719574, + "learning_rate": 8.184705802418807e-06, + "loss": 3.9312, + "step": 108150 + }, + { + "epoch": 7.348484848484849, + "grad_norm": 0.18872691690921783, + "learning_rate": 8.180459301535535e-06, + "loss": 3.968, + "step": 108155 + }, + { + "epoch": 7.348824568555511, + "grad_norm": 0.15501806139945984, + "learning_rate": 8.176212800652262e-06, + "loss": 4.0032, + "step": 108160 + }, + { + "epoch": 7.349164288626172, + "grad_norm": 0.26826319098472595, + "learning_rate": 8.171966299768991e-06, + "loss": 3.9864, + "step": 108165 + }, + { + "epoch": 7.349504008696834, + "grad_norm": 0.188595250248909, + "learning_rate": 8.16771979888572e-06, + "loss": 3.7461, + "step": 108170 + }, + { + "epoch": 7.349843728767496, + "grad_norm": 0.1815394014120102, + "learning_rate": 8.163473298002447e-06, + "loss": 3.8666, + "step": 108175 + }, + { + "epoch": 7.350183448838157, + "grad_norm": 2.2284679412841797, + "learning_rate": 8.159226797119174e-06, + "loss": 3.9906, + "step": 108180 + }, + { + "epoch": 7.350523168908819, + "grad_norm": 0.506980836391449, + "learning_rate": 8.154980296235902e-06, + "loss": 3.7525, + "step": 108185 + }, + { + "epoch": 7.350862888979481, + "grad_norm": 0.15069225430488586, + "learning_rate": 8.15073379535263e-06, + "loss": 3.9772, + "step": 108190 + }, + { + "epoch": 7.3512026090501426, + "grad_norm": 0.2766154706478119, + "learning_rate": 8.146487294469358e-06, + "loss": 3.8363, + "step": 108195 + }, + { + "epoch": 7.351542329120805, + "grad_norm": 0.19335344433784485, + "learning_rate": 8.142240793586086e-06, + "loss": 3.7572, + "step": 108200 + }, + { + "epoch": 7.351882049191466, + "grad_norm": 0.21041299402713776, + "learning_rate": 8.137994292702814e-06, + "loss": 3.7314, + "step": 108205 + }, + { + "epoch": 7.352221769262128, + "grad_norm": 0.1351102739572525, + "learning_rate": 8.133747791819542e-06, + "loss": 3.9948, + "step": 108210 + }, + { + "epoch": 7.35256148933279, + "grad_norm": 0.17109940946102142, + "learning_rate": 8.129501290936268e-06, + "loss": 3.8886, + "step": 108215 + }, + { + "epoch": 7.352901209403451, + "grad_norm": 0.1437784731388092, + "learning_rate": 8.125254790052998e-06, + "loss": 3.7618, + "step": 108220 + }, + { + "epoch": 7.353240929474113, + "grad_norm": 0.18766094744205475, + "learning_rate": 8.121008289169726e-06, + "loss": 3.9379, + "step": 108225 + }, + { + "epoch": 7.353580649544775, + "grad_norm": 0.14657507836818695, + "learning_rate": 8.116761788286452e-06, + "loss": 3.8465, + "step": 108230 + }, + { + "epoch": 7.3539203696154365, + "grad_norm": 0.23665247857570648, + "learning_rate": 8.11251528740318e-06, + "loss": 3.7595, + "step": 108235 + }, + { + "epoch": 7.354260089686099, + "grad_norm": 0.15757589042186737, + "learning_rate": 8.108268786519908e-06, + "loss": 3.9485, + "step": 108240 + }, + { + "epoch": 7.354599809756761, + "grad_norm": 0.15939471125602722, + "learning_rate": 8.104022285636636e-06, + "loss": 4.0794, + "step": 108245 + }, + { + "epoch": 7.354939529827422, + "grad_norm": 0.19561387598514557, + "learning_rate": 8.099775784753364e-06, + "loss": 3.9179, + "step": 108250 + }, + { + "epoch": 7.355279249898084, + "grad_norm": 0.19883058965206146, + "learning_rate": 8.095529283870092e-06, + "loss": 3.9083, + "step": 108255 + }, + { + "epoch": 7.355618969968746, + "grad_norm": 0.4493640959262848, + "learning_rate": 8.09128278298682e-06, + "loss": 3.9139, + "step": 108260 + }, + { + "epoch": 7.355958690039407, + "grad_norm": 0.1405506283044815, + "learning_rate": 8.087036282103546e-06, + "loss": 3.853, + "step": 108265 + }, + { + "epoch": 7.356298410110069, + "grad_norm": 0.14385059475898743, + "learning_rate": 8.082789781220274e-06, + "loss": 3.8035, + "step": 108270 + }, + { + "epoch": 7.356638130180731, + "grad_norm": 0.15008562803268433, + "learning_rate": 8.078543280337002e-06, + "loss": 3.9305, + "step": 108275 + }, + { + "epoch": 7.3569778502513925, + "grad_norm": 0.13906918466091156, + "learning_rate": 8.07429677945373e-06, + "loss": 3.9662, + "step": 108280 + }, + { + "epoch": 7.357317570322055, + "grad_norm": 0.27156612277030945, + "learning_rate": 8.070050278570458e-06, + "loss": 3.9235, + "step": 108285 + }, + { + "epoch": 7.357657290392717, + "grad_norm": 0.20454014837741852, + "learning_rate": 8.065803777687186e-06, + "loss": 3.6908, + "step": 108290 + }, + { + "epoch": 7.357997010463378, + "grad_norm": 0.21413397789001465, + "learning_rate": 8.061557276803914e-06, + "loss": 3.9611, + "step": 108295 + }, + { + "epoch": 7.35833673053404, + "grad_norm": 0.16303732991218567, + "learning_rate": 8.05731077592064e-06, + "loss": 3.7261, + "step": 108300 + }, + { + "epoch": 7.358676450604702, + "grad_norm": 0.19971518218517303, + "learning_rate": 8.05306427503737e-06, + "loss": 3.8853, + "step": 108305 + }, + { + "epoch": 7.359016170675363, + "grad_norm": 0.14243032038211823, + "learning_rate": 8.048817774154098e-06, + "loss": 3.8938, + "step": 108310 + }, + { + "epoch": 7.359355890746025, + "grad_norm": 0.15057063102722168, + "learning_rate": 8.044571273270825e-06, + "loss": 3.8207, + "step": 108315 + }, + { + "epoch": 7.359695610816687, + "grad_norm": 0.20640496909618378, + "learning_rate": 8.040324772387553e-06, + "loss": 3.8797, + "step": 108320 + }, + { + "epoch": 7.3600353308873485, + "grad_norm": 0.1968332976102829, + "learning_rate": 8.03607827150428e-06, + "loss": 3.7957, + "step": 108325 + }, + { + "epoch": 7.360375050958011, + "grad_norm": 0.8537095785140991, + "learning_rate": 8.031831770621009e-06, + "loss": 3.6665, + "step": 108330 + }, + { + "epoch": 7.360714771028673, + "grad_norm": 0.12128935754299164, + "learning_rate": 8.027585269737737e-06, + "loss": 4.0931, + "step": 108335 + }, + { + "epoch": 7.361054491099334, + "grad_norm": 0.19294209778308868, + "learning_rate": 8.023338768854465e-06, + "loss": 3.8169, + "step": 108340 + }, + { + "epoch": 7.361394211169996, + "grad_norm": 0.5704728960990906, + "learning_rate": 8.019092267971193e-06, + "loss": 3.6773, + "step": 108345 + }, + { + "epoch": 7.361733931240658, + "grad_norm": 0.19956091046333313, + "learning_rate": 8.014845767087919e-06, + "loss": 3.6436, + "step": 108350 + }, + { + "epoch": 7.362073651311319, + "grad_norm": 0.1903611719608307, + "learning_rate": 8.010599266204647e-06, + "loss": 3.731, + "step": 108355 + }, + { + "epoch": 7.362413371381981, + "grad_norm": 0.17816706001758575, + "learning_rate": 8.006352765321377e-06, + "loss": 3.8169, + "step": 108360 + }, + { + "epoch": 7.362753091452643, + "grad_norm": 0.15743479132652283, + "learning_rate": 8.002106264438103e-06, + "loss": 3.9376, + "step": 108365 + }, + { + "epoch": 7.3630928115233045, + "grad_norm": 0.1647801548242569, + "learning_rate": 7.997859763554831e-06, + "loss": 3.9504, + "step": 108370 + }, + { + "epoch": 7.363432531593967, + "grad_norm": 0.2219424545764923, + "learning_rate": 7.993613262671559e-06, + "loss": 3.8073, + "step": 108375 + }, + { + "epoch": 7.363772251664629, + "grad_norm": 0.1540605127811432, + "learning_rate": 7.989366761788287e-06, + "loss": 4.0118, + "step": 108380 + }, + { + "epoch": 7.36411197173529, + "grad_norm": 0.23408758640289307, + "learning_rate": 7.985120260905015e-06, + "loss": 3.9184, + "step": 108385 + }, + { + "epoch": 7.364451691805952, + "grad_norm": 0.2109578549861908, + "learning_rate": 7.980873760021743e-06, + "loss": 3.4552, + "step": 108390 + }, + { + "epoch": 7.364791411876614, + "grad_norm": 0.1949453204870224, + "learning_rate": 7.976627259138471e-06, + "loss": 3.8151, + "step": 108395 + }, + { + "epoch": 7.365131131947275, + "grad_norm": 0.2017679065465927, + "learning_rate": 7.972380758255197e-06, + "loss": 3.8049, + "step": 108400 + }, + { + "epoch": 7.365470852017937, + "grad_norm": 0.26221540570259094, + "learning_rate": 7.968134257371925e-06, + "loss": 3.6449, + "step": 108405 + }, + { + "epoch": 7.365810572088599, + "grad_norm": 0.19529150426387787, + "learning_rate": 7.963887756488653e-06, + "loss": 3.5112, + "step": 108410 + }, + { + "epoch": 7.3661502921592605, + "grad_norm": 0.16357851028442383, + "learning_rate": 7.959641255605381e-06, + "loss": 3.6325, + "step": 108415 + }, + { + "epoch": 7.366490012229923, + "grad_norm": 0.3032436966896057, + "learning_rate": 7.95539475472211e-06, + "loss": 3.7154, + "step": 108420 + }, + { + "epoch": 7.366829732300585, + "grad_norm": 0.25075745582580566, + "learning_rate": 7.951148253838837e-06, + "loss": 3.8867, + "step": 108425 + }, + { + "epoch": 7.367169452371246, + "grad_norm": 0.17205968499183655, + "learning_rate": 7.946901752955565e-06, + "loss": 3.7213, + "step": 108430 + }, + { + "epoch": 7.367509172441908, + "grad_norm": 0.17247143387794495, + "learning_rate": 7.942655252072292e-06, + "loss": 3.9759, + "step": 108435 + }, + { + "epoch": 7.36784889251257, + "grad_norm": 0.1978517472743988, + "learning_rate": 7.938408751189022e-06, + "loss": 4.0703, + "step": 108440 + }, + { + "epoch": 7.368188612583231, + "grad_norm": 0.2111922800540924, + "learning_rate": 7.93416225030575e-06, + "loss": 3.6014, + "step": 108445 + }, + { + "epoch": 7.368528332653893, + "grad_norm": 0.20879314839839935, + "learning_rate": 7.929915749422476e-06, + "loss": 3.9199, + "step": 108450 + }, + { + "epoch": 7.368868052724555, + "grad_norm": 0.17666055262088776, + "learning_rate": 7.925669248539204e-06, + "loss": 3.8694, + "step": 108455 + }, + { + "epoch": 7.3692077727952165, + "grad_norm": 0.2577568590641022, + "learning_rate": 7.921422747655932e-06, + "loss": 3.7136, + "step": 108460 + }, + { + "epoch": 7.369547492865879, + "grad_norm": 0.20375528931617737, + "learning_rate": 7.91717624677266e-06, + "loss": 3.8182, + "step": 108465 + }, + { + "epoch": 7.369887212936541, + "grad_norm": 0.1491234302520752, + "learning_rate": 7.912929745889388e-06, + "loss": 3.7686, + "step": 108470 + }, + { + "epoch": 7.370226933007202, + "grad_norm": 0.13671518862247467, + "learning_rate": 7.908683245006116e-06, + "loss": 3.7775, + "step": 108475 + }, + { + "epoch": 7.370566653077864, + "grad_norm": 0.14283156394958496, + "learning_rate": 7.904436744122844e-06, + "loss": 3.7696, + "step": 108480 + }, + { + "epoch": 7.370906373148525, + "grad_norm": 0.18278057873249054, + "learning_rate": 7.90019024323957e-06, + "loss": 3.8658, + "step": 108485 + }, + { + "epoch": 7.371246093219187, + "grad_norm": 0.1827791929244995, + "learning_rate": 7.895943742356298e-06, + "loss": 3.8325, + "step": 108490 + }, + { + "epoch": 7.371585813289849, + "grad_norm": 0.24817967414855957, + "learning_rate": 7.891697241473026e-06, + "loss": 3.7931, + "step": 108495 + }, + { + "epoch": 7.3719255333605105, + "grad_norm": 0.15963256359100342, + "learning_rate": 7.887450740589754e-06, + "loss": 3.7432, + "step": 108500 + }, + { + "epoch": 7.372265253431173, + "grad_norm": 0.1463175117969513, + "learning_rate": 7.883204239706482e-06, + "loss": 3.8188, + "step": 108505 + }, + { + "epoch": 7.372604973501835, + "grad_norm": 0.17443057894706726, + "learning_rate": 7.87895773882321e-06, + "loss": 3.7327, + "step": 108510 + }, + { + "epoch": 7.372944693572496, + "grad_norm": 2.1831905841827393, + "learning_rate": 7.874711237939938e-06, + "loss": 4.0071, + "step": 108515 + }, + { + "epoch": 7.373284413643158, + "grad_norm": 0.14458733797073364, + "learning_rate": 7.870464737056665e-06, + "loss": 4.0708, + "step": 108520 + }, + { + "epoch": 7.37362413371382, + "grad_norm": 1.0653076171875, + "learning_rate": 7.866218236173394e-06, + "loss": 3.9448, + "step": 108525 + }, + { + "epoch": 7.373963853784481, + "grad_norm": 0.2075575739145279, + "learning_rate": 7.861971735290122e-06, + "loss": 3.8335, + "step": 108530 + }, + { + "epoch": 7.374303573855143, + "grad_norm": 0.16777454316616058, + "learning_rate": 7.857725234406849e-06, + "loss": 3.9257, + "step": 108535 + }, + { + "epoch": 7.374643293925805, + "grad_norm": 0.14747445285320282, + "learning_rate": 7.853478733523577e-06, + "loss": 3.8399, + "step": 108540 + }, + { + "epoch": 7.3749830139964665, + "grad_norm": 0.14463555812835693, + "learning_rate": 7.849232232640305e-06, + "loss": 3.7456, + "step": 108545 + }, + { + "epoch": 7.375322734067129, + "grad_norm": 0.1711658239364624, + "learning_rate": 7.844985731757033e-06, + "loss": 3.6467, + "step": 108550 + }, + { + "epoch": 7.375662454137791, + "grad_norm": 0.15463024377822876, + "learning_rate": 7.84073923087376e-06, + "loss": 4.1108, + "step": 108555 + }, + { + "epoch": 7.376002174208452, + "grad_norm": 1.8799561262130737, + "learning_rate": 7.836492729990489e-06, + "loss": 3.9268, + "step": 108560 + }, + { + "epoch": 7.376341894279114, + "grad_norm": 0.21382978558540344, + "learning_rate": 7.832246229107217e-06, + "loss": 3.8432, + "step": 108565 + }, + { + "epoch": 7.376681614349776, + "grad_norm": 1.2900538444519043, + "learning_rate": 7.827999728223943e-06, + "loss": 3.9665, + "step": 108570 + }, + { + "epoch": 7.377021334420437, + "grad_norm": 0.18911507725715637, + "learning_rate": 7.823753227340671e-06, + "loss": 3.9298, + "step": 108575 + }, + { + "epoch": 7.377361054491099, + "grad_norm": 0.18054737150669098, + "learning_rate": 7.8195067264574e-06, + "loss": 3.9212, + "step": 108580 + }, + { + "epoch": 7.377700774561761, + "grad_norm": 0.16565801203250885, + "learning_rate": 7.815260225574127e-06, + "loss": 3.8516, + "step": 108585 + }, + { + "epoch": 7.3780404946324225, + "grad_norm": 0.1949283480644226, + "learning_rate": 7.811013724690855e-06, + "loss": 3.8447, + "step": 108590 + }, + { + "epoch": 7.378380214703085, + "grad_norm": 0.16440311074256897, + "learning_rate": 7.806767223807583e-06, + "loss": 4.0505, + "step": 108595 + }, + { + "epoch": 7.378719934773747, + "grad_norm": 0.18146024644374847, + "learning_rate": 7.802520722924311e-06, + "loss": 3.7665, + "step": 108600 + }, + { + "epoch": 7.379059654844408, + "grad_norm": 0.19708986580371857, + "learning_rate": 7.798274222041039e-06, + "loss": 3.6315, + "step": 108605 + }, + { + "epoch": 7.37939937491507, + "grad_norm": 0.14477546513080597, + "learning_rate": 7.794027721157767e-06, + "loss": 3.6939, + "step": 108610 + }, + { + "epoch": 7.379739094985732, + "grad_norm": 0.1585482954978943, + "learning_rate": 7.789781220274495e-06, + "loss": 4.0391, + "step": 108615 + }, + { + "epoch": 7.380078815056393, + "grad_norm": 0.18314462900161743, + "learning_rate": 7.785534719391221e-06, + "loss": 3.9065, + "step": 108620 + }, + { + "epoch": 7.380418535127055, + "grad_norm": 0.17761915922164917, + "learning_rate": 7.78128821850795e-06, + "loss": 3.9106, + "step": 108625 + }, + { + "epoch": 7.380758255197717, + "grad_norm": 0.17923204600811005, + "learning_rate": 7.777041717624677e-06, + "loss": 3.797, + "step": 108630 + }, + { + "epoch": 7.3810979752683785, + "grad_norm": 0.17550228536128998, + "learning_rate": 7.772795216741405e-06, + "loss": 3.7239, + "step": 108635 + }, + { + "epoch": 7.381437695339041, + "grad_norm": 0.17268860340118408, + "learning_rate": 7.768548715858133e-06, + "loss": 3.846, + "step": 108640 + }, + { + "epoch": 7.381777415409703, + "grad_norm": 0.1531846970319748, + "learning_rate": 7.764302214974861e-06, + "loss": 3.8959, + "step": 108645 + }, + { + "epoch": 7.382117135480364, + "grad_norm": 0.1522221863269806, + "learning_rate": 7.76005571409159e-06, + "loss": 3.8248, + "step": 108650 + }, + { + "epoch": 7.382456855551026, + "grad_norm": 0.19235564768314362, + "learning_rate": 7.755809213208316e-06, + "loss": 3.7892, + "step": 108655 + }, + { + "epoch": 7.382796575621688, + "grad_norm": 0.2964096963405609, + "learning_rate": 7.751562712325044e-06, + "loss": 3.8676, + "step": 108660 + }, + { + "epoch": 7.383136295692349, + "grad_norm": 0.201217919588089, + "learning_rate": 7.747316211441773e-06, + "loss": 3.6672, + "step": 108665 + }, + { + "epoch": 7.383476015763011, + "grad_norm": 0.17498357594013214, + "learning_rate": 7.7430697105585e-06, + "loss": 3.6907, + "step": 108670 + }, + { + "epoch": 7.383815735833673, + "grad_norm": 0.16035409271717072, + "learning_rate": 7.738823209675228e-06, + "loss": 3.9342, + "step": 108675 + }, + { + "epoch": 7.3841554559043345, + "grad_norm": 0.23584771156311035, + "learning_rate": 7.734576708791956e-06, + "loss": 3.7774, + "step": 108680 + }, + { + "epoch": 7.384495175974997, + "grad_norm": 0.13966616988182068, + "learning_rate": 7.730330207908684e-06, + "loss": 4.1105, + "step": 108685 + }, + { + "epoch": 7.384834896045659, + "grad_norm": 0.1558683216571808, + "learning_rate": 7.726083707025412e-06, + "loss": 3.7582, + "step": 108690 + }, + { + "epoch": 7.38517461611632, + "grad_norm": 0.20995797216892242, + "learning_rate": 7.72183720614214e-06, + "loss": 3.7318, + "step": 108695 + }, + { + "epoch": 7.385514336186982, + "grad_norm": 0.41176143288612366, + "learning_rate": 7.717590705258868e-06, + "loss": 3.7664, + "step": 108700 + }, + { + "epoch": 7.385854056257644, + "grad_norm": 0.28362852334976196, + "learning_rate": 7.713344204375594e-06, + "loss": 3.8428, + "step": 108705 + }, + { + "epoch": 7.386193776328305, + "grad_norm": 0.1619897186756134, + "learning_rate": 7.709097703492322e-06, + "loss": 4.0483, + "step": 108710 + }, + { + "epoch": 7.386533496398967, + "grad_norm": 0.21205449104309082, + "learning_rate": 7.70485120260905e-06, + "loss": 3.7039, + "step": 108715 + }, + { + "epoch": 7.386873216469629, + "grad_norm": 0.18277598917484283, + "learning_rate": 7.700604701725778e-06, + "loss": 3.9681, + "step": 108720 + }, + { + "epoch": 7.3872129365402905, + "grad_norm": 0.15583190321922302, + "learning_rate": 7.696358200842506e-06, + "loss": 3.9142, + "step": 108725 + }, + { + "epoch": 7.387552656610953, + "grad_norm": 1.1889629364013672, + "learning_rate": 7.692111699959234e-06, + "loss": 3.8578, + "step": 108730 + }, + { + "epoch": 7.387892376681615, + "grad_norm": 0.21447785198688507, + "learning_rate": 7.687865199075962e-06, + "loss": 4.0448, + "step": 108735 + }, + { + "epoch": 7.388232096752276, + "grad_norm": 0.2074686884880066, + "learning_rate": 7.683618698192688e-06, + "loss": 3.944, + "step": 108740 + }, + { + "epoch": 7.388571816822938, + "grad_norm": 0.15387216210365295, + "learning_rate": 7.679372197309418e-06, + "loss": 3.8492, + "step": 108745 + }, + { + "epoch": 7.3889115368936, + "grad_norm": 0.15688331425189972, + "learning_rate": 7.675125696426146e-06, + "loss": 3.8514, + "step": 108750 + }, + { + "epoch": 7.389251256964261, + "grad_norm": 0.15247316658496857, + "learning_rate": 7.670879195542872e-06, + "loss": 4.0237, + "step": 108755 + }, + { + "epoch": 7.389590977034923, + "grad_norm": 0.16353081166744232, + "learning_rate": 7.6666326946596e-06, + "loss": 3.6543, + "step": 108760 + }, + { + "epoch": 7.389930697105585, + "grad_norm": 0.24369293451309204, + "learning_rate": 7.662386193776328e-06, + "loss": 3.8075, + "step": 108765 + }, + { + "epoch": 7.3902704171762466, + "grad_norm": 0.1788993626832962, + "learning_rate": 7.658139692893056e-06, + "loss": 3.985, + "step": 108770 + }, + { + "epoch": 7.390610137246909, + "grad_norm": 0.17527318000793457, + "learning_rate": 7.653893192009784e-06, + "loss": 3.8565, + "step": 108775 + }, + { + "epoch": 7.390949857317571, + "grad_norm": 0.18148688971996307, + "learning_rate": 7.649646691126512e-06, + "loss": 4.2291, + "step": 108780 + }, + { + "epoch": 7.391289577388232, + "grad_norm": 0.21547456085681915, + "learning_rate": 7.64540019024324e-06, + "loss": 3.9544, + "step": 108785 + }, + { + "epoch": 7.391629297458894, + "grad_norm": 0.19631826877593994, + "learning_rate": 7.641153689359967e-06, + "loss": 3.9504, + "step": 108790 + }, + { + "epoch": 7.391969017529556, + "grad_norm": 0.2659476697444916, + "learning_rate": 7.636907188476695e-06, + "loss": 3.7977, + "step": 108795 + }, + { + "epoch": 7.392308737600217, + "grad_norm": 0.16344229876995087, + "learning_rate": 7.632660687593425e-06, + "loss": 3.6489, + "step": 108800 + }, + { + "epoch": 7.392648457670879, + "grad_norm": 0.19426102936267853, + "learning_rate": 7.628414186710151e-06, + "loss": 4.1774, + "step": 108805 + }, + { + "epoch": 7.392988177741541, + "grad_norm": 0.23387260735034943, + "learning_rate": 7.624167685826879e-06, + "loss": 3.8268, + "step": 108810 + }, + { + "epoch": 7.393327897812203, + "grad_norm": 0.15504832565784454, + "learning_rate": 7.619921184943607e-06, + "loss": 3.8366, + "step": 108815 + }, + { + "epoch": 7.393667617882865, + "grad_norm": 0.21293596923351288, + "learning_rate": 7.615674684060335e-06, + "loss": 4.1191, + "step": 108820 + }, + { + "epoch": 7.394007337953527, + "grad_norm": 0.17468059062957764, + "learning_rate": 7.611428183177062e-06, + "loss": 3.8817, + "step": 108825 + }, + { + "epoch": 7.394347058024188, + "grad_norm": 0.14900673925876617, + "learning_rate": 7.60718168229379e-06, + "loss": 3.9753, + "step": 108830 + }, + { + "epoch": 7.39468677809485, + "grad_norm": 0.16774816811084747, + "learning_rate": 7.602935181410519e-06, + "loss": 3.7213, + "step": 108835 + }, + { + "epoch": 7.395026498165512, + "grad_norm": 0.15319500863552094, + "learning_rate": 7.598688680527245e-06, + "loss": 3.8183, + "step": 108840 + }, + { + "epoch": 7.395366218236173, + "grad_norm": 0.17309868335723877, + "learning_rate": 7.594442179643973e-06, + "loss": 4.0281, + "step": 108845 + }, + { + "epoch": 7.395705938306835, + "grad_norm": 0.16721110045909882, + "learning_rate": 7.590195678760702e-06, + "loss": 3.7937, + "step": 108850 + }, + { + "epoch": 7.396045658377497, + "grad_norm": 0.4333048462867737, + "learning_rate": 7.58594917787743e-06, + "loss": 3.9137, + "step": 108855 + }, + { + "epoch": 7.396385378448159, + "grad_norm": 0.15540090203285217, + "learning_rate": 7.581702676994157e-06, + "loss": 3.9508, + "step": 108860 + }, + { + "epoch": 7.396725098518821, + "grad_norm": 0.2467566579580307, + "learning_rate": 7.577456176110885e-06, + "loss": 3.754, + "step": 108865 + }, + { + "epoch": 7.397064818589483, + "grad_norm": 0.14780627191066742, + "learning_rate": 7.573209675227613e-06, + "loss": 3.6126, + "step": 108870 + }, + { + "epoch": 7.397404538660144, + "grad_norm": 0.18483993411064148, + "learning_rate": 7.56896317434434e-06, + "loss": 4.0018, + "step": 108875 + }, + { + "epoch": 7.397744258730806, + "grad_norm": 0.16332398355007172, + "learning_rate": 7.564716673461068e-06, + "loss": 4.0587, + "step": 108880 + }, + { + "epoch": 7.398083978801467, + "grad_norm": 0.17530429363250732, + "learning_rate": 7.560470172577796e-06, + "loss": 3.9919, + "step": 108885 + }, + { + "epoch": 7.398423698872129, + "grad_norm": 0.20640622079372406, + "learning_rate": 7.5562236716945236e-06, + "loss": 3.8279, + "step": 108890 + }, + { + "epoch": 7.398763418942791, + "grad_norm": 0.22614194452762604, + "learning_rate": 7.551977170811252e-06, + "loss": 3.6952, + "step": 108895 + }, + { + "epoch": 7.3991031390134525, + "grad_norm": 0.34354498982429504, + "learning_rate": 7.54773066992798e-06, + "loss": 3.7073, + "step": 108900 + }, + { + "epoch": 7.399442859084115, + "grad_norm": 0.24565371870994568, + "learning_rate": 7.5434841690447084e-06, + "loss": 3.7812, + "step": 108905 + }, + { + "epoch": 7.399782579154777, + "grad_norm": 0.1679777055978775, + "learning_rate": 7.539237668161435e-06, + "loss": 3.7228, + "step": 108910 + }, + { + "epoch": 7.400122299225438, + "grad_norm": 0.18827804923057556, + "learning_rate": 7.534991167278163e-06, + "loss": 3.8266, + "step": 108915 + }, + { + "epoch": 7.4004620192961, + "grad_norm": 0.22360190749168396, + "learning_rate": 7.530744666394892e-06, + "loss": 3.8089, + "step": 108920 + }, + { + "epoch": 7.400801739366762, + "grad_norm": 0.16795863211154938, + "learning_rate": 7.526498165511618e-06, + "loss": 3.7936, + "step": 108925 + }, + { + "epoch": 7.401141459437423, + "grad_norm": 0.1886647641658783, + "learning_rate": 7.522251664628347e-06, + "loss": 3.9789, + "step": 108930 + }, + { + "epoch": 7.401481179508085, + "grad_norm": 0.19236934185028076, + "learning_rate": 7.518005163745075e-06, + "loss": 3.8326, + "step": 108935 + }, + { + "epoch": 7.401820899578747, + "grad_norm": 0.19429020583629608, + "learning_rate": 7.513758662861803e-06, + "loss": 3.9924, + "step": 108940 + }, + { + "epoch": 7.4021606196494085, + "grad_norm": 0.19907760620117188, + "learning_rate": 7.50951216197853e-06, + "loss": 3.7378, + "step": 108945 + }, + { + "epoch": 7.402500339720071, + "grad_norm": 0.5499853491783142, + "learning_rate": 7.505265661095258e-06, + "loss": 3.7866, + "step": 108950 + }, + { + "epoch": 7.402840059790733, + "grad_norm": 0.37866702675819397, + "learning_rate": 7.501019160211986e-06, + "loss": 3.8361, + "step": 108955 + }, + { + "epoch": 7.403179779861394, + "grad_norm": 0.1784355491399765, + "learning_rate": 7.496772659328713e-06, + "loss": 3.9716, + "step": 108960 + }, + { + "epoch": 7.403519499932056, + "grad_norm": 0.1884450763463974, + "learning_rate": 7.492526158445441e-06, + "loss": 3.8122, + "step": 108965 + }, + { + "epoch": 7.403859220002718, + "grad_norm": 0.22205159068107605, + "learning_rate": 7.488279657562169e-06, + "loss": 3.691, + "step": 108970 + }, + { + "epoch": 7.404198940073379, + "grad_norm": 0.17293764650821686, + "learning_rate": 7.484033156678896e-06, + "loss": 3.8928, + "step": 108975 + }, + { + "epoch": 7.404538660144041, + "grad_norm": 0.1845817118883133, + "learning_rate": 7.479786655795624e-06, + "loss": 3.7039, + "step": 108980 + }, + { + "epoch": 7.404878380214703, + "grad_norm": 0.30611830949783325, + "learning_rate": 7.475540154912352e-06, + "loss": 3.7509, + "step": 108985 + }, + { + "epoch": 7.4052181002853645, + "grad_norm": 0.17488735914230347, + "learning_rate": 7.471293654029081e-06, + "loss": 3.8143, + "step": 108990 + }, + { + "epoch": 7.405557820356027, + "grad_norm": 0.17885108292102814, + "learning_rate": 7.4670471531458075e-06, + "loss": 3.8131, + "step": 108995 + }, + { + "epoch": 7.405897540426689, + "grad_norm": 0.14840106666088104, + "learning_rate": 7.462800652262536e-06, + "loss": 3.7664, + "step": 109000 + }, + { + "epoch": 7.40623726049735, + "grad_norm": 0.17601251602172852, + "learning_rate": 7.458554151379264e-06, + "loss": 4.0326, + "step": 109005 + }, + { + "epoch": 7.406576980568012, + "grad_norm": 0.16566050052642822, + "learning_rate": 7.454307650495991e-06, + "loss": 3.8823, + "step": 109010 + }, + { + "epoch": 7.406916700638674, + "grad_norm": 0.19287075102329254, + "learning_rate": 7.4500611496127195e-06, + "loss": 3.741, + "step": 109015 + }, + { + "epoch": 7.407256420709335, + "grad_norm": 0.18604899942874908, + "learning_rate": 7.4458146487294475e-06, + "loss": 3.684, + "step": 109020 + }, + { + "epoch": 7.407596140779997, + "grad_norm": 0.3332541882991791, + "learning_rate": 7.4415681478461755e-06, + "loss": 3.9433, + "step": 109025 + }, + { + "epoch": 7.407935860850659, + "grad_norm": 0.2275398224592209, + "learning_rate": 7.437321646962903e-06, + "loss": 3.8955, + "step": 109030 + }, + { + "epoch": 7.4082755809213205, + "grad_norm": 0.8451194167137146, + "learning_rate": 7.433075146079631e-06, + "loss": 3.6019, + "step": 109035 + }, + { + "epoch": 7.408615300991983, + "grad_norm": 0.20640048384666443, + "learning_rate": 7.428828645196359e-06, + "loss": 3.496, + "step": 109040 + }, + { + "epoch": 7.408955021062645, + "grad_norm": 0.31972578167915344, + "learning_rate": 7.424582144313086e-06, + "loss": 3.7506, + "step": 109045 + }, + { + "epoch": 7.409294741133306, + "grad_norm": 0.20296312868595123, + "learning_rate": 7.420335643429814e-06, + "loss": 3.671, + "step": 109050 + }, + { + "epoch": 7.409634461203968, + "grad_norm": 0.1994805783033371, + "learning_rate": 7.416089142546542e-06, + "loss": 3.7365, + "step": 109055 + }, + { + "epoch": 7.40997418127463, + "grad_norm": 0.21427865326404572, + "learning_rate": 7.411842641663269e-06, + "loss": 3.6428, + "step": 109060 + }, + { + "epoch": 7.410313901345291, + "grad_norm": 0.5618546605110168, + "learning_rate": 7.407596140779997e-06, + "loss": 3.9074, + "step": 109065 + }, + { + "epoch": 7.410653621415953, + "grad_norm": 0.19167368113994598, + "learning_rate": 7.403349639896726e-06, + "loss": 3.5789, + "step": 109070 + }, + { + "epoch": 7.410993341486615, + "grad_norm": 0.2075270563364029, + "learning_rate": 7.399103139013454e-06, + "loss": 3.7893, + "step": 109075 + }, + { + "epoch": 7.411333061557277, + "grad_norm": 0.16622589528560638, + "learning_rate": 7.39485663813018e-06, + "loss": 3.84, + "step": 109080 + }, + { + "epoch": 7.411672781627939, + "grad_norm": 0.18010737001895905, + "learning_rate": 7.390610137246909e-06, + "loss": 3.7996, + "step": 109085 + }, + { + "epoch": 7.412012501698601, + "grad_norm": 0.22241772711277008, + "learning_rate": 7.386363636363637e-06, + "loss": 3.742, + "step": 109090 + }, + { + "epoch": 7.412352221769262, + "grad_norm": 0.19196484982967377, + "learning_rate": 7.382117135480364e-06, + "loss": 3.8254, + "step": 109095 + }, + { + "epoch": 7.412691941839924, + "grad_norm": 0.15329168736934662, + "learning_rate": 7.377870634597092e-06, + "loss": 3.6378, + "step": 109100 + }, + { + "epoch": 7.413031661910586, + "grad_norm": 0.19803756475448608, + "learning_rate": 7.37362413371382e-06, + "loss": 3.7751, + "step": 109105 + }, + { + "epoch": 7.413371381981247, + "grad_norm": 0.23113292455673218, + "learning_rate": 7.369377632830548e-06, + "loss": 3.8222, + "step": 109110 + }, + { + "epoch": 7.413711102051909, + "grad_norm": 0.26190200448036194, + "learning_rate": 7.3651311319472754e-06, + "loss": 3.6776, + "step": 109115 + }, + { + "epoch": 7.414050822122571, + "grad_norm": 0.1269713193178177, + "learning_rate": 7.3608846310640035e-06, + "loss": 3.9576, + "step": 109120 + }, + { + "epoch": 7.414390542193233, + "grad_norm": 0.18273848295211792, + "learning_rate": 7.356638130180732e-06, + "loss": 4.0157, + "step": 109125 + }, + { + "epoch": 7.414730262263895, + "grad_norm": 0.1405690759420395, + "learning_rate": 7.352391629297459e-06, + "loss": 3.9671, + "step": 109130 + }, + { + "epoch": 7.415069982334557, + "grad_norm": 0.14745686948299408, + "learning_rate": 7.348145128414187e-06, + "loss": 3.9722, + "step": 109135 + }, + { + "epoch": 7.415409702405218, + "grad_norm": 0.11971167474985123, + "learning_rate": 7.3438986275309155e-06, + "loss": 3.7352, + "step": 109140 + }, + { + "epoch": 7.41574942247588, + "grad_norm": 0.7506585121154785, + "learning_rate": 7.339652126647642e-06, + "loss": 3.822, + "step": 109145 + }, + { + "epoch": 7.416089142546542, + "grad_norm": 0.170649915933609, + "learning_rate": 7.335405625764371e-06, + "loss": 3.4592, + "step": 109150 + }, + { + "epoch": 7.416428862617203, + "grad_norm": 0.13560651242733002, + "learning_rate": 7.331159124881099e-06, + "loss": 3.721, + "step": 109155 + }, + { + "epoch": 7.416768582687865, + "grad_norm": 0.17608891427516937, + "learning_rate": 7.326912623997827e-06, + "loss": 3.5658, + "step": 109160 + }, + { + "epoch": 7.4171083027585265, + "grad_norm": 0.16814164817333221, + "learning_rate": 7.322666123114554e-06, + "loss": 3.9786, + "step": 109165 + }, + { + "epoch": 7.417448022829189, + "grad_norm": 0.17016375064849854, + "learning_rate": 7.318419622231282e-06, + "loss": 3.8366, + "step": 109170 + }, + { + "epoch": 7.417787742899851, + "grad_norm": 1.987562656402588, + "learning_rate": 7.31417312134801e-06, + "loss": 3.8867, + "step": 109175 + }, + { + "epoch": 7.418127462970512, + "grad_norm": 0.19318337738513947, + "learning_rate": 7.309926620464737e-06, + "loss": 3.724, + "step": 109180 + }, + { + "epoch": 7.418467183041174, + "grad_norm": 0.25494545698165894, + "learning_rate": 7.305680119581465e-06, + "loss": 3.7108, + "step": 109185 + }, + { + "epoch": 7.418806903111836, + "grad_norm": 0.41395440697669983, + "learning_rate": 7.301433618698193e-06, + "loss": 3.891, + "step": 109190 + }, + { + "epoch": 7.419146623182497, + "grad_norm": 0.19290071725845337, + "learning_rate": 7.297187117814922e-06, + "loss": 3.8669, + "step": 109195 + }, + { + "epoch": 7.419486343253159, + "grad_norm": 0.16129694879055023, + "learning_rate": 7.292940616931648e-06, + "loss": 3.7862, + "step": 109200 + }, + { + "epoch": 7.419826063323821, + "grad_norm": 0.17002592980861664, + "learning_rate": 7.288694116048376e-06, + "loss": 3.7545, + "step": 109205 + }, + { + "epoch": 7.4201657833944825, + "grad_norm": 0.22869998216629028, + "learning_rate": 7.284447615165105e-06, + "loss": 3.6382, + "step": 109210 + }, + { + "epoch": 7.420505503465145, + "grad_norm": 0.2170196771621704, + "learning_rate": 7.280201114281831e-06, + "loss": 3.7169, + "step": 109215 + }, + { + "epoch": 7.420845223535807, + "grad_norm": 0.16344092786312103, + "learning_rate": 7.27595461339856e-06, + "loss": 3.6933, + "step": 109220 + }, + { + "epoch": 7.421184943606468, + "grad_norm": 0.17105597257614136, + "learning_rate": 7.271708112515288e-06, + "loss": 4.0867, + "step": 109225 + }, + { + "epoch": 7.42152466367713, + "grad_norm": 0.19106483459472656, + "learning_rate": 7.2674616116320145e-06, + "loss": 3.9217, + "step": 109230 + }, + { + "epoch": 7.421864383747792, + "grad_norm": 0.900521993637085, + "learning_rate": 7.263215110748743e-06, + "loss": 3.8785, + "step": 109235 + }, + { + "epoch": 7.422204103818453, + "grad_norm": 0.1671869307756424, + "learning_rate": 7.258968609865471e-06, + "loss": 4.0379, + "step": 109240 + }, + { + "epoch": 7.422543823889115, + "grad_norm": 1.107338786125183, + "learning_rate": 7.254722108982199e-06, + "loss": 3.8476, + "step": 109245 + }, + { + "epoch": 7.422883543959777, + "grad_norm": 0.18438304960727692, + "learning_rate": 7.2504756080989266e-06, + "loss": 3.8312, + "step": 109250 + }, + { + "epoch": 7.4232232640304385, + "grad_norm": 0.1730579137802124, + "learning_rate": 7.246229107215655e-06, + "loss": 3.748, + "step": 109255 + }, + { + "epoch": 7.423562984101101, + "grad_norm": 0.22037193179130554, + "learning_rate": 7.241982606332383e-06, + "loss": 3.6505, + "step": 109260 + }, + { + "epoch": 7.423902704171763, + "grad_norm": 0.17641600966453552, + "learning_rate": 7.23773610544911e-06, + "loss": 3.9981, + "step": 109265 + }, + { + "epoch": 7.424242424242424, + "grad_norm": 0.1378137320280075, + "learning_rate": 7.233489604565838e-06, + "loss": 3.9834, + "step": 109270 + }, + { + "epoch": 7.424582144313086, + "grad_norm": 0.17526623606681824, + "learning_rate": 7.229243103682566e-06, + "loss": 3.9945, + "step": 109275 + }, + { + "epoch": 7.424921864383748, + "grad_norm": 0.18353457748889923, + "learning_rate": 7.224996602799295e-06, + "loss": 3.8428, + "step": 109280 + }, + { + "epoch": 7.425261584454409, + "grad_norm": 0.16982626914978027, + "learning_rate": 7.220750101916021e-06, + "loss": 3.8746, + "step": 109285 + }, + { + "epoch": 7.425601304525071, + "grad_norm": 0.17352893948554993, + "learning_rate": 7.21650360103275e-06, + "loss": 3.8356, + "step": 109290 + }, + { + "epoch": 7.425941024595733, + "grad_norm": 0.1976802796125412, + "learning_rate": 7.212257100149478e-06, + "loss": 3.537, + "step": 109295 + }, + { + "epoch": 7.4262807446663945, + "grad_norm": 0.1831541657447815, + "learning_rate": 7.208010599266204e-06, + "loss": 3.8834, + "step": 109300 + }, + { + "epoch": 7.426620464737057, + "grad_norm": 0.23387780785560608, + "learning_rate": 7.203764098382933e-06, + "loss": 3.6518, + "step": 109305 + }, + { + "epoch": 7.426960184807719, + "grad_norm": 0.1895531415939331, + "learning_rate": 7.199517597499661e-06, + "loss": 3.6763, + "step": 109310 + }, + { + "epoch": 7.42729990487838, + "grad_norm": 0.5399442911148071, + "learning_rate": 7.195271096616388e-06, + "loss": 3.9066, + "step": 109315 + }, + { + "epoch": 7.427639624949042, + "grad_norm": 0.20780213177204132, + "learning_rate": 7.191024595733116e-06, + "loss": 3.8577, + "step": 109320 + }, + { + "epoch": 7.427979345019704, + "grad_norm": 0.17734146118164062, + "learning_rate": 7.186778094849844e-06, + "loss": 3.7968, + "step": 109325 + }, + { + "epoch": 7.428319065090365, + "grad_norm": 0.12837658822536469, + "learning_rate": 7.182531593966572e-06, + "loss": 3.8083, + "step": 109330 + }, + { + "epoch": 7.428658785161027, + "grad_norm": 0.2271530032157898, + "learning_rate": 7.178285093083299e-06, + "loss": 3.8206, + "step": 109335 + }, + { + "epoch": 7.428998505231689, + "grad_norm": 0.1533183753490448, + "learning_rate": 7.174038592200027e-06, + "loss": 3.908, + "step": 109340 + }, + { + "epoch": 7.4293382253023506, + "grad_norm": 0.16047532856464386, + "learning_rate": 7.169792091316755e-06, + "loss": 3.749, + "step": 109345 + }, + { + "epoch": 7.429677945373013, + "grad_norm": 0.17545054852962494, + "learning_rate": 7.1655455904334825e-06, + "loss": 3.9334, + "step": 109350 + }, + { + "epoch": 7.430017665443675, + "grad_norm": 0.22726798057556152, + "learning_rate": 7.1612990895502105e-06, + "loss": 3.6542, + "step": 109355 + }, + { + "epoch": 7.430357385514336, + "grad_norm": 0.1725831925868988, + "learning_rate": 7.157052588666939e-06, + "loss": 3.9506, + "step": 109360 + }, + { + "epoch": 7.430697105584998, + "grad_norm": 0.1716376692056656, + "learning_rate": 7.152806087783667e-06, + "loss": 3.9743, + "step": 109365 + }, + { + "epoch": 7.43103682565566, + "grad_norm": 0.25485989451408386, + "learning_rate": 7.148559586900394e-06, + "loss": 3.9761, + "step": 109370 + }, + { + "epoch": 7.431376545726321, + "grad_norm": 0.1772741675376892, + "learning_rate": 7.1443130860171225e-06, + "loss": 3.8419, + "step": 109375 + }, + { + "epoch": 7.431716265796983, + "grad_norm": 0.1632588505744934, + "learning_rate": 7.1400665851338505e-06, + "loss": 3.9262, + "step": 109380 + }, + { + "epoch": 7.432055985867645, + "grad_norm": 0.16017132997512817, + "learning_rate": 7.135820084250578e-06, + "loss": 3.7902, + "step": 109385 + }, + { + "epoch": 7.432395705938307, + "grad_norm": 0.17342129349708557, + "learning_rate": 7.131573583367306e-06, + "loss": 3.8208, + "step": 109390 + }, + { + "epoch": 7.432735426008969, + "grad_norm": 0.5068947672843933, + "learning_rate": 7.127327082484034e-06, + "loss": 3.7307, + "step": 109395 + }, + { + "epoch": 7.433075146079631, + "grad_norm": 0.17412562668323517, + "learning_rate": 7.123080581600761e-06, + "loss": 3.8933, + "step": 109400 + }, + { + "epoch": 7.433414866150292, + "grad_norm": 0.37150219082832336, + "learning_rate": 7.118834080717489e-06, + "loss": 3.8555, + "step": 109405 + }, + { + "epoch": 7.433754586220954, + "grad_norm": 0.16666193306446075, + "learning_rate": 7.114587579834217e-06, + "loss": 3.8822, + "step": 109410 + }, + { + "epoch": 7.434094306291616, + "grad_norm": 0.14273901283740997, + "learning_rate": 7.110341078950945e-06, + "loss": 3.9249, + "step": 109415 + }, + { + "epoch": 7.434434026362277, + "grad_norm": 0.18875354528427124, + "learning_rate": 7.106094578067672e-06, + "loss": 4.006, + "step": 109420 + }, + { + "epoch": 7.434773746432939, + "grad_norm": 0.19612587988376617, + "learning_rate": 7.1018480771844e-06, + "loss": 4.0422, + "step": 109425 + }, + { + "epoch": 7.435113466503601, + "grad_norm": 0.18860980868339539, + "learning_rate": 7.097601576301129e-06, + "loss": 3.5591, + "step": 109430 + }, + { + "epoch": 7.435453186574263, + "grad_norm": 0.1519361436367035, + "learning_rate": 7.093355075417855e-06, + "loss": 3.6188, + "step": 109435 + }, + { + "epoch": 7.435792906644925, + "grad_norm": 0.1334485113620758, + "learning_rate": 7.089108574534583e-06, + "loss": 3.5867, + "step": 109440 + }, + { + "epoch": 7.436132626715587, + "grad_norm": 0.1453641802072525, + "learning_rate": 7.084862073651312e-06, + "loss": 3.8798, + "step": 109445 + }, + { + "epoch": 7.436472346786248, + "grad_norm": 0.17504584789276123, + "learning_rate": 7.08061557276804e-06, + "loss": 3.9581, + "step": 109450 + }, + { + "epoch": 7.43681206685691, + "grad_norm": 0.162410169839859, + "learning_rate": 7.076369071884767e-06, + "loss": 3.7644, + "step": 109455 + }, + { + "epoch": 7.437151786927572, + "grad_norm": 0.16645638644695282, + "learning_rate": 7.072122571001495e-06, + "loss": 3.5934, + "step": 109460 + }, + { + "epoch": 7.437491506998233, + "grad_norm": 0.23132571578025818, + "learning_rate": 7.067876070118223e-06, + "loss": 3.6007, + "step": 109465 + }, + { + "epoch": 7.437831227068895, + "grad_norm": 0.17031195759773254, + "learning_rate": 7.0636295692349504e-06, + "loss": 3.9544, + "step": 109470 + }, + { + "epoch": 7.438170947139557, + "grad_norm": 0.18066935241222382, + "learning_rate": 7.0593830683516785e-06, + "loss": 3.7931, + "step": 109475 + }, + { + "epoch": 7.438510667210219, + "grad_norm": 0.18683797121047974, + "learning_rate": 7.0551365674684065e-06, + "loss": 3.988, + "step": 109480 + }, + { + "epoch": 7.438850387280881, + "grad_norm": 0.19551655650138855, + "learning_rate": 7.050890066585134e-06, + "loss": 3.7908, + "step": 109485 + }, + { + "epoch": 7.439190107351543, + "grad_norm": 0.22167779505252838, + "learning_rate": 7.046643565701862e-06, + "loss": 3.9152, + "step": 109490 + }, + { + "epoch": 7.439529827422204, + "grad_norm": 0.18610690534114838, + "learning_rate": 7.04239706481859e-06, + "loss": 3.9655, + "step": 109495 + }, + { + "epoch": 7.439869547492866, + "grad_norm": 0.21701297163963318, + "learning_rate": 7.0381505639353185e-06, + "loss": 3.9255, + "step": 109500 + }, + { + "epoch": 7.440209267563528, + "grad_norm": 1.0377954244613647, + "learning_rate": 7.033904063052045e-06, + "loss": 3.7705, + "step": 109505 + }, + { + "epoch": 7.440548987634189, + "grad_norm": 0.17192025482654572, + "learning_rate": 7.029657562168774e-06, + "loss": 3.7539, + "step": 109510 + }, + { + "epoch": 7.440888707704851, + "grad_norm": 0.15990011394023895, + "learning_rate": 7.025411061285502e-06, + "loss": 3.8325, + "step": 109515 + }, + { + "epoch": 7.441228427775513, + "grad_norm": 0.15991216897964478, + "learning_rate": 7.021164560402228e-06, + "loss": 3.7775, + "step": 109520 + }, + { + "epoch": 7.441568147846175, + "grad_norm": 0.1736760437488556, + "learning_rate": 7.016918059518957e-06, + "loss": 3.7468, + "step": 109525 + }, + { + "epoch": 7.441907867916837, + "grad_norm": 0.1541566401720047, + "learning_rate": 7.012671558635685e-06, + "loss": 3.893, + "step": 109530 + }, + { + "epoch": 7.442247587987499, + "grad_norm": 0.2033444195985794, + "learning_rate": 7.008425057752413e-06, + "loss": 3.7939, + "step": 109535 + }, + { + "epoch": 7.44258730805816, + "grad_norm": 0.17538900673389435, + "learning_rate": 7.00417855686914e-06, + "loss": 3.9384, + "step": 109540 + }, + { + "epoch": 7.442927028128822, + "grad_norm": 0.1778646558523178, + "learning_rate": 6.999932055985868e-06, + "loss": 3.7805, + "step": 109545 + }, + { + "epoch": 7.443266748199484, + "grad_norm": 0.19343748688697815, + "learning_rate": 6.995685555102596e-06, + "loss": 4.0678, + "step": 109550 + }, + { + "epoch": 7.443606468270145, + "grad_norm": 0.15940383076667786, + "learning_rate": 6.991439054219323e-06, + "loss": 3.8813, + "step": 109555 + }, + { + "epoch": 7.443946188340807, + "grad_norm": 0.23841628432273865, + "learning_rate": 6.987192553336051e-06, + "loss": 3.9343, + "step": 109560 + }, + { + "epoch": 7.444285908411469, + "grad_norm": 0.18137724697589874, + "learning_rate": 6.982946052452779e-06, + "loss": 3.7385, + "step": 109565 + }, + { + "epoch": 7.444625628482131, + "grad_norm": 0.16791173815727234, + "learning_rate": 6.978699551569506e-06, + "loss": 3.9529, + "step": 109570 + }, + { + "epoch": 7.444965348552793, + "grad_norm": 0.22845087945461273, + "learning_rate": 6.974453050686234e-06, + "loss": 3.6539, + "step": 109575 + }, + { + "epoch": 7.445305068623454, + "grad_norm": 0.19259728491306305, + "learning_rate": 6.970206549802963e-06, + "loss": 3.9583, + "step": 109580 + }, + { + "epoch": 7.445644788694116, + "grad_norm": 0.2603026330471039, + "learning_rate": 6.965960048919691e-06, + "loss": 3.9464, + "step": 109585 + }, + { + "epoch": 7.445984508764778, + "grad_norm": 0.15381331741809845, + "learning_rate": 6.9617135480364175e-06, + "loss": 3.8029, + "step": 109590 + }, + { + "epoch": 7.446324228835439, + "grad_norm": 0.7910506129264832, + "learning_rate": 6.957467047153146e-06, + "loss": 3.6006, + "step": 109595 + }, + { + "epoch": 7.446663948906101, + "grad_norm": 0.4088629186153412, + "learning_rate": 6.953220546269874e-06, + "loss": 3.7865, + "step": 109600 + }, + { + "epoch": 7.447003668976763, + "grad_norm": 0.18949274718761444, + "learning_rate": 6.9489740453866016e-06, + "loss": 3.8088, + "step": 109605 + }, + { + "epoch": 7.4473433890474245, + "grad_norm": 0.1708415150642395, + "learning_rate": 6.94472754450333e-06, + "loss": 3.8412, + "step": 109610 + }, + { + "epoch": 7.447683109118087, + "grad_norm": 0.2105727642774582, + "learning_rate": 6.940481043620058e-06, + "loss": 3.9056, + "step": 109615 + }, + { + "epoch": 7.448022829188749, + "grad_norm": 0.18236936628818512, + "learning_rate": 6.936234542736786e-06, + "loss": 3.9148, + "step": 109620 + }, + { + "epoch": 7.44836254925941, + "grad_norm": 0.20789735019207, + "learning_rate": 6.931988041853513e-06, + "loss": 3.7884, + "step": 109625 + }, + { + "epoch": 7.448702269330072, + "grad_norm": 0.20843641459941864, + "learning_rate": 6.927741540970241e-06, + "loss": 3.87, + "step": 109630 + }, + { + "epoch": 7.449041989400734, + "grad_norm": NaN, + "learning_rate": 6.924344340263623e-06, + "loss": 3.9489, + "step": 109635 + }, + { + "epoch": 7.449381709471395, + "grad_norm": 0.15654675662517548, + "learning_rate": 6.92009783938035e-06, + "loss": 3.8847, + "step": 109640 + }, + { + "epoch": 7.449721429542057, + "grad_norm": 0.17936086654663086, + "learning_rate": 6.915851338497078e-06, + "loss": 3.9451, + "step": 109645 + }, + { + "epoch": 7.450061149612719, + "grad_norm": 0.16713038086891174, + "learning_rate": 6.911604837613807e-06, + "loss": 3.7001, + "step": 109650 + }, + { + "epoch": 7.4504008696833806, + "grad_norm": 0.18930867314338684, + "learning_rate": 6.907358336730535e-06, + "loss": 3.5525, + "step": 109655 + }, + { + "epoch": 7.450740589754043, + "grad_norm": 0.161833718419075, + "learning_rate": 6.9031118358472615e-06, + "loss": 3.8137, + "step": 109660 + }, + { + "epoch": 7.451080309824705, + "grad_norm": 0.15543517470359802, + "learning_rate": 6.89886533496399e-06, + "loss": 3.7203, + "step": 109665 + }, + { + "epoch": 7.451420029895366, + "grad_norm": 0.22801001369953156, + "learning_rate": 6.894618834080718e-06, + "loss": 3.8029, + "step": 109670 + }, + { + "epoch": 7.451759749966028, + "grad_norm": 0.1449507772922516, + "learning_rate": 6.8903723331974455e-06, + "loss": 3.8516, + "step": 109675 + }, + { + "epoch": 7.45209947003669, + "grad_norm": 0.2634294033050537, + "learning_rate": 6.8861258323141735e-06, + "loss": 3.7228, + "step": 109680 + }, + { + "epoch": 7.452439190107351, + "grad_norm": 0.17277014255523682, + "learning_rate": 6.8818793314309015e-06, + "loss": 3.648, + "step": 109685 + }, + { + "epoch": 7.452778910178013, + "grad_norm": 0.18033961951732635, + "learning_rate": 6.877632830547629e-06, + "loss": 3.8329, + "step": 109690 + }, + { + "epoch": 7.453118630248675, + "grad_norm": 0.14593558013439178, + "learning_rate": 6.873386329664357e-06, + "loss": 3.7551, + "step": 109695 + }, + { + "epoch": 7.453458350319337, + "grad_norm": 0.19706209003925323, + "learning_rate": 6.869139828781085e-06, + "loss": 3.8951, + "step": 109700 + }, + { + "epoch": 7.453798070389999, + "grad_norm": 0.17898613214492798, + "learning_rate": 6.864893327897813e-06, + "loss": 3.7386, + "step": 109705 + }, + { + "epoch": 7.454137790460661, + "grad_norm": 0.9528050422668457, + "learning_rate": 6.86064682701454e-06, + "loss": 3.7631, + "step": 109710 + }, + { + "epoch": 7.454477510531322, + "grad_norm": 0.2808876931667328, + "learning_rate": 6.856400326131268e-06, + "loss": 3.7062, + "step": 109715 + }, + { + "epoch": 7.454817230601984, + "grad_norm": 0.1581539362668991, + "learning_rate": 6.852153825247997e-06, + "loss": 3.9042, + "step": 109720 + }, + { + "epoch": 7.455156950672646, + "grad_norm": 0.4107925295829773, + "learning_rate": 6.847907324364723e-06, + "loss": 3.7798, + "step": 109725 + }, + { + "epoch": 7.455496670743307, + "grad_norm": 0.1820652037858963, + "learning_rate": 6.843660823481451e-06, + "loss": 3.6674, + "step": 109730 + }, + { + "epoch": 7.455836390813969, + "grad_norm": 0.15248645842075348, + "learning_rate": 6.83941432259818e-06, + "loss": 3.908, + "step": 109735 + }, + { + "epoch": 7.456176110884631, + "grad_norm": 0.1846955120563507, + "learning_rate": 6.835167821714908e-06, + "loss": 3.8808, + "step": 109740 + }, + { + "epoch": 7.456515830955293, + "grad_norm": 0.15596073865890503, + "learning_rate": 6.830921320831635e-06, + "loss": 3.8469, + "step": 109745 + }, + { + "epoch": 7.456855551025955, + "grad_norm": 0.15091396868228912, + "learning_rate": 6.826674819948363e-06, + "loss": 3.8023, + "step": 109750 + }, + { + "epoch": 7.457195271096617, + "grad_norm": 0.15458767116069794, + "learning_rate": 6.822428319065091e-06, + "loss": 3.9492, + "step": 109755 + }, + { + "epoch": 7.457534991167278, + "grad_norm": 0.4396986663341522, + "learning_rate": 6.818181818181818e-06, + "loss": 3.825, + "step": 109760 + }, + { + "epoch": 7.45787471123794, + "grad_norm": 0.15947522222995758, + "learning_rate": 6.813935317298546e-06, + "loss": 3.8478, + "step": 109765 + }, + { + "epoch": 7.458214431308602, + "grad_norm": 0.18530385196208954, + "learning_rate": 6.809688816415274e-06, + "loss": 3.7804, + "step": 109770 + }, + { + "epoch": 7.458554151379263, + "grad_norm": 0.1915748119354248, + "learning_rate": 6.8054423155320015e-06, + "loss": 3.8453, + "step": 109775 + }, + { + "epoch": 7.458893871449925, + "grad_norm": 0.17182205617427826, + "learning_rate": 6.8011958146487295e-06, + "loss": 3.5823, + "step": 109780 + }, + { + "epoch": 7.459233591520587, + "grad_norm": 0.2378440797328949, + "learning_rate": 6.7969493137654575e-06, + "loss": 3.9825, + "step": 109785 + }, + { + "epoch": 7.459573311591249, + "grad_norm": 0.2880971133708954, + "learning_rate": 6.792702812882186e-06, + "loss": 3.9987, + "step": 109790 + }, + { + "epoch": 7.459913031661911, + "grad_norm": 0.22907812893390656, + "learning_rate": 6.788456311998913e-06, + "loss": 4.0087, + "step": 109795 + }, + { + "epoch": 7.460252751732573, + "grad_norm": 0.177445188164711, + "learning_rate": 6.784209811115641e-06, + "loss": 3.7061, + "step": 109800 + }, + { + "epoch": 7.460592471803234, + "grad_norm": 0.1578214317560196, + "learning_rate": 6.7799633102323695e-06, + "loss": 3.9333, + "step": 109805 + }, + { + "epoch": 7.460932191873896, + "grad_norm": 0.15473242104053497, + "learning_rate": 6.775716809349096e-06, + "loss": 3.8392, + "step": 109810 + }, + { + "epoch": 7.461271911944558, + "grad_norm": 0.15052500367164612, + "learning_rate": 6.771470308465825e-06, + "loss": 4.057, + "step": 109815 + }, + { + "epoch": 7.461611632015219, + "grad_norm": 0.20444171130657196, + "learning_rate": 6.767223807582553e-06, + "loss": 3.9055, + "step": 109820 + }, + { + "epoch": 7.461951352085881, + "grad_norm": 0.16043981909751892, + "learning_rate": 6.762977306699281e-06, + "loss": 3.8314, + "step": 109825 + }, + { + "epoch": 7.462291072156543, + "grad_norm": 0.16087083518505096, + "learning_rate": 6.758730805816008e-06, + "loss": 4.0873, + "step": 109830 + }, + { + "epoch": 7.462630792227205, + "grad_norm": 0.15098710358142853, + "learning_rate": 6.754484304932736e-06, + "loss": 3.7641, + "step": 109835 + }, + { + "epoch": 7.462970512297867, + "grad_norm": 0.18216603994369507, + "learning_rate": 6.750237804049464e-06, + "loss": 3.6343, + "step": 109840 + }, + { + "epoch": 7.463310232368528, + "grad_norm": 0.4758601784706116, + "learning_rate": 6.745991303166191e-06, + "loss": 3.7681, + "step": 109845 + }, + { + "epoch": 7.46364995243919, + "grad_norm": 0.3008681833744049, + "learning_rate": 6.741744802282919e-06, + "loss": 3.6421, + "step": 109850 + }, + { + "epoch": 7.463989672509852, + "grad_norm": 0.49506211280822754, + "learning_rate": 6.737498301399647e-06, + "loss": 3.7079, + "step": 109855 + }, + { + "epoch": 7.464329392580513, + "grad_norm": 0.18119214475154877, + "learning_rate": 6.733251800516374e-06, + "loss": 3.8663, + "step": 109860 + }, + { + "epoch": 7.464669112651175, + "grad_norm": 0.17644119262695312, + "learning_rate": 6.729005299633102e-06, + "loss": 3.6525, + "step": 109865 + }, + { + "epoch": 7.465008832721837, + "grad_norm": 0.20829389989376068, + "learning_rate": 6.72475879874983e-06, + "loss": 3.4941, + "step": 109870 + }, + { + "epoch": 7.4653485527924985, + "grad_norm": 0.1767452359199524, + "learning_rate": 6.720512297866559e-06, + "loss": 4.1662, + "step": 109875 + }, + { + "epoch": 7.465688272863161, + "grad_norm": 0.21192950010299683, + "learning_rate": 6.716265796983285e-06, + "loss": 3.742, + "step": 109880 + }, + { + "epoch": 7.466027992933823, + "grad_norm": 0.21048039197921753, + "learning_rate": 6.712019296100014e-06, + "loss": 3.8781, + "step": 109885 + }, + { + "epoch": 7.466367713004484, + "grad_norm": 0.21445302665233612, + "learning_rate": 6.707772795216742e-06, + "loss": 3.7763, + "step": 109890 + }, + { + "epoch": 7.466707433075146, + "grad_norm": 0.1882382333278656, + "learning_rate": 6.7035262943334686e-06, + "loss": 4.0177, + "step": 109895 + }, + { + "epoch": 7.467047153145808, + "grad_norm": 0.18073409795761108, + "learning_rate": 6.699279793450197e-06, + "loss": 3.91, + "step": 109900 + }, + { + "epoch": 7.467386873216469, + "grad_norm": 0.16392797231674194, + "learning_rate": 6.695033292566925e-06, + "loss": 3.8619, + "step": 109905 + }, + { + "epoch": 7.467726593287131, + "grad_norm": 0.1445956528186798, + "learning_rate": 6.6907867916836534e-06, + "loss": 3.5324, + "step": 109910 + }, + { + "epoch": 7.468066313357793, + "grad_norm": 0.47659778594970703, + "learning_rate": 6.686540290800381e-06, + "loss": 3.817, + "step": 109915 + }, + { + "epoch": 7.4684060334284545, + "grad_norm": 0.16593825817108154, + "learning_rate": 6.682293789917109e-06, + "loss": 3.7945, + "step": 109920 + }, + { + "epoch": 7.468745753499117, + "grad_norm": 0.19496354460716248, + "learning_rate": 6.678047289033837e-06, + "loss": 3.831, + "step": 109925 + }, + { + "epoch": 7.469085473569779, + "grad_norm": 0.16133597493171692, + "learning_rate": 6.673800788150564e-06, + "loss": 3.7698, + "step": 109930 + }, + { + "epoch": 7.46942519364044, + "grad_norm": 0.1915316879749298, + "learning_rate": 6.669554287267292e-06, + "loss": 3.828, + "step": 109935 + }, + { + "epoch": 7.469764913711102, + "grad_norm": 0.1453191041946411, + "learning_rate": 6.665307786384021e-06, + "loss": 3.721, + "step": 109940 + }, + { + "epoch": 7.470104633781764, + "grad_norm": 0.24224263429641724, + "learning_rate": 6.661061285500747e-06, + "loss": 3.6982, + "step": 109945 + }, + { + "epoch": 7.470444353852425, + "grad_norm": 0.16164836287498474, + "learning_rate": 6.656814784617475e-06, + "loss": 3.756, + "step": 109950 + }, + { + "epoch": 7.470784073923087, + "grad_norm": 0.21228118240833282, + "learning_rate": 6.652568283734204e-06, + "loss": 3.905, + "step": 109955 + }, + { + "epoch": 7.471123793993749, + "grad_norm": 0.23973114788532257, + "learning_rate": 6.648321782850932e-06, + "loss": 3.9515, + "step": 109960 + }, + { + "epoch": 7.471463514064411, + "grad_norm": 0.28135591745376587, + "learning_rate": 6.644075281967659e-06, + "loss": 3.8437, + "step": 109965 + }, + { + "epoch": 7.471803234135073, + "grad_norm": 0.17853094637393951, + "learning_rate": 6.639828781084387e-06, + "loss": 3.8159, + "step": 109970 + }, + { + "epoch": 7.472142954205735, + "grad_norm": 0.19884954392910004, + "learning_rate": 6.635582280201115e-06, + "loss": 3.904, + "step": 109975 + }, + { + "epoch": 7.472482674276396, + "grad_norm": 0.19739589095115662, + "learning_rate": 6.631335779317842e-06, + "loss": 3.9968, + "step": 109980 + }, + { + "epoch": 7.472822394347058, + "grad_norm": 0.23451930284500122, + "learning_rate": 6.62708927843457e-06, + "loss": 4.0386, + "step": 109985 + }, + { + "epoch": 7.47316211441772, + "grad_norm": 0.19170087575912476, + "learning_rate": 6.622842777551298e-06, + "loss": 3.8169, + "step": 109990 + }, + { + "epoch": 7.473501834488381, + "grad_norm": 0.1600438356399536, + "learning_rate": 6.618596276668026e-06, + "loss": 3.8086, + "step": 109995 + }, + { + "epoch": 7.473841554559043, + "grad_norm": 0.3782304525375366, + "learning_rate": 6.614349775784753e-06, + "loss": 3.8278, + "step": 110000 + }, + { + "epoch": 7.474181274629705, + "grad_norm": 0.16212645173072815, + "learning_rate": 6.610103274901481e-06, + "loss": 3.9789, + "step": 110005 + }, + { + "epoch": 7.474520994700367, + "grad_norm": 0.5765448212623596, + "learning_rate": 6.60585677401821e-06, + "loss": 3.8273, + "step": 110010 + }, + { + "epoch": 7.474860714771029, + "grad_norm": 0.21714815497398376, + "learning_rate": 6.6016102731349365e-06, + "loss": 3.8357, + "step": 110015 + }, + { + "epoch": 7.475200434841691, + "grad_norm": 0.645452082157135, + "learning_rate": 6.5973637722516645e-06, + "loss": 3.5087, + "step": 110020 + }, + { + "epoch": 7.475540154912352, + "grad_norm": 0.2010236233472824, + "learning_rate": 6.593117271368393e-06, + "loss": 3.7676, + "step": 110025 + }, + { + "epoch": 7.475879874983014, + "grad_norm": 0.25800055265426636, + "learning_rate": 6.58887077048512e-06, + "loss": 3.7821, + "step": 110030 + }, + { + "epoch": 7.476219595053676, + "grad_norm": 0.15778157114982605, + "learning_rate": 6.5846242696018485e-06, + "loss": 3.7263, + "step": 110035 + }, + { + "epoch": 7.476559315124337, + "grad_norm": 0.2254621386528015, + "learning_rate": 6.5803777687185765e-06, + "loss": 3.7414, + "step": 110040 + }, + { + "epoch": 7.476899035194999, + "grad_norm": 0.17335578799247742, + "learning_rate": 6.5761312678353046e-06, + "loss": 3.5911, + "step": 110045 + }, + { + "epoch": 7.477238755265661, + "grad_norm": 0.5763237476348877, + "learning_rate": 6.571884766952032e-06, + "loss": 3.8048, + "step": 110050 + }, + { + "epoch": 7.477578475336323, + "grad_norm": 0.1507263332605362, + "learning_rate": 6.56763826606876e-06, + "loss": 3.7854, + "step": 110055 + }, + { + "epoch": 7.477918195406985, + "grad_norm": 0.1896757334470749, + "learning_rate": 6.563391765185488e-06, + "loss": 3.6565, + "step": 110060 + }, + { + "epoch": 7.478257915477647, + "grad_norm": 0.13769379258155823, + "learning_rate": 6.559145264302215e-06, + "loss": 3.9473, + "step": 110065 + }, + { + "epoch": 7.478597635548308, + "grad_norm": 0.16669972240924835, + "learning_rate": 6.554898763418943e-06, + "loss": 3.6955, + "step": 110070 + }, + { + "epoch": 7.47893735561897, + "grad_norm": 0.14538028836250305, + "learning_rate": 6.550652262535671e-06, + "loss": 3.8324, + "step": 110075 + }, + { + "epoch": 7.479277075689632, + "grad_norm": 0.20739895105361938, + "learning_rate": 6.5464057616524e-06, + "loss": 3.7991, + "step": 110080 + }, + { + "epoch": 7.479616795760293, + "grad_norm": 0.1724867969751358, + "learning_rate": 6.542159260769126e-06, + "loss": 3.7764, + "step": 110085 + }, + { + "epoch": 7.479956515830955, + "grad_norm": 0.1606971025466919, + "learning_rate": 6.537912759885854e-06, + "loss": 3.6927, + "step": 110090 + }, + { + "epoch": 7.480296235901617, + "grad_norm": 0.21982353925704956, + "learning_rate": 6.533666259002583e-06, + "loss": 4.0209, + "step": 110095 + }, + { + "epoch": 7.480635955972279, + "grad_norm": 0.27606362104415894, + "learning_rate": 6.529419758119309e-06, + "loss": 3.8351, + "step": 110100 + }, + { + "epoch": 7.480975676042941, + "grad_norm": 0.21732757985591888, + "learning_rate": 6.525173257236038e-06, + "loss": 3.5729, + "step": 110105 + }, + { + "epoch": 7.481315396113603, + "grad_norm": 0.17515301704406738, + "learning_rate": 6.520926756352766e-06, + "loss": 3.8064, + "step": 110110 + }, + { + "epoch": 7.481655116184264, + "grad_norm": 0.16702444851398468, + "learning_rate": 6.5166802554694924e-06, + "loss": 3.9536, + "step": 110115 + }, + { + "epoch": 7.481994836254926, + "grad_norm": 0.21977703273296356, + "learning_rate": 6.512433754586221e-06, + "loss": 3.8971, + "step": 110120 + }, + { + "epoch": 7.482334556325588, + "grad_norm": 3.1769561767578125, + "learning_rate": 6.508187253702949e-06, + "loss": 3.672, + "step": 110125 + }, + { + "epoch": 7.482674276396249, + "grad_norm": 0.19932624697685242, + "learning_rate": 6.503940752819677e-06, + "loss": 3.6484, + "step": 110130 + }, + { + "epoch": 7.483013996466911, + "grad_norm": 0.17579224705696106, + "learning_rate": 6.4996942519364045e-06, + "loss": 3.658, + "step": 110135 + }, + { + "epoch": 7.483353716537573, + "grad_norm": 0.18252775073051453, + "learning_rate": 6.4954477510531325e-06, + "loss": 3.7359, + "step": 110140 + }, + { + "epoch": 7.483693436608235, + "grad_norm": 0.14879682660102844, + "learning_rate": 6.4912012501698605e-06, + "loss": 3.8351, + "step": 110145 + }, + { + "epoch": 7.484033156678897, + "grad_norm": 0.14014679193496704, + "learning_rate": 6.486954749286588e-06, + "loss": 3.8806, + "step": 110150 + }, + { + "epoch": 7.484372876749559, + "grad_norm": 0.15298493206501007, + "learning_rate": 6.482708248403316e-06, + "loss": 3.8567, + "step": 110155 + }, + { + "epoch": 7.48471259682022, + "grad_norm": 0.14724409580230713, + "learning_rate": 6.478461747520044e-06, + "loss": 3.708, + "step": 110160 + }, + { + "epoch": 7.485052316890882, + "grad_norm": 0.18449163436889648, + "learning_rate": 6.4742152466367725e-06, + "loss": 3.8703, + "step": 110165 + }, + { + "epoch": 7.485392036961544, + "grad_norm": 0.22657057642936707, + "learning_rate": 6.469968745753499e-06, + "loss": 3.7741, + "step": 110170 + }, + { + "epoch": 7.485731757032205, + "grad_norm": 0.36095985770225525, + "learning_rate": 6.465722244870228e-06, + "loss": 3.882, + "step": 110175 + }, + { + "epoch": 7.486071477102867, + "grad_norm": 0.21798478066921234, + "learning_rate": 6.461475743986956e-06, + "loss": 3.8994, + "step": 110180 + }, + { + "epoch": 7.486411197173529, + "grad_norm": 0.15034271776676178, + "learning_rate": 6.457229243103682e-06, + "loss": 3.7721, + "step": 110185 + }, + { + "epoch": 7.486750917244191, + "grad_norm": 0.17815186083316803, + "learning_rate": 6.452982742220411e-06, + "loss": 3.9546, + "step": 110190 + }, + { + "epoch": 7.487090637314853, + "grad_norm": 0.3245621621608734, + "learning_rate": 6.448736241337139e-06, + "loss": 3.7779, + "step": 110195 + }, + { + "epoch": 7.487430357385515, + "grad_norm": 0.22611987590789795, + "learning_rate": 6.444489740453866e-06, + "loss": 3.8188, + "step": 110200 + }, + { + "epoch": 7.487770077456176, + "grad_norm": 0.17790274322032928, + "learning_rate": 6.440243239570594e-06, + "loss": 3.9673, + "step": 110205 + }, + { + "epoch": 7.488109797526838, + "grad_norm": 0.20677368342876434, + "learning_rate": 6.435996738687322e-06, + "loss": 3.9731, + "step": 110210 + }, + { + "epoch": 7.4884495175975, + "grad_norm": 0.15185146033763885, + "learning_rate": 6.43175023780405e-06, + "loss": 3.8919, + "step": 110215 + }, + { + "epoch": 7.488789237668161, + "grad_norm": 0.17796403169631958, + "learning_rate": 6.427503736920777e-06, + "loss": 3.6145, + "step": 110220 + }, + { + "epoch": 7.489128957738823, + "grad_norm": 0.1837156116962433, + "learning_rate": 6.423257236037505e-06, + "loss": 3.8641, + "step": 110225 + }, + { + "epoch": 7.489468677809485, + "grad_norm": 0.16964703798294067, + "learning_rate": 6.419010735154233e-06, + "loss": 3.9366, + "step": 110230 + }, + { + "epoch": 7.489808397880147, + "grad_norm": 0.15134146809577942, + "learning_rate": 6.41476423427096e-06, + "loss": 3.8203, + "step": 110235 + }, + { + "epoch": 7.490148117950809, + "grad_norm": 0.6747602820396423, + "learning_rate": 6.410517733387688e-06, + "loss": 3.8717, + "step": 110240 + }, + { + "epoch": 7.490487838021471, + "grad_norm": 0.16926687955856323, + "learning_rate": 6.406271232504417e-06, + "loss": 3.5962, + "step": 110245 + }, + { + "epoch": 7.490827558092132, + "grad_norm": 0.17474006116390228, + "learning_rate": 6.402024731621145e-06, + "loss": 3.7076, + "step": 110250 + }, + { + "epoch": 7.491167278162794, + "grad_norm": 0.17829972505569458, + "learning_rate": 6.3977782307378716e-06, + "loss": 3.8287, + "step": 110255 + }, + { + "epoch": 7.491506998233455, + "grad_norm": 8.952106475830078, + "learning_rate": 6.3935317298546e-06, + "loss": 3.8435, + "step": 110260 + }, + { + "epoch": 7.491846718304117, + "grad_norm": 0.17663533985614777, + "learning_rate": 6.389285228971328e-06, + "loss": 3.9346, + "step": 110265 + }, + { + "epoch": 7.492186438374779, + "grad_norm": 0.17736689746379852, + "learning_rate": 6.385038728088056e-06, + "loss": 3.8554, + "step": 110270 + }, + { + "epoch": 7.492526158445441, + "grad_norm": 0.3904016613960266, + "learning_rate": 6.380792227204784e-06, + "loss": 3.8975, + "step": 110275 + }, + { + "epoch": 7.492865878516103, + "grad_norm": 0.1747378259897232, + "learning_rate": 6.376545726321512e-06, + "loss": 3.7018, + "step": 110280 + }, + { + "epoch": 7.493205598586765, + "grad_norm": 0.20085099339485168, + "learning_rate": 6.372299225438239e-06, + "loss": 3.9695, + "step": 110285 + }, + { + "epoch": 7.493545318657426, + "grad_norm": 0.16260723769664764, + "learning_rate": 6.368052724554967e-06, + "loss": 3.722, + "step": 110290 + }, + { + "epoch": 7.493885038728088, + "grad_norm": 0.15476442873477936, + "learning_rate": 6.363806223671695e-06, + "loss": 3.8676, + "step": 110295 + }, + { + "epoch": 7.49422475879875, + "grad_norm": 2.188727617263794, + "learning_rate": 6.359559722788424e-06, + "loss": 3.8808, + "step": 110300 + }, + { + "epoch": 7.494564478869411, + "grad_norm": 0.18405312299728394, + "learning_rate": 6.35531322190515e-06, + "loss": 3.7029, + "step": 110305 + }, + { + "epoch": 7.494904198940073, + "grad_norm": 3.4165172576904297, + "learning_rate": 6.351066721021878e-06, + "loss": 3.6152, + "step": 110310 + }, + { + "epoch": 7.495243919010735, + "grad_norm": 0.42312318086624146, + "learning_rate": 6.346820220138607e-06, + "loss": 4.0923, + "step": 110315 + }, + { + "epoch": 7.495583639081397, + "grad_norm": 0.16185984015464783, + "learning_rate": 6.342573719255333e-06, + "loss": 3.8263, + "step": 110320 + }, + { + "epoch": 7.495923359152059, + "grad_norm": 0.15543507039546967, + "learning_rate": 6.338327218372062e-06, + "loss": 3.6351, + "step": 110325 + }, + { + "epoch": 7.496263079222721, + "grad_norm": 0.15448641777038574, + "learning_rate": 6.33408071748879e-06, + "loss": 3.75, + "step": 110330 + }, + { + "epoch": 7.496602799293382, + "grad_norm": 0.15184248983860016, + "learning_rate": 6.329834216605518e-06, + "loss": 3.7488, + "step": 110335 + }, + { + "epoch": 7.496942519364044, + "grad_norm": 0.1504095047712326, + "learning_rate": 6.325587715722245e-06, + "loss": 3.7645, + "step": 110340 + }, + { + "epoch": 7.497282239434706, + "grad_norm": 0.19667299091815948, + "learning_rate": 6.321341214838973e-06, + "loss": 3.8353, + "step": 110345 + }, + { + "epoch": 7.497621959505367, + "grad_norm": 0.15750627219676971, + "learning_rate": 6.317094713955701e-06, + "loss": 4.0963, + "step": 110350 + }, + { + "epoch": 7.497961679576029, + "grad_norm": 0.16541993618011475, + "learning_rate": 6.312848213072428e-06, + "loss": 3.9388, + "step": 110355 + }, + { + "epoch": 7.498301399646691, + "grad_norm": 0.161697655916214, + "learning_rate": 6.308601712189156e-06, + "loss": 3.9319, + "step": 110360 + }, + { + "epoch": 7.498641119717353, + "grad_norm": 0.15437451004981995, + "learning_rate": 6.304355211305884e-06, + "loss": 3.7877, + "step": 110365 + }, + { + "epoch": 7.498980839788015, + "grad_norm": 0.17198531329631805, + "learning_rate": 6.3001087104226115e-06, + "loss": 3.8239, + "step": 110370 + }, + { + "epoch": 7.499320559858677, + "grad_norm": 0.2395193874835968, + "learning_rate": 6.2958622095393395e-06, + "loss": 3.8562, + "step": 110375 + }, + { + "epoch": 7.499660279929338, + "grad_norm": 0.14693060517311096, + "learning_rate": 6.2916157086560675e-06, + "loss": 3.9117, + "step": 110380 + }, + { + "epoch": 7.5, + "grad_norm": 0.3255544900894165, + "learning_rate": 6.287369207772796e-06, + "loss": 4.0811, + "step": 110385 + }, + { + "epoch": 7.500339720070662, + "grad_norm": 0.13454678654670715, + "learning_rate": 6.283122706889523e-06, + "loss": 3.7428, + "step": 110390 + }, + { + "epoch": 7.500679440141323, + "grad_norm": 0.150907963514328, + "learning_rate": 6.2788762060062515e-06, + "loss": 3.8735, + "step": 110395 + }, + { + "epoch": 7.501019160211985, + "grad_norm": 0.2542147934436798, + "learning_rate": 6.2746297051229795e-06, + "loss": 3.9313, + "step": 110400 + }, + { + "epoch": 7.501358880282647, + "grad_norm": 0.17880573868751526, + "learning_rate": 6.270383204239706e-06, + "loss": 3.5597, + "step": 110405 + }, + { + "epoch": 7.501698600353309, + "grad_norm": 0.2856956124305725, + "learning_rate": 6.266136703356435e-06, + "loss": 3.4522, + "step": 110410 + }, + { + "epoch": 7.502038320423971, + "grad_norm": 1.651317834854126, + "learning_rate": 6.261890202473163e-06, + "loss": 3.6466, + "step": 110415 + }, + { + "epoch": 7.502378040494633, + "grad_norm": 0.23491425812244415, + "learning_rate": 6.257643701589891e-06, + "loss": 3.8928, + "step": 110420 + }, + { + "epoch": 7.502717760565294, + "grad_norm": 0.1653457134962082, + "learning_rate": 6.253397200706618e-06, + "loss": 3.9084, + "step": 110425 + }, + { + "epoch": 7.503057480635956, + "grad_norm": 0.2666000425815582, + "learning_rate": 6.249150699823346e-06, + "loss": 3.627, + "step": 110430 + }, + { + "epoch": 7.503397200706618, + "grad_norm": 0.21459916234016418, + "learning_rate": 6.244904198940073e-06, + "loss": 4.1052, + "step": 110435 + }, + { + "epoch": 7.503736920777279, + "grad_norm": 0.2189721316099167, + "learning_rate": 6.240657698056802e-06, + "loss": 3.6468, + "step": 110440 + }, + { + "epoch": 7.504076640847941, + "grad_norm": 0.15870141983032227, + "learning_rate": 6.236411197173529e-06, + "loss": 3.9845, + "step": 110445 + }, + { + "epoch": 7.504416360918603, + "grad_norm": 0.17514795064926147, + "learning_rate": 6.232164696290257e-06, + "loss": 3.8381, + "step": 110450 + }, + { + "epoch": 7.504756080989265, + "grad_norm": 0.14625976979732513, + "learning_rate": 6.227918195406985e-06, + "loss": 3.7563, + "step": 110455 + }, + { + "epoch": 7.505095801059927, + "grad_norm": 0.17936332523822784, + "learning_rate": 6.223671694523712e-06, + "loss": 3.3362, + "step": 110460 + }, + { + "epoch": 7.505435521130589, + "grad_norm": 0.19327527284622192, + "learning_rate": 6.219425193640441e-06, + "loss": 3.8385, + "step": 110465 + }, + { + "epoch": 7.50577524120125, + "grad_norm": 0.19411304593086243, + "learning_rate": 6.215178692757168e-06, + "loss": 3.5455, + "step": 110470 + }, + { + "epoch": 7.506114961271912, + "grad_norm": 0.1488502472639084, + "learning_rate": 6.210932191873896e-06, + "loss": 3.9389, + "step": 110475 + }, + { + "epoch": 7.506454681342574, + "grad_norm": 0.16972362995147705, + "learning_rate": 6.206685690990624e-06, + "loss": 3.785, + "step": 110480 + }, + { + "epoch": 7.506794401413235, + "grad_norm": 0.1857243776321411, + "learning_rate": 6.2024391901073514e-06, + "loss": 3.8438, + "step": 110485 + }, + { + "epoch": 7.507134121483897, + "grad_norm": 0.18489739298820496, + "learning_rate": 6.1981926892240795e-06, + "loss": 3.6067, + "step": 110490 + }, + { + "epoch": 7.5074738415545585, + "grad_norm": 0.15643315017223358, + "learning_rate": 6.1939461883408075e-06, + "loss": 3.5518, + "step": 110495 + }, + { + "epoch": 7.507813561625221, + "grad_norm": 0.1646411418914795, + "learning_rate": 6.1896996874575355e-06, + "loss": 3.7616, + "step": 110500 + }, + { + "epoch": 7.508153281695883, + "grad_norm": 0.2047731578350067, + "learning_rate": 6.1854531865742635e-06, + "loss": 3.8602, + "step": 110505 + }, + { + "epoch": 7.508493001766544, + "grad_norm": 0.17653411626815796, + "learning_rate": 6.181206685690991e-06, + "loss": 3.8479, + "step": 110510 + }, + { + "epoch": 7.508832721837206, + "grad_norm": 0.1823432892560959, + "learning_rate": 6.176960184807719e-06, + "loss": 3.7151, + "step": 110515 + }, + { + "epoch": 7.509172441907868, + "grad_norm": 0.18825934827327728, + "learning_rate": 6.172713683924447e-06, + "loss": 3.5989, + "step": 110520 + }, + { + "epoch": 7.509512161978529, + "grad_norm": 0.22186389565467834, + "learning_rate": 6.168467183041175e-06, + "loss": 3.7384, + "step": 110525 + }, + { + "epoch": 7.509851882049191, + "grad_norm": 0.2513170540332794, + "learning_rate": 6.164220682157902e-06, + "loss": 3.8221, + "step": 110530 + }, + { + "epoch": 7.510191602119853, + "grad_norm": 0.17236828804016113, + "learning_rate": 6.159974181274631e-06, + "loss": 3.6418, + "step": 110535 + }, + { + "epoch": 7.510531322190515, + "grad_norm": 0.1502288579940796, + "learning_rate": 6.155727680391358e-06, + "loss": 3.9231, + "step": 110540 + }, + { + "epoch": 7.510871042261177, + "grad_norm": 0.24719512462615967, + "learning_rate": 6.151481179508085e-06, + "loss": 3.8227, + "step": 110545 + }, + { + "epoch": 7.511210762331839, + "grad_norm": 0.20168408751487732, + "learning_rate": 6.147234678624814e-06, + "loss": 3.8805, + "step": 110550 + }, + { + "epoch": 7.5115504824025, + "grad_norm": 0.1800236701965332, + "learning_rate": 6.142988177741541e-06, + "loss": 3.7034, + "step": 110555 + }, + { + "epoch": 7.511890202473162, + "grad_norm": 0.14899739623069763, + "learning_rate": 6.138741676858269e-06, + "loss": 4.0496, + "step": 110560 + }, + { + "epoch": 7.512229922543824, + "grad_norm": 0.2670130729675293, + "learning_rate": 6.134495175974997e-06, + "loss": 3.9938, + "step": 110565 + }, + { + "epoch": 7.512569642614485, + "grad_norm": 0.23219096660614014, + "learning_rate": 6.130248675091724e-06, + "loss": 3.6084, + "step": 110570 + }, + { + "epoch": 7.512909362685147, + "grad_norm": 0.20177027583122253, + "learning_rate": 6.126002174208453e-06, + "loss": 3.9484, + "step": 110575 + }, + { + "epoch": 7.513249082755809, + "grad_norm": 0.13997168838977814, + "learning_rate": 6.12175567332518e-06, + "loss": 4.0233, + "step": 110580 + }, + { + "epoch": 7.513588802826471, + "grad_norm": 0.17754283547401428, + "learning_rate": 6.117509172441908e-06, + "loss": 3.9777, + "step": 110585 + }, + { + "epoch": 7.513928522897133, + "grad_norm": 1.0384160280227661, + "learning_rate": 6.113262671558636e-06, + "loss": 3.8615, + "step": 110590 + }, + { + "epoch": 7.514268242967795, + "grad_norm": 0.18353107571601868, + "learning_rate": 6.109016170675363e-06, + "loss": 3.5834, + "step": 110595 + }, + { + "epoch": 7.514607963038456, + "grad_norm": 0.14695030450820923, + "learning_rate": 6.104769669792091e-06, + "loss": 3.8504, + "step": 110600 + }, + { + "epoch": 7.514947683109118, + "grad_norm": 0.20088018476963043, + "learning_rate": 6.100523168908819e-06, + "loss": 3.7209, + "step": 110605 + }, + { + "epoch": 7.51528740317978, + "grad_norm": 0.23556403815746307, + "learning_rate": 6.096276668025547e-06, + "loss": 3.8361, + "step": 110610 + }, + { + "epoch": 7.515627123250441, + "grad_norm": 0.19575819373130798, + "learning_rate": 6.0920301671422746e-06, + "loss": 3.7501, + "step": 110615 + }, + { + "epoch": 7.515966843321103, + "grad_norm": 0.1922726184129715, + "learning_rate": 6.087783666259003e-06, + "loss": 3.8824, + "step": 110620 + }, + { + "epoch": 7.516306563391765, + "grad_norm": 0.14147117733955383, + "learning_rate": 6.083537165375731e-06, + "loss": 3.8107, + "step": 110625 + }, + { + "epoch": 7.516646283462427, + "grad_norm": 0.14555774629116058, + "learning_rate": 6.079290664492459e-06, + "loss": 3.8455, + "step": 110630 + }, + { + "epoch": 7.516986003533089, + "grad_norm": 0.19247448444366455, + "learning_rate": 6.075044163609187e-06, + "loss": 3.6433, + "step": 110635 + }, + { + "epoch": 7.517325723603751, + "grad_norm": 0.18492308259010315, + "learning_rate": 6.070797662725914e-06, + "loss": 3.8199, + "step": 110640 + }, + { + "epoch": 7.517665443674412, + "grad_norm": 0.7976596355438232, + "learning_rate": 6.066551161842643e-06, + "loss": 3.8218, + "step": 110645 + }, + { + "epoch": 7.518005163745074, + "grad_norm": 0.18502740561962128, + "learning_rate": 6.06230466095937e-06, + "loss": 3.8673, + "step": 110650 + }, + { + "epoch": 7.518344883815736, + "grad_norm": 0.2059830129146576, + "learning_rate": 6.058058160076097e-06, + "loss": 3.9849, + "step": 110655 + }, + { + "epoch": 7.518684603886397, + "grad_norm": 0.14394930005073547, + "learning_rate": 6.053811659192826e-06, + "loss": 3.8012, + "step": 110660 + }, + { + "epoch": 7.519024323957059, + "grad_norm": 0.1683526635169983, + "learning_rate": 6.049565158309553e-06, + "loss": 3.9332, + "step": 110665 + }, + { + "epoch": 7.519364044027721, + "grad_norm": 0.15902544558048248, + "learning_rate": 6.045318657426281e-06, + "loss": 3.8073, + "step": 110670 + }, + { + "epoch": 7.519703764098383, + "grad_norm": 0.15049846470355988, + "learning_rate": 6.041072156543009e-06, + "loss": 3.5295, + "step": 110675 + }, + { + "epoch": 7.520043484169045, + "grad_norm": 0.24491986632347107, + "learning_rate": 6.036825655659736e-06, + "loss": 3.9191, + "step": 110680 + }, + { + "epoch": 7.520383204239707, + "grad_norm": 0.16350145637989044, + "learning_rate": 6.032579154776465e-06, + "loss": 3.8055, + "step": 110685 + }, + { + "epoch": 7.520722924310368, + "grad_norm": 0.15779028832912445, + "learning_rate": 6.028332653893192e-06, + "loss": 3.8138, + "step": 110690 + }, + { + "epoch": 7.52106264438103, + "grad_norm": 0.16396184265613556, + "learning_rate": 6.02408615300992e-06, + "loss": 3.7615, + "step": 110695 + }, + { + "epoch": 7.521402364451692, + "grad_norm": 0.1783406138420105, + "learning_rate": 6.019839652126648e-06, + "loss": 3.7727, + "step": 110700 + }, + { + "epoch": 7.521742084522353, + "grad_norm": 0.2645393908023834, + "learning_rate": 6.015593151243376e-06, + "loss": 3.7315, + "step": 110705 + }, + { + "epoch": 7.522081804593015, + "grad_norm": 0.1678187996149063, + "learning_rate": 6.011346650360103e-06, + "loss": 4.2299, + "step": 110710 + }, + { + "epoch": 7.522421524663677, + "grad_norm": 1.2547980546951294, + "learning_rate": 6.007100149476831e-06, + "loss": 4.1247, + "step": 110715 + }, + { + "epoch": 7.522761244734339, + "grad_norm": 0.1691565066576004, + "learning_rate": 6.002853648593559e-06, + "loss": 3.8669, + "step": 110720 + }, + { + "epoch": 7.523100964805001, + "grad_norm": 0.2241189032793045, + "learning_rate": 5.9986071477102865e-06, + "loss": 3.9136, + "step": 110725 + }, + { + "epoch": 7.523440684875663, + "grad_norm": 0.17967072129249573, + "learning_rate": 5.994360646827015e-06, + "loss": 4.0555, + "step": 110730 + }, + { + "epoch": 7.523780404946324, + "grad_norm": 0.17348523437976837, + "learning_rate": 5.9901141459437425e-06, + "loss": 3.9753, + "step": 110735 + }, + { + "epoch": 7.524120125016986, + "grad_norm": 0.19738423824310303, + "learning_rate": 5.9858676450604705e-06, + "loss": 3.7595, + "step": 110740 + }, + { + "epoch": 7.524459845087648, + "grad_norm": 0.15469396114349365, + "learning_rate": 5.9816211441771985e-06, + "loss": 3.7035, + "step": 110745 + }, + { + "epoch": 7.524799565158309, + "grad_norm": 0.15855391323566437, + "learning_rate": 5.977374643293926e-06, + "loss": 3.7656, + "step": 110750 + }, + { + "epoch": 7.525139285228971, + "grad_norm": 0.18219852447509766, + "learning_rate": 5.9731281424106545e-06, + "loss": 3.7504, + "step": 110755 + }, + { + "epoch": 7.525479005299633, + "grad_norm": 0.20954470336437225, + "learning_rate": 5.968881641527382e-06, + "loss": 3.6839, + "step": 110760 + }, + { + "epoch": 7.525818725370295, + "grad_norm": 0.16541160643100739, + "learning_rate": 5.964635140644109e-06, + "loss": 3.7989, + "step": 110765 + }, + { + "epoch": 7.526158445440957, + "grad_norm": 0.18675324320793152, + "learning_rate": 5.960388639760838e-06, + "loss": 3.8061, + "step": 110770 + }, + { + "epoch": 7.526498165511619, + "grad_norm": 0.17207616567611694, + "learning_rate": 5.956142138877565e-06, + "loss": 3.855, + "step": 110775 + }, + { + "epoch": 7.52683788558228, + "grad_norm": 1.3789759874343872, + "learning_rate": 5.951895637994293e-06, + "loss": 3.7494, + "step": 110780 + }, + { + "epoch": 7.527177605652942, + "grad_norm": 0.21613462269306183, + "learning_rate": 5.947649137111021e-06, + "loss": 3.7686, + "step": 110785 + }, + { + "epoch": 7.527517325723604, + "grad_norm": 0.2136811465024948, + "learning_rate": 5.943402636227749e-06, + "loss": 3.7431, + "step": 110790 + }, + { + "epoch": 7.527857045794265, + "grad_norm": 0.20059165358543396, + "learning_rate": 5.939156135344476e-06, + "loss": 3.874, + "step": 110795 + }, + { + "epoch": 7.528196765864927, + "grad_norm": 0.1521925926208496, + "learning_rate": 5.934909634461204e-06, + "loss": 3.9592, + "step": 110800 + }, + { + "epoch": 7.528536485935589, + "grad_norm": 0.7780377268791199, + "learning_rate": 5.930663133577932e-06, + "loss": 3.9632, + "step": 110805 + }, + { + "epoch": 7.528876206006251, + "grad_norm": 0.14979176223278046, + "learning_rate": 5.92641663269466e-06, + "loss": 3.7194, + "step": 110810 + }, + { + "epoch": 7.529215926076913, + "grad_norm": 0.3746068775653839, + "learning_rate": 5.922170131811388e-06, + "loss": 3.7201, + "step": 110815 + }, + { + "epoch": 7.529555646147575, + "grad_norm": 0.19081126153469086, + "learning_rate": 5.917923630928115e-06, + "loss": 3.8679, + "step": 110820 + }, + { + "epoch": 7.529895366218236, + "grad_norm": 2.442559003829956, + "learning_rate": 5.913677130044843e-06, + "loss": 3.7085, + "step": 110825 + }, + { + "epoch": 7.530235086288898, + "grad_norm": 0.49558788537979126, + "learning_rate": 5.909430629161571e-06, + "loss": 3.6819, + "step": 110830 + }, + { + "epoch": 7.53057480635956, + "grad_norm": 0.15946967899799347, + "learning_rate": 5.9051841282782984e-06, + "loss": 3.8249, + "step": 110835 + }, + { + "epoch": 7.530914526430221, + "grad_norm": 0.14450056850910187, + "learning_rate": 5.900937627395027e-06, + "loss": 3.5882, + "step": 110840 + }, + { + "epoch": 7.531254246500883, + "grad_norm": 0.2450341433286667, + "learning_rate": 5.8966911265117544e-06, + "loss": 3.9155, + "step": 110845 + }, + { + "epoch": 7.5315939665715455, + "grad_norm": 0.15736646950244904, + "learning_rate": 5.8924446256284825e-06, + "loss": 3.9012, + "step": 110850 + }, + { + "epoch": 7.531933686642207, + "grad_norm": 0.19251181185245514, + "learning_rate": 5.8881981247452105e-06, + "loss": 3.7867, + "step": 110855 + }, + { + "epoch": 7.532273406712869, + "grad_norm": 0.24156725406646729, + "learning_rate": 5.883951623861938e-06, + "loss": 3.9556, + "step": 110860 + }, + { + "epoch": 7.532613126783531, + "grad_norm": 0.24190065264701843, + "learning_rate": 5.8797051229786665e-06, + "loss": 3.7647, + "step": 110865 + }, + { + "epoch": 7.532952846854192, + "grad_norm": 0.3160419166088104, + "learning_rate": 5.875458622095394e-06, + "loss": 3.8499, + "step": 110870 + }, + { + "epoch": 7.533292566924854, + "grad_norm": 0.3055918216705322, + "learning_rate": 5.871212121212122e-06, + "loss": 3.84, + "step": 110875 + }, + { + "epoch": 7.533632286995516, + "grad_norm": 0.16488678753376007, + "learning_rate": 5.86696562032885e-06, + "loss": 3.7565, + "step": 110880 + }, + { + "epoch": 7.533972007066177, + "grad_norm": 0.16793543100357056, + "learning_rate": 5.862719119445577e-06, + "loss": 3.8452, + "step": 110885 + }, + { + "epoch": 7.534311727136839, + "grad_norm": 0.14857161045074463, + "learning_rate": 5.858472618562305e-06, + "loss": 3.9549, + "step": 110890 + }, + { + "epoch": 7.5346514472075015, + "grad_norm": 0.18639156222343445, + "learning_rate": 5.854226117679033e-06, + "loss": 4.1879, + "step": 110895 + }, + { + "epoch": 7.534991167278163, + "grad_norm": 0.15836340188980103, + "learning_rate": 5.849979616795761e-06, + "loss": 3.5605, + "step": 110900 + }, + { + "epoch": 7.535330887348825, + "grad_norm": 0.15421974658966064, + "learning_rate": 5.845733115912488e-06, + "loss": 3.7564, + "step": 110905 + }, + { + "epoch": 7.535670607419487, + "grad_norm": 0.8151878714561462, + "learning_rate": 5.841486615029216e-06, + "loss": 4.0353, + "step": 110910 + }, + { + "epoch": 7.536010327490148, + "grad_norm": 0.1683824062347412, + "learning_rate": 5.837240114145944e-06, + "loss": 3.8894, + "step": 110915 + }, + { + "epoch": 7.53635004756081, + "grad_norm": 0.1393110752105713, + "learning_rate": 5.832993613262672e-06, + "loss": 3.8367, + "step": 110920 + }, + { + "epoch": 7.536689767631472, + "grad_norm": 0.19730432331562042, + "learning_rate": 5.8287471123794e-06, + "loss": 3.6043, + "step": 110925 + }, + { + "epoch": 7.537029487702133, + "grad_norm": 0.18785543739795685, + "learning_rate": 5.824500611496127e-06, + "loss": 3.7994, + "step": 110930 + }, + { + "epoch": 7.537369207772795, + "grad_norm": 0.1479768604040146, + "learning_rate": 5.820254110612855e-06, + "loss": 3.9922, + "step": 110935 + }, + { + "epoch": 7.5377089278434575, + "grad_norm": 0.15111711621284485, + "learning_rate": 5.816007609729583e-06, + "loss": 4.0642, + "step": 110940 + }, + { + "epoch": 7.538048647914119, + "grad_norm": 0.1766716092824936, + "learning_rate": 5.81176110884631e-06, + "loss": 4.04, + "step": 110945 + }, + { + "epoch": 7.538388367984781, + "grad_norm": 0.1945595145225525, + "learning_rate": 5.807514607963039e-06, + "loss": 3.8176, + "step": 110950 + }, + { + "epoch": 7.538728088055443, + "grad_norm": 0.23102743923664093, + "learning_rate": 5.803268107079766e-06, + "loss": 4.0124, + "step": 110955 + }, + { + "epoch": 7.539067808126104, + "grad_norm": 0.15259675681591034, + "learning_rate": 5.799021606196494e-06, + "loss": 3.7094, + "step": 110960 + }, + { + "epoch": 7.539407528196766, + "grad_norm": 0.17189690470695496, + "learning_rate": 5.794775105313222e-06, + "loss": 4.089, + "step": 110965 + }, + { + "epoch": 7.539747248267427, + "grad_norm": 0.18903763592243195, + "learning_rate": 5.7905286044299496e-06, + "loss": 3.6597, + "step": 110970 + }, + { + "epoch": 7.540086968338089, + "grad_norm": 0.1990758329629898, + "learning_rate": 5.786282103546678e-06, + "loss": 3.9082, + "step": 110975 + }, + { + "epoch": 7.540426688408751, + "grad_norm": 0.18250702321529388, + "learning_rate": 5.7820356026634056e-06, + "loss": 3.6493, + "step": 110980 + }, + { + "epoch": 7.540766408479413, + "grad_norm": 0.12864722311496735, + "learning_rate": 5.777789101780134e-06, + "loss": 3.7761, + "step": 110985 + }, + { + "epoch": 7.541106128550075, + "grad_norm": 0.1518988311290741, + "learning_rate": 5.773542600896862e-06, + "loss": 3.8701, + "step": 110990 + }, + { + "epoch": 7.541445848620737, + "grad_norm": 0.1511543095111847, + "learning_rate": 5.769296100013589e-06, + "loss": 3.6455, + "step": 110995 + }, + { + "epoch": 7.541785568691398, + "grad_norm": 0.19066178798675537, + "learning_rate": 5.765049599130317e-06, + "loss": 3.7789, + "step": 111000 + }, + { + "epoch": 7.54212528876206, + "grad_norm": 0.17544065415859222, + "learning_rate": 5.760803098247045e-06, + "loss": 3.8721, + "step": 111005 + }, + { + "epoch": 7.542465008832722, + "grad_norm": 0.1610882431268692, + "learning_rate": 5.756556597363773e-06, + "loss": 3.9998, + "step": 111010 + }, + { + "epoch": 7.542804728903383, + "grad_norm": 0.14971689879894257, + "learning_rate": 5.7523100964805e-06, + "loss": 3.7467, + "step": 111015 + }, + { + "epoch": 7.543144448974045, + "grad_norm": 2.307210922241211, + "learning_rate": 5.748063595597228e-06, + "loss": 3.7923, + "step": 111020 + }, + { + "epoch": 7.543484169044707, + "grad_norm": 0.17039275169372559, + "learning_rate": 5.743817094713956e-06, + "loss": 3.6992, + "step": 111025 + }, + { + "epoch": 7.543823889115369, + "grad_norm": 0.1881740689277649, + "learning_rate": 5.739570593830684e-06, + "loss": 3.8128, + "step": 111030 + }, + { + "epoch": 7.544163609186031, + "grad_norm": 0.1597687304019928, + "learning_rate": 5.735324092947412e-06, + "loss": 3.8289, + "step": 111035 + }, + { + "epoch": 7.544503329256693, + "grad_norm": 0.24087315797805786, + "learning_rate": 5.731077592064139e-06, + "loss": 3.8593, + "step": 111040 + }, + { + "epoch": 7.544843049327354, + "grad_norm": 0.18211130797863007, + "learning_rate": 5.726831091180868e-06, + "loss": 3.6518, + "step": 111045 + }, + { + "epoch": 7.545182769398016, + "grad_norm": 0.24849484860897064, + "learning_rate": 5.722584590297595e-06, + "loss": 3.9902, + "step": 111050 + }, + { + "epoch": 7.545522489468678, + "grad_norm": 0.2087775617837906, + "learning_rate": 5.718338089414322e-06, + "loss": 3.9688, + "step": 111055 + }, + { + "epoch": 7.545862209539339, + "grad_norm": 0.17395073175430298, + "learning_rate": 5.714091588531051e-06, + "loss": 3.7658, + "step": 111060 + }, + { + "epoch": 7.546201929610001, + "grad_norm": 0.19094815850257874, + "learning_rate": 5.709845087647778e-06, + "loss": 3.6469, + "step": 111065 + }, + { + "epoch": 7.546541649680663, + "grad_norm": 0.17799699306488037, + "learning_rate": 5.705598586764506e-06, + "loss": 3.8414, + "step": 111070 + }, + { + "epoch": 7.546881369751325, + "grad_norm": 0.40842491388320923, + "learning_rate": 5.701352085881234e-06, + "loss": 3.9498, + "step": 111075 + }, + { + "epoch": 7.547221089821987, + "grad_norm": 0.14486826956272125, + "learning_rate": 5.6971055849979615e-06, + "loss": 3.745, + "step": 111080 + }, + { + "epoch": 7.547560809892649, + "grad_norm": 0.20098043978214264, + "learning_rate": 5.6928590841146895e-06, + "loss": 3.6803, + "step": 111085 + }, + { + "epoch": 7.54790052996331, + "grad_norm": 0.15154637396335602, + "learning_rate": 5.6886125832314175e-06, + "loss": 3.9235, + "step": 111090 + }, + { + "epoch": 7.548240250033972, + "grad_norm": 0.1612471491098404, + "learning_rate": 5.6843660823481455e-06, + "loss": 3.9775, + "step": 111095 + }, + { + "epoch": 7.548579970104634, + "grad_norm": 0.17969481647014618, + "learning_rate": 5.6801195814648735e-06, + "loss": 4.0457, + "step": 111100 + }, + { + "epoch": 7.548919690175295, + "grad_norm": 0.21678172051906586, + "learning_rate": 5.675873080581601e-06, + "loss": 3.9191, + "step": 111105 + }, + { + "epoch": 7.549259410245957, + "grad_norm": 0.220237597823143, + "learning_rate": 5.671626579698329e-06, + "loss": 3.7607, + "step": 111110 + }, + { + "epoch": 7.5495991303166194, + "grad_norm": 0.17431508004665375, + "learning_rate": 5.667380078815057e-06, + "loss": 3.5956, + "step": 111115 + }, + { + "epoch": 7.549938850387281, + "grad_norm": 0.19384737312793732, + "learning_rate": 5.663133577931785e-06, + "loss": 4.0433, + "step": 111120 + }, + { + "epoch": 7.550278570457943, + "grad_norm": 0.20456327497959137, + "learning_rate": 5.658887077048512e-06, + "loss": 3.91, + "step": 111125 + }, + { + "epoch": 7.550618290528605, + "grad_norm": 0.14036329090595245, + "learning_rate": 5.654640576165241e-06, + "loss": 3.7216, + "step": 111130 + }, + { + "epoch": 7.550958010599266, + "grad_norm": 0.26194655895233154, + "learning_rate": 5.650394075281968e-06, + "loss": 3.7071, + "step": 111135 + }, + { + "epoch": 7.551297730669928, + "grad_norm": 0.17938394844532013, + "learning_rate": 5.646147574398696e-06, + "loss": 3.7012, + "step": 111140 + }, + { + "epoch": 7.55163745074059, + "grad_norm": 0.19144611060619354, + "learning_rate": 5.641901073515424e-06, + "loss": 3.7307, + "step": 111145 + }, + { + "epoch": 7.551977170811251, + "grad_norm": 0.14892204105854034, + "learning_rate": 5.637654572632151e-06, + "loss": 3.8965, + "step": 111150 + }, + { + "epoch": 7.552316890881913, + "grad_norm": 0.1646641194820404, + "learning_rate": 5.63340807174888e-06, + "loss": 3.833, + "step": 111155 + }, + { + "epoch": 7.5526566109525755, + "grad_norm": 0.14717212319374084, + "learning_rate": 5.629161570865607e-06, + "loss": 3.8665, + "step": 111160 + }, + { + "epoch": 7.552996331023237, + "grad_norm": 0.16429032385349274, + "learning_rate": 5.624915069982334e-06, + "loss": 3.9031, + "step": 111165 + }, + { + "epoch": 7.553336051093899, + "grad_norm": 0.20289789140224457, + "learning_rate": 5.620668569099063e-06, + "loss": 3.9139, + "step": 111170 + }, + { + "epoch": 7.55367577116456, + "grad_norm": 0.14252842962741852, + "learning_rate": 5.61642206821579e-06, + "loss": 3.7512, + "step": 111175 + }, + { + "epoch": 7.554015491235222, + "grad_norm": 0.22839954495429993, + "learning_rate": 5.612175567332518e-06, + "loss": 3.7574, + "step": 111180 + }, + { + "epoch": 7.554355211305884, + "grad_norm": 0.16699527204036713, + "learning_rate": 5.607929066449246e-06, + "loss": 3.8581, + "step": 111185 + }, + { + "epoch": 7.554694931376545, + "grad_norm": 0.2206030637025833, + "learning_rate": 5.6036825655659734e-06, + "loss": 3.9139, + "step": 111190 + }, + { + "epoch": 7.555034651447207, + "grad_norm": 0.7113749384880066, + "learning_rate": 5.5994360646827014e-06, + "loss": 3.7499, + "step": 111195 + }, + { + "epoch": 7.555374371517869, + "grad_norm": 0.23931753635406494, + "learning_rate": 5.5951895637994294e-06, + "loss": 3.943, + "step": 111200 + }, + { + "epoch": 7.555714091588531, + "grad_norm": 0.14551526308059692, + "learning_rate": 5.5909430629161575e-06, + "loss": 3.8539, + "step": 111205 + }, + { + "epoch": 7.556053811659193, + "grad_norm": 0.1595981866121292, + "learning_rate": 5.5866965620328855e-06, + "loss": 4.0427, + "step": 111210 + }, + { + "epoch": 7.556393531729855, + "grad_norm": 0.1929859071969986, + "learning_rate": 5.5824500611496135e-06, + "loss": 3.5709, + "step": 111215 + }, + { + "epoch": 7.556733251800516, + "grad_norm": 0.16275545954704285, + "learning_rate": 5.578203560266341e-06, + "loss": 3.7696, + "step": 111220 + }, + { + "epoch": 7.557072971871178, + "grad_norm": 0.16619886457920074, + "learning_rate": 5.573957059383069e-06, + "loss": 3.8286, + "step": 111225 + }, + { + "epoch": 7.55741269194184, + "grad_norm": 0.19714006781578064, + "learning_rate": 5.569710558499797e-06, + "loss": 3.7449, + "step": 111230 + }, + { + "epoch": 7.557752412012501, + "grad_norm": 0.25609055161476135, + "learning_rate": 5.565464057616524e-06, + "loss": 3.5595, + "step": 111235 + }, + { + "epoch": 7.558092132083163, + "grad_norm": 0.16568341851234436, + "learning_rate": 5.561217556733253e-06, + "loss": 3.9008, + "step": 111240 + }, + { + "epoch": 7.558431852153825, + "grad_norm": 0.3904826045036316, + "learning_rate": 5.55697105584998e-06, + "loss": 3.7818, + "step": 111245 + }, + { + "epoch": 7.558771572224487, + "grad_norm": 0.17785285413265228, + "learning_rate": 5.552724554966708e-06, + "loss": 4.0528, + "step": 111250 + }, + { + "epoch": 7.559111292295149, + "grad_norm": 0.19545011222362518, + "learning_rate": 5.548478054083436e-06, + "loss": 3.978, + "step": 111255 + }, + { + "epoch": 7.559451012365811, + "grad_norm": 0.22523191571235657, + "learning_rate": 5.544231553200163e-06, + "loss": 3.8401, + "step": 111260 + }, + { + "epoch": 7.559790732436472, + "grad_norm": 0.15712900459766388, + "learning_rate": 5.539985052316891e-06, + "loss": 3.8993, + "step": 111265 + }, + { + "epoch": 7.560130452507134, + "grad_norm": 0.6036403775215149, + "learning_rate": 5.535738551433619e-06, + "loss": 3.5866, + "step": 111270 + }, + { + "epoch": 7.560470172577796, + "grad_norm": 0.21139898896217346, + "learning_rate": 5.531492050550346e-06, + "loss": 3.8423, + "step": 111275 + }, + { + "epoch": 7.560809892648457, + "grad_norm": 0.16368542611598969, + "learning_rate": 5.527245549667075e-06, + "loss": 3.9437, + "step": 111280 + }, + { + "epoch": 7.561149612719119, + "grad_norm": 0.20704501867294312, + "learning_rate": 5.522999048783802e-06, + "loss": 3.7663, + "step": 111285 + }, + { + "epoch": 7.561489332789781, + "grad_norm": 0.3995591104030609, + "learning_rate": 5.51875254790053e-06, + "loss": 4.0886, + "step": 111290 + }, + { + "epoch": 7.561829052860443, + "grad_norm": 1.5097674131393433, + "learning_rate": 5.514506047017258e-06, + "loss": 3.7751, + "step": 111295 + }, + { + "epoch": 7.562168772931105, + "grad_norm": 0.162684366106987, + "learning_rate": 5.510259546133986e-06, + "loss": 4.0313, + "step": 111300 + }, + { + "epoch": 7.562508493001767, + "grad_norm": 0.18431492149829865, + "learning_rate": 5.506013045250713e-06, + "loss": 3.841, + "step": 111305 + }, + { + "epoch": 7.562848213072428, + "grad_norm": 0.1870718151330948, + "learning_rate": 5.501766544367441e-06, + "loss": 4.0385, + "step": 111310 + }, + { + "epoch": 7.56318793314309, + "grad_norm": 0.21702441573143005, + "learning_rate": 5.497520043484169e-06, + "loss": 3.9232, + "step": 111315 + }, + { + "epoch": 7.563527653213752, + "grad_norm": 0.30307385325431824, + "learning_rate": 5.493273542600897e-06, + "loss": 3.7089, + "step": 111320 + }, + { + "epoch": 7.563867373284413, + "grad_norm": 0.2354942411184311, + "learning_rate": 5.489027041717625e-06, + "loss": 3.7749, + "step": 111325 + }, + { + "epoch": 7.564207093355075, + "grad_norm": 0.16496789455413818, + "learning_rate": 5.4847805408343526e-06, + "loss": 3.8346, + "step": 111330 + }, + { + "epoch": 7.564546813425737, + "grad_norm": 0.1869441121816635, + "learning_rate": 5.4805340399510806e-06, + "loss": 4.0959, + "step": 111335 + }, + { + "epoch": 7.564886533496399, + "grad_norm": 0.15804530680179596, + "learning_rate": 5.476287539067809e-06, + "loss": 3.8578, + "step": 111340 + }, + { + "epoch": 7.565226253567061, + "grad_norm": 0.16355986893177032, + "learning_rate": 5.472041038184536e-06, + "loss": 3.9191, + "step": 111345 + }, + { + "epoch": 7.565565973637723, + "grad_norm": 0.17209751904010773, + "learning_rate": 5.467794537301265e-06, + "loss": 3.9434, + "step": 111350 + }, + { + "epoch": 7.565905693708384, + "grad_norm": 0.16632620990276337, + "learning_rate": 5.463548036417992e-06, + "loss": 3.6346, + "step": 111355 + }, + { + "epoch": 7.566245413779046, + "grad_norm": 0.5215064287185669, + "learning_rate": 5.45930153553472e-06, + "loss": 3.939, + "step": 111360 + }, + { + "epoch": 7.566585133849708, + "grad_norm": 0.41561242938041687, + "learning_rate": 5.455055034651448e-06, + "loss": 3.8865, + "step": 111365 + }, + { + "epoch": 7.566924853920369, + "grad_norm": 0.21964053809642792, + "learning_rate": 5.450808533768175e-06, + "loss": 3.8808, + "step": 111370 + }, + { + "epoch": 7.567264573991031, + "grad_norm": 0.3285422623157501, + "learning_rate": 5.446562032884903e-06, + "loss": 3.808, + "step": 111375 + }, + { + "epoch": 7.567604294061693, + "grad_norm": 0.5777092576026917, + "learning_rate": 5.442315532001631e-06, + "loss": 3.9183, + "step": 111380 + }, + { + "epoch": 7.567944014132355, + "grad_norm": 0.16784802079200745, + "learning_rate": 5.438069031118358e-06, + "loss": 3.9207, + "step": 111385 + }, + { + "epoch": 7.568283734203017, + "grad_norm": 0.15484076738357544, + "learning_rate": 5.433822530235087e-06, + "loss": 4.0092, + "step": 111390 + }, + { + "epoch": 7.568623454273679, + "grad_norm": 0.1799124926328659, + "learning_rate": 5.429576029351814e-06, + "loss": 3.9686, + "step": 111395 + }, + { + "epoch": 7.56896317434434, + "grad_norm": 0.21313580870628357, + "learning_rate": 5.425329528468542e-06, + "loss": 3.9974, + "step": 111400 + }, + { + "epoch": 7.569302894415002, + "grad_norm": 0.18473964929580688, + "learning_rate": 5.42108302758527e-06, + "loss": 3.9485, + "step": 111405 + }, + { + "epoch": 7.569642614485664, + "grad_norm": 0.13854172825813293, + "learning_rate": 5.416836526701998e-06, + "loss": 3.8203, + "step": 111410 + }, + { + "epoch": 7.569982334556325, + "grad_norm": 0.14677202701568604, + "learning_rate": 5.412590025818725e-06, + "loss": 3.3878, + "step": 111415 + }, + { + "epoch": 7.570322054626987, + "grad_norm": 0.17290571331977844, + "learning_rate": 5.408343524935453e-06, + "loss": 3.697, + "step": 111420 + }, + { + "epoch": 7.5706617746976494, + "grad_norm": 0.17596106231212616, + "learning_rate": 5.404097024052181e-06, + "loss": 3.9426, + "step": 111425 + }, + { + "epoch": 7.571001494768311, + "grad_norm": 0.15785838663578033, + "learning_rate": 5.399850523168909e-06, + "loss": 3.8815, + "step": 111430 + }, + { + "epoch": 7.571341214838973, + "grad_norm": 0.278007447719574, + "learning_rate": 5.395604022285637e-06, + "loss": 3.7482, + "step": 111435 + }, + { + "epoch": 7.571680934909635, + "grad_norm": 0.5072944760322571, + "learning_rate": 5.3913575214023645e-06, + "loss": 3.8307, + "step": 111440 + }, + { + "epoch": 7.572020654980296, + "grad_norm": 0.15053199231624603, + "learning_rate": 5.3871110205190925e-06, + "loss": 4.1612, + "step": 111445 + }, + { + "epoch": 7.572360375050958, + "grad_norm": 0.17872114479541779, + "learning_rate": 5.3828645196358205e-06, + "loss": 3.6639, + "step": 111450 + }, + { + "epoch": 7.57270009512162, + "grad_norm": 0.14985321462154388, + "learning_rate": 5.378618018752548e-06, + "loss": 3.8504, + "step": 111455 + }, + { + "epoch": 7.573039815192281, + "grad_norm": 0.16421908140182495, + "learning_rate": 5.3743715178692765e-06, + "loss": 3.6471, + "step": 111460 + }, + { + "epoch": 7.573379535262943, + "grad_norm": 0.1849800944328308, + "learning_rate": 5.370125016986004e-06, + "loss": 3.7248, + "step": 111465 + }, + { + "epoch": 7.5737192553336055, + "grad_norm": 0.21991558372974396, + "learning_rate": 5.365878516102731e-06, + "loss": 3.9168, + "step": 111470 + }, + { + "epoch": 7.574058975404267, + "grad_norm": 0.17722585797309875, + "learning_rate": 5.36163201521946e-06, + "loss": 3.5755, + "step": 111475 + }, + { + "epoch": 7.574398695474929, + "grad_norm": 0.19616201519966125, + "learning_rate": 5.357385514336187e-06, + "loss": 4.1009, + "step": 111480 + }, + { + "epoch": 7.574738415545591, + "grad_norm": 0.12650318443775177, + "learning_rate": 5.353139013452915e-06, + "loss": 3.6294, + "step": 111485 + }, + { + "epoch": 7.575078135616252, + "grad_norm": 0.5647569298744202, + "learning_rate": 5.348892512569643e-06, + "loss": 3.8509, + "step": 111490 + }, + { + "epoch": 7.575417855686914, + "grad_norm": 0.19375953078269958, + "learning_rate": 5.344646011686371e-06, + "loss": 4.0257, + "step": 111495 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 0.16198447346687317, + "learning_rate": 5.340399510803099e-06, + "loss": 3.849, + "step": 111500 + }, + { + "epoch": 7.576097295828237, + "grad_norm": 0.17629724740982056, + "learning_rate": 5.336153009919826e-06, + "loss": 3.8837, + "step": 111505 + }, + { + "epoch": 7.576437015898899, + "grad_norm": 0.22423088550567627, + "learning_rate": 5.331906509036554e-06, + "loss": 3.9919, + "step": 111510 + }, + { + "epoch": 7.5767767359695615, + "grad_norm": 0.16320492327213287, + "learning_rate": 5.327660008153282e-06, + "loss": 3.7833, + "step": 111515 + }, + { + "epoch": 7.577116456040223, + "grad_norm": 0.17046234011650085, + "learning_rate": 5.32341350727001e-06, + "loss": 3.9255, + "step": 111520 + }, + { + "epoch": 7.577456176110885, + "grad_norm": 0.16831378638744354, + "learning_rate": 5.319167006386737e-06, + "loss": 3.6752, + "step": 111525 + }, + { + "epoch": 7.577795896181547, + "grad_norm": 0.20963114500045776, + "learning_rate": 5.314920505503465e-06, + "loss": 3.8945, + "step": 111530 + }, + { + "epoch": 7.578135616252208, + "grad_norm": 0.21085967123508453, + "learning_rate": 5.310674004620193e-06, + "loss": 3.9633, + "step": 111535 + }, + { + "epoch": 7.57847533632287, + "grad_norm": 0.2018808275461197, + "learning_rate": 5.306427503736921e-06, + "loss": 3.5603, + "step": 111540 + }, + { + "epoch": 7.578815056393532, + "grad_norm": 0.19957216084003448, + "learning_rate": 5.302181002853649e-06, + "loss": 3.6844, + "step": 111545 + }, + { + "epoch": 7.579154776464193, + "grad_norm": 0.13520435988903046, + "learning_rate": 5.2979345019703764e-06, + "loss": 3.7499, + "step": 111550 + }, + { + "epoch": 7.579494496534855, + "grad_norm": 0.18666458129882812, + "learning_rate": 5.2936880010871044e-06, + "loss": 3.8449, + "step": 111555 + }, + { + "epoch": 7.5798342166055175, + "grad_norm": 0.15528562664985657, + "learning_rate": 5.2894415002038324e-06, + "loss": 3.7553, + "step": 111560 + }, + { + "epoch": 7.580173936676179, + "grad_norm": 0.1925085335969925, + "learning_rate": 5.28519499932056e-06, + "loss": 3.8359, + "step": 111565 + }, + { + "epoch": 7.580513656746841, + "grad_norm": 0.18165844678878784, + "learning_rate": 5.2809484984372885e-06, + "loss": 3.4482, + "step": 111570 + }, + { + "epoch": 7.580853376817503, + "grad_norm": 0.2649809420108795, + "learning_rate": 5.276701997554016e-06, + "loss": 3.7592, + "step": 111575 + }, + { + "epoch": 7.581193096888164, + "grad_norm": 0.16329777240753174, + "learning_rate": 5.272455496670744e-06, + "loss": 3.653, + "step": 111580 + }, + { + "epoch": 7.581532816958826, + "grad_norm": 0.18223612010478973, + "learning_rate": 5.268208995787472e-06, + "loss": 3.7378, + "step": 111585 + }, + { + "epoch": 7.581872537029488, + "grad_norm": 0.17576639354228973, + "learning_rate": 5.263962494904199e-06, + "loss": 3.8513, + "step": 111590 + }, + { + "epoch": 7.582212257100149, + "grad_norm": 0.14399555325508118, + "learning_rate": 5.259715994020927e-06, + "loss": 3.8504, + "step": 111595 + }, + { + "epoch": 7.582551977170811, + "grad_norm": 0.2507469356060028, + "learning_rate": 5.255469493137655e-06, + "loss": 3.7317, + "step": 111600 + }, + { + "epoch": 7.5828916972414735, + "grad_norm": 0.1291441172361374, + "learning_rate": 5.251222992254383e-06, + "loss": 4.0209, + "step": 111605 + }, + { + "epoch": 7.583231417312135, + "grad_norm": 0.16234296560287476, + "learning_rate": 5.246976491371111e-06, + "loss": 3.7439, + "step": 111610 + }, + { + "epoch": 7.583571137382797, + "grad_norm": 0.19685055315494537, + "learning_rate": 5.242729990487838e-06, + "loss": 3.912, + "step": 111615 + }, + { + "epoch": 7.583910857453459, + "grad_norm": 0.1711939126253128, + "learning_rate": 5.238483489604566e-06, + "loss": 3.8553, + "step": 111620 + }, + { + "epoch": 7.58425057752412, + "grad_norm": 0.17480432987213135, + "learning_rate": 5.234236988721294e-06, + "loss": 3.7402, + "step": 111625 + }, + { + "epoch": 7.584590297594782, + "grad_norm": 0.4183880090713501, + "learning_rate": 5.229990487838022e-06, + "loss": 4.1438, + "step": 111630 + }, + { + "epoch": 7.584930017665444, + "grad_norm": 0.1742481142282486, + "learning_rate": 5.225743986954749e-06, + "loss": 3.829, + "step": 111635 + }, + { + "epoch": 7.585269737736105, + "grad_norm": 0.21174661815166473, + "learning_rate": 5.221497486071477e-06, + "loss": 3.8457, + "step": 111640 + }, + { + "epoch": 7.585609457806767, + "grad_norm": 0.6650010347366333, + "learning_rate": 5.217250985188205e-06, + "loss": 3.7865, + "step": 111645 + }, + { + "epoch": 7.585949177877429, + "grad_norm": 0.184774711728096, + "learning_rate": 5.213004484304932e-06, + "loss": 3.6774, + "step": 111650 + }, + { + "epoch": 7.586288897948091, + "grad_norm": 0.30775514245033264, + "learning_rate": 5.208757983421661e-06, + "loss": 3.8693, + "step": 111655 + }, + { + "epoch": 7.586628618018753, + "grad_norm": 0.1903187334537506, + "learning_rate": 5.204511482538388e-06, + "loss": 3.8529, + "step": 111660 + }, + { + "epoch": 7.586968338089414, + "grad_norm": 0.16933947801589966, + "learning_rate": 5.200264981655116e-06, + "loss": 3.7684, + "step": 111665 + }, + { + "epoch": 7.587308058160076, + "grad_norm": 0.1904747635126114, + "learning_rate": 5.196018480771844e-06, + "loss": 3.7124, + "step": 111670 + }, + { + "epoch": 7.587647778230738, + "grad_norm": 0.17176245152950287, + "learning_rate": 5.1917719798885715e-06, + "loss": 3.8935, + "step": 111675 + }, + { + "epoch": 7.587987498301399, + "grad_norm": 0.15505391359329224, + "learning_rate": 5.1875254790053e-06, + "loss": 3.7265, + "step": 111680 + }, + { + "epoch": 7.588327218372061, + "grad_norm": 0.287440687417984, + "learning_rate": 5.1832789781220276e-06, + "loss": 3.6917, + "step": 111685 + }, + { + "epoch": 7.588666938442723, + "grad_norm": 0.20496521890163422, + "learning_rate": 5.1790324772387556e-06, + "loss": 3.7936, + "step": 111690 + }, + { + "epoch": 7.589006658513385, + "grad_norm": 0.5741974115371704, + "learning_rate": 5.1747859763554836e-06, + "loss": 3.9241, + "step": 111695 + }, + { + "epoch": 7.589346378584047, + "grad_norm": 0.2963028848171234, + "learning_rate": 5.170539475472211e-06, + "loss": 3.6514, + "step": 111700 + }, + { + "epoch": 7.589686098654709, + "grad_norm": 0.1379472017288208, + "learning_rate": 5.166292974588939e-06, + "loss": 3.8053, + "step": 111705 + }, + { + "epoch": 7.59002581872537, + "grad_norm": 2.281475305557251, + "learning_rate": 5.162046473705667e-06, + "loss": 3.7775, + "step": 111710 + }, + { + "epoch": 7.590365538796032, + "grad_norm": 0.40643998980522156, + "learning_rate": 5.157799972822395e-06, + "loss": 3.8426, + "step": 111715 + }, + { + "epoch": 7.590705258866694, + "grad_norm": 0.1866588145494461, + "learning_rate": 5.153553471939123e-06, + "loss": 4.008, + "step": 111720 + }, + { + "epoch": 7.591044978937355, + "grad_norm": 0.16359791159629822, + "learning_rate": 5.14930697105585e-06, + "loss": 3.7285, + "step": 111725 + }, + { + "epoch": 7.591384699008017, + "grad_norm": 0.2429637461900711, + "learning_rate": 5.145060470172578e-06, + "loss": 3.6187, + "step": 111730 + }, + { + "epoch": 7.5917244190786795, + "grad_norm": 0.14483462274074554, + "learning_rate": 5.140813969289306e-06, + "loss": 3.9777, + "step": 111735 + }, + { + "epoch": 7.592064139149341, + "grad_norm": 0.17722490429878235, + "learning_rate": 5.136567468406034e-06, + "loss": 3.8827, + "step": 111740 + }, + { + "epoch": 7.592403859220003, + "grad_norm": 0.2229844182729721, + "learning_rate": 5.132320967522761e-06, + "loss": 3.8232, + "step": 111745 + }, + { + "epoch": 7.592743579290665, + "grad_norm": 0.22346003353595734, + "learning_rate": 5.12807446663949e-06, + "loss": 3.8445, + "step": 111750 + }, + { + "epoch": 7.593083299361326, + "grad_norm": 0.4205736219882965, + "learning_rate": 5.123827965756217e-06, + "loss": 3.7921, + "step": 111755 + }, + { + "epoch": 7.593423019431988, + "grad_norm": 0.18113234639167786, + "learning_rate": 5.119581464872944e-06, + "loss": 3.7839, + "step": 111760 + }, + { + "epoch": 7.59376273950265, + "grad_norm": 0.16621053218841553, + "learning_rate": 5.115334963989673e-06, + "loss": 3.8094, + "step": 111765 + }, + { + "epoch": 7.594102459573311, + "grad_norm": 0.2320358008146286, + "learning_rate": 5.1110884631064e-06, + "loss": 3.7773, + "step": 111770 + }, + { + "epoch": 7.594442179643973, + "grad_norm": 0.41822952032089233, + "learning_rate": 5.106841962223128e-06, + "loss": 3.8549, + "step": 111775 + }, + { + "epoch": 7.5947818997146355, + "grad_norm": 0.15837006270885468, + "learning_rate": 5.102595461339856e-06, + "loss": 3.936, + "step": 111780 + }, + { + "epoch": 7.595121619785297, + "grad_norm": 0.24911858141422272, + "learning_rate": 5.0983489604565835e-06, + "loss": 3.8271, + "step": 111785 + }, + { + "epoch": 7.595461339855959, + "grad_norm": 0.180511474609375, + "learning_rate": 5.094102459573312e-06, + "loss": 3.7541, + "step": 111790 + }, + { + "epoch": 7.595801059926621, + "grad_norm": 0.17271721363067627, + "learning_rate": 5.0898559586900395e-06, + "loss": 3.9033, + "step": 111795 + }, + { + "epoch": 7.596140779997282, + "grad_norm": 1.107615351676941, + "learning_rate": 5.0856094578067675e-06, + "loss": 3.9758, + "step": 111800 + }, + { + "epoch": 7.596480500067944, + "grad_norm": 0.19187548756599426, + "learning_rate": 5.0813629569234955e-06, + "loss": 3.7659, + "step": 111805 + }, + { + "epoch": 7.596820220138606, + "grad_norm": 0.22427047789096832, + "learning_rate": 5.077116456040223e-06, + "loss": 4.0282, + "step": 111810 + }, + { + "epoch": 7.597159940209267, + "grad_norm": 0.5649197697639465, + "learning_rate": 5.072869955156951e-06, + "loss": 3.5912, + "step": 111815 + }, + { + "epoch": 7.597499660279929, + "grad_norm": 0.17374873161315918, + "learning_rate": 5.068623454273679e-06, + "loss": 3.7159, + "step": 111820 + }, + { + "epoch": 7.5978393803505915, + "grad_norm": 0.18248967826366425, + "learning_rate": 5.064376953390407e-06, + "loss": 3.9199, + "step": 111825 + }, + { + "epoch": 7.598179100421253, + "grad_norm": 0.1877526342868805, + "learning_rate": 5.060130452507134e-06, + "loss": 4.0463, + "step": 111830 + }, + { + "epoch": 7.598518820491915, + "grad_norm": 0.14561329782009125, + "learning_rate": 5.055883951623863e-06, + "loss": 4.0378, + "step": 111835 + }, + { + "epoch": 7.598858540562577, + "grad_norm": 0.2917240262031555, + "learning_rate": 5.05163745074059e-06, + "loss": 3.8396, + "step": 111840 + }, + { + "epoch": 7.599198260633238, + "grad_norm": 0.1774270087480545, + "learning_rate": 5.047390949857318e-06, + "loss": 3.7222, + "step": 111845 + }, + { + "epoch": 7.5995379807039, + "grad_norm": 0.20779365301132202, + "learning_rate": 5.043144448974046e-06, + "loss": 3.7302, + "step": 111850 + }, + { + "epoch": 7.599877700774561, + "grad_norm": 0.1831379234790802, + "learning_rate": 5.038897948090773e-06, + "loss": 3.7894, + "step": 111855 + }, + { + "epoch": 7.600217420845223, + "grad_norm": 0.1672278195619583, + "learning_rate": 5.034651447207502e-06, + "loss": 3.8336, + "step": 111860 + }, + { + "epoch": 7.600557140915885, + "grad_norm": 0.2922828197479248, + "learning_rate": 5.030404946324229e-06, + "loss": 3.7821, + "step": 111865 + }, + { + "epoch": 7.600896860986547, + "grad_norm": 0.14970353245735168, + "learning_rate": 5.026158445440956e-06, + "loss": 3.8567, + "step": 111870 + }, + { + "epoch": 7.601236581057209, + "grad_norm": 0.1515895575284958, + "learning_rate": 5.021911944557685e-06, + "loss": 3.5157, + "step": 111875 + }, + { + "epoch": 7.601576301127871, + "grad_norm": 0.16052961349487305, + "learning_rate": 5.017665443674412e-06, + "loss": 3.9102, + "step": 111880 + }, + { + "epoch": 7.601916021198532, + "grad_norm": 0.1449902057647705, + "learning_rate": 5.01341894279114e-06, + "loss": 3.7239, + "step": 111885 + }, + { + "epoch": 7.602255741269194, + "grad_norm": 0.24040456116199493, + "learning_rate": 5.009172441907868e-06, + "loss": 3.88, + "step": 111890 + }, + { + "epoch": 7.602595461339856, + "grad_norm": 0.18820488452911377, + "learning_rate": 5.004925941024595e-06, + "loss": 3.8295, + "step": 111895 + }, + { + "epoch": 7.602935181410517, + "grad_norm": 0.28314515948295593, + "learning_rate": 5.000679440141324e-06, + "loss": 3.763, + "step": 111900 + }, + { + "epoch": 7.603274901481179, + "grad_norm": 0.17950600385665894, + "learning_rate": 4.9964329392580514e-06, + "loss": 3.7705, + "step": 111905 + }, + { + "epoch": 7.603614621551841, + "grad_norm": 0.1741018146276474, + "learning_rate": 4.9921864383747794e-06, + "loss": 3.872, + "step": 111910 + }, + { + "epoch": 7.603954341622503, + "grad_norm": 0.2049492597579956, + "learning_rate": 4.9879399374915074e-06, + "loss": 3.9386, + "step": 111915 + }, + { + "epoch": 7.604294061693165, + "grad_norm": 0.2078702747821808, + "learning_rate": 4.9836934366082355e-06, + "loss": 3.9234, + "step": 111920 + }, + { + "epoch": 7.604633781763827, + "grad_norm": 0.20459574460983276, + "learning_rate": 4.979446935724963e-06, + "loss": 3.8571, + "step": 111925 + }, + { + "epoch": 7.604973501834488, + "grad_norm": 0.17066609859466553, + "learning_rate": 4.975200434841691e-06, + "loss": 3.6562, + "step": 111930 + }, + { + "epoch": 7.60531322190515, + "grad_norm": 0.7626039981842041, + "learning_rate": 4.970953933958419e-06, + "loss": 3.8038, + "step": 111935 + }, + { + "epoch": 7.605652941975812, + "grad_norm": 0.16454628109931946, + "learning_rate": 4.966707433075146e-06, + "loss": 4.0691, + "step": 111940 + }, + { + "epoch": 7.605992662046473, + "grad_norm": 0.19064408540725708, + "learning_rate": 4.962460932191875e-06, + "loss": 3.8547, + "step": 111945 + }, + { + "epoch": 7.606332382117135, + "grad_norm": 0.9388080835342407, + "learning_rate": 4.958214431308602e-06, + "loss": 3.9342, + "step": 111950 + }, + { + "epoch": 7.606672102187797, + "grad_norm": 0.23280595242977142, + "learning_rate": 4.95396793042533e-06, + "loss": 3.949, + "step": 111955 + }, + { + "epoch": 7.607011822258459, + "grad_norm": 0.1413731426000595, + "learning_rate": 4.949721429542058e-06, + "loss": 3.6668, + "step": 111960 + }, + { + "epoch": 7.607351542329121, + "grad_norm": 0.14123861491680145, + "learning_rate": 4.945474928658785e-06, + "loss": 3.7186, + "step": 111965 + }, + { + "epoch": 7.607691262399783, + "grad_norm": 0.15112298727035522, + "learning_rate": 4.941228427775514e-06, + "loss": 3.761, + "step": 111970 + }, + { + "epoch": 7.608030982470444, + "grad_norm": 0.3215044438838959, + "learning_rate": 4.936981926892241e-06, + "loss": 3.9492, + "step": 111975 + }, + { + "epoch": 7.608370702541106, + "grad_norm": 0.1417253613471985, + "learning_rate": 4.932735426008968e-06, + "loss": 3.7435, + "step": 111980 + }, + { + "epoch": 7.608710422611768, + "grad_norm": 0.2186952382326126, + "learning_rate": 4.928488925125697e-06, + "loss": 3.7924, + "step": 111985 + }, + { + "epoch": 7.609050142682429, + "grad_norm": 0.1606956571340561, + "learning_rate": 4.924242424242424e-06, + "loss": 3.8262, + "step": 111990 + }, + { + "epoch": 7.609389862753091, + "grad_norm": 0.1454920619726181, + "learning_rate": 4.919995923359152e-06, + "loss": 3.9888, + "step": 111995 + }, + { + "epoch": 7.6097295828237534, + "grad_norm": 0.20127645134925842, + "learning_rate": 4.91574942247588e-06, + "loss": 3.9867, + "step": 112000 + }, + { + "epoch": 7.610069302894415, + "grad_norm": 0.21779073774814606, + "learning_rate": 4.911502921592608e-06, + "loss": 3.7398, + "step": 112005 + }, + { + "epoch": 7.610409022965077, + "grad_norm": 0.19022023677825928, + "learning_rate": 4.907256420709336e-06, + "loss": 3.9476, + "step": 112010 + }, + { + "epoch": 7.610748743035739, + "grad_norm": 0.35414955019950867, + "learning_rate": 4.903009919826063e-06, + "loss": 3.2975, + "step": 112015 + }, + { + "epoch": 7.6110884631064, + "grad_norm": 0.43864554166793823, + "learning_rate": 4.898763418942791e-06, + "loss": 3.8902, + "step": 112020 + }, + { + "epoch": 7.611428183177062, + "grad_norm": 0.12601162493228912, + "learning_rate": 4.894516918059519e-06, + "loss": 3.9234, + "step": 112025 + }, + { + "epoch": 7.611767903247724, + "grad_norm": 0.14649911224842072, + "learning_rate": 4.890270417176247e-06, + "loss": 3.8438, + "step": 112030 + }, + { + "epoch": 7.612107623318385, + "grad_norm": 0.1739606410264969, + "learning_rate": 4.8860239162929745e-06, + "loss": 3.9328, + "step": 112035 + }, + { + "epoch": 7.612447343389047, + "grad_norm": 0.3429887890815735, + "learning_rate": 4.8817774154097026e-06, + "loss": 3.6081, + "step": 112040 + }, + { + "epoch": 7.6127870634597095, + "grad_norm": 0.153499037027359, + "learning_rate": 4.8775309145264306e-06, + "loss": 3.6422, + "step": 112045 + }, + { + "epoch": 7.613126783530371, + "grad_norm": 0.13926896452903748, + "learning_rate": 4.873284413643158e-06, + "loss": 3.7212, + "step": 112050 + }, + { + "epoch": 7.613466503601033, + "grad_norm": 0.6182361245155334, + "learning_rate": 4.869037912759887e-06, + "loss": 3.9453, + "step": 112055 + }, + { + "epoch": 7.613806223671695, + "grad_norm": 0.19811347126960754, + "learning_rate": 4.864791411876614e-06, + "loss": 3.7426, + "step": 112060 + }, + { + "epoch": 7.614145943742356, + "grad_norm": 0.625853419303894, + "learning_rate": 4.860544910993342e-06, + "loss": 3.6601, + "step": 112065 + }, + { + "epoch": 7.614485663813018, + "grad_norm": 0.1779860407114029, + "learning_rate": 4.85629841011007e-06, + "loss": 3.8011, + "step": 112070 + }, + { + "epoch": 7.61482538388368, + "grad_norm": 0.16911859810352325, + "learning_rate": 4.852051909226797e-06, + "loss": 4.0643, + "step": 112075 + }, + { + "epoch": 7.615165103954341, + "grad_norm": 0.22459936141967773, + "learning_rate": 4.847805408343526e-06, + "loss": 3.819, + "step": 112080 + }, + { + "epoch": 7.615504824025003, + "grad_norm": 0.18051132559776306, + "learning_rate": 4.843558907460253e-06, + "loss": 3.7851, + "step": 112085 + }, + { + "epoch": 7.6158445440956655, + "grad_norm": 0.21853949129581451, + "learning_rate": 4.839312406576981e-06, + "loss": 3.8003, + "step": 112090 + }, + { + "epoch": 7.616184264166327, + "grad_norm": 0.2743057906627655, + "learning_rate": 4.835065905693709e-06, + "loss": 3.9286, + "step": 112095 + }, + { + "epoch": 7.616523984236989, + "grad_norm": 0.18846717476844788, + "learning_rate": 4.830819404810436e-06, + "loss": 4.0413, + "step": 112100 + }, + { + "epoch": 7.616863704307651, + "grad_norm": 0.19461235404014587, + "learning_rate": 4.826572903927164e-06, + "loss": 3.929, + "step": 112105 + }, + { + "epoch": 7.617203424378312, + "grad_norm": 0.1825108379125595, + "learning_rate": 4.822326403043892e-06, + "loss": 3.7725, + "step": 112110 + }, + { + "epoch": 7.617543144448974, + "grad_norm": 5.964142322540283, + "learning_rate": 4.81807990216062e-06, + "loss": 3.8323, + "step": 112115 + }, + { + "epoch": 7.617882864519636, + "grad_norm": 0.14555175602436066, + "learning_rate": 4.813833401277347e-06, + "loss": 3.805, + "step": 112120 + }, + { + "epoch": 7.618222584590297, + "grad_norm": 0.14564844965934753, + "learning_rate": 4.809586900394075e-06, + "loss": 3.9926, + "step": 112125 + }, + { + "epoch": 7.618562304660959, + "grad_norm": 0.29254889488220215, + "learning_rate": 4.805340399510803e-06, + "loss": 3.6791, + "step": 112130 + }, + { + "epoch": 7.6189020247316215, + "grad_norm": 0.22883480787277222, + "learning_rate": 4.801093898627531e-06, + "loss": 3.9847, + "step": 112135 + }, + { + "epoch": 7.619241744802283, + "grad_norm": 0.20132368803024292, + "learning_rate": 4.796847397744259e-06, + "loss": 3.6565, + "step": 112140 + }, + { + "epoch": 7.619581464872945, + "grad_norm": 0.1549759805202484, + "learning_rate": 4.7926008968609865e-06, + "loss": 3.9222, + "step": 112145 + }, + { + "epoch": 7.619921184943607, + "grad_norm": 0.3193751871585846, + "learning_rate": 4.7883543959777145e-06, + "loss": 3.681, + "step": 112150 + }, + { + "epoch": 7.620260905014268, + "grad_norm": 0.18129432201385498, + "learning_rate": 4.7841078950944425e-06, + "loss": 3.8212, + "step": 112155 + }, + { + "epoch": 7.62060062508493, + "grad_norm": 0.16103850305080414, + "learning_rate": 4.77986139421117e-06, + "loss": 3.9212, + "step": 112160 + }, + { + "epoch": 7.620940345155592, + "grad_norm": 0.3238352835178375, + "learning_rate": 4.7756148933278985e-06, + "loss": 3.8173, + "step": 112165 + }, + { + "epoch": 7.621280065226253, + "grad_norm": 0.1878785640001297, + "learning_rate": 4.771368392444626e-06, + "loss": 3.9182, + "step": 112170 + }, + { + "epoch": 7.621619785296915, + "grad_norm": 0.25676557421684265, + "learning_rate": 4.767121891561354e-06, + "loss": 3.9461, + "step": 112175 + }, + { + "epoch": 7.6219595053675775, + "grad_norm": 0.21464121341705322, + "learning_rate": 4.762875390678082e-06, + "loss": 3.793, + "step": 112180 + }, + { + "epoch": 7.622299225438239, + "grad_norm": 0.29325515031814575, + "learning_rate": 4.758628889794809e-06, + "loss": 3.9607, + "step": 112185 + }, + { + "epoch": 7.622638945508901, + "grad_norm": 0.15304593741893768, + "learning_rate": 4.754382388911538e-06, + "loss": 3.8553, + "step": 112190 + }, + { + "epoch": 7.622978665579563, + "grad_norm": 0.1794818639755249, + "learning_rate": 4.750135888028265e-06, + "loss": 3.7516, + "step": 112195 + }, + { + "epoch": 7.623318385650224, + "grad_norm": 0.14576753973960876, + "learning_rate": 4.745889387144993e-06, + "loss": 3.858, + "step": 112200 + }, + { + "epoch": 7.623658105720886, + "grad_norm": 0.15542453527450562, + "learning_rate": 4.741642886261721e-06, + "loss": 3.8547, + "step": 112205 + }, + { + "epoch": 7.623997825791548, + "grad_norm": 0.16040484607219696, + "learning_rate": 4.737396385378448e-06, + "loss": 3.6272, + "step": 112210 + }, + { + "epoch": 7.624337545862209, + "grad_norm": 0.18179775774478912, + "learning_rate": 4.733149884495176e-06, + "loss": 3.9372, + "step": 112215 + }, + { + "epoch": 7.624677265932871, + "grad_norm": 0.18038633465766907, + "learning_rate": 4.728903383611904e-06, + "loss": 3.7486, + "step": 112220 + }, + { + "epoch": 7.6250169860035335, + "grad_norm": 0.1830657720565796, + "learning_rate": 4.724656882728632e-06, + "loss": 3.709, + "step": 112225 + }, + { + "epoch": 7.625356706074195, + "grad_norm": 0.19126927852630615, + "learning_rate": 4.720410381845359e-06, + "loss": 4.0483, + "step": 112230 + }, + { + "epoch": 7.625696426144857, + "grad_norm": 0.14936330914497375, + "learning_rate": 4.716163880962087e-06, + "loss": 3.9051, + "step": 112235 + }, + { + "epoch": 7.626036146215519, + "grad_norm": 0.1604580134153366, + "learning_rate": 4.711917380078815e-06, + "loss": 3.8484, + "step": 112240 + }, + { + "epoch": 7.62637586628618, + "grad_norm": 1.5342497825622559, + "learning_rate": 4.707670879195543e-06, + "loss": 3.9253, + "step": 112245 + }, + { + "epoch": 7.626715586356842, + "grad_norm": 0.16498957574367523, + "learning_rate": 4.703424378312271e-06, + "loss": 3.9789, + "step": 112250 + }, + { + "epoch": 7.627055306427504, + "grad_norm": 0.23153050243854523, + "learning_rate": 4.699177877428998e-06, + "loss": 3.4192, + "step": 112255 + }, + { + "epoch": 7.627395026498165, + "grad_norm": 0.14502039551734924, + "learning_rate": 4.694931376545727e-06, + "loss": 3.8373, + "step": 112260 + }, + { + "epoch": 7.627734746568827, + "grad_norm": 2.236954689025879, + "learning_rate": 4.6906848756624544e-06, + "loss": 3.9227, + "step": 112265 + }, + { + "epoch": 7.6280744666394895, + "grad_norm": 0.25267738103866577, + "learning_rate": 4.686438374779182e-06, + "loss": 3.745, + "step": 112270 + }, + { + "epoch": 7.628414186710151, + "grad_norm": 0.17772354185581207, + "learning_rate": 4.6821918738959104e-06, + "loss": 3.7648, + "step": 112275 + }, + { + "epoch": 7.628753906780813, + "grad_norm": 0.47055965662002563, + "learning_rate": 4.677945373012638e-06, + "loss": 3.9878, + "step": 112280 + }, + { + "epoch": 7.629093626851475, + "grad_norm": 1.250939130783081, + "learning_rate": 4.673698872129366e-06, + "loss": 3.9757, + "step": 112285 + }, + { + "epoch": 7.629433346922136, + "grad_norm": 0.16023394465446472, + "learning_rate": 4.669452371246094e-06, + "loss": 3.7039, + "step": 112290 + }, + { + "epoch": 7.629773066992798, + "grad_norm": 0.19301331043243408, + "learning_rate": 4.665205870362821e-06, + "loss": 3.9157, + "step": 112295 + }, + { + "epoch": 7.63011278706346, + "grad_norm": 0.19790953397750854, + "learning_rate": 4.660959369479549e-06, + "loss": 4.0678, + "step": 112300 + }, + { + "epoch": 7.630452507134121, + "grad_norm": 0.16698846220970154, + "learning_rate": 4.656712868596277e-06, + "loss": 3.7656, + "step": 112305 + }, + { + "epoch": 7.6307922272047835, + "grad_norm": 0.2883625328540802, + "learning_rate": 4.652466367713005e-06, + "loss": 3.7844, + "step": 112310 + }, + { + "epoch": 7.6311319472754455, + "grad_norm": 0.2091221660375595, + "learning_rate": 4.648219866829733e-06, + "loss": 3.8808, + "step": 112315 + }, + { + "epoch": 7.631471667346107, + "grad_norm": 0.19865016639232635, + "learning_rate": 4.64397336594646e-06, + "loss": 3.9383, + "step": 112320 + }, + { + "epoch": 7.631811387416769, + "grad_norm": 0.15896528959274292, + "learning_rate": 4.639726865063188e-06, + "loss": 3.8055, + "step": 112325 + }, + { + "epoch": 7.63215110748743, + "grad_norm": 0.16417719423770905, + "learning_rate": 4.635480364179916e-06, + "loss": 3.9431, + "step": 112330 + }, + { + "epoch": 7.632490827558092, + "grad_norm": 0.1721331626176834, + "learning_rate": 4.631233863296644e-06, + "loss": 3.8692, + "step": 112335 + }, + { + "epoch": 7.632830547628754, + "grad_norm": 0.17144207656383514, + "learning_rate": 4.626987362413371e-06, + "loss": 3.8144, + "step": 112340 + }, + { + "epoch": 7.633170267699415, + "grad_norm": 0.1836005449295044, + "learning_rate": 4.6227408615301e-06, + "loss": 3.8618, + "step": 112345 + }, + { + "epoch": 7.633509987770077, + "grad_norm": 0.5575776100158691, + "learning_rate": 4.618494360646827e-06, + "loss": 3.9154, + "step": 112350 + }, + { + "epoch": 7.6338497078407395, + "grad_norm": 0.15837261080741882, + "learning_rate": 4.614247859763555e-06, + "loss": 3.8203, + "step": 112355 + }, + { + "epoch": 7.634189427911401, + "grad_norm": 0.21588055789470673, + "learning_rate": 4.610001358880283e-06, + "loss": 3.903, + "step": 112360 + }, + { + "epoch": 7.634529147982063, + "grad_norm": 0.16607460379600525, + "learning_rate": 4.60575485799701e-06, + "loss": 3.5728, + "step": 112365 + }, + { + "epoch": 7.634868868052725, + "grad_norm": 0.15271775424480438, + "learning_rate": 4.601508357113739e-06, + "loss": 3.9376, + "step": 112370 + }, + { + "epoch": 7.635208588123386, + "grad_norm": 0.15529239177703857, + "learning_rate": 4.597261856230466e-06, + "loss": 3.909, + "step": 112375 + }, + { + "epoch": 7.635548308194048, + "grad_norm": 0.16654279828071594, + "learning_rate": 4.5930153553471935e-06, + "loss": 3.8917, + "step": 112380 + }, + { + "epoch": 7.63588802826471, + "grad_norm": 0.23654666543006897, + "learning_rate": 4.588768854463922e-06, + "loss": 3.689, + "step": 112385 + }, + { + "epoch": 7.636227748335371, + "grad_norm": 0.19746558368206024, + "learning_rate": 4.5845223535806495e-06, + "loss": 3.7253, + "step": 112390 + }, + { + "epoch": 7.636567468406033, + "grad_norm": 0.1693328469991684, + "learning_rate": 4.5802758526973775e-06, + "loss": 3.9713, + "step": 112395 + }, + { + "epoch": 7.6369071884766955, + "grad_norm": 0.2000347077846527, + "learning_rate": 4.5760293518141056e-06, + "loss": 3.6668, + "step": 112400 + }, + { + "epoch": 7.637246908547357, + "grad_norm": 0.20775751769542694, + "learning_rate": 4.571782850930833e-06, + "loss": 3.9876, + "step": 112405 + }, + { + "epoch": 7.637586628618019, + "grad_norm": 1.0755976438522339, + "learning_rate": 4.567536350047561e-06, + "loss": 3.7702, + "step": 112410 + }, + { + "epoch": 7.637926348688681, + "grad_norm": 0.1628975123167038, + "learning_rate": 4.563289849164289e-06, + "loss": 3.7773, + "step": 112415 + }, + { + "epoch": 7.638266068759342, + "grad_norm": 0.6423285007476807, + "learning_rate": 4.559043348281017e-06, + "loss": 3.9707, + "step": 112420 + }, + { + "epoch": 7.638605788830004, + "grad_norm": 0.20500460267066956, + "learning_rate": 4.554796847397745e-06, + "loss": 3.8497, + "step": 112425 + }, + { + "epoch": 7.638945508900666, + "grad_norm": 0.18472369015216827, + "learning_rate": 4.550550346514473e-06, + "loss": 3.8792, + "step": 112430 + }, + { + "epoch": 7.639285228971327, + "grad_norm": 0.15907327830791473, + "learning_rate": 4.5463038456312e-06, + "loss": 3.741, + "step": 112435 + }, + { + "epoch": 7.639624949041989, + "grad_norm": 0.16895656287670135, + "learning_rate": 4.542057344747928e-06, + "loss": 3.8677, + "step": 112440 + }, + { + "epoch": 7.6399646691126515, + "grad_norm": 0.19149094820022583, + "learning_rate": 4.537810843864656e-06, + "loss": 3.9252, + "step": 112445 + }, + { + "epoch": 7.640304389183313, + "grad_norm": 0.23944619297981262, + "learning_rate": 4.533564342981383e-06, + "loss": 4.0667, + "step": 112450 + }, + { + "epoch": 7.640644109253975, + "grad_norm": 0.1683572679758072, + "learning_rate": 4.529317842098112e-06, + "loss": 3.6689, + "step": 112455 + }, + { + "epoch": 7.640983829324637, + "grad_norm": 0.1805519461631775, + "learning_rate": 4.525071341214839e-06, + "loss": 3.7463, + "step": 112460 + }, + { + "epoch": 7.641323549395298, + "grad_norm": 0.1885574907064438, + "learning_rate": 4.520824840331567e-06, + "loss": 3.8632, + "step": 112465 + }, + { + "epoch": 7.64166326946596, + "grad_norm": 0.1752198338508606, + "learning_rate": 4.516578339448295e-06, + "loss": 3.8487, + "step": 112470 + }, + { + "epoch": 7.642002989536622, + "grad_norm": 0.6891939640045166, + "learning_rate": 4.512331838565022e-06, + "loss": 3.9257, + "step": 112475 + }, + { + "epoch": 7.642342709607283, + "grad_norm": 0.16077211499214172, + "learning_rate": 4.50808533768175e-06, + "loss": 3.8558, + "step": 112480 + }, + { + "epoch": 7.642682429677945, + "grad_norm": 0.20882941782474518, + "learning_rate": 4.503838836798478e-06, + "loss": 3.993, + "step": 112485 + }, + { + "epoch": 7.6430221497486075, + "grad_norm": 0.3539291322231293, + "learning_rate": 4.4995923359152055e-06, + "loss": 3.5475, + "step": 112490 + }, + { + "epoch": 7.643361869819269, + "grad_norm": 0.21934092044830322, + "learning_rate": 4.495345835031934e-06, + "loss": 3.8378, + "step": 112495 + }, + { + "epoch": 7.643701589889931, + "grad_norm": 0.14399443566799164, + "learning_rate": 4.4910993341486615e-06, + "loss": 4.0352, + "step": 112500 + }, + { + "epoch": 7.644041309960593, + "grad_norm": 0.1698325127363205, + "learning_rate": 4.4868528332653895e-06, + "loss": 3.9351, + "step": 112505 + }, + { + "epoch": 7.644381030031254, + "grad_norm": 0.12938565015792847, + "learning_rate": 4.4826063323821175e-06, + "loss": 3.7911, + "step": 112510 + }, + { + "epoch": 7.644720750101916, + "grad_norm": 0.13942435383796692, + "learning_rate": 4.4783598314988455e-06, + "loss": 3.9177, + "step": 112515 + }, + { + "epoch": 7.645060470172578, + "grad_norm": 1.10919988155365, + "learning_rate": 4.474113330615573e-06, + "loss": 3.9733, + "step": 112520 + }, + { + "epoch": 7.645400190243239, + "grad_norm": 0.22628824412822723, + "learning_rate": 4.469866829732301e-06, + "loss": 3.9585, + "step": 112525 + }, + { + "epoch": 7.645739910313901, + "grad_norm": 0.23578521609306335, + "learning_rate": 4.465620328849029e-06, + "loss": 3.8921, + "step": 112530 + }, + { + "epoch": 7.6460796303845635, + "grad_norm": 0.13409291207790375, + "learning_rate": 4.461373827965757e-06, + "loss": 3.903, + "step": 112535 + }, + { + "epoch": 7.646419350455225, + "grad_norm": 0.156231090426445, + "learning_rate": 4.457127327082485e-06, + "loss": 3.5141, + "step": 112540 + }, + { + "epoch": 7.646759070525887, + "grad_norm": 0.21103398501873016, + "learning_rate": 4.452880826199212e-06, + "loss": 3.8464, + "step": 112545 + }, + { + "epoch": 7.647098790596548, + "grad_norm": 0.18049533665180206, + "learning_rate": 4.44863432531594e-06, + "loss": 3.9791, + "step": 112550 + }, + { + "epoch": 7.64743851066721, + "grad_norm": 0.2010955810546875, + "learning_rate": 4.444387824432668e-06, + "loss": 3.6425, + "step": 112555 + }, + { + "epoch": 7.647778230737872, + "grad_norm": 0.14536675810813904, + "learning_rate": 4.440141323549395e-06, + "loss": 3.8451, + "step": 112560 + }, + { + "epoch": 7.648117950808533, + "grad_norm": 0.22105289995670319, + "learning_rate": 4.435894822666124e-06, + "loss": 3.6254, + "step": 112565 + }, + { + "epoch": 7.648457670879195, + "grad_norm": 0.15674102306365967, + "learning_rate": 4.431648321782851e-06, + "loss": 3.7496, + "step": 112570 + }, + { + "epoch": 7.6487973909498574, + "grad_norm": 0.18154945969581604, + "learning_rate": 4.427401820899579e-06, + "loss": 3.8489, + "step": 112575 + }, + { + "epoch": 7.649137111020519, + "grad_norm": 0.2011827528476715, + "learning_rate": 4.423155320016307e-06, + "loss": 3.8335, + "step": 112580 + }, + { + "epoch": 7.649476831091181, + "grad_norm": 0.1542072594165802, + "learning_rate": 4.418908819133034e-06, + "loss": 3.8757, + "step": 112585 + }, + { + "epoch": 7.649816551161843, + "grad_norm": 2.325204849243164, + "learning_rate": 4.414662318249762e-06, + "loss": 4.0169, + "step": 112590 + }, + { + "epoch": 7.650156271232504, + "grad_norm": 0.14286665618419647, + "learning_rate": 4.41041581736649e-06, + "loss": 3.9416, + "step": 112595 + }, + { + "epoch": 7.650495991303166, + "grad_norm": 0.1596774458885193, + "learning_rate": 4.406169316483218e-06, + "loss": 3.7235, + "step": 112600 + }, + { + "epoch": 7.650835711373828, + "grad_norm": 0.21439257264137268, + "learning_rate": 4.401922815599946e-06, + "loss": 4.0328, + "step": 112605 + }, + { + "epoch": 7.651175431444489, + "grad_norm": 0.13509149849414825, + "learning_rate": 4.397676314716673e-06, + "loss": 3.662, + "step": 112610 + }, + { + "epoch": 7.651515151515151, + "grad_norm": 0.1405085176229477, + "learning_rate": 4.393429813833401e-06, + "loss": 3.9621, + "step": 112615 + }, + { + "epoch": 7.6518548715858135, + "grad_norm": 0.18438096344470978, + "learning_rate": 4.3891833129501294e-06, + "loss": 3.5531, + "step": 112620 + }, + { + "epoch": 7.652194591656475, + "grad_norm": 0.20645089447498322, + "learning_rate": 4.3849368120668574e-06, + "loss": 3.7811, + "step": 112625 + }, + { + "epoch": 7.652534311727137, + "grad_norm": 0.1983444094657898, + "learning_rate": 4.380690311183585e-06, + "loss": 3.9289, + "step": 112630 + }, + { + "epoch": 7.652874031797799, + "grad_norm": 0.22488422691822052, + "learning_rate": 4.376443810300313e-06, + "loss": 3.7082, + "step": 112635 + }, + { + "epoch": 7.65321375186846, + "grad_norm": 0.15863251686096191, + "learning_rate": 4.372197309417041e-06, + "loss": 3.9903, + "step": 112640 + }, + { + "epoch": 7.653553471939122, + "grad_norm": 0.21009503304958344, + "learning_rate": 4.367950808533769e-06, + "loss": 3.766, + "step": 112645 + }, + { + "epoch": 7.653893192009784, + "grad_norm": 0.16529709100723267, + "learning_rate": 4.363704307650497e-06, + "loss": 3.8951, + "step": 112650 + }, + { + "epoch": 7.654232912080445, + "grad_norm": 0.1815108358860016, + "learning_rate": 4.359457806767224e-06, + "loss": 3.7812, + "step": 112655 + }, + { + "epoch": 7.654572632151107, + "grad_norm": 0.19644582271575928, + "learning_rate": 4.355211305883952e-06, + "loss": 3.8687, + "step": 112660 + }, + { + "epoch": 7.6549123522217695, + "grad_norm": 0.22166383266448975, + "learning_rate": 4.35096480500068e-06, + "loss": 3.749, + "step": 112665 + }, + { + "epoch": 7.655252072292431, + "grad_norm": 0.1870785802602768, + "learning_rate": 4.346718304117407e-06, + "loss": 3.6459, + "step": 112670 + }, + { + "epoch": 7.655591792363093, + "grad_norm": 0.22750510275363922, + "learning_rate": 4.342471803234136e-06, + "loss": 3.9401, + "step": 112675 + }, + { + "epoch": 7.655931512433755, + "grad_norm": 0.6421526074409485, + "learning_rate": 4.338225302350863e-06, + "loss": 3.8405, + "step": 112680 + }, + { + "epoch": 7.656271232504416, + "grad_norm": 0.2162836790084839, + "learning_rate": 4.333978801467591e-06, + "loss": 3.7906, + "step": 112685 + }, + { + "epoch": 7.656610952575078, + "grad_norm": 0.16175201535224915, + "learning_rate": 4.329732300584319e-06, + "loss": 3.861, + "step": 112690 + }, + { + "epoch": 7.65695067264574, + "grad_norm": 0.22940634191036224, + "learning_rate": 4.325485799701046e-06, + "loss": 3.8338, + "step": 112695 + }, + { + "epoch": 7.657290392716401, + "grad_norm": 0.16027942299842834, + "learning_rate": 4.321239298817774e-06, + "loss": 3.7956, + "step": 112700 + }, + { + "epoch": 7.657630112787063, + "grad_norm": 0.17709994316101074, + "learning_rate": 4.316992797934502e-06, + "loss": 3.7192, + "step": 112705 + }, + { + "epoch": 7.6579698328577255, + "grad_norm": 0.17877064645290375, + "learning_rate": 4.31274629705123e-06, + "loss": 3.8689, + "step": 112710 + }, + { + "epoch": 7.658309552928387, + "grad_norm": 0.1755998134613037, + "learning_rate": 4.308499796167958e-06, + "loss": 4.1019, + "step": 112715 + }, + { + "epoch": 7.658649272999049, + "grad_norm": 0.2188301682472229, + "learning_rate": 4.304253295284685e-06, + "loss": 4.1784, + "step": 112720 + }, + { + "epoch": 7.658988993069711, + "grad_norm": 0.16382157802581787, + "learning_rate": 4.300006794401413e-06, + "loss": 3.7284, + "step": 112725 + }, + { + "epoch": 7.659328713140372, + "grad_norm": 0.17089605331420898, + "learning_rate": 4.295760293518141e-06, + "loss": 3.7665, + "step": 112730 + }, + { + "epoch": 7.659668433211034, + "grad_norm": 0.1656363606452942, + "learning_rate": 4.291513792634869e-06, + "loss": 3.8384, + "step": 112735 + }, + { + "epoch": 7.660008153281696, + "grad_norm": 0.1561303287744522, + "learning_rate": 4.2872672917515965e-06, + "loss": 3.9768, + "step": 112740 + }, + { + "epoch": 7.660347873352357, + "grad_norm": 1.3002275228500366, + "learning_rate": 4.2830207908683245e-06, + "loss": 3.9683, + "step": 112745 + }, + { + "epoch": 7.660687593423019, + "grad_norm": 0.16499564051628113, + "learning_rate": 4.2787742899850525e-06, + "loss": 3.8498, + "step": 112750 + }, + { + "epoch": 7.6610273134936815, + "grad_norm": 0.19151848554611206, + "learning_rate": 4.2745277891017806e-06, + "loss": 3.9508, + "step": 112755 + }, + { + "epoch": 7.661367033564343, + "grad_norm": 0.39123377203941345, + "learning_rate": 4.2702812882185086e-06, + "loss": 3.5882, + "step": 112760 + }, + { + "epoch": 7.661706753635005, + "grad_norm": 0.18014025688171387, + "learning_rate": 4.266034787335236e-06, + "loss": 3.674, + "step": 112765 + }, + { + "epoch": 7.662046473705667, + "grad_norm": 0.14231112599372864, + "learning_rate": 4.261788286451964e-06, + "loss": 3.8527, + "step": 112770 + }, + { + "epoch": 7.662386193776328, + "grad_norm": 0.14810360968112946, + "learning_rate": 4.257541785568692e-06, + "loss": 3.9698, + "step": 112775 + }, + { + "epoch": 7.66272591384699, + "grad_norm": 0.1685314178466797, + "learning_rate": 4.253295284685419e-06, + "loss": 3.8013, + "step": 112780 + }, + { + "epoch": 7.663065633917652, + "grad_norm": 0.41699159145355225, + "learning_rate": 4.249048783802148e-06, + "loss": 3.746, + "step": 112785 + }, + { + "epoch": 7.663405353988313, + "grad_norm": 0.15190021693706512, + "learning_rate": 4.244802282918875e-06, + "loss": 3.8438, + "step": 112790 + }, + { + "epoch": 7.663745074058975, + "grad_norm": 0.1577160358428955, + "learning_rate": 4.240555782035603e-06, + "loss": 3.8214, + "step": 112795 + }, + { + "epoch": 7.6640847941296375, + "grad_norm": 0.15603110194206238, + "learning_rate": 4.236309281152331e-06, + "loss": 3.856, + "step": 112800 + }, + { + "epoch": 7.664424514200299, + "grad_norm": 0.16551455855369568, + "learning_rate": 4.232062780269058e-06, + "loss": 4.0182, + "step": 112805 + }, + { + "epoch": 7.664764234270961, + "grad_norm": 0.1836560070514679, + "learning_rate": 4.227816279385786e-06, + "loss": 3.6576, + "step": 112810 + }, + { + "epoch": 7.665103954341623, + "grad_norm": 0.193220317363739, + "learning_rate": 4.223569778502514e-06, + "loss": 4.0536, + "step": 112815 + }, + { + "epoch": 7.665443674412284, + "grad_norm": 0.18294566869735718, + "learning_rate": 4.219323277619242e-06, + "loss": 3.8933, + "step": 112820 + }, + { + "epoch": 7.665783394482946, + "grad_norm": 0.16774219274520874, + "learning_rate": 4.21507677673597e-06, + "loss": 3.7971, + "step": 112825 + }, + { + "epoch": 7.666123114553608, + "grad_norm": 0.16584840416908264, + "learning_rate": 4.210830275852697e-06, + "loss": 3.5809, + "step": 112830 + }, + { + "epoch": 7.666462834624269, + "grad_norm": 0.1737363338470459, + "learning_rate": 4.206583774969425e-06, + "loss": 3.9105, + "step": 112835 + }, + { + "epoch": 7.666802554694931, + "grad_norm": 0.152685284614563, + "learning_rate": 4.202337274086153e-06, + "loss": 4.0556, + "step": 112840 + }, + { + "epoch": 7.6671422747655935, + "grad_norm": 0.16127218306064606, + "learning_rate": 4.198090773202881e-06, + "loss": 3.8345, + "step": 112845 + }, + { + "epoch": 7.667481994836255, + "grad_norm": 0.1596374362707138, + "learning_rate": 4.1938442723196085e-06, + "loss": 3.7513, + "step": 112850 + }, + { + "epoch": 7.667821714906917, + "grad_norm": 0.49754247069358826, + "learning_rate": 4.189597771436337e-06, + "loss": 3.6264, + "step": 112855 + }, + { + "epoch": 7.668161434977579, + "grad_norm": 0.18759575486183167, + "learning_rate": 4.1853512705530645e-06, + "loss": 3.8715, + "step": 112860 + }, + { + "epoch": 7.66850115504824, + "grad_norm": 0.2048327475786209, + "learning_rate": 4.181104769669792e-06, + "loss": 3.8978, + "step": 112865 + }, + { + "epoch": 7.668840875118902, + "grad_norm": 0.2145974189043045, + "learning_rate": 4.1768582687865205e-06, + "loss": 3.7772, + "step": 112870 + }, + { + "epoch": 7.669180595189564, + "grad_norm": 0.17371805012226105, + "learning_rate": 4.172611767903248e-06, + "loss": 3.8382, + "step": 112875 + }, + { + "epoch": 7.669520315260225, + "grad_norm": 0.331630140542984, + "learning_rate": 4.168365267019976e-06, + "loss": 3.812, + "step": 112880 + }, + { + "epoch": 7.6698600353308874, + "grad_norm": 0.21062220633029938, + "learning_rate": 4.164118766136704e-06, + "loss": 3.8504, + "step": 112885 + }, + { + "epoch": 7.6701997554015495, + "grad_norm": 1.1126943826675415, + "learning_rate": 4.159872265253431e-06, + "loss": 3.8715, + "step": 112890 + }, + { + "epoch": 7.670539475472211, + "grad_norm": 0.18315057456493378, + "learning_rate": 4.15562576437016e-06, + "loss": 3.9758, + "step": 112895 + }, + { + "epoch": 7.670879195542873, + "grad_norm": 1.253699779510498, + "learning_rate": 4.151379263486887e-06, + "loss": 3.9668, + "step": 112900 + }, + { + "epoch": 7.671218915613535, + "grad_norm": 0.1566227227449417, + "learning_rate": 4.147132762603615e-06, + "loss": 3.5182, + "step": 112905 + }, + { + "epoch": 7.671558635684196, + "grad_norm": 0.18023449182510376, + "learning_rate": 4.142886261720343e-06, + "loss": 3.7741, + "step": 112910 + }, + { + "epoch": 7.671898355754858, + "grad_norm": 0.16771793365478516, + "learning_rate": 4.13863976083707e-06, + "loss": 4.0208, + "step": 112915 + }, + { + "epoch": 7.67223807582552, + "grad_norm": 0.1612756997346878, + "learning_rate": 4.134393259953798e-06, + "loss": 3.9887, + "step": 112920 + }, + { + "epoch": 7.672577795896181, + "grad_norm": 0.21739701926708221, + "learning_rate": 4.130146759070526e-06, + "loss": 3.8498, + "step": 112925 + }, + { + "epoch": 7.6729175159668435, + "grad_norm": 0.18420769274234772, + "learning_rate": 4.125900258187254e-06, + "loss": 3.7408, + "step": 112930 + }, + { + "epoch": 7.6732572360375055, + "grad_norm": 0.19716133177280426, + "learning_rate": 4.121653757303982e-06, + "loss": 3.6634, + "step": 112935 + }, + { + "epoch": 7.673596956108167, + "grad_norm": 0.17334727942943573, + "learning_rate": 4.11740725642071e-06, + "loss": 3.7214, + "step": 112940 + }, + { + "epoch": 7.673936676178829, + "grad_norm": 0.17919424176216125, + "learning_rate": 4.113160755537437e-06, + "loss": 3.5036, + "step": 112945 + }, + { + "epoch": 7.674276396249491, + "grad_norm": 0.2126038372516632, + "learning_rate": 4.108914254654165e-06, + "loss": 3.9933, + "step": 112950 + }, + { + "epoch": 7.674616116320152, + "grad_norm": 0.29586032032966614, + "learning_rate": 4.104667753770893e-06, + "loss": 3.8209, + "step": 112955 + }, + { + "epoch": 7.674955836390814, + "grad_norm": 0.19878436625003815, + "learning_rate": 4.10042125288762e-06, + "loss": 3.8943, + "step": 112960 + }, + { + "epoch": 7.675295556461476, + "grad_norm": 0.16746924817562103, + "learning_rate": 4.096174752004349e-06, + "loss": 3.9021, + "step": 112965 + }, + { + "epoch": 7.675635276532137, + "grad_norm": 0.3257145583629608, + "learning_rate": 4.091928251121076e-06, + "loss": 3.7079, + "step": 112970 + }, + { + "epoch": 7.6759749966027995, + "grad_norm": 0.18458236753940582, + "learning_rate": 4.0876817502378036e-06, + "loss": 3.5273, + "step": 112975 + }, + { + "epoch": 7.6763147166734615, + "grad_norm": 0.24448229372501373, + "learning_rate": 4.0834352493545324e-06, + "loss": 3.9918, + "step": 112980 + }, + { + "epoch": 7.676654436744123, + "grad_norm": 0.1945875883102417, + "learning_rate": 4.07918874847126e-06, + "loss": 3.7353, + "step": 112985 + }, + { + "epoch": 7.676994156814785, + "grad_norm": 0.24353887140750885, + "learning_rate": 4.074942247587988e-06, + "loss": 3.8034, + "step": 112990 + }, + { + "epoch": 7.677333876885447, + "grad_norm": 0.22707082331180573, + "learning_rate": 4.070695746704716e-06, + "loss": 3.8609, + "step": 112995 + }, + { + "epoch": 7.677673596956108, + "grad_norm": 0.17009630799293518, + "learning_rate": 4.066449245821443e-06, + "loss": 3.6988, + "step": 113000 + }, + { + "epoch": 7.67801331702677, + "grad_norm": 0.16839581727981567, + "learning_rate": 4.062202744938172e-06, + "loss": 3.916, + "step": 113005 + }, + { + "epoch": 7.678353037097431, + "grad_norm": 0.1791629046201706, + "learning_rate": 4.057956244054899e-06, + "loss": 3.6674, + "step": 113010 + }, + { + "epoch": 7.678692757168093, + "grad_norm": 0.16331544518470764, + "learning_rate": 4.053709743171627e-06, + "loss": 3.8644, + "step": 113015 + }, + { + "epoch": 7.6790324772387555, + "grad_norm": 0.1914084255695343, + "learning_rate": 4.049463242288355e-06, + "loss": 3.7463, + "step": 113020 + }, + { + "epoch": 7.679372197309417, + "grad_norm": 0.1671179085969925, + "learning_rate": 4.045216741405083e-06, + "loss": 3.8421, + "step": 113025 + }, + { + "epoch": 7.679711917380079, + "grad_norm": 0.13930678367614746, + "learning_rate": 4.04097024052181e-06, + "loss": 3.8304, + "step": 113030 + }, + { + "epoch": 7.680051637450741, + "grad_norm": 0.2959902286529541, + "learning_rate": 4.036723739638538e-06, + "loss": 3.9141, + "step": 113035 + }, + { + "epoch": 7.680391357521402, + "grad_norm": 0.16029483079910278, + "learning_rate": 4.032477238755266e-06, + "loss": 3.8851, + "step": 113040 + }, + { + "epoch": 7.680731077592064, + "grad_norm": 0.18072949349880219, + "learning_rate": 4.028230737871993e-06, + "loss": 3.8223, + "step": 113045 + }, + { + "epoch": 7.681070797662726, + "grad_norm": 0.16906820237636566, + "learning_rate": 4.023984236988722e-06, + "loss": 3.8267, + "step": 113050 + }, + { + "epoch": 7.681410517733387, + "grad_norm": 0.18109478056430817, + "learning_rate": 4.019737736105449e-06, + "loss": 3.7861, + "step": 113055 + }, + { + "epoch": 7.681750237804049, + "grad_norm": 0.182962104678154, + "learning_rate": 4.015491235222177e-06, + "loss": 3.8617, + "step": 113060 + }, + { + "epoch": 7.6820899578747115, + "grad_norm": 0.14733624458312988, + "learning_rate": 4.011244734338905e-06, + "loss": 3.7857, + "step": 113065 + }, + { + "epoch": 7.682429677945373, + "grad_norm": 0.23654374480247498, + "learning_rate": 4.006998233455632e-06, + "loss": 4.0836, + "step": 113070 + }, + { + "epoch": 7.682769398016035, + "grad_norm": 0.19624260067939758, + "learning_rate": 4.002751732572361e-06, + "loss": 3.8387, + "step": 113075 + }, + { + "epoch": 7.683109118086697, + "grad_norm": 0.18716159462928772, + "learning_rate": 3.998505231689088e-06, + "loss": 3.7443, + "step": 113080 + }, + { + "epoch": 7.683448838157358, + "grad_norm": 0.15375103056430817, + "learning_rate": 3.9942587308058155e-06, + "loss": 4.0035, + "step": 113085 + }, + { + "epoch": 7.68378855822802, + "grad_norm": 0.17333278059959412, + "learning_rate": 3.990012229922544e-06, + "loss": 3.7135, + "step": 113090 + }, + { + "epoch": 7.684128278298682, + "grad_norm": 0.17714330554008484, + "learning_rate": 3.9857657290392715e-06, + "loss": 3.9928, + "step": 113095 + }, + { + "epoch": 7.684467998369343, + "grad_norm": 0.15972134470939636, + "learning_rate": 3.9815192281559995e-06, + "loss": 3.599, + "step": 113100 + }, + { + "epoch": 7.684807718440005, + "grad_norm": 0.3812430500984192, + "learning_rate": 3.9772727272727275e-06, + "loss": 3.8118, + "step": 113105 + }, + { + "epoch": 7.6851474385106675, + "grad_norm": 0.14450514316558838, + "learning_rate": 3.9730262263894555e-06, + "loss": 3.6021, + "step": 113110 + }, + { + "epoch": 7.685487158581329, + "grad_norm": 0.22237437963485718, + "learning_rate": 3.9687797255061836e-06, + "loss": 3.8193, + "step": 113115 + }, + { + "epoch": 7.685826878651991, + "grad_norm": 0.152293398976326, + "learning_rate": 3.964533224622911e-06, + "loss": 3.6645, + "step": 113120 + }, + { + "epoch": 7.686166598722653, + "grad_norm": 0.16648456454277039, + "learning_rate": 3.960286723739639e-06, + "loss": 3.7506, + "step": 113125 + }, + { + "epoch": 7.686506318793314, + "grad_norm": 0.1711946576833725, + "learning_rate": 3.956040222856367e-06, + "loss": 3.6781, + "step": 113130 + }, + { + "epoch": 7.686846038863976, + "grad_norm": 0.3052242696285248, + "learning_rate": 3.951793721973095e-06, + "loss": 3.6381, + "step": 113135 + }, + { + "epoch": 7.687185758934638, + "grad_norm": 0.1699400246143341, + "learning_rate": 3.947547221089822e-06, + "loss": 3.8245, + "step": 113140 + }, + { + "epoch": 7.687525479005299, + "grad_norm": 0.16637267172336578, + "learning_rate": 3.94330072020655e-06, + "loss": 3.7611, + "step": 113145 + }, + { + "epoch": 7.687865199075961, + "grad_norm": 0.34994831681251526, + "learning_rate": 3.939054219323278e-06, + "loss": 4.0331, + "step": 113150 + }, + { + "epoch": 7.6882049191466235, + "grad_norm": 0.1587536334991455, + "learning_rate": 3.934807718440005e-06, + "loss": 3.9358, + "step": 113155 + }, + { + "epoch": 7.688544639217285, + "grad_norm": 0.1822309046983719, + "learning_rate": 3.930561217556734e-06, + "loss": 3.7742, + "step": 113160 + }, + { + "epoch": 7.688884359287947, + "grad_norm": 0.2727403938770294, + "learning_rate": 3.926314716673461e-06, + "loss": 3.6586, + "step": 113165 + }, + { + "epoch": 7.689224079358609, + "grad_norm": 0.18971753120422363, + "learning_rate": 3.922068215790189e-06, + "loss": 3.6635, + "step": 113170 + }, + { + "epoch": 7.68956379942927, + "grad_norm": 0.1896367073059082, + "learning_rate": 3.917821714906917e-06, + "loss": 3.8967, + "step": 113175 + }, + { + "epoch": 7.689903519499932, + "grad_norm": 0.1556480973958969, + "learning_rate": 3.913575214023644e-06, + "loss": 3.9357, + "step": 113180 + }, + { + "epoch": 7.690243239570594, + "grad_norm": 0.20665527880191803, + "learning_rate": 3.909328713140373e-06, + "loss": 3.8757, + "step": 113185 + }, + { + "epoch": 7.690582959641255, + "grad_norm": 0.15355284512043, + "learning_rate": 3.9050822122571e-06, + "loss": 3.7939, + "step": 113190 + }, + { + "epoch": 7.6909226797119175, + "grad_norm": 0.2096128612756729, + "learning_rate": 3.9008357113738274e-06, + "loss": 3.6937, + "step": 113195 + }, + { + "epoch": 7.6912623997825795, + "grad_norm": 0.23403924703598022, + "learning_rate": 3.896589210490556e-06, + "loss": 3.6141, + "step": 113200 + }, + { + "epoch": 7.691602119853241, + "grad_norm": 0.4181040823459625, + "learning_rate": 3.8923427096072835e-06, + "loss": 3.6442, + "step": 113205 + }, + { + "epoch": 7.691941839923903, + "grad_norm": 0.3271825313568115, + "learning_rate": 3.8880962087240115e-06, + "loss": 3.7402, + "step": 113210 + }, + { + "epoch": 7.692281559994565, + "grad_norm": 0.17580007016658783, + "learning_rate": 3.8838497078407395e-06, + "loss": 3.8647, + "step": 113215 + }, + { + "epoch": 7.692621280065226, + "grad_norm": 0.20257815718650818, + "learning_rate": 3.8796032069574675e-06, + "loss": 3.9185, + "step": 113220 + }, + { + "epoch": 7.692961000135888, + "grad_norm": 0.26233577728271484, + "learning_rate": 3.8753567060741955e-06, + "loss": 3.8785, + "step": 113225 + }, + { + "epoch": 7.693300720206549, + "grad_norm": 0.1592886596918106, + "learning_rate": 3.871110205190923e-06, + "loss": 3.6524, + "step": 113230 + }, + { + "epoch": 7.693640440277211, + "grad_norm": 0.15627208352088928, + "learning_rate": 3.866863704307651e-06, + "loss": 3.6312, + "step": 113235 + }, + { + "epoch": 7.6939801603478735, + "grad_norm": 0.29770633578300476, + "learning_rate": 3.862617203424379e-06, + "loss": 3.9146, + "step": 113240 + }, + { + "epoch": 7.694319880418535, + "grad_norm": 0.2274566888809204, + "learning_rate": 3.858370702541107e-06, + "loss": 4.0016, + "step": 113245 + }, + { + "epoch": 7.694659600489197, + "grad_norm": 0.22634607553482056, + "learning_rate": 3.854124201657834e-06, + "loss": 3.7684, + "step": 113250 + }, + { + "epoch": 7.694999320559859, + "grad_norm": 0.17197325825691223, + "learning_rate": 3.849877700774562e-06, + "loss": 3.7181, + "step": 113255 + }, + { + "epoch": 7.69533904063052, + "grad_norm": 0.1603788584470749, + "learning_rate": 3.84563119989129e-06, + "loss": 3.7221, + "step": 113260 + }, + { + "epoch": 7.695678760701182, + "grad_norm": 0.16288067400455475, + "learning_rate": 3.841384699008017e-06, + "loss": 3.6741, + "step": 113265 + }, + { + "epoch": 7.696018480771844, + "grad_norm": 0.20297180116176605, + "learning_rate": 3.837138198124746e-06, + "loss": 3.7677, + "step": 113270 + }, + { + "epoch": 7.696358200842505, + "grad_norm": 0.2243962585926056, + "learning_rate": 3.832891697241473e-06, + "loss": 4.0014, + "step": 113275 + }, + { + "epoch": 7.696697920913167, + "grad_norm": 0.23982323706150055, + "learning_rate": 3.828645196358201e-06, + "loss": 3.9801, + "step": 113280 + }, + { + "epoch": 7.6970376409838295, + "grad_norm": 0.1794046312570572, + "learning_rate": 3.824398695474929e-06, + "loss": 3.9293, + "step": 113285 + }, + { + "epoch": 7.697377361054491, + "grad_norm": 0.18852810561656952, + "learning_rate": 3.820152194591656e-06, + "loss": 3.7192, + "step": 113290 + }, + { + "epoch": 7.697717081125153, + "grad_norm": 0.2783920168876648, + "learning_rate": 3.815905693708385e-06, + "loss": 3.7231, + "step": 113295 + }, + { + "epoch": 7.698056801195815, + "grad_norm": 0.3409196138381958, + "learning_rate": 3.8116591928251122e-06, + "loss": 3.8231, + "step": 113300 + }, + { + "epoch": 7.698396521266476, + "grad_norm": 0.1699620485305786, + "learning_rate": 3.8074126919418402e-06, + "loss": 4.044, + "step": 113305 + }, + { + "epoch": 7.698736241337138, + "grad_norm": 0.1899985522031784, + "learning_rate": 3.803166191058568e-06, + "loss": 3.8335, + "step": 113310 + }, + { + "epoch": 7.6990759614078, + "grad_norm": 0.1908920556306839, + "learning_rate": 3.7989196901752954e-06, + "loss": 3.9146, + "step": 113315 + }, + { + "epoch": 7.699415681478461, + "grad_norm": 0.24266038835048676, + "learning_rate": 3.794673189292024e-06, + "loss": 3.8375, + "step": 113320 + }, + { + "epoch": 7.699755401549123, + "grad_norm": 0.19243201613426208, + "learning_rate": 3.7904266884087514e-06, + "loss": 3.8663, + "step": 113325 + }, + { + "epoch": 7.7000951216197855, + "grad_norm": 0.20911172032356262, + "learning_rate": 3.7861801875254794e-06, + "loss": 3.7377, + "step": 113330 + }, + { + "epoch": 7.700434841690447, + "grad_norm": 0.17387712001800537, + "learning_rate": 3.781933686642207e-06, + "loss": 3.9711, + "step": 113335 + }, + { + "epoch": 7.700774561761109, + "grad_norm": 0.19063767790794373, + "learning_rate": 3.7776871857589346e-06, + "loss": 3.902, + "step": 113340 + }, + { + "epoch": 7.701114281831771, + "grad_norm": 0.28132325410842896, + "learning_rate": 3.7734406848756626e-06, + "loss": 3.8181, + "step": 113345 + }, + { + "epoch": 7.701454001902432, + "grad_norm": 0.24792343378067017, + "learning_rate": 3.76919418399239e-06, + "loss": 3.8963, + "step": 113350 + }, + { + "epoch": 7.701793721973094, + "grad_norm": 0.21055099368095398, + "learning_rate": 3.7649476831091186e-06, + "loss": 3.9114, + "step": 113355 + }, + { + "epoch": 7.702133442043756, + "grad_norm": 0.14162668585777283, + "learning_rate": 3.760701182225846e-06, + "loss": 3.7148, + "step": 113360 + }, + { + "epoch": 7.702473162114417, + "grad_norm": 0.22363878786563873, + "learning_rate": 3.7564546813425734e-06, + "loss": 3.4417, + "step": 113365 + }, + { + "epoch": 7.702812882185079, + "grad_norm": 0.2252047061920166, + "learning_rate": 3.7522081804593018e-06, + "loss": 3.9528, + "step": 113370 + }, + { + "epoch": 7.7031526022557415, + "grad_norm": 0.17092053592205048, + "learning_rate": 3.7479616795760294e-06, + "loss": 3.7468, + "step": 113375 + }, + { + "epoch": 7.703492322326403, + "grad_norm": 0.1859705001115799, + "learning_rate": 3.7437151786927574e-06, + "loss": 3.9109, + "step": 113380 + }, + { + "epoch": 7.703832042397065, + "grad_norm": 0.16996058821678162, + "learning_rate": 3.739468677809485e-06, + "loss": 4.1316, + "step": 113385 + }, + { + "epoch": 7.704171762467727, + "grad_norm": 0.17062264680862427, + "learning_rate": 3.7352221769262134e-06, + "loss": 4.1414, + "step": 113390 + }, + { + "epoch": 7.704511482538388, + "grad_norm": 0.1760581135749817, + "learning_rate": 3.730975676042941e-06, + "loss": 3.8216, + "step": 113395 + }, + { + "epoch": 7.70485120260905, + "grad_norm": 0.16981075704097748, + "learning_rate": 3.7267291751596686e-06, + "loss": 3.8586, + "step": 113400 + }, + { + "epoch": 7.705190922679712, + "grad_norm": 0.16627690196037292, + "learning_rate": 3.7224826742763966e-06, + "loss": 3.8631, + "step": 113405 + }, + { + "epoch": 7.705530642750373, + "grad_norm": 0.19263115525245667, + "learning_rate": 3.718236173393124e-06, + "loss": 4.0235, + "step": 113410 + }, + { + "epoch": 7.705870362821035, + "grad_norm": 0.17714537680149078, + "learning_rate": 3.713989672509852e-06, + "loss": 3.7617, + "step": 113415 + }, + { + "epoch": 7.7062100828916975, + "grad_norm": 0.19603894650936127, + "learning_rate": 3.7097431716265797e-06, + "loss": 3.6202, + "step": 113420 + }, + { + "epoch": 7.706549802962359, + "grad_norm": 0.293978750705719, + "learning_rate": 3.7054966707433073e-06, + "loss": 3.7962, + "step": 113425 + }, + { + "epoch": 7.706889523033021, + "grad_norm": 1.8082587718963623, + "learning_rate": 3.7012501698600358e-06, + "loss": 3.705, + "step": 113430 + }, + { + "epoch": 7.707229243103683, + "grad_norm": 0.6329541802406311, + "learning_rate": 3.6970036689767633e-06, + "loss": 3.6948, + "step": 113435 + }, + { + "epoch": 7.707568963174344, + "grad_norm": 0.213396817445755, + "learning_rate": 3.6927571680934914e-06, + "loss": 3.6152, + "step": 113440 + }, + { + "epoch": 7.707908683245006, + "grad_norm": 0.41905564069747925, + "learning_rate": 3.688510667210219e-06, + "loss": 3.8229, + "step": 113445 + }, + { + "epoch": 7.708248403315668, + "grad_norm": 0.1781696081161499, + "learning_rate": 3.6842641663269465e-06, + "loss": 3.809, + "step": 113450 + }, + { + "epoch": 7.708588123386329, + "grad_norm": 0.1479966938495636, + "learning_rate": 3.6800176654436745e-06, + "loss": 3.6608, + "step": 113455 + }, + { + "epoch": 7.7089278434569914, + "grad_norm": 0.17572340369224548, + "learning_rate": 3.675771164560402e-06, + "loss": 3.7564, + "step": 113460 + }, + { + "epoch": 7.7092675635276535, + "grad_norm": 0.46418309211730957, + "learning_rate": 3.6715246636771305e-06, + "loss": 3.7465, + "step": 113465 + }, + { + "epoch": 7.709607283598315, + "grad_norm": 0.15103326737880707, + "learning_rate": 3.667278162793858e-06, + "loss": 3.9103, + "step": 113470 + }, + { + "epoch": 7.709947003668977, + "grad_norm": 0.4056239128112793, + "learning_rate": 3.663031661910586e-06, + "loss": 3.8962, + "step": 113475 + }, + { + "epoch": 7.710286723739639, + "grad_norm": 0.33960774540901184, + "learning_rate": 3.6587851610273137e-06, + "loss": 3.8126, + "step": 113480 + }, + { + "epoch": 7.7106264438103, + "grad_norm": 0.23902353644371033, + "learning_rate": 3.6545386601440413e-06, + "loss": 3.7901, + "step": 113485 + }, + { + "epoch": 7.710966163880962, + "grad_norm": 0.17387862503528595, + "learning_rate": 3.6502921592607693e-06, + "loss": 3.7879, + "step": 113490 + }, + { + "epoch": 7.711305883951624, + "grad_norm": 0.15304547548294067, + "learning_rate": 3.646045658377497e-06, + "loss": 3.8569, + "step": 113495 + }, + { + "epoch": 7.711645604022285, + "grad_norm": 0.20764245092868805, + "learning_rate": 3.6417991574942253e-06, + "loss": 3.9367, + "step": 113500 + }, + { + "epoch": 7.7119853240929475, + "grad_norm": 0.15467727184295654, + "learning_rate": 3.637552656610953e-06, + "loss": 4.1177, + "step": 113505 + }, + { + "epoch": 7.7123250441636095, + "grad_norm": 0.1759842038154602, + "learning_rate": 3.63330615572768e-06, + "loss": 3.904, + "step": 113510 + }, + { + "epoch": 7.712664764234271, + "grad_norm": 0.19183120131492615, + "learning_rate": 3.6290596548444085e-06, + "loss": 3.7807, + "step": 113515 + }, + { + "epoch": 7.713004484304933, + "grad_norm": 0.19294267892837524, + "learning_rate": 3.624813153961136e-06, + "loss": 3.6408, + "step": 113520 + }, + { + "epoch": 7.713344204375595, + "grad_norm": 0.18665984272956848, + "learning_rate": 3.620566653077864e-06, + "loss": 3.7426, + "step": 113525 + }, + { + "epoch": 7.713683924446256, + "grad_norm": 0.19385024905204773, + "learning_rate": 3.6163201521945917e-06, + "loss": 3.8406, + "step": 113530 + }, + { + "epoch": 7.714023644516918, + "grad_norm": 0.37463879585266113, + "learning_rate": 3.6120736513113193e-06, + "loss": 3.6973, + "step": 113535 + }, + { + "epoch": 7.71436336458758, + "grad_norm": 0.15457148849964142, + "learning_rate": 3.6078271504280477e-06, + "loss": 3.6399, + "step": 113540 + }, + { + "epoch": 7.714703084658241, + "grad_norm": 0.1841178685426712, + "learning_rate": 3.603580649544775e-06, + "loss": 3.7159, + "step": 113545 + }, + { + "epoch": 7.7150428047289035, + "grad_norm": 0.19638942182064056, + "learning_rate": 3.5993341486615033e-06, + "loss": 3.8579, + "step": 113550 + }, + { + "epoch": 7.7153825247995655, + "grad_norm": 2.0132484436035156, + "learning_rate": 3.595087647778231e-06, + "loss": 3.7132, + "step": 113555 + }, + { + "epoch": 7.715722244870227, + "grad_norm": 0.17659641802310944, + "learning_rate": 3.590841146894959e-06, + "loss": 3.7499, + "step": 113560 + }, + { + "epoch": 7.716061964940889, + "grad_norm": 0.1800343245267868, + "learning_rate": 3.5865946460116865e-06, + "loss": 3.6472, + "step": 113565 + }, + { + "epoch": 7.716401685011551, + "grad_norm": 0.27521631121635437, + "learning_rate": 3.582348145128414e-06, + "loss": 3.7362, + "step": 113570 + }, + { + "epoch": 7.716741405082212, + "grad_norm": 0.26884180307388306, + "learning_rate": 3.5781016442451425e-06, + "loss": 3.7316, + "step": 113575 + }, + { + "epoch": 7.717081125152874, + "grad_norm": 0.22702407836914062, + "learning_rate": 3.57385514336187e-06, + "loss": 3.719, + "step": 113580 + }, + { + "epoch": 7.717420845223536, + "grad_norm": 0.21918493509292603, + "learning_rate": 3.569608642478598e-06, + "loss": 3.7564, + "step": 113585 + }, + { + "epoch": 7.717760565294197, + "grad_norm": 0.20994730293750763, + "learning_rate": 3.5653621415953257e-06, + "loss": 3.7269, + "step": 113590 + }, + { + "epoch": 7.7181002853648595, + "grad_norm": 0.25416144728660583, + "learning_rate": 3.5611156407120532e-06, + "loss": 3.7108, + "step": 113595 + }, + { + "epoch": 7.7184400054355216, + "grad_norm": 0.20980988442897797, + "learning_rate": 3.5568691398287812e-06, + "loss": 3.6113, + "step": 113600 + }, + { + "epoch": 7.718779725506183, + "grad_norm": 0.4207887351512909, + "learning_rate": 3.552622638945509e-06, + "loss": 3.7054, + "step": 113605 + }, + { + "epoch": 7.719119445576845, + "grad_norm": 0.3252815008163452, + "learning_rate": 3.5483761380622373e-06, + "loss": 3.7563, + "step": 113610 + }, + { + "epoch": 7.719459165647507, + "grad_norm": 0.16566310822963715, + "learning_rate": 3.544129637178965e-06, + "loss": 3.6068, + "step": 113615 + }, + { + "epoch": 7.719798885718168, + "grad_norm": 0.2001447081565857, + "learning_rate": 3.539883136295692e-06, + "loss": 3.8678, + "step": 113620 + }, + { + "epoch": 7.72013860578883, + "grad_norm": 2.8978705406188965, + "learning_rate": 3.5356366354124204e-06, + "loss": 3.71, + "step": 113625 + }, + { + "epoch": 7.720478325859492, + "grad_norm": 0.16208162903785706, + "learning_rate": 3.531390134529148e-06, + "loss": 3.9137, + "step": 113630 + }, + { + "epoch": 7.720818045930153, + "grad_norm": 0.1598624587059021, + "learning_rate": 3.527143633645876e-06, + "loss": 3.7641, + "step": 113635 + }, + { + "epoch": 7.7211577660008155, + "grad_norm": 0.6064736843109131, + "learning_rate": 3.5228971327626036e-06, + "loss": 3.7501, + "step": 113640 + }, + { + "epoch": 7.721497486071478, + "grad_norm": 0.19411976635456085, + "learning_rate": 3.518650631879332e-06, + "loss": 3.9001, + "step": 113645 + }, + { + "epoch": 7.721837206142139, + "grad_norm": 0.2121371328830719, + "learning_rate": 3.5144041309960596e-06, + "loss": 3.8289, + "step": 113650 + }, + { + "epoch": 7.722176926212801, + "grad_norm": 0.1735425740480423, + "learning_rate": 3.510157630112787e-06, + "loss": 3.7846, + "step": 113655 + }, + { + "epoch": 7.722516646283463, + "grad_norm": 1.6641350984573364, + "learning_rate": 3.5059111292295152e-06, + "loss": 3.9427, + "step": 113660 + }, + { + "epoch": 7.722856366354124, + "grad_norm": 0.1890041083097458, + "learning_rate": 3.501664628346243e-06, + "loss": 3.7304, + "step": 113665 + }, + { + "epoch": 7.723196086424786, + "grad_norm": 0.19155696034431458, + "learning_rate": 3.497418127462971e-06, + "loss": 3.6781, + "step": 113670 + }, + { + "epoch": 7.723535806495448, + "grad_norm": 0.20992636680603027, + "learning_rate": 3.4931716265796984e-06, + "loss": 4.0737, + "step": 113675 + }, + { + "epoch": 7.723875526566109, + "grad_norm": 0.21382063627243042, + "learning_rate": 3.488925125696426e-06, + "loss": 3.9959, + "step": 113680 + }, + { + "epoch": 7.7242152466367715, + "grad_norm": 0.17342716455459595, + "learning_rate": 3.4846786248131544e-06, + "loss": 4.0809, + "step": 113685 + }, + { + "epoch": 7.724554966707433, + "grad_norm": 0.23877379298210144, + "learning_rate": 3.4804321239298816e-06, + "loss": 3.7882, + "step": 113690 + }, + { + "epoch": 7.724894686778095, + "grad_norm": 0.1889001578092575, + "learning_rate": 3.47618562304661e-06, + "loss": 3.9076, + "step": 113695 + }, + { + "epoch": 7.725234406848757, + "grad_norm": 0.16721367835998535, + "learning_rate": 3.4719391221633376e-06, + "loss": 3.7732, + "step": 113700 + }, + { + "epoch": 7.725574126919418, + "grad_norm": 0.16859857738018036, + "learning_rate": 3.467692621280065e-06, + "loss": 4.0987, + "step": 113705 + }, + { + "epoch": 7.72591384699008, + "grad_norm": 0.22017115354537964, + "learning_rate": 3.4642954205734476e-06, + "loss": 3.7944, + "step": 113710 + }, + { + "epoch": 7.726253567060742, + "grad_norm": 0.16824035346508026, + "learning_rate": 3.460048919690175e-06, + "loss": 3.9045, + "step": 113715 + }, + { + "epoch": 7.726593287131403, + "grad_norm": 0.15996074676513672, + "learning_rate": 3.4558024188069036e-06, + "loss": 3.8536, + "step": 113720 + }, + { + "epoch": 7.726933007202065, + "grad_norm": 0.17004480957984924, + "learning_rate": 3.4515559179236308e-06, + "loss": 3.613, + "step": 113725 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 0.19949962198734283, + "learning_rate": 3.447309417040359e-06, + "loss": 3.813, + "step": 113730 + }, + { + "epoch": 7.727612447343389, + "grad_norm": 0.2326989471912384, + "learning_rate": 3.4430629161570868e-06, + "loss": 3.8502, + "step": 113735 + }, + { + "epoch": 7.727952167414051, + "grad_norm": 0.17319642007350922, + "learning_rate": 3.4388164152738144e-06, + "loss": 3.9222, + "step": 113740 + }, + { + "epoch": 7.728291887484713, + "grad_norm": 0.1772698163986206, + "learning_rate": 3.4345699143905424e-06, + "loss": 3.8399, + "step": 113745 + }, + { + "epoch": 7.728631607555374, + "grad_norm": 0.27448034286499023, + "learning_rate": 3.43032341350727e-06, + "loss": 3.7341, + "step": 113750 + }, + { + "epoch": 7.728971327626036, + "grad_norm": 0.20547917485237122, + "learning_rate": 3.4260769126239984e-06, + "loss": 3.6748, + "step": 113755 + }, + { + "epoch": 7.729311047696698, + "grad_norm": 0.18337611854076385, + "learning_rate": 3.4218304117407255e-06, + "loss": 3.7583, + "step": 113760 + }, + { + "epoch": 7.729650767767359, + "grad_norm": 0.15681251883506775, + "learning_rate": 3.417583910857454e-06, + "loss": 3.9612, + "step": 113765 + }, + { + "epoch": 7.7299904878380215, + "grad_norm": 0.1949855387210846, + "learning_rate": 3.4133374099741816e-06, + "loss": 3.7927, + "step": 113770 + }, + { + "epoch": 7.7303302079086835, + "grad_norm": 0.7327077984809875, + "learning_rate": 3.409090909090909e-06, + "loss": 3.6848, + "step": 113775 + }, + { + "epoch": 7.730669927979345, + "grad_norm": 0.1509907990694046, + "learning_rate": 3.404844408207637e-06, + "loss": 3.7908, + "step": 113780 + }, + { + "epoch": 7.731009648050007, + "grad_norm": 0.17456109821796417, + "learning_rate": 3.4005979073243647e-06, + "loss": 3.8087, + "step": 113785 + }, + { + "epoch": 7.731349368120669, + "grad_norm": 0.14045372605323792, + "learning_rate": 3.396351406441093e-06, + "loss": 3.6553, + "step": 113790 + }, + { + "epoch": 7.73168908819133, + "grad_norm": 0.17046046257019043, + "learning_rate": 3.3921049055578203e-06, + "loss": 3.7251, + "step": 113795 + }, + { + "epoch": 7.732028808261992, + "grad_norm": 0.19969268143177032, + "learning_rate": 3.387858404674548e-06, + "loss": 4.0455, + "step": 113800 + }, + { + "epoch": 7.732368528332654, + "grad_norm": 0.4204806089401245, + "learning_rate": 3.3836119037912763e-06, + "loss": 3.8051, + "step": 113805 + }, + { + "epoch": 7.732708248403315, + "grad_norm": 0.36602890491485596, + "learning_rate": 3.379365402908004e-06, + "loss": 3.4397, + "step": 113810 + }, + { + "epoch": 7.7330479684739775, + "grad_norm": 0.2834625840187073, + "learning_rate": 3.375118902024732e-06, + "loss": 3.9056, + "step": 113815 + }, + { + "epoch": 7.7333876885446395, + "grad_norm": 0.23576615750789642, + "learning_rate": 3.3708724011414595e-06, + "loss": 3.7504, + "step": 113820 + }, + { + "epoch": 7.733727408615301, + "grad_norm": 0.1506354957818985, + "learning_rate": 3.366625900258187e-06, + "loss": 3.9969, + "step": 113825 + }, + { + "epoch": 7.734067128685963, + "grad_norm": 0.17696981132030487, + "learning_rate": 3.362379399374915e-06, + "loss": 3.685, + "step": 113830 + }, + { + "epoch": 7.734406848756625, + "grad_norm": 0.19955582916736603, + "learning_rate": 3.3581328984916427e-06, + "loss": 4.1836, + "step": 113835 + }, + { + "epoch": 7.734746568827286, + "grad_norm": 0.13660837709903717, + "learning_rate": 3.353886397608371e-06, + "loss": 3.9758, + "step": 113840 + }, + { + "epoch": 7.735086288897948, + "grad_norm": 0.15830425918102264, + "learning_rate": 3.3496398967250987e-06, + "loss": 3.8295, + "step": 113845 + }, + { + "epoch": 7.73542600896861, + "grad_norm": 0.7952906489372253, + "learning_rate": 3.3453933958418267e-06, + "loss": 3.7263, + "step": 113850 + }, + { + "epoch": 7.735765729039271, + "grad_norm": 0.2115071564912796, + "learning_rate": 3.3411468949585543e-06, + "loss": 3.807, + "step": 113855 + }, + { + "epoch": 7.7361054491099335, + "grad_norm": 0.2045045793056488, + "learning_rate": 3.336900394075282e-06, + "loss": 3.6109, + "step": 113860 + }, + { + "epoch": 7.7364451691805955, + "grad_norm": 0.33811765909194946, + "learning_rate": 3.3326538931920103e-06, + "loss": 4.0714, + "step": 113865 + }, + { + "epoch": 7.736784889251257, + "grad_norm": 0.16596929728984833, + "learning_rate": 3.3284073923087375e-06, + "loss": 3.782, + "step": 113870 + }, + { + "epoch": 7.737124609321919, + "grad_norm": 0.14751815795898438, + "learning_rate": 3.324160891425466e-06, + "loss": 3.8356, + "step": 113875 + }, + { + "epoch": 7.737464329392581, + "grad_norm": 0.2388286143541336, + "learning_rate": 3.3199143905421935e-06, + "loss": 3.9126, + "step": 113880 + }, + { + "epoch": 7.737804049463242, + "grad_norm": 0.2108033448457718, + "learning_rate": 3.315667889658921e-06, + "loss": 3.8557, + "step": 113885 + }, + { + "epoch": 7.738143769533904, + "grad_norm": 0.23111701011657715, + "learning_rate": 3.311421388775649e-06, + "loss": 3.7934, + "step": 113890 + }, + { + "epoch": 7.738483489604566, + "grad_norm": 0.22472262382507324, + "learning_rate": 3.3071748878923767e-06, + "loss": 3.8326, + "step": 113895 + }, + { + "epoch": 7.738823209675227, + "grad_norm": 0.24783849716186523, + "learning_rate": 3.302928387009105e-06, + "loss": 3.8366, + "step": 113900 + }, + { + "epoch": 7.7391629297458895, + "grad_norm": 0.17752036452293396, + "learning_rate": 3.2986818861258323e-06, + "loss": 3.8636, + "step": 113905 + }, + { + "epoch": 7.739502649816551, + "grad_norm": 0.1496298462152481, + "learning_rate": 3.29443538524256e-06, + "loss": 3.8141, + "step": 113910 + }, + { + "epoch": 7.739842369887213, + "grad_norm": 0.1867934912443161, + "learning_rate": 3.2901888843592883e-06, + "loss": 3.7649, + "step": 113915 + }, + { + "epoch": 7.740182089957875, + "grad_norm": 0.1576908379793167, + "learning_rate": 3.285942383476016e-06, + "loss": 3.7762, + "step": 113920 + }, + { + "epoch": 7.740521810028536, + "grad_norm": 0.20815931260585785, + "learning_rate": 3.281695882592744e-06, + "loss": 4.0112, + "step": 113925 + }, + { + "epoch": 7.740861530099198, + "grad_norm": 0.1591167002916336, + "learning_rate": 3.2774493817094714e-06, + "loss": 3.975, + "step": 113930 + }, + { + "epoch": 7.74120125016986, + "grad_norm": 0.15516695380210876, + "learning_rate": 3.2732028808262e-06, + "loss": 3.7082, + "step": 113935 + }, + { + "epoch": 7.741540970240521, + "grad_norm": 0.16507305204868317, + "learning_rate": 3.268956379942927e-06, + "loss": 3.8049, + "step": 113940 + }, + { + "epoch": 7.741880690311183, + "grad_norm": 0.15285924077033997, + "learning_rate": 3.2647098790596546e-06, + "loss": 3.857, + "step": 113945 + }, + { + "epoch": 7.7422204103818455, + "grad_norm": 0.24579617381095886, + "learning_rate": 3.260463378176383e-06, + "loss": 3.7505, + "step": 113950 + }, + { + "epoch": 7.742560130452507, + "grad_norm": 0.20818056166172028, + "learning_rate": 3.2562168772931106e-06, + "loss": 3.8281, + "step": 113955 + }, + { + "epoch": 7.742899850523169, + "grad_norm": 0.1716068983078003, + "learning_rate": 3.2519703764098386e-06, + "loss": 3.8569, + "step": 113960 + }, + { + "epoch": 7.743239570593831, + "grad_norm": 0.2395862340927124, + "learning_rate": 3.2477238755265662e-06, + "loss": 3.7807, + "step": 113965 + }, + { + "epoch": 7.743579290664492, + "grad_norm": 0.24851498007774353, + "learning_rate": 3.243477374643294e-06, + "loss": 3.8365, + "step": 113970 + }, + { + "epoch": 7.743919010735154, + "grad_norm": 0.15699566900730133, + "learning_rate": 3.239230873760022e-06, + "loss": 3.7892, + "step": 113975 + }, + { + "epoch": 7.744258730805816, + "grad_norm": 0.19263909757137299, + "learning_rate": 3.2349843728767494e-06, + "loss": 3.8797, + "step": 113980 + }, + { + "epoch": 7.744598450876477, + "grad_norm": 0.16869491338729858, + "learning_rate": 3.230737871993478e-06, + "loss": 3.8536, + "step": 113985 + }, + { + "epoch": 7.744938170947139, + "grad_norm": 0.19784705340862274, + "learning_rate": 3.2264913711102054e-06, + "loss": 3.6372, + "step": 113990 + }, + { + "epoch": 7.7452778910178015, + "grad_norm": 0.14142315089702606, + "learning_rate": 3.222244870226933e-06, + "loss": 3.6841, + "step": 113995 + }, + { + "epoch": 7.745617611088463, + "grad_norm": 0.1765945851802826, + "learning_rate": 3.217998369343661e-06, + "loss": 4.0723, + "step": 114000 + }, + { + "epoch": 7.745957331159125, + "grad_norm": 0.14821051061153412, + "learning_rate": 3.2137518684603886e-06, + "loss": 3.6527, + "step": 114005 + }, + { + "epoch": 7.746297051229787, + "grad_norm": 0.18464629352092743, + "learning_rate": 3.2095053675771166e-06, + "loss": 3.7805, + "step": 114010 + }, + { + "epoch": 7.746636771300448, + "grad_norm": 0.1688014715909958, + "learning_rate": 3.205258866693844e-06, + "loss": 4.0417, + "step": 114015 + }, + { + "epoch": 7.74697649137111, + "grad_norm": 0.15107864141464233, + "learning_rate": 3.2010123658105726e-06, + "loss": 3.7471, + "step": 114020 + }, + { + "epoch": 7.747316211441772, + "grad_norm": 0.1993967890739441, + "learning_rate": 3.1967658649273e-06, + "loss": 3.8047, + "step": 114025 + }, + { + "epoch": 7.747655931512433, + "grad_norm": 0.17310336232185364, + "learning_rate": 3.192519364044028e-06, + "loss": 3.88, + "step": 114030 + }, + { + "epoch": 7.7479956515830954, + "grad_norm": 0.1398129016160965, + "learning_rate": 3.188272863160756e-06, + "loss": 3.7946, + "step": 114035 + }, + { + "epoch": 7.7483353716537575, + "grad_norm": 0.2040424644947052, + "learning_rate": 3.1840263622774834e-06, + "loss": 3.8939, + "step": 114040 + }, + { + "epoch": 7.748675091724419, + "grad_norm": 0.12578441202640533, + "learning_rate": 3.179779861394212e-06, + "loss": 3.622, + "step": 114045 + }, + { + "epoch": 7.749014811795081, + "grad_norm": 0.2952415943145752, + "learning_rate": 3.175533360510939e-06, + "loss": 3.6733, + "step": 114050 + }, + { + "epoch": 7.749354531865743, + "grad_norm": 0.8871805667877197, + "learning_rate": 3.1712868596276666e-06, + "loss": 3.7382, + "step": 114055 + }, + { + "epoch": 7.749694251936404, + "grad_norm": 0.207885280251503, + "learning_rate": 3.167040358744395e-06, + "loss": 3.7315, + "step": 114060 + }, + { + "epoch": 7.750033972007066, + "grad_norm": 0.3944215178489685, + "learning_rate": 3.1627938578611226e-06, + "loss": 4.002, + "step": 114065 + }, + { + "epoch": 7.750373692077728, + "grad_norm": 0.18509525060653687, + "learning_rate": 3.1585473569778506e-06, + "loss": 3.8176, + "step": 114070 + }, + { + "epoch": 7.750713412148389, + "grad_norm": 0.16666719317436218, + "learning_rate": 3.154300856094578e-06, + "loss": 3.6841, + "step": 114075 + }, + { + "epoch": 7.7510531322190515, + "grad_norm": 0.18306578695774078, + "learning_rate": 3.1500543552113058e-06, + "loss": 3.8179, + "step": 114080 + }, + { + "epoch": 7.7513928522897135, + "grad_norm": 0.17080701887607574, + "learning_rate": 3.1458078543280338e-06, + "loss": 3.662, + "step": 114085 + }, + { + "epoch": 7.751732572360375, + "grad_norm": 0.15552517771720886, + "learning_rate": 3.1415613534447613e-06, + "loss": 3.8353, + "step": 114090 + }, + { + "epoch": 7.752072292431037, + "grad_norm": 0.8353556990623474, + "learning_rate": 3.1373148525614898e-06, + "loss": 4.0243, + "step": 114095 + }, + { + "epoch": 7.752412012501699, + "grad_norm": 0.13003234565258026, + "learning_rate": 3.1330683516782174e-06, + "loss": 3.7943, + "step": 114100 + }, + { + "epoch": 7.75275173257236, + "grad_norm": 0.17849251627922058, + "learning_rate": 3.1288218507949454e-06, + "loss": 3.7015, + "step": 114105 + }, + { + "epoch": 7.753091452643022, + "grad_norm": 0.16124439239501953, + "learning_rate": 3.124575349911673e-06, + "loss": 3.7327, + "step": 114110 + }, + { + "epoch": 7.753431172713684, + "grad_norm": 0.45314285159111023, + "learning_rate": 3.120328849028401e-06, + "loss": 3.5192, + "step": 114115 + }, + { + "epoch": 7.753770892784345, + "grad_norm": 0.24574634432792664, + "learning_rate": 3.1160823481451285e-06, + "loss": 4.163, + "step": 114120 + }, + { + "epoch": 7.7541106128550075, + "grad_norm": 0.2496778666973114, + "learning_rate": 3.111835847261856e-06, + "loss": 3.7901, + "step": 114125 + }, + { + "epoch": 7.7544503329256695, + "grad_norm": 0.2306700348854065, + "learning_rate": 3.107589346378584e-06, + "loss": 3.8813, + "step": 114130 + }, + { + "epoch": 7.754790052996331, + "grad_norm": 0.15179036557674408, + "learning_rate": 3.103342845495312e-06, + "loss": 3.6598, + "step": 114135 + }, + { + "epoch": 7.755129773066993, + "grad_norm": 0.5503612756729126, + "learning_rate": 3.0990963446120397e-06, + "loss": 3.8412, + "step": 114140 + }, + { + "epoch": 7.755469493137655, + "grad_norm": 0.1616421341896057, + "learning_rate": 3.0948498437287677e-06, + "loss": 3.7572, + "step": 114145 + }, + { + "epoch": 7.755809213208316, + "grad_norm": 0.18253381550312042, + "learning_rate": 3.0906033428454953e-06, + "loss": 3.781, + "step": 114150 + }, + { + "epoch": 7.756148933278978, + "grad_norm": 0.16274279356002808, + "learning_rate": 3.0863568419622233e-06, + "loss": 3.8531, + "step": 114155 + }, + { + "epoch": 7.75648865334964, + "grad_norm": 0.15770822763442993, + "learning_rate": 3.082110341078951e-06, + "loss": 3.9771, + "step": 114160 + }, + { + "epoch": 7.756828373420301, + "grad_norm": 0.2079276144504547, + "learning_rate": 3.077863840195679e-06, + "loss": 3.5422, + "step": 114165 + }, + { + "epoch": 7.7571680934909635, + "grad_norm": 0.23696590960025787, + "learning_rate": 3.073617339312407e-06, + "loss": 3.8949, + "step": 114170 + }, + { + "epoch": 7.7575078135616256, + "grad_norm": 0.15021446347236633, + "learning_rate": 3.0693708384291345e-06, + "loss": 4.0797, + "step": 114175 + }, + { + "epoch": 7.757847533632287, + "grad_norm": 0.17043949663639069, + "learning_rate": 3.065124337545862e-06, + "loss": 3.8134, + "step": 114180 + }, + { + "epoch": 7.758187253702949, + "grad_norm": 0.21514266729354858, + "learning_rate": 3.06087783666259e-06, + "loss": 4.0539, + "step": 114185 + }, + { + "epoch": 7.758526973773611, + "grad_norm": 0.18927118182182312, + "learning_rate": 3.056631335779318e-06, + "loss": 3.8343, + "step": 114190 + }, + { + "epoch": 7.758866693844272, + "grad_norm": 0.16148675978183746, + "learning_rate": 3.0523848348960457e-06, + "loss": 3.8963, + "step": 114195 + }, + { + "epoch": 7.759206413914934, + "grad_norm": 0.14790605008602142, + "learning_rate": 3.0481383340127737e-06, + "loss": 3.7654, + "step": 114200 + }, + { + "epoch": 7.759546133985596, + "grad_norm": 0.20968085527420044, + "learning_rate": 3.0438918331295017e-06, + "loss": 3.8139, + "step": 114205 + }, + { + "epoch": 7.759885854056257, + "grad_norm": 0.18959328532218933, + "learning_rate": 3.0396453322462293e-06, + "loss": 3.8021, + "step": 114210 + }, + { + "epoch": 7.7602255741269195, + "grad_norm": 0.14844755828380585, + "learning_rate": 3.035398831362957e-06, + "loss": 3.8875, + "step": 114215 + }, + { + "epoch": 7.760565294197582, + "grad_norm": 0.1870366930961609, + "learning_rate": 3.031152330479685e-06, + "loss": 3.7682, + "step": 114220 + }, + { + "epoch": 7.760905014268243, + "grad_norm": 0.21613070368766785, + "learning_rate": 3.026905829596413e-06, + "loss": 3.8496, + "step": 114225 + }, + { + "epoch": 7.761244734338905, + "grad_norm": 0.14497242867946625, + "learning_rate": 3.0226593287131405e-06, + "loss": 3.9967, + "step": 114230 + }, + { + "epoch": 7.761584454409567, + "grad_norm": 0.6374785304069519, + "learning_rate": 3.018412827829868e-06, + "loss": 3.6011, + "step": 114235 + }, + { + "epoch": 7.761924174480228, + "grad_norm": 0.36849531531333923, + "learning_rate": 3.014166326946596e-06, + "loss": 3.8735, + "step": 114240 + }, + { + "epoch": 7.76226389455089, + "grad_norm": 0.18735189735889435, + "learning_rate": 3.009919826063324e-06, + "loss": 3.6461, + "step": 114245 + }, + { + "epoch": 7.762603614621552, + "grad_norm": 0.13222886621952057, + "learning_rate": 3.0056733251800517e-06, + "loss": 3.8421, + "step": 114250 + }, + { + "epoch": 7.762943334692213, + "grad_norm": 0.24017512798309326, + "learning_rate": 3.0014268242967797e-06, + "loss": 3.8559, + "step": 114255 + }, + { + "epoch": 7.7632830547628755, + "grad_norm": 0.1500789076089859, + "learning_rate": 2.9971803234135077e-06, + "loss": 4.0921, + "step": 114260 + }, + { + "epoch": 7.763622774833538, + "grad_norm": 0.28919529914855957, + "learning_rate": 2.9929338225302353e-06, + "loss": 3.9495, + "step": 114265 + }, + { + "epoch": 7.763962494904199, + "grad_norm": 0.1991564929485321, + "learning_rate": 2.988687321646963e-06, + "loss": 3.5948, + "step": 114270 + }, + { + "epoch": 7.764302214974861, + "grad_norm": 0.1962490826845169, + "learning_rate": 2.984440820763691e-06, + "loss": 3.6126, + "step": 114275 + }, + { + "epoch": 7.764641935045523, + "grad_norm": 0.18408994376659393, + "learning_rate": 2.980194319880419e-06, + "loss": 3.8106, + "step": 114280 + }, + { + "epoch": 7.764981655116184, + "grad_norm": 0.20803600549697876, + "learning_rate": 2.9759478189971464e-06, + "loss": 3.8285, + "step": 114285 + }, + { + "epoch": 7.765321375186846, + "grad_norm": 0.23422497510910034, + "learning_rate": 2.9717013181138745e-06, + "loss": 3.9566, + "step": 114290 + }, + { + "epoch": 7.765661095257508, + "grad_norm": 0.16336816549301147, + "learning_rate": 2.967454817230602e-06, + "loss": 3.9207, + "step": 114295 + }, + { + "epoch": 7.766000815328169, + "grad_norm": 0.21521824598312378, + "learning_rate": 2.96320831634733e-06, + "loss": 3.9875, + "step": 114300 + }, + { + "epoch": 7.7663405353988315, + "grad_norm": 0.19353018701076508, + "learning_rate": 2.9589618154640576e-06, + "loss": 3.7939, + "step": 114305 + }, + { + "epoch": 7.766680255469494, + "grad_norm": 0.19020630419254303, + "learning_rate": 2.9547153145807856e-06, + "loss": 3.6882, + "step": 114310 + }, + { + "epoch": 7.767019975540155, + "grad_norm": 0.2075205296278, + "learning_rate": 2.9504688136975136e-06, + "loss": 3.6353, + "step": 114315 + }, + { + "epoch": 7.767359695610817, + "grad_norm": 0.17031782865524292, + "learning_rate": 2.9462223128142412e-06, + "loss": 4.0442, + "step": 114320 + }, + { + "epoch": 7.767699415681479, + "grad_norm": 0.14918765425682068, + "learning_rate": 2.941975811930969e-06, + "loss": 4.0471, + "step": 114325 + }, + { + "epoch": 7.76803913575214, + "grad_norm": 0.26367440819740295, + "learning_rate": 2.937729311047697e-06, + "loss": 3.6845, + "step": 114330 + }, + { + "epoch": 7.768378855822802, + "grad_norm": 0.1573086380958557, + "learning_rate": 2.933482810164425e-06, + "loss": 3.5221, + "step": 114335 + }, + { + "epoch": 7.768718575893464, + "grad_norm": 0.16686633229255676, + "learning_rate": 2.9292363092811524e-06, + "loss": 3.8443, + "step": 114340 + }, + { + "epoch": 7.7690582959641254, + "grad_norm": 0.13930658996105194, + "learning_rate": 2.9249898083978804e-06, + "loss": 3.7257, + "step": 114345 + }, + { + "epoch": 7.7693980160347875, + "grad_norm": 0.17911428213119507, + "learning_rate": 2.920743307514608e-06, + "loss": 3.863, + "step": 114350 + }, + { + "epoch": 7.76973773610545, + "grad_norm": 0.15339744091033936, + "learning_rate": 2.916496806631336e-06, + "loss": 3.734, + "step": 114355 + }, + { + "epoch": 7.770077456176111, + "grad_norm": 0.16526389122009277, + "learning_rate": 2.9122503057480636e-06, + "loss": 3.8519, + "step": 114360 + }, + { + "epoch": 7.770417176246773, + "grad_norm": 0.1887727826833725, + "learning_rate": 2.9080038048647916e-06, + "loss": 3.9607, + "step": 114365 + }, + { + "epoch": 7.770756896317434, + "grad_norm": 0.17908775806427002, + "learning_rate": 2.9037573039815196e-06, + "loss": 3.718, + "step": 114370 + }, + { + "epoch": 7.771096616388096, + "grad_norm": 0.18498069047927856, + "learning_rate": 2.899510803098247e-06, + "loss": 3.7538, + "step": 114375 + }, + { + "epoch": 7.771436336458758, + "grad_norm": 0.16659066081047058, + "learning_rate": 2.8952643022149748e-06, + "loss": 3.7879, + "step": 114380 + }, + { + "epoch": 7.771776056529419, + "grad_norm": 0.17459328472614288, + "learning_rate": 2.8910178013317028e-06, + "loss": 3.8364, + "step": 114385 + }, + { + "epoch": 7.7721157766000815, + "grad_norm": 0.14733199775218964, + "learning_rate": 2.886771300448431e-06, + "loss": 3.7366, + "step": 114390 + }, + { + "epoch": 7.7724554966707435, + "grad_norm": 0.1704983115196228, + "learning_rate": 2.8825247995651584e-06, + "loss": 3.7644, + "step": 114395 + }, + { + "epoch": 7.772795216741405, + "grad_norm": 0.15740850567817688, + "learning_rate": 2.8782782986818864e-06, + "loss": 3.6433, + "step": 114400 + }, + { + "epoch": 7.773134936812067, + "grad_norm": 0.15752282738685608, + "learning_rate": 2.874031797798614e-06, + "loss": 3.7497, + "step": 114405 + }, + { + "epoch": 7.773474656882729, + "grad_norm": 0.1885172724723816, + "learning_rate": 2.869785296915342e-06, + "loss": 3.5975, + "step": 114410 + }, + { + "epoch": 7.77381437695339, + "grad_norm": 0.1707897037267685, + "learning_rate": 2.8655387960320696e-06, + "loss": 3.9431, + "step": 114415 + }, + { + "epoch": 7.774154097024052, + "grad_norm": 0.13961590826511383, + "learning_rate": 2.8612922951487976e-06, + "loss": 3.9048, + "step": 114420 + }, + { + "epoch": 7.774493817094714, + "grad_norm": 0.273769348859787, + "learning_rate": 2.8570457942655256e-06, + "loss": 3.7157, + "step": 114425 + }, + { + "epoch": 7.774833537165375, + "grad_norm": 0.3792562186717987, + "learning_rate": 2.852799293382253e-06, + "loss": 3.7413, + "step": 114430 + }, + { + "epoch": 7.7751732572360375, + "grad_norm": 0.15743988752365112, + "learning_rate": 2.8485527924989807e-06, + "loss": 3.9117, + "step": 114435 + }, + { + "epoch": 7.7755129773066995, + "grad_norm": 0.15913382172584534, + "learning_rate": 2.8443062916157088e-06, + "loss": 3.9938, + "step": 114440 + }, + { + "epoch": 7.775852697377361, + "grad_norm": 0.16645139455795288, + "learning_rate": 2.8400597907324368e-06, + "loss": 3.7528, + "step": 114445 + }, + { + "epoch": 7.776192417448023, + "grad_norm": 0.16725799441337585, + "learning_rate": 2.8358132898491643e-06, + "loss": 4.083, + "step": 114450 + }, + { + "epoch": 7.776532137518685, + "grad_norm": 0.18995584547519684, + "learning_rate": 2.8315667889658924e-06, + "loss": 3.5329, + "step": 114455 + }, + { + "epoch": 7.776871857589346, + "grad_norm": 0.15374942123889923, + "learning_rate": 2.8273202880826204e-06, + "loss": 3.7397, + "step": 114460 + }, + { + "epoch": 7.777211577660008, + "grad_norm": 0.1562049835920334, + "learning_rate": 2.823073787199348e-06, + "loss": 3.7403, + "step": 114465 + }, + { + "epoch": 7.77755129773067, + "grad_norm": 0.6447733044624329, + "learning_rate": 2.8188272863160755e-06, + "loss": 4.0274, + "step": 114470 + }, + { + "epoch": 7.777891017801331, + "grad_norm": 0.19894526898860931, + "learning_rate": 2.8145807854328035e-06, + "loss": 3.7344, + "step": 114475 + }, + { + "epoch": 7.7782307378719935, + "grad_norm": 0.16057229042053223, + "learning_rate": 2.8103342845495315e-06, + "loss": 3.7145, + "step": 114480 + }, + { + "epoch": 7.778570457942656, + "grad_norm": 0.1384742558002472, + "learning_rate": 2.806087783666259e-06, + "loss": 3.969, + "step": 114485 + }, + { + "epoch": 7.778910178013317, + "grad_norm": 0.2086983621120453, + "learning_rate": 2.8018412827829867e-06, + "loss": 4.0722, + "step": 114490 + }, + { + "epoch": 7.779249898083979, + "grad_norm": 0.1684219390153885, + "learning_rate": 2.7975947818997147e-06, + "loss": 3.9953, + "step": 114495 + }, + { + "epoch": 7.779589618154641, + "grad_norm": 0.17318785190582275, + "learning_rate": 2.7933482810164427e-06, + "loss": 3.5313, + "step": 114500 + }, + { + "epoch": 7.779929338225302, + "grad_norm": 0.3689248263835907, + "learning_rate": 2.7891017801331703e-06, + "loss": 3.6975, + "step": 114505 + }, + { + "epoch": 7.780269058295964, + "grad_norm": 0.1915903389453888, + "learning_rate": 2.7848552792498983e-06, + "loss": 3.954, + "step": 114510 + }, + { + "epoch": 7.780608778366626, + "grad_norm": 0.144066721200943, + "learning_rate": 2.7806087783666263e-06, + "loss": 3.8954, + "step": 114515 + }, + { + "epoch": 7.780948498437287, + "grad_norm": 0.17883259057998657, + "learning_rate": 2.776362277483354e-06, + "loss": 3.7809, + "step": 114520 + }, + { + "epoch": 7.7812882185079495, + "grad_norm": 0.18224190175533295, + "learning_rate": 2.7721157766000815e-06, + "loss": 3.8528, + "step": 114525 + }, + { + "epoch": 7.781627938578612, + "grad_norm": 0.18641994893550873, + "learning_rate": 2.7678692757168095e-06, + "loss": 3.7204, + "step": 114530 + }, + { + "epoch": 7.781967658649273, + "grad_norm": 0.2031654566526413, + "learning_rate": 2.7636227748335375e-06, + "loss": 3.8082, + "step": 114535 + }, + { + "epoch": 7.782307378719935, + "grad_norm": 0.1894456446170807, + "learning_rate": 2.759376273950265e-06, + "loss": 3.733, + "step": 114540 + }, + { + "epoch": 7.782647098790597, + "grad_norm": 2.138537883758545, + "learning_rate": 2.755129773066993e-06, + "loss": 3.9215, + "step": 114545 + }, + { + "epoch": 7.782986818861258, + "grad_norm": 0.20423513650894165, + "learning_rate": 2.7508832721837207e-06, + "loss": 3.9674, + "step": 114550 + }, + { + "epoch": 7.78332653893192, + "grad_norm": 0.4252205491065979, + "learning_rate": 2.7466367713004487e-06, + "loss": 3.7121, + "step": 114555 + }, + { + "epoch": 7.783666259002582, + "grad_norm": 0.12604674696922302, + "learning_rate": 2.7423902704171763e-06, + "loss": 3.8403, + "step": 114560 + }, + { + "epoch": 7.784005979073243, + "grad_norm": 0.1603587567806244, + "learning_rate": 2.7381437695339043e-06, + "loss": 4.1231, + "step": 114565 + }, + { + "epoch": 7.7843456991439055, + "grad_norm": 0.2827565371990204, + "learning_rate": 2.7338972686506323e-06, + "loss": 3.5514, + "step": 114570 + }, + { + "epoch": 7.784685419214568, + "grad_norm": 0.14398686587810516, + "learning_rate": 2.72965076776736e-06, + "loss": 3.853, + "step": 114575 + }, + { + "epoch": 7.785025139285229, + "grad_norm": 0.18050017952919006, + "learning_rate": 2.7254042668840875e-06, + "loss": 3.9294, + "step": 114580 + }, + { + "epoch": 7.785364859355891, + "grad_norm": 0.16854403913021088, + "learning_rate": 2.7211577660008155e-06, + "loss": 3.5514, + "step": 114585 + }, + { + "epoch": 7.785704579426552, + "grad_norm": 0.20703357458114624, + "learning_rate": 2.7169112651175435e-06, + "loss": 3.758, + "step": 114590 + }, + { + "epoch": 7.786044299497214, + "grad_norm": 10.264630317687988, + "learning_rate": 2.712664764234271e-06, + "loss": 3.7669, + "step": 114595 + }, + { + "epoch": 7.786384019567876, + "grad_norm": 0.21121445298194885, + "learning_rate": 2.708418263350999e-06, + "loss": 3.7656, + "step": 114600 + }, + { + "epoch": 7.786723739638537, + "grad_norm": 0.18918967247009277, + "learning_rate": 2.7041717624677267e-06, + "loss": 3.8673, + "step": 114605 + }, + { + "epoch": 7.787063459709199, + "grad_norm": 0.19132967293262482, + "learning_rate": 2.6999252615844547e-06, + "loss": 3.9028, + "step": 114610 + }, + { + "epoch": 7.7874031797798615, + "grad_norm": 0.1783110797405243, + "learning_rate": 2.6956787607011822e-06, + "loss": 3.8384, + "step": 114615 + }, + { + "epoch": 7.787742899850523, + "grad_norm": 0.14331887662410736, + "learning_rate": 2.6914322598179103e-06, + "loss": 3.9283, + "step": 114620 + }, + { + "epoch": 7.788082619921185, + "grad_norm": 0.17041698098182678, + "learning_rate": 2.6871857589346383e-06, + "loss": 3.6852, + "step": 114625 + }, + { + "epoch": 7.788422339991847, + "grad_norm": 0.19968459010124207, + "learning_rate": 2.6829392580513654e-06, + "loss": 3.7521, + "step": 114630 + }, + { + "epoch": 7.788762060062508, + "grad_norm": 0.14278651773929596, + "learning_rate": 2.6786927571680934e-06, + "loss": 3.8727, + "step": 114635 + }, + { + "epoch": 7.78910178013317, + "grad_norm": 0.13427844643592834, + "learning_rate": 2.6744462562848214e-06, + "loss": 3.9135, + "step": 114640 + }, + { + "epoch": 7.789441500203832, + "grad_norm": 1.32760488986969, + "learning_rate": 2.6701997554015494e-06, + "loss": 3.4642, + "step": 114645 + }, + { + "epoch": 7.789781220274493, + "grad_norm": 0.3868170380592346, + "learning_rate": 2.665953254518277e-06, + "loss": 3.8283, + "step": 114650 + }, + { + "epoch": 7.7901209403451555, + "grad_norm": 0.1546592265367508, + "learning_rate": 2.661706753635005e-06, + "loss": 3.6501, + "step": 114655 + }, + { + "epoch": 7.7904606604158175, + "grad_norm": 0.2279806286096573, + "learning_rate": 2.6574602527517326e-06, + "loss": 3.8049, + "step": 114660 + }, + { + "epoch": 7.790800380486479, + "grad_norm": 0.21227070689201355, + "learning_rate": 2.6532137518684606e-06, + "loss": 3.9725, + "step": 114665 + }, + { + "epoch": 7.791140100557141, + "grad_norm": 0.16117580235004425, + "learning_rate": 2.6489672509851882e-06, + "loss": 3.6909, + "step": 114670 + }, + { + "epoch": 7.791479820627803, + "grad_norm": 0.18206334114074707, + "learning_rate": 2.6447207501019162e-06, + "loss": 3.9305, + "step": 114675 + }, + { + "epoch": 7.791819540698464, + "grad_norm": 0.1533532738685608, + "learning_rate": 2.6404742492186442e-06, + "loss": 3.8952, + "step": 114680 + }, + { + "epoch": 7.792159260769126, + "grad_norm": 0.16987307369709015, + "learning_rate": 2.636227748335372e-06, + "loss": 3.6512, + "step": 114685 + }, + { + "epoch": 7.792498980839788, + "grad_norm": 0.24084272980690002, + "learning_rate": 2.6319812474520994e-06, + "loss": 3.6792, + "step": 114690 + }, + { + "epoch": 7.792838700910449, + "grad_norm": 0.19552046060562134, + "learning_rate": 2.6277347465688274e-06, + "loss": 3.6517, + "step": 114695 + }, + { + "epoch": 7.7931784209811115, + "grad_norm": 2.4810845851898193, + "learning_rate": 2.6234882456855554e-06, + "loss": 3.973, + "step": 114700 + }, + { + "epoch": 7.7935181410517735, + "grad_norm": 0.15627367794513702, + "learning_rate": 2.619241744802283e-06, + "loss": 3.7846, + "step": 114705 + }, + { + "epoch": 7.793857861122435, + "grad_norm": 0.17051203548908234, + "learning_rate": 2.614995243919011e-06, + "loss": 3.4902, + "step": 114710 + }, + { + "epoch": 7.794197581193097, + "grad_norm": 0.18756872415542603, + "learning_rate": 2.6107487430357386e-06, + "loss": 3.7162, + "step": 114715 + }, + { + "epoch": 7.794537301263759, + "grad_norm": 0.18810300529003143, + "learning_rate": 2.606502242152466e-06, + "loss": 3.8329, + "step": 114720 + }, + { + "epoch": 7.79487702133442, + "grad_norm": 0.15020452439785004, + "learning_rate": 2.602255741269194e-06, + "loss": 3.7451, + "step": 114725 + }, + { + "epoch": 7.795216741405082, + "grad_norm": 0.20037680864334106, + "learning_rate": 2.598009240385922e-06, + "loss": 3.7788, + "step": 114730 + }, + { + "epoch": 7.795556461475744, + "grad_norm": 0.21866481006145477, + "learning_rate": 2.59376273950265e-06, + "loss": 3.7052, + "step": 114735 + }, + { + "epoch": 7.795896181546405, + "grad_norm": 0.18056201934814453, + "learning_rate": 2.5895162386193778e-06, + "loss": 3.8961, + "step": 114740 + }, + { + "epoch": 7.7962359016170675, + "grad_norm": 0.17961017787456512, + "learning_rate": 2.5852697377361054e-06, + "loss": 4.0195, + "step": 114745 + }, + { + "epoch": 7.7965756216877296, + "grad_norm": 0.21653559803962708, + "learning_rate": 2.5810232368528334e-06, + "loss": 3.8944, + "step": 114750 + }, + { + "epoch": 7.796915341758391, + "grad_norm": 0.185589000582695, + "learning_rate": 2.5767767359695614e-06, + "loss": 3.6814, + "step": 114755 + }, + { + "epoch": 7.797255061829053, + "grad_norm": 0.18723489344120026, + "learning_rate": 2.572530235086289e-06, + "loss": 3.8297, + "step": 114760 + }, + { + "epoch": 7.797594781899715, + "grad_norm": 0.15231196582317352, + "learning_rate": 2.568283734203017e-06, + "loss": 3.9243, + "step": 114765 + }, + { + "epoch": 7.797934501970376, + "grad_norm": 0.6353347301483154, + "learning_rate": 2.564037233319745e-06, + "loss": 3.8695, + "step": 114770 + }, + { + "epoch": 7.798274222041038, + "grad_norm": 0.1635562926530838, + "learning_rate": 2.559790732436472e-06, + "loss": 3.7393, + "step": 114775 + }, + { + "epoch": 7.7986139421117, + "grad_norm": 0.1885072886943817, + "learning_rate": 2.5555442315532e-06, + "loss": 4.0817, + "step": 114780 + }, + { + "epoch": 7.798953662182361, + "grad_norm": 0.178546741604805, + "learning_rate": 2.551297730669928e-06, + "loss": 3.5917, + "step": 114785 + }, + { + "epoch": 7.7992933822530235, + "grad_norm": 0.32392168045043945, + "learning_rate": 2.547051229786656e-06, + "loss": 3.6948, + "step": 114790 + }, + { + "epoch": 7.799633102323686, + "grad_norm": 0.1647418737411499, + "learning_rate": 2.5428047289033838e-06, + "loss": 3.8842, + "step": 114795 + }, + { + "epoch": 7.799972822394347, + "grad_norm": 0.16608108580112457, + "learning_rate": 2.5385582280201113e-06, + "loss": 3.7122, + "step": 114800 + }, + { + "epoch": 7.800312542465009, + "grad_norm": 0.18127264082431793, + "learning_rate": 2.5343117271368393e-06, + "loss": 4.1625, + "step": 114805 + }, + { + "epoch": 7.800652262535671, + "grad_norm": 0.1481214016675949, + "learning_rate": 2.530065226253567e-06, + "loss": 3.9969, + "step": 114810 + }, + { + "epoch": 7.800991982606332, + "grad_norm": 0.16709135472774506, + "learning_rate": 2.525818725370295e-06, + "loss": 3.8447, + "step": 114815 + }, + { + "epoch": 7.801331702676994, + "grad_norm": 0.23135323822498322, + "learning_rate": 2.521572224487023e-06, + "loss": 3.7541, + "step": 114820 + }, + { + "epoch": 7.801671422747656, + "grad_norm": 0.17701971530914307, + "learning_rate": 2.517325723603751e-06, + "loss": 3.8842, + "step": 114825 + }, + { + "epoch": 7.802011142818317, + "grad_norm": 0.28239068388938904, + "learning_rate": 2.513079222720478e-06, + "loss": 3.96, + "step": 114830 + }, + { + "epoch": 7.8023508628889795, + "grad_norm": 0.17716611921787262, + "learning_rate": 2.508832721837206e-06, + "loss": 3.8552, + "step": 114835 + }, + { + "epoch": 7.802690582959642, + "grad_norm": 0.15196365118026733, + "learning_rate": 2.504586220953934e-06, + "loss": 3.6707, + "step": 114840 + }, + { + "epoch": 7.803030303030303, + "grad_norm": 0.1428510844707489, + "learning_rate": 2.500339720070662e-06, + "loss": 3.8318, + "step": 114845 + }, + { + "epoch": 7.803370023100965, + "grad_norm": 0.16531261801719666, + "learning_rate": 2.4960932191873897e-06, + "loss": 3.7387, + "step": 114850 + }, + { + "epoch": 7.803709743171627, + "grad_norm": 0.8656522035598755, + "learning_rate": 2.4918467183041177e-06, + "loss": 3.8307, + "step": 114855 + }, + { + "epoch": 7.804049463242288, + "grad_norm": 0.16703329980373383, + "learning_rate": 2.4876002174208453e-06, + "loss": 3.552, + "step": 114860 + }, + { + "epoch": 7.80438918331295, + "grad_norm": 0.2175770252943039, + "learning_rate": 2.483353716537573e-06, + "loss": 4.0961, + "step": 114865 + }, + { + "epoch": 7.804728903383612, + "grad_norm": 1.5161693096160889, + "learning_rate": 2.479107215654301e-06, + "loss": 3.8152, + "step": 114870 + }, + { + "epoch": 7.805068623454273, + "grad_norm": 0.16943509876728058, + "learning_rate": 2.474860714771029e-06, + "loss": 3.7087, + "step": 114875 + }, + { + "epoch": 7.8054083435249355, + "grad_norm": 0.42747098207473755, + "learning_rate": 2.470614213887757e-06, + "loss": 3.5528, + "step": 114880 + }, + { + "epoch": 7.805748063595598, + "grad_norm": 0.15678323805332184, + "learning_rate": 2.466367713004484e-06, + "loss": 3.9218, + "step": 114885 + }, + { + "epoch": 7.806087783666259, + "grad_norm": 0.16769862174987793, + "learning_rate": 2.462121212121212e-06, + "loss": 3.619, + "step": 114890 + }, + { + "epoch": 7.806427503736921, + "grad_norm": 0.14550839364528656, + "learning_rate": 2.45787471123794e-06, + "loss": 3.7803, + "step": 114895 + }, + { + "epoch": 7.806767223807583, + "grad_norm": 0.17363357543945312, + "learning_rate": 2.453628210354668e-06, + "loss": 3.8868, + "step": 114900 + }, + { + "epoch": 7.807106943878244, + "grad_norm": 0.16995951533317566, + "learning_rate": 2.4493817094713957e-06, + "loss": 3.4904, + "step": 114905 + }, + { + "epoch": 7.807446663948906, + "grad_norm": 0.17680345475673676, + "learning_rate": 2.4451352085881237e-06, + "loss": 3.8411, + "step": 114910 + }, + { + "epoch": 7.807786384019568, + "grad_norm": 0.3194452226161957, + "learning_rate": 2.4408887077048513e-06, + "loss": 3.8292, + "step": 114915 + }, + { + "epoch": 7.8081261040902294, + "grad_norm": 0.1704830676317215, + "learning_rate": 2.436642206821579e-06, + "loss": 3.7239, + "step": 114920 + }, + { + "epoch": 7.8084658241608915, + "grad_norm": 0.20578908920288086, + "learning_rate": 2.432395705938307e-06, + "loss": 3.5702, + "step": 114925 + }, + { + "epoch": 7.808805544231554, + "grad_norm": 0.2008313238620758, + "learning_rate": 2.428149205055035e-06, + "loss": 3.9674, + "step": 114930 + }, + { + "epoch": 7.809145264302215, + "grad_norm": 0.1942504644393921, + "learning_rate": 2.423902704171763e-06, + "loss": 3.7595, + "step": 114935 + }, + { + "epoch": 7.809484984372877, + "grad_norm": 0.2291940599679947, + "learning_rate": 2.4196562032884905e-06, + "loss": 3.88, + "step": 114940 + }, + { + "epoch": 7.809824704443539, + "grad_norm": 0.15458792448043823, + "learning_rate": 2.415409702405218e-06, + "loss": 3.7735, + "step": 114945 + }, + { + "epoch": 7.8101644245142, + "grad_norm": 0.14380131661891937, + "learning_rate": 2.411163201521946e-06, + "loss": 3.8997, + "step": 114950 + }, + { + "epoch": 7.810504144584862, + "grad_norm": 0.19685889780521393, + "learning_rate": 2.4069167006386736e-06, + "loss": 3.8192, + "step": 114955 + }, + { + "epoch": 7.810843864655524, + "grad_norm": 0.17829790711402893, + "learning_rate": 2.4026701997554017e-06, + "loss": 3.9496, + "step": 114960 + }, + { + "epoch": 7.8111835847261855, + "grad_norm": 0.17062287032604218, + "learning_rate": 2.3984236988721297e-06, + "loss": 3.8855, + "step": 114965 + }, + { + "epoch": 7.8115233047968475, + "grad_norm": 0.6487093567848206, + "learning_rate": 2.3941771979888572e-06, + "loss": 3.9399, + "step": 114970 + }, + { + "epoch": 7.81186302486751, + "grad_norm": 0.22935740649700165, + "learning_rate": 2.389930697105585e-06, + "loss": 3.7178, + "step": 114975 + }, + { + "epoch": 7.812202744938171, + "grad_norm": 0.20066756010055542, + "learning_rate": 2.385684196222313e-06, + "loss": 3.5266, + "step": 114980 + }, + { + "epoch": 7.812542465008833, + "grad_norm": 0.1815599501132965, + "learning_rate": 2.381437695339041e-06, + "loss": 3.8209, + "step": 114985 + }, + { + "epoch": 7.812882185079495, + "grad_norm": 0.18458899855613708, + "learning_rate": 2.377191194455769e-06, + "loss": 3.7871, + "step": 114990 + }, + { + "epoch": 7.813221905150156, + "grad_norm": 0.15376973152160645, + "learning_rate": 2.3729446935724964e-06, + "loss": 3.8943, + "step": 114995 + }, + { + "epoch": 7.813561625220818, + "grad_norm": 0.21365341544151306, + "learning_rate": 2.368698192689224e-06, + "loss": 3.8494, + "step": 115000 + }, + { + "epoch": 7.81390134529148, + "grad_norm": 0.17426592111587524, + "learning_rate": 2.364451691805952e-06, + "loss": 3.7285, + "step": 115005 + }, + { + "epoch": 7.8142410653621415, + "grad_norm": 0.18098343908786774, + "learning_rate": 2.3602051909226796e-06, + "loss": 3.9408, + "step": 115010 + }, + { + "epoch": 7.8145807854328035, + "grad_norm": 1.2612411975860596, + "learning_rate": 2.3559586900394076e-06, + "loss": 3.7466, + "step": 115015 + }, + { + "epoch": 7.814920505503466, + "grad_norm": 0.20323817431926727, + "learning_rate": 2.3517121891561356e-06, + "loss": 4.0059, + "step": 115020 + }, + { + "epoch": 7.815260225574127, + "grad_norm": 0.1592099368572235, + "learning_rate": 2.3474656882728636e-06, + "loss": 3.5771, + "step": 115025 + }, + { + "epoch": 7.815599945644789, + "grad_norm": 0.44935712218284607, + "learning_rate": 2.343219187389591e-06, + "loss": 3.6944, + "step": 115030 + }, + { + "epoch": 7.815939665715451, + "grad_norm": 0.46467629075050354, + "learning_rate": 2.338972686506319e-06, + "loss": 3.9227, + "step": 115035 + }, + { + "epoch": 7.816279385786112, + "grad_norm": 0.23005929589271545, + "learning_rate": 2.334726185623047e-06, + "loss": 3.552, + "step": 115040 + }, + { + "epoch": 7.816619105856774, + "grad_norm": 0.19023725390434265, + "learning_rate": 2.3304796847397744e-06, + "loss": 3.7689, + "step": 115045 + }, + { + "epoch": 7.816958825927435, + "grad_norm": NaN, + "learning_rate": 2.327082484033157e-06, + "loss": 3.5717, + "step": 115050 + }, + { + "epoch": 7.8172985459980975, + "grad_norm": 0.21821483969688416, + "learning_rate": 2.322835983149885e-06, + "loss": 3.9492, + "step": 115055 + }, + { + "epoch": 7.81763826606876, + "grad_norm": 0.1817469745874405, + "learning_rate": 2.3185894822666124e-06, + "loss": 4.0541, + "step": 115060 + }, + { + "epoch": 7.817977986139421, + "grad_norm": 0.16159434616565704, + "learning_rate": 2.31434298138334e-06, + "loss": 3.8848, + "step": 115065 + }, + { + "epoch": 7.818317706210083, + "grad_norm": 0.308276504278183, + "learning_rate": 2.310096480500068e-06, + "loss": 3.9018, + "step": 115070 + }, + { + "epoch": 7.818657426280745, + "grad_norm": 0.1612575799226761, + "learning_rate": 2.305849979616796e-06, + "loss": 3.5972, + "step": 115075 + }, + { + "epoch": 7.818997146351406, + "grad_norm": 0.15713390707969666, + "learning_rate": 2.3016034787335236e-06, + "loss": 3.7938, + "step": 115080 + }, + { + "epoch": 7.819336866422068, + "grad_norm": 1.408290982246399, + "learning_rate": 2.2973569778502516e-06, + "loss": 3.6045, + "step": 115085 + }, + { + "epoch": 7.81967658649273, + "grad_norm": 0.2032548487186432, + "learning_rate": 2.293110476966979e-06, + "loss": 3.7975, + "step": 115090 + }, + { + "epoch": 7.820016306563391, + "grad_norm": 0.18865394592285156, + "learning_rate": 2.288863976083707e-06, + "loss": 4.0856, + "step": 115095 + }, + { + "epoch": 7.8203560266340535, + "grad_norm": 0.15208376944065094, + "learning_rate": 2.2846174752004348e-06, + "loss": 3.7576, + "step": 115100 + }, + { + "epoch": 7.820695746704716, + "grad_norm": 0.183175727725029, + "learning_rate": 2.2803709743171628e-06, + "loss": 3.7244, + "step": 115105 + }, + { + "epoch": 7.821035466775377, + "grad_norm": 0.1723073124885559, + "learning_rate": 2.2761244734338908e-06, + "loss": 3.9606, + "step": 115110 + }, + { + "epoch": 7.821375186846039, + "grad_norm": 0.18952374160289764, + "learning_rate": 2.2718779725506184e-06, + "loss": 3.9878, + "step": 115115 + }, + { + "epoch": 7.821714906916701, + "grad_norm": 0.14674167335033417, + "learning_rate": 2.267631471667346e-06, + "loss": 3.773, + "step": 115120 + }, + { + "epoch": 7.822054626987362, + "grad_norm": 0.40260037779808044, + "learning_rate": 2.263384970784074e-06, + "loss": 3.5655, + "step": 115125 + }, + { + "epoch": 7.822394347058024, + "grad_norm": 0.2227610945701599, + "learning_rate": 2.259138469900802e-06, + "loss": 3.8141, + "step": 115130 + }, + { + "epoch": 7.822734067128686, + "grad_norm": 0.8720604181289673, + "learning_rate": 2.2548919690175295e-06, + "loss": 3.8443, + "step": 115135 + }, + { + "epoch": 7.823073787199347, + "grad_norm": 0.16066598892211914, + "learning_rate": 2.2506454681342576e-06, + "loss": 3.8268, + "step": 115140 + }, + { + "epoch": 7.8234135072700095, + "grad_norm": 0.17034178972244263, + "learning_rate": 2.2463989672509856e-06, + "loss": 3.6978, + "step": 115145 + }, + { + "epoch": 7.823753227340672, + "grad_norm": 0.16923514008522034, + "learning_rate": 2.242152466367713e-06, + "loss": 3.7899, + "step": 115150 + }, + { + "epoch": 7.824092947411333, + "grad_norm": 0.17432734370231628, + "learning_rate": 2.2379059654844407e-06, + "loss": 3.7559, + "step": 115155 + }, + { + "epoch": 7.824432667481995, + "grad_norm": 0.2720164656639099, + "learning_rate": 2.2336594646011687e-06, + "loss": 3.6623, + "step": 115160 + }, + { + "epoch": 7.824772387552657, + "grad_norm": 0.3131222128868103, + "learning_rate": 2.2294129637178967e-06, + "loss": 3.6862, + "step": 115165 + }, + { + "epoch": 7.825112107623318, + "grad_norm": 0.13594438135623932, + "learning_rate": 2.2251664628346243e-06, + "loss": 3.7301, + "step": 115170 + }, + { + "epoch": 7.82545182769398, + "grad_norm": 0.15404319763183594, + "learning_rate": 2.220919961951352e-06, + "loss": 3.7252, + "step": 115175 + }, + { + "epoch": 7.825791547764642, + "grad_norm": 0.1580883115530014, + "learning_rate": 2.21667346106808e-06, + "loss": 3.9523, + "step": 115180 + }, + { + "epoch": 7.826131267835303, + "grad_norm": 0.9948074817657471, + "learning_rate": 2.212426960184808e-06, + "loss": 3.6913, + "step": 115185 + }, + { + "epoch": 7.8264709879059655, + "grad_norm": 0.18571017682552338, + "learning_rate": 2.2081804593015355e-06, + "loss": 3.9486, + "step": 115190 + }, + { + "epoch": 7.826810707976628, + "grad_norm": 0.20124420523643494, + "learning_rate": 2.2039339584182635e-06, + "loss": 3.9522, + "step": 115195 + }, + { + "epoch": 7.827150428047289, + "grad_norm": 0.556077778339386, + "learning_rate": 2.1996874575349915e-06, + "loss": 3.9305, + "step": 115200 + }, + { + "epoch": 7.827490148117951, + "grad_norm": 0.17076562345027924, + "learning_rate": 2.195440956651719e-06, + "loss": 3.7403, + "step": 115205 + }, + { + "epoch": 7.827829868188613, + "grad_norm": 0.19211645424365997, + "learning_rate": 2.1911944557684467e-06, + "loss": 3.8065, + "step": 115210 + }, + { + "epoch": 7.828169588259274, + "grad_norm": 0.19326914846897125, + "learning_rate": 2.1869479548851747e-06, + "loss": 3.8645, + "step": 115215 + }, + { + "epoch": 7.828509308329936, + "grad_norm": 0.1886027604341507, + "learning_rate": 2.1827014540019027e-06, + "loss": 3.8414, + "step": 115220 + }, + { + "epoch": 7.828849028400598, + "grad_norm": 0.17201919853687286, + "learning_rate": 2.1784549531186303e-06, + "loss": 3.841, + "step": 115225 + }, + { + "epoch": 7.8291887484712595, + "grad_norm": 0.2140149027109146, + "learning_rate": 2.1742084522353583e-06, + "loss": 3.8604, + "step": 115230 + }, + { + "epoch": 7.8295284685419215, + "grad_norm": 0.14808686077594757, + "learning_rate": 2.169961951352086e-06, + "loss": 3.7786, + "step": 115235 + }, + { + "epoch": 7.829868188612584, + "grad_norm": 0.5188559293746948, + "learning_rate": 2.165715450468814e-06, + "loss": 3.869, + "step": 115240 + }, + { + "epoch": 7.830207908683245, + "grad_norm": 0.24982301890850067, + "learning_rate": 2.1614689495855415e-06, + "loss": 3.9036, + "step": 115245 + }, + { + "epoch": 7.830547628753907, + "grad_norm": 0.1914442628622055, + "learning_rate": 2.1572224487022695e-06, + "loss": 3.8721, + "step": 115250 + }, + { + "epoch": 7.830887348824569, + "grad_norm": 0.17950540781021118, + "learning_rate": 2.1529759478189975e-06, + "loss": 3.8007, + "step": 115255 + }, + { + "epoch": 7.83122706889523, + "grad_norm": 0.15074919164180756, + "learning_rate": 2.148729446935725e-06, + "loss": 3.8312, + "step": 115260 + }, + { + "epoch": 7.831566788965892, + "grad_norm": 0.16599604487419128, + "learning_rate": 2.1444829460524527e-06, + "loss": 3.7837, + "step": 115265 + }, + { + "epoch": 7.831906509036553, + "grad_norm": 0.17573004961013794, + "learning_rate": 2.1402364451691807e-06, + "loss": 4.0733, + "step": 115270 + }, + { + "epoch": 7.8322462291072155, + "grad_norm": 0.1437542736530304, + "learning_rate": 2.1359899442859087e-06, + "loss": 3.6943, + "step": 115275 + }, + { + "epoch": 7.8325859491778775, + "grad_norm": 0.15226280689239502, + "learning_rate": 2.1317434434026363e-06, + "loss": 3.927, + "step": 115280 + }, + { + "epoch": 7.832925669248539, + "grad_norm": 0.20939812064170837, + "learning_rate": 2.1274969425193643e-06, + "loss": 3.7446, + "step": 115285 + }, + { + "epoch": 7.833265389319201, + "grad_norm": 0.1559753119945526, + "learning_rate": 2.123250441636092e-06, + "loss": 3.8868, + "step": 115290 + }, + { + "epoch": 7.833605109389863, + "grad_norm": 0.18638287484645844, + "learning_rate": 2.11900394075282e-06, + "loss": 3.9029, + "step": 115295 + }, + { + "epoch": 7.833944829460524, + "grad_norm": 0.26596730947494507, + "learning_rate": 2.1147574398695474e-06, + "loss": 3.8782, + "step": 115300 + }, + { + "epoch": 7.834284549531186, + "grad_norm": 0.13100992143154144, + "learning_rate": 2.1105109389862755e-06, + "loss": 3.7954, + "step": 115305 + }, + { + "epoch": 7.834624269601848, + "grad_norm": 0.18048886954784393, + "learning_rate": 2.1062644381030035e-06, + "loss": 3.9087, + "step": 115310 + }, + { + "epoch": 7.834963989672509, + "grad_norm": 0.16663040220737457, + "learning_rate": 2.102017937219731e-06, + "loss": 3.5484, + "step": 115315 + }, + { + "epoch": 7.8353037097431715, + "grad_norm": 0.1858481764793396, + "learning_rate": 2.0977714363364586e-06, + "loss": 3.9851, + "step": 115320 + }, + { + "epoch": 7.8356434298138335, + "grad_norm": 0.21858792006969452, + "learning_rate": 2.0935249354531866e-06, + "loss": 3.9054, + "step": 115325 + }, + { + "epoch": 7.835983149884495, + "grad_norm": 1.2401518821716309, + "learning_rate": 2.0892784345699146e-06, + "loss": 3.7769, + "step": 115330 + }, + { + "epoch": 7.836322869955157, + "grad_norm": 0.16787903010845184, + "learning_rate": 2.0850319336866422e-06, + "loss": 3.787, + "step": 115335 + }, + { + "epoch": 7.836662590025819, + "grad_norm": 0.9357500672340393, + "learning_rate": 2.0807854328033702e-06, + "loss": 3.8192, + "step": 115340 + }, + { + "epoch": 7.83700231009648, + "grad_norm": 0.21769079566001892, + "learning_rate": 2.076538931920098e-06, + "loss": 3.8272, + "step": 115345 + }, + { + "epoch": 7.837342030167142, + "grad_norm": 0.20669998228549957, + "learning_rate": 2.072292431036826e-06, + "loss": 3.818, + "step": 115350 + }, + { + "epoch": 7.837681750237804, + "grad_norm": 0.12953872978687286, + "learning_rate": 2.0680459301535534e-06, + "loss": 3.77, + "step": 115355 + }, + { + "epoch": 7.838021470308465, + "grad_norm": 0.19848035275936127, + "learning_rate": 2.0637994292702814e-06, + "loss": 4.037, + "step": 115360 + }, + { + "epoch": 7.8383611903791275, + "grad_norm": 0.16908647119998932, + "learning_rate": 2.0595529283870094e-06, + "loss": 3.9477, + "step": 115365 + }, + { + "epoch": 7.83870091044979, + "grad_norm": 0.47565576434135437, + "learning_rate": 2.055306427503737e-06, + "loss": 3.7523, + "step": 115370 + }, + { + "epoch": 7.839040630520451, + "grad_norm": 0.20079322159290314, + "learning_rate": 2.0510599266204646e-06, + "loss": 4.069, + "step": 115375 + }, + { + "epoch": 7.839380350591113, + "grad_norm": 0.15697690844535828, + "learning_rate": 2.0468134257371926e-06, + "loss": 3.9446, + "step": 115380 + }, + { + "epoch": 7.839720070661775, + "grad_norm": 0.17516109347343445, + "learning_rate": 2.0425669248539206e-06, + "loss": 3.9356, + "step": 115385 + }, + { + "epoch": 7.840059790732436, + "grad_norm": 0.19239959120750427, + "learning_rate": 2.038320423970648e-06, + "loss": 3.7195, + "step": 115390 + }, + { + "epoch": 7.840399510803098, + "grad_norm": 0.28553131222724915, + "learning_rate": 2.034073923087376e-06, + "loss": 3.838, + "step": 115395 + }, + { + "epoch": 7.84073923087376, + "grad_norm": 0.18603533506393433, + "learning_rate": 2.0298274222041042e-06, + "loss": 3.8407, + "step": 115400 + }, + { + "epoch": 7.841078950944421, + "grad_norm": 0.18576057255268097, + "learning_rate": 2.025580921320832e-06, + "loss": 3.6927, + "step": 115405 + }, + { + "epoch": 7.8414186710150835, + "grad_norm": 0.21501852571964264, + "learning_rate": 2.0213344204375594e-06, + "loss": 3.6173, + "step": 115410 + }, + { + "epoch": 7.841758391085746, + "grad_norm": 0.16673129796981812, + "learning_rate": 2.0170879195542874e-06, + "loss": 3.7657, + "step": 115415 + }, + { + "epoch": 7.842098111156407, + "grad_norm": 0.2368869185447693, + "learning_rate": 2.0128414186710154e-06, + "loss": 3.9181, + "step": 115420 + }, + { + "epoch": 7.842437831227069, + "grad_norm": 1.0360392332077026, + "learning_rate": 2.008594917787743e-06, + "loss": 4.0135, + "step": 115425 + }, + { + "epoch": 7.842777551297731, + "grad_norm": 0.26127123832702637, + "learning_rate": 2.0043484169044706e-06, + "loss": 3.8212, + "step": 115430 + }, + { + "epoch": 7.843117271368392, + "grad_norm": 0.15558668971061707, + "learning_rate": 2.0001019160211986e-06, + "loss": 3.8843, + "step": 115435 + }, + { + "epoch": 7.843456991439054, + "grad_norm": 0.27758362889289856, + "learning_rate": 1.9958554151379266e-06, + "loss": 3.7974, + "step": 115440 + }, + { + "epoch": 7.843796711509716, + "grad_norm": 1.793770432472229, + "learning_rate": 1.991608914254654e-06, + "loss": 3.9551, + "step": 115445 + }, + { + "epoch": 7.844136431580377, + "grad_norm": 0.2132192701101303, + "learning_rate": 1.987362413371382e-06, + "loss": 3.8295, + "step": 115450 + }, + { + "epoch": 7.8444761516510395, + "grad_norm": 0.12911666929721832, + "learning_rate": 1.98311591248811e-06, + "loss": 3.6592, + "step": 115455 + }, + { + "epoch": 7.844815871721702, + "grad_norm": 0.15964646637439728, + "learning_rate": 1.9788694116048378e-06, + "loss": 3.8216, + "step": 115460 + }, + { + "epoch": 7.845155591792363, + "grad_norm": 0.1740860790014267, + "learning_rate": 1.9746229107215653e-06, + "loss": 3.7871, + "step": 115465 + }, + { + "epoch": 7.845495311863025, + "grad_norm": 0.16032209992408752, + "learning_rate": 1.9703764098382934e-06, + "loss": 3.9306, + "step": 115470 + }, + { + "epoch": 7.845835031933687, + "grad_norm": 0.1383236199617386, + "learning_rate": 1.9661299089550214e-06, + "loss": 3.8757, + "step": 115475 + }, + { + "epoch": 7.846174752004348, + "grad_norm": 0.21903368830680847, + "learning_rate": 1.961883408071749e-06, + "loss": 3.7811, + "step": 115480 + }, + { + "epoch": 7.84651447207501, + "grad_norm": 0.15101826190948486, + "learning_rate": 1.957636907188477e-06, + "loss": 3.8677, + "step": 115485 + }, + { + "epoch": 7.846854192145672, + "grad_norm": 0.18472830951213837, + "learning_rate": 1.9533904063052045e-06, + "loss": 3.7443, + "step": 115490 + }, + { + "epoch": 7.8471939122163334, + "grad_norm": 0.20109249651432037, + "learning_rate": 1.9491439054219325e-06, + "loss": 3.8467, + "step": 115495 + }, + { + "epoch": 7.8475336322869955, + "grad_norm": 0.16343627870082855, + "learning_rate": 1.94489740453866e-06, + "loss": 4.0767, + "step": 115500 + }, + { + "epoch": 7.847873352357658, + "grad_norm": 0.5087806582450867, + "learning_rate": 1.940650903655388e-06, + "loss": 3.8468, + "step": 115505 + }, + { + "epoch": 7.848213072428319, + "grad_norm": 0.22257287800312042, + "learning_rate": 1.936404402772116e-06, + "loss": 3.6908, + "step": 115510 + }, + { + "epoch": 7.848552792498981, + "grad_norm": 0.15688547492027283, + "learning_rate": 1.9321579018888437e-06, + "loss": 3.568, + "step": 115515 + }, + { + "epoch": 7.848892512569643, + "grad_norm": 0.1627071648836136, + "learning_rate": 1.9279114010055713e-06, + "loss": 3.8915, + "step": 115520 + }, + { + "epoch": 7.849232232640304, + "grad_norm": 0.16236929595470428, + "learning_rate": 1.9236649001222993e-06, + "loss": 3.5155, + "step": 115525 + }, + { + "epoch": 7.849571952710966, + "grad_norm": 0.19859275221824646, + "learning_rate": 1.9194183992390273e-06, + "loss": 3.6508, + "step": 115530 + }, + { + "epoch": 7.849911672781628, + "grad_norm": 0.16096225380897522, + "learning_rate": 1.915171898355755e-06, + "loss": 3.7328, + "step": 115535 + }, + { + "epoch": 7.8502513928522895, + "grad_norm": 0.19734124839305878, + "learning_rate": 1.910925397472483e-06, + "loss": 3.8532, + "step": 115540 + }, + { + "epoch": 7.8505911129229515, + "grad_norm": 0.22757942974567413, + "learning_rate": 1.9066788965892105e-06, + "loss": 4.0399, + "step": 115545 + }, + { + "epoch": 7.850930832993614, + "grad_norm": 2.4135022163391113, + "learning_rate": 1.9024323957059383e-06, + "loss": 3.8876, + "step": 115550 + }, + { + "epoch": 7.851270553064275, + "grad_norm": 0.7658846974372864, + "learning_rate": 1.898185894822666e-06, + "loss": 3.7066, + "step": 115555 + }, + { + "epoch": 7.851610273134937, + "grad_norm": 0.1845761388540268, + "learning_rate": 1.8939393939393941e-06, + "loss": 3.6653, + "step": 115560 + }, + { + "epoch": 7.851949993205599, + "grad_norm": 0.14530088007450104, + "learning_rate": 1.889692893056122e-06, + "loss": 3.9674, + "step": 115565 + }, + { + "epoch": 7.85228971327626, + "grad_norm": 0.15896068513393402, + "learning_rate": 1.8854463921728495e-06, + "loss": 3.8379, + "step": 115570 + }, + { + "epoch": 7.852629433346922, + "grad_norm": 0.1459939181804657, + "learning_rate": 1.8811998912895773e-06, + "loss": 4.0456, + "step": 115575 + }, + { + "epoch": 7.852969153417584, + "grad_norm": 0.13987022638320923, + "learning_rate": 1.8769533904063053e-06, + "loss": 3.931, + "step": 115580 + }, + { + "epoch": 7.8533088734882455, + "grad_norm": 0.14753757417201996, + "learning_rate": 1.872706889523033e-06, + "loss": 3.5608, + "step": 115585 + }, + { + "epoch": 7.8536485935589075, + "grad_norm": 0.1788814812898636, + "learning_rate": 1.868460388639761e-06, + "loss": 3.6957, + "step": 115590 + }, + { + "epoch": 7.85398831362957, + "grad_norm": 0.15700975060462952, + "learning_rate": 1.8642138877564889e-06, + "loss": 3.7807, + "step": 115595 + }, + { + "epoch": 7.854328033700231, + "grad_norm": 0.16075068712234497, + "learning_rate": 1.8599673868732165e-06, + "loss": 3.7053, + "step": 115600 + }, + { + "epoch": 7.854667753770893, + "grad_norm": 0.16840314865112305, + "learning_rate": 1.8557208859899443e-06, + "loss": 3.7319, + "step": 115605 + }, + { + "epoch": 7.855007473841555, + "grad_norm": 0.20353078842163086, + "learning_rate": 1.851474385106672e-06, + "loss": 3.665, + "step": 115610 + }, + { + "epoch": 7.855347193912216, + "grad_norm": 0.17659714818000793, + "learning_rate": 1.8472278842234e-06, + "loss": 3.7186, + "step": 115615 + }, + { + "epoch": 7.855686913982878, + "grad_norm": 0.16947774589061737, + "learning_rate": 1.8429813833401279e-06, + "loss": 3.6715, + "step": 115620 + }, + { + "epoch": 7.85602663405354, + "grad_norm": 0.1481778919696808, + "learning_rate": 1.8387348824568559e-06, + "loss": 3.9869, + "step": 115625 + }, + { + "epoch": 7.8563663541242015, + "grad_norm": 0.2601185142993927, + "learning_rate": 1.8344883815735833e-06, + "loss": 3.7418, + "step": 115630 + }, + { + "epoch": 7.8567060741948636, + "grad_norm": 0.38306742906570435, + "learning_rate": 1.8302418806903113e-06, + "loss": 3.9998, + "step": 115635 + }, + { + "epoch": 7.857045794265526, + "grad_norm": 0.1524161845445633, + "learning_rate": 1.825995379807039e-06, + "loss": 3.8859, + "step": 115640 + }, + { + "epoch": 7.857385514336187, + "grad_norm": 0.16350775957107544, + "learning_rate": 1.821748878923767e-06, + "loss": 3.703, + "step": 115645 + }, + { + "epoch": 7.857725234406849, + "grad_norm": 0.165310338139534, + "learning_rate": 1.8175023780404949e-06, + "loss": 3.8769, + "step": 115650 + }, + { + "epoch": 7.858064954477511, + "grad_norm": 0.16882139444351196, + "learning_rate": 1.8132558771572224e-06, + "loss": 3.8135, + "step": 115655 + }, + { + "epoch": 7.858404674548172, + "grad_norm": 0.21799154579639435, + "learning_rate": 1.8098586764506048e-06, + "loss": 3.6623, + "step": 115660 + }, + { + "epoch": 7.858744394618834, + "grad_norm": 0.1779099553823471, + "learning_rate": 1.8056121755673324e-06, + "loss": 3.7523, + "step": 115665 + }, + { + "epoch": 7.859084114689496, + "grad_norm": 0.1825561821460724, + "learning_rate": 1.8013656746840604e-06, + "loss": 3.8679, + "step": 115670 + }, + { + "epoch": 7.8594238347601575, + "grad_norm": 0.13843581080436707, + "learning_rate": 1.7971191738007882e-06, + "loss": 3.7355, + "step": 115675 + }, + { + "epoch": 7.85976355483082, + "grad_norm": 0.16008658707141876, + "learning_rate": 1.792872672917516e-06, + "loss": 3.8576, + "step": 115680 + }, + { + "epoch": 7.860103274901482, + "grad_norm": 0.2742689251899719, + "learning_rate": 1.788626172034244e-06, + "loss": 3.9928, + "step": 115685 + }, + { + "epoch": 7.860442994972143, + "grad_norm": 0.1667098104953766, + "learning_rate": 1.7843796711509714e-06, + "loss": 3.9051, + "step": 115690 + }, + { + "epoch": 7.860782715042805, + "grad_norm": 0.15737788379192352, + "learning_rate": 1.7801331702676994e-06, + "loss": 3.6494, + "step": 115695 + }, + { + "epoch": 7.861122435113467, + "grad_norm": 0.15849186480045319, + "learning_rate": 1.7758866693844272e-06, + "loss": 3.7059, + "step": 115700 + }, + { + "epoch": 7.861462155184128, + "grad_norm": 0.2119208127260208, + "learning_rate": 1.7716401685011552e-06, + "loss": 3.7989, + "step": 115705 + }, + { + "epoch": 7.86180187525479, + "grad_norm": 0.19014139473438263, + "learning_rate": 1.767393667617883e-06, + "loss": 3.8855, + "step": 115710 + }, + { + "epoch": 7.862141595325452, + "grad_norm": 0.17001889646053314, + "learning_rate": 1.7631471667346108e-06, + "loss": 3.6373, + "step": 115715 + }, + { + "epoch": 7.8624813153961135, + "grad_norm": 8.008279800415039, + "learning_rate": 1.7589006658513384e-06, + "loss": 3.7931, + "step": 115720 + }, + { + "epoch": 7.862821035466776, + "grad_norm": 0.3961554169654846, + "learning_rate": 1.7546541649680664e-06, + "loss": 3.7189, + "step": 115725 + }, + { + "epoch": 7.863160755537437, + "grad_norm": 0.2089657336473465, + "learning_rate": 1.7504076640847942e-06, + "loss": 3.815, + "step": 115730 + }, + { + "epoch": 7.863500475608099, + "grad_norm": 0.16714675724506378, + "learning_rate": 1.746161163201522e-06, + "loss": 3.5627, + "step": 115735 + }, + { + "epoch": 7.863840195678761, + "grad_norm": 0.17142915725708008, + "learning_rate": 1.74191466231825e-06, + "loss": 3.7578, + "step": 115740 + }, + { + "epoch": 7.864179915749422, + "grad_norm": 0.15953168272972107, + "learning_rate": 1.7376681614349778e-06, + "loss": 3.938, + "step": 115745 + }, + { + "epoch": 7.864519635820084, + "grad_norm": 0.2192162275314331, + "learning_rate": 1.7334216605517054e-06, + "loss": 3.7593, + "step": 115750 + }, + { + "epoch": 7.864859355890746, + "grad_norm": 0.2340209037065506, + "learning_rate": 1.7291751596684332e-06, + "loss": 3.9622, + "step": 115755 + }, + { + "epoch": 7.865199075961407, + "grad_norm": 0.1496942788362503, + "learning_rate": 1.7249286587851612e-06, + "loss": 3.9418, + "step": 115760 + }, + { + "epoch": 7.8655387960320695, + "grad_norm": 0.16416987776756287, + "learning_rate": 1.720682157901889e-06, + "loss": 3.8654, + "step": 115765 + }, + { + "epoch": 7.865878516102732, + "grad_norm": 0.1708347201347351, + "learning_rate": 1.7164356570186168e-06, + "loss": 3.9323, + "step": 115770 + }, + { + "epoch": 7.866218236173393, + "grad_norm": 0.15936174988746643, + "learning_rate": 1.7121891561353444e-06, + "loss": 3.6685, + "step": 115775 + }, + { + "epoch": 7.866557956244055, + "grad_norm": 0.16805687546730042, + "learning_rate": 1.7079426552520722e-06, + "loss": 3.8705, + "step": 115780 + }, + { + "epoch": 7.866897676314717, + "grad_norm": 0.23999804258346558, + "learning_rate": 1.7036961543688002e-06, + "loss": 3.8663, + "step": 115785 + }, + { + "epoch": 7.867237396385378, + "grad_norm": 0.1615130454301834, + "learning_rate": 1.699449653485528e-06, + "loss": 3.9724, + "step": 115790 + }, + { + "epoch": 7.86757711645604, + "grad_norm": 0.17682549357414246, + "learning_rate": 1.695203152602256e-06, + "loss": 4.0461, + "step": 115795 + }, + { + "epoch": 7.867916836526702, + "grad_norm": 0.15635935962200165, + "learning_rate": 1.6909566517189838e-06, + "loss": 3.7194, + "step": 115800 + }, + { + "epoch": 7.8682565565973634, + "grad_norm": 0.19015200436115265, + "learning_rate": 1.6867101508357114e-06, + "loss": 3.8379, + "step": 115805 + }, + { + "epoch": 7.8685962766680255, + "grad_norm": 0.2743775248527527, + "learning_rate": 1.6824636499524392e-06, + "loss": 4.0052, + "step": 115810 + }, + { + "epoch": 7.868935996738688, + "grad_norm": 0.45265713334083557, + "learning_rate": 1.6782171490691672e-06, + "loss": 3.8635, + "step": 115815 + }, + { + "epoch": 7.869275716809349, + "grad_norm": 0.1490425020456314, + "learning_rate": 1.673970648185895e-06, + "loss": 3.9989, + "step": 115820 + }, + { + "epoch": 7.869615436880011, + "grad_norm": 0.8032705783843994, + "learning_rate": 1.6697241473026228e-06, + "loss": 3.7833, + "step": 115825 + }, + { + "epoch": 7.869955156950673, + "grad_norm": 0.16514045000076294, + "learning_rate": 1.6654776464193508e-06, + "loss": 4.1033, + "step": 115830 + }, + { + "epoch": 7.870294877021334, + "grad_norm": 0.24825578927993774, + "learning_rate": 1.6612311455360781e-06, + "loss": 3.6458, + "step": 115835 + }, + { + "epoch": 7.870634597091996, + "grad_norm": 0.2208424210548401, + "learning_rate": 1.6569846446528061e-06, + "loss": 3.7391, + "step": 115840 + }, + { + "epoch": 7.870974317162658, + "grad_norm": 0.6417048573493958, + "learning_rate": 1.652738143769534e-06, + "loss": 3.8112, + "step": 115845 + }, + { + "epoch": 7.8713140372333195, + "grad_norm": 0.23607192933559418, + "learning_rate": 1.648491642886262e-06, + "loss": 3.7467, + "step": 115850 + }, + { + "epoch": 7.8716537573039815, + "grad_norm": 0.2020142674446106, + "learning_rate": 1.6442451420029897e-06, + "loss": 3.6515, + "step": 115855 + }, + { + "epoch": 7.871993477374644, + "grad_norm": 0.18760383129119873, + "learning_rate": 1.6399986411197173e-06, + "loss": 3.8732, + "step": 115860 + }, + { + "epoch": 7.872333197445305, + "grad_norm": 0.1674005687236786, + "learning_rate": 1.6357521402364451e-06, + "loss": 3.8958, + "step": 115865 + }, + { + "epoch": 7.872672917515967, + "grad_norm": 0.17931732535362244, + "learning_rate": 1.6315056393531731e-06, + "loss": 3.8676, + "step": 115870 + }, + { + "epoch": 7.873012637586629, + "grad_norm": 0.3218427896499634, + "learning_rate": 1.627259138469901e-06, + "loss": 3.9777, + "step": 115875 + }, + { + "epoch": 7.87335235765729, + "grad_norm": 0.13260626792907715, + "learning_rate": 1.6230126375866287e-06, + "loss": 3.7885, + "step": 115880 + }, + { + "epoch": 7.873692077727952, + "grad_norm": 0.17373421788215637, + "learning_rate": 1.6187661367033567e-06, + "loss": 4.0071, + "step": 115885 + }, + { + "epoch": 7.874031797798614, + "grad_norm": 0.15977302193641663, + "learning_rate": 1.614519635820084e-06, + "loss": 3.8514, + "step": 115890 + }, + { + "epoch": 7.8743715178692755, + "grad_norm": 0.1941603273153305, + "learning_rate": 1.610273134936812e-06, + "loss": 3.7676, + "step": 115895 + }, + { + "epoch": 7.8747112379399375, + "grad_norm": 0.18742066621780396, + "learning_rate": 1.60602663405354e-06, + "loss": 3.7257, + "step": 115900 + }, + { + "epoch": 7.8750509580106, + "grad_norm": 0.15565814077854156, + "learning_rate": 1.601780133170268e-06, + "loss": 3.7799, + "step": 115905 + }, + { + "epoch": 7.875390678081261, + "grad_norm": 0.15320853888988495, + "learning_rate": 1.5975336322869957e-06, + "loss": 3.9483, + "step": 115910 + }, + { + "epoch": 7.875730398151923, + "grad_norm": 0.18914549052715302, + "learning_rate": 1.5932871314037235e-06, + "loss": 3.7804, + "step": 115915 + }, + { + "epoch": 7.876070118222585, + "grad_norm": 0.16576318442821503, + "learning_rate": 1.589040630520451e-06, + "loss": 3.792, + "step": 115920 + }, + { + "epoch": 7.876409838293246, + "grad_norm": 0.1886059045791626, + "learning_rate": 1.5847941296371789e-06, + "loss": 3.9102, + "step": 115925 + }, + { + "epoch": 7.876749558363908, + "grad_norm": 0.35218745470046997, + "learning_rate": 1.5805476287539069e-06, + "loss": 3.9178, + "step": 115930 + }, + { + "epoch": 7.87708927843457, + "grad_norm": 0.24168403446674347, + "learning_rate": 1.5763011278706347e-06, + "loss": 3.5108, + "step": 115935 + }, + { + "epoch": 7.8774289985052315, + "grad_norm": 0.18502888083457947, + "learning_rate": 1.5720546269873627e-06, + "loss": 3.6535, + "step": 115940 + }, + { + "epoch": 7.877768718575894, + "grad_norm": 0.48280733823776245, + "learning_rate": 1.56780812610409e-06, + "loss": 3.871, + "step": 115945 + }, + { + "epoch": 7.878108438646555, + "grad_norm": 0.23183463513851166, + "learning_rate": 1.563561625220818e-06, + "loss": 3.6898, + "step": 115950 + }, + { + "epoch": 7.878448158717217, + "grad_norm": 0.29964011907577515, + "learning_rate": 1.5593151243375459e-06, + "loss": 3.6645, + "step": 115955 + }, + { + "epoch": 7.878787878787879, + "grad_norm": 0.16540713608264923, + "learning_rate": 1.5550686234542739e-06, + "loss": 3.7296, + "step": 115960 + }, + { + "epoch": 7.87912759885854, + "grad_norm": 0.3400106132030487, + "learning_rate": 1.5508221225710017e-06, + "loss": 3.9022, + "step": 115965 + }, + { + "epoch": 7.879467318929202, + "grad_norm": 0.17198190093040466, + "learning_rate": 1.5465756216877293e-06, + "loss": 3.685, + "step": 115970 + }, + { + "epoch": 7.879807038999864, + "grad_norm": 0.21182386577129364, + "learning_rate": 1.5423291208044573e-06, + "loss": 3.8958, + "step": 115975 + }, + { + "epoch": 7.880146759070525, + "grad_norm": 0.16231518983840942, + "learning_rate": 1.5380826199211848e-06, + "loss": 3.8691, + "step": 115980 + }, + { + "epoch": 7.8804864791411875, + "grad_norm": 0.15177547931671143, + "learning_rate": 1.5338361190379129e-06, + "loss": 3.7817, + "step": 115985 + }, + { + "epoch": 7.88082619921185, + "grad_norm": 0.3809646964073181, + "learning_rate": 1.5295896181546407e-06, + "loss": 3.8596, + "step": 115990 + }, + { + "epoch": 7.881165919282511, + "grad_norm": 0.20320940017700195, + "learning_rate": 1.5253431172713684e-06, + "loss": 3.8805, + "step": 115995 + }, + { + "epoch": 7.881505639353173, + "grad_norm": 0.22772137820720673, + "learning_rate": 1.5210966163880962e-06, + "loss": 3.7768, + "step": 116000 + }, + { + "epoch": 7.881845359423835, + "grad_norm": 0.20089170336723328, + "learning_rate": 1.5168501155048243e-06, + "loss": 3.6444, + "step": 116005 + }, + { + "epoch": 7.882185079494496, + "grad_norm": 0.1601470559835434, + "learning_rate": 1.5126036146215518e-06, + "loss": 3.8442, + "step": 116010 + }, + { + "epoch": 7.882524799565158, + "grad_norm": 0.2130550742149353, + "learning_rate": 1.5083571137382796e-06, + "loss": 3.9031, + "step": 116015 + }, + { + "epoch": 7.88286451963582, + "grad_norm": 0.16107559204101562, + "learning_rate": 1.5041106128550076e-06, + "loss": 4.0138, + "step": 116020 + }, + { + "epoch": 7.883204239706481, + "grad_norm": 0.1798521727323532, + "learning_rate": 1.4998641119717352e-06, + "loss": 3.9506, + "step": 116025 + }, + { + "epoch": 7.8835439597771435, + "grad_norm": 0.15807956457138062, + "learning_rate": 1.4956176110884632e-06, + "loss": 3.9539, + "step": 116030 + }, + { + "epoch": 7.883883679847806, + "grad_norm": 0.44516900181770325, + "learning_rate": 1.491371110205191e-06, + "loss": 3.8437, + "step": 116035 + }, + { + "epoch": 7.884223399918467, + "grad_norm": 0.15080706775188446, + "learning_rate": 1.4871246093219188e-06, + "loss": 4.0411, + "step": 116040 + }, + { + "epoch": 7.884563119989129, + "grad_norm": 0.178952157497406, + "learning_rate": 1.4828781084386466e-06, + "loss": 3.577, + "step": 116045 + }, + { + "epoch": 7.884902840059791, + "grad_norm": 0.18181796371936798, + "learning_rate": 1.4786316075553746e-06, + "loss": 4.031, + "step": 116050 + }, + { + "epoch": 7.885242560130452, + "grad_norm": 0.14518819749355316, + "learning_rate": 1.4743851066721022e-06, + "loss": 3.8595, + "step": 116055 + }, + { + "epoch": 7.885582280201114, + "grad_norm": 0.17425918579101562, + "learning_rate": 1.47013860578883e-06, + "loss": 3.9581, + "step": 116060 + }, + { + "epoch": 7.885922000271776, + "grad_norm": 0.17922067642211914, + "learning_rate": 1.4658921049055578e-06, + "loss": 3.9873, + "step": 116065 + }, + { + "epoch": 7.886261720342437, + "grad_norm": 0.1371597945690155, + "learning_rate": 1.4616456040222856e-06, + "loss": 3.8208, + "step": 116070 + }, + { + "epoch": 7.8866014404130995, + "grad_norm": 0.16405847668647766, + "learning_rate": 1.4573991031390136e-06, + "loss": 3.6678, + "step": 116075 + }, + { + "epoch": 7.886941160483762, + "grad_norm": 0.22029268741607666, + "learning_rate": 1.4531526022557412e-06, + "loss": 3.9544, + "step": 116080 + }, + { + "epoch": 7.887280880554423, + "grad_norm": 0.19403433799743652, + "learning_rate": 1.4489061013724692e-06, + "loss": 3.7026, + "step": 116085 + }, + { + "epoch": 7.887620600625085, + "grad_norm": 0.17616619169712067, + "learning_rate": 1.444659600489197e-06, + "loss": 3.8871, + "step": 116090 + }, + { + "epoch": 7.887960320695747, + "grad_norm": 0.17625005543231964, + "learning_rate": 1.4404130996059248e-06, + "loss": 3.6753, + "step": 116095 + }, + { + "epoch": 7.888300040766408, + "grad_norm": 0.19418418407440186, + "learning_rate": 1.4361665987226526e-06, + "loss": 3.9282, + "step": 116100 + }, + { + "epoch": 7.88863976083707, + "grad_norm": 8.285860061645508, + "learning_rate": 1.4319200978393804e-06, + "loss": 3.8537, + "step": 116105 + }, + { + "epoch": 7.888979480907732, + "grad_norm": 0.18206679821014404, + "learning_rate": 1.4276735969561082e-06, + "loss": 3.6331, + "step": 116110 + }, + { + "epoch": 7.8893192009783935, + "grad_norm": 0.33881255984306335, + "learning_rate": 1.423427096072836e-06, + "loss": 3.7995, + "step": 116115 + }, + { + "epoch": 7.8896589210490555, + "grad_norm": 0.1886754184961319, + "learning_rate": 1.419180595189564e-06, + "loss": 3.8057, + "step": 116120 + }, + { + "epoch": 7.889998641119718, + "grad_norm": 0.1676386445760727, + "learning_rate": 1.4149340943062916e-06, + "loss": 3.8009, + "step": 116125 + }, + { + "epoch": 7.890338361190379, + "grad_norm": 0.14125476777553558, + "learning_rate": 1.4106875934230196e-06, + "loss": 4.0566, + "step": 116130 + }, + { + "epoch": 7.890678081261041, + "grad_norm": 0.1891328990459442, + "learning_rate": 1.4064410925397474e-06, + "loss": 3.9416, + "step": 116135 + }, + { + "epoch": 7.891017801331703, + "grad_norm": 0.19417628645896912, + "learning_rate": 1.4021945916564752e-06, + "loss": 3.5851, + "step": 116140 + }, + { + "epoch": 7.891357521402364, + "grad_norm": 0.18279825150966644, + "learning_rate": 1.397948090773203e-06, + "loss": 3.654, + "step": 116145 + }, + { + "epoch": 7.891697241473026, + "grad_norm": 0.22396333515644073, + "learning_rate": 1.3937015898899308e-06, + "loss": 3.7531, + "step": 116150 + }, + { + "epoch": 7.892036961543688, + "grad_norm": 0.15392985939979553, + "learning_rate": 1.3894550890066586e-06, + "loss": 3.7747, + "step": 116155 + }, + { + "epoch": 7.8923766816143495, + "grad_norm": 0.1545015126466751, + "learning_rate": 1.3852085881233864e-06, + "loss": 3.7611, + "step": 116160 + }, + { + "epoch": 7.8927164016850115, + "grad_norm": 0.20304299890995026, + "learning_rate": 1.3809620872401141e-06, + "loss": 3.8572, + "step": 116165 + }, + { + "epoch": 7.893056121755674, + "grad_norm": 0.13265854120254517, + "learning_rate": 1.376715586356842e-06, + "loss": 3.8421, + "step": 116170 + }, + { + "epoch": 7.893395841826335, + "grad_norm": 0.14765799045562744, + "learning_rate": 1.37246908547357e-06, + "loss": 3.8729, + "step": 116175 + }, + { + "epoch": 7.893735561896997, + "grad_norm": 0.16188237071037292, + "learning_rate": 1.3682225845902975e-06, + "loss": 3.8954, + "step": 116180 + }, + { + "epoch": 7.894075281967659, + "grad_norm": 0.15677890181541443, + "learning_rate": 1.3639760837070255e-06, + "loss": 3.8902, + "step": 116185 + }, + { + "epoch": 7.89441500203832, + "grad_norm": 0.14045758545398712, + "learning_rate": 1.3597295828237533e-06, + "loss": 3.7779, + "step": 116190 + }, + { + "epoch": 7.894754722108982, + "grad_norm": 0.2198839783668518, + "learning_rate": 1.3554830819404811e-06, + "loss": 3.7279, + "step": 116195 + }, + { + "epoch": 7.895094442179644, + "grad_norm": 0.18900440633296967, + "learning_rate": 1.351236581057209e-06, + "loss": 3.7634, + "step": 116200 + }, + { + "epoch": 7.8954341622503055, + "grad_norm": 0.458397775888443, + "learning_rate": 1.3469900801739367e-06, + "loss": 3.9951, + "step": 116205 + }, + { + "epoch": 7.8957738823209676, + "grad_norm": 0.15128721296787262, + "learning_rate": 1.3427435792906645e-06, + "loss": 3.5556, + "step": 116210 + }, + { + "epoch": 7.89611360239163, + "grad_norm": 0.14768928289413452, + "learning_rate": 1.3384970784073923e-06, + "loss": 3.9156, + "step": 116215 + }, + { + "epoch": 7.896453322462291, + "grad_norm": 0.2021690458059311, + "learning_rate": 1.3342505775241201e-06, + "loss": 3.8095, + "step": 116220 + }, + { + "epoch": 7.896793042532953, + "grad_norm": 0.20424900949001312, + "learning_rate": 1.330004076640848e-06, + "loss": 3.7112, + "step": 116225 + }, + { + "epoch": 7.897132762603615, + "grad_norm": 0.1780293732881546, + "learning_rate": 1.325757575757576e-06, + "loss": 3.6994, + "step": 116230 + }, + { + "epoch": 7.897472482674276, + "grad_norm": 0.1736258566379547, + "learning_rate": 1.3215110748743035e-06, + "loss": 3.8329, + "step": 116235 + }, + { + "epoch": 7.897812202744938, + "grad_norm": 0.16770756244659424, + "learning_rate": 1.3172645739910315e-06, + "loss": 3.6385, + "step": 116240 + }, + { + "epoch": 7.8981519228156, + "grad_norm": 0.8674100637435913, + "learning_rate": 1.3130180731077593e-06, + "loss": 3.9159, + "step": 116245 + }, + { + "epoch": 7.8984916428862615, + "grad_norm": 0.19839626550674438, + "learning_rate": 1.308771572224487e-06, + "loss": 3.8115, + "step": 116250 + }, + { + "epoch": 7.898831362956924, + "grad_norm": 0.23102207481861115, + "learning_rate": 1.304525071341215e-06, + "loss": 3.7508, + "step": 116255 + }, + { + "epoch": 7.899171083027586, + "grad_norm": 0.1530604213476181, + "learning_rate": 1.3002785704579427e-06, + "loss": 3.6019, + "step": 116260 + }, + { + "epoch": 7.899510803098247, + "grad_norm": 0.1641506552696228, + "learning_rate": 1.2960320695746705e-06, + "loss": 3.8668, + "step": 116265 + }, + { + "epoch": 7.899850523168909, + "grad_norm": 0.18931844830513, + "learning_rate": 1.2917855686913983e-06, + "loss": 4.0085, + "step": 116270 + }, + { + "epoch": 7.900190243239571, + "grad_norm": 0.2029097080230713, + "learning_rate": 1.2875390678081263e-06, + "loss": 3.7679, + "step": 116275 + }, + { + "epoch": 7.900529963310232, + "grad_norm": 0.16964520514011383, + "learning_rate": 1.2832925669248539e-06, + "loss": 3.7747, + "step": 116280 + }, + { + "epoch": 7.900869683380894, + "grad_norm": 0.23619960248470306, + "learning_rate": 1.2790460660415819e-06, + "loss": 3.8116, + "step": 116285 + }, + { + "epoch": 7.901209403451556, + "grad_norm": 0.16135098040103912, + "learning_rate": 1.2747995651583097e-06, + "loss": 3.8948, + "step": 116290 + }, + { + "epoch": 7.9015491235222175, + "grad_norm": 0.1677645593881607, + "learning_rate": 1.2705530642750375e-06, + "loss": 3.7508, + "step": 116295 + }, + { + "epoch": 7.90188884359288, + "grad_norm": 0.19557127356529236, + "learning_rate": 1.2663065633917653e-06, + "loss": 3.81, + "step": 116300 + }, + { + "epoch": 7.902228563663542, + "grad_norm": 0.16776186227798462, + "learning_rate": 1.262060062508493e-06, + "loss": 3.531, + "step": 116305 + }, + { + "epoch": 7.902568283734203, + "grad_norm": 0.2981445789337158, + "learning_rate": 1.2578135616252209e-06, + "loss": 3.756, + "step": 116310 + }, + { + "epoch": 7.902908003804865, + "grad_norm": 0.20716220140457153, + "learning_rate": 1.2535670607419487e-06, + "loss": 3.8202, + "step": 116315 + }, + { + "epoch": 7.903247723875527, + "grad_norm": 0.15512236952781677, + "learning_rate": 1.2493205598586765e-06, + "loss": 3.9775, + "step": 116320 + }, + { + "epoch": 7.903587443946188, + "grad_norm": 0.1681266874074936, + "learning_rate": 1.2450740589754043e-06, + "loss": 3.8483, + "step": 116325 + }, + { + "epoch": 7.90392716401685, + "grad_norm": 0.13666456937789917, + "learning_rate": 1.2408275580921323e-06, + "loss": 3.3817, + "step": 116330 + }, + { + "epoch": 7.904266884087512, + "grad_norm": 0.1475512683391571, + "learning_rate": 1.2365810572088598e-06, + "loss": 3.8398, + "step": 116335 + }, + { + "epoch": 7.9046066041581735, + "grad_norm": 0.1893518567085266, + "learning_rate": 1.2323345563255879e-06, + "loss": 3.799, + "step": 116340 + }, + { + "epoch": 7.904946324228836, + "grad_norm": 0.18604877591133118, + "learning_rate": 1.2280880554423156e-06, + "loss": 3.7216, + "step": 116345 + }, + { + "epoch": 7.905286044299498, + "grad_norm": 0.20884139835834503, + "learning_rate": 1.2238415545590434e-06, + "loss": 4.0369, + "step": 116350 + }, + { + "epoch": 7.905625764370159, + "grad_norm": 0.17566531896591187, + "learning_rate": 1.2195950536757712e-06, + "loss": 3.8525, + "step": 116355 + }, + { + "epoch": 7.905965484440821, + "grad_norm": 0.21450378000736237, + "learning_rate": 1.215348552792499e-06, + "loss": 3.7394, + "step": 116360 + }, + { + "epoch": 7.906305204511483, + "grad_norm": 0.18849922716617584, + "learning_rate": 1.2111020519092268e-06, + "loss": 3.9867, + "step": 116365 + }, + { + "epoch": 7.906644924582144, + "grad_norm": 0.15513326227664948, + "learning_rate": 1.2068555510259546e-06, + "loss": 3.8663, + "step": 116370 + }, + { + "epoch": 7.906984644652806, + "grad_norm": 0.16502848267555237, + "learning_rate": 1.2026090501426826e-06, + "loss": 3.579, + "step": 116375 + }, + { + "epoch": 7.907324364723468, + "grad_norm": 0.15573468804359436, + "learning_rate": 1.1983625492594102e-06, + "loss": 3.5775, + "step": 116380 + }, + { + "epoch": 7.9076640847941295, + "grad_norm": 0.19350413978099823, + "learning_rate": 1.1941160483761382e-06, + "loss": 3.955, + "step": 116385 + }, + { + "epoch": 7.908003804864792, + "grad_norm": 0.17238081991672516, + "learning_rate": 1.1898695474928658e-06, + "loss": 3.6446, + "step": 116390 + }, + { + "epoch": 7.908343524935454, + "grad_norm": 0.16848570108413696, + "learning_rate": 1.1856230466095938e-06, + "loss": 3.8192, + "step": 116395 + }, + { + "epoch": 7.908683245006115, + "grad_norm": 0.18310546875, + "learning_rate": 1.1813765457263216e-06, + "loss": 3.6859, + "step": 116400 + }, + { + "epoch": 7.909022965076777, + "grad_norm": 0.16684044897556305, + "learning_rate": 1.1771300448430494e-06, + "loss": 3.8417, + "step": 116405 + }, + { + "epoch": 7.909362685147439, + "grad_norm": 0.19005638360977173, + "learning_rate": 1.1728835439597772e-06, + "loss": 3.7962, + "step": 116410 + }, + { + "epoch": 7.9097024052181, + "grad_norm": 0.17635780572891235, + "learning_rate": 1.168637043076505e-06, + "loss": 3.7291, + "step": 116415 + }, + { + "epoch": 7.910042125288762, + "grad_norm": 0.17315322160720825, + "learning_rate": 1.1643905421932328e-06, + "loss": 3.8316, + "step": 116420 + }, + { + "epoch": 7.9103818453594235, + "grad_norm": 0.2474949061870575, + "learning_rate": 1.1601440413099606e-06, + "loss": 3.7937, + "step": 116425 + }, + { + "epoch": 7.9107215654300855, + "grad_norm": 0.4238426387310028, + "learning_rate": 1.1558975404266886e-06, + "loss": 4.0268, + "step": 116430 + }, + { + "epoch": 7.911061285500748, + "grad_norm": 0.18684951961040497, + "learning_rate": 1.1516510395434162e-06, + "loss": 3.8241, + "step": 116435 + }, + { + "epoch": 7.911401005571409, + "grad_norm": 0.1540907621383667, + "learning_rate": 1.1474045386601442e-06, + "loss": 4.005, + "step": 116440 + }, + { + "epoch": 7.911740725642071, + "grad_norm": 0.5831632614135742, + "learning_rate": 1.143158037776872e-06, + "loss": 3.7838, + "step": 116445 + }, + { + "epoch": 7.912080445712733, + "grad_norm": 0.21649765968322754, + "learning_rate": 1.1389115368935998e-06, + "loss": 3.7848, + "step": 116450 + }, + { + "epoch": 7.912420165783394, + "grad_norm": 0.1592056006193161, + "learning_rate": 1.1346650360103276e-06, + "loss": 4.0368, + "step": 116455 + }, + { + "epoch": 7.912759885854056, + "grad_norm": 0.20974864065647125, + "learning_rate": 1.1304185351270554e-06, + "loss": 3.923, + "step": 116460 + }, + { + "epoch": 7.913099605924718, + "grad_norm": 0.22162461280822754, + "learning_rate": 1.1261720342437832e-06, + "loss": 3.5833, + "step": 116465 + }, + { + "epoch": 7.9134393259953795, + "grad_norm": 0.2877512574195862, + "learning_rate": 1.121925533360511e-06, + "loss": 3.7006, + "step": 116470 + }, + { + "epoch": 7.9137790460660415, + "grad_norm": 0.14858470857143402, + "learning_rate": 1.1176790324772388e-06, + "loss": 4.1058, + "step": 116475 + }, + { + "epoch": 7.914118766136704, + "grad_norm": 0.1660924106836319, + "learning_rate": 1.1134325315939666e-06, + "loss": 3.769, + "step": 116480 + }, + { + "epoch": 7.914458486207365, + "grad_norm": 0.17613141238689423, + "learning_rate": 1.1091860307106946e-06, + "loss": 3.9349, + "step": 116485 + }, + { + "epoch": 7.914798206278027, + "grad_norm": 0.2721254527568817, + "learning_rate": 1.1049395298274222e-06, + "loss": 3.8294, + "step": 116490 + }, + { + "epoch": 7.915137926348689, + "grad_norm": 0.29328685998916626, + "learning_rate": 1.1006930289441502e-06, + "loss": 3.6205, + "step": 116495 + }, + { + "epoch": 7.91547764641935, + "grad_norm": 0.18419599533081055, + "learning_rate": 1.096446528060878e-06, + "loss": 3.752, + "step": 116500 + }, + { + "epoch": 7.915817366490012, + "grad_norm": 0.18969261646270752, + "learning_rate": 1.0922000271776055e-06, + "loss": 3.777, + "step": 116505 + }, + { + "epoch": 7.916157086560674, + "grad_norm": 0.16315753757953644, + "learning_rate": 1.0879535262943336e-06, + "loss": 3.5559, + "step": 116510 + }, + { + "epoch": 7.9164968066313355, + "grad_norm": 0.18335498869419098, + "learning_rate": 1.0837070254110613e-06, + "loss": 3.8914, + "step": 116515 + }, + { + "epoch": 7.916836526701998, + "grad_norm": 0.17289844155311584, + "learning_rate": 1.0794605245277891e-06, + "loss": 3.9823, + "step": 116520 + }, + { + "epoch": 7.91717624677266, + "grad_norm": 0.21323077380657196, + "learning_rate": 1.075214023644517e-06, + "loss": 3.9192, + "step": 116525 + }, + { + "epoch": 7.917515966843321, + "grad_norm": 0.15475361049175262, + "learning_rate": 1.070967522761245e-06, + "loss": 3.8463, + "step": 116530 + }, + { + "epoch": 7.917855686913983, + "grad_norm": 0.24270327389240265, + "learning_rate": 1.0667210218779725e-06, + "loss": 3.9798, + "step": 116535 + }, + { + "epoch": 7.918195406984645, + "grad_norm": 0.22119246423244476, + "learning_rate": 1.0624745209947005e-06, + "loss": 3.9069, + "step": 116540 + }, + { + "epoch": 7.918535127055306, + "grad_norm": 0.25975367426872253, + "learning_rate": 1.0582280201114283e-06, + "loss": 4.0374, + "step": 116545 + }, + { + "epoch": 7.918874847125968, + "grad_norm": 0.1483907550573349, + "learning_rate": 1.053981519228156e-06, + "loss": 3.8186, + "step": 116550 + }, + { + "epoch": 7.91921456719663, + "grad_norm": 0.14977385103702545, + "learning_rate": 1.049735018344884e-06, + "loss": 3.85, + "step": 116555 + }, + { + "epoch": 7.9195542872672915, + "grad_norm": 0.1864573210477829, + "learning_rate": 1.0454885174616115e-06, + "loss": 3.7053, + "step": 116560 + }, + { + "epoch": 7.919894007337954, + "grad_norm": 0.2159656435251236, + "learning_rate": 1.0412420165783395e-06, + "loss": 3.7804, + "step": 116565 + }, + { + "epoch": 7.920233727408616, + "grad_norm": 0.38630741834640503, + "learning_rate": 1.0369955156950673e-06, + "loss": 3.7689, + "step": 116570 + }, + { + "epoch": 7.920573447479277, + "grad_norm": 0.1766095757484436, + "learning_rate": 1.0327490148117951e-06, + "loss": 3.9366, + "step": 116575 + }, + { + "epoch": 7.920913167549939, + "grad_norm": 0.16967034339904785, + "learning_rate": 1.028502513928523e-06, + "loss": 3.7693, + "step": 116580 + }, + { + "epoch": 7.921252887620601, + "grad_norm": 0.584672212600708, + "learning_rate": 1.024256013045251e-06, + "loss": 3.8803, + "step": 116585 + }, + { + "epoch": 7.921592607691262, + "grad_norm": 0.2457764893770218, + "learning_rate": 1.0200095121619785e-06, + "loss": 3.6787, + "step": 116590 + }, + { + "epoch": 7.921932327761924, + "grad_norm": 0.1519096940755844, + "learning_rate": 1.0157630112787063e-06, + "loss": 3.872, + "step": 116595 + }, + { + "epoch": 7.922272047832586, + "grad_norm": 0.1565910428762436, + "learning_rate": 1.0115165103954343e-06, + "loss": 3.8046, + "step": 116600 + }, + { + "epoch": 7.9226117679032475, + "grad_norm": 0.18558458983898163, + "learning_rate": 1.0072700095121619e-06, + "loss": 3.7812, + "step": 116605 + }, + { + "epoch": 7.92295148797391, + "grad_norm": 0.20091262459754944, + "learning_rate": 1.0030235086288899e-06, + "loss": 3.59, + "step": 116610 + }, + { + "epoch": 7.923291208044572, + "grad_norm": 0.23446772992610931, + "learning_rate": 9.987770077456177e-07, + "loss": 3.8779, + "step": 116615 + }, + { + "epoch": 7.923630928115233, + "grad_norm": 0.15632714331150055, + "learning_rate": 9.945305068623455e-07, + "loss": 4.0247, + "step": 116620 + }, + { + "epoch": 7.923970648185895, + "grad_norm": 0.2171819657087326, + "learning_rate": 9.902840059790733e-07, + "loss": 3.6914, + "step": 116625 + }, + { + "epoch": 7.924310368256556, + "grad_norm": 0.1497294008731842, + "learning_rate": 9.860375050958013e-07, + "loss": 3.7015, + "step": 116630 + }, + { + "epoch": 7.924650088327218, + "grad_norm": 0.1970817595720291, + "learning_rate": 9.817910042125289e-07, + "loss": 3.8637, + "step": 116635 + }, + { + "epoch": 7.92498980839788, + "grad_norm": 0.4740430414676666, + "learning_rate": 9.775445033292569e-07, + "loss": 3.9716, + "step": 116640 + }, + { + "epoch": 7.925329528468541, + "grad_norm": 0.17363134026527405, + "learning_rate": 9.732980024459845e-07, + "loss": 3.9302, + "step": 116645 + }, + { + "epoch": 7.9256692485392035, + "grad_norm": 0.16657139360904694, + "learning_rate": 9.690515015627123e-07, + "loss": 3.8819, + "step": 116650 + }, + { + "epoch": 7.926008968609866, + "grad_norm": 0.2039119452238083, + "learning_rate": 9.648050006794403e-07, + "loss": 4.0902, + "step": 116655 + }, + { + "epoch": 7.926348688680527, + "grad_norm": 0.1939667910337448, + "learning_rate": 9.605584997961679e-07, + "loss": 3.8422, + "step": 116660 + }, + { + "epoch": 7.926688408751189, + "grad_norm": 0.1554241180419922, + "learning_rate": 9.563119989128959e-07, + "loss": 3.9442, + "step": 116665 + }, + { + "epoch": 7.927028128821851, + "grad_norm": 2.2107465267181396, + "learning_rate": 9.520654980296237e-07, + "loss": 3.8069, + "step": 116670 + }, + { + "epoch": 7.927367848892512, + "grad_norm": 0.18461725115776062, + "learning_rate": 9.478189971463513e-07, + "loss": 3.9452, + "step": 116675 + }, + { + "epoch": 7.927707568963174, + "grad_norm": 0.5740981698036194, + "learning_rate": 9.435724962630792e-07, + "loss": 3.7524, + "step": 116680 + }, + { + "epoch": 7.928047289033836, + "grad_norm": 0.16115793585777283, + "learning_rate": 9.393259953798071e-07, + "loss": 3.8063, + "step": 116685 + }, + { + "epoch": 7.9283870091044975, + "grad_norm": 0.21771611273288727, + "learning_rate": 9.350794944965348e-07, + "loss": 3.8649, + "step": 116690 + }, + { + "epoch": 7.9287267291751595, + "grad_norm": 0.18832166492938995, + "learning_rate": 9.308329936132627e-07, + "loss": 3.5923, + "step": 116695 + }, + { + "epoch": 7.929066449245822, + "grad_norm": 0.1819070428609848, + "learning_rate": 9.265864927299906e-07, + "loss": 3.8429, + "step": 116700 + }, + { + "epoch": 7.929406169316483, + "grad_norm": 0.22566859424114227, + "learning_rate": 9.223399918467183e-07, + "loss": 3.7858, + "step": 116705 + }, + { + "epoch": 7.929745889387145, + "grad_norm": 0.17253048717975616, + "learning_rate": 9.180934909634462e-07, + "loss": 3.8227, + "step": 116710 + }, + { + "epoch": 7.930085609457807, + "grad_norm": 0.1697719395160675, + "learning_rate": 9.138469900801739e-07, + "loss": 4.0103, + "step": 116715 + }, + { + "epoch": 7.930425329528468, + "grad_norm": 0.25431129336357117, + "learning_rate": 9.096004891969017e-07, + "loss": 3.5788, + "step": 116720 + }, + { + "epoch": 7.93076504959913, + "grad_norm": 1.0768682956695557, + "learning_rate": 9.053539883136296e-07, + "loss": 4.0229, + "step": 116725 + }, + { + "epoch": 7.931104769669792, + "grad_norm": 0.17170879244804382, + "learning_rate": 9.011074874303573e-07, + "loss": 3.7251, + "step": 116730 + }, + { + "epoch": 7.9314444897404535, + "grad_norm": 0.16677536070346832, + "learning_rate": 8.968609865470852e-07, + "loss": 3.6064, + "step": 116735 + }, + { + "epoch": 7.9317842098111155, + "grad_norm": 0.16497178375720978, + "learning_rate": 8.926144856638131e-07, + "loss": 3.7879, + "step": 116740 + }, + { + "epoch": 7.932123929881778, + "grad_norm": 0.1726301610469818, + "learning_rate": 8.883679847805408e-07, + "loss": 3.8474, + "step": 116745 + }, + { + "epoch": 7.932463649952439, + "grad_norm": 0.6334232687950134, + "learning_rate": 8.841214838972687e-07, + "loss": 3.5969, + "step": 116750 + }, + { + "epoch": 7.932803370023101, + "grad_norm": 0.1786389797925949, + "learning_rate": 8.798749830139966e-07, + "loss": 3.9079, + "step": 116755 + }, + { + "epoch": 7.933143090093763, + "grad_norm": 0.2064749300479889, + "learning_rate": 8.756284821307243e-07, + "loss": 3.7871, + "step": 116760 + }, + { + "epoch": 7.933482810164424, + "grad_norm": 0.17935776710510254, + "learning_rate": 8.713819812474521e-07, + "loss": 3.8336, + "step": 116765 + }, + { + "epoch": 7.933822530235086, + "grad_norm": 0.1699584573507309, + "learning_rate": 8.6713548036418e-07, + "loss": 3.9628, + "step": 116770 + }, + { + "epoch": 7.934162250305748, + "grad_norm": 0.17775452136993408, + "learning_rate": 8.628889794809077e-07, + "loss": 3.8449, + "step": 116775 + }, + { + "epoch": 7.9345019703764095, + "grad_norm": 0.6081246137619019, + "learning_rate": 8.586424785976356e-07, + "loss": 3.6, + "step": 116780 + }, + { + "epoch": 7.9348416904470715, + "grad_norm": 1.7078977823257446, + "learning_rate": 8.543959777143635e-07, + "loss": 3.9339, + "step": 116785 + }, + { + "epoch": 7.935181410517734, + "grad_norm": 0.17898713052272797, + "learning_rate": 8.501494768310912e-07, + "loss": 3.7907, + "step": 116790 + }, + { + "epoch": 7.935521130588395, + "grad_norm": 0.17173391580581665, + "learning_rate": 8.459029759478191e-07, + "loss": 4.1177, + "step": 116795 + }, + { + "epoch": 7.935860850659057, + "grad_norm": 0.17573359608650208, + "learning_rate": 8.416564750645468e-07, + "loss": 3.7768, + "step": 116800 + }, + { + "epoch": 7.936200570729719, + "grad_norm": 0.21795058250427246, + "learning_rate": 8.374099741812747e-07, + "loss": 3.9199, + "step": 116805 + }, + { + "epoch": 7.93654029080038, + "grad_norm": 0.35614314675331116, + "learning_rate": 8.331634732980026e-07, + "loss": 3.9499, + "step": 116810 + }, + { + "epoch": 7.936880010871042, + "grad_norm": 0.16646502912044525, + "learning_rate": 8.289169724147303e-07, + "loss": 3.9323, + "step": 116815 + }, + { + "epoch": 7.937219730941704, + "grad_norm": 0.1659419685602188, + "learning_rate": 8.246704715314581e-07, + "loss": 3.7001, + "step": 116820 + }, + { + "epoch": 7.9375594510123655, + "grad_norm": 0.1808411329984665, + "learning_rate": 8.20423970648186e-07, + "loss": 3.8231, + "step": 116825 + }, + { + "epoch": 7.937899171083028, + "grad_norm": 0.16783414781093597, + "learning_rate": 8.161774697649137e-07, + "loss": 3.8312, + "step": 116830 + }, + { + "epoch": 7.93823889115369, + "grad_norm": 0.17418622970581055, + "learning_rate": 8.119309688816416e-07, + "loss": 3.9835, + "step": 116835 + }, + { + "epoch": 7.938578611224351, + "grad_norm": 0.17001374065876007, + "learning_rate": 8.076844679983695e-07, + "loss": 3.6487, + "step": 116840 + }, + { + "epoch": 7.938918331295013, + "grad_norm": 0.16414591670036316, + "learning_rate": 8.034379671150972e-07, + "loss": 3.8543, + "step": 116845 + }, + { + "epoch": 7.939258051365675, + "grad_norm": 0.19039997458457947, + "learning_rate": 7.99191466231825e-07, + "loss": 3.8724, + "step": 116850 + }, + { + "epoch": 7.939597771436336, + "grad_norm": 0.17802147567272186, + "learning_rate": 7.94944965348553e-07, + "loss": 3.6247, + "step": 116855 + }, + { + "epoch": 7.939937491506998, + "grad_norm": 0.1683983951807022, + "learning_rate": 7.906984644652806e-07, + "loss": 3.9239, + "step": 116860 + }, + { + "epoch": 7.94027721157766, + "grad_norm": 0.2056625783443451, + "learning_rate": 7.864519635820084e-07, + "loss": 3.8758, + "step": 116865 + }, + { + "epoch": 7.9406169316483215, + "grad_norm": 0.17050820589065552, + "learning_rate": 7.822054626987363e-07, + "loss": 3.8427, + "step": 116870 + }, + { + "epoch": 7.940956651718984, + "grad_norm": 0.20983193814754486, + "learning_rate": 7.77958961815464e-07, + "loss": 3.8261, + "step": 116875 + }, + { + "epoch": 7.941296371789646, + "grad_norm": 0.16627977788448334, + "learning_rate": 7.737124609321919e-07, + "loss": 4.0422, + "step": 116880 + }, + { + "epoch": 7.941636091860307, + "grad_norm": 0.19383640587329865, + "learning_rate": 7.694659600489197e-07, + "loss": 3.8745, + "step": 116885 + }, + { + "epoch": 7.941975811930969, + "grad_norm": 0.18195964395999908, + "learning_rate": 7.652194591656475e-07, + "loss": 4.0276, + "step": 116890 + }, + { + "epoch": 7.942315532001631, + "grad_norm": 0.6455546021461487, + "learning_rate": 7.609729582823754e-07, + "loss": 3.7677, + "step": 116895 + }, + { + "epoch": 7.942655252072292, + "grad_norm": 0.9837350249290466, + "learning_rate": 7.567264573991032e-07, + "loss": 3.8934, + "step": 116900 + }, + { + "epoch": 7.942994972142954, + "grad_norm": 0.1591806262731552, + "learning_rate": 7.52479956515831e-07, + "loss": 3.9735, + "step": 116905 + }, + { + "epoch": 7.943334692213616, + "grad_norm": 0.14355456829071045, + "learning_rate": 7.482334556325588e-07, + "loss": 3.9055, + "step": 116910 + }, + { + "epoch": 7.9436744122842775, + "grad_norm": 0.17011432349681854, + "learning_rate": 7.439869547492866e-07, + "loss": 3.9357, + "step": 116915 + }, + { + "epoch": 7.94401413235494, + "grad_norm": 0.2045857161283493, + "learning_rate": 7.397404538660144e-07, + "loss": 3.8737, + "step": 116920 + }, + { + "epoch": 7.944353852425602, + "grad_norm": 0.18883581459522247, + "learning_rate": 7.354939529827422e-07, + "loss": 3.9801, + "step": 116925 + }, + { + "epoch": 7.944693572496263, + "grad_norm": 0.18689130246639252, + "learning_rate": 7.312474520994701e-07, + "loss": 3.7181, + "step": 116930 + }, + { + "epoch": 7.945033292566925, + "grad_norm": 0.15907599031925201, + "learning_rate": 7.270009512161979e-07, + "loss": 4.0519, + "step": 116935 + }, + { + "epoch": 7.945373012637587, + "grad_norm": 0.15944360196590424, + "learning_rate": 7.227544503329257e-07, + "loss": 3.8482, + "step": 116940 + }, + { + "epoch": 7.945712732708248, + "grad_norm": 0.13546036183834076, + "learning_rate": 7.185079494496535e-07, + "loss": 3.909, + "step": 116945 + }, + { + "epoch": 7.94605245277891, + "grad_norm": 0.21828047931194305, + "learning_rate": 7.142614485663814e-07, + "loss": 3.8177, + "step": 116950 + }, + { + "epoch": 7.946392172849572, + "grad_norm": 0.20256243646144867, + "learning_rate": 7.100149476831092e-07, + "loss": 3.9973, + "step": 116955 + }, + { + "epoch": 7.9467318929202335, + "grad_norm": 0.17852988839149475, + "learning_rate": 7.05768446799837e-07, + "loss": 3.9028, + "step": 116960 + }, + { + "epoch": 7.947071612990896, + "grad_norm": 0.7036532163619995, + "learning_rate": 7.015219459165648e-07, + "loss": 3.7719, + "step": 116965 + }, + { + "epoch": 7.947411333061558, + "grad_norm": 0.14145144820213318, + "learning_rate": 6.972754450332926e-07, + "loss": 3.7367, + "step": 116970 + }, + { + "epoch": 7.947751053132219, + "grad_norm": 0.16763433814048767, + "learning_rate": 6.930289441500204e-07, + "loss": 3.6479, + "step": 116975 + }, + { + "epoch": 7.948090773202881, + "grad_norm": 0.17852404713630676, + "learning_rate": 6.887824432667483e-07, + "loss": 4.0032, + "step": 116980 + }, + { + "epoch": 7.948430493273543, + "grad_norm": 0.1728605479001999, + "learning_rate": 6.845359423834761e-07, + "loss": 3.8353, + "step": 116985 + }, + { + "epoch": 7.948770213344204, + "grad_norm": 0.24011771380901337, + "learning_rate": 6.802894415002039e-07, + "loss": 4.0289, + "step": 116990 + }, + { + "epoch": 7.949109933414866, + "grad_norm": 0.2027532160282135, + "learning_rate": 6.760429406169317e-07, + "loss": 3.914, + "step": 116995 + }, + { + "epoch": 7.949449653485528, + "grad_norm": 0.1819620281457901, + "learning_rate": 6.717964397336596e-07, + "loss": 3.8337, + "step": 117000 + }, + { + "epoch": 7.9497893735561895, + "grad_norm": 0.1637030988931656, + "learning_rate": 6.675499388503874e-07, + "loss": 3.7193, + "step": 117005 + }, + { + "epoch": 7.950129093626852, + "grad_norm": 0.16770413517951965, + "learning_rate": 6.633034379671152e-07, + "loss": 3.6313, + "step": 117010 + }, + { + "epoch": 7.950468813697514, + "grad_norm": 0.2117864042520523, + "learning_rate": 6.59056937083843e-07, + "loss": 3.8497, + "step": 117015 + }, + { + "epoch": 7.950808533768175, + "grad_norm": 0.1343919336795807, + "learning_rate": 6.548104362005707e-07, + "loss": 3.6606, + "step": 117020 + }, + { + "epoch": 7.951148253838837, + "grad_norm": 0.18806464970111847, + "learning_rate": 6.505639353172985e-07, + "loss": 3.3501, + "step": 117025 + }, + { + "epoch": 7.951487973909499, + "grad_norm": 0.3653091788291931, + "learning_rate": 6.463174344340263e-07, + "loss": 3.9262, + "step": 117030 + }, + { + "epoch": 7.95182769398016, + "grad_norm": 0.18684060871601105, + "learning_rate": 6.420709335507542e-07, + "loss": 3.8453, + "step": 117035 + }, + { + "epoch": 7.952167414050822, + "grad_norm": 0.18925714492797852, + "learning_rate": 6.37824432667482e-07, + "loss": 3.8717, + "step": 117040 + }, + { + "epoch": 7.952507134121484, + "grad_norm": 0.1398121863603592, + "learning_rate": 6.335779317842098e-07, + "loss": 3.8723, + "step": 117045 + }, + { + "epoch": 7.9528468541921455, + "grad_norm": 0.18961557745933533, + "learning_rate": 6.293314309009377e-07, + "loss": 3.7578, + "step": 117050 + }, + { + "epoch": 7.953186574262808, + "grad_norm": 0.1693739891052246, + "learning_rate": 6.250849300176655e-07, + "loss": 3.7487, + "step": 117055 + }, + { + "epoch": 7.95352629433347, + "grad_norm": 0.16398443281650543, + "learning_rate": 6.208384291343932e-07, + "loss": 3.7813, + "step": 117060 + }, + { + "epoch": 7.953866014404131, + "grad_norm": 0.1549488604068756, + "learning_rate": 6.16591928251121e-07, + "loss": 3.7233, + "step": 117065 + }, + { + "epoch": 7.954205734474793, + "grad_norm": 0.8641138076782227, + "learning_rate": 6.123454273678489e-07, + "loss": 3.9333, + "step": 117070 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 0.2671298384666443, + "learning_rate": 6.080989264845767e-07, + "loss": 3.7352, + "step": 117075 + }, + { + "epoch": 7.954885174616116, + "grad_norm": 0.2041815221309662, + "learning_rate": 6.038524256013045e-07, + "loss": 3.8143, + "step": 117080 + }, + { + "epoch": 7.955224894686778, + "grad_norm": 0.20247521996498108, + "learning_rate": 5.996059247180324e-07, + "loss": 3.7197, + "step": 117085 + }, + { + "epoch": 7.95556461475744, + "grad_norm": 0.1982482224702835, + "learning_rate": 5.953594238347602e-07, + "loss": 3.9135, + "step": 117090 + }, + { + "epoch": 7.9559043348281016, + "grad_norm": 0.14566786587238312, + "learning_rate": 5.91112922951488e-07, + "loss": 3.9438, + "step": 117095 + }, + { + "epoch": 7.956244054898764, + "grad_norm": 0.34758806228637695, + "learning_rate": 5.868664220682159e-07, + "loss": 3.82, + "step": 117100 + }, + { + "epoch": 7.956583774969425, + "grad_norm": 0.1709873229265213, + "learning_rate": 5.826199211849436e-07, + "loss": 3.7569, + "step": 117105 + }, + { + "epoch": 7.956923495040087, + "grad_norm": 0.161163792014122, + "learning_rate": 5.783734203016714e-07, + "loss": 3.8507, + "step": 117110 + }, + { + "epoch": 7.957263215110749, + "grad_norm": 0.3011038303375244, + "learning_rate": 5.741269194183992e-07, + "loss": 3.7787, + "step": 117115 + }, + { + "epoch": 7.95760293518141, + "grad_norm": 0.1605938971042633, + "learning_rate": 5.698804185351271e-07, + "loss": 3.6847, + "step": 117120 + }, + { + "epoch": 7.957942655252072, + "grad_norm": 0.18858031928539276, + "learning_rate": 5.656339176518549e-07, + "loss": 3.6143, + "step": 117125 + }, + { + "epoch": 7.958282375322734, + "grad_norm": 0.23677699267864227, + "learning_rate": 5.613874167685827e-07, + "loss": 3.7935, + "step": 117130 + }, + { + "epoch": 7.9586220953933955, + "grad_norm": 0.12490113079547882, + "learning_rate": 5.571409158853106e-07, + "loss": 3.7096, + "step": 117135 + }, + { + "epoch": 7.958961815464058, + "grad_norm": 0.1564282774925232, + "learning_rate": 5.528944150020384e-07, + "loss": 4.0169, + "step": 117140 + }, + { + "epoch": 7.95930153553472, + "grad_norm": 0.18766401708126068, + "learning_rate": 5.486479141187662e-07, + "loss": 3.8691, + "step": 117145 + }, + { + "epoch": 7.959641255605381, + "grad_norm": 0.21543556451797485, + "learning_rate": 5.44401413235494e-07, + "loss": 3.8261, + "step": 117150 + }, + { + "epoch": 7.959980975676043, + "grad_norm": 0.18499185144901276, + "learning_rate": 5.401549123522218e-07, + "loss": 3.9443, + "step": 117155 + }, + { + "epoch": 7.960320695746705, + "grad_norm": 0.23296475410461426, + "learning_rate": 5.359084114689496e-07, + "loss": 4.0246, + "step": 117160 + }, + { + "epoch": 7.960660415817366, + "grad_norm": 0.14801357686519623, + "learning_rate": 5.316619105856774e-07, + "loss": 3.8494, + "step": 117165 + }, + { + "epoch": 7.961000135888028, + "grad_norm": 0.20131255686283112, + "learning_rate": 5.274154097024053e-07, + "loss": 3.7158, + "step": 117170 + }, + { + "epoch": 7.96133985595869, + "grad_norm": 0.23376624286174774, + "learning_rate": 5.231689088191331e-07, + "loss": 3.898, + "step": 117175 + }, + { + "epoch": 7.9616795760293515, + "grad_norm": 0.3528525233268738, + "learning_rate": 5.189224079358609e-07, + "loss": 3.9194, + "step": 117180 + }, + { + "epoch": 7.962019296100014, + "grad_norm": 0.35555705428123474, + "learning_rate": 5.146759070525888e-07, + "loss": 3.6867, + "step": 117185 + }, + { + "epoch": 7.962359016170676, + "grad_norm": 0.16526058316230774, + "learning_rate": 5.104294061693166e-07, + "loss": 3.7359, + "step": 117190 + }, + { + "epoch": 7.962698736241337, + "grad_norm": 0.2065466344356537, + "learning_rate": 5.061829052860443e-07, + "loss": 3.6978, + "step": 117195 + }, + { + "epoch": 7.963038456311999, + "grad_norm": 0.15966634452342987, + "learning_rate": 5.019364044027721e-07, + "loss": 3.8961, + "step": 117200 + }, + { + "epoch": 7.963378176382661, + "grad_norm": 0.1607598066329956, + "learning_rate": 4.976899035194999e-07, + "loss": 3.7841, + "step": 117205 + }, + { + "epoch": 7.963717896453322, + "grad_norm": 0.19588902592658997, + "learning_rate": 4.934434026362277e-07, + "loss": 4.0221, + "step": 117210 + }, + { + "epoch": 7.964057616523984, + "grad_norm": 0.19537901878356934, + "learning_rate": 4.891969017529555e-07, + "loss": 4.0275, + "step": 117215 + }, + { + "epoch": 7.964397336594646, + "grad_norm": 0.1853545904159546, + "learning_rate": 4.849504008696834e-07, + "loss": 3.9872, + "step": 117220 + }, + { + "epoch": 7.9647370566653075, + "grad_norm": 0.22651594877243042, + "learning_rate": 4.807038999864112e-07, + "loss": 3.8544, + "step": 117225 + }, + { + "epoch": 7.96507677673597, + "grad_norm": 0.13671377301216125, + "learning_rate": 4.7645739910313903e-07, + "loss": 3.8754, + "step": 117230 + }, + { + "epoch": 7.965416496806632, + "grad_norm": 0.18839529156684875, + "learning_rate": 4.722108982198668e-07, + "loss": 3.7651, + "step": 117235 + }, + { + "epoch": 7.965756216877293, + "grad_norm": 0.19730910658836365, + "learning_rate": 4.6796439733659467e-07, + "loss": 3.8837, + "step": 117240 + }, + { + "epoch": 7.966095936947955, + "grad_norm": 0.20524311065673828, + "learning_rate": 4.6371789645332247e-07, + "loss": 3.9236, + "step": 117245 + }, + { + "epoch": 7.966435657018617, + "grad_norm": 0.14487558603286743, + "learning_rate": 4.5947139557005026e-07, + "loss": 3.9223, + "step": 117250 + }, + { + "epoch": 7.966775377089278, + "grad_norm": 0.21449798345565796, + "learning_rate": 4.5522489468677817e-07, + "loss": 3.8888, + "step": 117255 + }, + { + "epoch": 7.96711509715994, + "grad_norm": 0.14655789732933044, + "learning_rate": 4.5097839380350596e-07, + "loss": 3.4575, + "step": 117260 + }, + { + "epoch": 7.967454817230602, + "grad_norm": 0.1583438366651535, + "learning_rate": 4.4673189292023376e-07, + "loss": 3.7088, + "step": 117265 + }, + { + "epoch": 7.9677945373012635, + "grad_norm": 0.16256776452064514, + "learning_rate": 4.424853920369615e-07, + "loss": 4.0049, + "step": 117270 + }, + { + "epoch": 7.968134257371926, + "grad_norm": 0.16787825524806976, + "learning_rate": 4.382388911536894e-07, + "loss": 3.8271, + "step": 117275 + }, + { + "epoch": 7.968473977442588, + "grad_norm": 0.18341675400733948, + "learning_rate": 4.339923902704172e-07, + "loss": 3.7552, + "step": 117280 + }, + { + "epoch": 7.968813697513249, + "grad_norm": 0.4897415041923523, + "learning_rate": 4.29745889387145e-07, + "loss": 3.7705, + "step": 117285 + }, + { + "epoch": 7.969153417583911, + "grad_norm": 0.15908165276050568, + "learning_rate": 4.2549938850387284e-07, + "loss": 4.0654, + "step": 117290 + }, + { + "epoch": 7.969493137654573, + "grad_norm": 0.14203022420406342, + "learning_rate": 4.2125288762060064e-07, + "loss": 3.9679, + "step": 117295 + }, + { + "epoch": 7.969832857725234, + "grad_norm": 0.19928747415542603, + "learning_rate": 4.1700638673732844e-07, + "loss": 3.8585, + "step": 117300 + }, + { + "epoch": 7.970172577795896, + "grad_norm": 0.18887114524841309, + "learning_rate": 4.1275988585405634e-07, + "loss": 3.9824, + "step": 117305 + }, + { + "epoch": 7.9705122978665575, + "grad_norm": 0.17922064661979675, + "learning_rate": 4.0851338497078413e-07, + "loss": 3.8154, + "step": 117310 + }, + { + "epoch": 7.9708520179372195, + "grad_norm": 0.17762064933776855, + "learning_rate": 4.042668840875119e-07, + "loss": 3.609, + "step": 117315 + }, + { + "epoch": 7.971191738007882, + "grad_norm": 0.9181612133979797, + "learning_rate": 4.0002038320423967e-07, + "loss": 3.9112, + "step": 117320 + }, + { + "epoch": 7.971531458078543, + "grad_norm": 0.1871061623096466, + "learning_rate": 3.957738823209676e-07, + "loss": 3.7121, + "step": 117325 + }, + { + "epoch": 7.971871178149205, + "grad_norm": 0.21220983564853668, + "learning_rate": 3.9152738143769537e-07, + "loss": 3.859, + "step": 117330 + }, + { + "epoch": 7.972210898219867, + "grad_norm": 0.2834990322589874, + "learning_rate": 3.8728088055442317e-07, + "loss": 3.9435, + "step": 117335 + }, + { + "epoch": 7.972550618290528, + "grad_norm": 0.1703089326620102, + "learning_rate": 3.8303437967115096e-07, + "loss": 3.9794, + "step": 117340 + }, + { + "epoch": 7.97289033836119, + "grad_norm": 0.18531639873981476, + "learning_rate": 3.787878787878788e-07, + "loss": 3.7946, + "step": 117345 + }, + { + "epoch": 7.973230058431852, + "grad_norm": 0.17320990562438965, + "learning_rate": 3.745413779046066e-07, + "loss": 3.5306, + "step": 117350 + }, + { + "epoch": 7.9735697785025135, + "grad_norm": 0.17105576395988464, + "learning_rate": 3.7029487702133446e-07, + "loss": 3.8986, + "step": 117355 + }, + { + "epoch": 7.9739094985731755, + "grad_norm": 0.17169106006622314, + "learning_rate": 3.6604837613806225e-07, + "loss": 3.8433, + "step": 117360 + }, + { + "epoch": 7.974249218643838, + "grad_norm": 0.1853930950164795, + "learning_rate": 3.6180187525479005e-07, + "loss": 3.7371, + "step": 117365 + }, + { + "epoch": 7.974588938714499, + "grad_norm": 1.0106377601623535, + "learning_rate": 3.575553743715179e-07, + "loss": 3.9382, + "step": 117370 + }, + { + "epoch": 7.974928658785161, + "grad_norm": 0.19053488969802856, + "learning_rate": 3.533088734882457e-07, + "loss": 3.8666, + "step": 117375 + }, + { + "epoch": 7.975268378855823, + "grad_norm": 0.18448396027088165, + "learning_rate": 3.4906237260497354e-07, + "loss": 3.9268, + "step": 117380 + }, + { + "epoch": 7.975608098926484, + "grad_norm": 0.15586461126804352, + "learning_rate": 3.4481587172170134e-07, + "loss": 3.7809, + "step": 117385 + }, + { + "epoch": 7.975947818997146, + "grad_norm": 0.13972653448581696, + "learning_rate": 3.4056937083842913e-07, + "loss": 4.0332, + "step": 117390 + }, + { + "epoch": 7.976287539067808, + "grad_norm": 0.2710564434528351, + "learning_rate": 3.36322869955157e-07, + "loss": 3.9361, + "step": 117395 + }, + { + "epoch": 7.9766272591384695, + "grad_norm": 0.20069772005081177, + "learning_rate": 3.320763690718848e-07, + "loss": 3.6578, + "step": 117400 + }, + { + "epoch": 7.976966979209132, + "grad_norm": 0.16008785367012024, + "learning_rate": 3.2782986818861263e-07, + "loss": 3.9711, + "step": 117405 + }, + { + "epoch": 7.977306699279794, + "grad_norm": 0.2532937228679657, + "learning_rate": 3.235833673053404e-07, + "loss": 3.7212, + "step": 117410 + }, + { + "epoch": 7.977646419350455, + "grad_norm": 0.17641426622867584, + "learning_rate": 3.193368664220682e-07, + "loss": 3.6953, + "step": 117415 + }, + { + "epoch": 7.977986139421117, + "grad_norm": 0.1468195766210556, + "learning_rate": 3.1509036553879607e-07, + "loss": 4.008, + "step": 117420 + }, + { + "epoch": 7.978325859491779, + "grad_norm": 0.14334815740585327, + "learning_rate": 3.1084386465552386e-07, + "loss": 4.0804, + "step": 117425 + }, + { + "epoch": 7.97866557956244, + "grad_norm": 0.890771746635437, + "learning_rate": 3.065973637722517e-07, + "loss": 3.716, + "step": 117430 + }, + { + "epoch": 7.979005299633102, + "grad_norm": 0.23940132558345795, + "learning_rate": 3.023508628889795e-07, + "loss": 4.0134, + "step": 117435 + }, + { + "epoch": 7.979345019703764, + "grad_norm": 0.1702396422624588, + "learning_rate": 2.981043620057073e-07, + "loss": 3.9426, + "step": 117440 + }, + { + "epoch": 7.9796847397744255, + "grad_norm": 0.1795492172241211, + "learning_rate": 2.938578611224351e-07, + "loss": 4.0411, + "step": 117445 + }, + { + "epoch": 7.980024459845088, + "grad_norm": 0.24158768355846405, + "learning_rate": 2.8961136023916295e-07, + "loss": 3.6956, + "step": 117450 + }, + { + "epoch": 7.98036417991575, + "grad_norm": 0.14375033974647522, + "learning_rate": 2.853648593558908e-07, + "loss": 3.5619, + "step": 117455 + }, + { + "epoch": 7.980703899986411, + "grad_norm": 0.17915770411491394, + "learning_rate": 2.8111835847261854e-07, + "loss": 3.8388, + "step": 117460 + }, + { + "epoch": 7.981043620057073, + "grad_norm": 0.4470267593860626, + "learning_rate": 2.768718575893464e-07, + "loss": 3.8734, + "step": 117465 + }, + { + "epoch": 7.981383340127735, + "grad_norm": 0.2147827297449112, + "learning_rate": 2.726253567060742e-07, + "loss": 3.8631, + "step": 117470 + }, + { + "epoch": 7.981723060198396, + "grad_norm": 0.1854628175497055, + "learning_rate": 2.6837885582280204e-07, + "loss": 3.9937, + "step": 117475 + }, + { + "epoch": 7.982062780269058, + "grad_norm": 0.1593482494354248, + "learning_rate": 2.641323549395299e-07, + "loss": 3.8637, + "step": 117480 + }, + { + "epoch": 7.98240250033972, + "grad_norm": 0.147627592086792, + "learning_rate": 2.5988585405625763e-07, + "loss": 3.9834, + "step": 117485 + }, + { + "epoch": 7.9827422204103815, + "grad_norm": 0.1667567491531372, + "learning_rate": 2.556393531729855e-07, + "loss": 3.9774, + "step": 117490 + }, + { + "epoch": 7.983081940481044, + "grad_norm": 0.189104363322258, + "learning_rate": 2.5139285228971327e-07, + "loss": 3.5328, + "step": 117495 + }, + { + "epoch": 7.983421660551706, + "grad_norm": 0.16789880394935608, + "learning_rate": 2.471463514064411e-07, + "loss": 3.8143, + "step": 117500 + }, + { + "epoch": 7.983761380622367, + "grad_norm": 0.15741141140460968, + "learning_rate": 2.428998505231689e-07, + "loss": 3.7864, + "step": 117505 + }, + { + "epoch": 7.984101100693029, + "grad_norm": 0.21218381822109222, + "learning_rate": 2.386533496398967e-07, + "loss": 4.1408, + "step": 117510 + }, + { + "epoch": 7.984440820763691, + "grad_norm": 0.15561173856258392, + "learning_rate": 2.3440684875662456e-07, + "loss": 3.981, + "step": 117515 + }, + { + "epoch": 7.984780540834352, + "grad_norm": 0.14403964579105377, + "learning_rate": 2.3016034787335236e-07, + "loss": 3.8738, + "step": 117520 + }, + { + "epoch": 7.985120260905014, + "grad_norm": 0.15106597542762756, + "learning_rate": 2.2591384699008018e-07, + "loss": 3.6882, + "step": 117525 + }, + { + "epoch": 7.985459980975676, + "grad_norm": 0.16282644867897034, + "learning_rate": 2.2166734610680798e-07, + "loss": 3.5673, + "step": 117530 + }, + { + "epoch": 7.9857997010463375, + "grad_norm": 0.15891924500465393, + "learning_rate": 2.1742084522353583e-07, + "loss": 3.8592, + "step": 117535 + }, + { + "epoch": 7.986139421117, + "grad_norm": 0.1780364066362381, + "learning_rate": 2.1317434434026365e-07, + "loss": 3.7942, + "step": 117540 + }, + { + "epoch": 7.986479141187662, + "grad_norm": 0.24834007024765015, + "learning_rate": 2.0892784345699144e-07, + "loss": 3.7629, + "step": 117545 + }, + { + "epoch": 7.986818861258323, + "grad_norm": 0.16219528019428253, + "learning_rate": 2.0468134257371927e-07, + "loss": 3.7977, + "step": 117550 + }, + { + "epoch": 7.987158581328985, + "grad_norm": 0.17079496383666992, + "learning_rate": 2.0043484169044706e-07, + "loss": 4.0221, + "step": 117555 + }, + { + "epoch": 7.987498301399647, + "grad_norm": 0.18778850138187408, + "learning_rate": 1.961883408071749e-07, + "loss": 3.9608, + "step": 117560 + }, + { + "epoch": 7.987838021470308, + "grad_norm": 0.1772683709859848, + "learning_rate": 1.919418399239027e-07, + "loss": 3.9079, + "step": 117565 + }, + { + "epoch": 7.98817774154097, + "grad_norm": 0.18476401269435883, + "learning_rate": 1.8769533904063053e-07, + "loss": 3.6165, + "step": 117570 + }, + { + "epoch": 7.988517461611632, + "grad_norm": 0.1757347583770752, + "learning_rate": 1.8344883815735833e-07, + "loss": 3.6754, + "step": 117575 + }, + { + "epoch": 7.9888571816822935, + "grad_norm": 0.18459290266036987, + "learning_rate": 1.7920233727408617e-07, + "loss": 3.8098, + "step": 117580 + }, + { + "epoch": 7.989196901752956, + "grad_norm": 0.21493317186832428, + "learning_rate": 1.74955836390814e-07, + "loss": 3.8009, + "step": 117585 + }, + { + "epoch": 7.989536621823618, + "grad_norm": 0.2664526402950287, + "learning_rate": 1.707093355075418e-07, + "loss": 3.8747, + "step": 117590 + }, + { + "epoch": 7.989876341894279, + "grad_norm": 0.15224987268447876, + "learning_rate": 1.6646283462426961e-07, + "loss": 3.8945, + "step": 117595 + }, + { + "epoch": 7.990216061964941, + "grad_norm": 0.1501011848449707, + "learning_rate": 1.622163337409974e-07, + "loss": 3.8616, + "step": 117600 + }, + { + "epoch": 7.990555782035603, + "grad_norm": 0.17889438569545746, + "learning_rate": 1.5796983285772523e-07, + "loss": 3.7403, + "step": 117605 + }, + { + "epoch": 7.990895502106264, + "grad_norm": 0.1818421632051468, + "learning_rate": 1.5372333197445306e-07, + "loss": 3.7897, + "step": 117610 + }, + { + "epoch": 7.991235222176926, + "grad_norm": 0.22225822508335114, + "learning_rate": 1.4947683109118088e-07, + "loss": 3.8248, + "step": 117615 + }, + { + "epoch": 7.991574942247588, + "grad_norm": 0.1852625459432602, + "learning_rate": 1.452303302079087e-07, + "loss": 3.7711, + "step": 117620 + }, + { + "epoch": 7.9919146623182495, + "grad_norm": 0.24510236084461212, + "learning_rate": 1.409838293246365e-07, + "loss": 4.0503, + "step": 117625 + }, + { + "epoch": 7.992254382388912, + "grad_norm": 0.1970558911561966, + "learning_rate": 1.3673732844136432e-07, + "loss": 3.9345, + "step": 117630 + }, + { + "epoch": 7.992594102459574, + "grad_norm": 0.1873694807291031, + "learning_rate": 1.3249082755809214e-07, + "loss": 3.8284, + "step": 117635 + }, + { + "epoch": 7.992933822530235, + "grad_norm": 0.17604678869247437, + "learning_rate": 1.2824432667481996e-07, + "loss": 3.9655, + "step": 117640 + }, + { + "epoch": 7.993273542600897, + "grad_norm": 0.1626487523317337, + "learning_rate": 1.2399782579154779e-07, + "loss": 3.6342, + "step": 117645 + }, + { + "epoch": 7.993613262671559, + "grad_norm": 0.17807689309120178, + "learning_rate": 1.1975132490827558e-07, + "loss": 3.745, + "step": 117650 + }, + { + "epoch": 7.99395298274222, + "grad_norm": 0.1737707257270813, + "learning_rate": 1.155048240250034e-07, + "loss": 3.7742, + "step": 117655 + }, + { + "epoch": 7.994292702812882, + "grad_norm": 0.1531268209218979, + "learning_rate": 1.1125832314173121e-07, + "loss": 3.7616, + "step": 117660 + }, + { + "epoch": 7.994632422883544, + "grad_norm": 0.23513154685497284, + "learning_rate": 1.0701182225845902e-07, + "loss": 3.7596, + "step": 117665 + }, + { + "epoch": 7.9949721429542056, + "grad_norm": 0.1782200038433075, + "learning_rate": 1.0276532137518686e-07, + "loss": 3.7108, + "step": 117670 + }, + { + "epoch": 7.995311863024868, + "grad_norm": 0.3371969759464264, + "learning_rate": 9.851882049191467e-08, + "loss": 3.6457, + "step": 117675 + }, + { + "epoch": 7.99565158309553, + "grad_norm": 0.16718927025794983, + "learning_rate": 9.427231960864248e-08, + "loss": 3.6131, + "step": 117680 + }, + { + "epoch": 7.995991303166191, + "grad_norm": 2.7982003688812256, + "learning_rate": 9.00258187253703e-08, + "loss": 3.8984, + "step": 117685 + }, + { + "epoch": 7.996331023236853, + "grad_norm": 0.21741120517253876, + "learning_rate": 8.577931784209812e-08, + "loss": 3.8792, + "step": 117690 + }, + { + "epoch": 7.996670743307515, + "grad_norm": 0.16716469824314117, + "learning_rate": 8.153281695882593e-08, + "loss": 3.9352, + "step": 117695 + }, + { + "epoch": 7.997010463378176, + "grad_norm": 0.797361433506012, + "learning_rate": 7.728631607555374e-08, + "loss": 3.8405, + "step": 117700 + }, + { + "epoch": 7.997350183448838, + "grad_norm": 0.6190701723098755, + "learning_rate": 7.303981519228156e-08, + "loss": 3.7041, + "step": 117705 + }, + { + "epoch": 7.9976899035195, + "grad_norm": 0.2072145938873291, + "learning_rate": 6.879331430900939e-08, + "loss": 3.8219, + "step": 117710 + }, + { + "epoch": 7.998029623590162, + "grad_norm": 0.17154912650585175, + "learning_rate": 6.45468134257372e-08, + "loss": 3.7352, + "step": 117715 + }, + { + "epoch": 7.998369343660824, + "grad_norm": 0.19476549327373505, + "learning_rate": 6.030031254246502e-08, + "loss": 3.6517, + "step": 117720 + }, + { + "epoch": 7.998709063731486, + "grad_norm": 7.17972993850708, + "learning_rate": 5.6053811659192826e-08, + "loss": 3.908, + "step": 117725 + }, + { + "epoch": 7.999048783802147, + "grad_norm": 0.6440392732620239, + "learning_rate": 5.180731077592064e-08, + "loss": 3.9137, + "step": 117730 + }, + { + "epoch": 7.999388503872809, + "grad_norm": 0.28233370184898376, + "learning_rate": 4.7560809892648464e-08, + "loss": 3.8057, + "step": 117735 + }, + { + "epoch": 7.999728223943471, + "grad_norm": 0.18206889927387238, + "learning_rate": 4.3314309009376273e-08, + "loss": 3.9158, + "step": 117740 + }, + { + "epoch": 8.0, + "eval_bertscore": { + "f1": 0.8522687109129664, + "precision": 0.875668921671885, + "recall": 0.8304298887165934 + }, + "eval_bleu_4": 0.001962197334599666, + "eval_exact_match": 0.0, + "eval_loss": 3.641373634338379, + "eval_meteor": 0.07555700789934944, + "eval_rouge": { + "rouge1": 0.12306009878404744, + "rouge2": 0.015369979017870981, + "rougeL": 0.10878082220988909, + "rougeLsum": 0.10878220132622535 + }, + "eval_runtime": 374.5345, + "eval_samples_per_second": 27.552, + "eval_steps_per_second": 3.444, + "step": 117744 + } + ], + "logging_steps": 5, + "max_steps": 117744, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.7395808248109466e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}