diff --git "a/codet5_lora_official_0.001/checkpoint-117744/trainer_state.json" "b/codet5_lora_official_0.001/checkpoint-117744/trainer_state.json" new file mode 100644--- /dev/null +++ "b/codet5_lora_official_0.001/checkpoint-117744/trainer_state.json" @@ -0,0 +1,165045 @@ +{ + "best_metric": 0.02277402812413442, + "best_model_checkpoint": "./results-cc/code-t5/codet5_lora_official_0.001/checkpoint-88308", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 117744, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003397200706617747, + "grad_norm": NaN, + "learning_rate": 0.0009999745209947003, + "loss": 9.5631, + "step": 5 + }, + { + "epoch": 0.0006794401413235494, + "grad_norm": 1.5265518426895142, + "learning_rate": 0.0009999320559858677, + "loss": 7.4912, + "step": 10 + }, + { + "epoch": 0.0010191602119853241, + "grad_norm": 1.6913625001907349, + "learning_rate": 0.000999889590977035, + "loss": 5.6733, + "step": 15 + }, + { + "epoch": 0.001358880282647099, + "grad_norm": 0.8828639984130859, + "learning_rate": 0.0009998471259682023, + "loss": 4.5751, + "step": 20 + }, + { + "epoch": 0.0016986003533088735, + "grad_norm": 1.881712794303894, + "learning_rate": 0.0009998046609593695, + "loss": 4.381, + "step": 25 + }, + { + "epoch": 0.0020383204239706482, + "grad_norm": 0.7017437219619751, + "learning_rate": 0.0009997621959505368, + "loss": 4.1607, + "step": 30 + }, + { + "epoch": 0.002378040494632423, + "grad_norm": 0.6614991426467896, + "learning_rate": 0.0009997197309417041, + "loss": 3.9552, + "step": 35 + }, + { + "epoch": 0.002717760565294198, + "grad_norm": 0.6266366243362427, + "learning_rate": 0.000999685758934638, + "loss": 4.1837, + "step": 40 + }, + { + "epoch": 0.0030574806359559724, + "grad_norm": 1.5767444372177124, + "learning_rate": 0.000999643293925805, + "loss": 4.3598, + "step": 45 + }, + { + "epoch": 0.003397200706617747, + "grad_norm": 0.5440741181373596, + "learning_rate": 0.0009996008289169724, + "loss": 3.9786, + "step": 50 + }, + { + "epoch": 0.0037369207772795215, + "grad_norm": 2.4258925914764404, + "learning_rate": 0.0009995583639081397, + "loss": 4.092, + "step": 55 + }, + { + "epoch": 0.0040766408479412965, + "grad_norm": 1.0981065034866333, + "learning_rate": 0.0009995158988993069, + "loss": 3.9918, + "step": 60 + }, + { + "epoch": 0.0044163609186030715, + "grad_norm": 0.605392336845398, + "learning_rate": 0.0009994734338904742, + "loss": 4.0654, + "step": 65 + }, + { + "epoch": 0.004756080989264846, + "grad_norm": 2.5163285732269287, + "learning_rate": 0.0009994309688816415, + "loss": 4.2722, + "step": 70 + }, + { + "epoch": 0.005095801059926621, + "grad_norm": 0.6546579599380493, + "learning_rate": 0.0009993885038728089, + "loss": 4.1405, + "step": 75 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 0.7548283338546753, + "learning_rate": 0.0009993460388639762, + "loss": 3.8654, + "step": 80 + }, + { + "epoch": 0.00577524120125017, + "grad_norm": 0.6219947934150696, + "learning_rate": 0.0009993035738551433, + "loss": 3.9379, + "step": 85 + }, + { + "epoch": 0.006114961271911945, + "grad_norm": 0.7782043814659119, + "learning_rate": 0.0009992611088463106, + "loss": 4.0288, + "step": 90 + }, + { + "epoch": 0.006454681342573719, + "grad_norm": 0.5383527278900146, + "learning_rate": 0.000999218643837478, + "loss": 4.2282, + "step": 95 + }, + { + "epoch": 0.006794401413235494, + "grad_norm": 0.7215156555175781, + "learning_rate": 0.000999176178828645, + "loss": 4.1059, + "step": 100 + }, + { + "epoch": 0.007134121483897269, + "grad_norm": 0.40474793314933777, + "learning_rate": 0.0009991337138198124, + "loss": 3.6553, + "step": 105 + }, + { + "epoch": 0.007473841554559043, + "grad_norm": 0.5022174119949341, + "learning_rate": 0.0009990912488109798, + "loss": 4.0996, + "step": 110 + }, + { + "epoch": 0.007813561625220818, + "grad_norm": 0.42584916949272156, + "learning_rate": 0.000999048783802147, + "loss": 4.0931, + "step": 115 + }, + { + "epoch": 0.008153281695882593, + "grad_norm": 0.6117284893989563, + "learning_rate": 0.0009990063187933144, + "loss": 4.2467, + "step": 120 + }, + { + "epoch": 0.008493001766544368, + "grad_norm": 1.9124809503555298, + "learning_rate": 0.0009989638537844815, + "loss": 4.0609, + "step": 125 + }, + { + "epoch": 0.008832721837206143, + "grad_norm": 0.6506781578063965, + "learning_rate": 0.0009989213887756489, + "loss": 3.8329, + "step": 130 + }, + { + "epoch": 0.009172441907867916, + "grad_norm": 0.6759753227233887, + "learning_rate": 0.0009988789237668162, + "loss": 3.8335, + "step": 135 + }, + { + "epoch": 0.009512161978529691, + "grad_norm": 0.584456741809845, + "learning_rate": 0.0009988364587579835, + "loss": 3.859, + "step": 140 + }, + { + "epoch": 0.009851882049191466, + "grad_norm": 1.083524465560913, + "learning_rate": 0.0009987939937491507, + "loss": 3.8578, + "step": 145 + }, + { + "epoch": 0.010191602119853241, + "grad_norm": 0.5717402100563049, + "learning_rate": 0.000998751528740318, + "loss": 4.0386, + "step": 150 + }, + { + "epoch": 0.010531322190515016, + "grad_norm": 0.5727488398551941, + "learning_rate": 0.0009987090637314853, + "loss": 3.8979, + "step": 155 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 0.5168347954750061, + "learning_rate": 0.0009986665987226525, + "loss": 3.9575, + "step": 160 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 0.7882786393165588, + "learning_rate": 0.0009986241337138198, + "loss": 3.988, + "step": 165 + }, + { + "epoch": 0.01155048240250034, + "grad_norm": 0.49471724033355713, + "learning_rate": 0.0009985816687049871, + "loss": 3.8816, + "step": 170 + }, + { + "epoch": 0.011890202473162114, + "grad_norm": 0.5687004327774048, + "learning_rate": 0.0009985392036961545, + "loss": 3.8975, + "step": 175 + }, + { + "epoch": 0.01222992254382389, + "grad_norm": 0.5873124599456787, + "learning_rate": 0.0009984967386873218, + "loss": 3.9682, + "step": 180 + }, + { + "epoch": 0.012569642614485664, + "grad_norm": 0.5291316509246826, + "learning_rate": 0.000998454273678489, + "loss": 4.0071, + "step": 185 + }, + { + "epoch": 0.012909362685147438, + "grad_norm": 0.7649528980255127, + "learning_rate": 0.0009984118086696562, + "loss": 3.8657, + "step": 190 + }, + { + "epoch": 0.013249082755809213, + "grad_norm": 2.0646681785583496, + "learning_rate": 0.0009983693436608236, + "loss": 4.1499, + "step": 195 + }, + { + "epoch": 0.013588802826470988, + "grad_norm": 0.5356468558311462, + "learning_rate": 0.0009983268786519907, + "loss": 3.9831, + "step": 200 + }, + { + "epoch": 0.013928522897132763, + "grad_norm": 0.4889935553073883, + "learning_rate": 0.000998284413643158, + "loss": 4.1227, + "step": 205 + }, + { + "epoch": 0.014268242967794538, + "grad_norm": 0.3875698149204254, + "learning_rate": 0.0009982419486343254, + "loss": 3.8072, + "step": 210 + }, + { + "epoch": 0.014607963038456313, + "grad_norm": 0.40928298234939575, + "learning_rate": 0.0009981994836254927, + "loss": 3.8653, + "step": 215 + }, + { + "epoch": 0.014947683109118086, + "grad_norm": 0.5091014504432678, + "learning_rate": 0.0009981570186166598, + "loss": 4.0244, + "step": 220 + }, + { + "epoch": 0.015287403179779861, + "grad_norm": 0.5922226309776306, + "learning_rate": 0.0009981145536078271, + "loss": 3.9302, + "step": 225 + }, + { + "epoch": 0.015627123250441636, + "grad_norm": 0.4027731120586395, + "learning_rate": 0.0009980720885989945, + "loss": 3.8909, + "step": 230 + }, + { + "epoch": 0.01596684332110341, + "grad_norm": 0.4681120812892914, + "learning_rate": 0.0009980296235901616, + "loss": 3.9365, + "step": 235 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 0.8460211157798767, + "learning_rate": 0.0009979871585813292, + "loss": 3.8321, + "step": 240 + }, + { + "epoch": 0.01664628346242696, + "grad_norm": 0.5115953683853149, + "learning_rate": 0.0009979446935724963, + "loss": 3.876, + "step": 245 + }, + { + "epoch": 0.016986003533088736, + "grad_norm": 0.4146783947944641, + "learning_rate": 0.0009979022285636636, + "loss": 4.0201, + "step": 250 + }, + { + "epoch": 0.01732572360375051, + "grad_norm": 0.5596747994422913, + "learning_rate": 0.000997859763554831, + "loss": 3.8316, + "step": 255 + }, + { + "epoch": 0.017665443674412286, + "grad_norm": 1.2797352075576782, + "learning_rate": 0.000997817298545998, + "loss": 4.0192, + "step": 260 + }, + { + "epoch": 0.01800516374507406, + "grad_norm": 0.5335856676101685, + "learning_rate": 0.0009977748335371654, + "loss": 4.0721, + "step": 265 + }, + { + "epoch": 0.018344883815735832, + "grad_norm": 0.5869820713996887, + "learning_rate": 0.0009977323685283327, + "loss": 3.8598, + "step": 270 + }, + { + "epoch": 0.01868460388639761, + "grad_norm": 0.46688759326934814, + "learning_rate": 0.0009976899035195, + "loss": 3.8036, + "step": 275 + }, + { + "epoch": 0.019024323957059382, + "grad_norm": 0.42624059319496155, + "learning_rate": 0.0009976474385106672, + "loss": 3.8459, + "step": 280 + }, + { + "epoch": 0.01936404402772116, + "grad_norm": 0.39216870069503784, + "learning_rate": 0.0009976049735018345, + "loss": 3.9509, + "step": 285 + }, + { + "epoch": 0.019703764098382932, + "grad_norm": 0.6354107856750488, + "learning_rate": 0.0009975625084930018, + "loss": 3.992, + "step": 290 + }, + { + "epoch": 0.020043484169044706, + "grad_norm": 0.434683233499527, + "learning_rate": 0.000997520043484169, + "loss": 3.9471, + "step": 295 + }, + { + "epoch": 0.020383204239706482, + "grad_norm": 0.49986785650253296, + "learning_rate": 0.0009974775784753363, + "loss": 3.6998, + "step": 300 + }, + { + "epoch": 0.020722924310368256, + "grad_norm": 0.6968653798103333, + "learning_rate": 0.0009974351134665036, + "loss": 3.9801, + "step": 305 + }, + { + "epoch": 0.021062644381030032, + "grad_norm": 0.4575914144515991, + "learning_rate": 0.000997392648457671, + "loss": 3.7103, + "step": 310 + }, + { + "epoch": 0.021402364451691806, + "grad_norm": 0.45561906695365906, + "learning_rate": 0.0009973501834488383, + "loss": 4.1703, + "step": 315 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 0.5533580780029297, + "learning_rate": 0.0009973077184400054, + "loss": 3.9765, + "step": 320 + }, + { + "epoch": 0.022081804593015356, + "grad_norm": 0.5810810327529907, + "learning_rate": 0.0009972652534311727, + "loss": 3.7829, + "step": 325 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 0.49037688970565796, + "learning_rate": 0.00099722278842234, + "loss": 3.6074, + "step": 330 + }, + { + "epoch": 0.022761244734338906, + "grad_norm": 0.45677196979522705, + "learning_rate": 0.0009971803234135072, + "loss": 4.0936, + "step": 335 + }, + { + "epoch": 0.02310096480500068, + "grad_norm": 0.5844870805740356, + "learning_rate": 0.0009971378584046745, + "loss": 3.9574, + "step": 340 + }, + { + "epoch": 0.023440684875662456, + "grad_norm": 0.7637877464294434, + "learning_rate": 0.0009970953933958419, + "loss": 3.9552, + "step": 345 + }, + { + "epoch": 0.02378040494632423, + "grad_norm": 0.5789228081703186, + "learning_rate": 0.0009970529283870092, + "loss": 3.7573, + "step": 350 + }, + { + "epoch": 0.024120125016986002, + "grad_norm": 0.5973035097122192, + "learning_rate": 0.0009970104633781763, + "loss": 4.171, + "step": 355 + }, + { + "epoch": 0.02445984508764778, + "grad_norm": 0.5052527785301208, + "learning_rate": 0.0009969679983693437, + "loss": 4.0045, + "step": 360 + }, + { + "epoch": 0.024799565158309552, + "grad_norm": 0.6609503626823425, + "learning_rate": 0.000996925533360511, + "loss": 3.9002, + "step": 365 + }, + { + "epoch": 0.02513928522897133, + "grad_norm": 0.5240939855575562, + "learning_rate": 0.000996883068351678, + "loss": 4.0321, + "step": 370 + }, + { + "epoch": 0.025479005299633102, + "grad_norm": 0.5713841319084167, + "learning_rate": 0.0009968406033428454, + "loss": 3.8732, + "step": 375 + }, + { + "epoch": 0.025818725370294875, + "grad_norm": 0.5236926674842834, + "learning_rate": 0.0009967981383340128, + "loss": 4.0761, + "step": 380 + }, + { + "epoch": 0.026158445440956652, + "grad_norm": 0.5453757047653198, + "learning_rate": 0.0009967556733251801, + "loss": 3.9592, + "step": 385 + }, + { + "epoch": 0.026498165511618425, + "grad_norm": 0.45232608914375305, + "learning_rate": 0.0009967132083163474, + "loss": 3.7651, + "step": 390 + }, + { + "epoch": 0.026837885582280202, + "grad_norm": 0.6869668364524841, + "learning_rate": 0.0009966707433075146, + "loss": 3.4933, + "step": 395 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 0.7344043254852295, + "learning_rate": 0.000996628278298682, + "loss": 3.8085, + "step": 400 + }, + { + "epoch": 0.027517325723603752, + "grad_norm": 3.5851566791534424, + "learning_rate": 0.0009965858132898492, + "loss": 3.6846, + "step": 405 + }, + { + "epoch": 0.027857045794265525, + "grad_norm": 0.5283172130584717, + "learning_rate": 0.0009965433482810163, + "loss": 3.9472, + "step": 410 + }, + { + "epoch": 0.0281967658649273, + "grad_norm": 1.6593360900878906, + "learning_rate": 0.0009965008832721837, + "loss": 4.1005, + "step": 415 + }, + { + "epoch": 0.028536485935589075, + "grad_norm": 0.8874955773353577, + "learning_rate": 0.000996458418263351, + "loss": 3.8157, + "step": 420 + }, + { + "epoch": 0.02887620600625085, + "grad_norm": 0.9176999926567078, + "learning_rate": 0.0009964159532545184, + "loss": 4.1564, + "step": 425 + }, + { + "epoch": 0.029215926076912625, + "grad_norm": 0.5801127552986145, + "learning_rate": 0.0009963734882456855, + "loss": 3.9305, + "step": 430 + }, + { + "epoch": 0.0295556461475744, + "grad_norm": 3.5147316455841064, + "learning_rate": 0.0009963310232368528, + "loss": 3.9751, + "step": 435 + }, + { + "epoch": 0.029895366218236172, + "grad_norm": 0.5302643179893494, + "learning_rate": 0.0009962885582280201, + "loss": 3.8841, + "step": 440 + }, + { + "epoch": 0.03023508628889795, + "grad_norm": 2.070713520050049, + "learning_rate": 0.0009962460932191873, + "loss": 3.7756, + "step": 445 + }, + { + "epoch": 0.030574806359559722, + "grad_norm": 0.56476891040802, + "learning_rate": 0.0009962036282103548, + "loss": 3.5931, + "step": 450 + }, + { + "epoch": 0.0309145264302215, + "grad_norm": 0.519072949886322, + "learning_rate": 0.000996161163201522, + "loss": 3.8472, + "step": 455 + }, + { + "epoch": 0.03125424650088327, + "grad_norm": 0.5036606788635254, + "learning_rate": 0.0009961186981926893, + "loss": 4.1518, + "step": 460 + }, + { + "epoch": 0.03159396657154505, + "grad_norm": 0.7856326699256897, + "learning_rate": 0.0009960762331838566, + "loss": 3.9065, + "step": 465 + }, + { + "epoch": 0.03193368664220682, + "grad_norm": 0.5266003608703613, + "learning_rate": 0.0009960337681750237, + "loss": 3.8486, + "step": 470 + }, + { + "epoch": 0.032273406712868595, + "grad_norm": 0.5279913544654846, + "learning_rate": 0.000995991303166191, + "loss": 4.0562, + "step": 475 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 0.6494545936584473, + "learning_rate": 0.0009959488381573584, + "loss": 3.7181, + "step": 480 + }, + { + "epoch": 0.03295284685419215, + "grad_norm": 0.5125119090080261, + "learning_rate": 0.0009959063731485257, + "loss": 3.6412, + "step": 485 + }, + { + "epoch": 0.03329256692485392, + "grad_norm": 0.5816330909729004, + "learning_rate": 0.000995863908139693, + "loss": 3.8733, + "step": 490 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 0.5368356108665466, + "learning_rate": 0.0009958214431308602, + "loss": 3.9537, + "step": 495 + }, + { + "epoch": 0.03397200706617747, + "grad_norm": 0.5322296023368835, + "learning_rate": 0.0009957789781220275, + "loss": 3.7972, + "step": 500 + }, + { + "epoch": 0.03431172713683924, + "grad_norm": 0.5174826979637146, + "learning_rate": 0.0009957365131131948, + "loss": 3.8631, + "step": 505 + }, + { + "epoch": 0.03465144720750102, + "grad_norm": 0.6245468854904175, + "learning_rate": 0.000995694048104362, + "loss": 3.8548, + "step": 510 + }, + { + "epoch": 0.034991167278162795, + "grad_norm": 0.6452138423919678, + "learning_rate": 0.0009956515830955293, + "loss": 3.8005, + "step": 515 + }, + { + "epoch": 0.03533088734882457, + "grad_norm": 0.585810124874115, + "learning_rate": 0.0009956091180866966, + "loss": 3.9999, + "step": 520 + }, + { + "epoch": 0.03567060741948634, + "grad_norm": 0.5478231310844421, + "learning_rate": 0.000995566653077864, + "loss": 3.7628, + "step": 525 + }, + { + "epoch": 0.03601032749014812, + "grad_norm": 0.6791477203369141, + "learning_rate": 0.000995524188069031, + "loss": 3.6566, + "step": 530 + }, + { + "epoch": 0.036350047560809895, + "grad_norm": 0.6311488747596741, + "learning_rate": 0.0009954817230601984, + "loss": 3.8548, + "step": 535 + }, + { + "epoch": 0.036689767631471665, + "grad_norm": 0.6846545934677124, + "learning_rate": 0.0009954392580513657, + "loss": 3.8034, + "step": 540 + }, + { + "epoch": 0.03702948770213344, + "grad_norm": 0.6652524471282959, + "learning_rate": 0.0009953967930425329, + "loss": 3.782, + "step": 545 + }, + { + "epoch": 0.03736920777279522, + "grad_norm": 0.49301138520240784, + "learning_rate": 0.0009953543280337002, + "loss": 4.1403, + "step": 550 + }, + { + "epoch": 0.03770892784345699, + "grad_norm": 0.565241813659668, + "learning_rate": 0.0009953118630248675, + "loss": 3.8542, + "step": 555 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 0.5375010967254639, + "learning_rate": 0.0009952693980160349, + "loss": 3.8869, + "step": 560 + }, + { + "epoch": 0.03838836798478054, + "grad_norm": 0.5215250849723816, + "learning_rate": 0.0009952269330072022, + "loss": 3.9443, + "step": 565 + }, + { + "epoch": 0.03872808805544232, + "grad_norm": 0.5483617186546326, + "learning_rate": 0.0009951844679983693, + "loss": 4.0829, + "step": 570 + }, + { + "epoch": 0.03906780812610409, + "grad_norm": 0.5078197717666626, + "learning_rate": 0.0009951420029895366, + "loss": 4.0247, + "step": 575 + }, + { + "epoch": 0.039407528196765865, + "grad_norm": 0.4967578947544098, + "learning_rate": 0.000995099537980704, + "loss": 3.7678, + "step": 580 + }, + { + "epoch": 0.03974724826742764, + "grad_norm": 0.9934768080711365, + "learning_rate": 0.000995057072971871, + "loss": 3.7721, + "step": 585 + }, + { + "epoch": 0.04008696833808941, + "grad_norm": 0.5784791707992554, + "learning_rate": 0.0009950146079630384, + "loss": 3.6503, + "step": 590 + }, + { + "epoch": 0.04042668840875119, + "grad_norm": 0.7383197546005249, + "learning_rate": 0.0009949721429542058, + "loss": 3.6984, + "step": 595 + }, + { + "epoch": 0.040766408479412965, + "grad_norm": 0.720099687576294, + "learning_rate": 0.000994929677945373, + "loss": 3.9122, + "step": 600 + }, + { + "epoch": 0.04110612855007474, + "grad_norm": 0.6150121688842773, + "learning_rate": 0.0009948872129365402, + "loss": 3.7934, + "step": 605 + }, + { + "epoch": 0.04144584862073651, + "grad_norm": 0.5290435552597046, + "learning_rate": 0.0009948447479277076, + "loss": 3.9144, + "step": 610 + }, + { + "epoch": 0.04178556869139829, + "grad_norm": 0.538284957408905, + "learning_rate": 0.0009948022829188749, + "loss": 3.811, + "step": 615 + }, + { + "epoch": 0.042125288762060065, + "grad_norm": 0.4686654508113861, + "learning_rate": 0.000994759817910042, + "loss": 3.6469, + "step": 620 + }, + { + "epoch": 0.042465008832721834, + "grad_norm": 0.44892096519470215, + "learning_rate": 0.0009947173529012096, + "loss": 3.448, + "step": 625 + }, + { + "epoch": 0.04280472890338361, + "grad_norm": 0.5298163294792175, + "learning_rate": 0.0009946748878923767, + "loss": 3.5932, + "step": 630 + }, + { + "epoch": 0.04314444897404539, + "grad_norm": 0.5648486614227295, + "learning_rate": 0.000994632422883544, + "loss": 3.7949, + "step": 635 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 0.543407142162323, + "learning_rate": 0.0009945899578747113, + "loss": 3.693, + "step": 640 + }, + { + "epoch": 0.043823889115368934, + "grad_norm": 0.6799116134643555, + "learning_rate": 0.0009945474928658785, + "loss": 4.0389, + "step": 645 + }, + { + "epoch": 0.04416360918603071, + "grad_norm": 0.4761027991771698, + "learning_rate": 0.0009945050278570458, + "loss": 3.9379, + "step": 650 + }, + { + "epoch": 0.04450332925669249, + "grad_norm": 0.4443792998790741, + "learning_rate": 0.0009944625628482131, + "loss": 3.786, + "step": 655 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 0.5917078256607056, + "learning_rate": 0.0009944200978393805, + "loss": 3.6835, + "step": 660 + }, + { + "epoch": 0.045182769398016034, + "grad_norm": 0.5975964069366455, + "learning_rate": 0.0009943776328305476, + "loss": 3.7144, + "step": 665 + }, + { + "epoch": 0.04552248946867781, + "grad_norm": 0.522092878818512, + "learning_rate": 0.000994335167821715, + "loss": 3.9535, + "step": 670 + }, + { + "epoch": 0.04586220953933958, + "grad_norm": 0.6475699543952942, + "learning_rate": 0.0009942927028128822, + "loss": 3.7183, + "step": 675 + }, + { + "epoch": 0.04620192961000136, + "grad_norm": 0.5710383653640747, + "learning_rate": 0.0009942502378040494, + "loss": 3.85, + "step": 680 + }, + { + "epoch": 0.046541649680663134, + "grad_norm": 0.6206600069999695, + "learning_rate": 0.0009942077727952167, + "loss": 3.9603, + "step": 685 + }, + { + "epoch": 0.04688136975132491, + "grad_norm": 0.5517189502716064, + "learning_rate": 0.000994165307786384, + "loss": 3.9551, + "step": 690 + }, + { + "epoch": 0.04722108982198668, + "grad_norm": 0.5132995247840881, + "learning_rate": 0.0009941228427775514, + "loss": 3.8, + "step": 695 + }, + { + "epoch": 0.04756080989264846, + "grad_norm": 0.43643656373023987, + "learning_rate": 0.0009940803777687187, + "loss": 3.9457, + "step": 700 + }, + { + "epoch": 0.047900529963310234, + "grad_norm": 0.5518332123756409, + "learning_rate": 0.0009940379127598858, + "loss": 3.8862, + "step": 705 + }, + { + "epoch": 0.048240250033972004, + "grad_norm": 0.5246963500976562, + "learning_rate": 0.0009939954477510532, + "loss": 3.8879, + "step": 710 + }, + { + "epoch": 0.04857997010463378, + "grad_norm": 0.6005711555480957, + "learning_rate": 0.0009939529827422205, + "loss": 3.686, + "step": 715 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 0.5297259092330933, + "learning_rate": 0.0009939105177333876, + "loss": 3.9488, + "step": 720 + }, + { + "epoch": 0.049259410245957334, + "grad_norm": 0.8890801072120667, + "learning_rate": 0.000993868052724555, + "loss": 3.7205, + "step": 725 + }, + { + "epoch": 0.049599130316619104, + "grad_norm": 0.6051033139228821, + "learning_rate": 0.0009938255877157223, + "loss": 3.8075, + "step": 730 + }, + { + "epoch": 0.04993885038728088, + "grad_norm": 0.7406430840492249, + "learning_rate": 0.0009937831227068896, + "loss": 3.8095, + "step": 735 + }, + { + "epoch": 0.05027857045794266, + "grad_norm": 0.6831587553024292, + "learning_rate": 0.0009937406576980567, + "loss": 3.9945, + "step": 740 + }, + { + "epoch": 0.05061829052860443, + "grad_norm": 0.5415137410163879, + "learning_rate": 0.000993698192689224, + "loss": 3.6272, + "step": 745 + }, + { + "epoch": 0.050958010599266204, + "grad_norm": 0.6302469968795776, + "learning_rate": 0.0009936557276803914, + "loss": 3.8668, + "step": 750 + }, + { + "epoch": 0.05129773066992798, + "grad_norm": 0.5909126996994019, + "learning_rate": 0.0009936132626715585, + "loss": 3.8503, + "step": 755 + }, + { + "epoch": 0.05163745074058975, + "grad_norm": 0.6041208505630493, + "learning_rate": 0.000993570797662726, + "loss": 3.9896, + "step": 760 + }, + { + "epoch": 0.05197717081125153, + "grad_norm": 0.5620517134666443, + "learning_rate": 0.0009935283326538932, + "loss": 3.6891, + "step": 765 + }, + { + "epoch": 0.052316890881913304, + "grad_norm": 0.5894716382026672, + "learning_rate": 0.0009934858676450605, + "loss": 3.7675, + "step": 770 + }, + { + "epoch": 0.05265661095257508, + "grad_norm": 0.6489331126213074, + "learning_rate": 0.0009934434026362278, + "loss": 3.881, + "step": 775 + }, + { + "epoch": 0.05299633102323685, + "grad_norm": 0.5325939655303955, + "learning_rate": 0.000993400937627395, + "loss": 3.9001, + "step": 780 + }, + { + "epoch": 0.05333605109389863, + "grad_norm": 0.7115577459335327, + "learning_rate": 0.0009933584726185623, + "loss": 3.6433, + "step": 785 + }, + { + "epoch": 0.053675771164560404, + "grad_norm": 0.5939438343048096, + "learning_rate": 0.0009933160076097296, + "loss": 3.8852, + "step": 790 + }, + { + "epoch": 0.054015491235222174, + "grad_norm": 0.58011394739151, + "learning_rate": 0.000993273542600897, + "loss": 3.7433, + "step": 795 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 0.5787004232406616, + "learning_rate": 0.0009932310775920643, + "loss": 4.0052, + "step": 800 + }, + { + "epoch": 0.05469493137654573, + "grad_norm": 0.5443267226219177, + "learning_rate": 0.0009931886125832314, + "loss": 4.0476, + "step": 805 + }, + { + "epoch": 0.055034651447207504, + "grad_norm": 0.5212072134017944, + "learning_rate": 0.0009931461475743988, + "loss": 3.9724, + "step": 810 + }, + { + "epoch": 0.055374371517869274, + "grad_norm": 0.7275431752204895, + "learning_rate": 0.000993103682565566, + "loss": 3.869, + "step": 815 + }, + { + "epoch": 0.05571409158853105, + "grad_norm": 0.6683277487754822, + "learning_rate": 0.0009930612175567332, + "loss": 3.7693, + "step": 820 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 0.47047159075737, + "learning_rate": 0.0009930187525479005, + "loss": 3.6353, + "step": 825 + }, + { + "epoch": 0.0563935317298546, + "grad_norm": 0.5462180972099304, + "learning_rate": 0.0009929762875390679, + "loss": 3.6996, + "step": 830 + }, + { + "epoch": 0.056733251800516374, + "grad_norm": 0.5828820466995239, + "learning_rate": 0.0009929338225302352, + "loss": 3.852, + "step": 835 + }, + { + "epoch": 0.05707297187117815, + "grad_norm": 0.48185303807258606, + "learning_rate": 0.0009928913575214023, + "loss": 4.0958, + "step": 840 + }, + { + "epoch": 0.05741269194183993, + "grad_norm": 0.6095482707023621, + "learning_rate": 0.0009928488925125697, + "loss": 3.9707, + "step": 845 + }, + { + "epoch": 0.0577524120125017, + "grad_norm": 0.5710833072662354, + "learning_rate": 0.000992806427503737, + "loss": 3.8781, + "step": 850 + }, + { + "epoch": 0.058092132083163474, + "grad_norm": 0.6475180387496948, + "learning_rate": 0.0009927639624949041, + "loss": 3.7117, + "step": 855 + }, + { + "epoch": 0.05843185215382525, + "grad_norm": 0.5677533149719238, + "learning_rate": 0.0009927214974860714, + "loss": 3.5534, + "step": 860 + }, + { + "epoch": 0.05877157222448702, + "grad_norm": 0.6643016338348389, + "learning_rate": 0.0009926790324772388, + "loss": 3.8354, + "step": 865 + }, + { + "epoch": 0.0591112922951488, + "grad_norm": 2.15201473236084, + "learning_rate": 0.0009926365674684061, + "loss": 3.8914, + "step": 870 + }, + { + "epoch": 0.059451012365810574, + "grad_norm": 0.7162554860115051, + "learning_rate": 0.0009925941024595734, + "loss": 3.6254, + "step": 875 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 1.1859124898910522, + "learning_rate": 0.0009925516374507406, + "loss": 3.7454, + "step": 880 + }, + { + "epoch": 0.06013045250713412, + "grad_norm": 0.6241307854652405, + "learning_rate": 0.000992509172441908, + "loss": 3.7778, + "step": 885 + }, + { + "epoch": 0.0604701725777959, + "grad_norm": 0.7306844592094421, + "learning_rate": 0.0009924667074330752, + "loss": 3.9615, + "step": 890 + }, + { + "epoch": 0.060809892648457674, + "grad_norm": 0.5253746509552002, + "learning_rate": 0.0009924242424242424, + "loss": 3.9075, + "step": 895 + }, + { + "epoch": 0.061149612719119444, + "grad_norm": 0.6302961111068726, + "learning_rate": 0.0009923817774154097, + "loss": 3.8657, + "step": 900 + }, + { + "epoch": 0.06148933278978122, + "grad_norm": 0.9820981025695801, + "learning_rate": 0.000992339312406577, + "loss": 3.7156, + "step": 905 + }, + { + "epoch": 0.061829052860443, + "grad_norm": 1.0277503728866577, + "learning_rate": 0.0009922968473977444, + "loss": 3.6718, + "step": 910 + }, + { + "epoch": 0.06216877293110477, + "grad_norm": 0.6224450469017029, + "learning_rate": 0.0009922543823889115, + "loss": 4.03, + "step": 915 + }, + { + "epoch": 0.06250849300176654, + "grad_norm": 0.5719863176345825, + "learning_rate": 0.0009922119173800788, + "loss": 3.8174, + "step": 920 + }, + { + "epoch": 0.06284821307242831, + "grad_norm": 0.6054795980453491, + "learning_rate": 0.0009921694523712461, + "loss": 3.8943, + "step": 925 + }, + { + "epoch": 0.0631879331430901, + "grad_norm": 0.5510122179985046, + "learning_rate": 0.0009921269873624133, + "loss": 3.8721, + "step": 930 + }, + { + "epoch": 0.06352765321375187, + "grad_norm": 0.5710406303405762, + "learning_rate": 0.0009920845223535808, + "loss": 4.0071, + "step": 935 + }, + { + "epoch": 0.06386737328441364, + "grad_norm": 0.5538541674613953, + "learning_rate": 0.000992042057344748, + "loss": 3.637, + "step": 940 + }, + { + "epoch": 0.06420709335507542, + "grad_norm": 0.5355236530303955, + "learning_rate": 0.0009919995923359153, + "loss": 3.7712, + "step": 945 + }, + { + "epoch": 0.06454681342573719, + "grad_norm": 0.5843560695648193, + "learning_rate": 0.0009919571273270826, + "loss": 3.8616, + "step": 950 + }, + { + "epoch": 0.06488653349639897, + "grad_norm": 0.8365575671195984, + "learning_rate": 0.0009919146623182497, + "loss": 3.7749, + "step": 955 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 0.6228957176208496, + "learning_rate": 0.000991872197309417, + "loss": 3.8968, + "step": 960 + }, + { + "epoch": 0.06556597363772251, + "grad_norm": 0.47003018856048584, + "learning_rate": 0.0009918297323005844, + "loss": 3.7327, + "step": 965 + }, + { + "epoch": 0.0659056937083843, + "grad_norm": 0.8032851815223694, + "learning_rate": 0.0009917872672917517, + "loss": 4.0484, + "step": 970 + }, + { + "epoch": 0.06624541377904607, + "grad_norm": 0.9801841378211975, + "learning_rate": 0.0009917448022829188, + "loss": 4.0092, + "step": 975 + }, + { + "epoch": 0.06658513384970784, + "grad_norm": 0.628662109375, + "learning_rate": 0.0009917023372740862, + "loss": 3.8591, + "step": 980 + }, + { + "epoch": 0.06692485392036962, + "grad_norm": 0.592852771282196, + "learning_rate": 0.0009916598722652535, + "loss": 3.8501, + "step": 985 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 0.5750938057899475, + "learning_rate": 0.0009916174072564206, + "loss": 3.861, + "step": 990 + }, + { + "epoch": 0.06760429406169316, + "grad_norm": 0.510252833366394, + "learning_rate": 0.000991574942247588, + "loss": 3.7132, + "step": 995 + }, + { + "epoch": 0.06794401413235494, + "grad_norm": 0.6436201930046082, + "learning_rate": 0.0009915324772387553, + "loss": 3.8438, + "step": 1000 + }, + { + "epoch": 0.06828373420301671, + "grad_norm": 0.6968289017677307, + "learning_rate": 0.0009914900122299226, + "loss": 3.7903, + "step": 1005 + }, + { + "epoch": 0.06862345427367848, + "grad_norm": 0.5379902720451355, + "learning_rate": 0.00099144754722109, + "loss": 3.6007, + "step": 1010 + }, + { + "epoch": 0.06896317434434027, + "grad_norm": 0.5906341671943665, + "learning_rate": 0.000991405082212257, + "loss": 3.9147, + "step": 1015 + }, + { + "epoch": 0.06930289441500204, + "grad_norm": 0.5015144944190979, + "learning_rate": 0.0009913626172034244, + "loss": 3.7384, + "step": 1020 + }, + { + "epoch": 0.0696426144856638, + "grad_norm": 0.5123012065887451, + "learning_rate": 0.0009913201521945917, + "loss": 3.5768, + "step": 1025 + }, + { + "epoch": 0.06998233455632559, + "grad_norm": 1.0012619495391846, + "learning_rate": 0.0009912776871857589, + "loss": 3.982, + "step": 1030 + }, + { + "epoch": 0.07032205462698736, + "grad_norm": 0.6756113767623901, + "learning_rate": 0.0009912352221769262, + "loss": 3.7547, + "step": 1035 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 0.6944485902786255, + "learning_rate": 0.0009911927571680935, + "loss": 3.7883, + "step": 1040 + }, + { + "epoch": 0.07100149476831091, + "grad_norm": 0.8792134523391724, + "learning_rate": 0.0009911502921592609, + "loss": 3.7693, + "step": 1045 + }, + { + "epoch": 0.07134121483897268, + "grad_norm": 0.6554251313209534, + "learning_rate": 0.000991107827150428, + "loss": 4.0296, + "step": 1050 + }, + { + "epoch": 0.07168093490963447, + "grad_norm": 0.9558029174804688, + "learning_rate": 0.0009910653621415953, + "loss": 3.6429, + "step": 1055 + }, + { + "epoch": 0.07202065498029624, + "grad_norm": 0.5566821098327637, + "learning_rate": 0.0009910228971327626, + "loss": 3.6473, + "step": 1060 + }, + { + "epoch": 0.072360375050958, + "grad_norm": 0.6007353067398071, + "learning_rate": 0.0009909804321239298, + "loss": 3.5823, + "step": 1065 + }, + { + "epoch": 0.07270009512161979, + "grad_norm": 2.572160005569458, + "learning_rate": 0.000990937967115097, + "loss": 3.9877, + "step": 1070 + }, + { + "epoch": 0.07303981519228156, + "grad_norm": 0.6886956691741943, + "learning_rate": 0.0009908955021062644, + "loss": 3.6801, + "step": 1075 + }, + { + "epoch": 0.07337953526294333, + "grad_norm": 0.718035101890564, + "learning_rate": 0.0009908530370974318, + "loss": 3.9933, + "step": 1080 + }, + { + "epoch": 0.07371925533360511, + "grad_norm": 0.5323196649551392, + "learning_rate": 0.000990810572088599, + "loss": 3.7103, + "step": 1085 + }, + { + "epoch": 0.07405897540426688, + "grad_norm": 0.6115411520004272, + "learning_rate": 0.0009907681070797662, + "loss": 3.776, + "step": 1090 + }, + { + "epoch": 0.07439869547492865, + "grad_norm": 0.6811002492904663, + "learning_rate": 0.0009907256420709336, + "loss": 3.8797, + "step": 1095 + }, + { + "epoch": 0.07473841554559044, + "grad_norm": 0.6046304702758789, + "learning_rate": 0.0009906831770621009, + "loss": 3.3928, + "step": 1100 + }, + { + "epoch": 0.0750781356162522, + "grad_norm": 0.5722492337226868, + "learning_rate": 0.000990640712053268, + "loss": 4.0288, + "step": 1105 + }, + { + "epoch": 0.07541785568691398, + "grad_norm": 0.6895374059677124, + "learning_rate": 0.0009905982470444353, + "loss": 3.7249, + "step": 1110 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 6.356192111968994, + "learning_rate": 0.0009905557820356027, + "loss": 3.8826, + "step": 1115 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 0.5069996118545532, + "learning_rate": 0.00099051331702677, + "loss": 3.7226, + "step": 1120 + }, + { + "epoch": 0.07643701589889931, + "grad_norm": 0.7593939304351807, + "learning_rate": 0.0009904708520179371, + "loss": 3.8677, + "step": 1125 + }, + { + "epoch": 0.07677673596956108, + "grad_norm": 0.7149084806442261, + "learning_rate": 0.0009904283870091045, + "loss": 3.6559, + "step": 1130 + }, + { + "epoch": 0.07711645604022285, + "grad_norm": 0.6425236463546753, + "learning_rate": 0.0009903859220002718, + "loss": 3.7767, + "step": 1135 + }, + { + "epoch": 0.07745617611088464, + "grad_norm": 0.77424556016922, + "learning_rate": 0.0009903434569914391, + "loss": 3.4686, + "step": 1140 + }, + { + "epoch": 0.0777958961815464, + "grad_norm": 0.998789370059967, + "learning_rate": 0.0009903009919826065, + "loss": 3.7939, + "step": 1145 + }, + { + "epoch": 0.07813561625220818, + "grad_norm": 0.7090451717376709, + "learning_rate": 0.0009902585269737736, + "loss": 3.6671, + "step": 1150 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 0.6087352633476257, + "learning_rate": 0.000990216061964941, + "loss": 3.8358, + "step": 1155 + }, + { + "epoch": 0.07881505639353173, + "grad_norm": 0.5472407341003418, + "learning_rate": 0.0009901735969561083, + "loss": 3.6392, + "step": 1160 + }, + { + "epoch": 0.0791547764641935, + "grad_norm": 0.6369988918304443, + "learning_rate": 0.0009901311319472754, + "loss": 3.8922, + "step": 1165 + }, + { + "epoch": 0.07949449653485528, + "grad_norm": 0.5358173847198486, + "learning_rate": 0.0009900886669384427, + "loss": 3.7758, + "step": 1170 + }, + { + "epoch": 0.07983421660551705, + "grad_norm": 0.6733694076538086, + "learning_rate": 0.00099004620192961, + "loss": 3.6851, + "step": 1175 + }, + { + "epoch": 0.08017393667617882, + "grad_norm": 1.6449182033538818, + "learning_rate": 0.0009900037369207774, + "loss": 4.103, + "step": 1180 + }, + { + "epoch": 0.0805136567468406, + "grad_norm": 0.8030320405960083, + "learning_rate": 0.0009899612719119447, + "loss": 3.8129, + "step": 1185 + }, + { + "epoch": 0.08085337681750238, + "grad_norm": 0.5922967791557312, + "learning_rate": 0.0009899188069031118, + "loss": 3.8989, + "step": 1190 + }, + { + "epoch": 0.08119309688816416, + "grad_norm": 0.5337926149368286, + "learning_rate": 0.0009898763418942792, + "loss": 3.5802, + "step": 1195 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 0.6427436470985413, + "learning_rate": 0.0009898338768854465, + "loss": 3.8666, + "step": 1200 + }, + { + "epoch": 0.0818725370294877, + "grad_norm": 0.8470380306243896, + "learning_rate": 0.0009897914118766136, + "loss": 4.0009, + "step": 1205 + }, + { + "epoch": 0.08221225710014948, + "grad_norm": 0.5909467339515686, + "learning_rate": 0.000989748946867781, + "loss": 3.5994, + "step": 1210 + }, + { + "epoch": 0.08255197717081125, + "grad_norm": 0.6932828426361084, + "learning_rate": 0.0009897064818589483, + "loss": 3.6458, + "step": 1215 + }, + { + "epoch": 0.08289169724147302, + "grad_norm": 0.5876645445823669, + "learning_rate": 0.0009896640168501156, + "loss": 3.6359, + "step": 1220 + }, + { + "epoch": 0.0832314173121348, + "grad_norm": 0.5505638122558594, + "learning_rate": 0.0009896215518412827, + "loss": 3.6581, + "step": 1225 + }, + { + "epoch": 0.08357113738279658, + "grad_norm": 0.6316356658935547, + "learning_rate": 0.00098957908683245, + "loss": 3.8968, + "step": 1230 + }, + { + "epoch": 0.08391085745345835, + "grad_norm": 0.620417058467865, + "learning_rate": 0.0009895366218236174, + "loss": 3.698, + "step": 1235 + }, + { + "epoch": 0.08425057752412013, + "grad_norm": 0.7232619524002075, + "learning_rate": 0.0009894941568147845, + "loss": 3.781, + "step": 1240 + }, + { + "epoch": 0.0845902975947819, + "grad_norm": 0.6691374182701111, + "learning_rate": 0.000989451691805952, + "loss": 3.5228, + "step": 1245 + }, + { + "epoch": 0.08493001766544367, + "grad_norm": 0.8265055418014526, + "learning_rate": 0.0009894092267971192, + "loss": 3.8381, + "step": 1250 + }, + { + "epoch": 0.08526973773610545, + "grad_norm": 0.5624240636825562, + "learning_rate": 0.0009893667617882865, + "loss": 3.9376, + "step": 1255 + }, + { + "epoch": 0.08560945780676722, + "grad_norm": 0.647571861743927, + "learning_rate": 0.0009893242967794539, + "loss": 3.614, + "step": 1260 + }, + { + "epoch": 0.08594917787742899, + "grad_norm": 0.640143871307373, + "learning_rate": 0.000989281831770621, + "loss": 3.693, + "step": 1265 + }, + { + "epoch": 0.08628889794809078, + "grad_norm": 0.5218189358711243, + "learning_rate": 0.0009892393667617883, + "loss": 3.6869, + "step": 1270 + }, + { + "epoch": 0.08662861801875255, + "grad_norm": 0.6464168429374695, + "learning_rate": 0.0009891969017529556, + "loss": 3.7439, + "step": 1275 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 1.1705642938613892, + "learning_rate": 0.000989154436744123, + "loss": 3.9224, + "step": 1280 + }, + { + "epoch": 0.0873080581600761, + "grad_norm": 0.6425521969795227, + "learning_rate": 0.00098911197173529, + "loss": 4.2846, + "step": 1285 + }, + { + "epoch": 0.08764777823073787, + "grad_norm": 0.6133180856704712, + "learning_rate": 0.0009890695067264574, + "loss": 3.8211, + "step": 1290 + }, + { + "epoch": 0.08798749830139965, + "grad_norm": 0.533719539642334, + "learning_rate": 0.0009890270417176248, + "loss": 3.7322, + "step": 1295 + }, + { + "epoch": 0.08832721837206142, + "grad_norm": 0.5690361261367798, + "learning_rate": 0.0009889845767087919, + "loss": 4.0328, + "step": 1300 + }, + { + "epoch": 0.08866693844272319, + "grad_norm": 0.5779794454574585, + "learning_rate": 0.0009889421116999592, + "loss": 3.8317, + "step": 1305 + }, + { + "epoch": 0.08900665851338498, + "grad_norm": 0.4988411068916321, + "learning_rate": 0.0009888996466911265, + "loss": 3.855, + "step": 1310 + }, + { + "epoch": 0.08934637858404675, + "grad_norm": 0.7643659710884094, + "learning_rate": 0.0009888571816822939, + "loss": 3.7321, + "step": 1315 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 0.7879651188850403, + "learning_rate": 0.0009888147166734612, + "loss": 3.4683, + "step": 1320 + }, + { + "epoch": 0.0900258187253703, + "grad_norm": 0.6282975673675537, + "learning_rate": 0.0009887722516646283, + "loss": 3.5149, + "step": 1325 + }, + { + "epoch": 0.09036553879603207, + "grad_norm": 0.6503021717071533, + "learning_rate": 0.0009887297866557957, + "loss": 3.7902, + "step": 1330 + }, + { + "epoch": 0.09070525886669384, + "grad_norm": 0.6619112491607666, + "learning_rate": 0.000988687321646963, + "loss": 3.848, + "step": 1335 + }, + { + "epoch": 0.09104497893735562, + "grad_norm": 0.6034533381462097, + "learning_rate": 0.0009886448566381301, + "loss": 3.9998, + "step": 1340 + }, + { + "epoch": 0.09138469900801739, + "grad_norm": 0.6787202954292297, + "learning_rate": 0.0009886023916292975, + "loss": 3.5296, + "step": 1345 + }, + { + "epoch": 0.09172441907867916, + "grad_norm": 0.6927317976951599, + "learning_rate": 0.0009885599266204648, + "loss": 3.6109, + "step": 1350 + }, + { + "epoch": 0.09206413914934095, + "grad_norm": 0.48473402857780457, + "learning_rate": 0.0009885174616116321, + "loss": 3.8539, + "step": 1355 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 0.8487088084220886, + "learning_rate": 0.0009884749966027992, + "loss": 3.7033, + "step": 1360 + }, + { + "epoch": 0.0927435792906645, + "grad_norm": 0.5436307191848755, + "learning_rate": 0.0009884325315939666, + "loss": 3.7355, + "step": 1365 + }, + { + "epoch": 0.09308329936132627, + "grad_norm": 0.5661829113960266, + "learning_rate": 0.000988390066585134, + "loss": 3.7847, + "step": 1370 + }, + { + "epoch": 0.09342301943198804, + "grad_norm": 0.6609107255935669, + "learning_rate": 0.000988347601576301, + "loss": 3.8913, + "step": 1375 + }, + { + "epoch": 0.09376273950264982, + "grad_norm": 0.6719528436660767, + "learning_rate": 0.0009883051365674684, + "loss": 3.6255, + "step": 1380 + }, + { + "epoch": 0.09410245957331159, + "grad_norm": 0.7676579356193542, + "learning_rate": 0.0009882626715586357, + "loss": 3.7682, + "step": 1385 + }, + { + "epoch": 0.09444217964397336, + "grad_norm": 0.660991370677948, + "learning_rate": 0.000988220206549803, + "loss": 3.7209, + "step": 1390 + }, + { + "epoch": 0.09478189971463515, + "grad_norm": 0.672208845615387, + "learning_rate": 0.0009881777415409704, + "loss": 3.7323, + "step": 1395 + }, + { + "epoch": 0.09512161978529692, + "grad_norm": 0.518471360206604, + "learning_rate": 0.0009881352765321375, + "loss": 3.7968, + "step": 1400 + }, + { + "epoch": 0.09546133985595869, + "grad_norm": 0.5104296803474426, + "learning_rate": 0.0009880928115233048, + "loss": 3.8916, + "step": 1405 + }, + { + "epoch": 0.09580105992662047, + "grad_norm": 0.5241336226463318, + "learning_rate": 0.0009880503465144721, + "loss": 3.8582, + "step": 1410 + }, + { + "epoch": 0.09614077999728224, + "grad_norm": 0.5548410415649414, + "learning_rate": 0.0009880078815056393, + "loss": 3.82, + "step": 1415 + }, + { + "epoch": 0.09648050006794401, + "grad_norm": 0.5178701281547546, + "learning_rate": 0.0009879654164968066, + "loss": 3.9328, + "step": 1420 + }, + { + "epoch": 0.09682022013860579, + "grad_norm": 0.6589826941490173, + "learning_rate": 0.000987922951487974, + "loss": 3.8546, + "step": 1425 + }, + { + "epoch": 0.09715994020926756, + "grad_norm": 0.5174300074577332, + "learning_rate": 0.0009878804864791413, + "loss": 3.7733, + "step": 1430 + }, + { + "epoch": 0.09749966027992933, + "grad_norm": 0.7090018391609192, + "learning_rate": 0.0009878380214703084, + "loss": 3.7061, + "step": 1435 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 0.7324854135513306, + "learning_rate": 0.0009877955564614757, + "loss": 3.7906, + "step": 1440 + }, + { + "epoch": 0.09817910042125289, + "grad_norm": 0.5318551659584045, + "learning_rate": 0.000987753091452643, + "loss": 3.6494, + "step": 1445 + }, + { + "epoch": 0.09851882049191467, + "grad_norm": 1.4100422859191895, + "learning_rate": 0.0009877106264438102, + "loss": 3.718, + "step": 1450 + }, + { + "epoch": 0.09885854056257644, + "grad_norm": 0.6092397570610046, + "learning_rate": 0.0009876681614349777, + "loss": 3.7808, + "step": 1455 + }, + { + "epoch": 0.09919826063323821, + "grad_norm": 10.65583324432373, + "learning_rate": 0.0009876256964261448, + "loss": 3.8743, + "step": 1460 + }, + { + "epoch": 0.09953798070389999, + "grad_norm": 0.8897668123245239, + "learning_rate": 0.0009875832314173122, + "loss": 3.7571, + "step": 1465 + }, + { + "epoch": 0.09987770077456176, + "grad_norm": 0.5916352868080139, + "learning_rate": 0.0009875407664084795, + "loss": 3.9678, + "step": 1470 + }, + { + "epoch": 0.10021742084522353, + "grad_norm": 0.5981304049491882, + "learning_rate": 0.0009874983013996466, + "loss": 3.542, + "step": 1475 + }, + { + "epoch": 0.10055714091588532, + "grad_norm": 0.8324027061462402, + "learning_rate": 0.000987455836390814, + "loss": 3.8354, + "step": 1480 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 0.7107352018356323, + "learning_rate": 0.0009874133713819813, + "loss": 3.6449, + "step": 1485 + }, + { + "epoch": 0.10123658105720885, + "grad_norm": 0.6234697103500366, + "learning_rate": 0.0009873709063731486, + "loss": 3.7533, + "step": 1490 + }, + { + "epoch": 0.10157630112787064, + "grad_norm": 0.564215898513794, + "learning_rate": 0.000987328441364316, + "loss": 3.7199, + "step": 1495 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 2.565659999847412, + "learning_rate": 0.000987285976355483, + "loss": 3.8506, + "step": 1500 + }, + { + "epoch": 0.10225574126919418, + "grad_norm": 0.6729772686958313, + "learning_rate": 0.0009872435113466504, + "loss": 3.8449, + "step": 1505 + }, + { + "epoch": 0.10259546133985596, + "grad_norm": 1.0147976875305176, + "learning_rate": 0.0009872010463378177, + "loss": 3.7593, + "step": 1510 + }, + { + "epoch": 0.10293518141051773, + "grad_norm": 0.671852171421051, + "learning_rate": 0.0009871585813289849, + "loss": 3.7993, + "step": 1515 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 3.002666711807251, + "learning_rate": 0.0009871161163201522, + "loss": 3.658, + "step": 1520 + }, + { + "epoch": 0.10361462155184128, + "grad_norm": 0.7295017838478088, + "learning_rate": 0.0009870736513113195, + "loss": 3.8309, + "step": 1525 + }, + { + "epoch": 0.10395434162250305, + "grad_norm": 0.6976558566093445, + "learning_rate": 0.0009870311863024869, + "loss": 3.7905, + "step": 1530 + }, + { + "epoch": 0.10429406169316484, + "grad_norm": 0.5134066939353943, + "learning_rate": 0.000986988721293654, + "loss": 3.9279, + "step": 1535 + }, + { + "epoch": 0.10463378176382661, + "grad_norm": 0.6964764595031738, + "learning_rate": 0.0009869462562848213, + "loss": 3.8237, + "step": 1540 + }, + { + "epoch": 0.10497350183448838, + "grad_norm": 0.5132859349250793, + "learning_rate": 0.0009869037912759887, + "loss": 3.8312, + "step": 1545 + }, + { + "epoch": 0.10531322190515016, + "grad_norm": 0.6843128204345703, + "learning_rate": 0.0009868613262671558, + "loss": 3.7985, + "step": 1550 + }, + { + "epoch": 0.10565294197581193, + "grad_norm": 0.8300333619117737, + "learning_rate": 0.000986818861258323, + "loss": 3.6729, + "step": 1555 + }, + { + "epoch": 0.1059926620464737, + "grad_norm": 1.3154003620147705, + "learning_rate": 0.0009867763962494904, + "loss": 3.75, + "step": 1560 + }, + { + "epoch": 0.10633238211713548, + "grad_norm": 0.7497626543045044, + "learning_rate": 0.0009867339312406578, + "loss": 3.8579, + "step": 1565 + }, + { + "epoch": 0.10667210218779725, + "grad_norm": 0.6044666767120361, + "learning_rate": 0.0009866914662318251, + "loss": 3.8955, + "step": 1570 + }, + { + "epoch": 0.10701182225845902, + "grad_norm": 0.6490479111671448, + "learning_rate": 0.0009866490012229922, + "loss": 3.9642, + "step": 1575 + }, + { + "epoch": 0.10735154232912081, + "grad_norm": 0.5570883750915527, + "learning_rate": 0.0009866065362141596, + "loss": 3.8322, + "step": 1580 + }, + { + "epoch": 0.10769126239978258, + "grad_norm": 0.5646046996116638, + "learning_rate": 0.000986564071205327, + "loss": 3.7635, + "step": 1585 + }, + { + "epoch": 0.10803098247044435, + "grad_norm": 0.6692689061164856, + "learning_rate": 0.000986521606196494, + "loss": 3.711, + "step": 1590 + }, + { + "epoch": 0.10837070254110613, + "grad_norm": 0.7941418886184692, + "learning_rate": 0.0009864791411876613, + "loss": 3.9109, + "step": 1595 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 0.5412370562553406, + "learning_rate": 0.0009864366761788287, + "loss": 3.7685, + "step": 1600 + }, + { + "epoch": 0.10905014268242967, + "grad_norm": 0.5807170271873474, + "learning_rate": 0.000986394211169996, + "loss": 3.7857, + "step": 1605 + }, + { + "epoch": 0.10938986275309145, + "grad_norm": 0.7795683741569519, + "learning_rate": 0.0009863517461611631, + "loss": 3.8869, + "step": 1610 + }, + { + "epoch": 0.10972958282375322, + "grad_norm": 0.6435133814811707, + "learning_rate": 0.0009863092811523305, + "loss": 3.7709, + "step": 1615 + }, + { + "epoch": 0.11006930289441501, + "grad_norm": 0.7029778361320496, + "learning_rate": 0.0009862668161434978, + "loss": 3.6154, + "step": 1620 + }, + { + "epoch": 0.11040902296507678, + "grad_norm": 0.5450308322906494, + "learning_rate": 0.000986224351134665, + "loss": 3.6991, + "step": 1625 + }, + { + "epoch": 0.11074874303573855, + "grad_norm": 0.7857054471969604, + "learning_rate": 0.0009861818861258325, + "loss": 3.7906, + "step": 1630 + }, + { + "epoch": 0.11108846310640033, + "grad_norm": 0.9489302039146423, + "learning_rate": 0.000986147914118766, + "loss": 3.7714, + "step": 1635 + }, + { + "epoch": 0.1114281831770621, + "grad_norm": 0.6171247959136963, + "learning_rate": 0.0009861054491099334, + "loss": 3.8672, + "step": 1640 + }, + { + "epoch": 0.11176790324772387, + "grad_norm": 0.5731608867645264, + "learning_rate": 0.0009860629841011007, + "loss": 3.908, + "step": 1645 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 0.5450822710990906, + "learning_rate": 0.000986020519092268, + "loss": 3.6911, + "step": 1650 + }, + { + "epoch": 0.11244734338904742, + "grad_norm": 0.7144281268119812, + "learning_rate": 0.0009859780540834352, + "loss": 3.91, + "step": 1655 + }, + { + "epoch": 0.1127870634597092, + "grad_norm": 0.8958132863044739, + "learning_rate": 0.0009859355890746025, + "loss": 3.7964, + "step": 1660 + }, + { + "epoch": 0.11312678353037098, + "grad_norm": 4.702531337738037, + "learning_rate": 0.0009858931240657699, + "loss": 3.7893, + "step": 1665 + }, + { + "epoch": 0.11346650360103275, + "grad_norm": 0.8923603892326355, + "learning_rate": 0.000985850659056937, + "loss": 3.7818, + "step": 1670 + }, + { + "epoch": 0.11380622367169452, + "grad_norm": 0.6885060667991638, + "learning_rate": 0.0009858081940481043, + "loss": 3.8224, + "step": 1675 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 0.7664501070976257, + "learning_rate": 0.0009857657290392717, + "loss": 3.7937, + "step": 1680 + }, + { + "epoch": 0.11448566381301807, + "grad_norm": 0.5449358224868774, + "learning_rate": 0.000985723264030439, + "loss": 3.6439, + "step": 1685 + }, + { + "epoch": 0.11482538388367985, + "grad_norm": 0.560732901096344, + "learning_rate": 0.0009856807990216063, + "loss": 3.4441, + "step": 1690 + }, + { + "epoch": 0.11516510395434162, + "grad_norm": 0.5761048793792725, + "learning_rate": 0.0009856383340127734, + "loss": 3.6604, + "step": 1695 + }, + { + "epoch": 0.1155048240250034, + "grad_norm": 0.6777898073196411, + "learning_rate": 0.0009855958690039408, + "loss": 3.8273, + "step": 1700 + }, + { + "epoch": 0.11584454409566518, + "grad_norm": 0.5846005082130432, + "learning_rate": 0.000985553403995108, + "loss": 3.537, + "step": 1705 + }, + { + "epoch": 0.11618426416632695, + "grad_norm": 0.505875289440155, + "learning_rate": 0.0009855109389862752, + "loss": 3.9624, + "step": 1710 + }, + { + "epoch": 0.11652398423698872, + "grad_norm": 0.6299620270729065, + "learning_rate": 0.0009854684739774426, + "loss": 3.5603, + "step": 1715 + }, + { + "epoch": 0.1168637043076505, + "grad_norm": 0.668428897857666, + "learning_rate": 0.00098542600896861, + "loss": 3.841, + "step": 1720 + }, + { + "epoch": 0.11720342437831227, + "grad_norm": 1.202921986579895, + "learning_rate": 0.0009853835439597772, + "loss": 3.5688, + "step": 1725 + }, + { + "epoch": 0.11754314444897404, + "grad_norm": 0.6621115803718567, + "learning_rate": 0.0009853410789509446, + "loss": 3.6315, + "step": 1730 + }, + { + "epoch": 0.11788286451963582, + "grad_norm": 0.9224280118942261, + "learning_rate": 0.0009852986139421117, + "loss": 3.6575, + "step": 1735 + }, + { + "epoch": 0.1182225845902976, + "grad_norm": 0.8204379677772522, + "learning_rate": 0.000985256148933279, + "loss": 3.8769, + "step": 1740 + }, + { + "epoch": 0.11856230466095936, + "grad_norm": 0.6953498125076294, + "learning_rate": 0.0009852136839244463, + "loss": 3.9617, + "step": 1745 + }, + { + "epoch": 0.11890202473162115, + "grad_norm": 0.5947441458702087, + "learning_rate": 0.0009851712189156135, + "loss": 3.8369, + "step": 1750 + }, + { + "epoch": 0.11924174480228292, + "grad_norm": 0.9794597625732422, + "learning_rate": 0.0009851287539067808, + "loss": 3.8331, + "step": 1755 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 0.6032178401947021, + "learning_rate": 0.0009850862888979481, + "loss": 3.9298, + "step": 1760 + }, + { + "epoch": 0.11992118494360647, + "grad_norm": 0.6911065578460693, + "learning_rate": 0.0009850438238891155, + "loss": 3.8566, + "step": 1765 + }, + { + "epoch": 0.12026090501426824, + "grad_norm": 0.6410852670669556, + "learning_rate": 0.0009850013588802826, + "loss": 3.8143, + "step": 1770 + }, + { + "epoch": 0.12060062508493002, + "grad_norm": 0.7894377112388611, + "learning_rate": 0.00098495889387145, + "loss": 3.7537, + "step": 1775 + }, + { + "epoch": 0.1209403451555918, + "grad_norm": 0.6442724466323853, + "learning_rate": 0.0009849164288626173, + "loss": 3.837, + "step": 1780 + }, + { + "epoch": 0.12128006522625356, + "grad_norm": 0.6375800371170044, + "learning_rate": 0.0009848739638537844, + "loss": 3.8852, + "step": 1785 + }, + { + "epoch": 0.12161978529691535, + "grad_norm": 0.7201732397079468, + "learning_rate": 0.000984831498844952, + "loss": 3.8186, + "step": 1790 + }, + { + "epoch": 0.12195950536757712, + "grad_norm": 0.6306689381599426, + "learning_rate": 0.000984789033836119, + "loss": 3.5935, + "step": 1795 + }, + { + "epoch": 0.12229922543823889, + "grad_norm": 0.625109076499939, + "learning_rate": 0.0009847465688272864, + "loss": 3.6407, + "step": 1800 + }, + { + "epoch": 0.12263894550890067, + "grad_norm": 0.6638340950012207, + "learning_rate": 0.0009847041038184537, + "loss": 3.7362, + "step": 1805 + }, + { + "epoch": 0.12297866557956244, + "grad_norm": 0.5395439863204956, + "learning_rate": 0.0009846616388096208, + "loss": 3.6822, + "step": 1810 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 0.6070172190666199, + "learning_rate": 0.0009846191738007882, + "loss": 3.7129, + "step": 1815 + }, + { + "epoch": 0.123658105720886, + "grad_norm": 1.5616525411605835, + "learning_rate": 0.0009845767087919555, + "loss": 3.7285, + "step": 1820 + }, + { + "epoch": 0.12399782579154776, + "grad_norm": 0.6825032234191895, + "learning_rate": 0.0009845342437831228, + "loss": 3.6654, + "step": 1825 + }, + { + "epoch": 0.12433754586220953, + "grad_norm": 0.5459248423576355, + "learning_rate": 0.00098449177877429, + "loss": 3.8087, + "step": 1830 + }, + { + "epoch": 0.12467726593287132, + "grad_norm": 0.643254816532135, + "learning_rate": 0.0009844493137654573, + "loss": 3.8045, + "step": 1835 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 1.1462301015853882, + "learning_rate": 0.0009844068487566246, + "loss": 3.8003, + "step": 1840 + }, + { + "epoch": 0.12535670607419486, + "grad_norm": 0.838395357131958, + "learning_rate": 0.0009843643837477917, + "loss": 3.6972, + "step": 1845 + }, + { + "epoch": 0.12569642614485663, + "grad_norm": 0.6674336791038513, + "learning_rate": 0.000984321918738959, + "loss": 3.8718, + "step": 1850 + }, + { + "epoch": 0.12603614621551842, + "grad_norm": 0.6532777547836304, + "learning_rate": 0.0009842794537301264, + "loss": 3.7581, + "step": 1855 + }, + { + "epoch": 0.1263758662861802, + "grad_norm": 0.6336531043052673, + "learning_rate": 0.0009842369887212937, + "loss": 3.7911, + "step": 1860 + }, + { + "epoch": 0.12671558635684196, + "grad_norm": 0.6481202840805054, + "learning_rate": 0.000984194523712461, + "loss": 3.5609, + "step": 1865 + }, + { + "epoch": 0.12705530642750373, + "grad_norm": 0.5881672501564026, + "learning_rate": 0.0009841520587036282, + "loss": 3.8634, + "step": 1870 + }, + { + "epoch": 0.1273950264981655, + "grad_norm": 0.5524594187736511, + "learning_rate": 0.0009841095936947955, + "loss": 3.6231, + "step": 1875 + }, + { + "epoch": 0.12773474656882727, + "grad_norm": 0.704310417175293, + "learning_rate": 0.0009840671286859629, + "loss": 3.7562, + "step": 1880 + }, + { + "epoch": 0.12807446663948907, + "grad_norm": 0.537385106086731, + "learning_rate": 0.00098402466367713, + "loss": 3.8532, + "step": 1885 + }, + { + "epoch": 0.12841418671015084, + "grad_norm": 0.6722908020019531, + "learning_rate": 0.0009839821986682973, + "loss": 3.3509, + "step": 1890 + }, + { + "epoch": 0.1287539067808126, + "grad_norm": 0.6943793892860413, + "learning_rate": 0.0009839397336594646, + "loss": 3.8353, + "step": 1895 + }, + { + "epoch": 0.12909362685147438, + "grad_norm": 0.7343939542770386, + "learning_rate": 0.000983897268650632, + "loss": 3.6777, + "step": 1900 + }, + { + "epoch": 0.12943334692213615, + "grad_norm": 0.5748901963233948, + "learning_rate": 0.000983854803641799, + "loss": 3.5355, + "step": 1905 + }, + { + "epoch": 0.12977306699279795, + "grad_norm": 0.6326262354850769, + "learning_rate": 0.0009838123386329664, + "loss": 3.8559, + "step": 1910 + }, + { + "epoch": 0.13011278706345972, + "grad_norm": 0.5672209858894348, + "learning_rate": 0.0009837698736241338, + "loss": 3.8939, + "step": 1915 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 0.5978373885154724, + "learning_rate": 0.0009837274086153009, + "loss": 3.4478, + "step": 1920 + }, + { + "epoch": 0.13079222720478326, + "grad_norm": 0.5917426943778992, + "learning_rate": 0.0009836849436064682, + "loss": 3.8038, + "step": 1925 + }, + { + "epoch": 0.13113194727544503, + "grad_norm": 0.7074366211891174, + "learning_rate": 0.0009836424785976355, + "loss": 3.7562, + "step": 1930 + }, + { + "epoch": 0.1314716673461068, + "grad_norm": 0.6954122185707092, + "learning_rate": 0.0009836000135888029, + "loss": 3.6813, + "step": 1935 + }, + { + "epoch": 0.1318113874167686, + "grad_norm": 0.5686829686164856, + "learning_rate": 0.0009835575485799702, + "loss": 3.3489, + "step": 1940 + }, + { + "epoch": 0.13215110748743036, + "grad_norm": 0.853552520275116, + "learning_rate": 0.0009835150835711373, + "loss": 3.6677, + "step": 1945 + }, + { + "epoch": 0.13249082755809213, + "grad_norm": 0.5388857126235962, + "learning_rate": 0.0009834726185623047, + "loss": 3.8752, + "step": 1950 + }, + { + "epoch": 0.1328305476287539, + "grad_norm": 0.5482903718948364, + "learning_rate": 0.000983430153553472, + "loss": 3.7405, + "step": 1955 + }, + { + "epoch": 0.13317026769941567, + "grad_norm": 1.1427967548370361, + "learning_rate": 0.0009833876885446391, + "loss": 3.7649, + "step": 1960 + }, + { + "epoch": 0.13350998777007744, + "grad_norm": 0.6187644600868225, + "learning_rate": 0.0009833452235358065, + "loss": 3.4491, + "step": 1965 + }, + { + "epoch": 0.13384970784073924, + "grad_norm": 0.7513899803161621, + "learning_rate": 0.0009833027585269738, + "loss": 3.6382, + "step": 1970 + }, + { + "epoch": 0.134189427911401, + "grad_norm": 0.8520986437797546, + "learning_rate": 0.0009832602935181411, + "loss": 3.959, + "step": 1975 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 0.637906014919281, + "learning_rate": 0.0009832178285093082, + "loss": 3.5479, + "step": 1980 + }, + { + "epoch": 0.13486886805272455, + "grad_norm": 0.5550394058227539, + "learning_rate": 0.0009831753635004756, + "loss": 3.7619, + "step": 1985 + }, + { + "epoch": 0.13520858812338632, + "grad_norm": 0.7950093746185303, + "learning_rate": 0.000983132898491643, + "loss": 3.6916, + "step": 1990 + }, + { + "epoch": 0.13554830819404812, + "grad_norm": 0.6187700629234314, + "learning_rate": 0.00098309043348281, + "loss": 3.7878, + "step": 1995 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 0.8941767811775208, + "learning_rate": 0.0009830479684739776, + "loss": 3.7091, + "step": 2000 + }, + { + "epoch": 0.13622774833537166, + "grad_norm": 0.7293330430984497, + "learning_rate": 0.0009830055034651447, + "loss": 3.6498, + "step": 2005 + }, + { + "epoch": 0.13656746840603343, + "grad_norm": 0.6029607653617859, + "learning_rate": 0.000982963038456312, + "loss": 3.9875, + "step": 2010 + }, + { + "epoch": 0.1369071884766952, + "grad_norm": 0.48066434264183044, + "learning_rate": 0.0009829205734474794, + "loss": 3.7426, + "step": 2015 + }, + { + "epoch": 0.13724690854735697, + "grad_norm": 1.3064939975738525, + "learning_rate": 0.0009828781084386465, + "loss": 3.9627, + "step": 2020 + }, + { + "epoch": 0.13758662861801876, + "grad_norm": 0.7273495197296143, + "learning_rate": 0.0009828356434298138, + "loss": 3.6601, + "step": 2025 + }, + { + "epoch": 0.13792634868868053, + "grad_norm": 0.7601163983345032, + "learning_rate": 0.0009827931784209811, + "loss": 3.7093, + "step": 2030 + }, + { + "epoch": 0.1382660687593423, + "grad_norm": 0.6197329163551331, + "learning_rate": 0.0009827507134121485, + "loss": 3.889, + "step": 2035 + }, + { + "epoch": 0.13860578883000407, + "grad_norm": 0.5385544896125793, + "learning_rate": 0.0009827082484033158, + "loss": 3.823, + "step": 2040 + }, + { + "epoch": 0.13894550890066584, + "grad_norm": 0.5584795475006104, + "learning_rate": 0.000982665783394483, + "loss": 3.9137, + "step": 2045 + }, + { + "epoch": 0.1392852289713276, + "grad_norm": 0.5970639586448669, + "learning_rate": 0.0009826233183856503, + "loss": 3.6903, + "step": 2050 + }, + { + "epoch": 0.1396249490419894, + "grad_norm": 0.6042846441268921, + "learning_rate": 0.0009825808533768176, + "loss": 3.7305, + "step": 2055 + }, + { + "epoch": 0.13996466911265118, + "grad_norm": 0.5669568181037903, + "learning_rate": 0.0009825383883679847, + "loss": 3.8464, + "step": 2060 + }, + { + "epoch": 0.14030438918331295, + "grad_norm": 0.7000798583030701, + "learning_rate": 0.000982495923359152, + "loss": 3.9742, + "step": 2065 + }, + { + "epoch": 0.14064410925397472, + "grad_norm": 0.6590344309806824, + "learning_rate": 0.0009824534583503194, + "loss": 3.6722, + "step": 2070 + }, + { + "epoch": 0.1409838293246365, + "grad_norm": 1.237439751625061, + "learning_rate": 0.0009824109933414867, + "loss": 3.6081, + "step": 2075 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 0.49432504177093506, + "learning_rate": 0.0009823685283326538, + "loss": 3.9053, + "step": 2080 + }, + { + "epoch": 0.14166326946596006, + "grad_norm": 0.7276826500892639, + "learning_rate": 0.0009823260633238212, + "loss": 3.9136, + "step": 2085 + }, + { + "epoch": 0.14200298953662183, + "grad_norm": 0.9527072310447693, + "learning_rate": 0.0009822835983149885, + "loss": 3.9414, + "step": 2090 + }, + { + "epoch": 0.1423427096072836, + "grad_norm": 0.6771573424339294, + "learning_rate": 0.0009822411333061556, + "loss": 4.2451, + "step": 2095 + }, + { + "epoch": 0.14268242967794537, + "grad_norm": 0.5536261796951294, + "learning_rate": 0.0009821986682973232, + "loss": 3.7934, + "step": 2100 + }, + { + "epoch": 0.14302214974860714, + "grad_norm": 2.4145219326019287, + "learning_rate": 0.0009821562032884903, + "loss": 4.0844, + "step": 2105 + }, + { + "epoch": 0.14336186981926893, + "grad_norm": 0.9332415461540222, + "learning_rate": 0.0009821137382796576, + "loss": 4.6756, + "step": 2110 + }, + { + "epoch": 0.1437015898899307, + "grad_norm": 0.6700988411903381, + "learning_rate": 0.000982071273270825, + "loss": 4.1015, + "step": 2115 + }, + { + "epoch": 0.14404130996059247, + "grad_norm": 0.5267028212547302, + "learning_rate": 0.000982028808261992, + "loss": 3.9953, + "step": 2120 + }, + { + "epoch": 0.14438103003125424, + "grad_norm": 0.6514846682548523, + "learning_rate": 0.0009819863432531594, + "loss": 3.774, + "step": 2125 + }, + { + "epoch": 0.144720750101916, + "grad_norm": 30.499948501586914, + "learning_rate": 0.0009819438782443268, + "loss": 3.7445, + "step": 2130 + }, + { + "epoch": 0.14506047017257778, + "grad_norm": 0.8160439729690552, + "learning_rate": 0.000981901413235494, + "loss": 3.7701, + "step": 2135 + }, + { + "epoch": 0.14540019024323958, + "grad_norm": 0.6277735829353333, + "learning_rate": 0.0009818589482266612, + "loss": 3.7415, + "step": 2140 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 0.5879346132278442, + "learning_rate": 0.0009818164832178285, + "loss": 3.843, + "step": 2145 + }, + { + "epoch": 0.14607963038456312, + "grad_norm": 0.7151115536689758, + "learning_rate": 0.0009817740182089959, + "loss": 3.6211, + "step": 2150 + }, + { + "epoch": 0.1464193504552249, + "grad_norm": 0.6400715112686157, + "learning_rate": 0.000981731553200163, + "loss": 4.1885, + "step": 2155 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 0.5887287855148315, + "learning_rate": 0.0009816890881913303, + "loss": 3.7932, + "step": 2160 + }, + { + "epoch": 0.14709879059654846, + "grad_norm": 0.8216533064842224, + "learning_rate": 0.0009816466231824977, + "loss": 3.711, + "step": 2165 + }, + { + "epoch": 0.14743851066721023, + "grad_norm": 0.7174777388572693, + "learning_rate": 0.000981604158173665, + "loss": 3.5003, + "step": 2170 + }, + { + "epoch": 0.147778230737872, + "grad_norm": 0.6716716289520264, + "learning_rate": 0.0009815616931648323, + "loss": 3.9491, + "step": 2175 + }, + { + "epoch": 0.14811795080853377, + "grad_norm": 0.6218331456184387, + "learning_rate": 0.0009815192281559994, + "loss": 3.6992, + "step": 2180 + }, + { + "epoch": 0.14845767087919554, + "grad_norm": 0.9224669933319092, + "learning_rate": 0.0009814767631471668, + "loss": 3.7756, + "step": 2185 + }, + { + "epoch": 0.1487973909498573, + "grad_norm": 0.669808030128479, + "learning_rate": 0.0009814342981383341, + "loss": 3.7179, + "step": 2190 + }, + { + "epoch": 0.1491371110205191, + "grad_norm": 0.646990180015564, + "learning_rate": 0.0009813918331295012, + "loss": 3.736, + "step": 2195 + }, + { + "epoch": 0.14947683109118087, + "grad_norm": 0.6665001511573792, + "learning_rate": 0.0009813493681206686, + "loss": 3.7479, + "step": 2200 + }, + { + "epoch": 0.14981655116184264, + "grad_norm": 0.592941403388977, + "learning_rate": 0.000981306903111836, + "loss": 3.7309, + "step": 2205 + }, + { + "epoch": 0.1501562712325044, + "grad_norm": 2.441667318344116, + "learning_rate": 0.0009812644381030032, + "loss": 3.8594, + "step": 2210 + }, + { + "epoch": 0.15049599130316618, + "grad_norm": 0.750754177570343, + "learning_rate": 0.0009812219730941703, + "loss": 3.7635, + "step": 2215 + }, + { + "epoch": 0.15083571137382795, + "grad_norm": 0.6356704235076904, + "learning_rate": 0.0009811795080853377, + "loss": 3.9031, + "step": 2220 + }, + { + "epoch": 0.15117543144448975, + "grad_norm": 0.6002806425094604, + "learning_rate": 0.000981137043076505, + "loss": 3.952, + "step": 2225 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.865532636642456, + "learning_rate": 0.0009810945780676721, + "loss": 3.4822, + "step": 2230 + }, + { + "epoch": 0.1518548715858133, + "grad_norm": 0.5731847286224365, + "learning_rate": 0.0009810521130588395, + "loss": 3.8658, + "step": 2235 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 1.0151249170303345, + "learning_rate": 0.0009810096480500068, + "loss": 3.8909, + "step": 2240 + }, + { + "epoch": 0.15253431172713683, + "grad_norm": 0.5204872488975525, + "learning_rate": 0.0009809671830411741, + "loss": 3.8905, + "step": 2245 + }, + { + "epoch": 0.15287403179779863, + "grad_norm": 0.7252000570297241, + "learning_rate": 0.0009809247180323415, + "loss": 3.6122, + "step": 2250 + }, + { + "epoch": 0.1532137518684604, + "grad_norm": 0.6796010136604309, + "learning_rate": 0.0009808822530235086, + "loss": 3.8242, + "step": 2255 + }, + { + "epoch": 0.15355347193912217, + "grad_norm": 0.8065784573554993, + "learning_rate": 0.000980839788014676, + "loss": 3.8441, + "step": 2260 + }, + { + "epoch": 0.15389319200978394, + "grad_norm": 0.6226104497909546, + "learning_rate": 0.0009807973230058433, + "loss": 3.7555, + "step": 2265 + }, + { + "epoch": 0.1542329120804457, + "grad_norm": 0.6221926808357239, + "learning_rate": 0.0009807548579970104, + "loss": 3.7892, + "step": 2270 + }, + { + "epoch": 0.15457263215110748, + "grad_norm": 0.9402880668640137, + "learning_rate": 0.0009807123929881777, + "loss": 3.7085, + "step": 2275 + }, + { + "epoch": 0.15491235222176927, + "grad_norm": 0.8135420680046082, + "learning_rate": 0.000980669927979345, + "loss": 3.6189, + "step": 2280 + }, + { + "epoch": 0.15525207229243104, + "grad_norm": 0.7283142805099487, + "learning_rate": 0.0009806274629705124, + "loss": 3.6072, + "step": 2285 + }, + { + "epoch": 0.1555917923630928, + "grad_norm": 0.6130872964859009, + "learning_rate": 0.0009805849979616795, + "loss": 3.6386, + "step": 2290 + }, + { + "epoch": 0.15593151243375458, + "grad_norm": 0.6504844427108765, + "learning_rate": 0.0009805425329528468, + "loss": 3.5851, + "step": 2295 + }, + { + "epoch": 0.15627123250441635, + "grad_norm": 2.4974217414855957, + "learning_rate": 0.0009805000679440142, + "loss": 3.8048, + "step": 2300 + }, + { + "epoch": 0.15661095257507815, + "grad_norm": 4.654658317565918, + "learning_rate": 0.0009804576029351813, + "loss": 3.8206, + "step": 2305 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 0.6979673504829407, + "learning_rate": 0.0009804151379263488, + "loss": 3.6012, + "step": 2310 + }, + { + "epoch": 0.1572903927164017, + "grad_norm": 0.6086756587028503, + "learning_rate": 0.000980372672917516, + "loss": 3.8313, + "step": 2315 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 0.666412353515625, + "learning_rate": 0.0009803302079086833, + "loss": 3.7812, + "step": 2320 + }, + { + "epoch": 0.15796983285772523, + "grad_norm": 0.7673835158348083, + "learning_rate": 0.0009802877428998506, + "loss": 3.8774, + "step": 2325 + }, + { + "epoch": 0.158309552928387, + "grad_norm": 0.7827776670455933, + "learning_rate": 0.0009802452778910177, + "loss": 3.6997, + "step": 2330 + }, + { + "epoch": 0.1586492729990488, + "grad_norm": 2.70999813079834, + "learning_rate": 0.000980202812882185, + "loss": 3.741, + "step": 2335 + }, + { + "epoch": 0.15898899306971057, + "grad_norm": 0.637454628944397, + "learning_rate": 0.0009801603478733524, + "loss": 3.7703, + "step": 2340 + }, + { + "epoch": 0.15932871314037234, + "grad_norm": 1.1813085079193115, + "learning_rate": 0.0009801178828645197, + "loss": 3.8005, + "step": 2345 + }, + { + "epoch": 0.1596684332110341, + "grad_norm": 0.8899592161178589, + "learning_rate": 0.0009800754178556869, + "loss": 3.9647, + "step": 2350 + }, + { + "epoch": 0.16000815328169588, + "grad_norm": 0.6419814229011536, + "learning_rate": 0.0009800329528468542, + "loss": 3.811, + "step": 2355 + }, + { + "epoch": 0.16034787335235764, + "grad_norm": 1.8682748079299927, + "learning_rate": 0.0009799904878380215, + "loss": 3.876, + "step": 2360 + }, + { + "epoch": 0.16068759342301944, + "grad_norm": 0.586973249912262, + "learning_rate": 0.0009799480228291889, + "loss": 3.5851, + "step": 2365 + }, + { + "epoch": 0.1610273134936812, + "grad_norm": 0.8647047281265259, + "learning_rate": 0.000979905557820356, + "loss": 3.7035, + "step": 2370 + }, + { + "epoch": 0.16136703356434298, + "grad_norm": 0.7981041073799133, + "learning_rate": 0.0009798630928115233, + "loss": 3.7578, + "step": 2375 + }, + { + "epoch": 0.16170675363500475, + "grad_norm": 0.9460886120796204, + "learning_rate": 0.0009798206278026906, + "loss": 3.6221, + "step": 2380 + }, + { + "epoch": 0.16204647370566652, + "grad_norm": 0.6668416857719421, + "learning_rate": 0.000979778162793858, + "loss": 3.7681, + "step": 2385 + }, + { + "epoch": 0.16238619377632832, + "grad_norm": 0.7283238768577576, + "learning_rate": 0.000979735697785025, + "loss": 3.8829, + "step": 2390 + }, + { + "epoch": 0.1627259138469901, + "grad_norm": 0.8417674899101257, + "learning_rate": 0.0009796932327761924, + "loss": 3.5327, + "step": 2395 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 0.6552211046218872, + "learning_rate": 0.0009796507677673598, + "loss": 3.7189, + "step": 2400 + }, + { + "epoch": 0.16340535398831363, + "grad_norm": 0.5512735247612, + "learning_rate": 0.0009796083027585269, + "loss": 3.5777, + "step": 2405 + }, + { + "epoch": 0.1637450740589754, + "grad_norm": 0.6253995895385742, + "learning_rate": 0.0009795658377496942, + "loss": 3.8806, + "step": 2410 + }, + { + "epoch": 0.16408479412963717, + "grad_norm": 0.6286073327064514, + "learning_rate": 0.0009795233727408616, + "loss": 3.9194, + "step": 2415 + }, + { + "epoch": 0.16442451420029897, + "grad_norm": 0.7704349160194397, + "learning_rate": 0.0009794809077320289, + "loss": 3.804, + "step": 2420 + }, + { + "epoch": 0.16476423427096074, + "grad_norm": 2.0239031314849854, + "learning_rate": 0.0009794384427231962, + "loss": 3.7398, + "step": 2425 + }, + { + "epoch": 0.1651039543416225, + "grad_norm": 0.7128452062606812, + "learning_rate": 0.0009793959777143633, + "loss": 3.7541, + "step": 2430 + }, + { + "epoch": 0.16544367441228428, + "grad_norm": 0.8855803608894348, + "learning_rate": 0.0009793535127055307, + "loss": 3.8577, + "step": 2435 + }, + { + "epoch": 0.16578339448294604, + "grad_norm": 0.6545919179916382, + "learning_rate": 0.000979311047696698, + "loss": 3.7919, + "step": 2440 + }, + { + "epoch": 0.16612311455360781, + "grad_norm": 1.0922008752822876, + "learning_rate": 0.0009792685826878651, + "loss": 3.8223, + "step": 2445 + }, + { + "epoch": 0.1664628346242696, + "grad_norm": 0.5943416357040405, + "learning_rate": 0.0009792261176790325, + "loss": 3.6905, + "step": 2450 + }, + { + "epoch": 0.16680255469493138, + "grad_norm": 0.908586323261261, + "learning_rate": 0.0009791836526701998, + "loss": 3.718, + "step": 2455 + }, + { + "epoch": 0.16714227476559315, + "grad_norm": 0.7396723031997681, + "learning_rate": 0.0009791411876613671, + "loss": 3.7864, + "step": 2460 + }, + { + "epoch": 0.16748199483625492, + "grad_norm": 0.6914514899253845, + "learning_rate": 0.0009790987226525342, + "loss": 3.6808, + "step": 2465 + }, + { + "epoch": 0.1678217149069167, + "grad_norm": 0.7731796503067017, + "learning_rate": 0.0009790562576437016, + "loss": 3.8299, + "step": 2470 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 0.8832833170890808, + "learning_rate": 0.000979013792634869, + "loss": 3.8623, + "step": 2475 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 0.6989298462867737, + "learning_rate": 0.000978971327626036, + "loss": 3.7545, + "step": 2480 + }, + { + "epoch": 0.16884087511890203, + "grad_norm": 0.7118530869483948, + "learning_rate": 0.0009789288626172036, + "loss": 3.8333, + "step": 2485 + }, + { + "epoch": 0.1691805951895638, + "grad_norm": 0.5099814534187317, + "learning_rate": 0.0009788863976083707, + "loss": 3.8318, + "step": 2490 + }, + { + "epoch": 0.16952031526022557, + "grad_norm": 0.9980850219726562, + "learning_rate": 0.000978843932599538, + "loss": 3.7259, + "step": 2495 + }, + { + "epoch": 0.16986003533088734, + "grad_norm": 0.5700471997261047, + "learning_rate": 0.0009788014675907054, + "loss": 3.878, + "step": 2500 + }, + { + "epoch": 0.17019975540154914, + "grad_norm": 0.6536974906921387, + "learning_rate": 0.0009787590025818725, + "loss": 3.9553, + "step": 2505 + }, + { + "epoch": 0.1705394754722109, + "grad_norm": 0.7003123760223389, + "learning_rate": 0.0009787165375730398, + "loss": 3.9675, + "step": 2510 + }, + { + "epoch": 0.17087919554287267, + "grad_norm": 0.5029799342155457, + "learning_rate": 0.0009786740725642072, + "loss": 3.7516, + "step": 2515 + }, + { + "epoch": 0.17121891561353444, + "grad_norm": 0.6222336888313293, + "learning_rate": 0.0009786316075553745, + "loss": 3.9027, + "step": 2520 + }, + { + "epoch": 0.17155863568419621, + "grad_norm": 1.6709191799163818, + "learning_rate": 0.0009785891425465416, + "loss": 3.8679, + "step": 2525 + }, + { + "epoch": 0.17189835575485798, + "grad_norm": 0.783116340637207, + "learning_rate": 0.000978546677537709, + "loss": 3.6995, + "step": 2530 + }, + { + "epoch": 0.17223807582551978, + "grad_norm": 0.751410961151123, + "learning_rate": 0.0009785042125288763, + "loss": 3.9775, + "step": 2535 + }, + { + "epoch": 0.17257779589618155, + "grad_norm": 0.6237109899520874, + "learning_rate": 0.0009784617475200434, + "loss": 3.621, + "step": 2540 + }, + { + "epoch": 0.17291751596684332, + "grad_norm": 0.6151782870292664, + "learning_rate": 0.0009784192825112107, + "loss": 3.7078, + "step": 2545 + }, + { + "epoch": 0.1732572360375051, + "grad_norm": 0.6417315006256104, + "learning_rate": 0.000978376817502378, + "loss": 3.8202, + "step": 2550 + }, + { + "epoch": 0.17359695610816686, + "grad_norm": 0.6000332832336426, + "learning_rate": 0.0009783343524935454, + "loss": 3.7126, + "step": 2555 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 0.9587574005126953, + "learning_rate": 0.0009782918874847127, + "loss": 3.5443, + "step": 2560 + }, + { + "epoch": 0.17427639624949043, + "grad_norm": 0.9385026693344116, + "learning_rate": 0.0009782494224758798, + "loss": 3.7294, + "step": 2565 + }, + { + "epoch": 0.1746161163201522, + "grad_norm": 0.7729959487915039, + "learning_rate": 0.0009782069574670472, + "loss": 3.88, + "step": 2570 + }, + { + "epoch": 0.17495583639081397, + "grad_norm": 1.886183738708496, + "learning_rate": 0.0009781644924582145, + "loss": 3.8859, + "step": 2575 + }, + { + "epoch": 0.17529555646147574, + "grad_norm": 0.7273914217948914, + "learning_rate": 0.0009781220274493816, + "loss": 3.9598, + "step": 2580 + }, + { + "epoch": 0.1756352765321375, + "grad_norm": 0.5853223204612732, + "learning_rate": 0.000978079562440549, + "loss": 3.6438, + "step": 2585 + }, + { + "epoch": 0.1759749966027993, + "grad_norm": 0.5564325451850891, + "learning_rate": 0.0009780370974317163, + "loss": 3.6338, + "step": 2590 + }, + { + "epoch": 0.17631471667346107, + "grad_norm": 0.8639393448829651, + "learning_rate": 0.0009779946324228836, + "loss": 3.7254, + "step": 2595 + }, + { + "epoch": 0.17665443674412284, + "grad_norm": 0.7249088883399963, + "learning_rate": 0.0009779521674140508, + "loss": 3.5701, + "step": 2600 + }, + { + "epoch": 0.17699415681478461, + "grad_norm": 0.7234058380126953, + "learning_rate": 0.000977909702405218, + "loss": 3.9876, + "step": 2605 + }, + { + "epoch": 0.17733387688544638, + "grad_norm": 0.6160930395126343, + "learning_rate": 0.0009778672373963854, + "loss": 3.7759, + "step": 2610 + }, + { + "epoch": 0.17767359695610815, + "grad_norm": 0.5568186044692993, + "learning_rate": 0.0009778247723875525, + "loss": 3.5049, + "step": 2615 + }, + { + "epoch": 0.17801331702676995, + "grad_norm": 0.7568265795707703, + "learning_rate": 0.00097778230737872, + "loss": 3.7495, + "step": 2620 + }, + { + "epoch": 0.17835303709743172, + "grad_norm": 0.6679832935333252, + "learning_rate": 0.0009777398423698872, + "loss": 3.6284, + "step": 2625 + }, + { + "epoch": 0.1786927571680935, + "grad_norm": 0.6787500381469727, + "learning_rate": 0.0009776973773610545, + "loss": 3.6479, + "step": 2630 + }, + { + "epoch": 0.17903247723875526, + "grad_norm": 0.5944201350212097, + "learning_rate": 0.0009776549123522219, + "loss": 3.7451, + "step": 2635 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 0.572880744934082, + "learning_rate": 0.000977612447343389, + "loss": 3.8498, + "step": 2640 + }, + { + "epoch": 0.17971191738007883, + "grad_norm": 0.5941516757011414, + "learning_rate": 0.0009775699823345563, + "loss": 3.7507, + "step": 2645 + }, + { + "epoch": 0.1800516374507406, + "grad_norm": 0.6639554500579834, + "learning_rate": 0.0009775275173257237, + "loss": 3.9043, + "step": 2650 + }, + { + "epoch": 0.18039135752140237, + "grad_norm": 0.7486041188240051, + "learning_rate": 0.000977485052316891, + "loss": 3.8246, + "step": 2655 + }, + { + "epoch": 0.18073107759206414, + "grad_norm": 0.6216392517089844, + "learning_rate": 0.0009774425873080581, + "loss": 3.6112, + "step": 2660 + }, + { + "epoch": 0.1810707976627259, + "grad_norm": 0.6744546294212341, + "learning_rate": 0.0009774001222992254, + "loss": 3.7201, + "step": 2665 + }, + { + "epoch": 0.18141051773338768, + "grad_norm": 0.6723204255104065, + "learning_rate": 0.0009773576572903928, + "loss": 3.7016, + "step": 2670 + }, + { + "epoch": 0.18175023780404947, + "grad_norm": 1.119281530380249, + "learning_rate": 0.00097731519228156, + "loss": 3.6478, + "step": 2675 + }, + { + "epoch": 0.18208995787471124, + "grad_norm": 0.7174021601676941, + "learning_rate": 0.0009772727272727272, + "loss": 3.8457, + "step": 2680 + }, + { + "epoch": 0.18242967794537301, + "grad_norm": 0.5414323210716248, + "learning_rate": 0.0009772302622638946, + "loss": 3.7964, + "step": 2685 + }, + { + "epoch": 0.18276939801603478, + "grad_norm": 0.6062555909156799, + "learning_rate": 0.000977187797255062, + "loss": 3.7346, + "step": 2690 + }, + { + "epoch": 0.18310911808669655, + "grad_norm": 0.5681310296058655, + "learning_rate": 0.0009771453322462292, + "loss": 3.329, + "step": 2695 + }, + { + "epoch": 0.18344883815735832, + "grad_norm": 1.183348298072815, + "learning_rate": 0.0009771028672373964, + "loss": 3.9112, + "step": 2700 + }, + { + "epoch": 0.18378855822802012, + "grad_norm": 0.6772845387458801, + "learning_rate": 0.0009770604022285637, + "loss": 3.5992, + "step": 2705 + }, + { + "epoch": 0.1841282782986819, + "grad_norm": 0.6639375686645508, + "learning_rate": 0.000977017937219731, + "loss": 3.6232, + "step": 2710 + }, + { + "epoch": 0.18446799836934366, + "grad_norm": 0.6344166994094849, + "learning_rate": 0.0009769754722108981, + "loss": 3.7826, + "step": 2715 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 0.7264713644981384, + "learning_rate": 0.0009769330072020655, + "loss": 3.773, + "step": 2720 + }, + { + "epoch": 0.1851474385106672, + "grad_norm": 0.6034553647041321, + "learning_rate": 0.0009768905421932328, + "loss": 3.8112, + "step": 2725 + }, + { + "epoch": 0.185487158581329, + "grad_norm": 0.642060399055481, + "learning_rate": 0.0009768480771844001, + "loss": 3.9516, + "step": 2730 + }, + { + "epoch": 0.18582687865199077, + "grad_norm": 0.6606827974319458, + "learning_rate": 0.0009768056121755675, + "loss": 3.8215, + "step": 2735 + }, + { + "epoch": 0.18616659872265254, + "grad_norm": 1.7994295358657837, + "learning_rate": 0.0009767631471667346, + "loss": 3.5296, + "step": 2740 + }, + { + "epoch": 0.1865063187933143, + "grad_norm": 0.49489641189575195, + "learning_rate": 0.000976720682157902, + "loss": 3.6227, + "step": 2745 + }, + { + "epoch": 0.18684603886397608, + "grad_norm": 0.7200000286102295, + "learning_rate": 0.0009766782171490693, + "loss": 3.7873, + "step": 2750 + }, + { + "epoch": 0.18718575893463785, + "grad_norm": 0.6690630912780762, + "learning_rate": 0.0009766357521402364, + "loss": 4.0226, + "step": 2755 + }, + { + "epoch": 0.18752547900529964, + "grad_norm": 0.5845706462860107, + "learning_rate": 0.0009765932871314037, + "loss": 3.7333, + "step": 2760 + }, + { + "epoch": 0.18786519907596141, + "grad_norm": 0.5513134002685547, + "learning_rate": 0.000976550822122571, + "loss": 3.8383, + "step": 2765 + }, + { + "epoch": 0.18820491914662318, + "grad_norm": 0.6170289516448975, + "learning_rate": 0.0009765083571137383, + "loss": 3.8592, + "step": 2770 + }, + { + "epoch": 0.18854463921728495, + "grad_norm": 0.6403161883354187, + "learning_rate": 0.0009764658921049055, + "loss": 3.9809, + "step": 2775 + }, + { + "epoch": 0.18888435928794672, + "grad_norm": 0.9001038670539856, + "learning_rate": 0.0009764234270960729, + "loss": 3.7662, + "step": 2780 + }, + { + "epoch": 0.1892240793586085, + "grad_norm": 0.5840660929679871, + "learning_rate": 0.0009763809620872402, + "loss": 3.4086, + "step": 2785 + }, + { + "epoch": 0.1895637994292703, + "grad_norm": 0.8719920516014099, + "learning_rate": 0.0009763384970784074, + "loss": 3.7871, + "step": 2790 + }, + { + "epoch": 0.18990351949993206, + "grad_norm": 0.7229958176612854, + "learning_rate": 0.0009762960320695747, + "loss": 3.5669, + "step": 2795 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 0.7714795470237732, + "learning_rate": 0.000976253567060742, + "loss": 3.7081, + "step": 2800 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 0.736339807510376, + "learning_rate": 0.0009762111020519092, + "loss": 3.755, + "step": 2805 + }, + { + "epoch": 0.19092267971191737, + "grad_norm": 1.3026764392852783, + "learning_rate": 0.0009761686370430765, + "loss": 3.2761, + "step": 2810 + }, + { + "epoch": 0.19126239978257917, + "grad_norm": 0.9306278824806213, + "learning_rate": 0.0009761261720342439, + "loss": 3.7627, + "step": 2815 + }, + { + "epoch": 0.19160211985324094, + "grad_norm": 0.6496819853782654, + "learning_rate": 0.0009760837070254111, + "loss": 3.8303, + "step": 2820 + }, + { + "epoch": 0.1919418399239027, + "grad_norm": 0.5241103172302246, + "learning_rate": 0.0009760412420165784, + "loss": 3.6654, + "step": 2825 + }, + { + "epoch": 0.19228155999456448, + "grad_norm": 1.0750632286071777, + "learning_rate": 0.0009759987770077456, + "loss": 3.8691, + "step": 2830 + }, + { + "epoch": 0.19262128006522625, + "grad_norm": 0.733339250087738, + "learning_rate": 0.0009759563119989129, + "loss": 3.7238, + "step": 2835 + }, + { + "epoch": 0.19296100013588802, + "grad_norm": 1.0571733713150024, + "learning_rate": 0.0009759138469900802, + "loss": 3.7147, + "step": 2840 + }, + { + "epoch": 0.19330072020654981, + "grad_norm": 1.0953446626663208, + "learning_rate": 0.0009758713819812474, + "loss": 3.8607, + "step": 2845 + }, + { + "epoch": 0.19364044027721158, + "grad_norm": 0.538586437702179, + "learning_rate": 0.0009758289169724148, + "loss": 3.7512, + "step": 2850 + }, + { + "epoch": 0.19398016034787335, + "grad_norm": 0.9248883128166199, + "learning_rate": 0.0009757864519635821, + "loss": 3.8625, + "step": 2855 + }, + { + "epoch": 0.19431988041853512, + "grad_norm": 0.648949384689331, + "learning_rate": 0.0009757439869547493, + "loss": 3.7193, + "step": 2860 + }, + { + "epoch": 0.1946596004891969, + "grad_norm": 0.9958585500717163, + "learning_rate": 0.0009757015219459165, + "loss": 3.7625, + "step": 2865 + }, + { + "epoch": 0.19499932055985866, + "grad_norm": 0.8019771575927734, + "learning_rate": 0.0009756590569370839, + "loss": 3.5257, + "step": 2870 + }, + { + "epoch": 0.19533904063052046, + "grad_norm": 0.8672682046890259, + "learning_rate": 0.0009756165919282511, + "loss": 3.673, + "step": 2875 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 0.6151662468910217, + "learning_rate": 0.0009755741269194183, + "loss": 3.8772, + "step": 2880 + }, + { + "epoch": 0.196018480771844, + "grad_norm": 0.6601491570472717, + "learning_rate": 0.0009755316619105858, + "loss": 3.6881, + "step": 2885 + }, + { + "epoch": 0.19635820084250577, + "grad_norm": 0.7479104399681091, + "learning_rate": 0.000975489196901753, + "loss": 3.406, + "step": 2890 + }, + { + "epoch": 0.19669792091316754, + "grad_norm": 0.6604925394058228, + "learning_rate": 0.0009754467318929202, + "loss": 3.5316, + "step": 2895 + }, + { + "epoch": 0.19703764098382934, + "grad_norm": 0.7910983562469482, + "learning_rate": 0.0009754042668840876, + "loss": 3.8452, + "step": 2900 + }, + { + "epoch": 0.1973773610544911, + "grad_norm": 0.6984866261482239, + "learning_rate": 0.0009753618018752548, + "loss": 3.5478, + "step": 2905 + }, + { + "epoch": 0.19771708112515288, + "grad_norm": 0.9805856347084045, + "learning_rate": 0.000975319336866422, + "loss": 3.5406, + "step": 2910 + }, + { + "epoch": 0.19805680119581465, + "grad_norm": 0.7291609048843384, + "learning_rate": 0.0009752768718575895, + "loss": 3.7913, + "step": 2915 + }, + { + "epoch": 0.19839652126647642, + "grad_norm": 0.7049471735954285, + "learning_rate": 0.0009752344068487567, + "loss": 3.7061, + "step": 2920 + }, + { + "epoch": 0.1987362413371382, + "grad_norm": 0.6286263465881348, + "learning_rate": 0.0009751919418399239, + "loss": 3.6851, + "step": 2925 + }, + { + "epoch": 0.19907596140779998, + "grad_norm": 0.7198681831359863, + "learning_rate": 0.0009751494768310912, + "loss": 3.5295, + "step": 2930 + }, + { + "epoch": 0.19941568147846175, + "grad_norm": 0.6289663314819336, + "learning_rate": 0.0009751070118222585, + "loss": 3.845, + "step": 2935 + }, + { + "epoch": 0.19975540154912352, + "grad_norm": 0.8613581657409668, + "learning_rate": 0.0009750645468134257, + "loss": 3.5675, + "step": 2940 + }, + { + "epoch": 0.2000951216197853, + "grad_norm": 0.7133091688156128, + "learning_rate": 0.000975022081804593, + "loss": 3.7637, + "step": 2945 + }, + { + "epoch": 0.20043484169044706, + "grad_norm": 0.7702757716178894, + "learning_rate": 0.0009749796167957604, + "loss": 3.7904, + "step": 2950 + }, + { + "epoch": 0.20077456176110883, + "grad_norm": 0.5753114819526672, + "learning_rate": 0.0009749371517869276, + "loss": 4.0398, + "step": 2955 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 0.6712437272071838, + "learning_rate": 0.0009748946867780949, + "loss": 3.8843, + "step": 2960 + }, + { + "epoch": 0.2014540019024324, + "grad_norm": 0.5073221921920776, + "learning_rate": 0.0009748522217692621, + "loss": 3.6344, + "step": 2965 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 0.6728055477142334, + "learning_rate": 0.0009748097567604294, + "loss": 3.7253, + "step": 2970 + }, + { + "epoch": 0.20213344204375594, + "grad_norm": 0.6344888210296631, + "learning_rate": 0.0009747672917515967, + "loss": 3.6102, + "step": 2975 + }, + { + "epoch": 0.2024731621144177, + "grad_norm": 0.6257009506225586, + "learning_rate": 0.0009747248267427639, + "loss": 3.7066, + "step": 2980 + }, + { + "epoch": 0.2028128821850795, + "grad_norm": 1.2531574964523315, + "learning_rate": 0.0009746823617339313, + "loss": 3.6228, + "step": 2985 + }, + { + "epoch": 0.20315260225574128, + "grad_norm": 0.6333403587341309, + "learning_rate": 0.0009746398967250986, + "loss": 3.7618, + "step": 2990 + }, + { + "epoch": 0.20349232232640305, + "grad_norm": 1.046090006828308, + "learning_rate": 0.0009745974317162658, + "loss": 3.7381, + "step": 2995 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 0.7316690683364868, + "learning_rate": 0.000974554966707433, + "loss": 3.6493, + "step": 3000 + }, + { + "epoch": 0.2041717624677266, + "grad_norm": 0.9069976806640625, + "learning_rate": 0.0009745125016986004, + "loss": 3.7163, + "step": 3005 + }, + { + "epoch": 0.20451148253838836, + "grad_norm": 0.6396864652633667, + "learning_rate": 0.0009744700366897676, + "loss": 3.8534, + "step": 3010 + }, + { + "epoch": 0.20485120260905015, + "grad_norm": 0.6160149574279785, + "learning_rate": 0.0009744275716809348, + "loss": 3.8722, + "step": 3015 + }, + { + "epoch": 0.20519092267971192, + "grad_norm": 0.9537104964256287, + "learning_rate": 0.0009743851066721023, + "loss": 3.8577, + "step": 3020 + }, + { + "epoch": 0.2055306427503737, + "grad_norm": 1.0233756303787231, + "learning_rate": 0.0009743426416632695, + "loss": 3.782, + "step": 3025 + }, + { + "epoch": 0.20587036282103546, + "grad_norm": 0.7525044679641724, + "learning_rate": 0.0009743001766544367, + "loss": 3.7472, + "step": 3030 + }, + { + "epoch": 0.20621008289169723, + "grad_norm": 0.576762318611145, + "learning_rate": 0.0009742577116456041, + "loss": 3.7315, + "step": 3035 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 0.688569188117981, + "learning_rate": 0.0009742152466367713, + "loss": 3.713, + "step": 3040 + }, + { + "epoch": 0.2068895230330208, + "grad_norm": 0.8552679419517517, + "learning_rate": 0.0009741727816279386, + "loss": 3.6278, + "step": 3045 + }, + { + "epoch": 0.20722924310368257, + "grad_norm": 0.5804129838943481, + "learning_rate": 0.0009741303166191059, + "loss": 3.6661, + "step": 3050 + }, + { + "epoch": 0.20756896317434434, + "grad_norm": 1.2718019485473633, + "learning_rate": 0.0009740878516102732, + "loss": 3.5583, + "step": 3055 + }, + { + "epoch": 0.2079086832450061, + "grad_norm": 0.7954198122024536, + "learning_rate": 0.0009740453866014405, + "loss": 3.8112, + "step": 3060 + }, + { + "epoch": 0.20824840331566788, + "grad_norm": 0.5551571249961853, + "learning_rate": 0.0009740029215926077, + "loss": 3.5717, + "step": 3065 + }, + { + "epoch": 0.20858812338632968, + "grad_norm": 0.7452906966209412, + "learning_rate": 0.000973960456583775, + "loss": 3.8748, + "step": 3070 + }, + { + "epoch": 0.20892784345699145, + "grad_norm": 0.5398733019828796, + "learning_rate": 0.0009739179915749423, + "loss": 3.6718, + "step": 3075 + }, + { + "epoch": 0.20926756352765322, + "grad_norm": 0.656112790107727, + "learning_rate": 0.0009738755265661095, + "loss": 3.679, + "step": 3080 + }, + { + "epoch": 0.209607283598315, + "grad_norm": 0.7886524200439453, + "learning_rate": 0.0009738330615572768, + "loss": 3.9167, + "step": 3085 + }, + { + "epoch": 0.20994700366897676, + "grad_norm": 0.8396526575088501, + "learning_rate": 0.0009737905965484442, + "loss": 3.6937, + "step": 3090 + }, + { + "epoch": 0.21028672373963853, + "grad_norm": 0.8416862487792969, + "learning_rate": 0.0009737481315396114, + "loss": 3.8143, + "step": 3095 + }, + { + "epoch": 0.21062644381030032, + "grad_norm": 0.6741566061973572, + "learning_rate": 0.0009737056665307787, + "loss": 3.9712, + "step": 3100 + }, + { + "epoch": 0.2109661638809621, + "grad_norm": 0.7233244180679321, + "learning_rate": 0.000973663201521946, + "loss": 3.7028, + "step": 3105 + }, + { + "epoch": 0.21130588395162386, + "grad_norm": 0.7363235354423523, + "learning_rate": 0.0009736207365131132, + "loss": 3.8979, + "step": 3110 + }, + { + "epoch": 0.21164560402228563, + "grad_norm": 0.6042994260787964, + "learning_rate": 0.0009735782715042804, + "loss": 3.7786, + "step": 3115 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 1.219826340675354, + "learning_rate": 0.0009735358064954478, + "loss": 3.7775, + "step": 3120 + }, + { + "epoch": 0.21232504416360917, + "grad_norm": 0.6792481541633606, + "learning_rate": 0.0009734933414866151, + "loss": 3.4481, + "step": 3125 + }, + { + "epoch": 0.21266476423427097, + "grad_norm": 0.650063693523407, + "learning_rate": 0.0009734508764777823, + "loss": 3.9247, + "step": 3130 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 0.8546820282936096, + "learning_rate": 0.0009734084114689497, + "loss": 3.7367, + "step": 3135 + }, + { + "epoch": 0.2133442043755945, + "grad_norm": 0.5956554412841797, + "learning_rate": 0.0009733659464601169, + "loss": 3.644, + "step": 3140 + }, + { + "epoch": 0.21368392444625628, + "grad_norm": 0.6377459764480591, + "learning_rate": 0.0009733234814512841, + "loss": 3.8372, + "step": 3145 + }, + { + "epoch": 0.21402364451691805, + "grad_norm": 0.6103334426879883, + "learning_rate": 0.0009732810164424515, + "loss": 3.4048, + "step": 3150 + }, + { + "epoch": 0.21436336458757985, + "grad_norm": 0.7477742433547974, + "learning_rate": 0.0009732385514336187, + "loss": 3.7598, + "step": 3155 + }, + { + "epoch": 0.21470308465824162, + "grad_norm": 0.6242577433586121, + "learning_rate": 0.000973196086424786, + "loss": 3.7061, + "step": 3160 + }, + { + "epoch": 0.2150428047289034, + "grad_norm": 1.2185317277908325, + "learning_rate": 0.0009731536214159533, + "loss": 3.769, + "step": 3165 + }, + { + "epoch": 0.21538252479956516, + "grad_norm": 0.7926366925239563, + "learning_rate": 0.0009731111564071206, + "loss": 3.4976, + "step": 3170 + }, + { + "epoch": 0.21572224487022693, + "grad_norm": 0.7110512852668762, + "learning_rate": 0.0009730686913982878, + "loss": 3.7507, + "step": 3175 + }, + { + "epoch": 0.2160619649408887, + "grad_norm": 0.8057163953781128, + "learning_rate": 0.0009730262263894551, + "loss": 3.787, + "step": 3180 + }, + { + "epoch": 0.2164016850115505, + "grad_norm": 0.720172107219696, + "learning_rate": 0.0009729837613806224, + "loss": 3.8023, + "step": 3185 + }, + { + "epoch": 0.21674140508221226, + "grad_norm": 0.8451990485191345, + "learning_rate": 0.0009729412963717896, + "loss": 3.9838, + "step": 3190 + }, + { + "epoch": 0.21708112515287403, + "grad_norm": 0.5866568684577942, + "learning_rate": 0.000972898831362957, + "loss": 3.7214, + "step": 3195 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 0.6415042877197266, + "learning_rate": 0.0009728563663541243, + "loss": 3.8833, + "step": 3200 + }, + { + "epoch": 0.21776056529419757, + "grad_norm": 0.7644772529602051, + "learning_rate": 0.0009728139013452915, + "loss": 4.0019, + "step": 3205 + }, + { + "epoch": 0.21810028536485934, + "grad_norm": 0.6343825459480286, + "learning_rate": 0.0009727714363364588, + "loss": 3.9486, + "step": 3210 + }, + { + "epoch": 0.21844000543552114, + "grad_norm": 0.6342509388923645, + "learning_rate": 0.000972728971327626, + "loss": 3.784, + "step": 3215 + }, + { + "epoch": 0.2187797255061829, + "grad_norm": 0.6624273061752319, + "learning_rate": 0.0009726865063187933, + "loss": 3.6421, + "step": 3220 + }, + { + "epoch": 0.21911944557684468, + "grad_norm": 0.6122679710388184, + "learning_rate": 0.0009726440413099606, + "loss": 3.614, + "step": 3225 + }, + { + "epoch": 0.21945916564750645, + "grad_norm": 0.5960533618927002, + "learning_rate": 0.0009726015763011279, + "loss": 3.7521, + "step": 3230 + }, + { + "epoch": 0.21979888571816822, + "grad_norm": 0.6187360286712646, + "learning_rate": 0.0009725591112922952, + "loss": 3.6919, + "step": 3235 + }, + { + "epoch": 0.22013860578883002, + "grad_norm": 0.6804354190826416, + "learning_rate": 0.0009725166462834625, + "loss": 3.695, + "step": 3240 + }, + { + "epoch": 0.22047832585949179, + "grad_norm": 0.5613947510719299, + "learning_rate": 0.0009724741812746297, + "loss": 3.6816, + "step": 3245 + }, + { + "epoch": 0.22081804593015356, + "grad_norm": 0.6357201337814331, + "learning_rate": 0.000972431716265797, + "loss": 3.7663, + "step": 3250 + }, + { + "epoch": 0.22115776600081533, + "grad_norm": 0.5632497072219849, + "learning_rate": 0.0009723892512569643, + "loss": 3.6126, + "step": 3255 + }, + { + "epoch": 0.2214974860714771, + "grad_norm": 0.6504238247871399, + "learning_rate": 0.0009723467862481315, + "loss": 3.7498, + "step": 3260 + }, + { + "epoch": 0.22183720614213887, + "grad_norm": 0.5361862778663635, + "learning_rate": 0.0009723043212392988, + "loss": 3.7981, + "step": 3265 + }, + { + "epoch": 0.22217692621280066, + "grad_norm": 0.7083408236503601, + "learning_rate": 0.0009722618562304662, + "loss": 3.854, + "step": 3270 + }, + { + "epoch": 0.22251664628346243, + "grad_norm": 0.6311686038970947, + "learning_rate": 0.0009722193912216334, + "loss": 3.7887, + "step": 3275 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 0.575412392616272, + "learning_rate": 0.0009721769262128006, + "loss": 3.6411, + "step": 3280 + }, + { + "epoch": 0.22319608642478597, + "grad_norm": 0.6615845561027527, + "learning_rate": 0.000972134461203968, + "loss": 3.9877, + "step": 3285 + }, + { + "epoch": 0.22353580649544774, + "grad_norm": 0.9929487109184265, + "learning_rate": 0.0009720919961951352, + "loss": 3.6892, + "step": 3290 + }, + { + "epoch": 0.22387552656610954, + "grad_norm": 0.7653114199638367, + "learning_rate": 0.0009720495311863024, + "loss": 3.3887, + "step": 3295 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 0.6863542199134827, + "learning_rate": 0.0009720070661774699, + "loss": 3.6325, + "step": 3300 + }, + { + "epoch": 0.22455496670743308, + "grad_norm": 1.1504642963409424, + "learning_rate": 0.0009719646011686371, + "loss": 3.8448, + "step": 3305 + }, + { + "epoch": 0.22489468677809485, + "grad_norm": 0.6335277557373047, + "learning_rate": 0.0009719221361598043, + "loss": 3.4823, + "step": 3310 + }, + { + "epoch": 0.22523440684875662, + "grad_norm": 0.7847116589546204, + "learning_rate": 0.0009718796711509716, + "loss": 3.5653, + "step": 3315 + }, + { + "epoch": 0.2255741269194184, + "grad_norm": 0.7022686004638672, + "learning_rate": 0.0009718372061421389, + "loss": 3.6047, + "step": 3320 + }, + { + "epoch": 0.22591384699008019, + "grad_norm": 0.7278538346290588, + "learning_rate": 0.0009717947411333061, + "loss": 3.715, + "step": 3325 + }, + { + "epoch": 0.22625356706074196, + "grad_norm": 0.6122897267341614, + "learning_rate": 0.0009717522761244734, + "loss": 3.9017, + "step": 3330 + }, + { + "epoch": 0.22659328713140373, + "grad_norm": 0.5491223931312561, + "learning_rate": 0.0009717098111156408, + "loss": 3.6035, + "step": 3335 + }, + { + "epoch": 0.2269330072020655, + "grad_norm": 1.0447373390197754, + "learning_rate": 0.000971667346106808, + "loss": 3.4153, + "step": 3340 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.6978347301483154, + "learning_rate": 0.0009716248810979753, + "loss": 3.7115, + "step": 3345 + }, + { + "epoch": 0.22761244734338903, + "grad_norm": 0.6773029565811157, + "learning_rate": 0.0009715824160891425, + "loss": 3.8145, + "step": 3350 + }, + { + "epoch": 0.22795216741405083, + "grad_norm": 0.6024543046951294, + "learning_rate": 0.0009715399510803098, + "loss": 3.6157, + "step": 3355 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 0.5864116549491882, + "learning_rate": 0.0009714974860714771, + "loss": 3.7057, + "step": 3360 + }, + { + "epoch": 0.22863160755537437, + "grad_norm": 0.6375217437744141, + "learning_rate": 0.0009714550210626443, + "loss": 3.8491, + "step": 3365 + }, + { + "epoch": 0.22897132762603614, + "grad_norm": 1.0712157487869263, + "learning_rate": 0.0009714125560538117, + "loss": 3.7685, + "step": 3370 + }, + { + "epoch": 0.2293110476966979, + "grad_norm": 0.5962576866149902, + "learning_rate": 0.000971370091044979, + "loss": 3.7387, + "step": 3375 + }, + { + "epoch": 0.2296507677673597, + "grad_norm": 0.7578774094581604, + "learning_rate": 0.0009713276260361462, + "loss": 3.6402, + "step": 3380 + }, + { + "epoch": 0.22999048783802148, + "grad_norm": 0.6613447070121765, + "learning_rate": 0.0009712851610273136, + "loss": 3.7874, + "step": 3385 + }, + { + "epoch": 0.23033020790868325, + "grad_norm": 0.574714720249176, + "learning_rate": 0.0009712426960184808, + "loss": 3.6511, + "step": 3390 + }, + { + "epoch": 0.23066992797934502, + "grad_norm": 0.6187616586685181, + "learning_rate": 0.000971200231009648, + "loss": 3.8553, + "step": 3395 + }, + { + "epoch": 0.2310096480500068, + "grad_norm": 0.7214511036872864, + "learning_rate": 0.0009711577660008153, + "loss": 3.5751, + "step": 3400 + }, + { + "epoch": 0.23134936812066856, + "grad_norm": 0.7186050415039062, + "learning_rate": 0.0009711153009919827, + "loss": 3.8441, + "step": 3405 + }, + { + "epoch": 0.23168908819133036, + "grad_norm": 0.6643950939178467, + "learning_rate": 0.0009710728359831499, + "loss": 3.6803, + "step": 3410 + }, + { + "epoch": 0.23202880826199213, + "grad_norm": 0.749325156211853, + "learning_rate": 0.0009710303709743172, + "loss": 3.7794, + "step": 3415 + }, + { + "epoch": 0.2323685283326539, + "grad_norm": 0.5802797675132751, + "learning_rate": 0.0009709879059654845, + "loss": 3.8622, + "step": 3420 + }, + { + "epoch": 0.23270824840331567, + "grad_norm": 0.7271153330802917, + "learning_rate": 0.0009709454409566517, + "loss": 3.8806, + "step": 3425 + }, + { + "epoch": 0.23304796847397743, + "grad_norm": 0.7022523283958435, + "learning_rate": 0.000970902975947819, + "loss": 3.8101, + "step": 3430 + }, + { + "epoch": 0.2333876885446392, + "grad_norm": 0.5061925649642944, + "learning_rate": 0.0009708605109389863, + "loss": 3.8341, + "step": 3435 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 0.7293051481246948, + "learning_rate": 0.0009708180459301536, + "loss": 3.53, + "step": 3440 + }, + { + "epoch": 0.23406712868596277, + "grad_norm": 1.1794521808624268, + "learning_rate": 0.0009707755809213209, + "loss": 3.6657, + "step": 3445 + }, + { + "epoch": 0.23440684875662454, + "grad_norm": 0.7628663182258606, + "learning_rate": 0.0009707331159124881, + "loss": 3.7237, + "step": 3450 + }, + { + "epoch": 0.2347465688272863, + "grad_norm": 0.693013608455658, + "learning_rate": 0.0009706906509036554, + "loss": 3.7203, + "step": 3455 + }, + { + "epoch": 0.23508628889794808, + "grad_norm": 0.7980107665061951, + "learning_rate": 0.0009706481858948227, + "loss": 3.6071, + "step": 3460 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 0.5791229009628296, + "learning_rate": 0.0009706057208859899, + "loss": 3.8135, + "step": 3465 + }, + { + "epoch": 0.23576572903927165, + "grad_norm": 0.7232710123062134, + "learning_rate": 0.0009705632558771572, + "loss": 3.6093, + "step": 3470 + }, + { + "epoch": 0.23610544910993342, + "grad_norm": 0.7714651226997375, + "learning_rate": 0.0009705207908683246, + "loss": 3.6093, + "step": 3475 + }, + { + "epoch": 0.2364451691805952, + "grad_norm": 0.8571071624755859, + "learning_rate": 0.0009704783258594918, + "loss": 3.8385, + "step": 3480 + }, + { + "epoch": 0.23678488925125696, + "grad_norm": 0.6522979736328125, + "learning_rate": 0.0009704358608506591, + "loss": 3.7941, + "step": 3485 + }, + { + "epoch": 0.23712460932191873, + "grad_norm": 0.6264018416404724, + "learning_rate": 0.0009703933958418264, + "loss": 3.475, + "step": 3490 + }, + { + "epoch": 0.23746432939258053, + "grad_norm": 0.826338529586792, + "learning_rate": 0.0009703509308329936, + "loss": 3.6864, + "step": 3495 + }, + { + "epoch": 0.2378040494632423, + "grad_norm": 0.6870751976966858, + "learning_rate": 0.0009703084658241608, + "loss": 3.9105, + "step": 3500 + }, + { + "epoch": 0.23814376953390406, + "grad_norm": 0.738770067691803, + "learning_rate": 0.0009702660008153283, + "loss": 3.8655, + "step": 3505 + }, + { + "epoch": 0.23848348960456583, + "grad_norm": 0.7685903906822205, + "learning_rate": 0.0009702235358064955, + "loss": 3.6979, + "step": 3510 + }, + { + "epoch": 0.2388232096752276, + "grad_norm": 0.6998326778411865, + "learning_rate": 0.0009701810707976627, + "loss": 3.7202, + "step": 3515 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 0.6233009099960327, + "learning_rate": 0.0009701386057888301, + "loss": 3.5748, + "step": 3520 + }, + { + "epoch": 0.23950264981655117, + "grad_norm": 0.6062663793563843, + "learning_rate": 0.0009700961407799973, + "loss": 3.8507, + "step": 3525 + }, + { + "epoch": 0.23984236988721294, + "grad_norm": 0.6667577624320984, + "learning_rate": 0.0009700536757711645, + "loss": 3.7396, + "step": 3530 + }, + { + "epoch": 0.2401820899578747, + "grad_norm": 0.6072933673858643, + "learning_rate": 0.0009700112107623319, + "loss": 3.3934, + "step": 3535 + }, + { + "epoch": 0.24052181002853648, + "grad_norm": 0.6347804665565491, + "learning_rate": 0.0009699687457534992, + "loss": 3.6881, + "step": 3540 + }, + { + "epoch": 0.24086153009919825, + "grad_norm": 0.8191355466842651, + "learning_rate": 0.0009699262807446664, + "loss": 3.8147, + "step": 3545 + }, + { + "epoch": 0.24120125016986005, + "grad_norm": 0.7282726764678955, + "learning_rate": 0.0009698838157358338, + "loss": 3.9134, + "step": 3550 + }, + { + "epoch": 0.24154097024052182, + "grad_norm": 0.5578713417053223, + "learning_rate": 0.000969841350727001, + "loss": 3.8756, + "step": 3555 + }, + { + "epoch": 0.2418806903111836, + "grad_norm": 0.7785852551460266, + "learning_rate": 0.0009697988857181682, + "loss": 3.5146, + "step": 3560 + }, + { + "epoch": 0.24222041038184536, + "grad_norm": 0.6270478963851929, + "learning_rate": 0.0009697564207093355, + "loss": 3.6885, + "step": 3565 + }, + { + "epoch": 0.24256013045250713, + "grad_norm": 0.9992136359214783, + "learning_rate": 0.0009697139557005028, + "loss": 3.5533, + "step": 3570 + }, + { + "epoch": 0.2428998505231689, + "grad_norm": 0.7223435640335083, + "learning_rate": 0.0009696714906916701, + "loss": 3.8281, + "step": 3575 + }, + { + "epoch": 0.2432395705938307, + "grad_norm": 0.6559579372406006, + "learning_rate": 0.0009696290256828374, + "loss": 3.9161, + "step": 3580 + }, + { + "epoch": 0.24357929066449246, + "grad_norm": 0.571948230266571, + "learning_rate": 0.0009695865606740047, + "loss": 3.8552, + "step": 3585 + }, + { + "epoch": 0.24391901073515423, + "grad_norm": 0.7258049249649048, + "learning_rate": 0.0009695440956651719, + "loss": 3.5909, + "step": 3590 + }, + { + "epoch": 0.244258730805816, + "grad_norm": 0.659995973110199, + "learning_rate": 0.0009695016306563392, + "loss": 3.7047, + "step": 3595 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 0.6485466361045837, + "learning_rate": 0.0009694591656475064, + "loss": 3.7048, + "step": 3600 + }, + { + "epoch": 0.24493817094713954, + "grad_norm": 0.7553309798240662, + "learning_rate": 0.0009694167006386737, + "loss": 3.6525, + "step": 3605 + }, + { + "epoch": 0.24527789101780134, + "grad_norm": 0.8247597217559814, + "learning_rate": 0.0009693742356298411, + "loss": 3.7916, + "step": 3610 + }, + { + "epoch": 0.2456176110884631, + "grad_norm": 0.6353480219841003, + "learning_rate": 0.0009693317706210083, + "loss": 3.8631, + "step": 3615 + }, + { + "epoch": 0.24595733115912488, + "grad_norm": 0.6419610977172852, + "learning_rate": 0.0009692893056121756, + "loss": 3.563, + "step": 3620 + }, + { + "epoch": 0.24629705122978665, + "grad_norm": 0.8985269665718079, + "learning_rate": 0.0009692468406033429, + "loss": 3.7217, + "step": 3625 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 0.5882688760757446, + "learning_rate": 0.0009692043755945101, + "loss": 3.6108, + "step": 3630 + }, + { + "epoch": 0.24697649137111022, + "grad_norm": 0.6818410158157349, + "learning_rate": 0.0009691619105856773, + "loss": 3.7915, + "step": 3635 + }, + { + "epoch": 0.247316211441772, + "grad_norm": 0.7848137617111206, + "learning_rate": 0.0009691194455768447, + "loss": 3.854, + "step": 3640 + }, + { + "epoch": 0.24765593151243376, + "grad_norm": 0.7315320372581482, + "learning_rate": 0.000969076980568012, + "loss": 3.9308, + "step": 3645 + }, + { + "epoch": 0.24799565158309553, + "grad_norm": 1.1904094219207764, + "learning_rate": 0.0009690345155591792, + "loss": 3.6744, + "step": 3650 + }, + { + "epoch": 0.2483353716537573, + "grad_norm": 0.7596409320831299, + "learning_rate": 0.0009689920505503466, + "loss": 3.499, + "step": 3655 + }, + { + "epoch": 0.24867509172441907, + "grad_norm": 0.9544851183891296, + "learning_rate": 0.0009689495855415138, + "loss": 3.6202, + "step": 3660 + }, + { + "epoch": 0.24901481179508086, + "grad_norm": 0.6804072856903076, + "learning_rate": 0.000968907120532681, + "loss": 3.6708, + "step": 3665 + }, + { + "epoch": 0.24935453186574263, + "grad_norm": 0.7178321480751038, + "learning_rate": 0.0009688646555238484, + "loss": 3.7758, + "step": 3670 + }, + { + "epoch": 0.2496942519364044, + "grad_norm": 0.8629001975059509, + "learning_rate": 0.0009688221905150156, + "loss": 3.8303, + "step": 3675 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 0.6092126369476318, + "learning_rate": 0.0009687797255061829, + "loss": 3.699, + "step": 3680 + }, + { + "epoch": 0.25037369207772797, + "grad_norm": 0.6298483610153198, + "learning_rate": 0.0009687372604973503, + "loss": 3.6417, + "step": 3685 + }, + { + "epoch": 0.2507134121483897, + "grad_norm": 0.6586532592773438, + "learning_rate": 0.0009686947954885175, + "loss": 3.5655, + "step": 3690 + }, + { + "epoch": 0.2510531322190515, + "grad_norm": 0.6504899859428406, + "learning_rate": 0.0009686523304796847, + "loss": 3.8086, + "step": 3695 + }, + { + "epoch": 0.25139285228971325, + "grad_norm": 0.74556565284729, + "learning_rate": 0.000968609865470852, + "loss": 3.8454, + "step": 3700 + }, + { + "epoch": 0.25173257236037505, + "grad_norm": 0.6456407904624939, + "learning_rate": 0.0009685674004620193, + "loss": 3.7965, + "step": 3705 + }, + { + "epoch": 0.25207229243103685, + "grad_norm": 0.7533767223358154, + "learning_rate": 0.0009685249354531865, + "loss": 3.6608, + "step": 3710 + }, + { + "epoch": 0.2524120125016986, + "grad_norm": 0.8573164343833923, + "learning_rate": 0.0009684824704443539, + "loss": 3.7073, + "step": 3715 + }, + { + "epoch": 0.2527517325723604, + "grad_norm": 0.5270083546638489, + "learning_rate": 0.0009684400054355212, + "loss": 3.6085, + "step": 3720 + }, + { + "epoch": 0.25309145264302213, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0009683975404266885, + "loss": 3.7717, + "step": 3725 + }, + { + "epoch": 0.2534311727136839, + "grad_norm": 0.6941125988960266, + "learning_rate": 0.0009683550754178557, + "loss": 3.9123, + "step": 3730 + }, + { + "epoch": 0.2537708927843457, + "grad_norm": 0.6702671051025391, + "learning_rate": 0.000968312610409023, + "loss": 3.6305, + "step": 3735 + }, + { + "epoch": 0.25411061285500747, + "grad_norm": 0.7417623996734619, + "learning_rate": 0.0009682701454001903, + "loss": 3.7676, + "step": 3740 + }, + { + "epoch": 0.25445033292566926, + "grad_norm": 0.8509305715560913, + "learning_rate": 0.0009682276803913575, + "loss": 3.5831, + "step": 3745 + }, + { + "epoch": 0.254790052996331, + "grad_norm": 0.5842397212982178, + "learning_rate": 0.0009681852153825248, + "loss": 3.5472, + "step": 3750 + }, + { + "epoch": 0.2551297730669928, + "grad_norm": 0.8058325052261353, + "learning_rate": 0.0009681427503736922, + "loss": 3.913, + "step": 3755 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 1.5214065313339233, + "learning_rate": 0.0009681002853648594, + "loss": 3.6576, + "step": 3760 + }, + { + "epoch": 0.25580921320831634, + "grad_norm": 1.1635100841522217, + "learning_rate": 0.0009680578203560266, + "loss": 3.7046, + "step": 3765 + }, + { + "epoch": 0.25614893327897814, + "grad_norm": 0.7895268201828003, + "learning_rate": 0.000968015355347194, + "loss": 3.7453, + "step": 3770 + }, + { + "epoch": 0.2564886533496399, + "grad_norm": 0.583803653717041, + "learning_rate": 0.0009679728903383612, + "loss": 3.6592, + "step": 3775 + }, + { + "epoch": 0.2568283734203017, + "grad_norm": 0.8014993667602539, + "learning_rate": 0.0009679304253295284, + "loss": 3.8655, + "step": 3780 + }, + { + "epoch": 0.2571680934909634, + "grad_norm": 0.6175362467765808, + "learning_rate": 0.0009678879603206959, + "loss": 3.7565, + "step": 3785 + }, + { + "epoch": 0.2575078135616252, + "grad_norm": 0.5759825706481934, + "learning_rate": 0.0009678454953118631, + "loss": 3.7816, + "step": 3790 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 1.6779876947402954, + "learning_rate": 0.0009678030303030303, + "loss": 3.7719, + "step": 3795 + }, + { + "epoch": 0.25818725370294876, + "grad_norm": 0.6963607668876648, + "learning_rate": 0.0009677605652941976, + "loss": 3.6238, + "step": 3800 + }, + { + "epoch": 0.25852697377361056, + "grad_norm": 0.787282407283783, + "learning_rate": 0.0009677181002853649, + "loss": 3.671, + "step": 3805 + }, + { + "epoch": 0.2588666938442723, + "grad_norm": 0.7700339555740356, + "learning_rate": 0.0009676756352765321, + "loss": 3.6935, + "step": 3810 + }, + { + "epoch": 0.2592064139149341, + "grad_norm": 0.6205414533615112, + "learning_rate": 0.0009676331702676994, + "loss": 3.7573, + "step": 3815 + }, + { + "epoch": 0.2595461339855959, + "grad_norm": 1.2013931274414062, + "learning_rate": 0.0009675907052588668, + "loss": 3.5955, + "step": 3820 + }, + { + "epoch": 0.25988585405625764, + "grad_norm": 0.6477011442184448, + "learning_rate": 0.000967548240250034, + "loss": 3.7934, + "step": 3825 + }, + { + "epoch": 0.26022557412691943, + "grad_norm": 2.457035541534424, + "learning_rate": 0.0009675057752412013, + "loss": 3.5049, + "step": 3830 + }, + { + "epoch": 0.2605652941975812, + "grad_norm": 0.6924219131469727, + "learning_rate": 0.0009674633102323686, + "loss": 3.584, + "step": 3835 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 0.5830779075622559, + "learning_rate": 0.0009674208452235358, + "loss": 3.6796, + "step": 3840 + }, + { + "epoch": 0.2612447343389047, + "grad_norm": 0.6160967350006104, + "learning_rate": 0.0009673783802147031, + "loss": 3.8637, + "step": 3845 + }, + { + "epoch": 0.2615844544095665, + "grad_norm": 0.8427665829658508, + "learning_rate": 0.0009673359152058703, + "loss": 3.7788, + "step": 3850 + }, + { + "epoch": 0.2619241744802283, + "grad_norm": 0.9947366118431091, + "learning_rate": 0.0009672934501970377, + "loss": 3.5738, + "step": 3855 + }, + { + "epoch": 0.26226389455089005, + "grad_norm": 1.8660234212875366, + "learning_rate": 0.000967250985188205, + "loss": 3.8406, + "step": 3860 + }, + { + "epoch": 0.26260361462155185, + "grad_norm": 0.683830976486206, + "learning_rate": 0.0009672085201793722, + "loss": 3.6732, + "step": 3865 + }, + { + "epoch": 0.2629433346922136, + "grad_norm": 0.842989981174469, + "learning_rate": 0.0009671660551705395, + "loss": 3.4333, + "step": 3870 + }, + { + "epoch": 0.2632830547628754, + "grad_norm": 0.6797104477882385, + "learning_rate": 0.0009671235901617068, + "loss": 3.7615, + "step": 3875 + }, + { + "epoch": 0.2636227748335372, + "grad_norm": 0.5441763401031494, + "learning_rate": 0.000967081125152874, + "loss": 3.7936, + "step": 3880 + }, + { + "epoch": 0.26396249490419893, + "grad_norm": 0.6964337825775146, + "learning_rate": 0.0009670386601440412, + "loss": 3.6879, + "step": 3885 + }, + { + "epoch": 0.2643022149748607, + "grad_norm": 0.552566647529602, + "learning_rate": 0.0009669961951352087, + "loss": 3.7784, + "step": 3890 + }, + { + "epoch": 0.26464193504552247, + "grad_norm": 0.6542810797691345, + "learning_rate": 0.0009669537301263759, + "loss": 3.9229, + "step": 3895 + }, + { + "epoch": 0.26498165511618427, + "grad_norm": 1.285036325454712, + "learning_rate": 0.0009669112651175431, + "loss": 3.4898, + "step": 3900 + }, + { + "epoch": 0.26532137518684606, + "grad_norm": 0.8983030915260315, + "learning_rate": 0.0009668688001087105, + "loss": 4.0326, + "step": 3905 + }, + { + "epoch": 0.2656610952575078, + "grad_norm": 0.7825999855995178, + "learning_rate": 0.0009668263350998777, + "loss": 3.6073, + "step": 3910 + }, + { + "epoch": 0.2660008153281696, + "grad_norm": 0.6404388546943665, + "learning_rate": 0.0009667838700910449, + "loss": 3.92, + "step": 3915 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 0.7598031759262085, + "learning_rate": 0.0009667414050822123, + "loss": 3.9254, + "step": 3920 + }, + { + "epoch": 0.26668025546949314, + "grad_norm": 0.7887185215950012, + "learning_rate": 0.0009666989400733796, + "loss": 3.7347, + "step": 3925 + }, + { + "epoch": 0.2670199755401549, + "grad_norm": 0.5804632902145386, + "learning_rate": 0.0009666564750645468, + "loss": 3.4362, + "step": 3930 + }, + { + "epoch": 0.2673596956108167, + "grad_norm": 0.6359730362892151, + "learning_rate": 0.0009666140100557142, + "loss": 3.8811, + "step": 3935 + }, + { + "epoch": 0.2676994156814785, + "grad_norm": 0.5660666227340698, + "learning_rate": 0.0009665715450468814, + "loss": 3.854, + "step": 3940 + }, + { + "epoch": 0.2680391357521402, + "grad_norm": 0.7817276120185852, + "learning_rate": 0.0009665290800380486, + "loss": 3.5496, + "step": 3945 + }, + { + "epoch": 0.268378855822802, + "grad_norm": 0.6742070317268372, + "learning_rate": 0.0009664866150292159, + "loss": 3.9981, + "step": 3950 + }, + { + "epoch": 0.26871857589346376, + "grad_norm": 1.0658676624298096, + "learning_rate": 0.0009664441500203832, + "loss": 3.4829, + "step": 3955 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 0.5417182445526123, + "learning_rate": 0.0009664016850115505, + "loss": 3.7942, + "step": 3960 + }, + { + "epoch": 0.26939801603478736, + "grad_norm": 0.6700714826583862, + "learning_rate": 0.0009663592200027178, + "loss": 3.9201, + "step": 3965 + }, + { + "epoch": 0.2697377361054491, + "grad_norm": 0.7168427109718323, + "learning_rate": 0.0009663167549938851, + "loss": 3.5427, + "step": 3970 + }, + { + "epoch": 0.2700774561761109, + "grad_norm": 0.6957629323005676, + "learning_rate": 0.0009662742899850523, + "loss": 4.0149, + "step": 3975 + }, + { + "epoch": 0.27041717624677264, + "grad_norm": 0.733623743057251, + "learning_rate": 0.0009662318249762196, + "loss": 3.7773, + "step": 3980 + }, + { + "epoch": 0.27075689631743444, + "grad_norm": 0.7007580995559692, + "learning_rate": 0.0009661893599673868, + "loss": 3.9111, + "step": 3985 + }, + { + "epoch": 0.27109661638809623, + "grad_norm": 0.5961278676986694, + "learning_rate": 0.0009661468949585541, + "loss": 3.7626, + "step": 3990 + }, + { + "epoch": 0.271436336458758, + "grad_norm": 1.3057128190994263, + "learning_rate": 0.0009661044299497215, + "loss": 3.6284, + "step": 3995 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 0.7507727146148682, + "learning_rate": 0.0009660619649408887, + "loss": 3.9908, + "step": 4000 + }, + { + "epoch": 0.2721157766000815, + "grad_norm": 0.60472172498703, + "learning_rate": 0.000966019499932056, + "loss": 3.6484, + "step": 4005 + }, + { + "epoch": 0.2724554966707433, + "grad_norm": 1.0519737005233765, + "learning_rate": 0.0009659770349232233, + "loss": 3.6807, + "step": 4010 + }, + { + "epoch": 0.27279521674140506, + "grad_norm": 0.7857118844985962, + "learning_rate": 0.0009659345699143905, + "loss": 3.9858, + "step": 4015 + }, + { + "epoch": 0.27313493681206685, + "grad_norm": 0.6388773918151855, + "learning_rate": 0.0009658921049055578, + "loss": 3.5815, + "step": 4020 + }, + { + "epoch": 0.27347465688272865, + "grad_norm": 0.7875389456748962, + "learning_rate": 0.0009658496398967251, + "loss": 3.7148, + "step": 4025 + }, + { + "epoch": 0.2738143769533904, + "grad_norm": 0.6732829213142395, + "learning_rate": 0.0009658071748878924, + "loss": 3.4594, + "step": 4030 + }, + { + "epoch": 0.2741540970240522, + "grad_norm": 0.8218333721160889, + "learning_rate": 0.0009657647098790596, + "loss": 3.8076, + "step": 4035 + }, + { + "epoch": 0.27449381709471393, + "grad_norm": 0.6171180605888367, + "learning_rate": 0.000965722244870227, + "loss": 3.6279, + "step": 4040 + }, + { + "epoch": 0.27483353716537573, + "grad_norm": 0.6974437236785889, + "learning_rate": 0.0009656797798613942, + "loss": 3.981, + "step": 4045 + }, + { + "epoch": 0.2751732572360375, + "grad_norm": 0.7614907622337341, + "learning_rate": 0.0009656373148525614, + "loss": 3.8498, + "step": 4050 + }, + { + "epoch": 0.27551297730669927, + "grad_norm": 0.6335248351097107, + "learning_rate": 0.0009655948498437288, + "loss": 3.7597, + "step": 4055 + }, + { + "epoch": 0.27585269737736107, + "grad_norm": 9.429041862487793, + "learning_rate": 0.000965552384834896, + "loss": 3.7184, + "step": 4060 + }, + { + "epoch": 0.2761924174480228, + "grad_norm": 0.8975064158439636, + "learning_rate": 0.0009655099198260634, + "loss": 3.6527, + "step": 4065 + }, + { + "epoch": 0.2765321375186846, + "grad_norm": 0.7609211802482605, + "learning_rate": 0.0009654674548172307, + "loss": 3.5494, + "step": 4070 + }, + { + "epoch": 0.2768718575893464, + "grad_norm": 0.6155276894569397, + "learning_rate": 0.0009654249898083979, + "loss": 3.8064, + "step": 4075 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 0.6104815006256104, + "learning_rate": 0.0009653825247995652, + "loss": 3.7244, + "step": 4080 + }, + { + "epoch": 0.27755129773066994, + "grad_norm": 0.6848341822624207, + "learning_rate": 0.0009653400597907324, + "loss": 3.4612, + "step": 4085 + }, + { + "epoch": 0.2778910178013317, + "grad_norm": 0.6740646362304688, + "learning_rate": 0.0009652975947818997, + "loss": 3.7027, + "step": 4090 + }, + { + "epoch": 0.2782307378719935, + "grad_norm": 0.8074580430984497, + "learning_rate": 0.0009652551297730671, + "loss": 3.7178, + "step": 4095 + }, + { + "epoch": 0.2785704579426552, + "grad_norm": 0.6604176759719849, + "learning_rate": 0.0009652126647642343, + "loss": 3.8003, + "step": 4100 + }, + { + "epoch": 0.278910178013317, + "grad_norm": 0.5929292440414429, + "learning_rate": 0.0009651701997554016, + "loss": 3.546, + "step": 4105 + }, + { + "epoch": 0.2792498980839788, + "grad_norm": 0.7518572211265564, + "learning_rate": 0.0009651277347465689, + "loss": 3.7978, + "step": 4110 + }, + { + "epoch": 0.27958961815464056, + "grad_norm": 0.6609513759613037, + "learning_rate": 0.0009650852697377361, + "loss": 3.7332, + "step": 4115 + }, + { + "epoch": 0.27992933822530236, + "grad_norm": 1.497218370437622, + "learning_rate": 0.0009650428047289034, + "loss": 3.7084, + "step": 4120 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 0.7574613690376282, + "learning_rate": 0.0009650003397200707, + "loss": 3.5678, + "step": 4125 + }, + { + "epoch": 0.2806087783666259, + "grad_norm": 0.8804316520690918, + "learning_rate": 0.000964957874711238, + "loss": 3.6466, + "step": 4130 + }, + { + "epoch": 0.2809484984372877, + "grad_norm": 0.8992692232131958, + "learning_rate": 0.0009649154097024052, + "loss": 3.6122, + "step": 4135 + }, + { + "epoch": 0.28128821850794944, + "grad_norm": 0.9340124130249023, + "learning_rate": 0.0009648729446935726, + "loss": 3.6142, + "step": 4140 + }, + { + "epoch": 0.28162793857861124, + "grad_norm": 0.6643006205558777, + "learning_rate": 0.0009648304796847398, + "loss": 3.7525, + "step": 4145 + }, + { + "epoch": 0.281967658649273, + "grad_norm": 0.9309554696083069, + "learning_rate": 0.000964788014675907, + "loss": 3.9566, + "step": 4150 + }, + { + "epoch": 0.2823073787199348, + "grad_norm": 0.7548115253448486, + "learning_rate": 0.0009647455496670744, + "loss": 3.8406, + "step": 4155 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 0.7500674724578857, + "learning_rate": 0.0009647030846582416, + "loss": 3.8411, + "step": 4160 + }, + { + "epoch": 0.2829868188612583, + "grad_norm": 0.7849990725517273, + "learning_rate": 0.0009646606196494089, + "loss": 3.591, + "step": 4165 + }, + { + "epoch": 0.2833265389319201, + "grad_norm": 0.564659833908081, + "learning_rate": 0.0009646181546405763, + "loss": 3.6361, + "step": 4170 + }, + { + "epoch": 0.28366625900258186, + "grad_norm": 0.7155148386955261, + "learning_rate": 0.0009645756896317435, + "loss": 3.7331, + "step": 4175 + }, + { + "epoch": 0.28400597907324365, + "grad_norm": 1.0544288158416748, + "learning_rate": 0.0009645332246229107, + "loss": 3.43, + "step": 4180 + }, + { + "epoch": 0.2843456991439054, + "grad_norm": 0.7033450603485107, + "learning_rate": 0.000964490759614078, + "loss": 3.848, + "step": 4185 + }, + { + "epoch": 0.2846854192145672, + "grad_norm": 0.8436892032623291, + "learning_rate": 0.0009644482946052453, + "loss": 3.4662, + "step": 4190 + }, + { + "epoch": 0.285025139285229, + "grad_norm": 0.7199618220329285, + "learning_rate": 0.0009644058295964125, + "loss": 3.7603, + "step": 4195 + }, + { + "epoch": 0.28536485935589073, + "grad_norm": 0.6125022768974304, + "learning_rate": 0.0009643633645875799, + "loss": 3.7205, + "step": 4200 + }, + { + "epoch": 0.28570457942655253, + "grad_norm": 0.7270635962486267, + "learning_rate": 0.0009643208995787472, + "loss": 3.7614, + "step": 4205 + }, + { + "epoch": 0.28604429949721427, + "grad_norm": 0.8654095530509949, + "learning_rate": 0.0009642784345699144, + "loss": 3.7231, + "step": 4210 + }, + { + "epoch": 0.28638401956787607, + "grad_norm": 0.653009295463562, + "learning_rate": 0.0009642359695610817, + "loss": 3.744, + "step": 4215 + }, + { + "epoch": 0.28672373963853787, + "grad_norm": 0.7519428133964539, + "learning_rate": 0.000964193504552249, + "loss": 3.9326, + "step": 4220 + }, + { + "epoch": 0.2870634597091996, + "grad_norm": 0.7253366708755493, + "learning_rate": 0.0009641510395434162, + "loss": 3.9129, + "step": 4225 + }, + { + "epoch": 0.2874031797798614, + "grad_norm": 0.6784452795982361, + "learning_rate": 0.0009641085745345835, + "loss": 3.8294, + "step": 4230 + }, + { + "epoch": 0.28774289985052315, + "grad_norm": 0.5623319745063782, + "learning_rate": 0.0009640661095257509, + "loss": 3.5343, + "step": 4235 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 0.6914134621620178, + "learning_rate": 0.0009640236445169181, + "loss": 3.8806, + "step": 4240 + }, + { + "epoch": 0.28842233999184674, + "grad_norm": 0.6120445132255554, + "learning_rate": 0.0009639811795080854, + "loss": 3.8645, + "step": 4245 + }, + { + "epoch": 0.2887620600625085, + "grad_norm": 0.82246994972229, + "learning_rate": 0.0009639387144992526, + "loss": 3.6456, + "step": 4250 + }, + { + "epoch": 0.2891017801331703, + "grad_norm": 0.5524410009384155, + "learning_rate": 0.0009638962494904199, + "loss": 3.5396, + "step": 4255 + }, + { + "epoch": 0.289441500203832, + "grad_norm": 0.7351158261299133, + "learning_rate": 0.0009638537844815872, + "loss": 3.9921, + "step": 4260 + }, + { + "epoch": 0.2897812202744938, + "grad_norm": 0.6955817937850952, + "learning_rate": 0.0009638113194727544, + "loss": 3.6247, + "step": 4265 + }, + { + "epoch": 0.29012094034515556, + "grad_norm": 0.7541545629501343, + "learning_rate": 0.0009637688544639218, + "loss": 3.7175, + "step": 4270 + }, + { + "epoch": 0.29046066041581736, + "grad_norm": 0.6941908597946167, + "learning_rate": 0.0009637263894550891, + "loss": 3.768, + "step": 4275 + }, + { + "epoch": 0.29080038048647916, + "grad_norm": 1.0598753690719604, + "learning_rate": 0.0009636839244462563, + "loss": 3.7326, + "step": 4280 + }, + { + "epoch": 0.2911401005571409, + "grad_norm": 0.6204294562339783, + "learning_rate": 0.0009636414594374235, + "loss": 3.8622, + "step": 4285 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 0.9644733667373657, + "learning_rate": 0.0009635989944285909, + "loss": 3.5848, + "step": 4290 + }, + { + "epoch": 0.29181954069846444, + "grad_norm": 0.68808513879776, + "learning_rate": 0.0009635565294197581, + "loss": 3.9749, + "step": 4295 + }, + { + "epoch": 0.29215926076912624, + "grad_norm": 0.7334057688713074, + "learning_rate": 0.0009635140644109253, + "loss": 3.5588, + "step": 4300 + }, + { + "epoch": 0.29249898083978804, + "grad_norm": 0.7295488715171814, + "learning_rate": 0.0009634715994020928, + "loss": 3.4784, + "step": 4305 + }, + { + "epoch": 0.2928387009104498, + "grad_norm": 3.4902732372283936, + "learning_rate": 0.00096342913439326, + "loss": 3.8775, + "step": 4310 + }, + { + "epoch": 0.2931784209811116, + "grad_norm": 0.9271676540374756, + "learning_rate": 0.0009633866693844272, + "loss": 3.5975, + "step": 4315 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 0.7023747563362122, + "learning_rate": 0.0009633442043755946, + "loss": 3.653, + "step": 4320 + }, + { + "epoch": 0.2938578611224351, + "grad_norm": 0.6651545763015747, + "learning_rate": 0.0009633017393667618, + "loss": 3.5547, + "step": 4325 + }, + { + "epoch": 0.2941975811930969, + "grad_norm": 1.2860726118087769, + "learning_rate": 0.000963259274357929, + "loss": 3.7162, + "step": 4330 + }, + { + "epoch": 0.29453730126375866, + "grad_norm": 0.6647730469703674, + "learning_rate": 0.0009632168093490963, + "loss": 3.8163, + "step": 4335 + }, + { + "epoch": 0.29487702133442045, + "grad_norm": 0.9133235812187195, + "learning_rate": 0.0009631743443402637, + "loss": 3.6734, + "step": 4340 + }, + { + "epoch": 0.2952167414050822, + "grad_norm": 0.6673218607902527, + "learning_rate": 0.0009631318793314309, + "loss": 3.2863, + "step": 4345 + }, + { + "epoch": 0.295556461475744, + "grad_norm": 0.7894598841667175, + "learning_rate": 0.0009630894143225982, + "loss": 3.6399, + "step": 4350 + }, + { + "epoch": 0.29589618154640573, + "grad_norm": 0.7132781744003296, + "learning_rate": 0.0009630469493137655, + "loss": 3.7906, + "step": 4355 + }, + { + "epoch": 0.29623590161706753, + "grad_norm": 0.868952751159668, + "learning_rate": 0.0009630044843049327, + "loss": 3.5307, + "step": 4360 + }, + { + "epoch": 0.29657562168772933, + "grad_norm": 0.7264300584793091, + "learning_rate": 0.0009629620192961, + "loss": 3.8976, + "step": 4365 + }, + { + "epoch": 0.29691534175839107, + "grad_norm": 0.6526342630386353, + "learning_rate": 0.0009629195542872672, + "loss": 3.8408, + "step": 4370 + }, + { + "epoch": 0.29725506182905287, + "grad_norm": 0.8408504128456116, + "learning_rate": 0.0009628770892784346, + "loss": 3.573, + "step": 4375 + }, + { + "epoch": 0.2975947818997146, + "grad_norm": 0.761601448059082, + "learning_rate": 0.0009628346242696019, + "loss": 3.7411, + "step": 4380 + }, + { + "epoch": 0.2979345019703764, + "grad_norm": 0.6787109375, + "learning_rate": 0.0009627921592607691, + "loss": 3.727, + "step": 4385 + }, + { + "epoch": 0.2982742220410382, + "grad_norm": 0.6976729035377502, + "learning_rate": 0.0009627496942519364, + "loss": 3.8812, + "step": 4390 + }, + { + "epoch": 0.29861394211169995, + "grad_norm": 0.699778139591217, + "learning_rate": 0.0009627072292431037, + "loss": 3.8929, + "step": 4395 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 0.7088847160339355, + "learning_rate": 0.0009626647642342709, + "loss": 3.6514, + "step": 4400 + }, + { + "epoch": 0.2992933822530235, + "grad_norm": 0.9244754910469055, + "learning_rate": 0.0009626222992254383, + "loss": 3.708, + "step": 4405 + }, + { + "epoch": 0.2996331023236853, + "grad_norm": 0.7207773923873901, + "learning_rate": 0.0009625798342166056, + "loss": 3.8624, + "step": 4410 + }, + { + "epoch": 0.2999728223943471, + "grad_norm": 0.7858375310897827, + "learning_rate": 0.0009625373692077728, + "loss": 3.8962, + "step": 4415 + }, + { + "epoch": 0.3003125424650088, + "grad_norm": 8.455495834350586, + "learning_rate": 0.0009624949041989402, + "loss": 3.7216, + "step": 4420 + }, + { + "epoch": 0.3006522625356706, + "grad_norm": 0.5572919249534607, + "learning_rate": 0.0009624524391901074, + "loss": 3.8624, + "step": 4425 + }, + { + "epoch": 0.30099198260633236, + "grad_norm": 0.8806977272033691, + "learning_rate": 0.0009624099741812746, + "loss": 3.9396, + "step": 4430 + }, + { + "epoch": 0.30133170267699416, + "grad_norm": 0.9121387600898743, + "learning_rate": 0.0009623675091724419, + "loss": 3.9864, + "step": 4435 + }, + { + "epoch": 0.3016714227476559, + "grad_norm": 0.6563469767570496, + "learning_rate": 0.0009623250441636092, + "loss": 3.6004, + "step": 4440 + }, + { + "epoch": 0.3020111428183177, + "grad_norm": 0.6301701664924622, + "learning_rate": 0.0009622825791547765, + "loss": 3.7647, + "step": 4445 + }, + { + "epoch": 0.3023508628889795, + "grad_norm": 0.6203917860984802, + "learning_rate": 0.0009622401141459438, + "loss": 3.7287, + "step": 4450 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 0.5850993394851685, + "learning_rate": 0.0009621976491371111, + "loss": 3.514, + "step": 4455 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.7155147194862366, + "learning_rate": 0.0009621551841282783, + "loss": 3.5709, + "step": 4460 + }, + { + "epoch": 0.3033700231009648, + "grad_norm": 0.7303117513656616, + "learning_rate": 0.0009621127191194456, + "loss": 3.6832, + "step": 4465 + }, + { + "epoch": 0.3037097431716266, + "grad_norm": 0.6564908027648926, + "learning_rate": 0.0009620702541106129, + "loss": 3.5008, + "step": 4470 + }, + { + "epoch": 0.3040494632422884, + "grad_norm": 1.0435309410095215, + "learning_rate": 0.0009620277891017801, + "loss": 3.7409, + "step": 4475 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 0.7008757591247559, + "learning_rate": 0.0009619853240929475, + "loss": 3.8772, + "step": 4480 + }, + { + "epoch": 0.3047289033836119, + "grad_norm": 0.6302245855331421, + "learning_rate": 0.0009619428590841147, + "loss": 3.5767, + "step": 4485 + }, + { + "epoch": 0.30506862345427366, + "grad_norm": 0.9175706505775452, + "learning_rate": 0.000961900394075282, + "loss": 4.0241, + "step": 4490 + }, + { + "epoch": 0.30540834352493546, + "grad_norm": 1.321284532546997, + "learning_rate": 0.0009618579290664493, + "loss": 3.7147, + "step": 4495 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 0.6964834928512573, + "learning_rate": 0.0009618154640576165, + "loss": 3.7424, + "step": 4500 + }, + { + "epoch": 0.306087783666259, + "grad_norm": 0.67368483543396, + "learning_rate": 0.0009617729990487838, + "loss": 3.6055, + "step": 4505 + }, + { + "epoch": 0.3064275037369208, + "grad_norm": 0.7849504351615906, + "learning_rate": 0.0009617305340399511, + "loss": 4.0305, + "step": 4510 + }, + { + "epoch": 0.30676722380758253, + "grad_norm": 0.5120977163314819, + "learning_rate": 0.0009616880690311184, + "loss": 3.836, + "step": 4515 + }, + { + "epoch": 0.30710694387824433, + "grad_norm": 1.1372401714324951, + "learning_rate": 0.0009616456040222857, + "loss": 3.7954, + "step": 4520 + }, + { + "epoch": 0.3074466639489061, + "grad_norm": 0.7200825214385986, + "learning_rate": 0.000961603139013453, + "loss": 3.5077, + "step": 4525 + }, + { + "epoch": 0.30778638401956787, + "grad_norm": 0.9650672674179077, + "learning_rate": 0.0009615606740046202, + "loss": 3.8079, + "step": 4530 + }, + { + "epoch": 0.30812610409022967, + "grad_norm": 1.503319263458252, + "learning_rate": 0.0009615182089957874, + "loss": 3.5117, + "step": 4535 + }, + { + "epoch": 0.3084658241608914, + "grad_norm": 0.6932219862937927, + "learning_rate": 0.0009614757439869548, + "loss": 3.6669, + "step": 4540 + }, + { + "epoch": 0.3088055442315532, + "grad_norm": 0.8320767879486084, + "learning_rate": 0.000961433278978122, + "loss": 3.4303, + "step": 4545 + }, + { + "epoch": 0.30914526430221495, + "grad_norm": 0.8233153223991394, + "learning_rate": 0.0009613908139692893, + "loss": 3.5823, + "step": 4550 + }, + { + "epoch": 0.30948498437287675, + "grad_norm": 0.5292722582817078, + "learning_rate": 0.0009613483489604567, + "loss": 3.8802, + "step": 4555 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 0.9130657911300659, + "learning_rate": 0.0009613058839516239, + "loss": 3.7136, + "step": 4560 + }, + { + "epoch": 0.3101644245142003, + "grad_norm": 0.8069859147071838, + "learning_rate": 0.0009612634189427911, + "loss": 3.8371, + "step": 4565 + }, + { + "epoch": 0.3105041445848621, + "grad_norm": 0.6846060156822205, + "learning_rate": 0.0009612209539339585, + "loss": 3.8087, + "step": 4570 + }, + { + "epoch": 0.3108438646555238, + "grad_norm": 0.6855045557022095, + "learning_rate": 0.0009611784889251257, + "loss": 3.564, + "step": 4575 + }, + { + "epoch": 0.3111835847261856, + "grad_norm": 0.8640575408935547, + "learning_rate": 0.0009611360239162929, + "loss": 3.9872, + "step": 4580 + }, + { + "epoch": 0.3115233047968474, + "grad_norm": 0.6725939512252808, + "learning_rate": 0.0009610935589074603, + "loss": 3.754, + "step": 4585 + }, + { + "epoch": 0.31186302486750916, + "grad_norm": 0.824292004108429, + "learning_rate": 0.0009610510938986276, + "loss": 3.7079, + "step": 4590 + }, + { + "epoch": 0.31220274493817096, + "grad_norm": 0.6315171718597412, + "learning_rate": 0.0009610086288897948, + "loss": 3.7007, + "step": 4595 + }, + { + "epoch": 0.3125424650088327, + "grad_norm": 1.3393851518630981, + "learning_rate": 0.0009609661638809621, + "loss": 3.8582, + "step": 4600 + }, + { + "epoch": 0.3128821850794945, + "grad_norm": 0.5872925519943237, + "learning_rate": 0.0009609236988721294, + "loss": 3.7893, + "step": 4605 + }, + { + "epoch": 0.3132219051501563, + "grad_norm": 0.7772021889686584, + "learning_rate": 0.0009608812338632966, + "loss": 3.6748, + "step": 4610 + }, + { + "epoch": 0.31356162522081804, + "grad_norm": 0.6851856708526611, + "learning_rate": 0.000960838768854464, + "loss": 3.7071, + "step": 4615 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 0.9199478626251221, + "learning_rate": 0.0009607963038456313, + "loss": 3.9462, + "step": 4620 + }, + { + "epoch": 0.3142410653621416, + "grad_norm": 1.5282838344573975, + "learning_rate": 0.0009607538388367985, + "loss": 3.7539, + "step": 4625 + }, + { + "epoch": 0.3145807854328034, + "grad_norm": 0.5920034646987915, + "learning_rate": 0.0009607113738279658, + "loss": 3.5516, + "step": 4630 + }, + { + "epoch": 0.3149205055034651, + "grad_norm": 0.829941987991333, + "learning_rate": 0.000960668908819133, + "loss": 3.6942, + "step": 4635 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 0.6957166790962219, + "learning_rate": 0.0009606264438103003, + "loss": 3.8637, + "step": 4640 + }, + { + "epoch": 0.3155999456447887, + "grad_norm": 0.6346925497055054, + "learning_rate": 0.0009605839788014676, + "loss": 3.7296, + "step": 4645 + }, + { + "epoch": 0.31593966571545046, + "grad_norm": 0.6140884160995483, + "learning_rate": 0.0009605415137926349, + "loss": 3.7309, + "step": 4650 + }, + { + "epoch": 0.31627938578611225, + "grad_norm": 0.7769841551780701, + "learning_rate": 0.0009604990487838022, + "loss": 3.6057, + "step": 4655 + }, + { + "epoch": 0.316619105856774, + "grad_norm": 0.8793666958808899, + "learning_rate": 0.0009604565837749695, + "loss": 3.9553, + "step": 4660 + }, + { + "epoch": 0.3169588259274358, + "grad_norm": 0.882628321647644, + "learning_rate": 0.0009604141187661367, + "loss": 3.7049, + "step": 4665 + }, + { + "epoch": 0.3172985459980976, + "grad_norm": 0.6991147994995117, + "learning_rate": 0.0009603716537573039, + "loss": 3.993, + "step": 4670 + }, + { + "epoch": 0.31763826606875933, + "grad_norm": 0.6810653209686279, + "learning_rate": 0.0009603291887484713, + "loss": 3.8445, + "step": 4675 + }, + { + "epoch": 0.31797798613942113, + "grad_norm": 0.7361776828765869, + "learning_rate": 0.0009602867237396385, + "loss": 3.6926, + "step": 4680 + }, + { + "epoch": 0.3183177062100829, + "grad_norm": 0.4924222528934479, + "learning_rate": 0.0009602442587308058, + "loss": 3.9149, + "step": 4685 + }, + { + "epoch": 0.31865742628074467, + "grad_norm": 0.6900418996810913, + "learning_rate": 0.0009602017937219732, + "loss": 3.9071, + "step": 4690 + }, + { + "epoch": 0.31899714635140647, + "grad_norm": 0.5779551267623901, + "learning_rate": 0.0009601593287131404, + "loss": 3.8636, + "step": 4695 + }, + { + "epoch": 0.3193368664220682, + "grad_norm": 0.601230800151825, + "learning_rate": 0.0009601168637043076, + "loss": 3.7455, + "step": 4700 + }, + { + "epoch": 0.31967658649273, + "grad_norm": 0.5784784555435181, + "learning_rate": 0.000960074398695475, + "loss": 3.5984, + "step": 4705 + }, + { + "epoch": 0.32001630656339175, + "grad_norm": 0.8068006038665771, + "learning_rate": 0.0009600319336866422, + "loss": 3.5827, + "step": 4710 + }, + { + "epoch": 0.32035602663405355, + "grad_norm": 0.6079913973808289, + "learning_rate": 0.0009599894686778094, + "loss": 3.711, + "step": 4715 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 0.6110500693321228, + "learning_rate": 0.0009599470036689769, + "loss": 3.9128, + "step": 4720 + }, + { + "epoch": 0.3210354667753771, + "grad_norm": 0.6039453148841858, + "learning_rate": 0.0009599045386601441, + "loss": 3.855, + "step": 4725 + }, + { + "epoch": 0.3213751868460389, + "grad_norm": 0.7285047769546509, + "learning_rate": 0.0009598620736513113, + "loss": 3.6163, + "step": 4730 + }, + { + "epoch": 0.3217149069167006, + "grad_norm": 0.6464027762413025, + "learning_rate": 0.0009598196086424786, + "loss": 3.4137, + "step": 4735 + }, + { + "epoch": 0.3220546269873624, + "grad_norm": 0.7541970610618591, + "learning_rate": 0.0009597771436336459, + "loss": 3.608, + "step": 4740 + }, + { + "epoch": 0.32239434705802417, + "grad_norm": 0.944543182849884, + "learning_rate": 0.0009597346786248132, + "loss": 3.4899, + "step": 4745 + }, + { + "epoch": 0.32273406712868596, + "grad_norm": 0.7513295412063599, + "learning_rate": 0.0009596922136159804, + "loss": 3.6781, + "step": 4750 + }, + { + "epoch": 0.32307378719934776, + "grad_norm": 0.7207176089286804, + "learning_rate": 0.0009596497486071478, + "loss": 3.8801, + "step": 4755 + }, + { + "epoch": 0.3234135072700095, + "grad_norm": 0.7275598049163818, + "learning_rate": 0.0009596072835983151, + "loss": 3.5385, + "step": 4760 + }, + { + "epoch": 0.3237532273406713, + "grad_norm": 0.6477530002593994, + "learning_rate": 0.0009595648185894823, + "loss": 3.756, + "step": 4765 + }, + { + "epoch": 0.32409294741133304, + "grad_norm": 0.7552240490913391, + "learning_rate": 0.0009595223535806495, + "loss": 3.7757, + "step": 4770 + }, + { + "epoch": 0.32443266748199484, + "grad_norm": 0.6641505360603333, + "learning_rate": 0.0009594798885718169, + "loss": 3.547, + "step": 4775 + }, + { + "epoch": 0.32477238755265664, + "grad_norm": 0.6412693858146667, + "learning_rate": 0.0009594374235629841, + "loss": 3.8229, + "step": 4780 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 0.6422855257987976, + "learning_rate": 0.0009593949585541513, + "loss": 3.7117, + "step": 4785 + }, + { + "epoch": 0.3254518276939802, + "grad_norm": 0.6391351222991943, + "learning_rate": 0.0009593524935453188, + "loss": 3.5596, + "step": 4790 + }, + { + "epoch": 0.3257915477646419, + "grad_norm": 0.6480675339698792, + "learning_rate": 0.000959310028536486, + "loss": 3.767, + "step": 4795 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 0.7814311981201172, + "learning_rate": 0.0009592675635276532, + "loss": 3.492, + "step": 4800 + }, + { + "epoch": 0.32647098790596546, + "grad_norm": 1.228147268295288, + "learning_rate": 0.0009592250985188206, + "loss": 3.5758, + "step": 4805 + }, + { + "epoch": 0.32681070797662726, + "grad_norm": 0.7019321918487549, + "learning_rate": 0.0009591826335099878, + "loss": 3.9746, + "step": 4810 + }, + { + "epoch": 0.32715042804728905, + "grad_norm": 0.8690329790115356, + "learning_rate": 0.000959140168501155, + "loss": 3.5397, + "step": 4815 + }, + { + "epoch": 0.3274901481179508, + "grad_norm": 0.8478692173957825, + "learning_rate": 0.0009590977034923223, + "loss": 3.7612, + "step": 4820 + }, + { + "epoch": 0.3278298681886126, + "grad_norm": 1.0294584035873413, + "learning_rate": 0.0009590552384834897, + "loss": 3.8201, + "step": 4825 + }, + { + "epoch": 0.32816958825927434, + "grad_norm": 0.8577257990837097, + "learning_rate": 0.0009590127734746569, + "loss": 3.7823, + "step": 4830 + }, + { + "epoch": 0.32850930832993613, + "grad_norm": 0.7076607942581177, + "learning_rate": 0.0009589703084658242, + "loss": 3.7521, + "step": 4835 + }, + { + "epoch": 0.32884902840059793, + "grad_norm": 0.6526204943656921, + "learning_rate": 0.0009589278434569915, + "loss": 3.9563, + "step": 4840 + }, + { + "epoch": 0.3291887484712597, + "grad_norm": 0.732799768447876, + "learning_rate": 0.0009588853784481587, + "loss": 3.7572, + "step": 4845 + }, + { + "epoch": 0.32952846854192147, + "grad_norm": 1.0005061626434326, + "learning_rate": 0.000958842913439326, + "loss": 3.7358, + "step": 4850 + }, + { + "epoch": 0.3298681886125832, + "grad_norm": 0.8873110413551331, + "learning_rate": 0.0009588004484304933, + "loss": 3.8761, + "step": 4855 + }, + { + "epoch": 0.330207908683245, + "grad_norm": 0.9196025729179382, + "learning_rate": 0.0009587579834216606, + "loss": 3.408, + "step": 4860 + }, + { + "epoch": 0.3305476287539068, + "grad_norm": 0.7819507122039795, + "learning_rate": 0.0009587155184128279, + "loss": 3.8494, + "step": 4865 + }, + { + "epoch": 0.33088734882456855, + "grad_norm": 0.6501876711845398, + "learning_rate": 0.0009586730534039951, + "loss": 3.9834, + "step": 4870 + }, + { + "epoch": 0.33122706889523035, + "grad_norm": 0.6443763375282288, + "learning_rate": 0.0009586305883951624, + "loss": 3.4783, + "step": 4875 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 0.7660901546478271, + "learning_rate": 0.0009585881233863297, + "loss": 3.4174, + "step": 4880 + }, + { + "epoch": 0.3319065090365539, + "grad_norm": 0.7198460698127747, + "learning_rate": 0.0009585456583774969, + "loss": 3.8417, + "step": 4885 + }, + { + "epoch": 0.33224622910721563, + "grad_norm": 0.7867069244384766, + "learning_rate": 0.0009585031933686642, + "loss": 3.7688, + "step": 4890 + }, + { + "epoch": 0.3325859491778774, + "grad_norm": 0.619573712348938, + "learning_rate": 0.0009584607283598316, + "loss": 3.9949, + "step": 4895 + }, + { + "epoch": 0.3329256692485392, + "grad_norm": 0.750971794128418, + "learning_rate": 0.0009584182633509988, + "loss": 3.7354, + "step": 4900 + }, + { + "epoch": 0.33326538931920097, + "grad_norm": 0.7560511231422424, + "learning_rate": 0.0009583757983421661, + "loss": 3.6314, + "step": 4905 + }, + { + "epoch": 0.33360510938986276, + "grad_norm": 0.5877467393875122, + "learning_rate": 0.0009583333333333334, + "loss": 3.8765, + "step": 4910 + }, + { + "epoch": 0.3339448294605245, + "grad_norm": 1.5097427368164062, + "learning_rate": 0.0009582908683245006, + "loss": 3.8506, + "step": 4915 + }, + { + "epoch": 0.3342845495311863, + "grad_norm": 0.7352880239486694, + "learning_rate": 0.0009582484033156678, + "loss": 3.6109, + "step": 4920 + }, + { + "epoch": 0.3346242696018481, + "grad_norm": 0.6416025757789612, + "learning_rate": 0.0009582059383068352, + "loss": 3.8371, + "step": 4925 + }, + { + "epoch": 0.33496398967250984, + "grad_norm": 0.6576231718063354, + "learning_rate": 0.0009581634732980025, + "loss": 4.1808, + "step": 4930 + }, + { + "epoch": 0.33530370974317164, + "grad_norm": 0.7199269533157349, + "learning_rate": 0.0009581210082891697, + "loss": 3.7073, + "step": 4935 + }, + { + "epoch": 0.3356434298138334, + "grad_norm": 0.578088104724884, + "learning_rate": 0.0009580785432803371, + "loss": 3.6145, + "step": 4940 + }, + { + "epoch": 0.3359831498844952, + "grad_norm": 0.5751422047615051, + "learning_rate": 0.0009580360782715043, + "loss": 3.802, + "step": 4945 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 0.7758949398994446, + "learning_rate": 0.0009579936132626715, + "loss": 3.8168, + "step": 4950 + }, + { + "epoch": 0.3366625900258187, + "grad_norm": 0.503652811050415, + "learning_rate": 0.0009579511482538389, + "loss": 3.8077, + "step": 4955 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 0.5707221031188965, + "learning_rate": 0.0009579086832450061, + "loss": 3.603, + "step": 4960 + }, + { + "epoch": 0.33734203016714226, + "grad_norm": 0.530886173248291, + "learning_rate": 0.0009578662182361734, + "loss": 3.7883, + "step": 4965 + }, + { + "epoch": 0.33768175023780406, + "grad_norm": 1.2082452774047852, + "learning_rate": 0.0009578237532273408, + "loss": 3.4972, + "step": 4970 + }, + { + "epoch": 0.3380214703084658, + "grad_norm": 0.7403182983398438, + "learning_rate": 0.000957781288218508, + "loss": 3.6696, + "step": 4975 + }, + { + "epoch": 0.3383611903791276, + "grad_norm": 0.6957752704620361, + "learning_rate": 0.0009577388232096752, + "loss": 3.7591, + "step": 4980 + }, + { + "epoch": 0.3387009104497894, + "grad_norm": 0.60281902551651, + "learning_rate": 0.0009576963582008425, + "loss": 3.6431, + "step": 4985 + }, + { + "epoch": 0.33904063052045114, + "grad_norm": 1.0899558067321777, + "learning_rate": 0.0009576538931920098, + "loss": 3.7729, + "step": 4990 + }, + { + "epoch": 0.33938035059111293, + "grad_norm": 0.6822868585586548, + "learning_rate": 0.000957611428183177, + "loss": 3.893, + "step": 4995 + }, + { + "epoch": 0.3397200706617747, + "grad_norm": 0.6790818572044373, + "learning_rate": 0.0009575689631743444, + "loss": 3.6671, + "step": 5000 + }, + { + "epoch": 0.3400597907324365, + "grad_norm": 0.609584629535675, + "learning_rate": 0.0009575264981655117, + "loss": 3.9263, + "step": 5005 + }, + { + "epoch": 0.34039951080309827, + "grad_norm": 0.7966527938842773, + "learning_rate": 0.0009574840331566789, + "loss": 3.6249, + "step": 5010 + }, + { + "epoch": 0.34073923087376, + "grad_norm": 0.6699427962303162, + "learning_rate": 0.0009574415681478462, + "loss": 3.5491, + "step": 5015 + }, + { + "epoch": 0.3410789509444218, + "grad_norm": 0.6803253293037415, + "learning_rate": 0.0009573991031390134, + "loss": 3.5218, + "step": 5020 + }, + { + "epoch": 0.34141867101508355, + "grad_norm": 0.7213478684425354, + "learning_rate": 0.0009573566381301807, + "loss": 3.4764, + "step": 5025 + }, + { + "epoch": 0.34175839108574535, + "grad_norm": 0.7943812012672424, + "learning_rate": 0.000957314173121348, + "loss": 3.6295, + "step": 5030 + }, + { + "epoch": 0.34209811115640715, + "grad_norm": 0.7619991302490234, + "learning_rate": 0.0009572717081125153, + "loss": 3.5504, + "step": 5035 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 0.7792290449142456, + "learning_rate": 0.0009572292431036826, + "loss": 3.6003, + "step": 5040 + }, + { + "epoch": 0.3427775512977307, + "grad_norm": 0.6852863430976868, + "learning_rate": 0.0009571867780948499, + "loss": 3.7408, + "step": 5045 + }, + { + "epoch": 0.34311727136839243, + "grad_norm": 0.811378538608551, + "learning_rate": 0.0009571443130860171, + "loss": 3.7876, + "step": 5050 + }, + { + "epoch": 0.3434569914390542, + "grad_norm": 0.4857918918132782, + "learning_rate": 0.0009571018480771843, + "loss": 3.9571, + "step": 5055 + }, + { + "epoch": 0.34379671150971597, + "grad_norm": 0.7899984121322632, + "learning_rate": 0.0009570593830683517, + "loss": 3.5181, + "step": 5060 + }, + { + "epoch": 0.34413643158037777, + "grad_norm": 0.6474937796592712, + "learning_rate": 0.0009570169180595189, + "loss": 3.4179, + "step": 5065 + }, + { + "epoch": 0.34447615165103956, + "grad_norm": 0.5543707609176636, + "learning_rate": 0.0009569744530506862, + "loss": 3.6362, + "step": 5070 + }, + { + "epoch": 0.3448158717217013, + "grad_norm": 0.6907731890678406, + "learning_rate": 0.0009569319880418536, + "loss": 3.6122, + "step": 5075 + }, + { + "epoch": 0.3451555917923631, + "grad_norm": 0.9422761797904968, + "learning_rate": 0.0009568895230330208, + "loss": 3.6825, + "step": 5080 + }, + { + "epoch": 0.34549531186302485, + "grad_norm": 0.8774054050445557, + "learning_rate": 0.0009568470580241881, + "loss": 3.557, + "step": 5085 + }, + { + "epoch": 0.34583503193368664, + "grad_norm": 0.858328640460968, + "learning_rate": 0.0009568045930153554, + "loss": 3.5748, + "step": 5090 + }, + { + "epoch": 0.34617475200434844, + "grad_norm": 0.7406771183013916, + "learning_rate": 0.0009567621280065226, + "loss": 3.7991, + "step": 5095 + }, + { + "epoch": 0.3465144720750102, + "grad_norm": 0.6878300309181213, + "learning_rate": 0.0009567196629976899, + "loss": 3.4881, + "step": 5100 + }, + { + "epoch": 0.346854192145672, + "grad_norm": 0.6378861665725708, + "learning_rate": 0.0009566771979888573, + "loss": 3.7114, + "step": 5105 + }, + { + "epoch": 0.3471939122163337, + "grad_norm": 0.6227318644523621, + "learning_rate": 0.0009566347329800245, + "loss": 3.7425, + "step": 5110 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 0.6556995511054993, + "learning_rate": 0.0009565922679711918, + "loss": 3.7494, + "step": 5115 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 0.6490387916564941, + "learning_rate": 0.000956549802962359, + "loss": 3.6327, + "step": 5120 + }, + { + "epoch": 0.34821307242831906, + "grad_norm": 0.6596065163612366, + "learning_rate": 0.0009565073379535263, + "loss": 3.8066, + "step": 5125 + }, + { + "epoch": 0.34855279249898086, + "grad_norm": 0.5992652177810669, + "learning_rate": 0.0009564648729446936, + "loss": 3.5085, + "step": 5130 + }, + { + "epoch": 0.3488925125696426, + "grad_norm": 0.6926514506340027, + "learning_rate": 0.0009564224079358608, + "loss": 3.5554, + "step": 5135 + }, + { + "epoch": 0.3492322326403044, + "grad_norm": 0.5760352611541748, + "learning_rate": 0.0009563799429270282, + "loss": 3.6262, + "step": 5140 + }, + { + "epoch": 0.34957195271096614, + "grad_norm": 0.7403231263160706, + "learning_rate": 0.0009563374779181955, + "loss": 3.4308, + "step": 5145 + }, + { + "epoch": 0.34991167278162794, + "grad_norm": 0.6540454626083374, + "learning_rate": 0.0009562950129093627, + "loss": 3.8298, + "step": 5150 + }, + { + "epoch": 0.35025139285228973, + "grad_norm": 0.584364116191864, + "learning_rate": 0.00095625254790053, + "loss": 3.5342, + "step": 5155 + }, + { + "epoch": 0.3505911129229515, + "grad_norm": 0.6938738226890564, + "learning_rate": 0.0009562100828916973, + "loss": 3.8931, + "step": 5160 + }, + { + "epoch": 0.3509308329936133, + "grad_norm": 0.5901440978050232, + "learning_rate": 0.0009561676178828645, + "loss": 3.7538, + "step": 5165 + }, + { + "epoch": 0.351270553064275, + "grad_norm": 0.574917733669281, + "learning_rate": 0.0009561251528740317, + "loss": 3.5968, + "step": 5170 + }, + { + "epoch": 0.3516102731349368, + "grad_norm": 0.7238849401473999, + "learning_rate": 0.0009560826878651992, + "loss": 3.4551, + "step": 5175 + }, + { + "epoch": 0.3519499932055986, + "grad_norm": 0.8680605292320251, + "learning_rate": 0.0009560402228563664, + "loss": 3.8272, + "step": 5180 + }, + { + "epoch": 0.35228971327626035, + "grad_norm": 0.7943771481513977, + "learning_rate": 0.0009559977578475336, + "loss": 3.6301, + "step": 5185 + }, + { + "epoch": 0.35262943334692215, + "grad_norm": 0.6093536019325256, + "learning_rate": 0.000955955292838701, + "loss": 3.7729, + "step": 5190 + }, + { + "epoch": 0.3529691534175839, + "grad_norm": 0.672134280204773, + "learning_rate": 0.0009559128278298682, + "loss": 3.6954, + "step": 5195 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 0.7200559973716736, + "learning_rate": 0.0009558703628210354, + "loss": 3.7342, + "step": 5200 + }, + { + "epoch": 0.3536485935589075, + "grad_norm": 0.6872317790985107, + "learning_rate": 0.0009558278978122029, + "loss": 3.5735, + "step": 5205 + }, + { + "epoch": 0.35398831362956923, + "grad_norm": 0.8210635185241699, + "learning_rate": 0.0009557854328033701, + "loss": 3.7587, + "step": 5210 + }, + { + "epoch": 0.354328033700231, + "grad_norm": 0.5938117504119873, + "learning_rate": 0.0009557429677945373, + "loss": 3.865, + "step": 5215 + }, + { + "epoch": 0.35466775377089277, + "grad_norm": 0.8404695987701416, + "learning_rate": 0.0009557005027857046, + "loss": 3.8386, + "step": 5220 + }, + { + "epoch": 0.35500747384155457, + "grad_norm": 0.6327378749847412, + "learning_rate": 0.0009556580377768719, + "loss": 3.6112, + "step": 5225 + }, + { + "epoch": 0.3553471939122163, + "grad_norm": 0.6330134868621826, + "learning_rate": 0.0009556155727680391, + "loss": 3.6638, + "step": 5230 + }, + { + "epoch": 0.3556869139828781, + "grad_norm": 0.6879119277000427, + "learning_rate": 0.0009555731077592064, + "loss": 3.5412, + "step": 5235 + }, + { + "epoch": 0.3560266340535399, + "grad_norm": 0.591391921043396, + "learning_rate": 0.0009555306427503738, + "loss": 3.7826, + "step": 5240 + }, + { + "epoch": 0.35636635412420165, + "grad_norm": 0.677183985710144, + "learning_rate": 0.000955488177741541, + "loss": 3.5899, + "step": 5245 + }, + { + "epoch": 0.35670607419486344, + "grad_norm": 0.6066392660140991, + "learning_rate": 0.0009554457127327083, + "loss": 3.8977, + "step": 5250 + }, + { + "epoch": 0.3570457942655252, + "grad_norm": 0.5931090712547302, + "learning_rate": 0.0009554032477238756, + "loss": 3.5701, + "step": 5255 + }, + { + "epoch": 0.357385514336187, + "grad_norm": 0.6249707937240601, + "learning_rate": 0.0009553607827150428, + "loss": 3.6335, + "step": 5260 + }, + { + "epoch": 0.3577252344068488, + "grad_norm": 0.6523417234420776, + "learning_rate": 0.0009553183177062101, + "loss": 3.3794, + "step": 5265 + }, + { + "epoch": 0.3580649544775105, + "grad_norm": 0.5881463885307312, + "learning_rate": 0.0009552758526973773, + "loss": 3.6023, + "step": 5270 + }, + { + "epoch": 0.3584046745481723, + "grad_norm": 0.6209092140197754, + "learning_rate": 0.0009552333876885447, + "loss": 3.8411, + "step": 5275 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 0.5740726590156555, + "learning_rate": 0.000955190922679712, + "loss": 3.603, + "step": 5280 + }, + { + "epoch": 0.35908411468949586, + "grad_norm": 0.7282506823539734, + "learning_rate": 0.0009551484576708792, + "loss": 3.7519, + "step": 5285 + }, + { + "epoch": 0.35942383476015766, + "grad_norm": 0.8259398937225342, + "learning_rate": 0.0009551059926620465, + "loss": 3.7978, + "step": 5290 + }, + { + "epoch": 0.3597635548308194, + "grad_norm": 0.9605500102043152, + "learning_rate": 0.0009550635276532138, + "loss": 3.9071, + "step": 5295 + }, + { + "epoch": 0.3601032749014812, + "grad_norm": 0.5904541611671448, + "learning_rate": 0.000955021062644381, + "loss": 3.8455, + "step": 5300 + }, + { + "epoch": 0.36044299497214294, + "grad_norm": 2.283682107925415, + "learning_rate": 0.0009549785976355482, + "loss": 3.982, + "step": 5305 + }, + { + "epoch": 0.36078271504280474, + "grad_norm": 0.539195716381073, + "learning_rate": 0.0009549361326267157, + "loss": 3.852, + "step": 5310 + }, + { + "epoch": 0.3611224351134665, + "grad_norm": 0.6448954343795776, + "learning_rate": 0.0009548936676178829, + "loss": 3.9513, + "step": 5315 + }, + { + "epoch": 0.3614621551841283, + "grad_norm": 0.6438931822776794, + "learning_rate": 0.0009548512026090501, + "loss": 3.6161, + "step": 5320 + }, + { + "epoch": 0.3618018752547901, + "grad_norm": 0.665449321269989, + "learning_rate": 0.0009548087376002175, + "loss": 3.9527, + "step": 5325 + }, + { + "epoch": 0.3621415953254518, + "grad_norm": 0.5927137732505798, + "learning_rate": 0.0009547662725913847, + "loss": 3.7031, + "step": 5330 + }, + { + "epoch": 0.3624813153961136, + "grad_norm": 0.8576391339302063, + "learning_rate": 0.0009547238075825519, + "loss": 4.017, + "step": 5335 + }, + { + "epoch": 0.36282103546677535, + "grad_norm": 0.7774738669395447, + "learning_rate": 0.0009546813425737193, + "loss": 3.5992, + "step": 5340 + }, + { + "epoch": 0.36316075553743715, + "grad_norm": 0.7366918325424194, + "learning_rate": 0.0009546388775648866, + "loss": 3.607, + "step": 5345 + }, + { + "epoch": 0.36350047560809895, + "grad_norm": 0.8402129411697388, + "learning_rate": 0.0009545964125560538, + "loss": 3.7437, + "step": 5350 + }, + { + "epoch": 0.3638401956787607, + "grad_norm": 0.5433735251426697, + "learning_rate": 0.0009545539475472212, + "loss": 3.3804, + "step": 5355 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 0.7312291860580444, + "learning_rate": 0.0009545114825383884, + "loss": 3.5889, + "step": 5360 + }, + { + "epoch": 0.36451963582008423, + "grad_norm": 0.9587416052818298, + "learning_rate": 0.0009544690175295556, + "loss": 3.8001, + "step": 5365 + }, + { + "epoch": 0.36485935589074603, + "grad_norm": 0.7783311605453491, + "learning_rate": 0.0009544265525207229, + "loss": 3.4252, + "step": 5370 + }, + { + "epoch": 0.3651990759614078, + "grad_norm": 0.6259825825691223, + "learning_rate": 0.0009543840875118902, + "loss": 3.6209, + "step": 5375 + }, + { + "epoch": 0.36553879603206957, + "grad_norm": 0.743801474571228, + "learning_rate": 0.0009543416225030575, + "loss": 3.5814, + "step": 5380 + }, + { + "epoch": 0.36587851610273137, + "grad_norm": 0.6171203851699829, + "learning_rate": 0.0009542991574942248, + "loss": 3.7959, + "step": 5385 + }, + { + "epoch": 0.3662182361733931, + "grad_norm": 0.7721573710441589, + "learning_rate": 0.0009542566924853921, + "loss": 3.6595, + "step": 5390 + }, + { + "epoch": 0.3665579562440549, + "grad_norm": 1.5711735486984253, + "learning_rate": 0.0009542142274765593, + "loss": 3.5556, + "step": 5395 + }, + { + "epoch": 0.36689767631471665, + "grad_norm": 0.648213267326355, + "learning_rate": 0.0009541717624677266, + "loss": 3.6964, + "step": 5400 + }, + { + "epoch": 0.36723739638537845, + "grad_norm": 0.7098330855369568, + "learning_rate": 0.0009541292974588938, + "loss": 3.4736, + "step": 5405 + }, + { + "epoch": 0.36757711645604024, + "grad_norm": 0.6999013423919678, + "learning_rate": 0.0009540868324500611, + "loss": 3.6081, + "step": 5410 + }, + { + "epoch": 0.367916836526702, + "grad_norm": 0.6148515939712524, + "learning_rate": 0.0009540443674412285, + "loss": 3.7102, + "step": 5415 + }, + { + "epoch": 0.3682565565973638, + "grad_norm": 0.6147603988647461, + "learning_rate": 0.0009540019024323957, + "loss": 3.6812, + "step": 5420 + }, + { + "epoch": 0.3685962766680255, + "grad_norm": 0.89749675989151, + "learning_rate": 0.0009539594374235631, + "loss": 3.4323, + "step": 5425 + }, + { + "epoch": 0.3689359967386873, + "grad_norm": 0.5822694897651672, + "learning_rate": 0.0009539169724147303, + "loss": 3.6993, + "step": 5430 + }, + { + "epoch": 0.3692757168093491, + "grad_norm": 0.8035125136375427, + "learning_rate": 0.0009538745074058975, + "loss": 3.6355, + "step": 5435 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 0.8102260231971741, + "learning_rate": 0.0009538320423970649, + "loss": 3.5008, + "step": 5440 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 0.6941107511520386, + "learning_rate": 0.0009537895773882321, + "loss": 3.8025, + "step": 5445 + }, + { + "epoch": 0.3702948770213344, + "grad_norm": 0.8430598378181458, + "learning_rate": 0.0009537471123793994, + "loss": 3.5844, + "step": 5450 + }, + { + "epoch": 0.3706345970919962, + "grad_norm": 0.6645303964614868, + "learning_rate": 0.0009537046473705668, + "loss": 3.7972, + "step": 5455 + }, + { + "epoch": 0.370974317162658, + "grad_norm": 0.8034586310386658, + "learning_rate": 0.000953662182361734, + "loss": 3.6585, + "step": 5460 + }, + { + "epoch": 0.37131403723331974, + "grad_norm": 0.8142147064208984, + "learning_rate": 0.0009536197173529012, + "loss": 3.5477, + "step": 5465 + }, + { + "epoch": 0.37165375730398154, + "grad_norm": 0.8548118472099304, + "learning_rate": 0.0009535772523440685, + "loss": 3.859, + "step": 5470 + }, + { + "epoch": 0.3719934773746433, + "grad_norm": 0.6780622601509094, + "learning_rate": 0.0009535347873352358, + "loss": 3.86, + "step": 5475 + }, + { + "epoch": 0.3723331974453051, + "grad_norm": 0.6391974687576294, + "learning_rate": 0.000953492322326403, + "loss": 3.8442, + "step": 5480 + }, + { + "epoch": 0.3726729175159668, + "grad_norm": 0.8116501569747925, + "learning_rate": 0.0009534498573175704, + "loss": 3.9848, + "step": 5485 + }, + { + "epoch": 0.3730126375866286, + "grad_norm": 0.825129508972168, + "learning_rate": 0.0009534073923087377, + "loss": 3.618, + "step": 5490 + }, + { + "epoch": 0.3733523576572904, + "grad_norm": 0.6496371030807495, + "learning_rate": 0.0009533649272999049, + "loss": 3.6847, + "step": 5495 + }, + { + "epoch": 0.37369207772795215, + "grad_norm": 0.7003557085990906, + "learning_rate": 0.0009533224622910722, + "loss": 3.7773, + "step": 5500 + }, + { + "epoch": 0.37403179779861395, + "grad_norm": 0.7326276898384094, + "learning_rate": 0.0009532799972822394, + "loss": 3.8642, + "step": 5505 + }, + { + "epoch": 0.3743715178692757, + "grad_norm": 0.6305473446846008, + "learning_rate": 0.0009532375322734067, + "loss": 3.5758, + "step": 5510 + }, + { + "epoch": 0.3747112379399375, + "grad_norm": 0.8362885117530823, + "learning_rate": 0.000953195067264574, + "loss": 3.8094, + "step": 5515 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 1.1799019575119019, + "learning_rate": 0.0009531526022557413, + "loss": 3.4337, + "step": 5520 + }, + { + "epoch": 0.37539067808126103, + "grad_norm": 0.6741048693656921, + "learning_rate": 0.0009531101372469086, + "loss": 3.5718, + "step": 5525 + }, + { + "epoch": 0.37573039815192283, + "grad_norm": 0.7841007709503174, + "learning_rate": 0.0009530676722380759, + "loss": 3.6104, + "step": 5530 + }, + { + "epoch": 0.37607011822258457, + "grad_norm": 0.5769286155700684, + "learning_rate": 0.0009530252072292431, + "loss": 3.9677, + "step": 5535 + }, + { + "epoch": 0.37640983829324637, + "grad_norm": 0.760215699672699, + "learning_rate": 0.0009529827422204104, + "loss": 3.7755, + "step": 5540 + }, + { + "epoch": 0.37674955836390817, + "grad_norm": 0.6206812262535095, + "learning_rate": 0.0009529402772115777, + "loss": 4.0377, + "step": 5545 + }, + { + "epoch": 0.3770892784345699, + "grad_norm": 0.5871062874794006, + "learning_rate": 0.0009528978122027449, + "loss": 3.891, + "step": 5550 + }, + { + "epoch": 0.3774289985052317, + "grad_norm": 0.6532780528068542, + "learning_rate": 0.0009528553471939122, + "loss": 3.8963, + "step": 5555 + }, + { + "epoch": 0.37776871857589345, + "grad_norm": 0.867729127407074, + "learning_rate": 0.0009528128821850796, + "loss": 3.845, + "step": 5560 + }, + { + "epoch": 0.37810843864655524, + "grad_norm": 0.716459333896637, + "learning_rate": 0.0009527704171762468, + "loss": 3.611, + "step": 5565 + }, + { + "epoch": 0.378448158717217, + "grad_norm": 0.867291271686554, + "learning_rate": 0.000952727952167414, + "loss": 3.769, + "step": 5570 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.6595242023468018, + "learning_rate": 0.0009526854871585814, + "loss": 3.5169, + "step": 5575 + }, + { + "epoch": 0.3791275988585406, + "grad_norm": 0.7978132367134094, + "learning_rate": 0.0009526430221497486, + "loss": 3.5182, + "step": 5580 + }, + { + "epoch": 0.3794673189292023, + "grad_norm": 0.7491714954376221, + "learning_rate": 0.0009526005571409158, + "loss": 3.6077, + "step": 5585 + }, + { + "epoch": 0.3798070389998641, + "grad_norm": 1.1051557064056396, + "learning_rate": 0.0009525580921320833, + "loss": 3.5316, + "step": 5590 + }, + { + "epoch": 0.38014675907052586, + "grad_norm": 0.656012773513794, + "learning_rate": 0.0009525156271232505, + "loss": 3.6091, + "step": 5595 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 0.5540869235992432, + "learning_rate": 0.0009524731621144177, + "loss": 3.8614, + "step": 5600 + }, + { + "epoch": 0.38082619921184946, + "grad_norm": 0.9618043899536133, + "learning_rate": 0.000952430697105585, + "loss": 3.8121, + "step": 5605 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 0.7721150517463684, + "learning_rate": 0.0009523882320967523, + "loss": 3.5466, + "step": 5610 + }, + { + "epoch": 0.381505639353173, + "grad_norm": 0.6684600114822388, + "learning_rate": 0.0009523457670879195, + "loss": 3.61, + "step": 5615 + }, + { + "epoch": 0.38184535942383474, + "grad_norm": 0.6384857296943665, + "learning_rate": 0.0009523033020790868, + "loss": 3.7112, + "step": 5620 + }, + { + "epoch": 0.38218507949449654, + "grad_norm": 0.8800211548805237, + "learning_rate": 0.0009522608370702542, + "loss": 3.6356, + "step": 5625 + }, + { + "epoch": 0.38252479956515834, + "grad_norm": 0.7811053395271301, + "learning_rate": 0.0009522183720614214, + "loss": 3.808, + "step": 5630 + }, + { + "epoch": 0.3828645196358201, + "grad_norm": 0.8912071585655212, + "learning_rate": 0.0009521759070525887, + "loss": 3.5655, + "step": 5635 + }, + { + "epoch": 0.3832042397064819, + "grad_norm": 0.7520741820335388, + "learning_rate": 0.000952133442043756, + "loss": 3.7796, + "step": 5640 + }, + { + "epoch": 0.3835439597771436, + "grad_norm": 0.7486137747764587, + "learning_rate": 0.0009520909770349232, + "loss": 3.7903, + "step": 5645 + }, + { + "epoch": 0.3838836798478054, + "grad_norm": 0.7758468985557556, + "learning_rate": 0.0009520485120260905, + "loss": 3.6035, + "step": 5650 + }, + { + "epoch": 0.38422339991846716, + "grad_norm": 0.6716181039810181, + "learning_rate": 0.0009520060470172577, + "loss": 3.8629, + "step": 5655 + }, + { + "epoch": 0.38456311998912895, + "grad_norm": 0.6017566323280334, + "learning_rate": 0.0009519635820084251, + "loss": 3.6705, + "step": 5660 + }, + { + "epoch": 0.38490284005979075, + "grad_norm": 0.7134028673171997, + "learning_rate": 0.0009519211169995924, + "loss": 3.6861, + "step": 5665 + }, + { + "epoch": 0.3852425601304525, + "grad_norm": 0.7527996301651001, + "learning_rate": 0.0009518786519907596, + "loss": 3.572, + "step": 5670 + }, + { + "epoch": 0.3855822802011143, + "grad_norm": 0.6522907614707947, + "learning_rate": 0.0009518361869819269, + "loss": 3.6314, + "step": 5675 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 0.6650054454803467, + "learning_rate": 0.0009517937219730942, + "loss": 3.6111, + "step": 5680 + }, + { + "epoch": 0.38626172034243783, + "grad_norm": 0.6412862539291382, + "learning_rate": 0.0009517512569642614, + "loss": 3.5796, + "step": 5685 + }, + { + "epoch": 0.38660144041309963, + "grad_norm": 0.757982075214386, + "learning_rate": 0.0009517087919554286, + "loss": 3.8244, + "step": 5690 + }, + { + "epoch": 0.38694116048376137, + "grad_norm": 0.6149710416793823, + "learning_rate": 0.0009516663269465961, + "loss": 3.5768, + "step": 5695 + }, + { + "epoch": 0.38728088055442317, + "grad_norm": 0.8086115121841431, + "learning_rate": 0.0009516238619377633, + "loss": 3.7171, + "step": 5700 + }, + { + "epoch": 0.3876206006250849, + "grad_norm": 0.7600422501564026, + "learning_rate": 0.0009515813969289305, + "loss": 3.659, + "step": 5705 + }, + { + "epoch": 0.3879603206957467, + "grad_norm": 0.5905424356460571, + "learning_rate": 0.0009515389319200979, + "loss": 3.7003, + "step": 5710 + }, + { + "epoch": 0.3883000407664085, + "grad_norm": 0.7332346439361572, + "learning_rate": 0.0009514964669112651, + "loss": 3.7962, + "step": 5715 + }, + { + "epoch": 0.38863976083707025, + "grad_norm": 0.7058977484703064, + "learning_rate": 0.0009514540019024323, + "loss": 3.7295, + "step": 5720 + }, + { + "epoch": 0.38897948090773204, + "grad_norm": 0.6611815690994263, + "learning_rate": 0.0009514115368935997, + "loss": 3.7657, + "step": 5725 + }, + { + "epoch": 0.3893192009783938, + "grad_norm": 0.6847200989723206, + "learning_rate": 0.000951369071884767, + "loss": 3.7119, + "step": 5730 + }, + { + "epoch": 0.3896589210490556, + "grad_norm": 0.8326202034950256, + "learning_rate": 0.0009513266068759342, + "loss": 3.7298, + "step": 5735 + }, + { + "epoch": 0.3899986411197173, + "grad_norm": 0.6590185165405273, + "learning_rate": 0.0009512841418671016, + "loss": 3.9498, + "step": 5740 + }, + { + "epoch": 0.3903383611903791, + "grad_norm": 0.615993082523346, + "learning_rate": 0.0009512416768582688, + "loss": 3.7168, + "step": 5745 + }, + { + "epoch": 0.3906780812610409, + "grad_norm": 0.677245020866394, + "learning_rate": 0.000951199211849436, + "loss": 3.7493, + "step": 5750 + }, + { + "epoch": 0.39101780133170266, + "grad_norm": 1.3756181001663208, + "learning_rate": 0.0009511567468406033, + "loss": 3.7926, + "step": 5755 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 0.7752735614776611, + "learning_rate": 0.0009511142818317706, + "loss": 3.8492, + "step": 5760 + }, + { + "epoch": 0.3916972414730262, + "grad_norm": 0.6669707298278809, + "learning_rate": 0.000951071816822938, + "loss": 3.5291, + "step": 5765 + }, + { + "epoch": 0.392036961543688, + "grad_norm": 0.7959229946136475, + "learning_rate": 0.0009510293518141052, + "loss": 3.8665, + "step": 5770 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 0.7961986660957336, + "learning_rate": 0.0009509868868052725, + "loss": 3.8701, + "step": 5775 + }, + { + "epoch": 0.39271640168501154, + "grad_norm": 0.7884967923164368, + "learning_rate": 0.0009509444217964398, + "loss": 3.5183, + "step": 5780 + }, + { + "epoch": 0.39305612175567334, + "grad_norm": 0.7456098198890686, + "learning_rate": 0.000950901956787607, + "loss": 3.8663, + "step": 5785 + }, + { + "epoch": 0.3933958418263351, + "grad_norm": 0.8016980290412903, + "learning_rate": 0.0009508594917787742, + "loss": 3.8903, + "step": 5790 + }, + { + "epoch": 0.3937355618969969, + "grad_norm": 0.7217376232147217, + "learning_rate": 0.0009508170267699417, + "loss": 3.7396, + "step": 5795 + }, + { + "epoch": 0.3940752819676587, + "grad_norm": 0.9282112717628479, + "learning_rate": 0.0009507745617611089, + "loss": 3.8801, + "step": 5800 + }, + { + "epoch": 0.3944150020383204, + "grad_norm": 0.9271527528762817, + "learning_rate": 0.0009507320967522761, + "loss": 3.8464, + "step": 5805 + }, + { + "epoch": 0.3947547221089822, + "grad_norm": 0.6905227303504944, + "learning_rate": 0.0009506896317434435, + "loss": 3.7007, + "step": 5810 + }, + { + "epoch": 0.39509444217964396, + "grad_norm": 0.8685694336891174, + "learning_rate": 0.0009506471667346107, + "loss": 3.408, + "step": 5815 + }, + { + "epoch": 0.39543416225030575, + "grad_norm": 0.6663867831230164, + "learning_rate": 0.0009506047017257779, + "loss": 3.8875, + "step": 5820 + }, + { + "epoch": 0.3957738823209675, + "grad_norm": 0.7288834452629089, + "learning_rate": 0.0009505622367169453, + "loss": 3.8247, + "step": 5825 + }, + { + "epoch": 0.3961136023916293, + "grad_norm": 1.0479958057403564, + "learning_rate": 0.0009505197717081126, + "loss": 3.5304, + "step": 5830 + }, + { + "epoch": 0.3964533224622911, + "grad_norm": 0.6757906079292297, + "learning_rate": 0.0009504773066992798, + "loss": 3.8489, + "step": 5835 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 0.7249553799629211, + "learning_rate": 0.0009504348416904472, + "loss": 3.6056, + "step": 5840 + }, + { + "epoch": 0.39713276260361463, + "grad_norm": 0.8358896970748901, + "learning_rate": 0.0009503923766816144, + "loss": 3.6305, + "step": 5845 + }, + { + "epoch": 0.3974724826742764, + "grad_norm": 0.7137343883514404, + "learning_rate": 0.0009503499116727816, + "loss": 3.5923, + "step": 5850 + }, + { + "epoch": 0.39781220274493817, + "grad_norm": 0.7127547264099121, + "learning_rate": 0.0009503074466639489, + "loss": 3.6989, + "step": 5855 + }, + { + "epoch": 0.39815192281559997, + "grad_norm": 0.5736710429191589, + "learning_rate": 0.0009502649816551162, + "loss": 3.6885, + "step": 5860 + }, + { + "epoch": 0.3984916428862617, + "grad_norm": 2.031125783920288, + "learning_rate": 0.0009502225166462835, + "loss": 3.8928, + "step": 5865 + }, + { + "epoch": 0.3988313629569235, + "grad_norm": 0.8502443432807922, + "learning_rate": 0.0009501800516374508, + "loss": 3.6366, + "step": 5870 + }, + { + "epoch": 0.39917108302758525, + "grad_norm": 0.5695399641990662, + "learning_rate": 0.0009501375866286181, + "loss": 3.4589, + "step": 5875 + }, + { + "epoch": 0.39951080309824705, + "grad_norm": 0.7646152973175049, + "learning_rate": 0.0009500951216197853, + "loss": 3.7332, + "step": 5880 + }, + { + "epoch": 0.39985052316890884, + "grad_norm": 0.6613687872886658, + "learning_rate": 0.0009500526566109526, + "loss": 3.9132, + "step": 5885 + }, + { + "epoch": 0.4001902432395706, + "grad_norm": 0.6126554012298584, + "learning_rate": 0.0009500101916021199, + "loss": 3.7193, + "step": 5890 + }, + { + "epoch": 0.4005299633102324, + "grad_norm": 0.7746059894561768, + "learning_rate": 0.0009499677265932871, + "loss": 3.5007, + "step": 5895 + }, + { + "epoch": 0.4008696833808941, + "grad_norm": 0.7825262546539307, + "learning_rate": 0.0009499252615844545, + "loss": 3.6609, + "step": 5900 + }, + { + "epoch": 0.4012094034515559, + "grad_norm": 0.5884436964988708, + "learning_rate": 0.0009498827965756217, + "loss": 3.7637, + "step": 5905 + }, + { + "epoch": 0.40154912352221767, + "grad_norm": 0.7165656089782715, + "learning_rate": 0.000949840331566789, + "loss": 3.7109, + "step": 5910 + }, + { + "epoch": 0.40188884359287946, + "grad_norm": 0.7491403818130493, + "learning_rate": 0.0009497978665579563, + "loss": 3.7773, + "step": 5915 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 0.7385885119438171, + "learning_rate": 0.0009497554015491235, + "loss": 3.8439, + "step": 5920 + }, + { + "epoch": 0.402568283734203, + "grad_norm": 0.6485666036605835, + "learning_rate": 0.0009497129365402908, + "loss": 3.6329, + "step": 5925 + }, + { + "epoch": 0.4029080038048648, + "grad_norm": 0.6797777414321899, + "learning_rate": 0.0009496704715314581, + "loss": 3.6071, + "step": 5930 + }, + { + "epoch": 0.40324772387552654, + "grad_norm": 0.6002844572067261, + "learning_rate": 0.0009496280065226254, + "loss": 3.8685, + "step": 5935 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 0.675060510635376, + "learning_rate": 0.0009495855415137927, + "loss": 3.6562, + "step": 5940 + }, + { + "epoch": 0.40392716401685014, + "grad_norm": 2.28523850440979, + "learning_rate": 0.00094954307650496, + "loss": 3.725, + "step": 5945 + }, + { + "epoch": 0.4042668840875119, + "grad_norm": 0.5777581334114075, + "learning_rate": 0.0009495006114961272, + "loss": 3.9382, + "step": 5950 + }, + { + "epoch": 0.4046066041581737, + "grad_norm": 0.6222342848777771, + "learning_rate": 0.0009494581464872944, + "loss": 3.8063, + "step": 5955 + }, + { + "epoch": 0.4049463242288354, + "grad_norm": 0.709608256816864, + "learning_rate": 0.0009494156814784618, + "loss": 3.7031, + "step": 5960 + }, + { + "epoch": 0.4052860442994972, + "grad_norm": 0.6731900572776794, + "learning_rate": 0.000949373216469629, + "loss": 3.8212, + "step": 5965 + }, + { + "epoch": 0.405625764370159, + "grad_norm": 0.7777144908905029, + "learning_rate": 0.0009493307514607963, + "loss": 3.9838, + "step": 5970 + }, + { + "epoch": 0.40596548444082076, + "grad_norm": 0.7361230254173279, + "learning_rate": 0.0009492882864519637, + "loss": 3.4527, + "step": 5975 + }, + { + "epoch": 0.40630520451148255, + "grad_norm": 0.835973858833313, + "learning_rate": 0.0009492458214431309, + "loss": 3.6635, + "step": 5980 + }, + { + "epoch": 0.4066449245821443, + "grad_norm": 0.6733450889587402, + "learning_rate": 0.0009492033564342981, + "loss": 3.5865, + "step": 5985 + }, + { + "epoch": 0.4069846446528061, + "grad_norm": 0.7952526807785034, + "learning_rate": 0.0009491608914254655, + "loss": 3.9892, + "step": 5990 + }, + { + "epoch": 0.40732436472346784, + "grad_norm": 0.6852453351020813, + "learning_rate": 0.0009491184264166327, + "loss": 3.8619, + "step": 5995 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.6207086443901062, + "learning_rate": 0.0009490759614077999, + "loss": 3.6148, + "step": 6000 + }, + { + "epoch": 0.40800380486479143, + "grad_norm": 0.6955061554908752, + "learning_rate": 0.0009490334963989673, + "loss": 3.7418, + "step": 6005 + }, + { + "epoch": 0.4083435249354532, + "grad_norm": 0.7342314124107361, + "learning_rate": 0.0009489910313901346, + "loss": 3.6903, + "step": 6010 + }, + { + "epoch": 0.40868324500611497, + "grad_norm": 0.6419764161109924, + "learning_rate": 0.0009489485663813018, + "loss": 3.6945, + "step": 6015 + }, + { + "epoch": 0.4090229650767767, + "grad_norm": 0.7123669981956482, + "learning_rate": 0.0009489061013724691, + "loss": 3.749, + "step": 6020 + }, + { + "epoch": 0.4093626851474385, + "grad_norm": 0.7489724159240723, + "learning_rate": 0.0009488636363636364, + "loss": 3.5993, + "step": 6025 + }, + { + "epoch": 0.4097024052181003, + "grad_norm": 0.8198326230049133, + "learning_rate": 0.0009488211713548036, + "loss": 3.5582, + "step": 6030 + }, + { + "epoch": 0.41004212528876205, + "grad_norm": 0.6694689989089966, + "learning_rate": 0.0009487787063459709, + "loss": 3.6663, + "step": 6035 + }, + { + "epoch": 0.41038184535942385, + "grad_norm": 1.001598596572876, + "learning_rate": 0.0009487362413371383, + "loss": 3.6815, + "step": 6040 + }, + { + "epoch": 0.4107215654300856, + "grad_norm": 0.7290699481964111, + "learning_rate": 0.0009486937763283055, + "loss": 3.713, + "step": 6045 + }, + { + "epoch": 0.4110612855007474, + "grad_norm": 0.7787067890167236, + "learning_rate": 0.0009486513113194728, + "loss": 3.7469, + "step": 6050 + }, + { + "epoch": 0.4114010055714092, + "grad_norm": 3.026517629623413, + "learning_rate": 0.00094860884631064, + "loss": 3.5905, + "step": 6055 + }, + { + "epoch": 0.4117407256420709, + "grad_norm": 0.6710273623466492, + "learning_rate": 0.0009485663813018073, + "loss": 3.5288, + "step": 6060 + }, + { + "epoch": 0.4120804457127327, + "grad_norm": 0.9746031165122986, + "learning_rate": 0.0009485239162929746, + "loss": 3.7817, + "step": 6065 + }, + { + "epoch": 0.41242016578339447, + "grad_norm": 0.7773796916007996, + "learning_rate": 0.0009484814512841418, + "loss": 4.008, + "step": 6070 + }, + { + "epoch": 0.41275988585405626, + "grad_norm": 0.7256516814231873, + "learning_rate": 0.0009484389862753092, + "loss": 3.2691, + "step": 6075 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 0.9408746361732483, + "learning_rate": 0.0009483965212664765, + "loss": 3.8523, + "step": 6080 + }, + { + "epoch": 0.4134393259953798, + "grad_norm": 0.8501414656639099, + "learning_rate": 0.0009483540562576437, + "loss": 3.7026, + "step": 6085 + }, + { + "epoch": 0.4137790460660416, + "grad_norm": 0.6323081254959106, + "learning_rate": 0.0009483115912488109, + "loss": 3.8937, + "step": 6090 + }, + { + "epoch": 0.41411876613670334, + "grad_norm": 0.9157864451408386, + "learning_rate": 0.0009482691262399783, + "loss": 3.6403, + "step": 6095 + }, + { + "epoch": 0.41445848620736514, + "grad_norm": 0.617572009563446, + "learning_rate": 0.0009482266612311455, + "loss": 3.5998, + "step": 6100 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 0.5865985751152039, + "learning_rate": 0.0009481841962223128, + "loss": 3.5068, + "step": 6105 + }, + { + "epoch": 0.4151379263486887, + "grad_norm": 0.6573167443275452, + "learning_rate": 0.0009481417312134802, + "loss": 3.6507, + "step": 6110 + }, + { + "epoch": 0.4154776464193505, + "grad_norm": 0.943011462688446, + "learning_rate": 0.0009480992662046474, + "loss": 3.598, + "step": 6115 + }, + { + "epoch": 0.4158173664900122, + "grad_norm": 0.6835089325904846, + "learning_rate": 0.0009480568011958147, + "loss": 3.9407, + "step": 6120 + }, + { + "epoch": 0.416157086560674, + "grad_norm": 0.6500422358512878, + "learning_rate": 0.000948014336186982, + "loss": 3.7285, + "step": 6125 + }, + { + "epoch": 0.41649680663133576, + "grad_norm": 0.6260750889778137, + "learning_rate": 0.0009479718711781492, + "loss": 3.5296, + "step": 6130 + }, + { + "epoch": 0.41683652670199756, + "grad_norm": 0.817827582359314, + "learning_rate": 0.0009479294061693165, + "loss": 3.5726, + "step": 6135 + }, + { + "epoch": 0.41717624677265935, + "grad_norm": 0.6850476264953613, + "learning_rate": 0.0009478869411604837, + "loss": 3.7562, + "step": 6140 + }, + { + "epoch": 0.4175159668433211, + "grad_norm": 0.720156192779541, + "learning_rate": 0.0009478444761516511, + "loss": 3.7822, + "step": 6145 + }, + { + "epoch": 0.4178556869139829, + "grad_norm": 1.0502398014068604, + "learning_rate": 0.0009478020111428184, + "loss": 3.8218, + "step": 6150 + }, + { + "epoch": 0.41819540698464464, + "grad_norm": 0.7193793058395386, + "learning_rate": 0.0009477595461339856, + "loss": 3.6011, + "step": 6155 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 0.6768472194671631, + "learning_rate": 0.0009477170811251529, + "loss": 3.7295, + "step": 6160 + }, + { + "epoch": 0.4188748471259682, + "grad_norm": 0.7362995743751526, + "learning_rate": 0.0009476746161163202, + "loss": 3.9129, + "step": 6165 + }, + { + "epoch": 0.41921456719663, + "grad_norm": 0.7388405203819275, + "learning_rate": 0.0009476321511074874, + "loss": 3.7678, + "step": 6170 + }, + { + "epoch": 0.41955428726729177, + "grad_norm": 0.8284323215484619, + "learning_rate": 0.0009475896860986547, + "loss": 3.8068, + "step": 6175 + }, + { + "epoch": 0.4198940073379535, + "grad_norm": 0.6240724921226501, + "learning_rate": 0.0009475472210898221, + "loss": 3.6619, + "step": 6180 + }, + { + "epoch": 0.4202337274086153, + "grad_norm": 0.6074116230010986, + "learning_rate": 0.0009475047560809893, + "loss": 3.6931, + "step": 6185 + }, + { + "epoch": 0.42057344747927705, + "grad_norm": 0.7071591019630432, + "learning_rate": 0.0009474622910721565, + "loss": 3.5296, + "step": 6190 + }, + { + "epoch": 0.42091316754993885, + "grad_norm": 0.5276950597763062, + "learning_rate": 0.0009474198260633239, + "loss": 3.7415, + "step": 6195 + }, + { + "epoch": 0.42125288762060065, + "grad_norm": 0.656801164150238, + "learning_rate": 0.0009473773610544911, + "loss": 3.6596, + "step": 6200 + }, + { + "epoch": 0.4215926076912624, + "grad_norm": 1.0711467266082764, + "learning_rate": 0.0009473348960456583, + "loss": 3.6748, + "step": 6205 + }, + { + "epoch": 0.4219323277619242, + "grad_norm": 0.6661550402641296, + "learning_rate": 0.0009472924310368257, + "loss": 3.6387, + "step": 6210 + }, + { + "epoch": 0.42227204783258593, + "grad_norm": 0.7838786840438843, + "learning_rate": 0.000947249966027993, + "loss": 3.456, + "step": 6215 + }, + { + "epoch": 0.4226117679032477, + "grad_norm": 0.6787034869194031, + "learning_rate": 0.0009472075010191602, + "loss": 3.6875, + "step": 6220 + }, + { + "epoch": 0.4229514879739095, + "grad_norm": 0.6901087760925293, + "learning_rate": 0.0009471650360103276, + "loss": 3.5455, + "step": 6225 + }, + { + "epoch": 0.42329120804457127, + "grad_norm": 0.6064029335975647, + "learning_rate": 0.0009471225710014948, + "loss": 3.6781, + "step": 6230 + }, + { + "epoch": 0.42363092811523306, + "grad_norm": 0.6404651999473572, + "learning_rate": 0.000947080105992662, + "loss": 3.6796, + "step": 6235 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 0.7489857077598572, + "learning_rate": 0.0009470376409838293, + "loss": 3.6066, + "step": 6240 + }, + { + "epoch": 0.4243103682565566, + "grad_norm": 0.8165284395217896, + "learning_rate": 0.0009469951759749966, + "loss": 3.6586, + "step": 6245 + }, + { + "epoch": 0.42465008832721834, + "grad_norm": 0.6606987714767456, + "learning_rate": 0.0009469527109661639, + "loss": 3.6124, + "step": 6250 + }, + { + "epoch": 0.42498980839788014, + "grad_norm": 0.6074065566062927, + "learning_rate": 0.0009469102459573312, + "loss": 3.6831, + "step": 6255 + }, + { + "epoch": 0.42532952846854194, + "grad_norm": 0.8217315673828125, + "learning_rate": 0.0009468677809484985, + "loss": 3.3544, + "step": 6260 + }, + { + "epoch": 0.4256692485392037, + "grad_norm": 0.5456224679946899, + "learning_rate": 0.0009468253159396657, + "loss": 3.5061, + "step": 6265 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 0.8272515535354614, + "learning_rate": 0.000946782850930833, + "loss": 3.8934, + "step": 6270 + }, + { + "epoch": 0.4263486886805272, + "grad_norm": 0.6489728689193726, + "learning_rate": 0.0009467403859220003, + "loss": 3.5362, + "step": 6275 + }, + { + "epoch": 0.426688408751189, + "grad_norm": 0.8700988292694092, + "learning_rate": 0.0009466979209131675, + "loss": 3.6272, + "step": 6280 + }, + { + "epoch": 0.4270281288218508, + "grad_norm": 0.841989278793335, + "learning_rate": 0.0009466554559043349, + "loss": 3.8479, + "step": 6285 + }, + { + "epoch": 0.42736784889251256, + "grad_norm": 0.7738193273544312, + "learning_rate": 0.0009466129908955021, + "loss": 3.6357, + "step": 6290 + }, + { + "epoch": 0.42770756896317436, + "grad_norm": 0.630150556564331, + "learning_rate": 0.0009465705258866694, + "loss": 3.7716, + "step": 6295 + }, + { + "epoch": 0.4280472890338361, + "grad_norm": 0.5808531045913696, + "learning_rate": 0.0009465280608778367, + "loss": 3.6844, + "step": 6300 + }, + { + "epoch": 0.4283870091044979, + "grad_norm": 0.6296698451042175, + "learning_rate": 0.0009464855958690039, + "loss": 3.3742, + "step": 6305 + }, + { + "epoch": 0.4287267291751597, + "grad_norm": 0.6392558813095093, + "learning_rate": 0.0009464431308601712, + "loss": 4.0005, + "step": 6310 + }, + { + "epoch": 0.42906644924582144, + "grad_norm": 0.665912389755249, + "learning_rate": 0.0009464006658513386, + "loss": 3.7679, + "step": 6315 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 0.7042819857597351, + "learning_rate": 0.0009463582008425058, + "loss": 3.6548, + "step": 6320 + }, + { + "epoch": 0.429745889387145, + "grad_norm": 0.8785261511802673, + "learning_rate": 0.0009463157358336731, + "loss": 3.5797, + "step": 6325 + }, + { + "epoch": 0.4300856094578068, + "grad_norm": 0.9008347988128662, + "learning_rate": 0.0009462732708248404, + "loss": 3.6255, + "step": 6330 + }, + { + "epoch": 0.4304253295284685, + "grad_norm": 0.7271003127098083, + "learning_rate": 0.0009462308058160076, + "loss": 3.7316, + "step": 6335 + }, + { + "epoch": 0.4307650495991303, + "grad_norm": 1.339601993560791, + "learning_rate": 0.0009461883408071748, + "loss": 3.7519, + "step": 6340 + }, + { + "epoch": 0.4311047696697921, + "grad_norm": 0.765197217464447, + "learning_rate": 0.0009461458757983422, + "loss": 3.5507, + "step": 6345 + }, + { + "epoch": 0.43144448974045385, + "grad_norm": 0.9495841860771179, + "learning_rate": 0.0009461034107895095, + "loss": 3.5906, + "step": 6350 + }, + { + "epoch": 0.43178420981111565, + "grad_norm": 0.7271772027015686, + "learning_rate": 0.0009460609457806767, + "loss": 3.6437, + "step": 6355 + }, + { + "epoch": 0.4321239298817774, + "grad_norm": 0.7216455936431885, + "learning_rate": 0.0009460184807718441, + "loss": 3.4058, + "step": 6360 + }, + { + "epoch": 0.4324636499524392, + "grad_norm": 0.6468266248703003, + "learning_rate": 0.0009459760157630113, + "loss": 3.7133, + "step": 6365 + }, + { + "epoch": 0.432803370023101, + "grad_norm": 0.816789984703064, + "learning_rate": 0.0009459335507541785, + "loss": 3.3991, + "step": 6370 + }, + { + "epoch": 0.43314309009376273, + "grad_norm": 0.8391227126121521, + "learning_rate": 0.0009458910857453459, + "loss": 3.6193, + "step": 6375 + }, + { + "epoch": 0.4334828101644245, + "grad_norm": 0.752625048160553, + "learning_rate": 0.0009458486207365131, + "loss": 3.7446, + "step": 6380 + }, + { + "epoch": 0.43382253023508627, + "grad_norm": 0.8280352354049683, + "learning_rate": 0.0009458061557276804, + "loss": 3.7452, + "step": 6385 + }, + { + "epoch": 0.43416225030574807, + "grad_norm": 0.844526469707489, + "learning_rate": 0.0009457636907188478, + "loss": 3.6496, + "step": 6390 + }, + { + "epoch": 0.43450197037640986, + "grad_norm": 0.6423800587654114, + "learning_rate": 0.000945721225710015, + "loss": 3.8373, + "step": 6395 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 0.7412212491035461, + "learning_rate": 0.0009456787607011822, + "loss": 3.633, + "step": 6400 + }, + { + "epoch": 0.4351814105177334, + "grad_norm": 0.6095122694969177, + "learning_rate": 0.0009456362956923495, + "loss": 3.7099, + "step": 6405 + }, + { + "epoch": 0.43552113058839514, + "grad_norm": 0.6678140163421631, + "learning_rate": 0.0009455938306835168, + "loss": 3.579, + "step": 6410 + }, + { + "epoch": 0.43586085065905694, + "grad_norm": 0.7172609567642212, + "learning_rate": 0.000945551365674684, + "loss": 3.6256, + "step": 6415 + }, + { + "epoch": 0.4362005707297187, + "grad_norm": 0.6448911428451538, + "learning_rate": 0.0009455089006658514, + "loss": 3.5773, + "step": 6420 + }, + { + "epoch": 0.4365402908003805, + "grad_norm": 0.7384248375892639, + "learning_rate": 0.0009454664356570187, + "loss": 3.5637, + "step": 6425 + }, + { + "epoch": 0.4368800108710423, + "grad_norm": 0.6744754314422607, + "learning_rate": 0.0009454239706481859, + "loss": 3.8973, + "step": 6430 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 0.6267385482788086, + "learning_rate": 0.0009453815056393532, + "loss": 3.756, + "step": 6435 + }, + { + "epoch": 0.4375594510123658, + "grad_norm": 0.7856734395027161, + "learning_rate": 0.0009453390406305204, + "loss": 3.5795, + "step": 6440 + }, + { + "epoch": 0.43789917108302756, + "grad_norm": 0.9356297254562378, + "learning_rate": 0.0009452965756216878, + "loss": 3.6803, + "step": 6445 + }, + { + "epoch": 0.43823889115368936, + "grad_norm": 0.8816167116165161, + "learning_rate": 0.000945254110612855, + "loss": 3.6637, + "step": 6450 + }, + { + "epoch": 0.43857861122435116, + "grad_norm": 0.6949406862258911, + "learning_rate": 0.0009452116456040223, + "loss": 3.9039, + "step": 6455 + }, + { + "epoch": 0.4389183312950129, + "grad_norm": 0.7305207252502441, + "learning_rate": 0.0009451691805951897, + "loss": 3.511, + "step": 6460 + }, + { + "epoch": 0.4392580513656747, + "grad_norm": 0.7531835436820984, + "learning_rate": 0.0009451267155863569, + "loss": 3.6988, + "step": 6465 + }, + { + "epoch": 0.43959777143633644, + "grad_norm": 0.9541970491409302, + "learning_rate": 0.0009450842505775241, + "loss": 3.5167, + "step": 6470 + }, + { + "epoch": 0.43993749150699824, + "grad_norm": 0.7716962099075317, + "learning_rate": 0.0009450417855686915, + "loss": 3.8739, + "step": 6475 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 0.6692058444023132, + "learning_rate": 0.0009449993205598587, + "loss": 3.8868, + "step": 6480 + }, + { + "epoch": 0.4406169316483218, + "grad_norm": 0.671801745891571, + "learning_rate": 0.0009449568555510259, + "loss": 3.4523, + "step": 6485 + }, + { + "epoch": 0.44095665171898357, + "grad_norm": 0.7335435152053833, + "learning_rate": 0.0009449143905421934, + "loss": 3.9526, + "step": 6490 + }, + { + "epoch": 0.4412963717896453, + "grad_norm": 0.8249445557594299, + "learning_rate": 0.0009448719255333606, + "loss": 3.6853, + "step": 6495 + }, + { + "epoch": 0.4416360918603071, + "grad_norm": 0.8377898335456848, + "learning_rate": 0.0009448294605245278, + "loss": 3.9246, + "step": 6500 + }, + { + "epoch": 0.4419758119309689, + "grad_norm": 0.7699383497238159, + "learning_rate": 0.0009447869955156951, + "loss": 3.5128, + "step": 6505 + }, + { + "epoch": 0.44231553200163065, + "grad_norm": 0.8202624917030334, + "learning_rate": 0.0009447445305068624, + "loss": 3.841, + "step": 6510 + }, + { + "epoch": 0.44265525207229245, + "grad_norm": 0.6758928298950195, + "learning_rate": 0.0009447020654980296, + "loss": 3.5693, + "step": 6515 + }, + { + "epoch": 0.4429949721429542, + "grad_norm": 0.7316200733184814, + "learning_rate": 0.0009446596004891969, + "loss": 3.4346, + "step": 6520 + }, + { + "epoch": 0.443334692213616, + "grad_norm": 0.8385757207870483, + "learning_rate": 0.0009446171354803643, + "loss": 3.8284, + "step": 6525 + }, + { + "epoch": 0.44367441228427773, + "grad_norm": 0.6616659164428711, + "learning_rate": 0.0009445746704715315, + "loss": 3.635, + "step": 6530 + }, + { + "epoch": 0.44401413235493953, + "grad_norm": 0.8081826567649841, + "learning_rate": 0.0009445322054626988, + "loss": 3.5189, + "step": 6535 + }, + { + "epoch": 0.4443538524256013, + "grad_norm": 0.7155320048332214, + "learning_rate": 0.000944489740453866, + "loss": 3.5625, + "step": 6540 + }, + { + "epoch": 0.44469357249626307, + "grad_norm": 0.964938759803772, + "learning_rate": 0.0009444472754450333, + "loss": 3.7905, + "step": 6545 + }, + { + "epoch": 0.44503329256692487, + "grad_norm": 0.738115668296814, + "learning_rate": 0.0009444048104362006, + "loss": 3.4152, + "step": 6550 + }, + { + "epoch": 0.4453730126375866, + "grad_norm": 0.789850115776062, + "learning_rate": 0.0009443623454273678, + "loss": 3.8162, + "step": 6555 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 0.7045989036560059, + "learning_rate": 0.0009443198804185352, + "loss": 3.9199, + "step": 6560 + }, + { + "epoch": 0.4460524527789102, + "grad_norm": 0.6744515299797058, + "learning_rate": 0.0009442774154097025, + "loss": 3.3915, + "step": 6565 + }, + { + "epoch": 0.44639217284957194, + "grad_norm": 0.669938325881958, + "learning_rate": 0.0009442349504008697, + "loss": 3.5268, + "step": 6570 + }, + { + "epoch": 0.44673189292023374, + "grad_norm": 0.8120308518409729, + "learning_rate": 0.000944192485392037, + "loss": 3.7824, + "step": 6575 + }, + { + "epoch": 0.4470716129908955, + "grad_norm": 0.7444579005241394, + "learning_rate": 0.0009441500203832043, + "loss": 3.6022, + "step": 6580 + }, + { + "epoch": 0.4474113330615573, + "grad_norm": 0.790796160697937, + "learning_rate": 0.0009441075553743715, + "loss": 3.5898, + "step": 6585 + }, + { + "epoch": 0.4477510531322191, + "grad_norm": 0.6905165314674377, + "learning_rate": 0.0009440650903655387, + "loss": 3.6271, + "step": 6590 + }, + { + "epoch": 0.4480907732028808, + "grad_norm": 0.627794086933136, + "learning_rate": 0.0009440226253567062, + "loss": 3.6511, + "step": 6595 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 0.7536472082138062, + "learning_rate": 0.0009439801603478734, + "loss": 3.8871, + "step": 6600 + }, + { + "epoch": 0.44877021334420436, + "grad_norm": 0.653590977191925, + "learning_rate": 0.0009439376953390406, + "loss": 3.5676, + "step": 6605 + }, + { + "epoch": 0.44910993341486616, + "grad_norm": 0.695195198059082, + "learning_rate": 0.000943895230330208, + "loss": 3.6161, + "step": 6610 + }, + { + "epoch": 0.4494496534855279, + "grad_norm": 0.6932690143585205, + "learning_rate": 0.0009438527653213752, + "loss": 3.869, + "step": 6615 + }, + { + "epoch": 0.4497893735561897, + "grad_norm": 0.6617353558540344, + "learning_rate": 0.0009438103003125424, + "loss": 3.6891, + "step": 6620 + }, + { + "epoch": 0.4501290936268515, + "grad_norm": 0.9754509925842285, + "learning_rate": 0.0009437678353037098, + "loss": 3.7818, + "step": 6625 + }, + { + "epoch": 0.45046881369751324, + "grad_norm": 0.8326768279075623, + "learning_rate": 0.0009437253702948771, + "loss": 3.8185, + "step": 6630 + }, + { + "epoch": 0.45080853376817503, + "grad_norm": 1.079615831375122, + "learning_rate": 0.0009436829052860443, + "loss": 3.7191, + "step": 6635 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 0.7331617474555969, + "learning_rate": 0.0009436404402772116, + "loss": 3.7863, + "step": 6640 + }, + { + "epoch": 0.4514879739094986, + "grad_norm": 0.9490726590156555, + "learning_rate": 0.0009435979752683789, + "loss": 3.4744, + "step": 6645 + }, + { + "epoch": 0.45182769398016037, + "grad_norm": 0.7302230000495911, + "learning_rate": 0.0009435555102595461, + "loss": 3.8193, + "step": 6650 + }, + { + "epoch": 0.4521674140508221, + "grad_norm": 0.8122956156730652, + "learning_rate": 0.0009435130452507134, + "loss": 3.6671, + "step": 6655 + }, + { + "epoch": 0.4525071341214839, + "grad_norm": 0.7379894852638245, + "learning_rate": 0.0009434705802418807, + "loss": 3.4731, + "step": 6660 + }, + { + "epoch": 0.45284685419214565, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.000943428115233048, + "loss": 3.515, + "step": 6665 + }, + { + "epoch": 0.45318657426280745, + "grad_norm": 0.8374200463294983, + "learning_rate": 0.0009433856502242153, + "loss": 3.7868, + "step": 6670 + }, + { + "epoch": 0.45352629433346925, + "grad_norm": 0.7800910472869873, + "learning_rate": 0.0009433431852153826, + "loss": 3.9725, + "step": 6675 + }, + { + "epoch": 0.453866014404131, + "grad_norm": 0.8169853687286377, + "learning_rate": 0.0009433007202065498, + "loss": 3.7076, + "step": 6680 + }, + { + "epoch": 0.4542057344747928, + "grad_norm": 0.6666299700737, + "learning_rate": 0.0009432582551977171, + "loss": 3.5033, + "step": 6685 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.7712122797966003, + "learning_rate": 0.0009432157901888843, + "loss": 3.5398, + "step": 6690 + }, + { + "epoch": 0.45488517461611633, + "grad_norm": 0.9505596160888672, + "learning_rate": 0.0009431733251800516, + "loss": 3.7849, + "step": 6695 + }, + { + "epoch": 0.45522489468677807, + "grad_norm": 0.7971099615097046, + "learning_rate": 0.000943130860171219, + "loss": 3.4126, + "step": 6700 + }, + { + "epoch": 0.45556461475743987, + "grad_norm": 1.0835922956466675, + "learning_rate": 0.0009430883951623862, + "loss": 3.5741, + "step": 6705 + }, + { + "epoch": 0.45590433482810166, + "grad_norm": 0.791041910648346, + "learning_rate": 0.0009430459301535535, + "loss": 3.7474, + "step": 6710 + }, + { + "epoch": 0.4562440548987634, + "grad_norm": 0.674280047416687, + "learning_rate": 0.0009430034651447208, + "loss": 3.676, + "step": 6715 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 0.7824015617370605, + "learning_rate": 0.000942961000135888, + "loss": 3.6451, + "step": 6720 + }, + { + "epoch": 0.45692349504008695, + "grad_norm": 0.8051427006721497, + "learning_rate": 0.0009429185351270552, + "loss": 3.5665, + "step": 6725 + }, + { + "epoch": 0.45726321511074874, + "grad_norm": 0.6609389185905457, + "learning_rate": 0.0009428760701182226, + "loss": 3.7426, + "step": 6730 + }, + { + "epoch": 0.45760293518141054, + "grad_norm": 0.9533427953720093, + "learning_rate": 0.0009428336051093899, + "loss": 3.7663, + "step": 6735 + }, + { + "epoch": 0.4579426552520723, + "grad_norm": 0.8080598711967468, + "learning_rate": 0.0009427911401005571, + "loss": 3.711, + "step": 6740 + }, + { + "epoch": 0.4582823753227341, + "grad_norm": 0.8367811441421509, + "learning_rate": 0.0009427486750917245, + "loss": 3.7052, + "step": 6745 + }, + { + "epoch": 0.4586220953933958, + "grad_norm": 0.6397939920425415, + "learning_rate": 0.0009427062100828917, + "loss": 3.7909, + "step": 6750 + }, + { + "epoch": 0.4589618154640576, + "grad_norm": 0.6705583930015564, + "learning_rate": 0.0009426637450740589, + "loss": 3.8097, + "step": 6755 + }, + { + "epoch": 0.4593015355347194, + "grad_norm": 0.7639762759208679, + "learning_rate": 0.0009426212800652263, + "loss": 3.3962, + "step": 6760 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 0.8546205163002014, + "learning_rate": 0.0009425788150563935, + "loss": 3.73, + "step": 6765 + }, + { + "epoch": 0.45998097567604296, + "grad_norm": 0.8888900876045227, + "learning_rate": 0.0009425363500475608, + "loss": 3.7983, + "step": 6770 + }, + { + "epoch": 0.4603206957467047, + "grad_norm": 0.7186718583106995, + "learning_rate": 0.0009424938850387282, + "loss": 3.9008, + "step": 6775 + }, + { + "epoch": 0.4606604158173665, + "grad_norm": 0.7613282799720764, + "learning_rate": 0.0009424514200298954, + "loss": 3.8355, + "step": 6780 + }, + { + "epoch": 0.46100013588802824, + "grad_norm": 0.7104913592338562, + "learning_rate": 0.0009424089550210627, + "loss": 3.7237, + "step": 6785 + }, + { + "epoch": 0.46133985595869004, + "grad_norm": 0.8852415084838867, + "learning_rate": 0.0009423664900122299, + "loss": 3.564, + "step": 6790 + }, + { + "epoch": 0.46167957602935183, + "grad_norm": 0.8393285274505615, + "learning_rate": 0.0009423240250033972, + "loss": 3.5721, + "step": 6795 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 0.7575976848602295, + "learning_rate": 0.0009422815599945645, + "loss": 3.7434, + "step": 6800 + }, + { + "epoch": 0.4623590161706754, + "grad_norm": 0.8561558127403259, + "learning_rate": 0.0009422390949857318, + "loss": 3.5232, + "step": 6805 + }, + { + "epoch": 0.4626987362413371, + "grad_norm": 0.8865601420402527, + "learning_rate": 0.0009421966299768991, + "loss": 3.5825, + "step": 6810 + }, + { + "epoch": 0.4630384563119989, + "grad_norm": 0.7860935926437378, + "learning_rate": 0.0009421541649680664, + "loss": 3.4413, + "step": 6815 + }, + { + "epoch": 0.4633781763826607, + "grad_norm": 6.963569641113281, + "learning_rate": 0.0009421116999592336, + "loss": 4.0295, + "step": 6820 + }, + { + "epoch": 0.46371789645332245, + "grad_norm": 0.8232359886169434, + "learning_rate": 0.0009420692349504008, + "loss": 3.8233, + "step": 6825 + }, + { + "epoch": 0.46405761652398425, + "grad_norm": 1.99703848361969, + "learning_rate": 0.0009420267699415682, + "loss": 3.6404, + "step": 6830 + }, + { + "epoch": 0.464397336594646, + "grad_norm": 0.7334964275360107, + "learning_rate": 0.0009419843049327354, + "loss": 3.5374, + "step": 6835 + }, + { + "epoch": 0.4647370566653078, + "grad_norm": 0.6888419389724731, + "learning_rate": 0.0009419418399239027, + "loss": 3.8283, + "step": 6840 + }, + { + "epoch": 0.4650767767359696, + "grad_norm": 0.6308257579803467, + "learning_rate": 0.0009418993749150701, + "loss": 3.6234, + "step": 6845 + }, + { + "epoch": 0.46541649680663133, + "grad_norm": 0.801514208316803, + "learning_rate": 0.0009418569099062373, + "loss": 3.7915, + "step": 6850 + }, + { + "epoch": 0.4657562168772931, + "grad_norm": 0.8150063753128052, + "learning_rate": 0.0009418144448974045, + "loss": 3.7033, + "step": 6855 + }, + { + "epoch": 0.46609593694795487, + "grad_norm": 0.8454571962356567, + "learning_rate": 0.0009417719798885719, + "loss": 3.8013, + "step": 6860 + }, + { + "epoch": 0.46643565701861667, + "grad_norm": 0.6461668610572815, + "learning_rate": 0.0009417295148797391, + "loss": 3.6288, + "step": 6865 + }, + { + "epoch": 0.4667753770892784, + "grad_norm": 0.7563257217407227, + "learning_rate": 0.0009416870498709063, + "loss": 3.5812, + "step": 6870 + }, + { + "epoch": 0.4671150971599402, + "grad_norm": 1.4337533712387085, + "learning_rate": 0.0009416445848620738, + "loss": 3.3616, + "step": 6875 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 0.8615954518318176, + "learning_rate": 0.000941602119853241, + "loss": 3.4659, + "step": 6880 + }, + { + "epoch": 0.46779453730126375, + "grad_norm": 0.8438326120376587, + "learning_rate": 0.0009415596548444082, + "loss": 3.6612, + "step": 6885 + }, + { + "epoch": 0.46813425737192554, + "grad_norm": 0.6650198101997375, + "learning_rate": 0.0009415171898355755, + "loss": 3.7417, + "step": 6890 + }, + { + "epoch": 0.4684739774425873, + "grad_norm": 0.8018618226051331, + "learning_rate": 0.0009414747248267428, + "loss": 3.7458, + "step": 6895 + }, + { + "epoch": 0.4688136975132491, + "grad_norm": 0.8022788166999817, + "learning_rate": 0.00094143225981791, + "loss": 3.8749, + "step": 6900 + }, + { + "epoch": 0.4691534175839109, + "grad_norm": 0.7730206251144409, + "learning_rate": 0.0009413897948090774, + "loss": 3.8531, + "step": 6905 + }, + { + "epoch": 0.4694931376545726, + "grad_norm": 0.9486798048019409, + "learning_rate": 0.0009413473298002447, + "loss": 3.7315, + "step": 6910 + }, + { + "epoch": 0.4698328577252344, + "grad_norm": 0.7506369948387146, + "learning_rate": 0.0009413048647914119, + "loss": 3.7185, + "step": 6915 + }, + { + "epoch": 0.47017257779589616, + "grad_norm": 0.7792893648147583, + "learning_rate": 0.0009412623997825792, + "loss": 3.5069, + "step": 6920 + }, + { + "epoch": 0.47051229786655796, + "grad_norm": 0.6426275372505188, + "learning_rate": 0.0009412199347737464, + "loss": 3.6856, + "step": 6925 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 1.1287964582443237, + "learning_rate": 0.0009411774697649137, + "loss": 3.7541, + "step": 6930 + }, + { + "epoch": 0.4711917380078815, + "grad_norm": 0.7354869842529297, + "learning_rate": 0.000941135004756081, + "loss": 3.618, + "step": 6935 + }, + { + "epoch": 0.4715314580785433, + "grad_norm": 0.6764058470726013, + "learning_rate": 0.0009410925397472483, + "loss": 3.6902, + "step": 6940 + }, + { + "epoch": 0.47187117814920504, + "grad_norm": 0.9034955501556396, + "learning_rate": 0.0009410500747384156, + "loss": 3.8755, + "step": 6945 + }, + { + "epoch": 0.47221089821986684, + "grad_norm": 0.7983381748199463, + "learning_rate": 0.0009410076097295829, + "loss": 3.5908, + "step": 6950 + }, + { + "epoch": 0.4725506182905286, + "grad_norm": 0.8330667614936829, + "learning_rate": 0.0009409651447207501, + "loss": 3.9564, + "step": 6955 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 1.088700771331787, + "learning_rate": 0.0009409226797119174, + "loss": 3.8533, + "step": 6960 + }, + { + "epoch": 0.4732300584318522, + "grad_norm": 0.9549254179000854, + "learning_rate": 0.0009408802147030847, + "loss": 3.8088, + "step": 6965 + }, + { + "epoch": 0.4735697785025139, + "grad_norm": 0.6411834359169006, + "learning_rate": 0.0009408377496942519, + "loss": 3.5196, + "step": 6970 + }, + { + "epoch": 0.4739094985731757, + "grad_norm": 0.8803831934928894, + "learning_rate": 0.0009407952846854192, + "loss": 3.6967, + "step": 6975 + }, + { + "epoch": 0.47424921864383746, + "grad_norm": 0.6394052505493164, + "learning_rate": 0.0009407528196765866, + "loss": 3.9295, + "step": 6980 + }, + { + "epoch": 0.47458893871449925, + "grad_norm": 0.7032030820846558, + "learning_rate": 0.0009407103546677538, + "loss": 3.6363, + "step": 6985 + }, + { + "epoch": 0.47492865878516105, + "grad_norm": 0.6784736514091492, + "learning_rate": 0.000940667889658921, + "loss": 3.6755, + "step": 6990 + }, + { + "epoch": 0.4752683788558228, + "grad_norm": 0.8658331036567688, + "learning_rate": 0.0009406254246500884, + "loss": 3.3278, + "step": 6995 + }, + { + "epoch": 0.4756080989264846, + "grad_norm": 0.9446788430213928, + "learning_rate": 0.0009405829596412556, + "loss": 3.3408, + "step": 7000 + }, + { + "epoch": 0.47594781899714633, + "grad_norm": 0.9767313599586487, + "learning_rate": 0.0009405404946324228, + "loss": 4.0563, + "step": 7005 + }, + { + "epoch": 0.47628753906780813, + "grad_norm": 0.8589836955070496, + "learning_rate": 0.0009404980296235903, + "loss": 3.8317, + "step": 7010 + }, + { + "epoch": 0.4766272591384699, + "grad_norm": 0.8191371560096741, + "learning_rate": 0.0009404555646147575, + "loss": 3.6842, + "step": 7015 + }, + { + "epoch": 0.47696697920913167, + "grad_norm": 0.8298940658569336, + "learning_rate": 0.0009404130996059247, + "loss": 3.2965, + "step": 7020 + }, + { + "epoch": 0.47730669927979347, + "grad_norm": 0.6819496154785156, + "learning_rate": 0.000940370634597092, + "loss": 3.7643, + "step": 7025 + }, + { + "epoch": 0.4776464193504552, + "grad_norm": 0.7594950795173645, + "learning_rate": 0.0009403281695882593, + "loss": 3.7032, + "step": 7030 + }, + { + "epoch": 0.477986139421117, + "grad_norm": 0.9817823171615601, + "learning_rate": 0.0009402857045794265, + "loss": 3.5349, + "step": 7035 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 0.7096978425979614, + "learning_rate": 0.0009402432395705938, + "loss": 3.5846, + "step": 7040 + }, + { + "epoch": 0.47866557956244055, + "grad_norm": 1.0412415266036987, + "learning_rate": 0.0009402007745617612, + "loss": 3.7365, + "step": 7045 + }, + { + "epoch": 0.47900529963310234, + "grad_norm": 0.6880819797515869, + "learning_rate": 0.0009401583095529284, + "loss": 3.6949, + "step": 7050 + }, + { + "epoch": 0.4793450197037641, + "grad_norm": 0.7712616324424744, + "learning_rate": 0.0009401158445440957, + "loss": 3.8146, + "step": 7055 + }, + { + "epoch": 0.4796847397744259, + "grad_norm": 0.6539680361747742, + "learning_rate": 0.000940073379535263, + "loss": 3.7358, + "step": 7060 + }, + { + "epoch": 0.4800244598450876, + "grad_norm": 0.6903494596481323, + "learning_rate": 0.0009400309145264302, + "loss": 3.599, + "step": 7065 + }, + { + "epoch": 0.4803641799157494, + "grad_norm": 0.8483293652534485, + "learning_rate": 0.0009399884495175975, + "loss": 3.465, + "step": 7070 + }, + { + "epoch": 0.4807038999864112, + "grad_norm": 0.5895728468894958, + "learning_rate": 0.0009399459845087647, + "loss": 3.7281, + "step": 7075 + }, + { + "epoch": 0.48104362005707296, + "grad_norm": 0.6269807815551758, + "learning_rate": 0.0009399035194999321, + "loss": 3.8493, + "step": 7080 + }, + { + "epoch": 0.48138334012773476, + "grad_norm": 0.781550407409668, + "learning_rate": 0.0009398610544910994, + "loss": 3.9031, + "step": 7085 + }, + { + "epoch": 0.4817230601983965, + "grad_norm": 0.9604646563529968, + "learning_rate": 0.0009398185894822666, + "loss": 3.6078, + "step": 7090 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 0.7052937746047974, + "learning_rate": 0.0009397761244734339, + "loss": 3.7656, + "step": 7095 + }, + { + "epoch": 0.4824025003397201, + "grad_norm": 0.523674726486206, + "learning_rate": 0.0009397336594646012, + "loss": 3.6359, + "step": 7100 + }, + { + "epoch": 0.48274222041038184, + "grad_norm": 0.8396635055541992, + "learning_rate": 0.0009396911944557684, + "loss": 3.7101, + "step": 7105 + }, + { + "epoch": 0.48308194048104364, + "grad_norm": 0.8476489782333374, + "learning_rate": 0.0009396487294469356, + "loss": 3.9095, + "step": 7110 + }, + { + "epoch": 0.4834216605517054, + "grad_norm": 0.7786988615989685, + "learning_rate": 0.0009396062644381031, + "loss": 3.5994, + "step": 7115 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 0.6468896269798279, + "learning_rate": 0.0009395637994292703, + "loss": 3.698, + "step": 7120 + }, + { + "epoch": 0.4841011006930289, + "grad_norm": 0.8070240020751953, + "learning_rate": 0.0009395213344204377, + "loss": 3.6099, + "step": 7125 + }, + { + "epoch": 0.4844408207636907, + "grad_norm": 0.8506020307540894, + "learning_rate": 0.0009394788694116049, + "loss": 3.668, + "step": 7130 + }, + { + "epoch": 0.4847805408343525, + "grad_norm": 0.8194494843482971, + "learning_rate": 0.0009394364044027721, + "loss": 3.5535, + "step": 7135 + }, + { + "epoch": 0.48512026090501426, + "grad_norm": 0.8037394881248474, + "learning_rate": 0.0009393939393939394, + "loss": 3.7885, + "step": 7140 + }, + { + "epoch": 0.48545998097567605, + "grad_norm": 0.9339454770088196, + "learning_rate": 0.0009393514743851067, + "loss": 3.8393, + "step": 7145 + }, + { + "epoch": 0.4857997010463378, + "grad_norm": 0.6934463977813721, + "learning_rate": 0.000939309009376274, + "loss": 3.741, + "step": 7150 + }, + { + "epoch": 0.4861394211169996, + "grad_norm": 0.6785147786140442, + "learning_rate": 0.0009392665443674413, + "loss": 3.7768, + "step": 7155 + }, + { + "epoch": 0.4864791411876614, + "grad_norm": 0.7040234804153442, + "learning_rate": 0.0009392240793586086, + "loss": 3.7443, + "step": 7160 + }, + { + "epoch": 0.48681886125832313, + "grad_norm": 0.7514573335647583, + "learning_rate": 0.0009391816143497758, + "loss": 3.6992, + "step": 7165 + }, + { + "epoch": 0.48715858132898493, + "grad_norm": 0.7714027762413025, + "learning_rate": 0.0009391391493409431, + "loss": 3.3891, + "step": 7170 + }, + { + "epoch": 0.48749830139964667, + "grad_norm": 0.9218112230300903, + "learning_rate": 0.0009390966843321103, + "loss": 3.7688, + "step": 7175 + }, + { + "epoch": 0.48783802147030847, + "grad_norm": 0.7628677487373352, + "learning_rate": 0.0009390542193232776, + "loss": 3.7789, + "step": 7180 + }, + { + "epoch": 0.48817774154097027, + "grad_norm": 0.7548875212669373, + "learning_rate": 0.000939011754314445, + "loss": 3.5472, + "step": 7185 + }, + { + "epoch": 0.488517461611632, + "grad_norm": 0.6406281590461731, + "learning_rate": 0.0009389692893056122, + "loss": 3.7523, + "step": 7190 + }, + { + "epoch": 0.4888571816822938, + "grad_norm": 0.7024105787277222, + "learning_rate": 0.0009389268242967795, + "loss": 3.8367, + "step": 7195 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 0.7406633496284485, + "learning_rate": 0.0009388843592879468, + "loss": 3.6173, + "step": 7200 + }, + { + "epoch": 0.48953662182361735, + "grad_norm": 0.7633387446403503, + "learning_rate": 0.000938841894279114, + "loss": 3.6487, + "step": 7205 + }, + { + "epoch": 0.4898763418942791, + "grad_norm": 0.74398273229599, + "learning_rate": 0.0009387994292702812, + "loss": 3.9173, + "step": 7210 + }, + { + "epoch": 0.4902160619649409, + "grad_norm": 0.5968216061592102, + "learning_rate": 0.0009387569642614486, + "loss": 3.706, + "step": 7215 + }, + { + "epoch": 0.4905557820356027, + "grad_norm": 0.6736153960227966, + "learning_rate": 0.0009387144992526159, + "loss": 3.6653, + "step": 7220 + }, + { + "epoch": 0.4908955021062644, + "grad_norm": 0.9406900405883789, + "learning_rate": 0.0009386720342437831, + "loss": 3.8663, + "step": 7225 + }, + { + "epoch": 0.4912352221769262, + "grad_norm": 0.7575247287750244, + "learning_rate": 0.0009386295692349505, + "loss": 3.4235, + "step": 7230 + }, + { + "epoch": 0.49157494224758796, + "grad_norm": 0.7956984639167786, + "learning_rate": 0.0009385871042261177, + "loss": 3.7594, + "step": 7235 + }, + { + "epoch": 0.49191466231824976, + "grad_norm": 0.8746979832649231, + "learning_rate": 0.0009385446392172849, + "loss": 3.7614, + "step": 7240 + }, + { + "epoch": 0.49225438238891156, + "grad_norm": 0.7535360455513, + "learning_rate": 0.0009385021742084523, + "loss": 3.8089, + "step": 7245 + }, + { + "epoch": 0.4925941024595733, + "grad_norm": 0.7824869751930237, + "learning_rate": 0.0009384597091996195, + "loss": 3.8849, + "step": 7250 + }, + { + "epoch": 0.4929338225302351, + "grad_norm": 0.6390384435653687, + "learning_rate": 0.0009384172441907868, + "loss": 3.3618, + "step": 7255 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 0.7897850871086121, + "learning_rate": 0.0009383747791819542, + "loss": 3.8151, + "step": 7260 + }, + { + "epoch": 0.49361326267155864, + "grad_norm": 0.6274983882904053, + "learning_rate": 0.0009383323141731214, + "loss": 3.7064, + "step": 7265 + }, + { + "epoch": 0.49395298274222044, + "grad_norm": 0.7725183963775635, + "learning_rate": 0.0009382898491642886, + "loss": 3.6395, + "step": 7270 + }, + { + "epoch": 0.4942927028128822, + "grad_norm": 0.8436518907546997, + "learning_rate": 0.0009382473841554559, + "loss": 3.5565, + "step": 7275 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 0.7765341997146606, + "learning_rate": 0.0009382049191466232, + "loss": 3.722, + "step": 7280 + }, + { + "epoch": 0.4949721429542057, + "grad_norm": 0.7182985544204712, + "learning_rate": 0.0009381624541377904, + "loss": 3.9235, + "step": 7285 + }, + { + "epoch": 0.4953118630248675, + "grad_norm": 0.8008597493171692, + "learning_rate": 0.0009381199891289578, + "loss": 3.453, + "step": 7290 + }, + { + "epoch": 0.49565158309552926, + "grad_norm": 0.7976318001747131, + "learning_rate": 0.0009380775241201251, + "loss": 3.682, + "step": 7295 + }, + { + "epoch": 0.49599130316619106, + "grad_norm": 0.8344050645828247, + "learning_rate": 0.0009380350591112923, + "loss": 3.7543, + "step": 7300 + }, + { + "epoch": 0.49633102323685285, + "grad_norm": 0.7897378206253052, + "learning_rate": 0.0009379925941024596, + "loss": 3.6266, + "step": 7305 + }, + { + "epoch": 0.4966707433075146, + "grad_norm": 0.7131500244140625, + "learning_rate": 0.0009379501290936269, + "loss": 3.7228, + "step": 7310 + }, + { + "epoch": 0.4970104633781764, + "grad_norm": 0.6977111101150513, + "learning_rate": 0.0009379076640847941, + "loss": 3.5669, + "step": 7315 + }, + { + "epoch": 0.49735018344883813, + "grad_norm": 0.9258297681808472, + "learning_rate": 0.0009378651990759614, + "loss": 3.9128, + "step": 7320 + }, + { + "epoch": 0.49768990351949993, + "grad_norm": 0.7817312479019165, + "learning_rate": 0.0009378227340671287, + "loss": 3.6678, + "step": 7325 + }, + { + "epoch": 0.49802962359016173, + "grad_norm": 0.6066571474075317, + "learning_rate": 0.000937780269058296, + "loss": 3.5993, + "step": 7330 + }, + { + "epoch": 0.49836934366082347, + "grad_norm": 0.6713829636573792, + "learning_rate": 0.0009377378040494633, + "loss": 3.8341, + "step": 7335 + }, + { + "epoch": 0.49870906373148527, + "grad_norm": 0.7814680337905884, + "learning_rate": 0.0009376953390406305, + "loss": 3.8925, + "step": 7340 + }, + { + "epoch": 0.499048783802147, + "grad_norm": 0.7190083861351013, + "learning_rate": 0.0009376528740317978, + "loss": 3.7025, + "step": 7345 + }, + { + "epoch": 0.4993885038728088, + "grad_norm": 0.7056635022163391, + "learning_rate": 0.0009376104090229651, + "loss": 3.3404, + "step": 7350 + }, + { + "epoch": 0.4997282239434706, + "grad_norm": 0.7722302079200745, + "learning_rate": 0.0009375679440141323, + "loss": 3.6827, + "step": 7355 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 0.7207581400871277, + "learning_rate": 0.0009375339720070662, + "loss": 3.7709, + "step": 7360 + }, + { + "epoch": 0.5004076640847941, + "grad_norm": 0.7106349468231201, + "learning_rate": 0.0009374915069982335, + "loss": 3.743, + "step": 7365 + }, + { + "epoch": 0.5007473841554559, + "grad_norm": 0.8502742648124695, + "learning_rate": 0.0009374490419894007, + "loss": 3.8675, + "step": 7370 + }, + { + "epoch": 0.5010871042261177, + "grad_norm": 1.017578363418579, + "learning_rate": 0.000937406576980568, + "loss": 3.7319, + "step": 7375 + }, + { + "epoch": 0.5014268242967794, + "grad_norm": 0.7092310190200806, + "learning_rate": 0.0009373641119717354, + "loss": 3.9528, + "step": 7380 + }, + { + "epoch": 0.5017665443674413, + "grad_norm": 0.7098028659820557, + "learning_rate": 0.0009373216469629026, + "loss": 3.9555, + "step": 7385 + }, + { + "epoch": 0.502106264438103, + "grad_norm": 0.6542254686355591, + "learning_rate": 0.0009372791819540699, + "loss": 3.4343, + "step": 7390 + }, + { + "epoch": 0.5024459845087648, + "grad_norm": 0.91145920753479, + "learning_rate": 0.0009372367169452372, + "loss": 3.6721, + "step": 7395 + }, + { + "epoch": 0.5027857045794265, + "grad_norm": 0.9808539152145386, + "learning_rate": 0.0009371942519364044, + "loss": 3.7951, + "step": 7400 + }, + { + "epoch": 0.5031254246500884, + "grad_norm": 1.125203013420105, + "learning_rate": 0.0009371517869275717, + "loss": 3.5913, + "step": 7405 + }, + { + "epoch": 0.5034651447207501, + "grad_norm": 0.8589319586753845, + "learning_rate": 0.0009371093219187389, + "loss": 3.6466, + "step": 7410 + }, + { + "epoch": 0.5038048647914118, + "grad_norm": 0.6763414144515991, + "learning_rate": 0.0009370668569099063, + "loss": 3.5969, + "step": 7415 + }, + { + "epoch": 0.5041445848620737, + "grad_norm": 0.844094455242157, + "learning_rate": 0.0009370243919010736, + "loss": 3.7936, + "step": 7420 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 0.7955632209777832, + "learning_rate": 0.0009369819268922408, + "loss": 3.5356, + "step": 7425 + }, + { + "epoch": 0.5048240250033972, + "grad_norm": 0.6856505870819092, + "learning_rate": 0.0009369394618834081, + "loss": 3.6414, + "step": 7430 + }, + { + "epoch": 0.5051637450740589, + "grad_norm": 0.5792022943496704, + "learning_rate": 0.0009368969968745754, + "loss": 3.7522, + "step": 7435 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 0.685958206653595, + "learning_rate": 0.0009368545318657426, + "loss": 3.6615, + "step": 7440 + }, + { + "epoch": 0.5058431852153825, + "grad_norm": 0.5949773192405701, + "learning_rate": 0.0009368120668569098, + "loss": 3.6017, + "step": 7445 + }, + { + "epoch": 0.5061829052860443, + "grad_norm": 0.6153993010520935, + "learning_rate": 0.0009367696018480773, + "loss": 3.7141, + "step": 7450 + }, + { + "epoch": 0.5065226253567061, + "grad_norm": 0.9449560046195984, + "learning_rate": 0.0009367271368392445, + "loss": 3.6056, + "step": 7455 + }, + { + "epoch": 0.5068623454273679, + "grad_norm": 0.8440470099449158, + "learning_rate": 0.0009366846718304117, + "loss": 3.7649, + "step": 7460 + }, + { + "epoch": 0.5072020654980296, + "grad_norm": 0.9993419051170349, + "learning_rate": 0.0009366422068215791, + "loss": 3.7675, + "step": 7465 + }, + { + "epoch": 0.5075417855686915, + "grad_norm": 0.8438595533370972, + "learning_rate": 0.0009365997418127463, + "loss": 3.3734, + "step": 7470 + }, + { + "epoch": 0.5078815056393532, + "grad_norm": 0.6846315860748291, + "learning_rate": 0.0009365572768039135, + "loss": 4.0183, + "step": 7475 + }, + { + "epoch": 0.5082212257100149, + "grad_norm": 0.7129391431808472, + "learning_rate": 0.0009365148117950809, + "loss": 3.4175, + "step": 7480 + }, + { + "epoch": 0.5085609457806767, + "grad_norm": 0.7299602031707764, + "learning_rate": 0.0009364723467862482, + "loss": 4.0775, + "step": 7485 + }, + { + "epoch": 0.5089006658513385, + "grad_norm": 0.8693822622299194, + "learning_rate": 0.0009364298817774154, + "loss": 3.4681, + "step": 7490 + }, + { + "epoch": 0.5092403859220003, + "grad_norm": 0.7556914687156677, + "learning_rate": 0.0009363874167685828, + "loss": 3.7943, + "step": 7495 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.9923176169395447, + "learning_rate": 0.00093634495175975, + "loss": 3.7588, + "step": 7500 + }, + { + "epoch": 0.5099198260633239, + "grad_norm": 0.634723961353302, + "learning_rate": 0.0009363024867509172, + "loss": 3.7661, + "step": 7505 + }, + { + "epoch": 0.5102595461339856, + "grad_norm": 0.6677860617637634, + "learning_rate": 0.0009362600217420845, + "loss": 3.8467, + "step": 7510 + }, + { + "epoch": 0.5105992662046474, + "grad_norm": 1.10400390625, + "learning_rate": 0.0009362175567332518, + "loss": 3.8287, + "step": 7515 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 0.9024521112442017, + "learning_rate": 0.0009361750917244191, + "loss": 3.7336, + "step": 7520 + }, + { + "epoch": 0.511278706345971, + "grad_norm": 0.7987581491470337, + "learning_rate": 0.0009361326267155864, + "loss": 3.8043, + "step": 7525 + }, + { + "epoch": 0.5116184264166327, + "grad_norm": 1.0835459232330322, + "learning_rate": 0.0009360901617067537, + "loss": 3.8257, + "step": 7530 + }, + { + "epoch": 0.5119581464872944, + "grad_norm": 1.0358327627182007, + "learning_rate": 0.0009360476966979209, + "loss": 3.7183, + "step": 7535 + }, + { + "epoch": 0.5122978665579563, + "grad_norm": 0.6067409515380859, + "learning_rate": 0.0009360052316890882, + "loss": 3.8741, + "step": 7540 + }, + { + "epoch": 0.512637586628618, + "grad_norm": 0.7564204931259155, + "learning_rate": 0.0009359627666802554, + "loss": 3.4962, + "step": 7545 + }, + { + "epoch": 0.5129773066992798, + "grad_norm": 0.6872767210006714, + "learning_rate": 0.0009359203016714227, + "loss": 3.6282, + "step": 7550 + }, + { + "epoch": 0.5133170267699416, + "grad_norm": 1.9951661825180054, + "learning_rate": 0.0009358778366625901, + "loss": 3.7927, + "step": 7555 + }, + { + "epoch": 0.5136567468406034, + "grad_norm": 0.8494696617126465, + "learning_rate": 0.0009358353716537573, + "loss": 3.8174, + "step": 7560 + }, + { + "epoch": 0.5139964669112651, + "grad_norm": 0.8911325931549072, + "learning_rate": 0.0009357929066449246, + "loss": 3.7429, + "step": 7565 + }, + { + "epoch": 0.5143361869819268, + "grad_norm": 0.6900583505630493, + "learning_rate": 0.0009357504416360919, + "loss": 3.8136, + "step": 7570 + }, + { + "epoch": 0.5146759070525887, + "grad_norm": 1.1943303346633911, + "learning_rate": 0.0009357079766272591, + "loss": 3.5661, + "step": 7575 + }, + { + "epoch": 0.5150156271232504, + "grad_norm": 0.9147028923034668, + "learning_rate": 0.0009356655116184264, + "loss": 3.8086, + "step": 7580 + }, + { + "epoch": 0.5153553471939122, + "grad_norm": 0.7377321124076843, + "learning_rate": 0.0009356230466095937, + "loss": 3.8343, + "step": 7585 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 0.754393994808197, + "learning_rate": 0.000935580581600761, + "loss": 3.6466, + "step": 7590 + }, + { + "epoch": 0.5160347873352358, + "grad_norm": 0.8830172419548035, + "learning_rate": 0.0009355381165919283, + "loss": 3.662, + "step": 7595 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 1.447893500328064, + "learning_rate": 0.0009354956515830956, + "loss": 3.762, + "step": 7600 + }, + { + "epoch": 0.5167142274765593, + "grad_norm": 0.9086380004882812, + "learning_rate": 0.0009354531865742628, + "loss": 3.5964, + "step": 7605 + }, + { + "epoch": 0.5170539475472211, + "grad_norm": 1.0121508836746216, + "learning_rate": 0.00093541072156543, + "loss": 3.6474, + "step": 7610 + }, + { + "epoch": 0.5173936676178829, + "grad_norm": 0.9906413555145264, + "learning_rate": 0.0009353682565565974, + "loss": 3.361, + "step": 7615 + }, + { + "epoch": 0.5177333876885446, + "grad_norm": 0.8321698307991028, + "learning_rate": 0.0009353257915477646, + "loss": 3.809, + "step": 7620 + }, + { + "epoch": 0.5180731077592065, + "grad_norm": 0.6370994448661804, + "learning_rate": 0.0009352833265389319, + "loss": 3.8383, + "step": 7625 + }, + { + "epoch": 0.5184128278298682, + "grad_norm": 0.7694125771522522, + "learning_rate": 0.0009352408615300993, + "loss": 3.4812, + "step": 7630 + }, + { + "epoch": 0.5187525479005299, + "grad_norm": 0.9469569325447083, + "learning_rate": 0.0009351983965212665, + "loss": 3.8311, + "step": 7635 + }, + { + "epoch": 0.5190922679711918, + "grad_norm": 0.9040980935096741, + "learning_rate": 0.0009351559315124337, + "loss": 3.8186, + "step": 7640 + }, + { + "epoch": 0.5194319880418535, + "grad_norm": 0.7341848015785217, + "learning_rate": 0.000935113466503601, + "loss": 3.6194, + "step": 7645 + }, + { + "epoch": 0.5197717081125153, + "grad_norm": 0.7815690636634827, + "learning_rate": 0.0009350710014947683, + "loss": 3.6886, + "step": 7650 + }, + { + "epoch": 0.520111428183177, + "grad_norm": 0.9311621189117432, + "learning_rate": 0.0009350285364859355, + "loss": 3.5399, + "step": 7655 + }, + { + "epoch": 0.5204511482538389, + "grad_norm": 0.748195230960846, + "learning_rate": 0.000934986071477103, + "loss": 3.6487, + "step": 7660 + }, + { + "epoch": 0.5207908683245006, + "grad_norm": 0.7536749839782715, + "learning_rate": 0.0009349436064682702, + "loss": 3.4575, + "step": 7665 + }, + { + "epoch": 0.5211305883951624, + "grad_norm": 0.7599215507507324, + "learning_rate": 0.0009349011414594374, + "loss": 3.6995, + "step": 7670 + }, + { + "epoch": 0.5214703084658242, + "grad_norm": 0.7540207505226135, + "learning_rate": 0.0009348586764506047, + "loss": 3.5484, + "step": 7675 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 0.6258822679519653, + "learning_rate": 0.000934816211441772, + "loss": 3.8211, + "step": 7680 + }, + { + "epoch": 0.5221497486071477, + "grad_norm": 0.7951477766036987, + "learning_rate": 0.0009347737464329393, + "loss": 3.694, + "step": 7685 + }, + { + "epoch": 0.5224894686778094, + "grad_norm": 0.6939033269882202, + "learning_rate": 0.0009347312814241065, + "loss": 3.8581, + "step": 7690 + }, + { + "epoch": 0.5228291887484713, + "grad_norm": 0.7698881030082703, + "learning_rate": 0.0009346888164152739, + "loss": 3.4373, + "step": 7695 + }, + { + "epoch": 0.523168908819133, + "grad_norm": 0.6066703796386719, + "learning_rate": 0.0009346463514064412, + "loss": 3.774, + "step": 7700 + }, + { + "epoch": 0.5235086288897948, + "grad_norm": 0.6969402432441711, + "learning_rate": 0.0009346038863976084, + "loss": 3.9143, + "step": 7705 + }, + { + "epoch": 0.5238483489604566, + "grad_norm": 0.7153287529945374, + "learning_rate": 0.0009345614213887756, + "loss": 3.5591, + "step": 7710 + }, + { + "epoch": 0.5241880690311184, + "grad_norm": 0.9564037919044495, + "learning_rate": 0.000934518956379943, + "loss": 3.7187, + "step": 7715 + }, + { + "epoch": 0.5245277891017801, + "grad_norm": 0.9316312670707703, + "learning_rate": 0.0009344764913711102, + "loss": 3.6292, + "step": 7720 + }, + { + "epoch": 0.524867509172442, + "grad_norm": 0.7652209401130676, + "learning_rate": 0.0009344340263622774, + "loss": 3.7833, + "step": 7725 + }, + { + "epoch": 0.5252072292431037, + "grad_norm": 0.7273197770118713, + "learning_rate": 0.0009343915613534449, + "loss": 3.6127, + "step": 7730 + }, + { + "epoch": 0.5255469493137654, + "grad_norm": 0.6832858920097351, + "learning_rate": 0.0009343490963446121, + "loss": 3.5762, + "step": 7735 + }, + { + "epoch": 0.5258866693844272, + "grad_norm": 0.759276270866394, + "learning_rate": 0.0009343066313357793, + "loss": 3.7027, + "step": 7740 + }, + { + "epoch": 0.526226389455089, + "grad_norm": 0.6622229218482971, + "learning_rate": 0.0009342641663269467, + "loss": 3.6417, + "step": 7745 + }, + { + "epoch": 0.5265661095257508, + "grad_norm": 0.8111976385116577, + "learning_rate": 0.0009342217013181139, + "loss": 3.9326, + "step": 7750 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 0.8429571390151978, + "learning_rate": 0.0009341792363092811, + "loss": 3.8193, + "step": 7755 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 1.1273845434188843, + "learning_rate": 0.0009341367713004485, + "loss": 3.1268, + "step": 7760 + }, + { + "epoch": 0.5275852697377361, + "grad_norm": 0.883357584476471, + "learning_rate": 0.0009340943062916158, + "loss": 3.829, + "step": 7765 + }, + { + "epoch": 0.5279249898083979, + "grad_norm": 1.0428369045257568, + "learning_rate": 0.000934051841282783, + "loss": 3.7054, + "step": 7770 + }, + { + "epoch": 0.5282647098790596, + "grad_norm": 0.8668198585510254, + "learning_rate": 0.0009340093762739503, + "loss": 3.9419, + "step": 7775 + }, + { + "epoch": 0.5286044299497215, + "grad_norm": 0.8784446120262146, + "learning_rate": 0.0009339669112651176, + "loss": 3.3677, + "step": 7780 + }, + { + "epoch": 0.5289441500203832, + "grad_norm": 0.7869707345962524, + "learning_rate": 0.0009339244462562848, + "loss": 3.8799, + "step": 7785 + }, + { + "epoch": 0.5292838700910449, + "grad_norm": 0.725488543510437, + "learning_rate": 0.0009338819812474521, + "loss": 3.8379, + "step": 7790 + }, + { + "epoch": 0.5296235901617068, + "grad_norm": 0.910378634929657, + "learning_rate": 0.0009338395162386195, + "loss": 3.9184, + "step": 7795 + }, + { + "epoch": 0.5299633102323685, + "grad_norm": 0.7236026525497437, + "learning_rate": 0.0009337970512297867, + "loss": 3.4899, + "step": 7800 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.6535592675209045, + "learning_rate": 0.000933754586220954, + "loss": 3.6504, + "step": 7805 + }, + { + "epoch": 0.5306427503736921, + "grad_norm": 0.8938304781913757, + "learning_rate": 0.0009337121212121212, + "loss": 3.6218, + "step": 7810 + }, + { + "epoch": 0.5309824704443539, + "grad_norm": 0.8261063098907471, + "learning_rate": 0.0009336696562032885, + "loss": 3.6483, + "step": 7815 + }, + { + "epoch": 0.5313221905150156, + "grad_norm": 0.6633940935134888, + "learning_rate": 0.0009336271911944558, + "loss": 3.6792, + "step": 7820 + }, + { + "epoch": 0.5316619105856774, + "grad_norm": 0.7532559037208557, + "learning_rate": 0.000933584726185623, + "loss": 3.7929, + "step": 7825 + }, + { + "epoch": 0.5320016306563392, + "grad_norm": 0.7881512641906738, + "learning_rate": 0.0009335422611767904, + "loss": 3.5958, + "step": 7830 + }, + { + "epoch": 0.532341350727001, + "grad_norm": 1.059017539024353, + "learning_rate": 0.0009334997961679577, + "loss": 3.8223, + "step": 7835 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 1.0312862396240234, + "learning_rate": 0.0009334573311591249, + "loss": 3.9363, + "step": 7840 + }, + { + "epoch": 0.5330207908683245, + "grad_norm": 0.863673210144043, + "learning_rate": 0.0009334148661502921, + "loss": 3.6955, + "step": 7845 + }, + { + "epoch": 0.5333605109389863, + "grad_norm": 0.6127910017967224, + "learning_rate": 0.0009333724011414595, + "loss": 3.7459, + "step": 7850 + }, + { + "epoch": 0.533700231009648, + "grad_norm": 1.20305335521698, + "learning_rate": 0.0009333299361326267, + "loss": 3.8408, + "step": 7855 + }, + { + "epoch": 0.5340399510803098, + "grad_norm": 0.6360501050949097, + "learning_rate": 0.0009332874711237939, + "loss": 3.7, + "step": 7860 + }, + { + "epoch": 0.5343796711509716, + "grad_norm": 0.9378423094749451, + "learning_rate": 0.0009332450061149614, + "loss": 3.6398, + "step": 7865 + }, + { + "epoch": 0.5347193912216334, + "grad_norm": 0.6156966686248779, + "learning_rate": 0.0009332025411061286, + "loss": 3.8152, + "step": 7870 + }, + { + "epoch": 0.5350591112922951, + "grad_norm": 0.6156020760536194, + "learning_rate": 0.0009331600760972958, + "loss": 3.7468, + "step": 7875 + }, + { + "epoch": 0.535398831362957, + "grad_norm": 0.8581897616386414, + "learning_rate": 0.0009331176110884632, + "loss": 3.8143, + "step": 7880 + }, + { + "epoch": 0.5357385514336187, + "grad_norm": 0.8012222051620483, + "learning_rate": 0.0009330751460796304, + "loss": 3.6519, + "step": 7885 + }, + { + "epoch": 0.5360782715042804, + "grad_norm": 0.7586082220077515, + "learning_rate": 0.0009330326810707976, + "loss": 3.9273, + "step": 7890 + }, + { + "epoch": 0.5364179915749423, + "grad_norm": 0.8060702681541443, + "learning_rate": 0.000932990216061965, + "loss": 3.4962, + "step": 7895 + }, + { + "epoch": 0.536757711645604, + "grad_norm": 0.7138065099716187, + "learning_rate": 0.0009329477510531323, + "loss": 3.5215, + "step": 7900 + }, + { + "epoch": 0.5370974317162658, + "grad_norm": 0.6876052618026733, + "learning_rate": 0.0009329052860442995, + "loss": 3.7942, + "step": 7905 + }, + { + "epoch": 0.5374371517869275, + "grad_norm": 0.7649595737457275, + "learning_rate": 0.0009328628210354668, + "loss": 3.792, + "step": 7910 + }, + { + "epoch": 0.5377768718575894, + "grad_norm": 0.6129052042961121, + "learning_rate": 0.0009328203560266341, + "loss": 3.8296, + "step": 7915 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.7609361410140991, + "learning_rate": 0.0009327778910178013, + "loss": 3.6136, + "step": 7920 + }, + { + "epoch": 0.5384563119989129, + "grad_norm": 0.7974691390991211, + "learning_rate": 0.0009327354260089686, + "loss": 3.794, + "step": 7925 + }, + { + "epoch": 0.5387960320695747, + "grad_norm": 0.6914902329444885, + "learning_rate": 0.0009326929610001359, + "loss": 3.9231, + "step": 7930 + }, + { + "epoch": 0.5391357521402365, + "grad_norm": 0.6428053379058838, + "learning_rate": 0.0009326504959913032, + "loss": 3.8158, + "step": 7935 + }, + { + "epoch": 0.5394754722108982, + "grad_norm": 0.9203472137451172, + "learning_rate": 0.0009326080309824705, + "loss": 3.687, + "step": 7940 + }, + { + "epoch": 0.5398151922815599, + "grad_norm": 0.8020077347755432, + "learning_rate": 0.0009325655659736377, + "loss": 3.6562, + "step": 7945 + }, + { + "epoch": 0.5401549123522218, + "grad_norm": 0.7744275331497192, + "learning_rate": 0.000932523100964805, + "loss": 3.7757, + "step": 7950 + }, + { + "epoch": 0.5404946324228835, + "grad_norm": 0.6648283004760742, + "learning_rate": 0.0009324806359559723, + "loss": 3.5, + "step": 7955 + }, + { + "epoch": 0.5408343524935453, + "grad_norm": 0.7230730056762695, + "learning_rate": 0.0009324381709471395, + "loss": 3.7182, + "step": 7960 + }, + { + "epoch": 0.5411740725642071, + "grad_norm": 0.5622615218162537, + "learning_rate": 0.0009323957059383068, + "loss": 3.6614, + "step": 7965 + }, + { + "epoch": 0.5415137926348689, + "grad_norm": 0.7956311702728271, + "learning_rate": 0.0009323532409294742, + "loss": 3.6639, + "step": 7970 + }, + { + "epoch": 0.5418535127055306, + "grad_norm": 0.638638973236084, + "learning_rate": 0.0009323107759206414, + "loss": 3.7936, + "step": 7975 + }, + { + "epoch": 0.5421932327761925, + "grad_norm": 0.6574677228927612, + "learning_rate": 0.0009322683109118087, + "loss": 3.6964, + "step": 7980 + }, + { + "epoch": 0.5425329528468542, + "grad_norm": 0.6735721826553345, + "learning_rate": 0.000932225845902976, + "loss": 3.6976, + "step": 7985 + }, + { + "epoch": 0.542872672917516, + "grad_norm": 0.7398721575737, + "learning_rate": 0.0009321833808941432, + "loss": 3.6892, + "step": 7990 + }, + { + "epoch": 0.5432123929881777, + "grad_norm": 0.8249595165252686, + "learning_rate": 0.0009321409158853104, + "loss": 3.7821, + "step": 7995 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 0.8268739581108093, + "learning_rate": 0.0009320984508764778, + "loss": 3.7981, + "step": 8000 + }, + { + "epoch": 0.5438918331295013, + "grad_norm": 0.7778275012969971, + "learning_rate": 0.0009320559858676451, + "loss": 3.5656, + "step": 8005 + }, + { + "epoch": 0.544231553200163, + "grad_norm": 0.6648785471916199, + "learning_rate": 0.0009320135208588123, + "loss": 3.7531, + "step": 8010 + }, + { + "epoch": 0.5445712732708249, + "grad_norm": 0.5664708018302917, + "learning_rate": 0.0009319710558499797, + "loss": 3.6048, + "step": 8015 + }, + { + "epoch": 0.5449109933414866, + "grad_norm": 0.7545582056045532, + "learning_rate": 0.0009319285908411469, + "loss": 3.7252, + "step": 8020 + }, + { + "epoch": 0.5452507134121484, + "grad_norm": 0.686681866645813, + "learning_rate": 0.0009318861258323142, + "loss": 3.9318, + "step": 8025 + }, + { + "epoch": 0.5455904334828101, + "grad_norm": 0.8710927367210388, + "learning_rate": 0.0009318436608234815, + "loss": 3.7183, + "step": 8030 + }, + { + "epoch": 0.545930153553472, + "grad_norm": 0.6133655905723572, + "learning_rate": 0.0009318011958146487, + "loss": 3.7437, + "step": 8035 + }, + { + "epoch": 0.5462698736241337, + "grad_norm": 0.6889778971672058, + "learning_rate": 0.0009317587308058161, + "loss": 3.4666, + "step": 8040 + }, + { + "epoch": 0.5466095936947954, + "grad_norm": 0.778829038143158, + "learning_rate": 0.0009317162657969833, + "loss": 3.6812, + "step": 8045 + }, + { + "epoch": 0.5469493137654573, + "grad_norm": 0.8402515649795532, + "learning_rate": 0.0009316738007881506, + "loss": 3.498, + "step": 8050 + }, + { + "epoch": 0.547289033836119, + "grad_norm": 0.6934203505516052, + "learning_rate": 0.0009316313357793179, + "loss": 3.5952, + "step": 8055 + }, + { + "epoch": 0.5476287539067808, + "grad_norm": 0.7918394207954407, + "learning_rate": 0.0009315888707704851, + "loss": 3.6277, + "step": 8060 + }, + { + "epoch": 0.5479684739774426, + "grad_norm": 1.1845293045043945, + "learning_rate": 0.0009315464057616524, + "loss": 3.5973, + "step": 8065 + }, + { + "epoch": 0.5483081940481044, + "grad_norm": 0.7607541680335999, + "learning_rate": 0.0009315039407528197, + "loss": 3.7057, + "step": 8070 + }, + { + "epoch": 0.5486479141187661, + "grad_norm": 0.8895907402038574, + "learning_rate": 0.000931461475743987, + "loss": 3.755, + "step": 8075 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 0.6333205699920654, + "learning_rate": 0.0009314190107351543, + "loss": 3.7129, + "step": 8080 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 0.7815647125244141, + "learning_rate": 0.0009313765457263216, + "loss": 3.6661, + "step": 8085 + }, + { + "epoch": 0.5496670743307515, + "grad_norm": 0.8195438385009766, + "learning_rate": 0.0009313340807174888, + "loss": 3.6784, + "step": 8090 + }, + { + "epoch": 0.5500067944014132, + "grad_norm": 0.9247705340385437, + "learning_rate": 0.000931291615708656, + "loss": 3.6887, + "step": 8095 + }, + { + "epoch": 0.550346514472075, + "grad_norm": 0.8917085528373718, + "learning_rate": 0.0009312491506998234, + "loss": 3.6288, + "step": 8100 + }, + { + "epoch": 0.5506862345427368, + "grad_norm": 0.8460420370101929, + "learning_rate": 0.0009312066856909906, + "loss": 3.5112, + "step": 8105 + }, + { + "epoch": 0.5510259546133985, + "grad_norm": 0.8668807744979858, + "learning_rate": 0.0009311642206821579, + "loss": 3.5803, + "step": 8110 + }, + { + "epoch": 0.5513656746840603, + "grad_norm": 0.8677077889442444, + "learning_rate": 0.0009311217556733253, + "loss": 3.6391, + "step": 8115 + }, + { + "epoch": 0.5517053947547221, + "grad_norm": 0.7076985836029053, + "learning_rate": 0.0009310792906644925, + "loss": 3.6549, + "step": 8120 + }, + { + "epoch": 0.5520451148253839, + "grad_norm": 0.7993994951248169, + "learning_rate": 0.0009310368256556597, + "loss": 3.8052, + "step": 8125 + }, + { + "epoch": 0.5523848348960456, + "grad_norm": 0.9254461526870728, + "learning_rate": 0.0009309943606468271, + "loss": 3.801, + "step": 8130 + }, + { + "epoch": 0.5527245549667075, + "grad_norm": 0.8112729787826538, + "learning_rate": 0.0009309518956379943, + "loss": 3.5968, + "step": 8135 + }, + { + "epoch": 0.5530642750373692, + "grad_norm": 0.7495216727256775, + "learning_rate": 0.0009309094306291615, + "loss": 3.6892, + "step": 8140 + }, + { + "epoch": 0.553403995108031, + "grad_norm": 0.7089411020278931, + "learning_rate": 0.000930866965620329, + "loss": 3.8299, + "step": 8145 + }, + { + "epoch": 0.5537437151786928, + "grad_norm": 0.7646794319152832, + "learning_rate": 0.0009308245006114962, + "loss": 3.4565, + "step": 8150 + }, + { + "epoch": 0.5540834352493546, + "grad_norm": 0.7544694542884827, + "learning_rate": 0.0009307820356026634, + "loss": 3.6936, + "step": 8155 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 0.7222700715065002, + "learning_rate": 0.0009307395705938307, + "loss": 3.536, + "step": 8160 + }, + { + "epoch": 0.554762875390678, + "grad_norm": 0.8826112747192383, + "learning_rate": 0.000930697105584998, + "loss": 3.9062, + "step": 8165 + }, + { + "epoch": 0.5551025954613399, + "grad_norm": 0.6268359422683716, + "learning_rate": 0.0009306546405761652, + "loss": 4.023, + "step": 8170 + }, + { + "epoch": 0.5554423155320016, + "grad_norm": 0.619527280330658, + "learning_rate": 0.0009306121755673325, + "loss": 3.7879, + "step": 8175 + }, + { + "epoch": 0.5557820356026634, + "grad_norm": 0.6875556111335754, + "learning_rate": 0.0009305697105584999, + "loss": 3.6817, + "step": 8180 + }, + { + "epoch": 0.5561217556733252, + "grad_norm": 0.8525188565254211, + "learning_rate": 0.0009305272455496671, + "loss": 3.5198, + "step": 8185 + }, + { + "epoch": 0.556461475743987, + "grad_norm": 0.8419358134269714, + "learning_rate": 0.0009304847805408344, + "loss": 3.7304, + "step": 8190 + }, + { + "epoch": 0.5568011958146487, + "grad_norm": 0.8173694610595703, + "learning_rate": 0.0009304423155320016, + "loss": 3.72, + "step": 8195 + }, + { + "epoch": 0.5571409158853105, + "grad_norm": 0.60223388671875, + "learning_rate": 0.0009303998505231689, + "loss": 3.7242, + "step": 8200 + }, + { + "epoch": 0.5574806359559723, + "grad_norm": 0.7716183662414551, + "learning_rate": 0.0009303573855143362, + "loss": 3.6341, + "step": 8205 + }, + { + "epoch": 0.557820356026634, + "grad_norm": 0.7833092212677002, + "learning_rate": 0.0009303149205055034, + "loss": 3.6879, + "step": 8210 + }, + { + "epoch": 0.5581600760972958, + "grad_norm": 0.8152278065681458, + "learning_rate": 0.0009302724554966708, + "loss": 3.6685, + "step": 8215 + }, + { + "epoch": 0.5584997961679576, + "grad_norm": 0.6335099935531616, + "learning_rate": 0.0009302299904878381, + "loss": 3.8351, + "step": 8220 + }, + { + "epoch": 0.5588395162386194, + "grad_norm": 0.928673505783081, + "learning_rate": 0.0009301875254790053, + "loss": 3.5084, + "step": 8225 + }, + { + "epoch": 0.5591792363092811, + "grad_norm": 1.8466860055923462, + "learning_rate": 0.0009301450604701725, + "loss": 3.8575, + "step": 8230 + }, + { + "epoch": 0.559518956379943, + "grad_norm": 0.7000687718391418, + "learning_rate": 0.0009301025954613399, + "loss": 3.6585, + "step": 8235 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 0.7073691487312317, + "learning_rate": 0.0009300601304525071, + "loss": 3.946, + "step": 8240 + }, + { + "epoch": 0.5601983965212665, + "grad_norm": 1.1266405582427979, + "learning_rate": 0.0009300176654436743, + "loss": 3.5017, + "step": 8245 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 0.7734697461128235, + "learning_rate": 0.0009299752004348418, + "loss": 3.6568, + "step": 8250 + }, + { + "epoch": 0.5608778366625901, + "grad_norm": 0.75633305311203, + "learning_rate": 0.000929932735426009, + "loss": 3.5189, + "step": 8255 + }, + { + "epoch": 0.5612175567332518, + "grad_norm": 0.700429379940033, + "learning_rate": 0.0009298902704171762, + "loss": 3.6116, + "step": 8260 + }, + { + "epoch": 0.5615572768039135, + "grad_norm": 0.6340858340263367, + "learning_rate": 0.0009298478054083436, + "loss": 3.7006, + "step": 8265 + }, + { + "epoch": 0.5618969968745754, + "grad_norm": 0.7910075783729553, + "learning_rate": 0.0009298053403995108, + "loss": 3.6217, + "step": 8270 + }, + { + "epoch": 0.5622367169452371, + "grad_norm": 0.8640959858894348, + "learning_rate": 0.000929762875390678, + "loss": 3.4365, + "step": 8275 + }, + { + "epoch": 0.5625764370158989, + "grad_norm": 0.9636788368225098, + "learning_rate": 0.0009297204103818455, + "loss": 3.7328, + "step": 8280 + }, + { + "epoch": 0.5629161570865606, + "grad_norm": 0.6667340397834778, + "learning_rate": 0.0009296779453730127, + "loss": 3.6892, + "step": 8285 + }, + { + "epoch": 0.5632558771572225, + "grad_norm": 0.7061495184898376, + "learning_rate": 0.0009296354803641799, + "loss": 3.853, + "step": 8290 + }, + { + "epoch": 0.5635955972278842, + "grad_norm": 0.8605124354362488, + "learning_rate": 0.0009295930153553472, + "loss": 3.6526, + "step": 8295 + }, + { + "epoch": 0.563935317298546, + "grad_norm": 1.7243515253067017, + "learning_rate": 0.0009295505503465145, + "loss": 3.8083, + "step": 8300 + }, + { + "epoch": 0.5642750373692078, + "grad_norm": 0.6706386208534241, + "learning_rate": 0.0009295080853376817, + "loss": 3.7384, + "step": 8305 + }, + { + "epoch": 0.5646147574398696, + "grad_norm": 1.2414625883102417, + "learning_rate": 0.000929465620328849, + "loss": 3.6966, + "step": 8310 + }, + { + "epoch": 0.5649544775105313, + "grad_norm": 0.7770718932151794, + "learning_rate": 0.0009294231553200164, + "loss": 3.4762, + "step": 8315 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 0.7199429273605347, + "learning_rate": 0.0009293806903111836, + "loss": 3.5104, + "step": 8320 + }, + { + "epoch": 0.5656339176518549, + "grad_norm": 0.8669970631599426, + "learning_rate": 0.0009293382253023509, + "loss": 3.9618, + "step": 8325 + }, + { + "epoch": 0.5659736377225166, + "grad_norm": 1.0288798809051514, + "learning_rate": 0.0009292957602935182, + "loss": 3.7402, + "step": 8330 + }, + { + "epoch": 0.5663133577931784, + "grad_norm": 0.7510484457015991, + "learning_rate": 0.0009292532952846854, + "loss": 3.8597, + "step": 8335 + }, + { + "epoch": 0.5666530778638402, + "grad_norm": 0.8192992806434631, + "learning_rate": 0.0009292108302758527, + "loss": 3.7558, + "step": 8340 + }, + { + "epoch": 0.566992797934502, + "grad_norm": 0.6564905047416687, + "learning_rate": 0.0009291683652670199, + "loss": 3.5966, + "step": 8345 + }, + { + "epoch": 0.5673325180051637, + "grad_norm": 0.9887263178825378, + "learning_rate": 0.0009291259002581873, + "loss": 3.9555, + "step": 8350 + }, + { + "epoch": 0.5676722380758256, + "grad_norm": 0.7017914056777954, + "learning_rate": 0.0009290834352493546, + "loss": 3.8336, + "step": 8355 + }, + { + "epoch": 0.5680119581464873, + "grad_norm": 0.6893196702003479, + "learning_rate": 0.0009290409702405218, + "loss": 3.55, + "step": 8360 + }, + { + "epoch": 0.568351678217149, + "grad_norm": 0.7099121809005737, + "learning_rate": 0.0009289985052316892, + "loss": 3.6496, + "step": 8365 + }, + { + "epoch": 0.5686913982878108, + "grad_norm": 0.8736963868141174, + "learning_rate": 0.0009289560402228564, + "loss": 3.6585, + "step": 8370 + }, + { + "epoch": 0.5690311183584726, + "grad_norm": 0.6009021997451782, + "learning_rate": 0.0009289135752140236, + "loss": 3.8541, + "step": 8375 + }, + { + "epoch": 0.5693708384291344, + "grad_norm": 0.6407254934310913, + "learning_rate": 0.000928871110205191, + "loss": 3.8495, + "step": 8380 + }, + { + "epoch": 0.5697105584997961, + "grad_norm": 0.9075515270233154, + "learning_rate": 0.0009288286451963583, + "loss": 3.7475, + "step": 8385 + }, + { + "epoch": 0.570050278570458, + "grad_norm": 0.7904422879219055, + "learning_rate": 0.0009287861801875255, + "loss": 3.6271, + "step": 8390 + }, + { + "epoch": 0.5703899986411197, + "grad_norm": 0.6861191391944885, + "learning_rate": 0.0009287437151786928, + "loss": 3.9319, + "step": 8395 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 0.7525185346603394, + "learning_rate": 0.0009287012501698601, + "loss": 3.5781, + "step": 8400 + }, + { + "epoch": 0.5710694387824433, + "grad_norm": 0.7530349493026733, + "learning_rate": 0.0009286587851610273, + "loss": 3.7146, + "step": 8405 + }, + { + "epoch": 0.5714091588531051, + "grad_norm": 1.0239795446395874, + "learning_rate": 0.0009286163201521946, + "loss": 3.7318, + "step": 8410 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 0.7419626712799072, + "learning_rate": 0.0009285738551433619, + "loss": 3.3218, + "step": 8415 + }, + { + "epoch": 0.5720885989944285, + "grad_norm": 0.9489185214042664, + "learning_rate": 0.0009285313901345292, + "loss": 3.7564, + "step": 8420 + }, + { + "epoch": 0.5724283190650904, + "grad_norm": 0.6772943139076233, + "learning_rate": 0.0009284889251256965, + "loss": 3.7242, + "step": 8425 + }, + { + "epoch": 0.5727680391357521, + "grad_norm": 0.6522383689880371, + "learning_rate": 0.0009284464601168638, + "loss": 3.6877, + "step": 8430 + }, + { + "epoch": 0.5731077592064139, + "grad_norm": 1.0367323160171509, + "learning_rate": 0.000928403995108031, + "loss": 3.3824, + "step": 8435 + }, + { + "epoch": 0.5734474792770757, + "grad_norm": 0.8856057524681091, + "learning_rate": 0.0009283615300991983, + "loss": 3.648, + "step": 8440 + }, + { + "epoch": 0.5737871993477375, + "grad_norm": 0.9014374613761902, + "learning_rate": 0.0009283190650903655, + "loss": 3.5926, + "step": 8445 + }, + { + "epoch": 0.5741269194183992, + "grad_norm": 0.7920836806297302, + "learning_rate": 0.0009282766000815328, + "loss": 3.7633, + "step": 8450 + }, + { + "epoch": 0.574466639489061, + "grad_norm": 0.8168785572052002, + "learning_rate": 0.0009282341350727002, + "loss": 3.4312, + "step": 8455 + }, + { + "epoch": 0.5748063595597228, + "grad_norm": 0.7583015561103821, + "learning_rate": 0.0009281916700638674, + "loss": 3.8929, + "step": 8460 + }, + { + "epoch": 0.5751460796303846, + "grad_norm": 0.7633148431777954, + "learning_rate": 0.0009281492050550347, + "loss": 3.7933, + "step": 8465 + }, + { + "epoch": 0.5754857997010463, + "grad_norm": 0.7092442512512207, + "learning_rate": 0.000928106740046202, + "loss": 3.6163, + "step": 8470 + }, + { + "epoch": 0.5758255197717081, + "grad_norm": 0.7831745147705078, + "learning_rate": 0.0009280642750373692, + "loss": 3.596, + "step": 8475 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 0.8605108857154846, + "learning_rate": 0.0009280218100285364, + "loss": 3.5653, + "step": 8480 + }, + { + "epoch": 0.5765049599130316, + "grad_norm": 0.7284083962440491, + "learning_rate": 0.0009279793450197038, + "loss": 3.7432, + "step": 8485 + }, + { + "epoch": 0.5768446799836935, + "grad_norm": 0.7932636141777039, + "learning_rate": 0.0009279368800108711, + "loss": 3.4436, + "step": 8490 + }, + { + "epoch": 0.5771844000543552, + "grad_norm": 0.9294976592063904, + "learning_rate": 0.0009278944150020383, + "loss": 3.8353, + "step": 8495 + }, + { + "epoch": 0.577524120125017, + "grad_norm": 0.6502493619918823, + "learning_rate": 0.0009278519499932057, + "loss": 3.777, + "step": 8500 + }, + { + "epoch": 0.5778638401956787, + "grad_norm": 0.9694041013717651, + "learning_rate": 0.0009278094849843729, + "loss": 3.5965, + "step": 8505 + }, + { + "epoch": 0.5782035602663406, + "grad_norm": 1.0276085138320923, + "learning_rate": 0.0009277670199755401, + "loss": 3.6133, + "step": 8510 + }, + { + "epoch": 0.5785432803370023, + "grad_norm": 0.7934643030166626, + "learning_rate": 0.0009277245549667075, + "loss": 3.6218, + "step": 8515 + }, + { + "epoch": 0.578883000407664, + "grad_norm": 1.1531013250350952, + "learning_rate": 0.0009276820899578747, + "loss": 3.5538, + "step": 8520 + }, + { + "epoch": 0.5792227204783259, + "grad_norm": 3.298168420791626, + "learning_rate": 0.000927639624949042, + "loss": 3.6205, + "step": 8525 + }, + { + "epoch": 0.5795624405489876, + "grad_norm": 0.5763921141624451, + "learning_rate": 0.0009275971599402094, + "loss": 3.6324, + "step": 8530 + }, + { + "epoch": 0.5799021606196494, + "grad_norm": 0.8817288875579834, + "learning_rate": 0.0009275546949313766, + "loss": 3.6333, + "step": 8535 + }, + { + "epoch": 0.5802418806903111, + "grad_norm": 1.0536859035491943, + "learning_rate": 0.0009275122299225438, + "loss": 3.5902, + "step": 8540 + }, + { + "epoch": 0.580581600760973, + "grad_norm": 0.9292330741882324, + "learning_rate": 0.0009274697649137111, + "loss": 3.8039, + "step": 8545 + }, + { + "epoch": 0.5809213208316347, + "grad_norm": 0.7548916339874268, + "learning_rate": 0.0009274272999048784, + "loss": 3.4917, + "step": 8550 + }, + { + "epoch": 0.5812610409022965, + "grad_norm": 0.7566742300987244, + "learning_rate": 0.0009273848348960456, + "loss": 3.8026, + "step": 8555 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 0.8407796025276184, + "learning_rate": 0.000927342369887213, + "loss": 3.712, + "step": 8560 + }, + { + "epoch": 0.5819404810436201, + "grad_norm": 0.7273021340370178, + "learning_rate": 0.0009272999048783803, + "loss": 3.6042, + "step": 8565 + }, + { + "epoch": 0.5822802011142818, + "grad_norm": 0.9348588585853577, + "learning_rate": 0.0009272574398695475, + "loss": 3.6643, + "step": 8570 + }, + { + "epoch": 0.5826199211849437, + "grad_norm": 0.9107693433761597, + "learning_rate": 0.0009272149748607148, + "loss": 3.6888, + "step": 8575 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 0.7264164686203003, + "learning_rate": 0.000927172509851882, + "loss": 3.5232, + "step": 8580 + }, + { + "epoch": 0.5832993613262671, + "grad_norm": 0.7249370217323303, + "learning_rate": 0.0009271300448430493, + "loss": 3.7445, + "step": 8585 + }, + { + "epoch": 0.5836390813969289, + "grad_norm": 0.954176127910614, + "learning_rate": 0.0009270875798342166, + "loss": 3.6638, + "step": 8590 + }, + { + "epoch": 0.5839788014675907, + "grad_norm": 0.5809422731399536, + "learning_rate": 0.0009270451148253839, + "loss": 3.9743, + "step": 8595 + }, + { + "epoch": 0.5843185215382525, + "grad_norm": 1.2545517683029175, + "learning_rate": 0.0009270026498165512, + "loss": 3.6781, + "step": 8600 + }, + { + "epoch": 0.5846582416089142, + "grad_norm": 0.9106475710868835, + "learning_rate": 0.0009269601848077185, + "loss": 4.0101, + "step": 8605 + }, + { + "epoch": 0.5849979616795761, + "grad_norm": 0.6251487731933594, + "learning_rate": 0.0009269177197988857, + "loss": 3.5907, + "step": 8610 + }, + { + "epoch": 0.5853376817502378, + "grad_norm": 0.8116607069969177, + "learning_rate": 0.000926875254790053, + "loss": 3.7318, + "step": 8615 + }, + { + "epoch": 0.5856774018208996, + "grad_norm": 1.1675044298171997, + "learning_rate": 0.0009268327897812203, + "loss": 3.5898, + "step": 8620 + }, + { + "epoch": 0.5860171218915613, + "grad_norm": 0.856451690196991, + "learning_rate": 0.0009267903247723875, + "loss": 3.7109, + "step": 8625 + }, + { + "epoch": 0.5863568419622232, + "grad_norm": 1.056630253791809, + "learning_rate": 0.0009267478597635548, + "loss": 3.913, + "step": 8630 + }, + { + "epoch": 0.5866965620328849, + "grad_norm": 0.6259101033210754, + "learning_rate": 0.0009267053947547222, + "loss": 3.8156, + "step": 8635 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 0.7031649947166443, + "learning_rate": 0.0009266629297458894, + "loss": 3.3358, + "step": 8640 + }, + { + "epoch": 0.5873760021742085, + "grad_norm": 0.7294005751609802, + "learning_rate": 0.0009266204647370566, + "loss": 4.02, + "step": 8645 + }, + { + "epoch": 0.5877157222448702, + "grad_norm": 0.630720853805542, + "learning_rate": 0.000926577999728224, + "loss": 4.0371, + "step": 8650 + }, + { + "epoch": 0.588055442315532, + "grad_norm": 0.6754944920539856, + "learning_rate": 0.0009265355347193912, + "loss": 3.5559, + "step": 8655 + }, + { + "epoch": 0.5883951623861938, + "grad_norm": 0.9979802966117859, + "learning_rate": 0.0009264930697105584, + "loss": 3.5078, + "step": 8660 + }, + { + "epoch": 0.5887348824568556, + "grad_norm": 0.6446166038513184, + "learning_rate": 0.0009264506047017259, + "loss": 3.669, + "step": 8665 + }, + { + "epoch": 0.5890746025275173, + "grad_norm": 0.5784843564033508, + "learning_rate": 0.0009264081396928931, + "loss": 3.8287, + "step": 8670 + }, + { + "epoch": 0.589414322598179, + "grad_norm": 0.703614354133606, + "learning_rate": 0.0009263656746840603, + "loss": 3.74, + "step": 8675 + }, + { + "epoch": 0.5897540426688409, + "grad_norm": 0.6091992259025574, + "learning_rate": 0.0009263232096752276, + "loss": 3.7034, + "step": 8680 + }, + { + "epoch": 0.5900937627395026, + "grad_norm": 0.7547647356987, + "learning_rate": 0.0009262807446663949, + "loss": 3.5175, + "step": 8685 + }, + { + "epoch": 0.5904334828101644, + "grad_norm": 0.8012923002243042, + "learning_rate": 0.0009262382796575621, + "loss": 3.4131, + "step": 8690 + }, + { + "epoch": 0.5907732028808262, + "grad_norm": 0.6666890978813171, + "learning_rate": 0.0009261958146487294, + "loss": 3.6661, + "step": 8695 + }, + { + "epoch": 0.591112922951488, + "grad_norm": 0.7568162679672241, + "learning_rate": 0.0009261533496398968, + "loss": 3.7271, + "step": 8700 + }, + { + "epoch": 0.5914526430221497, + "grad_norm": 1.0481923818588257, + "learning_rate": 0.0009261108846310641, + "loss": 3.8302, + "step": 8705 + }, + { + "epoch": 0.5917923630928115, + "grad_norm": 0.7648333311080933, + "learning_rate": 0.0009260684196222313, + "loss": 3.6577, + "step": 8710 + }, + { + "epoch": 0.5921320831634733, + "grad_norm": 0.8095502257347107, + "learning_rate": 0.0009260259546133986, + "loss": 3.5814, + "step": 8715 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 0.8625242114067078, + "learning_rate": 0.0009259834896045659, + "loss": 3.6878, + "step": 8720 + }, + { + "epoch": 0.5928115233047968, + "grad_norm": 0.7523800134658813, + "learning_rate": 0.0009259410245957331, + "loss": 3.7414, + "step": 8725 + }, + { + "epoch": 0.5931512433754587, + "grad_norm": 0.8542045950889587, + "learning_rate": 0.0009258985595869003, + "loss": 3.7999, + "step": 8730 + }, + { + "epoch": 0.5934909634461204, + "grad_norm": 0.9341980814933777, + "learning_rate": 0.0009258560945780678, + "loss": 3.7289, + "step": 8735 + }, + { + "epoch": 0.5938306835167821, + "grad_norm": 0.7035656571388245, + "learning_rate": 0.000925813629569235, + "loss": 3.6493, + "step": 8740 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 0.7686322927474976, + "learning_rate": 0.0009257711645604022, + "loss": 3.7088, + "step": 8745 + }, + { + "epoch": 0.5945101236581057, + "grad_norm": 1.070503830909729, + "learning_rate": 0.0009257286995515696, + "loss": 3.771, + "step": 8750 + }, + { + "epoch": 0.5948498437287675, + "grad_norm": 0.8231850862503052, + "learning_rate": 0.0009256862345427368, + "loss": 3.5934, + "step": 8755 + }, + { + "epoch": 0.5951895637994292, + "grad_norm": 1.1550178527832031, + "learning_rate": 0.000925643769533904, + "loss": 3.6715, + "step": 8760 + }, + { + "epoch": 0.5955292838700911, + "grad_norm": 0.9248678684234619, + "learning_rate": 0.0009256013045250714, + "loss": 3.6443, + "step": 8765 + }, + { + "epoch": 0.5958690039407528, + "grad_norm": 1.216161847114563, + "learning_rate": 0.0009255588395162387, + "loss": 3.7304, + "step": 8770 + }, + { + "epoch": 0.5962087240114146, + "grad_norm": 1.0278277397155762, + "learning_rate": 0.0009255163745074059, + "loss": 3.6684, + "step": 8775 + }, + { + "epoch": 0.5965484440820764, + "grad_norm": 0.7128772735595703, + "learning_rate": 0.0009254739094985732, + "loss": 3.66, + "step": 8780 + }, + { + "epoch": 0.5968881641527382, + "grad_norm": 0.733992874622345, + "learning_rate": 0.0009254314444897405, + "loss": 3.6106, + "step": 8785 + }, + { + "epoch": 0.5972278842233999, + "grad_norm": 0.8213968873023987, + "learning_rate": 0.0009253889794809077, + "loss": 3.7421, + "step": 8790 + }, + { + "epoch": 0.5975676042940616, + "grad_norm": 0.5878407955169678, + "learning_rate": 0.000925346514472075, + "loss": 3.7462, + "step": 8795 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 0.6332993507385254, + "learning_rate": 0.0009253040494632423, + "loss": 3.6523, + "step": 8800 + }, + { + "epoch": 0.5982470444353852, + "grad_norm": 0.7607488632202148, + "learning_rate": 0.0009252615844544096, + "loss": 3.6955, + "step": 8805 + }, + { + "epoch": 0.598586764506047, + "grad_norm": 0.7009526491165161, + "learning_rate": 0.0009252191194455769, + "loss": 3.7111, + "step": 8810 + }, + { + "epoch": 0.5989264845767088, + "grad_norm": 0.9849018454551697, + "learning_rate": 0.0009251766544367442, + "loss": 3.7841, + "step": 8815 + }, + { + "epoch": 0.5992662046473706, + "grad_norm": 0.7742160558700562, + "learning_rate": 0.0009251341894279114, + "loss": 3.4064, + "step": 8820 + }, + { + "epoch": 0.5996059247180323, + "grad_norm": 0.9086524844169617, + "learning_rate": 0.0009250917244190787, + "loss": 3.6211, + "step": 8825 + }, + { + "epoch": 0.5999456447886942, + "grad_norm": 0.6344274878501892, + "learning_rate": 0.0009250492594102459, + "loss": 3.5485, + "step": 8830 + }, + { + "epoch": 0.6002853648593559, + "grad_norm": 0.6842225193977356, + "learning_rate": 0.0009250067944014132, + "loss": 3.6511, + "step": 8835 + }, + { + "epoch": 0.6006250849300176, + "grad_norm": 0.830411970615387, + "learning_rate": 0.0009249643293925806, + "loss": 3.8767, + "step": 8840 + }, + { + "epoch": 0.6009648050006794, + "grad_norm": 0.6478545069694519, + "learning_rate": 0.0009249218643837478, + "loss": 3.6043, + "step": 8845 + }, + { + "epoch": 0.6013045250713412, + "grad_norm": 0.7021833658218384, + "learning_rate": 0.0009248793993749151, + "loss": 3.8733, + "step": 8850 + }, + { + "epoch": 0.601644245142003, + "grad_norm": 0.7565907835960388, + "learning_rate": 0.0009248369343660824, + "loss": 3.7359, + "step": 8855 + }, + { + "epoch": 0.6019839652126647, + "grad_norm": 0.7126416563987732, + "learning_rate": 0.0009247944693572496, + "loss": 3.6116, + "step": 8860 + }, + { + "epoch": 0.6023236852833266, + "grad_norm": 0.691165030002594, + "learning_rate": 0.0009247520043484168, + "loss": 3.6145, + "step": 8865 + }, + { + "epoch": 0.6026634053539883, + "grad_norm": 0.6719601154327393, + "learning_rate": 0.0009247095393395843, + "loss": 3.6062, + "step": 8870 + }, + { + "epoch": 0.6030031254246501, + "grad_norm": 0.8079831600189209, + "learning_rate": 0.0009246670743307515, + "loss": 3.648, + "step": 8875 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 0.9109815359115601, + "learning_rate": 0.0009246246093219187, + "loss": 3.4966, + "step": 8880 + }, + { + "epoch": 0.6036825655659737, + "grad_norm": 1.4598736763000488, + "learning_rate": 0.0009245821443130861, + "loss": 3.8851, + "step": 8885 + }, + { + "epoch": 0.6040222856366354, + "grad_norm": 0.634586751461029, + "learning_rate": 0.0009245396793042533, + "loss": 3.6354, + "step": 8890 + }, + { + "epoch": 0.6043620057072971, + "grad_norm": 0.7436334490776062, + "learning_rate": 0.0009244972142954205, + "loss": 3.6635, + "step": 8895 + }, + { + "epoch": 0.604701725777959, + "grad_norm": 0.7681170105934143, + "learning_rate": 0.0009244547492865879, + "loss": 3.8303, + "step": 8900 + }, + { + "epoch": 0.6050414458486207, + "grad_norm": 0.880865216255188, + "learning_rate": 0.0009244122842777552, + "loss": 3.482, + "step": 8905 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 0.622947096824646, + "learning_rate": 0.0009243698192689224, + "loss": 3.5929, + "step": 8910 + }, + { + "epoch": 0.6057208859899443, + "grad_norm": 0.9675633311271667, + "learning_rate": 0.0009243273542600898, + "loss": 3.611, + "step": 8915 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.6350060701370239, + "learning_rate": 0.000924284889251257, + "loss": 3.6927, + "step": 8920 + }, + { + "epoch": 0.6064003261312678, + "grad_norm": 0.7311625480651855, + "learning_rate": 0.0009242424242424242, + "loss": 3.5984, + "step": 8925 + }, + { + "epoch": 0.6067400462019296, + "grad_norm": 1.1549080610275269, + "learning_rate": 0.0009241999592335915, + "loss": 3.8315, + "step": 8930 + }, + { + "epoch": 0.6070797662725914, + "grad_norm": 1.1866075992584229, + "learning_rate": 0.0009241574942247588, + "loss": 3.7174, + "step": 8935 + }, + { + "epoch": 0.6074194863432532, + "grad_norm": 0.7123512029647827, + "learning_rate": 0.0009241150292159261, + "loss": 3.6551, + "step": 8940 + }, + { + "epoch": 0.6077592064139149, + "grad_norm": 0.7540401816368103, + "learning_rate": 0.0009240725642070934, + "loss": 3.719, + "step": 8945 + }, + { + "epoch": 0.6080989264845768, + "grad_norm": 0.7482537627220154, + "learning_rate": 0.0009240300991982607, + "loss": 3.8666, + "step": 8950 + }, + { + "epoch": 0.6084386465552385, + "grad_norm": 0.7662464380264282, + "learning_rate": 0.0009239876341894279, + "loss": 3.6186, + "step": 8955 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 0.6594422459602356, + "learning_rate": 0.0009239451691805952, + "loss": 3.7535, + "step": 8960 + }, + { + "epoch": 0.609118086696562, + "grad_norm": 0.6549309492111206, + "learning_rate": 0.0009239027041717624, + "loss": 3.4837, + "step": 8965 + }, + { + "epoch": 0.6094578067672238, + "grad_norm": 0.6469588279724121, + "learning_rate": 0.0009238602391629297, + "loss": 3.6993, + "step": 8970 + }, + { + "epoch": 0.6097975268378856, + "grad_norm": 0.8120731711387634, + "learning_rate": 0.0009238177741540971, + "loss": 3.5832, + "step": 8975 + }, + { + "epoch": 0.6101372469085473, + "grad_norm": 0.8818333745002747, + "learning_rate": 0.0009237753091452643, + "loss": 3.5431, + "step": 8980 + }, + { + "epoch": 0.6104769669792092, + "grad_norm": 0.7711759805679321, + "learning_rate": 0.0009237328441364316, + "loss": 3.4885, + "step": 8985 + }, + { + "epoch": 0.6108166870498709, + "grad_norm": 0.6740360260009766, + "learning_rate": 0.0009236903791275989, + "loss": 3.5924, + "step": 8990 + }, + { + "epoch": 0.6111564071205327, + "grad_norm": 0.8247122168540955, + "learning_rate": 0.0009236479141187661, + "loss": 3.9874, + "step": 8995 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.6380563974380493, + "learning_rate": 0.0009236054491099334, + "loss": 3.6934, + "step": 9000 + }, + { + "epoch": 0.6118358472618562, + "grad_norm": 0.8134249448776245, + "learning_rate": 0.0009235629841011007, + "loss": 3.8274, + "step": 9005 + }, + { + "epoch": 0.612175567332518, + "grad_norm": 0.7252808213233948, + "learning_rate": 0.000923520519092268, + "loss": 3.6475, + "step": 9010 + }, + { + "epoch": 0.6125152874031797, + "grad_norm": 0.7164291739463806, + "learning_rate": 0.0009234780540834353, + "loss": 3.6688, + "step": 9015 + }, + { + "epoch": 0.6128550074738416, + "grad_norm": 0.7637968063354492, + "learning_rate": 0.0009234355890746026, + "loss": 3.8114, + "step": 9020 + }, + { + "epoch": 0.6131947275445033, + "grad_norm": 0.9770905375480652, + "learning_rate": 0.0009233931240657698, + "loss": 3.5731, + "step": 9025 + }, + { + "epoch": 0.6135344476151651, + "grad_norm": 0.858594536781311, + "learning_rate": 0.000923350659056937, + "loss": 3.5697, + "step": 9030 + }, + { + "epoch": 0.6138741676858269, + "grad_norm": 0.775903582572937, + "learning_rate": 0.0009233081940481044, + "loss": 3.5317, + "step": 9035 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 0.6106407642364502, + "learning_rate": 0.0009232657290392716, + "loss": 3.8439, + "step": 9040 + }, + { + "epoch": 0.6145536078271504, + "grad_norm": 0.8875223994255066, + "learning_rate": 0.000923223264030439, + "loss": 3.8432, + "step": 9045 + }, + { + "epoch": 0.6148933278978121, + "grad_norm": 0.7110984325408936, + "learning_rate": 0.0009231807990216063, + "loss": 3.73, + "step": 9050 + }, + { + "epoch": 0.615233047968474, + "grad_norm": 0.8054157495498657, + "learning_rate": 0.0009231383340127735, + "loss": 3.8225, + "step": 9055 + }, + { + "epoch": 0.6155727680391357, + "grad_norm": 0.7321435213088989, + "learning_rate": 0.0009230958690039408, + "loss": 3.7622, + "step": 9060 + }, + { + "epoch": 0.6159124881097975, + "grad_norm": 0.6911776661872864, + "learning_rate": 0.000923053403995108, + "loss": 3.8346, + "step": 9065 + }, + { + "epoch": 0.6162522081804593, + "grad_norm": 0.7203043103218079, + "learning_rate": 0.0009230109389862753, + "loss": 3.6089, + "step": 9070 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 0.7480960488319397, + "learning_rate": 0.0009229684739774426, + "loss": 3.4919, + "step": 9075 + }, + { + "epoch": 0.6169316483217828, + "grad_norm": 0.6239256858825684, + "learning_rate": 0.0009229260089686099, + "loss": 3.7185, + "step": 9080 + }, + { + "epoch": 0.6172713683924447, + "grad_norm": 0.7230609059333801, + "learning_rate": 0.0009228835439597772, + "loss": 3.7581, + "step": 9085 + }, + { + "epoch": 0.6176110884631064, + "grad_norm": 0.7099137306213379, + "learning_rate": 0.0009228410789509445, + "loss": 3.4663, + "step": 9090 + }, + { + "epoch": 0.6179508085337682, + "grad_norm": 0.6550664901733398, + "learning_rate": 0.0009227986139421117, + "loss": 3.6248, + "step": 9095 + }, + { + "epoch": 0.6182905286044299, + "grad_norm": 0.735654354095459, + "learning_rate": 0.000922756148933279, + "loss": 3.8217, + "step": 9100 + }, + { + "epoch": 0.6186302486750918, + "grad_norm": 0.7264292240142822, + "learning_rate": 0.0009227136839244463, + "loss": 3.9349, + "step": 9105 + }, + { + "epoch": 0.6189699687457535, + "grad_norm": 0.8461785316467285, + "learning_rate": 0.0009226712189156135, + "loss": 3.2826, + "step": 9110 + }, + { + "epoch": 0.6193096888164152, + "grad_norm": 0.7672209143638611, + "learning_rate": 0.0009226287539067809, + "loss": 3.8141, + "step": 9115 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 0.744134783744812, + "learning_rate": 0.0009225862888979482, + "loss": 3.7199, + "step": 9120 + }, + { + "epoch": 0.6199891289577388, + "grad_norm": 0.704666018486023, + "learning_rate": 0.0009225438238891154, + "loss": 3.8207, + "step": 9125 + }, + { + "epoch": 0.6203288490284006, + "grad_norm": 0.784381091594696, + "learning_rate": 0.0009225013588802826, + "loss": 3.772, + "step": 9130 + }, + { + "epoch": 0.6206685690990623, + "grad_norm": 0.6884631514549255, + "learning_rate": 0.00092245889387145, + "loss": 3.8105, + "step": 9135 + }, + { + "epoch": 0.6210082891697242, + "grad_norm": 0.6395471096038818, + "learning_rate": 0.0009224164288626172, + "loss": 3.5551, + "step": 9140 + }, + { + "epoch": 0.6213480092403859, + "grad_norm": 0.797855019569397, + "learning_rate": 0.0009223739638537844, + "loss": 3.8737, + "step": 9145 + }, + { + "epoch": 0.6216877293110477, + "grad_norm": 0.4990188479423523, + "learning_rate": 0.0009223314988449519, + "loss": 3.5234, + "step": 9150 + }, + { + "epoch": 0.6220274493817095, + "grad_norm": 0.6369191408157349, + "learning_rate": 0.0009222890338361191, + "loss": 3.615, + "step": 9155 + }, + { + "epoch": 0.6223671694523712, + "grad_norm": 0.6226452589035034, + "learning_rate": 0.0009222465688272863, + "loss": 3.6996, + "step": 9160 + }, + { + "epoch": 0.622706889523033, + "grad_norm": 0.7768764495849609, + "learning_rate": 0.0009222041038184537, + "loss": 3.564, + "step": 9165 + }, + { + "epoch": 0.6230466095936948, + "grad_norm": 0.8177006244659424, + "learning_rate": 0.0009221616388096209, + "loss": 3.6511, + "step": 9170 + }, + { + "epoch": 0.6233863296643566, + "grad_norm": 0.7832475900650024, + "learning_rate": 0.0009221191738007881, + "loss": 3.7226, + "step": 9175 + }, + { + "epoch": 0.6237260497350183, + "grad_norm": 1.1210237741470337, + "learning_rate": 0.0009220767087919554, + "loss": 3.373, + "step": 9180 + }, + { + "epoch": 0.6240657698056801, + "grad_norm": 1.1354598999023438, + "learning_rate": 0.0009220342437831228, + "loss": 3.9265, + "step": 9185 + }, + { + "epoch": 0.6244054898763419, + "grad_norm": 0.7849172353744507, + "learning_rate": 0.00092199177877429, + "loss": 3.7794, + "step": 9190 + }, + { + "epoch": 0.6247452099470037, + "grad_norm": 0.7501644492149353, + "learning_rate": 0.0009219493137654573, + "loss": 3.8261, + "step": 9195 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 0.8172557353973389, + "learning_rate": 0.0009219068487566246, + "loss": 3.4929, + "step": 9200 + }, + { + "epoch": 0.6254246500883273, + "grad_norm": 0.9170751571655273, + "learning_rate": 0.0009218643837477918, + "loss": 3.7442, + "step": 9205 + }, + { + "epoch": 0.625764370158989, + "grad_norm": 0.9359281063079834, + "learning_rate": 0.0009218219187389591, + "loss": 4.0136, + "step": 9210 + }, + { + "epoch": 0.6261040902296507, + "grad_norm": 0.8366462588310242, + "learning_rate": 0.0009217794537301263, + "loss": 3.7102, + "step": 9215 + }, + { + "epoch": 0.6264438103003126, + "grad_norm": 0.7574153542518616, + "learning_rate": 0.0009217369887212937, + "loss": 3.8208, + "step": 9220 + }, + { + "epoch": 0.6267835303709743, + "grad_norm": 0.7762464284896851, + "learning_rate": 0.000921694523712461, + "loss": 4.1376, + "step": 9225 + }, + { + "epoch": 0.6271232504416361, + "grad_norm": 0.8688616156578064, + "learning_rate": 0.0009216520587036282, + "loss": 3.6136, + "step": 9230 + }, + { + "epoch": 0.6274629705122978, + "grad_norm": 0.7256196141242981, + "learning_rate": 0.0009216095936947955, + "loss": 3.7054, + "step": 9235 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 0.7886824607849121, + "learning_rate": 0.0009215671286859628, + "loss": 3.719, + "step": 9240 + }, + { + "epoch": 0.6281424106536214, + "grad_norm": 0.7764620780944824, + "learning_rate": 0.00092152466367713, + "loss": 3.7151, + "step": 9245 + }, + { + "epoch": 0.6284821307242832, + "grad_norm": 0.8668599724769592, + "learning_rate": 0.0009214821986682973, + "loss": 3.5108, + "step": 9250 + }, + { + "epoch": 0.628821850794945, + "grad_norm": 0.9191135168075562, + "learning_rate": 0.0009214397336594647, + "loss": 3.4262, + "step": 9255 + }, + { + "epoch": 0.6291615708656068, + "grad_norm": 0.6931703090667725, + "learning_rate": 0.0009213972686506319, + "loss": 3.6825, + "step": 9260 + }, + { + "epoch": 0.6295012909362685, + "grad_norm": 0.7942953705787659, + "learning_rate": 0.0009213548036417991, + "loss": 3.4983, + "step": 9265 + }, + { + "epoch": 0.6298410110069302, + "grad_norm": 1.1622178554534912, + "learning_rate": 0.0009213123386329665, + "loss": 3.5347, + "step": 9270 + }, + { + "epoch": 0.6301807310775921, + "grad_norm": 0.8261101245880127, + "learning_rate": 0.0009212698736241337, + "loss": 3.6787, + "step": 9275 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 0.709368109703064, + "learning_rate": 0.0009212274086153009, + "loss": 3.8112, + "step": 9280 + }, + { + "epoch": 0.6308601712189156, + "grad_norm": 0.7464368939399719, + "learning_rate": 0.0009211849436064683, + "loss": 3.795, + "step": 9285 + }, + { + "epoch": 0.6311998912895774, + "grad_norm": 0.8067333698272705, + "learning_rate": 0.0009211424785976356, + "loss": 3.3191, + "step": 9290 + }, + { + "epoch": 0.6315396113602392, + "grad_norm": 0.6999701261520386, + "learning_rate": 0.0009211000135888028, + "loss": 3.6177, + "step": 9295 + }, + { + "epoch": 0.6318793314309009, + "grad_norm": 0.8872784376144409, + "learning_rate": 0.0009210575485799702, + "loss": 3.8311, + "step": 9300 + }, + { + "epoch": 0.6322190515015628, + "grad_norm": 0.7342535257339478, + "learning_rate": 0.0009210150835711374, + "loss": 3.6521, + "step": 9305 + }, + { + "epoch": 0.6325587715722245, + "grad_norm": 0.980739951133728, + "learning_rate": 0.0009209726185623046, + "loss": 3.5301, + "step": 9310 + }, + { + "epoch": 0.6328984916428863, + "grad_norm": 0.9345033168792725, + "learning_rate": 0.000920930153553472, + "loss": 3.5698, + "step": 9315 + }, + { + "epoch": 0.633238211713548, + "grad_norm": 0.695016086101532, + "learning_rate": 0.0009208876885446392, + "loss": 3.585, + "step": 9320 + }, + { + "epoch": 0.6335779317842098, + "grad_norm": 0.8446256518363953, + "learning_rate": 0.0009208452235358065, + "loss": 3.7517, + "step": 9325 + }, + { + "epoch": 0.6339176518548716, + "grad_norm": 0.7682957649230957, + "learning_rate": 0.0009208027585269738, + "loss": 3.8184, + "step": 9330 + }, + { + "epoch": 0.6342573719255333, + "grad_norm": 0.832103967666626, + "learning_rate": 0.0009207602935181411, + "loss": 3.9113, + "step": 9335 + }, + { + "epoch": 0.6345970919961952, + "grad_norm": 0.7273295521736145, + "learning_rate": 0.0009207178285093083, + "loss": 3.8265, + "step": 9340 + }, + { + "epoch": 0.6349368120668569, + "grad_norm": 0.7015263438224792, + "learning_rate": 0.0009206753635004756, + "loss": 3.7156, + "step": 9345 + }, + { + "epoch": 0.6352765321375187, + "grad_norm": 1.2929308414459229, + "learning_rate": 0.0009206328984916429, + "loss": 3.7606, + "step": 9350 + }, + { + "epoch": 0.6356162522081804, + "grad_norm": 0.5630863308906555, + "learning_rate": 0.0009205904334828101, + "loss": 3.6093, + "step": 9355 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 0.7413787245750427, + "learning_rate": 0.0009205479684739775, + "loss": 3.6282, + "step": 9360 + }, + { + "epoch": 0.636295692349504, + "grad_norm": 0.9357811212539673, + "learning_rate": 0.0009205055034651447, + "loss": 3.8523, + "step": 9365 + }, + { + "epoch": 0.6366354124201657, + "grad_norm": 0.6081321239471436, + "learning_rate": 0.000920463038456312, + "loss": 3.9699, + "step": 9370 + }, + { + "epoch": 0.6369751324908276, + "grad_norm": 0.7971559166908264, + "learning_rate": 0.0009204205734474793, + "loss": 3.6331, + "step": 9375 + }, + { + "epoch": 0.6373148525614893, + "grad_norm": 0.7947766780853271, + "learning_rate": 0.0009203781084386465, + "loss": 3.5859, + "step": 9380 + }, + { + "epoch": 0.6376545726321511, + "grad_norm": 0.6993988752365112, + "learning_rate": 0.0009203356434298139, + "loss": 3.6884, + "step": 9385 + }, + { + "epoch": 0.6379942927028129, + "grad_norm": 1.101791262626648, + "learning_rate": 0.0009202931784209811, + "loss": 3.5965, + "step": 9390 + }, + { + "epoch": 0.6383340127734747, + "grad_norm": 1.0048019886016846, + "learning_rate": 0.0009202507134121484, + "loss": 3.712, + "step": 9395 + }, + { + "epoch": 0.6386737328441364, + "grad_norm": 0.6989179253578186, + "learning_rate": 0.0009202082484033158, + "loss": 3.6606, + "step": 9400 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 1.0008753538131714, + "learning_rate": 0.000920165783394483, + "loss": 3.5945, + "step": 9405 + }, + { + "epoch": 0.63935317298546, + "grad_norm": 0.735360860824585, + "learning_rate": 0.0009201233183856502, + "loss": 3.6428, + "step": 9410 + }, + { + "epoch": 0.6396928930561218, + "grad_norm": 0.8118060827255249, + "learning_rate": 0.0009200808533768175, + "loss": 3.5875, + "step": 9415 + }, + { + "epoch": 0.6400326131267835, + "grad_norm": 11.927619934082031, + "learning_rate": 0.0009200383883679848, + "loss": 3.671, + "step": 9420 + }, + { + "epoch": 0.6403723331974454, + "grad_norm": 0.8187731504440308, + "learning_rate": 0.000919995923359152, + "loss": 3.7276, + "step": 9425 + }, + { + "epoch": 0.6407120532681071, + "grad_norm": 0.9304511547088623, + "learning_rate": 0.0009199534583503194, + "loss": 3.6393, + "step": 9430 + }, + { + "epoch": 0.6410517733387688, + "grad_norm": 0.9244623184204102, + "learning_rate": 0.0009199109933414867, + "loss": 3.5139, + "step": 9435 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 0.727045476436615, + "learning_rate": 0.0009198685283326539, + "loss": 3.7037, + "step": 9440 + }, + { + "epoch": 0.6417312134800924, + "grad_norm": 1.2226309776306152, + "learning_rate": 0.0009198260633238212, + "loss": 3.6267, + "step": 9445 + }, + { + "epoch": 0.6420709335507542, + "grad_norm": 0.866617739200592, + "learning_rate": 0.0009197835983149885, + "loss": 3.6201, + "step": 9450 + }, + { + "epoch": 0.6424106536214159, + "grad_norm": 0.981812596321106, + "learning_rate": 0.0009197411333061557, + "loss": 3.6111, + "step": 9455 + }, + { + "epoch": 0.6427503736920778, + "grad_norm": 0.8815556168556213, + "learning_rate": 0.0009196986682973231, + "loss": 3.7241, + "step": 9460 + }, + { + "epoch": 0.6430900937627395, + "grad_norm": 0.8463683724403381, + "learning_rate": 0.0009196562032884903, + "loss": 3.5553, + "step": 9465 + }, + { + "epoch": 0.6434298138334013, + "grad_norm": 0.9120318293571472, + "learning_rate": 0.0009196137382796576, + "loss": 3.6311, + "step": 9470 + }, + { + "epoch": 0.6437695339040631, + "grad_norm": 0.7072749137878418, + "learning_rate": 0.0009195712732708249, + "loss": 3.8605, + "step": 9475 + }, + { + "epoch": 0.6441092539747248, + "grad_norm": 0.745539665222168, + "learning_rate": 0.0009195288082619921, + "loss": 3.4988, + "step": 9480 + }, + { + "epoch": 0.6444489740453866, + "grad_norm": 0.9695101380348206, + "learning_rate": 0.0009194863432531594, + "loss": 4.1248, + "step": 9485 + }, + { + "epoch": 0.6447886941160483, + "grad_norm": 0.8118835687637329, + "learning_rate": 0.0009194438782443267, + "loss": 3.6926, + "step": 9490 + }, + { + "epoch": 0.6451284141867102, + "grad_norm": 0.7597980499267578, + "learning_rate": 0.000919401413235494, + "loss": 3.6859, + "step": 9495 + }, + { + "epoch": 0.6454681342573719, + "grad_norm": 0.715021014213562, + "learning_rate": 0.0009193589482266613, + "loss": 3.5988, + "step": 9500 + }, + { + "epoch": 0.6458078543280337, + "grad_norm": 0.5884724855422974, + "learning_rate": 0.0009193164832178286, + "loss": 3.4855, + "step": 9505 + }, + { + "epoch": 0.6461475743986955, + "grad_norm": 1.3022178411483765, + "learning_rate": 0.0009192740182089958, + "loss": 3.6044, + "step": 9510 + }, + { + "epoch": 0.6464872944693573, + "grad_norm": 0.8824523687362671, + "learning_rate": 0.000919231553200163, + "loss": 3.7968, + "step": 9515 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 0.9790303707122803, + "learning_rate": 0.0009191890881913304, + "loss": 3.8228, + "step": 9520 + }, + { + "epoch": 0.6471667346106807, + "grad_norm": 0.8170008063316345, + "learning_rate": 0.0009191466231824976, + "loss": 3.7198, + "step": 9525 + }, + { + "epoch": 0.6475064546813426, + "grad_norm": 0.796895444393158, + "learning_rate": 0.0009191041581736649, + "loss": 3.8509, + "step": 9530 + }, + { + "epoch": 0.6478461747520043, + "grad_norm": 0.7692844867706299, + "learning_rate": 0.0009190616931648323, + "loss": 3.6541, + "step": 9535 + }, + { + "epoch": 0.6481858948226661, + "grad_norm": 0.8774352669715881, + "learning_rate": 0.0009190192281559995, + "loss": 3.8141, + "step": 9540 + }, + { + "epoch": 0.6485256148933279, + "grad_norm": 1.1431351900100708, + "learning_rate": 0.0009189767631471667, + "loss": 3.8212, + "step": 9545 + }, + { + "epoch": 0.6488653349639897, + "grad_norm": 0.9278925657272339, + "learning_rate": 0.0009189342981383341, + "loss": 3.8392, + "step": 9550 + }, + { + "epoch": 0.6492050550346514, + "grad_norm": 0.6399276256561279, + "learning_rate": 0.0009188918331295013, + "loss": 3.6594, + "step": 9555 + }, + { + "epoch": 0.6495447751053133, + "grad_norm": 0.947661817073822, + "learning_rate": 0.0009188493681206685, + "loss": 3.8217, + "step": 9560 + }, + { + "epoch": 0.649884495175975, + "grad_norm": 0.6042300462722778, + "learning_rate": 0.000918806903111836, + "loss": 3.789, + "step": 9565 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 0.7104882001876831, + "learning_rate": 0.0009187644381030032, + "loss": 3.6344, + "step": 9570 + }, + { + "epoch": 0.6505639353172985, + "grad_norm": 0.8387842178344727, + "learning_rate": 0.0009187219730941704, + "loss": 3.4489, + "step": 9575 + }, + { + "epoch": 0.6509036553879604, + "grad_norm": 0.7712475657463074, + "learning_rate": 0.0009186795080853377, + "loss": 3.6573, + "step": 9580 + }, + { + "epoch": 0.6512433754586221, + "grad_norm": 1.392867922782898, + "learning_rate": 0.000918637043076505, + "loss": 3.5399, + "step": 9585 + }, + { + "epoch": 0.6515830955292838, + "grad_norm": 0.5812051296234131, + "learning_rate": 0.0009185945780676722, + "loss": 3.6652, + "step": 9590 + }, + { + "epoch": 0.6519228155999457, + "grad_norm": 0.8156892657279968, + "learning_rate": 0.0009185521130588395, + "loss": 3.7023, + "step": 9595 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 0.7554821372032166, + "learning_rate": 0.0009185096480500069, + "loss": 3.5153, + "step": 9600 + }, + { + "epoch": 0.6526022557412692, + "grad_norm": 0.6630816459655762, + "learning_rate": 0.0009184671830411741, + "loss": 3.7242, + "step": 9605 + }, + { + "epoch": 0.6529419758119309, + "grad_norm": 0.7791907787322998, + "learning_rate": 0.0009184247180323414, + "loss": 3.7158, + "step": 9610 + }, + { + "epoch": 0.6532816958825928, + "grad_norm": 0.6841757893562317, + "learning_rate": 0.0009183822530235086, + "loss": 3.6576, + "step": 9615 + }, + { + "epoch": 0.6536214159532545, + "grad_norm": 0.8406086564064026, + "learning_rate": 0.0009183397880146759, + "loss": 3.6176, + "step": 9620 + }, + { + "epoch": 0.6539611360239163, + "grad_norm": 0.9318506121635437, + "learning_rate": 0.0009182973230058432, + "loss": 3.7996, + "step": 9625 + }, + { + "epoch": 0.6543008560945781, + "grad_norm": 0.8049618601799011, + "learning_rate": 0.0009182548579970104, + "loss": 3.7004, + "step": 9630 + }, + { + "epoch": 0.6546405761652399, + "grad_norm": 0.7099414467811584, + "learning_rate": 0.0009182123929881778, + "loss": 3.6502, + "step": 9635 + }, + { + "epoch": 0.6549802962359016, + "grad_norm": 0.8906238675117493, + "learning_rate": 0.0009181699279793451, + "loss": 3.6436, + "step": 9640 + }, + { + "epoch": 0.6553200163065634, + "grad_norm": 0.6432847380638123, + "learning_rate": 0.0009181274629705123, + "loss": 3.5352, + "step": 9645 + }, + { + "epoch": 0.6556597363772252, + "grad_norm": 17.792665481567383, + "learning_rate": 0.0009180849979616795, + "loss": 3.3121, + "step": 9650 + }, + { + "epoch": 0.6559994564478869, + "grad_norm": 0.8596221208572388, + "learning_rate": 0.0009180425329528469, + "loss": 3.7113, + "step": 9655 + }, + { + "epoch": 0.6563391765185487, + "grad_norm": 0.78557288646698, + "learning_rate": 0.0009180000679440141, + "loss": 4.0791, + "step": 9660 + }, + { + "epoch": 0.6566788965892105, + "grad_norm": 0.9077975153923035, + "learning_rate": 0.0009179576029351813, + "loss": 3.5989, + "step": 9665 + }, + { + "epoch": 0.6570186166598723, + "grad_norm": 0.7964693903923035, + "learning_rate": 0.0009179151379263488, + "loss": 3.2537, + "step": 9670 + }, + { + "epoch": 0.657358336730534, + "grad_norm": 0.7818307876586914, + "learning_rate": 0.000917872672917516, + "loss": 3.4575, + "step": 9675 + }, + { + "epoch": 0.6576980568011959, + "grad_norm": 0.8340235948562622, + "learning_rate": 0.0009178302079086832, + "loss": 3.9334, + "step": 9680 + }, + { + "epoch": 0.6580377768718576, + "grad_norm": 0.9388173222541809, + "learning_rate": 0.0009177877428998506, + "loss": 3.4556, + "step": 9685 + }, + { + "epoch": 0.6583774969425193, + "grad_norm": 0.9960130453109741, + "learning_rate": 0.0009177452778910178, + "loss": 3.4359, + "step": 9690 + }, + { + "epoch": 0.6587172170131811, + "grad_norm": 0.8852899670600891, + "learning_rate": 0.000917702812882185, + "loss": 3.6864, + "step": 9695 + }, + { + "epoch": 0.6590569370838429, + "grad_norm": 0.7555365562438965, + "learning_rate": 0.0009176603478733524, + "loss": 3.6398, + "step": 9700 + }, + { + "epoch": 0.6593966571545047, + "grad_norm": 0.8369362950325012, + "learning_rate": 0.0009176178828645197, + "loss": 3.6648, + "step": 9705 + }, + { + "epoch": 0.6597363772251664, + "grad_norm": 0.6309359073638916, + "learning_rate": 0.0009175754178556869, + "loss": 3.67, + "step": 9710 + }, + { + "epoch": 0.6600760972958283, + "grad_norm": 0.6799605488777161, + "learning_rate": 0.0009175329528468542, + "loss": 3.6622, + "step": 9715 + }, + { + "epoch": 0.66041581736649, + "grad_norm": 0.6510674357414246, + "learning_rate": 0.0009174904878380215, + "loss": 4.1674, + "step": 9720 + }, + { + "epoch": 0.6607555374371518, + "grad_norm": 0.8291465044021606, + "learning_rate": 0.0009174480228291888, + "loss": 3.7566, + "step": 9725 + }, + { + "epoch": 0.6610952575078136, + "grad_norm": 0.8946384787559509, + "learning_rate": 0.000917405557820356, + "loss": 3.396, + "step": 9730 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 0.6762462854385376, + "learning_rate": 0.0009173630928115233, + "loss": 3.5795, + "step": 9735 + }, + { + "epoch": 0.6617746976491371, + "grad_norm": 0.8949843645095825, + "learning_rate": 0.0009173206278026907, + "loss": 3.5883, + "step": 9740 + }, + { + "epoch": 0.6621144177197988, + "grad_norm": 0.7681940197944641, + "learning_rate": 0.0009172781627938579, + "loss": 3.6638, + "step": 9745 + }, + { + "epoch": 0.6624541377904607, + "grad_norm": 1.3747310638427734, + "learning_rate": 0.0009172356977850252, + "loss": 3.7912, + "step": 9750 + }, + { + "epoch": 0.6627938578611224, + "grad_norm": 0.8486087322235107, + "learning_rate": 0.0009171932327761925, + "loss": 3.8146, + "step": 9755 + }, + { + "epoch": 0.6631335779317842, + "grad_norm": 1.0607984066009521, + "learning_rate": 0.0009171507677673597, + "loss": 3.5644, + "step": 9760 + }, + { + "epoch": 0.663473298002446, + "grad_norm": 0.9152856469154358, + "learning_rate": 0.0009171083027585269, + "loss": 3.3854, + "step": 9765 + }, + { + "epoch": 0.6638130180731078, + "grad_norm": 0.8767904043197632, + "learning_rate": 0.0009170658377496943, + "loss": 3.7754, + "step": 9770 + }, + { + "epoch": 0.6641527381437695, + "grad_norm": 0.9882788062095642, + "learning_rate": 0.0009170233727408616, + "loss": 3.4613, + "step": 9775 + }, + { + "epoch": 0.6644924582144313, + "grad_norm": 0.6091382503509521, + "learning_rate": 0.0009169809077320288, + "loss": 3.9222, + "step": 9780 + }, + { + "epoch": 0.6648321782850931, + "grad_norm": 0.7407724857330322, + "learning_rate": 0.0009169384427231962, + "loss": 3.8842, + "step": 9785 + }, + { + "epoch": 0.6651718983557549, + "grad_norm": 0.6614435911178589, + "learning_rate": 0.0009168959777143634, + "loss": 3.5705, + "step": 9790 + }, + { + "epoch": 0.6655116184264166, + "grad_norm": 1.0121595859527588, + "learning_rate": 0.0009168535127055306, + "loss": 3.4712, + "step": 9795 + }, + { + "epoch": 0.6658513384970784, + "grad_norm": 0.813205361366272, + "learning_rate": 0.000916811047696698, + "loss": 3.4951, + "step": 9800 + }, + { + "epoch": 0.6661910585677402, + "grad_norm": 0.7275193929672241, + "learning_rate": 0.0009167685826878652, + "loss": 3.5991, + "step": 9805 + }, + { + "epoch": 0.6665307786384019, + "grad_norm": 0.6451307535171509, + "learning_rate": 0.0009167261176790325, + "loss": 3.7031, + "step": 9810 + }, + { + "epoch": 0.6668704987090638, + "grad_norm": 0.8923203945159912, + "learning_rate": 0.0009166836526701998, + "loss": 3.8617, + "step": 9815 + }, + { + "epoch": 0.6672102187797255, + "grad_norm": 1.0733754634857178, + "learning_rate": 0.0009166411876613671, + "loss": 3.9135, + "step": 9820 + }, + { + "epoch": 0.6675499388503873, + "grad_norm": 0.8278425335884094, + "learning_rate": 0.0009165987226525343, + "loss": 3.8004, + "step": 9825 + }, + { + "epoch": 0.667889658921049, + "grad_norm": 1.4794338941574097, + "learning_rate": 0.0009165562576437016, + "loss": 3.6806, + "step": 9830 + }, + { + "epoch": 0.6682293789917109, + "grad_norm": 0.8013981580734253, + "learning_rate": 0.0009165137926348689, + "loss": 3.6009, + "step": 9835 + }, + { + "epoch": 0.6685690990623726, + "grad_norm": 0.7682388424873352, + "learning_rate": 0.0009164713276260361, + "loss": 3.6842, + "step": 9840 + }, + { + "epoch": 0.6689088191330343, + "grad_norm": 0.785372793674469, + "learning_rate": 0.0009164288626172035, + "loss": 3.7051, + "step": 9845 + }, + { + "epoch": 0.6692485392036962, + "grad_norm": 0.9392538666725159, + "learning_rate": 0.0009163863976083708, + "loss": 3.5854, + "step": 9850 + }, + { + "epoch": 0.6695882592743579, + "grad_norm": 0.974972128868103, + "learning_rate": 0.000916343932599538, + "loss": 3.9638, + "step": 9855 + }, + { + "epoch": 0.6699279793450197, + "grad_norm": 0.651196300983429, + "learning_rate": 0.0009163014675907053, + "loss": 3.7781, + "step": 9860 + }, + { + "epoch": 0.6702676994156814, + "grad_norm": 0.6594703793525696, + "learning_rate": 0.0009162590025818725, + "loss": 3.814, + "step": 9865 + }, + { + "epoch": 0.6706074194863433, + "grad_norm": 0.8552442193031311, + "learning_rate": 0.0009162165375730398, + "loss": 3.5257, + "step": 9870 + }, + { + "epoch": 0.670947139557005, + "grad_norm": 0.5921844840049744, + "learning_rate": 0.0009161740725642071, + "loss": 4.0512, + "step": 9875 + }, + { + "epoch": 0.6712868596276668, + "grad_norm": 0.7089529633522034, + "learning_rate": 0.0009161316075553744, + "loss": 3.7637, + "step": 9880 + }, + { + "epoch": 0.6716265796983286, + "grad_norm": 1.073927402496338, + "learning_rate": 0.0009160891425465417, + "loss": 3.7191, + "step": 9885 + }, + { + "epoch": 0.6719662997689904, + "grad_norm": 0.6456983685493469, + "learning_rate": 0.000916046677537709, + "loss": 3.5801, + "step": 9890 + }, + { + "epoch": 0.6723060198396521, + "grad_norm": 0.720731258392334, + "learning_rate": 0.0009160042125288762, + "loss": 3.5136, + "step": 9895 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 0.7477688789367676, + "learning_rate": 0.0009159617475200434, + "loss": 3.7381, + "step": 9900 + }, + { + "epoch": 0.6729854599809757, + "grad_norm": 0.7337789535522461, + "learning_rate": 0.0009159192825112108, + "loss": 3.4277, + "step": 9905 + }, + { + "epoch": 0.6733251800516374, + "grad_norm": 0.7063499093055725, + "learning_rate": 0.000915876817502378, + "loss": 3.7902, + "step": 9910 + }, + { + "epoch": 0.6736649001222992, + "grad_norm": 0.7141169905662537, + "learning_rate": 0.0009158343524935453, + "loss": 3.781, + "step": 9915 + }, + { + "epoch": 0.674004620192961, + "grad_norm": 0.8094909191131592, + "learning_rate": 0.0009157918874847127, + "loss": 3.7012, + "step": 9920 + }, + { + "epoch": 0.6743443402636228, + "grad_norm": 0.770063579082489, + "learning_rate": 0.0009157494224758799, + "loss": 3.5749, + "step": 9925 + }, + { + "epoch": 0.6746840603342845, + "grad_norm": 0.6072184443473816, + "learning_rate": 0.0009157069574670471, + "loss": 3.6699, + "step": 9930 + }, + { + "epoch": 0.6750237804049464, + "grad_norm": 0.8047200441360474, + "learning_rate": 0.0009156644924582145, + "loss": 3.7258, + "step": 9935 + }, + { + "epoch": 0.6753635004756081, + "grad_norm": 1.0450642108917236, + "learning_rate": 0.0009156220274493817, + "loss": 3.7893, + "step": 9940 + }, + { + "epoch": 0.6757032205462699, + "grad_norm": 0.7630648612976074, + "learning_rate": 0.0009155795624405489, + "loss": 3.6754, + "step": 9945 + }, + { + "epoch": 0.6760429406169316, + "grad_norm": 0.6600102186203003, + "learning_rate": 0.0009155370974317164, + "loss": 3.8355, + "step": 9950 + }, + { + "epoch": 0.6763826606875935, + "grad_norm": 0.7620962858200073, + "learning_rate": 0.0009154946324228836, + "loss": 3.5419, + "step": 9955 + }, + { + "epoch": 0.6767223807582552, + "grad_norm": 0.8106443881988525, + "learning_rate": 0.0009154521674140508, + "loss": 3.6947, + "step": 9960 + }, + { + "epoch": 0.6770621008289169, + "grad_norm": 0.7598016858100891, + "learning_rate": 0.0009154097024052181, + "loss": 3.7201, + "step": 9965 + }, + { + "epoch": 0.6774018208995788, + "grad_norm": 0.8163079619407654, + "learning_rate": 0.0009153672373963854, + "loss": 3.5775, + "step": 9970 + }, + { + "epoch": 0.6777415409702405, + "grad_norm": 0.8379268050193787, + "learning_rate": 0.0009153247723875526, + "loss": 3.6143, + "step": 9975 + }, + { + "epoch": 0.6780812610409023, + "grad_norm": 0.7280101776123047, + "learning_rate": 0.00091528230737872, + "loss": 3.6799, + "step": 9980 + }, + { + "epoch": 0.6784209811115641, + "grad_norm": 0.8426733016967773, + "learning_rate": 0.0009152398423698873, + "loss": 3.5516, + "step": 9985 + }, + { + "epoch": 0.6787607011822259, + "grad_norm": 0.6634572744369507, + "learning_rate": 0.0009151973773610545, + "loss": 3.6674, + "step": 9990 + }, + { + "epoch": 0.6791004212528876, + "grad_norm": 0.6985065340995789, + "learning_rate": 0.0009151549123522218, + "loss": 3.4302, + "step": 9995 + }, + { + "epoch": 0.6794401413235494, + "grad_norm": 0.7582478523254395, + "learning_rate": 0.000915112447343389, + "loss": 3.5749, + "step": 10000 + }, + { + "epoch": 0.6797798613942112, + "grad_norm": 0.7918283939361572, + "learning_rate": 0.0009150699823345563, + "loss": 3.8646, + "step": 10005 + }, + { + "epoch": 0.680119581464873, + "grad_norm": 0.9313593506813049, + "learning_rate": 0.0009150275173257236, + "loss": 3.7706, + "step": 10010 + }, + { + "epoch": 0.6804593015355347, + "grad_norm": 0.8055834174156189, + "learning_rate": 0.0009149850523168909, + "loss": 3.6489, + "step": 10015 + }, + { + "epoch": 0.6807990216061965, + "grad_norm": 0.8774704933166504, + "learning_rate": 0.0009149425873080582, + "loss": 3.5921, + "step": 10020 + }, + { + "epoch": 0.6811387416768583, + "grad_norm": 1.7999626398086548, + "learning_rate": 0.0009149001222992255, + "loss": 3.7882, + "step": 10025 + }, + { + "epoch": 0.68147846174752, + "grad_norm": 0.7506680488586426, + "learning_rate": 0.0009148661502921592, + "loss": 3.6007, + "step": 10030 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.7653728127479553, + "learning_rate": 0.0009148236852833266, + "loss": 3.9545, + "step": 10035 + }, + { + "epoch": 0.6821579018888436, + "grad_norm": 0.7537739872932434, + "learning_rate": 0.0009147812202744939, + "loss": 3.6305, + "step": 10040 + }, + { + "epoch": 0.6824976219595054, + "grad_norm": 0.762178897857666, + "learning_rate": 0.0009147387552656611, + "loss": 3.6171, + "step": 10045 + }, + { + "epoch": 0.6828373420301671, + "grad_norm": 0.8478651642799377, + "learning_rate": 0.0009146962902568284, + "loss": 3.5191, + "step": 10050 + }, + { + "epoch": 0.683177062100829, + "grad_norm": 0.7584719657897949, + "learning_rate": 0.0009146538252479957, + "loss": 3.8313, + "step": 10055 + }, + { + "epoch": 0.6835167821714907, + "grad_norm": 0.6737688779830933, + "learning_rate": 0.0009146113602391629, + "loss": 3.9341, + "step": 10060 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 0.9122517704963684, + "learning_rate": 0.0009145688952303302, + "loss": 3.7132, + "step": 10065 + }, + { + "epoch": 0.6841962223128143, + "grad_norm": 0.6977302432060242, + "learning_rate": 0.0009145264302214975, + "loss": 3.9523, + "step": 10070 + }, + { + "epoch": 0.684535942383476, + "grad_norm": 1.052219271659851, + "learning_rate": 0.0009144839652126648, + "loss": 3.4009, + "step": 10075 + }, + { + "epoch": 0.6848756624541378, + "grad_norm": 0.8568553924560547, + "learning_rate": 0.0009144415002038321, + "loss": 3.6175, + "step": 10080 + }, + { + "epoch": 0.6852153825247995, + "grad_norm": 0.7441984415054321, + "learning_rate": 0.0009143990351949994, + "loss": 3.7121, + "step": 10085 + }, + { + "epoch": 0.6855551025954614, + "grad_norm": 0.8238592147827148, + "learning_rate": 0.0009143565701861666, + "loss": 3.5764, + "step": 10090 + }, + { + "epoch": 0.6858948226661231, + "grad_norm": 0.8596791625022888, + "learning_rate": 0.0009143141051773339, + "loss": 3.5909, + "step": 10095 + }, + { + "epoch": 0.6862345427367849, + "grad_norm": 1.11699640750885, + "learning_rate": 0.0009142716401685011, + "loss": 3.793, + "step": 10100 + }, + { + "epoch": 0.6865742628074467, + "grad_norm": 0.726455569267273, + "learning_rate": 0.0009142291751596684, + "loss": 3.6004, + "step": 10105 + }, + { + "epoch": 0.6869139828781085, + "grad_norm": 0.6608596444129944, + "learning_rate": 0.0009141867101508358, + "loss": 3.6617, + "step": 10110 + }, + { + "epoch": 0.6872537029487702, + "grad_norm": 0.7305247783660889, + "learning_rate": 0.000914144245142003, + "loss": 3.7902, + "step": 10115 + }, + { + "epoch": 0.6875934230194319, + "grad_norm": 1.058485507965088, + "learning_rate": 0.0009141017801331703, + "loss": 3.4352, + "step": 10120 + }, + { + "epoch": 0.6879331430900938, + "grad_norm": 0.798701822757721, + "learning_rate": 0.0009140593151243376, + "loss": 3.7692, + "step": 10125 + }, + { + "epoch": 0.6882728631607555, + "grad_norm": 0.8318188786506653, + "learning_rate": 0.0009140168501155048, + "loss": 3.515, + "step": 10130 + }, + { + "epoch": 0.6886125832314173, + "grad_norm": 0.6416201591491699, + "learning_rate": 0.000913974385106672, + "loss": 3.7346, + "step": 10135 + }, + { + "epoch": 0.6889523033020791, + "grad_norm": 1.276743769645691, + "learning_rate": 0.0009139319200978394, + "loss": 3.5391, + "step": 10140 + }, + { + "epoch": 0.6892920233727409, + "grad_norm": 1.0988878011703491, + "learning_rate": 0.0009138894550890067, + "loss": 3.8635, + "step": 10145 + }, + { + "epoch": 0.6896317434434026, + "grad_norm": 0.7121266722679138, + "learning_rate": 0.0009138469900801739, + "loss": 3.7859, + "step": 10150 + }, + { + "epoch": 0.6899714635140645, + "grad_norm": 0.8918036222457886, + "learning_rate": 0.0009138045250713413, + "loss": 3.6966, + "step": 10155 + }, + { + "epoch": 0.6903111835847262, + "grad_norm": 1.0426661968231201, + "learning_rate": 0.0009137620600625085, + "loss": 3.6189, + "step": 10160 + }, + { + "epoch": 0.690650903655388, + "grad_norm": 0.8264529705047607, + "learning_rate": 0.0009137195950536757, + "loss": 3.5792, + "step": 10165 + }, + { + "epoch": 0.6909906237260497, + "grad_norm": 0.7852846384048462, + "learning_rate": 0.0009136771300448431, + "loss": 3.598, + "step": 10170 + }, + { + "epoch": 0.6913303437967115, + "grad_norm": 0.7890053391456604, + "learning_rate": 0.0009136346650360103, + "loss": 3.4883, + "step": 10175 + }, + { + "epoch": 0.6916700638673733, + "grad_norm": 0.8568416237831116, + "learning_rate": 0.0009135922000271776, + "loss": 3.8862, + "step": 10180 + }, + { + "epoch": 0.692009783938035, + "grad_norm": 0.8177277445793152, + "learning_rate": 0.000913549735018345, + "loss": 3.3551, + "step": 10185 + }, + { + "epoch": 0.6923495040086969, + "grad_norm": 0.7934178113937378, + "learning_rate": 0.0009135072700095122, + "loss": 3.5482, + "step": 10190 + }, + { + "epoch": 0.6926892240793586, + "grad_norm": 0.6681546568870544, + "learning_rate": 0.0009134648050006794, + "loss": 3.7322, + "step": 10195 + }, + { + "epoch": 0.6930289441500204, + "grad_norm": 0.8195094466209412, + "learning_rate": 0.0009134223399918467, + "loss": 3.652, + "step": 10200 + }, + { + "epoch": 0.6933686642206821, + "grad_norm": 2.556976318359375, + "learning_rate": 0.000913379874983014, + "loss": 3.4268, + "step": 10205 + }, + { + "epoch": 0.693708384291344, + "grad_norm": 0.6802579164505005, + "learning_rate": 0.0009133374099741812, + "loss": 3.6377, + "step": 10210 + }, + { + "epoch": 0.6940481043620057, + "grad_norm": 0.7767915725708008, + "learning_rate": 0.0009132949449653486, + "loss": 3.7851, + "step": 10215 + }, + { + "epoch": 0.6943878244326674, + "grad_norm": 1.0303291082382202, + "learning_rate": 0.0009132524799565159, + "loss": 3.7536, + "step": 10220 + }, + { + "epoch": 0.6947275445033293, + "grad_norm": 1.08896005153656, + "learning_rate": 0.0009132100149476831, + "loss": 3.7167, + "step": 10225 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 0.7559021711349487, + "learning_rate": 0.0009131675499388504, + "loss": 3.7227, + "step": 10230 + }, + { + "epoch": 0.6954069846446528, + "grad_norm": 0.7872931957244873, + "learning_rate": 0.0009131250849300176, + "loss": 3.7082, + "step": 10235 + }, + { + "epoch": 0.6957467047153146, + "grad_norm": 0.6334084868431091, + "learning_rate": 0.0009130826199211849, + "loss": 3.6569, + "step": 10240 + }, + { + "epoch": 0.6960864247859764, + "grad_norm": 0.8038175106048584, + "learning_rate": 0.0009130401549123523, + "loss": 3.8556, + "step": 10245 + }, + { + "epoch": 0.6964261448566381, + "grad_norm": 0.7176482677459717, + "learning_rate": 0.0009129976899035195, + "loss": 3.6775, + "step": 10250 + }, + { + "epoch": 0.6967658649272999, + "grad_norm": 1.118554949760437, + "learning_rate": 0.0009129552248946868, + "loss": 3.582, + "step": 10255 + }, + { + "epoch": 0.6971055849979617, + "grad_norm": 1.1085302829742432, + "learning_rate": 0.0009129127598858541, + "loss": 3.7009, + "step": 10260 + }, + { + "epoch": 0.6974453050686235, + "grad_norm": 1.0089564323425293, + "learning_rate": 0.0009128702948770213, + "loss": 3.6758, + "step": 10265 + }, + { + "epoch": 0.6977850251392852, + "grad_norm": 0.7915838956832886, + "learning_rate": 0.0009128278298681887, + "loss": 3.8029, + "step": 10270 + }, + { + "epoch": 0.698124745209947, + "grad_norm": 0.8801011443138123, + "learning_rate": 0.0009127853648593559, + "loss": 3.5682, + "step": 10275 + }, + { + "epoch": 0.6984644652806088, + "grad_norm": 0.81932133436203, + "learning_rate": 0.0009127428998505232, + "loss": 3.6816, + "step": 10280 + }, + { + "epoch": 0.6988041853512705, + "grad_norm": 1.302024245262146, + "learning_rate": 0.0009127004348416906, + "loss": 3.6259, + "step": 10285 + }, + { + "epoch": 0.6991439054219323, + "grad_norm": 0.7697573900222778, + "learning_rate": 0.0009126579698328578, + "loss": 3.5554, + "step": 10290 + }, + { + "epoch": 0.6994836254925941, + "grad_norm": 0.781768262386322, + "learning_rate": 0.000912615504824025, + "loss": 3.3851, + "step": 10295 + }, + { + "epoch": 0.6998233455632559, + "grad_norm": 0.7274905443191528, + "learning_rate": 0.0009125730398151923, + "loss": 3.649, + "step": 10300 + }, + { + "epoch": 0.7001630656339176, + "grad_norm": 1.0677140951156616, + "learning_rate": 0.0009125305748063596, + "loss": 3.5489, + "step": 10305 + }, + { + "epoch": 0.7005027857045795, + "grad_norm": 0.8634451627731323, + "learning_rate": 0.0009124881097975268, + "loss": 3.5771, + "step": 10310 + }, + { + "epoch": 0.7008425057752412, + "grad_norm": 0.8360048532485962, + "learning_rate": 0.0009124456447886942, + "loss": 3.661, + "step": 10315 + }, + { + "epoch": 0.701182225845903, + "grad_norm": 0.7537590861320496, + "learning_rate": 0.0009124031797798615, + "loss": 3.5378, + "step": 10320 + }, + { + "epoch": 0.7015219459165648, + "grad_norm": 0.7110024094581604, + "learning_rate": 0.0009123607147710287, + "loss": 3.6149, + "step": 10325 + }, + { + "epoch": 0.7018616659872265, + "grad_norm": 0.8229530453681946, + "learning_rate": 0.000912318249762196, + "loss": 3.6833, + "step": 10330 + }, + { + "epoch": 0.7022013860578883, + "grad_norm": 0.704639732837677, + "learning_rate": 0.0009122757847533632, + "loss": 3.5929, + "step": 10335 + }, + { + "epoch": 0.70254110612855, + "grad_norm": 0.723289430141449, + "learning_rate": 0.0009122333197445305, + "loss": 3.5283, + "step": 10340 + }, + { + "epoch": 0.7028808261992119, + "grad_norm": 0.7643259763717651, + "learning_rate": 0.0009121908547356978, + "loss": 3.8116, + "step": 10345 + }, + { + "epoch": 0.7032205462698736, + "grad_norm": 0.6214438676834106, + "learning_rate": 0.0009121483897268651, + "loss": 3.5953, + "step": 10350 + }, + { + "epoch": 0.7035602663405354, + "grad_norm": 0.7565822601318359, + "learning_rate": 0.0009121059247180324, + "loss": 3.7008, + "step": 10355 + }, + { + "epoch": 0.7038999864111972, + "grad_norm": 0.8599632978439331, + "learning_rate": 0.0009120634597091997, + "loss": 3.895, + "step": 10360 + }, + { + "epoch": 0.704239706481859, + "grad_norm": 0.9048056602478027, + "learning_rate": 0.0009120209947003669, + "loss": 3.4133, + "step": 10365 + }, + { + "epoch": 0.7045794265525207, + "grad_norm": 1.023110270500183, + "learning_rate": 0.0009119785296915342, + "loss": 3.734, + "step": 10370 + }, + { + "epoch": 0.7049191466231824, + "grad_norm": 0.773080050945282, + "learning_rate": 0.0009119360646827015, + "loss": 3.5872, + "step": 10375 + }, + { + "epoch": 0.7052588666938443, + "grad_norm": 0.6238204836845398, + "learning_rate": 0.0009118935996738687, + "loss": 3.7772, + "step": 10380 + }, + { + "epoch": 0.705598586764506, + "grad_norm": 0.8456680178642273, + "learning_rate": 0.000911851134665036, + "loss": 3.6887, + "step": 10385 + }, + { + "epoch": 0.7059383068351678, + "grad_norm": 1.0712569952011108, + "learning_rate": 0.0009118086696562034, + "loss": 3.6957, + "step": 10390 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 0.7546868324279785, + "learning_rate": 0.0009117662046473706, + "loss": 3.7577, + "step": 10395 + }, + { + "epoch": 0.7066177469764914, + "grad_norm": 0.6391757130622864, + "learning_rate": 0.0009117237396385378, + "loss": 3.6614, + "step": 10400 + }, + { + "epoch": 0.7069574670471531, + "grad_norm": 0.9013659358024597, + "learning_rate": 0.0009116812746297052, + "loss": 3.5654, + "step": 10405 + }, + { + "epoch": 0.707297187117815, + "grad_norm": 0.6841161847114563, + "learning_rate": 0.0009116388096208724, + "loss": 3.7218, + "step": 10410 + }, + { + "epoch": 0.7076369071884767, + "grad_norm": 0.7555821537971497, + "learning_rate": 0.0009115963446120396, + "loss": 3.7552, + "step": 10415 + }, + { + "epoch": 0.7079766272591385, + "grad_norm": 0.778216540813446, + "learning_rate": 0.0009115538796032071, + "loss": 3.7708, + "step": 10420 + }, + { + "epoch": 0.7083163473298002, + "grad_norm": 0.7501063942909241, + "learning_rate": 0.0009115114145943743, + "loss": 3.8126, + "step": 10425 + }, + { + "epoch": 0.708656067400462, + "grad_norm": 0.7453083992004395, + "learning_rate": 0.0009114689495855415, + "loss": 3.9305, + "step": 10430 + }, + { + "epoch": 0.7089957874711238, + "grad_norm": 0.675055980682373, + "learning_rate": 0.0009114264845767088, + "loss": 3.8111, + "step": 10435 + }, + { + "epoch": 0.7093355075417855, + "grad_norm": 0.6732578277587891, + "learning_rate": 0.0009113840195678761, + "loss": 3.8446, + "step": 10440 + }, + { + "epoch": 0.7096752276124474, + "grad_norm": 0.7732366919517517, + "learning_rate": 0.0009113415545590433, + "loss": 3.8107, + "step": 10445 + }, + { + "epoch": 0.7100149476831091, + "grad_norm": 0.7138248682022095, + "learning_rate": 0.0009112990895502106, + "loss": 3.7337, + "step": 10450 + }, + { + "epoch": 0.7103546677537709, + "grad_norm": 0.9110668897628784, + "learning_rate": 0.000911256624541378, + "loss": 3.8216, + "step": 10455 + }, + { + "epoch": 0.7106943878244326, + "grad_norm": 0.8174070715904236, + "learning_rate": 0.0009112141595325452, + "loss": 3.6795, + "step": 10460 + }, + { + "epoch": 0.7110341078950945, + "grad_norm": 0.7637075185775757, + "learning_rate": 0.0009111716945237125, + "loss": 3.6306, + "step": 10465 + }, + { + "epoch": 0.7113738279657562, + "grad_norm": 0.6081861853599548, + "learning_rate": 0.0009111292295148798, + "loss": 3.8034, + "step": 10470 + }, + { + "epoch": 0.711713548036418, + "grad_norm": 1.0641868114471436, + "learning_rate": 0.000911086764506047, + "loss": 3.6187, + "step": 10475 + }, + { + "epoch": 0.7120532681070798, + "grad_norm": 0.6701398491859436, + "learning_rate": 0.0009110442994972143, + "loss": 3.6542, + "step": 10480 + }, + { + "epoch": 0.7123929881777415, + "grad_norm": 0.776578962802887, + "learning_rate": 0.0009110018344883815, + "loss": 3.3623, + "step": 10485 + }, + { + "epoch": 0.7127327082484033, + "grad_norm": 1.4547529220581055, + "learning_rate": 0.0009109593694795489, + "loss": 3.7083, + "step": 10490 + }, + { + "epoch": 0.7130724283190651, + "grad_norm": 1.174072504043579, + "learning_rate": 0.0009109169044707162, + "loss": 3.6561, + "step": 10495 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.8077863454818726, + "learning_rate": 0.0009108744394618834, + "loss": 3.6596, + "step": 10500 + }, + { + "epoch": 0.7137518684603886, + "grad_norm": 0.8859773874282837, + "learning_rate": 0.0009108319744530507, + "loss": 3.7068, + "step": 10505 + }, + { + "epoch": 0.7140915885310504, + "grad_norm": 0.6377485394477844, + "learning_rate": 0.000910789509444218, + "loss": 3.9528, + "step": 10510 + }, + { + "epoch": 0.7144313086017122, + "grad_norm": 0.7322657704353333, + "learning_rate": 0.0009107470444353852, + "loss": 3.5683, + "step": 10515 + }, + { + "epoch": 0.714771028672374, + "grad_norm": 1.0763368606567383, + "learning_rate": 0.0009107045794265524, + "loss": 3.6345, + "step": 10520 + }, + { + "epoch": 0.7151107487430357, + "grad_norm": 0.8127692341804504, + "learning_rate": 0.0009106621144177199, + "loss": 3.6295, + "step": 10525 + }, + { + "epoch": 0.7154504688136976, + "grad_norm": 0.8383661508560181, + "learning_rate": 0.0009106196494088871, + "loss": 3.5113, + "step": 10530 + }, + { + "epoch": 0.7157901888843593, + "grad_norm": 0.575936496257782, + "learning_rate": 0.0009105771844000543, + "loss": 3.7219, + "step": 10535 + }, + { + "epoch": 0.716129908955021, + "grad_norm": 0.8582032322883606, + "learning_rate": 0.0009105347193912217, + "loss": 3.7142, + "step": 10540 + }, + { + "epoch": 0.7164696290256828, + "grad_norm": 0.8073345422744751, + "learning_rate": 0.0009104922543823889, + "loss": 3.5414, + "step": 10545 + }, + { + "epoch": 0.7168093490963446, + "grad_norm": 0.8117555975914001, + "learning_rate": 0.0009104497893735561, + "loss": 3.7768, + "step": 10550 + }, + { + "epoch": 0.7171490691670064, + "grad_norm": 0.6615213751792908, + "learning_rate": 0.0009104073243647235, + "loss": 3.7672, + "step": 10555 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.632484495639801, + "learning_rate": 0.0009103648593558908, + "loss": 3.7353, + "step": 10560 + }, + { + "epoch": 0.71782850930833, + "grad_norm": 0.5849599838256836, + "learning_rate": 0.000910322394347058, + "loss": 3.7964, + "step": 10565 + }, + { + "epoch": 0.7181682293789917, + "grad_norm": 0.7616914510726929, + "learning_rate": 0.0009102799293382254, + "loss": 3.861, + "step": 10570 + }, + { + "epoch": 0.7185079494496535, + "grad_norm": 0.9604212045669556, + "learning_rate": 0.0009102374643293926, + "loss": 3.635, + "step": 10575 + }, + { + "epoch": 0.7188476695203153, + "grad_norm": 0.9088853597640991, + "learning_rate": 0.0009101949993205598, + "loss": 3.6579, + "step": 10580 + }, + { + "epoch": 0.719187389590977, + "grad_norm": 0.9321430921554565, + "learning_rate": 0.0009101525343117271, + "loss": 3.4627, + "step": 10585 + }, + { + "epoch": 0.7195271096616388, + "grad_norm": 0.8103867769241333, + "learning_rate": 0.0009101100693028944, + "loss": 3.8875, + "step": 10590 + }, + { + "epoch": 0.7198668297323005, + "grad_norm": 0.7786688208580017, + "learning_rate": 0.0009100676042940617, + "loss": 3.4687, + "step": 10595 + }, + { + "epoch": 0.7202065498029624, + "grad_norm": 0.6908038258552551, + "learning_rate": 0.000910025139285229, + "loss": 3.5895, + "step": 10600 + }, + { + "epoch": 0.7205462698736241, + "grad_norm": 0.8303772211074829, + "learning_rate": 0.0009099826742763963, + "loss": 3.7636, + "step": 10605 + }, + { + "epoch": 0.7208859899442859, + "grad_norm": 0.6276874542236328, + "learning_rate": 0.0009099402092675636, + "loss": 3.733, + "step": 10610 + }, + { + "epoch": 0.7212257100149477, + "grad_norm": 0.6042386293411255, + "learning_rate": 0.0009098977442587308, + "loss": 3.6715, + "step": 10615 + }, + { + "epoch": 0.7215654300856095, + "grad_norm": 0.7888607382774353, + "learning_rate": 0.000909855279249898, + "loss": 3.661, + "step": 10620 + }, + { + "epoch": 0.7219051501562712, + "grad_norm": 0.6830819249153137, + "learning_rate": 0.0009098128142410654, + "loss": 3.8368, + "step": 10625 + }, + { + "epoch": 0.722244870226933, + "grad_norm": 0.7115374803543091, + "learning_rate": 0.0009097703492322327, + "loss": 3.7383, + "step": 10630 + }, + { + "epoch": 0.7225845902975948, + "grad_norm": 0.6878383159637451, + "learning_rate": 0.0009097278842233999, + "loss": 3.64, + "step": 10635 + }, + { + "epoch": 0.7229243103682566, + "grad_norm": 0.7130516171455383, + "learning_rate": 0.0009096854192145673, + "loss": 3.6298, + "step": 10640 + }, + { + "epoch": 0.7232640304389183, + "grad_norm": 1.088282585144043, + "learning_rate": 0.0009096429542057345, + "loss": 3.7195, + "step": 10645 + }, + { + "epoch": 0.7236037505095801, + "grad_norm": 1.0500643253326416, + "learning_rate": 0.0009096004891969017, + "loss": 3.5352, + "step": 10650 + }, + { + "epoch": 0.7239434705802419, + "grad_norm": 0.8064992427825928, + "learning_rate": 0.0009095580241880691, + "loss": 3.6189, + "step": 10655 + }, + { + "epoch": 0.7242831906509036, + "grad_norm": 0.750442624092102, + "learning_rate": 0.0009095155591792363, + "loss": 3.2585, + "step": 10660 + }, + { + "epoch": 0.7246229107215655, + "grad_norm": 0.757142961025238, + "learning_rate": 0.0009094730941704036, + "loss": 3.7234, + "step": 10665 + }, + { + "epoch": 0.7249626307922272, + "grad_norm": 0.9993300437927246, + "learning_rate": 0.000909430629161571, + "loss": 3.6642, + "step": 10670 + }, + { + "epoch": 0.725302350862889, + "grad_norm": 0.793341338634491, + "learning_rate": 0.0009093881641527382, + "loss": 3.7696, + "step": 10675 + }, + { + "epoch": 0.7256420709335507, + "grad_norm": 0.7660190463066101, + "learning_rate": 0.0009093456991439054, + "loss": 3.7425, + "step": 10680 + }, + { + "epoch": 0.7259817910042126, + "grad_norm": 0.962237536907196, + "learning_rate": 0.0009093032341350727, + "loss": 3.6989, + "step": 10685 + }, + { + "epoch": 0.7263215110748743, + "grad_norm": 0.7346736192703247, + "learning_rate": 0.00090926076912624, + "loss": 3.7143, + "step": 10690 + }, + { + "epoch": 0.726661231145536, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0009092183041174072, + "loss": 3.7265, + "step": 10695 + }, + { + "epoch": 0.7270009512161979, + "grad_norm": 0.9078261852264404, + "learning_rate": 0.0009091758391085746, + "loss": 3.8569, + "step": 10700 + }, + { + "epoch": 0.7273406712868596, + "grad_norm": 1.2587934732437134, + "learning_rate": 0.0009091333740997419, + "loss": 3.7729, + "step": 10705 + }, + { + "epoch": 0.7276803913575214, + "grad_norm": 0.8130671977996826, + "learning_rate": 0.0009090909090909091, + "loss": 3.8764, + "step": 10710 + }, + { + "epoch": 0.7280201114281831, + "grad_norm": 0.6780834197998047, + "learning_rate": 0.0009090484440820764, + "loss": 3.8441, + "step": 10715 + }, + { + "epoch": 0.728359831498845, + "grad_norm": 1.2454288005828857, + "learning_rate": 0.0009090059790732437, + "loss": 3.4576, + "step": 10720 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 0.6150233745574951, + "learning_rate": 0.0009089635140644109, + "loss": 3.6894, + "step": 10725 + }, + { + "epoch": 0.7290392716401685, + "grad_norm": 0.684750497341156, + "learning_rate": 0.0009089210490555782, + "loss": 3.5065, + "step": 10730 + }, + { + "epoch": 0.7293789917108303, + "grad_norm": 0.805694580078125, + "learning_rate": 0.0009088785840467455, + "loss": 3.8185, + "step": 10735 + }, + { + "epoch": 0.7297187117814921, + "grad_norm": 0.751578152179718, + "learning_rate": 0.0009088361190379128, + "loss": 3.828, + "step": 10740 + }, + { + "epoch": 0.7300584318521538, + "grad_norm": 0.7956777215003967, + "learning_rate": 0.0009087936540290801, + "loss": 3.5713, + "step": 10745 + }, + { + "epoch": 0.7303981519228157, + "grad_norm": 1.052182912826538, + "learning_rate": 0.0009087511890202473, + "loss": 3.6827, + "step": 10750 + }, + { + "epoch": 0.7307378719934774, + "grad_norm": 1.094173789024353, + "learning_rate": 0.0009087087240114146, + "loss": 3.7056, + "step": 10755 + }, + { + "epoch": 0.7310775920641391, + "grad_norm": 0.8195741772651672, + "learning_rate": 0.0009086662590025819, + "loss": 3.863, + "step": 10760 + }, + { + "epoch": 0.7314173121348009, + "grad_norm": 0.8643901348114014, + "learning_rate": 0.0009086237939937491, + "loss": 3.7196, + "step": 10765 + }, + { + "epoch": 0.7317570322054627, + "grad_norm": 0.7417980432510376, + "learning_rate": 0.0009085813289849165, + "loss": 3.5426, + "step": 10770 + }, + { + "epoch": 0.7320967522761245, + "grad_norm": 0.6818327307701111, + "learning_rate": 0.0009085388639760838, + "loss": 3.849, + "step": 10775 + }, + { + "epoch": 0.7324364723467862, + "grad_norm": 0.6863572597503662, + "learning_rate": 0.000908496398967251, + "loss": 3.699, + "step": 10780 + }, + { + "epoch": 0.7327761924174481, + "grad_norm": 0.8659505248069763, + "learning_rate": 0.0009084539339584182, + "loss": 3.7385, + "step": 10785 + }, + { + "epoch": 0.7331159124881098, + "grad_norm": 0.8008067607879639, + "learning_rate": 0.0009084114689495856, + "loss": 3.5411, + "step": 10790 + }, + { + "epoch": 0.7334556325587716, + "grad_norm": 0.8581641316413879, + "learning_rate": 0.0009083690039407528, + "loss": 3.6069, + "step": 10795 + }, + { + "epoch": 0.7337953526294333, + "grad_norm": 0.8889230489730835, + "learning_rate": 0.00090832653893192, + "loss": 3.6149, + "step": 10800 + }, + { + "epoch": 0.7341350727000951, + "grad_norm": 0.8319715857505798, + "learning_rate": 0.0009082840739230875, + "loss": 3.7764, + "step": 10805 + }, + { + "epoch": 0.7344747927707569, + "grad_norm": 0.7290542125701904, + "learning_rate": 0.0009082416089142547, + "loss": 3.7827, + "step": 10810 + }, + { + "epoch": 0.7348145128414186, + "grad_norm": 0.815315306186676, + "learning_rate": 0.0009081991439054219, + "loss": 3.7798, + "step": 10815 + }, + { + "epoch": 0.7351542329120805, + "grad_norm": 0.7303261756896973, + "learning_rate": 0.0009081566788965893, + "loss": 3.6682, + "step": 10820 + }, + { + "epoch": 0.7354939529827422, + "grad_norm": 0.6496320962905884, + "learning_rate": 0.0009081142138877565, + "loss": 3.7951, + "step": 10825 + }, + { + "epoch": 0.735833673053404, + "grad_norm": 0.7587418556213379, + "learning_rate": 0.0009080717488789237, + "loss": 3.5977, + "step": 10830 + }, + { + "epoch": 0.7361733931240658, + "grad_norm": 0.7920283675193787, + "learning_rate": 0.0009080292838700911, + "loss": 3.7328, + "step": 10835 + }, + { + "epoch": 0.7365131131947276, + "grad_norm": 0.7987052202224731, + "learning_rate": 0.0009079868188612584, + "loss": 3.5526, + "step": 10840 + }, + { + "epoch": 0.7368528332653893, + "grad_norm": 3.789886236190796, + "learning_rate": 0.0009079443538524256, + "loss": 3.5452, + "step": 10845 + }, + { + "epoch": 0.737192553336051, + "grad_norm": 1.2110261917114258, + "learning_rate": 0.0009079018888435929, + "loss": 3.7486, + "step": 10850 + }, + { + "epoch": 0.7375322734067129, + "grad_norm": 0.6987991333007812, + "learning_rate": 0.0009078594238347602, + "loss": 3.6386, + "step": 10855 + }, + { + "epoch": 0.7378719934773746, + "grad_norm": 0.7499234676361084, + "learning_rate": 0.0009078169588259274, + "loss": 3.7741, + "step": 10860 + }, + { + "epoch": 0.7382117135480364, + "grad_norm": 0.6603536009788513, + "learning_rate": 0.0009077744938170947, + "loss": 3.769, + "step": 10865 + }, + { + "epoch": 0.7385514336186982, + "grad_norm": 0.728244960308075, + "learning_rate": 0.000907732028808262, + "loss": 3.6486, + "step": 10870 + }, + { + "epoch": 0.73889115368936, + "grad_norm": 0.8622386455535889, + "learning_rate": 0.0009076895637994293, + "loss": 3.7031, + "step": 10875 + }, + { + "epoch": 0.7392308737600217, + "grad_norm": 0.7680321335792542, + "learning_rate": 0.0009076470987905966, + "loss": 3.7158, + "step": 10880 + }, + { + "epoch": 0.7395705938306835, + "grad_norm": 0.681460976600647, + "learning_rate": 0.0009076046337817638, + "loss": 3.6986, + "step": 10885 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 0.7468190789222717, + "learning_rate": 0.0009075621687729311, + "loss": 3.6263, + "step": 10890 + }, + { + "epoch": 0.7402500339720071, + "grad_norm": 0.7261123657226562, + "learning_rate": 0.0009075197037640984, + "loss": 3.6688, + "step": 10895 + }, + { + "epoch": 0.7405897540426688, + "grad_norm": 0.7458889484405518, + "learning_rate": 0.0009074772387552656, + "loss": 3.5449, + "step": 10900 + }, + { + "epoch": 0.7409294741133307, + "grad_norm": 0.6862163543701172, + "learning_rate": 0.000907434773746433, + "loss": 3.7715, + "step": 10905 + }, + { + "epoch": 0.7412691941839924, + "grad_norm": 0.7556419372558594, + "learning_rate": 0.0009073923087376003, + "loss": 3.6729, + "step": 10910 + }, + { + "epoch": 0.7416089142546541, + "grad_norm": 0.6329078078269958, + "learning_rate": 0.0009073498437287675, + "loss": 3.9472, + "step": 10915 + }, + { + "epoch": 0.741948634325316, + "grad_norm": 2.5260157585144043, + "learning_rate": 0.0009073073787199347, + "loss": 3.615, + "step": 10920 + }, + { + "epoch": 0.7422883543959777, + "grad_norm": 0.9930900931358337, + "learning_rate": 0.0009072649137111021, + "loss": 3.7181, + "step": 10925 + }, + { + "epoch": 0.7426280744666395, + "grad_norm": 0.9070388674736023, + "learning_rate": 0.0009072224487022693, + "loss": 3.7067, + "step": 10930 + }, + { + "epoch": 0.7429677945373012, + "grad_norm": 0.9061547517776489, + "learning_rate": 0.0009071799836934365, + "loss": 3.5905, + "step": 10935 + }, + { + "epoch": 0.7433075146079631, + "grad_norm": 0.940543532371521, + "learning_rate": 0.000907137518684604, + "loss": 3.5941, + "step": 10940 + }, + { + "epoch": 0.7436472346786248, + "grad_norm": 0.678053617477417, + "learning_rate": 0.0009070950536757712, + "loss": 3.594, + "step": 10945 + }, + { + "epoch": 0.7439869547492866, + "grad_norm": 0.6678659915924072, + "learning_rate": 0.0009070525886669385, + "loss": 4.0005, + "step": 10950 + }, + { + "epoch": 0.7443266748199484, + "grad_norm": 0.7654513716697693, + "learning_rate": 0.0009070101236581058, + "loss": 3.5041, + "step": 10955 + }, + { + "epoch": 0.7446663948906102, + "grad_norm": 0.6158933639526367, + "learning_rate": 0.000906967658649273, + "loss": 3.5851, + "step": 10960 + }, + { + "epoch": 0.7450061149612719, + "grad_norm": 0.6983581185340881, + "learning_rate": 0.0009069251936404403, + "loss": 3.8628, + "step": 10965 + }, + { + "epoch": 0.7453458350319336, + "grad_norm": 0.98110431432724, + "learning_rate": 0.0009068827286316075, + "loss": 3.5373, + "step": 10970 + }, + { + "epoch": 0.7456855551025955, + "grad_norm": 0.7524985074996948, + "learning_rate": 0.0009068402636227749, + "loss": 3.7216, + "step": 10975 + }, + { + "epoch": 0.7460252751732572, + "grad_norm": 1.3822213411331177, + "learning_rate": 0.0009067977986139422, + "loss": 3.5979, + "step": 10980 + }, + { + "epoch": 0.746364995243919, + "grad_norm": 0.7409912347793579, + "learning_rate": 0.0009067553336051094, + "loss": 3.7149, + "step": 10985 + }, + { + "epoch": 0.7467047153145808, + "grad_norm": 0.7096331119537354, + "learning_rate": 0.0009067128685962767, + "loss": 3.525, + "step": 10990 + }, + { + "epoch": 0.7470444353852426, + "grad_norm": 0.6391186118125916, + "learning_rate": 0.000906670403587444, + "loss": 3.4395, + "step": 10995 + }, + { + "epoch": 0.7473841554559043, + "grad_norm": 0.6337305307388306, + "learning_rate": 0.0009066279385786112, + "loss": 3.482, + "step": 11000 + }, + { + "epoch": 0.7477238755265662, + "grad_norm": 0.7633218765258789, + "learning_rate": 0.0009065854735697785, + "loss": 3.5604, + "step": 11005 + }, + { + "epoch": 0.7480635955972279, + "grad_norm": 0.8902758359909058, + "learning_rate": 0.0009065430085609459, + "loss": 3.6436, + "step": 11010 + }, + { + "epoch": 0.7484033156678896, + "grad_norm": 0.7491607666015625, + "learning_rate": 0.0009065005435521131, + "loss": 3.3561, + "step": 11015 + }, + { + "epoch": 0.7487430357385514, + "grad_norm": 0.7755002975463867, + "learning_rate": 0.0009064580785432803, + "loss": 3.6949, + "step": 11020 + }, + { + "epoch": 0.7490827558092132, + "grad_norm": 0.7575463652610779, + "learning_rate": 0.0009064156135344477, + "loss": 3.6314, + "step": 11025 + }, + { + "epoch": 0.749422475879875, + "grad_norm": 0.9390106201171875, + "learning_rate": 0.0009063731485256149, + "loss": 3.6426, + "step": 11030 + }, + { + "epoch": 0.7497621959505367, + "grad_norm": 0.8049656748771667, + "learning_rate": 0.0009063306835167821, + "loss": 3.7196, + "step": 11035 + }, + { + "epoch": 0.7501019160211986, + "grad_norm": 0.7370794415473938, + "learning_rate": 0.0009062882185079495, + "loss": 3.7679, + "step": 11040 + }, + { + "epoch": 0.7504416360918603, + "grad_norm": 0.5605579614639282, + "learning_rate": 0.0009062457534991168, + "loss": 3.6759, + "step": 11045 + }, + { + "epoch": 0.7507813561625221, + "grad_norm": 0.8295556902885437, + "learning_rate": 0.000906203288490284, + "loss": 3.6725, + "step": 11050 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 0.8465284109115601, + "learning_rate": 0.0009061608234814514, + "loss": 3.3927, + "step": 11055 + }, + { + "epoch": 0.7514607963038457, + "grad_norm": 0.7803046703338623, + "learning_rate": 0.0009061183584726186, + "loss": 3.736, + "step": 11060 + }, + { + "epoch": 0.7518005163745074, + "grad_norm": 0.8794353604316711, + "learning_rate": 0.0009060758934637858, + "loss": 3.8278, + "step": 11065 + }, + { + "epoch": 0.7521402364451691, + "grad_norm": 0.8784036040306091, + "learning_rate": 0.0009060334284549531, + "loss": 3.5779, + "step": 11070 + }, + { + "epoch": 0.752479956515831, + "grad_norm": 0.7327986359596252, + "learning_rate": 0.0009059909634461204, + "loss": 3.7451, + "step": 11075 + }, + { + "epoch": 0.7528196765864927, + "grad_norm": 0.6266762614250183, + "learning_rate": 0.0009059484984372877, + "loss": 3.6185, + "step": 11080 + }, + { + "epoch": 0.7531593966571545, + "grad_norm": 0.8826175332069397, + "learning_rate": 0.000905906033428455, + "loss": 3.5231, + "step": 11085 + }, + { + "epoch": 0.7534991167278163, + "grad_norm": 1.0041965246200562, + "learning_rate": 0.0009058635684196223, + "loss": 3.6654, + "step": 11090 + }, + { + "epoch": 0.7538388367984781, + "grad_norm": 0.5978000164031982, + "learning_rate": 0.0009058211034107895, + "loss": 3.8016, + "step": 11095 + }, + { + "epoch": 0.7541785568691398, + "grad_norm": 0.7321518659591675, + "learning_rate": 0.0009057786384019568, + "loss": 3.5245, + "step": 11100 + }, + { + "epoch": 0.7545182769398016, + "grad_norm": 0.7385342121124268, + "learning_rate": 0.000905736173393124, + "loss": 3.6975, + "step": 11105 + }, + { + "epoch": 0.7548579970104634, + "grad_norm": 0.7056331634521484, + "learning_rate": 0.0009056937083842913, + "loss": 3.5768, + "step": 11110 + }, + { + "epoch": 0.7551977170811252, + "grad_norm": 0.7574288845062256, + "learning_rate": 0.0009056512433754587, + "loss": 3.6303, + "step": 11115 + }, + { + "epoch": 0.7555374371517869, + "grad_norm": 0.8274157643318176, + "learning_rate": 0.000905608778366626, + "loss": 3.6376, + "step": 11120 + }, + { + "epoch": 0.7558771572224487, + "grad_norm": 0.6675964593887329, + "learning_rate": 0.0009055663133577932, + "loss": 3.8271, + "step": 11125 + }, + { + "epoch": 0.7562168772931105, + "grad_norm": 0.7021047472953796, + "learning_rate": 0.0009055238483489605, + "loss": 3.7435, + "step": 11130 + }, + { + "epoch": 0.7565565973637722, + "grad_norm": 0.7819975018501282, + "learning_rate": 0.0009054813833401277, + "loss": 3.8685, + "step": 11135 + }, + { + "epoch": 0.756896317434434, + "grad_norm": 0.6842313408851624, + "learning_rate": 0.000905438918331295, + "loss": 3.7833, + "step": 11140 + }, + { + "epoch": 0.7572360375050958, + "grad_norm": 0.920562744140625, + "learning_rate": 0.0009053964533224623, + "loss": 3.476, + "step": 11145 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.9534333348274231, + "learning_rate": 0.0009053539883136296, + "loss": 3.9637, + "step": 11150 + }, + { + "epoch": 0.7579154776464193, + "grad_norm": 0.9696018695831299, + "learning_rate": 0.0009053115233047969, + "loss": 3.6793, + "step": 11155 + }, + { + "epoch": 0.7582551977170812, + "grad_norm": 0.7492496371269226, + "learning_rate": 0.0009052690582959642, + "loss": 3.6915, + "step": 11160 + }, + { + "epoch": 0.7585949177877429, + "grad_norm": 0.8835594058036804, + "learning_rate": 0.0009052265932871314, + "loss": 3.4612, + "step": 11165 + }, + { + "epoch": 0.7589346378584046, + "grad_norm": 0.7148780822753906, + "learning_rate": 0.0009051841282782986, + "loss": 3.5451, + "step": 11170 + }, + { + "epoch": 0.7592743579290665, + "grad_norm": 0.7280018925666809, + "learning_rate": 0.000905141663269466, + "loss": 3.6353, + "step": 11175 + }, + { + "epoch": 0.7596140779997282, + "grad_norm": 0.9799088835716248, + "learning_rate": 0.0009050991982606332, + "loss": 3.503, + "step": 11180 + }, + { + "epoch": 0.75995379807039, + "grad_norm": 0.7397592067718506, + "learning_rate": 0.0009050567332518005, + "loss": 3.4837, + "step": 11185 + }, + { + "epoch": 0.7602935181410517, + "grad_norm": 0.6619171500205994, + "learning_rate": 0.0009050142682429679, + "loss": 3.7099, + "step": 11190 + }, + { + "epoch": 0.7606332382117136, + "grad_norm": 0.6666352152824402, + "learning_rate": 0.0009049718032341351, + "loss": 3.8002, + "step": 11195 + }, + { + "epoch": 0.7609729582823753, + "grad_norm": 0.8963965177536011, + "learning_rate": 0.0009049293382253023, + "loss": 3.5202, + "step": 11200 + }, + { + "epoch": 0.7613126783530371, + "grad_norm": 0.7986039519309998, + "learning_rate": 0.0009048868732164697, + "loss": 3.7107, + "step": 11205 + }, + { + "epoch": 0.7616523984236989, + "grad_norm": 0.8264760375022888, + "learning_rate": 0.0009048444082076369, + "loss": 3.6571, + "step": 11210 + }, + { + "epoch": 0.7619921184943607, + "grad_norm": 0.7924829721450806, + "learning_rate": 0.0009048019431988041, + "loss": 3.5541, + "step": 11215 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 0.645222008228302, + "learning_rate": 0.0009047594781899715, + "loss": 3.7158, + "step": 11220 + }, + { + "epoch": 0.7626715586356841, + "grad_norm": 0.643329918384552, + "learning_rate": 0.0009047170131811388, + "loss": 3.5572, + "step": 11225 + }, + { + "epoch": 0.763011278706346, + "grad_norm": 0.6600959300994873, + "learning_rate": 0.000904674548172306, + "loss": 3.8193, + "step": 11230 + }, + { + "epoch": 0.7633509987770077, + "grad_norm": 0.7832187414169312, + "learning_rate": 0.0009046320831634733, + "loss": 3.7341, + "step": 11235 + }, + { + "epoch": 0.7636907188476695, + "grad_norm": 0.7644637227058411, + "learning_rate": 0.0009045896181546406, + "loss": 3.6788, + "step": 11240 + }, + { + "epoch": 0.7640304389183313, + "grad_norm": 0.7946782112121582, + "learning_rate": 0.0009045471531458078, + "loss": 3.3585, + "step": 11245 + }, + { + "epoch": 0.7643701589889931, + "grad_norm": 0.7609429955482483, + "learning_rate": 0.0009045046881369751, + "loss": 3.5514, + "step": 11250 + }, + { + "epoch": 0.7647098790596548, + "grad_norm": 0.6562104225158691, + "learning_rate": 0.0009044622231281425, + "loss": 3.8451, + "step": 11255 + }, + { + "epoch": 0.7650495991303167, + "grad_norm": 0.6418102979660034, + "learning_rate": 0.0009044197581193097, + "loss": 3.7535, + "step": 11260 + }, + { + "epoch": 0.7653893192009784, + "grad_norm": 0.6506428122520447, + "learning_rate": 0.000904377293110477, + "loss": 3.6456, + "step": 11265 + }, + { + "epoch": 0.7657290392716402, + "grad_norm": 0.8719205260276794, + "learning_rate": 0.0009043348281016442, + "loss": 3.722, + "step": 11270 + }, + { + "epoch": 0.7660687593423019, + "grad_norm": 0.8912215828895569, + "learning_rate": 0.0009042923630928115, + "loss": 3.7161, + "step": 11275 + }, + { + "epoch": 0.7664084794129638, + "grad_norm": 0.8162899017333984, + "learning_rate": 0.0009042498980839788, + "loss": 3.6193, + "step": 11280 + }, + { + "epoch": 0.7667481994836255, + "grad_norm": 1.0792850255966187, + "learning_rate": 0.000904207433075146, + "loss": 3.5399, + "step": 11285 + }, + { + "epoch": 0.7670879195542872, + "grad_norm": 0.769577145576477, + "learning_rate": 0.0009041649680663135, + "loss": 3.6953, + "step": 11290 + }, + { + "epoch": 0.7674276396249491, + "grad_norm": 0.6627873182296753, + "learning_rate": 0.0009041225030574807, + "loss": 3.5632, + "step": 11295 + }, + { + "epoch": 0.7677673596956108, + "grad_norm": 0.5610398054122925, + "learning_rate": 0.0009040800380486479, + "loss": 3.6261, + "step": 11300 + }, + { + "epoch": 0.7681070797662726, + "grad_norm": 0.8693429231643677, + "learning_rate": 0.0009040375730398153, + "loss": 3.5948, + "step": 11305 + }, + { + "epoch": 0.7684467998369343, + "grad_norm": 0.845872163772583, + "learning_rate": 0.0009039951080309825, + "loss": 3.6844, + "step": 11310 + }, + { + "epoch": 0.7687865199075962, + "grad_norm": 0.8991537690162659, + "learning_rate": 0.0009039526430221497, + "loss": 3.586, + "step": 11315 + }, + { + "epoch": 0.7691262399782579, + "grad_norm": 0.7066490650177002, + "learning_rate": 0.0009039101780133172, + "loss": 3.8419, + "step": 11320 + }, + { + "epoch": 0.7694659600489197, + "grad_norm": 0.9950985312461853, + "learning_rate": 0.0009038677130044844, + "loss": 3.5813, + "step": 11325 + }, + { + "epoch": 0.7698056801195815, + "grad_norm": 0.8662399053573608, + "learning_rate": 0.0009038252479956516, + "loss": 3.543, + "step": 11330 + }, + { + "epoch": 0.7701454001902432, + "grad_norm": 0.9999657869338989, + "learning_rate": 0.0009037827829868189, + "loss": 3.5233, + "step": 11335 + }, + { + "epoch": 0.770485120260905, + "grad_norm": 0.8655561208724976, + "learning_rate": 0.0009037403179779862, + "loss": 3.3481, + "step": 11340 + }, + { + "epoch": 0.7708248403315668, + "grad_norm": 0.8533512949943542, + "learning_rate": 0.0009036978529691534, + "loss": 3.7933, + "step": 11345 + }, + { + "epoch": 0.7711645604022286, + "grad_norm": 0.5750146508216858, + "learning_rate": 0.0009036553879603207, + "loss": 3.6216, + "step": 11350 + }, + { + "epoch": 0.7715042804728903, + "grad_norm": 0.8064283132553101, + "learning_rate": 0.0009036129229514881, + "loss": 3.5434, + "step": 11355 + }, + { + "epoch": 0.7718440005435521, + "grad_norm": 0.8872594833374023, + "learning_rate": 0.0009035704579426553, + "loss": 3.96, + "step": 11360 + }, + { + "epoch": 0.7721837206142139, + "grad_norm": 0.6176731586456299, + "learning_rate": 0.0009035279929338226, + "loss": 3.6777, + "step": 11365 + }, + { + "epoch": 0.7725234406848757, + "grad_norm": 0.8666825890541077, + "learning_rate": 0.0009034855279249898, + "loss": 3.8615, + "step": 11370 + }, + { + "epoch": 0.7728631607555374, + "grad_norm": 0.9428972005844116, + "learning_rate": 0.0009034430629161571, + "loss": 3.6177, + "step": 11375 + }, + { + "epoch": 0.7732028808261993, + "grad_norm": 0.7848477363586426, + "learning_rate": 0.0009034005979073244, + "loss": 3.4241, + "step": 11380 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 2.360543966293335, + "learning_rate": 0.0009033581328984916, + "loss": 3.7013, + "step": 11385 + }, + { + "epoch": 0.7738823209675227, + "grad_norm": 0.8416051268577576, + "learning_rate": 0.000903315667889659, + "loss": 3.7975, + "step": 11390 + }, + { + "epoch": 0.7742220410381845, + "grad_norm": 0.8668843507766724, + "learning_rate": 0.0009032732028808263, + "loss": 3.6834, + "step": 11395 + }, + { + "epoch": 0.7745617611088463, + "grad_norm": 0.7961780428886414, + "learning_rate": 0.0009032307378719935, + "loss": 3.6589, + "step": 11400 + }, + { + "epoch": 0.7749014811795081, + "grad_norm": 0.7476592659950256, + "learning_rate": 0.0009031882728631607, + "loss": 3.8325, + "step": 11405 + }, + { + "epoch": 0.7752412012501698, + "grad_norm": 0.7280569076538086, + "learning_rate": 0.0009031458078543281, + "loss": 3.9238, + "step": 11410 + }, + { + "epoch": 0.7755809213208317, + "grad_norm": 1.188405990600586, + "learning_rate": 0.0009031033428454953, + "loss": 3.802, + "step": 11415 + }, + { + "epoch": 0.7759206413914934, + "grad_norm": 1.1356881856918335, + "learning_rate": 0.0009030608778366625, + "loss": 3.8541, + "step": 11420 + }, + { + "epoch": 0.7762603614621552, + "grad_norm": 0.9920475482940674, + "learning_rate": 0.00090301841282783, + "loss": 3.3794, + "step": 11425 + }, + { + "epoch": 0.776600081532817, + "grad_norm": 0.7739996910095215, + "learning_rate": 0.0009029759478189972, + "loss": 3.3791, + "step": 11430 + }, + { + "epoch": 0.7769398016034788, + "grad_norm": 0.708064079284668, + "learning_rate": 0.0009029334828101644, + "loss": 3.6322, + "step": 11435 + }, + { + "epoch": 0.7772795216741405, + "grad_norm": 0.7811189293861389, + "learning_rate": 0.0009028910178013318, + "loss": 3.7, + "step": 11440 + }, + { + "epoch": 0.7776192417448022, + "grad_norm": 0.7669124007225037, + "learning_rate": 0.000902848552792499, + "loss": 3.4908, + "step": 11445 + }, + { + "epoch": 0.7779589618154641, + "grad_norm": 0.9353384375572205, + "learning_rate": 0.0009028060877836662, + "loss": 3.7255, + "step": 11450 + }, + { + "epoch": 0.7782986818861258, + "grad_norm": 0.8743352890014648, + "learning_rate": 0.0009027636227748336, + "loss": 3.8387, + "step": 11455 + }, + { + "epoch": 0.7786384019567876, + "grad_norm": 0.7309284210205078, + "learning_rate": 0.0009027211577660009, + "loss": 3.4251, + "step": 11460 + }, + { + "epoch": 0.7789781220274494, + "grad_norm": 0.7424012422561646, + "learning_rate": 0.0009026786927571681, + "loss": 3.8224, + "step": 11465 + }, + { + "epoch": 0.7793178420981112, + "grad_norm": 0.7942410111427307, + "learning_rate": 0.0009026362277483354, + "loss": 3.6329, + "step": 11470 + }, + { + "epoch": 0.7796575621687729, + "grad_norm": 0.7685617804527283, + "learning_rate": 0.0009025937627395027, + "loss": 3.771, + "step": 11475 + }, + { + "epoch": 0.7799972822394347, + "grad_norm": 0.7327280044555664, + "learning_rate": 0.0009025512977306699, + "loss": 3.6845, + "step": 11480 + }, + { + "epoch": 0.7803370023100965, + "grad_norm": 0.7311487197875977, + "learning_rate": 0.0009025088327218372, + "loss": 3.6051, + "step": 11485 + }, + { + "epoch": 0.7806767223807582, + "grad_norm": 0.9589959979057312, + "learning_rate": 0.0009024663677130045, + "loss": 3.6504, + "step": 11490 + }, + { + "epoch": 0.78101644245142, + "grad_norm": 0.6580057740211487, + "learning_rate": 0.0009024239027041718, + "loss": 3.6892, + "step": 11495 + }, + { + "epoch": 0.7813561625220818, + "grad_norm": 0.9299346804618835, + "learning_rate": 0.0009023814376953391, + "loss": 3.2588, + "step": 11500 + }, + { + "epoch": 0.7816958825927436, + "grad_norm": 0.7023794651031494, + "learning_rate": 0.0009023389726865064, + "loss": 3.6111, + "step": 11505 + }, + { + "epoch": 0.7820356026634053, + "grad_norm": 0.6749833226203918, + "learning_rate": 0.0009022965076776736, + "loss": 3.9216, + "step": 11510 + }, + { + "epoch": 0.7823753227340672, + "grad_norm": 0.7876642346382141, + "learning_rate": 0.0009022540426688409, + "loss": 3.6863, + "step": 11515 + }, + { + "epoch": 0.7827150428047289, + "grad_norm": 0.7046065330505371, + "learning_rate": 0.0009022115776600081, + "loss": 3.7488, + "step": 11520 + }, + { + "epoch": 0.7830547628753907, + "grad_norm": 0.8243586421012878, + "learning_rate": 0.0009021691126511754, + "loss": 3.8652, + "step": 11525 + }, + { + "epoch": 0.7833944829460524, + "grad_norm": 0.7738139629364014, + "learning_rate": 0.0009021266476423428, + "loss": 3.6429, + "step": 11530 + }, + { + "epoch": 0.7837342030167143, + "grad_norm": 0.8155585527420044, + "learning_rate": 0.00090208418263351, + "loss": 3.4873, + "step": 11535 + }, + { + "epoch": 0.784073923087376, + "grad_norm": 0.7936364412307739, + "learning_rate": 0.0009020417176246773, + "loss": 3.8642, + "step": 11540 + }, + { + "epoch": 0.7844136431580377, + "grad_norm": 0.7720269560813904, + "learning_rate": 0.0009019992526158446, + "loss": 3.6678, + "step": 11545 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 0.640693724155426, + "learning_rate": 0.0009019567876070118, + "loss": 3.7284, + "step": 11550 + }, + { + "epoch": 0.7850930832993613, + "grad_norm": 0.7694748044013977, + "learning_rate": 0.000901914322598179, + "loss": 3.6448, + "step": 11555 + }, + { + "epoch": 0.7854328033700231, + "grad_norm": 0.7252551317214966, + "learning_rate": 0.0009018718575893464, + "loss": 3.7941, + "step": 11560 + }, + { + "epoch": 0.7857725234406848, + "grad_norm": 0.7769997715950012, + "learning_rate": 0.0009018293925805137, + "loss": 3.6516, + "step": 11565 + }, + { + "epoch": 0.7861122435113467, + "grad_norm": 0.9725680351257324, + "learning_rate": 0.0009017869275716809, + "loss": 3.4228, + "step": 11570 + }, + { + "epoch": 0.7864519635820084, + "grad_norm": 0.7560505270957947, + "learning_rate": 0.0009017444625628483, + "loss": 3.6015, + "step": 11575 + }, + { + "epoch": 0.7867916836526702, + "grad_norm": 0.8336181044578552, + "learning_rate": 0.0009017019975540155, + "loss": 4.0335, + "step": 11580 + }, + { + "epoch": 0.787131403723332, + "grad_norm": 0.8251464366912842, + "learning_rate": 0.0009016595325451827, + "loss": 3.5641, + "step": 11585 + }, + { + "epoch": 0.7874711237939938, + "grad_norm": 0.5509349703788757, + "learning_rate": 0.0009016170675363501, + "loss": 3.5977, + "step": 11590 + }, + { + "epoch": 0.7878108438646555, + "grad_norm": 0.7564021348953247, + "learning_rate": 0.0009015746025275173, + "loss": 3.593, + "step": 11595 + }, + { + "epoch": 0.7881505639353173, + "grad_norm": 0.7966201305389404, + "learning_rate": 0.0009015321375186846, + "loss": 3.5492, + "step": 11600 + }, + { + "epoch": 0.7884902840059791, + "grad_norm": 0.8041855096817017, + "learning_rate": 0.000901489672509852, + "loss": 3.8541, + "step": 11605 + }, + { + "epoch": 0.7888300040766408, + "grad_norm": 0.8472400903701782, + "learning_rate": 0.0009014472075010192, + "loss": 3.9236, + "step": 11610 + }, + { + "epoch": 0.7891697241473026, + "grad_norm": 0.8034156560897827, + "learning_rate": 0.0009014047424921864, + "loss": 3.4673, + "step": 11615 + }, + { + "epoch": 0.7895094442179644, + "grad_norm": 0.8986274600028992, + "learning_rate": 0.0009013622774833537, + "loss": 3.5718, + "step": 11620 + }, + { + "epoch": 0.7898491642886262, + "grad_norm": 0.7308080196380615, + "learning_rate": 0.000901319812474521, + "loss": 3.5523, + "step": 11625 + }, + { + "epoch": 0.7901888843592879, + "grad_norm": 0.8409743905067444, + "learning_rate": 0.0009012773474656883, + "loss": 3.6518, + "step": 11630 + }, + { + "epoch": 0.7905286044299498, + "grad_norm": 1.2058814764022827, + "learning_rate": 0.0009012348824568556, + "loss": 3.6597, + "step": 11635 + }, + { + "epoch": 0.7908683245006115, + "grad_norm": 0.8940008878707886, + "learning_rate": 0.0009011924174480229, + "loss": 3.7667, + "step": 11640 + }, + { + "epoch": 0.7912080445712733, + "grad_norm": 0.8077465295791626, + "learning_rate": 0.0009011499524391902, + "loss": 3.4476, + "step": 11645 + }, + { + "epoch": 0.791547764641935, + "grad_norm": 0.8694632053375244, + "learning_rate": 0.0009011074874303574, + "loss": 3.7511, + "step": 11650 + }, + { + "epoch": 0.7918874847125968, + "grad_norm": 1.1284823417663574, + "learning_rate": 0.0009010650224215246, + "loss": 3.7529, + "step": 11655 + }, + { + "epoch": 0.7922272047832586, + "grad_norm": 0.8847794532775879, + "learning_rate": 0.000901022557412692, + "loss": 3.7332, + "step": 11660 + }, + { + "epoch": 0.7925669248539203, + "grad_norm": 19.21723175048828, + "learning_rate": 0.0009009800924038592, + "loss": 3.6351, + "step": 11665 + }, + { + "epoch": 0.7929066449245822, + "grad_norm": 0.7227755188941956, + "learning_rate": 0.0009009376273950265, + "loss": 3.672, + "step": 11670 + }, + { + "epoch": 0.7932463649952439, + "grad_norm": 0.9313464164733887, + "learning_rate": 0.0009008951623861939, + "loss": 3.6256, + "step": 11675 + }, + { + "epoch": 0.7935860850659057, + "grad_norm": 0.678211510181427, + "learning_rate": 0.0009008526973773611, + "loss": 3.7432, + "step": 11680 + }, + { + "epoch": 0.7939258051365675, + "grad_norm": 0.7791925668716431, + "learning_rate": 0.0009008102323685283, + "loss": 3.9595, + "step": 11685 + }, + { + "epoch": 0.7942655252072293, + "grad_norm": 0.8282289505004883, + "learning_rate": 0.0009007677673596957, + "loss": 3.4482, + "step": 11690 + }, + { + "epoch": 0.794605245277891, + "grad_norm": 0.747299313545227, + "learning_rate": 0.0009007253023508629, + "loss": 3.6961, + "step": 11695 + }, + { + "epoch": 0.7949449653485527, + "grad_norm": 1.8201948404312134, + "learning_rate": 0.0009006828373420301, + "loss": 3.6944, + "step": 11700 + }, + { + "epoch": 0.7952846854192146, + "grad_norm": 1.322952389717102, + "learning_rate": 0.0009006403723331976, + "loss": 3.5084, + "step": 11705 + }, + { + "epoch": 0.7956244054898763, + "grad_norm": 0.6801309585571289, + "learning_rate": 0.0009005979073243648, + "loss": 3.4178, + "step": 11710 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 0.6730698943138123, + "learning_rate": 0.000900555442315532, + "loss": 3.7642, + "step": 11715 + }, + { + "epoch": 0.7963038456311999, + "grad_norm": 0.9825437068939209, + "learning_rate": 0.0009005129773066993, + "loss": 3.7553, + "step": 11720 + }, + { + "epoch": 0.7966435657018617, + "grad_norm": 0.857135534286499, + "learning_rate": 0.0009004705122978666, + "loss": 3.6653, + "step": 11725 + }, + { + "epoch": 0.7969832857725234, + "grad_norm": 3.240708112716675, + "learning_rate": 0.0009004280472890338, + "loss": 3.7563, + "step": 11730 + }, + { + "epoch": 0.7973230058431852, + "grad_norm": 0.8420482873916626, + "learning_rate": 0.0009003855822802011, + "loss": 3.6388, + "step": 11735 + }, + { + "epoch": 0.797662725913847, + "grad_norm": 0.9364164471626282, + "learning_rate": 0.0009003431172713685, + "loss": 3.5354, + "step": 11740 + }, + { + "epoch": 0.7980024459845088, + "grad_norm": 0.8310482501983643, + "learning_rate": 0.0009003006522625357, + "loss": 3.628, + "step": 11745 + }, + { + "epoch": 0.7983421660551705, + "grad_norm": 1.066575527191162, + "learning_rate": 0.000900258187253703, + "loss": 3.7591, + "step": 11750 + }, + { + "epoch": 0.7986818861258324, + "grad_norm": 0.7524402737617493, + "learning_rate": 0.0009002157222448702, + "loss": 3.3023, + "step": 11755 + }, + { + "epoch": 0.7990216061964941, + "grad_norm": 1.9965541362762451, + "learning_rate": 0.0009001732572360375, + "loss": 3.7084, + "step": 11760 + }, + { + "epoch": 0.7993613262671558, + "grad_norm": 0.8044977784156799, + "learning_rate": 0.0009001307922272048, + "loss": 3.7319, + "step": 11765 + }, + { + "epoch": 0.7997010463378177, + "grad_norm": 6.163888931274414, + "learning_rate": 0.000900088327218372, + "loss": 3.7158, + "step": 11770 + }, + { + "epoch": 0.8000407664084794, + "grad_norm": 0.681088387966156, + "learning_rate": 0.0009000458622095394, + "loss": 3.5035, + "step": 11775 + }, + { + "epoch": 0.8003804864791412, + "grad_norm": 1.161738395690918, + "learning_rate": 0.0009000033972007067, + "loss": 3.8294, + "step": 11780 + }, + { + "epoch": 0.8007202065498029, + "grad_norm": 0.9559353590011597, + "learning_rate": 0.0008999609321918739, + "loss": 3.8343, + "step": 11785 + }, + { + "epoch": 0.8010599266204648, + "grad_norm": 0.8990914225578308, + "learning_rate": 0.0008999184671830412, + "loss": 3.3608, + "step": 11790 + }, + { + "epoch": 0.8013996466911265, + "grad_norm": 0.7692317366600037, + "learning_rate": 0.0008998760021742085, + "loss": 3.7129, + "step": 11795 + }, + { + "epoch": 0.8017393667617883, + "grad_norm": 0.8258704543113708, + "learning_rate": 0.0008998335371653757, + "loss": 3.8203, + "step": 11800 + }, + { + "epoch": 0.8020790868324501, + "grad_norm": 0.8166278600692749, + "learning_rate": 0.0008997910721565429, + "loss": 3.8864, + "step": 11805 + }, + { + "epoch": 0.8024188069031118, + "grad_norm": 1.2051016092300415, + "learning_rate": 0.0008997486071477104, + "loss": 3.4996, + "step": 11810 + }, + { + "epoch": 0.8027585269737736, + "grad_norm": 0.9588713049888611, + "learning_rate": 0.0008997061421388776, + "loss": 3.5689, + "step": 11815 + }, + { + "epoch": 0.8030982470444353, + "grad_norm": 1.0693833827972412, + "learning_rate": 0.0008996636771300448, + "loss": 3.7351, + "step": 11820 + }, + { + "epoch": 0.8034379671150972, + "grad_norm": 0.9330841302871704, + "learning_rate": 0.0008996212121212122, + "loss": 3.7526, + "step": 11825 + }, + { + "epoch": 0.8037776871857589, + "grad_norm": 0.8327504396438599, + "learning_rate": 0.0008995787471123794, + "loss": 3.7568, + "step": 11830 + }, + { + "epoch": 0.8041174072564207, + "grad_norm": 0.7175239324569702, + "learning_rate": 0.0008995362821035466, + "loss": 3.7489, + "step": 11835 + }, + { + "epoch": 0.8044571273270825, + "grad_norm": 0.8911623954772949, + "learning_rate": 0.000899493817094714, + "loss": 3.8002, + "step": 11840 + }, + { + "epoch": 0.8047968473977443, + "grad_norm": 0.827058732509613, + "learning_rate": 0.0008994513520858813, + "loss": 3.5888, + "step": 11845 + }, + { + "epoch": 0.805136567468406, + "grad_norm": 1.008345127105713, + "learning_rate": 0.0008994088870770485, + "loss": 3.789, + "step": 11850 + }, + { + "epoch": 0.8054762875390679, + "grad_norm": 1.1266976594924927, + "learning_rate": 0.0008993664220682158, + "loss": 3.6047, + "step": 11855 + }, + { + "epoch": 0.8058160076097296, + "grad_norm": 0.8931801915168762, + "learning_rate": 0.0008993239570593831, + "loss": 3.4656, + "step": 11860 + }, + { + "epoch": 0.8061557276803913, + "grad_norm": 0.981272280216217, + "learning_rate": 0.0008992814920505503, + "loss": 3.589, + "step": 11865 + }, + { + "epoch": 0.8064954477510531, + "grad_norm": 0.6813732981681824, + "learning_rate": 0.0008992390270417176, + "loss": 3.7431, + "step": 11870 + }, + { + "epoch": 0.8068351678217149, + "grad_norm": 0.746742308139801, + "learning_rate": 0.0008991965620328849, + "loss": 3.6678, + "step": 11875 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 1.7406831979751587, + "learning_rate": 0.0008991540970240522, + "loss": 3.2612, + "step": 11880 + }, + { + "epoch": 0.8075146079630384, + "grad_norm": 2.588305711746216, + "learning_rate": 0.0008991116320152195, + "loss": 3.7369, + "step": 11885 + }, + { + "epoch": 0.8078543280337003, + "grad_norm": 0.978930652141571, + "learning_rate": 0.0008990691670063868, + "loss": 3.4402, + "step": 11890 + }, + { + "epoch": 0.808194048104362, + "grad_norm": 0.9134714007377625, + "learning_rate": 0.000899026701997554, + "loss": 3.6657, + "step": 11895 + }, + { + "epoch": 0.8085337681750238, + "grad_norm": 4.314025402069092, + "learning_rate": 0.0008989842369887213, + "loss": 3.468, + "step": 11900 + }, + { + "epoch": 0.8088734882456855, + "grad_norm": 0.7964876890182495, + "learning_rate": 0.0008989417719798885, + "loss": 3.5326, + "step": 11905 + }, + { + "epoch": 0.8092132083163474, + "grad_norm": 0.7957569360733032, + "learning_rate": 0.0008988993069710558, + "loss": 3.8136, + "step": 11910 + }, + { + "epoch": 0.8095529283870091, + "grad_norm": 0.6765254139900208, + "learning_rate": 0.0008988568419622232, + "loss": 3.7646, + "step": 11915 + }, + { + "epoch": 0.8098926484576708, + "grad_norm": 0.8979242444038391, + "learning_rate": 0.0008988143769533904, + "loss": 3.4354, + "step": 11920 + }, + { + "epoch": 0.8102323685283327, + "grad_norm": 0.7286115288734436, + "learning_rate": 0.0008987719119445577, + "loss": 3.7409, + "step": 11925 + }, + { + "epoch": 0.8105720885989944, + "grad_norm": 0.737282395362854, + "learning_rate": 0.000898729446935725, + "loss": 3.7433, + "step": 11930 + }, + { + "epoch": 0.8109118086696562, + "grad_norm": 0.8476179242134094, + "learning_rate": 0.0008986869819268922, + "loss": 3.7743, + "step": 11935 + }, + { + "epoch": 0.811251528740318, + "grad_norm": 1.2118488550186157, + "learning_rate": 0.0008986445169180594, + "loss": 3.5224, + "step": 11940 + }, + { + "epoch": 0.8115912488109798, + "grad_norm": 0.7481421828269958, + "learning_rate": 0.0008986020519092269, + "loss": 3.4099, + "step": 11945 + }, + { + "epoch": 0.8119309688816415, + "grad_norm": 0.780239999294281, + "learning_rate": 0.0008985595869003941, + "loss": 3.6785, + "step": 11950 + }, + { + "epoch": 0.8122706889523033, + "grad_norm": 1.1838839054107666, + "learning_rate": 0.0008985171218915613, + "loss": 3.5821, + "step": 11955 + }, + { + "epoch": 0.8126104090229651, + "grad_norm": 1.429790735244751, + "learning_rate": 0.0008984746568827287, + "loss": 3.657, + "step": 11960 + }, + { + "epoch": 0.8129501290936268, + "grad_norm": 0.743295431137085, + "learning_rate": 0.0008984321918738959, + "loss": 3.5701, + "step": 11965 + }, + { + "epoch": 0.8132898491642886, + "grad_norm": 0.9131989479064941, + "learning_rate": 0.0008983897268650632, + "loss": 3.6544, + "step": 11970 + }, + { + "epoch": 0.8136295692349504, + "grad_norm": 0.9572948813438416, + "learning_rate": 0.0008983472618562305, + "loss": 3.8174, + "step": 11975 + }, + { + "epoch": 0.8139692893056122, + "grad_norm": 1.0678379535675049, + "learning_rate": 0.0008983047968473978, + "loss": 3.55, + "step": 11980 + }, + { + "epoch": 0.8143090093762739, + "grad_norm": 1.793413758277893, + "learning_rate": 0.0008982623318385651, + "loss": 3.5891, + "step": 11985 + }, + { + "epoch": 0.8146487294469357, + "grad_norm": 1.3618600368499756, + "learning_rate": 0.0008982198668297324, + "loss": 3.8057, + "step": 11990 + }, + { + "epoch": 0.8149884495175975, + "grad_norm": 1.1085466146469116, + "learning_rate": 0.0008981774018208996, + "loss": 3.5042, + "step": 11995 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 2.303776741027832, + "learning_rate": 0.0008981349368120669, + "loss": 3.6459, + "step": 12000 + }, + { + "epoch": 0.815667889658921, + "grad_norm": 1.1642874479293823, + "learning_rate": 0.0008980924718032341, + "loss": 3.7677, + "step": 12005 + }, + { + "epoch": 0.8160076097295829, + "grad_norm": 0.8356020450592041, + "learning_rate": 0.0008980500067944014, + "loss": 3.7202, + "step": 12010 + }, + { + "epoch": 0.8163473298002446, + "grad_norm": 0.7830950617790222, + "learning_rate": 0.0008980075417855688, + "loss": 3.8272, + "step": 12015 + }, + { + "epoch": 0.8166870498709063, + "grad_norm": 1.0376938581466675, + "learning_rate": 0.000897965076776736, + "loss": 3.7243, + "step": 12020 + }, + { + "epoch": 0.8170267699415682, + "grad_norm": 0.87122642993927, + "learning_rate": 0.0008979226117679033, + "loss": 3.2821, + "step": 12025 + }, + { + "epoch": 0.8173664900122299, + "grad_norm": 1.1050115823745728, + "learning_rate": 0.0008978801467590706, + "loss": 3.7031, + "step": 12030 + }, + { + "epoch": 0.8177062100828917, + "grad_norm": 1.1414813995361328, + "learning_rate": 0.0008978376817502378, + "loss": 3.6687, + "step": 12035 + }, + { + "epoch": 0.8180459301535534, + "grad_norm": 0.8770098686218262, + "learning_rate": 0.000897795216741405, + "loss": 3.5826, + "step": 12040 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 0.9358768463134766, + "learning_rate": 0.0008977527517325724, + "loss": 3.9931, + "step": 12045 + }, + { + "epoch": 0.818725370294877, + "grad_norm": 1.5611785650253296, + "learning_rate": 0.0008977102867237397, + "loss": 3.734, + "step": 12050 + }, + { + "epoch": 0.8190650903655388, + "grad_norm": 0.9423375725746155, + "learning_rate": 0.0008976678217149069, + "loss": 3.7024, + "step": 12055 + }, + { + "epoch": 0.8194048104362006, + "grad_norm": 0.8384486436843872, + "learning_rate": 0.0008976253567060743, + "loss": 3.557, + "step": 12060 + }, + { + "epoch": 0.8197445305068624, + "grad_norm": 4.71159553527832, + "learning_rate": 0.0008975828916972415, + "loss": 3.785, + "step": 12065 + }, + { + "epoch": 0.8200842505775241, + "grad_norm": 0.9767023324966431, + "learning_rate": 0.0008975404266884087, + "loss": 3.6251, + "step": 12070 + }, + { + "epoch": 0.8204239706481858, + "grad_norm": 0.9765772223472595, + "learning_rate": 0.0008974979616795761, + "loss": 3.7723, + "step": 12075 + }, + { + "epoch": 0.8207636907188477, + "grad_norm": 1.1468009948730469, + "learning_rate": 0.0008974554966707433, + "loss": 3.4427, + "step": 12080 + }, + { + "epoch": 0.8211034107895094, + "grad_norm": 1.6982296705245972, + "learning_rate": 0.0008974130316619106, + "loss": 3.6837, + "step": 12085 + }, + { + "epoch": 0.8214431308601712, + "grad_norm": 1.0457658767700195, + "learning_rate": 0.000897370566653078, + "loss": 3.8464, + "step": 12090 + }, + { + "epoch": 0.821782850930833, + "grad_norm": 2.1178550720214844, + "learning_rate": 0.0008973281016442452, + "loss": 3.8578, + "step": 12095 + }, + { + "epoch": 0.8221225710014948, + "grad_norm": 0.8319690823554993, + "learning_rate": 0.0008972856366354124, + "loss": 3.7482, + "step": 12100 + }, + { + "epoch": 0.8224622910721565, + "grad_norm": 1.5097166299819946, + "learning_rate": 0.0008972431716265797, + "loss": 3.6185, + "step": 12105 + }, + { + "epoch": 0.8228020111428184, + "grad_norm": 0.8399849534034729, + "learning_rate": 0.000897200706617747, + "loss": 3.691, + "step": 12110 + }, + { + "epoch": 0.8231417312134801, + "grad_norm": 0.8186467885971069, + "learning_rate": 0.0008971582416089142, + "loss": 3.7328, + "step": 12115 + }, + { + "epoch": 0.8234814512841419, + "grad_norm": 0.8267626762390137, + "learning_rate": 0.0008971157766000816, + "loss": 3.687, + "step": 12120 + }, + { + "epoch": 0.8238211713548036, + "grad_norm": 0.9893280267715454, + "learning_rate": 0.0008970733115912489, + "loss": 3.6648, + "step": 12125 + }, + { + "epoch": 0.8241608914254654, + "grad_norm": 0.738146960735321, + "learning_rate": 0.0008970308465824161, + "loss": 3.913, + "step": 12130 + }, + { + "epoch": 0.8245006114961272, + "grad_norm": 0.9565632343292236, + "learning_rate": 0.0008969883815735834, + "loss": 3.5519, + "step": 12135 + }, + { + "epoch": 0.8248403315667889, + "grad_norm": 0.9319611191749573, + "learning_rate": 0.0008969459165647507, + "loss": 3.7412, + "step": 12140 + }, + { + "epoch": 0.8251800516374508, + "grad_norm": 0.7641745209693909, + "learning_rate": 0.0008969034515559179, + "loss": 3.7149, + "step": 12145 + }, + { + "epoch": 0.8255197717081125, + "grad_norm": 1.1911808252334595, + "learning_rate": 0.0008968609865470852, + "loss": 3.62, + "step": 12150 + }, + { + "epoch": 0.8258594917787743, + "grad_norm": 0.800098717212677, + "learning_rate": 0.0008968185215382525, + "loss": 3.8071, + "step": 12155 + }, + { + "epoch": 0.826199211849436, + "grad_norm": 3.345296859741211, + "learning_rate": 0.0008967760565294198, + "loss": 3.6213, + "step": 12160 + }, + { + "epoch": 0.8265389319200979, + "grad_norm": 0.8754676580429077, + "learning_rate": 0.0008967335915205871, + "loss": 3.7782, + "step": 12165 + }, + { + "epoch": 0.8268786519907596, + "grad_norm": 1.348473310470581, + "learning_rate": 0.0008966911265117543, + "loss": 3.7474, + "step": 12170 + }, + { + "epoch": 0.8272183720614213, + "grad_norm": 1.8399182558059692, + "learning_rate": 0.0008966486615029216, + "loss": 3.5222, + "step": 12175 + }, + { + "epoch": 0.8275580921320832, + "grad_norm": 1.0082560777664185, + "learning_rate": 0.0008966061964940889, + "loss": 3.7147, + "step": 12180 + }, + { + "epoch": 0.8278978122027449, + "grad_norm": 0.6656339168548584, + "learning_rate": 0.0008965637314852561, + "loss": 3.6583, + "step": 12185 + }, + { + "epoch": 0.8282375322734067, + "grad_norm": 9.560941696166992, + "learning_rate": 0.0008965212664764235, + "loss": 3.8559, + "step": 12190 + }, + { + "epoch": 0.8285772523440685, + "grad_norm": 0.8942816257476807, + "learning_rate": 0.0008964788014675908, + "loss": 3.8328, + "step": 12195 + }, + { + "epoch": 0.8289169724147303, + "grad_norm": 1.0735458135604858, + "learning_rate": 0.000896436336458758, + "loss": 3.9173, + "step": 12200 + }, + { + "epoch": 0.829256692485392, + "grad_norm": 1.4457052946090698, + "learning_rate": 0.0008963938714499252, + "loss": 3.9023, + "step": 12205 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 0.8400038480758667, + "learning_rate": 0.0008963514064410926, + "loss": 3.5682, + "step": 12210 + }, + { + "epoch": 0.8299361326267156, + "grad_norm": 1.2635211944580078, + "learning_rate": 0.0008963089414322598, + "loss": 3.6892, + "step": 12215 + }, + { + "epoch": 0.8302758526973774, + "grad_norm": 0.8412904739379883, + "learning_rate": 0.000896266476423427, + "loss": 3.5975, + "step": 12220 + }, + { + "epoch": 0.8306155727680391, + "grad_norm": 1.175331711769104, + "learning_rate": 0.0008962240114145945, + "loss": 3.8253, + "step": 12225 + }, + { + "epoch": 0.830955292838701, + "grad_norm": 0.8512313365936279, + "learning_rate": 0.0008961815464057617, + "loss": 3.5383, + "step": 12230 + }, + { + "epoch": 0.8312950129093627, + "grad_norm": 0.8247966170310974, + "learning_rate": 0.0008961390813969289, + "loss": 3.1646, + "step": 12235 + }, + { + "epoch": 0.8316347329800244, + "grad_norm": 0.9606428146362305, + "learning_rate": 0.0008960966163880963, + "loss": 3.526, + "step": 12240 + }, + { + "epoch": 0.8319744530506862, + "grad_norm": 0.8347512483596802, + "learning_rate": 0.0008960541513792635, + "loss": 3.3199, + "step": 12245 + }, + { + "epoch": 0.832314173121348, + "grad_norm": 0.74213045835495, + "learning_rate": 0.0008960116863704307, + "loss": 3.6291, + "step": 12250 + }, + { + "epoch": 0.8326538931920098, + "grad_norm": 1.0132322311401367, + "learning_rate": 0.000895969221361598, + "loss": 3.456, + "step": 12255 + }, + { + "epoch": 0.8329936132626715, + "grad_norm": 1.574074149131775, + "learning_rate": 0.0008959267563527654, + "loss": 3.5547, + "step": 12260 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.0543498992919922, + "learning_rate": 0.0008958842913439326, + "loss": 3.7245, + "step": 12265 + }, + { + "epoch": 0.8336730534039951, + "grad_norm": 1.0911391973495483, + "learning_rate": 0.0008958418263350999, + "loss": 3.2935, + "step": 12270 + }, + { + "epoch": 0.8340127734746569, + "grad_norm": 0.8056760430335999, + "learning_rate": 0.0008957993613262672, + "loss": 3.7953, + "step": 12275 + }, + { + "epoch": 0.8343524935453187, + "grad_norm": 1.047203540802002, + "learning_rate": 0.0008957568963174344, + "loss": 3.7215, + "step": 12280 + }, + { + "epoch": 0.8346922136159804, + "grad_norm": 0.7943539619445801, + "learning_rate": 0.0008957144313086017, + "loss": 3.5883, + "step": 12285 + }, + { + "epoch": 0.8350319336866422, + "grad_norm": 1.2296775579452515, + "learning_rate": 0.0008956719662997689, + "loss": 3.6294, + "step": 12290 + }, + { + "epoch": 0.8353716537573039, + "grad_norm": 0.8864673972129822, + "learning_rate": 0.0008956295012909363, + "loss": 3.6182, + "step": 12295 + }, + { + "epoch": 0.8357113738279658, + "grad_norm": 1.208784818649292, + "learning_rate": 0.0008955870362821036, + "loss": 3.6632, + "step": 12300 + }, + { + "epoch": 0.8360510938986275, + "grad_norm": 1.317670226097107, + "learning_rate": 0.0008955445712732708, + "loss": 3.4335, + "step": 12305 + }, + { + "epoch": 0.8363908139692893, + "grad_norm": 1.1332440376281738, + "learning_rate": 0.0008955021062644382, + "loss": 3.8258, + "step": 12310 + }, + { + "epoch": 0.8367305340399511, + "grad_norm": 1.5073355436325073, + "learning_rate": 0.0008954596412556054, + "loss": 3.5211, + "step": 12315 + }, + { + "epoch": 0.8370702541106129, + "grad_norm": 0.9930076599121094, + "learning_rate": 0.0008954171762467726, + "loss": 3.7169, + "step": 12320 + }, + { + "epoch": 0.8374099741812746, + "grad_norm": 1.103778600692749, + "learning_rate": 0.00089537471123794, + "loss": 3.855, + "step": 12325 + }, + { + "epoch": 0.8377496942519363, + "grad_norm": 1.0513641834259033, + "learning_rate": 0.0008953322462291073, + "loss": 3.2781, + "step": 12330 + }, + { + "epoch": 0.8380894143225982, + "grad_norm": 0.8450642824172974, + "learning_rate": 0.0008952897812202745, + "loss": 3.8835, + "step": 12335 + }, + { + "epoch": 0.83842913439326, + "grad_norm": 0.8948069214820862, + "learning_rate": 0.0008952473162114419, + "loss": 3.7338, + "step": 12340 + }, + { + "epoch": 0.8387688544639217, + "grad_norm": 0.8194078803062439, + "learning_rate": 0.0008952048512026091, + "loss": 3.6616, + "step": 12345 + }, + { + "epoch": 0.8391085745345835, + "grad_norm": 0.7513666749000549, + "learning_rate": 0.0008951623861937763, + "loss": 3.6281, + "step": 12350 + }, + { + "epoch": 0.8394482946052453, + "grad_norm": 0.781000018119812, + "learning_rate": 0.0008951199211849436, + "loss": 3.6511, + "step": 12355 + }, + { + "epoch": 0.839788014675907, + "grad_norm": 0.8052552342414856, + "learning_rate": 0.0008950774561761109, + "loss": 3.6598, + "step": 12360 + }, + { + "epoch": 0.8401277347465689, + "grad_norm": 0.6139883399009705, + "learning_rate": 0.0008950349911672782, + "loss": 3.9005, + "step": 12365 + }, + { + "epoch": 0.8404674548172306, + "grad_norm": 0.9119839072227478, + "learning_rate": 0.0008949925261584455, + "loss": 3.6976, + "step": 12370 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 0.8594744801521301, + "learning_rate": 0.0008949500611496128, + "loss": 3.64, + "step": 12375 + }, + { + "epoch": 0.8411468949585541, + "grad_norm": 1.0599935054779053, + "learning_rate": 0.00089490759614078, + "loss": 3.6834, + "step": 12380 + }, + { + "epoch": 0.841486615029216, + "grad_norm": 0.9042484164237976, + "learning_rate": 0.0008948651311319473, + "loss": 3.8284, + "step": 12385 + }, + { + "epoch": 0.8418263350998777, + "grad_norm": 0.8020474314689636, + "learning_rate": 0.0008948226661231145, + "loss": 3.4645, + "step": 12390 + }, + { + "epoch": 0.8421660551705394, + "grad_norm": 0.8528745770454407, + "learning_rate": 0.0008947802011142818, + "loss": 3.5839, + "step": 12395 + }, + { + "epoch": 0.8425057752412013, + "grad_norm": 1.33851957321167, + "learning_rate": 0.0008947377361054492, + "loss": 3.7545, + "step": 12400 + }, + { + "epoch": 0.842845495311863, + "grad_norm": 0.7493317723274231, + "learning_rate": 0.0008946952710966164, + "loss": 3.6499, + "step": 12405 + }, + { + "epoch": 0.8431852153825248, + "grad_norm": 0.9472340941429138, + "learning_rate": 0.0008946528060877837, + "loss": 3.9046, + "step": 12410 + }, + { + "epoch": 0.8435249354531865, + "grad_norm": 0.8662052154541016, + "learning_rate": 0.000894610341078951, + "loss": 3.8141, + "step": 12415 + }, + { + "epoch": 0.8438646555238484, + "grad_norm": 0.7273616790771484, + "learning_rate": 0.0008945678760701182, + "loss": 3.738, + "step": 12420 + }, + { + "epoch": 0.8442043755945101, + "grad_norm": 0.8929319977760315, + "learning_rate": 0.0008945254110612855, + "loss": 3.8383, + "step": 12425 + }, + { + "epoch": 0.8445440956651719, + "grad_norm": 0.6989656686782837, + "learning_rate": 0.0008944829460524528, + "loss": 3.6693, + "step": 12430 + }, + { + "epoch": 0.8448838157358337, + "grad_norm": 1.8858411312103271, + "learning_rate": 0.0008944404810436201, + "loss": 3.7982, + "step": 12435 + }, + { + "epoch": 0.8452235358064955, + "grad_norm": 1.1658178567886353, + "learning_rate": 0.0008943980160347873, + "loss": 3.6572, + "step": 12440 + }, + { + "epoch": 0.8455632558771572, + "grad_norm": 0.7695299386978149, + "learning_rate": 0.0008943555510259547, + "loss": 3.5883, + "step": 12445 + }, + { + "epoch": 0.845902975947819, + "grad_norm": 0.7807323932647705, + "learning_rate": 0.0008943130860171219, + "loss": 3.6175, + "step": 12450 + }, + { + "epoch": 0.8462426960184808, + "grad_norm": 1.0242794752120972, + "learning_rate": 0.0008942706210082891, + "loss": 3.9449, + "step": 12455 + }, + { + "epoch": 0.8465824160891425, + "grad_norm": 0.9249842166900635, + "learning_rate": 0.0008942281559994565, + "loss": 3.6187, + "step": 12460 + }, + { + "epoch": 0.8469221361598043, + "grad_norm": 0.6891613006591797, + "learning_rate": 0.0008941856909906237, + "loss": 3.716, + "step": 12465 + }, + { + "epoch": 0.8472618562304661, + "grad_norm": 0.8940126895904541, + "learning_rate": 0.000894143225981791, + "loss": 3.5136, + "step": 12470 + }, + { + "epoch": 0.8476015763011279, + "grad_norm": 0.6634390950202942, + "learning_rate": 0.0008941007609729584, + "loss": 3.2417, + "step": 12475 + }, + { + "epoch": 0.8479412963717896, + "grad_norm": 1.1272963285446167, + "learning_rate": 0.0008940582959641256, + "loss": 3.5247, + "step": 12480 + }, + { + "epoch": 0.8482810164424515, + "grad_norm": 0.7897003293037415, + "learning_rate": 0.0008940158309552928, + "loss": 3.643, + "step": 12485 + }, + { + "epoch": 0.8486207365131132, + "grad_norm": 2.3707611560821533, + "learning_rate": 0.0008939733659464601, + "loss": 3.6738, + "step": 12490 + }, + { + "epoch": 0.848960456583775, + "grad_norm": 0.8061043620109558, + "learning_rate": 0.0008939309009376274, + "loss": 3.7245, + "step": 12495 + }, + { + "epoch": 0.8493001766544367, + "grad_norm": 1.6183264255523682, + "learning_rate": 0.0008938884359287946, + "loss": 3.7238, + "step": 12500 + }, + { + "epoch": 0.8496398967250985, + "grad_norm": 0.8394601345062256, + "learning_rate": 0.000893845970919962, + "loss": 3.648, + "step": 12505 + }, + { + "epoch": 0.8499796167957603, + "grad_norm": 0.9863250851631165, + "learning_rate": 0.0008938035059111293, + "loss": 3.4764, + "step": 12510 + }, + { + "epoch": 0.850319336866422, + "grad_norm": 0.763404905796051, + "learning_rate": 0.0008937610409022965, + "loss": 3.5146, + "step": 12515 + }, + { + "epoch": 0.8506590569370839, + "grad_norm": 1.1683576107025146, + "learning_rate": 0.0008937185758934638, + "loss": 3.6867, + "step": 12520 + }, + { + "epoch": 0.8509987770077456, + "grad_norm": 1.6497260332107544, + "learning_rate": 0.000893676110884631, + "loss": 3.6728, + "step": 12525 + }, + { + "epoch": 0.8513384970784074, + "grad_norm": 1.084399700164795, + "learning_rate": 0.0008936336458757983, + "loss": 3.8271, + "step": 12530 + }, + { + "epoch": 0.8516782171490692, + "grad_norm": 0.8513826727867126, + "learning_rate": 0.0008935911808669657, + "loss": 3.9418, + "step": 12535 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 0.9956335425376892, + "learning_rate": 0.000893548715858133, + "loss": 3.7465, + "step": 12540 + }, + { + "epoch": 0.8523576572903927, + "grad_norm": 0.7502888441085815, + "learning_rate": 0.0008935062508493002, + "loss": 3.7657, + "step": 12545 + }, + { + "epoch": 0.8526973773610544, + "grad_norm": 1.493066668510437, + "learning_rate": 0.0008934637858404675, + "loss": 3.7782, + "step": 12550 + }, + { + "epoch": 0.8530370974317163, + "grad_norm": 1.0350700616836548, + "learning_rate": 0.0008934213208316347, + "loss": 3.7322, + "step": 12555 + }, + { + "epoch": 0.853376817502378, + "grad_norm": 0.8851453065872192, + "learning_rate": 0.000893378855822802, + "loss": 3.506, + "step": 12560 + }, + { + "epoch": 0.8537165375730398, + "grad_norm": 0.809582531452179, + "learning_rate": 0.0008933363908139693, + "loss": 3.8487, + "step": 12565 + }, + { + "epoch": 0.8540562576437016, + "grad_norm": 1.4163895845413208, + "learning_rate": 0.0008932939258051366, + "loss": 3.5388, + "step": 12570 + }, + { + "epoch": 0.8543959777143634, + "grad_norm": 0.8323284983634949, + "learning_rate": 0.0008932514607963039, + "loss": 3.7394, + "step": 12575 + }, + { + "epoch": 0.8547356977850251, + "grad_norm": 0.82051020860672, + "learning_rate": 0.0008932089957874712, + "loss": 3.5999, + "step": 12580 + }, + { + "epoch": 0.8550754178556869, + "grad_norm": 0.8441026210784912, + "learning_rate": 0.0008931665307786384, + "loss": 3.4076, + "step": 12585 + }, + { + "epoch": 0.8554151379263487, + "grad_norm": 0.809715986251831, + "learning_rate": 0.0008931240657698056, + "loss": 3.7076, + "step": 12590 + }, + { + "epoch": 0.8557548579970105, + "grad_norm": 0.8348985910415649, + "learning_rate": 0.000893081600760973, + "loss": 3.7876, + "step": 12595 + }, + { + "epoch": 0.8560945780676722, + "grad_norm": 0.8572840690612793, + "learning_rate": 0.0008930391357521402, + "loss": 3.7696, + "step": 12600 + }, + { + "epoch": 0.856434298138334, + "grad_norm": 0.9449257850646973, + "learning_rate": 0.0008929966707433075, + "loss": 3.4207, + "step": 12605 + }, + { + "epoch": 0.8567740182089958, + "grad_norm": 1.3028172254562378, + "learning_rate": 0.0008929542057344749, + "loss": 3.6107, + "step": 12610 + }, + { + "epoch": 0.8571137382796575, + "grad_norm": 0.8413300514221191, + "learning_rate": 0.0008929117407256421, + "loss": 3.569, + "step": 12615 + }, + { + "epoch": 0.8574534583503194, + "grad_norm": 0.9001073837280273, + "learning_rate": 0.0008928692757168093, + "loss": 3.9657, + "step": 12620 + }, + { + "epoch": 0.8577931784209811, + "grad_norm": 0.8678140044212341, + "learning_rate": 0.0008928268107079767, + "loss": 3.8447, + "step": 12625 + }, + { + "epoch": 0.8581328984916429, + "grad_norm": 1.2138323783874512, + "learning_rate": 0.0008927843456991439, + "loss": 3.8599, + "step": 12630 + }, + { + "epoch": 0.8584726185623046, + "grad_norm": 0.797297477722168, + "learning_rate": 0.0008927418806903111, + "loss": 3.4784, + "step": 12635 + }, + { + "epoch": 0.8588123386329665, + "grad_norm": 0.7508427500724792, + "learning_rate": 0.0008926994156814785, + "loss": 3.8237, + "step": 12640 + }, + { + "epoch": 0.8591520587036282, + "grad_norm": 0.6789147257804871, + "learning_rate": 0.0008926569506726458, + "loss": 3.773, + "step": 12645 + }, + { + "epoch": 0.85949177877429, + "grad_norm": 0.7708194851875305, + "learning_rate": 0.0008926144856638131, + "loss": 3.6893, + "step": 12650 + }, + { + "epoch": 0.8598314988449518, + "grad_norm": 0.7489946484565735, + "learning_rate": 0.0008925720206549803, + "loss": 3.5578, + "step": 12655 + }, + { + "epoch": 0.8601712189156135, + "grad_norm": 0.8548010587692261, + "learning_rate": 0.0008925295556461476, + "loss": 3.5309, + "step": 12660 + }, + { + "epoch": 0.8605109389862753, + "grad_norm": 0.9155256748199463, + "learning_rate": 0.0008924870906373149, + "loss": 3.6292, + "step": 12665 + }, + { + "epoch": 0.860850659056937, + "grad_norm": 1.0295207500457764, + "learning_rate": 0.0008924446256284821, + "loss": 3.6501, + "step": 12670 + }, + { + "epoch": 0.8611903791275989, + "grad_norm": 0.6268315315246582, + "learning_rate": 0.0008924021606196495, + "loss": 3.7322, + "step": 12675 + }, + { + "epoch": 0.8615300991982606, + "grad_norm": 1.7878749370574951, + "learning_rate": 0.0008923596956108168, + "loss": 3.6814, + "step": 12680 + }, + { + "epoch": 0.8618698192689224, + "grad_norm": 0.7935873866081238, + "learning_rate": 0.000892317230601984, + "loss": 3.6838, + "step": 12685 + }, + { + "epoch": 0.8622095393395842, + "grad_norm": 0.8684753179550171, + "learning_rate": 0.0008922747655931512, + "loss": 3.488, + "step": 12690 + }, + { + "epoch": 0.862549259410246, + "grad_norm": 0.7866601943969727, + "learning_rate": 0.0008922323005843186, + "loss": 3.7119, + "step": 12695 + }, + { + "epoch": 0.8628889794809077, + "grad_norm": 0.8693528771400452, + "learning_rate": 0.0008921898355754858, + "loss": 3.7289, + "step": 12700 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 0.9112973809242249, + "learning_rate": 0.000892147370566653, + "loss": 3.3847, + "step": 12705 + }, + { + "epoch": 0.8635684196222313, + "grad_norm": 0.9909111261367798, + "learning_rate": 0.0008921049055578205, + "loss": 3.4263, + "step": 12710 + }, + { + "epoch": 0.863908139692893, + "grad_norm": 0.9339936971664429, + "learning_rate": 0.0008920624405489877, + "loss": 3.7342, + "step": 12715 + }, + { + "epoch": 0.8642478597635548, + "grad_norm": 0.9828744530677795, + "learning_rate": 0.0008920199755401549, + "loss": 3.7732, + "step": 12720 + }, + { + "epoch": 0.8645875798342166, + "grad_norm": 0.7636462450027466, + "learning_rate": 0.0008919775105313223, + "loss": 3.7939, + "step": 12725 + }, + { + "epoch": 0.8649272999048784, + "grad_norm": 0.7255587577819824, + "learning_rate": 0.0008919350455224895, + "loss": 3.5836, + "step": 12730 + }, + { + "epoch": 0.8652670199755401, + "grad_norm": 0.892466127872467, + "learning_rate": 0.0008918925805136567, + "loss": 3.9951, + "step": 12735 + }, + { + "epoch": 0.865606740046202, + "grad_norm": 0.976373016834259, + "learning_rate": 0.000891850115504824, + "loss": 3.7314, + "step": 12740 + }, + { + "epoch": 0.8659464601168637, + "grad_norm": 0.7727254033088684, + "learning_rate": 0.0008918076504959914, + "loss": 3.7143, + "step": 12745 + }, + { + "epoch": 0.8662861801875255, + "grad_norm": 0.6775889992713928, + "learning_rate": 0.0008917651854871586, + "loss": 3.7828, + "step": 12750 + }, + { + "epoch": 0.8666259002581872, + "grad_norm": 0.7719122171401978, + "learning_rate": 0.0008917227204783259, + "loss": 3.5695, + "step": 12755 + }, + { + "epoch": 0.866965620328849, + "grad_norm": 0.8189406991004944, + "learning_rate": 0.0008916802554694932, + "loss": 3.6175, + "step": 12760 + }, + { + "epoch": 0.8673053403995108, + "grad_norm": 0.7982125282287598, + "learning_rate": 0.0008916377904606604, + "loss": 3.7163, + "step": 12765 + }, + { + "epoch": 0.8676450604701725, + "grad_norm": 0.7305169105529785, + "learning_rate": 0.0008915953254518277, + "loss": 3.7597, + "step": 12770 + }, + { + "epoch": 0.8679847805408344, + "grad_norm": 0.684145987033844, + "learning_rate": 0.000891552860442995, + "loss": 3.8156, + "step": 12775 + }, + { + "epoch": 0.8683245006114961, + "grad_norm": 0.6922198534011841, + "learning_rate": 0.0008915103954341623, + "loss": 3.5999, + "step": 12780 + }, + { + "epoch": 0.8686642206821579, + "grad_norm": 0.8814215660095215, + "learning_rate": 0.0008914679304253296, + "loss": 3.8284, + "step": 12785 + }, + { + "epoch": 0.8690039407528197, + "grad_norm": 0.8510563969612122, + "learning_rate": 0.0008914254654164968, + "loss": 3.5932, + "step": 12790 + }, + { + "epoch": 0.8693436608234815, + "grad_norm": 0.633137583732605, + "learning_rate": 0.0008913830004076641, + "loss": 3.7039, + "step": 12795 + }, + { + "epoch": 0.8696833808941432, + "grad_norm": 0.8084008097648621, + "learning_rate": 0.0008913405353988314, + "loss": 3.8535, + "step": 12800 + }, + { + "epoch": 0.870023100964805, + "grad_norm": 0.8659461140632629, + "learning_rate": 0.0008912980703899986, + "loss": 3.8013, + "step": 12805 + }, + { + "epoch": 0.8703628210354668, + "grad_norm": 0.8289008736610413, + "learning_rate": 0.0008912556053811659, + "loss": 3.8754, + "step": 12810 + }, + { + "epoch": 0.8707025411061285, + "grad_norm": 0.8602203130722046, + "learning_rate": 0.0008912131403723333, + "loss": 3.7246, + "step": 12815 + }, + { + "epoch": 0.8710422611767903, + "grad_norm": 0.7309598326683044, + "learning_rate": 0.0008911706753635005, + "loss": 3.7455, + "step": 12820 + }, + { + "epoch": 0.8713819812474521, + "grad_norm": 1.0993800163269043, + "learning_rate": 0.0008911282103546677, + "loss": 3.8854, + "step": 12825 + }, + { + "epoch": 0.8717217013181139, + "grad_norm": 0.6335657238960266, + "learning_rate": 0.0008910857453458351, + "loss": 3.6099, + "step": 12830 + }, + { + "epoch": 0.8720614213887756, + "grad_norm": 0.8809964656829834, + "learning_rate": 0.0008910432803370023, + "loss": 3.4817, + "step": 12835 + }, + { + "epoch": 0.8724011414594374, + "grad_norm": 0.8833673000335693, + "learning_rate": 0.0008910008153281695, + "loss": 3.6208, + "step": 12840 + }, + { + "epoch": 0.8727408615300992, + "grad_norm": 1.1117693185806274, + "learning_rate": 0.0008909583503193369, + "loss": 3.5492, + "step": 12845 + }, + { + "epoch": 0.873080581600761, + "grad_norm": 0.7198566794395447, + "learning_rate": 0.0008909158853105042, + "loss": 3.965, + "step": 12850 + }, + { + "epoch": 0.8734203016714227, + "grad_norm": 0.8196446895599365, + "learning_rate": 0.0008908734203016714, + "loss": 3.736, + "step": 12855 + }, + { + "epoch": 0.8737600217420846, + "grad_norm": 1.0343807935714722, + "learning_rate": 0.0008908309552928388, + "loss": 3.5933, + "step": 12860 + }, + { + "epoch": 0.8740997418127463, + "grad_norm": 0.7227100133895874, + "learning_rate": 0.000890788490284006, + "loss": 3.5595, + "step": 12865 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 1.3584532737731934, + "learning_rate": 0.0008907460252751732, + "loss": 3.4168, + "step": 12870 + }, + { + "epoch": 0.8747791819540699, + "grad_norm": 0.9213500618934631, + "learning_rate": 0.0008907035602663406, + "loss": 3.7263, + "step": 12875 + }, + { + "epoch": 0.8751189020247316, + "grad_norm": 1.1483378410339355, + "learning_rate": 0.0008906610952575078, + "loss": 3.3609, + "step": 12880 + }, + { + "epoch": 0.8754586220953934, + "grad_norm": 0.8501725792884827, + "learning_rate": 0.0008906186302486751, + "loss": 3.6914, + "step": 12885 + }, + { + "epoch": 0.8757983421660551, + "grad_norm": 0.7157691717147827, + "learning_rate": 0.0008905761652398424, + "loss": 3.5567, + "step": 12890 + }, + { + "epoch": 0.876138062236717, + "grad_norm": 0.8595942258834839, + "learning_rate": 0.0008905337002310097, + "loss": 3.7815, + "step": 12895 + }, + { + "epoch": 0.8764777823073787, + "grad_norm": 0.8045763969421387, + "learning_rate": 0.0008904912352221769, + "loss": 3.8761, + "step": 12900 + }, + { + "epoch": 0.8768175023780405, + "grad_norm": 0.7781740427017212, + "learning_rate": 0.0008904487702133442, + "loss": 3.6283, + "step": 12905 + }, + { + "epoch": 0.8771572224487023, + "grad_norm": 1.0370241403579712, + "learning_rate": 0.0008904063052045115, + "loss": 3.6778, + "step": 12910 + }, + { + "epoch": 0.877496942519364, + "grad_norm": 0.8825350403785706, + "learning_rate": 0.0008903638401956787, + "loss": 3.376, + "step": 12915 + }, + { + "epoch": 0.8778366625900258, + "grad_norm": 0.7710245251655579, + "learning_rate": 0.0008903213751868461, + "loss": 3.6192, + "step": 12920 + }, + { + "epoch": 0.8781763826606876, + "grad_norm": 1.4703747034072876, + "learning_rate": 0.0008902789101780134, + "loss": 3.4355, + "step": 12925 + }, + { + "epoch": 0.8785161027313494, + "grad_norm": 1.5170077085494995, + "learning_rate": 0.0008902364451691806, + "loss": 3.6717, + "step": 12930 + }, + { + "epoch": 0.8788558228020111, + "grad_norm": 0.7630696296691895, + "learning_rate": 0.0008901939801603479, + "loss": 3.6508, + "step": 12935 + }, + { + "epoch": 0.8791955428726729, + "grad_norm": 0.8289691805839539, + "learning_rate": 0.0008901515151515151, + "loss": 3.6314, + "step": 12940 + }, + { + "epoch": 0.8795352629433347, + "grad_norm": 0.9249024987220764, + "learning_rate": 0.0008901090501426824, + "loss": 3.6243, + "step": 12945 + }, + { + "epoch": 0.8798749830139965, + "grad_norm": 0.7900879979133606, + "learning_rate": 0.0008900665851338497, + "loss": 3.4279, + "step": 12950 + }, + { + "epoch": 0.8802147030846582, + "grad_norm": 1.4901187419891357, + "learning_rate": 0.000890024120125017, + "loss": 3.7187, + "step": 12955 + }, + { + "epoch": 0.8805544231553201, + "grad_norm": 0.744059145450592, + "learning_rate": 0.0008899816551161843, + "loss": 3.7016, + "step": 12960 + }, + { + "epoch": 0.8808941432259818, + "grad_norm": 0.8059055209159851, + "learning_rate": 0.0008899391901073516, + "loss": 3.7085, + "step": 12965 + }, + { + "epoch": 0.8812338632966435, + "grad_norm": 1.1076507568359375, + "learning_rate": 0.0008898967250985188, + "loss": 3.7264, + "step": 12970 + }, + { + "epoch": 0.8815735833673053, + "grad_norm": 0.8307698965072632, + "learning_rate": 0.000889854260089686, + "loss": 3.6816, + "step": 12975 + }, + { + "epoch": 0.8819133034379671, + "grad_norm": 0.7859275937080383, + "learning_rate": 0.0008898117950808534, + "loss": 3.8154, + "step": 12980 + }, + { + "epoch": 0.8822530235086289, + "grad_norm": 0.9843325614929199, + "learning_rate": 0.0008897693300720206, + "loss": 3.593, + "step": 12985 + }, + { + "epoch": 0.8825927435792906, + "grad_norm": 1.0416489839553833, + "learning_rate": 0.000889726865063188, + "loss": 3.9183, + "step": 12990 + }, + { + "epoch": 0.8829324636499525, + "grad_norm": 0.8519264459609985, + "learning_rate": 0.0008896844000543553, + "loss": 3.6714, + "step": 12995 + }, + { + "epoch": 0.8832721837206142, + "grad_norm": 0.7330078482627869, + "learning_rate": 0.0008896419350455225, + "loss": 3.6677, + "step": 13000 + }, + { + "epoch": 0.883611903791276, + "grad_norm": 1.2345335483551025, + "learning_rate": 0.0008895994700366898, + "loss": 3.7669, + "step": 13005 + }, + { + "epoch": 0.8839516238619378, + "grad_norm": 0.6741784811019897, + "learning_rate": 0.0008895570050278571, + "loss": 3.4922, + "step": 13010 + }, + { + "epoch": 0.8842913439325996, + "grad_norm": 1.2192062139511108, + "learning_rate": 0.0008895145400190243, + "loss": 3.5527, + "step": 13015 + }, + { + "epoch": 0.8846310640032613, + "grad_norm": 0.6788417100906372, + "learning_rate": 0.0008894720750101916, + "loss": 3.5468, + "step": 13020 + }, + { + "epoch": 0.884970784073923, + "grad_norm": 0.907703697681427, + "learning_rate": 0.000889429610001359, + "loss": 3.5127, + "step": 13025 + }, + { + "epoch": 0.8853105041445849, + "grad_norm": 0.7998182773590088, + "learning_rate": 0.0008893871449925262, + "loss": 3.601, + "step": 13030 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 0.948671281337738, + "learning_rate": 0.0008893446799836935, + "loss": 3.6866, + "step": 13035 + }, + { + "epoch": 0.8859899442859084, + "grad_norm": 0.8805979490280151, + "learning_rate": 0.0008893022149748607, + "loss": 3.5878, + "step": 13040 + }, + { + "epoch": 0.8863296643565702, + "grad_norm": 0.6614952683448792, + "learning_rate": 0.000889259749966028, + "loss": 3.6349, + "step": 13045 + }, + { + "epoch": 0.886669384427232, + "grad_norm": 0.8000301718711853, + "learning_rate": 0.0008892172849571953, + "loss": 3.7459, + "step": 13050 + }, + { + "epoch": 0.8870091044978937, + "grad_norm": 0.8005181550979614, + "learning_rate": 0.0008891748199483625, + "loss": 3.621, + "step": 13055 + }, + { + "epoch": 0.8873488245685555, + "grad_norm": 1.0077171325683594, + "learning_rate": 0.0008891323549395299, + "loss": 3.5879, + "step": 13060 + }, + { + "epoch": 0.8876885446392173, + "grad_norm": 0.6627796292304993, + "learning_rate": 0.0008890898899306972, + "loss": 3.6634, + "step": 13065 + }, + { + "epoch": 0.8880282647098791, + "grad_norm": 0.9305934309959412, + "learning_rate": 0.0008890474249218644, + "loss": 3.5295, + "step": 13070 + }, + { + "epoch": 0.8883679847805408, + "grad_norm": 0.768598735332489, + "learning_rate": 0.0008890049599130316, + "loss": 3.7384, + "step": 13075 + }, + { + "epoch": 0.8887077048512027, + "grad_norm": 0.7440306544303894, + "learning_rate": 0.000888962494904199, + "loss": 3.6677, + "step": 13080 + }, + { + "epoch": 0.8890474249218644, + "grad_norm": 0.8969413638114929, + "learning_rate": 0.0008889200298953662, + "loss": 3.641, + "step": 13085 + }, + { + "epoch": 0.8893871449925261, + "grad_norm": 0.7797889709472656, + "learning_rate": 0.0008888775648865334, + "loss": 3.7145, + "step": 13090 + }, + { + "epoch": 0.889726865063188, + "grad_norm": 0.8052771687507629, + "learning_rate": 0.0008888350998777009, + "loss": 3.6505, + "step": 13095 + }, + { + "epoch": 0.8900665851338497, + "grad_norm": 0.7774892449378967, + "learning_rate": 0.0008887926348688681, + "loss": 3.6276, + "step": 13100 + }, + { + "epoch": 0.8904063052045115, + "grad_norm": 0.834392249584198, + "learning_rate": 0.0008887501698600353, + "loss": 3.6072, + "step": 13105 + }, + { + "epoch": 0.8907460252751732, + "grad_norm": 0.989751935005188, + "learning_rate": 0.0008887077048512027, + "loss": 3.7655, + "step": 13110 + }, + { + "epoch": 0.8910857453458351, + "grad_norm": 0.9355427622795105, + "learning_rate": 0.0008886652398423699, + "loss": 3.6014, + "step": 13115 + }, + { + "epoch": 0.8914254654164968, + "grad_norm": 0.7877017259597778, + "learning_rate": 0.0008886227748335371, + "loss": 3.8343, + "step": 13120 + }, + { + "epoch": 0.8917651854871586, + "grad_norm": 6.027553081512451, + "learning_rate": 0.0008885803098247046, + "loss": 3.4994, + "step": 13125 + }, + { + "epoch": 0.8921049055578204, + "grad_norm": 0.8912242650985718, + "learning_rate": 0.0008885378448158718, + "loss": 3.4291, + "step": 13130 + }, + { + "epoch": 0.8924446256284821, + "grad_norm": 0.9049345254898071, + "learning_rate": 0.000888495379807039, + "loss": 3.5536, + "step": 13135 + }, + { + "epoch": 0.8927843456991439, + "grad_norm": 0.6373432874679565, + "learning_rate": 0.0008884529147982063, + "loss": 3.7296, + "step": 13140 + }, + { + "epoch": 0.8931240657698056, + "grad_norm": 1.139078140258789, + "learning_rate": 0.0008884104497893736, + "loss": 3.8204, + "step": 13145 + }, + { + "epoch": 0.8934637858404675, + "grad_norm": 0.8393918871879578, + "learning_rate": 0.0008883679847805408, + "loss": 3.6334, + "step": 13150 + }, + { + "epoch": 0.8938035059111292, + "grad_norm": 0.777823805809021, + "learning_rate": 0.0008883255197717081, + "loss": 3.7935, + "step": 13155 + }, + { + "epoch": 0.894143225981791, + "grad_norm": 0.8541361689567566, + "learning_rate": 0.0008882830547628755, + "loss": 3.4105, + "step": 13160 + }, + { + "epoch": 0.8944829460524528, + "grad_norm": 0.7601314187049866, + "learning_rate": 0.0008882405897540427, + "loss": 3.4557, + "step": 13165 + }, + { + "epoch": 0.8948226661231146, + "grad_norm": 0.7342299222946167, + "learning_rate": 0.00088819812474521, + "loss": 3.4919, + "step": 13170 + }, + { + "epoch": 0.8951623861937763, + "grad_norm": 0.5788124799728394, + "learning_rate": 0.0008881556597363772, + "loss": 3.736, + "step": 13175 + }, + { + "epoch": 0.8955021062644382, + "grad_norm": 1.1262081861495972, + "learning_rate": 0.0008881131947275445, + "loss": 3.664, + "step": 13180 + }, + { + "epoch": 0.8958418263350999, + "grad_norm": 0.8499696850776672, + "learning_rate": 0.0008880707297187118, + "loss": 3.893, + "step": 13185 + }, + { + "epoch": 0.8961815464057616, + "grad_norm": 0.6769475936889648, + "learning_rate": 0.000888028264709879, + "loss": 3.5903, + "step": 13190 + }, + { + "epoch": 0.8965212664764234, + "grad_norm": 0.9238865375518799, + "learning_rate": 0.0008879857997010464, + "loss": 3.7412, + "step": 13195 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 0.7293673157691956, + "learning_rate": 0.0008879433346922137, + "loss": 3.6304, + "step": 13200 + }, + { + "epoch": 0.897200706617747, + "grad_norm": 0.7672217488288879, + "learning_rate": 0.0008879008696833809, + "loss": 3.5288, + "step": 13205 + }, + { + "epoch": 0.8975404266884087, + "grad_norm": 0.6517300009727478, + "learning_rate": 0.0008878584046745482, + "loss": 3.606, + "step": 13210 + }, + { + "epoch": 0.8978801467590706, + "grad_norm": 0.8979939222335815, + "learning_rate": 0.0008878159396657155, + "loss": 3.6145, + "step": 13215 + }, + { + "epoch": 0.8982198668297323, + "grad_norm": 0.8363761901855469, + "learning_rate": 0.0008877734746568827, + "loss": 3.7191, + "step": 13220 + }, + { + "epoch": 0.8985595869003941, + "grad_norm": 0.8604387044906616, + "learning_rate": 0.0008877310096480499, + "loss": 3.8193, + "step": 13225 + }, + { + "epoch": 0.8988993069710558, + "grad_norm": 0.8567779660224915, + "learning_rate": 0.0008876885446392174, + "loss": 3.8483, + "step": 13230 + }, + { + "epoch": 0.8992390270417177, + "grad_norm": 0.8949506878852844, + "learning_rate": 0.0008876460796303846, + "loss": 3.4932, + "step": 13235 + }, + { + "epoch": 0.8995787471123794, + "grad_norm": 0.6884737610816956, + "learning_rate": 0.0008876036146215518, + "loss": 3.8304, + "step": 13240 + }, + { + "epoch": 0.8999184671830411, + "grad_norm": 0.7422730922698975, + "learning_rate": 0.0008875611496127192, + "loss": 3.3829, + "step": 13245 + }, + { + "epoch": 0.900258187253703, + "grad_norm": 0.8351021409034729, + "learning_rate": 0.0008875186846038864, + "loss": 3.5943, + "step": 13250 + }, + { + "epoch": 0.9005979073243647, + "grad_norm": 0.8672903776168823, + "learning_rate": 0.0008874762195950536, + "loss": 3.6563, + "step": 13255 + }, + { + "epoch": 0.9009376273950265, + "grad_norm": 0.7368360757827759, + "learning_rate": 0.000887433754586221, + "loss": 3.7252, + "step": 13260 + }, + { + "epoch": 0.9012773474656883, + "grad_norm": 0.6936708092689514, + "learning_rate": 0.0008873912895773883, + "loss": 3.6197, + "step": 13265 + }, + { + "epoch": 0.9016170675363501, + "grad_norm": 1.0057976245880127, + "learning_rate": 0.0008873488245685555, + "loss": 3.6917, + "step": 13270 + }, + { + "epoch": 0.9019567876070118, + "grad_norm": 0.9247760772705078, + "learning_rate": 0.0008873063595597228, + "loss": 3.8023, + "step": 13275 + }, + { + "epoch": 0.9022965076776736, + "grad_norm": 0.8803149461746216, + "learning_rate": 0.0008872638945508901, + "loss": 3.8739, + "step": 13280 + }, + { + "epoch": 0.9026362277483354, + "grad_norm": 0.8565850257873535, + "learning_rate": 0.0008872214295420573, + "loss": 3.6967, + "step": 13285 + }, + { + "epoch": 0.9029759478189971, + "grad_norm": 0.9418727159500122, + "learning_rate": 0.0008871789645332246, + "loss": 3.6894, + "step": 13290 + }, + { + "epoch": 0.9033156678896589, + "grad_norm": 0.8542486429214478, + "learning_rate": 0.0008871364995243919, + "loss": 3.8724, + "step": 13295 + }, + { + "epoch": 0.9036553879603207, + "grad_norm": 0.7310173511505127, + "learning_rate": 0.0008870940345155592, + "loss": 3.7825, + "step": 13300 + }, + { + "epoch": 0.9039951080309825, + "grad_norm": 0.8404889106750488, + "learning_rate": 0.0008870515695067265, + "loss": 3.4567, + "step": 13305 + }, + { + "epoch": 0.9043348281016442, + "grad_norm": 0.7950018644332886, + "learning_rate": 0.0008870091044978938, + "loss": 3.8168, + "step": 13310 + }, + { + "epoch": 0.904674548172306, + "grad_norm": 0.6473474502563477, + "learning_rate": 0.000886966639489061, + "loss": 3.968, + "step": 13315 + }, + { + "epoch": 0.9050142682429678, + "grad_norm": 0.7432523369789124, + "learning_rate": 0.0008869241744802283, + "loss": 3.4704, + "step": 13320 + }, + { + "epoch": 0.9053539883136296, + "grad_norm": 0.8027026057243347, + "learning_rate": 0.0008868817094713955, + "loss": 3.5433, + "step": 13325 + }, + { + "epoch": 0.9056937083842913, + "grad_norm": 0.882222592830658, + "learning_rate": 0.0008868392444625629, + "loss": 3.5709, + "step": 13330 + }, + { + "epoch": 0.9060334284549532, + "grad_norm": 0.7984167337417603, + "learning_rate": 0.0008867967794537302, + "loss": 3.7816, + "step": 13335 + }, + { + "epoch": 0.9063731485256149, + "grad_norm": 0.7911337018013, + "learning_rate": 0.0008867543144448974, + "loss": 3.638, + "step": 13340 + }, + { + "epoch": 0.9067128685962766, + "grad_norm": 0.9159407019615173, + "learning_rate": 0.0008867118494360648, + "loss": 3.7456, + "step": 13345 + }, + { + "epoch": 0.9070525886669385, + "grad_norm": 2.4677720069885254, + "learning_rate": 0.000886669384427232, + "loss": 3.6656, + "step": 13350 + }, + { + "epoch": 0.9073923087376002, + "grad_norm": 0.8535523414611816, + "learning_rate": 0.0008866269194183992, + "loss": 3.6365, + "step": 13355 + }, + { + "epoch": 0.907732028808262, + "grad_norm": 0.8347687721252441, + "learning_rate": 0.0008865844544095666, + "loss": 3.541, + "step": 13360 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 1.0422698259353638, + "learning_rate": 0.0008865419894007338, + "loss": 3.5633, + "step": 13365 + }, + { + "epoch": 0.9084114689495856, + "grad_norm": 0.9044398665428162, + "learning_rate": 0.0008864995243919011, + "loss": 3.6355, + "step": 13370 + }, + { + "epoch": 0.9087511890202473, + "grad_norm": 0.7636294364929199, + "learning_rate": 0.0008864570593830684, + "loss": 3.2745, + "step": 13375 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.809526801109314, + "learning_rate": 0.0008864145943742357, + "loss": 3.8624, + "step": 13380 + }, + { + "epoch": 0.9094306291615709, + "grad_norm": 0.7578825950622559, + "learning_rate": 0.0008863721293654029, + "loss": 3.8821, + "step": 13385 + }, + { + "epoch": 0.9097703492322327, + "grad_norm": 0.952042818069458, + "learning_rate": 0.0008863381573583368, + "loss": 3.783, + "step": 13390 + }, + { + "epoch": 0.9101100693028944, + "grad_norm": 0.9756434559822083, + "learning_rate": 0.0008862956923495041, + "loss": 3.506, + "step": 13395 + }, + { + "epoch": 0.9104497893735561, + "grad_norm": 0.8290578126907349, + "learning_rate": 0.0008862532273406713, + "loss": 3.3927, + "step": 13400 + }, + { + "epoch": 0.910789509444218, + "grad_norm": 0.9047747254371643, + "learning_rate": 0.0008862107623318386, + "loss": 4.1358, + "step": 13405 + }, + { + "epoch": 0.9111292295148797, + "grad_norm": 0.6517637968063354, + "learning_rate": 0.0008861682973230058, + "loss": 3.4469, + "step": 13410 + }, + { + "epoch": 0.9114689495855415, + "grad_norm": 0.9057919383049011, + "learning_rate": 0.0008861258323141731, + "loss": 3.649, + "step": 13415 + }, + { + "epoch": 0.9118086696562033, + "grad_norm": 0.7667418718338013, + "learning_rate": 0.0008860833673053404, + "loss": 3.8761, + "step": 13420 + }, + { + "epoch": 0.9121483897268651, + "grad_norm": 0.8707554340362549, + "learning_rate": 0.0008860409022965077, + "loss": 3.6667, + "step": 13425 + }, + { + "epoch": 0.9124881097975268, + "grad_norm": 0.7027948498725891, + "learning_rate": 0.000885998437287675, + "loss": 3.5003, + "step": 13430 + }, + { + "epoch": 0.9128278298681887, + "grad_norm": 0.9205701351165771, + "learning_rate": 0.0008859559722788423, + "loss": 3.6461, + "step": 13435 + }, + { + "epoch": 0.9131675499388504, + "grad_norm": 0.9690806269645691, + "learning_rate": 0.0008859135072700095, + "loss": 3.4656, + "step": 13440 + }, + { + "epoch": 0.9135072700095122, + "grad_norm": 0.8491474390029907, + "learning_rate": 0.0008858710422611768, + "loss": 3.5775, + "step": 13445 + }, + { + "epoch": 0.9138469900801739, + "grad_norm": 0.7521077394485474, + "learning_rate": 0.0008858285772523441, + "loss": 3.51, + "step": 13450 + }, + { + "epoch": 0.9141867101508357, + "grad_norm": 0.6959039568901062, + "learning_rate": 0.0008857861122435113, + "loss": 3.5707, + "step": 13455 + }, + { + "epoch": 0.9145264302214975, + "grad_norm": 0.8571894764900208, + "learning_rate": 0.0008857436472346786, + "loss": 3.8924, + "step": 13460 + }, + { + "epoch": 0.9148661502921592, + "grad_norm": 0.8466618061065674, + "learning_rate": 0.000885701182225846, + "loss": 3.7246, + "step": 13465 + }, + { + "epoch": 0.9152058703628211, + "grad_norm": 1.6851712465286255, + "learning_rate": 0.0008856587172170132, + "loss": 3.8158, + "step": 13470 + }, + { + "epoch": 0.9155455904334828, + "grad_norm": 0.8464066982269287, + "learning_rate": 0.0008856162522081804, + "loss": 3.6258, + "step": 13475 + }, + { + "epoch": 0.9158853105041446, + "grad_norm": 0.8561259508132935, + "learning_rate": 0.0008855737871993478, + "loss": 3.4989, + "step": 13480 + }, + { + "epoch": 0.9162250305748063, + "grad_norm": 0.6971160769462585, + "learning_rate": 0.000885531322190515, + "loss": 3.5739, + "step": 13485 + }, + { + "epoch": 0.9165647506454682, + "grad_norm": 0.9155734777450562, + "learning_rate": 0.0008854888571816822, + "loss": 3.7581, + "step": 13490 + }, + { + "epoch": 0.9169044707161299, + "grad_norm": 1.0119434595108032, + "learning_rate": 0.0008854463921728497, + "loss": 3.706, + "step": 13495 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 1.085256814956665, + "learning_rate": 0.0008854039271640169, + "loss": 3.5625, + "step": 13500 + }, + { + "epoch": 0.9175839108574535, + "grad_norm": 0.7960216403007507, + "learning_rate": 0.0008853614621551841, + "loss": 3.7321, + "step": 13505 + }, + { + "epoch": 0.9179236309281152, + "grad_norm": 0.6819049119949341, + "learning_rate": 0.0008853189971463514, + "loss": 4.0129, + "step": 13510 + }, + { + "epoch": 0.918263350998777, + "grad_norm": 0.7596902251243591, + "learning_rate": 0.0008852765321375187, + "loss": 3.2518, + "step": 13515 + }, + { + "epoch": 0.9186030710694388, + "grad_norm": 1.0605517625808716, + "learning_rate": 0.0008852340671286859, + "loss": 3.752, + "step": 13520 + }, + { + "epoch": 0.9189427911401006, + "grad_norm": 0.7653706073760986, + "learning_rate": 0.0008851916021198532, + "loss": 3.638, + "step": 13525 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 0.8083335161209106, + "learning_rate": 0.0008851491371110206, + "loss": 3.3322, + "step": 13530 + }, + { + "epoch": 0.9196222312814241, + "grad_norm": 0.8433426022529602, + "learning_rate": 0.0008851066721021879, + "loss": 3.8036, + "step": 13535 + }, + { + "epoch": 0.9199619513520859, + "grad_norm": 0.7392431497573853, + "learning_rate": 0.0008850642070933551, + "loss": 3.8447, + "step": 13540 + }, + { + "epoch": 0.9203016714227477, + "grad_norm": 0.7772004008293152, + "learning_rate": 0.0008850217420845224, + "loss": 3.5301, + "step": 13545 + }, + { + "epoch": 0.9206413914934094, + "grad_norm": 0.826511561870575, + "learning_rate": 0.0008849792770756897, + "loss": 3.7882, + "step": 13550 + }, + { + "epoch": 0.9209811115640713, + "grad_norm": 0.8551540970802307, + "learning_rate": 0.0008849368120668569, + "loss": 3.6607, + "step": 13555 + }, + { + "epoch": 0.921320831634733, + "grad_norm": 0.8505045175552368, + "learning_rate": 0.0008848943470580241, + "loss": 3.63, + "step": 13560 + }, + { + "epoch": 0.9216605517053947, + "grad_norm": 1.1729084253311157, + "learning_rate": 0.0008848518820491916, + "loss": 3.7077, + "step": 13565 + }, + { + "epoch": 0.9220002717760565, + "grad_norm": 0.7206094264984131, + "learning_rate": 0.0008848094170403588, + "loss": 3.6335, + "step": 13570 + }, + { + "epoch": 0.9223399918467183, + "grad_norm": 0.7130137085914612, + "learning_rate": 0.000884766952031526, + "loss": 3.6784, + "step": 13575 + }, + { + "epoch": 0.9226797119173801, + "grad_norm": 1.3106021881103516, + "learning_rate": 0.0008847244870226934, + "loss": 3.6254, + "step": 13580 + }, + { + "epoch": 0.9230194319880418, + "grad_norm": 0.8854566216468811, + "learning_rate": 0.0008846820220138606, + "loss": 3.3756, + "step": 13585 + }, + { + "epoch": 0.9233591520587037, + "grad_norm": 0.7487343549728394, + "learning_rate": 0.0008846395570050278, + "loss": 3.6906, + "step": 13590 + }, + { + "epoch": 0.9236988721293654, + "grad_norm": 0.7952874898910522, + "learning_rate": 0.0008845970919961952, + "loss": 3.6622, + "step": 13595 + }, + { + "epoch": 0.9240385922000272, + "grad_norm": 0.7046886682510376, + "learning_rate": 0.0008845546269873625, + "loss": 3.6066, + "step": 13600 + }, + { + "epoch": 0.924378312270689, + "grad_norm": 0.7407775521278381, + "learning_rate": 0.0008845121619785297, + "loss": 3.8302, + "step": 13605 + }, + { + "epoch": 0.9247180323413507, + "grad_norm": 0.6793643832206726, + "learning_rate": 0.000884469696969697, + "loss": 3.8039, + "step": 13610 + }, + { + "epoch": 0.9250577524120125, + "grad_norm": 0.8096951842308044, + "learning_rate": 0.0008844272319608643, + "loss": 3.6233, + "step": 13615 + }, + { + "epoch": 0.9253974724826742, + "grad_norm": 0.9985918998718262, + "learning_rate": 0.0008843847669520315, + "loss": 3.6541, + "step": 13620 + }, + { + "epoch": 0.9257371925533361, + "grad_norm": 1.0893831253051758, + "learning_rate": 0.0008843423019431988, + "loss": 3.7452, + "step": 13625 + }, + { + "epoch": 0.9260769126239978, + "grad_norm": 0.559908926486969, + "learning_rate": 0.0008842998369343661, + "loss": 3.5642, + "step": 13630 + }, + { + "epoch": 0.9264166326946596, + "grad_norm": 0.8469265103340149, + "learning_rate": 0.0008842573719255334, + "loss": 3.7964, + "step": 13635 + }, + { + "epoch": 0.9267563527653214, + "grad_norm": 0.8896875381469727, + "learning_rate": 0.0008842149069167007, + "loss": 3.6546, + "step": 13640 + }, + { + "epoch": 0.9270960728359832, + "grad_norm": 0.6882612109184265, + "learning_rate": 0.000884172441907868, + "loss": 3.4332, + "step": 13645 + }, + { + "epoch": 0.9274357929066449, + "grad_norm": 0.7867699861526489, + "learning_rate": 0.0008841299768990352, + "loss": 3.8918, + "step": 13650 + }, + { + "epoch": 0.9277755129773066, + "grad_norm": 0.86528080701828, + "learning_rate": 0.0008840875118902025, + "loss": 3.6997, + "step": 13655 + }, + { + "epoch": 0.9281152330479685, + "grad_norm": 0.6395547986030579, + "learning_rate": 0.0008840450468813697, + "loss": 3.791, + "step": 13660 + }, + { + "epoch": 0.9284549531186302, + "grad_norm": 1.1433875560760498, + "learning_rate": 0.000884002581872537, + "loss": 3.6009, + "step": 13665 + }, + { + "epoch": 0.928794673189292, + "grad_norm": 0.7217481732368469, + "learning_rate": 0.0008839601168637044, + "loss": 3.8374, + "step": 13670 + }, + { + "epoch": 0.9291343932599538, + "grad_norm": 0.9810029864311218, + "learning_rate": 0.0008839176518548716, + "loss": 3.7599, + "step": 13675 + }, + { + "epoch": 0.9294741133306156, + "grad_norm": 0.8804602026939392, + "learning_rate": 0.0008838751868460389, + "loss": 3.673, + "step": 13680 + }, + { + "epoch": 0.9298138334012773, + "grad_norm": 0.8134076595306396, + "learning_rate": 0.0008838327218372062, + "loss": 3.5049, + "step": 13685 + }, + { + "epoch": 0.9301535534719392, + "grad_norm": 0.7853949069976807, + "learning_rate": 0.0008837902568283734, + "loss": 3.5951, + "step": 13690 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 1.2072824239730835, + "learning_rate": 0.0008837477918195406, + "loss": 3.5955, + "step": 13695 + }, + { + "epoch": 0.9308329936132627, + "grad_norm": 0.7537646293640137, + "learning_rate": 0.000883705326810708, + "loss": 3.5049, + "step": 13700 + }, + { + "epoch": 0.9311727136839244, + "grad_norm": 0.876721978187561, + "learning_rate": 0.0008836628618018753, + "loss": 3.5831, + "step": 13705 + }, + { + "epoch": 0.9315124337545863, + "grad_norm": 0.6627894639968872, + "learning_rate": 0.0008836203967930425, + "loss": 3.6574, + "step": 13710 + }, + { + "epoch": 0.931852153825248, + "grad_norm": 0.7239164113998413, + "learning_rate": 0.0008835779317842099, + "loss": 3.7791, + "step": 13715 + }, + { + "epoch": 0.9321918738959097, + "grad_norm": 0.8032355308532715, + "learning_rate": 0.0008835354667753771, + "loss": 3.617, + "step": 13720 + }, + { + "epoch": 0.9325315939665716, + "grad_norm": 0.8140981197357178, + "learning_rate": 0.0008834930017665443, + "loss": 3.4228, + "step": 13725 + }, + { + "epoch": 0.9328713140372333, + "grad_norm": 0.6916741728782654, + "learning_rate": 0.0008834505367577117, + "loss": 3.5757, + "step": 13730 + }, + { + "epoch": 0.9332110341078951, + "grad_norm": 0.7824718952178955, + "learning_rate": 0.0008834080717488789, + "loss": 3.8802, + "step": 13735 + }, + { + "epoch": 0.9335507541785568, + "grad_norm": 0.6864860653877258, + "learning_rate": 0.0008833656067400462, + "loss": 3.6424, + "step": 13740 + }, + { + "epoch": 0.9338904742492187, + "grad_norm": 0.8139113783836365, + "learning_rate": 0.0008833231417312136, + "loss": 3.7303, + "step": 13745 + }, + { + "epoch": 0.9342301943198804, + "grad_norm": 0.889613151550293, + "learning_rate": 0.0008832806767223808, + "loss": 3.1922, + "step": 13750 + }, + { + "epoch": 0.9345699143905422, + "grad_norm": 0.6793786883354187, + "learning_rate": 0.000883238211713548, + "loss": 3.77, + "step": 13755 + }, + { + "epoch": 0.934909634461204, + "grad_norm": 0.86944580078125, + "learning_rate": 0.0008831957467047153, + "loss": 3.5828, + "step": 13760 + }, + { + "epoch": 0.9352493545318658, + "grad_norm": 0.850764811038971, + "learning_rate": 0.0008831532816958826, + "loss": 3.7034, + "step": 13765 + }, + { + "epoch": 0.9355890746025275, + "grad_norm": 1.1837507486343384, + "learning_rate": 0.0008831108166870498, + "loss": 3.7219, + "step": 13770 + }, + { + "epoch": 0.9359287946731893, + "grad_norm": 0.8085290789604187, + "learning_rate": 0.0008830683516782172, + "loss": 3.4804, + "step": 13775 + }, + { + "epoch": 0.9362685147438511, + "grad_norm": 0.778426468372345, + "learning_rate": 0.0008830258866693845, + "loss": 3.5631, + "step": 13780 + }, + { + "epoch": 0.9366082348145128, + "grad_norm": 0.6045960783958435, + "learning_rate": 0.0008829834216605517, + "loss": 3.7075, + "step": 13785 + }, + { + "epoch": 0.9369479548851746, + "grad_norm": 0.7115256190299988, + "learning_rate": 0.000882940956651719, + "loss": 3.6152, + "step": 13790 + }, + { + "epoch": 0.9372876749558364, + "grad_norm": 1.9463105201721191, + "learning_rate": 0.0008828984916428862, + "loss": 3.7302, + "step": 13795 + }, + { + "epoch": 0.9376273950264982, + "grad_norm": 0.7814124226570129, + "learning_rate": 0.0008828560266340535, + "loss": 3.4532, + "step": 13800 + }, + { + "epoch": 0.9379671150971599, + "grad_norm": 0.6608603000640869, + "learning_rate": 0.0008828135616252208, + "loss": 3.6809, + "step": 13805 + }, + { + "epoch": 0.9383068351678218, + "grad_norm": 0.8141739964485168, + "learning_rate": 0.0008827710966163881, + "loss": 3.3225, + "step": 13810 + }, + { + "epoch": 0.9386465552384835, + "grad_norm": 0.7635350823402405, + "learning_rate": 0.0008827286316075554, + "loss": 3.6025, + "step": 13815 + }, + { + "epoch": 0.9389862753091452, + "grad_norm": 0.6930614709854126, + "learning_rate": 0.0008826861665987227, + "loss": 3.7506, + "step": 13820 + }, + { + "epoch": 0.939325995379807, + "grad_norm": 0.940110445022583, + "learning_rate": 0.0008826437015898899, + "loss": 3.3136, + "step": 13825 + }, + { + "epoch": 0.9396657154504688, + "grad_norm": 1.0639580488204956, + "learning_rate": 0.0008826012365810572, + "loss": 3.7108, + "step": 13830 + }, + { + "epoch": 0.9400054355211306, + "grad_norm": 0.7539517283439636, + "learning_rate": 0.0008825587715722245, + "loss": 3.583, + "step": 13835 + }, + { + "epoch": 0.9403451555917923, + "grad_norm": 0.6852843165397644, + "learning_rate": 0.0008825163065633917, + "loss": 3.6617, + "step": 13840 + }, + { + "epoch": 0.9406848756624542, + "grad_norm": 1.0215131044387817, + "learning_rate": 0.000882473841554559, + "loss": 3.5326, + "step": 13845 + }, + { + "epoch": 0.9410245957331159, + "grad_norm": 0.6570088863372803, + "learning_rate": 0.0008824313765457264, + "loss": 3.8114, + "step": 13850 + }, + { + "epoch": 0.9413643158037777, + "grad_norm": 0.6508252620697021, + "learning_rate": 0.0008823889115368936, + "loss": 3.7592, + "step": 13855 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 0.8454975485801697, + "learning_rate": 0.0008823464465280608, + "loss": 3.6928, + "step": 13860 + }, + { + "epoch": 0.9420437559451013, + "grad_norm": 1.0674575567245483, + "learning_rate": 0.0008823039815192282, + "loss": 3.3999, + "step": 13865 + }, + { + "epoch": 0.942383476015763, + "grad_norm": 0.9501475691795349, + "learning_rate": 0.0008822615165103954, + "loss": 3.6265, + "step": 13870 + }, + { + "epoch": 0.9427231960864247, + "grad_norm": 0.8592423796653748, + "learning_rate": 0.0008822190515015628, + "loss": 3.6936, + "step": 13875 + }, + { + "epoch": 0.9430629161570866, + "grad_norm": 1.037310242652893, + "learning_rate": 0.0008821765864927301, + "loss": 3.6883, + "step": 13880 + }, + { + "epoch": 0.9434026362277483, + "grad_norm": 0.83265620470047, + "learning_rate": 0.0008821341214838973, + "loss": 3.4519, + "step": 13885 + }, + { + "epoch": 0.9437423562984101, + "grad_norm": 1.2390183210372925, + "learning_rate": 0.0008820916564750646, + "loss": 3.8231, + "step": 13890 + }, + { + "epoch": 0.9440820763690719, + "grad_norm": 0.715252697467804, + "learning_rate": 0.0008820491914662319, + "loss": 3.797, + "step": 13895 + }, + { + "epoch": 0.9444217964397337, + "grad_norm": 0.8662611246109009, + "learning_rate": 0.0008820067264573991, + "loss": 3.5663, + "step": 13900 + }, + { + "epoch": 0.9447615165103954, + "grad_norm": 0.8430952429771423, + "learning_rate": 0.0008819642614485664, + "loss": 3.6256, + "step": 13905 + }, + { + "epoch": 0.9451012365810572, + "grad_norm": 0.781895101070404, + "learning_rate": 0.0008819217964397337, + "loss": 3.8182, + "step": 13910 + }, + { + "epoch": 0.945440956651719, + "grad_norm": 0.9409408569335938, + "learning_rate": 0.000881879331430901, + "loss": 3.6515, + "step": 13915 + }, + { + "epoch": 0.9457806767223808, + "grad_norm": 0.7531744837760925, + "learning_rate": 0.0008818368664220683, + "loss": 3.5132, + "step": 13920 + }, + { + "epoch": 0.9461203967930425, + "grad_norm": 0.6852971911430359, + "learning_rate": 0.0008817944014132355, + "loss": 3.8106, + "step": 13925 + }, + { + "epoch": 0.9464601168637043, + "grad_norm": 0.7968720197677612, + "learning_rate": 0.0008817519364044028, + "loss": 3.5188, + "step": 13930 + }, + { + "epoch": 0.9467998369343661, + "grad_norm": 0.8002681732177734, + "learning_rate": 0.0008817094713955701, + "loss": 3.5911, + "step": 13935 + }, + { + "epoch": 0.9471395570050278, + "grad_norm": 0.9507863521575928, + "learning_rate": 0.0008816670063867373, + "loss": 3.8999, + "step": 13940 + }, + { + "epoch": 0.9474792770756897, + "grad_norm": 0.9092618823051453, + "learning_rate": 0.0008816245413779047, + "loss": 3.6656, + "step": 13945 + }, + { + "epoch": 0.9478189971463514, + "grad_norm": 0.7698338627815247, + "learning_rate": 0.000881582076369072, + "loss": 3.7083, + "step": 13950 + }, + { + "epoch": 0.9481587172170132, + "grad_norm": 0.671148955821991, + "learning_rate": 0.0008815396113602392, + "loss": 3.4102, + "step": 13955 + }, + { + "epoch": 0.9484984372876749, + "grad_norm": 0.6878060102462769, + "learning_rate": 0.0008814971463514064, + "loss": 3.5625, + "step": 13960 + }, + { + "epoch": 0.9488381573583368, + "grad_norm": 0.7749282121658325, + "learning_rate": 0.0008814546813425738, + "loss": 3.4201, + "step": 13965 + }, + { + "epoch": 0.9491778774289985, + "grad_norm": 0.9377570748329163, + "learning_rate": 0.000881412216333741, + "loss": 3.8731, + "step": 13970 + }, + { + "epoch": 0.9495175974996602, + "grad_norm": 0.7445239424705505, + "learning_rate": 0.0008813697513249082, + "loss": 3.6481, + "step": 13975 + }, + { + "epoch": 0.9498573175703221, + "grad_norm": 0.778207004070282, + "learning_rate": 0.0008813272863160757, + "loss": 3.7647, + "step": 13980 + }, + { + "epoch": 0.9501970376409838, + "grad_norm": 0.8374427556991577, + "learning_rate": 0.0008812848213072429, + "loss": 3.5506, + "step": 13985 + }, + { + "epoch": 0.9505367577116456, + "grad_norm": 0.6861892938613892, + "learning_rate": 0.0008812423562984101, + "loss": 3.6606, + "step": 13990 + }, + { + "epoch": 0.9508764777823073, + "grad_norm": 0.792443037033081, + "learning_rate": 0.0008811998912895775, + "loss": 3.7327, + "step": 13995 + }, + { + "epoch": 0.9512161978529692, + "grad_norm": 1.3962026834487915, + "learning_rate": 0.0008811574262807447, + "loss": 3.5266, + "step": 14000 + }, + { + "epoch": 0.9515559179236309, + "grad_norm": 0.5809056162834167, + "learning_rate": 0.0008811149612719119, + "loss": 3.3213, + "step": 14005 + }, + { + "epoch": 0.9518956379942927, + "grad_norm": 0.7545545697212219, + "learning_rate": 0.0008810724962630792, + "loss": 3.4927, + "step": 14010 + }, + { + "epoch": 0.9522353580649545, + "grad_norm": 0.8809139132499695, + "learning_rate": 0.0008810300312542466, + "loss": 3.7494, + "step": 14015 + }, + { + "epoch": 0.9525750781356163, + "grad_norm": 0.7643922567367554, + "learning_rate": 0.0008809875662454138, + "loss": 3.6873, + "step": 14020 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 0.8687959909439087, + "learning_rate": 0.0008809451012365811, + "loss": 3.7057, + "step": 14025 + }, + { + "epoch": 0.9532545182769399, + "grad_norm": 0.78090900182724, + "learning_rate": 0.0008809026362277484, + "loss": 3.5782, + "step": 14030 + }, + { + "epoch": 0.9535942383476016, + "grad_norm": 0.8087513446807861, + "learning_rate": 0.0008808601712189156, + "loss": 3.6528, + "step": 14035 + }, + { + "epoch": 0.9539339584182633, + "grad_norm": 1.0104749202728271, + "learning_rate": 0.0008808177062100829, + "loss": 3.8199, + "step": 14040 + }, + { + "epoch": 0.9542736784889251, + "grad_norm": 2.927396297454834, + "learning_rate": 0.0008807752412012501, + "loss": 3.6987, + "step": 14045 + }, + { + "epoch": 0.9546133985595869, + "grad_norm": 0.7671241760253906, + "learning_rate": 0.0008807327761924175, + "loss": 3.8471, + "step": 14050 + }, + { + "epoch": 0.9549531186302487, + "grad_norm": 0.7443845868110657, + "learning_rate": 0.0008806903111835848, + "loss": 3.6355, + "step": 14055 + }, + { + "epoch": 0.9552928387009104, + "grad_norm": 2.0092742443084717, + "learning_rate": 0.000880647846174752, + "loss": 3.5383, + "step": 14060 + }, + { + "epoch": 0.9556325587715723, + "grad_norm": 0.8730608820915222, + "learning_rate": 0.0008806053811659193, + "loss": 3.6488, + "step": 14065 + }, + { + "epoch": 0.955972278842234, + "grad_norm": 0.7821953892707825, + "learning_rate": 0.0008805629161570866, + "loss": 3.6181, + "step": 14070 + }, + { + "epoch": 0.9563119989128958, + "grad_norm": 0.816964864730835, + "learning_rate": 0.0008805204511482538, + "loss": 3.8055, + "step": 14075 + }, + { + "epoch": 0.9566517189835575, + "grad_norm": 0.9876099824905396, + "learning_rate": 0.000880477986139421, + "loss": 3.6915, + "step": 14080 + }, + { + "epoch": 0.9569914390542194, + "grad_norm": 0.7631768584251404, + "learning_rate": 0.0008804355211305885, + "loss": 3.5842, + "step": 14085 + }, + { + "epoch": 0.9573311591248811, + "grad_norm": 0.8626513481140137, + "learning_rate": 0.0008803930561217557, + "loss": 3.6435, + "step": 14090 + }, + { + "epoch": 0.9576708791955428, + "grad_norm": 0.6698563098907471, + "learning_rate": 0.0008803505911129229, + "loss": 3.6955, + "step": 14095 + }, + { + "epoch": 0.9580105992662047, + "grad_norm": 0.8681396245956421, + "learning_rate": 0.0008803081261040903, + "loss": 3.5313, + "step": 14100 + }, + { + "epoch": 0.9583503193368664, + "grad_norm": 0.9257439970970154, + "learning_rate": 0.0008802656610952575, + "loss": 3.4691, + "step": 14105 + }, + { + "epoch": 0.9586900394075282, + "grad_norm": 0.7571332454681396, + "learning_rate": 0.0008802231960864247, + "loss": 3.5118, + "step": 14110 + }, + { + "epoch": 0.95902975947819, + "grad_norm": 0.7581424713134766, + "learning_rate": 0.0008801807310775921, + "loss": 3.5854, + "step": 14115 + }, + { + "epoch": 0.9593694795488518, + "grad_norm": 0.7896050810813904, + "learning_rate": 0.0008801382660687594, + "loss": 4.002, + "step": 14120 + }, + { + "epoch": 0.9597091996195135, + "grad_norm": 0.878494918346405, + "learning_rate": 0.0008800958010599266, + "loss": 3.8238, + "step": 14125 + }, + { + "epoch": 0.9600489196901753, + "grad_norm": 0.729349672794342, + "learning_rate": 0.000880053336051094, + "loss": 3.7281, + "step": 14130 + }, + { + "epoch": 0.9603886397608371, + "grad_norm": 0.9200730323791504, + "learning_rate": 0.0008800108710422612, + "loss": 3.6588, + "step": 14135 + }, + { + "epoch": 0.9607283598314988, + "grad_norm": 0.8575905561447144, + "learning_rate": 0.0008799684060334284, + "loss": 3.8783, + "step": 14140 + }, + { + "epoch": 0.9610680799021606, + "grad_norm": 0.6644276976585388, + "learning_rate": 0.0008799259410245957, + "loss": 3.5634, + "step": 14145 + }, + { + "epoch": 0.9614077999728224, + "grad_norm": 0.8758966326713562, + "learning_rate": 0.000879883476015763, + "loss": 3.804, + "step": 14150 + }, + { + "epoch": 0.9617475200434842, + "grad_norm": 0.8961034417152405, + "learning_rate": 0.0008798410110069303, + "loss": 3.6504, + "step": 14155 + }, + { + "epoch": 0.9620872401141459, + "grad_norm": 1.4356441497802734, + "learning_rate": 0.0008797985459980976, + "loss": 3.661, + "step": 14160 + }, + { + "epoch": 0.9624269601848077, + "grad_norm": 0.7047199606895447, + "learning_rate": 0.0008797560809892649, + "loss": 3.702, + "step": 14165 + }, + { + "epoch": 0.9627666802554695, + "grad_norm": 0.7954128384590149, + "learning_rate": 0.0008797136159804321, + "loss": 3.6189, + "step": 14170 + }, + { + "epoch": 0.9631064003261313, + "grad_norm": 0.7756403088569641, + "learning_rate": 0.0008796711509715994, + "loss": 3.6875, + "step": 14175 + }, + { + "epoch": 0.963446120396793, + "grad_norm": 1.2114219665527344, + "learning_rate": 0.0008796286859627667, + "loss": 3.6276, + "step": 14180 + }, + { + "epoch": 0.9637858404674549, + "grad_norm": 0.897359311580658, + "learning_rate": 0.0008795862209539339, + "loss": 3.8286, + "step": 14185 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 0.7375702261924744, + "learning_rate": 0.0008795437559451013, + "loss": 3.5699, + "step": 14190 + }, + { + "epoch": 0.9644652806087783, + "grad_norm": 0.7505749464035034, + "learning_rate": 0.0008795012909362685, + "loss": 3.6231, + "step": 14195 + }, + { + "epoch": 0.9648050006794402, + "grad_norm": 0.5883892774581909, + "learning_rate": 0.0008794588259274358, + "loss": 3.5506, + "step": 14200 + }, + { + "epoch": 0.9651447207501019, + "grad_norm": 0.8931959867477417, + "learning_rate": 0.0008794163609186031, + "loss": 3.5767, + "step": 14205 + }, + { + "epoch": 0.9654844408207637, + "grad_norm": 0.7998825907707214, + "learning_rate": 0.0008793738959097703, + "loss": 3.6338, + "step": 14210 + }, + { + "epoch": 0.9658241608914254, + "grad_norm": 0.9050804972648621, + "learning_rate": 0.0008793314309009377, + "loss": 3.6815, + "step": 14215 + }, + { + "epoch": 0.9661638809620873, + "grad_norm": 0.9320011734962463, + "learning_rate": 0.0008792889658921049, + "loss": 3.7793, + "step": 14220 + }, + { + "epoch": 0.966503601032749, + "grad_norm": 0.8431034684181213, + "learning_rate": 0.0008792465008832722, + "loss": 3.681, + "step": 14225 + }, + { + "epoch": 0.9668433211034108, + "grad_norm": 0.805184006690979, + "learning_rate": 0.0008792040358744396, + "loss": 3.5972, + "step": 14230 + }, + { + "epoch": 0.9671830411740726, + "grad_norm": 0.8375820517539978, + "learning_rate": 0.0008791615708656068, + "loss": 3.7256, + "step": 14235 + }, + { + "epoch": 0.9675227612447344, + "grad_norm": 1.0146641731262207, + "learning_rate": 0.000879119105856774, + "loss": 3.4831, + "step": 14240 + }, + { + "epoch": 0.9678624813153961, + "grad_norm": 0.8173530101776123, + "learning_rate": 0.0008790766408479413, + "loss": 3.5049, + "step": 14245 + }, + { + "epoch": 0.9682022013860578, + "grad_norm": 0.6880283355712891, + "learning_rate": 0.0008790341758391086, + "loss": 3.7836, + "step": 14250 + }, + { + "epoch": 0.9685419214567197, + "grad_norm": 0.7227784991264343, + "learning_rate": 0.0008789917108302758, + "loss": 3.7745, + "step": 14255 + }, + { + "epoch": 0.9688816415273814, + "grad_norm": 0.6699380874633789, + "learning_rate": 0.0008789492458214432, + "loss": 3.6667, + "step": 14260 + }, + { + "epoch": 0.9692213615980432, + "grad_norm": 0.6838748455047607, + "learning_rate": 0.0008789067808126105, + "loss": 3.596, + "step": 14265 + }, + { + "epoch": 0.969561081668705, + "grad_norm": 3.4489290714263916, + "learning_rate": 0.0008788643158037777, + "loss": 3.59, + "step": 14270 + }, + { + "epoch": 0.9699008017393668, + "grad_norm": 0.8571457862854004, + "learning_rate": 0.000878821850794945, + "loss": 3.7428, + "step": 14275 + }, + { + "epoch": 0.9702405218100285, + "grad_norm": 0.8659314513206482, + "learning_rate": 0.0008787793857861123, + "loss": 3.6084, + "step": 14280 + }, + { + "epoch": 0.9705802418806904, + "grad_norm": 0.7246422171592712, + "learning_rate": 0.0008787369207772795, + "loss": 3.677, + "step": 14285 + }, + { + "epoch": 0.9709199619513521, + "grad_norm": 0.6887790560722351, + "learning_rate": 0.0008786944557684468, + "loss": 3.6696, + "step": 14290 + }, + { + "epoch": 0.9712596820220138, + "grad_norm": 0.7840778827667236, + "learning_rate": 0.0008786519907596141, + "loss": 3.6305, + "step": 14295 + }, + { + "epoch": 0.9715994020926756, + "grad_norm": 0.6830389499664307, + "learning_rate": 0.0008786095257507814, + "loss": 3.6028, + "step": 14300 + }, + { + "epoch": 0.9719391221633374, + "grad_norm": 0.7378713488578796, + "learning_rate": 0.0008785670607419487, + "loss": 3.8198, + "step": 14305 + }, + { + "epoch": 0.9722788422339992, + "grad_norm": 0.8808820843696594, + "learning_rate": 0.0008785245957331159, + "loss": 3.7087, + "step": 14310 + }, + { + "epoch": 0.9726185623046609, + "grad_norm": 1.0028831958770752, + "learning_rate": 0.0008784821307242832, + "loss": 3.4489, + "step": 14315 + }, + { + "epoch": 0.9729582823753228, + "grad_norm": 0.667633593082428, + "learning_rate": 0.0008784396657154505, + "loss": 3.6032, + "step": 14320 + }, + { + "epoch": 0.9732980024459845, + "grad_norm": 0.9350041747093201, + "learning_rate": 0.0008783972007066177, + "loss": 3.2011, + "step": 14325 + }, + { + "epoch": 0.9736377225166463, + "grad_norm": 0.7247070074081421, + "learning_rate": 0.0008783547356977851, + "loss": 3.7747, + "step": 14330 + }, + { + "epoch": 0.973977442587308, + "grad_norm": 0.6495918035507202, + "learning_rate": 0.0008783122706889524, + "loss": 3.7331, + "step": 14335 + }, + { + "epoch": 0.9743171626579699, + "grad_norm": 0.5766088366508484, + "learning_rate": 0.0008782698056801196, + "loss": 3.599, + "step": 14340 + }, + { + "epoch": 0.9746568827286316, + "grad_norm": 1.4575363397598267, + "learning_rate": 0.0008782273406712868, + "loss": 3.7844, + "step": 14345 + }, + { + "epoch": 0.9749966027992933, + "grad_norm": 0.8522710800170898, + "learning_rate": 0.0008781848756624542, + "loss": 3.6645, + "step": 14350 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 0.7669202089309692, + "learning_rate": 0.0008781424106536214, + "loss": 3.2779, + "step": 14355 + }, + { + "epoch": 0.9756760429406169, + "grad_norm": 0.7283913493156433, + "learning_rate": 0.0008780999456447886, + "loss": 3.7692, + "step": 14360 + }, + { + "epoch": 0.9760157630112787, + "grad_norm": 0.7726561427116394, + "learning_rate": 0.0008780574806359561, + "loss": 3.6281, + "step": 14365 + }, + { + "epoch": 0.9763554830819405, + "grad_norm": 0.7966226935386658, + "learning_rate": 0.0008780150156271233, + "loss": 3.7506, + "step": 14370 + }, + { + "epoch": 0.9766952031526023, + "grad_norm": 1.0231359004974365, + "learning_rate": 0.0008779725506182905, + "loss": 3.4747, + "step": 14375 + }, + { + "epoch": 0.977034923223264, + "grad_norm": 0.8659664988517761, + "learning_rate": 0.0008779300856094579, + "loss": 3.7118, + "step": 14380 + }, + { + "epoch": 0.9773746432939258, + "grad_norm": 1.3602862358093262, + "learning_rate": 0.0008778876206006251, + "loss": 3.36, + "step": 14385 + }, + { + "epoch": 0.9777143633645876, + "grad_norm": 0.9339749217033386, + "learning_rate": 0.0008778451555917923, + "loss": 3.4711, + "step": 14390 + }, + { + "epoch": 0.9780540834352494, + "grad_norm": 0.8001043200492859, + "learning_rate": 0.0008778026905829596, + "loss": 3.7394, + "step": 14395 + }, + { + "epoch": 0.9783938035059111, + "grad_norm": 1.3112300634384155, + "learning_rate": 0.000877760225574127, + "loss": 3.6017, + "step": 14400 + }, + { + "epoch": 0.978733523576573, + "grad_norm": 1.074655294418335, + "learning_rate": 0.0008777177605652942, + "loss": 3.4528, + "step": 14405 + }, + { + "epoch": 0.9790732436472347, + "grad_norm": 0.7202587723731995, + "learning_rate": 0.0008776752955564615, + "loss": 3.5412, + "step": 14410 + }, + { + "epoch": 0.9794129637178964, + "grad_norm": 0.6939672231674194, + "learning_rate": 0.0008776328305476288, + "loss": 3.6971, + "step": 14415 + }, + { + "epoch": 0.9797526837885582, + "grad_norm": 1.0251340866088867, + "learning_rate": 0.000877590365538796, + "loss": 3.4288, + "step": 14420 + }, + { + "epoch": 0.98009240385922, + "grad_norm": 0.8140147924423218, + "learning_rate": 0.0008775479005299633, + "loss": 3.395, + "step": 14425 + }, + { + "epoch": 0.9804321239298818, + "grad_norm": 0.9445436596870422, + "learning_rate": 0.0008775054355211305, + "loss": 3.7618, + "step": 14430 + }, + { + "epoch": 0.9807718440005435, + "grad_norm": 1.5093330144882202, + "learning_rate": 0.0008774629705122979, + "loss": 3.3225, + "step": 14435 + }, + { + "epoch": 0.9811115640712054, + "grad_norm": 0.7888212203979492, + "learning_rate": 0.0008774205055034652, + "loss": 3.6241, + "step": 14440 + }, + { + "epoch": 0.9814512841418671, + "grad_norm": 0.735809326171875, + "learning_rate": 0.0008773780404946324, + "loss": 3.7514, + "step": 14445 + }, + { + "epoch": 0.9817910042125289, + "grad_norm": 1.035038948059082, + "learning_rate": 0.0008773355754857997, + "loss": 3.3966, + "step": 14450 + }, + { + "epoch": 0.9821307242831907, + "grad_norm": 0.6126461625099182, + "learning_rate": 0.000877293110476967, + "loss": 3.7372, + "step": 14455 + }, + { + "epoch": 0.9824704443538524, + "grad_norm": 1.5529906749725342, + "learning_rate": 0.0008772506454681342, + "loss": 3.5282, + "step": 14460 + }, + { + "epoch": 0.9828101644245142, + "grad_norm": 0.6753084659576416, + "learning_rate": 0.0008772081804593015, + "loss": 3.7018, + "step": 14465 + }, + { + "epoch": 0.9831498844951759, + "grad_norm": 1.7974852323532104, + "learning_rate": 0.0008771657154504689, + "loss": 3.6868, + "step": 14470 + }, + { + "epoch": 0.9834896045658378, + "grad_norm": 1.1856310367584229, + "learning_rate": 0.0008771232504416361, + "loss": 3.3799, + "step": 14475 + }, + { + "epoch": 0.9838293246364995, + "grad_norm": 0.9552042484283447, + "learning_rate": 0.0008770807854328033, + "loss": 3.551, + "step": 14480 + }, + { + "epoch": 0.9841690447071613, + "grad_norm": 0.7982539534568787, + "learning_rate": 0.0008770383204239707, + "loss": 3.7515, + "step": 14485 + }, + { + "epoch": 0.9845087647778231, + "grad_norm": 0.710839569568634, + "learning_rate": 0.0008769958554151379, + "loss": 3.6189, + "step": 14490 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 1.089133858680725, + "learning_rate": 0.0008769533904063051, + "loss": 3.7779, + "step": 14495 + }, + { + "epoch": 0.9851882049191466, + "grad_norm": 0.813000500202179, + "learning_rate": 0.0008769109253974726, + "loss": 3.654, + "step": 14500 + }, + { + "epoch": 0.9855279249898083, + "grad_norm": 1.1919548511505127, + "learning_rate": 0.0008768684603886398, + "loss": 3.5314, + "step": 14505 + }, + { + "epoch": 0.9858676450604702, + "grad_norm": 1.1500823497772217, + "learning_rate": 0.000876825995379807, + "loss": 3.4783, + "step": 14510 + }, + { + "epoch": 0.9862073651311319, + "grad_norm": 0.7791033983230591, + "learning_rate": 0.0008767835303709744, + "loss": 3.8316, + "step": 14515 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 0.7140618562698364, + "learning_rate": 0.0008767410653621416, + "loss": 3.565, + "step": 14520 + }, + { + "epoch": 0.9868868052724555, + "grad_norm": 0.682572603225708, + "learning_rate": 0.0008766986003533088, + "loss": 3.4664, + "step": 14525 + }, + { + "epoch": 0.9872265253431173, + "grad_norm": 0.9518576860427856, + "learning_rate": 0.0008766561353444761, + "loss": 3.7221, + "step": 14530 + }, + { + "epoch": 0.987566245413779, + "grad_norm": 0.8951910734176636, + "learning_rate": 0.0008766136703356435, + "loss": 3.5713, + "step": 14535 + }, + { + "epoch": 0.9879059654844409, + "grad_norm": 1.3803086280822754, + "learning_rate": 0.0008765712053268107, + "loss": 3.7074, + "step": 14540 + }, + { + "epoch": 0.9882456855551026, + "grad_norm": 0.8013647198677063, + "learning_rate": 0.000876528740317978, + "loss": 3.4907, + "step": 14545 + }, + { + "epoch": 0.9885854056257644, + "grad_norm": 3.040494441986084, + "learning_rate": 0.0008764862753091453, + "loss": 3.5548, + "step": 14550 + }, + { + "epoch": 0.9889251256964261, + "grad_norm": 1.1658381223678589, + "learning_rate": 0.0008764438103003126, + "loss": 3.5798, + "step": 14555 + }, + { + "epoch": 0.989264845767088, + "grad_norm": 0.7970454692840576, + "learning_rate": 0.0008764013452914798, + "loss": 3.5966, + "step": 14560 + }, + { + "epoch": 0.9896045658377497, + "grad_norm": 0.7079986333847046, + "learning_rate": 0.0008763588802826471, + "loss": 3.74, + "step": 14565 + }, + { + "epoch": 0.9899442859084114, + "grad_norm": 0.6311217546463013, + "learning_rate": 0.0008763164152738145, + "loss": 3.4637, + "step": 14570 + }, + { + "epoch": 0.9902840059790733, + "grad_norm": 0.9189255833625793, + "learning_rate": 0.0008762739502649817, + "loss": 3.745, + "step": 14575 + }, + { + "epoch": 0.990623726049735, + "grad_norm": 0.9411978721618652, + "learning_rate": 0.000876231485256149, + "loss": 3.6037, + "step": 14580 + }, + { + "epoch": 0.9909634461203968, + "grad_norm": 0.7895781397819519, + "learning_rate": 0.0008761890202473163, + "loss": 3.6175, + "step": 14585 + }, + { + "epoch": 0.9913031661910585, + "grad_norm": 0.6289184093475342, + "learning_rate": 0.0008761465552384835, + "loss": 3.7815, + "step": 14590 + }, + { + "epoch": 0.9916428862617204, + "grad_norm": 1.0289568901062012, + "learning_rate": 0.0008761040902296507, + "loss": 3.6015, + "step": 14595 + }, + { + "epoch": 0.9919826063323821, + "grad_norm": 1.1223806142807007, + "learning_rate": 0.0008760616252208181, + "loss": 3.7296, + "step": 14600 + }, + { + "epoch": 0.9923223264030439, + "grad_norm": 1.0238943099975586, + "learning_rate": 0.0008760191602119854, + "loss": 3.8917, + "step": 14605 + }, + { + "epoch": 0.9926620464737057, + "grad_norm": 1.0552610158920288, + "learning_rate": 0.0008759766952031526, + "loss": 3.5648, + "step": 14610 + }, + { + "epoch": 0.9930017665443674, + "grad_norm": 0.7493376135826111, + "learning_rate": 0.00087593423019432, + "loss": 3.7246, + "step": 14615 + }, + { + "epoch": 0.9933414866150292, + "grad_norm": 0.7633503675460815, + "learning_rate": 0.0008758917651854872, + "loss": 3.552, + "step": 14620 + }, + { + "epoch": 0.993681206685691, + "grad_norm": 0.7958838939666748, + "learning_rate": 0.0008758493001766544, + "loss": 3.6726, + "step": 14625 + }, + { + "epoch": 0.9940209267563528, + "grad_norm": 0.7584198713302612, + "learning_rate": 0.0008758068351678218, + "loss": 3.8537, + "step": 14630 + }, + { + "epoch": 0.9943606468270145, + "grad_norm": 1.0128004550933838, + "learning_rate": 0.000875764370158989, + "loss": 3.5717, + "step": 14635 + }, + { + "epoch": 0.9947003668976763, + "grad_norm": 0.776348352432251, + "learning_rate": 0.0008757219051501563, + "loss": 3.8049, + "step": 14640 + }, + { + "epoch": 0.9950400869683381, + "grad_norm": 0.8426063656806946, + "learning_rate": 0.0008756794401413236, + "loss": 3.6119, + "step": 14645 + }, + { + "epoch": 0.9953798070389999, + "grad_norm": 1.3550785779953003, + "learning_rate": 0.0008756369751324909, + "loss": 3.5537, + "step": 14650 + }, + { + "epoch": 0.9957195271096616, + "grad_norm": 0.9330025911331177, + "learning_rate": 0.0008755945101236581, + "loss": 3.6364, + "step": 14655 + }, + { + "epoch": 0.9960592471803235, + "grad_norm": 0.7543591260910034, + "learning_rate": 0.0008755520451148254, + "loss": 3.5998, + "step": 14660 + }, + { + "epoch": 0.9963989672509852, + "grad_norm": 0.7924280762672424, + "learning_rate": 0.0008755095801059927, + "loss": 3.8385, + "step": 14665 + }, + { + "epoch": 0.9967386873216469, + "grad_norm": 0.8032102584838867, + "learning_rate": 0.0008754671150971599, + "loss": 3.7659, + "step": 14670 + }, + { + "epoch": 0.9970784073923087, + "grad_norm": 0.9652730822563171, + "learning_rate": 0.0008754246500883273, + "loss": 3.7116, + "step": 14675 + }, + { + "epoch": 0.9974181274629705, + "grad_norm": 1.3480511903762817, + "learning_rate": 0.0008753821850794946, + "loss": 3.7166, + "step": 14680 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 0.8818045854568481, + "learning_rate": 0.0008753397200706618, + "loss": 3.5993, + "step": 14685 + }, + { + "epoch": 0.998097567604294, + "grad_norm": 1.7601637840270996, + "learning_rate": 0.0008752972550618291, + "loss": 3.7252, + "step": 14690 + }, + { + "epoch": 0.9984372876749559, + "grad_norm": 0.9575991034507751, + "learning_rate": 0.0008752547900529963, + "loss": 3.9123, + "step": 14695 + }, + { + "epoch": 0.9987770077456176, + "grad_norm": 0.7978793978691101, + "learning_rate": 0.0008752123250441636, + "loss": 3.7741, + "step": 14700 + }, + { + "epoch": 0.9991167278162794, + "grad_norm": 0.8851346969604492, + "learning_rate": 0.0008751698600353309, + "loss": 3.6148, + "step": 14705 + }, + { + "epoch": 0.9994564478869412, + "grad_norm": 1.030360460281372, + "learning_rate": 0.0008751273950264982, + "loss": 3.6878, + "step": 14710 + }, + { + "epoch": 0.999796167957603, + "grad_norm": 0.8679876923561096, + "learning_rate": 0.0008750849300176655, + "loss": 3.8711, + "step": 14715 + }, + { + "epoch": 1.0, + "eval_bertscore": { + "f1": 0.8389091363640785, + "precision": 0.8363328560781013, + "recall": 0.8424776715587213 + }, + "eval_bleu_4": 0.02184768807264545, + "eval_exact_match": 9.690861517588914e-05, + "eval_loss": 3.498175859451294, + "eval_meteor": 0.08543425974180795, + "eval_rouge": { + "rouge1": 0.11716639522701967, + "rouge2": 0.01823257053335075, + "rougeL": 0.10197353098270825, + "rougeLsum": 0.10195712108044035 + }, + "eval_runtime": 1909.2989, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 0.676, + "step": 14718 + }, + { + "epoch": 1.0001358880282647, + "grad_norm": 0.8814338445663452, + "learning_rate": 0.0008750424650088328, + "loss": 3.6346, + "step": 14720 + }, + { + "epoch": 1.0004756080989265, + "grad_norm": 1.0643641948699951, + "learning_rate": 0.000875, + "loss": 3.5877, + "step": 14725 + }, + { + "epoch": 1.0008153281695882, + "grad_norm": 0.9938721656799316, + "learning_rate": 0.0008749575349911672, + "loss": 3.5232, + "step": 14730 + }, + { + "epoch": 1.00115504824025, + "grad_norm": 0.7084335684776306, + "learning_rate": 0.0008749150699823346, + "loss": 3.6679, + "step": 14735 + }, + { + "epoch": 1.0014947683109119, + "grad_norm": 0.9470041394233704, + "learning_rate": 0.0008748726049735018, + "loss": 3.6218, + "step": 14740 + }, + { + "epoch": 1.0018344883815735, + "grad_norm": 1.0030803680419922, + "learning_rate": 0.0008748301399646691, + "loss": 3.6294, + "step": 14745 + }, + { + "epoch": 1.0021742084522354, + "grad_norm": 0.8356907367706299, + "learning_rate": 0.0008747876749558365, + "loss": 3.3715, + "step": 14750 + }, + { + "epoch": 1.0025139285228972, + "grad_norm": 0.8871817588806152, + "learning_rate": 0.0008747452099470037, + "loss": 3.4734, + "step": 14755 + }, + { + "epoch": 1.0028536485935589, + "grad_norm": 1.310543417930603, + "learning_rate": 0.0008747027449381709, + "loss": 3.744, + "step": 14760 + }, + { + "epoch": 1.0031933686642207, + "grad_norm": 0.6988413333892822, + "learning_rate": 0.0008746602799293383, + "loss": 3.5933, + "step": 14765 + }, + { + "epoch": 1.0035330887348826, + "grad_norm": 1.1773345470428467, + "learning_rate": 0.0008746178149205055, + "loss": 3.4553, + "step": 14770 + }, + { + "epoch": 1.0038728088055442, + "grad_norm": 0.8376957774162292, + "learning_rate": 0.0008745753499116727, + "loss": 3.7318, + "step": 14775 + }, + { + "epoch": 1.004212528876206, + "grad_norm": 0.9681105613708496, + "learning_rate": 0.0008745328849028402, + "loss": 3.6417, + "step": 14780 + }, + { + "epoch": 1.0045522489468677, + "grad_norm": 1.0173298120498657, + "learning_rate": 0.0008744904198940074, + "loss": 3.6735, + "step": 14785 + }, + { + "epoch": 1.0048919690175295, + "grad_norm": 0.979135274887085, + "learning_rate": 0.0008744479548851746, + "loss": 3.7269, + "step": 14790 + }, + { + "epoch": 1.0052316890881914, + "grad_norm": 0.8437849879264832, + "learning_rate": 0.0008744054898763419, + "loss": 3.4077, + "step": 14795 + }, + { + "epoch": 1.005571409158853, + "grad_norm": 0.7566113471984863, + "learning_rate": 0.0008743630248675092, + "loss": 3.5871, + "step": 14800 + }, + { + "epoch": 1.0059111292295149, + "grad_norm": 0.7590341567993164, + "learning_rate": 0.0008743205598586764, + "loss": 3.6094, + "step": 14805 + }, + { + "epoch": 1.0062508493001767, + "grad_norm": 0.8771799206733704, + "learning_rate": 0.0008742780948498437, + "loss": 3.5055, + "step": 14810 + }, + { + "epoch": 1.0065905693708384, + "grad_norm": 0.8367830514907837, + "learning_rate": 0.0008742356298410111, + "loss": 3.4028, + "step": 14815 + }, + { + "epoch": 1.0069302894415002, + "grad_norm": 1.0329598188400269, + "learning_rate": 0.0008741931648321783, + "loss": 3.89, + "step": 14820 + }, + { + "epoch": 1.007270009512162, + "grad_norm": 0.989403247833252, + "learning_rate": 0.0008741506998233456, + "loss": 3.6164, + "step": 14825 + }, + { + "epoch": 1.0076097295828237, + "grad_norm": 0.9553883671760559, + "learning_rate": 0.0008741082348145128, + "loss": 3.662, + "step": 14830 + }, + { + "epoch": 1.0079494496534855, + "grad_norm": 0.9461486339569092, + "learning_rate": 0.0008740657698056801, + "loss": 3.667, + "step": 14835 + }, + { + "epoch": 1.0082891697241474, + "grad_norm": 0.737625002861023, + "learning_rate": 0.0008740233047968474, + "loss": 3.5138, + "step": 14840 + }, + { + "epoch": 1.008628889794809, + "grad_norm": 1.1586662530899048, + "learning_rate": 0.0008739808397880146, + "loss": 3.9285, + "step": 14845 + }, + { + "epoch": 1.0089686098654709, + "grad_norm": 1.00840425491333, + "learning_rate": 0.000873938374779182, + "loss": 3.7272, + "step": 14850 + }, + { + "epoch": 1.0093083299361327, + "grad_norm": 0.8020398020744324, + "learning_rate": 0.0008738959097703493, + "loss": 3.8652, + "step": 14855 + }, + { + "epoch": 1.0096480500067944, + "grad_norm": 0.8986596465110779, + "learning_rate": 0.0008738534447615165, + "loss": 3.5993, + "step": 14860 + }, + { + "epoch": 1.0099877700774562, + "grad_norm": 0.8338233828544617, + "learning_rate": 0.0008738109797526838, + "loss": 3.5684, + "step": 14865 + }, + { + "epoch": 1.0103274901481178, + "grad_norm": 0.9635263085365295, + "learning_rate": 0.0008737685147438511, + "loss": 3.5463, + "step": 14870 + }, + { + "epoch": 1.0106672102187797, + "grad_norm": 1.0407215356826782, + "learning_rate": 0.0008737260497350183, + "loss": 3.6459, + "step": 14875 + }, + { + "epoch": 1.0110069302894416, + "grad_norm": 1.1037853956222534, + "learning_rate": 0.0008736835847261855, + "loss": 3.5618, + "step": 14880 + }, + { + "epoch": 1.0113466503601032, + "grad_norm": 0.8600634336471558, + "learning_rate": 0.000873641119717353, + "loss": 3.7625, + "step": 14885 + }, + { + "epoch": 1.011686370430765, + "grad_norm": 1.0382503271102905, + "learning_rate": 0.0008735986547085202, + "loss": 3.8289, + "step": 14890 + }, + { + "epoch": 1.012026090501427, + "grad_norm": 0.6843704581260681, + "learning_rate": 0.0008735561896996874, + "loss": 3.5445, + "step": 14895 + }, + { + "epoch": 1.0123658105720885, + "grad_norm": 0.835701048374176, + "learning_rate": 0.0008735137246908548, + "loss": 3.3424, + "step": 14900 + }, + { + "epoch": 1.0127055306427504, + "grad_norm": 0.7699155807495117, + "learning_rate": 0.000873471259682022, + "loss": 3.9104, + "step": 14905 + }, + { + "epoch": 1.0130452507134122, + "grad_norm": 0.6510736346244812, + "learning_rate": 0.0008734287946731893, + "loss": 3.9143, + "step": 14910 + }, + { + "epoch": 1.0133849707840739, + "grad_norm": 0.9805281758308411, + "learning_rate": 0.0008733863296643566, + "loss": 3.5451, + "step": 14915 + }, + { + "epoch": 1.0137246908547357, + "grad_norm": 0.8176320195198059, + "learning_rate": 0.0008733438646555239, + "loss": 3.3819, + "step": 14920 + }, + { + "epoch": 1.0140644109253976, + "grad_norm": 0.7483733892440796, + "learning_rate": 0.0008733013996466912, + "loss": 3.6001, + "step": 14925 + }, + { + "epoch": 1.0144041309960592, + "grad_norm": 0.9397359490394592, + "learning_rate": 0.0008732589346378584, + "loss": 3.4551, + "step": 14930 + }, + { + "epoch": 1.014743851066721, + "grad_norm": 0.8956226110458374, + "learning_rate": 0.0008732164696290257, + "loss": 3.4347, + "step": 14935 + }, + { + "epoch": 1.015083571137383, + "grad_norm": 0.7432937622070312, + "learning_rate": 0.000873174004620193, + "loss": 3.3265, + "step": 14940 + }, + { + "epoch": 1.0154232912080445, + "grad_norm": 0.7844651937484741, + "learning_rate": 0.0008731315396113602, + "loss": 3.8496, + "step": 14945 + }, + { + "epoch": 1.0157630112787064, + "grad_norm": 0.7755668759346008, + "learning_rate": 0.0008730890746025275, + "loss": 3.7617, + "step": 14950 + }, + { + "epoch": 1.016102731349368, + "grad_norm": 0.8213574886322021, + "learning_rate": 0.0008730466095936949, + "loss": 3.6124, + "step": 14955 + }, + { + "epoch": 1.0164424514200299, + "grad_norm": 0.9597127437591553, + "learning_rate": 0.0008730041445848621, + "loss": 3.6307, + "step": 14960 + }, + { + "epoch": 1.0167821714906917, + "grad_norm": 0.928554892539978, + "learning_rate": 0.0008729616795760294, + "loss": 3.5489, + "step": 14965 + }, + { + "epoch": 1.0171218915613534, + "grad_norm": 0.8676695227622986, + "learning_rate": 0.0008729192145671967, + "loss": 3.4171, + "step": 14970 + }, + { + "epoch": 1.0174616116320152, + "grad_norm": 0.829727292060852, + "learning_rate": 0.0008728767495583639, + "loss": 3.6094, + "step": 14975 + }, + { + "epoch": 1.017801331702677, + "grad_norm": 0.801650881767273, + "learning_rate": 0.0008728342845495311, + "loss": 3.5852, + "step": 14980 + }, + { + "epoch": 1.0181410517733387, + "grad_norm": 0.8619431257247925, + "learning_rate": 0.0008727918195406986, + "loss": 3.8354, + "step": 14985 + }, + { + "epoch": 1.0184807718440005, + "grad_norm": 0.8465469479560852, + "learning_rate": 0.0008727493545318658, + "loss": 3.6028, + "step": 14990 + }, + { + "epoch": 1.0188204919146624, + "grad_norm": 1.0671541690826416, + "learning_rate": 0.000872706889523033, + "loss": 3.6368, + "step": 14995 + }, + { + "epoch": 1.019160211985324, + "grad_norm": 0.6990070343017578, + "learning_rate": 0.0008726644245142004, + "loss": 3.6074, + "step": 15000 + }, + { + "epoch": 1.0194999320559859, + "grad_norm": 0.7204691767692566, + "learning_rate": 0.0008726219595053676, + "loss": 3.4047, + "step": 15005 + }, + { + "epoch": 1.0198396521266477, + "grad_norm": 0.7201042175292969, + "learning_rate": 0.0008725794944965348, + "loss": 3.6594, + "step": 15010 + }, + { + "epoch": 1.0201793721973094, + "grad_norm": 0.6569257974624634, + "learning_rate": 0.0008725370294877022, + "loss": 3.6359, + "step": 15015 + }, + { + "epoch": 1.0205190922679712, + "grad_norm": 0.9001051783561707, + "learning_rate": 0.0008724945644788695, + "loss": 3.3237, + "step": 15020 + }, + { + "epoch": 1.020858812338633, + "grad_norm": 0.904829204082489, + "learning_rate": 0.0008724520994700367, + "loss": 3.6928, + "step": 15025 + }, + { + "epoch": 1.0211985324092947, + "grad_norm": 0.8714768886566162, + "learning_rate": 0.000872409634461204, + "loss": 3.3289, + "step": 15030 + }, + { + "epoch": 1.0215382524799566, + "grad_norm": 0.7015002369880676, + "learning_rate": 0.0008723671694523713, + "loss": 3.8618, + "step": 15035 + }, + { + "epoch": 1.0218779725506182, + "grad_norm": 0.8345310091972351, + "learning_rate": 0.0008723247044435385, + "loss": 3.5438, + "step": 15040 + }, + { + "epoch": 1.02221769262128, + "grad_norm": 0.8914956450462341, + "learning_rate": 0.0008722822394347058, + "loss": 3.6641, + "step": 15045 + }, + { + "epoch": 1.022557412691942, + "grad_norm": 1.0704143047332764, + "learning_rate": 0.0008722397744258731, + "loss": 3.6192, + "step": 15050 + }, + { + "epoch": 1.0228971327626035, + "grad_norm": 0.8193677067756653, + "learning_rate": 0.0008721973094170404, + "loss": 3.5526, + "step": 15055 + }, + { + "epoch": 1.0232368528332654, + "grad_norm": 0.9400145411491394, + "learning_rate": 0.0008721548444082077, + "loss": 3.583, + "step": 15060 + }, + { + "epoch": 1.0235765729039272, + "grad_norm": 1.011356234550476, + "learning_rate": 0.000872112379399375, + "loss": 3.5837, + "step": 15065 + }, + { + "epoch": 1.0239162929745889, + "grad_norm": 0.8681876063346863, + "learning_rate": 0.0008720699143905422, + "loss": 3.5572, + "step": 15070 + }, + { + "epoch": 1.0242560130452507, + "grad_norm": 1.0365689992904663, + "learning_rate": 0.0008720274493817095, + "loss": 3.6031, + "step": 15075 + }, + { + "epoch": 1.0245957331159126, + "grad_norm": 0.9950872659683228, + "learning_rate": 0.0008719849843728767, + "loss": 3.8435, + "step": 15080 + }, + { + "epoch": 1.0249354531865742, + "grad_norm": 0.9750520586967468, + "learning_rate": 0.000871942519364044, + "loss": 3.49, + "step": 15085 + }, + { + "epoch": 1.025275173257236, + "grad_norm": 0.8155407905578613, + "learning_rate": 0.0008719000543552114, + "loss": 3.6348, + "step": 15090 + }, + { + "epoch": 1.025614893327898, + "grad_norm": 0.756203293800354, + "learning_rate": 0.0008718575893463786, + "loss": 3.8899, + "step": 15095 + }, + { + "epoch": 1.0259546133985595, + "grad_norm": 0.7521533370018005, + "learning_rate": 0.0008718151243375459, + "loss": 3.5993, + "step": 15100 + }, + { + "epoch": 1.0262943334692214, + "grad_norm": 0.8122774958610535, + "learning_rate": 0.0008717726593287132, + "loss": 3.554, + "step": 15105 + }, + { + "epoch": 1.0266340535398832, + "grad_norm": 0.8847051858901978, + "learning_rate": 0.0008717301943198804, + "loss": 3.645, + "step": 15110 + }, + { + "epoch": 1.0269737736105449, + "grad_norm": 1.3711600303649902, + "learning_rate": 0.0008716877293110476, + "loss": 3.5638, + "step": 15115 + }, + { + "epoch": 1.0273134936812067, + "grad_norm": 0.7446442246437073, + "learning_rate": 0.000871645264302215, + "loss": 3.6628, + "step": 15120 + }, + { + "epoch": 1.0276532137518684, + "grad_norm": 0.8747648000717163, + "learning_rate": 0.0008716027992933823, + "loss": 3.5028, + "step": 15125 + }, + { + "epoch": 1.0279929338225302, + "grad_norm": 1.5600366592407227, + "learning_rate": 0.0008715603342845495, + "loss": 3.7662, + "step": 15130 + }, + { + "epoch": 1.028332653893192, + "grad_norm": 0.9516339898109436, + "learning_rate": 0.0008715178692757169, + "loss": 3.6854, + "step": 15135 + }, + { + "epoch": 1.0286723739638537, + "grad_norm": 0.6442124247550964, + "learning_rate": 0.0008714754042668841, + "loss": 3.695, + "step": 15140 + }, + { + "epoch": 1.0290120940345155, + "grad_norm": 0.8294907808303833, + "learning_rate": 0.0008714329392580513, + "loss": 3.5534, + "step": 15145 + }, + { + "epoch": 1.0293518141051774, + "grad_norm": 0.8132860660552979, + "learning_rate": 0.0008713904742492187, + "loss": 3.7457, + "step": 15150 + }, + { + "epoch": 1.029691534175839, + "grad_norm": 0.8109138607978821, + "learning_rate": 0.0008713480092403859, + "loss": 3.6412, + "step": 15155 + }, + { + "epoch": 1.0300312542465009, + "grad_norm": 0.7479994893074036, + "learning_rate": 0.0008713055442315532, + "loss": 3.764, + "step": 15160 + }, + { + "epoch": 1.0303709743171627, + "grad_norm": 0.7487816214561462, + "learning_rate": 0.0008712630792227206, + "loss": 3.5411, + "step": 15165 + }, + { + "epoch": 1.0307106943878244, + "grad_norm": 0.6944779753684998, + "learning_rate": 0.0008712206142138878, + "loss": 3.7505, + "step": 15170 + }, + { + "epoch": 1.0310504144584862, + "grad_norm": 0.8908357620239258, + "learning_rate": 0.000871178149205055, + "loss": 3.4252, + "step": 15175 + }, + { + "epoch": 1.031390134529148, + "grad_norm": 0.9046226143836975, + "learning_rate": 0.0008711356841962223, + "loss": 3.6963, + "step": 15180 + }, + { + "epoch": 1.0317298545998097, + "grad_norm": 0.6427444219589233, + "learning_rate": 0.0008710932191873896, + "loss": 3.658, + "step": 15185 + }, + { + "epoch": 1.0320695746704716, + "grad_norm": 0.9451667666435242, + "learning_rate": 0.0008710507541785568, + "loss": 3.7814, + "step": 15190 + }, + { + "epoch": 1.0324092947411334, + "grad_norm": 0.7863710522651672, + "learning_rate": 0.0008710082891697242, + "loss": 3.6874, + "step": 15195 + }, + { + "epoch": 1.032749014811795, + "grad_norm": 0.6998689770698547, + "learning_rate": 0.0008709658241608915, + "loss": 3.4988, + "step": 15200 + }, + { + "epoch": 1.033088734882457, + "grad_norm": 1.0530166625976562, + "learning_rate": 0.0008709233591520587, + "loss": 3.7067, + "step": 15205 + }, + { + "epoch": 1.0334284549531185, + "grad_norm": 0.9810565114021301, + "learning_rate": 0.000870880894143226, + "loss": 3.6403, + "step": 15210 + }, + { + "epoch": 1.0337681750237804, + "grad_norm": 0.6775922179222107, + "learning_rate": 0.0008708384291343932, + "loss": 3.6736, + "step": 15215 + }, + { + "epoch": 1.0341078950944422, + "grad_norm": 0.7473011016845703, + "learning_rate": 0.0008707959641255605, + "loss": 3.7868, + "step": 15220 + }, + { + "epoch": 1.0344476151651039, + "grad_norm": 0.8771403431892395, + "learning_rate": 0.0008707534991167278, + "loss": 3.3005, + "step": 15225 + }, + { + "epoch": 1.0347873352357657, + "grad_norm": 0.8676325082778931, + "learning_rate": 0.0008707110341078951, + "loss": 3.5982, + "step": 15230 + }, + { + "epoch": 1.0351270553064276, + "grad_norm": 0.7086388468742371, + "learning_rate": 0.0008706685690990624, + "loss": 3.8949, + "step": 15235 + }, + { + "epoch": 1.0354667753770892, + "grad_norm": 1.042790412902832, + "learning_rate": 0.0008706261040902297, + "loss": 3.5786, + "step": 15240 + }, + { + "epoch": 1.035806495447751, + "grad_norm": 1.1254736185073853, + "learning_rate": 0.0008705836390813969, + "loss": 3.8686, + "step": 15245 + }, + { + "epoch": 1.036146215518413, + "grad_norm": 0.6961475610733032, + "learning_rate": 0.0008705411740725643, + "loss": 3.7065, + "step": 15250 + }, + { + "epoch": 1.0364859355890745, + "grad_norm": 0.7207124829292297, + "learning_rate": 0.0008704987090637315, + "loss": 3.6319, + "step": 15255 + }, + { + "epoch": 1.0368256556597364, + "grad_norm": 0.8862795233726501, + "learning_rate": 0.0008704562440548987, + "loss": 3.7325, + "step": 15260 + }, + { + "epoch": 1.0371653757303982, + "grad_norm": 0.6556076407432556, + "learning_rate": 0.0008704137790460662, + "loss": 3.6097, + "step": 15265 + }, + { + "epoch": 1.0375050958010599, + "grad_norm": 0.6966277360916138, + "learning_rate": 0.0008703713140372334, + "loss": 3.6055, + "step": 15270 + }, + { + "epoch": 1.0378448158717217, + "grad_norm": 0.820662796497345, + "learning_rate": 0.0008703288490284006, + "loss": 3.3342, + "step": 15275 + }, + { + "epoch": 1.0381845359423836, + "grad_norm": 8.954673767089844, + "learning_rate": 0.0008702863840195679, + "loss": 3.7034, + "step": 15280 + }, + { + "epoch": 1.0385242560130452, + "grad_norm": 0.8142824769020081, + "learning_rate": 0.0008702439190107352, + "loss": 3.6912, + "step": 15285 + }, + { + "epoch": 1.038863976083707, + "grad_norm": 0.6903845071792603, + "learning_rate": 0.0008702014540019024, + "loss": 3.8957, + "step": 15290 + }, + { + "epoch": 1.0392036961543687, + "grad_norm": 0.9089766144752502, + "learning_rate": 0.0008701589889930697, + "loss": 3.6226, + "step": 15295 + }, + { + "epoch": 1.0395434162250305, + "grad_norm": 0.7763200402259827, + "learning_rate": 0.0008701165239842371, + "loss": 3.6205, + "step": 15300 + }, + { + "epoch": 1.0398831362956924, + "grad_norm": 0.8722520470619202, + "learning_rate": 0.0008700740589754043, + "loss": 3.6822, + "step": 15305 + }, + { + "epoch": 1.040222856366354, + "grad_norm": 1.062848448753357, + "learning_rate": 0.0008700315939665716, + "loss": 3.2898, + "step": 15310 + }, + { + "epoch": 1.0405625764370159, + "grad_norm": 0.9082038998603821, + "learning_rate": 0.0008699891289577389, + "loss": 3.7562, + "step": 15315 + }, + { + "epoch": 1.0409022965076777, + "grad_norm": 0.9263285994529724, + "learning_rate": 0.0008699466639489061, + "loss": 3.5439, + "step": 15320 + }, + { + "epoch": 1.0412420165783394, + "grad_norm": 0.9191448092460632, + "learning_rate": 0.0008699041989400734, + "loss": 3.5214, + "step": 15325 + }, + { + "epoch": 1.0415817366490012, + "grad_norm": 0.8411559462547302, + "learning_rate": 0.0008698617339312406, + "loss": 3.6458, + "step": 15330 + }, + { + "epoch": 1.041921456719663, + "grad_norm": 0.8023721575737, + "learning_rate": 0.000869819268922408, + "loss": 3.7477, + "step": 15335 + }, + { + "epoch": 1.0422611767903247, + "grad_norm": 0.7680902481079102, + "learning_rate": 0.0008697768039135753, + "loss": 3.9438, + "step": 15340 + }, + { + "epoch": 1.0426008968609866, + "grad_norm": 0.7697497010231018, + "learning_rate": 0.0008697343389047425, + "loss": 3.5727, + "step": 15345 + }, + { + "epoch": 1.0429406169316484, + "grad_norm": 0.741430938243866, + "learning_rate": 0.0008696918738959098, + "loss": 3.5024, + "step": 15350 + }, + { + "epoch": 1.04328033700231, + "grad_norm": 0.792542576789856, + "learning_rate": 0.0008696494088870771, + "loss": 3.3204, + "step": 15355 + }, + { + "epoch": 1.043620057072972, + "grad_norm": 0.66200190782547, + "learning_rate": 0.0008696069438782443, + "loss": 3.6594, + "step": 15360 + }, + { + "epoch": 1.0439597771436337, + "grad_norm": 0.8391197323799133, + "learning_rate": 0.0008695644788694115, + "loss": 3.3441, + "step": 15365 + }, + { + "epoch": 1.0442994972142954, + "grad_norm": 0.8183795213699341, + "learning_rate": 0.000869522013860579, + "loss": 3.7259, + "step": 15370 + }, + { + "epoch": 1.0446392172849572, + "grad_norm": 0.7900511622428894, + "learning_rate": 0.0008694795488517462, + "loss": 3.6028, + "step": 15375 + }, + { + "epoch": 1.0449789373556189, + "grad_norm": 0.7012553811073303, + "learning_rate": 0.0008694370838429134, + "loss": 3.4612, + "step": 15380 + }, + { + "epoch": 1.0453186574262807, + "grad_norm": 1.0632553100585938, + "learning_rate": 0.0008693946188340808, + "loss": 3.4512, + "step": 15385 + }, + { + "epoch": 1.0456583774969426, + "grad_norm": 0.7872094511985779, + "learning_rate": 0.000869352153825248, + "loss": 3.4549, + "step": 15390 + }, + { + "epoch": 1.0459980975676042, + "grad_norm": 0.719857931137085, + "learning_rate": 0.0008693096888164152, + "loss": 3.4266, + "step": 15395 + }, + { + "epoch": 1.046337817638266, + "grad_norm": 16.433879852294922, + "learning_rate": 0.0008692672238075826, + "loss": 3.4509, + "step": 15400 + }, + { + "epoch": 1.046677537708928, + "grad_norm": 1.077065348625183, + "learning_rate": 0.0008692247587987499, + "loss": 3.5047, + "step": 15405 + }, + { + "epoch": 1.0470172577795895, + "grad_norm": 1.4596573114395142, + "learning_rate": 0.0008691822937899171, + "loss": 3.5354, + "step": 15410 + }, + { + "epoch": 1.0473569778502514, + "grad_norm": 0.8032779097557068, + "learning_rate": 0.0008691398287810845, + "loss": 3.8902, + "step": 15415 + }, + { + "epoch": 1.0476966979209132, + "grad_norm": 0.8960117101669312, + "learning_rate": 0.0008690973637722517, + "loss": 3.4476, + "step": 15420 + }, + { + "epoch": 1.0480364179915749, + "grad_norm": 0.7177199721336365, + "learning_rate": 0.0008690548987634189, + "loss": 3.568, + "step": 15425 + }, + { + "epoch": 1.0483761380622367, + "grad_norm": 0.8919602632522583, + "learning_rate": 0.0008690124337545862, + "loss": 3.3943, + "step": 15430 + }, + { + "epoch": 1.0487158581328986, + "grad_norm": 0.809073805809021, + "learning_rate": 0.0008689699687457535, + "loss": 3.5871, + "step": 15435 + }, + { + "epoch": 1.0490555782035602, + "grad_norm": 0.6843132972717285, + "learning_rate": 0.0008689275037369208, + "loss": 3.538, + "step": 15440 + }, + { + "epoch": 1.049395298274222, + "grad_norm": 0.8548818230628967, + "learning_rate": 0.0008688850387280881, + "loss": 3.4308, + "step": 15445 + }, + { + "epoch": 1.049735018344884, + "grad_norm": 0.8107924461364746, + "learning_rate": 0.0008688425737192554, + "loss": 3.7346, + "step": 15450 + }, + { + "epoch": 1.0500747384155455, + "grad_norm": 0.8784366846084595, + "learning_rate": 0.0008688001087104226, + "loss": 3.687, + "step": 15455 + }, + { + "epoch": 1.0504144584862074, + "grad_norm": 1.162998914718628, + "learning_rate": 0.0008687576437015899, + "loss": 3.4654, + "step": 15460 + }, + { + "epoch": 1.050754178556869, + "grad_norm": 0.9834625124931335, + "learning_rate": 0.0008687151786927571, + "loss": 3.8437, + "step": 15465 + }, + { + "epoch": 1.0510938986275309, + "grad_norm": 2.212268114089966, + "learning_rate": 0.0008686727136839244, + "loss": 3.769, + "step": 15470 + }, + { + "epoch": 1.0514336186981927, + "grad_norm": 0.9205801486968994, + "learning_rate": 0.0008686302486750918, + "loss": 3.4809, + "step": 15475 + }, + { + "epoch": 1.0517733387688544, + "grad_norm": 0.8589804172515869, + "learning_rate": 0.000868587783666259, + "loss": 3.7163, + "step": 15480 + }, + { + "epoch": 1.0521130588395162, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0008685453186574263, + "loss": 3.7983, + "step": 15485 + }, + { + "epoch": 1.052452778910178, + "grad_norm": 0.6659541130065918, + "learning_rate": 0.0008685028536485936, + "loss": 3.7491, + "step": 15490 + }, + { + "epoch": 1.0527924989808397, + "grad_norm": 0.8629612326622009, + "learning_rate": 0.0008684603886397608, + "loss": 3.5968, + "step": 15495 + }, + { + "epoch": 1.0531322190515016, + "grad_norm": 0.7920187711715698, + "learning_rate": 0.000868417923630928, + "loss": 3.3988, + "step": 15500 + }, + { + "epoch": 1.0534719391221634, + "grad_norm": 0.8191885352134705, + "learning_rate": 0.0008683754586220954, + "loss": 3.6912, + "step": 15505 + }, + { + "epoch": 1.053811659192825, + "grad_norm": 0.7358852028846741, + "learning_rate": 0.0008683329936132627, + "loss": 3.9016, + "step": 15510 + }, + { + "epoch": 1.054151379263487, + "grad_norm": 0.9025446176528931, + "learning_rate": 0.0008682905286044299, + "loss": 3.4899, + "step": 15515 + }, + { + "epoch": 1.0544910993341488, + "grad_norm": 0.8615103960037231, + "learning_rate": 0.0008682480635955973, + "loss": 3.7464, + "step": 15520 + }, + { + "epoch": 1.0548308194048104, + "grad_norm": 0.980667233467102, + "learning_rate": 0.0008682055985867645, + "loss": 3.6619, + "step": 15525 + }, + { + "epoch": 1.0551705394754722, + "grad_norm": 0.6504858732223511, + "learning_rate": 0.0008681631335779317, + "loss": 3.6055, + "step": 15530 + }, + { + "epoch": 1.055510259546134, + "grad_norm": 0.9805901646614075, + "learning_rate": 0.0008681206685690991, + "loss": 3.787, + "step": 15535 + }, + { + "epoch": 1.0558499796167957, + "grad_norm": 0.877485454082489, + "learning_rate": 0.0008680782035602663, + "loss": 3.7024, + "step": 15540 + }, + { + "epoch": 1.0561896996874576, + "grad_norm": 0.6359384059906006, + "learning_rate": 0.0008680357385514336, + "loss": 3.6765, + "step": 15545 + }, + { + "epoch": 1.0565294197581192, + "grad_norm": 0.7626210451126099, + "learning_rate": 0.000867993273542601, + "loss": 3.6948, + "step": 15550 + }, + { + "epoch": 1.056869139828781, + "grad_norm": 0.739578902721405, + "learning_rate": 0.0008679508085337682, + "loss": 3.4635, + "step": 15555 + }, + { + "epoch": 1.057208859899443, + "grad_norm": 0.784304141998291, + "learning_rate": 0.0008679083435249354, + "loss": 3.3567, + "step": 15560 + }, + { + "epoch": 1.0575485799701045, + "grad_norm": 1.0111767053604126, + "learning_rate": 0.0008678658785161027, + "loss": 3.7883, + "step": 15565 + }, + { + "epoch": 1.0578883000407664, + "grad_norm": 1.2617043256759644, + "learning_rate": 0.00086782341350727, + "loss": 3.6458, + "step": 15570 + }, + { + "epoch": 1.0582280201114282, + "grad_norm": 1.0406670570373535, + "learning_rate": 0.0008677809484984372, + "loss": 3.4338, + "step": 15575 + }, + { + "epoch": 1.0585677401820899, + "grad_norm": 0.7946984171867371, + "learning_rate": 0.0008677384834896046, + "loss": 3.6424, + "step": 15580 + }, + { + "epoch": 1.0589074602527517, + "grad_norm": 0.7688732147216797, + "learning_rate": 0.0008676960184807719, + "loss": 3.7033, + "step": 15585 + }, + { + "epoch": 1.0592471803234136, + "grad_norm": 0.7276850342750549, + "learning_rate": 0.0008676535534719392, + "loss": 3.7805, + "step": 15590 + }, + { + "epoch": 1.0595869003940752, + "grad_norm": 0.8094419836997986, + "learning_rate": 0.0008676110884631064, + "loss": 3.7214, + "step": 15595 + }, + { + "epoch": 1.059926620464737, + "grad_norm": 0.7307969331741333, + "learning_rate": 0.0008675686234542737, + "loss": 3.5662, + "step": 15600 + }, + { + "epoch": 1.060266340535399, + "grad_norm": 0.7257490754127502, + "learning_rate": 0.000867526158445441, + "loss": 3.471, + "step": 15605 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.8852719664573669, + "learning_rate": 0.0008674836934366083, + "loss": 3.7268, + "step": 15610 + }, + { + "epoch": 1.0609457806767224, + "grad_norm": 0.7865869402885437, + "learning_rate": 0.0008674412284277755, + "loss": 3.6857, + "step": 15615 + }, + { + "epoch": 1.0612855007473843, + "grad_norm": 0.7893869280815125, + "learning_rate": 0.0008673987634189429, + "loss": 3.9989, + "step": 15620 + }, + { + "epoch": 1.061625220818046, + "grad_norm": 0.8716294169425964, + "learning_rate": 0.0008673562984101101, + "loss": 3.5972, + "step": 15625 + }, + { + "epoch": 1.0619649408887077, + "grad_norm": 0.7460842728614807, + "learning_rate": 0.0008673138334012773, + "loss": 3.7215, + "step": 15630 + }, + { + "epoch": 1.0623046609593694, + "grad_norm": 0.9360713958740234, + "learning_rate": 0.0008672713683924447, + "loss": 3.7538, + "step": 15635 + }, + { + "epoch": 1.0626443810300312, + "grad_norm": 1.1654986143112183, + "learning_rate": 0.0008672289033836119, + "loss": 3.4678, + "step": 15640 + }, + { + "epoch": 1.062984101100693, + "grad_norm": 0.9604095220565796, + "learning_rate": 0.0008671864383747792, + "loss": 3.5295, + "step": 15645 + }, + { + "epoch": 1.0633238211713547, + "grad_norm": 0.8354559540748596, + "learning_rate": 0.0008671439733659466, + "loss": 3.6092, + "step": 15650 + }, + { + "epoch": 1.0636635412420166, + "grad_norm": 1.0568187236785889, + "learning_rate": 0.0008671015083571138, + "loss": 3.605, + "step": 15655 + }, + { + "epoch": 1.0640032613126784, + "grad_norm": 1.3043498992919922, + "learning_rate": 0.000867059043348281, + "loss": 3.5474, + "step": 15660 + }, + { + "epoch": 1.06434298138334, + "grad_norm": 0.7852807641029358, + "learning_rate": 0.0008670165783394483, + "loss": 3.5353, + "step": 15665 + }, + { + "epoch": 1.064682701454002, + "grad_norm": 0.9302780628204346, + "learning_rate": 0.0008669741133306156, + "loss": 3.4599, + "step": 15670 + }, + { + "epoch": 1.0650224215246638, + "grad_norm": 1.1789782047271729, + "learning_rate": 0.0008669316483217828, + "loss": 3.6039, + "step": 15675 + }, + { + "epoch": 1.0653621415953254, + "grad_norm": 0.7629793286323547, + "learning_rate": 0.0008668891833129502, + "loss": 3.7981, + "step": 15680 + }, + { + "epoch": 1.0657018616659872, + "grad_norm": 1.1154636144638062, + "learning_rate": 0.0008668467183041175, + "loss": 3.4061, + "step": 15685 + }, + { + "epoch": 1.066041581736649, + "grad_norm": 0.8396511673927307, + "learning_rate": 0.0008668042532952847, + "loss": 3.8315, + "step": 15690 + }, + { + "epoch": 1.0663813018073107, + "grad_norm": 0.7889012098312378, + "learning_rate": 0.000866761788286452, + "loss": 3.661, + "step": 15695 + }, + { + "epoch": 1.0667210218779726, + "grad_norm": 0.8888555765151978, + "learning_rate": 0.0008667193232776193, + "loss": 3.6586, + "step": 15700 + }, + { + "epoch": 1.0670607419486344, + "grad_norm": 0.8926337957382202, + "learning_rate": 0.0008666768582687865, + "loss": 3.2638, + "step": 15705 + }, + { + "epoch": 1.067400462019296, + "grad_norm": 0.8350597023963928, + "learning_rate": 0.0008666343932599538, + "loss": 3.5546, + "step": 15710 + }, + { + "epoch": 1.067740182089958, + "grad_norm": 0.7126908302307129, + "learning_rate": 0.0008665919282511211, + "loss": 3.4451, + "step": 15715 + }, + { + "epoch": 1.0680799021606195, + "grad_norm": 0.8147726058959961, + "learning_rate": 0.0008665494632422884, + "loss": 3.5759, + "step": 15720 + }, + { + "epoch": 1.0684196222312814, + "grad_norm": 0.8288269639015198, + "learning_rate": 0.0008665069982334557, + "loss": 3.6222, + "step": 15725 + }, + { + "epoch": 1.0687593423019432, + "grad_norm": 0.8992449641227722, + "learning_rate": 0.0008664645332246229, + "loss": 3.6136, + "step": 15730 + }, + { + "epoch": 1.0690990623726049, + "grad_norm": 1.3939313888549805, + "learning_rate": 0.0008664220682157902, + "loss": 3.7762, + "step": 15735 + }, + { + "epoch": 1.0694387824432667, + "grad_norm": 0.7164123058319092, + "learning_rate": 0.0008663796032069575, + "loss": 3.6676, + "step": 15740 + }, + { + "epoch": 1.0697785025139286, + "grad_norm": 1.095192313194275, + "learning_rate": 0.0008663371381981247, + "loss": 3.5184, + "step": 15745 + }, + { + "epoch": 1.0701182225845902, + "grad_norm": 0.9224209785461426, + "learning_rate": 0.0008662946731892921, + "loss": 3.3435, + "step": 15750 + }, + { + "epoch": 1.070457942655252, + "grad_norm": 0.8177723288536072, + "learning_rate": 0.0008662522081804594, + "loss": 3.8843, + "step": 15755 + }, + { + "epoch": 1.070797662725914, + "grad_norm": 0.8707209825515747, + "learning_rate": 0.0008662097431716266, + "loss": 3.4853, + "step": 15760 + }, + { + "epoch": 1.0711373827965756, + "grad_norm": 1.417647123336792, + "learning_rate": 0.0008661672781627938, + "loss": 3.6745, + "step": 15765 + }, + { + "epoch": 1.0714771028672374, + "grad_norm": 0.9045370817184448, + "learning_rate": 0.0008661248131539612, + "loss": 3.397, + "step": 15770 + }, + { + "epoch": 1.0718168229378993, + "grad_norm": 0.9331650137901306, + "learning_rate": 0.0008660823481451284, + "loss": 3.7061, + "step": 15775 + }, + { + "epoch": 1.072156543008561, + "grad_norm": 0.8140069246292114, + "learning_rate": 0.0008660398831362956, + "loss": 3.6041, + "step": 15780 + }, + { + "epoch": 1.0724962630792227, + "grad_norm": 0.8020852208137512, + "learning_rate": 0.0008659974181274631, + "loss": 3.4695, + "step": 15785 + }, + { + "epoch": 1.0728359831498846, + "grad_norm": 0.758999764919281, + "learning_rate": 0.0008659549531186303, + "loss": 3.6057, + "step": 15790 + }, + { + "epoch": 1.0731757032205462, + "grad_norm": 0.8865442276000977, + "learning_rate": 0.0008659124881097975, + "loss": 3.829, + "step": 15795 + }, + { + "epoch": 1.073515423291208, + "grad_norm": 0.8678497076034546, + "learning_rate": 0.0008658700231009649, + "loss": 3.5981, + "step": 15800 + }, + { + "epoch": 1.07385514336187, + "grad_norm": 0.7725279331207275, + "learning_rate": 0.0008658275580921321, + "loss": 3.6465, + "step": 15805 + }, + { + "epoch": 1.0741948634325316, + "grad_norm": 0.8735711574554443, + "learning_rate": 0.0008657850930832993, + "loss": 3.4665, + "step": 15810 + }, + { + "epoch": 1.0745345835031934, + "grad_norm": 1.0707951784133911, + "learning_rate": 0.0008657426280744666, + "loss": 3.6129, + "step": 15815 + }, + { + "epoch": 1.074874303573855, + "grad_norm": 0.7036058902740479, + "learning_rate": 0.000865700163065634, + "loss": 3.6647, + "step": 15820 + }, + { + "epoch": 1.075214023644517, + "grad_norm": 0.9631282091140747, + "learning_rate": 0.0008656576980568012, + "loss": 3.6228, + "step": 15825 + }, + { + "epoch": 1.0755537437151788, + "grad_norm": 0.796127438545227, + "learning_rate": 0.0008656152330479685, + "loss": 3.6405, + "step": 15830 + }, + { + "epoch": 1.0758934637858404, + "grad_norm": 0.8031331300735474, + "learning_rate": 0.0008655727680391358, + "loss": 3.3856, + "step": 15835 + }, + { + "epoch": 1.0762331838565022, + "grad_norm": 1.1724438667297363, + "learning_rate": 0.000865530303030303, + "loss": 3.6188, + "step": 15840 + }, + { + "epoch": 1.076572903927164, + "grad_norm": 0.9749025702476501, + "learning_rate": 0.0008654878380214703, + "loss": 3.1766, + "step": 15845 + }, + { + "epoch": 1.0769126239978257, + "grad_norm": 0.7991123795509338, + "learning_rate": 0.0008654453730126375, + "loss": 3.3386, + "step": 15850 + }, + { + "epoch": 1.0772523440684876, + "grad_norm": 0.6894931197166443, + "learning_rate": 0.0008654029080038049, + "loss": 3.6114, + "step": 15855 + }, + { + "epoch": 1.0775920641391494, + "grad_norm": 1.01557457447052, + "learning_rate": 0.0008653604429949722, + "loss": 3.8509, + "step": 15860 + }, + { + "epoch": 1.077931784209811, + "grad_norm": 0.8648229837417603, + "learning_rate": 0.0008653179779861394, + "loss": 3.3591, + "step": 15865 + }, + { + "epoch": 1.078271504280473, + "grad_norm": 0.7908108830451965, + "learning_rate": 0.0008652755129773067, + "loss": 3.7018, + "step": 15870 + }, + { + "epoch": 1.0786112243511348, + "grad_norm": 0.7632598876953125, + "learning_rate": 0.000865233047968474, + "loss": 3.5691, + "step": 15875 + }, + { + "epoch": 1.0789509444217964, + "grad_norm": 1.1991726160049438, + "learning_rate": 0.0008651905829596412, + "loss": 3.4961, + "step": 15880 + }, + { + "epoch": 1.0792906644924583, + "grad_norm": 0.767606258392334, + "learning_rate": 0.0008651481179508085, + "loss": 3.6746, + "step": 15885 + }, + { + "epoch": 1.0796303845631199, + "grad_norm": 0.7464436888694763, + "learning_rate": 0.0008651056529419759, + "loss": 3.5623, + "step": 15890 + }, + { + "epoch": 1.0799701046337817, + "grad_norm": 0.8179015517234802, + "learning_rate": 0.0008650631879331431, + "loss": 3.6889, + "step": 15895 + }, + { + "epoch": 1.0803098247044436, + "grad_norm": 1.0746855735778809, + "learning_rate": 0.0008650207229243103, + "loss": 3.2479, + "step": 15900 + }, + { + "epoch": 1.0806495447751052, + "grad_norm": 0.7340206503868103, + "learning_rate": 0.0008649782579154777, + "loss": 3.8776, + "step": 15905 + }, + { + "epoch": 1.080989264845767, + "grad_norm": 0.8113567233085632, + "learning_rate": 0.0008649357929066449, + "loss": 3.8826, + "step": 15910 + }, + { + "epoch": 1.081328984916429, + "grad_norm": 0.8013406991958618, + "learning_rate": 0.0008648933278978121, + "loss": 3.6943, + "step": 15915 + }, + { + "epoch": 1.0816687049870906, + "grad_norm": 0.9016230702400208, + "learning_rate": 0.0008648508628889795, + "loss": 3.5482, + "step": 15920 + }, + { + "epoch": 1.0820084250577524, + "grad_norm": 0.7789667248725891, + "learning_rate": 0.0008648083978801468, + "loss": 3.6111, + "step": 15925 + }, + { + "epoch": 1.0823481451284143, + "grad_norm": 0.75594162940979, + "learning_rate": 0.0008647659328713141, + "loss": 3.5595, + "step": 15930 + }, + { + "epoch": 1.082687865199076, + "grad_norm": 0.7967160940170288, + "learning_rate": 0.0008647234678624814, + "loss": 3.4888, + "step": 15935 + }, + { + "epoch": 1.0830275852697377, + "grad_norm": 1.1452893018722534, + "learning_rate": 0.0008646810028536486, + "loss": 3.616, + "step": 15940 + }, + { + "epoch": 1.0833673053403996, + "grad_norm": 0.806159496307373, + "learning_rate": 0.0008646385378448159, + "loss": 3.645, + "step": 15945 + }, + { + "epoch": 1.0837070254110612, + "grad_norm": 0.7427250742912292, + "learning_rate": 0.0008645960728359831, + "loss": 3.6305, + "step": 15950 + }, + { + "epoch": 1.084046745481723, + "grad_norm": 0.8408187031745911, + "learning_rate": 0.0008645536078271504, + "loss": 3.6436, + "step": 15955 + }, + { + "epoch": 1.084386465552385, + "grad_norm": 0.8659944534301758, + "learning_rate": 0.0008645111428183178, + "loss": 3.5795, + "step": 15960 + }, + { + "epoch": 1.0847261856230466, + "grad_norm": 0.8397899270057678, + "learning_rate": 0.000864468677809485, + "loss": 3.6641, + "step": 15965 + }, + { + "epoch": 1.0850659056937084, + "grad_norm": 0.7360854148864746, + "learning_rate": 0.0008644262128006523, + "loss": 3.5218, + "step": 15970 + }, + { + "epoch": 1.0854056257643703, + "grad_norm": 0.8004935383796692, + "learning_rate": 0.0008643837477918196, + "loss": 3.7404, + "step": 15975 + }, + { + "epoch": 1.085745345835032, + "grad_norm": 0.8192716836929321, + "learning_rate": 0.0008643412827829868, + "loss": 3.7612, + "step": 15980 + }, + { + "epoch": 1.0860850659056938, + "grad_norm": 0.8023162484169006, + "learning_rate": 0.0008642988177741541, + "loss": 3.6097, + "step": 15985 + }, + { + "epoch": 1.0864247859763554, + "grad_norm": 0.7164750099182129, + "learning_rate": 0.0008642563527653214, + "loss": 3.5831, + "step": 15990 + }, + { + "epoch": 1.0867645060470172, + "grad_norm": 0.7966431379318237, + "learning_rate": 0.0008642138877564887, + "loss": 3.6656, + "step": 15995 + }, + { + "epoch": 1.087104226117679, + "grad_norm": 1.102699637413025, + "learning_rate": 0.000864171422747656, + "loss": 3.7259, + "step": 16000 + }, + { + "epoch": 1.0874439461883407, + "grad_norm": 0.9832888841629028, + "learning_rate": 0.0008641289577388233, + "loss": 3.6655, + "step": 16005 + }, + { + "epoch": 1.0877836662590026, + "grad_norm": 4.8686981201171875, + "learning_rate": 0.0008640864927299905, + "loss": 3.8857, + "step": 16010 + }, + { + "epoch": 1.0881233863296644, + "grad_norm": 0.6354562640190125, + "learning_rate": 0.0008640440277211577, + "loss": 3.48, + "step": 16015 + }, + { + "epoch": 1.088463106400326, + "grad_norm": 0.606461763381958, + "learning_rate": 0.0008640015627123251, + "loss": 3.5761, + "step": 16020 + }, + { + "epoch": 1.088802826470988, + "grad_norm": 0.7682428956031799, + "learning_rate": 0.0008639590977034923, + "loss": 3.6044, + "step": 16025 + }, + { + "epoch": 1.0891425465416498, + "grad_norm": 0.9601049423217773, + "learning_rate": 0.0008639166326946596, + "loss": 3.6189, + "step": 16030 + }, + { + "epoch": 1.0894822666123114, + "grad_norm": 0.8520056009292603, + "learning_rate": 0.000863874167685827, + "loss": 3.5965, + "step": 16035 + }, + { + "epoch": 1.0898219866829733, + "grad_norm": 0.754764974117279, + "learning_rate": 0.0008638317026769942, + "loss": 3.3746, + "step": 16040 + }, + { + "epoch": 1.090161706753635, + "grad_norm": 0.8628759980201721, + "learning_rate": 0.0008637892376681614, + "loss": 3.4719, + "step": 16045 + }, + { + "epoch": 1.0905014268242967, + "grad_norm": 0.6527395248413086, + "learning_rate": 0.0008637467726593288, + "loss": 3.8535, + "step": 16050 + }, + { + "epoch": 1.0908411468949586, + "grad_norm": 0.7080455422401428, + "learning_rate": 0.000863704307650496, + "loss": 3.6113, + "step": 16055 + }, + { + "epoch": 1.0911808669656202, + "grad_norm": 0.8439928293228149, + "learning_rate": 0.0008636618426416632, + "loss": 3.3118, + "step": 16060 + }, + { + "epoch": 1.091520587036282, + "grad_norm": 2.25856614112854, + "learning_rate": 0.0008636193776328306, + "loss": 3.6307, + "step": 16065 + }, + { + "epoch": 1.091860307106944, + "grad_norm": 1.1851789951324463, + "learning_rate": 0.0008635769126239979, + "loss": 3.4921, + "step": 16070 + }, + { + "epoch": 1.0922000271776056, + "grad_norm": 0.6983203887939453, + "learning_rate": 0.0008635344476151651, + "loss": 3.5946, + "step": 16075 + }, + { + "epoch": 1.0925397472482674, + "grad_norm": 1.3732134103775024, + "learning_rate": 0.0008634919826063324, + "loss": 3.6173, + "step": 16080 + }, + { + "epoch": 1.0928794673189293, + "grad_norm": 0.8757897615432739, + "learning_rate": 0.0008634495175974997, + "loss": 3.5725, + "step": 16085 + }, + { + "epoch": 1.093219187389591, + "grad_norm": 0.8341741561889648, + "learning_rate": 0.0008634070525886669, + "loss": 3.363, + "step": 16090 + }, + { + "epoch": 1.0935589074602527, + "grad_norm": 0.8814555406570435, + "learning_rate": 0.0008633645875798342, + "loss": 3.8033, + "step": 16095 + }, + { + "epoch": 1.0938986275309146, + "grad_norm": 0.7780993580818176, + "learning_rate": 0.0008633221225710016, + "loss": 3.7132, + "step": 16100 + }, + { + "epoch": 1.0942383476015762, + "grad_norm": 0.7374149560928345, + "learning_rate": 0.0008632796575621688, + "loss": 3.4939, + "step": 16105 + }, + { + "epoch": 1.094578067672238, + "grad_norm": 0.9646396636962891, + "learning_rate": 0.0008632371925533361, + "loss": 3.5916, + "step": 16110 + }, + { + "epoch": 1.0949177877429, + "grad_norm": 0.8299522995948792, + "learning_rate": 0.0008631947275445033, + "loss": 3.6576, + "step": 16115 + }, + { + "epoch": 1.0952575078135616, + "grad_norm": 0.6972337961196899, + "learning_rate": 0.0008631522625356706, + "loss": 3.6601, + "step": 16120 + }, + { + "epoch": 1.0955972278842234, + "grad_norm": 0.9139479398727417, + "learning_rate": 0.0008631097975268379, + "loss": 3.5024, + "step": 16125 + }, + { + "epoch": 1.0959369479548853, + "grad_norm": 0.7634836435317993, + "learning_rate": 0.0008630673325180051, + "loss": 3.8547, + "step": 16130 + }, + { + "epoch": 1.096276668025547, + "grad_norm": 0.8669366836547852, + "learning_rate": 0.0008630248675091725, + "loss": 3.4204, + "step": 16135 + }, + { + "epoch": 1.0966163880962088, + "grad_norm": 0.6963980793952942, + "learning_rate": 0.0008629824025003398, + "loss": 3.7502, + "step": 16140 + }, + { + "epoch": 1.0969561081668706, + "grad_norm": 0.7858267426490784, + "learning_rate": 0.000862939937491507, + "loss": 3.5196, + "step": 16145 + }, + { + "epoch": 1.0972958282375322, + "grad_norm": 0.7184492945671082, + "learning_rate": 0.0008628974724826742, + "loss": 3.6032, + "step": 16150 + }, + { + "epoch": 1.097635548308194, + "grad_norm": 0.8189446926116943, + "learning_rate": 0.0008628550074738416, + "loss": 3.751, + "step": 16155 + }, + { + "epoch": 1.0979752683788557, + "grad_norm": 0.7629602551460266, + "learning_rate": 0.0008628125424650088, + "loss": 3.5373, + "step": 16160 + }, + { + "epoch": 1.0983149884495176, + "grad_norm": 0.9630057215690613, + "learning_rate": 0.000862770077456176, + "loss": 3.5495, + "step": 16165 + }, + { + "epoch": 1.0986547085201794, + "grad_norm": 1.204612135887146, + "learning_rate": 0.0008627276124473435, + "loss": 3.5883, + "step": 16170 + }, + { + "epoch": 1.098994428590841, + "grad_norm": 0.9133951663970947, + "learning_rate": 0.0008626851474385107, + "loss": 3.7049, + "step": 16175 + }, + { + "epoch": 1.099334148661503, + "grad_norm": 0.8792445063591003, + "learning_rate": 0.0008626426824296779, + "loss": 3.601, + "step": 16180 + }, + { + "epoch": 1.0996738687321648, + "grad_norm": 0.7400299310684204, + "learning_rate": 0.0008626002174208453, + "loss": 3.7044, + "step": 16185 + }, + { + "epoch": 1.1000135888028264, + "grad_norm": 0.9188005328178406, + "learning_rate": 0.0008625577524120125, + "loss": 3.4331, + "step": 16190 + }, + { + "epoch": 1.1003533088734883, + "grad_norm": 1.1212857961654663, + "learning_rate": 0.0008625152874031797, + "loss": 3.6629, + "step": 16195 + }, + { + "epoch": 1.10069302894415, + "grad_norm": 0.8232102394104004, + "learning_rate": 0.0008624728223943472, + "loss": 3.6453, + "step": 16200 + }, + { + "epoch": 1.1010327490148117, + "grad_norm": 0.7455711364746094, + "learning_rate": 0.0008624303573855144, + "loss": 3.6205, + "step": 16205 + }, + { + "epoch": 1.1013724690854736, + "grad_norm": 0.7377623319625854, + "learning_rate": 0.0008623878923766816, + "loss": 3.4477, + "step": 16210 + }, + { + "epoch": 1.1017121891561354, + "grad_norm": 0.7220746278762817, + "learning_rate": 0.0008623454273678489, + "loss": 3.3879, + "step": 16215 + }, + { + "epoch": 1.102051909226797, + "grad_norm": 0.9508424997329712, + "learning_rate": 0.0008623029623590162, + "loss": 3.7113, + "step": 16220 + }, + { + "epoch": 1.102391629297459, + "grad_norm": 0.8642879724502563, + "learning_rate": 0.0008622604973501834, + "loss": 3.807, + "step": 16225 + }, + { + "epoch": 1.1027313493681206, + "grad_norm": 0.7338989973068237, + "learning_rate": 0.0008622180323413507, + "loss": 3.6923, + "step": 16230 + }, + { + "epoch": 1.1030710694387824, + "grad_norm": 0.8889486789703369, + "learning_rate": 0.0008621755673325181, + "loss": 3.649, + "step": 16235 + }, + { + "epoch": 1.1034107895094443, + "grad_norm": 0.6491808295249939, + "learning_rate": 0.0008621331023236853, + "loss": 3.7516, + "step": 16240 + }, + { + "epoch": 1.103750509580106, + "grad_norm": 1.078559398651123, + "learning_rate": 0.0008620906373148526, + "loss": 3.7655, + "step": 16245 + }, + { + "epoch": 1.1040902296507678, + "grad_norm": 0.7912032008171082, + "learning_rate": 0.0008620481723060198, + "loss": 3.7236, + "step": 16250 + }, + { + "epoch": 1.1044299497214296, + "grad_norm": 0.865936815738678, + "learning_rate": 0.0008620057072971871, + "loss": 3.6461, + "step": 16255 + }, + { + "epoch": 1.1047696697920912, + "grad_norm": 0.6954441666603088, + "learning_rate": 0.0008619632422883544, + "loss": 3.4813, + "step": 16260 + }, + { + "epoch": 1.105109389862753, + "grad_norm": 0.9260016679763794, + "learning_rate": 0.0008619207772795216, + "loss": 3.4543, + "step": 16265 + }, + { + "epoch": 1.105449109933415, + "grad_norm": 0.9640270471572876, + "learning_rate": 0.0008618783122706891, + "loss": 3.5372, + "step": 16270 + }, + { + "epoch": 1.1057888300040766, + "grad_norm": 1.1160563230514526, + "learning_rate": 0.0008618358472618563, + "loss": 3.6843, + "step": 16275 + }, + { + "epoch": 1.1061285500747384, + "grad_norm": 1.0663708448410034, + "learning_rate": 0.0008617933822530235, + "loss": 3.5298, + "step": 16280 + }, + { + "epoch": 1.1064682701454003, + "grad_norm": 0.9835437536239624, + "learning_rate": 0.0008617509172441909, + "loss": 3.6322, + "step": 16285 + }, + { + "epoch": 1.106807990216062, + "grad_norm": 0.8725509643554688, + "learning_rate": 0.0008617084522353581, + "loss": 3.5598, + "step": 16290 + }, + { + "epoch": 1.1071477102867238, + "grad_norm": 0.7793298959732056, + "learning_rate": 0.0008616659872265253, + "loss": 3.2784, + "step": 16295 + }, + { + "epoch": 1.1074874303573856, + "grad_norm": 0.8850125670433044, + "learning_rate": 0.0008616235222176926, + "loss": 3.3347, + "step": 16300 + }, + { + "epoch": 1.1078271504280472, + "grad_norm": 0.6695400476455688, + "learning_rate": 0.00086158105720886, + "loss": 3.5648, + "step": 16305 + }, + { + "epoch": 1.108166870498709, + "grad_norm": 0.738503098487854, + "learning_rate": 0.0008615385922000272, + "loss": 3.8309, + "step": 16310 + }, + { + "epoch": 1.108506590569371, + "grad_norm": 0.783973217010498, + "learning_rate": 0.0008614961271911945, + "loss": 3.4756, + "step": 16315 + }, + { + "epoch": 1.1088463106400326, + "grad_norm": 0.7216949462890625, + "learning_rate": 0.0008614536621823618, + "loss": 3.6984, + "step": 16320 + }, + { + "epoch": 1.1091860307106944, + "grad_norm": 0.6562370657920837, + "learning_rate": 0.000861411197173529, + "loss": 3.7432, + "step": 16325 + }, + { + "epoch": 1.109525750781356, + "grad_norm": 0.9012731313705444, + "learning_rate": 0.0008613687321646963, + "loss": 3.5746, + "step": 16330 + }, + { + "epoch": 1.109865470852018, + "grad_norm": 0.5781989693641663, + "learning_rate": 0.0008613262671558636, + "loss": 3.7064, + "step": 16335 + }, + { + "epoch": 1.1102051909226798, + "grad_norm": 0.7809541821479797, + "learning_rate": 0.0008612838021470309, + "loss": 3.4615, + "step": 16340 + }, + { + "epoch": 1.1105449109933414, + "grad_norm": 0.8737934231758118, + "learning_rate": 0.0008612413371381982, + "loss": 3.7512, + "step": 16345 + }, + { + "epoch": 1.1108846310640033, + "grad_norm": 0.7685277462005615, + "learning_rate": 0.0008611988721293654, + "loss": 3.7249, + "step": 16350 + }, + { + "epoch": 1.1112243511346651, + "grad_norm": 1.0498319864273071, + "learning_rate": 0.0008611564071205327, + "loss": 3.6526, + "step": 16355 + }, + { + "epoch": 1.1115640712053267, + "grad_norm": 0.7244815230369568, + "learning_rate": 0.0008611139421117, + "loss": 3.3779, + "step": 16360 + }, + { + "epoch": 1.1119037912759886, + "grad_norm": 0.6341451406478882, + "learning_rate": 0.0008610714771028672, + "loss": 3.6273, + "step": 16365 + }, + { + "epoch": 1.1122435113466504, + "grad_norm": 0.980139434337616, + "learning_rate": 0.0008610290120940345, + "loss": 3.5104, + "step": 16370 + }, + { + "epoch": 1.112583231417312, + "grad_norm": 0.6750521063804626, + "learning_rate": 0.0008609865470852019, + "loss": 3.3873, + "step": 16375 + }, + { + "epoch": 1.112922951487974, + "grad_norm": 0.8708168864250183, + "learning_rate": 0.0008609440820763691, + "loss": 3.73, + "step": 16380 + }, + { + "epoch": 1.1132626715586358, + "grad_norm": 0.7895426154136658, + "learning_rate": 0.0008609016170675364, + "loss": 3.7025, + "step": 16385 + }, + { + "epoch": 1.1136023916292974, + "grad_norm": 0.7221102714538574, + "learning_rate": 0.0008608591520587037, + "loss": 3.8443, + "step": 16390 + }, + { + "epoch": 1.1139421116999593, + "grad_norm": 0.762961745262146, + "learning_rate": 0.0008608166870498709, + "loss": 3.8529, + "step": 16395 + }, + { + "epoch": 1.114281831770621, + "grad_norm": 0.7619752883911133, + "learning_rate": 0.0008607742220410381, + "loss": 3.5271, + "step": 16400 + }, + { + "epoch": 1.1146215518412828, + "grad_norm": 0.8401373624801636, + "learning_rate": 0.0008607317570322055, + "loss": 3.5379, + "step": 16405 + }, + { + "epoch": 1.1149612719119446, + "grad_norm": 0.7221295833587646, + "learning_rate": 0.0008606892920233728, + "loss": 3.7169, + "step": 16410 + }, + { + "epoch": 1.1153009919826062, + "grad_norm": 0.7968506813049316, + "learning_rate": 0.00086064682701454, + "loss": 3.6011, + "step": 16415 + }, + { + "epoch": 1.115640712053268, + "grad_norm": 0.7913619875907898, + "learning_rate": 0.0008606043620057074, + "loss": 3.7893, + "step": 16420 + }, + { + "epoch": 1.11598043212393, + "grad_norm": 1.7621521949768066, + "learning_rate": 0.0008605618969968746, + "loss": 3.4525, + "step": 16425 + }, + { + "epoch": 1.1163201521945916, + "grad_norm": 0.8854485154151917, + "learning_rate": 0.0008605194319880418, + "loss": 3.7446, + "step": 16430 + }, + { + "epoch": 1.1166598722652534, + "grad_norm": 0.9722793102264404, + "learning_rate": 0.0008604769669792092, + "loss": 3.481, + "step": 16435 + }, + { + "epoch": 1.1169995923359153, + "grad_norm": 0.8112024664878845, + "learning_rate": 0.0008604345019703764, + "loss": 3.6496, + "step": 16440 + }, + { + "epoch": 1.117339312406577, + "grad_norm": 1.2786113023757935, + "learning_rate": 0.0008603920369615437, + "loss": 3.599, + "step": 16445 + }, + { + "epoch": 1.1176790324772388, + "grad_norm": 0.9102990031242371, + "learning_rate": 0.000860349571952711, + "loss": 3.9917, + "step": 16450 + }, + { + "epoch": 1.1180187525479006, + "grad_norm": 0.7415691614151001, + "learning_rate": 0.0008603071069438783, + "loss": 3.6446, + "step": 16455 + }, + { + "epoch": 1.1183584726185622, + "grad_norm": 1.1944321393966675, + "learning_rate": 0.0008602646419350455, + "loss": 3.6668, + "step": 16460 + }, + { + "epoch": 1.118698192689224, + "grad_norm": 0.7565018534660339, + "learning_rate": 0.0008602221769262128, + "loss": 3.8471, + "step": 16465 + }, + { + "epoch": 1.119037912759886, + "grad_norm": 1.1605318784713745, + "learning_rate": 0.0008601797119173801, + "loss": 3.3343, + "step": 16470 + }, + { + "epoch": 1.1193776328305476, + "grad_norm": 0.6550281047821045, + "learning_rate": 0.0008601372469085473, + "loss": 3.6725, + "step": 16475 + }, + { + "epoch": 1.1197173529012094, + "grad_norm": 0.7235205173492432, + "learning_rate": 0.0008600947818997147, + "loss": 3.613, + "step": 16480 + }, + { + "epoch": 1.1200570729718713, + "grad_norm": 0.8582978844642639, + "learning_rate": 0.000860052316890882, + "loss": 3.5835, + "step": 16485 + }, + { + "epoch": 1.120396793042533, + "grad_norm": 0.7200369238853455, + "learning_rate": 0.0008600098518820492, + "loss": 3.5101, + "step": 16490 + }, + { + "epoch": 1.1207365131131948, + "grad_norm": 0.7277953624725342, + "learning_rate": 0.0008599673868732165, + "loss": 3.6746, + "step": 16495 + }, + { + "epoch": 1.1210762331838564, + "grad_norm": 0.8080256581306458, + "learning_rate": 0.0008599249218643837, + "loss": 3.6874, + "step": 16500 + }, + { + "epoch": 1.1214159532545183, + "grad_norm": 0.7543604969978333, + "learning_rate": 0.000859882456855551, + "loss": 3.6566, + "step": 16505 + }, + { + "epoch": 1.1217556733251801, + "grad_norm": 0.859119176864624, + "learning_rate": 0.0008598399918467183, + "loss": 3.5896, + "step": 16510 + }, + { + "epoch": 1.1220953933958417, + "grad_norm": 0.9015430212020874, + "learning_rate": 0.0008597975268378856, + "loss": 3.6612, + "step": 16515 + }, + { + "epoch": 1.1224351134665036, + "grad_norm": 0.7934795022010803, + "learning_rate": 0.0008597550618290529, + "loss": 3.7667, + "step": 16520 + }, + { + "epoch": 1.1227748335371655, + "grad_norm": 0.7493761777877808, + "learning_rate": 0.0008597125968202202, + "loss": 3.4878, + "step": 16525 + }, + { + "epoch": 1.123114553607827, + "grad_norm": 0.6733303070068359, + "learning_rate": 0.0008596701318113874, + "loss": 3.5259, + "step": 16530 + }, + { + "epoch": 1.123454273678489, + "grad_norm": 0.7560723423957825, + "learning_rate": 0.0008596276668025546, + "loss": 3.541, + "step": 16535 + }, + { + "epoch": 1.1237939937491508, + "grad_norm": 0.7947947978973389, + "learning_rate": 0.000859585201793722, + "loss": 3.8098, + "step": 16540 + }, + { + "epoch": 1.1241337138198124, + "grad_norm": 0.8157612085342407, + "learning_rate": 0.0008595427367848892, + "loss": 3.6416, + "step": 16545 + }, + { + "epoch": 1.1244734338904743, + "grad_norm": 0.8298633098602295, + "learning_rate": 0.0008595002717760565, + "loss": 3.6013, + "step": 16550 + }, + { + "epoch": 1.1248131539611361, + "grad_norm": 0.8036666512489319, + "learning_rate": 0.0008594578067672239, + "loss": 3.6436, + "step": 16555 + }, + { + "epoch": 1.1251528740317978, + "grad_norm": 0.7945606708526611, + "learning_rate": 0.0008594153417583911, + "loss": 3.6807, + "step": 16560 + }, + { + "epoch": 1.1254925941024596, + "grad_norm": 0.8582147359848022, + "learning_rate": 0.0008593728767495583, + "loss": 3.7758, + "step": 16565 + }, + { + "epoch": 1.1258323141731212, + "grad_norm": 0.8802775740623474, + "learning_rate": 0.0008593304117407257, + "loss": 3.5149, + "step": 16570 + }, + { + "epoch": 1.126172034243783, + "grad_norm": 1.8572415113449097, + "learning_rate": 0.0008592879467318929, + "loss": 3.746, + "step": 16575 + }, + { + "epoch": 1.126511754314445, + "grad_norm": 0.80617356300354, + "learning_rate": 0.0008592454817230601, + "loss": 3.7905, + "step": 16580 + }, + { + "epoch": 1.1268514743851066, + "grad_norm": 0.6561198234558105, + "learning_rate": 0.0008592030167142276, + "loss": 3.7387, + "step": 16585 + }, + { + "epoch": 1.1271911944557684, + "grad_norm": 0.7791321873664856, + "learning_rate": 0.0008591605517053948, + "loss": 3.5106, + "step": 16590 + }, + { + "epoch": 1.1275309145264303, + "grad_norm": 1.0607845783233643, + "learning_rate": 0.000859118086696562, + "loss": 3.8483, + "step": 16595 + }, + { + "epoch": 1.127870634597092, + "grad_norm": 1.1248379945755005, + "learning_rate": 0.0008590756216877293, + "loss": 3.5489, + "step": 16600 + }, + { + "epoch": 1.1282103546677538, + "grad_norm": 0.7584629058837891, + "learning_rate": 0.0008590331566788966, + "loss": 3.4726, + "step": 16605 + }, + { + "epoch": 1.1285500747384156, + "grad_norm": 0.7691929936408997, + "learning_rate": 0.0008589906916700639, + "loss": 3.5564, + "step": 16610 + }, + { + "epoch": 1.1288897948090773, + "grad_norm": 0.8282674551010132, + "learning_rate": 0.0008589482266612311, + "loss": 3.6715, + "step": 16615 + }, + { + "epoch": 1.129229514879739, + "grad_norm": 0.7953303456306458, + "learning_rate": 0.0008589057616523985, + "loss": 3.3982, + "step": 16620 + }, + { + "epoch": 1.129569234950401, + "grad_norm": 0.9684343934059143, + "learning_rate": 0.0008588632966435658, + "loss": 3.656, + "step": 16625 + }, + { + "epoch": 1.1299089550210626, + "grad_norm": 0.8689923286437988, + "learning_rate": 0.000858820831634733, + "loss": 3.5687, + "step": 16630 + }, + { + "epoch": 1.1302486750917244, + "grad_norm": 0.815409243106842, + "learning_rate": 0.0008587783666259002, + "loss": 3.7309, + "step": 16635 + }, + { + "epoch": 1.1305883951623863, + "grad_norm": 0.9557545185089111, + "learning_rate": 0.0008587359016170676, + "loss": 3.6477, + "step": 16640 + }, + { + "epoch": 1.130928115233048, + "grad_norm": 0.9037600159645081, + "learning_rate": 0.0008586934366082348, + "loss": 3.6032, + "step": 16645 + }, + { + "epoch": 1.1312678353037098, + "grad_norm": 0.8134204149246216, + "learning_rate": 0.000858650971599402, + "loss": 3.8323, + "step": 16650 + }, + { + "epoch": 1.1316075553743716, + "grad_norm": 0.7151156067848206, + "learning_rate": 0.0008586085065905695, + "loss": 3.3559, + "step": 16655 + }, + { + "epoch": 1.1319472754450333, + "grad_norm": 0.7549887299537659, + "learning_rate": 0.0008585660415817367, + "loss": 3.6469, + "step": 16660 + }, + { + "epoch": 1.1322869955156951, + "grad_norm": 0.7631164193153381, + "learning_rate": 0.0008585235765729039, + "loss": 3.6137, + "step": 16665 + }, + { + "epoch": 1.1326267155863567, + "grad_norm": 0.838075578212738, + "learning_rate": 0.0008584811115640713, + "loss": 3.6998, + "step": 16670 + }, + { + "epoch": 1.1329664356570186, + "grad_norm": 0.843279242515564, + "learning_rate": 0.0008584386465552385, + "loss": 3.6292, + "step": 16675 + }, + { + "epoch": 1.1333061557276805, + "grad_norm": 0.8681386113166809, + "learning_rate": 0.0008583961815464057, + "loss": 3.5774, + "step": 16680 + }, + { + "epoch": 1.133645875798342, + "grad_norm": 1.1805564165115356, + "learning_rate": 0.0008583537165375732, + "loss": 3.6771, + "step": 16685 + }, + { + "epoch": 1.133985595869004, + "grad_norm": 1.0537199974060059, + "learning_rate": 0.0008583112515287404, + "loss": 3.7439, + "step": 16690 + }, + { + "epoch": 1.1343253159396658, + "grad_norm": 0.7703849077224731, + "learning_rate": 0.0008582687865199076, + "loss": 3.5656, + "step": 16695 + }, + { + "epoch": 1.1346650360103274, + "grad_norm": 0.9753703474998474, + "learning_rate": 0.0008582263215110749, + "loss": 3.341, + "step": 16700 + }, + { + "epoch": 1.1350047560809893, + "grad_norm": 0.7480882406234741, + "learning_rate": 0.0008581838565022422, + "loss": 3.7014, + "step": 16705 + }, + { + "epoch": 1.1353444761516511, + "grad_norm": 0.9739485383033752, + "learning_rate": 0.0008581413914934094, + "loss": 3.7192, + "step": 16710 + }, + { + "epoch": 1.1356841962223128, + "grad_norm": 0.7217869162559509, + "learning_rate": 0.0008580989264845767, + "loss": 3.5613, + "step": 16715 + }, + { + "epoch": 1.1360239162929746, + "grad_norm": 0.7310523986816406, + "learning_rate": 0.0008580564614757441, + "loss": 3.8005, + "step": 16720 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.828133761882782, + "learning_rate": 0.0008580139964669113, + "loss": 3.6132, + "step": 16725 + }, + { + "epoch": 1.136703356434298, + "grad_norm": 0.7488837838172913, + "learning_rate": 0.0008579715314580786, + "loss": 3.7826, + "step": 16730 + }, + { + "epoch": 1.13704307650496, + "grad_norm": 1.1500097513198853, + "learning_rate": 0.0008579290664492459, + "loss": 3.794, + "step": 16735 + }, + { + "epoch": 1.1373827965756216, + "grad_norm": 0.9793196320533752, + "learning_rate": 0.0008578866014404131, + "loss": 3.4909, + "step": 16740 + }, + { + "epoch": 1.1377225166462834, + "grad_norm": 0.7478467226028442, + "learning_rate": 0.0008578441364315804, + "loss": 3.572, + "step": 16745 + }, + { + "epoch": 1.1380622367169453, + "grad_norm": 0.978670060634613, + "learning_rate": 0.0008578016714227476, + "loss": 3.6292, + "step": 16750 + }, + { + "epoch": 1.138401956787607, + "grad_norm": 1.165989637374878, + "learning_rate": 0.000857759206413915, + "loss": 3.8286, + "step": 16755 + }, + { + "epoch": 1.1387416768582688, + "grad_norm": 0.9468324184417725, + "learning_rate": 0.0008577167414050823, + "loss": 3.5253, + "step": 16760 + }, + { + "epoch": 1.1390813969289306, + "grad_norm": 0.8043177127838135, + "learning_rate": 0.0008576742763962495, + "loss": 3.8567, + "step": 16765 + }, + { + "epoch": 1.1394211169995923, + "grad_norm": 0.970108151435852, + "learning_rate": 0.0008576318113874168, + "loss": 3.6461, + "step": 16770 + }, + { + "epoch": 1.139760837070254, + "grad_norm": 0.9052731990814209, + "learning_rate": 0.0008575893463785841, + "loss": 3.5674, + "step": 16775 + }, + { + "epoch": 1.140100557140916, + "grad_norm": 0.7988507151603699, + "learning_rate": 0.0008575468813697513, + "loss": 3.5781, + "step": 16780 + }, + { + "epoch": 1.1404402772115776, + "grad_norm": 0.8848839402198792, + "learning_rate": 0.0008575044163609185, + "loss": 3.6447, + "step": 16785 + }, + { + "epoch": 1.1407799972822394, + "grad_norm": 0.7153761386871338, + "learning_rate": 0.000857461951352086, + "loss": 3.8154, + "step": 16790 + }, + { + "epoch": 1.1411197173529013, + "grad_norm": 0.928085446357727, + "learning_rate": 0.0008574194863432532, + "loss": 3.428, + "step": 16795 + }, + { + "epoch": 1.141459437423563, + "grad_norm": 0.8604159355163574, + "learning_rate": 0.0008573770213344204, + "loss": 3.7002, + "step": 16800 + }, + { + "epoch": 1.1417991574942248, + "grad_norm": 1.1223363876342773, + "learning_rate": 0.0008573345563255878, + "loss": 3.7198, + "step": 16805 + }, + { + "epoch": 1.1421388775648866, + "grad_norm": 0.8849937915802002, + "learning_rate": 0.000857292091316755, + "loss": 3.7252, + "step": 16810 + }, + { + "epoch": 1.1424785976355483, + "grad_norm": 1.3976622819900513, + "learning_rate": 0.0008572496263079222, + "loss": 3.5899, + "step": 16815 + }, + { + "epoch": 1.1428183177062101, + "grad_norm": 0.7461236119270325, + "learning_rate": 0.0008572071612990896, + "loss": 3.4622, + "step": 16820 + }, + { + "epoch": 1.143158037776872, + "grad_norm": 0.9840811491012573, + "learning_rate": 0.0008571646962902569, + "loss": 3.5186, + "step": 16825 + }, + { + "epoch": 1.1434977578475336, + "grad_norm": 0.7931722402572632, + "learning_rate": 0.0008571222312814241, + "loss": 3.5883, + "step": 16830 + }, + { + "epoch": 1.1438374779181955, + "grad_norm": 0.8341879844665527, + "learning_rate": 0.0008570797662725915, + "loss": 3.7086, + "step": 16835 + }, + { + "epoch": 1.144177197988857, + "grad_norm": 0.769695520401001, + "learning_rate": 0.0008570373012637587, + "loss": 3.6759, + "step": 16840 + }, + { + "epoch": 1.144516918059519, + "grad_norm": 0.7257293462753296, + "learning_rate": 0.0008569948362549259, + "loss": 3.6387, + "step": 16845 + }, + { + "epoch": 1.1448566381301808, + "grad_norm": 0.8448808789253235, + "learning_rate": 0.0008569523712460932, + "loss": 3.7656, + "step": 16850 + }, + { + "epoch": 1.1451963582008424, + "grad_norm": 1.2357338666915894, + "learning_rate": 0.0008569099062372605, + "loss": 3.5426, + "step": 16855 + }, + { + "epoch": 1.1455360782715043, + "grad_norm": 0.660187840461731, + "learning_rate": 0.0008568674412284278, + "loss": 3.7469, + "step": 16860 + }, + { + "epoch": 1.1458757983421661, + "grad_norm": 1.1049987077713013, + "learning_rate": 0.0008568249762195951, + "loss": 3.4256, + "step": 16865 + }, + { + "epoch": 1.1462155184128278, + "grad_norm": 0.8352718949317932, + "learning_rate": 0.0008567825112107624, + "loss": 3.5055, + "step": 16870 + }, + { + "epoch": 1.1465552384834896, + "grad_norm": 0.9415690898895264, + "learning_rate": 0.0008567400462019296, + "loss": 3.8831, + "step": 16875 + }, + { + "epoch": 1.1468949585541515, + "grad_norm": 0.995897114276886, + "learning_rate": 0.0008566975811930969, + "loss": 3.6291, + "step": 16880 + }, + { + "epoch": 1.147234678624813, + "grad_norm": 0.8872207999229431, + "learning_rate": 0.0008566551161842641, + "loss": 3.6461, + "step": 16885 + }, + { + "epoch": 1.147574398695475, + "grad_norm": 0.8301382064819336, + "learning_rate": 0.0008566126511754314, + "loss": 3.6829, + "step": 16890 + }, + { + "epoch": 1.1479141187661366, + "grad_norm": 0.9403182864189148, + "learning_rate": 0.0008565701861665988, + "loss": 3.6154, + "step": 16895 + }, + { + "epoch": 1.1482538388367984, + "grad_norm": 0.8394535779953003, + "learning_rate": 0.000856527721157766, + "loss": 3.4917, + "step": 16900 + }, + { + "epoch": 1.1485935589074603, + "grad_norm": 0.7375920414924622, + "learning_rate": 0.0008564852561489333, + "loss": 3.7095, + "step": 16905 + }, + { + "epoch": 1.148933278978122, + "grad_norm": 0.8799547553062439, + "learning_rate": 0.0008564427911401006, + "loss": 3.5292, + "step": 16910 + }, + { + "epoch": 1.1492729990487838, + "grad_norm": 1.0603461265563965, + "learning_rate": 0.0008564003261312678, + "loss": 3.4986, + "step": 16915 + }, + { + "epoch": 1.1496127191194456, + "grad_norm": 0.9857471585273743, + "learning_rate": 0.000856357861122435, + "loss": 3.4724, + "step": 16920 + }, + { + "epoch": 1.1499524391901073, + "grad_norm": 0.9007765054702759, + "learning_rate": 0.0008563153961136024, + "loss": 3.5385, + "step": 16925 + }, + { + "epoch": 1.150292159260769, + "grad_norm": 1.0591518878936768, + "learning_rate": 0.0008562729311047697, + "loss": 3.6456, + "step": 16930 + }, + { + "epoch": 1.150631879331431, + "grad_norm": 1.0703179836273193, + "learning_rate": 0.0008562304660959369, + "loss": 3.5201, + "step": 16935 + }, + { + "epoch": 1.1509715994020926, + "grad_norm": 0.7310361266136169, + "learning_rate": 0.0008561880010871043, + "loss": 3.7084, + "step": 16940 + }, + { + "epoch": 1.1513113194727544, + "grad_norm": 0.8327747583389282, + "learning_rate": 0.0008561455360782715, + "loss": 3.4584, + "step": 16945 + }, + { + "epoch": 1.1516510395434163, + "grad_norm": 0.6871694922447205, + "learning_rate": 0.0008561030710694388, + "loss": 3.6828, + "step": 16950 + }, + { + "epoch": 1.151990759614078, + "grad_norm": 0.6984290480613708, + "learning_rate": 0.0008560606060606061, + "loss": 3.3014, + "step": 16955 + }, + { + "epoch": 1.1523304796847398, + "grad_norm": 0.6847667694091797, + "learning_rate": 0.0008560181410517733, + "loss": 3.4186, + "step": 16960 + }, + { + "epoch": 1.1526701997554016, + "grad_norm": 0.7431395053863525, + "learning_rate": 0.0008559756760429407, + "loss": 3.5889, + "step": 16965 + }, + { + "epoch": 1.1530099198260633, + "grad_norm": 0.90439373254776, + "learning_rate": 0.000855933211034108, + "loss": 3.5371, + "step": 16970 + }, + { + "epoch": 1.1533496398967251, + "grad_norm": 1.0250747203826904, + "learning_rate": 0.0008558907460252752, + "loss": 3.5035, + "step": 16975 + }, + { + "epoch": 1.153689359967387, + "grad_norm": 1.0194194316864014, + "learning_rate": 0.0008558482810164425, + "loss": 3.5559, + "step": 16980 + }, + { + "epoch": 1.1540290800380486, + "grad_norm": 0.916985273361206, + "learning_rate": 0.0008558058160076097, + "loss": 3.7909, + "step": 16985 + }, + { + "epoch": 1.1543688001087105, + "grad_norm": 0.9159713387489319, + "learning_rate": 0.000855763350998777, + "loss": 3.4499, + "step": 16990 + }, + { + "epoch": 1.1547085201793723, + "grad_norm": 0.8740003705024719, + "learning_rate": 0.0008557208859899443, + "loss": 3.6376, + "step": 16995 + }, + { + "epoch": 1.155048240250034, + "grad_norm": 0.9219914078712463, + "learning_rate": 0.0008556784209811116, + "loss": 3.5669, + "step": 17000 + }, + { + "epoch": 1.1553879603206958, + "grad_norm": 0.9303038120269775, + "learning_rate": 0.0008556359559722789, + "loss": 3.6126, + "step": 17005 + }, + { + "epoch": 1.1557276803913574, + "grad_norm": 0.8053663372993469, + "learning_rate": 0.0008555934909634462, + "loss": 3.693, + "step": 17010 + }, + { + "epoch": 1.1560674004620193, + "grad_norm": 0.7483538389205933, + "learning_rate": 0.0008555510259546134, + "loss": 3.7773, + "step": 17015 + }, + { + "epoch": 1.1564071205326811, + "grad_norm": 0.6888203024864197, + "learning_rate": 0.0008555085609457807, + "loss": 3.6581, + "step": 17020 + }, + { + "epoch": 1.1567468406033428, + "grad_norm": 0.9090051651000977, + "learning_rate": 0.000855466095936948, + "loss": 3.7956, + "step": 17025 + }, + { + "epoch": 1.1570865606740046, + "grad_norm": 1.0869661569595337, + "learning_rate": 0.0008554236309281152, + "loss": 3.5706, + "step": 17030 + }, + { + "epoch": 1.1574262807446665, + "grad_norm": 0.9737747311592102, + "learning_rate": 0.0008553811659192825, + "loss": 3.5667, + "step": 17035 + }, + { + "epoch": 1.157766000815328, + "grad_norm": 1.0313316583633423, + "learning_rate": 0.0008553387009104499, + "loss": 3.3744, + "step": 17040 + }, + { + "epoch": 1.15810572088599, + "grad_norm": 2.3488965034484863, + "learning_rate": 0.0008552962359016171, + "loss": 3.8816, + "step": 17045 + }, + { + "epoch": 1.1584454409566518, + "grad_norm": 0.8140867352485657, + "learning_rate": 0.0008552537708927843, + "loss": 3.3312, + "step": 17050 + }, + { + "epoch": 1.1587851610273134, + "grad_norm": 0.7494817972183228, + "learning_rate": 0.0008552113058839517, + "loss": 3.6381, + "step": 17055 + }, + { + "epoch": 1.1591248810979753, + "grad_norm": 0.7074995040893555, + "learning_rate": 0.0008551688408751189, + "loss": 3.6187, + "step": 17060 + }, + { + "epoch": 1.159464601168637, + "grad_norm": 0.7154060006141663, + "learning_rate": 0.0008551263758662861, + "loss": 3.7599, + "step": 17065 + }, + { + "epoch": 1.1598043212392988, + "grad_norm": 0.7824446558952332, + "learning_rate": 0.0008550839108574536, + "loss": 3.5637, + "step": 17070 + }, + { + "epoch": 1.1601440413099606, + "grad_norm": 0.8434299230575562, + "learning_rate": 0.0008550414458486208, + "loss": 3.7061, + "step": 17075 + }, + { + "epoch": 1.1604837613806223, + "grad_norm": 1.1458944082260132, + "learning_rate": 0.000854998980839788, + "loss": 3.3851, + "step": 17080 + }, + { + "epoch": 1.1608234814512841, + "grad_norm": 0.8063488006591797, + "learning_rate": 0.0008549565158309553, + "loss": 3.5811, + "step": 17085 + }, + { + "epoch": 1.161163201521946, + "grad_norm": 0.735196590423584, + "learning_rate": 0.0008549140508221226, + "loss": 3.4913, + "step": 17090 + }, + { + "epoch": 1.1615029215926076, + "grad_norm": 0.8051621317863464, + "learning_rate": 0.0008548715858132898, + "loss": 3.5612, + "step": 17095 + }, + { + "epoch": 1.1618426416632694, + "grad_norm": 0.8404978513717651, + "learning_rate": 0.0008548291208044571, + "loss": 3.5147, + "step": 17100 + }, + { + "epoch": 1.1621823617339313, + "grad_norm": 0.7084592580795288, + "learning_rate": 0.0008547866557956245, + "loss": 3.6023, + "step": 17105 + }, + { + "epoch": 1.162522081804593, + "grad_norm": 0.666179358959198, + "learning_rate": 0.0008547441907867917, + "loss": 3.4637, + "step": 17110 + }, + { + "epoch": 1.1628618018752548, + "grad_norm": 0.7333016991615295, + "learning_rate": 0.000854701725777959, + "loss": 3.4193, + "step": 17115 + }, + { + "epoch": 1.1632015219459166, + "grad_norm": 0.7837862968444824, + "learning_rate": 0.0008546592607691263, + "loss": 3.5966, + "step": 17120 + }, + { + "epoch": 1.1635412420165783, + "grad_norm": 0.8079808950424194, + "learning_rate": 0.0008546167957602935, + "loss": 3.5279, + "step": 17125 + }, + { + "epoch": 1.1638809620872401, + "grad_norm": 0.6643874049186707, + "learning_rate": 0.0008545743307514608, + "loss": 3.8924, + "step": 17130 + }, + { + "epoch": 1.164220682157902, + "grad_norm": 0.8216831088066101, + "learning_rate": 0.000854531865742628, + "loss": 3.361, + "step": 17135 + }, + { + "epoch": 1.1645604022285636, + "grad_norm": 0.8460012674331665, + "learning_rate": 0.0008544894007337954, + "loss": 3.7716, + "step": 17140 + }, + { + "epoch": 1.1649001222992255, + "grad_norm": 1.0371475219726562, + "learning_rate": 0.0008544469357249627, + "loss": 3.5967, + "step": 17145 + }, + { + "epoch": 1.1652398423698873, + "grad_norm": 1.1121952533721924, + "learning_rate": 0.0008544044707161299, + "loss": 3.673, + "step": 17150 + }, + { + "epoch": 1.165579562440549, + "grad_norm": 0.7458134293556213, + "learning_rate": 0.0008543620057072972, + "loss": 3.6568, + "step": 17155 + }, + { + "epoch": 1.1659192825112108, + "grad_norm": 1.1678210496902466, + "learning_rate": 0.0008543195406984645, + "loss": 3.7601, + "step": 17160 + }, + { + "epoch": 1.1662590025818727, + "grad_norm": 0.8681045770645142, + "learning_rate": 0.0008542770756896317, + "loss": 3.5989, + "step": 17165 + }, + { + "epoch": 1.1665987226525343, + "grad_norm": 0.6331527829170227, + "learning_rate": 0.000854234610680799, + "loss": 3.4708, + "step": 17170 + }, + { + "epoch": 1.1669384427231961, + "grad_norm": 0.9418392777442932, + "learning_rate": 0.0008541921456719664, + "loss": 3.8482, + "step": 17175 + }, + { + "epoch": 1.1672781627938578, + "grad_norm": 1.2741209268569946, + "learning_rate": 0.0008541496806631336, + "loss": 3.7523, + "step": 17180 + }, + { + "epoch": 1.1676178828645196, + "grad_norm": 0.6857944130897522, + "learning_rate": 0.0008541072156543008, + "loss": 3.6796, + "step": 17185 + }, + { + "epoch": 1.1679576029351815, + "grad_norm": 0.846011757850647, + "learning_rate": 0.0008540647506454682, + "loss": 3.371, + "step": 17190 + }, + { + "epoch": 1.168297323005843, + "grad_norm": 1.0942374467849731, + "learning_rate": 0.0008540222856366354, + "loss": 3.5144, + "step": 17195 + }, + { + "epoch": 1.168637043076505, + "grad_norm": 0.9394577741622925, + "learning_rate": 0.0008539798206278026, + "loss": 3.85, + "step": 17200 + }, + { + "epoch": 1.1689767631471668, + "grad_norm": 0.704520046710968, + "learning_rate": 0.00085393735561897, + "loss": 3.5581, + "step": 17205 + }, + { + "epoch": 1.1693164832178284, + "grad_norm": 0.7548087239265442, + "learning_rate": 0.0008538948906101373, + "loss": 3.5903, + "step": 17210 + }, + { + "epoch": 1.1696562032884903, + "grad_norm": 0.7969580888748169, + "learning_rate": 0.0008538524256013045, + "loss": 3.5678, + "step": 17215 + }, + { + "epoch": 1.1699959233591521, + "grad_norm": 0.6301403641700745, + "learning_rate": 0.0008538099605924719, + "loss": 3.6598, + "step": 17220 + }, + { + "epoch": 1.1703356434298138, + "grad_norm": 1.5143200159072876, + "learning_rate": 0.0008537674955836391, + "loss": 3.5582, + "step": 17225 + }, + { + "epoch": 1.1706753635004756, + "grad_norm": 0.8551926016807556, + "learning_rate": 0.0008537250305748063, + "loss": 3.5755, + "step": 17230 + }, + { + "epoch": 1.1710150835711373, + "grad_norm": 0.9103666543960571, + "learning_rate": 0.0008536825655659736, + "loss": 3.3885, + "step": 17235 + }, + { + "epoch": 1.1713548036417991, + "grad_norm": 1.3102667331695557, + "learning_rate": 0.0008536401005571409, + "loss": 3.317, + "step": 17240 + }, + { + "epoch": 1.171694523712461, + "grad_norm": 1.0636422634124756, + "learning_rate": 0.0008535976355483082, + "loss": 3.5073, + "step": 17245 + }, + { + "epoch": 1.1720342437831226, + "grad_norm": 0.9613081812858582, + "learning_rate": 0.0008535551705394755, + "loss": 3.6003, + "step": 17250 + }, + { + "epoch": 1.1723739638537845, + "grad_norm": 0.9186931848526001, + "learning_rate": 0.0008535127055306428, + "loss": 3.7816, + "step": 17255 + }, + { + "epoch": 1.1727136839244463, + "grad_norm": 0.7725684642791748, + "learning_rate": 0.00085347024052181, + "loss": 3.6469, + "step": 17260 + }, + { + "epoch": 1.173053403995108, + "grad_norm": 0.7709172964096069, + "learning_rate": 0.0008534277755129773, + "loss": 3.6162, + "step": 17265 + }, + { + "epoch": 1.1733931240657698, + "grad_norm": 1.3551890850067139, + "learning_rate": 0.0008533853105041445, + "loss": 3.5648, + "step": 17270 + }, + { + "epoch": 1.1737328441364316, + "grad_norm": 0.7982603311538696, + "learning_rate": 0.0008533428454953118, + "loss": 3.3942, + "step": 17275 + }, + { + "epoch": 1.1740725642070933, + "grad_norm": 1.0073354244232178, + "learning_rate": 0.0008533003804864792, + "loss": 3.8218, + "step": 17280 + }, + { + "epoch": 1.1744122842777551, + "grad_norm": 0.9076274633407593, + "learning_rate": 0.0008532579154776464, + "loss": 3.5452, + "step": 17285 + }, + { + "epoch": 1.174752004348417, + "grad_norm": 0.790998637676239, + "learning_rate": 0.0008532154504688138, + "loss": 3.6598, + "step": 17290 + }, + { + "epoch": 1.1750917244190786, + "grad_norm": 0.83008873462677, + "learning_rate": 0.000853172985459981, + "loss": 3.7705, + "step": 17295 + }, + { + "epoch": 1.1754314444897405, + "grad_norm": 0.8287382125854492, + "learning_rate": 0.0008531305204511482, + "loss": 3.703, + "step": 17300 + }, + { + "epoch": 1.1757711645604023, + "grad_norm": 0.7898476123809814, + "learning_rate": 0.0008530880554423156, + "loss": 3.7425, + "step": 17305 + }, + { + "epoch": 1.176110884631064, + "grad_norm": 1.0086753368377686, + "learning_rate": 0.0008530455904334829, + "loss": 3.7613, + "step": 17310 + }, + { + "epoch": 1.1764506047017258, + "grad_norm": 1.0348018407821655, + "learning_rate": 0.0008530031254246501, + "loss": 3.59, + "step": 17315 + }, + { + "epoch": 1.1767903247723877, + "grad_norm": 0.7177422046661377, + "learning_rate": 0.0008529606604158175, + "loss": 3.8026, + "step": 17320 + }, + { + "epoch": 1.1771300448430493, + "grad_norm": 0.8460989594459534, + "learning_rate": 0.0008529181954069847, + "loss": 3.6052, + "step": 17325 + }, + { + "epoch": 1.1774697649137111, + "grad_norm": 0.7028475403785706, + "learning_rate": 0.0008528757303981519, + "loss": 3.8838, + "step": 17330 + }, + { + "epoch": 1.177809484984373, + "grad_norm": 0.6197447180747986, + "learning_rate": 0.0008528332653893192, + "loss": 3.7551, + "step": 17335 + }, + { + "epoch": 1.1781492050550346, + "grad_norm": 0.847804605960846, + "learning_rate": 0.0008527908003804865, + "loss": 3.6692, + "step": 17340 + }, + { + "epoch": 1.1784889251256965, + "grad_norm": 0.9328190088272095, + "learning_rate": 0.0008527483353716538, + "loss": 3.5473, + "step": 17345 + }, + { + "epoch": 1.178828645196358, + "grad_norm": 0.7523034811019897, + "learning_rate": 0.0008527058703628211, + "loss": 3.4777, + "step": 17350 + }, + { + "epoch": 1.17916836526702, + "grad_norm": 1.034130334854126, + "learning_rate": 0.0008526634053539884, + "loss": 3.4567, + "step": 17355 + }, + { + "epoch": 1.1795080853376818, + "grad_norm": 0.8503477573394775, + "learning_rate": 0.0008526209403451556, + "loss": 3.8045, + "step": 17360 + }, + { + "epoch": 1.1798478054083434, + "grad_norm": 0.9190613627433777, + "learning_rate": 0.0008525784753363229, + "loss": 3.3263, + "step": 17365 + }, + { + "epoch": 1.1801875254790053, + "grad_norm": 0.7340781092643738, + "learning_rate": 0.0008525360103274901, + "loss": 3.5477, + "step": 17370 + }, + { + "epoch": 1.1805272455496671, + "grad_norm": 0.7570423483848572, + "learning_rate": 0.0008524935453186574, + "loss": 3.564, + "step": 17375 + }, + { + "epoch": 1.1808669656203288, + "grad_norm": 0.7497008442878723, + "learning_rate": 0.0008524510803098248, + "loss": 3.6466, + "step": 17380 + }, + { + "epoch": 1.1812066856909906, + "grad_norm": 0.942004919052124, + "learning_rate": 0.000852408615300992, + "loss": 3.3693, + "step": 17385 + }, + { + "epoch": 1.1815464057616525, + "grad_norm": 0.9698643684387207, + "learning_rate": 0.0008523661502921593, + "loss": 3.5443, + "step": 17390 + }, + { + "epoch": 1.1818861258323141, + "grad_norm": 0.9412668347358704, + "learning_rate": 0.0008523236852833266, + "loss": 3.4259, + "step": 17395 + }, + { + "epoch": 1.182225845902976, + "grad_norm": 0.8385773301124573, + "learning_rate": 0.0008522812202744938, + "loss": 3.4531, + "step": 17400 + }, + { + "epoch": 1.1825655659736376, + "grad_norm": 0.9761438369750977, + "learning_rate": 0.0008522387552656611, + "loss": 3.4314, + "step": 17405 + }, + { + "epoch": 1.1829052860442995, + "grad_norm": 0.7691978812217712, + "learning_rate": 0.0008521962902568284, + "loss": 3.7518, + "step": 17410 + }, + { + "epoch": 1.1832450061149613, + "grad_norm": 0.7829694747924805, + "learning_rate": 0.0008521538252479957, + "loss": 3.4818, + "step": 17415 + }, + { + "epoch": 1.183584726185623, + "grad_norm": 0.9436149001121521, + "learning_rate": 0.000852111360239163, + "loss": 3.5095, + "step": 17420 + }, + { + "epoch": 1.1839244462562848, + "grad_norm": 0.8304212689399719, + "learning_rate": 0.0008520688952303303, + "loss": 3.5131, + "step": 17425 + }, + { + "epoch": 1.1842641663269466, + "grad_norm": 0.8037211894989014, + "learning_rate": 0.0008520264302214975, + "loss": 3.8004, + "step": 17430 + }, + { + "epoch": 1.1846038863976083, + "grad_norm": 0.7005459666252136, + "learning_rate": 0.0008519839652126647, + "loss": 3.9463, + "step": 17435 + }, + { + "epoch": 1.1849436064682701, + "grad_norm": 1.3546866178512573, + "learning_rate": 0.0008519415002038321, + "loss": 3.6001, + "step": 17440 + }, + { + "epoch": 1.185283326538932, + "grad_norm": 1.0215438604354858, + "learning_rate": 0.0008518990351949993, + "loss": 3.3405, + "step": 17445 + }, + { + "epoch": 1.1856230466095936, + "grad_norm": 0.8340763449668884, + "learning_rate": 0.0008518565701861666, + "loss": 3.5868, + "step": 17450 + }, + { + "epoch": 1.1859627666802555, + "grad_norm": 0.8255767822265625, + "learning_rate": 0.000851814105177334, + "loss": 3.7506, + "step": 17455 + }, + { + "epoch": 1.1863024867509173, + "grad_norm": 0.7084366083145142, + "learning_rate": 0.0008517716401685012, + "loss": 3.6279, + "step": 17460 + }, + { + "epoch": 1.186642206821579, + "grad_norm": 0.9024516344070435, + "learning_rate": 0.0008517291751596684, + "loss": 3.6496, + "step": 17465 + }, + { + "epoch": 1.1869819268922408, + "grad_norm": 0.7757954597473145, + "learning_rate": 0.0008516867101508358, + "loss": 3.7771, + "step": 17470 + }, + { + "epoch": 1.1873216469629027, + "grad_norm": 0.9139785170555115, + "learning_rate": 0.000851644245142003, + "loss": 3.2087, + "step": 17475 + }, + { + "epoch": 1.1876613670335643, + "grad_norm": 0.9041799902915955, + "learning_rate": 0.0008516017801331702, + "loss": 3.6828, + "step": 17480 + }, + { + "epoch": 1.1880010871042261, + "grad_norm": 0.9358307123184204, + "learning_rate": 0.0008515593151243376, + "loss": 3.6764, + "step": 17485 + }, + { + "epoch": 1.188340807174888, + "grad_norm": 0.803115725517273, + "learning_rate": 0.0008515168501155049, + "loss": 3.287, + "step": 17490 + }, + { + "epoch": 1.1886805272455496, + "grad_norm": 0.7065115571022034, + "learning_rate": 0.0008514743851066721, + "loss": 3.4009, + "step": 17495 + }, + { + "epoch": 1.1890202473162115, + "grad_norm": 0.7417965531349182, + "learning_rate": 0.0008514319200978394, + "loss": 3.4767, + "step": 17500 + }, + { + "epoch": 1.1893599673868733, + "grad_norm": 0.8897076845169067, + "learning_rate": 0.0008513894550890067, + "loss": 3.4677, + "step": 17505 + }, + { + "epoch": 1.189699687457535, + "grad_norm": 0.6463358402252197, + "learning_rate": 0.0008513469900801739, + "loss": 3.5118, + "step": 17510 + }, + { + "epoch": 1.1900394075281968, + "grad_norm": 0.7615615129470825, + "learning_rate": 0.0008513045250713412, + "loss": 3.8688, + "step": 17515 + }, + { + "epoch": 1.1903791275988584, + "grad_norm": 0.8635746240615845, + "learning_rate": 0.0008512620600625086, + "loss": 3.4913, + "step": 17520 + }, + { + "epoch": 1.1907188476695203, + "grad_norm": 1.1920677423477173, + "learning_rate": 0.0008512195950536758, + "loss": 3.6063, + "step": 17525 + }, + { + "epoch": 1.1910585677401822, + "grad_norm": 0.7194543480873108, + "learning_rate": 0.0008511771300448431, + "loss": 3.6342, + "step": 17530 + }, + { + "epoch": 1.1913982878108438, + "grad_norm": 0.7707424759864807, + "learning_rate": 0.0008511346650360103, + "loss": 3.7426, + "step": 17535 + }, + { + "epoch": 1.1917380078815056, + "grad_norm": 0.8255816102027893, + "learning_rate": 0.0008510922000271776, + "loss": 3.4751, + "step": 17540 + }, + { + "epoch": 1.1920777279521675, + "grad_norm": 0.9390706419944763, + "learning_rate": 0.0008510497350183449, + "loss": 3.6239, + "step": 17545 + }, + { + "epoch": 1.1924174480228291, + "grad_norm": 0.8158770203590393, + "learning_rate": 0.0008510072700095121, + "loss": 3.6842, + "step": 17550 + }, + { + "epoch": 1.192757168093491, + "grad_norm": 1.0708986520767212, + "learning_rate": 0.0008509648050006795, + "loss": 3.6046, + "step": 17555 + }, + { + "epoch": 1.1930968881641528, + "grad_norm": 0.8855700492858887, + "learning_rate": 0.0008509223399918468, + "loss": 3.6954, + "step": 17560 + }, + { + "epoch": 1.1934366082348145, + "grad_norm": 0.9229809045791626, + "learning_rate": 0.000850879874983014, + "loss": 3.3733, + "step": 17565 + }, + { + "epoch": 1.1937763283054763, + "grad_norm": 0.8274008631706238, + "learning_rate": 0.0008508374099741812, + "loss": 3.6709, + "step": 17570 + }, + { + "epoch": 1.194116048376138, + "grad_norm": 1.028684139251709, + "learning_rate": 0.0008507949449653486, + "loss": 3.3154, + "step": 17575 + }, + { + "epoch": 1.1944557684467998, + "grad_norm": 0.7452706098556519, + "learning_rate": 0.0008507524799565158, + "loss": 3.591, + "step": 17580 + }, + { + "epoch": 1.1947954885174616, + "grad_norm": 1.3129258155822754, + "learning_rate": 0.000850710014947683, + "loss": 3.7504, + "step": 17585 + }, + { + "epoch": 1.1951352085881233, + "grad_norm": 1.391825556755066, + "learning_rate": 0.0008506675499388505, + "loss": 3.7959, + "step": 17590 + }, + { + "epoch": 1.1954749286587851, + "grad_norm": 0.7262458801269531, + "learning_rate": 0.0008506250849300177, + "loss": 3.7386, + "step": 17595 + }, + { + "epoch": 1.195814648729447, + "grad_norm": 1.5830119848251343, + "learning_rate": 0.0008505826199211849, + "loss": 3.6533, + "step": 17600 + }, + { + "epoch": 1.1961543688001086, + "grad_norm": 0.8708205819129944, + "learning_rate": 0.0008505401549123523, + "loss": 3.462, + "step": 17605 + }, + { + "epoch": 1.1964940888707705, + "grad_norm": 0.787441074848175, + "learning_rate": 0.0008504976899035195, + "loss": 3.1661, + "step": 17610 + }, + { + "epoch": 1.1968338089414323, + "grad_norm": 0.9973732829093933, + "learning_rate": 0.0008504552248946867, + "loss": 3.6802, + "step": 17615 + }, + { + "epoch": 1.197173529012094, + "grad_norm": 0.8012798428535461, + "learning_rate": 0.000850412759885854, + "loss": 3.5097, + "step": 17620 + }, + { + "epoch": 1.1975132490827558, + "grad_norm": 0.9300970435142517, + "learning_rate": 0.0008503702948770214, + "loss": 3.3272, + "step": 17625 + }, + { + "epoch": 1.1978529691534177, + "grad_norm": 0.6912575960159302, + "learning_rate": 0.0008503278298681887, + "loss": 3.5335, + "step": 17630 + }, + { + "epoch": 1.1981926892240793, + "grad_norm": 0.6569003462791443, + "learning_rate": 0.0008502853648593559, + "loss": 3.433, + "step": 17635 + }, + { + "epoch": 1.1985324092947411, + "grad_norm": 0.8399405479431152, + "learning_rate": 0.0008502428998505232, + "loss": 3.7546, + "step": 17640 + }, + { + "epoch": 1.198872129365403, + "grad_norm": 0.9926168918609619, + "learning_rate": 0.0008502004348416905, + "loss": 3.5485, + "step": 17645 + }, + { + "epoch": 1.1992118494360646, + "grad_norm": 0.8499864339828491, + "learning_rate": 0.0008501579698328577, + "loss": 3.5906, + "step": 17650 + }, + { + "epoch": 1.1995515695067265, + "grad_norm": 0.7953423857688904, + "learning_rate": 0.000850115504824025, + "loss": 3.6608, + "step": 17655 + }, + { + "epoch": 1.1998912895773883, + "grad_norm": 0.8453109860420227, + "learning_rate": 0.0008500730398151924, + "loss": 3.6794, + "step": 17660 + }, + { + "epoch": 1.20023100964805, + "grad_norm": 0.9171738624572754, + "learning_rate": 0.0008500390678081261, + "loss": 3.7127, + "step": 17665 + }, + { + "epoch": 1.2005707297187118, + "grad_norm": 0.731246292591095, + "learning_rate": 0.0008499966027992933, + "loss": 3.5276, + "step": 17670 + }, + { + "epoch": 1.2009104497893737, + "grad_norm": 0.7757867574691772, + "learning_rate": 0.0008499541377904607, + "loss": 3.5418, + "step": 17675 + }, + { + "epoch": 1.2012501698600353, + "grad_norm": 0.6717503070831299, + "learning_rate": 0.000849911672781628, + "loss": 3.326, + "step": 17680 + }, + { + "epoch": 1.2015898899306972, + "grad_norm": 1.0334370136260986, + "learning_rate": 0.0008498692077727952, + "loss": 3.7153, + "step": 17685 + }, + { + "epoch": 1.2019296100013588, + "grad_norm": 0.6353085041046143, + "learning_rate": 0.0008498267427639626, + "loss": 3.8158, + "step": 17690 + }, + { + "epoch": 1.2022693300720206, + "grad_norm": 0.760971188545227, + "learning_rate": 0.0008497842777551298, + "loss": 3.415, + "step": 17695 + }, + { + "epoch": 1.2026090501426825, + "grad_norm": 0.6896660923957825, + "learning_rate": 0.000849741812746297, + "loss": 3.3859, + "step": 17700 + }, + { + "epoch": 1.2029487702133441, + "grad_norm": 1.1161986589431763, + "learning_rate": 0.0008496993477374643, + "loss": 3.3117, + "step": 17705 + }, + { + "epoch": 1.203288490284006, + "grad_norm": 0.8199714422225952, + "learning_rate": 0.0008496568827286316, + "loss": 3.9587, + "step": 17710 + }, + { + "epoch": 1.2036282103546678, + "grad_norm": 0.7936241030693054, + "learning_rate": 0.0008496144177197989, + "loss": 3.3397, + "step": 17715 + }, + { + "epoch": 1.2039679304253295, + "grad_norm": 0.8291743397712708, + "learning_rate": 0.0008495719527109662, + "loss": 3.7315, + "step": 17720 + }, + { + "epoch": 1.2043076504959913, + "grad_norm": 0.8162603378295898, + "learning_rate": 0.0008495294877021335, + "loss": 3.7429, + "step": 17725 + }, + { + "epoch": 1.2046473705666532, + "grad_norm": 0.9450045228004456, + "learning_rate": 0.0008494870226933007, + "loss": 3.5561, + "step": 17730 + }, + { + "epoch": 1.2049870906373148, + "grad_norm": 1.0194299221038818, + "learning_rate": 0.000849444557684468, + "loss": 3.5873, + "step": 17735 + }, + { + "epoch": 1.2053268107079766, + "grad_norm": 0.7259212732315063, + "learning_rate": 0.0008494020926756353, + "loss": 3.4581, + "step": 17740 + }, + { + "epoch": 1.2056665307786383, + "grad_norm": 0.6895436644554138, + "learning_rate": 0.0008493596276668025, + "loss": 3.6304, + "step": 17745 + }, + { + "epoch": 1.2060062508493001, + "grad_norm": 0.7685421109199524, + "learning_rate": 0.0008493171626579699, + "loss": 3.7463, + "step": 17750 + }, + { + "epoch": 1.206345970919962, + "grad_norm": 1.0622243881225586, + "learning_rate": 0.0008492746976491372, + "loss": 3.4073, + "step": 17755 + }, + { + "epoch": 1.2066856909906236, + "grad_norm": 0.7880778312683105, + "learning_rate": 0.0008492322326403044, + "loss": 3.6727, + "step": 17760 + }, + { + "epoch": 1.2070254110612855, + "grad_norm": 0.7224171161651611, + "learning_rate": 0.0008491897676314717, + "loss": 3.6814, + "step": 17765 + }, + { + "epoch": 1.2073651311319473, + "grad_norm": 0.7976996302604675, + "learning_rate": 0.0008491473026226389, + "loss": 3.4074, + "step": 17770 + }, + { + "epoch": 1.207704851202609, + "grad_norm": 0.7182407975196838, + "learning_rate": 0.0008491048376138062, + "loss": 3.76, + "step": 17775 + }, + { + "epoch": 1.2080445712732708, + "grad_norm": 3.1275312900543213, + "learning_rate": 0.0008490623726049735, + "loss": 3.5892, + "step": 17780 + }, + { + "epoch": 1.2083842913439327, + "grad_norm": 0.8986103534698486, + "learning_rate": 0.0008490199075961408, + "loss": 3.5588, + "step": 17785 + }, + { + "epoch": 1.2087240114145943, + "grad_norm": 1.2770617008209229, + "learning_rate": 0.0008489774425873081, + "loss": 3.9451, + "step": 17790 + }, + { + "epoch": 1.2090637314852561, + "grad_norm": 0.6906563639640808, + "learning_rate": 0.0008489349775784754, + "loss": 3.6154, + "step": 17795 + }, + { + "epoch": 1.209403451555918, + "grad_norm": 0.867144763469696, + "learning_rate": 0.0008488925125696426, + "loss": 3.7988, + "step": 17800 + }, + { + "epoch": 1.2097431716265796, + "grad_norm": 0.8650851249694824, + "learning_rate": 0.0008488500475608098, + "loss": 3.3531, + "step": 17805 + }, + { + "epoch": 1.2100828916972415, + "grad_norm": 1.0674773454666138, + "learning_rate": 0.0008488075825519772, + "loss": 3.5301, + "step": 17810 + }, + { + "epoch": 1.2104226117679033, + "grad_norm": 1.11525559425354, + "learning_rate": 0.0008487651175431444, + "loss": 3.7138, + "step": 17815 + }, + { + "epoch": 1.210762331838565, + "grad_norm": 0.72477126121521, + "learning_rate": 0.0008487226525343117, + "loss": 3.6422, + "step": 17820 + }, + { + "epoch": 1.2111020519092268, + "grad_norm": 1.1667877435684204, + "learning_rate": 0.0008486801875254791, + "loss": 3.6516, + "step": 17825 + }, + { + "epoch": 1.2114417719798887, + "grad_norm": 0.8007199764251709, + "learning_rate": 0.0008486377225166463, + "loss": 3.7093, + "step": 17830 + }, + { + "epoch": 1.2117814920505503, + "grad_norm": 0.8265978097915649, + "learning_rate": 0.0008485952575078136, + "loss": 3.6457, + "step": 17835 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.8660348653793335, + "learning_rate": 0.0008485527924989809, + "loss": 3.8052, + "step": 17840 + }, + { + "epoch": 1.212460932191874, + "grad_norm": 0.7841839790344238, + "learning_rate": 0.0008485103274901481, + "loss": 3.7092, + "step": 17845 + }, + { + "epoch": 1.2128006522625356, + "grad_norm": 0.9366231560707092, + "learning_rate": 0.0008484678624813154, + "loss": 3.678, + "step": 17850 + }, + { + "epoch": 1.2131403723331975, + "grad_norm": 0.8529143333435059, + "learning_rate": 0.0008484253974724828, + "loss": 3.7632, + "step": 17855 + }, + { + "epoch": 1.2134800924038591, + "grad_norm": 0.7866474390029907, + "learning_rate": 0.00084838293246365, + "loss": 3.5512, + "step": 17860 + }, + { + "epoch": 1.213819812474521, + "grad_norm": 1.0206506252288818, + "learning_rate": 0.0008483404674548173, + "loss": 3.7976, + "step": 17865 + }, + { + "epoch": 1.2141595325451828, + "grad_norm": 0.8452919125556946, + "learning_rate": 0.0008482980024459845, + "loss": 3.4737, + "step": 17870 + }, + { + "epoch": 1.2144992526158445, + "grad_norm": 1.5598143339157104, + "learning_rate": 0.0008482555374371518, + "loss": 3.8863, + "step": 17875 + }, + { + "epoch": 1.2148389726865063, + "grad_norm": 0.7852082252502441, + "learning_rate": 0.0008482130724283191, + "loss": 3.7011, + "step": 17880 + }, + { + "epoch": 1.2151786927571682, + "grad_norm": 0.8847712278366089, + "learning_rate": 0.0008481706074194863, + "loss": 3.407, + "step": 17885 + }, + { + "epoch": 1.2155184128278298, + "grad_norm": 0.6900823712348938, + "learning_rate": 0.0008481281424106537, + "loss": 3.6824, + "step": 17890 + }, + { + "epoch": 1.2158581328984917, + "grad_norm": 0.7957562208175659, + "learning_rate": 0.000848085677401821, + "loss": 3.7815, + "step": 17895 + }, + { + "epoch": 1.2161978529691535, + "grad_norm": 0.6996729969978333, + "learning_rate": 0.0008480432123929882, + "loss": 3.7359, + "step": 17900 + }, + { + "epoch": 1.2165375730398151, + "grad_norm": 1.1650663614273071, + "learning_rate": 0.0008480007473841554, + "loss": 3.6604, + "step": 17905 + }, + { + "epoch": 1.216877293110477, + "grad_norm": 0.812868595123291, + "learning_rate": 0.0008479582823753228, + "loss": 3.5066, + "step": 17910 + }, + { + "epoch": 1.2172170131811386, + "grad_norm": 1.185300350189209, + "learning_rate": 0.00084791581736649, + "loss": 3.582, + "step": 17915 + }, + { + "epoch": 1.2175567332518005, + "grad_norm": 0.8237152695655823, + "learning_rate": 0.0008478733523576572, + "loss": 3.7495, + "step": 17920 + }, + { + "epoch": 1.2178964533224623, + "grad_norm": 1.0327682495117188, + "learning_rate": 0.0008478308873488247, + "loss": 3.7164, + "step": 17925 + }, + { + "epoch": 1.218236173393124, + "grad_norm": 0.8666285276412964, + "learning_rate": 0.0008477884223399919, + "loss": 3.4097, + "step": 17930 + }, + { + "epoch": 1.2185758934637858, + "grad_norm": 0.7448886632919312, + "learning_rate": 0.0008477459573311591, + "loss": 3.7621, + "step": 17935 + }, + { + "epoch": 1.2189156135344477, + "grad_norm": 0.8761913776397705, + "learning_rate": 0.0008477034923223265, + "loss": 3.4411, + "step": 17940 + }, + { + "epoch": 1.2192553336051093, + "grad_norm": 0.6797947883605957, + "learning_rate": 0.0008476610273134937, + "loss": 3.9488, + "step": 17945 + }, + { + "epoch": 1.2195950536757711, + "grad_norm": 1.0183029174804688, + "learning_rate": 0.0008476185623046609, + "loss": 3.5328, + "step": 17950 + }, + { + "epoch": 1.219934773746433, + "grad_norm": 0.9195513725280762, + "learning_rate": 0.0008475760972958282, + "loss": 3.535, + "step": 17955 + }, + { + "epoch": 1.2202744938170946, + "grad_norm": 0.7924513816833496, + "learning_rate": 0.0008475336322869956, + "loss": 3.6824, + "step": 17960 + }, + { + "epoch": 1.2206142138877565, + "grad_norm": 0.8071003556251526, + "learning_rate": 0.0008474911672781628, + "loss": 3.5872, + "step": 17965 + }, + { + "epoch": 1.2209539339584183, + "grad_norm": 0.6600189208984375, + "learning_rate": 0.0008474487022693301, + "loss": 3.6602, + "step": 17970 + }, + { + "epoch": 1.22129365402908, + "grad_norm": 0.9115119576454163, + "learning_rate": 0.0008474062372604974, + "loss": 3.6486, + "step": 17975 + }, + { + "epoch": 1.2216333740997418, + "grad_norm": 0.9429622888565063, + "learning_rate": 0.0008473637722516646, + "loss": 3.6138, + "step": 17980 + }, + { + "epoch": 1.2219730941704037, + "grad_norm": 0.8262921571731567, + "learning_rate": 0.0008473213072428319, + "loss": 3.4, + "step": 17985 + }, + { + "epoch": 1.2223128142410653, + "grad_norm": 0.8943473696708679, + "learning_rate": 0.0008472788422339992, + "loss": 3.7273, + "step": 17990 + }, + { + "epoch": 1.2226525343117272, + "grad_norm": 0.8614509701728821, + "learning_rate": 0.0008472363772251665, + "loss": 3.7009, + "step": 17995 + }, + { + "epoch": 1.222992254382389, + "grad_norm": 1.2057223320007324, + "learning_rate": 0.0008471939122163338, + "loss": 3.5689, + "step": 18000 + }, + { + "epoch": 1.2233319744530506, + "grad_norm": 0.7658494710922241, + "learning_rate": 0.000847151447207501, + "loss": 3.5281, + "step": 18005 + }, + { + "epoch": 1.2236716945237125, + "grad_norm": 0.8149915933609009, + "learning_rate": 0.0008471089821986683, + "loss": 3.6291, + "step": 18010 + }, + { + "epoch": 1.2240114145943743, + "grad_norm": 0.677395761013031, + "learning_rate": 0.0008470665171898356, + "loss": 3.5135, + "step": 18015 + }, + { + "epoch": 1.224351134665036, + "grad_norm": 0.7019641399383545, + "learning_rate": 0.0008470240521810028, + "loss": 3.8007, + "step": 18020 + }, + { + "epoch": 1.2246908547356978, + "grad_norm": 0.7035365700721741, + "learning_rate": 0.0008469815871721701, + "loss": 3.5212, + "step": 18025 + }, + { + "epoch": 1.2250305748063597, + "grad_norm": 0.9123901128768921, + "learning_rate": 0.0008469391221633375, + "loss": 3.4407, + "step": 18030 + }, + { + "epoch": 1.2253702948770213, + "grad_norm": 0.7771115899085999, + "learning_rate": 0.0008468966571545047, + "loss": 3.9167, + "step": 18035 + }, + { + "epoch": 1.2257100149476832, + "grad_norm": 0.9494962096214294, + "learning_rate": 0.000846854192145672, + "loss": 3.7407, + "step": 18040 + }, + { + "epoch": 1.2260497350183448, + "grad_norm": 0.7547560334205627, + "learning_rate": 0.0008468117271368393, + "loss": 3.8347, + "step": 18045 + }, + { + "epoch": 1.2263894550890067, + "grad_norm": 0.7536965608596802, + "learning_rate": 0.0008467692621280065, + "loss": 3.5069, + "step": 18050 + }, + { + "epoch": 1.2267291751596685, + "grad_norm": 0.8819370865821838, + "learning_rate": 0.0008467267971191737, + "loss": 3.6515, + "step": 18055 + }, + { + "epoch": 1.2270688952303301, + "grad_norm": 0.9105040431022644, + "learning_rate": 0.0008466843321103411, + "loss": 3.3059, + "step": 18060 + }, + { + "epoch": 1.227408615300992, + "grad_norm": 1.2674083709716797, + "learning_rate": 0.0008466418671015084, + "loss": 3.5657, + "step": 18065 + }, + { + "epoch": 1.2277483353716538, + "grad_norm": 0.8360303640365601, + "learning_rate": 0.0008465994020926756, + "loss": 3.7358, + "step": 18070 + }, + { + "epoch": 1.2280880554423155, + "grad_norm": 0.7850673198699951, + "learning_rate": 0.000846556937083843, + "loss": 3.7351, + "step": 18075 + }, + { + "epoch": 1.2284277755129773, + "grad_norm": 0.8816108107566833, + "learning_rate": 0.0008465144720750102, + "loss": 3.4173, + "step": 18080 + }, + { + "epoch": 1.228767495583639, + "grad_norm": 0.8079730868339539, + "learning_rate": 0.0008464720070661774, + "loss": 3.566, + "step": 18085 + }, + { + "epoch": 1.2291072156543008, + "grad_norm": 0.9690026044845581, + "learning_rate": 0.0008464295420573448, + "loss": 3.6555, + "step": 18090 + }, + { + "epoch": 1.2294469357249627, + "grad_norm": 0.8251434564590454, + "learning_rate": 0.000846387077048512, + "loss": 3.6655, + "step": 18095 + }, + { + "epoch": 1.2297866557956243, + "grad_norm": 0.7105693221092224, + "learning_rate": 0.0008463446120396793, + "loss": 3.8203, + "step": 18100 + }, + { + "epoch": 1.2301263758662861, + "grad_norm": 0.7494040131568909, + "learning_rate": 0.0008463021470308466, + "loss": 3.7103, + "step": 18105 + }, + { + "epoch": 1.230466095936948, + "grad_norm": 0.7186083197593689, + "learning_rate": 0.0008462596820220139, + "loss": 3.6266, + "step": 18110 + }, + { + "epoch": 1.2308058160076096, + "grad_norm": 0.7423375844955444, + "learning_rate": 0.0008462172170131811, + "loss": 3.7167, + "step": 18115 + }, + { + "epoch": 1.2311455360782715, + "grad_norm": 0.818461537361145, + "learning_rate": 0.0008461747520043484, + "loss": 3.8995, + "step": 18120 + }, + { + "epoch": 1.2314852561489333, + "grad_norm": 0.8165597915649414, + "learning_rate": 0.0008461322869955157, + "loss": 3.4447, + "step": 18125 + }, + { + "epoch": 1.231824976219595, + "grad_norm": 0.7954102158546448, + "learning_rate": 0.0008460898219866829, + "loss": 3.5313, + "step": 18130 + }, + { + "epoch": 1.2321646962902568, + "grad_norm": 0.7400712370872498, + "learning_rate": 0.0008460473569778503, + "loss": 3.6378, + "step": 18135 + }, + { + "epoch": 1.2325044163609187, + "grad_norm": 1.0486876964569092, + "learning_rate": 0.0008460048919690176, + "loss": 3.474, + "step": 18140 + }, + { + "epoch": 1.2328441364315803, + "grad_norm": 1.0368446111679077, + "learning_rate": 0.0008459624269601848, + "loss": 3.5582, + "step": 18145 + }, + { + "epoch": 1.2331838565022422, + "grad_norm": 0.8154240846633911, + "learning_rate": 0.0008459199619513521, + "loss": 3.6742, + "step": 18150 + }, + { + "epoch": 1.233523576572904, + "grad_norm": 0.7198530435562134, + "learning_rate": 0.0008458774969425193, + "loss": 3.4504, + "step": 18155 + }, + { + "epoch": 1.2338632966435656, + "grad_norm": 0.9889822602272034, + "learning_rate": 0.0008458350319336866, + "loss": 3.6321, + "step": 18160 + }, + { + "epoch": 1.2342030167142275, + "grad_norm": 1.4631205797195435, + "learning_rate": 0.000845792566924854, + "loss": 3.4183, + "step": 18165 + }, + { + "epoch": 1.2345427367848893, + "grad_norm": 0.8740648031234741, + "learning_rate": 0.0008457501019160212, + "loss": 3.5657, + "step": 18170 + }, + { + "epoch": 1.234882456855551, + "grad_norm": 0.8905075788497925, + "learning_rate": 0.0008457076369071886, + "loss": 3.5762, + "step": 18175 + }, + { + "epoch": 1.2352221769262128, + "grad_norm": 1.0373042821884155, + "learning_rate": 0.0008456651718983558, + "loss": 3.6282, + "step": 18180 + }, + { + "epoch": 1.2355618969968747, + "grad_norm": 0.8667267560958862, + "learning_rate": 0.000845622706889523, + "loss": 3.6036, + "step": 18185 + }, + { + "epoch": 1.2359016170675363, + "grad_norm": 0.7832011580467224, + "learning_rate": 0.0008455802418806904, + "loss": 3.506, + "step": 18190 + }, + { + "epoch": 1.2362413371381982, + "grad_norm": 0.8945884704589844, + "learning_rate": 0.0008455377768718576, + "loss": 3.5874, + "step": 18195 + }, + { + "epoch": 1.23658105720886, + "grad_norm": 0.7698514461517334, + "learning_rate": 0.0008454953118630249, + "loss": 3.6807, + "step": 18200 + }, + { + "epoch": 1.2369207772795217, + "grad_norm": 0.8925440907478333, + "learning_rate": 0.0008454528468541922, + "loss": 3.3988, + "step": 18205 + }, + { + "epoch": 1.2372604973501835, + "grad_norm": 0.9362595081329346, + "learning_rate": 0.0008454103818453595, + "loss": 3.5385, + "step": 18210 + }, + { + "epoch": 1.2376002174208451, + "grad_norm": 0.746705949306488, + "learning_rate": 0.0008453679168365267, + "loss": 3.5045, + "step": 18215 + }, + { + "epoch": 1.237939937491507, + "grad_norm": 0.7531605958938599, + "learning_rate": 0.000845325451827694, + "loss": 3.4423, + "step": 18220 + }, + { + "epoch": 1.2382796575621688, + "grad_norm": 0.898209810256958, + "learning_rate": 0.0008452829868188613, + "loss": 3.8014, + "step": 18225 + }, + { + "epoch": 1.2386193776328305, + "grad_norm": 0.9556558132171631, + "learning_rate": 0.0008452405218100285, + "loss": 3.625, + "step": 18230 + }, + { + "epoch": 1.2389590977034923, + "grad_norm": 0.7114981412887573, + "learning_rate": 0.0008451980568011959, + "loss": 3.7052, + "step": 18235 + }, + { + "epoch": 1.2392988177741542, + "grad_norm": 0.6377406716346741, + "learning_rate": 0.0008451555917923632, + "loss": 3.5428, + "step": 18240 + }, + { + "epoch": 1.2396385378448158, + "grad_norm": 1.0078846216201782, + "learning_rate": 0.0008451131267835304, + "loss": 3.5052, + "step": 18245 + }, + { + "epoch": 1.2399782579154777, + "grad_norm": 0.8812739849090576, + "learning_rate": 0.0008450706617746977, + "loss": 3.6285, + "step": 18250 + }, + { + "epoch": 1.2403179779861393, + "grad_norm": 0.8396075367927551, + "learning_rate": 0.0008450281967658649, + "loss": 3.5937, + "step": 18255 + }, + { + "epoch": 1.2406576980568012, + "grad_norm": 0.8703444004058838, + "learning_rate": 0.0008449857317570322, + "loss": 3.4371, + "step": 18260 + }, + { + "epoch": 1.240997418127463, + "grad_norm": 0.7681611776351929, + "learning_rate": 0.0008449432667481995, + "loss": 3.8954, + "step": 18265 + }, + { + "epoch": 1.2413371381981246, + "grad_norm": 0.8930127024650574, + "learning_rate": 0.0008449008017393668, + "loss": 3.3138, + "step": 18270 + }, + { + "epoch": 1.2416768582687865, + "grad_norm": 0.8380760550498962, + "learning_rate": 0.0008448583367305341, + "loss": 3.4913, + "step": 18275 + }, + { + "epoch": 1.2420165783394483, + "grad_norm": 0.707148551940918, + "learning_rate": 0.0008448158717217014, + "loss": 3.7631, + "step": 18280 + }, + { + "epoch": 1.24235629841011, + "grad_norm": 2.7769315242767334, + "learning_rate": 0.0008447734067128686, + "loss": 3.9584, + "step": 18285 + }, + { + "epoch": 1.2426960184807718, + "grad_norm": 0.9071149230003357, + "learning_rate": 0.0008447309417040358, + "loss": 3.6718, + "step": 18290 + }, + { + "epoch": 1.2430357385514337, + "grad_norm": 1.029103398323059, + "learning_rate": 0.0008446884766952032, + "loss": 3.5052, + "step": 18295 + }, + { + "epoch": 1.2433754586220953, + "grad_norm": 0.956720232963562, + "learning_rate": 0.0008446460116863704, + "loss": 3.7142, + "step": 18300 + }, + { + "epoch": 1.2437151786927572, + "grad_norm": 0.7290769815444946, + "learning_rate": 0.0008446035466775377, + "loss": 3.722, + "step": 18305 + }, + { + "epoch": 1.244054898763419, + "grad_norm": 0.8364763855934143, + "learning_rate": 0.0008445610816687051, + "loss": 3.7487, + "step": 18310 + }, + { + "epoch": 1.2443946188340806, + "grad_norm": 0.8126401901245117, + "learning_rate": 0.0008445186166598723, + "loss": 3.8039, + "step": 18315 + }, + { + "epoch": 1.2447343389047425, + "grad_norm": 0.86088627576828, + "learning_rate": 0.0008444761516510395, + "loss": 3.6632, + "step": 18320 + }, + { + "epoch": 1.2450740589754044, + "grad_norm": 0.8047016859054565, + "learning_rate": 0.0008444336866422069, + "loss": 3.4575, + "step": 18325 + }, + { + "epoch": 1.245413779046066, + "grad_norm": 0.7887160181999207, + "learning_rate": 0.0008443912216333741, + "loss": 3.4374, + "step": 18330 + }, + { + "epoch": 1.2457534991167278, + "grad_norm": 0.8475036025047302, + "learning_rate": 0.0008443487566245413, + "loss": 3.5902, + "step": 18335 + }, + { + "epoch": 1.2460932191873897, + "grad_norm": 0.8491775989532471, + "learning_rate": 0.0008443062916157088, + "loss": 3.4852, + "step": 18340 + }, + { + "epoch": 1.2464329392580513, + "grad_norm": 0.8231335282325745, + "learning_rate": 0.000844263826606876, + "loss": 3.58, + "step": 18345 + }, + { + "epoch": 1.2467726593287132, + "grad_norm": 0.9509361982345581, + "learning_rate": 0.0008442213615980432, + "loss": 3.4847, + "step": 18350 + }, + { + "epoch": 1.247112379399375, + "grad_norm": 0.7358238697052002, + "learning_rate": 0.0008441788965892105, + "loss": 3.8214, + "step": 18355 + }, + { + "epoch": 1.2474520994700367, + "grad_norm": 0.6405243873596191, + "learning_rate": 0.0008441364315803778, + "loss": 3.739, + "step": 18360 + }, + { + "epoch": 1.2477918195406985, + "grad_norm": 0.5791597366333008, + "learning_rate": 0.000844093966571545, + "loss": 3.6485, + "step": 18365 + }, + { + "epoch": 1.2481315396113604, + "grad_norm": 0.9219997525215149, + "learning_rate": 0.0008440515015627123, + "loss": 3.441, + "step": 18370 + }, + { + "epoch": 1.248471259682022, + "grad_norm": 0.6865357160568237, + "learning_rate": 0.0008440090365538797, + "loss": 3.7815, + "step": 18375 + }, + { + "epoch": 1.2488109797526838, + "grad_norm": 0.6976654529571533, + "learning_rate": 0.0008439665715450469, + "loss": 3.2883, + "step": 18380 + }, + { + "epoch": 1.2491506998233455, + "grad_norm": 0.6819227337837219, + "learning_rate": 0.0008439241065362142, + "loss": 3.8025, + "step": 18385 + }, + { + "epoch": 1.2494904198940073, + "grad_norm": 0.8959841132164001, + "learning_rate": 0.0008438816415273814, + "loss": 3.4372, + "step": 18390 + }, + { + "epoch": 1.2498301399646692, + "grad_norm": 0.728769838809967, + "learning_rate": 0.0008438391765185487, + "loss": 3.5178, + "step": 18395 + }, + { + "epoch": 1.2501698600353308, + "grad_norm": 0.757422924041748, + "learning_rate": 0.000843796711509716, + "loss": 3.6597, + "step": 18400 + }, + { + "epoch": 1.2505095801059927, + "grad_norm": 0.8230994939804077, + "learning_rate": 0.0008437542465008832, + "loss": 3.7298, + "step": 18405 + }, + { + "epoch": 1.2508493001766543, + "grad_norm": 0.6771420240402222, + "learning_rate": 0.0008437117814920506, + "loss": 3.7072, + "step": 18410 + }, + { + "epoch": 1.2511890202473162, + "grad_norm": 0.7683597803115845, + "learning_rate": 0.0008436693164832179, + "loss": 3.3495, + "step": 18415 + }, + { + "epoch": 1.251528740317978, + "grad_norm": 0.8989496827125549, + "learning_rate": 0.0008436268514743851, + "loss": 3.5018, + "step": 18420 + }, + { + "epoch": 1.2518684603886396, + "grad_norm": 1.0015931129455566, + "learning_rate": 0.0008435843864655524, + "loss": 3.534, + "step": 18425 + }, + { + "epoch": 1.2522081804593015, + "grad_norm": 1.057784914970398, + "learning_rate": 0.0008435419214567197, + "loss": 3.4003, + "step": 18430 + }, + { + "epoch": 1.2525479005299633, + "grad_norm": 0.8729202151298523, + "learning_rate": 0.0008434994564478869, + "loss": 3.7443, + "step": 18435 + }, + { + "epoch": 1.252887620600625, + "grad_norm": 0.9055291414260864, + "learning_rate": 0.0008434569914390541, + "loss": 3.5721, + "step": 18440 + }, + { + "epoch": 1.2532273406712868, + "grad_norm": 0.8667681813240051, + "learning_rate": 0.0008434145264302216, + "loss": 3.4871, + "step": 18445 + }, + { + "epoch": 1.2535670607419487, + "grad_norm": 0.6634142398834229, + "learning_rate": 0.0008433720614213888, + "loss": 3.7781, + "step": 18450 + }, + { + "epoch": 1.2539067808126103, + "grad_norm": 0.6974101662635803, + "learning_rate": 0.000843329596412556, + "loss": 3.4926, + "step": 18455 + }, + { + "epoch": 1.2542465008832722, + "grad_norm": 0.8090329170227051, + "learning_rate": 0.0008432871314037234, + "loss": 3.7641, + "step": 18460 + }, + { + "epoch": 1.254586220953934, + "grad_norm": 0.9116225838661194, + "learning_rate": 0.0008432446663948906, + "loss": 3.6064, + "step": 18465 + }, + { + "epoch": 1.2549259410245956, + "grad_norm": 0.8630117774009705, + "learning_rate": 0.0008432022013860578, + "loss": 3.3361, + "step": 18470 + }, + { + "epoch": 1.2552656610952575, + "grad_norm": 1.0257636308670044, + "learning_rate": 0.0008431597363772252, + "loss": 3.6776, + "step": 18475 + }, + { + "epoch": 1.2556053811659194, + "grad_norm": 0.9407450556755066, + "learning_rate": 0.0008431172713683925, + "loss": 3.5502, + "step": 18480 + }, + { + "epoch": 1.255945101236581, + "grad_norm": 0.896013617515564, + "learning_rate": 0.0008430748063595597, + "loss": 3.428, + "step": 18485 + }, + { + "epoch": 1.2562848213072428, + "grad_norm": 0.9130446910858154, + "learning_rate": 0.000843032341350727, + "loss": 3.6205, + "step": 18490 + }, + { + "epoch": 1.2566245413779047, + "grad_norm": 0.7894006967544556, + "learning_rate": 0.0008429898763418943, + "loss": 3.6725, + "step": 18495 + }, + { + "epoch": 1.2569642614485663, + "grad_norm": 0.8559442758560181, + "learning_rate": 0.0008429474113330615, + "loss": 3.4875, + "step": 18500 + }, + { + "epoch": 1.2573039815192282, + "grad_norm": 1.0807594060897827, + "learning_rate": 0.0008429049463242288, + "loss": 3.5719, + "step": 18505 + }, + { + "epoch": 1.25764370158989, + "grad_norm": 0.8181493878364563, + "learning_rate": 0.0008428624813153961, + "loss": 3.6267, + "step": 18510 + }, + { + "epoch": 1.2579834216605517, + "grad_norm": 0.76922208070755, + "learning_rate": 0.0008428200163065635, + "loss": 3.8395, + "step": 18515 + }, + { + "epoch": 1.2583231417312135, + "grad_norm": 1.1255172491073608, + "learning_rate": 0.0008427775512977307, + "loss": 3.6847, + "step": 18520 + }, + { + "epoch": 1.2586628618018754, + "grad_norm": 0.7809364199638367, + "learning_rate": 0.000842735086288898, + "loss": 3.8491, + "step": 18525 + }, + { + "epoch": 1.259002581872537, + "grad_norm": 0.6791590452194214, + "learning_rate": 0.0008426926212800653, + "loss": 3.6467, + "step": 18530 + }, + { + "epoch": 1.2593423019431988, + "grad_norm": 0.7563875317573547, + "learning_rate": 0.0008426501562712325, + "loss": 3.8761, + "step": 18535 + }, + { + "epoch": 1.2596820220138607, + "grad_norm": 0.965369701385498, + "learning_rate": 0.0008426076912623997, + "loss": 3.5848, + "step": 18540 + }, + { + "epoch": 1.2600217420845223, + "grad_norm": 0.798040509223938, + "learning_rate": 0.0008425652262535671, + "loss": 3.6692, + "step": 18545 + }, + { + "epoch": 1.2603614621551842, + "grad_norm": 0.7610406279563904, + "learning_rate": 0.0008425227612447344, + "loss": 3.9035, + "step": 18550 + }, + { + "epoch": 1.260701182225846, + "grad_norm": 1.074275255203247, + "learning_rate": 0.0008424802962359016, + "loss": 3.4831, + "step": 18555 + }, + { + "epoch": 1.2610409022965077, + "grad_norm": 0.6732968091964722, + "learning_rate": 0.000842437831227069, + "loss": 3.9344, + "step": 18560 + }, + { + "epoch": 1.2613806223671695, + "grad_norm": 0.9920135736465454, + "learning_rate": 0.0008423953662182362, + "loss": 3.6659, + "step": 18565 + }, + { + "epoch": 1.2617203424378312, + "grad_norm": 0.8121021389961243, + "learning_rate": 0.0008423529012094034, + "loss": 3.7438, + "step": 18570 + }, + { + "epoch": 1.262060062508493, + "grad_norm": 0.9736973643302917, + "learning_rate": 0.0008423104362005708, + "loss": 3.4738, + "step": 18575 + }, + { + "epoch": 1.2623997825791546, + "grad_norm": 0.8421375751495361, + "learning_rate": 0.000842267971191738, + "loss": 3.4708, + "step": 18580 + }, + { + "epoch": 1.2627395026498165, + "grad_norm": 1.136458396911621, + "learning_rate": 0.0008422255061829053, + "loss": 3.7466, + "step": 18585 + }, + { + "epoch": 1.2630792227204783, + "grad_norm": 0.9721758961677551, + "learning_rate": 0.0008421830411740727, + "loss": 3.7765, + "step": 18590 + }, + { + "epoch": 1.26341894279114, + "grad_norm": 0.7264630198478699, + "learning_rate": 0.0008421405761652399, + "loss": 3.5611, + "step": 18595 + }, + { + "epoch": 1.2637586628618018, + "grad_norm": 0.8007497191429138, + "learning_rate": 0.0008420981111564071, + "loss": 3.7143, + "step": 18600 + }, + { + "epoch": 1.2640983829324637, + "grad_norm": 1.5065646171569824, + "learning_rate": 0.0008420556461475744, + "loss": 3.5061, + "step": 18605 + }, + { + "epoch": 1.2644381030031253, + "grad_norm": 0.6845864653587341, + "learning_rate": 0.0008420131811387417, + "loss": 3.3451, + "step": 18610 + }, + { + "epoch": 1.2647778230737872, + "grad_norm": 0.7382348775863647, + "learning_rate": 0.0008419707161299089, + "loss": 3.7848, + "step": 18615 + }, + { + "epoch": 1.265117543144449, + "grad_norm": 0.7919543385505676, + "learning_rate": 0.0008419282511210763, + "loss": 3.7916, + "step": 18620 + }, + { + "epoch": 1.2654572632151107, + "grad_norm": 1.262646198272705, + "learning_rate": 0.0008418857861122436, + "loss": 3.426, + "step": 18625 + }, + { + "epoch": 1.2657969832857725, + "grad_norm": 0.8263772130012512, + "learning_rate": 0.0008418433211034108, + "loss": 3.3937, + "step": 18630 + }, + { + "epoch": 1.2661367033564344, + "grad_norm": 0.9791490435600281, + "learning_rate": 0.0008418008560945781, + "loss": 3.8605, + "step": 18635 + }, + { + "epoch": 1.266476423427096, + "grad_norm": 0.7318424582481384, + "learning_rate": 0.0008417583910857453, + "loss": 3.3524, + "step": 18640 + }, + { + "epoch": 1.2668161434977578, + "grad_norm": 0.7696229815483093, + "learning_rate": 0.0008417159260769126, + "loss": 3.3757, + "step": 18645 + }, + { + "epoch": 1.2671558635684197, + "grad_norm": 0.9644754528999329, + "learning_rate": 0.00084167346106808, + "loss": 3.6202, + "step": 18650 + }, + { + "epoch": 1.2674955836390813, + "grad_norm": 0.6481581926345825, + "learning_rate": 0.0008416309960592472, + "loss": 3.8119, + "step": 18655 + }, + { + "epoch": 1.2678353037097432, + "grad_norm": 0.8360882997512817, + "learning_rate": 0.0008415885310504145, + "loss": 3.7583, + "step": 18660 + }, + { + "epoch": 1.268175023780405, + "grad_norm": 0.7896801829338074, + "learning_rate": 0.0008415460660415818, + "loss": 3.6562, + "step": 18665 + }, + { + "epoch": 1.2685147438510667, + "grad_norm": 1.2480080127716064, + "learning_rate": 0.000841503601032749, + "loss": 3.5368, + "step": 18670 + }, + { + "epoch": 1.2688544639217285, + "grad_norm": 0.8980258107185364, + "learning_rate": 0.0008414611360239163, + "loss": 3.476, + "step": 18675 + }, + { + "epoch": 1.2691941839923904, + "grad_norm": 0.7919067144393921, + "learning_rate": 0.0008414186710150836, + "loss": 3.4797, + "step": 18680 + }, + { + "epoch": 1.269533904063052, + "grad_norm": 0.8472284078598022, + "learning_rate": 0.0008413762060062509, + "loss": 3.8554, + "step": 18685 + }, + { + "epoch": 1.2698736241337139, + "grad_norm": 0.8777706623077393, + "learning_rate": 0.0008413337409974181, + "loss": 3.887, + "step": 18690 + }, + { + "epoch": 1.2702133442043757, + "grad_norm": 0.9776824712753296, + "learning_rate": 0.0008412912759885855, + "loss": 3.6633, + "step": 18695 + }, + { + "epoch": 1.2705530642750373, + "grad_norm": 0.8354842662811279, + "learning_rate": 0.0008412488109797527, + "loss": 3.78, + "step": 18700 + }, + { + "epoch": 1.2708927843456992, + "grad_norm": 0.6844232082366943, + "learning_rate": 0.0008412063459709199, + "loss": 3.5976, + "step": 18705 + }, + { + "epoch": 1.271232504416361, + "grad_norm": 0.776009202003479, + "learning_rate": 0.0008411638809620873, + "loss": 3.6013, + "step": 18710 + }, + { + "epoch": 1.2715722244870227, + "grad_norm": 0.9003681540489197, + "learning_rate": 0.0008411214159532545, + "loss": 3.4949, + "step": 18715 + }, + { + "epoch": 1.2719119445576845, + "grad_norm": 0.9082055687904358, + "learning_rate": 0.0008410789509444218, + "loss": 3.4624, + "step": 18720 + }, + { + "epoch": 1.2722516646283464, + "grad_norm": 0.7913141250610352, + "learning_rate": 0.0008410364859355892, + "loss": 3.617, + "step": 18725 + }, + { + "epoch": 1.272591384699008, + "grad_norm": 0.8517146110534668, + "learning_rate": 0.0008409940209267564, + "loss": 3.4202, + "step": 18730 + }, + { + "epoch": 1.2729311047696699, + "grad_norm": 0.7099696397781372, + "learning_rate": 0.0008409515559179236, + "loss": 3.5881, + "step": 18735 + }, + { + "epoch": 1.2732708248403315, + "grad_norm": 0.7288572192192078, + "learning_rate": 0.000840909090909091, + "loss": 3.3992, + "step": 18740 + }, + { + "epoch": 1.2736105449109933, + "grad_norm": 0.8599908351898193, + "learning_rate": 0.0008408666259002582, + "loss": 3.5074, + "step": 18745 + }, + { + "epoch": 1.273950264981655, + "grad_norm": 0.8299602270126343, + "learning_rate": 0.0008408241608914254, + "loss": 3.6911, + "step": 18750 + }, + { + "epoch": 1.2742899850523168, + "grad_norm": 1.0271062850952148, + "learning_rate": 0.0008407816958825928, + "loss": 3.6587, + "step": 18755 + }, + { + "epoch": 1.2746297051229787, + "grad_norm": 0.7047721147537231, + "learning_rate": 0.0008407392308737601, + "loss": 3.6376, + "step": 18760 + }, + { + "epoch": 1.2749694251936403, + "grad_norm": 0.9240680932998657, + "learning_rate": 0.0008406967658649273, + "loss": 3.4267, + "step": 18765 + }, + { + "epoch": 1.2753091452643022, + "grad_norm": 0.7714706063270569, + "learning_rate": 0.0008406543008560946, + "loss": 3.4868, + "step": 18770 + }, + { + "epoch": 1.275648865334964, + "grad_norm": 0.8429070711135864, + "learning_rate": 0.0008406118358472619, + "loss": 3.6041, + "step": 18775 + }, + { + "epoch": 1.2759885854056257, + "grad_norm": 0.7418668270111084, + "learning_rate": 0.0008405693708384291, + "loss": 3.8879, + "step": 18780 + }, + { + "epoch": 1.2763283054762875, + "grad_norm": 0.9212924242019653, + "learning_rate": 0.0008405269058295964, + "loss": 3.759, + "step": 18785 + }, + { + "epoch": 1.2766680255469494, + "grad_norm": 0.8338181376457214, + "learning_rate": 0.0008404844408207637, + "loss": 3.9134, + "step": 18790 + }, + { + "epoch": 1.277007745617611, + "grad_norm": 0.7278923988342285, + "learning_rate": 0.000840441975811931, + "loss": 3.6655, + "step": 18795 + }, + { + "epoch": 1.2773474656882728, + "grad_norm": 0.9579760432243347, + "learning_rate": 0.0008403995108030983, + "loss": 3.5213, + "step": 18800 + }, + { + "epoch": 1.2776871857589347, + "grad_norm": 0.6988798975944519, + "learning_rate": 0.0008403570457942655, + "loss": 3.6256, + "step": 18805 + }, + { + "epoch": 1.2780269058295963, + "grad_norm": 0.7809003591537476, + "learning_rate": 0.0008403145807854328, + "loss": 3.5778, + "step": 18810 + }, + { + "epoch": 1.2783666259002582, + "grad_norm": 0.8023414611816406, + "learning_rate": 0.0008402721157766001, + "loss": 3.6487, + "step": 18815 + }, + { + "epoch": 1.27870634597092, + "grad_norm": 0.9113685488700867, + "learning_rate": 0.0008402296507677673, + "loss": 3.7373, + "step": 18820 + }, + { + "epoch": 1.2790460660415817, + "grad_norm": 0.736770510673523, + "learning_rate": 0.0008401871857589347, + "loss": 3.6802, + "step": 18825 + }, + { + "epoch": 1.2793857861122435, + "grad_norm": 0.9819709658622742, + "learning_rate": 0.000840144720750102, + "loss": 3.4673, + "step": 18830 + }, + { + "epoch": 1.2797255061829054, + "grad_norm": 0.8941782712936401, + "learning_rate": 0.0008401022557412692, + "loss": 3.821, + "step": 18835 + }, + { + "epoch": 1.280065226253567, + "grad_norm": 0.7992419600486755, + "learning_rate": 0.0008400597907324364, + "loss": 3.6911, + "step": 18840 + }, + { + "epoch": 1.2804049463242289, + "grad_norm": 0.7432798147201538, + "learning_rate": 0.0008400173257236038, + "loss": 3.5381, + "step": 18845 + }, + { + "epoch": 1.2807446663948907, + "grad_norm": 0.910342812538147, + "learning_rate": 0.000839974860714771, + "loss": 3.5295, + "step": 18850 + }, + { + "epoch": 1.2810843864655523, + "grad_norm": 0.684809148311615, + "learning_rate": 0.0008399323957059383, + "loss": 3.6254, + "step": 18855 + }, + { + "epoch": 1.2814241065362142, + "grad_norm": 0.9868423342704773, + "learning_rate": 0.0008398899306971057, + "loss": 3.6512, + "step": 18860 + }, + { + "epoch": 1.281763826606876, + "grad_norm": 0.7098813652992249, + "learning_rate": 0.0008398474656882729, + "loss": 3.5259, + "step": 18865 + }, + { + "epoch": 1.2821035466775377, + "grad_norm": 0.7242160439491272, + "learning_rate": 0.0008398050006794402, + "loss": 3.7642, + "step": 18870 + }, + { + "epoch": 1.2824432667481995, + "grad_norm": 0.71599280834198, + "learning_rate": 0.0008397625356706075, + "loss": 3.6049, + "step": 18875 + }, + { + "epoch": 1.2827829868188614, + "grad_norm": 0.7834014296531677, + "learning_rate": 0.0008397200706617747, + "loss": 3.4011, + "step": 18880 + }, + { + "epoch": 1.283122706889523, + "grad_norm": 0.9112538695335388, + "learning_rate": 0.000839677605652942, + "loss": 3.6944, + "step": 18885 + }, + { + "epoch": 1.2834624269601849, + "grad_norm": 0.8044084906578064, + "learning_rate": 0.0008396351406441092, + "loss": 3.3254, + "step": 18890 + }, + { + "epoch": 1.2838021470308467, + "grad_norm": 0.7889822125434875, + "learning_rate": 0.0008395926756352766, + "loss": 3.6002, + "step": 18895 + }, + { + "epoch": 1.2841418671015083, + "grad_norm": 0.7269510626792908, + "learning_rate": 0.0008395502106264439, + "loss": 3.5743, + "step": 18900 + }, + { + "epoch": 1.2844815871721702, + "grad_norm": 1.0339241027832031, + "learning_rate": 0.0008395077456176111, + "loss": 3.5983, + "step": 18905 + }, + { + "epoch": 1.2848213072428318, + "grad_norm": 0.6959977149963379, + "learning_rate": 0.0008394652806087784, + "loss": 3.8144, + "step": 18910 + }, + { + "epoch": 1.2851610273134937, + "grad_norm": 0.9645085334777832, + "learning_rate": 0.0008394228155999457, + "loss": 3.1622, + "step": 18915 + }, + { + "epoch": 1.2855007473841553, + "grad_norm": 0.6965295076370239, + "learning_rate": 0.0008393803505911129, + "loss": 3.5353, + "step": 18920 + }, + { + "epoch": 1.2858404674548172, + "grad_norm": 0.7943950295448303, + "learning_rate": 0.0008393378855822801, + "loss": 3.5112, + "step": 18925 + }, + { + "epoch": 1.286180187525479, + "grad_norm": 0.7328330278396606, + "learning_rate": 0.0008392954205734476, + "loss": 3.6068, + "step": 18930 + }, + { + "epoch": 1.2865199075961407, + "grad_norm": 0.8032127022743225, + "learning_rate": 0.0008392529555646148, + "loss": 3.4109, + "step": 18935 + }, + { + "epoch": 1.2868596276668025, + "grad_norm": 0.9397767186164856, + "learning_rate": 0.000839210490555782, + "loss": 3.7368, + "step": 18940 + }, + { + "epoch": 1.2871993477374644, + "grad_norm": 0.8262988924980164, + "learning_rate": 0.0008391680255469494, + "loss": 3.6045, + "step": 18945 + }, + { + "epoch": 1.287539067808126, + "grad_norm": 0.7373209595680237, + "learning_rate": 0.0008391255605381166, + "loss": 3.8029, + "step": 18950 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.832205593585968, + "learning_rate": 0.0008390830955292838, + "loss": 3.7445, + "step": 18955 + }, + { + "epoch": 1.2882185079494497, + "grad_norm": 0.8965170979499817, + "learning_rate": 0.0008390406305204512, + "loss": 3.438, + "step": 18960 + }, + { + "epoch": 1.2885582280201113, + "grad_norm": 0.8148509860038757, + "learning_rate": 0.0008389981655116185, + "loss": 3.445, + "step": 18965 + }, + { + "epoch": 1.2888979480907732, + "grad_norm": 0.8745383620262146, + "learning_rate": 0.0008389557005027857, + "loss": 3.7064, + "step": 18970 + }, + { + "epoch": 1.289237668161435, + "grad_norm": 0.7071954607963562, + "learning_rate": 0.0008389132354939531, + "loss": 3.6693, + "step": 18975 + }, + { + "epoch": 1.2895773882320967, + "grad_norm": 0.6986136436462402, + "learning_rate": 0.0008388707704851203, + "loss": 3.6281, + "step": 18980 + }, + { + "epoch": 1.2899171083027585, + "grad_norm": 0.6737527251243591, + "learning_rate": 0.0008388283054762875, + "loss": 3.6772, + "step": 18985 + }, + { + "epoch": 1.2902568283734204, + "grad_norm": 0.7477849721908569, + "learning_rate": 0.0008387858404674548, + "loss": 3.488, + "step": 18990 + }, + { + "epoch": 1.290596548444082, + "grad_norm": 1.0487247705459595, + "learning_rate": 0.0008387433754586221, + "loss": 3.9916, + "step": 18995 + }, + { + "epoch": 1.2909362685147439, + "grad_norm": 0.7276598811149597, + "learning_rate": 0.0008387009104497894, + "loss": 3.4777, + "step": 19000 + }, + { + "epoch": 1.2912759885854057, + "grad_norm": 0.9916195869445801, + "learning_rate": 0.0008386584454409567, + "loss": 3.5199, + "step": 19005 + }, + { + "epoch": 1.2916157086560673, + "grad_norm": 0.7936527729034424, + "learning_rate": 0.000838615980432124, + "loss": 3.658, + "step": 19010 + }, + { + "epoch": 1.2919554287267292, + "grad_norm": 0.9048296213150024, + "learning_rate": 0.0008385735154232912, + "loss": 3.6848, + "step": 19015 + }, + { + "epoch": 1.292295148797391, + "grad_norm": 0.9453439116477966, + "learning_rate": 0.0008385310504144585, + "loss": 3.6858, + "step": 19020 + }, + { + "epoch": 1.2926348688680527, + "grad_norm": 0.7384598255157471, + "learning_rate": 0.0008384885854056257, + "loss": 3.6307, + "step": 19025 + }, + { + "epoch": 1.2929745889387145, + "grad_norm": 0.9885731935501099, + "learning_rate": 0.000838446120396793, + "loss": 3.7374, + "step": 19030 + }, + { + "epoch": 1.2933143090093764, + "grad_norm": 0.771979808807373, + "learning_rate": 0.0008384036553879604, + "loss": 3.9273, + "step": 19035 + }, + { + "epoch": 1.293654029080038, + "grad_norm": 0.8753556609153748, + "learning_rate": 0.0008383611903791276, + "loss": 3.656, + "step": 19040 + }, + { + "epoch": 1.2939937491506999, + "grad_norm": 0.7992796301841736, + "learning_rate": 0.0008383187253702949, + "loss": 3.7861, + "step": 19045 + }, + { + "epoch": 1.2943334692213617, + "grad_norm": 0.8781776428222656, + "learning_rate": 0.0008382762603614622, + "loss": 3.3553, + "step": 19050 + }, + { + "epoch": 1.2946731892920234, + "grad_norm": 0.6208105683326721, + "learning_rate": 0.0008382337953526294, + "loss": 3.6442, + "step": 19055 + }, + { + "epoch": 1.2950129093626852, + "grad_norm": 0.9150822758674622, + "learning_rate": 0.0008381913303437967, + "loss": 3.7152, + "step": 19060 + }, + { + "epoch": 1.295352629433347, + "grad_norm": 0.7896659970283508, + "learning_rate": 0.000838148865334964, + "loss": 3.3809, + "step": 19065 + }, + { + "epoch": 1.2956923495040087, + "grad_norm": 0.8575422763824463, + "learning_rate": 0.0008381064003261313, + "loss": 3.4743, + "step": 19070 + }, + { + "epoch": 1.2960320695746705, + "grad_norm": 1.0971137285232544, + "learning_rate": 0.0008380639353172985, + "loss": 3.4975, + "step": 19075 + }, + { + "epoch": 1.2963717896453322, + "grad_norm": 0.8193876147270203, + "learning_rate": 0.0008380214703084659, + "loss": 3.8534, + "step": 19080 + }, + { + "epoch": 1.296711509715994, + "grad_norm": 0.7990505695343018, + "learning_rate": 0.0008379790052996331, + "loss": 3.8322, + "step": 19085 + }, + { + "epoch": 1.2970512297866557, + "grad_norm": 0.7239413261413574, + "learning_rate": 0.0008379365402908003, + "loss": 3.6938, + "step": 19090 + }, + { + "epoch": 1.2973909498573175, + "grad_norm": 0.7753434181213379, + "learning_rate": 0.0008378940752819677, + "loss": 3.414, + "step": 19095 + }, + { + "epoch": 1.2977306699279794, + "grad_norm": 1.0114697217941284, + "learning_rate": 0.0008378516102731349, + "loss": 3.5099, + "step": 19100 + }, + { + "epoch": 1.298070389998641, + "grad_norm": 0.7905046939849854, + "learning_rate": 0.0008378091452643022, + "loss": 3.5836, + "step": 19105 + }, + { + "epoch": 1.2984101100693028, + "grad_norm": 0.8378105163574219, + "learning_rate": 0.0008377666802554696, + "loss": 3.5389, + "step": 19110 + }, + { + "epoch": 1.2987498301399647, + "grad_norm": 0.9816122651100159, + "learning_rate": 0.0008377242152466368, + "loss": 3.5076, + "step": 19115 + }, + { + "epoch": 1.2990895502106263, + "grad_norm": 0.9470474123954773, + "learning_rate": 0.000837681750237804, + "loss": 3.715, + "step": 19120 + }, + { + "epoch": 1.2994292702812882, + "grad_norm": 0.6822510957717896, + "learning_rate": 0.0008376392852289713, + "loss": 3.6111, + "step": 19125 + }, + { + "epoch": 1.29976899035195, + "grad_norm": 0.8616366982460022, + "learning_rate": 0.0008375968202201386, + "loss": 3.6901, + "step": 19130 + }, + { + "epoch": 1.3001087104226117, + "grad_norm": 0.9164016246795654, + "learning_rate": 0.0008375543552113058, + "loss": 3.5292, + "step": 19135 + }, + { + "epoch": 1.3004484304932735, + "grad_norm": 0.8928403854370117, + "learning_rate": 0.0008375118902024732, + "loss": 3.4439, + "step": 19140 + }, + { + "epoch": 1.3007881505639354, + "grad_norm": 0.950133740901947, + "learning_rate": 0.0008374694251936405, + "loss": 3.6813, + "step": 19145 + }, + { + "epoch": 1.301127870634597, + "grad_norm": 0.7425905466079712, + "learning_rate": 0.0008374269601848077, + "loss": 3.7688, + "step": 19150 + }, + { + "epoch": 1.3014675907052589, + "grad_norm": 0.7523996829986572, + "learning_rate": 0.000837384495175975, + "loss": 3.4637, + "step": 19155 + }, + { + "epoch": 1.3018073107759207, + "grad_norm": 0.9336604475975037, + "learning_rate": 0.0008373420301671423, + "loss": 3.6783, + "step": 19160 + }, + { + "epoch": 1.3021470308465823, + "grad_norm": 0.9813581705093384, + "learning_rate": 0.0008372995651583095, + "loss": 3.5513, + "step": 19165 + }, + { + "epoch": 1.3024867509172442, + "grad_norm": 0.7883825302124023, + "learning_rate": 0.0008372571001494768, + "loss": 3.9184, + "step": 19170 + }, + { + "epoch": 1.302826470987906, + "grad_norm": 0.7765228152275085, + "learning_rate": 0.0008372146351406442, + "loss": 3.4788, + "step": 19175 + }, + { + "epoch": 1.3031661910585677, + "grad_norm": 1.2001516819000244, + "learning_rate": 0.0008371721701318114, + "loss": 3.5057, + "step": 19180 + }, + { + "epoch": 1.3035059111292295, + "grad_norm": 0.711991012096405, + "learning_rate": 0.0008371297051229787, + "loss": 3.5108, + "step": 19185 + }, + { + "epoch": 1.3038456311998914, + "grad_norm": 0.7260213494300842, + "learning_rate": 0.0008370872401141459, + "loss": 3.9015, + "step": 19190 + }, + { + "epoch": 1.304185351270553, + "grad_norm": 0.7892175316810608, + "learning_rate": 0.0008370447751053133, + "loss": 3.498, + "step": 19195 + }, + { + "epoch": 1.3045250713412149, + "grad_norm": 0.9343512058258057, + "learning_rate": 0.0008370023100964805, + "loss": 3.886, + "step": 19200 + }, + { + "epoch": 1.3048647914118767, + "grad_norm": 2.3296046257019043, + "learning_rate": 0.0008369598450876477, + "loss": 3.5829, + "step": 19205 + }, + { + "epoch": 1.3052045114825384, + "grad_norm": 0.6870824694633484, + "learning_rate": 0.0008369173800788152, + "loss": 3.6212, + "step": 19210 + }, + { + "epoch": 1.3055442315532002, + "grad_norm": 0.8819324970245361, + "learning_rate": 0.0008368749150699824, + "loss": 3.8625, + "step": 19215 + }, + { + "epoch": 1.305883951623862, + "grad_norm": 0.6965407729148865, + "learning_rate": 0.0008368324500611496, + "loss": 3.4695, + "step": 19220 + }, + { + "epoch": 1.3062236716945237, + "grad_norm": 0.9335972666740417, + "learning_rate": 0.000836789985052317, + "loss": 3.8229, + "step": 19225 + }, + { + "epoch": 1.3065633917651855, + "grad_norm": 1.0282480716705322, + "learning_rate": 0.0008367475200434842, + "loss": 3.7019, + "step": 19230 + }, + { + "epoch": 1.3069031118358474, + "grad_norm": 0.915276825428009, + "learning_rate": 0.0008367050550346514, + "loss": 3.3505, + "step": 19235 + }, + { + "epoch": 1.307242831906509, + "grad_norm": 1.0090320110321045, + "learning_rate": 0.0008366625900258188, + "loss": 3.572, + "step": 19240 + }, + { + "epoch": 1.3075825519771709, + "grad_norm": 0.7285304069519043, + "learning_rate": 0.0008366201250169861, + "loss": 3.7371, + "step": 19245 + }, + { + "epoch": 1.3079222720478325, + "grad_norm": 0.938769519329071, + "learning_rate": 0.0008365776600081533, + "loss": 3.7965, + "step": 19250 + }, + { + "epoch": 1.3082619921184944, + "grad_norm": 0.8031560778617859, + "learning_rate": 0.0008365351949993206, + "loss": 3.8637, + "step": 19255 + }, + { + "epoch": 1.308601712189156, + "grad_norm": 1.2350109815597534, + "learning_rate": 0.0008364927299904879, + "loss": 3.7783, + "step": 19260 + }, + { + "epoch": 1.3089414322598178, + "grad_norm": 0.8220742344856262, + "learning_rate": 0.0008364502649816551, + "loss": 3.211, + "step": 19265 + }, + { + "epoch": 1.3092811523304797, + "grad_norm": 1.2797129154205322, + "learning_rate": 0.0008364077999728224, + "loss": 3.5265, + "step": 19270 + }, + { + "epoch": 1.3096208724011413, + "grad_norm": 0.6370225548744202, + "learning_rate": 0.0008363653349639898, + "loss": 3.5181, + "step": 19275 + }, + { + "epoch": 1.3099605924718032, + "grad_norm": 0.7808799743652344, + "learning_rate": 0.000836322869955157, + "loss": 3.1757, + "step": 19280 + }, + { + "epoch": 1.310300312542465, + "grad_norm": 0.7316072583198547, + "learning_rate": 0.0008362804049463243, + "loss": 3.6153, + "step": 19285 + }, + { + "epoch": 1.3106400326131267, + "grad_norm": 0.6546277403831482, + "learning_rate": 0.0008362379399374915, + "loss": 3.5871, + "step": 19290 + }, + { + "epoch": 1.3109797526837885, + "grad_norm": 0.7915052175521851, + "learning_rate": 0.0008361954749286588, + "loss": 3.6293, + "step": 19295 + }, + { + "epoch": 1.3113194727544504, + "grad_norm": 0.9749646782875061, + "learning_rate": 0.0008361530099198261, + "loss": 3.6516, + "step": 19300 + }, + { + "epoch": 1.311659192825112, + "grad_norm": 0.7617306709289551, + "learning_rate": 0.0008361105449109933, + "loss": 3.7835, + "step": 19305 + }, + { + "epoch": 1.3119989128957739, + "grad_norm": 0.6724220514297485, + "learning_rate": 0.0008360680799021607, + "loss": 3.392, + "step": 19310 + }, + { + "epoch": 1.3123386329664357, + "grad_norm": 0.9551365375518799, + "learning_rate": 0.000836025614893328, + "loss": 3.4355, + "step": 19315 + }, + { + "epoch": 1.3126783530370973, + "grad_norm": 0.6739931106567383, + "learning_rate": 0.0008359831498844952, + "loss": 3.5013, + "step": 19320 + }, + { + "epoch": 1.3130180731077592, + "grad_norm": 0.831169068813324, + "learning_rate": 0.0008359406848756624, + "loss": 3.6373, + "step": 19325 + }, + { + "epoch": 1.313357793178421, + "grad_norm": 0.877882719039917, + "learning_rate": 0.0008358982198668298, + "loss": 3.5941, + "step": 19330 + }, + { + "epoch": 1.3136975132490827, + "grad_norm": 1.0135629177093506, + "learning_rate": 0.000835855754857997, + "loss": 3.4366, + "step": 19335 + }, + { + "epoch": 1.3140372333197445, + "grad_norm": 0.947805643081665, + "learning_rate": 0.0008358132898491642, + "loss": 3.674, + "step": 19340 + }, + { + "epoch": 1.3143769533904064, + "grad_norm": 1.0589076280593872, + "learning_rate": 0.0008357708248403317, + "loss": 3.6767, + "step": 19345 + }, + { + "epoch": 1.314716673461068, + "grad_norm": 1.0891550779342651, + "learning_rate": 0.0008357283598314989, + "loss": 3.5914, + "step": 19350 + }, + { + "epoch": 1.3150563935317299, + "grad_norm": 1.199847936630249, + "learning_rate": 0.0008356858948226661, + "loss": 3.5668, + "step": 19355 + }, + { + "epoch": 1.3153961136023917, + "grad_norm": 1.0983508825302124, + "learning_rate": 0.0008356434298138335, + "loss": 3.5076, + "step": 19360 + }, + { + "epoch": 1.3157358336730534, + "grad_norm": 0.8212329745292664, + "learning_rate": 0.0008356009648050007, + "loss": 3.752, + "step": 19365 + }, + { + "epoch": 1.3160755537437152, + "grad_norm": 0.7923041582107544, + "learning_rate": 0.0008355584997961679, + "loss": 3.6898, + "step": 19370 + }, + { + "epoch": 1.316415273814377, + "grad_norm": 0.7041112780570984, + "learning_rate": 0.0008355160347873352, + "loss": 3.9288, + "step": 19375 + }, + { + "epoch": 1.3167549938850387, + "grad_norm": 0.9502371549606323, + "learning_rate": 0.0008354735697785026, + "loss": 3.6307, + "step": 19380 + }, + { + "epoch": 1.3170947139557005, + "grad_norm": 0.7737523317337036, + "learning_rate": 0.0008354311047696698, + "loss": 3.7173, + "step": 19385 + }, + { + "epoch": 1.3174344340263624, + "grad_norm": 1.1494675874710083, + "learning_rate": 0.0008353886397608371, + "loss": 3.6815, + "step": 19390 + }, + { + "epoch": 1.317774154097024, + "grad_norm": 1.347084403038025, + "learning_rate": 0.0008353461747520044, + "loss": 3.2751, + "step": 19395 + }, + { + "epoch": 1.3181138741676859, + "grad_norm": 0.8071908354759216, + "learning_rate": 0.0008353037097431716, + "loss": 3.8104, + "step": 19400 + }, + { + "epoch": 1.3184535942383477, + "grad_norm": 1.001084327697754, + "learning_rate": 0.0008352612447343389, + "loss": 3.6036, + "step": 19405 + }, + { + "epoch": 1.3187933143090094, + "grad_norm": 1.3685288429260254, + "learning_rate": 0.0008352187797255062, + "loss": 3.582, + "step": 19410 + }, + { + "epoch": 1.3191330343796712, + "grad_norm": 0.899493396282196, + "learning_rate": 0.0008351763147166735, + "loss": 3.557, + "step": 19415 + }, + { + "epoch": 1.3194727544503329, + "grad_norm": 0.7737858891487122, + "learning_rate": 0.0008351338497078408, + "loss": 3.8035, + "step": 19420 + }, + { + "epoch": 1.3198124745209947, + "grad_norm": 0.6906596422195435, + "learning_rate": 0.000835091384699008, + "loss": 3.5729, + "step": 19425 + }, + { + "epoch": 1.3201521945916566, + "grad_norm": 0.7514760494232178, + "learning_rate": 0.0008350489196901753, + "loss": 3.7441, + "step": 19430 + }, + { + "epoch": 1.3204919146623182, + "grad_norm": 0.7398584485054016, + "learning_rate": 0.0008350064546813426, + "loss": 3.6444, + "step": 19435 + }, + { + "epoch": 1.32083163473298, + "grad_norm": 0.8550446629524231, + "learning_rate": 0.0008349639896725098, + "loss": 3.5221, + "step": 19440 + }, + { + "epoch": 1.3211713548036417, + "grad_norm": 0.9726698994636536, + "learning_rate": 0.0008349215246636771, + "loss": 3.6013, + "step": 19445 + }, + { + "epoch": 1.3215110748743035, + "grad_norm": 1.121169090270996, + "learning_rate": 0.0008348790596548445, + "loss": 3.6678, + "step": 19450 + }, + { + "epoch": 1.3218507949449654, + "grad_norm": 0.8855244517326355, + "learning_rate": 0.0008348365946460117, + "loss": 3.8562, + "step": 19455 + }, + { + "epoch": 1.322190515015627, + "grad_norm": 0.8463066816329956, + "learning_rate": 0.000834794129637179, + "loss": 3.8616, + "step": 19460 + }, + { + "epoch": 1.3225302350862889, + "grad_norm": 0.894733726978302, + "learning_rate": 0.0008347516646283463, + "loss": 3.5184, + "step": 19465 + }, + { + "epoch": 1.3228699551569507, + "grad_norm": 1.0641835927963257, + "learning_rate": 0.0008347091996195135, + "loss": 3.883, + "step": 19470 + }, + { + "epoch": 1.3232096752276123, + "grad_norm": 0.8857590556144714, + "learning_rate": 0.0008346667346106807, + "loss": 3.5607, + "step": 19475 + }, + { + "epoch": 1.3235493952982742, + "grad_norm": 1.4270941019058228, + "learning_rate": 0.0008346242696018481, + "loss": 3.5807, + "step": 19480 + }, + { + "epoch": 1.323889115368936, + "grad_norm": 0.8178171515464783, + "learning_rate": 0.0008345818045930154, + "loss": 3.6225, + "step": 19485 + }, + { + "epoch": 1.3242288354395977, + "grad_norm": 0.7018311619758606, + "learning_rate": 0.0008345393395841826, + "loss": 3.4232, + "step": 19490 + }, + { + "epoch": 1.3245685555102595, + "grad_norm": 0.8513080477714539, + "learning_rate": 0.00083449687457535, + "loss": 3.7176, + "step": 19495 + }, + { + "epoch": 1.3249082755809214, + "grad_norm": 0.9216023087501526, + "learning_rate": 0.0008344544095665172, + "loss": 3.6781, + "step": 19500 + }, + { + "epoch": 1.325247995651583, + "grad_norm": NaN, + "learning_rate": 0.0008344204375594511, + "loss": 3.4088, + "step": 19505 + }, + { + "epoch": 1.3255877157222449, + "grad_norm": 1.1977981328964233, + "learning_rate": 0.0008343779725506184, + "loss": 3.5032, + "step": 19510 + }, + { + "epoch": 1.3259274357929067, + "grad_norm": 0.7658312916755676, + "learning_rate": 0.0008343355075417856, + "loss": 3.6709, + "step": 19515 + }, + { + "epoch": 1.3262671558635684, + "grad_norm": 10.34023666381836, + "learning_rate": 0.0008342930425329529, + "loss": 3.5524, + "step": 19520 + }, + { + "epoch": 1.3266068759342302, + "grad_norm": 0.727379560470581, + "learning_rate": 0.0008342505775241201, + "loss": 3.5477, + "step": 19525 + }, + { + "epoch": 1.326946596004892, + "grad_norm": 0.8829269409179688, + "learning_rate": 0.0008342081125152874, + "loss": 3.3742, + "step": 19530 + }, + { + "epoch": 1.3272863160755537, + "grad_norm": 0.789625346660614, + "learning_rate": 0.0008341656475064547, + "loss": 3.6934, + "step": 19535 + }, + { + "epoch": 1.3276260361462155, + "grad_norm": 0.8891812562942505, + "learning_rate": 0.000834123182497622, + "loss": 3.662, + "step": 19540 + }, + { + "epoch": 1.3279657562168774, + "grad_norm": 0.8710037469863892, + "learning_rate": 0.0008340807174887893, + "loss": 3.5507, + "step": 19545 + }, + { + "epoch": 1.328305476287539, + "grad_norm": 0.6725106835365295, + "learning_rate": 0.0008340382524799566, + "loss": 3.4275, + "step": 19550 + }, + { + "epoch": 1.3286451963582009, + "grad_norm": 0.9594770073890686, + "learning_rate": 0.0008339957874711238, + "loss": 3.7487, + "step": 19555 + }, + { + "epoch": 1.3289849164288627, + "grad_norm": 0.7160376310348511, + "learning_rate": 0.000833953322462291, + "loss": 3.6952, + "step": 19560 + }, + { + "epoch": 1.3293246364995244, + "grad_norm": 1.2279534339904785, + "learning_rate": 0.0008339108574534584, + "loss": 3.4632, + "step": 19565 + }, + { + "epoch": 1.3296643565701862, + "grad_norm": 0.9995816349983215, + "learning_rate": 0.0008338683924446256, + "loss": 3.6117, + "step": 19570 + }, + { + "epoch": 1.330004076640848, + "grad_norm": 0.8015040159225464, + "learning_rate": 0.0008338259274357929, + "loss": 3.7456, + "step": 19575 + }, + { + "epoch": 1.3303437967115097, + "grad_norm": 0.8085102438926697, + "learning_rate": 0.0008337834624269603, + "loss": 3.491, + "step": 19580 + }, + { + "epoch": 1.3306835167821716, + "grad_norm": 0.8537411689758301, + "learning_rate": 0.0008337409974181275, + "loss": 3.5998, + "step": 19585 + }, + { + "epoch": 1.3310232368528332, + "grad_norm": 0.9545833468437195, + "learning_rate": 0.0008336985324092947, + "loss": 3.5538, + "step": 19590 + }, + { + "epoch": 1.331362956923495, + "grad_norm": 0.7261341214179993, + "learning_rate": 0.0008336560674004621, + "loss": 3.6739, + "step": 19595 + }, + { + "epoch": 1.331702676994157, + "grad_norm": 1.0572048425674438, + "learning_rate": 0.0008336136023916293, + "loss": 3.703, + "step": 19600 + }, + { + "epoch": 1.3320423970648185, + "grad_norm": 0.8849319815635681, + "learning_rate": 0.0008335711373827965, + "loss": 3.592, + "step": 19605 + }, + { + "epoch": 1.3323821171354804, + "grad_norm": 0.8576738238334656, + "learning_rate": 0.000833528672373964, + "loss": 3.5396, + "step": 19610 + }, + { + "epoch": 1.332721837206142, + "grad_norm": 0.9609581828117371, + "learning_rate": 0.0008334862073651312, + "loss": 3.3147, + "step": 19615 + }, + { + "epoch": 1.3330615572768039, + "grad_norm": 0.8687294721603394, + "learning_rate": 0.0008334437423562984, + "loss": 3.8806, + "step": 19620 + }, + { + "epoch": 1.3334012773474657, + "grad_norm": 0.8814398050308228, + "learning_rate": 0.0008334012773474657, + "loss": 3.7405, + "step": 19625 + }, + { + "epoch": 1.3337409974181273, + "grad_norm": 1.0255868434906006, + "learning_rate": 0.000833358812338633, + "loss": 4.0215, + "step": 19630 + }, + { + "epoch": 1.3340807174887892, + "grad_norm": 0.9433949589729309, + "learning_rate": 0.0008333163473298002, + "loss": 3.6658, + "step": 19635 + }, + { + "epoch": 1.334420437559451, + "grad_norm": 0.8163443803787231, + "learning_rate": 0.0008332738823209675, + "loss": 3.5735, + "step": 19640 + }, + { + "epoch": 1.3347601576301127, + "grad_norm": 0.722680389881134, + "learning_rate": 0.0008332314173121349, + "loss": 3.5735, + "step": 19645 + }, + { + "epoch": 1.3350998777007745, + "grad_norm": 1.3055249452590942, + "learning_rate": 0.0008331889523033021, + "loss": 3.3725, + "step": 19650 + }, + { + "epoch": 1.3354395977714364, + "grad_norm": 0.8159347176551819, + "learning_rate": 0.0008331464872944694, + "loss": 3.4361, + "step": 19655 + }, + { + "epoch": 1.335779317842098, + "grad_norm": 0.957936704158783, + "learning_rate": 0.0008331040222856366, + "loss": 3.7658, + "step": 19660 + }, + { + "epoch": 1.3361190379127599, + "grad_norm": 0.9720074534416199, + "learning_rate": 0.0008330615572768039, + "loss": 3.7043, + "step": 19665 + }, + { + "epoch": 1.3364587579834217, + "grad_norm": 0.8764237761497498, + "learning_rate": 0.0008330190922679712, + "loss": 3.6829, + "step": 19670 + }, + { + "epoch": 1.3367984780540834, + "grad_norm": 0.8784125447273254, + "learning_rate": 0.0008329766272591384, + "loss": 3.7462, + "step": 19675 + }, + { + "epoch": 1.3371381981247452, + "grad_norm": 0.9715747237205505, + "learning_rate": 0.0008329341622503058, + "loss": 3.6926, + "step": 19680 + }, + { + "epoch": 1.337477918195407, + "grad_norm": 0.7767139077186584, + "learning_rate": 0.0008328916972414731, + "loss": 3.8137, + "step": 19685 + }, + { + "epoch": 1.3378176382660687, + "grad_norm": 0.7698678970336914, + "learning_rate": 0.0008328492322326403, + "loss": 3.6207, + "step": 19690 + }, + { + "epoch": 1.3381573583367306, + "grad_norm": 0.7231669425964355, + "learning_rate": 0.0008328067672238076, + "loss": 3.6781, + "step": 19695 + }, + { + "epoch": 1.3384970784073924, + "grad_norm": 0.890603244304657, + "learning_rate": 0.0008327643022149749, + "loss": 3.4215, + "step": 19700 + }, + { + "epoch": 1.338836798478054, + "grad_norm": 0.7683752179145813, + "learning_rate": 0.0008327218372061421, + "loss": 3.666, + "step": 19705 + }, + { + "epoch": 1.3391765185487159, + "grad_norm": 5.050052642822266, + "learning_rate": 0.0008326793721973093, + "loss": 3.4734, + "step": 19710 + }, + { + "epoch": 1.3395162386193777, + "grad_norm": 0.7520052194595337, + "learning_rate": 0.0008326369071884768, + "loss": 3.4761, + "step": 19715 + }, + { + "epoch": 1.3398559586900394, + "grad_norm": 1.210484504699707, + "learning_rate": 0.000832594442179644, + "loss": 3.7027, + "step": 19720 + }, + { + "epoch": 1.3401956787607012, + "grad_norm": 0.7824100255966187, + "learning_rate": 0.0008325519771708112, + "loss": 3.6874, + "step": 19725 + }, + { + "epoch": 1.340535398831363, + "grad_norm": 1.5327448844909668, + "learning_rate": 0.0008325095121619786, + "loss": 3.4944, + "step": 19730 + }, + { + "epoch": 1.3408751189020247, + "grad_norm": 0.7447940111160278, + "learning_rate": 0.0008324670471531458, + "loss": 3.6508, + "step": 19735 + }, + { + "epoch": 1.3412148389726866, + "grad_norm": 0.8857502937316895, + "learning_rate": 0.0008324245821443131, + "loss": 3.3683, + "step": 19740 + }, + { + "epoch": 1.3415545590433484, + "grad_norm": 0.6946133971214294, + "learning_rate": 0.0008323821171354804, + "loss": 3.72, + "step": 19745 + }, + { + "epoch": 1.34189427911401, + "grad_norm": 0.9417757987976074, + "learning_rate": 0.0008323396521266477, + "loss": 3.4486, + "step": 19750 + }, + { + "epoch": 1.342233999184672, + "grad_norm": 0.7955737113952637, + "learning_rate": 0.000832297187117815, + "loss": 3.7268, + "step": 19755 + }, + { + "epoch": 1.3425737192553335, + "grad_norm": 1.126017689704895, + "learning_rate": 0.0008322547221089822, + "loss": 3.5999, + "step": 19760 + }, + { + "epoch": 1.3429134393259954, + "grad_norm": 1.0237139463424683, + "learning_rate": 0.0008322122571001495, + "loss": 3.3536, + "step": 19765 + }, + { + "epoch": 1.3432531593966572, + "grad_norm": 1.0918468236923218, + "learning_rate": 0.0008321697920913168, + "loss": 3.4942, + "step": 19770 + }, + { + "epoch": 1.3435928794673189, + "grad_norm": 1.0914114713668823, + "learning_rate": 0.000832127327082484, + "loss": 3.5147, + "step": 19775 + }, + { + "epoch": 1.3439325995379807, + "grad_norm": 0.8118355870246887, + "learning_rate": 0.0008320848620736513, + "loss": 3.6519, + "step": 19780 + }, + { + "epoch": 1.3442723196086424, + "grad_norm": 0.8225145936012268, + "learning_rate": 0.0008320423970648187, + "loss": 3.6427, + "step": 19785 + }, + { + "epoch": 1.3446120396793042, + "grad_norm": 0.7590323686599731, + "learning_rate": 0.0008319999320559859, + "loss": 3.784, + "step": 19790 + }, + { + "epoch": 1.344951759749966, + "grad_norm": 0.757396936416626, + "learning_rate": 0.0008319574670471532, + "loss": 3.7238, + "step": 19795 + }, + { + "epoch": 1.3452914798206277, + "grad_norm": 0.9419365525245667, + "learning_rate": 0.0008319150020383205, + "loss": 3.4881, + "step": 19800 + }, + { + "epoch": 1.3456311998912895, + "grad_norm": 0.7802294492721558, + "learning_rate": 0.0008318725370294877, + "loss": 3.5983, + "step": 19805 + }, + { + "epoch": 1.3459709199619514, + "grad_norm": 0.7338426113128662, + "learning_rate": 0.0008318300720206549, + "loss": 3.4188, + "step": 19810 + }, + { + "epoch": 1.346310640032613, + "grad_norm": 1.020233154296875, + "learning_rate": 0.0008317876070118223, + "loss": 3.592, + "step": 19815 + }, + { + "epoch": 1.3466503601032749, + "grad_norm": 1.6031413078308105, + "learning_rate": 0.0008317451420029896, + "loss": 3.5115, + "step": 19820 + }, + { + "epoch": 1.3469900801739367, + "grad_norm": 0.7466903328895569, + "learning_rate": 0.0008317026769941568, + "loss": 3.5347, + "step": 19825 + }, + { + "epoch": 1.3473298002445984, + "grad_norm": 0.7906804084777832, + "learning_rate": 0.0008316602119853242, + "loss": 3.5642, + "step": 19830 + }, + { + "epoch": 1.3476695203152602, + "grad_norm": 0.6610172986984253, + "learning_rate": 0.0008316177469764914, + "loss": 3.7409, + "step": 19835 + }, + { + "epoch": 1.348009240385922, + "grad_norm": 0.8404771685600281, + "learning_rate": 0.0008315752819676586, + "loss": 3.5611, + "step": 19840 + }, + { + "epoch": 1.3483489604565837, + "grad_norm": 0.8754986524581909, + "learning_rate": 0.000831532816958826, + "loss": 3.5726, + "step": 19845 + }, + { + "epoch": 1.3486886805272456, + "grad_norm": 0.906295895576477, + "learning_rate": 0.0008314903519499932, + "loss": 3.759, + "step": 19850 + }, + { + "epoch": 1.3490284005979074, + "grad_norm": 0.6729426383972168, + "learning_rate": 0.0008314478869411605, + "loss": 3.6824, + "step": 19855 + }, + { + "epoch": 1.349368120668569, + "grad_norm": 0.8578769564628601, + "learning_rate": 0.0008314054219323278, + "loss": 3.6098, + "step": 19860 + }, + { + "epoch": 1.349707840739231, + "grad_norm": 1.02249276638031, + "learning_rate": 0.0008313629569234951, + "loss": 3.7102, + "step": 19865 + }, + { + "epoch": 1.3500475608098927, + "grad_norm": 0.9229472279548645, + "learning_rate": 0.0008313204919146623, + "loss": 3.7223, + "step": 19870 + }, + { + "epoch": 1.3503872808805544, + "grad_norm": 1.0774152278900146, + "learning_rate": 0.0008312780269058296, + "loss": 3.9041, + "step": 19875 + }, + { + "epoch": 1.3507270009512162, + "grad_norm": 0.806865930557251, + "learning_rate": 0.0008312355618969969, + "loss": 3.878, + "step": 19880 + }, + { + "epoch": 1.351066721021878, + "grad_norm": 0.7242082357406616, + "learning_rate": 0.0008311930968881641, + "loss": 3.7391, + "step": 19885 + }, + { + "epoch": 1.3514064410925397, + "grad_norm": 0.854021430015564, + "learning_rate": 0.0008311506318793315, + "loss": 3.4395, + "step": 19890 + }, + { + "epoch": 1.3517461611632016, + "grad_norm": 0.8101058006286621, + "learning_rate": 0.0008311081668704988, + "loss": 3.7125, + "step": 19895 + }, + { + "epoch": 1.3520858812338634, + "grad_norm": 0.8974558711051941, + "learning_rate": 0.000831065701861666, + "loss": 3.5662, + "step": 19900 + }, + { + "epoch": 1.352425601304525, + "grad_norm": 0.7171255946159363, + "learning_rate": 0.0008310232368528333, + "loss": 3.5799, + "step": 19905 + }, + { + "epoch": 1.352765321375187, + "grad_norm": 1.0325874090194702, + "learning_rate": 0.0008309807718440005, + "loss": 3.6535, + "step": 19910 + }, + { + "epoch": 1.3531050414458488, + "grad_norm": 0.7834596633911133, + "learning_rate": 0.0008309383068351678, + "loss": 3.6895, + "step": 19915 + }, + { + "epoch": 1.3534447615165104, + "grad_norm": 0.9764876365661621, + "learning_rate": 0.0008308958418263351, + "loss": 3.6886, + "step": 19920 + }, + { + "epoch": 1.3537844815871722, + "grad_norm": 0.8145744800567627, + "learning_rate": 0.0008308533768175024, + "loss": 3.6058, + "step": 19925 + }, + { + "epoch": 1.3541242016578339, + "grad_norm": 0.775444507598877, + "learning_rate": 0.0008308109118086697, + "loss": 3.8489, + "step": 19930 + }, + { + "epoch": 1.3544639217284957, + "grad_norm": 0.789652943611145, + "learning_rate": 0.000830768446799837, + "loss": 3.7811, + "step": 19935 + }, + { + "epoch": 1.3548036417991576, + "grad_norm": 1.0224660634994507, + "learning_rate": 0.0008307259817910042, + "loss": 3.5583, + "step": 19940 + }, + { + "epoch": 1.3551433618698192, + "grad_norm": 1.9743516445159912, + "learning_rate": 0.0008306835167821714, + "loss": 3.7038, + "step": 19945 + }, + { + "epoch": 1.355483081940481, + "grad_norm": 0.7963464260101318, + "learning_rate": 0.0008306410517733388, + "loss": 3.6928, + "step": 19950 + }, + { + "epoch": 1.3558228020111427, + "grad_norm": 0.7329343557357788, + "learning_rate": 0.000830598586764506, + "loss": 3.6436, + "step": 19955 + }, + { + "epoch": 1.3561625220818045, + "grad_norm": 0.8114310503005981, + "learning_rate": 0.0008305561217556733, + "loss": 3.8064, + "step": 19960 + }, + { + "epoch": 1.3565022421524664, + "grad_norm": 0.8262535333633423, + "learning_rate": 0.0008305136567468407, + "loss": 3.8235, + "step": 19965 + }, + { + "epoch": 1.356841962223128, + "grad_norm": 0.850269079208374, + "learning_rate": 0.0008304711917380079, + "loss": 3.5684, + "step": 19970 + }, + { + "epoch": 1.3571816822937899, + "grad_norm": 0.7225843071937561, + "learning_rate": 0.0008304287267291751, + "loss": 3.6052, + "step": 19975 + }, + { + "epoch": 1.3575214023644517, + "grad_norm": 0.6841182112693787, + "learning_rate": 0.0008303862617203425, + "loss": 3.7209, + "step": 19980 + }, + { + "epoch": 1.3578611224351134, + "grad_norm": 0.6611128449440002, + "learning_rate": 0.0008303437967115097, + "loss": 3.4901, + "step": 19985 + }, + { + "epoch": 1.3582008425057752, + "grad_norm": 0.6489830613136292, + "learning_rate": 0.0008303013317026769, + "loss": 3.6066, + "step": 19990 + }, + { + "epoch": 1.358540562576437, + "grad_norm": 0.8354825377464294, + "learning_rate": 0.0008302588666938444, + "loss": 3.4372, + "step": 19995 + }, + { + "epoch": 1.3588802826470987, + "grad_norm": 0.7869886755943298, + "learning_rate": 0.0008302164016850116, + "loss": 3.7341, + "step": 20000 + }, + { + "epoch": 1.3592200027177606, + "grad_norm": 0.7658554315567017, + "learning_rate": 0.0008301739366761788, + "loss": 3.8667, + "step": 20005 + }, + { + "epoch": 1.3595597227884224, + "grad_norm": 1.0140355825424194, + "learning_rate": 0.0008301314716673461, + "loss": 3.8069, + "step": 20010 + }, + { + "epoch": 1.359899442859084, + "grad_norm": 0.7940161824226379, + "learning_rate": 0.0008300890066585134, + "loss": 3.8183, + "step": 20015 + }, + { + "epoch": 1.360239162929746, + "grad_norm": 0.6864373087882996, + "learning_rate": 0.0008300465416496806, + "loss": 3.7209, + "step": 20020 + }, + { + "epoch": 1.3605788830004077, + "grad_norm": 0.9016879200935364, + "learning_rate": 0.000830004076640848, + "loss": 3.5222, + "step": 20025 + }, + { + "epoch": 1.3609186030710694, + "grad_norm": 0.7155046463012695, + "learning_rate": 0.0008299616116320153, + "loss": 3.5669, + "step": 20030 + }, + { + "epoch": 1.3612583231417312, + "grad_norm": 0.7987703680992126, + "learning_rate": 0.0008299191466231825, + "loss": 3.7817, + "step": 20035 + }, + { + "epoch": 1.361598043212393, + "grad_norm": 0.8040763735771179, + "learning_rate": 0.0008298766816143498, + "loss": 3.7256, + "step": 20040 + }, + { + "epoch": 1.3619377632830547, + "grad_norm": 1.446528434753418, + "learning_rate": 0.000829834216605517, + "loss": 3.5137, + "step": 20045 + }, + { + "epoch": 1.3622774833537166, + "grad_norm": 0.9019175171852112, + "learning_rate": 0.0008297917515966843, + "loss": 3.6882, + "step": 20050 + }, + { + "epoch": 1.3626172034243784, + "grad_norm": 0.8347340226173401, + "learning_rate": 0.0008297492865878516, + "loss": 3.6436, + "step": 20055 + }, + { + "epoch": 1.36295692349504, + "grad_norm": 0.6172686815261841, + "learning_rate": 0.0008297068215790189, + "loss": 3.6482, + "step": 20060 + }, + { + "epoch": 1.363296643565702, + "grad_norm": 0.8235265612602234, + "learning_rate": 0.0008296643565701862, + "loss": 3.749, + "step": 20065 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.000372290611267, + "learning_rate": 0.0008296218915613535, + "loss": 3.6244, + "step": 20070 + }, + { + "epoch": 1.3639760837070254, + "grad_norm": 0.9786437749862671, + "learning_rate": 0.0008295794265525207, + "loss": 3.6754, + "step": 20075 + }, + { + "epoch": 1.3643158037776872, + "grad_norm": 0.8076041340827942, + "learning_rate": 0.0008295369615436881, + "loss": 3.5496, + "step": 20080 + }, + { + "epoch": 1.364655523848349, + "grad_norm": 0.9285458922386169, + "learning_rate": 0.0008294944965348553, + "loss": 3.464, + "step": 20085 + }, + { + "epoch": 1.3649952439190107, + "grad_norm": 0.7768741846084595, + "learning_rate": 0.0008294520315260225, + "loss": 3.578, + "step": 20090 + }, + { + "epoch": 1.3653349639896726, + "grad_norm": 0.688302218914032, + "learning_rate": 0.00082940956651719, + "loss": 3.5278, + "step": 20095 + }, + { + "epoch": 1.3656746840603342, + "grad_norm": 0.8040573596954346, + "learning_rate": 0.0008293671015083572, + "loss": 3.6319, + "step": 20100 + }, + { + "epoch": 1.366014404130996, + "grad_norm": 0.8600893616676331, + "learning_rate": 0.0008293246364995244, + "loss": 3.7086, + "step": 20105 + }, + { + "epoch": 1.366354124201658, + "grad_norm": 1.1413556337356567, + "learning_rate": 0.0008292821714906917, + "loss": 3.4148, + "step": 20110 + }, + { + "epoch": 1.3666938442723195, + "grad_norm": 0.8786232471466064, + "learning_rate": 0.000829239706481859, + "loss": 3.6449, + "step": 20115 + }, + { + "epoch": 1.3670335643429814, + "grad_norm": 0.845450222492218, + "learning_rate": 0.0008291972414730262, + "loss": 3.1408, + "step": 20120 + }, + { + "epoch": 1.367373284413643, + "grad_norm": 0.7747614979743958, + "learning_rate": 0.0008291547764641935, + "loss": 3.4571, + "step": 20125 + }, + { + "epoch": 1.3677130044843049, + "grad_norm": 0.905851423740387, + "learning_rate": 0.0008291123114553609, + "loss": 3.5839, + "step": 20130 + }, + { + "epoch": 1.3680527245549667, + "grad_norm": 0.8281379342079163, + "learning_rate": 0.0008290698464465281, + "loss": 3.6969, + "step": 20135 + }, + { + "epoch": 1.3683924446256284, + "grad_norm": 0.7702296376228333, + "learning_rate": 0.0008290273814376954, + "loss": 3.3568, + "step": 20140 + }, + { + "epoch": 1.3687321646962902, + "grad_norm": 1.017757773399353, + "learning_rate": 0.0008289849164288626, + "loss": 3.5411, + "step": 20145 + }, + { + "epoch": 1.369071884766952, + "grad_norm": 0.8192773461341858, + "learning_rate": 0.0008289424514200299, + "loss": 3.3569, + "step": 20150 + }, + { + "epoch": 1.3694116048376137, + "grad_norm": 0.9206108450889587, + "learning_rate": 0.0008288999864111972, + "loss": 3.784, + "step": 20155 + }, + { + "epoch": 1.3697513249082756, + "grad_norm": 0.877634584903717, + "learning_rate": 0.0008288575214023644, + "loss": 3.5919, + "step": 20160 + }, + { + "epoch": 1.3700910449789374, + "grad_norm": 0.9275442361831665, + "learning_rate": 0.0008288150563935318, + "loss": 3.6594, + "step": 20165 + }, + { + "epoch": 1.370430765049599, + "grad_norm": 0.8467435240745544, + "learning_rate": 0.0008287725913846991, + "loss": 3.4507, + "step": 20170 + }, + { + "epoch": 1.370770485120261, + "grad_norm": 0.765204668045044, + "learning_rate": 0.0008287301263758663, + "loss": 3.4571, + "step": 20175 + }, + { + "epoch": 1.3711102051909227, + "grad_norm": 0.6651422381401062, + "learning_rate": 0.0008286876613670336, + "loss": 3.6317, + "step": 20180 + }, + { + "epoch": 1.3714499252615844, + "grad_norm": 0.7055399417877197, + "learning_rate": 0.0008286451963582009, + "loss": 3.2672, + "step": 20185 + }, + { + "epoch": 1.3717896453322462, + "grad_norm": 0.949308454990387, + "learning_rate": 0.0008286027313493681, + "loss": 3.7542, + "step": 20190 + }, + { + "epoch": 1.372129365402908, + "grad_norm": 0.9752681851387024, + "learning_rate": 0.0008285602663405353, + "loss": 3.3879, + "step": 20195 + }, + { + "epoch": 1.3724690854735697, + "grad_norm": 0.8987239599227905, + "learning_rate": 0.0008285178013317028, + "loss": 3.3333, + "step": 20200 + }, + { + "epoch": 1.3728088055442316, + "grad_norm": 0.6393351554870605, + "learning_rate": 0.00082847533632287, + "loss": 3.4096, + "step": 20205 + }, + { + "epoch": 1.3731485256148934, + "grad_norm": 0.8089986443519592, + "learning_rate": 0.0008284328713140372, + "loss": 3.5044, + "step": 20210 + }, + { + "epoch": 1.373488245685555, + "grad_norm": 0.7524659037590027, + "learning_rate": 0.0008283904063052046, + "loss": 3.5909, + "step": 20215 + }, + { + "epoch": 1.373827965756217, + "grad_norm": 0.9002319574356079, + "learning_rate": 0.0008283479412963718, + "loss": 3.3949, + "step": 20220 + }, + { + "epoch": 1.3741676858268788, + "grad_norm": 0.7558292746543884, + "learning_rate": 0.000828305476287539, + "loss": 3.662, + "step": 20225 + }, + { + "epoch": 1.3745074058975404, + "grad_norm": 0.5760164856910706, + "learning_rate": 0.0008282630112787064, + "loss": 3.3376, + "step": 20230 + }, + { + "epoch": 1.3748471259682022, + "grad_norm": 1.0034853219985962, + "learning_rate": 0.0008282205462698737, + "loss": 3.564, + "step": 20235 + }, + { + "epoch": 1.375186846038864, + "grad_norm": 0.8343245983123779, + "learning_rate": 0.0008281780812610409, + "loss": 3.4775, + "step": 20240 + }, + { + "epoch": 1.3755265661095257, + "grad_norm": 0.7684645056724548, + "learning_rate": 0.0008281356162522083, + "loss": 3.5033, + "step": 20245 + }, + { + "epoch": 1.3758662861801876, + "grad_norm": 0.8841187357902527, + "learning_rate": 0.0008280931512433755, + "loss": 3.5006, + "step": 20250 + }, + { + "epoch": 1.3762060062508494, + "grad_norm": 0.855969250202179, + "learning_rate": 0.0008280506862345427, + "loss": 3.5093, + "step": 20255 + }, + { + "epoch": 1.376545726321511, + "grad_norm": 0.8306503891944885, + "learning_rate": 0.00082800822122571, + "loss": 3.4253, + "step": 20260 + }, + { + "epoch": 1.376885446392173, + "grad_norm": 1.137816071510315, + "learning_rate": 0.0008279657562168773, + "loss": 3.6305, + "step": 20265 + }, + { + "epoch": 1.3772251664628345, + "grad_norm": 1.0581587553024292, + "learning_rate": 0.0008279232912080446, + "loss": 3.8285, + "step": 20270 + }, + { + "epoch": 1.3775648865334964, + "grad_norm": 0.9015575647354126, + "learning_rate": 0.0008278808261992119, + "loss": 3.6158, + "step": 20275 + }, + { + "epoch": 1.3779046066041583, + "grad_norm": 0.7637923955917358, + "learning_rate": 0.0008278383611903792, + "loss": 3.6722, + "step": 20280 + }, + { + "epoch": 1.3782443266748199, + "grad_norm": 1.1021335124969482, + "learning_rate": 0.0008277958961815464, + "loss": 3.503, + "step": 20285 + }, + { + "epoch": 1.3785840467454817, + "grad_norm": 0.9594349265098572, + "learning_rate": 0.0008277534311727137, + "loss": 3.7998, + "step": 20290 + }, + { + "epoch": 1.3789237668161434, + "grad_norm": 0.7602871656417847, + "learning_rate": 0.0008277109661638809, + "loss": 3.7475, + "step": 20295 + }, + { + "epoch": 1.3792634868868052, + "grad_norm": 0.6606634259223938, + "learning_rate": 0.0008276685011550482, + "loss": 3.7967, + "step": 20300 + }, + { + "epoch": 1.379603206957467, + "grad_norm": 0.8870893716812134, + "learning_rate": 0.0008276260361462156, + "loss": 3.3971, + "step": 20305 + }, + { + "epoch": 1.3799429270281287, + "grad_norm": 0.6809976696968079, + "learning_rate": 0.0008275835711373828, + "loss": 3.6337, + "step": 20310 + }, + { + "epoch": 1.3802826470987906, + "grad_norm": 0.9260888695716858, + "learning_rate": 0.0008275411061285501, + "loss": 3.7347, + "step": 20315 + }, + { + "epoch": 1.3806223671694524, + "grad_norm": 1.0469493865966797, + "learning_rate": 0.0008274986411197174, + "loss": 3.6329, + "step": 20320 + }, + { + "epoch": 1.380962087240114, + "grad_norm": 0.991367757320404, + "learning_rate": 0.0008274561761108846, + "loss": 3.522, + "step": 20325 + }, + { + "epoch": 1.381301807310776, + "grad_norm": 0.7290663719177246, + "learning_rate": 0.0008274137111020519, + "loss": 3.5739, + "step": 20330 + }, + { + "epoch": 1.3816415273814378, + "grad_norm": 0.7811363935470581, + "learning_rate": 0.0008273712460932192, + "loss": 3.5296, + "step": 20335 + }, + { + "epoch": 1.3819812474520994, + "grad_norm": 0.7066630125045776, + "learning_rate": 0.0008273287810843865, + "loss": 3.7001, + "step": 20340 + }, + { + "epoch": 1.3823209675227612, + "grad_norm": 0.827725887298584, + "learning_rate": 0.0008272863160755537, + "loss": 3.5584, + "step": 20345 + }, + { + "epoch": 1.382660687593423, + "grad_norm": 0.7361013889312744, + "learning_rate": 0.0008272438510667211, + "loss": 3.5475, + "step": 20350 + }, + { + "epoch": 1.3830004076640847, + "grad_norm": 1.0498051643371582, + "learning_rate": 0.0008272013860578883, + "loss": 3.423, + "step": 20355 + }, + { + "epoch": 1.3833401277347466, + "grad_norm": 0.8487643599510193, + "learning_rate": 0.0008271589210490555, + "loss": 3.4709, + "step": 20360 + }, + { + "epoch": 1.3836798478054084, + "grad_norm": 0.8433998227119446, + "learning_rate": 0.0008271164560402229, + "loss": 3.7473, + "step": 20365 + }, + { + "epoch": 1.38401956787607, + "grad_norm": 0.7505950331687927, + "learning_rate": 0.0008270739910313901, + "loss": 3.6712, + "step": 20370 + }, + { + "epoch": 1.384359287946732, + "grad_norm": 0.8113127946853638, + "learning_rate": 0.0008270315260225574, + "loss": 3.61, + "step": 20375 + }, + { + "epoch": 1.3846990080173938, + "grad_norm": 0.6785798668861389, + "learning_rate": 0.0008269890610137248, + "loss": 3.8371, + "step": 20380 + }, + { + "epoch": 1.3850387280880554, + "grad_norm": 0.7709731459617615, + "learning_rate": 0.000826946596004892, + "loss": 3.7655, + "step": 20385 + }, + { + "epoch": 1.3853784481587172, + "grad_norm": 0.6532859206199646, + "learning_rate": 0.0008269041309960592, + "loss": 3.4945, + "step": 20390 + }, + { + "epoch": 1.385718168229379, + "grad_norm": 0.9087226986885071, + "learning_rate": 0.0008268616659872265, + "loss": 3.6312, + "step": 20395 + }, + { + "epoch": 1.3860578883000407, + "grad_norm": 0.923051655292511, + "learning_rate": 0.0008268192009783938, + "loss": 3.5325, + "step": 20400 + }, + { + "epoch": 1.3863976083707026, + "grad_norm": 1.148252010345459, + "learning_rate": 0.000826776735969561, + "loss": 3.5461, + "step": 20405 + }, + { + "epoch": 1.3867373284413644, + "grad_norm": 0.7475162148475647, + "learning_rate": 0.0008267342709607284, + "loss": 3.5932, + "step": 20410 + }, + { + "epoch": 1.387077048512026, + "grad_norm": 1.0134711265563965, + "learning_rate": 0.0008266918059518957, + "loss": 3.7088, + "step": 20415 + }, + { + "epoch": 1.387416768582688, + "grad_norm": 0.6765879392623901, + "learning_rate": 0.000826649340943063, + "loss": 3.6954, + "step": 20420 + }, + { + "epoch": 1.3877564886533498, + "grad_norm": 0.6751094460487366, + "learning_rate": 0.0008266068759342302, + "loss": 3.5186, + "step": 20425 + }, + { + "epoch": 1.3880962087240114, + "grad_norm": 0.7552658915519714, + "learning_rate": 0.0008265644109253975, + "loss": 3.5904, + "step": 20430 + }, + { + "epoch": 1.3884359287946733, + "grad_norm": 1.1143152713775635, + "learning_rate": 0.0008265219459165648, + "loss": 3.7687, + "step": 20435 + }, + { + "epoch": 1.3887756488653349, + "grad_norm": 0.7853351831436157, + "learning_rate": 0.000826479480907732, + "loss": 3.4545, + "step": 20440 + }, + { + "epoch": 1.3891153689359967, + "grad_norm": 0.855912446975708, + "learning_rate": 0.0008264370158988993, + "loss": 3.7527, + "step": 20445 + }, + { + "epoch": 1.3894550890066586, + "grad_norm": 1.0656296014785767, + "learning_rate": 0.0008263945508900667, + "loss": 3.6074, + "step": 20450 + }, + { + "epoch": 1.3897948090773202, + "grad_norm": 0.7564384341239929, + "learning_rate": 0.0008263520858812339, + "loss": 3.4093, + "step": 20455 + }, + { + "epoch": 1.390134529147982, + "grad_norm": 0.9112916588783264, + "learning_rate": 0.0008263096208724011, + "loss": 3.7937, + "step": 20460 + }, + { + "epoch": 1.3904742492186437, + "grad_norm": 0.680229127407074, + "learning_rate": 0.0008262671558635685, + "loss": 3.5322, + "step": 20465 + }, + { + "epoch": 1.3908139692893056, + "grad_norm": 0.8896828889846802, + "learning_rate": 0.0008262246908547357, + "loss": 3.4118, + "step": 20470 + }, + { + "epoch": 1.3911536893599674, + "grad_norm": 0.7922264933586121, + "learning_rate": 0.0008261822258459029, + "loss": 3.576, + "step": 20475 + }, + { + "epoch": 1.391493409430629, + "grad_norm": 2.019672393798828, + "learning_rate": 0.0008261397608370704, + "loss": 3.6122, + "step": 20480 + }, + { + "epoch": 1.391833129501291, + "grad_norm": 0.871623694896698, + "learning_rate": 0.0008260972958282376, + "loss": 3.4604, + "step": 20485 + }, + { + "epoch": 1.3921728495719528, + "grad_norm": 0.7731744050979614, + "learning_rate": 0.0008260548308194048, + "loss": 3.6007, + "step": 20490 + }, + { + "epoch": 1.3925125696426144, + "grad_norm": 0.8436648845672607, + "learning_rate": 0.0008260123658105721, + "loss": 3.4678, + "step": 20495 + }, + { + "epoch": 1.3928522897132762, + "grad_norm": 0.9880411028862, + "learning_rate": 0.0008259699008017394, + "loss": 3.5159, + "step": 20500 + }, + { + "epoch": 1.393192009783938, + "grad_norm": 0.8878276348114014, + "learning_rate": 0.0008259274357929066, + "loss": 3.553, + "step": 20505 + }, + { + "epoch": 1.3935317298545997, + "grad_norm": 0.836351215839386, + "learning_rate": 0.0008258849707840739, + "loss": 3.5583, + "step": 20510 + }, + { + "epoch": 1.3938714499252616, + "grad_norm": 0.7861984968185425, + "learning_rate": 0.0008258425057752413, + "loss": 3.5028, + "step": 20515 + }, + { + "epoch": 1.3942111699959234, + "grad_norm": 1.0536606311798096, + "learning_rate": 0.0008258000407664085, + "loss": 3.7537, + "step": 20520 + }, + { + "epoch": 1.394550890066585, + "grad_norm": 0.9899960160255432, + "learning_rate": 0.0008257575757575758, + "loss": 3.5896, + "step": 20525 + }, + { + "epoch": 1.394890610137247, + "grad_norm": 0.9633844494819641, + "learning_rate": 0.000825715110748743, + "loss": 3.343, + "step": 20530 + }, + { + "epoch": 1.3952303302079088, + "grad_norm": 0.7398457527160645, + "learning_rate": 0.0008256726457399103, + "loss": 3.6749, + "step": 20535 + }, + { + "epoch": 1.3955700502785704, + "grad_norm": 0.6418601870536804, + "learning_rate": 0.0008256301807310776, + "loss": 3.7318, + "step": 20540 + }, + { + "epoch": 1.3959097703492322, + "grad_norm": 0.9466742277145386, + "learning_rate": 0.0008255877157222448, + "loss": 3.6649, + "step": 20545 + }, + { + "epoch": 1.396249490419894, + "grad_norm": 0.8195507526397705, + "learning_rate": 0.0008255452507134122, + "loss": 3.3938, + "step": 20550 + }, + { + "epoch": 1.3965892104905557, + "grad_norm": 0.6578526496887207, + "learning_rate": 0.0008255027857045795, + "loss": 3.4102, + "step": 20555 + }, + { + "epoch": 1.3969289305612176, + "grad_norm": 0.6360197067260742, + "learning_rate": 0.0008254603206957467, + "loss": 3.5665, + "step": 20560 + }, + { + "epoch": 1.3972686506318794, + "grad_norm": 0.7911432385444641, + "learning_rate": 0.000825417855686914, + "loss": 3.6368, + "step": 20565 + }, + { + "epoch": 1.397608370702541, + "grad_norm": 0.9518787264823914, + "learning_rate": 0.0008253753906780813, + "loss": 3.5214, + "step": 20570 + }, + { + "epoch": 1.397948090773203, + "grad_norm": 0.7508004903793335, + "learning_rate": 0.0008253329256692485, + "loss": 3.2961, + "step": 20575 + }, + { + "epoch": 1.3982878108438648, + "grad_norm": 0.8278752565383911, + "learning_rate": 0.0008252904606604157, + "loss": 3.6308, + "step": 20580 + }, + { + "epoch": 1.3986275309145264, + "grad_norm": 0.9088984131813049, + "learning_rate": 0.0008252479956515832, + "loss": 3.562, + "step": 20585 + }, + { + "epoch": 1.3989672509851883, + "grad_norm": 0.8039524555206299, + "learning_rate": 0.0008252055306427504, + "loss": 3.669, + "step": 20590 + }, + { + "epoch": 1.3993069710558501, + "grad_norm": 1.1063014268875122, + "learning_rate": 0.0008251630656339176, + "loss": 3.4901, + "step": 20595 + }, + { + "epoch": 1.3996466911265117, + "grad_norm": 1.101318597793579, + "learning_rate": 0.000825120600625085, + "loss": 3.5205, + "step": 20600 + }, + { + "epoch": 1.3999864111971736, + "grad_norm": 0.9618262052536011, + "learning_rate": 0.0008250781356162522, + "loss": 3.7807, + "step": 20605 + }, + { + "epoch": 1.4003261312678352, + "grad_norm": 0.8230213522911072, + "learning_rate": 0.0008250356706074194, + "loss": 3.6007, + "step": 20610 + }, + { + "epoch": 1.400665851338497, + "grad_norm": 0.6376248002052307, + "learning_rate": 0.0008249932055985869, + "loss": 3.7622, + "step": 20615 + }, + { + "epoch": 1.401005571409159, + "grad_norm": 0.7768716812133789, + "learning_rate": 0.0008249507405897541, + "loss": 3.5362, + "step": 20620 + }, + { + "epoch": 1.4013452914798206, + "grad_norm": 1.034282922744751, + "learning_rate": 0.0008249082755809213, + "loss": 3.5445, + "step": 20625 + }, + { + "epoch": 1.4016850115504824, + "grad_norm": 0.5794238448143005, + "learning_rate": 0.0008248658105720887, + "loss": 3.6084, + "step": 20630 + }, + { + "epoch": 1.402024731621144, + "grad_norm": 0.9354327321052551, + "learning_rate": 0.0008248233455632559, + "loss": 3.3926, + "step": 20635 + }, + { + "epoch": 1.402364451691806, + "grad_norm": 0.8412973880767822, + "learning_rate": 0.0008247808805544231, + "loss": 4.088, + "step": 20640 + }, + { + "epoch": 1.4027041717624678, + "grad_norm": 0.994213342666626, + "learning_rate": 0.0008247384155455904, + "loss": 3.9227, + "step": 20645 + }, + { + "epoch": 1.4030438918331294, + "grad_norm": 1.0832910537719727, + "learning_rate": 0.0008246959505367578, + "loss": 3.4526, + "step": 20650 + }, + { + "epoch": 1.4033836119037912, + "grad_norm": 0.7778284549713135, + "learning_rate": 0.000824653485527925, + "loss": 3.4258, + "step": 20655 + }, + { + "epoch": 1.403723331974453, + "grad_norm": 0.72674161195755, + "learning_rate": 0.0008246110205190923, + "loss": 3.6148, + "step": 20660 + }, + { + "epoch": 1.4040630520451147, + "grad_norm": 0.8473602533340454, + "learning_rate": 0.0008245685555102596, + "loss": 3.7554, + "step": 20665 + }, + { + "epoch": 1.4044027721157766, + "grad_norm": 0.8490917086601257, + "learning_rate": 0.0008245260905014268, + "loss": 3.5637, + "step": 20670 + }, + { + "epoch": 1.4047424921864384, + "grad_norm": 0.7462838888168335, + "learning_rate": 0.0008244836254925941, + "loss": 3.5594, + "step": 20675 + }, + { + "epoch": 1.4050822122571, + "grad_norm": 0.7321681380271912, + "learning_rate": 0.0008244411604837613, + "loss": 3.7164, + "step": 20680 + }, + { + "epoch": 1.405421932327762, + "grad_norm": 0.6672002673149109, + "learning_rate": 0.0008243986954749287, + "loss": 3.5839, + "step": 20685 + }, + { + "epoch": 1.4057616523984238, + "grad_norm": 0.6962081789970398, + "learning_rate": 0.000824356230466096, + "loss": 3.4026, + "step": 20690 + }, + { + "epoch": 1.4061013724690854, + "grad_norm": 0.7916650772094727, + "learning_rate": 0.0008243137654572632, + "loss": 3.5734, + "step": 20695 + }, + { + "epoch": 1.4064410925397473, + "grad_norm": 0.902154803276062, + "learning_rate": 0.0008242713004484305, + "loss": 3.654, + "step": 20700 + }, + { + "epoch": 1.406780812610409, + "grad_norm": 0.8410627841949463, + "learning_rate": 0.0008242288354395978, + "loss": 3.4721, + "step": 20705 + }, + { + "epoch": 1.4071205326810707, + "grad_norm": 0.8136631846427917, + "learning_rate": 0.000824186370430765, + "loss": 3.8642, + "step": 20710 + }, + { + "epoch": 1.4074602527517326, + "grad_norm": 0.9130783677101135, + "learning_rate": 0.0008241439054219323, + "loss": 3.393, + "step": 20715 + }, + { + "epoch": 1.4077999728223944, + "grad_norm": 0.8609444499015808, + "learning_rate": 0.0008241014404130997, + "loss": 3.3423, + "step": 20720 + }, + { + "epoch": 1.408139692893056, + "grad_norm": 1.4656940698623657, + "learning_rate": 0.0008240589754042669, + "loss": 3.4804, + "step": 20725 + }, + { + "epoch": 1.408479412963718, + "grad_norm": 1.0013593435287476, + "learning_rate": 0.0008240165103954341, + "loss": 3.4419, + "step": 20730 + }, + { + "epoch": 1.4088191330343798, + "grad_norm": 0.8366342782974243, + "learning_rate": 0.0008239740453866015, + "loss": 3.6317, + "step": 20735 + }, + { + "epoch": 1.4091588531050414, + "grad_norm": 1.046828031539917, + "learning_rate": 0.0008239315803777687, + "loss": 3.4431, + "step": 20740 + }, + { + "epoch": 1.4094985731757033, + "grad_norm": 0.8617599606513977, + "learning_rate": 0.0008238891153689359, + "loss": 3.5064, + "step": 20745 + }, + { + "epoch": 1.4098382932463651, + "grad_norm": 1.3460118770599365, + "learning_rate": 0.0008238466503601033, + "loss": 3.6917, + "step": 20750 + }, + { + "epoch": 1.4101780133170267, + "grad_norm": 0.8538772463798523, + "learning_rate": 0.0008238041853512706, + "loss": 3.4111, + "step": 20755 + }, + { + "epoch": 1.4105177333876886, + "grad_norm": 1.0844227075576782, + "learning_rate": 0.0008237617203424379, + "loss": 3.5716, + "step": 20760 + }, + { + "epoch": 1.4108574534583505, + "grad_norm": 0.779915452003479, + "learning_rate": 0.0008237192553336052, + "loss": 3.6708, + "step": 20765 + }, + { + "epoch": 1.411197173529012, + "grad_norm": 0.8108894228935242, + "learning_rate": 0.0008236767903247724, + "loss": 3.6889, + "step": 20770 + }, + { + "epoch": 1.411536893599674, + "grad_norm": 0.9885055422782898, + "learning_rate": 0.0008236343253159397, + "loss": 3.5759, + "step": 20775 + }, + { + "epoch": 1.4118766136703356, + "grad_norm": 0.7984120845794678, + "learning_rate": 0.000823591860307107, + "loss": 3.8149, + "step": 20780 + }, + { + "epoch": 1.4122163337409974, + "grad_norm": 0.9806830883026123, + "learning_rate": 0.0008235493952982742, + "loss": 3.6116, + "step": 20785 + }, + { + "epoch": 1.4125560538116593, + "grad_norm": 1.10316002368927, + "learning_rate": 0.0008235069302894416, + "loss": 3.5597, + "step": 20790 + }, + { + "epoch": 1.412895773882321, + "grad_norm": 0.7780253291130066, + "learning_rate": 0.0008234644652806088, + "loss": 3.8044, + "step": 20795 + }, + { + "epoch": 1.4132354939529828, + "grad_norm": 2.149338960647583, + "learning_rate": 0.0008234220002717761, + "loss": 3.5811, + "step": 20800 + }, + { + "epoch": 1.4135752140236444, + "grad_norm": 0.7096241116523743, + "learning_rate": 0.0008233795352629434, + "loss": 3.5349, + "step": 20805 + }, + { + "epoch": 1.4139149340943062, + "grad_norm": 0.771761953830719, + "learning_rate": 0.0008233370702541106, + "loss": 3.4484, + "step": 20810 + }, + { + "epoch": 1.414254654164968, + "grad_norm": 0.9140762686729431, + "learning_rate": 0.0008232946052452779, + "loss": 3.7856, + "step": 20815 + }, + { + "epoch": 1.4145943742356297, + "grad_norm": 1.0000709295272827, + "learning_rate": 0.0008232521402364452, + "loss": 3.6642, + "step": 20820 + }, + { + "epoch": 1.4149340943062916, + "grad_norm": 0.7121148705482483, + "learning_rate": 0.0008232096752276125, + "loss": 3.7125, + "step": 20825 + }, + { + "epoch": 1.4152738143769534, + "grad_norm": 1.1866899728775024, + "learning_rate": 0.0008231672102187797, + "loss": 3.6587, + "step": 20830 + }, + { + "epoch": 1.415613534447615, + "grad_norm": 0.7443028092384338, + "learning_rate": 0.0008231247452099471, + "loss": 3.5384, + "step": 20835 + }, + { + "epoch": 1.415953254518277, + "grad_norm": 0.933971643447876, + "learning_rate": 0.0008230822802011143, + "loss": 3.7928, + "step": 20840 + }, + { + "epoch": 1.4162929745889388, + "grad_norm": 1.4636300802230835, + "learning_rate": 0.0008230398151922815, + "loss": 3.8264, + "step": 20845 + }, + { + "epoch": 1.4166326946596004, + "grad_norm": 0.8395285606384277, + "learning_rate": 0.0008229973501834489, + "loss": 3.3946, + "step": 20850 + }, + { + "epoch": 1.4169724147302623, + "grad_norm": 1.467028260231018, + "learning_rate": 0.0008229548851746161, + "loss": 3.3437, + "step": 20855 + }, + { + "epoch": 1.417312134800924, + "grad_norm": 0.7943036556243896, + "learning_rate": 0.0008229124201657834, + "loss": 3.5276, + "step": 20860 + }, + { + "epoch": 1.4176518548715857, + "grad_norm": 0.8287936449050903, + "learning_rate": 0.0008228699551569508, + "loss": 3.6553, + "step": 20865 + }, + { + "epoch": 1.4179915749422476, + "grad_norm": 0.7289841175079346, + "learning_rate": 0.000822827490148118, + "loss": 3.6603, + "step": 20870 + }, + { + "epoch": 1.4183312950129094, + "grad_norm": 0.7716347575187683, + "learning_rate": 0.0008227850251392852, + "loss": 3.645, + "step": 20875 + }, + { + "epoch": 1.418671015083571, + "grad_norm": 0.8529532551765442, + "learning_rate": 0.0008227425601304526, + "loss": 3.7138, + "step": 20880 + }, + { + "epoch": 1.419010735154233, + "grad_norm": 0.7755817174911499, + "learning_rate": 0.0008227000951216198, + "loss": 3.6043, + "step": 20885 + }, + { + "epoch": 1.4193504552248948, + "grad_norm": 0.7976102232933044, + "learning_rate": 0.000822657630112787, + "loss": 3.6427, + "step": 20890 + }, + { + "epoch": 1.4196901752955564, + "grad_norm": 0.8626576662063599, + "learning_rate": 0.0008226151651039544, + "loss": 3.8456, + "step": 20895 + }, + { + "epoch": 1.4200298953662183, + "grad_norm": 0.9899570941925049, + "learning_rate": 0.0008225727000951217, + "loss": 3.7051, + "step": 20900 + }, + { + "epoch": 1.4203696154368801, + "grad_norm": 0.9218582510948181, + "learning_rate": 0.0008225302350862889, + "loss": 3.4499, + "step": 20905 + }, + { + "epoch": 1.4207093355075417, + "grad_norm": 0.9151792526245117, + "learning_rate": 0.0008224877700774562, + "loss": 3.7835, + "step": 20910 + }, + { + "epoch": 1.4210490555782036, + "grad_norm": 0.7549874782562256, + "learning_rate": 0.0008224453050686235, + "loss": 3.5217, + "step": 20915 + }, + { + "epoch": 1.4213887756488655, + "grad_norm": 1.4265506267547607, + "learning_rate": 0.0008224028400597907, + "loss": 3.7022, + "step": 20920 + }, + { + "epoch": 1.421728495719527, + "grad_norm": 0.9432615637779236, + "learning_rate": 0.000822360375050958, + "loss": 3.6351, + "step": 20925 + }, + { + "epoch": 1.422068215790189, + "grad_norm": 0.7177340984344482, + "learning_rate": 0.0008223179100421254, + "loss": 3.5626, + "step": 20930 + }, + { + "epoch": 1.4224079358608508, + "grad_norm": 1.1297378540039062, + "learning_rate": 0.0008222754450332926, + "loss": 3.564, + "step": 20935 + }, + { + "epoch": 1.4227476559315124, + "grad_norm": 0.809187650680542, + "learning_rate": 0.0008222329800244599, + "loss": 3.414, + "step": 20940 + }, + { + "epoch": 1.4230873760021743, + "grad_norm": 0.6651842594146729, + "learning_rate": 0.0008221905150156271, + "loss": 3.4271, + "step": 20945 + }, + { + "epoch": 1.423427096072836, + "grad_norm": 1.2336649894714355, + "learning_rate": 0.0008221480500067944, + "loss": 3.4701, + "step": 20950 + }, + { + "epoch": 1.4237668161434978, + "grad_norm": 0.6805815100669861, + "learning_rate": 0.0008221055849979617, + "loss": 3.8323, + "step": 20955 + }, + { + "epoch": 1.4241065362141596, + "grad_norm": 0.93238765001297, + "learning_rate": 0.0008220631199891289, + "loss": 3.672, + "step": 20960 + }, + { + "epoch": 1.4244462562848212, + "grad_norm": 0.7003128528594971, + "learning_rate": 0.0008220206549802963, + "loss": 3.5581, + "step": 20965 + }, + { + "epoch": 1.424785976355483, + "grad_norm": 0.8854650855064392, + "learning_rate": 0.0008219781899714636, + "loss": 3.7234, + "step": 20970 + }, + { + "epoch": 1.4251256964261447, + "grad_norm": 0.7201051115989685, + "learning_rate": 0.0008219357249626308, + "loss": 3.4275, + "step": 20975 + }, + { + "epoch": 1.4254654164968066, + "grad_norm": 1.0168184041976929, + "learning_rate": 0.000821893259953798, + "loss": 3.8249, + "step": 20980 + }, + { + "epoch": 1.4258051365674684, + "grad_norm": 0.7838391065597534, + "learning_rate": 0.0008218507949449654, + "loss": 3.5618, + "step": 20985 + }, + { + "epoch": 1.42614485663813, + "grad_norm": 1.251909613609314, + "learning_rate": 0.0008218083299361326, + "loss": 3.4295, + "step": 20990 + }, + { + "epoch": 1.426484576708792, + "grad_norm": 0.6466462016105652, + "learning_rate": 0.0008217658649272998, + "loss": 3.3648, + "step": 20995 + }, + { + "epoch": 1.4268242967794538, + "grad_norm": 0.9024607539176941, + "learning_rate": 0.0008217233999184673, + "loss": 3.5286, + "step": 21000 + }, + { + "epoch": 1.4271640168501154, + "grad_norm": 0.7848509550094604, + "learning_rate": 0.0008216809349096345, + "loss": 3.6751, + "step": 21005 + }, + { + "epoch": 1.4275037369207773, + "grad_norm": 0.8536897897720337, + "learning_rate": 0.0008216384699008017, + "loss": 3.6863, + "step": 21010 + }, + { + "epoch": 1.427843456991439, + "grad_norm": 0.7577711939811707, + "learning_rate": 0.0008215960048919691, + "loss": 3.8904, + "step": 21015 + }, + { + "epoch": 1.4281831770621007, + "grad_norm": 0.8247796893119812, + "learning_rate": 0.0008215535398831363, + "loss": 3.3985, + "step": 21020 + }, + { + "epoch": 1.4285228971327626, + "grad_norm": 0.8463318347930908, + "learning_rate": 0.0008215110748743035, + "loss": 3.5565, + "step": 21025 + }, + { + "epoch": 1.4288626172034244, + "grad_norm": 0.9032055735588074, + "learning_rate": 0.0008214686098654708, + "loss": 3.7047, + "step": 21030 + }, + { + "epoch": 1.429202337274086, + "grad_norm": 0.912690281867981, + "learning_rate": 0.0008214261448566382, + "loss": 3.6805, + "step": 21035 + }, + { + "epoch": 1.429542057344748, + "grad_norm": 0.7947728037834167, + "learning_rate": 0.0008213836798478054, + "loss": 3.7801, + "step": 21040 + }, + { + "epoch": 1.4298817774154098, + "grad_norm": 0.6689997315406799, + "learning_rate": 0.0008213412148389727, + "loss": 3.6074, + "step": 21045 + }, + { + "epoch": 1.4302214974860714, + "grad_norm": 0.7089061737060547, + "learning_rate": 0.00082129874983014, + "loss": 3.8101, + "step": 21050 + }, + { + "epoch": 1.4305612175567333, + "grad_norm": 0.8824689388275146, + "learning_rate": 0.0008212562848213072, + "loss": 3.7375, + "step": 21055 + }, + { + "epoch": 1.4309009376273951, + "grad_norm": 0.9686276912689209, + "learning_rate": 0.0008212138198124745, + "loss": 3.6599, + "step": 21060 + }, + { + "epoch": 1.4312406576980568, + "grad_norm": 0.6699368357658386, + "learning_rate": 0.0008211713548036418, + "loss": 3.6546, + "step": 21065 + }, + { + "epoch": 1.4315803777687186, + "grad_norm": 0.8351903557777405, + "learning_rate": 0.0008211288897948091, + "loss": 3.4627, + "step": 21070 + }, + { + "epoch": 1.4319200978393805, + "grad_norm": 0.8426523208618164, + "learning_rate": 0.0008210864247859764, + "loss": 3.4511, + "step": 21075 + }, + { + "epoch": 1.432259817910042, + "grad_norm": 0.9831000566482544, + "learning_rate": 0.0008210439597771436, + "loss": 3.5271, + "step": 21080 + }, + { + "epoch": 1.432599537980704, + "grad_norm": 0.9150630235671997, + "learning_rate": 0.0008210014947683109, + "loss": 3.6096, + "step": 21085 + }, + { + "epoch": 1.4329392580513658, + "grad_norm": 0.9366199374198914, + "learning_rate": 0.0008209590297594782, + "loss": 3.5589, + "step": 21090 + }, + { + "epoch": 1.4332789781220274, + "grad_norm": 0.83981853723526, + "learning_rate": 0.0008209165647506454, + "loss": 3.5079, + "step": 21095 + }, + { + "epoch": 1.4336186981926893, + "grad_norm": 1.0068219900131226, + "learning_rate": 0.0008208740997418128, + "loss": 3.3112, + "step": 21100 + }, + { + "epoch": 1.4339584182633511, + "grad_norm": 0.7697469592094421, + "learning_rate": 0.0008208316347329801, + "loss": 3.4293, + "step": 21105 + }, + { + "epoch": 1.4342981383340128, + "grad_norm": 0.6952486634254456, + "learning_rate": 0.0008207891697241473, + "loss": 3.6772, + "step": 21110 + }, + { + "epoch": 1.4346378584046746, + "grad_norm": 0.7351554036140442, + "learning_rate": 0.0008207467047153147, + "loss": 3.5804, + "step": 21115 + }, + { + "epoch": 1.4349775784753362, + "grad_norm": 1.1700575351715088, + "learning_rate": 0.0008207042397064819, + "loss": 3.6869, + "step": 21120 + }, + { + "epoch": 1.435317298545998, + "grad_norm": 0.7126697897911072, + "learning_rate": 0.0008206617746976491, + "loss": 3.533, + "step": 21125 + }, + { + "epoch": 1.43565701861666, + "grad_norm": 0.917576253414154, + "learning_rate": 0.0008206193096888164, + "loss": 3.7123, + "step": 21130 + }, + { + "epoch": 1.4359967386873216, + "grad_norm": 0.6954729557037354, + "learning_rate": 0.0008205768446799837, + "loss": 3.7191, + "step": 21135 + }, + { + "epoch": 1.4363364587579834, + "grad_norm": 0.9447378516197205, + "learning_rate": 0.000820534379671151, + "loss": 3.6439, + "step": 21140 + }, + { + "epoch": 1.436676178828645, + "grad_norm": 0.881779670715332, + "learning_rate": 0.0008204919146623183, + "loss": 3.6114, + "step": 21145 + }, + { + "epoch": 1.437015898899307, + "grad_norm": 1.093141794204712, + "learning_rate": 0.0008204494496534856, + "loss": 3.5784, + "step": 21150 + }, + { + "epoch": 1.4373556189699688, + "grad_norm": 0.7237810492515564, + "learning_rate": 0.0008204069846446528, + "loss": 3.5034, + "step": 21155 + }, + { + "epoch": 1.4376953390406304, + "grad_norm": 0.9109655022621155, + "learning_rate": 0.0008203645196358201, + "loss": 3.5073, + "step": 21160 + }, + { + "epoch": 1.4380350591112923, + "grad_norm": 1.35444974899292, + "learning_rate": 0.0008203220546269874, + "loss": 3.6661, + "step": 21165 + }, + { + "epoch": 1.438374779181954, + "grad_norm": 0.7241541147232056, + "learning_rate": 0.0008202795896181546, + "loss": 3.9693, + "step": 21170 + }, + { + "epoch": 1.4387144992526157, + "grad_norm": 1.167004108428955, + "learning_rate": 0.000820237124609322, + "loss": 3.6458, + "step": 21175 + }, + { + "epoch": 1.4390542193232776, + "grad_norm": 0.779415488243103, + "learning_rate": 0.0008201946596004892, + "loss": 3.5717, + "step": 21180 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.731020987033844, + "learning_rate": 0.0008201521945916565, + "loss": 3.7664, + "step": 21185 + }, + { + "epoch": 1.439733659464601, + "grad_norm": 0.9060245156288147, + "learning_rate": 0.0008201097295828238, + "loss": 3.4913, + "step": 21190 + }, + { + "epoch": 1.440073379535263, + "grad_norm": 0.8694328665733337, + "learning_rate": 0.000820067264573991, + "loss": 3.6381, + "step": 21195 + }, + { + "epoch": 1.4404130996059248, + "grad_norm": 0.8190327286720276, + "learning_rate": 0.0008200247995651583, + "loss": 3.5779, + "step": 21200 + }, + { + "epoch": 1.4407528196765864, + "grad_norm": 0.8174338340759277, + "learning_rate": 0.0008199823345563257, + "loss": 3.9507, + "step": 21205 + }, + { + "epoch": 1.4410925397472483, + "grad_norm": 0.9831771850585938, + "learning_rate": 0.0008199398695474929, + "loss": 3.5849, + "step": 21210 + }, + { + "epoch": 1.4414322598179101, + "grad_norm": 0.7150174379348755, + "learning_rate": 0.0008198974045386602, + "loss": 3.7802, + "step": 21215 + }, + { + "epoch": 1.4417719798885718, + "grad_norm": 0.7905863523483276, + "learning_rate": 0.0008198549395298275, + "loss": 3.6846, + "step": 21220 + }, + { + "epoch": 1.4421116999592336, + "grad_norm": 0.8894950747489929, + "learning_rate": 0.0008198124745209947, + "loss": 3.6888, + "step": 21225 + }, + { + "epoch": 1.4424514200298955, + "grad_norm": 0.8170937895774841, + "learning_rate": 0.0008197700095121619, + "loss": 3.6722, + "step": 21230 + }, + { + "epoch": 1.442791140100557, + "grad_norm": 1.0815433263778687, + "learning_rate": 0.0008197275445033293, + "loss": 3.6039, + "step": 21235 + }, + { + "epoch": 1.443130860171219, + "grad_norm": 0.801895797252655, + "learning_rate": 0.0008196850794944966, + "loss": 3.5469, + "step": 21240 + }, + { + "epoch": 1.4434705802418808, + "grad_norm": 0.8273322582244873, + "learning_rate": 0.0008196426144856638, + "loss": 3.7756, + "step": 21245 + }, + { + "epoch": 1.4438103003125424, + "grad_norm": 0.8574962019920349, + "learning_rate": 0.0008196001494768312, + "loss": 3.4707, + "step": 21250 + }, + { + "epoch": 1.4441500203832043, + "grad_norm": 0.9884399175643921, + "learning_rate": 0.0008195576844679984, + "loss": 3.7402, + "step": 21255 + }, + { + "epoch": 1.4444897404538661, + "grad_norm": 0.8417077660560608, + "learning_rate": 0.0008195152194591656, + "loss": 3.5327, + "step": 21260 + }, + { + "epoch": 1.4448294605245278, + "grad_norm": 0.9184562563896179, + "learning_rate": 0.000819472754450333, + "loss": 3.4524, + "step": 21265 + }, + { + "epoch": 1.4451691805951896, + "grad_norm": 0.8043230772018433, + "learning_rate": 0.0008194302894415002, + "loss": 3.6111, + "step": 21270 + }, + { + "epoch": 1.4455089006658515, + "grad_norm": 1.2451082468032837, + "learning_rate": 0.0008193878244326675, + "loss": 3.4475, + "step": 21275 + }, + { + "epoch": 1.445848620736513, + "grad_norm": 0.7746094465255737, + "learning_rate": 0.0008193453594238348, + "loss": 3.4813, + "step": 21280 + }, + { + "epoch": 1.446188340807175, + "grad_norm": 0.8654516339302063, + "learning_rate": 0.0008193028944150021, + "loss": 3.7811, + "step": 21285 + }, + { + "epoch": 1.4465280608778366, + "grad_norm": 0.7163750529289246, + "learning_rate": 0.0008192604294061693, + "loss": 3.9147, + "step": 21290 + }, + { + "epoch": 1.4468677809484984, + "grad_norm": 0.9947525858879089, + "learning_rate": 0.0008192179643973366, + "loss": 3.5462, + "step": 21295 + }, + { + "epoch": 1.4472075010191603, + "grad_norm": 0.8202913403511047, + "learning_rate": 0.0008191754993885039, + "loss": 3.6692, + "step": 21300 + }, + { + "epoch": 1.447547221089822, + "grad_norm": 1.0683813095092773, + "learning_rate": 0.0008191330343796711, + "loss": 3.5015, + "step": 21305 + }, + { + "epoch": 1.4478869411604838, + "grad_norm": 0.8606248497962952, + "learning_rate": 0.0008190905693708385, + "loss": 3.5057, + "step": 21310 + }, + { + "epoch": 1.4482266612311454, + "grad_norm": 0.8713111281394958, + "learning_rate": 0.0008190481043620058, + "loss": 3.5216, + "step": 21315 + }, + { + "epoch": 1.4485663813018073, + "grad_norm": 3.322129011154175, + "learning_rate": 0.000819005639353173, + "loss": 3.5204, + "step": 21320 + }, + { + "epoch": 1.4489061013724691, + "grad_norm": 0.9598883986473083, + "learning_rate": 0.0008189631743443403, + "loss": 3.7471, + "step": 21325 + }, + { + "epoch": 1.4492458214431307, + "grad_norm": 0.8134629130363464, + "learning_rate": 0.0008189207093355075, + "loss": 3.5817, + "step": 21330 + }, + { + "epoch": 1.4495855415137926, + "grad_norm": 0.7407471537590027, + "learning_rate": 0.0008188782443266748, + "loss": 3.7824, + "step": 21335 + }, + { + "epoch": 1.4499252615844545, + "grad_norm": 0.7569480538368225, + "learning_rate": 0.0008188357793178421, + "loss": 3.5044, + "step": 21340 + }, + { + "epoch": 1.450264981655116, + "grad_norm": 0.8862072825431824, + "learning_rate": 0.0008187933143090094, + "loss": 3.765, + "step": 21345 + }, + { + "epoch": 1.450604701725778, + "grad_norm": 1.2891372442245483, + "learning_rate": 0.0008187508493001767, + "loss": 3.5571, + "step": 21350 + }, + { + "epoch": 1.4509444217964398, + "grad_norm": 1.0420043468475342, + "learning_rate": 0.000818708384291344, + "loss": 3.6362, + "step": 21355 + }, + { + "epoch": 1.4512841418671014, + "grad_norm": 0.8780930042266846, + "learning_rate": 0.0008186659192825112, + "loss": 3.6496, + "step": 21360 + }, + { + "epoch": 1.4516238619377633, + "grad_norm": 0.7846638560295105, + "learning_rate": 0.0008186234542736784, + "loss": 3.5168, + "step": 21365 + }, + { + "epoch": 1.4519635820084251, + "grad_norm": 0.7806840538978577, + "learning_rate": 0.0008185809892648458, + "loss": 3.9232, + "step": 21370 + }, + { + "epoch": 1.4523033020790868, + "grad_norm": 0.7512106895446777, + "learning_rate": 0.000818538524256013, + "loss": 3.5866, + "step": 21375 + }, + { + "epoch": 1.4526430221497486, + "grad_norm": 0.9655275940895081, + "learning_rate": 0.0008184960592471803, + "loss": 3.3584, + "step": 21380 + }, + { + "epoch": 1.4529827422204105, + "grad_norm": 0.9809117317199707, + "learning_rate": 0.0008184535942383477, + "loss": 3.7456, + "step": 21385 + }, + { + "epoch": 1.453322462291072, + "grad_norm": 0.8867383003234863, + "learning_rate": 0.0008184111292295149, + "loss": 3.7541, + "step": 21390 + }, + { + "epoch": 1.453662182361734, + "grad_norm": 0.9183284640312195, + "learning_rate": 0.0008183686642206821, + "loss": 3.325, + "step": 21395 + }, + { + "epoch": 1.4540019024323958, + "grad_norm": 0.7989374995231628, + "learning_rate": 0.0008183261992118495, + "loss": 3.4454, + "step": 21400 + }, + { + "epoch": 1.4543416225030574, + "grad_norm": 0.7856650948524475, + "learning_rate": 0.0008182837342030167, + "loss": 3.689, + "step": 21405 + }, + { + "epoch": 1.4546813425737193, + "grad_norm": 0.8222747445106506, + "learning_rate": 0.0008182412691941839, + "loss": 3.6644, + "step": 21410 + }, + { + "epoch": 1.4550210626443811, + "grad_norm": 0.9159167408943176, + "learning_rate": 0.0008181988041853514, + "loss": 3.6446, + "step": 21415 + }, + { + "epoch": 1.4553607827150428, + "grad_norm": 1.0283254384994507, + "learning_rate": 0.0008181563391765186, + "loss": 3.4586, + "step": 21420 + }, + { + "epoch": 1.4557005027857046, + "grad_norm": 0.7179701924324036, + "learning_rate": 0.0008181138741676858, + "loss": 3.472, + "step": 21425 + }, + { + "epoch": 1.4560402228563665, + "grad_norm": 3.5054337978363037, + "learning_rate": 0.0008180714091588531, + "loss": 3.6381, + "step": 21430 + }, + { + "epoch": 1.456379942927028, + "grad_norm": 0.966729998588562, + "learning_rate": 0.0008180289441500204, + "loss": 3.5003, + "step": 21435 + }, + { + "epoch": 1.45671966299769, + "grad_norm": 0.9251210689544678, + "learning_rate": 0.0008179864791411877, + "loss": 3.7307, + "step": 21440 + }, + { + "epoch": 1.4570593830683518, + "grad_norm": 0.8896040320396423, + "learning_rate": 0.0008179440141323549, + "loss": 3.5721, + "step": 21445 + }, + { + "epoch": 1.4573991031390134, + "grad_norm": 0.6923865079879761, + "learning_rate": 0.0008179015491235223, + "loss": 3.8275, + "step": 21450 + }, + { + "epoch": 1.4577388232096753, + "grad_norm": 0.8250048756599426, + "learning_rate": 0.0008178590841146896, + "loss": 3.7023, + "step": 21455 + }, + { + "epoch": 1.458078543280337, + "grad_norm": 0.8300202488899231, + "learning_rate": 0.0008178166191058568, + "loss": 3.4605, + "step": 21460 + }, + { + "epoch": 1.4584182633509988, + "grad_norm": 1.3340774774551392, + "learning_rate": 0.000817774154097024, + "loss": 3.7775, + "step": 21465 + }, + { + "epoch": 1.4587579834216606, + "grad_norm": 0.7371110320091248, + "learning_rate": 0.0008177316890881914, + "loss": 3.4734, + "step": 21470 + }, + { + "epoch": 1.4590977034923223, + "grad_norm": 0.8293664455413818, + "learning_rate": 0.0008176892240793586, + "loss": 3.4426, + "step": 21475 + }, + { + "epoch": 1.4594374235629841, + "grad_norm": 0.8874515295028687, + "learning_rate": 0.0008176467590705258, + "loss": 3.8097, + "step": 21480 + }, + { + "epoch": 1.4597771436336457, + "grad_norm": 0.9097764492034912, + "learning_rate": 0.0008176042940616933, + "loss": 3.6346, + "step": 21485 + }, + { + "epoch": 1.4601168637043076, + "grad_norm": 1.9026992321014404, + "learning_rate": 0.0008175618290528605, + "loss": 3.9298, + "step": 21490 + }, + { + "epoch": 1.4604565837749695, + "grad_norm": 0.8287867903709412, + "learning_rate": 0.0008175193640440277, + "loss": 3.7818, + "step": 21495 + }, + { + "epoch": 1.460796303845631, + "grad_norm": 0.8471132516860962, + "learning_rate": 0.0008174768990351951, + "loss": 3.4737, + "step": 21500 + }, + { + "epoch": 1.461136023916293, + "grad_norm": 0.7697592973709106, + "learning_rate": 0.0008174344340263623, + "loss": 3.5849, + "step": 21505 + }, + { + "epoch": 1.4614757439869548, + "grad_norm": 0.7081953287124634, + "learning_rate": 0.0008173919690175295, + "loss": 3.8277, + "step": 21510 + }, + { + "epoch": 1.4618154640576164, + "grad_norm": 0.8997223377227783, + "learning_rate": 0.0008173495040086968, + "loss": 3.6696, + "step": 21515 + }, + { + "epoch": 1.4621551841282783, + "grad_norm": 1.0204010009765625, + "learning_rate": 0.0008173070389998642, + "loss": 3.3982, + "step": 21520 + }, + { + "epoch": 1.4624949041989401, + "grad_norm": 1.7985694408416748, + "learning_rate": 0.0008172645739910314, + "loss": 3.8575, + "step": 21525 + }, + { + "epoch": 1.4628346242696018, + "grad_norm": 0.9225174188613892, + "learning_rate": 0.0008172221089821987, + "loss": 3.6238, + "step": 21530 + }, + { + "epoch": 1.4631743443402636, + "grad_norm": 1.3066933155059814, + "learning_rate": 0.000817179643973366, + "loss": 3.6888, + "step": 21535 + }, + { + "epoch": 1.4635140644109255, + "grad_norm": 0.9789854288101196, + "learning_rate": 0.0008171371789645332, + "loss": 3.6534, + "step": 21540 + }, + { + "epoch": 1.463853784481587, + "grad_norm": 1.3809174299240112, + "learning_rate": 0.0008170947139557005, + "loss": 3.4162, + "step": 21545 + }, + { + "epoch": 1.464193504552249, + "grad_norm": 1.8249504566192627, + "learning_rate": 0.0008170522489468678, + "loss": 3.8554, + "step": 21550 + }, + { + "epoch": 1.4645332246229108, + "grad_norm": 0.9678528904914856, + "learning_rate": 0.0008170097839380351, + "loss": 3.4929, + "step": 21555 + }, + { + "epoch": 1.4648729446935724, + "grad_norm": 0.7446896433830261, + "learning_rate": 0.0008169673189292024, + "loss": 3.6915, + "step": 21560 + }, + { + "epoch": 1.4652126647642343, + "grad_norm": 0.7250317335128784, + "learning_rate": 0.0008169248539203696, + "loss": 3.8173, + "step": 21565 + }, + { + "epoch": 1.4655523848348961, + "grad_norm": 0.7817710638046265, + "learning_rate": 0.0008168823889115369, + "loss": 3.7645, + "step": 21570 + }, + { + "epoch": 1.4658921049055578, + "grad_norm": 0.7820754647254944, + "learning_rate": 0.0008168399239027042, + "loss": 3.7789, + "step": 21575 + }, + { + "epoch": 1.4662318249762196, + "grad_norm": 1.0124086141586304, + "learning_rate": 0.0008167974588938714, + "loss": 3.9104, + "step": 21580 + }, + { + "epoch": 1.4665715450468815, + "grad_norm": 0.762733519077301, + "learning_rate": 0.0008167549938850387, + "loss": 3.6141, + "step": 21585 + }, + { + "epoch": 1.466911265117543, + "grad_norm": 0.6883044242858887, + "learning_rate": 0.0008167125288762061, + "loss": 3.537, + "step": 21590 + }, + { + "epoch": 1.467250985188205, + "grad_norm": 0.7670542597770691, + "learning_rate": 0.0008166700638673733, + "loss": 3.6879, + "step": 21595 + }, + { + "epoch": 1.4675907052588668, + "grad_norm": 0.8899379968643188, + "learning_rate": 0.0008166275988585406, + "loss": 3.8011, + "step": 21600 + }, + { + "epoch": 1.4679304253295284, + "grad_norm": 0.7380409240722656, + "learning_rate": 0.0008165851338497079, + "loss": 3.6815, + "step": 21605 + }, + { + "epoch": 1.4682701454001903, + "grad_norm": 0.8906005620956421, + "learning_rate": 0.0008165426688408751, + "loss": 3.6948, + "step": 21610 + }, + { + "epoch": 1.4686098654708521, + "grad_norm": 0.8510925769805908, + "learning_rate": 0.0008165002038320423, + "loss": 3.4822, + "step": 21615 + }, + { + "epoch": 1.4689495855415138, + "grad_norm": 1.0499447584152222, + "learning_rate": 0.0008164577388232097, + "loss": 3.5801, + "step": 21620 + }, + { + "epoch": 1.4692893056121756, + "grad_norm": 0.8151040077209473, + "learning_rate": 0.000816415273814377, + "loss": 3.333, + "step": 21625 + }, + { + "epoch": 1.4696290256828373, + "grad_norm": 0.7692030072212219, + "learning_rate": 0.0008163728088055442, + "loss": 3.3623, + "step": 21630 + }, + { + "epoch": 1.4699687457534991, + "grad_norm": 0.954224705696106, + "learning_rate": 0.0008163303437967116, + "loss": 3.7343, + "step": 21635 + }, + { + "epoch": 1.470308465824161, + "grad_norm": 0.9404346942901611, + "learning_rate": 0.0008162878787878788, + "loss": 3.3457, + "step": 21640 + }, + { + "epoch": 1.4706481858948226, + "grad_norm": 0.8341842293739319, + "learning_rate": 0.000816245413779046, + "loss": 3.409, + "step": 21645 + }, + { + "epoch": 1.4709879059654845, + "grad_norm": 0.8944321870803833, + "learning_rate": 0.0008162029487702134, + "loss": 3.5854, + "step": 21650 + }, + { + "epoch": 1.471327626036146, + "grad_norm": 0.8504683375358582, + "learning_rate": 0.0008161604837613806, + "loss": 3.5676, + "step": 21655 + }, + { + "epoch": 1.471667346106808, + "grad_norm": 0.8715880513191223, + "learning_rate": 0.0008161180187525479, + "loss": 3.4165, + "step": 21660 + }, + { + "epoch": 1.4720070661774698, + "grad_norm": 0.8759451508522034, + "learning_rate": 0.0008160755537437153, + "loss": 3.7312, + "step": 21665 + }, + { + "epoch": 1.4723467862481314, + "grad_norm": 0.8658482432365417, + "learning_rate": 0.0008160330887348825, + "loss": 3.8804, + "step": 21670 + }, + { + "epoch": 1.4726865063187933, + "grad_norm": 0.938194751739502, + "learning_rate": 0.0008159906237260497, + "loss": 3.5474, + "step": 21675 + }, + { + "epoch": 1.4730262263894551, + "grad_norm": 0.9586776494979858, + "learning_rate": 0.000815948158717217, + "loss": 3.5647, + "step": 21680 + }, + { + "epoch": 1.4733659464601168, + "grad_norm": 0.7118145823478699, + "learning_rate": 0.0008159056937083843, + "loss": 3.6132, + "step": 21685 + }, + { + "epoch": 1.4737056665307786, + "grad_norm": 0.8587698340415955, + "learning_rate": 0.0008158632286995515, + "loss": 3.5183, + "step": 21690 + }, + { + "epoch": 1.4740453866014405, + "grad_norm": 0.8893502354621887, + "learning_rate": 0.0008158207636907189, + "loss": 3.7508, + "step": 21695 + }, + { + "epoch": 1.474385106672102, + "grad_norm": 0.7379329800605774, + "learning_rate": 0.0008157782986818862, + "loss": 3.5505, + "step": 21700 + }, + { + "epoch": 1.474724826742764, + "grad_norm": 1.029417634010315, + "learning_rate": 0.0008157358336730534, + "loss": 3.5204, + "step": 21705 + }, + { + "epoch": 1.4750645468134258, + "grad_norm": 0.6896976828575134, + "learning_rate": 0.0008156933686642207, + "loss": 3.5989, + "step": 21710 + }, + { + "epoch": 1.4754042668840874, + "grad_norm": 1.0543491840362549, + "learning_rate": 0.0008156509036553879, + "loss": 3.6966, + "step": 21715 + }, + { + "epoch": 1.4757439869547493, + "grad_norm": 0.7930893301963806, + "learning_rate": 0.0008156084386465552, + "loss": 3.5882, + "step": 21720 + }, + { + "epoch": 1.4760837070254111, + "grad_norm": 1.0534542798995972, + "learning_rate": 0.0008155659736377226, + "loss": 3.5944, + "step": 21725 + }, + { + "epoch": 1.4764234270960728, + "grad_norm": 0.7957115769386292, + "learning_rate": 0.0008155235086288898, + "loss": 3.5683, + "step": 21730 + }, + { + "epoch": 1.4767631471667346, + "grad_norm": 0.7798818945884705, + "learning_rate": 0.0008154810436200571, + "loss": 3.5159, + "step": 21735 + }, + { + "epoch": 1.4771028672373965, + "grad_norm": 0.8581233024597168, + "learning_rate": 0.0008154385786112244, + "loss": 3.5074, + "step": 21740 + }, + { + "epoch": 1.477442587308058, + "grad_norm": 13.006803512573242, + "learning_rate": 0.0008153961136023916, + "loss": 3.5257, + "step": 21745 + }, + { + "epoch": 1.47778230737872, + "grad_norm": 0.896229088306427, + "learning_rate": 0.0008153536485935588, + "loss": 3.5959, + "step": 21750 + }, + { + "epoch": 1.4781220274493818, + "grad_norm": 0.7379077076911926, + "learning_rate": 0.0008153111835847262, + "loss": 3.6344, + "step": 21755 + }, + { + "epoch": 1.4784617475200434, + "grad_norm": 0.825890302658081, + "learning_rate": 0.0008152687185758935, + "loss": 3.6882, + "step": 21760 + }, + { + "epoch": 1.4788014675907053, + "grad_norm": 0.8955855369567871, + "learning_rate": 0.0008152262535670607, + "loss": 3.9724, + "step": 21765 + }, + { + "epoch": 1.4791411876613672, + "grad_norm": 1.392404556274414, + "learning_rate": 0.0008151837885582281, + "loss": 3.5749, + "step": 21770 + }, + { + "epoch": 1.4794809077320288, + "grad_norm": 0.7196982502937317, + "learning_rate": 0.0008151413235493953, + "loss": 3.5891, + "step": 21775 + }, + { + "epoch": 1.4798206278026906, + "grad_norm": 0.711516797542572, + "learning_rate": 0.0008150988585405626, + "loss": 3.4203, + "step": 21780 + }, + { + "epoch": 1.4801603478733525, + "grad_norm": 0.6733693480491638, + "learning_rate": 0.0008150563935317299, + "loss": 3.7878, + "step": 21785 + }, + { + "epoch": 1.4805000679440141, + "grad_norm": 1.1342600584030151, + "learning_rate": 0.0008150139285228971, + "loss": 3.6715, + "step": 21790 + }, + { + "epoch": 1.480839788014676, + "grad_norm": 0.7066829800605774, + "learning_rate": 0.0008149714635140645, + "loss": 3.7617, + "step": 21795 + }, + { + "epoch": 1.4811795080853376, + "grad_norm": 0.9710771441459656, + "learning_rate": 0.0008149289985052318, + "loss": 3.6566, + "step": 21800 + }, + { + "epoch": 1.4815192281559995, + "grad_norm": 0.6392430663108826, + "learning_rate": 0.000814886533496399, + "loss": 3.8231, + "step": 21805 + }, + { + "epoch": 1.4818589482266613, + "grad_norm": 0.6712721586227417, + "learning_rate": 0.0008148440684875663, + "loss": 3.6822, + "step": 21810 + }, + { + "epoch": 1.482198668297323, + "grad_norm": 1.0766667127609253, + "learning_rate": 0.0008148016034787335, + "loss": 3.8013, + "step": 21815 + }, + { + "epoch": 1.4825383883679848, + "grad_norm": 1.4217652082443237, + "learning_rate": 0.0008147591384699008, + "loss": 3.3801, + "step": 21820 + }, + { + "epoch": 1.4828781084386464, + "grad_norm": 5.365688800811768, + "learning_rate": 0.0008147166734610681, + "loss": 3.9023, + "step": 21825 + }, + { + "epoch": 1.4832178285093083, + "grad_norm": 0.7911930084228516, + "learning_rate": 0.0008146742084522354, + "loss": 3.7086, + "step": 21830 + }, + { + "epoch": 1.4835575485799701, + "grad_norm": 1.013545036315918, + "learning_rate": 0.0008146317434434027, + "loss": 3.7609, + "step": 21835 + }, + { + "epoch": 1.4838972686506318, + "grad_norm": 0.7804214954376221, + "learning_rate": 0.00081458927843457, + "loss": 3.5899, + "step": 21840 + }, + { + "epoch": 1.4842369887212936, + "grad_norm": 0.8121116161346436, + "learning_rate": 0.0008145468134257372, + "loss": 3.6358, + "step": 21845 + }, + { + "epoch": 1.4845767087919555, + "grad_norm": 0.997131884098053, + "learning_rate": 0.0008145043484169045, + "loss": 3.4675, + "step": 21850 + }, + { + "epoch": 1.484916428862617, + "grad_norm": 1.5683791637420654, + "learning_rate": 0.0008144618834080718, + "loss": 3.8026, + "step": 21855 + }, + { + "epoch": 1.485256148933279, + "grad_norm": 0.9194762110710144, + "learning_rate": 0.000814419418399239, + "loss": 3.6512, + "step": 21860 + }, + { + "epoch": 1.4855958690039408, + "grad_norm": 0.8475282192230225, + "learning_rate": 0.0008143769533904063, + "loss": 3.8176, + "step": 21865 + }, + { + "epoch": 1.4859355890746024, + "grad_norm": 1.7368885278701782, + "learning_rate": 0.0008143344883815737, + "loss": 3.4637, + "step": 21870 + }, + { + "epoch": 1.4862753091452643, + "grad_norm": 0.8091974854469299, + "learning_rate": 0.0008142920233727409, + "loss": 3.4879, + "step": 21875 + }, + { + "epoch": 1.4866150292159261, + "grad_norm": 0.9005821943283081, + "learning_rate": 0.0008142495583639081, + "loss": 3.6582, + "step": 21880 + }, + { + "epoch": 1.4869547492865878, + "grad_norm": 0.6839936375617981, + "learning_rate": 0.0008142070933550755, + "loss": 3.3596, + "step": 21885 + }, + { + "epoch": 1.4872944693572496, + "grad_norm": 0.8690326809883118, + "learning_rate": 0.0008141646283462427, + "loss": 3.3709, + "step": 21890 + }, + { + "epoch": 1.4876341894279115, + "grad_norm": 0.8035196661949158, + "learning_rate": 0.0008141221633374099, + "loss": 3.6573, + "step": 21895 + }, + { + "epoch": 1.487973909498573, + "grad_norm": 1.0507326126098633, + "learning_rate": 0.0008140796983285774, + "loss": 3.4127, + "step": 21900 + }, + { + "epoch": 1.488313629569235, + "grad_norm": 1.9073073863983154, + "learning_rate": 0.0008140372333197446, + "loss": 3.6817, + "step": 21905 + }, + { + "epoch": 1.4886533496398968, + "grad_norm": 0.8077768087387085, + "learning_rate": 0.0008139947683109118, + "loss": 3.2506, + "step": 21910 + }, + { + "epoch": 1.4889930697105584, + "grad_norm": 1.655142903327942, + "learning_rate": 0.0008139523033020791, + "loss": 3.4994, + "step": 21915 + }, + { + "epoch": 1.4893327897812203, + "grad_norm": 2.39542555809021, + "learning_rate": 0.0008139098382932464, + "loss": 3.4631, + "step": 21920 + }, + { + "epoch": 1.4896725098518822, + "grad_norm": 0.9651236534118652, + "learning_rate": 0.0008138673732844136, + "loss": 3.3962, + "step": 21925 + }, + { + "epoch": 1.4900122299225438, + "grad_norm": 1.1497455835342407, + "learning_rate": 0.0008138249082755809, + "loss": 3.6976, + "step": 21930 + }, + { + "epoch": 1.4903519499932056, + "grad_norm": 0.6623837351799011, + "learning_rate": 0.0008137824432667483, + "loss": 3.6666, + "step": 21935 + }, + { + "epoch": 1.4906916700638675, + "grad_norm": 0.8212748169898987, + "learning_rate": 0.0008137399782579155, + "loss": 3.7403, + "step": 21940 + }, + { + "epoch": 1.4910313901345291, + "grad_norm": 1.0556682348251343, + "learning_rate": 0.0008136975132490828, + "loss": 3.5023, + "step": 21945 + }, + { + "epoch": 1.491371110205191, + "grad_norm": 0.8832721710205078, + "learning_rate": 0.00081365504824025, + "loss": 3.6832, + "step": 21950 + }, + { + "epoch": 1.4917108302758528, + "grad_norm": 0.91910719871521, + "learning_rate": 0.0008136125832314173, + "loss": 3.6014, + "step": 21955 + }, + { + "epoch": 1.4920505503465145, + "grad_norm": 0.7372473478317261, + "learning_rate": 0.0008135701182225846, + "loss": 3.6317, + "step": 21960 + }, + { + "epoch": 1.4923902704171763, + "grad_norm": 1.4654520750045776, + "learning_rate": 0.0008135276532137518, + "loss": 3.5283, + "step": 21965 + }, + { + "epoch": 1.492729990487838, + "grad_norm": 1.052704930305481, + "learning_rate": 0.0008134851882049192, + "loss": 3.5183, + "step": 21970 + }, + { + "epoch": 1.4930697105584998, + "grad_norm": 0.9543300867080688, + "learning_rate": 0.0008134427231960865, + "loss": 3.7363, + "step": 21975 + }, + { + "epoch": 1.4934094306291616, + "grad_norm": 0.7999656796455383, + "learning_rate": 0.0008134002581872537, + "loss": 3.6932, + "step": 21980 + }, + { + "epoch": 1.4937491506998233, + "grad_norm": 1.5254663228988647, + "learning_rate": 0.000813357793178421, + "loss": 3.3421, + "step": 21985 + }, + { + "epoch": 1.4940888707704851, + "grad_norm": 0.6610624194145203, + "learning_rate": 0.0008133153281695883, + "loss": 3.5269, + "step": 21990 + }, + { + "epoch": 1.4944285908411468, + "grad_norm": 0.9944840669631958, + "learning_rate": 0.0008132728631607555, + "loss": 3.6551, + "step": 21995 + }, + { + "epoch": 1.4947683109118086, + "grad_norm": 0.7575967907905579, + "learning_rate": 0.0008132303981519227, + "loss": 3.7085, + "step": 22000 + }, + { + "epoch": 1.4951080309824705, + "grad_norm": 1.0307732820510864, + "learning_rate": 0.0008131879331430902, + "loss": 3.7477, + "step": 22005 + }, + { + "epoch": 1.495447751053132, + "grad_norm": 0.8909186720848083, + "learning_rate": 0.0008131454681342574, + "loss": 3.5256, + "step": 22010 + }, + { + "epoch": 1.495787471123794, + "grad_norm": 0.903501033782959, + "learning_rate": 0.0008131030031254246, + "loss": 3.6392, + "step": 22015 + }, + { + "epoch": 1.4961271911944558, + "grad_norm": 0.7458577752113342, + "learning_rate": 0.000813060538116592, + "loss": 3.6892, + "step": 22020 + }, + { + "epoch": 1.4964669112651174, + "grad_norm": 0.8863702416419983, + "learning_rate": 0.0008130180731077592, + "loss": 3.5218, + "step": 22025 + }, + { + "epoch": 1.4968066313357793, + "grad_norm": 1.9293718338012695, + "learning_rate": 0.0008129756080989264, + "loss": 3.3357, + "step": 22030 + }, + { + "epoch": 1.4971463514064411, + "grad_norm": 0.9360488653182983, + "learning_rate": 0.0008129331430900938, + "loss": 3.8593, + "step": 22035 + }, + { + "epoch": 1.4974860714771028, + "grad_norm": 0.888767421245575, + "learning_rate": 0.0008128906780812611, + "loss": 3.5354, + "step": 22040 + }, + { + "epoch": 1.4978257915477646, + "grad_norm": 0.7652256488800049, + "learning_rate": 0.0008128482130724283, + "loss": 3.6669, + "step": 22045 + }, + { + "epoch": 1.4981655116184265, + "grad_norm": 0.7880499958992004, + "learning_rate": 0.0008128057480635957, + "loss": 3.6182, + "step": 22050 + }, + { + "epoch": 1.4985052316890881, + "grad_norm": 2.8066999912261963, + "learning_rate": 0.0008127632830547629, + "loss": 3.4691, + "step": 22055 + }, + { + "epoch": 1.49884495175975, + "grad_norm": 0.7578378319740295, + "learning_rate": 0.0008127208180459301, + "loss": 3.7085, + "step": 22060 + }, + { + "epoch": 1.4991846718304118, + "grad_norm": 0.7623130679130554, + "learning_rate": 0.0008126783530370974, + "loss": 3.8969, + "step": 22065 + }, + { + "epoch": 1.4995243919010735, + "grad_norm": 1.4729433059692383, + "learning_rate": 0.0008126358880282647, + "loss": 3.4392, + "step": 22070 + }, + { + "epoch": 1.4998641119717353, + "grad_norm": 0.9118412733078003, + "learning_rate": 0.000812593423019432, + "loss": 3.6957, + "step": 22075 + }, + { + "epoch": 1.5002038320423972, + "grad_norm": 1.0181450843811035, + "learning_rate": 0.0008125509580105993, + "loss": 3.7814, + "step": 22080 + }, + { + "epoch": 1.5005435521130588, + "grad_norm": 0.8120703101158142, + "learning_rate": 0.0008125084930017666, + "loss": 3.4697, + "step": 22085 + }, + { + "epoch": 1.5008832721837206, + "grad_norm": 0.8658739924430847, + "learning_rate": 0.0008124660279929338, + "loss": 3.5197, + "step": 22090 + }, + { + "epoch": 1.5012229922543825, + "grad_norm": 1.1028704643249512, + "learning_rate": 0.0008124235629841011, + "loss": 3.5367, + "step": 22095 + }, + { + "epoch": 1.5015627123250441, + "grad_norm": 0.8906139731407166, + "learning_rate": 0.0008123810979752683, + "loss": 3.3931, + "step": 22100 + }, + { + "epoch": 1.501902432395706, + "grad_norm": 0.9652294516563416, + "learning_rate": 0.0008123386329664356, + "loss": 3.6673, + "step": 22105 + }, + { + "epoch": 1.5022421524663678, + "grad_norm": 0.8211134672164917, + "learning_rate": 0.000812296167957603, + "loss": 3.718, + "step": 22110 + }, + { + "epoch": 1.5025818725370295, + "grad_norm": 0.9439778923988342, + "learning_rate": 0.0008122537029487702, + "loss": 3.6224, + "step": 22115 + }, + { + "epoch": 1.5029215926076913, + "grad_norm": 0.8280893564224243, + "learning_rate": 0.0008122112379399375, + "loss": 3.501, + "step": 22120 + }, + { + "epoch": 1.5032613126783532, + "grad_norm": 0.8293588757514954, + "learning_rate": 0.0008121687729311048, + "loss": 3.626, + "step": 22125 + }, + { + "epoch": 1.5036010327490148, + "grad_norm": 0.8909928798675537, + "learning_rate": 0.000812126307922272, + "loss": 3.4691, + "step": 22130 + }, + { + "epoch": 1.5039407528196764, + "grad_norm": 0.988654375076294, + "learning_rate": 0.0008120838429134394, + "loss": 3.628, + "step": 22135 + }, + { + "epoch": 1.5042804728903385, + "grad_norm": 0.9312153458595276, + "learning_rate": 0.0008120413779046066, + "loss": 3.6191, + "step": 22140 + }, + { + "epoch": 1.5046201929610001, + "grad_norm": 0.953225314617157, + "learning_rate": 0.0008119989128957739, + "loss": 3.7696, + "step": 22145 + }, + { + "epoch": 1.5049599130316618, + "grad_norm": 0.9789878129959106, + "learning_rate": 0.0008119564478869413, + "loss": 3.7292, + "step": 22150 + }, + { + "epoch": 1.5052996331023238, + "grad_norm": 0.8332881331443787, + "learning_rate": 0.0008119139828781085, + "loss": 3.5911, + "step": 22155 + }, + { + "epoch": 1.5056393531729855, + "grad_norm": 0.963539719581604, + "learning_rate": 0.0008118715178692757, + "loss": 3.6376, + "step": 22160 + }, + { + "epoch": 1.505979073243647, + "grad_norm": 0.9182893633842468, + "learning_rate": 0.000811829052860443, + "loss": 3.7789, + "step": 22165 + }, + { + "epoch": 1.506318793314309, + "grad_norm": 0.8931882381439209, + "learning_rate": 0.0008117865878516103, + "loss": 3.4865, + "step": 22170 + }, + { + "epoch": 1.5066585133849708, + "grad_norm": 1.0923891067504883, + "learning_rate": 0.0008117441228427775, + "loss": 3.5197, + "step": 22175 + }, + { + "epoch": 1.5069982334556324, + "grad_norm": 0.8637180924415588, + "learning_rate": 0.0008117016578339449, + "loss": 3.5596, + "step": 22180 + }, + { + "epoch": 1.5073379535262943, + "grad_norm": 0.7522241473197937, + "learning_rate": 0.0008116591928251122, + "loss": 3.7056, + "step": 22185 + }, + { + "epoch": 1.5076776735969561, + "grad_norm": 1.2812672853469849, + "learning_rate": 0.0008116167278162794, + "loss": 3.3825, + "step": 22190 + }, + { + "epoch": 1.5080173936676178, + "grad_norm": 0.755690336227417, + "learning_rate": 0.0008115742628074467, + "loss": 3.7337, + "step": 22195 + }, + { + "epoch": 1.5083571137382796, + "grad_norm": 0.7484605312347412, + "learning_rate": 0.000811531797798614, + "loss": 3.6178, + "step": 22200 + }, + { + "epoch": 1.5086968338089415, + "grad_norm": 1.2078036069869995, + "learning_rate": 0.0008114893327897812, + "loss": 3.8271, + "step": 22205 + }, + { + "epoch": 1.5090365538796031, + "grad_norm": 0.960333526134491, + "learning_rate": 0.0008114468677809485, + "loss": 3.5748, + "step": 22210 + }, + { + "epoch": 1.509376273950265, + "grad_norm": 0.6677131056785583, + "learning_rate": 0.0008114044027721158, + "loss": 3.6914, + "step": 22215 + }, + { + "epoch": 1.5097159940209268, + "grad_norm": 0.7167598605155945, + "learning_rate": 0.0008113619377632831, + "loss": 3.4739, + "step": 22220 + }, + { + "epoch": 1.5100557140915885, + "grad_norm": 0.876864492893219, + "learning_rate": 0.0008113194727544504, + "loss": 3.5973, + "step": 22225 + }, + { + "epoch": 1.5103954341622503, + "grad_norm": 0.992123544216156, + "learning_rate": 0.0008112770077456176, + "loss": 3.4709, + "step": 22230 + }, + { + "epoch": 1.5107351542329122, + "grad_norm": 0.8294938802719116, + "learning_rate": 0.0008112345427367849, + "loss": 3.4507, + "step": 22235 + }, + { + "epoch": 1.5110748743035738, + "grad_norm": 0.7010102868080139, + "learning_rate": 0.0008111920777279522, + "loss": 3.6895, + "step": 22240 + }, + { + "epoch": 1.5114145943742356, + "grad_norm": 0.7238225936889648, + "learning_rate": 0.0008111496127191194, + "loss": 3.4021, + "step": 22245 + }, + { + "epoch": 1.5117543144448975, + "grad_norm": 0.8575050234794617, + "learning_rate": 0.0008111071477102867, + "loss": 3.7506, + "step": 22250 + }, + { + "epoch": 1.5120940345155591, + "grad_norm": 0.9438026547431946, + "learning_rate": 0.0008110646827014541, + "loss": 3.7778, + "step": 22255 + }, + { + "epoch": 1.512433754586221, + "grad_norm": 0.8470224142074585, + "learning_rate": 0.0008110222176926213, + "loss": 3.4705, + "step": 22260 + }, + { + "epoch": 1.5127734746568828, + "grad_norm": 0.6949136853218079, + "learning_rate": 0.0008109797526837885, + "loss": 3.5648, + "step": 22265 + }, + { + "epoch": 1.5131131947275445, + "grad_norm": 0.697044312953949, + "learning_rate": 0.0008109372876749559, + "loss": 3.5972, + "step": 22270 + }, + { + "epoch": 1.5134529147982063, + "grad_norm": 0.806930422782898, + "learning_rate": 0.0008108948226661231, + "loss": 3.4339, + "step": 22275 + }, + { + "epoch": 1.5137926348688682, + "grad_norm": 1.2596935033798218, + "learning_rate": 0.0008108523576572903, + "loss": 3.4205, + "step": 22280 + }, + { + "epoch": 1.5141323549395298, + "grad_norm": 0.7733656764030457, + "learning_rate": 0.0008108098926484578, + "loss": 3.746, + "step": 22285 + }, + { + "epoch": 1.5144720750101917, + "grad_norm": 5.800518989562988, + "learning_rate": 0.000810767427639625, + "loss": 3.609, + "step": 22290 + }, + { + "epoch": 1.5148117950808535, + "grad_norm": 1.0555808544158936, + "learning_rate": 0.0008107249626307922, + "loss": 3.4661, + "step": 22295 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.8168818950653076, + "learning_rate": 0.0008106824976219596, + "loss": 3.616, + "step": 22300 + }, + { + "epoch": 1.5154912352221768, + "grad_norm": 0.6689931750297546, + "learning_rate": 0.0008106400326131268, + "loss": 3.8282, + "step": 22305 + }, + { + "epoch": 1.5158309552928388, + "grad_norm": 0.8991059064865112, + "learning_rate": 0.000810597567604294, + "loss": 3.5994, + "step": 22310 + }, + { + "epoch": 1.5161706753635005, + "grad_norm": 0.9125682711601257, + "learning_rate": 0.0008105551025954614, + "loss": 3.7198, + "step": 22315 + }, + { + "epoch": 1.516510395434162, + "grad_norm": 0.8070785999298096, + "learning_rate": 0.0008105126375866287, + "loss": 3.6762, + "step": 22320 + }, + { + "epoch": 1.5168501155048242, + "grad_norm": 1.046485424041748, + "learning_rate": 0.0008104701725777959, + "loss": 3.6009, + "step": 22325 + }, + { + "epoch": 1.5171898355754858, + "grad_norm": 1.0733026266098022, + "learning_rate": 0.0008104277075689632, + "loss": 3.5366, + "step": 22330 + }, + { + "epoch": 1.5175295556461474, + "grad_norm": 0.6515213251113892, + "learning_rate": 0.0008103852425601305, + "loss": 3.6327, + "step": 22335 + }, + { + "epoch": 1.5178692757168093, + "grad_norm": 0.9786260724067688, + "learning_rate": 0.0008103427775512977, + "loss": 3.621, + "step": 22340 + }, + { + "epoch": 1.5182089957874711, + "grad_norm": 0.8355059623718262, + "learning_rate": 0.000810300312542465, + "loss": 3.5109, + "step": 22345 + }, + { + "epoch": 1.5185487158581328, + "grad_norm": 0.8438034653663635, + "learning_rate": 0.0008102578475336324, + "loss": 3.5423, + "step": 22350 + }, + { + "epoch": 1.5188884359287946, + "grad_norm": 0.8656495809555054, + "learning_rate": 0.0008102153825247996, + "loss": 3.5056, + "step": 22355 + }, + { + "epoch": 1.5192281559994565, + "grad_norm": 0.7253504395484924, + "learning_rate": 0.0008101729175159669, + "loss": 3.6737, + "step": 22360 + }, + { + "epoch": 1.5195678760701181, + "grad_norm": 0.8992181420326233, + "learning_rate": 0.0008101304525071341, + "loss": 3.453, + "step": 22365 + }, + { + "epoch": 1.51990759614078, + "grad_norm": 0.949766218662262, + "learning_rate": 0.0008100879874983014, + "loss": 3.4174, + "step": 22370 + }, + { + "epoch": 1.5202473162114418, + "grad_norm": 0.7784948348999023, + "learning_rate": 0.0008100455224894687, + "loss": 3.6992, + "step": 22375 + }, + { + "epoch": 1.5205870362821035, + "grad_norm": 0.709964394569397, + "learning_rate": 0.0008100030574806359, + "loss": 3.702, + "step": 22380 + }, + { + "epoch": 1.5209267563527653, + "grad_norm": 0.737909197807312, + "learning_rate": 0.0008099605924718033, + "loss": 3.6444, + "step": 22385 + }, + { + "epoch": 1.5212664764234272, + "grad_norm": 0.9086816906929016, + "learning_rate": 0.0008099181274629706, + "loss": 3.6417, + "step": 22390 + }, + { + "epoch": 1.5216061964940888, + "grad_norm": 0.9244238138198853, + "learning_rate": 0.0008098756624541378, + "loss": 3.6894, + "step": 22395 + }, + { + "epoch": 1.5219459165647506, + "grad_norm": 0.8606001734733582, + "learning_rate": 0.000809833197445305, + "loss": 3.7526, + "step": 22400 + }, + { + "epoch": 1.5222856366354125, + "grad_norm": 1.0476043224334717, + "learning_rate": 0.0008097907324364724, + "loss": 3.6135, + "step": 22405 + }, + { + "epoch": 1.5226253567060741, + "grad_norm": 0.7347127795219421, + "learning_rate": 0.0008097482674276396, + "loss": 3.8234, + "step": 22410 + }, + { + "epoch": 1.522965076776736, + "grad_norm": 0.739763081073761, + "learning_rate": 0.0008097058024188068, + "loss": 3.6234, + "step": 22415 + }, + { + "epoch": 1.5233047968473978, + "grad_norm": 0.9169771671295166, + "learning_rate": 0.0008096633374099743, + "loss": 3.6496, + "step": 22420 + }, + { + "epoch": 1.5236445169180595, + "grad_norm": 0.6137856245040894, + "learning_rate": 0.0008096208724011415, + "loss": 3.7252, + "step": 22425 + }, + { + "epoch": 1.5239842369887213, + "grad_norm": 0.804402768611908, + "learning_rate": 0.0008095784073923087, + "loss": 3.3508, + "step": 22430 + }, + { + "epoch": 1.5243239570593832, + "grad_norm": 0.9215467572212219, + "learning_rate": 0.0008095359423834761, + "loss": 3.4776, + "step": 22435 + }, + { + "epoch": 1.5246636771300448, + "grad_norm": 0.9266966581344604, + "learning_rate": 0.0008094934773746433, + "loss": 3.6288, + "step": 22440 + }, + { + "epoch": 1.5250033972007067, + "grad_norm": 1.2500064373016357, + "learning_rate": 0.0008094510123658105, + "loss": 3.6462, + "step": 22445 + }, + { + "epoch": 1.5253431172713685, + "grad_norm": 1.1392971277236938, + "learning_rate": 0.0008094085473569778, + "loss": 3.4615, + "step": 22450 + }, + { + "epoch": 1.5256828373420301, + "grad_norm": 0.7781038880348206, + "learning_rate": 0.0008093660823481452, + "loss": 3.5863, + "step": 22455 + }, + { + "epoch": 1.526022557412692, + "grad_norm": 0.9405845999717712, + "learning_rate": 0.0008093236173393124, + "loss": 3.7215, + "step": 22460 + }, + { + "epoch": 1.5263622774833538, + "grad_norm": 1.1770190000534058, + "learning_rate": 0.0008092811523304797, + "loss": 3.6793, + "step": 22465 + }, + { + "epoch": 1.5267019975540155, + "grad_norm": 0.7325275540351868, + "learning_rate": 0.000809238687321647, + "loss": 3.5797, + "step": 22470 + }, + { + "epoch": 1.527041717624677, + "grad_norm": 0.8297496438026428, + "learning_rate": 0.0008091962223128143, + "loss": 3.8462, + "step": 22475 + }, + { + "epoch": 1.5273814376953392, + "grad_norm": 0.8353131413459778, + "learning_rate": 0.0008091537573039815, + "loss": 3.6337, + "step": 22480 + }, + { + "epoch": 1.5277211577660008, + "grad_norm": 0.6747085452079773, + "learning_rate": 0.0008091112922951488, + "loss": 3.7437, + "step": 22485 + }, + { + "epoch": 1.5280608778366624, + "grad_norm": 1.039534330368042, + "learning_rate": 0.0008090688272863162, + "loss": 3.7647, + "step": 22490 + }, + { + "epoch": 1.5284005979073245, + "grad_norm": 0.9028795957565308, + "learning_rate": 0.0008090263622774834, + "loss": 3.6252, + "step": 22495 + }, + { + "epoch": 1.5287403179779862, + "grad_norm": 1.3202464580535889, + "learning_rate": 0.0008089838972686506, + "loss": 3.6182, + "step": 22500 + }, + { + "epoch": 1.5290800380486478, + "grad_norm": 0.7802270650863647, + "learning_rate": 0.000808941432259818, + "loss": 3.3468, + "step": 22505 + }, + { + "epoch": 1.5294197581193096, + "grad_norm": 0.8204078078269958, + "learning_rate": 0.0008088989672509852, + "loss": 3.5454, + "step": 22510 + }, + { + "epoch": 1.5297594781899715, + "grad_norm": 0.823747992515564, + "learning_rate": 0.0008088565022421524, + "loss": 3.4631, + "step": 22515 + }, + { + "epoch": 1.5300991982606331, + "grad_norm": 0.9554562568664551, + "learning_rate": 0.0008088140372333198, + "loss": 3.3086, + "step": 22520 + }, + { + "epoch": 1.530438918331295, + "grad_norm": 0.6879837512969971, + "learning_rate": 0.0008087715722244871, + "loss": 3.7419, + "step": 22525 + }, + { + "epoch": 1.5307786384019568, + "grad_norm": 0.7424771189689636, + "learning_rate": 0.0008087291072156543, + "loss": 3.6084, + "step": 22530 + }, + { + "epoch": 1.5311183584726185, + "grad_norm": 0.7941288352012634, + "learning_rate": 0.0008086866422068217, + "loss": 3.7548, + "step": 22535 + }, + { + "epoch": 1.5314580785432803, + "grad_norm": 0.9224671721458435, + "learning_rate": 0.0008086441771979889, + "loss": 3.5744, + "step": 22540 + }, + { + "epoch": 1.5317977986139422, + "grad_norm": 0.7183346748352051, + "learning_rate": 0.0008086017121891561, + "loss": 3.5483, + "step": 22545 + }, + { + "epoch": 1.5321375186846038, + "grad_norm": 1.0283465385437012, + "learning_rate": 0.0008085592471803234, + "loss": 3.5894, + "step": 22550 + }, + { + "epoch": 1.5324772387552656, + "grad_norm": 0.8464535474777222, + "learning_rate": 0.0008085167821714907, + "loss": 3.7064, + "step": 22555 + }, + { + "epoch": 1.5328169588259275, + "grad_norm": 0.736099362373352, + "learning_rate": 0.000808474317162658, + "loss": 3.615, + "step": 22560 + }, + { + "epoch": 1.5331566788965891, + "grad_norm": 0.9655994176864624, + "learning_rate": 0.0008084318521538253, + "loss": 3.5561, + "step": 22565 + }, + { + "epoch": 1.533496398967251, + "grad_norm": 0.7868728637695312, + "learning_rate": 0.0008083893871449926, + "loss": 3.7455, + "step": 22570 + }, + { + "epoch": 1.5338361190379128, + "grad_norm": 0.7701765894889832, + "learning_rate": 0.0008083469221361598, + "loss": 3.5631, + "step": 22575 + }, + { + "epoch": 1.5341758391085745, + "grad_norm": 1.09901762008667, + "learning_rate": 0.0008083129501290937, + "loss": 3.6894, + "step": 22580 + }, + { + "epoch": 1.5345155591792363, + "grad_norm": 1.0632330179214478, + "learning_rate": 0.000808270485120261, + "loss": 3.7189, + "step": 22585 + }, + { + "epoch": 1.5348552792498982, + "grad_norm": 1.0971126556396484, + "learning_rate": 0.0008082280201114282, + "loss": 3.9283, + "step": 22590 + }, + { + "epoch": 1.5351949993205598, + "grad_norm": 0.8488855957984924, + "learning_rate": 0.0008081855551025955, + "loss": 3.9576, + "step": 22595 + }, + { + "epoch": 1.5355347193912217, + "grad_norm": 0.718552827835083, + "learning_rate": 0.0008081430900937627, + "loss": 3.5423, + "step": 22600 + }, + { + "epoch": 1.5358744394618835, + "grad_norm": 0.6693300604820251, + "learning_rate": 0.00080810062508493, + "loss": 3.6544, + "step": 22605 + }, + { + "epoch": 1.5362141595325451, + "grad_norm": 1.227034091949463, + "learning_rate": 0.0008080581600760973, + "loss": 3.5665, + "step": 22610 + }, + { + "epoch": 1.536553879603207, + "grad_norm": 0.7921385765075684, + "learning_rate": 0.0008080156950672646, + "loss": 3.9728, + "step": 22615 + }, + { + "epoch": 1.5368935996738688, + "grad_norm": 0.7782266736030579, + "learning_rate": 0.0008079732300584319, + "loss": 3.5751, + "step": 22620 + }, + { + "epoch": 1.5372333197445305, + "grad_norm": 8.147351264953613, + "learning_rate": 0.0008079307650495992, + "loss": 3.5785, + "step": 22625 + }, + { + "epoch": 1.5375730398151923, + "grad_norm": 0.7345945239067078, + "learning_rate": 0.0008078883000407664, + "loss": 3.6177, + "step": 22630 + }, + { + "epoch": 1.5379127598858542, + "grad_norm": 0.8180938959121704, + "learning_rate": 0.0008078458350319336, + "loss": 3.7514, + "step": 22635 + }, + { + "epoch": 1.5382524799565158, + "grad_norm": 0.7594621777534485, + "learning_rate": 0.000807803370023101, + "loss": 3.5449, + "step": 22640 + }, + { + "epoch": 1.5385922000271774, + "grad_norm": 0.772228479385376, + "learning_rate": 0.0008077609050142682, + "loss": 3.6327, + "step": 22645 + }, + { + "epoch": 1.5389319200978395, + "grad_norm": 0.807470440864563, + "learning_rate": 0.0008077184400054355, + "loss": 3.6403, + "step": 22650 + }, + { + "epoch": 1.5392716401685012, + "grad_norm": 0.8679407238960266, + "learning_rate": 0.0008076759749966029, + "loss": 3.5145, + "step": 22655 + }, + { + "epoch": 1.5396113602391628, + "grad_norm": 0.8238019347190857, + "learning_rate": 0.0008076335099877701, + "loss": 3.6346, + "step": 22660 + }, + { + "epoch": 1.5399510803098249, + "grad_norm": 0.8320210576057434, + "learning_rate": 0.0008075910449789373, + "loss": 3.568, + "step": 22665 + }, + { + "epoch": 1.5402908003804865, + "grad_norm": 0.7420300245285034, + "learning_rate": 0.0008075485799701047, + "loss": 3.4845, + "step": 22670 + }, + { + "epoch": 1.5406305204511481, + "grad_norm": 0.7442185282707214, + "learning_rate": 0.0008075061149612719, + "loss": 3.6152, + "step": 22675 + }, + { + "epoch": 1.54097024052181, + "grad_norm": 1.2281203269958496, + "learning_rate": 0.0008074636499524392, + "loss": 3.5817, + "step": 22680 + }, + { + "epoch": 1.5413099605924718, + "grad_norm": 0.8081947565078735, + "learning_rate": 0.0008074211849436066, + "loss": 3.7695, + "step": 22685 + }, + { + "epoch": 1.5416496806631335, + "grad_norm": 0.854428231716156, + "learning_rate": 0.0008073787199347738, + "loss": 3.3019, + "step": 22690 + }, + { + "epoch": 1.5419894007337953, + "grad_norm": 0.7824023962020874, + "learning_rate": 0.0008073362549259411, + "loss": 3.7759, + "step": 22695 + }, + { + "epoch": 1.5423291208044572, + "grad_norm": 0.9708326458930969, + "learning_rate": 0.0008072937899171083, + "loss": 3.499, + "step": 22700 + }, + { + "epoch": 1.5426688408751188, + "grad_norm": 0.558819591999054, + "learning_rate": 0.0008072513249082756, + "loss": 3.5792, + "step": 22705 + }, + { + "epoch": 1.5430085609457806, + "grad_norm": 0.978066623210907, + "learning_rate": 0.0008072088598994429, + "loss": 3.5419, + "step": 22710 + }, + { + "epoch": 1.5433482810164425, + "grad_norm": 0.8693552613258362, + "learning_rate": 0.0008071663948906101, + "loss": 3.6238, + "step": 22715 + }, + { + "epoch": 1.5436880010871041, + "grad_norm": 0.8326080441474915, + "learning_rate": 0.0008071239298817775, + "loss": 3.4061, + "step": 22720 + }, + { + "epoch": 1.544027721157766, + "grad_norm": 0.8176658153533936, + "learning_rate": 0.0008070814648729448, + "loss": 3.6742, + "step": 22725 + }, + { + "epoch": 1.5443674412284278, + "grad_norm": 0.9251994490623474, + "learning_rate": 0.000807038999864112, + "loss": 3.736, + "step": 22730 + }, + { + "epoch": 1.5447071612990895, + "grad_norm": 0.696986198425293, + "learning_rate": 0.0008069965348552792, + "loss": 3.5589, + "step": 22735 + }, + { + "epoch": 1.5450468813697513, + "grad_norm": 0.6512287259101868, + "learning_rate": 0.0008069540698464466, + "loss": 3.6026, + "step": 22740 + }, + { + "epoch": 1.5453866014404132, + "grad_norm": 0.8983291983604431, + "learning_rate": 0.0008069116048376138, + "loss": 3.5935, + "step": 22745 + }, + { + "epoch": 1.5457263215110748, + "grad_norm": 0.7015606164932251, + "learning_rate": 0.000806869139828781, + "loss": 3.7548, + "step": 22750 + }, + { + "epoch": 1.5460660415817367, + "grad_norm": 0.8206071853637695, + "learning_rate": 0.0008068266748199485, + "loss": 3.5545, + "step": 22755 + }, + { + "epoch": 1.5464057616523985, + "grad_norm": 0.8466027975082397, + "learning_rate": 0.0008067842098111157, + "loss": 3.5103, + "step": 22760 + }, + { + "epoch": 1.5467454817230601, + "grad_norm": 0.822455108165741, + "learning_rate": 0.0008067417448022829, + "loss": 3.488, + "step": 22765 + }, + { + "epoch": 1.547085201793722, + "grad_norm": 0.9322642087936401, + "learning_rate": 0.0008066992797934503, + "loss": 3.4902, + "step": 22770 + }, + { + "epoch": 1.5474249218643839, + "grad_norm": 0.8981651663780212, + "learning_rate": 0.0008066568147846175, + "loss": 3.7901, + "step": 22775 + }, + { + "epoch": 1.5477646419350455, + "grad_norm": 0.9349554181098938, + "learning_rate": 0.0008066143497757847, + "loss": 3.5199, + "step": 22780 + }, + { + "epoch": 1.5481043620057073, + "grad_norm": 0.9003464579582214, + "learning_rate": 0.000806571884766952, + "loss": 3.801, + "step": 22785 + }, + { + "epoch": 1.5484440820763692, + "grad_norm": 1.1492609977722168, + "learning_rate": 0.0008065294197581194, + "loss": 3.6281, + "step": 22790 + }, + { + "epoch": 1.5487838021470308, + "grad_norm": 0.8404953479766846, + "learning_rate": 0.0008064869547492866, + "loss": 3.8341, + "step": 22795 + }, + { + "epoch": 1.5491235222176927, + "grad_norm": 0.722975492477417, + "learning_rate": 0.0008064444897404539, + "loss": 3.2842, + "step": 22800 + }, + { + "epoch": 1.5494632422883545, + "grad_norm": 0.8581058382987976, + "learning_rate": 0.0008064020247316212, + "loss": 3.3633, + "step": 22805 + }, + { + "epoch": 1.5498029623590162, + "grad_norm": 0.861478328704834, + "learning_rate": 0.0008063595597227884, + "loss": 3.4781, + "step": 22810 + }, + { + "epoch": 1.5501426824296778, + "grad_norm": 0.8061710000038147, + "learning_rate": 0.0008063170947139557, + "loss": 3.7053, + "step": 22815 + }, + { + "epoch": 1.5504824025003399, + "grad_norm": 0.6240681409835815, + "learning_rate": 0.000806274629705123, + "loss": 3.5347, + "step": 22820 + }, + { + "epoch": 1.5508221225710015, + "grad_norm": 0.9748387336730957, + "learning_rate": 0.0008062321646962903, + "loss": 3.5857, + "step": 22825 + }, + { + "epoch": 1.5511618426416631, + "grad_norm": 0.9153878092765808, + "learning_rate": 0.0008061896996874576, + "loss": 3.7423, + "step": 22830 + }, + { + "epoch": 1.5515015627123252, + "grad_norm": 0.943713366985321, + "learning_rate": 0.0008061472346786248, + "loss": 3.7113, + "step": 22835 + }, + { + "epoch": 1.5518412827829868, + "grad_norm": 0.9553012251853943, + "learning_rate": 0.0008061047696697921, + "loss": 3.7228, + "step": 22840 + }, + { + "epoch": 1.5521810028536485, + "grad_norm": 0.9145041704177856, + "learning_rate": 0.0008060623046609594, + "loss": 3.7001, + "step": 22845 + }, + { + "epoch": 1.5525207229243103, + "grad_norm": 0.7157474756240845, + "learning_rate": 0.0008060198396521266, + "loss": 3.5952, + "step": 22850 + }, + { + "epoch": 1.5528604429949722, + "grad_norm": 0.9293907880783081, + "learning_rate": 0.0008059773746432939, + "loss": 3.4588, + "step": 22855 + }, + { + "epoch": 1.5532001630656338, + "grad_norm": 0.8667646050453186, + "learning_rate": 0.0008059349096344613, + "loss": 3.6286, + "step": 22860 + }, + { + "epoch": 1.5535398831362957, + "grad_norm": 0.7536961436271667, + "learning_rate": 0.0008058924446256285, + "loss": 3.6848, + "step": 22865 + }, + { + "epoch": 1.5538796032069575, + "grad_norm": 0.9776958227157593, + "learning_rate": 0.0008058499796167958, + "loss": 3.5161, + "step": 22870 + }, + { + "epoch": 1.5542193232776191, + "grad_norm": 0.7712224721908569, + "learning_rate": 0.0008058075146079631, + "loss": 3.6939, + "step": 22875 + }, + { + "epoch": 1.554559043348281, + "grad_norm": 0.6326374411582947, + "learning_rate": 0.0008057650495991303, + "loss": 3.6126, + "step": 22880 + }, + { + "epoch": 1.5548987634189428, + "grad_norm": 0.8118758201599121, + "learning_rate": 0.0008057225845902975, + "loss": 3.6008, + "step": 22885 + }, + { + "epoch": 1.5552384834896045, + "grad_norm": 1.1382594108581543, + "learning_rate": 0.0008056801195814649, + "loss": 3.6725, + "step": 22890 + }, + { + "epoch": 1.5555782035602663, + "grad_norm": 0.9046120047569275, + "learning_rate": 0.0008056376545726322, + "loss": 3.7402, + "step": 22895 + }, + { + "epoch": 1.5559179236309282, + "grad_norm": 0.9160743355751038, + "learning_rate": 0.0008055951895637994, + "loss": 3.4929, + "step": 22900 + }, + { + "epoch": 1.5562576437015898, + "grad_norm": 0.665942907333374, + "learning_rate": 0.0008055527245549668, + "loss": 3.649, + "step": 22905 + }, + { + "epoch": 1.5565973637722517, + "grad_norm": 0.8153183460235596, + "learning_rate": 0.000805510259546134, + "loss": 3.6461, + "step": 22910 + }, + { + "epoch": 1.5569370838429135, + "grad_norm": 0.8035951852798462, + "learning_rate": 0.0008054677945373012, + "loss": 3.7177, + "step": 22915 + }, + { + "epoch": 1.5572768039135751, + "grad_norm": 0.784641444683075, + "learning_rate": 0.0008054253295284686, + "loss": 3.5771, + "step": 22920 + }, + { + "epoch": 1.557616523984237, + "grad_norm": 0.7522456645965576, + "learning_rate": 0.0008053828645196358, + "loss": 3.7286, + "step": 22925 + }, + { + "epoch": 1.5579562440548989, + "grad_norm": 0.7701612710952759, + "learning_rate": 0.0008053403995108031, + "loss": 3.5445, + "step": 22930 + }, + { + "epoch": 1.5582959641255605, + "grad_norm": 0.7606229186058044, + "learning_rate": 0.0008052979345019704, + "loss": 3.5027, + "step": 22935 + }, + { + "epoch": 1.5586356841962223, + "grad_norm": 0.6217659115791321, + "learning_rate": 0.0008052554694931377, + "loss": 3.7082, + "step": 22940 + }, + { + "epoch": 1.5589754042668842, + "grad_norm": 0.8623255491256714, + "learning_rate": 0.0008052130044843049, + "loss": 3.5639, + "step": 22945 + }, + { + "epoch": 1.5593151243375458, + "grad_norm": 0.6672137975692749, + "learning_rate": 0.0008051705394754722, + "loss": 3.5258, + "step": 22950 + }, + { + "epoch": 1.5596548444082077, + "grad_norm": 0.9366925358772278, + "learning_rate": 0.0008051280744666395, + "loss": 3.5991, + "step": 22955 + }, + { + "epoch": 1.5599945644788695, + "grad_norm": 0.9136122465133667, + "learning_rate": 0.0008050856094578067, + "loss": 3.5992, + "step": 22960 + }, + { + "epoch": 1.5603342845495312, + "grad_norm": 2.7036328315734863, + "learning_rate": 0.0008050431444489741, + "loss": 3.6638, + "step": 22965 + }, + { + "epoch": 1.560674004620193, + "grad_norm": 0.8905829787254333, + "learning_rate": 0.0008050006794401414, + "loss": 3.7614, + "step": 22970 + }, + { + "epoch": 1.5610137246908549, + "grad_norm": 0.8904100060462952, + "learning_rate": 0.0008049582144313086, + "loss": 3.3855, + "step": 22975 + }, + { + "epoch": 1.5613534447615165, + "grad_norm": 0.8152630925178528, + "learning_rate": 0.0008049157494224759, + "loss": 3.3973, + "step": 22980 + }, + { + "epoch": 1.5616931648321781, + "grad_norm": 0.8973947167396545, + "learning_rate": 0.0008048732844136431, + "loss": 3.5177, + "step": 22985 + }, + { + "epoch": 1.5620328849028402, + "grad_norm": 0.928124189376831, + "learning_rate": 0.0008048308194048104, + "loss": 3.8619, + "step": 22990 + }, + { + "epoch": 1.5623726049735018, + "grad_norm": 0.6332435011863708, + "learning_rate": 0.0008047883543959777, + "loss": 3.4902, + "step": 22995 + }, + { + "epoch": 1.5627123250441635, + "grad_norm": 1.0339155197143555, + "learning_rate": 0.000804745889387145, + "loss": 3.5277, + "step": 23000 + }, + { + "epoch": 1.5630520451148255, + "grad_norm": 0.8653725981712341, + "learning_rate": 0.0008047034243783123, + "loss": 3.545, + "step": 23005 + }, + { + "epoch": 1.5633917651854872, + "grad_norm": 0.7483752965927124, + "learning_rate": 0.0008046609593694796, + "loss": 3.5085, + "step": 23010 + }, + { + "epoch": 1.5637314852561488, + "grad_norm": 1.0611330270767212, + "learning_rate": 0.0008046184943606468, + "loss": 3.9361, + "step": 23015 + }, + { + "epoch": 1.5640712053268107, + "grad_norm": 0.8113749623298645, + "learning_rate": 0.0008045760293518142, + "loss": 3.7584, + "step": 23020 + }, + { + "epoch": 1.5644109253974725, + "grad_norm": 0.9361585974693298, + "learning_rate": 0.0008045335643429814, + "loss": 3.4338, + "step": 23025 + }, + { + "epoch": 1.5647506454681341, + "grad_norm": 1.1578115224838257, + "learning_rate": 0.0008044910993341486, + "loss": 3.4553, + "step": 23030 + }, + { + "epoch": 1.565090365538796, + "grad_norm": 0.8212599158287048, + "learning_rate": 0.000804448634325316, + "loss": 3.8481, + "step": 23035 + }, + { + "epoch": 1.5654300856094578, + "grad_norm": 0.9356555938720703, + "learning_rate": 0.0008044061693164833, + "loss": 3.6054, + "step": 23040 + }, + { + "epoch": 1.5657698056801195, + "grad_norm": 0.8156973719596863, + "learning_rate": 0.0008043637043076505, + "loss": 3.4773, + "step": 23045 + }, + { + "epoch": 1.5661095257507813, + "grad_norm": 0.79892897605896, + "learning_rate": 0.0008043212392988178, + "loss": 3.6782, + "step": 23050 + }, + { + "epoch": 1.5664492458214432, + "grad_norm": 1.0431510210037231, + "learning_rate": 0.0008042787742899851, + "loss": 3.6034, + "step": 23055 + }, + { + "epoch": 1.5667889658921048, + "grad_norm": 0.6228598952293396, + "learning_rate": 0.0008042363092811523, + "loss": 3.7185, + "step": 23060 + }, + { + "epoch": 1.5671286859627667, + "grad_norm": 0.9214348196983337, + "learning_rate": 0.0008041938442723196, + "loss": 3.8911, + "step": 23065 + }, + { + "epoch": 1.5674684060334285, + "grad_norm": 0.8524571657180786, + "learning_rate": 0.000804151379263487, + "loss": 3.9461, + "step": 23070 + }, + { + "epoch": 1.5678081261040901, + "grad_norm": 1.0265371799468994, + "learning_rate": 0.0008041089142546542, + "loss": 3.5768, + "step": 23075 + }, + { + "epoch": 1.568147846174752, + "grad_norm": 0.7600913643836975, + "learning_rate": 0.0008040664492458215, + "loss": 3.7367, + "step": 23080 + }, + { + "epoch": 1.5684875662454139, + "grad_norm": 0.8054922223091125, + "learning_rate": 0.0008040239842369887, + "loss": 3.6108, + "step": 23085 + }, + { + "epoch": 1.5688272863160755, + "grad_norm": 0.861549973487854, + "learning_rate": 0.000803981519228156, + "loss": 3.6313, + "step": 23090 + }, + { + "epoch": 1.5691670063867373, + "grad_norm": 0.9068548679351807, + "learning_rate": 0.0008039390542193233, + "loss": 3.6025, + "step": 23095 + }, + { + "epoch": 1.5695067264573992, + "grad_norm": 0.739829421043396, + "learning_rate": 0.0008038965892104905, + "loss": 3.6783, + "step": 23100 + }, + { + "epoch": 1.5698464465280608, + "grad_norm": 0.9467491507530212, + "learning_rate": 0.0008038541242016579, + "loss": 3.6867, + "step": 23105 + }, + { + "epoch": 1.5701861665987227, + "grad_norm": 0.660178005695343, + "learning_rate": 0.0008038116591928252, + "loss": 3.59, + "step": 23110 + }, + { + "epoch": 1.5705258866693845, + "grad_norm": 1.1303554773330688, + "learning_rate": 0.0008037691941839924, + "loss": 3.6687, + "step": 23115 + }, + { + "epoch": 1.5708656067400462, + "grad_norm": 0.72696453332901, + "learning_rate": 0.0008037267291751596, + "loss": 3.4973, + "step": 23120 + }, + { + "epoch": 1.571205326810708, + "grad_norm": 0.9332923293113708, + "learning_rate": 0.000803684264166327, + "loss": 3.7279, + "step": 23125 + }, + { + "epoch": 1.5715450468813699, + "grad_norm": 0.7196802496910095, + "learning_rate": 0.0008036417991574942, + "loss": 3.6727, + "step": 23130 + }, + { + "epoch": 1.5718847669520315, + "grad_norm": 0.7161621451377869, + "learning_rate": 0.0008035993341486614, + "loss": 3.7969, + "step": 23135 + }, + { + "epoch": 1.5722244870226934, + "grad_norm": 0.8757641315460205, + "learning_rate": 0.0008035568691398289, + "loss": 3.6314, + "step": 23140 + }, + { + "epoch": 1.5725642070933552, + "grad_norm": 0.7029061317443848, + "learning_rate": 0.0008035144041309961, + "loss": 3.6192, + "step": 23145 + }, + { + "epoch": 1.5729039271640168, + "grad_norm": 0.7227541208267212, + "learning_rate": 0.0008034719391221633, + "loss": 3.6975, + "step": 23150 + }, + { + "epoch": 1.5732436472346785, + "grad_norm": 1.8961331844329834, + "learning_rate": 0.0008034294741133307, + "loss": 3.7827, + "step": 23155 + }, + { + "epoch": 1.5735833673053405, + "grad_norm": 0.8577156066894531, + "learning_rate": 0.0008033870091044979, + "loss": 3.4611, + "step": 23160 + }, + { + "epoch": 1.5739230873760022, + "grad_norm": 0.6845523715019226, + "learning_rate": 0.0008033445440956651, + "loss": 3.5893, + "step": 23165 + }, + { + "epoch": 1.5742628074466638, + "grad_norm": 1.2792456150054932, + "learning_rate": 0.0008033020790868326, + "loss": 3.751, + "step": 23170 + }, + { + "epoch": 1.5746025275173259, + "grad_norm": 0.9697450995445251, + "learning_rate": 0.0008032596140779998, + "loss": 3.7381, + "step": 23175 + }, + { + "epoch": 1.5749422475879875, + "grad_norm": 0.7749747633934021, + "learning_rate": 0.000803217149069167, + "loss": 3.7721, + "step": 23180 + }, + { + "epoch": 1.5752819676586491, + "grad_norm": 0.9136292338371277, + "learning_rate": 0.0008031746840603343, + "loss": 3.6624, + "step": 23185 + }, + { + "epoch": 1.575621687729311, + "grad_norm": 0.8991854190826416, + "learning_rate": 0.0008031322190515016, + "loss": 3.7188, + "step": 23190 + }, + { + "epoch": 1.5759614077999728, + "grad_norm": 0.8085570931434631, + "learning_rate": 0.0008030897540426688, + "loss": 3.9886, + "step": 23195 + }, + { + "epoch": 1.5763011278706345, + "grad_norm": 0.8111883997917175, + "learning_rate": 0.0008030472890338361, + "loss": 3.2899, + "step": 23200 + }, + { + "epoch": 1.5766408479412963, + "grad_norm": 1.3886754512786865, + "learning_rate": 0.0008030048240250035, + "loss": 3.6384, + "step": 23205 + }, + { + "epoch": 1.5769805680119582, + "grad_norm": 1.0280253887176514, + "learning_rate": 0.0008029623590161707, + "loss": 3.4959, + "step": 23210 + }, + { + "epoch": 1.5773202880826198, + "grad_norm": 0.7412897944450378, + "learning_rate": 0.000802919894007338, + "loss": 3.4594, + "step": 23215 + }, + { + "epoch": 1.5776600081532817, + "grad_norm": 0.906151294708252, + "learning_rate": 0.0008028774289985052, + "loss": 3.5371, + "step": 23220 + }, + { + "epoch": 1.5779997282239435, + "grad_norm": 0.7241297960281372, + "learning_rate": 0.0008028349639896725, + "loss": 3.6938, + "step": 23225 + }, + { + "epoch": 1.5783394482946052, + "grad_norm": 0.7996935248374939, + "learning_rate": 0.0008027924989808398, + "loss": 3.4998, + "step": 23230 + }, + { + "epoch": 1.578679168365267, + "grad_norm": 0.9824647307395935, + "learning_rate": 0.000802750033972007, + "loss": 3.6929, + "step": 23235 + }, + { + "epoch": 1.5790188884359289, + "grad_norm": 0.8569043874740601, + "learning_rate": 0.0008027075689631744, + "loss": 3.5072, + "step": 23240 + }, + { + "epoch": 1.5793586085065905, + "grad_norm": 0.9508857727050781, + "learning_rate": 0.0008026651039543417, + "loss": 3.4209, + "step": 23245 + }, + { + "epoch": 1.5796983285772523, + "grad_norm": 0.7415825128555298, + "learning_rate": 0.0008026226389455089, + "loss": 3.6002, + "step": 23250 + }, + { + "epoch": 1.5800380486479142, + "grad_norm": 0.9358696937561035, + "learning_rate": 0.0008025801739366762, + "loss": 3.7487, + "step": 23255 + }, + { + "epoch": 1.5803777687185758, + "grad_norm": 0.8989538550376892, + "learning_rate": 0.0008025377089278435, + "loss": 3.6524, + "step": 23260 + }, + { + "epoch": 1.5807174887892377, + "grad_norm": 1.089047908782959, + "learning_rate": 0.0008024952439190107, + "loss": 3.6646, + "step": 23265 + }, + { + "epoch": 1.5810572088598995, + "grad_norm": 0.7854350209236145, + "learning_rate": 0.0008024527789101779, + "loss": 3.7039, + "step": 23270 + }, + { + "epoch": 1.5813969289305612, + "grad_norm": 0.7841466069221497, + "learning_rate": 0.0008024103139013454, + "loss": 3.8842, + "step": 23275 + }, + { + "epoch": 1.581736649001223, + "grad_norm": 0.8465746641159058, + "learning_rate": 0.0008023678488925126, + "loss": 3.8284, + "step": 23280 + }, + { + "epoch": 1.5820763690718849, + "grad_norm": 0.9133961796760559, + "learning_rate": 0.0008023253838836798, + "loss": 3.6686, + "step": 23285 + }, + { + "epoch": 1.5824160891425465, + "grad_norm": 0.98038250207901, + "learning_rate": 0.0008022829188748472, + "loss": 3.1521, + "step": 23290 + }, + { + "epoch": 1.5827558092132084, + "grad_norm": 0.9677000641822815, + "learning_rate": 0.0008022404538660144, + "loss": 3.523, + "step": 23295 + }, + { + "epoch": 1.5830955292838702, + "grad_norm": 0.8742707371711731, + "learning_rate": 0.0008021979888571816, + "loss": 3.6237, + "step": 23300 + }, + { + "epoch": 1.5834352493545318, + "grad_norm": 0.8464352488517761, + "learning_rate": 0.000802155523848349, + "loss": 3.6374, + "step": 23305 + }, + { + "epoch": 1.5837749694251937, + "grad_norm": 0.8734097480773926, + "learning_rate": 0.0008021130588395163, + "loss": 3.4717, + "step": 23310 + }, + { + "epoch": 1.5841146894958555, + "grad_norm": 1.6379852294921875, + "learning_rate": 0.0008020705938306835, + "loss": 3.3631, + "step": 23315 + }, + { + "epoch": 1.5844544095665172, + "grad_norm": 0.7916216254234314, + "learning_rate": 0.0008020281288218509, + "loss": 3.4244, + "step": 23320 + }, + { + "epoch": 1.5847941296371788, + "grad_norm": 0.8907950520515442, + "learning_rate": 0.0008019856638130181, + "loss": 3.5928, + "step": 23325 + }, + { + "epoch": 1.5851338497078409, + "grad_norm": 0.7809728980064392, + "learning_rate": 0.0008019431988041853, + "loss": 3.6664, + "step": 23330 + }, + { + "epoch": 1.5854735697785025, + "grad_norm": 0.784126341342926, + "learning_rate": 0.0008019007337953526, + "loss": 3.67, + "step": 23335 + }, + { + "epoch": 1.5858132898491641, + "grad_norm": 1.2602758407592773, + "learning_rate": 0.0008018582687865199, + "loss": 3.4859, + "step": 23340 + }, + { + "epoch": 1.5861530099198262, + "grad_norm": 0.8480249047279358, + "learning_rate": 0.0008018158037776872, + "loss": 3.5289, + "step": 23345 + }, + { + "epoch": 1.5864927299904878, + "grad_norm": 0.8000895380973816, + "learning_rate": 0.0008017733387688545, + "loss": 3.9535, + "step": 23350 + }, + { + "epoch": 1.5868324500611495, + "grad_norm": 0.9857907295227051, + "learning_rate": 0.0008017308737600218, + "loss": 3.6584, + "step": 23355 + }, + { + "epoch": 1.5871721701318113, + "grad_norm": 0.9857244491577148, + "learning_rate": 0.0008016884087511891, + "loss": 3.454, + "step": 23360 + }, + { + "epoch": 1.5875118902024732, + "grad_norm": 0.7638331651687622, + "learning_rate": 0.0008016459437423563, + "loss": 3.7068, + "step": 23365 + }, + { + "epoch": 1.5878516102731348, + "grad_norm": 1.0071889162063599, + "learning_rate": 0.0008016034787335235, + "loss": 3.6967, + "step": 23370 + }, + { + "epoch": 1.5881913303437967, + "grad_norm": 0.6308431029319763, + "learning_rate": 0.0008015610137246909, + "loss": 3.4985, + "step": 23375 + }, + { + "epoch": 1.5885310504144585, + "grad_norm": 0.7738470435142517, + "learning_rate": 0.0008015185487158582, + "loss": 3.6494, + "step": 23380 + }, + { + "epoch": 1.5888707704851202, + "grad_norm": 0.9574844241142273, + "learning_rate": 0.0008014760837070254, + "loss": 3.7692, + "step": 23385 + }, + { + "epoch": 1.589210490555782, + "grad_norm": 1.6807000637054443, + "learning_rate": 0.0008014336186981928, + "loss": 3.504, + "step": 23390 + }, + { + "epoch": 1.5895502106264439, + "grad_norm": 0.7377382516860962, + "learning_rate": 0.00080139115368936, + "loss": 3.7482, + "step": 23395 + }, + { + "epoch": 1.5898899306971055, + "grad_norm": 0.9454064965248108, + "learning_rate": 0.0008013486886805272, + "loss": 3.5457, + "step": 23400 + }, + { + "epoch": 1.5902296507677673, + "grad_norm": 0.8349514603614807, + "learning_rate": 0.0008013062236716946, + "loss": 3.8438, + "step": 23405 + }, + { + "epoch": 1.5905693708384292, + "grad_norm": 0.8690358400344849, + "learning_rate": 0.0008012637586628618, + "loss": 3.7738, + "step": 23410 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.6037783026695251, + "learning_rate": 0.0008012212936540291, + "loss": 3.6409, + "step": 23415 + }, + { + "epoch": 1.5912488109797527, + "grad_norm": 0.7523235082626343, + "learning_rate": 0.0008011788286451965, + "loss": 3.3945, + "step": 23420 + }, + { + "epoch": 1.5915885310504145, + "grad_norm": 0.8534591197967529, + "learning_rate": 0.0008011363636363637, + "loss": 3.799, + "step": 23425 + }, + { + "epoch": 1.5919282511210762, + "grad_norm": 0.8157196044921875, + "learning_rate": 0.0008010938986275309, + "loss": 3.549, + "step": 23430 + }, + { + "epoch": 1.592267971191738, + "grad_norm": 0.9259016513824463, + "learning_rate": 0.0008010514336186982, + "loss": 3.303, + "step": 23435 + }, + { + "epoch": 1.5926076912623999, + "grad_norm": 2.529879093170166, + "learning_rate": 0.0008010089686098655, + "loss": 3.2383, + "step": 23440 + }, + { + "epoch": 1.5929474113330615, + "grad_norm": 0.8630452156066895, + "learning_rate": 0.0008009665036010327, + "loss": 3.4938, + "step": 23445 + }, + { + "epoch": 1.5932871314037234, + "grad_norm": 0.8006749749183655, + "learning_rate": 0.0008009240385922001, + "loss": 3.7822, + "step": 23450 + }, + { + "epoch": 1.5936268514743852, + "grad_norm": 0.8409814834594727, + "learning_rate": 0.0008008815735833674, + "loss": 3.4596, + "step": 23455 + }, + { + "epoch": 1.5939665715450468, + "grad_norm": 0.824252724647522, + "learning_rate": 0.0008008391085745346, + "loss": 3.7668, + "step": 23460 + }, + { + "epoch": 1.5943062916157087, + "grad_norm": 0.683891236782074, + "learning_rate": 0.0008007966435657019, + "loss": 3.404, + "step": 23465 + }, + { + "epoch": 1.5946460116863705, + "grad_norm": 1.0803139209747314, + "learning_rate": 0.0008007541785568691, + "loss": 3.7135, + "step": 23470 + }, + { + "epoch": 1.5949857317570322, + "grad_norm": 0.7687934041023254, + "learning_rate": 0.0008007117135480364, + "loss": 3.2975, + "step": 23475 + }, + { + "epoch": 1.595325451827694, + "grad_norm": 0.7660512924194336, + "learning_rate": 0.0008006692485392037, + "loss": 3.6003, + "step": 23480 + }, + { + "epoch": 1.5956651718983559, + "grad_norm": 0.9547146558761597, + "learning_rate": 0.000800626783530371, + "loss": 3.5028, + "step": 23485 + }, + { + "epoch": 1.5960048919690175, + "grad_norm": 0.7855880856513977, + "learning_rate": 0.0008005843185215383, + "loss": 3.4482, + "step": 23490 + }, + { + "epoch": 1.5963446120396791, + "grad_norm": 1.1546063423156738, + "learning_rate": 0.0008005418535127056, + "loss": 3.4998, + "step": 23495 + }, + { + "epoch": 1.5966843321103412, + "grad_norm": 0.9801806211471558, + "learning_rate": 0.0008004993885038728, + "loss": 3.7316, + "step": 23500 + }, + { + "epoch": 1.5970240521810029, + "grad_norm": 0.9691447615623474, + "learning_rate": 0.00080045692349504, + "loss": 3.3114, + "step": 23505 + }, + { + "epoch": 1.5973637722516645, + "grad_norm": 0.8534424304962158, + "learning_rate": 0.0008004144584862074, + "loss": 3.5818, + "step": 23510 + }, + { + "epoch": 1.5977034923223266, + "grad_norm": 0.8292694091796875, + "learning_rate": 0.0008003719934773746, + "loss": 3.7204, + "step": 23515 + }, + { + "epoch": 1.5980432123929882, + "grad_norm": 0.9203265905380249, + "learning_rate": 0.0008003295284685419, + "loss": 3.5932, + "step": 23520 + }, + { + "epoch": 1.5983829324636498, + "grad_norm": 0.8803809881210327, + "learning_rate": 0.0008002870634597093, + "loss": 3.7692, + "step": 23525 + }, + { + "epoch": 1.5987226525343117, + "grad_norm": 0.7334040403366089, + "learning_rate": 0.0008002445984508765, + "loss": 3.4468, + "step": 23530 + }, + { + "epoch": 1.5990623726049735, + "grad_norm": 0.829446017742157, + "learning_rate": 0.0008002021334420437, + "loss": 3.341, + "step": 23535 + }, + { + "epoch": 1.5994020926756352, + "grad_norm": 2.199932098388672, + "learning_rate": 0.0008001596684332111, + "loss": 3.7761, + "step": 23540 + }, + { + "epoch": 1.599741812746297, + "grad_norm": 0.9488390684127808, + "learning_rate": 0.0008001172034243783, + "loss": 3.4355, + "step": 23545 + }, + { + "epoch": 1.6000815328169589, + "grad_norm": 0.8024408221244812, + "learning_rate": 0.0008000747384155455, + "loss": 3.8137, + "step": 23550 + }, + { + "epoch": 1.6004212528876205, + "grad_norm": 0.7753660082817078, + "learning_rate": 0.000800032273406713, + "loss": 3.625, + "step": 23555 + }, + { + "epoch": 1.6007609729582823, + "grad_norm": 0.8034167289733887, + "learning_rate": 0.0007999898083978802, + "loss": 3.6949, + "step": 23560 + }, + { + "epoch": 1.6011006930289442, + "grad_norm": 0.8872370719909668, + "learning_rate": 0.0007999473433890474, + "loss": 3.59, + "step": 23565 + }, + { + "epoch": 1.6014404130996058, + "grad_norm": 0.776997447013855, + "learning_rate": 0.0007999048783802147, + "loss": 3.7343, + "step": 23570 + }, + { + "epoch": 1.6017801331702677, + "grad_norm": 0.9146755337715149, + "learning_rate": 0.000799862413371382, + "loss": 3.5787, + "step": 23575 + }, + { + "epoch": 1.6021198532409295, + "grad_norm": 1.4037615060806274, + "learning_rate": 0.0007998199483625492, + "loss": 3.4511, + "step": 23580 + }, + { + "epoch": 1.6024595733115912, + "grad_norm": 0.888992965221405, + "learning_rate": 0.0007997774833537165, + "loss": 3.9292, + "step": 23585 + }, + { + "epoch": 1.602799293382253, + "grad_norm": 0.7413973808288574, + "learning_rate": 0.0007997350183448839, + "loss": 3.7729, + "step": 23590 + }, + { + "epoch": 1.6031390134529149, + "grad_norm": 0.829764723777771, + "learning_rate": 0.0007996925533360511, + "loss": 3.5326, + "step": 23595 + }, + { + "epoch": 1.6034787335235765, + "grad_norm": 1.0812530517578125, + "learning_rate": 0.0007996500883272184, + "loss": 3.7931, + "step": 23600 + }, + { + "epoch": 1.6038184535942384, + "grad_norm": 0.7724177837371826, + "learning_rate": 0.0007996076233183857, + "loss": 3.7528, + "step": 23605 + }, + { + "epoch": 1.6041581736649002, + "grad_norm": 0.740287721157074, + "learning_rate": 0.0007995651583095529, + "loss": 3.5682, + "step": 23610 + }, + { + "epoch": 1.6044978937355618, + "grad_norm": 1.2063742876052856, + "learning_rate": 0.0007995226933007202, + "loss": 3.2938, + "step": 23615 + }, + { + "epoch": 1.6048376138062237, + "grad_norm": 3.648000955581665, + "learning_rate": 0.0007994802282918874, + "loss": 3.5909, + "step": 23620 + }, + { + "epoch": 1.6051773338768855, + "grad_norm": 0.7615311145782471, + "learning_rate": 0.0007994377632830548, + "loss": 3.7168, + "step": 23625 + }, + { + "epoch": 1.6055170539475472, + "grad_norm": 0.6642646193504333, + "learning_rate": 0.0007993952982742221, + "loss": 3.6124, + "step": 23630 + }, + { + "epoch": 1.605856774018209, + "grad_norm": 0.8372210264205933, + "learning_rate": 0.0007993528332653893, + "loss": 3.4363, + "step": 23635 + }, + { + "epoch": 1.6061964940888709, + "grad_norm": 0.851889967918396, + "learning_rate": 0.0007993103682565566, + "loss": 3.7541, + "step": 23640 + }, + { + "epoch": 1.6065362141595325, + "grad_norm": 0.6583231091499329, + "learning_rate": 0.0007992679032477239, + "loss": 3.4157, + "step": 23645 + }, + { + "epoch": 1.6068759342301944, + "grad_norm": 0.7521815299987793, + "learning_rate": 0.0007992254382388911, + "loss": 3.5874, + "step": 23650 + }, + { + "epoch": 1.6072156543008562, + "grad_norm": 0.8464522957801819, + "learning_rate": 0.0007991829732300583, + "loss": 3.705, + "step": 23655 + }, + { + "epoch": 1.6075553743715179, + "grad_norm": 0.6839138269424438, + "learning_rate": 0.0007991405082212258, + "loss": 3.7072, + "step": 23660 + }, + { + "epoch": 1.6078950944421795, + "grad_norm": 0.698771595954895, + "learning_rate": 0.000799098043212393, + "loss": 3.3818, + "step": 23665 + }, + { + "epoch": 1.6082348145128416, + "grad_norm": 0.8584349155426025, + "learning_rate": 0.0007990555782035602, + "loss": 3.4646, + "step": 23670 + }, + { + "epoch": 1.6085745345835032, + "grad_norm": 0.8786426782608032, + "learning_rate": 0.0007990131131947276, + "loss": 3.4081, + "step": 23675 + }, + { + "epoch": 1.6089142546541648, + "grad_norm": 0.8291193842887878, + "learning_rate": 0.0007989706481858948, + "loss": 3.5319, + "step": 23680 + }, + { + "epoch": 1.609253974724827, + "grad_norm": 0.7785034775733948, + "learning_rate": 0.000798928183177062, + "loss": 3.5886, + "step": 23685 + }, + { + "epoch": 1.6095936947954885, + "grad_norm": 0.653964102268219, + "learning_rate": 0.0007988857181682295, + "loss": 3.6945, + "step": 23690 + }, + { + "epoch": 1.6099334148661502, + "grad_norm": 0.8138206601142883, + "learning_rate": 0.0007988432531593967, + "loss": 3.2695, + "step": 23695 + }, + { + "epoch": 1.610273134936812, + "grad_norm": 0.775956392288208, + "learning_rate": 0.000798800788150564, + "loss": 3.6393, + "step": 23700 + }, + { + "epoch": 1.6106128550074739, + "grad_norm": 0.6256877183914185, + "learning_rate": 0.0007987583231417313, + "loss": 3.5731, + "step": 23705 + }, + { + "epoch": 1.6109525750781355, + "grad_norm": 0.8629571795463562, + "learning_rate": 0.0007987158581328985, + "loss": 3.6673, + "step": 23710 + }, + { + "epoch": 1.6112922951487973, + "grad_norm": 0.8070040345191956, + "learning_rate": 0.0007986733931240658, + "loss": 3.682, + "step": 23715 + }, + { + "epoch": 1.6116320152194592, + "grad_norm": 0.7501397132873535, + "learning_rate": 0.000798630928115233, + "loss": 3.8609, + "step": 23720 + }, + { + "epoch": 1.6119717352901208, + "grad_norm": 0.9708045125007629, + "learning_rate": 0.0007985884631064004, + "loss": 3.5229, + "step": 23725 + }, + { + "epoch": 1.6123114553607827, + "grad_norm": 0.8499360680580139, + "learning_rate": 0.0007985459980975677, + "loss": 3.8556, + "step": 23730 + }, + { + "epoch": 1.6126511754314445, + "grad_norm": 0.9176612496376038, + "learning_rate": 0.0007985035330887349, + "loss": 3.6358, + "step": 23735 + }, + { + "epoch": 1.6129908955021062, + "grad_norm": 0.7003267407417297, + "learning_rate": 0.0007984610680799022, + "loss": 3.3151, + "step": 23740 + }, + { + "epoch": 1.613330615572768, + "grad_norm": 0.8738831877708435, + "learning_rate": 0.0007984186030710695, + "loss": 3.6452, + "step": 23745 + }, + { + "epoch": 1.6136703356434299, + "grad_norm": 0.7889330983161926, + "learning_rate": 0.0007983761380622367, + "loss": 3.6826, + "step": 23750 + }, + { + "epoch": 1.6140100557140915, + "grad_norm": 0.7811580896377563, + "learning_rate": 0.000798333673053404, + "loss": 3.7733, + "step": 23755 + }, + { + "epoch": 1.6143497757847534, + "grad_norm": 0.8554701209068298, + "learning_rate": 0.0007982912080445714, + "loss": 3.6259, + "step": 23760 + }, + { + "epoch": 1.6146894958554152, + "grad_norm": 0.8652950525283813, + "learning_rate": 0.0007982487430357386, + "loss": 3.5671, + "step": 23765 + }, + { + "epoch": 1.6150292159260768, + "grad_norm": 0.8608037233352661, + "learning_rate": 0.0007982062780269058, + "loss": 3.2791, + "step": 23770 + }, + { + "epoch": 1.6153689359967387, + "grad_norm": 0.7901681065559387, + "learning_rate": 0.0007981638130180732, + "loss": 3.6603, + "step": 23775 + }, + { + "epoch": 1.6157086560674006, + "grad_norm": 0.900314211845398, + "learning_rate": 0.0007981213480092404, + "loss": 3.6424, + "step": 23780 + }, + { + "epoch": 1.6160483761380622, + "grad_norm": 0.7887551784515381, + "learning_rate": 0.0007980788830004076, + "loss": 3.4569, + "step": 23785 + }, + { + "epoch": 1.616388096208724, + "grad_norm": 0.8453372120857239, + "learning_rate": 0.000798036417991575, + "loss": 3.5705, + "step": 23790 + }, + { + "epoch": 1.6167278162793859, + "grad_norm": 1.0080350637435913, + "learning_rate": 0.0007979939529827423, + "loss": 3.3754, + "step": 23795 + }, + { + "epoch": 1.6170675363500475, + "grad_norm": 0.7987036108970642, + "learning_rate": 0.0007979514879739095, + "loss": 3.656, + "step": 23800 + }, + { + "epoch": 1.6174072564207094, + "grad_norm": 0.7544422745704651, + "learning_rate": 0.0007979090229650769, + "loss": 3.4493, + "step": 23805 + }, + { + "epoch": 1.6177469764913712, + "grad_norm": 0.9198125004768372, + "learning_rate": 0.0007978665579562441, + "loss": 3.5881, + "step": 23810 + }, + { + "epoch": 1.6180866965620329, + "grad_norm": 0.70620197057724, + "learning_rate": 0.0007978240929474113, + "loss": 3.7933, + "step": 23815 + }, + { + "epoch": 1.6184264166326947, + "grad_norm": 0.6820084452629089, + "learning_rate": 0.0007977816279385786, + "loss": 3.5054, + "step": 23820 + }, + { + "epoch": 1.6187661367033566, + "grad_norm": 0.9124088883399963, + "learning_rate": 0.0007977391629297459, + "loss": 3.6126, + "step": 23825 + }, + { + "epoch": 1.6191058567740182, + "grad_norm": 1.1282427310943604, + "learning_rate": 0.0007976966979209132, + "loss": 3.6034, + "step": 23830 + }, + { + "epoch": 1.6194455768446798, + "grad_norm": 0.7744703888893127, + "learning_rate": 0.0007976542329120805, + "loss": 3.5311, + "step": 23835 + }, + { + "epoch": 1.619785296915342, + "grad_norm": 0.722459077835083, + "learning_rate": 0.0007976117679032478, + "loss": 3.4292, + "step": 23840 + }, + { + "epoch": 1.6201250169860035, + "grad_norm": 0.9394921660423279, + "learning_rate": 0.000797569302894415, + "loss": 3.3022, + "step": 23845 + }, + { + "epoch": 1.6204647370566652, + "grad_norm": 0.8032643795013428, + "learning_rate": 0.0007975268378855823, + "loss": 3.7957, + "step": 23850 + }, + { + "epoch": 1.6208044571273272, + "grad_norm": 0.8759027123451233, + "learning_rate": 0.0007974843728767495, + "loss": 3.7524, + "step": 23855 + }, + { + "epoch": 1.6211441771979889, + "grad_norm": 0.7859236001968384, + "learning_rate": 0.0007974419078679168, + "loss": 3.4549, + "step": 23860 + }, + { + "epoch": 1.6214838972686505, + "grad_norm": 0.8306437134742737, + "learning_rate": 0.0007973994428590842, + "loss": 3.4034, + "step": 23865 + }, + { + "epoch": 1.6218236173393124, + "grad_norm": 0.8684704899787903, + "learning_rate": 0.0007973569778502514, + "loss": 3.4857, + "step": 23870 + }, + { + "epoch": 1.6221633374099742, + "grad_norm": 0.9043675065040588, + "learning_rate": 0.0007973145128414187, + "loss": 3.5545, + "step": 23875 + }, + { + "epoch": 1.6225030574806358, + "grad_norm": 0.8670063018798828, + "learning_rate": 0.000797272047832586, + "loss": 3.706, + "step": 23880 + }, + { + "epoch": 1.6228427775512977, + "grad_norm": 0.8446089029312134, + "learning_rate": 0.0007972295828237532, + "loss": 3.659, + "step": 23885 + }, + { + "epoch": 1.6231824976219595, + "grad_norm": 0.8417934775352478, + "learning_rate": 0.0007971871178149205, + "loss": 3.7093, + "step": 23890 + }, + { + "epoch": 1.6235222176926212, + "grad_norm": 0.9083121418952942, + "learning_rate": 0.0007971446528060878, + "loss": 3.3869, + "step": 23895 + }, + { + "epoch": 1.623861937763283, + "grad_norm": 0.8039830923080444, + "learning_rate": 0.0007971021877972551, + "loss": 3.5859, + "step": 23900 + }, + { + "epoch": 1.6242016578339449, + "grad_norm": 0.821426272392273, + "learning_rate": 0.0007970597227884223, + "loss": 3.6463, + "step": 23905 + }, + { + "epoch": 1.6245413779046065, + "grad_norm": 2.8626513481140137, + "learning_rate": 0.0007970172577795897, + "loss": 3.6121, + "step": 23910 + }, + { + "epoch": 1.6248810979752684, + "grad_norm": 0.9663615226745605, + "learning_rate": 0.0007969747927707569, + "loss": 3.5968, + "step": 23915 + }, + { + "epoch": 1.6252208180459302, + "grad_norm": 0.8818921446800232, + "learning_rate": 0.0007969323277619241, + "loss": 3.5836, + "step": 23920 + }, + { + "epoch": 1.6255605381165918, + "grad_norm": 0.7534163594245911, + "learning_rate": 0.0007968898627530915, + "loss": 3.6573, + "step": 23925 + }, + { + "epoch": 1.6259002581872537, + "grad_norm": 0.640630841255188, + "learning_rate": 0.0007968473977442587, + "loss": 3.5806, + "step": 23930 + }, + { + "epoch": 1.6262399782579156, + "grad_norm": 0.8478986620903015, + "learning_rate": 0.000796804932735426, + "loss": 3.4252, + "step": 23935 + }, + { + "epoch": 1.6265796983285772, + "grad_norm": 0.7138925790786743, + "learning_rate": 0.0007967624677265934, + "loss": 3.5066, + "step": 23940 + }, + { + "epoch": 1.626919418399239, + "grad_norm": 1.074316143989563, + "learning_rate": 0.0007967200027177606, + "loss": 3.6065, + "step": 23945 + }, + { + "epoch": 1.627259138469901, + "grad_norm": 0.8808876276016235, + "learning_rate": 0.0007966775377089278, + "loss": 3.377, + "step": 23950 + }, + { + "epoch": 1.6275988585405625, + "grad_norm": 0.8790087699890137, + "learning_rate": 0.0007966350727000951, + "loss": 3.7083, + "step": 23955 + }, + { + "epoch": 1.6279385786112244, + "grad_norm": 0.8042747974395752, + "learning_rate": 0.0007965926076912624, + "loss": 3.9143, + "step": 23960 + }, + { + "epoch": 1.6282782986818862, + "grad_norm": 0.9414618015289307, + "learning_rate": 0.0007965501426824296, + "loss": 3.5862, + "step": 23965 + }, + { + "epoch": 1.6286180187525479, + "grad_norm": 0.7246545553207397, + "learning_rate": 0.000796507677673597, + "loss": 3.6764, + "step": 23970 + }, + { + "epoch": 1.6289577388232097, + "grad_norm": 0.7699822783470154, + "learning_rate": 0.0007964652126647643, + "loss": 3.5686, + "step": 23975 + }, + { + "epoch": 1.6292974588938716, + "grad_norm": 0.9942451119422913, + "learning_rate": 0.0007964227476559315, + "loss": 3.5792, + "step": 23980 + }, + { + "epoch": 1.6296371789645332, + "grad_norm": 0.8103046417236328, + "learning_rate": 0.0007963802826470988, + "loss": 3.8364, + "step": 23985 + }, + { + "epoch": 1.629976899035195, + "grad_norm": 1.04445219039917, + "learning_rate": 0.0007963378176382661, + "loss": 3.7543, + "step": 23990 + }, + { + "epoch": 1.630316619105857, + "grad_norm": 1.0919785499572754, + "learning_rate": 0.0007962953526294333, + "loss": 3.6354, + "step": 23995 + }, + { + "epoch": 1.6306563391765185, + "grad_norm": 0.7602235078811646, + "learning_rate": 0.0007962528876206006, + "loss": 3.5899, + "step": 24000 + }, + { + "epoch": 1.6309960592471802, + "grad_norm": 1.098949909210205, + "learning_rate": 0.000796210422611768, + "loss": 3.4882, + "step": 24005 + }, + { + "epoch": 1.6313357793178422, + "grad_norm": 0.9173164367675781, + "learning_rate": 0.0007961679576029352, + "loss": 3.4738, + "step": 24010 + }, + { + "epoch": 1.6316754993885039, + "grad_norm": 0.6776086091995239, + "learning_rate": 0.0007961254925941025, + "loss": 3.4375, + "step": 24015 + }, + { + "epoch": 1.6320152194591655, + "grad_norm": 0.8130583763122559, + "learning_rate": 0.0007960830275852697, + "loss": 3.7129, + "step": 24020 + }, + { + "epoch": 1.6323549395298276, + "grad_norm": 0.9874017834663391, + "learning_rate": 0.000796040562576437, + "loss": 3.6649, + "step": 24025 + }, + { + "epoch": 1.6326946596004892, + "grad_norm": 0.722480058670044, + "learning_rate": 0.0007959980975676043, + "loss": 3.7296, + "step": 24030 + }, + { + "epoch": 1.6330343796711508, + "grad_norm": 0.7024909853935242, + "learning_rate": 0.0007959556325587715, + "loss": 3.6995, + "step": 24035 + }, + { + "epoch": 1.633374099741813, + "grad_norm": 0.8614689111709595, + "learning_rate": 0.000795913167549939, + "loss": 3.4956, + "step": 24040 + }, + { + "epoch": 1.6337138198124745, + "grad_norm": 0.7457025051116943, + "learning_rate": 0.0007958707025411062, + "loss": 3.565, + "step": 24045 + }, + { + "epoch": 1.6340535398831362, + "grad_norm": 0.9903243780136108, + "learning_rate": 0.0007958282375322734, + "loss": 3.4228, + "step": 24050 + }, + { + "epoch": 1.634393259953798, + "grad_norm": 0.8961648941040039, + "learning_rate": 0.0007957857725234408, + "loss": 3.1979, + "step": 24055 + }, + { + "epoch": 1.6347329800244599, + "grad_norm": 0.7739893794059753, + "learning_rate": 0.000795743307514608, + "loss": 3.6768, + "step": 24060 + }, + { + "epoch": 1.6350727000951215, + "grad_norm": 0.8406683802604675, + "learning_rate": 0.0007957008425057752, + "loss": 3.8044, + "step": 24065 + }, + { + "epoch": 1.6354124201657834, + "grad_norm": 1.0473488569259644, + "learning_rate": 0.0007956583774969425, + "loss": 3.7505, + "step": 24070 + }, + { + "epoch": 1.6357521402364452, + "grad_norm": 0.6779006719589233, + "learning_rate": 0.0007956159124881099, + "loss": 3.6013, + "step": 24075 + }, + { + "epoch": 1.6360918603071068, + "grad_norm": 1.097986102104187, + "learning_rate": 0.0007955734474792771, + "loss": 3.8479, + "step": 24080 + }, + { + "epoch": 1.6364315803777687, + "grad_norm": 0.931155264377594, + "learning_rate": 0.0007955309824704444, + "loss": 4.0809, + "step": 24085 + }, + { + "epoch": 1.6367713004484306, + "grad_norm": 1.1740015745162964, + "learning_rate": 0.0007954885174616117, + "loss": 3.6316, + "step": 24090 + }, + { + "epoch": 1.6371110205190922, + "grad_norm": 0.8693308234214783, + "learning_rate": 0.0007954460524527789, + "loss": 3.5035, + "step": 24095 + }, + { + "epoch": 1.637450740589754, + "grad_norm": 0.9832381010055542, + "learning_rate": 0.0007954035874439462, + "loss": 3.6006, + "step": 24100 + }, + { + "epoch": 1.637790460660416, + "grad_norm": 0.9459017515182495, + "learning_rate": 0.0007953611224351134, + "loss": 3.5631, + "step": 24105 + }, + { + "epoch": 1.6381301807310775, + "grad_norm": 0.954075276851654, + "learning_rate": 0.0007953186574262808, + "loss": 3.4539, + "step": 24110 + }, + { + "epoch": 1.6384699008017394, + "grad_norm": 0.9896371960639954, + "learning_rate": 0.0007952761924174481, + "loss": 3.4923, + "step": 24115 + }, + { + "epoch": 1.6388096208724012, + "grad_norm": 0.9398961663246155, + "learning_rate": 0.0007952337274086153, + "loss": 3.5078, + "step": 24120 + }, + { + "epoch": 1.6391493409430629, + "grad_norm": 0.8104745149612427, + "learning_rate": 0.0007951912623997826, + "loss": 3.5612, + "step": 24125 + }, + { + "epoch": 1.6394890610137247, + "grad_norm": 0.9961585402488708, + "learning_rate": 0.0007951487973909499, + "loss": 3.6611, + "step": 24130 + }, + { + "epoch": 1.6398287810843866, + "grad_norm": 0.6860445737838745, + "learning_rate": 0.0007951063323821171, + "loss": 3.5328, + "step": 24135 + }, + { + "epoch": 1.6401685011550482, + "grad_norm": 0.8631895184516907, + "learning_rate": 0.0007950638673732843, + "loss": 3.7483, + "step": 24140 + }, + { + "epoch": 1.64050822122571, + "grad_norm": 1.0838048458099365, + "learning_rate": 0.0007950214023644518, + "loss": 3.5026, + "step": 24145 + }, + { + "epoch": 1.640847941296372, + "grad_norm": 0.9500845074653625, + "learning_rate": 0.000794978937355619, + "loss": 3.7714, + "step": 24150 + }, + { + "epoch": 1.6411876613670335, + "grad_norm": 0.8740270137786865, + "learning_rate": 0.0007949364723467862, + "loss": 3.4845, + "step": 24155 + }, + { + "epoch": 1.6415273814376954, + "grad_norm": 0.8705486059188843, + "learning_rate": 0.0007948940073379536, + "loss": 3.1519, + "step": 24160 + }, + { + "epoch": 1.6418671015083572, + "grad_norm": 0.8421065807342529, + "learning_rate": 0.0007948515423291208, + "loss": 3.7106, + "step": 24165 + }, + { + "epoch": 1.6422068215790189, + "grad_norm": 1.9846657514572144, + "learning_rate": 0.000794809077320288, + "loss": 3.2771, + "step": 24170 + }, + { + "epoch": 1.6425465416496805, + "grad_norm": 0.7649489641189575, + "learning_rate": 0.0007947666123114554, + "loss": 3.5723, + "step": 24175 + }, + { + "epoch": 1.6428862617203426, + "grad_norm": 0.930391788482666, + "learning_rate": 0.0007947241473026227, + "loss": 3.517, + "step": 24180 + }, + { + "epoch": 1.6432259817910042, + "grad_norm": 1.0508224964141846, + "learning_rate": 0.0007946816822937899, + "loss": 3.7029, + "step": 24185 + }, + { + "epoch": 1.6435657018616658, + "grad_norm": 0.821812093257904, + "learning_rate": 0.0007946392172849573, + "loss": 3.2592, + "step": 24190 + }, + { + "epoch": 1.643905421932328, + "grad_norm": 0.7933618426322937, + "learning_rate": 0.0007945967522761245, + "loss": 3.515, + "step": 24195 + }, + { + "epoch": 1.6442451420029895, + "grad_norm": 0.9353975653648376, + "learning_rate": 0.0007945542872672917, + "loss": 3.8655, + "step": 24200 + }, + { + "epoch": 1.6445848620736512, + "grad_norm": 0.8218021392822266, + "learning_rate": 0.000794511822258459, + "loss": 3.7475, + "step": 24205 + }, + { + "epoch": 1.6449245821443133, + "grad_norm": 0.695594847202301, + "learning_rate": 0.0007944693572496263, + "loss": 3.4363, + "step": 24210 + }, + { + "epoch": 1.6452643022149749, + "grad_norm": 0.9117914438247681, + "learning_rate": 0.0007944268922407936, + "loss": 3.5714, + "step": 24215 + }, + { + "epoch": 1.6456040222856365, + "grad_norm": 0.8661485910415649, + "learning_rate": 0.0007943844272319609, + "loss": 3.5946, + "step": 24220 + }, + { + "epoch": 1.6459437423562984, + "grad_norm": 0.718992292881012, + "learning_rate": 0.0007943419622231282, + "loss": 3.6146, + "step": 24225 + }, + { + "epoch": 1.6462834624269602, + "grad_norm": 1.117526888847351, + "learning_rate": 0.0007942994972142954, + "loss": 3.6641, + "step": 24230 + }, + { + "epoch": 1.6466231824976219, + "grad_norm": 0.8679843544960022, + "learning_rate": 0.0007942570322054627, + "loss": 3.5206, + "step": 24235 + }, + { + "epoch": 1.6469629025682837, + "grad_norm": 0.798078179359436, + "learning_rate": 0.00079421456719663, + "loss": 3.6823, + "step": 24240 + }, + { + "epoch": 1.6473026226389456, + "grad_norm": 0.7117829918861389, + "learning_rate": 0.0007941721021877972, + "loss": 3.4638, + "step": 24245 + }, + { + "epoch": 1.6476423427096072, + "grad_norm": 0.6869921684265137, + "learning_rate": 0.0007941296371789646, + "loss": 3.6778, + "step": 24250 + }, + { + "epoch": 1.647982062780269, + "grad_norm": 0.9473781585693359, + "learning_rate": 0.0007940871721701318, + "loss": 3.4986, + "step": 24255 + }, + { + "epoch": 1.648321782850931, + "grad_norm": 0.860776960849762, + "learning_rate": 0.0007940447071612991, + "loss": 3.6827, + "step": 24260 + }, + { + "epoch": 1.6486615029215925, + "grad_norm": 0.8023551106452942, + "learning_rate": 0.0007940022421524664, + "loss": 3.677, + "step": 24265 + }, + { + "epoch": 1.6490012229922544, + "grad_norm": 1.413991928100586, + "learning_rate": 0.0007939597771436336, + "loss": 3.5478, + "step": 24270 + }, + { + "epoch": 1.6493409430629162, + "grad_norm": 0.6877135634422302, + "learning_rate": 0.0007939173121348009, + "loss": 3.4701, + "step": 24275 + }, + { + "epoch": 1.6496806631335779, + "grad_norm": 0.9483077526092529, + "learning_rate": 0.0007938748471259683, + "loss": 3.849, + "step": 24280 + }, + { + "epoch": 1.6500203832042397, + "grad_norm": 0.7941563129425049, + "learning_rate": 0.0007938323821171355, + "loss": 3.4172, + "step": 24285 + }, + { + "epoch": 1.6503601032749016, + "grad_norm": 0.9562891125679016, + "learning_rate": 0.0007937899171083028, + "loss": 3.4902, + "step": 24290 + }, + { + "epoch": 1.6506998233455632, + "grad_norm": 0.8288894295692444, + "learning_rate": 0.0007937474520994701, + "loss": 3.786, + "step": 24295 + }, + { + "epoch": 1.651039543416225, + "grad_norm": 0.7573819160461426, + "learning_rate": 0.0007937049870906373, + "loss": 3.4248, + "step": 24300 + }, + { + "epoch": 1.651379263486887, + "grad_norm": 0.7444324493408203, + "learning_rate": 0.0007936625220818045, + "loss": 3.5537, + "step": 24305 + }, + { + "epoch": 1.6517189835575485, + "grad_norm": 1.0860012769699097, + "learning_rate": 0.0007936200570729719, + "loss": 3.6593, + "step": 24310 + }, + { + "epoch": 1.6520587036282104, + "grad_norm": 0.7766421437263489, + "learning_rate": 0.0007935775920641392, + "loss": 3.67, + "step": 24315 + }, + { + "epoch": 1.6523984236988722, + "grad_norm": 0.980828046798706, + "learning_rate": 0.0007935351270553064, + "loss": 3.4714, + "step": 24320 + }, + { + "epoch": 1.6527381437695339, + "grad_norm": 0.7196915745735168, + "learning_rate": 0.0007934926620464738, + "loss": 3.5739, + "step": 24325 + }, + { + "epoch": 1.6530778638401957, + "grad_norm": 1.0210065841674805, + "learning_rate": 0.000793450197037641, + "loss": 3.4709, + "step": 24330 + }, + { + "epoch": 1.6534175839108576, + "grad_norm": 0.9503947496414185, + "learning_rate": 0.0007934077320288082, + "loss": 3.5741, + "step": 24335 + }, + { + "epoch": 1.6537573039815192, + "grad_norm": 0.7662699818611145, + "learning_rate": 0.0007933652670199756, + "loss": 3.3959, + "step": 24340 + }, + { + "epoch": 1.6540970240521808, + "grad_norm": 0.7288244962692261, + "learning_rate": 0.0007933228020111428, + "loss": 3.5575, + "step": 24345 + }, + { + "epoch": 1.654436744122843, + "grad_norm": 0.9482656121253967, + "learning_rate": 0.0007932803370023101, + "loss": 3.6846, + "step": 24350 + }, + { + "epoch": 1.6547764641935045, + "grad_norm": 0.8246079087257385, + "learning_rate": 0.0007932378719934774, + "loss": 3.5936, + "step": 24355 + }, + { + "epoch": 1.6551161842641662, + "grad_norm": 0.8246719837188721, + "learning_rate": 0.0007931954069846447, + "loss": 3.6984, + "step": 24360 + }, + { + "epoch": 1.6554559043348283, + "grad_norm": 1.071637749671936, + "learning_rate": 0.0007931529419758119, + "loss": 3.5927, + "step": 24365 + }, + { + "epoch": 1.6557956244054899, + "grad_norm": 0.8073283433914185, + "learning_rate": 0.0007931104769669792, + "loss": 3.6328, + "step": 24370 + }, + { + "epoch": 1.6561353444761515, + "grad_norm": 2.0958986282348633, + "learning_rate": 0.0007930680119581465, + "loss": 3.5423, + "step": 24375 + }, + { + "epoch": 1.6564750645468136, + "grad_norm": 0.74517822265625, + "learning_rate": 0.0007930255469493138, + "loss": 3.4128, + "step": 24380 + }, + { + "epoch": 1.6568147846174752, + "grad_norm": 0.8314504027366638, + "learning_rate": 0.0007929830819404811, + "loss": 3.8161, + "step": 24385 + }, + { + "epoch": 1.6571545046881369, + "grad_norm": 0.8436396718025208, + "learning_rate": 0.0007929406169316484, + "loss": 3.5405, + "step": 24390 + }, + { + "epoch": 1.6574942247587987, + "grad_norm": 2.795474052429199, + "learning_rate": 0.0007928981519228157, + "loss": 3.7143, + "step": 24395 + }, + { + "epoch": 1.6578339448294606, + "grad_norm": 0.8774099349975586, + "learning_rate": 0.0007928556869139829, + "loss": 3.7409, + "step": 24400 + }, + { + "epoch": 1.6581736649001222, + "grad_norm": 0.9340038299560547, + "learning_rate": 0.0007928132219051501, + "loss": 3.4946, + "step": 24405 + }, + { + "epoch": 1.658513384970784, + "grad_norm": 0.9519670605659485, + "learning_rate": 0.0007927707568963175, + "loss": 3.6401, + "step": 24410 + }, + { + "epoch": 1.658853105041446, + "grad_norm": 2.1633570194244385, + "learning_rate": 0.0007927282918874847, + "loss": 3.5612, + "step": 24415 + }, + { + "epoch": 1.6591928251121075, + "grad_norm": 0.8679820895195007, + "learning_rate": 0.000792685826878652, + "loss": 3.5901, + "step": 24420 + }, + { + "epoch": 1.6595325451827694, + "grad_norm": 0.8289210200309753, + "learning_rate": 0.0007926433618698194, + "loss": 3.6593, + "step": 24425 + }, + { + "epoch": 1.6598722652534312, + "grad_norm": 0.8263924717903137, + "learning_rate": 0.0007926008968609866, + "loss": 3.7387, + "step": 24430 + }, + { + "epoch": 1.6602119853240929, + "grad_norm": 0.8254480957984924, + "learning_rate": 0.0007925584318521538, + "loss": 3.5931, + "step": 24435 + }, + { + "epoch": 1.6605517053947547, + "grad_norm": 1.3971071243286133, + "learning_rate": 0.0007925159668433212, + "loss": 3.6164, + "step": 24440 + }, + { + "epoch": 1.6608914254654166, + "grad_norm": 0.7676345705986023, + "learning_rate": 0.0007924735018344884, + "loss": 3.59, + "step": 24445 + }, + { + "epoch": 1.6612311455360782, + "grad_norm": 0.772107720375061, + "learning_rate": 0.0007924310368256556, + "loss": 3.7587, + "step": 24450 + }, + { + "epoch": 1.66157086560674, + "grad_norm": 0.9246844053268433, + "learning_rate": 0.000792388571816823, + "loss": 3.5499, + "step": 24455 + }, + { + "epoch": 1.661910585677402, + "grad_norm": 0.7375372052192688, + "learning_rate": 0.0007923461068079903, + "loss": 3.7428, + "step": 24460 + }, + { + "epoch": 1.6622503057480635, + "grad_norm": 1.0936533212661743, + "learning_rate": 0.0007923036417991575, + "loss": 3.6222, + "step": 24465 + }, + { + "epoch": 1.6625900258187254, + "grad_norm": 1.044481635093689, + "learning_rate": 0.0007922611767903248, + "loss": 3.7682, + "step": 24470 + }, + { + "epoch": 1.6629297458893872, + "grad_norm": 0.9488760232925415, + "learning_rate": 0.0007922187117814921, + "loss": 3.7288, + "step": 24475 + }, + { + "epoch": 1.6632694659600489, + "grad_norm": 0.7497943639755249, + "learning_rate": 0.0007921762467726593, + "loss": 3.8212, + "step": 24480 + }, + { + "epoch": 1.6636091860307107, + "grad_norm": 1.080062747001648, + "learning_rate": 0.0007921337817638266, + "loss": 3.4629, + "step": 24485 + }, + { + "epoch": 1.6639489061013726, + "grad_norm": 0.8742756843566895, + "learning_rate": 0.000792091316754994, + "loss": 3.5623, + "step": 24490 + }, + { + "epoch": 1.6642886261720342, + "grad_norm": 1.0671976804733276, + "learning_rate": 0.0007920488517461612, + "loss": 3.5439, + "step": 24495 + }, + { + "epoch": 1.664628346242696, + "grad_norm": 0.7957873940467834, + "learning_rate": 0.0007920063867373285, + "loss": 3.8252, + "step": 24500 + }, + { + "epoch": 1.664968066313358, + "grad_norm": 0.8818623423576355, + "learning_rate": 0.0007919639217284957, + "loss": 3.5372, + "step": 24505 + }, + { + "epoch": 1.6653077863840196, + "grad_norm": 0.7169892191886902, + "learning_rate": 0.000791921456719663, + "loss": 3.5187, + "step": 24510 + }, + { + "epoch": 1.6656475064546812, + "grad_norm": 0.8471723198890686, + "learning_rate": 0.0007918789917108303, + "loss": 3.4304, + "step": 24515 + }, + { + "epoch": 1.6659872265253433, + "grad_norm": 0.9031539559364319, + "learning_rate": 0.0007918365267019975, + "loss": 3.4574, + "step": 24520 + }, + { + "epoch": 1.6663269465960049, + "grad_norm": 0.6893659234046936, + "learning_rate": 0.0007917940616931649, + "loss": 3.7121, + "step": 24525 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7697693705558777, + "learning_rate": 0.0007917515966843322, + "loss": 3.7147, + "step": 24530 + }, + { + "epoch": 1.6670063867373286, + "grad_norm": 1.362442970275879, + "learning_rate": 0.0007917091316754994, + "loss": 3.7798, + "step": 24535 + }, + { + "epoch": 1.6673461068079902, + "grad_norm": 0.8923950791358948, + "learning_rate": 0.0007916666666666666, + "loss": 3.5966, + "step": 24540 + }, + { + "epoch": 1.6676858268786519, + "grad_norm": 0.8707148432731628, + "learning_rate": 0.000791624201657834, + "loss": 3.6689, + "step": 24545 + }, + { + "epoch": 1.668025546949314, + "grad_norm": 1.0183985233306885, + "learning_rate": 0.0007915817366490012, + "loss": 3.5871, + "step": 24550 + }, + { + "epoch": 1.6683652670199756, + "grad_norm": 0.8272672295570374, + "learning_rate": 0.0007915392716401684, + "loss": 3.875, + "step": 24555 + }, + { + "epoch": 1.6687049870906372, + "grad_norm": 0.7532034516334534, + "learning_rate": 0.0007914968066313359, + "loss": 3.5997, + "step": 24560 + }, + { + "epoch": 1.669044707161299, + "grad_norm": 0.6456456780433655, + "learning_rate": 0.0007914543416225031, + "loss": 3.7126, + "step": 24565 + }, + { + "epoch": 1.669384427231961, + "grad_norm": 0.7527528405189514, + "learning_rate": 0.0007914118766136703, + "loss": 3.8208, + "step": 24570 + }, + { + "epoch": 1.6697241473026225, + "grad_norm": 0.8443238735198975, + "learning_rate": 0.0007913694116048377, + "loss": 3.5148, + "step": 24575 + }, + { + "epoch": 1.6700638673732844, + "grad_norm": 0.9762957692146301, + "learning_rate": 0.0007913269465960049, + "loss": 3.6362, + "step": 24580 + }, + { + "epoch": 1.6704035874439462, + "grad_norm": 0.9310317635536194, + "learning_rate": 0.0007912844815871721, + "loss": 3.6042, + "step": 24585 + }, + { + "epoch": 1.6707433075146079, + "grad_norm": 0.8808067440986633, + "learning_rate": 0.0007912420165783394, + "loss": 3.4182, + "step": 24590 + }, + { + "epoch": 1.6710830275852697, + "grad_norm": 0.7860133647918701, + "learning_rate": 0.0007911995515695068, + "loss": 3.6345, + "step": 24595 + }, + { + "epoch": 1.6714227476559316, + "grad_norm": 0.9115269184112549, + "learning_rate": 0.000791157086560674, + "loss": 3.4241, + "step": 24600 + }, + { + "epoch": 1.6717624677265932, + "grad_norm": 0.8529918193817139, + "learning_rate": 0.0007911146215518413, + "loss": 3.5589, + "step": 24605 + }, + { + "epoch": 1.672102187797255, + "grad_norm": 0.9823395013809204, + "learning_rate": 0.0007910721565430086, + "loss": 3.7929, + "step": 24610 + }, + { + "epoch": 1.672441907867917, + "grad_norm": 0.9951348900794983, + "learning_rate": 0.0007910296915341758, + "loss": 3.7778, + "step": 24615 + }, + { + "epoch": 1.6727816279385785, + "grad_norm": 9.480561256408691, + "learning_rate": 0.0007909872265253431, + "loss": 3.4182, + "step": 24620 + }, + { + "epoch": 1.6731213480092404, + "grad_norm": 0.8569163680076599, + "learning_rate": 0.0007909447615165104, + "loss": 3.6634, + "step": 24625 + }, + { + "epoch": 1.6734610680799022, + "grad_norm": 0.8005229234695435, + "learning_rate": 0.0007909022965076777, + "loss": 3.7271, + "step": 24630 + }, + { + "epoch": 1.6738007881505639, + "grad_norm": 0.7411558628082275, + "learning_rate": 0.000790859831498845, + "loss": 3.6637, + "step": 24635 + }, + { + "epoch": 1.6741405082212257, + "grad_norm": 0.7973654270172119, + "learning_rate": 0.0007908173664900122, + "loss": 3.6053, + "step": 24640 + }, + { + "epoch": 1.6744802282918876, + "grad_norm": 0.7626303434371948, + "learning_rate": 0.0007907749014811795, + "loss": 3.6244, + "step": 24645 + }, + { + "epoch": 1.6748199483625492, + "grad_norm": 0.742506742477417, + "learning_rate": 0.0007907324364723468, + "loss": 3.446, + "step": 24650 + }, + { + "epoch": 1.675159668433211, + "grad_norm": 0.8420978784561157, + "learning_rate": 0.000790689971463514, + "loss": 3.711, + "step": 24655 + }, + { + "epoch": 1.675499388503873, + "grad_norm": 0.8207518458366394, + "learning_rate": 0.0007906475064546813, + "loss": 3.4827, + "step": 24660 + }, + { + "epoch": 1.6758391085745346, + "grad_norm": 0.877315878868103, + "learning_rate": 0.0007906050414458487, + "loss": 3.6606, + "step": 24665 + }, + { + "epoch": 1.6761788286451964, + "grad_norm": 1.3421813249588013, + "learning_rate": 0.0007905625764370159, + "loss": 3.4863, + "step": 24670 + }, + { + "epoch": 1.6765185487158583, + "grad_norm": 0.8405719995498657, + "learning_rate": 0.0007905201114281832, + "loss": 3.7073, + "step": 24675 + }, + { + "epoch": 1.67685826878652, + "grad_norm": 0.6932323575019836, + "learning_rate": 0.0007904776464193505, + "loss": 3.4323, + "step": 24680 + }, + { + "epoch": 1.6771979888571815, + "grad_norm": 0.9573431611061096, + "learning_rate": 0.0007904351814105177, + "loss": 3.8728, + "step": 24685 + }, + { + "epoch": 1.6775377089278436, + "grad_norm": 0.8480305671691895, + "learning_rate": 0.0007903927164016849, + "loss": 3.6548, + "step": 24690 + }, + { + "epoch": 1.6778774289985052, + "grad_norm": 0.8857832551002502, + "learning_rate": 0.0007903502513928523, + "loss": 3.7044, + "step": 24695 + }, + { + "epoch": 1.6782171490691669, + "grad_norm": 0.7653765082359314, + "learning_rate": 0.0007903077863840196, + "loss": 3.6139, + "step": 24700 + }, + { + "epoch": 1.678556869139829, + "grad_norm": 0.8329910635948181, + "learning_rate": 0.0007902653213751868, + "loss": 3.6165, + "step": 24705 + }, + { + "epoch": 1.6788965892104906, + "grad_norm": 1.148681879043579, + "learning_rate": 0.0007902228563663542, + "loss": 3.621, + "step": 24710 + }, + { + "epoch": 1.6792363092811522, + "grad_norm": 0.7563319206237793, + "learning_rate": 0.0007901803913575214, + "loss": 3.8406, + "step": 24715 + }, + { + "epoch": 1.6795760293518143, + "grad_norm": 0.8426403999328613, + "learning_rate": 0.0007901379263486887, + "loss": 3.618, + "step": 24720 + }, + { + "epoch": 1.679915749422476, + "grad_norm": 0.7805582284927368, + "learning_rate": 0.000790095461339856, + "loss": 3.3991, + "step": 24725 + }, + { + "epoch": 1.6802554694931375, + "grad_norm": 1.6643744707107544, + "learning_rate": 0.0007900529963310232, + "loss": 3.6525, + "step": 24730 + }, + { + "epoch": 1.6805951895637994, + "grad_norm": 1.0691514015197754, + "learning_rate": 0.0007900105313221906, + "loss": 3.5633, + "step": 24735 + }, + { + "epoch": 1.6809349096344612, + "grad_norm": 0.7392637133598328, + "learning_rate": 0.0007899680663133579, + "loss": 3.5587, + "step": 24740 + }, + { + "epoch": 1.6812746297051229, + "grad_norm": 0.8804525136947632, + "learning_rate": 0.0007899256013045251, + "loss": 3.8447, + "step": 24745 + }, + { + "epoch": 1.6816143497757847, + "grad_norm": 0.9111513495445251, + "learning_rate": 0.0007898831362956924, + "loss": 3.6158, + "step": 24750 + }, + { + "epoch": 1.6819540698464466, + "grad_norm": 0.8889359831809998, + "learning_rate": 0.0007898406712868596, + "loss": 3.5904, + "step": 24755 + }, + { + "epoch": 1.6822937899171082, + "grad_norm": 1.1167829036712646, + "learning_rate": 0.0007897982062780269, + "loss": 3.4369, + "step": 24760 + }, + { + "epoch": 1.68263350998777, + "grad_norm": 0.8743190765380859, + "learning_rate": 0.0007897557412691942, + "loss": 3.1039, + "step": 24765 + }, + { + "epoch": 1.682973230058432, + "grad_norm": 0.845485508441925, + "learning_rate": 0.0007897132762603615, + "loss": 3.4752, + "step": 24770 + }, + { + "epoch": 1.6833129501290935, + "grad_norm": 0.8958326578140259, + "learning_rate": 0.0007896708112515288, + "loss": 3.6334, + "step": 24775 + }, + { + "epoch": 1.6836526701997554, + "grad_norm": 0.9082204699516296, + "learning_rate": 0.0007896283462426961, + "loss": 3.5024, + "step": 24780 + }, + { + "epoch": 1.6839923902704172, + "grad_norm": 0.950920820236206, + "learning_rate": 0.0007895858812338633, + "loss": 3.6079, + "step": 24785 + }, + { + "epoch": 1.6843321103410789, + "grad_norm": 0.7171888947486877, + "learning_rate": 0.0007895434162250305, + "loss": 3.4144, + "step": 24790 + }, + { + "epoch": 1.6846718304117407, + "grad_norm": 0.7586900591850281, + "learning_rate": 0.0007895009512161979, + "loss": 3.6151, + "step": 24795 + }, + { + "epoch": 1.6850115504824026, + "grad_norm": 0.7525753974914551, + "learning_rate": 0.0007894584862073651, + "loss": 3.4113, + "step": 24800 + }, + { + "epoch": 1.6853512705530642, + "grad_norm": 1.2170484066009521, + "learning_rate": 0.0007894160211985324, + "loss": 3.3417, + "step": 24805 + }, + { + "epoch": 1.685690990623726, + "grad_norm": 1.3474218845367432, + "learning_rate": 0.0007893735561896998, + "loss": 3.1772, + "step": 24810 + }, + { + "epoch": 1.686030710694388, + "grad_norm": 0.7971686124801636, + "learning_rate": 0.000789331091180867, + "loss": 3.5861, + "step": 24815 + }, + { + "epoch": 1.6863704307650496, + "grad_norm": 0.8797256946563721, + "learning_rate": 0.0007892886261720342, + "loss": 3.5194, + "step": 24820 + }, + { + "epoch": 1.6867101508357114, + "grad_norm": 0.9257854223251343, + "learning_rate": 0.0007892461611632016, + "loss": 3.601, + "step": 24825 + }, + { + "epoch": 1.6870498709063733, + "grad_norm": 0.832221269607544, + "learning_rate": 0.0007892036961543688, + "loss": 3.7941, + "step": 24830 + }, + { + "epoch": 1.687389590977035, + "grad_norm": 0.755641758441925, + "learning_rate": 0.000789161231145536, + "loss": 3.5499, + "step": 24835 + }, + { + "epoch": 1.6877293110476967, + "grad_norm": 0.738133430480957, + "learning_rate": 0.0007891187661367035, + "loss": 3.6842, + "step": 24840 + }, + { + "epoch": 1.6880690311183586, + "grad_norm": 0.8540025353431702, + "learning_rate": 0.0007890763011278707, + "loss": 3.6748, + "step": 24845 + }, + { + "epoch": 1.6884087511890202, + "grad_norm": 0.9164283871650696, + "learning_rate": 0.0007890338361190379, + "loss": 3.4512, + "step": 24850 + }, + { + "epoch": 1.6887484712596819, + "grad_norm": 0.7121273875236511, + "learning_rate": 0.0007889913711102052, + "loss": 3.4543, + "step": 24855 + }, + { + "epoch": 1.689088191330344, + "grad_norm": 0.8869855999946594, + "learning_rate": 0.0007889489061013725, + "loss": 3.7712, + "step": 24860 + }, + { + "epoch": 1.6894279114010056, + "grad_norm": 0.8272392749786377, + "learning_rate": 0.0007889064410925397, + "loss": 3.4332, + "step": 24865 + }, + { + "epoch": 1.6897676314716672, + "grad_norm": 7.3737006187438965, + "learning_rate": 0.0007888639760837071, + "loss": 3.4138, + "step": 24870 + }, + { + "epoch": 1.6901073515423293, + "grad_norm": 0.645222544670105, + "learning_rate": 0.0007888215110748744, + "loss": 3.6457, + "step": 24875 + }, + { + "epoch": 1.690447071612991, + "grad_norm": 0.6604169011116028, + "learning_rate": 0.0007887790460660416, + "loss": 3.4253, + "step": 24880 + }, + { + "epoch": 1.6907867916836525, + "grad_norm": 0.7343199849128723, + "learning_rate": 0.0007887365810572089, + "loss": 3.3394, + "step": 24885 + }, + { + "epoch": 1.6911265117543146, + "grad_norm": 0.9501261711120605, + "learning_rate": 0.0007886941160483761, + "loss": 3.6108, + "step": 24890 + }, + { + "epoch": 1.6914662318249762, + "grad_norm": 1.3082340955734253, + "learning_rate": 0.0007886516510395434, + "loss": 3.6318, + "step": 24895 + }, + { + "epoch": 1.6918059518956379, + "grad_norm": 0.7600964903831482, + "learning_rate": 0.0007886091860307107, + "loss": 3.3846, + "step": 24900 + }, + { + "epoch": 1.6921456719662997, + "grad_norm": 0.8528017401695251, + "learning_rate": 0.000788566721021878, + "loss": 3.436, + "step": 24905 + }, + { + "epoch": 1.6924853920369616, + "grad_norm": 0.7378764152526855, + "learning_rate": 0.0007885242560130453, + "loss": 3.7147, + "step": 24910 + }, + { + "epoch": 1.6928251121076232, + "grad_norm": 0.9914575815200806, + "learning_rate": 0.0007884817910042126, + "loss": 3.5237, + "step": 24915 + }, + { + "epoch": 1.693164832178285, + "grad_norm": 0.882283091545105, + "learning_rate": 0.0007884393259953798, + "loss": 3.5879, + "step": 24920 + }, + { + "epoch": 1.693504552248947, + "grad_norm": 0.8320963382720947, + "learning_rate": 0.000788396860986547, + "loss": 3.9506, + "step": 24925 + }, + { + "epoch": 1.6938442723196085, + "grad_norm": 0.8657430410385132, + "learning_rate": 0.0007883543959777144, + "loss": 3.1992, + "step": 24930 + }, + { + "epoch": 1.6941839923902704, + "grad_norm": 0.7681517004966736, + "learning_rate": 0.0007883119309688816, + "loss": 3.895, + "step": 24935 + }, + { + "epoch": 1.6945237124609323, + "grad_norm": 0.8359804153442383, + "learning_rate": 0.0007882694659600489, + "loss": 3.4467, + "step": 24940 + }, + { + "epoch": 1.6948634325315939, + "grad_norm": 0.7948193550109863, + "learning_rate": 0.0007882270009512163, + "loss": 3.675, + "step": 24945 + }, + { + "epoch": 1.6952031526022557, + "grad_norm": 1.3070861101150513, + "learning_rate": 0.0007881845359423835, + "loss": 3.7222, + "step": 24950 + }, + { + "epoch": 1.6955428726729176, + "grad_norm": 0.8336454629898071, + "learning_rate": 0.0007881420709335507, + "loss": 3.6566, + "step": 24955 + }, + { + "epoch": 1.6958825927435792, + "grad_norm": 0.8402694463729858, + "learning_rate": 0.0007880996059247181, + "loss": 3.3293, + "step": 24960 + }, + { + "epoch": 1.696222312814241, + "grad_norm": 0.9287682771682739, + "learning_rate": 0.0007880571409158853, + "loss": 3.5175, + "step": 24965 + }, + { + "epoch": 1.696562032884903, + "grad_norm": 0.7435588836669922, + "learning_rate": 0.0007880146759070525, + "loss": 3.7402, + "step": 24970 + }, + { + "epoch": 1.6969017529555646, + "grad_norm": 0.7393031120300293, + "learning_rate": 0.00078797221089822, + "loss": 3.6228, + "step": 24975 + }, + { + "epoch": 1.6972414730262264, + "grad_norm": 1.0056402683258057, + "learning_rate": 0.0007879297458893872, + "loss": 3.5249, + "step": 24980 + }, + { + "epoch": 1.6975811930968883, + "grad_norm": 0.9495598673820496, + "learning_rate": 0.0007878872808805544, + "loss": 3.32, + "step": 24985 + }, + { + "epoch": 1.69792091316755, + "grad_norm": 1.0601468086242676, + "learning_rate": 0.0007878448158717217, + "loss": 3.5487, + "step": 24990 + }, + { + "epoch": 1.6982606332382117, + "grad_norm": 0.9791831374168396, + "learning_rate": 0.000787802350862889, + "loss": 3.7158, + "step": 24995 + }, + { + "epoch": 1.6986003533088736, + "grad_norm": 0.9425805807113647, + "learning_rate": 0.0007877598858540562, + "loss": 3.7156, + "step": 25000 + }, + { + "epoch": 1.6989400733795352, + "grad_norm": 0.7922189831733704, + "learning_rate": 0.0007877174208452235, + "loss": 3.5581, + "step": 25005 + }, + { + "epoch": 1.699279793450197, + "grad_norm": 0.9855362176895142, + "learning_rate": 0.0007876749558363909, + "loss": 3.7814, + "step": 25010 + }, + { + "epoch": 1.699619513520859, + "grad_norm": 0.9574316143989563, + "learning_rate": 0.0007876324908275581, + "loss": 3.4324, + "step": 25015 + }, + { + "epoch": 1.6999592335915206, + "grad_norm": 0.992119312286377, + "learning_rate": 0.0007875900258187254, + "loss": 3.7321, + "step": 25020 + }, + { + "epoch": 1.7002989536621822, + "grad_norm": 0.9182361364364624, + "learning_rate": 0.0007875475608098927, + "loss": 3.6385, + "step": 25025 + }, + { + "epoch": 1.7006386737328443, + "grad_norm": 0.8356109261512756, + "learning_rate": 0.0007875050958010599, + "loss": 3.6105, + "step": 25030 + }, + { + "epoch": 1.700978393803506, + "grad_norm": 0.7890655398368835, + "learning_rate": 0.0007874626307922272, + "loss": 3.2611, + "step": 25035 + }, + { + "epoch": 1.7013181138741675, + "grad_norm": 0.8050886988639832, + "learning_rate": 0.0007874201657833944, + "loss": 3.7093, + "step": 25040 + }, + { + "epoch": 1.7016578339448296, + "grad_norm": 0.8968763947486877, + "learning_rate": 0.0007873777007745618, + "loss": 3.6184, + "step": 25045 + }, + { + "epoch": 1.7019975540154912, + "grad_norm": 0.9817919731140137, + "learning_rate": 0.0007873352357657291, + "loss": 3.6193, + "step": 25050 + }, + { + "epoch": 1.7023372740861529, + "grad_norm": 1.0319744348526, + "learning_rate": 0.0007872927707568963, + "loss": 3.5048, + "step": 25055 + }, + { + "epoch": 1.702676994156815, + "grad_norm": 0.8635240197181702, + "learning_rate": 0.0007872503057480637, + "loss": 3.5235, + "step": 25060 + }, + { + "epoch": 1.7030167142274766, + "grad_norm": 0.7271537184715271, + "learning_rate": 0.0007872078407392309, + "loss": 3.3458, + "step": 25065 + }, + { + "epoch": 1.7033564342981382, + "grad_norm": 0.7640396356582642, + "learning_rate": 0.0007871653757303981, + "loss": 3.6563, + "step": 25070 + }, + { + "epoch": 1.7036961543688, + "grad_norm": 0.8330565094947815, + "learning_rate": 0.0007871229107215655, + "loss": 3.3322, + "step": 25075 + }, + { + "epoch": 1.704035874439462, + "grad_norm": 0.7466673851013184, + "learning_rate": 0.0007870804457127328, + "loss": 3.4793, + "step": 25080 + }, + { + "epoch": 1.7043755945101235, + "grad_norm": 0.9030289053916931, + "learning_rate": 0.0007870379807039, + "loss": 3.7168, + "step": 25085 + }, + { + "epoch": 1.7047153145807854, + "grad_norm": 0.7079142928123474, + "learning_rate": 0.0007869955156950673, + "loss": 3.6878, + "step": 25090 + }, + { + "epoch": 1.7050550346514473, + "grad_norm": 1.0508791208267212, + "learning_rate": 0.0007869530506862346, + "loss": 3.7582, + "step": 25095 + }, + { + "epoch": 1.7053947547221089, + "grad_norm": 0.7063107490539551, + "learning_rate": 0.0007869105856774018, + "loss": 3.6378, + "step": 25100 + }, + { + "epoch": 1.7057344747927707, + "grad_norm": 0.865659773349762, + "learning_rate": 0.0007868681206685691, + "loss": 3.6271, + "step": 25105 + }, + { + "epoch": 1.7060741948634326, + "grad_norm": 0.6062456965446472, + "learning_rate": 0.0007868256556597364, + "loss": 3.6136, + "step": 25110 + }, + { + "epoch": 1.7064139149340942, + "grad_norm": 0.8144451379776001, + "learning_rate": 0.0007867831906509037, + "loss": 3.5236, + "step": 25115 + }, + { + "epoch": 1.706753635004756, + "grad_norm": 1.3249481916427612, + "learning_rate": 0.000786740725642071, + "loss": 3.4498, + "step": 25120 + }, + { + "epoch": 1.707093355075418, + "grad_norm": 1.2143559455871582, + "learning_rate": 0.0007866982606332383, + "loss": 3.5574, + "step": 25125 + }, + { + "epoch": 1.7074330751460796, + "grad_norm": 0.7378126978874207, + "learning_rate": 0.0007866557956244055, + "loss": 3.8802, + "step": 25130 + }, + { + "epoch": 1.7077727952167414, + "grad_norm": 0.7522438764572144, + "learning_rate": 0.0007866133306155728, + "loss": 3.4657, + "step": 25135 + }, + { + "epoch": 1.7081125152874033, + "grad_norm": 0.7902761101722717, + "learning_rate": 0.00078657086560674, + "loss": 3.7798, + "step": 25140 + }, + { + "epoch": 1.708452235358065, + "grad_norm": 1.4646514654159546, + "learning_rate": 0.0007865284005979073, + "loss": 3.6221, + "step": 25145 + }, + { + "epoch": 1.7087919554287267, + "grad_norm": 0.8539705276489258, + "learning_rate": 0.0007864859355890747, + "loss": 3.9526, + "step": 25150 + }, + { + "epoch": 1.7091316754993886, + "grad_norm": 1.5609830617904663, + "learning_rate": 0.0007864434705802419, + "loss": 3.7293, + "step": 25155 + }, + { + "epoch": 1.7094713955700502, + "grad_norm": 1.3252583742141724, + "learning_rate": 0.0007864010055714092, + "loss": 3.728, + "step": 25160 + }, + { + "epoch": 1.709811115640712, + "grad_norm": 0.7726114392280579, + "learning_rate": 0.0007863585405625765, + "loss": 3.5391, + "step": 25165 + }, + { + "epoch": 1.710150835711374, + "grad_norm": 0.8834228515625, + "learning_rate": 0.0007863160755537437, + "loss": 3.6561, + "step": 25170 + }, + { + "epoch": 1.7104905557820356, + "grad_norm": 1.044006109237671, + "learning_rate": 0.0007862736105449109, + "loss": 3.7361, + "step": 25175 + }, + { + "epoch": 1.7108302758526974, + "grad_norm": 0.909641444683075, + "learning_rate": 0.0007862311455360783, + "loss": 3.7335, + "step": 25180 + }, + { + "epoch": 1.7111699959233593, + "grad_norm": 0.8293392658233643, + "learning_rate": 0.0007861886805272456, + "loss": 3.7987, + "step": 25185 + }, + { + "epoch": 1.711509715994021, + "grad_norm": 1.1066696643829346, + "learning_rate": 0.0007861462155184128, + "loss": 3.5864, + "step": 25190 + }, + { + "epoch": 1.7118494360646825, + "grad_norm": 1.048907995223999, + "learning_rate": 0.0007861037505095802, + "loss": 3.7062, + "step": 25195 + }, + { + "epoch": 1.7121891561353446, + "grad_norm": 0.814866840839386, + "learning_rate": 0.0007860612855007474, + "loss": 3.5113, + "step": 25200 + }, + { + "epoch": 1.7125288762060062, + "grad_norm": 0.9189882278442383, + "learning_rate": 0.0007860188204919146, + "loss": 3.5028, + "step": 25205 + }, + { + "epoch": 1.7128685962766679, + "grad_norm": 0.9100513458251953, + "learning_rate": 0.000785976355483082, + "loss": 3.6334, + "step": 25210 + }, + { + "epoch": 1.71320831634733, + "grad_norm": 0.8607984185218811, + "learning_rate": 0.0007859338904742492, + "loss": 3.5303, + "step": 25215 + }, + { + "epoch": 1.7135480364179916, + "grad_norm": 0.7295019030570984, + "learning_rate": 0.0007858914254654165, + "loss": 3.641, + "step": 25220 + }, + { + "epoch": 1.7138877564886532, + "grad_norm": 0.9741166234016418, + "learning_rate": 0.0007858489604565839, + "loss": 3.6639, + "step": 25225 + }, + { + "epoch": 1.7142274765593153, + "grad_norm": 0.7915515899658203, + "learning_rate": 0.0007858064954477511, + "loss": 3.3309, + "step": 25230 + }, + { + "epoch": 1.714567196629977, + "grad_norm": 0.7592409253120422, + "learning_rate": 0.0007857640304389183, + "loss": 3.4913, + "step": 25235 + }, + { + "epoch": 1.7149069167006386, + "grad_norm": 1.2805774211883545, + "learning_rate": 0.0007857215654300856, + "loss": 4.0601, + "step": 25240 + }, + { + "epoch": 1.7152466367713004, + "grad_norm": 0.6981062293052673, + "learning_rate": 0.0007856791004212529, + "loss": 3.5537, + "step": 25245 + }, + { + "epoch": 1.7155863568419623, + "grad_norm": 0.8806710243225098, + "learning_rate": 0.0007856366354124201, + "loss": 3.6258, + "step": 25250 + }, + { + "epoch": 1.7159260769126239, + "grad_norm": 0.8031459450721741, + "learning_rate": 0.0007855941704035875, + "loss": 3.5059, + "step": 25255 + }, + { + "epoch": 1.7162657969832857, + "grad_norm": 0.8104864358901978, + "learning_rate": 0.0007855517053947548, + "loss": 3.4278, + "step": 25260 + }, + { + "epoch": 1.7166055170539476, + "grad_norm": 0.9036292433738708, + "learning_rate": 0.000785509240385922, + "loss": 3.7326, + "step": 25265 + }, + { + "epoch": 1.7169452371246092, + "grad_norm": 0.8150564432144165, + "learning_rate": 0.0007854667753770893, + "loss": 3.6372, + "step": 25270 + }, + { + "epoch": 1.717284957195271, + "grad_norm": 0.6849907040596008, + "learning_rate": 0.0007854243103682565, + "loss": 3.4876, + "step": 25275 + }, + { + "epoch": 1.717624677265933, + "grad_norm": 1.1569099426269531, + "learning_rate": 0.0007853818453594238, + "loss": 3.6788, + "step": 25280 + }, + { + "epoch": 1.7179643973365946, + "grad_norm": 0.7375945448875427, + "learning_rate": 0.0007853393803505911, + "loss": 3.732, + "step": 25285 + }, + { + "epoch": 1.7183041174072564, + "grad_norm": 0.777554988861084, + "learning_rate": 0.0007852969153417584, + "loss": 3.7477, + "step": 25290 + }, + { + "epoch": 1.7186438374779183, + "grad_norm": 1.0347059965133667, + "learning_rate": 0.0007852544503329257, + "loss": 3.3424, + "step": 25295 + }, + { + "epoch": 1.71898355754858, + "grad_norm": 0.6994560956954956, + "learning_rate": 0.000785211985324093, + "loss": 3.6161, + "step": 25300 + }, + { + "epoch": 1.7193232776192418, + "grad_norm": 1.7007733583450317, + "learning_rate": 0.0007851695203152602, + "loss": 3.423, + "step": 25305 + }, + { + "epoch": 1.7196629976899036, + "grad_norm": 0.9823334813117981, + "learning_rate": 0.0007851270553064275, + "loss": 3.7101, + "step": 25310 + }, + { + "epoch": 1.7200027177605652, + "grad_norm": 0.934550404548645, + "learning_rate": 0.0007850845902975948, + "loss": 3.6802, + "step": 25315 + }, + { + "epoch": 1.720342437831227, + "grad_norm": 0.8667023777961731, + "learning_rate": 0.000785042125288762, + "loss": 3.6356, + "step": 25320 + }, + { + "epoch": 1.720682157901889, + "grad_norm": 0.8304552435874939, + "learning_rate": 0.0007849996602799293, + "loss": 3.4965, + "step": 25325 + }, + { + "epoch": 1.7210218779725506, + "grad_norm": 2.14642333984375, + "learning_rate": 0.0007849571952710967, + "loss": 3.7746, + "step": 25330 + }, + { + "epoch": 1.7213615980432124, + "grad_norm": 1.6009124517440796, + "learning_rate": 0.0007849147302622639, + "loss": 3.4663, + "step": 25335 + }, + { + "epoch": 1.7217013181138743, + "grad_norm": 0.805641770362854, + "learning_rate": 0.0007848722652534311, + "loss": 3.6228, + "step": 25340 + }, + { + "epoch": 1.722041038184536, + "grad_norm": 1.4034204483032227, + "learning_rate": 0.0007848298002445985, + "loss": 3.4268, + "step": 25345 + }, + { + "epoch": 1.7223807582551978, + "grad_norm": 0.8345507383346558, + "learning_rate": 0.0007847873352357657, + "loss": 3.3646, + "step": 25350 + }, + { + "epoch": 1.7227204783258596, + "grad_norm": 0.965073823928833, + "learning_rate": 0.0007847533632286996, + "loss": 3.4949, + "step": 25355 + }, + { + "epoch": 1.7230601983965212, + "grad_norm": 0.7616696953773499, + "learning_rate": 0.0007847108982198669, + "loss": 3.4239, + "step": 25360 + }, + { + "epoch": 1.7233999184671829, + "grad_norm": 0.8556300401687622, + "learning_rate": 0.0007846684332110341, + "loss": 3.6012, + "step": 25365 + }, + { + "epoch": 1.723739638537845, + "grad_norm": 1.4296239614486694, + "learning_rate": 0.0007846259682022014, + "loss": 3.817, + "step": 25370 + }, + { + "epoch": 1.7240793586085066, + "grad_norm": 0.8410563468933105, + "learning_rate": 0.0007845835031933686, + "loss": 3.5267, + "step": 25375 + }, + { + "epoch": 1.7244190786791682, + "grad_norm": 0.9115620851516724, + "learning_rate": 0.000784541038184536, + "loss": 3.4979, + "step": 25380 + }, + { + "epoch": 1.7247587987498303, + "grad_norm": 0.9355559945106506, + "learning_rate": 0.0007844985731757033, + "loss": 3.7195, + "step": 25385 + }, + { + "epoch": 1.725098518820492, + "grad_norm": 0.8074443936347961, + "learning_rate": 0.0007844561081668705, + "loss": 3.7258, + "step": 25390 + }, + { + "epoch": 1.7254382388911536, + "grad_norm": 0.9238176941871643, + "learning_rate": 0.0007844136431580378, + "loss": 3.6386, + "step": 25395 + }, + { + "epoch": 1.7257779589618156, + "grad_norm": 0.8572179675102234, + "learning_rate": 0.0007843711781492051, + "loss": 3.4968, + "step": 25400 + }, + { + "epoch": 1.7261176790324773, + "grad_norm": 0.9015717506408691, + "learning_rate": 0.0007843287131403723, + "loss": 3.5931, + "step": 25405 + }, + { + "epoch": 1.726457399103139, + "grad_norm": 0.981796383857727, + "learning_rate": 0.0007842862481315395, + "loss": 3.6639, + "step": 25410 + }, + { + "epoch": 1.7267971191738007, + "grad_norm": 0.8444247841835022, + "learning_rate": 0.000784243783122707, + "loss": 3.7307, + "step": 25415 + }, + { + "epoch": 1.7271368392444626, + "grad_norm": 0.8982095718383789, + "learning_rate": 0.0007842013181138742, + "loss": 3.8644, + "step": 25420 + }, + { + "epoch": 1.7274765593151242, + "grad_norm": 1.0505677461624146, + "learning_rate": 0.0007841588531050414, + "loss": 3.6034, + "step": 25425 + }, + { + "epoch": 1.727816279385786, + "grad_norm": 1.0758347511291504, + "learning_rate": 0.0007841163880962088, + "loss": 3.48, + "step": 25430 + }, + { + "epoch": 1.728155999456448, + "grad_norm": 0.9618341326713562, + "learning_rate": 0.000784073923087376, + "loss": 3.6172, + "step": 25435 + }, + { + "epoch": 1.7284957195271096, + "grad_norm": 0.6849716901779175, + "learning_rate": 0.0007840314580785432, + "loss": 3.7121, + "step": 25440 + }, + { + "epoch": 1.7288354395977714, + "grad_norm": 0.8353209495544434, + "learning_rate": 0.0007839889930697106, + "loss": 3.4979, + "step": 25445 + }, + { + "epoch": 1.7291751596684333, + "grad_norm": 0.9652770161628723, + "learning_rate": 0.0007839465280608779, + "loss": 3.6021, + "step": 25450 + }, + { + "epoch": 1.729514879739095, + "grad_norm": 0.8948561549186707, + "learning_rate": 0.0007839040630520451, + "loss": 3.6817, + "step": 25455 + }, + { + "epoch": 1.7298545998097568, + "grad_norm": 0.7482254505157471, + "learning_rate": 0.0007838615980432125, + "loss": 3.8865, + "step": 25460 + }, + { + "epoch": 1.7301943198804186, + "grad_norm": 0.9581138491630554, + "learning_rate": 0.0007838191330343797, + "loss": 3.7175, + "step": 25465 + }, + { + "epoch": 1.7305340399510802, + "grad_norm": 1.0234006643295288, + "learning_rate": 0.0007837766680255469, + "loss": 3.4512, + "step": 25470 + }, + { + "epoch": 1.730873760021742, + "grad_norm": 0.8234611749649048, + "learning_rate": 0.0007837342030167142, + "loss": 3.5572, + "step": 25475 + }, + { + "epoch": 1.731213480092404, + "grad_norm": 0.8456802368164062, + "learning_rate": 0.0007836917380078815, + "loss": 3.456, + "step": 25480 + }, + { + "epoch": 1.7315532001630656, + "grad_norm": 0.9836328625679016, + "learning_rate": 0.0007836492729990488, + "loss": 3.5864, + "step": 25485 + }, + { + "epoch": 1.7318929202337274, + "grad_norm": 0.9252445697784424, + "learning_rate": 0.0007836068079902161, + "loss": 3.596, + "step": 25490 + }, + { + "epoch": 1.7322326403043893, + "grad_norm": 0.9595430493354797, + "learning_rate": 0.0007835643429813834, + "loss": 3.4234, + "step": 25495 + }, + { + "epoch": 1.732572360375051, + "grad_norm": 1.1580802202224731, + "learning_rate": 0.0007835218779725506, + "loss": 3.6681, + "step": 25500 + }, + { + "epoch": 1.7329120804457128, + "grad_norm": 0.9729642271995544, + "learning_rate": 0.0007834794129637179, + "loss": 3.7202, + "step": 25505 + }, + { + "epoch": 1.7332518005163746, + "grad_norm": 0.8731192946434021, + "learning_rate": 0.0007834369479548851, + "loss": 3.8749, + "step": 25510 + }, + { + "epoch": 1.7335915205870362, + "grad_norm": 0.8856524229049683, + "learning_rate": 0.0007833944829460524, + "loss": 3.4845, + "step": 25515 + }, + { + "epoch": 1.733931240657698, + "grad_norm": 0.7199622988700867, + "learning_rate": 0.0007833520179372198, + "loss": 3.635, + "step": 25520 + }, + { + "epoch": 1.73427096072836, + "grad_norm": 0.8543817400932312, + "learning_rate": 0.000783309552928387, + "loss": 3.267, + "step": 25525 + }, + { + "epoch": 1.7346106807990216, + "grad_norm": 0.9101433753967285, + "learning_rate": 0.0007832670879195543, + "loss": 3.2919, + "step": 25530 + }, + { + "epoch": 1.7349504008696832, + "grad_norm": 0.7267348170280457, + "learning_rate": 0.0007832246229107216, + "loss": 3.3752, + "step": 25535 + }, + { + "epoch": 1.7352901209403453, + "grad_norm": 0.8342488408088684, + "learning_rate": 0.0007831821579018888, + "loss": 3.519, + "step": 25540 + }, + { + "epoch": 1.735629841011007, + "grad_norm": 0.9301750063896179, + "learning_rate": 0.000783139692893056, + "loss": 3.5571, + "step": 25545 + }, + { + "epoch": 1.7359695610816686, + "grad_norm": 1.0548479557037354, + "learning_rate": 0.0007830972278842234, + "loss": 3.6786, + "step": 25550 + }, + { + "epoch": 1.7363092811523306, + "grad_norm": 0.8445652723312378, + "learning_rate": 0.0007830547628753907, + "loss": 3.7233, + "step": 25555 + }, + { + "epoch": 1.7366490012229923, + "grad_norm": 0.7229071855545044, + "learning_rate": 0.000783012297866558, + "loss": 3.6, + "step": 25560 + }, + { + "epoch": 1.736988721293654, + "grad_norm": 0.964106559753418, + "learning_rate": 0.0007829698328577253, + "loss": 3.6133, + "step": 25565 + }, + { + "epoch": 1.737328441364316, + "grad_norm": 0.9455996155738831, + "learning_rate": 0.0007829273678488925, + "loss": 3.3058, + "step": 25570 + }, + { + "epoch": 1.7376681614349776, + "grad_norm": 1.119430422782898, + "learning_rate": 0.0007828849028400597, + "loss": 3.4848, + "step": 25575 + }, + { + "epoch": 1.7380078815056392, + "grad_norm": 1.6902239322662354, + "learning_rate": 0.0007828424378312271, + "loss": 3.6717, + "step": 25580 + }, + { + "epoch": 1.738347601576301, + "grad_norm": 0.8278290629386902, + "learning_rate": 0.0007827999728223943, + "loss": 3.4953, + "step": 25585 + }, + { + "epoch": 1.738687321646963, + "grad_norm": 1.2331725358963013, + "learning_rate": 0.0007827575078135616, + "loss": 3.7851, + "step": 25590 + }, + { + "epoch": 1.7390270417176246, + "grad_norm": 0.8088516592979431, + "learning_rate": 0.000782715042804729, + "loss": 3.7518, + "step": 25595 + }, + { + "epoch": 1.7393667617882864, + "grad_norm": 1.1602987051010132, + "learning_rate": 0.0007826725777958962, + "loss": 3.8443, + "step": 25600 + }, + { + "epoch": 1.7397064818589483, + "grad_norm": 0.9219039082527161, + "learning_rate": 0.0007826301127870635, + "loss": 3.5939, + "step": 25605 + }, + { + "epoch": 1.74004620192961, + "grad_norm": 1.1139945983886719, + "learning_rate": 0.0007825876477782307, + "loss": 3.5899, + "step": 25610 + }, + { + "epoch": 1.7403859220002718, + "grad_norm": 0.9187111854553223, + "learning_rate": 0.000782545182769398, + "loss": 3.8198, + "step": 25615 + }, + { + "epoch": 1.7407256420709336, + "grad_norm": 0.8240117430686951, + "learning_rate": 0.0007825027177605654, + "loss": 3.4817, + "step": 25620 + }, + { + "epoch": 1.7410653621415952, + "grad_norm": 0.8086106181144714, + "learning_rate": 0.0007824602527517326, + "loss": 3.638, + "step": 25625 + }, + { + "epoch": 1.741405082212257, + "grad_norm": 0.8951728940010071, + "learning_rate": 0.0007824177877428999, + "loss": 3.7319, + "step": 25630 + }, + { + "epoch": 1.741744802282919, + "grad_norm": 0.9355682134628296, + "learning_rate": 0.0007823753227340672, + "loss": 3.5715, + "step": 25635 + }, + { + "epoch": 1.7420845223535806, + "grad_norm": 0.7194088697433472, + "learning_rate": 0.0007823328577252344, + "loss": 3.5049, + "step": 25640 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.8790980577468872, + "learning_rate": 0.0007822903927164017, + "loss": 3.4055, + "step": 25645 + }, + { + "epoch": 1.7427639624949043, + "grad_norm": 0.9724138379096985, + "learning_rate": 0.000782247927707569, + "loss": 3.4159, + "step": 25650 + }, + { + "epoch": 1.743103682565566, + "grad_norm": 0.8486266136169434, + "learning_rate": 0.0007822054626987363, + "loss": 3.6172, + "step": 25655 + }, + { + "epoch": 1.7434434026362278, + "grad_norm": 1.023450255393982, + "learning_rate": 0.0007821629976899035, + "loss": 3.464, + "step": 25660 + }, + { + "epoch": 1.7437831227068896, + "grad_norm": 1.21660578250885, + "learning_rate": 0.0007821205326810709, + "loss": 3.5886, + "step": 25665 + }, + { + "epoch": 1.7441228427775513, + "grad_norm": 0.8017347455024719, + "learning_rate": 0.0007820780676722381, + "loss": 3.9497, + "step": 25670 + }, + { + "epoch": 1.744462562848213, + "grad_norm": 2.1581838130950928, + "learning_rate": 0.0007820356026634053, + "loss": 3.4532, + "step": 25675 + }, + { + "epoch": 1.744802282918875, + "grad_norm": 0.8114291429519653, + "learning_rate": 0.0007819931376545727, + "loss": 3.744, + "step": 25680 + }, + { + "epoch": 1.7451420029895366, + "grad_norm": 0.839488685131073, + "learning_rate": 0.0007819506726457399, + "loss": 3.3602, + "step": 25685 + }, + { + "epoch": 1.7454817230601984, + "grad_norm": 0.8448663949966431, + "learning_rate": 0.0007819082076369072, + "loss": 3.6892, + "step": 25690 + }, + { + "epoch": 1.7458214431308603, + "grad_norm": 0.9307170510292053, + "learning_rate": 0.0007818657426280746, + "loss": 3.6696, + "step": 25695 + }, + { + "epoch": 1.746161163201522, + "grad_norm": 1.3388948440551758, + "learning_rate": 0.0007818232776192418, + "loss": 3.5112, + "step": 25700 + }, + { + "epoch": 1.7465008832721836, + "grad_norm": 0.8567869663238525, + "learning_rate": 0.000781780812610409, + "loss": 3.7193, + "step": 25705 + }, + { + "epoch": 1.7468406033428456, + "grad_norm": 1.0738362073898315, + "learning_rate": 0.0007817383476015763, + "loss": 3.3632, + "step": 25710 + }, + { + "epoch": 1.7471803234135073, + "grad_norm": 1.041454792022705, + "learning_rate": 0.0007816958825927436, + "loss": 3.6217, + "step": 25715 + }, + { + "epoch": 1.747520043484169, + "grad_norm": 0.9116367697715759, + "learning_rate": 0.0007816534175839108, + "loss": 3.5317, + "step": 25720 + }, + { + "epoch": 1.747859763554831, + "grad_norm": 1.4112224578857422, + "learning_rate": 0.0007816109525750782, + "loss": 3.5236, + "step": 25725 + }, + { + "epoch": 1.7481994836254926, + "grad_norm": 0.7296573519706726, + "learning_rate": 0.0007815684875662455, + "loss": 3.6255, + "step": 25730 + }, + { + "epoch": 1.7485392036961542, + "grad_norm": 0.7156210541725159, + "learning_rate": 0.0007815260225574127, + "loss": 3.2526, + "step": 25735 + }, + { + "epoch": 1.7488789237668163, + "grad_norm": 4.075358867645264, + "learning_rate": 0.00078148355754858, + "loss": 3.7135, + "step": 25740 + }, + { + "epoch": 1.749218643837478, + "grad_norm": 0.5846410989761353, + "learning_rate": 0.0007814410925397473, + "loss": 3.2703, + "step": 25745 + }, + { + "epoch": 1.7495583639081396, + "grad_norm": 1.097840428352356, + "learning_rate": 0.0007813986275309145, + "loss": 3.6961, + "step": 25750 + }, + { + "epoch": 1.7498980839788014, + "grad_norm": 1.7544199228286743, + "learning_rate": 0.0007813561625220818, + "loss": 3.4998, + "step": 25755 + }, + { + "epoch": 1.7502378040494633, + "grad_norm": 1.2665232419967651, + "learning_rate": 0.0007813136975132492, + "loss": 3.6523, + "step": 25760 + }, + { + "epoch": 1.750577524120125, + "grad_norm": 1.0235204696655273, + "learning_rate": 0.0007812712325044164, + "loss": 3.5007, + "step": 25765 + }, + { + "epoch": 1.7509172441907868, + "grad_norm": 0.6968969702720642, + "learning_rate": 0.0007812287674955837, + "loss": 3.3273, + "step": 25770 + }, + { + "epoch": 1.7512569642614486, + "grad_norm": 0.8375619649887085, + "learning_rate": 0.0007811863024867509, + "loss": 3.6921, + "step": 25775 + }, + { + "epoch": 1.7515966843321102, + "grad_norm": 0.8931649923324585, + "learning_rate": 0.0007811438374779182, + "loss": 3.638, + "step": 25780 + }, + { + "epoch": 1.751936404402772, + "grad_norm": 0.8587384223937988, + "learning_rate": 0.0007811013724690855, + "loss": 3.4838, + "step": 25785 + }, + { + "epoch": 1.752276124473434, + "grad_norm": 0.9998670220375061, + "learning_rate": 0.0007810589074602527, + "loss": 3.768, + "step": 25790 + }, + { + "epoch": 1.7526158445440956, + "grad_norm": 0.6909828186035156, + "learning_rate": 0.0007810164424514201, + "loss": 3.6113, + "step": 25795 + }, + { + "epoch": 1.7529555646147574, + "grad_norm": 0.8434349298477173, + "learning_rate": 0.0007809739774425874, + "loss": 3.6083, + "step": 25800 + }, + { + "epoch": 1.7532952846854193, + "grad_norm": 1.1343122720718384, + "learning_rate": 0.0007809315124337546, + "loss": 3.6339, + "step": 25805 + }, + { + "epoch": 1.753635004756081, + "grad_norm": 0.8196832537651062, + "learning_rate": 0.0007808890474249218, + "loss": 3.7651, + "step": 25810 + }, + { + "epoch": 1.7539747248267428, + "grad_norm": 0.8877002000808716, + "learning_rate": 0.0007808465824160892, + "loss": 3.8074, + "step": 25815 + }, + { + "epoch": 1.7543144448974046, + "grad_norm": 1.0415798425674438, + "learning_rate": 0.0007808041174072564, + "loss": 3.8307, + "step": 25820 + }, + { + "epoch": 1.7546541649680663, + "grad_norm": 0.9773862361907959, + "learning_rate": 0.0007807616523984236, + "loss": 3.5084, + "step": 25825 + }, + { + "epoch": 1.754993885038728, + "grad_norm": 3.991772174835205, + "learning_rate": 0.0007807191873895911, + "loss": 3.7254, + "step": 25830 + }, + { + "epoch": 1.75533360510939, + "grad_norm": 0.8535118699073792, + "learning_rate": 0.0007806767223807583, + "loss": 3.6692, + "step": 25835 + }, + { + "epoch": 1.7556733251800516, + "grad_norm": 0.7524866461753845, + "learning_rate": 0.0007806342573719255, + "loss": 3.6096, + "step": 25840 + }, + { + "epoch": 1.7560130452507134, + "grad_norm": 1.0126885175704956, + "learning_rate": 0.0007805917923630929, + "loss": 3.6979, + "step": 25845 + }, + { + "epoch": 1.7563527653213753, + "grad_norm": 1.7661908864974976, + "learning_rate": 0.0007805493273542601, + "loss": 3.4849, + "step": 25850 + }, + { + "epoch": 1.756692485392037, + "grad_norm": 0.9480913877487183, + "learning_rate": 0.0007805068623454273, + "loss": 3.6705, + "step": 25855 + }, + { + "epoch": 1.7570322054626988, + "grad_norm": 1.0913708209991455, + "learning_rate": 0.0007804643973365946, + "loss": 3.6251, + "step": 25860 + }, + { + "epoch": 1.7573719255333606, + "grad_norm": 1.0106046199798584, + "learning_rate": 0.000780421932327762, + "loss": 3.6321, + "step": 25865 + }, + { + "epoch": 1.7577116456040223, + "grad_norm": 0.7162382006645203, + "learning_rate": 0.0007803794673189292, + "loss": 3.4046, + "step": 25870 + }, + { + "epoch": 1.758051365674684, + "grad_norm": 0.9232354760169983, + "learning_rate": 0.0007803370023100965, + "loss": 3.3307, + "step": 25875 + }, + { + "epoch": 1.758391085745346, + "grad_norm": 0.8626432418823242, + "learning_rate": 0.0007802945373012638, + "loss": 3.397, + "step": 25880 + }, + { + "epoch": 1.7587308058160076, + "grad_norm": 0.7222766876220703, + "learning_rate": 0.000780252072292431, + "loss": 3.6525, + "step": 25885 + }, + { + "epoch": 1.7590705258866692, + "grad_norm": 0.860196590423584, + "learning_rate": 0.0007802096072835983, + "loss": 3.443, + "step": 25890 + }, + { + "epoch": 1.7594102459573313, + "grad_norm": 0.8334835171699524, + "learning_rate": 0.0007801671422747655, + "loss": 3.6244, + "step": 25895 + }, + { + "epoch": 1.759749966027993, + "grad_norm": 0.8382884860038757, + "learning_rate": 0.0007801246772659329, + "loss": 3.2737, + "step": 25900 + }, + { + "epoch": 1.7600896860986546, + "grad_norm": 1.0288357734680176, + "learning_rate": 0.0007800822122571002, + "loss": 3.556, + "step": 25905 + }, + { + "epoch": 1.7604294061693166, + "grad_norm": 0.9166410565376282, + "learning_rate": 0.0007800397472482674, + "loss": 3.6862, + "step": 25910 + }, + { + "epoch": 1.7607691262399783, + "grad_norm": 1.0280808210372925, + "learning_rate": 0.0007799972822394347, + "loss": 3.6957, + "step": 25915 + }, + { + "epoch": 1.76110884631064, + "grad_norm": 0.9952377080917358, + "learning_rate": 0.000779954817230602, + "loss": 3.4839, + "step": 25920 + }, + { + "epoch": 1.7614485663813018, + "grad_norm": 1.0024325847625732, + "learning_rate": 0.0007799123522217692, + "loss": 3.6026, + "step": 25925 + }, + { + "epoch": 1.7617882864519636, + "grad_norm": 0.7097206115722656, + "learning_rate": 0.0007798698872129365, + "loss": 3.6558, + "step": 25930 + }, + { + "epoch": 1.7621280065226252, + "grad_norm": 1.056200623512268, + "learning_rate": 0.0007798274222041039, + "loss": 3.6814, + "step": 25935 + }, + { + "epoch": 1.762467726593287, + "grad_norm": 0.9710252285003662, + "learning_rate": 0.0007797849571952711, + "loss": 3.4544, + "step": 25940 + }, + { + "epoch": 1.762807446663949, + "grad_norm": 0.9146710634231567, + "learning_rate": 0.0007797424921864385, + "loss": 3.9267, + "step": 25945 + }, + { + "epoch": 1.7631471667346106, + "grad_norm": 1.0249470472335815, + "learning_rate": 0.0007797000271776057, + "loss": 3.5271, + "step": 25950 + }, + { + "epoch": 1.7634868868052724, + "grad_norm": 0.8337762355804443, + "learning_rate": 0.0007796575621687729, + "loss": 3.8051, + "step": 25955 + }, + { + "epoch": 1.7638266068759343, + "grad_norm": 0.7267661690711975, + "learning_rate": 0.0007796150971599402, + "loss": 3.7149, + "step": 25960 + }, + { + "epoch": 1.764166326946596, + "grad_norm": 1.1663480997085571, + "learning_rate": 0.0007795726321511075, + "loss": 3.5452, + "step": 25965 + }, + { + "epoch": 1.7645060470172578, + "grad_norm": 0.9253443479537964, + "learning_rate": 0.0007795301671422748, + "loss": 3.9096, + "step": 25970 + }, + { + "epoch": 1.7648457670879196, + "grad_norm": 0.7172528505325317, + "learning_rate": 0.0007794877021334421, + "loss": 3.4998, + "step": 25975 + }, + { + "epoch": 1.7651854871585813, + "grad_norm": 0.6922627687454224, + "learning_rate": 0.0007794452371246094, + "loss": 3.466, + "step": 25980 + }, + { + "epoch": 1.765525207229243, + "grad_norm": 1.1829173564910889, + "learning_rate": 0.0007794027721157766, + "loss": 3.4829, + "step": 25985 + }, + { + "epoch": 1.765864927299905, + "grad_norm": 0.8547922372817993, + "learning_rate": 0.0007793603071069439, + "loss": 3.5566, + "step": 25990 + }, + { + "epoch": 1.7662046473705666, + "grad_norm": 0.9761931896209717, + "learning_rate": 0.0007793178420981112, + "loss": 3.9405, + "step": 25995 + }, + { + "epoch": 1.7665443674412284, + "grad_norm": 0.8508656024932861, + "learning_rate": 0.0007792753770892784, + "loss": 3.7588, + "step": 26000 + }, + { + "epoch": 1.7668840875118903, + "grad_norm": 1.2825756072998047, + "learning_rate": 0.0007792329120804458, + "loss": 3.6101, + "step": 26005 + }, + { + "epoch": 1.767223807582552, + "grad_norm": 0.9895874857902527, + "learning_rate": 0.000779190447071613, + "loss": 3.8342, + "step": 26010 + }, + { + "epoch": 1.7675635276532138, + "grad_norm": 0.7512866854667664, + "learning_rate": 0.0007791479820627803, + "loss": 3.5674, + "step": 26015 + }, + { + "epoch": 1.7679032477238756, + "grad_norm": 1.0592421293258667, + "learning_rate": 0.0007791055170539476, + "loss": 3.4491, + "step": 26020 + }, + { + "epoch": 1.7682429677945373, + "grad_norm": 0.8552484512329102, + "learning_rate": 0.0007790630520451148, + "loss": 3.2148, + "step": 26025 + }, + { + "epoch": 1.7685826878651991, + "grad_norm": 0.7596625089645386, + "learning_rate": 0.0007790205870362821, + "loss": 3.4947, + "step": 26030 + }, + { + "epoch": 1.768922407935861, + "grad_norm": 0.8540998101234436, + "learning_rate": 0.0007789781220274494, + "loss": 3.5282, + "step": 26035 + }, + { + "epoch": 1.7692621280065226, + "grad_norm": 0.8711145520210266, + "learning_rate": 0.0007789356570186167, + "loss": 3.6849, + "step": 26040 + }, + { + "epoch": 1.7696018480771842, + "grad_norm": 0.8310855031013489, + "learning_rate": 0.000778893192009784, + "loss": 3.5534, + "step": 26045 + }, + { + "epoch": 1.7699415681478463, + "grad_norm": 1.0801607370376587, + "learning_rate": 0.0007788507270009513, + "loss": 3.6423, + "step": 26050 + }, + { + "epoch": 1.770281288218508, + "grad_norm": 0.8533782958984375, + "learning_rate": 0.0007788082619921185, + "loss": 3.3905, + "step": 26055 + }, + { + "epoch": 1.7706210082891696, + "grad_norm": 0.8026643395423889, + "learning_rate": 0.0007787657969832857, + "loss": 3.5885, + "step": 26060 + }, + { + "epoch": 1.7709607283598316, + "grad_norm": 0.7627477049827576, + "learning_rate": 0.0007787233319744531, + "loss": 3.5566, + "step": 26065 + }, + { + "epoch": 1.7713004484304933, + "grad_norm": 0.9307865500450134, + "learning_rate": 0.0007786808669656203, + "loss": 3.3879, + "step": 26070 + }, + { + "epoch": 1.771640168501155, + "grad_norm": 1.039922833442688, + "learning_rate": 0.0007786384019567876, + "loss": 3.4412, + "step": 26075 + }, + { + "epoch": 1.771979888571817, + "grad_norm": 1.109264850616455, + "learning_rate": 0.000778595936947955, + "loss": 3.6952, + "step": 26080 + }, + { + "epoch": 1.7723196086424786, + "grad_norm": 0.828087329864502, + "learning_rate": 0.0007785534719391222, + "loss": 3.4133, + "step": 26085 + }, + { + "epoch": 1.7726593287131402, + "grad_norm": 0.8170620203018188, + "learning_rate": 0.0007785110069302894, + "loss": 3.6949, + "step": 26090 + }, + { + "epoch": 1.772999048783802, + "grad_norm": 1.6277247667312622, + "learning_rate": 0.0007784685419214568, + "loss": 3.3292, + "step": 26095 + }, + { + "epoch": 1.773338768854464, + "grad_norm": 1.0147051811218262, + "learning_rate": 0.000778426076912624, + "loss": 3.4946, + "step": 26100 + }, + { + "epoch": 1.7736784889251256, + "grad_norm": 0.5892513990402222, + "learning_rate": 0.0007783836119037912, + "loss": 3.5997, + "step": 26105 + }, + { + "epoch": 1.7740182089957874, + "grad_norm": 0.9243765473365784, + "learning_rate": 0.0007783411468949586, + "loss": 3.2318, + "step": 26110 + }, + { + "epoch": 1.7743579290664493, + "grad_norm": 0.9314038753509521, + "learning_rate": 0.0007782986818861259, + "loss": 3.5514, + "step": 26115 + }, + { + "epoch": 1.774697649137111, + "grad_norm": 0.9925742745399475, + "learning_rate": 0.0007782562168772931, + "loss": 3.7543, + "step": 26120 + }, + { + "epoch": 1.7750373692077728, + "grad_norm": 1.3409960269927979, + "learning_rate": 0.0007782137518684604, + "loss": 3.6573, + "step": 26125 + }, + { + "epoch": 1.7753770892784346, + "grad_norm": 0.7649749517440796, + "learning_rate": 0.0007781712868596277, + "loss": 3.5824, + "step": 26130 + }, + { + "epoch": 1.7757168093490963, + "grad_norm": 0.8953922390937805, + "learning_rate": 0.0007781288218507949, + "loss": 3.584, + "step": 26135 + }, + { + "epoch": 1.7760565294197581, + "grad_norm": 0.9761404991149902, + "learning_rate": 0.0007780863568419622, + "loss": 3.6341, + "step": 26140 + }, + { + "epoch": 1.77639624949042, + "grad_norm": 1.1089476346969604, + "learning_rate": 0.0007780438918331296, + "loss": 3.5745, + "step": 26145 + }, + { + "epoch": 1.7767359695610816, + "grad_norm": 0.9463505148887634, + "learning_rate": 0.0007780014268242968, + "loss": 3.3888, + "step": 26150 + }, + { + "epoch": 1.7770756896317434, + "grad_norm": 0.8914766907691956, + "learning_rate": 0.0007779589618154641, + "loss": 3.3685, + "step": 26155 + }, + { + "epoch": 1.7774154097024053, + "grad_norm": 0.8360973000526428, + "learning_rate": 0.0007779164968066313, + "loss": 3.5299, + "step": 26160 + }, + { + "epoch": 1.777755129773067, + "grad_norm": 0.9641970992088318, + "learning_rate": 0.0007778740317977986, + "loss": 3.6653, + "step": 26165 + }, + { + "epoch": 1.7780948498437288, + "grad_norm": 1.0088919401168823, + "learning_rate": 0.0007778315667889659, + "loss": 3.6981, + "step": 26170 + }, + { + "epoch": 1.7784345699143906, + "grad_norm": 0.8954862356185913, + "learning_rate": 0.0007777891017801331, + "loss": 3.324, + "step": 26175 + }, + { + "epoch": 1.7787742899850523, + "grad_norm": 0.7950172424316406, + "learning_rate": 0.0007777466367713005, + "loss": 3.9016, + "step": 26180 + }, + { + "epoch": 1.7791140100557141, + "grad_norm": 1.8928781747817993, + "learning_rate": 0.0007777041717624678, + "loss": 3.6768, + "step": 26185 + }, + { + "epoch": 1.779453730126376, + "grad_norm": 1.0018020868301392, + "learning_rate": 0.000777661706753635, + "loss": 3.5019, + "step": 26190 + }, + { + "epoch": 1.7797934501970376, + "grad_norm": 0.8608434200286865, + "learning_rate": 0.0007776192417448022, + "loss": 3.2994, + "step": 26195 + }, + { + "epoch": 1.7801331702676995, + "grad_norm": 1.0932605266571045, + "learning_rate": 0.0007775767767359696, + "loss": 3.3956, + "step": 26200 + }, + { + "epoch": 1.7804728903383613, + "grad_norm": 1.116819143295288, + "learning_rate": 0.0007775343117271368, + "loss": 3.5296, + "step": 26205 + }, + { + "epoch": 1.780812610409023, + "grad_norm": 1.7676503658294678, + "learning_rate": 0.000777491846718304, + "loss": 3.626, + "step": 26210 + }, + { + "epoch": 1.7811523304796846, + "grad_norm": 0.8664410710334778, + "learning_rate": 0.0007774493817094715, + "loss": 3.4848, + "step": 26215 + }, + { + "epoch": 1.7814920505503467, + "grad_norm": 0.9728400111198425, + "learning_rate": 0.0007774069167006387, + "loss": 3.5147, + "step": 26220 + }, + { + "epoch": 1.7818317706210083, + "grad_norm": 0.6609117984771729, + "learning_rate": 0.0007773644516918059, + "loss": 3.6803, + "step": 26225 + }, + { + "epoch": 1.78217149069167, + "grad_norm": 1.5286531448364258, + "learning_rate": 0.0007773219866829733, + "loss": 3.7605, + "step": 26230 + }, + { + "epoch": 1.782511210762332, + "grad_norm": 0.692642867565155, + "learning_rate": 0.0007772795216741405, + "loss": 3.636, + "step": 26235 + }, + { + "epoch": 1.7828509308329936, + "grad_norm": 1.2491809129714966, + "learning_rate": 0.0007772370566653077, + "loss": 3.5585, + "step": 26240 + }, + { + "epoch": 1.7831906509036552, + "grad_norm": 0.8209534287452698, + "learning_rate": 0.0007771945916564752, + "loss": 3.6854, + "step": 26245 + }, + { + "epoch": 1.7835303709743173, + "grad_norm": 0.8455584049224854, + "learning_rate": 0.0007771521266476424, + "loss": 3.7616, + "step": 26250 + }, + { + "epoch": 1.783870091044979, + "grad_norm": 0.6646820306777954, + "learning_rate": 0.0007771096616388096, + "loss": 3.4935, + "step": 26255 + }, + { + "epoch": 1.7842098111156406, + "grad_norm": 0.868610680103302, + "learning_rate": 0.0007770671966299769, + "loss": 3.521, + "step": 26260 + }, + { + "epoch": 1.7845495311863024, + "grad_norm": 2.3825769424438477, + "learning_rate": 0.0007770247316211442, + "loss": 3.5186, + "step": 26265 + }, + { + "epoch": 1.7848892512569643, + "grad_norm": 0.7983075380325317, + "learning_rate": 0.0007769822666123114, + "loss": 3.6815, + "step": 26270 + }, + { + "epoch": 1.785228971327626, + "grad_norm": 1.0889832973480225, + "learning_rate": 0.0007769398016034787, + "loss": 3.6708, + "step": 26275 + }, + { + "epoch": 1.7855686913982878, + "grad_norm": 0.6563325524330139, + "learning_rate": 0.0007768973365946461, + "loss": 3.6823, + "step": 26280 + }, + { + "epoch": 1.7859084114689496, + "grad_norm": 0.806905210018158, + "learning_rate": 0.0007768548715858134, + "loss": 3.6657, + "step": 26285 + }, + { + "epoch": 1.7862481315396113, + "grad_norm": 0.726801335811615, + "learning_rate": 0.0007768124065769806, + "loss": 3.7256, + "step": 26290 + }, + { + "epoch": 1.7865878516102731, + "grad_norm": 0.8535120487213135, + "learning_rate": 0.0007767699415681478, + "loss": 3.6044, + "step": 26295 + }, + { + "epoch": 1.786927571680935, + "grad_norm": 0.6706191897392273, + "learning_rate": 0.0007767274765593152, + "loss": 3.4134, + "step": 26300 + }, + { + "epoch": 1.7872672917515966, + "grad_norm": 0.7841045260429382, + "learning_rate": 0.0007766850115504824, + "loss": 3.5782, + "step": 26305 + }, + { + "epoch": 1.7876070118222585, + "grad_norm": 0.7997345328330994, + "learning_rate": 0.0007766425465416496, + "loss": 3.6688, + "step": 26310 + }, + { + "epoch": 1.7879467318929203, + "grad_norm": 0.7625271677970886, + "learning_rate": 0.0007766000815328171, + "loss": 3.6902, + "step": 26315 + }, + { + "epoch": 1.788286451963582, + "grad_norm": 0.6723198890686035, + "learning_rate": 0.0007765576165239843, + "loss": 3.4738, + "step": 26320 + }, + { + "epoch": 1.7886261720342438, + "grad_norm": 0.8226199150085449, + "learning_rate": 0.0007765151515151515, + "loss": 3.4325, + "step": 26325 + }, + { + "epoch": 1.7889658921049056, + "grad_norm": 0.8736315369606018, + "learning_rate": 0.0007764726865063189, + "loss": 3.6262, + "step": 26330 + }, + { + "epoch": 1.7893056121755673, + "grad_norm": 0.7992656230926514, + "learning_rate": 0.0007764302214974861, + "loss": 3.511, + "step": 26335 + }, + { + "epoch": 1.7896453322462291, + "grad_norm": 0.9451347589492798, + "learning_rate": 0.0007763877564886533, + "loss": 3.6718, + "step": 26340 + }, + { + "epoch": 1.789985052316891, + "grad_norm": 0.8022575974464417, + "learning_rate": 0.0007763452914798206, + "loss": 3.6199, + "step": 26345 + }, + { + "epoch": 1.7903247723875526, + "grad_norm": 0.843263566493988, + "learning_rate": 0.000776302826470988, + "loss": 3.929, + "step": 26350 + }, + { + "epoch": 1.7906644924582145, + "grad_norm": 1.4878934621810913, + "learning_rate": 0.0007762603614621552, + "loss": 3.8175, + "step": 26355 + }, + { + "epoch": 1.7910042125288763, + "grad_norm": 0.8609336018562317, + "learning_rate": 0.0007762178964533225, + "loss": 3.6099, + "step": 26360 + }, + { + "epoch": 1.791343932599538, + "grad_norm": 0.9350732564926147, + "learning_rate": 0.0007761754314444898, + "loss": 3.7466, + "step": 26365 + }, + { + "epoch": 1.7916836526701998, + "grad_norm": 0.9103706479072571, + "learning_rate": 0.000776132966435657, + "loss": 3.6775, + "step": 26370 + }, + { + "epoch": 1.7920233727408617, + "grad_norm": 1.3015198707580566, + "learning_rate": 0.0007760905014268243, + "loss": 3.6456, + "step": 26375 + }, + { + "epoch": 1.7923630928115233, + "grad_norm": 0.7334905862808228, + "learning_rate": 0.0007760480364179916, + "loss": 3.7843, + "step": 26380 + }, + { + "epoch": 1.792702812882185, + "grad_norm": 5.041487216949463, + "learning_rate": 0.0007760055714091589, + "loss": 3.7128, + "step": 26385 + }, + { + "epoch": 1.793042532952847, + "grad_norm": 1.103271245956421, + "learning_rate": 0.0007759631064003262, + "loss": 3.6786, + "step": 26390 + }, + { + "epoch": 1.7933822530235086, + "grad_norm": 0.9240031838417053, + "learning_rate": 0.0007759206413914934, + "loss": 3.7832, + "step": 26395 + }, + { + "epoch": 1.7937219730941703, + "grad_norm": 4.353055000305176, + "learning_rate": 0.0007758781763826607, + "loss": 3.5267, + "step": 26400 + }, + { + "epoch": 1.7940616931648323, + "grad_norm": 2.2458434104919434, + "learning_rate": 0.000775835711373828, + "loss": 3.5564, + "step": 26405 + }, + { + "epoch": 1.794401413235494, + "grad_norm": 0.8415870070457458, + "learning_rate": 0.0007757932463649952, + "loss": 3.5073, + "step": 26410 + }, + { + "epoch": 1.7947411333061556, + "grad_norm": 0.7067269086837769, + "learning_rate": 0.0007757507813561625, + "loss": 3.5724, + "step": 26415 + }, + { + "epoch": 1.7950808533768177, + "grad_norm": 0.7915319204330444, + "learning_rate": 0.0007757083163473299, + "loss": 3.4585, + "step": 26420 + }, + { + "epoch": 1.7954205734474793, + "grad_norm": 0.9206570982933044, + "learning_rate": 0.0007756658513384971, + "loss": 3.3685, + "step": 26425 + }, + { + "epoch": 1.795760293518141, + "grad_norm": 0.9234087467193604, + "learning_rate": 0.0007756233863296644, + "loss": 3.5621, + "step": 26430 + }, + { + "epoch": 1.7961000135888028, + "grad_norm": 1.0292432308197021, + "learning_rate": 0.0007755809213208317, + "loss": 3.4477, + "step": 26435 + }, + { + "epoch": 1.7964397336594646, + "grad_norm": 1.0153669118881226, + "learning_rate": 0.0007755384563119989, + "loss": 3.7404, + "step": 26440 + }, + { + "epoch": 1.7967794537301263, + "grad_norm": 0.716339111328125, + "learning_rate": 0.0007754959913031661, + "loss": 3.4089, + "step": 26445 + }, + { + "epoch": 1.7971191738007881, + "grad_norm": 0.8573842644691467, + "learning_rate": 0.0007754535262943335, + "loss": 3.2258, + "step": 26450 + }, + { + "epoch": 1.79745889387145, + "grad_norm": 1.0687893629074097, + "learning_rate": 0.0007754110612855008, + "loss": 3.6171, + "step": 26455 + }, + { + "epoch": 1.7977986139421116, + "grad_norm": 0.8323343992233276, + "learning_rate": 0.000775368596276668, + "loss": 3.704, + "step": 26460 + }, + { + "epoch": 1.7981383340127735, + "grad_norm": 0.7410216331481934, + "learning_rate": 0.0007753261312678354, + "loss": 3.5272, + "step": 26465 + }, + { + "epoch": 1.7984780540834353, + "grad_norm": 1.0385221242904663, + "learning_rate": 0.0007752836662590026, + "loss": 3.6735, + "step": 26470 + }, + { + "epoch": 1.798817774154097, + "grad_norm": 0.9131577610969543, + "learning_rate": 0.0007752412012501698, + "loss": 3.5557, + "step": 26475 + }, + { + "epoch": 1.7991574942247588, + "grad_norm": 0.8447600603103638, + "learning_rate": 0.0007751987362413372, + "loss": 3.7006, + "step": 26480 + }, + { + "epoch": 1.7994972142954206, + "grad_norm": 1.0111380815505981, + "learning_rate": 0.0007751562712325044, + "loss": 3.5316, + "step": 26485 + }, + { + "epoch": 1.7998369343660823, + "grad_norm": 0.9352080225944519, + "learning_rate": 0.0007751138062236717, + "loss": 3.5204, + "step": 26490 + }, + { + "epoch": 1.8001766544367441, + "grad_norm": 0.7293272614479065, + "learning_rate": 0.000775071341214839, + "loss": 3.6204, + "step": 26495 + }, + { + "epoch": 1.800516374507406, + "grad_norm": 0.7425527572631836, + "learning_rate": 0.0007750288762060063, + "loss": 3.5684, + "step": 26500 + }, + { + "epoch": 1.8008560945780676, + "grad_norm": 1.0074331760406494, + "learning_rate": 0.0007749864111971735, + "loss": 3.5079, + "step": 26505 + }, + { + "epoch": 1.8011958146487295, + "grad_norm": 0.8195802569389343, + "learning_rate": 0.0007749439461883408, + "loss": 3.9284, + "step": 26510 + }, + { + "epoch": 1.8015355347193913, + "grad_norm": 0.8029392957687378, + "learning_rate": 0.0007749014811795081, + "loss": 3.6213, + "step": 26515 + }, + { + "epoch": 1.801875254790053, + "grad_norm": 0.9631680250167847, + "learning_rate": 0.0007748590161706753, + "loss": 3.2753, + "step": 26520 + }, + { + "epoch": 1.8022149748607148, + "grad_norm": 0.8972461819648743, + "learning_rate": 0.0007748165511618427, + "loss": 3.7464, + "step": 26525 + }, + { + "epoch": 1.8025546949313767, + "grad_norm": 0.79483962059021, + "learning_rate": 0.00077477408615301, + "loss": 3.4682, + "step": 26530 + }, + { + "epoch": 1.8028944150020383, + "grad_norm": 0.9259505867958069, + "learning_rate": 0.0007747316211441772, + "loss": 3.7648, + "step": 26535 + }, + { + "epoch": 1.8032341350727001, + "grad_norm": 0.8305854797363281, + "learning_rate": 0.0007746891561353445, + "loss": 3.7059, + "step": 26540 + }, + { + "epoch": 1.803573855143362, + "grad_norm": 0.7770560383796692, + "learning_rate": 0.0007746466911265117, + "loss": 3.6379, + "step": 26545 + }, + { + "epoch": 1.8039135752140236, + "grad_norm": 1.0977897644042969, + "learning_rate": 0.000774604226117679, + "loss": 3.5133, + "step": 26550 + }, + { + "epoch": 1.8042532952846853, + "grad_norm": 0.851218044757843, + "learning_rate": 0.0007745617611088463, + "loss": 3.824, + "step": 26555 + }, + { + "epoch": 1.8045930153553473, + "grad_norm": 1.5430293083190918, + "learning_rate": 0.0007745192961000136, + "loss": 3.7666, + "step": 26560 + }, + { + "epoch": 1.804932735426009, + "grad_norm": 0.8333562612533569, + "learning_rate": 0.0007744768310911809, + "loss": 3.5449, + "step": 26565 + }, + { + "epoch": 1.8052724554966706, + "grad_norm": 0.8562154769897461, + "learning_rate": 0.0007744343660823482, + "loss": 3.6474, + "step": 26570 + }, + { + "epoch": 1.8056121755673327, + "grad_norm": 0.9571211934089661, + "learning_rate": 0.0007743919010735154, + "loss": 3.4472, + "step": 26575 + }, + { + "epoch": 1.8059518956379943, + "grad_norm": 0.8533197045326233, + "learning_rate": 0.0007743494360646826, + "loss": 3.4907, + "step": 26580 + }, + { + "epoch": 1.806291615708656, + "grad_norm": 0.752453625202179, + "learning_rate": 0.00077430697105585, + "loss": 3.4891, + "step": 26585 + }, + { + "epoch": 1.806631335779318, + "grad_norm": 0.9040192365646362, + "learning_rate": 0.0007742645060470172, + "loss": 3.6384, + "step": 26590 + }, + { + "epoch": 1.8069710558499796, + "grad_norm": 1.316732406616211, + "learning_rate": 0.0007742220410381845, + "loss": 3.6228, + "step": 26595 + }, + { + "epoch": 1.8073107759206413, + "grad_norm": 1.0320606231689453, + "learning_rate": 0.0007741795760293519, + "loss": 3.7584, + "step": 26600 + }, + { + "epoch": 1.8076504959913031, + "grad_norm": 0.8353241682052612, + "learning_rate": 0.0007741371110205191, + "loss": 3.5403, + "step": 26605 + }, + { + "epoch": 1.807990216061965, + "grad_norm": 0.749635636806488, + "learning_rate": 0.0007740946460116863, + "loss": 3.5852, + "step": 26610 + }, + { + "epoch": 1.8083299361326266, + "grad_norm": 0.9224425554275513, + "learning_rate": 0.0007740521810028537, + "loss": 3.4959, + "step": 26615 + }, + { + "epoch": 1.8086696562032885, + "grad_norm": 0.7784193158149719, + "learning_rate": 0.0007740097159940209, + "loss": 3.6604, + "step": 26620 + }, + { + "epoch": 1.8090093762739503, + "grad_norm": 0.7847733497619629, + "learning_rate": 0.0007739672509851882, + "loss": 3.4701, + "step": 26625 + }, + { + "epoch": 1.809349096344612, + "grad_norm": 0.7535330653190613, + "learning_rate": 0.0007739247859763556, + "loss": 3.5982, + "step": 26630 + }, + { + "epoch": 1.8096888164152738, + "grad_norm": 0.8146076798439026, + "learning_rate": 0.0007738823209675228, + "loss": 3.9607, + "step": 26635 + }, + { + "epoch": 1.8100285364859356, + "grad_norm": 1.0054246187210083, + "learning_rate": 0.0007738398559586901, + "loss": 3.5358, + "step": 26640 + }, + { + "epoch": 1.8103682565565973, + "grad_norm": 0.77800452709198, + "learning_rate": 0.0007737973909498573, + "loss": 3.6995, + "step": 26645 + }, + { + "epoch": 1.8107079766272591, + "grad_norm": 0.7917641401290894, + "learning_rate": 0.0007737549259410246, + "loss": 3.7104, + "step": 26650 + }, + { + "epoch": 1.811047696697921, + "grad_norm": 0.8000007271766663, + "learning_rate": 0.0007737124609321919, + "loss": 3.6263, + "step": 26655 + }, + { + "epoch": 1.8113874167685826, + "grad_norm": 0.7562968134880066, + "learning_rate": 0.0007736699959233591, + "loss": 3.5706, + "step": 26660 + }, + { + "epoch": 1.8117271368392445, + "grad_norm": 0.637056827545166, + "learning_rate": 0.0007736275309145265, + "loss": 3.4834, + "step": 26665 + }, + { + "epoch": 1.8120668569099063, + "grad_norm": 0.9027867317199707, + "learning_rate": 0.0007735850659056938, + "loss": 3.5873, + "step": 26670 + }, + { + "epoch": 1.812406576980568, + "grad_norm": 0.9056907296180725, + "learning_rate": 0.000773542600896861, + "loss": 3.5678, + "step": 26675 + }, + { + "epoch": 1.8127462970512298, + "grad_norm": 0.8989571928977966, + "learning_rate": 0.0007735001358880283, + "loss": 3.4129, + "step": 26680 + }, + { + "epoch": 1.8130860171218917, + "grad_norm": 0.9048509001731873, + "learning_rate": 0.0007734576708791956, + "loss": 3.5242, + "step": 26685 + }, + { + "epoch": 1.8134257371925533, + "grad_norm": 0.8693633675575256, + "learning_rate": 0.0007734152058703628, + "loss": 3.5506, + "step": 26690 + }, + { + "epoch": 1.8137654572632151, + "grad_norm": 0.9678965210914612, + "learning_rate": 0.00077337274086153, + "loss": 3.9893, + "step": 26695 + }, + { + "epoch": 1.814105177333877, + "grad_norm": 0.7007737159729004, + "learning_rate": 0.0007733302758526975, + "loss": 3.6464, + "step": 26700 + }, + { + "epoch": 1.8144448974045386, + "grad_norm": 0.7786060571670532, + "learning_rate": 0.0007732878108438647, + "loss": 3.3627, + "step": 26705 + }, + { + "epoch": 1.8147846174752005, + "grad_norm": 0.7253232002258301, + "learning_rate": 0.0007732453458350319, + "loss": 3.7853, + "step": 26710 + }, + { + "epoch": 1.8151243375458623, + "grad_norm": 0.938340961933136, + "learning_rate": 0.0007732028808261993, + "loss": 3.5289, + "step": 26715 + }, + { + "epoch": 1.815464057616524, + "grad_norm": 0.9805700778961182, + "learning_rate": 0.0007731604158173665, + "loss": 3.5003, + "step": 26720 + }, + { + "epoch": 1.8158037776871856, + "grad_norm": 0.7574619650840759, + "learning_rate": 0.0007731179508085337, + "loss": 3.6411, + "step": 26725 + }, + { + "epoch": 1.8161434977578477, + "grad_norm": 1.2061123847961426, + "learning_rate": 0.0007730754857997012, + "loss": 3.7545, + "step": 26730 + }, + { + "epoch": 1.8164832178285093, + "grad_norm": 0.9123266339302063, + "learning_rate": 0.0007730330207908684, + "loss": 3.726, + "step": 26735 + }, + { + "epoch": 1.816822937899171, + "grad_norm": 11.01323127746582, + "learning_rate": 0.0007729905557820356, + "loss": 3.1969, + "step": 26740 + }, + { + "epoch": 1.817162657969833, + "grad_norm": 0.728942334651947, + "learning_rate": 0.000772948090773203, + "loss": 3.7608, + "step": 26745 + }, + { + "epoch": 1.8175023780404946, + "grad_norm": 1.533151626586914, + "learning_rate": 0.0007729056257643702, + "loss": 3.8973, + "step": 26750 + }, + { + "epoch": 1.8178420981111563, + "grad_norm": 0.6320874691009521, + "learning_rate": 0.0007728631607555374, + "loss": 3.5937, + "step": 26755 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.1858397722244263, + "learning_rate": 0.0007728206957467047, + "loss": 3.5858, + "step": 26760 + }, + { + "epoch": 1.81852153825248, + "grad_norm": 0.9512020945549011, + "learning_rate": 0.0007727782307378721, + "loss": 3.4174, + "step": 26765 + }, + { + "epoch": 1.8188612583231416, + "grad_norm": 0.7724952697753906, + "learning_rate": 0.0007727357657290393, + "loss": 3.6791, + "step": 26770 + }, + { + "epoch": 1.8192009783938035, + "grad_norm": 0.9456770420074463, + "learning_rate": 0.0007726933007202066, + "loss": 3.6745, + "step": 26775 + }, + { + "epoch": 1.8195406984644653, + "grad_norm": 1.0876706838607788, + "learning_rate": 0.0007726508357113739, + "loss": 3.4008, + "step": 26780 + }, + { + "epoch": 1.819880418535127, + "grad_norm": 0.7621595859527588, + "learning_rate": 0.0007726083707025411, + "loss": 3.5668, + "step": 26785 + }, + { + "epoch": 1.8202201386057888, + "grad_norm": 0.9992814064025879, + "learning_rate": 0.0007725659056937084, + "loss": 3.4792, + "step": 26790 + }, + { + "epoch": 1.8205598586764506, + "grad_norm": 0.9767394065856934, + "learning_rate": 0.0007725234406848756, + "loss": 3.4771, + "step": 26795 + }, + { + "epoch": 1.8208995787471123, + "grad_norm": 0.9584024548530579, + "learning_rate": 0.000772480975676043, + "loss": 3.6694, + "step": 26800 + }, + { + "epoch": 1.8212392988177741, + "grad_norm": 1.0192334651947021, + "learning_rate": 0.0007724385106672103, + "loss": 3.7534, + "step": 26805 + }, + { + "epoch": 1.821579018888436, + "grad_norm": 1.1204872131347656, + "learning_rate": 0.0007723960456583775, + "loss": 3.5906, + "step": 26810 + }, + { + "epoch": 1.8219187389590976, + "grad_norm": 0.8234894275665283, + "learning_rate": 0.0007723535806495448, + "loss": 3.5513, + "step": 26815 + }, + { + "epoch": 1.8222584590297595, + "grad_norm": 0.7586296796798706, + "learning_rate": 0.0007723111156407121, + "loss": 3.4379, + "step": 26820 + }, + { + "epoch": 1.8225981791004213, + "grad_norm": 0.8469511270523071, + "learning_rate": 0.0007722686506318793, + "loss": 3.406, + "step": 26825 + }, + { + "epoch": 1.822937899171083, + "grad_norm": 0.9185637831687927, + "learning_rate": 0.0007722261856230465, + "loss": 3.55, + "step": 26830 + }, + { + "epoch": 1.8232776192417448, + "grad_norm": 0.7647606730461121, + "learning_rate": 0.000772183720614214, + "loss": 3.5772, + "step": 26835 + }, + { + "epoch": 1.8236173393124067, + "grad_norm": 0.8687662482261658, + "learning_rate": 0.0007721412556053812, + "loss": 3.6965, + "step": 26840 + }, + { + "epoch": 1.8239570593830683, + "grad_norm": 0.8910872340202332, + "learning_rate": 0.0007720987905965484, + "loss": 3.6021, + "step": 26845 + }, + { + "epoch": 1.8242967794537301, + "grad_norm": 0.7702224254608154, + "learning_rate": 0.0007720563255877158, + "loss": 3.7671, + "step": 26850 + }, + { + "epoch": 1.824636499524392, + "grad_norm": 0.8613458871841431, + "learning_rate": 0.000772013860578883, + "loss": 3.6691, + "step": 26855 + }, + { + "epoch": 1.8249762195950536, + "grad_norm": 1.8778021335601807, + "learning_rate": 0.0007719713955700502, + "loss": 3.4345, + "step": 26860 + }, + { + "epoch": 1.8253159396657155, + "grad_norm": 0.7421349883079529, + "learning_rate": 0.0007719289305612176, + "loss": 3.6881, + "step": 26865 + }, + { + "epoch": 1.8256556597363773, + "grad_norm": 0.9380187392234802, + "learning_rate": 0.0007718864655523849, + "loss": 3.6907, + "step": 26870 + }, + { + "epoch": 1.825995379807039, + "grad_norm": 0.9831246137619019, + "learning_rate": 0.0007718440005435521, + "loss": 3.9203, + "step": 26875 + }, + { + "epoch": 1.8263350998777008, + "grad_norm": 1.0366486310958862, + "learning_rate": 0.0007718015355347195, + "loss": 3.6726, + "step": 26880 + }, + { + "epoch": 1.8266748199483627, + "grad_norm": 0.7648555636405945, + "learning_rate": 0.0007717590705258867, + "loss": 3.5017, + "step": 26885 + }, + { + "epoch": 1.8270145400190243, + "grad_norm": 0.9651010036468506, + "learning_rate": 0.0007717166055170539, + "loss": 3.5889, + "step": 26890 + }, + { + "epoch": 1.827354260089686, + "grad_norm": 0.7906127572059631, + "learning_rate": 0.0007716741405082212, + "loss": 3.5536, + "step": 26895 + }, + { + "epoch": 1.827693980160348, + "grad_norm": 1.0769058465957642, + "learning_rate": 0.0007716316754993885, + "loss": 3.4639, + "step": 26900 + }, + { + "epoch": 1.8280337002310096, + "grad_norm": 0.9637919664382935, + "learning_rate": 0.0007715892104905558, + "loss": 3.5675, + "step": 26905 + }, + { + "epoch": 1.8283734203016713, + "grad_norm": 0.7984966039657593, + "learning_rate": 0.0007715467454817231, + "loss": 3.4677, + "step": 26910 + }, + { + "epoch": 1.8287131403723333, + "grad_norm": 0.9415756464004517, + "learning_rate": 0.0007715042804728904, + "loss": 3.7333, + "step": 26915 + }, + { + "epoch": 1.829052860442995, + "grad_norm": 0.90523362159729, + "learning_rate": 0.0007714618154640576, + "loss": 3.4839, + "step": 26920 + }, + { + "epoch": 1.8293925805136566, + "grad_norm": 1.0536646842956543, + "learning_rate": 0.0007714193504552249, + "loss": 3.4902, + "step": 26925 + }, + { + "epoch": 1.8297323005843187, + "grad_norm": 0.830269992351532, + "learning_rate": 0.0007713768854463921, + "loss": 3.6786, + "step": 26930 + }, + { + "epoch": 1.8300720206549803, + "grad_norm": 0.6591151356697083, + "learning_rate": 0.0007713344204375594, + "loss": 3.7196, + "step": 26935 + }, + { + "epoch": 1.830411740725642, + "grad_norm": 5.534862518310547, + "learning_rate": 0.0007712919554287268, + "loss": 3.5091, + "step": 26940 + }, + { + "epoch": 1.8307514607963038, + "grad_norm": 0.823306679725647, + "learning_rate": 0.000771249490419894, + "loss": 3.4781, + "step": 26945 + }, + { + "epoch": 1.8310911808669657, + "grad_norm": 0.8514467477798462, + "learning_rate": 0.0007712070254110613, + "loss": 3.3667, + "step": 26950 + }, + { + "epoch": 1.8314309009376273, + "grad_norm": 1.0357410907745361, + "learning_rate": 0.0007711645604022286, + "loss": 3.5559, + "step": 26955 + }, + { + "epoch": 1.8317706210082891, + "grad_norm": 0.7448777556419373, + "learning_rate": 0.0007711220953933958, + "loss": 3.6234, + "step": 26960 + }, + { + "epoch": 1.832110341078951, + "grad_norm": 0.7908376455307007, + "learning_rate": 0.0007710796303845632, + "loss": 3.4287, + "step": 26965 + }, + { + "epoch": 1.8324500611496126, + "grad_norm": 0.8904210329055786, + "learning_rate": 0.0007710371653757304, + "loss": 3.6465, + "step": 26970 + }, + { + "epoch": 1.8327897812202745, + "grad_norm": 0.9654533267021179, + "learning_rate": 0.0007709947003668977, + "loss": 3.9034, + "step": 26975 + }, + { + "epoch": 1.8331295012909363, + "grad_norm": 0.6749300956726074, + "learning_rate": 0.0007709522353580651, + "loss": 3.6212, + "step": 26980 + }, + { + "epoch": 1.833469221361598, + "grad_norm": 0.8815255761146545, + "learning_rate": 0.0007709097703492323, + "loss": 3.481, + "step": 26985 + }, + { + "epoch": 1.8338089414322598, + "grad_norm": 0.8119232058525085, + "learning_rate": 0.0007708673053403995, + "loss": 3.5187, + "step": 26990 + }, + { + "epoch": 1.8341486615029217, + "grad_norm": 0.832966685295105, + "learning_rate": 0.0007708248403315668, + "loss": 3.283, + "step": 26995 + }, + { + "epoch": 1.8344883815735833, + "grad_norm": 0.9243209362030029, + "learning_rate": 0.0007707823753227341, + "loss": 3.6813, + "step": 27000 + }, + { + "epoch": 1.8348281016442451, + "grad_norm": 0.9584366083145142, + "learning_rate": 0.0007707399103139013, + "loss": 3.3757, + "step": 27005 + }, + { + "epoch": 1.835167821714907, + "grad_norm": 1.0503939390182495, + "learning_rate": 0.0007706974453050687, + "loss": 3.5097, + "step": 27010 + }, + { + "epoch": 1.8355075417855686, + "grad_norm": 0.7203464508056641, + "learning_rate": 0.000770654980296236, + "loss": 3.5703, + "step": 27015 + }, + { + "epoch": 1.8358472618562305, + "grad_norm": 0.9157356023788452, + "learning_rate": 0.0007706125152874032, + "loss": 3.6318, + "step": 27020 + }, + { + "epoch": 1.8361869819268923, + "grad_norm": 1.0203956365585327, + "learning_rate": 0.0007705700502785705, + "loss": 3.6809, + "step": 27025 + }, + { + "epoch": 1.836526701997554, + "grad_norm": 1.1631085872650146, + "learning_rate": 0.0007705275852697377, + "loss": 3.4144, + "step": 27030 + }, + { + "epoch": 1.8368664220682158, + "grad_norm": 0.6900688409805298, + "learning_rate": 0.000770485120260905, + "loss": 3.6377, + "step": 27035 + }, + { + "epoch": 1.8372061421388777, + "grad_norm": 0.7896433472633362, + "learning_rate": 0.0007704426552520723, + "loss": 3.2703, + "step": 27040 + }, + { + "epoch": 1.8375458622095393, + "grad_norm": 2.1808743476867676, + "learning_rate": 0.0007704001902432396, + "loss": 3.5328, + "step": 27045 + }, + { + "epoch": 1.8378855822802012, + "grad_norm": 0.7439380884170532, + "learning_rate": 0.0007703577252344069, + "loss": 3.6576, + "step": 27050 + }, + { + "epoch": 1.838225302350863, + "grad_norm": 0.8591758012771606, + "learning_rate": 0.0007703152602255742, + "loss": 3.5636, + "step": 27055 + }, + { + "epoch": 1.8385650224215246, + "grad_norm": 1.3349450826644897, + "learning_rate": 0.0007702727952167414, + "loss": 3.4203, + "step": 27060 + }, + { + "epoch": 1.8389047424921863, + "grad_norm": 1.4784553050994873, + "learning_rate": 0.0007702303302079087, + "loss": 3.6749, + "step": 27065 + }, + { + "epoch": 1.8392444625628483, + "grad_norm": 0.9203925728797913, + "learning_rate": 0.000770187865199076, + "loss": 3.6148, + "step": 27070 + }, + { + "epoch": 1.83958418263351, + "grad_norm": 1.0382524728775024, + "learning_rate": 0.0007701454001902432, + "loss": 3.5962, + "step": 27075 + }, + { + "epoch": 1.8399239027041716, + "grad_norm": 1.0370577573776245, + "learning_rate": 0.0007701029351814105, + "loss": 3.6453, + "step": 27080 + }, + { + "epoch": 1.8402636227748337, + "grad_norm": 1.1736681461334229, + "learning_rate": 0.0007700604701725779, + "loss": 3.5342, + "step": 27085 + }, + { + "epoch": 1.8406033428454953, + "grad_norm": 0.9471827745437622, + "learning_rate": 0.0007700180051637451, + "loss": 3.6021, + "step": 27090 + }, + { + "epoch": 1.840943062916157, + "grad_norm": 0.8051036596298218, + "learning_rate": 0.0007699755401549123, + "loss": 3.4018, + "step": 27095 + }, + { + "epoch": 1.841282782986819, + "grad_norm": 0.9669556021690369, + "learning_rate": 0.0007699330751460797, + "loss": 3.561, + "step": 27100 + }, + { + "epoch": 1.8416225030574807, + "grad_norm": 0.8881266117095947, + "learning_rate": 0.0007698906101372469, + "loss": 3.5205, + "step": 27105 + }, + { + "epoch": 1.8419622231281423, + "grad_norm": 0.8495059013366699, + "learning_rate": 0.0007698481451284141, + "loss": 3.779, + "step": 27110 + }, + { + "epoch": 1.8423019431988041, + "grad_norm": 0.8556120991706848, + "learning_rate": 0.0007698056801195816, + "loss": 3.3807, + "step": 27115 + }, + { + "epoch": 1.842641663269466, + "grad_norm": 0.9349868297576904, + "learning_rate": 0.0007697632151107488, + "loss": 3.7318, + "step": 27120 + }, + { + "epoch": 1.8429813833401276, + "grad_norm": 0.9753512144088745, + "learning_rate": 0.000769720750101916, + "loss": 3.6382, + "step": 27125 + }, + { + "epoch": 1.8433211034107895, + "grad_norm": 0.8518068790435791, + "learning_rate": 0.0007696782850930833, + "loss": 3.7017, + "step": 27130 + }, + { + "epoch": 1.8436608234814513, + "grad_norm": 0.7665545344352722, + "learning_rate": 0.0007696358200842506, + "loss": 3.6764, + "step": 27135 + }, + { + "epoch": 1.844000543552113, + "grad_norm": 0.7952761650085449, + "learning_rate": 0.0007695933550754178, + "loss": 3.6953, + "step": 27140 + }, + { + "epoch": 1.8443402636227748, + "grad_norm": 0.7165946364402771, + "learning_rate": 0.0007695508900665851, + "loss": 3.3762, + "step": 27145 + }, + { + "epoch": 1.8446799836934367, + "grad_norm": 1.0380585193634033, + "learning_rate": 0.0007695084250577525, + "loss": 3.7044, + "step": 27150 + }, + { + "epoch": 1.8450197037640983, + "grad_norm": 1.0134998559951782, + "learning_rate": 0.0007694659600489197, + "loss": 3.7266, + "step": 27155 + }, + { + "epoch": 1.8453594238347601, + "grad_norm": 0.8726839423179626, + "learning_rate": 0.000769423495040087, + "loss": 3.6484, + "step": 27160 + }, + { + "epoch": 1.845699143905422, + "grad_norm": 1.2961808443069458, + "learning_rate": 0.0007693810300312543, + "loss": 3.2684, + "step": 27165 + }, + { + "epoch": 1.8460388639760836, + "grad_norm": 0.8979014158248901, + "learning_rate": 0.0007693385650224215, + "loss": 3.5745, + "step": 27170 + }, + { + "epoch": 1.8463785840467455, + "grad_norm": 0.9290340542793274, + "learning_rate": 0.0007692961000135888, + "loss": 3.8126, + "step": 27175 + }, + { + "epoch": 1.8467183041174073, + "grad_norm": 1.0253323316574097, + "learning_rate": 0.000769253635004756, + "loss": 3.9067, + "step": 27180 + }, + { + "epoch": 1.847058024188069, + "grad_norm": 1.0064963102340698, + "learning_rate": 0.0007692111699959234, + "loss": 3.6941, + "step": 27185 + }, + { + "epoch": 1.8473977442587308, + "grad_norm": 0.8827007412910461, + "learning_rate": 0.0007691687049870907, + "loss": 3.8907, + "step": 27190 + }, + { + "epoch": 1.8477374643293927, + "grad_norm": 18.345251083374023, + "learning_rate": 0.0007691262399782579, + "loss": 3.8096, + "step": 27195 + }, + { + "epoch": 1.8480771844000543, + "grad_norm": 0.6654131412506104, + "learning_rate": 0.0007690837749694252, + "loss": 3.5784, + "step": 27200 + }, + { + "epoch": 1.8484169044707162, + "grad_norm": 0.6710897088050842, + "learning_rate": 0.0007690413099605925, + "loss": 3.8796, + "step": 27205 + }, + { + "epoch": 1.848756624541378, + "grad_norm": 0.7632102370262146, + "learning_rate": 0.0007689988449517597, + "loss": 3.5892, + "step": 27210 + }, + { + "epoch": 1.8490963446120396, + "grad_norm": 0.7668070793151855, + "learning_rate": 0.000768956379942927, + "loss": 3.3721, + "step": 27215 + }, + { + "epoch": 1.8494360646827015, + "grad_norm": 0.8165104985237122, + "learning_rate": 0.0007689139149340944, + "loss": 3.8672, + "step": 27220 + }, + { + "epoch": 1.8497757847533634, + "grad_norm": 1.1374372243881226, + "learning_rate": 0.0007688714499252616, + "loss": 3.4137, + "step": 27225 + }, + { + "epoch": 1.850115504824025, + "grad_norm": 2.062495231628418, + "learning_rate": 0.0007688289849164288, + "loss": 3.4429, + "step": 27230 + }, + { + "epoch": 1.8504552248946866, + "grad_norm": 0.7371671795845032, + "learning_rate": 0.0007687865199075962, + "loss": 3.6695, + "step": 27235 + }, + { + "epoch": 1.8507949449653487, + "grad_norm": 1.1008405685424805, + "learning_rate": 0.0007687440548987634, + "loss": 3.6262, + "step": 27240 + }, + { + "epoch": 1.8511346650360103, + "grad_norm": 1.5753989219665527, + "learning_rate": 0.0007687015898899306, + "loss": 3.6887, + "step": 27245 + }, + { + "epoch": 1.851474385106672, + "grad_norm": 1.0408676862716675, + "learning_rate": 0.000768659124881098, + "loss": 3.568, + "step": 27250 + }, + { + "epoch": 1.851814105177334, + "grad_norm": 1.2493672370910645, + "learning_rate": 0.0007686166598722653, + "loss": 3.5386, + "step": 27255 + }, + { + "epoch": 1.8521538252479957, + "grad_norm": 1.1701287031173706, + "learning_rate": 0.0007685741948634325, + "loss": 3.6878, + "step": 27260 + }, + { + "epoch": 1.8524935453186573, + "grad_norm": 1.1777138710021973, + "learning_rate": 0.0007685317298545999, + "loss": 3.7883, + "step": 27265 + }, + { + "epoch": 1.8528332653893194, + "grad_norm": 1.0718438625335693, + "learning_rate": 0.0007684892648457671, + "loss": 3.7038, + "step": 27270 + }, + { + "epoch": 1.853172985459981, + "grad_norm": 0.8830891251564026, + "learning_rate": 0.0007684467998369343, + "loss": 3.9125, + "step": 27275 + }, + { + "epoch": 1.8535127055306426, + "grad_norm": 0.9009256362915039, + "learning_rate": 0.0007684043348281016, + "loss": 3.6332, + "step": 27280 + }, + { + "epoch": 1.8538524256013045, + "grad_norm": 1.0584313869476318, + "learning_rate": 0.0007683618698192689, + "loss": 3.6908, + "step": 27285 + }, + { + "epoch": 1.8541921456719663, + "grad_norm": 0.6448716521263123, + "learning_rate": 0.0007683194048104362, + "loss": 3.7519, + "step": 27290 + }, + { + "epoch": 1.854531865742628, + "grad_norm": 1.1291712522506714, + "learning_rate": 0.0007682769398016035, + "loss": 3.4856, + "step": 27295 + }, + { + "epoch": 1.8548715858132898, + "grad_norm": 1.8310078382492065, + "learning_rate": 0.0007682344747927708, + "loss": 3.6646, + "step": 27300 + }, + { + "epoch": 1.8552113058839517, + "grad_norm": 0.7827014923095703, + "learning_rate": 0.0007681920097839381, + "loss": 3.5769, + "step": 27305 + }, + { + "epoch": 1.8555510259546133, + "grad_norm": 1.0312530994415283, + "learning_rate": 0.0007681495447751053, + "loss": 3.5148, + "step": 27310 + }, + { + "epoch": 1.8558907460252752, + "grad_norm": 0.9965012073516846, + "learning_rate": 0.0007681070797662725, + "loss": 3.8302, + "step": 27315 + }, + { + "epoch": 1.856230466095937, + "grad_norm": 0.7794294357299805, + "learning_rate": 0.00076806461475744, + "loss": 3.4551, + "step": 27320 + }, + { + "epoch": 1.8565701861665986, + "grad_norm": 1.302919626235962, + "learning_rate": 0.0007680221497486072, + "loss": 3.4615, + "step": 27325 + }, + { + "epoch": 1.8569099062372605, + "grad_norm": 0.9832828044891357, + "learning_rate": 0.0007679796847397744, + "loss": 3.5093, + "step": 27330 + }, + { + "epoch": 1.8572496263079223, + "grad_norm": 0.9896733164787292, + "learning_rate": 0.0007679372197309418, + "loss": 3.4917, + "step": 27335 + }, + { + "epoch": 1.857589346378584, + "grad_norm": 0.8570865392684937, + "learning_rate": 0.000767894754722109, + "loss": 3.5777, + "step": 27340 + }, + { + "epoch": 1.8579290664492458, + "grad_norm": 1.2201696634292603, + "learning_rate": 0.0007678522897132762, + "loss": 3.4732, + "step": 27345 + }, + { + "epoch": 1.8582687865199077, + "grad_norm": 0.9998146891593933, + "learning_rate": 0.0007678098247044436, + "loss": 3.499, + "step": 27350 + }, + { + "epoch": 1.8586085065905693, + "grad_norm": 0.820268988609314, + "learning_rate": 0.0007677673596956109, + "loss": 3.6244, + "step": 27355 + }, + { + "epoch": 1.8589482266612312, + "grad_norm": 0.8104788661003113, + "learning_rate": 0.0007677248946867781, + "loss": 3.5397, + "step": 27360 + }, + { + "epoch": 1.859287946731893, + "grad_norm": 0.8055285811424255, + "learning_rate": 0.0007676824296779455, + "loss": 3.5206, + "step": 27365 + }, + { + "epoch": 1.8596276668025546, + "grad_norm": NaN, + "learning_rate": 0.0007676484576708792, + "loss": 3.4842, + "step": 27370 + }, + { + "epoch": 1.8599673868732165, + "grad_norm": 1.1112314462661743, + "learning_rate": 0.0007676059926620464, + "loss": 3.5435, + "step": 27375 + }, + { + "epoch": 1.8603071069438784, + "grad_norm": 0.8074025511741638, + "learning_rate": 0.0007675635276532138, + "loss": 3.4086, + "step": 27380 + }, + { + "epoch": 1.86064682701454, + "grad_norm": 0.8637706637382507, + "learning_rate": 0.0007675210626443811, + "loss": 3.5443, + "step": 27385 + }, + { + "epoch": 1.8609865470852018, + "grad_norm": 0.9935139417648315, + "learning_rate": 0.0007674785976355483, + "loss": 3.2928, + "step": 27390 + }, + { + "epoch": 1.8613262671558637, + "grad_norm": 0.7723684310913086, + "learning_rate": 0.0007674361326267156, + "loss": 3.7191, + "step": 27395 + }, + { + "epoch": 1.8616659872265253, + "grad_norm": 6.104430675506592, + "learning_rate": 0.0007673936676178829, + "loss": 3.8536, + "step": 27400 + }, + { + "epoch": 1.862005707297187, + "grad_norm": 1.015484094619751, + "learning_rate": 0.0007673512026090501, + "loss": 3.4166, + "step": 27405 + }, + { + "epoch": 1.862345427367849, + "grad_norm": 1.1504112482070923, + "learning_rate": 0.0007673087376002174, + "loss": 3.8142, + "step": 27410 + }, + { + "epoch": 1.8626851474385107, + "grad_norm": 1.0611931085586548, + "learning_rate": 0.0007672662725913847, + "loss": 3.8188, + "step": 27415 + }, + { + "epoch": 1.8630248675091723, + "grad_norm": 0.8033684492111206, + "learning_rate": 0.000767223807582552, + "loss": 3.7476, + "step": 27420 + }, + { + "epoch": 1.8633645875798344, + "grad_norm": 0.7787072658538818, + "learning_rate": 0.0007671813425737193, + "loss": 3.6295, + "step": 27425 + }, + { + "epoch": 1.863704307650496, + "grad_norm": 0.843275249004364, + "learning_rate": 0.0007671388775648865, + "loss": 3.7013, + "step": 27430 + }, + { + "epoch": 1.8640440277211576, + "grad_norm": 0.660118818283081, + "learning_rate": 0.0007670964125560538, + "loss": 3.6294, + "step": 27435 + }, + { + "epoch": 1.8643837477918197, + "grad_norm": 0.9897702932357788, + "learning_rate": 0.0007670539475472211, + "loss": 3.8567, + "step": 27440 + }, + { + "epoch": 1.8647234678624813, + "grad_norm": 0.8688026666641235, + "learning_rate": 0.0007670114825383883, + "loss": 3.5988, + "step": 27445 + }, + { + "epoch": 1.865063187933143, + "grad_norm": 1.024370551109314, + "learning_rate": 0.0007669690175295557, + "loss": 3.3907, + "step": 27450 + }, + { + "epoch": 1.8654029080038048, + "grad_norm": 0.8974940180778503, + "learning_rate": 0.000766926552520723, + "loss": 3.4386, + "step": 27455 + }, + { + "epoch": 1.8657426280744667, + "grad_norm": 0.7527058720588684, + "learning_rate": 0.0007668840875118902, + "loss": 3.5734, + "step": 27460 + }, + { + "epoch": 1.8660823481451283, + "grad_norm": 1.3199039697647095, + "learning_rate": 0.0007668416225030574, + "loss": 3.4076, + "step": 27465 + }, + { + "epoch": 1.8664220682157902, + "grad_norm": 1.119197130203247, + "learning_rate": 0.0007667991574942248, + "loss": 3.5359, + "step": 27470 + }, + { + "epoch": 1.866761788286452, + "grad_norm": 0.7454637885093689, + "learning_rate": 0.000766756692485392, + "loss": 3.3479, + "step": 27475 + }, + { + "epoch": 1.8671015083571136, + "grad_norm": 1.7604020833969116, + "learning_rate": 0.0007667142274765592, + "loss": 3.6872, + "step": 27480 + }, + { + "epoch": 1.8674412284277755, + "grad_norm": 0.8023717999458313, + "learning_rate": 0.0007666717624677267, + "loss": 3.4951, + "step": 27485 + }, + { + "epoch": 1.8677809484984373, + "grad_norm": 1.0189745426177979, + "learning_rate": 0.0007666292974588939, + "loss": 3.5444, + "step": 27490 + }, + { + "epoch": 1.868120668569099, + "grad_norm": 5.0081787109375, + "learning_rate": 0.0007665868324500611, + "loss": 3.8693, + "step": 27495 + }, + { + "epoch": 1.8684603886397608, + "grad_norm": 0.9334794878959656, + "learning_rate": 0.0007665443674412285, + "loss": 3.4669, + "step": 27500 + }, + { + "epoch": 1.8688001087104227, + "grad_norm": 0.9543081521987915, + "learning_rate": 0.0007665019024323957, + "loss": 3.7822, + "step": 27505 + }, + { + "epoch": 1.8691398287810843, + "grad_norm": 0.9998304843902588, + "learning_rate": 0.000766459437423563, + "loss": 3.3771, + "step": 27510 + }, + { + "epoch": 1.8694795488517462, + "grad_norm": 1.8321750164031982, + "learning_rate": 0.0007664169724147302, + "loss": 3.6232, + "step": 27515 + }, + { + "epoch": 1.869819268922408, + "grad_norm": 0.8738434910774231, + "learning_rate": 0.0007663745074058976, + "loss": 3.4218, + "step": 27520 + }, + { + "epoch": 1.8701589889930696, + "grad_norm": 0.8147904872894287, + "learning_rate": 0.0007663320423970649, + "loss": 3.5392, + "step": 27525 + }, + { + "epoch": 1.8704987090637315, + "grad_norm": 0.6396855115890503, + "learning_rate": 0.0007662895773882321, + "loss": 3.6253, + "step": 27530 + }, + { + "epoch": 1.8708384291343934, + "grad_norm": 0.9784702062606812, + "learning_rate": 0.0007662471123793994, + "loss": 3.2857, + "step": 27535 + }, + { + "epoch": 1.871178149205055, + "grad_norm": 0.8369816541671753, + "learning_rate": 0.0007662046473705667, + "loss": 3.8721, + "step": 27540 + }, + { + "epoch": 1.8715178692757168, + "grad_norm": 0.7046074867248535, + "learning_rate": 0.0007661621823617339, + "loss": 3.549, + "step": 27545 + }, + { + "epoch": 1.8718575893463787, + "grad_norm": 0.7716382145881653, + "learning_rate": 0.0007661197173529011, + "loss": 3.8418, + "step": 27550 + }, + { + "epoch": 1.8721973094170403, + "grad_norm": 0.8674455881118774, + "learning_rate": 0.0007660772523440686, + "loss": 3.5253, + "step": 27555 + }, + { + "epoch": 1.8725370294877022, + "grad_norm": 0.9572879672050476, + "learning_rate": 0.0007660347873352358, + "loss": 3.1832, + "step": 27560 + }, + { + "epoch": 1.872876749558364, + "grad_norm": 0.8367605209350586, + "learning_rate": 0.000765992322326403, + "loss": 3.69, + "step": 27565 + }, + { + "epoch": 1.8732164696290257, + "grad_norm": 0.722327470779419, + "learning_rate": 0.0007659498573175704, + "loss": 3.4492, + "step": 27570 + }, + { + "epoch": 1.8735561896996873, + "grad_norm": 0.9258478283882141, + "learning_rate": 0.0007659073923087376, + "loss": 3.703, + "step": 27575 + }, + { + "epoch": 1.8738959097703494, + "grad_norm": 1.066994547843933, + "learning_rate": 0.0007658649272999048, + "loss": 3.7245, + "step": 27580 + }, + { + "epoch": 1.874235629841011, + "grad_norm": 0.8795674443244934, + "learning_rate": 0.0007658224622910723, + "loss": 3.6018, + "step": 27585 + }, + { + "epoch": 1.8745753499116726, + "grad_norm": 0.8580011129379272, + "learning_rate": 0.0007657799972822395, + "loss": 3.4283, + "step": 27590 + }, + { + "epoch": 1.8749150699823347, + "grad_norm": 0.8461725115776062, + "learning_rate": 0.0007657375322734067, + "loss": 3.483, + "step": 27595 + }, + { + "epoch": 1.8752547900529963, + "grad_norm": 1.0798628330230713, + "learning_rate": 0.0007656950672645741, + "loss": 3.4781, + "step": 27600 + }, + { + "epoch": 1.875594510123658, + "grad_norm": 0.9481614828109741, + "learning_rate": 0.0007656526022557413, + "loss": 3.7577, + "step": 27605 + }, + { + "epoch": 1.87593423019432, + "grad_norm": 0.7476903200149536, + "learning_rate": 0.0007656101372469085, + "loss": 3.4276, + "step": 27610 + }, + { + "epoch": 1.8762739502649817, + "grad_norm": 0.783419132232666, + "learning_rate": 0.0007655676722380758, + "loss": 3.6972, + "step": 27615 + }, + { + "epoch": 1.8766136703356433, + "grad_norm": 0.7672784328460693, + "learning_rate": 0.0007655252072292432, + "loss": 3.5895, + "step": 27620 + }, + { + "epoch": 1.8769533904063052, + "grad_norm": 1.0037784576416016, + "learning_rate": 0.0007654827422204104, + "loss": 3.7439, + "step": 27625 + }, + { + "epoch": 1.877293110476967, + "grad_norm": 0.668411374092102, + "learning_rate": 0.0007654402772115777, + "loss": 3.5887, + "step": 27630 + }, + { + "epoch": 1.8776328305476286, + "grad_norm": 1.0647252798080444, + "learning_rate": 0.000765397812202745, + "loss": 3.2422, + "step": 27635 + }, + { + "epoch": 1.8779725506182905, + "grad_norm": 0.619929313659668, + "learning_rate": 0.0007653553471939122, + "loss": 3.6278, + "step": 27640 + }, + { + "epoch": 1.8783122706889523, + "grad_norm": 0.8431556224822998, + "learning_rate": 0.0007653128821850795, + "loss": 3.2922, + "step": 27645 + }, + { + "epoch": 1.878651990759614, + "grad_norm": 1.953548789024353, + "learning_rate": 0.0007652704171762468, + "loss": 3.4737, + "step": 27650 + }, + { + "epoch": 1.8789917108302758, + "grad_norm": 0.8076330423355103, + "learning_rate": 0.0007652279521674141, + "loss": 3.6275, + "step": 27655 + }, + { + "epoch": 1.8793314309009377, + "grad_norm": 0.709383487701416, + "learning_rate": 0.0007651854871585814, + "loss": 3.6091, + "step": 27660 + }, + { + "epoch": 1.8796711509715993, + "grad_norm": 0.9857257008552551, + "learning_rate": 0.0007651430221497486, + "loss": 3.7114, + "step": 27665 + }, + { + "epoch": 1.8800108710422612, + "grad_norm": 0.7714617252349854, + "learning_rate": 0.0007651005571409159, + "loss": 3.5688, + "step": 27670 + }, + { + "epoch": 1.880350591112923, + "grad_norm": 0.866457998752594, + "learning_rate": 0.0007650580921320832, + "loss": 3.4609, + "step": 27675 + }, + { + "epoch": 1.8806903111835847, + "grad_norm": 1.1325737237930298, + "learning_rate": 0.0007650156271232504, + "loss": 3.732, + "step": 27680 + }, + { + "epoch": 1.8810300312542465, + "grad_norm": 0.8003157377243042, + "learning_rate": 0.0007649731621144177, + "loss": 3.5352, + "step": 27685 + }, + { + "epoch": 1.8813697513249084, + "grad_norm": 0.6649025678634644, + "learning_rate": 0.0007649306971055851, + "loss": 3.5343, + "step": 27690 + }, + { + "epoch": 1.88170947139557, + "grad_norm": 0.9624032378196716, + "learning_rate": 0.0007648882320967523, + "loss": 3.7962, + "step": 27695 + }, + { + "epoch": 1.8820491914662318, + "grad_norm": 1.797417163848877, + "learning_rate": 0.0007648457670879196, + "loss": 3.4615, + "step": 27700 + }, + { + "epoch": 1.8823889115368937, + "grad_norm": 0.8794678449630737, + "learning_rate": 0.0007648033020790869, + "loss": 3.4939, + "step": 27705 + }, + { + "epoch": 1.8827286316075553, + "grad_norm": 0.8767163157463074, + "learning_rate": 0.0007647608370702541, + "loss": 3.573, + "step": 27710 + }, + { + "epoch": 1.8830683516782172, + "grad_norm": 0.7757917642593384, + "learning_rate": 0.0007647183720614213, + "loss": 3.6999, + "step": 27715 + }, + { + "epoch": 1.883408071748879, + "grad_norm": 0.8045429587364197, + "learning_rate": 0.0007646844000543553, + "loss": 3.5227, + "step": 27720 + }, + { + "epoch": 1.8837477918195407, + "grad_norm": 0.7878900766372681, + "learning_rate": 0.0007646419350455225, + "loss": 3.5927, + "step": 27725 + }, + { + "epoch": 1.8840875118902025, + "grad_norm": 1.1185532808303833, + "learning_rate": 0.0007645994700366898, + "loss": 3.5576, + "step": 27730 + }, + { + "epoch": 1.8844272319608644, + "grad_norm": 1.020034670829773, + "learning_rate": 0.0007645570050278571, + "loss": 3.6412, + "step": 27735 + }, + { + "epoch": 1.884766952031526, + "grad_norm": 1.3546710014343262, + "learning_rate": 0.0007645145400190243, + "loss": 3.6642, + "step": 27740 + }, + { + "epoch": 1.8851066721021879, + "grad_norm": 0.757754385471344, + "learning_rate": 0.0007644720750101916, + "loss": 3.4583, + "step": 27745 + }, + { + "epoch": 1.8854463921728497, + "grad_norm": 0.6539257764816284, + "learning_rate": 0.000764429610001359, + "loss": 3.4708, + "step": 27750 + }, + { + "epoch": 1.8857861122435113, + "grad_norm": 0.7370918989181519, + "learning_rate": 0.0007643871449925262, + "loss": 3.4611, + "step": 27755 + }, + { + "epoch": 1.886125832314173, + "grad_norm": 0.965022623538971, + "learning_rate": 0.0007643446799836935, + "loss": 3.7061, + "step": 27760 + }, + { + "epoch": 1.886465552384835, + "grad_norm": 0.8051489591598511, + "learning_rate": 0.0007643022149748607, + "loss": 3.5786, + "step": 27765 + }, + { + "epoch": 1.8868052724554967, + "grad_norm": 1.3533836603164673, + "learning_rate": 0.000764259749966028, + "loss": 3.7161, + "step": 27770 + }, + { + "epoch": 1.8871449925261583, + "grad_norm": 0.8475891351699829, + "learning_rate": 0.0007642172849571953, + "loss": 3.365, + "step": 27775 + }, + { + "epoch": 1.8874847125968204, + "grad_norm": 0.883865237236023, + "learning_rate": 0.0007641748199483625, + "loss": 3.429, + "step": 27780 + }, + { + "epoch": 1.887824432667482, + "grad_norm": 1.1754275560379028, + "learning_rate": 0.0007641323549395299, + "loss": 3.3995, + "step": 27785 + }, + { + "epoch": 1.8881641527381436, + "grad_norm": 0.8333796858787537, + "learning_rate": 0.0007640898899306972, + "loss": 3.7303, + "step": 27790 + }, + { + "epoch": 1.8885038728088055, + "grad_norm": 0.7219475507736206, + "learning_rate": 0.0007640474249218644, + "loss": 3.5494, + "step": 27795 + }, + { + "epoch": 1.8888435928794673, + "grad_norm": 0.8987503051757812, + "learning_rate": 0.0007640049599130316, + "loss": 3.5983, + "step": 27800 + }, + { + "epoch": 1.889183312950129, + "grad_norm": 0.8774315118789673, + "learning_rate": 0.000763962494904199, + "loss": 3.4999, + "step": 27805 + }, + { + "epoch": 1.8895230330207908, + "grad_norm": 0.8003666400909424, + "learning_rate": 0.0007639200298953662, + "loss": 3.8253, + "step": 27810 + }, + { + "epoch": 1.8898627530914527, + "grad_norm": 1.0031784772872925, + "learning_rate": 0.0007638775648865334, + "loss": 3.5024, + "step": 27815 + }, + { + "epoch": 1.8902024731621143, + "grad_norm": 0.7985211610794067, + "learning_rate": 0.0007638350998777009, + "loss": 3.4584, + "step": 27820 + }, + { + "epoch": 1.8905421932327762, + "grad_norm": 0.648682177066803, + "learning_rate": 0.0007637926348688681, + "loss": 3.5281, + "step": 27825 + }, + { + "epoch": 1.890881913303438, + "grad_norm": 1.053402304649353, + "learning_rate": 0.0007637501698600353, + "loss": 3.918, + "step": 27830 + }, + { + "epoch": 1.8912216333740997, + "grad_norm": 0.9003229141235352, + "learning_rate": 0.0007637077048512027, + "loss": 3.7667, + "step": 27835 + }, + { + "epoch": 1.8915613534447615, + "grad_norm": 0.7486752867698669, + "learning_rate": 0.0007636652398423699, + "loss": 3.3942, + "step": 27840 + }, + { + "epoch": 1.8919010735154234, + "grad_norm": 0.9213387370109558, + "learning_rate": 0.0007636227748335371, + "loss": 3.5184, + "step": 27845 + }, + { + "epoch": 1.892240793586085, + "grad_norm": 1.3836767673492432, + "learning_rate": 0.0007635803098247046, + "loss": 3.5293, + "step": 27850 + }, + { + "epoch": 1.8925805136567468, + "grad_norm": 1.9085369110107422, + "learning_rate": 0.0007635378448158718, + "loss": 3.9145, + "step": 27855 + }, + { + "epoch": 1.8929202337274087, + "grad_norm": 0.7079948782920837, + "learning_rate": 0.000763495379807039, + "loss": 3.7977, + "step": 27860 + }, + { + "epoch": 1.8932599537980703, + "grad_norm": 1.8549444675445557, + "learning_rate": 0.0007634529147982063, + "loss": 3.3867, + "step": 27865 + }, + { + "epoch": 1.8935996738687322, + "grad_norm": 1.1256998777389526, + "learning_rate": 0.0007634104497893736, + "loss": 3.8795, + "step": 27870 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.9983499646186829, + "learning_rate": 0.0007633679847805408, + "loss": 3.7596, + "step": 27875 + }, + { + "epoch": 1.8942791140100557, + "grad_norm": 0.8089243769645691, + "learning_rate": 0.0007633255197717081, + "loss": 3.6531, + "step": 27880 + }, + { + "epoch": 1.8946188340807175, + "grad_norm": 0.9009003043174744, + "learning_rate": 0.0007632830547628755, + "loss": 3.4236, + "step": 27885 + }, + { + "epoch": 1.8949585541513794, + "grad_norm": 0.7578535676002502, + "learning_rate": 0.0007632405897540427, + "loss": 3.5742, + "step": 27890 + }, + { + "epoch": 1.895298274222041, + "grad_norm": 0.8960786461830139, + "learning_rate": 0.00076319812474521, + "loss": 3.694, + "step": 27895 + }, + { + "epoch": 1.8956379942927029, + "grad_norm": 1.167737603187561, + "learning_rate": 0.0007631556597363772, + "loss": 3.6958, + "step": 27900 + }, + { + "epoch": 1.8959777143633647, + "grad_norm": 1.1819700002670288, + "learning_rate": 0.0007631131947275445, + "loss": 3.3991, + "step": 27905 + }, + { + "epoch": 1.8963174344340263, + "grad_norm": 0.8596386909484863, + "learning_rate": 0.0007630707297187118, + "loss": 3.6382, + "step": 27910 + }, + { + "epoch": 1.8966571545046882, + "grad_norm": 0.8243609070777893, + "learning_rate": 0.000763028264709879, + "loss": 3.5661, + "step": 27915 + }, + { + "epoch": 1.89699687457535, + "grad_norm": 0.8762713074684143, + "learning_rate": 0.0007629857997010464, + "loss": 3.6106, + "step": 27920 + }, + { + "epoch": 1.8973365946460117, + "grad_norm": 0.7971257567405701, + "learning_rate": 0.0007629433346922137, + "loss": 3.6189, + "step": 27925 + }, + { + "epoch": 1.8976763147166733, + "grad_norm": 0.7384485602378845, + "learning_rate": 0.0007629008696833809, + "loss": 3.6141, + "step": 27930 + }, + { + "epoch": 1.8980160347873354, + "grad_norm": 0.7726696133613586, + "learning_rate": 0.0007628584046745482, + "loss": 3.5965, + "step": 27935 + }, + { + "epoch": 1.898355754857997, + "grad_norm": 0.9663407206535339, + "learning_rate": 0.0007628159396657155, + "loss": 3.4731, + "step": 27940 + }, + { + "epoch": 1.8986954749286586, + "grad_norm": 0.8924596309661865, + "learning_rate": 0.0007627734746568827, + "loss": 3.3971, + "step": 27945 + }, + { + "epoch": 1.8990351949993207, + "grad_norm": 0.7326397895812988, + "learning_rate": 0.0007627310096480499, + "loss": 3.501, + "step": 27950 + }, + { + "epoch": 1.8993749150699824, + "grad_norm": 0.8314158320426941, + "learning_rate": 0.0007626885446392174, + "loss": 3.3113, + "step": 27955 + }, + { + "epoch": 1.899714635140644, + "grad_norm": 0.7577950954437256, + "learning_rate": 0.0007626460796303846, + "loss": 3.4507, + "step": 27960 + }, + { + "epoch": 1.9000543552113058, + "grad_norm": 1.203317642211914, + "learning_rate": 0.0007626036146215518, + "loss": 3.7072, + "step": 27965 + }, + { + "epoch": 1.9003940752819677, + "grad_norm": 1.5556076765060425, + "learning_rate": 0.0007625611496127192, + "loss": 3.7782, + "step": 27970 + }, + { + "epoch": 1.9007337953526293, + "grad_norm": 0.7718592286109924, + "learning_rate": 0.0007625186846038864, + "loss": 3.5797, + "step": 27975 + }, + { + "epoch": 1.9010735154232912, + "grad_norm": 0.7209183573722839, + "learning_rate": 0.0007624762195950536, + "loss": 3.6267, + "step": 27980 + }, + { + "epoch": 1.901413235493953, + "grad_norm": 0.8156195878982544, + "learning_rate": 0.000762433754586221, + "loss": 3.8976, + "step": 27985 + }, + { + "epoch": 1.9017529555646147, + "grad_norm": 1.2875349521636963, + "learning_rate": 0.0007623912895773883, + "loss": 3.5881, + "step": 27990 + }, + { + "epoch": 1.9020926756352765, + "grad_norm": 0.9651404619216919, + "learning_rate": 0.0007623488245685555, + "loss": 3.432, + "step": 27995 + }, + { + "epoch": 1.9024323957059384, + "grad_norm": 0.6976006031036377, + "learning_rate": 0.0007623063595597228, + "loss": 3.5836, + "step": 28000 + }, + { + "epoch": 1.9027721157766, + "grad_norm": 0.9313911199569702, + "learning_rate": 0.0007622638945508901, + "loss": 3.6827, + "step": 28005 + }, + { + "epoch": 1.9031118358472618, + "grad_norm": 0.8675135374069214, + "learning_rate": 0.0007622214295420573, + "loss": 3.8474, + "step": 28010 + }, + { + "epoch": 1.9034515559179237, + "grad_norm": 0.9648146033287048, + "learning_rate": 0.0007621789645332246, + "loss": 3.4511, + "step": 28015 + }, + { + "epoch": 1.9037912759885853, + "grad_norm": 0.8182967305183411, + "learning_rate": 0.0007621364995243919, + "loss": 3.6573, + "step": 28020 + }, + { + "epoch": 1.9041309960592472, + "grad_norm": 0.8030721545219421, + "learning_rate": 0.0007620940345155592, + "loss": 3.7019, + "step": 28025 + }, + { + "epoch": 1.904470716129909, + "grad_norm": 0.7500225901603699, + "learning_rate": 0.0007620515695067265, + "loss": 3.6617, + "step": 28030 + }, + { + "epoch": 1.9048104362005707, + "grad_norm": 0.956384539604187, + "learning_rate": 0.0007620091044978938, + "loss": 3.6276, + "step": 28035 + }, + { + "epoch": 1.9051501562712325, + "grad_norm": 1.283812165260315, + "learning_rate": 0.000761966639489061, + "loss": 3.5508, + "step": 28040 + }, + { + "epoch": 1.9054898763418944, + "grad_norm": 0.7799303531646729, + "learning_rate": 0.0007619241744802283, + "loss": 3.8565, + "step": 28045 + }, + { + "epoch": 1.905829596412556, + "grad_norm": 0.8875689506530762, + "learning_rate": 0.0007618817094713955, + "loss": 3.423, + "step": 28050 + }, + { + "epoch": 1.9061693164832179, + "grad_norm": 0.9127369523048401, + "learning_rate": 0.0007618392444625629, + "loss": 3.5347, + "step": 28055 + }, + { + "epoch": 1.9065090365538797, + "grad_norm": 0.7445546984672546, + "learning_rate": 0.0007617967794537302, + "loss": 3.7537, + "step": 28060 + }, + { + "epoch": 1.9068487566245413, + "grad_norm": 0.8386813402175903, + "learning_rate": 0.0007617543144448974, + "loss": 3.3695, + "step": 28065 + }, + { + "epoch": 1.9071884766952032, + "grad_norm": 0.7555525302886963, + "learning_rate": 0.0007617118494360648, + "loss": 3.4681, + "step": 28070 + }, + { + "epoch": 1.907528196765865, + "grad_norm": 0.8701010942459106, + "learning_rate": 0.000761669384427232, + "loss": 3.65, + "step": 28075 + }, + { + "epoch": 1.9078679168365267, + "grad_norm": 0.7222371101379395, + "learning_rate": 0.0007616269194183992, + "loss": 3.4947, + "step": 28080 + }, + { + "epoch": 1.9082076369071885, + "grad_norm": 1.0044844150543213, + "learning_rate": 0.0007615844544095666, + "loss": 3.5498, + "step": 28085 + }, + { + "epoch": 1.9085473569778504, + "grad_norm": 0.8536598682403564, + "learning_rate": 0.0007615419894007338, + "loss": 3.7966, + "step": 28090 + }, + { + "epoch": 1.908887077048512, + "grad_norm": 0.88126140832901, + "learning_rate": 0.0007614995243919011, + "loss": 3.485, + "step": 28095 + }, + { + "epoch": 1.9092267971191736, + "grad_norm": 0.9051271677017212, + "learning_rate": 0.0007614570593830684, + "loss": 3.4794, + "step": 28100 + }, + { + "epoch": 1.9095665171898357, + "grad_norm": 0.7351927757263184, + "learning_rate": 0.0007614145943742357, + "loss": 3.4719, + "step": 28105 + }, + { + "epoch": 1.9099062372604974, + "grad_norm": 2.731717109680176, + "learning_rate": 0.0007613721293654029, + "loss": 3.6463, + "step": 28110 + }, + { + "epoch": 1.910245957331159, + "grad_norm": 0.9370664954185486, + "learning_rate": 0.0007613296643565702, + "loss": 3.5827, + "step": 28115 + }, + { + "epoch": 1.910585677401821, + "grad_norm": 0.936790943145752, + "learning_rate": 0.0007612871993477375, + "loss": 3.7638, + "step": 28120 + }, + { + "epoch": 1.9109253974724827, + "grad_norm": 0.8203386664390564, + "learning_rate": 0.0007612447343389047, + "loss": 3.8316, + "step": 28125 + }, + { + "epoch": 1.9112651175431443, + "grad_norm": 1.0006448030471802, + "learning_rate": 0.0007612022693300721, + "loss": 3.595, + "step": 28130 + }, + { + "epoch": 1.9116048376138062, + "grad_norm": 1.1734522581100464, + "learning_rate": 0.0007611598043212394, + "loss": 3.3986, + "step": 28135 + }, + { + "epoch": 1.911944557684468, + "grad_norm": 1.2965720891952515, + "learning_rate": 0.0007611173393124066, + "loss": 3.9663, + "step": 28140 + }, + { + "epoch": 1.9122842777551297, + "grad_norm": 0.7741960883140564, + "learning_rate": 0.0007610748743035739, + "loss": 3.665, + "step": 28145 + }, + { + "epoch": 1.9126239978257915, + "grad_norm": 0.8326201438903809, + "learning_rate": 0.0007610324092947411, + "loss": 3.5344, + "step": 28150 + }, + { + "epoch": 1.9129637178964534, + "grad_norm": 0.907158374786377, + "learning_rate": 0.0007609899442859084, + "loss": 3.716, + "step": 28155 + }, + { + "epoch": 1.913303437967115, + "grad_norm": 1.0760447978973389, + "learning_rate": 0.0007609474792770757, + "loss": 3.6917, + "step": 28160 + }, + { + "epoch": 1.9136431580377768, + "grad_norm": 1.0809235572814941, + "learning_rate": 0.000760905014268243, + "loss": 3.8437, + "step": 28165 + }, + { + "epoch": 1.9139828781084387, + "grad_norm": 0.9321449398994446, + "learning_rate": 0.0007608625492594103, + "loss": 3.544, + "step": 28170 + }, + { + "epoch": 1.9143225981791003, + "grad_norm": 0.9697040915489197, + "learning_rate": 0.0007608200842505776, + "loss": 3.3572, + "step": 28175 + }, + { + "epoch": 1.9146623182497622, + "grad_norm": 0.9380744695663452, + "learning_rate": 0.0007607776192417448, + "loss": 3.7233, + "step": 28180 + }, + { + "epoch": 1.915002038320424, + "grad_norm": 0.8832533359527588, + "learning_rate": 0.000760735154232912, + "loss": 3.5574, + "step": 28185 + }, + { + "epoch": 1.9153417583910857, + "grad_norm": 0.9208546876907349, + "learning_rate": 0.0007606926892240794, + "loss": 3.6043, + "step": 28190 + }, + { + "epoch": 1.9156814784617475, + "grad_norm": 0.8165782690048218, + "learning_rate": 0.0007606502242152466, + "loss": 3.4423, + "step": 28195 + }, + { + "epoch": 1.9160211985324094, + "grad_norm": 0.9216737151145935, + "learning_rate": 0.0007606077592064139, + "loss": 4.0033, + "step": 28200 + }, + { + "epoch": 1.916360918603071, + "grad_norm": 0.959984540939331, + "learning_rate": 0.0007605652941975813, + "loss": 3.5908, + "step": 28205 + }, + { + "epoch": 1.9167006386737329, + "grad_norm": 0.653558075428009, + "learning_rate": 0.0007605228291887485, + "loss": 3.6503, + "step": 28210 + }, + { + "epoch": 1.9170403587443947, + "grad_norm": 0.8449308276176453, + "learning_rate": 0.0007604803641799157, + "loss": 3.4446, + "step": 28215 + }, + { + "epoch": 1.9173800788150563, + "grad_norm": 0.9120978116989136, + "learning_rate": 0.0007604378991710831, + "loss": 3.5164, + "step": 28220 + }, + { + "epoch": 1.9177197988857182, + "grad_norm": 0.9568759799003601, + "learning_rate": 0.0007603954341622503, + "loss": 3.3726, + "step": 28225 + }, + { + "epoch": 1.91805951895638, + "grad_norm": 0.9089421033859253, + "learning_rate": 0.0007603529691534175, + "loss": 3.2868, + "step": 28230 + }, + { + "epoch": 1.9183992390270417, + "grad_norm": 1.5515856742858887, + "learning_rate": 0.000760310504144585, + "loss": 3.6314, + "step": 28235 + }, + { + "epoch": 1.9187389590977035, + "grad_norm": 1.1558581590652466, + "learning_rate": 0.0007602680391357522, + "loss": 3.6875, + "step": 28240 + }, + { + "epoch": 1.9190786791683654, + "grad_norm": 0.910908043384552, + "learning_rate": 0.0007602255741269194, + "loss": 3.5129, + "step": 28245 + }, + { + "epoch": 1.919418399239027, + "grad_norm": 1.19758939743042, + "learning_rate": 0.0007601831091180867, + "loss": 3.6392, + "step": 28250 + }, + { + "epoch": 1.9197581193096889, + "grad_norm": 0.8769204020500183, + "learning_rate": 0.000760140644109254, + "loss": 3.7524, + "step": 28255 + }, + { + "epoch": 1.9200978393803507, + "grad_norm": 0.8514014482498169, + "learning_rate": 0.0007600981791004212, + "loss": 3.6222, + "step": 28260 + }, + { + "epoch": 1.9204375594510124, + "grad_norm": 1.179363489151001, + "learning_rate": 0.0007600557140915885, + "loss": 3.798, + "step": 28265 + }, + { + "epoch": 1.920777279521674, + "grad_norm": 1.0165294408798218, + "learning_rate": 0.0007600132490827559, + "loss": 3.6902, + "step": 28270 + }, + { + "epoch": 1.921116999592336, + "grad_norm": 0.8159063458442688, + "learning_rate": 0.0007599707840739231, + "loss": 3.6627, + "step": 28275 + }, + { + "epoch": 1.9214567196629977, + "grad_norm": 0.7959951758384705, + "learning_rate": 0.0007599283190650904, + "loss": 3.4288, + "step": 28280 + }, + { + "epoch": 1.9217964397336593, + "grad_norm": 0.8973066210746765, + "learning_rate": 0.0007598858540562576, + "loss": 3.6648, + "step": 28285 + }, + { + "epoch": 1.9221361598043214, + "grad_norm": 0.6868293285369873, + "learning_rate": 0.0007598433890474249, + "loss": 3.7099, + "step": 28290 + }, + { + "epoch": 1.922475879874983, + "grad_norm": 0.8255932331085205, + "learning_rate": 0.0007598009240385922, + "loss": 3.5467, + "step": 28295 + }, + { + "epoch": 1.9228155999456447, + "grad_norm": 0.7924712896347046, + "learning_rate": 0.0007597584590297594, + "loss": 3.7074, + "step": 28300 + }, + { + "epoch": 1.9231553200163065, + "grad_norm": 0.6859641671180725, + "learning_rate": 0.0007597159940209268, + "loss": 3.675, + "step": 28305 + }, + { + "epoch": 1.9234950400869684, + "grad_norm": 1.1323542594909668, + "learning_rate": 0.0007596735290120941, + "loss": 3.677, + "step": 28310 + }, + { + "epoch": 1.92383476015763, + "grad_norm": 0.7156279683113098, + "learning_rate": 0.0007596310640032613, + "loss": 3.82, + "step": 28315 + }, + { + "epoch": 1.9241744802282919, + "grad_norm": 0.7804394960403442, + "learning_rate": 0.0007595885989944286, + "loss": 3.6885, + "step": 28320 + }, + { + "epoch": 1.9245142002989537, + "grad_norm": 1.2960327863693237, + "learning_rate": 0.0007595461339855959, + "loss": 3.55, + "step": 28325 + }, + { + "epoch": 1.9248539203696153, + "grad_norm": 1.204235315322876, + "learning_rate": 0.0007595036689767631, + "loss": 3.5586, + "step": 28330 + }, + { + "epoch": 1.9251936404402772, + "grad_norm": 0.8421612977981567, + "learning_rate": 0.0007594612039679303, + "loss": 3.5717, + "step": 28335 + }, + { + "epoch": 1.925533360510939, + "grad_norm": 0.8517104387283325, + "learning_rate": 0.0007594187389590978, + "loss": 3.6382, + "step": 28340 + }, + { + "epoch": 1.9258730805816007, + "grad_norm": 0.8446228504180908, + "learning_rate": 0.000759376273950265, + "loss": 3.4962, + "step": 28345 + }, + { + "epoch": 1.9262128006522625, + "grad_norm": 1.0175116062164307, + "learning_rate": 0.0007593338089414322, + "loss": 3.4712, + "step": 28350 + }, + { + "epoch": 1.9265525207229244, + "grad_norm": 0.7851722836494446, + "learning_rate": 0.0007592913439325996, + "loss": 3.4358, + "step": 28355 + }, + { + "epoch": 1.926892240793586, + "grad_norm": 0.9572998881340027, + "learning_rate": 0.0007592488789237668, + "loss": 3.4253, + "step": 28360 + }, + { + "epoch": 1.9272319608642479, + "grad_norm": 1.1341079473495483, + "learning_rate": 0.000759206413914934, + "loss": 3.6541, + "step": 28365 + }, + { + "epoch": 1.9275716809349097, + "grad_norm": 0.7131646275520325, + "learning_rate": 0.0007591639489061015, + "loss": 3.4669, + "step": 28370 + }, + { + "epoch": 1.9279114010055713, + "grad_norm": 0.8406853079795837, + "learning_rate": 0.0007591214838972687, + "loss": 3.6058, + "step": 28375 + }, + { + "epoch": 1.9282511210762332, + "grad_norm": 1.0240042209625244, + "learning_rate": 0.0007590790188884359, + "loss": 3.6189, + "step": 28380 + }, + { + "epoch": 1.928590841146895, + "grad_norm": 1.9356498718261719, + "learning_rate": 0.0007590365538796032, + "loss": 3.3577, + "step": 28385 + }, + { + "epoch": 1.9289305612175567, + "grad_norm": 1.0498560667037964, + "learning_rate": 0.0007589940888707705, + "loss": 3.7382, + "step": 28390 + }, + { + "epoch": 1.9292702812882185, + "grad_norm": 0.7536966800689697, + "learning_rate": 0.0007589516238619378, + "loss": 3.6525, + "step": 28395 + }, + { + "epoch": 1.9296100013588804, + "grad_norm": 0.9722318053245544, + "learning_rate": 0.000758909158853105, + "loss": 3.6916, + "step": 28400 + }, + { + "epoch": 1.929949721429542, + "grad_norm": 0.9670264720916748, + "learning_rate": 0.0007588666938442724, + "loss": 3.6338, + "step": 28405 + }, + { + "epoch": 1.9302894415002039, + "grad_norm": 0.7388655543327332, + "learning_rate": 0.0007588242288354397, + "loss": 3.5572, + "step": 28410 + }, + { + "epoch": 1.9306291615708657, + "grad_norm": 1.1686363220214844, + "learning_rate": 0.0007587817638266069, + "loss": 3.2443, + "step": 28415 + }, + { + "epoch": 1.9309688816415274, + "grad_norm": 0.7899733185768127, + "learning_rate": 0.0007587392988177742, + "loss": 3.8086, + "step": 28420 + }, + { + "epoch": 1.9313086017121892, + "grad_norm": 0.8035762310028076, + "learning_rate": 0.0007586968338089415, + "loss": 3.5691, + "step": 28425 + }, + { + "epoch": 1.931648321782851, + "grad_norm": 0.8368618488311768, + "learning_rate": 0.0007586543688001087, + "loss": 3.5174, + "step": 28430 + }, + { + "epoch": 1.9319880418535127, + "grad_norm": 1.0983272790908813, + "learning_rate": 0.0007586119037912759, + "loss": 3.6339, + "step": 28435 + }, + { + "epoch": 1.9323277619241743, + "grad_norm": 0.826603889465332, + "learning_rate": 0.0007585694387824434, + "loss": 3.5481, + "step": 28440 + }, + { + "epoch": 1.9326674819948364, + "grad_norm": 0.7912547588348389, + "learning_rate": 0.0007585269737736106, + "loss": 3.5262, + "step": 28445 + }, + { + "epoch": 1.933007202065498, + "grad_norm": 0.9105021357536316, + "learning_rate": 0.0007584845087647778, + "loss": 3.74, + "step": 28450 + }, + { + "epoch": 1.9333469221361597, + "grad_norm": 0.7895652651786804, + "learning_rate": 0.0007584420437559452, + "loss": 3.6722, + "step": 28455 + }, + { + "epoch": 1.9336866422068217, + "grad_norm": 0.5935617685317993, + "learning_rate": 0.0007583995787471124, + "loss": 3.6901, + "step": 28460 + }, + { + "epoch": 1.9340263622774834, + "grad_norm": 0.9751099348068237, + "learning_rate": 0.0007583571137382796, + "loss": 3.6397, + "step": 28465 + }, + { + "epoch": 1.934366082348145, + "grad_norm": 0.7862982749938965, + "learning_rate": 0.000758314648729447, + "loss": 3.3809, + "step": 28470 + }, + { + "epoch": 1.9347058024188069, + "grad_norm": 1.1757220029830933, + "learning_rate": 0.0007582721837206143, + "loss": 3.7965, + "step": 28475 + }, + { + "epoch": 1.9350455224894687, + "grad_norm": 0.7361250519752502, + "learning_rate": 0.0007582297187117815, + "loss": 3.6647, + "step": 28480 + }, + { + "epoch": 1.9353852425601303, + "grad_norm": 1.0466033220291138, + "learning_rate": 0.0007581872537029489, + "loss": 3.5636, + "step": 28485 + }, + { + "epoch": 1.9357249626307922, + "grad_norm": 0.6589613556861877, + "learning_rate": 0.0007581447886941161, + "loss": 3.3865, + "step": 28490 + }, + { + "epoch": 1.936064682701454, + "grad_norm": 0.9935853481292725, + "learning_rate": 0.0007581023236852833, + "loss": 3.5879, + "step": 28495 + }, + { + "epoch": 1.9364044027721157, + "grad_norm": 1.3108460903167725, + "learning_rate": 0.0007580598586764506, + "loss": 3.6424, + "step": 28500 + }, + { + "epoch": 1.9367441228427775, + "grad_norm": 1.0185370445251465, + "learning_rate": 0.0007580173936676179, + "loss": 3.6673, + "step": 28505 + }, + { + "epoch": 1.9370838429134394, + "grad_norm": 0.7505614757537842, + "learning_rate": 0.0007579749286587852, + "loss": 3.4951, + "step": 28510 + }, + { + "epoch": 1.937423562984101, + "grad_norm": 2.6461355686187744, + "learning_rate": 0.0007579324636499525, + "loss": 3.6558, + "step": 28515 + }, + { + "epoch": 1.9377632830547629, + "grad_norm": 1.050493836402893, + "learning_rate": 0.0007578899986411198, + "loss": 3.5618, + "step": 28520 + }, + { + "epoch": 1.9381030031254247, + "grad_norm": 0.7938946485519409, + "learning_rate": 0.000757847533632287, + "loss": 3.6241, + "step": 28525 + }, + { + "epoch": 1.9384427231960863, + "grad_norm": 2.219087600708008, + "learning_rate": 0.0007578050686234543, + "loss": 3.6846, + "step": 28530 + }, + { + "epoch": 1.9387824432667482, + "grad_norm": 0.8250712752342224, + "learning_rate": 0.0007577626036146215, + "loss": 3.5897, + "step": 28535 + }, + { + "epoch": 1.93912216333741, + "grad_norm": 1.6027405261993408, + "learning_rate": 0.0007577201386057888, + "loss": 3.6831, + "step": 28540 + }, + { + "epoch": 1.9394618834080717, + "grad_norm": 0.8790163993835449, + "learning_rate": 0.0007576776735969562, + "loss": 3.5969, + "step": 28545 + }, + { + "epoch": 1.9398016034787335, + "grad_norm": 1.1722898483276367, + "learning_rate": 0.0007576352085881234, + "loss": 3.6523, + "step": 28550 + }, + { + "epoch": 1.9401413235493954, + "grad_norm": 0.7854223847389221, + "learning_rate": 0.0007575927435792907, + "loss": 3.5348, + "step": 28555 + }, + { + "epoch": 1.940481043620057, + "grad_norm": 0.8499945402145386, + "learning_rate": 0.000757550278570458, + "loss": 3.7712, + "step": 28560 + }, + { + "epoch": 1.9408207636907189, + "grad_norm": 0.6790933012962341, + "learning_rate": 0.0007575078135616252, + "loss": 3.4902, + "step": 28565 + }, + { + "epoch": 1.9411604837613807, + "grad_norm": 0.9359042644500732, + "learning_rate": 0.0007574653485527924, + "loss": 3.6354, + "step": 28570 + }, + { + "epoch": 1.9415002038320424, + "grad_norm": 0.9110603928565979, + "learning_rate": 0.0007574228835439598, + "loss": 3.4246, + "step": 28575 + }, + { + "epoch": 1.9418399239027042, + "grad_norm": 1.2070358991622925, + "learning_rate": 0.0007573804185351271, + "loss": 3.6399, + "step": 28580 + }, + { + "epoch": 1.942179643973366, + "grad_norm": 0.6821385622024536, + "learning_rate": 0.0007573379535262943, + "loss": 3.8456, + "step": 28585 + }, + { + "epoch": 1.9425193640440277, + "grad_norm": 0.8900517225265503, + "learning_rate": 0.0007572954885174617, + "loss": 3.6035, + "step": 28590 + }, + { + "epoch": 1.9428590841146895, + "grad_norm": 0.761317253112793, + "learning_rate": 0.0007572530235086289, + "loss": 3.513, + "step": 28595 + }, + { + "epoch": 1.9431988041853514, + "grad_norm": 0.9899647235870361, + "learning_rate": 0.0007572105584997961, + "loss": 3.2825, + "step": 28600 + }, + { + "epoch": 1.943538524256013, + "grad_norm": 0.967959463596344, + "learning_rate": 0.0007571680934909635, + "loss": 3.5915, + "step": 28605 + }, + { + "epoch": 1.9438782443266747, + "grad_norm": 1.0323562622070312, + "learning_rate": 0.0007571256284821307, + "loss": 3.456, + "step": 28610 + }, + { + "epoch": 1.9442179643973367, + "grad_norm": 1.1350847482681274, + "learning_rate": 0.000757083163473298, + "loss": 3.5779, + "step": 28615 + }, + { + "epoch": 1.9445576844679984, + "grad_norm": 1.1135549545288086, + "learning_rate": 0.0007570406984644654, + "loss": 3.1142, + "step": 28620 + }, + { + "epoch": 1.94489740453866, + "grad_norm": 0.8063178658485413, + "learning_rate": 0.0007569982334556326, + "loss": 3.491, + "step": 28625 + }, + { + "epoch": 1.945237124609322, + "grad_norm": 0.7653905749320984, + "learning_rate": 0.0007569557684467998, + "loss": 3.5696, + "step": 28630 + }, + { + "epoch": 1.9455768446799837, + "grad_norm": 0.8625290393829346, + "learning_rate": 0.0007569133034379671, + "loss": 3.536, + "step": 28635 + }, + { + "epoch": 1.9459165647506453, + "grad_norm": 0.6678442358970642, + "learning_rate": 0.0007568708384291344, + "loss": 3.4787, + "step": 28640 + }, + { + "epoch": 1.9462562848213072, + "grad_norm": 0.8058485388755798, + "learning_rate": 0.0007568283734203016, + "loss": 3.5694, + "step": 28645 + }, + { + "epoch": 1.946596004891969, + "grad_norm": 1.0442901849746704, + "learning_rate": 0.000756785908411469, + "loss": 3.2765, + "step": 28650 + }, + { + "epoch": 1.9469357249626307, + "grad_norm": 1.5623081922531128, + "learning_rate": 0.0007567434434026363, + "loss": 3.7357, + "step": 28655 + }, + { + "epoch": 1.9472754450332925, + "grad_norm": 1.6316235065460205, + "learning_rate": 0.0007567009783938035, + "loss": 3.8206, + "step": 28660 + }, + { + "epoch": 1.9476151651039544, + "grad_norm": 1.0365331172943115, + "learning_rate": 0.0007566585133849708, + "loss": 3.326, + "step": 28665 + }, + { + "epoch": 1.947954885174616, + "grad_norm": 1.2335442304611206, + "learning_rate": 0.000756616048376138, + "loss": 3.5317, + "step": 28670 + }, + { + "epoch": 1.9482946052452779, + "grad_norm": 1.2205390930175781, + "learning_rate": 0.0007565735833673053, + "loss": 3.3949, + "step": 28675 + }, + { + "epoch": 1.9486343253159397, + "grad_norm": 0.7768918871879578, + "learning_rate": 0.0007565311183584726, + "loss": 3.7775, + "step": 28680 + }, + { + "epoch": 1.9489740453866014, + "grad_norm": 1.0013525485992432, + "learning_rate": 0.0007564886533496399, + "loss": 3.7781, + "step": 28685 + }, + { + "epoch": 1.9493137654572632, + "grad_norm": 0.779225766658783, + "learning_rate": 0.0007564461883408072, + "loss": 3.6278, + "step": 28690 + }, + { + "epoch": 1.949653485527925, + "grad_norm": 0.8145430684089661, + "learning_rate": 0.0007564037233319745, + "loss": 3.7267, + "step": 28695 + }, + { + "epoch": 1.9499932055985867, + "grad_norm": 0.9002009630203247, + "learning_rate": 0.0007563612583231417, + "loss": 3.5814, + "step": 28700 + }, + { + "epoch": 1.9503329256692485, + "grad_norm": 1.0095949172973633, + "learning_rate": 0.000756318793314309, + "loss": 3.5101, + "step": 28705 + }, + { + "epoch": 1.9506726457399104, + "grad_norm": 1.0866632461547852, + "learning_rate": 0.0007562763283054763, + "loss": 3.772, + "step": 28710 + }, + { + "epoch": 1.951012365810572, + "grad_norm": 0.8887550234794617, + "learning_rate": 0.0007562338632966435, + "loss": 3.6542, + "step": 28715 + }, + { + "epoch": 1.9513520858812339, + "grad_norm": 2.12438702583313, + "learning_rate": 0.0007561913982878109, + "loss": 3.5663, + "step": 28720 + }, + { + "epoch": 1.9516918059518957, + "grad_norm": 0.8766905665397644, + "learning_rate": 0.0007561489332789782, + "loss": 3.6685, + "step": 28725 + }, + { + "epoch": 1.9520315260225574, + "grad_norm": 1.0125778913497925, + "learning_rate": 0.0007561064682701454, + "loss": 3.6829, + "step": 28730 + }, + { + "epoch": 1.9523712460932192, + "grad_norm": 0.9026830196380615, + "learning_rate": 0.0007560640032613127, + "loss": 3.7663, + "step": 28735 + }, + { + "epoch": 1.952710966163881, + "grad_norm": 0.8322377800941467, + "learning_rate": 0.00075602153825248, + "loss": 3.6963, + "step": 28740 + }, + { + "epoch": 1.9530506862345427, + "grad_norm": 1.236722707748413, + "learning_rate": 0.0007559790732436472, + "loss": 3.8049, + "step": 28745 + }, + { + "epoch": 1.9533904063052046, + "grad_norm": 0.8033831119537354, + "learning_rate": 0.0007559366082348145, + "loss": 3.5636, + "step": 28750 + }, + { + "epoch": 1.9537301263758664, + "grad_norm": 0.9286559224128723, + "learning_rate": 0.0007558941432259819, + "loss": 3.6784, + "step": 28755 + }, + { + "epoch": 1.954069846446528, + "grad_norm": 0.8716838955879211, + "learning_rate": 0.0007558516782171491, + "loss": 3.704, + "step": 28760 + }, + { + "epoch": 1.95440956651719, + "grad_norm": 1.178750991821289, + "learning_rate": 0.0007558092132083164, + "loss": 3.7511, + "step": 28765 + }, + { + "epoch": 1.9547492865878517, + "grad_norm": 1.0065803527832031, + "learning_rate": 0.0007557667481994837, + "loss": 3.7132, + "step": 28770 + }, + { + "epoch": 1.9550890066585134, + "grad_norm": 0.9018929600715637, + "learning_rate": 0.0007557242831906509, + "loss": 3.5592, + "step": 28775 + }, + { + "epoch": 1.955428726729175, + "grad_norm": 0.7369958758354187, + "learning_rate": 0.0007556818181818182, + "loss": 3.6946, + "step": 28780 + }, + { + "epoch": 1.955768446799837, + "grad_norm": 1.2520877122879028, + "learning_rate": 0.0007556393531729854, + "loss": 3.5333, + "step": 28785 + }, + { + "epoch": 1.9561081668704987, + "grad_norm": 0.736051082611084, + "learning_rate": 0.0007555968881641528, + "loss": 3.8152, + "step": 28790 + }, + { + "epoch": 1.9564478869411603, + "grad_norm": 0.9063373804092407, + "learning_rate": 0.0007555544231553201, + "loss": 3.5329, + "step": 28795 + }, + { + "epoch": 1.9567876070118224, + "grad_norm": 0.7022332549095154, + "learning_rate": 0.0007555119581464873, + "loss": 3.5942, + "step": 28800 + }, + { + "epoch": 1.957127327082484, + "grad_norm": 0.9597944021224976, + "learning_rate": 0.0007554694931376546, + "loss": 3.3211, + "step": 28805 + }, + { + "epoch": 1.9574670471531457, + "grad_norm": 0.9514861702919006, + "learning_rate": 0.0007554270281288219, + "loss": 3.6471, + "step": 28810 + }, + { + "epoch": 1.9578067672238075, + "grad_norm": 0.9152568578720093, + "learning_rate": 0.0007553845631199891, + "loss": 3.5983, + "step": 28815 + }, + { + "epoch": 1.9581464872944694, + "grad_norm": 0.8795842528343201, + "learning_rate": 0.0007553420981111563, + "loss": 3.6813, + "step": 28820 + }, + { + "epoch": 1.958486207365131, + "grad_norm": 0.734053373336792, + "learning_rate": 0.0007552996331023238, + "loss": 3.5284, + "step": 28825 + }, + { + "epoch": 1.9588259274357929, + "grad_norm": 1.0962954759597778, + "learning_rate": 0.000755257168093491, + "loss": 3.3742, + "step": 28830 + }, + { + "epoch": 1.9591656475064547, + "grad_norm": 1.0369884967803955, + "learning_rate": 0.0007552147030846582, + "loss": 3.385, + "step": 28835 + }, + { + "epoch": 1.9595053675771164, + "grad_norm": 0.7486779689788818, + "learning_rate": 0.0007551722380758256, + "loss": 3.6969, + "step": 28840 + }, + { + "epoch": 1.9598450876477782, + "grad_norm": 0.8494892716407776, + "learning_rate": 0.0007551297730669928, + "loss": 3.6635, + "step": 28845 + }, + { + "epoch": 1.96018480771844, + "grad_norm": 1.163935661315918, + "learning_rate": 0.00075508730805816, + "loss": 3.6166, + "step": 28850 + }, + { + "epoch": 1.9605245277891017, + "grad_norm": 0.7256469130516052, + "learning_rate": 0.0007550448430493274, + "loss": 3.5665, + "step": 28855 + }, + { + "epoch": 1.9608642478597635, + "grad_norm": 0.8635443449020386, + "learning_rate": 0.0007550023780404947, + "loss": 3.7842, + "step": 28860 + }, + { + "epoch": 1.9612039679304254, + "grad_norm": 1.3402035236358643, + "learning_rate": 0.0007549599130316619, + "loss": 3.6014, + "step": 28865 + }, + { + "epoch": 1.961543688001087, + "grad_norm": 0.9227058291435242, + "learning_rate": 0.0007549174480228293, + "loss": 3.5164, + "step": 28870 + }, + { + "epoch": 1.9618834080717489, + "grad_norm": 0.9014881253242493, + "learning_rate": 0.0007548749830139965, + "loss": 3.7389, + "step": 28875 + }, + { + "epoch": 1.9622231281424107, + "grad_norm": 0.6790865063667297, + "learning_rate": 0.0007548325180051637, + "loss": 3.5728, + "step": 28880 + }, + { + "epoch": 1.9625628482130724, + "grad_norm": 0.876067042350769, + "learning_rate": 0.000754790052996331, + "loss": 3.3877, + "step": 28885 + }, + { + "epoch": 1.9629025682837342, + "grad_norm": 0.7319326400756836, + "learning_rate": 0.0007547475879874983, + "loss": 3.5865, + "step": 28890 + }, + { + "epoch": 1.963242288354396, + "grad_norm": 0.9380958676338196, + "learning_rate": 0.0007547051229786656, + "loss": 3.5476, + "step": 28895 + }, + { + "epoch": 1.9635820084250577, + "grad_norm": 0.9331657886505127, + "learning_rate": 0.0007546626579698329, + "loss": 3.4791, + "step": 28900 + }, + { + "epoch": 1.9639217284957196, + "grad_norm": 2.552316665649414, + "learning_rate": 0.0007546201929610002, + "loss": 3.7155, + "step": 28905 + }, + { + "epoch": 1.9642614485663814, + "grad_norm": 0.8106240630149841, + "learning_rate": 0.0007545777279521674, + "loss": 3.4073, + "step": 28910 + }, + { + "epoch": 1.964601168637043, + "grad_norm": 0.8056961894035339, + "learning_rate": 0.0007545352629433347, + "loss": 3.4655, + "step": 28915 + }, + { + "epoch": 1.964940888707705, + "grad_norm": 0.8191368579864502, + "learning_rate": 0.000754492797934502, + "loss": 3.7548, + "step": 28920 + }, + { + "epoch": 1.9652806087783667, + "grad_norm": 0.8820658326148987, + "learning_rate": 0.0007544503329256692, + "loss": 3.5204, + "step": 28925 + }, + { + "epoch": 1.9656203288490284, + "grad_norm": 1.193146824836731, + "learning_rate": 0.0007544078679168366, + "loss": 3.5861, + "step": 28930 + }, + { + "epoch": 1.9659600489196902, + "grad_norm": 0.828199028968811, + "learning_rate": 0.0007543654029080038, + "loss": 3.4547, + "step": 28935 + }, + { + "epoch": 1.966299768990352, + "grad_norm": 0.7771934270858765, + "learning_rate": 0.0007543229378991711, + "loss": 3.5466, + "step": 28940 + }, + { + "epoch": 1.9666394890610137, + "grad_norm": 0.7976356148719788, + "learning_rate": 0.0007542804728903384, + "loss": 3.7264, + "step": 28945 + }, + { + "epoch": 1.9669792091316753, + "grad_norm": 1.0628077983856201, + "learning_rate": 0.0007542380078815056, + "loss": 3.8647, + "step": 28950 + }, + { + "epoch": 1.9673189292023374, + "grad_norm": 1.0639787912368774, + "learning_rate": 0.0007541955428726729, + "loss": 3.4203, + "step": 28955 + }, + { + "epoch": 1.967658649272999, + "grad_norm": 0.9266738295555115, + "learning_rate": 0.0007541530778638403, + "loss": 3.3301, + "step": 28960 + }, + { + "epoch": 1.9679983693436607, + "grad_norm": 0.9792012572288513, + "learning_rate": 0.0007541106128550075, + "loss": 3.7398, + "step": 28965 + }, + { + "epoch": 1.9683380894143228, + "grad_norm": 1.059991478919983, + "learning_rate": 0.0007540681478461747, + "loss": 3.6259, + "step": 28970 + }, + { + "epoch": 1.9686778094849844, + "grad_norm": 1.045894742012024, + "learning_rate": 0.0007540256828373421, + "loss": 3.4347, + "step": 28975 + }, + { + "epoch": 1.969017529555646, + "grad_norm": 0.7726205587387085, + "learning_rate": 0.0007539832178285093, + "loss": 3.3332, + "step": 28980 + }, + { + "epoch": 1.9693572496263079, + "grad_norm": 1.1491895914077759, + "learning_rate": 0.0007539407528196765, + "loss": 3.5723, + "step": 28985 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.8917666077613831, + "learning_rate": 0.0007538982878108439, + "loss": 3.5873, + "step": 28990 + }, + { + "epoch": 1.9700366897676314, + "grad_norm": 0.7321285009384155, + "learning_rate": 0.0007538558228020112, + "loss": 3.3589, + "step": 28995 + }, + { + "epoch": 1.9703764098382932, + "grad_norm": 0.8377131223678589, + "learning_rate": 0.0007538133577931784, + "loss": 3.6918, + "step": 29000 + }, + { + "epoch": 1.970716129908955, + "grad_norm": 0.716607391834259, + "learning_rate": 0.0007537708927843458, + "loss": 3.5828, + "step": 29005 + }, + { + "epoch": 1.9710558499796167, + "grad_norm": 0.7877514958381653, + "learning_rate": 0.000753728427775513, + "loss": 3.6384, + "step": 29010 + }, + { + "epoch": 1.9713955700502785, + "grad_norm": 0.8277802467346191, + "learning_rate": 0.0007536859627666802, + "loss": 3.5739, + "step": 29015 + }, + { + "epoch": 1.9717352901209404, + "grad_norm": 0.9380989074707031, + "learning_rate": 0.0007536434977578475, + "loss": 3.6981, + "step": 29020 + }, + { + "epoch": 1.972075010191602, + "grad_norm": 0.9432087540626526, + "learning_rate": 0.0007536010327490148, + "loss": 3.6688, + "step": 29025 + }, + { + "epoch": 1.9724147302622639, + "grad_norm": 1.1739062070846558, + "learning_rate": 0.0007535585677401821, + "loss": 3.8218, + "step": 29030 + }, + { + "epoch": 1.9727544503329257, + "grad_norm": 0.9299569725990295, + "learning_rate": 0.0007535161027313494, + "loss": 3.6106, + "step": 29035 + }, + { + "epoch": 1.9730941704035874, + "grad_norm": 0.8429141044616699, + "learning_rate": 0.0007534736377225167, + "loss": 3.5486, + "step": 29040 + }, + { + "epoch": 1.9734338904742492, + "grad_norm": 1.1155043840408325, + "learning_rate": 0.0007534311727136839, + "loss": 3.5499, + "step": 29045 + }, + { + "epoch": 1.973773610544911, + "grad_norm": 0.9181723594665527, + "learning_rate": 0.0007533887077048512, + "loss": 3.6381, + "step": 29050 + }, + { + "epoch": 1.9741133306155727, + "grad_norm": 1.1183253526687622, + "learning_rate": 0.0007533462426960185, + "loss": 3.8629, + "step": 29055 + }, + { + "epoch": 1.9744530506862346, + "grad_norm": 0.8301301002502441, + "learning_rate": 0.0007533037776871857, + "loss": 3.6028, + "step": 29060 + }, + { + "epoch": 1.9747927707568964, + "grad_norm": 0.9209470152854919, + "learning_rate": 0.0007532613126783531, + "loss": 3.8856, + "step": 29065 + }, + { + "epoch": 1.975132490827558, + "grad_norm": 0.8280025720596313, + "learning_rate": 0.0007532188476695203, + "loss": 3.5855, + "step": 29070 + }, + { + "epoch": 1.97547221089822, + "grad_norm": 0.8178666830062866, + "learning_rate": 0.0007531763826606877, + "loss": 3.5322, + "step": 29075 + }, + { + "epoch": 1.9758119309688817, + "grad_norm": 0.9272006750106812, + "learning_rate": 0.0007531339176518549, + "loss": 3.4992, + "step": 29080 + }, + { + "epoch": 1.9761516510395434, + "grad_norm": 0.9235244989395142, + "learning_rate": 0.0007530914526430221, + "loss": 3.7495, + "step": 29085 + }, + { + "epoch": 1.9764913711102052, + "grad_norm": 0.8850715160369873, + "learning_rate": 0.0007530489876341895, + "loss": 3.6541, + "step": 29090 + }, + { + "epoch": 1.976831091180867, + "grad_norm": 0.7254378795623779, + "learning_rate": 0.0007530065226253567, + "loss": 3.6537, + "step": 29095 + }, + { + "epoch": 1.9771708112515287, + "grad_norm": 0.7504207491874695, + "learning_rate": 0.000752964057616524, + "loss": 3.3373, + "step": 29100 + }, + { + "epoch": 1.9775105313221906, + "grad_norm": 0.7073450088500977, + "learning_rate": 0.0007529215926076914, + "loss": 3.6745, + "step": 29105 + }, + { + "epoch": 1.9778502513928524, + "grad_norm": 0.894619882106781, + "learning_rate": 0.0007528791275988586, + "loss": 3.36, + "step": 29110 + }, + { + "epoch": 1.978189971463514, + "grad_norm": 0.6916743516921997, + "learning_rate": 0.0007528366625900258, + "loss": 3.5579, + "step": 29115 + }, + { + "epoch": 1.9785296915341757, + "grad_norm": 0.8408764600753784, + "learning_rate": 0.0007527941975811931, + "loss": 3.6925, + "step": 29120 + }, + { + "epoch": 1.9788694116048378, + "grad_norm": 0.7881407737731934, + "learning_rate": 0.0007527517325723604, + "loss": 3.8251, + "step": 29125 + }, + { + "epoch": 1.9792091316754994, + "grad_norm": 1.074277400970459, + "learning_rate": 0.0007527092675635276, + "loss": 3.7337, + "step": 29130 + }, + { + "epoch": 1.979548851746161, + "grad_norm": 0.8392279148101807, + "learning_rate": 0.000752666802554695, + "loss": 3.6643, + "step": 29135 + }, + { + "epoch": 1.979888571816823, + "grad_norm": 1.2412654161453247, + "learning_rate": 0.0007526243375458623, + "loss": 3.4594, + "step": 29140 + }, + { + "epoch": 1.9802282918874847, + "grad_norm": 0.6625046133995056, + "learning_rate": 0.0007525818725370295, + "loss": 3.5669, + "step": 29145 + }, + { + "epoch": 1.9805680119581464, + "grad_norm": 0.826656699180603, + "learning_rate": 0.0007525394075281968, + "loss": 3.6529, + "step": 29150 + }, + { + "epoch": 1.9809077320288082, + "grad_norm": 0.7249147891998291, + "learning_rate": 0.0007524969425193641, + "loss": 3.4977, + "step": 29155 + }, + { + "epoch": 1.98124745209947, + "grad_norm": 1.1771036386489868, + "learning_rate": 0.0007524544775105313, + "loss": 3.4409, + "step": 29160 + }, + { + "epoch": 1.9815871721701317, + "grad_norm": 1.0239675045013428, + "learning_rate": 0.0007524120125016986, + "loss": 3.4792, + "step": 29165 + }, + { + "epoch": 1.9819268922407935, + "grad_norm": 1.153440237045288, + "learning_rate": 0.000752369547492866, + "loss": 3.7442, + "step": 29170 + }, + { + "epoch": 1.9822666123114554, + "grad_norm": 0.7173218727111816, + "learning_rate": 0.0007523270824840332, + "loss": 3.8373, + "step": 29175 + }, + { + "epoch": 1.982606332382117, + "grad_norm": 0.8514233231544495, + "learning_rate": 0.0007522846174752005, + "loss": 3.4861, + "step": 29180 + }, + { + "epoch": 1.9829460524527789, + "grad_norm": 0.7004932761192322, + "learning_rate": 0.0007522421524663677, + "loss": 3.702, + "step": 29185 + }, + { + "epoch": 1.9832857725234407, + "grad_norm": 0.9181821942329407, + "learning_rate": 0.000752199687457535, + "loss": 3.7137, + "step": 29190 + }, + { + "epoch": 1.9836254925941024, + "grad_norm": 2.8694136142730713, + "learning_rate": 0.0007521572224487023, + "loss": 3.7541, + "step": 29195 + }, + { + "epoch": 1.9839652126647642, + "grad_norm": 0.8365533351898193, + "learning_rate": 0.0007521147574398695, + "loss": 3.8299, + "step": 29200 + }, + { + "epoch": 1.984304932735426, + "grad_norm": 0.8580340147018433, + "learning_rate": 0.0007520722924310369, + "loss": 3.5913, + "step": 29205 + }, + { + "epoch": 1.9846446528060877, + "grad_norm": 1.1264780759811401, + "learning_rate": 0.0007520298274222042, + "loss": 3.345, + "step": 29210 + }, + { + "epoch": 1.9849843728767496, + "grad_norm": 0.8323303461074829, + "learning_rate": 0.0007519873624133714, + "loss": 3.7359, + "step": 29215 + }, + { + "epoch": 1.9853240929474114, + "grad_norm": 1.0919257402420044, + "learning_rate": 0.0007519448974045386, + "loss": 3.3925, + "step": 29220 + }, + { + "epoch": 1.985663813018073, + "grad_norm": 1.170393466949463, + "learning_rate": 0.000751902432395706, + "loss": 3.6923, + "step": 29225 + }, + { + "epoch": 1.986003533088735, + "grad_norm": 0.9667239785194397, + "learning_rate": 0.0007518599673868732, + "loss": 3.5384, + "step": 29230 + }, + { + "epoch": 1.9863432531593967, + "grad_norm": 1.019601583480835, + "learning_rate": 0.0007518175023780404, + "loss": 3.5109, + "step": 29235 + }, + { + "epoch": 1.9866829732300584, + "grad_norm": 7.660982131958008, + "learning_rate": 0.0007517750373692079, + "loss": 3.4234, + "step": 29240 + }, + { + "epoch": 1.9870226933007202, + "grad_norm": 0.9469075798988342, + "learning_rate": 0.0007517325723603751, + "loss": 3.7452, + "step": 29245 + }, + { + "epoch": 1.987362413371382, + "grad_norm": 0.9740317463874817, + "learning_rate": 0.0007516901073515423, + "loss": 3.7212, + "step": 29250 + }, + { + "epoch": 1.9877021334420437, + "grad_norm": 1.8448395729064941, + "learning_rate": 0.0007516476423427097, + "loss": 3.5962, + "step": 29255 + }, + { + "epoch": 1.9880418535127056, + "grad_norm": 4.446913719177246, + "learning_rate": 0.0007516051773338769, + "loss": 3.8237, + "step": 29260 + }, + { + "epoch": 1.9883815735833674, + "grad_norm": 1.0002325773239136, + "learning_rate": 0.0007515627123250441, + "loss": 3.4612, + "step": 29265 + }, + { + "epoch": 1.988721293654029, + "grad_norm": 0.8258467316627502, + "learning_rate": 0.0007515202473162114, + "loss": 3.5347, + "step": 29270 + }, + { + "epoch": 1.989061013724691, + "grad_norm": 1.136059045791626, + "learning_rate": 0.0007514777823073788, + "loss": 3.3349, + "step": 29275 + }, + { + "epoch": 1.9894007337953528, + "grad_norm": 0.6971839666366577, + "learning_rate": 0.000751435317298546, + "loss": 3.5442, + "step": 29280 + }, + { + "epoch": 1.9897404538660144, + "grad_norm": 1.402170181274414, + "learning_rate": 0.0007513928522897133, + "loss": 3.5908, + "step": 29285 + }, + { + "epoch": 1.990080173936676, + "grad_norm": 0.7529746890068054, + "learning_rate": 0.0007513503872808806, + "loss": 3.5052, + "step": 29290 + }, + { + "epoch": 1.990419894007338, + "grad_norm": 1.1907496452331543, + "learning_rate": 0.0007513079222720478, + "loss": 3.6153, + "step": 29295 + }, + { + "epoch": 1.9907596140779997, + "grad_norm": 0.7954854369163513, + "learning_rate": 0.0007512654572632151, + "loss": 3.7154, + "step": 29300 + }, + { + "epoch": 1.9910993341486614, + "grad_norm": 0.7898622155189514, + "learning_rate": 0.0007512229922543823, + "loss": 3.5539, + "step": 29305 + }, + { + "epoch": 1.9914390542193234, + "grad_norm": 0.7289479970932007, + "learning_rate": 0.0007511805272455497, + "loss": 3.6475, + "step": 29310 + }, + { + "epoch": 1.991778774289985, + "grad_norm": 0.7480578422546387, + "learning_rate": 0.000751138062236717, + "loss": 3.5471, + "step": 29315 + }, + { + "epoch": 1.9921184943606467, + "grad_norm": 2.835513114929199, + "learning_rate": 0.0007510955972278842, + "loss": 3.5491, + "step": 29320 + }, + { + "epoch": 1.9924582144313085, + "grad_norm": 0.9266762137413025, + "learning_rate": 0.0007510531322190515, + "loss": 3.7062, + "step": 29325 + }, + { + "epoch": 1.9927979345019704, + "grad_norm": 0.7519094347953796, + "learning_rate": 0.0007510106672102188, + "loss": 3.3887, + "step": 29330 + }, + { + "epoch": 1.993137654572632, + "grad_norm": 1.0791984796524048, + "learning_rate": 0.000750968202201386, + "loss": 3.5061, + "step": 29335 + }, + { + "epoch": 1.9934773746432939, + "grad_norm": 1.0453518629074097, + "learning_rate": 0.0007509257371925533, + "loss": 3.7762, + "step": 29340 + }, + { + "epoch": 1.9938170947139557, + "grad_norm": 0.9833052754402161, + "learning_rate": 0.0007508832721837207, + "loss": 3.3811, + "step": 29345 + }, + { + "epoch": 1.9941568147846174, + "grad_norm": 0.7320995926856995, + "learning_rate": 0.0007508408071748879, + "loss": 3.8285, + "step": 29350 + }, + { + "epoch": 1.9944965348552792, + "grad_norm": 0.8510575294494629, + "learning_rate": 0.0007507983421660552, + "loss": 3.4127, + "step": 29355 + }, + { + "epoch": 1.994836254925941, + "grad_norm": 2.171750783920288, + "learning_rate": 0.0007507558771572225, + "loss": 3.704, + "step": 29360 + }, + { + "epoch": 1.9951759749966027, + "grad_norm": 1.2309443950653076, + "learning_rate": 0.0007507134121483897, + "loss": 3.6484, + "step": 29365 + }, + { + "epoch": 1.9955156950672646, + "grad_norm": 0.8806555271148682, + "learning_rate": 0.0007506709471395569, + "loss": 3.6526, + "step": 29370 + }, + { + "epoch": 1.9958554151379264, + "grad_norm": 0.9319483637809753, + "learning_rate": 0.0007506284821307243, + "loss": 3.7033, + "step": 29375 + }, + { + "epoch": 1.996195135208588, + "grad_norm": 0.7409738898277283, + "learning_rate": 0.0007505860171218916, + "loss": 3.7983, + "step": 29380 + }, + { + "epoch": 1.99653485527925, + "grad_norm": 1.068408489227295, + "learning_rate": 0.0007505435521130588, + "loss": 3.6285, + "step": 29385 + }, + { + "epoch": 1.9968745753499118, + "grad_norm": 0.8524137139320374, + "learning_rate": 0.0007505010871042262, + "loss": 3.4742, + "step": 29390 + }, + { + "epoch": 1.9972142954205734, + "grad_norm": 0.7672048211097717, + "learning_rate": 0.0007504586220953934, + "loss": 3.7502, + "step": 29395 + }, + { + "epoch": 1.9975540154912352, + "grad_norm": 0.8630630970001221, + "learning_rate": 0.0007504161570865606, + "loss": 3.8113, + "step": 29400 + }, + { + "epoch": 1.997893735561897, + "grad_norm": 0.8821137547492981, + "learning_rate": 0.000750373692077728, + "loss": 3.6091, + "step": 29405 + }, + { + "epoch": 1.9982334556325587, + "grad_norm": 0.7845519781112671, + "learning_rate": 0.0007503312270688952, + "loss": 3.6552, + "step": 29410 + }, + { + "epoch": 1.9985731757032206, + "grad_norm": 0.9076778888702393, + "learning_rate": 0.0007502887620600626, + "loss": 3.5071, + "step": 29415 + }, + { + "epoch": 1.9989128957738824, + "grad_norm": 0.8801351189613342, + "learning_rate": 0.0007502462970512298, + "loss": 3.6387, + "step": 29420 + }, + { + "epoch": 1.999252615844544, + "grad_norm": 0.808059573173523, + "learning_rate": 0.0007502038320423971, + "loss": 3.651, + "step": 29425 + }, + { + "epoch": 1.999592335915206, + "grad_norm": 0.728587806224823, + "learning_rate": 0.0007501613670335644, + "loss": 3.5813, + "step": 29430 + }, + { + "epoch": 1.9999320559858678, + "grad_norm": 0.7217844724655151, + "learning_rate": 0.0007501189020247316, + "loss": 3.7235, + "step": 29435 + }, + { + "epoch": 2.0, + "eval_bertscore": { + "f1": 0.8417826208775792, + "precision": 0.8447663236616286, + "recall": 0.839637989532995 + }, + "eval_bleu_4": 0.016643211617545405, + "eval_exact_match": 0.00038763446070355656, + "eval_loss": 3.468294620513916, + "eval_meteor": 0.09029952444137972, + "eval_rouge": { + "rouge1": 0.12653498870463648, + "rouge2": 0.018108060932818283, + "rougeL": 0.11036090442604687, + "rougeLsum": 0.11039925347344667 + }, + "eval_runtime": 1555.3945, + "eval_samples_per_second": 6.634, + "eval_steps_per_second": 0.829, + "step": 29436 + }, + { + "epoch": 2.0002717760565294, + "grad_norm": 0.902706503868103, + "learning_rate": 0.0007500764370158989, + "loss": 3.6576, + "step": 29440 + }, + { + "epoch": 2.000611496127191, + "grad_norm": 0.7929250597953796, + "learning_rate": 0.0007500339720070662, + "loss": 3.5995, + "step": 29445 + }, + { + "epoch": 2.000951216197853, + "grad_norm": 0.799768328666687, + "learning_rate": 0.0007499915069982335, + "loss": 3.5462, + "step": 29450 + }, + { + "epoch": 2.0012909362685147, + "grad_norm": 0.9558728337287903, + "learning_rate": 0.0007499490419894008, + "loss": 3.5255, + "step": 29455 + }, + { + "epoch": 2.0016306563391764, + "grad_norm": 2.527005910873413, + "learning_rate": 0.0007499065769805681, + "loss": 3.4994, + "step": 29460 + }, + { + "epoch": 2.0019703764098384, + "grad_norm": 0.9265762567520142, + "learning_rate": 0.0007498641119717353, + "loss": 3.3643, + "step": 29465 + }, + { + "epoch": 2.0023100964805, + "grad_norm": 1.1578618288040161, + "learning_rate": 0.0007498216469629025, + "loss": 3.3947, + "step": 29470 + }, + { + "epoch": 2.0026498165511617, + "grad_norm": 0.9910609126091003, + "learning_rate": 0.0007497791819540699, + "loss": 3.3975, + "step": 29475 + }, + { + "epoch": 2.0029895366218238, + "grad_norm": 2.40384840965271, + "learning_rate": 0.0007497367169452371, + "loss": 3.424, + "step": 29480 + }, + { + "epoch": 2.0033292566924854, + "grad_norm": 2.261876344680786, + "learning_rate": 0.0007496942519364044, + "loss": 3.4348, + "step": 29485 + }, + { + "epoch": 2.003668976763147, + "grad_norm": 0.8409294486045837, + "learning_rate": 0.0007496517869275718, + "loss": 3.642, + "step": 29490 + }, + { + "epoch": 2.004008696833809, + "grad_norm": 0.7148813009262085, + "learning_rate": 0.000749609321918739, + "loss": 3.7198, + "step": 29495 + }, + { + "epoch": 2.0043484169044707, + "grad_norm": 0.8970974683761597, + "learning_rate": 0.0007495668569099062, + "loss": 3.156, + "step": 29500 + }, + { + "epoch": 2.0046881369751324, + "grad_norm": 1.1400750875473022, + "learning_rate": 0.0007495243919010736, + "loss": 3.5801, + "step": 29505 + }, + { + "epoch": 2.0050278570457944, + "grad_norm": 2.850945472717285, + "learning_rate": 0.0007494819268922408, + "loss": 3.848, + "step": 29510 + }, + { + "epoch": 2.005367577116456, + "grad_norm": 0.8271298408508301, + "learning_rate": 0.000749439461883408, + "loss": 3.4894, + "step": 29515 + }, + { + "epoch": 2.0057072971871177, + "grad_norm": 1.311874508857727, + "learning_rate": 0.0007493969968745754, + "loss": 3.3438, + "step": 29520 + }, + { + "epoch": 2.00604701725778, + "grad_norm": 1.6753573417663574, + "learning_rate": 0.0007493545318657427, + "loss": 3.6291, + "step": 29525 + }, + { + "epoch": 2.0063867373284414, + "grad_norm": 0.9074965119361877, + "learning_rate": 0.0007493120668569099, + "loss": 3.6954, + "step": 29530 + }, + { + "epoch": 2.006726457399103, + "grad_norm": 0.9712390899658203, + "learning_rate": 0.0007492696018480772, + "loss": 3.7097, + "step": 29535 + }, + { + "epoch": 2.007066177469765, + "grad_norm": 0.7796502113342285, + "learning_rate": 0.0007492271368392445, + "loss": 3.6226, + "step": 29540 + }, + { + "epoch": 2.0074058975404268, + "grad_norm": 0.8985151648521423, + "learning_rate": 0.0007491846718304117, + "loss": 3.8948, + "step": 29545 + }, + { + "epoch": 2.0077456176110884, + "grad_norm": 0.8522647619247437, + "learning_rate": 0.0007491422068215791, + "loss": 3.1305, + "step": 29550 + }, + { + "epoch": 2.0080853376817505, + "grad_norm": 0.9762449860572815, + "learning_rate": 0.0007490997418127464, + "loss": 3.5519, + "step": 29555 + }, + { + "epoch": 2.008425057752412, + "grad_norm": 0.8998928070068359, + "learning_rate": 0.0007490572768039136, + "loss": 3.3039, + "step": 29560 + }, + { + "epoch": 2.0087647778230737, + "grad_norm": 0.8634898066520691, + "learning_rate": 0.0007490148117950809, + "loss": 3.4508, + "step": 29565 + }, + { + "epoch": 2.0091044978937354, + "grad_norm": 0.8881954550743103, + "learning_rate": 0.0007489723467862481, + "loss": 3.675, + "step": 29570 + }, + { + "epoch": 2.0094442179643974, + "grad_norm": 0.7698472142219543, + "learning_rate": 0.0007489298817774154, + "loss": 3.5363, + "step": 29575 + }, + { + "epoch": 2.009783938035059, + "grad_norm": 0.8263338804244995, + "learning_rate": 0.0007488874167685827, + "loss": 3.6437, + "step": 29580 + }, + { + "epoch": 2.0101236581057207, + "grad_norm": 0.6162746548652649, + "learning_rate": 0.00074884495175975, + "loss": 3.5137, + "step": 29585 + }, + { + "epoch": 2.0104633781763828, + "grad_norm": 0.896315336227417, + "learning_rate": 0.0007488024867509173, + "loss": 3.5508, + "step": 29590 + }, + { + "epoch": 2.0108030982470444, + "grad_norm": 0.9861576557159424, + "learning_rate": 0.0007487600217420846, + "loss": 3.5525, + "step": 29595 + }, + { + "epoch": 2.011142818317706, + "grad_norm": 0.9364343881607056, + "learning_rate": 0.0007487175567332518, + "loss": 3.3904, + "step": 29600 + }, + { + "epoch": 2.011482538388368, + "grad_norm": 1.0701934099197388, + "learning_rate": 0.000748675091724419, + "loss": 3.4293, + "step": 29605 + }, + { + "epoch": 2.0118222584590297, + "grad_norm": 0.9974489212036133, + "learning_rate": 0.0007486326267155864, + "loss": 3.5271, + "step": 29610 + }, + { + "epoch": 2.0121619785296914, + "grad_norm": 1.0875741243362427, + "learning_rate": 0.0007485901617067536, + "loss": 3.7087, + "step": 29615 + }, + { + "epoch": 2.0125016986003534, + "grad_norm": 0.8567084670066833, + "learning_rate": 0.0007485476966979209, + "loss": 3.5877, + "step": 29620 + }, + { + "epoch": 2.012841418671015, + "grad_norm": 0.7971550226211548, + "learning_rate": 0.0007485052316890883, + "loss": 3.4187, + "step": 29625 + }, + { + "epoch": 2.0131811387416767, + "grad_norm": 0.8968897461891174, + "learning_rate": 0.0007484627666802555, + "loss": 3.6842, + "step": 29630 + }, + { + "epoch": 2.0135208588123388, + "grad_norm": 0.7633346915245056, + "learning_rate": 0.0007484203016714227, + "loss": 3.624, + "step": 29635 + }, + { + "epoch": 2.0138605788830004, + "grad_norm": 0.9562004804611206, + "learning_rate": 0.0007483778366625901, + "loss": 3.2954, + "step": 29640 + }, + { + "epoch": 2.014200298953662, + "grad_norm": 0.7962945103645325, + "learning_rate": 0.0007483353716537573, + "loss": 3.4235, + "step": 29645 + }, + { + "epoch": 2.014540019024324, + "grad_norm": 0.8576595187187195, + "learning_rate": 0.0007482929066449245, + "loss": 3.5552, + "step": 29650 + }, + { + "epoch": 2.0148797390949857, + "grad_norm": 1.0832886695861816, + "learning_rate": 0.000748250441636092, + "loss": 3.8693, + "step": 29655 + }, + { + "epoch": 2.0152194591656474, + "grad_norm": 0.8077504634857178, + "learning_rate": 0.0007482079766272592, + "loss": 3.6817, + "step": 29660 + }, + { + "epoch": 2.0155591792363095, + "grad_norm": 1.8054348230361938, + "learning_rate": 0.0007481655116184264, + "loss": 3.6274, + "step": 29665 + }, + { + "epoch": 2.015898899306971, + "grad_norm": 0.9765808582305908, + "learning_rate": 0.0007481230466095937, + "loss": 3.4667, + "step": 29670 + }, + { + "epoch": 2.0162386193776327, + "grad_norm": 1.2331407070159912, + "learning_rate": 0.000748080581600761, + "loss": 3.2072, + "step": 29675 + }, + { + "epoch": 2.016578339448295, + "grad_norm": 0.914566695690155, + "learning_rate": 0.0007480381165919282, + "loss": 3.8367, + "step": 29680 + }, + { + "epoch": 2.0169180595189564, + "grad_norm": 1.032149314880371, + "learning_rate": 0.0007479956515830955, + "loss": 3.4692, + "step": 29685 + }, + { + "epoch": 2.017257779589618, + "grad_norm": 0.7988753318786621, + "learning_rate": 0.0007479531865742629, + "loss": 3.743, + "step": 29690 + }, + { + "epoch": 2.01759749966028, + "grad_norm": 0.7812865972518921, + "learning_rate": 0.0007479107215654301, + "loss": 3.7252, + "step": 29695 + }, + { + "epoch": 2.0179372197309418, + "grad_norm": 2.351775646209717, + "learning_rate": 0.0007478682565565974, + "loss": 3.4816, + "step": 29700 + }, + { + "epoch": 2.0182769398016034, + "grad_norm": 0.7352385520935059, + "learning_rate": 0.0007478257915477646, + "loss": 3.4359, + "step": 29705 + }, + { + "epoch": 2.0186166598722655, + "grad_norm": 0.8367921113967896, + "learning_rate": 0.0007477833265389319, + "loss": 3.7558, + "step": 29710 + }, + { + "epoch": 2.018956379942927, + "grad_norm": 1.077057957649231, + "learning_rate": 0.0007477408615300992, + "loss": 3.6097, + "step": 29715 + }, + { + "epoch": 2.0192961000135887, + "grad_norm": 0.7933337688446045, + "learning_rate": 0.0007476983965212664, + "loss": 3.3704, + "step": 29720 + }, + { + "epoch": 2.0196358200842504, + "grad_norm": 0.8734638690948486, + "learning_rate": 0.0007476559315124338, + "loss": 3.548, + "step": 29725 + }, + { + "epoch": 2.0199755401549124, + "grad_norm": 0.9834259152412415, + "learning_rate": 0.0007476134665036011, + "loss": 3.2346, + "step": 29730 + }, + { + "epoch": 2.020315260225574, + "grad_norm": 1.0666128396987915, + "learning_rate": 0.0007475710014947683, + "loss": 3.3587, + "step": 29735 + }, + { + "epoch": 2.0206549802962357, + "grad_norm": 0.8740901350975037, + "learning_rate": 0.0007475285364859356, + "loss": 3.4636, + "step": 29740 + }, + { + "epoch": 2.0209947003668978, + "grad_norm": 1.2747498750686646, + "learning_rate": 0.0007474860714771029, + "loss": 3.3663, + "step": 29745 + }, + { + "epoch": 2.0213344204375594, + "grad_norm": 0.8305885195732117, + "learning_rate": 0.0007474436064682701, + "loss": 3.4198, + "step": 29750 + }, + { + "epoch": 2.021674140508221, + "grad_norm": 0.7399596571922302, + "learning_rate": 0.0007474011414594373, + "loss": 3.6117, + "step": 29755 + }, + { + "epoch": 2.022013860578883, + "grad_norm": 0.821613073348999, + "learning_rate": 0.0007473586764506048, + "loss": 3.7176, + "step": 29760 + }, + { + "epoch": 2.0223535806495447, + "grad_norm": 0.8310840725898743, + "learning_rate": 0.000747316211441772, + "loss": 3.5489, + "step": 29765 + }, + { + "epoch": 2.0226933007202064, + "grad_norm": 0.8851040005683899, + "learning_rate": 0.0007472737464329393, + "loss": 3.4106, + "step": 29770 + }, + { + "epoch": 2.0230330207908684, + "grad_norm": 0.8401482105255127, + "learning_rate": 0.0007472312814241066, + "loss": 3.4741, + "step": 29775 + }, + { + "epoch": 2.02337274086153, + "grad_norm": 0.916972815990448, + "learning_rate": 0.0007471888164152738, + "loss": 3.5823, + "step": 29780 + }, + { + "epoch": 2.0237124609321917, + "grad_norm": 0.7900135517120361, + "learning_rate": 0.0007471463514064411, + "loss": 3.5651, + "step": 29785 + }, + { + "epoch": 2.024052181002854, + "grad_norm": 0.7388175129890442, + "learning_rate": 0.0007471038863976084, + "loss": 3.5809, + "step": 29790 + }, + { + "epoch": 2.0243919010735154, + "grad_norm": 0.9249594807624817, + "learning_rate": 0.0007470614213887757, + "loss": 3.5511, + "step": 29795 + }, + { + "epoch": 2.024731621144177, + "grad_norm": 0.8434092998504639, + "learning_rate": 0.000747018956379943, + "loss": 3.9346, + "step": 29800 + }, + { + "epoch": 2.025071341214839, + "grad_norm": 0.9517419338226318, + "learning_rate": 0.0007469764913711102, + "loss": 3.6478, + "step": 29805 + }, + { + "epoch": 2.0254110612855007, + "grad_norm": 18.260408401489258, + "learning_rate": 0.0007469340263622775, + "loss": 3.6207, + "step": 29810 + }, + { + "epoch": 2.0257507813561624, + "grad_norm": 0.990160346031189, + "learning_rate": 0.0007468915613534448, + "loss": 3.5317, + "step": 29815 + }, + { + "epoch": 2.0260905014268245, + "grad_norm": 0.7647041082382202, + "learning_rate": 0.000746849096344612, + "loss": 3.5514, + "step": 29820 + }, + { + "epoch": 2.026430221497486, + "grad_norm": 0.9511529207229614, + "learning_rate": 0.0007468066313357793, + "loss": 3.5441, + "step": 29825 + }, + { + "epoch": 2.0267699415681477, + "grad_norm": 0.8290064334869385, + "learning_rate": 0.0007467641663269467, + "loss": 3.4049, + "step": 29830 + }, + { + "epoch": 2.02710966163881, + "grad_norm": 1.0084744691848755, + "learning_rate": 0.0007467217013181139, + "loss": 3.4037, + "step": 29835 + }, + { + "epoch": 2.0274493817094714, + "grad_norm": 1.0736650228500366, + "learning_rate": 0.0007466792363092812, + "loss": 3.7572, + "step": 29840 + }, + { + "epoch": 2.027789101780133, + "grad_norm": 0.8527398109436035, + "learning_rate": 0.0007466367713004485, + "loss": 3.3883, + "step": 29845 + }, + { + "epoch": 2.028128821850795, + "grad_norm": 0.7435247302055359, + "learning_rate": 0.0007465943062916157, + "loss": 3.5168, + "step": 29850 + }, + { + "epoch": 2.0284685419214568, + "grad_norm": 0.8619943261146545, + "learning_rate": 0.0007465518412827829, + "loss": 3.796, + "step": 29855 + }, + { + "epoch": 2.0288082619921184, + "grad_norm": 0.7409512996673584, + "learning_rate": 0.0007465093762739503, + "loss": 3.5448, + "step": 29860 + }, + { + "epoch": 2.0291479820627805, + "grad_norm": 1.0684194564819336, + "learning_rate": 0.0007464669112651176, + "loss": 3.6701, + "step": 29865 + }, + { + "epoch": 2.029487702133442, + "grad_norm": 1.1920106410980225, + "learning_rate": 0.0007464244462562848, + "loss": 3.5994, + "step": 29870 + }, + { + "epoch": 2.0298274222041037, + "grad_norm": 0.7864174842834473, + "learning_rate": 0.0007463819812474522, + "loss": 3.7077, + "step": 29875 + }, + { + "epoch": 2.030167142274766, + "grad_norm": 1.2373602390289307, + "learning_rate": 0.0007463395162386194, + "loss": 3.6488, + "step": 29880 + }, + { + "epoch": 2.0305068623454274, + "grad_norm": 0.9815283417701721, + "learning_rate": 0.0007462970512297866, + "loss": 3.5632, + "step": 29885 + }, + { + "epoch": 2.030846582416089, + "grad_norm": 0.8603841066360474, + "learning_rate": 0.000746254586220954, + "loss": 3.5574, + "step": 29890 + }, + { + "epoch": 2.031186302486751, + "grad_norm": 1.169021487236023, + "learning_rate": 0.0007462121212121212, + "loss": 3.1942, + "step": 29895 + }, + { + "epoch": 2.0315260225574128, + "grad_norm": 1.0851612091064453, + "learning_rate": 0.0007461696562032885, + "loss": 3.226, + "step": 29900 + }, + { + "epoch": 2.0318657426280744, + "grad_norm": 0.8373953104019165, + "learning_rate": 0.0007461271911944559, + "loss": 3.8635, + "step": 29905 + }, + { + "epoch": 2.032205462698736, + "grad_norm": 0.731135368347168, + "learning_rate": 0.0007460847261856231, + "loss": 3.8048, + "step": 29910 + }, + { + "epoch": 2.032545182769398, + "grad_norm": 0.8661358952522278, + "learning_rate": 0.0007460422611767903, + "loss": 3.4878, + "step": 29915 + }, + { + "epoch": 2.0328849028400597, + "grad_norm": 0.8129076361656189, + "learning_rate": 0.0007459997961679576, + "loss": 3.4928, + "step": 29920 + }, + { + "epoch": 2.0332246229107214, + "grad_norm": 1.099036693572998, + "learning_rate": 0.0007459573311591249, + "loss": 3.6196, + "step": 29925 + }, + { + "epoch": 2.0335643429813834, + "grad_norm": 0.7809199690818787, + "learning_rate": 0.0007459148661502921, + "loss": 3.6361, + "step": 29930 + }, + { + "epoch": 2.033904063052045, + "grad_norm": 1.0181523561477661, + "learning_rate": 0.0007458724011414595, + "loss": 3.4612, + "step": 29935 + }, + { + "epoch": 2.0342437831227067, + "grad_norm": 0.8296138644218445, + "learning_rate": 0.0007458299361326268, + "loss": 3.8206, + "step": 29940 + }, + { + "epoch": 2.034583503193369, + "grad_norm": 1.1546753644943237, + "learning_rate": 0.000745787471123794, + "loss": 3.5231, + "step": 29945 + }, + { + "epoch": 2.0349232232640304, + "grad_norm": 0.6354867219924927, + "learning_rate": 0.0007457450061149613, + "loss": 3.4436, + "step": 29950 + }, + { + "epoch": 2.035262943334692, + "grad_norm": 0.8824624419212341, + "learning_rate": 0.0007457025411061285, + "loss": 3.5555, + "step": 29955 + }, + { + "epoch": 2.035602663405354, + "grad_norm": 0.7092429995536804, + "learning_rate": 0.0007456600760972958, + "loss": 3.5279, + "step": 29960 + }, + { + "epoch": 2.0359423834760157, + "grad_norm": 1.01329505443573, + "learning_rate": 0.0007456176110884631, + "loss": 3.3856, + "step": 29965 + }, + { + "epoch": 2.0362821035466774, + "grad_norm": 1.6589281558990479, + "learning_rate": 0.0007455751460796304, + "loss": 3.6046, + "step": 29970 + }, + { + "epoch": 2.0366218236173395, + "grad_norm": 0.7226354479789734, + "learning_rate": 0.0007455326810707977, + "loss": 3.6933, + "step": 29975 + }, + { + "epoch": 2.036961543688001, + "grad_norm": 1.1590620279312134, + "learning_rate": 0.000745490216061965, + "loss": 3.3397, + "step": 29980 + }, + { + "epoch": 2.0373012637586627, + "grad_norm": 0.9630075693130493, + "learning_rate": 0.0007454477510531322, + "loss": 3.5178, + "step": 29985 + }, + { + "epoch": 2.037640983829325, + "grad_norm": 0.7578001022338867, + "learning_rate": 0.0007454052860442994, + "loss": 3.7264, + "step": 29990 + }, + { + "epoch": 2.0379807038999864, + "grad_norm": 0.7703878879547119, + "learning_rate": 0.0007453628210354668, + "loss": 3.7903, + "step": 29995 + }, + { + "epoch": 2.038320423970648, + "grad_norm": 0.7079905271530151, + "learning_rate": 0.000745320356026634, + "loss": 3.7416, + "step": 30000 + }, + { + "epoch": 2.03866014404131, + "grad_norm": 1.1307092905044556, + "learning_rate": 0.0007452778910178013, + "loss": 3.6549, + "step": 30005 + }, + { + "epoch": 2.0389998641119718, + "grad_norm": 0.8095118403434753, + "learning_rate": 0.0007452354260089687, + "loss": 3.8917, + "step": 30010 + }, + { + "epoch": 2.0393395841826334, + "grad_norm": 0.8549849390983582, + "learning_rate": 0.0007451929610001359, + "loss": 3.8158, + "step": 30015 + }, + { + "epoch": 2.0396793042532955, + "grad_norm": 0.8062247037887573, + "learning_rate": 0.0007451504959913031, + "loss": 3.6094, + "step": 30020 + }, + { + "epoch": 2.040019024323957, + "grad_norm": 0.8495997786521912, + "learning_rate": 0.0007451080309824705, + "loss": 3.5076, + "step": 30025 + }, + { + "epoch": 2.0403587443946187, + "grad_norm": 0.8343631029129028, + "learning_rate": 0.0007450655659736377, + "loss": 3.7007, + "step": 30030 + }, + { + "epoch": 2.040698464465281, + "grad_norm": 0.9398652911186218, + "learning_rate": 0.0007450231009648049, + "loss": 3.5886, + "step": 30035 + }, + { + "epoch": 2.0410381845359424, + "grad_norm": 1.0201457738876343, + "learning_rate": 0.0007449806359559724, + "loss": 3.5238, + "step": 30040 + }, + { + "epoch": 2.041377904606604, + "grad_norm": 0.9202156662940979, + "learning_rate": 0.0007449381709471396, + "loss": 3.4781, + "step": 30045 + }, + { + "epoch": 2.041717624677266, + "grad_norm": 1.0329872369766235, + "learning_rate": 0.0007448957059383068, + "loss": 3.5047, + "step": 30050 + }, + { + "epoch": 2.0420573447479278, + "grad_norm": 0.9759519696235657, + "learning_rate": 0.0007448532409294741, + "loss": 3.3157, + "step": 30055 + }, + { + "epoch": 2.0423970648185894, + "grad_norm": 0.9876469969749451, + "learning_rate": 0.0007448107759206414, + "loss": 3.5886, + "step": 30060 + }, + { + "epoch": 2.042736784889251, + "grad_norm": 0.8921344876289368, + "learning_rate": 0.0007447683109118086, + "loss": 3.476, + "step": 30065 + }, + { + "epoch": 2.043076504959913, + "grad_norm": 0.6781764626502991, + "learning_rate": 0.000744725845902976, + "loss": 3.851, + "step": 30070 + }, + { + "epoch": 2.0434162250305747, + "grad_norm": 0.7529522776603699, + "learning_rate": 0.0007446833808941433, + "loss": 3.2877, + "step": 30075 + }, + { + "epoch": 2.0437559451012364, + "grad_norm": 0.8231487274169922, + "learning_rate": 0.0007446409158853105, + "loss": 3.4887, + "step": 30080 + }, + { + "epoch": 2.0440956651718984, + "grad_norm": 0.8427303433418274, + "learning_rate": 0.0007445984508764778, + "loss": 3.5954, + "step": 30085 + }, + { + "epoch": 2.04443538524256, + "grad_norm": 0.719743549823761, + "learning_rate": 0.000744555985867645, + "loss": 3.5882, + "step": 30090 + }, + { + "epoch": 2.0447751053132217, + "grad_norm": 6.458067893981934, + "learning_rate": 0.0007445135208588123, + "loss": 3.4264, + "step": 30095 + }, + { + "epoch": 2.045114825383884, + "grad_norm": 0.9467030763626099, + "learning_rate": 0.0007444710558499796, + "loss": 3.4061, + "step": 30100 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.8596173524856567, + "learning_rate": 0.0007444285908411469, + "loss": 3.6461, + "step": 30105 + }, + { + "epoch": 2.045794265525207, + "grad_norm": 1.0502382516860962, + "learning_rate": 0.0007443861258323143, + "loss": 3.5348, + "step": 30110 + }, + { + "epoch": 2.046133985595869, + "grad_norm": 0.7976231575012207, + "learning_rate": 0.0007443436608234815, + "loss": 3.7263, + "step": 30115 + }, + { + "epoch": 2.0464737056665308, + "grad_norm": 0.7910042405128479, + "learning_rate": 0.0007443011958146487, + "loss": 3.5254, + "step": 30120 + }, + { + "epoch": 2.0468134257371924, + "grad_norm": 1.0758010149002075, + "learning_rate": 0.0007442587308058161, + "loss": 3.524, + "step": 30125 + }, + { + "epoch": 2.0471531458078545, + "grad_norm": 0.8543336987495422, + "learning_rate": 0.0007442162657969833, + "loss": 3.4581, + "step": 30130 + }, + { + "epoch": 2.047492865878516, + "grad_norm": 0.8796383142471313, + "learning_rate": 0.0007441738007881505, + "loss": 3.5128, + "step": 30135 + }, + { + "epoch": 2.0478325859491777, + "grad_norm": 0.7512159943580627, + "learning_rate": 0.000744131335779318, + "loss": 3.7088, + "step": 30140 + }, + { + "epoch": 2.04817230601984, + "grad_norm": 1.6405693292617798, + "learning_rate": 0.0007440888707704852, + "loss": 3.6906, + "step": 30145 + }, + { + "epoch": 2.0485120260905014, + "grad_norm": 0.9078961610794067, + "learning_rate": 0.0007440464057616524, + "loss": 3.3645, + "step": 30150 + }, + { + "epoch": 2.048851746161163, + "grad_norm": 0.8419570326805115, + "learning_rate": 0.0007440039407528197, + "loss": 3.4354, + "step": 30155 + }, + { + "epoch": 2.049191466231825, + "grad_norm": 0.7863807082176208, + "learning_rate": 0.000743961475743987, + "loss": 3.69, + "step": 30160 + }, + { + "epoch": 2.0495311863024868, + "grad_norm": 1.364188313484192, + "learning_rate": 0.0007439190107351542, + "loss": 3.4666, + "step": 30165 + }, + { + "epoch": 2.0498709063731484, + "grad_norm": 0.9250085353851318, + "learning_rate": 0.0007438765457263215, + "loss": 3.5496, + "step": 30170 + }, + { + "epoch": 2.0502106264438105, + "grad_norm": 1.0029747486114502, + "learning_rate": 0.0007438340807174889, + "loss": 3.4364, + "step": 30175 + }, + { + "epoch": 2.050550346514472, + "grad_norm": 0.8632626533508301, + "learning_rate": 0.0007437916157086561, + "loss": 3.783, + "step": 30180 + }, + { + "epoch": 2.0508900665851337, + "grad_norm": 1.095878005027771, + "learning_rate": 0.0007437491506998234, + "loss": 3.3048, + "step": 30185 + }, + { + "epoch": 2.051229786655796, + "grad_norm": 0.8282005786895752, + "learning_rate": 0.0007437066856909907, + "loss": 3.3769, + "step": 30190 + }, + { + "epoch": 2.0515695067264574, + "grad_norm": 0.8634428381919861, + "learning_rate": 0.0007436642206821579, + "loss": 3.3312, + "step": 30195 + }, + { + "epoch": 2.051909226797119, + "grad_norm": 0.8865042924880981, + "learning_rate": 0.0007436217556733252, + "loss": 3.8932, + "step": 30200 + }, + { + "epoch": 2.052248946867781, + "grad_norm": 0.6289206743240356, + "learning_rate": 0.0007435792906644924, + "loss": 3.6045, + "step": 30205 + }, + { + "epoch": 2.0525886669384428, + "grad_norm": 1.3415013551712036, + "learning_rate": 0.0007435368256556598, + "loss": 3.4953, + "step": 30210 + }, + { + "epoch": 2.0529283870091044, + "grad_norm": 1.0701266527175903, + "learning_rate": 0.0007434943606468271, + "loss": 3.559, + "step": 30215 + }, + { + "epoch": 2.0532681070797665, + "grad_norm": 0.7367812991142273, + "learning_rate": 0.0007434518956379943, + "loss": 3.7681, + "step": 30220 + }, + { + "epoch": 2.053607827150428, + "grad_norm": 0.7112928628921509, + "learning_rate": 0.0007434094306291616, + "loss": 3.375, + "step": 30225 + }, + { + "epoch": 2.0539475472210897, + "grad_norm": 1.1781489849090576, + "learning_rate": 0.0007433669656203289, + "loss": 3.3027, + "step": 30230 + }, + { + "epoch": 2.054287267291752, + "grad_norm": 0.7095500826835632, + "learning_rate": 0.0007433245006114961, + "loss": 3.5663, + "step": 30235 + }, + { + "epoch": 2.0546269873624134, + "grad_norm": 0.7627681493759155, + "learning_rate": 0.0007432820356026633, + "loss": 3.5558, + "step": 30240 + }, + { + "epoch": 2.054966707433075, + "grad_norm": 0.9601649045944214, + "learning_rate": 0.0007432395705938308, + "loss": 3.6184, + "step": 30245 + }, + { + "epoch": 2.0553064275037367, + "grad_norm": 0.850382924079895, + "learning_rate": 0.000743197105584998, + "loss": 3.4898, + "step": 30250 + }, + { + "epoch": 2.055646147574399, + "grad_norm": 0.7899922728538513, + "learning_rate": 0.0007431546405761652, + "loss": 3.7575, + "step": 30255 + }, + { + "epoch": 2.0559858676450604, + "grad_norm": 0.8121507167816162, + "learning_rate": 0.0007431121755673326, + "loss": 3.773, + "step": 30260 + }, + { + "epoch": 2.056325587715722, + "grad_norm": 0.8368807435035706, + "learning_rate": 0.0007430697105584998, + "loss": 3.71, + "step": 30265 + }, + { + "epoch": 2.056665307786384, + "grad_norm": 2.3201372623443604, + "learning_rate": 0.000743027245549667, + "loss": 3.3767, + "step": 30270 + }, + { + "epoch": 2.0570050278570458, + "grad_norm": 0.8351988196372986, + "learning_rate": 0.0007429847805408344, + "loss": 3.6348, + "step": 30275 + }, + { + "epoch": 2.0573447479277074, + "grad_norm": 0.9295761585235596, + "learning_rate": 0.0007429423155320017, + "loss": 3.2298, + "step": 30280 + }, + { + "epoch": 2.0576844679983695, + "grad_norm": 0.9871548414230347, + "learning_rate": 0.0007428998505231689, + "loss": 3.5585, + "step": 30285 + }, + { + "epoch": 2.058024188069031, + "grad_norm": 0.750560998916626, + "learning_rate": 0.0007428573855143363, + "loss": 3.6709, + "step": 30290 + }, + { + "epoch": 2.0583639081396927, + "grad_norm": 1.5458797216415405, + "learning_rate": 0.0007428149205055035, + "loss": 3.45, + "step": 30295 + }, + { + "epoch": 2.058703628210355, + "grad_norm": 1.4376354217529297, + "learning_rate": 0.0007427724554966707, + "loss": 3.6931, + "step": 30300 + }, + { + "epoch": 2.0590433482810164, + "grad_norm": 1.0368679761886597, + "learning_rate": 0.000742729990487838, + "loss": 3.6527, + "step": 30305 + }, + { + "epoch": 2.059383068351678, + "grad_norm": 0.9821159839630127, + "learning_rate": 0.0007426875254790053, + "loss": 3.5502, + "step": 30310 + }, + { + "epoch": 2.05972278842234, + "grad_norm": 0.7876103520393372, + "learning_rate": 0.0007426450604701726, + "loss": 3.519, + "step": 30315 + }, + { + "epoch": 2.0600625084930018, + "grad_norm": 2.527968406677246, + "learning_rate": 0.0007426025954613399, + "loss": 3.6095, + "step": 30320 + }, + { + "epoch": 2.0604022285636634, + "grad_norm": 0.823428213596344, + "learning_rate": 0.0007425601304525072, + "loss": 3.6939, + "step": 30325 + }, + { + "epoch": 2.0607419486343255, + "grad_norm": 0.9079275131225586, + "learning_rate": 0.0007425176654436744, + "loss": 3.4555, + "step": 30330 + }, + { + "epoch": 2.061081668704987, + "grad_norm": 0.9415116906166077, + "learning_rate": 0.0007424752004348417, + "loss": 3.5034, + "step": 30335 + }, + { + "epoch": 2.0614213887756487, + "grad_norm": 0.8073373436927795, + "learning_rate": 0.000742432735426009, + "loss": 3.5144, + "step": 30340 + }, + { + "epoch": 2.061761108846311, + "grad_norm": 2.07351016998291, + "learning_rate": 0.0007423902704171762, + "loss": 3.4408, + "step": 30345 + }, + { + "epoch": 2.0621008289169724, + "grad_norm": 0.8630059957504272, + "learning_rate": 0.0007423478054083436, + "loss": 3.6337, + "step": 30350 + }, + { + "epoch": 2.062440548987634, + "grad_norm": 0.8192998766899109, + "learning_rate": 0.0007423053403995108, + "loss": 3.5773, + "step": 30355 + }, + { + "epoch": 2.062780269058296, + "grad_norm": 0.8662537932395935, + "learning_rate": 0.0007422628753906781, + "loss": 3.6254, + "step": 30360 + }, + { + "epoch": 2.0631199891289578, + "grad_norm": 0.8938514590263367, + "learning_rate": 0.0007422204103818454, + "loss": 3.2738, + "step": 30365 + }, + { + "epoch": 2.0634597091996194, + "grad_norm": 0.9520589709281921, + "learning_rate": 0.0007421779453730126, + "loss": 3.449, + "step": 30370 + }, + { + "epoch": 2.0637994292702815, + "grad_norm": 0.8034418225288391, + "learning_rate": 0.0007421354803641799, + "loss": 3.3812, + "step": 30375 + }, + { + "epoch": 2.064139149340943, + "grad_norm": 0.8722392916679382, + "learning_rate": 0.0007420930153553472, + "loss": 3.5365, + "step": 30380 + }, + { + "epoch": 2.0644788694116047, + "grad_norm": 0.7887628674507141, + "learning_rate": 0.0007420505503465145, + "loss": 3.551, + "step": 30385 + }, + { + "epoch": 2.064818589482267, + "grad_norm": 1.0924632549285889, + "learning_rate": 0.0007420080853376817, + "loss": 3.6812, + "step": 30390 + }, + { + "epoch": 2.0651583095529285, + "grad_norm": 0.9164811968803406, + "learning_rate": 0.0007419656203288491, + "loss": 3.6807, + "step": 30395 + }, + { + "epoch": 2.06549802962359, + "grad_norm": 0.9681535959243774, + "learning_rate": 0.0007419231553200163, + "loss": 3.3651, + "step": 30400 + }, + { + "epoch": 2.0658377496942517, + "grad_norm": 0.9103661179542542, + "learning_rate": 0.0007418806903111835, + "loss": 3.4421, + "step": 30405 + }, + { + "epoch": 2.066177469764914, + "grad_norm": 0.8094600439071655, + "learning_rate": 0.0007418382253023509, + "loss": 3.6142, + "step": 30410 + }, + { + "epoch": 2.0665171898355754, + "grad_norm": 0.7640501856803894, + "learning_rate": 0.0007417957602935181, + "loss": 3.492, + "step": 30415 + }, + { + "epoch": 2.066856909906237, + "grad_norm": 0.9360306859016418, + "learning_rate": 0.0007417532952846854, + "loss": 3.3719, + "step": 30420 + }, + { + "epoch": 2.067196629976899, + "grad_norm": 0.8325222730636597, + "learning_rate": 0.0007417108302758528, + "loss": 3.5236, + "step": 30425 + }, + { + "epoch": 2.0675363500475608, + "grad_norm": 0.7637277841567993, + "learning_rate": 0.00074166836526702, + "loss": 3.5809, + "step": 30430 + }, + { + "epoch": 2.0678760701182224, + "grad_norm": 0.6084652543067932, + "learning_rate": 0.0007416259002581872, + "loss": 3.5383, + "step": 30435 + }, + { + "epoch": 2.0682157901888845, + "grad_norm": 0.7407736778259277, + "learning_rate": 0.0007415834352493545, + "loss": 3.773, + "step": 30440 + }, + { + "epoch": 2.068555510259546, + "grad_norm": 0.9711830615997314, + "learning_rate": 0.0007415409702405218, + "loss": 3.6814, + "step": 30445 + }, + { + "epoch": 2.0688952303302077, + "grad_norm": 0.715211033821106, + "learning_rate": 0.0007414985052316891, + "loss": 3.682, + "step": 30450 + }, + { + "epoch": 2.06923495040087, + "grad_norm": 1.0755252838134766, + "learning_rate": 0.0007414560402228564, + "loss": 3.3715, + "step": 30455 + }, + { + "epoch": 2.0695746704715314, + "grad_norm": 0.9661974310874939, + "learning_rate": 0.0007414135752140237, + "loss": 3.6247, + "step": 30460 + }, + { + "epoch": 2.069914390542193, + "grad_norm": 1.1175658702850342, + "learning_rate": 0.000741371110205191, + "loss": 3.5978, + "step": 30465 + }, + { + "epoch": 2.070254110612855, + "grad_norm": 1.00173819065094, + "learning_rate": 0.0007413286451963582, + "loss": 3.5078, + "step": 30470 + }, + { + "epoch": 2.0705938306835168, + "grad_norm": 1.1474239826202393, + "learning_rate": 0.0007412861801875255, + "loss": 3.3521, + "step": 30475 + }, + { + "epoch": 2.0709335507541784, + "grad_norm": 1.1081422567367554, + "learning_rate": 0.0007412437151786928, + "loss": 3.5634, + "step": 30480 + }, + { + "epoch": 2.0712732708248405, + "grad_norm": 0.8808743357658386, + "learning_rate": 0.00074120125016986, + "loss": 3.3502, + "step": 30485 + }, + { + "epoch": 2.071612990895502, + "grad_norm": 0.8398902416229248, + "learning_rate": 0.0007411587851610273, + "loss": 3.5701, + "step": 30490 + }, + { + "epoch": 2.0719527109661637, + "grad_norm": 0.8672765493392944, + "learning_rate": 0.0007411163201521947, + "loss": 3.6105, + "step": 30495 + }, + { + "epoch": 2.072292431036826, + "grad_norm": 0.8642379641532898, + "learning_rate": 0.0007410738551433619, + "loss": 3.3032, + "step": 30500 + }, + { + "epoch": 2.0726321511074874, + "grad_norm": 0.8073281049728394, + "learning_rate": 0.0007410313901345291, + "loss": 3.7156, + "step": 30505 + }, + { + "epoch": 2.072971871178149, + "grad_norm": 0.9139173626899719, + "learning_rate": 0.0007409889251256965, + "loss": 3.4618, + "step": 30510 + }, + { + "epoch": 2.073311591248811, + "grad_norm": 1.034740686416626, + "learning_rate": 0.0007409464601168637, + "loss": 3.4183, + "step": 30515 + }, + { + "epoch": 2.073651311319473, + "grad_norm": 0.7986640334129333, + "learning_rate": 0.0007409039951080309, + "loss": 3.5872, + "step": 30520 + }, + { + "epoch": 2.0739910313901344, + "grad_norm": 0.7943809032440186, + "learning_rate": 0.0007408615300991984, + "loss": 3.772, + "step": 30525 + }, + { + "epoch": 2.0743307514607965, + "grad_norm": 0.962866485118866, + "learning_rate": 0.0007408190650903656, + "loss": 3.6038, + "step": 30530 + }, + { + "epoch": 2.074670471531458, + "grad_norm": 0.8748966455459595, + "learning_rate": 0.0007407766000815328, + "loss": 3.3363, + "step": 30535 + }, + { + "epoch": 2.0750101916021197, + "grad_norm": 0.9392115473747253, + "learning_rate": 0.0007407341350727001, + "loss": 3.3561, + "step": 30540 + }, + { + "epoch": 2.075349911672782, + "grad_norm": 0.7187319397926331, + "learning_rate": 0.0007406916700638674, + "loss": 3.5051, + "step": 30545 + }, + { + "epoch": 2.0756896317434435, + "grad_norm": 0.8472775220870972, + "learning_rate": 0.0007406492050550346, + "loss": 3.5223, + "step": 30550 + }, + { + "epoch": 2.076029351814105, + "grad_norm": 0.8085153102874756, + "learning_rate": 0.0007406067400462019, + "loss": 3.6069, + "step": 30555 + }, + { + "epoch": 2.076369071884767, + "grad_norm": 1.2342640161514282, + "learning_rate": 0.0007405642750373693, + "loss": 3.4781, + "step": 30560 + }, + { + "epoch": 2.076708791955429, + "grad_norm": 0.9130920767784119, + "learning_rate": 0.0007405218100285365, + "loss": 3.5654, + "step": 30565 + }, + { + "epoch": 2.0770485120260904, + "grad_norm": 0.8804427981376648, + "learning_rate": 0.0007404793450197038, + "loss": 3.3736, + "step": 30570 + }, + { + "epoch": 2.0773882320967525, + "grad_norm": 0.812127411365509, + "learning_rate": 0.0007404368800108711, + "loss": 3.6136, + "step": 30575 + }, + { + "epoch": 2.077727952167414, + "grad_norm": 0.8327321410179138, + "learning_rate": 0.0007403944150020383, + "loss": 3.6335, + "step": 30580 + }, + { + "epoch": 2.0780676722380758, + "grad_norm": 1.0503300428390503, + "learning_rate": 0.0007403519499932056, + "loss": 3.4558, + "step": 30585 + }, + { + "epoch": 2.0784073923087374, + "grad_norm": 0.8428360819816589, + "learning_rate": 0.0007403094849843728, + "loss": 3.4203, + "step": 30590 + }, + { + "epoch": 2.0787471123793995, + "grad_norm": 0.9969569444656372, + "learning_rate": 0.0007402670199755402, + "loss": 3.4251, + "step": 30595 + }, + { + "epoch": 2.079086832450061, + "grad_norm": 0.94118332862854, + "learning_rate": 0.0007402245549667075, + "loss": 3.6303, + "step": 30600 + }, + { + "epoch": 2.0794265525207227, + "grad_norm": 0.863295316696167, + "learning_rate": 0.0007401820899578747, + "loss": 3.7417, + "step": 30605 + }, + { + "epoch": 2.079766272591385, + "grad_norm": 0.7934759855270386, + "learning_rate": 0.000740139624949042, + "loss": 3.7424, + "step": 30610 + }, + { + "epoch": 2.0801059926620464, + "grad_norm": 0.8345733284950256, + "learning_rate": 0.0007400971599402093, + "loss": 3.5665, + "step": 30615 + }, + { + "epoch": 2.080445712732708, + "grad_norm": 0.9138842821121216, + "learning_rate": 0.0007400546949313765, + "loss": 3.3629, + "step": 30620 + }, + { + "epoch": 2.08078543280337, + "grad_norm": 0.7476775646209717, + "learning_rate": 0.0007400122299225437, + "loss": 3.717, + "step": 30625 + }, + { + "epoch": 2.0811251528740318, + "grad_norm": 0.7932102084159851, + "learning_rate": 0.0007399697649137112, + "loss": 3.5699, + "step": 30630 + }, + { + "epoch": 2.0814648729446934, + "grad_norm": 0.8880126476287842, + "learning_rate": 0.0007399272999048784, + "loss": 3.4657, + "step": 30635 + }, + { + "epoch": 2.0818045930153555, + "grad_norm": 1.0008536577224731, + "learning_rate": 0.0007398848348960456, + "loss": 3.6423, + "step": 30640 + }, + { + "epoch": 2.082144313086017, + "grad_norm": 1.064678430557251, + "learning_rate": 0.000739842369887213, + "loss": 3.8951, + "step": 30645 + }, + { + "epoch": 2.0824840331566787, + "grad_norm": 0.7941216826438904, + "learning_rate": 0.0007397999048783802, + "loss": 3.7261, + "step": 30650 + }, + { + "epoch": 2.082823753227341, + "grad_norm": 0.686185359954834, + "learning_rate": 0.0007397574398695474, + "loss": 3.8276, + "step": 30655 + }, + { + "epoch": 2.0831634732980024, + "grad_norm": 0.952719509601593, + "learning_rate": 0.0007397149748607149, + "loss": 3.6261, + "step": 30660 + }, + { + "epoch": 2.083503193368664, + "grad_norm": 1.2012311220169067, + "learning_rate": 0.0007396725098518821, + "loss": 3.4665, + "step": 30665 + }, + { + "epoch": 2.083842913439326, + "grad_norm": 1.0207600593566895, + "learning_rate": 0.0007396300448430493, + "loss": 3.5631, + "step": 30670 + }, + { + "epoch": 2.084182633509988, + "grad_norm": 0.8331097960472107, + "learning_rate": 0.0007395875798342167, + "loss": 3.5366, + "step": 30675 + }, + { + "epoch": 2.0845223535806494, + "grad_norm": 0.8807751536369324, + "learning_rate": 0.0007395451148253839, + "loss": 3.5089, + "step": 30680 + }, + { + "epoch": 2.0848620736513115, + "grad_norm": 1.0322986841201782, + "learning_rate": 0.0007395026498165511, + "loss": 3.636, + "step": 30685 + }, + { + "epoch": 2.085201793721973, + "grad_norm": 0.7919819355010986, + "learning_rate": 0.0007394601848077184, + "loss": 3.3157, + "step": 30690 + }, + { + "epoch": 2.0855415137926347, + "grad_norm": 0.8510385155677795, + "learning_rate": 0.0007394177197988858, + "loss": 3.5925, + "step": 30695 + }, + { + "epoch": 2.085881233863297, + "grad_norm": 0.939411461353302, + "learning_rate": 0.000739375254790053, + "loss": 3.3948, + "step": 30700 + }, + { + "epoch": 2.0862209539339585, + "grad_norm": 1.2882685661315918, + "learning_rate": 0.0007393327897812203, + "loss": 3.4734, + "step": 30705 + }, + { + "epoch": 2.08656067400462, + "grad_norm": 1.0993708372116089, + "learning_rate": 0.0007392903247723876, + "loss": 3.5558, + "step": 30710 + }, + { + "epoch": 2.086900394075282, + "grad_norm": 0.8099491596221924, + "learning_rate": 0.0007392478597635548, + "loss": 3.5371, + "step": 30715 + }, + { + "epoch": 2.087240114145944, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0007392053947547221, + "loss": 3.6534, + "step": 30720 + }, + { + "epoch": 2.0875798342166054, + "grad_norm": 0.7583295702934265, + "learning_rate": 0.0007391629297458893, + "loss": 3.8239, + "step": 30725 + }, + { + "epoch": 2.0879195542872675, + "grad_norm": 0.7623599171638489, + "learning_rate": 0.0007391204647370567, + "loss": 3.6677, + "step": 30730 + }, + { + "epoch": 2.088259274357929, + "grad_norm": 0.8652844429016113, + "learning_rate": 0.000739077999728224, + "loss": 3.535, + "step": 30735 + }, + { + "epoch": 2.0885989944285908, + "grad_norm": 0.7543116211891174, + "learning_rate": 0.0007390355347193912, + "loss": 3.4658, + "step": 30740 + }, + { + "epoch": 2.0889387144992524, + "grad_norm": 0.7216506004333496, + "learning_rate": 0.0007389930697105585, + "loss": 3.5298, + "step": 30745 + }, + { + "epoch": 2.0892784345699145, + "grad_norm": 0.721743643283844, + "learning_rate": 0.0007389506047017258, + "loss": 3.6521, + "step": 30750 + }, + { + "epoch": 2.089618154640576, + "grad_norm": 1.4189786911010742, + "learning_rate": 0.000738908139692893, + "loss": 3.7214, + "step": 30755 + }, + { + "epoch": 2.0899578747112377, + "grad_norm": 0.860519289970398, + "learning_rate": 0.0007388656746840603, + "loss": 3.4201, + "step": 30760 + }, + { + "epoch": 2.0902975947819, + "grad_norm": 0.732560396194458, + "learning_rate": 0.0007388232096752277, + "loss": 4.1038, + "step": 30765 + }, + { + "epoch": 2.0906373148525614, + "grad_norm": 0.7037970423698425, + "learning_rate": 0.0007387807446663949, + "loss": 3.6381, + "step": 30770 + }, + { + "epoch": 2.090977034923223, + "grad_norm": 0.6780232191085815, + "learning_rate": 0.0007387382796575621, + "loss": 3.4591, + "step": 30775 + }, + { + "epoch": 2.091316754993885, + "grad_norm": 0.6782544851303101, + "learning_rate": 0.0007386958146487295, + "loss": 3.4852, + "step": 30780 + }, + { + "epoch": 2.0916564750645468, + "grad_norm": 0.8499617576599121, + "learning_rate": 0.0007386533496398967, + "loss": 3.6031, + "step": 30785 + }, + { + "epoch": 2.0919961951352084, + "grad_norm": 0.9527631402015686, + "learning_rate": 0.000738610884631064, + "loss": 3.6745, + "step": 30790 + }, + { + "epoch": 2.0923359152058705, + "grad_norm": 1.016126275062561, + "learning_rate": 0.0007385684196222313, + "loss": 3.7027, + "step": 30795 + }, + { + "epoch": 2.092675635276532, + "grad_norm": 0.6698843240737915, + "learning_rate": 0.0007385259546133986, + "loss": 3.5779, + "step": 30800 + }, + { + "epoch": 2.0930153553471937, + "grad_norm": 1.2042561769485474, + "learning_rate": 0.0007384834896045659, + "loss": 3.5227, + "step": 30805 + }, + { + "epoch": 2.093355075417856, + "grad_norm": 0.9136734008789062, + "learning_rate": 0.0007384410245957332, + "loss": 3.5577, + "step": 30810 + }, + { + "epoch": 2.0936947954885174, + "grad_norm": 0.8427501916885376, + "learning_rate": 0.0007383985595869004, + "loss": 3.706, + "step": 30815 + }, + { + "epoch": 2.094034515559179, + "grad_norm": 0.7621285319328308, + "learning_rate": 0.0007383560945780677, + "loss": 3.4684, + "step": 30820 + }, + { + "epoch": 2.094374235629841, + "grad_norm": 0.892071545124054, + "learning_rate": 0.000738313629569235, + "loss": 3.5686, + "step": 30825 + }, + { + "epoch": 2.094713955700503, + "grad_norm": 0.8584364056587219, + "learning_rate": 0.0007382711645604022, + "loss": 3.586, + "step": 30830 + }, + { + "epoch": 2.0950536757711644, + "grad_norm": 0.8609929084777832, + "learning_rate": 0.0007382286995515696, + "loss": 3.5827, + "step": 30835 + }, + { + "epoch": 2.0953933958418265, + "grad_norm": 0.8652588725090027, + "learning_rate": 0.0007381862345427368, + "loss": 3.6104, + "step": 30840 + }, + { + "epoch": 2.095733115912488, + "grad_norm": 1.130325436592102, + "learning_rate": 0.0007381437695339041, + "loss": 3.7111, + "step": 30845 + }, + { + "epoch": 2.0960728359831498, + "grad_norm": 0.7889182567596436, + "learning_rate": 0.0007381013045250714, + "loss": 3.858, + "step": 30850 + }, + { + "epoch": 2.096412556053812, + "grad_norm": 0.8236942291259766, + "learning_rate": 0.0007380588395162386, + "loss": 3.5443, + "step": 30855 + }, + { + "epoch": 2.0967522761244735, + "grad_norm": 0.8463494777679443, + "learning_rate": 0.0007380163745074059, + "loss": 3.7115, + "step": 30860 + }, + { + "epoch": 2.097091996195135, + "grad_norm": 1.2809205055236816, + "learning_rate": 0.0007379739094985732, + "loss": 3.3073, + "step": 30865 + }, + { + "epoch": 2.097431716265797, + "grad_norm": 1.4715906381607056, + "learning_rate": 0.0007379314444897405, + "loss": 3.6927, + "step": 30870 + }, + { + "epoch": 2.097771436336459, + "grad_norm": 0.7135846018791199, + "learning_rate": 0.0007378889794809078, + "loss": 3.5931, + "step": 30875 + }, + { + "epoch": 2.0981111564071204, + "grad_norm": 1.1118425130844116, + "learning_rate": 0.0007378465144720751, + "loss": 3.3935, + "step": 30880 + }, + { + "epoch": 2.0984508764777825, + "grad_norm": 0.9859788417816162, + "learning_rate": 0.0007378040494632423, + "loss": 3.7269, + "step": 30885 + }, + { + "epoch": 2.098790596548444, + "grad_norm": 0.8056640028953552, + "learning_rate": 0.0007377615844544095, + "loss": 3.6827, + "step": 30890 + }, + { + "epoch": 2.0991303166191058, + "grad_norm": 0.9969542622566223, + "learning_rate": 0.0007377191194455769, + "loss": 3.4708, + "step": 30895 + }, + { + "epoch": 2.099470036689768, + "grad_norm": 0.8984941840171814, + "learning_rate": 0.0007376766544367441, + "loss": 3.6035, + "step": 30900 + }, + { + "epoch": 2.0998097567604295, + "grad_norm": 0.8192826509475708, + "learning_rate": 0.0007376341894279114, + "loss": 3.8786, + "step": 30905 + }, + { + "epoch": 2.100149476831091, + "grad_norm": 0.8513901233673096, + "learning_rate": 0.0007375917244190788, + "loss": 3.6349, + "step": 30910 + }, + { + "epoch": 2.100489196901753, + "grad_norm": 1.0851718187332153, + "learning_rate": 0.000737549259410246, + "loss": 3.6602, + "step": 30915 + }, + { + "epoch": 2.100828916972415, + "grad_norm": 0.8214661478996277, + "learning_rate": 0.0007375067944014132, + "loss": 3.2423, + "step": 30920 + }, + { + "epoch": 2.1011686370430764, + "grad_norm": 0.9363375306129456, + "learning_rate": 0.0007374643293925806, + "loss": 3.7447, + "step": 30925 + }, + { + "epoch": 2.101508357113738, + "grad_norm": 0.7533241510391235, + "learning_rate": 0.0007374218643837478, + "loss": 3.4683, + "step": 30930 + }, + { + "epoch": 2.1018480771844, + "grad_norm": 0.9975235462188721, + "learning_rate": 0.000737379399374915, + "loss": 3.6189, + "step": 30935 + }, + { + "epoch": 2.1021877972550618, + "grad_norm": 0.9530056118965149, + "learning_rate": 0.0007373369343660824, + "loss": 3.6902, + "step": 30940 + }, + { + "epoch": 2.1025275173257234, + "grad_norm": 0.8945235013961792, + "learning_rate": 0.0007372944693572497, + "loss": 3.2253, + "step": 30945 + }, + { + "epoch": 2.1028672373963855, + "grad_norm": 0.8675315976142883, + "learning_rate": 0.0007372520043484169, + "loss": 3.4777, + "step": 30950 + }, + { + "epoch": 2.103206957467047, + "grad_norm": 1.258357286453247, + "learning_rate": 0.0007372095393395842, + "loss": 3.6835, + "step": 30955 + }, + { + "epoch": 2.1035466775377087, + "grad_norm": 0.8896135687828064, + "learning_rate": 0.0007371670743307515, + "loss": 3.8851, + "step": 30960 + }, + { + "epoch": 2.103886397608371, + "grad_norm": 0.9110943078994751, + "learning_rate": 0.0007371246093219187, + "loss": 3.5064, + "step": 30965 + }, + { + "epoch": 2.1042261176790324, + "grad_norm": 0.8549448847770691, + "learning_rate": 0.000737082144313086, + "loss": 3.3553, + "step": 30970 + }, + { + "epoch": 2.104565837749694, + "grad_norm": 0.9743663668632507, + "learning_rate": 0.0007370396793042534, + "loss": 3.5629, + "step": 30975 + }, + { + "epoch": 2.104905557820356, + "grad_norm": 2.2809581756591797, + "learning_rate": 0.0007369972142954206, + "loss": 3.4434, + "step": 30980 + }, + { + "epoch": 2.105245277891018, + "grad_norm": 0.8483846783638, + "learning_rate": 0.0007369547492865879, + "loss": 3.4622, + "step": 30985 + }, + { + "epoch": 2.1055849979616794, + "grad_norm": 0.8491222858428955, + "learning_rate": 0.0007369122842777551, + "loss": 3.561, + "step": 30990 + }, + { + "epoch": 2.1059247180323415, + "grad_norm": 0.8482919335365295, + "learning_rate": 0.0007368698192689224, + "loss": 3.7508, + "step": 30995 + }, + { + "epoch": 2.106264438103003, + "grad_norm": 0.8766744136810303, + "learning_rate": 0.0007368273542600897, + "loss": 3.672, + "step": 31000 + }, + { + "epoch": 2.1066041581736648, + "grad_norm": 0.8215848803520203, + "learning_rate": 0.0007367848892512569, + "loss": 3.5011, + "step": 31005 + }, + { + "epoch": 2.106943878244327, + "grad_norm": 0.8228998780250549, + "learning_rate": 0.0007367424242424243, + "loss": 3.6066, + "step": 31010 + }, + { + "epoch": 2.1072835983149885, + "grad_norm": 0.7706062197685242, + "learning_rate": 0.0007366999592335916, + "loss": 3.8213, + "step": 31015 + }, + { + "epoch": 2.10762331838565, + "grad_norm": 0.7274779081344604, + "learning_rate": 0.0007366574942247588, + "loss": 3.4903, + "step": 31020 + }, + { + "epoch": 2.107963038456312, + "grad_norm": 1.4515607357025146, + "learning_rate": 0.000736615029215926, + "loss": 3.5364, + "step": 31025 + }, + { + "epoch": 2.108302758526974, + "grad_norm": 0.9099046587944031, + "learning_rate": 0.0007365725642070934, + "loss": 3.629, + "step": 31030 + }, + { + "epoch": 2.1086424785976354, + "grad_norm": 1.0657747983932495, + "learning_rate": 0.0007365300991982606, + "loss": 3.3393, + "step": 31035 + }, + { + "epoch": 2.1089821986682975, + "grad_norm": 3.015028238296509, + "learning_rate": 0.0007364876341894278, + "loss": 3.5182, + "step": 31040 + }, + { + "epoch": 2.109321918738959, + "grad_norm": 0.6695683002471924, + "learning_rate": 0.0007364451691805953, + "loss": 3.7202, + "step": 31045 + }, + { + "epoch": 2.1096616388096208, + "grad_norm": 1.1665112972259521, + "learning_rate": 0.0007364027041717625, + "loss": 3.5927, + "step": 31050 + }, + { + "epoch": 2.110001358880283, + "grad_norm": 0.8083938956260681, + "learning_rate": 0.0007363602391629297, + "loss": 3.8315, + "step": 31055 + }, + { + "epoch": 2.1103410789509445, + "grad_norm": 1.115315556526184, + "learning_rate": 0.0007363177741540971, + "loss": 3.5647, + "step": 31060 + }, + { + "epoch": 2.110680799021606, + "grad_norm": 1.0565012693405151, + "learning_rate": 0.0007362753091452643, + "loss": 3.6685, + "step": 31065 + }, + { + "epoch": 2.111020519092268, + "grad_norm": 0.8967803120613098, + "learning_rate": 0.0007362328441364315, + "loss": 3.6994, + "step": 31070 + }, + { + "epoch": 2.11136023916293, + "grad_norm": 0.7697975635528564, + "learning_rate": 0.0007361903791275988, + "loss": 3.8156, + "step": 31075 + }, + { + "epoch": 2.1116999592335914, + "grad_norm": 0.9713969230651855, + "learning_rate": 0.0007361479141187662, + "loss": 3.4263, + "step": 31080 + }, + { + "epoch": 2.112039679304253, + "grad_norm": 1.405176043510437, + "learning_rate": 0.0007361054491099334, + "loss": 3.5138, + "step": 31085 + }, + { + "epoch": 2.112379399374915, + "grad_norm": 0.8167610764503479, + "learning_rate": 0.0007360629841011007, + "loss": 3.3997, + "step": 31090 + }, + { + "epoch": 2.1127191194455768, + "grad_norm": 0.6689099073410034, + "learning_rate": 0.000736020519092268, + "loss": 3.5027, + "step": 31095 + }, + { + "epoch": 2.1130588395162384, + "grad_norm": 0.7660166621208191, + "learning_rate": 0.0007359780540834352, + "loss": 3.3394, + "step": 31100 + }, + { + "epoch": 2.1133985595869005, + "grad_norm": 1.0402190685272217, + "learning_rate": 0.0007359355890746025, + "loss": 3.6069, + "step": 31105 + }, + { + "epoch": 2.113738279657562, + "grad_norm": 0.9853819012641907, + "learning_rate": 0.0007358931240657698, + "loss": 3.7318, + "step": 31110 + }, + { + "epoch": 2.1140779997282237, + "grad_norm": 0.691737174987793, + "learning_rate": 0.0007358506590569371, + "loss": 3.3548, + "step": 31115 + }, + { + "epoch": 2.114417719798886, + "grad_norm": 0.7252996563911438, + "learning_rate": 0.0007358081940481044, + "loss": 3.5706, + "step": 31120 + }, + { + "epoch": 2.1147574398695475, + "grad_norm": 0.8472908139228821, + "learning_rate": 0.0007357657290392716, + "loss": 3.8135, + "step": 31125 + }, + { + "epoch": 2.115097159940209, + "grad_norm": 0.8574680089950562, + "learning_rate": 0.000735723264030439, + "loss": 3.4398, + "step": 31130 + }, + { + "epoch": 2.115436880010871, + "grad_norm": 0.9372795820236206, + "learning_rate": 0.0007356807990216062, + "loss": 3.227, + "step": 31135 + }, + { + "epoch": 2.115776600081533, + "grad_norm": 0.9379299283027649, + "learning_rate": 0.0007356383340127734, + "loss": 3.364, + "step": 31140 + }, + { + "epoch": 2.1161163201521944, + "grad_norm": 0.8611903190612793, + "learning_rate": 0.0007355958690039408, + "loss": 3.7983, + "step": 31145 + }, + { + "epoch": 2.1164560402228565, + "grad_norm": 0.7524352669715881, + "learning_rate": 0.0007355534039951081, + "loss": 3.9146, + "step": 31150 + }, + { + "epoch": 2.116795760293518, + "grad_norm": 0.8491563200950623, + "learning_rate": 0.0007355109389862753, + "loss": 3.285, + "step": 31155 + }, + { + "epoch": 2.1171354803641798, + "grad_norm": 1.0011483430862427, + "learning_rate": 0.0007354684739774427, + "loss": 3.2898, + "step": 31160 + }, + { + "epoch": 2.117475200434842, + "grad_norm": 0.81803959608078, + "learning_rate": 0.0007354260089686099, + "loss": 3.7118, + "step": 31165 + }, + { + "epoch": 2.1178149205055035, + "grad_norm": 0.9882153272628784, + "learning_rate": 0.0007353835439597771, + "loss": 3.4839, + "step": 31170 + }, + { + "epoch": 2.118154640576165, + "grad_norm": 0.7708590030670166, + "learning_rate": 0.0007353410789509444, + "loss": 3.6435, + "step": 31175 + }, + { + "epoch": 2.118494360646827, + "grad_norm": 0.8206771016120911, + "learning_rate": 0.0007352986139421117, + "loss": 3.6188, + "step": 31180 + }, + { + "epoch": 2.118834080717489, + "grad_norm": 0.8005907535552979, + "learning_rate": 0.000735256148933279, + "loss": 3.4603, + "step": 31185 + }, + { + "epoch": 2.1191738007881504, + "grad_norm": 0.8151422739028931, + "learning_rate": 0.0007352136839244463, + "loss": 3.4768, + "step": 31190 + }, + { + "epoch": 2.1195135208588125, + "grad_norm": 2.0192935466766357, + "learning_rate": 0.0007351712189156136, + "loss": 3.4337, + "step": 31195 + }, + { + "epoch": 2.119853240929474, + "grad_norm": 0.9371762871742249, + "learning_rate": 0.0007351287539067808, + "loss": 3.685, + "step": 31200 + }, + { + "epoch": 2.1201929610001358, + "grad_norm": 0.8898231387138367, + "learning_rate": 0.0007350862888979481, + "loss": 3.7492, + "step": 31205 + }, + { + "epoch": 2.120532681070798, + "grad_norm": 0.8826519250869751, + "learning_rate": 0.0007350438238891154, + "loss": 3.4267, + "step": 31210 + }, + { + "epoch": 2.1208724011414595, + "grad_norm": 0.9330217838287354, + "learning_rate": 0.0007350013588802826, + "loss": 3.6674, + "step": 31215 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.8600148558616638, + "learning_rate": 0.00073495889387145, + "loss": 3.5621, + "step": 31220 + }, + { + "epoch": 2.121551841282783, + "grad_norm": 1.6930791139602661, + "learning_rate": 0.0007349164288626172, + "loss": 3.5019, + "step": 31225 + }, + { + "epoch": 2.121891561353445, + "grad_norm": 0.8379348516464233, + "learning_rate": 0.0007348739638537845, + "loss": 3.6595, + "step": 31230 + }, + { + "epoch": 2.1222312814241064, + "grad_norm": 0.8995645642280579, + "learning_rate": 0.0007348314988449518, + "loss": 3.5776, + "step": 31235 + }, + { + "epoch": 2.1225710014947685, + "grad_norm": 0.8795369267463684, + "learning_rate": 0.000734789033836119, + "loss": 3.5648, + "step": 31240 + }, + { + "epoch": 2.12291072156543, + "grad_norm": 0.8750913739204407, + "learning_rate": 0.0007347465688272863, + "loss": 3.6559, + "step": 31245 + }, + { + "epoch": 2.123250441636092, + "grad_norm": 0.9743653535842896, + "learning_rate": 0.0007347041038184537, + "loss": 3.7378, + "step": 31250 + }, + { + "epoch": 2.123590161706754, + "grad_norm": 1.343794584274292, + "learning_rate": 0.0007346616388096209, + "loss": 3.7254, + "step": 31255 + }, + { + "epoch": 2.1239298817774155, + "grad_norm": 1.584986925125122, + "learning_rate": 0.0007346191738007882, + "loss": 3.4849, + "step": 31260 + }, + { + "epoch": 2.124269601848077, + "grad_norm": 1.1060492992401123, + "learning_rate": 0.0007345767087919555, + "loss": 3.3046, + "step": 31265 + }, + { + "epoch": 2.1246093219187387, + "grad_norm": 0.9543876647949219, + "learning_rate": 0.0007345342437831227, + "loss": 3.4936, + "step": 31270 + }, + { + "epoch": 2.124949041989401, + "grad_norm": 0.8728023171424866, + "learning_rate": 0.0007344917787742899, + "loss": 3.286, + "step": 31275 + }, + { + "epoch": 2.1252887620600625, + "grad_norm": 0.9148169755935669, + "learning_rate": 0.0007344493137654573, + "loss": 3.5997, + "step": 31280 + }, + { + "epoch": 2.125628482130724, + "grad_norm": 0.8533278107643127, + "learning_rate": 0.0007344068487566246, + "loss": 3.6475, + "step": 31285 + }, + { + "epoch": 2.125968202201386, + "grad_norm": 0.9598996043205261, + "learning_rate": 0.0007343643837477918, + "loss": 3.608, + "step": 31290 + }, + { + "epoch": 2.126307922272048, + "grad_norm": 0.725835382938385, + "learning_rate": 0.0007343219187389592, + "loss": 3.0645, + "step": 31295 + }, + { + "epoch": 2.1266476423427094, + "grad_norm": 0.920544445514679, + "learning_rate": 0.0007342794537301264, + "loss": 3.4105, + "step": 31300 + }, + { + "epoch": 2.1269873624133715, + "grad_norm": 0.975378155708313, + "learning_rate": 0.0007342369887212936, + "loss": 3.3974, + "step": 31305 + }, + { + "epoch": 2.127327082484033, + "grad_norm": 0.7979410886764526, + "learning_rate": 0.000734194523712461, + "loss": 3.4136, + "step": 31310 + }, + { + "epoch": 2.1276668025546948, + "grad_norm": 0.7854820489883423, + "learning_rate": 0.0007341520587036282, + "loss": 3.4941, + "step": 31315 + }, + { + "epoch": 2.128006522625357, + "grad_norm": 0.9981824159622192, + "learning_rate": 0.0007341095936947955, + "loss": 3.5098, + "step": 31320 + }, + { + "epoch": 2.1283462426960185, + "grad_norm": 1.0388274192810059, + "learning_rate": 0.0007340671286859629, + "loss": 3.4716, + "step": 31325 + }, + { + "epoch": 2.12868596276668, + "grad_norm": 1.030381441116333, + "learning_rate": 0.0007340246636771301, + "loss": 3.5275, + "step": 31330 + }, + { + "epoch": 2.129025682837342, + "grad_norm": 0.8292697072029114, + "learning_rate": 0.0007339821986682973, + "loss": 3.7489, + "step": 31335 + }, + { + "epoch": 2.129365402908004, + "grad_norm": 1.052839756011963, + "learning_rate": 0.0007339397336594646, + "loss": 3.5305, + "step": 31340 + }, + { + "epoch": 2.1297051229786654, + "grad_norm": 1.652316927909851, + "learning_rate": 0.0007338972686506319, + "loss": 3.4869, + "step": 31345 + }, + { + "epoch": 2.1300448430493275, + "grad_norm": 0.8049498200416565, + "learning_rate": 0.0007338548036417991, + "loss": 3.6875, + "step": 31350 + }, + { + "epoch": 2.130384563119989, + "grad_norm": 0.882392942905426, + "learning_rate": 0.0007338123386329665, + "loss": 3.5531, + "step": 31355 + }, + { + "epoch": 2.1307242831906508, + "grad_norm": 0.8292747735977173, + "learning_rate": 0.0007337698736241338, + "loss": 3.3364, + "step": 31360 + }, + { + "epoch": 2.131064003261313, + "grad_norm": 0.8095467686653137, + "learning_rate": 0.000733727408615301, + "loss": 3.5131, + "step": 31365 + }, + { + "epoch": 2.1314037233319745, + "grad_norm": 0.7895323634147644, + "learning_rate": 0.0007336849436064683, + "loss": 3.533, + "step": 31370 + }, + { + "epoch": 2.131743443402636, + "grad_norm": 1.0993053913116455, + "learning_rate": 0.0007336424785976355, + "loss": 3.5485, + "step": 31375 + }, + { + "epoch": 2.132083163473298, + "grad_norm": 1.3468267917633057, + "learning_rate": 0.0007336000135888028, + "loss": 3.5004, + "step": 31380 + }, + { + "epoch": 2.13242288354396, + "grad_norm": 0.7504871487617493, + "learning_rate": 0.0007335575485799701, + "loss": 3.6349, + "step": 31385 + }, + { + "epoch": 2.1327626036146214, + "grad_norm": 1.1408835649490356, + "learning_rate": 0.0007335150835711374, + "loss": 3.7644, + "step": 31390 + }, + { + "epoch": 2.1331023236852835, + "grad_norm": 1.40465247631073, + "learning_rate": 0.0007334726185623047, + "loss": 3.366, + "step": 31395 + }, + { + "epoch": 2.133442043755945, + "grad_norm": 0.795600175857544, + "learning_rate": 0.000733430153553472, + "loss": 3.6133, + "step": 31400 + }, + { + "epoch": 2.133781763826607, + "grad_norm": 0.9059250950813293, + "learning_rate": 0.0007333876885446392, + "loss": 3.4073, + "step": 31405 + }, + { + "epoch": 2.134121483897269, + "grad_norm": 1.4748221635818481, + "learning_rate": 0.0007333452235358064, + "loss": 3.7058, + "step": 31410 + }, + { + "epoch": 2.1344612039679305, + "grad_norm": 1.1152546405792236, + "learning_rate": 0.0007333027585269738, + "loss": 3.5509, + "step": 31415 + }, + { + "epoch": 2.134800924038592, + "grad_norm": 0.9082005620002747, + "learning_rate": 0.000733260293518141, + "loss": 3.6149, + "step": 31420 + }, + { + "epoch": 2.1351406441092537, + "grad_norm": 0.9707046747207642, + "learning_rate": 0.0007332178285093083, + "loss": 3.7276, + "step": 31425 + }, + { + "epoch": 2.135480364179916, + "grad_norm": 1.096178650856018, + "learning_rate": 0.0007331753635004757, + "loss": 3.6408, + "step": 31430 + }, + { + "epoch": 2.1358200842505775, + "grad_norm": 0.9914501309394836, + "learning_rate": 0.0007331328984916429, + "loss": 3.4186, + "step": 31435 + }, + { + "epoch": 2.136159804321239, + "grad_norm": 1.311926007270813, + "learning_rate": 0.0007330904334828101, + "loss": 3.5823, + "step": 31440 + }, + { + "epoch": 2.136499524391901, + "grad_norm": 0.8214457631111145, + "learning_rate": 0.0007330479684739775, + "loss": 3.3105, + "step": 31445 + }, + { + "epoch": 2.136839244462563, + "grad_norm": 0.8366312980651855, + "learning_rate": 0.0007330055034651447, + "loss": 3.5117, + "step": 31450 + }, + { + "epoch": 2.1371789645332244, + "grad_norm": 0.9170657396316528, + "learning_rate": 0.0007329630384563119, + "loss": 3.5539, + "step": 31455 + }, + { + "epoch": 2.1375186846038865, + "grad_norm": 0.7330713868141174, + "learning_rate": 0.0007329205734474794, + "loss": 3.5146, + "step": 31460 + }, + { + "epoch": 2.137858404674548, + "grad_norm": 0.7009264230728149, + "learning_rate": 0.0007328781084386466, + "loss": 3.3655, + "step": 31465 + }, + { + "epoch": 2.1381981247452098, + "grad_norm": 1.1062251329421997, + "learning_rate": 0.0007328356434298139, + "loss": 3.2431, + "step": 31470 + }, + { + "epoch": 2.138537844815872, + "grad_norm": 0.8082361817359924, + "learning_rate": 0.0007327931784209811, + "loss": 3.4341, + "step": 31475 + }, + { + "epoch": 2.1388775648865335, + "grad_norm": 0.8770529627799988, + "learning_rate": 0.0007327507134121484, + "loss": 3.3258, + "step": 31480 + }, + { + "epoch": 2.139217284957195, + "grad_norm": 0.9274192452430725, + "learning_rate": 0.0007327082484033157, + "loss": 3.5893, + "step": 31485 + }, + { + "epoch": 2.139557005027857, + "grad_norm": 1.61424720287323, + "learning_rate": 0.0007326657833944829, + "loss": 3.7184, + "step": 31490 + }, + { + "epoch": 2.139896725098519, + "grad_norm": 1.203805923461914, + "learning_rate": 0.0007326233183856503, + "loss": 3.458, + "step": 31495 + }, + { + "epoch": 2.1402364451691804, + "grad_norm": 0.829584538936615, + "learning_rate": 0.0007325808533768176, + "loss": 3.3553, + "step": 31500 + }, + { + "epoch": 2.1405761652398425, + "grad_norm": 0.8424960374832153, + "learning_rate": 0.0007325383883679848, + "loss": 3.4785, + "step": 31505 + }, + { + "epoch": 2.140915885310504, + "grad_norm": 1.0579111576080322, + "learning_rate": 0.000732495923359152, + "loss": 3.4292, + "step": 31510 + }, + { + "epoch": 2.1412556053811658, + "grad_norm": 0.8693766593933105, + "learning_rate": 0.0007324534583503194, + "loss": 3.5012, + "step": 31515 + }, + { + "epoch": 2.141595325451828, + "grad_norm": 0.8167588710784912, + "learning_rate": 0.0007324109933414866, + "loss": 3.525, + "step": 31520 + }, + { + "epoch": 2.1419350455224895, + "grad_norm": 1.5498089790344238, + "learning_rate": 0.0007323685283326538, + "loss": 3.2574, + "step": 31525 + }, + { + "epoch": 2.142274765593151, + "grad_norm": 1.0586625337600708, + "learning_rate": 0.0007323260633238213, + "loss": 3.689, + "step": 31530 + }, + { + "epoch": 2.142614485663813, + "grad_norm": 0.8784122467041016, + "learning_rate": 0.0007322835983149885, + "loss": 3.557, + "step": 31535 + }, + { + "epoch": 2.142954205734475, + "grad_norm": 0.7802386283874512, + "learning_rate": 0.0007322411333061557, + "loss": 3.6088, + "step": 31540 + }, + { + "epoch": 2.1432939258051364, + "grad_norm": 1.2390096187591553, + "learning_rate": 0.0007321986682973231, + "loss": 3.4138, + "step": 31545 + }, + { + "epoch": 2.1436336458757985, + "grad_norm": 0.8008242249488831, + "learning_rate": 0.0007321562032884903, + "loss": 3.7866, + "step": 31550 + }, + { + "epoch": 2.14397336594646, + "grad_norm": 1.1589750051498413, + "learning_rate": 0.0007321137382796575, + "loss": 3.6594, + "step": 31555 + }, + { + "epoch": 2.144313086017122, + "grad_norm": 0.9538896679878235, + "learning_rate": 0.0007320712732708249, + "loss": 3.8055, + "step": 31560 + }, + { + "epoch": 2.144652806087784, + "grad_norm": 1.8019084930419922, + "learning_rate": 0.0007320288082619922, + "loss": 3.6573, + "step": 31565 + }, + { + "epoch": 2.1449925261584455, + "grad_norm": 0.8619280457496643, + "learning_rate": 0.0007319863432531594, + "loss": 3.4616, + "step": 31570 + }, + { + "epoch": 2.145332246229107, + "grad_norm": 0.9094575047492981, + "learning_rate": 0.0007319438782443267, + "loss": 3.6564, + "step": 31575 + }, + { + "epoch": 2.145671966299769, + "grad_norm": 1.7374048233032227, + "learning_rate": 0.000731901413235494, + "loss": 3.495, + "step": 31580 + }, + { + "epoch": 2.146011686370431, + "grad_norm": 0.9664519429206848, + "learning_rate": 0.0007318589482266612, + "loss": 3.5405, + "step": 31585 + }, + { + "epoch": 2.1463514064410925, + "grad_norm": 0.8580097556114197, + "learning_rate": 0.0007318164832178285, + "loss": 3.5762, + "step": 31590 + }, + { + "epoch": 2.1466911265117545, + "grad_norm": 1.2702449560165405, + "learning_rate": 0.0007317740182089958, + "loss": 3.4037, + "step": 31595 + }, + { + "epoch": 2.147030846582416, + "grad_norm": 0.7517059445381165, + "learning_rate": 0.0007317315532001631, + "loss": 3.5174, + "step": 31600 + }, + { + "epoch": 2.147370566653078, + "grad_norm": 2.1915338039398193, + "learning_rate": 0.0007316890881913304, + "loss": 3.3398, + "step": 31605 + }, + { + "epoch": 2.14771028672374, + "grad_norm": 1.0582964420318604, + "learning_rate": 0.0007316466231824977, + "loss": 3.3715, + "step": 31610 + }, + { + "epoch": 2.1480500067944015, + "grad_norm": 0.8231784701347351, + "learning_rate": 0.0007316041581736649, + "loss": 3.6465, + "step": 31615 + }, + { + "epoch": 2.148389726865063, + "grad_norm": 0.9748526811599731, + "learning_rate": 0.0007315616931648322, + "loss": 3.5525, + "step": 31620 + }, + { + "epoch": 2.1487294469357248, + "grad_norm": 1.2537816762924194, + "learning_rate": 0.0007315192281559994, + "loss": 3.6803, + "step": 31625 + }, + { + "epoch": 2.149069167006387, + "grad_norm": 1.0169610977172852, + "learning_rate": 0.0007314767631471667, + "loss": 3.5336, + "step": 31630 + }, + { + "epoch": 2.1494088870770485, + "grad_norm": 1.194810390472412, + "learning_rate": 0.0007314342981383341, + "loss": 3.4672, + "step": 31635 + }, + { + "epoch": 2.14974860714771, + "grad_norm": 1.0083627700805664, + "learning_rate": 0.0007313918331295013, + "loss": 3.6682, + "step": 31640 + }, + { + "epoch": 2.150088327218372, + "grad_norm": 0.8820313215255737, + "learning_rate": 0.0007313493681206686, + "loss": 3.4916, + "step": 31645 + }, + { + "epoch": 2.150428047289034, + "grad_norm": 0.882696270942688, + "learning_rate": 0.0007313069031118359, + "loss": 3.7724, + "step": 31650 + }, + { + "epoch": 2.1507677673596954, + "grad_norm": 0.719761073589325, + "learning_rate": 0.0007312644381030031, + "loss": 3.7158, + "step": 31655 + }, + { + "epoch": 2.1511074874303575, + "grad_norm": 0.8802604079246521, + "learning_rate": 0.0007312219730941703, + "loss": 3.7319, + "step": 31660 + }, + { + "epoch": 2.151447207501019, + "grad_norm": 0.7812095880508423, + "learning_rate": 0.0007311795080853377, + "loss": 3.6805, + "step": 31665 + }, + { + "epoch": 2.1517869275716808, + "grad_norm": 0.7553337812423706, + "learning_rate": 0.000731137043076505, + "loss": 3.6494, + "step": 31670 + }, + { + "epoch": 2.152126647642343, + "grad_norm": 1.4031389951705933, + "learning_rate": 0.0007310945780676722, + "loss": 3.6948, + "step": 31675 + }, + { + "epoch": 2.1524663677130045, + "grad_norm": 1.073222041130066, + "learning_rate": 0.0007310521130588396, + "loss": 3.4217, + "step": 31680 + }, + { + "epoch": 2.152806087783666, + "grad_norm": 0.9537948966026306, + "learning_rate": 0.0007310096480500068, + "loss": 3.5364, + "step": 31685 + }, + { + "epoch": 2.153145807854328, + "grad_norm": 2.8420157432556152, + "learning_rate": 0.000730967183041174, + "loss": 3.7812, + "step": 31690 + }, + { + "epoch": 2.15348552792499, + "grad_norm": 0.9436175227165222, + "learning_rate": 0.0007309247180323414, + "loss": 3.1447, + "step": 31695 + }, + { + "epoch": 2.1538252479956514, + "grad_norm": 0.9007399678230286, + "learning_rate": 0.0007308822530235086, + "loss": 3.589, + "step": 31700 + }, + { + "epoch": 2.1541649680663135, + "grad_norm": 1.1251952648162842, + "learning_rate": 0.0007308397880146759, + "loss": 3.612, + "step": 31705 + }, + { + "epoch": 2.154504688136975, + "grad_norm": 0.8047134280204773, + "learning_rate": 0.0007307973230058433, + "loss": 3.5687, + "step": 31710 + }, + { + "epoch": 2.154844408207637, + "grad_norm": 0.670504629611969, + "learning_rate": 0.0007307548579970105, + "loss": 3.4357, + "step": 31715 + }, + { + "epoch": 2.155184128278299, + "grad_norm": 1.0916495323181152, + "learning_rate": 0.0007307123929881777, + "loss": 3.551, + "step": 31720 + }, + { + "epoch": 2.1555238483489605, + "grad_norm": 0.8753914833068848, + "learning_rate": 0.000730669927979345, + "loss": 3.4751, + "step": 31725 + }, + { + "epoch": 2.155863568419622, + "grad_norm": 3.841430425643921, + "learning_rate": 0.0007306274629705123, + "loss": 3.7646, + "step": 31730 + }, + { + "epoch": 2.156203288490284, + "grad_norm": 1.0921698808670044, + "learning_rate": 0.0007305849979616795, + "loss": 3.2486, + "step": 31735 + }, + { + "epoch": 2.156543008560946, + "grad_norm": 0.973772406578064, + "learning_rate": 0.0007305425329528469, + "loss": 3.7207, + "step": 31740 + }, + { + "epoch": 2.1568827286316075, + "grad_norm": 1.1560883522033691, + "learning_rate": 0.0007305000679440142, + "loss": 3.4891, + "step": 31745 + }, + { + "epoch": 2.1572224487022695, + "grad_norm": 0.8242524862289429, + "learning_rate": 0.0007304576029351814, + "loss": 3.2145, + "step": 31750 + }, + { + "epoch": 2.157562168772931, + "grad_norm": 0.8720992803573608, + "learning_rate": 0.0007304151379263487, + "loss": 3.5504, + "step": 31755 + }, + { + "epoch": 2.157901888843593, + "grad_norm": 0.840972900390625, + "learning_rate": 0.0007303726729175159, + "loss": 3.5541, + "step": 31760 + }, + { + "epoch": 2.1582416089142544, + "grad_norm": 0.920417308807373, + "learning_rate": 0.0007303302079086832, + "loss": 3.4486, + "step": 31765 + }, + { + "epoch": 2.1585813289849165, + "grad_norm": 0.8031994104385376, + "learning_rate": 0.0007302877428998506, + "loss": 3.712, + "step": 31770 + }, + { + "epoch": 2.158921049055578, + "grad_norm": 1.2314672470092773, + "learning_rate": 0.0007302452778910178, + "loss": 3.32, + "step": 31775 + }, + { + "epoch": 2.1592607691262398, + "grad_norm": 1.062441349029541, + "learning_rate": 0.0007302028128821851, + "loss": 3.4706, + "step": 31780 + }, + { + "epoch": 2.159600489196902, + "grad_norm": 1.2554163932800293, + "learning_rate": 0.0007301603478733524, + "loss": 3.5039, + "step": 31785 + }, + { + "epoch": 2.1599402092675635, + "grad_norm": 0.8698097467422485, + "learning_rate": 0.0007301178828645196, + "loss": 3.8318, + "step": 31790 + }, + { + "epoch": 2.160279929338225, + "grad_norm": 0.8038329482078552, + "learning_rate": 0.0007300754178556869, + "loss": 3.6258, + "step": 31795 + }, + { + "epoch": 2.160619649408887, + "grad_norm": 0.8952566385269165, + "learning_rate": 0.0007300329528468542, + "loss": 3.3754, + "step": 31800 + }, + { + "epoch": 2.160959369479549, + "grad_norm": 1.155394196510315, + "learning_rate": 0.0007299904878380215, + "loss": 3.6947, + "step": 31805 + }, + { + "epoch": 2.1612990895502104, + "grad_norm": 0.9187846183776855, + "learning_rate": 0.0007299480228291889, + "loss": 3.8803, + "step": 31810 + }, + { + "epoch": 2.1616388096208725, + "grad_norm": 0.8060274124145508, + "learning_rate": 0.0007299055578203561, + "loss": 3.6745, + "step": 31815 + }, + { + "epoch": 2.161978529691534, + "grad_norm": 0.8383626937866211, + "learning_rate": 0.0007298630928115233, + "loss": 3.5325, + "step": 31820 + }, + { + "epoch": 2.1623182497621958, + "grad_norm": 0.8567686080932617, + "learning_rate": 0.0007298206278026906, + "loss": 3.7039, + "step": 31825 + }, + { + "epoch": 2.162657969832858, + "grad_norm": 1.13080894947052, + "learning_rate": 0.0007297781627938579, + "loss": 3.5583, + "step": 31830 + }, + { + "epoch": 2.1629976899035195, + "grad_norm": 1.0270707607269287, + "learning_rate": 0.0007297356977850251, + "loss": 3.5018, + "step": 31835 + }, + { + "epoch": 2.163337409974181, + "grad_norm": 0.8833244442939758, + "learning_rate": 0.0007296932327761925, + "loss": 3.8383, + "step": 31840 + }, + { + "epoch": 2.163677130044843, + "grad_norm": 0.8378016948699951, + "learning_rate": 0.0007296507677673598, + "loss": 3.7927, + "step": 31845 + }, + { + "epoch": 2.164016850115505, + "grad_norm": 1.0152708292007446, + "learning_rate": 0.000729608302758527, + "loss": 3.532, + "step": 31850 + }, + { + "epoch": 2.1643565701861665, + "grad_norm": 0.6729992032051086, + "learning_rate": 0.0007295658377496943, + "loss": 3.6485, + "step": 31855 + }, + { + "epoch": 2.1646962902568285, + "grad_norm": 1.37112295627594, + "learning_rate": 0.0007295233727408615, + "loss": 3.6186, + "step": 31860 + }, + { + "epoch": 2.16503601032749, + "grad_norm": 0.7444682717323303, + "learning_rate": 0.0007294809077320288, + "loss": 3.7081, + "step": 31865 + }, + { + "epoch": 2.165375730398152, + "grad_norm": 0.9767795205116272, + "learning_rate": 0.0007294384427231961, + "loss": 3.6454, + "step": 31870 + }, + { + "epoch": 2.165715450468814, + "grad_norm": 0.8461366891860962, + "learning_rate": 0.0007293959777143634, + "loss": 3.5796, + "step": 31875 + }, + { + "epoch": 2.1660551705394755, + "grad_norm": 1.0819379091262817, + "learning_rate": 0.0007293535127055307, + "loss": 3.3638, + "step": 31880 + }, + { + "epoch": 2.166394890610137, + "grad_norm": 0.6983944773674011, + "learning_rate": 0.000729311047696698, + "loss": 3.7548, + "step": 31885 + }, + { + "epoch": 2.166734610680799, + "grad_norm": 0.7234395742416382, + "learning_rate": 0.0007292685826878652, + "loss": 3.5778, + "step": 31890 + }, + { + "epoch": 2.167074330751461, + "grad_norm": 0.923786461353302, + "learning_rate": 0.0007292261176790325, + "loss": 3.5506, + "step": 31895 + }, + { + "epoch": 2.1674140508221225, + "grad_norm": 0.9486594200134277, + "learning_rate": 0.0007291836526701998, + "loss": 3.5283, + "step": 31900 + }, + { + "epoch": 2.1677537708927845, + "grad_norm": 0.7810891270637512, + "learning_rate": 0.000729141187661367, + "loss": 3.5876, + "step": 31905 + }, + { + "epoch": 2.168093490963446, + "grad_norm": 0.9325295090675354, + "learning_rate": 0.0007290987226525343, + "loss": 3.6761, + "step": 31910 + }, + { + "epoch": 2.168433211034108, + "grad_norm": 1.7229113578796387, + "learning_rate": 0.0007290562576437017, + "loss": 3.6411, + "step": 31915 + }, + { + "epoch": 2.16877293110477, + "grad_norm": 0.9134924411773682, + "learning_rate": 0.0007290137926348689, + "loss": 3.6111, + "step": 31920 + }, + { + "epoch": 2.1691126511754315, + "grad_norm": 1.158905029296875, + "learning_rate": 0.0007289713276260361, + "loss": 3.5681, + "step": 31925 + }, + { + "epoch": 2.169452371246093, + "grad_norm": 0.8200002312660217, + "learning_rate": 0.0007289288626172035, + "loss": 3.4045, + "step": 31930 + }, + { + "epoch": 2.169792091316755, + "grad_norm": 0.786288857460022, + "learning_rate": 0.0007288863976083707, + "loss": 3.5801, + "step": 31935 + }, + { + "epoch": 2.170131811387417, + "grad_norm": 0.8139554262161255, + "learning_rate": 0.0007288439325995379, + "loss": 3.6667, + "step": 31940 + }, + { + "epoch": 2.1704715314580785, + "grad_norm": 1.410984754562378, + "learning_rate": 0.0007288014675907054, + "loss": 3.5348, + "step": 31945 + }, + { + "epoch": 2.1708112515287405, + "grad_norm": 0.9039215445518494, + "learning_rate": 0.0007287590025818726, + "loss": 3.3964, + "step": 31950 + }, + { + "epoch": 2.171150971599402, + "grad_norm": 1.9705849885940552, + "learning_rate": 0.0007287165375730398, + "loss": 3.6611, + "step": 31955 + }, + { + "epoch": 2.171490691670064, + "grad_norm": 0.793578565120697, + "learning_rate": 0.0007286740725642071, + "loss": 3.6134, + "step": 31960 + }, + { + "epoch": 2.1718304117407254, + "grad_norm": 1.0183237791061401, + "learning_rate": 0.0007286316075553744, + "loss": 3.6397, + "step": 31965 + }, + { + "epoch": 2.1721701318113875, + "grad_norm": 0.8450209498405457, + "learning_rate": 0.0007285891425465416, + "loss": 3.6022, + "step": 31970 + }, + { + "epoch": 2.172509851882049, + "grad_norm": 1.3617327213287354, + "learning_rate": 0.0007285551705394755, + "loss": 3.5653, + "step": 31975 + }, + { + "epoch": 2.172849571952711, + "grad_norm": 1.250105619430542, + "learning_rate": 0.0007285127055306428, + "loss": 3.4502, + "step": 31980 + }, + { + "epoch": 2.173189292023373, + "grad_norm": 0.8559970259666443, + "learning_rate": 0.00072847024052181, + "loss": 3.5941, + "step": 31985 + }, + { + "epoch": 2.1735290120940345, + "grad_norm": 0.9586119651794434, + "learning_rate": 0.0007284277755129773, + "loss": 3.4551, + "step": 31990 + }, + { + "epoch": 2.173868732164696, + "grad_norm": 0.8786266446113586, + "learning_rate": 0.0007283853105041445, + "loss": 3.4747, + "step": 31995 + }, + { + "epoch": 2.174208452235358, + "grad_norm": 0.8417776226997375, + "learning_rate": 0.0007283428454953118, + "loss": 3.4412, + "step": 32000 + }, + { + "epoch": 2.17454817230602, + "grad_norm": 3.2239034175872803, + "learning_rate": 0.0007283003804864792, + "loss": 3.9345, + "step": 32005 + }, + { + "epoch": 2.1748878923766815, + "grad_norm": 1.4339239597320557, + "learning_rate": 0.0007282579154776464, + "loss": 3.5164, + "step": 32010 + }, + { + "epoch": 2.1752276124473435, + "grad_norm": 0.7063984870910645, + "learning_rate": 0.0007282154504688138, + "loss": 3.5971, + "step": 32015 + }, + { + "epoch": 2.175567332518005, + "grad_norm": 1.0237220525741577, + "learning_rate": 0.000728172985459981, + "loss": 3.3858, + "step": 32020 + }, + { + "epoch": 2.175907052588667, + "grad_norm": 0.8756076693534851, + "learning_rate": 0.0007281305204511482, + "loss": 3.5947, + "step": 32025 + }, + { + "epoch": 2.176246772659329, + "grad_norm": 0.8684592843055725, + "learning_rate": 0.0007280880554423156, + "loss": 3.5995, + "step": 32030 + }, + { + "epoch": 2.1765864927299905, + "grad_norm": 0.9678959250450134, + "learning_rate": 0.0007280455904334829, + "loss": 3.6593, + "step": 32035 + }, + { + "epoch": 2.176926212800652, + "grad_norm": 0.672089159488678, + "learning_rate": 0.0007280031254246501, + "loss": 3.3436, + "step": 32040 + }, + { + "epoch": 2.177265932871314, + "grad_norm": 0.6668519973754883, + "learning_rate": 0.0007279606604158175, + "loss": 3.6518, + "step": 32045 + }, + { + "epoch": 2.177605652941976, + "grad_norm": 0.9307196140289307, + "learning_rate": 0.0007279181954069847, + "loss": 3.4278, + "step": 32050 + }, + { + "epoch": 2.1779453730126375, + "grad_norm": 1.1751656532287598, + "learning_rate": 0.0007278757303981519, + "loss": 3.4669, + "step": 32055 + }, + { + "epoch": 2.1782850930832995, + "grad_norm": 0.8706891536712646, + "learning_rate": 0.0007278332653893192, + "loss": 3.6006, + "step": 32060 + }, + { + "epoch": 2.178624813153961, + "grad_norm": 0.9843306541442871, + "learning_rate": 0.0007277908003804865, + "loss": 3.5989, + "step": 32065 + }, + { + "epoch": 2.178964533224623, + "grad_norm": 0.7113016247749329, + "learning_rate": 0.0007277483353716538, + "loss": 3.6291, + "step": 32070 + }, + { + "epoch": 2.179304253295285, + "grad_norm": 1.0170234441757202, + "learning_rate": 0.0007277058703628211, + "loss": 3.5338, + "step": 32075 + }, + { + "epoch": 2.1796439733659465, + "grad_norm": 1.1809929609298706, + "learning_rate": 0.0007276634053539884, + "loss": 3.6429, + "step": 32080 + }, + { + "epoch": 2.179983693436608, + "grad_norm": 0.8453335165977478, + "learning_rate": 0.0007276209403451556, + "loss": 3.8604, + "step": 32085 + }, + { + "epoch": 2.18032341350727, + "grad_norm": 0.8561217188835144, + "learning_rate": 0.0007275784753363229, + "loss": 3.2616, + "step": 32090 + }, + { + "epoch": 2.180663133577932, + "grad_norm": 0.9211388826370239, + "learning_rate": 0.0007275360103274901, + "loss": 3.555, + "step": 32095 + }, + { + "epoch": 2.1810028536485935, + "grad_norm": 0.7335948348045349, + "learning_rate": 0.0007274935453186574, + "loss": 3.6559, + "step": 32100 + }, + { + "epoch": 2.181342573719255, + "grad_norm": 1.0807194709777832, + "learning_rate": 0.0007274510803098248, + "loss": 3.5119, + "step": 32105 + }, + { + "epoch": 2.181682293789917, + "grad_norm": 0.9045798182487488, + "learning_rate": 0.000727408615300992, + "loss": 3.5302, + "step": 32110 + }, + { + "epoch": 2.182022013860579, + "grad_norm": 1.0384405851364136, + "learning_rate": 0.0007273661502921593, + "loss": 3.4356, + "step": 32115 + }, + { + "epoch": 2.1823617339312404, + "grad_norm": 0.9198580384254456, + "learning_rate": 0.0007273236852833266, + "loss": 3.5478, + "step": 32120 + }, + { + "epoch": 2.1827014540019025, + "grad_norm": 2.1520044803619385, + "learning_rate": 0.0007272812202744938, + "loss": 3.7391, + "step": 32125 + }, + { + "epoch": 2.183041174072564, + "grad_norm": 0.7282635569572449, + "learning_rate": 0.000727238755265661, + "loss": 3.4752, + "step": 32130 + }, + { + "epoch": 2.183380894143226, + "grad_norm": 1.1178852319717407, + "learning_rate": 0.0007271962902568284, + "loss": 3.6084, + "step": 32135 + }, + { + "epoch": 2.183720614213888, + "grad_norm": 0.7841722369194031, + "learning_rate": 0.0007271538252479957, + "loss": 3.5909, + "step": 32140 + }, + { + "epoch": 2.1840603342845495, + "grad_norm": 0.836394727230072, + "learning_rate": 0.000727111360239163, + "loss": 3.3817, + "step": 32145 + }, + { + "epoch": 2.184400054355211, + "grad_norm": 0.9262600541114807, + "learning_rate": 0.0007270688952303303, + "loss": 3.4913, + "step": 32150 + }, + { + "epoch": 2.184739774425873, + "grad_norm": 0.8586285710334778, + "learning_rate": 0.0007270264302214975, + "loss": 3.2566, + "step": 32155 + }, + { + "epoch": 2.185079494496535, + "grad_norm": 1.0278990268707275, + "learning_rate": 0.0007269839652126647, + "loss": 3.5581, + "step": 32160 + }, + { + "epoch": 2.1854192145671965, + "grad_norm": 0.936800479888916, + "learning_rate": 0.0007269415002038321, + "loss": 3.5981, + "step": 32165 + }, + { + "epoch": 2.1857589346378585, + "grad_norm": 0.864743709564209, + "learning_rate": 0.0007268990351949993, + "loss": 3.6208, + "step": 32170 + }, + { + "epoch": 2.18609865470852, + "grad_norm": 2.8360917568206787, + "learning_rate": 0.0007268565701861666, + "loss": 3.3538, + "step": 32175 + }, + { + "epoch": 2.186438374779182, + "grad_norm": 0.893609881401062, + "learning_rate": 0.000726814105177334, + "loss": 3.6109, + "step": 32180 + }, + { + "epoch": 2.186778094849844, + "grad_norm": 0.7960904240608215, + "learning_rate": 0.0007267716401685012, + "loss": 3.6538, + "step": 32185 + }, + { + "epoch": 2.1871178149205055, + "grad_norm": 0.8971937298774719, + "learning_rate": 0.0007267291751596684, + "loss": 3.7578, + "step": 32190 + }, + { + "epoch": 2.187457534991167, + "grad_norm": 0.7273871898651123, + "learning_rate": 0.0007266867101508357, + "loss": 3.7448, + "step": 32195 + }, + { + "epoch": 2.187797255061829, + "grad_norm": 0.9385799169540405, + "learning_rate": 0.000726644245142003, + "loss": 3.3843, + "step": 32200 + }, + { + "epoch": 2.188136975132491, + "grad_norm": 0.8796471953392029, + "learning_rate": 0.0007266017801331702, + "loss": 3.6846, + "step": 32205 + }, + { + "epoch": 2.1884766952031525, + "grad_norm": 0.8675617575645447, + "learning_rate": 0.0007265593151243376, + "loss": 3.7416, + "step": 32210 + }, + { + "epoch": 2.1888164152738145, + "grad_norm": 1.002990484237671, + "learning_rate": 0.0007265168501155049, + "loss": 3.5439, + "step": 32215 + }, + { + "epoch": 2.189156135344476, + "grad_norm": 0.9952352046966553, + "learning_rate": 0.0007264743851066721, + "loss": 3.6333, + "step": 32220 + }, + { + "epoch": 2.189495855415138, + "grad_norm": 0.8067545294761658, + "learning_rate": 0.0007264319200978394, + "loss": 3.3585, + "step": 32225 + }, + { + "epoch": 2.1898355754858, + "grad_norm": 1.2202889919281006, + "learning_rate": 0.0007263894550890067, + "loss": 3.659, + "step": 32230 + }, + { + "epoch": 2.1901752955564615, + "grad_norm": 0.9225283861160278, + "learning_rate": 0.0007263469900801739, + "loss": 3.7862, + "step": 32235 + }, + { + "epoch": 2.190515015627123, + "grad_norm": 0.7979728579521179, + "learning_rate": 0.0007263045250713412, + "loss": 3.6675, + "step": 32240 + }, + { + "epoch": 2.190854735697785, + "grad_norm": 0.9402301907539368, + "learning_rate": 0.0007262620600625085, + "loss": 3.3252, + "step": 32245 + }, + { + "epoch": 2.191194455768447, + "grad_norm": 0.7611053586006165, + "learning_rate": 0.0007262195950536758, + "loss": 3.5311, + "step": 32250 + }, + { + "epoch": 2.1915341758391085, + "grad_norm": 0.8385330438613892, + "learning_rate": 0.0007261771300448431, + "loss": 3.5372, + "step": 32255 + }, + { + "epoch": 2.1918738959097706, + "grad_norm": 0.8203081488609314, + "learning_rate": 0.0007261346650360103, + "loss": 3.3369, + "step": 32260 + }, + { + "epoch": 2.192213615980432, + "grad_norm": 0.8749426603317261, + "learning_rate": 0.0007260922000271776, + "loss": 3.8324, + "step": 32265 + }, + { + "epoch": 2.192553336051094, + "grad_norm": 0.8212337493896484, + "learning_rate": 0.0007260497350183449, + "loss": 3.5471, + "step": 32270 + }, + { + "epoch": 2.192893056121756, + "grad_norm": 0.9568706750869751, + "learning_rate": 0.0007260072700095121, + "loss": 3.5859, + "step": 32275 + }, + { + "epoch": 2.1932327761924175, + "grad_norm": 1.1266655921936035, + "learning_rate": 0.0007259648050006795, + "loss": 3.577, + "step": 32280 + }, + { + "epoch": 2.193572496263079, + "grad_norm": 0.9397215843200684, + "learning_rate": 0.0007259223399918468, + "loss": 3.1123, + "step": 32285 + }, + { + "epoch": 2.1939122163337412, + "grad_norm": 0.9241084456443787, + "learning_rate": 0.000725879874983014, + "loss": 3.5653, + "step": 32290 + }, + { + "epoch": 2.194251936404403, + "grad_norm": 1.2040698528289795, + "learning_rate": 0.0007258374099741812, + "loss": 3.6444, + "step": 32295 + }, + { + "epoch": 2.1945916564750645, + "grad_norm": 0.9451388120651245, + "learning_rate": 0.0007257949449653486, + "loss": 3.5084, + "step": 32300 + }, + { + "epoch": 2.194931376545726, + "grad_norm": 1.1048002243041992, + "learning_rate": 0.0007257524799565158, + "loss": 3.5812, + "step": 32305 + }, + { + "epoch": 2.195271096616388, + "grad_norm": 1.6742416620254517, + "learning_rate": 0.000725710014947683, + "loss": 3.5428, + "step": 32310 + }, + { + "epoch": 2.19561081668705, + "grad_norm": 0.7272390723228455, + "learning_rate": 0.0007256675499388505, + "loss": 3.4124, + "step": 32315 + }, + { + "epoch": 2.1959505367577115, + "grad_norm": 0.9324318170547485, + "learning_rate": 0.0007256250849300177, + "loss": 3.2484, + "step": 32320 + }, + { + "epoch": 2.1962902568283735, + "grad_norm": 0.8172516822814941, + "learning_rate": 0.0007255826199211849, + "loss": 3.6403, + "step": 32325 + }, + { + "epoch": 2.196629976899035, + "grad_norm": 0.7441524267196655, + "learning_rate": 0.0007255401549123523, + "loss": 3.4703, + "step": 32330 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 1.0295567512512207, + "learning_rate": 0.0007254976899035195, + "loss": 3.6463, + "step": 32335 + }, + { + "epoch": 2.197309417040359, + "grad_norm": 0.8193092346191406, + "learning_rate": 0.0007254552248946867, + "loss": 3.7021, + "step": 32340 + }, + { + "epoch": 2.1976491371110205, + "grad_norm": 0.8470903635025024, + "learning_rate": 0.000725412759885854, + "loss": 3.7121, + "step": 32345 + }, + { + "epoch": 2.197988857181682, + "grad_norm": 0.8442506790161133, + "learning_rate": 0.0007253702948770214, + "loss": 3.6992, + "step": 32350 + }, + { + "epoch": 2.198328577252344, + "grad_norm": 1.1062668561935425, + "learning_rate": 0.0007253278298681887, + "loss": 3.5831, + "step": 32355 + }, + { + "epoch": 2.198668297323006, + "grad_norm": 0.7113924622535706, + "learning_rate": 0.0007252853648593559, + "loss": 3.565, + "step": 32360 + }, + { + "epoch": 2.1990080173936675, + "grad_norm": 0.7733425498008728, + "learning_rate": 0.0007252428998505232, + "loss": 3.35, + "step": 32365 + }, + { + "epoch": 2.1993477374643295, + "grad_norm": 0.740431547164917, + "learning_rate": 0.0007252004348416905, + "loss": 3.2386, + "step": 32370 + }, + { + "epoch": 2.199687457534991, + "grad_norm": 0.7725468277931213, + "learning_rate": 0.0007251579698328577, + "loss": 3.303, + "step": 32375 + }, + { + "epoch": 2.200027177605653, + "grad_norm": 0.7876932621002197, + "learning_rate": 0.000725115504824025, + "loss": 3.5728, + "step": 32380 + }, + { + "epoch": 2.200366897676315, + "grad_norm": 1.0140856504440308, + "learning_rate": 0.0007250730398151924, + "loss": 3.5864, + "step": 32385 + }, + { + "epoch": 2.2007066177469765, + "grad_norm": 1.1318106651306152, + "learning_rate": 0.0007250305748063596, + "loss": 3.8509, + "step": 32390 + }, + { + "epoch": 2.201046337817638, + "grad_norm": 0.9875067472457886, + "learning_rate": 0.0007249881097975268, + "loss": 3.3501, + "step": 32395 + }, + { + "epoch": 2.2013860578883, + "grad_norm": 1.1470381021499634, + "learning_rate": 0.0007249456447886942, + "loss": 3.679, + "step": 32400 + }, + { + "epoch": 2.201725777958962, + "grad_norm": 1.011925458908081, + "learning_rate": 0.0007249031797798614, + "loss": 3.6538, + "step": 32405 + }, + { + "epoch": 2.2020654980296235, + "grad_norm": 0.8726849555969238, + "learning_rate": 0.0007248607147710286, + "loss": 3.5061, + "step": 32410 + }, + { + "epoch": 2.2024052181002856, + "grad_norm": 0.7751703858375549, + "learning_rate": 0.000724818249762196, + "loss": 3.6964, + "step": 32415 + }, + { + "epoch": 2.202744938170947, + "grad_norm": 0.9744569063186646, + "learning_rate": 0.0007247757847533633, + "loss": 3.5729, + "step": 32420 + }, + { + "epoch": 2.203084658241609, + "grad_norm": 0.9227226972579956, + "learning_rate": 0.0007247333197445305, + "loss": 3.6263, + "step": 32425 + }, + { + "epoch": 2.203424378312271, + "grad_norm": 1.0290708541870117, + "learning_rate": 0.0007246908547356979, + "loss": 3.2745, + "step": 32430 + }, + { + "epoch": 2.2037640983829325, + "grad_norm": 0.8382985591888428, + "learning_rate": 0.0007246483897268651, + "loss": 3.572, + "step": 32435 + }, + { + "epoch": 2.204103818453594, + "grad_norm": 1.131726622581482, + "learning_rate": 0.0007246059247180323, + "loss": 3.7873, + "step": 32440 + }, + { + "epoch": 2.204443538524256, + "grad_norm": 0.843154788017273, + "learning_rate": 0.0007245634597091996, + "loss": 3.83, + "step": 32445 + }, + { + "epoch": 2.204783258594918, + "grad_norm": 0.9318386316299438, + "learning_rate": 0.0007245209947003669, + "loss": 3.3902, + "step": 32450 + }, + { + "epoch": 2.2051229786655795, + "grad_norm": 0.7213083505630493, + "learning_rate": 0.0007244785296915342, + "loss": 3.332, + "step": 32455 + }, + { + "epoch": 2.205462698736241, + "grad_norm": 0.9745978116989136, + "learning_rate": 0.0007244360646827015, + "loss": 3.4712, + "step": 32460 + }, + { + "epoch": 2.205802418806903, + "grad_norm": 0.8805606961250305, + "learning_rate": 0.0007243935996738688, + "loss": 3.7043, + "step": 32465 + }, + { + "epoch": 2.206142138877565, + "grad_norm": 0.9021103382110596, + "learning_rate": 0.000724351134665036, + "loss": 3.6265, + "step": 32470 + }, + { + "epoch": 2.2064818589482265, + "grad_norm": 0.8753682971000671, + "learning_rate": 0.0007243086696562033, + "loss": 3.7783, + "step": 32475 + }, + { + "epoch": 2.2068215790188885, + "grad_norm": 1.2948731184005737, + "learning_rate": 0.0007242662046473705, + "loss": 3.1822, + "step": 32480 + }, + { + "epoch": 2.20716129908955, + "grad_norm": 0.8933263421058655, + "learning_rate": 0.0007242237396385378, + "loss": 3.8576, + "step": 32485 + }, + { + "epoch": 2.207501019160212, + "grad_norm": 0.9253987073898315, + "learning_rate": 0.0007241812746297052, + "loss": 3.4493, + "step": 32490 + }, + { + "epoch": 2.207840739230874, + "grad_norm": 0.7479205131530762, + "learning_rate": 0.0007241388096208724, + "loss": 3.7061, + "step": 32495 + }, + { + "epoch": 2.2081804593015355, + "grad_norm": 0.812614381313324, + "learning_rate": 0.0007240963446120397, + "loss": 3.3829, + "step": 32500 + }, + { + "epoch": 2.208520179372197, + "grad_norm": 0.8514379262924194, + "learning_rate": 0.000724053879603207, + "loss": 3.4806, + "step": 32505 + }, + { + "epoch": 2.208859899442859, + "grad_norm": 0.8892616629600525, + "learning_rate": 0.0007240114145943742, + "loss": 3.6628, + "step": 32510 + }, + { + "epoch": 2.209199619513521, + "grad_norm": 0.9871666431427002, + "learning_rate": 0.0007239689495855415, + "loss": 3.735, + "step": 32515 + }, + { + "epoch": 2.2095393395841825, + "grad_norm": 0.9211719632148743, + "learning_rate": 0.0007239264845767088, + "loss": 3.6022, + "step": 32520 + }, + { + "epoch": 2.2098790596548445, + "grad_norm": 0.9571179151535034, + "learning_rate": 0.0007238840195678761, + "loss": 3.7542, + "step": 32525 + }, + { + "epoch": 2.210218779725506, + "grad_norm": 0.812986433506012, + "learning_rate": 0.0007238415545590434, + "loss": 3.63, + "step": 32530 + }, + { + "epoch": 2.210558499796168, + "grad_norm": 4.678325653076172, + "learning_rate": 0.0007237990895502107, + "loss": 3.6082, + "step": 32535 + }, + { + "epoch": 2.21089821986683, + "grad_norm": 0.706696093082428, + "learning_rate": 0.0007237566245413779, + "loss": 3.752, + "step": 32540 + }, + { + "epoch": 2.2112379399374915, + "grad_norm": 0.9891648292541504, + "learning_rate": 0.0007237141595325451, + "loss": 3.6524, + "step": 32545 + }, + { + "epoch": 2.211577660008153, + "grad_norm": 0.7088270783424377, + "learning_rate": 0.0007236716945237125, + "loss": 3.6599, + "step": 32550 + }, + { + "epoch": 2.211917380078815, + "grad_norm": 1.1636487245559692, + "learning_rate": 0.0007236292295148797, + "loss": 3.6523, + "step": 32555 + }, + { + "epoch": 2.212257100149477, + "grad_norm": 0.9831357598304749, + "learning_rate": 0.000723586764506047, + "loss": 3.4965, + "step": 32560 + }, + { + "epoch": 2.2125968202201385, + "grad_norm": 0.7097558975219727, + "learning_rate": 0.0007235442994972144, + "loss": 3.6228, + "step": 32565 + }, + { + "epoch": 2.2129365402908006, + "grad_norm": 0.9216629862785339, + "learning_rate": 0.0007235018344883816, + "loss": 3.4107, + "step": 32570 + }, + { + "epoch": 2.213276260361462, + "grad_norm": 0.8096122741699219, + "learning_rate": 0.0007234593694795488, + "loss": 3.4704, + "step": 32575 + }, + { + "epoch": 2.213615980432124, + "grad_norm": 1.286341905593872, + "learning_rate": 0.0007234169044707162, + "loss": 3.6002, + "step": 32580 + }, + { + "epoch": 2.213955700502786, + "grad_norm": 0.8083006739616394, + "learning_rate": 0.0007233744394618834, + "loss": 3.7372, + "step": 32585 + }, + { + "epoch": 2.2142954205734475, + "grad_norm": 0.9552554488182068, + "learning_rate": 0.0007233319744530506, + "loss": 3.4812, + "step": 32590 + }, + { + "epoch": 2.214635140644109, + "grad_norm": 1.1679961681365967, + "learning_rate": 0.000723289509444218, + "loss": 3.3604, + "step": 32595 + }, + { + "epoch": 2.2149748607147712, + "grad_norm": 2.260793447494507, + "learning_rate": 0.0007232470444353853, + "loss": 3.6614, + "step": 32600 + }, + { + "epoch": 2.215314580785433, + "grad_norm": 0.8728476166725159, + "learning_rate": 0.0007232045794265525, + "loss": 3.3933, + "step": 32605 + }, + { + "epoch": 2.2156543008560945, + "grad_norm": 1.3520357608795166, + "learning_rate": 0.0007231621144177198, + "loss": 3.3279, + "step": 32610 + }, + { + "epoch": 2.2159940209267566, + "grad_norm": 1.000787615776062, + "learning_rate": 0.0007231196494088871, + "loss": 3.7723, + "step": 32615 + }, + { + "epoch": 2.216333740997418, + "grad_norm": 0.873595118522644, + "learning_rate": 0.0007230771844000543, + "loss": 3.6225, + "step": 32620 + }, + { + "epoch": 2.21667346106808, + "grad_norm": 1.0396826267242432, + "learning_rate": 0.0007230347193912217, + "loss": 3.6188, + "step": 32625 + }, + { + "epoch": 2.217013181138742, + "grad_norm": 1.2223706245422363, + "learning_rate": 0.000722992254382389, + "loss": 3.5107, + "step": 32630 + }, + { + "epoch": 2.2173529012094035, + "grad_norm": 1.074193000793457, + "learning_rate": 0.0007229497893735562, + "loss": 3.4972, + "step": 32635 + }, + { + "epoch": 2.217692621280065, + "grad_norm": 1.1692854166030884, + "learning_rate": 0.0007229073243647235, + "loss": 3.5164, + "step": 32640 + }, + { + "epoch": 2.218032341350727, + "grad_norm": 0.817791223526001, + "learning_rate": 0.0007228648593558907, + "loss": 3.5288, + "step": 32645 + }, + { + "epoch": 2.218372061421389, + "grad_norm": 0.6975078582763672, + "learning_rate": 0.000722822394347058, + "loss": 3.3723, + "step": 32650 + }, + { + "epoch": 2.2187117814920505, + "grad_norm": 1.1027437448501587, + "learning_rate": 0.0007227799293382253, + "loss": 3.5707, + "step": 32655 + }, + { + "epoch": 2.219051501562712, + "grad_norm": 0.9739217758178711, + "learning_rate": 0.0007227374643293926, + "loss": 3.5183, + "step": 32660 + }, + { + "epoch": 2.219391221633374, + "grad_norm": 0.7480763792991638, + "learning_rate": 0.0007226949993205599, + "loss": 3.285, + "step": 32665 + }, + { + "epoch": 2.219730941704036, + "grad_norm": 0.9154850840568542, + "learning_rate": 0.0007226525343117272, + "loss": 3.6781, + "step": 32670 + }, + { + "epoch": 2.2200706617746975, + "grad_norm": 1.0887451171875, + "learning_rate": 0.0007226100693028944, + "loss": 3.5556, + "step": 32675 + }, + { + "epoch": 2.2204103818453595, + "grad_norm": 0.7890298366546631, + "learning_rate": 0.0007225676042940616, + "loss": 3.3977, + "step": 32680 + }, + { + "epoch": 2.220750101916021, + "grad_norm": 0.8560712337493896, + "learning_rate": 0.000722525139285229, + "loss": 3.523, + "step": 32685 + }, + { + "epoch": 2.221089821986683, + "grad_norm": 1.2232171297073364, + "learning_rate": 0.0007224826742763962, + "loss": 3.3072, + "step": 32690 + }, + { + "epoch": 2.221429542057345, + "grad_norm": 1.0619633197784424, + "learning_rate": 0.0007224402092675636, + "loss": 3.4112, + "step": 32695 + }, + { + "epoch": 2.2217692621280065, + "grad_norm": 0.7337565422058105, + "learning_rate": 0.0007223977442587309, + "loss": 3.518, + "step": 32700 + }, + { + "epoch": 2.222108982198668, + "grad_norm": 0.8400083780288696, + "learning_rate": 0.0007223552792498981, + "loss": 3.4771, + "step": 32705 + }, + { + "epoch": 2.2224487022693302, + "grad_norm": 0.6967839598655701, + "learning_rate": 0.0007223128142410654, + "loss": 3.6663, + "step": 32710 + }, + { + "epoch": 2.222788422339992, + "grad_norm": 0.9742965698242188, + "learning_rate": 0.0007222703492322327, + "loss": 3.6557, + "step": 32715 + }, + { + "epoch": 2.2231281424106535, + "grad_norm": 1.0020017623901367, + "learning_rate": 0.0007222278842233999, + "loss": 3.5397, + "step": 32720 + }, + { + "epoch": 2.2234678624813156, + "grad_norm": 0.7478546500205994, + "learning_rate": 0.0007221854192145672, + "loss": 3.4946, + "step": 32725 + }, + { + "epoch": 2.223807582551977, + "grad_norm": 0.8883554935455322, + "learning_rate": 0.0007221429542057346, + "loss": 3.6075, + "step": 32730 + }, + { + "epoch": 2.224147302622639, + "grad_norm": 1.3203779458999634, + "learning_rate": 0.0007221004891969018, + "loss": 3.6278, + "step": 32735 + }, + { + "epoch": 2.224487022693301, + "grad_norm": 0.8402491211891174, + "learning_rate": 0.0007220580241880691, + "loss": 3.6206, + "step": 32740 + }, + { + "epoch": 2.2248267427639625, + "grad_norm": 0.8827426433563232, + "learning_rate": 0.0007220155591792363, + "loss": 3.6166, + "step": 32745 + }, + { + "epoch": 2.225166462834624, + "grad_norm": 0.9873366951942444, + "learning_rate": 0.0007219730941704036, + "loss": 3.2477, + "step": 32750 + }, + { + "epoch": 2.2255061829052862, + "grad_norm": 1.2022844552993774, + "learning_rate": 0.0007219306291615709, + "loss": 3.5449, + "step": 32755 + }, + { + "epoch": 2.225845902975948, + "grad_norm": 0.7034822106361389, + "learning_rate": 0.0007218881641527381, + "loss": 3.6972, + "step": 32760 + }, + { + "epoch": 2.2261856230466095, + "grad_norm": 2.1997220516204834, + "learning_rate": 0.0007218456991439055, + "loss": 3.4877, + "step": 32765 + }, + { + "epoch": 2.2265253431172716, + "grad_norm": 0.9391852617263794, + "learning_rate": 0.0007218032341350728, + "loss": 3.6718, + "step": 32770 + }, + { + "epoch": 2.226865063187933, + "grad_norm": 0.870087742805481, + "learning_rate": 0.00072176076912624, + "loss": 3.6149, + "step": 32775 + }, + { + "epoch": 2.227204783258595, + "grad_norm": 0.7976835370063782, + "learning_rate": 0.0007217183041174072, + "loss": 3.6263, + "step": 32780 + }, + { + "epoch": 2.2275445033292565, + "grad_norm": 0.9592113494873047, + "learning_rate": 0.0007216758391085746, + "loss": 3.5053, + "step": 32785 + }, + { + "epoch": 2.2278842233999185, + "grad_norm": 0.7828862071037292, + "learning_rate": 0.0007216333740997418, + "loss": 3.4453, + "step": 32790 + }, + { + "epoch": 2.22822394347058, + "grad_norm": 0.8587181568145752, + "learning_rate": 0.000721590909090909, + "loss": 3.6823, + "step": 32795 + }, + { + "epoch": 2.228563663541242, + "grad_norm": 1.0020264387130737, + "learning_rate": 0.0007215484440820765, + "loss": 3.7877, + "step": 32800 + }, + { + "epoch": 2.228903383611904, + "grad_norm": 0.9478337168693542, + "learning_rate": 0.0007215059790732437, + "loss": 3.4887, + "step": 32805 + }, + { + "epoch": 2.2292431036825655, + "grad_norm": 0.8042871356010437, + "learning_rate": 0.0007214635140644109, + "loss": 3.5983, + "step": 32810 + }, + { + "epoch": 2.229582823753227, + "grad_norm": 0.7141690254211426, + "learning_rate": 0.0007214210490555783, + "loss": 3.6447, + "step": 32815 + }, + { + "epoch": 2.229922543823889, + "grad_norm": 0.9714575409889221, + "learning_rate": 0.0007213785840467455, + "loss": 3.6645, + "step": 32820 + }, + { + "epoch": 2.230262263894551, + "grad_norm": 0.8226107954978943, + "learning_rate": 0.0007213361190379127, + "loss": 3.6755, + "step": 32825 + }, + { + "epoch": 2.2306019839652125, + "grad_norm": 2.1806297302246094, + "learning_rate": 0.00072129365402908, + "loss": 3.8149, + "step": 32830 + }, + { + "epoch": 2.2309417040358746, + "grad_norm": 1.0160102844238281, + "learning_rate": 0.0007212511890202474, + "loss": 3.6485, + "step": 32835 + }, + { + "epoch": 2.231281424106536, + "grad_norm": 1.0176491737365723, + "learning_rate": 0.0007212087240114146, + "loss": 3.5338, + "step": 32840 + }, + { + "epoch": 2.231621144177198, + "grad_norm": 0.9845272898674011, + "learning_rate": 0.0007211662590025819, + "loss": 3.5168, + "step": 32845 + }, + { + "epoch": 2.23196086424786, + "grad_norm": 0.8377525806427002, + "learning_rate": 0.0007211237939937492, + "loss": 3.5828, + "step": 32850 + }, + { + "epoch": 2.2323005843185215, + "grad_norm": 0.7592108249664307, + "learning_rate": 0.0007210813289849164, + "loss": 3.4866, + "step": 32855 + }, + { + "epoch": 2.232640304389183, + "grad_norm": 1.407305121421814, + "learning_rate": 0.0007210388639760837, + "loss": 3.372, + "step": 32860 + }, + { + "epoch": 2.2329800244598452, + "grad_norm": 1.0751924514770508, + "learning_rate": 0.000720996398967251, + "loss": 3.2761, + "step": 32865 + }, + { + "epoch": 2.233319744530507, + "grad_norm": 0.7781738042831421, + "learning_rate": 0.0007209539339584183, + "loss": 3.7275, + "step": 32870 + }, + { + "epoch": 2.2336594646011685, + "grad_norm": 0.9250655770301819, + "learning_rate": 0.0007209114689495856, + "loss": 3.7333, + "step": 32875 + }, + { + "epoch": 2.2339991846718306, + "grad_norm": 0.8158172965049744, + "learning_rate": 0.0007208690039407528, + "loss": 3.5037, + "step": 32880 + }, + { + "epoch": 2.234338904742492, + "grad_norm": 0.9010457992553711, + "learning_rate": 0.0007208265389319201, + "loss": 3.4596, + "step": 32885 + }, + { + "epoch": 2.234678624813154, + "grad_norm": 0.9197947382926941, + "learning_rate": 0.0007207840739230874, + "loss": 3.5501, + "step": 32890 + }, + { + "epoch": 2.235018344883816, + "grad_norm": 1.0803883075714111, + "learning_rate": 0.0007207416089142546, + "loss": 3.5096, + "step": 32895 + }, + { + "epoch": 2.2353580649544775, + "grad_norm": 0.8565152287483215, + "learning_rate": 0.0007206991439054219, + "loss": 3.5701, + "step": 32900 + }, + { + "epoch": 2.235697785025139, + "grad_norm": 0.9967086911201477, + "learning_rate": 0.0007206566788965893, + "loss": 3.5209, + "step": 32905 + }, + { + "epoch": 2.2360375050958012, + "grad_norm": 0.8100451827049255, + "learning_rate": 0.0007206142138877565, + "loss": 3.8221, + "step": 32910 + }, + { + "epoch": 2.236377225166463, + "grad_norm": 1.1002541780471802, + "learning_rate": 0.0007205717488789238, + "loss": 3.8492, + "step": 32915 + }, + { + "epoch": 2.2367169452371245, + "grad_norm": 0.838062047958374, + "learning_rate": 0.0007205292838700911, + "loss": 3.4577, + "step": 32920 + }, + { + "epoch": 2.2370566653077866, + "grad_norm": 1.0103297233581543, + "learning_rate": 0.0007204868188612583, + "loss": 3.6501, + "step": 32925 + }, + { + "epoch": 2.237396385378448, + "grad_norm": 0.8256200551986694, + "learning_rate": 0.0007204443538524255, + "loss": 3.8116, + "step": 32930 + }, + { + "epoch": 2.23773610544911, + "grad_norm": 1.386776089668274, + "learning_rate": 0.0007204018888435929, + "loss": 3.3303, + "step": 32935 + }, + { + "epoch": 2.238075825519772, + "grad_norm": 0.9147841930389404, + "learning_rate": 0.0007203594238347602, + "loss": 3.3626, + "step": 32940 + }, + { + "epoch": 2.2384155455904335, + "grad_norm": 1.2385562658309937, + "learning_rate": 0.0007203169588259274, + "loss": 3.6081, + "step": 32945 + }, + { + "epoch": 2.238755265661095, + "grad_norm": 0.7737416625022888, + "learning_rate": 0.0007202744938170948, + "loss": 3.7393, + "step": 32950 + }, + { + "epoch": 2.2390949857317572, + "grad_norm": 0.7495541572570801, + "learning_rate": 0.000720232028808262, + "loss": 3.5998, + "step": 32955 + }, + { + "epoch": 2.239434705802419, + "grad_norm": 0.7394245862960815, + "learning_rate": 0.0007201895637994292, + "loss": 3.5464, + "step": 32960 + }, + { + "epoch": 2.2397744258730805, + "grad_norm": 1.0026191473007202, + "learning_rate": 0.0007201470987905966, + "loss": 3.5958, + "step": 32965 + }, + { + "epoch": 2.2401141459437426, + "grad_norm": 0.911328911781311, + "learning_rate": 0.0007201046337817638, + "loss": 3.6081, + "step": 32970 + }, + { + "epoch": 2.240453866014404, + "grad_norm": 1.2613301277160645, + "learning_rate": 0.0007200621687729311, + "loss": 3.5337, + "step": 32975 + }, + { + "epoch": 2.240793586085066, + "grad_norm": 0.8662687540054321, + "learning_rate": 0.0007200197037640984, + "loss": 3.5924, + "step": 32980 + }, + { + "epoch": 2.2411333061557275, + "grad_norm": 1.0577914714813232, + "learning_rate": 0.0007199772387552657, + "loss": 3.6014, + "step": 32985 + }, + { + "epoch": 2.2414730262263896, + "grad_norm": 1.006330132484436, + "learning_rate": 0.0007199347737464329, + "loss": 3.4465, + "step": 32990 + }, + { + "epoch": 2.241812746297051, + "grad_norm": 0.9153301119804382, + "learning_rate": 0.0007198923087376002, + "loss": 3.5072, + "step": 32995 + }, + { + "epoch": 2.242152466367713, + "grad_norm": 0.9491611123085022, + "learning_rate": 0.0007198498437287675, + "loss": 3.467, + "step": 33000 + }, + { + "epoch": 2.242492186438375, + "grad_norm": 1.175616979598999, + "learning_rate": 0.0007198073787199347, + "loss": 3.5999, + "step": 33005 + }, + { + "epoch": 2.2428319065090365, + "grad_norm": 0.8373321890830994, + "learning_rate": 0.0007197649137111021, + "loss": 3.2872, + "step": 33010 + }, + { + "epoch": 2.243171626579698, + "grad_norm": 0.9875304102897644, + "learning_rate": 0.0007197224487022694, + "loss": 3.7058, + "step": 33015 + }, + { + "epoch": 2.2435113466503602, + "grad_norm": 0.8658708333969116, + "learning_rate": 0.0007196799836934366, + "loss": 3.5639, + "step": 33020 + }, + { + "epoch": 2.243851066721022, + "grad_norm": 0.758778989315033, + "learning_rate": 0.0007196375186846039, + "loss": 3.5392, + "step": 33025 + }, + { + "epoch": 2.2441907867916835, + "grad_norm": 0.8740217089653015, + "learning_rate": 0.0007195950536757711, + "loss": 3.5774, + "step": 33030 + }, + { + "epoch": 2.2445305068623456, + "grad_norm": 0.775901198387146, + "learning_rate": 0.0007195525886669385, + "loss": 3.4935, + "step": 33035 + }, + { + "epoch": 2.244870226933007, + "grad_norm": 0.7549597024917603, + "learning_rate": 0.0007195101236581057, + "loss": 3.5023, + "step": 33040 + }, + { + "epoch": 2.245209947003669, + "grad_norm": 0.9329313635826111, + "learning_rate": 0.000719467658649273, + "loss": 3.618, + "step": 33045 + }, + { + "epoch": 2.245549667074331, + "grad_norm": 0.8688802123069763, + "learning_rate": 0.0007194251936404404, + "loss": 3.8953, + "step": 33050 + }, + { + "epoch": 2.2458893871449925, + "grad_norm": 0.9857128262519836, + "learning_rate": 0.0007193827286316076, + "loss": 3.8776, + "step": 33055 + }, + { + "epoch": 2.246229107215654, + "grad_norm": 0.8099145889282227, + "learning_rate": 0.0007193402636227748, + "loss": 3.567, + "step": 33060 + }, + { + "epoch": 2.2465688272863162, + "grad_norm": 0.9973512291908264, + "learning_rate": 0.0007192977986139422, + "loss": 3.5917, + "step": 33065 + }, + { + "epoch": 2.246908547356978, + "grad_norm": 1.04048752784729, + "learning_rate": 0.0007192553336051094, + "loss": 3.602, + "step": 33070 + }, + { + "epoch": 2.2472482674276395, + "grad_norm": 0.7784618139266968, + "learning_rate": 0.0007192128685962766, + "loss": 3.7201, + "step": 33075 + }, + { + "epoch": 2.2475879874983016, + "grad_norm": 0.9878979325294495, + "learning_rate": 0.000719170403587444, + "loss": 3.632, + "step": 33080 + }, + { + "epoch": 2.247927707568963, + "grad_norm": 1.0831000804901123, + "learning_rate": 0.0007191279385786113, + "loss": 3.5466, + "step": 33085 + }, + { + "epoch": 2.248267427639625, + "grad_norm": 0.8651350140571594, + "learning_rate": 0.0007190854735697785, + "loss": 3.4198, + "step": 33090 + }, + { + "epoch": 2.248607147710287, + "grad_norm": 6.106120586395264, + "learning_rate": 0.0007190430085609458, + "loss": 3.2769, + "step": 33095 + }, + { + "epoch": 2.2489468677809485, + "grad_norm": 2.612999677658081, + "learning_rate": 0.0007190005435521131, + "loss": 3.475, + "step": 33100 + }, + { + "epoch": 2.24928658785161, + "grad_norm": 0.6978240609169006, + "learning_rate": 0.0007189580785432803, + "loss": 3.4048, + "step": 33105 + }, + { + "epoch": 2.2496263079222723, + "grad_norm": 1.2255499362945557, + "learning_rate": 0.0007189156135344476, + "loss": 3.4172, + "step": 33110 + }, + { + "epoch": 2.249966027992934, + "grad_norm": 0.8075246810913086, + "learning_rate": 0.000718873148525615, + "loss": 3.6937, + "step": 33115 + }, + { + "epoch": 2.2503057480635955, + "grad_norm": 1.0194228887557983, + "learning_rate": 0.0007188306835167822, + "loss": 3.5395, + "step": 33120 + }, + { + "epoch": 2.250645468134257, + "grad_norm": 0.9428600668907166, + "learning_rate": 0.0007187882185079495, + "loss": 3.7437, + "step": 33125 + }, + { + "epoch": 2.250985188204919, + "grad_norm": 0.9098532199859619, + "learning_rate": 0.0007187457534991167, + "loss": 3.8069, + "step": 33130 + }, + { + "epoch": 2.251324908275581, + "grad_norm": 0.8651412725448608, + "learning_rate": 0.000718703288490284, + "loss": 3.6851, + "step": 33135 + }, + { + "epoch": 2.2516646283462425, + "grad_norm": 0.8794263005256653, + "learning_rate": 0.0007186608234814513, + "loss": 3.6514, + "step": 33140 + }, + { + "epoch": 2.2520043484169046, + "grad_norm": 1.0132888555526733, + "learning_rate": 0.0007186183584726185, + "loss": 3.5926, + "step": 33145 + }, + { + "epoch": 2.252344068487566, + "grad_norm": 0.9991350769996643, + "learning_rate": 0.0007185758934637859, + "loss": 3.2958, + "step": 33150 + }, + { + "epoch": 2.252683788558228, + "grad_norm": 0.9644416570663452, + "learning_rate": 0.0007185334284549532, + "loss": 3.8279, + "step": 33155 + }, + { + "epoch": 2.25302350862889, + "grad_norm": 0.8079434633255005, + "learning_rate": 0.0007184909634461204, + "loss": 3.6248, + "step": 33160 + }, + { + "epoch": 2.2533632286995515, + "grad_norm": 1.1816685199737549, + "learning_rate": 0.0007184484984372876, + "loss": 3.7602, + "step": 33165 + }, + { + "epoch": 2.253702948770213, + "grad_norm": 1.13334321975708, + "learning_rate": 0.000718406033428455, + "loss": 3.4088, + "step": 33170 + }, + { + "epoch": 2.2540426688408752, + "grad_norm": 0.9179485440254211, + "learning_rate": 0.0007183635684196222, + "loss": 3.5358, + "step": 33175 + }, + { + "epoch": 2.254382388911537, + "grad_norm": 0.8572705388069153, + "learning_rate": 0.0007183211034107894, + "loss": 3.3827, + "step": 33180 + }, + { + "epoch": 2.2547221089821985, + "grad_norm": 0.7260561585426331, + "learning_rate": 0.0007182786384019569, + "loss": 3.6247, + "step": 33185 + }, + { + "epoch": 2.2550618290528606, + "grad_norm": 0.9061114192008972, + "learning_rate": 0.0007182361733931241, + "loss": 3.5139, + "step": 33190 + }, + { + "epoch": 2.255401549123522, + "grad_norm": 1.273350477218628, + "learning_rate": 0.0007181937083842913, + "loss": 3.6955, + "step": 33195 + }, + { + "epoch": 2.255741269194184, + "grad_norm": 0.7427929043769836, + "learning_rate": 0.0007181512433754587, + "loss": 3.6379, + "step": 33200 + }, + { + "epoch": 2.256080989264846, + "grad_norm": 0.7195837497711182, + "learning_rate": 0.0007181087783666259, + "loss": 3.6417, + "step": 33205 + }, + { + "epoch": 2.2564207093355075, + "grad_norm": 1.6282472610473633, + "learning_rate": 0.0007180663133577931, + "loss": 3.5916, + "step": 33210 + }, + { + "epoch": 2.256760429406169, + "grad_norm": 0.8208787441253662, + "learning_rate": 0.0007180238483489606, + "loss": 3.3773, + "step": 33215 + }, + { + "epoch": 2.2571001494768312, + "grad_norm": 0.8015985488891602, + "learning_rate": 0.0007179813833401278, + "loss": 3.746, + "step": 33220 + }, + { + "epoch": 2.257439869547493, + "grad_norm": 1.6897759437561035, + "learning_rate": 0.000717938918331295, + "loss": 3.3743, + "step": 33225 + }, + { + "epoch": 2.2577795896181545, + "grad_norm": 1.0290122032165527, + "learning_rate": 0.0007178964533224623, + "loss": 3.518, + "step": 33230 + }, + { + "epoch": 2.2581193096888166, + "grad_norm": 0.8100578784942627, + "learning_rate": 0.0007178539883136296, + "loss": 3.5729, + "step": 33235 + }, + { + "epoch": 2.258459029759478, + "grad_norm": 0.9359807372093201, + "learning_rate": 0.0007178115233047968, + "loss": 3.5452, + "step": 33240 + }, + { + "epoch": 2.25879874983014, + "grad_norm": 0.9874002933502197, + "learning_rate": 0.0007177690582959641, + "loss": 3.3034, + "step": 33245 + }, + { + "epoch": 2.259138469900802, + "grad_norm": 0.9180927872657776, + "learning_rate": 0.0007177265932871315, + "loss": 3.6159, + "step": 33250 + }, + { + "epoch": 2.2594781899714635, + "grad_norm": 0.7662233710289001, + "learning_rate": 0.0007176841282782987, + "loss": 3.3775, + "step": 33255 + }, + { + "epoch": 2.259817910042125, + "grad_norm": 0.8119447827339172, + "learning_rate": 0.000717641663269466, + "loss": 3.763, + "step": 33260 + }, + { + "epoch": 2.2601576301127873, + "grad_norm": 10.526870727539062, + "learning_rate": 0.0007175991982606333, + "loss": 3.8007, + "step": 33265 + }, + { + "epoch": 2.260497350183449, + "grad_norm": 0.8445889949798584, + "learning_rate": 0.0007175567332518005, + "loss": 3.4701, + "step": 33270 + }, + { + "epoch": 2.2608370702541105, + "grad_norm": 0.961327075958252, + "learning_rate": 0.0007175142682429678, + "loss": 3.5386, + "step": 33275 + }, + { + "epoch": 2.2611767903247726, + "grad_norm": 0.9062765836715698, + "learning_rate": 0.000717471803234135, + "loss": 3.5678, + "step": 33280 + }, + { + "epoch": 2.261516510395434, + "grad_norm": 1.1368716955184937, + "learning_rate": 0.0007174293382253024, + "loss": 3.654, + "step": 33285 + }, + { + "epoch": 2.261856230466096, + "grad_norm": 0.7236694693565369, + "learning_rate": 0.0007173868732164697, + "loss": 3.4834, + "step": 33290 + }, + { + "epoch": 2.262195950536758, + "grad_norm": 1.0550775527954102, + "learning_rate": 0.0007173444082076369, + "loss": 3.5988, + "step": 33295 + }, + { + "epoch": 2.2625356706074196, + "grad_norm": 1.9038984775543213, + "learning_rate": 0.0007173019431988042, + "loss": 3.5498, + "step": 33300 + }, + { + "epoch": 2.262875390678081, + "grad_norm": 0.8651537299156189, + "learning_rate": 0.0007172594781899715, + "loss": 3.4599, + "step": 33305 + }, + { + "epoch": 2.2632151107487433, + "grad_norm": 1.0182011127471924, + "learning_rate": 0.0007172170131811387, + "loss": 3.5612, + "step": 33310 + }, + { + "epoch": 2.263554830819405, + "grad_norm": 0.8182995915412903, + "learning_rate": 0.0007171745481723059, + "loss": 3.74, + "step": 33315 + }, + { + "epoch": 2.2638945508900665, + "grad_norm": 0.9499862194061279, + "learning_rate": 0.0007171320831634734, + "loss": 3.5811, + "step": 33320 + }, + { + "epoch": 2.2642342709607286, + "grad_norm": 1.2922654151916504, + "learning_rate": 0.0007170896181546406, + "loss": 3.4362, + "step": 33325 + }, + { + "epoch": 2.2645739910313902, + "grad_norm": 1.0909181833267212, + "learning_rate": 0.0007170471531458078, + "loss": 3.5605, + "step": 33330 + }, + { + "epoch": 2.264913711102052, + "grad_norm": 1.0372793674468994, + "learning_rate": 0.0007170046881369752, + "loss": 3.5603, + "step": 33335 + }, + { + "epoch": 2.2652534311727135, + "grad_norm": 0.7344026565551758, + "learning_rate": 0.0007169622231281424, + "loss": 3.4679, + "step": 33340 + }, + { + "epoch": 2.2655931512433756, + "grad_norm": 1.6182074546813965, + "learning_rate": 0.0007169197581193096, + "loss": 3.513, + "step": 33345 + }, + { + "epoch": 2.265932871314037, + "grad_norm": 0.8323761224746704, + "learning_rate": 0.000716877293110477, + "loss": 3.7264, + "step": 33350 + }, + { + "epoch": 2.266272591384699, + "grad_norm": 1.1722978353500366, + "learning_rate": 0.0007168348281016443, + "loss": 3.6853, + "step": 33355 + }, + { + "epoch": 2.266612311455361, + "grad_norm": 0.6961495280265808, + "learning_rate": 0.0007167923630928115, + "loss": 3.778, + "step": 33360 + }, + { + "epoch": 2.2669520315260225, + "grad_norm": 1.0922868251800537, + "learning_rate": 0.0007167498980839789, + "loss": 3.6182, + "step": 33365 + }, + { + "epoch": 2.267291751596684, + "grad_norm": 0.9290953874588013, + "learning_rate": 0.0007167074330751461, + "loss": 3.6897, + "step": 33370 + }, + { + "epoch": 2.2676314716673462, + "grad_norm": 0.9563431739807129, + "learning_rate": 0.0007166649680663134, + "loss": 3.2758, + "step": 33375 + }, + { + "epoch": 2.267971191738008, + "grad_norm": 0.7452784180641174, + "learning_rate": 0.0007166225030574806, + "loss": 3.7371, + "step": 33380 + }, + { + "epoch": 2.2683109118086695, + "grad_norm": 1.064866065979004, + "learning_rate": 0.0007165800380486479, + "loss": 3.4129, + "step": 33385 + }, + { + "epoch": 2.2686506318793316, + "grad_norm": 1.7911573648452759, + "learning_rate": 0.0007165375730398153, + "loss": 3.5835, + "step": 33390 + }, + { + "epoch": 2.268990351949993, + "grad_norm": 1.2900882959365845, + "learning_rate": 0.0007164951080309825, + "loss": 3.6103, + "step": 33395 + }, + { + "epoch": 2.269330072020655, + "grad_norm": 1.3048884868621826, + "learning_rate": 0.0007164526430221498, + "loss": 3.3876, + "step": 33400 + }, + { + "epoch": 2.269669792091317, + "grad_norm": 1.0018296241760254, + "learning_rate": 0.0007164101780133171, + "loss": 3.3956, + "step": 33405 + }, + { + "epoch": 2.2700095121619785, + "grad_norm": 1.028058648109436, + "learning_rate": 0.0007163677130044843, + "loss": 3.4555, + "step": 33410 + }, + { + "epoch": 2.27034923223264, + "grad_norm": 0.7705847024917603, + "learning_rate": 0.0007163252479956515, + "loss": 3.5944, + "step": 33415 + }, + { + "epoch": 2.2706889523033023, + "grad_norm": 1.0157464742660522, + "learning_rate": 0.0007162827829868189, + "loss": 3.6262, + "step": 33420 + }, + { + "epoch": 2.271028672373964, + "grad_norm": 0.9328880906105042, + "learning_rate": 0.0007162403179779862, + "loss": 3.3556, + "step": 33425 + }, + { + "epoch": 2.2713683924446255, + "grad_norm": 0.9154773950576782, + "learning_rate": 0.0007161978529691534, + "loss": 3.4501, + "step": 33430 + }, + { + "epoch": 2.2717081125152876, + "grad_norm": 0.7220737338066101, + "learning_rate": 0.0007161553879603208, + "loss": 3.3987, + "step": 33435 + }, + { + "epoch": 2.2720478325859492, + "grad_norm": 1.0419389009475708, + "learning_rate": 0.000716112922951488, + "loss": 3.4422, + "step": 33440 + }, + { + "epoch": 2.272387552656611, + "grad_norm": 0.784308910369873, + "learning_rate": 0.0007160704579426552, + "loss": 3.5413, + "step": 33445 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.9580551385879517, + "learning_rate": 0.0007160279929338226, + "loss": 3.3477, + "step": 33450 + }, + { + "epoch": 2.2730669927979346, + "grad_norm": 1.1162357330322266, + "learning_rate": 0.0007159855279249898, + "loss": 3.6345, + "step": 33455 + }, + { + "epoch": 2.273406712868596, + "grad_norm": 1.2811156511306763, + "learning_rate": 0.0007159430629161571, + "loss": 3.5905, + "step": 33460 + }, + { + "epoch": 2.273746432939258, + "grad_norm": 0.9909342527389526, + "learning_rate": 0.0007159005979073245, + "loss": 3.4626, + "step": 33465 + }, + { + "epoch": 2.27408615300992, + "grad_norm": 1.1898667812347412, + "learning_rate": 0.0007158581328984917, + "loss": 3.475, + "step": 33470 + }, + { + "epoch": 2.2744258730805815, + "grad_norm": 0.750446081161499, + "learning_rate": 0.0007158156678896589, + "loss": 3.5958, + "step": 33475 + }, + { + "epoch": 2.274765593151243, + "grad_norm": 0.8788042068481445, + "learning_rate": 0.0007157732028808262, + "loss": 3.5383, + "step": 33480 + }, + { + "epoch": 2.2751053132219052, + "grad_norm": 1.0388602018356323, + "learning_rate": 0.0007157307378719935, + "loss": 3.5444, + "step": 33485 + }, + { + "epoch": 2.275445033292567, + "grad_norm": 0.6480758190155029, + "learning_rate": 0.0007156882728631607, + "loss": 3.7181, + "step": 33490 + }, + { + "epoch": 2.2757847533632285, + "grad_norm": 1.0393098592758179, + "learning_rate": 0.0007156458078543281, + "loss": 3.6888, + "step": 33495 + }, + { + "epoch": 2.2761244734338906, + "grad_norm": 0.7252459526062012, + "learning_rate": 0.0007156033428454954, + "loss": 3.7632, + "step": 33500 + }, + { + "epoch": 2.276464193504552, + "grad_norm": 0.778302788734436, + "learning_rate": 0.0007155608778366626, + "loss": 3.7793, + "step": 33505 + }, + { + "epoch": 2.276803913575214, + "grad_norm": 1.3086539506912231, + "learning_rate": 0.0007155184128278299, + "loss": 3.3842, + "step": 33510 + }, + { + "epoch": 2.277143633645876, + "grad_norm": 0.8399999141693115, + "learning_rate": 0.0007154759478189971, + "loss": 3.6124, + "step": 33515 + }, + { + "epoch": 2.2774833537165375, + "grad_norm": 1.12373948097229, + "learning_rate": 0.0007154334828101644, + "loss": 3.5537, + "step": 33520 + }, + { + "epoch": 2.277823073787199, + "grad_norm": 0.796057939529419, + "learning_rate": 0.0007153910178013317, + "loss": 3.5282, + "step": 33525 + }, + { + "epoch": 2.2781627938578612, + "grad_norm": 0.8396697640419006, + "learning_rate": 0.000715348552792499, + "loss": 3.6177, + "step": 33530 + }, + { + "epoch": 2.278502513928523, + "grad_norm": 0.9612255096435547, + "learning_rate": 0.0007153060877836663, + "loss": 3.4491, + "step": 33535 + }, + { + "epoch": 2.2788422339991845, + "grad_norm": 1.0736452341079712, + "learning_rate": 0.0007152636227748336, + "loss": 3.8327, + "step": 33540 + }, + { + "epoch": 2.2791819540698466, + "grad_norm": 0.9148756861686707, + "learning_rate": 0.0007152211577660008, + "loss": 3.4378, + "step": 33545 + }, + { + "epoch": 2.279521674140508, + "grad_norm": 0.9861878156661987, + "learning_rate": 0.000715178692757168, + "loss": 3.5201, + "step": 33550 + }, + { + "epoch": 2.27986139421117, + "grad_norm": 1.1396753787994385, + "learning_rate": 0.0007151362277483354, + "loss": 3.4472, + "step": 33555 + }, + { + "epoch": 2.280201114281832, + "grad_norm": 1.3787617683410645, + "learning_rate": 0.0007150937627395026, + "loss": 3.5117, + "step": 33560 + }, + { + "epoch": 2.2805408343524936, + "grad_norm": 0.8634085655212402, + "learning_rate": 0.00071505129773067, + "loss": 3.4267, + "step": 33565 + }, + { + "epoch": 2.280880554423155, + "grad_norm": 0.6682226061820984, + "learning_rate": 0.0007150088327218373, + "loss": 3.7799, + "step": 33570 + }, + { + "epoch": 2.2812202744938173, + "grad_norm": 0.9907086491584778, + "learning_rate": 0.0007149663677130045, + "loss": 3.4585, + "step": 33575 + }, + { + "epoch": 2.281559994564479, + "grad_norm": 3.303394317626953, + "learning_rate": 0.0007149239027041717, + "loss": 3.3658, + "step": 33580 + }, + { + "epoch": 2.2818997146351405, + "grad_norm": 1.0231963396072388, + "learning_rate": 0.0007148814376953391, + "loss": 3.6341, + "step": 33585 + }, + { + "epoch": 2.2822394347058026, + "grad_norm": 0.9903303384780884, + "learning_rate": 0.0007148389726865063, + "loss": 3.5056, + "step": 33590 + }, + { + "epoch": 2.2825791547764642, + "grad_norm": 1.2850438356399536, + "learning_rate": 0.0007147965076776735, + "loss": 3.4241, + "step": 33595 + }, + { + "epoch": 2.282918874847126, + "grad_norm": 0.8401099443435669, + "learning_rate": 0.000714754042668841, + "loss": 3.6598, + "step": 33600 + }, + { + "epoch": 2.283258594917788, + "grad_norm": 1.2621382474899292, + "learning_rate": 0.0007147115776600082, + "loss": 3.3817, + "step": 33605 + }, + { + "epoch": 2.2835983149884496, + "grad_norm": 1.245046615600586, + "learning_rate": 0.0007146691126511754, + "loss": 3.621, + "step": 33610 + }, + { + "epoch": 2.283938035059111, + "grad_norm": 0.8342440128326416, + "learning_rate": 0.0007146266476423427, + "loss": 3.5272, + "step": 33615 + }, + { + "epoch": 2.2842777551297733, + "grad_norm": 0.976043701171875, + "learning_rate": 0.00071458418263351, + "loss": 3.5902, + "step": 33620 + }, + { + "epoch": 2.284617475200435, + "grad_norm": 0.8176304697990417, + "learning_rate": 0.0007145417176246772, + "loss": 3.8198, + "step": 33625 + }, + { + "epoch": 2.2849571952710965, + "grad_norm": 0.9821677803993225, + "learning_rate": 0.0007144992526158445, + "loss": 3.4246, + "step": 33630 + }, + { + "epoch": 2.2852969153417586, + "grad_norm": 0.9601004123687744, + "learning_rate": 0.0007144567876070119, + "loss": 3.5342, + "step": 33635 + }, + { + "epoch": 2.2856366354124202, + "grad_norm": 1.3500828742980957, + "learning_rate": 0.0007144143225981791, + "loss": 3.7662, + "step": 33640 + }, + { + "epoch": 2.285976355483082, + "grad_norm": 0.9768383502960205, + "learning_rate": 0.0007143718575893464, + "loss": 3.6966, + "step": 33645 + }, + { + "epoch": 2.286316075553744, + "grad_norm": 0.8044917583465576, + "learning_rate": 0.0007143293925805137, + "loss": 3.537, + "step": 33650 + }, + { + "epoch": 2.2866557956244056, + "grad_norm": 2.1359975337982178, + "learning_rate": 0.0007142869275716809, + "loss": 3.5302, + "step": 33655 + }, + { + "epoch": 2.286995515695067, + "grad_norm": 1.2358940839767456, + "learning_rate": 0.0007142444625628482, + "loss": 3.6943, + "step": 33660 + }, + { + "epoch": 2.2873352357657293, + "grad_norm": 0.880716860294342, + "learning_rate": 0.0007142019975540154, + "loss": 3.471, + "step": 33665 + }, + { + "epoch": 2.287674955836391, + "grad_norm": 0.8520666360855103, + "learning_rate": 0.0007141595325451828, + "loss": 3.5479, + "step": 33670 + }, + { + "epoch": 2.2880146759070525, + "grad_norm": 0.9710259437561035, + "learning_rate": 0.0007141170675363501, + "loss": 3.3061, + "step": 33675 + }, + { + "epoch": 2.288354395977714, + "grad_norm": 0.9275334477424622, + "learning_rate": 0.0007140746025275173, + "loss": 3.4983, + "step": 33680 + }, + { + "epoch": 2.2886941160483762, + "grad_norm": 0.7842603921890259, + "learning_rate": 0.0007140321375186846, + "loss": 3.5875, + "step": 33685 + }, + { + "epoch": 2.289033836119038, + "grad_norm": 0.8783734440803528, + "learning_rate": 0.0007139896725098519, + "loss": 3.6035, + "step": 33690 + }, + { + "epoch": 2.2893735561896995, + "grad_norm": 0.9411804676055908, + "learning_rate": 0.0007139472075010191, + "loss": 3.5975, + "step": 33695 + }, + { + "epoch": 2.2897132762603616, + "grad_norm": 0.848061740398407, + "learning_rate": 0.0007139047424921863, + "loss": 3.8034, + "step": 33700 + }, + { + "epoch": 2.290052996331023, + "grad_norm": 0.730679452419281, + "learning_rate": 0.0007138622774833538, + "loss": 3.5951, + "step": 33705 + }, + { + "epoch": 2.290392716401685, + "grad_norm": 1.1105437278747559, + "learning_rate": 0.000713819812474521, + "loss": 3.3627, + "step": 33710 + }, + { + "epoch": 2.290732436472347, + "grad_norm": 0.9618144035339355, + "learning_rate": 0.0007137773474656883, + "loss": 3.399, + "step": 33715 + }, + { + "epoch": 2.2910721565430086, + "grad_norm": 0.86379474401474, + "learning_rate": 0.0007137348824568556, + "loss": 3.4444, + "step": 33720 + }, + { + "epoch": 2.29141187661367, + "grad_norm": 0.7848593592643738, + "learning_rate": 0.0007136924174480228, + "loss": 3.5625, + "step": 33725 + }, + { + "epoch": 2.2917515966843323, + "grad_norm": 0.8493658900260925, + "learning_rate": 0.0007136499524391901, + "loss": 3.871, + "step": 33730 + }, + { + "epoch": 2.292091316754994, + "grad_norm": 0.866919994354248, + "learning_rate": 0.0007136074874303575, + "loss": 3.4359, + "step": 33735 + }, + { + "epoch": 2.2924310368256555, + "grad_norm": 1.1422805786132812, + "learning_rate": 0.0007135650224215247, + "loss": 3.5226, + "step": 33740 + }, + { + "epoch": 2.2927707568963176, + "grad_norm": 1.0818778276443481, + "learning_rate": 0.000713522557412692, + "loss": 3.4514, + "step": 33745 + }, + { + "epoch": 2.2931104769669792, + "grad_norm": 0.9379894733428955, + "learning_rate": 0.0007134800924038593, + "loss": 3.6945, + "step": 33750 + }, + { + "epoch": 2.293450197037641, + "grad_norm": 0.8096943497657776, + "learning_rate": 0.0007134376273950265, + "loss": 3.6749, + "step": 33755 + }, + { + "epoch": 2.293789917108303, + "grad_norm": 0.7364004850387573, + "learning_rate": 0.0007133951623861938, + "loss": 3.354, + "step": 33760 + }, + { + "epoch": 2.2941296371789646, + "grad_norm": 0.7997116446495056, + "learning_rate": 0.000713352697377361, + "loss": 3.2872, + "step": 33765 + }, + { + "epoch": 2.294469357249626, + "grad_norm": 1.0711442232131958, + "learning_rate": 0.0007133102323685284, + "loss": 3.6762, + "step": 33770 + }, + { + "epoch": 2.2948090773202883, + "grad_norm": 0.9138102531433105, + "learning_rate": 0.0007132677673596957, + "loss": 3.5422, + "step": 33775 + }, + { + "epoch": 2.29514879739095, + "grad_norm": 0.8442439436912537, + "learning_rate": 0.0007132253023508629, + "loss": 3.4862, + "step": 33780 + }, + { + "epoch": 2.2954885174616115, + "grad_norm": 0.9199173450469971, + "learning_rate": 0.0007131828373420302, + "loss": 3.5981, + "step": 33785 + }, + { + "epoch": 2.295828237532273, + "grad_norm": 1.0445491075515747, + "learning_rate": 0.0007131403723331975, + "loss": 3.4685, + "step": 33790 + }, + { + "epoch": 2.2961679576029352, + "grad_norm": 0.9248667359352112, + "learning_rate": 0.0007130979073243647, + "loss": 3.6227, + "step": 33795 + }, + { + "epoch": 2.296507677673597, + "grad_norm": 0.7582318186759949, + "learning_rate": 0.000713055442315532, + "loss": 3.6485, + "step": 33800 + }, + { + "epoch": 2.2968473977442585, + "grad_norm": 1.8549916744232178, + "learning_rate": 0.0007130129773066994, + "loss": 3.588, + "step": 33805 + }, + { + "epoch": 2.2971871178149206, + "grad_norm": 4.677762031555176, + "learning_rate": 0.0007129705122978666, + "loss": 3.481, + "step": 33810 + }, + { + "epoch": 2.297526837885582, + "grad_norm": 0.7686132192611694, + "learning_rate": 0.0007129280472890338, + "loss": 3.647, + "step": 33815 + }, + { + "epoch": 2.297866557956244, + "grad_norm": 0.7648473978042603, + "learning_rate": 0.0007128855822802012, + "loss": 3.5364, + "step": 33820 + }, + { + "epoch": 2.298206278026906, + "grad_norm": 0.8098897933959961, + "learning_rate": 0.0007128431172713684, + "loss": 3.4042, + "step": 33825 + }, + { + "epoch": 2.2985459980975675, + "grad_norm": 0.9123522043228149, + "learning_rate": 0.0007128006522625356, + "loss": 3.5994, + "step": 33830 + }, + { + "epoch": 2.298885718168229, + "grad_norm": 0.9420242309570312, + "learning_rate": 0.000712758187253703, + "loss": 3.4539, + "step": 33835 + }, + { + "epoch": 2.2992254382388913, + "grad_norm": 0.9455181956291199, + "learning_rate": 0.0007127157222448703, + "loss": 3.4559, + "step": 33840 + }, + { + "epoch": 2.299565158309553, + "grad_norm": 0.8218865394592285, + "learning_rate": 0.0007126732572360375, + "loss": 3.814, + "step": 33845 + }, + { + "epoch": 2.2999048783802145, + "grad_norm": 1.0172486305236816, + "learning_rate": 0.0007126307922272049, + "loss": 3.6328, + "step": 33850 + }, + { + "epoch": 2.3002445984508766, + "grad_norm": 0.7569391131401062, + "learning_rate": 0.0007125883272183721, + "loss": 3.5599, + "step": 33855 + }, + { + "epoch": 2.300584318521538, + "grad_norm": 0.7924619913101196, + "learning_rate": 0.0007125458622095393, + "loss": 3.2956, + "step": 33860 + }, + { + "epoch": 2.3009240385922, + "grad_norm": 0.8539909720420837, + "learning_rate": 0.0007125033972007066, + "loss": 3.7459, + "step": 33865 + }, + { + "epoch": 2.301263758662862, + "grad_norm": 0.8298889398574829, + "learning_rate": 0.0007124609321918739, + "loss": 3.2861, + "step": 33870 + }, + { + "epoch": 2.3016034787335236, + "grad_norm": 2.6790945529937744, + "learning_rate": 0.0007124184671830412, + "loss": 3.3668, + "step": 33875 + }, + { + "epoch": 2.301943198804185, + "grad_norm": 1.0063676834106445, + "learning_rate": 0.0007123760021742085, + "loss": 3.6369, + "step": 33880 + }, + { + "epoch": 2.3022829188748473, + "grad_norm": 2.085052251815796, + "learning_rate": 0.0007123335371653758, + "loss": 3.4109, + "step": 33885 + }, + { + "epoch": 2.302622638945509, + "grad_norm": 0.7558144330978394, + "learning_rate": 0.000712291072156543, + "loss": 3.5917, + "step": 33890 + }, + { + "epoch": 2.3029623590161705, + "grad_norm": 0.847494900226593, + "learning_rate": 0.0007122486071477103, + "loss": 3.5922, + "step": 33895 + }, + { + "epoch": 2.3033020790868326, + "grad_norm": 0.869045078754425, + "learning_rate": 0.0007122061421388775, + "loss": 3.3857, + "step": 33900 + }, + { + "epoch": 2.3036417991574942, + "grad_norm": 0.9296221733093262, + "learning_rate": 0.0007121636771300448, + "loss": 3.6821, + "step": 33905 + }, + { + "epoch": 2.303981519228156, + "grad_norm": 0.7906876802444458, + "learning_rate": 0.0007121212121212122, + "loss": 3.6001, + "step": 33910 + }, + { + "epoch": 2.304321239298818, + "grad_norm": 0.9065683484077454, + "learning_rate": 0.0007120787471123794, + "loss": 3.794, + "step": 33915 + }, + { + "epoch": 2.3046609593694796, + "grad_norm": 1.1217070817947388, + "learning_rate": 0.0007120362821035467, + "loss": 3.7182, + "step": 33920 + }, + { + "epoch": 2.305000679440141, + "grad_norm": 0.8714315891265869, + "learning_rate": 0.000711993817094714, + "loss": 3.6959, + "step": 33925 + }, + { + "epoch": 2.3053403995108033, + "grad_norm": 0.80816650390625, + "learning_rate": 0.0007119513520858812, + "loss": 3.3584, + "step": 33930 + }, + { + "epoch": 2.305680119581465, + "grad_norm": 0.8936364650726318, + "learning_rate": 0.0007119088870770485, + "loss": 3.5974, + "step": 33935 + }, + { + "epoch": 2.3060198396521265, + "grad_norm": 0.8761956095695496, + "learning_rate": 0.0007118664220682158, + "loss": 3.6091, + "step": 33940 + }, + { + "epoch": 2.3063595597227886, + "grad_norm": 0.8794265389442444, + "learning_rate": 0.0007118239570593831, + "loss": 3.6317, + "step": 33945 + }, + { + "epoch": 2.3066992797934502, + "grad_norm": 1.1960383653640747, + "learning_rate": 0.0007117814920505504, + "loss": 3.4882, + "step": 33950 + }, + { + "epoch": 2.307038999864112, + "grad_norm": 0.8336586952209473, + "learning_rate": 0.0007117390270417177, + "loss": 3.4412, + "step": 33955 + }, + { + "epoch": 2.307378719934774, + "grad_norm": 0.9741801619529724, + "learning_rate": 0.0007116965620328849, + "loss": 3.6574, + "step": 33960 + }, + { + "epoch": 2.3077184400054356, + "grad_norm": 0.9704198837280273, + "learning_rate": 0.0007116540970240521, + "loss": 3.4503, + "step": 33965 + }, + { + "epoch": 2.308058160076097, + "grad_norm": 1.0081371068954468, + "learning_rate": 0.0007116116320152195, + "loss": 3.2394, + "step": 33970 + }, + { + "epoch": 2.3083978801467593, + "grad_norm": 0.7106031179428101, + "learning_rate": 0.0007115691670063867, + "loss": 3.6931, + "step": 33975 + }, + { + "epoch": 2.308737600217421, + "grad_norm": 1.1170090436935425, + "learning_rate": 0.000711526701997554, + "loss": 3.2987, + "step": 33980 + }, + { + "epoch": 2.3090773202880825, + "grad_norm": 1.1644269227981567, + "learning_rate": 0.0007114842369887214, + "loss": 3.6558, + "step": 33985 + }, + { + "epoch": 2.3094170403587446, + "grad_norm": 1.945035457611084, + "learning_rate": 0.0007114417719798886, + "loss": 3.5289, + "step": 33990 + }, + { + "epoch": 2.3097567604294063, + "grad_norm": 1.045504093170166, + "learning_rate": 0.0007113993069710558, + "loss": 3.7798, + "step": 33995 + }, + { + "epoch": 2.310096480500068, + "grad_norm": 1.1297615766525269, + "learning_rate": 0.0007113568419622232, + "loss": 3.5192, + "step": 34000 + }, + { + "epoch": 2.31043620057073, + "grad_norm": 0.8465031981468201, + "learning_rate": 0.0007113143769533904, + "loss": 3.6743, + "step": 34005 + }, + { + "epoch": 2.3107759206413916, + "grad_norm": 0.9677177667617798, + "learning_rate": 0.0007112719119445576, + "loss": 3.7342, + "step": 34010 + }, + { + "epoch": 2.311115640712053, + "grad_norm": 0.8363118171691895, + "learning_rate": 0.000711229446935725, + "loss": 3.6671, + "step": 34015 + }, + { + "epoch": 2.311455360782715, + "grad_norm": 0.7494251132011414, + "learning_rate": 0.0007111869819268923, + "loss": 3.6014, + "step": 34020 + }, + { + "epoch": 2.311795080853377, + "grad_norm": 0.8222722411155701, + "learning_rate": 0.0007111445169180595, + "loss": 3.5464, + "step": 34025 + }, + { + "epoch": 2.3121348009240386, + "grad_norm": 1.1161575317382812, + "learning_rate": 0.0007111020519092268, + "loss": 3.3265, + "step": 34030 + }, + { + "epoch": 2.3124745209947, + "grad_norm": 0.9769174456596375, + "learning_rate": 0.0007110595869003941, + "loss": 3.686, + "step": 34035 + }, + { + "epoch": 2.3128142410653623, + "grad_norm": 0.7697268128395081, + "learning_rate": 0.0007110171218915613, + "loss": 3.7865, + "step": 34040 + }, + { + "epoch": 2.313153961136024, + "grad_norm": 0.9875672459602356, + "learning_rate": 0.0007109746568827286, + "loss": 3.7255, + "step": 34045 + }, + { + "epoch": 2.3134936812066855, + "grad_norm": 0.9597907662391663, + "learning_rate": 0.000710932191873896, + "loss": 3.5042, + "step": 34050 + }, + { + "epoch": 2.3138334012773476, + "grad_norm": 1.126318097114563, + "learning_rate": 0.0007108897268650633, + "loss": 3.2927, + "step": 34055 + }, + { + "epoch": 2.3141731213480092, + "grad_norm": 0.7817652821540833, + "learning_rate": 0.0007108472618562305, + "loss": 3.3581, + "step": 34060 + }, + { + "epoch": 2.314512841418671, + "grad_norm": 0.7602252960205078, + "learning_rate": 0.0007108047968473977, + "loss": 3.6923, + "step": 34065 + }, + { + "epoch": 2.314852561489333, + "grad_norm": 0.9741318821907043, + "learning_rate": 0.0007107623318385651, + "loss": 3.4795, + "step": 34070 + }, + { + "epoch": 2.3151922815599946, + "grad_norm": 0.773223876953125, + "learning_rate": 0.0007107198668297323, + "loss": 3.381, + "step": 34075 + }, + { + "epoch": 2.315532001630656, + "grad_norm": 0.8746182918548584, + "learning_rate": 0.0007106774018208995, + "loss": 3.7554, + "step": 34080 + }, + { + "epoch": 2.3158717217013183, + "grad_norm": 0.819348931312561, + "learning_rate": 0.000710634936812067, + "loss": 3.7826, + "step": 34085 + }, + { + "epoch": 2.31621144177198, + "grad_norm": 0.6750273704528809, + "learning_rate": 0.0007105924718032342, + "loss": 3.6951, + "step": 34090 + }, + { + "epoch": 2.3165511618426415, + "grad_norm": 0.831307053565979, + "learning_rate": 0.0007105500067944014, + "loss": 3.8154, + "step": 34095 + }, + { + "epoch": 2.3168908819133036, + "grad_norm": 0.6851188540458679, + "learning_rate": 0.0007105075417855688, + "loss": 3.7037, + "step": 34100 + }, + { + "epoch": 2.3172306019839652, + "grad_norm": 0.8896733522415161, + "learning_rate": 0.000710465076776736, + "loss": 3.3309, + "step": 34105 + }, + { + "epoch": 2.317570322054627, + "grad_norm": 0.7745558619499207, + "learning_rate": 0.0007104226117679032, + "loss": 3.6754, + "step": 34110 + }, + { + "epoch": 2.317910042125289, + "grad_norm": 1.056715726852417, + "learning_rate": 0.0007103801467590705, + "loss": 3.7331, + "step": 34115 + }, + { + "epoch": 2.3182497621959506, + "grad_norm": 5.35130500793457, + "learning_rate": 0.0007103376817502379, + "loss": 3.6884, + "step": 34120 + }, + { + "epoch": 2.318589482266612, + "grad_norm": 0.8892108201980591, + "learning_rate": 0.0007102952167414051, + "loss": 3.6201, + "step": 34125 + }, + { + "epoch": 2.318929202337274, + "grad_norm": 0.9524503350257874, + "learning_rate": 0.0007102527517325724, + "loss": 3.5783, + "step": 34130 + }, + { + "epoch": 2.319268922407936, + "grad_norm": 0.8760326504707336, + "learning_rate": 0.0007102102867237397, + "loss": 3.7475, + "step": 34135 + }, + { + "epoch": 2.3196086424785975, + "grad_norm": 1.3569422960281372, + "learning_rate": 0.0007101678217149069, + "loss": 3.7498, + "step": 34140 + }, + { + "epoch": 2.319948362549259, + "grad_norm": 1.0268141031265259, + "learning_rate": 0.0007101253567060742, + "loss": 3.4252, + "step": 34145 + }, + { + "epoch": 2.3202880826199213, + "grad_norm": 0.9507217407226562, + "learning_rate": 0.0007100828916972414, + "loss": 3.6467, + "step": 34150 + }, + { + "epoch": 2.320627802690583, + "grad_norm": 0.9067598581314087, + "learning_rate": 0.0007100404266884088, + "loss": 3.4077, + "step": 34155 + }, + { + "epoch": 2.3209675227612445, + "grad_norm": 1.1590073108673096, + "learning_rate": 0.0007099979616795761, + "loss": 3.4224, + "step": 34160 + }, + { + "epoch": 2.3213072428319066, + "grad_norm": 0.8116672039031982, + "learning_rate": 0.0007099554966707433, + "loss": 3.4424, + "step": 34165 + }, + { + "epoch": 2.3216469629025682, + "grad_norm": 0.8196688294410706, + "learning_rate": 0.0007099130316619106, + "loss": 3.5628, + "step": 34170 + }, + { + "epoch": 2.32198668297323, + "grad_norm": 0.8279538154602051, + "learning_rate": 0.0007098705666530779, + "loss": 3.3377, + "step": 34175 + }, + { + "epoch": 2.322326403043892, + "grad_norm": 0.8963621854782104, + "learning_rate": 0.0007098281016442451, + "loss": 3.3373, + "step": 34180 + }, + { + "epoch": 2.3226661231145536, + "grad_norm": 1.0244994163513184, + "learning_rate": 0.0007097856366354124, + "loss": 3.3829, + "step": 34185 + }, + { + "epoch": 2.323005843185215, + "grad_norm": 0.8661332726478577, + "learning_rate": 0.0007097431716265798, + "loss": 3.3327, + "step": 34190 + }, + { + "epoch": 2.3233455632558773, + "grad_norm": 0.9949436187744141, + "learning_rate": 0.000709700706617747, + "loss": 3.7054, + "step": 34195 + }, + { + "epoch": 2.323685283326539, + "grad_norm": 0.9796545505523682, + "learning_rate": 0.0007096582416089142, + "loss": 3.4561, + "step": 34200 + }, + { + "epoch": 2.3240250033972005, + "grad_norm": 1.0886019468307495, + "learning_rate": 0.0007096157766000816, + "loss": 3.4384, + "step": 34205 + }, + { + "epoch": 2.3243647234678626, + "grad_norm": 1.2960582971572876, + "learning_rate": 0.0007095733115912488, + "loss": 3.9064, + "step": 34210 + }, + { + "epoch": 2.3247044435385242, + "grad_norm": 0.7183921337127686, + "learning_rate": 0.000709530846582416, + "loss": 3.4901, + "step": 34215 + }, + { + "epoch": 2.325044163609186, + "grad_norm": 0.9489277005195618, + "learning_rate": 0.0007094883815735834, + "loss": 3.8073, + "step": 34220 + }, + { + "epoch": 2.325383883679848, + "grad_norm": 1.1430370807647705, + "learning_rate": 0.0007094459165647507, + "loss": 3.4529, + "step": 34225 + }, + { + "epoch": 2.3257236037505096, + "grad_norm": 0.8577336072921753, + "learning_rate": 0.0007094034515559179, + "loss": 3.6817, + "step": 34230 + }, + { + "epoch": 2.326063323821171, + "grad_norm": 0.86420738697052, + "learning_rate": 0.0007093609865470853, + "loss": 3.5597, + "step": 34235 + }, + { + "epoch": 2.3264030438918333, + "grad_norm": 0.9296112656593323, + "learning_rate": 0.0007093185215382525, + "loss": 3.658, + "step": 34240 + }, + { + "epoch": 2.326742763962495, + "grad_norm": 0.9046612977981567, + "learning_rate": 0.0007092760565294197, + "loss": 3.543, + "step": 34245 + }, + { + "epoch": 2.3270824840331565, + "grad_norm": 0.9789881706237793, + "learning_rate": 0.000709233591520587, + "loss": 3.6472, + "step": 34250 + }, + { + "epoch": 2.3274222041038186, + "grad_norm": 0.8176478743553162, + "learning_rate": 0.0007091911265117543, + "loss": 3.5964, + "step": 34255 + }, + { + "epoch": 2.3277619241744802, + "grad_norm": 0.8777276873588562, + "learning_rate": 0.0007091486615029216, + "loss": 3.6325, + "step": 34260 + }, + { + "epoch": 2.328101644245142, + "grad_norm": 0.8540732860565186, + "learning_rate": 0.0007091061964940889, + "loss": 3.3151, + "step": 34265 + }, + { + "epoch": 2.328441364315804, + "grad_norm": 0.877428412437439, + "learning_rate": 0.0007090637314852562, + "loss": 3.725, + "step": 34270 + }, + { + "epoch": 2.3287810843864656, + "grad_norm": 0.9115272164344788, + "learning_rate": 0.0007090212664764234, + "loss": 3.3367, + "step": 34275 + }, + { + "epoch": 2.329120804457127, + "grad_norm": 0.7930452227592468, + "learning_rate": 0.0007089788014675907, + "loss": 3.9705, + "step": 34280 + }, + { + "epoch": 2.3294605245277893, + "grad_norm": 0.9343270063400269, + "learning_rate": 0.000708936336458758, + "loss": 3.4714, + "step": 34285 + }, + { + "epoch": 2.329800244598451, + "grad_norm": 0.995246171951294, + "learning_rate": 0.0007088938714499252, + "loss": 3.3528, + "step": 34290 + }, + { + "epoch": 2.3301399646691126, + "grad_norm": 0.8284001350402832, + "learning_rate": 0.0007088514064410926, + "loss": 3.7201, + "step": 34295 + }, + { + "epoch": 2.3304796847397746, + "grad_norm": 0.8096879720687866, + "learning_rate": 0.0007088089414322598, + "loss": 3.4655, + "step": 34300 + }, + { + "epoch": 2.3308194048104363, + "grad_norm": 0.9578198790550232, + "learning_rate": 0.0007087664764234271, + "loss": 3.7305, + "step": 34305 + }, + { + "epoch": 2.331159124881098, + "grad_norm": 0.9140468239784241, + "learning_rate": 0.0007087240114145944, + "loss": 3.6709, + "step": 34310 + }, + { + "epoch": 2.33149884495176, + "grad_norm": 1.1044596433639526, + "learning_rate": 0.0007086815464057616, + "loss": 3.6034, + "step": 34315 + }, + { + "epoch": 2.3318385650224216, + "grad_norm": 1.0751473903656006, + "learning_rate": 0.0007086390813969289, + "loss": 3.3236, + "step": 34320 + }, + { + "epoch": 2.3321782850930832, + "grad_norm": 1.106453537940979, + "learning_rate": 0.0007085966163880963, + "loss": 3.4337, + "step": 34325 + }, + { + "epoch": 2.3325180051637453, + "grad_norm": 0.851236879825592, + "learning_rate": 0.0007085541513792635, + "loss": 3.6697, + "step": 34330 + }, + { + "epoch": 2.332857725234407, + "grad_norm": 0.8308385610580444, + "learning_rate": 0.0007085116863704308, + "loss": 3.4378, + "step": 34335 + }, + { + "epoch": 2.3331974453050686, + "grad_norm": 1.0674384832382202, + "learning_rate": 0.0007084692213615981, + "loss": 3.5797, + "step": 34340 + }, + { + "epoch": 2.3335371653757306, + "grad_norm": 0.9432170391082764, + "learning_rate": 0.0007084267563527653, + "loss": 3.4976, + "step": 34345 + }, + { + "epoch": 2.3338768854463923, + "grad_norm": 0.8960615992546082, + "learning_rate": 0.0007083842913439325, + "loss": 3.6446, + "step": 34350 + }, + { + "epoch": 2.334216605517054, + "grad_norm": 1.1177794933319092, + "learning_rate": 0.0007083418263350999, + "loss": 3.5753, + "step": 34355 + }, + { + "epoch": 2.3345563255877155, + "grad_norm": 0.7682883143424988, + "learning_rate": 0.0007082993613262672, + "loss": 3.7885, + "step": 34360 + }, + { + "epoch": 2.3348960456583776, + "grad_norm": 0.8142637610435486, + "learning_rate": 0.0007082568963174344, + "loss": 3.5269, + "step": 34365 + }, + { + "epoch": 2.3352357657290392, + "grad_norm": 0.8764776587486267, + "learning_rate": 0.0007082144313086018, + "loss": 3.6855, + "step": 34370 + }, + { + "epoch": 2.335575485799701, + "grad_norm": 1.3206007480621338, + "learning_rate": 0.000708171966299769, + "loss": 3.6216, + "step": 34375 + }, + { + "epoch": 2.335915205870363, + "grad_norm": 0.8086995482444763, + "learning_rate": 0.0007081295012909362, + "loss": 3.4293, + "step": 34380 + }, + { + "epoch": 2.3362549259410246, + "grad_norm": 0.8999928832054138, + "learning_rate": 0.0007080870362821036, + "loss": 3.604, + "step": 34385 + }, + { + "epoch": 2.336594646011686, + "grad_norm": 1.4924781322479248, + "learning_rate": 0.0007080445712732708, + "loss": 3.5104, + "step": 34390 + }, + { + "epoch": 2.3369343660823483, + "grad_norm": 0.8843149542808533, + "learning_rate": 0.0007080021062644382, + "loss": 3.6769, + "step": 34395 + }, + { + "epoch": 2.33727408615301, + "grad_norm": 1.116897702217102, + "learning_rate": 0.0007079596412556054, + "loss": 3.7095, + "step": 34400 + }, + { + "epoch": 2.3376138062236715, + "grad_norm": 0.9213428497314453, + "learning_rate": 0.0007079171762467727, + "loss": 3.5794, + "step": 34405 + }, + { + "epoch": 2.3379535262943336, + "grad_norm": 1.0569889545440674, + "learning_rate": 0.00070787471123794, + "loss": 3.6598, + "step": 34410 + }, + { + "epoch": 2.3382932463649952, + "grad_norm": 0.9543172121047974, + "learning_rate": 0.0007078322462291072, + "loss": 3.6107, + "step": 34415 + }, + { + "epoch": 2.338632966435657, + "grad_norm": 5.221427917480469, + "learning_rate": 0.0007077897812202745, + "loss": 3.3769, + "step": 34420 + }, + { + "epoch": 2.338972686506319, + "grad_norm": 0.8090324401855469, + "learning_rate": 0.0007077473162114418, + "loss": 3.1978, + "step": 34425 + }, + { + "epoch": 2.3393124065769806, + "grad_norm": 0.8245089650154114, + "learning_rate": 0.0007077048512026091, + "loss": 3.6428, + "step": 34430 + }, + { + "epoch": 2.339652126647642, + "grad_norm": 0.7341430187225342, + "learning_rate": 0.0007076623861937764, + "loss": 3.7518, + "step": 34435 + }, + { + "epoch": 2.3399918467183043, + "grad_norm": 1.017844796180725, + "learning_rate": 0.0007076199211849437, + "loss": 3.3691, + "step": 34440 + }, + { + "epoch": 2.340331566788966, + "grad_norm": 0.8448595404624939, + "learning_rate": 0.0007075774561761109, + "loss": 3.5444, + "step": 34445 + }, + { + "epoch": 2.3406712868596276, + "grad_norm": 0.9568013548851013, + "learning_rate": 0.0007075349911672781, + "loss": 3.5888, + "step": 34450 + }, + { + "epoch": 2.3410110069302896, + "grad_norm": 0.8860839605331421, + "learning_rate": 0.0007074925261584455, + "loss": 3.7731, + "step": 34455 + }, + { + "epoch": 2.3413507270009513, + "grad_norm": 1.4220607280731201, + "learning_rate": 0.0007074500611496127, + "loss": 3.7152, + "step": 34460 + }, + { + "epoch": 2.341690447071613, + "grad_norm": 0.9543977379798889, + "learning_rate": 0.00070740759614078, + "loss": 3.5764, + "step": 34465 + }, + { + "epoch": 2.3420301671422745, + "grad_norm": 0.7432539463043213, + "learning_rate": 0.0007073651311319474, + "loss": 3.5308, + "step": 34470 + }, + { + "epoch": 2.3423698872129366, + "grad_norm": 0.9527960419654846, + "learning_rate": 0.0007073226661231146, + "loss": 3.3133, + "step": 34475 + }, + { + "epoch": 2.3427096072835982, + "grad_norm": 0.9432029724121094, + "learning_rate": 0.0007072802011142818, + "loss": 3.6704, + "step": 34480 + }, + { + "epoch": 2.34304932735426, + "grad_norm": 0.908528745174408, + "learning_rate": 0.0007072377361054492, + "loss": 3.5668, + "step": 34485 + }, + { + "epoch": 2.343389047424922, + "grad_norm": 0.8764864206314087, + "learning_rate": 0.0007071952710966164, + "loss": 3.6422, + "step": 34490 + }, + { + "epoch": 2.3437287674955836, + "grad_norm": 0.9141190052032471, + "learning_rate": 0.0007071528060877836, + "loss": 3.7268, + "step": 34495 + }, + { + "epoch": 2.344068487566245, + "grad_norm": 0.9215124249458313, + "learning_rate": 0.000707110341078951, + "loss": 3.3629, + "step": 34500 + }, + { + "epoch": 2.3444082076369073, + "grad_norm": 1.0514782667160034, + "learning_rate": 0.0007070678760701183, + "loss": 3.5794, + "step": 34505 + }, + { + "epoch": 2.344747927707569, + "grad_norm": 1.2034345865249634, + "learning_rate": 0.0007070254110612855, + "loss": 3.559, + "step": 34510 + }, + { + "epoch": 2.3450876477782305, + "grad_norm": 1.1555936336517334, + "learning_rate": 0.0007069829460524528, + "loss": 3.4819, + "step": 34515 + }, + { + "epoch": 2.3454273678488926, + "grad_norm": 0.8539782762527466, + "learning_rate": 0.0007069404810436201, + "loss": 3.4025, + "step": 34520 + }, + { + "epoch": 2.3457670879195542, + "grad_norm": 0.8233159780502319, + "learning_rate": 0.0007068980160347873, + "loss": 3.6727, + "step": 34525 + }, + { + "epoch": 2.346106807990216, + "grad_norm": 0.8086985349655151, + "learning_rate": 0.0007068555510259546, + "loss": 3.7192, + "step": 34530 + }, + { + "epoch": 2.346446528060878, + "grad_norm": 0.8654470443725586, + "learning_rate": 0.000706813086017122, + "loss": 3.7379, + "step": 34535 + }, + { + "epoch": 2.3467862481315396, + "grad_norm": 0.941077709197998, + "learning_rate": 0.0007067706210082892, + "loss": 3.3578, + "step": 34540 + }, + { + "epoch": 2.347125968202201, + "grad_norm": 0.8511686325073242, + "learning_rate": 0.0007067281559994565, + "loss": 3.5796, + "step": 34545 + }, + { + "epoch": 2.3474656882728633, + "grad_norm": 1.0091644525527954, + "learning_rate": 0.0007066856909906237, + "loss": 3.7316, + "step": 34550 + }, + { + "epoch": 2.347805408343525, + "grad_norm": 0.8682757616043091, + "learning_rate": 0.000706643225981791, + "loss": 3.6452, + "step": 34555 + }, + { + "epoch": 2.3481451284141865, + "grad_norm": 0.8162279725074768, + "learning_rate": 0.0007066007609729583, + "loss": 3.7179, + "step": 34560 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.8765701055526733, + "learning_rate": 0.0007065582959641255, + "loss": 3.7126, + "step": 34565 + }, + { + "epoch": 2.3488245685555103, + "grad_norm": 1.0220664739608765, + "learning_rate": 0.0007065158309552929, + "loss": 3.5807, + "step": 34570 + }, + { + "epoch": 2.349164288626172, + "grad_norm": 1.024558424949646, + "learning_rate": 0.0007064733659464602, + "loss": 3.5229, + "step": 34575 + }, + { + "epoch": 2.349504008696834, + "grad_norm": 0.8686379790306091, + "learning_rate": 0.0007064309009376274, + "loss": 3.6204, + "step": 34580 + }, + { + "epoch": 2.3498437287674956, + "grad_norm": 0.9166041612625122, + "learning_rate": 0.0007063884359287946, + "loss": 3.5174, + "step": 34585 + }, + { + "epoch": 2.350183448838157, + "grad_norm": 0.7181501984596252, + "learning_rate": 0.000706345970919962, + "loss": 3.7305, + "step": 34590 + }, + { + "epoch": 2.3505231689088193, + "grad_norm": 0.8669226169586182, + "learning_rate": 0.0007063035059111292, + "loss": 3.4213, + "step": 34595 + }, + { + "epoch": 2.350862888979481, + "grad_norm": 0.6995319128036499, + "learning_rate": 0.0007062610409022964, + "loss": 3.5898, + "step": 34600 + }, + { + "epoch": 2.3512026090501426, + "grad_norm": 1.0490045547485352, + "learning_rate": 0.0007062185758934639, + "loss": 3.6326, + "step": 34605 + }, + { + "epoch": 2.3515423291208046, + "grad_norm": 0.989657998085022, + "learning_rate": 0.0007061761108846311, + "loss": 3.7165, + "step": 34610 + }, + { + "epoch": 2.3518820491914663, + "grad_norm": 1.2742961645126343, + "learning_rate": 0.0007061336458757983, + "loss": 3.491, + "step": 34615 + }, + { + "epoch": 2.352221769262128, + "grad_norm": 0.740024983882904, + "learning_rate": 0.0007060911808669657, + "loss": 3.4793, + "step": 34620 + }, + { + "epoch": 2.35256148933279, + "grad_norm": 0.675859272480011, + "learning_rate": 0.0007060487158581329, + "loss": 3.6386, + "step": 34625 + }, + { + "epoch": 2.3529012094034516, + "grad_norm": 0.9767183661460876, + "learning_rate": 0.0007060062508493001, + "loss": 3.7101, + "step": 34630 + }, + { + "epoch": 2.3532409294741132, + "grad_norm": 1.0400917530059814, + "learning_rate": 0.0007059637858404674, + "loss": 3.8331, + "step": 34635 + }, + { + "epoch": 2.3535806495447753, + "grad_norm": 0.8829396963119507, + "learning_rate": 0.0007059213208316348, + "loss": 3.4398, + "step": 34640 + }, + { + "epoch": 2.353920369615437, + "grad_norm": 0.7119733095169067, + "learning_rate": 0.000705878855822802, + "loss": 3.5307, + "step": 34645 + }, + { + "epoch": 2.3542600896860986, + "grad_norm": 2.20643949508667, + "learning_rate": 0.0007058363908139693, + "loss": 3.5571, + "step": 34650 + }, + { + "epoch": 2.3545998097567606, + "grad_norm": 1.0329947471618652, + "learning_rate": 0.0007057939258051366, + "loss": 3.6579, + "step": 34655 + }, + { + "epoch": 2.3549395298274223, + "grad_norm": 1.016506314277649, + "learning_rate": 0.0007057514607963038, + "loss": 3.5526, + "step": 34660 + }, + { + "epoch": 2.355279249898084, + "grad_norm": 0.9393744468688965, + "learning_rate": 0.0007057089957874711, + "loss": 3.637, + "step": 34665 + }, + { + "epoch": 2.355618969968746, + "grad_norm": 0.7924431562423706, + "learning_rate": 0.0007056665307786384, + "loss": 3.6351, + "step": 34670 + }, + { + "epoch": 2.3559586900394076, + "grad_norm": 0.752042829990387, + "learning_rate": 0.0007056240657698057, + "loss": 3.6295, + "step": 34675 + }, + { + "epoch": 2.3562984101100692, + "grad_norm": 1.1009727716445923, + "learning_rate": 0.000705581600760973, + "loss": 3.5315, + "step": 34680 + }, + { + "epoch": 2.3566381301807313, + "grad_norm": 0.7764983177185059, + "learning_rate": 0.0007055391357521403, + "loss": 3.4849, + "step": 34685 + }, + { + "epoch": 2.356977850251393, + "grad_norm": 0.7163074016571045, + "learning_rate": 0.0007054966707433075, + "loss": 3.5528, + "step": 34690 + }, + { + "epoch": 2.3573175703220546, + "grad_norm": 0.8586317300796509, + "learning_rate": 0.0007054542057344748, + "loss": 3.4672, + "step": 34695 + }, + { + "epoch": 2.357657290392716, + "grad_norm": 0.7837478518486023, + "learning_rate": 0.000705411740725642, + "loss": 3.9098, + "step": 34700 + }, + { + "epoch": 2.3579970104633783, + "grad_norm": 1.0551223754882812, + "learning_rate": 0.0007053692757168093, + "loss": 3.4887, + "step": 34705 + }, + { + "epoch": 2.35833673053404, + "grad_norm": 0.8320517539978027, + "learning_rate": 0.0007053268107079767, + "loss": 3.4093, + "step": 34710 + }, + { + "epoch": 2.3586764506047015, + "grad_norm": 0.9051541686058044, + "learning_rate": 0.0007052843456991439, + "loss": 3.3979, + "step": 34715 + }, + { + "epoch": 2.3590161706753636, + "grad_norm": 0.8513590097427368, + "learning_rate": 0.0007052418806903112, + "loss": 3.6951, + "step": 34720 + }, + { + "epoch": 2.3593558907460253, + "grad_norm": 0.9706920981407166, + "learning_rate": 0.0007051994156814785, + "loss": 3.6411, + "step": 34725 + }, + { + "epoch": 2.359695610816687, + "grad_norm": 0.7920941710472107, + "learning_rate": 0.0007051569506726457, + "loss": 3.5058, + "step": 34730 + }, + { + "epoch": 2.360035330887349, + "grad_norm": 0.8491498827934265, + "learning_rate": 0.000705114485663813, + "loss": 3.4898, + "step": 34735 + }, + { + "epoch": 2.3603750509580106, + "grad_norm": 0.874626636505127, + "learning_rate": 0.0007050720206549803, + "loss": 3.2673, + "step": 34740 + }, + { + "epoch": 2.360714771028672, + "grad_norm": 0.8118370771408081, + "learning_rate": 0.0007050295556461476, + "loss": 3.5537, + "step": 34745 + }, + { + "epoch": 2.3610544910993343, + "grad_norm": 0.9023318886756897, + "learning_rate": 0.000704987090637315, + "loss": 3.6093, + "step": 34750 + }, + { + "epoch": 2.361394211169996, + "grad_norm": 0.8233477473258972, + "learning_rate": 0.0007049446256284822, + "loss": 3.5736, + "step": 34755 + }, + { + "epoch": 2.3617339312406576, + "grad_norm": 0.7776183485984802, + "learning_rate": 0.0007049021606196494, + "loss": 3.6213, + "step": 34760 + }, + { + "epoch": 2.3620736513113196, + "grad_norm": 0.9600569605827332, + "learning_rate": 0.0007048596956108167, + "loss": 3.5919, + "step": 34765 + }, + { + "epoch": 2.3624133713819813, + "grad_norm": 1.0517922639846802, + "learning_rate": 0.000704817230601984, + "loss": 3.3018, + "step": 34770 + }, + { + "epoch": 2.362753091452643, + "grad_norm": 1.0122087001800537, + "learning_rate": 0.0007047747655931512, + "loss": 3.5254, + "step": 34775 + }, + { + "epoch": 2.363092811523305, + "grad_norm": 0.8568893671035767, + "learning_rate": 0.0007047323005843186, + "loss": 3.4852, + "step": 34780 + }, + { + "epoch": 2.3634325315939666, + "grad_norm": 0.8918126225471497, + "learning_rate": 0.0007046898355754859, + "loss": 3.5457, + "step": 34785 + }, + { + "epoch": 2.3637722516646282, + "grad_norm": 1.4280935525894165, + "learning_rate": 0.0007046473705666531, + "loss": 3.7244, + "step": 34790 + }, + { + "epoch": 2.3641119717352903, + "grad_norm": 0.8083947896957397, + "learning_rate": 0.0007046049055578204, + "loss": 3.5098, + "step": 34795 + }, + { + "epoch": 2.364451691805952, + "grad_norm": 0.9195873737335205, + "learning_rate": 0.0007045624405489876, + "loss": 3.6845, + "step": 34800 + }, + { + "epoch": 2.3647914118766136, + "grad_norm": 0.7426067590713501, + "learning_rate": 0.0007045199755401549, + "loss": 3.682, + "step": 34805 + }, + { + "epoch": 2.365131131947275, + "grad_norm": 0.9219360947608948, + "learning_rate": 0.0007044775105313222, + "loss": 3.6027, + "step": 34810 + }, + { + "epoch": 2.3654708520179373, + "grad_norm": 1.0338866710662842, + "learning_rate": 0.0007044350455224895, + "loss": 3.36, + "step": 34815 + }, + { + "epoch": 2.365810572088599, + "grad_norm": 0.88029545545578, + "learning_rate": 0.0007043925805136568, + "loss": 3.3783, + "step": 34820 + }, + { + "epoch": 2.3661502921592605, + "grad_norm": 0.6950621008872986, + "learning_rate": 0.0007043501155048241, + "loss": 3.0906, + "step": 34825 + }, + { + "epoch": 2.3664900122299226, + "grad_norm": 0.8697402477264404, + "learning_rate": 0.0007043076504959913, + "loss": 3.198, + "step": 34830 + }, + { + "epoch": 2.3668297323005842, + "grad_norm": 0.7665315270423889, + "learning_rate": 0.0007042651854871585, + "loss": 3.8105, + "step": 34835 + }, + { + "epoch": 2.367169452371246, + "grad_norm": 0.8307465314865112, + "learning_rate": 0.0007042227204783259, + "loss": 3.7697, + "step": 34840 + }, + { + "epoch": 2.367509172441908, + "grad_norm": 1.1582896709442139, + "learning_rate": 0.0007041802554694931, + "loss": 3.5462, + "step": 34845 + }, + { + "epoch": 2.3678488925125696, + "grad_norm": 0.8167523145675659, + "learning_rate": 0.0007041377904606604, + "loss": 3.6801, + "step": 34850 + }, + { + "epoch": 2.368188612583231, + "grad_norm": 1.0425151586532593, + "learning_rate": 0.0007040953254518278, + "loss": 3.5636, + "step": 34855 + }, + { + "epoch": 2.3685283326538933, + "grad_norm": 0.763826847076416, + "learning_rate": 0.000704052860442995, + "loss": 3.4832, + "step": 34860 + }, + { + "epoch": 2.368868052724555, + "grad_norm": 0.8122187852859497, + "learning_rate": 0.0007040103954341622, + "loss": 3.5604, + "step": 34865 + }, + { + "epoch": 2.3692077727952165, + "grad_norm": 0.978056013584137, + "learning_rate": 0.0007039679304253296, + "loss": 3.6345, + "step": 34870 + }, + { + "epoch": 2.3695474928658786, + "grad_norm": 1.0032029151916504, + "learning_rate": 0.0007039254654164968, + "loss": 3.6873, + "step": 34875 + }, + { + "epoch": 2.3698872129365403, + "grad_norm": 0.880288302898407, + "learning_rate": 0.000703883000407664, + "loss": 3.5559, + "step": 34880 + }, + { + "epoch": 2.370226933007202, + "grad_norm": 0.8520089983940125, + "learning_rate": 0.0007038405353988315, + "loss": 3.6358, + "step": 34885 + }, + { + "epoch": 2.370566653077864, + "grad_norm": 0.6748731732368469, + "learning_rate": 0.0007037980703899987, + "loss": 3.5258, + "step": 34890 + }, + { + "epoch": 2.3709063731485256, + "grad_norm": 0.8479119539260864, + "learning_rate": 0.0007037556053811659, + "loss": 3.6613, + "step": 34895 + }, + { + "epoch": 2.3712460932191872, + "grad_norm": 0.7924423813819885, + "learning_rate": 0.0007037131403723332, + "loss": 3.6795, + "step": 34900 + }, + { + "epoch": 2.3715858132898493, + "grad_norm": 0.8993105292320251, + "learning_rate": 0.0007036706753635005, + "loss": 3.5535, + "step": 34905 + }, + { + "epoch": 2.371925533360511, + "grad_norm": 1.1423699855804443, + "learning_rate": 0.0007036282103546677, + "loss": 3.4117, + "step": 34910 + }, + { + "epoch": 2.3722652534311726, + "grad_norm": 0.7061717510223389, + "learning_rate": 0.0007035857453458351, + "loss": 3.7179, + "step": 34915 + }, + { + "epoch": 2.3726049735018346, + "grad_norm": 0.9319825172424316, + "learning_rate": 0.0007035432803370024, + "loss": 3.5922, + "step": 34920 + }, + { + "epoch": 2.3729446935724963, + "grad_norm": 1.1814967393875122, + "learning_rate": 0.0007035008153281696, + "loss": 3.5345, + "step": 34925 + }, + { + "epoch": 2.373284413643158, + "grad_norm": 0.8495219945907593, + "learning_rate": 0.0007034583503193369, + "loss": 3.6461, + "step": 34930 + }, + { + "epoch": 2.37362413371382, + "grad_norm": 1.105298638343811, + "learning_rate": 0.0007034158853105041, + "loss": 3.7567, + "step": 34935 + }, + { + "epoch": 2.3739638537844816, + "grad_norm": 0.9154067039489746, + "learning_rate": 0.0007033734203016714, + "loss": 3.7467, + "step": 34940 + }, + { + "epoch": 2.3743035738551432, + "grad_norm": 0.8111316561698914, + "learning_rate": 0.0007033309552928387, + "loss": 3.4362, + "step": 34945 + }, + { + "epoch": 2.3746432939258053, + "grad_norm": 1.0976632833480835, + "learning_rate": 0.000703288490284006, + "loss": 3.6908, + "step": 34950 + }, + { + "epoch": 2.374983013996467, + "grad_norm": 1.0718754529953003, + "learning_rate": 0.0007032460252751733, + "loss": 3.5517, + "step": 34955 + }, + { + "epoch": 2.3753227340671286, + "grad_norm": 0.8261789083480835, + "learning_rate": 0.0007032035602663406, + "loss": 3.5642, + "step": 34960 + }, + { + "epoch": 2.3756624541377906, + "grad_norm": 1.1281167268753052, + "learning_rate": 0.0007031610952575078, + "loss": 3.6494, + "step": 34965 + }, + { + "epoch": 2.3760021742084523, + "grad_norm": 1.2050065994262695, + "learning_rate": 0.000703118630248675, + "loss": 3.6226, + "step": 34970 + }, + { + "epoch": 2.376341894279114, + "grad_norm": 0.8801029324531555, + "learning_rate": 0.0007030761652398424, + "loss": 3.6293, + "step": 34975 + }, + { + "epoch": 2.376681614349776, + "grad_norm": 1.0094984769821167, + "learning_rate": 0.0007030337002310096, + "loss": 3.4505, + "step": 34980 + }, + { + "epoch": 2.3770213344204376, + "grad_norm": 1.0532797574996948, + "learning_rate": 0.000702991235222177, + "loss": 3.4997, + "step": 34985 + }, + { + "epoch": 2.3773610544910992, + "grad_norm": 1.2695343494415283, + "learning_rate": 0.0007029487702133443, + "loss": 3.7717, + "step": 34990 + }, + { + "epoch": 2.3777007745617613, + "grad_norm": 1.10023033618927, + "learning_rate": 0.0007029063052045115, + "loss": 3.5026, + "step": 34995 + }, + { + "epoch": 2.378040494632423, + "grad_norm": 0.9246497750282288, + "learning_rate": 0.0007028638401956787, + "loss": 3.683, + "step": 35000 + }, + { + "epoch": 2.3783802147030846, + "grad_norm": 0.9732288718223572, + "learning_rate": 0.0007028213751868461, + "loss": 3.3722, + "step": 35005 + }, + { + "epoch": 2.3787199347737467, + "grad_norm": 1.05582857131958, + "learning_rate": 0.0007027789101780133, + "loss": 3.7089, + "step": 35010 + }, + { + "epoch": 2.3790596548444083, + "grad_norm": 1.4667727947235107, + "learning_rate": 0.0007027364451691805, + "loss": 3.5473, + "step": 35015 + }, + { + "epoch": 2.37939937491507, + "grad_norm": 0.7253547310829163, + "learning_rate": 0.000702693980160348, + "loss": 3.6611, + "step": 35020 + }, + { + "epoch": 2.379739094985732, + "grad_norm": 0.8862168192863464, + "learning_rate": 0.0007026515151515152, + "loss": 3.7473, + "step": 35025 + }, + { + "epoch": 2.3800788150563936, + "grad_norm": 1.1392719745635986, + "learning_rate": 0.0007026090501426824, + "loss": 3.6146, + "step": 35030 + }, + { + "epoch": 2.3804185351270553, + "grad_norm": 0.788493812084198, + "learning_rate": 0.0007025665851338497, + "loss": 3.6504, + "step": 35035 + }, + { + "epoch": 2.380758255197717, + "grad_norm": 1.3973299264907837, + "learning_rate": 0.000702524120125017, + "loss": 3.7635, + "step": 35040 + }, + { + "epoch": 2.381097975268379, + "grad_norm": 1.1085562705993652, + "learning_rate": 0.0007024816551161842, + "loss": 3.1884, + "step": 35045 + }, + { + "epoch": 2.3814376953390406, + "grad_norm": 1.4696896076202393, + "learning_rate": 0.0007024391901073515, + "loss": 3.2913, + "step": 35050 + }, + { + "epoch": 2.3817774154097022, + "grad_norm": 1.344221830368042, + "learning_rate": 0.0007023967250985189, + "loss": 3.6442, + "step": 35055 + }, + { + "epoch": 2.3821171354803643, + "grad_norm": 0.8956277966499329, + "learning_rate": 0.0007023542600896861, + "loss": 3.6249, + "step": 35060 + }, + { + "epoch": 2.382456855551026, + "grad_norm": 0.9237677454948425, + "learning_rate": 0.0007023117950808534, + "loss": 3.6628, + "step": 35065 + }, + { + "epoch": 2.3827965756216876, + "grad_norm": 3.460217237472534, + "learning_rate": 0.0007022693300720207, + "loss": 3.5849, + "step": 35070 + }, + { + "epoch": 2.3831362956923496, + "grad_norm": 0.8832293152809143, + "learning_rate": 0.000702226865063188, + "loss": 3.5471, + "step": 35075 + }, + { + "epoch": 2.3834760157630113, + "grad_norm": 1.2343342304229736, + "learning_rate": 0.0007021844000543552, + "loss": 3.6212, + "step": 35080 + }, + { + "epoch": 2.383815735833673, + "grad_norm": 1.5536991357803345, + "learning_rate": 0.0007021419350455224, + "loss": 3.4733, + "step": 35085 + }, + { + "epoch": 2.384155455904335, + "grad_norm": 1.171811580657959, + "learning_rate": 0.0007020994700366899, + "loss": 3.5158, + "step": 35090 + }, + { + "epoch": 2.3844951759749966, + "grad_norm": 1.133973479270935, + "learning_rate": 0.0007020570050278571, + "loss": 3.9084, + "step": 35095 + }, + { + "epoch": 2.3848348960456582, + "grad_norm": 0.8076163530349731, + "learning_rate": 0.0007020145400190243, + "loss": 3.7902, + "step": 35100 + }, + { + "epoch": 2.3851746161163203, + "grad_norm": 0.7335665225982666, + "learning_rate": 0.0007019720750101917, + "loss": 3.8307, + "step": 35105 + }, + { + "epoch": 2.385514336186982, + "grad_norm": 0.8724912405014038, + "learning_rate": 0.0007019296100013589, + "loss": 3.7273, + "step": 35110 + }, + { + "epoch": 2.3858540562576436, + "grad_norm": 0.8020744919776917, + "learning_rate": 0.0007018871449925261, + "loss": 3.5197, + "step": 35115 + }, + { + "epoch": 2.3861937763283056, + "grad_norm": 1.1193618774414062, + "learning_rate": 0.0007018446799836935, + "loss": 3.8667, + "step": 35120 + }, + { + "epoch": 2.3865334963989673, + "grad_norm": 0.8588840961456299, + "learning_rate": 0.0007018022149748608, + "loss": 3.6827, + "step": 35125 + }, + { + "epoch": 2.386873216469629, + "grad_norm": 0.7788686752319336, + "learning_rate": 0.000701759749966028, + "loss": 3.8916, + "step": 35130 + }, + { + "epoch": 2.387212936540291, + "grad_norm": 0.8639914989471436, + "learning_rate": 0.0007017172849571953, + "loss": 3.5747, + "step": 35135 + }, + { + "epoch": 2.3875526566109526, + "grad_norm": 0.9603084325790405, + "learning_rate": 0.0007016748199483626, + "loss": 3.5154, + "step": 35140 + }, + { + "epoch": 2.3878923766816142, + "grad_norm": 0.9029190540313721, + "learning_rate": 0.0007016323549395298, + "loss": 3.6658, + "step": 35145 + }, + { + "epoch": 2.388232096752276, + "grad_norm": 1.1375178098678589, + "learning_rate": 0.0007015898899306971, + "loss": 3.5777, + "step": 35150 + }, + { + "epoch": 2.388571816822938, + "grad_norm": 0.8551430702209473, + "learning_rate": 0.0007015474249218644, + "loss": 3.6196, + "step": 35155 + }, + { + "epoch": 2.3889115368935996, + "grad_norm": 0.8448473215103149, + "learning_rate": 0.0007015049599130317, + "loss": 3.7175, + "step": 35160 + }, + { + "epoch": 2.389251256964261, + "grad_norm": 1.190048336982727, + "learning_rate": 0.000701462494904199, + "loss": 3.6854, + "step": 35165 + }, + { + "epoch": 2.3895909770349233, + "grad_norm": 0.9631114602088928, + "learning_rate": 0.0007014200298953663, + "loss": 3.4438, + "step": 35170 + }, + { + "epoch": 2.389930697105585, + "grad_norm": 0.9913832545280457, + "learning_rate": 0.0007013775648865335, + "loss": 3.6913, + "step": 35175 + }, + { + "epoch": 2.3902704171762466, + "grad_norm": 0.9786026477813721, + "learning_rate": 0.0007013350998777008, + "loss": 3.4809, + "step": 35180 + }, + { + "epoch": 2.3906101372469086, + "grad_norm": 0.9938039779663086, + "learning_rate": 0.000701292634868868, + "loss": 3.5344, + "step": 35185 + }, + { + "epoch": 2.3909498573175703, + "grad_norm": 0.8469758629798889, + "learning_rate": 0.0007012501698600353, + "loss": 3.5391, + "step": 35190 + }, + { + "epoch": 2.391289577388232, + "grad_norm": 0.9308152794837952, + "learning_rate": 0.0007012077048512027, + "loss": 3.7646, + "step": 35195 + }, + { + "epoch": 2.391629297458894, + "grad_norm": 0.8395126461982727, + "learning_rate": 0.0007011652398423699, + "loss": 3.839, + "step": 35200 + }, + { + "epoch": 2.3919690175295556, + "grad_norm": 1.2482866048812866, + "learning_rate": 0.0007011227748335372, + "loss": 3.145, + "step": 35205 + }, + { + "epoch": 2.3923087376002172, + "grad_norm": 0.8741042017936707, + "learning_rate": 0.0007010803098247045, + "loss": 3.9435, + "step": 35210 + }, + { + "epoch": 2.3926484576708793, + "grad_norm": 0.9608095288276672, + "learning_rate": 0.0007010378448158717, + "loss": 3.63, + "step": 35215 + }, + { + "epoch": 2.392988177741541, + "grad_norm": 0.7334005832672119, + "learning_rate": 0.000700995379807039, + "loss": 3.6451, + "step": 35220 + }, + { + "epoch": 2.3933278978122026, + "grad_norm": 0.8751153945922852, + "learning_rate": 0.0007009529147982063, + "loss": 3.8635, + "step": 35225 + }, + { + "epoch": 2.3936676178828646, + "grad_norm": 0.937850832939148, + "learning_rate": 0.0007009104497893736, + "loss": 3.6973, + "step": 35230 + }, + { + "epoch": 2.3940073379535263, + "grad_norm": 0.9855071902275085, + "learning_rate": 0.0007008764777823073, + "loss": 3.4866, + "step": 35235 + }, + { + "epoch": 2.394347058024188, + "grad_norm": 0.9708119034767151, + "learning_rate": 0.0007008340127734747, + "loss": 3.7122, + "step": 35240 + }, + { + "epoch": 2.39468677809485, + "grad_norm": 1.6134990453720093, + "learning_rate": 0.0007007915477646419, + "loss": 3.4925, + "step": 35245 + }, + { + "epoch": 2.3950264981655116, + "grad_norm": 1.1199809312820435, + "learning_rate": 0.0007007490827558092, + "loss": 3.4464, + "step": 35250 + }, + { + "epoch": 2.3953662182361732, + "grad_norm": 0.9637807607650757, + "learning_rate": 0.0007007066177469766, + "loss": 3.4463, + "step": 35255 + }, + { + "epoch": 2.3957059383068353, + "grad_norm": 1.1352903842926025, + "learning_rate": 0.0007006641527381438, + "loss": 3.5844, + "step": 35260 + }, + { + "epoch": 2.396045658377497, + "grad_norm": 0.736727237701416, + "learning_rate": 0.000700621687729311, + "loss": 3.7799, + "step": 35265 + }, + { + "epoch": 2.3963853784481586, + "grad_norm": 0.7962369918823242, + "learning_rate": 0.0007005792227204783, + "loss": 3.6709, + "step": 35270 + }, + { + "epoch": 2.3967250985188207, + "grad_norm": 0.7592287659645081, + "learning_rate": 0.0007005367577116456, + "loss": 3.5299, + "step": 35275 + }, + { + "epoch": 2.3970648185894823, + "grad_norm": 0.7756562232971191, + "learning_rate": 0.0007004942927028129, + "loss": 3.6342, + "step": 35280 + }, + { + "epoch": 2.397404538660144, + "grad_norm": 0.8063962459564209, + "learning_rate": 0.0007004518276939802, + "loss": 3.634, + "step": 35285 + }, + { + "epoch": 2.397744258730806, + "grad_norm": 0.8176876902580261, + "learning_rate": 0.0007004093626851475, + "loss": 3.6258, + "step": 35290 + }, + { + "epoch": 2.3980839788014676, + "grad_norm": 0.7971237897872925, + "learning_rate": 0.0007003668976763148, + "loss": 3.5861, + "step": 35295 + }, + { + "epoch": 2.3984236988721293, + "grad_norm": 1.0038982629776, + "learning_rate": 0.000700324432667482, + "loss": 3.5932, + "step": 35300 + }, + { + "epoch": 2.3987634189427913, + "grad_norm": 0.7765002250671387, + "learning_rate": 0.0007002819676586493, + "loss": 3.7525, + "step": 35305 + }, + { + "epoch": 2.399103139013453, + "grad_norm": 1.4492930173873901, + "learning_rate": 0.0007002395026498166, + "loss": 3.3802, + "step": 35310 + }, + { + "epoch": 2.3994428590841146, + "grad_norm": 1.0408176183700562, + "learning_rate": 0.0007001970376409838, + "loss": 3.4127, + "step": 35315 + }, + { + "epoch": 2.3997825791547767, + "grad_norm": 0.885699450969696, + "learning_rate": 0.0007001545726321511, + "loss": 3.5479, + "step": 35320 + }, + { + "epoch": 2.4001222992254383, + "grad_norm": 0.8642269372940063, + "learning_rate": 0.0007001121076233185, + "loss": 3.6624, + "step": 35325 + }, + { + "epoch": 2.4004620192961, + "grad_norm": 0.7476972341537476, + "learning_rate": 0.0007000696426144857, + "loss": 3.4715, + "step": 35330 + }, + { + "epoch": 2.400801739366762, + "grad_norm": 0.724482536315918, + "learning_rate": 0.0007000271776056529, + "loss": 3.823, + "step": 35335 + }, + { + "epoch": 2.4011414594374236, + "grad_norm": 0.7471984624862671, + "learning_rate": 0.0006999847125968203, + "loss": 3.5114, + "step": 35340 + }, + { + "epoch": 2.4014811795080853, + "grad_norm": 0.959846019744873, + "learning_rate": 0.0006999422475879875, + "loss": 3.5775, + "step": 35345 + }, + { + "epoch": 2.4018208995787473, + "grad_norm": 1.1816840171813965, + "learning_rate": 0.0006998997825791547, + "loss": 3.4142, + "step": 35350 + }, + { + "epoch": 2.402160619649409, + "grad_norm": 0.959404468536377, + "learning_rate": 0.0006998573175703222, + "loss": 3.5178, + "step": 35355 + }, + { + "epoch": 2.4025003397200706, + "grad_norm": 1.169690489768982, + "learning_rate": 0.0006998148525614894, + "loss": 3.4929, + "step": 35360 + }, + { + "epoch": 2.4028400597907327, + "grad_norm": 1.4098806381225586, + "learning_rate": 0.0006997723875526566, + "loss": 3.4622, + "step": 35365 + }, + { + "epoch": 2.4031797798613943, + "grad_norm": 0.8342859745025635, + "learning_rate": 0.000699729922543824, + "loss": 3.6961, + "step": 35370 + }, + { + "epoch": 2.403519499932056, + "grad_norm": 0.7027131915092468, + "learning_rate": 0.0006996874575349912, + "loss": 3.4402, + "step": 35375 + }, + { + "epoch": 2.4038592200027176, + "grad_norm": 0.8740379214286804, + "learning_rate": 0.0006996449925261584, + "loss": 3.4751, + "step": 35380 + }, + { + "epoch": 2.4041989400733796, + "grad_norm": 0.8814476728439331, + "learning_rate": 0.0006996025275173257, + "loss": 3.6264, + "step": 35385 + }, + { + "epoch": 2.4045386601440413, + "grad_norm": 1.416764497756958, + "learning_rate": 0.0006995600625084931, + "loss": 3.5465, + "step": 35390 + }, + { + "epoch": 2.404878380214703, + "grad_norm": 0.8965014219284058, + "learning_rate": 0.0006995175974996603, + "loss": 3.5468, + "step": 35395 + }, + { + "epoch": 2.405218100285365, + "grad_norm": 0.868372917175293, + "learning_rate": 0.0006994751324908276, + "loss": 3.5784, + "step": 35400 + }, + { + "epoch": 2.4055578203560266, + "grad_norm": 0.8812150359153748, + "learning_rate": 0.0006994326674819949, + "loss": 3.7694, + "step": 35405 + }, + { + "epoch": 2.4058975404266882, + "grad_norm": 0.9405295848846436, + "learning_rate": 0.0006993902024731621, + "loss": 3.4298, + "step": 35410 + }, + { + "epoch": 2.4062372604973503, + "grad_norm": 0.9191285371780396, + "learning_rate": 0.0006993477374643294, + "loss": 3.4293, + "step": 35415 + }, + { + "epoch": 2.406576980568012, + "grad_norm": 0.7453224062919617, + "learning_rate": 0.0006993052724554966, + "loss": 3.5107, + "step": 35420 + }, + { + "epoch": 2.4069167006386736, + "grad_norm": 1.0398153066635132, + "learning_rate": 0.000699262807446664, + "loss": 3.6778, + "step": 35425 + }, + { + "epoch": 2.4072564207093357, + "grad_norm": 0.8062455058097839, + "learning_rate": 0.0006992203424378313, + "loss": 3.612, + "step": 35430 + }, + { + "epoch": 2.4075961407799973, + "grad_norm": 1.099783182144165, + "learning_rate": 0.0006991778774289985, + "loss": 3.4834, + "step": 35435 + }, + { + "epoch": 2.407935860850659, + "grad_norm": 0.8316912055015564, + "learning_rate": 0.0006991354124201658, + "loss": 3.4701, + "step": 35440 + }, + { + "epoch": 2.408275580921321, + "grad_norm": 0.8799378275871277, + "learning_rate": 0.0006990929474113331, + "loss": 3.7481, + "step": 35445 + }, + { + "epoch": 2.4086153009919826, + "grad_norm": 0.9601541757583618, + "learning_rate": 0.0006990504824025003, + "loss": 3.8419, + "step": 35450 + }, + { + "epoch": 2.4089550210626443, + "grad_norm": 0.8363065123558044, + "learning_rate": 0.0006990080173936675, + "loss": 3.59, + "step": 35455 + }, + { + "epoch": 2.4092947411333063, + "grad_norm": 0.8493088483810425, + "learning_rate": 0.000698965552384835, + "loss": 3.7403, + "step": 35460 + }, + { + "epoch": 2.409634461203968, + "grad_norm": 1.284616231918335, + "learning_rate": 0.0006989230873760022, + "loss": 3.3989, + "step": 35465 + }, + { + "epoch": 2.4099741812746296, + "grad_norm": 0.7125193476676941, + "learning_rate": 0.0006988806223671694, + "loss": 3.6609, + "step": 35470 + }, + { + "epoch": 2.4103139013452917, + "grad_norm": 1.1371896266937256, + "learning_rate": 0.0006988381573583368, + "loss": 3.5321, + "step": 35475 + }, + { + "epoch": 2.4106536214159533, + "grad_norm": 0.6787104606628418, + "learning_rate": 0.000698795692349504, + "loss": 3.571, + "step": 35480 + }, + { + "epoch": 2.410993341486615, + "grad_norm": 0.9028304219245911, + "learning_rate": 0.0006987532273406712, + "loss": 3.6041, + "step": 35485 + }, + { + "epoch": 2.4113330615572766, + "grad_norm": 0.7824472188949585, + "learning_rate": 0.0006987107623318386, + "loss": 3.7747, + "step": 35490 + }, + { + "epoch": 2.4116727816279386, + "grad_norm": 0.8855538368225098, + "learning_rate": 0.0006986682973230059, + "loss": 3.7433, + "step": 35495 + }, + { + "epoch": 2.4120125016986003, + "grad_norm": 0.8891363739967346, + "learning_rate": 0.0006986258323141731, + "loss": 3.7886, + "step": 35500 + }, + { + "epoch": 2.412352221769262, + "grad_norm": 0.9093465805053711, + "learning_rate": 0.0006985833673053405, + "loss": 3.3002, + "step": 35505 + }, + { + "epoch": 2.412691941839924, + "grad_norm": 0.868728756904602, + "learning_rate": 0.0006985409022965077, + "loss": 3.5741, + "step": 35510 + }, + { + "epoch": 2.4130316619105856, + "grad_norm": 0.7317980527877808, + "learning_rate": 0.0006984984372876749, + "loss": 3.459, + "step": 35515 + }, + { + "epoch": 2.4133713819812472, + "grad_norm": 1.0144476890563965, + "learning_rate": 0.0006984559722788422, + "loss": 3.7741, + "step": 35520 + }, + { + "epoch": 2.4137111020519093, + "grad_norm": 1.3003703355789185, + "learning_rate": 0.0006984135072700095, + "loss": 3.555, + "step": 35525 + }, + { + "epoch": 2.414050822122571, + "grad_norm": 0.987356960773468, + "learning_rate": 0.0006983710422611768, + "loss": 3.7804, + "step": 35530 + }, + { + "epoch": 2.4143905421932326, + "grad_norm": 0.711345374584198, + "learning_rate": 0.0006983285772523441, + "loss": 3.705, + "step": 35535 + }, + { + "epoch": 2.4147302622638946, + "grad_norm": 0.7928053140640259, + "learning_rate": 0.0006982861122435114, + "loss": 3.6003, + "step": 35540 + }, + { + "epoch": 2.4150699823345563, + "grad_norm": 0.9631736874580383, + "learning_rate": 0.0006982436472346786, + "loss": 3.4567, + "step": 35545 + }, + { + "epoch": 2.415409702405218, + "grad_norm": 1.1730315685272217, + "learning_rate": 0.0006982011822258459, + "loss": 3.3731, + "step": 35550 + }, + { + "epoch": 2.41574942247588, + "grad_norm": 0.8448672294616699, + "learning_rate": 0.0006981587172170131, + "loss": 3.7189, + "step": 35555 + }, + { + "epoch": 2.4160891425465416, + "grad_norm": 0.7993003129959106, + "learning_rate": 0.0006981162522081804, + "loss": 3.7353, + "step": 35560 + }, + { + "epoch": 2.4164288626172032, + "grad_norm": 0.7422598600387573, + "learning_rate": 0.0006980737871993478, + "loss": 3.4696, + "step": 35565 + }, + { + "epoch": 2.4167685826878653, + "grad_norm": 0.8767399787902832, + "learning_rate": 0.000698031322190515, + "loss": 3.4682, + "step": 35570 + }, + { + "epoch": 2.417108302758527, + "grad_norm": 0.6790643334388733, + "learning_rate": 0.0006979888571816823, + "loss": 3.4675, + "step": 35575 + }, + { + "epoch": 2.4174480228291886, + "grad_norm": 0.9774796962738037, + "learning_rate": 0.0006979463921728496, + "loss": 3.589, + "step": 35580 + }, + { + "epoch": 2.4177877428998507, + "grad_norm": 0.6799168586730957, + "learning_rate": 0.0006979039271640168, + "loss": 3.6494, + "step": 35585 + }, + { + "epoch": 2.4181274629705123, + "grad_norm": 0.8500269651412964, + "learning_rate": 0.0006978614621551841, + "loss": 3.5867, + "step": 35590 + }, + { + "epoch": 2.418467183041174, + "grad_norm": 0.9883834719657898, + "learning_rate": 0.0006978189971463514, + "loss": 3.7238, + "step": 35595 + }, + { + "epoch": 2.418806903111836, + "grad_norm": 0.9603604674339294, + "learning_rate": 0.0006977765321375187, + "loss": 3.6944, + "step": 35600 + }, + { + "epoch": 2.4191466231824976, + "grad_norm": 1.1441569328308105, + "learning_rate": 0.000697734067128686, + "loss": 3.6029, + "step": 35605 + }, + { + "epoch": 2.4194863432531593, + "grad_norm": 0.8562232255935669, + "learning_rate": 0.0006976916021198533, + "loss": 3.4916, + "step": 35610 + }, + { + "epoch": 2.4198260633238213, + "grad_norm": 0.9752050638198853, + "learning_rate": 0.0006976491371110205, + "loss": 3.7085, + "step": 35615 + }, + { + "epoch": 2.420165783394483, + "grad_norm": 0.777980387210846, + "learning_rate": 0.0006976066721021878, + "loss": 3.6642, + "step": 35620 + }, + { + "epoch": 2.4205055034651446, + "grad_norm": 0.8653627634048462, + "learning_rate": 0.0006975642070933551, + "loss": 3.4172, + "step": 35625 + }, + { + "epoch": 2.4208452235358067, + "grad_norm": 0.9437899589538574, + "learning_rate": 0.0006975217420845223, + "loss": 3.5679, + "step": 35630 + }, + { + "epoch": 2.4211849436064683, + "grad_norm": 0.8172034025192261, + "learning_rate": 0.0006974792770756897, + "loss": 3.7364, + "step": 35635 + }, + { + "epoch": 2.42152466367713, + "grad_norm": 0.8128688931465149, + "learning_rate": 0.000697436812066857, + "loss": 3.4311, + "step": 35640 + }, + { + "epoch": 2.421864383747792, + "grad_norm": 0.7182714939117432, + "learning_rate": 0.0006973943470580242, + "loss": 3.4913, + "step": 35645 + }, + { + "epoch": 2.4222041038184536, + "grad_norm": 5.65151834487915, + "learning_rate": 0.0006973518820491915, + "loss": 3.4335, + "step": 35650 + }, + { + "epoch": 2.4225438238891153, + "grad_norm": 0.77165687084198, + "learning_rate": 0.0006973094170403588, + "loss": 3.3813, + "step": 35655 + }, + { + "epoch": 2.4228835439597773, + "grad_norm": 0.8458949327468872, + "learning_rate": 0.000697266952031526, + "loss": 3.7783, + "step": 35660 + }, + { + "epoch": 2.423223264030439, + "grad_norm": 0.7224632501602173, + "learning_rate": 0.0006972244870226934, + "loss": 3.5887, + "step": 35665 + }, + { + "epoch": 2.4235629841011006, + "grad_norm": 1.097083330154419, + "learning_rate": 0.0006971820220138606, + "loss": 3.703, + "step": 35670 + }, + { + "epoch": 2.4239027041717627, + "grad_norm": 1.1522047519683838, + "learning_rate": 0.0006971395570050279, + "loss": 3.4504, + "step": 35675 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.9651363492012024, + "learning_rate": 0.0006970970919961952, + "loss": 3.5777, + "step": 35680 + }, + { + "epoch": 2.424582144313086, + "grad_norm": 0.8302415609359741, + "learning_rate": 0.0006970546269873624, + "loss": 3.5118, + "step": 35685 + }, + { + "epoch": 2.424921864383748, + "grad_norm": 0.9945346117019653, + "learning_rate": 0.0006970121619785297, + "loss": 3.55, + "step": 35690 + }, + { + "epoch": 2.4252615844544096, + "grad_norm": 1.0301053524017334, + "learning_rate": 0.000696969696969697, + "loss": 3.4502, + "step": 35695 + }, + { + "epoch": 2.4256013045250713, + "grad_norm": 1.0134460926055908, + "learning_rate": 0.0006969272319608643, + "loss": 3.5893, + "step": 35700 + }, + { + "epoch": 2.4259410245957334, + "grad_norm": 1.1556336879730225, + "learning_rate": 0.0006968847669520316, + "loss": 3.0979, + "step": 35705 + }, + { + "epoch": 2.426280744666395, + "grad_norm": 0.8718451857566833, + "learning_rate": 0.0006968423019431989, + "loss": 3.8023, + "step": 35710 + }, + { + "epoch": 2.4266204647370566, + "grad_norm": 0.7166701555252075, + "learning_rate": 0.0006967998369343661, + "loss": 3.3888, + "step": 35715 + }, + { + "epoch": 2.4269601848077182, + "grad_norm": 0.9308009743690491, + "learning_rate": 0.0006967573719255333, + "loss": 3.6325, + "step": 35720 + }, + { + "epoch": 2.4272999048783803, + "grad_norm": 0.7750697135925293, + "learning_rate": 0.0006967149069167007, + "loss": 3.3205, + "step": 35725 + }, + { + "epoch": 2.427639624949042, + "grad_norm": 1.080763578414917, + "learning_rate": 0.0006966724419078679, + "loss": 3.7539, + "step": 35730 + }, + { + "epoch": 2.4279793450197036, + "grad_norm": 0.8223336935043335, + "learning_rate": 0.0006966299768990352, + "loss": 3.563, + "step": 35735 + }, + { + "epoch": 2.4283190650903657, + "grad_norm": 0.770321249961853, + "learning_rate": 0.0006965875118902026, + "loss": 3.5717, + "step": 35740 + }, + { + "epoch": 2.4286587851610273, + "grad_norm": 0.8646693229675293, + "learning_rate": 0.0006965450468813698, + "loss": 3.3031, + "step": 35745 + }, + { + "epoch": 2.428998505231689, + "grad_norm": 0.841491162776947, + "learning_rate": 0.000696502581872537, + "loss": 3.6227, + "step": 35750 + }, + { + "epoch": 2.429338225302351, + "grad_norm": 0.9248559474945068, + "learning_rate": 0.0006964601168637044, + "loss": 3.4763, + "step": 35755 + }, + { + "epoch": 2.4296779453730126, + "grad_norm": 0.8135377168655396, + "learning_rate": 0.0006964176518548716, + "loss": 3.8418, + "step": 35760 + }, + { + "epoch": 2.4300176654436743, + "grad_norm": 1.1698077917099, + "learning_rate": 0.0006963751868460388, + "loss": 3.6804, + "step": 35765 + }, + { + "epoch": 2.4303573855143363, + "grad_norm": 0.7975615859031677, + "learning_rate": 0.0006963327218372062, + "loss": 3.6329, + "step": 35770 + }, + { + "epoch": 2.430697105584998, + "grad_norm": 0.8998247385025024, + "learning_rate": 0.0006962902568283735, + "loss": 3.663, + "step": 35775 + }, + { + "epoch": 2.4310368256556596, + "grad_norm": 0.6345213651657104, + "learning_rate": 0.0006962477918195407, + "loss": 3.163, + "step": 35780 + }, + { + "epoch": 2.4313765457263217, + "grad_norm": 2.007495641708374, + "learning_rate": 0.000696205326810708, + "loss": 3.6229, + "step": 35785 + }, + { + "epoch": 2.4317162657969833, + "grad_norm": 3.11572265625, + "learning_rate": 0.0006961628618018753, + "loss": 3.4769, + "step": 35790 + }, + { + "epoch": 2.432055985867645, + "grad_norm": 0.9948982000350952, + "learning_rate": 0.0006961203967930425, + "loss": 3.6872, + "step": 35795 + }, + { + "epoch": 2.432395705938307, + "grad_norm": 0.7978007793426514, + "learning_rate": 0.0006960779317842098, + "loss": 3.5747, + "step": 35800 + }, + { + "epoch": 2.4327354260089686, + "grad_norm": 1.130210518836975, + "learning_rate": 0.0006960354667753772, + "loss": 3.5305, + "step": 35805 + }, + { + "epoch": 2.4330751460796303, + "grad_norm": 0.8419439792633057, + "learning_rate": 0.0006959930017665444, + "loss": 3.7025, + "step": 35810 + }, + { + "epoch": 2.4334148661502923, + "grad_norm": 0.9107771515846252, + "learning_rate": 0.0006959505367577117, + "loss": 3.48, + "step": 35815 + }, + { + "epoch": 2.433754586220954, + "grad_norm": 0.6942035555839539, + "learning_rate": 0.0006959080717488789, + "loss": 3.7891, + "step": 35820 + }, + { + "epoch": 2.4340943062916156, + "grad_norm": 0.8731754422187805, + "learning_rate": 0.0006958656067400462, + "loss": 3.6014, + "step": 35825 + }, + { + "epoch": 2.4344340263622772, + "grad_norm": 1.5606956481933594, + "learning_rate": 0.0006958231417312135, + "loss": 3.5635, + "step": 35830 + }, + { + "epoch": 2.4347737464329393, + "grad_norm": 0.7956401109695435, + "learning_rate": 0.0006957806767223807, + "loss": 3.3851, + "step": 35835 + }, + { + "epoch": 2.435113466503601, + "grad_norm": 0.8098528385162354, + "learning_rate": 0.0006957382117135481, + "loss": 3.6362, + "step": 35840 + }, + { + "epoch": 2.4354531865742626, + "grad_norm": 1.0825409889221191, + "learning_rate": 0.0006956957467047154, + "loss": 3.441, + "step": 35845 + }, + { + "epoch": 2.4357929066449246, + "grad_norm": 0.965732216835022, + "learning_rate": 0.0006956532816958826, + "loss": 3.4583, + "step": 35850 + }, + { + "epoch": 2.4361326267155863, + "grad_norm": 0.8481732606887817, + "learning_rate": 0.0006956108166870498, + "loss": 3.6177, + "step": 35855 + }, + { + "epoch": 2.436472346786248, + "grad_norm": 0.9869868755340576, + "learning_rate": 0.0006955683516782172, + "loss": 3.7016, + "step": 35860 + }, + { + "epoch": 2.43681206685691, + "grad_norm": 0.8660546541213989, + "learning_rate": 0.0006955258866693844, + "loss": 3.4794, + "step": 35865 + }, + { + "epoch": 2.4371517869275716, + "grad_norm": 0.78850919008255, + "learning_rate": 0.0006954834216605516, + "loss": 3.5041, + "step": 35870 + }, + { + "epoch": 2.4374915069982332, + "grad_norm": 0.8549779057502747, + "learning_rate": 0.0006954409566517191, + "loss": 3.6467, + "step": 35875 + }, + { + "epoch": 2.4378312270688953, + "grad_norm": 0.8265355825424194, + "learning_rate": 0.0006953984916428863, + "loss": 3.5945, + "step": 35880 + }, + { + "epoch": 2.438170947139557, + "grad_norm": 0.9662145376205444, + "learning_rate": 0.0006953560266340535, + "loss": 3.378, + "step": 35885 + }, + { + "epoch": 2.4385106672102186, + "grad_norm": 1.0322604179382324, + "learning_rate": 0.0006953135616252209, + "loss": 3.5535, + "step": 35890 + }, + { + "epoch": 2.4388503872808807, + "grad_norm": 0.8157899379730225, + "learning_rate": 0.0006952710966163881, + "loss": 3.5975, + "step": 35895 + }, + { + "epoch": 2.4391901073515423, + "grad_norm": 0.9161356091499329, + "learning_rate": 0.0006952286316075553, + "loss": 3.407, + "step": 35900 + }, + { + "epoch": 2.439529827422204, + "grad_norm": 1.3534289598464966, + "learning_rate": 0.0006951861665987226, + "loss": 3.4715, + "step": 35905 + }, + { + "epoch": 2.439869547492866, + "grad_norm": 0.6445372700691223, + "learning_rate": 0.00069514370158989, + "loss": 3.769, + "step": 35910 + }, + { + "epoch": 2.4402092675635276, + "grad_norm": 0.9437969326972961, + "learning_rate": 0.0006951012365810572, + "loss": 3.5357, + "step": 35915 + }, + { + "epoch": 2.4405489876341893, + "grad_norm": 1.0591100454330444, + "learning_rate": 0.0006950587715722245, + "loss": 3.4413, + "step": 35920 + }, + { + "epoch": 2.4408887077048513, + "grad_norm": 0.7262230515480042, + "learning_rate": 0.0006950163065633918, + "loss": 3.3508, + "step": 35925 + }, + { + "epoch": 2.441228427775513, + "grad_norm": 0.8793342709541321, + "learning_rate": 0.000694973841554559, + "loss": 3.7693, + "step": 35930 + }, + { + "epoch": 2.4415681478461746, + "grad_norm": 0.9017274975776672, + "learning_rate": 0.0006949313765457263, + "loss": 3.8387, + "step": 35935 + }, + { + "epoch": 2.4419078679168367, + "grad_norm": 1.0516079664230347, + "learning_rate": 0.0006948889115368936, + "loss": 3.5078, + "step": 35940 + }, + { + "epoch": 2.4422475879874983, + "grad_norm": 0.9665093421936035, + "learning_rate": 0.0006948464465280609, + "loss": 3.6012, + "step": 35945 + }, + { + "epoch": 2.44258730805816, + "grad_norm": 0.8372550010681152, + "learning_rate": 0.0006948039815192282, + "loss": 3.4872, + "step": 35950 + }, + { + "epoch": 2.442927028128822, + "grad_norm": 0.6722925305366516, + "learning_rate": 0.0006947615165103954, + "loss": 3.7833, + "step": 35955 + }, + { + "epoch": 2.4432667481994836, + "grad_norm": 0.6843031048774719, + "learning_rate": 0.0006947190515015628, + "loss": 3.3463, + "step": 35960 + }, + { + "epoch": 2.4436064682701453, + "grad_norm": 0.7506526112556458, + "learning_rate": 0.00069467658649273, + "loss": 3.5921, + "step": 35965 + }, + { + "epoch": 2.4439461883408073, + "grad_norm": 0.926970899105072, + "learning_rate": 0.0006946341214838972, + "loss": 3.4772, + "step": 35970 + }, + { + "epoch": 2.444285908411469, + "grad_norm": 0.7087849974632263, + "learning_rate": 0.0006945916564750646, + "loss": 3.4514, + "step": 35975 + }, + { + "epoch": 2.4446256284821306, + "grad_norm": 0.799283504486084, + "learning_rate": 0.0006945491914662319, + "loss": 3.2978, + "step": 35980 + }, + { + "epoch": 2.4449653485527927, + "grad_norm": 0.8610679507255554, + "learning_rate": 0.0006945067264573991, + "loss": 3.1999, + "step": 35985 + }, + { + "epoch": 2.4453050686234543, + "grad_norm": 0.8540274500846863, + "learning_rate": 0.0006944642614485665, + "loss": 3.6788, + "step": 35990 + }, + { + "epoch": 2.445644788694116, + "grad_norm": 0.9715571999549866, + "learning_rate": 0.0006944217964397337, + "loss": 3.5222, + "step": 35995 + }, + { + "epoch": 2.445984508764778, + "grad_norm": 0.9320533275604248, + "learning_rate": 0.0006943793314309009, + "loss": 3.6145, + "step": 36000 + }, + { + "epoch": 2.4463242288354397, + "grad_norm": 1.001530647277832, + "learning_rate": 0.0006943368664220682, + "loss": 3.6688, + "step": 36005 + }, + { + "epoch": 2.4466639489061013, + "grad_norm": 0.9243918061256409, + "learning_rate": 0.0006942944014132355, + "loss": 3.6368, + "step": 36010 + }, + { + "epoch": 2.4470036689767634, + "grad_norm": 0.8702043294906616, + "learning_rate": 0.0006942519364044028, + "loss": 3.5578, + "step": 36015 + }, + { + "epoch": 2.447343389047425, + "grad_norm": 0.7090713977813721, + "learning_rate": 0.0006942094713955701, + "loss": 3.3266, + "step": 36020 + }, + { + "epoch": 2.4476831091180866, + "grad_norm": 0.8716376423835754, + "learning_rate": 0.0006941670063867374, + "loss": 3.6038, + "step": 36025 + }, + { + "epoch": 2.4480228291887487, + "grad_norm": 1.1072258949279785, + "learning_rate": 0.0006941245413779046, + "loss": 3.4358, + "step": 36030 + }, + { + "epoch": 2.4483625492594103, + "grad_norm": 0.8755373954772949, + "learning_rate": 0.0006940820763690719, + "loss": 3.575, + "step": 36035 + }, + { + "epoch": 2.448702269330072, + "grad_norm": 0.8961668014526367, + "learning_rate": 0.0006940396113602392, + "loss": 3.4212, + "step": 36040 + }, + { + "epoch": 2.449041989400734, + "grad_norm": 0.8181819915771484, + "learning_rate": 0.0006939971463514064, + "loss": 3.7356, + "step": 36045 + }, + { + "epoch": 2.4493817094713957, + "grad_norm": 0.8718600273132324, + "learning_rate": 0.0006939546813425738, + "loss": 3.6407, + "step": 36050 + }, + { + "epoch": 2.4497214295420573, + "grad_norm": 0.9065583944320679, + "learning_rate": 0.000693912216333741, + "loss": 3.5974, + "step": 36055 + }, + { + "epoch": 2.4500611496127194, + "grad_norm": 0.8534652590751648, + "learning_rate": 0.0006938697513249083, + "loss": 3.5275, + "step": 36060 + }, + { + "epoch": 2.450400869683381, + "grad_norm": 1.2117019891738892, + "learning_rate": 0.0006938272863160756, + "loss": 3.5595, + "step": 36065 + }, + { + "epoch": 2.4507405897540426, + "grad_norm": 0.7508332133293152, + "learning_rate": 0.0006937848213072428, + "loss": 3.52, + "step": 36070 + }, + { + "epoch": 2.4510803098247043, + "grad_norm": 0.9331714510917664, + "learning_rate": 0.0006937423562984101, + "loss": 3.5592, + "step": 36075 + }, + { + "epoch": 2.4514200298953663, + "grad_norm": 0.7052761912345886, + "learning_rate": 0.0006936998912895774, + "loss": 3.7205, + "step": 36080 + }, + { + "epoch": 2.451759749966028, + "grad_norm": 0.9195571541786194, + "learning_rate": 0.0006936574262807447, + "loss": 3.7707, + "step": 36085 + }, + { + "epoch": 2.4520994700366896, + "grad_norm": 1.0533415079116821, + "learning_rate": 0.000693614961271912, + "loss": 3.4, + "step": 36090 + }, + { + "epoch": 2.4524391901073517, + "grad_norm": 0.9358044862747192, + "learning_rate": 0.0006935724962630793, + "loss": 3.4754, + "step": 36095 + }, + { + "epoch": 2.4527789101780133, + "grad_norm": 1.0205967426300049, + "learning_rate": 0.0006935300312542465, + "loss": 3.4011, + "step": 36100 + }, + { + "epoch": 2.453118630248675, + "grad_norm": 0.8178235292434692, + "learning_rate": 0.0006934875662454137, + "loss": 3.3674, + "step": 36105 + }, + { + "epoch": 2.453458350319337, + "grad_norm": 0.793389618396759, + "learning_rate": 0.0006934451012365811, + "loss": 3.7273, + "step": 36110 + }, + { + "epoch": 2.4537980703899986, + "grad_norm": 0.7531744837760925, + "learning_rate": 0.0006934026362277483, + "loss": 3.7175, + "step": 36115 + }, + { + "epoch": 2.4541377904606603, + "grad_norm": 0.8483249545097351, + "learning_rate": 0.0006933601712189156, + "loss": 3.6024, + "step": 36120 + }, + { + "epoch": 2.4544775105313223, + "grad_norm": 1.1468547582626343, + "learning_rate": 0.000693317706210083, + "loss": 3.4629, + "step": 36125 + }, + { + "epoch": 2.454817230601984, + "grad_norm": 0.8523963689804077, + "learning_rate": 0.0006932752412012502, + "loss": 3.6104, + "step": 36130 + }, + { + "epoch": 2.4551569506726456, + "grad_norm": 0.8265317678451538, + "learning_rate": 0.0006932327761924174, + "loss": 3.6008, + "step": 36135 + }, + { + "epoch": 2.4554966707433077, + "grad_norm": 0.7784208059310913, + "learning_rate": 0.0006931903111835848, + "loss": 3.2618, + "step": 36140 + }, + { + "epoch": 2.4558363908139693, + "grad_norm": 0.8659098744392395, + "learning_rate": 0.000693147846174752, + "loss": 3.7322, + "step": 36145 + }, + { + "epoch": 2.456176110884631, + "grad_norm": 0.8619340658187866, + "learning_rate": 0.0006931053811659192, + "loss": 3.5768, + "step": 36150 + }, + { + "epoch": 2.456515830955293, + "grad_norm": 0.9929268956184387, + "learning_rate": 0.0006930629161570866, + "loss": 3.7109, + "step": 36155 + }, + { + "epoch": 2.4568555510259547, + "grad_norm": 1.1205105781555176, + "learning_rate": 0.0006930204511482539, + "loss": 3.6103, + "step": 36160 + }, + { + "epoch": 2.4571952710966163, + "grad_norm": 0.851641058921814, + "learning_rate": 0.0006929779861394211, + "loss": 3.8709, + "step": 36165 + }, + { + "epoch": 2.457534991167278, + "grad_norm": 0.8470024466514587, + "learning_rate": 0.0006929355211305884, + "loss": 3.3211, + "step": 36170 + }, + { + "epoch": 2.45787471123794, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0006928930561217557, + "loss": 3.5976, + "step": 36175 + }, + { + "epoch": 2.4582144313086016, + "grad_norm": 0.8078089952468872, + "learning_rate": 0.0006928505911129229, + "loss": 3.55, + "step": 36180 + }, + { + "epoch": 2.4585541513792633, + "grad_norm": 0.9007940888404846, + "learning_rate": 0.0006928081261040902, + "loss": 3.62, + "step": 36185 + }, + { + "epoch": 2.4588938714499253, + "grad_norm": 0.7090835571289062, + "learning_rate": 0.0006927656610952576, + "loss": 3.6302, + "step": 36190 + }, + { + "epoch": 2.459233591520587, + "grad_norm": 0.8088706135749817, + "learning_rate": 0.0006927231960864248, + "loss": 3.7379, + "step": 36195 + }, + { + "epoch": 2.4595733115912486, + "grad_norm": 0.8974642157554626, + "learning_rate": 0.0006926807310775921, + "loss": 3.4396, + "step": 36200 + }, + { + "epoch": 2.4599130316619107, + "grad_norm": 0.928989827632904, + "learning_rate": 0.0006926382660687593, + "loss": 3.911, + "step": 36205 + }, + { + "epoch": 2.4602527517325723, + "grad_norm": 0.7683658003807068, + "learning_rate": 0.0006925958010599266, + "loss": 3.6753, + "step": 36210 + }, + { + "epoch": 2.460592471803234, + "grad_norm": 0.8043687343597412, + "learning_rate": 0.0006925533360510939, + "loss": 3.3672, + "step": 36215 + }, + { + "epoch": 2.460932191873896, + "grad_norm": 1.0197385549545288, + "learning_rate": 0.0006925108710422611, + "loss": 3.8239, + "step": 36220 + }, + { + "epoch": 2.4612719119445576, + "grad_norm": 1.5007439851760864, + "learning_rate": 0.0006924684060334285, + "loss": 3.2693, + "step": 36225 + }, + { + "epoch": 2.4616116320152193, + "grad_norm": 1.0083128213882446, + "learning_rate": 0.0006924259410245958, + "loss": 3.7076, + "step": 36230 + }, + { + "epoch": 2.4619513520858813, + "grad_norm": 0.8509348630905151, + "learning_rate": 0.000692383476015763, + "loss": 3.2946, + "step": 36235 + }, + { + "epoch": 2.462291072156543, + "grad_norm": 1.152888536453247, + "learning_rate": 0.0006923410110069302, + "loss": 3.5203, + "step": 36240 + }, + { + "epoch": 2.4626307922272046, + "grad_norm": 0.9296496510505676, + "learning_rate": 0.0006922985459980976, + "loss": 3.5498, + "step": 36245 + }, + { + "epoch": 2.4629705122978667, + "grad_norm": 0.8694621920585632, + "learning_rate": 0.0006922560809892648, + "loss": 3.455, + "step": 36250 + }, + { + "epoch": 2.4633102323685283, + "grad_norm": 0.9762208461761475, + "learning_rate": 0.000692213615980432, + "loss": 3.3705, + "step": 36255 + }, + { + "epoch": 2.46364995243919, + "grad_norm": 1.2348740100860596, + "learning_rate": 0.0006921711509715995, + "loss": 3.4987, + "step": 36260 + }, + { + "epoch": 2.463989672509852, + "grad_norm": 1.019108533859253, + "learning_rate": 0.0006921286859627667, + "loss": 3.5061, + "step": 36265 + }, + { + "epoch": 2.4643293925805136, + "grad_norm": 0.8774006962776184, + "learning_rate": 0.0006920862209539339, + "loss": 3.7656, + "step": 36270 + }, + { + "epoch": 2.4646691126511753, + "grad_norm": 0.8606116771697998, + "learning_rate": 0.0006920437559451013, + "loss": 3.5469, + "step": 36275 + }, + { + "epoch": 2.4650088327218374, + "grad_norm": 1.05874502658844, + "learning_rate": 0.0006920012909362685, + "loss": 3.5303, + "step": 36280 + }, + { + "epoch": 2.465348552792499, + "grad_norm": 0.7317825555801392, + "learning_rate": 0.0006919588259274357, + "loss": 3.5634, + "step": 36285 + }, + { + "epoch": 2.4656882728631606, + "grad_norm": 1.0505937337875366, + "learning_rate": 0.0006919163609186032, + "loss": 3.6831, + "step": 36290 + }, + { + "epoch": 2.4660279929338227, + "grad_norm": 0.7658190131187439, + "learning_rate": 0.0006918738959097704, + "loss": 3.3016, + "step": 36295 + }, + { + "epoch": 2.4663677130044843, + "grad_norm": 1.0591740608215332, + "learning_rate": 0.0006918314309009377, + "loss": 3.5352, + "step": 36300 + }, + { + "epoch": 2.466707433075146, + "grad_norm": 0.6838187575340271, + "learning_rate": 0.0006917889658921049, + "loss": 3.4979, + "step": 36305 + }, + { + "epoch": 2.467047153145808, + "grad_norm": 0.9364463090896606, + "learning_rate": 0.0006917465008832722, + "loss": 3.6565, + "step": 36310 + }, + { + "epoch": 2.4673868732164697, + "grad_norm": 0.8119786381721497, + "learning_rate": 0.0006917040358744395, + "loss": 3.8444, + "step": 36315 + }, + { + "epoch": 2.4677265932871313, + "grad_norm": 0.6889965534210205, + "learning_rate": 0.0006916615708656067, + "loss": 3.6866, + "step": 36320 + }, + { + "epoch": 2.4680663133577934, + "grad_norm": 1.356595754623413, + "learning_rate": 0.0006916191058567741, + "loss": 3.6495, + "step": 36325 + }, + { + "epoch": 2.468406033428455, + "grad_norm": 2.738727331161499, + "learning_rate": 0.0006915766408479414, + "loss": 3.5053, + "step": 36330 + }, + { + "epoch": 2.4687457534991166, + "grad_norm": 0.9051178693771362, + "learning_rate": 0.0006915341758391086, + "loss": 3.711, + "step": 36335 + }, + { + "epoch": 2.4690854735697787, + "grad_norm": 2.7883641719818115, + "learning_rate": 0.0006914917108302758, + "loss": 3.3308, + "step": 36340 + }, + { + "epoch": 2.4694251936404403, + "grad_norm": 0.9305927753448486, + "learning_rate": 0.0006914492458214432, + "loss": 3.5995, + "step": 36345 + }, + { + "epoch": 2.469764913711102, + "grad_norm": 0.8959746360778809, + "learning_rate": 0.0006914067808126104, + "loss": 3.6368, + "step": 36350 + }, + { + "epoch": 2.470104633781764, + "grad_norm": 0.9596649408340454, + "learning_rate": 0.0006913643158037776, + "loss": 3.3104, + "step": 36355 + }, + { + "epoch": 2.4704443538524257, + "grad_norm": 0.8386121392250061, + "learning_rate": 0.0006913218507949451, + "loss": 3.4398, + "step": 36360 + }, + { + "epoch": 2.4707840739230873, + "grad_norm": 0.8898554444313049, + "learning_rate": 0.0006912793857861123, + "loss": 3.7822, + "step": 36365 + }, + { + "epoch": 2.4711237939937494, + "grad_norm": 0.7847386002540588, + "learning_rate": 0.0006912369207772795, + "loss": 3.5561, + "step": 36370 + }, + { + "epoch": 2.471463514064411, + "grad_norm": 0.9888058304786682, + "learning_rate": 0.0006911944557684469, + "loss": 3.5128, + "step": 36375 + }, + { + "epoch": 2.4718032341350726, + "grad_norm": 1.1068761348724365, + "learning_rate": 0.0006911519907596141, + "loss": 3.6938, + "step": 36380 + }, + { + "epoch": 2.4721429542057347, + "grad_norm": 0.897333562374115, + "learning_rate": 0.0006911095257507813, + "loss": 3.6783, + "step": 36385 + }, + { + "epoch": 2.4724826742763963, + "grad_norm": 0.9274793267250061, + "learning_rate": 0.0006910670607419487, + "loss": 3.4718, + "step": 36390 + }, + { + "epoch": 2.472822394347058, + "grad_norm": 0.850313663482666, + "learning_rate": 0.000691024595733116, + "loss": 3.6137, + "step": 36395 + }, + { + "epoch": 2.47316211441772, + "grad_norm": 0.7803919315338135, + "learning_rate": 0.0006909821307242832, + "loss": 3.3953, + "step": 36400 + }, + { + "epoch": 2.4735018344883817, + "grad_norm": 0.8353518843650818, + "learning_rate": 0.0006909396657154505, + "loss": 3.663, + "step": 36405 + }, + { + "epoch": 2.4738415545590433, + "grad_norm": 0.8238515853881836, + "learning_rate": 0.0006908972007066178, + "loss": 3.6589, + "step": 36410 + }, + { + "epoch": 2.474181274629705, + "grad_norm": 1.271271824836731, + "learning_rate": 0.000690854735697785, + "loss": 3.5707, + "step": 36415 + }, + { + "epoch": 2.474520994700367, + "grad_norm": 0.9054452180862427, + "learning_rate": 0.0006908122706889523, + "loss": 3.5778, + "step": 36420 + }, + { + "epoch": 2.4748607147710286, + "grad_norm": 0.8107162714004517, + "learning_rate": 0.0006907698056801196, + "loss": 3.4246, + "step": 36425 + }, + { + "epoch": 2.4752004348416903, + "grad_norm": 0.7463089227676392, + "learning_rate": 0.0006907273406712869, + "loss": 3.5852, + "step": 36430 + }, + { + "epoch": 2.4755401549123524, + "grad_norm": 0.8042941093444824, + "learning_rate": 0.0006906848756624542, + "loss": 3.6647, + "step": 36435 + }, + { + "epoch": 2.475879874983014, + "grad_norm": 2.168487071990967, + "learning_rate": 0.0006906424106536215, + "loss": 3.5075, + "step": 36440 + }, + { + "epoch": 2.4762195950536756, + "grad_norm": 0.8778647184371948, + "learning_rate": 0.0006905999456447887, + "loss": 3.7345, + "step": 36445 + }, + { + "epoch": 2.4765593151243377, + "grad_norm": 0.6512066721916199, + "learning_rate": 0.000690557480635956, + "loss": 3.6604, + "step": 36450 + }, + { + "epoch": 2.4768990351949993, + "grad_norm": 0.8887601494789124, + "learning_rate": 0.0006905150156271232, + "loss": 3.7197, + "step": 36455 + }, + { + "epoch": 2.477238755265661, + "grad_norm": 1.078426718711853, + "learning_rate": 0.0006904725506182905, + "loss": 3.563, + "step": 36460 + }, + { + "epoch": 2.477578475336323, + "grad_norm": 0.8834910988807678, + "learning_rate": 0.0006904300856094579, + "loss": 3.4437, + "step": 36465 + }, + { + "epoch": 2.4779181954069847, + "grad_norm": 0.7722088098526001, + "learning_rate": 0.0006903876206006251, + "loss": 3.5113, + "step": 36470 + }, + { + "epoch": 2.4782579154776463, + "grad_norm": 0.8016606569290161, + "learning_rate": 0.0006903451555917924, + "loss": 3.6682, + "step": 36475 + }, + { + "epoch": 2.4785976355483084, + "grad_norm": 0.890228271484375, + "learning_rate": 0.0006903026905829597, + "loss": 3.4543, + "step": 36480 + }, + { + "epoch": 2.47893735561897, + "grad_norm": 1.004185676574707, + "learning_rate": 0.0006902602255741269, + "loss": 3.4278, + "step": 36485 + }, + { + "epoch": 2.4792770756896316, + "grad_norm": 0.9682187438011169, + "learning_rate": 0.0006902177605652941, + "loss": 3.3861, + "step": 36490 + }, + { + "epoch": 2.4796167957602937, + "grad_norm": 0.7608022689819336, + "learning_rate": 0.0006901752955564615, + "loss": 3.6717, + "step": 36495 + }, + { + "epoch": 2.4799565158309553, + "grad_norm": 0.8768916130065918, + "learning_rate": 0.0006901328305476288, + "loss": 3.323, + "step": 36500 + }, + { + "epoch": 2.480296235901617, + "grad_norm": 0.7489612102508545, + "learning_rate": 0.000690090365538796, + "loss": 3.665, + "step": 36505 + }, + { + "epoch": 2.4806359559722786, + "grad_norm": 0.7496927976608276, + "learning_rate": 0.0006900479005299634, + "loss": 3.5371, + "step": 36510 + }, + { + "epoch": 2.4809756760429407, + "grad_norm": 0.8451232314109802, + "learning_rate": 0.0006900054355211306, + "loss": 3.4045, + "step": 36515 + }, + { + "epoch": 2.4813153961136023, + "grad_norm": 0.8394334316253662, + "learning_rate": 0.0006899629705122978, + "loss": 3.4399, + "step": 36520 + }, + { + "epoch": 2.481655116184264, + "grad_norm": 0.9226208329200745, + "learning_rate": 0.0006899205055034652, + "loss": 3.6355, + "step": 36525 + }, + { + "epoch": 2.481994836254926, + "grad_norm": 0.9382376074790955, + "learning_rate": 0.0006898780404946324, + "loss": 3.5944, + "step": 36530 + }, + { + "epoch": 2.4823345563255876, + "grad_norm": 0.8020952343940735, + "learning_rate": 0.0006898355754857997, + "loss": 3.6497, + "step": 36535 + }, + { + "epoch": 2.4826742763962493, + "grad_norm": 0.8517431616783142, + "learning_rate": 0.000689793110476967, + "loss": 3.2938, + "step": 36540 + }, + { + "epoch": 2.4830139964669113, + "grad_norm": 0.9466975331306458, + "learning_rate": 0.0006897506454681343, + "loss": 3.5709, + "step": 36545 + }, + { + "epoch": 2.483353716537573, + "grad_norm": 1.0749220848083496, + "learning_rate": 0.0006897081804593015, + "loss": 3.4687, + "step": 36550 + }, + { + "epoch": 2.4836934366082346, + "grad_norm": 0.8653540015220642, + "learning_rate": 0.0006896657154504688, + "loss": 3.457, + "step": 36555 + }, + { + "epoch": 2.4840331566788967, + "grad_norm": 0.9551055431365967, + "learning_rate": 0.0006896232504416361, + "loss": 3.3744, + "step": 36560 + }, + { + "epoch": 2.4843728767495583, + "grad_norm": 1.0422379970550537, + "learning_rate": 0.0006895807854328033, + "loss": 3.3811, + "step": 36565 + }, + { + "epoch": 2.48471259682022, + "grad_norm": 0.8918790817260742, + "learning_rate": 0.0006895383204239707, + "loss": 3.3721, + "step": 36570 + }, + { + "epoch": 2.485052316890882, + "grad_norm": 0.8593407273292542, + "learning_rate": 0.000689495855415138, + "loss": 3.77, + "step": 36575 + }, + { + "epoch": 2.4853920369615436, + "grad_norm": 1.1619842052459717, + "learning_rate": 0.0006894533904063052, + "loss": 3.8087, + "step": 36580 + }, + { + "epoch": 2.4857317570322053, + "grad_norm": 0.6964577436447144, + "learning_rate": 0.0006894109253974725, + "loss": 3.5053, + "step": 36585 + }, + { + "epoch": 2.4860714771028674, + "grad_norm": 0.8874382972717285, + "learning_rate": 0.0006893684603886397, + "loss": 3.5999, + "step": 36590 + }, + { + "epoch": 2.486411197173529, + "grad_norm": 1.2115167379379272, + "learning_rate": 0.000689325995379807, + "loss": 3.3022, + "step": 36595 + }, + { + "epoch": 2.4867509172441906, + "grad_norm": 0.9663459658622742, + "learning_rate": 0.0006892835303709743, + "loss": 3.6568, + "step": 36600 + }, + { + "epoch": 2.4870906373148527, + "grad_norm": 0.8903859853744507, + "learning_rate": 0.0006892410653621416, + "loss": 3.3939, + "step": 36605 + }, + { + "epoch": 2.4874303573855143, + "grad_norm": 0.9800843596458435, + "learning_rate": 0.0006891986003533089, + "loss": 3.7151, + "step": 36610 + }, + { + "epoch": 2.487770077456176, + "grad_norm": 0.8528366088867188, + "learning_rate": 0.0006891561353444762, + "loss": 3.7048, + "step": 36615 + }, + { + "epoch": 2.488109797526838, + "grad_norm": 0.9572628736495972, + "learning_rate": 0.0006891136703356434, + "loss": 3.6146, + "step": 36620 + }, + { + "epoch": 2.4884495175974997, + "grad_norm": 0.7797242403030396, + "learning_rate": 0.0006890712053268107, + "loss": 3.6536, + "step": 36625 + }, + { + "epoch": 2.4887892376681613, + "grad_norm": 1.2049996852874756, + "learning_rate": 0.000689028740317978, + "loss": 3.3906, + "step": 36630 + }, + { + "epoch": 2.4891289577388234, + "grad_norm": 0.7921689748764038, + "learning_rate": 0.0006889862753091452, + "loss": 3.651, + "step": 36635 + }, + { + "epoch": 2.489468677809485, + "grad_norm": 0.9526494145393372, + "learning_rate": 0.0006889438103003127, + "loss": 3.5305, + "step": 36640 + }, + { + "epoch": 2.4898083978801466, + "grad_norm": 1.121082067489624, + "learning_rate": 0.0006889013452914799, + "loss": 3.6047, + "step": 36645 + }, + { + "epoch": 2.4901481179508087, + "grad_norm": 0.8554778099060059, + "learning_rate": 0.0006888588802826471, + "loss": 3.5345, + "step": 36650 + }, + { + "epoch": 2.4904878380214703, + "grad_norm": 0.8699836134910583, + "learning_rate": 0.0006888164152738144, + "loss": 3.5773, + "step": 36655 + }, + { + "epoch": 2.490827558092132, + "grad_norm": 1.0910875797271729, + "learning_rate": 0.0006887739502649817, + "loss": 3.2798, + "step": 36660 + }, + { + "epoch": 2.491167278162794, + "grad_norm": 0.9309991598129272, + "learning_rate": 0.0006887314852561489, + "loss": 3.2957, + "step": 36665 + }, + { + "epoch": 2.4915069982334557, + "grad_norm": 1.1437005996704102, + "learning_rate": 0.0006886890202473162, + "loss": 3.4286, + "step": 36670 + }, + { + "epoch": 2.4918467183041173, + "grad_norm": 0.8672728538513184, + "learning_rate": 0.0006886465552384836, + "loss": 3.3956, + "step": 36675 + }, + { + "epoch": 2.4921864383747794, + "grad_norm": 0.9684737920761108, + "learning_rate": 0.0006886040902296508, + "loss": 3.4356, + "step": 36680 + }, + { + "epoch": 2.492526158445441, + "grad_norm": 0.9539965987205505, + "learning_rate": 0.0006885616252208181, + "loss": 3.4104, + "step": 36685 + }, + { + "epoch": 2.4928658785161026, + "grad_norm": 0.9110315442085266, + "learning_rate": 0.0006885191602119853, + "loss": 3.3002, + "step": 36690 + }, + { + "epoch": 2.4932055985867647, + "grad_norm": 0.9345609545707703, + "learning_rate": 0.0006884766952031526, + "loss": 3.6109, + "step": 36695 + }, + { + "epoch": 2.4935453186574263, + "grad_norm": 0.7751627564430237, + "learning_rate": 0.0006884342301943199, + "loss": 3.5554, + "step": 36700 + }, + { + "epoch": 2.493885038728088, + "grad_norm": 1.1158490180969238, + "learning_rate": 0.0006883917651854871, + "loss": 3.3419, + "step": 36705 + }, + { + "epoch": 2.49422475879875, + "grad_norm": 0.8605878353118896, + "learning_rate": 0.0006883493001766545, + "loss": 3.6156, + "step": 36710 + }, + { + "epoch": 2.4945644788694117, + "grad_norm": 0.7541114687919617, + "learning_rate": 0.0006883068351678218, + "loss": 3.7837, + "step": 36715 + }, + { + "epoch": 2.4949041989400733, + "grad_norm": 0.8255082368850708, + "learning_rate": 0.000688264370158989, + "loss": 3.3768, + "step": 36720 + }, + { + "epoch": 2.4952439190107354, + "grad_norm": 1.0605679750442505, + "learning_rate": 0.0006882219051501563, + "loss": 3.2329, + "step": 36725 + }, + { + "epoch": 2.495583639081397, + "grad_norm": 0.7460446357727051, + "learning_rate": 0.0006881794401413236, + "loss": 3.5645, + "step": 36730 + }, + { + "epoch": 2.4959233591520587, + "grad_norm": 1.074440360069275, + "learning_rate": 0.0006881369751324908, + "loss": 3.8486, + "step": 36735 + }, + { + "epoch": 2.4962630792227207, + "grad_norm": 0.9776721000671387, + "learning_rate": 0.000688094510123658, + "loss": 3.4483, + "step": 36740 + }, + { + "epoch": 2.4966027992933824, + "grad_norm": 0.9570642709732056, + "learning_rate": 0.0006880520451148255, + "loss": 3.4072, + "step": 36745 + }, + { + "epoch": 2.496942519364044, + "grad_norm": 0.765233039855957, + "learning_rate": 0.0006880095801059927, + "loss": 3.6545, + "step": 36750 + }, + { + "epoch": 2.4972822394347056, + "grad_norm": 0.9984706044197083, + "learning_rate": 0.0006879671150971599, + "loss": 3.3405, + "step": 36755 + }, + { + "epoch": 2.4976219595053677, + "grad_norm": 0.7731297016143799, + "learning_rate": 0.0006879246500883273, + "loss": 3.4121, + "step": 36760 + }, + { + "epoch": 2.4979616795760293, + "grad_norm": 0.8107051253318787, + "learning_rate": 0.0006878821850794945, + "loss": 3.4762, + "step": 36765 + }, + { + "epoch": 2.498301399646691, + "grad_norm": 0.7076941728591919, + "learning_rate": 0.0006878397200706617, + "loss": 3.522, + "step": 36770 + }, + { + "epoch": 2.498641119717353, + "grad_norm": 0.7531155943870544, + "learning_rate": 0.0006877972550618292, + "loss": 3.34, + "step": 36775 + }, + { + "epoch": 2.4989808397880147, + "grad_norm": 0.7311004400253296, + "learning_rate": 0.0006877547900529964, + "loss": 3.3803, + "step": 36780 + }, + { + "epoch": 2.4993205598586763, + "grad_norm": 1.0113418102264404, + "learning_rate": 0.0006877123250441636, + "loss": 3.5328, + "step": 36785 + }, + { + "epoch": 2.4996602799293384, + "grad_norm": 0.9074886441230774, + "learning_rate": 0.000687669860035331, + "loss": 3.4605, + "step": 36790 + }, + { + "epoch": 2.5, + "grad_norm": 0.820617139339447, + "learning_rate": 0.0006876273950264982, + "loss": 3.4666, + "step": 36795 + }, + { + "epoch": 2.5003397200706616, + "grad_norm": 0.9698944687843323, + "learning_rate": 0.0006875849300176654, + "loss": 3.6592, + "step": 36800 + }, + { + "epoch": 2.5006794401413237, + "grad_norm": 0.8221749663352966, + "learning_rate": 0.0006875424650088327, + "loss": 3.6905, + "step": 36805 + }, + { + "epoch": 2.5010191602119853, + "grad_norm": 0.8296582698822021, + "learning_rate": 0.0006875, + "loss": 3.6858, + "step": 36810 + }, + { + "epoch": 2.501358880282647, + "grad_norm": 0.9153863787651062, + "learning_rate": 0.0006874575349911673, + "loss": 3.3998, + "step": 36815 + }, + { + "epoch": 2.5016986003533086, + "grad_norm": 0.8744755387306213, + "learning_rate": 0.0006874150699823346, + "loss": 3.6855, + "step": 36820 + }, + { + "epoch": 2.5020383204239707, + "grad_norm": 0.7745585441589355, + "learning_rate": 0.0006873726049735019, + "loss": 3.5907, + "step": 36825 + }, + { + "epoch": 2.5023780404946323, + "grad_norm": 1.2356382608413696, + "learning_rate": 0.0006873301399646691, + "loss": 3.6038, + "step": 36830 + }, + { + "epoch": 2.502717760565294, + "grad_norm": 0.9567061066627502, + "learning_rate": 0.0006872876749558364, + "loss": 3.5362, + "step": 36835 + }, + { + "epoch": 2.503057480635956, + "grad_norm": 0.7758637070655823, + "learning_rate": 0.0006872452099470036, + "loss": 3.6424, + "step": 36840 + }, + { + "epoch": 2.5033972007066176, + "grad_norm": 1.3368792533874512, + "learning_rate": 0.0006872027449381709, + "loss": 3.6191, + "step": 36845 + }, + { + "epoch": 2.5037369207772793, + "grad_norm": 0.8259404897689819, + "learning_rate": 0.0006871602799293383, + "loss": 3.5669, + "step": 36850 + }, + { + "epoch": 2.5040766408479413, + "grad_norm": 1.0088380575180054, + "learning_rate": 0.0006871178149205055, + "loss": 3.4049, + "step": 36855 + }, + { + "epoch": 2.504416360918603, + "grad_norm": 0.9279365539550781, + "learning_rate": 0.0006870753499116728, + "loss": 3.7682, + "step": 36860 + }, + { + "epoch": 2.5047560809892646, + "grad_norm": 1.0760164260864258, + "learning_rate": 0.0006870328849028401, + "loss": 3.6811, + "step": 36865 + }, + { + "epoch": 2.5050958010599267, + "grad_norm": 0.6958915591239929, + "learning_rate": 0.0006869904198940073, + "loss": 3.2431, + "step": 36870 + }, + { + "epoch": 2.5054355211305883, + "grad_norm": 1.087584137916565, + "learning_rate": 0.0006869479548851745, + "loss": 3.358, + "step": 36875 + }, + { + "epoch": 2.50577524120125, + "grad_norm": 0.7616779208183289, + "learning_rate": 0.000686905489876342, + "loss": 3.49, + "step": 36880 + }, + { + "epoch": 2.506114961271912, + "grad_norm": 0.9172993898391724, + "learning_rate": 0.0006868630248675092, + "loss": 3.5478, + "step": 36885 + }, + { + "epoch": 2.5064546813425737, + "grad_norm": 1.075850248336792, + "learning_rate": 0.0006868205598586764, + "loss": 3.5067, + "step": 36890 + }, + { + "epoch": 2.5067944014132353, + "grad_norm": 0.8993471264839172, + "learning_rate": 0.0006867780948498438, + "loss": 3.356, + "step": 36895 + }, + { + "epoch": 2.5071341214838974, + "grad_norm": 1.0426256656646729, + "learning_rate": 0.000686735629841011, + "loss": 3.4172, + "step": 36900 + }, + { + "epoch": 2.507473841554559, + "grad_norm": 0.7860424518585205, + "learning_rate": 0.0006866931648321782, + "loss": 3.6106, + "step": 36905 + }, + { + "epoch": 2.5078135616252206, + "grad_norm": 0.6684382557868958, + "learning_rate": 0.0006866506998233456, + "loss": 3.4696, + "step": 36910 + }, + { + "epoch": 2.5081532816958827, + "grad_norm": 1.2995660305023193, + "learning_rate": 0.0006866082348145129, + "loss": 3.5771, + "step": 36915 + }, + { + "epoch": 2.5084930017665443, + "grad_norm": 0.7742058634757996, + "learning_rate": 0.0006865657698056801, + "loss": 3.3759, + "step": 36920 + }, + { + "epoch": 2.508832721837206, + "grad_norm": 1.0966546535491943, + "learning_rate": 0.0006865233047968475, + "loss": 3.737, + "step": 36925 + }, + { + "epoch": 2.509172441907868, + "grad_norm": 0.7188527584075928, + "learning_rate": 0.0006864808397880147, + "loss": 3.3631, + "step": 36930 + }, + { + "epoch": 2.5095121619785297, + "grad_norm": 1.013911247253418, + "learning_rate": 0.0006864383747791819, + "loss": 3.7398, + "step": 36935 + }, + { + "epoch": 2.5098518820491913, + "grad_norm": 0.7743934988975525, + "learning_rate": 0.0006863959097703492, + "loss": 3.663, + "step": 36940 + }, + { + "epoch": 2.5101916021198534, + "grad_norm": 1.1279109716415405, + "learning_rate": 0.0006863534447615165, + "loss": 3.6537, + "step": 36945 + }, + { + "epoch": 2.510531322190515, + "grad_norm": 1.2841403484344482, + "learning_rate": 0.0006863109797526838, + "loss": 3.4682, + "step": 36950 + }, + { + "epoch": 2.5108710422611766, + "grad_norm": 1.0314315557479858, + "learning_rate": 0.0006862685147438511, + "loss": 3.6298, + "step": 36955 + }, + { + "epoch": 2.5112107623318387, + "grad_norm": 0.600997805595398, + "learning_rate": 0.0006862260497350184, + "loss": 3.569, + "step": 36960 + }, + { + "epoch": 2.5115504824025003, + "grad_norm": 7.136981010437012, + "learning_rate": 0.0006861835847261856, + "loss": 3.4209, + "step": 36965 + }, + { + "epoch": 2.511890202473162, + "grad_norm": 0.8038784265518188, + "learning_rate": 0.0006861411197173529, + "loss": 3.5176, + "step": 36970 + }, + { + "epoch": 2.512229922543824, + "grad_norm": 0.7292317152023315, + "learning_rate": 0.0006860986547085201, + "loss": 3.7643, + "step": 36975 + }, + { + "epoch": 2.5125696426144857, + "grad_norm": 0.8943995237350464, + "learning_rate": 0.0006860561896996874, + "loss": 3.4348, + "step": 36980 + }, + { + "epoch": 2.5129093626851473, + "grad_norm": 0.908740758895874, + "learning_rate": 0.0006860137246908548, + "loss": 3.5842, + "step": 36985 + }, + { + "epoch": 2.5132490827558094, + "grad_norm": 0.9770544767379761, + "learning_rate": 0.000685971259682022, + "loss": 3.8594, + "step": 36990 + }, + { + "epoch": 2.513588802826471, + "grad_norm": 0.835317850112915, + "learning_rate": 0.0006859287946731894, + "loss": 3.7924, + "step": 36995 + }, + { + "epoch": 2.5139285228971326, + "grad_norm": 0.8581152558326721, + "learning_rate": 0.0006858863296643566, + "loss": 3.5349, + "step": 37000 + }, + { + "epoch": 2.5142682429677947, + "grad_norm": 0.849543571472168, + "learning_rate": 0.0006858438646555238, + "loss": 3.8963, + "step": 37005 + }, + { + "epoch": 2.5146079630384564, + "grad_norm": 0.7243213057518005, + "learning_rate": 0.0006858013996466912, + "loss": 3.3722, + "step": 37010 + }, + { + "epoch": 2.514947683109118, + "grad_norm": 1.15110182762146, + "learning_rate": 0.0006857589346378584, + "loss": 3.5354, + "step": 37015 + }, + { + "epoch": 2.51528740317978, + "grad_norm": 0.8062073588371277, + "learning_rate": 0.0006857164696290257, + "loss": 3.4464, + "step": 37020 + }, + { + "epoch": 2.5156271232504417, + "grad_norm": 0.8497734665870667, + "learning_rate": 0.0006856740046201931, + "loss": 3.4162, + "step": 37025 + }, + { + "epoch": 2.5159668433211033, + "grad_norm": 0.7582796216011047, + "learning_rate": 0.0006856315396113603, + "loss": 3.423, + "step": 37030 + }, + { + "epoch": 2.5163065633917654, + "grad_norm": 0.918688178062439, + "learning_rate": 0.0006855890746025275, + "loss": 3.4464, + "step": 37035 + }, + { + "epoch": 2.516646283462427, + "grad_norm": 0.7305085062980652, + "learning_rate": 0.0006855466095936948, + "loss": 3.5884, + "step": 37040 + }, + { + "epoch": 2.5169860035330887, + "grad_norm": 0.9733332991600037, + "learning_rate": 0.0006855041445848621, + "loss": 3.4729, + "step": 37045 + }, + { + "epoch": 2.5173257236037507, + "grad_norm": 1.1838867664337158, + "learning_rate": 0.0006854616795760293, + "loss": 3.7023, + "step": 37050 + }, + { + "epoch": 2.5176654436744124, + "grad_norm": 1.3748000860214233, + "learning_rate": 0.0006854192145671967, + "loss": 3.6618, + "step": 37055 + }, + { + "epoch": 2.518005163745074, + "grad_norm": 0.9256331920623779, + "learning_rate": 0.000685376749558364, + "loss": 3.3511, + "step": 37060 + }, + { + "epoch": 2.518344883815736, + "grad_norm": 0.7636584639549255, + "learning_rate": 0.0006853342845495312, + "loss": 3.687, + "step": 37065 + }, + { + "epoch": 2.5186846038863977, + "grad_norm": 0.9494330286979675, + "learning_rate": 0.0006852918195406985, + "loss": 3.5325, + "step": 37070 + }, + { + "epoch": 2.5190243239570593, + "grad_norm": 0.7560548186302185, + "learning_rate": 0.0006852493545318657, + "loss": 3.2654, + "step": 37075 + }, + { + "epoch": 2.5193640440277214, + "grad_norm": 1.0353922843933105, + "learning_rate": 0.000685206889523033, + "loss": 3.6647, + "step": 37080 + }, + { + "epoch": 2.519703764098383, + "grad_norm": 0.8432351350784302, + "learning_rate": 0.0006851644245142003, + "loss": 3.4574, + "step": 37085 + }, + { + "epoch": 2.5200434841690447, + "grad_norm": 0.7303746938705444, + "learning_rate": 0.0006851219595053676, + "loss": 3.6205, + "step": 37090 + }, + { + "epoch": 2.5203832042397067, + "grad_norm": 1.065165400505066, + "learning_rate": 0.0006850794944965349, + "loss": 3.7331, + "step": 37095 + }, + { + "epoch": 2.5207229243103684, + "grad_norm": 1.11162269115448, + "learning_rate": 0.0006850370294877022, + "loss": 3.4751, + "step": 37100 + }, + { + "epoch": 2.52106264438103, + "grad_norm": 0.8021150231361389, + "learning_rate": 0.0006849945644788694, + "loss": 3.3077, + "step": 37105 + }, + { + "epoch": 2.521402364451692, + "grad_norm": 0.8272626399993896, + "learning_rate": 0.0006849520994700367, + "loss": 3.4509, + "step": 37110 + }, + { + "epoch": 2.5217420845223537, + "grad_norm": 1.1256625652313232, + "learning_rate": 0.000684909634461204, + "loss": 3.5248, + "step": 37115 + }, + { + "epoch": 2.5220818045930153, + "grad_norm": 0.865553081035614, + "learning_rate": 0.0006848671694523712, + "loss": 3.7432, + "step": 37120 + }, + { + "epoch": 2.522421524663677, + "grad_norm": 0.8886268734931946, + "learning_rate": 0.0006848247044435386, + "loss": 3.6128, + "step": 37125 + }, + { + "epoch": 2.522761244734339, + "grad_norm": 0.8951315879821777, + "learning_rate": 0.0006847822394347059, + "loss": 3.558, + "step": 37130 + }, + { + "epoch": 2.5231009648050007, + "grad_norm": 0.8882858753204346, + "learning_rate": 0.0006847397744258731, + "loss": 3.7483, + "step": 37135 + }, + { + "epoch": 2.5234406848756623, + "grad_norm": 0.549642026424408, + "learning_rate": 0.0006846973094170403, + "loss": 3.6336, + "step": 37140 + }, + { + "epoch": 2.5237804049463244, + "grad_norm": 0.7569166421890259, + "learning_rate": 0.0006846548444082077, + "loss": 3.4539, + "step": 37145 + }, + { + "epoch": 2.524120125016986, + "grad_norm": 0.8699723482131958, + "learning_rate": 0.0006846123793993749, + "loss": 3.7348, + "step": 37150 + }, + { + "epoch": 2.5244598450876476, + "grad_norm": 0.9525786638259888, + "learning_rate": 0.0006845699143905421, + "loss": 3.562, + "step": 37155 + }, + { + "epoch": 2.5247995651583093, + "grad_norm": 0.8277950882911682, + "learning_rate": 0.0006845274493817096, + "loss": 3.4501, + "step": 37160 + }, + { + "epoch": 2.5251392852289714, + "grad_norm": 0.8423066139221191, + "learning_rate": 0.0006844849843728768, + "loss": 3.6761, + "step": 37165 + }, + { + "epoch": 2.525479005299633, + "grad_norm": 0.8786348104476929, + "learning_rate": 0.000684442519364044, + "loss": 3.4425, + "step": 37170 + }, + { + "epoch": 2.5258187253702946, + "grad_norm": 0.9565808176994324, + "learning_rate": 0.0006844000543552114, + "loss": 3.3921, + "step": 37175 + }, + { + "epoch": 2.5261584454409567, + "grad_norm": 1.1286540031433105, + "learning_rate": 0.0006843575893463786, + "loss": 3.5298, + "step": 37180 + }, + { + "epoch": 2.5264981655116183, + "grad_norm": 0.6773091554641724, + "learning_rate": 0.0006843151243375458, + "loss": 3.5015, + "step": 37185 + }, + { + "epoch": 2.52683788558228, + "grad_norm": 0.8455127477645874, + "learning_rate": 0.0006842726593287131, + "loss": 3.483, + "step": 37190 + }, + { + "epoch": 2.527177605652942, + "grad_norm": 0.910053014755249, + "learning_rate": 0.0006842301943198805, + "loss": 3.7112, + "step": 37195 + }, + { + "epoch": 2.5275173257236037, + "grad_norm": 1.0729894638061523, + "learning_rate": 0.0006841877293110477, + "loss": 3.3681, + "step": 37200 + }, + { + "epoch": 2.5278570457942653, + "grad_norm": 0.7220337390899658, + "learning_rate": 0.000684145264302215, + "loss": 3.6619, + "step": 37205 + }, + { + "epoch": 2.5281967658649274, + "grad_norm": 1.0115914344787598, + "learning_rate": 0.0006841027992933823, + "loss": 3.5399, + "step": 37210 + }, + { + "epoch": 2.528536485935589, + "grad_norm": 0.9949489235877991, + "learning_rate": 0.0006840603342845495, + "loss": 3.3914, + "step": 37215 + }, + { + "epoch": 2.5288762060062506, + "grad_norm": 0.6734651327133179, + "learning_rate": 0.0006840178692757168, + "loss": 3.6754, + "step": 37220 + }, + { + "epoch": 2.5292159260769127, + "grad_norm": 0.8436553478240967, + "learning_rate": 0.000683975404266884, + "loss": 3.4915, + "step": 37225 + }, + { + "epoch": 2.5295556461475743, + "grad_norm": 1.075241208076477, + "learning_rate": 0.0006839329392580514, + "loss": 3.4255, + "step": 37230 + }, + { + "epoch": 2.529895366218236, + "grad_norm": 1.1965173482894897, + "learning_rate": 0.0006838904742492187, + "loss": 3.4317, + "step": 37235 + }, + { + "epoch": 2.530235086288898, + "grad_norm": 1.0072189569473267, + "learning_rate": 0.0006838480092403859, + "loss": 3.6467, + "step": 37240 + }, + { + "epoch": 2.5305748063595597, + "grad_norm": 1.1499112844467163, + "learning_rate": 0.0006838055442315532, + "loss": 3.3778, + "step": 37245 + }, + { + "epoch": 2.5309145264302213, + "grad_norm": 0.7219229340553284, + "learning_rate": 0.0006837630792227205, + "loss": 3.5401, + "step": 37250 + }, + { + "epoch": 2.5312542465008834, + "grad_norm": 0.9035902619361877, + "learning_rate": 0.0006837206142138877, + "loss": 3.7015, + "step": 37255 + }, + { + "epoch": 2.531593966571545, + "grad_norm": 0.7898024320602417, + "learning_rate": 0.000683678149205055, + "loss": 3.6291, + "step": 37260 + }, + { + "epoch": 2.5319336866422066, + "grad_norm": 0.7728981971740723, + "learning_rate": 0.0006836356841962224, + "loss": 3.43, + "step": 37265 + }, + { + "epoch": 2.5322734067128687, + "grad_norm": 0.7902145385742188, + "learning_rate": 0.0006835932191873896, + "loss": 3.4508, + "step": 37270 + }, + { + "epoch": 2.5326131267835303, + "grad_norm": 0.77162766456604, + "learning_rate": 0.0006835507541785568, + "loss": 3.5197, + "step": 37275 + }, + { + "epoch": 2.532952846854192, + "grad_norm": 0.9623584747314453, + "learning_rate": 0.0006835082891697242, + "loss": 3.5349, + "step": 37280 + }, + { + "epoch": 2.533292566924854, + "grad_norm": 1.0805450677871704, + "learning_rate": 0.0006834658241608914, + "loss": 3.4743, + "step": 37285 + }, + { + "epoch": 2.5336322869955157, + "grad_norm": 1.154498815536499, + "learning_rate": 0.0006834233591520586, + "loss": 3.6301, + "step": 37290 + }, + { + "epoch": 2.5339720070661773, + "grad_norm": 0.9269973039627075, + "learning_rate": 0.000683380894143226, + "loss": 3.4243, + "step": 37295 + }, + { + "epoch": 2.5343117271368394, + "grad_norm": 1.304805874824524, + "learning_rate": 0.0006833384291343933, + "loss": 3.4145, + "step": 37300 + }, + { + "epoch": 2.534651447207501, + "grad_norm": 0.8571718335151672, + "learning_rate": 0.0006832959641255605, + "loss": 3.6585, + "step": 37305 + }, + { + "epoch": 2.5349911672781626, + "grad_norm": 0.7250730395317078, + "learning_rate": 0.0006832534991167279, + "loss": 3.4041, + "step": 37310 + }, + { + "epoch": 2.5353308873488247, + "grad_norm": 1.344346523284912, + "learning_rate": 0.0006832110341078951, + "loss": 3.2588, + "step": 37315 + }, + { + "epoch": 2.5356706074194864, + "grad_norm": 0.8169065117835999, + "learning_rate": 0.0006831685690990623, + "loss": 3.5679, + "step": 37320 + }, + { + "epoch": 2.536010327490148, + "grad_norm": 0.844508945941925, + "learning_rate": 0.0006831261040902296, + "loss": 3.485, + "step": 37325 + }, + { + "epoch": 2.53635004756081, + "grad_norm": 0.8913936614990234, + "learning_rate": 0.0006830836390813969, + "loss": 3.6509, + "step": 37330 + }, + { + "epoch": 2.5366897676314717, + "grad_norm": 0.7966809868812561, + "learning_rate": 0.0006830411740725643, + "loss": 3.5358, + "step": 37335 + }, + { + "epoch": 2.5370294877021333, + "grad_norm": 0.6682244539260864, + "learning_rate": 0.0006829987090637315, + "loss": 3.6159, + "step": 37340 + }, + { + "epoch": 2.5373692077727954, + "grad_norm": 0.9351009726524353, + "learning_rate": 0.0006829562440548988, + "loss": 3.8113, + "step": 37345 + }, + { + "epoch": 2.537708927843457, + "grad_norm": 0.9606015086174011, + "learning_rate": 0.0006829137790460661, + "loss": 3.3732, + "step": 37350 + }, + { + "epoch": 2.5380486479141187, + "grad_norm": 0.7294321060180664, + "learning_rate": 0.0006828713140372333, + "loss": 3.6999, + "step": 37355 + }, + { + "epoch": 2.5383883679847807, + "grad_norm": 0.9745467901229858, + "learning_rate": 0.0006828288490284006, + "loss": 3.6308, + "step": 37360 + }, + { + "epoch": 2.5387280880554424, + "grad_norm": 0.7990125417709351, + "learning_rate": 0.000682786384019568, + "loss": 3.3903, + "step": 37365 + }, + { + "epoch": 2.539067808126104, + "grad_norm": 1.296937108039856, + "learning_rate": 0.0006827439190107352, + "loss": 3.5714, + "step": 37370 + }, + { + "epoch": 2.539407528196766, + "grad_norm": 0.8125663995742798, + "learning_rate": 0.0006827014540019024, + "loss": 3.612, + "step": 37375 + }, + { + "epoch": 2.5397472482674277, + "grad_norm": 0.7332519292831421, + "learning_rate": 0.0006826589889930698, + "loss": 3.5132, + "step": 37380 + }, + { + "epoch": 2.5400869683380893, + "grad_norm": 0.8264506459236145, + "learning_rate": 0.000682616523984237, + "loss": 3.6276, + "step": 37385 + }, + { + "epoch": 2.5404266884087514, + "grad_norm": 0.9526277780532837, + "learning_rate": 0.0006825740589754042, + "loss": 3.6801, + "step": 37390 + }, + { + "epoch": 2.540766408479413, + "grad_norm": 0.7269062995910645, + "learning_rate": 0.0006825315939665716, + "loss": 3.6453, + "step": 37395 + }, + { + "epoch": 2.5411061285500747, + "grad_norm": 0.7843999862670898, + "learning_rate": 0.0006824891289577389, + "loss": 3.7978, + "step": 37400 + }, + { + "epoch": 2.5414458486207367, + "grad_norm": 1.2992713451385498, + "learning_rate": 0.0006824466639489061, + "loss": 3.2435, + "step": 37405 + }, + { + "epoch": 2.5417855686913984, + "grad_norm": 0.8157537579536438, + "learning_rate": 0.0006824041989400735, + "loss": 3.5747, + "step": 37410 + }, + { + "epoch": 2.54212528876206, + "grad_norm": 0.8656436204910278, + "learning_rate": 0.0006823617339312407, + "loss": 3.878, + "step": 37415 + }, + { + "epoch": 2.542465008832722, + "grad_norm": 0.9770327806472778, + "learning_rate": 0.0006823192689224079, + "loss": 3.8661, + "step": 37420 + }, + { + "epoch": 2.5428047289033837, + "grad_norm": 0.8693691492080688, + "learning_rate": 0.0006822768039135752, + "loss": 3.6509, + "step": 37425 + }, + { + "epoch": 2.5431444489740453, + "grad_norm": 0.7430413961410522, + "learning_rate": 0.0006822343389047425, + "loss": 3.6107, + "step": 37430 + }, + { + "epoch": 2.5434841690447074, + "grad_norm": 0.9970824122428894, + "learning_rate": 0.0006821918738959098, + "loss": 3.5646, + "step": 37435 + }, + { + "epoch": 2.543823889115369, + "grad_norm": 1.2031069993972778, + "learning_rate": 0.0006821494088870771, + "loss": 3.4543, + "step": 37440 + }, + { + "epoch": 2.5441636091860307, + "grad_norm": 0.9711928963661194, + "learning_rate": 0.0006821069438782444, + "loss": 3.4306, + "step": 37445 + }, + { + "epoch": 2.5445033292566928, + "grad_norm": 0.8750194907188416, + "learning_rate": 0.0006820644788694116, + "loss": 3.5273, + "step": 37450 + }, + { + "epoch": 2.5448430493273544, + "grad_norm": 1.1471461057662964, + "learning_rate": 0.0006820220138605789, + "loss": 3.4645, + "step": 37455 + }, + { + "epoch": 2.545182769398016, + "grad_norm": 0.8320173025131226, + "learning_rate": 0.0006819795488517462, + "loss": 3.6127, + "step": 37460 + }, + { + "epoch": 2.5455224894686777, + "grad_norm": 1.0403026342391968, + "learning_rate": 0.0006819370838429134, + "loss": 3.5644, + "step": 37465 + }, + { + "epoch": 2.5458622095393397, + "grad_norm": 1.3317701816558838, + "learning_rate": 0.0006818946188340808, + "loss": 3.5959, + "step": 37470 + }, + { + "epoch": 2.5462019296100014, + "grad_norm": 0.9442824125289917, + "learning_rate": 0.000681852153825248, + "loss": 3.4478, + "step": 37475 + }, + { + "epoch": 2.546541649680663, + "grad_norm": 1.0081294775009155, + "learning_rate": 0.0006818096888164153, + "loss": 3.3594, + "step": 37480 + }, + { + "epoch": 2.546881369751325, + "grad_norm": 0.7751688957214355, + "learning_rate": 0.0006817672238075826, + "loss": 3.7801, + "step": 37485 + }, + { + "epoch": 2.5472210898219867, + "grad_norm": 0.9441704750061035, + "learning_rate": 0.0006817247587987498, + "loss": 3.7924, + "step": 37490 + }, + { + "epoch": 2.5475608098926483, + "grad_norm": 0.8323125839233398, + "learning_rate": 0.0006816822937899171, + "loss": 3.4242, + "step": 37495 + }, + { + "epoch": 2.54790052996331, + "grad_norm": 0.871675968170166, + "learning_rate": 0.0006816398287810844, + "loss": 3.5567, + "step": 37500 + }, + { + "epoch": 2.548240250033972, + "grad_norm": 0.7489525079727173, + "learning_rate": 0.0006815973637722517, + "loss": 3.6946, + "step": 37505 + }, + { + "epoch": 2.5485799701046337, + "grad_norm": 1.1373589038848877, + "learning_rate": 0.000681554898763419, + "loss": 3.6565, + "step": 37510 + }, + { + "epoch": 2.5489196901752953, + "grad_norm": 0.8214653134346008, + "learning_rate": 0.0006815124337545863, + "loss": 3.5435, + "step": 37515 + }, + { + "epoch": 2.5492594102459574, + "grad_norm": 0.7379087805747986, + "learning_rate": 0.0006814699687457535, + "loss": 3.5017, + "step": 37520 + }, + { + "epoch": 2.549599130316619, + "grad_norm": 0.9120932817459106, + "learning_rate": 0.0006814275037369207, + "loss": 3.7545, + "step": 37525 + }, + { + "epoch": 2.5499388503872806, + "grad_norm": 0.7414853572845459, + "learning_rate": 0.0006813850387280881, + "loss": 3.5622, + "step": 37530 + }, + { + "epoch": 2.5502785704579427, + "grad_norm": 0.8253170847892761, + "learning_rate": 0.0006813425737192553, + "loss": 3.7085, + "step": 37535 + }, + { + "epoch": 2.5506182905286043, + "grad_norm": 0.7483157515525818, + "learning_rate": 0.0006813001087104226, + "loss": 3.599, + "step": 37540 + }, + { + "epoch": 2.550958010599266, + "grad_norm": 0.9109858274459839, + "learning_rate": 0.00068125764370159, + "loss": 3.5612, + "step": 37545 + }, + { + "epoch": 2.551297730669928, + "grad_norm": 0.90876704454422, + "learning_rate": 0.0006812151786927572, + "loss": 3.3054, + "step": 37550 + }, + { + "epoch": 2.5516374507405897, + "grad_norm": 0.8267819881439209, + "learning_rate": 0.0006811727136839244, + "loss": 3.2621, + "step": 37555 + }, + { + "epoch": 2.5519771708112513, + "grad_norm": 0.8463085889816284, + "learning_rate": 0.0006811302486750918, + "loss": 3.2608, + "step": 37560 + }, + { + "epoch": 2.5523168908819134, + "grad_norm": 0.7272642254829407, + "learning_rate": 0.000681087783666259, + "loss": 3.4601, + "step": 37565 + }, + { + "epoch": 2.552656610952575, + "grad_norm": 1.1404623985290527, + "learning_rate": 0.0006810453186574262, + "loss": 3.4145, + "step": 37570 + }, + { + "epoch": 2.5529963310232366, + "grad_norm": 0.8199654817581177, + "learning_rate": 0.0006810028536485936, + "loss": 3.461, + "step": 37575 + }, + { + "epoch": 2.5533360510938987, + "grad_norm": 0.8466174602508545, + "learning_rate": 0.0006809603886397609, + "loss": 3.8297, + "step": 37580 + }, + { + "epoch": 2.5536757711645603, + "grad_norm": 0.7050384879112244, + "learning_rate": 0.0006809179236309281, + "loss": 3.1746, + "step": 37585 + }, + { + "epoch": 2.554015491235222, + "grad_norm": 0.9237068295478821, + "learning_rate": 0.0006808754586220954, + "loss": 3.5473, + "step": 37590 + }, + { + "epoch": 2.554355211305884, + "grad_norm": 0.9028370380401611, + "learning_rate": 0.0006808329936132627, + "loss": 3.5851, + "step": 37595 + }, + { + "epoch": 2.5546949313765457, + "grad_norm": 0.8120203018188477, + "learning_rate": 0.0006807905286044299, + "loss": 3.2831, + "step": 37600 + }, + { + "epoch": 2.5550346514472073, + "grad_norm": 0.7166442275047302, + "learning_rate": 0.0006807480635955972, + "loss": 3.6266, + "step": 37605 + }, + { + "epoch": 2.5553743715178694, + "grad_norm": 5.250967979431152, + "learning_rate": 0.0006807055985867646, + "loss": 3.643, + "step": 37610 + }, + { + "epoch": 2.555714091588531, + "grad_norm": 1.1823431253433228, + "learning_rate": 0.0006806631335779318, + "loss": 3.3613, + "step": 37615 + }, + { + "epoch": 2.5560538116591927, + "grad_norm": 0.81910640001297, + "learning_rate": 0.0006806206685690991, + "loss": 3.7078, + "step": 37620 + }, + { + "epoch": 2.5563935317298547, + "grad_norm": 1.4527661800384521, + "learning_rate": 0.0006805782035602663, + "loss": 3.7822, + "step": 37625 + }, + { + "epoch": 2.5567332518005164, + "grad_norm": 1.1609488725662231, + "learning_rate": 0.0006805357385514336, + "loss": 3.4757, + "step": 37630 + }, + { + "epoch": 2.557072971871178, + "grad_norm": 0.8663038611412048, + "learning_rate": 0.0006804932735426009, + "loss": 3.6955, + "step": 37635 + }, + { + "epoch": 2.55741269194184, + "grad_norm": 1.0075753927230835, + "learning_rate": 0.0006804508085337681, + "loss": 3.3959, + "step": 37640 + }, + { + "epoch": 2.5577524120125017, + "grad_norm": 0.9179070591926575, + "learning_rate": 0.0006804083435249355, + "loss": 3.5803, + "step": 37645 + }, + { + "epoch": 2.5580921320831633, + "grad_norm": 0.7366122603416443, + "learning_rate": 0.0006803658785161028, + "loss": 3.4251, + "step": 37650 + }, + { + "epoch": 2.5584318521538254, + "grad_norm": 0.778286337852478, + "learning_rate": 0.00068032341350727, + "loss": 3.6142, + "step": 37655 + }, + { + "epoch": 2.558771572224487, + "grad_norm": 0.8225005269050598, + "learning_rate": 0.0006802809484984372, + "loss": 3.4763, + "step": 37660 + }, + { + "epoch": 2.5591112922951487, + "grad_norm": 0.6856055855751038, + "learning_rate": 0.0006802384834896046, + "loss": 3.2982, + "step": 37665 + }, + { + "epoch": 2.5594510123658107, + "grad_norm": 0.6743208765983582, + "learning_rate": 0.0006801960184807718, + "loss": 3.5247, + "step": 37670 + }, + { + "epoch": 2.5597907324364724, + "grad_norm": 0.7775627970695496, + "learning_rate": 0.0006801535534719391, + "loss": 3.661, + "step": 37675 + }, + { + "epoch": 2.560130452507134, + "grad_norm": 0.9776765704154968, + "learning_rate": 0.0006801110884631065, + "loss": 3.6184, + "step": 37680 + }, + { + "epoch": 2.560470172577796, + "grad_norm": 0.8821917772293091, + "learning_rate": 0.0006800686234542737, + "loss": 3.6346, + "step": 37685 + }, + { + "epoch": 2.5608098926484577, + "grad_norm": 1.133423924446106, + "learning_rate": 0.000680026158445441, + "loss": 3.7219, + "step": 37690 + }, + { + "epoch": 2.5611496127191193, + "grad_norm": 0.7596054673194885, + "learning_rate": 0.0006799836934366083, + "loss": 3.5566, + "step": 37695 + }, + { + "epoch": 2.5614893327897814, + "grad_norm": 0.8737888932228088, + "learning_rate": 0.0006799412284277755, + "loss": 3.5167, + "step": 37700 + }, + { + "epoch": 2.561829052860443, + "grad_norm": 0.9928485751152039, + "learning_rate": 0.0006798987634189428, + "loss": 3.5358, + "step": 37705 + }, + { + "epoch": 2.5621687729311047, + "grad_norm": 0.8116140961647034, + "learning_rate": 0.00067985629841011, + "loss": 3.4842, + "step": 37710 + }, + { + "epoch": 2.5625084930017668, + "grad_norm": 0.7324190735816956, + "learning_rate": 0.0006798138334012774, + "loss": 3.6537, + "step": 37715 + }, + { + "epoch": 2.5628482130724284, + "grad_norm": 0.9659972786903381, + "learning_rate": 0.0006797713683924447, + "loss": 3.5528, + "step": 37720 + }, + { + "epoch": 2.56318793314309, + "grad_norm": 0.9801137447357178, + "learning_rate": 0.0006797289033836119, + "loss": 3.3014, + "step": 37725 + }, + { + "epoch": 2.563527653213752, + "grad_norm": 0.8279529809951782, + "learning_rate": 0.0006796864383747792, + "loss": 3.741, + "step": 37730 + }, + { + "epoch": 2.5638673732844137, + "grad_norm": 0.9102848172187805, + "learning_rate": 0.0006796439733659465, + "loss": 3.6132, + "step": 37735 + }, + { + "epoch": 2.5642070933550754, + "grad_norm": 0.7214458584785461, + "learning_rate": 0.0006796015083571137, + "loss": 3.6381, + "step": 37740 + }, + { + "epoch": 2.5645468134257374, + "grad_norm": 0.6891651153564453, + "learning_rate": 0.000679559043348281, + "loss": 3.6591, + "step": 37745 + }, + { + "epoch": 2.564886533496399, + "grad_norm": 0.8058345317840576, + "learning_rate": 0.0006795165783394484, + "loss": 3.4067, + "step": 37750 + }, + { + "epoch": 2.5652262535670607, + "grad_norm": 0.7558161616325378, + "learning_rate": 0.0006794741133306156, + "loss": 3.5244, + "step": 37755 + }, + { + "epoch": 2.5655659736377228, + "grad_norm": 0.972630500793457, + "learning_rate": 0.0006794316483217828, + "loss": 3.5997, + "step": 37760 + }, + { + "epoch": 2.5659056937083844, + "grad_norm": 0.8297915458679199, + "learning_rate": 0.0006793891833129502, + "loss": 3.5944, + "step": 37765 + }, + { + "epoch": 2.566245413779046, + "grad_norm": 0.9361753463745117, + "learning_rate": 0.0006793467183041174, + "loss": 3.4942, + "step": 37770 + }, + { + "epoch": 2.566585133849708, + "grad_norm": 0.9300594925880432, + "learning_rate": 0.0006793042532952846, + "loss": 3.723, + "step": 37775 + }, + { + "epoch": 2.5669248539203697, + "grad_norm": 0.8634765148162842, + "learning_rate": 0.000679261788286452, + "loss": 3.6993, + "step": 37780 + }, + { + "epoch": 2.5672645739910314, + "grad_norm": 0.6678186655044556, + "learning_rate": 0.0006792193232776193, + "loss": 3.4815, + "step": 37785 + }, + { + "epoch": 2.5676042940616934, + "grad_norm": 0.7153738141059875, + "learning_rate": 0.0006791768582687865, + "loss": 3.8409, + "step": 37790 + }, + { + "epoch": 2.567944014132355, + "grad_norm": 0.7481696605682373, + "learning_rate": 0.0006791343932599539, + "loss": 3.5374, + "step": 37795 + }, + { + "epoch": 2.5682837342030167, + "grad_norm": 1.179614543914795, + "learning_rate": 0.0006790919282511211, + "loss": 3.7504, + "step": 37800 + }, + { + "epoch": 2.5686234542736783, + "grad_norm": 0.9024316072463989, + "learning_rate": 0.0006790494632422883, + "loss": 3.5086, + "step": 37805 + }, + { + "epoch": 2.5689631743443404, + "grad_norm": 0.6392594575881958, + "learning_rate": 0.0006790069982334557, + "loss": 3.6085, + "step": 37810 + }, + { + "epoch": 2.569302894415002, + "grad_norm": 0.7534192204475403, + "learning_rate": 0.0006789645332246229, + "loss": 3.6664, + "step": 37815 + }, + { + "epoch": 2.5696426144856637, + "grad_norm": 0.9390565156936646, + "learning_rate": 0.0006789220682157902, + "loss": 3.5956, + "step": 37820 + }, + { + "epoch": 2.5699823345563257, + "grad_norm": 0.8217517137527466, + "learning_rate": 0.0006788796032069575, + "loss": 3.5739, + "step": 37825 + }, + { + "epoch": 2.5703220546269874, + "grad_norm": 0.8223580718040466, + "learning_rate": 0.0006788371381981248, + "loss": 3.248, + "step": 37830 + }, + { + "epoch": 2.570661774697649, + "grad_norm": 0.881181001663208, + "learning_rate": 0.000678794673189292, + "loss": 3.4098, + "step": 37835 + }, + { + "epoch": 2.5710014947683106, + "grad_norm": 0.9131170511245728, + "learning_rate": 0.0006787522081804593, + "loss": 3.4219, + "step": 37840 + }, + { + "epoch": 2.5713412148389727, + "grad_norm": 0.9608713984489441, + "learning_rate": 0.0006787097431716266, + "loss": 3.4352, + "step": 37845 + }, + { + "epoch": 2.5716809349096343, + "grad_norm": 0.9232349395751953, + "learning_rate": 0.0006786672781627938, + "loss": 3.7012, + "step": 37850 + }, + { + "epoch": 2.572020654980296, + "grad_norm": 0.8997893333435059, + "learning_rate": 0.0006786248131539612, + "loss": 3.7041, + "step": 37855 + }, + { + "epoch": 2.572360375050958, + "grad_norm": 0.7555089592933655, + "learning_rate": 0.0006785823481451285, + "loss": 3.5112, + "step": 37860 + }, + { + "epoch": 2.5727000951216197, + "grad_norm": 1.211995005607605, + "learning_rate": 0.0006785398831362957, + "loss": 3.7721, + "step": 37865 + }, + { + "epoch": 2.5730398151922813, + "grad_norm": 0.8534327745437622, + "learning_rate": 0.000678497418127463, + "loss": 3.5719, + "step": 37870 + }, + { + "epoch": 2.5733795352629434, + "grad_norm": 0.8279335498809814, + "learning_rate": 0.0006784549531186302, + "loss": 3.598, + "step": 37875 + }, + { + "epoch": 2.573719255333605, + "grad_norm": 1.0076227188110352, + "learning_rate": 0.0006784124881097975, + "loss": 3.4672, + "step": 37880 + }, + { + "epoch": 2.5740589754042666, + "grad_norm": 0.8993183970451355, + "learning_rate": 0.0006783700231009648, + "loss": 3.5105, + "step": 37885 + }, + { + "epoch": 2.5743986954749287, + "grad_norm": 0.7372032999992371, + "learning_rate": 0.0006783275580921321, + "loss": 3.6086, + "step": 37890 + }, + { + "epoch": 2.5747384155455904, + "grad_norm": 1.3244833946228027, + "learning_rate": 0.0006782850930832994, + "loss": 3.6195, + "step": 37895 + }, + { + "epoch": 2.575078135616252, + "grad_norm": 0.9929924607276917, + "learning_rate": 0.0006782426280744667, + "loss": 3.7953, + "step": 37900 + }, + { + "epoch": 2.575417855686914, + "grad_norm": 1.2727817296981812, + "learning_rate": 0.0006782001630656339, + "loss": 3.749, + "step": 37905 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 1.3611867427825928, + "learning_rate": 0.0006781576980568011, + "loss": 3.6386, + "step": 37910 + }, + { + "epoch": 2.5760972958282373, + "grad_norm": 0.9790560007095337, + "learning_rate": 0.0006781152330479685, + "loss": 3.5984, + "step": 37915 + }, + { + "epoch": 2.5764370158988994, + "grad_norm": 0.7845713496208191, + "learning_rate": 0.0006780727680391357, + "loss": 3.4924, + "step": 37920 + }, + { + "epoch": 2.576776735969561, + "grad_norm": 0.9649502038955688, + "learning_rate": 0.000678030303030303, + "loss": 3.6299, + "step": 37925 + }, + { + "epoch": 2.5771164560402227, + "grad_norm": 0.8448287844657898, + "learning_rate": 0.0006779878380214704, + "loss": 3.3995, + "step": 37930 + }, + { + "epoch": 2.5774561761108847, + "grad_norm": 1.1541836261749268, + "learning_rate": 0.0006779453730126376, + "loss": 3.3624, + "step": 37935 + }, + { + "epoch": 2.5777958961815464, + "grad_norm": 0.8350121378898621, + "learning_rate": 0.0006779029080038048, + "loss": 3.4881, + "step": 37940 + }, + { + "epoch": 2.578135616252208, + "grad_norm": 1.1286314725875854, + "learning_rate": 0.0006778604429949722, + "loss": 3.4056, + "step": 37945 + }, + { + "epoch": 2.57847533632287, + "grad_norm": 0.8367844820022583, + "learning_rate": 0.0006778179779861394, + "loss": 3.4662, + "step": 37950 + }, + { + "epoch": 2.5788150563935317, + "grad_norm": 0.8371274471282959, + "learning_rate": 0.0006777755129773066, + "loss": 3.4958, + "step": 37955 + }, + { + "epoch": 2.5791547764641933, + "grad_norm": 0.84357088804245, + "learning_rate": 0.000677733047968474, + "loss": 3.2596, + "step": 37960 + }, + { + "epoch": 2.5794944965348554, + "grad_norm": 0.8393417596817017, + "learning_rate": 0.0006776905829596413, + "loss": 3.7401, + "step": 37965 + }, + { + "epoch": 2.579834216605517, + "grad_norm": 1.0599581003189087, + "learning_rate": 0.0006776481179508085, + "loss": 3.3788, + "step": 37970 + }, + { + "epoch": 2.5801739366761787, + "grad_norm": 0.8268111348152161, + "learning_rate": 0.0006776056529419758, + "loss": 3.4958, + "step": 37975 + }, + { + "epoch": 2.5805136567468407, + "grad_norm": 1.018105149269104, + "learning_rate": 0.0006775631879331431, + "loss": 3.6094, + "step": 37980 + }, + { + "epoch": 2.5808533768175024, + "grad_norm": 0.877504825592041, + "learning_rate": 0.0006775207229243103, + "loss": 3.3656, + "step": 37985 + }, + { + "epoch": 2.581193096888164, + "grad_norm": 1.043851375579834, + "learning_rate": 0.0006774782579154777, + "loss": 3.6982, + "step": 37990 + }, + { + "epoch": 2.581532816958826, + "grad_norm": 0.783184289932251, + "learning_rate": 0.000677435792906645, + "loss": 3.6907, + "step": 37995 + }, + { + "epoch": 2.5818725370294877, + "grad_norm": 1.0015839338302612, + "learning_rate": 0.0006773933278978122, + "loss": 3.4253, + "step": 38000 + }, + { + "epoch": 2.5822122571001493, + "grad_norm": 1.0572208166122437, + "learning_rate": 0.0006773508628889795, + "loss": 3.3999, + "step": 38005 + }, + { + "epoch": 2.5825519771708114, + "grad_norm": 0.6689481139183044, + "learning_rate": 0.0006773083978801467, + "loss": 3.8593, + "step": 38010 + }, + { + "epoch": 2.582891697241473, + "grad_norm": 0.9192221760749817, + "learning_rate": 0.0006772659328713141, + "loss": 3.6528, + "step": 38015 + }, + { + "epoch": 2.5832314173121347, + "grad_norm": 0.8209254741668701, + "learning_rate": 0.0006772234678624813, + "loss": 3.777, + "step": 38020 + }, + { + "epoch": 2.5835711373827968, + "grad_norm": 0.8303192257881165, + "learning_rate": 0.0006771810028536486, + "loss": 3.5005, + "step": 38025 + }, + { + "epoch": 2.5839108574534584, + "grad_norm": 0.8888244032859802, + "learning_rate": 0.000677138537844816, + "loss": 3.4954, + "step": 38030 + }, + { + "epoch": 2.58425057752412, + "grad_norm": 1.1203625202178955, + "learning_rate": 0.0006770960728359832, + "loss": 3.6063, + "step": 38035 + }, + { + "epoch": 2.584590297594782, + "grad_norm": 0.874431848526001, + "learning_rate": 0.0006770536078271504, + "loss": 3.7255, + "step": 38040 + }, + { + "epoch": 2.5849300176654437, + "grad_norm": 0.7736310362815857, + "learning_rate": 0.0006770111428183178, + "loss": 3.5856, + "step": 38045 + }, + { + "epoch": 2.5852697377361054, + "grad_norm": 0.9631763100624084, + "learning_rate": 0.000676968677809485, + "loss": 3.5168, + "step": 38050 + }, + { + "epoch": 2.5856094578067674, + "grad_norm": 0.7791212201118469, + "learning_rate": 0.0006769262128006522, + "loss": 3.6981, + "step": 38055 + }, + { + "epoch": 2.585949177877429, + "grad_norm": 0.7987561821937561, + "learning_rate": 0.0006768837477918197, + "loss": 3.5784, + "step": 38060 + }, + { + "epoch": 2.5862888979480907, + "grad_norm": 1.0018635988235474, + "learning_rate": 0.0006768412827829869, + "loss": 3.7195, + "step": 38065 + }, + { + "epoch": 2.5866286180187528, + "grad_norm": 0.6976622343063354, + "learning_rate": 0.0006767988177741541, + "loss": 3.7264, + "step": 38070 + }, + { + "epoch": 2.5869683380894144, + "grad_norm": 0.7477848529815674, + "learning_rate": 0.0006767563527653214, + "loss": 3.6421, + "step": 38075 + }, + { + "epoch": 2.587308058160076, + "grad_norm": 0.9398301243782043, + "learning_rate": 0.0006767138877564887, + "loss": 3.3678, + "step": 38080 + }, + { + "epoch": 2.587647778230738, + "grad_norm": 0.6285567879676819, + "learning_rate": 0.0006766714227476559, + "loss": 3.6065, + "step": 38085 + }, + { + "epoch": 2.5879874983013997, + "grad_norm": 0.8412474989891052, + "learning_rate": 0.0006766289577388232, + "loss": 3.5638, + "step": 38090 + }, + { + "epoch": 2.5883272183720614, + "grad_norm": 0.8571786284446716, + "learning_rate": 0.0006765864927299906, + "loss": 3.5619, + "step": 38095 + }, + { + "epoch": 2.5886669384427234, + "grad_norm": 0.742932140827179, + "learning_rate": 0.0006765440277211578, + "loss": 3.5376, + "step": 38100 + }, + { + "epoch": 2.589006658513385, + "grad_norm": 0.861108660697937, + "learning_rate": 0.0006765015627123251, + "loss": 3.8183, + "step": 38105 + }, + { + "epoch": 2.5893463785840467, + "grad_norm": 1.010828971862793, + "learning_rate": 0.0006764590977034923, + "loss": 3.8026, + "step": 38110 + }, + { + "epoch": 2.589686098654709, + "grad_norm": 0.9299313426017761, + "learning_rate": 0.0006764166326946596, + "loss": 3.5305, + "step": 38115 + }, + { + "epoch": 2.5900258187253704, + "grad_norm": 1.1794371604919434, + "learning_rate": 0.0006763741676858269, + "loss": 3.4218, + "step": 38120 + }, + { + "epoch": 2.590365538796032, + "grad_norm": 0.939010739326477, + "learning_rate": 0.0006763317026769941, + "loss": 3.6882, + "step": 38125 + }, + { + "epoch": 2.590705258866694, + "grad_norm": 1.101215124130249, + "learning_rate": 0.0006762892376681615, + "loss": 3.5797, + "step": 38130 + }, + { + "epoch": 2.5910449789373557, + "grad_norm": 0.8999865651130676, + "learning_rate": 0.0006762467726593288, + "loss": 3.5481, + "step": 38135 + }, + { + "epoch": 2.5913846990080174, + "grad_norm": 1.1950606107711792, + "learning_rate": 0.000676204307650496, + "loss": 3.4889, + "step": 38140 + }, + { + "epoch": 2.591724419078679, + "grad_norm": 0.8714291453361511, + "learning_rate": 0.0006761618426416633, + "loss": 3.7253, + "step": 38145 + }, + { + "epoch": 2.592064139149341, + "grad_norm": 0.9698233604431152, + "learning_rate": 0.0006761193776328306, + "loss": 3.5116, + "step": 38150 + }, + { + "epoch": 2.5924038592200027, + "grad_norm": 0.7656810879707336, + "learning_rate": 0.0006760769126239978, + "loss": 3.5647, + "step": 38155 + }, + { + "epoch": 2.5927435792906643, + "grad_norm": 0.8446677327156067, + "learning_rate": 0.000676034447615165, + "loss": 3.6179, + "step": 38160 + }, + { + "epoch": 2.5930832993613264, + "grad_norm": 0.8355880975723267, + "learning_rate": 0.0006759919826063325, + "loss": 3.491, + "step": 38165 + }, + { + "epoch": 2.593423019431988, + "grad_norm": 0.9361560940742493, + "learning_rate": 0.0006759495175974997, + "loss": 3.4815, + "step": 38170 + }, + { + "epoch": 2.5937627395026497, + "grad_norm": 0.8436786532402039, + "learning_rate": 0.0006759070525886669, + "loss": 3.3711, + "step": 38175 + }, + { + "epoch": 2.5941024595733113, + "grad_norm": 1.000539779663086, + "learning_rate": 0.0006758645875798343, + "loss": 3.594, + "step": 38180 + }, + { + "epoch": 2.5944421796439734, + "grad_norm": 0.8311334848403931, + "learning_rate": 0.0006758221225710015, + "loss": 3.3296, + "step": 38185 + }, + { + "epoch": 2.594781899714635, + "grad_norm": 0.7725995182991028, + "learning_rate": 0.0006757796575621687, + "loss": 3.4756, + "step": 38190 + }, + { + "epoch": 2.5951216197852967, + "grad_norm": 1.086095929145813, + "learning_rate": 0.000675737192553336, + "loss": 3.7606, + "step": 38195 + }, + { + "epoch": 2.5954613398559587, + "grad_norm": 0.8489976525306702, + "learning_rate": 0.0006756947275445034, + "loss": 3.4777, + "step": 38200 + }, + { + "epoch": 2.5958010599266204, + "grad_norm": 0.7761622667312622, + "learning_rate": 0.0006756522625356706, + "loss": 3.4632, + "step": 38205 + }, + { + "epoch": 2.596140779997282, + "grad_norm": 0.623809278011322, + "learning_rate": 0.000675609797526838, + "loss": 3.6777, + "step": 38210 + }, + { + "epoch": 2.596480500067944, + "grad_norm": 0.989152193069458, + "learning_rate": 0.0006755673325180052, + "loss": 3.4934, + "step": 38215 + }, + { + "epoch": 2.5968202201386057, + "grad_norm": 0.7261961102485657, + "learning_rate": 0.0006755248675091724, + "loss": 3.5014, + "step": 38220 + }, + { + "epoch": 2.5971599402092673, + "grad_norm": 0.8468483686447144, + "learning_rate": 0.0006754824025003397, + "loss": 3.5318, + "step": 38225 + }, + { + "epoch": 2.5974996602799294, + "grad_norm": 0.7214759588241577, + "learning_rate": 0.000675439937491507, + "loss": 3.6616, + "step": 38230 + }, + { + "epoch": 2.597839380350591, + "grad_norm": 0.760964572429657, + "learning_rate": 0.0006753974724826743, + "loss": 3.6139, + "step": 38235 + }, + { + "epoch": 2.5981791004212527, + "grad_norm": 0.7891107201576233, + "learning_rate": 0.0006753550074738416, + "loss": 3.5163, + "step": 38240 + }, + { + "epoch": 2.5985188204919147, + "grad_norm": 0.7787559628486633, + "learning_rate": 0.0006753125424650089, + "loss": 3.6588, + "step": 38245 + }, + { + "epoch": 2.5988585405625764, + "grad_norm": 0.9040635824203491, + "learning_rate": 0.0006752700774561761, + "loss": 3.7388, + "step": 38250 + }, + { + "epoch": 2.599198260633238, + "grad_norm": 0.872596800327301, + "learning_rate": 0.0006752276124473434, + "loss": 3.6633, + "step": 38255 + }, + { + "epoch": 2.5995379807039, + "grad_norm": 0.7658705115318298, + "learning_rate": 0.0006751851474385106, + "loss": 3.5995, + "step": 38260 + }, + { + "epoch": 2.5998777007745617, + "grad_norm": 0.8247947096824646, + "learning_rate": 0.0006751426824296779, + "loss": 3.4037, + "step": 38265 + }, + { + "epoch": 2.6002174208452233, + "grad_norm": 0.9564549326896667, + "learning_rate": 0.0006751002174208453, + "loss": 3.8102, + "step": 38270 + }, + { + "epoch": 2.6005571409158854, + "grad_norm": 0.7177067399024963, + "learning_rate": 0.0006750577524120125, + "loss": 3.3471, + "step": 38275 + }, + { + "epoch": 2.600896860986547, + "grad_norm": 0.8093592524528503, + "learning_rate": 0.0006750152874031798, + "loss": 3.5218, + "step": 38280 + }, + { + "epoch": 2.6012365810572087, + "grad_norm": 0.7619259357452393, + "learning_rate": 0.0006749728223943471, + "loss": 3.7714, + "step": 38285 + }, + { + "epoch": 2.6015763011278707, + "grad_norm": 1.044609785079956, + "learning_rate": 0.0006749303573855143, + "loss": 3.3185, + "step": 38290 + }, + { + "epoch": 2.6019160211985324, + "grad_norm": 1.078311800956726, + "learning_rate": 0.0006748878923766815, + "loss": 3.5489, + "step": 38295 + }, + { + "epoch": 2.602255741269194, + "grad_norm": 1.0095213651657104, + "learning_rate": 0.0006748454273678489, + "loss": 3.6506, + "step": 38300 + }, + { + "epoch": 2.602595461339856, + "grad_norm": 1.4612904787063599, + "learning_rate": 0.0006748029623590162, + "loss": 3.3963, + "step": 38305 + }, + { + "epoch": 2.6029351814105177, + "grad_norm": 0.6966364979743958, + "learning_rate": 0.0006747604973501834, + "loss": 3.4787, + "step": 38310 + }, + { + "epoch": 2.6032749014811793, + "grad_norm": 1.0704988241195679, + "learning_rate": 0.0006747180323413508, + "loss": 3.436, + "step": 38315 + }, + { + "epoch": 2.6036146215518414, + "grad_norm": 0.8748809695243835, + "learning_rate": 0.000674675567332518, + "loss": 3.6197, + "step": 38320 + }, + { + "epoch": 2.603954341622503, + "grad_norm": 1.0644104480743408, + "learning_rate": 0.0006746331023236852, + "loss": 3.7833, + "step": 38325 + }, + { + "epoch": 2.6042940616931647, + "grad_norm": 0.7312983274459839, + "learning_rate": 0.0006745906373148526, + "loss": 3.7061, + "step": 38330 + }, + { + "epoch": 2.6046337817638268, + "grad_norm": 0.8587197065353394, + "learning_rate": 0.0006745481723060198, + "loss": 3.6381, + "step": 38335 + }, + { + "epoch": 2.6049735018344884, + "grad_norm": 0.9730299115180969, + "learning_rate": 0.0006745057072971871, + "loss": 3.6134, + "step": 38340 + }, + { + "epoch": 2.60531322190515, + "grad_norm": 0.7714915871620178, + "learning_rate": 0.0006744632422883545, + "loss": 3.5671, + "step": 38345 + }, + { + "epoch": 2.605652941975812, + "grad_norm": 1.0341767072677612, + "learning_rate": 0.0006744207772795217, + "loss": 3.4484, + "step": 38350 + }, + { + "epoch": 2.6059926620464737, + "grad_norm": 0.9606527090072632, + "learning_rate": 0.000674378312270689, + "loss": 3.4885, + "step": 38355 + }, + { + "epoch": 2.6063323821171354, + "grad_norm": 1.344500184059143, + "learning_rate": 0.0006743358472618562, + "loss": 3.5139, + "step": 38360 + }, + { + "epoch": 2.6066721021877974, + "grad_norm": 1.016011118888855, + "learning_rate": 0.0006742933822530235, + "loss": 3.3763, + "step": 38365 + }, + { + "epoch": 2.607011822258459, + "grad_norm": 0.6497259736061096, + "learning_rate": 0.0006742509172441908, + "loss": 3.7803, + "step": 38370 + }, + { + "epoch": 2.6073515423291207, + "grad_norm": 0.6779730916023254, + "learning_rate": 0.0006742084522353581, + "loss": 4.018, + "step": 38375 + }, + { + "epoch": 2.6076912623997828, + "grad_norm": 0.8149189352989197, + "learning_rate": 0.0006741659872265254, + "loss": 3.6869, + "step": 38380 + }, + { + "epoch": 2.6080309824704444, + "grad_norm": 0.9677640795707703, + "learning_rate": 0.0006741235222176927, + "loss": 3.6104, + "step": 38385 + }, + { + "epoch": 2.608370702541106, + "grad_norm": 0.7079936861991882, + "learning_rate": 0.0006740810572088599, + "loss": 3.5108, + "step": 38390 + }, + { + "epoch": 2.608710422611768, + "grad_norm": 0.7564866542816162, + "learning_rate": 0.0006740385922000271, + "loss": 3.8192, + "step": 38395 + }, + { + "epoch": 2.6090501426824297, + "grad_norm": 0.682205319404602, + "learning_rate": 0.0006739961271911945, + "loss": 3.6809, + "step": 38400 + }, + { + "epoch": 2.6093898627530914, + "grad_norm": 1.0763970613479614, + "learning_rate": 0.0006739536621823617, + "loss": 3.5612, + "step": 38405 + }, + { + "epoch": 2.6097295828237534, + "grad_norm": 0.7323270440101624, + "learning_rate": 0.000673911197173529, + "loss": 3.6161, + "step": 38410 + }, + { + "epoch": 2.610069302894415, + "grad_norm": 0.846630871295929, + "learning_rate": 0.0006738687321646964, + "loss": 3.699, + "step": 38415 + }, + { + "epoch": 2.6104090229650767, + "grad_norm": 0.8020626902580261, + "learning_rate": 0.0006738262671558636, + "loss": 3.5032, + "step": 38420 + }, + { + "epoch": 2.610748743035739, + "grad_norm": 0.8144040703773499, + "learning_rate": 0.0006737838021470308, + "loss": 3.5675, + "step": 38425 + }, + { + "epoch": 2.6110884631064004, + "grad_norm": 0.6797598600387573, + "learning_rate": 0.0006737413371381982, + "loss": 3.244, + "step": 38430 + }, + { + "epoch": 2.611428183177062, + "grad_norm": 0.7951819896697998, + "learning_rate": 0.0006736988721293654, + "loss": 3.3252, + "step": 38435 + }, + { + "epoch": 2.611767903247724, + "grad_norm": 1.1211295127868652, + "learning_rate": 0.0006736564071205326, + "loss": 3.5553, + "step": 38440 + }, + { + "epoch": 2.6121076233183858, + "grad_norm": 1.0152770280838013, + "learning_rate": 0.0006736139421117001, + "loss": 3.358, + "step": 38445 + }, + { + "epoch": 2.6124473433890474, + "grad_norm": 0.74626225233078, + "learning_rate": 0.0006735714771028673, + "loss": 3.4934, + "step": 38450 + }, + { + "epoch": 2.6127870634597095, + "grad_norm": 0.8204950094223022, + "learning_rate": 0.0006735290120940345, + "loss": 3.5419, + "step": 38455 + }, + { + "epoch": 2.613126783530371, + "grad_norm": 0.7391467094421387, + "learning_rate": 0.0006734865470852018, + "loss": 3.6469, + "step": 38460 + }, + { + "epoch": 2.6134665036010327, + "grad_norm": 0.8226571679115295, + "learning_rate": 0.0006734440820763691, + "loss": 3.6068, + "step": 38465 + }, + { + "epoch": 2.613806223671695, + "grad_norm": 1.1823673248291016, + "learning_rate": 0.0006734016170675363, + "loss": 3.5573, + "step": 38470 + }, + { + "epoch": 2.6141459437423564, + "grad_norm": 1.3506959676742554, + "learning_rate": 0.0006733591520587036, + "loss": 3.4486, + "step": 38475 + }, + { + "epoch": 2.614485663813018, + "grad_norm": 1.0129735469818115, + "learning_rate": 0.000673316687049871, + "loss": 3.7574, + "step": 38480 + }, + { + "epoch": 2.6148253838836797, + "grad_norm": 0.7659022808074951, + "learning_rate": 0.0006732742220410382, + "loss": 3.4922, + "step": 38485 + }, + { + "epoch": 2.6151651039543418, + "grad_norm": 1.2818197011947632, + "learning_rate": 0.0006732317570322055, + "loss": 3.4003, + "step": 38490 + }, + { + "epoch": 2.6155048240250034, + "grad_norm": 0.8387561440467834, + "learning_rate": 0.0006731892920233727, + "loss": 3.5789, + "step": 38495 + }, + { + "epoch": 2.615844544095665, + "grad_norm": 1.2622536420822144, + "learning_rate": 0.00067314682701454, + "loss": 3.4174, + "step": 38500 + }, + { + "epoch": 2.616184264166327, + "grad_norm": 0.8265799283981323, + "learning_rate": 0.0006731043620057073, + "loss": 3.1578, + "step": 38505 + }, + { + "epoch": 2.6165239842369887, + "grad_norm": 1.2628419399261475, + "learning_rate": 0.0006730618969968745, + "loss": 3.569, + "step": 38510 + }, + { + "epoch": 2.6168637043076504, + "grad_norm": 1.2857385873794556, + "learning_rate": 0.0006730194319880419, + "loss": 3.3399, + "step": 38515 + }, + { + "epoch": 2.617203424378312, + "grad_norm": 0.8821470737457275, + "learning_rate": 0.0006729769669792092, + "loss": 3.5939, + "step": 38520 + }, + { + "epoch": 2.617543144448974, + "grad_norm": 1.0424641370773315, + "learning_rate": 0.0006729345019703764, + "loss": 3.4324, + "step": 38525 + }, + { + "epoch": 2.6178828645196357, + "grad_norm": 1.0647976398468018, + "learning_rate": 0.0006728920369615437, + "loss": 3.5371, + "step": 38530 + }, + { + "epoch": 2.6182225845902973, + "grad_norm": 0.6089511513710022, + "learning_rate": 0.000672849571952711, + "loss": 3.2688, + "step": 38535 + }, + { + "epoch": 2.6185623046609594, + "grad_norm": 0.6411743760108948, + "learning_rate": 0.0006728071069438782, + "loss": 3.6208, + "step": 38540 + }, + { + "epoch": 2.618902024731621, + "grad_norm": 0.870461642742157, + "learning_rate": 0.0006727646419350454, + "loss": 3.5651, + "step": 38545 + }, + { + "epoch": 2.6192417448022827, + "grad_norm": 0.6502228379249573, + "learning_rate": 0.0006727221769262129, + "loss": 3.462, + "step": 38550 + }, + { + "epoch": 2.6195814648729447, + "grad_norm": 0.8295050859451294, + "learning_rate": 0.0006726797119173801, + "loss": 3.666, + "step": 38555 + }, + { + "epoch": 2.6199211849436064, + "grad_norm": 0.8866757154464722, + "learning_rate": 0.0006726372469085473, + "loss": 3.4996, + "step": 38560 + }, + { + "epoch": 2.620260905014268, + "grad_norm": 0.8572387099266052, + "learning_rate": 0.0006725947818997147, + "loss": 3.5374, + "step": 38565 + }, + { + "epoch": 2.62060062508493, + "grad_norm": 0.8934454321861267, + "learning_rate": 0.0006725523168908819, + "loss": 3.8224, + "step": 38570 + }, + { + "epoch": 2.6209403451555917, + "grad_norm": 0.8764317631721497, + "learning_rate": 0.0006725098518820491, + "loss": 3.5965, + "step": 38575 + }, + { + "epoch": 2.6212800652262533, + "grad_norm": 0.8057767152786255, + "learning_rate": 0.0006724673868732166, + "loss": 3.7058, + "step": 38580 + }, + { + "epoch": 2.6216197852969154, + "grad_norm": 0.9043989181518555, + "learning_rate": 0.0006724249218643838, + "loss": 3.6098, + "step": 38585 + }, + { + "epoch": 2.621959505367577, + "grad_norm": 0.8078992366790771, + "learning_rate": 0.000672382456855551, + "loss": 3.512, + "step": 38590 + }, + { + "epoch": 2.6222992254382387, + "grad_norm": 1.036250114440918, + "learning_rate": 0.0006723399918467184, + "loss": 3.5971, + "step": 38595 + }, + { + "epoch": 2.6226389455089008, + "grad_norm": 0.8827484250068665, + "learning_rate": 0.0006722975268378856, + "loss": 3.6656, + "step": 38600 + }, + { + "epoch": 2.6229786655795624, + "grad_norm": 0.9132946729660034, + "learning_rate": 0.0006722550618290528, + "loss": 3.7247, + "step": 38605 + }, + { + "epoch": 2.623318385650224, + "grad_norm": 0.8188766241073608, + "learning_rate": 0.0006722125968202201, + "loss": 3.566, + "step": 38610 + }, + { + "epoch": 2.623658105720886, + "grad_norm": 0.824389636516571, + "learning_rate": 0.0006721701318113875, + "loss": 3.5389, + "step": 38615 + }, + { + "epoch": 2.6239978257915477, + "grad_norm": 0.8895384073257446, + "learning_rate": 0.0006721276668025547, + "loss": 3.5453, + "step": 38620 + }, + { + "epoch": 2.6243375458622094, + "grad_norm": 0.862647533416748, + "learning_rate": 0.000672085201793722, + "loss": 3.8783, + "step": 38625 + }, + { + "epoch": 2.6246772659328714, + "grad_norm": 0.8207495212554932, + "learning_rate": 0.0006720427367848893, + "loss": 3.7678, + "step": 38630 + }, + { + "epoch": 2.625016986003533, + "grad_norm": 1.6234124898910522, + "learning_rate": 0.0006720002717760565, + "loss": 3.8941, + "step": 38635 + }, + { + "epoch": 2.6253567060741947, + "grad_norm": 0.9618303179740906, + "learning_rate": 0.0006719578067672238, + "loss": 3.6951, + "step": 38640 + }, + { + "epoch": 2.6256964261448568, + "grad_norm": 0.8330925703048706, + "learning_rate": 0.000671915341758391, + "loss": 3.6709, + "step": 38645 + }, + { + "epoch": 2.6260361462155184, + "grad_norm": 0.7659103274345398, + "learning_rate": 0.0006718728767495584, + "loss": 3.6977, + "step": 38650 + }, + { + "epoch": 2.62637586628618, + "grad_norm": 0.953326940536499, + "learning_rate": 0.0006718304117407257, + "loss": 3.278, + "step": 38655 + }, + { + "epoch": 2.626715586356842, + "grad_norm": 0.9244345426559448, + "learning_rate": 0.0006717879467318929, + "loss": 3.5307, + "step": 38660 + }, + { + "epoch": 2.6270553064275037, + "grad_norm": 0.8329584002494812, + "learning_rate": 0.0006717454817230602, + "loss": 3.4186, + "step": 38665 + }, + { + "epoch": 2.6273950264981654, + "grad_norm": 1.0347521305084229, + "learning_rate": 0.0006717030167142275, + "loss": 3.5959, + "step": 38670 + }, + { + "epoch": 2.6277347465688274, + "grad_norm": 0.9870880842208862, + "learning_rate": 0.0006716605517053947, + "loss": 3.4186, + "step": 38675 + }, + { + "epoch": 2.628074466639489, + "grad_norm": 0.9729521870613098, + "learning_rate": 0.000671618086696562, + "loss": 3.6275, + "step": 38680 + }, + { + "epoch": 2.6284141867101507, + "grad_norm": 0.796580970287323, + "learning_rate": 0.0006715756216877294, + "loss": 3.7469, + "step": 38685 + }, + { + "epoch": 2.6287539067808128, + "grad_norm": 0.7097721695899963, + "learning_rate": 0.0006715331566788966, + "loss": 3.7223, + "step": 38690 + }, + { + "epoch": 2.6290936268514744, + "grad_norm": 0.8824094533920288, + "learning_rate": 0.000671490691670064, + "loss": 3.5095, + "step": 38695 + }, + { + "epoch": 2.629433346922136, + "grad_norm": 0.7501387596130371, + "learning_rate": 0.0006714482266612312, + "loss": 3.4841, + "step": 38700 + }, + { + "epoch": 2.629773066992798, + "grad_norm": 0.816253662109375, + "learning_rate": 0.0006714057616523984, + "loss": 3.7418, + "step": 38705 + }, + { + "epoch": 2.6301127870634597, + "grad_norm": 0.6083940267562866, + "learning_rate": 0.0006713632966435657, + "loss": 3.7415, + "step": 38710 + }, + { + "epoch": 2.6304525071341214, + "grad_norm": 0.9839432835578918, + "learning_rate": 0.000671320831634733, + "loss": 3.5923, + "step": 38715 + }, + { + "epoch": 2.6307922272047835, + "grad_norm": 0.9502136707305908, + "learning_rate": 0.0006712783666259003, + "loss": 3.5504, + "step": 38720 + }, + { + "epoch": 2.631131947275445, + "grad_norm": 0.7697909474372864, + "learning_rate": 0.0006712359016170676, + "loss": 3.5279, + "step": 38725 + }, + { + "epoch": 2.6314716673461067, + "grad_norm": 1.0607547760009766, + "learning_rate": 0.0006711934366082349, + "loss": 3.503, + "step": 38730 + }, + { + "epoch": 2.631811387416769, + "grad_norm": 1.4073164463043213, + "learning_rate": 0.0006711509715994021, + "loss": 3.5213, + "step": 38735 + }, + { + "epoch": 2.6321511074874304, + "grad_norm": 0.9240948557853699, + "learning_rate": 0.0006711085065905694, + "loss": 3.5762, + "step": 38740 + }, + { + "epoch": 2.632490827558092, + "grad_norm": 0.8917648792266846, + "learning_rate": 0.0006710660415817366, + "loss": 3.6613, + "step": 38745 + }, + { + "epoch": 2.632830547628754, + "grad_norm": 0.7891961336135864, + "learning_rate": 0.0006710235765729039, + "loss": 3.5215, + "step": 38750 + }, + { + "epoch": 2.6331702676994158, + "grad_norm": 4.060850620269775, + "learning_rate": 0.0006709811115640713, + "loss": 3.2936, + "step": 38755 + }, + { + "epoch": 2.6335099877700774, + "grad_norm": 0.8698610663414001, + "learning_rate": 0.0006709386465552385, + "loss": 3.5436, + "step": 38760 + }, + { + "epoch": 2.6338497078407395, + "grad_norm": 0.8444616794586182, + "learning_rate": 0.0006708961815464058, + "loss": 3.4143, + "step": 38765 + }, + { + "epoch": 2.634189427911401, + "grad_norm": 1.6267286539077759, + "learning_rate": 0.0006708537165375731, + "loss": 3.6935, + "step": 38770 + }, + { + "epoch": 2.6345291479820627, + "grad_norm": 0.7878610491752625, + "learning_rate": 0.0006708112515287403, + "loss": 3.5595, + "step": 38775 + }, + { + "epoch": 2.634868868052725, + "grad_norm": 0.7533514499664307, + "learning_rate": 0.0006707687865199076, + "loss": 3.7803, + "step": 38780 + }, + { + "epoch": 2.6352085881233864, + "grad_norm": 0.9468477368354797, + "learning_rate": 0.0006707263215110749, + "loss": 3.8132, + "step": 38785 + }, + { + "epoch": 2.635548308194048, + "grad_norm": 1.0671122074127197, + "learning_rate": 0.0006706838565022422, + "loss": 3.7052, + "step": 38790 + }, + { + "epoch": 2.63588802826471, + "grad_norm": 0.884826123714447, + "learning_rate": 0.0006706413914934094, + "loss": 3.5897, + "step": 38795 + }, + { + "epoch": 2.6362277483353718, + "grad_norm": 1.3027029037475586, + "learning_rate": 0.0006705989264845768, + "loss": 3.6592, + "step": 38800 + }, + { + "epoch": 2.6365674684060334, + "grad_norm": 0.8767277002334595, + "learning_rate": 0.000670556461475744, + "loss": 3.4804, + "step": 38805 + }, + { + "epoch": 2.6369071884766955, + "grad_norm": 1.0876315832138062, + "learning_rate": 0.0006705139964669112, + "loss": 3.4561, + "step": 38810 + }, + { + "epoch": 2.637246908547357, + "grad_norm": 0.8744105696678162, + "learning_rate": 0.0006704715314580786, + "loss": 3.3514, + "step": 38815 + }, + { + "epoch": 2.6375866286180187, + "grad_norm": 0.995034396648407, + "learning_rate": 0.0006704290664492458, + "loss": 3.8485, + "step": 38820 + }, + { + "epoch": 2.6379263486886804, + "grad_norm": 0.8879557251930237, + "learning_rate": 0.0006703866014404131, + "loss": 3.6498, + "step": 38825 + }, + { + "epoch": 2.6382660687593424, + "grad_norm": 0.8152334690093994, + "learning_rate": 0.0006703441364315805, + "loss": 3.571, + "step": 38830 + }, + { + "epoch": 2.638605788830004, + "grad_norm": 0.7356874346733093, + "learning_rate": 0.0006703016714227477, + "loss": 3.7238, + "step": 38835 + }, + { + "epoch": 2.6389455089006657, + "grad_norm": 0.8899386525154114, + "learning_rate": 0.0006702592064139149, + "loss": 3.4135, + "step": 38840 + }, + { + "epoch": 2.639285228971328, + "grad_norm": 0.8223162889480591, + "learning_rate": 0.0006702167414050822, + "loss": 3.5741, + "step": 38845 + }, + { + "epoch": 2.6396249490419894, + "grad_norm": 0.95176762342453, + "learning_rate": 0.0006701742763962495, + "loss": 3.47, + "step": 38850 + }, + { + "epoch": 2.639964669112651, + "grad_norm": 0.7472764253616333, + "learning_rate": 0.0006701318113874167, + "loss": 3.6068, + "step": 38855 + }, + { + "epoch": 2.640304389183313, + "grad_norm": 0.8491698503494263, + "learning_rate": 0.0006700893463785841, + "loss": 3.5297, + "step": 38860 + }, + { + "epoch": 2.6406441092539747, + "grad_norm": 0.8788675665855408, + "learning_rate": 0.0006700468813697514, + "loss": 3.5733, + "step": 38865 + }, + { + "epoch": 2.6409838293246364, + "grad_norm": 0.7397850751876831, + "learning_rate": 0.0006700044163609186, + "loss": 3.4647, + "step": 38870 + }, + { + "epoch": 2.641323549395298, + "grad_norm": 0.9290870428085327, + "learning_rate": 0.0006699619513520859, + "loss": 3.7397, + "step": 38875 + }, + { + "epoch": 2.64166326946596, + "grad_norm": 0.6939991116523743, + "learning_rate": 0.0006699194863432532, + "loss": 3.5759, + "step": 38880 + }, + { + "epoch": 2.6420029895366217, + "grad_norm": 0.8173078894615173, + "learning_rate": 0.0006698770213344204, + "loss": 3.7564, + "step": 38885 + }, + { + "epoch": 2.6423427096072833, + "grad_norm": 0.7468464970588684, + "learning_rate": 0.0006698345563255877, + "loss": 3.6541, + "step": 38890 + }, + { + "epoch": 2.6426824296779454, + "grad_norm": 0.8659349679946899, + "learning_rate": 0.000669792091316755, + "loss": 3.5882, + "step": 38895 + }, + { + "epoch": 2.643022149748607, + "grad_norm": 0.9251850247383118, + "learning_rate": 0.0006697496263079223, + "loss": 3.2244, + "step": 38900 + }, + { + "epoch": 2.6433618698192687, + "grad_norm": 0.9088659286499023, + "learning_rate": 0.0006697071612990896, + "loss": 3.4529, + "step": 38905 + }, + { + "epoch": 2.6437015898899308, + "grad_norm": 0.6900408267974854, + "learning_rate": 0.0006696646962902568, + "loss": 3.4793, + "step": 38910 + }, + { + "epoch": 2.6440413099605924, + "grad_norm": 0.8476678729057312, + "learning_rate": 0.0006696222312814241, + "loss": 3.5769, + "step": 38915 + }, + { + "epoch": 2.644381030031254, + "grad_norm": 0.905314028263092, + "learning_rate": 0.0006695797662725914, + "loss": 3.7348, + "step": 38920 + }, + { + "epoch": 2.644720750101916, + "grad_norm": 0.8645856380462646, + "learning_rate": 0.0006695373012637586, + "loss": 3.3749, + "step": 38925 + }, + { + "epoch": 2.6450604701725777, + "grad_norm": 0.8407091498374939, + "learning_rate": 0.000669494836254926, + "loss": 3.3235, + "step": 38930 + }, + { + "epoch": 2.6454001902432394, + "grad_norm": 0.8845350742340088, + "learning_rate": 0.0006694523712460933, + "loss": 3.3859, + "step": 38935 + }, + { + "epoch": 2.6457399103139014, + "grad_norm": 0.8837825059890747, + "learning_rate": 0.0006694099062372605, + "loss": 3.4257, + "step": 38940 + }, + { + "epoch": 2.646079630384563, + "grad_norm": 0.7416989207267761, + "learning_rate": 0.0006693674412284277, + "loss": 3.6048, + "step": 38945 + }, + { + "epoch": 2.6464193504552247, + "grad_norm": 1.022812843322754, + "learning_rate": 0.0006693249762195951, + "loss": 3.5538, + "step": 38950 + }, + { + "epoch": 2.6467590705258868, + "grad_norm": 0.9573689103126526, + "learning_rate": 0.0006692825112107623, + "loss": 3.4931, + "step": 38955 + }, + { + "epoch": 2.6470987905965484, + "grad_norm": 1.0323059558868408, + "learning_rate": 0.0006692400462019295, + "loss": 3.2901, + "step": 38960 + }, + { + "epoch": 2.64743851066721, + "grad_norm": 0.8983903527259827, + "learning_rate": 0.000669197581193097, + "loss": 3.5528, + "step": 38965 + }, + { + "epoch": 2.647778230737872, + "grad_norm": 0.9208760261535645, + "learning_rate": 0.0006691551161842642, + "loss": 3.5378, + "step": 38970 + }, + { + "epoch": 2.6481179508085337, + "grad_norm": 0.7435709834098816, + "learning_rate": 0.0006691126511754314, + "loss": 3.5117, + "step": 38975 + }, + { + "epoch": 2.6484576708791954, + "grad_norm": 0.8828679919242859, + "learning_rate": 0.0006690701861665988, + "loss": 3.6783, + "step": 38980 + }, + { + "epoch": 2.6487973909498574, + "grad_norm": 0.7584705948829651, + "learning_rate": 0.000669027721157766, + "loss": 3.4886, + "step": 38985 + }, + { + "epoch": 2.649137111020519, + "grad_norm": 0.9489779472351074, + "learning_rate": 0.0006689852561489332, + "loss": 3.5154, + "step": 38990 + }, + { + "epoch": 2.6494768310911807, + "grad_norm": 0.8523688912391663, + "learning_rate": 0.0006689427911401005, + "loss": 3.3445, + "step": 38995 + }, + { + "epoch": 2.649816551161843, + "grad_norm": 0.888565719127655, + "learning_rate": 0.0006689003261312679, + "loss": 3.7111, + "step": 39000 + }, + { + "epoch": 2.6501562712325044, + "grad_norm": 1.1264264583587646, + "learning_rate": 0.0006688578611224351, + "loss": 3.454, + "step": 39005 + }, + { + "epoch": 2.650495991303166, + "grad_norm": 0.8178558349609375, + "learning_rate": 0.0006688153961136024, + "loss": 3.6627, + "step": 39010 + }, + { + "epoch": 2.650835711373828, + "grad_norm": 0.8959958553314209, + "learning_rate": 0.0006687729311047697, + "loss": 3.6283, + "step": 39015 + }, + { + "epoch": 2.6511754314444897, + "grad_norm": 0.9152992367744446, + "learning_rate": 0.0006687304660959369, + "loss": 3.2623, + "step": 39020 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.7884079217910767, + "learning_rate": 0.0006686880010871042, + "loss": 3.7162, + "step": 39025 + }, + { + "epoch": 2.6518548715858135, + "grad_norm": 0.959491491317749, + "learning_rate": 0.0006686455360782714, + "loss": 3.7082, + "step": 39030 + }, + { + "epoch": 2.652194591656475, + "grad_norm": 0.938376784324646, + "learning_rate": 0.0006686030710694389, + "loss": 3.7269, + "step": 39035 + }, + { + "epoch": 2.6525343117271367, + "grad_norm": 1.014055848121643, + "learning_rate": 0.0006685606060606061, + "loss": 3.6747, + "step": 39040 + }, + { + "epoch": 2.652874031797799, + "grad_norm": 0.9203429222106934, + "learning_rate": 0.0006685181410517733, + "loss": 3.5113, + "step": 39045 + }, + { + "epoch": 2.6532137518684604, + "grad_norm": 0.962773323059082, + "learning_rate": 0.0006684756760429407, + "loss": 3.5049, + "step": 39050 + }, + { + "epoch": 2.653553471939122, + "grad_norm": 1.3023988008499146, + "learning_rate": 0.0006684332110341079, + "loss": 3.2158, + "step": 39055 + }, + { + "epoch": 2.653893192009784, + "grad_norm": 1.4684962034225464, + "learning_rate": 0.0006683907460252751, + "loss": 3.7291, + "step": 39060 + }, + { + "epoch": 2.6542329120804458, + "grad_norm": 0.7858505249023438, + "learning_rate": 0.0006683482810164426, + "loss": 3.2878, + "step": 39065 + }, + { + "epoch": 2.6545726321511074, + "grad_norm": 0.8321568369865417, + "learning_rate": 0.0006683058160076098, + "loss": 3.8094, + "step": 39070 + }, + { + "epoch": 2.6549123522217695, + "grad_norm": 1.079740285873413, + "learning_rate": 0.000668263350998777, + "loss": 3.4595, + "step": 39075 + }, + { + "epoch": 2.655252072292431, + "grad_norm": 0.8521371483802795, + "learning_rate": 0.0006682208859899444, + "loss": 3.6233, + "step": 39080 + }, + { + "epoch": 2.6555917923630927, + "grad_norm": 0.7521746158599854, + "learning_rate": 0.0006681784209811116, + "loss": 3.5885, + "step": 39085 + }, + { + "epoch": 2.655931512433755, + "grad_norm": 0.7179979681968689, + "learning_rate": 0.0006681359559722788, + "loss": 3.5451, + "step": 39090 + }, + { + "epoch": 2.6562712325044164, + "grad_norm": 0.883536696434021, + "learning_rate": 0.0006680934909634461, + "loss": 3.5329, + "step": 39095 + }, + { + "epoch": 2.656610952575078, + "grad_norm": 1.0736478567123413, + "learning_rate": 0.0006680510259546135, + "loss": 3.5144, + "step": 39100 + }, + { + "epoch": 2.65695067264574, + "grad_norm": 0.7705216407775879, + "learning_rate": 0.0006680085609457807, + "loss": 3.4052, + "step": 39105 + }, + { + "epoch": 2.6572903927164018, + "grad_norm": 0.982172429561615, + "learning_rate": 0.000667966095936948, + "loss": 3.3179, + "step": 39110 + }, + { + "epoch": 2.6576301127870634, + "grad_norm": 1.1115076541900635, + "learning_rate": 0.0006679236309281153, + "loss": 3.5011, + "step": 39115 + }, + { + "epoch": 2.6579698328577255, + "grad_norm": 0.9303742051124573, + "learning_rate": 0.0006678811659192825, + "loss": 3.7544, + "step": 39120 + }, + { + "epoch": 2.658309552928387, + "grad_norm": 0.7708895802497864, + "learning_rate": 0.0006678387009104498, + "loss": 3.2509, + "step": 39125 + }, + { + "epoch": 2.6586492729990487, + "grad_norm": 0.966750979423523, + "learning_rate": 0.0006678047289033836, + "loss": 3.4229, + "step": 39130 + }, + { + "epoch": 2.658988993069711, + "grad_norm": 0.9908020496368408, + "learning_rate": 0.0006677622638945509, + "loss": 3.6438, + "step": 39135 + }, + { + "epoch": 2.6593287131403724, + "grad_norm": 0.834606409072876, + "learning_rate": 0.0006677197988857182, + "loss": 3.4673, + "step": 39140 + }, + { + "epoch": 2.659668433211034, + "grad_norm": 0.8238089680671692, + "learning_rate": 0.0006676773338768854, + "loss": 3.3394, + "step": 39145 + }, + { + "epoch": 2.660008153281696, + "grad_norm": 0.8827638030052185, + "learning_rate": 0.0006676348688680527, + "loss": 3.2764, + "step": 39150 + }, + { + "epoch": 2.660347873352358, + "grad_norm": 0.780141294002533, + "learning_rate": 0.00066759240385922, + "loss": 3.9909, + "step": 39155 + }, + { + "epoch": 2.6606875934230194, + "grad_norm": 0.8750724792480469, + "learning_rate": 0.0006675499388503873, + "loss": 3.4568, + "step": 39160 + }, + { + "epoch": 2.661027313493681, + "grad_norm": 1.040964961051941, + "learning_rate": 0.0006675074738415546, + "loss": 3.779, + "step": 39165 + }, + { + "epoch": 2.661367033564343, + "grad_norm": 1.2339928150177002, + "learning_rate": 0.0006674650088327219, + "loss": 3.4785, + "step": 39170 + }, + { + "epoch": 2.6617067536350048, + "grad_norm": 0.8126529455184937, + "learning_rate": 0.0006674225438238891, + "loss": 3.5, + "step": 39175 + }, + { + "epoch": 2.6620464737056664, + "grad_norm": 0.8360945582389832, + "learning_rate": 0.0006673800788150563, + "loss": 3.3948, + "step": 39180 + }, + { + "epoch": 2.6623861937763285, + "grad_norm": 0.7842538952827454, + "learning_rate": 0.0006673376138062237, + "loss": 3.258, + "step": 39185 + }, + { + "epoch": 2.66272591384699, + "grad_norm": 1.9919917583465576, + "learning_rate": 0.0006672951487973909, + "loss": 3.4127, + "step": 39190 + }, + { + "epoch": 2.6630656339176517, + "grad_norm": 0.7879732847213745, + "learning_rate": 0.0006672526837885582, + "loss": 3.6784, + "step": 39195 + }, + { + "epoch": 2.663405353988314, + "grad_norm": 1.3146772384643555, + "learning_rate": 0.0006672102187797256, + "loss": 3.7799, + "step": 39200 + }, + { + "epoch": 2.6637450740589754, + "grad_norm": 1.3198031187057495, + "learning_rate": 0.0006671677537708928, + "loss": 3.4744, + "step": 39205 + }, + { + "epoch": 2.664084794129637, + "grad_norm": 0.8575770258903503, + "learning_rate": 0.00066712528876206, + "loss": 3.6026, + "step": 39210 + }, + { + "epoch": 2.6644245142002987, + "grad_norm": 0.8673245906829834, + "learning_rate": 0.0006670828237532274, + "loss": 3.4801, + "step": 39215 + }, + { + "epoch": 2.6647642342709608, + "grad_norm": 0.8270705342292786, + "learning_rate": 0.0006670403587443946, + "loss": 3.3714, + "step": 39220 + }, + { + "epoch": 2.6651039543416224, + "grad_norm": 0.7948834896087646, + "learning_rate": 0.0006669978937355618, + "loss": 3.7969, + "step": 39225 + }, + { + "epoch": 2.665443674412284, + "grad_norm": 0.976115882396698, + "learning_rate": 0.0006669554287267292, + "loss": 3.4286, + "step": 39230 + }, + { + "epoch": 2.665783394482946, + "grad_norm": 0.8481581807136536, + "learning_rate": 0.0006669129637178965, + "loss": 3.4771, + "step": 39235 + }, + { + "epoch": 2.6661231145536077, + "grad_norm": 0.943724513053894, + "learning_rate": 0.0006668704987090638, + "loss": 3.5208, + "step": 39240 + }, + { + "epoch": 2.6664628346242694, + "grad_norm": 0.7770695090293884, + "learning_rate": 0.000666828033700231, + "loss": 3.5641, + "step": 39245 + }, + { + "epoch": 2.6668025546949314, + "grad_norm": 0.7771450281143188, + "learning_rate": 0.0006667855686913983, + "loss": 3.6681, + "step": 39250 + }, + { + "epoch": 2.667142274765593, + "grad_norm": 0.7848898768424988, + "learning_rate": 0.0006667431036825656, + "loss": 3.3323, + "step": 39255 + }, + { + "epoch": 2.6674819948362547, + "grad_norm": 1.1710138320922852, + "learning_rate": 0.0006667006386737328, + "loss": 3.7562, + "step": 39260 + }, + { + "epoch": 2.6678217149069168, + "grad_norm": 0.9494715332984924, + "learning_rate": 0.0006666581736649002, + "loss": 3.3919, + "step": 39265 + }, + { + "epoch": 2.6681614349775784, + "grad_norm": 0.8524022698402405, + "learning_rate": 0.0006666157086560675, + "loss": 3.4384, + "step": 39270 + }, + { + "epoch": 2.66850115504824, + "grad_norm": 0.7561802268028259, + "learning_rate": 0.0006665732436472347, + "loss": 3.7456, + "step": 39275 + }, + { + "epoch": 2.668840875118902, + "grad_norm": 1.1264036893844604, + "learning_rate": 0.0006665307786384019, + "loss": 3.67, + "step": 39280 + }, + { + "epoch": 2.6691805951895637, + "grad_norm": 0.7854278087615967, + "learning_rate": 0.0006664883136295693, + "loss": 3.7803, + "step": 39285 + }, + { + "epoch": 2.6695203152602254, + "grad_norm": 1.6234161853790283, + "learning_rate": 0.0006664458486207365, + "loss": 3.4955, + "step": 39290 + }, + { + "epoch": 2.6698600353308874, + "grad_norm": 1.33502995967865, + "learning_rate": 0.0006664033836119037, + "loss": 3.6899, + "step": 39295 + }, + { + "epoch": 2.670199755401549, + "grad_norm": 0.9480525255203247, + "learning_rate": 0.0006663609186030712, + "loss": 3.5231, + "step": 39300 + }, + { + "epoch": 2.6705394754722107, + "grad_norm": 0.8613016605377197, + "learning_rate": 0.0006663184535942384, + "loss": 3.4958, + "step": 39305 + }, + { + "epoch": 2.670879195542873, + "grad_norm": 1.0123047828674316, + "learning_rate": 0.0006662759885854056, + "loss": 3.5172, + "step": 39310 + }, + { + "epoch": 2.6712189156135344, + "grad_norm": 1.0312747955322266, + "learning_rate": 0.000666233523576573, + "loss": 3.6197, + "step": 39315 + }, + { + "epoch": 2.671558635684196, + "grad_norm": 1.0466002225875854, + "learning_rate": 0.0006661910585677402, + "loss": 3.8506, + "step": 39320 + }, + { + "epoch": 2.671898355754858, + "grad_norm": 1.0171934366226196, + "learning_rate": 0.0006661485935589074, + "loss": 3.5744, + "step": 39325 + }, + { + "epoch": 2.6722380758255198, + "grad_norm": 0.9286202192306519, + "learning_rate": 0.0006661061285500748, + "loss": 3.6443, + "step": 39330 + }, + { + "epoch": 2.6725777958961814, + "grad_norm": 0.8538934588432312, + "learning_rate": 0.0006660636635412421, + "loss": 3.5277, + "step": 39335 + }, + { + "epoch": 2.6729175159668435, + "grad_norm": 0.9473258852958679, + "learning_rate": 0.0006660211985324093, + "loss": 3.7122, + "step": 39340 + }, + { + "epoch": 2.673257236037505, + "grad_norm": 1.3960251808166504, + "learning_rate": 0.0006659787335235766, + "loss": 3.4882, + "step": 39345 + }, + { + "epoch": 2.6735969561081667, + "grad_norm": 0.9405949711799622, + "learning_rate": 0.0006659362685147439, + "loss": 3.5185, + "step": 39350 + }, + { + "epoch": 2.673936676178829, + "grad_norm": 0.7767902612686157, + "learning_rate": 0.0006658938035059111, + "loss": 3.6647, + "step": 39355 + }, + { + "epoch": 2.6742763962494904, + "grad_norm": 0.6814755797386169, + "learning_rate": 0.0006658513384970784, + "loss": 3.3411, + "step": 39360 + }, + { + "epoch": 2.674616116320152, + "grad_norm": 0.8060585856437683, + "learning_rate": 0.0006658088734882458, + "loss": 3.4482, + "step": 39365 + }, + { + "epoch": 2.674955836390814, + "grad_norm": 0.8481523990631104, + "learning_rate": 0.000665766408479413, + "loss": 3.5906, + "step": 39370 + }, + { + "epoch": 2.6752955564614758, + "grad_norm": 0.7999590635299683, + "learning_rate": 0.0006657239434705803, + "loss": 3.68, + "step": 39375 + }, + { + "epoch": 2.6756352765321374, + "grad_norm": 0.8075509667396545, + "learning_rate": 0.0006656814784617475, + "loss": 3.4687, + "step": 39380 + }, + { + "epoch": 2.6759749966027995, + "grad_norm": 1.5194668769836426, + "learning_rate": 0.0006656390134529148, + "loss": 3.4034, + "step": 39385 + }, + { + "epoch": 2.676314716673461, + "grad_norm": 0.802416980266571, + "learning_rate": 0.0006655965484440821, + "loss": 3.6983, + "step": 39390 + }, + { + "epoch": 2.6766544367441227, + "grad_norm": 0.882663369178772, + "learning_rate": 0.0006655540834352493, + "loss": 3.7233, + "step": 39395 + }, + { + "epoch": 2.676994156814785, + "grad_norm": 0.8091782927513123, + "learning_rate": 0.0006655116184264167, + "loss": 3.6958, + "step": 39400 + }, + { + "epoch": 2.6773338768854464, + "grad_norm": 0.8368358612060547, + "learning_rate": 0.000665469153417584, + "loss": 3.4709, + "step": 39405 + }, + { + "epoch": 2.677673596956108, + "grad_norm": 0.9471108317375183, + "learning_rate": 0.0006654266884087512, + "loss": 3.2893, + "step": 39410 + }, + { + "epoch": 2.67801331702677, + "grad_norm": 0.6960483193397522, + "learning_rate": 0.0006653842233999184, + "loss": 3.6625, + "step": 39415 + }, + { + "epoch": 2.6783530370974318, + "grad_norm": 0.9823077917098999, + "learning_rate": 0.0006653417583910858, + "loss": 3.6717, + "step": 39420 + }, + { + "epoch": 2.6786927571680934, + "grad_norm": 0.8924122452735901, + "learning_rate": 0.000665299293382253, + "loss": 3.6359, + "step": 39425 + }, + { + "epoch": 2.6790324772387555, + "grad_norm": 0.933933675289154, + "learning_rate": 0.0006652568283734202, + "loss": 3.5703, + "step": 39430 + }, + { + "epoch": 2.679372197309417, + "grad_norm": 0.8145331144332886, + "learning_rate": 0.0006652143633645877, + "loss": 3.5523, + "step": 39435 + }, + { + "epoch": 2.6797119173800787, + "grad_norm": 0.7280272841453552, + "learning_rate": 0.0006651718983557549, + "loss": 3.4998, + "step": 39440 + }, + { + "epoch": 2.680051637450741, + "grad_norm": 0.8432096242904663, + "learning_rate": 0.0006651294333469221, + "loss": 3.6952, + "step": 39445 + }, + { + "epoch": 2.6803913575214025, + "grad_norm": 0.7710118889808655, + "learning_rate": 0.0006650869683380895, + "loss": 3.7202, + "step": 39450 + }, + { + "epoch": 2.680731077592064, + "grad_norm": 0.8582994937896729, + "learning_rate": 0.0006650445033292567, + "loss": 3.5356, + "step": 39455 + }, + { + "epoch": 2.681070797662726, + "grad_norm": 0.7433212995529175, + "learning_rate": 0.0006650020383204239, + "loss": 3.5709, + "step": 39460 + }, + { + "epoch": 2.681410517733388, + "grad_norm": 0.8371800184249878, + "learning_rate": 0.0006649595733115912, + "loss": 3.4104, + "step": 39465 + }, + { + "epoch": 2.6817502378040494, + "grad_norm": 0.721195638179779, + "learning_rate": 0.0006649171083027586, + "loss": 3.6557, + "step": 39470 + }, + { + "epoch": 2.6820899578747115, + "grad_norm": 0.7349361181259155, + "learning_rate": 0.0006648746432939258, + "loss": 3.4963, + "step": 39475 + }, + { + "epoch": 2.682429677945373, + "grad_norm": 0.7775934338569641, + "learning_rate": 0.0006648321782850931, + "loss": 3.4176, + "step": 39480 + }, + { + "epoch": 2.6827693980160348, + "grad_norm": 1.6909157037734985, + "learning_rate": 0.0006647897132762604, + "loss": 3.6471, + "step": 39485 + }, + { + "epoch": 2.683109118086697, + "grad_norm": 0.8942458033561707, + "learning_rate": 0.0006647472482674276, + "loss": 3.6941, + "step": 39490 + }, + { + "epoch": 2.6834488381573585, + "grad_norm": 1.144822120666504, + "learning_rate": 0.0006647047832585949, + "loss": 3.4453, + "step": 39495 + }, + { + "epoch": 2.68378855822802, + "grad_norm": 0.7950239777565002, + "learning_rate": 0.0006646623182497622, + "loss": 3.4238, + "step": 39500 + }, + { + "epoch": 2.6841282782986817, + "grad_norm": 1.0286245346069336, + "learning_rate": 0.0006646198532409295, + "loss": 3.4374, + "step": 39505 + }, + { + "epoch": 2.684467998369344, + "grad_norm": 0.7226224541664124, + "learning_rate": 0.0006645773882320968, + "loss": 3.6306, + "step": 39510 + }, + { + "epoch": 2.6848077184400054, + "grad_norm": 0.9865034222602844, + "learning_rate": 0.000664534923223264, + "loss": 3.412, + "step": 39515 + }, + { + "epoch": 2.685147438510667, + "grad_norm": 1.1473029851913452, + "learning_rate": 0.0006644924582144313, + "loss": 3.4302, + "step": 39520 + }, + { + "epoch": 2.685487158581329, + "grad_norm": 0.9498782157897949, + "learning_rate": 0.0006644499932055986, + "loss": 3.4125, + "step": 39525 + }, + { + "epoch": 2.6858268786519908, + "grad_norm": 1.199507713317871, + "learning_rate": 0.0006644075281967658, + "loss": 3.5614, + "step": 39530 + }, + { + "epoch": 2.6861665987226524, + "grad_norm": 0.808168888092041, + "learning_rate": 0.0006643650631879331, + "loss": 3.716, + "step": 39535 + }, + { + "epoch": 2.6865063187933145, + "grad_norm": 0.9125017523765564, + "learning_rate": 0.0006643225981791005, + "loss": 3.492, + "step": 39540 + }, + { + "epoch": 2.686846038863976, + "grad_norm": 1.2503811120986938, + "learning_rate": 0.0006642801331702677, + "loss": 3.4052, + "step": 39545 + }, + { + "epoch": 2.6871857589346377, + "grad_norm": 1.2221999168395996, + "learning_rate": 0.000664237668161435, + "loss": 3.5229, + "step": 39550 + }, + { + "epoch": 2.6875254790052994, + "grad_norm": 1.1701898574829102, + "learning_rate": 0.0006641952031526023, + "loss": 3.5763, + "step": 39555 + }, + { + "epoch": 2.6878651990759614, + "grad_norm": 0.8629347681999207, + "learning_rate": 0.0006641527381437695, + "loss": 3.5478, + "step": 39560 + }, + { + "epoch": 2.688204919146623, + "grad_norm": 0.7820437550544739, + "learning_rate": 0.0006641102731349367, + "loss": 3.6369, + "step": 39565 + }, + { + "epoch": 2.6885446392172847, + "grad_norm": 1.1309794187545776, + "learning_rate": 0.0006640678081261041, + "loss": 3.5434, + "step": 39570 + }, + { + "epoch": 2.688884359287947, + "grad_norm": 0.7954344153404236, + "learning_rate": 0.0006640253431172714, + "loss": 3.5271, + "step": 39575 + }, + { + "epoch": 2.6892240793586084, + "grad_norm": 0.8802614212036133, + "learning_rate": 0.0006639828781084387, + "loss": 3.4981, + "step": 39580 + }, + { + "epoch": 2.68956379942927, + "grad_norm": 0.9155426025390625, + "learning_rate": 0.000663940413099606, + "loss": 3.8649, + "step": 39585 + }, + { + "epoch": 2.689903519499932, + "grad_norm": 0.8549748659133911, + "learning_rate": 0.0006638979480907732, + "loss": 3.6801, + "step": 39590 + }, + { + "epoch": 2.6902432395705937, + "grad_norm": 0.8701176047325134, + "learning_rate": 0.0006638554830819405, + "loss": 3.6568, + "step": 39595 + }, + { + "epoch": 2.6905829596412554, + "grad_norm": 1.233784794807434, + "learning_rate": 0.0006638130180731078, + "loss": 3.4124, + "step": 39600 + }, + { + "epoch": 2.6909226797119175, + "grad_norm": 0.7892560958862305, + "learning_rate": 0.000663770553064275, + "loss": 3.5336, + "step": 39605 + }, + { + "epoch": 2.691262399782579, + "grad_norm": 1.5058848857879639, + "learning_rate": 0.0006637280880554424, + "loss": 3.6127, + "step": 39610 + }, + { + "epoch": 2.6916021198532407, + "grad_norm": 0.9076579809188843, + "learning_rate": 0.0006636856230466097, + "loss": 3.7528, + "step": 39615 + }, + { + "epoch": 2.691941839923903, + "grad_norm": 0.7590691447257996, + "learning_rate": 0.0006636431580377769, + "loss": 3.6038, + "step": 39620 + }, + { + "epoch": 2.6922815599945644, + "grad_norm": 0.9959279894828796, + "learning_rate": 0.0006636006930289442, + "loss": 3.621, + "step": 39625 + }, + { + "epoch": 2.692621280065226, + "grad_norm": 0.8961482048034668, + "learning_rate": 0.0006635582280201114, + "loss": 3.5108, + "step": 39630 + }, + { + "epoch": 2.692961000135888, + "grad_norm": 0.9579511284828186, + "learning_rate": 0.0006635157630112787, + "loss": 3.4824, + "step": 39635 + }, + { + "epoch": 2.6933007202065498, + "grad_norm": 0.9312372207641602, + "learning_rate": 0.000663473298002446, + "loss": 3.4697, + "step": 39640 + }, + { + "epoch": 2.6936404402772114, + "grad_norm": 0.874547004699707, + "learning_rate": 0.0006634308329936133, + "loss": 3.2409, + "step": 39645 + }, + { + "epoch": 2.6939801603478735, + "grad_norm": 0.9295708537101746, + "learning_rate": 0.0006633883679847806, + "loss": 3.5832, + "step": 39650 + }, + { + "epoch": 2.694319880418535, + "grad_norm": 0.937762975692749, + "learning_rate": 0.0006633459029759479, + "loss": 3.5541, + "step": 39655 + }, + { + "epoch": 2.6946596004891967, + "grad_norm": 0.9959119558334351, + "learning_rate": 0.0006633034379671151, + "loss": 3.6545, + "step": 39660 + }, + { + "epoch": 2.694999320559859, + "grad_norm": 0.7170135974884033, + "learning_rate": 0.0006632609729582823, + "loss": 3.619, + "step": 39665 + }, + { + "epoch": 2.6953390406305204, + "grad_norm": 0.7780436277389526, + "learning_rate": 0.0006632185079494497, + "loss": 3.4541, + "step": 39670 + }, + { + "epoch": 2.695678760701182, + "grad_norm": 1.1823962926864624, + "learning_rate": 0.0006631760429406169, + "loss": 3.3415, + "step": 39675 + }, + { + "epoch": 2.696018480771844, + "grad_norm": 0.8317288756370544, + "learning_rate": 0.0006631335779317842, + "loss": 3.4456, + "step": 39680 + }, + { + "epoch": 2.6963582008425058, + "grad_norm": 0.966630220413208, + "learning_rate": 0.0006630911129229516, + "loss": 3.5186, + "step": 39685 + }, + { + "epoch": 2.6966979209131674, + "grad_norm": 0.9663965702056885, + "learning_rate": 0.0006630486479141188, + "loss": 3.4487, + "step": 39690 + }, + { + "epoch": 2.6970376409838295, + "grad_norm": 0.8511801362037659, + "learning_rate": 0.000663006182905286, + "loss": 3.5072, + "step": 39695 + }, + { + "epoch": 2.697377361054491, + "grad_norm": 0.7907245755195618, + "learning_rate": 0.0006629637178964534, + "loss": 3.6374, + "step": 39700 + }, + { + "epoch": 2.6977170811251527, + "grad_norm": 0.7031225562095642, + "learning_rate": 0.0006629212528876206, + "loss": 3.4257, + "step": 39705 + }, + { + "epoch": 2.698056801195815, + "grad_norm": 0.8588396906852722, + "learning_rate": 0.0006628787878787878, + "loss": 3.5542, + "step": 39710 + }, + { + "epoch": 2.6983965212664764, + "grad_norm": 1.119443655014038, + "learning_rate": 0.0006628363228699553, + "loss": 3.4345, + "step": 39715 + }, + { + "epoch": 2.698736241337138, + "grad_norm": 1.0822786092758179, + "learning_rate": 0.0006627938578611225, + "loss": 3.772, + "step": 39720 + }, + { + "epoch": 2.6990759614078, + "grad_norm": 0.711891770362854, + "learning_rate": 0.0006627513928522897, + "loss": 3.5328, + "step": 39725 + }, + { + "epoch": 2.699415681478462, + "grad_norm": 0.6738964319229126, + "learning_rate": 0.000662708927843457, + "loss": 3.3592, + "step": 39730 + }, + { + "epoch": 2.6997554015491234, + "grad_norm": 1.1691665649414062, + "learning_rate": 0.0006626664628346243, + "loss": 3.5539, + "step": 39735 + }, + { + "epoch": 2.7000951216197855, + "grad_norm": 1.0696582794189453, + "learning_rate": 0.0006626239978257915, + "loss": 3.2617, + "step": 39740 + }, + { + "epoch": 2.700434841690447, + "grad_norm": 0.8285570740699768, + "learning_rate": 0.0006625815328169588, + "loss": 3.5913, + "step": 39745 + }, + { + "epoch": 2.7007745617611087, + "grad_norm": 1.011675477027893, + "learning_rate": 0.0006625390678081262, + "loss": 3.652, + "step": 39750 + }, + { + "epoch": 2.701114281831771, + "grad_norm": 0.7124798893928528, + "learning_rate": 0.0006624966027992934, + "loss": 3.5158, + "step": 39755 + }, + { + "epoch": 2.7014540019024325, + "grad_norm": 0.9683337807655334, + "learning_rate": 0.0006624541377904607, + "loss": 3.5826, + "step": 39760 + }, + { + "epoch": 2.701793721973094, + "grad_norm": 0.7508964538574219, + "learning_rate": 0.0006624116727816279, + "loss": 3.6189, + "step": 39765 + }, + { + "epoch": 2.702133442043756, + "grad_norm": 1.071168303489685, + "learning_rate": 0.0006623692077727952, + "loss": 3.4647, + "step": 39770 + }, + { + "epoch": 2.702473162114418, + "grad_norm": 2.669729232788086, + "learning_rate": 0.0006623267427639625, + "loss": 3.3728, + "step": 39775 + }, + { + "epoch": 2.7028128821850794, + "grad_norm": 1.1554898023605347, + "learning_rate": 0.0006622842777551297, + "loss": 3.3436, + "step": 39780 + }, + { + "epoch": 2.7031526022557415, + "grad_norm": 0.9393160939216614, + "learning_rate": 0.0006622418127462971, + "loss": 3.4612, + "step": 39785 + }, + { + "epoch": 2.703492322326403, + "grad_norm": 0.9906208515167236, + "learning_rate": 0.0006621993477374644, + "loss": 3.7391, + "step": 39790 + }, + { + "epoch": 2.7038320423970648, + "grad_norm": 1.141662359237671, + "learning_rate": 0.0006621568827286316, + "loss": 3.5379, + "step": 39795 + }, + { + "epoch": 2.704171762467727, + "grad_norm": 2.0544016361236572, + "learning_rate": 0.0006621144177197989, + "loss": 3.5422, + "step": 39800 + }, + { + "epoch": 2.7045114825383885, + "grad_norm": 1.2226861715316772, + "learning_rate": 0.0006620719527109662, + "loss": 3.4743, + "step": 39805 + }, + { + "epoch": 2.70485120260905, + "grad_norm": 0.8105239868164062, + "learning_rate": 0.0006620294877021334, + "loss": 3.5518, + "step": 39810 + }, + { + "epoch": 2.705190922679712, + "grad_norm": 0.885386049747467, + "learning_rate": 0.0006619870226933006, + "loss": 3.4618, + "step": 39815 + }, + { + "epoch": 2.705530642750374, + "grad_norm": 1.069288969039917, + "learning_rate": 0.0006619445576844681, + "loss": 3.6653, + "step": 39820 + }, + { + "epoch": 2.7058703628210354, + "grad_norm": 0.9861837029457092, + "learning_rate": 0.0006619020926756353, + "loss": 3.1592, + "step": 39825 + }, + { + "epoch": 2.7062100828916975, + "grad_norm": 0.8968390822410583, + "learning_rate": 0.0006618596276668025, + "loss": 3.416, + "step": 39830 + }, + { + "epoch": 2.706549802962359, + "grad_norm": 0.9717942476272583, + "learning_rate": 0.0006618171626579699, + "loss": 3.5619, + "step": 39835 + }, + { + "epoch": 2.7068895230330208, + "grad_norm": 1.0578216314315796, + "learning_rate": 0.0006617746976491371, + "loss": 3.2889, + "step": 39840 + }, + { + "epoch": 2.7072292431036824, + "grad_norm": 0.7994146943092346, + "learning_rate": 0.0006617322326403043, + "loss": 3.4716, + "step": 39845 + }, + { + "epoch": 2.7075689631743445, + "grad_norm": 1.6171010732650757, + "learning_rate": 0.0006616897676314717, + "loss": 3.6557, + "step": 39850 + }, + { + "epoch": 2.707908683245006, + "grad_norm": 0.8690605759620667, + "learning_rate": 0.000661647302622639, + "loss": 3.465, + "step": 39855 + }, + { + "epoch": 2.7082484033156677, + "grad_norm": 0.925180196762085, + "learning_rate": 0.0006616048376138062, + "loss": 3.529, + "step": 39860 + }, + { + "epoch": 2.70858812338633, + "grad_norm": 0.847785472869873, + "learning_rate": 0.0006615623726049735, + "loss": 3.5902, + "step": 39865 + }, + { + "epoch": 2.7089278434569914, + "grad_norm": 1.0399569272994995, + "learning_rate": 0.0006615199075961408, + "loss": 3.6449, + "step": 39870 + }, + { + "epoch": 2.709267563527653, + "grad_norm": 0.8617279529571533, + "learning_rate": 0.000661477442587308, + "loss": 3.4707, + "step": 39875 + }, + { + "epoch": 2.709607283598315, + "grad_norm": 1.1591763496398926, + "learning_rate": 0.0006614349775784753, + "loss": 3.5725, + "step": 39880 + }, + { + "epoch": 2.709947003668977, + "grad_norm": 0.8340027928352356, + "learning_rate": 0.0006613925125696426, + "loss": 3.6477, + "step": 39885 + }, + { + "epoch": 2.7102867237396384, + "grad_norm": 0.7018161416053772, + "learning_rate": 0.0006613500475608099, + "loss": 3.6889, + "step": 39890 + }, + { + "epoch": 2.7106264438103, + "grad_norm": 0.7798593044281006, + "learning_rate": 0.0006613075825519772, + "loss": 3.5735, + "step": 39895 + }, + { + "epoch": 2.710966163880962, + "grad_norm": 0.8730257153511047, + "learning_rate": 0.0006612651175431445, + "loss": 3.3506, + "step": 39900 + }, + { + "epoch": 2.7113058839516238, + "grad_norm": 1.3043413162231445, + "learning_rate": 0.0006612226525343117, + "loss": 3.6052, + "step": 39905 + }, + { + "epoch": 2.7116456040222854, + "grad_norm": 0.8055699467658997, + "learning_rate": 0.000661180187525479, + "loss": 3.5342, + "step": 39910 + }, + { + "epoch": 2.7119853240929475, + "grad_norm": 0.827287495136261, + "learning_rate": 0.0006611377225166462, + "loss": 3.5687, + "step": 39915 + }, + { + "epoch": 2.712325044163609, + "grad_norm": 0.7134578824043274, + "learning_rate": 0.0006610952575078137, + "loss": 3.6738, + "step": 39920 + }, + { + "epoch": 2.7126647642342707, + "grad_norm": 0.7725797891616821, + "learning_rate": 0.0006610527924989809, + "loss": 3.7256, + "step": 39925 + }, + { + "epoch": 2.713004484304933, + "grad_norm": 1.553903579711914, + "learning_rate": 0.0006610103274901481, + "loss": 3.8051, + "step": 39930 + }, + { + "epoch": 2.7133442043755944, + "grad_norm": 1.2682222127914429, + "learning_rate": 0.0006609678624813155, + "loss": 3.6511, + "step": 39935 + }, + { + "epoch": 2.713683924446256, + "grad_norm": 0.8520024418830872, + "learning_rate": 0.0006609253974724827, + "loss": 3.5719, + "step": 39940 + }, + { + "epoch": 2.714023644516918, + "grad_norm": 0.7681410908699036, + "learning_rate": 0.0006608829324636499, + "loss": 3.3032, + "step": 39945 + }, + { + "epoch": 2.7143633645875798, + "grad_norm": 0.9981362223625183, + "learning_rate": 0.0006608404674548173, + "loss": 3.4335, + "step": 39950 + }, + { + "epoch": 2.7147030846582414, + "grad_norm": 0.9981831312179565, + "learning_rate": 0.0006607980024459846, + "loss": 3.5302, + "step": 39955 + }, + { + "epoch": 2.7150428047289035, + "grad_norm": 0.7598600387573242, + "learning_rate": 0.0006607555374371518, + "loss": 3.7545, + "step": 39960 + }, + { + "epoch": 2.715382524799565, + "grad_norm": 0.800092875957489, + "learning_rate": 0.0006607130724283191, + "loss": 3.586, + "step": 39965 + }, + { + "epoch": 2.7157222448702267, + "grad_norm": 0.7075195908546448, + "learning_rate": 0.0006606706074194864, + "loss": 3.2308, + "step": 39970 + }, + { + "epoch": 2.716061964940889, + "grad_norm": 0.8923032879829407, + "learning_rate": 0.0006606281424106536, + "loss": 3.6624, + "step": 39975 + }, + { + "epoch": 2.7164016850115504, + "grad_norm": 0.9800217151641846, + "learning_rate": 0.0006605856774018209, + "loss": 3.4661, + "step": 39980 + }, + { + "epoch": 2.716741405082212, + "grad_norm": 0.9534821510314941, + "learning_rate": 0.0006605432123929882, + "loss": 3.4402, + "step": 39985 + }, + { + "epoch": 2.717081125152874, + "grad_norm": 0.8058264851570129, + "learning_rate": 0.0006605007473841555, + "loss": 3.5835, + "step": 39990 + }, + { + "epoch": 2.7174208452235358, + "grad_norm": 0.9272941946983337, + "learning_rate": 0.0006604582823753228, + "loss": 3.5944, + "step": 39995 + }, + { + "epoch": 2.7177605652941974, + "grad_norm": 0.893987774848938, + "learning_rate": 0.0006604158173664901, + "loss": 3.6523, + "step": 40000 + }, + { + "epoch": 2.7181002853648595, + "grad_norm": 1.0701932907104492, + "learning_rate": 0.0006603733523576573, + "loss": 3.6078, + "step": 40005 + }, + { + "epoch": 2.718440005435521, + "grad_norm": 0.9074406623840332, + "learning_rate": 0.0006603308873488246, + "loss": 3.4413, + "step": 40010 + }, + { + "epoch": 2.7187797255061827, + "grad_norm": 0.7789276838302612, + "learning_rate": 0.0006602884223399918, + "loss": 3.4454, + "step": 40015 + }, + { + "epoch": 2.719119445576845, + "grad_norm": 1.2617896795272827, + "learning_rate": 0.0006602459573311591, + "loss": 3.6303, + "step": 40020 + }, + { + "epoch": 2.7194591656475064, + "grad_norm": 0.7063398957252502, + "learning_rate": 0.0006602034923223265, + "loss": 3.8211, + "step": 40025 + }, + { + "epoch": 2.719798885718168, + "grad_norm": 1.0814720392227173, + "learning_rate": 0.0006601610273134937, + "loss": 3.6748, + "step": 40030 + }, + { + "epoch": 2.72013860578883, + "grad_norm": 0.7788472175598145, + "learning_rate": 0.000660118562304661, + "loss": 3.3363, + "step": 40035 + }, + { + "epoch": 2.720478325859492, + "grad_norm": 1.035106897354126, + "learning_rate": 0.0006600760972958283, + "loss": 3.6857, + "step": 40040 + }, + { + "epoch": 2.7208180459301534, + "grad_norm": 0.9838740825653076, + "learning_rate": 0.0006600336322869955, + "loss": 3.4369, + "step": 40045 + }, + { + "epoch": 2.7211577660008155, + "grad_norm": 0.9347538948059082, + "learning_rate": 0.0006599911672781627, + "loss": 3.4655, + "step": 40050 + }, + { + "epoch": 2.721497486071477, + "grad_norm": 0.958770751953125, + "learning_rate": 0.0006599487022693301, + "loss": 3.6059, + "step": 40055 + }, + { + "epoch": 2.7218372061421388, + "grad_norm": 1.3770954608917236, + "learning_rate": 0.0006599062372604974, + "loss": 3.3141, + "step": 40060 + }, + { + "epoch": 2.722176926212801, + "grad_norm": 0.9159761071205139, + "learning_rate": 0.0006598637722516646, + "loss": 3.5871, + "step": 40065 + }, + { + "epoch": 2.7225166462834625, + "grad_norm": 0.9210917949676514, + "learning_rate": 0.000659821307242832, + "loss": 3.5797, + "step": 40070 + }, + { + "epoch": 2.722856366354124, + "grad_norm": 0.9567116498947144, + "learning_rate": 0.0006597788422339992, + "loss": 3.6712, + "step": 40075 + }, + { + "epoch": 2.723196086424786, + "grad_norm": 1.1680623292922974, + "learning_rate": 0.0006597363772251664, + "loss": 3.4453, + "step": 40080 + }, + { + "epoch": 2.723535806495448, + "grad_norm": 0.8931622505187988, + "learning_rate": 0.0006596939122163338, + "loss": 3.7534, + "step": 40085 + }, + { + "epoch": 2.7238755265661094, + "grad_norm": 2.7064733505249023, + "learning_rate": 0.000659651447207501, + "loss": 3.6034, + "step": 40090 + }, + { + "epoch": 2.7242152466367715, + "grad_norm": 0.6924133896827698, + "learning_rate": 0.0006596089821986683, + "loss": 3.6363, + "step": 40095 + }, + { + "epoch": 2.724554966707433, + "grad_norm": 1.1854326725006104, + "learning_rate": 0.0006595665171898357, + "loss": 3.4818, + "step": 40100 + }, + { + "epoch": 2.7248946867780948, + "grad_norm": 0.8238921761512756, + "learning_rate": 0.0006595240521810029, + "loss": 3.467, + "step": 40105 + }, + { + "epoch": 2.725234406848757, + "grad_norm": 1.22781503200531, + "learning_rate": 0.0006594815871721701, + "loss": 3.674, + "step": 40110 + }, + { + "epoch": 2.7255741269194185, + "grad_norm": 0.8821894526481628, + "learning_rate": 0.0006594391221633374, + "loss": 3.5797, + "step": 40115 + }, + { + "epoch": 2.72591384699008, + "grad_norm": 0.9147789478302002, + "learning_rate": 0.0006593966571545047, + "loss": 3.3995, + "step": 40120 + }, + { + "epoch": 2.726253567060742, + "grad_norm": 0.8317065238952637, + "learning_rate": 0.0006593541921456719, + "loss": 3.3511, + "step": 40125 + }, + { + "epoch": 2.726593287131404, + "grad_norm": 0.8678780794143677, + "learning_rate": 0.0006593117271368393, + "loss": 3.7357, + "step": 40130 + }, + { + "epoch": 2.7269330072020654, + "grad_norm": 0.9172513484954834, + "learning_rate": 0.0006592692621280066, + "loss": 3.5037, + "step": 40135 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.8701730966567993, + "learning_rate": 0.0006592267971191738, + "loss": 3.3634, + "step": 40140 + }, + { + "epoch": 2.727612447343389, + "grad_norm": 0.9930633902549744, + "learning_rate": 0.0006591843321103411, + "loss": 3.5468, + "step": 40145 + }, + { + "epoch": 2.7279521674140508, + "grad_norm": 0.9840191006660461, + "learning_rate": 0.0006591418671015083, + "loss": 3.5762, + "step": 40150 + }, + { + "epoch": 2.728291887484713, + "grad_norm": 0.6642597317695618, + "learning_rate": 0.0006590994020926756, + "loss": 3.555, + "step": 40155 + }, + { + "epoch": 2.7286316075553745, + "grad_norm": 1.895679235458374, + "learning_rate": 0.0006590569370838429, + "loss": 3.5194, + "step": 40160 + }, + { + "epoch": 2.728971327626036, + "grad_norm": 1.241719126701355, + "learning_rate": 0.0006590144720750102, + "loss": 3.746, + "step": 40165 + }, + { + "epoch": 2.729311047696698, + "grad_norm": 1.9107745885849, + "learning_rate": 0.0006589720070661775, + "loss": 3.6004, + "step": 40170 + }, + { + "epoch": 2.72965076776736, + "grad_norm": 1.3947597742080688, + "learning_rate": 0.0006589295420573448, + "loss": 3.6629, + "step": 40175 + }, + { + "epoch": 2.7299904878380215, + "grad_norm": 1.1174310445785522, + "learning_rate": 0.000658887077048512, + "loss": 3.6939, + "step": 40180 + }, + { + "epoch": 2.730330207908683, + "grad_norm": 0.79265296459198, + "learning_rate": 0.0006588446120396793, + "loss": 3.6323, + "step": 40185 + }, + { + "epoch": 2.730669927979345, + "grad_norm": 0.7314167618751526, + "learning_rate": 0.0006588021470308466, + "loss": 3.7149, + "step": 40190 + }, + { + "epoch": 2.731009648050007, + "grad_norm": 0.8402870893478394, + "learning_rate": 0.0006587596820220138, + "loss": 3.5707, + "step": 40195 + }, + { + "epoch": 2.7313493681206684, + "grad_norm": 0.9154160618782043, + "learning_rate": 0.0006587172170131811, + "loss": 3.782, + "step": 40200 + }, + { + "epoch": 2.7316890881913305, + "grad_norm": 1.8445149660110474, + "learning_rate": 0.0006586747520043485, + "loss": 3.3992, + "step": 40205 + }, + { + "epoch": 2.732028808261992, + "grad_norm": 0.730736255645752, + "learning_rate": 0.0006586322869955157, + "loss": 3.5654, + "step": 40210 + }, + { + "epoch": 2.7323685283326538, + "grad_norm": 1.0220437049865723, + "learning_rate": 0.0006585898219866829, + "loss": 3.6285, + "step": 40215 + }, + { + "epoch": 2.732708248403316, + "grad_norm": 0.8478537797927856, + "learning_rate": 0.0006585473569778503, + "loss": 3.5828, + "step": 40220 + }, + { + "epoch": 2.7330479684739775, + "grad_norm": 0.9750195145606995, + "learning_rate": 0.0006585048919690175, + "loss": 3.5138, + "step": 40225 + }, + { + "epoch": 2.733387688544639, + "grad_norm": 0.982505202293396, + "learning_rate": 0.0006584624269601847, + "loss": 3.4142, + "step": 40230 + }, + { + "epoch": 2.7337274086153007, + "grad_norm": 1.0747997760772705, + "learning_rate": 0.0006584199619513522, + "loss": 3.5743, + "step": 40235 + }, + { + "epoch": 2.734067128685963, + "grad_norm": 0.9790638089179993, + "learning_rate": 0.0006583774969425194, + "loss": 3.62, + "step": 40240 + }, + { + "epoch": 2.7344068487566244, + "grad_norm": 0.9729027152061462, + "learning_rate": 0.0006583350319336866, + "loss": 3.4491, + "step": 40245 + }, + { + "epoch": 2.734746568827286, + "grad_norm": 1.7107230424880981, + "learning_rate": 0.000658292566924854, + "loss": 3.6703, + "step": 40250 + }, + { + "epoch": 2.735086288897948, + "grad_norm": 0.9736427664756775, + "learning_rate": 0.0006582501019160212, + "loss": 3.7022, + "step": 40255 + }, + { + "epoch": 2.7354260089686098, + "grad_norm": 0.6736463904380798, + "learning_rate": 0.0006582076369071885, + "loss": 3.5801, + "step": 40260 + }, + { + "epoch": 2.7357657290392714, + "grad_norm": 0.9220758676528931, + "learning_rate": 0.0006581651718983557, + "loss": 3.4937, + "step": 40265 + }, + { + "epoch": 2.7361054491099335, + "grad_norm": 0.8130114674568176, + "learning_rate": 0.0006581227068895231, + "loss": 3.5856, + "step": 40270 + }, + { + "epoch": 2.736445169180595, + "grad_norm": 0.8906183242797852, + "learning_rate": 0.0006580802418806904, + "loss": 3.2832, + "step": 40275 + }, + { + "epoch": 2.7367848892512567, + "grad_norm": 1.1336212158203125, + "learning_rate": 0.0006580377768718576, + "loss": 3.7354, + "step": 40280 + }, + { + "epoch": 2.737124609321919, + "grad_norm": 0.8855255246162415, + "learning_rate": 0.0006579953118630249, + "loss": 3.5117, + "step": 40285 + }, + { + "epoch": 2.7374643293925804, + "grad_norm": 0.8695024847984314, + "learning_rate": 0.0006579528468541922, + "loss": 3.6966, + "step": 40290 + }, + { + "epoch": 2.737804049463242, + "grad_norm": 0.7930164933204651, + "learning_rate": 0.0006579103818453594, + "loss": 3.4158, + "step": 40295 + }, + { + "epoch": 2.738143769533904, + "grad_norm": 1.052827000617981, + "learning_rate": 0.0006578679168365266, + "loss": 3.4642, + "step": 40300 + }, + { + "epoch": 2.738483489604566, + "grad_norm": 0.9936763644218445, + "learning_rate": 0.0006578254518276941, + "loss": 3.4275, + "step": 40305 + }, + { + "epoch": 2.7388232096752274, + "grad_norm": 0.8013944029808044, + "learning_rate": 0.0006577829868188613, + "loss": 3.7212, + "step": 40310 + }, + { + "epoch": 2.7391629297458895, + "grad_norm": 1.9864246845245361, + "learning_rate": 0.0006577405218100285, + "loss": 3.5606, + "step": 40315 + }, + { + "epoch": 2.739502649816551, + "grad_norm": 0.930047333240509, + "learning_rate": 0.0006576980568011959, + "loss": 3.28, + "step": 40320 + }, + { + "epoch": 2.7398423698872127, + "grad_norm": 0.9092527031898499, + "learning_rate": 0.0006576555917923631, + "loss": 3.6271, + "step": 40325 + }, + { + "epoch": 2.740182089957875, + "grad_norm": 1.0159852504730225, + "learning_rate": 0.0006576131267835303, + "loss": 3.677, + "step": 40330 + }, + { + "epoch": 2.7405218100285365, + "grad_norm": 1.1618571281433105, + "learning_rate": 0.0006575706617746977, + "loss": 3.5855, + "step": 40335 + }, + { + "epoch": 2.740861530099198, + "grad_norm": 0.8457804918289185, + "learning_rate": 0.000657528196765865, + "loss": 3.478, + "step": 40340 + }, + { + "epoch": 2.74120125016986, + "grad_norm": 1.04076087474823, + "learning_rate": 0.0006574857317570322, + "loss": 3.617, + "step": 40345 + }, + { + "epoch": 2.741540970240522, + "grad_norm": 1.1168832778930664, + "learning_rate": 0.0006574432667481996, + "loss": 3.0713, + "step": 40350 + }, + { + "epoch": 2.7418806903111834, + "grad_norm": 0.8668249249458313, + "learning_rate": 0.0006574008017393668, + "loss": 3.3217, + "step": 40355 + }, + { + "epoch": 2.7422204103818455, + "grad_norm": 0.9900619983673096, + "learning_rate": 0.000657358336730534, + "loss": 3.6275, + "step": 40360 + }, + { + "epoch": 2.742560130452507, + "grad_norm": 1.0593093633651733, + "learning_rate": 0.0006573158717217013, + "loss": 3.6366, + "step": 40365 + }, + { + "epoch": 2.7428998505231688, + "grad_norm": 0.9361907839775085, + "learning_rate": 0.0006572734067128686, + "loss": 3.7142, + "step": 40370 + }, + { + "epoch": 2.743239570593831, + "grad_norm": 1.1126965284347534, + "learning_rate": 0.0006572309417040359, + "loss": 3.5961, + "step": 40375 + }, + { + "epoch": 2.7435792906644925, + "grad_norm": 0.8739984035491943, + "learning_rate": 0.0006571884766952032, + "loss": 3.7047, + "step": 40380 + }, + { + "epoch": 2.743919010735154, + "grad_norm": 0.9822377562522888, + "learning_rate": 0.0006571460116863705, + "loss": 3.5331, + "step": 40385 + }, + { + "epoch": 2.744258730805816, + "grad_norm": 0.9588500261306763, + "learning_rate": 0.0006571035466775377, + "loss": 3.7774, + "step": 40390 + }, + { + "epoch": 2.744598450876478, + "grad_norm": 0.8315181732177734, + "learning_rate": 0.000657061081668705, + "loss": 3.5208, + "step": 40395 + }, + { + "epoch": 2.7449381709471394, + "grad_norm": 0.921482264995575, + "learning_rate": 0.0006570186166598722, + "loss": 3.2828, + "step": 40400 + }, + { + "epoch": 2.7452778910178015, + "grad_norm": 0.8548282384872437, + "learning_rate": 0.0006569761516510395, + "loss": 3.6001, + "step": 40405 + }, + { + "epoch": 2.745617611088463, + "grad_norm": 0.7767582535743713, + "learning_rate": 0.0006569336866422069, + "loss": 3.5115, + "step": 40410 + }, + { + "epoch": 2.7459573311591248, + "grad_norm": 0.7506192922592163, + "learning_rate": 0.0006568912216333741, + "loss": 3.528, + "step": 40415 + }, + { + "epoch": 2.746297051229787, + "grad_norm": 0.7501563429832458, + "learning_rate": 0.0006568487566245414, + "loss": 3.6455, + "step": 40420 + }, + { + "epoch": 2.7466367713004485, + "grad_norm": 0.7917484641075134, + "learning_rate": 0.0006568062916157087, + "loss": 3.6333, + "step": 40425 + }, + { + "epoch": 2.74697649137111, + "grad_norm": 0.7530392408370972, + "learning_rate": 0.0006567638266068759, + "loss": 3.3113, + "step": 40430 + }, + { + "epoch": 2.747316211441772, + "grad_norm": 0.8500756621360779, + "learning_rate": 0.0006567213615980432, + "loss": 3.5874, + "step": 40435 + }, + { + "epoch": 2.747655931512434, + "grad_norm": 0.91789710521698, + "learning_rate": 0.0006566788965892106, + "loss": 3.7065, + "step": 40440 + }, + { + "epoch": 2.7479956515830954, + "grad_norm": 0.8588805198669434, + "learning_rate": 0.0006566364315803778, + "loss": 3.6047, + "step": 40445 + }, + { + "epoch": 2.7483353716537575, + "grad_norm": 1.005081057548523, + "learning_rate": 0.000656593966571545, + "loss": 3.659, + "step": 40450 + }, + { + "epoch": 2.748675091724419, + "grad_norm": 0.7499361038208008, + "learning_rate": 0.0006565515015627124, + "loss": 3.588, + "step": 40455 + }, + { + "epoch": 2.749014811795081, + "grad_norm": 0.9843433499336243, + "learning_rate": 0.0006565090365538796, + "loss": 3.7475, + "step": 40460 + }, + { + "epoch": 2.749354531865743, + "grad_norm": 1.0509228706359863, + "learning_rate": 0.0006564665715450468, + "loss": 3.4175, + "step": 40465 + }, + { + "epoch": 2.7496942519364045, + "grad_norm": 1.0328434705734253, + "learning_rate": 0.0006564241065362142, + "loss": 3.2164, + "step": 40470 + }, + { + "epoch": 2.750033972007066, + "grad_norm": 0.850226104259491, + "learning_rate": 0.0006563816415273815, + "loss": 3.6567, + "step": 40475 + }, + { + "epoch": 2.750373692077728, + "grad_norm": 0.9954721927642822, + "learning_rate": 0.0006563391765185487, + "loss": 3.5989, + "step": 40480 + }, + { + "epoch": 2.75071341214839, + "grad_norm": 0.7283939719200134, + "learning_rate": 0.0006562967115097161, + "loss": 3.6076, + "step": 40485 + }, + { + "epoch": 2.7510531322190515, + "grad_norm": 0.7405085563659668, + "learning_rate": 0.0006562542465008833, + "loss": 3.4943, + "step": 40490 + }, + { + "epoch": 2.7513928522897135, + "grad_norm": 0.8201584815979004, + "learning_rate": 0.0006562117814920505, + "loss": 3.568, + "step": 40495 + }, + { + "epoch": 2.751732572360375, + "grad_norm": 0.9628103375434875, + "learning_rate": 0.0006561693164832178, + "loss": 3.5959, + "step": 40500 + }, + { + "epoch": 2.752072292431037, + "grad_norm": 0.8367608189582825, + "learning_rate": 0.0006561268514743851, + "loss": 3.4214, + "step": 40505 + }, + { + "epoch": 2.752412012501699, + "grad_norm": 0.8169490098953247, + "learning_rate": 0.0006560843864655524, + "loss": 3.5647, + "step": 40510 + }, + { + "epoch": 2.7527517325723605, + "grad_norm": 0.9625504612922668, + "learning_rate": 0.0006560419214567197, + "loss": 3.5688, + "step": 40515 + }, + { + "epoch": 2.753091452643022, + "grad_norm": 0.9025369882583618, + "learning_rate": 0.000655999456447887, + "loss": 3.415, + "step": 40520 + }, + { + "epoch": 2.7534311727136838, + "grad_norm": 1.035727858543396, + "learning_rate": 0.0006559569914390542, + "loss": 3.5635, + "step": 40525 + }, + { + "epoch": 2.753770892784346, + "grad_norm": 0.8665016889572144, + "learning_rate": 0.0006559145264302215, + "loss": 3.606, + "step": 40530 + }, + { + "epoch": 2.7541106128550075, + "grad_norm": 0.5956440567970276, + "learning_rate": 0.0006558720614213888, + "loss": 3.7282, + "step": 40535 + }, + { + "epoch": 2.754450332925669, + "grad_norm": 1.2028807401657104, + "learning_rate": 0.000655829596412556, + "loss": 3.5316, + "step": 40540 + }, + { + "epoch": 2.754790052996331, + "grad_norm": 0.9108421802520752, + "learning_rate": 0.0006557871314037234, + "loss": 3.5127, + "step": 40545 + }, + { + "epoch": 2.755129773066993, + "grad_norm": 0.8710141181945801, + "learning_rate": 0.0006557446663948906, + "loss": 3.7075, + "step": 40550 + }, + { + "epoch": 2.7554694931376544, + "grad_norm": 0.996338963508606, + "learning_rate": 0.0006557022013860579, + "loss": 3.6221, + "step": 40555 + }, + { + "epoch": 2.7558092132083165, + "grad_norm": 0.8262811899185181, + "learning_rate": 0.0006556597363772252, + "loss": 3.4846, + "step": 40560 + }, + { + "epoch": 2.756148933278978, + "grad_norm": 0.7233818769454956, + "learning_rate": 0.0006556172713683924, + "loss": 3.5049, + "step": 40565 + }, + { + "epoch": 2.7564886533496398, + "grad_norm": 0.7093307375907898, + "learning_rate": 0.0006555748063595597, + "loss": 3.2908, + "step": 40570 + }, + { + "epoch": 2.7568283734203014, + "grad_norm": 0.9796097278594971, + "learning_rate": 0.000655532341350727, + "loss": 3.5001, + "step": 40575 + }, + { + "epoch": 2.7571680934909635, + "grad_norm": 1.124726414680481, + "learning_rate": 0.0006554898763418943, + "loss": 3.5014, + "step": 40580 + }, + { + "epoch": 2.757507813561625, + "grad_norm": 0.9977583885192871, + "learning_rate": 0.0006554474113330616, + "loss": 3.5072, + "step": 40585 + }, + { + "epoch": 2.7578475336322867, + "grad_norm": 0.9261253476142883, + "learning_rate": 0.0006554049463242289, + "loss": 3.5225, + "step": 40590 + }, + { + "epoch": 2.758187253702949, + "grad_norm": 0.9929415583610535, + "learning_rate": 0.0006553624813153961, + "loss": 3.5676, + "step": 40595 + }, + { + "epoch": 2.7585269737736104, + "grad_norm": 0.8237209916114807, + "learning_rate": 0.0006553200163065634, + "loss": 3.6553, + "step": 40600 + }, + { + "epoch": 2.758866693844272, + "grad_norm": 0.6307286024093628, + "learning_rate": 0.0006552775512977307, + "loss": 3.5945, + "step": 40605 + }, + { + "epoch": 2.759206413914934, + "grad_norm": 0.9658561944961548, + "learning_rate": 0.0006552350862888979, + "loss": 3.583, + "step": 40610 + }, + { + "epoch": 2.759546133985596, + "grad_norm": 0.8142009973526001, + "learning_rate": 0.0006551926212800653, + "loss": 3.14, + "step": 40615 + }, + { + "epoch": 2.7598858540562574, + "grad_norm": 0.9704918265342712, + "learning_rate": 0.0006551501562712326, + "loss": 3.6742, + "step": 40620 + }, + { + "epoch": 2.7602255741269195, + "grad_norm": 0.7444309592247009, + "learning_rate": 0.0006551076912623998, + "loss": 3.4412, + "step": 40625 + }, + { + "epoch": 2.760565294197581, + "grad_norm": 0.9919167160987854, + "learning_rate": 0.0006550652262535671, + "loss": 3.3759, + "step": 40630 + }, + { + "epoch": 2.7609050142682428, + "grad_norm": 0.8450483083724976, + "learning_rate": 0.0006550227612447344, + "loss": 3.5137, + "step": 40635 + }, + { + "epoch": 2.761244734338905, + "grad_norm": 0.69350266456604, + "learning_rate": 0.0006549802962359016, + "loss": 3.1416, + "step": 40640 + }, + { + "epoch": 2.7615844544095665, + "grad_norm": 0.9832869172096252, + "learning_rate": 0.0006549378312270689, + "loss": 3.7143, + "step": 40645 + }, + { + "epoch": 2.761924174480228, + "grad_norm": 1.2729607820510864, + "learning_rate": 0.0006548953662182362, + "loss": 3.502, + "step": 40650 + }, + { + "epoch": 2.76226389455089, + "grad_norm": 0.716499388217926, + "learning_rate": 0.0006548529012094035, + "loss": 3.5438, + "step": 40655 + }, + { + "epoch": 2.762603614621552, + "grad_norm": 0.9809114336967468, + "learning_rate": 0.0006548104362005708, + "loss": 3.5803, + "step": 40660 + }, + { + "epoch": 2.7629433346922134, + "grad_norm": 1.6026331186294556, + "learning_rate": 0.000654767971191738, + "loss": 3.5464, + "step": 40665 + }, + { + "epoch": 2.7632830547628755, + "grad_norm": 0.8248729109764099, + "learning_rate": 0.0006547255061829053, + "loss": 3.5197, + "step": 40670 + }, + { + "epoch": 2.763622774833537, + "grad_norm": 0.8715229034423828, + "learning_rate": 0.0006546830411740726, + "loss": 3.3834, + "step": 40675 + }, + { + "epoch": 2.7639624949041988, + "grad_norm": 0.7012650966644287, + "learning_rate": 0.0006546405761652398, + "loss": 3.4043, + "step": 40680 + }, + { + "epoch": 2.764302214974861, + "grad_norm": 0.9713187217712402, + "learning_rate": 0.0006545981111564072, + "loss": 3.6747, + "step": 40685 + }, + { + "epoch": 2.7646419350455225, + "grad_norm": 4.783820152282715, + "learning_rate": 0.0006545556461475745, + "loss": 3.6701, + "step": 40690 + }, + { + "epoch": 2.764981655116184, + "grad_norm": 0.8832117915153503, + "learning_rate": 0.0006545131811387417, + "loss": 3.5361, + "step": 40695 + }, + { + "epoch": 2.765321375186846, + "grad_norm": 0.9706795811653137, + "learning_rate": 0.0006544707161299089, + "loss": 3.4209, + "step": 40700 + }, + { + "epoch": 2.765661095257508, + "grad_norm": 0.8082313537597656, + "learning_rate": 0.0006544282511210763, + "loss": 3.6015, + "step": 40705 + }, + { + "epoch": 2.7660008153281694, + "grad_norm": 0.941994845867157, + "learning_rate": 0.0006543857861122435, + "loss": 3.5336, + "step": 40710 + }, + { + "epoch": 2.7663405353988315, + "grad_norm": 0.7735217809677124, + "learning_rate": 0.0006543433211034107, + "loss": 3.5501, + "step": 40715 + }, + { + "epoch": 2.766680255469493, + "grad_norm": 1.0877485275268555, + "learning_rate": 0.0006543008560945782, + "loss": 3.5141, + "step": 40720 + }, + { + "epoch": 2.7670199755401548, + "grad_norm": 0.931935727596283, + "learning_rate": 0.0006542583910857454, + "loss": 3.2986, + "step": 40725 + }, + { + "epoch": 2.767359695610817, + "grad_norm": 1.2161357402801514, + "learning_rate": 0.0006542159260769126, + "loss": 3.4835, + "step": 40730 + }, + { + "epoch": 2.7676994156814785, + "grad_norm": 0.6963405013084412, + "learning_rate": 0.00065417346106808, + "loss": 3.5727, + "step": 40735 + }, + { + "epoch": 2.76803913575214, + "grad_norm": 0.7851743698120117, + "learning_rate": 0.0006541309960592472, + "loss": 3.5716, + "step": 40740 + }, + { + "epoch": 2.768378855822802, + "grad_norm": 0.6839212775230408, + "learning_rate": 0.0006540885310504144, + "loss": 3.6313, + "step": 40745 + }, + { + "epoch": 2.768718575893464, + "grad_norm": 0.9335740804672241, + "learning_rate": 0.0006540460660415817, + "loss": 3.6669, + "step": 40750 + }, + { + "epoch": 2.7690582959641254, + "grad_norm": 1.0695253610610962, + "learning_rate": 0.0006540036010327491, + "loss": 3.4541, + "step": 40755 + }, + { + "epoch": 2.7693980160347875, + "grad_norm": 0.9377077221870422, + "learning_rate": 0.0006539611360239163, + "loss": 3.5718, + "step": 40760 + }, + { + "epoch": 2.769737736105449, + "grad_norm": 0.9251968860626221, + "learning_rate": 0.0006539186710150836, + "loss": 3.5713, + "step": 40765 + }, + { + "epoch": 2.770077456176111, + "grad_norm": 0.7796881794929504, + "learning_rate": 0.0006538762060062509, + "loss": 3.6812, + "step": 40770 + }, + { + "epoch": 2.770417176246773, + "grad_norm": 0.86773282289505, + "learning_rate": 0.0006538337409974181, + "loss": 3.719, + "step": 40775 + }, + { + "epoch": 2.7707568963174345, + "grad_norm": 1.01491117477417, + "learning_rate": 0.0006537912759885854, + "loss": 3.5457, + "step": 40780 + }, + { + "epoch": 2.771096616388096, + "grad_norm": 0.9599595069885254, + "learning_rate": 0.0006537488109797526, + "loss": 3.5113, + "step": 40785 + }, + { + "epoch": 2.771436336458758, + "grad_norm": 0.7483896017074585, + "learning_rate": 0.00065370634597092, + "loss": 3.717, + "step": 40790 + }, + { + "epoch": 2.77177605652942, + "grad_norm": 0.8696107268333435, + "learning_rate": 0.0006536638809620873, + "loss": 3.7462, + "step": 40795 + }, + { + "epoch": 2.7721157766000815, + "grad_norm": 0.9427211284637451, + "learning_rate": 0.0006536214159532545, + "loss": 3.4336, + "step": 40800 + }, + { + "epoch": 2.7724554966707435, + "grad_norm": 0.9194132089614868, + "learning_rate": 0.0006535789509444218, + "loss": 3.4293, + "step": 40805 + }, + { + "epoch": 2.772795216741405, + "grad_norm": 0.7879698276519775, + "learning_rate": 0.0006535364859355891, + "loss": 3.3692, + "step": 40810 + }, + { + "epoch": 2.773134936812067, + "grad_norm": 0.8898518085479736, + "learning_rate": 0.0006534940209267563, + "loss": 3.6485, + "step": 40815 + }, + { + "epoch": 2.773474656882729, + "grad_norm": 0.7084068059921265, + "learning_rate": 0.0006534515559179236, + "loss": 3.6741, + "step": 40820 + }, + { + "epoch": 2.7738143769533905, + "grad_norm": 0.8880254626274109, + "learning_rate": 0.000653409090909091, + "loss": 3.3927, + "step": 40825 + }, + { + "epoch": 2.774154097024052, + "grad_norm": 2.4427032470703125, + "learning_rate": 0.0006533666259002582, + "loss": 3.4098, + "step": 40830 + }, + { + "epoch": 2.774493817094714, + "grad_norm": 0.9176467657089233, + "learning_rate": 0.0006533241608914254, + "loss": 3.6106, + "step": 40835 + }, + { + "epoch": 2.774833537165376, + "grad_norm": 0.9375967383384705, + "learning_rate": 0.0006532816958825928, + "loss": 3.7314, + "step": 40840 + }, + { + "epoch": 2.7751732572360375, + "grad_norm": 2.375473976135254, + "learning_rate": 0.00065323923087376, + "loss": 3.7609, + "step": 40845 + }, + { + "epoch": 2.7755129773066995, + "grad_norm": 1.0898433923721313, + "learning_rate": 0.0006531967658649272, + "loss": 3.3581, + "step": 40850 + }, + { + "epoch": 2.775852697377361, + "grad_norm": 1.1421445608139038, + "learning_rate": 0.0006531543008560946, + "loss": 3.4525, + "step": 40855 + }, + { + "epoch": 2.776192417448023, + "grad_norm": 0.924429178237915, + "learning_rate": 0.0006531118358472619, + "loss": 3.4674, + "step": 40860 + }, + { + "epoch": 2.7765321375186844, + "grad_norm": 0.8847517967224121, + "learning_rate": 0.0006530693708384291, + "loss": 3.7468, + "step": 40865 + }, + { + "epoch": 2.7768718575893465, + "grad_norm": 0.7684040665626526, + "learning_rate": 0.0006530269058295965, + "loss": 3.5228, + "step": 40870 + }, + { + "epoch": 2.777211577660008, + "grad_norm": 1.1485562324523926, + "learning_rate": 0.0006529844408207637, + "loss": 3.6147, + "step": 40875 + }, + { + "epoch": 2.7775512977306698, + "grad_norm": 0.8975318074226379, + "learning_rate": 0.0006529419758119309, + "loss": 3.6065, + "step": 40880 + }, + { + "epoch": 2.777891017801332, + "grad_norm": 1.0868172645568848, + "learning_rate": 0.0006528995108030982, + "loss": 3.335, + "step": 40885 + }, + { + "epoch": 2.7782307378719935, + "grad_norm": 1.2385257482528687, + "learning_rate": 0.0006528570457942655, + "loss": 3.6547, + "step": 40890 + }, + { + "epoch": 2.778570457942655, + "grad_norm": 0.9245891571044922, + "learning_rate": 0.0006528145807854328, + "loss": 3.7573, + "step": 40895 + }, + { + "epoch": 2.778910178013317, + "grad_norm": 0.804599404335022, + "learning_rate": 0.0006527721157766001, + "loss": 3.1576, + "step": 40900 + }, + { + "epoch": 2.779249898083979, + "grad_norm": 0.8503904342651367, + "learning_rate": 0.0006527296507677674, + "loss": 3.6978, + "step": 40905 + }, + { + "epoch": 2.7795896181546405, + "grad_norm": 0.6482880115509033, + "learning_rate": 0.0006526871857589346, + "loss": 3.5258, + "step": 40910 + }, + { + "epoch": 2.779929338225302, + "grad_norm": 0.9473826289176941, + "learning_rate": 0.0006526447207501019, + "loss": 3.6321, + "step": 40915 + }, + { + "epoch": 2.780269058295964, + "grad_norm": 0.9759376645088196, + "learning_rate": 0.0006526022557412692, + "loss": 3.1827, + "step": 40920 + }, + { + "epoch": 2.780608778366626, + "grad_norm": 0.8588887453079224, + "learning_rate": 0.0006525597907324364, + "loss": 3.6638, + "step": 40925 + }, + { + "epoch": 2.7809484984372874, + "grad_norm": 0.818915605545044, + "learning_rate": 0.0006525173257236038, + "loss": 3.6519, + "step": 40930 + }, + { + "epoch": 2.7812882185079495, + "grad_norm": 1.030646562576294, + "learning_rate": 0.000652474860714771, + "loss": 3.6327, + "step": 40935 + }, + { + "epoch": 2.781627938578611, + "grad_norm": 1.2684837579727173, + "learning_rate": 0.0006524323957059384, + "loss": 3.683, + "step": 40940 + }, + { + "epoch": 2.7819676586492728, + "grad_norm": 1.3630154132843018, + "learning_rate": 0.0006523899306971056, + "loss": 3.3219, + "step": 40945 + }, + { + "epoch": 2.782307378719935, + "grad_norm": 1.0641905069351196, + "learning_rate": 0.0006523474656882728, + "loss": 3.706, + "step": 40950 + }, + { + "epoch": 2.7826470987905965, + "grad_norm": 0.7899240255355835, + "learning_rate": 0.0006523050006794402, + "loss": 3.5335, + "step": 40955 + }, + { + "epoch": 2.782986818861258, + "grad_norm": 0.8376741409301758, + "learning_rate": 0.0006522625356706074, + "loss": 3.605, + "step": 40960 + }, + { + "epoch": 2.78332653893192, + "grad_norm": 0.9030218720436096, + "learning_rate": 0.0006522200706617747, + "loss": 3.4895, + "step": 40965 + }, + { + "epoch": 2.783666259002582, + "grad_norm": 0.7271941900253296, + "learning_rate": 0.0006521776056529421, + "loss": 3.4718, + "step": 40970 + }, + { + "epoch": 2.7840059790732434, + "grad_norm": 0.7747678756713867, + "learning_rate": 0.0006521351406441093, + "loss": 3.5914, + "step": 40975 + }, + { + "epoch": 2.7843456991439055, + "grad_norm": 0.8224813342094421, + "learning_rate": 0.0006520926756352765, + "loss": 3.3188, + "step": 40980 + }, + { + "epoch": 2.784685419214567, + "grad_norm": 0.7637853026390076, + "learning_rate": 0.0006520502106264439, + "loss": 3.6493, + "step": 40985 + }, + { + "epoch": 2.7850251392852288, + "grad_norm": 0.950633704662323, + "learning_rate": 0.0006520077456176111, + "loss": 3.4966, + "step": 40990 + }, + { + "epoch": 2.785364859355891, + "grad_norm": 0.9929881691932678, + "learning_rate": 0.0006519652806087783, + "loss": 3.7033, + "step": 40995 + }, + { + "epoch": 2.7857045794265525, + "grad_norm": 1.151172399520874, + "learning_rate": 0.0006519228155999457, + "loss": 3.3572, + "step": 41000 + }, + { + "epoch": 2.786044299497214, + "grad_norm": 0.7218544483184814, + "learning_rate": 0.000651880350591113, + "loss": 3.883, + "step": 41005 + }, + { + "epoch": 2.786384019567876, + "grad_norm": 0.8386082053184509, + "learning_rate": 0.0006518378855822802, + "loss": 3.7463, + "step": 41010 + }, + { + "epoch": 2.786723739638538, + "grad_norm": 0.7495570182800293, + "learning_rate": 0.0006517954205734475, + "loss": 3.7012, + "step": 41015 + }, + { + "epoch": 2.7870634597091994, + "grad_norm": 0.8122143149375916, + "learning_rate": 0.0006517529555646148, + "loss": 3.4481, + "step": 41020 + }, + { + "epoch": 2.7874031797798615, + "grad_norm": 1.7042022943496704, + "learning_rate": 0.000651710490555782, + "loss": 3.7167, + "step": 41025 + }, + { + "epoch": 2.787742899850523, + "grad_norm": 1.2038884162902832, + "learning_rate": 0.0006516680255469494, + "loss": 3.2653, + "step": 41030 + }, + { + "epoch": 2.788082619921185, + "grad_norm": 1.0500378608703613, + "learning_rate": 0.0006516255605381167, + "loss": 3.7708, + "step": 41035 + }, + { + "epoch": 2.788422339991847, + "grad_norm": 0.8300161957740784, + "learning_rate": 0.0006515830955292839, + "loss": 3.6388, + "step": 41040 + }, + { + "epoch": 2.7887620600625085, + "grad_norm": 0.7775846123695374, + "learning_rate": 0.0006515406305204512, + "loss": 3.1381, + "step": 41045 + }, + { + "epoch": 2.78910178013317, + "grad_norm": 0.955024242401123, + "learning_rate": 0.0006514981655116184, + "loss": 3.2685, + "step": 41050 + }, + { + "epoch": 2.789441500203832, + "grad_norm": 0.6768854856491089, + "learning_rate": 0.0006514557005027857, + "loss": 3.7576, + "step": 41055 + }, + { + "epoch": 2.789781220274494, + "grad_norm": 0.7762116193771362, + "learning_rate": 0.000651413235493953, + "loss": 3.7228, + "step": 41060 + }, + { + "epoch": 2.7901209403451555, + "grad_norm": 0.7541042566299438, + "learning_rate": 0.0006513707704851203, + "loss": 3.595, + "step": 41065 + }, + { + "epoch": 2.7904606604158175, + "grad_norm": 0.8273095488548279, + "learning_rate": 0.0006513283054762876, + "loss": 3.5543, + "step": 41070 + }, + { + "epoch": 2.790800380486479, + "grad_norm": 0.9398006200790405, + "learning_rate": 0.0006512858404674549, + "loss": 3.4047, + "step": 41075 + }, + { + "epoch": 2.791140100557141, + "grad_norm": 1.3106671571731567, + "learning_rate": 0.0006512433754586221, + "loss": 3.6312, + "step": 41080 + }, + { + "epoch": 2.791479820627803, + "grad_norm": 21.96941375732422, + "learning_rate": 0.0006512009104497893, + "loss": 3.5613, + "step": 41085 + }, + { + "epoch": 2.7918195406984645, + "grad_norm": 0.8005240559577942, + "learning_rate": 0.0006511584454409567, + "loss": 3.5411, + "step": 41090 + }, + { + "epoch": 2.792159260769126, + "grad_norm": 0.9304055571556091, + "learning_rate": 0.0006511159804321239, + "loss": 3.6494, + "step": 41095 + }, + { + "epoch": 2.792498980839788, + "grad_norm": 1.0446667671203613, + "learning_rate": 0.0006510735154232912, + "loss": 3.3726, + "step": 41100 + }, + { + "epoch": 2.79283870091045, + "grad_norm": 0.8183019161224365, + "learning_rate": 0.0006510310504144586, + "loss": 3.6025, + "step": 41105 + }, + { + "epoch": 2.7931784209811115, + "grad_norm": 0.7432127594947815, + "learning_rate": 0.0006509885854056258, + "loss": 3.5697, + "step": 41110 + }, + { + "epoch": 2.7935181410517735, + "grad_norm": 0.9804801344871521, + "learning_rate": 0.000650946120396793, + "loss": 3.6993, + "step": 41115 + }, + { + "epoch": 2.793857861122435, + "grad_norm": 0.7871012091636658, + "learning_rate": 0.0006509036553879604, + "loss": 3.4797, + "step": 41120 + }, + { + "epoch": 2.794197581193097, + "grad_norm": 0.8307488560676575, + "learning_rate": 0.0006508611903791276, + "loss": 3.6722, + "step": 41125 + }, + { + "epoch": 2.794537301263759, + "grad_norm": 0.956040620803833, + "learning_rate": 0.0006508187253702948, + "loss": 3.1971, + "step": 41130 + }, + { + "epoch": 2.7948770213344205, + "grad_norm": 0.739301323890686, + "learning_rate": 0.0006507762603614623, + "loss": 3.4224, + "step": 41135 + }, + { + "epoch": 2.795216741405082, + "grad_norm": 0.8754002451896667, + "learning_rate": 0.0006507337953526295, + "loss": 3.4274, + "step": 41140 + }, + { + "epoch": 2.795556461475744, + "grad_norm": 1.233412504196167, + "learning_rate": 0.0006506913303437967, + "loss": 3.2737, + "step": 41145 + }, + { + "epoch": 2.795896181546406, + "grad_norm": 1.2049740552902222, + "learning_rate": 0.000650648865334964, + "loss": 3.5244, + "step": 41150 + }, + { + "epoch": 2.7962359016170675, + "grad_norm": 1.0858606100082397, + "learning_rate": 0.0006506064003261313, + "loss": 3.4381, + "step": 41155 + }, + { + "epoch": 2.7965756216877296, + "grad_norm": 0.7721803784370422, + "learning_rate": 0.0006505639353172985, + "loss": 3.3994, + "step": 41160 + }, + { + "epoch": 2.796915341758391, + "grad_norm": 1.2833337783813477, + "learning_rate": 0.0006505214703084658, + "loss": 3.5648, + "step": 41165 + }, + { + "epoch": 2.797255061829053, + "grad_norm": 0.7756378054618835, + "learning_rate": 0.0006504790052996332, + "loss": 3.5579, + "step": 41170 + }, + { + "epoch": 2.797594781899715, + "grad_norm": 0.8071271181106567, + "learning_rate": 0.0006504365402908004, + "loss": 3.508, + "step": 41175 + }, + { + "epoch": 2.7979345019703765, + "grad_norm": 0.8756152987480164, + "learning_rate": 0.0006503940752819677, + "loss": 3.6836, + "step": 41180 + }, + { + "epoch": 2.798274222041038, + "grad_norm": 0.8202893137931824, + "learning_rate": 0.0006503516102731349, + "loss": 3.4573, + "step": 41185 + }, + { + "epoch": 2.7986139421117002, + "grad_norm": 0.7081833481788635, + "learning_rate": 0.0006503091452643022, + "loss": 3.6179, + "step": 41190 + }, + { + "epoch": 2.798953662182362, + "grad_norm": 0.8652169108390808, + "learning_rate": 0.0006502666802554695, + "loss": 3.8139, + "step": 41195 + }, + { + "epoch": 2.7992933822530235, + "grad_norm": 0.9437609314918518, + "learning_rate": 0.0006502242152466367, + "loss": 3.5326, + "step": 41200 + }, + { + "epoch": 2.799633102323685, + "grad_norm": 0.7696588635444641, + "learning_rate": 0.0006501817502378041, + "loss": 3.5635, + "step": 41205 + }, + { + "epoch": 2.799972822394347, + "grad_norm": 0.8584491014480591, + "learning_rate": 0.0006501392852289714, + "loss": 3.648, + "step": 41210 + }, + { + "epoch": 2.800312542465009, + "grad_norm": 0.7968984246253967, + "learning_rate": 0.0006500968202201386, + "loss": 3.3789, + "step": 41215 + }, + { + "epoch": 2.8006522625356705, + "grad_norm": 0.892042338848114, + "learning_rate": 0.0006500543552113059, + "loss": 3.383, + "step": 41220 + }, + { + "epoch": 2.8009919826063325, + "grad_norm": 0.7963066101074219, + "learning_rate": 0.0006500118902024732, + "loss": 3.5155, + "step": 41225 + }, + { + "epoch": 2.801331702676994, + "grad_norm": 0.9425565600395203, + "learning_rate": 0.0006499694251936404, + "loss": 3.5161, + "step": 41230 + }, + { + "epoch": 2.801671422747656, + "grad_norm": 0.7714890241622925, + "learning_rate": 0.0006499269601848076, + "loss": 3.4778, + "step": 41235 + }, + { + "epoch": 2.802011142818318, + "grad_norm": 0.9567568302154541, + "learning_rate": 0.0006498844951759751, + "loss": 3.5493, + "step": 41240 + }, + { + "epoch": 2.8023508628889795, + "grad_norm": 0.9088938236236572, + "learning_rate": 0.0006498420301671423, + "loss": 3.3893, + "step": 41245 + }, + { + "epoch": 2.802690582959641, + "grad_norm": 1.0077992677688599, + "learning_rate": 0.0006497995651583095, + "loss": 3.4408, + "step": 41250 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.8520005345344543, + "learning_rate": 0.0006497571001494769, + "loss": 3.4893, + "step": 41255 + }, + { + "epoch": 2.803370023100965, + "grad_norm": 1.3282532691955566, + "learning_rate": 0.0006497146351406441, + "loss": 4.0218, + "step": 41260 + }, + { + "epoch": 2.8037097431716265, + "grad_norm": 0.7937673330307007, + "learning_rate": 0.0006496721701318113, + "loss": 3.5128, + "step": 41265 + }, + { + "epoch": 2.804049463242288, + "grad_norm": 0.7189898490905762, + "learning_rate": 0.0006496297051229787, + "loss": 3.3873, + "step": 41270 + }, + { + "epoch": 2.80438918331295, + "grad_norm": 0.7903891205787659, + "learning_rate": 0.000649587240114146, + "loss": 3.519, + "step": 41275 + }, + { + "epoch": 2.804728903383612, + "grad_norm": 0.9665729999542236, + "learning_rate": 0.0006495447751053133, + "loss": 3.6236, + "step": 41280 + }, + { + "epoch": 2.8050686234542734, + "grad_norm": 0.8837174773216248, + "learning_rate": 0.0006495023100964805, + "loss": 3.4774, + "step": 41285 + }, + { + "epoch": 2.8054083435249355, + "grad_norm": 1.080338716506958, + "learning_rate": 0.0006494598450876478, + "loss": 3.6804, + "step": 41290 + }, + { + "epoch": 2.805748063595597, + "grad_norm": 1.222110390663147, + "learning_rate": 0.0006494173800788151, + "loss": 3.1722, + "step": 41295 + }, + { + "epoch": 2.8060877836662588, + "grad_norm": 0.8045562505722046, + "learning_rate": 0.0006493749150699823, + "loss": 3.6171, + "step": 41300 + }, + { + "epoch": 2.806427503736921, + "grad_norm": 0.8020747303962708, + "learning_rate": 0.0006493324500611496, + "loss": 3.6594, + "step": 41305 + }, + { + "epoch": 2.8067672238075825, + "grad_norm": 0.834672212600708, + "learning_rate": 0.000649289985052317, + "loss": 3.4248, + "step": 41310 + }, + { + "epoch": 2.807106943878244, + "grad_norm": 0.9552065134048462, + "learning_rate": 0.0006492475200434842, + "loss": 3.4754, + "step": 41315 + }, + { + "epoch": 2.807446663948906, + "grad_norm": 2.0086216926574707, + "learning_rate": 0.0006492050550346515, + "loss": 3.4369, + "step": 41320 + }, + { + "epoch": 2.807786384019568, + "grad_norm": 0.7783080339431763, + "learning_rate": 0.0006491625900258188, + "loss": 3.6401, + "step": 41325 + }, + { + "epoch": 2.8081261040902294, + "grad_norm": 0.9091768264770508, + "learning_rate": 0.000649120125016986, + "loss": 3.4132, + "step": 41330 + }, + { + "epoch": 2.8084658241608915, + "grad_norm": 0.9427194595336914, + "learning_rate": 0.0006490776600081532, + "loss": 3.5298, + "step": 41335 + }, + { + "epoch": 2.808805544231553, + "grad_norm": 0.8946439027786255, + "learning_rate": 0.0006490351949993206, + "loss": 3.5516, + "step": 41340 + }, + { + "epoch": 2.809145264302215, + "grad_norm": 0.7988029718399048, + "learning_rate": 0.0006489927299904879, + "loss": 3.625, + "step": 41345 + }, + { + "epoch": 2.809484984372877, + "grad_norm": 0.9871447086334229, + "learning_rate": 0.0006489502649816551, + "loss": 3.3302, + "step": 41350 + }, + { + "epoch": 2.8098247044435385, + "grad_norm": 0.769070565700531, + "learning_rate": 0.0006489077999728225, + "loss": 3.6682, + "step": 41355 + }, + { + "epoch": 2.8101644245142, + "grad_norm": 1.0409986972808838, + "learning_rate": 0.0006488653349639897, + "loss": 3.5344, + "step": 41360 + }, + { + "epoch": 2.810504144584862, + "grad_norm": 1.2116559743881226, + "learning_rate": 0.0006488228699551569, + "loss": 3.4833, + "step": 41365 + }, + { + "epoch": 2.810843864655524, + "grad_norm": 0.9668022990226746, + "learning_rate": 0.0006487804049463243, + "loss": 3.4294, + "step": 41370 + }, + { + "epoch": 2.8111835847261855, + "grad_norm": 0.8663278818130493, + "learning_rate": 0.0006487379399374915, + "loss": 3.4872, + "step": 41375 + }, + { + "epoch": 2.8115233047968475, + "grad_norm": 0.7246752977371216, + "learning_rate": 0.0006486954749286588, + "loss": 3.6264, + "step": 41380 + }, + { + "epoch": 2.811863024867509, + "grad_norm": 0.8989405035972595, + "learning_rate": 0.0006486530099198261, + "loss": 3.6971, + "step": 41385 + }, + { + "epoch": 2.812202744938171, + "grad_norm": 0.9362697005271912, + "learning_rate": 0.0006486105449109934, + "loss": 3.7282, + "step": 41390 + }, + { + "epoch": 2.812542465008833, + "grad_norm": 0.8873737454414368, + "learning_rate": 0.0006485680799021606, + "loss": 3.7039, + "step": 41395 + }, + { + "epoch": 2.8128821850794945, + "grad_norm": 0.783169686794281, + "learning_rate": 0.0006485256148933279, + "loss": 3.6752, + "step": 41400 + }, + { + "epoch": 2.813221905150156, + "grad_norm": 0.9742375612258911, + "learning_rate": 0.0006484916428862618, + "loss": 3.5178, + "step": 41405 + }, + { + "epoch": 2.813561625220818, + "grad_norm": 0.696772038936615, + "learning_rate": 0.000648449177877429, + "loss": 3.6745, + "step": 41410 + }, + { + "epoch": 2.81390134529148, + "grad_norm": 0.9131288528442383, + "learning_rate": 0.0006484067128685963, + "loss": 3.6861, + "step": 41415 + }, + { + "epoch": 2.8142410653621415, + "grad_norm": 1.371379017829895, + "learning_rate": 0.0006483642478597635, + "loss": 3.8296, + "step": 41420 + }, + { + "epoch": 2.8145807854328035, + "grad_norm": 0.855080783367157, + "learning_rate": 0.0006483217828509308, + "loss": 3.5022, + "step": 41425 + }, + { + "epoch": 2.814920505503465, + "grad_norm": 1.0859191417694092, + "learning_rate": 0.0006482793178420981, + "loss": 3.4587, + "step": 41430 + }, + { + "epoch": 2.815260225574127, + "grad_norm": 0.811506986618042, + "learning_rate": 0.0006482368528332654, + "loss": 3.4623, + "step": 41435 + }, + { + "epoch": 2.815599945644789, + "grad_norm": 0.9130503535270691, + "learning_rate": 0.0006481943878244327, + "loss": 3.5096, + "step": 41440 + }, + { + "epoch": 2.8159396657154505, + "grad_norm": 0.9437992572784424, + "learning_rate": 0.0006481519228156, + "loss": 3.604, + "step": 41445 + }, + { + "epoch": 2.816279385786112, + "grad_norm": 0.792876660823822, + "learning_rate": 0.0006481094578067672, + "loss": 3.8332, + "step": 41450 + }, + { + "epoch": 2.816619105856774, + "grad_norm": 0.9237465858459473, + "learning_rate": 0.0006480669927979345, + "loss": 3.5233, + "step": 41455 + }, + { + "epoch": 2.816958825927436, + "grad_norm": 1.1139229536056519, + "learning_rate": 0.0006480245277891018, + "loss": 3.8043, + "step": 41460 + }, + { + "epoch": 2.8172985459980975, + "grad_norm": 0.8715588450431824, + "learning_rate": 0.000647982062780269, + "loss": 3.475, + "step": 41465 + }, + { + "epoch": 2.8176382660687596, + "grad_norm": 0.9045029878616333, + "learning_rate": 0.0006479395977714363, + "loss": 3.5182, + "step": 41470 + }, + { + "epoch": 2.817977986139421, + "grad_norm": 0.6804875135421753, + "learning_rate": 0.0006478971327626037, + "loss": 3.7816, + "step": 41475 + }, + { + "epoch": 2.818317706210083, + "grad_norm": 0.6453151702880859, + "learning_rate": 0.0006478546677537709, + "loss": 3.6394, + "step": 41480 + }, + { + "epoch": 2.818657426280745, + "grad_norm": 0.8786467909812927, + "learning_rate": 0.0006478122027449382, + "loss": 3.7087, + "step": 41485 + }, + { + "epoch": 2.8189971463514065, + "grad_norm": 0.9369677901268005, + "learning_rate": 0.0006477697377361055, + "loss": 3.533, + "step": 41490 + }, + { + "epoch": 2.819336866422068, + "grad_norm": 0.7582247257232666, + "learning_rate": 0.0006477272727272727, + "loss": 3.6868, + "step": 41495 + }, + { + "epoch": 2.8196765864927302, + "grad_norm": 0.8987757563591003, + "learning_rate": 0.00064768480771844, + "loss": 3.4599, + "step": 41500 + }, + { + "epoch": 2.820016306563392, + "grad_norm": 0.858128547668457, + "learning_rate": 0.0006476423427096074, + "loss": 3.727, + "step": 41505 + }, + { + "epoch": 2.8203560266340535, + "grad_norm": 0.8812126517295837, + "learning_rate": 0.0006475998777007746, + "loss": 3.613, + "step": 41510 + }, + { + "epoch": 2.8206957467047156, + "grad_norm": 1.0107413530349731, + "learning_rate": 0.0006475574126919419, + "loss": 3.6026, + "step": 41515 + }, + { + "epoch": 2.821035466775377, + "grad_norm": 1.1278246641159058, + "learning_rate": 0.0006475149476831091, + "loss": 3.6779, + "step": 41520 + }, + { + "epoch": 2.821375186846039, + "grad_norm": 0.7137530446052551, + "learning_rate": 0.0006474724826742764, + "loss": 3.5834, + "step": 41525 + }, + { + "epoch": 2.821714906916701, + "grad_norm": 0.6817806363105774, + "learning_rate": 0.0006474300176654437, + "loss": 3.6482, + "step": 41530 + }, + { + "epoch": 2.8220546269873625, + "grad_norm": 0.6507067680358887, + "learning_rate": 0.0006473875526566109, + "loss": 3.7404, + "step": 41535 + }, + { + "epoch": 2.822394347058024, + "grad_norm": 1.092702031135559, + "learning_rate": 0.0006473450876477783, + "loss": 3.4391, + "step": 41540 + }, + { + "epoch": 2.822734067128686, + "grad_norm": 0.6085137128829956, + "learning_rate": 0.0006473026226389456, + "loss": 3.3923, + "step": 41545 + }, + { + "epoch": 2.823073787199348, + "grad_norm": 0.742829442024231, + "learning_rate": 0.0006472601576301128, + "loss": 3.5423, + "step": 41550 + }, + { + "epoch": 2.8234135072700095, + "grad_norm": 0.9700330495834351, + "learning_rate": 0.00064721769262128, + "loss": 3.6597, + "step": 41555 + }, + { + "epoch": 2.823753227340671, + "grad_norm": 0.8586932420730591, + "learning_rate": 0.0006471752276124474, + "loss": 3.6434, + "step": 41560 + }, + { + "epoch": 2.824092947411333, + "grad_norm": 1.0872390270233154, + "learning_rate": 0.0006471327626036146, + "loss": 3.9717, + "step": 41565 + }, + { + "epoch": 2.824432667481995, + "grad_norm": 0.9081577062606812, + "learning_rate": 0.0006470902975947818, + "loss": 3.4213, + "step": 41570 + }, + { + "epoch": 2.8247723875526565, + "grad_norm": 1.1140614748001099, + "learning_rate": 0.0006470478325859493, + "loss": 3.5813, + "step": 41575 + }, + { + "epoch": 2.8251121076233185, + "grad_norm": 3.4731569290161133, + "learning_rate": 0.0006470053675771165, + "loss": 3.5839, + "step": 41580 + }, + { + "epoch": 2.82545182769398, + "grad_norm": 1.1341291666030884, + "learning_rate": 0.0006469629025682837, + "loss": 3.4509, + "step": 41585 + }, + { + "epoch": 2.825791547764642, + "grad_norm": 1.0686534643173218, + "learning_rate": 0.0006469204375594511, + "loss": 3.7314, + "step": 41590 + }, + { + "epoch": 2.8261312678353034, + "grad_norm": 0.8059784770011902, + "learning_rate": 0.0006468864655523849, + "loss": 3.4467, + "step": 41595 + }, + { + "epoch": 2.8264709879059655, + "grad_norm": 0.9175277352333069, + "learning_rate": 0.0006468440005435521, + "loss": 3.7381, + "step": 41600 + }, + { + "epoch": 2.826810707976627, + "grad_norm": 1.6098018884658813, + "learning_rate": 0.0006468015355347195, + "loss": 3.6096, + "step": 41605 + }, + { + "epoch": 2.8271504280472888, + "grad_norm": 0.764064371585846, + "learning_rate": 0.0006467590705258867, + "loss": 3.4891, + "step": 41610 + }, + { + "epoch": 2.827490148117951, + "grad_norm": 0.7853283882141113, + "learning_rate": 0.0006467166055170539, + "loss": 3.6206, + "step": 41615 + }, + { + "epoch": 2.8278298681886125, + "grad_norm": 0.7951640486717224, + "learning_rate": 0.0006466741405082212, + "loss": 3.4763, + "step": 41620 + }, + { + "epoch": 2.828169588259274, + "grad_norm": 1.0951014757156372, + "learning_rate": 0.0006466316754993885, + "loss": 3.3665, + "step": 41625 + }, + { + "epoch": 2.828509308329936, + "grad_norm": 0.7083086371421814, + "learning_rate": 0.0006465892104905558, + "loss": 3.766, + "step": 41630 + }, + { + "epoch": 2.828849028400598, + "grad_norm": 0.6765576601028442, + "learning_rate": 0.0006465467454817231, + "loss": 3.4099, + "step": 41635 + }, + { + "epoch": 2.8291887484712595, + "grad_norm": 0.7273145318031311, + "learning_rate": 0.0006465042804728904, + "loss": 3.5658, + "step": 41640 + }, + { + "epoch": 2.8295284685419215, + "grad_norm": 0.9282916188240051, + "learning_rate": 0.0006464618154640576, + "loss": 3.6695, + "step": 41645 + }, + { + "epoch": 2.829868188612583, + "grad_norm": 0.934777021408081, + "learning_rate": 0.0006464193504552249, + "loss": 3.6472, + "step": 41650 + }, + { + "epoch": 2.830207908683245, + "grad_norm": 0.9209679365158081, + "learning_rate": 0.0006463768854463921, + "loss": 3.6058, + "step": 41655 + }, + { + "epoch": 2.830547628753907, + "grad_norm": 0.9337560534477234, + "learning_rate": 0.0006463344204375594, + "loss": 3.5028, + "step": 41660 + }, + { + "epoch": 2.8308873488245685, + "grad_norm": 0.8392233848571777, + "learning_rate": 0.0006462919554287268, + "loss": 3.2978, + "step": 41665 + }, + { + "epoch": 2.83122706889523, + "grad_norm": 0.9892001152038574, + "learning_rate": 0.000646249490419894, + "loss": 3.7887, + "step": 41670 + }, + { + "epoch": 2.831566788965892, + "grad_norm": 0.946465015411377, + "learning_rate": 0.0006462070254110613, + "loss": 3.5353, + "step": 41675 + }, + { + "epoch": 2.831906509036554, + "grad_norm": 0.8818231821060181, + "learning_rate": 0.0006461645604022286, + "loss": 3.6515, + "step": 41680 + }, + { + "epoch": 2.8322462291072155, + "grad_norm": 0.9673231244087219, + "learning_rate": 0.0006461220953933958, + "loss": 3.6074, + "step": 41685 + }, + { + "epoch": 2.8325859491778775, + "grad_norm": 0.7775977253913879, + "learning_rate": 0.0006460796303845632, + "loss": 3.6682, + "step": 41690 + }, + { + "epoch": 2.832925669248539, + "grad_norm": 0.7673445343971252, + "learning_rate": 0.0006460371653757304, + "loss": 3.7616, + "step": 41695 + }, + { + "epoch": 2.833265389319201, + "grad_norm": NaN, + "learning_rate": 0.0006460031933686642, + "loss": 3.5409, + "step": 41700 + }, + { + "epoch": 2.833605109389863, + "grad_norm": 0.7799642086029053, + "learning_rate": 0.0006459607283598315, + "loss": 3.6432, + "step": 41705 + }, + { + "epoch": 2.8339448294605245, + "grad_norm": 0.9139265418052673, + "learning_rate": 0.0006459182633509988, + "loss": 3.6841, + "step": 41710 + }, + { + "epoch": 2.834284549531186, + "grad_norm": 0.9002902507781982, + "learning_rate": 0.000645875798342166, + "loss": 3.4562, + "step": 41715 + }, + { + "epoch": 2.834624269601848, + "grad_norm": 0.6665650606155396, + "learning_rate": 0.0006458333333333334, + "loss": 3.5182, + "step": 41720 + }, + { + "epoch": 2.83496398967251, + "grad_norm": 0.7392123341560364, + "learning_rate": 0.0006457908683245007, + "loss": 3.6888, + "step": 41725 + }, + { + "epoch": 2.8353037097431715, + "grad_norm": 0.8112896084785461, + "learning_rate": 0.0006457484033156679, + "loss": 3.7281, + "step": 41730 + }, + { + "epoch": 2.8356434298138335, + "grad_norm": 8.73359203338623, + "learning_rate": 0.0006457059383068352, + "loss": 3.3413, + "step": 41735 + }, + { + "epoch": 2.835983149884495, + "grad_norm": 0.7331961393356323, + "learning_rate": 0.0006456634732980024, + "loss": 3.5774, + "step": 41740 + }, + { + "epoch": 2.836322869955157, + "grad_norm": 1.0709693431854248, + "learning_rate": 0.0006456210082891697, + "loss": 3.7614, + "step": 41745 + }, + { + "epoch": 2.836662590025819, + "grad_norm": 0.788046658039093, + "learning_rate": 0.000645578543280337, + "loss": 3.564, + "step": 41750 + }, + { + "epoch": 2.8370023100964805, + "grad_norm": 0.7915696501731873, + "learning_rate": 0.0006455360782715043, + "loss": 3.4264, + "step": 41755 + }, + { + "epoch": 2.837342030167142, + "grad_norm": 1.4239095449447632, + "learning_rate": 0.0006454936132626716, + "loss": 3.8207, + "step": 41760 + }, + { + "epoch": 2.8376817502378042, + "grad_norm": 0.7624847292900085, + "learning_rate": 0.0006454511482538389, + "loss": 3.5216, + "step": 41765 + }, + { + "epoch": 2.838021470308466, + "grad_norm": 1.0469367504119873, + "learning_rate": 0.0006454086832450061, + "loss": 3.7376, + "step": 41770 + }, + { + "epoch": 2.8383611903791275, + "grad_norm": 0.7993491888046265, + "learning_rate": 0.0006453662182361734, + "loss": 3.7005, + "step": 41775 + }, + { + "epoch": 2.8387009104497896, + "grad_norm": 0.7260223627090454, + "learning_rate": 0.0006453237532273407, + "loss": 3.6301, + "step": 41780 + }, + { + "epoch": 2.839040630520451, + "grad_norm": 0.7324342727661133, + "learning_rate": 0.0006452812882185079, + "loss": 3.722, + "step": 41785 + }, + { + "epoch": 2.839380350591113, + "grad_norm": 1.019953966140747, + "learning_rate": 0.0006452388232096752, + "loss": 3.5307, + "step": 41790 + }, + { + "epoch": 2.839720070661775, + "grad_norm": 0.8605174422264099, + "learning_rate": 0.0006451963582008426, + "loss": 3.6427, + "step": 41795 + }, + { + "epoch": 2.8400597907324365, + "grad_norm": 0.8547157049179077, + "learning_rate": 0.0006451538931920098, + "loss": 3.3234, + "step": 41800 + }, + { + "epoch": 2.840399510803098, + "grad_norm": 1.7089673280715942, + "learning_rate": 0.000645111428183177, + "loss": 3.8273, + "step": 41805 + }, + { + "epoch": 2.8407392308737602, + "grad_norm": 1.0030999183654785, + "learning_rate": 0.0006450689631743444, + "loss": 3.7922, + "step": 41810 + }, + { + "epoch": 2.841078950944422, + "grad_norm": 0.7178849577903748, + "learning_rate": 0.0006450264981655116, + "loss": 3.7822, + "step": 41815 + }, + { + "epoch": 2.8414186710150835, + "grad_norm": 1.4274375438690186, + "learning_rate": 0.0006449840331566788, + "loss": 3.3412, + "step": 41820 + }, + { + "epoch": 2.8417583910857456, + "grad_norm": 0.8561616539955139, + "learning_rate": 0.0006449415681478463, + "loss": 3.5954, + "step": 41825 + }, + { + "epoch": 2.842098111156407, + "grad_norm": 0.8162837028503418, + "learning_rate": 0.0006448991031390135, + "loss": 3.3174, + "step": 41830 + }, + { + "epoch": 2.842437831227069, + "grad_norm": 0.7193267941474915, + "learning_rate": 0.0006448566381301807, + "loss": 3.4237, + "step": 41835 + }, + { + "epoch": 2.842777551297731, + "grad_norm": 0.8798254728317261, + "learning_rate": 0.000644814173121348, + "loss": 3.6635, + "step": 41840 + }, + { + "epoch": 2.8431172713683925, + "grad_norm": 1.0115571022033691, + "learning_rate": 0.0006447717081125153, + "loss": 3.6783, + "step": 41845 + }, + { + "epoch": 2.843456991439054, + "grad_norm": 1.269887089729309, + "learning_rate": 0.0006447292431036825, + "loss": 3.3858, + "step": 41850 + }, + { + "epoch": 2.8437967115097162, + "grad_norm": 0.8452107310295105, + "learning_rate": 0.0006446867780948498, + "loss": 3.5561, + "step": 41855 + }, + { + "epoch": 2.844136431580378, + "grad_norm": 0.9553399682044983, + "learning_rate": 0.0006446443130860172, + "loss": 3.1253, + "step": 41860 + }, + { + "epoch": 2.8444761516510395, + "grad_norm": 0.8834658861160278, + "learning_rate": 0.0006446018480771844, + "loss": 3.343, + "step": 41865 + }, + { + "epoch": 2.8448158717217016, + "grad_norm": 0.8504374027252197, + "learning_rate": 0.0006445593830683517, + "loss": 3.4984, + "step": 41870 + }, + { + "epoch": 2.845155591792363, + "grad_norm": 0.8470826745033264, + "learning_rate": 0.000644516918059519, + "loss": 3.4798, + "step": 41875 + }, + { + "epoch": 2.845495311863025, + "grad_norm": 1.1121885776519775, + "learning_rate": 0.0006444744530506862, + "loss": 3.3631, + "step": 41880 + }, + { + "epoch": 2.8458350319336865, + "grad_norm": 0.8852717876434326, + "learning_rate": 0.0006444319880418535, + "loss": 3.4525, + "step": 41885 + }, + { + "epoch": 2.8461747520043486, + "grad_norm": 0.8735798597335815, + "learning_rate": 0.0006443895230330207, + "loss": 3.6614, + "step": 41890 + }, + { + "epoch": 2.84651447207501, + "grad_norm": 1.0935239791870117, + "learning_rate": 0.0006443470580241882, + "loss": 3.6764, + "step": 41895 + }, + { + "epoch": 2.846854192145672, + "grad_norm": 0.9603499174118042, + "learning_rate": 0.0006443045930153554, + "loss": 3.5544, + "step": 41900 + }, + { + "epoch": 2.847193912216334, + "grad_norm": 0.7775435447692871, + "learning_rate": 0.0006442621280065226, + "loss": 3.487, + "step": 41905 + }, + { + "epoch": 2.8475336322869955, + "grad_norm": 0.7949323654174805, + "learning_rate": 0.00064421966299769, + "loss": 3.5052, + "step": 41910 + }, + { + "epoch": 2.847873352357657, + "grad_norm": 0.7462652921676636, + "learning_rate": 0.0006441771979888572, + "loss": 3.7228, + "step": 41915 + }, + { + "epoch": 2.8482130724283192, + "grad_norm": 0.8500416874885559, + "learning_rate": 0.0006441347329800244, + "loss": 3.881, + "step": 41920 + }, + { + "epoch": 2.848552792498981, + "grad_norm": 0.8497430682182312, + "learning_rate": 0.0006440922679711918, + "loss": 3.5098, + "step": 41925 + }, + { + "epoch": 2.8488925125696425, + "grad_norm": 0.7977365255355835, + "learning_rate": 0.0006440498029623591, + "loss": 3.6275, + "step": 41930 + }, + { + "epoch": 2.849232232640304, + "grad_norm": 0.9367765188217163, + "learning_rate": 0.0006440073379535263, + "loss": 3.4508, + "step": 41935 + }, + { + "epoch": 2.849571952710966, + "grad_norm": 1.027408480644226, + "learning_rate": 0.0006439648729446937, + "loss": 3.4671, + "step": 41940 + }, + { + "epoch": 2.849911672781628, + "grad_norm": 0.7649872899055481, + "learning_rate": 0.0006439224079358609, + "loss": 3.5001, + "step": 41945 + }, + { + "epoch": 2.8502513928522895, + "grad_norm": 1.39231538772583, + "learning_rate": 0.0006438799429270281, + "loss": 3.5117, + "step": 41950 + }, + { + "epoch": 2.8505911129229515, + "grad_norm": 0.7736470103263855, + "learning_rate": 0.0006438374779181954, + "loss": 3.596, + "step": 41955 + }, + { + "epoch": 2.850930832993613, + "grad_norm": 1.152349591255188, + "learning_rate": 0.0006437950129093627, + "loss": 3.6789, + "step": 41960 + }, + { + "epoch": 2.851270553064275, + "grad_norm": 0.9323878884315491, + "learning_rate": 0.00064375254790053, + "loss": 3.4893, + "step": 41965 + }, + { + "epoch": 2.851610273134937, + "grad_norm": 0.635001003742218, + "learning_rate": 0.0006437100828916973, + "loss": 3.6494, + "step": 41970 + }, + { + "epoch": 2.8519499932055985, + "grad_norm": 0.9356409907341003, + "learning_rate": 0.0006436676178828646, + "loss": 3.644, + "step": 41975 + }, + { + "epoch": 2.85228971327626, + "grad_norm": 0.780820906162262, + "learning_rate": 0.0006436251528740318, + "loss": 3.2963, + "step": 41980 + }, + { + "epoch": 2.852629433346922, + "grad_norm": 1.010454773902893, + "learning_rate": 0.0006435826878651991, + "loss": 3.7489, + "step": 41985 + }, + { + "epoch": 2.852969153417584, + "grad_norm": 1.3641307353973389, + "learning_rate": 0.0006435402228563663, + "loss": 3.5472, + "step": 41990 + }, + { + "epoch": 2.8533088734882455, + "grad_norm": 0.9818708896636963, + "learning_rate": 0.0006434977578475336, + "loss": 3.6331, + "step": 41995 + }, + { + "epoch": 2.8536485935589075, + "grad_norm": 0.6735411286354065, + "learning_rate": 0.000643455292838701, + "loss": 3.6961, + "step": 42000 + }, + { + "epoch": 2.853988313629569, + "grad_norm": 0.7041317224502563, + "learning_rate": 0.0006434128278298682, + "loss": 3.6274, + "step": 42005 + }, + { + "epoch": 2.854328033700231, + "grad_norm": 1.0152040719985962, + "learning_rate": 0.0006433703628210355, + "loss": 3.281, + "step": 42010 + }, + { + "epoch": 2.854667753770893, + "grad_norm": 0.7790409326553345, + "learning_rate": 0.0006433278978122028, + "loss": 3.5264, + "step": 42015 + }, + { + "epoch": 2.8550074738415545, + "grad_norm": 0.8673937916755676, + "learning_rate": 0.00064328543280337, + "loss": 3.7845, + "step": 42020 + }, + { + "epoch": 2.855347193912216, + "grad_norm": 0.9047387838363647, + "learning_rate": 0.0006432429677945373, + "loss": 3.8279, + "step": 42025 + }, + { + "epoch": 2.855686913982878, + "grad_norm": 2.2494287490844727, + "learning_rate": 0.0006432005027857046, + "loss": 3.4055, + "step": 42030 + }, + { + "epoch": 2.85602663405354, + "grad_norm": 1.0082210302352905, + "learning_rate": 0.0006431580377768719, + "loss": 3.2078, + "step": 42035 + }, + { + "epoch": 2.8563663541242015, + "grad_norm": 0.9640223383903503, + "learning_rate": 0.0006431155727680391, + "loss": 3.5918, + "step": 42040 + }, + { + "epoch": 2.8567060741948636, + "grad_norm": 0.8125755786895752, + "learning_rate": 0.0006430731077592065, + "loss": 3.7647, + "step": 42045 + }, + { + "epoch": 2.857045794265525, + "grad_norm": 0.904200553894043, + "learning_rate": 0.0006430306427503737, + "loss": 3.4645, + "step": 42050 + }, + { + "epoch": 2.857385514336187, + "grad_norm": 0.8965504169464111, + "learning_rate": 0.0006429881777415409, + "loss": 3.649, + "step": 42055 + }, + { + "epoch": 2.857725234406849, + "grad_norm": 0.8384853005409241, + "learning_rate": 0.0006429457127327083, + "loss": 3.508, + "step": 42060 + }, + { + "epoch": 2.8580649544775105, + "grad_norm": 0.7649285197257996, + "learning_rate": 0.0006429032477238755, + "loss": 3.6334, + "step": 42065 + }, + { + "epoch": 2.858404674548172, + "grad_norm": 0.8681102395057678, + "learning_rate": 0.0006428607827150428, + "loss": 3.586, + "step": 42070 + }, + { + "epoch": 2.8587443946188342, + "grad_norm": 1.0122319459915161, + "learning_rate": 0.0006428183177062102, + "loss": 3.5585, + "step": 42075 + }, + { + "epoch": 2.859084114689496, + "grad_norm": 0.9388653635978699, + "learning_rate": 0.0006427758526973774, + "loss": 3.5131, + "step": 42080 + }, + { + "epoch": 2.8594238347601575, + "grad_norm": 0.7968860864639282, + "learning_rate": 0.0006427333876885446, + "loss": 3.4185, + "step": 42085 + }, + { + "epoch": 2.8597635548308196, + "grad_norm": 0.7823754549026489, + "learning_rate": 0.000642690922679712, + "loss": 3.6209, + "step": 42090 + }, + { + "epoch": 2.860103274901481, + "grad_norm": 0.8972755074501038, + "learning_rate": 0.0006426484576708792, + "loss": 3.5317, + "step": 42095 + }, + { + "epoch": 2.860442994972143, + "grad_norm": 0.960849404335022, + "learning_rate": 0.0006426059926620464, + "loss": 3.6546, + "step": 42100 + }, + { + "epoch": 2.860782715042805, + "grad_norm": 0.652914822101593, + "learning_rate": 0.0006425635276532138, + "loss": 3.3088, + "step": 42105 + }, + { + "epoch": 2.8611224351134665, + "grad_norm": 1.186468243598938, + "learning_rate": 0.0006425210626443811, + "loss": 3.8664, + "step": 42110 + }, + { + "epoch": 2.861462155184128, + "grad_norm": 0.9126514196395874, + "learning_rate": 0.0006424785976355483, + "loss": 3.5037, + "step": 42115 + }, + { + "epoch": 2.8618018752547902, + "grad_norm": 0.9175484776496887, + "learning_rate": 0.0006424361326267156, + "loss": 3.6481, + "step": 42120 + }, + { + "epoch": 2.862141595325452, + "grad_norm": 0.940341591835022, + "learning_rate": 0.0006423936676178829, + "loss": 3.6539, + "step": 42125 + }, + { + "epoch": 2.8624813153961135, + "grad_norm": 1.0567195415496826, + "learning_rate": 0.0006423512026090501, + "loss": 3.601, + "step": 42130 + }, + { + "epoch": 2.8628210354667756, + "grad_norm": 0.9860246777534485, + "learning_rate": 0.0006423087376002174, + "loss": 3.6, + "step": 42135 + }, + { + "epoch": 2.863160755537437, + "grad_norm": 0.737078070640564, + "learning_rate": 0.0006422662725913847, + "loss": 3.2246, + "step": 42140 + }, + { + "epoch": 2.863500475608099, + "grad_norm": 1.0648070573806763, + "learning_rate": 0.000642223807582552, + "loss": 3.5056, + "step": 42145 + }, + { + "epoch": 2.863840195678761, + "grad_norm": 0.8707583546638489, + "learning_rate": 0.0006421813425737193, + "loss": 3.5024, + "step": 42150 + }, + { + "epoch": 2.8641799157494225, + "grad_norm": 0.9226364493370056, + "learning_rate": 0.0006421388775648865, + "loss": 3.6725, + "step": 42155 + }, + { + "epoch": 2.864519635820084, + "grad_norm": 0.7001680135726929, + "learning_rate": 0.0006420964125560538, + "loss": 3.4537, + "step": 42160 + }, + { + "epoch": 2.8648593558907463, + "grad_norm": 1.036777138710022, + "learning_rate": 0.0006420539475472211, + "loss": 3.3681, + "step": 42165 + }, + { + "epoch": 2.865199075961408, + "grad_norm": 0.9670694470405579, + "learning_rate": 0.0006420114825383883, + "loss": 3.6125, + "step": 42170 + }, + { + "epoch": 2.8655387960320695, + "grad_norm": 0.7751501202583313, + "learning_rate": 0.0006419690175295557, + "loss": 3.553, + "step": 42175 + }, + { + "epoch": 2.8658785161027316, + "grad_norm": 0.8176817893981934, + "learning_rate": 0.000641926552520723, + "loss": 3.6356, + "step": 42180 + }, + { + "epoch": 2.866218236173393, + "grad_norm": 0.9133017063140869, + "learning_rate": 0.0006418840875118902, + "loss": 3.3203, + "step": 42185 + }, + { + "epoch": 2.866557956244055, + "grad_norm": 0.6663287281990051, + "learning_rate": 0.0006418416225030574, + "loss": 3.5201, + "step": 42190 + }, + { + "epoch": 2.866897676314717, + "grad_norm": 1.2816252708435059, + "learning_rate": 0.0006417991574942248, + "loss": 3.6075, + "step": 42195 + }, + { + "epoch": 2.8672373963853786, + "grad_norm": 0.9402884244918823, + "learning_rate": 0.000641756692485392, + "loss": 3.601, + "step": 42200 + }, + { + "epoch": 2.86757711645604, + "grad_norm": 0.9217813611030579, + "learning_rate": 0.0006417142274765592, + "loss": 3.5299, + "step": 42205 + }, + { + "epoch": 2.8679168365267023, + "grad_norm": 0.8542978167533875, + "learning_rate": 0.0006416717624677267, + "loss": 3.6354, + "step": 42210 + }, + { + "epoch": 2.868256556597364, + "grad_norm": 0.7789055705070496, + "learning_rate": 0.0006416292974588939, + "loss": 3.5831, + "step": 42215 + }, + { + "epoch": 2.8685962766680255, + "grad_norm": 0.9903241991996765, + "learning_rate": 0.0006415868324500611, + "loss": 3.2245, + "step": 42220 + }, + { + "epoch": 2.868935996738687, + "grad_norm": 0.8254021406173706, + "learning_rate": 0.0006415443674412285, + "loss": 3.7451, + "step": 42225 + }, + { + "epoch": 2.8692757168093492, + "grad_norm": 1.2583961486816406, + "learning_rate": 0.0006415019024323957, + "loss": 3.4588, + "step": 42230 + }, + { + "epoch": 2.869615436880011, + "grad_norm": 13.78840160369873, + "learning_rate": 0.000641459437423563, + "loss": 3.5943, + "step": 42235 + }, + { + "epoch": 2.8699551569506725, + "grad_norm": 1.9662549495697021, + "learning_rate": 0.0006414169724147302, + "loss": 3.6835, + "step": 42240 + }, + { + "epoch": 2.8702948770213346, + "grad_norm": 1.0769060850143433, + "learning_rate": 0.0006413745074058976, + "loss": 3.6778, + "step": 42245 + }, + { + "epoch": 2.870634597091996, + "grad_norm": 0.8173180222511292, + "learning_rate": 0.0006413320423970649, + "loss": 3.473, + "step": 42250 + }, + { + "epoch": 2.870974317162658, + "grad_norm": 4.246832370758057, + "learning_rate": 0.0006412895773882321, + "loss": 3.7108, + "step": 42255 + }, + { + "epoch": 2.87131403723332, + "grad_norm": 0.7548671960830688, + "learning_rate": 0.0006412471123793994, + "loss": 3.4785, + "step": 42260 + }, + { + "epoch": 2.8716537573039815, + "grad_norm": 2.4836862087249756, + "learning_rate": 0.0006412046473705667, + "loss": 3.4869, + "step": 42265 + }, + { + "epoch": 2.871993477374643, + "grad_norm": 1.266848087310791, + "learning_rate": 0.0006411621823617339, + "loss": 3.4284, + "step": 42270 + }, + { + "epoch": 2.872333197445305, + "grad_norm": 0.8746322393417358, + "learning_rate": 0.0006411197173529011, + "loss": 3.7544, + "step": 42275 + }, + { + "epoch": 2.872672917515967, + "grad_norm": 0.6954376101493835, + "learning_rate": 0.0006410772523440686, + "loss": 3.6195, + "step": 42280 + }, + { + "epoch": 2.8730126375866285, + "grad_norm": 0.9680060744285583, + "learning_rate": 0.0006410347873352358, + "loss": 3.4545, + "step": 42285 + }, + { + "epoch": 2.87335235765729, + "grad_norm": 1.0132145881652832, + "learning_rate": 0.000640992322326403, + "loss": 3.4854, + "step": 42290 + }, + { + "epoch": 2.873692077727952, + "grad_norm": 1.0142197608947754, + "learning_rate": 0.0006409498573175704, + "loss": 3.5072, + "step": 42295 + }, + { + "epoch": 2.874031797798614, + "grad_norm": 0.8288962244987488, + "learning_rate": 0.0006409073923087376, + "loss": 3.6967, + "step": 42300 + }, + { + "epoch": 2.8743715178692755, + "grad_norm": 0.8449621200561523, + "learning_rate": 0.0006408649272999048, + "loss": 3.5561, + "step": 42305 + }, + { + "epoch": 2.8747112379399375, + "grad_norm": 0.9477334022521973, + "learning_rate": 0.0006408224622910723, + "loss": 3.62, + "step": 42310 + }, + { + "epoch": 2.875050958010599, + "grad_norm": 0.8745961785316467, + "learning_rate": 0.0006407799972822395, + "loss": 3.5841, + "step": 42315 + }, + { + "epoch": 2.875390678081261, + "grad_norm": 0.9776270985603333, + "learning_rate": 0.0006407375322734067, + "loss": 3.5926, + "step": 42320 + }, + { + "epoch": 2.875730398151923, + "grad_norm": 0.8045760989189148, + "learning_rate": 0.0006406950672645741, + "loss": 3.5109, + "step": 42325 + }, + { + "epoch": 2.8760701182225845, + "grad_norm": 0.8886127471923828, + "learning_rate": 0.0006406526022557413, + "loss": 3.5653, + "step": 42330 + }, + { + "epoch": 2.876409838293246, + "grad_norm": 1.132373332977295, + "learning_rate": 0.0006406101372469085, + "loss": 3.4655, + "step": 42335 + }, + { + "epoch": 2.876749558363908, + "grad_norm": 0.8156754970550537, + "learning_rate": 0.0006405676722380758, + "loss": 3.682, + "step": 42340 + }, + { + "epoch": 2.87708927843457, + "grad_norm": 1.0190176963806152, + "learning_rate": 0.0006405252072292432, + "loss": 3.3914, + "step": 42345 + }, + { + "epoch": 2.8774289985052315, + "grad_norm": 0.7648590207099915, + "learning_rate": 0.0006404827422204104, + "loss": 3.712, + "step": 42350 + }, + { + "epoch": 2.8777687185758936, + "grad_norm": 0.968267023563385, + "learning_rate": 0.0006404402772115777, + "loss": 3.5028, + "step": 42355 + }, + { + "epoch": 2.878108438646555, + "grad_norm": 0.7702248096466064, + "learning_rate": 0.000640397812202745, + "loss": 3.6762, + "step": 42360 + }, + { + "epoch": 2.878448158717217, + "grad_norm": 0.9237242937088013, + "learning_rate": 0.0006403553471939122, + "loss": 3.5249, + "step": 42365 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 1.0136921405792236, + "learning_rate": 0.0006403128821850795, + "loss": 3.6232, + "step": 42370 + }, + { + "epoch": 2.8791275988585405, + "grad_norm": 0.8367647528648376, + "learning_rate": 0.0006402704171762467, + "loss": 3.8103, + "step": 42375 + }, + { + "epoch": 2.879467318929202, + "grad_norm": 0.874214231967926, + "learning_rate": 0.0006402279521674141, + "loss": 3.6426, + "step": 42380 + }, + { + "epoch": 2.8798070389998642, + "grad_norm": 0.6839501261711121, + "learning_rate": 0.0006401854871585814, + "loss": 3.4431, + "step": 42385 + }, + { + "epoch": 2.880146759070526, + "grad_norm": 0.7896199822425842, + "learning_rate": 0.0006401430221497486, + "loss": 3.5137, + "step": 42390 + }, + { + "epoch": 2.8804864791411875, + "grad_norm": 0.8621180057525635, + "learning_rate": 0.0006401005571409159, + "loss": 3.668, + "step": 42395 + }, + { + "epoch": 2.8808261992118496, + "grad_norm": 0.9963283538818359, + "learning_rate": 0.0006400580921320832, + "loss": 3.8039, + "step": 42400 + }, + { + "epoch": 2.881165919282511, + "grad_norm": 0.879838764667511, + "learning_rate": 0.0006400156271232504, + "loss": 3.4338, + "step": 42405 + }, + { + "epoch": 2.881505639353173, + "grad_norm": 0.9321365356445312, + "learning_rate": 0.0006399731621144177, + "loss": 3.5892, + "step": 42410 + }, + { + "epoch": 2.881845359423835, + "grad_norm": 0.8248634934425354, + "learning_rate": 0.0006399306971055851, + "loss": 3.6078, + "step": 42415 + }, + { + "epoch": 2.8821850794944965, + "grad_norm": 1.0600252151489258, + "learning_rate": 0.0006398882320967523, + "loss": 3.8613, + "step": 42420 + }, + { + "epoch": 2.882524799565158, + "grad_norm": 1.1072288751602173, + "learning_rate": 0.0006398457670879195, + "loss": 3.3806, + "step": 42425 + }, + { + "epoch": 2.8828645196358202, + "grad_norm": 1.1188571453094482, + "learning_rate": 0.0006398033020790869, + "loss": 3.6816, + "step": 42430 + }, + { + "epoch": 2.883204239706482, + "grad_norm": 0.7606415152549744, + "learning_rate": 0.0006397608370702541, + "loss": 3.6093, + "step": 42435 + }, + { + "epoch": 2.8835439597771435, + "grad_norm": 1.0329320430755615, + "learning_rate": 0.0006397183720614213, + "loss": 3.5699, + "step": 42440 + }, + { + "epoch": 2.8838836798478056, + "grad_norm": 0.856335461139679, + "learning_rate": 0.0006396759070525887, + "loss": 3.8745, + "step": 42445 + }, + { + "epoch": 2.884223399918467, + "grad_norm": 1.0819493532180786, + "learning_rate": 0.000639633442043756, + "loss": 3.356, + "step": 42450 + }, + { + "epoch": 2.884563119989129, + "grad_norm": 0.8642306923866272, + "learning_rate": 0.0006395909770349232, + "loss": 3.5888, + "step": 42455 + }, + { + "epoch": 2.884902840059791, + "grad_norm": 0.8853643536567688, + "learning_rate": 0.0006395485120260906, + "loss": 3.6792, + "step": 42460 + }, + { + "epoch": 2.8852425601304525, + "grad_norm": 0.7052310109138489, + "learning_rate": 0.0006395060470172578, + "loss": 3.6577, + "step": 42465 + }, + { + "epoch": 2.885582280201114, + "grad_norm": 0.9504676461219788, + "learning_rate": 0.000639463582008425, + "loss": 3.4925, + "step": 42470 + }, + { + "epoch": 2.8859220002717763, + "grad_norm": 1.0522091388702393, + "learning_rate": 0.0006394211169995923, + "loss": 3.7667, + "step": 42475 + }, + { + "epoch": 2.886261720342438, + "grad_norm": 0.8386231064796448, + "learning_rate": 0.0006393786519907596, + "loss": 3.7885, + "step": 42480 + }, + { + "epoch": 2.8866014404130995, + "grad_norm": 1.2784029245376587, + "learning_rate": 0.0006393361869819269, + "loss": 3.5684, + "step": 42485 + }, + { + "epoch": 2.8869411604837616, + "grad_norm": 0.8108141422271729, + "learning_rate": 0.0006392937219730942, + "loss": 3.5602, + "step": 42490 + }, + { + "epoch": 2.8872808805544232, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0006392512569642615, + "loss": 3.6701, + "step": 42495 + }, + { + "epoch": 2.887620600625085, + "grad_norm": 0.8445396423339844, + "learning_rate": 0.0006392087919554287, + "loss": 3.5702, + "step": 42500 + }, + { + "epoch": 2.887960320695747, + "grad_norm": 0.8261429071426392, + "learning_rate": 0.000639166326946596, + "loss": 3.5757, + "step": 42505 + }, + { + "epoch": 2.8883000407664086, + "grad_norm": 1.5207573175430298, + "learning_rate": 0.0006391238619377633, + "loss": 3.4822, + "step": 42510 + }, + { + "epoch": 2.88863976083707, + "grad_norm": 0.9298055171966553, + "learning_rate": 0.0006390813969289305, + "loss": 3.305, + "step": 42515 + }, + { + "epoch": 2.8889794809077323, + "grad_norm": 1.3546438217163086, + "learning_rate": 0.0006390389319200979, + "loss": 3.7134, + "step": 42520 + }, + { + "epoch": 2.889319200978394, + "grad_norm": 1.1777318716049194, + "learning_rate": 0.0006389964669112652, + "loss": 3.5155, + "step": 42525 + }, + { + "epoch": 2.8896589210490555, + "grad_norm": 1.1202706098556519, + "learning_rate": 0.0006389540019024324, + "loss": 3.6159, + "step": 42530 + }, + { + "epoch": 2.8899986411197176, + "grad_norm": 1.225242257118225, + "learning_rate": 0.0006389115368935997, + "loss": 3.2482, + "step": 42535 + }, + { + "epoch": 2.8903383611903792, + "grad_norm": 1.0605967044830322, + "learning_rate": 0.0006388690718847669, + "loss": 3.7586, + "step": 42540 + }, + { + "epoch": 2.890678081261041, + "grad_norm": 0.9269188046455383, + "learning_rate": 0.0006388266068759342, + "loss": 3.4119, + "step": 42545 + }, + { + "epoch": 2.891017801331703, + "grad_norm": 0.7671190500259399, + "learning_rate": 0.0006387841418671015, + "loss": 3.3782, + "step": 42550 + }, + { + "epoch": 2.8913575214023646, + "grad_norm": 1.0405076742172241, + "learning_rate": 0.0006387416768582688, + "loss": 3.4372, + "step": 42555 + }, + { + "epoch": 2.891697241473026, + "grad_norm": 1.08906090259552, + "learning_rate": 0.0006386992118494361, + "loss": 3.7143, + "step": 42560 + }, + { + "epoch": 2.8920369615436883, + "grad_norm": 0.9141576886177063, + "learning_rate": 0.0006386567468406034, + "loss": 3.7265, + "step": 42565 + }, + { + "epoch": 2.89237668161435, + "grad_norm": 0.8527538776397705, + "learning_rate": 0.0006386142818317706, + "loss": 3.5901, + "step": 42570 + }, + { + "epoch": 2.8927164016850115, + "grad_norm": 0.8208820223808289, + "learning_rate": 0.000638571816822938, + "loss": 3.5316, + "step": 42575 + }, + { + "epoch": 2.893056121755673, + "grad_norm": 0.8681095242500305, + "learning_rate": 0.0006385293518141052, + "loss": 3.4168, + "step": 42580 + }, + { + "epoch": 2.8933958418263352, + "grad_norm": 1.3085342645645142, + "learning_rate": 0.0006384868868052724, + "loss": 3.6088, + "step": 42585 + }, + { + "epoch": 2.893735561896997, + "grad_norm": 1.2689119577407837, + "learning_rate": 0.0006384444217964398, + "loss": 3.5412, + "step": 42590 + }, + { + "epoch": 2.8940752819676585, + "grad_norm": 1.0412354469299316, + "learning_rate": 0.0006384019567876071, + "loss": 3.6356, + "step": 42595 + }, + { + "epoch": 2.8944150020383206, + "grad_norm": 1.2049612998962402, + "learning_rate": 0.0006383594917787743, + "loss": 3.6621, + "step": 42600 + }, + { + "epoch": 2.894754722108982, + "grad_norm": 0.8979203701019287, + "learning_rate": 0.0006383170267699416, + "loss": 3.3126, + "step": 42605 + }, + { + "epoch": 2.895094442179644, + "grad_norm": 1.1260889768600464, + "learning_rate": 0.0006382745617611089, + "loss": 3.4861, + "step": 42610 + }, + { + "epoch": 2.8954341622503055, + "grad_norm": 0.9566530585289001, + "learning_rate": 0.0006382320967522761, + "loss": 3.3857, + "step": 42615 + }, + { + "epoch": 2.8957738823209676, + "grad_norm": 0.8316880464553833, + "learning_rate": 0.0006381896317434434, + "loss": 3.5437, + "step": 42620 + }, + { + "epoch": 2.896113602391629, + "grad_norm": 0.8635348081588745, + "learning_rate": 0.0006381471667346108, + "loss": 3.4943, + "step": 42625 + }, + { + "epoch": 2.896453322462291, + "grad_norm": 0.9278658032417297, + "learning_rate": 0.000638104701725778, + "loss": 3.5148, + "step": 42630 + }, + { + "epoch": 2.896793042532953, + "grad_norm": 0.9027742147445679, + "learning_rate": 0.0006380622367169453, + "loss": 3.6012, + "step": 42635 + }, + { + "epoch": 2.8971327626036145, + "grad_norm": 0.9915933609008789, + "learning_rate": 0.0006380197717081125, + "loss": 3.5731, + "step": 42640 + }, + { + "epoch": 2.897472482674276, + "grad_norm": 0.8808592557907104, + "learning_rate": 0.0006379773066992798, + "loss": 3.88, + "step": 42645 + }, + { + "epoch": 2.8978122027449382, + "grad_norm": 0.6079631447792053, + "learning_rate": 0.0006379348416904471, + "loss": 3.5292, + "step": 42650 + }, + { + "epoch": 2.8981519228156, + "grad_norm": 0.7509498000144958, + "learning_rate": 0.0006378923766816143, + "loss": 3.3698, + "step": 42655 + }, + { + "epoch": 2.8984916428862615, + "grad_norm": 0.9439882040023804, + "learning_rate": 0.0006378499116727817, + "loss": 3.5536, + "step": 42660 + }, + { + "epoch": 2.8988313629569236, + "grad_norm": 0.971625566482544, + "learning_rate": 0.000637807446663949, + "loss": 3.6547, + "step": 42665 + }, + { + "epoch": 2.899171083027585, + "grad_norm": 0.95089191198349, + "learning_rate": 0.0006377649816551162, + "loss": 3.4264, + "step": 42670 + }, + { + "epoch": 2.899510803098247, + "grad_norm": 0.7822761535644531, + "learning_rate": 0.0006377225166462834, + "loss": 3.5928, + "step": 42675 + }, + { + "epoch": 2.899850523168909, + "grad_norm": 0.8482622504234314, + "learning_rate": 0.0006376800516374508, + "loss": 3.4401, + "step": 42680 + }, + { + "epoch": 2.9001902432395705, + "grad_norm": 1.0770487785339355, + "learning_rate": 0.000637637586628618, + "loss": 3.5805, + "step": 42685 + }, + { + "epoch": 2.900529963310232, + "grad_norm": 0.976013720035553, + "learning_rate": 0.0006375951216197852, + "loss": 3.4725, + "step": 42690 + }, + { + "epoch": 2.9008696833808942, + "grad_norm": 0.8507923483848572, + "learning_rate": 0.0006375526566109527, + "loss": 3.5909, + "step": 42695 + }, + { + "epoch": 2.901209403451556, + "grad_norm": 0.7367267608642578, + "learning_rate": 0.0006375101916021199, + "loss": 3.5932, + "step": 42700 + }, + { + "epoch": 2.9015491235222175, + "grad_norm": 1.066497802734375, + "learning_rate": 0.0006374677265932871, + "loss": 3.8642, + "step": 42705 + }, + { + "epoch": 2.9018888435928796, + "grad_norm": 0.9525050520896912, + "learning_rate": 0.0006374252615844545, + "loss": 3.6169, + "step": 42710 + }, + { + "epoch": 2.902228563663541, + "grad_norm": 1.1267311573028564, + "learning_rate": 0.0006373827965756217, + "loss": 3.5302, + "step": 42715 + }, + { + "epoch": 2.902568283734203, + "grad_norm": 0.9697348475456238, + "learning_rate": 0.0006373403315667889, + "loss": 3.4993, + "step": 42720 + }, + { + "epoch": 2.902908003804865, + "grad_norm": 0.8653227090835571, + "learning_rate": 0.0006372978665579562, + "loss": 3.5645, + "step": 42725 + }, + { + "epoch": 2.9032477238755265, + "grad_norm": 0.8895566463470459, + "learning_rate": 0.0006372554015491236, + "loss": 3.6207, + "step": 42730 + }, + { + "epoch": 2.903587443946188, + "grad_norm": 0.7402145862579346, + "learning_rate": 0.0006372129365402908, + "loss": 3.6395, + "step": 42735 + }, + { + "epoch": 2.9039271640168502, + "grad_norm": 1.2616515159606934, + "learning_rate": 0.0006371704715314581, + "loss": 3.5094, + "step": 42740 + }, + { + "epoch": 2.904266884087512, + "grad_norm": 0.8593542575836182, + "learning_rate": 0.0006371280065226254, + "loss": 3.3714, + "step": 42745 + }, + { + "epoch": 2.9046066041581735, + "grad_norm": 0.8662847280502319, + "learning_rate": 0.0006370855415137926, + "loss": 3.5049, + "step": 42750 + }, + { + "epoch": 2.9049463242288356, + "grad_norm": 0.937491774559021, + "learning_rate": 0.0006370430765049599, + "loss": 3.4823, + "step": 42755 + }, + { + "epoch": 2.905286044299497, + "grad_norm": 0.9231739044189453, + "learning_rate": 0.0006370006114961272, + "loss": 3.5733, + "step": 42760 + }, + { + "epoch": 2.905625764370159, + "grad_norm": 0.8800464272499084, + "learning_rate": 0.0006369581464872945, + "loss": 3.5072, + "step": 42765 + }, + { + "epoch": 2.905965484440821, + "grad_norm": 0.6724525094032288, + "learning_rate": 0.0006369156814784618, + "loss": 3.5651, + "step": 42770 + }, + { + "epoch": 2.9063052045114826, + "grad_norm": 0.7144178748130798, + "learning_rate": 0.000636873216469629, + "loss": 3.7627, + "step": 42775 + }, + { + "epoch": 2.906644924582144, + "grad_norm": 0.9771715998649597, + "learning_rate": 0.0006368307514607963, + "loss": 3.5922, + "step": 42780 + }, + { + "epoch": 2.9069846446528063, + "grad_norm": 0.9213542938232422, + "learning_rate": 0.0006367882864519636, + "loss": 3.3435, + "step": 42785 + }, + { + "epoch": 2.907324364723468, + "grad_norm": 0.8765621781349182, + "learning_rate": 0.0006367458214431308, + "loss": 3.5564, + "step": 42790 + }, + { + "epoch": 2.9076640847941295, + "grad_norm": 0.8609213829040527, + "learning_rate": 0.0006367033564342981, + "loss": 3.394, + "step": 42795 + }, + { + "epoch": 2.9080038048647916, + "grad_norm": 0.7464118003845215, + "learning_rate": 0.0006366608914254655, + "loss": 3.6545, + "step": 42800 + }, + { + "epoch": 2.9083435249354532, + "grad_norm": 1.3768666982650757, + "learning_rate": 0.0006366184264166327, + "loss": 3.502, + "step": 42805 + }, + { + "epoch": 2.908683245006115, + "grad_norm": 0.7811957597732544, + "learning_rate": 0.0006365759614078, + "loss": 3.6964, + "step": 42810 + }, + { + "epoch": 2.909022965076777, + "grad_norm": 0.9333653450012207, + "learning_rate": 0.0006365334963989673, + "loss": 3.6563, + "step": 42815 + }, + { + "epoch": 2.9093626851474386, + "grad_norm": 0.733137845993042, + "learning_rate": 0.0006364910313901345, + "loss": 3.5583, + "step": 42820 + }, + { + "epoch": 2.9097024052181, + "grad_norm": 0.7694963216781616, + "learning_rate": 0.0006364485663813017, + "loss": 3.5298, + "step": 42825 + }, + { + "epoch": 2.9100421252887623, + "grad_norm": 0.6995178461074829, + "learning_rate": 0.0006364061013724691, + "loss": 3.3909, + "step": 42830 + }, + { + "epoch": 2.910381845359424, + "grad_norm": 0.9046741724014282, + "learning_rate": 0.0006363636363636364, + "loss": 3.7672, + "step": 42835 + }, + { + "epoch": 2.9107215654300855, + "grad_norm": 0.8005523085594177, + "learning_rate": 0.0006363211713548036, + "loss": 3.6635, + "step": 42840 + }, + { + "epoch": 2.9110612855007476, + "grad_norm": 0.6834561824798584, + "learning_rate": 0.000636278706345971, + "loss": 3.5005, + "step": 42845 + }, + { + "epoch": 2.9114010055714092, + "grad_norm": 1.0128384828567505, + "learning_rate": 0.0006362362413371382, + "loss": 3.5922, + "step": 42850 + }, + { + "epoch": 2.911740725642071, + "grad_norm": 0.9560602903366089, + "learning_rate": 0.0006361937763283054, + "loss": 3.4816, + "step": 42855 + }, + { + "epoch": 2.912080445712733, + "grad_norm": 1.1979265213012695, + "learning_rate": 0.0006361513113194728, + "loss": 3.6218, + "step": 42860 + }, + { + "epoch": 2.9124201657833946, + "grad_norm": 0.9801417589187622, + "learning_rate": 0.00063610884631064, + "loss": 3.5763, + "step": 42865 + }, + { + "epoch": 2.912759885854056, + "grad_norm": 0.825892448425293, + "learning_rate": 0.0006360663813018073, + "loss": 3.6697, + "step": 42870 + }, + { + "epoch": 2.9130996059247183, + "grad_norm": 1.012922763824463, + "learning_rate": 0.0006360239162929746, + "loss": 3.5917, + "step": 42875 + }, + { + "epoch": 2.91343932599538, + "grad_norm": 0.732200026512146, + "learning_rate": 0.0006359814512841419, + "loss": 3.57, + "step": 42880 + }, + { + "epoch": 2.9137790460660415, + "grad_norm": 0.7748098969459534, + "learning_rate": 0.0006359389862753091, + "loss": 3.8546, + "step": 42885 + }, + { + "epoch": 2.9141187661367036, + "grad_norm": 0.7699866890907288, + "learning_rate": 0.0006358965212664764, + "loss": 3.5173, + "step": 42890 + }, + { + "epoch": 2.9144584862073653, + "grad_norm": 0.8961706757545471, + "learning_rate": 0.0006358540562576437, + "loss": 3.629, + "step": 42895 + }, + { + "epoch": 2.914798206278027, + "grad_norm": 0.8691208958625793, + "learning_rate": 0.0006358115912488109, + "loss": 3.3448, + "step": 42900 + }, + { + "epoch": 2.915137926348689, + "grad_norm": 1.2777974605560303, + "learning_rate": 0.0006357691262399783, + "loss": 3.6747, + "step": 42905 + }, + { + "epoch": 2.9154776464193506, + "grad_norm": 0.8746368885040283, + "learning_rate": 0.0006357266612311456, + "loss": 3.4666, + "step": 42910 + }, + { + "epoch": 2.915817366490012, + "grad_norm": 0.6666287779808044, + "learning_rate": 0.0006356841962223129, + "loss": 3.6055, + "step": 42915 + }, + { + "epoch": 2.916157086560674, + "grad_norm": 2.627143144607544, + "learning_rate": 0.0006356417312134801, + "loss": 3.5156, + "step": 42920 + }, + { + "epoch": 2.916496806631336, + "grad_norm": 0.9225397109985352, + "learning_rate": 0.0006355992662046473, + "loss": 3.6787, + "step": 42925 + }, + { + "epoch": 2.9168365267019976, + "grad_norm": 0.9746904969215393, + "learning_rate": 0.0006355568011958147, + "loss": 3.4597, + "step": 42930 + }, + { + "epoch": 2.917176246772659, + "grad_norm": 0.9976130127906799, + "learning_rate": 0.000635514336186982, + "loss": 3.6351, + "step": 42935 + }, + { + "epoch": 2.9175159668433213, + "grad_norm": 0.7115152478218079, + "learning_rate": 0.0006354718711781492, + "loss": 3.8617, + "step": 42940 + }, + { + "epoch": 2.917855686913983, + "grad_norm": 0.7123271226882935, + "learning_rate": 0.0006354294061693166, + "loss": 3.7741, + "step": 42945 + }, + { + "epoch": 2.9181954069846445, + "grad_norm": 0.8429141044616699, + "learning_rate": 0.0006353869411604838, + "loss": 3.4628, + "step": 42950 + }, + { + "epoch": 2.918535127055306, + "grad_norm": 1.2569661140441895, + "learning_rate": 0.000635344476151651, + "loss": 3.4439, + "step": 42955 + }, + { + "epoch": 2.9188748471259682, + "grad_norm": 0.7635186314582825, + "learning_rate": 0.0006353020111428184, + "loss": 3.5506, + "step": 42960 + }, + { + "epoch": 2.91921456719663, + "grad_norm": 0.7828493714332581, + "learning_rate": 0.0006352595461339856, + "loss": 3.6268, + "step": 42965 + }, + { + "epoch": 2.9195542872672915, + "grad_norm": 0.9995033740997314, + "learning_rate": 0.0006352170811251529, + "loss": 3.7004, + "step": 42970 + }, + { + "epoch": 2.9198940073379536, + "grad_norm": 0.7126761078834534, + "learning_rate": 0.0006351746161163202, + "loss": 3.4393, + "step": 42975 + }, + { + "epoch": 2.920233727408615, + "grad_norm": 0.7742668390274048, + "learning_rate": 0.0006351321511074875, + "loss": 3.7761, + "step": 42980 + }, + { + "epoch": 2.920573447479277, + "grad_norm": 1.086374282836914, + "learning_rate": 0.0006350896860986547, + "loss": 3.3626, + "step": 42985 + }, + { + "epoch": 2.920913167549939, + "grad_norm": 0.927900493144989, + "learning_rate": 0.000635047221089822, + "loss": 3.5235, + "step": 42990 + }, + { + "epoch": 2.9212528876206005, + "grad_norm": 0.7161597013473511, + "learning_rate": 0.0006350047560809893, + "loss": 3.588, + "step": 42995 + }, + { + "epoch": 2.921592607691262, + "grad_norm": 0.9324819445610046, + "learning_rate": 0.0006349622910721565, + "loss": 3.5025, + "step": 43000 + }, + { + "epoch": 2.9219323277619242, + "grad_norm": 0.7857580184936523, + "learning_rate": 0.0006349198260633239, + "loss": 3.7736, + "step": 43005 + }, + { + "epoch": 2.922272047832586, + "grad_norm": 0.8242482542991638, + "learning_rate": 0.0006348773610544912, + "loss": 3.4838, + "step": 43010 + }, + { + "epoch": 2.9226117679032475, + "grad_norm": 1.0616786479949951, + "learning_rate": 0.0006348348960456584, + "loss": 3.2868, + "step": 43015 + }, + { + "epoch": 2.9229514879739096, + "grad_norm": 0.9958469867706299, + "learning_rate": 0.0006347924310368257, + "loss": 3.6245, + "step": 43020 + }, + { + "epoch": 2.923291208044571, + "grad_norm": 0.8357774615287781, + "learning_rate": 0.0006347499660279929, + "loss": 3.7633, + "step": 43025 + }, + { + "epoch": 2.923630928115233, + "grad_norm": 0.9551855325698853, + "learning_rate": 0.0006347075010191602, + "loss": 3.5758, + "step": 43030 + }, + { + "epoch": 2.923970648185895, + "grad_norm": 0.7071208357810974, + "learning_rate": 0.0006346650360103275, + "loss": 3.6411, + "step": 43035 + }, + { + "epoch": 2.9243103682565565, + "grad_norm": 0.7425499558448792, + "learning_rate": 0.0006346225710014948, + "loss": 3.5921, + "step": 43040 + }, + { + "epoch": 2.924650088327218, + "grad_norm": 0.7038243412971497, + "learning_rate": 0.0006345801059926621, + "loss": 3.6965, + "step": 43045 + }, + { + "epoch": 2.9249898083978803, + "grad_norm": 0.8490314483642578, + "learning_rate": 0.0006345376409838294, + "loss": 3.8042, + "step": 43050 + }, + { + "epoch": 2.925329528468542, + "grad_norm": 0.7792957425117493, + "learning_rate": 0.0006344951759749966, + "loss": 3.4395, + "step": 43055 + }, + { + "epoch": 2.9256692485392035, + "grad_norm": 24.38279914855957, + "learning_rate": 0.0006344527109661638, + "loss": 3.5391, + "step": 43060 + }, + { + "epoch": 2.9260089686098656, + "grad_norm": 1.0271424055099487, + "learning_rate": 0.0006344102459573312, + "loss": 3.7312, + "step": 43065 + }, + { + "epoch": 2.926348688680527, + "grad_norm": 0.8269239664077759, + "learning_rate": 0.0006343677809484984, + "loss": 3.3497, + "step": 43070 + }, + { + "epoch": 2.926688408751189, + "grad_norm": 0.8513595461845398, + "learning_rate": 0.0006343253159396657, + "loss": 3.5983, + "step": 43075 + }, + { + "epoch": 2.927028128821851, + "grad_norm": 1.1831064224243164, + "learning_rate": 0.0006342828509308331, + "loss": 3.6663, + "step": 43080 + }, + { + "epoch": 2.9273678488925126, + "grad_norm": 0.9857175946235657, + "learning_rate": 0.0006342403859220003, + "loss": 3.624, + "step": 43085 + }, + { + "epoch": 2.927707568963174, + "grad_norm": 0.7475153207778931, + "learning_rate": 0.0006341979209131675, + "loss": 3.331, + "step": 43090 + }, + { + "epoch": 2.9280472890338363, + "grad_norm": 0.716468870639801, + "learning_rate": 0.0006341554559043349, + "loss": 3.3707, + "step": 43095 + }, + { + "epoch": 2.928387009104498, + "grad_norm": 0.9868401885032654, + "learning_rate": 0.0006341129908955021, + "loss": 3.4027, + "step": 43100 + }, + { + "epoch": 2.9287267291751595, + "grad_norm": 1.140017032623291, + "learning_rate": 0.0006340705258866693, + "loss": 3.4604, + "step": 43105 + }, + { + "epoch": 2.9290664492458216, + "grad_norm": 0.6861456632614136, + "learning_rate": 0.0006340280608778368, + "loss": 3.5301, + "step": 43110 + }, + { + "epoch": 2.9294061693164832, + "grad_norm": 0.666670024394989, + "learning_rate": 0.000633985595869004, + "loss": 3.6785, + "step": 43115 + }, + { + "epoch": 2.929745889387145, + "grad_norm": 1.00276517868042, + "learning_rate": 0.0006339431308601712, + "loss": 3.2426, + "step": 43120 + }, + { + "epoch": 2.930085609457807, + "grad_norm": 0.7526521682739258, + "learning_rate": 0.0006339006658513385, + "loss": 3.6201, + "step": 43125 + }, + { + "epoch": 2.9304253295284686, + "grad_norm": 0.8696470260620117, + "learning_rate": 0.0006338582008425058, + "loss": 3.5767, + "step": 43130 + }, + { + "epoch": 2.93076504959913, + "grad_norm": 0.9144272804260254, + "learning_rate": 0.000633815735833673, + "loss": 3.1744, + "step": 43135 + }, + { + "epoch": 2.9311047696697923, + "grad_norm": 0.8709496259689331, + "learning_rate": 0.0006337732708248403, + "loss": 3.5084, + "step": 43140 + }, + { + "epoch": 2.931444489740454, + "grad_norm": 0.668001115322113, + "learning_rate": 0.0006337308058160077, + "loss": 3.7713, + "step": 43145 + }, + { + "epoch": 2.9317842098111155, + "grad_norm": 0.8131774067878723, + "learning_rate": 0.0006336883408071749, + "loss": 3.8141, + "step": 43150 + }, + { + "epoch": 2.9321239298817776, + "grad_norm": 0.9580155611038208, + "learning_rate": 0.0006336458757983422, + "loss": 3.6417, + "step": 43155 + }, + { + "epoch": 2.9324636499524392, + "grad_norm": 0.7503117322921753, + "learning_rate": 0.0006336034107895094, + "loss": 3.5568, + "step": 43160 + }, + { + "epoch": 2.932803370023101, + "grad_norm": 0.7087939977645874, + "learning_rate": 0.0006335609457806767, + "loss": 3.6545, + "step": 43165 + }, + { + "epoch": 2.933143090093763, + "grad_norm": 0.9584708213806152, + "learning_rate": 0.000633518480771844, + "loss": 3.3061, + "step": 43170 + }, + { + "epoch": 2.9334828101644246, + "grad_norm": 0.9971731305122375, + "learning_rate": 0.0006334760157630112, + "loss": 3.518, + "step": 43175 + }, + { + "epoch": 2.933822530235086, + "grad_norm": 0.9839001297950745, + "learning_rate": 0.0006334335507541786, + "loss": 3.7018, + "step": 43180 + }, + { + "epoch": 2.9341622503057483, + "grad_norm": 0.6520939469337463, + "learning_rate": 0.0006333910857453459, + "loss": 3.4143, + "step": 43185 + }, + { + "epoch": 2.93450197037641, + "grad_norm": 0.8588538765907288, + "learning_rate": 0.0006333486207365131, + "loss": 3.6025, + "step": 43190 + }, + { + "epoch": 2.9348416904470715, + "grad_norm": 1.1680620908737183, + "learning_rate": 0.0006333061557276804, + "loss": 3.6638, + "step": 43195 + }, + { + "epoch": 2.9351814105177336, + "grad_norm": 1.0371735095977783, + "learning_rate": 0.0006332636907188477, + "loss": 3.5937, + "step": 43200 + }, + { + "epoch": 2.9355211305883953, + "grad_norm": 0.8592299222946167, + "learning_rate": 0.0006332212257100149, + "loss": 3.6766, + "step": 43205 + }, + { + "epoch": 2.935860850659057, + "grad_norm": 0.6861622333526611, + "learning_rate": 0.0006331787607011821, + "loss": 3.6468, + "step": 43210 + }, + { + "epoch": 2.936200570729719, + "grad_norm": 0.8398114442825317, + "learning_rate": 0.0006331362956923496, + "loss": 3.5879, + "step": 43215 + }, + { + "epoch": 2.9365402908003806, + "grad_norm": 0.8335443139076233, + "learning_rate": 0.0006330938306835168, + "loss": 3.4827, + "step": 43220 + }, + { + "epoch": 2.9368800108710422, + "grad_norm": 0.8634170889854431, + "learning_rate": 0.000633051365674684, + "loss": 3.4962, + "step": 43225 + }, + { + "epoch": 2.9372197309417043, + "grad_norm": 0.9159271717071533, + "learning_rate": 0.0006330089006658514, + "loss": 3.5942, + "step": 43230 + }, + { + "epoch": 2.937559451012366, + "grad_norm": 0.9358786940574646, + "learning_rate": 0.0006329664356570186, + "loss": 3.4005, + "step": 43235 + }, + { + "epoch": 2.9378991710830276, + "grad_norm": 0.9783751368522644, + "learning_rate": 0.0006329239706481858, + "loss": 3.5481, + "step": 43240 + }, + { + "epoch": 2.9382388911536896, + "grad_norm": 0.8018321990966797, + "learning_rate": 0.0006328815056393532, + "loss": 3.3689, + "step": 43245 + }, + { + "epoch": 2.9385786112243513, + "grad_norm": 0.8216391801834106, + "learning_rate": 0.0006328390406305205, + "loss": 3.4978, + "step": 43250 + }, + { + "epoch": 2.938918331295013, + "grad_norm": 0.8826801776885986, + "learning_rate": 0.0006327965756216878, + "loss": 3.7175, + "step": 43255 + }, + { + "epoch": 2.9392580513656745, + "grad_norm": 0.9658626317977905, + "learning_rate": 0.000632754110612855, + "loss": 3.4361, + "step": 43260 + }, + { + "epoch": 2.9395977714363366, + "grad_norm": 1.0588515996932983, + "learning_rate": 0.0006327116456040223, + "loss": 3.4582, + "step": 43265 + }, + { + "epoch": 2.9399374915069982, + "grad_norm": 0.7249969840049744, + "learning_rate": 0.0006326691805951896, + "loss": 3.5105, + "step": 43270 + }, + { + "epoch": 2.94027721157766, + "grad_norm": 0.6981052756309509, + "learning_rate": 0.0006326267155863568, + "loss": 3.4267, + "step": 43275 + }, + { + "epoch": 2.940616931648322, + "grad_norm": 0.982877254486084, + "learning_rate": 0.0006325842505775241, + "loss": 3.5531, + "step": 43280 + }, + { + "epoch": 2.9409566517189836, + "grad_norm": 0.912081778049469, + "learning_rate": 0.0006325417855686915, + "loss": 3.7136, + "step": 43285 + }, + { + "epoch": 2.941296371789645, + "grad_norm": 0.6819454431533813, + "learning_rate": 0.0006324993205598587, + "loss": 3.5485, + "step": 43290 + }, + { + "epoch": 2.941636091860307, + "grad_norm": 0.7500665187835693, + "learning_rate": 0.000632456855551026, + "loss": 3.7018, + "step": 43295 + }, + { + "epoch": 2.941975811930969, + "grad_norm": 0.9001669883728027, + "learning_rate": 0.0006324143905421933, + "loss": 3.7241, + "step": 43300 + }, + { + "epoch": 2.9423155320016305, + "grad_norm": 0.8018466234207153, + "learning_rate": 0.0006323719255333605, + "loss": 3.3514, + "step": 43305 + }, + { + "epoch": 2.942655252072292, + "grad_norm": 1.8716145753860474, + "learning_rate": 0.0006323294605245277, + "loss": 3.5311, + "step": 43310 + }, + { + "epoch": 2.9429949721429542, + "grad_norm": 1.0071454048156738, + "learning_rate": 0.0006322869955156951, + "loss": 3.4074, + "step": 43315 + }, + { + "epoch": 2.943334692213616, + "grad_norm": 1.1681492328643799, + "learning_rate": 0.0006322445305068624, + "loss": 3.7391, + "step": 43320 + }, + { + "epoch": 2.9436744122842775, + "grad_norm": 1.0084863901138306, + "learning_rate": 0.0006322020654980296, + "loss": 3.3787, + "step": 43325 + }, + { + "epoch": 2.9440141323549396, + "grad_norm": 0.8290515542030334, + "learning_rate": 0.000632159600489197, + "loss": 3.4169, + "step": 43330 + }, + { + "epoch": 2.944353852425601, + "grad_norm": 0.8070934414863586, + "learning_rate": 0.0006321171354803642, + "loss": 3.4925, + "step": 43335 + }, + { + "epoch": 2.944693572496263, + "grad_norm": 1.2437325716018677, + "learning_rate": 0.0006320746704715314, + "loss": 3.8333, + "step": 43340 + }, + { + "epoch": 2.945033292566925, + "grad_norm": 1.0463618040084839, + "learning_rate": 0.0006320322054626988, + "loss": 3.5907, + "step": 43345 + }, + { + "epoch": 2.9453730126375866, + "grad_norm": 0.7578727602958679, + "learning_rate": 0.000631989740453866, + "loss": 3.5437, + "step": 43350 + }, + { + "epoch": 2.945712732708248, + "grad_norm": 0.8662514686584473, + "learning_rate": 0.0006319472754450333, + "loss": 3.7216, + "step": 43355 + }, + { + "epoch": 2.9460524527789103, + "grad_norm": 0.7873066663742065, + "learning_rate": 0.0006319048104362007, + "loss": 3.4956, + "step": 43360 + }, + { + "epoch": 2.946392172849572, + "grad_norm": 2.481755256652832, + "learning_rate": 0.0006318623454273679, + "loss": 3.4522, + "step": 43365 + }, + { + "epoch": 2.9467318929202335, + "grad_norm": 1.1874895095825195, + "learning_rate": 0.0006318198804185351, + "loss": 3.7056, + "step": 43370 + }, + { + "epoch": 2.9470716129908956, + "grad_norm": 0.7822505831718445, + "learning_rate": 0.0006317774154097024, + "loss": 3.819, + "step": 43375 + }, + { + "epoch": 2.9474113330615572, + "grad_norm": 0.9745808243751526, + "learning_rate": 0.0006317349504008697, + "loss": 3.3427, + "step": 43380 + }, + { + "epoch": 2.947751053132219, + "grad_norm": 0.8564258813858032, + "learning_rate": 0.0006316924853920369, + "loss": 3.4555, + "step": 43385 + }, + { + "epoch": 2.948090773202881, + "grad_norm": 0.7807032465934753, + "learning_rate": 0.0006316500203832043, + "loss": 3.6997, + "step": 43390 + }, + { + "epoch": 2.9484304932735426, + "grad_norm": 0.8237249255180359, + "learning_rate": 0.0006316075553743716, + "loss": 3.5506, + "step": 43395 + }, + { + "epoch": 2.948770213344204, + "grad_norm": 1.1430631875991821, + "learning_rate": 0.0006315650903655388, + "loss": 3.5451, + "step": 43400 + }, + { + "epoch": 2.9491099334148663, + "grad_norm": 0.7560694813728333, + "learning_rate": 0.0006315226253567061, + "loss": 3.3836, + "step": 43405 + }, + { + "epoch": 2.949449653485528, + "grad_norm": 1.135438084602356, + "learning_rate": 0.0006314801603478733, + "loss": 3.3141, + "step": 43410 + }, + { + "epoch": 2.9497893735561895, + "grad_norm": 0.899219274520874, + "learning_rate": 0.0006314376953390406, + "loss": 3.529, + "step": 43415 + }, + { + "epoch": 2.9501290936268516, + "grad_norm": 0.8624741435050964, + "learning_rate": 0.000631395230330208, + "loss": 3.4018, + "step": 43420 + }, + { + "epoch": 2.9504688136975132, + "grad_norm": 0.7243566513061523, + "learning_rate": 0.0006313527653213752, + "loss": 3.7043, + "step": 43425 + }, + { + "epoch": 2.950808533768175, + "grad_norm": 1.1664994955062866, + "learning_rate": 0.0006313103003125425, + "loss": 3.8445, + "step": 43430 + }, + { + "epoch": 2.951148253838837, + "grad_norm": 1.0812439918518066, + "learning_rate": 0.0006312678353037098, + "loss": 3.8207, + "step": 43435 + }, + { + "epoch": 2.9514879739094986, + "grad_norm": 0.853894054889679, + "learning_rate": 0.000631225370294877, + "loss": 3.6258, + "step": 43440 + }, + { + "epoch": 2.95182769398016, + "grad_norm": 0.8083696961402893, + "learning_rate": 0.0006311829052860443, + "loss": 3.7384, + "step": 43445 + }, + { + "epoch": 2.9521674140508223, + "grad_norm": 0.9763396382331848, + "learning_rate": 0.0006311404402772116, + "loss": 3.4273, + "step": 43450 + }, + { + "epoch": 2.952507134121484, + "grad_norm": 1.1314319372177124, + "learning_rate": 0.0006310979752683789, + "loss": 3.5097, + "step": 43455 + }, + { + "epoch": 2.9528468541921455, + "grad_norm": 0.7408660650253296, + "learning_rate": 0.0006310555102595461, + "loss": 3.5036, + "step": 43460 + }, + { + "epoch": 2.9531865742628076, + "grad_norm": 0.7746257185935974, + "learning_rate": 0.0006310130452507135, + "loss": 3.48, + "step": 43465 + }, + { + "epoch": 2.9535262943334692, + "grad_norm": 0.9968205094337463, + "learning_rate": 0.0006309705802418807, + "loss": 3.7588, + "step": 43470 + }, + { + "epoch": 2.953866014404131, + "grad_norm": 0.9078085422515869, + "learning_rate": 0.0006309281152330479, + "loss": 3.4163, + "step": 43475 + }, + { + "epoch": 2.954205734474793, + "grad_norm": 0.9210789203643799, + "learning_rate": 0.0006308856502242153, + "loss": 3.6515, + "step": 43480 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.8324581384658813, + "learning_rate": 0.0006308431852153825, + "loss": 3.6182, + "step": 43485 + }, + { + "epoch": 2.954885174616116, + "grad_norm": 0.8503854870796204, + "learning_rate": 0.0006308007202065498, + "loss": 3.7338, + "step": 43490 + }, + { + "epoch": 2.9552248946867783, + "grad_norm": 0.7889599204063416, + "learning_rate": 0.0006307582551977172, + "loss": 3.3891, + "step": 43495 + }, + { + "epoch": 2.95556461475744, + "grad_norm": 0.8917528390884399, + "learning_rate": 0.0006307157901888844, + "loss": 3.5133, + "step": 43500 + }, + { + "epoch": 2.9559043348281016, + "grad_norm": 0.8646953105926514, + "learning_rate": 0.0006306733251800516, + "loss": 3.2728, + "step": 43505 + }, + { + "epoch": 2.9562440548987636, + "grad_norm": 0.8279052376747131, + "learning_rate": 0.000630630860171219, + "loss": 3.616, + "step": 43510 + }, + { + "epoch": 2.9565837749694253, + "grad_norm": 1.0515791177749634, + "learning_rate": 0.0006305883951623862, + "loss": 3.2298, + "step": 43515 + }, + { + "epoch": 2.956923495040087, + "grad_norm": 0.7464945912361145, + "learning_rate": 0.0006305459301535534, + "loss": 3.4227, + "step": 43520 + }, + { + "epoch": 2.957263215110749, + "grad_norm": 0.8852419257164001, + "learning_rate": 0.0006305034651447208, + "loss": 3.5605, + "step": 43525 + }, + { + "epoch": 2.9576029351814106, + "grad_norm": 0.9240835309028625, + "learning_rate": 0.0006304610001358881, + "loss": 3.7049, + "step": 43530 + }, + { + "epoch": 2.9579426552520722, + "grad_norm": 0.9475210905075073, + "learning_rate": 0.0006304185351270553, + "loss": 3.4276, + "step": 43535 + }, + { + "epoch": 2.9582823753227343, + "grad_norm": 0.8497651815414429, + "learning_rate": 0.0006303760701182226, + "loss": 3.4251, + "step": 43540 + }, + { + "epoch": 2.958622095393396, + "grad_norm": 0.8122116327285767, + "learning_rate": 0.0006303336051093899, + "loss": 3.6137, + "step": 43545 + }, + { + "epoch": 2.9589618154640576, + "grad_norm": 0.9079899191856384, + "learning_rate": 0.0006302911401005571, + "loss": 3.6699, + "step": 43550 + }, + { + "epoch": 2.9593015355347196, + "grad_norm": 0.9736805558204651, + "learning_rate": 0.0006302486750917244, + "loss": 3.2371, + "step": 43555 + }, + { + "epoch": 2.9596412556053813, + "grad_norm": 0.7811906933784485, + "learning_rate": 0.0006302062100828917, + "loss": 3.585, + "step": 43560 + }, + { + "epoch": 2.959980975676043, + "grad_norm": 0.7673446536064148, + "learning_rate": 0.000630163745074059, + "loss": 3.335, + "step": 43565 + }, + { + "epoch": 2.960320695746705, + "grad_norm": 1.032315969467163, + "learning_rate": 0.0006301212800652263, + "loss": 3.4178, + "step": 43570 + }, + { + "epoch": 2.9606604158173666, + "grad_norm": 0.896633505821228, + "learning_rate": 0.0006300788150563935, + "loss": 3.7565, + "step": 43575 + }, + { + "epoch": 2.9610001358880282, + "grad_norm": 0.890682578086853, + "learning_rate": 0.0006300363500475608, + "loss": 3.6056, + "step": 43580 + }, + { + "epoch": 2.9613398559586903, + "grad_norm": 0.8713335990905762, + "learning_rate": 0.0006299938850387281, + "loss": 3.5676, + "step": 43585 + }, + { + "epoch": 2.961679576029352, + "grad_norm": 0.6371521949768066, + "learning_rate": 0.0006299514200298953, + "loss": 3.7964, + "step": 43590 + }, + { + "epoch": 2.9620192961000136, + "grad_norm": 0.8617971539497375, + "learning_rate": 0.0006299089550210628, + "loss": 3.5311, + "step": 43595 + }, + { + "epoch": 2.962359016170675, + "grad_norm": 0.7828670144081116, + "learning_rate": 0.00062986649001223, + "loss": 3.6739, + "step": 43600 + }, + { + "epoch": 2.9626987362413373, + "grad_norm": 0.6622208952903748, + "learning_rate": 0.0006298240250033972, + "loss": 3.7072, + "step": 43605 + }, + { + "epoch": 2.963038456311999, + "grad_norm": 0.946430504322052, + "learning_rate": 0.0006297815599945645, + "loss": 3.5822, + "step": 43610 + }, + { + "epoch": 2.9633781763826605, + "grad_norm": 0.7767845988273621, + "learning_rate": 0.0006297390949857318, + "loss": 3.3587, + "step": 43615 + }, + { + "epoch": 2.9637178964533226, + "grad_norm": 0.7529277205467224, + "learning_rate": 0.000629696629976899, + "loss": 3.6809, + "step": 43620 + }, + { + "epoch": 2.9640576165239843, + "grad_norm": 0.8006216287612915, + "learning_rate": 0.0006296541649680663, + "loss": 3.6535, + "step": 43625 + }, + { + "epoch": 2.964397336594646, + "grad_norm": 1.1268209218978882, + "learning_rate": 0.0006296116999592337, + "loss": 3.3267, + "step": 43630 + }, + { + "epoch": 2.9647370566653075, + "grad_norm": 0.9150340557098389, + "learning_rate": 0.0006295692349504009, + "loss": 3.6054, + "step": 43635 + }, + { + "epoch": 2.9650767767359696, + "grad_norm": 1.0391710996627808, + "learning_rate": 0.0006295267699415682, + "loss": 3.5877, + "step": 43640 + }, + { + "epoch": 2.965416496806631, + "grad_norm": 0.8927555680274963, + "learning_rate": 0.0006294843049327355, + "loss": 3.6138, + "step": 43645 + }, + { + "epoch": 2.965756216877293, + "grad_norm": 0.976182758808136, + "learning_rate": 0.0006294418399239027, + "loss": 3.5691, + "step": 43650 + }, + { + "epoch": 2.966095936947955, + "grad_norm": 0.7083033919334412, + "learning_rate": 0.00062939937491507, + "loss": 3.74, + "step": 43655 + }, + { + "epoch": 2.9664356570186166, + "grad_norm": 0.8380586504936218, + "learning_rate": 0.0006293569099062372, + "loss": 3.7151, + "step": 43660 + }, + { + "epoch": 2.966775377089278, + "grad_norm": 0.7940205931663513, + "learning_rate": 0.0006293144448974046, + "loss": 3.5396, + "step": 43665 + }, + { + "epoch": 2.9671150971599403, + "grad_norm": 0.828355073928833, + "learning_rate": 0.0006292719798885719, + "loss": 3.7346, + "step": 43670 + }, + { + "epoch": 2.967454817230602, + "grad_norm": 0.8053918480873108, + "learning_rate": 0.0006292295148797391, + "loss": 3.5241, + "step": 43675 + }, + { + "epoch": 2.9677945373012635, + "grad_norm": 0.9326063990592957, + "learning_rate": 0.0006291870498709064, + "loss": 3.591, + "step": 43680 + }, + { + "epoch": 2.9681342573719256, + "grad_norm": 0.7502681612968445, + "learning_rate": 0.0006291445848620737, + "loss": 3.615, + "step": 43685 + }, + { + "epoch": 2.9684739774425872, + "grad_norm": 0.7092805504798889, + "learning_rate": 0.0006291021198532409, + "loss": 3.4004, + "step": 43690 + }, + { + "epoch": 2.968813697513249, + "grad_norm": 0.9647485017776489, + "learning_rate": 0.0006290596548444081, + "loss": 3.7945, + "step": 43695 + }, + { + "epoch": 2.969153417583911, + "grad_norm": 1.009060263633728, + "learning_rate": 0.0006290171898355756, + "loss": 3.553, + "step": 43700 + }, + { + "epoch": 2.9694931376545726, + "grad_norm": 0.9841878414154053, + "learning_rate": 0.0006289747248267428, + "loss": 3.5059, + "step": 43705 + }, + { + "epoch": 2.969832857725234, + "grad_norm": 0.8084771037101746, + "learning_rate": 0.00062893225981791, + "loss": 3.5513, + "step": 43710 + }, + { + "epoch": 2.9701725777958963, + "grad_norm": 1.0631623268127441, + "learning_rate": 0.0006288897948090774, + "loss": 3.4013, + "step": 43715 + }, + { + "epoch": 2.970512297866558, + "grad_norm": 0.7629331350326538, + "learning_rate": 0.0006288473298002446, + "loss": 3.3826, + "step": 43720 + }, + { + "epoch": 2.9708520179372195, + "grad_norm": 0.9964699745178223, + "learning_rate": 0.0006288048647914118, + "loss": 3.4805, + "step": 43725 + }, + { + "epoch": 2.9711917380078816, + "grad_norm": 1.0695881843566895, + "learning_rate": 0.0006287623997825792, + "loss": 3.6854, + "step": 43730 + }, + { + "epoch": 2.9715314580785432, + "grad_norm": 2.554194450378418, + "learning_rate": 0.0006287199347737465, + "loss": 3.579, + "step": 43735 + }, + { + "epoch": 2.971871178149205, + "grad_norm": 0.9483156800270081, + "learning_rate": 0.0006286774697649137, + "loss": 3.4945, + "step": 43740 + }, + { + "epoch": 2.972210898219867, + "grad_norm": 0.8698270916938782, + "learning_rate": 0.0006286350047560811, + "loss": 3.7651, + "step": 43745 + }, + { + "epoch": 2.9725506182905286, + "grad_norm": 0.7211974263191223, + "learning_rate": 0.0006285925397472483, + "loss": 3.6415, + "step": 43750 + }, + { + "epoch": 2.97289033836119, + "grad_norm": 0.9575448036193848, + "learning_rate": 0.0006285500747384155, + "loss": 3.5337, + "step": 43755 + }, + { + "epoch": 2.9732300584318523, + "grad_norm": 3.640312433242798, + "learning_rate": 0.0006285076097295828, + "loss": 3.8455, + "step": 43760 + }, + { + "epoch": 2.973569778502514, + "grad_norm": 1.1740853786468506, + "learning_rate": 0.0006284651447207501, + "loss": 3.4268, + "step": 43765 + }, + { + "epoch": 2.9739094985731755, + "grad_norm": 0.6632174849510193, + "learning_rate": 0.0006284226797119174, + "loss": 3.6532, + "step": 43770 + }, + { + "epoch": 2.9742492186438376, + "grad_norm": 1.003337025642395, + "learning_rate": 0.0006283802147030847, + "loss": 3.3331, + "step": 43775 + }, + { + "epoch": 2.9745889387144993, + "grad_norm": 0.789198637008667, + "learning_rate": 0.000628337749694252, + "loss": 3.691, + "step": 43780 + }, + { + "epoch": 2.974928658785161, + "grad_norm": 0.8345378637313843, + "learning_rate": 0.0006282952846854192, + "loss": 3.5693, + "step": 43785 + }, + { + "epoch": 2.975268378855823, + "grad_norm": 0.8690646886825562, + "learning_rate": 0.0006282528196765865, + "loss": 3.6689, + "step": 43790 + }, + { + "epoch": 2.9756080989264846, + "grad_norm": 0.9681132435798645, + "learning_rate": 0.0006282103546677537, + "loss": 3.5829, + "step": 43795 + }, + { + "epoch": 2.975947818997146, + "grad_norm": 1.063088059425354, + "learning_rate": 0.000628167889658921, + "loss": 3.5155, + "step": 43800 + }, + { + "epoch": 2.9762875390678083, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0006281254246500884, + "loss": 3.5274, + "step": 43805 + }, + { + "epoch": 2.97662725913847, + "grad_norm": 32.01709747314453, + "learning_rate": 0.0006280829596412556, + "loss": 3.4627, + "step": 43810 + }, + { + "epoch": 2.9769669792091316, + "grad_norm": 0.7997117042541504, + "learning_rate": 0.0006280404946324229, + "loss": 3.6512, + "step": 43815 + }, + { + "epoch": 2.9773066992797936, + "grad_norm": 0.8450111150741577, + "learning_rate": 0.0006279980296235902, + "loss": 3.8375, + "step": 43820 + }, + { + "epoch": 2.9776464193504553, + "grad_norm": 0.7318648099899292, + "learning_rate": 0.0006279555646147574, + "loss": 3.5221, + "step": 43825 + }, + { + "epoch": 2.977986139421117, + "grad_norm": 0.7862438559532166, + "learning_rate": 0.0006279130996059247, + "loss": 3.4445, + "step": 43830 + }, + { + "epoch": 2.978325859491779, + "grad_norm": 1.0134919881820679, + "learning_rate": 0.000627870634597092, + "loss": 3.1235, + "step": 43835 + }, + { + "epoch": 2.9786655795624406, + "grad_norm": 0.748907744884491, + "learning_rate": 0.0006278281695882593, + "loss": 3.6696, + "step": 43840 + }, + { + "epoch": 2.9790052996331022, + "grad_norm": 0.7317928075790405, + "learning_rate": 0.0006277857045794265, + "loss": 3.6252, + "step": 43845 + }, + { + "epoch": 2.9793450197037643, + "grad_norm": 0.96550452709198, + "learning_rate": 0.0006277432395705939, + "loss": 3.6634, + "step": 43850 + }, + { + "epoch": 2.979684739774426, + "grad_norm": 0.7576945424079895, + "learning_rate": 0.0006277007745617611, + "loss": 3.3617, + "step": 43855 + }, + { + "epoch": 2.9800244598450876, + "grad_norm": 1.1861507892608643, + "learning_rate": 0.0006276583095529283, + "loss": 3.5405, + "step": 43860 + }, + { + "epoch": 2.9803641799157496, + "grad_norm": 0.730749785900116, + "learning_rate": 0.0006276158445440957, + "loss": 3.4514, + "step": 43865 + }, + { + "epoch": 2.9807038999864113, + "grad_norm": 0.8672133088111877, + "learning_rate": 0.0006275733795352629, + "loss": 3.5637, + "step": 43870 + }, + { + "epoch": 2.981043620057073, + "grad_norm": 2.8264951705932617, + "learning_rate": 0.0006275309145264302, + "loss": 3.3988, + "step": 43875 + }, + { + "epoch": 2.981383340127735, + "grad_norm": 0.8535977005958557, + "learning_rate": 0.0006274884495175976, + "loss": 3.7197, + "step": 43880 + }, + { + "epoch": 2.9817230601983966, + "grad_norm": 0.9903735518455505, + "learning_rate": 0.0006274459845087648, + "loss": 3.3938, + "step": 43885 + }, + { + "epoch": 2.9820627802690582, + "grad_norm": 0.9908225536346436, + "learning_rate": 0.000627403519499932, + "loss": 3.3101, + "step": 43890 + }, + { + "epoch": 2.9824025003397203, + "grad_norm": 0.9925866723060608, + "learning_rate": 0.0006273610544910993, + "loss": 3.9433, + "step": 43895 + }, + { + "epoch": 2.982742220410382, + "grad_norm": 1.162853717803955, + "learning_rate": 0.0006273185894822666, + "loss": 3.5949, + "step": 43900 + }, + { + "epoch": 2.9830819404810436, + "grad_norm": 1.0826687812805176, + "learning_rate": 0.0006272761244734338, + "loss": 3.5571, + "step": 43905 + }, + { + "epoch": 2.9834216605517057, + "grad_norm": 0.9977321624755859, + "learning_rate": 0.0006272336594646012, + "loss": 3.5517, + "step": 43910 + }, + { + "epoch": 2.9837613806223673, + "grad_norm": 0.7518835663795471, + "learning_rate": 0.0006271911944557685, + "loss": 3.6915, + "step": 43915 + }, + { + "epoch": 2.984101100693029, + "grad_norm": 0.851621687412262, + "learning_rate": 0.0006271487294469357, + "loss": 3.5073, + "step": 43920 + }, + { + "epoch": 2.984440820763691, + "grad_norm": 0.8327270746231079, + "learning_rate": 0.000627106264438103, + "loss": 3.6121, + "step": 43925 + }, + { + "epoch": 2.9847805408343526, + "grad_norm": 0.7813951969146729, + "learning_rate": 0.0006270637994292703, + "loss": 3.3577, + "step": 43930 + }, + { + "epoch": 2.9851202609050143, + "grad_norm": 1.10562002658844, + "learning_rate": 0.0006270213344204376, + "loss": 3.1745, + "step": 43935 + }, + { + "epoch": 2.985459980975676, + "grad_norm": 0.9436803460121155, + "learning_rate": 0.0006269788694116048, + "loss": 3.4879, + "step": 43940 + }, + { + "epoch": 2.985799701046338, + "grad_norm": 0.9433669447898865, + "learning_rate": 0.0006269364044027721, + "loss": 3.4553, + "step": 43945 + }, + { + "epoch": 2.9861394211169996, + "grad_norm": 0.8205870985984802, + "learning_rate": 0.0006268939393939395, + "loss": 3.5655, + "step": 43950 + }, + { + "epoch": 2.9864791411876612, + "grad_norm": 0.9097357392311096, + "learning_rate": 0.0006268514743851067, + "loss": 3.5946, + "step": 43955 + }, + { + "epoch": 2.9868188612583233, + "grad_norm": 4.907651901245117, + "learning_rate": 0.0006268090093762739, + "loss": 3.7784, + "step": 43960 + }, + { + "epoch": 2.987158581328985, + "grad_norm": 0.8768441081047058, + "learning_rate": 0.0006267665443674413, + "loss": 3.6149, + "step": 43965 + }, + { + "epoch": 2.9874983013996466, + "grad_norm": 0.8885760307312012, + "learning_rate": 0.0006267240793586085, + "loss": 3.7244, + "step": 43970 + }, + { + "epoch": 2.987838021470308, + "grad_norm": 0.6690799593925476, + "learning_rate": 0.0006266816143497757, + "loss": 3.4406, + "step": 43975 + }, + { + "epoch": 2.9881777415409703, + "grad_norm": 0.7571190595626831, + "learning_rate": 0.0006266391493409432, + "loss": 3.5412, + "step": 43980 + }, + { + "epoch": 2.988517461611632, + "grad_norm": 0.791172444820404, + "learning_rate": 0.0006265966843321104, + "loss": 3.3776, + "step": 43985 + }, + { + "epoch": 2.9888571816822935, + "grad_norm": 1.2130539417266846, + "learning_rate": 0.0006265542193232776, + "loss": 3.7053, + "step": 43990 + }, + { + "epoch": 2.9891969017529556, + "grad_norm": 1.0458229780197144, + "learning_rate": 0.000626511754314445, + "loss": 3.7718, + "step": 43995 + }, + { + "epoch": 2.9895366218236172, + "grad_norm": 0.8193138241767883, + "learning_rate": 0.0006264692893056122, + "loss": 3.4668, + "step": 44000 + }, + { + "epoch": 2.989876341894279, + "grad_norm": 1.0119136571884155, + "learning_rate": 0.0006264268242967794, + "loss": 3.5477, + "step": 44005 + }, + { + "epoch": 2.990216061964941, + "grad_norm": 1.0374222993850708, + "learning_rate": 0.0006263843592879468, + "loss": 3.3846, + "step": 44010 + }, + { + "epoch": 2.9905557820356026, + "grad_norm": 0.9011834859848022, + "learning_rate": 0.0006263418942791141, + "loss": 3.7136, + "step": 44015 + }, + { + "epoch": 2.990895502106264, + "grad_norm": 0.9106399416923523, + "learning_rate": 0.0006262994292702813, + "loss": 3.5893, + "step": 44020 + }, + { + "epoch": 2.9912352221769263, + "grad_norm": 0.8067495822906494, + "learning_rate": 0.0006262569642614486, + "loss": 3.547, + "step": 44025 + }, + { + "epoch": 2.991574942247588, + "grad_norm": 0.9715847373008728, + "learning_rate": 0.0006262144992526159, + "loss": 4.132, + "step": 44030 + }, + { + "epoch": 2.9919146623182495, + "grad_norm": 1.0100535154342651, + "learning_rate": 0.0006261720342437831, + "loss": 3.4858, + "step": 44035 + }, + { + "epoch": 2.9922543823889116, + "grad_norm": 0.724033772945404, + "learning_rate": 0.0006261295692349504, + "loss": 3.5014, + "step": 44040 + }, + { + "epoch": 2.9925941024595732, + "grad_norm": 0.6782093048095703, + "learning_rate": 0.0006260871042261178, + "loss": 3.6676, + "step": 44045 + }, + { + "epoch": 2.992933822530235, + "grad_norm": 0.7685904502868652, + "learning_rate": 0.000626044639217285, + "loss": 3.6652, + "step": 44050 + }, + { + "epoch": 2.993273542600897, + "grad_norm": 0.9054877758026123, + "learning_rate": 0.0006260021742084523, + "loss": 3.7615, + "step": 44055 + }, + { + "epoch": 2.9936132626715586, + "grad_norm": 1.0656733512878418, + "learning_rate": 0.0006259597091996195, + "loss": 3.6264, + "step": 44060 + }, + { + "epoch": 2.99395298274222, + "grad_norm": 0.9266199469566345, + "learning_rate": 0.0006259172441907868, + "loss": 3.4897, + "step": 44065 + }, + { + "epoch": 2.9942927028128823, + "grad_norm": 1.197279930114746, + "learning_rate": 0.0006258747791819541, + "loss": 3.659, + "step": 44070 + }, + { + "epoch": 2.994632422883544, + "grad_norm": 0.8734085559844971, + "learning_rate": 0.0006258323141731213, + "loss": 3.5211, + "step": 44075 + }, + { + "epoch": 2.9949721429542056, + "grad_norm": 1.197360634803772, + "learning_rate": 0.0006257898491642887, + "loss": 3.3594, + "step": 44080 + }, + { + "epoch": 2.9953118630248676, + "grad_norm": 1.0139802694320679, + "learning_rate": 0.000625747384155456, + "loss": 3.613, + "step": 44085 + }, + { + "epoch": 2.9956515830955293, + "grad_norm": 0.850475013256073, + "learning_rate": 0.0006257049191466232, + "loss": 3.4573, + "step": 44090 + }, + { + "epoch": 2.995991303166191, + "grad_norm": 0.7804518342018127, + "learning_rate": 0.0006256624541377904, + "loss": 3.7573, + "step": 44095 + }, + { + "epoch": 2.996331023236853, + "grad_norm": 0.9392233490943909, + "learning_rate": 0.0006256199891289578, + "loss": 3.6369, + "step": 44100 + }, + { + "epoch": 2.9966707433075146, + "grad_norm": 0.7866889238357544, + "learning_rate": 0.000625577524120125, + "loss": 3.7267, + "step": 44105 + }, + { + "epoch": 2.9970104633781762, + "grad_norm": 0.9338416457176208, + "learning_rate": 0.0006255350591112922, + "loss": 3.4692, + "step": 44110 + }, + { + "epoch": 2.9973501834488383, + "grad_norm": 0.9634619355201721, + "learning_rate": 0.0006254925941024597, + "loss": 3.3781, + "step": 44115 + }, + { + "epoch": 2.9976899035195, + "grad_norm": 0.7630186080932617, + "learning_rate": 0.0006254501290936269, + "loss": 3.7003, + "step": 44120 + }, + { + "epoch": 2.9980296235901616, + "grad_norm": 0.7857458591461182, + "learning_rate": 0.0006254076640847941, + "loss": 3.7346, + "step": 44125 + }, + { + "epoch": 2.9983693436608236, + "grad_norm": 1.0854766368865967, + "learning_rate": 0.0006253651990759615, + "loss": 3.7299, + "step": 44130 + }, + { + "epoch": 2.9987090637314853, + "grad_norm": 0.7907107472419739, + "learning_rate": 0.0006253227340671287, + "loss": 3.6409, + "step": 44135 + }, + { + "epoch": 2.999048783802147, + "grad_norm": 0.7531028985977173, + "learning_rate": 0.0006252802690582959, + "loss": 3.3213, + "step": 44140 + }, + { + "epoch": 2.999388503872809, + "grad_norm": 0.7596555352210999, + "learning_rate": 0.0006252378040494632, + "loss": 3.5946, + "step": 44145 + }, + { + "epoch": 2.9997282239434706, + "grad_norm": 0.8681999444961548, + "learning_rate": 0.0006251953390406306, + "loss": 3.5135, + "step": 44150 + }, + { + "epoch": 3.0, + "eval_bertscore": { + "f1": 0.8416656346581728, + "precision": 0.8425698047264658, + "recall": 0.8415703780545504 + }, + "eval_bleu_4": 0.01943429105044029, + "eval_exact_match": 0.0002907258455276674, + "eval_loss": 3.4496495723724365, + "eval_meteor": 0.08801618287030327, + "eval_rouge": { + "rouge1": 0.11647513793918819, + "rouge2": 0.016941052292240015, + "rougeL": 0.10153187772902783, + "rougeLsum": 0.10147342087872277 + }, + "eval_runtime": 1653.3529, + "eval_samples_per_second": 6.241, + "eval_steps_per_second": 0.78, + "step": 44154 + }, + { + "epoch": 3.0000679440141322, + "grad_norm": 0.9345661401748657, + "learning_rate": 0.0006251528740317978, + "loss": 3.467, + "step": 44155 + }, + { + "epoch": 3.0004076640847943, + "grad_norm": 0.8705325126647949, + "learning_rate": 0.0006251104090229651, + "loss": 3.6194, + "step": 44160 + }, + { + "epoch": 3.000747384155456, + "grad_norm": 0.9998478889465332, + "learning_rate": 0.0006250679440141324, + "loss": 3.5601, + "step": 44165 + }, + { + "epoch": 3.0010871042261176, + "grad_norm": 0.8781781792640686, + "learning_rate": 0.0006250254790052996, + "loss": 3.3275, + "step": 44170 + }, + { + "epoch": 3.0014268242967796, + "grad_norm": 1.110169768333435, + "learning_rate": 0.0006249830139964669, + "loss": 3.5681, + "step": 44175 + }, + { + "epoch": 3.0017665443674413, + "grad_norm": 0.8509441614151001, + "learning_rate": 0.0006249405489876342, + "loss": 3.4585, + "step": 44180 + }, + { + "epoch": 3.002106264438103, + "grad_norm": 0.9422857165336609, + "learning_rate": 0.0006248980839788015, + "loss": 3.3093, + "step": 44185 + }, + { + "epoch": 3.002445984508765, + "grad_norm": 0.714918851852417, + "learning_rate": 0.0006248556189699688, + "loss": 3.7215, + "step": 44190 + }, + { + "epoch": 3.0027857045794266, + "grad_norm": 0.9351991415023804, + "learning_rate": 0.000624813153961136, + "loss": 3.3067, + "step": 44195 + }, + { + "epoch": 3.0031254246500882, + "grad_norm": 0.808068037033081, + "learning_rate": 0.0006247706889523033, + "loss": 3.414, + "step": 44200 + }, + { + "epoch": 3.0034651447207503, + "grad_norm": 0.9515233039855957, + "learning_rate": 0.0006247282239434706, + "loss": 3.6038, + "step": 44205 + }, + { + "epoch": 3.003804864791412, + "grad_norm": 0.789360761642456, + "learning_rate": 0.0006246857589346378, + "loss": 3.5083, + "step": 44210 + }, + { + "epoch": 3.0041445848620736, + "grad_norm": 0.7440232634544373, + "learning_rate": 0.0006246432939258051, + "loss": 3.5356, + "step": 44215 + }, + { + "epoch": 3.004484304932735, + "grad_norm": 1.1063082218170166, + "learning_rate": 0.0006246008289169725, + "loss": 3.45, + "step": 44220 + }, + { + "epoch": 3.0048240250033973, + "grad_norm": 0.9479272961616516, + "learning_rate": 0.0006245583639081397, + "loss": 3.5763, + "step": 44225 + }, + { + "epoch": 3.005163745074059, + "grad_norm": 0.9309040307998657, + "learning_rate": 0.000624515898899307, + "loss": 3.3634, + "step": 44230 + }, + { + "epoch": 3.0055034651447206, + "grad_norm": 1.0334558486938477, + "learning_rate": 0.0006244734338904743, + "loss": 3.2965, + "step": 44235 + }, + { + "epoch": 3.0058431852153826, + "grad_norm": 0.7450209259986877, + "learning_rate": 0.0006244309688816415, + "loss": 3.5487, + "step": 44240 + }, + { + "epoch": 3.0061829052860443, + "grad_norm": 0.903424084186554, + "learning_rate": 0.0006243885038728087, + "loss": 3.6116, + "step": 44245 + }, + { + "epoch": 3.006522625356706, + "grad_norm": 1.0446337461471558, + "learning_rate": 0.0006243460388639761, + "loss": 3.5711, + "step": 44250 + }, + { + "epoch": 3.006862345427368, + "grad_norm": 0.9547023773193359, + "learning_rate": 0.0006243035738551434, + "loss": 3.4722, + "step": 44255 + }, + { + "epoch": 3.0072020654980296, + "grad_norm": 1.0250574350357056, + "learning_rate": 0.0006242611088463106, + "loss": 3.8727, + "step": 44260 + }, + { + "epoch": 3.0075417855686912, + "grad_norm": 0.8594845533370972, + "learning_rate": 0.000624218643837478, + "loss": 3.6842, + "step": 44265 + }, + { + "epoch": 3.0078815056393533, + "grad_norm": 0.9301007986068726, + "learning_rate": 0.0006241761788286452, + "loss": 3.1437, + "step": 44270 + }, + { + "epoch": 3.008221225710015, + "grad_norm": 1.499562382698059, + "learning_rate": 0.0006241337138198124, + "loss": 3.4057, + "step": 44275 + }, + { + "epoch": 3.0085609457806766, + "grad_norm": 0.8325976133346558, + "learning_rate": 0.0006240912488109798, + "loss": 3.7138, + "step": 44280 + }, + { + "epoch": 3.0089006658513386, + "grad_norm": 0.9880672097206116, + "learning_rate": 0.000624048783802147, + "loss": 3.4226, + "step": 44285 + }, + { + "epoch": 3.0092403859220003, + "grad_norm": 3.694622755050659, + "learning_rate": 0.0006240063187933144, + "loss": 3.4941, + "step": 44290 + }, + { + "epoch": 3.009580105992662, + "grad_norm": 0.7028996348381042, + "learning_rate": 0.0006239638537844816, + "loss": 3.5435, + "step": 44295 + }, + { + "epoch": 3.009919826063324, + "grad_norm": 0.7484161853790283, + "learning_rate": 0.0006239213887756489, + "loss": 3.452, + "step": 44300 + }, + { + "epoch": 3.0102595461339856, + "grad_norm": 0.8534006476402283, + "learning_rate": 0.0006238789237668162, + "loss": 3.4054, + "step": 44305 + }, + { + "epoch": 3.0105992662046472, + "grad_norm": 1.0042009353637695, + "learning_rate": 0.0006238364587579834, + "loss": 3.6941, + "step": 44310 + }, + { + "epoch": 3.0109389862753093, + "grad_norm": 1.1418033838272095, + "learning_rate": 0.0006237939937491507, + "loss": 3.5719, + "step": 44315 + }, + { + "epoch": 3.011278706345971, + "grad_norm": 0.8523388504981995, + "learning_rate": 0.000623751528740318, + "loss": 3.4433, + "step": 44320 + }, + { + "epoch": 3.0116184264166326, + "grad_norm": 4.097689628601074, + "learning_rate": 0.0006237090637314853, + "loss": 3.5483, + "step": 44325 + }, + { + "epoch": 3.0119581464872947, + "grad_norm": 0.7934311628341675, + "learning_rate": 0.0006236665987226526, + "loss": 3.7982, + "step": 44330 + }, + { + "epoch": 3.0122978665579563, + "grad_norm": 0.7928743362426758, + "learning_rate": 0.0006236241337138199, + "loss": 3.5018, + "step": 44335 + }, + { + "epoch": 3.012637586628618, + "grad_norm": 0.8081720471382141, + "learning_rate": 0.0006235816687049871, + "loss": 3.4057, + "step": 44340 + }, + { + "epoch": 3.01297730669928, + "grad_norm": 1.053788423538208, + "learning_rate": 0.0006235392036961543, + "loss": 3.6835, + "step": 44345 + }, + { + "epoch": 3.0133170267699416, + "grad_norm": 0.7519021034240723, + "learning_rate": 0.0006234967386873217, + "loss": 3.5405, + "step": 44350 + }, + { + "epoch": 3.0136567468406033, + "grad_norm": 1.3951926231384277, + "learning_rate": 0.0006234542736784889, + "loss": 3.6297, + "step": 44355 + }, + { + "epoch": 3.0139964669112653, + "grad_norm": 0.8109663724899292, + "learning_rate": 0.0006234118086696562, + "loss": 3.5586, + "step": 44360 + }, + { + "epoch": 3.014336186981927, + "grad_norm": 0.86181640625, + "learning_rate": 0.0006233693436608236, + "loss": 3.4603, + "step": 44365 + }, + { + "epoch": 3.0146759070525886, + "grad_norm": 0.7893531918525696, + "learning_rate": 0.0006233268786519908, + "loss": 3.5132, + "step": 44370 + }, + { + "epoch": 3.01501562712325, + "grad_norm": 0.7680171132087708, + "learning_rate": 0.000623284413643158, + "loss": 3.598, + "step": 44375 + }, + { + "epoch": 3.0153553471939123, + "grad_norm": 0.9623697996139526, + "learning_rate": 0.0006232419486343254, + "loss": 3.3849, + "step": 44380 + }, + { + "epoch": 3.015695067264574, + "grad_norm": 1.0252223014831543, + "learning_rate": 0.0006231994836254926, + "loss": 3.617, + "step": 44385 + }, + { + "epoch": 3.0160347873352356, + "grad_norm": 1.1469957828521729, + "learning_rate": 0.0006231570186166598, + "loss": 3.5966, + "step": 44390 + }, + { + "epoch": 3.0163745074058976, + "grad_norm": 0.7087029814720154, + "learning_rate": 0.0006231145536078272, + "loss": 3.3763, + "step": 44395 + }, + { + "epoch": 3.0167142274765593, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0006230720885989945, + "loss": 3.756, + "step": 44400 + }, + { + "epoch": 3.017053947547221, + "grad_norm": 1.226208209991455, + "learning_rate": 0.0006230296235901617, + "loss": 3.3829, + "step": 44405 + }, + { + "epoch": 3.017393667617883, + "grad_norm": 0.9203476905822754, + "learning_rate": 0.000622987158581329, + "loss": 3.4185, + "step": 44410 + }, + { + "epoch": 3.0177333876885446, + "grad_norm": 0.9021828770637512, + "learning_rate": 0.0006229446935724963, + "loss": 3.7763, + "step": 44415 + }, + { + "epoch": 3.0180731077592062, + "grad_norm": 0.9706690311431885, + "learning_rate": 0.0006229022285636635, + "loss": 3.6524, + "step": 44420 + }, + { + "epoch": 3.0184128278298683, + "grad_norm": 0.8379446268081665, + "learning_rate": 0.0006228597635548308, + "loss": 3.7595, + "step": 44425 + }, + { + "epoch": 3.01875254790053, + "grad_norm": 0.6765066385269165, + "learning_rate": 0.0006228172985459982, + "loss": 3.651, + "step": 44430 + }, + { + "epoch": 3.0190922679711916, + "grad_norm": 0.845808744430542, + "learning_rate": 0.0006227748335371654, + "loss": 3.259, + "step": 44435 + }, + { + "epoch": 3.0194319880418536, + "grad_norm": 0.7800790667533875, + "learning_rate": 0.0006227323685283327, + "loss": 3.8262, + "step": 44440 + }, + { + "epoch": 3.0197717081125153, + "grad_norm": 0.8516272306442261, + "learning_rate": 0.0006226899035194999, + "loss": 3.5465, + "step": 44445 + }, + { + "epoch": 3.020111428183177, + "grad_norm": 0.9402086138725281, + "learning_rate": 0.0006226474385106672, + "loss": 3.5577, + "step": 44450 + }, + { + "epoch": 3.020451148253839, + "grad_norm": 1.080100655555725, + "learning_rate": 0.0006226049735018345, + "loss": 3.3875, + "step": 44455 + }, + { + "epoch": 3.0207908683245006, + "grad_norm": 0.81032794713974, + "learning_rate": 0.0006225625084930017, + "loss": 3.4286, + "step": 44460 + }, + { + "epoch": 3.0211305883951622, + "grad_norm": 0.9983835220336914, + "learning_rate": 0.0006225200434841691, + "loss": 3.5877, + "step": 44465 + }, + { + "epoch": 3.0214703084658243, + "grad_norm": 0.9937325716018677, + "learning_rate": 0.0006224775784753364, + "loss": 3.4271, + "step": 44470 + }, + { + "epoch": 3.021810028536486, + "grad_norm": 0.8771826028823853, + "learning_rate": 0.0006224351134665036, + "loss": 3.5455, + "step": 44475 + }, + { + "epoch": 3.0221497486071476, + "grad_norm": 1.0209681987762451, + "learning_rate": 0.0006223926484576708, + "loss": 3.666, + "step": 44480 + }, + { + "epoch": 3.0224894686778097, + "grad_norm": 1.0476527214050293, + "learning_rate": 0.0006223501834488382, + "loss": 3.787, + "step": 44485 + }, + { + "epoch": 3.0228291887484713, + "grad_norm": 0.7307692766189575, + "learning_rate": 0.0006223077184400054, + "loss": 3.2096, + "step": 44490 + }, + { + "epoch": 3.023168908819133, + "grad_norm": 0.9630268812179565, + "learning_rate": 0.0006222652534311726, + "loss": 3.4916, + "step": 44495 + }, + { + "epoch": 3.023508628889795, + "grad_norm": 0.7601862549781799, + "learning_rate": 0.0006222227884223401, + "loss": 3.5781, + "step": 44500 + }, + { + "epoch": 3.0238483489604566, + "grad_norm": 0.8575972318649292, + "learning_rate": 0.0006221803234135073, + "loss": 3.3468, + "step": 44505 + }, + { + "epoch": 3.0241880690311183, + "grad_norm": 0.8919970393180847, + "learning_rate": 0.0006221378584046745, + "loss": 3.6038, + "step": 44510 + }, + { + "epoch": 3.0245277891017803, + "grad_norm": 0.8413810133934021, + "learning_rate": 0.0006220953933958419, + "loss": 3.4909, + "step": 44515 + }, + { + "epoch": 3.024867509172442, + "grad_norm": 0.8939806222915649, + "learning_rate": 0.0006220529283870091, + "loss": 3.455, + "step": 44520 + }, + { + "epoch": 3.0252072292431036, + "grad_norm": 0.9620987772941589, + "learning_rate": 0.0006220104633781763, + "loss": 3.4879, + "step": 44525 + }, + { + "epoch": 3.0255469493137657, + "grad_norm": 0.824705958366394, + "learning_rate": 0.0006219679983693436, + "loss": 3.6444, + "step": 44530 + }, + { + "epoch": 3.0258866693844273, + "grad_norm": 1.0015169382095337, + "learning_rate": 0.000621925533360511, + "loss": 3.4487, + "step": 44535 + }, + { + "epoch": 3.026226389455089, + "grad_norm": 0.8214786648750305, + "learning_rate": 0.0006218830683516782, + "loss": 4.035, + "step": 44540 + }, + { + "epoch": 3.026566109525751, + "grad_norm": 0.9318434596061707, + "learning_rate": 0.0006218406033428455, + "loss": 3.1942, + "step": 44545 + }, + { + "epoch": 3.0269058295964126, + "grad_norm": 1.1608176231384277, + "learning_rate": 0.0006217981383340128, + "loss": 3.5268, + "step": 44550 + }, + { + "epoch": 3.0272455496670743, + "grad_norm": 0.7261649966239929, + "learning_rate": 0.00062175567332518, + "loss": 3.5752, + "step": 44555 + }, + { + "epoch": 3.027585269737736, + "grad_norm": 0.9563938975334167, + "learning_rate": 0.0006217132083163473, + "loss": 3.4287, + "step": 44560 + }, + { + "epoch": 3.027924989808398, + "grad_norm": 0.9009677767753601, + "learning_rate": 0.0006216707433075146, + "loss": 3.6337, + "step": 44565 + }, + { + "epoch": 3.0282647098790596, + "grad_norm": 0.7572489976882935, + "learning_rate": 0.0006216282782986819, + "loss": 3.5755, + "step": 44570 + }, + { + "epoch": 3.0286044299497212, + "grad_norm": 1.0424998998641968, + "learning_rate": 0.0006215858132898492, + "loss": 3.5146, + "step": 44575 + }, + { + "epoch": 3.0289441500203833, + "grad_norm": 1.224712610244751, + "learning_rate": 0.0006215433482810164, + "loss": 3.3755, + "step": 44580 + }, + { + "epoch": 3.029283870091045, + "grad_norm": 1.1416466236114502, + "learning_rate": 0.0006215008832721837, + "loss": 3.5797, + "step": 44585 + }, + { + "epoch": 3.0296235901617066, + "grad_norm": 1.2826133966445923, + "learning_rate": 0.000621458418263351, + "loss": 3.3013, + "step": 44590 + }, + { + "epoch": 3.0299633102323686, + "grad_norm": 1.000445008277893, + "learning_rate": 0.0006214159532545182, + "loss": 3.4776, + "step": 44595 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.8143826723098755, + "learning_rate": 0.0006213734882456855, + "loss": 3.4895, + "step": 44600 + }, + { + "epoch": 3.030642750373692, + "grad_norm": 0.9985485672950745, + "learning_rate": 0.0006213310232368529, + "loss": 3.4939, + "step": 44605 + }, + { + "epoch": 3.030982470444354, + "grad_norm": 0.9166423082351685, + "learning_rate": 0.0006212885582280201, + "loss": 3.4156, + "step": 44610 + }, + { + "epoch": 3.0313221905150156, + "grad_norm": 0.7473548650741577, + "learning_rate": 0.0006212460932191874, + "loss": 3.42, + "step": 44615 + }, + { + "epoch": 3.0316619105856772, + "grad_norm": 1.0722532272338867, + "learning_rate": 0.0006212036282103547, + "loss": 3.415, + "step": 44620 + }, + { + "epoch": 3.0320016306563393, + "grad_norm": 1.086130976676941, + "learning_rate": 0.0006211611632015219, + "loss": 3.5644, + "step": 44625 + }, + { + "epoch": 3.032341350727001, + "grad_norm": 1.1808322668075562, + "learning_rate": 0.0006211186981926892, + "loss": 3.7009, + "step": 44630 + }, + { + "epoch": 3.0326810707976626, + "grad_norm": 0.9255502223968506, + "learning_rate": 0.0006210762331838566, + "loss": 3.5142, + "step": 44635 + }, + { + "epoch": 3.0330207908683247, + "grad_norm": 0.7567299008369446, + "learning_rate": 0.0006210337681750238, + "loss": 3.2777, + "step": 44640 + }, + { + "epoch": 3.0333605109389863, + "grad_norm": 1.1588066816329956, + "learning_rate": 0.0006209913031661911, + "loss": 3.703, + "step": 44645 + }, + { + "epoch": 3.033700231009648, + "grad_norm": 0.7396805286407471, + "learning_rate": 0.0006209488381573584, + "loss": 3.1157, + "step": 44650 + }, + { + "epoch": 3.03403995108031, + "grad_norm": 0.8781517148017883, + "learning_rate": 0.0006209063731485256, + "loss": 3.583, + "step": 44655 + }, + { + "epoch": 3.0343796711509716, + "grad_norm": 1.144524335861206, + "learning_rate": 0.0006208639081396929, + "loss": 3.4846, + "step": 44660 + }, + { + "epoch": 3.0347193912216333, + "grad_norm": 0.8104262351989746, + "learning_rate": 0.0006208214431308602, + "loss": 3.7448, + "step": 44665 + }, + { + "epoch": 3.0350591112922953, + "grad_norm": 1.0510834455490112, + "learning_rate": 0.0006207789781220275, + "loss": 3.2614, + "step": 44670 + }, + { + "epoch": 3.035398831362957, + "grad_norm": 0.7789841294288635, + "learning_rate": 0.0006207365131131948, + "loss": 3.353, + "step": 44675 + }, + { + "epoch": 3.0357385514336186, + "grad_norm": 1.2945528030395508, + "learning_rate": 0.000620694048104362, + "loss": 3.5856, + "step": 44680 + }, + { + "epoch": 3.0360782715042807, + "grad_norm": 0.825264573097229, + "learning_rate": 0.0006206515830955293, + "loss": 3.5932, + "step": 44685 + }, + { + "epoch": 3.0364179915749423, + "grad_norm": 0.8568071126937866, + "learning_rate": 0.0006206091180866966, + "loss": 3.471, + "step": 44690 + }, + { + "epoch": 3.036757711645604, + "grad_norm": 1.060340404510498, + "learning_rate": 0.0006205666530778638, + "loss": 3.3036, + "step": 44695 + }, + { + "epoch": 3.037097431716266, + "grad_norm": 0.8282649517059326, + "learning_rate": 0.0006205241880690311, + "loss": 3.6159, + "step": 44700 + }, + { + "epoch": 3.0374371517869276, + "grad_norm": 0.9926738739013672, + "learning_rate": 0.0006204817230601985, + "loss": 3.4085, + "step": 44705 + }, + { + "epoch": 3.0377768718575893, + "grad_norm": 0.7519678473472595, + "learning_rate": 0.0006204392580513657, + "loss": 3.7173, + "step": 44710 + }, + { + "epoch": 3.038116591928251, + "grad_norm": 1.0478427410125732, + "learning_rate": 0.000620396793042533, + "loss": 3.2054, + "step": 44715 + }, + { + "epoch": 3.038456311998913, + "grad_norm": 0.8050093650817871, + "learning_rate": 0.0006203543280337003, + "loss": 3.3811, + "step": 44720 + }, + { + "epoch": 3.0387960320695746, + "grad_norm": 0.6812238693237305, + "learning_rate": 0.0006203118630248675, + "loss": 3.3788, + "step": 44725 + }, + { + "epoch": 3.0391357521402362, + "grad_norm": 1.0486485958099365, + "learning_rate": 0.0006202693980160347, + "loss": 3.5314, + "step": 44730 + }, + { + "epoch": 3.0394754722108983, + "grad_norm": 0.8285293579101562, + "learning_rate": 0.0006202269330072021, + "loss": 3.6845, + "step": 44735 + }, + { + "epoch": 3.03981519228156, + "grad_norm": 0.986740231513977, + "learning_rate": 0.0006201844679983694, + "loss": 3.2144, + "step": 44740 + }, + { + "epoch": 3.0401549123522216, + "grad_norm": 1.156001091003418, + "learning_rate": 0.0006201420029895366, + "loss": 3.683, + "step": 44745 + }, + { + "epoch": 3.0404946324228836, + "grad_norm": 0.7804427742958069, + "learning_rate": 0.000620099537980704, + "loss": 3.4426, + "step": 44750 + }, + { + "epoch": 3.0408343524935453, + "grad_norm": 0.7033209204673767, + "learning_rate": 0.0006200570729718712, + "loss": 3.1342, + "step": 44755 + }, + { + "epoch": 3.041174072564207, + "grad_norm": 0.9821354746818542, + "learning_rate": 0.0006200146079630384, + "loss": 3.4093, + "step": 44760 + }, + { + "epoch": 3.041513792634869, + "grad_norm": 0.811325192451477, + "learning_rate": 0.0006199721429542058, + "loss": 3.3555, + "step": 44765 + }, + { + "epoch": 3.0418535127055306, + "grad_norm": 0.9822152256965637, + "learning_rate": 0.000619929677945373, + "loss": 3.1896, + "step": 44770 + }, + { + "epoch": 3.0421932327761922, + "grad_norm": 1.2142118215560913, + "learning_rate": 0.0006198872129365403, + "loss": 3.6166, + "step": 44775 + }, + { + "epoch": 3.0425329528468543, + "grad_norm": 1.0171880722045898, + "learning_rate": 0.0006198447479277077, + "loss": 3.4549, + "step": 44780 + }, + { + "epoch": 3.042872672917516, + "grad_norm": 0.8662763833999634, + "learning_rate": 0.0006198022829188749, + "loss": 3.392, + "step": 44785 + }, + { + "epoch": 3.0432123929881776, + "grad_norm": 0.9291183948516846, + "learning_rate": 0.0006197598179100421, + "loss": 3.2294, + "step": 44790 + }, + { + "epoch": 3.0435521130588397, + "grad_norm": 0.8199495077133179, + "learning_rate": 0.0006197173529012094, + "loss": 3.4816, + "step": 44795 + }, + { + "epoch": 3.0438918331295013, + "grad_norm": 0.73676598072052, + "learning_rate": 0.0006196748878923767, + "loss": 3.5742, + "step": 44800 + }, + { + "epoch": 3.044231553200163, + "grad_norm": 1.0839177370071411, + "learning_rate": 0.0006196324228835439, + "loss": 3.4555, + "step": 44805 + }, + { + "epoch": 3.044571273270825, + "grad_norm": 0.7246881723403931, + "learning_rate": 0.0006195899578747113, + "loss": 3.2734, + "step": 44810 + }, + { + "epoch": 3.0449109933414866, + "grad_norm": 0.8187270164489746, + "learning_rate": 0.0006195474928658786, + "loss": 3.7407, + "step": 44815 + }, + { + "epoch": 3.0452507134121483, + "grad_norm": 0.9523969292640686, + "learning_rate": 0.0006195050278570458, + "loss": 3.517, + "step": 44820 + }, + { + "epoch": 3.0455904334828103, + "grad_norm": 0.6657265424728394, + "learning_rate": 0.0006194625628482131, + "loss": 3.5741, + "step": 44825 + }, + { + "epoch": 3.045930153553472, + "grad_norm": 0.7096591591835022, + "learning_rate": 0.0006194200978393803, + "loss": 3.5418, + "step": 44830 + }, + { + "epoch": 3.0462698736241336, + "grad_norm": 0.932666003704071, + "learning_rate": 0.0006193776328305476, + "loss": 3.2637, + "step": 44835 + }, + { + "epoch": 3.0466095936947957, + "grad_norm": 0.8521873354911804, + "learning_rate": 0.0006193351678217149, + "loss": 3.3831, + "step": 44840 + }, + { + "epoch": 3.0469493137654573, + "grad_norm": 0.9817870259284973, + "learning_rate": 0.0006192927028128822, + "loss": 3.3476, + "step": 44845 + }, + { + "epoch": 3.047289033836119, + "grad_norm": 0.8715782165527344, + "learning_rate": 0.0006192502378040495, + "loss": 3.6439, + "step": 44850 + }, + { + "epoch": 3.047628753906781, + "grad_norm": 0.6531186103820801, + "learning_rate": 0.0006192077727952168, + "loss": 3.5518, + "step": 44855 + }, + { + "epoch": 3.0479684739774426, + "grad_norm": 0.8560348749160767, + "learning_rate": 0.000619165307786384, + "loss": 3.3497, + "step": 44860 + }, + { + "epoch": 3.0483081940481043, + "grad_norm": 1.000889539718628, + "learning_rate": 0.0006191228427775513, + "loss": 3.4638, + "step": 44865 + }, + { + "epoch": 3.0486479141187663, + "grad_norm": 0.7538596391677856, + "learning_rate": 0.0006190803777687186, + "loss": 3.3808, + "step": 44870 + }, + { + "epoch": 3.048987634189428, + "grad_norm": 0.8992329835891724, + "learning_rate": 0.0006190379127598858, + "loss": 3.4565, + "step": 44875 + }, + { + "epoch": 3.0493273542600896, + "grad_norm": 0.9904758930206299, + "learning_rate": 0.0006189954477510531, + "loss": 3.3552, + "step": 44880 + }, + { + "epoch": 3.0496670743307517, + "grad_norm": 0.8859017491340637, + "learning_rate": 0.0006189529827422205, + "loss": 3.5639, + "step": 44885 + }, + { + "epoch": 3.0500067944014133, + "grad_norm": 0.8537598848342896, + "learning_rate": 0.0006189105177333877, + "loss": 3.6492, + "step": 44890 + }, + { + "epoch": 3.050346514472075, + "grad_norm": 0.8301873803138733, + "learning_rate": 0.0006188680527245549, + "loss": 3.4948, + "step": 44895 + }, + { + "epoch": 3.0506862345427366, + "grad_norm": 1.0596449375152588, + "learning_rate": 0.0006188255877157223, + "loss": 3.6333, + "step": 44900 + }, + { + "epoch": 3.0510259546133986, + "grad_norm": 0.7561647295951843, + "learning_rate": 0.0006187831227068895, + "loss": 3.3028, + "step": 44905 + }, + { + "epoch": 3.0513656746840603, + "grad_norm": 1.0134865045547485, + "learning_rate": 0.0006187406576980567, + "loss": 3.4636, + "step": 44910 + }, + { + "epoch": 3.051705394754722, + "grad_norm": 0.8390710949897766, + "learning_rate": 0.0006186981926892242, + "loss": 3.6768, + "step": 44915 + }, + { + "epoch": 3.052045114825384, + "grad_norm": 0.7435189485549927, + "learning_rate": 0.0006186557276803914, + "loss": 3.5702, + "step": 44920 + }, + { + "epoch": 3.0523848348960456, + "grad_norm": 0.8698585629463196, + "learning_rate": 0.0006186132626715586, + "loss": 3.5265, + "step": 44925 + }, + { + "epoch": 3.0527245549667072, + "grad_norm": 1.3293871879577637, + "learning_rate": 0.0006185707976627259, + "loss": 3.5857, + "step": 44930 + }, + { + "epoch": 3.0530642750373693, + "grad_norm": 1.0432089567184448, + "learning_rate": 0.0006185283326538932, + "loss": 3.2786, + "step": 44935 + }, + { + "epoch": 3.053403995108031, + "grad_norm": 0.8723371028900146, + "learning_rate": 0.0006184858676450604, + "loss": 3.4437, + "step": 44940 + }, + { + "epoch": 3.0537437151786926, + "grad_norm": 0.7902414202690125, + "learning_rate": 0.0006184434026362277, + "loss": 3.58, + "step": 44945 + }, + { + "epoch": 3.0540834352493547, + "grad_norm": 0.8076017498970032, + "learning_rate": 0.0006184009376273951, + "loss": 3.571, + "step": 44950 + }, + { + "epoch": 3.0544231553200163, + "grad_norm": 0.7994831800460815, + "learning_rate": 0.0006183584726185623, + "loss": 3.4126, + "step": 44955 + }, + { + "epoch": 3.054762875390678, + "grad_norm": 0.6688999533653259, + "learning_rate": 0.0006183160076097296, + "loss": 3.5461, + "step": 44960 + }, + { + "epoch": 3.05510259546134, + "grad_norm": 1.0569227933883667, + "learning_rate": 0.0006182735426008969, + "loss": 3.5014, + "step": 44965 + }, + { + "epoch": 3.0554423155320016, + "grad_norm": 0.84116530418396, + "learning_rate": 0.0006182310775920642, + "loss": 3.3509, + "step": 44970 + }, + { + "epoch": 3.0557820356026633, + "grad_norm": 0.9870677590370178, + "learning_rate": 0.0006181886125832314, + "loss": 3.574, + "step": 44975 + }, + { + "epoch": 3.0561217556733253, + "grad_norm": 0.8315675854682922, + "learning_rate": 0.0006181461475743986, + "loss": 3.5739, + "step": 44980 + }, + { + "epoch": 3.056461475743987, + "grad_norm": 0.7597320675849915, + "learning_rate": 0.0006181036825655661, + "loss": 3.5672, + "step": 44985 + }, + { + "epoch": 3.0568011958146486, + "grad_norm": 0.8166161775588989, + "learning_rate": 0.0006180612175567333, + "loss": 3.5832, + "step": 44990 + }, + { + "epoch": 3.0571409158853107, + "grad_norm": 0.8450973629951477, + "learning_rate": 0.0006180187525479005, + "loss": 3.2318, + "step": 44995 + }, + { + "epoch": 3.0574806359559723, + "grad_norm": 0.8925082683563232, + "learning_rate": 0.0006179762875390679, + "loss": 3.5133, + "step": 45000 + }, + { + "epoch": 3.057820356026634, + "grad_norm": 1.0904361009597778, + "learning_rate": 0.0006179338225302351, + "loss": 3.3644, + "step": 45005 + }, + { + "epoch": 3.058160076097296, + "grad_norm": 0.8527546525001526, + "learning_rate": 0.0006178913575214023, + "loss": 3.2037, + "step": 45010 + }, + { + "epoch": 3.0584997961679576, + "grad_norm": 0.9137938022613525, + "learning_rate": 0.0006178488925125697, + "loss": 3.6157, + "step": 45015 + }, + { + "epoch": 3.0588395162386193, + "grad_norm": 0.8367337584495544, + "learning_rate": 0.000617806427503737, + "loss": 3.2693, + "step": 45020 + }, + { + "epoch": 3.0591792363092813, + "grad_norm": 0.7667573094367981, + "learning_rate": 0.0006177639624949042, + "loss": 3.5785, + "step": 45025 + }, + { + "epoch": 3.059518956379943, + "grad_norm": 0.8716797232627869, + "learning_rate": 0.0006177214974860715, + "loss": 3.638, + "step": 45030 + }, + { + "epoch": 3.0598586764506046, + "grad_norm": 0.9637343287467957, + "learning_rate": 0.0006176790324772388, + "loss": 3.4471, + "step": 45035 + }, + { + "epoch": 3.0601983965212667, + "grad_norm": 0.8962384462356567, + "learning_rate": 0.000617636567468406, + "loss": 3.6449, + "step": 45040 + }, + { + "epoch": 3.0605381165919283, + "grad_norm": 0.8467882871627808, + "learning_rate": 0.0006175941024595733, + "loss": 3.3113, + "step": 45045 + }, + { + "epoch": 3.06087783666259, + "grad_norm": 0.8974126577377319, + "learning_rate": 0.0006175516374507406, + "loss": 3.4211, + "step": 45050 + }, + { + "epoch": 3.0612175567332516, + "grad_norm": 0.8538293242454529, + "learning_rate": 0.0006175091724419079, + "loss": 3.4708, + "step": 45055 + }, + { + "epoch": 3.0615572768039137, + "grad_norm": 0.845084547996521, + "learning_rate": 0.0006174667074330752, + "loss": 3.567, + "step": 45060 + }, + { + "epoch": 3.0618969968745753, + "grad_norm": 1.0373668670654297, + "learning_rate": 0.0006174242424242425, + "loss": 3.5419, + "step": 45065 + }, + { + "epoch": 3.062236716945237, + "grad_norm": 1.234775424003601, + "learning_rate": 0.0006173817774154097, + "loss": 3.4278, + "step": 45070 + }, + { + "epoch": 3.062576437015899, + "grad_norm": 0.8547048568725586, + "learning_rate": 0.000617339312406577, + "loss": 3.8774, + "step": 45075 + }, + { + "epoch": 3.0629161570865606, + "grad_norm": 0.692756175994873, + "learning_rate": 0.0006172968473977442, + "loss": 3.5812, + "step": 45080 + }, + { + "epoch": 3.0632558771572223, + "grad_norm": 0.7368325591087341, + "learning_rate": 0.0006172543823889115, + "loss": 3.6755, + "step": 45085 + }, + { + "epoch": 3.0635955972278843, + "grad_norm": 0.9509264230728149, + "learning_rate": 0.0006172119173800789, + "loss": 3.433, + "step": 45090 + }, + { + "epoch": 3.063935317298546, + "grad_norm": 0.7166617512702942, + "learning_rate": 0.0006171694523712461, + "loss": 3.4842, + "step": 45095 + }, + { + "epoch": 3.0642750373692076, + "grad_norm": 1.0783294439315796, + "learning_rate": 0.0006171269873624134, + "loss": 3.4839, + "step": 45100 + }, + { + "epoch": 3.0646147574398697, + "grad_norm": 0.956291913986206, + "learning_rate": 0.0006170845223535807, + "loss": 3.654, + "step": 45105 + }, + { + "epoch": 3.0649544775105313, + "grad_norm": 0.7574195861816406, + "learning_rate": 0.0006170420573447479, + "loss": 3.7373, + "step": 45110 + }, + { + "epoch": 3.065294197581193, + "grad_norm": 0.9336784482002258, + "learning_rate": 0.0006169995923359151, + "loss": 3.4802, + "step": 45115 + }, + { + "epoch": 3.065633917651855, + "grad_norm": 0.9458031058311462, + "learning_rate": 0.0006169571273270826, + "loss": 3.576, + "step": 45120 + }, + { + "epoch": 3.0659736377225166, + "grad_norm": 0.8831725120544434, + "learning_rate": 0.0006169146623182498, + "loss": 3.4639, + "step": 45125 + }, + { + "epoch": 3.0663133577931783, + "grad_norm": 5.633857727050781, + "learning_rate": 0.000616872197309417, + "loss": 3.4832, + "step": 45130 + }, + { + "epoch": 3.0666530778638403, + "grad_norm": 0.8408645987510681, + "learning_rate": 0.0006168297323005844, + "loss": 3.4656, + "step": 45135 + }, + { + "epoch": 3.066992797934502, + "grad_norm": 1.1470396518707275, + "learning_rate": 0.0006167872672917516, + "loss": 3.3914, + "step": 45140 + }, + { + "epoch": 3.0673325180051636, + "grad_norm": 0.9479343295097351, + "learning_rate": 0.0006167448022829188, + "loss": 3.3375, + "step": 45145 + }, + { + "epoch": 3.0676722380758257, + "grad_norm": 0.7824709415435791, + "learning_rate": 0.0006167023372740862, + "loss": 3.6161, + "step": 45150 + }, + { + "epoch": 3.0680119581464873, + "grad_norm": 1.0165427923202515, + "learning_rate": 0.0006166598722652535, + "loss": 3.5202, + "step": 45155 + }, + { + "epoch": 3.068351678217149, + "grad_norm": 0.8169671893119812, + "learning_rate": 0.0006166174072564207, + "loss": 3.4409, + "step": 45160 + }, + { + "epoch": 3.068691398287811, + "grad_norm": 0.8290146589279175, + "learning_rate": 0.0006165749422475881, + "loss": 3.6728, + "step": 45165 + }, + { + "epoch": 3.0690311183584726, + "grad_norm": 0.926299512386322, + "learning_rate": 0.0006165324772387553, + "loss": 3.5458, + "step": 45170 + }, + { + "epoch": 3.0693708384291343, + "grad_norm": 1.0660403966903687, + "learning_rate": 0.0006164900122299225, + "loss": 3.3876, + "step": 45175 + }, + { + "epoch": 3.0697105584997963, + "grad_norm": 0.9679116010665894, + "learning_rate": 0.0006164475472210898, + "loss": 3.4914, + "step": 45180 + }, + { + "epoch": 3.070050278570458, + "grad_norm": 0.9702383875846863, + "learning_rate": 0.0006164050822122571, + "loss": 3.4714, + "step": 45185 + }, + { + "epoch": 3.0703899986411196, + "grad_norm": 0.8133310079574585, + "learning_rate": 0.0006163626172034244, + "loss": 3.6444, + "step": 45190 + }, + { + "epoch": 3.0707297187117817, + "grad_norm": 0.9583044648170471, + "learning_rate": 0.0006163201521945917, + "loss": 3.3785, + "step": 45195 + }, + { + "epoch": 3.0710694387824433, + "grad_norm": 0.9679398536682129, + "learning_rate": 0.000616277687185759, + "loss": 3.4412, + "step": 45200 + }, + { + "epoch": 3.071409158853105, + "grad_norm": 0.918900191783905, + "learning_rate": 0.0006162352221769262, + "loss": 3.1504, + "step": 45205 + }, + { + "epoch": 3.071748878923767, + "grad_norm": 0.9898842573165894, + "learning_rate": 0.0006161927571680935, + "loss": 3.5751, + "step": 45210 + }, + { + "epoch": 3.0720885989944287, + "grad_norm": 0.9374984502792358, + "learning_rate": 0.0006161502921592607, + "loss": 3.5903, + "step": 45215 + }, + { + "epoch": 3.0724283190650903, + "grad_norm": 0.8619779944419861, + "learning_rate": 0.000616107827150428, + "loss": 3.4947, + "step": 45220 + }, + { + "epoch": 3.0727680391357524, + "grad_norm": 0.9117752909660339, + "learning_rate": 0.0006160653621415954, + "loss": 3.5096, + "step": 45225 + }, + { + "epoch": 3.073107759206414, + "grad_norm": 0.8052951097488403, + "learning_rate": 0.0006160228971327626, + "loss": 3.3475, + "step": 45230 + }, + { + "epoch": 3.0734474792770756, + "grad_norm": 0.9567247629165649, + "learning_rate": 0.0006159804321239299, + "loss": 3.157, + "step": 45235 + }, + { + "epoch": 3.0737871993477373, + "grad_norm": 1.4706432819366455, + "learning_rate": 0.0006159379671150972, + "loss": 3.541, + "step": 45240 + }, + { + "epoch": 3.0741269194183993, + "grad_norm": 1.003546953201294, + "learning_rate": 0.0006158955021062644, + "loss": 3.7431, + "step": 45245 + }, + { + "epoch": 3.074466639489061, + "grad_norm": 1.1332744359970093, + "learning_rate": 0.0006158530370974317, + "loss": 3.8134, + "step": 45250 + }, + { + "epoch": 3.0748063595597226, + "grad_norm": 0.7978418469429016, + "learning_rate": 0.000615810572088599, + "loss": 3.5113, + "step": 45255 + }, + { + "epoch": 3.0751460796303847, + "grad_norm": 1.2019243240356445, + "learning_rate": 0.0006157681070797663, + "loss": 3.2867, + "step": 45260 + }, + { + "epoch": 3.0754857997010463, + "grad_norm": 0.9840481877326965, + "learning_rate": 0.0006157256420709335, + "loss": 3.457, + "step": 45265 + }, + { + "epoch": 3.075825519771708, + "grad_norm": 1.0331770181655884, + "learning_rate": 0.0006156831770621009, + "loss": 3.279, + "step": 45270 + }, + { + "epoch": 3.07616523984237, + "grad_norm": 1.048207402229309, + "learning_rate": 0.0006156407120532681, + "loss": 3.2479, + "step": 45275 + }, + { + "epoch": 3.0765049599130316, + "grad_norm": 0.7639055848121643, + "learning_rate": 0.0006155982470444353, + "loss": 3.3698, + "step": 45280 + }, + { + "epoch": 3.0768446799836933, + "grad_norm": 0.8483713269233704, + "learning_rate": 0.0006155557820356027, + "loss": 3.7077, + "step": 45285 + }, + { + "epoch": 3.0771844000543553, + "grad_norm": 0.6491823196411133, + "learning_rate": 0.0006155133170267699, + "loss": 3.4729, + "step": 45290 + }, + { + "epoch": 3.077524120125017, + "grad_norm": 1.0904372930526733, + "learning_rate": 0.0006154708520179372, + "loss": 3.5196, + "step": 45295 + }, + { + "epoch": 3.0778638401956786, + "grad_norm": 0.8647538423538208, + "learning_rate": 0.0006154283870091046, + "loss": 3.7273, + "step": 45300 + }, + { + "epoch": 3.0782035602663407, + "grad_norm": 0.7139621376991272, + "learning_rate": 0.0006153859220002718, + "loss": 3.4943, + "step": 45305 + }, + { + "epoch": 3.0785432803370023, + "grad_norm": 1.0712698698043823, + "learning_rate": 0.0006153434569914391, + "loss": 3.5009, + "step": 45310 + }, + { + "epoch": 3.078883000407664, + "grad_norm": 0.7189934849739075, + "learning_rate": 0.0006153009919826063, + "loss": 3.6316, + "step": 45315 + }, + { + "epoch": 3.079222720478326, + "grad_norm": 0.8718123435974121, + "learning_rate": 0.0006152585269737736, + "loss": 3.4668, + "step": 45320 + }, + { + "epoch": 3.0795624405489876, + "grad_norm": 0.7875134348869324, + "learning_rate": 0.0006152160619649409, + "loss": 3.4973, + "step": 45325 + }, + { + "epoch": 3.0799021606196493, + "grad_norm": 0.8209888339042664, + "learning_rate": 0.0006151735969561082, + "loss": 3.4915, + "step": 45330 + }, + { + "epoch": 3.0802418806903114, + "grad_norm": 0.9438146948814392, + "learning_rate": 0.0006151311319472755, + "loss": 3.6158, + "step": 45335 + }, + { + "epoch": 3.080581600760973, + "grad_norm": 0.8529353737831116, + "learning_rate": 0.0006150886669384428, + "loss": 3.4102, + "step": 45340 + }, + { + "epoch": 3.0809213208316346, + "grad_norm": 0.9208599328994751, + "learning_rate": 0.00061504620192961, + "loss": 3.7575, + "step": 45345 + }, + { + "epoch": 3.0812610409022967, + "grad_norm": 0.9470713138580322, + "learning_rate": 0.0006150037369207773, + "loss": 3.5317, + "step": 45350 + }, + { + "epoch": 3.0816007609729583, + "grad_norm": 1.1081210374832153, + "learning_rate": 0.0006149612719119446, + "loss": 3.4409, + "step": 45355 + }, + { + "epoch": 3.08194048104362, + "grad_norm": 0.9554344415664673, + "learning_rate": 0.0006149188069031118, + "loss": 3.6027, + "step": 45360 + }, + { + "epoch": 3.082280201114282, + "grad_norm": 0.9609509110450745, + "learning_rate": 0.0006148763418942791, + "loss": 3.5005, + "step": 45365 + }, + { + "epoch": 3.0826199211849437, + "grad_norm": 0.9984030723571777, + "learning_rate": 0.0006148338768854465, + "loss": 3.5436, + "step": 45370 + }, + { + "epoch": 3.0829596412556053, + "grad_norm": 0.9290677905082703, + "learning_rate": 0.0006147914118766137, + "loss": 3.0949, + "step": 45375 + }, + { + "epoch": 3.0832993613262674, + "grad_norm": 0.8854575753211975, + "learning_rate": 0.0006147489468677809, + "loss": 3.5002, + "step": 45380 + }, + { + "epoch": 3.083639081396929, + "grad_norm": 0.8634077310562134, + "learning_rate": 0.0006147064818589483, + "loss": 3.5755, + "step": 45385 + }, + { + "epoch": 3.0839788014675906, + "grad_norm": 1.0878746509552002, + "learning_rate": 0.0006146640168501155, + "loss": 3.5552, + "step": 45390 + }, + { + "epoch": 3.0843185215382523, + "grad_norm": 0.7784273028373718, + "learning_rate": 0.0006146215518412827, + "loss": 3.5731, + "step": 45395 + }, + { + "epoch": 3.0846582416089143, + "grad_norm": 1.0655579566955566, + "learning_rate": 0.0006145790868324502, + "loss": 3.4878, + "step": 45400 + }, + { + "epoch": 3.084997961679576, + "grad_norm": 1.157449722290039, + "learning_rate": 0.0006145366218236174, + "loss": 3.4072, + "step": 45405 + }, + { + "epoch": 3.0853376817502376, + "grad_norm": 1.011207103729248, + "learning_rate": 0.0006144941568147846, + "loss": 3.3544, + "step": 45410 + }, + { + "epoch": 3.0856774018208997, + "grad_norm": 0.9424362182617188, + "learning_rate": 0.000614451691805952, + "loss": 3.3902, + "step": 45415 + }, + { + "epoch": 3.0860171218915613, + "grad_norm": 0.8765737414360046, + "learning_rate": 0.0006144092267971192, + "loss": 3.4528, + "step": 45420 + }, + { + "epoch": 3.086356841962223, + "grad_norm": 0.9136813282966614, + "learning_rate": 0.0006143667617882864, + "loss": 3.5573, + "step": 45425 + }, + { + "epoch": 3.086696562032885, + "grad_norm": 0.8442876935005188, + "learning_rate": 0.0006143242967794537, + "loss": 3.5964, + "step": 45430 + }, + { + "epoch": 3.0870362821035466, + "grad_norm": 0.7686412930488586, + "learning_rate": 0.0006142818317706211, + "loss": 3.6682, + "step": 45435 + }, + { + "epoch": 3.0873760021742083, + "grad_norm": 0.7092103958129883, + "learning_rate": 0.0006142393667617883, + "loss": 3.4485, + "step": 45440 + }, + { + "epoch": 3.0877157222448703, + "grad_norm": 0.9552003741264343, + "learning_rate": 0.0006141969017529556, + "loss": 3.4432, + "step": 45445 + }, + { + "epoch": 3.088055442315532, + "grad_norm": 1.0082370042800903, + "learning_rate": 0.0006141544367441229, + "loss": 3.8465, + "step": 45450 + }, + { + "epoch": 3.0883951623861936, + "grad_norm": 0.7997236251831055, + "learning_rate": 0.0006141119717352901, + "loss": 3.4936, + "step": 45455 + }, + { + "epoch": 3.0887348824568557, + "grad_norm": 1.1020950078964233, + "learning_rate": 0.0006140695067264574, + "loss": 3.5687, + "step": 45460 + }, + { + "epoch": 3.0890746025275173, + "grad_norm": 0.8940680623054504, + "learning_rate": 0.0006140270417176246, + "loss": 3.3249, + "step": 45465 + }, + { + "epoch": 3.089414322598179, + "grad_norm": 0.869782030582428, + "learning_rate": 0.000613984576708792, + "loss": 3.3515, + "step": 45470 + }, + { + "epoch": 3.089754042668841, + "grad_norm": 0.9605928659439087, + "learning_rate": 0.0006139421116999593, + "loss": 3.2952, + "step": 45475 + }, + { + "epoch": 3.0900937627395026, + "grad_norm": 0.9216681718826294, + "learning_rate": 0.0006138996466911265, + "loss": 3.3289, + "step": 45480 + }, + { + "epoch": 3.0904334828101643, + "grad_norm": 0.7874439358711243, + "learning_rate": 0.0006138571816822938, + "loss": 3.4657, + "step": 45485 + }, + { + "epoch": 3.0907732028808264, + "grad_norm": 0.8871141076087952, + "learning_rate": 0.0006138147166734611, + "loss": 3.6452, + "step": 45490 + }, + { + "epoch": 3.091112922951488, + "grad_norm": 1.0207550525665283, + "learning_rate": 0.0006137722516646283, + "loss": 3.5139, + "step": 45495 + }, + { + "epoch": 3.0914526430221496, + "grad_norm": 0.9839468002319336, + "learning_rate": 0.0006137297866557955, + "loss": 3.5151, + "step": 45500 + }, + { + "epoch": 3.0917923630928117, + "grad_norm": 1.0998214483261108, + "learning_rate": 0.000613687321646963, + "loss": 3.3863, + "step": 45505 + }, + { + "epoch": 3.0921320831634733, + "grad_norm": 1.0068325996398926, + "learning_rate": 0.0006136448566381302, + "loss": 3.6534, + "step": 45510 + }, + { + "epoch": 3.092471803234135, + "grad_norm": 0.8189786076545715, + "learning_rate": 0.0006136023916292974, + "loss": 3.6525, + "step": 45515 + }, + { + "epoch": 3.092811523304797, + "grad_norm": 0.6368412971496582, + "learning_rate": 0.0006135599266204648, + "loss": 3.6585, + "step": 45520 + }, + { + "epoch": 3.0931512433754587, + "grad_norm": 0.9695454835891724, + "learning_rate": 0.000613517461611632, + "loss": 3.5805, + "step": 45525 + }, + { + "epoch": 3.0934909634461203, + "grad_norm": 0.9223203659057617, + "learning_rate": 0.0006134749966027992, + "loss": 3.6827, + "step": 45530 + }, + { + "epoch": 3.0938306835167824, + "grad_norm": 1.0988616943359375, + "learning_rate": 0.0006134325315939666, + "loss": 3.4148, + "step": 45535 + }, + { + "epoch": 3.094170403587444, + "grad_norm": 0.6957002878189087, + "learning_rate": 0.0006133900665851339, + "loss": 3.5832, + "step": 45540 + }, + { + "epoch": 3.0945101236581056, + "grad_norm": 0.8006020784378052, + "learning_rate": 0.0006133476015763011, + "loss": 3.7413, + "step": 45545 + }, + { + "epoch": 3.0948498437287677, + "grad_norm": 0.916537344455719, + "learning_rate": 0.0006133051365674685, + "loss": 3.5057, + "step": 45550 + }, + { + "epoch": 3.0951895637994293, + "grad_norm": 0.8514760732650757, + "learning_rate": 0.0006132626715586357, + "loss": 3.6867, + "step": 45555 + }, + { + "epoch": 3.095529283870091, + "grad_norm": 0.9133549332618713, + "learning_rate": 0.0006132202065498029, + "loss": 3.5957, + "step": 45560 + }, + { + "epoch": 3.095869003940753, + "grad_norm": 8.850259780883789, + "learning_rate": 0.0006131777415409702, + "loss": 3.4794, + "step": 45565 + }, + { + "epoch": 3.0962087240114147, + "grad_norm": 0.917969286441803, + "learning_rate": 0.0006131352765321375, + "loss": 3.5654, + "step": 45570 + }, + { + "epoch": 3.0965484440820763, + "grad_norm": 0.7688896656036377, + "learning_rate": 0.0006130928115233048, + "loss": 3.5678, + "step": 45575 + }, + { + "epoch": 3.096888164152738, + "grad_norm": 0.9070624113082886, + "learning_rate": 0.0006130503465144721, + "loss": 3.3387, + "step": 45580 + }, + { + "epoch": 3.0972278842234, + "grad_norm": 0.876859724521637, + "learning_rate": 0.0006130078815056394, + "loss": 3.6744, + "step": 45585 + }, + { + "epoch": 3.0975676042940616, + "grad_norm": 0.8841543793678284, + "learning_rate": 0.0006129654164968066, + "loss": 3.4366, + "step": 45590 + }, + { + "epoch": 3.0979073243647233, + "grad_norm": 0.6697371602058411, + "learning_rate": 0.0006129229514879739, + "loss": 3.3675, + "step": 45595 + }, + { + "epoch": 3.0982470444353853, + "grad_norm": 1.030617594718933, + "learning_rate": 0.0006128804864791412, + "loss": 3.4241, + "step": 45600 + }, + { + "epoch": 3.098586764506047, + "grad_norm": 0.6899008750915527, + "learning_rate": 0.0006128380214703084, + "loss": 3.3761, + "step": 45605 + }, + { + "epoch": 3.0989264845767086, + "grad_norm": 0.7388415336608887, + "learning_rate": 0.0006127955564614758, + "loss": 3.356, + "step": 45610 + }, + { + "epoch": 3.0992662046473707, + "grad_norm": 0.8407344818115234, + "learning_rate": 0.000612753091452643, + "loss": 3.2815, + "step": 45615 + }, + { + "epoch": 3.0996059247180323, + "grad_norm": 0.7202184796333313, + "learning_rate": 0.0006127106264438103, + "loss": 3.5193, + "step": 45620 + }, + { + "epoch": 3.099945644788694, + "grad_norm": 1.652698040008545, + "learning_rate": 0.0006126681614349776, + "loss": 3.3178, + "step": 45625 + }, + { + "epoch": 3.100285364859356, + "grad_norm": 0.8853527307510376, + "learning_rate": 0.0006126256964261448, + "loss": 3.4837, + "step": 45630 + }, + { + "epoch": 3.1006250849300176, + "grad_norm": 1.21665358543396, + "learning_rate": 0.0006125832314173121, + "loss": 3.6222, + "step": 45635 + }, + { + "epoch": 3.1009648050006793, + "grad_norm": 0.8071850538253784, + "learning_rate": 0.0006125407664084794, + "loss": 3.2238, + "step": 45640 + }, + { + "epoch": 3.1013045250713414, + "grad_norm": 1.1222118139266968, + "learning_rate": 0.0006124983013996467, + "loss": 3.63, + "step": 45645 + }, + { + "epoch": 3.101644245142003, + "grad_norm": 0.993224024772644, + "learning_rate": 0.0006124558363908141, + "loss": 3.4077, + "step": 45650 + }, + { + "epoch": 3.1019839652126646, + "grad_norm": 0.8194601535797119, + "learning_rate": 0.0006124133713819813, + "loss": 3.5168, + "step": 45655 + }, + { + "epoch": 3.1023236852833267, + "grad_norm": 0.9690770506858826, + "learning_rate": 0.0006123709063731485, + "loss": 3.714, + "step": 45660 + }, + { + "epoch": 3.1026634053539883, + "grad_norm": 1.0250791311264038, + "learning_rate": 0.0006123284413643158, + "loss": 3.3504, + "step": 45665 + }, + { + "epoch": 3.10300312542465, + "grad_norm": 0.9894272685050964, + "learning_rate": 0.0006122859763554831, + "loss": 3.3732, + "step": 45670 + }, + { + "epoch": 3.103342845495312, + "grad_norm": 0.8401615023612976, + "learning_rate": 0.0006122435113466503, + "loss": 3.1357, + "step": 45675 + }, + { + "epoch": 3.1036825655659737, + "grad_norm": 1.0014797449111938, + "learning_rate": 0.0006122010463378177, + "loss": 3.6382, + "step": 45680 + }, + { + "epoch": 3.1040222856366353, + "grad_norm": 0.7298566102981567, + "learning_rate": 0.000612158581328985, + "loss": 3.6985, + "step": 45685 + }, + { + "epoch": 3.1043620057072974, + "grad_norm": 0.9199948310852051, + "learning_rate": 0.0006121161163201522, + "loss": 3.4918, + "step": 45690 + }, + { + "epoch": 3.104701725777959, + "grad_norm": 1.1441121101379395, + "learning_rate": 0.0006120736513113195, + "loss": 3.4463, + "step": 45695 + }, + { + "epoch": 3.1050414458486206, + "grad_norm": 1.0547446012496948, + "learning_rate": 0.0006120311863024868, + "loss": 3.7213, + "step": 45700 + }, + { + "epoch": 3.1053811659192827, + "grad_norm": 1.1514614820480347, + "learning_rate": 0.000611988721293654, + "loss": 3.6079, + "step": 45705 + }, + { + "epoch": 3.1057208859899443, + "grad_norm": 0.9341482520103455, + "learning_rate": 0.0006119462562848214, + "loss": 3.7278, + "step": 45710 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.9768016934394836, + "learning_rate": 0.0006119037912759886, + "loss": 3.4049, + "step": 45715 + }, + { + "epoch": 3.106400326131268, + "grad_norm": 0.9790335893630981, + "learning_rate": 0.0006118613262671559, + "loss": 3.4379, + "step": 45720 + }, + { + "epoch": 3.1067400462019297, + "grad_norm": 0.7492227554321289, + "learning_rate": 0.0006118188612583232, + "loss": 3.4397, + "step": 45725 + }, + { + "epoch": 3.1070797662725913, + "grad_norm": 0.7246140837669373, + "learning_rate": 0.0006117763962494904, + "loss": 3.3623, + "step": 45730 + }, + { + "epoch": 3.107419486343253, + "grad_norm": 0.8722405433654785, + "learning_rate": 0.0006117339312406577, + "loss": 3.4942, + "step": 45735 + }, + { + "epoch": 3.107759206413915, + "grad_norm": 1.0982486009597778, + "learning_rate": 0.000611691466231825, + "loss": 3.5117, + "step": 45740 + }, + { + "epoch": 3.1080989264845766, + "grad_norm": 1.0403333902359009, + "learning_rate": 0.0006116490012229923, + "loss": 3.6176, + "step": 45745 + }, + { + "epoch": 3.1084386465552383, + "grad_norm": 1.0949597358703613, + "learning_rate": 0.0006116065362141596, + "loss": 3.6646, + "step": 45750 + }, + { + "epoch": 3.1087783666259003, + "grad_norm": 1.059425711631775, + "learning_rate": 0.0006115640712053269, + "loss": 3.4826, + "step": 45755 + }, + { + "epoch": 3.109118086696562, + "grad_norm": 1.1251122951507568, + "learning_rate": 0.0006115216061964941, + "loss": 3.535, + "step": 45760 + }, + { + "epoch": 3.1094578067672236, + "grad_norm": 0.9542515873908997, + "learning_rate": 0.0006114791411876613, + "loss": 3.2803, + "step": 45765 + }, + { + "epoch": 3.1097975268378857, + "grad_norm": 1.0455451011657715, + "learning_rate": 0.0006114366761788287, + "loss": 3.4244, + "step": 45770 + }, + { + "epoch": 3.1101372469085473, + "grad_norm": 0.7721425890922546, + "learning_rate": 0.0006113942111699959, + "loss": 3.4856, + "step": 45775 + }, + { + "epoch": 3.110476966979209, + "grad_norm": 0.8104719519615173, + "learning_rate": 0.0006113517461611632, + "loss": 3.5859, + "step": 45780 + }, + { + "epoch": 3.110816687049871, + "grad_norm": 0.8811565637588501, + "learning_rate": 0.0006113092811523306, + "loss": 3.5654, + "step": 45785 + }, + { + "epoch": 3.1111564071205327, + "grad_norm": 0.7899727821350098, + "learning_rate": 0.0006112668161434978, + "loss": 3.79, + "step": 45790 + }, + { + "epoch": 3.1114961271911943, + "grad_norm": 1.030847191810608, + "learning_rate": 0.000611224351134665, + "loss": 3.6422, + "step": 45795 + }, + { + "epoch": 3.1118358472618564, + "grad_norm": 0.9650459289550781, + "learning_rate": 0.0006111818861258324, + "loss": 3.697, + "step": 45800 + }, + { + "epoch": 3.112175567332518, + "grad_norm": 1.0585886240005493, + "learning_rate": 0.0006111394211169996, + "loss": 3.453, + "step": 45805 + }, + { + "epoch": 3.1125152874031796, + "grad_norm": 0.7352738976478577, + "learning_rate": 0.0006110969561081668, + "loss": 3.2711, + "step": 45810 + }, + { + "epoch": 3.1128550074738417, + "grad_norm": 0.7258121371269226, + "learning_rate": 0.0006110544910993342, + "loss": 3.5064, + "step": 45815 + }, + { + "epoch": 3.1131947275445033, + "grad_norm": 1.7947381734848022, + "learning_rate": 0.0006110120260905015, + "loss": 3.7652, + "step": 45820 + }, + { + "epoch": 3.113534447615165, + "grad_norm": 0.8996764421463013, + "learning_rate": 0.0006109695610816687, + "loss": 3.3698, + "step": 45825 + }, + { + "epoch": 3.113874167685827, + "grad_norm": 0.6415503621101379, + "learning_rate": 0.000610927096072836, + "loss": 3.6142, + "step": 45830 + }, + { + "epoch": 3.1142138877564887, + "grad_norm": 0.8503998517990112, + "learning_rate": 0.0006108846310640033, + "loss": 3.5673, + "step": 45835 + }, + { + "epoch": 3.1145536078271503, + "grad_norm": 0.8762229084968567, + "learning_rate": 0.0006108421660551705, + "loss": 3.5727, + "step": 45840 + }, + { + "epoch": 3.1148933278978124, + "grad_norm": 0.926807165145874, + "learning_rate": 0.0006107997010463378, + "loss": 3.5344, + "step": 45845 + }, + { + "epoch": 3.115233047968474, + "grad_norm": 0.8956146836280823, + "learning_rate": 0.0006107572360375052, + "loss": 3.5763, + "step": 45850 + }, + { + "epoch": 3.1155727680391356, + "grad_norm": 1.1085032224655151, + "learning_rate": 0.0006107147710286724, + "loss": 3.4909, + "step": 45855 + }, + { + "epoch": 3.1159124881097977, + "grad_norm": 0.8928377032279968, + "learning_rate": 0.0006106723060198397, + "loss": 3.3816, + "step": 45860 + }, + { + "epoch": 3.1162522081804593, + "grad_norm": 0.6969806551933289, + "learning_rate": 0.0006106298410110069, + "loss": 3.5835, + "step": 45865 + }, + { + "epoch": 3.116591928251121, + "grad_norm": 0.9247251749038696, + "learning_rate": 0.0006105873760021742, + "loss": 3.4485, + "step": 45870 + }, + { + "epoch": 3.116931648321783, + "grad_norm": 0.8136182427406311, + "learning_rate": 0.0006105449109933415, + "loss": 3.557, + "step": 45875 + }, + { + "epoch": 3.1172713683924447, + "grad_norm": 0.8188034892082214, + "learning_rate": 0.0006105024459845087, + "loss": 3.7158, + "step": 45880 + }, + { + "epoch": 3.1176110884631063, + "grad_norm": 1.1930644512176514, + "learning_rate": 0.0006104599809756761, + "loss": 3.4998, + "step": 45885 + }, + { + "epoch": 3.1179508085337684, + "grad_norm": 1.079660177230835, + "learning_rate": 0.0006104175159668434, + "loss": 3.5645, + "step": 45890 + }, + { + "epoch": 3.11829052860443, + "grad_norm": 1.6480048894882202, + "learning_rate": 0.0006103750509580106, + "loss": 3.3669, + "step": 45895 + }, + { + "epoch": 3.1186302486750916, + "grad_norm": 1.118260383605957, + "learning_rate": 0.0006103325859491778, + "loss": 3.7141, + "step": 45900 + }, + { + "epoch": 3.1189699687457537, + "grad_norm": 0.7198235392570496, + "learning_rate": 0.0006102901209403452, + "loss": 3.6855, + "step": 45905 + }, + { + "epoch": 3.1193096888164153, + "grad_norm": 1.075509786605835, + "learning_rate": 0.0006102476559315124, + "loss": 3.7007, + "step": 45910 + }, + { + "epoch": 3.119649408887077, + "grad_norm": 0.9379994869232178, + "learning_rate": 0.0006102051909226796, + "loss": 3.604, + "step": 45915 + }, + { + "epoch": 3.1199891289577386, + "grad_norm": 0.8484824299812317, + "learning_rate": 0.0006101627259138471, + "loss": 3.4502, + "step": 45920 + }, + { + "epoch": 3.1203288490284007, + "grad_norm": 0.9027990102767944, + "learning_rate": 0.0006101202609050143, + "loss": 3.3439, + "step": 45925 + }, + { + "epoch": 3.1206685690990623, + "grad_norm": 0.8484631776809692, + "learning_rate": 0.0006100777958961815, + "loss": 3.4933, + "step": 45930 + }, + { + "epoch": 3.121008289169724, + "grad_norm": 0.9527095556259155, + "learning_rate": 0.0006100353308873489, + "loss": 3.4896, + "step": 45935 + }, + { + "epoch": 3.121348009240386, + "grad_norm": 0.8425201177597046, + "learning_rate": 0.0006099928658785161, + "loss": 3.2765, + "step": 45940 + }, + { + "epoch": 3.1216877293110477, + "grad_norm": 0.9998193383216858, + "learning_rate": 0.0006099504008696833, + "loss": 3.5732, + "step": 45945 + }, + { + "epoch": 3.1220274493817093, + "grad_norm": 0.7922817468643188, + "learning_rate": 0.0006099079358608506, + "loss": 3.5773, + "step": 45950 + }, + { + "epoch": 3.1223671694523714, + "grad_norm": 1.0666167736053467, + "learning_rate": 0.000609865470852018, + "loss": 3.2767, + "step": 45955 + }, + { + "epoch": 3.122706889523033, + "grad_norm": 0.943895161151886, + "learning_rate": 0.0006098230058431852, + "loss": 3.6291, + "step": 45960 + }, + { + "epoch": 3.1230466095936946, + "grad_norm": 0.8141846656799316, + "learning_rate": 0.0006097805408343525, + "loss": 3.2986, + "step": 45965 + }, + { + "epoch": 3.1233863296643567, + "grad_norm": 1.7815258502960205, + "learning_rate": 0.0006097380758255198, + "loss": 3.5458, + "step": 45970 + }, + { + "epoch": 3.1237260497350183, + "grad_norm": 1.0942445993423462, + "learning_rate": 0.000609695610816687, + "loss": 3.5792, + "step": 45975 + }, + { + "epoch": 3.12406576980568, + "grad_norm": 1.2193976640701294, + "learning_rate": 0.0006096531458078543, + "loss": 3.3163, + "step": 45980 + }, + { + "epoch": 3.124405489876342, + "grad_norm": 0.7208452820777893, + "learning_rate": 0.0006096106807990216, + "loss": 3.677, + "step": 45985 + }, + { + "epoch": 3.1247452099470037, + "grad_norm": 0.7632548809051514, + "learning_rate": 0.000609568215790189, + "loss": 3.2049, + "step": 45990 + }, + { + "epoch": 3.1250849300176653, + "grad_norm": 0.7589144706726074, + "learning_rate": 0.0006095257507813562, + "loss": 3.5534, + "step": 45995 + }, + { + "epoch": 3.1254246500883274, + "grad_norm": 0.8128264546394348, + "learning_rate": 0.0006094832857725234, + "loss": 3.752, + "step": 46000 + }, + { + "epoch": 3.125764370158989, + "grad_norm": 0.9216989278793335, + "learning_rate": 0.0006094408207636908, + "loss": 3.4161, + "step": 46005 + }, + { + "epoch": 3.1261040902296506, + "grad_norm": 0.7642640471458435, + "learning_rate": 0.000609398355754858, + "loss": 3.6234, + "step": 46010 + }, + { + "epoch": 3.1264438103003127, + "grad_norm": 0.8880747556686401, + "learning_rate": 0.0006093558907460252, + "loss": 3.3969, + "step": 46015 + }, + { + "epoch": 3.1267835303709743, + "grad_norm": 0.9441930651664734, + "learning_rate": 0.0006093134257371926, + "loss": 3.4485, + "step": 46020 + }, + { + "epoch": 3.127123250441636, + "grad_norm": 0.7467437386512756, + "learning_rate": 0.0006092709607283599, + "loss": 3.483, + "step": 46025 + }, + { + "epoch": 3.127462970512298, + "grad_norm": 0.7718145847320557, + "learning_rate": 0.0006092284957195271, + "loss": 3.5194, + "step": 46030 + }, + { + "epoch": 3.1278026905829597, + "grad_norm": 0.9905290007591248, + "learning_rate": 0.0006091860307106945, + "loss": 3.3001, + "step": 46035 + }, + { + "epoch": 3.1281424106536213, + "grad_norm": 1.0762919187545776, + "learning_rate": 0.0006091435657018617, + "loss": 3.5453, + "step": 46040 + }, + { + "epoch": 3.1284821307242834, + "grad_norm": 0.8631027936935425, + "learning_rate": 0.0006091011006930289, + "loss": 3.3007, + "step": 46045 + }, + { + "epoch": 3.128821850794945, + "grad_norm": 1.0237390995025635, + "learning_rate": 0.0006090586356841962, + "loss": 3.5246, + "step": 46050 + }, + { + "epoch": 3.1291615708656066, + "grad_norm": 0.8097394108772278, + "learning_rate": 0.0006090161706753635, + "loss": 3.4008, + "step": 46055 + }, + { + "epoch": 3.1295012909362687, + "grad_norm": 0.8495267033576965, + "learning_rate": 0.0006089737056665308, + "loss": 3.5143, + "step": 46060 + }, + { + "epoch": 3.1298410110069304, + "grad_norm": 1.0387853384017944, + "learning_rate": 0.0006089312406576981, + "loss": 3.5843, + "step": 46065 + }, + { + "epoch": 3.130180731077592, + "grad_norm": 0.8607489466667175, + "learning_rate": 0.0006088887756488654, + "loss": 3.6956, + "step": 46070 + }, + { + "epoch": 3.1305204511482536, + "grad_norm": 1.0732477903366089, + "learning_rate": 0.0006088463106400326, + "loss": 3.4254, + "step": 46075 + }, + { + "epoch": 3.1308601712189157, + "grad_norm": 1.1302192211151123, + "learning_rate": 0.0006088038456311999, + "loss": 3.5129, + "step": 46080 + }, + { + "epoch": 3.1311998912895773, + "grad_norm": 0.7527825832366943, + "learning_rate": 0.0006087613806223672, + "loss": 3.3262, + "step": 46085 + }, + { + "epoch": 3.131539611360239, + "grad_norm": 0.8113888502120972, + "learning_rate": 0.0006087189156135344, + "loss": 3.7509, + "step": 46090 + }, + { + "epoch": 3.131879331430901, + "grad_norm": 0.759975016117096, + "learning_rate": 0.0006086764506047018, + "loss": 3.6893, + "step": 46095 + }, + { + "epoch": 3.1322190515015627, + "grad_norm": 0.7809811234474182, + "learning_rate": 0.000608633985595869, + "loss": 3.668, + "step": 46100 + }, + { + "epoch": 3.1325587715722243, + "grad_norm": 0.973081111907959, + "learning_rate": 0.0006085915205870363, + "loss": 3.7831, + "step": 46105 + }, + { + "epoch": 3.1328984916428864, + "grad_norm": 0.8921964168548584, + "learning_rate": 0.0006085490555782036, + "loss": 3.5976, + "step": 46110 + }, + { + "epoch": 3.133238211713548, + "grad_norm": 1.7107208967208862, + "learning_rate": 0.0006085065905693708, + "loss": 3.3932, + "step": 46115 + }, + { + "epoch": 3.1335779317842096, + "grad_norm": 1.0520004034042358, + "learning_rate": 0.0006084641255605381, + "loss": 3.4954, + "step": 46120 + }, + { + "epoch": 3.1339176518548717, + "grad_norm": 0.8856369853019714, + "learning_rate": 0.0006084216605517054, + "loss": 3.7677, + "step": 46125 + }, + { + "epoch": 3.1342573719255333, + "grad_norm": 0.7930753827095032, + "learning_rate": 0.0006083791955428727, + "loss": 3.2206, + "step": 46130 + }, + { + "epoch": 3.134597091996195, + "grad_norm": 0.9167141914367676, + "learning_rate": 0.00060833673053404, + "loss": 3.4243, + "step": 46135 + }, + { + "epoch": 3.134936812066857, + "grad_norm": 0.965112030506134, + "learning_rate": 0.0006082942655252073, + "loss": 3.3166, + "step": 46140 + }, + { + "epoch": 3.1352765321375187, + "grad_norm": 0.9747025966644287, + "learning_rate": 0.0006082518005163745, + "loss": 3.5114, + "step": 46145 + }, + { + "epoch": 3.1356162522081803, + "grad_norm": 1.1292705535888672, + "learning_rate": 0.0006082093355075417, + "loss": 3.4995, + "step": 46150 + }, + { + "epoch": 3.1359559722788424, + "grad_norm": 0.7513163685798645, + "learning_rate": 0.0006081668704987091, + "loss": 3.5001, + "step": 46155 + }, + { + "epoch": 3.136295692349504, + "grad_norm": 0.9334239363670349, + "learning_rate": 0.0006081244054898763, + "loss": 3.4318, + "step": 46160 + }, + { + "epoch": 3.1366354124201656, + "grad_norm": 1.1728640794754028, + "learning_rate": 0.0006080819404810436, + "loss": 3.5484, + "step": 46165 + }, + { + "epoch": 3.1369751324908277, + "grad_norm": 0.8623613715171814, + "learning_rate": 0.000608039475472211, + "loss": 3.5882, + "step": 46170 + }, + { + "epoch": 3.1373148525614893, + "grad_norm": 0.9702238440513611, + "learning_rate": 0.0006079970104633782, + "loss": 3.6799, + "step": 46175 + }, + { + "epoch": 3.137654572632151, + "grad_norm": 0.8374427556991577, + "learning_rate": 0.0006079545454545454, + "loss": 3.5182, + "step": 46180 + }, + { + "epoch": 3.137994292702813, + "grad_norm": 0.9611116647720337, + "learning_rate": 0.0006079120804457128, + "loss": 3.5328, + "step": 46185 + }, + { + "epoch": 3.1383340127734747, + "grad_norm": 1.026437520980835, + "learning_rate": 0.00060786961543688, + "loss": 3.6497, + "step": 46190 + }, + { + "epoch": 3.1386737328441363, + "grad_norm": 1.00948965549469, + "learning_rate": 0.0006078271504280472, + "loss": 3.6534, + "step": 46195 + }, + { + "epoch": 3.1390134529147984, + "grad_norm": 1.1360137462615967, + "learning_rate": 0.0006077846854192147, + "loss": 3.5197, + "step": 46200 + }, + { + "epoch": 3.13935317298546, + "grad_norm": 1.1030008792877197, + "learning_rate": 0.0006077422204103819, + "loss": 3.4413, + "step": 46205 + }, + { + "epoch": 3.1396928930561216, + "grad_norm": 0.9710628390312195, + "learning_rate": 0.0006076997554015491, + "loss": 3.4287, + "step": 46210 + }, + { + "epoch": 3.1400326131267837, + "grad_norm": 0.7942935824394226, + "learning_rate": 0.0006076572903927164, + "loss": 3.8169, + "step": 46215 + }, + { + "epoch": 3.1403723331974454, + "grad_norm": 0.7436251044273376, + "learning_rate": 0.0006076148253838837, + "loss": 3.5965, + "step": 46220 + }, + { + "epoch": 3.140712053268107, + "grad_norm": 0.8710407614707947, + "learning_rate": 0.0006075723603750509, + "loss": 3.4706, + "step": 46225 + }, + { + "epoch": 3.141051773338769, + "grad_norm": 0.9795553088188171, + "learning_rate": 0.0006075298953662182, + "loss": 3.3476, + "step": 46230 + }, + { + "epoch": 3.1413914934094307, + "grad_norm": 1.0740381479263306, + "learning_rate": 0.0006074874303573856, + "loss": 3.5395, + "step": 46235 + }, + { + "epoch": 3.1417312134800923, + "grad_norm": 0.7790523171424866, + "learning_rate": 0.0006074449653485528, + "loss": 3.5145, + "step": 46240 + }, + { + "epoch": 3.1420709335507544, + "grad_norm": 0.9889588356018066, + "learning_rate": 0.0006074025003397201, + "loss": 3.4178, + "step": 46245 + }, + { + "epoch": 3.142410653621416, + "grad_norm": 0.9198558926582336, + "learning_rate": 0.0006073600353308873, + "loss": 3.5831, + "step": 46250 + }, + { + "epoch": 3.1427503736920777, + "grad_norm": 1.1918128728866577, + "learning_rate": 0.0006073175703220546, + "loss": 3.4339, + "step": 46255 + }, + { + "epoch": 3.1430900937627397, + "grad_norm": 1.5941894054412842, + "learning_rate": 0.0006072751053132219, + "loss": 3.5907, + "step": 46260 + }, + { + "epoch": 3.1434298138334014, + "grad_norm": 1.0564517974853516, + "learning_rate": 0.0006072326403043891, + "loss": 3.5591, + "step": 46265 + }, + { + "epoch": 3.143769533904063, + "grad_norm": 1.1934363842010498, + "learning_rate": 0.0006071901752955565, + "loss": 3.6709, + "step": 46270 + }, + { + "epoch": 3.1441092539747246, + "grad_norm": 0.9763439893722534, + "learning_rate": 0.0006071477102867238, + "loss": 3.6919, + "step": 46275 + }, + { + "epoch": 3.1444489740453867, + "grad_norm": 0.9669707417488098, + "learning_rate": 0.000607105245277891, + "loss": 3.5948, + "step": 46280 + }, + { + "epoch": 3.1447886941160483, + "grad_norm": 0.8981504440307617, + "learning_rate": 0.0006070627802690583, + "loss": 3.5097, + "step": 46285 + }, + { + "epoch": 3.14512841418671, + "grad_norm": 0.741168737411499, + "learning_rate": 0.0006070203152602256, + "loss": 3.6743, + "step": 46290 + }, + { + "epoch": 3.145468134257372, + "grad_norm": 0.8947200179100037, + "learning_rate": 0.0006069778502513928, + "loss": 3.402, + "step": 46295 + }, + { + "epoch": 3.1458078543280337, + "grad_norm": 0.8763923645019531, + "learning_rate": 0.00060693538524256, + "loss": 3.7738, + "step": 46300 + }, + { + "epoch": 3.1461475743986953, + "grad_norm": 1.0018210411071777, + "learning_rate": 0.0006068929202337275, + "loss": 3.7209, + "step": 46305 + }, + { + "epoch": 3.1464872944693574, + "grad_norm": 0.9102610945701599, + "learning_rate": 0.0006068504552248947, + "loss": 3.4045, + "step": 46310 + }, + { + "epoch": 3.146827014540019, + "grad_norm": 0.886794924736023, + "learning_rate": 0.0006068079902160619, + "loss": 3.3832, + "step": 46315 + }, + { + "epoch": 3.1471667346106806, + "grad_norm": 0.9271769523620605, + "learning_rate": 0.0006067655252072293, + "loss": 3.3512, + "step": 46320 + }, + { + "epoch": 3.1475064546813427, + "grad_norm": 1.1330229043960571, + "learning_rate": 0.0006067230601983965, + "loss": 3.7592, + "step": 46325 + }, + { + "epoch": 3.1478461747520043, + "grad_norm": 0.8698058724403381, + "learning_rate": 0.0006066805951895638, + "loss": 3.6292, + "step": 46330 + }, + { + "epoch": 3.148185894822666, + "grad_norm": 1.0209898948669434, + "learning_rate": 0.0006066381301807312, + "loss": 3.5407, + "step": 46335 + }, + { + "epoch": 3.148525614893328, + "grad_norm": 0.9837571382522583, + "learning_rate": 0.0006065956651718984, + "loss": 3.3779, + "step": 46340 + }, + { + "epoch": 3.1488653349639897, + "grad_norm": 1.1244730949401855, + "learning_rate": 0.0006065532001630657, + "loss": 3.659, + "step": 46345 + }, + { + "epoch": 3.1492050550346513, + "grad_norm": 0.7761459946632385, + "learning_rate": 0.0006065107351542329, + "loss": 3.4687, + "step": 46350 + }, + { + "epoch": 3.1495447751053134, + "grad_norm": 0.9284380078315735, + "learning_rate": 0.0006064682701454002, + "loss": 3.3564, + "step": 46355 + }, + { + "epoch": 3.149884495175975, + "grad_norm": 0.9221447110176086, + "learning_rate": 0.0006064258051365675, + "loss": 3.6012, + "step": 46360 + }, + { + "epoch": 3.1502242152466366, + "grad_norm": 0.8679723739624023, + "learning_rate": 0.0006063833401277347, + "loss": 3.7203, + "step": 46365 + }, + { + "epoch": 3.1505639353172987, + "grad_norm": 1.220781683921814, + "learning_rate": 0.0006063408751189021, + "loss": 3.3641, + "step": 46370 + }, + { + "epoch": 3.1509036553879604, + "grad_norm": 0.848608136177063, + "learning_rate": 0.0006062984101100694, + "loss": 3.4038, + "step": 46375 + }, + { + "epoch": 3.151243375458622, + "grad_norm": 0.9148770570755005, + "learning_rate": 0.0006062559451012366, + "loss": 3.4623, + "step": 46380 + }, + { + "epoch": 3.151583095529284, + "grad_norm": 0.9214658141136169, + "learning_rate": 0.0006062134800924039, + "loss": 3.6571, + "step": 46385 + }, + { + "epoch": 3.1519228155999457, + "grad_norm": 0.7882973551750183, + "learning_rate": 0.0006061710150835712, + "loss": 3.4731, + "step": 46390 + }, + { + "epoch": 3.1522625356706073, + "grad_norm": 0.8308243751525879, + "learning_rate": 0.0006061285500747384, + "loss": 3.582, + "step": 46395 + }, + { + "epoch": 3.1526022557412694, + "grad_norm": 0.8958504796028137, + "learning_rate": 0.0006060860850659056, + "loss": 3.744, + "step": 46400 + }, + { + "epoch": 3.152941975811931, + "grad_norm": 1.3854252099990845, + "learning_rate": 0.0006060436200570731, + "loss": 3.4901, + "step": 46405 + }, + { + "epoch": 3.1532816958825927, + "grad_norm": 0.6910836696624756, + "learning_rate": 0.0006060011550482403, + "loss": 3.5298, + "step": 46410 + }, + { + "epoch": 3.1536214159532543, + "grad_norm": 0.8302188515663147, + "learning_rate": 0.0006059586900394075, + "loss": 3.3542, + "step": 46415 + }, + { + "epoch": 3.1539611360239164, + "grad_norm": 1.0107803344726562, + "learning_rate": 0.0006059162250305749, + "loss": 3.5226, + "step": 46420 + }, + { + "epoch": 3.154300856094578, + "grad_norm": 5.867356300354004, + "learning_rate": 0.0006058737600217421, + "loss": 3.7382, + "step": 46425 + }, + { + "epoch": 3.1546405761652396, + "grad_norm": 0.7913936376571655, + "learning_rate": 0.0006058312950129093, + "loss": 3.5477, + "step": 46430 + }, + { + "epoch": 3.1549802962359017, + "grad_norm": 0.9924778342247009, + "learning_rate": 0.0006057888300040767, + "loss": 3.4129, + "step": 46435 + }, + { + "epoch": 3.1553200163065633, + "grad_norm": 0.9126939177513123, + "learning_rate": 0.000605746364995244, + "loss": 3.5577, + "step": 46440 + }, + { + "epoch": 3.155659736377225, + "grad_norm": 0.6479414701461792, + "learning_rate": 0.0006057038999864112, + "loss": 3.6248, + "step": 46445 + }, + { + "epoch": 3.155999456447887, + "grad_norm": 0.822129487991333, + "learning_rate": 0.0006056614349775785, + "loss": 3.6748, + "step": 46450 + }, + { + "epoch": 3.1563391765185487, + "grad_norm": 0.8057773113250732, + "learning_rate": 0.0006056189699687458, + "loss": 3.7631, + "step": 46455 + }, + { + "epoch": 3.1566788965892103, + "grad_norm": 0.9229053258895874, + "learning_rate": 0.000605576504959913, + "loss": 3.4607, + "step": 46460 + }, + { + "epoch": 3.1570186166598724, + "grad_norm": 0.6746360659599304, + "learning_rate": 0.0006055340399510803, + "loss": 3.457, + "step": 46465 + }, + { + "epoch": 3.157358336730534, + "grad_norm": 1.0471227169036865, + "learning_rate": 0.0006054915749422476, + "loss": 3.6736, + "step": 46470 + }, + { + "epoch": 3.1576980568011956, + "grad_norm": 0.8506830334663391, + "learning_rate": 0.0006054491099334149, + "loss": 3.2574, + "step": 46475 + }, + { + "epoch": 3.1580377768718577, + "grad_norm": 0.8007872700691223, + "learning_rate": 0.0006054066449245822, + "loss": 3.3805, + "step": 46480 + }, + { + "epoch": 3.1583774969425193, + "grad_norm": 0.814213752746582, + "learning_rate": 0.0006053641799157495, + "loss": 3.4605, + "step": 46485 + }, + { + "epoch": 3.158717217013181, + "grad_norm": 0.8441728353500366, + "learning_rate": 0.0006053217149069167, + "loss": 3.4405, + "step": 46490 + }, + { + "epoch": 3.159056937083843, + "grad_norm": 0.9004828929901123, + "learning_rate": 0.000605279249898084, + "loss": 3.535, + "step": 46495 + }, + { + "epoch": 3.1593966571545047, + "grad_norm": 0.7447927594184875, + "learning_rate": 0.0006052367848892512, + "loss": 3.0778, + "step": 46500 + }, + { + "epoch": 3.1597363772251663, + "grad_norm": 0.9692995548248291, + "learning_rate": 0.0006051943198804185, + "loss": 3.5846, + "step": 46505 + }, + { + "epoch": 3.1600760972958284, + "grad_norm": 1.101464033126831, + "learning_rate": 0.0006051518548715859, + "loss": 3.6693, + "step": 46510 + }, + { + "epoch": 3.16041581736649, + "grad_norm": 0.9324517250061035, + "learning_rate": 0.0006051093898627531, + "loss": 3.6704, + "step": 46515 + }, + { + "epoch": 3.1607555374371517, + "grad_norm": 0.8250672221183777, + "learning_rate": 0.0006050669248539204, + "loss": 3.6899, + "step": 46520 + }, + { + "epoch": 3.1610952575078137, + "grad_norm": 0.9854023456573486, + "learning_rate": 0.0006050244598450877, + "loss": 3.4412, + "step": 46525 + }, + { + "epoch": 3.1614349775784754, + "grad_norm": 0.938166081905365, + "learning_rate": 0.0006049819948362549, + "loss": 3.5538, + "step": 46530 + }, + { + "epoch": 3.161774697649137, + "grad_norm": 0.6857420206069946, + "learning_rate": 0.0006049395298274221, + "loss": 3.6492, + "step": 46535 + }, + { + "epoch": 3.162114417719799, + "grad_norm": 1.0589381456375122, + "learning_rate": 0.0006048970648185895, + "loss": 3.3891, + "step": 46540 + }, + { + "epoch": 3.1624541377904607, + "grad_norm": 1.199077844619751, + "learning_rate": 0.0006048545998097568, + "loss": 3.4084, + "step": 46545 + }, + { + "epoch": 3.1627938578611223, + "grad_norm": 0.6829050779342651, + "learning_rate": 0.000604812134800924, + "loss": 3.6577, + "step": 46550 + }, + { + "epoch": 3.1631335779317844, + "grad_norm": 1.3749375343322754, + "learning_rate": 0.0006047696697920914, + "loss": 3.6275, + "step": 46555 + }, + { + "epoch": 3.163473298002446, + "grad_norm": 1.0687618255615234, + "learning_rate": 0.0006047272047832586, + "loss": 3.4855, + "step": 46560 + }, + { + "epoch": 3.1638130180731077, + "grad_norm": 0.6431927680969238, + "learning_rate": 0.0006046847397744258, + "loss": 3.6915, + "step": 46565 + }, + { + "epoch": 3.1641527381437697, + "grad_norm": 1.0209699869155884, + "learning_rate": 0.0006046422747655932, + "loss": 3.4333, + "step": 46570 + }, + { + "epoch": 3.1644924582144314, + "grad_norm": 0.9238407015800476, + "learning_rate": 0.0006045998097567604, + "loss": 3.7916, + "step": 46575 + }, + { + "epoch": 3.164832178285093, + "grad_norm": 0.7589640021324158, + "learning_rate": 0.0006045573447479277, + "loss": 3.7483, + "step": 46580 + }, + { + "epoch": 3.165171898355755, + "grad_norm": 1.1190102100372314, + "learning_rate": 0.0006045148797390951, + "loss": 3.4163, + "step": 46585 + }, + { + "epoch": 3.1655116184264167, + "grad_norm": 0.9562110304832458, + "learning_rate": 0.0006044724147302623, + "loss": 3.3261, + "step": 46590 + }, + { + "epoch": 3.1658513384970783, + "grad_norm": 0.9003265500068665, + "learning_rate": 0.0006044299497214295, + "loss": 3.7291, + "step": 46595 + }, + { + "epoch": 3.1661910585677404, + "grad_norm": 0.8608159422874451, + "learning_rate": 0.0006043874847125968, + "loss": 3.5231, + "step": 46600 + }, + { + "epoch": 3.166530778638402, + "grad_norm": 0.9154327511787415, + "learning_rate": 0.0006043450197037641, + "loss": 3.6557, + "step": 46605 + }, + { + "epoch": 3.1668704987090637, + "grad_norm": 0.9925882816314697, + "learning_rate": 0.0006043025546949313, + "loss": 3.4511, + "step": 46610 + }, + { + "epoch": 3.1672102187797253, + "grad_norm": 0.9663722515106201, + "learning_rate": 0.0006042600896860987, + "loss": 3.4166, + "step": 46615 + }, + { + "epoch": 3.1675499388503874, + "grad_norm": 0.9861029386520386, + "learning_rate": 0.000604217624677266, + "loss": 3.6327, + "step": 46620 + }, + { + "epoch": 3.167889658921049, + "grad_norm": 0.8753692507743835, + "learning_rate": 0.0006041751596684332, + "loss": 3.8547, + "step": 46625 + }, + { + "epoch": 3.1682293789917106, + "grad_norm": 1.1428277492523193, + "learning_rate": 0.0006041326946596005, + "loss": 3.23, + "step": 46630 + }, + { + "epoch": 3.1685690990623727, + "grad_norm": 0.8439575433731079, + "learning_rate": 0.0006040902296507677, + "loss": 3.6398, + "step": 46635 + }, + { + "epoch": 3.1689088191330343, + "grad_norm": 0.7605099081993103, + "learning_rate": 0.000604047764641935, + "loss": 3.5639, + "step": 46640 + }, + { + "epoch": 3.169248539203696, + "grad_norm": 0.8255649209022522, + "learning_rate": 0.0006040052996331023, + "loss": 3.5929, + "step": 46645 + }, + { + "epoch": 3.169588259274358, + "grad_norm": 1.1977040767669678, + "learning_rate": 0.0006039628346242696, + "loss": 3.6478, + "step": 46650 + }, + { + "epoch": 3.1699279793450197, + "grad_norm": 0.8950563669204712, + "learning_rate": 0.0006039203696154369, + "loss": 3.5437, + "step": 46655 + }, + { + "epoch": 3.1702676994156813, + "grad_norm": 0.8936139345169067, + "learning_rate": 0.0006038779046066042, + "loss": 3.6786, + "step": 46660 + }, + { + "epoch": 3.1706074194863434, + "grad_norm": 0.6903778910636902, + "learning_rate": 0.0006038354395977714, + "loss": 3.4989, + "step": 46665 + }, + { + "epoch": 3.170947139557005, + "grad_norm": 0.8372366428375244, + "learning_rate": 0.0006037929745889388, + "loss": 3.5334, + "step": 46670 + }, + { + "epoch": 3.1712868596276667, + "grad_norm": 1.0461219549179077, + "learning_rate": 0.000603750509580106, + "loss": 3.6852, + "step": 46675 + }, + { + "epoch": 3.1716265796983287, + "grad_norm": 0.8519279360771179, + "learning_rate": 0.0006037080445712732, + "loss": 3.6736, + "step": 46680 + }, + { + "epoch": 3.1719662997689904, + "grad_norm": 1.0188757181167603, + "learning_rate": 0.0006036655795624407, + "loss": 3.3914, + "step": 46685 + }, + { + "epoch": 3.172306019839652, + "grad_norm": 1.8148763179779053, + "learning_rate": 0.0006036231145536079, + "loss": 3.5843, + "step": 46690 + }, + { + "epoch": 3.172645739910314, + "grad_norm": 0.8287613987922668, + "learning_rate": 0.0006035806495447751, + "loss": 3.7488, + "step": 46695 + }, + { + "epoch": 3.1729854599809757, + "grad_norm": 0.8893025517463684, + "learning_rate": 0.0006035381845359424, + "loss": 3.2906, + "step": 46700 + }, + { + "epoch": 3.1733251800516373, + "grad_norm": 0.847232460975647, + "learning_rate": 0.0006034957195271097, + "loss": 3.4813, + "step": 46705 + }, + { + "epoch": 3.1736649001222994, + "grad_norm": 0.9702305197715759, + "learning_rate": 0.0006034532545182769, + "loss": 3.4809, + "step": 46710 + }, + { + "epoch": 3.174004620192961, + "grad_norm": 1.1698143482208252, + "learning_rate": 0.0006034107895094442, + "loss": 3.7402, + "step": 46715 + }, + { + "epoch": 3.1743443402636227, + "grad_norm": 0.8594977259635925, + "learning_rate": 0.0006033683245006116, + "loss": 3.576, + "step": 46720 + }, + { + "epoch": 3.1746840603342847, + "grad_norm": 0.7405272126197815, + "learning_rate": 0.0006033258594917788, + "loss": 3.4256, + "step": 46725 + }, + { + "epoch": 3.1750237804049464, + "grad_norm": 0.7730692625045776, + "learning_rate": 0.0006032833944829461, + "loss": 3.5327, + "step": 46730 + }, + { + "epoch": 3.175363500475608, + "grad_norm": 0.8408945798873901, + "learning_rate": 0.0006032409294741133, + "loss": 3.4718, + "step": 46735 + }, + { + "epoch": 3.17570322054627, + "grad_norm": 1.0354704856872559, + "learning_rate": 0.0006031984644652806, + "loss": 3.6103, + "step": 46740 + }, + { + "epoch": 3.1760429406169317, + "grad_norm": 0.9831952452659607, + "learning_rate": 0.0006031559994564479, + "loss": 3.7897, + "step": 46745 + }, + { + "epoch": 3.1763826606875933, + "grad_norm": 0.7858777046203613, + "learning_rate": 0.0006031135344476151, + "loss": 3.4489, + "step": 46750 + }, + { + "epoch": 3.176722380758255, + "grad_norm": 0.8646013140678406, + "learning_rate": 0.0006030710694387825, + "loss": 3.1335, + "step": 46755 + }, + { + "epoch": 3.177062100828917, + "grad_norm": 0.9287490844726562, + "learning_rate": 0.0006030286044299498, + "loss": 3.499, + "step": 46760 + }, + { + "epoch": 3.1774018208995787, + "grad_norm": 0.7551256418228149, + "learning_rate": 0.000602986139421117, + "loss": 3.3785, + "step": 46765 + }, + { + "epoch": 3.1777415409702403, + "grad_norm": 0.9934840202331543, + "learning_rate": 0.0006029436744122843, + "loss": 3.5627, + "step": 46770 + }, + { + "epoch": 3.1780812610409024, + "grad_norm": 1.0932186841964722, + "learning_rate": 0.0006029012094034516, + "loss": 3.3344, + "step": 46775 + }, + { + "epoch": 3.178420981111564, + "grad_norm": 0.776796817779541, + "learning_rate": 0.0006028587443946188, + "loss": 3.5262, + "step": 46780 + }, + { + "epoch": 3.1787607011822256, + "grad_norm": 0.8539546728134155, + "learning_rate": 0.000602816279385786, + "loss": 3.4294, + "step": 46785 + }, + { + "epoch": 3.1791004212528877, + "grad_norm": 0.8351855874061584, + "learning_rate": 0.0006027738143769535, + "loss": 3.8036, + "step": 46790 + }, + { + "epoch": 3.1794401413235494, + "grad_norm": 0.7898567318916321, + "learning_rate": 0.0006027313493681207, + "loss": 3.5911, + "step": 46795 + }, + { + "epoch": 3.179779861394211, + "grad_norm": 1.0865623950958252, + "learning_rate": 0.0006026888843592879, + "loss": 3.326, + "step": 46800 + }, + { + "epoch": 3.180119581464873, + "grad_norm": 0.704699695110321, + "learning_rate": 0.0006026464193504553, + "loss": 3.3516, + "step": 46805 + }, + { + "epoch": 3.1804593015355347, + "grad_norm": 0.9653106927871704, + "learning_rate": 0.0006026039543416225, + "loss": 3.5272, + "step": 46810 + }, + { + "epoch": 3.1807990216061963, + "grad_norm": 0.8788805603981018, + "learning_rate": 0.0006025614893327897, + "loss": 3.396, + "step": 46815 + }, + { + "epoch": 3.1811387416768584, + "grad_norm": 1.168582558631897, + "learning_rate": 0.0006025190243239571, + "loss": 3.4098, + "step": 46820 + }, + { + "epoch": 3.18147846174752, + "grad_norm": 0.7129777669906616, + "learning_rate": 0.0006024765593151244, + "loss": 3.6063, + "step": 46825 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.7361340522766113, + "learning_rate": 0.0006024340943062916, + "loss": 3.5191, + "step": 46830 + }, + { + "epoch": 3.1821579018888437, + "grad_norm": 0.7671364545822144, + "learning_rate": 0.000602391629297459, + "loss": 3.5789, + "step": 46835 + }, + { + "epoch": 3.1824976219595054, + "grad_norm": 1.260205626487732, + "learning_rate": 0.0006023491642886262, + "loss": 3.391, + "step": 46840 + }, + { + "epoch": 3.182837342030167, + "grad_norm": 0.9091965556144714, + "learning_rate": 0.0006023066992797934, + "loss": 3.5358, + "step": 46845 + }, + { + "epoch": 3.183177062100829, + "grad_norm": 0.8036438822746277, + "learning_rate": 0.0006022642342709607, + "loss": 3.5578, + "step": 46850 + }, + { + "epoch": 3.1835167821714907, + "grad_norm": 0.8123762011528015, + "learning_rate": 0.000602221769262128, + "loss": 3.5424, + "step": 46855 + }, + { + "epoch": 3.1838565022421523, + "grad_norm": 1.2159863710403442, + "learning_rate": 0.0006021793042532953, + "loss": 3.6763, + "step": 46860 + }, + { + "epoch": 3.1841962223128144, + "grad_norm": 0.8449065089225769, + "learning_rate": 0.0006021368392444626, + "loss": 3.4787, + "step": 46865 + }, + { + "epoch": 3.184535942383476, + "grad_norm": 0.7536017894744873, + "learning_rate": 0.0006020943742356299, + "loss": 3.6605, + "step": 46870 + }, + { + "epoch": 3.1848756624541377, + "grad_norm": 0.9697331190109253, + "learning_rate": 0.0006020519092267971, + "loss": 3.5857, + "step": 46875 + }, + { + "epoch": 3.1852153825247997, + "grad_norm": 0.7890490293502808, + "learning_rate": 0.0006020094442179644, + "loss": 3.5538, + "step": 46880 + }, + { + "epoch": 3.1855551025954614, + "grad_norm": 0.8829075694084167, + "learning_rate": 0.0006019669792091316, + "loss": 3.7243, + "step": 46885 + }, + { + "epoch": 3.185894822666123, + "grad_norm": 1.000248670578003, + "learning_rate": 0.0006019245142002989, + "loss": 3.4946, + "step": 46890 + }, + { + "epoch": 3.186234542736785, + "grad_norm": 0.9197931289672852, + "learning_rate": 0.0006018820491914663, + "loss": 3.4464, + "step": 46895 + }, + { + "epoch": 3.1865742628074467, + "grad_norm": 0.8974789977073669, + "learning_rate": 0.0006018395841826335, + "loss": 3.6912, + "step": 46900 + }, + { + "epoch": 3.1869139828781083, + "grad_norm": 3.5404481887817383, + "learning_rate": 0.0006017971191738008, + "loss": 3.3499, + "step": 46905 + }, + { + "epoch": 3.1872537029487704, + "grad_norm": 0.8272597193717957, + "learning_rate": 0.0006017546541649681, + "loss": 3.6698, + "step": 46910 + }, + { + "epoch": 3.187593423019432, + "grad_norm": 0.8886229395866394, + "learning_rate": 0.0006017121891561353, + "loss": 3.4334, + "step": 46915 + }, + { + "epoch": 3.1879331430900937, + "grad_norm": 1.3106629848480225, + "learning_rate": 0.0006016697241473025, + "loss": 3.6093, + "step": 46920 + }, + { + "epoch": 3.1882728631607558, + "grad_norm": 0.8363166451454163, + "learning_rate": 0.00060162725913847, + "loss": 3.6953, + "step": 46925 + }, + { + "epoch": 3.1886125832314174, + "grad_norm": 0.8122643828392029, + "learning_rate": 0.0006015847941296372, + "loss": 3.5458, + "step": 46930 + }, + { + "epoch": 3.188952303302079, + "grad_norm": 0.9333836436271667, + "learning_rate": 0.0006015423291208044, + "loss": 3.502, + "step": 46935 + }, + { + "epoch": 3.189292023372741, + "grad_norm": 1.021997332572937, + "learning_rate": 0.0006014998641119718, + "loss": 3.5749, + "step": 46940 + }, + { + "epoch": 3.1896317434434027, + "grad_norm": 0.6847817301750183, + "learning_rate": 0.000601457399103139, + "loss": 3.4484, + "step": 46945 + }, + { + "epoch": 3.1899714635140644, + "grad_norm": 0.8086551427841187, + "learning_rate": 0.0006014149340943062, + "loss": 3.5478, + "step": 46950 + }, + { + "epoch": 3.190311183584726, + "grad_norm": 1.04338800907135, + "learning_rate": 0.0006013724690854736, + "loss": 3.3758, + "step": 46955 + }, + { + "epoch": 3.190650903655388, + "grad_norm": 0.7808682322502136, + "learning_rate": 0.0006013300040766409, + "loss": 3.3264, + "step": 46960 + }, + { + "epoch": 3.1909906237260497, + "grad_norm": 0.9394356608390808, + "learning_rate": 0.0006012875390678081, + "loss": 3.4327, + "step": 46965 + }, + { + "epoch": 3.1913303437967113, + "grad_norm": 1.09820556640625, + "learning_rate": 0.0006012450740589755, + "loss": 3.3579, + "step": 46970 + }, + { + "epoch": 3.1916700638673734, + "grad_norm": 0.9637178182601929, + "learning_rate": 0.0006012026090501427, + "loss": 3.5649, + "step": 46975 + }, + { + "epoch": 3.192009783938035, + "grad_norm": 0.9743833541870117, + "learning_rate": 0.0006011601440413099, + "loss": 3.4498, + "step": 46980 + }, + { + "epoch": 3.1923495040086967, + "grad_norm": 0.9451848864555359, + "learning_rate": 0.0006011176790324772, + "loss": 3.4451, + "step": 46985 + }, + { + "epoch": 3.1926892240793587, + "grad_norm": 0.9107082486152649, + "learning_rate": 0.0006010752140236445, + "loss": 3.5335, + "step": 46990 + }, + { + "epoch": 3.1930289441500204, + "grad_norm": 0.7596897482872009, + "learning_rate": 0.0006010327490148118, + "loss": 3.4703, + "step": 46995 + }, + { + "epoch": 3.193368664220682, + "grad_norm": 0.6794086694717407, + "learning_rate": 0.0006009902840059791, + "loss": 3.3964, + "step": 47000 + }, + { + "epoch": 3.193708384291344, + "grad_norm": 0.7960811853408813, + "learning_rate": 0.0006009478189971464, + "loss": 3.7229, + "step": 47005 + }, + { + "epoch": 3.1940481043620057, + "grad_norm": 0.830722987651825, + "learning_rate": 0.0006009053539883137, + "loss": 3.2985, + "step": 47010 + }, + { + "epoch": 3.1943878244326673, + "grad_norm": 0.9422034025192261, + "learning_rate": 0.0006008628889794809, + "loss": 3.5367, + "step": 47015 + }, + { + "epoch": 3.1947275445033294, + "grad_norm": 0.8896151185035706, + "learning_rate": 0.0006008204239706482, + "loss": 3.5658, + "step": 47020 + }, + { + "epoch": 3.195067264573991, + "grad_norm": 0.787697970867157, + "learning_rate": 0.0006007779589618155, + "loss": 3.5726, + "step": 47025 + }, + { + "epoch": 3.1954069846446527, + "grad_norm": 0.8697583079338074, + "learning_rate": 0.0006007354939529828, + "loss": 3.4205, + "step": 47030 + }, + { + "epoch": 3.1957467047153147, + "grad_norm": 0.7322379946708679, + "learning_rate": 0.00060069302894415, + "loss": 3.4746, + "step": 47035 + }, + { + "epoch": 3.1960864247859764, + "grad_norm": 0.9854978919029236, + "learning_rate": 0.0006006505639353174, + "loss": 3.5477, + "step": 47040 + }, + { + "epoch": 3.196426144856638, + "grad_norm": 0.771025538444519, + "learning_rate": 0.0006006080989264846, + "loss": 3.2027, + "step": 47045 + }, + { + "epoch": 3.1967658649273, + "grad_norm": 1.0371077060699463, + "learning_rate": 0.0006005656339176518, + "loss": 3.3897, + "step": 47050 + }, + { + "epoch": 3.1971055849979617, + "grad_norm": 0.7623852491378784, + "learning_rate": 0.0006005231689088192, + "loss": 3.4934, + "step": 47055 + }, + { + "epoch": 3.1974453050686233, + "grad_norm": 1.0432995557785034, + "learning_rate": 0.0006004807038999864, + "loss": 3.3387, + "step": 47060 + }, + { + "epoch": 3.1977850251392854, + "grad_norm": 0.8045148849487305, + "learning_rate": 0.0006004382388911537, + "loss": 3.7045, + "step": 47065 + }, + { + "epoch": 3.198124745209947, + "grad_norm": 0.9619086980819702, + "learning_rate": 0.0006003957738823211, + "loss": 3.511, + "step": 47070 + }, + { + "epoch": 3.1984644652806087, + "grad_norm": 0.7844075560569763, + "learning_rate": 0.0006003533088734883, + "loss": 3.8174, + "step": 47075 + }, + { + "epoch": 3.1988041853512708, + "grad_norm": 0.8481375575065613, + "learning_rate": 0.0006003108438646555, + "loss": 3.3377, + "step": 47080 + }, + { + "epoch": 3.1991439054219324, + "grad_norm": 0.8932517170906067, + "learning_rate": 0.0006002683788558228, + "loss": 3.6169, + "step": 47085 + }, + { + "epoch": 3.199483625492594, + "grad_norm": 0.9887673854827881, + "learning_rate": 0.0006002259138469901, + "loss": 3.6786, + "step": 47090 + }, + { + "epoch": 3.1998233455632556, + "grad_norm": 0.8950097560882568, + "learning_rate": 0.0006001834488381573, + "loss": 3.4194, + "step": 47095 + }, + { + "epoch": 3.2001630656339177, + "grad_norm": 0.8182898759841919, + "learning_rate": 0.0006001409838293247, + "loss": 3.4496, + "step": 47100 + }, + { + "epoch": 3.2005027857045794, + "grad_norm": 0.9116148948669434, + "learning_rate": 0.000600098518820492, + "loss": 3.4264, + "step": 47105 + }, + { + "epoch": 3.200842505775241, + "grad_norm": 0.7938491702079773, + "learning_rate": 0.0006000560538116592, + "loss": 3.8632, + "step": 47110 + }, + { + "epoch": 3.201182225845903, + "grad_norm": 0.9127912521362305, + "learning_rate": 0.0006000135888028265, + "loss": 3.4845, + "step": 47115 + }, + { + "epoch": 3.2015219459165647, + "grad_norm": 1.5167125463485718, + "learning_rate": 0.0005999711237939938, + "loss": 3.562, + "step": 47120 + }, + { + "epoch": 3.2018616659872263, + "grad_norm": 0.8054856657981873, + "learning_rate": 0.000599928658785161, + "loss": 3.1869, + "step": 47125 + }, + { + "epoch": 3.2022013860578884, + "grad_norm": 0.7191848158836365, + "learning_rate": 0.0005998861937763283, + "loss": 3.6923, + "step": 47130 + }, + { + "epoch": 3.20254110612855, + "grad_norm": 1.1410577297210693, + "learning_rate": 0.0005998437287674956, + "loss": 3.6635, + "step": 47135 + }, + { + "epoch": 3.2028808261992117, + "grad_norm": 1.1920030117034912, + "learning_rate": 0.0005998012637586629, + "loss": 3.509, + "step": 47140 + }, + { + "epoch": 3.2032205462698737, + "grad_norm": 0.732431948184967, + "learning_rate": 0.0005997587987498302, + "loss": 3.5515, + "step": 47145 + }, + { + "epoch": 3.2035602663405354, + "grad_norm": 0.9868467450141907, + "learning_rate": 0.0005997163337409974, + "loss": 3.5237, + "step": 47150 + }, + { + "epoch": 3.203899986411197, + "grad_norm": 6.666890621185303, + "learning_rate": 0.0005996738687321647, + "loss": 3.6783, + "step": 47155 + }, + { + "epoch": 3.204239706481859, + "grad_norm": 0.8998790383338928, + "learning_rate": 0.000599631403723332, + "loss": 3.5727, + "step": 47160 + }, + { + "epoch": 3.2045794265525207, + "grad_norm": 0.9208125472068787, + "learning_rate": 0.0005995889387144992, + "loss": 3.5466, + "step": 47165 + }, + { + "epoch": 3.2049191466231823, + "grad_norm": 0.8392632603645325, + "learning_rate": 0.0005995464737056666, + "loss": 3.7365, + "step": 47170 + }, + { + "epoch": 3.2052588666938444, + "grad_norm": 0.7278985977172852, + "learning_rate": 0.0005995040086968339, + "loss": 3.4092, + "step": 47175 + }, + { + "epoch": 3.205598586764506, + "grad_norm": 0.8337382674217224, + "learning_rate": 0.0005994615436880011, + "loss": 3.7015, + "step": 47180 + }, + { + "epoch": 3.2059383068351677, + "grad_norm": 0.9369175434112549, + "learning_rate": 0.0005994190786791683, + "loss": 3.4075, + "step": 47185 + }, + { + "epoch": 3.2062780269058297, + "grad_norm": 0.9891911745071411, + "learning_rate": 0.0005993766136703357, + "loss": 3.4581, + "step": 47190 + }, + { + "epoch": 3.2066177469764914, + "grad_norm": 0.9671744108200073, + "learning_rate": 0.0005993341486615029, + "loss": 3.4442, + "step": 47195 + }, + { + "epoch": 3.206957467047153, + "grad_norm": 0.8279282450675964, + "learning_rate": 0.0005992916836526701, + "loss": 3.3001, + "step": 47200 + }, + { + "epoch": 3.207297187117815, + "grad_norm": 0.9995616674423218, + "learning_rate": 0.0005992492186438376, + "loss": 3.3592, + "step": 47205 + }, + { + "epoch": 3.2076369071884767, + "grad_norm": 0.7095403075218201, + "learning_rate": 0.0005992067536350048, + "loss": 3.6536, + "step": 47210 + }, + { + "epoch": 3.2079766272591383, + "grad_norm": 1.2349727153778076, + "learning_rate": 0.000599164288626172, + "loss": 3.2319, + "step": 47215 + }, + { + "epoch": 3.2083163473298004, + "grad_norm": 1.0005429983139038, + "learning_rate": 0.0005991218236173394, + "loss": 3.3814, + "step": 47220 + }, + { + "epoch": 3.208656067400462, + "grad_norm": 0.8117414116859436, + "learning_rate": 0.0005990793586085066, + "loss": 3.47, + "step": 47225 + }, + { + "epoch": 3.2089957874711237, + "grad_norm": 0.7950822710990906, + "learning_rate": 0.0005990368935996738, + "loss": 3.1758, + "step": 47230 + }, + { + "epoch": 3.2093355075417858, + "grad_norm": 0.9990066289901733, + "learning_rate": 0.0005989944285908411, + "loss": 3.8167, + "step": 47235 + }, + { + "epoch": 3.2096752276124474, + "grad_norm": 0.8875026106834412, + "learning_rate": 0.0005989519635820085, + "loss": 3.5699, + "step": 47240 + }, + { + "epoch": 3.210014947683109, + "grad_norm": 0.8167032599449158, + "learning_rate": 0.0005989094985731757, + "loss": 3.6276, + "step": 47245 + }, + { + "epoch": 3.210354667753771, + "grad_norm": 1.000830888748169, + "learning_rate": 0.000598867033564343, + "loss": 3.7532, + "step": 47250 + }, + { + "epoch": 3.2106943878244327, + "grad_norm": 0.8671822547912598, + "learning_rate": 0.0005988245685555103, + "loss": 3.4874, + "step": 47255 + }, + { + "epoch": 3.2110341078950944, + "grad_norm": 0.8398844003677368, + "learning_rate": 0.0005987821035466775, + "loss": 3.5865, + "step": 47260 + }, + { + "epoch": 3.2113738279657564, + "grad_norm": 0.9057223200798035, + "learning_rate": 0.0005987396385378448, + "loss": 3.2238, + "step": 47265 + }, + { + "epoch": 3.211713548036418, + "grad_norm": 0.7942622303962708, + "learning_rate": 0.000598697173529012, + "loss": 3.5698, + "step": 47270 + }, + { + "epoch": 3.2120532681070797, + "grad_norm": 1.3247205018997192, + "learning_rate": 0.0005986547085201794, + "loss": 3.4761, + "step": 47275 + }, + { + "epoch": 3.2123929881777418, + "grad_norm": 1.0181808471679688, + "learning_rate": 0.0005986122435113467, + "loss": 3.5797, + "step": 47280 + }, + { + "epoch": 3.2127327082484034, + "grad_norm": 0.7267351746559143, + "learning_rate": 0.0005985697785025139, + "loss": 3.4536, + "step": 47285 + }, + { + "epoch": 3.213072428319065, + "grad_norm": 1.0968592166900635, + "learning_rate": 0.0005985273134936812, + "loss": 3.5328, + "step": 47290 + }, + { + "epoch": 3.2134121483897267, + "grad_norm": 0.7640217542648315, + "learning_rate": 0.0005984848484848485, + "loss": 3.4072, + "step": 47295 + }, + { + "epoch": 3.2137518684603887, + "grad_norm": 1.0211881399154663, + "learning_rate": 0.0005984423834760157, + "loss": 3.7047, + "step": 47300 + }, + { + "epoch": 3.2140915885310504, + "grad_norm": 0.709193766117096, + "learning_rate": 0.000598399918467183, + "loss": 3.1749, + "step": 47305 + }, + { + "epoch": 3.214431308601712, + "grad_norm": 0.8448679447174072, + "learning_rate": 0.0005983574534583504, + "loss": 3.1969, + "step": 47310 + }, + { + "epoch": 3.214771028672374, + "grad_norm": 0.8338539600372314, + "learning_rate": 0.0005983149884495176, + "loss": 3.4935, + "step": 47315 + }, + { + "epoch": 3.2151107487430357, + "grad_norm": 2.079035997390747, + "learning_rate": 0.0005982725234406848, + "loss": 3.6768, + "step": 47320 + }, + { + "epoch": 3.2154504688136973, + "grad_norm": 0.7940142750740051, + "learning_rate": 0.0005982300584318522, + "loss": 3.67, + "step": 47325 + }, + { + "epoch": 3.2157901888843594, + "grad_norm": 1.2892569303512573, + "learning_rate": 0.0005981875934230194, + "loss": 3.344, + "step": 47330 + }, + { + "epoch": 3.216129908955021, + "grad_norm": 3.6183581352233887, + "learning_rate": 0.0005981451284141866, + "loss": 3.4604, + "step": 47335 + }, + { + "epoch": 3.2164696290256827, + "grad_norm": 0.7363945245742798, + "learning_rate": 0.000598102663405354, + "loss": 3.4703, + "step": 47340 + }, + { + "epoch": 3.2168093490963448, + "grad_norm": 0.967413604259491, + "learning_rate": 0.0005980601983965213, + "loss": 3.2542, + "step": 47345 + }, + { + "epoch": 3.2171490691670064, + "grad_norm": 0.9582085609436035, + "learning_rate": 0.0005980177333876886, + "loss": 3.5357, + "step": 47350 + }, + { + "epoch": 3.217488789237668, + "grad_norm": 0.7528563737869263, + "learning_rate": 0.0005979752683788559, + "loss": 3.39, + "step": 47355 + }, + { + "epoch": 3.21782850930833, + "grad_norm": 0.8628895878791809, + "learning_rate": 0.0005979328033700231, + "loss": 3.7028, + "step": 47360 + }, + { + "epoch": 3.2181682293789917, + "grad_norm": 1.070219874382019, + "learning_rate": 0.0005978903383611904, + "loss": 3.3831, + "step": 47365 + }, + { + "epoch": 3.2185079494496533, + "grad_norm": 0.9216921925544739, + "learning_rate": 0.0005978478733523576, + "loss": 3.6581, + "step": 47370 + }, + { + "epoch": 3.2188476695203154, + "grad_norm": 0.8176274299621582, + "learning_rate": 0.0005978054083435249, + "loss": 3.4013, + "step": 47375 + }, + { + "epoch": 3.219187389590977, + "grad_norm": 0.7300509810447693, + "learning_rate": 0.0005977629433346923, + "loss": 3.7269, + "step": 47380 + }, + { + "epoch": 3.2195271096616387, + "grad_norm": 0.8048720359802246, + "learning_rate": 0.0005977204783258595, + "loss": 3.1934, + "step": 47385 + }, + { + "epoch": 3.2198668297323008, + "grad_norm": 0.6914896965026855, + "learning_rate": 0.0005976780133170268, + "loss": 3.6374, + "step": 47390 + }, + { + "epoch": 3.2202065498029624, + "grad_norm": 1.0596805810928345, + "learning_rate": 0.0005976355483081941, + "loss": 3.6157, + "step": 47395 + }, + { + "epoch": 3.220546269873624, + "grad_norm": 0.7556807398796082, + "learning_rate": 0.0005975930832993613, + "loss": 3.413, + "step": 47400 + }, + { + "epoch": 3.220885989944286, + "grad_norm": 1.2750440835952759, + "learning_rate": 0.0005975506182905286, + "loss": 3.4527, + "step": 47405 + }, + { + "epoch": 3.2212257100149477, + "grad_norm": 0.7988293170928955, + "learning_rate": 0.000597508153281696, + "loss": 3.4544, + "step": 47410 + }, + { + "epoch": 3.2215654300856094, + "grad_norm": 0.7833225727081299, + "learning_rate": 0.0005974656882728632, + "loss": 3.5853, + "step": 47415 + }, + { + "epoch": 3.2219051501562714, + "grad_norm": 0.9762526154518127, + "learning_rate": 0.0005974232232640304, + "loss": 3.4922, + "step": 47420 + }, + { + "epoch": 3.222244870226933, + "grad_norm": 1.1331369876861572, + "learning_rate": 0.0005973807582551978, + "loss": 3.3771, + "step": 47425 + }, + { + "epoch": 3.2225845902975947, + "grad_norm": 1.4212968349456787, + "learning_rate": 0.000597338293246365, + "loss": 3.3846, + "step": 47430 + }, + { + "epoch": 3.2229243103682563, + "grad_norm": 1.4719271659851074, + "learning_rate": 0.0005972958282375322, + "loss": 3.5188, + "step": 47435 + }, + { + "epoch": 3.2232640304389184, + "grad_norm": 0.9282580018043518, + "learning_rate": 0.0005972533632286996, + "loss": 3.3853, + "step": 47440 + }, + { + "epoch": 3.22360375050958, + "grad_norm": 0.9255167841911316, + "learning_rate": 0.0005972108982198669, + "loss": 3.6475, + "step": 47445 + }, + { + "epoch": 3.2239434705802417, + "grad_norm": 1.008363962173462, + "learning_rate": 0.0005971684332110341, + "loss": 3.4558, + "step": 47450 + }, + { + "epoch": 3.2242831906509037, + "grad_norm": 2.0970208644866943, + "learning_rate": 0.0005971259682022015, + "loss": 3.6591, + "step": 47455 + }, + { + "epoch": 3.2246229107215654, + "grad_norm": 0.9595999717712402, + "learning_rate": 0.0005970835031933687, + "loss": 3.5658, + "step": 47460 + }, + { + "epoch": 3.224962630792227, + "grad_norm": 2.674875020980835, + "learning_rate": 0.0005970410381845359, + "loss": 3.5505, + "step": 47465 + }, + { + "epoch": 3.225302350862889, + "grad_norm": 0.7944196462631226, + "learning_rate": 0.0005969985731757032, + "loss": 3.4832, + "step": 47470 + }, + { + "epoch": 3.2256420709335507, + "grad_norm": 0.7770931124687195, + "learning_rate": 0.0005969561081668705, + "loss": 3.5265, + "step": 47475 + }, + { + "epoch": 3.2259817910042123, + "grad_norm": 0.8102033734321594, + "learning_rate": 0.0005969136431580378, + "loss": 3.6995, + "step": 47480 + }, + { + "epoch": 3.2263215110748744, + "grad_norm": 0.8609127998352051, + "learning_rate": 0.0005968711781492051, + "loss": 3.7367, + "step": 47485 + }, + { + "epoch": 3.226661231145536, + "grad_norm": 0.9203303456306458, + "learning_rate": 0.0005968287131403724, + "loss": 3.5749, + "step": 47490 + }, + { + "epoch": 3.2270009512161977, + "grad_norm": 1.0082615613937378, + "learning_rate": 0.0005967862481315396, + "loss": 3.216, + "step": 47495 + }, + { + "epoch": 3.2273406712868598, + "grad_norm": 1.1802849769592285, + "learning_rate": 0.0005967437831227069, + "loss": 3.2504, + "step": 47500 + }, + { + "epoch": 3.2276803913575214, + "grad_norm": 0.887146532535553, + "learning_rate": 0.0005967013181138742, + "loss": 3.6866, + "step": 47505 + }, + { + "epoch": 3.228020111428183, + "grad_norm": 0.8191695213317871, + "learning_rate": 0.0005966588531050414, + "loss": 3.6189, + "step": 47510 + }, + { + "epoch": 3.228359831498845, + "grad_norm": 0.8491462469100952, + "learning_rate": 0.0005966163880962088, + "loss": 3.8255, + "step": 47515 + }, + { + "epoch": 3.2286995515695067, + "grad_norm": 0.8035827279090881, + "learning_rate": 0.000596573923087376, + "loss": 3.2167, + "step": 47520 + }, + { + "epoch": 3.2290392716401684, + "grad_norm": 0.8062894344329834, + "learning_rate": 0.0005965314580785433, + "loss": 3.5675, + "step": 47525 + }, + { + "epoch": 3.2293789917108304, + "grad_norm": 0.8055529594421387, + "learning_rate": 0.0005964889930697106, + "loss": 3.6262, + "step": 47530 + }, + { + "epoch": 3.229718711781492, + "grad_norm": 1.1580908298492432, + "learning_rate": 0.0005964465280608778, + "loss": 3.1722, + "step": 47535 + }, + { + "epoch": 3.2300584318521537, + "grad_norm": 1.037095069885254, + "learning_rate": 0.0005964040630520451, + "loss": 3.4264, + "step": 47540 + }, + { + "epoch": 3.2303981519228158, + "grad_norm": 0.7803179621696472, + "learning_rate": 0.0005963615980432124, + "loss": 3.4729, + "step": 47545 + }, + { + "epoch": 3.2307378719934774, + "grad_norm": 0.8891220092773438, + "learning_rate": 0.0005963191330343797, + "loss": 3.181, + "step": 47550 + }, + { + "epoch": 3.231077592064139, + "grad_norm": 1.1926286220550537, + "learning_rate": 0.000596276668025547, + "loss": 3.495, + "step": 47555 + }, + { + "epoch": 3.231417312134801, + "grad_norm": 0.8503395915031433, + "learning_rate": 0.0005962342030167143, + "loss": 3.2873, + "step": 47560 + }, + { + "epoch": 3.2317570322054627, + "grad_norm": 0.7468401193618774, + "learning_rate": 0.0005961917380078815, + "loss": 3.3043, + "step": 47565 + }, + { + "epoch": 3.2320967522761244, + "grad_norm": 0.7681732177734375, + "learning_rate": 0.0005961492729990487, + "loss": 3.4785, + "step": 47570 + }, + { + "epoch": 3.2324364723467864, + "grad_norm": 0.883495032787323, + "learning_rate": 0.0005961068079902161, + "loss": 3.5555, + "step": 47575 + }, + { + "epoch": 3.232776192417448, + "grad_norm": 1.001518964767456, + "learning_rate": 0.0005960643429813833, + "loss": 3.5398, + "step": 47580 + }, + { + "epoch": 3.2331159124881097, + "grad_norm": 1.0533230304718018, + "learning_rate": 0.0005960218779725506, + "loss": 3.7431, + "step": 47585 + }, + { + "epoch": 3.2334556325587718, + "grad_norm": 0.7747108936309814, + "learning_rate": 0.000595979412963718, + "loss": 3.665, + "step": 47590 + }, + { + "epoch": 3.2337953526294334, + "grad_norm": 3.1502833366394043, + "learning_rate": 0.0005959369479548852, + "loss": 3.3424, + "step": 47595 + }, + { + "epoch": 3.234135072700095, + "grad_norm": 0.8496320843696594, + "learning_rate": 0.0005958944829460524, + "loss": 3.7848, + "step": 47600 + }, + { + "epoch": 3.234474792770757, + "grad_norm": 3.3290321826934814, + "learning_rate": 0.0005958520179372198, + "loss": 3.654, + "step": 47605 + }, + { + "epoch": 3.2348145128414187, + "grad_norm": 0.8131629824638367, + "learning_rate": 0.000595809552928387, + "loss": 3.2836, + "step": 47610 + }, + { + "epoch": 3.2351542329120804, + "grad_norm": 1.1176185607910156, + "learning_rate": 0.0005957670879195542, + "loss": 3.5898, + "step": 47615 + }, + { + "epoch": 3.2354939529827424, + "grad_norm": 0.7077348232269287, + "learning_rate": 0.0005957246229107217, + "loss": 3.7067, + "step": 47620 + }, + { + "epoch": 3.235833673053404, + "grad_norm": 1.5158995389938354, + "learning_rate": 0.0005956821579018889, + "loss": 3.425, + "step": 47625 + }, + { + "epoch": 3.2361733931240657, + "grad_norm": 0.8071035742759705, + "learning_rate": 0.0005956396928930561, + "loss": 3.4786, + "step": 47630 + }, + { + "epoch": 3.2365131131947273, + "grad_norm": 0.800674557685852, + "learning_rate": 0.0005955972278842234, + "loss": 3.5058, + "step": 47635 + }, + { + "epoch": 3.2368528332653894, + "grad_norm": 0.7000810503959656, + "learning_rate": 0.0005955547628753907, + "loss": 3.5776, + "step": 47640 + }, + { + "epoch": 3.237192553336051, + "grad_norm": 0.8577728271484375, + "learning_rate": 0.0005955122978665579, + "loss": 3.6308, + "step": 47645 + }, + { + "epoch": 3.2375322734067127, + "grad_norm": 0.8657935857772827, + "learning_rate": 0.0005954698328577252, + "loss": 3.295, + "step": 47650 + }, + { + "epoch": 3.2378719934773748, + "grad_norm": 0.7039874196052551, + "learning_rate": 0.0005954273678488926, + "loss": 3.5067, + "step": 47655 + }, + { + "epoch": 3.2382117135480364, + "grad_norm": 0.8800588250160217, + "learning_rate": 0.0005953849028400598, + "loss": 3.7158, + "step": 47660 + }, + { + "epoch": 3.238551433618698, + "grad_norm": 0.947394609451294, + "learning_rate": 0.0005953424378312271, + "loss": 3.5404, + "step": 47665 + }, + { + "epoch": 3.23889115368936, + "grad_norm": 0.6780798435211182, + "learning_rate": 0.0005952999728223943, + "loss": 3.547, + "step": 47670 + }, + { + "epoch": 3.2392308737600217, + "grad_norm": 1.28602135181427, + "learning_rate": 0.0005952575078135616, + "loss": 3.5327, + "step": 47675 + }, + { + "epoch": 3.2395705938306834, + "grad_norm": 0.9508458971977234, + "learning_rate": 0.0005952150428047289, + "loss": 3.47, + "step": 47680 + }, + { + "epoch": 3.2399103139013454, + "grad_norm": 1.0098148584365845, + "learning_rate": 0.0005951725777958961, + "loss": 3.8414, + "step": 47685 + }, + { + "epoch": 3.240250033972007, + "grad_norm": 1.1358612775802612, + "learning_rate": 0.0005951301127870636, + "loss": 3.4281, + "step": 47690 + }, + { + "epoch": 3.2405897540426687, + "grad_norm": 0.7378382086753845, + "learning_rate": 0.0005950876477782308, + "loss": 3.8812, + "step": 47695 + }, + { + "epoch": 3.2409294741133308, + "grad_norm": 0.9042045474052429, + "learning_rate": 0.000595045182769398, + "loss": 3.547, + "step": 47700 + }, + { + "epoch": 3.2412691941839924, + "grad_norm": 0.7616078853607178, + "learning_rate": 0.0005950027177605654, + "loss": 3.6091, + "step": 47705 + }, + { + "epoch": 3.241608914254654, + "grad_norm": 1.5819436311721802, + "learning_rate": 0.0005949602527517326, + "loss": 3.6691, + "step": 47710 + }, + { + "epoch": 3.241948634325316, + "grad_norm": 1.0381344556808472, + "learning_rate": 0.0005949177877428998, + "loss": 3.5661, + "step": 47715 + }, + { + "epoch": 3.2422883543959777, + "grad_norm": 1.1091516017913818, + "learning_rate": 0.0005948753227340671, + "loss": 3.5455, + "step": 47720 + }, + { + "epoch": 3.2426280744666394, + "grad_norm": 0.8982202410697937, + "learning_rate": 0.0005948328577252345, + "loss": 3.5419, + "step": 47725 + }, + { + "epoch": 3.2429677945373014, + "grad_norm": 0.9508631825447083, + "learning_rate": 0.0005947903927164017, + "loss": 3.4951, + "step": 47730 + }, + { + "epoch": 3.243307514607963, + "grad_norm": 0.9515175819396973, + "learning_rate": 0.000594747927707569, + "loss": 3.5272, + "step": 47735 + }, + { + "epoch": 3.2436472346786247, + "grad_norm": 0.9496304988861084, + "learning_rate": 0.0005947054626987363, + "loss": 3.6214, + "step": 47740 + }, + { + "epoch": 3.2439869547492868, + "grad_norm": 0.7687771320343018, + "learning_rate": 0.0005946629976899035, + "loss": 3.5258, + "step": 47745 + }, + { + "epoch": 3.2443266748199484, + "grad_norm": 1.0584062337875366, + "learning_rate": 0.0005946205326810708, + "loss": 3.443, + "step": 47750 + }, + { + "epoch": 3.24466639489061, + "grad_norm": 1.0024250745773315, + "learning_rate": 0.000594578067672238, + "loss": 3.7003, + "step": 47755 + }, + { + "epoch": 3.245006114961272, + "grad_norm": 0.9932116270065308, + "learning_rate": 0.0005945356026634054, + "loss": 3.5008, + "step": 47760 + }, + { + "epoch": 3.2453458350319337, + "grad_norm": 0.8348458409309387, + "learning_rate": 0.0005944931376545727, + "loss": 3.6852, + "step": 47765 + }, + { + "epoch": 3.2456855551025954, + "grad_norm": 0.7780558466911316, + "learning_rate": 0.0005944506726457399, + "loss": 3.5167, + "step": 47770 + }, + { + "epoch": 3.246025275173257, + "grad_norm": 0.8953651189804077, + "learning_rate": 0.0005944082076369072, + "loss": 3.4763, + "step": 47775 + }, + { + "epoch": 3.246364995243919, + "grad_norm": 1.154801845550537, + "learning_rate": 0.0005943657426280745, + "loss": 3.5951, + "step": 47780 + }, + { + "epoch": 3.2467047153145807, + "grad_norm": 0.8035464882850647, + "learning_rate": 0.0005943232776192417, + "loss": 3.1384, + "step": 47785 + }, + { + "epoch": 3.2470444353852423, + "grad_norm": 1.1231799125671387, + "learning_rate": 0.000594280812610409, + "loss": 3.4572, + "step": 47790 + }, + { + "epoch": 3.2473841554559044, + "grad_norm": 1.2379932403564453, + "learning_rate": 0.0005942383476015764, + "loss": 3.4384, + "step": 47795 + }, + { + "epoch": 3.247723875526566, + "grad_norm": 1.1236450672149658, + "learning_rate": 0.0005941958825927436, + "loss": 3.4912, + "step": 47800 + }, + { + "epoch": 3.2480635955972277, + "grad_norm": 1.6857548952102661, + "learning_rate": 0.0005941534175839109, + "loss": 3.2879, + "step": 47805 + }, + { + "epoch": 3.2484033156678898, + "grad_norm": 0.7397426962852478, + "learning_rate": 0.0005941109525750782, + "loss": 3.6727, + "step": 47810 + }, + { + "epoch": 3.2487430357385514, + "grad_norm": 0.8336135745048523, + "learning_rate": 0.0005940684875662454, + "loss": 3.2362, + "step": 47815 + }, + { + "epoch": 3.249082755809213, + "grad_norm": 0.7189802527427673, + "learning_rate": 0.0005940260225574126, + "loss": 3.8189, + "step": 47820 + }, + { + "epoch": 3.249422475879875, + "grad_norm": 1.1133759021759033, + "learning_rate": 0.00059398355754858, + "loss": 3.4486, + "step": 47825 + }, + { + "epoch": 3.2497621959505367, + "grad_norm": 0.8767552971839905, + "learning_rate": 0.0005939410925397473, + "loss": 3.441, + "step": 47830 + }, + { + "epoch": 3.2501019160211984, + "grad_norm": 1.2502567768096924, + "learning_rate": 0.0005938986275309145, + "loss": 3.6282, + "step": 47835 + }, + { + "epoch": 3.2504416360918604, + "grad_norm": 0.8522949814796448, + "learning_rate": 0.0005938561625220819, + "loss": 3.7094, + "step": 47840 + }, + { + "epoch": 3.250781356162522, + "grad_norm": 0.7807096242904663, + "learning_rate": 0.0005938136975132491, + "loss": 3.5032, + "step": 47845 + }, + { + "epoch": 3.2511210762331837, + "grad_norm": 0.7700725793838501, + "learning_rate": 0.0005937712325044163, + "loss": 3.7447, + "step": 47850 + }, + { + "epoch": 3.2514607963038458, + "grad_norm": 1.032213807106018, + "learning_rate": 0.0005937287674955837, + "loss": 3.6808, + "step": 47855 + }, + { + "epoch": 3.2518005163745074, + "grad_norm": 0.6204071044921875, + "learning_rate": 0.0005936863024867509, + "loss": 3.6067, + "step": 47860 + }, + { + "epoch": 3.252140236445169, + "grad_norm": 1.1222666501998901, + "learning_rate": 0.0005936438374779182, + "loss": 3.302, + "step": 47865 + }, + { + "epoch": 3.252479956515831, + "grad_norm": 1.3393938541412354, + "learning_rate": 0.0005936013724690855, + "loss": 3.4844, + "step": 47870 + }, + { + "epoch": 3.2528196765864927, + "grad_norm": 0.9423413872718811, + "learning_rate": 0.0005935589074602528, + "loss": 3.2518, + "step": 47875 + }, + { + "epoch": 3.2531593966571544, + "grad_norm": 1.0760389566421509, + "learning_rate": 0.00059351644245142, + "loss": 3.5841, + "step": 47880 + }, + { + "epoch": 3.2534991167278164, + "grad_norm": 0.8922727704048157, + "learning_rate": 0.0005934739774425873, + "loss": 3.541, + "step": 47885 + }, + { + "epoch": 3.253838836798478, + "grad_norm": 0.8750848174095154, + "learning_rate": 0.0005934315124337546, + "loss": 3.6238, + "step": 47890 + }, + { + "epoch": 3.2541785568691397, + "grad_norm": 1.084044337272644, + "learning_rate": 0.0005933890474249218, + "loss": 3.5513, + "step": 47895 + }, + { + "epoch": 3.254518276939802, + "grad_norm": 0.9020415544509888, + "learning_rate": 0.0005933465824160892, + "loss": 3.3396, + "step": 47900 + }, + { + "epoch": 3.2548579970104634, + "grad_norm": 0.983256995677948, + "learning_rate": 0.0005933041174072565, + "loss": 3.5133, + "step": 47905 + }, + { + "epoch": 3.255197717081125, + "grad_norm": 0.8660027980804443, + "learning_rate": 0.0005932616523984237, + "loss": 3.4607, + "step": 47910 + }, + { + "epoch": 3.255537437151787, + "grad_norm": 0.8165795803070068, + "learning_rate": 0.000593219187389591, + "loss": 3.3842, + "step": 47915 + }, + { + "epoch": 3.2558771572224487, + "grad_norm": 0.7668473720550537, + "learning_rate": 0.0005931767223807582, + "loss": 3.9099, + "step": 47920 + }, + { + "epoch": 3.2562168772931104, + "grad_norm": 0.7355303764343262, + "learning_rate": 0.0005931342573719255, + "loss": 3.5257, + "step": 47925 + }, + { + "epoch": 3.2565565973637725, + "grad_norm": 0.8380235433578491, + "learning_rate": 0.0005930917923630928, + "loss": 3.6416, + "step": 47930 + }, + { + "epoch": 3.256896317434434, + "grad_norm": 0.8716810345649719, + "learning_rate": 0.0005930493273542601, + "loss": 3.8036, + "step": 47935 + }, + { + "epoch": 3.2572360375050957, + "grad_norm": 0.8615593910217285, + "learning_rate": 0.0005930068623454274, + "loss": 3.4508, + "step": 47940 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.825107753276825, + "learning_rate": 0.0005929643973365947, + "loss": 3.5219, + "step": 47945 + }, + { + "epoch": 3.2579154776464194, + "grad_norm": 0.8025553226470947, + "learning_rate": 0.0005929219323277619, + "loss": 3.5901, + "step": 47950 + }, + { + "epoch": 3.258255197717081, + "grad_norm": 0.8119816184043884, + "learning_rate": 0.0005928794673189291, + "loss": 3.6285, + "step": 47955 + }, + { + "epoch": 3.258594917787743, + "grad_norm": 1.0789769887924194, + "learning_rate": 0.0005928370023100965, + "loss": 3.7035, + "step": 47960 + }, + { + "epoch": 3.2589346378584048, + "grad_norm": 0.953315794467926, + "learning_rate": 0.0005927945373012637, + "loss": 3.5203, + "step": 47965 + }, + { + "epoch": 3.2592743579290664, + "grad_norm": 0.8874086141586304, + "learning_rate": 0.000592752072292431, + "loss": 3.6831, + "step": 47970 + }, + { + "epoch": 3.2596140779997285, + "grad_norm": 0.9759589433670044, + "learning_rate": 0.0005927096072835984, + "loss": 3.3744, + "step": 47975 + }, + { + "epoch": 3.25995379807039, + "grad_norm": 0.7321807146072388, + "learning_rate": 0.0005926671422747656, + "loss": 3.366, + "step": 47980 + }, + { + "epoch": 3.2602935181410517, + "grad_norm": 0.8013702630996704, + "learning_rate": 0.0005926246772659328, + "loss": 3.6306, + "step": 47985 + }, + { + "epoch": 3.2606332382117134, + "grad_norm": 0.8447233438491821, + "learning_rate": 0.0005925822122571002, + "loss": 3.5131, + "step": 47990 + }, + { + "epoch": 3.2609729582823754, + "grad_norm": 0.8963786363601685, + "learning_rate": 0.0005925397472482674, + "loss": 3.6954, + "step": 47995 + }, + { + "epoch": 3.261312678353037, + "grad_norm": 0.6817160844802856, + "learning_rate": 0.0005924972822394346, + "loss": 3.6009, + "step": 48000 + }, + { + "epoch": 3.2616523984236987, + "grad_norm": 0.8807687163352966, + "learning_rate": 0.0005924548172306021, + "loss": 3.453, + "step": 48005 + }, + { + "epoch": 3.2619921184943608, + "grad_norm": 0.8414362072944641, + "learning_rate": 0.0005924123522217693, + "loss": 3.7635, + "step": 48010 + }, + { + "epoch": 3.2623318385650224, + "grad_norm": 0.9000921249389648, + "learning_rate": 0.0005923698872129365, + "loss": 3.5285, + "step": 48015 + }, + { + "epoch": 3.262671558635684, + "grad_norm": 1.2976704835891724, + "learning_rate": 0.0005923274222041038, + "loss": 3.4813, + "step": 48020 + }, + { + "epoch": 3.263011278706346, + "grad_norm": 0.767167866230011, + "learning_rate": 0.0005922849571952711, + "loss": 3.8079, + "step": 48025 + }, + { + "epoch": 3.2633509987770077, + "grad_norm": 0.7373291850090027, + "learning_rate": 0.0005922424921864384, + "loss": 3.5951, + "step": 48030 + }, + { + "epoch": 3.2636907188476694, + "grad_norm": 0.9124268889427185, + "learning_rate": 0.0005922000271776057, + "loss": 3.4899, + "step": 48035 + }, + { + "epoch": 3.2640304389183314, + "grad_norm": 0.6883022785186768, + "learning_rate": 0.000592157562168773, + "loss": 3.5466, + "step": 48040 + }, + { + "epoch": 3.264370158988993, + "grad_norm": 1.0331430435180664, + "learning_rate": 0.0005921150971599403, + "loss": 3.4114, + "step": 48045 + }, + { + "epoch": 3.2647098790596547, + "grad_norm": 0.9091383814811707, + "learning_rate": 0.0005920726321511075, + "loss": 3.6642, + "step": 48050 + }, + { + "epoch": 3.265049599130317, + "grad_norm": 1.553706407546997, + "learning_rate": 0.0005920301671422747, + "loss": 3.2768, + "step": 48055 + }, + { + "epoch": 3.2653893192009784, + "grad_norm": 1.0447636842727661, + "learning_rate": 0.0005919877021334421, + "loss": 3.2583, + "step": 48060 + }, + { + "epoch": 3.26572903927164, + "grad_norm": 1.1477265357971191, + "learning_rate": 0.0005919452371246093, + "loss": 3.5438, + "step": 48065 + }, + { + "epoch": 3.266068759342302, + "grad_norm": 0.8709723949432373, + "learning_rate": 0.0005919027721157766, + "loss": 3.5365, + "step": 48070 + }, + { + "epoch": 3.2664084794129638, + "grad_norm": 0.8389315009117126, + "learning_rate": 0.000591860307106944, + "loss": 3.5695, + "step": 48075 + }, + { + "epoch": 3.2667481994836254, + "grad_norm": 0.963994562625885, + "learning_rate": 0.0005918178420981112, + "loss": 3.3699, + "step": 48080 + }, + { + "epoch": 3.2670879195542875, + "grad_norm": 1.1631040573120117, + "learning_rate": 0.0005917753770892784, + "loss": 3.4043, + "step": 48085 + }, + { + "epoch": 3.267427639624949, + "grad_norm": 1.4406322240829468, + "learning_rate": 0.0005917329120804458, + "loss": 3.5837, + "step": 48090 + }, + { + "epoch": 3.2677673596956107, + "grad_norm": 1.2294211387634277, + "learning_rate": 0.000591690447071613, + "loss": 3.5551, + "step": 48095 + }, + { + "epoch": 3.2681070797662723, + "grad_norm": 0.7757490277290344, + "learning_rate": 0.0005916479820627802, + "loss": 3.5999, + "step": 48100 + }, + { + "epoch": 3.2684467998369344, + "grad_norm": 1.318204402923584, + "learning_rate": 0.0005916055170539477, + "loss": 3.557, + "step": 48105 + }, + { + "epoch": 3.268786519907596, + "grad_norm": 1.1386222839355469, + "learning_rate": 0.0005915630520451149, + "loss": 3.5921, + "step": 48110 + }, + { + "epoch": 3.2691262399782577, + "grad_norm": 0.9124917387962341, + "learning_rate": 0.0005915205870362821, + "loss": 3.5565, + "step": 48115 + }, + { + "epoch": 3.2694659600489198, + "grad_norm": 0.7126113772392273, + "learning_rate": 0.0005914781220274494, + "loss": 3.6033, + "step": 48120 + }, + { + "epoch": 3.2698056801195814, + "grad_norm": 1.0095126628875732, + "learning_rate": 0.0005914356570186167, + "loss": 3.7539, + "step": 48125 + }, + { + "epoch": 3.270145400190243, + "grad_norm": 1.6416003704071045, + "learning_rate": 0.0005913931920097839, + "loss": 3.3086, + "step": 48130 + }, + { + "epoch": 3.270485120260905, + "grad_norm": 1.0488959550857544, + "learning_rate": 0.0005913507270009512, + "loss": 3.3023, + "step": 48135 + }, + { + "epoch": 3.2708248403315667, + "grad_norm": 0.9047770500183105, + "learning_rate": 0.0005913082619921186, + "loss": 3.4665, + "step": 48140 + }, + { + "epoch": 3.2711645604022284, + "grad_norm": 0.9051328301429749, + "learning_rate": 0.0005912657969832858, + "loss": 3.6283, + "step": 48145 + }, + { + "epoch": 3.2715042804728904, + "grad_norm": 0.7366903424263, + "learning_rate": 0.0005912233319744531, + "loss": 3.7424, + "step": 48150 + }, + { + "epoch": 3.271844000543552, + "grad_norm": 1.3453335762023926, + "learning_rate": 0.0005911808669656203, + "loss": 3.6981, + "step": 48155 + }, + { + "epoch": 3.2721837206142137, + "grad_norm": 0.9132069945335388, + "learning_rate": 0.0005911384019567876, + "loss": 3.0776, + "step": 48160 + }, + { + "epoch": 3.2725234406848758, + "grad_norm": 0.966465950012207, + "learning_rate": 0.0005910959369479549, + "loss": 3.5067, + "step": 48165 + }, + { + "epoch": 3.2728631607555374, + "grad_norm": 1.0318245887756348, + "learning_rate": 0.0005910534719391221, + "loss": 3.303, + "step": 48170 + }, + { + "epoch": 3.273202880826199, + "grad_norm": 1.04539954662323, + "learning_rate": 0.0005910110069302895, + "loss": 3.4541, + "step": 48175 + }, + { + "epoch": 3.273542600896861, + "grad_norm": 0.8623585104942322, + "learning_rate": 0.0005909685419214568, + "loss": 3.5005, + "step": 48180 + }, + { + "epoch": 3.2738823209675227, + "grad_norm": 0.950773298740387, + "learning_rate": 0.000590926076912624, + "loss": 3.6136, + "step": 48185 + }, + { + "epoch": 3.2742220410381844, + "grad_norm": 1.0420154333114624, + "learning_rate": 0.0005908836119037913, + "loss": 3.5335, + "step": 48190 + }, + { + "epoch": 3.2745617611088464, + "grad_norm": 1.1597312688827515, + "learning_rate": 0.0005908411468949586, + "loss": 3.6692, + "step": 48195 + }, + { + "epoch": 3.274901481179508, + "grad_norm": 1.0038360357284546, + "learning_rate": 0.0005907986818861258, + "loss": 3.5308, + "step": 48200 + }, + { + "epoch": 3.2752412012501697, + "grad_norm": 1.2555689811706543, + "learning_rate": 0.000590756216877293, + "loss": 3.4605, + "step": 48205 + }, + { + "epoch": 3.275580921320832, + "grad_norm": 0.9985889792442322, + "learning_rate": 0.0005907137518684605, + "loss": 3.6189, + "step": 48210 + }, + { + "epoch": 3.2759206413914934, + "grad_norm": 0.8193240165710449, + "learning_rate": 0.0005906712868596277, + "loss": 3.4246, + "step": 48215 + }, + { + "epoch": 3.276260361462155, + "grad_norm": 1.1964302062988281, + "learning_rate": 0.0005906288218507949, + "loss": 3.5866, + "step": 48220 + }, + { + "epoch": 3.276600081532817, + "grad_norm": 0.8333280086517334, + "learning_rate": 0.0005905863568419623, + "loss": 3.4992, + "step": 48225 + }, + { + "epoch": 3.2769398016034788, + "grad_norm": 0.935841977596283, + "learning_rate": 0.0005905438918331295, + "loss": 3.5249, + "step": 48230 + }, + { + "epoch": 3.2772795216741404, + "grad_norm": 1.2484509944915771, + "learning_rate": 0.0005905014268242967, + "loss": 3.5332, + "step": 48235 + }, + { + "epoch": 3.2776192417448025, + "grad_norm": 0.9926548600196838, + "learning_rate": 0.0005904589618154641, + "loss": 3.5525, + "step": 48240 + }, + { + "epoch": 3.277958961815464, + "grad_norm": 0.7924796342849731, + "learning_rate": 0.0005904164968066314, + "loss": 3.7103, + "step": 48245 + }, + { + "epoch": 3.2782986818861257, + "grad_norm": 1.1317142248153687, + "learning_rate": 0.0005903740317977986, + "loss": 3.5977, + "step": 48250 + }, + { + "epoch": 3.278638401956788, + "grad_norm": 0.8586472272872925, + "learning_rate": 0.000590331566788966, + "loss": 3.4116, + "step": 48255 + }, + { + "epoch": 3.2789781220274494, + "grad_norm": 1.2283574342727661, + "learning_rate": 0.0005902891017801332, + "loss": 3.4993, + "step": 48260 + }, + { + "epoch": 3.279317842098111, + "grad_norm": 0.961847722530365, + "learning_rate": 0.0005902466367713004, + "loss": 3.4693, + "step": 48265 + }, + { + "epoch": 3.279657562168773, + "grad_norm": 0.8853822946548462, + "learning_rate": 0.0005902041717624677, + "loss": 3.5264, + "step": 48270 + }, + { + "epoch": 3.2799972822394348, + "grad_norm": 0.9705225825309753, + "learning_rate": 0.000590161706753635, + "loss": 3.4381, + "step": 48275 + }, + { + "epoch": 3.2803370023100964, + "grad_norm": 0.7781040668487549, + "learning_rate": 0.0005901192417448023, + "loss": 3.6048, + "step": 48280 + }, + { + "epoch": 3.2806767223807585, + "grad_norm": 0.8501023054122925, + "learning_rate": 0.0005900767767359696, + "loss": 3.5787, + "step": 48285 + }, + { + "epoch": 3.28101644245142, + "grad_norm": 0.9343913197517395, + "learning_rate": 0.0005900343117271369, + "loss": 3.3749, + "step": 48290 + }, + { + "epoch": 3.2813561625220817, + "grad_norm": 0.8397113680839539, + "learning_rate": 0.0005899918467183041, + "loss": 3.6071, + "step": 48295 + }, + { + "epoch": 3.281695882592744, + "grad_norm": 0.9677368402481079, + "learning_rate": 0.0005899493817094714, + "loss": 3.4547, + "step": 48300 + }, + { + "epoch": 3.2820356026634054, + "grad_norm": 1.037785291671753, + "learning_rate": 0.0005899069167006386, + "loss": 3.5255, + "step": 48305 + }, + { + "epoch": 3.282375322734067, + "grad_norm": 1.5698615312576294, + "learning_rate": 0.0005898644516918059, + "loss": 3.6253, + "step": 48310 + }, + { + "epoch": 3.282715042804729, + "grad_norm": 2.763221502304077, + "learning_rate": 0.0005898219866829733, + "loss": 3.532, + "step": 48315 + }, + { + "epoch": 3.2830547628753908, + "grad_norm": 0.9807698726654053, + "learning_rate": 0.0005897795216741405, + "loss": 3.592, + "step": 48320 + }, + { + "epoch": 3.2833944829460524, + "grad_norm": 1.0531150102615356, + "learning_rate": 0.0005897370566653078, + "loss": 3.6074, + "step": 48325 + }, + { + "epoch": 3.283734203016714, + "grad_norm": 0.9492921829223633, + "learning_rate": 0.0005896945916564751, + "loss": 3.551, + "step": 48330 + }, + { + "epoch": 3.284073923087376, + "grad_norm": 0.8633854985237122, + "learning_rate": 0.0005896521266476423, + "loss": 3.433, + "step": 48335 + }, + { + "epoch": 3.2844136431580377, + "grad_norm": 0.8125228881835938, + "learning_rate": 0.0005896096616388095, + "loss": 3.4885, + "step": 48340 + }, + { + "epoch": 3.2847533632286994, + "grad_norm": 0.7551580667495728, + "learning_rate": 0.0005895671966299769, + "loss": 3.1579, + "step": 48345 + }, + { + "epoch": 3.2850930832993614, + "grad_norm": 0.9063514471054077, + "learning_rate": 0.0005895247316211442, + "loss": 3.5959, + "step": 48350 + }, + { + "epoch": 3.285432803370023, + "grad_norm": 0.7913870811462402, + "learning_rate": 0.0005894822666123114, + "loss": 3.4677, + "step": 48355 + }, + { + "epoch": 3.2857725234406847, + "grad_norm": 0.9281511902809143, + "learning_rate": 0.0005894398016034788, + "loss": 3.412, + "step": 48360 + }, + { + "epoch": 3.286112243511347, + "grad_norm": 0.8666707277297974, + "learning_rate": 0.000589397336594646, + "loss": 3.5253, + "step": 48365 + }, + { + "epoch": 3.2864519635820084, + "grad_norm": 0.8895286321640015, + "learning_rate": 0.0005893548715858133, + "loss": 3.585, + "step": 48370 + }, + { + "epoch": 3.28679168365267, + "grad_norm": 0.9219581484794617, + "learning_rate": 0.0005893124065769806, + "loss": 3.6043, + "step": 48375 + }, + { + "epoch": 3.287131403723332, + "grad_norm": 0.8217839598655701, + "learning_rate": 0.0005892699415681478, + "loss": 3.6587, + "step": 48380 + }, + { + "epoch": 3.2874711237939938, + "grad_norm": 0.9134098887443542, + "learning_rate": 0.0005892274765593152, + "loss": 3.4984, + "step": 48385 + }, + { + "epoch": 3.2878108438646554, + "grad_norm": 0.9104360342025757, + "learning_rate": 0.0005891850115504825, + "loss": 3.5901, + "step": 48390 + }, + { + "epoch": 3.2881505639353175, + "grad_norm": 0.7124811410903931, + "learning_rate": 0.0005891425465416497, + "loss": 3.5314, + "step": 48395 + }, + { + "epoch": 3.288490284005979, + "grad_norm": 0.7505641579627991, + "learning_rate": 0.000589100081532817, + "loss": 3.6414, + "step": 48400 + }, + { + "epoch": 3.2888300040766407, + "grad_norm": 0.9012295007705688, + "learning_rate": 0.0005890576165239842, + "loss": 3.8867, + "step": 48405 + }, + { + "epoch": 3.289169724147303, + "grad_norm": 0.7657046914100647, + "learning_rate": 0.0005890151515151515, + "loss": 3.6438, + "step": 48410 + }, + { + "epoch": 3.2895094442179644, + "grad_norm": 0.9305396676063538, + "learning_rate": 0.0005889726865063188, + "loss": 3.4017, + "step": 48415 + }, + { + "epoch": 3.289849164288626, + "grad_norm": 0.991240918636322, + "learning_rate": 0.0005889302214974861, + "loss": 3.3345, + "step": 48420 + }, + { + "epoch": 3.290188884359288, + "grad_norm": 0.8350464701652527, + "learning_rate": 0.0005888877564886534, + "loss": 3.4757, + "step": 48425 + }, + { + "epoch": 3.2905286044299498, + "grad_norm": 0.9591182470321655, + "learning_rate": 0.0005888452914798207, + "loss": 3.5293, + "step": 48430 + }, + { + "epoch": 3.2908683245006114, + "grad_norm": 0.8353481888771057, + "learning_rate": 0.0005888028264709879, + "loss": 3.3576, + "step": 48435 + }, + { + "epoch": 3.291208044571273, + "grad_norm": 1.2232239246368408, + "learning_rate": 0.0005887603614621552, + "loss": 3.9008, + "step": 48440 + }, + { + "epoch": 3.291547764641935, + "grad_norm": 0.8067269921302795, + "learning_rate": 0.0005887178964533225, + "loss": 3.898, + "step": 48445 + }, + { + "epoch": 3.2918874847125967, + "grad_norm": 0.7881442308425903, + "learning_rate": 0.0005886754314444897, + "loss": 3.7874, + "step": 48450 + }, + { + "epoch": 3.2922272047832584, + "grad_norm": 0.7909783124923706, + "learning_rate": 0.000588632966435657, + "loss": 3.0974, + "step": 48455 + }, + { + "epoch": 3.2925669248539204, + "grad_norm": 0.864059329032898, + "learning_rate": 0.0005885905014268244, + "loss": 3.2505, + "step": 48460 + }, + { + "epoch": 3.292906644924582, + "grad_norm": 0.9833866953849792, + "learning_rate": 0.0005885480364179916, + "loss": 3.5468, + "step": 48465 + }, + { + "epoch": 3.2932463649952437, + "grad_norm": 1.6457008123397827, + "learning_rate": 0.0005885055714091588, + "loss": 3.6498, + "step": 48470 + }, + { + "epoch": 3.2935860850659058, + "grad_norm": 0.9521425366401672, + "learning_rate": 0.0005884631064003262, + "loss": 3.4792, + "step": 48475 + }, + { + "epoch": 3.2939258051365674, + "grad_norm": 0.7595998644828796, + "learning_rate": 0.0005884206413914934, + "loss": 3.7236, + "step": 48480 + }, + { + "epoch": 3.294265525207229, + "grad_norm": 1.0262778997421265, + "learning_rate": 0.0005883781763826606, + "loss": 3.4407, + "step": 48485 + }, + { + "epoch": 3.294605245277891, + "grad_norm": 1.1619991064071655, + "learning_rate": 0.0005883357113738281, + "loss": 3.6636, + "step": 48490 + }, + { + "epoch": 3.2949449653485527, + "grad_norm": 0.9405437111854553, + "learning_rate": 0.0005882932463649953, + "loss": 3.4674, + "step": 48495 + }, + { + "epoch": 3.2952846854192144, + "grad_norm": 0.8474861979484558, + "learning_rate": 0.0005882507813561625, + "loss": 3.5084, + "step": 48500 + }, + { + "epoch": 3.2956244054898765, + "grad_norm": 2.2408437728881836, + "learning_rate": 0.0005882083163473298, + "loss": 3.6192, + "step": 48505 + }, + { + "epoch": 3.295964125560538, + "grad_norm": 1.6919485330581665, + "learning_rate": 0.0005881658513384971, + "loss": 3.3931, + "step": 48510 + }, + { + "epoch": 3.2963038456311997, + "grad_norm": 0.9929238557815552, + "learning_rate": 0.0005881233863296643, + "loss": 3.5689, + "step": 48515 + }, + { + "epoch": 3.296643565701862, + "grad_norm": 0.7829328775405884, + "learning_rate": 0.0005880809213208316, + "loss": 3.3748, + "step": 48520 + }, + { + "epoch": 3.2969832857725234, + "grad_norm": 0.9258415699005127, + "learning_rate": 0.000588038456311999, + "loss": 3.3697, + "step": 48525 + }, + { + "epoch": 3.297323005843185, + "grad_norm": 0.9617429375648499, + "learning_rate": 0.0005879959913031662, + "loss": 3.4141, + "step": 48530 + }, + { + "epoch": 3.297662725913847, + "grad_norm": 0.8169625401496887, + "learning_rate": 0.0005879535262943335, + "loss": 3.2696, + "step": 48535 + }, + { + "epoch": 3.2980024459845088, + "grad_norm": 0.8899175524711609, + "learning_rate": 0.0005879110612855008, + "loss": 3.2371, + "step": 48540 + }, + { + "epoch": 3.2983421660551704, + "grad_norm": 0.9717076420783997, + "learning_rate": 0.000587868596276668, + "loss": 3.6424, + "step": 48545 + }, + { + "epoch": 3.2986818861258325, + "grad_norm": 1.2614322900772095, + "learning_rate": 0.0005878261312678353, + "loss": 3.522, + "step": 48550 + }, + { + "epoch": 3.299021606196494, + "grad_norm": 0.8996133804321289, + "learning_rate": 0.0005877836662590025, + "loss": 3.6227, + "step": 48555 + }, + { + "epoch": 3.2993613262671557, + "grad_norm": 0.997409999370575, + "learning_rate": 0.0005877412012501699, + "loss": 3.5963, + "step": 48560 + }, + { + "epoch": 3.299701046337818, + "grad_norm": 0.8511382341384888, + "learning_rate": 0.0005876987362413372, + "loss": 3.2932, + "step": 48565 + }, + { + "epoch": 3.3000407664084794, + "grad_norm": 0.7118664383888245, + "learning_rate": 0.0005876562712325044, + "loss": 3.622, + "step": 48570 + }, + { + "epoch": 3.300380486479141, + "grad_norm": 1.1046123504638672, + "learning_rate": 0.0005876138062236717, + "loss": 3.6219, + "step": 48575 + }, + { + "epoch": 3.300720206549803, + "grad_norm": 1.931577444076538, + "learning_rate": 0.000587571341214839, + "loss": 3.5765, + "step": 48580 + }, + { + "epoch": 3.3010599266204648, + "grad_norm": 0.8633036017417908, + "learning_rate": 0.0005875288762060062, + "loss": 3.6468, + "step": 48585 + }, + { + "epoch": 3.3013996466911264, + "grad_norm": 0.9898293614387512, + "learning_rate": 0.0005874864111971734, + "loss": 3.3848, + "step": 48590 + }, + { + "epoch": 3.3017393667617885, + "grad_norm": 0.8494573831558228, + "learning_rate": 0.0005874439461883409, + "loss": 3.6418, + "step": 48595 + }, + { + "epoch": 3.30207908683245, + "grad_norm": 0.7578386664390564, + "learning_rate": 0.0005874014811795081, + "loss": 3.4757, + "step": 48600 + }, + { + "epoch": 3.3024188069031117, + "grad_norm": 0.9293911457061768, + "learning_rate": 0.0005873590161706753, + "loss": 3.3304, + "step": 48605 + }, + { + "epoch": 3.302758526973774, + "grad_norm": 0.9874370694160461, + "learning_rate": 0.0005873165511618427, + "loss": 3.6149, + "step": 48610 + }, + { + "epoch": 3.3030982470444354, + "grad_norm": 0.8754750490188599, + "learning_rate": 0.0005872740861530099, + "loss": 3.8351, + "step": 48615 + }, + { + "epoch": 3.303437967115097, + "grad_norm": 0.972672700881958, + "learning_rate": 0.0005872316211441771, + "loss": 3.5383, + "step": 48620 + }, + { + "epoch": 3.303777687185759, + "grad_norm": 0.9252616167068481, + "learning_rate": 0.0005871891561353446, + "loss": 3.3754, + "step": 48625 + }, + { + "epoch": 3.304117407256421, + "grad_norm": 0.8982014060020447, + "learning_rate": 0.0005871466911265118, + "loss": 3.3797, + "step": 48630 + }, + { + "epoch": 3.3044571273270824, + "grad_norm": 0.7996730804443359, + "learning_rate": 0.000587104226117679, + "loss": 3.7286, + "step": 48635 + }, + { + "epoch": 3.3047968473977445, + "grad_norm": 1.1621487140655518, + "learning_rate": 0.0005870617611088464, + "loss": 3.7283, + "step": 48640 + }, + { + "epoch": 3.305136567468406, + "grad_norm": 1.2569164037704468, + "learning_rate": 0.0005870192961000136, + "loss": 3.6218, + "step": 48645 + }, + { + "epoch": 3.3054762875390677, + "grad_norm": 0.8150321841239929, + "learning_rate": 0.0005869768310911808, + "loss": 3.6444, + "step": 48650 + }, + { + "epoch": 3.30581600760973, + "grad_norm": 0.7726898193359375, + "learning_rate": 0.0005869343660823481, + "loss": 3.5349, + "step": 48655 + }, + { + "epoch": 3.3061557276803915, + "grad_norm": 0.8049639463424683, + "learning_rate": 0.0005868919010735155, + "loss": 3.7556, + "step": 48660 + }, + { + "epoch": 3.306495447751053, + "grad_norm": 0.8311588764190674, + "learning_rate": 0.0005868494360646827, + "loss": 3.4055, + "step": 48665 + }, + { + "epoch": 3.3068351678217147, + "grad_norm": 1.1374574899673462, + "learning_rate": 0.00058680697105585, + "loss": 3.5003, + "step": 48670 + }, + { + "epoch": 3.307174887892377, + "grad_norm": 0.7896538972854614, + "learning_rate": 0.0005867645060470173, + "loss": 3.4422, + "step": 48675 + }, + { + "epoch": 3.3075146079630384, + "grad_norm": 0.8328851461410522, + "learning_rate": 0.0005867220410381845, + "loss": 3.3325, + "step": 48680 + }, + { + "epoch": 3.3078543280337, + "grad_norm": 1.2127774953842163, + "learning_rate": 0.0005866795760293518, + "loss": 3.2687, + "step": 48685 + }, + { + "epoch": 3.308194048104362, + "grad_norm": 0.9525131583213806, + "learning_rate": 0.000586637111020519, + "loss": 3.3413, + "step": 48690 + }, + { + "epoch": 3.3085337681750238, + "grad_norm": 0.9515867233276367, + "learning_rate": 0.0005865946460116864, + "loss": 3.5125, + "step": 48695 + }, + { + "epoch": 3.3088734882456854, + "grad_norm": 0.8401691317558289, + "learning_rate": 0.0005865521810028537, + "loss": 3.6961, + "step": 48700 + }, + { + "epoch": 3.3092132083163475, + "grad_norm": 2.3893473148345947, + "learning_rate": 0.0005865097159940209, + "loss": 3.3983, + "step": 48705 + }, + { + "epoch": 3.309552928387009, + "grad_norm": 0.8714545965194702, + "learning_rate": 0.0005864672509851883, + "loss": 3.3444, + "step": 48710 + }, + { + "epoch": 3.3098926484576707, + "grad_norm": 0.8598178625106812, + "learning_rate": 0.0005864247859763555, + "loss": 3.4752, + "step": 48715 + }, + { + "epoch": 3.310232368528333, + "grad_norm": 0.8659553527832031, + "learning_rate": 0.0005863823209675227, + "loss": 3.3217, + "step": 48720 + }, + { + "epoch": 3.3105720885989944, + "grad_norm": 1.3346612453460693, + "learning_rate": 0.0005863398559586901, + "loss": 3.5696, + "step": 48725 + }, + { + "epoch": 3.310911808669656, + "grad_norm": 1.0408681631088257, + "learning_rate": 0.0005862973909498574, + "loss": 3.5241, + "step": 48730 + }, + { + "epoch": 3.311251528740318, + "grad_norm": 0.8617814779281616, + "learning_rate": 0.0005862549259410246, + "loss": 3.3597, + "step": 48735 + }, + { + "epoch": 3.3115912488109798, + "grad_norm": 0.9738251566886902, + "learning_rate": 0.000586212460932192, + "loss": 3.6183, + "step": 48740 + }, + { + "epoch": 3.3119309688816414, + "grad_norm": 1.2191599607467651, + "learning_rate": 0.0005861699959233592, + "loss": 3.3052, + "step": 48745 + }, + { + "epoch": 3.3122706889523035, + "grad_norm": 0.8858315944671631, + "learning_rate": 0.0005861275309145264, + "loss": 3.7767, + "step": 48750 + }, + { + "epoch": 3.312610409022965, + "grad_norm": 0.7671151161193848, + "learning_rate": 0.0005860850659056937, + "loss": 3.3161, + "step": 48755 + }, + { + "epoch": 3.3129501290936267, + "grad_norm": 0.8289344310760498, + "learning_rate": 0.000586042600896861, + "loss": 3.4364, + "step": 48760 + }, + { + "epoch": 3.313289849164289, + "grad_norm": 0.9522739052772522, + "learning_rate": 0.0005860001358880283, + "loss": 3.5427, + "step": 48765 + }, + { + "epoch": 3.3136295692349504, + "grad_norm": 0.9616262912750244, + "learning_rate": 0.0005859576708791956, + "loss": 3.3539, + "step": 48770 + }, + { + "epoch": 3.313969289305612, + "grad_norm": 0.7289275527000427, + "learning_rate": 0.0005859152058703629, + "loss": 3.4175, + "step": 48775 + }, + { + "epoch": 3.3143090093762737, + "grad_norm": 0.8276107311248779, + "learning_rate": 0.0005858727408615301, + "loss": 3.5731, + "step": 48780 + }, + { + "epoch": 3.314648729446936, + "grad_norm": 0.9961513876914978, + "learning_rate": 0.0005858302758526974, + "loss": 3.5023, + "step": 48785 + }, + { + "epoch": 3.3149884495175974, + "grad_norm": 0.860588788986206, + "learning_rate": 0.0005857878108438646, + "loss": 3.5877, + "step": 48790 + }, + { + "epoch": 3.315328169588259, + "grad_norm": 0.8764373660087585, + "learning_rate": 0.0005857453458350319, + "loss": 3.6976, + "step": 48795 + }, + { + "epoch": 3.315667889658921, + "grad_norm": 0.8976684212684631, + "learning_rate": 0.0005857028808261993, + "loss": 3.422, + "step": 48800 + }, + { + "epoch": 3.3160076097295828, + "grad_norm": 0.8627735376358032, + "learning_rate": 0.0005856604158173665, + "loss": 3.6136, + "step": 48805 + }, + { + "epoch": 3.3163473298002444, + "grad_norm": 0.7218126058578491, + "learning_rate": 0.0005856179508085338, + "loss": 3.465, + "step": 48810 + }, + { + "epoch": 3.3166870498709065, + "grad_norm": 0.977854311466217, + "learning_rate": 0.0005855754857997011, + "loss": 3.5165, + "step": 48815 + }, + { + "epoch": 3.317026769941568, + "grad_norm": 1.0493806600570679, + "learning_rate": 0.0005855330207908683, + "loss": 3.4507, + "step": 48820 + }, + { + "epoch": 3.3173664900122297, + "grad_norm": 0.8334512114524841, + "learning_rate": 0.0005854905557820356, + "loss": 3.6127, + "step": 48825 + }, + { + "epoch": 3.317706210082892, + "grad_norm": 0.9004742503166199, + "learning_rate": 0.0005854480907732029, + "loss": 3.3477, + "step": 48830 + }, + { + "epoch": 3.3180459301535534, + "grad_norm": 0.6857470273971558, + "learning_rate": 0.0005854056257643702, + "loss": 3.7313, + "step": 48835 + }, + { + "epoch": 3.318385650224215, + "grad_norm": 0.8223456740379333, + "learning_rate": 0.0005853631607555374, + "loss": 3.6397, + "step": 48840 + }, + { + "epoch": 3.318725370294877, + "grad_norm": 0.7916955947875977, + "learning_rate": 0.0005853206957467048, + "loss": 3.7731, + "step": 48845 + }, + { + "epoch": 3.3190650903655388, + "grad_norm": 0.8083071708679199, + "learning_rate": 0.000585278230737872, + "loss": 3.1968, + "step": 48850 + }, + { + "epoch": 3.3194048104362004, + "grad_norm": 0.9212934970855713, + "learning_rate": 0.0005852357657290392, + "loss": 3.6771, + "step": 48855 + }, + { + "epoch": 3.3197445305068625, + "grad_norm": 0.8904837369918823, + "learning_rate": 0.0005851933007202066, + "loss": 3.1671, + "step": 48860 + }, + { + "epoch": 3.320084250577524, + "grad_norm": 1.1318037509918213, + "learning_rate": 0.0005851508357113738, + "loss": 3.5575, + "step": 48865 + }, + { + "epoch": 3.3204239706481857, + "grad_norm": 0.9218287467956543, + "learning_rate": 0.0005851083707025411, + "loss": 3.7527, + "step": 48870 + }, + { + "epoch": 3.320763690718848, + "grad_norm": 0.8592592477798462, + "learning_rate": 0.0005850659056937085, + "loss": 3.477, + "step": 48875 + }, + { + "epoch": 3.3211034107895094, + "grad_norm": 0.9075092077255249, + "learning_rate": 0.0005850234406848757, + "loss": 3.5239, + "step": 48880 + }, + { + "epoch": 3.321443130860171, + "grad_norm": 0.9627721905708313, + "learning_rate": 0.0005849809756760429, + "loss": 3.4573, + "step": 48885 + }, + { + "epoch": 3.321782850930833, + "grad_norm": 0.9931257367134094, + "learning_rate": 0.0005849385106672102, + "loss": 3.5326, + "step": 48890 + }, + { + "epoch": 3.3221225710014948, + "grad_norm": 0.9392978549003601, + "learning_rate": 0.0005848960456583775, + "loss": 3.578, + "step": 48895 + }, + { + "epoch": 3.3224622910721564, + "grad_norm": 0.7516424655914307, + "learning_rate": 0.0005848535806495447, + "loss": 3.6559, + "step": 48900 + }, + { + "epoch": 3.3228020111428185, + "grad_norm": 1.0782415866851807, + "learning_rate": 0.0005848111156407121, + "loss": 3.4662, + "step": 48905 + }, + { + "epoch": 3.32314173121348, + "grad_norm": 0.7623063325881958, + "learning_rate": 0.0005847686506318794, + "loss": 3.8632, + "step": 48910 + }, + { + "epoch": 3.3234814512841417, + "grad_norm": 0.9526509046554565, + "learning_rate": 0.0005847261856230466, + "loss": 3.5513, + "step": 48915 + }, + { + "epoch": 3.323821171354804, + "grad_norm": 0.9457007646560669, + "learning_rate": 0.0005846837206142139, + "loss": 3.419, + "step": 48920 + }, + { + "epoch": 3.3241608914254654, + "grad_norm": 0.8781028389930725, + "learning_rate": 0.0005846412556053812, + "loss": 3.54, + "step": 48925 + }, + { + "epoch": 3.324500611496127, + "grad_norm": 1.1272250413894653, + "learning_rate": 0.0005845987905965484, + "loss": 3.5594, + "step": 48930 + }, + { + "epoch": 3.324840331566789, + "grad_norm": 0.929874837398529, + "learning_rate": 0.0005845563255877157, + "loss": 3.5729, + "step": 48935 + }, + { + "epoch": 3.325180051637451, + "grad_norm": 0.9259381890296936, + "learning_rate": 0.000584513860578883, + "loss": 3.4804, + "step": 48940 + }, + { + "epoch": 3.3255197717081124, + "grad_norm": 0.8369226455688477, + "learning_rate": 0.0005844713955700503, + "loss": 3.561, + "step": 48945 + }, + { + "epoch": 3.3258594917787745, + "grad_norm": 1.0023441314697266, + "learning_rate": 0.0005844289305612176, + "loss": 3.6338, + "step": 48950 + }, + { + "epoch": 3.326199211849436, + "grad_norm": 0.8346406817436218, + "learning_rate": 0.0005843864655523848, + "loss": 3.646, + "step": 48955 + }, + { + "epoch": 3.3265389319200978, + "grad_norm": 0.8784379959106445, + "learning_rate": 0.0005843440005435521, + "loss": 3.5679, + "step": 48960 + }, + { + "epoch": 3.32687865199076, + "grad_norm": 1.000694751739502, + "learning_rate": 0.0005843015355347194, + "loss": 3.7684, + "step": 48965 + }, + { + "epoch": 3.3272183720614215, + "grad_norm": 0.8678324818611145, + "learning_rate": 0.0005842590705258866, + "loss": 3.5477, + "step": 48970 + }, + { + "epoch": 3.327558092132083, + "grad_norm": 0.8507412672042847, + "learning_rate": 0.000584216605517054, + "loss": 3.2324, + "step": 48975 + }, + { + "epoch": 3.327897812202745, + "grad_norm": 0.8081579804420471, + "learning_rate": 0.0005841741405082213, + "loss": 3.5153, + "step": 48980 + }, + { + "epoch": 3.328237532273407, + "grad_norm": 0.9053483605384827, + "learning_rate": 0.0005841316754993885, + "loss": 3.4459, + "step": 48985 + }, + { + "epoch": 3.3285772523440684, + "grad_norm": 1.0252604484558105, + "learning_rate": 0.0005840892104905557, + "loss": 3.4108, + "step": 48990 + }, + { + "epoch": 3.3289169724147305, + "grad_norm": 1.0516717433929443, + "learning_rate": 0.0005840467454817231, + "loss": 3.5, + "step": 48995 + }, + { + "epoch": 3.329256692485392, + "grad_norm": 1.0082449913024902, + "learning_rate": 0.0005840042804728903, + "loss": 3.6199, + "step": 49000 + }, + { + "epoch": 3.3295964125560538, + "grad_norm": 1.3430073261260986, + "learning_rate": 0.0005839618154640575, + "loss": 3.3693, + "step": 49005 + }, + { + "epoch": 3.3299361326267154, + "grad_norm": 1.020893931388855, + "learning_rate": 0.000583919350455225, + "loss": 3.499, + "step": 49010 + }, + { + "epoch": 3.3302758526973775, + "grad_norm": 1.521949291229248, + "learning_rate": 0.0005838768854463922, + "loss": 3.3377, + "step": 49015 + }, + { + "epoch": 3.330615572768039, + "grad_norm": 1.0695613622665405, + "learning_rate": 0.0005838344204375594, + "loss": 3.4777, + "step": 49020 + }, + { + "epoch": 3.3309552928387007, + "grad_norm": 0.959679365158081, + "learning_rate": 0.0005837919554287268, + "loss": 3.7255, + "step": 49025 + }, + { + "epoch": 3.331295012909363, + "grad_norm": 0.9630565643310547, + "learning_rate": 0.000583749490419894, + "loss": 3.727, + "step": 49030 + }, + { + "epoch": 3.3316347329800244, + "grad_norm": 0.8471716046333313, + "learning_rate": 0.0005837070254110612, + "loss": 3.7549, + "step": 49035 + }, + { + "epoch": 3.331974453050686, + "grad_norm": 0.7651530504226685, + "learning_rate": 0.0005836645604022285, + "loss": 3.7517, + "step": 49040 + }, + { + "epoch": 3.332314173121348, + "grad_norm": 0.7116681337356567, + "learning_rate": 0.0005836220953933959, + "loss": 3.4308, + "step": 49045 + }, + { + "epoch": 3.3326538931920098, + "grad_norm": 0.9452030062675476, + "learning_rate": 0.0005835796303845632, + "loss": 3.6668, + "step": 49050 + }, + { + "epoch": 3.3329936132626714, + "grad_norm": 0.6428164839744568, + "learning_rate": 0.0005835371653757304, + "loss": 3.6011, + "step": 49055 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.9650560021400452, + "learning_rate": 0.0005834947003668977, + "loss": 3.6251, + "step": 49060 + }, + { + "epoch": 3.333673053403995, + "grad_norm": 0.8934586048126221, + "learning_rate": 0.000583452235358065, + "loss": 3.156, + "step": 49065 + }, + { + "epoch": 3.3340127734746567, + "grad_norm": 1.4137073755264282, + "learning_rate": 0.0005834097703492322, + "loss": 3.2194, + "step": 49070 + }, + { + "epoch": 3.334352493545319, + "grad_norm": 0.8837845325469971, + "learning_rate": 0.0005833673053403994, + "loss": 3.5616, + "step": 49075 + }, + { + "epoch": 3.3346922136159804, + "grad_norm": 0.8996991515159607, + "learning_rate": 0.0005833248403315669, + "loss": 3.4204, + "step": 49080 + }, + { + "epoch": 3.335031933686642, + "grad_norm": 1.3444832563400269, + "learning_rate": 0.0005832823753227341, + "loss": 3.5662, + "step": 49085 + }, + { + "epoch": 3.335371653757304, + "grad_norm": 0.8671360611915588, + "learning_rate": 0.0005832399103139013, + "loss": 3.3048, + "step": 49090 + }, + { + "epoch": 3.335711373827966, + "grad_norm": 1.2677407264709473, + "learning_rate": 0.0005831974453050687, + "loss": 3.8559, + "step": 49095 + }, + { + "epoch": 3.3360510938986274, + "grad_norm": 0.8250064849853516, + "learning_rate": 0.0005831549802962359, + "loss": 3.5323, + "step": 49100 + }, + { + "epoch": 3.3363908139692895, + "grad_norm": 0.8714932799339294, + "learning_rate": 0.0005831125152874031, + "loss": 3.3732, + "step": 49105 + }, + { + "epoch": 3.336730534039951, + "grad_norm": 0.8895025253295898, + "learning_rate": 0.0005830700502785706, + "loss": 3.7916, + "step": 49110 + }, + { + "epoch": 3.3370702541106128, + "grad_norm": 1.044747233390808, + "learning_rate": 0.0005830275852697378, + "loss": 3.5191, + "step": 49115 + }, + { + "epoch": 3.3374099741812744, + "grad_norm": 0.9000561237335205, + "learning_rate": 0.000582985120260905, + "loss": 3.4542, + "step": 49120 + }, + { + "epoch": 3.3377496942519365, + "grad_norm": 1.1170920133590698, + "learning_rate": 0.0005829426552520724, + "loss": 3.5279, + "step": 49125 + }, + { + "epoch": 3.338089414322598, + "grad_norm": 0.8974453806877136, + "learning_rate": 0.0005829001902432396, + "loss": 3.568, + "step": 49130 + }, + { + "epoch": 3.3384291343932597, + "grad_norm": 0.7531300187110901, + "learning_rate": 0.0005828577252344068, + "loss": 3.5224, + "step": 49135 + }, + { + "epoch": 3.338768854463922, + "grad_norm": 1.0953787565231323, + "learning_rate": 0.0005828152602255741, + "loss": 3.8589, + "step": 49140 + }, + { + "epoch": 3.3391085745345834, + "grad_norm": 1.0345302820205688, + "learning_rate": 0.0005827727952167415, + "loss": 3.6451, + "step": 49145 + }, + { + "epoch": 3.339448294605245, + "grad_norm": 0.7113009095191956, + "learning_rate": 0.0005827303302079087, + "loss": 3.5216, + "step": 49150 + }, + { + "epoch": 3.339788014675907, + "grad_norm": 1.0032429695129395, + "learning_rate": 0.000582687865199076, + "loss": 3.3572, + "step": 49155 + }, + { + "epoch": 3.3401277347465688, + "grad_norm": 0.8714635968208313, + "learning_rate": 0.0005826454001902433, + "loss": 3.4459, + "step": 49160 + }, + { + "epoch": 3.3404674548172304, + "grad_norm": 0.9867728352546692, + "learning_rate": 0.0005826029351814105, + "loss": 3.4676, + "step": 49165 + }, + { + "epoch": 3.3408071748878925, + "grad_norm": 0.8073592782020569, + "learning_rate": 0.0005825604701725778, + "loss": 3.5211, + "step": 49170 + }, + { + "epoch": 3.341146894958554, + "grad_norm": 0.8826972246170044, + "learning_rate": 0.000582518005163745, + "loss": 3.5282, + "step": 49175 + }, + { + "epoch": 3.3414866150292157, + "grad_norm": 1.060948371887207, + "learning_rate": 0.0005824755401549124, + "loss": 3.308, + "step": 49180 + }, + { + "epoch": 3.341826335099878, + "grad_norm": 0.9003921747207642, + "learning_rate": 0.0005824330751460797, + "loss": 3.1983, + "step": 49185 + }, + { + "epoch": 3.3421660551705394, + "grad_norm": 0.7208395004272461, + "learning_rate": 0.0005823906101372469, + "loss": 3.346, + "step": 49190 + }, + { + "epoch": 3.342505775241201, + "grad_norm": 0.798274576663971, + "learning_rate": 0.0005823481451284142, + "loss": 3.387, + "step": 49195 + }, + { + "epoch": 3.342845495311863, + "grad_norm": 1.0186264514923096, + "learning_rate": 0.0005823056801195815, + "loss": 3.5591, + "step": 49200 + }, + { + "epoch": 3.3431852153825248, + "grad_norm": 0.8011279702186584, + "learning_rate": 0.0005822632151107487, + "loss": 3.6858, + "step": 49205 + }, + { + "epoch": 3.3435249354531864, + "grad_norm": 1.199198842048645, + "learning_rate": 0.000582220750101916, + "loss": 3.6513, + "step": 49210 + }, + { + "epoch": 3.3438646555238485, + "grad_norm": 1.1687602996826172, + "learning_rate": 0.0005821782850930834, + "loss": 3.5312, + "step": 49215 + }, + { + "epoch": 3.34420437559451, + "grad_norm": 1.0276405811309814, + "learning_rate": 0.0005821358200842506, + "loss": 3.4542, + "step": 49220 + }, + { + "epoch": 3.3445440956651717, + "grad_norm": 0.9062296748161316, + "learning_rate": 0.0005820933550754179, + "loss": 3.4668, + "step": 49225 + }, + { + "epoch": 3.344883815735834, + "grad_norm": 1.031029462814331, + "learning_rate": 0.0005820508900665852, + "loss": 3.6868, + "step": 49230 + }, + { + "epoch": 3.3452235358064955, + "grad_norm": 0.7237666249275208, + "learning_rate": 0.0005820084250577524, + "loss": 3.1926, + "step": 49235 + }, + { + "epoch": 3.345563255877157, + "grad_norm": 0.8073501586914062, + "learning_rate": 0.0005819659600489196, + "loss": 3.4949, + "step": 49240 + }, + { + "epoch": 3.345902975947819, + "grad_norm": 0.6924482583999634, + "learning_rate": 0.000581923495040087, + "loss": 3.5644, + "step": 49245 + }, + { + "epoch": 3.346242696018481, + "grad_norm": 1.2344952821731567, + "learning_rate": 0.0005818810300312543, + "loss": 3.6623, + "step": 49250 + }, + { + "epoch": 3.3465824160891424, + "grad_norm": 1.1876940727233887, + "learning_rate": 0.0005818385650224215, + "loss": 3.5486, + "step": 49255 + }, + { + "epoch": 3.3469221361598045, + "grad_norm": 0.9822284579277039, + "learning_rate": 0.0005817961000135889, + "loss": 3.6058, + "step": 49260 + }, + { + "epoch": 3.347261856230466, + "grad_norm": 0.978889524936676, + "learning_rate": 0.0005817536350047561, + "loss": 3.5822, + "step": 49265 + }, + { + "epoch": 3.3476015763011278, + "grad_norm": 1.0321766138076782, + "learning_rate": 0.0005817111699959233, + "loss": 3.7332, + "step": 49270 + }, + { + "epoch": 3.34794129637179, + "grad_norm": 0.8100548386573792, + "learning_rate": 0.0005816687049870907, + "loss": 3.6351, + "step": 49275 + }, + { + "epoch": 3.3482810164424515, + "grad_norm": 1.0282920598983765, + "learning_rate": 0.0005816262399782579, + "loss": 3.6922, + "step": 49280 + }, + { + "epoch": 3.348620736513113, + "grad_norm": 0.7977488040924072, + "learning_rate": 0.0005815837749694252, + "loss": 3.645, + "step": 49285 + }, + { + "epoch": 3.348960456583775, + "grad_norm": 0.8050475716590881, + "learning_rate": 0.0005815413099605925, + "loss": 3.5564, + "step": 49290 + }, + { + "epoch": 3.349300176654437, + "grad_norm": 0.7821256518363953, + "learning_rate": 0.0005814988449517598, + "loss": 3.5252, + "step": 49295 + }, + { + "epoch": 3.3496398967250984, + "grad_norm": 0.92271488904953, + "learning_rate": 0.000581456379942927, + "loss": 3.5016, + "step": 49300 + }, + { + "epoch": 3.3499796167957605, + "grad_norm": 1.0119736194610596, + "learning_rate": 0.0005814139149340943, + "loss": 3.4536, + "step": 49305 + }, + { + "epoch": 3.350319336866422, + "grad_norm": 0.9153777956962585, + "learning_rate": 0.0005813714499252616, + "loss": 3.559, + "step": 49310 + }, + { + "epoch": 3.3506590569370838, + "grad_norm": 0.7919812202453613, + "learning_rate": 0.0005813289849164288, + "loss": 3.7601, + "step": 49315 + }, + { + "epoch": 3.350998777007746, + "grad_norm": 0.8774921894073486, + "learning_rate": 0.0005812865199075962, + "loss": 3.625, + "step": 49320 + }, + { + "epoch": 3.3513384970784075, + "grad_norm": 0.8962453603744507, + "learning_rate": 0.0005812440548987635, + "loss": 3.397, + "step": 49325 + }, + { + "epoch": 3.351678217149069, + "grad_norm": 0.8583590388298035, + "learning_rate": 0.0005812015898899307, + "loss": 3.3786, + "step": 49330 + }, + { + "epoch": 3.352017937219731, + "grad_norm": 2.623183488845825, + "learning_rate": 0.000581159124881098, + "loss": 3.4954, + "step": 49335 + }, + { + "epoch": 3.352357657290393, + "grad_norm": 0.8995262384414673, + "learning_rate": 0.0005811166598722652, + "loss": 3.5553, + "step": 49340 + }, + { + "epoch": 3.3526973773610544, + "grad_norm": 0.9440169334411621, + "learning_rate": 0.0005810741948634325, + "loss": 3.3426, + "step": 49345 + }, + { + "epoch": 3.353037097431716, + "grad_norm": 0.746453046798706, + "learning_rate": 0.0005810317298545998, + "loss": 3.4138, + "step": 49350 + }, + { + "epoch": 3.353376817502378, + "grad_norm": 0.9893253445625305, + "learning_rate": 0.0005809892648457671, + "loss": 3.2423, + "step": 49355 + }, + { + "epoch": 3.35371653757304, + "grad_norm": 1.0629938840866089, + "learning_rate": 0.0005809467998369344, + "loss": 3.3672, + "step": 49360 + }, + { + "epoch": 3.3540562576437014, + "grad_norm": 0.9052327275276184, + "learning_rate": 0.0005809043348281017, + "loss": 3.4643, + "step": 49365 + }, + { + "epoch": 3.3543959777143635, + "grad_norm": 0.7484062910079956, + "learning_rate": 0.0005808618698192689, + "loss": 3.4274, + "step": 49370 + }, + { + "epoch": 3.354735697785025, + "grad_norm": 0.8362954258918762, + "learning_rate": 0.0005808194048104361, + "loss": 3.4077, + "step": 49375 + }, + { + "epoch": 3.3550754178556867, + "grad_norm": 0.9901958703994751, + "learning_rate": 0.0005807769398016035, + "loss": 3.4653, + "step": 49380 + }, + { + "epoch": 3.355415137926349, + "grad_norm": 1.1483792066574097, + "learning_rate": 0.0005807344747927707, + "loss": 3.5965, + "step": 49385 + }, + { + "epoch": 3.3557548579970105, + "grad_norm": 0.7287034392356873, + "learning_rate": 0.0005806920097839381, + "loss": 3.2981, + "step": 49390 + }, + { + "epoch": 3.356094578067672, + "grad_norm": 0.8453248739242554, + "learning_rate": 0.0005806495447751054, + "loss": 3.6379, + "step": 49395 + }, + { + "epoch": 3.356434298138334, + "grad_norm": 0.9392483234405518, + "learning_rate": 0.0005806070797662726, + "loss": 3.5662, + "step": 49400 + }, + { + "epoch": 3.356774018208996, + "grad_norm": 0.8161388635635376, + "learning_rate": 0.0005805646147574399, + "loss": 3.4189, + "step": 49405 + }, + { + "epoch": 3.3571137382796574, + "grad_norm": 0.8222795128822327, + "learning_rate": 0.0005805221497486072, + "loss": 3.8192, + "step": 49410 + }, + { + "epoch": 3.3574534583503195, + "grad_norm": 0.7265586256980896, + "learning_rate": 0.0005804796847397744, + "loss": 3.3553, + "step": 49415 + }, + { + "epoch": 3.357793178420981, + "grad_norm": 0.7177793979644775, + "learning_rate": 0.0005804372197309417, + "loss": 3.5061, + "step": 49420 + }, + { + "epoch": 3.3581328984916428, + "grad_norm": 1.268054485321045, + "learning_rate": 0.0005803947547221091, + "loss": 3.5344, + "step": 49425 + }, + { + "epoch": 3.358472618562305, + "grad_norm": 0.7869639992713928, + "learning_rate": 0.0005803522897132763, + "loss": 3.4634, + "step": 49430 + }, + { + "epoch": 3.3588123386329665, + "grad_norm": 0.8788741230964661, + "learning_rate": 0.0005803098247044436, + "loss": 3.1301, + "step": 49435 + }, + { + "epoch": 3.359152058703628, + "grad_norm": 1.0673754215240479, + "learning_rate": 0.0005802673596956108, + "loss": 3.1802, + "step": 49440 + }, + { + "epoch": 3.35949177877429, + "grad_norm": 1.024461030960083, + "learning_rate": 0.0005802248946867781, + "loss": 3.2389, + "step": 49445 + }, + { + "epoch": 3.359831498844952, + "grad_norm": 1.0067484378814697, + "learning_rate": 0.0005801824296779454, + "loss": 3.3627, + "step": 49450 + }, + { + "epoch": 3.3601712189156134, + "grad_norm": 0.9214107394218445, + "learning_rate": 0.0005801399646691126, + "loss": 3.8691, + "step": 49455 + }, + { + "epoch": 3.360510938986275, + "grad_norm": 0.8637719750404358, + "learning_rate": 0.00058009749966028, + "loss": 3.4521, + "step": 49460 + }, + { + "epoch": 3.360850659056937, + "grad_norm": 0.9467794299125671, + "learning_rate": 0.0005800550346514473, + "loss": 3.3081, + "step": 49465 + }, + { + "epoch": 3.3611903791275988, + "grad_norm": 0.8835609555244446, + "learning_rate": 0.0005800125696426145, + "loss": 3.5905, + "step": 49470 + }, + { + "epoch": 3.3615300991982604, + "grad_norm": 0.8884028196334839, + "learning_rate": 0.0005799701046337817, + "loss": 3.5138, + "step": 49475 + }, + { + "epoch": 3.3618698192689225, + "grad_norm": 0.9062149524688721, + "learning_rate": 0.0005799276396249491, + "loss": 3.5257, + "step": 49480 + }, + { + "epoch": 3.362209539339584, + "grad_norm": 0.8485463857650757, + "learning_rate": 0.0005798851746161163, + "loss": 3.3233, + "step": 49485 + }, + { + "epoch": 3.3625492594102457, + "grad_norm": 0.9344617128372192, + "learning_rate": 0.0005798427096072835, + "loss": 3.5854, + "step": 49490 + }, + { + "epoch": 3.362888979480908, + "grad_norm": 1.1020481586456299, + "learning_rate": 0.000579800244598451, + "loss": 3.6579, + "step": 49495 + }, + { + "epoch": 3.3632286995515694, + "grad_norm": 0.9241324663162231, + "learning_rate": 0.0005797577795896182, + "loss": 3.5314, + "step": 49500 + }, + { + "epoch": 3.363568419622231, + "grad_norm": 0.9628782272338867, + "learning_rate": 0.0005797153145807854, + "loss": 3.3635, + "step": 49505 + }, + { + "epoch": 3.363908139692893, + "grad_norm": 0.8908146619796753, + "learning_rate": 0.0005796728495719528, + "loss": 3.4567, + "step": 49510 + }, + { + "epoch": 3.364247859763555, + "grad_norm": 0.9499918818473816, + "learning_rate": 0.00057963038456312, + "loss": 3.837, + "step": 49515 + }, + { + "epoch": 3.3645875798342164, + "grad_norm": 1.103591799736023, + "learning_rate": 0.0005795879195542872, + "loss": 3.3853, + "step": 49520 + }, + { + "epoch": 3.3649272999048785, + "grad_norm": 3.0637366771698, + "learning_rate": 0.0005795454545454545, + "loss": 3.5539, + "step": 49525 + }, + { + "epoch": 3.36526701997554, + "grad_norm": 1.0696778297424316, + "learning_rate": 0.0005795029895366219, + "loss": 3.6146, + "step": 49530 + }, + { + "epoch": 3.3656067400462018, + "grad_norm": 0.8234710693359375, + "learning_rate": 0.0005794605245277891, + "loss": 3.8166, + "step": 49535 + }, + { + "epoch": 3.365946460116864, + "grad_norm": 0.9417569637298584, + "learning_rate": 0.0005794180595189564, + "loss": 3.553, + "step": 49540 + }, + { + "epoch": 3.3662861801875255, + "grad_norm": 0.9181104898452759, + "learning_rate": 0.0005793755945101237, + "loss": 3.5409, + "step": 49545 + }, + { + "epoch": 3.366625900258187, + "grad_norm": 0.7066804766654968, + "learning_rate": 0.0005793331295012909, + "loss": 3.6369, + "step": 49550 + }, + { + "epoch": 3.366965620328849, + "grad_norm": 0.783569872379303, + "learning_rate": 0.0005792906644924582, + "loss": 3.6519, + "step": 49555 + }, + { + "epoch": 3.367305340399511, + "grad_norm": 0.9277128577232361, + "learning_rate": 0.0005792481994836255, + "loss": 3.2665, + "step": 49560 + }, + { + "epoch": 3.3676450604701724, + "grad_norm": 1.0079425573349, + "learning_rate": 0.0005792057344747928, + "loss": 3.4916, + "step": 49565 + }, + { + "epoch": 3.3679847805408345, + "grad_norm": 1.1088428497314453, + "learning_rate": 0.0005791632694659601, + "loss": 3.5623, + "step": 49570 + }, + { + "epoch": 3.368324500611496, + "grad_norm": 0.7913759350776672, + "learning_rate": 0.0005791208044571273, + "loss": 3.4946, + "step": 49575 + }, + { + "epoch": 3.3686642206821578, + "grad_norm": 0.9697868824005127, + "learning_rate": 0.0005790783394482946, + "loss": 3.5892, + "step": 49580 + }, + { + "epoch": 3.36900394075282, + "grad_norm": 1.0353643894195557, + "learning_rate": 0.0005790358744394619, + "loss": 3.2561, + "step": 49585 + }, + { + "epoch": 3.3693436608234815, + "grad_norm": 0.6873120069503784, + "learning_rate": 0.0005789934094306291, + "loss": 3.4652, + "step": 49590 + }, + { + "epoch": 3.369683380894143, + "grad_norm": 0.9052225351333618, + "learning_rate": 0.0005789509444217964, + "loss": 3.518, + "step": 49595 + }, + { + "epoch": 3.370023100964805, + "grad_norm": 0.8863621354103088, + "learning_rate": 0.0005789084794129638, + "loss": 3.5807, + "step": 49600 + }, + { + "epoch": 3.370362821035467, + "grad_norm": 0.8485060930252075, + "learning_rate": 0.000578866014404131, + "loss": 3.545, + "step": 49605 + }, + { + "epoch": 3.3707025411061284, + "grad_norm": 0.8243993520736694, + "learning_rate": 0.0005788235493952983, + "loss": 3.6051, + "step": 49610 + }, + { + "epoch": 3.3710422611767905, + "grad_norm": 0.6892720460891724, + "learning_rate": 0.0005787810843864656, + "loss": 3.3329, + "step": 49615 + }, + { + "epoch": 3.371381981247452, + "grad_norm": 0.8653953671455383, + "learning_rate": 0.0005787386193776328, + "loss": 3.5844, + "step": 49620 + }, + { + "epoch": 3.3717217013181138, + "grad_norm": 1.0588648319244385, + "learning_rate": 0.0005786961543688, + "loss": 3.4964, + "step": 49625 + }, + { + "epoch": 3.372061421388776, + "grad_norm": 0.9723254442214966, + "learning_rate": 0.0005786536893599674, + "loss": 3.5733, + "step": 49630 + }, + { + "epoch": 3.3724011414594375, + "grad_norm": 0.89427250623703, + "learning_rate": 0.0005786112243511347, + "loss": 3.4733, + "step": 49635 + }, + { + "epoch": 3.372740861530099, + "grad_norm": 0.8159297108650208, + "learning_rate": 0.0005785687593423019, + "loss": 3.4316, + "step": 49640 + }, + { + "epoch": 3.373080581600761, + "grad_norm": 0.8976064324378967, + "learning_rate": 0.0005785262943334693, + "loss": 3.4681, + "step": 49645 + }, + { + "epoch": 3.373420301671423, + "grad_norm": 0.6861762404441833, + "learning_rate": 0.0005784838293246365, + "loss": 3.565, + "step": 49650 + }, + { + "epoch": 3.3737600217420844, + "grad_norm": 0.9914200305938721, + "learning_rate": 0.0005784413643158037, + "loss": 3.6162, + "step": 49655 + }, + { + "epoch": 3.3740997418127465, + "grad_norm": 0.9316592216491699, + "learning_rate": 0.0005783988993069711, + "loss": 3.36, + "step": 49660 + }, + { + "epoch": 3.374439461883408, + "grad_norm": 0.8541392087936401, + "learning_rate": 0.0005783564342981383, + "loss": 3.5335, + "step": 49665 + }, + { + "epoch": 3.37477918195407, + "grad_norm": 0.7261362075805664, + "learning_rate": 0.0005783139692893056, + "loss": 3.5562, + "step": 49670 + }, + { + "epoch": 3.375118902024732, + "grad_norm": 0.7883813977241516, + "learning_rate": 0.000578271504280473, + "loss": 3.4864, + "step": 49675 + }, + { + "epoch": 3.3754586220953935, + "grad_norm": 1.0224359035491943, + "learning_rate": 0.0005782290392716402, + "loss": 3.3399, + "step": 49680 + }, + { + "epoch": 3.375798342166055, + "grad_norm": 1.4116184711456299, + "learning_rate": 0.0005781865742628074, + "loss": 3.5174, + "step": 49685 + }, + { + "epoch": 3.3761380622367168, + "grad_norm": 1.0333983898162842, + "learning_rate": 0.0005781441092539747, + "loss": 3.4098, + "step": 49690 + }, + { + "epoch": 3.376477782307379, + "grad_norm": 0.9383448958396912, + "learning_rate": 0.000578101644245142, + "loss": 3.7271, + "step": 49695 + }, + { + "epoch": 3.3768175023780405, + "grad_norm": 0.8765286207199097, + "learning_rate": 0.0005780591792363092, + "loss": 3.533, + "step": 49700 + }, + { + "epoch": 3.377157222448702, + "grad_norm": 0.8886171579360962, + "learning_rate": 0.0005780167142274766, + "loss": 3.4494, + "step": 49705 + }, + { + "epoch": 3.377496942519364, + "grad_norm": 0.8792003989219666, + "learning_rate": 0.0005779742492186439, + "loss": 3.486, + "step": 49710 + }, + { + "epoch": 3.377836662590026, + "grad_norm": 0.759793221950531, + "learning_rate": 0.0005779317842098111, + "loss": 3.3735, + "step": 49715 + }, + { + "epoch": 3.3781763826606874, + "grad_norm": 0.9237404465675354, + "learning_rate": 0.0005778893192009784, + "loss": 3.685, + "step": 49720 + }, + { + "epoch": 3.3785161027313495, + "grad_norm": 0.9466947913169861, + "learning_rate": 0.0005778468541921456, + "loss": 3.4066, + "step": 49725 + }, + { + "epoch": 3.378855822802011, + "grad_norm": 0.9439827799797058, + "learning_rate": 0.000577804389183313, + "loss": 3.5326, + "step": 49730 + }, + { + "epoch": 3.3791955428726728, + "grad_norm": 1.1520558595657349, + "learning_rate": 0.0005777619241744803, + "loss": 3.5601, + "step": 49735 + }, + { + "epoch": 3.379535262943335, + "grad_norm": 0.9040645360946655, + "learning_rate": 0.0005777194591656475, + "loss": 3.5841, + "step": 49740 + }, + { + "epoch": 3.3798749830139965, + "grad_norm": 0.9678524732589722, + "learning_rate": 0.0005776769941568149, + "loss": 3.4563, + "step": 49745 + }, + { + "epoch": 3.380214703084658, + "grad_norm": 0.9383127689361572, + "learning_rate": 0.0005776345291479821, + "loss": 3.4733, + "step": 49750 + }, + { + "epoch": 3.38055442315532, + "grad_norm": 0.8488653302192688, + "learning_rate": 0.0005775920641391493, + "loss": 3.5113, + "step": 49755 + }, + { + "epoch": 3.380894143225982, + "grad_norm": 1.0893418788909912, + "learning_rate": 0.0005775495991303167, + "loss": 3.259, + "step": 49760 + }, + { + "epoch": 3.3812338632966434, + "grad_norm": 1.308603286743164, + "learning_rate": 0.0005775071341214839, + "loss": 3.5802, + "step": 49765 + }, + { + "epoch": 3.3815735833673055, + "grad_norm": 0.8495074510574341, + "learning_rate": 0.0005774646691126512, + "loss": 3.3669, + "step": 49770 + }, + { + "epoch": 3.381913303437967, + "grad_norm": 0.8388389945030212, + "learning_rate": 0.0005774222041038186, + "loss": 3.6153, + "step": 49775 + }, + { + "epoch": 3.3822530235086288, + "grad_norm": 0.9302075505256653, + "learning_rate": 0.0005773797390949858, + "loss": 3.6251, + "step": 49780 + }, + { + "epoch": 3.382592743579291, + "grad_norm": 0.723354697227478, + "learning_rate": 0.000577337274086153, + "loss": 3.5067, + "step": 49785 + }, + { + "epoch": 3.3829324636499525, + "grad_norm": 0.7855278849601746, + "learning_rate": 0.0005772948090773203, + "loss": 3.614, + "step": 49790 + }, + { + "epoch": 3.383272183720614, + "grad_norm": 0.8594374060630798, + "learning_rate": 0.0005772523440684876, + "loss": 3.5334, + "step": 49795 + }, + { + "epoch": 3.3836119037912757, + "grad_norm": 0.813541054725647, + "learning_rate": 0.0005772098790596548, + "loss": 3.6493, + "step": 49800 + }, + { + "epoch": 3.383951623861938, + "grad_norm": 0.7577971816062927, + "learning_rate": 0.0005771674140508222, + "loss": 3.2686, + "step": 49805 + }, + { + "epoch": 3.3842913439325994, + "grad_norm": 0.9489184021949768, + "learning_rate": 0.0005771249490419895, + "loss": 3.6497, + "step": 49810 + }, + { + "epoch": 3.384631064003261, + "grad_norm": 0.8003692626953125, + "learning_rate": 0.0005770824840331567, + "loss": 3.533, + "step": 49815 + }, + { + "epoch": 3.384970784073923, + "grad_norm": 0.8377352952957153, + "learning_rate": 0.000577040019024324, + "loss": 3.434, + "step": 49820 + }, + { + "epoch": 3.385310504144585, + "grad_norm": 0.7379021644592285, + "learning_rate": 0.0005769975540154912, + "loss": 3.5406, + "step": 49825 + }, + { + "epoch": 3.3856502242152464, + "grad_norm": 0.9719108939170837, + "learning_rate": 0.0005769550890066585, + "loss": 3.7998, + "step": 49830 + }, + { + "epoch": 3.3859899442859085, + "grad_norm": 1.1846009492874146, + "learning_rate": 0.0005769126239978258, + "loss": 3.881, + "step": 49835 + }, + { + "epoch": 3.38632966435657, + "grad_norm": 0.7503835558891296, + "learning_rate": 0.0005768701589889931, + "loss": 3.3638, + "step": 49840 + }, + { + "epoch": 3.3866693844272318, + "grad_norm": 0.9931747913360596, + "learning_rate": 0.0005768276939801604, + "loss": 3.3394, + "step": 49845 + }, + { + "epoch": 3.387009104497894, + "grad_norm": 0.8698179125785828, + "learning_rate": 0.0005767852289713277, + "loss": 3.7144, + "step": 49850 + }, + { + "epoch": 3.3873488245685555, + "grad_norm": 0.8937836289405823, + "learning_rate": 0.0005767427639624949, + "loss": 3.5248, + "step": 49855 + }, + { + "epoch": 3.387688544639217, + "grad_norm": 1.172354817390442, + "learning_rate": 0.0005767002989536622, + "loss": 3.5171, + "step": 49860 + }, + { + "epoch": 3.388028264709879, + "grad_norm": 2.4199867248535156, + "learning_rate": 0.0005766578339448295, + "loss": 3.5491, + "step": 49865 + }, + { + "epoch": 3.388367984780541, + "grad_norm": 0.9691428542137146, + "learning_rate": 0.0005766153689359967, + "loss": 3.6447, + "step": 49870 + }, + { + "epoch": 3.3887077048512024, + "grad_norm": 0.8048043251037598, + "learning_rate": 0.000576572903927164, + "loss": 3.6864, + "step": 49875 + }, + { + "epoch": 3.3890474249218645, + "grad_norm": 0.8955517411231995, + "learning_rate": 0.0005765304389183314, + "loss": 3.6196, + "step": 49880 + }, + { + "epoch": 3.389387144992526, + "grad_norm": 1.0135475397109985, + "learning_rate": 0.0005764879739094986, + "loss": 3.4329, + "step": 49885 + }, + { + "epoch": 3.3897268650631878, + "grad_norm": 1.0535873174667358, + "learning_rate": 0.0005764455089006658, + "loss": 3.332, + "step": 49890 + }, + { + "epoch": 3.39006658513385, + "grad_norm": 0.7889883518218994, + "learning_rate": 0.0005764030438918332, + "loss": 3.5857, + "step": 49895 + }, + { + "epoch": 3.3904063052045115, + "grad_norm": 0.9851111769676208, + "learning_rate": 0.0005763605788830004, + "loss": 3.6924, + "step": 49900 + }, + { + "epoch": 3.390746025275173, + "grad_norm": 0.9980193972587585, + "learning_rate": 0.0005763181138741676, + "loss": 3.416, + "step": 49905 + }, + { + "epoch": 3.391085745345835, + "grad_norm": 0.9257372617721558, + "learning_rate": 0.0005762756488653351, + "loss": 3.442, + "step": 49910 + }, + { + "epoch": 3.391425465416497, + "grad_norm": 1.1791220903396606, + "learning_rate": 0.0005762331838565023, + "loss": 3.4085, + "step": 49915 + }, + { + "epoch": 3.3917651854871584, + "grad_norm": 1.183353304862976, + "learning_rate": 0.0005761907188476695, + "loss": 3.3609, + "step": 49920 + }, + { + "epoch": 3.3921049055578205, + "grad_norm": 0.8577978610992432, + "learning_rate": 0.0005761482538388368, + "loss": 3.5902, + "step": 49925 + }, + { + "epoch": 3.392444625628482, + "grad_norm": 0.8301143050193787, + "learning_rate": 0.0005761057888300041, + "loss": 3.5556, + "step": 49930 + }, + { + "epoch": 3.3927843456991438, + "grad_norm": 0.941874623298645, + "learning_rate": 0.0005760633238211713, + "loss": 3.6918, + "step": 49935 + }, + { + "epoch": 3.393124065769806, + "grad_norm": 1.0854713916778564, + "learning_rate": 0.0005760208588123386, + "loss": 3.4471, + "step": 49940 + }, + { + "epoch": 3.3934637858404675, + "grad_norm": 0.9775370955467224, + "learning_rate": 0.000575978393803506, + "loss": 3.5625, + "step": 49945 + }, + { + "epoch": 3.393803505911129, + "grad_norm": 0.8222533464431763, + "learning_rate": 0.0005759359287946732, + "loss": 3.481, + "step": 49950 + }, + { + "epoch": 3.394143225981791, + "grad_norm": 0.6752081513404846, + "learning_rate": 0.0005758934637858405, + "loss": 3.691, + "step": 49955 + }, + { + "epoch": 3.394482946052453, + "grad_norm": 0.8202663660049438, + "learning_rate": 0.0005758509987770078, + "loss": 3.5312, + "step": 49960 + }, + { + "epoch": 3.3948226661231145, + "grad_norm": 0.8152515292167664, + "learning_rate": 0.000575808533768175, + "loss": 3.5623, + "step": 49965 + }, + { + "epoch": 3.3951623861937765, + "grad_norm": 0.903867781162262, + "learning_rate": 0.0005757660687593423, + "loss": 3.3136, + "step": 49970 + }, + { + "epoch": 3.395502106264438, + "grad_norm": 0.7957409024238586, + "learning_rate": 0.0005757236037505095, + "loss": 3.5014, + "step": 49975 + }, + { + "epoch": 3.3958418263351, + "grad_norm": 0.9124695062637329, + "learning_rate": 0.0005756811387416769, + "loss": 3.6069, + "step": 49980 + }, + { + "epoch": 3.396181546405762, + "grad_norm": 0.7117151618003845, + "learning_rate": 0.0005756386737328442, + "loss": 3.4896, + "step": 49985 + }, + { + "epoch": 3.3965212664764235, + "grad_norm": 0.7859393954277039, + "learning_rate": 0.0005755962087240114, + "loss": 3.7358, + "step": 49990 + }, + { + "epoch": 3.396860986547085, + "grad_norm": 0.7478525638580322, + "learning_rate": 0.0005755537437151787, + "loss": 3.3436, + "step": 49995 + }, + { + "epoch": 3.397200706617747, + "grad_norm": 0.868699848651886, + "learning_rate": 0.000575511278706346, + "loss": 3.6626, + "step": 50000 + }, + { + "epoch": 3.397540426688409, + "grad_norm": 0.9656057357788086, + "learning_rate": 0.0005754688136975132, + "loss": 3.5772, + "step": 50005 + }, + { + "epoch": 3.3978801467590705, + "grad_norm": 0.9830896258354187, + "learning_rate": 0.0005754263486886804, + "loss": 3.6923, + "step": 50010 + }, + { + "epoch": 3.3982198668297325, + "grad_norm": 1.0716181993484497, + "learning_rate": 0.0005753838836798479, + "loss": 3.5909, + "step": 50015 + }, + { + "epoch": 3.398559586900394, + "grad_norm": 0.9497999548912048, + "learning_rate": 0.0005753414186710151, + "loss": 3.45, + "step": 50020 + }, + { + "epoch": 3.398899306971056, + "grad_norm": 1.0860949754714966, + "learning_rate": 0.0005752989536621823, + "loss": 3.6738, + "step": 50025 + }, + { + "epoch": 3.3992390270417174, + "grad_norm": 0.8698149919509888, + "learning_rate": 0.0005752564886533497, + "loss": 3.423, + "step": 50030 + }, + { + "epoch": 3.3995787471123795, + "grad_norm": 0.9198766350746155, + "learning_rate": 0.0005752140236445169, + "loss": 3.5578, + "step": 50035 + }, + { + "epoch": 3.399918467183041, + "grad_norm": 0.8968635201454163, + "learning_rate": 0.0005751715586356841, + "loss": 3.6812, + "step": 50040 + }, + { + "epoch": 3.4002581872537028, + "grad_norm": 1.6590577363967896, + "learning_rate": 0.0005751290936268515, + "loss": 3.4674, + "step": 50045 + }, + { + "epoch": 3.400597907324365, + "grad_norm": 0.8317657113075256, + "learning_rate": 0.0005750866286180188, + "loss": 3.563, + "step": 50050 + }, + { + "epoch": 3.4009376273950265, + "grad_norm": 0.9401906132698059, + "learning_rate": 0.000575044163609186, + "loss": 3.567, + "step": 50055 + }, + { + "epoch": 3.401277347465688, + "grad_norm": 0.773350715637207, + "learning_rate": 0.0005750016986003534, + "loss": 3.3647, + "step": 50060 + }, + { + "epoch": 3.40161706753635, + "grad_norm": 0.8130866885185242, + "learning_rate": 0.0005749592335915206, + "loss": 3.5605, + "step": 50065 + }, + { + "epoch": 3.401956787607012, + "grad_norm": 1.1281410455703735, + "learning_rate": 0.0005749167685826879, + "loss": 3.4493, + "step": 50070 + }, + { + "epoch": 3.4022965076776734, + "grad_norm": 1.0046215057373047, + "learning_rate": 0.0005748743035738551, + "loss": 3.5523, + "step": 50075 + }, + { + "epoch": 3.4026362277483355, + "grad_norm": 0.9062886834144592, + "learning_rate": 0.0005748318385650224, + "loss": 3.2673, + "step": 50080 + }, + { + "epoch": 3.402975947818997, + "grad_norm": 0.8980346322059631, + "learning_rate": 0.0005747893735561898, + "loss": 3.7904, + "step": 50085 + }, + { + "epoch": 3.403315667889659, + "grad_norm": 0.8816210031509399, + "learning_rate": 0.000574746908547357, + "loss": 3.5162, + "step": 50090 + }, + { + "epoch": 3.403655387960321, + "grad_norm": 1.2273402214050293, + "learning_rate": 0.0005747044435385243, + "loss": 3.7291, + "step": 50095 + }, + { + "epoch": 3.4039951080309825, + "grad_norm": 0.8038480281829834, + "learning_rate": 0.0005746619785296916, + "loss": 3.4903, + "step": 50100 + }, + { + "epoch": 3.404334828101644, + "grad_norm": 0.9933997392654419, + "learning_rate": 0.0005746195135208588, + "loss": 3.5797, + "step": 50105 + }, + { + "epoch": 3.404674548172306, + "grad_norm": 0.8517804145812988, + "learning_rate": 0.000574577048512026, + "loss": 3.7196, + "step": 50110 + }, + { + "epoch": 3.405014268242968, + "grad_norm": 0.970610499382019, + "learning_rate": 0.0005745345835031934, + "loss": 3.6165, + "step": 50115 + }, + { + "epoch": 3.4053539883136295, + "grad_norm": 0.8250759243965149, + "learning_rate": 0.0005744921184943607, + "loss": 3.6527, + "step": 50120 + }, + { + "epoch": 3.4056937083842915, + "grad_norm": 1.0110492706298828, + "learning_rate": 0.0005744496534855279, + "loss": 3.5651, + "step": 50125 + }, + { + "epoch": 3.406033428454953, + "grad_norm": 0.918913722038269, + "learning_rate": 0.0005744071884766953, + "loss": 3.7343, + "step": 50130 + }, + { + "epoch": 3.406373148525615, + "grad_norm": 0.8714845776557922, + "learning_rate": 0.0005743647234678625, + "loss": 3.4655, + "step": 50135 + }, + { + "epoch": 3.4067128685962764, + "grad_norm": 0.7717921137809753, + "learning_rate": 0.0005743222584590297, + "loss": 3.609, + "step": 50140 + }, + { + "epoch": 3.4070525886669385, + "grad_norm": 0.9543799757957458, + "learning_rate": 0.0005742797934501971, + "loss": 3.4921, + "step": 50145 + }, + { + "epoch": 3.4073923087376, + "grad_norm": 1.0202826261520386, + "learning_rate": 0.0005742373284413643, + "loss": 3.4981, + "step": 50150 + }, + { + "epoch": 3.4077320288082618, + "grad_norm": 0.9284073114395142, + "learning_rate": 0.0005741948634325316, + "loss": 3.7482, + "step": 50155 + }, + { + "epoch": 3.408071748878924, + "grad_norm": 0.8760298490524292, + "learning_rate": 0.000574152398423699, + "loss": 3.5102, + "step": 50160 + }, + { + "epoch": 3.4084114689495855, + "grad_norm": 0.888300895690918, + "learning_rate": 0.0005741099334148662, + "loss": 3.7932, + "step": 50165 + }, + { + "epoch": 3.408751189020247, + "grad_norm": 0.9118449687957764, + "learning_rate": 0.0005740674684060334, + "loss": 3.5538, + "step": 50170 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.7671306133270264, + "learning_rate": 0.0005740250033972007, + "loss": 3.4639, + "step": 50175 + }, + { + "epoch": 3.409430629161571, + "grad_norm": 0.8283976912498474, + "learning_rate": 0.000573982538388368, + "loss": 3.4672, + "step": 50180 + }, + { + "epoch": 3.4097703492322324, + "grad_norm": 0.9731794595718384, + "learning_rate": 0.0005739400733795352, + "loss": 3.5356, + "step": 50185 + }, + { + "epoch": 3.4101100693028945, + "grad_norm": 0.9376983642578125, + "learning_rate": 0.0005738976083707026, + "loss": 3.4167, + "step": 50190 + }, + { + "epoch": 3.410449789373556, + "grad_norm": 0.8369411826133728, + "learning_rate": 0.0005738551433618699, + "loss": 3.3161, + "step": 50195 + }, + { + "epoch": 3.4107895094442178, + "grad_norm": 0.9789059162139893, + "learning_rate": 0.0005738126783530371, + "loss": 3.7591, + "step": 50200 + }, + { + "epoch": 3.41112922951488, + "grad_norm": 1.5260155200958252, + "learning_rate": 0.0005737702133442044, + "loss": 3.7166, + "step": 50205 + }, + { + "epoch": 3.4114689495855415, + "grad_norm": 0.7041724324226379, + "learning_rate": 0.0005737277483353716, + "loss": 3.2696, + "step": 50210 + }, + { + "epoch": 3.411808669656203, + "grad_norm": 0.8872123956680298, + "learning_rate": 0.0005736852833265389, + "loss": 3.6847, + "step": 50215 + }, + { + "epoch": 3.412148389726865, + "grad_norm": 0.8216543793678284, + "learning_rate": 0.0005736428183177062, + "loss": 3.5606, + "step": 50220 + }, + { + "epoch": 3.412488109797527, + "grad_norm": 0.9878095984458923, + "learning_rate": 0.0005736003533088735, + "loss": 3.4705, + "step": 50225 + }, + { + "epoch": 3.4128278298681884, + "grad_norm": 1.1303635835647583, + "learning_rate": 0.0005735578883000408, + "loss": 3.4315, + "step": 50230 + }, + { + "epoch": 3.4131675499388505, + "grad_norm": 0.9155978560447693, + "learning_rate": 0.0005735154232912081, + "loss": 3.2873, + "step": 50235 + }, + { + "epoch": 3.413507270009512, + "grad_norm": 0.8432127237319946, + "learning_rate": 0.0005734729582823753, + "loss": 3.5082, + "step": 50240 + }, + { + "epoch": 3.413846990080174, + "grad_norm": 1.1378315687179565, + "learning_rate": 0.0005734304932735426, + "loss": 3.3356, + "step": 50245 + }, + { + "epoch": 3.414186710150836, + "grad_norm": 0.8533917665481567, + "learning_rate": 0.0005733880282647099, + "loss": 3.6214, + "step": 50250 + }, + { + "epoch": 3.4145264302214975, + "grad_norm": 0.8682616949081421, + "learning_rate": 0.0005733455632558771, + "loss": 3.3716, + "step": 50255 + }, + { + "epoch": 3.414866150292159, + "grad_norm": 0.9716643691062927, + "learning_rate": 0.0005733030982470444, + "loss": 3.5501, + "step": 50260 + }, + { + "epoch": 3.415205870362821, + "grad_norm": 0.7967177629470825, + "learning_rate": 0.0005732606332382118, + "loss": 3.7447, + "step": 50265 + }, + { + "epoch": 3.415545590433483, + "grad_norm": 0.9651310443878174, + "learning_rate": 0.000573218168229379, + "loss": 3.6377, + "step": 50270 + }, + { + "epoch": 3.4158853105041445, + "grad_norm": 1.2807878255844116, + "learning_rate": 0.0005731757032205462, + "loss": 3.5642, + "step": 50275 + }, + { + "epoch": 3.4162250305748065, + "grad_norm": 0.7533833980560303, + "learning_rate": 0.0005731332382117136, + "loss": 3.7246, + "step": 50280 + }, + { + "epoch": 3.416564750645468, + "grad_norm": 0.9303075671195984, + "learning_rate": 0.0005730907732028808, + "loss": 3.5114, + "step": 50285 + }, + { + "epoch": 3.41690447071613, + "grad_norm": 0.7236813306808472, + "learning_rate": 0.000573048308194048, + "loss": 3.6417, + "step": 50290 + }, + { + "epoch": 3.417244190786792, + "grad_norm": 1.2435483932495117, + "learning_rate": 0.0005730058431852155, + "loss": 3.4472, + "step": 50295 + }, + { + "epoch": 3.4175839108574535, + "grad_norm": 1.0605806112289429, + "learning_rate": 0.0005729633781763827, + "loss": 3.4302, + "step": 50300 + }, + { + "epoch": 3.417923630928115, + "grad_norm": 0.7964075207710266, + "learning_rate": 0.0005729209131675499, + "loss": 3.3854, + "step": 50305 + }, + { + "epoch": 3.418263350998777, + "grad_norm": 0.809626042842865, + "learning_rate": 0.0005728784481587172, + "loss": 3.6186, + "step": 50310 + }, + { + "epoch": 3.418603071069439, + "grad_norm": 0.911316454410553, + "learning_rate": 0.0005728359831498845, + "loss": 3.3782, + "step": 50315 + }, + { + "epoch": 3.4189427911401005, + "grad_norm": 1.0848240852355957, + "learning_rate": 0.0005727935181410517, + "loss": 3.6257, + "step": 50320 + }, + { + "epoch": 3.4192825112107625, + "grad_norm": 0.8441908359527588, + "learning_rate": 0.0005727510531322191, + "loss": 3.4496, + "step": 50325 + }, + { + "epoch": 3.419622231281424, + "grad_norm": 1.0398932695388794, + "learning_rate": 0.0005727085881233864, + "loss": 3.7268, + "step": 50330 + }, + { + "epoch": 3.419961951352086, + "grad_norm": 0.7537044882774353, + "learning_rate": 0.0005726661231145536, + "loss": 3.3344, + "step": 50335 + }, + { + "epoch": 3.420301671422748, + "grad_norm": 0.9815772771835327, + "learning_rate": 0.0005726236581057209, + "loss": 3.7557, + "step": 50340 + }, + { + "epoch": 3.4206413914934095, + "grad_norm": 0.9954870939254761, + "learning_rate": 0.0005725811930968882, + "loss": 3.6186, + "step": 50345 + }, + { + "epoch": 3.420981111564071, + "grad_norm": 0.8874187469482422, + "learning_rate": 0.0005725387280880554, + "loss": 3.5347, + "step": 50350 + }, + { + "epoch": 3.421320831634733, + "grad_norm": 0.9769953489303589, + "learning_rate": 0.0005724962630792227, + "loss": 3.606, + "step": 50355 + }, + { + "epoch": 3.421660551705395, + "grad_norm": 1.018858790397644, + "learning_rate": 0.00057245379807039, + "loss": 3.4209, + "step": 50360 + }, + { + "epoch": 3.4220002717760565, + "grad_norm": 0.8123021721839905, + "learning_rate": 0.0005724113330615573, + "loss": 3.5147, + "step": 50365 + }, + { + "epoch": 3.422339991846718, + "grad_norm": 0.881446361541748, + "learning_rate": 0.0005723688680527246, + "loss": 3.467, + "step": 50370 + }, + { + "epoch": 3.42267971191738, + "grad_norm": 0.9096500277519226, + "learning_rate": 0.0005723264030438918, + "loss": 3.6454, + "step": 50375 + }, + { + "epoch": 3.423019431988042, + "grad_norm": 0.9773837924003601, + "learning_rate": 0.0005722839380350591, + "loss": 3.2926, + "step": 50380 + }, + { + "epoch": 3.4233591520587034, + "grad_norm": 1.0193259716033936, + "learning_rate": 0.0005722414730262264, + "loss": 3.7621, + "step": 50385 + }, + { + "epoch": 3.4236988721293655, + "grad_norm": 0.861585259437561, + "learning_rate": 0.0005721990080173936, + "loss": 3.4484, + "step": 50390 + }, + { + "epoch": 3.424038592200027, + "grad_norm": 0.7826228141784668, + "learning_rate": 0.000572156543008561, + "loss": 3.5088, + "step": 50395 + }, + { + "epoch": 3.424378312270689, + "grad_norm": 0.8084939122200012, + "learning_rate": 0.0005721140779997283, + "loss": 3.4895, + "step": 50400 + }, + { + "epoch": 3.424718032341351, + "grad_norm": 0.7932422161102295, + "learning_rate": 0.0005720716129908955, + "loss": 3.3704, + "step": 50405 + }, + { + "epoch": 3.4250577524120125, + "grad_norm": 0.9412611126899719, + "learning_rate": 0.0005720291479820629, + "loss": 3.5941, + "step": 50410 + }, + { + "epoch": 3.425397472482674, + "grad_norm": 0.9537215828895569, + "learning_rate": 0.0005719866829732301, + "loss": 3.3724, + "step": 50415 + }, + { + "epoch": 3.425737192553336, + "grad_norm": 0.8393676280975342, + "learning_rate": 0.0005719442179643973, + "loss": 3.5104, + "step": 50420 + }, + { + "epoch": 3.426076912623998, + "grad_norm": 3.0899221897125244, + "learning_rate": 0.0005719017529555646, + "loss": 3.6389, + "step": 50425 + }, + { + "epoch": 3.4264166326946595, + "grad_norm": 0.9157818555831909, + "learning_rate": 0.000571859287946732, + "loss": 3.571, + "step": 50430 + }, + { + "epoch": 3.4267563527653215, + "grad_norm": 0.714016318321228, + "learning_rate": 0.0005718168229378992, + "loss": 3.3174, + "step": 50435 + }, + { + "epoch": 3.427096072835983, + "grad_norm": 3.624479293823242, + "learning_rate": 0.0005717743579290665, + "loss": 3.3192, + "step": 50440 + }, + { + "epoch": 3.427435792906645, + "grad_norm": 0.7737239599227905, + "learning_rate": 0.0005717318929202338, + "loss": 3.4945, + "step": 50445 + }, + { + "epoch": 3.427775512977307, + "grad_norm": 0.8943622708320618, + "learning_rate": 0.000571689427911401, + "loss": 3.3962, + "step": 50450 + }, + { + "epoch": 3.4281152330479685, + "grad_norm": 0.8933145999908447, + "learning_rate": 0.0005716469629025683, + "loss": 3.496, + "step": 50455 + }, + { + "epoch": 3.42845495311863, + "grad_norm": 0.7683107256889343, + "learning_rate": 0.0005716044978937355, + "loss": 3.6764, + "step": 50460 + }, + { + "epoch": 3.428794673189292, + "grad_norm": 1.0067991018295288, + "learning_rate": 0.0005715620328849029, + "loss": 3.2515, + "step": 50465 + }, + { + "epoch": 3.429134393259954, + "grad_norm": 0.8209438920021057, + "learning_rate": 0.0005715195678760702, + "loss": 3.5094, + "step": 50470 + }, + { + "epoch": 3.4294741133306155, + "grad_norm": 0.8933861255645752, + "learning_rate": 0.0005714771028672374, + "loss": 3.5915, + "step": 50475 + }, + { + "epoch": 3.429813833401277, + "grad_norm": 2.804863452911377, + "learning_rate": 0.0005714346378584047, + "loss": 3.6076, + "step": 50480 + }, + { + "epoch": 3.430153553471939, + "grad_norm": 0.8806614279747009, + "learning_rate": 0.000571392172849572, + "loss": 3.6114, + "step": 50485 + }, + { + "epoch": 3.430493273542601, + "grad_norm": 0.6364525556564331, + "learning_rate": 0.0005713497078407392, + "loss": 3.5328, + "step": 50490 + }, + { + "epoch": 3.4308329936132624, + "grad_norm": 1.225527286529541, + "learning_rate": 0.0005713072428319064, + "loss": 3.3838, + "step": 50495 + }, + { + "epoch": 3.4311727136839245, + "grad_norm": 0.9523400664329529, + "learning_rate": 0.0005712647778230739, + "loss": 3.7156, + "step": 50500 + }, + { + "epoch": 3.431512433754586, + "grad_norm": 0.7459933161735535, + "learning_rate": 0.0005712223128142411, + "loss": 3.5043, + "step": 50505 + }, + { + "epoch": 3.4318521538252478, + "grad_norm": 2.3976974487304688, + "learning_rate": 0.0005711798478054083, + "loss": 3.3371, + "step": 50510 + }, + { + "epoch": 3.43219187389591, + "grad_norm": 0.9851628541946411, + "learning_rate": 0.0005711373827965757, + "loss": 3.5257, + "step": 50515 + }, + { + "epoch": 3.4325315939665715, + "grad_norm": 0.9798867702484131, + "learning_rate": 0.0005710949177877429, + "loss": 3.5422, + "step": 50520 + }, + { + "epoch": 3.432871314037233, + "grad_norm": 0.8075578212738037, + "learning_rate": 0.0005710524527789101, + "loss": 3.2031, + "step": 50525 + }, + { + "epoch": 3.433211034107895, + "grad_norm": 0.825364887714386, + "learning_rate": 0.0005710099877700775, + "loss": 3.5421, + "step": 50530 + }, + { + "epoch": 3.433550754178557, + "grad_norm": 0.9858722686767578, + "learning_rate": 0.0005709675227612448, + "loss": 3.2984, + "step": 50535 + }, + { + "epoch": 3.4338904742492184, + "grad_norm": 0.9933401346206665, + "learning_rate": 0.000570925057752412, + "loss": 3.5415, + "step": 50540 + }, + { + "epoch": 3.4342301943198805, + "grad_norm": 0.9146266579627991, + "learning_rate": 0.0005708825927435794, + "loss": 3.4446, + "step": 50545 + }, + { + "epoch": 3.434569914390542, + "grad_norm": 0.9559007287025452, + "learning_rate": 0.0005708401277347466, + "loss": 3.5919, + "step": 50550 + }, + { + "epoch": 3.434909634461204, + "grad_norm": 0.8345280885696411, + "learning_rate": 0.0005707976627259138, + "loss": 3.4638, + "step": 50555 + }, + { + "epoch": 3.435249354531866, + "grad_norm": 1.1103558540344238, + "learning_rate": 0.0005707551977170811, + "loss": 3.7457, + "step": 50560 + }, + { + "epoch": 3.4355890746025275, + "grad_norm": 0.8759259581565857, + "learning_rate": 0.0005707127327082484, + "loss": 3.4434, + "step": 50565 + }, + { + "epoch": 3.435928794673189, + "grad_norm": 0.7567179203033447, + "learning_rate": 0.0005706702676994157, + "loss": 3.4759, + "step": 50570 + }, + { + "epoch": 3.436268514743851, + "grad_norm": 0.8903446793556213, + "learning_rate": 0.000570627802690583, + "loss": 3.4172, + "step": 50575 + }, + { + "epoch": 3.436608234814513, + "grad_norm": 1.0649511814117432, + "learning_rate": 0.0005705853376817503, + "loss": 3.4711, + "step": 50580 + }, + { + "epoch": 3.4369479548851745, + "grad_norm": 1.2484006881713867, + "learning_rate": 0.0005705428726729175, + "loss": 3.681, + "step": 50585 + }, + { + "epoch": 3.4372876749558365, + "grad_norm": 1.1946030855178833, + "learning_rate": 0.0005705004076640848, + "loss": 3.5156, + "step": 50590 + }, + { + "epoch": 3.437627395026498, + "grad_norm": 0.8881866931915283, + "learning_rate": 0.000570457942655252, + "loss": 3.6714, + "step": 50595 + }, + { + "epoch": 3.43796711509716, + "grad_norm": 0.7064659595489502, + "learning_rate": 0.0005704154776464193, + "loss": 3.6839, + "step": 50600 + }, + { + "epoch": 3.438306835167822, + "grad_norm": 0.6961212754249573, + "learning_rate": 0.0005703730126375867, + "loss": 3.5803, + "step": 50605 + }, + { + "epoch": 3.4386465552384835, + "grad_norm": 0.8226665258407593, + "learning_rate": 0.0005703305476287539, + "loss": 3.5175, + "step": 50610 + }, + { + "epoch": 3.438986275309145, + "grad_norm": 0.8005390167236328, + "learning_rate": 0.0005702880826199212, + "loss": 3.2036, + "step": 50615 + }, + { + "epoch": 3.439325995379807, + "grad_norm": 0.9491279125213623, + "learning_rate": 0.0005702456176110885, + "loss": 3.6572, + "step": 50620 + }, + { + "epoch": 3.439665715450469, + "grad_norm": 0.8843263387680054, + "learning_rate": 0.0005702031526022557, + "loss": 3.4924, + "step": 50625 + }, + { + "epoch": 3.4400054355211305, + "grad_norm": 0.8825557827949524, + "learning_rate": 0.000570160687593423, + "loss": 3.7273, + "step": 50630 + }, + { + "epoch": 3.4403451555917925, + "grad_norm": 0.9091175198554993, + "learning_rate": 0.0005701182225845903, + "loss": 3.4756, + "step": 50635 + }, + { + "epoch": 3.440684875662454, + "grad_norm": 0.7703776359558105, + "learning_rate": 0.0005700757575757576, + "loss": 3.4821, + "step": 50640 + }, + { + "epoch": 3.441024595733116, + "grad_norm": 1.0410175323486328, + "learning_rate": 0.0005700332925669249, + "loss": 3.7609, + "step": 50645 + }, + { + "epoch": 3.441364315803778, + "grad_norm": 0.7933105230331421, + "learning_rate": 0.0005699908275580922, + "loss": 3.4838, + "step": 50650 + }, + { + "epoch": 3.4417040358744395, + "grad_norm": 1.2558836936950684, + "learning_rate": 0.0005699483625492594, + "loss": 3.7221, + "step": 50655 + }, + { + "epoch": 3.442043755945101, + "grad_norm": 1.0918821096420288, + "learning_rate": 0.0005699058975404266, + "loss": 3.6025, + "step": 50660 + }, + { + "epoch": 3.442383476015763, + "grad_norm": 0.6222479343414307, + "learning_rate": 0.000569863432531594, + "loss": 3.7571, + "step": 50665 + }, + { + "epoch": 3.442723196086425, + "grad_norm": 0.8531039953231812, + "learning_rate": 0.0005698209675227612, + "loss": 3.4993, + "step": 50670 + }, + { + "epoch": 3.4430629161570865, + "grad_norm": 0.8129599690437317, + "learning_rate": 0.0005697785025139285, + "loss": 3.4784, + "step": 50675 + }, + { + "epoch": 3.4434026362277486, + "grad_norm": 0.8597965836524963, + "learning_rate": 0.0005697360375050959, + "loss": 3.6046, + "step": 50680 + }, + { + "epoch": 3.44374235629841, + "grad_norm": 0.8667415976524353, + "learning_rate": 0.0005696935724962631, + "loss": 3.6303, + "step": 50685 + }, + { + "epoch": 3.444082076369072, + "grad_norm": 1.0438060760498047, + "learning_rate": 0.0005696511074874303, + "loss": 3.6525, + "step": 50690 + }, + { + "epoch": 3.444421796439734, + "grad_norm": 0.889739990234375, + "learning_rate": 0.0005696086424785977, + "loss": 3.5784, + "step": 50695 + }, + { + "epoch": 3.4447615165103955, + "grad_norm": 0.7096075415611267, + "learning_rate": 0.0005695661774697649, + "loss": 3.4267, + "step": 50700 + }, + { + "epoch": 3.445101236581057, + "grad_norm": 0.864692747592926, + "learning_rate": 0.0005695237124609321, + "loss": 3.4258, + "step": 50705 + }, + { + "epoch": 3.4454409566517192, + "grad_norm": 0.8251798748970032, + "learning_rate": 0.0005694812474520995, + "loss": 3.6307, + "step": 50710 + }, + { + "epoch": 3.445780676722381, + "grad_norm": 1.040792465209961, + "learning_rate": 0.0005694387824432668, + "loss": 3.7316, + "step": 50715 + }, + { + "epoch": 3.4461203967930425, + "grad_norm": 0.8786572813987732, + "learning_rate": 0.000569396317434434, + "loss": 3.6333, + "step": 50720 + }, + { + "epoch": 3.446460116863704, + "grad_norm": 0.7633973956108093, + "learning_rate": 0.0005693538524256013, + "loss": 3.7283, + "step": 50725 + }, + { + "epoch": 3.446799836934366, + "grad_norm": 1.043666958808899, + "learning_rate": 0.0005693113874167686, + "loss": 3.8939, + "step": 50730 + }, + { + "epoch": 3.447139557005028, + "grad_norm": 1.2220017910003662, + "learning_rate": 0.0005692689224079358, + "loss": 3.3209, + "step": 50735 + }, + { + "epoch": 3.4474792770756895, + "grad_norm": 0.808220386505127, + "learning_rate": 0.0005692264573991031, + "loss": 3.5517, + "step": 50740 + }, + { + "epoch": 3.4478189971463515, + "grad_norm": 0.7418552041053772, + "learning_rate": 0.0005691839923902705, + "loss": 3.2665, + "step": 50745 + }, + { + "epoch": 3.448158717217013, + "grad_norm": 1.0045408010482788, + "learning_rate": 0.0005691415273814378, + "loss": 3.5522, + "step": 50750 + }, + { + "epoch": 3.448498437287675, + "grad_norm": 2.531285047531128, + "learning_rate": 0.000569099062372605, + "loss": 3.4527, + "step": 50755 + }, + { + "epoch": 3.448838157358337, + "grad_norm": 1.3924392461776733, + "learning_rate": 0.0005690565973637722, + "loss": 3.3105, + "step": 50760 + }, + { + "epoch": 3.4491778774289985, + "grad_norm": 0.8543065786361694, + "learning_rate": 0.0005690141323549396, + "loss": 3.2012, + "step": 50765 + }, + { + "epoch": 3.44951759749966, + "grad_norm": 1.1210556030273438, + "learning_rate": 0.0005689716673461068, + "loss": 3.5511, + "step": 50770 + }, + { + "epoch": 3.449857317570322, + "grad_norm": 1.063027024269104, + "learning_rate": 0.000568929202337274, + "loss": 3.357, + "step": 50775 + }, + { + "epoch": 3.450197037640984, + "grad_norm": 0.765548586845398, + "learning_rate": 0.0005688867373284415, + "loss": 3.3444, + "step": 50780 + }, + { + "epoch": 3.4505367577116455, + "grad_norm": 0.8437544703483582, + "learning_rate": 0.0005688442723196087, + "loss": 3.4908, + "step": 50785 + }, + { + "epoch": 3.4508764777823075, + "grad_norm": 0.9316146969795227, + "learning_rate": 0.0005688018073107759, + "loss": 3.8704, + "step": 50790 + }, + { + "epoch": 3.451216197852969, + "grad_norm": 1.2177737951278687, + "learning_rate": 0.0005687593423019433, + "loss": 3.4699, + "step": 50795 + }, + { + "epoch": 3.451555917923631, + "grad_norm": 0.7834724187850952, + "learning_rate": 0.0005687168772931105, + "loss": 3.699, + "step": 50800 + }, + { + "epoch": 3.451895637994293, + "grad_norm": 0.9548046588897705, + "learning_rate": 0.0005686744122842777, + "loss": 3.5771, + "step": 50805 + }, + { + "epoch": 3.4522353580649545, + "grad_norm": 0.8696679472923279, + "learning_rate": 0.0005686404402772116, + "loss": 3.5327, + "step": 50810 + }, + { + "epoch": 3.452575078135616, + "grad_norm": 0.6721393465995789, + "learning_rate": 0.0005685979752683789, + "loss": 3.6998, + "step": 50815 + }, + { + "epoch": 3.452914798206278, + "grad_norm": 0.8136578798294067, + "learning_rate": 0.0005685555102595461, + "loss": 3.4745, + "step": 50820 + }, + { + "epoch": 3.45325451827694, + "grad_norm": 0.7502471804618835, + "learning_rate": 0.0005685130452507134, + "loss": 3.5482, + "step": 50825 + }, + { + "epoch": 3.4535942383476015, + "grad_norm": 0.90555340051651, + "learning_rate": 0.0005684705802418806, + "loss": 3.5209, + "step": 50830 + }, + { + "epoch": 3.453933958418263, + "grad_norm": 0.9339720010757446, + "learning_rate": 0.000568428115233048, + "loss": 3.4007, + "step": 50835 + }, + { + "epoch": 3.454273678488925, + "grad_norm": 0.8884099721908569, + "learning_rate": 0.0005683856502242153, + "loss": 3.516, + "step": 50840 + }, + { + "epoch": 3.454613398559587, + "grad_norm": 0.7895032167434692, + "learning_rate": 0.0005683431852153825, + "loss": 3.4484, + "step": 50845 + }, + { + "epoch": 3.4549531186302485, + "grad_norm": 0.9309988617897034, + "learning_rate": 0.0005683007202065498, + "loss": 3.595, + "step": 50850 + }, + { + "epoch": 3.4552928387009105, + "grad_norm": 1.1492661237716675, + "learning_rate": 0.0005682582551977171, + "loss": 3.6407, + "step": 50855 + }, + { + "epoch": 3.455632558771572, + "grad_norm": 0.8543238043785095, + "learning_rate": 0.0005682157901888843, + "loss": 3.3193, + "step": 50860 + }, + { + "epoch": 3.455972278842234, + "grad_norm": 0.9281453490257263, + "learning_rate": 0.0005681733251800516, + "loss": 3.444, + "step": 50865 + }, + { + "epoch": 3.456311998912896, + "grad_norm": 0.9269437789916992, + "learning_rate": 0.000568130860171219, + "loss": 3.5092, + "step": 50870 + }, + { + "epoch": 3.4566517189835575, + "grad_norm": 1.0779725313186646, + "learning_rate": 0.0005680883951623862, + "loss": 3.5512, + "step": 50875 + }, + { + "epoch": 3.456991439054219, + "grad_norm": 0.8693379163742065, + "learning_rate": 0.0005680459301535535, + "loss": 3.5094, + "step": 50880 + }, + { + "epoch": 3.457331159124881, + "grad_norm": 1.3311654329299927, + "learning_rate": 0.0005680034651447208, + "loss": 3.4434, + "step": 50885 + }, + { + "epoch": 3.457670879195543, + "grad_norm": 5.903748989105225, + "learning_rate": 0.000567961000135888, + "loss": 3.6099, + "step": 50890 + }, + { + "epoch": 3.4580105992662045, + "grad_norm": 1.1150431632995605, + "learning_rate": 0.0005679185351270552, + "loss": 3.3, + "step": 50895 + }, + { + "epoch": 3.4583503193368665, + "grad_norm": 0.8862776756286621, + "learning_rate": 0.0005678760701182226, + "loss": 3.6774, + "step": 50900 + }, + { + "epoch": 3.458690039407528, + "grad_norm": 0.9499490261077881, + "learning_rate": 0.0005678336051093899, + "loss": 3.5499, + "step": 50905 + }, + { + "epoch": 3.45902975947819, + "grad_norm": 0.9868497252464294, + "learning_rate": 0.0005677911401005571, + "loss": 3.5385, + "step": 50910 + }, + { + "epoch": 3.459369479548852, + "grad_norm": 1.0802985429763794, + "learning_rate": 0.0005677486750917245, + "loss": 3.5174, + "step": 50915 + }, + { + "epoch": 3.4597091996195135, + "grad_norm": 1.0347508192062378, + "learning_rate": 0.0005677062100828917, + "loss": 3.407, + "step": 50920 + }, + { + "epoch": 3.460048919690175, + "grad_norm": 0.9104641675949097, + "learning_rate": 0.0005676637450740589, + "loss": 3.331, + "step": 50925 + }, + { + "epoch": 3.460388639760837, + "grad_norm": 0.8036583065986633, + "learning_rate": 0.0005676212800652263, + "loss": 3.2023, + "step": 50930 + }, + { + "epoch": 3.460728359831499, + "grad_norm": 1.018028974533081, + "learning_rate": 0.0005675788150563935, + "loss": 3.6782, + "step": 50935 + }, + { + "epoch": 3.4610680799021605, + "grad_norm": 0.9045596122741699, + "learning_rate": 0.0005675363500475608, + "loss": 3.4883, + "step": 50940 + }, + { + "epoch": 3.4614077999728226, + "grad_norm": 0.8529058694839478, + "learning_rate": 0.0005674938850387281, + "loss": 3.6554, + "step": 50945 + }, + { + "epoch": 3.461747520043484, + "grad_norm": 1.7191523313522339, + "learning_rate": 0.0005674514200298954, + "loss": 3.5242, + "step": 50950 + }, + { + "epoch": 3.462087240114146, + "grad_norm": 0.7875005006790161, + "learning_rate": 0.0005674089550210627, + "loss": 3.4972, + "step": 50955 + }, + { + "epoch": 3.462426960184808, + "grad_norm": 0.845041036605835, + "learning_rate": 0.0005673664900122299, + "loss": 3.7284, + "step": 50960 + }, + { + "epoch": 3.4627666802554695, + "grad_norm": 0.957655668258667, + "learning_rate": 0.0005673240250033972, + "loss": 3.5463, + "step": 50965 + }, + { + "epoch": 3.463106400326131, + "grad_norm": 0.9813191890716553, + "learning_rate": 0.0005672815599945645, + "loss": 3.4838, + "step": 50970 + }, + { + "epoch": 3.4634461203967932, + "grad_norm": 1.039358377456665, + "learning_rate": 0.0005672390949857318, + "loss": 3.6738, + "step": 50975 + }, + { + "epoch": 3.463785840467455, + "grad_norm": 0.9115439653396606, + "learning_rate": 0.000567196629976899, + "loss": 3.5311, + "step": 50980 + }, + { + "epoch": 3.4641255605381165, + "grad_norm": 0.8755999207496643, + "learning_rate": 0.0005671541649680664, + "loss": 3.5302, + "step": 50985 + }, + { + "epoch": 3.4644652806087786, + "grad_norm": 1.0544756650924683, + "learning_rate": 0.0005671116999592336, + "loss": 3.2692, + "step": 50990 + }, + { + "epoch": 3.46480500067944, + "grad_norm": 0.8575214743614197, + "learning_rate": 0.0005670692349504008, + "loss": 3.5661, + "step": 50995 + }, + { + "epoch": 3.465144720750102, + "grad_norm": 0.604101836681366, + "learning_rate": 0.0005670267699415682, + "loss": 3.7716, + "step": 51000 + }, + { + "epoch": 3.465484440820764, + "grad_norm": 0.6888284087181091, + "learning_rate": 0.0005669843049327354, + "loss": 3.8598, + "step": 51005 + }, + { + "epoch": 3.4658241608914255, + "grad_norm": 0.8463163375854492, + "learning_rate": 0.0005669418399239027, + "loss": 3.4823, + "step": 51010 + }, + { + "epoch": 3.466163880962087, + "grad_norm": 0.8960619568824768, + "learning_rate": 0.0005668993749150701, + "loss": 3.5427, + "step": 51015 + }, + { + "epoch": 3.4665036010327492, + "grad_norm": 1.1084091663360596, + "learning_rate": 0.0005668569099062373, + "loss": 3.4076, + "step": 51020 + }, + { + "epoch": 3.466843321103411, + "grad_norm": 1.0673884153366089, + "learning_rate": 0.0005668144448974045, + "loss": 3.546, + "step": 51025 + }, + { + "epoch": 3.4671830411740725, + "grad_norm": 0.7826205492019653, + "learning_rate": 0.0005667719798885719, + "loss": 3.5682, + "step": 51030 + }, + { + "epoch": 3.4675227612447346, + "grad_norm": 0.8013639450073242, + "learning_rate": 0.0005667295148797391, + "loss": 3.6554, + "step": 51035 + }, + { + "epoch": 3.467862481315396, + "grad_norm": 0.8751564621925354, + "learning_rate": 0.0005666870498709063, + "loss": 3.6221, + "step": 51040 + }, + { + "epoch": 3.468202201386058, + "grad_norm": 0.882834255695343, + "learning_rate": 0.0005666445848620737, + "loss": 3.4614, + "step": 51045 + }, + { + "epoch": 3.46854192145672, + "grad_norm": 0.8185749650001526, + "learning_rate": 0.000566602119853241, + "loss": 3.6049, + "step": 51050 + }, + { + "epoch": 3.4688816415273815, + "grad_norm": 0.8372458815574646, + "learning_rate": 0.0005665596548444082, + "loss": 3.7001, + "step": 51055 + }, + { + "epoch": 3.469221361598043, + "grad_norm": 1.0725535154342651, + "learning_rate": 0.0005665171898355755, + "loss": 3.3801, + "step": 51060 + }, + { + "epoch": 3.469561081668705, + "grad_norm": 0.758840799331665, + "learning_rate": 0.0005664747248267428, + "loss": 3.3705, + "step": 51065 + }, + { + "epoch": 3.469900801739367, + "grad_norm": 0.9141230583190918, + "learning_rate": 0.00056643225981791, + "loss": 3.382, + "step": 51070 + }, + { + "epoch": 3.4702405218100285, + "grad_norm": 0.8912198543548584, + "learning_rate": 0.0005663897948090774, + "loss": 3.342, + "step": 51075 + }, + { + "epoch": 3.47058024188069, + "grad_norm": 1.033087134361267, + "learning_rate": 0.0005663473298002447, + "loss": 3.5797, + "step": 51080 + }, + { + "epoch": 3.470919961951352, + "grad_norm": 0.9312952160835266, + "learning_rate": 0.0005663048647914119, + "loss": 3.5617, + "step": 51085 + }, + { + "epoch": 3.471259682022014, + "grad_norm": 0.9684900045394897, + "learning_rate": 0.0005662623997825792, + "loss": 3.823, + "step": 51090 + }, + { + "epoch": 3.4715994020926755, + "grad_norm": 0.6710322499275208, + "learning_rate": 0.0005662199347737464, + "loss": 3.4719, + "step": 51095 + }, + { + "epoch": 3.4719391221633376, + "grad_norm": 1.0847615003585815, + "learning_rate": 0.0005661774697649137, + "loss": 3.6091, + "step": 51100 + }, + { + "epoch": 3.472278842233999, + "grad_norm": 0.8329928517341614, + "learning_rate": 0.000566135004756081, + "loss": 3.3672, + "step": 51105 + }, + { + "epoch": 3.472618562304661, + "grad_norm": 0.7629515528678894, + "learning_rate": 0.0005660925397472483, + "loss": 3.475, + "step": 51110 + }, + { + "epoch": 3.472958282375323, + "grad_norm": 0.7892920970916748, + "learning_rate": 0.0005660500747384156, + "loss": 3.5439, + "step": 51115 + }, + { + "epoch": 3.4732980024459845, + "grad_norm": 0.9652289152145386, + "learning_rate": 0.0005660076097295829, + "loss": 3.598, + "step": 51120 + }, + { + "epoch": 3.473637722516646, + "grad_norm": 1.014975905418396, + "learning_rate": 0.0005659651447207501, + "loss": 3.5762, + "step": 51125 + }, + { + "epoch": 3.4739774425873082, + "grad_norm": 1.516170859336853, + "learning_rate": 0.0005659226797119173, + "loss": 3.5239, + "step": 51130 + }, + { + "epoch": 3.47431716265797, + "grad_norm": 0.789914608001709, + "learning_rate": 0.0005658802147030847, + "loss": 3.6145, + "step": 51135 + }, + { + "epoch": 3.4746568827286315, + "grad_norm": 1.1964117288589478, + "learning_rate": 0.0005658377496942519, + "loss": 3.4151, + "step": 51140 + }, + { + "epoch": 3.4749966027992936, + "grad_norm": 0.880392849445343, + "learning_rate": 0.0005657952846854192, + "loss": 3.5678, + "step": 51145 + }, + { + "epoch": 3.475336322869955, + "grad_norm": 0.727676510810852, + "learning_rate": 0.0005657528196765866, + "loss": 3.6841, + "step": 51150 + }, + { + "epoch": 3.475676042940617, + "grad_norm": 1.007419228553772, + "learning_rate": 0.0005657103546677538, + "loss": 3.3769, + "step": 51155 + }, + { + "epoch": 3.4760157630112785, + "grad_norm": 0.8023557662963867, + "learning_rate": 0.000565667889658921, + "loss": 3.6103, + "step": 51160 + }, + { + "epoch": 3.4763554830819405, + "grad_norm": 0.8693539500236511, + "learning_rate": 0.0005656254246500884, + "loss": 3.4057, + "step": 51165 + }, + { + "epoch": 3.476695203152602, + "grad_norm": 0.8002108931541443, + "learning_rate": 0.0005655829596412556, + "loss": 3.3246, + "step": 51170 + }, + { + "epoch": 3.477034923223264, + "grad_norm": 0.8079497814178467, + "learning_rate": 0.0005655404946324228, + "loss": 3.2762, + "step": 51175 + }, + { + "epoch": 3.477374643293926, + "grad_norm": 1.1020482778549194, + "learning_rate": 0.0005654980296235903, + "loss": 3.6542, + "step": 51180 + }, + { + "epoch": 3.4777143633645875, + "grad_norm": 0.9112440943717957, + "learning_rate": 0.0005654555646147575, + "loss": 3.6557, + "step": 51185 + }, + { + "epoch": 3.478054083435249, + "grad_norm": 0.6502084136009216, + "learning_rate": 0.0005654130996059247, + "loss": 3.5811, + "step": 51190 + }, + { + "epoch": 3.478393803505911, + "grad_norm": 0.8804580569267273, + "learning_rate": 0.000565370634597092, + "loss": 3.5532, + "step": 51195 + }, + { + "epoch": 3.478733523576573, + "grad_norm": 0.7863538265228271, + "learning_rate": 0.0005653281695882593, + "loss": 3.4197, + "step": 51200 + }, + { + "epoch": 3.4790732436472345, + "grad_norm": 0.6890247464179993, + "learning_rate": 0.0005652857045794265, + "loss": 3.5283, + "step": 51205 + }, + { + "epoch": 3.4794129637178965, + "grad_norm": 1.0168797969818115, + "learning_rate": 0.0005652432395705938, + "loss": 3.2183, + "step": 51210 + }, + { + "epoch": 3.479752683788558, + "grad_norm": 0.8578886985778809, + "learning_rate": 0.0005652007745617612, + "loss": 3.78, + "step": 51215 + }, + { + "epoch": 3.48009240385922, + "grad_norm": 1.0400383472442627, + "learning_rate": 0.0005651583095529284, + "loss": 3.575, + "step": 51220 + }, + { + "epoch": 3.480432123929882, + "grad_norm": 1.797033667564392, + "learning_rate": 0.0005651158445440957, + "loss": 3.5556, + "step": 51225 + }, + { + "epoch": 3.4807718440005435, + "grad_norm": 1.1042301654815674, + "learning_rate": 0.000565073379535263, + "loss": 3.5476, + "step": 51230 + }, + { + "epoch": 3.481111564071205, + "grad_norm": 1.009182333946228, + "learning_rate": 0.0005650309145264302, + "loss": 3.6687, + "step": 51235 + }, + { + "epoch": 3.481451284141867, + "grad_norm": 0.8819342851638794, + "learning_rate": 0.0005649884495175975, + "loss": 3.4792, + "step": 51240 + }, + { + "epoch": 3.481791004212529, + "grad_norm": 0.907322347164154, + "learning_rate": 0.0005649459845087647, + "loss": 3.3669, + "step": 51245 + }, + { + "epoch": 3.4821307242831905, + "grad_norm": 0.9387765526771545, + "learning_rate": 0.0005649035194999321, + "loss": 3.3823, + "step": 51250 + }, + { + "epoch": 3.4824704443538526, + "grad_norm": 0.747371256351471, + "learning_rate": 0.0005648610544910994, + "loss": 3.7351, + "step": 51255 + }, + { + "epoch": 3.482810164424514, + "grad_norm": 2.4423608779907227, + "learning_rate": 0.0005648185894822666, + "loss": 3.5394, + "step": 51260 + }, + { + "epoch": 3.483149884495176, + "grad_norm": 0.8741277456283569, + "learning_rate": 0.0005647761244734339, + "loss": 3.5208, + "step": 51265 + }, + { + "epoch": 3.483489604565838, + "grad_norm": 0.8195372223854065, + "learning_rate": 0.0005647336594646012, + "loss": 3.8152, + "step": 51270 + }, + { + "epoch": 3.4838293246364995, + "grad_norm": 0.961691677570343, + "learning_rate": 0.0005646911944557684, + "loss": 3.4134, + "step": 51275 + }, + { + "epoch": 3.484169044707161, + "grad_norm": 1.3356719017028809, + "learning_rate": 0.0005646487294469356, + "loss": 3.6778, + "step": 51280 + }, + { + "epoch": 3.4845087647778232, + "grad_norm": 0.7898390889167786, + "learning_rate": 0.0005646062644381031, + "loss": 3.4208, + "step": 51285 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 1.15496826171875, + "learning_rate": 0.0005645637994292703, + "loss": 3.4504, + "step": 51290 + }, + { + "epoch": 3.4851882049191465, + "grad_norm": 0.9502716064453125, + "learning_rate": 0.0005645213344204376, + "loss": 3.4544, + "step": 51295 + }, + { + "epoch": 3.4855279249898086, + "grad_norm": 0.8165900111198425, + "learning_rate": 0.0005644788694116049, + "loss": 3.4463, + "step": 51300 + }, + { + "epoch": 3.48586764506047, + "grad_norm": 0.8395061492919922, + "learning_rate": 0.0005644364044027721, + "loss": 3.7898, + "step": 51305 + }, + { + "epoch": 3.486207365131132, + "grad_norm": 0.8682558536529541, + "learning_rate": 0.0005643939393939394, + "loss": 3.3702, + "step": 51310 + }, + { + "epoch": 3.486547085201794, + "grad_norm": 0.9322106838226318, + "learning_rate": 0.0005643514743851067, + "loss": 3.6277, + "step": 51315 + }, + { + "epoch": 3.4868868052724555, + "grad_norm": 0.8170090317726135, + "learning_rate": 0.000564309009376274, + "loss": 3.2052, + "step": 51320 + }, + { + "epoch": 3.487226525343117, + "grad_norm": 0.916042149066925, + "learning_rate": 0.0005642665443674413, + "loss": 3.5877, + "step": 51325 + }, + { + "epoch": 3.4875662454137792, + "grad_norm": 0.861544132232666, + "learning_rate": 0.0005642240793586085, + "loss": 3.6817, + "step": 51330 + }, + { + "epoch": 3.487905965484441, + "grad_norm": 0.8182805776596069, + "learning_rate": 0.0005641816143497758, + "loss": 3.6409, + "step": 51335 + }, + { + "epoch": 3.4882456855551025, + "grad_norm": 0.8990697264671326, + "learning_rate": 0.0005641391493409431, + "loss": 3.5427, + "step": 51340 + }, + { + "epoch": 3.4885854056257646, + "grad_norm": 0.7797118425369263, + "learning_rate": 0.0005640966843321103, + "loss": 3.727, + "step": 51345 + }, + { + "epoch": 3.488925125696426, + "grad_norm": 0.7812651991844177, + "learning_rate": 0.0005640542193232776, + "loss": 3.5985, + "step": 51350 + }, + { + "epoch": 3.489264845767088, + "grad_norm": 0.7565735578536987, + "learning_rate": 0.000564011754314445, + "loss": 3.4443, + "step": 51355 + }, + { + "epoch": 3.48960456583775, + "grad_norm": 0.9053610563278198, + "learning_rate": 0.0005639692893056122, + "loss": 3.316, + "step": 51360 + }, + { + "epoch": 3.4899442859084115, + "grad_norm": 0.8683055639266968, + "learning_rate": 0.0005639268242967795, + "loss": 3.5647, + "step": 51365 + }, + { + "epoch": 3.490284005979073, + "grad_norm": 0.8806053400039673, + "learning_rate": 0.0005638843592879468, + "loss": 3.5878, + "step": 51370 + }, + { + "epoch": 3.4906237260497353, + "grad_norm": 0.7587064504623413, + "learning_rate": 0.000563841894279114, + "loss": 3.6573, + "step": 51375 + }, + { + "epoch": 3.490963446120397, + "grad_norm": 0.8001023530960083, + "learning_rate": 0.0005637994292702812, + "loss": 3.5776, + "step": 51380 + }, + { + "epoch": 3.4913031661910585, + "grad_norm": 0.8363889455795288, + "learning_rate": 0.0005637569642614486, + "loss": 3.5178, + "step": 51385 + }, + { + "epoch": 3.4916428862617206, + "grad_norm": 1.4014264345169067, + "learning_rate": 0.0005637144992526159, + "loss": 3.3244, + "step": 51390 + }, + { + "epoch": 3.491982606332382, + "grad_norm": 0.7827888131141663, + "learning_rate": 0.0005636720342437831, + "loss": 3.6116, + "step": 51395 + }, + { + "epoch": 3.492322326403044, + "grad_norm": 1.2634133100509644, + "learning_rate": 0.0005636295692349505, + "loss": 3.5615, + "step": 51400 + }, + { + "epoch": 3.4926620464737055, + "grad_norm": 0.9634220004081726, + "learning_rate": 0.0005635871042261177, + "loss": 3.5066, + "step": 51405 + }, + { + "epoch": 3.4930017665443676, + "grad_norm": 1.126098871231079, + "learning_rate": 0.0005635446392172849, + "loss": 3.4837, + "step": 51410 + }, + { + "epoch": 3.493341486615029, + "grad_norm": 0.9537572860717773, + "learning_rate": 0.0005635021742084523, + "loss": 3.3501, + "step": 51415 + }, + { + "epoch": 3.493681206685691, + "grad_norm": 0.9149693846702576, + "learning_rate": 0.0005634597091996195, + "loss": 3.9425, + "step": 51420 + }, + { + "epoch": 3.494020926756353, + "grad_norm": 0.8552558422088623, + "learning_rate": 0.0005634172441907868, + "loss": 3.5748, + "step": 51425 + }, + { + "epoch": 3.4943606468270145, + "grad_norm": 0.8688321113586426, + "learning_rate": 0.0005633747791819542, + "loss": 3.5799, + "step": 51430 + }, + { + "epoch": 3.494700366897676, + "grad_norm": 0.7103790640830994, + "learning_rate": 0.0005633323141731214, + "loss": 3.7496, + "step": 51435 + }, + { + "epoch": 3.4950400869683382, + "grad_norm": 0.9047719836235046, + "learning_rate": 0.0005632898491642886, + "loss": 3.4031, + "step": 51440 + }, + { + "epoch": 3.495379807039, + "grad_norm": 0.8459380269050598, + "learning_rate": 0.0005632473841554559, + "loss": 3.4656, + "step": 51445 + }, + { + "epoch": 3.4957195271096615, + "grad_norm": 1.1212704181671143, + "learning_rate": 0.0005632049191466232, + "loss": 3.355, + "step": 51450 + }, + { + "epoch": 3.4960592471803236, + "grad_norm": 1.2283480167388916, + "learning_rate": 0.0005631624541377904, + "loss": 3.5227, + "step": 51455 + }, + { + "epoch": 3.496398967250985, + "grad_norm": 0.7555848956108093, + "learning_rate": 0.0005631199891289578, + "loss": 3.6443, + "step": 51460 + }, + { + "epoch": 3.496738687321647, + "grad_norm": 0.8231528401374817, + "learning_rate": 0.0005630775241201251, + "loss": 3.42, + "step": 51465 + }, + { + "epoch": 3.497078407392309, + "grad_norm": 0.839104413986206, + "learning_rate": 0.0005630350591112923, + "loss": 3.5426, + "step": 51470 + }, + { + "epoch": 3.4974181274629705, + "grad_norm": 0.9389153122901917, + "learning_rate": 0.0005629925941024596, + "loss": 3.3485, + "step": 51475 + }, + { + "epoch": 3.497757847533632, + "grad_norm": 0.7654488682746887, + "learning_rate": 0.0005629501290936268, + "loss": 3.6723, + "step": 51480 + }, + { + "epoch": 3.4980975676042942, + "grad_norm": 0.8114607930183411, + "learning_rate": 0.0005629076640847941, + "loss": 3.3659, + "step": 51485 + }, + { + "epoch": 3.498437287674956, + "grad_norm": 0.8321049809455872, + "learning_rate": 0.0005628651990759614, + "loss": 3.4881, + "step": 51490 + }, + { + "epoch": 3.4987770077456175, + "grad_norm": 0.935689389705658, + "learning_rate": 0.0005628227340671287, + "loss": 3.512, + "step": 51495 + }, + { + "epoch": 3.499116727816279, + "grad_norm": 0.7740063667297363, + "learning_rate": 0.000562780269058296, + "loss": 3.3381, + "step": 51500 + }, + { + "epoch": 3.499456447886941, + "grad_norm": 1.0268219709396362, + "learning_rate": 0.0005627378040494633, + "loss": 3.4644, + "step": 51505 + }, + { + "epoch": 3.499796167957603, + "grad_norm": 0.8976519107818604, + "learning_rate": 0.0005626953390406305, + "loss": 3.7193, + "step": 51510 + }, + { + "epoch": 3.5001358880282645, + "grad_norm": 0.9955483675003052, + "learning_rate": 0.0005626528740317977, + "loss": 3.465, + "step": 51515 + }, + { + "epoch": 3.5004756080989265, + "grad_norm": 0.8242892622947693, + "learning_rate": 0.0005626104090229651, + "loss": 3.2576, + "step": 51520 + }, + { + "epoch": 3.500815328169588, + "grad_norm": 0.7883636355400085, + "learning_rate": 0.0005625679440141323, + "loss": 3.3666, + "step": 51525 + }, + { + "epoch": 3.50115504824025, + "grad_norm": 0.8102698922157288, + "learning_rate": 0.0005625254790052996, + "loss": 3.5833, + "step": 51530 + }, + { + "epoch": 3.501494768310912, + "grad_norm": 0.7935492992401123, + "learning_rate": 0.000562483013996467, + "loss": 3.2241, + "step": 51535 + }, + { + "epoch": 3.5018344883815735, + "grad_norm": 0.8349325060844421, + "learning_rate": 0.0005624405489876342, + "loss": 3.5291, + "step": 51540 + }, + { + "epoch": 3.502174208452235, + "grad_norm": 1.8599059581756592, + "learning_rate": 0.0005623980839788014, + "loss": 3.4669, + "step": 51545 + }, + { + "epoch": 3.5025139285228972, + "grad_norm": 0.8696405291557312, + "learning_rate": 0.0005623556189699688, + "loss": 3.5286, + "step": 51550 + }, + { + "epoch": 3.502853648593559, + "grad_norm": 0.8819985389709473, + "learning_rate": 0.000562313153961136, + "loss": 3.5845, + "step": 51555 + }, + { + "epoch": 3.5031933686642205, + "grad_norm": 0.7555956244468689, + "learning_rate": 0.0005622706889523032, + "loss": 3.3814, + "step": 51560 + }, + { + "epoch": 3.5035330887348826, + "grad_norm": 0.7913928627967834, + "learning_rate": 0.0005622282239434707, + "loss": 3.6169, + "step": 51565 + }, + { + "epoch": 3.503872808805544, + "grad_norm": 0.8123531341552734, + "learning_rate": 0.0005621857589346379, + "loss": 3.2591, + "step": 51570 + }, + { + "epoch": 3.504212528876206, + "grad_norm": 1.0844898223876953, + "learning_rate": 0.0005621432939258051, + "loss": 3.7891, + "step": 51575 + }, + { + "epoch": 3.504552248946868, + "grad_norm": 0.8885155916213989, + "learning_rate": 0.0005621008289169724, + "loss": 3.4791, + "step": 51580 + }, + { + "epoch": 3.5048919690175295, + "grad_norm": 0.7610024809837341, + "learning_rate": 0.0005620583639081397, + "loss": 3.4999, + "step": 51585 + }, + { + "epoch": 3.505231689088191, + "grad_norm": 0.8304603099822998, + "learning_rate": 0.0005620158988993069, + "loss": 3.3848, + "step": 51590 + }, + { + "epoch": 3.5055714091588532, + "grad_norm": 0.9676072597503662, + "learning_rate": 0.0005619734338904742, + "loss": 3.8215, + "step": 51595 + }, + { + "epoch": 3.505911129229515, + "grad_norm": 0.8626194000244141, + "learning_rate": 0.0005619309688816416, + "loss": 3.2148, + "step": 51600 + }, + { + "epoch": 3.5062508493001765, + "grad_norm": 0.8706783056259155, + "learning_rate": 0.0005618885038728088, + "loss": 3.1729, + "step": 51605 + }, + { + "epoch": 3.5065905693708386, + "grad_norm": 0.919647753238678, + "learning_rate": 0.0005618460388639761, + "loss": 3.2977, + "step": 51610 + }, + { + "epoch": 3.5069302894415, + "grad_norm": 0.8039460778236389, + "learning_rate": 0.0005618035738551434, + "loss": 3.2331, + "step": 51615 + }, + { + "epoch": 3.507270009512162, + "grad_norm": 0.731069803237915, + "learning_rate": 0.0005617611088463106, + "loss": 3.6979, + "step": 51620 + }, + { + "epoch": 3.507609729582824, + "grad_norm": 1.0288151502609253, + "learning_rate": 0.0005617186438374779, + "loss": 3.8642, + "step": 51625 + }, + { + "epoch": 3.5079494496534855, + "grad_norm": 0.8793969750404358, + "learning_rate": 0.0005616761788286451, + "loss": 3.5511, + "step": 51630 + }, + { + "epoch": 3.508289169724147, + "grad_norm": 0.8325799703598022, + "learning_rate": 0.0005616337138198125, + "loss": 3.4417, + "step": 51635 + }, + { + "epoch": 3.5086288897948092, + "grad_norm": 1.0473800897598267, + "learning_rate": 0.0005615912488109798, + "loss": 3.7136, + "step": 51640 + }, + { + "epoch": 3.508968609865471, + "grad_norm": 0.8606711030006409, + "learning_rate": 0.000561548783802147, + "loss": 3.638, + "step": 51645 + }, + { + "epoch": 3.5093083299361325, + "grad_norm": 1.3508104085922241, + "learning_rate": 0.0005615063187933144, + "loss": 3.7374, + "step": 51650 + }, + { + "epoch": 3.5096480500067946, + "grad_norm": 1.7020424604415894, + "learning_rate": 0.0005614638537844816, + "loss": 3.341, + "step": 51655 + }, + { + "epoch": 3.509987770077456, + "grad_norm": 0.7627628445625305, + "learning_rate": 0.0005614213887756488, + "loss": 3.4586, + "step": 51660 + }, + { + "epoch": 3.510327490148118, + "grad_norm": 0.7467352747917175, + "learning_rate": 0.0005613789237668163, + "loss": 3.3912, + "step": 51665 + }, + { + "epoch": 3.51066721021878, + "grad_norm": 0.8230012655258179, + "learning_rate": 0.0005613364587579835, + "loss": 3.5814, + "step": 51670 + }, + { + "epoch": 3.5110069302894416, + "grad_norm": 0.8982395529747009, + "learning_rate": 0.0005612939937491507, + "loss": 3.5438, + "step": 51675 + }, + { + "epoch": 3.511346650360103, + "grad_norm": 0.8082804083824158, + "learning_rate": 0.000561251528740318, + "loss": 3.4206, + "step": 51680 + }, + { + "epoch": 3.5116863704307653, + "grad_norm": 0.650678813457489, + "learning_rate": 0.0005612090637314853, + "loss": 3.5286, + "step": 51685 + }, + { + "epoch": 3.512026090501427, + "grad_norm": 0.8324115872383118, + "learning_rate": 0.0005611665987226525, + "loss": 3.3523, + "step": 51690 + }, + { + "epoch": 3.5123658105720885, + "grad_norm": 0.8410704135894775, + "learning_rate": 0.0005611241337138198, + "loss": 3.7392, + "step": 51695 + }, + { + "epoch": 3.5127055306427506, + "grad_norm": 1.00032639503479, + "learning_rate": 0.0005610816687049872, + "loss": 3.6985, + "step": 51700 + }, + { + "epoch": 3.5130452507134122, + "grad_norm": 0.8709708452224731, + "learning_rate": 0.0005610392036961544, + "loss": 3.3597, + "step": 51705 + }, + { + "epoch": 3.513384970784074, + "grad_norm": 0.7466675639152527, + "learning_rate": 0.0005609967386873217, + "loss": 3.6171, + "step": 51710 + }, + { + "epoch": 3.513724690854736, + "grad_norm": 0.8336014151573181, + "learning_rate": 0.000560954273678489, + "loss": 3.3167, + "step": 51715 + }, + { + "epoch": 3.5140644109253976, + "grad_norm": 0.8941938877105713, + "learning_rate": 0.0005609118086696562, + "loss": 3.6009, + "step": 51720 + }, + { + "epoch": 3.514404130996059, + "grad_norm": 0.7302713394165039, + "learning_rate": 0.0005608693436608235, + "loss": 3.3494, + "step": 51725 + }, + { + "epoch": 3.5147438510667213, + "grad_norm": 0.9202880263328552, + "learning_rate": 0.0005608268786519907, + "loss": 3.28, + "step": 51730 + }, + { + "epoch": 3.515083571137383, + "grad_norm": 0.7894494533538818, + "learning_rate": 0.0005607844136431581, + "loss": 3.4501, + "step": 51735 + }, + { + "epoch": 3.5154232912080445, + "grad_norm": 0.9487261772155762, + "learning_rate": 0.0005607419486343254, + "loss": 3.5224, + "step": 51740 + }, + { + "epoch": 3.5157630112787066, + "grad_norm": 1.2309153079986572, + "learning_rate": 0.0005606994836254926, + "loss": 3.7142, + "step": 51745 + }, + { + "epoch": 3.5161027313493682, + "grad_norm": 0.9234302043914795, + "learning_rate": 0.0005606570186166599, + "loss": 3.8437, + "step": 51750 + }, + { + "epoch": 3.51644245142003, + "grad_norm": 0.7856410145759583, + "learning_rate": 0.0005606145536078272, + "loss": 3.5451, + "step": 51755 + }, + { + "epoch": 3.516782171490692, + "grad_norm": 0.8589171171188354, + "learning_rate": 0.0005605720885989944, + "loss": 3.5181, + "step": 51760 + }, + { + "epoch": 3.5171218915613536, + "grad_norm": 0.8116687536239624, + "learning_rate": 0.0005605296235901616, + "loss": 3.4361, + "step": 51765 + }, + { + "epoch": 3.517461611632015, + "grad_norm": 0.931008517742157, + "learning_rate": 0.0005604871585813291, + "loss": 3.5574, + "step": 51770 + }, + { + "epoch": 3.517801331702677, + "grad_norm": 0.9329083561897278, + "learning_rate": 0.0005604446935724963, + "loss": 3.4658, + "step": 51775 + }, + { + "epoch": 3.518141051773339, + "grad_norm": 0.9476087093353271, + "learning_rate": 0.0005604022285636635, + "loss": 3.5935, + "step": 51780 + }, + { + "epoch": 3.5184807718440005, + "grad_norm": 1.075653314590454, + "learning_rate": 0.0005603597635548309, + "loss": 3.3666, + "step": 51785 + }, + { + "epoch": 3.518820491914662, + "grad_norm": 0.9022022485733032, + "learning_rate": 0.0005603172985459981, + "loss": 3.7101, + "step": 51790 + }, + { + "epoch": 3.5191602119853242, + "grad_norm": 1.0623139142990112, + "learning_rate": 0.0005602748335371653, + "loss": 3.3213, + "step": 51795 + }, + { + "epoch": 3.519499932055986, + "grad_norm": 1.3991012573242188, + "learning_rate": 0.0005602323685283327, + "loss": 3.3801, + "step": 51800 + }, + { + "epoch": 3.5198396521266475, + "grad_norm": 0.7965555191040039, + "learning_rate": 0.0005601899035195, + "loss": 3.7207, + "step": 51805 + }, + { + "epoch": 3.520179372197309, + "grad_norm": 0.7549816370010376, + "learning_rate": 0.0005601474385106672, + "loss": 3.8208, + "step": 51810 + }, + { + "epoch": 3.520519092267971, + "grad_norm": 0.8849869966506958, + "learning_rate": 0.0005601049735018346, + "loss": 3.5525, + "step": 51815 + }, + { + "epoch": 3.520858812338633, + "grad_norm": 0.8664124011993408, + "learning_rate": 0.0005600625084930018, + "loss": 3.287, + "step": 51820 + }, + { + "epoch": 3.5211985324092945, + "grad_norm": 0.7991177439689636, + "learning_rate": 0.000560020043484169, + "loss": 3.4111, + "step": 51825 + }, + { + "epoch": 3.5215382524799566, + "grad_norm": 0.8982857465744019, + "learning_rate": 0.0005599775784753363, + "loss": 3.5812, + "step": 51830 + }, + { + "epoch": 3.521877972550618, + "grad_norm": 0.9427811503410339, + "learning_rate": 0.0005599351134665036, + "loss": 3.6777, + "step": 51835 + }, + { + "epoch": 3.52221769262128, + "grad_norm": 1.0985770225524902, + "learning_rate": 0.0005598926484576709, + "loss": 3.2933, + "step": 51840 + }, + { + "epoch": 3.522557412691942, + "grad_norm": 0.9055580496788025, + "learning_rate": 0.0005598501834488382, + "loss": 3.4596, + "step": 51845 + }, + { + "epoch": 3.5228971327626035, + "grad_norm": 0.9566438794136047, + "learning_rate": 0.0005598077184400055, + "loss": 3.5315, + "step": 51850 + }, + { + "epoch": 3.523236852833265, + "grad_norm": 0.8612704873085022, + "learning_rate": 0.0005597652534311727, + "loss": 3.4531, + "step": 51855 + }, + { + "epoch": 3.5235765729039272, + "grad_norm": 0.9406459927558899, + "learning_rate": 0.00055972278842234, + "loss": 3.6065, + "step": 51860 + }, + { + "epoch": 3.523916292974589, + "grad_norm": 0.8984143733978271, + "learning_rate": 0.0005596803234135072, + "loss": 3.5008, + "step": 51865 + }, + { + "epoch": 3.5242560130452505, + "grad_norm": 0.802166759967804, + "learning_rate": 0.0005596378584046745, + "loss": 3.3689, + "step": 51870 + }, + { + "epoch": 3.5245957331159126, + "grad_norm": 0.8040323853492737, + "learning_rate": 0.0005595953933958419, + "loss": 3.6252, + "step": 51875 + }, + { + "epoch": 3.524935453186574, + "grad_norm": 0.9943200349807739, + "learning_rate": 0.0005595529283870091, + "loss": 3.2406, + "step": 51880 + }, + { + "epoch": 3.525275173257236, + "grad_norm": 0.7924588322639465, + "learning_rate": 0.0005595104633781764, + "loss": 3.29, + "step": 51885 + }, + { + "epoch": 3.525614893327898, + "grad_norm": 0.7379409670829773, + "learning_rate": 0.0005594679983693437, + "loss": 3.6547, + "step": 51890 + }, + { + "epoch": 3.5259546133985595, + "grad_norm": 0.8332077264785767, + "learning_rate": 0.0005594255333605109, + "loss": 3.472, + "step": 51895 + }, + { + "epoch": 3.526294333469221, + "grad_norm": 0.968741774559021, + "learning_rate": 0.0005593830683516782, + "loss": 3.5338, + "step": 51900 + }, + { + "epoch": 3.5266340535398832, + "grad_norm": 1.03218674659729, + "learning_rate": 0.0005593406033428455, + "loss": 3.5872, + "step": 51905 + }, + { + "epoch": 3.526973773610545, + "grad_norm": 1.1642073392868042, + "learning_rate": 0.0005592981383340128, + "loss": 3.5455, + "step": 51910 + }, + { + "epoch": 3.5273134936812065, + "grad_norm": 1.0267119407653809, + "learning_rate": 0.00055925567332518, + "loss": 3.6403, + "step": 51915 + }, + { + "epoch": 3.5276532137518686, + "grad_norm": 0.8016071319580078, + "learning_rate": 0.0005592132083163474, + "loss": 3.2719, + "step": 51920 + }, + { + "epoch": 3.52799293382253, + "grad_norm": 0.8957409858703613, + "learning_rate": 0.0005591707433075146, + "loss": 3.5466, + "step": 51925 + }, + { + "epoch": 3.528332653893192, + "grad_norm": 0.737649142742157, + "learning_rate": 0.0005591282782986818, + "loss": 3.7329, + "step": 51930 + }, + { + "epoch": 3.528672373963854, + "grad_norm": 0.8677420616149902, + "learning_rate": 0.0005590858132898492, + "loss": 3.4252, + "step": 51935 + }, + { + "epoch": 3.5290120940345155, + "grad_norm": 1.0586079359054565, + "learning_rate": 0.0005590433482810164, + "loss": 3.4801, + "step": 51940 + }, + { + "epoch": 3.529351814105177, + "grad_norm": 0.9931183457374573, + "learning_rate": 0.0005590008832721837, + "loss": 3.3956, + "step": 51945 + }, + { + "epoch": 3.5296915341758393, + "grad_norm": 0.8678961992263794, + "learning_rate": 0.0005589584182633511, + "loss": 3.401, + "step": 51950 + }, + { + "epoch": 3.530031254246501, + "grad_norm": 1.0297869443893433, + "learning_rate": 0.0005589159532545183, + "loss": 3.4634, + "step": 51955 + }, + { + "epoch": 3.5303709743171625, + "grad_norm": 1.0277563333511353, + "learning_rate": 0.0005588734882456855, + "loss": 3.534, + "step": 51960 + }, + { + "epoch": 3.5307106943878246, + "grad_norm": 0.9229379296302795, + "learning_rate": 0.0005588310232368528, + "loss": 3.4485, + "step": 51965 + }, + { + "epoch": 3.531050414458486, + "grad_norm": 0.732833981513977, + "learning_rate": 0.0005587885582280201, + "loss": 3.6714, + "step": 51970 + }, + { + "epoch": 3.531390134529148, + "grad_norm": 0.8177512884140015, + "learning_rate": 0.0005587460932191873, + "loss": 3.1757, + "step": 51975 + }, + { + "epoch": 3.53172985459981, + "grad_norm": 0.9070915579795837, + "learning_rate": 0.0005587036282103547, + "loss": 3.2879, + "step": 51980 + }, + { + "epoch": 3.5320695746704716, + "grad_norm": 0.9442996978759766, + "learning_rate": 0.000558661163201522, + "loss": 3.6729, + "step": 51985 + }, + { + "epoch": 3.532409294741133, + "grad_norm": 0.816365659236908, + "learning_rate": 0.0005586186981926893, + "loss": 3.5309, + "step": 51990 + }, + { + "epoch": 3.5327490148117953, + "grad_norm": 1.1017876863479614, + "learning_rate": 0.0005585762331838565, + "loss": 3.6128, + "step": 51995 + }, + { + "epoch": 3.533088734882457, + "grad_norm": 1.509099006652832, + "learning_rate": 0.0005585337681750238, + "loss": 3.6098, + "step": 52000 + }, + { + "epoch": 3.5334284549531185, + "grad_norm": 1.5052326917648315, + "learning_rate": 0.0005584913031661911, + "loss": 3.8518, + "step": 52005 + }, + { + "epoch": 3.5337681750237806, + "grad_norm": 0.7808403372764587, + "learning_rate": 0.0005584488381573583, + "loss": 3.2555, + "step": 52010 + }, + { + "epoch": 3.5341078950944422, + "grad_norm": 0.9303004145622253, + "learning_rate": 0.0005584063731485256, + "loss": 3.5612, + "step": 52015 + }, + { + "epoch": 3.534447615165104, + "grad_norm": 0.8947545886039734, + "learning_rate": 0.000558363908139693, + "loss": 3.4668, + "step": 52020 + }, + { + "epoch": 3.534787335235766, + "grad_norm": 0.7378798723220825, + "learning_rate": 0.0005583214431308602, + "loss": 3.5788, + "step": 52025 + }, + { + "epoch": 3.5351270553064276, + "grad_norm": 0.8089230060577393, + "learning_rate": 0.0005582789781220274, + "loss": 3.7478, + "step": 52030 + }, + { + "epoch": 3.535466775377089, + "grad_norm": 1.0210524797439575, + "learning_rate": 0.0005582365131131948, + "loss": 3.7034, + "step": 52035 + }, + { + "epoch": 3.5358064954477513, + "grad_norm": 1.05917489528656, + "learning_rate": 0.000558194048104362, + "loss": 3.4976, + "step": 52040 + }, + { + "epoch": 3.536146215518413, + "grad_norm": 1.0013574361801147, + "learning_rate": 0.0005581515830955292, + "loss": 3.6103, + "step": 52045 + }, + { + "epoch": 3.5364859355890745, + "grad_norm": 0.8525996804237366, + "learning_rate": 0.0005581091180866967, + "loss": 3.5833, + "step": 52050 + }, + { + "epoch": 3.5368256556597366, + "grad_norm": 1.0691851377487183, + "learning_rate": 0.0005580666530778639, + "loss": 3.683, + "step": 52055 + }, + { + "epoch": 3.5371653757303982, + "grad_norm": 0.9284354448318481, + "learning_rate": 0.0005580241880690311, + "loss": 3.4687, + "step": 52060 + }, + { + "epoch": 3.53750509580106, + "grad_norm": 0.8928323984146118, + "learning_rate": 0.0005579817230601984, + "loss": 3.4257, + "step": 52065 + }, + { + "epoch": 3.537844815871722, + "grad_norm": 0.983173668384552, + "learning_rate": 0.0005579392580513657, + "loss": 3.3407, + "step": 52070 + }, + { + "epoch": 3.5381845359423836, + "grad_norm": 0.8226133584976196, + "learning_rate": 0.0005578967930425329, + "loss": 3.6356, + "step": 52075 + }, + { + "epoch": 3.538524256013045, + "grad_norm": 0.8250253200531006, + "learning_rate": 0.0005578543280337002, + "loss": 3.5962, + "step": 52080 + }, + { + "epoch": 3.5388639760837073, + "grad_norm": 1.6067519187927246, + "learning_rate": 0.0005578118630248676, + "loss": 3.3831, + "step": 52085 + }, + { + "epoch": 3.539203696154369, + "grad_norm": 1.035300612449646, + "learning_rate": 0.0005577693980160348, + "loss": 3.3823, + "step": 52090 + }, + { + "epoch": 3.5395434162250305, + "grad_norm": 1.0241998434066772, + "learning_rate": 0.0005577269330072021, + "loss": 3.8013, + "step": 52095 + }, + { + "epoch": 3.5398831362956926, + "grad_norm": 0.8283922076225281, + "learning_rate": 0.0005576844679983694, + "loss": 3.4946, + "step": 52100 + }, + { + "epoch": 3.5402228563663543, + "grad_norm": 0.6295567154884338, + "learning_rate": 0.0005576420029895366, + "loss": 3.7171, + "step": 52105 + }, + { + "epoch": 3.540562576437016, + "grad_norm": 0.7412692308425903, + "learning_rate": 0.0005575995379807039, + "loss": 3.2784, + "step": 52110 + }, + { + "epoch": 3.5409022965076775, + "grad_norm": 0.9922723770141602, + "learning_rate": 0.0005575570729718711, + "loss": 3.7437, + "step": 52115 + }, + { + "epoch": 3.5412420165783396, + "grad_norm": 0.8637376427650452, + "learning_rate": 0.0005575146079630385, + "loss": 3.3978, + "step": 52120 + }, + { + "epoch": 3.541581736649001, + "grad_norm": 0.8419885635375977, + "learning_rate": 0.0005574721429542058, + "loss": 3.4135, + "step": 52125 + }, + { + "epoch": 3.541921456719663, + "grad_norm": 0.9797675013542175, + "learning_rate": 0.000557429677945373, + "loss": 3.5897, + "step": 52130 + }, + { + "epoch": 3.542261176790325, + "grad_norm": 1.0099424123764038, + "learning_rate": 0.0005573872129365403, + "loss": 3.6205, + "step": 52135 + }, + { + "epoch": 3.5426008968609866, + "grad_norm": 0.8967818021774292, + "learning_rate": 0.0005573447479277076, + "loss": 3.4489, + "step": 52140 + }, + { + "epoch": 3.542940616931648, + "grad_norm": 0.8909847140312195, + "learning_rate": 0.0005573022829188748, + "loss": 3.5638, + "step": 52145 + }, + { + "epoch": 3.54328033700231, + "grad_norm": 0.70592200756073, + "learning_rate": 0.000557259817910042, + "loss": 3.4604, + "step": 52150 + }, + { + "epoch": 3.543620057072972, + "grad_norm": 0.9096888899803162, + "learning_rate": 0.0005572173529012095, + "loss": 3.6946, + "step": 52155 + }, + { + "epoch": 3.5439597771436335, + "grad_norm": 1.0310910940170288, + "learning_rate": 0.0005571748878923767, + "loss": 3.6754, + "step": 52160 + }, + { + "epoch": 3.544299497214295, + "grad_norm": 0.9313104748725891, + "learning_rate": 0.0005571324228835439, + "loss": 3.4924, + "step": 52165 + }, + { + "epoch": 3.5446392172849572, + "grad_norm": 0.835337221622467, + "learning_rate": 0.0005570899578747113, + "loss": 3.4155, + "step": 52170 + }, + { + "epoch": 3.544978937355619, + "grad_norm": 0.8809797167778015, + "learning_rate": 0.0005570474928658785, + "loss": 3.3225, + "step": 52175 + }, + { + "epoch": 3.5453186574262805, + "grad_norm": 1.0883190631866455, + "learning_rate": 0.0005570050278570457, + "loss": 3.3966, + "step": 52180 + }, + { + "epoch": 3.5456583774969426, + "grad_norm": 0.7609801292419434, + "learning_rate": 0.0005569625628482131, + "loss": 3.0664, + "step": 52185 + }, + { + "epoch": 3.545998097567604, + "grad_norm": 0.7628222107887268, + "learning_rate": 0.0005569200978393804, + "loss": 3.5822, + "step": 52190 + }, + { + "epoch": 3.546337817638266, + "grad_norm": 0.8782508969306946, + "learning_rate": 0.0005568776328305476, + "loss": 3.2944, + "step": 52195 + }, + { + "epoch": 3.546677537708928, + "grad_norm": 1.1059023141860962, + "learning_rate": 0.000556835167821715, + "loss": 3.4216, + "step": 52200 + }, + { + "epoch": 3.5470172577795895, + "grad_norm": 1.0722280740737915, + "learning_rate": 0.0005567927028128822, + "loss": 3.4733, + "step": 52205 + }, + { + "epoch": 3.547356977850251, + "grad_norm": 0.8238985538482666, + "learning_rate": 0.0005567502378040494, + "loss": 3.5576, + "step": 52210 + }, + { + "epoch": 3.5476966979209132, + "grad_norm": 0.8706415295600891, + "learning_rate": 0.0005567077727952167, + "loss": 3.6048, + "step": 52215 + }, + { + "epoch": 3.548036417991575, + "grad_norm": 0.816357433795929, + "learning_rate": 0.000556665307786384, + "loss": 3.5365, + "step": 52220 + }, + { + "epoch": 3.5483761380622365, + "grad_norm": 0.8835539817810059, + "learning_rate": 0.0005566228427775513, + "loss": 3.4495, + "step": 52225 + }, + { + "epoch": 3.5487158581328986, + "grad_norm": 0.952446460723877, + "learning_rate": 0.0005565803777687186, + "loss": 3.6146, + "step": 52230 + }, + { + "epoch": 3.54905557820356, + "grad_norm": 0.7468661665916443, + "learning_rate": 0.0005565379127598859, + "loss": 3.6068, + "step": 52235 + }, + { + "epoch": 3.549395298274222, + "grad_norm": 0.8351343274116516, + "learning_rate": 0.0005564954477510531, + "loss": 3.5182, + "step": 52240 + }, + { + "epoch": 3.549735018344884, + "grad_norm": 0.9759404063224792, + "learning_rate": 0.0005564529827422204, + "loss": 3.5682, + "step": 52245 + }, + { + "epoch": 3.5500747384155455, + "grad_norm": 1.6659177541732788, + "learning_rate": 0.0005564105177333876, + "loss": 3.5503, + "step": 52250 + }, + { + "epoch": 3.550414458486207, + "grad_norm": 0.7488589882850647, + "learning_rate": 0.0005563680527245549, + "loss": 3.493, + "step": 52255 + }, + { + "epoch": 3.5507541785568693, + "grad_norm": 0.9675310850143433, + "learning_rate": 0.0005563255877157223, + "loss": 3.3902, + "step": 52260 + }, + { + "epoch": 3.551093898627531, + "grad_norm": 0.8121783137321472, + "learning_rate": 0.0005562831227068895, + "loss": 3.4784, + "step": 52265 + }, + { + "epoch": 3.5514336186981925, + "grad_norm": 0.8204929828643799, + "learning_rate": 0.0005562406576980568, + "loss": 3.4398, + "step": 52270 + }, + { + "epoch": 3.5517733387688546, + "grad_norm": 0.7302677631378174, + "learning_rate": 0.0005561981926892241, + "loss": 3.7244, + "step": 52275 + }, + { + "epoch": 3.5521130588395162, + "grad_norm": 1.115089774131775, + "learning_rate": 0.0005561557276803913, + "loss": 3.5197, + "step": 52280 + }, + { + "epoch": 3.552452778910178, + "grad_norm": 0.9453549981117249, + "learning_rate": 0.0005561132626715586, + "loss": 3.5982, + "step": 52285 + }, + { + "epoch": 3.55279249898084, + "grad_norm": 0.9295415282249451, + "learning_rate": 0.000556070797662726, + "loss": 3.5901, + "step": 52290 + }, + { + "epoch": 3.5531322190515016, + "grad_norm": 1.1223952770233154, + "learning_rate": 0.0005560283326538932, + "loss": 3.8518, + "step": 52295 + }, + { + "epoch": 3.553471939122163, + "grad_norm": 0.8469665050506592, + "learning_rate": 0.0005559858676450605, + "loss": 3.3637, + "step": 52300 + }, + { + "epoch": 3.5538116591928253, + "grad_norm": 0.9927549958229065, + "learning_rate": 0.0005559434026362278, + "loss": 3.7396, + "step": 52305 + }, + { + "epoch": 3.554151379263487, + "grad_norm": 0.881680965423584, + "learning_rate": 0.000555900937627395, + "loss": 3.4199, + "step": 52310 + }, + { + "epoch": 3.5544910993341485, + "grad_norm": 0.7174882888793945, + "learning_rate": 0.0005558584726185622, + "loss": 3.4547, + "step": 52315 + }, + { + "epoch": 3.5548308194048106, + "grad_norm": 0.8790659308433533, + "learning_rate": 0.0005558160076097296, + "loss": 3.583, + "step": 52320 + }, + { + "epoch": 3.5551705394754722, + "grad_norm": 0.6741118431091309, + "learning_rate": 0.0005557735426008969, + "loss": 3.6079, + "step": 52325 + }, + { + "epoch": 3.555510259546134, + "grad_norm": 1.0115681886672974, + "learning_rate": 0.0005557310775920642, + "loss": 3.5254, + "step": 52330 + }, + { + "epoch": 3.555849979616796, + "grad_norm": 1.01543128490448, + "learning_rate": 0.0005556886125832315, + "loss": 3.722, + "step": 52335 + }, + { + "epoch": 3.5561896996874576, + "grad_norm": 0.9257655143737793, + "learning_rate": 0.0005556461475743987, + "loss": 3.4009, + "step": 52340 + }, + { + "epoch": 3.556529419758119, + "grad_norm": 1.1146456003189087, + "learning_rate": 0.000555603682565566, + "loss": 3.1916, + "step": 52345 + }, + { + "epoch": 3.5568691398287813, + "grad_norm": 0.7870867252349854, + "learning_rate": 0.0005555612175567333, + "loss": 3.4088, + "step": 52350 + }, + { + "epoch": 3.557208859899443, + "grad_norm": 1.0934025049209595, + "learning_rate": 0.0005555187525479005, + "loss": 3.5677, + "step": 52355 + }, + { + "epoch": 3.5575485799701045, + "grad_norm": 0.7760512232780457, + "learning_rate": 0.0005554762875390679, + "loss": 3.5933, + "step": 52360 + }, + { + "epoch": 3.5578883000407666, + "grad_norm": 0.8559786677360535, + "learning_rate": 0.0005554338225302351, + "loss": 3.4455, + "step": 52365 + }, + { + "epoch": 3.5582280201114282, + "grad_norm": 0.9860999584197998, + "learning_rate": 0.0005553913575214024, + "loss": 3.4163, + "step": 52370 + }, + { + "epoch": 3.55856774018209, + "grad_norm": 0.762481153011322, + "learning_rate": 0.0005553488925125697, + "loss": 3.4308, + "step": 52375 + }, + { + "epoch": 3.558907460252752, + "grad_norm": 0.8665501475334167, + "learning_rate": 0.0005553064275037369, + "loss": 3.535, + "step": 52380 + }, + { + "epoch": 3.5592471803234136, + "grad_norm": 0.7506090998649597, + "learning_rate": 0.0005552639624949042, + "loss": 3.3864, + "step": 52385 + }, + { + "epoch": 3.559586900394075, + "grad_norm": 1.0209954977035522, + "learning_rate": 0.0005552214974860715, + "loss": 3.3763, + "step": 52390 + }, + { + "epoch": 3.5599266204647373, + "grad_norm": 1.142770528793335, + "learning_rate": 0.0005551790324772388, + "loss": 3.3517, + "step": 52395 + }, + { + "epoch": 3.560266340535399, + "grad_norm": 0.9215434789657593, + "learning_rate": 0.000555136567468406, + "loss": 3.5275, + "step": 52400 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.8310906291007996, + "learning_rate": 0.0005550941024595734, + "loss": 3.3895, + "step": 52405 + }, + { + "epoch": 3.5609457806767226, + "grad_norm": 0.7167714834213257, + "learning_rate": 0.0005550516374507406, + "loss": 3.5274, + "step": 52410 + }, + { + "epoch": 3.5612855007473843, + "grad_norm": 0.8011178374290466, + "learning_rate": 0.0005550091724419078, + "loss": 3.5571, + "step": 52415 + }, + { + "epoch": 3.561625220818046, + "grad_norm": 0.8512776494026184, + "learning_rate": 0.0005549667074330752, + "loss": 3.6212, + "step": 52420 + }, + { + "epoch": 3.561964940888708, + "grad_norm": 0.8573417663574219, + "learning_rate": 0.0005549242424242424, + "loss": 3.6264, + "step": 52425 + }, + { + "epoch": 3.5623046609593696, + "grad_norm": 0.800025999546051, + "learning_rate": 0.0005548817774154097, + "loss": 3.3651, + "step": 52430 + }, + { + "epoch": 3.5626443810300312, + "grad_norm": 0.902248740196228, + "learning_rate": 0.0005548393124065771, + "loss": 3.7586, + "step": 52435 + }, + { + "epoch": 3.5629841011006933, + "grad_norm": 1.0518494844436646, + "learning_rate": 0.0005547968473977443, + "loss": 3.2434, + "step": 52440 + }, + { + "epoch": 3.563323821171355, + "grad_norm": 1.1437619924545288, + "learning_rate": 0.0005547543823889115, + "loss": 3.5293, + "step": 52445 + }, + { + "epoch": 3.5636635412420166, + "grad_norm": 0.9899710416793823, + "learning_rate": 0.0005547119173800789, + "loss": 3.3231, + "step": 52450 + }, + { + "epoch": 3.564003261312678, + "grad_norm": 1.1467019319534302, + "learning_rate": 0.0005546694523712461, + "loss": 3.3755, + "step": 52455 + }, + { + "epoch": 3.5643429813833403, + "grad_norm": 0.80193692445755, + "learning_rate": 0.0005546269873624133, + "loss": 3.6284, + "step": 52460 + }, + { + "epoch": 3.564682701454002, + "grad_norm": 0.7692804932594299, + "learning_rate": 0.0005545845223535807, + "loss": 3.3634, + "step": 52465 + }, + { + "epoch": 3.5650224215246635, + "grad_norm": 0.9002745747566223, + "learning_rate": 0.000554542057344748, + "loss": 3.398, + "step": 52470 + }, + { + "epoch": 3.5653621415953256, + "grad_norm": 0.9504525065422058, + "learning_rate": 0.0005544995923359152, + "loss": 3.5356, + "step": 52475 + }, + { + "epoch": 3.5657018616659872, + "grad_norm": 0.9739566445350647, + "learning_rate": 0.0005544571273270825, + "loss": 3.5592, + "step": 52480 + }, + { + "epoch": 3.566041581736649, + "grad_norm": 0.8488055467605591, + "learning_rate": 0.0005544146623182498, + "loss": 3.5795, + "step": 52485 + }, + { + "epoch": 3.5663813018073105, + "grad_norm": 0.9292692542076111, + "learning_rate": 0.000554372197309417, + "loss": 3.5386, + "step": 52490 + }, + { + "epoch": 3.5667210218779726, + "grad_norm": 0.8906123042106628, + "learning_rate": 0.0005543297323005843, + "loss": 3.7848, + "step": 52495 + }, + { + "epoch": 3.567060741948634, + "grad_norm": 0.870985209941864, + "learning_rate": 0.0005542872672917517, + "loss": 3.3741, + "step": 52500 + }, + { + "epoch": 3.567400462019296, + "grad_norm": 0.7586756348609924, + "learning_rate": 0.0005542448022829189, + "loss": 3.3505, + "step": 52505 + }, + { + "epoch": 3.567740182089958, + "grad_norm": 0.6719898581504822, + "learning_rate": 0.0005542023372740862, + "loss": 3.3577, + "step": 52510 + }, + { + "epoch": 3.5680799021606195, + "grad_norm": 0.8750602006912231, + "learning_rate": 0.0005541598722652534, + "loss": 3.2506, + "step": 52515 + }, + { + "epoch": 3.568419622231281, + "grad_norm": 0.8769327402114868, + "learning_rate": 0.0005541174072564207, + "loss": 3.6437, + "step": 52520 + }, + { + "epoch": 3.5687593423019432, + "grad_norm": 1.0202521085739136, + "learning_rate": 0.000554074942247588, + "loss": 3.8358, + "step": 52525 + }, + { + "epoch": 3.569099062372605, + "grad_norm": 0.8598548769950867, + "learning_rate": 0.0005540324772387552, + "loss": 3.4955, + "step": 52530 + }, + { + "epoch": 3.5694387824432665, + "grad_norm": 1.0905437469482422, + "learning_rate": 0.0005539900122299226, + "loss": 3.2232, + "step": 52535 + }, + { + "epoch": 3.5697785025139286, + "grad_norm": 0.9007442593574524, + "learning_rate": 0.0005539475472210899, + "loss": 3.9169, + "step": 52540 + }, + { + "epoch": 3.57011822258459, + "grad_norm": 0.9302268028259277, + "learning_rate": 0.0005539050822122571, + "loss": 3.4434, + "step": 52545 + }, + { + "epoch": 3.570457942655252, + "grad_norm": 0.9118942022323608, + "learning_rate": 0.0005538626172034243, + "loss": 3.583, + "step": 52550 + }, + { + "epoch": 3.570797662725914, + "grad_norm": 0.9857950210571289, + "learning_rate": 0.0005538201521945917, + "loss": 3.5595, + "step": 52555 + }, + { + "epoch": 3.5711373827965756, + "grad_norm": 0.9345163106918335, + "learning_rate": 0.0005537776871857589, + "loss": 3.4784, + "step": 52560 + }, + { + "epoch": 3.571477102867237, + "grad_norm": 0.7823102474212646, + "learning_rate": 0.0005537352221769261, + "loss": 3.3875, + "step": 52565 + }, + { + "epoch": 3.5718168229378993, + "grad_norm": 0.766549289226532, + "learning_rate": 0.0005536927571680936, + "loss": 3.5358, + "step": 52570 + }, + { + "epoch": 3.572156543008561, + "grad_norm": 0.9522895812988281, + "learning_rate": 0.0005536502921592608, + "loss": 3.6514, + "step": 52575 + }, + { + "epoch": 3.5724962630792225, + "grad_norm": 0.9757686257362366, + "learning_rate": 0.000553607827150428, + "loss": 3.5455, + "step": 52580 + }, + { + "epoch": 3.5728359831498846, + "grad_norm": 1.327557921409607, + "learning_rate": 0.0005535653621415954, + "loss": 3.7951, + "step": 52585 + }, + { + "epoch": 3.5731757032205462, + "grad_norm": 0.8996463418006897, + "learning_rate": 0.0005535228971327626, + "loss": 3.6857, + "step": 52590 + }, + { + "epoch": 3.573515423291208, + "grad_norm": 1.089372992515564, + "learning_rate": 0.0005534804321239298, + "loss": 3.4564, + "step": 52595 + }, + { + "epoch": 3.57385514336187, + "grad_norm": 0.8037695288658142, + "learning_rate": 0.0005534379671150971, + "loss": 3.5317, + "step": 52600 + }, + { + "epoch": 3.5741948634325316, + "grad_norm": 0.8426719307899475, + "learning_rate": 0.0005533955021062645, + "loss": 3.4455, + "step": 52605 + }, + { + "epoch": 3.574534583503193, + "grad_norm": 1.1934808492660522, + "learning_rate": 0.0005533530370974317, + "loss": 3.7114, + "step": 52610 + }, + { + "epoch": 3.5748743035738553, + "grad_norm": 1.1200505495071411, + "learning_rate": 0.000553310572088599, + "loss": 3.6382, + "step": 52615 + }, + { + "epoch": 3.575214023644517, + "grad_norm": 0.792309582233429, + "learning_rate": 0.0005532681070797663, + "loss": 3.5394, + "step": 52620 + }, + { + "epoch": 3.5755537437151785, + "grad_norm": 0.9034616351127625, + "learning_rate": 0.0005532256420709335, + "loss": 3.6714, + "step": 52625 + }, + { + "epoch": 3.5758934637858406, + "grad_norm": 0.8941036462783813, + "learning_rate": 0.0005531831770621008, + "loss": 3.7174, + "step": 52630 + }, + { + "epoch": 3.5762331838565022, + "grad_norm": 1.0855708122253418, + "learning_rate": 0.000553140712053268, + "loss": 3.3463, + "step": 52635 + }, + { + "epoch": 3.576572903927164, + "grad_norm": 0.9636162519454956, + "learning_rate": 0.0005530982470444354, + "loss": 3.4248, + "step": 52640 + }, + { + "epoch": 3.576912623997826, + "grad_norm": 0.9388499855995178, + "learning_rate": 0.0005530557820356027, + "loss": 3.4029, + "step": 52645 + }, + { + "epoch": 3.5772523440684876, + "grad_norm": 0.9016910195350647, + "learning_rate": 0.00055301331702677, + "loss": 3.7457, + "step": 52650 + }, + { + "epoch": 3.577592064139149, + "grad_norm": 0.9004783630371094, + "learning_rate": 0.0005529708520179372, + "loss": 3.4686, + "step": 52655 + }, + { + "epoch": 3.5779317842098113, + "grad_norm": 0.9338772296905518, + "learning_rate": 0.0005529283870091045, + "loss": 3.4409, + "step": 52660 + }, + { + "epoch": 3.578271504280473, + "grad_norm": 0.8499504923820496, + "learning_rate": 0.0005528859220002717, + "loss": 3.4111, + "step": 52665 + }, + { + "epoch": 3.5786112243511345, + "grad_norm": 0.8294602036476135, + "learning_rate": 0.0005528434569914391, + "loss": 3.4902, + "step": 52670 + }, + { + "epoch": 3.5789509444217966, + "grad_norm": 0.8773077130317688, + "learning_rate": 0.0005528009919826064, + "loss": 3.5333, + "step": 52675 + }, + { + "epoch": 3.5792906644924583, + "grad_norm": 0.7797747850418091, + "learning_rate": 0.0005527585269737736, + "loss": 3.5504, + "step": 52680 + }, + { + "epoch": 3.57963038456312, + "grad_norm": 0.8486306071281433, + "learning_rate": 0.000552716061964941, + "loss": 3.613, + "step": 52685 + }, + { + "epoch": 3.579970104633782, + "grad_norm": 0.8345975875854492, + "learning_rate": 0.0005526735969561082, + "loss": 3.6687, + "step": 52690 + }, + { + "epoch": 3.5803098247044436, + "grad_norm": 1.2098362445831299, + "learning_rate": 0.0005526311319472754, + "loss": 3.436, + "step": 52695 + }, + { + "epoch": 3.580649544775105, + "grad_norm": 0.8362375497817993, + "learning_rate": 0.0005525886669384427, + "loss": 3.7986, + "step": 52700 + }, + { + "epoch": 3.5809892648457673, + "grad_norm": 0.9034695625305176, + "learning_rate": 0.00055254620192961, + "loss": 3.5439, + "step": 52705 + }, + { + "epoch": 3.581328984916429, + "grad_norm": 0.7922974824905396, + "learning_rate": 0.0005525037369207773, + "loss": 3.5212, + "step": 52710 + }, + { + "epoch": 3.5816687049870906, + "grad_norm": 0.71144700050354, + "learning_rate": 0.0005524612719119446, + "loss": 3.56, + "step": 52715 + }, + { + "epoch": 3.5820084250577526, + "grad_norm": 0.942363977432251, + "learning_rate": 0.0005524188069031119, + "loss": 3.617, + "step": 52720 + }, + { + "epoch": 3.5823481451284143, + "grad_norm": 0.8381492495536804, + "learning_rate": 0.0005523763418942791, + "loss": 3.5654, + "step": 52725 + }, + { + "epoch": 3.582687865199076, + "grad_norm": 0.8321513533592224, + "learning_rate": 0.0005523338768854464, + "loss": 3.4097, + "step": 52730 + }, + { + "epoch": 3.583027585269738, + "grad_norm": 0.8994709253311157, + "learning_rate": 0.0005522914118766137, + "loss": 3.5892, + "step": 52735 + }, + { + "epoch": 3.5833673053403996, + "grad_norm": 0.9479643106460571, + "learning_rate": 0.0005522489468677809, + "loss": 3.4511, + "step": 52740 + }, + { + "epoch": 3.5837070254110612, + "grad_norm": 0.7959499955177307, + "learning_rate": 0.0005522064818589483, + "loss": 3.2341, + "step": 52745 + }, + { + "epoch": 3.5840467454817233, + "grad_norm": 0.7832716703414917, + "learning_rate": 0.0005521640168501155, + "loss": 3.4783, + "step": 52750 + }, + { + "epoch": 3.584386465552385, + "grad_norm": 0.9579699039459229, + "learning_rate": 0.0005521215518412828, + "loss": 3.4554, + "step": 52755 + }, + { + "epoch": 3.5847261856230466, + "grad_norm": 1.0745619535446167, + "learning_rate": 0.0005520790868324501, + "loss": 3.722, + "step": 52760 + }, + { + "epoch": 3.5850659056937086, + "grad_norm": 0.833681583404541, + "learning_rate": 0.0005520366218236173, + "loss": 3.4407, + "step": 52765 + }, + { + "epoch": 3.5854056257643703, + "grad_norm": 0.7390937805175781, + "learning_rate": 0.0005519941568147846, + "loss": 3.4881, + "step": 52770 + }, + { + "epoch": 3.585745345835032, + "grad_norm": 0.8168641328811646, + "learning_rate": 0.000551951691805952, + "loss": 3.4884, + "step": 52775 + }, + { + "epoch": 3.586085065905694, + "grad_norm": 0.6547437310218811, + "learning_rate": 0.0005519092267971192, + "loss": 3.5413, + "step": 52780 + }, + { + "epoch": 3.5864247859763556, + "grad_norm": 1.1342782974243164, + "learning_rate": 0.0005518667617882865, + "loss": 3.2294, + "step": 52785 + }, + { + "epoch": 3.5867645060470172, + "grad_norm": 0.7073545455932617, + "learning_rate": 0.0005518242967794538, + "loss": 3.7566, + "step": 52790 + }, + { + "epoch": 3.587104226117679, + "grad_norm": 0.8684728741645813, + "learning_rate": 0.000551781831770621, + "loss": 3.6116, + "step": 52795 + }, + { + "epoch": 3.587443946188341, + "grad_norm": 0.783809244632721, + "learning_rate": 0.0005517393667617882, + "loss": 3.8807, + "step": 52800 + }, + { + "epoch": 3.5877836662590026, + "grad_norm": 0.9198316931724548, + "learning_rate": 0.0005516969017529556, + "loss": 3.5532, + "step": 52805 + }, + { + "epoch": 3.588123386329664, + "grad_norm": 0.9740683436393738, + "learning_rate": 0.0005516544367441229, + "loss": 3.9351, + "step": 52810 + }, + { + "epoch": 3.5884631064003263, + "grad_norm": 0.83992600440979, + "learning_rate": 0.0005516119717352901, + "loss": 3.652, + "step": 52815 + }, + { + "epoch": 3.588802826470988, + "grad_norm": 1.056426763534546, + "learning_rate": 0.0005515695067264575, + "loss": 3.6004, + "step": 52820 + }, + { + "epoch": 3.5891425465416495, + "grad_norm": 1.0180282592773438, + "learning_rate": 0.0005515270417176247, + "loss": 3.5972, + "step": 52825 + }, + { + "epoch": 3.589482266612311, + "grad_norm": 1.077870488166809, + "learning_rate": 0.0005514845767087919, + "loss": 3.7495, + "step": 52830 + }, + { + "epoch": 3.5898219866829733, + "grad_norm": 0.795088529586792, + "learning_rate": 0.0005514421116999593, + "loss": 3.5061, + "step": 52835 + }, + { + "epoch": 3.590161706753635, + "grad_norm": 0.7796046137809753, + "learning_rate": 0.0005513996466911265, + "loss": 3.7538, + "step": 52840 + }, + { + "epoch": 3.5905014268242965, + "grad_norm": 0.8545706868171692, + "learning_rate": 0.0005513571816822938, + "loss": 3.5152, + "step": 52845 + }, + { + "epoch": 3.5908411468949586, + "grad_norm": 0.9384684562683105, + "learning_rate": 0.0005513147166734612, + "loss": 3.5595, + "step": 52850 + }, + { + "epoch": 3.59118086696562, + "grad_norm": 1.0258421897888184, + "learning_rate": 0.0005512722516646284, + "loss": 3.5447, + "step": 52855 + }, + { + "epoch": 3.591520587036282, + "grad_norm": 0.7560019493103027, + "learning_rate": 0.0005512297866557956, + "loss": 3.4531, + "step": 52860 + }, + { + "epoch": 3.591860307106944, + "grad_norm": 0.9572668075561523, + "learning_rate": 0.0005511873216469629, + "loss": 3.5311, + "step": 52865 + }, + { + "epoch": 3.5922000271776056, + "grad_norm": 0.9964108467102051, + "learning_rate": 0.0005511448566381302, + "loss": 3.5465, + "step": 52870 + }, + { + "epoch": 3.592539747248267, + "grad_norm": 0.7823436260223389, + "learning_rate": 0.0005511023916292974, + "loss": 3.2905, + "step": 52875 + }, + { + "epoch": 3.5928794673189293, + "grad_norm": 0.9332485795021057, + "learning_rate": 0.0005510599266204648, + "loss": 3.5355, + "step": 52880 + }, + { + "epoch": 3.593219187389591, + "grad_norm": 0.8645064234733582, + "learning_rate": 0.0005510174616116321, + "loss": 3.5533, + "step": 52885 + }, + { + "epoch": 3.5935589074602525, + "grad_norm": 0.7016422152519226, + "learning_rate": 0.0005509749966027993, + "loss": 3.4642, + "step": 52890 + }, + { + "epoch": 3.5938986275309146, + "grad_norm": 0.8779458403587341, + "learning_rate": 0.0005509325315939666, + "loss": 3.5302, + "step": 52895 + }, + { + "epoch": 3.5942383476015762, + "grad_norm": 0.8556497693061829, + "learning_rate": 0.0005508900665851338, + "loss": 3.3977, + "step": 52900 + }, + { + "epoch": 3.594578067672238, + "grad_norm": 0.8873161673545837, + "learning_rate": 0.0005508476015763011, + "loss": 3.5807, + "step": 52905 + }, + { + "epoch": 3.5949177877429, + "grad_norm": 0.9418740272521973, + "learning_rate": 0.0005508051365674684, + "loss": 3.5199, + "step": 52910 + }, + { + "epoch": 3.5952575078135616, + "grad_norm": 0.8823916912078857, + "learning_rate": 0.0005507626715586357, + "loss": 3.7849, + "step": 52915 + }, + { + "epoch": 3.595597227884223, + "grad_norm": 0.7913811206817627, + "learning_rate": 0.000550720206549803, + "loss": 3.5692, + "step": 52920 + }, + { + "epoch": 3.5959369479548853, + "grad_norm": 0.805544376373291, + "learning_rate": 0.0005506777415409703, + "loss": 3.1172, + "step": 52925 + }, + { + "epoch": 3.596276668025547, + "grad_norm": 1.0788365602493286, + "learning_rate": 0.0005506352765321375, + "loss": 3.5295, + "step": 52930 + }, + { + "epoch": 3.5966163880962085, + "grad_norm": 0.8051691055297852, + "learning_rate": 0.0005505928115233047, + "loss": 3.416, + "step": 52935 + }, + { + "epoch": 3.5969561081668706, + "grad_norm": 0.9376055598258972, + "learning_rate": 0.0005505503465144721, + "loss": 3.392, + "step": 52940 + }, + { + "epoch": 3.5972958282375322, + "grad_norm": 0.8463627099990845, + "learning_rate": 0.0005505078815056393, + "loss": 3.3491, + "step": 52945 + }, + { + "epoch": 3.597635548308194, + "grad_norm": 0.9486368298530579, + "learning_rate": 0.0005504654164968066, + "loss": 3.5592, + "step": 52950 + }, + { + "epoch": 3.597975268378856, + "grad_norm": 1.0295315980911255, + "learning_rate": 0.000550422951487974, + "loss": 3.4834, + "step": 52955 + }, + { + "epoch": 3.5983149884495176, + "grad_norm": 2.692504405975342, + "learning_rate": 0.0005503804864791412, + "loss": 3.5352, + "step": 52960 + }, + { + "epoch": 3.598654708520179, + "grad_norm": 0.7977980971336365, + "learning_rate": 0.0005503380214703084, + "loss": 3.585, + "step": 52965 + }, + { + "epoch": 3.5989944285908413, + "grad_norm": 0.8412880301475525, + "learning_rate": 0.0005502955564614758, + "loss": 3.5231, + "step": 52970 + }, + { + "epoch": 3.599334148661503, + "grad_norm": 0.6407668590545654, + "learning_rate": 0.000550253091452643, + "loss": 3.6614, + "step": 52975 + }, + { + "epoch": 3.5996738687321646, + "grad_norm": 0.7974188327789307, + "learning_rate": 0.0005502106264438102, + "loss": 3.4152, + "step": 52980 + }, + { + "epoch": 3.6000135888028266, + "grad_norm": 1.0708857774734497, + "learning_rate": 0.0005501681614349777, + "loss": 3.4187, + "step": 52985 + }, + { + "epoch": 3.6003533088734883, + "grad_norm": 0.9016870260238647, + "learning_rate": 0.0005501256964261449, + "loss": 3.5349, + "step": 52990 + }, + { + "epoch": 3.60069302894415, + "grad_norm": 0.8180779218673706, + "learning_rate": 0.0005500832314173121, + "loss": 3.7903, + "step": 52995 + }, + { + "epoch": 3.601032749014812, + "grad_norm": 0.7955362796783447, + "learning_rate": 0.0005500407664084794, + "loss": 3.4442, + "step": 53000 + }, + { + "epoch": 3.6013724690854736, + "grad_norm": 0.7339842319488525, + "learning_rate": 0.0005499983013996467, + "loss": 3.5846, + "step": 53005 + }, + { + "epoch": 3.6017121891561352, + "grad_norm": 0.8711400628089905, + "learning_rate": 0.000549955836390814, + "loss": 3.3987, + "step": 53010 + }, + { + "epoch": 3.6020519092267973, + "grad_norm": 0.7559113502502441, + "learning_rate": 0.0005499133713819812, + "loss": 3.4994, + "step": 53015 + }, + { + "epoch": 3.602391629297459, + "grad_norm": 0.8538342118263245, + "learning_rate": 0.0005498709063731486, + "loss": 3.5329, + "step": 53020 + }, + { + "epoch": 3.6027313493681206, + "grad_norm": 0.9202799201011658, + "learning_rate": 0.0005498284413643159, + "loss": 3.4259, + "step": 53025 + }, + { + "epoch": 3.6030710694387826, + "grad_norm": 0.9722372889518738, + "learning_rate": 0.0005497859763554831, + "loss": 3.4438, + "step": 53030 + }, + { + "epoch": 3.6034107895094443, + "grad_norm": 0.8315594792366028, + "learning_rate": 0.0005497435113466504, + "loss": 3.4125, + "step": 53035 + }, + { + "epoch": 3.603750509580106, + "grad_norm": 0.8761420845985413, + "learning_rate": 0.0005497010463378177, + "loss": 3.3145, + "step": 53040 + }, + { + "epoch": 3.604090229650768, + "grad_norm": 0.9038668274879456, + "learning_rate": 0.0005496585813289849, + "loss": 3.452, + "step": 53045 + }, + { + "epoch": 3.6044299497214296, + "grad_norm": 0.9236066341400146, + "learning_rate": 0.0005496161163201521, + "loss": 3.3587, + "step": 53050 + }, + { + "epoch": 3.6047696697920912, + "grad_norm": 0.7065162062644958, + "learning_rate": 0.0005495736513113196, + "loss": 3.5815, + "step": 53055 + }, + { + "epoch": 3.6051093898627533, + "grad_norm": 0.7982288002967834, + "learning_rate": 0.0005495311863024868, + "loss": 3.6493, + "step": 53060 + }, + { + "epoch": 3.605449109933415, + "grad_norm": 1.0135608911514282, + "learning_rate": 0.000549488721293654, + "loss": 3.5367, + "step": 53065 + }, + { + "epoch": 3.6057888300040766, + "grad_norm": 0.677914559841156, + "learning_rate": 0.0005494462562848214, + "loss": 3.5427, + "step": 53070 + }, + { + "epoch": 3.6061285500747386, + "grad_norm": 1.0613304376602173, + "learning_rate": 0.0005494037912759886, + "loss": 3.3978, + "step": 53075 + }, + { + "epoch": 3.6064682701454003, + "grad_norm": 0.8823173642158508, + "learning_rate": 0.0005493613262671558, + "loss": 3.5131, + "step": 53080 + }, + { + "epoch": 3.606807990216062, + "grad_norm": 0.8162183165550232, + "learning_rate": 0.0005493188612583232, + "loss": 3.2506, + "step": 53085 + }, + { + "epoch": 3.607147710286724, + "grad_norm": 0.9652081727981567, + "learning_rate": 0.0005492763962494905, + "loss": 3.4216, + "step": 53090 + }, + { + "epoch": 3.6074874303573856, + "grad_norm": 0.739782989025116, + "learning_rate": 0.0005492339312406577, + "loss": 3.3893, + "step": 53095 + }, + { + "epoch": 3.6078271504280472, + "grad_norm": 0.8135303258895874, + "learning_rate": 0.000549191466231825, + "loss": 3.5692, + "step": 53100 + }, + { + "epoch": 3.6081668704987093, + "grad_norm": 0.7996247410774231, + "learning_rate": 0.0005491490012229923, + "loss": 3.6543, + "step": 53105 + }, + { + "epoch": 3.608506590569371, + "grad_norm": 0.8896073698997498, + "learning_rate": 0.0005491065362141595, + "loss": 3.1362, + "step": 53110 + }, + { + "epoch": 3.6088463106400326, + "grad_norm": 0.7613343596458435, + "learning_rate": 0.0005490640712053268, + "loss": 3.5772, + "step": 53115 + }, + { + "epoch": 3.6091860307106947, + "grad_norm": 1.1504135131835938, + "learning_rate": 0.0005490216061964941, + "loss": 3.5303, + "step": 53120 + }, + { + "epoch": 3.6095257507813563, + "grad_norm": 1.0859193801879883, + "learning_rate": 0.0005489791411876614, + "loss": 3.6872, + "step": 53125 + }, + { + "epoch": 3.609865470852018, + "grad_norm": 0.970740556716919, + "learning_rate": 0.0005489366761788287, + "loss": 3.6681, + "step": 53130 + }, + { + "epoch": 3.6102051909226796, + "grad_norm": 1.1665878295898438, + "learning_rate": 0.000548894211169996, + "loss": 3.8046, + "step": 53135 + }, + { + "epoch": 3.6105449109933416, + "grad_norm": 0.7731561064720154, + "learning_rate": 0.0005488517461611632, + "loss": 3.3662, + "step": 53140 + }, + { + "epoch": 3.6108846310640033, + "grad_norm": 3.161797523498535, + "learning_rate": 0.0005488092811523305, + "loss": 3.5341, + "step": 53145 + }, + { + "epoch": 3.611224351134665, + "grad_norm": 0.9348070025444031, + "learning_rate": 0.0005487668161434977, + "loss": 3.6116, + "step": 53150 + }, + { + "epoch": 3.611564071205327, + "grad_norm": 0.8020744919776917, + "learning_rate": 0.000548724351134665, + "loss": 3.6185, + "step": 53155 + }, + { + "epoch": 3.6119037912759886, + "grad_norm": 1.084151029586792, + "learning_rate": 0.0005486818861258324, + "loss": 3.4909, + "step": 53160 + }, + { + "epoch": 3.6122435113466502, + "grad_norm": 0.9327008128166199, + "learning_rate": 0.0005486394211169996, + "loss": 3.7374, + "step": 53165 + }, + { + "epoch": 3.612583231417312, + "grad_norm": 0.8291416764259338, + "learning_rate": 0.0005485969561081669, + "loss": 3.4535, + "step": 53170 + }, + { + "epoch": 3.612922951487974, + "grad_norm": 1.0023761987686157, + "learning_rate": 0.0005485544910993342, + "loss": 3.3992, + "step": 53175 + }, + { + "epoch": 3.6132626715586356, + "grad_norm": 0.8939257860183716, + "learning_rate": 0.0005485120260905014, + "loss": 3.5514, + "step": 53180 + }, + { + "epoch": 3.613602391629297, + "grad_norm": 1.1401320695877075, + "learning_rate": 0.0005484695610816686, + "loss": 3.5131, + "step": 53185 + }, + { + "epoch": 3.6139421116999593, + "grad_norm": 1.323630928993225, + "learning_rate": 0.000548427096072836, + "loss": 3.5592, + "step": 53190 + }, + { + "epoch": 3.614281831770621, + "grad_norm": 0.8592724800109863, + "learning_rate": 0.0005483846310640033, + "loss": 3.5082, + "step": 53195 + }, + { + "epoch": 3.6146215518412825, + "grad_norm": 1.0455197095870972, + "learning_rate": 0.0005483421660551705, + "loss": 3.5241, + "step": 53200 + }, + { + "epoch": 3.6149612719119446, + "grad_norm": 1.0021414756774902, + "learning_rate": 0.0005482997010463379, + "loss": 3.4177, + "step": 53205 + }, + { + "epoch": 3.6153009919826062, + "grad_norm": 0.8474823832511902, + "learning_rate": 0.0005482572360375051, + "loss": 3.733, + "step": 53210 + }, + { + "epoch": 3.615640712053268, + "grad_norm": 1.158491849899292, + "learning_rate": 0.0005482147710286723, + "loss": 3.7061, + "step": 53215 + }, + { + "epoch": 3.61598043212393, + "grad_norm": 0.8034505248069763, + "learning_rate": 0.0005481723060198397, + "loss": 3.5424, + "step": 53220 + }, + { + "epoch": 3.6163201521945916, + "grad_norm": 0.8616746664047241, + "learning_rate": 0.0005481298410110069, + "loss": 3.3732, + "step": 53225 + }, + { + "epoch": 3.616659872265253, + "grad_norm": 0.8294572234153748, + "learning_rate": 0.0005480873760021742, + "loss": 3.4349, + "step": 53230 + }, + { + "epoch": 3.6169995923359153, + "grad_norm": 0.7767804265022278, + "learning_rate": 0.0005480449109933416, + "loss": 3.5209, + "step": 53235 + }, + { + "epoch": 3.617339312406577, + "grad_norm": 1.0193052291870117, + "learning_rate": 0.0005480024459845088, + "loss": 3.6887, + "step": 53240 + }, + { + "epoch": 3.6176790324772385, + "grad_norm": 1.058376669883728, + "learning_rate": 0.000547959980975676, + "loss": 3.3731, + "step": 53245 + }, + { + "epoch": 3.6180187525479006, + "grad_norm": 1.0212019681930542, + "learning_rate": 0.0005479175159668433, + "loss": 3.5814, + "step": 53250 + }, + { + "epoch": 3.6183584726185622, + "grad_norm": 1.0459392070770264, + "learning_rate": 0.0005478750509580106, + "loss": 3.5611, + "step": 53255 + }, + { + "epoch": 3.618698192689224, + "grad_norm": 0.8038339018821716, + "learning_rate": 0.0005478325859491778, + "loss": 3.4438, + "step": 53260 + }, + { + "epoch": 3.619037912759886, + "grad_norm": 0.9216831922531128, + "learning_rate": 0.0005477901209403452, + "loss": 3.3931, + "step": 53265 + }, + { + "epoch": 3.6193776328305476, + "grad_norm": 1.299797773361206, + "learning_rate": 0.0005477476559315125, + "loss": 3.3578, + "step": 53270 + }, + { + "epoch": 3.619717352901209, + "grad_norm": 0.8621424436569214, + "learning_rate": 0.0005477051909226797, + "loss": 3.4418, + "step": 53275 + }, + { + "epoch": 3.6200570729718713, + "grad_norm": 0.9578065872192383, + "learning_rate": 0.000547662725913847, + "loss": 3.6843, + "step": 53280 + }, + { + "epoch": 3.620396793042533, + "grad_norm": 1.0238393545150757, + "learning_rate": 0.0005476202609050142, + "loss": 3.4834, + "step": 53285 + }, + { + "epoch": 3.6207365131131946, + "grad_norm": 0.7883076071739197, + "learning_rate": 0.0005475777958961815, + "loss": 3.4925, + "step": 53290 + }, + { + "epoch": 3.6210762331838566, + "grad_norm": 0.9622504115104675, + "learning_rate": 0.0005475353308873488, + "loss": 3.4405, + "step": 53295 + }, + { + "epoch": 3.6214159532545183, + "grad_norm": 0.9517592191696167, + "learning_rate": 0.0005474928658785161, + "loss": 3.4437, + "step": 53300 + }, + { + "epoch": 3.62175567332518, + "grad_norm": 0.8426573872566223, + "learning_rate": 0.0005474504008696834, + "loss": 3.5712, + "step": 53305 + }, + { + "epoch": 3.622095393395842, + "grad_norm": 0.8998821973800659, + "learning_rate": 0.0005474079358608507, + "loss": 3.6465, + "step": 53310 + }, + { + "epoch": 3.6224351134665036, + "grad_norm": 0.7315207123756409, + "learning_rate": 0.0005473654708520179, + "loss": 3.4696, + "step": 53315 + }, + { + "epoch": 3.6227748335371652, + "grad_norm": 1.1573255062103271, + "learning_rate": 0.0005473230058431852, + "loss": 3.1097, + "step": 53320 + }, + { + "epoch": 3.6231145536078273, + "grad_norm": 0.7386139035224915, + "learning_rate": 0.0005472805408343525, + "loss": 3.4395, + "step": 53325 + }, + { + "epoch": 3.623454273678489, + "grad_norm": 0.9619468450546265, + "learning_rate": 0.0005472380758255197, + "loss": 3.5713, + "step": 53330 + }, + { + "epoch": 3.6237939937491506, + "grad_norm": 0.7527628540992737, + "learning_rate": 0.000547195610816687, + "loss": 3.4729, + "step": 53335 + }, + { + "epoch": 3.6241337138198126, + "grad_norm": 0.8720498085021973, + "learning_rate": 0.0005471531458078544, + "loss": 3.6773, + "step": 53340 + }, + { + "epoch": 3.6244734338904743, + "grad_norm": 1.0229952335357666, + "learning_rate": 0.0005471106807990216, + "loss": 3.275, + "step": 53345 + }, + { + "epoch": 3.624813153961136, + "grad_norm": 0.9368073344230652, + "learning_rate": 0.0005470682157901889, + "loss": 3.3269, + "step": 53350 + }, + { + "epoch": 3.625152874031798, + "grad_norm": 0.7876255512237549, + "learning_rate": 0.0005470257507813562, + "loss": 3.5188, + "step": 53355 + }, + { + "epoch": 3.6254925941024596, + "grad_norm": 0.9945898652076721, + "learning_rate": 0.0005469832857725234, + "loss": 3.8083, + "step": 53360 + }, + { + "epoch": 3.6258323141731212, + "grad_norm": 0.6981765031814575, + "learning_rate": 0.0005469408207636908, + "loss": 3.4277, + "step": 53365 + }, + { + "epoch": 3.6261720342437833, + "grad_norm": 0.7349561452865601, + "learning_rate": 0.0005468983557548581, + "loss": 3.6487, + "step": 53370 + }, + { + "epoch": 3.626511754314445, + "grad_norm": 0.8500538468360901, + "learning_rate": 0.0005468558907460253, + "loss": 3.2882, + "step": 53375 + }, + { + "epoch": 3.6268514743851066, + "grad_norm": 0.9674482345581055, + "learning_rate": 0.0005468134257371926, + "loss": 3.721, + "step": 53380 + }, + { + "epoch": 3.6271911944557687, + "grad_norm": 0.7942502498626709, + "learning_rate": 0.0005467709607283598, + "loss": 3.6578, + "step": 53385 + }, + { + "epoch": 3.6275309145264303, + "grad_norm": 0.9464951157569885, + "learning_rate": 0.0005467284957195271, + "loss": 3.4315, + "step": 53390 + }, + { + "epoch": 3.627870634597092, + "grad_norm": 1.1179567575454712, + "learning_rate": 0.0005466860307106944, + "loss": 3.432, + "step": 53395 + }, + { + "epoch": 3.628210354667754, + "grad_norm": 0.9114906787872314, + "learning_rate": 0.0005466435657018617, + "loss": 3.4261, + "step": 53400 + }, + { + "epoch": 3.6285500747384156, + "grad_norm": 0.9447861313819885, + "learning_rate": 0.000546601100693029, + "loss": 3.4855, + "step": 53405 + }, + { + "epoch": 3.6288897948090773, + "grad_norm": 0.9472312927246094, + "learning_rate": 0.0005465586356841963, + "loss": 3.7531, + "step": 53410 + }, + { + "epoch": 3.6292295148797393, + "grad_norm": 0.9389589428901672, + "learning_rate": 0.0005465161706753635, + "loss": 3.4174, + "step": 53415 + }, + { + "epoch": 3.629569234950401, + "grad_norm": 0.7900646924972534, + "learning_rate": 0.0005464737056665308, + "loss": 3.3464, + "step": 53420 + }, + { + "epoch": 3.6299089550210626, + "grad_norm": 0.8448693752288818, + "learning_rate": 0.0005464312406576981, + "loss": 3.629, + "step": 53425 + }, + { + "epoch": 3.6302486750917247, + "grad_norm": 0.985680103302002, + "learning_rate": 0.0005463887756488653, + "loss": 3.4336, + "step": 53430 + }, + { + "epoch": 3.6305883951623863, + "grad_norm": 1.0249252319335938, + "learning_rate": 0.0005463463106400326, + "loss": 3.6783, + "step": 53435 + }, + { + "epoch": 3.630928115233048, + "grad_norm": 0.9765924215316772, + "learning_rate": 0.0005463038456312, + "loss": 3.5812, + "step": 53440 + }, + { + "epoch": 3.63126783530371, + "grad_norm": 0.8694468140602112, + "learning_rate": 0.0005462613806223672, + "loss": 3.4509, + "step": 53445 + }, + { + "epoch": 3.6316075553743716, + "grad_norm": 0.7229470014572144, + "learning_rate": 0.0005462189156135344, + "loss": 3.5385, + "step": 53450 + }, + { + "epoch": 3.6319472754450333, + "grad_norm": 0.8552659749984741, + "learning_rate": 0.0005461764506047018, + "loss": 3.6114, + "step": 53455 + }, + { + "epoch": 3.6322869955156953, + "grad_norm": 1.0587165355682373, + "learning_rate": 0.000546133985595869, + "loss": 3.1542, + "step": 53460 + }, + { + "epoch": 3.632626715586357, + "grad_norm": 0.9568684101104736, + "learning_rate": 0.0005460915205870362, + "loss": 3.589, + "step": 53465 + }, + { + "epoch": 3.6329664356570186, + "grad_norm": 0.7544073462486267, + "learning_rate": 0.0005460490555782037, + "loss": 3.4516, + "step": 53470 + }, + { + "epoch": 3.6333061557276802, + "grad_norm": 1.0944217443466187, + "learning_rate": 0.0005460065905693709, + "loss": 3.573, + "step": 53475 + }, + { + "epoch": 3.6336458757983423, + "grad_norm": 1.0402231216430664, + "learning_rate": 0.0005459641255605381, + "loss": 3.5868, + "step": 53480 + }, + { + "epoch": 3.633985595869004, + "grad_norm": 1.0738757848739624, + "learning_rate": 0.0005459216605517054, + "loss": 3.5531, + "step": 53485 + }, + { + "epoch": 3.6343253159396656, + "grad_norm": 1.198174238204956, + "learning_rate": 0.0005458791955428727, + "loss": 3.5029, + "step": 53490 + }, + { + "epoch": 3.6346650360103276, + "grad_norm": 0.913598895072937, + "learning_rate": 0.0005458367305340399, + "loss": 3.5172, + "step": 53495 + }, + { + "epoch": 3.6350047560809893, + "grad_norm": 0.7374840974807739, + "learning_rate": 0.0005457942655252072, + "loss": 3.61, + "step": 53500 + }, + { + "epoch": 3.635344476151651, + "grad_norm": 0.9120209813117981, + "learning_rate": 0.0005457518005163746, + "loss": 3.4992, + "step": 53505 + }, + { + "epoch": 3.635684196222313, + "grad_norm": 0.7685786485671997, + "learning_rate": 0.0005457093355075418, + "loss": 3.6429, + "step": 53510 + }, + { + "epoch": 3.6360239162929746, + "grad_norm": 0.7588890790939331, + "learning_rate": 0.0005456668704987091, + "loss": 3.4713, + "step": 53515 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8091776967048645, + "learning_rate": 0.0005456244054898764, + "loss": 3.7082, + "step": 53520 + }, + { + "epoch": 3.636703356434298, + "grad_norm": 1.245002269744873, + "learning_rate": 0.0005455819404810436, + "loss": 3.3476, + "step": 53525 + }, + { + "epoch": 3.63704307650496, + "grad_norm": 0.7875908613204956, + "learning_rate": 0.0005455394754722109, + "loss": 3.8265, + "step": 53530 + }, + { + "epoch": 3.6373827965756216, + "grad_norm": 0.7236676812171936, + "learning_rate": 0.0005454970104633781, + "loss": 3.8029, + "step": 53535 + }, + { + "epoch": 3.637722516646283, + "grad_norm": 1.0492889881134033, + "learning_rate": 0.0005454545454545455, + "loss": 3.6132, + "step": 53540 + }, + { + "epoch": 3.6380622367169453, + "grad_norm": 0.7291705012321472, + "learning_rate": 0.0005454120804457128, + "loss": 3.5557, + "step": 53545 + }, + { + "epoch": 3.638401956787607, + "grad_norm": 0.8539997339248657, + "learning_rate": 0.00054536961543688, + "loss": 3.6193, + "step": 53550 + }, + { + "epoch": 3.6387416768582685, + "grad_norm": 0.820368766784668, + "learning_rate": 0.0005453271504280473, + "loss": 3.8116, + "step": 53555 + }, + { + "epoch": 3.6390813969289306, + "grad_norm": 0.9628114700317383, + "learning_rate": 0.0005452846854192146, + "loss": 3.2991, + "step": 53560 + }, + { + "epoch": 3.6394211169995923, + "grad_norm": 0.9317513704299927, + "learning_rate": 0.0005452422204103818, + "loss": 3.581, + "step": 53565 + }, + { + "epoch": 3.639760837070254, + "grad_norm": 0.8308173418045044, + "learning_rate": 0.000545199755401549, + "loss": 3.4594, + "step": 53570 + }, + { + "epoch": 3.640100557140916, + "grad_norm": 0.9830506443977356, + "learning_rate": 0.0005451572903927165, + "loss": 3.8224, + "step": 53575 + }, + { + "epoch": 3.6404402772115776, + "grad_norm": 0.7715734839439392, + "learning_rate": 0.0005451148253838837, + "loss": 3.7016, + "step": 53580 + }, + { + "epoch": 3.6407799972822392, + "grad_norm": 0.8911080360412598, + "learning_rate": 0.0005450723603750509, + "loss": 3.3807, + "step": 53585 + }, + { + "epoch": 3.6411197173529013, + "grad_norm": 0.8048841953277588, + "learning_rate": 0.0005450298953662183, + "loss": 3.7132, + "step": 53590 + }, + { + "epoch": 3.641459437423563, + "grad_norm": 1.1014550924301147, + "learning_rate": 0.0005449874303573855, + "loss": 3.3869, + "step": 53595 + }, + { + "epoch": 3.6417991574942246, + "grad_norm": 1.1169852018356323, + "learning_rate": 0.0005449449653485527, + "loss": 3.6473, + "step": 53600 + }, + { + "epoch": 3.6421388775648866, + "grad_norm": 0.8730533719062805, + "learning_rate": 0.0005449025003397201, + "loss": 3.4814, + "step": 53605 + }, + { + "epoch": 3.6424785976355483, + "grad_norm": 0.9567104578018188, + "learning_rate": 0.0005448600353308874, + "loss": 3.4283, + "step": 53610 + }, + { + "epoch": 3.64281831770621, + "grad_norm": 1.0069283246994019, + "learning_rate": 0.0005448175703220546, + "loss": 3.5508, + "step": 53615 + }, + { + "epoch": 3.643158037776872, + "grad_norm": 0.9305693507194519, + "learning_rate": 0.000544775105313222, + "loss": 3.5009, + "step": 53620 + }, + { + "epoch": 3.6434977578475336, + "grad_norm": 0.8448309302330017, + "learning_rate": 0.0005447326403043892, + "loss": 3.6315, + "step": 53625 + }, + { + "epoch": 3.6438374779181952, + "grad_norm": 0.9750667810440063, + "learning_rate": 0.0005446901752955564, + "loss": 3.815, + "step": 53630 + }, + { + "epoch": 3.6441771979888573, + "grad_norm": 0.9001898169517517, + "learning_rate": 0.0005446477102867237, + "loss": 3.3954, + "step": 53635 + }, + { + "epoch": 3.644516918059519, + "grad_norm": 0.9333378672599792, + "learning_rate": 0.000544605245277891, + "loss": 3.6932, + "step": 53640 + }, + { + "epoch": 3.6448566381301806, + "grad_norm": 0.8549649119377136, + "learning_rate": 0.0005445627802690583, + "loss": 3.3477, + "step": 53645 + }, + { + "epoch": 3.6451963582008426, + "grad_norm": 0.9726544618606567, + "learning_rate": 0.0005445203152602256, + "loss": 3.5421, + "step": 53650 + }, + { + "epoch": 3.6455360782715043, + "grad_norm": 1.005571961402893, + "learning_rate": 0.0005444778502513929, + "loss": 3.5152, + "step": 53655 + }, + { + "epoch": 3.645875798342166, + "grad_norm": 1.0286338329315186, + "learning_rate": 0.0005444353852425601, + "loss": 3.6594, + "step": 53660 + }, + { + "epoch": 3.646215518412828, + "grad_norm": 1.4790140390396118, + "learning_rate": 0.0005443929202337274, + "loss": 3.5095, + "step": 53665 + }, + { + "epoch": 3.6465552384834896, + "grad_norm": 0.9264194369316101, + "learning_rate": 0.0005443504552248946, + "loss": 3.6157, + "step": 53670 + }, + { + "epoch": 3.6468949585541512, + "grad_norm": 1.1653603315353394, + "learning_rate": 0.0005443079902160619, + "loss": 3.462, + "step": 53675 + }, + { + "epoch": 3.6472346786248133, + "grad_norm": 0.9679402112960815, + "learning_rate": 0.0005442655252072293, + "loss": 3.3452, + "step": 53680 + }, + { + "epoch": 3.647574398695475, + "grad_norm": 0.7488239407539368, + "learning_rate": 0.0005442230601983965, + "loss": 3.5484, + "step": 53685 + }, + { + "epoch": 3.6479141187661366, + "grad_norm": 0.8363377451896667, + "learning_rate": 0.0005441805951895639, + "loss": 3.6879, + "step": 53690 + }, + { + "epoch": 3.6482538388367987, + "grad_norm": 0.8868919610977173, + "learning_rate": 0.0005441381301807311, + "loss": 3.6101, + "step": 53695 + }, + { + "epoch": 3.6485935589074603, + "grad_norm": 0.8799973130226135, + "learning_rate": 0.0005440956651718983, + "loss": 3.5139, + "step": 53700 + }, + { + "epoch": 3.648933278978122, + "grad_norm": 1.0484988689422607, + "learning_rate": 0.0005440532001630657, + "loss": 3.5057, + "step": 53705 + }, + { + "epoch": 3.649272999048784, + "grad_norm": 0.9297687411308289, + "learning_rate": 0.0005440107351542329, + "loss": 3.7489, + "step": 53710 + }, + { + "epoch": 3.6496127191194456, + "grad_norm": 3.2022736072540283, + "learning_rate": 0.0005439682701454002, + "loss": 3.738, + "step": 53715 + }, + { + "epoch": 3.6499524391901073, + "grad_norm": 0.8155837059020996, + "learning_rate": 0.0005439258051365676, + "loss": 3.5469, + "step": 53720 + }, + { + "epoch": 3.6502921592607693, + "grad_norm": 0.7643327713012695, + "learning_rate": 0.0005438833401277348, + "loss": 3.5259, + "step": 53725 + }, + { + "epoch": 3.650631879331431, + "grad_norm": 0.9980785250663757, + "learning_rate": 0.000543840875118902, + "loss": 3.5422, + "step": 53730 + }, + { + "epoch": 3.6509715994020926, + "grad_norm": 1.0344367027282715, + "learning_rate": 0.0005437984101100693, + "loss": 3.4025, + "step": 53735 + }, + { + "epoch": 3.6513113194727547, + "grad_norm": 1.1083860397338867, + "learning_rate": 0.0005437559451012366, + "loss": 3.4482, + "step": 53740 + }, + { + "epoch": 3.6516510395434163, + "grad_norm": 0.9622552990913391, + "learning_rate": 0.0005437134800924038, + "loss": 3.302, + "step": 53745 + }, + { + "epoch": 3.651990759614078, + "grad_norm": 0.7270735502243042, + "learning_rate": 0.0005436710150835712, + "loss": 3.4932, + "step": 53750 + }, + { + "epoch": 3.65233047968474, + "grad_norm": 0.922563374042511, + "learning_rate": 0.0005436285500747385, + "loss": 3.6933, + "step": 53755 + }, + { + "epoch": 3.6526701997554016, + "grad_norm": 0.8631471395492554, + "learning_rate": 0.0005435860850659057, + "loss": 3.4558, + "step": 53760 + }, + { + "epoch": 3.6530099198260633, + "grad_norm": 0.9657631516456604, + "learning_rate": 0.000543543620057073, + "loss": 3.5256, + "step": 53765 + }, + { + "epoch": 3.6533496398967253, + "grad_norm": 1.023979663848877, + "learning_rate": 0.0005435011550482403, + "loss": 3.3266, + "step": 53770 + }, + { + "epoch": 3.653689359967387, + "grad_norm": 0.8565627932548523, + "learning_rate": 0.0005434586900394075, + "loss": 3.6544, + "step": 53775 + }, + { + "epoch": 3.6540290800380486, + "grad_norm": 0.9227367639541626, + "learning_rate": 0.0005434162250305748, + "loss": 3.5818, + "step": 53780 + }, + { + "epoch": 3.6543688001087107, + "grad_norm": 0.8070738315582275, + "learning_rate": 0.0005433737600217421, + "loss": 3.553, + "step": 53785 + }, + { + "epoch": 3.6547085201793723, + "grad_norm": 0.9844295978546143, + "learning_rate": 0.0005433312950129094, + "loss": 3.4678, + "step": 53790 + }, + { + "epoch": 3.655048240250034, + "grad_norm": 0.9921348690986633, + "learning_rate": 0.0005432888300040767, + "loss": 3.57, + "step": 53795 + }, + { + "epoch": 3.655387960320696, + "grad_norm": 0.7508013248443604, + "learning_rate": 0.0005432463649952439, + "loss": 3.5179, + "step": 53800 + }, + { + "epoch": 3.6557276803913576, + "grad_norm": 1.0456613302230835, + "learning_rate": 0.0005432038999864112, + "loss": 3.5896, + "step": 53805 + }, + { + "epoch": 3.6560674004620193, + "grad_norm": 0.9474089741706848, + "learning_rate": 0.0005431614349775785, + "loss": 3.4187, + "step": 53810 + }, + { + "epoch": 3.656407120532681, + "grad_norm": 0.7284054756164551, + "learning_rate": 0.0005431189699687457, + "loss": 3.6133, + "step": 53815 + }, + { + "epoch": 3.656746840603343, + "grad_norm": 0.8597087264060974, + "learning_rate": 0.000543076504959913, + "loss": 3.6638, + "step": 53820 + }, + { + "epoch": 3.6570865606740046, + "grad_norm": 0.9193095564842224, + "learning_rate": 0.0005430340399510804, + "loss": 3.6053, + "step": 53825 + }, + { + "epoch": 3.6574262807446662, + "grad_norm": 0.9221349358558655, + "learning_rate": 0.0005429915749422476, + "loss": 3.3287, + "step": 53830 + }, + { + "epoch": 3.6577660008153283, + "grad_norm": 0.8691791892051697, + "learning_rate": 0.0005429491099334148, + "loss": 3.0786, + "step": 53835 + }, + { + "epoch": 3.65810572088599, + "grad_norm": 4.4507856369018555, + "learning_rate": 0.0005429066449245822, + "loss": 3.2415, + "step": 53840 + }, + { + "epoch": 3.6584454409566516, + "grad_norm": 2.2986631393432617, + "learning_rate": 0.0005428641799157494, + "loss": 3.5252, + "step": 53845 + }, + { + "epoch": 3.6587851610273137, + "grad_norm": 0.8343453407287598, + "learning_rate": 0.0005428217149069166, + "loss": 3.5374, + "step": 53850 + }, + { + "epoch": 3.6591248810979753, + "grad_norm": 0.9695400595664978, + "learning_rate": 0.0005427792498980841, + "loss": 3.5743, + "step": 53855 + }, + { + "epoch": 3.659464601168637, + "grad_norm": 0.8947219252586365, + "learning_rate": 0.0005427367848892513, + "loss": 3.4393, + "step": 53860 + }, + { + "epoch": 3.6598043212392986, + "grad_norm": 1.3693032264709473, + "learning_rate": 0.0005426943198804185, + "loss": 3.1978, + "step": 53865 + }, + { + "epoch": 3.6601440413099606, + "grad_norm": 0.879641592502594, + "learning_rate": 0.0005426518548715859, + "loss": 3.4752, + "step": 53870 + }, + { + "epoch": 3.6604837613806223, + "grad_norm": 1.6642051935195923, + "learning_rate": 0.0005426093898627531, + "loss": 3.5837, + "step": 53875 + }, + { + "epoch": 3.660823481451284, + "grad_norm": 0.7730083465576172, + "learning_rate": 0.0005425669248539203, + "loss": 3.4529, + "step": 53880 + }, + { + "epoch": 3.661163201521946, + "grad_norm": 0.7907399535179138, + "learning_rate": 0.0005425244598450876, + "loss": 3.2974, + "step": 53885 + }, + { + "epoch": 3.6615029215926076, + "grad_norm": 0.819017231464386, + "learning_rate": 0.000542481994836255, + "loss": 3.5617, + "step": 53890 + }, + { + "epoch": 3.6618426416632692, + "grad_norm": 0.9198362827301025, + "learning_rate": 0.0005424395298274222, + "loss": 3.7441, + "step": 53895 + }, + { + "epoch": 3.6621823617339313, + "grad_norm": 0.8231764435768127, + "learning_rate": 0.0005423970648185895, + "loss": 3.4841, + "step": 53900 + }, + { + "epoch": 3.662522081804593, + "grad_norm": 0.6553881764411926, + "learning_rate": 0.0005423545998097568, + "loss": 3.6976, + "step": 53905 + }, + { + "epoch": 3.6628618018752546, + "grad_norm": 0.9289422035217285, + "learning_rate": 0.000542312134800924, + "loss": 3.5366, + "step": 53910 + }, + { + "epoch": 3.6632015219459166, + "grad_norm": 1.0634040832519531, + "learning_rate": 0.0005422696697920913, + "loss": 3.4089, + "step": 53915 + }, + { + "epoch": 3.6635412420165783, + "grad_norm": 0.9060655832290649, + "learning_rate": 0.0005422272047832585, + "loss": 3.6058, + "step": 53920 + }, + { + "epoch": 3.66388096208724, + "grad_norm": 0.855936586856842, + "learning_rate": 0.0005421847397744259, + "loss": 3.6563, + "step": 53925 + }, + { + "epoch": 3.664220682157902, + "grad_norm": 0.7236250638961792, + "learning_rate": 0.0005421422747655932, + "loss": 3.7949, + "step": 53930 + }, + { + "epoch": 3.6645604022285636, + "grad_norm": 1.4241117238998413, + "learning_rate": 0.0005420998097567604, + "loss": 3.5621, + "step": 53935 + }, + { + "epoch": 3.6649001222992252, + "grad_norm": 0.7712810635566711, + "learning_rate": 0.0005420573447479277, + "loss": 3.6143, + "step": 53940 + }, + { + "epoch": 3.6652398423698873, + "grad_norm": 0.8639662265777588, + "learning_rate": 0.000542014879739095, + "loss": 3.4224, + "step": 53945 + }, + { + "epoch": 3.665579562440549, + "grad_norm": 0.8472933173179626, + "learning_rate": 0.0005419724147302622, + "loss": 3.3658, + "step": 53950 + }, + { + "epoch": 3.6659192825112106, + "grad_norm": 1.0413376092910767, + "learning_rate": 0.0005419299497214295, + "loss": 3.7552, + "step": 53955 + }, + { + "epoch": 3.6662590025818727, + "grad_norm": 0.7592529654502869, + "learning_rate": 0.0005418874847125969, + "loss": 3.5151, + "step": 53960 + }, + { + "epoch": 3.6665987226525343, + "grad_norm": 0.9555734992027283, + "learning_rate": 0.0005418450197037641, + "loss": 3.7284, + "step": 53965 + }, + { + "epoch": 3.666938442723196, + "grad_norm": 1.0836386680603027, + "learning_rate": 0.0005418025546949313, + "loss": 3.5565, + "step": 53970 + }, + { + "epoch": 3.667278162793858, + "grad_norm": 0.8123279809951782, + "learning_rate": 0.0005417600896860987, + "loss": 3.8932, + "step": 53975 + }, + { + "epoch": 3.6676178828645196, + "grad_norm": 0.8126285672187805, + "learning_rate": 0.0005417176246772659, + "loss": 3.6789, + "step": 53980 + }, + { + "epoch": 3.6679576029351812, + "grad_norm": 0.9076705574989319, + "learning_rate": 0.0005416751596684331, + "loss": 3.6491, + "step": 53985 + }, + { + "epoch": 3.6682973230058433, + "grad_norm": 0.9691588878631592, + "learning_rate": 0.0005416326946596006, + "loss": 3.2437, + "step": 53990 + }, + { + "epoch": 3.668637043076505, + "grad_norm": 0.8860732913017273, + "learning_rate": 0.0005415902296507678, + "loss": 3.5711, + "step": 53995 + }, + { + "epoch": 3.6689767631471666, + "grad_norm": 1.2037523984909058, + "learning_rate": 0.000541547764641935, + "loss": 3.6186, + "step": 54000 + }, + { + "epoch": 3.6693164832178287, + "grad_norm": 0.7662795186042786, + "learning_rate": 0.0005415052996331024, + "loss": 3.5319, + "step": 54005 + }, + { + "epoch": 3.6696562032884903, + "grad_norm": 0.6535143852233887, + "learning_rate": 0.0005414628346242696, + "loss": 3.4832, + "step": 54010 + }, + { + "epoch": 3.669995923359152, + "grad_norm": 0.7135093212127686, + "learning_rate": 0.0005414203696154368, + "loss": 3.5677, + "step": 54015 + }, + { + "epoch": 3.670335643429814, + "grad_norm": 0.7677327990531921, + "learning_rate": 0.0005413779046066041, + "loss": 3.5208, + "step": 54020 + }, + { + "epoch": 3.6706753635004756, + "grad_norm": 0.887317419052124, + "learning_rate": 0.0005413354395977715, + "loss": 3.5648, + "step": 54025 + }, + { + "epoch": 3.6710150835711373, + "grad_norm": 1.641862392425537, + "learning_rate": 0.0005412929745889388, + "loss": 3.5252, + "step": 54030 + }, + { + "epoch": 3.6713548036417993, + "grad_norm": 0.8488301038742065, + "learning_rate": 0.000541250509580106, + "loss": 3.554, + "step": 54035 + }, + { + "epoch": 3.671694523712461, + "grad_norm": 0.8101773262023926, + "learning_rate": 0.0005412080445712733, + "loss": 3.6836, + "step": 54040 + }, + { + "epoch": 3.6720342437831226, + "grad_norm": 1.0983421802520752, + "learning_rate": 0.0005411655795624406, + "loss": 3.6108, + "step": 54045 + }, + { + "epoch": 3.6723739638537847, + "grad_norm": 0.7626888751983643, + "learning_rate": 0.0005411231145536078, + "loss": 3.5134, + "step": 54050 + }, + { + "epoch": 3.6727136839244463, + "grad_norm": 1.0626847743988037, + "learning_rate": 0.000541080649544775, + "loss": 3.6446, + "step": 54055 + }, + { + "epoch": 3.673053403995108, + "grad_norm": 0.8966887593269348, + "learning_rate": 0.000541046677537709, + "loss": 3.7368, + "step": 54060 + }, + { + "epoch": 3.67339312406577, + "grad_norm": 0.8835975527763367, + "learning_rate": 0.0005410042125288762, + "loss": 3.5249, + "step": 54065 + }, + { + "epoch": 3.6737328441364316, + "grad_norm": 1.1847426891326904, + "learning_rate": 0.0005409617475200434, + "loss": 3.6213, + "step": 54070 + }, + { + "epoch": 3.6740725642070933, + "grad_norm": 0.7691404223442078, + "learning_rate": 0.0005409192825112108, + "loss": 3.819, + "step": 54075 + }, + { + "epoch": 3.6744122842777553, + "grad_norm": 0.8413029313087463, + "learning_rate": 0.000540876817502378, + "loss": 3.6733, + "step": 54080 + }, + { + "epoch": 3.674752004348417, + "grad_norm": 0.9577942490577698, + "learning_rate": 0.0005408343524935453, + "loss": 3.7113, + "step": 54085 + }, + { + "epoch": 3.6750917244190786, + "grad_norm": 0.747960090637207, + "learning_rate": 0.0005407918874847127, + "loss": 3.5449, + "step": 54090 + }, + { + "epoch": 3.6754314444897407, + "grad_norm": 0.9806613922119141, + "learning_rate": 0.0005407494224758799, + "loss": 3.1673, + "step": 54095 + }, + { + "epoch": 3.6757711645604023, + "grad_norm": 0.974738359451294, + "learning_rate": 0.0005407069574670471, + "loss": 3.684, + "step": 54100 + }, + { + "epoch": 3.676110884631064, + "grad_norm": 1.0366618633270264, + "learning_rate": 0.0005406644924582145, + "loss": 3.469, + "step": 54105 + }, + { + "epoch": 3.676450604701726, + "grad_norm": 1.0136120319366455, + "learning_rate": 0.0005406220274493817, + "loss": 3.5532, + "step": 54110 + }, + { + "epoch": 3.6767903247723877, + "grad_norm": 0.9588584303855896, + "learning_rate": 0.0005405795624405489, + "loss": 3.4836, + "step": 54115 + }, + { + "epoch": 3.6771300448430493, + "grad_norm": 0.9791717529296875, + "learning_rate": 0.0005405370974317163, + "loss": 3.5582, + "step": 54120 + }, + { + "epoch": 3.6774697649137114, + "grad_norm": 0.8912561535835266, + "learning_rate": 0.0005404946324228836, + "loss": 3.5432, + "step": 54125 + }, + { + "epoch": 3.677809484984373, + "grad_norm": 0.76885586977005, + "learning_rate": 0.0005404521674140508, + "loss": 3.3913, + "step": 54130 + }, + { + "epoch": 3.6781492050550346, + "grad_norm": 1.5039868354797363, + "learning_rate": 0.0005404097024052181, + "loss": 3.6912, + "step": 54135 + }, + { + "epoch": 3.6784889251256967, + "grad_norm": 0.9796516299247742, + "learning_rate": 0.0005403672373963854, + "loss": 3.6329, + "step": 54140 + }, + { + "epoch": 3.6788286451963583, + "grad_norm": 0.8144344687461853, + "learning_rate": 0.0005403247723875526, + "loss": 3.6149, + "step": 54145 + }, + { + "epoch": 3.67916836526702, + "grad_norm": 0.8316938877105713, + "learning_rate": 0.00054028230737872, + "loss": 3.5676, + "step": 54150 + }, + { + "epoch": 3.6795080853376816, + "grad_norm": 1.1190590858459473, + "learning_rate": 0.0005402398423698873, + "loss": 3.5446, + "step": 54155 + }, + { + "epoch": 3.6798478054083437, + "grad_norm": 0.7893803715705872, + "learning_rate": 0.0005401973773610545, + "loss": 3.504, + "step": 54160 + }, + { + "epoch": 3.6801875254790053, + "grad_norm": 1.1727056503295898, + "learning_rate": 0.0005401549123522218, + "loss": 3.6709, + "step": 54165 + }, + { + "epoch": 3.680527245549667, + "grad_norm": 0.8898587822914124, + "learning_rate": 0.000540112447343389, + "loss": 3.1534, + "step": 54170 + }, + { + "epoch": 3.680866965620329, + "grad_norm": 1.1187223196029663, + "learning_rate": 0.0005400699823345563, + "loss": 3.8611, + "step": 54175 + }, + { + "epoch": 3.6812066856909906, + "grad_norm": 0.6748635172843933, + "learning_rate": 0.0005400275173257236, + "loss": 3.3779, + "step": 54180 + }, + { + "epoch": 3.6815464057616523, + "grad_norm": 0.8489018678665161, + "learning_rate": 0.0005399850523168909, + "loss": 3.5482, + "step": 54185 + }, + { + "epoch": 3.6818861258323143, + "grad_norm": 0.8029678463935852, + "learning_rate": 0.0005399425873080582, + "loss": 3.5418, + "step": 54190 + }, + { + "epoch": 3.682225845902976, + "grad_norm": 0.7644438147544861, + "learning_rate": 0.0005399001222992255, + "loss": 3.3744, + "step": 54195 + }, + { + "epoch": 3.6825655659736376, + "grad_norm": 1.0178329944610596, + "learning_rate": 0.0005398576572903927, + "loss": 3.6184, + "step": 54200 + }, + { + "epoch": 3.6829052860442992, + "grad_norm": 0.8287317752838135, + "learning_rate": 0.0005398151922815599, + "loss": 3.51, + "step": 54205 + }, + { + "epoch": 3.6832450061149613, + "grad_norm": 1.0194584131240845, + "learning_rate": 0.0005397727272727273, + "loss": 3.618, + "step": 54210 + }, + { + "epoch": 3.683584726185623, + "grad_norm": 0.8088593482971191, + "learning_rate": 0.0005397302622638945, + "loss": 3.5754, + "step": 54215 + }, + { + "epoch": 3.6839244462562846, + "grad_norm": 0.7583014965057373, + "learning_rate": 0.0005396877972550618, + "loss": 3.4456, + "step": 54220 + }, + { + "epoch": 3.6842641663269466, + "grad_norm": 0.9968864917755127, + "learning_rate": 0.0005396453322462292, + "loss": 3.4505, + "step": 54225 + }, + { + "epoch": 3.6846038863976083, + "grad_norm": 0.8842389583587646, + "learning_rate": 0.0005396028672373964, + "loss": 3.6391, + "step": 54230 + }, + { + "epoch": 3.68494360646827, + "grad_norm": 0.7744782567024231, + "learning_rate": 0.0005395604022285637, + "loss": 3.5707, + "step": 54235 + }, + { + "epoch": 3.685283326538932, + "grad_norm": 1.3036378622055054, + "learning_rate": 0.000539517937219731, + "loss": 3.2669, + "step": 54240 + }, + { + "epoch": 3.6856230466095936, + "grad_norm": 0.9784154891967773, + "learning_rate": 0.0005394754722108982, + "loss": 3.5614, + "step": 54245 + }, + { + "epoch": 3.6859627666802552, + "grad_norm": 1.2701444625854492, + "learning_rate": 0.0005394330072020655, + "loss": 3.7136, + "step": 54250 + }, + { + "epoch": 3.6863024867509173, + "grad_norm": 1.0348261594772339, + "learning_rate": 0.0005393905421932329, + "loss": 3.6474, + "step": 54255 + }, + { + "epoch": 3.686642206821579, + "grad_norm": 0.7751639485359192, + "learning_rate": 0.0005393480771844001, + "loss": 3.3143, + "step": 54260 + }, + { + "epoch": 3.6869819268922406, + "grad_norm": 0.6220433115959167, + "learning_rate": 0.0005393056121755674, + "loss": 3.6813, + "step": 54265 + }, + { + "epoch": 3.6873216469629027, + "grad_norm": 0.7965081930160522, + "learning_rate": 0.0005392631471667346, + "loss": 3.5256, + "step": 54270 + }, + { + "epoch": 3.6876613670335643, + "grad_norm": 0.7906827926635742, + "learning_rate": 0.0005392206821579019, + "loss": 3.4078, + "step": 54275 + }, + { + "epoch": 3.688001087104226, + "grad_norm": 0.6934220790863037, + "learning_rate": 0.0005391782171490692, + "loss": 3.6478, + "step": 54280 + }, + { + "epoch": 3.688340807174888, + "grad_norm": 0.8303923606872559, + "learning_rate": 0.0005391357521402364, + "loss": 3.6095, + "step": 54285 + }, + { + "epoch": 3.6886805272455496, + "grad_norm": 0.7225098013877869, + "learning_rate": 0.0005390932871314038, + "loss": 3.526, + "step": 54290 + }, + { + "epoch": 3.6890202473162113, + "grad_norm": 0.6876189112663269, + "learning_rate": 0.0005390508221225711, + "loss": 3.43, + "step": 54295 + }, + { + "epoch": 3.6893599673868733, + "grad_norm": 1.0093861818313599, + "learning_rate": 0.0005390083571137383, + "loss": 3.5287, + "step": 54300 + }, + { + "epoch": 3.689699687457535, + "grad_norm": 1.010561466217041, + "learning_rate": 0.0005389658921049055, + "loss": 3.6162, + "step": 54305 + }, + { + "epoch": 3.6900394075281966, + "grad_norm": 0.8978577256202698, + "learning_rate": 0.0005389234270960729, + "loss": 3.2227, + "step": 54310 + }, + { + "epoch": 3.6903791275988587, + "grad_norm": 0.8329516649246216, + "learning_rate": 0.0005388809620872401, + "loss": 3.6002, + "step": 54315 + }, + { + "epoch": 3.6907188476695203, + "grad_norm": 0.7605953812599182, + "learning_rate": 0.0005388384970784073, + "loss": 3.7521, + "step": 54320 + }, + { + "epoch": 3.691058567740182, + "grad_norm": 0.9332346320152283, + "learning_rate": 0.0005387960320695748, + "loss": 3.4737, + "step": 54325 + }, + { + "epoch": 3.691398287810844, + "grad_norm": 0.6883199214935303, + "learning_rate": 0.000538753567060742, + "loss": 3.385, + "step": 54330 + }, + { + "epoch": 3.6917380078815056, + "grad_norm": 0.9560381174087524, + "learning_rate": 0.0005387111020519092, + "loss": 3.5159, + "step": 54335 + }, + { + "epoch": 3.6920777279521673, + "grad_norm": 0.8032732009887695, + "learning_rate": 0.0005386686370430766, + "loss": 3.2552, + "step": 54340 + }, + { + "epoch": 3.6924174480228293, + "grad_norm": 0.7945679426193237, + "learning_rate": 0.0005386261720342438, + "loss": 3.4358, + "step": 54345 + }, + { + "epoch": 3.692757168093491, + "grad_norm": 0.7396383285522461, + "learning_rate": 0.000538583707025411, + "loss": 3.7597, + "step": 54350 + }, + { + "epoch": 3.6930968881641526, + "grad_norm": 0.8277559280395508, + "learning_rate": 0.0005385412420165783, + "loss": 3.6322, + "step": 54355 + }, + { + "epoch": 3.6934366082348147, + "grad_norm": 0.8260045051574707, + "learning_rate": 0.0005384987770077457, + "loss": 3.4902, + "step": 54360 + }, + { + "epoch": 3.6937763283054763, + "grad_norm": 0.6847435235977173, + "learning_rate": 0.0005384563119989129, + "loss": 3.4793, + "step": 54365 + }, + { + "epoch": 3.694116048376138, + "grad_norm": 0.7929403185844421, + "learning_rate": 0.0005384138469900802, + "loss": 3.4366, + "step": 54370 + }, + { + "epoch": 3.6944557684468, + "grad_norm": 1.007776141166687, + "learning_rate": 0.0005383713819812475, + "loss": 3.6566, + "step": 54375 + }, + { + "epoch": 3.6947954885174616, + "grad_norm": 0.7812309265136719, + "learning_rate": 0.0005383289169724147, + "loss": 3.1347, + "step": 54380 + }, + { + "epoch": 3.6951352085881233, + "grad_norm": 0.9151851534843445, + "learning_rate": 0.000538286451963582, + "loss": 3.415, + "step": 54385 + }, + { + "epoch": 3.6954749286587854, + "grad_norm": 0.944149911403656, + "learning_rate": 0.0005382439869547493, + "loss": 3.5124, + "step": 54390 + }, + { + "epoch": 3.695814648729447, + "grad_norm": 0.8862351775169373, + "learning_rate": 0.0005382015219459166, + "loss": 3.3679, + "step": 54395 + }, + { + "epoch": 3.6961543688001086, + "grad_norm": 1.1382607221603394, + "learning_rate": 0.0005381590569370839, + "loss": 3.4091, + "step": 54400 + }, + { + "epoch": 3.6964940888707707, + "grad_norm": 0.8045135736465454, + "learning_rate": 0.0005381165919282511, + "loss": 3.7042, + "step": 54405 + }, + { + "epoch": 3.6968338089414323, + "grad_norm": 0.809064507484436, + "learning_rate": 0.0005380741269194184, + "loss": 3.5757, + "step": 54410 + }, + { + "epoch": 3.697173529012094, + "grad_norm": 1.0329324007034302, + "learning_rate": 0.0005380316619105857, + "loss": 3.4736, + "step": 54415 + }, + { + "epoch": 3.697513249082756, + "grad_norm": 0.9891685843467712, + "learning_rate": 0.0005379891969017529, + "loss": 3.5582, + "step": 54420 + }, + { + "epoch": 3.6978529691534177, + "grad_norm": 0.855061948299408, + "learning_rate": 0.0005379467318929202, + "loss": 3.4013, + "step": 54425 + }, + { + "epoch": 3.6981926892240793, + "grad_norm": 0.9201240539550781, + "learning_rate": 0.0005379042668840876, + "loss": 3.3518, + "step": 54430 + }, + { + "epoch": 3.6985324092947414, + "grad_norm": 0.949068009853363, + "learning_rate": 0.0005378618018752548, + "loss": 3.5397, + "step": 54435 + }, + { + "epoch": 3.698872129365403, + "grad_norm": 0.8309175372123718, + "learning_rate": 0.000537819336866422, + "loss": 3.4629, + "step": 54440 + }, + { + "epoch": 3.6992118494360646, + "grad_norm": 0.9338887333869934, + "learning_rate": 0.0005377768718575894, + "loss": 3.561, + "step": 54445 + }, + { + "epoch": 3.6995515695067267, + "grad_norm": 0.7932783961296082, + "learning_rate": 0.0005377344068487566, + "loss": 3.4039, + "step": 54450 + }, + { + "epoch": 3.6998912895773883, + "grad_norm": 0.996977686882019, + "learning_rate": 0.0005376919418399238, + "loss": 3.7304, + "step": 54455 + }, + { + "epoch": 3.70023100964805, + "grad_norm": 7.821020126342773, + "learning_rate": 0.0005376494768310912, + "loss": 3.5286, + "step": 54460 + }, + { + "epoch": 3.700570729718712, + "grad_norm": 0.7767088413238525, + "learning_rate": 0.0005376070118222585, + "loss": 3.4414, + "step": 54465 + }, + { + "epoch": 3.7009104497893737, + "grad_norm": 0.7038832902908325, + "learning_rate": 0.0005375645468134257, + "loss": 3.5359, + "step": 54470 + }, + { + "epoch": 3.7012501698600353, + "grad_norm": 0.748259961605072, + "learning_rate": 0.0005375220818045931, + "loss": 3.5922, + "step": 54475 + }, + { + "epoch": 3.7015898899306974, + "grad_norm": 0.922260582447052, + "learning_rate": 0.0005374796167957603, + "loss": 3.5345, + "step": 54480 + }, + { + "epoch": 3.701929610001359, + "grad_norm": 3.8083419799804688, + "learning_rate": 0.0005374371517869275, + "loss": 3.5284, + "step": 54485 + }, + { + "epoch": 3.7022693300720206, + "grad_norm": 0.8823128938674927, + "learning_rate": 0.0005373946867780949, + "loss": 3.5432, + "step": 54490 + }, + { + "epoch": 3.7026090501426823, + "grad_norm": 0.9568198919296265, + "learning_rate": 0.0005373522217692621, + "loss": 3.5678, + "step": 54495 + }, + { + "epoch": 3.7029487702133443, + "grad_norm": 0.7618738412857056, + "learning_rate": 0.0005373097567604294, + "loss": 3.5271, + "step": 54500 + }, + { + "epoch": 3.703288490284006, + "grad_norm": 0.8264259099960327, + "learning_rate": 0.0005372672917515967, + "loss": 3.4868, + "step": 54505 + }, + { + "epoch": 3.7036282103546676, + "grad_norm": 1.2852931022644043, + "learning_rate": 0.000537224826742764, + "loss": 3.7262, + "step": 54510 + }, + { + "epoch": 3.7039679304253297, + "grad_norm": 0.8190256357192993, + "learning_rate": 0.0005371823617339312, + "loss": 3.2206, + "step": 54515 + }, + { + "epoch": 3.7043076504959913, + "grad_norm": 1.0484867095947266, + "learning_rate": 0.0005371398967250985, + "loss": 3.4134, + "step": 54520 + }, + { + "epoch": 3.704647370566653, + "grad_norm": 0.8543711304664612, + "learning_rate": 0.0005370974317162658, + "loss": 3.6826, + "step": 54525 + }, + { + "epoch": 3.704987090637315, + "grad_norm": 0.8401758074760437, + "learning_rate": 0.000537054966707433, + "loss": 3.5726, + "step": 54530 + }, + { + "epoch": 3.7053268107079766, + "grad_norm": 0.8125331401824951, + "learning_rate": 0.0005370125016986004, + "loss": 3.4772, + "step": 54535 + }, + { + "epoch": 3.7056665307786383, + "grad_norm": 0.9186561703681946, + "learning_rate": 0.0005369700366897677, + "loss": 3.6217, + "step": 54540 + }, + { + "epoch": 3.7060062508493, + "grad_norm": 0.7453955411911011, + "learning_rate": 0.0005369275716809349, + "loss": 3.6168, + "step": 54545 + }, + { + "epoch": 3.706345970919962, + "grad_norm": 1.0691556930541992, + "learning_rate": 0.0005368851066721022, + "loss": 3.5342, + "step": 54550 + }, + { + "epoch": 3.7066856909906236, + "grad_norm": 0.9565544128417969, + "learning_rate": 0.0005368426416632694, + "loss": 3.4415, + "step": 54555 + }, + { + "epoch": 3.7070254110612852, + "grad_norm": 2.180410385131836, + "learning_rate": 0.0005368001766544367, + "loss": 3.5547, + "step": 54560 + }, + { + "epoch": 3.7073651311319473, + "grad_norm": 0.814058780670166, + "learning_rate": 0.000536757711645604, + "loss": 3.7288, + "step": 54565 + }, + { + "epoch": 3.707704851202609, + "grad_norm": 1.0187994241714478, + "learning_rate": 0.0005367152466367713, + "loss": 3.3851, + "step": 54570 + }, + { + "epoch": 3.7080445712732706, + "grad_norm": 0.831760585308075, + "learning_rate": 0.0005366727816279387, + "loss": 3.4994, + "step": 54575 + }, + { + "epoch": 3.7083842913439327, + "grad_norm": 0.7636042833328247, + "learning_rate": 0.0005366303166191059, + "loss": 3.5339, + "step": 54580 + }, + { + "epoch": 3.7087240114145943, + "grad_norm": 0.9361118674278259, + "learning_rate": 0.0005365878516102731, + "loss": 3.2918, + "step": 54585 + }, + { + "epoch": 3.709063731485256, + "grad_norm": 0.9369881749153137, + "learning_rate": 0.0005365453866014405, + "loss": 3.7324, + "step": 54590 + }, + { + "epoch": 3.709403451555918, + "grad_norm": 0.84767746925354, + "learning_rate": 0.0005365029215926077, + "loss": 3.5391, + "step": 54595 + }, + { + "epoch": 3.7097431716265796, + "grad_norm": 0.9156890511512756, + "learning_rate": 0.0005364604565837749, + "loss": 3.4717, + "step": 54600 + }, + { + "epoch": 3.7100828916972413, + "grad_norm": 0.8554109334945679, + "learning_rate": 0.0005364179915749424, + "loss": 3.5459, + "step": 54605 + }, + { + "epoch": 3.7104226117679033, + "grad_norm": 2.821526527404785, + "learning_rate": 0.0005363755265661096, + "loss": 3.3429, + "step": 54610 + }, + { + "epoch": 3.710762331838565, + "grad_norm": 0.853610098361969, + "learning_rate": 0.0005363330615572768, + "loss": 3.2225, + "step": 54615 + }, + { + "epoch": 3.7111020519092266, + "grad_norm": 0.7477685213088989, + "learning_rate": 0.0005362905965484441, + "loss": 3.3099, + "step": 54620 + }, + { + "epoch": 3.7114417719798887, + "grad_norm": 0.8036760687828064, + "learning_rate": 0.0005362481315396114, + "loss": 3.398, + "step": 54625 + }, + { + "epoch": 3.7117814920505503, + "grad_norm": 1.1480666399002075, + "learning_rate": 0.0005362056665307786, + "loss": 3.8355, + "step": 54630 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.7095838189125061, + "learning_rate": 0.0005361632015219459, + "loss": 3.6385, + "step": 54635 + }, + { + "epoch": 3.712460932191874, + "grad_norm": 0.9823459386825562, + "learning_rate": 0.0005361207365131133, + "loss": 3.5069, + "step": 54640 + }, + { + "epoch": 3.7128006522625356, + "grad_norm": 0.9177284836769104, + "learning_rate": 0.0005360782715042805, + "loss": 3.558, + "step": 54645 + }, + { + "epoch": 3.7131403723331973, + "grad_norm": 0.9842345714569092, + "learning_rate": 0.0005360358064954478, + "loss": 3.4071, + "step": 54650 + }, + { + "epoch": 3.7134800924038593, + "grad_norm": 1.143006443977356, + "learning_rate": 0.000535993341486615, + "loss": 3.5398, + "step": 54655 + }, + { + "epoch": 3.713819812474521, + "grad_norm": 0.850955069065094, + "learning_rate": 0.0005359508764777823, + "loss": 3.2657, + "step": 54660 + }, + { + "epoch": 3.7141595325451826, + "grad_norm": 0.9978475570678711, + "learning_rate": 0.0005359084114689496, + "loss": 3.3555, + "step": 54665 + }, + { + "epoch": 3.7144992526158447, + "grad_norm": 0.8020433187484741, + "learning_rate": 0.0005358659464601168, + "loss": 3.5846, + "step": 54670 + }, + { + "epoch": 3.7148389726865063, + "grad_norm": 0.7511733174324036, + "learning_rate": 0.0005358234814512842, + "loss": 3.5052, + "step": 54675 + }, + { + "epoch": 3.715178692757168, + "grad_norm": 1.1383249759674072, + "learning_rate": 0.0005357810164424515, + "loss": 3.3825, + "step": 54680 + }, + { + "epoch": 3.71551841282783, + "grad_norm": 0.934053897857666, + "learning_rate": 0.0005357385514336187, + "loss": 3.5681, + "step": 54685 + }, + { + "epoch": 3.7158581328984917, + "grad_norm": 1.0155646800994873, + "learning_rate": 0.000535696086424786, + "loss": 3.6761, + "step": 54690 + }, + { + "epoch": 3.7161978529691533, + "grad_norm": 0.8193019032478333, + "learning_rate": 0.0005356536214159533, + "loss": 3.5868, + "step": 54695 + }, + { + "epoch": 3.7165375730398154, + "grad_norm": 0.8471525311470032, + "learning_rate": 0.0005356111564071205, + "loss": 3.637, + "step": 54700 + }, + { + "epoch": 3.716877293110477, + "grad_norm": 0.9972643852233887, + "learning_rate": 0.0005355686913982877, + "loss": 3.6231, + "step": 54705 + }, + { + "epoch": 3.7172170131811386, + "grad_norm": 0.7218445539474487, + "learning_rate": 0.0005355262263894552, + "loss": 3.6182, + "step": 54710 + }, + { + "epoch": 3.7175567332518007, + "grad_norm": 0.6964718699455261, + "learning_rate": 0.0005354837613806224, + "loss": 3.3818, + "step": 54715 + }, + { + "epoch": 3.7178964533224623, + "grad_norm": 0.7517160177230835, + "learning_rate": 0.0005354412963717896, + "loss": 3.4669, + "step": 54720 + }, + { + "epoch": 3.718236173393124, + "grad_norm": 0.858527421951294, + "learning_rate": 0.000535398831362957, + "loss": 3.4696, + "step": 54725 + }, + { + "epoch": 3.718575893463786, + "grad_norm": 0.8840001821517944, + "learning_rate": 0.0005353563663541242, + "loss": 3.5403, + "step": 54730 + }, + { + "epoch": 3.7189156135344477, + "grad_norm": 0.7905082702636719, + "learning_rate": 0.0005353139013452914, + "loss": 3.2079, + "step": 54735 + }, + { + "epoch": 3.7192553336051093, + "grad_norm": 0.9871233105659485, + "learning_rate": 0.0005352714363364589, + "loss": 3.5791, + "step": 54740 + }, + { + "epoch": 3.7195950536757714, + "grad_norm": 1.0439120531082153, + "learning_rate": 0.0005352289713276261, + "loss": 3.4803, + "step": 54745 + }, + { + "epoch": 3.719934773746433, + "grad_norm": 1.3573912382125854, + "learning_rate": 0.0005351865063187933, + "loss": 3.5948, + "step": 54750 + }, + { + "epoch": 3.7202744938170946, + "grad_norm": 0.658374011516571, + "learning_rate": 0.0005351440413099606, + "loss": 3.5918, + "step": 54755 + }, + { + "epoch": 3.7206142138877567, + "grad_norm": 0.654848575592041, + "learning_rate": 0.0005351015763011279, + "loss": 3.5778, + "step": 54760 + }, + { + "epoch": 3.7209539339584183, + "grad_norm": 0.6874385476112366, + "learning_rate": 0.0005350591112922951, + "loss": 3.3851, + "step": 54765 + }, + { + "epoch": 3.72129365402908, + "grad_norm": 0.9272044897079468, + "learning_rate": 0.0005350166462834624, + "loss": 3.4977, + "step": 54770 + }, + { + "epoch": 3.721633374099742, + "grad_norm": 0.8894643187522888, + "learning_rate": 0.0005349741812746298, + "loss": 3.3439, + "step": 54775 + }, + { + "epoch": 3.7219730941704037, + "grad_norm": 1.0387922525405884, + "learning_rate": 0.000534931716265797, + "loss": 3.6642, + "step": 54780 + }, + { + "epoch": 3.7223128142410653, + "grad_norm": 0.8908992409706116, + "learning_rate": 0.0005348892512569643, + "loss": 3.3051, + "step": 54785 + }, + { + "epoch": 3.7226525343117274, + "grad_norm": 0.9173659682273865, + "learning_rate": 0.0005348467862481316, + "loss": 3.7051, + "step": 54790 + }, + { + "epoch": 3.722992254382389, + "grad_norm": 0.9139732122421265, + "learning_rate": 0.0005348043212392988, + "loss": 3.5607, + "step": 54795 + }, + { + "epoch": 3.7233319744530506, + "grad_norm": 0.7579687833786011, + "learning_rate": 0.0005347618562304661, + "loss": 3.2559, + "step": 54800 + }, + { + "epoch": 3.7236716945237127, + "grad_norm": 0.8060002326965332, + "learning_rate": 0.0005347193912216333, + "loss": 3.4843, + "step": 54805 + }, + { + "epoch": 3.7240114145943743, + "grad_norm": 1.025412678718567, + "learning_rate": 0.0005346769262128007, + "loss": 3.4916, + "step": 54810 + }, + { + "epoch": 3.724351134665036, + "grad_norm": 0.8227401375770569, + "learning_rate": 0.000534634461203968, + "loss": 3.3547, + "step": 54815 + }, + { + "epoch": 3.724690854735698, + "grad_norm": 0.9234954714775085, + "learning_rate": 0.0005345919961951352, + "loss": 3.7813, + "step": 54820 + }, + { + "epoch": 3.7250305748063597, + "grad_norm": 0.9258533716201782, + "learning_rate": 0.0005345495311863025, + "loss": 3.358, + "step": 54825 + }, + { + "epoch": 3.7253702948770213, + "grad_norm": 1.0237486362457275, + "learning_rate": 0.0005345070661774698, + "loss": 3.3643, + "step": 54830 + }, + { + "epoch": 3.725710014947683, + "grad_norm": 0.8670133948326111, + "learning_rate": 0.000534464601168637, + "loss": 3.4656, + "step": 54835 + }, + { + "epoch": 3.726049735018345, + "grad_norm": 0.8724098801612854, + "learning_rate": 0.0005344221361598042, + "loss": 3.4246, + "step": 54840 + }, + { + "epoch": 3.7263894550890067, + "grad_norm": 0.7598333358764648, + "learning_rate": 0.0005343796711509717, + "loss": 3.5582, + "step": 54845 + }, + { + "epoch": 3.7267291751596683, + "grad_norm": 0.8060013055801392, + "learning_rate": 0.0005343372061421389, + "loss": 3.1932, + "step": 54850 + }, + { + "epoch": 3.7270688952303304, + "grad_norm": 0.8815630674362183, + "learning_rate": 0.0005342947411333061, + "loss": 3.5669, + "step": 54855 + }, + { + "epoch": 3.727408615300992, + "grad_norm": 0.824000358581543, + "learning_rate": 0.0005342522761244735, + "loss": 3.629, + "step": 54860 + }, + { + "epoch": 3.7277483353716536, + "grad_norm": 0.8391299843788147, + "learning_rate": 0.0005342098111156407, + "loss": 3.7632, + "step": 54865 + }, + { + "epoch": 3.7280880554423157, + "grad_norm": 1.0810083150863647, + "learning_rate": 0.0005341673461068079, + "loss": 3.5569, + "step": 54870 + }, + { + "epoch": 3.7284277755129773, + "grad_norm": 0.8473385572433472, + "learning_rate": 0.0005341248810979753, + "loss": 3.403, + "step": 54875 + }, + { + "epoch": 3.728767495583639, + "grad_norm": 0.8844379782676697, + "learning_rate": 0.0005340824160891426, + "loss": 3.4691, + "step": 54880 + }, + { + "epoch": 3.7291072156543006, + "grad_norm": 0.8988941311836243, + "learning_rate": 0.0005340399510803098, + "loss": 3.4963, + "step": 54885 + }, + { + "epoch": 3.7294469357249627, + "grad_norm": 1.094560980796814, + "learning_rate": 0.0005339974860714772, + "loss": 3.6129, + "step": 54890 + }, + { + "epoch": 3.7297866557956243, + "grad_norm": 1.1015762090682983, + "learning_rate": 0.0005339550210626444, + "loss": 3.6425, + "step": 54895 + }, + { + "epoch": 3.730126375866286, + "grad_norm": 1.0380468368530273, + "learning_rate": 0.0005339125560538116, + "loss": 3.4358, + "step": 54900 + }, + { + "epoch": 3.730466095936948, + "grad_norm": 0.8351988196372986, + "learning_rate": 0.0005338700910449789, + "loss": 3.3805, + "step": 54905 + }, + { + "epoch": 3.7308058160076096, + "grad_norm": 0.9078689813613892, + "learning_rate": 0.0005338276260361462, + "loss": 3.5684, + "step": 54910 + }, + { + "epoch": 3.7311455360782713, + "grad_norm": 0.9259300231933594, + "learning_rate": 0.0005337851610273136, + "loss": 3.4132, + "step": 54915 + }, + { + "epoch": 3.7314852561489333, + "grad_norm": 0.7662572264671326, + "learning_rate": 0.0005337426960184808, + "loss": 3.3829, + "step": 54920 + }, + { + "epoch": 3.731824976219595, + "grad_norm": 0.8535485863685608, + "learning_rate": 0.0005337002310096481, + "loss": 3.5451, + "step": 54925 + }, + { + "epoch": 3.7321646962902566, + "grad_norm": 0.9252389669418335, + "learning_rate": 0.0005336577660008154, + "loss": 3.446, + "step": 54930 + }, + { + "epoch": 3.7325044163609187, + "grad_norm": 0.7652342319488525, + "learning_rate": 0.0005336153009919826, + "loss": 3.5884, + "step": 54935 + }, + { + "epoch": 3.7328441364315803, + "grad_norm": 0.7747856974601746, + "learning_rate": 0.0005335728359831498, + "loss": 3.4759, + "step": 54940 + }, + { + "epoch": 3.733183856502242, + "grad_norm": 0.9390807151794434, + "learning_rate": 0.0005335303709743172, + "loss": 3.3596, + "step": 54945 + }, + { + "epoch": 3.733523576572904, + "grad_norm": 1.0120669603347778, + "learning_rate": 0.0005334879059654845, + "loss": 3.3746, + "step": 54950 + }, + { + "epoch": 3.7338632966435656, + "grad_norm": 1.3688359260559082, + "learning_rate": 0.0005334454409566517, + "loss": 3.682, + "step": 54955 + }, + { + "epoch": 3.7342030167142273, + "grad_norm": 0.8044041991233826, + "learning_rate": 0.0005334029759478191, + "loss": 3.588, + "step": 54960 + }, + { + "epoch": 3.7345427367848893, + "grad_norm": 0.8782764673233032, + "learning_rate": 0.0005333605109389863, + "loss": 3.4441, + "step": 54965 + }, + { + "epoch": 3.734882456855551, + "grad_norm": 0.7561390399932861, + "learning_rate": 0.0005333180459301535, + "loss": 3.5273, + "step": 54970 + }, + { + "epoch": 3.7352221769262126, + "grad_norm": 0.9201357960700989, + "learning_rate": 0.0005332755809213209, + "loss": 3.6334, + "step": 54975 + }, + { + "epoch": 3.7355618969968747, + "grad_norm": 1.4937915802001953, + "learning_rate": 0.0005332331159124881, + "loss": 3.6053, + "step": 54980 + }, + { + "epoch": 3.7359016170675363, + "grad_norm": 0.7815308570861816, + "learning_rate": 0.0005331906509036554, + "loss": 3.6802, + "step": 54985 + }, + { + "epoch": 3.736241337138198, + "grad_norm": 0.8350999355316162, + "learning_rate": 0.0005331481858948228, + "loss": 3.4881, + "step": 54990 + }, + { + "epoch": 3.73658105720886, + "grad_norm": 0.9744027853012085, + "learning_rate": 0.00053310572088599, + "loss": 3.6761, + "step": 54995 + }, + { + "epoch": 3.7369207772795217, + "grad_norm": 0.9894617199897766, + "learning_rate": 0.0005330632558771572, + "loss": 3.416, + "step": 55000 + }, + { + "epoch": 3.7372604973501833, + "grad_norm": 0.7445394992828369, + "learning_rate": 0.0005330207908683245, + "loss": 3.5047, + "step": 55005 + }, + { + "epoch": 3.7376002174208454, + "grad_norm": 0.8940715193748474, + "learning_rate": 0.0005329783258594918, + "loss": 3.5405, + "step": 55010 + }, + { + "epoch": 3.737939937491507, + "grad_norm": 0.9121837615966797, + "learning_rate": 0.000532935860850659, + "loss": 3.7374, + "step": 55015 + }, + { + "epoch": 3.7382796575621686, + "grad_norm": 1.1095999479293823, + "learning_rate": 0.0005328933958418264, + "loss": 3.7395, + "step": 55020 + }, + { + "epoch": 3.7386193776328307, + "grad_norm": 1.0399552583694458, + "learning_rate": 0.0005328509308329937, + "loss": 3.4144, + "step": 55025 + }, + { + "epoch": 3.7389590977034923, + "grad_norm": 1.083461880683899, + "learning_rate": 0.0005328084658241609, + "loss": 3.5318, + "step": 55030 + }, + { + "epoch": 3.739298817774154, + "grad_norm": 1.1159976720809937, + "learning_rate": 0.0005327660008153282, + "loss": 3.3818, + "step": 55035 + }, + { + "epoch": 3.739638537844816, + "grad_norm": 1.207781434059143, + "learning_rate": 0.0005327235358064954, + "loss": 3.5264, + "step": 55040 + }, + { + "epoch": 3.7399782579154777, + "grad_norm": 0.8892134428024292, + "learning_rate": 0.0005326810707976627, + "loss": 3.0759, + "step": 55045 + }, + { + "epoch": 3.7403179779861393, + "grad_norm": 0.9831871390342712, + "learning_rate": 0.00053263860578883, + "loss": 3.3487, + "step": 55050 + }, + { + "epoch": 3.7406576980568014, + "grad_norm": 0.8840104341506958, + "learning_rate": 0.0005325961407799973, + "loss": 3.6132, + "step": 55055 + }, + { + "epoch": 3.740997418127463, + "grad_norm": 0.9139968156814575, + "learning_rate": 0.0005325536757711646, + "loss": 3.591, + "step": 55060 + }, + { + "epoch": 3.7413371381981246, + "grad_norm": 0.8722597360610962, + "learning_rate": 0.0005325112107623319, + "loss": 3.562, + "step": 55065 + }, + { + "epoch": 3.7416768582687867, + "grad_norm": 0.9021174907684326, + "learning_rate": 0.0005324687457534991, + "loss": 3.7046, + "step": 55070 + }, + { + "epoch": 3.7420165783394483, + "grad_norm": 0.8170347809791565, + "learning_rate": 0.0005324262807446664, + "loss": 3.8767, + "step": 55075 + }, + { + "epoch": 3.74235629841011, + "grad_norm": 0.7089269757270813, + "learning_rate": 0.0005323838157358337, + "loss": 3.2882, + "step": 55080 + }, + { + "epoch": 3.742696018480772, + "grad_norm": 0.8517564535140991, + "learning_rate": 0.0005323413507270009, + "loss": 3.334, + "step": 55085 + }, + { + "epoch": 3.7430357385514337, + "grad_norm": 1.3590353727340698, + "learning_rate": 0.0005322988857181682, + "loss": 3.4143, + "step": 55090 + }, + { + "epoch": 3.7433754586220953, + "grad_norm": 0.8649422526359558, + "learning_rate": 0.0005322564207093356, + "loss": 3.4744, + "step": 55095 + }, + { + "epoch": 3.7437151786927574, + "grad_norm": 1.140626072883606, + "learning_rate": 0.0005322139557005028, + "loss": 3.4698, + "step": 55100 + }, + { + "epoch": 3.744054898763419, + "grad_norm": 0.8868101239204407, + "learning_rate": 0.00053217149069167, + "loss": 3.2847, + "step": 55105 + }, + { + "epoch": 3.7443946188340806, + "grad_norm": 1.0726596117019653, + "learning_rate": 0.0005321290256828374, + "loss": 3.4627, + "step": 55110 + }, + { + "epoch": 3.7447343389047427, + "grad_norm": 0.9685164093971252, + "learning_rate": 0.0005320865606740046, + "loss": 3.6135, + "step": 55115 + }, + { + "epoch": 3.7450740589754044, + "grad_norm": 0.73788982629776, + "learning_rate": 0.0005320440956651718, + "loss": 3.1723, + "step": 55120 + }, + { + "epoch": 3.745413779046066, + "grad_norm": 0.9749637246131897, + "learning_rate": 0.0005320016306563393, + "loss": 3.1178, + "step": 55125 + }, + { + "epoch": 3.745753499116728, + "grad_norm": 0.8260735869407654, + "learning_rate": 0.0005319591656475065, + "loss": 3.5449, + "step": 55130 + }, + { + "epoch": 3.7460932191873897, + "grad_norm": 0.9066513180732727, + "learning_rate": 0.0005319167006386737, + "loss": 3.5402, + "step": 55135 + }, + { + "epoch": 3.7464329392580513, + "grad_norm": 0.6825814247131348, + "learning_rate": 0.000531874235629841, + "loss": 3.3844, + "step": 55140 + }, + { + "epoch": 3.7467726593287134, + "grad_norm": 1.0481066703796387, + "learning_rate": 0.0005318317706210083, + "loss": 3.2837, + "step": 55145 + }, + { + "epoch": 3.747112379399375, + "grad_norm": 0.7989221811294556, + "learning_rate": 0.0005317893056121755, + "loss": 3.553, + "step": 55150 + }, + { + "epoch": 3.7474520994700367, + "grad_norm": 1.2207711935043335, + "learning_rate": 0.0005317468406033428, + "loss": 3.5926, + "step": 55155 + }, + { + "epoch": 3.7477918195406987, + "grad_norm": 0.9532440900802612, + "learning_rate": 0.0005317043755945102, + "loss": 3.4526, + "step": 55160 + }, + { + "epoch": 3.7481315396113604, + "grad_norm": 0.9052656888961792, + "learning_rate": 0.0005316619105856774, + "loss": 3.5859, + "step": 55165 + }, + { + "epoch": 3.748471259682022, + "grad_norm": 0.8305543065071106, + "learning_rate": 0.0005316194455768447, + "loss": 3.3612, + "step": 55170 + }, + { + "epoch": 3.7488109797526836, + "grad_norm": 1.017230749130249, + "learning_rate": 0.000531576980568012, + "loss": 3.4423, + "step": 55175 + }, + { + "epoch": 3.7491506998233457, + "grad_norm": 1.4099266529083252, + "learning_rate": 0.0005315345155591792, + "loss": 3.5978, + "step": 55180 + }, + { + "epoch": 3.7494904198940073, + "grad_norm": 0.869074821472168, + "learning_rate": 0.0005314920505503465, + "loss": 3.3265, + "step": 55185 + }, + { + "epoch": 3.749830139964669, + "grad_norm": 0.852171003818512, + "learning_rate": 0.0005314495855415137, + "loss": 3.6433, + "step": 55190 + }, + { + "epoch": 3.750169860035331, + "grad_norm": 0.9421220421791077, + "learning_rate": 0.0005314071205326811, + "loss": 3.2247, + "step": 55195 + }, + { + "epoch": 3.7505095801059927, + "grad_norm": 0.9400818943977356, + "learning_rate": 0.0005313646555238484, + "loss": 3.3751, + "step": 55200 + }, + { + "epoch": 3.7508493001766543, + "grad_norm": 0.801715612411499, + "learning_rate": 0.0005313221905150156, + "loss": 3.6393, + "step": 55205 + }, + { + "epoch": 3.7511890202473164, + "grad_norm": 1.176222324371338, + "learning_rate": 0.0005312797255061829, + "loss": 3.4815, + "step": 55210 + }, + { + "epoch": 3.751528740317978, + "grad_norm": 0.7445846199989319, + "learning_rate": 0.0005312372604973502, + "loss": 3.4595, + "step": 55215 + }, + { + "epoch": 3.7518684603886396, + "grad_norm": 0.7894490361213684, + "learning_rate": 0.0005311947954885174, + "loss": 3.3306, + "step": 55220 + }, + { + "epoch": 3.7522081804593013, + "grad_norm": 0.8974458575248718, + "learning_rate": 0.0005311523304796846, + "loss": 3.4457, + "step": 55225 + }, + { + "epoch": 3.7525479005299633, + "grad_norm": 0.7961004376411438, + "learning_rate": 0.0005311098654708521, + "loss": 3.6952, + "step": 55230 + }, + { + "epoch": 3.752887620600625, + "grad_norm": 0.8453017473220825, + "learning_rate": 0.0005310674004620193, + "loss": 3.5114, + "step": 55235 + }, + { + "epoch": 3.7532273406712866, + "grad_norm": 0.7636140584945679, + "learning_rate": 0.0005310249354531865, + "loss": 3.2699, + "step": 55240 + }, + { + "epoch": 3.7535670607419487, + "grad_norm": 1.1634408235549927, + "learning_rate": 0.0005309824704443539, + "loss": 3.792, + "step": 55245 + }, + { + "epoch": 3.7539067808126103, + "grad_norm": 0.8946232795715332, + "learning_rate": 0.0005309400054355211, + "loss": 3.6827, + "step": 55250 + }, + { + "epoch": 3.754246500883272, + "grad_norm": 0.664465606212616, + "learning_rate": 0.0005308975404266884, + "loss": 3.3936, + "step": 55255 + }, + { + "epoch": 3.754586220953934, + "grad_norm": 0.812756359577179, + "learning_rate": 0.0005308550754178557, + "loss": 3.5522, + "step": 55260 + }, + { + "epoch": 3.7549259410245956, + "grad_norm": 0.8426024913787842, + "learning_rate": 0.000530812610409023, + "loss": 3.6441, + "step": 55265 + }, + { + "epoch": 3.7552656610952573, + "grad_norm": 0.7616772651672363, + "learning_rate": 0.0005307701454001903, + "loss": 3.505, + "step": 55270 + }, + { + "epoch": 3.7556053811659194, + "grad_norm": 0.7767956852912903, + "learning_rate": 0.0005307276803913576, + "loss": 3.514, + "step": 55275 + }, + { + "epoch": 3.755945101236581, + "grad_norm": 1.210906744003296, + "learning_rate": 0.0005306852153825248, + "loss": 3.4874, + "step": 55280 + }, + { + "epoch": 3.7562848213072426, + "grad_norm": 0.8980417847633362, + "learning_rate": 0.0005306427503736921, + "loss": 3.5182, + "step": 55285 + }, + { + "epoch": 3.7566245413779047, + "grad_norm": 0.9105693697929382, + "learning_rate": 0.0005306002853648593, + "loss": 3.5348, + "step": 55290 + }, + { + "epoch": 3.7569642614485663, + "grad_norm": 1.0385831594467163, + "learning_rate": 0.0005305578203560266, + "loss": 3.5203, + "step": 55295 + }, + { + "epoch": 3.757303981519228, + "grad_norm": 1.036965250968933, + "learning_rate": 0.000530515355347194, + "loss": 3.6073, + "step": 55300 + }, + { + "epoch": 3.75764370158989, + "grad_norm": 0.9562515616416931, + "learning_rate": 0.0005304728903383612, + "loss": 3.5007, + "step": 55305 + }, + { + "epoch": 3.7579834216605517, + "grad_norm": 1.6136319637298584, + "learning_rate": 0.0005304304253295285, + "loss": 3.6554, + "step": 55310 + }, + { + "epoch": 3.7583231417312133, + "grad_norm": 1.090719223022461, + "learning_rate": 0.0005303879603206958, + "loss": 3.6498, + "step": 55315 + }, + { + "epoch": 3.7586628618018754, + "grad_norm": 0.9095427989959717, + "learning_rate": 0.000530345495311863, + "loss": 3.3836, + "step": 55320 + }, + { + "epoch": 3.759002581872537, + "grad_norm": 1.035569429397583, + "learning_rate": 0.0005303030303030302, + "loss": 3.6633, + "step": 55325 + }, + { + "epoch": 3.7593423019431986, + "grad_norm": 2.244798183441162, + "learning_rate": 0.0005302605652941977, + "loss": 3.4897, + "step": 55330 + }, + { + "epoch": 3.7596820220138607, + "grad_norm": 0.776496410369873, + "learning_rate": 0.0005302181002853649, + "loss": 3.4197, + "step": 55335 + }, + { + "epoch": 3.7600217420845223, + "grad_norm": 0.7489408254623413, + "learning_rate": 0.0005301756352765321, + "loss": 3.6238, + "step": 55340 + }, + { + "epoch": 3.760361462155184, + "grad_norm": 1.0078809261322021, + "learning_rate": 0.0005301331702676995, + "loss": 3.6445, + "step": 55345 + }, + { + "epoch": 3.760701182225846, + "grad_norm": 0.9269019961357117, + "learning_rate": 0.0005300907052588667, + "loss": 3.2803, + "step": 55350 + }, + { + "epoch": 3.7610409022965077, + "grad_norm": 0.9160454273223877, + "learning_rate": 0.0005300482402500339, + "loss": 3.743, + "step": 55355 + }, + { + "epoch": 3.7613806223671693, + "grad_norm": 0.792028546333313, + "learning_rate": 0.0005300057752412013, + "loss": 3.698, + "step": 55360 + }, + { + "epoch": 3.7617203424378314, + "grad_norm": 0.9154097437858582, + "learning_rate": 0.0005299633102323686, + "loss": 3.6001, + "step": 55365 + }, + { + "epoch": 3.762060062508493, + "grad_norm": 1.1782492399215698, + "learning_rate": 0.0005299208452235358, + "loss": 3.2071, + "step": 55370 + }, + { + "epoch": 3.7623997825791546, + "grad_norm": 0.9005864262580872, + "learning_rate": 0.0005298783802147032, + "loss": 3.5828, + "step": 55375 + }, + { + "epoch": 3.7627395026498167, + "grad_norm": 0.9597716331481934, + "learning_rate": 0.0005298359152058704, + "loss": 3.6054, + "step": 55380 + }, + { + "epoch": 3.7630792227204783, + "grad_norm": 0.7244840264320374, + "learning_rate": 0.0005297934501970376, + "loss": 3.3793, + "step": 55385 + }, + { + "epoch": 3.76341894279114, + "grad_norm": 0.8825938105583191, + "learning_rate": 0.0005297509851882049, + "loss": 3.5522, + "step": 55390 + }, + { + "epoch": 3.763758662861802, + "grad_norm": 0.9817368388175964, + "learning_rate": 0.0005297085201793722, + "loss": 4.0633, + "step": 55395 + }, + { + "epoch": 3.7640983829324637, + "grad_norm": 0.8809889554977417, + "learning_rate": 0.0005296660551705395, + "loss": 3.5106, + "step": 55400 + }, + { + "epoch": 3.7644381030031253, + "grad_norm": 0.8104987740516663, + "learning_rate": 0.0005296235901617068, + "loss": 3.2842, + "step": 55405 + }, + { + "epoch": 3.7647778230737874, + "grad_norm": 0.7242406606674194, + "learning_rate": 0.0005295811251528741, + "loss": 3.4776, + "step": 55410 + }, + { + "epoch": 3.765117543144449, + "grad_norm": 1.4764653444290161, + "learning_rate": 0.0005295386601440413, + "loss": 3.4658, + "step": 55415 + }, + { + "epoch": 3.7654572632151107, + "grad_norm": 1.390846610069275, + "learning_rate": 0.0005294961951352086, + "loss": 3.7129, + "step": 55420 + }, + { + "epoch": 3.7657969832857727, + "grad_norm": 0.7090047001838684, + "learning_rate": 0.0005294537301263758, + "loss": 3.6616, + "step": 55425 + }, + { + "epoch": 3.7661367033564344, + "grad_norm": 0.9972355961799622, + "learning_rate": 0.0005294112651175431, + "loss": 3.4256, + "step": 55430 + }, + { + "epoch": 3.766476423427096, + "grad_norm": 1.1724389791488647, + "learning_rate": 0.0005293688001087105, + "loss": 3.7219, + "step": 55435 + }, + { + "epoch": 3.766816143497758, + "grad_norm": 0.8376767039299011, + "learning_rate": 0.0005293263350998777, + "loss": 3.4615, + "step": 55440 + }, + { + "epoch": 3.7671558635684197, + "grad_norm": 0.9940052032470703, + "learning_rate": 0.000529283870091045, + "loss": 3.3804, + "step": 55445 + }, + { + "epoch": 3.7674955836390813, + "grad_norm": 0.9253517389297485, + "learning_rate": 0.0005292414050822123, + "loss": 3.7039, + "step": 55450 + }, + { + "epoch": 3.7678353037097434, + "grad_norm": 0.9232513904571533, + "learning_rate": 0.0005291989400733795, + "loss": 3.7027, + "step": 55455 + }, + { + "epoch": 3.768175023780405, + "grad_norm": 1.2464171648025513, + "learning_rate": 0.0005291564750645468, + "loss": 3.3816, + "step": 55460 + }, + { + "epoch": 3.7685147438510667, + "grad_norm": 0.8739017248153687, + "learning_rate": 0.0005291140100557141, + "loss": 3.2525, + "step": 55465 + }, + { + "epoch": 3.7688544639217287, + "grad_norm": 1.0148460865020752, + "learning_rate": 0.0005290715450468814, + "loss": 3.2883, + "step": 55470 + }, + { + "epoch": 3.7691941839923904, + "grad_norm": 1.2224669456481934, + "learning_rate": 0.0005290290800380487, + "loss": 3.4398, + "step": 55475 + }, + { + "epoch": 3.769533904063052, + "grad_norm": 0.8447927236557007, + "learning_rate": 0.000528986615029216, + "loss": 3.4991, + "step": 55480 + }, + { + "epoch": 3.769873624133714, + "grad_norm": 0.8097416758537292, + "learning_rate": 0.0005289441500203832, + "loss": 3.5246, + "step": 55485 + }, + { + "epoch": 3.7702133442043757, + "grad_norm": 0.7253726720809937, + "learning_rate": 0.0005289016850115504, + "loss": 3.6061, + "step": 55490 + }, + { + "epoch": 3.7705530642750373, + "grad_norm": 0.9674172401428223, + "learning_rate": 0.0005288592200027178, + "loss": 3.5866, + "step": 55495 + }, + { + "epoch": 3.7708927843456994, + "grad_norm": 0.8796103000640869, + "learning_rate": 0.000528816754993885, + "loss": 3.5597, + "step": 55500 + }, + { + "epoch": 3.771232504416361, + "grad_norm": 0.9452317357063293, + "learning_rate": 0.0005287742899850523, + "loss": 3.3743, + "step": 55505 + }, + { + "epoch": 3.7715722244870227, + "grad_norm": 0.8545734882354736, + "learning_rate": 0.0005287318249762197, + "loss": 3.3131, + "step": 55510 + }, + { + "epoch": 3.7719119445576843, + "grad_norm": 0.9797021746635437, + "learning_rate": 0.0005286893599673869, + "loss": 3.7571, + "step": 55515 + }, + { + "epoch": 3.7722516646283464, + "grad_norm": 0.690685510635376, + "learning_rate": 0.0005286468949585541, + "loss": 3.7234, + "step": 55520 + }, + { + "epoch": 3.772591384699008, + "grad_norm": 0.9952331185340881, + "learning_rate": 0.0005286044299497215, + "loss": 3.6338, + "step": 55525 + }, + { + "epoch": 3.7729311047696696, + "grad_norm": 0.7442764639854431, + "learning_rate": 0.0005285619649408887, + "loss": 3.2426, + "step": 55530 + }, + { + "epoch": 3.7732708248403317, + "grad_norm": 0.7450322508811951, + "learning_rate": 0.0005285194999320559, + "loss": 3.3703, + "step": 55535 + }, + { + "epoch": 3.7736105449109933, + "grad_norm": 0.757738471031189, + "learning_rate": 0.0005284770349232233, + "loss": 3.5042, + "step": 55540 + }, + { + "epoch": 3.773950264981655, + "grad_norm": 0.8894155621528625, + "learning_rate": 0.0005284345699143906, + "loss": 3.6249, + "step": 55545 + }, + { + "epoch": 3.774289985052317, + "grad_norm": 0.8436086177825928, + "learning_rate": 0.0005283921049055578, + "loss": 3.6084, + "step": 55550 + }, + { + "epoch": 3.7746297051229787, + "grad_norm": 0.8859430551528931, + "learning_rate": 0.0005283496398967251, + "loss": 3.513, + "step": 55555 + }, + { + "epoch": 3.7749694251936403, + "grad_norm": 1.0147720575332642, + "learning_rate": 0.0005283071748878924, + "loss": 3.5008, + "step": 55560 + }, + { + "epoch": 3.775309145264302, + "grad_norm": 0.8795601725578308, + "learning_rate": 0.0005282647098790596, + "loss": 3.2767, + "step": 55565 + }, + { + "epoch": 3.775648865334964, + "grad_norm": 0.9506633281707764, + "learning_rate": 0.0005282222448702269, + "loss": 3.4627, + "step": 55570 + }, + { + "epoch": 3.7759885854056257, + "grad_norm": 0.9408283233642578, + "learning_rate": 0.0005281797798613943, + "loss": 3.8395, + "step": 55575 + }, + { + "epoch": 3.7763283054762873, + "grad_norm": 0.8377165198326111, + "learning_rate": 0.0005281373148525615, + "loss": 3.4474, + "step": 55580 + }, + { + "epoch": 3.7766680255469494, + "grad_norm": 1.061051607131958, + "learning_rate": 0.0005280948498437288, + "loss": 3.5081, + "step": 55585 + }, + { + "epoch": 3.777007745617611, + "grad_norm": 0.9819728136062622, + "learning_rate": 0.000528052384834896, + "loss": 3.6013, + "step": 55590 + }, + { + "epoch": 3.7773474656882726, + "grad_norm": 1.2657508850097656, + "learning_rate": 0.0005280099198260634, + "loss": 3.6614, + "step": 55595 + }, + { + "epoch": 3.7776871857589347, + "grad_norm": 0.9659993648529053, + "learning_rate": 0.0005279674548172306, + "loss": 3.6279, + "step": 55600 + }, + { + "epoch": 3.7780269058295963, + "grad_norm": 0.8876321315765381, + "learning_rate": 0.0005279249898083978, + "loss": 3.4669, + "step": 55605 + }, + { + "epoch": 3.778366625900258, + "grad_norm": 0.8395402431488037, + "learning_rate": 0.0005278825247995653, + "loss": 3.5381, + "step": 55610 + }, + { + "epoch": 3.77870634597092, + "grad_norm": 0.6521126627922058, + "learning_rate": 0.0005278400597907325, + "loss": 3.6681, + "step": 55615 + }, + { + "epoch": 3.7790460660415817, + "grad_norm": 0.94387286901474, + "learning_rate": 0.0005277975947818997, + "loss": 3.5514, + "step": 55620 + }, + { + "epoch": 3.7793857861122433, + "grad_norm": 0.7893203496932983, + "learning_rate": 0.000527755129773067, + "loss": 3.6204, + "step": 55625 + }, + { + "epoch": 3.7797255061829054, + "grad_norm": 0.9829889535903931, + "learning_rate": 0.0005277126647642343, + "loss": 3.7479, + "step": 55630 + }, + { + "epoch": 3.780065226253567, + "grad_norm": 0.7843021154403687, + "learning_rate": 0.0005276701997554015, + "loss": 3.6735, + "step": 55635 + }, + { + "epoch": 3.7804049463242286, + "grad_norm": 0.7913486361503601, + "learning_rate": 0.0005276277347465688, + "loss": 3.7147, + "step": 55640 + }, + { + "epoch": 3.7807446663948907, + "grad_norm": 0.7376495003700256, + "learning_rate": 0.0005275852697377362, + "loss": 3.5161, + "step": 55645 + }, + { + "epoch": 3.7810843864655523, + "grad_norm": 0.8385688066482544, + "learning_rate": 0.0005275428047289034, + "loss": 3.4662, + "step": 55650 + }, + { + "epoch": 3.781424106536214, + "grad_norm": 0.7793996334075928, + "learning_rate": 0.0005275003397200707, + "loss": 3.5939, + "step": 55655 + }, + { + "epoch": 3.781763826606876, + "grad_norm": 0.9369453191757202, + "learning_rate": 0.000527457874711238, + "loss": 3.5605, + "step": 55660 + }, + { + "epoch": 3.7821035466775377, + "grad_norm": 1.1184266805648804, + "learning_rate": 0.0005274154097024052, + "loss": 3.3708, + "step": 55665 + }, + { + "epoch": 3.7824432667481993, + "grad_norm": 0.8188756704330444, + "learning_rate": 0.0005273729446935725, + "loss": 3.6899, + "step": 55670 + }, + { + "epoch": 3.7827829868188614, + "grad_norm": 0.9939940571784973, + "learning_rate": 0.0005273304796847397, + "loss": 3.439, + "step": 55675 + }, + { + "epoch": 3.783122706889523, + "grad_norm": 0.7566760778427124, + "learning_rate": 0.0005272880146759071, + "loss": 3.4863, + "step": 55680 + }, + { + "epoch": 3.7834624269601846, + "grad_norm": 1.0845677852630615, + "learning_rate": 0.0005272455496670744, + "loss": 3.5636, + "step": 55685 + }, + { + "epoch": 3.7838021470308467, + "grad_norm": 1.0487074851989746, + "learning_rate": 0.0005272030846582416, + "loss": 3.2958, + "step": 55690 + }, + { + "epoch": 3.7841418671015083, + "grad_norm": 1.0026788711547852, + "learning_rate": 0.0005271606196494089, + "loss": 3.8452, + "step": 55695 + }, + { + "epoch": 3.78448158717217, + "grad_norm": 0.9613252878189087, + "learning_rate": 0.0005271181546405762, + "loss": 3.3604, + "step": 55700 + }, + { + "epoch": 3.784821307242832, + "grad_norm": 1.0594747066497803, + "learning_rate": 0.0005270756896317434, + "loss": 3.4666, + "step": 55705 + }, + { + "epoch": 3.7851610273134937, + "grad_norm": 0.8288143873214722, + "learning_rate": 0.0005270332246229107, + "loss": 3.5089, + "step": 55710 + }, + { + "epoch": 3.7855007473841553, + "grad_norm": 0.9700300693511963, + "learning_rate": 0.0005269907596140781, + "loss": 3.4243, + "step": 55715 + }, + { + "epoch": 3.7858404674548174, + "grad_norm": 0.9670067429542542, + "learning_rate": 0.0005269482946052453, + "loss": 3.5702, + "step": 55720 + }, + { + "epoch": 3.786180187525479, + "grad_norm": 1.2418347597122192, + "learning_rate": 0.0005269058295964125, + "loss": 3.4082, + "step": 55725 + }, + { + "epoch": 3.7865199075961407, + "grad_norm": 0.9691250920295715, + "learning_rate": 0.0005268633645875799, + "loss": 3.771, + "step": 55730 + }, + { + "epoch": 3.7868596276668027, + "grad_norm": 0.7658679485321045, + "learning_rate": 0.0005268208995787471, + "loss": 3.4267, + "step": 55735 + }, + { + "epoch": 3.7871993477374644, + "grad_norm": 0.8770347237586975, + "learning_rate": 0.0005267784345699143, + "loss": 3.7229, + "step": 55740 + }, + { + "epoch": 3.787539067808126, + "grad_norm": 0.7714872360229492, + "learning_rate": 0.0005267359695610817, + "loss": 3.6147, + "step": 55745 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.9125569462776184, + "learning_rate": 0.000526693504552249, + "loss": 3.4788, + "step": 55750 + }, + { + "epoch": 3.7882185079494497, + "grad_norm": 0.7684047818183899, + "learning_rate": 0.0005266510395434162, + "loss": 3.6943, + "step": 55755 + }, + { + "epoch": 3.7885582280201113, + "grad_norm": 0.8984284400939941, + "learning_rate": 0.0005266085745345836, + "loss": 3.4841, + "step": 55760 + }, + { + "epoch": 3.7888979480907734, + "grad_norm": 0.8100431561470032, + "learning_rate": 0.0005265661095257508, + "loss": 3.5003, + "step": 55765 + }, + { + "epoch": 3.789237668161435, + "grad_norm": 0.9192761778831482, + "learning_rate": 0.000526523644516918, + "loss": 3.6638, + "step": 55770 + }, + { + "epoch": 3.7895773882320967, + "grad_norm": 0.894765317440033, + "learning_rate": 0.0005264811795080853, + "loss": 3.5491, + "step": 55775 + }, + { + "epoch": 3.7899171083027587, + "grad_norm": 0.9922235608100891, + "learning_rate": 0.0005264387144992526, + "loss": 3.6308, + "step": 55780 + }, + { + "epoch": 3.7902568283734204, + "grad_norm": 0.9084966778755188, + "learning_rate": 0.0005263962494904199, + "loss": 3.8732, + "step": 55785 + }, + { + "epoch": 3.790596548444082, + "grad_norm": 0.9052179455757141, + "learning_rate": 0.0005263537844815872, + "loss": 3.4658, + "step": 55790 + }, + { + "epoch": 3.790936268514744, + "grad_norm": 1.2339998483657837, + "learning_rate": 0.0005263113194727545, + "loss": 3.5476, + "step": 55795 + }, + { + "epoch": 3.7912759885854057, + "grad_norm": 1.227586030960083, + "learning_rate": 0.0005262688544639217, + "loss": 3.3126, + "step": 55800 + }, + { + "epoch": 3.7916157086560673, + "grad_norm": 1.3422658443450928, + "learning_rate": 0.000526226389455089, + "loss": 3.4646, + "step": 55805 + }, + { + "epoch": 3.7919554287267294, + "grad_norm": 0.7722707390785217, + "learning_rate": 0.0005261839244462563, + "loss": 3.4578, + "step": 55810 + }, + { + "epoch": 3.792295148797391, + "grad_norm": 0.7832756638526917, + "learning_rate": 0.0005261414594374235, + "loss": 3.4111, + "step": 55815 + }, + { + "epoch": 3.7926348688680527, + "grad_norm": 0.8626936078071594, + "learning_rate": 0.0005260989944285909, + "loss": 3.7831, + "step": 55820 + }, + { + "epoch": 3.7929745889387148, + "grad_norm": 0.9510303139686584, + "learning_rate": 0.0005260565294197581, + "loss": 3.4653, + "step": 55825 + }, + { + "epoch": 3.7933143090093764, + "grad_norm": 0.8417792916297913, + "learning_rate": 0.0005260140644109254, + "loss": 3.6457, + "step": 55830 + }, + { + "epoch": 3.793654029080038, + "grad_norm": 1.1724414825439453, + "learning_rate": 0.0005259715994020927, + "loss": 3.41, + "step": 55835 + }, + { + "epoch": 3.7939937491507, + "grad_norm": 1.0869691371917725, + "learning_rate": 0.0005259291343932599, + "loss": 3.5679, + "step": 55840 + }, + { + "epoch": 3.7943334692213617, + "grad_norm": 3.272974729537964, + "learning_rate": 0.0005258866693844272, + "loss": 3.5299, + "step": 55845 + }, + { + "epoch": 3.7946731892920234, + "grad_norm": 0.9460324645042419, + "learning_rate": 0.0005258442043755946, + "loss": 3.3183, + "step": 55850 + }, + { + "epoch": 3.795012909362685, + "grad_norm": 1.3304400444030762, + "learning_rate": 0.0005258017393667618, + "loss": 3.4143, + "step": 55855 + }, + { + "epoch": 3.795352629433347, + "grad_norm": 0.7570227980613708, + "learning_rate": 0.000525759274357929, + "loss": 3.5395, + "step": 55860 + }, + { + "epoch": 3.7956923495040087, + "grad_norm": 1.7856756448745728, + "learning_rate": 0.0005257168093490964, + "loss": 3.5092, + "step": 55865 + }, + { + "epoch": 3.7960320695746703, + "grad_norm": 0.9535858035087585, + "learning_rate": 0.0005256743443402636, + "loss": 3.617, + "step": 55870 + }, + { + "epoch": 3.7963717896453324, + "grad_norm": 0.9274249076843262, + "learning_rate": 0.0005256318793314308, + "loss": 3.3257, + "step": 55875 + }, + { + "epoch": 3.796711509715994, + "grad_norm": 0.7748128175735474, + "learning_rate": 0.0005255894143225982, + "loss": 3.6472, + "step": 55880 + }, + { + "epoch": 3.7970512297866557, + "grad_norm": 0.7901484966278076, + "learning_rate": 0.0005255469493137655, + "loss": 3.6592, + "step": 55885 + }, + { + "epoch": 3.7973909498573177, + "grad_norm": 0.9580680727958679, + "learning_rate": 0.0005255044843049327, + "loss": 3.6607, + "step": 55890 + }, + { + "epoch": 3.7977306699279794, + "grad_norm": 1.099428653717041, + "learning_rate": 0.0005254620192961001, + "loss": 3.4061, + "step": 55895 + }, + { + "epoch": 3.798070389998641, + "grad_norm": 0.886085569858551, + "learning_rate": 0.0005254195542872673, + "loss": 3.6347, + "step": 55900 + }, + { + "epoch": 3.7984101100693026, + "grad_norm": 3.1027748584747314, + "learning_rate": 0.0005253770892784345, + "loss": 3.7612, + "step": 55905 + }, + { + "epoch": 3.7987498301399647, + "grad_norm": 1.018548846244812, + "learning_rate": 0.0005253346242696019, + "loss": 3.0944, + "step": 55910 + }, + { + "epoch": 3.7990895502106263, + "grad_norm": 0.6790233850479126, + "learning_rate": 0.0005252921592607691, + "loss": 3.4677, + "step": 55915 + }, + { + "epoch": 3.799429270281288, + "grad_norm": 1.079872965812683, + "learning_rate": 0.0005252496942519364, + "loss": 3.6161, + "step": 55920 + }, + { + "epoch": 3.79976899035195, + "grad_norm": 0.814750611782074, + "learning_rate": 0.0005252072292431037, + "loss": 3.713, + "step": 55925 + }, + { + "epoch": 3.8001087104226117, + "grad_norm": 0.8753792643547058, + "learning_rate": 0.000525164764234271, + "loss": 3.62, + "step": 55930 + }, + { + "epoch": 3.8004484304932733, + "grad_norm": 0.7355855703353882, + "learning_rate": 0.0005251222992254383, + "loss": 3.7129, + "step": 55935 + }, + { + "epoch": 3.8007881505639354, + "grad_norm": 0.735390841960907, + "learning_rate": 0.0005250798342166055, + "loss": 3.3093, + "step": 55940 + }, + { + "epoch": 3.801127870634597, + "grad_norm": 0.9661595821380615, + "learning_rate": 0.0005250373692077728, + "loss": 3.6053, + "step": 55945 + }, + { + "epoch": 3.8014675907052586, + "grad_norm": 0.9914594888687134, + "learning_rate": 0.0005249949041989401, + "loss": 3.5076, + "step": 55950 + }, + { + "epoch": 3.8018073107759207, + "grad_norm": 0.8655800223350525, + "learning_rate": 0.0005249524391901074, + "loss": 3.1394, + "step": 55955 + }, + { + "epoch": 3.8021470308465823, + "grad_norm": 0.7778284549713135, + "learning_rate": 0.0005249099741812747, + "loss": 3.543, + "step": 55960 + }, + { + "epoch": 3.802486750917244, + "grad_norm": 0.8530519008636475, + "learning_rate": 0.000524867509172442, + "loss": 3.4467, + "step": 55965 + }, + { + "epoch": 3.802826470987906, + "grad_norm": 0.8805492520332336, + "learning_rate": 0.0005248250441636092, + "loss": 3.6242, + "step": 55970 + }, + { + "epoch": 3.8031661910585677, + "grad_norm": 1.1237521171569824, + "learning_rate": 0.0005247825791547764, + "loss": 3.566, + "step": 55975 + }, + { + "epoch": 3.8035059111292293, + "grad_norm": 0.8818703293800354, + "learning_rate": 0.0005247401141459438, + "loss": 3.4102, + "step": 55980 + }, + { + "epoch": 3.8038456311998914, + "grad_norm": 0.826289713382721, + "learning_rate": 0.000524697649137111, + "loss": 3.6128, + "step": 55985 + }, + { + "epoch": 3.804185351270553, + "grad_norm": 0.8887828588485718, + "learning_rate": 0.0005246551841282783, + "loss": 3.4238, + "step": 55990 + }, + { + "epoch": 3.8045250713412146, + "grad_norm": 0.9229504466056824, + "learning_rate": 0.0005246127191194457, + "loss": 3.4023, + "step": 55995 + }, + { + "epoch": 3.8048647914118767, + "grad_norm": 1.0549566745758057, + "learning_rate": 0.0005245702541106129, + "loss": 3.3786, + "step": 56000 + }, + { + "epoch": 3.8052045114825384, + "grad_norm": 0.954309344291687, + "learning_rate": 0.0005245277891017801, + "loss": 3.323, + "step": 56005 + }, + { + "epoch": 3.8055442315532, + "grad_norm": 1.0791290998458862, + "learning_rate": 0.0005244853240929475, + "loss": 3.3658, + "step": 56010 + }, + { + "epoch": 3.805883951623862, + "grad_norm": 0.7496359348297119, + "learning_rate": 0.0005244428590841147, + "loss": 3.415, + "step": 56015 + }, + { + "epoch": 3.8062236716945237, + "grad_norm": 0.7653619647026062, + "learning_rate": 0.0005244003940752819, + "loss": 3.5819, + "step": 56020 + }, + { + "epoch": 3.8065633917651853, + "grad_norm": 0.9597043991088867, + "learning_rate": 0.0005243579290664494, + "loss": 3.4592, + "step": 56025 + }, + { + "epoch": 3.8069031118358474, + "grad_norm": 0.7797598242759705, + "learning_rate": 0.0005243154640576166, + "loss": 3.6574, + "step": 56030 + }, + { + "epoch": 3.807242831906509, + "grad_norm": 0.9257333874702454, + "learning_rate": 0.0005242729990487838, + "loss": 3.376, + "step": 56035 + }, + { + "epoch": 3.8075825519771707, + "grad_norm": 0.6423773169517517, + "learning_rate": 0.0005242305340399511, + "loss": 3.5342, + "step": 56040 + }, + { + "epoch": 3.8079222720478327, + "grad_norm": 0.7509011030197144, + "learning_rate": 0.0005241880690311184, + "loss": 3.3794, + "step": 56045 + }, + { + "epoch": 3.8082619921184944, + "grad_norm": 1.15890634059906, + "learning_rate": 0.0005241456040222856, + "loss": 3.5399, + "step": 56050 + }, + { + "epoch": 3.808601712189156, + "grad_norm": 1.0324373245239258, + "learning_rate": 0.0005241031390134529, + "loss": 3.6894, + "step": 56055 + }, + { + "epoch": 3.808941432259818, + "grad_norm": 0.8314304947853088, + "learning_rate": 0.0005240606740046203, + "loss": 3.4099, + "step": 56060 + }, + { + "epoch": 3.8092811523304797, + "grad_norm": 0.8293315768241882, + "learning_rate": 0.0005240182089957875, + "loss": 3.705, + "step": 56065 + }, + { + "epoch": 3.8096208724011413, + "grad_norm": 1.0033458471298218, + "learning_rate": 0.0005239757439869548, + "loss": 3.5537, + "step": 56070 + }, + { + "epoch": 3.8099605924718034, + "grad_norm": 1.040730595588684, + "learning_rate": 0.000523933278978122, + "loss": 3.7589, + "step": 56075 + }, + { + "epoch": 3.810300312542465, + "grad_norm": 1.1200722455978394, + "learning_rate": 0.0005238908139692893, + "loss": 3.6315, + "step": 56080 + }, + { + "epoch": 3.8106400326131267, + "grad_norm": 0.7642076015472412, + "learning_rate": 0.0005238483489604566, + "loss": 3.4835, + "step": 56085 + }, + { + "epoch": 3.8109797526837887, + "grad_norm": 1.2603247165679932, + "learning_rate": 0.0005238058839516238, + "loss": 3.3655, + "step": 56090 + }, + { + "epoch": 3.8113194727544504, + "grad_norm": 1.045735478401184, + "learning_rate": 0.0005237634189427912, + "loss": 3.5997, + "step": 56095 + }, + { + "epoch": 3.811659192825112, + "grad_norm": 0.865080714225769, + "learning_rate": 0.0005237209539339585, + "loss": 3.6146, + "step": 56100 + }, + { + "epoch": 3.811998912895774, + "grad_norm": 0.8477247953414917, + "learning_rate": 0.0005236784889251257, + "loss": 3.6288, + "step": 56105 + }, + { + "epoch": 3.8123386329664357, + "grad_norm": 1.268038034439087, + "learning_rate": 0.000523636023916293, + "loss": 3.7539, + "step": 56110 + }, + { + "epoch": 3.8126783530370973, + "grad_norm": 0.9087737798690796, + "learning_rate": 0.0005235935589074603, + "loss": 3.7752, + "step": 56115 + }, + { + "epoch": 3.8130180731077594, + "grad_norm": 1.5807157754898071, + "learning_rate": 0.0005235510938986275, + "loss": 3.5387, + "step": 56120 + }, + { + "epoch": 3.813357793178421, + "grad_norm": 0.8277689814567566, + "learning_rate": 0.0005235086288897947, + "loss": 3.6759, + "step": 56125 + }, + { + "epoch": 3.8136975132490827, + "grad_norm": 0.9952521324157715, + "learning_rate": 0.0005234661638809622, + "loss": 3.4894, + "step": 56130 + }, + { + "epoch": 3.8140372333197448, + "grad_norm": 0.8412572741508484, + "learning_rate": 0.0005234236988721294, + "loss": 3.3654, + "step": 56135 + }, + { + "epoch": 3.8143769533904064, + "grad_norm": 1.093355417251587, + "learning_rate": 0.0005233812338632966, + "loss": 3.3446, + "step": 56140 + }, + { + "epoch": 3.814716673461068, + "grad_norm": 1.2520610094070435, + "learning_rate": 0.000523338768854464, + "loss": 3.5131, + "step": 56145 + }, + { + "epoch": 3.81505639353173, + "grad_norm": 0.8256568908691406, + "learning_rate": 0.0005232963038456312, + "loss": 3.6759, + "step": 56150 + }, + { + "epoch": 3.8153961136023917, + "grad_norm": 1.3551404476165771, + "learning_rate": 0.0005232538388367984, + "loss": 3.6351, + "step": 56155 + }, + { + "epoch": 3.8157358336730534, + "grad_norm": 1.3067020177841187, + "learning_rate": 0.0005232113738279658, + "loss": 3.3582, + "step": 56160 + }, + { + "epoch": 3.8160755537437154, + "grad_norm": 1.152469277381897, + "learning_rate": 0.0005231689088191331, + "loss": 3.6267, + "step": 56165 + }, + { + "epoch": 3.816415273814377, + "grad_norm": 0.9209774136543274, + "learning_rate": 0.0005231264438103003, + "loss": 3.5292, + "step": 56170 + }, + { + "epoch": 3.8167549938850387, + "grad_norm": 0.7577576041221619, + "learning_rate": 0.0005230839788014676, + "loss": 3.3798, + "step": 56175 + }, + { + "epoch": 3.8170947139557008, + "grad_norm": 0.8089054226875305, + "learning_rate": 0.0005230415137926349, + "loss": 3.6146, + "step": 56180 + }, + { + "epoch": 3.8174344340263624, + "grad_norm": 0.7528221607208252, + "learning_rate": 0.0005229990487838021, + "loss": 3.647, + "step": 56185 + }, + { + "epoch": 3.817774154097024, + "grad_norm": 0.8559520244598389, + "learning_rate": 0.0005229565837749694, + "loss": 3.6106, + "step": 56190 + }, + { + "epoch": 3.8181138741676857, + "grad_norm": 0.9472511410713196, + "learning_rate": 0.0005229141187661367, + "loss": 3.7415, + "step": 56195 + }, + { + "epoch": 3.8184535942383477, + "grad_norm": 0.8631306886672974, + "learning_rate": 0.000522871653757304, + "loss": 3.6581, + "step": 56200 + }, + { + "epoch": 3.8187933143090094, + "grad_norm": 0.8586714267730713, + "learning_rate": 0.0005228291887484713, + "loss": 3.1445, + "step": 56205 + }, + { + "epoch": 3.819133034379671, + "grad_norm": 0.8145372271537781, + "learning_rate": 0.0005227867237396386, + "loss": 3.5681, + "step": 56210 + }, + { + "epoch": 3.819472754450333, + "grad_norm": 0.9120622277259827, + "learning_rate": 0.0005227442587308058, + "loss": 3.5308, + "step": 56215 + }, + { + "epoch": 3.8198124745209947, + "grad_norm": 1.0165743827819824, + "learning_rate": 0.0005227017937219731, + "loss": 3.7551, + "step": 56220 + }, + { + "epoch": 3.8201521945916563, + "grad_norm": 0.9398404955863953, + "learning_rate": 0.0005226593287131403, + "loss": 3.4369, + "step": 56225 + }, + { + "epoch": 3.8204919146623184, + "grad_norm": 0.7606810927391052, + "learning_rate": 0.0005226168637043076, + "loss": 3.2756, + "step": 56230 + }, + { + "epoch": 3.82083163473298, + "grad_norm": 0.8033487796783447, + "learning_rate": 0.000522574398695475, + "loss": 3.6656, + "step": 56235 + }, + { + "epoch": 3.8211713548036417, + "grad_norm": 0.7566196322441101, + "learning_rate": 0.0005225319336866422, + "loss": 3.6181, + "step": 56240 + }, + { + "epoch": 3.8215110748743033, + "grad_norm": 0.8729888200759888, + "learning_rate": 0.0005224894686778095, + "loss": 3.4524, + "step": 56245 + }, + { + "epoch": 3.8218507949449654, + "grad_norm": 1.2228026390075684, + "learning_rate": 0.0005224554966707433, + "loss": 3.3976, + "step": 56250 + }, + { + "epoch": 3.822190515015627, + "grad_norm": 0.9489407539367676, + "learning_rate": 0.0005224130316619106, + "loss": 3.7264, + "step": 56255 + }, + { + "epoch": 3.8225302350862886, + "grad_norm": 0.8902500867843628, + "learning_rate": 0.000522370566653078, + "loss": 3.4479, + "step": 56260 + }, + { + "epoch": 3.8228699551569507, + "grad_norm": 0.6853080987930298, + "learning_rate": 0.0005223281016442452, + "loss": 3.4693, + "step": 56265 + }, + { + "epoch": 3.8232096752276123, + "grad_norm": 0.7071894407272339, + "learning_rate": 0.0005222856366354124, + "loss": 3.269, + "step": 56270 + }, + { + "epoch": 3.823549395298274, + "grad_norm": 0.8206537365913391, + "learning_rate": 0.0005222431716265797, + "loss": 3.5207, + "step": 56275 + }, + { + "epoch": 3.823889115368936, + "grad_norm": 0.9896106123924255, + "learning_rate": 0.000522200706617747, + "loss": 3.6716, + "step": 56280 + }, + { + "epoch": 3.8242288354395977, + "grad_norm": 1.04513680934906, + "learning_rate": 0.0005221582416089142, + "loss": 3.628, + "step": 56285 + }, + { + "epoch": 3.8245685555102593, + "grad_norm": 0.8927282691001892, + "learning_rate": 0.0005221157766000816, + "loss": 3.7199, + "step": 56290 + }, + { + "epoch": 3.8249082755809214, + "grad_norm": 0.782428503036499, + "learning_rate": 0.0005220733115912489, + "loss": 3.6325, + "step": 56295 + }, + { + "epoch": 3.825247995651583, + "grad_norm": 0.7506347894668579, + "learning_rate": 0.0005220308465824161, + "loss": 3.5552, + "step": 56300 + }, + { + "epoch": 3.8255877157222447, + "grad_norm": 0.850969135761261, + "learning_rate": 0.0005219883815735834, + "loss": 3.5405, + "step": 56305 + }, + { + "epoch": 3.8259274357929067, + "grad_norm": 2.855464458465576, + "learning_rate": 0.0005219459165647506, + "loss": 3.734, + "step": 56310 + }, + { + "epoch": 3.8262671558635684, + "grad_norm": 0.712417721748352, + "learning_rate": 0.0005219034515559179, + "loss": 3.595, + "step": 56315 + }, + { + "epoch": 3.82660687593423, + "grad_norm": 0.8818855881690979, + "learning_rate": 0.0005218609865470852, + "loss": 3.2064, + "step": 56320 + }, + { + "epoch": 3.826946596004892, + "grad_norm": 0.8743383884429932, + "learning_rate": 0.0005218185215382525, + "loss": 3.4798, + "step": 56325 + }, + { + "epoch": 3.8272863160755537, + "grad_norm": 0.773773193359375, + "learning_rate": 0.0005217760565294198, + "loss": 3.4538, + "step": 56330 + }, + { + "epoch": 3.8276260361462153, + "grad_norm": 0.8563107848167419, + "learning_rate": 0.0005217335915205871, + "loss": 3.6062, + "step": 56335 + }, + { + "epoch": 3.8279657562168774, + "grad_norm": 0.7018090486526489, + "learning_rate": 0.0005216911265117543, + "loss": 3.6015, + "step": 56340 + }, + { + "epoch": 3.828305476287539, + "grad_norm": 0.8458636403083801, + "learning_rate": 0.0005216486615029215, + "loss": 3.5076, + "step": 56345 + }, + { + "epoch": 3.8286451963582007, + "grad_norm": 0.8235912919044495, + "learning_rate": 0.0005216061964940889, + "loss": 3.7159, + "step": 56350 + }, + { + "epoch": 3.8289849164288627, + "grad_norm": 1.1087473630905151, + "learning_rate": 0.0005215637314852561, + "loss": 3.1674, + "step": 56355 + }, + { + "epoch": 3.8293246364995244, + "grad_norm": 1.0773584842681885, + "learning_rate": 0.0005215212664764234, + "loss": 3.5151, + "step": 56360 + }, + { + "epoch": 3.829664356570186, + "grad_norm": 0.9320704936981201, + "learning_rate": 0.0005214788014675908, + "loss": 3.62, + "step": 56365 + }, + { + "epoch": 3.830004076640848, + "grad_norm": 0.7739166021347046, + "learning_rate": 0.000521436336458758, + "loss": 3.4858, + "step": 56370 + }, + { + "epoch": 3.8303437967115097, + "grad_norm": 1.1849255561828613, + "learning_rate": 0.0005213938714499252, + "loss": 3.4682, + "step": 56375 + }, + { + "epoch": 3.8306835167821713, + "grad_norm": 0.9396222829818726, + "learning_rate": 0.0005213514064410926, + "loss": 3.3783, + "step": 56380 + }, + { + "epoch": 3.8310232368528334, + "grad_norm": 0.8062900900840759, + "learning_rate": 0.0005213089414322598, + "loss": 3.41, + "step": 56385 + }, + { + "epoch": 3.831362956923495, + "grad_norm": 0.999290943145752, + "learning_rate": 0.000521266476423427, + "loss": 3.4048, + "step": 56390 + }, + { + "epoch": 3.8317026769941567, + "grad_norm": 1.1473809480667114, + "learning_rate": 0.0005212240114145945, + "loss": 3.5308, + "step": 56395 + }, + { + "epoch": 3.8320423970648188, + "grad_norm": 0.6688233017921448, + "learning_rate": 0.0005211815464057617, + "loss": 3.5828, + "step": 56400 + }, + { + "epoch": 3.8323821171354804, + "grad_norm": 0.9201781749725342, + "learning_rate": 0.0005211390813969289, + "loss": 3.615, + "step": 56405 + }, + { + "epoch": 3.832721837206142, + "grad_norm": 0.8514717817306519, + "learning_rate": 0.0005210966163880962, + "loss": 3.631, + "step": 56410 + }, + { + "epoch": 3.833061557276804, + "grad_norm": 0.8726839423179626, + "learning_rate": 0.0005210541513792635, + "loss": 3.2992, + "step": 56415 + }, + { + "epoch": 3.8334012773474657, + "grad_norm": 0.8170264959335327, + "learning_rate": 0.0005210116863704307, + "loss": 3.3036, + "step": 56420 + }, + { + "epoch": 3.8337409974181273, + "grad_norm": 0.9408130645751953, + "learning_rate": 0.000520969221361598, + "loss": 3.3887, + "step": 56425 + }, + { + "epoch": 3.8340807174887894, + "grad_norm": 0.9068808555603027, + "learning_rate": 0.0005209267563527654, + "loss": 3.4423, + "step": 56430 + }, + { + "epoch": 3.834420437559451, + "grad_norm": 0.8743143081665039, + "learning_rate": 0.0005208842913439326, + "loss": 3.4644, + "step": 56435 + }, + { + "epoch": 3.8347601576301127, + "grad_norm": 0.8882112503051758, + "learning_rate": 0.0005208418263350999, + "loss": 3.29, + "step": 56440 + }, + { + "epoch": 3.8350998777007748, + "grad_norm": 0.8541393280029297, + "learning_rate": 0.0005207993613262672, + "loss": 3.4922, + "step": 56445 + }, + { + "epoch": 3.8354395977714364, + "grad_norm": 0.8456972241401672, + "learning_rate": 0.0005207568963174344, + "loss": 3.5116, + "step": 56450 + }, + { + "epoch": 3.835779317842098, + "grad_norm": 0.7866805791854858, + "learning_rate": 0.0005207144313086017, + "loss": 3.6365, + "step": 56455 + }, + { + "epoch": 3.83611903791276, + "grad_norm": 0.7358934283256531, + "learning_rate": 0.0005206719662997689, + "loss": 3.6389, + "step": 56460 + }, + { + "epoch": 3.8364587579834217, + "grad_norm": 0.7838963270187378, + "learning_rate": 0.0005206295012909363, + "loss": 3.4689, + "step": 56465 + }, + { + "epoch": 3.8367984780540834, + "grad_norm": 0.7620003819465637, + "learning_rate": 0.0005205870362821036, + "loss": 3.4546, + "step": 56470 + }, + { + "epoch": 3.8371381981247454, + "grad_norm": 0.9825019240379333, + "learning_rate": 0.0005205445712732708, + "loss": 3.5931, + "step": 56475 + }, + { + "epoch": 3.837477918195407, + "grad_norm": 1.0864334106445312, + "learning_rate": 0.0005205021062644382, + "loss": 3.525, + "step": 56480 + }, + { + "epoch": 3.8378176382660687, + "grad_norm": 0.9463905096054077, + "learning_rate": 0.0005204596412556054, + "loss": 3.7122, + "step": 56485 + }, + { + "epoch": 3.8381573583367308, + "grad_norm": 1.3351728916168213, + "learning_rate": 0.0005204171762467726, + "loss": 3.511, + "step": 56490 + }, + { + "epoch": 3.8384970784073924, + "grad_norm": 0.7617438435554504, + "learning_rate": 0.00052037471123794, + "loss": 3.8322, + "step": 56495 + }, + { + "epoch": 3.838836798478054, + "grad_norm": 1.0442399978637695, + "learning_rate": 0.0005203322462291073, + "loss": 3.4507, + "step": 56500 + }, + { + "epoch": 3.839176518548716, + "grad_norm": 0.7595780491828918, + "learning_rate": 0.0005202897812202745, + "loss": 3.5229, + "step": 56505 + }, + { + "epoch": 3.8395162386193777, + "grad_norm": 0.8496035933494568, + "learning_rate": 0.0005202473162114418, + "loss": 3.7074, + "step": 56510 + }, + { + "epoch": 3.8398559586900394, + "grad_norm": 0.8279311060905457, + "learning_rate": 0.0005202048512026091, + "loss": 3.5942, + "step": 56515 + }, + { + "epoch": 3.8401956787607014, + "grad_norm": 1.5697021484375, + "learning_rate": 0.0005201623861937763, + "loss": 3.5636, + "step": 56520 + }, + { + "epoch": 3.840535398831363, + "grad_norm": 0.7104946970939636, + "learning_rate": 0.0005201199211849436, + "loss": 3.3306, + "step": 56525 + }, + { + "epoch": 3.8408751189020247, + "grad_norm": 1.0342332124710083, + "learning_rate": 0.0005200774561761109, + "loss": 3.574, + "step": 56530 + }, + { + "epoch": 3.8412148389726863, + "grad_norm": 1.3352214097976685, + "learning_rate": 0.0005200349911672782, + "loss": 3.3476, + "step": 56535 + }, + { + "epoch": 3.8415545590433484, + "grad_norm": 0.8209943771362305, + "learning_rate": 0.0005199925261584455, + "loss": 3.7426, + "step": 56540 + }, + { + "epoch": 3.84189427911401, + "grad_norm": 1.0913070440292358, + "learning_rate": 0.0005199500611496128, + "loss": 3.6492, + "step": 56545 + }, + { + "epoch": 3.8422339991846717, + "grad_norm": 0.7373887896537781, + "learning_rate": 0.00051990759614078, + "loss": 3.6175, + "step": 56550 + }, + { + "epoch": 3.8425737192553338, + "grad_norm": 1.0017685890197754, + "learning_rate": 0.0005198651311319473, + "loss": 3.7277, + "step": 56555 + }, + { + "epoch": 3.8429134393259954, + "grad_norm": 0.8056206107139587, + "learning_rate": 0.0005198226661231145, + "loss": 3.6778, + "step": 56560 + }, + { + "epoch": 3.843253159396657, + "grad_norm": 0.7962728142738342, + "learning_rate": 0.0005197802011142818, + "loss": 3.4068, + "step": 56565 + }, + { + "epoch": 3.843592879467319, + "grad_norm": 0.9367039203643799, + "learning_rate": 0.0005197377361054492, + "loss": 3.508, + "step": 56570 + }, + { + "epoch": 3.8439325995379807, + "grad_norm": 0.8952248096466064, + "learning_rate": 0.0005196952710966164, + "loss": 3.4049, + "step": 56575 + }, + { + "epoch": 3.8442723196086424, + "grad_norm": 0.6751724481582642, + "learning_rate": 0.0005196528060877837, + "loss": 3.5786, + "step": 56580 + }, + { + "epoch": 3.844612039679304, + "grad_norm": 1.0384479761123657, + "learning_rate": 0.000519610341078951, + "loss": 3.7522, + "step": 56585 + }, + { + "epoch": 3.844951759749966, + "grad_norm": 0.7585147619247437, + "learning_rate": 0.0005195678760701182, + "loss": 3.3601, + "step": 56590 + }, + { + "epoch": 3.8452914798206277, + "grad_norm": 0.712372899055481, + "learning_rate": 0.0005195254110612854, + "loss": 3.3804, + "step": 56595 + }, + { + "epoch": 3.8456311998912893, + "grad_norm": 0.9108429551124573, + "learning_rate": 0.0005194829460524528, + "loss": 3.631, + "step": 56600 + }, + { + "epoch": 3.8459709199619514, + "grad_norm": 0.8678547739982605, + "learning_rate": 0.0005194404810436201, + "loss": 3.4056, + "step": 56605 + }, + { + "epoch": 3.846310640032613, + "grad_norm": 0.6379256844520569, + "learning_rate": 0.0005193980160347873, + "loss": 3.2984, + "step": 56610 + }, + { + "epoch": 3.8466503601032747, + "grad_norm": 1.7119849920272827, + "learning_rate": 0.0005193555510259547, + "loss": 3.7312, + "step": 56615 + }, + { + "epoch": 3.8469900801739367, + "grad_norm": 0.8064926862716675, + "learning_rate": 0.0005193130860171219, + "loss": 3.3688, + "step": 56620 + }, + { + "epoch": 3.8473298002445984, + "grad_norm": 0.7420786619186401, + "learning_rate": 0.0005192706210082891, + "loss": 3.3466, + "step": 56625 + }, + { + "epoch": 3.84766952031526, + "grad_norm": 0.9028169512748718, + "learning_rate": 0.0005192281559994565, + "loss": 3.6414, + "step": 56630 + }, + { + "epoch": 3.848009240385922, + "grad_norm": 0.885371208190918, + "learning_rate": 0.0005191856909906237, + "loss": 3.7076, + "step": 56635 + }, + { + "epoch": 3.8483489604565837, + "grad_norm": 0.7276880145072937, + "learning_rate": 0.000519143225981791, + "loss": 3.4978, + "step": 56640 + }, + { + "epoch": 3.8486886805272453, + "grad_norm": 0.7210808396339417, + "learning_rate": 0.0005191007609729584, + "loss": 3.571, + "step": 56645 + }, + { + "epoch": 3.8490284005979074, + "grad_norm": 0.6389056444168091, + "learning_rate": 0.0005190582959641256, + "loss": 3.4889, + "step": 56650 + }, + { + "epoch": 3.849368120668569, + "grad_norm": 0.8251342177391052, + "learning_rate": 0.0005190158309552928, + "loss": 3.4414, + "step": 56655 + }, + { + "epoch": 3.8497078407392307, + "grad_norm": 0.7639437913894653, + "learning_rate": 0.0005189733659464601, + "loss": 3.617, + "step": 56660 + }, + { + "epoch": 3.8500475608098927, + "grad_norm": 1.065617322921753, + "learning_rate": 0.0005189309009376274, + "loss": 3.415, + "step": 56665 + }, + { + "epoch": 3.8503872808805544, + "grad_norm": 0.734910249710083, + "learning_rate": 0.0005188884359287946, + "loss": 3.2752, + "step": 56670 + }, + { + "epoch": 3.850727000951216, + "grad_norm": 0.7438514828681946, + "learning_rate": 0.000518845970919962, + "loss": 3.6756, + "step": 56675 + }, + { + "epoch": 3.851066721021878, + "grad_norm": 1.0961720943450928, + "learning_rate": 0.0005188035059111293, + "loss": 3.4869, + "step": 56680 + }, + { + "epoch": 3.8514064410925397, + "grad_norm": 0.8718752264976501, + "learning_rate": 0.0005187610409022965, + "loss": 3.2681, + "step": 56685 + }, + { + "epoch": 3.8517461611632013, + "grad_norm": 0.8657114505767822, + "learning_rate": 0.0005187185758934638, + "loss": 3.5462, + "step": 56690 + }, + { + "epoch": 3.8520858812338634, + "grad_norm": 0.977986752986908, + "learning_rate": 0.000518676110884631, + "loss": 3.7022, + "step": 56695 + }, + { + "epoch": 3.852425601304525, + "grad_norm": 0.9062284827232361, + "learning_rate": 0.0005186336458757983, + "loss": 3.3368, + "step": 56700 + }, + { + "epoch": 3.8527653213751867, + "grad_norm": 0.9855496883392334, + "learning_rate": 0.0005185911808669657, + "loss": 3.275, + "step": 56705 + }, + { + "epoch": 3.8531050414458488, + "grad_norm": 0.7970459461212158, + "learning_rate": 0.0005185487158581329, + "loss": 3.4469, + "step": 56710 + }, + { + "epoch": 3.8534447615165104, + "grad_norm": 0.7854704260826111, + "learning_rate": 0.0005185062508493002, + "loss": 3.4514, + "step": 56715 + }, + { + "epoch": 3.853784481587172, + "grad_norm": 0.6569924354553223, + "learning_rate": 0.0005184637858404675, + "loss": 3.4232, + "step": 56720 + }, + { + "epoch": 3.854124201657834, + "grad_norm": 1.1530728340148926, + "learning_rate": 0.0005184213208316347, + "loss": 3.4977, + "step": 56725 + }, + { + "epoch": 3.8544639217284957, + "grad_norm": 0.8790482878684998, + "learning_rate": 0.000518378855822802, + "loss": 3.607, + "step": 56730 + }, + { + "epoch": 3.8548036417991574, + "grad_norm": 0.7748684883117676, + "learning_rate": 0.0005183363908139693, + "loss": 3.4516, + "step": 56735 + }, + { + "epoch": 3.8551433618698194, + "grad_norm": 0.9400753974914551, + "learning_rate": 0.0005182939258051366, + "loss": 3.5859, + "step": 56740 + }, + { + "epoch": 3.855483081940481, + "grad_norm": 0.8871325254440308, + "learning_rate": 0.0005182514607963038, + "loss": 3.3449, + "step": 56745 + }, + { + "epoch": 3.8558228020111427, + "grad_norm": 0.8402085304260254, + "learning_rate": 0.0005182089957874712, + "loss": 3.5652, + "step": 56750 + }, + { + "epoch": 3.8561625220818048, + "grad_norm": 0.7682943344116211, + "learning_rate": 0.0005181665307786384, + "loss": 3.5758, + "step": 56755 + }, + { + "epoch": 3.8565022421524664, + "grad_norm": 0.7415351271629333, + "learning_rate": 0.0005181240657698056, + "loss": 3.6062, + "step": 56760 + }, + { + "epoch": 3.856841962223128, + "grad_norm": 0.8369572758674622, + "learning_rate": 0.000518081600760973, + "loss": 3.3864, + "step": 56765 + }, + { + "epoch": 3.85718168229379, + "grad_norm": 1.9266852140426636, + "learning_rate": 0.0005180391357521402, + "loss": 3.7559, + "step": 56770 + }, + { + "epoch": 3.8575214023644517, + "grad_norm": 0.6947185397148132, + "learning_rate": 0.0005179966707433075, + "loss": 3.5691, + "step": 56775 + }, + { + "epoch": 3.8578611224351134, + "grad_norm": 0.8154336214065552, + "learning_rate": 0.0005179542057344749, + "loss": 3.4742, + "step": 56780 + }, + { + "epoch": 3.8582008425057754, + "grad_norm": 0.7626187801361084, + "learning_rate": 0.0005179117407256421, + "loss": 3.4713, + "step": 56785 + }, + { + "epoch": 3.858540562576437, + "grad_norm": 0.8558303713798523, + "learning_rate": 0.0005178692757168093, + "loss": 3.3552, + "step": 56790 + }, + { + "epoch": 3.8588802826470987, + "grad_norm": 0.6507863402366638, + "learning_rate": 0.0005178268107079766, + "loss": 3.6055, + "step": 56795 + }, + { + "epoch": 3.8592200027177608, + "grad_norm": 0.8796634674072266, + "learning_rate": 0.0005177843456991439, + "loss": 3.3438, + "step": 56800 + }, + { + "epoch": 3.8595597227884224, + "grad_norm": 0.9883622527122498, + "learning_rate": 0.0005177418806903111, + "loss": 3.3926, + "step": 56805 + }, + { + "epoch": 3.859899442859084, + "grad_norm": 0.7711801528930664, + "learning_rate": 0.0005176994156814785, + "loss": 3.5911, + "step": 56810 + }, + { + "epoch": 3.860239162929746, + "grad_norm": 1.1735100746154785, + "learning_rate": 0.0005176569506726458, + "loss": 3.4101, + "step": 56815 + }, + { + "epoch": 3.8605788830004077, + "grad_norm": 0.8614303469657898, + "learning_rate": 0.0005176144856638131, + "loss": 3.6525, + "step": 56820 + }, + { + "epoch": 3.8609186030710694, + "grad_norm": 0.8195895552635193, + "learning_rate": 0.0005175720206549803, + "loss": 3.5149, + "step": 56825 + }, + { + "epoch": 3.8612583231417315, + "grad_norm": 0.8056187629699707, + "learning_rate": 0.0005175295556461476, + "loss": 3.3836, + "step": 56830 + }, + { + "epoch": 3.861598043212393, + "grad_norm": 0.8629986643791199, + "learning_rate": 0.0005174870906373149, + "loss": 3.3393, + "step": 56835 + }, + { + "epoch": 3.8619377632830547, + "grad_norm": 0.8915890455245972, + "learning_rate": 0.0005174446256284821, + "loss": 3.3139, + "step": 56840 + }, + { + "epoch": 3.862277483353717, + "grad_norm": 0.8786935210227966, + "learning_rate": 0.0005174021606196494, + "loss": 3.285, + "step": 56845 + }, + { + "epoch": 3.8626172034243784, + "grad_norm": 0.8542793989181519, + "learning_rate": 0.0005173596956108168, + "loss": 3.2412, + "step": 56850 + }, + { + "epoch": 3.86295692349504, + "grad_norm": 1.0367604494094849, + "learning_rate": 0.000517317230601984, + "loss": 3.5826, + "step": 56855 + }, + { + "epoch": 3.863296643565702, + "grad_norm": 0.7216777205467224, + "learning_rate": 0.0005172747655931512, + "loss": 3.2464, + "step": 56860 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 1.31154203414917, + "learning_rate": 0.0005172323005843186, + "loss": 3.2373, + "step": 56865 + }, + { + "epoch": 3.8639760837070254, + "grad_norm": 0.8592551946640015, + "learning_rate": 0.0005171898355754858, + "loss": 3.498, + "step": 56870 + }, + { + "epoch": 3.864315803777687, + "grad_norm": 0.917550265789032, + "learning_rate": 0.000517147370566653, + "loss": 3.8114, + "step": 56875 + }, + { + "epoch": 3.864655523848349, + "grad_norm": 0.9399448037147522, + "learning_rate": 0.0005171049055578205, + "loss": 3.7083, + "step": 56880 + }, + { + "epoch": 3.8649952439190107, + "grad_norm": 1.0331610441207886, + "learning_rate": 0.0005170624405489877, + "loss": 3.4461, + "step": 56885 + }, + { + "epoch": 3.8653349639896724, + "grad_norm": 0.8666069507598877, + "learning_rate": 0.0005170199755401549, + "loss": 3.5173, + "step": 56890 + }, + { + "epoch": 3.8656746840603344, + "grad_norm": 0.8050867319107056, + "learning_rate": 0.0005169775105313222, + "loss": 3.4878, + "step": 56895 + }, + { + "epoch": 3.866014404130996, + "grad_norm": 0.863825261592865, + "learning_rate": 0.0005169350455224895, + "loss": 3.5337, + "step": 56900 + }, + { + "epoch": 3.8663541242016577, + "grad_norm": 0.7346368432044983, + "learning_rate": 0.0005168925805136567, + "loss": 3.4779, + "step": 56905 + }, + { + "epoch": 3.8666938442723198, + "grad_norm": 0.9623124003410339, + "learning_rate": 0.000516850115504824, + "loss": 3.7245, + "step": 56910 + }, + { + "epoch": 3.8670335643429814, + "grad_norm": 0.9723739624023438, + "learning_rate": 0.0005168076504959914, + "loss": 3.5371, + "step": 56915 + }, + { + "epoch": 3.867373284413643, + "grad_norm": 0.8392958641052246, + "learning_rate": 0.0005167651854871586, + "loss": 3.5757, + "step": 56920 + }, + { + "epoch": 3.8677130044843047, + "grad_norm": 0.786159873008728, + "learning_rate": 0.0005167227204783259, + "loss": 3.4234, + "step": 56925 + }, + { + "epoch": 3.8680527245549667, + "grad_norm": 0.9147164225578308, + "learning_rate": 0.0005166802554694932, + "loss": 3.561, + "step": 56930 + }, + { + "epoch": 3.8683924446256284, + "grad_norm": 0.8488204479217529, + "learning_rate": 0.0005166377904606604, + "loss": 3.3344, + "step": 56935 + }, + { + "epoch": 3.86873216469629, + "grad_norm": 1.5046764612197876, + "learning_rate": 0.0005165953254518277, + "loss": 3.6141, + "step": 56940 + }, + { + "epoch": 3.869071884766952, + "grad_norm": 1.1847774982452393, + "learning_rate": 0.0005165528604429949, + "loss": 3.772, + "step": 56945 + }, + { + "epoch": 3.8694116048376137, + "grad_norm": 0.9297807216644287, + "learning_rate": 0.0005165103954341623, + "loss": 3.4391, + "step": 56950 + }, + { + "epoch": 3.8697513249082753, + "grad_norm": 0.8311876654624939, + "learning_rate": 0.0005164679304253296, + "loss": 3.5069, + "step": 56955 + }, + { + "epoch": 3.8700910449789374, + "grad_norm": 0.9822632670402527, + "learning_rate": 0.0005164254654164968, + "loss": 3.6945, + "step": 56960 + }, + { + "epoch": 3.870430765049599, + "grad_norm": 0.8008955717086792, + "learning_rate": 0.0005163830004076641, + "loss": 3.6615, + "step": 56965 + }, + { + "epoch": 3.8707704851202607, + "grad_norm": 0.8190264701843262, + "learning_rate": 0.0005163405353988314, + "loss": 3.5432, + "step": 56970 + }, + { + "epoch": 3.8711102051909227, + "grad_norm": 0.99906325340271, + "learning_rate": 0.0005162980703899986, + "loss": 3.4746, + "step": 56975 + }, + { + "epoch": 3.8714499252615844, + "grad_norm": 0.7091568112373352, + "learning_rate": 0.0005162556053811658, + "loss": 3.5762, + "step": 56980 + }, + { + "epoch": 3.871789645332246, + "grad_norm": 0.969938337802887, + "learning_rate": 0.0005162131403723333, + "loss": 3.1318, + "step": 56985 + }, + { + "epoch": 3.872129365402908, + "grad_norm": 1.3380165100097656, + "learning_rate": 0.0005161706753635005, + "loss": 3.573, + "step": 56990 + }, + { + "epoch": 3.8724690854735697, + "grad_norm": 0.8665280342102051, + "learning_rate": 0.0005161282103546677, + "loss": 3.4662, + "step": 56995 + }, + { + "epoch": 3.8728088055442313, + "grad_norm": 0.7217531800270081, + "learning_rate": 0.0005160857453458351, + "loss": 3.5157, + "step": 57000 + }, + { + "epoch": 3.8731485256148934, + "grad_norm": 0.7743408679962158, + "learning_rate": 0.0005160432803370023, + "loss": 3.3517, + "step": 57005 + }, + { + "epoch": 3.873488245685555, + "grad_norm": 0.9551417827606201, + "learning_rate": 0.0005160008153281695, + "loss": 3.511, + "step": 57010 + }, + { + "epoch": 3.8738279657562167, + "grad_norm": 0.9639148712158203, + "learning_rate": 0.0005159583503193369, + "loss": 3.4527, + "step": 57015 + }, + { + "epoch": 3.8741676858268788, + "grad_norm": 1.2646957635879517, + "learning_rate": 0.0005159158853105042, + "loss": 3.6416, + "step": 57020 + }, + { + "epoch": 3.8745074058975404, + "grad_norm": 0.8816516399383545, + "learning_rate": 0.0005158734203016714, + "loss": 3.4773, + "step": 57025 + }, + { + "epoch": 3.874847125968202, + "grad_norm": 1.0330942869186401, + "learning_rate": 0.0005158309552928388, + "loss": 3.3577, + "step": 57030 + }, + { + "epoch": 3.875186846038864, + "grad_norm": 1.0460010766983032, + "learning_rate": 0.000515788490284006, + "loss": 3.613, + "step": 57035 + }, + { + "epoch": 3.8755265661095257, + "grad_norm": 0.7395579814910889, + "learning_rate": 0.0005157460252751732, + "loss": 3.332, + "step": 57040 + }, + { + "epoch": 3.8758662861801874, + "grad_norm": 1.009875774383545, + "learning_rate": 0.0005157035602663405, + "loss": 3.4328, + "step": 57045 + }, + { + "epoch": 3.8762060062508494, + "grad_norm": 1.0408645868301392, + "learning_rate": 0.0005156610952575078, + "loss": 3.4684, + "step": 57050 + }, + { + "epoch": 3.876545726321511, + "grad_norm": 1.766273021697998, + "learning_rate": 0.0005156186302486751, + "loss": 3.3331, + "step": 57055 + }, + { + "epoch": 3.8768854463921727, + "grad_norm": 0.775963544845581, + "learning_rate": 0.0005155761652398424, + "loss": 3.5153, + "step": 57060 + }, + { + "epoch": 3.8772251664628348, + "grad_norm": 1.0458781719207764, + "learning_rate": 0.0005155337002310097, + "loss": 3.2091, + "step": 57065 + }, + { + "epoch": 3.8775648865334964, + "grad_norm": 0.8542417883872986, + "learning_rate": 0.0005154912352221769, + "loss": 3.3556, + "step": 57070 + }, + { + "epoch": 3.877904606604158, + "grad_norm": 0.8828438520431519, + "learning_rate": 0.0005154487702133442, + "loss": 3.7103, + "step": 57075 + }, + { + "epoch": 3.87824432667482, + "grad_norm": 0.980075478553772, + "learning_rate": 0.0005154063052045114, + "loss": 3.4423, + "step": 57080 + }, + { + "epoch": 3.8785840467454817, + "grad_norm": 0.839630126953125, + "learning_rate": 0.0005153638401956787, + "loss": 3.8739, + "step": 57085 + }, + { + "epoch": 3.8789237668161434, + "grad_norm": 1.0373154878616333, + "learning_rate": 0.0005153213751868461, + "loss": 3.7106, + "step": 57090 + }, + { + "epoch": 3.8792634868868054, + "grad_norm": 1.015345811843872, + "learning_rate": 0.0005152789101780133, + "loss": 3.4807, + "step": 57095 + }, + { + "epoch": 3.879603206957467, + "grad_norm": 0.8028944730758667, + "learning_rate": 0.0005152364451691806, + "loss": 3.4797, + "step": 57100 + }, + { + "epoch": 3.8799429270281287, + "grad_norm": 0.8579580187797546, + "learning_rate": 0.0005151939801603479, + "loss": 3.523, + "step": 57105 + }, + { + "epoch": 3.880282647098791, + "grad_norm": 0.8851569890975952, + "learning_rate": 0.0005151515151515151, + "loss": 3.5237, + "step": 57110 + }, + { + "epoch": 3.8806223671694524, + "grad_norm": 0.6991825103759766, + "learning_rate": 0.0005151090501426824, + "loss": 3.5159, + "step": 57115 + }, + { + "epoch": 3.880962087240114, + "grad_norm": 0.929646909236908, + "learning_rate": 0.0005150665851338497, + "loss": 3.43, + "step": 57120 + }, + { + "epoch": 3.881301807310776, + "grad_norm": 0.7679665088653564, + "learning_rate": 0.000515024120125017, + "loss": 3.4878, + "step": 57125 + }, + { + "epoch": 3.8816415273814378, + "grad_norm": 0.8433682918548584, + "learning_rate": 0.0005149816551161842, + "loss": 3.6267, + "step": 57130 + }, + { + "epoch": 3.8819812474520994, + "grad_norm": 0.8524599671363831, + "learning_rate": 0.0005149391901073516, + "loss": 3.3677, + "step": 57135 + }, + { + "epoch": 3.8823209675227615, + "grad_norm": 0.8748569488525391, + "learning_rate": 0.0005148967250985188, + "loss": 3.5246, + "step": 57140 + }, + { + "epoch": 3.882660687593423, + "grad_norm": 0.7771814465522766, + "learning_rate": 0.000514854260089686, + "loss": 3.3166, + "step": 57145 + }, + { + "epoch": 3.8830004076640847, + "grad_norm": 0.7792397141456604, + "learning_rate": 0.0005148117950808534, + "loss": 3.3898, + "step": 57150 + }, + { + "epoch": 3.883340127734747, + "grad_norm": 0.905968964099884, + "learning_rate": 0.0005147693300720206, + "loss": 3.4163, + "step": 57155 + }, + { + "epoch": 3.8836798478054084, + "grad_norm": 1.350589394569397, + "learning_rate": 0.000514726865063188, + "loss": 3.8427, + "step": 57160 + }, + { + "epoch": 3.88401956787607, + "grad_norm": 0.90104079246521, + "learning_rate": 0.0005146844000543553, + "loss": 3.6259, + "step": 57165 + }, + { + "epoch": 3.884359287946732, + "grad_norm": 0.7808669805526733, + "learning_rate": 0.0005146419350455225, + "loss": 3.5331, + "step": 57170 + }, + { + "epoch": 3.8846990080173938, + "grad_norm": 0.9017115235328674, + "learning_rate": 0.0005145994700366898, + "loss": 3.4077, + "step": 57175 + }, + { + "epoch": 3.8850387280880554, + "grad_norm": 0.7593175768852234, + "learning_rate": 0.000514557005027857, + "loss": 3.3257, + "step": 57180 + }, + { + "epoch": 3.8853784481587175, + "grad_norm": 0.7938282489776611, + "learning_rate": 0.0005145145400190243, + "loss": 3.4859, + "step": 57185 + }, + { + "epoch": 3.885718168229379, + "grad_norm": 0.7819607257843018, + "learning_rate": 0.0005144720750101916, + "loss": 3.5705, + "step": 57190 + }, + { + "epoch": 3.8860578883000407, + "grad_norm": 0.7027607560157776, + "learning_rate": 0.0005144296100013589, + "loss": 3.5546, + "step": 57195 + }, + { + "epoch": 3.886397608370703, + "grad_norm": 0.870997965335846, + "learning_rate": 0.0005143871449925262, + "loss": 3.2548, + "step": 57200 + }, + { + "epoch": 3.8867373284413644, + "grad_norm": 0.8003460168838501, + "learning_rate": 0.0005143446799836935, + "loss": 3.5346, + "step": 57205 + }, + { + "epoch": 3.887077048512026, + "grad_norm": 0.8084418773651123, + "learning_rate": 0.0005143022149748607, + "loss": 3.613, + "step": 57210 + }, + { + "epoch": 3.887416768582688, + "grad_norm": 5.198816299438477, + "learning_rate": 0.000514259749966028, + "loss": 3.7441, + "step": 57215 + }, + { + "epoch": 3.8877564886533498, + "grad_norm": 0.8484058380126953, + "learning_rate": 0.0005142172849571953, + "loss": 3.5515, + "step": 57220 + }, + { + "epoch": 3.8880962087240114, + "grad_norm": 0.7840083241462708, + "learning_rate": 0.0005141748199483625, + "loss": 3.4103, + "step": 57225 + }, + { + "epoch": 3.888435928794673, + "grad_norm": 1.1401288509368896, + "learning_rate": 0.0005141323549395299, + "loss": 3.4186, + "step": 57230 + }, + { + "epoch": 3.888775648865335, + "grad_norm": 0.8520201444625854, + "learning_rate": 0.0005140898899306972, + "loss": 3.5722, + "step": 57235 + }, + { + "epoch": 3.8891153689359967, + "grad_norm": 0.7398256063461304, + "learning_rate": 0.0005140474249218644, + "loss": 3.3901, + "step": 57240 + }, + { + "epoch": 3.8894550890066584, + "grad_norm": 0.8958461880683899, + "learning_rate": 0.0005140049599130316, + "loss": 3.4174, + "step": 57245 + }, + { + "epoch": 3.8897948090773204, + "grad_norm": 0.6988781094551086, + "learning_rate": 0.000513962494904199, + "loss": 3.4016, + "step": 57250 + }, + { + "epoch": 3.890134529147982, + "grad_norm": 0.7490710616111755, + "learning_rate": 0.0005139200298953662, + "loss": 3.5305, + "step": 57255 + }, + { + "epoch": 3.8904742492186437, + "grad_norm": 0.9246149659156799, + "learning_rate": 0.0005138775648865334, + "loss": 3.5699, + "step": 57260 + }, + { + "epoch": 3.8908139692893053, + "grad_norm": 0.9762859344482422, + "learning_rate": 0.0005138350998777009, + "loss": 3.5949, + "step": 57265 + }, + { + "epoch": 3.8911536893599674, + "grad_norm": 1.3036695718765259, + "learning_rate": 0.0005137926348688681, + "loss": 3.3754, + "step": 57270 + }, + { + "epoch": 3.891493409430629, + "grad_norm": 1.2211401462554932, + "learning_rate": 0.0005137501698600353, + "loss": 3.5038, + "step": 57275 + }, + { + "epoch": 3.8918331295012907, + "grad_norm": 0.8336858749389648, + "learning_rate": 0.0005137077048512027, + "loss": 3.3369, + "step": 57280 + }, + { + "epoch": 3.8921728495719528, + "grad_norm": 0.8282257914543152, + "learning_rate": 0.0005136652398423699, + "loss": 3.5309, + "step": 57285 + }, + { + "epoch": 3.8925125696426144, + "grad_norm": 0.9944437742233276, + "learning_rate": 0.0005136227748335371, + "loss": 3.5847, + "step": 57290 + }, + { + "epoch": 3.892852289713276, + "grad_norm": 0.8761589527130127, + "learning_rate": 0.0005135803098247045, + "loss": 3.6066, + "step": 57295 + }, + { + "epoch": 3.893192009783938, + "grad_norm": 1.056311011314392, + "learning_rate": 0.0005135378448158718, + "loss": 3.5038, + "step": 57300 + }, + { + "epoch": 3.8935317298545997, + "grad_norm": 0.889857828617096, + "learning_rate": 0.000513495379807039, + "loss": 3.7979, + "step": 57305 + }, + { + "epoch": 3.8938714499252614, + "grad_norm": 0.7912567853927612, + "learning_rate": 0.0005134529147982063, + "loss": 3.5632, + "step": 57310 + }, + { + "epoch": 3.8942111699959234, + "grad_norm": 1.0803968906402588, + "learning_rate": 0.0005134104497893736, + "loss": 3.5872, + "step": 57315 + }, + { + "epoch": 3.894550890066585, + "grad_norm": 1.0242215394973755, + "learning_rate": 0.0005133679847805408, + "loss": 3.6359, + "step": 57320 + }, + { + "epoch": 3.8948906101372467, + "grad_norm": 0.820503830909729, + "learning_rate": 0.0005133255197717081, + "loss": 3.5565, + "step": 57325 + }, + { + "epoch": 3.8952303302079088, + "grad_norm": 0.8312424421310425, + "learning_rate": 0.0005132830547628755, + "loss": 3.5478, + "step": 57330 + }, + { + "epoch": 3.8955700502785704, + "grad_norm": 0.9335893988609314, + "learning_rate": 0.0005132405897540427, + "loss": 3.5805, + "step": 57335 + }, + { + "epoch": 3.895909770349232, + "grad_norm": 0.8250530958175659, + "learning_rate": 0.00051319812474521, + "loss": 3.3324, + "step": 57340 + }, + { + "epoch": 3.896249490419894, + "grad_norm": 1.0678486824035645, + "learning_rate": 0.0005131556597363772, + "loss": 3.6091, + "step": 57345 + }, + { + "epoch": 3.8965892104905557, + "grad_norm": 0.6195693612098694, + "learning_rate": 0.0005131131947275445, + "loss": 3.8687, + "step": 57350 + }, + { + "epoch": 3.8969289305612174, + "grad_norm": 0.746182918548584, + "learning_rate": 0.0005130707297187118, + "loss": 3.6035, + "step": 57355 + }, + { + "epoch": 3.8972686506318794, + "grad_norm": 0.8712149858474731, + "learning_rate": 0.000513028264709879, + "loss": 3.5906, + "step": 57360 + }, + { + "epoch": 3.897608370702541, + "grad_norm": 0.8051648139953613, + "learning_rate": 0.0005129857997010464, + "loss": 3.6346, + "step": 57365 + }, + { + "epoch": 3.8979480907732027, + "grad_norm": 0.773091197013855, + "learning_rate": 0.0005129433346922137, + "loss": 3.646, + "step": 57370 + }, + { + "epoch": 3.8982878108438648, + "grad_norm": 1.0084558725357056, + "learning_rate": 0.0005129008696833809, + "loss": 3.7006, + "step": 57375 + }, + { + "epoch": 3.8986275309145264, + "grad_norm": 1.123521327972412, + "learning_rate": 0.0005128584046745481, + "loss": 3.7426, + "step": 57380 + }, + { + "epoch": 3.898967250985188, + "grad_norm": 0.723750650882721, + "learning_rate": 0.0005128159396657155, + "loss": 3.5219, + "step": 57385 + }, + { + "epoch": 3.89930697105585, + "grad_norm": 0.8818126320838928, + "learning_rate": 0.0005127734746568827, + "loss": 3.6426, + "step": 57390 + }, + { + "epoch": 3.8996466911265117, + "grad_norm": 0.9635238647460938, + "learning_rate": 0.0005127310096480499, + "loss": 3.4319, + "step": 57395 + }, + { + "epoch": 3.8999864111971734, + "grad_norm": 1.022154450416565, + "learning_rate": 0.0005126885446392174, + "loss": 3.3583, + "step": 57400 + }, + { + "epoch": 3.9003261312678354, + "grad_norm": 0.7239106893539429, + "learning_rate": 0.0005126460796303846, + "loss": 3.3177, + "step": 57405 + }, + { + "epoch": 3.900665851338497, + "grad_norm": 0.7653338313102722, + "learning_rate": 0.0005126036146215518, + "loss": 3.4048, + "step": 57410 + }, + { + "epoch": 3.9010055714091587, + "grad_norm": 0.835209310054779, + "learning_rate": 0.0005125611496127192, + "loss": 3.29, + "step": 57415 + }, + { + "epoch": 3.901345291479821, + "grad_norm": 0.949242353439331, + "learning_rate": 0.0005125186846038864, + "loss": 3.4799, + "step": 57420 + }, + { + "epoch": 3.9016850115504824, + "grad_norm": 0.8888781666755676, + "learning_rate": 0.0005124762195950536, + "loss": 3.5659, + "step": 57425 + }, + { + "epoch": 3.902024731621144, + "grad_norm": 0.8218334317207336, + "learning_rate": 0.000512433754586221, + "loss": 3.5577, + "step": 57430 + }, + { + "epoch": 3.902364451691806, + "grad_norm": 0.8771854639053345, + "learning_rate": 0.0005123912895773883, + "loss": 3.3659, + "step": 57435 + }, + { + "epoch": 3.9027041717624678, + "grad_norm": 0.7885036468505859, + "learning_rate": 0.0005123488245685555, + "loss": 3.6251, + "step": 57440 + }, + { + "epoch": 3.9030438918331294, + "grad_norm": 1.110485553741455, + "learning_rate": 0.0005123063595597228, + "loss": 3.2329, + "step": 57445 + }, + { + "epoch": 3.9033836119037915, + "grad_norm": 0.7711028456687927, + "learning_rate": 0.0005122638945508901, + "loss": 3.324, + "step": 57450 + }, + { + "epoch": 3.903723331974453, + "grad_norm": 1.054261565208435, + "learning_rate": 0.0005122214295420573, + "loss": 3.7695, + "step": 57455 + }, + { + "epoch": 3.9040630520451147, + "grad_norm": 0.7413650751113892, + "learning_rate": 0.0005121789645332246, + "loss": 3.5265, + "step": 57460 + }, + { + "epoch": 3.904402772115777, + "grad_norm": 0.787361741065979, + "learning_rate": 0.0005121364995243919, + "loss": 3.5794, + "step": 57465 + }, + { + "epoch": 3.9047424921864384, + "grad_norm": 1.0933198928833008, + "learning_rate": 0.0005120940345155592, + "loss": 3.4944, + "step": 57470 + }, + { + "epoch": 3.9050822122571, + "grad_norm": 0.9275854229927063, + "learning_rate": 0.0005120515695067265, + "loss": 3.367, + "step": 57475 + }, + { + "epoch": 3.905421932327762, + "grad_norm": 0.9533092379570007, + "learning_rate": 0.0005120091044978937, + "loss": 3.541, + "step": 57480 + }, + { + "epoch": 3.9057616523984238, + "grad_norm": 0.9793087244033813, + "learning_rate": 0.000511966639489061, + "loss": 3.3625, + "step": 57485 + }, + { + "epoch": 3.9061013724690854, + "grad_norm": 1.314186692237854, + "learning_rate": 0.0005119241744802283, + "loss": 3.0588, + "step": 57490 + }, + { + "epoch": 3.9064410925397475, + "grad_norm": 1.5261478424072266, + "learning_rate": 0.0005118817094713955, + "loss": 3.5793, + "step": 57495 + }, + { + "epoch": 3.906780812610409, + "grad_norm": 0.800995409488678, + "learning_rate": 0.0005118392444625629, + "loss": 3.5983, + "step": 57500 + }, + { + "epoch": 3.9071205326810707, + "grad_norm": 0.7834985256195068, + "learning_rate": 0.0005117967794537302, + "loss": 3.5901, + "step": 57505 + }, + { + "epoch": 3.907460252751733, + "grad_norm": 0.7107078433036804, + "learning_rate": 0.0005117543144448974, + "loss": 3.3535, + "step": 57510 + }, + { + "epoch": 3.9077999728223944, + "grad_norm": 1.0284098386764526, + "learning_rate": 0.0005117118494360648, + "loss": 3.597, + "step": 57515 + }, + { + "epoch": 3.908139692893056, + "grad_norm": 0.9475288987159729, + "learning_rate": 0.000511669384427232, + "loss": 3.5888, + "step": 57520 + }, + { + "epoch": 3.908479412963718, + "grad_norm": 1.122776985168457, + "learning_rate": 0.0005116269194183992, + "loss": 3.2436, + "step": 57525 + }, + { + "epoch": 3.90881913303438, + "grad_norm": 1.036953330039978, + "learning_rate": 0.0005115844544095665, + "loss": 3.459, + "step": 57530 + }, + { + "epoch": 3.9091588531050414, + "grad_norm": 0.9882097840309143, + "learning_rate": 0.0005115419894007338, + "loss": 3.5937, + "step": 57535 + }, + { + "epoch": 3.9094985731757035, + "grad_norm": 0.8656896352767944, + "learning_rate": 0.0005114995243919011, + "loss": 3.4975, + "step": 57540 + }, + { + "epoch": 3.909838293246365, + "grad_norm": 0.9994798302650452, + "learning_rate": 0.0005114570593830684, + "loss": 3.5811, + "step": 57545 + }, + { + "epoch": 3.9101780133170267, + "grad_norm": 1.1553641557693481, + "learning_rate": 0.0005114145943742357, + "loss": 3.4708, + "step": 57550 + }, + { + "epoch": 3.910517733387689, + "grad_norm": 0.995116114616394, + "learning_rate": 0.0005113721293654029, + "loss": 3.3929, + "step": 57555 + }, + { + "epoch": 3.9108574534583505, + "grad_norm": 0.7870581746101379, + "learning_rate": 0.0005113296643565702, + "loss": 3.6373, + "step": 57560 + }, + { + "epoch": 3.911197173529012, + "grad_norm": 0.7483757138252258, + "learning_rate": 0.0005112871993477375, + "loss": 3.5433, + "step": 57565 + }, + { + "epoch": 3.9115368935996737, + "grad_norm": 0.6820041537284851, + "learning_rate": 0.0005112447343389047, + "loss": 3.4485, + "step": 57570 + }, + { + "epoch": 3.911876613670336, + "grad_norm": 0.8858791589736938, + "learning_rate": 0.0005112022693300721, + "loss": 3.5039, + "step": 57575 + }, + { + "epoch": 3.9122163337409974, + "grad_norm": 0.7981573343276978, + "learning_rate": 0.0005111598043212393, + "loss": 3.5436, + "step": 57580 + }, + { + "epoch": 3.912556053811659, + "grad_norm": 0.8629791140556335, + "learning_rate": 0.0005111173393124066, + "loss": 3.461, + "step": 57585 + }, + { + "epoch": 3.912895773882321, + "grad_norm": 0.9032166600227356, + "learning_rate": 0.0005110748743035739, + "loss": 3.729, + "step": 57590 + }, + { + "epoch": 3.9132354939529828, + "grad_norm": 1.2385836839675903, + "learning_rate": 0.0005110324092947411, + "loss": 3.3276, + "step": 57595 + }, + { + "epoch": 3.9135752140236444, + "grad_norm": 1.3065160512924194, + "learning_rate": 0.0005109899442859084, + "loss": 3.5724, + "step": 57600 + }, + { + "epoch": 3.913914934094306, + "grad_norm": 0.8931924104690552, + "learning_rate": 0.0005109474792770757, + "loss": 3.5877, + "step": 57605 + }, + { + "epoch": 3.914254654164968, + "grad_norm": 0.8525195717811584, + "learning_rate": 0.000510905014268243, + "loss": 3.4426, + "step": 57610 + }, + { + "epoch": 3.9145943742356297, + "grad_norm": 1.126924753189087, + "learning_rate": 0.0005108625492594103, + "loss": 3.5265, + "step": 57615 + }, + { + "epoch": 3.9149340943062914, + "grad_norm": 0.8245852589607239, + "learning_rate": 0.0005108200842505776, + "loss": 3.5447, + "step": 57620 + }, + { + "epoch": 3.9152738143769534, + "grad_norm": 0.8034182190895081, + "learning_rate": 0.0005107776192417448, + "loss": 3.7271, + "step": 57625 + }, + { + "epoch": 3.915613534447615, + "grad_norm": 1.0464674234390259, + "learning_rate": 0.000510735154232912, + "loss": 3.4077, + "step": 57630 + }, + { + "epoch": 3.9159532545182767, + "grad_norm": 0.771489143371582, + "learning_rate": 0.0005106926892240794, + "loss": 3.666, + "step": 57635 + }, + { + "epoch": 3.9162929745889388, + "grad_norm": NaN, + "learning_rate": 0.0005106587172170132, + "loss": 3.5908, + "step": 57640 + }, + { + "epoch": 3.9166326946596004, + "grad_norm": 0.8605748414993286, + "learning_rate": 0.0005106162522081804, + "loss": 3.6859, + "step": 57645 + }, + { + "epoch": 3.916972414730262, + "grad_norm": 0.6843796968460083, + "learning_rate": 0.0005105737871993478, + "loss": 3.3872, + "step": 57650 + }, + { + "epoch": 3.917312134800924, + "grad_norm": 0.9121683835983276, + "learning_rate": 0.000510531322190515, + "loss": 3.3517, + "step": 57655 + }, + { + "epoch": 3.9176518548715857, + "grad_norm": 0.9333606362342834, + "learning_rate": 0.0005104888571816822, + "loss": 3.3034, + "step": 57660 + }, + { + "epoch": 3.9179915749422474, + "grad_norm": 0.586627721786499, + "learning_rate": 0.0005104463921728497, + "loss": 3.6765, + "step": 57665 + }, + { + "epoch": 3.9183312950129094, + "grad_norm": 0.9150449633598328, + "learning_rate": 0.0005104039271640169, + "loss": 3.4363, + "step": 57670 + }, + { + "epoch": 3.918671015083571, + "grad_norm": 0.7935352325439453, + "learning_rate": 0.0005103614621551841, + "loss": 3.6924, + "step": 57675 + }, + { + "epoch": 3.9190107351542327, + "grad_norm": 0.8821390271186829, + "learning_rate": 0.0005103189971463514, + "loss": 3.4793, + "step": 57680 + }, + { + "epoch": 3.919350455224895, + "grad_norm": 0.7919027805328369, + "learning_rate": 0.0005102765321375187, + "loss": 3.5155, + "step": 57685 + }, + { + "epoch": 3.9196901752955564, + "grad_norm": 1.084805965423584, + "learning_rate": 0.0005102340671286859, + "loss": 3.5235, + "step": 57690 + }, + { + "epoch": 3.920029895366218, + "grad_norm": 0.6750452518463135, + "learning_rate": 0.0005101916021198532, + "loss": 3.8304, + "step": 57695 + }, + { + "epoch": 3.92036961543688, + "grad_norm": 0.8276028037071228, + "learning_rate": 0.0005101491371110206, + "loss": 3.7317, + "step": 57700 + }, + { + "epoch": 3.9207093355075417, + "grad_norm": 0.7802964448928833, + "learning_rate": 0.0005101066721021879, + "loss": 3.6955, + "step": 57705 + }, + { + "epoch": 3.9210490555782034, + "grad_norm": 1.0042442083358765, + "learning_rate": 0.0005100642070933551, + "loss": 3.5293, + "step": 57710 + }, + { + "epoch": 3.9213887756488655, + "grad_norm": 0.940449059009552, + "learning_rate": 0.0005100217420845223, + "loss": 3.3864, + "step": 57715 + }, + { + "epoch": 3.921728495719527, + "grad_norm": 1.0492037534713745, + "learning_rate": 0.0005099792770756897, + "loss": 3.2865, + "step": 57720 + }, + { + "epoch": 3.9220682157901887, + "grad_norm": 0.888658344745636, + "learning_rate": 0.0005099368120668569, + "loss": 3.446, + "step": 57725 + }, + { + "epoch": 3.922407935860851, + "grad_norm": 1.0922069549560547, + "learning_rate": 0.0005098943470580241, + "loss": 3.5724, + "step": 57730 + }, + { + "epoch": 3.9227476559315124, + "grad_norm": 0.8466131091117859, + "learning_rate": 0.0005098518820491916, + "loss": 3.5208, + "step": 57735 + }, + { + "epoch": 3.923087376002174, + "grad_norm": 1.7191169261932373, + "learning_rate": 0.0005098094170403588, + "loss": 3.347, + "step": 57740 + }, + { + "epoch": 3.923427096072836, + "grad_norm": 0.8131155967712402, + "learning_rate": 0.000509766952031526, + "loss": 3.4226, + "step": 57745 + }, + { + "epoch": 3.9237668161434978, + "grad_norm": 0.9421606659889221, + "learning_rate": 0.0005097244870226934, + "loss": 3.665, + "step": 57750 + }, + { + "epoch": 3.9241065362141594, + "grad_norm": 2.074309825897217, + "learning_rate": 0.0005096820220138606, + "loss": 3.5343, + "step": 57755 + }, + { + "epoch": 3.9244462562848215, + "grad_norm": 0.8156411051750183, + "learning_rate": 0.0005096395570050278, + "loss": 3.5427, + "step": 57760 + }, + { + "epoch": 3.924785976355483, + "grad_norm": 0.6627988219261169, + "learning_rate": 0.0005095970919961951, + "loss": 3.5656, + "step": 57765 + }, + { + "epoch": 3.9251256964261447, + "grad_norm": 0.9967922568321228, + "learning_rate": 0.0005095546269873625, + "loss": 3.3468, + "step": 57770 + }, + { + "epoch": 3.925465416496807, + "grad_norm": 1.1125835180282593, + "learning_rate": 0.0005095121619785297, + "loss": 3.5673, + "step": 57775 + }, + { + "epoch": 3.9258051365674684, + "grad_norm": 0.7692672610282898, + "learning_rate": 0.000509469696969697, + "loss": 3.3234, + "step": 57780 + }, + { + "epoch": 3.92614485663813, + "grad_norm": 1.1076446771621704, + "learning_rate": 0.0005094272319608643, + "loss": 3.5991, + "step": 57785 + }, + { + "epoch": 3.926484576708792, + "grad_norm": 0.8201953172683716, + "learning_rate": 0.0005093847669520315, + "loss": 3.5615, + "step": 57790 + }, + { + "epoch": 3.9268242967794538, + "grad_norm": 0.8156509399414062, + "learning_rate": 0.0005093423019431988, + "loss": 3.2514, + "step": 57795 + }, + { + "epoch": 3.9271640168501154, + "grad_norm": 1.0051909685134888, + "learning_rate": 0.000509299836934366, + "loss": 3.6097, + "step": 57800 + }, + { + "epoch": 3.9275037369207775, + "grad_norm": 0.9710397720336914, + "learning_rate": 0.0005092573719255334, + "loss": 3.6388, + "step": 57805 + }, + { + "epoch": 3.927843456991439, + "grad_norm": 1.0676183700561523, + "learning_rate": 0.0005092149069167007, + "loss": 3.4286, + "step": 57810 + }, + { + "epoch": 3.9281831770621007, + "grad_norm": 0.9420333504676819, + "learning_rate": 0.000509172441907868, + "loss": 3.4239, + "step": 57815 + }, + { + "epoch": 3.928522897132763, + "grad_norm": 0.9030850529670715, + "learning_rate": 0.0005091299768990352, + "loss": 3.3254, + "step": 57820 + }, + { + "epoch": 3.9288626172034244, + "grad_norm": 1.3166937828063965, + "learning_rate": 0.0005090875118902025, + "loss": 3.6439, + "step": 57825 + }, + { + "epoch": 3.929202337274086, + "grad_norm": 0.8145439624786377, + "learning_rate": 0.0005090450468813697, + "loss": 3.5613, + "step": 57830 + }, + { + "epoch": 3.929542057344748, + "grad_norm": 1.1031310558319092, + "learning_rate": 0.000509002581872537, + "loss": 3.2813, + "step": 57835 + }, + { + "epoch": 3.92988177741541, + "grad_norm": 0.8910592198371887, + "learning_rate": 0.0005089601168637044, + "loss": 3.288, + "step": 57840 + }, + { + "epoch": 3.9302214974860714, + "grad_norm": 0.9540947079658508, + "learning_rate": 0.0005089176518548716, + "loss": 3.6123, + "step": 57845 + }, + { + "epoch": 3.9305612175567335, + "grad_norm": 0.7896159887313843, + "learning_rate": 0.0005088751868460389, + "loss": 3.4867, + "step": 57850 + }, + { + "epoch": 3.930900937627395, + "grad_norm": 0.7346255779266357, + "learning_rate": 0.0005088327218372062, + "loss": 3.5148, + "step": 57855 + }, + { + "epoch": 3.9312406576980568, + "grad_norm": 0.8689724206924438, + "learning_rate": 0.0005087902568283734, + "loss": 3.6471, + "step": 57860 + }, + { + "epoch": 3.931580377768719, + "grad_norm": 0.9524775147438049, + "learning_rate": 0.0005087477918195406, + "loss": 3.4378, + "step": 57865 + }, + { + "epoch": 3.9319200978393805, + "grad_norm": 1.3496228456497192, + "learning_rate": 0.000508705326810708, + "loss": 3.2953, + "step": 57870 + }, + { + "epoch": 3.932259817910042, + "grad_norm": 0.8638948798179626, + "learning_rate": 0.0005086628618018753, + "loss": 3.6192, + "step": 57875 + }, + { + "epoch": 3.932599537980704, + "grad_norm": 0.7366384267807007, + "learning_rate": 0.0005086203967930425, + "loss": 3.4335, + "step": 57880 + }, + { + "epoch": 3.932939258051366, + "grad_norm": 0.9562558531761169, + "learning_rate": 0.0005085779317842099, + "loss": 3.6541, + "step": 57885 + }, + { + "epoch": 3.9332789781220274, + "grad_norm": 0.7516249418258667, + "learning_rate": 0.0005085354667753771, + "loss": 3.4857, + "step": 57890 + }, + { + "epoch": 3.9336186981926895, + "grad_norm": 0.7454807758331299, + "learning_rate": 0.0005084930017665443, + "loss": 3.4634, + "step": 57895 + }, + { + "epoch": 3.933958418263351, + "grad_norm": 0.9412474632263184, + "learning_rate": 0.0005084505367577117, + "loss": 3.3297, + "step": 57900 + }, + { + "epoch": 3.9342981383340128, + "grad_norm": 0.9617710709571838, + "learning_rate": 0.0005084080717488789, + "loss": 3.6175, + "step": 57905 + }, + { + "epoch": 3.9346378584046744, + "grad_norm": 0.7677515149116516, + "learning_rate": 0.0005083656067400462, + "loss": 3.6971, + "step": 57910 + }, + { + "epoch": 3.9349775784753365, + "grad_norm": 0.7997751832008362, + "learning_rate": 0.0005083231417312135, + "loss": 3.6206, + "step": 57915 + }, + { + "epoch": 3.935317298545998, + "grad_norm": 1.1940951347351074, + "learning_rate": 0.0005082806767223808, + "loss": 3.266, + "step": 57920 + }, + { + "epoch": 3.9356570186166597, + "grad_norm": 1.2010533809661865, + "learning_rate": 0.000508238211713548, + "loss": 3.6422, + "step": 57925 + }, + { + "epoch": 3.935996738687322, + "grad_norm": 0.9770098924636841, + "learning_rate": 0.0005081957467047153, + "loss": 3.3378, + "step": 57930 + }, + { + "epoch": 3.9363364587579834, + "grad_norm": 0.7966454029083252, + "learning_rate": 0.0005081532816958826, + "loss": 3.6021, + "step": 57935 + }, + { + "epoch": 3.936676178828645, + "grad_norm": 1.185572862625122, + "learning_rate": 0.0005081108166870498, + "loss": 3.4309, + "step": 57940 + }, + { + "epoch": 3.9370158988993067, + "grad_norm": 0.7693960070610046, + "learning_rate": 0.0005080683516782172, + "loss": 3.151, + "step": 57945 + }, + { + "epoch": 3.9373556189699688, + "grad_norm": 1.0086668729782104, + "learning_rate": 0.0005080258866693845, + "loss": 3.5059, + "step": 57950 + }, + { + "epoch": 3.9376953390406304, + "grad_norm": 1.0467751026153564, + "learning_rate": 0.0005079834216605517, + "loss": 3.5643, + "step": 57955 + }, + { + "epoch": 3.938035059111292, + "grad_norm": 0.8268922567367554, + "learning_rate": 0.000507940956651719, + "loss": 3.2828, + "step": 57960 + }, + { + "epoch": 3.938374779181954, + "grad_norm": 0.8494259119033813, + "learning_rate": 0.0005078984916428862, + "loss": 3.6161, + "step": 57965 + }, + { + "epoch": 3.9387144992526157, + "grad_norm": 1.0625147819519043, + "learning_rate": 0.0005078560266340535, + "loss": 3.7323, + "step": 57970 + }, + { + "epoch": 3.9390542193232774, + "grad_norm": 0.7889636754989624, + "learning_rate": 0.0005078135616252208, + "loss": 3.4769, + "step": 57975 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.8562115430831909, + "learning_rate": 0.0005077710966163881, + "loss": 3.309, + "step": 57980 + }, + { + "epoch": 3.939733659464601, + "grad_norm": 0.845073938369751, + "learning_rate": 0.0005077286316075554, + "loss": 3.5852, + "step": 57985 + }, + { + "epoch": 3.9400733795352627, + "grad_norm": 1.1771830320358276, + "learning_rate": 0.0005076861665987227, + "loss": 3.4956, + "step": 57990 + }, + { + "epoch": 3.940413099605925, + "grad_norm": 0.8361566662788391, + "learning_rate": 0.0005076437015898899, + "loss": 3.3519, + "step": 57995 + }, + { + "epoch": 3.9407528196765864, + "grad_norm": 0.8239336013793945, + "learning_rate": 0.0005076012365810571, + "loss": 3.3597, + "step": 58000 + }, + { + "epoch": 3.941092539747248, + "grad_norm": 1.0343875885009766, + "learning_rate": 0.0005075587715722245, + "loss": 3.2062, + "step": 58005 + }, + { + "epoch": 3.94143225981791, + "grad_norm": 0.9432041049003601, + "learning_rate": 0.0005075163065633917, + "loss": 3.5733, + "step": 58010 + }, + { + "epoch": 3.9417719798885718, + "grad_norm": 2.8203182220458984, + "learning_rate": 0.000507473841554559, + "loss": 3.4425, + "step": 58015 + }, + { + "epoch": 3.9421116999592334, + "grad_norm": 0.7992556095123291, + "learning_rate": 0.0005074313765457264, + "loss": 3.5447, + "step": 58020 + }, + { + "epoch": 3.9424514200298955, + "grad_norm": 0.8767862915992737, + "learning_rate": 0.0005073889115368936, + "loss": 3.4985, + "step": 58025 + }, + { + "epoch": 3.942791140100557, + "grad_norm": 1.0771410465240479, + "learning_rate": 0.0005073464465280608, + "loss": 3.4225, + "step": 58030 + }, + { + "epoch": 3.9431308601712187, + "grad_norm": 0.8564871549606323, + "learning_rate": 0.0005073039815192282, + "loss": 3.4217, + "step": 58035 + }, + { + "epoch": 3.943470580241881, + "grad_norm": 0.9158282279968262, + "learning_rate": 0.0005072615165103954, + "loss": 3.4879, + "step": 58040 + }, + { + "epoch": 3.9438103003125424, + "grad_norm": 0.9553199410438538, + "learning_rate": 0.0005072190515015628, + "loss": 3.4986, + "step": 58045 + }, + { + "epoch": 3.944150020383204, + "grad_norm": 0.8343895673751831, + "learning_rate": 0.0005071765864927301, + "loss": 3.3998, + "step": 58050 + }, + { + "epoch": 3.944489740453866, + "grad_norm": 0.781995952129364, + "learning_rate": 0.0005071341214838973, + "loss": 3.659, + "step": 58055 + }, + { + "epoch": 3.9448294605245278, + "grad_norm": 0.9655227661132812, + "learning_rate": 0.0005070916564750646, + "loss": 3.5675, + "step": 58060 + }, + { + "epoch": 3.9451691805951894, + "grad_norm": 1.2866848707199097, + "learning_rate": 0.0005070491914662318, + "loss": 3.4047, + "step": 58065 + }, + { + "epoch": 3.9455089006658515, + "grad_norm": 0.8249284029006958, + "learning_rate": 0.0005070067264573991, + "loss": 3.6359, + "step": 58070 + }, + { + "epoch": 3.945848620736513, + "grad_norm": 1.619341254234314, + "learning_rate": 0.0005069642614485664, + "loss": 3.6633, + "step": 58075 + }, + { + "epoch": 3.9461883408071747, + "grad_norm": 0.7634679675102234, + "learning_rate": 0.0005069217964397337, + "loss": 3.5676, + "step": 58080 + }, + { + "epoch": 3.946528060877837, + "grad_norm": 0.8154359459877014, + "learning_rate": 0.000506879331430901, + "loss": 3.5735, + "step": 58085 + }, + { + "epoch": 3.9468677809484984, + "grad_norm": 0.8728752732276917, + "learning_rate": 0.0005068368664220683, + "loss": 3.4801, + "step": 58090 + }, + { + "epoch": 3.94720750101916, + "grad_norm": 0.8374011516571045, + "learning_rate": 0.0005067944014132355, + "loss": 3.4791, + "step": 58095 + }, + { + "epoch": 3.947547221089822, + "grad_norm": 0.8921509981155396, + "learning_rate": 0.0005067519364044027, + "loss": 3.7126, + "step": 58100 + }, + { + "epoch": 3.9478869411604838, + "grad_norm": 1.2721198797225952, + "learning_rate": 0.0005067094713955701, + "loss": 3.6989, + "step": 58105 + }, + { + "epoch": 3.9482266612311454, + "grad_norm": 0.9803842306137085, + "learning_rate": 0.0005066670063867373, + "loss": 3.2346, + "step": 58110 + }, + { + "epoch": 3.9485663813018075, + "grad_norm": 0.9707382321357727, + "learning_rate": 0.0005066245413779046, + "loss": 3.571, + "step": 58115 + }, + { + "epoch": 3.948906101372469, + "grad_norm": 0.9294204115867615, + "learning_rate": 0.000506582076369072, + "loss": 3.2772, + "step": 58120 + }, + { + "epoch": 3.9492458214431307, + "grad_norm": 0.879185676574707, + "learning_rate": 0.0005065396113602392, + "loss": 3.3468, + "step": 58125 + }, + { + "epoch": 3.949585541513793, + "grad_norm": 1.0609229803085327, + "learning_rate": 0.0005064971463514064, + "loss": 3.8394, + "step": 58130 + }, + { + "epoch": 3.9499252615844545, + "grad_norm": 0.8563007712364197, + "learning_rate": 0.0005064546813425738, + "loss": 3.5456, + "step": 58135 + }, + { + "epoch": 3.950264981655116, + "grad_norm": 0.933224618434906, + "learning_rate": 0.000506412216333741, + "loss": 3.4856, + "step": 58140 + }, + { + "epoch": 3.950604701725778, + "grad_norm": 0.8817102313041687, + "learning_rate": 0.0005063697513249082, + "loss": 3.6121, + "step": 58145 + }, + { + "epoch": 3.95094442179644, + "grad_norm": 0.8613654375076294, + "learning_rate": 0.0005063272863160757, + "loss": 3.5495, + "step": 58150 + }, + { + "epoch": 3.9512841418671014, + "grad_norm": 0.7603500485420227, + "learning_rate": 0.0005062848213072429, + "loss": 3.5627, + "step": 58155 + }, + { + "epoch": 3.9516238619377635, + "grad_norm": 0.8019192814826965, + "learning_rate": 0.0005062423562984101, + "loss": 3.7379, + "step": 58160 + }, + { + "epoch": 3.951963582008425, + "grad_norm": 1.4360169172286987, + "learning_rate": 0.0005061998912895774, + "loss": 3.8377, + "step": 58165 + }, + { + "epoch": 3.9523033020790868, + "grad_norm": 0.954161524772644, + "learning_rate": 0.0005061574262807447, + "loss": 3.4406, + "step": 58170 + }, + { + "epoch": 3.952643022149749, + "grad_norm": 0.9113037586212158, + "learning_rate": 0.0005061149612719119, + "loss": 3.3729, + "step": 58175 + }, + { + "epoch": 3.9529827422204105, + "grad_norm": 0.7251841425895691, + "learning_rate": 0.0005060724962630792, + "loss": 3.3893, + "step": 58180 + }, + { + "epoch": 3.953322462291072, + "grad_norm": 0.9330145120620728, + "learning_rate": 0.0005060300312542466, + "loss": 3.4954, + "step": 58185 + }, + { + "epoch": 3.953662182361734, + "grad_norm": 0.7568967938423157, + "learning_rate": 0.0005059875662454138, + "loss": 3.4765, + "step": 58190 + }, + { + "epoch": 3.954001902432396, + "grad_norm": 1.0054070949554443, + "learning_rate": 0.0005059451012365811, + "loss": 3.457, + "step": 58195 + }, + { + "epoch": 3.9543416225030574, + "grad_norm": 0.8225808143615723, + "learning_rate": 0.0005059026362277484, + "loss": 3.4279, + "step": 58200 + }, + { + "epoch": 3.9546813425737195, + "grad_norm": 1.1128121614456177, + "learning_rate": 0.0005058601712189156, + "loss": 3.5722, + "step": 58205 + }, + { + "epoch": 3.955021062644381, + "grad_norm": 0.8905042409896851, + "learning_rate": 0.0005058177062100829, + "loss": 3.2752, + "step": 58210 + }, + { + "epoch": 3.9553607827150428, + "grad_norm": 0.9994968175888062, + "learning_rate": 0.0005057752412012501, + "loss": 3.2636, + "step": 58215 + }, + { + "epoch": 3.955700502785705, + "grad_norm": 0.9429242014884949, + "learning_rate": 0.0005057327761924175, + "loss": 3.7081, + "step": 58220 + }, + { + "epoch": 3.9560402228563665, + "grad_norm": 0.8492140769958496, + "learning_rate": 0.0005056903111835848, + "loss": 3.3153, + "step": 58225 + }, + { + "epoch": 3.956379942927028, + "grad_norm": 0.8432299494743347, + "learning_rate": 0.000505647846174752, + "loss": 3.3432, + "step": 58230 + }, + { + "epoch": 3.95671966299769, + "grad_norm": 0.7842205166816711, + "learning_rate": 0.0005056053811659193, + "loss": 3.7315, + "step": 58235 + }, + { + "epoch": 3.957059383068352, + "grad_norm": 0.9253696203231812, + "learning_rate": 0.0005055629161570866, + "loss": 3.6334, + "step": 58240 + }, + { + "epoch": 3.9573991031390134, + "grad_norm": 0.8406718373298645, + "learning_rate": 0.0005055204511482538, + "loss": 3.6746, + "step": 58245 + }, + { + "epoch": 3.957738823209675, + "grad_norm": 0.7803347706794739, + "learning_rate": 0.000505477986139421, + "loss": 3.5023, + "step": 58250 + }, + { + "epoch": 3.958078543280337, + "grad_norm": 0.8635876774787903, + "learning_rate": 0.0005054355211305885, + "loss": 3.3061, + "step": 58255 + }, + { + "epoch": 3.958418263350999, + "grad_norm": 0.9071358442306519, + "learning_rate": 0.0005053930561217557, + "loss": 3.3692, + "step": 58260 + }, + { + "epoch": 3.9587579834216604, + "grad_norm": 0.8634017109870911, + "learning_rate": 0.0005053505911129229, + "loss": 3.6319, + "step": 58265 + }, + { + "epoch": 3.9590977034923225, + "grad_norm": 1.0053681135177612, + "learning_rate": 0.0005053081261040903, + "loss": 3.5395, + "step": 58270 + }, + { + "epoch": 3.959437423562984, + "grad_norm": 0.9392582178115845, + "learning_rate": 0.0005052656610952575, + "loss": 3.2616, + "step": 58275 + }, + { + "epoch": 3.9597771436336457, + "grad_norm": 0.8226429224014282, + "learning_rate": 0.0005052231960864247, + "loss": 3.5736, + "step": 58280 + }, + { + "epoch": 3.9601168637043074, + "grad_norm": 0.9347580075263977, + "learning_rate": 0.0005051807310775921, + "loss": 3.5441, + "step": 58285 + }, + { + "epoch": 3.9604565837749695, + "grad_norm": 0.8877529501914978, + "learning_rate": 0.0005051382660687594, + "loss": 3.7932, + "step": 58290 + }, + { + "epoch": 3.960796303845631, + "grad_norm": 0.8019188046455383, + "learning_rate": 0.0005050958010599266, + "loss": 3.5052, + "step": 58295 + }, + { + "epoch": 3.9611360239162927, + "grad_norm": 1.1863377094268799, + "learning_rate": 0.000505053336051094, + "loss": 3.4246, + "step": 58300 + }, + { + "epoch": 3.961475743986955, + "grad_norm": 0.972170889377594, + "learning_rate": 0.0005050108710422612, + "loss": 3.5359, + "step": 58305 + }, + { + "epoch": 3.9618154640576164, + "grad_norm": 0.8710821866989136, + "learning_rate": 0.0005049684060334284, + "loss": 3.4023, + "step": 58310 + }, + { + "epoch": 3.962155184128278, + "grad_norm": 0.8967952132225037, + "learning_rate": 0.0005049259410245957, + "loss": 3.7247, + "step": 58315 + }, + { + "epoch": 3.96249490419894, + "grad_norm": 0.9135844707489014, + "learning_rate": 0.000504883476015763, + "loss": 3.5226, + "step": 58320 + }, + { + "epoch": 3.9628346242696018, + "grad_norm": 0.7362406253814697, + "learning_rate": 0.0005048410110069303, + "loss": 3.4515, + "step": 58325 + }, + { + "epoch": 3.9631743443402634, + "grad_norm": 1.0835920572280884, + "learning_rate": 0.0005047985459980976, + "loss": 3.3855, + "step": 58330 + }, + { + "epoch": 3.9635140644109255, + "grad_norm": 0.722529947757721, + "learning_rate": 0.0005047560809892649, + "loss": 3.5324, + "step": 58335 + }, + { + "epoch": 3.963853784481587, + "grad_norm": 0.7589671611785889, + "learning_rate": 0.0005047136159804321, + "loss": 3.5553, + "step": 58340 + }, + { + "epoch": 3.9641935045522487, + "grad_norm": 1.089255452156067, + "learning_rate": 0.0005046711509715994, + "loss": 3.2103, + "step": 58345 + }, + { + "epoch": 3.964533224622911, + "grad_norm": 0.8645279407501221, + "learning_rate": 0.0005046286859627666, + "loss": 3.8186, + "step": 58350 + }, + { + "epoch": 3.9648729446935724, + "grad_norm": 0.9358604550361633, + "learning_rate": 0.0005045862209539339, + "loss": 3.1685, + "step": 58355 + }, + { + "epoch": 3.965212664764234, + "grad_norm": 0.8498232960700989, + "learning_rate": 0.0005045437559451013, + "loss": 3.4074, + "step": 58360 + }, + { + "epoch": 3.965552384834896, + "grad_norm": 0.7698826193809509, + "learning_rate": 0.0005045012909362685, + "loss": 3.6578, + "step": 58365 + }, + { + "epoch": 3.9658921049055578, + "grad_norm": 1.767526388168335, + "learning_rate": 0.0005044588259274358, + "loss": 3.4536, + "step": 58370 + }, + { + "epoch": 3.9662318249762194, + "grad_norm": 0.8577873706817627, + "learning_rate": 0.0005044163609186031, + "loss": 3.4543, + "step": 58375 + }, + { + "epoch": 3.9665715450468815, + "grad_norm": 0.9142153263092041, + "learning_rate": 0.0005043738959097703, + "loss": 3.5909, + "step": 58380 + }, + { + "epoch": 3.966911265117543, + "grad_norm": 1.0068395137786865, + "learning_rate": 0.0005043314309009377, + "loss": 3.6326, + "step": 58385 + }, + { + "epoch": 3.9672509851882047, + "grad_norm": 0.958506166934967, + "learning_rate": 0.0005042889658921049, + "loss": 3.6436, + "step": 58390 + }, + { + "epoch": 3.967590705258867, + "grad_norm": 0.9199275374412537, + "learning_rate": 0.0005042465008832722, + "loss": 3.6352, + "step": 58395 + }, + { + "epoch": 3.9679304253295284, + "grad_norm": 0.7783728837966919, + "learning_rate": 0.0005042040358744396, + "loss": 3.4343, + "step": 58400 + }, + { + "epoch": 3.96827014540019, + "grad_norm": 0.7204961180686951, + "learning_rate": 0.0005041615708656068, + "loss": 3.429, + "step": 58405 + }, + { + "epoch": 3.968609865470852, + "grad_norm": 1.1507351398468018, + "learning_rate": 0.000504119105856774, + "loss": 3.4539, + "step": 58410 + }, + { + "epoch": 3.968949585541514, + "grad_norm": 0.8438140153884888, + "learning_rate": 0.0005040766408479413, + "loss": 3.2879, + "step": 58415 + }, + { + "epoch": 3.9692893056121754, + "grad_norm": 1.1651077270507812, + "learning_rate": 0.0005040341758391086, + "loss": 3.434, + "step": 58420 + }, + { + "epoch": 3.9696290256828375, + "grad_norm": 2.1911492347717285, + "learning_rate": 0.0005039917108302758, + "loss": 3.808, + "step": 58425 + }, + { + "epoch": 3.969968745753499, + "grad_norm": 1.7138091325759888, + "learning_rate": 0.0005039492458214432, + "loss": 3.3868, + "step": 58430 + }, + { + "epoch": 3.9703084658241607, + "grad_norm": 0.8073131442070007, + "learning_rate": 0.0005039067808126105, + "loss": 3.3214, + "step": 58435 + }, + { + "epoch": 3.970648185894823, + "grad_norm": 0.8921253085136414, + "learning_rate": 0.0005038643158037777, + "loss": 3.4532, + "step": 58440 + }, + { + "epoch": 3.9709879059654845, + "grad_norm": 0.9272505640983582, + "learning_rate": 0.000503821850794945, + "loss": 3.4475, + "step": 58445 + }, + { + "epoch": 3.971327626036146, + "grad_norm": 0.9590405225753784, + "learning_rate": 0.0005037793857861122, + "loss": 3.1615, + "step": 58450 + }, + { + "epoch": 3.971667346106808, + "grad_norm": 1.1287455558776855, + "learning_rate": 0.0005037369207772795, + "loss": 3.5718, + "step": 58455 + }, + { + "epoch": 3.97200706617747, + "grad_norm": 7.902523994445801, + "learning_rate": 0.0005036944557684468, + "loss": 3.4335, + "step": 58460 + }, + { + "epoch": 3.9723467862481314, + "grad_norm": 0.9455626010894775, + "learning_rate": 0.0005036519907596141, + "loss": 3.4772, + "step": 58465 + }, + { + "epoch": 3.9726865063187935, + "grad_norm": 2.648346185684204, + "learning_rate": 0.0005036095257507814, + "loss": 3.3402, + "step": 58470 + }, + { + "epoch": 3.973026226389455, + "grad_norm": 0.8543696999549866, + "learning_rate": 0.0005035670607419487, + "loss": 3.4692, + "step": 58475 + }, + { + "epoch": 3.9733659464601168, + "grad_norm": 0.8983910083770752, + "learning_rate": 0.0005035245957331159, + "loss": 3.5901, + "step": 58480 + }, + { + "epoch": 3.973705666530779, + "grad_norm": 0.8436374068260193, + "learning_rate": 0.0005034821307242832, + "loss": 3.1953, + "step": 58485 + }, + { + "epoch": 3.9740453866014405, + "grad_norm": 1.4477773904800415, + "learning_rate": 0.0005034396657154505, + "loss": 3.6198, + "step": 58490 + }, + { + "epoch": 3.974385106672102, + "grad_norm": 0.8770825266838074, + "learning_rate": 0.0005033972007066177, + "loss": 3.3535, + "step": 58495 + }, + { + "epoch": 3.974724826742764, + "grad_norm": 1.0756946802139282, + "learning_rate": 0.000503354735697785, + "loss": 3.4213, + "step": 58500 + }, + { + "epoch": 3.975064546813426, + "grad_norm": 0.8505082130432129, + "learning_rate": 0.0005033122706889524, + "loss": 3.1982, + "step": 58505 + }, + { + "epoch": 3.9754042668840874, + "grad_norm": 0.7297608852386475, + "learning_rate": 0.0005032698056801196, + "loss": 3.4784, + "step": 58510 + }, + { + "epoch": 3.9757439869547495, + "grad_norm": 0.8716750741004944, + "learning_rate": 0.0005032273406712868, + "loss": 3.6593, + "step": 58515 + }, + { + "epoch": 3.976083707025411, + "grad_norm": 0.9443747401237488, + "learning_rate": 0.0005031848756624542, + "loss": 3.552, + "step": 58520 + }, + { + "epoch": 3.9764234270960728, + "grad_norm": 0.7938926219940186, + "learning_rate": 0.0005031424106536214, + "loss": 3.6335, + "step": 58525 + }, + { + "epoch": 3.976763147166735, + "grad_norm": 1.0341399908065796, + "learning_rate": 0.0005030999456447886, + "loss": 3.4101, + "step": 58530 + }, + { + "epoch": 3.9771028672373965, + "grad_norm": 1.2547041177749634, + "learning_rate": 0.0005030574806359561, + "loss": 3.4947, + "step": 58535 + }, + { + "epoch": 3.977442587308058, + "grad_norm": 0.8609169125556946, + "learning_rate": 0.0005030150156271233, + "loss": 3.4869, + "step": 58540 + }, + { + "epoch": 3.97778230737872, + "grad_norm": 1.0788646936416626, + "learning_rate": 0.0005029725506182905, + "loss": 3.4839, + "step": 58545 + }, + { + "epoch": 3.978122027449382, + "grad_norm": 1.0014984607696533, + "learning_rate": 0.0005029300856094578, + "loss": 3.59, + "step": 58550 + }, + { + "epoch": 3.9784617475200434, + "grad_norm": 0.8735423684120178, + "learning_rate": 0.0005028876206006251, + "loss": 3.7404, + "step": 58555 + }, + { + "epoch": 3.9788014675907055, + "grad_norm": 1.9361032247543335, + "learning_rate": 0.0005028451555917923, + "loss": 3.6349, + "step": 58560 + }, + { + "epoch": 3.979141187661367, + "grad_norm": 0.673593282699585, + "learning_rate": 0.0005028026905829596, + "loss": 3.5066, + "step": 58565 + }, + { + "epoch": 3.979480907732029, + "grad_norm": 0.9140896201133728, + "learning_rate": 0.000502760225574127, + "loss": 3.5093, + "step": 58570 + }, + { + "epoch": 3.979820627802691, + "grad_norm": 1.1639484167099, + "learning_rate": 0.0005027177605652942, + "loss": 3.3614, + "step": 58575 + }, + { + "epoch": 3.9801603478733525, + "grad_norm": 0.6280642747879028, + "learning_rate": 0.0005026752955564615, + "loss": 3.5629, + "step": 58580 + }, + { + "epoch": 3.980500067944014, + "grad_norm": 0.7664386034011841, + "learning_rate": 0.0005026328305476288, + "loss": 3.5615, + "step": 58585 + }, + { + "epoch": 3.9808397880146758, + "grad_norm": 0.8103002905845642, + "learning_rate": 0.000502590365538796, + "loss": 3.4778, + "step": 58590 + }, + { + "epoch": 3.981179508085338, + "grad_norm": 2.3616878986358643, + "learning_rate": 0.0005025479005299633, + "loss": 3.5136, + "step": 58595 + }, + { + "epoch": 3.9815192281559995, + "grad_norm": 0.7197387218475342, + "learning_rate": 0.0005025054355211305, + "loss": 3.5678, + "step": 58600 + }, + { + "epoch": 3.981858948226661, + "grad_norm": 0.6427649259567261, + "learning_rate": 0.0005024629705122979, + "loss": 3.5397, + "step": 58605 + }, + { + "epoch": 3.982198668297323, + "grad_norm": 0.9363502264022827, + "learning_rate": 0.0005024205055034652, + "loss": 3.4614, + "step": 58610 + }, + { + "epoch": 3.982538388367985, + "grad_norm": 0.8515766263008118, + "learning_rate": 0.0005023780404946324, + "loss": 3.3974, + "step": 58615 + }, + { + "epoch": 3.9828781084386464, + "grad_norm": 0.7800848484039307, + "learning_rate": 0.0005023355754857997, + "loss": 3.6661, + "step": 58620 + }, + { + "epoch": 3.983217828509308, + "grad_norm": 0.7067897915840149, + "learning_rate": 0.000502293110476967, + "loss": 3.4209, + "step": 58625 + }, + { + "epoch": 3.98355754857997, + "grad_norm": 0.7345088124275208, + "learning_rate": 0.0005022506454681342, + "loss": 3.5258, + "step": 58630 + }, + { + "epoch": 3.9838972686506318, + "grad_norm": 0.7573986649513245, + "learning_rate": 0.0005022081804593014, + "loss": 3.2591, + "step": 58635 + }, + { + "epoch": 3.9842369887212934, + "grad_norm": 0.9010897874832153, + "learning_rate": 0.0005021657154504689, + "loss": 3.6207, + "step": 58640 + }, + { + "epoch": 3.9845767087919555, + "grad_norm": 0.9001638293266296, + "learning_rate": 0.0005021232504416361, + "loss": 3.4867, + "step": 58645 + }, + { + "epoch": 3.984916428862617, + "grad_norm": 0.83902508020401, + "learning_rate": 0.0005020807854328033, + "loss": 3.6561, + "step": 58650 + }, + { + "epoch": 3.9852561489332787, + "grad_norm": 0.753254234790802, + "learning_rate": 0.0005020383204239707, + "loss": 3.5102, + "step": 58655 + }, + { + "epoch": 3.985595869003941, + "grad_norm": 0.7518637180328369, + "learning_rate": 0.0005019958554151379, + "loss": 3.369, + "step": 58660 + }, + { + "epoch": 3.9859355890746024, + "grad_norm": 0.8829060792922974, + "learning_rate": 0.0005019533904063051, + "loss": 3.5404, + "step": 58665 + }, + { + "epoch": 3.986275309145264, + "grad_norm": 0.9723363518714905, + "learning_rate": 0.0005019109253974726, + "loss": 3.3455, + "step": 58670 + }, + { + "epoch": 3.986615029215926, + "grad_norm": 1.1501209735870361, + "learning_rate": 0.0005018684603886398, + "loss": 3.586, + "step": 58675 + }, + { + "epoch": 3.9869547492865878, + "grad_norm": 1.4075474739074707, + "learning_rate": 0.000501825995379807, + "loss": 3.4415, + "step": 58680 + }, + { + "epoch": 3.9872944693572494, + "grad_norm": 1.2175798416137695, + "learning_rate": 0.0005017835303709744, + "loss": 3.2143, + "step": 58685 + }, + { + "epoch": 3.9876341894279115, + "grad_norm": 0.8956888318061829, + "learning_rate": 0.0005017410653621416, + "loss": 3.5445, + "step": 58690 + }, + { + "epoch": 3.987973909498573, + "grad_norm": 0.6477235555648804, + "learning_rate": 0.0005016986003533088, + "loss": 3.6707, + "step": 58695 + }, + { + "epoch": 3.9883136295692347, + "grad_norm": 0.7912454605102539, + "learning_rate": 0.0005016561353444761, + "loss": 3.3227, + "step": 58700 + }, + { + "epoch": 3.988653349639897, + "grad_norm": 1.401807427406311, + "learning_rate": 0.0005016136703356435, + "loss": 3.5643, + "step": 58705 + }, + { + "epoch": 3.9889930697105584, + "grad_norm": 2.533843517303467, + "learning_rate": 0.0005015712053268107, + "loss": 3.3992, + "step": 58710 + }, + { + "epoch": 3.98933278978122, + "grad_norm": 0.7648670077323914, + "learning_rate": 0.000501528740317978, + "loss": 3.6717, + "step": 58715 + }, + { + "epoch": 3.989672509851882, + "grad_norm": 0.8034269213676453, + "learning_rate": 0.0005014862753091453, + "loss": 3.7814, + "step": 58720 + }, + { + "epoch": 3.990012229922544, + "grad_norm": 1.023957371711731, + "learning_rate": 0.0005014438103003126, + "loss": 3.1128, + "step": 58725 + }, + { + "epoch": 3.9903519499932054, + "grad_norm": 0.979388415813446, + "learning_rate": 0.0005014013452914798, + "loss": 3.3117, + "step": 58730 + }, + { + "epoch": 3.9906916700638675, + "grad_norm": 0.9511295557022095, + "learning_rate": 0.000501358880282647, + "loss": 3.3193, + "step": 58735 + }, + { + "epoch": 3.991031390134529, + "grad_norm": 0.7476137280464172, + "learning_rate": 0.0005013164152738145, + "loss": 3.4795, + "step": 58740 + }, + { + "epoch": 3.9913711102051908, + "grad_norm": 0.7594215869903564, + "learning_rate": 0.0005012739502649817, + "loss": 3.3811, + "step": 58745 + }, + { + "epoch": 3.991710830275853, + "grad_norm": 0.805874764919281, + "learning_rate": 0.0005012314852561489, + "loss": 3.3617, + "step": 58750 + }, + { + "epoch": 3.9920505503465145, + "grad_norm": 0.9665697813034058, + "learning_rate": 0.0005011890202473163, + "loss": 3.5666, + "step": 58755 + }, + { + "epoch": 3.992390270417176, + "grad_norm": 0.8872327208518982, + "learning_rate": 0.0005011465552384835, + "loss": 3.3453, + "step": 58760 + }, + { + "epoch": 3.992729990487838, + "grad_norm": 1.4011952877044678, + "learning_rate": 0.0005011040902296507, + "loss": 3.7559, + "step": 58765 + }, + { + "epoch": 3.9930697105585, + "grad_norm": 0.905147910118103, + "learning_rate": 0.0005010616252208181, + "loss": 3.4479, + "step": 58770 + }, + { + "epoch": 3.9934094306291614, + "grad_norm": 1.0457593202590942, + "learning_rate": 0.0005010191602119854, + "loss": 3.2598, + "step": 58775 + }, + { + "epoch": 3.9937491506998235, + "grad_norm": 0.7817652225494385, + "learning_rate": 0.0005009766952031526, + "loss": 3.3054, + "step": 58780 + }, + { + "epoch": 3.994088870770485, + "grad_norm": 0.8439315557479858, + "learning_rate": 0.00050093423019432, + "loss": 3.5328, + "step": 58785 + }, + { + "epoch": 3.9944285908411468, + "grad_norm": 0.9013926386833191, + "learning_rate": 0.0005008917651854872, + "loss": 3.3628, + "step": 58790 + }, + { + "epoch": 3.994768310911809, + "grad_norm": 0.8768414258956909, + "learning_rate": 0.0005008493001766544, + "loss": 3.5714, + "step": 58795 + }, + { + "epoch": 3.9951080309824705, + "grad_norm": 1.0577733516693115, + "learning_rate": 0.0005008068351678217, + "loss": 3.5545, + "step": 58800 + }, + { + "epoch": 3.995447751053132, + "grad_norm": 1.0926955938339233, + "learning_rate": 0.000500764370158989, + "loss": 3.7847, + "step": 58805 + }, + { + "epoch": 3.995787471123794, + "grad_norm": 0.7216775417327881, + "learning_rate": 0.0005007219051501563, + "loss": 3.4806, + "step": 58810 + }, + { + "epoch": 3.996127191194456, + "grad_norm": 0.8964203596115112, + "learning_rate": 0.0005006794401413236, + "loss": 3.3401, + "step": 58815 + }, + { + "epoch": 3.9964669112651174, + "grad_norm": 1.0750247240066528, + "learning_rate": 0.0005006369751324909, + "loss": 3.6753, + "step": 58820 + }, + { + "epoch": 3.9968066313357795, + "grad_norm": 0.9576235413551331, + "learning_rate": 0.0005005945101236581, + "loss": 3.5143, + "step": 58825 + }, + { + "epoch": 3.997146351406441, + "grad_norm": 0.8113265037536621, + "learning_rate": 0.0005005520451148254, + "loss": 3.597, + "step": 58830 + }, + { + "epoch": 3.9974860714771028, + "grad_norm": 0.8113423585891724, + "learning_rate": 0.0005005095801059926, + "loss": 3.2455, + "step": 58835 + }, + { + "epoch": 3.997825791547765, + "grad_norm": 1.024649977684021, + "learning_rate": 0.0005004671150971599, + "loss": 3.4848, + "step": 58840 + }, + { + "epoch": 3.9981655116184265, + "grad_norm": 0.6404519081115723, + "learning_rate": 0.0005004246500883273, + "loss": 3.6332, + "step": 58845 + }, + { + "epoch": 3.998505231689088, + "grad_norm": 0.9171125292778015, + "learning_rate": 0.0005003821850794945, + "loss": 3.4272, + "step": 58850 + }, + { + "epoch": 3.99884495175975, + "grad_norm": 0.7678583860397339, + "learning_rate": 0.0005003397200706618, + "loss": 3.5413, + "step": 58855 + }, + { + "epoch": 3.999184671830412, + "grad_norm": 1.0538419485092163, + "learning_rate": 0.0005002972550618291, + "loss": 3.6838, + "step": 58860 + }, + { + "epoch": 3.9995243919010735, + "grad_norm": 0.9793692827224731, + "learning_rate": 0.0005002547900529963, + "loss": 3.7542, + "step": 58865 + }, + { + "epoch": 3.9998641119717355, + "grad_norm": 1.184192419052124, + "learning_rate": 0.0005002123250441636, + "loss": 3.4473, + "step": 58870 + }, + { + "epoch": 4.0, + "eval_bertscore": { + "f1": 0.8384510946142106, + "precision": 0.8413312704667806, + "recall": 0.8363650355912974 + }, + "eval_bleu_4": 0.016446153797015348, + "eval_exact_match": 0.0005814516910553348, + "eval_loss": 3.4301984310150146, + "eval_meteor": 0.08296746730422343, + "eval_rouge": { + "rouge1": 0.11961414750516826, + "rouge2": 0.018002933265808155, + "rougeL": 0.10406368044565853, + "rougeLsum": 0.10408589745338362 + }, + "eval_runtime": 1408.7711, + "eval_samples_per_second": 7.325, + "eval_steps_per_second": 0.916, + "step": 58872 + }, + { + "epoch": 4.000203832042397, + "grad_norm": 0.8519189953804016, + "learning_rate": 0.0005001698600353309, + "loss": 3.7108, + "step": 58875 + }, + { + "epoch": 4.000543552113059, + "grad_norm": 1.0332545042037964, + "learning_rate": 0.0005001273950264982, + "loss": 3.4484, + "step": 58880 + }, + { + "epoch": 4.000883272183721, + "grad_norm": 0.9267972111701965, + "learning_rate": 0.0005000849300176655, + "loss": 3.3217, + "step": 58885 + }, + { + "epoch": 4.001222992254382, + "grad_norm": 0.7630363702774048, + "learning_rate": 0.0005000424650088328, + "loss": 3.2859, + "step": 58890 + }, + { + "epoch": 4.001562712325044, + "grad_norm": 0.9275040626525879, + "learning_rate": 0.0005, + "loss": 3.4761, + "step": 58895 + }, + { + "epoch": 4.001902432395706, + "grad_norm": 1.8134682178497314, + "learning_rate": 0.0004999575349911673, + "loss": 3.4526, + "step": 58900 + }, + { + "epoch": 4.002242152466367, + "grad_norm": 0.8156493902206421, + "learning_rate": 0.0004999150699823346, + "loss": 3.3478, + "step": 58905 + }, + { + "epoch": 4.0025818725370295, + "grad_norm": 0.5968990921974182, + "learning_rate": 0.0004998726049735018, + "loss": 3.3947, + "step": 58910 + }, + { + "epoch": 4.0029215926076915, + "grad_norm": 1.2406584024429321, + "learning_rate": 0.0004998301399646691, + "loss": 3.3297, + "step": 58915 + }, + { + "epoch": 4.003261312678353, + "grad_norm": 1.0924009084701538, + "learning_rate": 0.0004997876749558364, + "loss": 3.5881, + "step": 58920 + }, + { + "epoch": 4.003601032749015, + "grad_norm": 0.7941852807998657, + "learning_rate": 0.0004997452099470037, + "loss": 3.3139, + "step": 58925 + }, + { + "epoch": 4.003940752819677, + "grad_norm": 1.1119452714920044, + "learning_rate": 0.000499702744938171, + "loss": 3.7319, + "step": 58930 + }, + { + "epoch": 4.004280472890338, + "grad_norm": 1.0147446393966675, + "learning_rate": 0.0004996602799293383, + "loss": 3.3416, + "step": 58935 + }, + { + "epoch": 4.004620192961, + "grad_norm": 0.7423129081726074, + "learning_rate": 0.0004996178149205055, + "loss": 3.5832, + "step": 58940 + }, + { + "epoch": 4.004959913031662, + "grad_norm": 0.9405943155288696, + "learning_rate": 0.0004995753499116728, + "loss": 3.4124, + "step": 58945 + }, + { + "epoch": 4.005299633102323, + "grad_norm": 6.79968786239624, + "learning_rate": 0.00049953288490284, + "loss": 3.417, + "step": 58950 + }, + { + "epoch": 4.0056393531729855, + "grad_norm": 1.051818609237671, + "learning_rate": 0.0004994904198940074, + "loss": 3.3584, + "step": 58955 + }, + { + "epoch": 4.0059790732436475, + "grad_norm": 0.9337111115455627, + "learning_rate": 0.0004994479548851746, + "loss": 3.5856, + "step": 58960 + }, + { + "epoch": 4.006318793314309, + "grad_norm": 0.898451030254364, + "learning_rate": 0.0004994054898763419, + "loss": 3.3294, + "step": 58965 + }, + { + "epoch": 4.006658513384971, + "grad_norm": 0.8091122508049011, + "learning_rate": 0.0004993630248675092, + "loss": 3.337, + "step": 58970 + }, + { + "epoch": 4.006998233455633, + "grad_norm": 0.8572282195091248, + "learning_rate": 0.0004993205598586765, + "loss": 3.6497, + "step": 58975 + }, + { + "epoch": 4.007337953526294, + "grad_norm": 0.853178858757019, + "learning_rate": 0.0004992780948498437, + "loss": 3.5381, + "step": 58980 + }, + { + "epoch": 4.007677673596956, + "grad_norm": 0.9070908427238464, + "learning_rate": 0.000499235629841011, + "loss": 3.59, + "step": 58985 + }, + { + "epoch": 4.008017393667618, + "grad_norm": 0.8678823709487915, + "learning_rate": 0.0004991931648321783, + "loss": 3.4703, + "step": 58990 + }, + { + "epoch": 4.008357113738279, + "grad_norm": 0.8288912773132324, + "learning_rate": 0.0004991506998233456, + "loss": 3.5335, + "step": 58995 + }, + { + "epoch": 4.0086968338089415, + "grad_norm": 0.844184398651123, + "learning_rate": 0.0004991082348145128, + "loss": 3.6686, + "step": 59000 + }, + { + "epoch": 4.009036553879604, + "grad_norm": 0.7278297543525696, + "learning_rate": 0.0004990657698056802, + "loss": 3.3913, + "step": 59005 + }, + { + "epoch": 4.009376273950265, + "grad_norm": 1.5454286336898804, + "learning_rate": 0.0004990233047968474, + "loss": 3.4485, + "step": 59010 + }, + { + "epoch": 4.009715994020927, + "grad_norm": 0.9499795436859131, + "learning_rate": 0.0004989808397880146, + "loss": 3.3259, + "step": 59015 + }, + { + "epoch": 4.010055714091589, + "grad_norm": 0.6594309210777283, + "learning_rate": 0.000498938374779182, + "loss": 3.797, + "step": 59020 + }, + { + "epoch": 4.01039543416225, + "grad_norm": 1.371636986732483, + "learning_rate": 0.0004988959097703493, + "loss": 3.6211, + "step": 59025 + }, + { + "epoch": 4.010735154232912, + "grad_norm": 0.9112067818641663, + "learning_rate": 0.0004988534447615165, + "loss": 3.4356, + "step": 59030 + }, + { + "epoch": 4.011074874303574, + "grad_norm": 0.8416643738746643, + "learning_rate": 0.0004988109797526839, + "loss": 3.4284, + "step": 59035 + }, + { + "epoch": 4.011414594374235, + "grad_norm": 0.9199733734130859, + "learning_rate": 0.0004987685147438511, + "loss": 3.2872, + "step": 59040 + }, + { + "epoch": 4.0117543144448975, + "grad_norm": 0.863436758518219, + "learning_rate": 0.0004987260497350183, + "loss": 3.5717, + "step": 59045 + }, + { + "epoch": 4.01209403451556, + "grad_norm": 1.965509295463562, + "learning_rate": 0.0004986835847261856, + "loss": 3.519, + "step": 59050 + }, + { + "epoch": 4.012433754586221, + "grad_norm": 0.9706948399543762, + "learning_rate": 0.000498641119717353, + "loss": 3.4903, + "step": 59055 + }, + { + "epoch": 4.012773474656883, + "grad_norm": 1.2448976039886475, + "learning_rate": 0.0004985986547085202, + "loss": 3.5462, + "step": 59060 + }, + { + "epoch": 4.013113194727545, + "grad_norm": 0.8056244850158691, + "learning_rate": 0.0004985561896996874, + "loss": 3.6148, + "step": 59065 + }, + { + "epoch": 4.013452914798206, + "grad_norm": 0.9555721282958984, + "learning_rate": 0.0004985137246908548, + "loss": 3.4895, + "step": 59070 + }, + { + "epoch": 4.013792634868868, + "grad_norm": 0.7755557894706726, + "learning_rate": 0.000498471259682022, + "loss": 3.3411, + "step": 59075 + }, + { + "epoch": 4.01413235493953, + "grad_norm": 1.3462457656860352, + "learning_rate": 0.0004984287946731893, + "loss": 3.5468, + "step": 59080 + }, + { + "epoch": 4.014472075010191, + "grad_norm": 0.9855201840400696, + "learning_rate": 0.0004983863296643565, + "loss": 3.4734, + "step": 59085 + }, + { + "epoch": 4.0148117950808535, + "grad_norm": 0.8196870684623718, + "learning_rate": 0.0004983438646555239, + "loss": 3.5459, + "step": 59090 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.7483983039855957, + "learning_rate": 0.0004983013996466911, + "loss": 3.5044, + "step": 59095 + }, + { + "epoch": 4.015491235222177, + "grad_norm": 0.9870972037315369, + "learning_rate": 0.0004982589346378584, + "loss": 3.6129, + "step": 59100 + }, + { + "epoch": 4.015830955292839, + "grad_norm": 1.025268793106079, + "learning_rate": 0.0004982164696290257, + "loss": 3.3757, + "step": 59105 + }, + { + "epoch": 4.016170675363501, + "grad_norm": 2.4655425548553467, + "learning_rate": 0.000498174004620193, + "loss": 3.5137, + "step": 59110 + }, + { + "epoch": 4.016510395434162, + "grad_norm": 1.3579258918762207, + "learning_rate": 0.0004981315396113602, + "loss": 3.2921, + "step": 59115 + }, + { + "epoch": 4.016850115504824, + "grad_norm": 1.1214567422866821, + "learning_rate": 0.0004980890746025275, + "loss": 3.4271, + "step": 59120 + }, + { + "epoch": 4.017189835575485, + "grad_norm": 1.0016041994094849, + "learning_rate": 0.0004980466095936948, + "loss": 3.274, + "step": 59125 + }, + { + "epoch": 4.017529555646147, + "grad_norm": 0.6863726377487183, + "learning_rate": 0.0004980041445848621, + "loss": 3.6385, + "step": 59130 + }, + { + "epoch": 4.0178692757168095, + "grad_norm": 2.182424545288086, + "learning_rate": 0.0004979616795760293, + "loss": 3.656, + "step": 59135 + }, + { + "epoch": 4.018208995787471, + "grad_norm": 1.028386116027832, + "learning_rate": 0.0004979192145671967, + "loss": 3.5911, + "step": 59140 + }, + { + "epoch": 4.018548715858133, + "grad_norm": 0.8198193311691284, + "learning_rate": 0.0004978767495583639, + "loss": 3.4404, + "step": 59145 + }, + { + "epoch": 4.018888435928795, + "grad_norm": 1.2617802619934082, + "learning_rate": 0.0004978342845495311, + "loss": 3.4689, + "step": 59150 + }, + { + "epoch": 4.019228155999456, + "grad_norm": 0.5998005270957947, + "learning_rate": 0.0004977918195406985, + "loss": 3.7243, + "step": 59155 + }, + { + "epoch": 4.019567876070118, + "grad_norm": 0.8704761266708374, + "learning_rate": 0.0004977493545318658, + "loss": 3.2778, + "step": 59160 + }, + { + "epoch": 4.01990759614078, + "grad_norm": 0.8309449553489685, + "learning_rate": 0.000497706889523033, + "loss": 3.5103, + "step": 59165 + }, + { + "epoch": 4.020247316211441, + "grad_norm": 0.702453076839447, + "learning_rate": 0.0004976644245142003, + "loss": 3.4175, + "step": 59170 + }, + { + "epoch": 4.0205870362821035, + "grad_norm": 0.7779124975204468, + "learning_rate": 0.0004976219595053676, + "loss": 3.6647, + "step": 59175 + }, + { + "epoch": 4.0209267563527655, + "grad_norm": 0.84400874376297, + "learning_rate": 0.0004975794944965349, + "loss": 3.3364, + "step": 59180 + }, + { + "epoch": 4.021266476423427, + "grad_norm": 1.1379069089889526, + "learning_rate": 0.0004975370294877021, + "loss": 3.7127, + "step": 59185 + }, + { + "epoch": 4.021606196494089, + "grad_norm": 0.7037822008132935, + "learning_rate": 0.0004974945644788695, + "loss": 3.781, + "step": 59190 + }, + { + "epoch": 4.021945916564751, + "grad_norm": 0.7788320779800415, + "learning_rate": 0.0004974520994700367, + "loss": 3.5553, + "step": 59195 + }, + { + "epoch": 4.022285636635412, + "grad_norm": 0.8578101992607117, + "learning_rate": 0.0004974096344612039, + "loss": 3.4998, + "step": 59200 + }, + { + "epoch": 4.022625356706074, + "grad_norm": 0.8958861827850342, + "learning_rate": 0.0004973671694523713, + "loss": 3.5145, + "step": 59205 + }, + { + "epoch": 4.022965076776736, + "grad_norm": 0.8616841435432434, + "learning_rate": 0.0004973247044435386, + "loss": 3.5822, + "step": 59210 + }, + { + "epoch": 4.023304796847397, + "grad_norm": 0.8769033551216125, + "learning_rate": 0.0004972822394347058, + "loss": 3.629, + "step": 59215 + }, + { + "epoch": 4.0236445169180595, + "grad_norm": 0.755324125289917, + "learning_rate": 0.000497239774425873, + "loss": 3.5096, + "step": 59220 + }, + { + "epoch": 4.0239842369887215, + "grad_norm": 0.9850172400474548, + "learning_rate": 0.0004971973094170404, + "loss": 3.5562, + "step": 59225 + }, + { + "epoch": 4.024323957059383, + "grad_norm": 0.6654769778251648, + "learning_rate": 0.0004971548444082076, + "loss": 3.7205, + "step": 59230 + }, + { + "epoch": 4.024663677130045, + "grad_norm": 0.7300781607627869, + "learning_rate": 0.000497112379399375, + "loss": 3.614, + "step": 59235 + }, + { + "epoch": 4.025003397200707, + "grad_norm": 0.6652653217315674, + "learning_rate": 0.0004970699143905422, + "loss": 3.4797, + "step": 59240 + }, + { + "epoch": 4.025343117271368, + "grad_norm": 0.8888729214668274, + "learning_rate": 0.0004970274493817095, + "loss": 3.6619, + "step": 59245 + }, + { + "epoch": 4.02568283734203, + "grad_norm": 0.8681566715240479, + "learning_rate": 0.0004969849843728767, + "loss": 3.6572, + "step": 59250 + }, + { + "epoch": 4.026022557412692, + "grad_norm": 0.8001010417938232, + "learning_rate": 0.0004969425193640441, + "loss": 3.5115, + "step": 59255 + }, + { + "epoch": 4.026362277483353, + "grad_norm": 0.7880661487579346, + "learning_rate": 0.0004969000543552113, + "loss": 3.4483, + "step": 59260 + }, + { + "epoch": 4.0267019975540155, + "grad_norm": 0.7890077829360962, + "learning_rate": 0.0004968575893463786, + "loss": 3.7525, + "step": 59265 + }, + { + "epoch": 4.0270417176246776, + "grad_norm": 0.9139121770858765, + "learning_rate": 0.0004968151243375459, + "loss": 3.6723, + "step": 59270 + }, + { + "epoch": 4.027381437695339, + "grad_norm": 0.9614365100860596, + "learning_rate": 0.0004967726593287131, + "loss": 3.5769, + "step": 59275 + }, + { + "epoch": 4.027721157766001, + "grad_norm": 0.7309695482254028, + "learning_rate": 0.0004967301943198804, + "loss": 3.3334, + "step": 59280 + }, + { + "epoch": 4.028060877836663, + "grad_norm": 0.8989684581756592, + "learning_rate": 0.0004966877293110477, + "loss": 3.6066, + "step": 59285 + }, + { + "epoch": 4.028400597907324, + "grad_norm": 0.7039458155632019, + "learning_rate": 0.000496645264302215, + "loss": 3.3526, + "step": 59290 + }, + { + "epoch": 4.028740317977986, + "grad_norm": 1.027158498764038, + "learning_rate": 0.0004966027992933823, + "loss": 3.447, + "step": 59295 + }, + { + "epoch": 4.029080038048648, + "grad_norm": 0.6698904633522034, + "learning_rate": 0.0004965603342845495, + "loss": 3.4966, + "step": 59300 + }, + { + "epoch": 4.029419758119309, + "grad_norm": 0.768668532371521, + "learning_rate": 0.0004965178692757168, + "loss": 3.4904, + "step": 59305 + }, + { + "epoch": 4.0297594781899715, + "grad_norm": 0.9057156443595886, + "learning_rate": 0.0004964754042668841, + "loss": 3.3473, + "step": 59310 + }, + { + "epoch": 4.030099198260634, + "grad_norm": 0.8317149877548218, + "learning_rate": 0.0004964329392580514, + "loss": 3.4561, + "step": 59315 + }, + { + "epoch": 4.030438918331295, + "grad_norm": 0.8060755729675293, + "learning_rate": 0.0004963904742492187, + "loss": 3.5494, + "step": 59320 + }, + { + "epoch": 4.030778638401957, + "grad_norm": 0.8407382965087891, + "learning_rate": 0.0004963480092403859, + "loss": 3.5301, + "step": 59325 + }, + { + "epoch": 4.031118358472619, + "grad_norm": 0.6719914078712463, + "learning_rate": 0.0004963055442315532, + "loss": 3.6359, + "step": 59330 + }, + { + "epoch": 4.03145807854328, + "grad_norm": 0.9066123366355896, + "learning_rate": 0.0004962630792227205, + "loss": 3.562, + "step": 59335 + }, + { + "epoch": 4.031797798613942, + "grad_norm": 0.8308018445968628, + "learning_rate": 0.0004962206142138878, + "loss": 3.452, + "step": 59340 + }, + { + "epoch": 4.032137518684604, + "grad_norm": 1.021974802017212, + "learning_rate": 0.000496178149205055, + "loss": 3.4772, + "step": 59345 + }, + { + "epoch": 4.032477238755265, + "grad_norm": 0.8861751556396484, + "learning_rate": 0.0004961356841962223, + "loss": 3.5203, + "step": 59350 + }, + { + "epoch": 4.0328169588259275, + "grad_norm": 0.9537569284439087, + "learning_rate": 0.0004960932191873896, + "loss": 3.5844, + "step": 59355 + }, + { + "epoch": 4.03315667889659, + "grad_norm": 0.9293715357780457, + "learning_rate": 0.0004960507541785569, + "loss": 3.3204, + "step": 59360 + }, + { + "epoch": 4.033496398967251, + "grad_norm": 0.8695241808891296, + "learning_rate": 0.0004960082891697242, + "loss": 3.3839, + "step": 59365 + }, + { + "epoch": 4.033836119037913, + "grad_norm": 0.8139508366584778, + "learning_rate": 0.0004959658241608915, + "loss": 3.4698, + "step": 59370 + }, + { + "epoch": 4.034175839108575, + "grad_norm": 0.7105188369750977, + "learning_rate": 0.0004959233591520587, + "loss": 3.3877, + "step": 59375 + }, + { + "epoch": 4.034515559179236, + "grad_norm": 0.7434634566307068, + "learning_rate": 0.000495880894143226, + "loss": 3.6983, + "step": 59380 + }, + { + "epoch": 4.034855279249898, + "grad_norm": 0.8330010771751404, + "learning_rate": 0.0004958384291343932, + "loss": 3.5086, + "step": 59385 + }, + { + "epoch": 4.03519499932056, + "grad_norm": 0.8883917331695557, + "learning_rate": 0.0004957959641255606, + "loss": 3.4023, + "step": 59390 + }, + { + "epoch": 4.035534719391221, + "grad_norm": 0.9460362195968628, + "learning_rate": 0.0004957534991167278, + "loss": 3.2749, + "step": 59395 + }, + { + "epoch": 4.0358744394618835, + "grad_norm": 1.6230714321136475, + "learning_rate": 0.0004957110341078951, + "loss": 3.2161, + "step": 59400 + }, + { + "epoch": 4.036214159532546, + "grad_norm": 0.7635154128074646, + "learning_rate": 0.0004956685690990624, + "loss": 3.4246, + "step": 59405 + }, + { + "epoch": 4.036553879603207, + "grad_norm": 1.0212361812591553, + "learning_rate": 0.0004956261040902297, + "loss": 3.3496, + "step": 59410 + }, + { + "epoch": 4.036893599673869, + "grad_norm": 0.9008714556694031, + "learning_rate": 0.0004955836390813969, + "loss": 3.3817, + "step": 59415 + }, + { + "epoch": 4.037233319744531, + "grad_norm": 0.8420206904411316, + "learning_rate": 0.0004955411740725643, + "loss": 3.3725, + "step": 59420 + }, + { + "epoch": 4.037573039815192, + "grad_norm": 0.8912153244018555, + "learning_rate": 0.0004954987090637315, + "loss": 3.4668, + "step": 59425 + }, + { + "epoch": 4.037912759885854, + "grad_norm": 1.1100375652313232, + "learning_rate": 0.0004954562440548987, + "loss": 3.5664, + "step": 59430 + }, + { + "epoch": 4.038252479956516, + "grad_norm": 1.0139741897583008, + "learning_rate": 0.000495413779046066, + "loss": 3.4088, + "step": 59435 + }, + { + "epoch": 4.0385922000271774, + "grad_norm": 0.8909417986869812, + "learning_rate": 0.0004953713140372334, + "loss": 3.4118, + "step": 59440 + }, + { + "epoch": 4.0389319200978395, + "grad_norm": 1.1226792335510254, + "learning_rate": 0.0004953288490284006, + "loss": 3.4984, + "step": 59445 + }, + { + "epoch": 4.039271640168501, + "grad_norm": 0.9343265295028687, + "learning_rate": 0.0004952863840195678, + "loss": 3.3995, + "step": 59450 + }, + { + "epoch": 4.039611360239163, + "grad_norm": 0.9252835512161255, + "learning_rate": 0.0004952439190107352, + "loss": 3.7624, + "step": 59455 + }, + { + "epoch": 4.039951080309825, + "grad_norm": 0.8190149068832397, + "learning_rate": 0.0004952014540019024, + "loss": 3.5609, + "step": 59460 + }, + { + "epoch": 4.040290800380486, + "grad_norm": 0.980078935623169, + "learning_rate": 0.0004951589889930697, + "loss": 3.4099, + "step": 59465 + }, + { + "epoch": 4.040630520451148, + "grad_norm": 0.7281792759895325, + "learning_rate": 0.0004951165239842371, + "loss": 3.6113, + "step": 59470 + }, + { + "epoch": 4.04097024052181, + "grad_norm": 0.7763375043869019, + "learning_rate": 0.0004950740589754043, + "loss": 3.5003, + "step": 59475 + }, + { + "epoch": 4.041309960592471, + "grad_norm": 0.892322838306427, + "learning_rate": 0.0004950315939665715, + "loss": 3.5036, + "step": 59480 + }, + { + "epoch": 4.0416496806631335, + "grad_norm": 0.7889970541000366, + "learning_rate": 0.0004949891289577388, + "loss": 3.4563, + "step": 59485 + }, + { + "epoch": 4.0419894007337955, + "grad_norm": 0.9280396103858948, + "learning_rate": 0.0004949466639489061, + "loss": 3.5801, + "step": 59490 + }, + { + "epoch": 4.042329120804457, + "grad_norm": 0.9578284621238708, + "learning_rate": 0.0004949041989400734, + "loss": 3.7168, + "step": 59495 + }, + { + "epoch": 4.042668840875119, + "grad_norm": 0.9008978009223938, + "learning_rate": 0.0004948617339312406, + "loss": 3.5893, + "step": 59500 + }, + { + "epoch": 4.043008560945781, + "grad_norm": 0.645816445350647, + "learning_rate": 0.000494819268922408, + "loss": 3.7349, + "step": 59505 + }, + { + "epoch": 4.043348281016442, + "grad_norm": 0.9051821231842041, + "learning_rate": 0.0004947768039135752, + "loss": 3.4611, + "step": 59510 + }, + { + "epoch": 4.043688001087104, + "grad_norm": 0.9449268579483032, + "learning_rate": 0.0004947343389047425, + "loss": 3.4782, + "step": 59515 + }, + { + "epoch": 4.044027721157766, + "grad_norm": 0.9442574381828308, + "learning_rate": 0.0004946918738959099, + "loss": 3.4057, + "step": 59520 + }, + { + "epoch": 4.044367441228427, + "grad_norm": 0.7636062502861023, + "learning_rate": 0.0004946494088870771, + "loss": 3.3891, + "step": 59525 + }, + { + "epoch": 4.0447071612990895, + "grad_norm": 0.7140181064605713, + "learning_rate": 0.0004946069438782443, + "loss": 3.4489, + "step": 59530 + }, + { + "epoch": 4.0450468813697515, + "grad_norm": 0.9157096743583679, + "learning_rate": 0.0004945644788694116, + "loss": 3.6042, + "step": 59535 + }, + { + "epoch": 4.045386601440413, + "grad_norm": 0.7779486775398254, + "learning_rate": 0.0004945220138605789, + "loss": 3.7228, + "step": 59540 + }, + { + "epoch": 4.045726321511075, + "grad_norm": 0.8475788235664368, + "learning_rate": 0.0004944795488517462, + "loss": 3.375, + "step": 59545 + }, + { + "epoch": 4.046066041581737, + "grad_norm": 0.8338893055915833, + "learning_rate": 0.0004944370838429134, + "loss": 3.4327, + "step": 59550 + }, + { + "epoch": 4.046405761652398, + "grad_norm": 0.8249170184135437, + "learning_rate": 0.0004943946188340808, + "loss": 3.2807, + "step": 59555 + }, + { + "epoch": 4.04674548172306, + "grad_norm": 0.9074604511260986, + "learning_rate": 0.000494352153825248, + "loss": 3.7108, + "step": 59560 + }, + { + "epoch": 4.047085201793722, + "grad_norm": 0.9617912769317627, + "learning_rate": 0.0004943096888164153, + "loss": 3.5189, + "step": 59565 + }, + { + "epoch": 4.047424921864383, + "grad_norm": 1.1055030822753906, + "learning_rate": 0.0004942672238075825, + "loss": 3.3367, + "step": 59570 + }, + { + "epoch": 4.0477646419350455, + "grad_norm": 0.8554553985595703, + "learning_rate": 0.0004942247587987499, + "loss": 3.2035, + "step": 59575 + }, + { + "epoch": 4.048104362005708, + "grad_norm": 0.8208368420600891, + "learning_rate": 0.0004941822937899171, + "loss": 3.5558, + "step": 59580 + }, + { + "epoch": 4.048444082076369, + "grad_norm": 1.613654375076294, + "learning_rate": 0.0004941398287810843, + "loss": 3.4765, + "step": 59585 + }, + { + "epoch": 4.048783802147031, + "grad_norm": 0.8521966934204102, + "learning_rate": 0.0004940973637722517, + "loss": 3.3814, + "step": 59590 + }, + { + "epoch": 4.049123522217693, + "grad_norm": 1.011383295059204, + "learning_rate": 0.000494054898763419, + "loss": 3.3917, + "step": 59595 + }, + { + "epoch": 4.049463242288354, + "grad_norm": 0.887047529220581, + "learning_rate": 0.0004940124337545862, + "loss": 3.639, + "step": 59600 + }, + { + "epoch": 4.049802962359016, + "grad_norm": 0.9182097315788269, + "learning_rate": 0.0004939699687457535, + "loss": 3.5991, + "step": 59605 + }, + { + "epoch": 4.050142682429678, + "grad_norm": 0.8084776401519775, + "learning_rate": 0.0004939275037369208, + "loss": 3.5567, + "step": 59610 + }, + { + "epoch": 4.050482402500339, + "grad_norm": 0.9192066192626953, + "learning_rate": 0.000493885038728088, + "loss": 3.4847, + "step": 59615 + }, + { + "epoch": 4.0508221225710015, + "grad_norm": 0.8889290690422058, + "learning_rate": 0.0004938425737192554, + "loss": 3.5875, + "step": 59620 + }, + { + "epoch": 4.051161842641664, + "grad_norm": 0.8754131197929382, + "learning_rate": 0.0004938001087104227, + "loss": 3.349, + "step": 59625 + }, + { + "epoch": 4.051501562712325, + "grad_norm": 0.8507117629051208, + "learning_rate": 0.0004937576437015899, + "loss": 3.4266, + "step": 59630 + }, + { + "epoch": 4.051841282782987, + "grad_norm": 0.9887165427207947, + "learning_rate": 0.0004937151786927571, + "loss": 3.745, + "step": 59635 + }, + { + "epoch": 4.052181002853649, + "grad_norm": 0.6863018870353699, + "learning_rate": 0.0004936727136839245, + "loss": 3.3551, + "step": 59640 + }, + { + "epoch": 4.05252072292431, + "grad_norm": 0.76800137758255, + "learning_rate": 0.0004936302486750917, + "loss": 3.5026, + "step": 59645 + }, + { + "epoch": 4.052860442994972, + "grad_norm": 0.960908055305481, + "learning_rate": 0.000493587783666259, + "loss": 3.505, + "step": 59650 + }, + { + "epoch": 4.053200163065634, + "grad_norm": 0.8802252411842346, + "learning_rate": 0.0004935453186574263, + "loss": 3.4594, + "step": 59655 + }, + { + "epoch": 4.053539883136295, + "grad_norm": 0.8793491125106812, + "learning_rate": 0.0004935028536485936, + "loss": 3.3814, + "step": 59660 + }, + { + "epoch": 4.0538796032069575, + "grad_norm": 1.100463628768921, + "learning_rate": 0.0004934603886397608, + "loss": 3.5471, + "step": 59665 + }, + { + "epoch": 4.05421932327762, + "grad_norm": 0.8778460621833801, + "learning_rate": 0.0004934179236309282, + "loss": 3.6864, + "step": 59670 + }, + { + "epoch": 4.054559043348281, + "grad_norm": 0.9455233812332153, + "learning_rate": 0.0004933754586220954, + "loss": 3.5101, + "step": 59675 + }, + { + "epoch": 4.054898763418943, + "grad_norm": 0.8119200468063354, + "learning_rate": 0.0004933329936132627, + "loss": 3.6052, + "step": 59680 + }, + { + "epoch": 4.055238483489605, + "grad_norm": 0.8363625407218933, + "learning_rate": 0.0004932905286044299, + "loss": 3.4424, + "step": 59685 + }, + { + "epoch": 4.055578203560266, + "grad_norm": 0.922256588935852, + "learning_rate": 0.0004932480635955973, + "loss": 3.3186, + "step": 59690 + }, + { + "epoch": 4.055917923630928, + "grad_norm": 1.0836877822875977, + "learning_rate": 0.0004932055985867645, + "loss": 3.3798, + "step": 59695 + }, + { + "epoch": 4.05625764370159, + "grad_norm": 0.8566212058067322, + "learning_rate": 0.0004931631335779318, + "loss": 3.388, + "step": 59700 + }, + { + "epoch": 4.056597363772251, + "grad_norm": 1.1481186151504517, + "learning_rate": 0.0004931206685690991, + "loss": 3.4265, + "step": 59705 + }, + { + "epoch": 4.0569370838429135, + "grad_norm": 0.8796323537826538, + "learning_rate": 0.0004930782035602663, + "loss": 3.566, + "step": 59710 + }, + { + "epoch": 4.057276803913576, + "grad_norm": 0.7798465490341187, + "learning_rate": 0.0004930357385514336, + "loss": 3.5965, + "step": 59715 + }, + { + "epoch": 4.057616523984237, + "grad_norm": 0.6780524849891663, + "learning_rate": 0.000492993273542601, + "loss": 3.4996, + "step": 59720 + }, + { + "epoch": 4.057956244054899, + "grad_norm": 0.7374577522277832, + "learning_rate": 0.0004929508085337682, + "loss": 3.5647, + "step": 59725 + }, + { + "epoch": 4.058295964125561, + "grad_norm": 1.1030747890472412, + "learning_rate": 0.0004929083435249355, + "loss": 3.5401, + "step": 59730 + }, + { + "epoch": 4.058635684196222, + "grad_norm": 0.7595429420471191, + "learning_rate": 0.0004928658785161027, + "loss": 3.4143, + "step": 59735 + }, + { + "epoch": 4.058975404266884, + "grad_norm": 1.0204191207885742, + "learning_rate": 0.00049282341350727, + "loss": 3.5529, + "step": 59740 + }, + { + "epoch": 4.059315124337546, + "grad_norm": 0.7698794007301331, + "learning_rate": 0.0004927809484984373, + "loss": 3.573, + "step": 59745 + }, + { + "epoch": 4.0596548444082075, + "grad_norm": 1.1621057987213135, + "learning_rate": 0.0004927384834896046, + "loss": 3.5116, + "step": 59750 + }, + { + "epoch": 4.0599945644788695, + "grad_norm": 0.7962614297866821, + "learning_rate": 0.0004926960184807719, + "loss": 3.5227, + "step": 59755 + }, + { + "epoch": 4.060334284549532, + "grad_norm": 1.0799944400787354, + "learning_rate": 0.0004926535534719391, + "loss": 3.4565, + "step": 59760 + }, + { + "epoch": 4.060674004620193, + "grad_norm": 0.9836661219596863, + "learning_rate": 0.0004926110884631064, + "loss": 3.3829, + "step": 59765 + }, + { + "epoch": 4.061013724690855, + "grad_norm": 0.7432460188865662, + "learning_rate": 0.0004925686234542736, + "loss": 3.4209, + "step": 59770 + }, + { + "epoch": 4.061353444761517, + "grad_norm": 0.8501033186912537, + "learning_rate": 0.000492526158445441, + "loss": 3.5235, + "step": 59775 + }, + { + "epoch": 4.061693164832178, + "grad_norm": 0.8153987526893616, + "learning_rate": 0.0004924836934366083, + "loss": 3.4685, + "step": 59780 + }, + { + "epoch": 4.06203288490284, + "grad_norm": 1.119522213935852, + "learning_rate": 0.0004924412284277755, + "loss": 3.3922, + "step": 59785 + }, + { + "epoch": 4.062372604973502, + "grad_norm": 0.7587021589279175, + "learning_rate": 0.0004923987634189428, + "loss": 3.5482, + "step": 59790 + }, + { + "epoch": 4.0627123250441635, + "grad_norm": 1.0584545135498047, + "learning_rate": 0.0004923562984101101, + "loss": 3.714, + "step": 59795 + }, + { + "epoch": 4.0630520451148255, + "grad_norm": 0.9658828973770142, + "learning_rate": 0.0004923138334012773, + "loss": 3.5181, + "step": 59800 + }, + { + "epoch": 4.063391765185487, + "grad_norm": 0.7202749252319336, + "learning_rate": 0.0004922713683924447, + "loss": 3.5986, + "step": 59805 + }, + { + "epoch": 4.063731485256149, + "grad_norm": 1.536464810371399, + "learning_rate": 0.0004922289033836119, + "loss": 3.4901, + "step": 59810 + }, + { + "epoch": 4.064071205326811, + "grad_norm": 2.1154098510742188, + "learning_rate": 0.0004921864383747792, + "loss": 3.3865, + "step": 59815 + }, + { + "epoch": 4.064410925397472, + "grad_norm": 0.7437331080436707, + "learning_rate": 0.0004921439733659464, + "loss": 3.6038, + "step": 59820 + }, + { + "epoch": 4.064750645468134, + "grad_norm": 1.1127680540084839, + "learning_rate": 0.0004921015083571138, + "loss": 3.4685, + "step": 59825 + }, + { + "epoch": 4.065090365538796, + "grad_norm": 0.8566794395446777, + "learning_rate": 0.000492059043348281, + "loss": 3.2769, + "step": 59830 + }, + { + "epoch": 4.065430085609457, + "grad_norm": 0.9445235729217529, + "learning_rate": 0.0004920165783394483, + "loss": 3.2099, + "step": 59835 + }, + { + "epoch": 4.0657698056801195, + "grad_norm": 0.8048754930496216, + "learning_rate": 0.0004919741133306156, + "loss": 3.7584, + "step": 59840 + }, + { + "epoch": 4.0661095257507816, + "grad_norm": 0.8994931578636169, + "learning_rate": 0.0004919316483217829, + "loss": 3.6641, + "step": 59845 + }, + { + "epoch": 4.066449245821443, + "grad_norm": 0.9145632386207581, + "learning_rate": 0.0004918891833129501, + "loss": 3.4743, + "step": 59850 + }, + { + "epoch": 4.066788965892105, + "grad_norm": 1.145402193069458, + "learning_rate": 0.0004918467183041175, + "loss": 3.1589, + "step": 59855 + }, + { + "epoch": 4.067128685962767, + "grad_norm": 0.8065330982208252, + "learning_rate": 0.0004918042532952847, + "loss": 3.405, + "step": 59860 + }, + { + "epoch": 4.067468406033428, + "grad_norm": 0.8032689690589905, + "learning_rate": 0.0004917617882864519, + "loss": 3.4957, + "step": 59865 + }, + { + "epoch": 4.06780812610409, + "grad_norm": 0.8046206831932068, + "learning_rate": 0.0004917193232776192, + "loss": 3.579, + "step": 59870 + }, + { + "epoch": 4.068147846174752, + "grad_norm": 0.8540023565292358, + "learning_rate": 0.0004916768582687866, + "loss": 3.6003, + "step": 59875 + }, + { + "epoch": 4.068487566245413, + "grad_norm": 0.7919192314147949, + "learning_rate": 0.0004916343932599538, + "loss": 3.6506, + "step": 59880 + }, + { + "epoch": 4.0688272863160755, + "grad_norm": 0.9280046820640564, + "learning_rate": 0.0004915919282511211, + "loss": 3.513, + "step": 59885 + }, + { + "epoch": 4.069167006386738, + "grad_norm": 1.1192290782928467, + "learning_rate": 0.0004915494632422884, + "loss": 3.4629, + "step": 59890 + }, + { + "epoch": 4.069506726457399, + "grad_norm": 1.0363168716430664, + "learning_rate": 0.0004915069982334556, + "loss": 3.4601, + "step": 59895 + }, + { + "epoch": 4.069846446528061, + "grad_norm": 0.8286400437355042, + "learning_rate": 0.0004914645332246229, + "loss": 3.6364, + "step": 59900 + }, + { + "epoch": 4.070186166598723, + "grad_norm": 0.848187267780304, + "learning_rate": 0.0004914220682157903, + "loss": 3.7743, + "step": 59905 + }, + { + "epoch": 4.070525886669384, + "grad_norm": 0.9170063138008118, + "learning_rate": 0.0004913796032069575, + "loss": 3.342, + "step": 59910 + }, + { + "epoch": 4.070865606740046, + "grad_norm": 1.0368363857269287, + "learning_rate": 0.0004913371381981247, + "loss": 3.5784, + "step": 59915 + }, + { + "epoch": 4.071205326810708, + "grad_norm": 1.2141284942626953, + "learning_rate": 0.000491294673189292, + "loss": 3.5915, + "step": 59920 + }, + { + "epoch": 4.071545046881369, + "grad_norm": 2.694369077682495, + "learning_rate": 0.0004912522081804593, + "loss": 3.5464, + "step": 59925 + }, + { + "epoch": 4.0718847669520315, + "grad_norm": 0.8907386660575867, + "learning_rate": 0.0004912097431716266, + "loss": 3.6438, + "step": 59930 + }, + { + "epoch": 4.072224487022694, + "grad_norm": 0.7541745901107788, + "learning_rate": 0.0004911672781627938, + "loss": 3.4109, + "step": 59935 + }, + { + "epoch": 4.072564207093355, + "grad_norm": 0.9400231838226318, + "learning_rate": 0.0004911248131539612, + "loss": 3.7112, + "step": 59940 + }, + { + "epoch": 4.072903927164017, + "grad_norm": 1.6789300441741943, + "learning_rate": 0.0004910823481451284, + "loss": 3.7889, + "step": 59945 + }, + { + "epoch": 4.073243647234679, + "grad_norm": 1.8144301176071167, + "learning_rate": 0.0004910398831362957, + "loss": 3.53, + "step": 59950 + }, + { + "epoch": 4.07358336730534, + "grad_norm": 1.1084864139556885, + "learning_rate": 0.000490997418127463, + "loss": 3.4527, + "step": 59955 + }, + { + "epoch": 4.073923087376002, + "grad_norm": 1.3801870346069336, + "learning_rate": 0.0004909549531186303, + "loss": 3.388, + "step": 59960 + }, + { + "epoch": 4.074262807446664, + "grad_norm": 1.5060023069381714, + "learning_rate": 0.0004909124881097975, + "loss": 3.2424, + "step": 59965 + }, + { + "epoch": 4.074602527517325, + "grad_norm": 0.8574672937393188, + "learning_rate": 0.0004908700231009647, + "loss": 3.4394, + "step": 59970 + }, + { + "epoch": 4.0749422475879875, + "grad_norm": 0.9153268933296204, + "learning_rate": 0.0004908275580921321, + "loss": 3.5801, + "step": 59975 + }, + { + "epoch": 4.07528196765865, + "grad_norm": 0.7475935816764832, + "learning_rate": 0.0004907850930832994, + "loss": 3.5549, + "step": 59980 + }, + { + "epoch": 4.075621687729311, + "grad_norm": 0.8156780004501343, + "learning_rate": 0.0004907426280744666, + "loss": 3.3658, + "step": 59985 + }, + { + "epoch": 4.075961407799973, + "grad_norm": 0.8531131744384766, + "learning_rate": 0.000490700163065634, + "loss": 3.4129, + "step": 59990 + }, + { + "epoch": 4.076301127870635, + "grad_norm": 1.0836093425750732, + "learning_rate": 0.0004906576980568012, + "loss": 3.2594, + "step": 59995 + }, + { + "epoch": 4.076640847941296, + "grad_norm": 1.1020183563232422, + "learning_rate": 0.0004906152330479684, + "loss": 3.5252, + "step": 60000 + }, + { + "epoch": 4.076980568011958, + "grad_norm": 0.7993221879005432, + "learning_rate": 0.0004905727680391358, + "loss": 3.4434, + "step": 60005 + }, + { + "epoch": 4.07732028808262, + "grad_norm": 2.3565268516540527, + "learning_rate": 0.0004905303030303031, + "loss": 3.4065, + "step": 60010 + }, + { + "epoch": 4.0776600081532814, + "grad_norm": 1.5973923206329346, + "learning_rate": 0.0004904878380214703, + "loss": 3.3454, + "step": 60015 + }, + { + "epoch": 4.0779997282239435, + "grad_norm": 1.1485506296157837, + "learning_rate": 0.0004904453730126375, + "loss": 3.4322, + "step": 60020 + }, + { + "epoch": 4.078339448294606, + "grad_norm": 0.81014084815979, + "learning_rate": 0.0004904029080038049, + "loss": 3.3965, + "step": 60025 + }, + { + "epoch": 4.078679168365267, + "grad_norm": 0.7585779428482056, + "learning_rate": 0.0004903604429949722, + "loss": 3.4773, + "step": 60030 + }, + { + "epoch": 4.079018888435929, + "grad_norm": 0.8914204835891724, + "learning_rate": 0.0004903179779861394, + "loss": 3.4269, + "step": 60035 + }, + { + "epoch": 4.079358608506591, + "grad_norm": 0.8952040076255798, + "learning_rate": 0.0004902755129773068, + "loss": 3.517, + "step": 60040 + }, + { + "epoch": 4.079698328577252, + "grad_norm": 0.8605546951293945, + "learning_rate": 0.000490233047968474, + "loss": 3.4476, + "step": 60045 + }, + { + "epoch": 4.080038048647914, + "grad_norm": 0.7974916100502014, + "learning_rate": 0.0004901905829596412, + "loss": 3.4433, + "step": 60050 + }, + { + "epoch": 4.080377768718576, + "grad_norm": 0.8027783036231995, + "learning_rate": 0.0004901481179508086, + "loss": 3.5257, + "step": 60055 + }, + { + "epoch": 4.0807174887892375, + "grad_norm": 0.738534688949585, + "learning_rate": 0.0004901056529419759, + "loss": 3.5231, + "step": 60060 + }, + { + "epoch": 4.0810572088598995, + "grad_norm": 1.549047827720642, + "learning_rate": 0.0004900631879331431, + "loss": 3.3868, + "step": 60065 + }, + { + "epoch": 4.081396928930562, + "grad_norm": 1.1312435865402222, + "learning_rate": 0.0004900207229243103, + "loss": 3.3931, + "step": 60070 + }, + { + "epoch": 4.081736649001223, + "grad_norm": 1.1390950679779053, + "learning_rate": 0.0004899782579154777, + "loss": 3.4166, + "step": 60075 + }, + { + "epoch": 4.082076369071885, + "grad_norm": 1.0959135293960571, + "learning_rate": 0.0004899357929066449, + "loss": 3.4998, + "step": 60080 + }, + { + "epoch": 4.082416089142547, + "grad_norm": 0.8348273038864136, + "learning_rate": 0.0004898933278978122, + "loss": 3.5855, + "step": 60085 + }, + { + "epoch": 4.082755809213208, + "grad_norm": 0.8332827687263489, + "learning_rate": 0.0004898508628889795, + "loss": 3.6504, + "step": 60090 + }, + { + "epoch": 4.08309552928387, + "grad_norm": 0.963668167591095, + "learning_rate": 0.0004898083978801468, + "loss": 3.9165, + "step": 60095 + }, + { + "epoch": 4.083435249354532, + "grad_norm": 0.8363831639289856, + "learning_rate": 0.000489765932871314, + "loss": 3.3594, + "step": 60100 + }, + { + "epoch": 4.0837749694251935, + "grad_norm": 0.8213236927986145, + "learning_rate": 0.0004897234678624814, + "loss": 3.2578, + "step": 60105 + }, + { + "epoch": 4.0841146894958555, + "grad_norm": 0.8293001055717468, + "learning_rate": 0.0004896810028536486, + "loss": 3.5714, + "step": 60110 + }, + { + "epoch": 4.084454409566518, + "grad_norm": 0.8506905436515808, + "learning_rate": 0.0004896385378448159, + "loss": 3.6627, + "step": 60115 + }, + { + "epoch": 4.084794129637179, + "grad_norm": 0.7441390156745911, + "learning_rate": 0.0004895960728359831, + "loss": 3.495, + "step": 60120 + }, + { + "epoch": 4.085133849707841, + "grad_norm": 0.9529460072517395, + "learning_rate": 0.0004895536078271504, + "loss": 3.6344, + "step": 60125 + }, + { + "epoch": 4.085473569778502, + "grad_norm": 1.0155911445617676, + "learning_rate": 0.0004895111428183177, + "loss": 3.4293, + "step": 60130 + }, + { + "epoch": 4.085813289849164, + "grad_norm": 1.201806664466858, + "learning_rate": 0.000489468677809485, + "loss": 3.799, + "step": 60135 + }, + { + "epoch": 4.086153009919826, + "grad_norm": 0.9333803057670593, + "learning_rate": 0.0004894262128006523, + "loss": 3.3225, + "step": 60140 + }, + { + "epoch": 4.086492729990487, + "grad_norm": 0.8674829006195068, + "learning_rate": 0.0004893837477918196, + "loss": 3.5339, + "step": 60145 + }, + { + "epoch": 4.0868324500611495, + "grad_norm": 0.7678355574607849, + "learning_rate": 0.0004893412827829868, + "loss": 3.5766, + "step": 60150 + }, + { + "epoch": 4.0871721701318116, + "grad_norm": 0.697464108467102, + "learning_rate": 0.000489298817774154, + "loss": 3.7483, + "step": 60155 + }, + { + "epoch": 4.087511890202473, + "grad_norm": 1.0166263580322266, + "learning_rate": 0.0004892563527653214, + "loss": 3.5099, + "step": 60160 + }, + { + "epoch": 4.087851610273135, + "grad_norm": 1.6570059061050415, + "learning_rate": 0.0004892138877564887, + "loss": 3.5939, + "step": 60165 + }, + { + "epoch": 4.088191330343797, + "grad_norm": 1.0402477979660034, + "learning_rate": 0.0004891714227476559, + "loss": 3.3406, + "step": 60170 + }, + { + "epoch": 4.088531050414458, + "grad_norm": 0.9011391401290894, + "learning_rate": 0.0004891289577388232, + "loss": 3.6148, + "step": 60175 + }, + { + "epoch": 4.08887077048512, + "grad_norm": 0.8842949867248535, + "learning_rate": 0.0004890864927299905, + "loss": 3.6897, + "step": 60180 + }, + { + "epoch": 4.089210490555782, + "grad_norm": 1.9162085056304932, + "learning_rate": 0.0004890440277211578, + "loss": 3.2303, + "step": 60185 + }, + { + "epoch": 4.089550210626443, + "grad_norm": 1.2645186185836792, + "learning_rate": 0.0004890015627123251, + "loss": 3.483, + "step": 60190 + }, + { + "epoch": 4.0898899306971055, + "grad_norm": 1.2870635986328125, + "learning_rate": 0.0004889590977034923, + "loss": 3.7873, + "step": 60195 + }, + { + "epoch": 4.090229650767768, + "grad_norm": 0.7688187956809998, + "learning_rate": 0.0004889166326946596, + "loss": 3.4108, + "step": 60200 + }, + { + "epoch": 4.090569370838429, + "grad_norm": 1.597001314163208, + "learning_rate": 0.0004888741676858268, + "loss": 3.6822, + "step": 60205 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.90828537940979, + "learning_rate": 0.0004888317026769942, + "loss": 3.3655, + "step": 60210 + }, + { + "epoch": 4.091248810979753, + "grad_norm": 0.8833420872688293, + "learning_rate": 0.0004887892376681615, + "loss": 3.6231, + "step": 60215 + }, + { + "epoch": 4.091588531050414, + "grad_norm": 0.8855195045471191, + "learning_rate": 0.0004887467726593287, + "loss": 3.4207, + "step": 60220 + }, + { + "epoch": 4.091928251121076, + "grad_norm": 3.833012104034424, + "learning_rate": 0.000488704307650496, + "loss": 3.5978, + "step": 60225 + }, + { + "epoch": 4.092267971191738, + "grad_norm": 0.7818519473075867, + "learning_rate": 0.0004886618426416633, + "loss": 3.6125, + "step": 60230 + }, + { + "epoch": 4.092607691262399, + "grad_norm": 0.8146418929100037, + "learning_rate": 0.0004886193776328305, + "loss": 3.31, + "step": 60235 + }, + { + "epoch": 4.0929474113330615, + "grad_norm": 1.1933406591415405, + "learning_rate": 0.0004885769126239979, + "loss": 3.2668, + "step": 60240 + }, + { + "epoch": 4.093287131403724, + "grad_norm": 0.956004798412323, + "learning_rate": 0.0004885344476151651, + "loss": 3.3902, + "step": 60245 + }, + { + "epoch": 4.093626851474385, + "grad_norm": 0.978080153465271, + "learning_rate": 0.0004884919826063324, + "loss": 3.2355, + "step": 60250 + }, + { + "epoch": 4.093966571545047, + "grad_norm": 1.0229716300964355, + "learning_rate": 0.0004884495175974996, + "loss": 3.4753, + "step": 60255 + }, + { + "epoch": 4.094306291615709, + "grad_norm": 1.0869359970092773, + "learning_rate": 0.000488407052588667, + "loss": 3.5881, + "step": 60260 + }, + { + "epoch": 4.09464601168637, + "grad_norm": 0.806182861328125, + "learning_rate": 0.0004883645875798342, + "loss": 3.6205, + "step": 60265 + }, + { + "epoch": 4.094985731757032, + "grad_norm": 1.062564730644226, + "learning_rate": 0.0004883221225710015, + "loss": 3.4123, + "step": 60270 + }, + { + "epoch": 4.095325451827694, + "grad_norm": 0.7810941934585571, + "learning_rate": 0.0004882796575621688, + "loss": 3.4209, + "step": 60275 + }, + { + "epoch": 4.095665171898355, + "grad_norm": 0.8729587197303772, + "learning_rate": 0.00048823719255333605, + "loss": 3.5089, + "step": 60280 + }, + { + "epoch": 4.0960048919690175, + "grad_norm": 1.357271432876587, + "learning_rate": 0.00048819472754450333, + "loss": 3.3196, + "step": 60285 + }, + { + "epoch": 4.09634461203968, + "grad_norm": 1.0196400880813599, + "learning_rate": 0.0004881522625356706, + "loss": 3.4414, + "step": 60290 + }, + { + "epoch": 4.096684332110341, + "grad_norm": 0.8984969258308411, + "learning_rate": 0.0004881097975268379, + "loss": 3.4021, + "step": 60295 + }, + { + "epoch": 4.097024052181003, + "grad_norm": 1.113566517829895, + "learning_rate": 0.00048806733251800517, + "loss": 3.5924, + "step": 60300 + }, + { + "epoch": 4.097363772251665, + "grad_norm": 0.823829710483551, + "learning_rate": 0.00048802486750917245, + "loss": 3.6056, + "step": 60305 + }, + { + "epoch": 4.097703492322326, + "grad_norm": 0.6837289333343506, + "learning_rate": 0.00048798240250033973, + "loss": 3.2943, + "step": 60310 + }, + { + "epoch": 4.098043212392988, + "grad_norm": 0.9056379795074463, + "learning_rate": 0.000487939937491507, + "loss": 3.5952, + "step": 60315 + }, + { + "epoch": 4.09838293246365, + "grad_norm": 0.852904200553894, + "learning_rate": 0.0004878974724826743, + "loss": 3.3632, + "step": 60320 + }, + { + "epoch": 4.0987226525343115, + "grad_norm": 1.0834609270095825, + "learning_rate": 0.0004878550074738415, + "loss": 3.5663, + "step": 60325 + }, + { + "epoch": 4.0990623726049735, + "grad_norm": 1.0215169191360474, + "learning_rate": 0.00048781254246500885, + "loss": 3.5261, + "step": 60330 + }, + { + "epoch": 4.099402092675636, + "grad_norm": 1.0578057765960693, + "learning_rate": 0.00048777007745617613, + "loss": 3.6291, + "step": 60335 + }, + { + "epoch": 4.099741812746297, + "grad_norm": 1.8172330856323242, + "learning_rate": 0.00048772761244734336, + "loss": 3.5023, + "step": 60340 + }, + { + "epoch": 4.100081532816959, + "grad_norm": 0.8477697968482971, + "learning_rate": 0.0004876851474385107, + "loss": 3.5428, + "step": 60345 + }, + { + "epoch": 4.100421252887621, + "grad_norm": 0.7450252175331116, + "learning_rate": 0.00048764268242967797, + "loss": 3.6137, + "step": 60350 + }, + { + "epoch": 4.100760972958282, + "grad_norm": 0.6945070624351501, + "learning_rate": 0.00048760021742084525, + "loss": 3.5552, + "step": 60355 + }, + { + "epoch": 4.101100693028944, + "grad_norm": 0.8678349852561951, + "learning_rate": 0.0004875577524120125, + "loss": 3.5542, + "step": 60360 + }, + { + "epoch": 4.101440413099606, + "grad_norm": 0.9669976830482483, + "learning_rate": 0.0004875152874031798, + "loss": 3.4997, + "step": 60365 + }, + { + "epoch": 4.1017801331702675, + "grad_norm": 0.9436227083206177, + "learning_rate": 0.0004874728223943471, + "loss": 3.4582, + "step": 60370 + }, + { + "epoch": 4.1021198532409295, + "grad_norm": 1.1040784120559692, + "learning_rate": 0.0004874303573855143, + "loss": 3.4544, + "step": 60375 + }, + { + "epoch": 4.102459573311592, + "grad_norm": 0.7681019306182861, + "learning_rate": 0.00048738789237668165, + "loss": 3.628, + "step": 60380 + }, + { + "epoch": 4.102799293382253, + "grad_norm": 0.8733704686164856, + "learning_rate": 0.00048734542736784893, + "loss": 3.4674, + "step": 60385 + }, + { + "epoch": 4.103139013452915, + "grad_norm": 0.9012787342071533, + "learning_rate": 0.00048730296235901616, + "loss": 3.3908, + "step": 60390 + }, + { + "epoch": 4.103478733523577, + "grad_norm": 0.8285872340202332, + "learning_rate": 0.00048726049735018344, + "loss": 3.3491, + "step": 60395 + }, + { + "epoch": 4.103818453594238, + "grad_norm": 0.8290849328041077, + "learning_rate": 0.00048721803234135077, + "loss": 3.3169, + "step": 60400 + }, + { + "epoch": 4.1041581736649, + "grad_norm": 13.275155067443848, + "learning_rate": 0.000487175567332518, + "loss": 3.4835, + "step": 60405 + }, + { + "epoch": 4.104497893735562, + "grad_norm": 0.8565815091133118, + "learning_rate": 0.0004871331023236853, + "loss": 3.5032, + "step": 60410 + }, + { + "epoch": 4.1048376138062235, + "grad_norm": 0.864708423614502, + "learning_rate": 0.0004870906373148526, + "loss": 3.4405, + "step": 60415 + }, + { + "epoch": 4.1051773338768855, + "grad_norm": 1.130233883857727, + "learning_rate": 0.00048704817230601984, + "loss": 3.5547, + "step": 60420 + }, + { + "epoch": 4.105517053947548, + "grad_norm": 0.9972354173660278, + "learning_rate": 0.0004870057072971871, + "loss": 3.5442, + "step": 60425 + }, + { + "epoch": 4.105856774018209, + "grad_norm": 0.8381731510162354, + "learning_rate": 0.0004869632422883544, + "loss": 3.3848, + "step": 60430 + }, + { + "epoch": 4.106196494088871, + "grad_norm": 0.8210023641586304, + "learning_rate": 0.0004869207772795217, + "loss": 3.2081, + "step": 60435 + }, + { + "epoch": 4.106536214159533, + "grad_norm": 0.9711310863494873, + "learning_rate": 0.00048687831227068896, + "loss": 3.577, + "step": 60440 + }, + { + "epoch": 4.106875934230194, + "grad_norm": 0.8383638858795166, + "learning_rate": 0.00048683584726185624, + "loss": 3.5146, + "step": 60445 + }, + { + "epoch": 4.107215654300856, + "grad_norm": 0.8778899908065796, + "learning_rate": 0.0004867933822530235, + "loss": 3.6338, + "step": 60450 + }, + { + "epoch": 4.107555374371518, + "grad_norm": 0.9330540299415588, + "learning_rate": 0.0004867509172441908, + "loss": 3.4698, + "step": 60455 + }, + { + "epoch": 4.1078950944421795, + "grad_norm": 0.8989884853363037, + "learning_rate": 0.0004867084522353581, + "loss": 3.3999, + "step": 60460 + }, + { + "epoch": 4.108234814512842, + "grad_norm": 0.9409488439559937, + "learning_rate": 0.0004866659872265253, + "loss": 3.3922, + "step": 60465 + }, + { + "epoch": 4.108574534583504, + "grad_norm": 0.9616606831550598, + "learning_rate": 0.00048662352221769264, + "loss": 3.4441, + "step": 60470 + }, + { + "epoch": 4.108914254654165, + "grad_norm": 1.0149863958358765, + "learning_rate": 0.0004865810572088599, + "loss": 3.6244, + "step": 60475 + }, + { + "epoch": 4.109253974724827, + "grad_norm": 0.7913025617599487, + "learning_rate": 0.00048653859220002714, + "loss": 3.5598, + "step": 60480 + }, + { + "epoch": 4.109593694795488, + "grad_norm": 0.8005882501602173, + "learning_rate": 0.0004864961271911945, + "loss": 3.3969, + "step": 60485 + }, + { + "epoch": 4.10993341486615, + "grad_norm": 1.1756598949432373, + "learning_rate": 0.00048645366218236176, + "loss": 3.7174, + "step": 60490 + }, + { + "epoch": 4.110273134936812, + "grad_norm": 1.0768214464187622, + "learning_rate": 0.000486411197173529, + "loss": 3.4558, + "step": 60495 + }, + { + "epoch": 4.110612855007473, + "grad_norm": 0.8948865532875061, + "learning_rate": 0.0004863687321646963, + "loss": 3.1587, + "step": 60500 + }, + { + "epoch": 4.1109525750781355, + "grad_norm": 0.8397327065467834, + "learning_rate": 0.0004863262671558636, + "loss": 3.441, + "step": 60505 + }, + { + "epoch": 4.111292295148798, + "grad_norm": 0.9375227093696594, + "learning_rate": 0.0004862838021470308, + "loss": 3.5371, + "step": 60510 + }, + { + "epoch": 4.111632015219459, + "grad_norm": 0.7462906241416931, + "learning_rate": 0.0004862413371381981, + "loss": 3.4946, + "step": 60515 + }, + { + "epoch": 4.111971735290121, + "grad_norm": 1.0387154817581177, + "learning_rate": 0.00048619887212936544, + "loss": 3.4367, + "step": 60520 + }, + { + "epoch": 4.112311455360783, + "grad_norm": 1.1908185482025146, + "learning_rate": 0.0004861564071205327, + "loss": 3.4562, + "step": 60525 + }, + { + "epoch": 4.112651175431444, + "grad_norm": 0.9729493260383606, + "learning_rate": 0.00048611394211169995, + "loss": 3.3332, + "step": 60530 + }, + { + "epoch": 4.112990895502106, + "grad_norm": 1.0518019199371338, + "learning_rate": 0.0004860714771028673, + "loss": 3.1895, + "step": 60535 + }, + { + "epoch": 4.113330615572768, + "grad_norm": 1.6905988454818726, + "learning_rate": 0.00048602901209403456, + "loss": 3.3257, + "step": 60540 + }, + { + "epoch": 4.113670335643429, + "grad_norm": 0.8248864412307739, + "learning_rate": 0.0004859865470852018, + "loss": 3.5606, + "step": 60545 + }, + { + "epoch": 4.1140100557140915, + "grad_norm": 0.8361690640449524, + "learning_rate": 0.00048594408207636907, + "loss": 3.2817, + "step": 60550 + }, + { + "epoch": 4.114349775784754, + "grad_norm": 0.8379681706428528, + "learning_rate": 0.0004859016170675364, + "loss": 3.3566, + "step": 60555 + }, + { + "epoch": 4.114689495855415, + "grad_norm": 1.555497646331787, + "learning_rate": 0.0004858591520587036, + "loss": 3.4062, + "step": 60560 + }, + { + "epoch": 4.115029215926077, + "grad_norm": 1.0184646844863892, + "learning_rate": 0.0004858166870498709, + "loss": 3.503, + "step": 60565 + }, + { + "epoch": 4.115368935996739, + "grad_norm": 0.9793114066123962, + "learning_rate": 0.00048577422204103824, + "loss": 3.4208, + "step": 60570 + }, + { + "epoch": 4.1157086560674, + "grad_norm": 1.0974189043045044, + "learning_rate": 0.00048573175703220547, + "loss": 3.201, + "step": 60575 + }, + { + "epoch": 4.116048376138062, + "grad_norm": 1.1086117029190063, + "learning_rate": 0.00048568929202337275, + "loss": 3.7089, + "step": 60580 + }, + { + "epoch": 4.116388096208724, + "grad_norm": 0.7402608394622803, + "learning_rate": 0.00048564682701454, + "loss": 3.2999, + "step": 60585 + }, + { + "epoch": 4.116727816279385, + "grad_norm": 1.0118488073349, + "learning_rate": 0.0004856043620057073, + "loss": 3.4182, + "step": 60590 + }, + { + "epoch": 4.1170675363500475, + "grad_norm": 0.9384225606918335, + "learning_rate": 0.0004855618969968746, + "loss": 3.5966, + "step": 60595 + }, + { + "epoch": 4.11740725642071, + "grad_norm": 0.8584282398223877, + "learning_rate": 0.00048551943198804187, + "loss": 3.7745, + "step": 60600 + }, + { + "epoch": 4.117746976491371, + "grad_norm": 0.9300656318664551, + "learning_rate": 0.00048547696697920915, + "loss": 3.481, + "step": 60605 + }, + { + "epoch": 4.118086696562033, + "grad_norm": 1.0953280925750732, + "learning_rate": 0.0004854345019703764, + "loss": 3.2949, + "step": 60610 + }, + { + "epoch": 4.118426416632695, + "grad_norm": 0.9929054379463196, + "learning_rate": 0.0004853920369615437, + "loss": 3.3829, + "step": 60615 + }, + { + "epoch": 4.118766136703356, + "grad_norm": 0.8112167119979858, + "learning_rate": 0.00048534957195271093, + "loss": 3.487, + "step": 60620 + }, + { + "epoch": 4.119105856774018, + "grad_norm": 0.8590622544288635, + "learning_rate": 0.00048530710694387827, + "loss": 3.5652, + "step": 60625 + }, + { + "epoch": 4.11944557684468, + "grad_norm": 0.7285771369934082, + "learning_rate": 0.00048526464193504555, + "loss": 3.2105, + "step": 60630 + }, + { + "epoch": 4.1197852969153415, + "grad_norm": 0.8886924982070923, + "learning_rate": 0.00048522217692621277, + "loss": 3.7493, + "step": 60635 + }, + { + "epoch": 4.1201250169860035, + "grad_norm": 0.8288488388061523, + "learning_rate": 0.0004851797119173801, + "loss": 3.3453, + "step": 60640 + }, + { + "epoch": 4.120464737056666, + "grad_norm": 0.9049253463745117, + "learning_rate": 0.0004851372469085474, + "loss": 3.5065, + "step": 60645 + }, + { + "epoch": 4.120804457127327, + "grad_norm": 0.9969305396080017, + "learning_rate": 0.0004850947818997146, + "loss": 3.6363, + "step": 60650 + }, + { + "epoch": 4.121144177197989, + "grad_norm": 0.924252450466156, + "learning_rate": 0.0004850523168908819, + "loss": 3.5646, + "step": 60655 + }, + { + "epoch": 4.121483897268651, + "grad_norm": 0.7325116395950317, + "learning_rate": 0.00048500985188204923, + "loss": 3.2755, + "step": 60660 + }, + { + "epoch": 4.121823617339312, + "grad_norm": 0.8659356236457825, + "learning_rate": 0.00048496738687321645, + "loss": 3.4207, + "step": 60665 + }, + { + "epoch": 4.122163337409974, + "grad_norm": 1.0989394187927246, + "learning_rate": 0.00048492492186438373, + "loss": 3.4345, + "step": 60670 + }, + { + "epoch": 4.122503057480636, + "grad_norm": 1.2669823169708252, + "learning_rate": 0.00048488245685555107, + "loss": 3.3313, + "step": 60675 + }, + { + "epoch": 4.1228427775512975, + "grad_norm": 1.3982501029968262, + "learning_rate": 0.0004848399918467183, + "loss": 3.3062, + "step": 60680 + }, + { + "epoch": 4.1231824976219595, + "grad_norm": 3.5873093605041504, + "learning_rate": 0.0004847975268378856, + "loss": 3.4131, + "step": 60685 + }, + { + "epoch": 4.123522217692622, + "grad_norm": 0.7587495446205139, + "learning_rate": 0.00048475506182905285, + "loss": 3.8885, + "step": 60690 + }, + { + "epoch": 4.123861937763283, + "grad_norm": 1.0931551456451416, + "learning_rate": 0.0004847125968202202, + "loss": 3.2171, + "step": 60695 + }, + { + "epoch": 4.124201657833945, + "grad_norm": 1.0962896347045898, + "learning_rate": 0.0004846701318113874, + "loss": 3.1821, + "step": 60700 + }, + { + "epoch": 4.124541377904607, + "grad_norm": 1.0926806926727295, + "learning_rate": 0.0004846276668025547, + "loss": 3.4337, + "step": 60705 + }, + { + "epoch": 4.124881097975268, + "grad_norm": 2.454643487930298, + "learning_rate": 0.00048458520179372203, + "loss": 3.4066, + "step": 60710 + }, + { + "epoch": 4.12522081804593, + "grad_norm": 0.8300812840461731, + "learning_rate": 0.00048454273678488925, + "loss": 3.4493, + "step": 60715 + }, + { + "epoch": 4.125560538116592, + "grad_norm": 0.8434850573539734, + "learning_rate": 0.00048450027177605653, + "loss": 3.4063, + "step": 60720 + }, + { + "epoch": 4.1259002581872535, + "grad_norm": 0.8532112240791321, + "learning_rate": 0.0004844578067672238, + "loss": 3.1452, + "step": 60725 + }, + { + "epoch": 4.1262399782579156, + "grad_norm": 1.2177702188491821, + "learning_rate": 0.0004844153417583911, + "loss": 3.4509, + "step": 60730 + }, + { + "epoch": 4.126579698328578, + "grad_norm": 0.9804102182388306, + "learning_rate": 0.0004843728767495584, + "loss": 3.4107, + "step": 60735 + }, + { + "epoch": 4.126919418399239, + "grad_norm": 0.8604499101638794, + "learning_rate": 0.00048433041174072565, + "loss": 3.3541, + "step": 60740 + }, + { + "epoch": 4.127259138469901, + "grad_norm": 1.4239665269851685, + "learning_rate": 0.00048428794673189293, + "loss": 3.132, + "step": 60745 + }, + { + "epoch": 4.127598858540563, + "grad_norm": 0.885103166103363, + "learning_rate": 0.0004842454817230602, + "loss": 3.6869, + "step": 60750 + }, + { + "epoch": 4.127938578611224, + "grad_norm": 0.8011782765388489, + "learning_rate": 0.0004842030167142275, + "loss": 3.4997, + "step": 60755 + }, + { + "epoch": 4.128278298681886, + "grad_norm": 0.8957793712615967, + "learning_rate": 0.0004841605517053947, + "loss": 3.5033, + "step": 60760 + }, + { + "epoch": 4.128618018752548, + "grad_norm": 0.9379497170448303, + "learning_rate": 0.00048411808669656205, + "loss": 3.3168, + "step": 60765 + }, + { + "epoch": 4.1289577388232095, + "grad_norm": 1.9746123552322388, + "learning_rate": 0.00048407562168772933, + "loss": 3.4948, + "step": 60770 + }, + { + "epoch": 4.129297458893872, + "grad_norm": 0.9172775149345398, + "learning_rate": 0.00048403315667889656, + "loss": 3.3917, + "step": 60775 + }, + { + "epoch": 4.129637178964534, + "grad_norm": 1.019860029220581, + "learning_rate": 0.0004839906916700639, + "loss": 3.5188, + "step": 60780 + }, + { + "epoch": 4.129976899035195, + "grad_norm": 0.6821675896644592, + "learning_rate": 0.0004839482266612312, + "loss": 3.512, + "step": 60785 + }, + { + "epoch": 4.130316619105857, + "grad_norm": 1.0885146856307983, + "learning_rate": 0.0004839057616523984, + "loss": 3.5185, + "step": 60790 + }, + { + "epoch": 4.130656339176519, + "grad_norm": 1.2183313369750977, + "learning_rate": 0.00048386329664356574, + "loss": 3.3945, + "step": 60795 + }, + { + "epoch": 4.13099605924718, + "grad_norm": 0.8969928026199341, + "learning_rate": 0.000483820831634733, + "loss": 3.3495, + "step": 60800 + }, + { + "epoch": 4.131335779317842, + "grad_norm": 0.6959011554718018, + "learning_rate": 0.00048377836662590024, + "loss": 3.2901, + "step": 60805 + }, + { + "epoch": 4.131675499388503, + "grad_norm": 0.9824219346046448, + "learning_rate": 0.0004837359016170675, + "loss": 3.4948, + "step": 60810 + }, + { + "epoch": 4.1320152194591655, + "grad_norm": 1.1484794616699219, + "learning_rate": 0.00048369343660823486, + "loss": 3.3524, + "step": 60815 + }, + { + "epoch": 4.132354939529828, + "grad_norm": 0.8702675104141235, + "learning_rate": 0.0004836509715994021, + "loss": 3.5537, + "step": 60820 + }, + { + "epoch": 4.132694659600489, + "grad_norm": 0.9683172702789307, + "learning_rate": 0.00048360850659056936, + "loss": 3.4545, + "step": 60825 + }, + { + "epoch": 4.133034379671151, + "grad_norm": 1.029128909111023, + "learning_rate": 0.0004835660415817367, + "loss": 3.4714, + "step": 60830 + }, + { + "epoch": 4.133374099741813, + "grad_norm": 0.9914759397506714, + "learning_rate": 0.0004835235765729039, + "loss": 3.4354, + "step": 60835 + }, + { + "epoch": 4.133713819812474, + "grad_norm": 1.011048436164856, + "learning_rate": 0.0004834811115640712, + "loss": 3.3805, + "step": 60840 + }, + { + "epoch": 4.134053539883136, + "grad_norm": 1.2514324188232422, + "learning_rate": 0.0004834386465552385, + "loss": 3.3137, + "step": 60845 + }, + { + "epoch": 4.134393259953798, + "grad_norm": 0.8945834636688232, + "learning_rate": 0.00048339618154640576, + "loss": 3.5288, + "step": 60850 + }, + { + "epoch": 4.134732980024459, + "grad_norm": 1.014595627784729, + "learning_rate": 0.00048335371653757304, + "loss": 3.4544, + "step": 60855 + }, + { + "epoch": 4.1350727000951215, + "grad_norm": 0.949591338634491, + "learning_rate": 0.0004833112515287403, + "loss": 3.5007, + "step": 60860 + }, + { + "epoch": 4.135412420165784, + "grad_norm": 0.8489345908164978, + "learning_rate": 0.00048326878651990766, + "loss": 3.5158, + "step": 60865 + }, + { + "epoch": 4.135752140236445, + "grad_norm": 0.8581366539001465, + "learning_rate": 0.0004832263215110749, + "loss": 3.4125, + "step": 60870 + }, + { + "epoch": 4.136091860307107, + "grad_norm": 0.981072187423706, + "learning_rate": 0.00048318385650224216, + "loss": 3.5004, + "step": 60875 + }, + { + "epoch": 4.136431580377769, + "grad_norm": 1.2032808065414429, + "learning_rate": 0.00048314139149340944, + "loss": 3.4512, + "step": 60880 + }, + { + "epoch": 4.13677130044843, + "grad_norm": 0.9587888717651367, + "learning_rate": 0.0004830989264845767, + "loss": 3.4911, + "step": 60885 + }, + { + "epoch": 4.137111020519092, + "grad_norm": 0.9927632808685303, + "learning_rate": 0.00048306495447751055, + "loss": 3.5728, + "step": 60890 + }, + { + "epoch": 4.137450740589754, + "grad_norm": 1.0154802799224854, + "learning_rate": 0.0004830224894686778, + "loss": 3.4497, + "step": 60895 + }, + { + "epoch": 4.1377904606604154, + "grad_norm": 0.8871815204620361, + "learning_rate": 0.0004829800244598451, + "loss": 3.4507, + "step": 60900 + }, + { + "epoch": 4.1381301807310775, + "grad_norm": 0.8781307339668274, + "learning_rate": 0.0004829375594510124, + "loss": 3.3913, + "step": 60905 + }, + { + "epoch": 4.13846990080174, + "grad_norm": 0.8712561726570129, + "learning_rate": 0.00048289509444217967, + "loss": 3.509, + "step": 60910 + }, + { + "epoch": 4.138809620872401, + "grad_norm": 1.000641942024231, + "learning_rate": 0.00048285262943334695, + "loss": 3.6715, + "step": 60915 + }, + { + "epoch": 4.139149340943063, + "grad_norm": 0.9192151427268982, + "learning_rate": 0.00048281016442451417, + "loss": 3.3841, + "step": 60920 + }, + { + "epoch": 4.139489061013725, + "grad_norm": 0.8362899422645569, + "learning_rate": 0.0004827676994156815, + "loss": 3.4, + "step": 60925 + }, + { + "epoch": 4.139828781084386, + "grad_norm": 1.0384634733200073, + "learning_rate": 0.0004827252344068488, + "loss": 3.8469, + "step": 60930 + }, + { + "epoch": 4.140168501155048, + "grad_norm": 1.0129059553146362, + "learning_rate": 0.000482682769398016, + "loss": 3.4392, + "step": 60935 + }, + { + "epoch": 4.14050822122571, + "grad_norm": 0.8117624521255493, + "learning_rate": 0.00048264030438918335, + "loss": 3.486, + "step": 60940 + }, + { + "epoch": 4.1408479412963715, + "grad_norm": 0.8883699178695679, + "learning_rate": 0.0004825978393803506, + "loss": 3.7308, + "step": 60945 + }, + { + "epoch": 4.1411876613670335, + "grad_norm": 0.800073504447937, + "learning_rate": 0.00048255537437151785, + "loss": 3.6299, + "step": 60950 + }, + { + "epoch": 4.141527381437696, + "grad_norm": 0.782366931438446, + "learning_rate": 0.00048251290936268513, + "loss": 3.1066, + "step": 60955 + }, + { + "epoch": 4.141867101508357, + "grad_norm": 0.7832164168357849, + "learning_rate": 0.00048247044435385247, + "loss": 3.3253, + "step": 60960 + }, + { + "epoch": 4.142206821579019, + "grad_norm": 0.9168151617050171, + "learning_rate": 0.0004824279793450197, + "loss": 3.4552, + "step": 60965 + }, + { + "epoch": 4.142546541649681, + "grad_norm": 1.1362011432647705, + "learning_rate": 0.000482385514336187, + "loss": 3.5719, + "step": 60970 + }, + { + "epoch": 4.142886261720342, + "grad_norm": 0.7745733261108398, + "learning_rate": 0.0004823430493273543, + "loss": 3.3998, + "step": 60975 + }, + { + "epoch": 4.143225981791004, + "grad_norm": 0.8043518662452698, + "learning_rate": 0.00048230058431852153, + "loss": 3.369, + "step": 60980 + }, + { + "epoch": 4.143565701861666, + "grad_norm": 1.0540937185287476, + "learning_rate": 0.0004822581193096888, + "loss": 3.6495, + "step": 60985 + }, + { + "epoch": 4.1439054219323275, + "grad_norm": 1.1645954847335815, + "learning_rate": 0.00048221565430085615, + "loss": 3.3558, + "step": 60990 + }, + { + "epoch": 4.1442451420029895, + "grad_norm": 1.0627593994140625, + "learning_rate": 0.0004821731892920234, + "loss": 3.2822, + "step": 60995 + }, + { + "epoch": 4.144584862073652, + "grad_norm": 0.9172415733337402, + "learning_rate": 0.00048213072428319065, + "loss": 3.3148, + "step": 61000 + }, + { + "epoch": 4.144924582144313, + "grad_norm": 0.8969100117683411, + "learning_rate": 0.00048208825927435793, + "loss": 3.3565, + "step": 61005 + }, + { + "epoch": 4.145264302214975, + "grad_norm": 0.7650719881057739, + "learning_rate": 0.0004820457942655252, + "loss": 3.5713, + "step": 61010 + }, + { + "epoch": 4.145604022285637, + "grad_norm": 0.9105943441390991, + "learning_rate": 0.0004820033292566925, + "loss": 3.5107, + "step": 61015 + }, + { + "epoch": 4.145943742356298, + "grad_norm": 0.8141853213310242, + "learning_rate": 0.0004819608642478598, + "loss": 3.4214, + "step": 61020 + }, + { + "epoch": 4.14628346242696, + "grad_norm": 0.9823259711265564, + "learning_rate": 0.00048191839923902705, + "loss": 3.5348, + "step": 61025 + }, + { + "epoch": 4.146623182497622, + "grad_norm": 0.9623443484306335, + "learning_rate": 0.00048187593423019433, + "loss": 3.5348, + "step": 61030 + }, + { + "epoch": 4.1469629025682835, + "grad_norm": 0.831840455532074, + "learning_rate": 0.0004818334692213616, + "loss": 3.4637, + "step": 61035 + }, + { + "epoch": 4.147302622638946, + "grad_norm": 0.8067043423652649, + "learning_rate": 0.00048179100421252884, + "loss": 3.5884, + "step": 61040 + }, + { + "epoch": 4.147642342709608, + "grad_norm": 1.0196129083633423, + "learning_rate": 0.0004817485392036962, + "loss": 3.7589, + "step": 61045 + }, + { + "epoch": 4.147982062780269, + "grad_norm": 1.1024820804595947, + "learning_rate": 0.00048170607419486345, + "loss": 3.4848, + "step": 61050 + }, + { + "epoch": 4.148321782850931, + "grad_norm": 0.8064836859703064, + "learning_rate": 0.0004816636091860307, + "loss": 3.5727, + "step": 61055 + }, + { + "epoch": 4.148661502921593, + "grad_norm": 0.6886911392211914, + "learning_rate": 0.000481621144177198, + "loss": 3.4755, + "step": 61060 + }, + { + "epoch": 4.149001222992254, + "grad_norm": 0.8029985427856445, + "learning_rate": 0.0004815786791683653, + "loss": 3.367, + "step": 61065 + }, + { + "epoch": 4.149340943062916, + "grad_norm": 1.2038064002990723, + "learning_rate": 0.0004815362141595326, + "loss": 3.5569, + "step": 61070 + }, + { + "epoch": 4.149680663133578, + "grad_norm": 0.8145481944084167, + "learning_rate": 0.0004814937491506998, + "loss": 3.4493, + "step": 61075 + }, + { + "epoch": 4.1500203832042395, + "grad_norm": 0.8461026549339294, + "learning_rate": 0.00048145128414186713, + "loss": 3.4107, + "step": 61080 + }, + { + "epoch": 4.150360103274902, + "grad_norm": 0.973686933517456, + "learning_rate": 0.0004814088191330344, + "loss": 3.4107, + "step": 61085 + }, + { + "epoch": 4.150699823345564, + "grad_norm": 1.311644196510315, + "learning_rate": 0.00048136635412420164, + "loss": 3.4629, + "step": 61090 + }, + { + "epoch": 4.151039543416225, + "grad_norm": 1.0196807384490967, + "learning_rate": 0.000481323889115369, + "loss": 3.6787, + "step": 61095 + }, + { + "epoch": 4.151379263486887, + "grad_norm": 0.7503507733345032, + "learning_rate": 0.00048128142410653626, + "loss": 3.4577, + "step": 61100 + }, + { + "epoch": 4.151718983557549, + "grad_norm": 0.8744183778762817, + "learning_rate": 0.0004812389590977035, + "loss": 3.4197, + "step": 61105 + }, + { + "epoch": 4.15205870362821, + "grad_norm": 0.9147146344184875, + "learning_rate": 0.00048119649408887076, + "loss": 3.6768, + "step": 61110 + }, + { + "epoch": 4.152398423698872, + "grad_norm": 1.2562767267227173, + "learning_rate": 0.0004811540290800381, + "loss": 3.4754, + "step": 61115 + }, + { + "epoch": 4.152738143769534, + "grad_norm": 0.8295187950134277, + "learning_rate": 0.0004811115640712053, + "loss": 3.4594, + "step": 61120 + }, + { + "epoch": 4.1530778638401955, + "grad_norm": 0.8625035881996155, + "learning_rate": 0.0004810690990623726, + "loss": 3.567, + "step": 61125 + }, + { + "epoch": 4.153417583910858, + "grad_norm": 0.8011373281478882, + "learning_rate": 0.00048102663405353994, + "loss": 3.3791, + "step": 61130 + }, + { + "epoch": 4.15375730398152, + "grad_norm": 0.886993408203125, + "learning_rate": 0.00048098416904470716, + "loss": 3.5571, + "step": 61135 + }, + { + "epoch": 4.154097024052181, + "grad_norm": 1.239869236946106, + "learning_rate": 0.00048094170403587444, + "loss": 3.5597, + "step": 61140 + }, + { + "epoch": 4.154436744122843, + "grad_norm": 0.8828072547912598, + "learning_rate": 0.0004808992390270417, + "loss": 3.4681, + "step": 61145 + }, + { + "epoch": 4.154776464193505, + "grad_norm": 0.9335965514183044, + "learning_rate": 0.000480856774018209, + "loss": 3.5029, + "step": 61150 + }, + { + "epoch": 4.155116184264166, + "grad_norm": 0.7728764414787292, + "learning_rate": 0.0004808143090093763, + "loss": 3.6833, + "step": 61155 + }, + { + "epoch": 4.155455904334828, + "grad_norm": 1.0058040618896484, + "learning_rate": 0.00048077184400054356, + "loss": 3.3964, + "step": 61160 + }, + { + "epoch": 4.15579562440549, + "grad_norm": 0.9841409921646118, + "learning_rate": 0.00048072937899171084, + "loss": 3.5441, + "step": 61165 + }, + { + "epoch": 4.1561353444761515, + "grad_norm": 1.125986099243164, + "learning_rate": 0.0004806869139828781, + "loss": 3.7546, + "step": 61170 + }, + { + "epoch": 4.156475064546814, + "grad_norm": 0.8361207842826843, + "learning_rate": 0.0004806444489740454, + "loss": 3.3696, + "step": 61175 + }, + { + "epoch": 4.156814784617475, + "grad_norm": 1.1193454265594482, + "learning_rate": 0.00048060198396521263, + "loss": 3.5231, + "step": 61180 + }, + { + "epoch": 4.157154504688137, + "grad_norm": 1.0645794868469238, + "learning_rate": 0.00048055951895637996, + "loss": 3.6047, + "step": 61185 + }, + { + "epoch": 4.157494224758799, + "grad_norm": 0.9826891422271729, + "learning_rate": 0.00048051705394754724, + "loss": 3.5149, + "step": 61190 + }, + { + "epoch": 4.15783394482946, + "grad_norm": 0.7417986989021301, + "learning_rate": 0.00048047458893871447, + "loss": 3.6072, + "step": 61195 + }, + { + "epoch": 4.158173664900122, + "grad_norm": 1.480231523513794, + "learning_rate": 0.0004804321239298818, + "loss": 3.418, + "step": 61200 + }, + { + "epoch": 4.158513384970784, + "grad_norm": 1.0944023132324219, + "learning_rate": 0.0004803896589210491, + "loss": 3.2847, + "step": 61205 + }, + { + "epoch": 4.1588531050414455, + "grad_norm": 0.9818964600563049, + "learning_rate": 0.0004803471939122163, + "loss": 3.7795, + "step": 61210 + }, + { + "epoch": 4.1591928251121075, + "grad_norm": 0.7970341444015503, + "learning_rate": 0.0004803047289033836, + "loss": 3.3905, + "step": 61215 + }, + { + "epoch": 4.15953254518277, + "grad_norm": 0.8937077522277832, + "learning_rate": 0.0004802622638945509, + "loss": 3.5761, + "step": 61220 + }, + { + "epoch": 4.159872265253431, + "grad_norm": 1.023947834968567, + "learning_rate": 0.00048021979888571815, + "loss": 3.4634, + "step": 61225 + }, + { + "epoch": 4.160211985324093, + "grad_norm": 0.881676971912384, + "learning_rate": 0.00048017733387688543, + "loss": 3.3815, + "step": 61230 + }, + { + "epoch": 4.160551705394755, + "grad_norm": 0.8250371217727661, + "learning_rate": 0.00048013486886805276, + "loss": 3.4652, + "step": 61235 + }, + { + "epoch": 4.160891425465416, + "grad_norm": 1.173215389251709, + "learning_rate": 0.00048009240385922004, + "loss": 3.3945, + "step": 61240 + }, + { + "epoch": 4.161231145536078, + "grad_norm": 0.8424421548843384, + "learning_rate": 0.00048004993885038727, + "loss": 3.5992, + "step": 61245 + }, + { + "epoch": 4.16157086560674, + "grad_norm": 1.102643609046936, + "learning_rate": 0.00048000747384155455, + "loss": 3.4054, + "step": 61250 + }, + { + "epoch": 4.1619105856774015, + "grad_norm": 1.1778626441955566, + "learning_rate": 0.0004799650088327219, + "loss": 3.6202, + "step": 61255 + }, + { + "epoch": 4.1622503057480635, + "grad_norm": 0.723430871963501, + "learning_rate": 0.0004799225438238891, + "loss": 3.6755, + "step": 61260 + }, + { + "epoch": 4.162590025818726, + "grad_norm": 0.8646814823150635, + "learning_rate": 0.0004798800788150564, + "loss": 3.5397, + "step": 61265 + }, + { + "epoch": 4.162929745889387, + "grad_norm": 0.9581089615821838, + "learning_rate": 0.0004798376138062237, + "loss": 3.4534, + "step": 61270 + }, + { + "epoch": 4.163269465960049, + "grad_norm": 0.8008308410644531, + "learning_rate": 0.00047979514879739095, + "loss": 3.5739, + "step": 61275 + }, + { + "epoch": 4.163609186030711, + "grad_norm": 0.9227950572967529, + "learning_rate": 0.00047975268378855823, + "loss": 3.5842, + "step": 61280 + }, + { + "epoch": 4.163948906101372, + "grad_norm": 0.6787657737731934, + "learning_rate": 0.00047971021877972556, + "loss": 3.3714, + "step": 61285 + }, + { + "epoch": 4.164288626172034, + "grad_norm": 0.7908263802528381, + "learning_rate": 0.0004796677537708928, + "loss": 3.4104, + "step": 61290 + }, + { + "epoch": 4.164628346242696, + "grad_norm": 0.8699466586112976, + "learning_rate": 0.00047962528876206007, + "loss": 3.3827, + "step": 61295 + }, + { + "epoch": 4.1649680663133575, + "grad_norm": 0.8958128690719604, + "learning_rate": 0.00047958282375322735, + "loss": 3.5093, + "step": 61300 + }, + { + "epoch": 4.1653077863840196, + "grad_norm": 0.8379378318786621, + "learning_rate": 0.00047954035874439463, + "loss": 3.6384, + "step": 61305 + }, + { + "epoch": 4.165647506454682, + "grad_norm": 0.8143219947814941, + "learning_rate": 0.0004794978937355619, + "loss": 3.3426, + "step": 61310 + }, + { + "epoch": 4.165987226525343, + "grad_norm": 1.0008351802825928, + "learning_rate": 0.0004794554287267292, + "loss": 3.5585, + "step": 61315 + }, + { + "epoch": 4.166326946596005, + "grad_norm": 0.9004687666893005, + "learning_rate": 0.00047941296371789647, + "loss": 3.4327, + "step": 61320 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.8180439472198486, + "learning_rate": 0.00047937049870906375, + "loss": 3.3314, + "step": 61325 + }, + { + "epoch": 4.167006386737328, + "grad_norm": 1.150847315788269, + "learning_rate": 0.00047932803370023103, + "loss": 3.5631, + "step": 61330 + }, + { + "epoch": 4.16734610680799, + "grad_norm": 0.8171801567077637, + "learning_rate": 0.00047928556869139826, + "loss": 3.5948, + "step": 61335 + }, + { + "epoch": 4.167685826878652, + "grad_norm": 1.5090765953063965, + "learning_rate": 0.0004792431036825656, + "loss": 3.7341, + "step": 61340 + }, + { + "epoch": 4.1680255469493135, + "grad_norm": 0.8964012861251831, + "learning_rate": 0.00047920063867373287, + "loss": 3.193, + "step": 61345 + }, + { + "epoch": 4.168365267019976, + "grad_norm": 1.0191599130630493, + "learning_rate": 0.0004791581736649001, + "loss": 3.3238, + "step": 61350 + }, + { + "epoch": 4.168704987090638, + "grad_norm": 0.8877114653587341, + "learning_rate": 0.00047911570865606743, + "loss": 3.6474, + "step": 61355 + }, + { + "epoch": 4.169044707161299, + "grad_norm": 0.7577100992202759, + "learning_rate": 0.0004790732436472347, + "loss": 3.6155, + "step": 61360 + }, + { + "epoch": 4.169384427231961, + "grad_norm": 0.9438970685005188, + "learning_rate": 0.00047903077863840194, + "loss": 3.2758, + "step": 61365 + }, + { + "epoch": 4.169724147302623, + "grad_norm": 0.9291349053382874, + "learning_rate": 0.0004789883136295692, + "loss": 3.2867, + "step": 61370 + }, + { + "epoch": 4.170063867373284, + "grad_norm": 0.9546253681182861, + "learning_rate": 0.00047894584862073655, + "loss": 3.3778, + "step": 61375 + }, + { + "epoch": 4.170403587443946, + "grad_norm": 0.9703764319419861, + "learning_rate": 0.0004789033836119038, + "loss": 3.577, + "step": 61380 + }, + { + "epoch": 4.170743307514608, + "grad_norm": 0.8772135376930237, + "learning_rate": 0.00047886091860307106, + "loss": 3.512, + "step": 61385 + }, + { + "epoch": 4.1710830275852695, + "grad_norm": 0.9121068120002747, + "learning_rate": 0.0004788184535942384, + "loss": 3.0894, + "step": 61390 + }, + { + "epoch": 4.171422747655932, + "grad_norm": 0.9390279054641724, + "learning_rate": 0.0004787759885854056, + "loss": 3.683, + "step": 61395 + }, + { + "epoch": 4.171762467726594, + "grad_norm": 1.22541344165802, + "learning_rate": 0.0004787335235765729, + "loss": 3.496, + "step": 61400 + }, + { + "epoch": 4.172102187797255, + "grad_norm": 0.7794098854064941, + "learning_rate": 0.0004786910585677402, + "loss": 3.445, + "step": 61405 + }, + { + "epoch": 4.172441907867917, + "grad_norm": 0.8829683661460876, + "learning_rate": 0.0004786485935589075, + "loss": 3.6252, + "step": 61410 + }, + { + "epoch": 4.172781627938579, + "grad_norm": 1.179837942123413, + "learning_rate": 0.00047860612855007474, + "loss": 3.4379, + "step": 61415 + }, + { + "epoch": 4.17312134800924, + "grad_norm": 0.8776351809501648, + "learning_rate": 0.000478563663541242, + "loss": 3.4873, + "step": 61420 + }, + { + "epoch": 4.173461068079902, + "grad_norm": 1.2777010202407837, + "learning_rate": 0.00047852119853240935, + "loss": 3.4579, + "step": 61425 + }, + { + "epoch": 4.173800788150564, + "grad_norm": 0.8634617924690247, + "learning_rate": 0.0004784787335235766, + "loss": 3.5105, + "step": 61430 + }, + { + "epoch": 4.1741405082212255, + "grad_norm": 1.0558093786239624, + "learning_rate": 0.00047843626851474386, + "loss": 3.636, + "step": 61435 + }, + { + "epoch": 4.174480228291888, + "grad_norm": 0.9798359870910645, + "learning_rate": 0.00047839380350591114, + "loss": 3.6461, + "step": 61440 + }, + { + "epoch": 4.17481994836255, + "grad_norm": 0.8451381325721741, + "learning_rate": 0.0004783513384970784, + "loss": 3.5088, + "step": 61445 + }, + { + "epoch": 4.175159668433211, + "grad_norm": 0.7302253246307373, + "learning_rate": 0.0004783088734882457, + "loss": 3.709, + "step": 61450 + }, + { + "epoch": 4.175499388503873, + "grad_norm": 0.7581247687339783, + "learning_rate": 0.000478266408479413, + "loss": 3.4924, + "step": 61455 + }, + { + "epoch": 4.175839108574535, + "grad_norm": 0.8275901675224304, + "learning_rate": 0.00047822394347058026, + "loss": 3.3504, + "step": 61460 + }, + { + "epoch": 4.176178828645196, + "grad_norm": 1.0731698274612427, + "learning_rate": 0.00047818147846174754, + "loss": 3.4189, + "step": 61465 + }, + { + "epoch": 4.176518548715858, + "grad_norm": 1.0587215423583984, + "learning_rate": 0.0004781390134529148, + "loss": 3.4817, + "step": 61470 + }, + { + "epoch": 4.17685826878652, + "grad_norm": 0.7530107498168945, + "learning_rate": 0.00047809654844408204, + "loss": 3.5089, + "step": 61475 + }, + { + "epoch": 4.1771979888571815, + "grad_norm": 0.7558256387710571, + "learning_rate": 0.0004780540834352494, + "loss": 3.6728, + "step": 61480 + }, + { + "epoch": 4.177537708927844, + "grad_norm": 0.9698879718780518, + "learning_rate": 0.00047801161842641666, + "loss": 3.2301, + "step": 61485 + }, + { + "epoch": 4.177877428998505, + "grad_norm": 0.932871401309967, + "learning_rate": 0.0004779691534175839, + "loss": 3.508, + "step": 61490 + }, + { + "epoch": 4.178217149069167, + "grad_norm": 0.9275081753730774, + "learning_rate": 0.0004779266884087512, + "loss": 3.4633, + "step": 61495 + }, + { + "epoch": 4.178556869139829, + "grad_norm": 0.875995934009552, + "learning_rate": 0.0004778842233999185, + "loss": 3.4221, + "step": 61500 + }, + { + "epoch": 4.17889658921049, + "grad_norm": 1.5425082445144653, + "learning_rate": 0.0004778417583910857, + "loss": 3.5028, + "step": 61505 + }, + { + "epoch": 4.179236309281152, + "grad_norm": 0.9858370423316956, + "learning_rate": 0.000477799293382253, + "loss": 3.4641, + "step": 61510 + }, + { + "epoch": 4.179576029351814, + "grad_norm": 1.0755040645599365, + "learning_rate": 0.00047775682837342034, + "loss": 3.2477, + "step": 61515 + }, + { + "epoch": 4.1799157494224755, + "grad_norm": 0.792167603969574, + "learning_rate": 0.00047771436336458756, + "loss": 3.5842, + "step": 61520 + }, + { + "epoch": 4.1802554694931375, + "grad_norm": 0.9279307126998901, + "learning_rate": 0.00047767189835575485, + "loss": 3.2422, + "step": 61525 + }, + { + "epoch": 4.1805951895638, + "grad_norm": 0.6563863158226013, + "learning_rate": 0.0004776294333469222, + "loss": 3.6328, + "step": 61530 + }, + { + "epoch": 4.180934909634461, + "grad_norm": 1.3233578205108643, + "learning_rate": 0.0004775869683380894, + "loss": 3.7242, + "step": 61535 + }, + { + "epoch": 4.181274629705123, + "grad_norm": 0.9331963062286377, + "learning_rate": 0.0004775445033292567, + "loss": 3.5129, + "step": 61540 + }, + { + "epoch": 4.181614349775785, + "grad_norm": 0.9481460452079773, + "learning_rate": 0.000477502038320424, + "loss": 3.5239, + "step": 61545 + }, + { + "epoch": 4.181954069846446, + "grad_norm": 0.9343422651290894, + "learning_rate": 0.00047745957331159125, + "loss": 3.4359, + "step": 61550 + }, + { + "epoch": 4.182293789917108, + "grad_norm": 0.8137853145599365, + "learning_rate": 0.0004774171083027585, + "loss": 3.495, + "step": 61555 + }, + { + "epoch": 4.18263350998777, + "grad_norm": 0.8054407835006714, + "learning_rate": 0.0004773746432939258, + "loss": 3.2664, + "step": 61560 + }, + { + "epoch": 4.1829732300584315, + "grad_norm": 0.8184186220169067, + "learning_rate": 0.0004773321782850931, + "loss": 3.679, + "step": 61565 + }, + { + "epoch": 4.1833129501290935, + "grad_norm": 1.0021629333496094, + "learning_rate": 0.00047728971327626037, + "loss": 3.4702, + "step": 61570 + }, + { + "epoch": 4.183652670199756, + "grad_norm": 0.8591084480285645, + "learning_rate": 0.00047724724826742765, + "loss": 3.6655, + "step": 61575 + }, + { + "epoch": 4.183992390270417, + "grad_norm": 1.0550155639648438, + "learning_rate": 0.000477204783258595, + "loss": 3.6389, + "step": 61580 + }, + { + "epoch": 4.184332110341079, + "grad_norm": 0.6556769609451294, + "learning_rate": 0.0004771623182497622, + "loss": 3.5613, + "step": 61585 + }, + { + "epoch": 4.184671830411741, + "grad_norm": 0.7437652945518494, + "learning_rate": 0.0004771198532409295, + "loss": 3.5224, + "step": 61590 + }, + { + "epoch": 4.185011550482402, + "grad_norm": 0.9275814294815063, + "learning_rate": 0.00047707738823209677, + "loss": 3.5707, + "step": 61595 + }, + { + "epoch": 4.185351270553064, + "grad_norm": 0.707420289516449, + "learning_rate": 0.00047703492322326405, + "loss": 3.7118, + "step": 61600 + }, + { + "epoch": 4.185690990623726, + "grad_norm": 0.9534379839897156, + "learning_rate": 0.0004769924582144313, + "loss": 3.5531, + "step": 61605 + }, + { + "epoch": 4.1860307106943875, + "grad_norm": 1.0042377710342407, + "learning_rate": 0.0004769499932055986, + "loss": 3.6389, + "step": 61610 + }, + { + "epoch": 4.1863704307650496, + "grad_norm": 1.1760345697402954, + "learning_rate": 0.0004769075281967659, + "loss": 3.4865, + "step": 61615 + }, + { + "epoch": 4.186710150835712, + "grad_norm": 1.0403176546096802, + "learning_rate": 0.00047686506318793317, + "loss": 3.5263, + "step": 61620 + }, + { + "epoch": 4.187049870906373, + "grad_norm": 9.477473258972168, + "learning_rate": 0.00047682259817910045, + "loss": 3.4377, + "step": 61625 + }, + { + "epoch": 4.187389590977035, + "grad_norm": 0.9285584688186646, + "learning_rate": 0.00047678013317026767, + "loss": 3.6038, + "step": 61630 + }, + { + "epoch": 4.187729311047697, + "grad_norm": 0.9756678938865662, + "learning_rate": 0.000476737668161435, + "loss": 3.3521, + "step": 61635 + }, + { + "epoch": 4.188069031118358, + "grad_norm": 0.8023003339767456, + "learning_rate": 0.0004766952031526023, + "loss": 3.5971, + "step": 61640 + }, + { + "epoch": 4.18840875118902, + "grad_norm": 0.7629098296165466, + "learning_rate": 0.0004766527381437695, + "loss": 3.2694, + "step": 61645 + }, + { + "epoch": 4.188748471259682, + "grad_norm": 0.9840520620346069, + "learning_rate": 0.00047661027313493685, + "loss": 3.3458, + "step": 61650 + }, + { + "epoch": 4.1890881913303435, + "grad_norm": 0.9950060844421387, + "learning_rate": 0.0004765678081261041, + "loss": 3.4792, + "step": 61655 + }, + { + "epoch": 4.189427911401006, + "grad_norm": 1.0389543771743774, + "learning_rate": 0.00047652534311727135, + "loss": 3.2795, + "step": 61660 + }, + { + "epoch": 4.189767631471668, + "grad_norm": 0.7561582326889038, + "learning_rate": 0.00047648287810843863, + "loss": 3.4474, + "step": 61665 + }, + { + "epoch": 4.190107351542329, + "grad_norm": 0.8321341276168823, + "learning_rate": 0.00047644041309960597, + "loss": 3.4571, + "step": 61670 + }, + { + "epoch": 4.190447071612991, + "grad_norm": 0.9924423098564148, + "learning_rate": 0.0004763979480907732, + "loss": 3.4653, + "step": 61675 + }, + { + "epoch": 4.190786791683653, + "grad_norm": 0.858684241771698, + "learning_rate": 0.0004763554830819405, + "loss": 3.368, + "step": 61680 + }, + { + "epoch": 4.191126511754314, + "grad_norm": 0.9290311336517334, + "learning_rate": 0.0004763130180731078, + "loss": 3.1864, + "step": 61685 + }, + { + "epoch": 4.191466231824976, + "grad_norm": 0.8252352476119995, + "learning_rate": 0.00047627055306427503, + "loss": 3.5335, + "step": 61690 + }, + { + "epoch": 4.191805951895638, + "grad_norm": 0.9329754114151001, + "learning_rate": 0.0004762280880554423, + "loss": 3.7466, + "step": 61695 + }, + { + "epoch": 4.1921456719662995, + "grad_norm": 0.8813470005989075, + "learning_rate": 0.0004761856230466096, + "loss": 3.6787, + "step": 61700 + }, + { + "epoch": 4.192485392036962, + "grad_norm": 0.7869167327880859, + "learning_rate": 0.0004761431580377769, + "loss": 3.3092, + "step": 61705 + }, + { + "epoch": 4.192825112107624, + "grad_norm": 0.8545785546302795, + "learning_rate": 0.00047610069302894415, + "loss": 3.3465, + "step": 61710 + }, + { + "epoch": 4.193164832178285, + "grad_norm": 0.9059953689575195, + "learning_rate": 0.00047605822802011143, + "loss": 3.3153, + "step": 61715 + }, + { + "epoch": 4.193504552248947, + "grad_norm": 0.7889425158500671, + "learning_rate": 0.0004760157630112787, + "loss": 3.3422, + "step": 61720 + }, + { + "epoch": 4.193844272319609, + "grad_norm": 0.8810021877288818, + "learning_rate": 0.000475973298002446, + "loss": 3.406, + "step": 61725 + }, + { + "epoch": 4.19418399239027, + "grad_norm": 0.7761560678482056, + "learning_rate": 0.0004759308329936133, + "loss": 3.5315, + "step": 61730 + }, + { + "epoch": 4.194523712460932, + "grad_norm": 0.8527973294258118, + "learning_rate": 0.0004758883679847805, + "loss": 3.4653, + "step": 61735 + }, + { + "epoch": 4.194863432531594, + "grad_norm": 0.8393343687057495, + "learning_rate": 0.00047584590297594783, + "loss": 3.4343, + "step": 61740 + }, + { + "epoch": 4.1952031526022555, + "grad_norm": 0.875372052192688, + "learning_rate": 0.0004758034379671151, + "loss": 3.5771, + "step": 61745 + }, + { + "epoch": 4.195542872672918, + "grad_norm": 0.6871384382247925, + "learning_rate": 0.0004757609729582824, + "loss": 3.5994, + "step": 61750 + }, + { + "epoch": 4.19588259274358, + "grad_norm": 1.0302915573120117, + "learning_rate": 0.0004757185079494497, + "loss": 3.5878, + "step": 61755 + }, + { + "epoch": 4.196222312814241, + "grad_norm": 1.1195985078811646, + "learning_rate": 0.00047567604294061695, + "loss": 3.4856, + "step": 61760 + }, + { + "epoch": 4.196562032884903, + "grad_norm": 0.8441839814186096, + "learning_rate": 0.00047563357793178423, + "loss": 3.3161, + "step": 61765 + }, + { + "epoch": 4.196901752955565, + "grad_norm": 1.3091888427734375, + "learning_rate": 0.00047559111292295146, + "loss": 3.6511, + "step": 61770 + }, + { + "epoch": 4.197241473026226, + "grad_norm": 1.1396050453186035, + "learning_rate": 0.0004755486479141188, + "loss": 3.5603, + "step": 61775 + }, + { + "epoch": 4.197581193096888, + "grad_norm": 0.8382667303085327, + "learning_rate": 0.0004755061829052861, + "loss": 3.5155, + "step": 61780 + }, + { + "epoch": 4.19792091316755, + "grad_norm": 0.9768345952033997, + "learning_rate": 0.0004754637178964533, + "loss": 3.4241, + "step": 61785 + }, + { + "epoch": 4.1982606332382115, + "grad_norm": 0.7651055455207825, + "learning_rate": 0.00047542125288762063, + "loss": 3.5317, + "step": 61790 + }, + { + "epoch": 4.198600353308874, + "grad_norm": 0.8648186922073364, + "learning_rate": 0.0004753787878787879, + "loss": 3.3982, + "step": 61795 + }, + { + "epoch": 4.198940073379536, + "grad_norm": 0.8968487977981567, + "learning_rate": 0.00047533632286995514, + "loss": 3.6876, + "step": 61800 + }, + { + "epoch": 4.199279793450197, + "grad_norm": 0.9737926125526428, + "learning_rate": 0.0004752938578611224, + "loss": 3.5581, + "step": 61805 + }, + { + "epoch": 4.199619513520859, + "grad_norm": 0.8033174872398376, + "learning_rate": 0.00047525139285228976, + "loss": 3.5607, + "step": 61810 + }, + { + "epoch": 4.199959233591521, + "grad_norm": 0.8751312494277954, + "learning_rate": 0.000475208927843457, + "loss": 3.5718, + "step": 61815 + }, + { + "epoch": 4.200298953662182, + "grad_norm": 1.1747570037841797, + "learning_rate": 0.00047516646283462426, + "loss": 3.577, + "step": 61820 + }, + { + "epoch": 4.200638673732844, + "grad_norm": 0.956802487373352, + "learning_rate": 0.0004751239978257916, + "loss": 3.4934, + "step": 61825 + }, + { + "epoch": 4.200978393803506, + "grad_norm": 0.9007416367530823, + "learning_rate": 0.0004750815328169588, + "loss": 3.6234, + "step": 61830 + }, + { + "epoch": 4.2013181138741675, + "grad_norm": 1.374605417251587, + "learning_rate": 0.0004750390678081261, + "loss": 3.5296, + "step": 61835 + }, + { + "epoch": 4.20165783394483, + "grad_norm": 0.8114254474639893, + "learning_rate": 0.00047499660279929344, + "loss": 3.3943, + "step": 61840 + }, + { + "epoch": 4.201997554015492, + "grad_norm": 1.2361410856246948, + "learning_rate": 0.00047495413779046066, + "loss": 3.4279, + "step": 61845 + }, + { + "epoch": 4.202337274086153, + "grad_norm": 0.785903811454773, + "learning_rate": 0.00047491167278162794, + "loss": 3.307, + "step": 61850 + }, + { + "epoch": 4.202676994156815, + "grad_norm": 1.0972729921340942, + "learning_rate": 0.0004748692077727952, + "loss": 3.2687, + "step": 61855 + }, + { + "epoch": 4.203016714227476, + "grad_norm": 0.9873654246330261, + "learning_rate": 0.0004748267427639625, + "loss": 3.364, + "step": 61860 + }, + { + "epoch": 4.203356434298138, + "grad_norm": 0.8294976949691772, + "learning_rate": 0.0004747842777551298, + "loss": 3.3699, + "step": 61865 + }, + { + "epoch": 4.2036961543688, + "grad_norm": 1.0659875869750977, + "learning_rate": 0.00047474181274629706, + "loss": 3.4089, + "step": 61870 + }, + { + "epoch": 4.2040358744394615, + "grad_norm": 1.2330275774002075, + "learning_rate": 0.00047469934773746434, + "loss": 3.1422, + "step": 61875 + }, + { + "epoch": 4.2043755945101235, + "grad_norm": 0.9525544047355652, + "learning_rate": 0.0004746568827286316, + "loss": 3.5237, + "step": 61880 + }, + { + "epoch": 4.204715314580786, + "grad_norm": 0.7312281727790833, + "learning_rate": 0.0004746144177197989, + "loss": 3.5415, + "step": 61885 + }, + { + "epoch": 4.205055034651447, + "grad_norm": 1.1209661960601807, + "learning_rate": 0.00047457195271096613, + "loss": 3.4765, + "step": 61890 + }, + { + "epoch": 4.205394754722109, + "grad_norm": 0.9766157269477844, + "learning_rate": 0.00047452948770213346, + "loss": 3.4929, + "step": 61895 + }, + { + "epoch": 4.205734474792771, + "grad_norm": 1.1769119501113892, + "learning_rate": 0.00047448702269330074, + "loss": 3.4064, + "step": 61900 + }, + { + "epoch": 4.206074194863432, + "grad_norm": 0.9480777978897095, + "learning_rate": 0.00047444455768446797, + "loss": 3.2579, + "step": 61905 + }, + { + "epoch": 4.206413914934094, + "grad_norm": 0.8584173321723938, + "learning_rate": 0.0004744020926756353, + "loss": 3.5766, + "step": 61910 + }, + { + "epoch": 4.206753635004756, + "grad_norm": 0.7385799884796143, + "learning_rate": 0.0004743596276668026, + "loss": 3.6365, + "step": 61915 + }, + { + "epoch": 4.2070933550754175, + "grad_norm": 1.1514978408813477, + "learning_rate": 0.00047431716265796986, + "loss": 3.5375, + "step": 61920 + }, + { + "epoch": 4.20743307514608, + "grad_norm": 0.7875049710273743, + "learning_rate": 0.0004742746976491371, + "loss": 3.3705, + "step": 61925 + }, + { + "epoch": 4.207772795216742, + "grad_norm": 0.9296401143074036, + "learning_rate": 0.0004742322326403044, + "loss": 3.3525, + "step": 61930 + }, + { + "epoch": 4.208112515287403, + "grad_norm": 0.6532353758811951, + "learning_rate": 0.0004741897676314717, + "loss": 3.4933, + "step": 61935 + }, + { + "epoch": 4.208452235358065, + "grad_norm": 1.0172353982925415, + "learning_rate": 0.00047414730262263893, + "loss": 3.5243, + "step": 61940 + }, + { + "epoch": 4.208791955428727, + "grad_norm": 1.4368290901184082, + "learning_rate": 0.00047410483761380626, + "loss": 3.3615, + "step": 61945 + }, + { + "epoch": 4.209131675499388, + "grad_norm": 1.1404001712799072, + "learning_rate": 0.00047406237260497354, + "loss": 3.6397, + "step": 61950 + }, + { + "epoch": 4.20947139557005, + "grad_norm": 0.9934514760971069, + "learning_rate": 0.00047401990759614077, + "loss": 3.5234, + "step": 61955 + }, + { + "epoch": 4.209811115640712, + "grad_norm": 0.9641137719154358, + "learning_rate": 0.00047397744258730805, + "loss": 3.519, + "step": 61960 + }, + { + "epoch": 4.2101508357113735, + "grad_norm": 1.0760753154754639, + "learning_rate": 0.0004739349775784754, + "loss": 3.579, + "step": 61965 + }, + { + "epoch": 4.210490555782036, + "grad_norm": 0.8873518109321594, + "learning_rate": 0.0004738925125696426, + "loss": 3.2917, + "step": 61970 + }, + { + "epoch": 4.210830275852698, + "grad_norm": 0.8862466216087341, + "learning_rate": 0.0004738500475608099, + "loss": 3.5296, + "step": 61975 + }, + { + "epoch": 4.211169995923359, + "grad_norm": 0.8808194398880005, + "learning_rate": 0.0004738075825519772, + "loss": 3.3667, + "step": 61980 + }, + { + "epoch": 4.211509715994021, + "grad_norm": 0.8476575613021851, + "learning_rate": 0.00047376511754314445, + "loss": 3.4982, + "step": 61985 + }, + { + "epoch": 4.211849436064683, + "grad_norm": 1.066279411315918, + "learning_rate": 0.00047372265253431173, + "loss": 3.3649, + "step": 61990 + }, + { + "epoch": 4.212189156135344, + "grad_norm": 1.1036165952682495, + "learning_rate": 0.000473680187525479, + "loss": 3.5094, + "step": 61995 + }, + { + "epoch": 4.212528876206006, + "grad_norm": 0.7324236631393433, + "learning_rate": 0.0004736377225166463, + "loss": 3.5919, + "step": 62000 + }, + { + "epoch": 4.212868596276668, + "grad_norm": 1.0725162029266357, + "learning_rate": 0.00047359525750781357, + "loss": 3.5393, + "step": 62005 + }, + { + "epoch": 4.2132083163473295, + "grad_norm": 1.1366947889328003, + "learning_rate": 0.00047355279249898085, + "loss": 3.3722, + "step": 62010 + }, + { + "epoch": 4.213548036417992, + "grad_norm": 0.6647496223449707, + "learning_rate": 0.00047351032749014813, + "loss": 3.5435, + "step": 62015 + }, + { + "epoch": 4.213887756488654, + "grad_norm": 0.9916176795959473, + "learning_rate": 0.0004734678624813154, + "loss": 3.6031, + "step": 62020 + }, + { + "epoch": 4.214227476559315, + "grad_norm": 0.8981994986534119, + "learning_rate": 0.0004734253974724827, + "loss": 3.6764, + "step": 62025 + }, + { + "epoch": 4.214567196629977, + "grad_norm": 0.7730247974395752, + "learning_rate": 0.0004733829324636499, + "loss": 3.5094, + "step": 62030 + }, + { + "epoch": 4.214906916700639, + "grad_norm": 1.2946847677230835, + "learning_rate": 0.00047334046745481725, + "loss": 3.6072, + "step": 62035 + }, + { + "epoch": 4.2152466367713, + "grad_norm": 1.2118031978607178, + "learning_rate": 0.00047329800244598453, + "loss": 3.1874, + "step": 62040 + }, + { + "epoch": 4.215586356841962, + "grad_norm": 0.8122612833976746, + "learning_rate": 0.00047325553743715176, + "loss": 3.6896, + "step": 62045 + }, + { + "epoch": 4.215926076912624, + "grad_norm": 0.9668992757797241, + "learning_rate": 0.0004732130724283191, + "loss": 3.2654, + "step": 62050 + }, + { + "epoch": 4.2162657969832855, + "grad_norm": 1.160780668258667, + "learning_rate": 0.00047317060741948637, + "loss": 3.2785, + "step": 62055 + }, + { + "epoch": 4.216605517053948, + "grad_norm": 0.9436171054840088, + "learning_rate": 0.0004731281424106536, + "loss": 3.4188, + "step": 62060 + }, + { + "epoch": 4.21694523712461, + "grad_norm": 0.8520186543464661, + "learning_rate": 0.0004730856774018209, + "loss": 3.423, + "step": 62065 + }, + { + "epoch": 4.217284957195271, + "grad_norm": 1.3129063844680786, + "learning_rate": 0.0004730432123929882, + "loss": 3.7226, + "step": 62070 + }, + { + "epoch": 4.217624677265933, + "grad_norm": 0.810491144657135, + "learning_rate": 0.00047300074738415544, + "loss": 3.4151, + "step": 62075 + }, + { + "epoch": 4.217964397336595, + "grad_norm": 0.8563981652259827, + "learning_rate": 0.0004729582823753227, + "loss": 3.309, + "step": 62080 + }, + { + "epoch": 4.218304117407256, + "grad_norm": 0.925452470779419, + "learning_rate": 0.00047291581736649005, + "loss": 3.5391, + "step": 62085 + }, + { + "epoch": 4.218643837477918, + "grad_norm": 1.7217905521392822, + "learning_rate": 0.00047287335235765733, + "loss": 3.5141, + "step": 62090 + }, + { + "epoch": 4.21898355754858, + "grad_norm": 1.3116225004196167, + "learning_rate": 0.00047283088734882456, + "loss": 3.4105, + "step": 62095 + }, + { + "epoch": 4.2193232776192415, + "grad_norm": 0.9617807865142822, + "learning_rate": 0.00047278842233999184, + "loss": 3.5339, + "step": 62100 + }, + { + "epoch": 4.219662997689904, + "grad_norm": 0.7606753706932068, + "learning_rate": 0.00047274595733115917, + "loss": 3.6702, + "step": 62105 + }, + { + "epoch": 4.220002717760566, + "grad_norm": 2.4240195751190186, + "learning_rate": 0.0004727034923223264, + "loss": 3.1129, + "step": 62110 + }, + { + "epoch": 4.220342437831227, + "grad_norm": 0.878364086151123, + "learning_rate": 0.0004726610273134937, + "loss": 3.4774, + "step": 62115 + }, + { + "epoch": 4.220682157901889, + "grad_norm": 0.696345865726471, + "learning_rate": 0.000472618562304661, + "loss": 3.5585, + "step": 62120 + }, + { + "epoch": 4.221021877972551, + "grad_norm": 0.8128225803375244, + "learning_rate": 0.00047257609729582824, + "loss": 3.4744, + "step": 62125 + }, + { + "epoch": 4.221361598043212, + "grad_norm": 1.0861862897872925, + "learning_rate": 0.0004725336322869955, + "loss": 3.5329, + "step": 62130 + }, + { + "epoch": 4.221701318113874, + "grad_norm": 0.8595383763313293, + "learning_rate": 0.00047249116727816285, + "loss": 3.3268, + "step": 62135 + }, + { + "epoch": 4.222041038184536, + "grad_norm": 0.8806082010269165, + "learning_rate": 0.0004724487022693301, + "loss": 3.3303, + "step": 62140 + }, + { + "epoch": 4.2223807582551975, + "grad_norm": 0.8835695385932922, + "learning_rate": 0.00047240623726049736, + "loss": 3.518, + "step": 62145 + }, + { + "epoch": 4.22272047832586, + "grad_norm": 0.8354586362838745, + "learning_rate": 0.00047236377225166464, + "loss": 3.5987, + "step": 62150 + }, + { + "epoch": 4.223060198396522, + "grad_norm": 1.0021276473999023, + "learning_rate": 0.0004723213072428319, + "loss": 3.625, + "step": 62155 + }, + { + "epoch": 4.223399918467183, + "grad_norm": 1.1074401140213013, + "learning_rate": 0.0004722788422339992, + "loss": 3.3408, + "step": 62160 + }, + { + "epoch": 4.223739638537845, + "grad_norm": 0.8748573660850525, + "learning_rate": 0.0004722363772251665, + "loss": 3.2681, + "step": 62165 + }, + { + "epoch": 4.224079358608506, + "grad_norm": 1.2845767736434937, + "learning_rate": 0.00047219391221633376, + "loss": 3.2329, + "step": 62170 + }, + { + "epoch": 4.224419078679168, + "grad_norm": 0.7824113368988037, + "learning_rate": 0.00047215144720750104, + "loss": 3.2993, + "step": 62175 + }, + { + "epoch": 4.22475879874983, + "grad_norm": 1.0895147323608398, + "learning_rate": 0.0004721089821986683, + "loss": 3.5681, + "step": 62180 + }, + { + "epoch": 4.2250985188204915, + "grad_norm": 1.124133586883545, + "learning_rate": 0.00047206651718983554, + "loss": 3.3643, + "step": 62185 + }, + { + "epoch": 4.2254382388911536, + "grad_norm": 12.661433219909668, + "learning_rate": 0.0004720240521810029, + "loss": 3.5435, + "step": 62190 + }, + { + "epoch": 4.225777958961816, + "grad_norm": 0.8504529595375061, + "learning_rate": 0.00047198158717217016, + "loss": 3.3245, + "step": 62195 + }, + { + "epoch": 4.226117679032477, + "grad_norm": 0.8816905617713928, + "learning_rate": 0.0004719391221633374, + "loss": 3.5469, + "step": 62200 + }, + { + "epoch": 4.226457399103139, + "grad_norm": 1.0230493545532227, + "learning_rate": 0.0004718966571545047, + "loss": 3.5589, + "step": 62205 + }, + { + "epoch": 4.226797119173801, + "grad_norm": 0.8646937012672424, + "learning_rate": 0.000471854192145672, + "loss": 3.2385, + "step": 62210 + }, + { + "epoch": 4.227136839244462, + "grad_norm": 0.9940260052680969, + "learning_rate": 0.0004718117271368392, + "loss": 3.3032, + "step": 62215 + }, + { + "epoch": 4.227476559315124, + "grad_norm": 0.8295077085494995, + "learning_rate": 0.0004717692621280065, + "loss": 3.5549, + "step": 62220 + }, + { + "epoch": 4.227816279385786, + "grad_norm": 0.8922302722930908, + "learning_rate": 0.00047172679711917384, + "loss": 3.3547, + "step": 62225 + }, + { + "epoch": 4.2281559994564475, + "grad_norm": 1.0468846559524536, + "learning_rate": 0.00047168433211034106, + "loss": 3.4373, + "step": 62230 + }, + { + "epoch": 4.22849571952711, + "grad_norm": 2.566987991333008, + "learning_rate": 0.00047164186710150834, + "loss": 3.478, + "step": 62235 + }, + { + "epoch": 4.228835439597772, + "grad_norm": 1.0212949514389038, + "learning_rate": 0.0004715994020926757, + "loss": 3.5512, + "step": 62240 + }, + { + "epoch": 4.229175159668433, + "grad_norm": 1.0142236948013306, + "learning_rate": 0.0004715569370838429, + "loss": 3.5442, + "step": 62245 + }, + { + "epoch": 4.229514879739095, + "grad_norm": 0.8566301465034485, + "learning_rate": 0.0004715144720750102, + "loss": 3.3538, + "step": 62250 + }, + { + "epoch": 4.229854599809757, + "grad_norm": 1.417082667350769, + "learning_rate": 0.00047147200706617747, + "loss": 3.3882, + "step": 62255 + }, + { + "epoch": 4.230194319880418, + "grad_norm": 0.755373477935791, + "learning_rate": 0.0004714295420573448, + "loss": 3.7053, + "step": 62260 + }, + { + "epoch": 4.23053403995108, + "grad_norm": 0.8811341524124146, + "learning_rate": 0.000471387077048512, + "loss": 3.4399, + "step": 62265 + }, + { + "epoch": 4.230873760021742, + "grad_norm": 1.138218879699707, + "learning_rate": 0.0004713446120396793, + "loss": 3.34, + "step": 62270 + }, + { + "epoch": 4.2312134800924035, + "grad_norm": 0.8062149882316589, + "learning_rate": 0.00047130214703084664, + "loss": 3.5854, + "step": 62275 + }, + { + "epoch": 4.231553200163066, + "grad_norm": 0.8482598066329956, + "learning_rate": 0.00047125968202201387, + "loss": 3.707, + "step": 62280 + }, + { + "epoch": 4.231892920233728, + "grad_norm": 0.7732557058334351, + "learning_rate": 0.00047121721701318115, + "loss": 3.6549, + "step": 62285 + }, + { + "epoch": 4.232232640304389, + "grad_norm": 0.9325189590454102, + "learning_rate": 0.0004711747520043484, + "loss": 3.5536, + "step": 62290 + }, + { + "epoch": 4.232572360375051, + "grad_norm": 1.1554745435714722, + "learning_rate": 0.0004711322869955157, + "loss": 3.4811, + "step": 62295 + }, + { + "epoch": 4.232912080445713, + "grad_norm": 1.270506739616394, + "learning_rate": 0.000471089821986683, + "loss": 3.4547, + "step": 62300 + }, + { + "epoch": 4.233251800516374, + "grad_norm": 1.1357699632644653, + "learning_rate": 0.00047104735697785027, + "loss": 3.3955, + "step": 62305 + }, + { + "epoch": 4.233591520587036, + "grad_norm": 1.0169323682785034, + "learning_rate": 0.00047100489196901755, + "loss": 3.2544, + "step": 62310 + }, + { + "epoch": 4.233931240657698, + "grad_norm": 0.8289300799369812, + "learning_rate": 0.0004709624269601848, + "loss": 3.5501, + "step": 62315 + }, + { + "epoch": 4.2342709607283595, + "grad_norm": 0.8175435066223145, + "learning_rate": 0.0004709199619513521, + "loss": 3.1469, + "step": 62320 + }, + { + "epoch": 4.234610680799022, + "grad_norm": 1.0501093864440918, + "learning_rate": 0.00047087749694251933, + "loss": 3.229, + "step": 62325 + }, + { + "epoch": 4.234950400869684, + "grad_norm": 0.9832239151000977, + "learning_rate": 0.00047083503193368667, + "loss": 3.3164, + "step": 62330 + }, + { + "epoch": 4.235290120940345, + "grad_norm": 0.7246628403663635, + "learning_rate": 0.00047079256692485395, + "loss": 3.6534, + "step": 62335 + }, + { + "epoch": 4.235629841011007, + "grad_norm": 0.793237030506134, + "learning_rate": 0.00047075010191602117, + "loss": 3.5163, + "step": 62340 + }, + { + "epoch": 4.235969561081669, + "grad_norm": 1.1008336544036865, + "learning_rate": 0.0004707076369071885, + "loss": 3.3016, + "step": 62345 + }, + { + "epoch": 4.23630928115233, + "grad_norm": 0.9286287426948547, + "learning_rate": 0.0004706651718983558, + "loss": 3.4629, + "step": 62350 + }, + { + "epoch": 4.236649001222992, + "grad_norm": 0.7046343684196472, + "learning_rate": 0.000470622706889523, + "loss": 3.3931, + "step": 62355 + }, + { + "epoch": 4.236988721293654, + "grad_norm": 0.9754976034164429, + "learning_rate": 0.0004705802418806903, + "loss": 3.369, + "step": 62360 + }, + { + "epoch": 4.2373284413643155, + "grad_norm": 0.688900351524353, + "learning_rate": 0.0004705377768718576, + "loss": 3.5022, + "step": 62365 + }, + { + "epoch": 4.237668161434978, + "grad_norm": 0.922686755657196, + "learning_rate": 0.00047049531186302485, + "loss": 3.5064, + "step": 62370 + }, + { + "epoch": 4.23800788150564, + "grad_norm": 0.8912158012390137, + "learning_rate": 0.00047045284685419213, + "loss": 3.3341, + "step": 62375 + }, + { + "epoch": 4.238347601576301, + "grad_norm": 0.8791553974151611, + "learning_rate": 0.00047041038184535947, + "loss": 3.4997, + "step": 62380 + }, + { + "epoch": 4.238687321646963, + "grad_norm": 1.0698002576828003, + "learning_rate": 0.0004703679168365267, + "loss": 3.4742, + "step": 62385 + }, + { + "epoch": 4.239027041717625, + "grad_norm": 0.9292575120925903, + "learning_rate": 0.000470325451827694, + "loss": 3.4681, + "step": 62390 + }, + { + "epoch": 4.239366761788286, + "grad_norm": 1.0633902549743652, + "learning_rate": 0.0004702829868188613, + "loss": 3.5848, + "step": 62395 + }, + { + "epoch": 4.239706481858948, + "grad_norm": 0.8790925145149231, + "learning_rate": 0.00047024052181002853, + "loss": 3.4151, + "step": 62400 + }, + { + "epoch": 4.24004620192961, + "grad_norm": 1.4642481803894043, + "learning_rate": 0.0004701980568011958, + "loss": 3.4582, + "step": 62405 + }, + { + "epoch": 4.2403859220002715, + "grad_norm": 0.9389922618865967, + "learning_rate": 0.0004701555917923631, + "loss": 3.6629, + "step": 62410 + }, + { + "epoch": 4.240725642070934, + "grad_norm": 0.742598295211792, + "learning_rate": 0.0004701131267835304, + "loss": 3.5939, + "step": 62415 + }, + { + "epoch": 4.241065362141596, + "grad_norm": 0.7616636753082275, + "learning_rate": 0.00047007066177469765, + "loss": 3.624, + "step": 62420 + }, + { + "epoch": 4.241405082212257, + "grad_norm": 1.0790058374404907, + "learning_rate": 0.00047002819676586493, + "loss": 3.5721, + "step": 62425 + }, + { + "epoch": 4.241744802282919, + "grad_norm": 0.846799373626709, + "learning_rate": 0.00046998573175703227, + "loss": 3.2689, + "step": 62430 + }, + { + "epoch": 4.242084522353581, + "grad_norm": 0.8939149975776672, + "learning_rate": 0.0004699432667481995, + "loss": 3.3599, + "step": 62435 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.8004636168479919, + "learning_rate": 0.0004699008017393668, + "loss": 3.2343, + "step": 62440 + }, + { + "epoch": 4.242763962494904, + "grad_norm": 0.9230446815490723, + "learning_rate": 0.00046985833673053405, + "loss": 3.6918, + "step": 62445 + }, + { + "epoch": 4.243103682565566, + "grad_norm": 1.0470510721206665, + "learning_rate": 0.00046981587172170133, + "loss": 3.4544, + "step": 62450 + }, + { + "epoch": 4.2434434026362275, + "grad_norm": 0.8842229843139648, + "learning_rate": 0.0004697734067128686, + "loss": 3.5991, + "step": 62455 + }, + { + "epoch": 4.24378312270689, + "grad_norm": 0.6333302855491638, + "learning_rate": 0.0004697309417040359, + "loss": 3.5458, + "step": 62460 + }, + { + "epoch": 4.244122842777552, + "grad_norm": 0.8292087316513062, + "learning_rate": 0.0004696884766952032, + "loss": 3.47, + "step": 62465 + }, + { + "epoch": 4.244462562848213, + "grad_norm": 0.9230862855911255, + "learning_rate": 0.00046964601168637045, + "loss": 3.1759, + "step": 62470 + }, + { + "epoch": 4.244802282918875, + "grad_norm": 0.853147566318512, + "learning_rate": 0.00046960354667753773, + "loss": 3.424, + "step": 62475 + }, + { + "epoch": 4.245142002989537, + "grad_norm": 0.785579264163971, + "learning_rate": 0.00046956108166870496, + "loss": 3.2709, + "step": 62480 + }, + { + "epoch": 4.245481723060198, + "grad_norm": 0.8149586915969849, + "learning_rate": 0.0004695186166598723, + "loss": 3.3071, + "step": 62485 + }, + { + "epoch": 4.24582144313086, + "grad_norm": 1.1115707159042358, + "learning_rate": 0.0004694761516510396, + "loss": 3.3272, + "step": 62490 + }, + { + "epoch": 4.246161163201522, + "grad_norm": 1.0767168998718262, + "learning_rate": 0.0004694336866422068, + "loss": 3.3312, + "step": 62495 + }, + { + "epoch": 4.246500883272184, + "grad_norm": 1.0698087215423584, + "learning_rate": 0.00046939122163337413, + "loss": 3.4883, + "step": 62500 + }, + { + "epoch": 4.246840603342846, + "grad_norm": 0.877433717250824, + "learning_rate": 0.0004693487566245414, + "loss": 3.4519, + "step": 62505 + }, + { + "epoch": 4.247180323413508, + "grad_norm": 1.165985107421875, + "learning_rate": 0.00046930629161570864, + "loss": 3.6447, + "step": 62510 + }, + { + "epoch": 4.247520043484169, + "grad_norm": 1.1950287818908691, + "learning_rate": 0.0004692638266068759, + "loss": 3.5306, + "step": 62515 + }, + { + "epoch": 4.247859763554831, + "grad_norm": 1.241331934928894, + "learning_rate": 0.00046922136159804326, + "loss": 3.2628, + "step": 62520 + }, + { + "epoch": 4.248199483625493, + "grad_norm": 1.0112899541854858, + "learning_rate": 0.0004691788965892105, + "loss": 3.2962, + "step": 62525 + }, + { + "epoch": 4.248539203696154, + "grad_norm": 0.7630959749221802, + "learning_rate": 0.00046913643158037776, + "loss": 3.4789, + "step": 62530 + }, + { + "epoch": 4.248878923766816, + "grad_norm": 1.1937966346740723, + "learning_rate": 0.0004690939665715451, + "loss": 3.3924, + "step": 62535 + }, + { + "epoch": 4.2492186438374775, + "grad_norm": 0.9489196538925171, + "learning_rate": 0.0004690515015627123, + "loss": 3.6002, + "step": 62540 + }, + { + "epoch": 4.24955836390814, + "grad_norm": 0.9031516909599304, + "learning_rate": 0.0004690090365538796, + "loss": 3.2978, + "step": 62545 + }, + { + "epoch": 4.249898083978802, + "grad_norm": 0.9075033664703369, + "learning_rate": 0.0004689665715450469, + "loss": 3.4837, + "step": 62550 + }, + { + "epoch": 4.250237804049463, + "grad_norm": 0.9967255592346191, + "learning_rate": 0.00046892410653621416, + "loss": 3.3594, + "step": 62555 + }, + { + "epoch": 4.250577524120125, + "grad_norm": 0.769785463809967, + "learning_rate": 0.00046888164152738144, + "loss": 3.0486, + "step": 62560 + }, + { + "epoch": 4.250917244190787, + "grad_norm": 1.2372970581054688, + "learning_rate": 0.0004688391765185487, + "loss": 3.5036, + "step": 62565 + }, + { + "epoch": 4.251256964261448, + "grad_norm": 0.8480458855628967, + "learning_rate": 0.000468796711509716, + "loss": 3.5341, + "step": 62570 + }, + { + "epoch": 4.25159668433211, + "grad_norm": 0.9168427586555481, + "learning_rate": 0.0004687542465008833, + "loss": 3.5242, + "step": 62575 + }, + { + "epoch": 4.251936404402772, + "grad_norm": 1.0051335096359253, + "learning_rate": 0.00046871178149205056, + "loss": 3.2791, + "step": 62580 + }, + { + "epoch": 4.2522761244734335, + "grad_norm": 0.8896768689155579, + "learning_rate": 0.0004686693164832178, + "loss": 3.5127, + "step": 62585 + }, + { + "epoch": 4.252615844544096, + "grad_norm": 1.0137256383895874, + "learning_rate": 0.0004686268514743851, + "loss": 3.5231, + "step": 62590 + }, + { + "epoch": 4.252955564614758, + "grad_norm": 0.8274829387664795, + "learning_rate": 0.0004685843864655524, + "loss": 3.4677, + "step": 62595 + }, + { + "epoch": 4.253295284685419, + "grad_norm": 0.9273743033409119, + "learning_rate": 0.0004685419214567197, + "loss": 3.436, + "step": 62600 + }, + { + "epoch": 4.253635004756081, + "grad_norm": 1.0536936521530151, + "learning_rate": 0.00046849945644788696, + "loss": 3.1167, + "step": 62605 + }, + { + "epoch": 4.253974724826743, + "grad_norm": 0.9859506487846375, + "learning_rate": 0.00046845699143905424, + "loss": 3.4918, + "step": 62610 + }, + { + "epoch": 4.254314444897404, + "grad_norm": 0.8257538080215454, + "learning_rate": 0.0004684145264302215, + "loss": 3.4082, + "step": 62615 + }, + { + "epoch": 4.254654164968066, + "grad_norm": 0.9712603092193604, + "learning_rate": 0.00046837206142138875, + "loss": 3.483, + "step": 62620 + }, + { + "epoch": 4.254993885038728, + "grad_norm": 0.9240502119064331, + "learning_rate": 0.0004683295964125561, + "loss": 3.5108, + "step": 62625 + }, + { + "epoch": 4.2553336051093895, + "grad_norm": 0.840088963508606, + "learning_rate": 0.00046828713140372336, + "loss": 3.6777, + "step": 62630 + }, + { + "epoch": 4.255673325180052, + "grad_norm": 1.0737701654434204, + "learning_rate": 0.0004682446663948906, + "loss": 3.4877, + "step": 62635 + }, + { + "epoch": 4.256013045250714, + "grad_norm": 1.070001244544983, + "learning_rate": 0.0004682022013860579, + "loss": 3.1317, + "step": 62640 + }, + { + "epoch": 4.256352765321375, + "grad_norm": 1.0922194719314575, + "learning_rate": 0.0004681597363772252, + "loss": 3.482, + "step": 62645 + }, + { + "epoch": 4.256692485392037, + "grad_norm": 1.7680633068084717, + "learning_rate": 0.00046811727136839243, + "loss": 3.4839, + "step": 62650 + }, + { + "epoch": 4.257032205462699, + "grad_norm": 1.5959795713424683, + "learning_rate": 0.0004680748063595597, + "loss": 3.5897, + "step": 62655 + }, + { + "epoch": 4.25737192553336, + "grad_norm": 0.8733215928077698, + "learning_rate": 0.00046803234135072704, + "loss": 3.3036, + "step": 62660 + }, + { + "epoch": 4.257711645604022, + "grad_norm": 0.9621344208717346, + "learning_rate": 0.00046798987634189427, + "loss": 3.6128, + "step": 62665 + }, + { + "epoch": 4.258051365674684, + "grad_norm": 0.8397162556648254, + "learning_rate": 0.00046794741133306155, + "loss": 3.7388, + "step": 62670 + }, + { + "epoch": 4.2583910857453455, + "grad_norm": 0.8782327175140381, + "learning_rate": 0.0004679049463242289, + "loss": 3.3235, + "step": 62675 + }, + { + "epoch": 4.258730805816008, + "grad_norm": 1.3603144884109497, + "learning_rate": 0.0004678624813153961, + "loss": 3.2375, + "step": 62680 + }, + { + "epoch": 4.25907052588667, + "grad_norm": 0.8709167242050171, + "learning_rate": 0.0004678200163065634, + "loss": 3.3907, + "step": 62685 + }, + { + "epoch": 4.259410245957331, + "grad_norm": 1.1793235540390015, + "learning_rate": 0.0004677775512977307, + "loss": 3.3338, + "step": 62690 + }, + { + "epoch": 4.259749966027993, + "grad_norm": 0.8079987168312073, + "learning_rate": 0.00046773508628889795, + "loss": 3.3294, + "step": 62695 + }, + { + "epoch": 4.260089686098655, + "grad_norm": 0.7011122107505798, + "learning_rate": 0.00046769262128006523, + "loss": 3.5335, + "step": 62700 + }, + { + "epoch": 4.260429406169316, + "grad_norm": 1.2750184535980225, + "learning_rate": 0.0004676501562712325, + "loss": 3.6792, + "step": 62705 + }, + { + "epoch": 4.260769126239978, + "grad_norm": 0.8417379260063171, + "learning_rate": 0.0004676076912623998, + "loss": 3.6869, + "step": 62710 + }, + { + "epoch": 4.26110884631064, + "grad_norm": 0.7783904671669006, + "learning_rate": 0.00046756522625356707, + "loss": 3.6368, + "step": 62715 + }, + { + "epoch": 4.2614485663813015, + "grad_norm": 1.0484988689422607, + "learning_rate": 0.00046752276124473435, + "loss": 3.2844, + "step": 62720 + }, + { + "epoch": 4.261788286451964, + "grad_norm": 0.9628962874412537, + "learning_rate": 0.00046748029623590163, + "loss": 3.5032, + "step": 62725 + }, + { + "epoch": 4.262128006522626, + "grad_norm": 0.9595065712928772, + "learning_rate": 0.0004674378312270689, + "loss": 3.6395, + "step": 62730 + }, + { + "epoch": 4.262467726593287, + "grad_norm": 0.8706510066986084, + "learning_rate": 0.0004673953662182362, + "loss": 3.5739, + "step": 62735 + }, + { + "epoch": 4.262807446663949, + "grad_norm": 0.9150076508522034, + "learning_rate": 0.0004673529012094034, + "loss": 3.4177, + "step": 62740 + }, + { + "epoch": 4.263147166734611, + "grad_norm": 0.9285559058189392, + "learning_rate": 0.00046731043620057075, + "loss": 3.6545, + "step": 62745 + }, + { + "epoch": 4.263486886805272, + "grad_norm": 1.0854462385177612, + "learning_rate": 0.00046726797119173803, + "loss": 3.4139, + "step": 62750 + }, + { + "epoch": 4.263826606875934, + "grad_norm": 1.0121731758117676, + "learning_rate": 0.00046722550618290526, + "loss": 3.3708, + "step": 62755 + }, + { + "epoch": 4.264166326946596, + "grad_norm": 0.9207617044448853, + "learning_rate": 0.0004671830411740726, + "loss": 3.6071, + "step": 62760 + }, + { + "epoch": 4.2645060470172576, + "grad_norm": 0.8880199193954468, + "learning_rate": 0.00046714057616523987, + "loss": 3.4625, + "step": 62765 + }, + { + "epoch": 4.26484576708792, + "grad_norm": 0.7963793873786926, + "learning_rate": 0.00046709811115640715, + "loss": 3.4335, + "step": 62770 + }, + { + "epoch": 4.265185487158582, + "grad_norm": 0.9662215709686279, + "learning_rate": 0.0004670556461475744, + "loss": 3.5232, + "step": 62775 + }, + { + "epoch": 4.265525207229243, + "grad_norm": 1.333925485610962, + "learning_rate": 0.0004670131811387417, + "loss": 3.2605, + "step": 62780 + }, + { + "epoch": 4.265864927299905, + "grad_norm": 1.0203183889389038, + "learning_rate": 0.000466970716129909, + "loss": 3.4193, + "step": 62785 + }, + { + "epoch": 4.266204647370567, + "grad_norm": 1.2352871894836426, + "learning_rate": 0.0004669282511210762, + "loss": 3.5024, + "step": 62790 + }, + { + "epoch": 4.266544367441228, + "grad_norm": 0.9238154888153076, + "learning_rate": 0.00046688578611224355, + "loss": 3.5146, + "step": 62795 + }, + { + "epoch": 4.26688408751189, + "grad_norm": 1.2414124011993408, + "learning_rate": 0.00046684332110341083, + "loss": 3.4931, + "step": 62800 + }, + { + "epoch": 4.267223807582552, + "grad_norm": 1.0679413080215454, + "learning_rate": 0.00046680085609457806, + "loss": 3.4434, + "step": 62805 + }, + { + "epoch": 4.267563527653214, + "grad_norm": 1.0167793035507202, + "learning_rate": 0.00046675839108574534, + "loss": 3.2709, + "step": 62810 + }, + { + "epoch": 4.267903247723876, + "grad_norm": 0.9738951921463013, + "learning_rate": 0.00046671592607691267, + "loss": 3.7578, + "step": 62815 + }, + { + "epoch": 4.268242967794538, + "grad_norm": 1.5366685390472412, + "learning_rate": 0.0004666734610680799, + "loss": 3.0914, + "step": 62820 + }, + { + "epoch": 4.268582687865199, + "grad_norm": 0.92111736536026, + "learning_rate": 0.0004666309960592472, + "loss": 3.6064, + "step": 62825 + }, + { + "epoch": 4.268922407935861, + "grad_norm": 0.911973237991333, + "learning_rate": 0.0004665885310504145, + "loss": 3.766, + "step": 62830 + }, + { + "epoch": 4.269262128006522, + "grad_norm": 0.8342196345329285, + "learning_rate": 0.00046654606604158174, + "loss": 3.5418, + "step": 62835 + }, + { + "epoch": 4.269601848077184, + "grad_norm": 0.7963050007820129, + "learning_rate": 0.000466503601032749, + "loss": 3.5248, + "step": 62840 + }, + { + "epoch": 4.269941568147846, + "grad_norm": 0.9696300625801086, + "learning_rate": 0.0004664611360239163, + "loss": 3.6776, + "step": 62845 + }, + { + "epoch": 4.2702812882185075, + "grad_norm": 0.8297840356826782, + "learning_rate": 0.0004664186710150836, + "loss": 3.461, + "step": 62850 + }, + { + "epoch": 4.27062100828917, + "grad_norm": 0.8407372832298279, + "learning_rate": 0.00046637620600625086, + "loss": 3.6198, + "step": 62855 + }, + { + "epoch": 4.270960728359832, + "grad_norm": 0.8115400075912476, + "learning_rate": 0.00046633374099741814, + "loss": 3.3892, + "step": 62860 + }, + { + "epoch": 4.271300448430493, + "grad_norm": 0.8315827250480652, + "learning_rate": 0.0004662912759885854, + "loss": 3.707, + "step": 62865 + }, + { + "epoch": 4.271640168501155, + "grad_norm": 0.8041180968284607, + "learning_rate": 0.0004662488109797527, + "loss": 3.4355, + "step": 62870 + }, + { + "epoch": 4.271979888571817, + "grad_norm": 0.7348676323890686, + "learning_rate": 0.00046620634597092, + "loss": 3.52, + "step": 62875 + }, + { + "epoch": 4.272319608642478, + "grad_norm": 1.100131630897522, + "learning_rate": 0.0004661638809620872, + "loss": 3.173, + "step": 62880 + }, + { + "epoch": 4.27265932871314, + "grad_norm": 1.074238657951355, + "learning_rate": 0.00046612141595325454, + "loss": 3.4948, + "step": 62885 + }, + { + "epoch": 4.272999048783802, + "grad_norm": 0.9294273853302002, + "learning_rate": 0.0004660789509444218, + "loss": 3.6583, + "step": 62890 + }, + { + "epoch": 4.2733387688544635, + "grad_norm": 1.062404751777649, + "learning_rate": 0.00046603648593558904, + "loss": 3.4517, + "step": 62895 + }, + { + "epoch": 4.273678488925126, + "grad_norm": 0.8152176737785339, + "learning_rate": 0.0004659940209267564, + "loss": 3.2214, + "step": 62900 + }, + { + "epoch": 4.274018208995788, + "grad_norm": 0.9120787382125854, + "learning_rate": 0.00046595155591792366, + "loss": 3.4727, + "step": 62905 + }, + { + "epoch": 4.274357929066449, + "grad_norm": 0.9106739163398743, + "learning_rate": 0.0004659090909090909, + "loss": 3.376, + "step": 62910 + }, + { + "epoch": 4.274697649137111, + "grad_norm": 0.935278058052063, + "learning_rate": 0.00046586662590025816, + "loss": 3.3404, + "step": 62915 + }, + { + "epoch": 4.275037369207773, + "grad_norm": 0.7770323753356934, + "learning_rate": 0.0004658241608914255, + "loss": 3.653, + "step": 62920 + }, + { + "epoch": 4.275377089278434, + "grad_norm": 0.8024708032608032, + "learning_rate": 0.0004657816958825927, + "loss": 3.4946, + "step": 62925 + }, + { + "epoch": 4.275716809349096, + "grad_norm": 0.9722824692726135, + "learning_rate": 0.00046573923087376, + "loss": 3.4985, + "step": 62930 + }, + { + "epoch": 4.276056529419758, + "grad_norm": 0.8358780741691589, + "learning_rate": 0.00046569676586492734, + "loss": 3.3085, + "step": 62935 + }, + { + "epoch": 4.2763962494904195, + "grad_norm": 0.8335534930229187, + "learning_rate": 0.0004656543008560946, + "loss": 3.4855, + "step": 62940 + }, + { + "epoch": 4.276735969561082, + "grad_norm": 1.0614230632781982, + "learning_rate": 0.00046561183584726184, + "loss": 3.3644, + "step": 62945 + }, + { + "epoch": 4.277075689631744, + "grad_norm": 0.865924596786499, + "learning_rate": 0.0004655693708384291, + "loss": 3.5181, + "step": 62950 + }, + { + "epoch": 4.277415409702405, + "grad_norm": 0.8984517455101013, + "learning_rate": 0.00046552690582959646, + "loss": 3.5683, + "step": 62955 + }, + { + "epoch": 4.277755129773067, + "grad_norm": 0.9432259202003479, + "learning_rate": 0.0004654844408207637, + "loss": 3.3963, + "step": 62960 + }, + { + "epoch": 4.278094849843729, + "grad_norm": 0.8854591846466064, + "learning_rate": 0.00046544197581193097, + "loss": 3.6019, + "step": 62965 + }, + { + "epoch": 4.27843456991439, + "grad_norm": 0.9467496871948242, + "learning_rate": 0.0004653995108030983, + "loss": 3.4159, + "step": 62970 + }, + { + "epoch": 4.278774289985052, + "grad_norm": 0.8231064677238464, + "learning_rate": 0.0004653570457942655, + "loss": 3.4988, + "step": 62975 + }, + { + "epoch": 4.279114010055714, + "grad_norm": 1.1189846992492676, + "learning_rate": 0.0004653145807854328, + "loss": 3.627, + "step": 62980 + }, + { + "epoch": 4.2794537301263755, + "grad_norm": 0.6712808609008789, + "learning_rate": 0.00046527211577660014, + "loss": 3.3774, + "step": 62985 + }, + { + "epoch": 4.279793450197038, + "grad_norm": 0.6759068369865417, + "learning_rate": 0.00046522965076776737, + "loss": 3.3713, + "step": 62990 + }, + { + "epoch": 4.2801331702677, + "grad_norm": 0.9409923553466797, + "learning_rate": 0.00046518718575893465, + "loss": 3.2324, + "step": 62995 + }, + { + "epoch": 4.280472890338361, + "grad_norm": 1.0454951524734497, + "learning_rate": 0.0004651447207501019, + "loss": 3.4577, + "step": 63000 + }, + { + "epoch": 4.280812610409023, + "grad_norm": 0.7773561477661133, + "learning_rate": 0.0004651022557412692, + "loss": 3.6641, + "step": 63005 + }, + { + "epoch": 4.281152330479685, + "grad_norm": 1.0369950532913208, + "learning_rate": 0.0004650597907324365, + "loss": 3.6334, + "step": 63010 + }, + { + "epoch": 4.281492050550346, + "grad_norm": 0.8465280532836914, + "learning_rate": 0.00046501732572360377, + "loss": 3.5432, + "step": 63015 + }, + { + "epoch": 4.281831770621008, + "grad_norm": 1.0343936681747437, + "learning_rate": 0.00046497486071477105, + "loss": 3.7027, + "step": 63020 + }, + { + "epoch": 4.28217149069167, + "grad_norm": 0.8327369093894958, + "learning_rate": 0.0004649323957059383, + "loss": 3.549, + "step": 63025 + }, + { + "epoch": 4.2825112107623315, + "grad_norm": 0.9537756443023682, + "learning_rate": 0.0004648899306971056, + "loss": 3.4914, + "step": 63030 + }, + { + "epoch": 4.282850930832994, + "grad_norm": 0.8075971007347107, + "learning_rate": 0.00046484746568827283, + "loss": 3.6651, + "step": 63035 + }, + { + "epoch": 4.283190650903656, + "grad_norm": 0.9077102541923523, + "learning_rate": 0.00046480500067944017, + "loss": 3.5384, + "step": 63040 + }, + { + "epoch": 4.283530370974317, + "grad_norm": 0.8558540344238281, + "learning_rate": 0.00046476253567060745, + "loss": 3.5418, + "step": 63045 + }, + { + "epoch": 4.283870091044979, + "grad_norm": 1.2871683835983276, + "learning_rate": 0.00046472007066177467, + "loss": 3.4596, + "step": 63050 + }, + { + "epoch": 4.284209811115641, + "grad_norm": 0.7667078971862793, + "learning_rate": 0.000464677605652942, + "loss": 3.6418, + "step": 63055 + }, + { + "epoch": 4.284549531186302, + "grad_norm": 0.9985043406486511, + "learning_rate": 0.0004646351406441093, + "loss": 3.6052, + "step": 63060 + }, + { + "epoch": 4.284889251256964, + "grad_norm": 0.9645100235939026, + "learning_rate": 0.0004645926756352765, + "loss": 3.6414, + "step": 63065 + }, + { + "epoch": 4.285228971327626, + "grad_norm": 0.8240687847137451, + "learning_rate": 0.0004645502106264438, + "loss": 3.4987, + "step": 63070 + }, + { + "epoch": 4.2855686913982876, + "grad_norm": 1.1972805261611938, + "learning_rate": 0.0004645077456176111, + "loss": 3.7235, + "step": 63075 + }, + { + "epoch": 4.28590841146895, + "grad_norm": 1.047987699508667, + "learning_rate": 0.00046446528060877835, + "loss": 3.7069, + "step": 63080 + }, + { + "epoch": 4.286248131539612, + "grad_norm": 0.9179308414459229, + "learning_rate": 0.00046442281559994563, + "loss": 3.4034, + "step": 63085 + }, + { + "epoch": 4.286587851610273, + "grad_norm": 1.1575714349746704, + "learning_rate": 0.00046438035059111297, + "loss": 3.3855, + "step": 63090 + }, + { + "epoch": 4.286927571680935, + "grad_norm": 1.0204805135726929, + "learning_rate": 0.0004643378855822802, + "loss": 2.9736, + "step": 63095 + }, + { + "epoch": 4.287267291751597, + "grad_norm": 0.7787651419639587, + "learning_rate": 0.0004642954205734475, + "loss": 3.5204, + "step": 63100 + }, + { + "epoch": 4.287607011822258, + "grad_norm": 0.7901066541671753, + "learning_rate": 0.00046425295556461475, + "loss": 3.3403, + "step": 63105 + }, + { + "epoch": 4.28794673189292, + "grad_norm": 1.1962003707885742, + "learning_rate": 0.0004642104905557821, + "loss": 3.4717, + "step": 63110 + }, + { + "epoch": 4.288286451963582, + "grad_norm": 0.8809593915939331, + "learning_rate": 0.0004641680255469493, + "loss": 3.4853, + "step": 63115 + }, + { + "epoch": 4.288626172034244, + "grad_norm": 1.0118751525878906, + "learning_rate": 0.0004641255605381166, + "loss": 3.5454, + "step": 63120 + }, + { + "epoch": 4.288965892104906, + "grad_norm": 1.085115671157837, + "learning_rate": 0.00046408309552928393, + "loss": 3.8491, + "step": 63125 + }, + { + "epoch": 4.289305612175568, + "grad_norm": 1.0400446653366089, + "learning_rate": 0.00046404063052045115, + "loss": 3.4531, + "step": 63130 + }, + { + "epoch": 4.289645332246229, + "grad_norm": 0.7920119166374207, + "learning_rate": 0.00046399816551161843, + "loss": 3.5028, + "step": 63135 + }, + { + "epoch": 4.289985052316891, + "grad_norm": 1.149543285369873, + "learning_rate": 0.0004639557005027857, + "loss": 3.3653, + "step": 63140 + }, + { + "epoch": 4.290324772387553, + "grad_norm": 0.7529909014701843, + "learning_rate": 0.000463913235493953, + "loss": 3.5601, + "step": 63145 + }, + { + "epoch": 4.290664492458214, + "grad_norm": 0.9413520693778992, + "learning_rate": 0.0004638707704851203, + "loss": 3.687, + "step": 63150 + }, + { + "epoch": 4.291004212528876, + "grad_norm": 0.8962732553482056, + "learning_rate": 0.00046382830547628755, + "loss": 3.6253, + "step": 63155 + }, + { + "epoch": 4.291343932599538, + "grad_norm": 0.8489076495170593, + "learning_rate": 0.00046378584046745483, + "loss": 3.5985, + "step": 63160 + }, + { + "epoch": 4.2916836526702, + "grad_norm": 1.0118961334228516, + "learning_rate": 0.0004637433754586221, + "loss": 3.2114, + "step": 63165 + }, + { + "epoch": 4.292023372740862, + "grad_norm": 1.357130527496338, + "learning_rate": 0.0004637009104497894, + "loss": 3.4965, + "step": 63170 + }, + { + "epoch": 4.292363092811524, + "grad_norm": 0.8592820763587952, + "learning_rate": 0.0004636584454409566, + "loss": 3.4632, + "step": 63175 + }, + { + "epoch": 4.292702812882185, + "grad_norm": 0.9469602704048157, + "learning_rate": 0.00046361598043212395, + "loss": 3.4382, + "step": 63180 + }, + { + "epoch": 4.293042532952847, + "grad_norm": 1.2061738967895508, + "learning_rate": 0.00046357351542329123, + "loss": 3.5108, + "step": 63185 + }, + { + "epoch": 4.293382253023509, + "grad_norm": 0.9763404130935669, + "learning_rate": 0.00046353105041445846, + "loss": 3.4979, + "step": 63190 + }, + { + "epoch": 4.29372197309417, + "grad_norm": 0.6768050789833069, + "learning_rate": 0.0004634885854056258, + "loss": 3.7342, + "step": 63195 + }, + { + "epoch": 4.294061693164832, + "grad_norm": 1.0358705520629883, + "learning_rate": 0.0004634461203967931, + "loss": 3.2631, + "step": 63200 + }, + { + "epoch": 4.294401413235494, + "grad_norm": 1.0106147527694702, + "learning_rate": 0.0004634036553879603, + "loss": 3.5145, + "step": 63205 + }, + { + "epoch": 4.294741133306156, + "grad_norm": 0.9053330421447754, + "learning_rate": 0.0004633611903791276, + "loss": 3.5175, + "step": 63210 + }, + { + "epoch": 4.295080853376818, + "grad_norm": 6.461337089538574, + "learning_rate": 0.0004633187253702949, + "loss": 3.5273, + "step": 63215 + }, + { + "epoch": 4.29542057344748, + "grad_norm": 0.9505025744438171, + "learning_rate": 0.00046327626036146214, + "loss": 3.6163, + "step": 63220 + }, + { + "epoch": 4.295760293518141, + "grad_norm": 0.9532656073570251, + "learning_rate": 0.0004632337953526294, + "loss": 3.5119, + "step": 63225 + }, + { + "epoch": 4.296100013588803, + "grad_norm": 1.1041043996810913, + "learning_rate": 0.00046319133034379676, + "loss": 3.4952, + "step": 63230 + }, + { + "epoch": 4.296439733659464, + "grad_norm": 1.0667705535888672, + "learning_rate": 0.000463148865334964, + "loss": 3.3183, + "step": 63235 + }, + { + "epoch": 4.296779453730126, + "grad_norm": 1.0051324367523193, + "learning_rate": 0.00046310640032613126, + "loss": 3.3466, + "step": 63240 + }, + { + "epoch": 4.297119173800788, + "grad_norm": 0.7838789224624634, + "learning_rate": 0.00046306393531729854, + "loss": 3.5499, + "step": 63245 + }, + { + "epoch": 4.2974588938714495, + "grad_norm": 0.7536952495574951, + "learning_rate": 0.0004630214703084658, + "loss": 3.5872, + "step": 63250 + }, + { + "epoch": 4.297798613942112, + "grad_norm": 0.8589155673980713, + "learning_rate": 0.0004629790052996331, + "loss": 3.5282, + "step": 63255 + }, + { + "epoch": 4.298138334012774, + "grad_norm": 0.9237825274467468, + "learning_rate": 0.0004629365402908004, + "loss": 3.4764, + "step": 63260 + }, + { + "epoch": 4.298478054083435, + "grad_norm": 1.020599126815796, + "learning_rate": 0.00046289407528196766, + "loss": 3.7309, + "step": 63265 + }, + { + "epoch": 4.298817774154097, + "grad_norm": 0.9265382885932922, + "learning_rate": 0.00046285161027313494, + "loss": 3.3448, + "step": 63270 + }, + { + "epoch": 4.299157494224759, + "grad_norm": 0.8682718276977539, + "learning_rate": 0.0004628091452643022, + "loss": 3.6741, + "step": 63275 + }, + { + "epoch": 4.29949721429542, + "grad_norm": 0.8410624861717224, + "learning_rate": 0.00046276668025546956, + "loss": 3.5663, + "step": 63280 + }, + { + "epoch": 4.299836934366082, + "grad_norm": 1.0244592428207397, + "learning_rate": 0.0004627242152466368, + "loss": 3.5059, + "step": 63285 + }, + { + "epoch": 4.300176654436744, + "grad_norm": 0.7348805665969849, + "learning_rate": 0.00046268175023780406, + "loss": 3.583, + "step": 63290 + }, + { + "epoch": 4.3005163745074055, + "grad_norm": 0.9262404441833496, + "learning_rate": 0.00046263928522897134, + "loss": 3.6529, + "step": 63295 + }, + { + "epoch": 4.300856094578068, + "grad_norm": 1.009838342666626, + "learning_rate": 0.0004625968202201386, + "loss": 3.3832, + "step": 63300 + }, + { + "epoch": 4.30119581464873, + "grad_norm": 0.8991787433624268, + "learning_rate": 0.0004625543552113059, + "loss": 3.6146, + "step": 63305 + }, + { + "epoch": 4.301535534719391, + "grad_norm": 1.0110728740692139, + "learning_rate": 0.0004625118902024732, + "loss": 3.2501, + "step": 63310 + }, + { + "epoch": 4.301875254790053, + "grad_norm": 0.9592567086219788, + "learning_rate": 0.00046246942519364046, + "loss": 3.493, + "step": 63315 + }, + { + "epoch": 4.302214974860715, + "grad_norm": 0.9057253003120422, + "learning_rate": 0.00046242696018480774, + "loss": 3.7104, + "step": 63320 + }, + { + "epoch": 4.302554694931376, + "grad_norm": 0.7570481300354004, + "learning_rate": 0.000462384495175975, + "loss": 3.4067, + "step": 63325 + }, + { + "epoch": 4.302894415002038, + "grad_norm": 0.8806493878364563, + "learning_rate": 0.00046234203016714225, + "loss": 3.5308, + "step": 63330 + }, + { + "epoch": 4.3032341350727, + "grad_norm": 0.9763409495353699, + "learning_rate": 0.0004622995651583096, + "loss": 3.3351, + "step": 63335 + }, + { + "epoch": 4.3035738551433615, + "grad_norm": 1.0526341199874878, + "learning_rate": 0.00046225710014947686, + "loss": 3.3213, + "step": 63340 + }, + { + "epoch": 4.303913575214024, + "grad_norm": 0.7454416751861572, + "learning_rate": 0.0004622146351406441, + "loss": 3.1546, + "step": 63345 + }, + { + "epoch": 4.304253295284686, + "grad_norm": 0.8022636771202087, + "learning_rate": 0.0004621721701318114, + "loss": 3.4647, + "step": 63350 + }, + { + "epoch": 4.304593015355347, + "grad_norm": 0.8116572499275208, + "learning_rate": 0.0004621297051229787, + "loss": 3.306, + "step": 63355 + }, + { + "epoch": 4.304932735426009, + "grad_norm": 0.6523182392120361, + "learning_rate": 0.00046208724011414593, + "loss": 3.843, + "step": 63360 + }, + { + "epoch": 4.305272455496671, + "grad_norm": 0.9372496008872986, + "learning_rate": 0.0004620447751053132, + "loss": 3.6551, + "step": 63365 + }, + { + "epoch": 4.305612175567332, + "grad_norm": 0.9905537366867065, + "learning_rate": 0.00046200231009648054, + "loss": 3.504, + "step": 63370 + }, + { + "epoch": 4.305951895637994, + "grad_norm": 0.8568122386932373, + "learning_rate": 0.00046195984508764777, + "loss": 3.5246, + "step": 63375 + }, + { + "epoch": 4.306291615708656, + "grad_norm": 0.9677243232727051, + "learning_rate": 0.00046191738007881505, + "loss": 3.2051, + "step": 63380 + }, + { + "epoch": 4.306631335779318, + "grad_norm": 0.8657357096672058, + "learning_rate": 0.0004618749150699824, + "loss": 3.3531, + "step": 63385 + }, + { + "epoch": 4.30697105584998, + "grad_norm": 0.91708904504776, + "learning_rate": 0.0004618324500611496, + "loss": 3.4608, + "step": 63390 + }, + { + "epoch": 4.307310775920642, + "grad_norm": 0.7901963591575623, + "learning_rate": 0.0004617899850523169, + "loss": 3.4577, + "step": 63395 + }, + { + "epoch": 4.307650495991303, + "grad_norm": 0.8076801896095276, + "learning_rate": 0.00046174752004348417, + "loss": 3.6364, + "step": 63400 + }, + { + "epoch": 4.307990216061965, + "grad_norm": 0.7215825319290161, + "learning_rate": 0.00046170505503465145, + "loss": 3.2673, + "step": 63405 + }, + { + "epoch": 4.308329936132627, + "grad_norm": 0.7734267711639404, + "learning_rate": 0.00046166259002581873, + "loss": 3.491, + "step": 63410 + }, + { + "epoch": 4.308669656203288, + "grad_norm": 0.7741556167602539, + "learning_rate": 0.000461620125016986, + "loss": 3.624, + "step": 63415 + }, + { + "epoch": 4.30900937627395, + "grad_norm": 0.9202483296394348, + "learning_rate": 0.0004615776600081533, + "loss": 3.5691, + "step": 63420 + }, + { + "epoch": 4.309349096344612, + "grad_norm": 0.7439380288124084, + "learning_rate": 0.00046153519499932057, + "loss": 3.5634, + "step": 63425 + }, + { + "epoch": 4.309688816415274, + "grad_norm": 0.8515594005584717, + "learning_rate": 0.00046149272999048785, + "loss": 3.6031, + "step": 63430 + }, + { + "epoch": 4.310028536485936, + "grad_norm": 0.7504320740699768, + "learning_rate": 0.0004614502649816551, + "loss": 3.5645, + "step": 63435 + }, + { + "epoch": 4.310368256556598, + "grad_norm": 0.936023473739624, + "learning_rate": 0.0004614077999728224, + "loss": 3.4255, + "step": 63440 + }, + { + "epoch": 4.310707976627259, + "grad_norm": 0.9955886006355286, + "learning_rate": 0.0004613653349639897, + "loss": 3.3105, + "step": 63445 + }, + { + "epoch": 4.311047696697921, + "grad_norm": 0.898379921913147, + "learning_rate": 0.00046132286995515697, + "loss": 3.5858, + "step": 63450 + }, + { + "epoch": 4.311387416768583, + "grad_norm": 0.8548946976661682, + "learning_rate": 0.00046128040494632425, + "loss": 3.7829, + "step": 63455 + }, + { + "epoch": 4.311727136839244, + "grad_norm": 1.1356630325317383, + "learning_rate": 0.00046123793993749153, + "loss": 3.622, + "step": 63460 + }, + { + "epoch": 4.312066856909906, + "grad_norm": 0.9631102085113525, + "learning_rate": 0.0004611954749286588, + "loss": 3.3979, + "step": 63465 + }, + { + "epoch": 4.312406576980568, + "grad_norm": 1.0479397773742676, + "learning_rate": 0.00046115300991982604, + "loss": 3.5344, + "step": 63470 + }, + { + "epoch": 4.31274629705123, + "grad_norm": 1.063807725906372, + "learning_rate": 0.00046111054491099337, + "loss": 3.6783, + "step": 63475 + }, + { + "epoch": 4.313086017121892, + "grad_norm": 0.9240648746490479, + "learning_rate": 0.00046106807990216065, + "loss": 3.5288, + "step": 63480 + }, + { + "epoch": 4.313425737192554, + "grad_norm": 0.8409518003463745, + "learning_rate": 0.0004610256148933279, + "loss": 3.4427, + "step": 63485 + }, + { + "epoch": 4.313765457263215, + "grad_norm": 0.9321063160896301, + "learning_rate": 0.0004609831498844952, + "loss": 3.5712, + "step": 63490 + }, + { + "epoch": 4.314105177333877, + "grad_norm": 0.8674365282058716, + "learning_rate": 0.0004609406848756625, + "loss": 3.4421, + "step": 63495 + }, + { + "epoch": 4.314444897404539, + "grad_norm": 0.7544772028923035, + "learning_rate": 0.0004608982198668297, + "loss": 3.6404, + "step": 63500 + }, + { + "epoch": 4.3147846174752, + "grad_norm": 0.9659435749053955, + "learning_rate": 0.000460855754857997, + "loss": 3.5979, + "step": 63505 + }, + { + "epoch": 4.315124337545862, + "grad_norm": 0.9477208852767944, + "learning_rate": 0.00046081328984916433, + "loss": 3.6152, + "step": 63510 + }, + { + "epoch": 4.3154640576165235, + "grad_norm": 1.2662537097930908, + "learning_rate": 0.00046077082484033156, + "loss": 3.7525, + "step": 63515 + }, + { + "epoch": 4.315803777687186, + "grad_norm": 0.8814932107925415, + "learning_rate": 0.00046072835983149884, + "loss": 3.3497, + "step": 63520 + }, + { + "epoch": 4.316143497757848, + "grad_norm": 0.7424570322036743, + "learning_rate": 0.00046068589482266617, + "loss": 3.51, + "step": 63525 + }, + { + "epoch": 4.316483217828509, + "grad_norm": 0.9773172736167908, + "learning_rate": 0.0004606434298138334, + "loss": 3.2225, + "step": 63530 + }, + { + "epoch": 4.316822937899171, + "grad_norm": 0.9896515607833862, + "learning_rate": 0.0004606009648050007, + "loss": 3.5559, + "step": 63535 + }, + { + "epoch": 4.317162657969833, + "grad_norm": 0.7280979156494141, + "learning_rate": 0.000460558499796168, + "loss": 3.484, + "step": 63540 + }, + { + "epoch": 4.317502378040494, + "grad_norm": 0.9159126877784729, + "learning_rate": 0.00046051603478733524, + "loss": 3.5251, + "step": 63545 + }, + { + "epoch": 4.317842098111156, + "grad_norm": 0.7604270577430725, + "learning_rate": 0.0004604735697785025, + "loss": 3.4266, + "step": 63550 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.8605981469154358, + "learning_rate": 0.0004604311047696698, + "loss": 3.706, + "step": 63555 + }, + { + "epoch": 4.3185215382524795, + "grad_norm": 0.881369948387146, + "learning_rate": 0.0004603886397608371, + "loss": 3.5693, + "step": 63560 + }, + { + "epoch": 4.318861258323142, + "grad_norm": 0.8846889734268188, + "learning_rate": 0.00046034617475200436, + "loss": 3.5214, + "step": 63565 + }, + { + "epoch": 4.319200978393804, + "grad_norm": 0.9039222598075867, + "learning_rate": 0.00046030370974317164, + "loss": 3.4921, + "step": 63570 + }, + { + "epoch": 4.319540698464465, + "grad_norm": 0.8873583674430847, + "learning_rate": 0.0004602612447343389, + "loss": 3.3268, + "step": 63575 + }, + { + "epoch": 4.319880418535127, + "grad_norm": 0.6588369011878967, + "learning_rate": 0.0004602187797255062, + "loss": 3.6508, + "step": 63580 + }, + { + "epoch": 4.320220138605789, + "grad_norm": 0.7112980484962463, + "learning_rate": 0.0004601763147166735, + "loss": 3.6463, + "step": 63585 + }, + { + "epoch": 4.32055985867645, + "grad_norm": 1.0501071214675903, + "learning_rate": 0.0004601338497078407, + "loss": 3.2998, + "step": 63590 + }, + { + "epoch": 4.320899578747112, + "grad_norm": 0.8802195191383362, + "learning_rate": 0.00046009138469900804, + "loss": 3.2761, + "step": 63595 + }, + { + "epoch": 4.321239298817774, + "grad_norm": 0.8654646873474121, + "learning_rate": 0.0004600489196901753, + "loss": 3.4798, + "step": 63600 + }, + { + "epoch": 4.3215790188884355, + "grad_norm": 0.8506633639335632, + "learning_rate": 0.00046000645468134254, + "loss": 3.5117, + "step": 63605 + }, + { + "epoch": 4.321918738959098, + "grad_norm": 1.1394805908203125, + "learning_rate": 0.0004599639896725099, + "loss": 3.3488, + "step": 63610 + }, + { + "epoch": 4.32225845902976, + "grad_norm": 0.924342155456543, + "learning_rate": 0.00045992152466367716, + "loss": 3.4738, + "step": 63615 + }, + { + "epoch": 4.322598179100421, + "grad_norm": 0.8239415287971497, + "learning_rate": 0.00045987905965484444, + "loss": 3.4748, + "step": 63620 + }, + { + "epoch": 4.322937899171083, + "grad_norm": 0.887124240398407, + "learning_rate": 0.00045983659464601166, + "loss": 3.4353, + "step": 63625 + }, + { + "epoch": 4.323277619241745, + "grad_norm": 0.9468467235565186, + "learning_rate": 0.000459794129637179, + "loss": 3.6162, + "step": 63630 + }, + { + "epoch": 4.323617339312406, + "grad_norm": 0.8707683086395264, + "learning_rate": 0.0004597516646283463, + "loss": 3.2422, + "step": 63635 + }, + { + "epoch": 4.323957059383068, + "grad_norm": 0.8277537226676941, + "learning_rate": 0.0004597091996195135, + "loss": 3.5858, + "step": 63640 + }, + { + "epoch": 4.32429677945373, + "grad_norm": 0.9231480360031128, + "learning_rate": 0.00045966673461068084, + "loss": 3.5707, + "step": 63645 + }, + { + "epoch": 4.3246364995243916, + "grad_norm": 0.7574716806411743, + "learning_rate": 0.0004596242696018481, + "loss": 3.4973, + "step": 63650 + }, + { + "epoch": 4.324976219595054, + "grad_norm": 0.8050694465637207, + "learning_rate": 0.00045958180459301534, + "loss": 3.3976, + "step": 63655 + }, + { + "epoch": 4.325315939665716, + "grad_norm": 1.0283045768737793, + "learning_rate": 0.0004595393395841826, + "loss": 3.2844, + "step": 63660 + }, + { + "epoch": 4.325655659736377, + "grad_norm": 1.075227975845337, + "learning_rate": 0.00045949687457534996, + "loss": 3.7627, + "step": 63665 + }, + { + "epoch": 4.325995379807039, + "grad_norm": 0.6887281537055969, + "learning_rate": 0.0004594544095665172, + "loss": 3.4861, + "step": 63670 + }, + { + "epoch": 4.326335099877701, + "grad_norm": 0.7869235873222351, + "learning_rate": 0.00045941194455768447, + "loss": 3.5676, + "step": 63675 + }, + { + "epoch": 4.326674819948362, + "grad_norm": 0.9517861008644104, + "learning_rate": 0.0004593694795488518, + "loss": 3.8298, + "step": 63680 + }, + { + "epoch": 4.327014540019024, + "grad_norm": 0.7521634101867676, + "learning_rate": 0.000459327014540019, + "loss": 3.5319, + "step": 63685 + }, + { + "epoch": 4.327354260089686, + "grad_norm": 0.7177425622940063, + "learning_rate": 0.0004592845495311863, + "loss": 3.603, + "step": 63690 + }, + { + "epoch": 4.327693980160348, + "grad_norm": 0.8394197225570679, + "learning_rate": 0.0004592420845223536, + "loss": 3.5553, + "step": 63695 + }, + { + "epoch": 4.32803370023101, + "grad_norm": 0.8540846109390259, + "learning_rate": 0.00045919961951352087, + "loss": 3.2586, + "step": 63700 + }, + { + "epoch": 4.328373420301672, + "grad_norm": 0.8408299088478088, + "learning_rate": 0.00045915715450468815, + "loss": 3.4172, + "step": 63705 + }, + { + "epoch": 4.328713140372333, + "grad_norm": 0.9187860488891602, + "learning_rate": 0.0004591146894958554, + "loss": 3.4737, + "step": 63710 + }, + { + "epoch": 4.329052860442995, + "grad_norm": 1.6724387407302856, + "learning_rate": 0.0004590722244870227, + "loss": 3.1936, + "step": 63715 + }, + { + "epoch": 4.329392580513657, + "grad_norm": 0.787885308265686, + "learning_rate": 0.00045902975947819, + "loss": 3.4411, + "step": 63720 + }, + { + "epoch": 4.329732300584318, + "grad_norm": 0.8340242505073547, + "learning_rate": 0.00045898729446935727, + "loss": 3.2569, + "step": 63725 + }, + { + "epoch": 4.33007202065498, + "grad_norm": 0.9426168203353882, + "learning_rate": 0.0004589448294605245, + "loss": 3.3149, + "step": 63730 + }, + { + "epoch": 4.330411740725642, + "grad_norm": 3.6117043495178223, + "learning_rate": 0.0004589023644516918, + "loss": 3.0973, + "step": 63735 + }, + { + "epoch": 4.330751460796304, + "grad_norm": 0.8387958407402039, + "learning_rate": 0.0004588598994428591, + "loss": 3.2812, + "step": 63740 + }, + { + "epoch": 4.331091180866966, + "grad_norm": 2.6120729446411133, + "learning_rate": 0.00045881743443402633, + "loss": 3.1041, + "step": 63745 + }, + { + "epoch": 4.331430900937628, + "grad_norm": 0.8347731828689575, + "learning_rate": 0.00045877496942519367, + "loss": 3.6873, + "step": 63750 + }, + { + "epoch": 4.331770621008289, + "grad_norm": 0.9290691614151001, + "learning_rate": 0.00045873250441636095, + "loss": 3.3279, + "step": 63755 + }, + { + "epoch": 4.332110341078951, + "grad_norm": 0.8443601131439209, + "learning_rate": 0.00045869003940752817, + "loss": 3.3592, + "step": 63760 + }, + { + "epoch": 4.332450061149613, + "grad_norm": 0.9414072036743164, + "learning_rate": 0.00045864757439869545, + "loss": 3.2117, + "step": 63765 + }, + { + "epoch": 4.332789781220274, + "grad_norm": 0.7067700624465942, + "learning_rate": 0.0004586051093898628, + "loss": 3.4643, + "step": 63770 + }, + { + "epoch": 4.333129501290936, + "grad_norm": 1.006150722503662, + "learning_rate": 0.00045856264438103, + "loss": 3.3833, + "step": 63775 + }, + { + "epoch": 4.333469221361598, + "grad_norm": 0.8081148266792297, + "learning_rate": 0.0004585201793721973, + "loss": 3.2435, + "step": 63780 + }, + { + "epoch": 4.33380894143226, + "grad_norm": 0.8181775808334351, + "learning_rate": 0.0004584777143633646, + "loss": 3.8901, + "step": 63785 + }, + { + "epoch": 4.334148661502922, + "grad_norm": 0.7032859921455383, + "learning_rate": 0.0004584352493545319, + "loss": 3.3616, + "step": 63790 + }, + { + "epoch": 4.334488381573584, + "grad_norm": 1.3013781309127808, + "learning_rate": 0.00045839278434569913, + "loss": 3.452, + "step": 63795 + }, + { + "epoch": 4.334828101644245, + "grad_norm": 1.0007555484771729, + "learning_rate": 0.0004583503193368664, + "loss": 3.1628, + "step": 63800 + }, + { + "epoch": 4.335167821714907, + "grad_norm": 0.6116490960121155, + "learning_rate": 0.00045830785432803375, + "loss": 3.5322, + "step": 63805 + }, + { + "epoch": 4.335507541785569, + "grad_norm": 1.256000280380249, + "learning_rate": 0.000458265389319201, + "loss": 3.0904, + "step": 63810 + }, + { + "epoch": 4.33584726185623, + "grad_norm": 0.9123033285140991, + "learning_rate": 0.00045822292431036825, + "loss": 3.7393, + "step": 63815 + }, + { + "epoch": 4.336186981926892, + "grad_norm": 1.2238882780075073, + "learning_rate": 0.0004581804593015356, + "loss": 3.5204, + "step": 63820 + }, + { + "epoch": 4.336526701997554, + "grad_norm": 0.8865488171577454, + "learning_rate": 0.0004581379942927028, + "loss": 3.7438, + "step": 63825 + }, + { + "epoch": 4.336866422068216, + "grad_norm": 0.9483126997947693, + "learning_rate": 0.0004580955292838701, + "loss": 3.6169, + "step": 63830 + }, + { + "epoch": 4.337206142138878, + "grad_norm": 0.9301586151123047, + "learning_rate": 0.00045805306427503743, + "loss": 3.3631, + "step": 63835 + }, + { + "epoch": 4.33754586220954, + "grad_norm": 0.8311380743980408, + "learning_rate": 0.00045801059926620465, + "loss": 3.4739, + "step": 63840 + }, + { + "epoch": 4.337885582280201, + "grad_norm": 0.8448362350463867, + "learning_rate": 0.00045796813425737193, + "loss": 3.4725, + "step": 63845 + }, + { + "epoch": 4.338225302350863, + "grad_norm": 0.9612818360328674, + "learning_rate": 0.0004579256692485392, + "loss": 3.0708, + "step": 63850 + }, + { + "epoch": 4.338565022421525, + "grad_norm": 0.6881468296051025, + "learning_rate": 0.0004578832042397065, + "loss": 3.4361, + "step": 63855 + }, + { + "epoch": 4.338904742492186, + "grad_norm": 0.894188404083252, + "learning_rate": 0.0004578407392308738, + "loss": 3.5322, + "step": 63860 + }, + { + "epoch": 4.339244462562848, + "grad_norm": 1.1477761268615723, + "learning_rate": 0.00045779827422204105, + "loss": 3.7592, + "step": 63865 + }, + { + "epoch": 4.33958418263351, + "grad_norm": 0.971332311630249, + "learning_rate": 0.00045775580921320833, + "loss": 3.5792, + "step": 63870 + }, + { + "epoch": 4.339923902704172, + "grad_norm": 0.879938006401062, + "learning_rate": 0.0004577133442043756, + "loss": 3.4716, + "step": 63875 + }, + { + "epoch": 4.340263622774834, + "grad_norm": 0.8534563779830933, + "learning_rate": 0.0004576708791955429, + "loss": 3.7993, + "step": 63880 + }, + { + "epoch": 4.340603342845496, + "grad_norm": 0.9018073678016663, + "learning_rate": 0.0004576284141867101, + "loss": 3.642, + "step": 63885 + }, + { + "epoch": 4.340943062916157, + "grad_norm": 0.8780676126480103, + "learning_rate": 0.00045758594917787745, + "loss": 3.5372, + "step": 63890 + }, + { + "epoch": 4.341282782986819, + "grad_norm": 0.7116134166717529, + "learning_rate": 0.00045754348416904473, + "loss": 3.3767, + "step": 63895 + }, + { + "epoch": 4.341622503057481, + "grad_norm": 0.96043461561203, + "learning_rate": 0.00045750101916021196, + "loss": 3.3998, + "step": 63900 + }, + { + "epoch": 4.341962223128142, + "grad_norm": 0.8447843790054321, + "learning_rate": 0.0004574585541513793, + "loss": 3.262, + "step": 63905 + }, + { + "epoch": 4.342301943198804, + "grad_norm": 0.9072927236557007, + "learning_rate": 0.0004574160891425466, + "loss": 3.3668, + "step": 63910 + }, + { + "epoch": 4.3426416632694655, + "grad_norm": 0.8178151249885559, + "learning_rate": 0.0004573736241337138, + "loss": 3.4908, + "step": 63915 + }, + { + "epoch": 4.342981383340128, + "grad_norm": 1.4214881658554077, + "learning_rate": 0.0004573311591248811, + "loss": 3.4948, + "step": 63920 + }, + { + "epoch": 4.34332110341079, + "grad_norm": 0.7968133687973022, + "learning_rate": 0.0004572886941160484, + "loss": 3.8252, + "step": 63925 + }, + { + "epoch": 4.343660823481451, + "grad_norm": 0.8553484082221985, + "learning_rate": 0.00045724622910721564, + "loss": 3.7059, + "step": 63930 + }, + { + "epoch": 4.344000543552113, + "grad_norm": 0.8529188632965088, + "learning_rate": 0.0004572037640983829, + "loss": 3.6484, + "step": 63935 + }, + { + "epoch": 4.344340263622775, + "grad_norm": 0.9842460751533508, + "learning_rate": 0.00045716129908955025, + "loss": 3.5073, + "step": 63940 + }, + { + "epoch": 4.344679983693436, + "grad_norm": 1.071899175643921, + "learning_rate": 0.0004571188340807175, + "loss": 3.3691, + "step": 63945 + }, + { + "epoch": 4.345019703764098, + "grad_norm": 0.7720540165901184, + "learning_rate": 0.00045707636907188476, + "loss": 3.4772, + "step": 63950 + }, + { + "epoch": 4.34535942383476, + "grad_norm": 0.8805068731307983, + "learning_rate": 0.00045703390406305204, + "loss": 3.3335, + "step": 63955 + }, + { + "epoch": 4.345699143905422, + "grad_norm": 0.9086663722991943, + "learning_rate": 0.0004569914390542194, + "loss": 3.3589, + "step": 63960 + }, + { + "epoch": 4.346038863976084, + "grad_norm": 1.3795945644378662, + "learning_rate": 0.0004569489740453866, + "loss": 3.4281, + "step": 63965 + }, + { + "epoch": 4.346378584046746, + "grad_norm": 0.8319389224052429, + "learning_rate": 0.0004569065090365539, + "loss": 3.0242, + "step": 63970 + }, + { + "epoch": 4.346718304117407, + "grad_norm": 0.8706095218658447, + "learning_rate": 0.0004568640440277212, + "loss": 3.6021, + "step": 63975 + }, + { + "epoch": 4.347058024188069, + "grad_norm": 0.9374828934669495, + "learning_rate": 0.00045682157901888844, + "loss": 3.3053, + "step": 63980 + }, + { + "epoch": 4.347397744258731, + "grad_norm": 0.7548850774765015, + "learning_rate": 0.0004567791140100557, + "loss": 3.6169, + "step": 63985 + }, + { + "epoch": 4.347737464329392, + "grad_norm": 0.9955710172653198, + "learning_rate": 0.000456736649001223, + "loss": 3.5056, + "step": 63990 + }, + { + "epoch": 4.348077184400054, + "grad_norm": 0.9825693368911743, + "learning_rate": 0.0004566941839923903, + "loss": 3.5585, + "step": 63995 + }, + { + "epoch": 4.348416904470716, + "grad_norm": 0.7649504542350769, + "learning_rate": 0.00045665171898355756, + "loss": 3.2462, + "step": 64000 + }, + { + "epoch": 4.348756624541378, + "grad_norm": 1.119617223739624, + "learning_rate": 0.00045660925397472484, + "loss": 3.4637, + "step": 64005 + }, + { + "epoch": 4.34909634461204, + "grad_norm": 1.0587819814682007, + "learning_rate": 0.0004565667889658921, + "loss": 3.6263, + "step": 64010 + }, + { + "epoch": 4.349436064682702, + "grad_norm": 0.9225760102272034, + "learning_rate": 0.0004565243239570594, + "loss": 3.2593, + "step": 64015 + }, + { + "epoch": 4.349775784753363, + "grad_norm": 0.961891233921051, + "learning_rate": 0.0004564818589482267, + "loss": 3.6646, + "step": 64020 + }, + { + "epoch": 4.350115504824025, + "grad_norm": 1.035410761833191, + "learning_rate": 0.0004564393939393939, + "loss": 3.3057, + "step": 64025 + }, + { + "epoch": 4.350455224894687, + "grad_norm": 1.0201056003570557, + "learning_rate": 0.00045639692893056124, + "loss": 3.709, + "step": 64030 + }, + { + "epoch": 4.350794944965348, + "grad_norm": 0.8509143590927124, + "learning_rate": 0.0004563544639217285, + "loss": 3.2642, + "step": 64035 + }, + { + "epoch": 4.35113466503601, + "grad_norm": 0.9493917226791382, + "learning_rate": 0.00045631199891289575, + "loss": 3.6891, + "step": 64040 + }, + { + "epoch": 4.351474385106672, + "grad_norm": 0.7031378746032715, + "learning_rate": 0.0004562695339040631, + "loss": 3.4161, + "step": 64045 + }, + { + "epoch": 4.351814105177334, + "grad_norm": 1.0035152435302734, + "learning_rate": 0.00045622706889523036, + "loss": 3.1621, + "step": 64050 + }, + { + "epoch": 4.352153825247996, + "grad_norm": 0.8530465364456177, + "learning_rate": 0.0004561846038863976, + "loss": 3.3175, + "step": 64055 + }, + { + "epoch": 4.352493545318658, + "grad_norm": 1.0070996284484863, + "learning_rate": 0.00045614213887756487, + "loss": 3.4565, + "step": 64060 + }, + { + "epoch": 4.352833265389319, + "grad_norm": 0.8819023966789246, + "learning_rate": 0.0004560996738687322, + "loss": 3.7139, + "step": 64065 + }, + { + "epoch": 4.353172985459981, + "grad_norm": 1.4278347492218018, + "learning_rate": 0.00045605720885989943, + "loss": 3.482, + "step": 64070 + }, + { + "epoch": 4.353512705530643, + "grad_norm": 0.9556347727775574, + "learning_rate": 0.0004560147438510667, + "loss": 3.6315, + "step": 64075 + }, + { + "epoch": 4.353852425601304, + "grad_norm": 1.4528006315231323, + "learning_rate": 0.00045597227884223404, + "loss": 3.5273, + "step": 64080 + }, + { + "epoch": 4.354192145671966, + "grad_norm": 1.1534096002578735, + "learning_rate": 0.00045592981383340127, + "loss": 3.2857, + "step": 64085 + }, + { + "epoch": 4.354531865742628, + "grad_norm": 0.7674627900123596, + "learning_rate": 0.00045588734882456855, + "loss": 3.3297, + "step": 64090 + }, + { + "epoch": 4.35487158581329, + "grad_norm": 0.8239625096321106, + "learning_rate": 0.00045584488381573583, + "loss": 3.6701, + "step": 64095 + }, + { + "epoch": 4.355211305883952, + "grad_norm": 1.0091314315795898, + "learning_rate": 0.0004558024188069031, + "loss": 3.3864, + "step": 64100 + }, + { + "epoch": 4.355551025954614, + "grad_norm": 0.7791523933410645, + "learning_rate": 0.0004557599537980704, + "loss": 3.5243, + "step": 64105 + }, + { + "epoch": 4.355890746025275, + "grad_norm": 4.2146315574646, + "learning_rate": 0.00045571748878923767, + "loss": 3.488, + "step": 64110 + }, + { + "epoch": 4.356230466095937, + "grad_norm": 0.88255774974823, + "learning_rate": 0.00045567502378040495, + "loss": 3.2648, + "step": 64115 + }, + { + "epoch": 4.356570186166599, + "grad_norm": 0.851347029209137, + "learning_rate": 0.00045563255877157223, + "loss": 3.7771, + "step": 64120 + }, + { + "epoch": 4.35690990623726, + "grad_norm": 1.049168348312378, + "learning_rate": 0.0004555900937627395, + "loss": 3.5061, + "step": 64125 + }, + { + "epoch": 4.357249626307922, + "grad_norm": 0.8117693662643433, + "learning_rate": 0.00045554762875390684, + "loss": 3.457, + "step": 64130 + }, + { + "epoch": 4.357589346378584, + "grad_norm": 0.9118788838386536, + "learning_rate": 0.00045550516374507407, + "loss": 3.4264, + "step": 64135 + }, + { + "epoch": 4.357929066449246, + "grad_norm": 1.9998811483383179, + "learning_rate": 0.00045546269873624135, + "loss": 3.3739, + "step": 64140 + }, + { + "epoch": 4.358268786519908, + "grad_norm": 0.9814351797103882, + "learning_rate": 0.00045542023372740863, + "loss": 3.6838, + "step": 64145 + }, + { + "epoch": 4.35860850659057, + "grad_norm": 0.816159188747406, + "learning_rate": 0.0004553777687185759, + "loss": 3.2587, + "step": 64150 + }, + { + "epoch": 4.358948226661231, + "grad_norm": 0.7863412499427795, + "learning_rate": 0.0004553353037097432, + "loss": 3.6597, + "step": 64155 + }, + { + "epoch": 4.359287946731893, + "grad_norm": 1.2685233354568481, + "learning_rate": 0.00045529283870091047, + "loss": 3.4471, + "step": 64160 + }, + { + "epoch": 4.359627666802555, + "grad_norm": 0.9784970283508301, + "learning_rate": 0.00045525037369207775, + "loss": 3.3912, + "step": 64165 + }, + { + "epoch": 4.359967386873216, + "grad_norm": 1.011184573173523, + "learning_rate": 0.00045520790868324503, + "loss": 3.4253, + "step": 64170 + }, + { + "epoch": 4.360307106943878, + "grad_norm": 0.8361697793006897, + "learning_rate": 0.0004551654436744123, + "loss": 3.5556, + "step": 64175 + }, + { + "epoch": 4.36064682701454, + "grad_norm": 0.8222962021827698, + "learning_rate": 0.00045512297866557954, + "loss": 3.374, + "step": 64180 + }, + { + "epoch": 4.360986547085202, + "grad_norm": 0.8348346948623657, + "learning_rate": 0.00045508051365674687, + "loss": 3.7222, + "step": 64185 + }, + { + "epoch": 4.361326267155864, + "grad_norm": 0.7790852785110474, + "learning_rate": 0.00045503804864791415, + "loss": 3.5262, + "step": 64190 + }, + { + "epoch": 4.361665987226525, + "grad_norm": 0.8870062232017517, + "learning_rate": 0.0004549955836390814, + "loss": 3.5041, + "step": 64195 + }, + { + "epoch": 4.362005707297187, + "grad_norm": 1.0144201517105103, + "learning_rate": 0.0004549531186302487, + "loss": 3.3334, + "step": 64200 + }, + { + "epoch": 4.362345427367849, + "grad_norm": 0.9583907723426819, + "learning_rate": 0.000454910653621416, + "loss": 3.5214, + "step": 64205 + }, + { + "epoch": 4.36268514743851, + "grad_norm": 1.1545937061309814, + "learning_rate": 0.0004548681886125832, + "loss": 3.3474, + "step": 64210 + }, + { + "epoch": 4.363024867509172, + "grad_norm": 0.8857768774032593, + "learning_rate": 0.0004548257236037505, + "loss": 3.6266, + "step": 64215 + }, + { + "epoch": 4.363364587579834, + "grad_norm": 0.8800503015518188, + "learning_rate": 0.00045478325859491783, + "loss": 3.5103, + "step": 64220 + }, + { + "epoch": 4.3637043076504956, + "grad_norm": 0.7199701070785522, + "learning_rate": 0.00045474079358608506, + "loss": 3.4551, + "step": 64225 + }, + { + "epoch": 4.364044027721158, + "grad_norm": 0.8346177339553833, + "learning_rate": 0.00045469832857725234, + "loss": 3.6012, + "step": 64230 + }, + { + "epoch": 4.36438374779182, + "grad_norm": 1.0359927415847778, + "learning_rate": 0.00045465586356841967, + "loss": 3.4892, + "step": 64235 + }, + { + "epoch": 4.364723467862481, + "grad_norm": 0.9256945252418518, + "learning_rate": 0.0004546133985595869, + "loss": 3.5656, + "step": 64240 + }, + { + "epoch": 4.365063187933143, + "grad_norm": 0.9039768576622009, + "learning_rate": 0.0004545709335507542, + "loss": 3.6127, + "step": 64245 + }, + { + "epoch": 4.365402908003805, + "grad_norm": 1.0710312128067017, + "learning_rate": 0.00045452846854192146, + "loss": 3.4051, + "step": 64250 + }, + { + "epoch": 4.365742628074466, + "grad_norm": 1.0363637208938599, + "learning_rate": 0.00045448600353308874, + "loss": 3.2846, + "step": 64255 + }, + { + "epoch": 4.366082348145128, + "grad_norm": 1.0503432750701904, + "learning_rate": 0.000454443538524256, + "loss": 3.3893, + "step": 64260 + }, + { + "epoch": 4.36642206821579, + "grad_norm": 0.7770836353302002, + "learning_rate": 0.0004544010735154233, + "loss": 3.1138, + "step": 64265 + }, + { + "epoch": 4.366761788286452, + "grad_norm": 0.7790826559066772, + "learning_rate": 0.0004543586085065906, + "loss": 3.2987, + "step": 64270 + }, + { + "epoch": 4.367101508357114, + "grad_norm": 0.8265770673751831, + "learning_rate": 0.00045431614349775786, + "loss": 3.7084, + "step": 64275 + }, + { + "epoch": 4.367441228427776, + "grad_norm": 1.0185645818710327, + "learning_rate": 0.00045427367848892514, + "loss": 3.4564, + "step": 64280 + }, + { + "epoch": 4.367780948498437, + "grad_norm": 0.7880184054374695, + "learning_rate": 0.00045423121348009236, + "loss": 3.4828, + "step": 64285 + }, + { + "epoch": 4.368120668569099, + "grad_norm": 1.2013239860534668, + "learning_rate": 0.0004541887484712597, + "loss": 3.4022, + "step": 64290 + }, + { + "epoch": 4.368460388639761, + "grad_norm": 0.9043722152709961, + "learning_rate": 0.000454146283462427, + "loss": 3.4538, + "step": 64295 + }, + { + "epoch": 4.368800108710422, + "grad_norm": 1.0089129209518433, + "learning_rate": 0.00045410381845359426, + "loss": 3.6077, + "step": 64300 + }, + { + "epoch": 4.369139828781084, + "grad_norm": 0.8904833793640137, + "learning_rate": 0.00045406135344476154, + "loss": 3.9039, + "step": 64305 + }, + { + "epoch": 4.369479548851746, + "grad_norm": 0.942651629447937, + "learning_rate": 0.0004540188884359288, + "loss": 3.4701, + "step": 64310 + }, + { + "epoch": 4.369819268922408, + "grad_norm": 1.1451703310012817, + "learning_rate": 0.0004539764234270961, + "loss": 3.2117, + "step": 64315 + }, + { + "epoch": 4.37015898899307, + "grad_norm": 0.8590653538703918, + "learning_rate": 0.0004539339584182633, + "loss": 3.7275, + "step": 64320 + }, + { + "epoch": 4.370498709063732, + "grad_norm": 0.8141555786132812, + "learning_rate": 0.00045389149340943066, + "loss": 3.3597, + "step": 64325 + }, + { + "epoch": 4.370838429134393, + "grad_norm": 0.69057697057724, + "learning_rate": 0.00045384902840059794, + "loss": 3.7535, + "step": 64330 + }, + { + "epoch": 4.371178149205055, + "grad_norm": 1.0135431289672852, + "learning_rate": 0.00045380656339176516, + "loss": 3.6155, + "step": 64335 + }, + { + "epoch": 4.371517869275717, + "grad_norm": 1.284415602684021, + "learning_rate": 0.0004537640983829325, + "loss": 3.2286, + "step": 64340 + }, + { + "epoch": 4.371857589346378, + "grad_norm": 0.7624582648277283, + "learning_rate": 0.0004537216333740998, + "loss": 3.943, + "step": 64345 + }, + { + "epoch": 4.37219730941704, + "grad_norm": 0.8361964225769043, + "learning_rate": 0.000453679168365267, + "loss": 3.3428, + "step": 64350 + }, + { + "epoch": 4.372537029487702, + "grad_norm": 0.8447329998016357, + "learning_rate": 0.0004536367033564343, + "loss": 3.6586, + "step": 64355 + }, + { + "epoch": 4.372876749558364, + "grad_norm": 0.8483626246452332, + "learning_rate": 0.0004535942383476016, + "loss": 3.4862, + "step": 64360 + }, + { + "epoch": 4.373216469629026, + "grad_norm": 0.9104040265083313, + "learning_rate": 0.00045355177333876884, + "loss": 3.6771, + "step": 64365 + }, + { + "epoch": 4.373556189699688, + "grad_norm": 0.7859687805175781, + "learning_rate": 0.0004535093083299361, + "loss": 3.4654, + "step": 64370 + }, + { + "epoch": 4.373895909770349, + "grad_norm": 0.7813522219657898, + "learning_rate": 0.00045346684332110346, + "loss": 3.7063, + "step": 64375 + }, + { + "epoch": 4.374235629841011, + "grad_norm": 1.086904764175415, + "learning_rate": 0.0004534243783122707, + "loss": 3.684, + "step": 64380 + }, + { + "epoch": 4.374575349911673, + "grad_norm": 0.8421517610549927, + "learning_rate": 0.00045338191330343796, + "loss": 3.2114, + "step": 64385 + }, + { + "epoch": 4.374915069982334, + "grad_norm": 1.611527442932129, + "learning_rate": 0.0004533394482946053, + "loss": 3.3911, + "step": 64390 + }, + { + "epoch": 4.375254790052996, + "grad_norm": 0.9464080929756165, + "learning_rate": 0.0004532969832857725, + "loss": 3.6465, + "step": 64395 + }, + { + "epoch": 4.375594510123658, + "grad_norm": 1.0488828420639038, + "learning_rate": 0.0004532545182769398, + "loss": 3.316, + "step": 64400 + }, + { + "epoch": 4.37593423019432, + "grad_norm": 1.1642862558364868, + "learning_rate": 0.0004532120532681071, + "loss": 3.5134, + "step": 64405 + }, + { + "epoch": 4.376273950264982, + "grad_norm": 0.9300335645675659, + "learning_rate": 0.00045316958825927437, + "loss": 3.3363, + "step": 64410 + }, + { + "epoch": 4.376613670335644, + "grad_norm": 0.775959849357605, + "learning_rate": 0.00045312712325044165, + "loss": 3.5051, + "step": 64415 + }, + { + "epoch": 4.376953390406305, + "grad_norm": 0.7587294578552246, + "learning_rate": 0.0004530846582416089, + "loss": 3.6228, + "step": 64420 + }, + { + "epoch": 4.377293110476967, + "grad_norm": 1.880449891090393, + "learning_rate": 0.0004530421932327762, + "loss": 3.5065, + "step": 64425 + }, + { + "epoch": 4.377632830547629, + "grad_norm": 0.957757294178009, + "learning_rate": 0.0004529997282239435, + "loss": 3.8064, + "step": 64430 + }, + { + "epoch": 4.37797255061829, + "grad_norm": 0.68839430809021, + "learning_rate": 0.00045295726321511077, + "loss": 3.7162, + "step": 64435 + }, + { + "epoch": 4.378312270688952, + "grad_norm": 0.8689002990722656, + "learning_rate": 0.000452914798206278, + "loss": 3.3706, + "step": 64440 + }, + { + "epoch": 4.378651990759614, + "grad_norm": 0.8401349782943726, + "learning_rate": 0.0004528723331974453, + "loss": 3.3722, + "step": 64445 + }, + { + "epoch": 4.378991710830276, + "grad_norm": 1.7538131475448608, + "learning_rate": 0.0004528298681886126, + "loss": 3.858, + "step": 64450 + }, + { + "epoch": 4.379331430900938, + "grad_norm": 0.7545464038848877, + "learning_rate": 0.00045278740317977983, + "loss": 3.344, + "step": 64455 + }, + { + "epoch": 4.3796711509716, + "grad_norm": 0.7076008319854736, + "learning_rate": 0.00045274493817094717, + "loss": 3.3532, + "step": 64460 + }, + { + "epoch": 4.380010871042261, + "grad_norm": 0.8087411522865295, + "learning_rate": 0.00045270247316211445, + "loss": 3.5867, + "step": 64465 + }, + { + "epoch": 4.380350591112923, + "grad_norm": 1.052977204322815, + "learning_rate": 0.0004526600081532817, + "loss": 3.3076, + "step": 64470 + }, + { + "epoch": 4.380690311183585, + "grad_norm": 0.9588543772697449, + "learning_rate": 0.00045261754314444895, + "loss": 3.4213, + "step": 64475 + }, + { + "epoch": 4.381030031254246, + "grad_norm": 0.9621521234512329, + "learning_rate": 0.0004525750781356163, + "loss": 3.5194, + "step": 64480 + }, + { + "epoch": 4.381369751324908, + "grad_norm": 1.1784563064575195, + "learning_rate": 0.00045253261312678357, + "loss": 3.5038, + "step": 64485 + }, + { + "epoch": 4.38170947139557, + "grad_norm": 0.9267094731330872, + "learning_rate": 0.0004524901481179508, + "loss": 3.6243, + "step": 64490 + }, + { + "epoch": 4.382049191466232, + "grad_norm": 1.0403075218200684, + "learning_rate": 0.0004524476831091181, + "loss": 3.538, + "step": 64495 + }, + { + "epoch": 4.382388911536894, + "grad_norm": 0.9710588455200195, + "learning_rate": 0.0004524052181002854, + "loss": 3.2397, + "step": 64500 + }, + { + "epoch": 4.382728631607556, + "grad_norm": 0.8826228976249695, + "learning_rate": 0.00045236275309145263, + "loss": 3.6047, + "step": 64505 + }, + { + "epoch": 4.383068351678217, + "grad_norm": 0.9054449796676636, + "learning_rate": 0.0004523202880826199, + "loss": 3.4021, + "step": 64510 + }, + { + "epoch": 4.383408071748879, + "grad_norm": 0.7757033705711365, + "learning_rate": 0.00045227782307378725, + "loss": 3.5644, + "step": 64515 + }, + { + "epoch": 4.383747791819541, + "grad_norm": 1.1512248516082764, + "learning_rate": 0.00045223535806495447, + "loss": 3.5822, + "step": 64520 + }, + { + "epoch": 4.384087511890202, + "grad_norm": 0.9755250811576843, + "learning_rate": 0.00045219289305612175, + "loss": 3.8457, + "step": 64525 + }, + { + "epoch": 4.384427231960864, + "grad_norm": 0.8220083713531494, + "learning_rate": 0.0004521504280472891, + "loss": 3.7112, + "step": 64530 + }, + { + "epoch": 4.384766952031526, + "grad_norm": 0.8471550941467285, + "learning_rate": 0.0004521079630384563, + "loss": 3.4539, + "step": 64535 + }, + { + "epoch": 4.385106672102188, + "grad_norm": 1.0005929470062256, + "learning_rate": 0.0004520654980296236, + "loss": 3.7607, + "step": 64540 + }, + { + "epoch": 4.38544639217285, + "grad_norm": 1.004012107849121, + "learning_rate": 0.0004520230330207909, + "loss": 3.3225, + "step": 64545 + }, + { + "epoch": 4.385786112243512, + "grad_norm": 1.1639416217803955, + "learning_rate": 0.00045198056801195815, + "loss": 3.658, + "step": 64550 + }, + { + "epoch": 4.386125832314173, + "grad_norm": 0.754597544670105, + "learning_rate": 0.00045193810300312543, + "loss": 3.4948, + "step": 64555 + }, + { + "epoch": 4.386465552384835, + "grad_norm": 0.7645388841629028, + "learning_rate": 0.0004518956379942927, + "loss": 3.4419, + "step": 64560 + }, + { + "epoch": 4.386805272455497, + "grad_norm": 0.9099061489105225, + "learning_rate": 0.00045185317298546, + "loss": 3.5981, + "step": 64565 + }, + { + "epoch": 4.387144992526158, + "grad_norm": 0.8330501914024353, + "learning_rate": 0.0004518107079766273, + "loss": 3.5258, + "step": 64570 + }, + { + "epoch": 4.38748471259682, + "grad_norm": 1.0213381052017212, + "learning_rate": 0.00045176824296779455, + "loss": 3.4796, + "step": 64575 + }, + { + "epoch": 4.3878244326674825, + "grad_norm": 0.8588259816169739, + "learning_rate": 0.0004517257779589618, + "loss": 3.6299, + "step": 64580 + }, + { + "epoch": 4.388164152738144, + "grad_norm": 0.6705437898635864, + "learning_rate": 0.0004516833129501291, + "loss": 3.5309, + "step": 64585 + }, + { + "epoch": 4.388503872808806, + "grad_norm": 0.8778727650642395, + "learning_rate": 0.0004516408479412964, + "loss": 3.5026, + "step": 64590 + }, + { + "epoch": 4.388843592879467, + "grad_norm": 0.7961484789848328, + "learning_rate": 0.0004515983829324636, + "loss": 3.615, + "step": 64595 + }, + { + "epoch": 4.389183312950129, + "grad_norm": 0.8052172064781189, + "learning_rate": 0.00045155591792363095, + "loss": 3.3749, + "step": 64600 + }, + { + "epoch": 4.389523033020791, + "grad_norm": 0.8783710598945618, + "learning_rate": 0.00045151345291479823, + "loss": 3.6251, + "step": 64605 + }, + { + "epoch": 4.389862753091452, + "grad_norm": 0.9373437166213989, + "learning_rate": 0.00045147098790596546, + "loss": 3.6627, + "step": 64610 + }, + { + "epoch": 4.390202473162114, + "grad_norm": 0.9416751265525818, + "learning_rate": 0.00045142852289713274, + "loss": 3.7203, + "step": 64615 + }, + { + "epoch": 4.390542193232776, + "grad_norm": 0.7939909100532532, + "learning_rate": 0.0004513860578883001, + "loss": 3.4373, + "step": 64620 + }, + { + "epoch": 4.390881913303438, + "grad_norm": 0.9840084910392761, + "learning_rate": 0.0004513435928794673, + "loss": 3.5403, + "step": 64625 + }, + { + "epoch": 4.3912216333741, + "grad_norm": 0.8711320161819458, + "learning_rate": 0.0004513096208724011, + "loss": 3.4214, + "step": 64630 + }, + { + "epoch": 4.391561353444762, + "grad_norm": 0.8806049227714539, + "learning_rate": 0.0004512671558635684, + "loss": 3.3574, + "step": 64635 + }, + { + "epoch": 4.391901073515423, + "grad_norm": 0.8555415868759155, + "learning_rate": 0.00045122469085473574, + "loss": 3.5765, + "step": 64640 + }, + { + "epoch": 4.392240793586085, + "grad_norm": 0.8242248892784119, + "learning_rate": 0.00045118222584590296, + "loss": 3.3751, + "step": 64645 + }, + { + "epoch": 4.392580513656747, + "grad_norm": 1.3158787488937378, + "learning_rate": 0.00045113976083707024, + "loss": 3.3377, + "step": 64650 + }, + { + "epoch": 4.392920233727408, + "grad_norm": 0.921808660030365, + "learning_rate": 0.0004510972958282376, + "loss": 3.4476, + "step": 64655 + }, + { + "epoch": 4.39325995379807, + "grad_norm": 0.7963510155677795, + "learning_rate": 0.0004510548308194048, + "loss": 3.6734, + "step": 64660 + }, + { + "epoch": 4.393599673868732, + "grad_norm": 1.081626534461975, + "learning_rate": 0.0004510123658105721, + "loss": 3.3015, + "step": 64665 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.8314542770385742, + "learning_rate": 0.00045096990080173936, + "loss": 3.4302, + "step": 64670 + }, + { + "epoch": 4.394279114010056, + "grad_norm": 0.8997324109077454, + "learning_rate": 0.0004509274357929067, + "loss": 3.3298, + "step": 64675 + }, + { + "epoch": 4.394618834080718, + "grad_norm": 0.7316117882728577, + "learning_rate": 0.0004508849707840739, + "loss": 3.6669, + "step": 64680 + }, + { + "epoch": 4.394958554151379, + "grad_norm": 0.8788439631462097, + "learning_rate": 0.0004508425057752412, + "loss": 3.5749, + "step": 64685 + }, + { + "epoch": 4.395298274222041, + "grad_norm": 0.8888380527496338, + "learning_rate": 0.00045080004076640854, + "loss": 3.5035, + "step": 64690 + }, + { + "epoch": 4.395637994292703, + "grad_norm": 1.0367696285247803, + "learning_rate": 0.00045075757575757577, + "loss": 3.3677, + "step": 64695 + }, + { + "epoch": 4.395977714363364, + "grad_norm": 0.977719247341156, + "learning_rate": 0.00045071511074874305, + "loss": 3.2836, + "step": 64700 + }, + { + "epoch": 4.396317434434026, + "grad_norm": 1.62290358543396, + "learning_rate": 0.0004506726457399103, + "loss": 3.489, + "step": 64705 + }, + { + "epoch": 4.396657154504688, + "grad_norm": 1.1035139560699463, + "learning_rate": 0.0004506301807310776, + "loss": 3.4037, + "step": 64710 + }, + { + "epoch": 4.39699687457535, + "grad_norm": 0.739878237247467, + "learning_rate": 0.0004505877157222449, + "loss": 3.4019, + "step": 64715 + }, + { + "epoch": 4.397336594646012, + "grad_norm": 0.9389841556549072, + "learning_rate": 0.00045054525071341217, + "loss": 3.2869, + "step": 64720 + }, + { + "epoch": 4.397676314716674, + "grad_norm": 0.8559161424636841, + "learning_rate": 0.00045050278570457945, + "loss": 3.39, + "step": 64725 + }, + { + "epoch": 4.398016034787335, + "grad_norm": 0.7942428588867188, + "learning_rate": 0.0004504603206957467, + "loss": 3.4106, + "step": 64730 + }, + { + "epoch": 4.398355754857997, + "grad_norm": 0.6113680601119995, + "learning_rate": 0.000450417855686914, + "loss": 3.6148, + "step": 64735 + }, + { + "epoch": 4.398695474928659, + "grad_norm": 1.0442508459091187, + "learning_rate": 0.00045037539067808123, + "loss": 3.2981, + "step": 64740 + }, + { + "epoch": 4.39903519499932, + "grad_norm": 0.8563708662986755, + "learning_rate": 0.00045033292566924857, + "loss": 3.4632, + "step": 64745 + }, + { + "epoch": 4.399374915069982, + "grad_norm": 0.9741948246955872, + "learning_rate": 0.00045029046066041585, + "loss": 3.2701, + "step": 64750 + }, + { + "epoch": 4.399714635140644, + "grad_norm": 0.7933897972106934, + "learning_rate": 0.00045024799565158307, + "loss": 3.5122, + "step": 64755 + }, + { + "epoch": 4.400054355211306, + "grad_norm": 1.0805963277816772, + "learning_rate": 0.0004502055306427504, + "loss": 3.5173, + "step": 64760 + }, + { + "epoch": 4.400394075281968, + "grad_norm": 0.901157796382904, + "learning_rate": 0.0004501630656339177, + "loss": 3.3062, + "step": 64765 + }, + { + "epoch": 4.40073379535263, + "grad_norm": 0.864221453666687, + "learning_rate": 0.0004501206006250849, + "loss": 3.3749, + "step": 64770 + }, + { + "epoch": 4.401073515423291, + "grad_norm": 0.7995808124542236, + "learning_rate": 0.0004500781356162522, + "loss": 3.4557, + "step": 64775 + }, + { + "epoch": 4.401413235493953, + "grad_norm": 0.8526076674461365, + "learning_rate": 0.0004500356706074195, + "loss": 3.3011, + "step": 64780 + }, + { + "epoch": 4.401752955564615, + "grad_norm": 0.947216272354126, + "learning_rate": 0.00044999320559858675, + "loss": 3.2747, + "step": 64785 + }, + { + "epoch": 4.402092675635276, + "grad_norm": 1.4352689981460571, + "learning_rate": 0.00044995074058975403, + "loss": 3.5349, + "step": 64790 + }, + { + "epoch": 4.402432395705938, + "grad_norm": 0.9228683710098267, + "learning_rate": 0.00044990827558092137, + "loss": 3.4973, + "step": 64795 + }, + { + "epoch": 4.4027721157766, + "grad_norm": 0.9077434539794922, + "learning_rate": 0.0004498658105720886, + "loss": 3.2067, + "step": 64800 + }, + { + "epoch": 4.403111835847262, + "grad_norm": 1.2018746137619019, + "learning_rate": 0.00044982334556325587, + "loss": 3.4066, + "step": 64805 + }, + { + "epoch": 4.403451555917924, + "grad_norm": 0.9864763021469116, + "learning_rate": 0.00044978088055442315, + "loss": 3.4506, + "step": 64810 + }, + { + "epoch": 4.403791275988586, + "grad_norm": 1.0919992923736572, + "learning_rate": 0.00044973841554559043, + "loss": 3.5139, + "step": 64815 + }, + { + "epoch": 4.404130996059247, + "grad_norm": 0.8834913372993469, + "learning_rate": 0.0004496959505367577, + "loss": 3.3065, + "step": 64820 + }, + { + "epoch": 4.404470716129909, + "grad_norm": 0.774351954460144, + "learning_rate": 0.000449653485527925, + "loss": 3.3595, + "step": 64825 + }, + { + "epoch": 4.404810436200571, + "grad_norm": 0.8436650037765503, + "learning_rate": 0.0004496110205190923, + "loss": 3.8206, + "step": 64830 + }, + { + "epoch": 4.405150156271232, + "grad_norm": 0.9153661727905273, + "learning_rate": 0.00044956855551025955, + "loss": 3.76, + "step": 64835 + }, + { + "epoch": 4.405489876341894, + "grad_norm": 0.8346265554428101, + "learning_rate": 0.00044952609050142683, + "loss": 3.439, + "step": 64840 + }, + { + "epoch": 4.4058295964125564, + "grad_norm": 0.9452887177467346, + "learning_rate": 0.0004494836254925941, + "loss": 3.3112, + "step": 64845 + }, + { + "epoch": 4.406169316483218, + "grad_norm": 0.7447081208229065, + "learning_rate": 0.0004494411604837614, + "loss": 3.4535, + "step": 64850 + }, + { + "epoch": 4.40650903655388, + "grad_norm": 0.8085386157035828, + "learning_rate": 0.0004493986954749287, + "loss": 3.4225, + "step": 64855 + }, + { + "epoch": 4.406848756624542, + "grad_norm": 0.9039657711982727, + "learning_rate": 0.00044935623046609595, + "loss": 3.2064, + "step": 64860 + }, + { + "epoch": 4.407188476695203, + "grad_norm": 0.7990557551383972, + "learning_rate": 0.00044931376545726323, + "loss": 3.6247, + "step": 64865 + }, + { + "epoch": 4.407528196765865, + "grad_norm": 0.9236570000648499, + "learning_rate": 0.0004492713004484305, + "loss": 3.3681, + "step": 64870 + }, + { + "epoch": 4.407867916836526, + "grad_norm": 1.1473126411437988, + "learning_rate": 0.0004492288354395978, + "loss": 3.5678, + "step": 64875 + }, + { + "epoch": 4.408207636907188, + "grad_norm": 0.8442654609680176, + "learning_rate": 0.000449186370430765, + "loss": 3.6812, + "step": 64880 + }, + { + "epoch": 4.40854735697785, + "grad_norm": 0.824555516242981, + "learning_rate": 0.00044914390542193235, + "loss": 3.3323, + "step": 64885 + }, + { + "epoch": 4.408887077048512, + "grad_norm": 1.0207281112670898, + "learning_rate": 0.00044910144041309963, + "loss": 3.513, + "step": 64890 + }, + { + "epoch": 4.409226797119174, + "grad_norm": 0.7807279825210571, + "learning_rate": 0.00044905897540426686, + "loss": 3.4211, + "step": 64895 + }, + { + "epoch": 4.409566517189836, + "grad_norm": 0.945638120174408, + "learning_rate": 0.0004490165103954342, + "loss": 3.3994, + "step": 64900 + }, + { + "epoch": 4.409906237260497, + "grad_norm": 0.7966599464416504, + "learning_rate": 0.0004489740453866015, + "loss": 3.4273, + "step": 64905 + }, + { + "epoch": 4.410245957331159, + "grad_norm": 0.8317762017250061, + "learning_rate": 0.0004489315803777687, + "loss": 3.5685, + "step": 64910 + }, + { + "epoch": 4.410585677401821, + "grad_norm": 1.0067033767700195, + "learning_rate": 0.00044888911536893603, + "loss": 3.6488, + "step": 64915 + }, + { + "epoch": 4.410925397472482, + "grad_norm": 1.3884077072143555, + "learning_rate": 0.0004488466503601033, + "loss": 3.2336, + "step": 64920 + }, + { + "epoch": 4.411265117543144, + "grad_norm": 0.822937548160553, + "learning_rate": 0.00044880418535127054, + "loss": 3.2561, + "step": 64925 + }, + { + "epoch": 4.411604837613806, + "grad_norm": 0.832878589630127, + "learning_rate": 0.0004487617203424378, + "loss": 3.3097, + "step": 64930 + }, + { + "epoch": 4.411944557684468, + "grad_norm": 0.9038941860198975, + "learning_rate": 0.00044871925533360515, + "loss": 3.6251, + "step": 64935 + }, + { + "epoch": 4.41228427775513, + "grad_norm": 0.8573558926582336, + "learning_rate": 0.0004486767903247724, + "loss": 3.5235, + "step": 64940 + }, + { + "epoch": 4.412623997825792, + "grad_norm": 1.2484327554702759, + "learning_rate": 0.00044863432531593966, + "loss": 3.4027, + "step": 64945 + }, + { + "epoch": 4.412963717896453, + "grad_norm": 0.9883583188056946, + "learning_rate": 0.000448591860307107, + "loss": 3.3752, + "step": 64950 + }, + { + "epoch": 4.413303437967115, + "grad_norm": 1.2556802034378052, + "learning_rate": 0.0004485493952982742, + "loss": 3.4707, + "step": 64955 + }, + { + "epoch": 4.413643158037777, + "grad_norm": 0.8755394220352173, + "learning_rate": 0.0004485069302894415, + "loss": 3.7012, + "step": 64960 + }, + { + "epoch": 4.413982878108438, + "grad_norm": 0.5955003499984741, + "learning_rate": 0.0004484644652806088, + "loss": 3.3936, + "step": 64965 + }, + { + "epoch": 4.4143225981791, + "grad_norm": 1.9808253049850464, + "learning_rate": 0.00044842200027177606, + "loss": 3.6443, + "step": 64970 + }, + { + "epoch": 4.414662318249762, + "grad_norm": 1.2214571237564087, + "learning_rate": 0.00044837953526294334, + "loss": 3.3598, + "step": 64975 + }, + { + "epoch": 4.415002038320424, + "grad_norm": 0.8163056969642639, + "learning_rate": 0.0004483370702541106, + "loss": 3.5777, + "step": 64980 + }, + { + "epoch": 4.415341758391086, + "grad_norm": 0.9462394714355469, + "learning_rate": 0.0004482946052452779, + "loss": 3.6009, + "step": 64985 + }, + { + "epoch": 4.415681478461748, + "grad_norm": 0.9361789226531982, + "learning_rate": 0.0004482521402364452, + "loss": 3.6353, + "step": 64990 + }, + { + "epoch": 4.416021198532409, + "grad_norm": 0.8031364679336548, + "learning_rate": 0.00044820967522761246, + "loss": 3.5817, + "step": 64995 + }, + { + "epoch": 4.416360918603071, + "grad_norm": 0.9673303365707397, + "learning_rate": 0.0004481672102187797, + "loss": 3.204, + "step": 65000 + }, + { + "epoch": 4.416700638673733, + "grad_norm": 0.8691394925117493, + "learning_rate": 0.000448124745209947, + "loss": 3.2265, + "step": 65005 + }, + { + "epoch": 4.417040358744394, + "grad_norm": 0.7142326235771179, + "learning_rate": 0.0004480822802011143, + "loss": 3.3133, + "step": 65010 + }, + { + "epoch": 4.417380078815056, + "grad_norm": 2.2351300716400146, + "learning_rate": 0.0004480398151922816, + "loss": 3.5514, + "step": 65015 + }, + { + "epoch": 4.417719798885718, + "grad_norm": 6.065305709838867, + "learning_rate": 0.00044799735018344886, + "loss": 3.4867, + "step": 65020 + }, + { + "epoch": 4.41805951895638, + "grad_norm": 0.8734179139137268, + "learning_rate": 0.00044795488517461614, + "loss": 3.3181, + "step": 65025 + }, + { + "epoch": 4.418399239027042, + "grad_norm": 0.8867202401161194, + "learning_rate": 0.0004479124201657834, + "loss": 3.5827, + "step": 65030 + }, + { + "epoch": 4.418738959097704, + "grad_norm": 0.8663195371627808, + "learning_rate": 0.00044786995515695065, + "loss": 3.4216, + "step": 65035 + }, + { + "epoch": 4.419078679168365, + "grad_norm": 0.7589112520217896, + "learning_rate": 0.000447827490148118, + "loss": 3.3665, + "step": 65040 + }, + { + "epoch": 4.419418399239027, + "grad_norm": 1.0398386716842651, + "learning_rate": 0.00044778502513928526, + "loss": 3.2301, + "step": 65045 + }, + { + "epoch": 4.419758119309689, + "grad_norm": 1.0071163177490234, + "learning_rate": 0.0004477425601304525, + "loss": 3.376, + "step": 65050 + }, + { + "epoch": 4.42009783938035, + "grad_norm": 0.7705613970756531, + "learning_rate": 0.0004477000951216198, + "loss": 3.6591, + "step": 65055 + }, + { + "epoch": 4.420437559451012, + "grad_norm": 0.848717212677002, + "learning_rate": 0.0004476576301127871, + "loss": 3.6578, + "step": 65060 + }, + { + "epoch": 4.420777279521674, + "grad_norm": 1.038037896156311, + "learning_rate": 0.00044761516510395433, + "loss": 3.5644, + "step": 65065 + }, + { + "epoch": 4.421116999592336, + "grad_norm": 0.8012915253639221, + "learning_rate": 0.0004475727000951216, + "loss": 3.3798, + "step": 65070 + }, + { + "epoch": 4.421456719662998, + "grad_norm": 1.002084732055664, + "learning_rate": 0.00044753023508628894, + "loss": 3.4068, + "step": 65075 + }, + { + "epoch": 4.42179643973366, + "grad_norm": 1.3425579071044922, + "learning_rate": 0.00044748777007745617, + "loss": 3.3973, + "step": 65080 + }, + { + "epoch": 4.422136159804321, + "grad_norm": 0.8317723870277405, + "learning_rate": 0.00044744530506862345, + "loss": 3.611, + "step": 65085 + }, + { + "epoch": 4.422475879874983, + "grad_norm": 0.8252835869789124, + "learning_rate": 0.0004474028400597908, + "loss": 3.1579, + "step": 65090 + }, + { + "epoch": 4.422815599945645, + "grad_norm": 0.8404601216316223, + "learning_rate": 0.000447360375050958, + "loss": 3.5393, + "step": 65095 + }, + { + "epoch": 4.423155320016306, + "grad_norm": 1.2352092266082764, + "learning_rate": 0.0004473179100421253, + "loss": 3.6341, + "step": 65100 + }, + { + "epoch": 4.423495040086968, + "grad_norm": 0.8391205072402954, + "learning_rate": 0.00044727544503329257, + "loss": 3.5285, + "step": 65105 + }, + { + "epoch": 4.42383476015763, + "grad_norm": 0.73949134349823, + "learning_rate": 0.00044723298002445985, + "loss": 3.6104, + "step": 65110 + }, + { + "epoch": 4.424174480228292, + "grad_norm": 0.9297716617584229, + "learning_rate": 0.00044719051501562713, + "loss": 3.512, + "step": 65115 + }, + { + "epoch": 4.424514200298954, + "grad_norm": 1.0370845794677734, + "learning_rate": 0.0004471480500067944, + "loss": 3.3531, + "step": 65120 + }, + { + "epoch": 4.424853920369616, + "grad_norm": 0.9985736608505249, + "learning_rate": 0.0004471055849979617, + "loss": 3.3294, + "step": 65125 + }, + { + "epoch": 4.425193640440277, + "grad_norm": 0.8628408908843994, + "learning_rate": 0.00044706311998912897, + "loss": 3.2366, + "step": 65130 + }, + { + "epoch": 4.425533360510939, + "grad_norm": 0.8805834054946899, + "learning_rate": 0.00044702065498029625, + "loss": 3.3725, + "step": 65135 + }, + { + "epoch": 4.425873080581601, + "grad_norm": 1.3535112142562866, + "learning_rate": 0.0004469781899714635, + "loss": 3.4017, + "step": 65140 + }, + { + "epoch": 4.426212800652262, + "grad_norm": 1.1526833772659302, + "learning_rate": 0.0004469357249626308, + "loss": 3.5066, + "step": 65145 + }, + { + "epoch": 4.426552520722924, + "grad_norm": 0.9460657238960266, + "learning_rate": 0.0004468932599537981, + "loss": 3.4418, + "step": 65150 + }, + { + "epoch": 4.4268922407935865, + "grad_norm": 1.0223063230514526, + "learning_rate": 0.0004468507949449653, + "loss": 3.4651, + "step": 65155 + }, + { + "epoch": 4.427231960864248, + "grad_norm": 0.922024130821228, + "learning_rate": 0.00044680832993613265, + "loss": 3.893, + "step": 65160 + }, + { + "epoch": 4.42757168093491, + "grad_norm": 1.2897919416427612, + "learning_rate": 0.00044676586492729993, + "loss": 3.8892, + "step": 65165 + }, + { + "epoch": 4.427911401005572, + "grad_norm": 1.2038300037384033, + "learning_rate": 0.00044672339991846716, + "loss": 3.6018, + "step": 65170 + }, + { + "epoch": 4.428251121076233, + "grad_norm": 1.2062848806381226, + "learning_rate": 0.00044668093490963444, + "loss": 3.282, + "step": 65175 + }, + { + "epoch": 4.428590841146895, + "grad_norm": 0.8753140568733215, + "learning_rate": 0.00044663846990080177, + "loss": 3.5889, + "step": 65180 + }, + { + "epoch": 4.428930561217557, + "grad_norm": 1.022332787513733, + "learning_rate": 0.00044659600489196905, + "loss": 3.5732, + "step": 65185 + }, + { + "epoch": 4.429270281288218, + "grad_norm": 0.9117839932441711, + "learning_rate": 0.0004465535398831363, + "loss": 3.6355, + "step": 65190 + }, + { + "epoch": 4.42961000135888, + "grad_norm": 0.9572985768318176, + "learning_rate": 0.0004465110748743036, + "loss": 3.3273, + "step": 65195 + }, + { + "epoch": 4.4299497214295425, + "grad_norm": 0.8200777769088745, + "learning_rate": 0.0004464686098654709, + "loss": 3.749, + "step": 65200 + }, + { + "epoch": 4.430289441500204, + "grad_norm": 1.0102661848068237, + "learning_rate": 0.0004464261448566381, + "loss": 3.559, + "step": 65205 + }, + { + "epoch": 4.430629161570866, + "grad_norm": 0.7217300534248352, + "learning_rate": 0.00044638367984780545, + "loss": 3.1719, + "step": 65210 + }, + { + "epoch": 4.430968881641528, + "grad_norm": 1.2932497262954712, + "learning_rate": 0.00044634121483897273, + "loss": 3.7321, + "step": 65215 + }, + { + "epoch": 4.431308601712189, + "grad_norm": 0.895243227481842, + "learning_rate": 0.00044629874983013996, + "loss": 3.7534, + "step": 65220 + }, + { + "epoch": 4.431648321782851, + "grad_norm": 0.9161772727966309, + "learning_rate": 0.00044625628482130724, + "loss": 3.5283, + "step": 65225 + }, + { + "epoch": 4.431988041853513, + "grad_norm": 1.1879712343215942, + "learning_rate": 0.00044621381981247457, + "loss": 3.6345, + "step": 65230 + }, + { + "epoch": 4.432327761924174, + "grad_norm": 1.1556800603866577, + "learning_rate": 0.0004461713548036418, + "loss": 3.6882, + "step": 65235 + }, + { + "epoch": 4.432667481994836, + "grad_norm": 1.1501909494400024, + "learning_rate": 0.0004461288897948091, + "loss": 3.4932, + "step": 65240 + }, + { + "epoch": 4.4330072020654985, + "grad_norm": 0.9311378002166748, + "learning_rate": 0.0004460864247859764, + "loss": 3.4407, + "step": 65245 + }, + { + "epoch": 4.43334692213616, + "grad_norm": 0.7318961024284363, + "learning_rate": 0.00044604395977714364, + "loss": 3.8982, + "step": 65250 + }, + { + "epoch": 4.433686642206822, + "grad_norm": 1.0106852054595947, + "learning_rate": 0.0004460014947683109, + "loss": 3.4792, + "step": 65255 + }, + { + "epoch": 4.434026362277484, + "grad_norm": 1.3561581373214722, + "learning_rate": 0.0004459590297594782, + "loss": 3.2778, + "step": 65260 + }, + { + "epoch": 4.434366082348145, + "grad_norm": 1.2460403442382812, + "learning_rate": 0.0004459165647506455, + "loss": 3.6557, + "step": 65265 + }, + { + "epoch": 4.434705802418807, + "grad_norm": 0.8561227917671204, + "learning_rate": 0.00044587409974181276, + "loss": 3.6636, + "step": 65270 + }, + { + "epoch": 4.435045522489468, + "grad_norm": 1.7293646335601807, + "learning_rate": 0.00044583163473298004, + "loss": 3.4405, + "step": 65275 + }, + { + "epoch": 4.43538524256013, + "grad_norm": 0.9411782622337341, + "learning_rate": 0.0004457891697241473, + "loss": 3.3803, + "step": 65280 + }, + { + "epoch": 4.435724962630792, + "grad_norm": 0.9246314167976379, + "learning_rate": 0.0004457467047153146, + "loss": 3.9172, + "step": 65285 + }, + { + "epoch": 4.436064682701454, + "grad_norm": 0.9520529508590698, + "learning_rate": 0.0004457042397064819, + "loss": 3.1416, + "step": 65290 + }, + { + "epoch": 4.436404402772116, + "grad_norm": 1.2349592447280884, + "learning_rate": 0.0004456617746976491, + "loss": 3.4164, + "step": 65295 + }, + { + "epoch": 4.436744122842778, + "grad_norm": 0.99843829870224, + "learning_rate": 0.00044561930968881644, + "loss": 3.5136, + "step": 65300 + }, + { + "epoch": 4.437083842913439, + "grad_norm": 0.9226893186569214, + "learning_rate": 0.0004455768446799837, + "loss": 3.6256, + "step": 65305 + }, + { + "epoch": 4.437423562984101, + "grad_norm": 1.091285228729248, + "learning_rate": 0.00044553437967115094, + "loss": 3.5879, + "step": 65310 + }, + { + "epoch": 4.437763283054763, + "grad_norm": 0.9810730814933777, + "learning_rate": 0.0004454919146623183, + "loss": 3.3928, + "step": 65315 + }, + { + "epoch": 4.438103003125424, + "grad_norm": 0.7757936120033264, + "learning_rate": 0.00044544944965348556, + "loss": 3.6907, + "step": 65320 + }, + { + "epoch": 4.438442723196086, + "grad_norm": 1.3968138694763184, + "learning_rate": 0.0004454069846446528, + "loss": 3.3679, + "step": 65325 + }, + { + "epoch": 4.438782443266748, + "grad_norm": 0.7792653441429138, + "learning_rate": 0.00044536451963582006, + "loss": 3.3977, + "step": 65330 + }, + { + "epoch": 4.43912216333741, + "grad_norm": 0.8726547956466675, + "learning_rate": 0.0004453220546269874, + "loss": 3.7314, + "step": 65335 + }, + { + "epoch": 4.439461883408072, + "grad_norm": 0.8497108221054077, + "learning_rate": 0.0004452795896181546, + "loss": 3.3453, + "step": 65340 + }, + { + "epoch": 4.439801603478734, + "grad_norm": 0.8675013780593872, + "learning_rate": 0.0004452371246093219, + "loss": 3.3988, + "step": 65345 + }, + { + "epoch": 4.440141323549395, + "grad_norm": 0.8335239291191101, + "learning_rate": 0.00044519465960048924, + "loss": 3.6566, + "step": 65350 + }, + { + "epoch": 4.440481043620057, + "grad_norm": 0.7894169092178345, + "learning_rate": 0.0004451521945916565, + "loss": 3.4059, + "step": 65355 + }, + { + "epoch": 4.440820763690719, + "grad_norm": 1.0204472541809082, + "learning_rate": 0.00044510972958282374, + "loss": 3.4865, + "step": 65360 + }, + { + "epoch": 4.44116048376138, + "grad_norm": 0.8330448269844055, + "learning_rate": 0.000445067264573991, + "loss": 3.644, + "step": 65365 + }, + { + "epoch": 4.441500203832042, + "grad_norm": 0.8583707809448242, + "learning_rate": 0.00044502479956515836, + "loss": 3.4203, + "step": 65370 + }, + { + "epoch": 4.441839923902704, + "grad_norm": 1.2718985080718994, + "learning_rate": 0.0004449823345563256, + "loss": 3.3188, + "step": 65375 + }, + { + "epoch": 4.442179643973366, + "grad_norm": 1.000611424446106, + "learning_rate": 0.00044493986954749286, + "loss": 3.5327, + "step": 65380 + }, + { + "epoch": 4.442519364044028, + "grad_norm": 1.6572728157043457, + "learning_rate": 0.0004448974045386602, + "loss": 3.2991, + "step": 65385 + }, + { + "epoch": 4.44285908411469, + "grad_norm": 0.9415321350097656, + "learning_rate": 0.0004448549395298274, + "loss": 3.5568, + "step": 65390 + }, + { + "epoch": 4.443198804185351, + "grad_norm": 1.2902612686157227, + "learning_rate": 0.0004448124745209947, + "loss": 3.5623, + "step": 65395 + }, + { + "epoch": 4.443538524256013, + "grad_norm": 1.0888227224349976, + "learning_rate": 0.000444770009512162, + "loss": 3.576, + "step": 65400 + }, + { + "epoch": 4.443878244326675, + "grad_norm": 0.8842988014221191, + "learning_rate": 0.00044472754450332927, + "loss": 3.4107, + "step": 65405 + }, + { + "epoch": 4.444217964397336, + "grad_norm": 0.7863832116127014, + "learning_rate": 0.00044468507949449655, + "loss": 3.3831, + "step": 65410 + }, + { + "epoch": 4.444557684467998, + "grad_norm": 0.9237938523292542, + "learning_rate": 0.0004446426144856638, + "loss": 3.4721, + "step": 65415 + }, + { + "epoch": 4.4448974045386604, + "grad_norm": 0.8098964095115662, + "learning_rate": 0.0004446001494768311, + "loss": 3.6058, + "step": 65420 + }, + { + "epoch": 4.445237124609322, + "grad_norm": 1.0051449537277222, + "learning_rate": 0.0004445576844679984, + "loss": 3.6308, + "step": 65425 + }, + { + "epoch": 4.445576844679984, + "grad_norm": 1.1236838102340698, + "learning_rate": 0.00044451521945916567, + "loss": 3.546, + "step": 65430 + }, + { + "epoch": 4.445916564750646, + "grad_norm": 1.0387742519378662, + "learning_rate": 0.0004444727544503329, + "loss": 3.379, + "step": 65435 + }, + { + "epoch": 4.446256284821307, + "grad_norm": 0.7376751899719238, + "learning_rate": 0.0004444302894415002, + "loss": 3.5351, + "step": 65440 + }, + { + "epoch": 4.446596004891969, + "grad_norm": 0.938535749912262, + "learning_rate": 0.0004443878244326675, + "loss": 3.4781, + "step": 65445 + }, + { + "epoch": 4.446935724962631, + "grad_norm": 0.8216606974601746, + "learning_rate": 0.00044434535942383473, + "loss": 3.55, + "step": 65450 + }, + { + "epoch": 4.447275445033292, + "grad_norm": 0.8746105432510376, + "learning_rate": 0.00044430289441500207, + "loss": 3.6754, + "step": 65455 + }, + { + "epoch": 4.447615165103954, + "grad_norm": 0.8357396125793457, + "learning_rate": 0.00044426042940616935, + "loss": 3.2818, + "step": 65460 + }, + { + "epoch": 4.4479548851746165, + "grad_norm": 1.0735106468200684, + "learning_rate": 0.00044421796439733657, + "loss": 3.6576, + "step": 65465 + }, + { + "epoch": 4.448294605245278, + "grad_norm": 0.882767379283905, + "learning_rate": 0.00044417549938850385, + "loss": 3.44, + "step": 65470 + }, + { + "epoch": 4.44863432531594, + "grad_norm": 1.030924916267395, + "learning_rate": 0.0004441330343796712, + "loss": 3.3885, + "step": 65475 + }, + { + "epoch": 4.448974045386602, + "grad_norm": 1.071376085281372, + "learning_rate": 0.0004440905693708384, + "loss": 3.3856, + "step": 65480 + }, + { + "epoch": 4.449313765457263, + "grad_norm": 4.1315765380859375, + "learning_rate": 0.0004440481043620057, + "loss": 3.2722, + "step": 65485 + }, + { + "epoch": 4.449653485527925, + "grad_norm": 0.8239027857780457, + "learning_rate": 0.000444005639353173, + "loss": 3.4677, + "step": 65490 + }, + { + "epoch": 4.449993205598587, + "grad_norm": 0.8714231252670288, + "learning_rate": 0.00044396317434434025, + "loss": 3.5667, + "step": 65495 + }, + { + "epoch": 4.450332925669248, + "grad_norm": 0.6639158129692078, + "learning_rate": 0.00044392070933550753, + "loss": 3.4295, + "step": 65500 + }, + { + "epoch": 4.45067264573991, + "grad_norm": 0.8222694396972656, + "learning_rate": 0.00044387824432667487, + "loss": 3.3699, + "step": 65505 + }, + { + "epoch": 4.4510123658105725, + "grad_norm": 0.8179710507392883, + "learning_rate": 0.0004438357793178421, + "loss": 3.4703, + "step": 65510 + }, + { + "epoch": 4.451352085881234, + "grad_norm": 0.9810464978218079, + "learning_rate": 0.00044379331430900937, + "loss": 3.8295, + "step": 65515 + }, + { + "epoch": 4.451691805951896, + "grad_norm": 2.4219343662261963, + "learning_rate": 0.00044375084930017665, + "loss": 3.5622, + "step": 65520 + }, + { + "epoch": 4.452031526022558, + "grad_norm": 1.6013425588607788, + "learning_rate": 0.000443708384291344, + "loss": 3.5702, + "step": 65525 + }, + { + "epoch": 4.452371246093219, + "grad_norm": 0.7212433218955994, + "learning_rate": 0.0004436659192825112, + "loss": 3.0077, + "step": 65530 + }, + { + "epoch": 4.452710966163881, + "grad_norm": 0.8779890537261963, + "learning_rate": 0.0004436234542736785, + "loss": 3.3171, + "step": 65535 + }, + { + "epoch": 4.453050686234543, + "grad_norm": 0.8702100515365601, + "learning_rate": 0.00044358098926484583, + "loss": 3.4199, + "step": 65540 + }, + { + "epoch": 4.453390406305204, + "grad_norm": 1.1054561138153076, + "learning_rate": 0.00044353852425601305, + "loss": 3.6021, + "step": 65545 + }, + { + "epoch": 4.453730126375866, + "grad_norm": 0.7898555397987366, + "learning_rate": 0.00044349605924718033, + "loss": 3.4242, + "step": 65550 + }, + { + "epoch": 4.454069846446528, + "grad_norm": 0.841025173664093, + "learning_rate": 0.0004434535942383476, + "loss": 3.4786, + "step": 65555 + }, + { + "epoch": 4.45440956651719, + "grad_norm": 0.6612181067466736, + "learning_rate": 0.0004434111292295149, + "loss": 3.4722, + "step": 65560 + }, + { + "epoch": 4.454749286587852, + "grad_norm": 0.9085617661476135, + "learning_rate": 0.0004433686642206822, + "loss": 3.3709, + "step": 65565 + }, + { + "epoch": 4.455089006658513, + "grad_norm": 1.0491480827331543, + "learning_rate": 0.00044332619921184945, + "loss": 3.7237, + "step": 65570 + }, + { + "epoch": 4.455428726729175, + "grad_norm": 0.6503285765647888, + "learning_rate": 0.00044328373420301673, + "loss": 3.6532, + "step": 65575 + }, + { + "epoch": 4.455768446799837, + "grad_norm": 0.8887665867805481, + "learning_rate": 0.000443241269194184, + "loss": 3.338, + "step": 65580 + }, + { + "epoch": 4.456108166870498, + "grad_norm": 0.9628685712814331, + "learning_rate": 0.0004431988041853513, + "loss": 3.2992, + "step": 65585 + }, + { + "epoch": 4.45644788694116, + "grad_norm": 1.0476009845733643, + "learning_rate": 0.0004431563391765185, + "loss": 3.379, + "step": 65590 + }, + { + "epoch": 4.456787607011822, + "grad_norm": 0.7614072561264038, + "learning_rate": 0.00044311387416768585, + "loss": 3.5493, + "step": 65595 + }, + { + "epoch": 4.457127327082484, + "grad_norm": 0.9113505482673645, + "learning_rate": 0.00044307140915885313, + "loss": 3.4906, + "step": 65600 + }, + { + "epoch": 4.457467047153146, + "grad_norm": 0.997184157371521, + "learning_rate": 0.00044302894415002036, + "loss": 3.5172, + "step": 65605 + }, + { + "epoch": 4.457806767223808, + "grad_norm": 1.2120007276535034, + "learning_rate": 0.0004429864791411877, + "loss": 3.5239, + "step": 65610 + }, + { + "epoch": 4.458146487294469, + "grad_norm": 2.3843955993652344, + "learning_rate": 0.000442944014132355, + "loss": 3.5361, + "step": 65615 + }, + { + "epoch": 4.458486207365131, + "grad_norm": 0.7807196974754333, + "learning_rate": 0.0004429015491235222, + "loss": 3.723, + "step": 65620 + }, + { + "epoch": 4.458825927435793, + "grad_norm": 0.8565259575843811, + "learning_rate": 0.0004428590841146895, + "loss": 3.185, + "step": 65625 + }, + { + "epoch": 4.459165647506454, + "grad_norm": 1.2892051935195923, + "learning_rate": 0.0004428166191058568, + "loss": 3.4832, + "step": 65630 + }, + { + "epoch": 4.459505367577116, + "grad_norm": 0.9703664779663086, + "learning_rate": 0.00044277415409702404, + "loss": 3.434, + "step": 65635 + }, + { + "epoch": 4.459845087647778, + "grad_norm": 0.8646153211593628, + "learning_rate": 0.0004427316890881913, + "loss": 3.7456, + "step": 65640 + }, + { + "epoch": 4.46018480771844, + "grad_norm": 1.162142038345337, + "learning_rate": 0.00044268922407935865, + "loss": 3.4297, + "step": 65645 + }, + { + "epoch": 4.460524527789102, + "grad_norm": 0.8448032736778259, + "learning_rate": 0.0004426467590705259, + "loss": 3.4193, + "step": 65650 + }, + { + "epoch": 4.460864247859764, + "grad_norm": 1.0093001127243042, + "learning_rate": 0.00044260429406169316, + "loss": 3.437, + "step": 65655 + }, + { + "epoch": 4.461203967930425, + "grad_norm": 1.0680816173553467, + "learning_rate": 0.00044256182905286044, + "loss": 3.2535, + "step": 65660 + }, + { + "epoch": 4.461543688001087, + "grad_norm": 0.6940823197364807, + "learning_rate": 0.0004425193640440277, + "loss": 3.4845, + "step": 65665 + }, + { + "epoch": 4.461883408071749, + "grad_norm": 0.9623860716819763, + "learning_rate": 0.000442476899035195, + "loss": 3.5207, + "step": 65670 + }, + { + "epoch": 4.46222312814241, + "grad_norm": 0.9328630566596985, + "learning_rate": 0.0004424344340263623, + "loss": 3.6322, + "step": 65675 + }, + { + "epoch": 4.462562848213072, + "grad_norm": 0.64179527759552, + "learning_rate": 0.00044239196901752956, + "loss": 3.5192, + "step": 65680 + }, + { + "epoch": 4.462902568283734, + "grad_norm": 1.0220202207565308, + "learning_rate": 0.00044234950400869684, + "loss": 3.5782, + "step": 65685 + }, + { + "epoch": 4.463242288354396, + "grad_norm": 1.052938461303711, + "learning_rate": 0.0004423070389998641, + "loss": 3.5671, + "step": 65690 + }, + { + "epoch": 4.463582008425058, + "grad_norm": 1.1928592920303345, + "learning_rate": 0.0004422645739910314, + "loss": 3.2336, + "step": 65695 + }, + { + "epoch": 4.46392172849572, + "grad_norm": 0.9484936594963074, + "learning_rate": 0.0004422221089821987, + "loss": 3.4009, + "step": 65700 + }, + { + "epoch": 4.464261448566381, + "grad_norm": 0.797093391418457, + "learning_rate": 0.00044217964397336596, + "loss": 3.4885, + "step": 65705 + }, + { + "epoch": 4.464601168637043, + "grad_norm": 0.9792342782020569, + "learning_rate": 0.00044213717896453324, + "loss": 3.3853, + "step": 65710 + }, + { + "epoch": 4.464940888707705, + "grad_norm": 0.9235125780105591, + "learning_rate": 0.0004420947139557005, + "loss": 3.4614, + "step": 65715 + }, + { + "epoch": 4.465280608778366, + "grad_norm": 1.0308688879013062, + "learning_rate": 0.0004420522489468678, + "loss": 3.5605, + "step": 65720 + }, + { + "epoch": 4.465620328849028, + "grad_norm": 1.1906712055206299, + "learning_rate": 0.0004420097839380351, + "loss": 3.4512, + "step": 65725 + }, + { + "epoch": 4.4659600489196905, + "grad_norm": 0.8207629323005676, + "learning_rate": 0.0004419673189292023, + "loss": 3.3041, + "step": 65730 + }, + { + "epoch": 4.466299768990352, + "grad_norm": 0.8904368281364441, + "learning_rate": 0.00044192485392036964, + "loss": 3.4358, + "step": 65735 + }, + { + "epoch": 4.466639489061014, + "grad_norm": 0.9362987875938416, + "learning_rate": 0.0004418823889115369, + "loss": 3.1208, + "step": 65740 + }, + { + "epoch": 4.466979209131676, + "grad_norm": 0.9011346101760864, + "learning_rate": 0.00044183992390270415, + "loss": 3.3551, + "step": 65745 + }, + { + "epoch": 4.467318929202337, + "grad_norm": 0.7240453362464905, + "learning_rate": 0.0004417974588938715, + "loss": 3.4207, + "step": 65750 + }, + { + "epoch": 4.467658649272999, + "grad_norm": 0.9561145901679993, + "learning_rate": 0.00044175499388503876, + "loss": 3.4311, + "step": 65755 + }, + { + "epoch": 4.467998369343661, + "grad_norm": 0.8010588884353638, + "learning_rate": 0.000441712528876206, + "loss": 3.7481, + "step": 65760 + }, + { + "epoch": 4.468338089414322, + "grad_norm": 1.1368838548660278, + "learning_rate": 0.00044167006386737327, + "loss": 3.5027, + "step": 65765 + }, + { + "epoch": 4.468677809484984, + "grad_norm": 0.8991111516952515, + "learning_rate": 0.0004416275988585406, + "loss": 3.6248, + "step": 65770 + }, + { + "epoch": 4.4690175295556465, + "grad_norm": 0.7511616945266724, + "learning_rate": 0.00044158513384970783, + "loss": 3.4997, + "step": 65775 + }, + { + "epoch": 4.469357249626308, + "grad_norm": 0.832809567451477, + "learning_rate": 0.0004415426688408751, + "loss": 3.2685, + "step": 65780 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.8109950423240662, + "learning_rate": 0.00044150020383204244, + "loss": 3.3188, + "step": 65785 + }, + { + "epoch": 4.470036689767632, + "grad_norm": 0.8105044960975647, + "learning_rate": 0.00044145773882320967, + "loss": 3.4355, + "step": 65790 + }, + { + "epoch": 4.470376409838293, + "grad_norm": 0.8461794853210449, + "learning_rate": 0.00044141527381437695, + "loss": 3.6628, + "step": 65795 + }, + { + "epoch": 4.470716129908955, + "grad_norm": 0.9444897770881653, + "learning_rate": 0.0004413728088055443, + "loss": 3.4216, + "step": 65800 + }, + { + "epoch": 4.471055849979617, + "grad_norm": 1.1487935781478882, + "learning_rate": 0.0004413303437967115, + "loss": 3.5516, + "step": 65805 + }, + { + "epoch": 4.471395570050278, + "grad_norm": 0.9000996947288513, + "learning_rate": 0.0004412878787878788, + "loss": 3.6127, + "step": 65810 + }, + { + "epoch": 4.47173529012094, + "grad_norm": 0.759964108467102, + "learning_rate": 0.00044124541377904607, + "loss": 3.3251, + "step": 65815 + }, + { + "epoch": 4.4720750101916025, + "grad_norm": 0.9669622778892517, + "learning_rate": 0.00044120294877021335, + "loss": 3.4359, + "step": 65820 + }, + { + "epoch": 4.472414730262264, + "grad_norm": 1.0597937107086182, + "learning_rate": 0.00044116048376138063, + "loss": 3.3399, + "step": 65825 + }, + { + "epoch": 4.472754450332926, + "grad_norm": 0.9722199440002441, + "learning_rate": 0.0004411180187525479, + "loss": 3.4974, + "step": 65830 + }, + { + "epoch": 4.473094170403588, + "grad_norm": 0.8904256224632263, + "learning_rate": 0.0004410755537437152, + "loss": 3.3262, + "step": 65835 + }, + { + "epoch": 4.473433890474249, + "grad_norm": 0.937782347202301, + "learning_rate": 0.00044103308873488247, + "loss": 3.574, + "step": 65840 + }, + { + "epoch": 4.473773610544911, + "grad_norm": 0.8939664363861084, + "learning_rate": 0.00044099062372604975, + "loss": 3.2764, + "step": 65845 + }, + { + "epoch": 4.474113330615573, + "grad_norm": 1.163545846939087, + "learning_rate": 0.000440948158717217, + "loss": 3.2814, + "step": 65850 + }, + { + "epoch": 4.474453050686234, + "grad_norm": 1.757050633430481, + "learning_rate": 0.0004409056937083843, + "loss": 3.2161, + "step": 65855 + }, + { + "epoch": 4.474792770756896, + "grad_norm": 0.7318170666694641, + "learning_rate": 0.0004408632286995516, + "loss": 3.3097, + "step": 65860 + }, + { + "epoch": 4.4751324908275585, + "grad_norm": 0.843804657459259, + "learning_rate": 0.00044082076369071887, + "loss": 3.4979, + "step": 65865 + }, + { + "epoch": 4.47547221089822, + "grad_norm": 0.7631068229675293, + "learning_rate": 0.00044077829868188615, + "loss": 3.3792, + "step": 65870 + }, + { + "epoch": 4.475811930968882, + "grad_norm": 1.1167502403259277, + "learning_rate": 0.00044073583367305343, + "loss": 3.5401, + "step": 65875 + }, + { + "epoch": 4.476151651039544, + "grad_norm": 0.7828704118728638, + "learning_rate": 0.0004406933686642207, + "loss": 3.4924, + "step": 65880 + }, + { + "epoch": 4.476491371110205, + "grad_norm": 0.9671616554260254, + "learning_rate": 0.00044065090365538794, + "loss": 3.3358, + "step": 65885 + }, + { + "epoch": 4.476831091180867, + "grad_norm": 0.9829466342926025, + "learning_rate": 0.00044060843864655527, + "loss": 3.7525, + "step": 65890 + }, + { + "epoch": 4.477170811251529, + "grad_norm": 0.8248799443244934, + "learning_rate": 0.00044056597363772255, + "loss": 3.4879, + "step": 65895 + }, + { + "epoch": 4.47751053132219, + "grad_norm": 1.0757858753204346, + "learning_rate": 0.0004405235086288898, + "loss": 3.3434, + "step": 65900 + }, + { + "epoch": 4.477850251392852, + "grad_norm": 0.9598780274391174, + "learning_rate": 0.0004404810436200571, + "loss": 3.4282, + "step": 65905 + }, + { + "epoch": 4.4781899714635145, + "grad_norm": 0.8108580112457275, + "learning_rate": 0.0004404385786112244, + "loss": 3.2747, + "step": 65910 + }, + { + "epoch": 4.478529691534176, + "grad_norm": 0.7971905469894409, + "learning_rate": 0.0004403961136023916, + "loss": 3.6849, + "step": 65915 + }, + { + "epoch": 4.478869411604838, + "grad_norm": 0.9227086901664734, + "learning_rate": 0.0004403536485935589, + "loss": 3.2659, + "step": 65920 + }, + { + "epoch": 4.4792091316755, + "grad_norm": 0.8887190818786621, + "learning_rate": 0.00044031118358472623, + "loss": 3.5092, + "step": 65925 + }, + { + "epoch": 4.479548851746161, + "grad_norm": 0.8663660287857056, + "learning_rate": 0.00044026871857589346, + "loss": 3.4442, + "step": 65930 + }, + { + "epoch": 4.479888571816823, + "grad_norm": 1.08977472782135, + "learning_rate": 0.00044022625356706074, + "loss": 3.4442, + "step": 65935 + }, + { + "epoch": 4.480228291887485, + "grad_norm": 0.7511997222900391, + "learning_rate": 0.00044018378855822807, + "loss": 3.4317, + "step": 65940 + }, + { + "epoch": 4.480568011958146, + "grad_norm": 1.2239910364151, + "learning_rate": 0.0004401413235493953, + "loss": 3.3159, + "step": 65945 + }, + { + "epoch": 4.480907732028808, + "grad_norm": 0.9081506729125977, + "learning_rate": 0.0004400988585405626, + "loss": 3.5081, + "step": 65950 + }, + { + "epoch": 4.4812474520994705, + "grad_norm": 0.9353605508804321, + "learning_rate": 0.00044005639353172986, + "loss": 3.5791, + "step": 65955 + }, + { + "epoch": 4.481587172170132, + "grad_norm": 0.9648020267486572, + "learning_rate": 0.00044001392852289714, + "loss": 3.546, + "step": 65960 + }, + { + "epoch": 4.481926892240794, + "grad_norm": 0.8220242857933044, + "learning_rate": 0.0004399714635140644, + "loss": 3.4124, + "step": 65965 + }, + { + "epoch": 4.482266612311455, + "grad_norm": 0.8394401669502258, + "learning_rate": 0.0004399289985052317, + "loss": 3.2902, + "step": 65970 + }, + { + "epoch": 4.482606332382117, + "grad_norm": 0.8521081805229187, + "learning_rate": 0.000439886533496399, + "loss": 3.4379, + "step": 65975 + }, + { + "epoch": 4.482946052452779, + "grad_norm": 0.9047088027000427, + "learning_rate": 0.00043984406848756626, + "loss": 3.4913, + "step": 65980 + }, + { + "epoch": 4.48328577252344, + "grad_norm": 1.257603645324707, + "learning_rate": 0.00043980160347873354, + "loss": 3.6158, + "step": 65985 + }, + { + "epoch": 4.483625492594102, + "grad_norm": 0.6776742339134216, + "learning_rate": 0.00043975913846990076, + "loss": 3.3903, + "step": 65990 + }, + { + "epoch": 4.483965212664764, + "grad_norm": 0.8954745531082153, + "learning_rate": 0.0004397166734610681, + "loss": 3.3705, + "step": 65995 + }, + { + "epoch": 4.484304932735426, + "grad_norm": 0.8811147212982178, + "learning_rate": 0.0004396742084522354, + "loss": 3.3216, + "step": 66000 + }, + { + "epoch": 4.484644652806088, + "grad_norm": 1.15407133102417, + "learning_rate": 0.0004396317434434026, + "loss": 3.6145, + "step": 66005 + }, + { + "epoch": 4.48498437287675, + "grad_norm": 0.9171308279037476, + "learning_rate": 0.00043958927843456994, + "loss": 3.2937, + "step": 66010 + }, + { + "epoch": 4.485324092947411, + "grad_norm": 0.8404129147529602, + "learning_rate": 0.0004395468134257372, + "loss": 3.584, + "step": 66015 + }, + { + "epoch": 4.485663813018073, + "grad_norm": 0.81142258644104, + "learning_rate": 0.00043950434841690444, + "loss": 3.3727, + "step": 66020 + }, + { + "epoch": 4.486003533088735, + "grad_norm": 0.8440983891487122, + "learning_rate": 0.0004394618834080717, + "loss": 3.3521, + "step": 66025 + }, + { + "epoch": 4.486343253159396, + "grad_norm": 1.0008430480957031, + "learning_rate": 0.00043941941839923906, + "loss": 3.3574, + "step": 66030 + }, + { + "epoch": 4.486682973230058, + "grad_norm": 1.0336220264434814, + "learning_rate": 0.00043937695339040634, + "loss": 3.3368, + "step": 66035 + }, + { + "epoch": 4.4870226933007205, + "grad_norm": 0.9227715730667114, + "learning_rate": 0.00043933448838157356, + "loss": 3.6166, + "step": 66040 + }, + { + "epoch": 4.487362413371382, + "grad_norm": 1.1628543138504028, + "learning_rate": 0.0004392920233727409, + "loss": 3.6135, + "step": 66045 + }, + { + "epoch": 4.487702133442044, + "grad_norm": 0.9720963835716248, + "learning_rate": 0.0004392495583639082, + "loss": 3.5685, + "step": 66050 + }, + { + "epoch": 4.488041853512706, + "grad_norm": 0.7822572588920593, + "learning_rate": 0.0004392070933550754, + "loss": 3.2747, + "step": 66055 + }, + { + "epoch": 4.488381573583367, + "grad_norm": 0.8414997458457947, + "learning_rate": 0.00043916462834624274, + "loss": 3.4729, + "step": 66060 + }, + { + "epoch": 4.488721293654029, + "grad_norm": 0.8095850944519043, + "learning_rate": 0.00043912216333741, + "loss": 3.5511, + "step": 66065 + }, + { + "epoch": 4.489061013724691, + "grad_norm": 0.9841702580451965, + "learning_rate": 0.00043907969832857724, + "loss": 3.63, + "step": 66070 + }, + { + "epoch": 4.489400733795352, + "grad_norm": 0.9968935251235962, + "learning_rate": 0.0004390372333197445, + "loss": 3.4721, + "step": 66075 + }, + { + "epoch": 4.489740453866014, + "grad_norm": 0.9655702114105225, + "learning_rate": 0.00043899476831091186, + "loss": 3.3523, + "step": 66080 + }, + { + "epoch": 4.4900801739366765, + "grad_norm": 1.0748201608657837, + "learning_rate": 0.0004389523033020791, + "loss": 3.4517, + "step": 66085 + }, + { + "epoch": 4.490419894007338, + "grad_norm": 0.9846497178077698, + "learning_rate": 0.00043890983829324636, + "loss": 3.5396, + "step": 66090 + }, + { + "epoch": 4.490759614078, + "grad_norm": 0.8847998976707458, + "learning_rate": 0.0004388673732844137, + "loss": 3.2043, + "step": 66095 + }, + { + "epoch": 4.491099334148662, + "grad_norm": 1.0119754076004028, + "learning_rate": 0.0004388249082755809, + "loss": 3.3207, + "step": 66100 + }, + { + "epoch": 4.491439054219323, + "grad_norm": 0.8346574306488037, + "learning_rate": 0.0004387824432667482, + "loss": 3.4641, + "step": 66105 + }, + { + "epoch": 4.491778774289985, + "grad_norm": 0.9469310641288757, + "learning_rate": 0.0004387399782579155, + "loss": 3.4154, + "step": 66110 + }, + { + "epoch": 4.492118494360647, + "grad_norm": 0.8790440559387207, + "learning_rate": 0.00043869751324908277, + "loss": 3.4467, + "step": 66115 + }, + { + "epoch": 4.492458214431308, + "grad_norm": 0.8098751306533813, + "learning_rate": 0.00043865504824025005, + "loss": 3.7322, + "step": 66120 + }, + { + "epoch": 4.49279793450197, + "grad_norm": 0.6582716107368469, + "learning_rate": 0.0004386125832314173, + "loss": 3.4243, + "step": 66125 + }, + { + "epoch": 4.4931376545726325, + "grad_norm": 1.1199787855148315, + "learning_rate": 0.0004385701182225846, + "loss": 3.4451, + "step": 66130 + }, + { + "epoch": 4.493477374643294, + "grad_norm": 1.1899428367614746, + "learning_rate": 0.0004385276532137519, + "loss": 3.4274, + "step": 66135 + }, + { + "epoch": 4.493817094713956, + "grad_norm": 0.8270637392997742, + "learning_rate": 0.00043848518820491917, + "loss": 3.5381, + "step": 66140 + }, + { + "epoch": 4.494156814784618, + "grad_norm": 0.786910355091095, + "learning_rate": 0.0004384427231960864, + "loss": 3.4966, + "step": 66145 + }, + { + "epoch": 4.494496534855279, + "grad_norm": 0.8113481998443604, + "learning_rate": 0.0004384002581872537, + "loss": 3.4887, + "step": 66150 + }, + { + "epoch": 4.494836254925941, + "grad_norm": 0.7609441876411438, + "learning_rate": 0.000438357793178421, + "loss": 3.4209, + "step": 66155 + }, + { + "epoch": 4.495175974996603, + "grad_norm": 0.9320996403694153, + "learning_rate": 0.00043831532816958823, + "loss": 3.6521, + "step": 66160 + }, + { + "epoch": 4.495515695067264, + "grad_norm": 1.0775659084320068, + "learning_rate": 0.00043827286316075557, + "loss": 3.3968, + "step": 66165 + }, + { + "epoch": 4.495855415137926, + "grad_norm": 1.0194765329360962, + "learning_rate": 0.00043823039815192285, + "loss": 3.2662, + "step": 66170 + }, + { + "epoch": 4.4961951352085885, + "grad_norm": 1.0781058073043823, + "learning_rate": 0.00043818793314309007, + "loss": 3.546, + "step": 66175 + }, + { + "epoch": 4.49653485527925, + "grad_norm": 0.851560115814209, + "learning_rate": 0.00043814546813425735, + "loss": 3.4515, + "step": 66180 + }, + { + "epoch": 4.496874575349912, + "grad_norm": 1.0154660940170288, + "learning_rate": 0.0004381030031254247, + "loss": 3.3518, + "step": 66185 + }, + { + "epoch": 4.497214295420574, + "grad_norm": 0.8541907668113708, + "learning_rate": 0.0004380605381165919, + "loss": 3.7383, + "step": 66190 + }, + { + "epoch": 4.497554015491235, + "grad_norm": 0.9435146450996399, + "learning_rate": 0.0004380180731077592, + "loss": 3.6163, + "step": 66195 + }, + { + "epoch": 4.497893735561897, + "grad_norm": 1.124619960784912, + "learning_rate": 0.0004379756080989265, + "loss": 3.5398, + "step": 66200 + }, + { + "epoch": 4.498233455632559, + "grad_norm": 0.9778558611869812, + "learning_rate": 0.0004379331430900938, + "loss": 3.3137, + "step": 66205 + }, + { + "epoch": 4.49857317570322, + "grad_norm": 0.757222592830658, + "learning_rate": 0.00043789067808126103, + "loss": 3.6536, + "step": 66210 + }, + { + "epoch": 4.498912895773882, + "grad_norm": 0.7816507816314697, + "learning_rate": 0.0004378482130724283, + "loss": 3.3967, + "step": 66215 + }, + { + "epoch": 4.4992526158445445, + "grad_norm": 0.8152292966842651, + "learning_rate": 0.00043780574806359565, + "loss": 3.4377, + "step": 66220 + }, + { + "epoch": 4.499592335915206, + "grad_norm": 1.1455628871917725, + "learning_rate": 0.00043776328305476287, + "loss": 3.5647, + "step": 66225 + }, + { + "epoch": 4.499932055985868, + "grad_norm": 0.7937171459197998, + "learning_rate": 0.00043772081804593015, + "loss": 3.5486, + "step": 66230 + }, + { + "epoch": 4.500271776056529, + "grad_norm": 0.9103925228118896, + "learning_rate": 0.0004376783530370975, + "loss": 3.3742, + "step": 66235 + }, + { + "epoch": 4.500611496127191, + "grad_norm": 1.0693119764328003, + "learning_rate": 0.0004376358880282647, + "loss": 3.2389, + "step": 66240 + }, + { + "epoch": 4.500951216197853, + "grad_norm": 0.9325717687606812, + "learning_rate": 0.000437593423019432, + "loss": 3.4791, + "step": 66245 + }, + { + "epoch": 4.501290936268514, + "grad_norm": 1.5204144716262817, + "learning_rate": 0.0004375509580105993, + "loss": 3.4836, + "step": 66250 + }, + { + "epoch": 4.501630656339176, + "grad_norm": 0.8243429660797119, + "learning_rate": 0.00043750849300176655, + "loss": 3.1457, + "step": 66255 + }, + { + "epoch": 4.501970376409838, + "grad_norm": 0.7662097215652466, + "learning_rate": 0.00043746602799293383, + "loss": 3.4157, + "step": 66260 + }, + { + "epoch": 4.5023100964805, + "grad_norm": 0.8040570020675659, + "learning_rate": 0.0004374235629841011, + "loss": 3.3533, + "step": 66265 + }, + { + "epoch": 4.502649816551162, + "grad_norm": 0.9656363725662231, + "learning_rate": 0.0004373810979752684, + "loss": 3.3263, + "step": 66270 + }, + { + "epoch": 4.502989536621824, + "grad_norm": 0.8828940391540527, + "learning_rate": 0.0004373386329664357, + "loss": 3.5255, + "step": 66275 + }, + { + "epoch": 4.503329256692485, + "grad_norm": 1.1273237466812134, + "learning_rate": 0.00043729616795760295, + "loss": 3.4955, + "step": 66280 + }, + { + "epoch": 4.503668976763147, + "grad_norm": 0.8365733623504639, + "learning_rate": 0.0004372537029487702, + "loss": 3.5936, + "step": 66285 + }, + { + "epoch": 4.504008696833809, + "grad_norm": 1.0020831823349, + "learning_rate": 0.0004372112379399375, + "loss": 3.6356, + "step": 66290 + }, + { + "epoch": 4.50434841690447, + "grad_norm": 0.869614839553833, + "learning_rate": 0.0004371687729311048, + "loss": 3.4606, + "step": 66295 + }, + { + "epoch": 4.504688136975132, + "grad_norm": 0.8969299793243408, + "learning_rate": 0.000437126307922272, + "loss": 3.519, + "step": 66300 + }, + { + "epoch": 4.5050278570457944, + "grad_norm": 0.7851541042327881, + "learning_rate": 0.00043708384291343935, + "loss": 3.5518, + "step": 66305 + }, + { + "epoch": 4.505367577116456, + "grad_norm": 1.2252655029296875, + "learning_rate": 0.00043704137790460663, + "loss": 3.2516, + "step": 66310 + }, + { + "epoch": 4.505707297187118, + "grad_norm": 0.9728975892066956, + "learning_rate": 0.00043699891289577386, + "loss": 3.421, + "step": 66315 + }, + { + "epoch": 4.50604701725778, + "grad_norm": 0.9106217622756958, + "learning_rate": 0.00043695644788694114, + "loss": 3.4602, + "step": 66320 + }, + { + "epoch": 4.506386737328441, + "grad_norm": 0.8873616456985474, + "learning_rate": 0.0004369139828781085, + "loss": 3.6343, + "step": 66325 + }, + { + "epoch": 4.506726457399103, + "grad_norm": 0.9423653483390808, + "learning_rate": 0.0004368715178692757, + "loss": 3.4407, + "step": 66330 + }, + { + "epoch": 4.507066177469765, + "grad_norm": 0.77431321144104, + "learning_rate": 0.000436829052860443, + "loss": 3.5977, + "step": 66335 + }, + { + "epoch": 4.507405897540426, + "grad_norm": 1.0570741891860962, + "learning_rate": 0.0004367865878516103, + "loss": 3.0323, + "step": 66340 + }, + { + "epoch": 4.507745617611088, + "grad_norm": 0.9476856589317322, + "learning_rate": 0.00043674412284277754, + "loss": 3.7621, + "step": 66345 + }, + { + "epoch": 4.5080853376817505, + "grad_norm": 0.859182596206665, + "learning_rate": 0.0004367016578339448, + "loss": 3.4223, + "step": 66350 + }, + { + "epoch": 4.508425057752412, + "grad_norm": 0.9640282392501831, + "learning_rate": 0.00043665919282511215, + "loss": 3.5224, + "step": 66355 + }, + { + "epoch": 4.508764777823074, + "grad_norm": 0.7077986001968384, + "learning_rate": 0.0004366167278162794, + "loss": 3.6068, + "step": 66360 + }, + { + "epoch": 4.509104497893736, + "grad_norm": 0.8591490983963013, + "learning_rate": 0.00043657426280744666, + "loss": 3.7581, + "step": 66365 + }, + { + "epoch": 4.509444217964397, + "grad_norm": 1.065057396888733, + "learning_rate": 0.00043653179779861394, + "loss": 3.4753, + "step": 66370 + }, + { + "epoch": 4.509783938035059, + "grad_norm": 0.8846026062965393, + "learning_rate": 0.0004364893327897812, + "loss": 3.3444, + "step": 66375 + }, + { + "epoch": 4.510123658105721, + "grad_norm": 0.7998479008674622, + "learning_rate": 0.0004364468677809485, + "loss": 3.5513, + "step": 66380 + }, + { + "epoch": 4.510463378176382, + "grad_norm": 0.8918149471282959, + "learning_rate": 0.0004364044027721158, + "loss": 3.6722, + "step": 66385 + }, + { + "epoch": 4.510803098247044, + "grad_norm": 0.9165902137756348, + "learning_rate": 0.0004363619377632831, + "loss": 3.4668, + "step": 66390 + }, + { + "epoch": 4.5111428183177065, + "grad_norm": 0.7888235449790955, + "learning_rate": 0.00043631947275445034, + "loss": 3.3043, + "step": 66395 + }, + { + "epoch": 4.511482538388368, + "grad_norm": 1.1385672092437744, + "learning_rate": 0.0004362770077456176, + "loss": 3.2745, + "step": 66400 + }, + { + "epoch": 4.51182225845903, + "grad_norm": 1.0574147701263428, + "learning_rate": 0.0004362345427367849, + "loss": 3.5623, + "step": 66405 + }, + { + "epoch": 4.512161978529692, + "grad_norm": 1.376930594444275, + "learning_rate": 0.0004361920777279522, + "loss": 3.4686, + "step": 66410 + }, + { + "epoch": 4.512501698600353, + "grad_norm": 0.8158301711082458, + "learning_rate": 0.00043614961271911946, + "loss": 3.4171, + "step": 66415 + }, + { + "epoch": 4.512841418671015, + "grad_norm": 1.3513548374176025, + "learning_rate": 0.00043610714771028674, + "loss": 3.4529, + "step": 66420 + }, + { + "epoch": 4.513181138741677, + "grad_norm": 0.857384204864502, + "learning_rate": 0.000436064682701454, + "loss": 3.3711, + "step": 66425 + }, + { + "epoch": 4.513520858812338, + "grad_norm": 0.9187546372413635, + "learning_rate": 0.0004360222176926213, + "loss": 3.3019, + "step": 66430 + }, + { + "epoch": 4.513860578883, + "grad_norm": 0.9559286236763, + "learning_rate": 0.0004359797526837886, + "loss": 3.6438, + "step": 66435 + }, + { + "epoch": 4.5142002989536625, + "grad_norm": 1.0374987125396729, + "learning_rate": 0.0004359372876749558, + "loss": 3.633, + "step": 66440 + }, + { + "epoch": 4.514540019024324, + "grad_norm": 0.9533666372299194, + "learning_rate": 0.00043589482266612314, + "loss": 3.4262, + "step": 66445 + }, + { + "epoch": 4.514879739094986, + "grad_norm": 0.8687214255332947, + "learning_rate": 0.0004358523576572904, + "loss": 3.5272, + "step": 66450 + }, + { + "epoch": 4.515219459165648, + "grad_norm": 0.8457769751548767, + "learning_rate": 0.00043580989264845765, + "loss": 3.3366, + "step": 66455 + }, + { + "epoch": 4.515559179236309, + "grad_norm": 0.7652486562728882, + "learning_rate": 0.000435767427639625, + "loss": 3.4611, + "step": 66460 + }, + { + "epoch": 4.515898899306971, + "grad_norm": 0.8273781538009644, + "learning_rate": 0.00043572496263079226, + "loss": 3.5143, + "step": 66465 + }, + { + "epoch": 4.516238619377633, + "grad_norm": 0.9512889981269836, + "learning_rate": 0.0004356824976219595, + "loss": 3.5331, + "step": 66470 + }, + { + "epoch": 4.516578339448294, + "grad_norm": 0.9495568871498108, + "learning_rate": 0.00043564003261312677, + "loss": 3.1873, + "step": 66475 + }, + { + "epoch": 4.516918059518956, + "grad_norm": 0.8809605240821838, + "learning_rate": 0.0004355975676042941, + "loss": 3.2277, + "step": 66480 + }, + { + "epoch": 4.5172577795896185, + "grad_norm": 0.804263174533844, + "learning_rate": 0.00043555510259546133, + "loss": 3.4377, + "step": 66485 + }, + { + "epoch": 4.51759749966028, + "grad_norm": 0.8776216506958008, + "learning_rate": 0.0004355126375866286, + "loss": 3.5522, + "step": 66490 + }, + { + "epoch": 4.517937219730942, + "grad_norm": 0.9398301839828491, + "learning_rate": 0.00043547017257779594, + "loss": 3.4927, + "step": 66495 + }, + { + "epoch": 4.518276939801604, + "grad_norm": 0.8675826787948608, + "learning_rate": 0.00043542770756896317, + "loss": 3.5883, + "step": 66500 + }, + { + "epoch": 4.518616659872265, + "grad_norm": 0.7657281756401062, + "learning_rate": 0.00043538524256013045, + "loss": 3.3136, + "step": 66505 + }, + { + "epoch": 4.518956379942927, + "grad_norm": 1.0676519870758057, + "learning_rate": 0.00043534277755129773, + "loss": 3.5366, + "step": 66510 + }, + { + "epoch": 4.519296100013589, + "grad_norm": 0.9114040732383728, + "learning_rate": 0.000435300312542465, + "loss": 3.4378, + "step": 66515 + }, + { + "epoch": 4.51963582008425, + "grad_norm": 0.8184763789176941, + "learning_rate": 0.0004352578475336323, + "loss": 3.5482, + "step": 66520 + }, + { + "epoch": 4.519975540154912, + "grad_norm": 0.8254773020744324, + "learning_rate": 0.00043521538252479957, + "loss": 3.5537, + "step": 66525 + }, + { + "epoch": 4.5203152602255745, + "grad_norm": 0.6760982275009155, + "learning_rate": 0.00043517291751596685, + "loss": 3.5283, + "step": 66530 + }, + { + "epoch": 4.520654980296236, + "grad_norm": 0.7849103808403015, + "learning_rate": 0.00043513045250713413, + "loss": 3.3145, + "step": 66535 + }, + { + "epoch": 4.520994700366898, + "grad_norm": 0.8778895735740662, + "learning_rate": 0.0004350879874983014, + "loss": 3.5573, + "step": 66540 + }, + { + "epoch": 4.52133442043756, + "grad_norm": 0.7815543413162231, + "learning_rate": 0.00043504552248946863, + "loss": 3.4657, + "step": 66545 + }, + { + "epoch": 4.521674140508221, + "grad_norm": 0.8976170420646667, + "learning_rate": 0.00043500305748063597, + "loss": 3.4953, + "step": 66550 + }, + { + "epoch": 4.522013860578883, + "grad_norm": 0.862889289855957, + "learning_rate": 0.00043496059247180325, + "loss": 3.389, + "step": 66555 + }, + { + "epoch": 4.522353580649545, + "grad_norm": 0.7615370154380798, + "learning_rate": 0.00043491812746297053, + "loss": 3.373, + "step": 66560 + }, + { + "epoch": 4.522693300720206, + "grad_norm": 0.8731809258460999, + "learning_rate": 0.0004348756624541378, + "loss": 3.4685, + "step": 66565 + }, + { + "epoch": 4.523033020790868, + "grad_norm": 0.8075938820838928, + "learning_rate": 0.0004348331974453051, + "loss": 3.7727, + "step": 66570 + }, + { + "epoch": 4.5233727408615305, + "grad_norm": 1.067079782485962, + "learning_rate": 0.00043479073243647237, + "loss": 3.2367, + "step": 66575 + }, + { + "epoch": 4.523712460932192, + "grad_norm": 0.7081294655799866, + "learning_rate": 0.0004347482674276396, + "loss": 3.462, + "step": 66580 + }, + { + "epoch": 4.524052181002854, + "grad_norm": 0.9149375557899475, + "learning_rate": 0.00043470580241880693, + "loss": 3.4304, + "step": 66585 + }, + { + "epoch": 4.524391901073516, + "grad_norm": 0.7500657439231873, + "learning_rate": 0.0004346633374099742, + "loss": 3.6862, + "step": 66590 + }, + { + "epoch": 4.524731621144177, + "grad_norm": 0.6426228880882263, + "learning_rate": 0.00043462087240114144, + "loss": 3.3784, + "step": 66595 + }, + { + "epoch": 4.525071341214839, + "grad_norm": 0.9446566700935364, + "learning_rate": 0.00043457840739230877, + "loss": 3.624, + "step": 66600 + }, + { + "epoch": 4.525411061285501, + "grad_norm": 1.1000287532806396, + "learning_rate": 0.00043453594238347605, + "loss": 3.5959, + "step": 66605 + }, + { + "epoch": 4.525750781356162, + "grad_norm": 0.9022693037986755, + "learning_rate": 0.0004344934773746433, + "loss": 3.4527, + "step": 66610 + }, + { + "epoch": 4.5260905014268245, + "grad_norm": 1.2907336950302124, + "learning_rate": 0.00043445101236581056, + "loss": 3.4605, + "step": 66615 + }, + { + "epoch": 4.5264302214974865, + "grad_norm": 0.9457799792289734, + "learning_rate": 0.0004344085473569779, + "loss": 3.3191, + "step": 66620 + }, + { + "epoch": 4.526769941568148, + "grad_norm": 3.147824764251709, + "learning_rate": 0.0004343660823481451, + "loss": 3.246, + "step": 66625 + }, + { + "epoch": 4.52710966163881, + "grad_norm": 1.1260836124420166, + "learning_rate": 0.0004343236173393124, + "loss": 3.4189, + "step": 66630 + }, + { + "epoch": 4.527449381709472, + "grad_norm": 0.9453970789909363, + "learning_rate": 0.00043428115233047973, + "loss": 3.4274, + "step": 66635 + }, + { + "epoch": 4.527789101780133, + "grad_norm": 0.7287835478782654, + "learning_rate": 0.00043423868732164696, + "loss": 3.3615, + "step": 66640 + }, + { + "epoch": 4.528128821850795, + "grad_norm": 0.8731788992881775, + "learning_rate": 0.00043419622231281424, + "loss": 3.5073, + "step": 66645 + }, + { + "epoch": 4.528468541921457, + "grad_norm": 0.7559298872947693, + "learning_rate": 0.00043415375730398157, + "loss": 3.4935, + "step": 66650 + }, + { + "epoch": 4.528808261992118, + "grad_norm": 0.7116197347640991, + "learning_rate": 0.0004341112922951488, + "loss": 3.5331, + "step": 66655 + }, + { + "epoch": 4.5291479820627805, + "grad_norm": 0.8386355042457581, + "learning_rate": 0.0004340688272863161, + "loss": 3.3609, + "step": 66660 + }, + { + "epoch": 4.5294877021334425, + "grad_norm": 0.8887431621551514, + "learning_rate": 0.00043402636227748336, + "loss": 3.4812, + "step": 66665 + }, + { + "epoch": 4.529827422204104, + "grad_norm": 0.9075607061386108, + "learning_rate": 0.00043398389726865064, + "loss": 3.6228, + "step": 66670 + }, + { + "epoch": 4.530167142274766, + "grad_norm": 0.9503486156463623, + "learning_rate": 0.0004339414322598179, + "loss": 3.7013, + "step": 66675 + }, + { + "epoch": 4.530506862345427, + "grad_norm": 0.9523894190788269, + "learning_rate": 0.0004338989672509852, + "loss": 3.6798, + "step": 66680 + }, + { + "epoch": 4.530846582416089, + "grad_norm": 0.8966330885887146, + "learning_rate": 0.0004338565022421525, + "loss": 3.5456, + "step": 66685 + }, + { + "epoch": 4.531186302486751, + "grad_norm": 0.9148954749107361, + "learning_rate": 0.00043381403723331976, + "loss": 3.435, + "step": 66690 + }, + { + "epoch": 4.531526022557412, + "grad_norm": 0.8715833425521851, + "learning_rate": 0.00043377157222448704, + "loss": 3.5016, + "step": 66695 + }, + { + "epoch": 4.531865742628074, + "grad_norm": 0.8135961890220642, + "learning_rate": 0.00043372910721565426, + "loss": 3.3985, + "step": 66700 + }, + { + "epoch": 4.5322054626987365, + "grad_norm": 0.907426655292511, + "learning_rate": 0.0004336866422068216, + "loss": 3.5131, + "step": 66705 + }, + { + "epoch": 4.532545182769398, + "grad_norm": 0.9398869276046753, + "learning_rate": 0.0004336441771979889, + "loss": 3.5591, + "step": 66710 + }, + { + "epoch": 4.53288490284006, + "grad_norm": 0.9635887742042542, + "learning_rate": 0.0004336017121891561, + "loss": 3.3334, + "step": 66715 + }, + { + "epoch": 4.533224622910722, + "grad_norm": 1.150578260421753, + "learning_rate": 0.00043355924718032344, + "loss": 3.6606, + "step": 66720 + }, + { + "epoch": 4.533564342981383, + "grad_norm": 0.8901181221008301, + "learning_rate": 0.0004335167821714907, + "loss": 3.535, + "step": 66725 + }, + { + "epoch": 4.533904063052045, + "grad_norm": 1.0030391216278076, + "learning_rate": 0.000433474317162658, + "loss": 3.3849, + "step": 66730 + }, + { + "epoch": 4.534243783122707, + "grad_norm": 0.722649872303009, + "learning_rate": 0.0004334318521538252, + "loss": 3.4579, + "step": 66735 + }, + { + "epoch": 4.534583503193368, + "grad_norm": 0.8379092216491699, + "learning_rate": 0.00043338938714499256, + "loss": 3.5015, + "step": 66740 + }, + { + "epoch": 4.53492322326403, + "grad_norm": 0.8905465602874756, + "learning_rate": 0.00043334692213615984, + "loss": 3.6366, + "step": 66745 + }, + { + "epoch": 4.5352629433346925, + "grad_norm": 1.1721950769424438, + "learning_rate": 0.00043330445712732706, + "loss": 3.5031, + "step": 66750 + }, + { + "epoch": 4.535602663405354, + "grad_norm": 0.9789055585861206, + "learning_rate": 0.0004332619921184944, + "loss": 3.372, + "step": 66755 + }, + { + "epoch": 4.535942383476016, + "grad_norm": 0.7642418742179871, + "learning_rate": 0.0004332195271096617, + "loss": 3.5529, + "step": 66760 + }, + { + "epoch": 4.536282103546678, + "grad_norm": 0.7391638159751892, + "learning_rate": 0.0004331770621008289, + "loss": 3.3372, + "step": 66765 + }, + { + "epoch": 4.536621823617339, + "grad_norm": 0.8156479001045227, + "learning_rate": 0.0004331345970919962, + "loss": 3.3752, + "step": 66770 + }, + { + "epoch": 4.536961543688001, + "grad_norm": 0.9755398631095886, + "learning_rate": 0.0004330921320831635, + "loss": 3.5451, + "step": 66775 + }, + { + "epoch": 4.537301263758663, + "grad_norm": 1.0144672393798828, + "learning_rate": 0.00043304966707433074, + "loss": 3.2575, + "step": 66780 + }, + { + "epoch": 4.537640983829324, + "grad_norm": 1.0727170705795288, + "learning_rate": 0.000433007202065498, + "loss": 3.6098, + "step": 66785 + }, + { + "epoch": 4.537980703899986, + "grad_norm": 0.9148262739181519, + "learning_rate": 0.00043296473705666536, + "loss": 3.5571, + "step": 66790 + }, + { + "epoch": 4.5383204239706485, + "grad_norm": 1.0542691946029663, + "learning_rate": 0.0004329222720478326, + "loss": 3.5765, + "step": 66795 + }, + { + "epoch": 4.53866014404131, + "grad_norm": 0.8970499634742737, + "learning_rate": 0.00043287980703899986, + "loss": 3.569, + "step": 66800 + }, + { + "epoch": 4.538999864111972, + "grad_norm": 0.8786765933036804, + "learning_rate": 0.00043283734203016714, + "loss": 3.6079, + "step": 66805 + }, + { + "epoch": 4.539339584182634, + "grad_norm": 0.8521862030029297, + "learning_rate": 0.0004327948770213344, + "loss": 3.944, + "step": 66810 + }, + { + "epoch": 4.539679304253295, + "grad_norm": 1.0088995695114136, + "learning_rate": 0.0004327524120125017, + "loss": 3.6874, + "step": 66815 + }, + { + "epoch": 4.540019024323957, + "grad_norm": 0.8235347270965576, + "learning_rate": 0.000432709947003669, + "loss": 3.5681, + "step": 66820 + }, + { + "epoch": 4.540358744394619, + "grad_norm": 0.8564519286155701, + "learning_rate": 0.00043266748199483627, + "loss": 3.6369, + "step": 66825 + }, + { + "epoch": 4.54069846446528, + "grad_norm": 0.7123836278915405, + "learning_rate": 0.00043262501698600355, + "loss": 3.4682, + "step": 66830 + }, + { + "epoch": 4.541038184535942, + "grad_norm": 0.927152156829834, + "learning_rate": 0.0004325825519771708, + "loss": 3.5397, + "step": 66835 + }, + { + "epoch": 4.5413779046066045, + "grad_norm": 0.8476060032844543, + "learning_rate": 0.00043254008696833805, + "loss": 3.425, + "step": 66840 + }, + { + "epoch": 4.541717624677266, + "grad_norm": 1.2041747570037842, + "learning_rate": 0.0004324976219595054, + "loss": 3.3659, + "step": 66845 + }, + { + "epoch": 4.542057344747928, + "grad_norm": 1.3024017810821533, + "learning_rate": 0.00043245515695067267, + "loss": 3.6119, + "step": 66850 + }, + { + "epoch": 4.54239706481859, + "grad_norm": 1.048058271408081, + "learning_rate": 0.0004324126919418399, + "loss": 3.4374, + "step": 66855 + }, + { + "epoch": 4.542736784889251, + "grad_norm": 0.6659095883369446, + "learning_rate": 0.0004323702269330072, + "loss": 3.491, + "step": 66860 + }, + { + "epoch": 4.543076504959913, + "grad_norm": 0.9252830147743225, + "learning_rate": 0.0004323277619241745, + "loss": 3.5374, + "step": 66865 + }, + { + "epoch": 4.543416225030575, + "grad_norm": 1.1475127935409546, + "learning_rate": 0.00043228529691534173, + "loss": 3.4657, + "step": 66870 + }, + { + "epoch": 4.543755945101236, + "grad_norm": 0.9352647662162781, + "learning_rate": 0.000432242831906509, + "loss": 3.6066, + "step": 66875 + }, + { + "epoch": 4.5440956651718984, + "grad_norm": 0.8748888373374939, + "learning_rate": 0.00043220036689767635, + "loss": 3.6275, + "step": 66880 + }, + { + "epoch": 4.54443538524256, + "grad_norm": 0.9884456396102905, + "learning_rate": 0.00043215790188884357, + "loss": 3.4348, + "step": 66885 + }, + { + "epoch": 4.544775105313222, + "grad_norm": 1.2057445049285889, + "learning_rate": 0.00043211543688001085, + "loss": 3.2978, + "step": 66890 + }, + { + "epoch": 4.545114825383884, + "grad_norm": 0.868394136428833, + "learning_rate": 0.0004320729718711782, + "loss": 3.3691, + "step": 66895 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.9456878304481506, + "learning_rate": 0.00043203050686234547, + "loss": 3.3336, + "step": 66900 + }, + { + "epoch": 4.545794265525207, + "grad_norm": 0.8894228935241699, + "learning_rate": 0.0004319880418535127, + "loss": 3.4238, + "step": 66905 + }, + { + "epoch": 4.546133985595869, + "grad_norm": 1.0424593687057495, + "learning_rate": 0.00043194557684468, + "loss": 3.6596, + "step": 66910 + }, + { + "epoch": 4.54647370566653, + "grad_norm": 0.9214271903038025, + "learning_rate": 0.0004319031118358473, + "loss": 3.4955, + "step": 66915 + }, + { + "epoch": 4.546813425737192, + "grad_norm": 0.7861922383308411, + "learning_rate": 0.00043186064682701453, + "loss": 3.4744, + "step": 66920 + }, + { + "epoch": 4.5471531458078545, + "grad_norm": 0.7577658295631409, + "learning_rate": 0.0004318181818181818, + "loss": 3.6282, + "step": 66925 + }, + { + "epoch": 4.547492865878516, + "grad_norm": 0.9120380878448486, + "learning_rate": 0.00043177571680934915, + "loss": 3.6647, + "step": 66930 + }, + { + "epoch": 4.547832585949178, + "grad_norm": 0.7643416523933411, + "learning_rate": 0.00043173325180051637, + "loss": 3.355, + "step": 66935 + }, + { + "epoch": 4.54817230601984, + "grad_norm": 1.2387057542800903, + "learning_rate": 0.00043169078679168365, + "loss": 3.3096, + "step": 66940 + }, + { + "epoch": 4.548512026090501, + "grad_norm": 0.8132821917533875, + "learning_rate": 0.000431648321782851, + "loss": 3.5748, + "step": 66945 + }, + { + "epoch": 4.548851746161163, + "grad_norm": 1.0034990310668945, + "learning_rate": 0.0004316058567740182, + "loss": 3.5895, + "step": 66950 + }, + { + "epoch": 4.549191466231825, + "grad_norm": 1.0332789421081543, + "learning_rate": 0.0004315633917651855, + "loss": 3.542, + "step": 66955 + }, + { + "epoch": 4.549531186302486, + "grad_norm": 0.922146737575531, + "learning_rate": 0.0004315209267563528, + "loss": 3.5107, + "step": 66960 + }, + { + "epoch": 4.549870906373148, + "grad_norm": 0.936262309551239, + "learning_rate": 0.00043147846174752005, + "loss": 3.3631, + "step": 66965 + }, + { + "epoch": 4.5502106264438105, + "grad_norm": 1.0479505062103271, + "learning_rate": 0.00043143599673868733, + "loss": 3.4805, + "step": 66970 + }, + { + "epoch": 4.550550346514472, + "grad_norm": 0.8792405724525452, + "learning_rate": 0.0004313935317298546, + "loss": 3.3414, + "step": 66975 + }, + { + "epoch": 4.550890066585134, + "grad_norm": 0.7811269164085388, + "learning_rate": 0.0004313510667210219, + "loss": 3.3657, + "step": 66980 + }, + { + "epoch": 4.551229786655796, + "grad_norm": 1.4832252264022827, + "learning_rate": 0.0004313086017121892, + "loss": 3.5395, + "step": 66985 + }, + { + "epoch": 4.551569506726457, + "grad_norm": 0.9375556707382202, + "learning_rate": 0.00043126613670335645, + "loss": 3.3995, + "step": 66990 + }, + { + "epoch": 4.551909226797119, + "grad_norm": 0.8964979648590088, + "learning_rate": 0.0004312236716945237, + "loss": 3.5229, + "step": 66995 + }, + { + "epoch": 4.552248946867781, + "grad_norm": 0.9471815824508667, + "learning_rate": 0.000431181206685691, + "loss": 3.2566, + "step": 67000 + }, + { + "epoch": 4.552588666938442, + "grad_norm": 0.7718502879142761, + "learning_rate": 0.0004311387416768583, + "loss": 3.4037, + "step": 67005 + }, + { + "epoch": 4.552928387009104, + "grad_norm": 0.6472495794296265, + "learning_rate": 0.0004310962766680255, + "loss": 3.4404, + "step": 67010 + }, + { + "epoch": 4.5532681070797665, + "grad_norm": 0.8912786841392517, + "learning_rate": 0.00043105381165919285, + "loss": 3.4762, + "step": 67015 + }, + { + "epoch": 4.553607827150428, + "grad_norm": 1.0178440809249878, + "learning_rate": 0.00043101134665036013, + "loss": 3.397, + "step": 67020 + }, + { + "epoch": 4.55394754722109, + "grad_norm": 0.8915054798126221, + "learning_rate": 0.00043096888164152736, + "loss": 3.5383, + "step": 67025 + }, + { + "epoch": 4.554287267291752, + "grad_norm": 1.1411832571029663, + "learning_rate": 0.00043092641663269464, + "loss": 3.5493, + "step": 67030 + }, + { + "epoch": 4.554626987362413, + "grad_norm": 0.9685059189796448, + "learning_rate": 0.000430883951623862, + "loss": 3.3233, + "step": 67035 + }, + { + "epoch": 4.554966707433075, + "grad_norm": 0.8090518712997437, + "learning_rate": 0.0004308414866150292, + "loss": 3.5627, + "step": 67040 + }, + { + "epoch": 4.555306427503737, + "grad_norm": 0.9945644736289978, + "learning_rate": 0.0004307990216061965, + "loss": 3.4445, + "step": 67045 + }, + { + "epoch": 4.555646147574398, + "grad_norm": 0.8897326588630676, + "learning_rate": 0.0004307565565973638, + "loss": 3.2953, + "step": 67050 + }, + { + "epoch": 4.55598586764506, + "grad_norm": 0.8639029860496521, + "learning_rate": 0.00043071409158853104, + "loss": 3.5598, + "step": 67055 + }, + { + "epoch": 4.5563255877157225, + "grad_norm": 1.0045974254608154, + "learning_rate": 0.0004306716265796983, + "loss": 3.5777, + "step": 67060 + }, + { + "epoch": 4.556665307786384, + "grad_norm": 1.0073877573013306, + "learning_rate": 0.0004306291615708656, + "loss": 3.6533, + "step": 67065 + }, + { + "epoch": 4.557005027857046, + "grad_norm": 1.0373882055282593, + "learning_rate": 0.00043058669656203293, + "loss": 3.4495, + "step": 67070 + }, + { + "epoch": 4.557344747927708, + "grad_norm": 1.329776644706726, + "learning_rate": 0.00043054423155320016, + "loss": 3.8173, + "step": 67075 + }, + { + "epoch": 4.557684467998369, + "grad_norm": 2.411485195159912, + "learning_rate": 0.00043050176654436744, + "loss": 3.4705, + "step": 67080 + }, + { + "epoch": 4.558024188069031, + "grad_norm": 0.8607636094093323, + "learning_rate": 0.0004304593015355348, + "loss": 3.6889, + "step": 67085 + }, + { + "epoch": 4.558363908139693, + "grad_norm": 0.6912363171577454, + "learning_rate": 0.000430416836526702, + "loss": 3.4417, + "step": 67090 + }, + { + "epoch": 4.558703628210354, + "grad_norm": 0.7211779356002808, + "learning_rate": 0.0004303743715178693, + "loss": 3.5172, + "step": 67095 + }, + { + "epoch": 4.559043348281016, + "grad_norm": 1.0378735065460205, + "learning_rate": 0.00043033190650903656, + "loss": 3.2223, + "step": 67100 + }, + { + "epoch": 4.5593830683516785, + "grad_norm": 1.0428037643432617, + "learning_rate": 0.00043028944150020384, + "loss": 3.6051, + "step": 67105 + }, + { + "epoch": 4.55972278842234, + "grad_norm": 0.9142529964447021, + "learning_rate": 0.0004302469764913711, + "loss": 3.316, + "step": 67110 + }, + { + "epoch": 4.560062508493002, + "grad_norm": 0.984703004360199, + "learning_rate": 0.0004302045114825384, + "loss": 3.4504, + "step": 67115 + }, + { + "epoch": 4.560402228563664, + "grad_norm": 0.9665036797523499, + "learning_rate": 0.0004301620464737057, + "loss": 3.6272, + "step": 67120 + }, + { + "epoch": 4.560741948634325, + "grad_norm": 0.9005587697029114, + "learning_rate": 0.00043011958146487296, + "loss": 3.2835, + "step": 67125 + }, + { + "epoch": 4.561081668704987, + "grad_norm": 0.748664915561676, + "learning_rate": 0.00043007711645604024, + "loss": 3.4661, + "step": 67130 + }, + { + "epoch": 4.561421388775649, + "grad_norm": 1.0780904293060303, + "learning_rate": 0.00043003465144720747, + "loss": 3.2909, + "step": 67135 + }, + { + "epoch": 4.56176110884631, + "grad_norm": 0.7615379691123962, + "learning_rate": 0.0004299921864383748, + "loss": 3.5494, + "step": 67140 + }, + { + "epoch": 4.562100828916972, + "grad_norm": 0.8269045948982239, + "learning_rate": 0.0004299497214295421, + "loss": 3.4778, + "step": 67145 + }, + { + "epoch": 4.5624405489876345, + "grad_norm": 1.046461820602417, + "learning_rate": 0.0004299072564207093, + "loss": 3.608, + "step": 67150 + }, + { + "epoch": 4.562780269058296, + "grad_norm": 0.836706817150116, + "learning_rate": 0.00042986479141187664, + "loss": 3.5362, + "step": 67155 + }, + { + "epoch": 4.563119989128958, + "grad_norm": 1.0729625225067139, + "learning_rate": 0.0004298223264030439, + "loss": 3.5035, + "step": 67160 + }, + { + "epoch": 4.56345970919962, + "grad_norm": 0.8621786832809448, + "learning_rate": 0.00042977986139421115, + "loss": 3.543, + "step": 67165 + }, + { + "epoch": 4.563799429270281, + "grad_norm": 0.8110635876655579, + "learning_rate": 0.00042973739638537843, + "loss": 3.6033, + "step": 67170 + }, + { + "epoch": 4.564139149340943, + "grad_norm": 0.79312664270401, + "learning_rate": 0.00042969493137654576, + "loss": 3.5026, + "step": 67175 + }, + { + "epoch": 4.564478869411605, + "grad_norm": 1.0779545307159424, + "learning_rate": 0.000429652466367713, + "loss": 3.5178, + "step": 67180 + }, + { + "epoch": 4.564818589482266, + "grad_norm": 1.1566768884658813, + "learning_rate": 0.00042961000135888027, + "loss": 3.1368, + "step": 67185 + }, + { + "epoch": 4.5651583095529285, + "grad_norm": 1.073005199432373, + "learning_rate": 0.0004295675363500476, + "loss": 3.4781, + "step": 67190 + }, + { + "epoch": 4.5654980296235905, + "grad_norm": 0.8295185565948486, + "learning_rate": 0.00042952507134121483, + "loss": 3.3433, + "step": 67195 + }, + { + "epoch": 4.565837749694252, + "grad_norm": 0.9815894365310669, + "learning_rate": 0.0004294826063323821, + "loss": 3.4762, + "step": 67200 + }, + { + "epoch": 4.566177469764914, + "grad_norm": 1.1259667873382568, + "learning_rate": 0.00042944014132354944, + "loss": 3.4903, + "step": 67205 + }, + { + "epoch": 4.566517189835576, + "grad_norm": 1.0580482482910156, + "learning_rate": 0.00042939767631471667, + "loss": 3.6208, + "step": 67210 + }, + { + "epoch": 4.566856909906237, + "grad_norm": 1.0183595418930054, + "learning_rate": 0.00042935521130588395, + "loss": 3.1756, + "step": 67215 + }, + { + "epoch": 4.567196629976899, + "grad_norm": 0.9099323749542236, + "learning_rate": 0.00042931274629705123, + "loss": 3.315, + "step": 67220 + }, + { + "epoch": 4.567536350047561, + "grad_norm": 1.1575851440429688, + "learning_rate": 0.0004292702812882185, + "loss": 3.3351, + "step": 67225 + }, + { + "epoch": 4.567876070118222, + "grad_norm": 1.0001609325408936, + "learning_rate": 0.0004292278162793858, + "loss": 3.2962, + "step": 67230 + }, + { + "epoch": 4.5682157901888845, + "grad_norm": 0.7493778467178345, + "learning_rate": 0.00042918535127055307, + "loss": 3.7059, + "step": 67235 + }, + { + "epoch": 4.5685555102595465, + "grad_norm": 0.9649704694747925, + "learning_rate": 0.0004291428862617204, + "loss": 3.5084, + "step": 67240 + }, + { + "epoch": 4.568895230330208, + "grad_norm": 0.736458420753479, + "learning_rate": 0.00042910042125288763, + "loss": 3.5131, + "step": 67245 + }, + { + "epoch": 4.56923495040087, + "grad_norm": 1.1670411825180054, + "learning_rate": 0.0004290579562440549, + "loss": 3.7056, + "step": 67250 + }, + { + "epoch": 4.569574670471532, + "grad_norm": 0.8948031663894653, + "learning_rate": 0.0004290154912352222, + "loss": 3.3911, + "step": 67255 + }, + { + "epoch": 4.569914390542193, + "grad_norm": 0.9808430075645447, + "learning_rate": 0.00042897302622638947, + "loss": 3.4126, + "step": 67260 + }, + { + "epoch": 4.570254110612855, + "grad_norm": 1.0445612668991089, + "learning_rate": 0.00042893056121755675, + "loss": 3.4685, + "step": 67265 + }, + { + "epoch": 4.570593830683517, + "grad_norm": 0.8177327513694763, + "learning_rate": 0.00042888809620872403, + "loss": 3.3966, + "step": 67270 + }, + { + "epoch": 4.570933550754178, + "grad_norm": 0.897955596446991, + "learning_rate": 0.0004288456311998913, + "loss": 3.4348, + "step": 67275 + }, + { + "epoch": 4.5712732708248405, + "grad_norm": 0.8223017454147339, + "learning_rate": 0.0004288031661910586, + "loss": 3.2029, + "step": 67280 + }, + { + "epoch": 4.5716129908955025, + "grad_norm": 0.856989324092865, + "learning_rate": 0.00042876070118222587, + "loss": 3.3946, + "step": 67285 + }, + { + "epoch": 4.571952710966164, + "grad_norm": 1.0508395433425903, + "learning_rate": 0.0004287182361733931, + "loss": 3.68, + "step": 67290 + }, + { + "epoch": 4.572292431036826, + "grad_norm": 1.003301978111267, + "learning_rate": 0.00042867577116456043, + "loss": 3.2755, + "step": 67295 + }, + { + "epoch": 4.572632151107488, + "grad_norm": 1.2731139659881592, + "learning_rate": 0.0004286333061557277, + "loss": 3.6186, + "step": 67300 + }, + { + "epoch": 4.572971871178149, + "grad_norm": 0.9085086584091187, + "learning_rate": 0.00042859084114689494, + "loss": 3.316, + "step": 67305 + }, + { + "epoch": 4.573311591248811, + "grad_norm": 0.9845300316810608, + "learning_rate": 0.00042854837613806227, + "loss": 3.7199, + "step": 67310 + }, + { + "epoch": 4.573651311319473, + "grad_norm": 0.9198624491691589, + "learning_rate": 0.00042850591112922955, + "loss": 3.5195, + "step": 67315 + }, + { + "epoch": 4.573991031390134, + "grad_norm": 0.9295461177825928, + "learning_rate": 0.0004284634461203968, + "loss": 3.3157, + "step": 67320 + }, + { + "epoch": 4.5743307514607965, + "grad_norm": 2.2340400218963623, + "learning_rate": 0.00042842098111156406, + "loss": 3.4871, + "step": 67325 + }, + { + "epoch": 4.574670471531459, + "grad_norm": 0.8152403235435486, + "learning_rate": 0.0004283785161027314, + "loss": 3.5218, + "step": 67330 + }, + { + "epoch": 4.57501019160212, + "grad_norm": 0.8695029020309448, + "learning_rate": 0.0004283360510938986, + "loss": 3.5435, + "step": 67335 + }, + { + "epoch": 4.575349911672782, + "grad_norm": 0.7092896103858948, + "learning_rate": 0.0004282935860850659, + "loss": 2.9881, + "step": 67340 + }, + { + "epoch": 4.575689631743444, + "grad_norm": 0.8532364964485168, + "learning_rate": 0.00042825112107623323, + "loss": 3.5179, + "step": 67345 + }, + { + "epoch": 4.576029351814105, + "grad_norm": 1.0035890340805054, + "learning_rate": 0.00042820865606740046, + "loss": 3.5444, + "step": 67350 + }, + { + "epoch": 4.576369071884767, + "grad_norm": 0.9702343940734863, + "learning_rate": 0.00042816619105856774, + "loss": 3.5728, + "step": 67355 + }, + { + "epoch": 4.576708791955428, + "grad_norm": 1.161484718322754, + "learning_rate": 0.000428123726049735, + "loss": 3.5374, + "step": 67360 + }, + { + "epoch": 4.57704851202609, + "grad_norm": 0.8325549364089966, + "learning_rate": 0.0004280812610409023, + "loss": 3.3462, + "step": 67365 + }, + { + "epoch": 4.5773882320967525, + "grad_norm": 1.2593601942062378, + "learning_rate": 0.0004280387960320696, + "loss": 3.2844, + "step": 67370 + }, + { + "epoch": 4.577727952167414, + "grad_norm": 1.0018056631088257, + "learning_rate": 0.00042799633102323686, + "loss": 3.3779, + "step": 67375 + }, + { + "epoch": 4.578067672238076, + "grad_norm": 0.9532210826873779, + "learning_rate": 0.00042795386601440414, + "loss": 3.6841, + "step": 67380 + }, + { + "epoch": 4.578407392308738, + "grad_norm": 0.909170389175415, + "learning_rate": 0.0004279114010055714, + "loss": 3.6378, + "step": 67385 + }, + { + "epoch": 4.578747112379399, + "grad_norm": 1.028110146522522, + "learning_rate": 0.0004278689359967387, + "loss": 3.4555, + "step": 67390 + }, + { + "epoch": 4.579086832450061, + "grad_norm": 0.7049983739852905, + "learning_rate": 0.0004278264709879059, + "loss": 3.3238, + "step": 67395 + }, + { + "epoch": 4.579426552520723, + "grad_norm": 1.5184146165847778, + "learning_rate": 0.00042778400597907326, + "loss": 3.4355, + "step": 67400 + }, + { + "epoch": 4.579766272591384, + "grad_norm": 0.895922839641571, + "learning_rate": 0.00042774154097024054, + "loss": 3.6635, + "step": 67405 + }, + { + "epoch": 4.580105992662046, + "grad_norm": 0.8685749173164368, + "learning_rate": 0.0004276990759614078, + "loss": 3.7057, + "step": 67410 + }, + { + "epoch": 4.5804457127327085, + "grad_norm": 0.8431677222251892, + "learning_rate": 0.0004276566109525751, + "loss": 3.472, + "step": 67415 + }, + { + "epoch": 4.58078543280337, + "grad_norm": 0.9706683158874512, + "learning_rate": 0.0004276141459437424, + "loss": 3.3467, + "step": 67420 + }, + { + "epoch": 4.581125152874032, + "grad_norm": 0.7461704611778259, + "learning_rate": 0.00042757168093490966, + "loss": 3.3921, + "step": 67425 + }, + { + "epoch": 4.581464872944694, + "grad_norm": 0.9601660370826721, + "learning_rate": 0.0004275292159260769, + "loss": 3.4204, + "step": 67430 + }, + { + "epoch": 4.581804593015355, + "grad_norm": 0.9619849324226379, + "learning_rate": 0.0004274867509172442, + "loss": 3.6263, + "step": 67435 + }, + { + "epoch": 4.582144313086017, + "grad_norm": 1.11203134059906, + "learning_rate": 0.0004274442859084115, + "loss": 3.2645, + "step": 67440 + }, + { + "epoch": 4.582484033156679, + "grad_norm": 0.8030264973640442, + "learning_rate": 0.0004274018208995787, + "loss": 3.4474, + "step": 67445 + }, + { + "epoch": 4.58282375322734, + "grad_norm": 1.0002907514572144, + "learning_rate": 0.00042735935589074606, + "loss": 3.3811, + "step": 67450 + }, + { + "epoch": 4.583163473298002, + "grad_norm": 0.7376506328582764, + "learning_rate": 0.00042731689088191334, + "loss": 3.3069, + "step": 67455 + }, + { + "epoch": 4.5835031933686645, + "grad_norm": 0.9239730834960938, + "learning_rate": 0.00042727442587308056, + "loss": 3.6644, + "step": 67460 + }, + { + "epoch": 4.583842913439326, + "grad_norm": 0.8496314883232117, + "learning_rate": 0.00042723196086424784, + "loss": 3.3853, + "step": 67465 + }, + { + "epoch": 4.584182633509988, + "grad_norm": 0.886184573173523, + "learning_rate": 0.0004271894958554152, + "loss": 3.4643, + "step": 67470 + }, + { + "epoch": 4.58452235358065, + "grad_norm": 1.0450458526611328, + "learning_rate": 0.0004271470308465824, + "loss": 3.5553, + "step": 67475 + }, + { + "epoch": 4.584862073651311, + "grad_norm": 1.1104778051376343, + "learning_rate": 0.0004271045658377497, + "loss": 3.2226, + "step": 67480 + }, + { + "epoch": 4.585201793721973, + "grad_norm": 0.7594872117042542, + "learning_rate": 0.000427062100828917, + "loss": 3.2422, + "step": 67485 + }, + { + "epoch": 4.585541513792635, + "grad_norm": 1.382143497467041, + "learning_rate": 0.00042701963582008424, + "loss": 3.6654, + "step": 67490 + }, + { + "epoch": 4.585881233863296, + "grad_norm": 0.9319247007369995, + "learning_rate": 0.0004269771708112515, + "loss": 3.4086, + "step": 67495 + }, + { + "epoch": 4.5862209539339585, + "grad_norm": 0.8966941237449646, + "learning_rate": 0.00042693470580241886, + "loss": 3.6573, + "step": 67500 + }, + { + "epoch": 4.5865606740046205, + "grad_norm": 0.8223041892051697, + "learning_rate": 0.0004268922407935861, + "loss": 3.4412, + "step": 67505 + }, + { + "epoch": 4.586900394075282, + "grad_norm": 0.9804565906524658, + "learning_rate": 0.00042684977578475336, + "loss": 3.4583, + "step": 67510 + }, + { + "epoch": 4.587240114145944, + "grad_norm": 0.7310084700584412, + "learning_rate": 0.00042680731077592064, + "loss": 3.5237, + "step": 67515 + }, + { + "epoch": 4.587579834216606, + "grad_norm": 1.2186017036437988, + "learning_rate": 0.0004267648457670879, + "loss": 3.5755, + "step": 67520 + }, + { + "epoch": 4.587919554287267, + "grad_norm": 0.8460050225257874, + "learning_rate": 0.0004267223807582552, + "loss": 3.6447, + "step": 67525 + }, + { + "epoch": 4.588259274357929, + "grad_norm": 1.0685724020004272, + "learning_rate": 0.0004266799157494225, + "loss": 3.5279, + "step": 67530 + }, + { + "epoch": 4.588598994428591, + "grad_norm": 0.878531813621521, + "learning_rate": 0.00042663745074058976, + "loss": 3.3674, + "step": 67535 + }, + { + "epoch": 4.588938714499252, + "grad_norm": 1.1671885251998901, + "learning_rate": 0.00042659498573175705, + "loss": 3.4972, + "step": 67540 + }, + { + "epoch": 4.5892784345699145, + "grad_norm": 1.2734168767929077, + "learning_rate": 0.0004265525207229243, + "loss": 3.5874, + "step": 67545 + }, + { + "epoch": 4.5896181546405765, + "grad_norm": 1.1873035430908203, + "learning_rate": 0.00042651005571409155, + "loss": 3.391, + "step": 67550 + }, + { + "epoch": 4.589957874711238, + "grad_norm": 0.9917282462120056, + "learning_rate": 0.0004264675907052589, + "loss": 3.4908, + "step": 67555 + }, + { + "epoch": 4.5902975947819, + "grad_norm": 0.7625194787979126, + "learning_rate": 0.00042642512569642617, + "loss": 3.6177, + "step": 67560 + }, + { + "epoch": 4.590637314852561, + "grad_norm": 0.9742088913917542, + "learning_rate": 0.0004263826606875934, + "loss": 3.2184, + "step": 67565 + }, + { + "epoch": 4.590977034923223, + "grad_norm": 0.8605336546897888, + "learning_rate": 0.0004263401956787607, + "loss": 3.3687, + "step": 67570 + }, + { + "epoch": 4.591316754993885, + "grad_norm": 0.7566969394683838, + "learning_rate": 0.000426297730669928, + "loss": 3.5571, + "step": 67575 + }, + { + "epoch": 4.591656475064546, + "grad_norm": 1.4101474285125732, + "learning_rate": 0.0004262552656610953, + "loss": 3.4234, + "step": 67580 + }, + { + "epoch": 4.591996195135208, + "grad_norm": 1.1425610780715942, + "learning_rate": 0.0004262128006522625, + "loss": 3.5098, + "step": 67585 + }, + { + "epoch": 4.5923359152058705, + "grad_norm": 0.8138750791549683, + "learning_rate": 0.00042617033564342985, + "loss": 3.4209, + "step": 67590 + }, + { + "epoch": 4.592675635276532, + "grad_norm": 0.9538364410400391, + "learning_rate": 0.0004261278706345971, + "loss": 3.1929, + "step": 67595 + }, + { + "epoch": 4.593015355347194, + "grad_norm": 0.9729725122451782, + "learning_rate": 0.00042608540562576435, + "loss": 3.3388, + "step": 67600 + }, + { + "epoch": 4.593355075417856, + "grad_norm": 1.0136361122131348, + "learning_rate": 0.0004260429406169317, + "loss": 3.513, + "step": 67605 + }, + { + "epoch": 4.593694795488517, + "grad_norm": 0.940673291683197, + "learning_rate": 0.00042600047560809897, + "loss": 3.3968, + "step": 67610 + }, + { + "epoch": 4.594034515559179, + "grad_norm": 0.9911448955535889, + "learning_rate": 0.0004259580105992662, + "loss": 3.3973, + "step": 67615 + }, + { + "epoch": 4.594374235629841, + "grad_norm": 1.0242573022842407, + "learning_rate": 0.00042591554559043347, + "loss": 3.3936, + "step": 67620 + }, + { + "epoch": 4.594713955700502, + "grad_norm": 1.0970944166183472, + "learning_rate": 0.0004258730805816008, + "loss": 3.3394, + "step": 67625 + }, + { + "epoch": 4.595053675771164, + "grad_norm": 0.7678404450416565, + "learning_rate": 0.00042583061557276803, + "loss": 3.564, + "step": 67630 + }, + { + "epoch": 4.5953933958418265, + "grad_norm": 1.0020453929901123, + "learning_rate": 0.0004257881505639353, + "loss": 3.3251, + "step": 67635 + }, + { + "epoch": 4.595733115912488, + "grad_norm": 1.007866382598877, + "learning_rate": 0.00042574568555510265, + "loss": 3.1795, + "step": 67640 + }, + { + "epoch": 4.59607283598315, + "grad_norm": 1.0952868461608887, + "learning_rate": 0.00042570322054626987, + "loss": 3.7584, + "step": 67645 + }, + { + "epoch": 4.596412556053812, + "grad_norm": 1.126753330230713, + "learning_rate": 0.00042566075553743715, + "loss": 3.5754, + "step": 67650 + }, + { + "epoch": 4.596752276124473, + "grad_norm": 0.8902984857559204, + "learning_rate": 0.00042561829052860443, + "loss": 3.8086, + "step": 67655 + }, + { + "epoch": 4.597091996195135, + "grad_norm": 0.9846822023391724, + "learning_rate": 0.0004255758255197717, + "loss": 3.284, + "step": 67660 + }, + { + "epoch": 4.597431716265797, + "grad_norm": 0.896574079990387, + "learning_rate": 0.000425533360510939, + "loss": 3.4342, + "step": 67665 + }, + { + "epoch": 4.597771436336458, + "grad_norm": 0.7636597752571106, + "learning_rate": 0.00042549089550210627, + "loss": 3.6563, + "step": 67670 + }, + { + "epoch": 4.59811115640712, + "grad_norm": 1.1551413536071777, + "learning_rate": 0.00042544843049327355, + "loss": 3.4332, + "step": 67675 + }, + { + "epoch": 4.5984508764777825, + "grad_norm": 0.8216379880905151, + "learning_rate": 0.00042540596548444083, + "loss": 3.3793, + "step": 67680 + }, + { + "epoch": 4.598790596548444, + "grad_norm": 1.128330945968628, + "learning_rate": 0.0004253635004756081, + "loss": 3.285, + "step": 67685 + }, + { + "epoch": 4.599130316619106, + "grad_norm": 0.8447304964065552, + "learning_rate": 0.00042532103546677534, + "loss": 3.4514, + "step": 67690 + }, + { + "epoch": 4.599470036689768, + "grad_norm": 0.8488949537277222, + "learning_rate": 0.0004252785704579427, + "loss": 3.3014, + "step": 67695 + }, + { + "epoch": 4.599809756760429, + "grad_norm": 0.8691514134407043, + "learning_rate": 0.00042523610544910995, + "loss": 3.593, + "step": 67700 + }, + { + "epoch": 4.600149476831091, + "grad_norm": 1.0395342111587524, + "learning_rate": 0.0004251936404402772, + "loss": 3.6971, + "step": 67705 + }, + { + "epoch": 4.600489196901753, + "grad_norm": 1.239989995956421, + "learning_rate": 0.0004251511754314445, + "loss": 3.2066, + "step": 67710 + }, + { + "epoch": 4.600828916972414, + "grad_norm": 0.9771098494529724, + "learning_rate": 0.0004251087104226118, + "loss": 3.2027, + "step": 67715 + }, + { + "epoch": 4.601168637043076, + "grad_norm": 1.061637282371521, + "learning_rate": 0.000425066245413779, + "loss": 3.3995, + "step": 67720 + }, + { + "epoch": 4.6015083571137385, + "grad_norm": 0.8627749085426331, + "learning_rate": 0.0004250237804049463, + "loss": 3.4376, + "step": 67725 + }, + { + "epoch": 4.6018480771844, + "grad_norm": 1.188488245010376, + "learning_rate": 0.00042498131539611363, + "loss": 3.4413, + "step": 67730 + }, + { + "epoch": 4.602187797255062, + "grad_norm": 0.9200079441070557, + "learning_rate": 0.00042493885038728086, + "loss": 3.4353, + "step": 67735 + }, + { + "epoch": 4.602527517325724, + "grad_norm": 0.9842554330825806, + "learning_rate": 0.00042489638537844814, + "loss": 3.6355, + "step": 67740 + }, + { + "epoch": 4.602867237396385, + "grad_norm": 0.8547536134719849, + "learning_rate": 0.0004248539203696155, + "loss": 3.4259, + "step": 67745 + }, + { + "epoch": 4.603206957467047, + "grad_norm": 1.177857756614685, + "learning_rate": 0.00042481145536078275, + "loss": 3.5518, + "step": 67750 + }, + { + "epoch": 4.603546677537709, + "grad_norm": 1.1207643747329712, + "learning_rate": 0.00042476899035195, + "loss": 3.3753, + "step": 67755 + }, + { + "epoch": 4.60388639760837, + "grad_norm": 1.0456726551055908, + "learning_rate": 0.00042472652534311726, + "loss": 3.6081, + "step": 67760 + }, + { + "epoch": 4.6042261176790324, + "grad_norm": 1.168534517288208, + "learning_rate": 0.0004246840603342846, + "loss": 3.4814, + "step": 67765 + }, + { + "epoch": 4.6045658377496945, + "grad_norm": 0.9754408001899719, + "learning_rate": 0.0004246415953254518, + "loss": 3.6873, + "step": 67770 + }, + { + "epoch": 4.604905557820356, + "grad_norm": 1.0667824745178223, + "learning_rate": 0.0004245991303166191, + "loss": 3.569, + "step": 67775 + }, + { + "epoch": 4.605245277891018, + "grad_norm": 0.8599440455436707, + "learning_rate": 0.00042455666530778643, + "loss": 3.5864, + "step": 67780 + }, + { + "epoch": 4.60558499796168, + "grad_norm": 1.0200839042663574, + "learning_rate": 0.00042451420029895366, + "loss": 3.5794, + "step": 67785 + }, + { + "epoch": 4.605924718032341, + "grad_norm": 0.8856172561645508, + "learning_rate": 0.00042447173529012094, + "loss": 3.5217, + "step": 67790 + }, + { + "epoch": 4.606264438103003, + "grad_norm": 0.6975310444831848, + "learning_rate": 0.0004244292702812883, + "loss": 3.4679, + "step": 67795 + }, + { + "epoch": 4.606604158173665, + "grad_norm": 0.9643648266792297, + "learning_rate": 0.0004243868052724555, + "loss": 3.3261, + "step": 67800 + }, + { + "epoch": 4.606943878244326, + "grad_norm": 1.0460666418075562, + "learning_rate": 0.0004243443402636228, + "loss": 3.4821, + "step": 67805 + }, + { + "epoch": 4.6072835983149885, + "grad_norm": 0.8765555620193481, + "learning_rate": 0.00042430187525479006, + "loss": 3.3445, + "step": 67810 + }, + { + "epoch": 4.6076233183856505, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.00042425941024595734, + "loss": 3.449, + "step": 67815 + }, + { + "epoch": 4.607963038456312, + "grad_norm": 0.7454981803894043, + "learning_rate": 0.0004242169452371246, + "loss": 3.0417, + "step": 67820 + }, + { + "epoch": 4.608302758526974, + "grad_norm": 0.9115806818008423, + "learning_rate": 0.0004241744802282919, + "loss": 3.6322, + "step": 67825 + }, + { + "epoch": 4.608642478597636, + "grad_norm": 0.9436793327331543, + "learning_rate": 0.0004241320152194592, + "loss": 3.8252, + "step": 67830 + }, + { + "epoch": 4.608982198668297, + "grad_norm": 0.8755800724029541, + "learning_rate": 0.00042408955021062646, + "loss": 3.4558, + "step": 67835 + }, + { + "epoch": 4.609321918738959, + "grad_norm": 0.8173354268074036, + "learning_rate": 0.00042404708520179374, + "loss": 3.3487, + "step": 67840 + }, + { + "epoch": 4.609661638809621, + "grad_norm": 0.93462073802948, + "learning_rate": 0.00042400462019296097, + "loss": 3.3753, + "step": 67845 + }, + { + "epoch": 4.610001358880282, + "grad_norm": 1.0286446809768677, + "learning_rate": 0.0004239621551841283, + "loss": 3.5894, + "step": 67850 + }, + { + "epoch": 4.6103410789509445, + "grad_norm": 0.9861451983451843, + "learning_rate": 0.0004239196901752956, + "loss": 3.4899, + "step": 67855 + }, + { + "epoch": 4.6106807990216065, + "grad_norm": 0.9511154294013977, + "learning_rate": 0.0004238772251664628, + "loss": 3.4278, + "step": 67860 + }, + { + "epoch": 4.611020519092268, + "grad_norm": 0.9542855620384216, + "learning_rate": 0.00042383476015763014, + "loss": 3.3939, + "step": 67865 + }, + { + "epoch": 4.61136023916293, + "grad_norm": 0.9497278928756714, + "learning_rate": 0.0004237922951487974, + "loss": 3.1942, + "step": 67870 + }, + { + "epoch": 4.611699959233592, + "grad_norm": 0.9607531428337097, + "learning_rate": 0.00042374983013996465, + "loss": 3.4325, + "step": 67875 + }, + { + "epoch": 4.612039679304253, + "grad_norm": 0.9245302081108093, + "learning_rate": 0.00042370736513113193, + "loss": 3.585, + "step": 67880 + }, + { + "epoch": 4.612379399374915, + "grad_norm": 0.6633448600769043, + "learning_rate": 0.00042366490012229926, + "loss": 3.4518, + "step": 67885 + }, + { + "epoch": 4.612719119445577, + "grad_norm": 0.8772918581962585, + "learning_rate": 0.0004236224351134665, + "loss": 3.65, + "step": 67890 + }, + { + "epoch": 4.613058839516238, + "grad_norm": 1.0511177778244019, + "learning_rate": 0.00042357997010463377, + "loss": 3.2801, + "step": 67895 + }, + { + "epoch": 4.6133985595869005, + "grad_norm": 0.9914047122001648, + "learning_rate": 0.0004235375050958011, + "loss": 3.6367, + "step": 67900 + }, + { + "epoch": 4.613738279657563, + "grad_norm": 0.7486261129379272, + "learning_rate": 0.00042349504008696833, + "loss": 3.5587, + "step": 67905 + }, + { + "epoch": 4.614077999728224, + "grad_norm": 1.0845264196395874, + "learning_rate": 0.0004234525750781356, + "loss": 3.2332, + "step": 67910 + }, + { + "epoch": 4.614417719798886, + "grad_norm": 1.0671820640563965, + "learning_rate": 0.0004234101100693029, + "loss": 3.4662, + "step": 67915 + }, + { + "epoch": 4.614757439869548, + "grad_norm": 0.827296793460846, + "learning_rate": 0.0004233676450604702, + "loss": 3.4402, + "step": 67920 + }, + { + "epoch": 4.615097159940209, + "grad_norm": 0.8654360175132751, + "learning_rate": 0.00042332518005163745, + "loss": 3.7429, + "step": 67925 + }, + { + "epoch": 4.615436880010871, + "grad_norm": 1.1359440088272095, + "learning_rate": 0.00042328271504280473, + "loss": 3.3554, + "step": 67930 + }, + { + "epoch": 4.615776600081533, + "grad_norm": 0.8161609172821045, + "learning_rate": 0.00042324025003397206, + "loss": 3.393, + "step": 67935 + }, + { + "epoch": 4.616116320152194, + "grad_norm": 0.7660179734230042, + "learning_rate": 0.0004231977850251393, + "loss": 3.4246, + "step": 67940 + }, + { + "epoch": 4.6164560402228565, + "grad_norm": 1.5755057334899902, + "learning_rate": 0.00042315532001630657, + "loss": 3.6323, + "step": 67945 + }, + { + "epoch": 4.616795760293519, + "grad_norm": 0.7539491057395935, + "learning_rate": 0.00042311285500747385, + "loss": 3.6008, + "step": 67950 + }, + { + "epoch": 4.61713548036418, + "grad_norm": 0.8688687682151794, + "learning_rate": 0.00042307038999864113, + "loss": 3.6917, + "step": 67955 + }, + { + "epoch": 4.617475200434842, + "grad_norm": 0.9952382445335388, + "learning_rate": 0.0004230279249898084, + "loss": 3.2879, + "step": 67960 + }, + { + "epoch": 4.617814920505504, + "grad_norm": 0.7649852633476257, + "learning_rate": 0.0004229854599809757, + "loss": 3.4274, + "step": 67965 + }, + { + "epoch": 4.618154640576165, + "grad_norm": 1.2530995607376099, + "learning_rate": 0.00042294299497214297, + "loss": 3.4488, + "step": 67970 + }, + { + "epoch": 4.618494360646827, + "grad_norm": 1.3780122995376587, + "learning_rate": 0.00042290052996331025, + "loss": 3.579, + "step": 67975 + }, + { + "epoch": 4.618834080717489, + "grad_norm": 2.0877487659454346, + "learning_rate": 0.00042285806495447753, + "loss": 3.2822, + "step": 67980 + }, + { + "epoch": 4.61917380078815, + "grad_norm": 0.7844423651695251, + "learning_rate": 0.00042281559994564476, + "loss": 3.4221, + "step": 67985 + }, + { + "epoch": 4.6195135208588125, + "grad_norm": 0.8942902088165283, + "learning_rate": 0.0004227731349368121, + "loss": 3.5368, + "step": 67990 + }, + { + "epoch": 4.619853240929475, + "grad_norm": 1.0745983123779297, + "learning_rate": 0.00042273066992797937, + "loss": 3.5803, + "step": 67995 + }, + { + "epoch": 4.620192961000136, + "grad_norm": 1.3320765495300293, + "learning_rate": 0.0004226882049191466, + "loss": 3.2468, + "step": 68000 + }, + { + "epoch": 4.620532681070798, + "grad_norm": 0.9916790127754211, + "learning_rate": 0.00042264573991031393, + "loss": 3.6054, + "step": 68005 + }, + { + "epoch": 4.62087240114146, + "grad_norm": 0.9033504724502563, + "learning_rate": 0.0004226032749014812, + "loss": 3.5163, + "step": 68010 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.8054015636444092, + "learning_rate": 0.00042256080989264844, + "loss": 3.3587, + "step": 68015 + }, + { + "epoch": 4.621551841282783, + "grad_norm": 0.9221025705337524, + "learning_rate": 0.0004225183448838157, + "loss": 3.3693, + "step": 68020 + }, + { + "epoch": 4.621891561353445, + "grad_norm": 1.02006995677948, + "learning_rate": 0.00042247587987498305, + "loss": 3.3122, + "step": 68025 + }, + { + "epoch": 4.622231281424106, + "grad_norm": 0.9173843860626221, + "learning_rate": 0.0004224334148661503, + "loss": 3.4325, + "step": 68030 + }, + { + "epoch": 4.6225710014947685, + "grad_norm": 1.116262674331665, + "learning_rate": 0.00042239094985731756, + "loss": 3.3826, + "step": 68035 + }, + { + "epoch": 4.62291072156543, + "grad_norm": 1.05982506275177, + "learning_rate": 0.0004223484848484849, + "loss": 3.5441, + "step": 68040 + }, + { + "epoch": 4.623250441636092, + "grad_norm": 0.950940728187561, + "learning_rate": 0.0004223060198396521, + "loss": 3.2434, + "step": 68045 + }, + { + "epoch": 4.623590161706754, + "grad_norm": 0.9409537315368652, + "learning_rate": 0.0004222635548308194, + "loss": 3.6118, + "step": 68050 + }, + { + "epoch": 4.623929881777415, + "grad_norm": 1.268643856048584, + "learning_rate": 0.00042222108982198673, + "loss": 3.3503, + "step": 68055 + }, + { + "epoch": 4.624269601848077, + "grad_norm": 0.944706380367279, + "learning_rate": 0.00042217862481315396, + "loss": 3.6359, + "step": 68060 + }, + { + "epoch": 4.624609321918739, + "grad_norm": 0.9204175472259521, + "learning_rate": 0.00042213615980432124, + "loss": 3.6794, + "step": 68065 + }, + { + "epoch": 4.6249490419894, + "grad_norm": 0.9858812689781189, + "learning_rate": 0.0004220936947954885, + "loss": 3.1558, + "step": 68070 + }, + { + "epoch": 4.6252887620600625, + "grad_norm": 0.7597103714942932, + "learning_rate": 0.0004220512297866558, + "loss": 3.4954, + "step": 68075 + }, + { + "epoch": 4.6256284821307245, + "grad_norm": 1.071475863456726, + "learning_rate": 0.0004220087647778231, + "loss": 3.4208, + "step": 68080 + }, + { + "epoch": 4.625968202201386, + "grad_norm": 0.7880346179008484, + "learning_rate": 0.00042196629976899036, + "loss": 3.4231, + "step": 68085 + }, + { + "epoch": 4.626307922272048, + "grad_norm": 1.0125168561935425, + "learning_rate": 0.0004219238347601577, + "loss": 3.3897, + "step": 68090 + }, + { + "epoch": 4.62664764234271, + "grad_norm": 0.7479348182678223, + "learning_rate": 0.0004218813697513249, + "loss": 3.5771, + "step": 68095 + }, + { + "epoch": 4.626987362413371, + "grad_norm": 1.5701583623886108, + "learning_rate": 0.0004218389047424922, + "loss": 3.647, + "step": 68100 + }, + { + "epoch": 4.627327082484033, + "grad_norm": 1.0936230421066284, + "learning_rate": 0.0004217964397336595, + "loss": 3.3265, + "step": 68105 + }, + { + "epoch": 4.627666802554695, + "grad_norm": 0.7409560680389404, + "learning_rate": 0.00042175397472482676, + "loss": 3.3092, + "step": 68110 + }, + { + "epoch": 4.628006522625356, + "grad_norm": 0.9632290005683899, + "learning_rate": 0.00042171150971599404, + "loss": 3.6188, + "step": 68115 + }, + { + "epoch": 4.6283462426960185, + "grad_norm": 0.8851532936096191, + "learning_rate": 0.0004216690447071613, + "loss": 3.4776, + "step": 68120 + }, + { + "epoch": 4.6286859627666805, + "grad_norm": 1.1417982578277588, + "learning_rate": 0.0004216265796983286, + "loss": 3.6304, + "step": 68125 + }, + { + "epoch": 4.629025682837342, + "grad_norm": 0.9079473614692688, + "learning_rate": 0.0004215841146894959, + "loss": 3.6266, + "step": 68130 + }, + { + "epoch": 4.629365402908004, + "grad_norm": 0.6701377034187317, + "learning_rate": 0.00042154164968066316, + "loss": 3.4408, + "step": 68135 + }, + { + "epoch": 4.629705122978666, + "grad_norm": 0.8451500535011292, + "learning_rate": 0.0004214991846718304, + "loss": 3.4482, + "step": 68140 + }, + { + "epoch": 4.630044843049327, + "grad_norm": 0.7505351305007935, + "learning_rate": 0.0004214567196629977, + "loss": 3.4199, + "step": 68145 + }, + { + "epoch": 4.630384563119989, + "grad_norm": 1.0736321210861206, + "learning_rate": 0.000421414254654165, + "loss": 3.4715, + "step": 68150 + }, + { + "epoch": 4.630724283190651, + "grad_norm": 1.0362058877944946, + "learning_rate": 0.0004213717896453322, + "loss": 3.3602, + "step": 68155 + }, + { + "epoch": 4.631064003261312, + "grad_norm": 0.9060691595077515, + "learning_rate": 0.00042132932463649956, + "loss": 3.5674, + "step": 68160 + }, + { + "epoch": 4.6314037233319745, + "grad_norm": 2.0289061069488525, + "learning_rate": 0.00042128685962766684, + "loss": 3.4848, + "step": 68165 + }, + { + "epoch": 4.6317434434026366, + "grad_norm": 0.879246175289154, + "learning_rate": 0.00042124439461883406, + "loss": 3.5348, + "step": 68170 + }, + { + "epoch": 4.632083163473298, + "grad_norm": 0.986301839351654, + "learning_rate": 0.00042120192961000134, + "loss": 3.4426, + "step": 68175 + }, + { + "epoch": 4.63242288354396, + "grad_norm": 2.2660024166107178, + "learning_rate": 0.0004211594646011687, + "loss": 3.3949, + "step": 68180 + }, + { + "epoch": 4.632762603614622, + "grad_norm": 1.0886389017105103, + "learning_rate": 0.0004211169995923359, + "loss": 3.4583, + "step": 68185 + }, + { + "epoch": 4.633102323685283, + "grad_norm": 0.8283159732818604, + "learning_rate": 0.0004210745345835032, + "loss": 3.4666, + "step": 68190 + }, + { + "epoch": 4.633442043755945, + "grad_norm": 0.7860187888145447, + "learning_rate": 0.0004210320695746705, + "loss": 3.6893, + "step": 68195 + }, + { + "epoch": 4.633781763826607, + "grad_norm": 0.822529673576355, + "learning_rate": 0.00042098960456583774, + "loss": 3.3774, + "step": 68200 + }, + { + "epoch": 4.634121483897268, + "grad_norm": 1.351799488067627, + "learning_rate": 0.000420947139557005, + "loss": 3.5299, + "step": 68205 + }, + { + "epoch": 4.6344612039679305, + "grad_norm": 0.9076831340789795, + "learning_rate": 0.0004209046745481723, + "loss": 3.1671, + "step": 68210 + }, + { + "epoch": 4.634800924038593, + "grad_norm": 0.9458209276199341, + "learning_rate": 0.0004208622095393396, + "loss": 3.8599, + "step": 68215 + }, + { + "epoch": 4.635140644109254, + "grad_norm": 1.125292420387268, + "learning_rate": 0.00042081974453050686, + "loss": 3.4028, + "step": 68220 + }, + { + "epoch": 4.635480364179916, + "grad_norm": 0.7516064047813416, + "learning_rate": 0.00042077727952167414, + "loss": 3.6658, + "step": 68225 + }, + { + "epoch": 4.635820084250578, + "grad_norm": 0.8200341463088989, + "learning_rate": 0.0004207348145128414, + "loss": 3.3035, + "step": 68230 + }, + { + "epoch": 4.636159804321239, + "grad_norm": 0.9639137387275696, + "learning_rate": 0.0004206923495040087, + "loss": 3.4202, + "step": 68235 + }, + { + "epoch": 4.636499524391901, + "grad_norm": 0.9078567624092102, + "learning_rate": 0.000420649884495176, + "loss": 3.527, + "step": 68240 + }, + { + "epoch": 4.636839244462563, + "grad_norm": 0.8271381258964539, + "learning_rate": 0.0004206074194863432, + "loss": 3.5641, + "step": 68245 + }, + { + "epoch": 4.637178964533224, + "grad_norm": 1.1140286922454834, + "learning_rate": 0.00042056495447751054, + "loss": 3.5029, + "step": 68250 + }, + { + "epoch": 4.6375186846038865, + "grad_norm": 0.787558913230896, + "learning_rate": 0.0004205224894686778, + "loss": 3.4288, + "step": 68255 + }, + { + "epoch": 4.637858404674548, + "grad_norm": 0.7674005627632141, + "learning_rate": 0.0004204800244598451, + "loss": 3.5675, + "step": 68260 + }, + { + "epoch": 4.63819812474521, + "grad_norm": 0.8042033910751343, + "learning_rate": 0.0004204375594510124, + "loss": 3.3687, + "step": 68265 + }, + { + "epoch": 4.638537844815872, + "grad_norm": 0.8212522268295288, + "learning_rate": 0.00042039509444217967, + "loss": 3.2221, + "step": 68270 + }, + { + "epoch": 4.638877564886533, + "grad_norm": 0.7807596325874329, + "learning_rate": 0.00042035262943334695, + "loss": 3.6051, + "step": 68275 + }, + { + "epoch": 4.639217284957195, + "grad_norm": 0.997909426689148, + "learning_rate": 0.00042031016442451417, + "loss": 3.7048, + "step": 68280 + }, + { + "epoch": 4.639557005027857, + "grad_norm": 1.1429107189178467, + "learning_rate": 0.0004202676994156815, + "loss": 3.2478, + "step": 68285 + }, + { + "epoch": 4.639896725098518, + "grad_norm": 0.7868729829788208, + "learning_rate": 0.0004202252344068488, + "loss": 3.3429, + "step": 68290 + }, + { + "epoch": 4.64023644516918, + "grad_norm": 1.117238163948059, + "learning_rate": 0.000420182769398016, + "loss": 3.6059, + "step": 68295 + }, + { + "epoch": 4.6405761652398425, + "grad_norm": 0.9118273258209229, + "learning_rate": 0.00042014030438918335, + "loss": 3.269, + "step": 68300 + }, + { + "epoch": 4.640915885310504, + "grad_norm": 0.8289272785186768, + "learning_rate": 0.0004200978393803506, + "loss": 3.1827, + "step": 68305 + }, + { + "epoch": 4.641255605381166, + "grad_norm": 0.8657904267311096, + "learning_rate": 0.00042005537437151785, + "loss": 3.3898, + "step": 68310 + }, + { + "epoch": 4.641595325451828, + "grad_norm": 1.0496755838394165, + "learning_rate": 0.00042001290936268513, + "loss": 3.5389, + "step": 68315 + }, + { + "epoch": 4.641935045522489, + "grad_norm": 0.8585916757583618, + "learning_rate": 0.00041997044435385247, + "loss": 3.4973, + "step": 68320 + }, + { + "epoch": 4.642274765593151, + "grad_norm": NaN, + "learning_rate": 0.0004199364723467863, + "loss": 3.4047, + "step": 68325 + }, + { + "epoch": 4.642614485663813, + "grad_norm": 1.1544864177703857, + "learning_rate": 0.0004198940073379535, + "loss": 3.5459, + "step": 68330 + }, + { + "epoch": 4.642954205734474, + "grad_norm": 0.7217406630516052, + "learning_rate": 0.0004198515423291208, + "loss": 3.4778, + "step": 68335 + }, + { + "epoch": 4.6432939258051364, + "grad_norm": 0.9202871322631836, + "learning_rate": 0.00041980907732028813, + "loss": 3.5284, + "step": 68340 + }, + { + "epoch": 4.6436336458757985, + "grad_norm": 1.161841630935669, + "learning_rate": 0.00041976661231145536, + "loss": 3.1032, + "step": 68345 + }, + { + "epoch": 4.64397336594646, + "grad_norm": 0.9443156719207764, + "learning_rate": 0.00041972414730262264, + "loss": 3.4783, + "step": 68350 + }, + { + "epoch": 4.644313086017122, + "grad_norm": 0.9308874011039734, + "learning_rate": 0.00041968168229378997, + "loss": 3.5231, + "step": 68355 + }, + { + "epoch": 4.644652806087784, + "grad_norm": 0.8463960289955139, + "learning_rate": 0.0004196392172849572, + "loss": 3.4287, + "step": 68360 + }, + { + "epoch": 4.644992526158445, + "grad_norm": 1.151592493057251, + "learning_rate": 0.0004195967522761245, + "loss": 3.2152, + "step": 68365 + }, + { + "epoch": 4.645332246229107, + "grad_norm": 1.0140987634658813, + "learning_rate": 0.00041955428726729176, + "loss": 3.653, + "step": 68370 + }, + { + "epoch": 4.645671966299769, + "grad_norm": 0.962916910648346, + "learning_rate": 0.00041951182225845904, + "loss": 3.6072, + "step": 68375 + }, + { + "epoch": 4.64601168637043, + "grad_norm": 0.8232810497283936, + "learning_rate": 0.0004194693572496263, + "loss": 3.3832, + "step": 68380 + }, + { + "epoch": 4.6463514064410925, + "grad_norm": 0.9009073972702026, + "learning_rate": 0.0004194268922407936, + "loss": 3.4617, + "step": 68385 + }, + { + "epoch": 4.6466911265117545, + "grad_norm": 0.8511329889297485, + "learning_rate": 0.0004193844272319609, + "loss": 3.3822, + "step": 68390 + }, + { + "epoch": 4.647030846582416, + "grad_norm": 1.0330052375793457, + "learning_rate": 0.00041934196222312816, + "loss": 3.5501, + "step": 68395 + }, + { + "epoch": 4.647370566653078, + "grad_norm": 0.938461422920227, + "learning_rate": 0.00041929949721429544, + "loss": 3.6957, + "step": 68400 + }, + { + "epoch": 4.64771028672374, + "grad_norm": 1.0211055278778076, + "learning_rate": 0.00041925703220546266, + "loss": 3.4956, + "step": 68405 + }, + { + "epoch": 4.648050006794401, + "grad_norm": 0.9394470453262329, + "learning_rate": 0.00041921456719663, + "loss": 3.4306, + "step": 68410 + }, + { + "epoch": 4.648389726865063, + "grad_norm": 1.02425217628479, + "learning_rate": 0.0004191721021877973, + "loss": 3.3925, + "step": 68415 + }, + { + "epoch": 4.648729446935725, + "grad_norm": 0.8377985954284668, + "learning_rate": 0.0004191296371789645, + "loss": 3.3606, + "step": 68420 + }, + { + "epoch": 4.649069167006386, + "grad_norm": 0.7971248030662537, + "learning_rate": 0.00041908717217013184, + "loss": 3.3745, + "step": 68425 + }, + { + "epoch": 4.6494088870770485, + "grad_norm": 0.8930221796035767, + "learning_rate": 0.0004190447071612991, + "loss": 3.4173, + "step": 68430 + }, + { + "epoch": 4.6497486071477105, + "grad_norm": 0.8430196642875671, + "learning_rate": 0.00041900224215246634, + "loss": 3.5027, + "step": 68435 + }, + { + "epoch": 4.650088327218372, + "grad_norm": 0.709414541721344, + "learning_rate": 0.0004189597771436336, + "loss": 3.8059, + "step": 68440 + }, + { + "epoch": 4.650428047289034, + "grad_norm": 0.8469245433807373, + "learning_rate": 0.00041891731213480096, + "loss": 3.5245, + "step": 68445 + }, + { + "epoch": 4.650767767359696, + "grad_norm": 1.1460881233215332, + "learning_rate": 0.0004188748471259682, + "loss": 3.5545, + "step": 68450 + }, + { + "epoch": 4.651107487430357, + "grad_norm": 0.8345444202423096, + "learning_rate": 0.00041883238211713546, + "loss": 3.4441, + "step": 68455 + }, + { + "epoch": 4.651447207501019, + "grad_norm": 1.309239149093628, + "learning_rate": 0.0004187899171083028, + "loss": 3.8473, + "step": 68460 + }, + { + "epoch": 4.651786927571681, + "grad_norm": 0.7085918188095093, + "learning_rate": 0.0004187474520994701, + "loss": 3.3696, + "step": 68465 + }, + { + "epoch": 4.652126647642342, + "grad_norm": 0.8945024013519287, + "learning_rate": 0.0004187049870906373, + "loss": 3.6138, + "step": 68470 + }, + { + "epoch": 4.6524663677130045, + "grad_norm": 1.1071985960006714, + "learning_rate": 0.0004186625220818046, + "loss": 3.3351, + "step": 68475 + }, + { + "epoch": 4.6528060877836666, + "grad_norm": 0.8492181301116943, + "learning_rate": 0.0004186200570729719, + "loss": 3.1641, + "step": 68480 + }, + { + "epoch": 4.653145807854328, + "grad_norm": 1.078387975692749, + "learning_rate": 0.00041857759206413914, + "loss": 3.5278, + "step": 68485 + }, + { + "epoch": 4.65348552792499, + "grad_norm": 0.8538522720336914, + "learning_rate": 0.0004185351270553064, + "loss": 3.5538, + "step": 68490 + }, + { + "epoch": 4.653825247995652, + "grad_norm": 0.9867050647735596, + "learning_rate": 0.00041849266204647376, + "loss": 3.3839, + "step": 68495 + }, + { + "epoch": 4.654164968066313, + "grad_norm": 0.8118751049041748, + "learning_rate": 0.000418450197037641, + "loss": 3.5708, + "step": 68500 + }, + { + "epoch": 4.654504688136975, + "grad_norm": 0.858320951461792, + "learning_rate": 0.00041840773202880826, + "loss": 3.4942, + "step": 68505 + }, + { + "epoch": 4.654844408207637, + "grad_norm": 0.8895071744918823, + "learning_rate": 0.00041836526701997554, + "loss": 3.487, + "step": 68510 + }, + { + "epoch": 4.655184128278298, + "grad_norm": 0.7744737267494202, + "learning_rate": 0.0004183228020111428, + "loss": 3.3509, + "step": 68515 + }, + { + "epoch": 4.6555238483489605, + "grad_norm": 0.8673223257064819, + "learning_rate": 0.0004182803370023101, + "loss": 3.4455, + "step": 68520 + }, + { + "epoch": 4.655863568419623, + "grad_norm": 1.090714693069458, + "learning_rate": 0.0004182378719934774, + "loss": 3.3787, + "step": 68525 + }, + { + "epoch": 4.656203288490284, + "grad_norm": 1.0521056652069092, + "learning_rate": 0.00041819540698464466, + "loss": 3.4652, + "step": 68530 + }, + { + "epoch": 4.656543008560946, + "grad_norm": 0.9571863412857056, + "learning_rate": 0.00041815294197581194, + "loss": 3.5472, + "step": 68535 + }, + { + "epoch": 4.656882728631608, + "grad_norm": 2.2917494773864746, + "learning_rate": 0.0004181104769669792, + "loss": 3.5929, + "step": 68540 + }, + { + "epoch": 4.657222448702269, + "grad_norm": 0.9364058375358582, + "learning_rate": 0.00041806801195814645, + "loss": 3.5055, + "step": 68545 + }, + { + "epoch": 4.657562168772931, + "grad_norm": 0.9183163642883301, + "learning_rate": 0.0004180255469493138, + "loss": 3.6195, + "step": 68550 + }, + { + "epoch": 4.657901888843593, + "grad_norm": 0.9120282530784607, + "learning_rate": 0.00041798308194048107, + "loss": 3.269, + "step": 68555 + }, + { + "epoch": 4.658241608914254, + "grad_norm": 0.8997876048088074, + "learning_rate": 0.0004179406169316483, + "loss": 3.5365, + "step": 68560 + }, + { + "epoch": 4.6585813289849165, + "grad_norm": 0.8240998983383179, + "learning_rate": 0.0004178981519228156, + "loss": 3.6223, + "step": 68565 + }, + { + "epoch": 4.658921049055579, + "grad_norm": 1.1167128086090088, + "learning_rate": 0.0004178556869139829, + "loss": 3.6703, + "step": 68570 + }, + { + "epoch": 4.65926076912624, + "grad_norm": 0.9014145731925964, + "learning_rate": 0.00041781322190515013, + "loss": 3.3778, + "step": 68575 + }, + { + "epoch": 4.659600489196902, + "grad_norm": 0.9157878160476685, + "learning_rate": 0.00041777075689631747, + "loss": 3.6306, + "step": 68580 + }, + { + "epoch": 4.659940209267564, + "grad_norm": 0.9390214085578918, + "learning_rate": 0.00041772829188748475, + "loss": 3.6574, + "step": 68585 + }, + { + "epoch": 4.660279929338225, + "grad_norm": 0.910952091217041, + "learning_rate": 0.00041768582687865197, + "loss": 3.4977, + "step": 68590 + }, + { + "epoch": 4.660619649408887, + "grad_norm": 0.8853210806846619, + "learning_rate": 0.00041764336186981925, + "loss": 3.4659, + "step": 68595 + }, + { + "epoch": 4.660959369479549, + "grad_norm": 0.7250235676765442, + "learning_rate": 0.0004176008968609866, + "loss": 3.0685, + "step": 68600 + }, + { + "epoch": 4.66129908955021, + "grad_norm": 0.7922696471214294, + "learning_rate": 0.0004175584318521538, + "loss": 3.4752, + "step": 68605 + }, + { + "epoch": 4.6616388096208725, + "grad_norm": 0.9440906643867493, + "learning_rate": 0.0004175159668433211, + "loss": 3.5307, + "step": 68610 + }, + { + "epoch": 4.661978529691535, + "grad_norm": 1.022051215171814, + "learning_rate": 0.0004174735018344884, + "loss": 3.3897, + "step": 68615 + }, + { + "epoch": 4.662318249762196, + "grad_norm": 1.055835247039795, + "learning_rate": 0.00041743103682565565, + "loss": 3.3832, + "step": 68620 + }, + { + "epoch": 4.662657969832858, + "grad_norm": 0.747126579284668, + "learning_rate": 0.00041738857181682293, + "loss": 3.4385, + "step": 68625 + }, + { + "epoch": 4.66299768990352, + "grad_norm": 0.8269716501235962, + "learning_rate": 0.0004173461068079902, + "loss": 3.6225, + "step": 68630 + }, + { + "epoch": 4.663337409974181, + "grad_norm": 1.0355701446533203, + "learning_rate": 0.00041730364179915755, + "loss": 3.4913, + "step": 68635 + }, + { + "epoch": 4.663677130044843, + "grad_norm": 1.3126460313796997, + "learning_rate": 0.00041726117679032477, + "loss": 3.3639, + "step": 68640 + }, + { + "epoch": 4.664016850115505, + "grad_norm": 0.7754928469657898, + "learning_rate": 0.00041721871178149205, + "loss": 3.4186, + "step": 68645 + }, + { + "epoch": 4.6643565701861665, + "grad_norm": 1.0171769857406616, + "learning_rate": 0.0004171762467726594, + "loss": 3.3595, + "step": 68650 + }, + { + "epoch": 4.6646962902568285, + "grad_norm": 0.803172767162323, + "learning_rate": 0.0004171337817638266, + "loss": 3.362, + "step": 68655 + }, + { + "epoch": 4.665036010327491, + "grad_norm": 0.794612467288971, + "learning_rate": 0.0004170913167549939, + "loss": 3.5938, + "step": 68660 + }, + { + "epoch": 4.665375730398152, + "grad_norm": 0.8498074412345886, + "learning_rate": 0.00041704885174616117, + "loss": 3.5558, + "step": 68665 + }, + { + "epoch": 4.665715450468814, + "grad_norm": 1.1903995275497437, + "learning_rate": 0.00041700638673732845, + "loss": 3.5195, + "step": 68670 + }, + { + "epoch": 4.666055170539476, + "grad_norm": 0.8439905047416687, + "learning_rate": 0.00041696392172849573, + "loss": 3.5928, + "step": 68675 + }, + { + "epoch": 4.666394890610137, + "grad_norm": 0.9108756184577942, + "learning_rate": 0.000416921456719663, + "loss": 3.2458, + "step": 68680 + }, + { + "epoch": 4.666734610680799, + "grad_norm": 1.0803277492523193, + "learning_rate": 0.0004168789917108303, + "loss": 3.586, + "step": 68685 + }, + { + "epoch": 4.667074330751461, + "grad_norm": 1.1362838745117188, + "learning_rate": 0.0004168365267019976, + "loss": 3.6057, + "step": 68690 + }, + { + "epoch": 4.6674140508221225, + "grad_norm": 0.7965763807296753, + "learning_rate": 0.00041679406169316485, + "loss": 3.3342, + "step": 68695 + }, + { + "epoch": 4.6677537708927845, + "grad_norm": 1.1843644380569458, + "learning_rate": 0.0004167515966843321, + "loss": 3.5646, + "step": 68700 + }, + { + "epoch": 4.668093490963447, + "grad_norm": 0.9335412979125977, + "learning_rate": 0.0004167091316754994, + "loss": 3.4251, + "step": 68705 + }, + { + "epoch": 4.668433211034108, + "grad_norm": 1.009602665901184, + "learning_rate": 0.0004166666666666667, + "loss": 3.3, + "step": 68710 + }, + { + "epoch": 4.66877293110477, + "grad_norm": 0.8194633722305298, + "learning_rate": 0.0004166242016578339, + "loss": 3.4751, + "step": 68715 + }, + { + "epoch": 4.669112651175431, + "grad_norm": 1.2031645774841309, + "learning_rate": 0.00041658173664900125, + "loss": 3.5301, + "step": 68720 + }, + { + "epoch": 4.669452371246093, + "grad_norm": 0.9757990837097168, + "learning_rate": 0.00041653927164016853, + "loss": 3.5916, + "step": 68725 + }, + { + "epoch": 4.669792091316755, + "grad_norm": 1.1158692836761475, + "learning_rate": 0.00041649680663133576, + "loss": 3.4681, + "step": 68730 + }, + { + "epoch": 4.670131811387416, + "grad_norm": 1.000247836112976, + "learning_rate": 0.00041645434162250304, + "loss": 3.4652, + "step": 68735 + }, + { + "epoch": 4.6704715314580785, + "grad_norm": 0.7221294045448303, + "learning_rate": 0.0004164118766136704, + "loss": 3.4307, + "step": 68740 + }, + { + "epoch": 4.6708112515287405, + "grad_norm": 0.984487771987915, + "learning_rate": 0.0004163694116048376, + "loss": 3.2751, + "step": 68745 + }, + { + "epoch": 4.671150971599402, + "grad_norm": 1.125544548034668, + "learning_rate": 0.0004163269465960049, + "loss": 3.594, + "step": 68750 + }, + { + "epoch": 4.671490691670064, + "grad_norm": 1.055929183959961, + "learning_rate": 0.0004162844815871722, + "loss": 3.5043, + "step": 68755 + }, + { + "epoch": 4.671830411740726, + "grad_norm": 1.0287470817565918, + "learning_rate": 0.00041624201657833944, + "loss": 3.5652, + "step": 68760 + }, + { + "epoch": 4.672170131811387, + "grad_norm": 0.8526451587677002, + "learning_rate": 0.0004161995515695067, + "loss": 3.6881, + "step": 68765 + }, + { + "epoch": 4.672509851882049, + "grad_norm": 1.044836401939392, + "learning_rate": 0.000416157086560674, + "loss": 3.3054, + "step": 68770 + }, + { + "epoch": 4.672849571952711, + "grad_norm": 0.6997367143630981, + "learning_rate": 0.0004161146215518413, + "loss": 3.6725, + "step": 68775 + }, + { + "epoch": 4.673189292023372, + "grad_norm": 0.8189860582351685, + "learning_rate": 0.00041607215654300856, + "loss": 3.6372, + "step": 68780 + }, + { + "epoch": 4.6735290120940345, + "grad_norm": 0.7443654537200928, + "learning_rate": 0.00041602969153417584, + "loss": 3.5811, + "step": 68785 + }, + { + "epoch": 4.673868732164697, + "grad_norm": 0.7590558528900146, + "learning_rate": 0.0004159872265253431, + "loss": 3.3999, + "step": 68790 + }, + { + "epoch": 4.674208452235358, + "grad_norm": 0.9306771159172058, + "learning_rate": 0.0004159447615165104, + "loss": 3.3527, + "step": 68795 + }, + { + "epoch": 4.67454817230602, + "grad_norm": 0.8987955451011658, + "learning_rate": 0.0004159022965076777, + "loss": 3.438, + "step": 68800 + }, + { + "epoch": 4.674887892376682, + "grad_norm": 0.9012541770935059, + "learning_rate": 0.00041585983149884496, + "loss": 3.3144, + "step": 68805 + }, + { + "epoch": 4.675227612447343, + "grad_norm": 0.8580293655395508, + "learning_rate": 0.00041581736649001224, + "loss": 3.6704, + "step": 68810 + }, + { + "epoch": 4.675567332518005, + "grad_norm": 0.7682273983955383, + "learning_rate": 0.0004157749014811795, + "loss": 3.5244, + "step": 68815 + }, + { + "epoch": 4.675907052588667, + "grad_norm": 1.0014803409576416, + "learning_rate": 0.0004157324364723468, + "loss": 3.4706, + "step": 68820 + }, + { + "epoch": 4.676246772659328, + "grad_norm": 0.9742813110351562, + "learning_rate": 0.0004156899714635141, + "loss": 3.3975, + "step": 68825 + }, + { + "epoch": 4.6765864927299905, + "grad_norm": 1.0611658096313477, + "learning_rate": 0.00041564750645468136, + "loss": 3.3381, + "step": 68830 + }, + { + "epoch": 4.676926212800653, + "grad_norm": 0.9164726734161377, + "learning_rate": 0.00041560504144584864, + "loss": 3.532, + "step": 68835 + }, + { + "epoch": 4.677265932871314, + "grad_norm": 1.3302876949310303, + "learning_rate": 0.00041556257643701587, + "loss": 3.2348, + "step": 68840 + }, + { + "epoch": 4.677605652941976, + "grad_norm": 1.0350905656814575, + "learning_rate": 0.0004155201114281832, + "loss": 3.4483, + "step": 68845 + }, + { + "epoch": 4.677945373012638, + "grad_norm": 0.8649369478225708, + "learning_rate": 0.0004154776464193505, + "loss": 3.3583, + "step": 68850 + }, + { + "epoch": 4.678285093083299, + "grad_norm": 0.9525208473205566, + "learning_rate": 0.0004154351814105177, + "loss": 3.3975, + "step": 68855 + }, + { + "epoch": 4.678624813153961, + "grad_norm": 1.0290883779525757, + "learning_rate": 0.00041539271640168504, + "loss": 3.3928, + "step": 68860 + }, + { + "epoch": 4.678964533224623, + "grad_norm": 1.0896538496017456, + "learning_rate": 0.0004153502513928523, + "loss": 3.3086, + "step": 68865 + }, + { + "epoch": 4.679304253295284, + "grad_norm": 1.0847340822219849, + "learning_rate": 0.00041530778638401955, + "loss": 3.2482, + "step": 68870 + }, + { + "epoch": 4.6796439733659465, + "grad_norm": 1.040099859237671, + "learning_rate": 0.0004152653213751869, + "loss": 3.5591, + "step": 68875 + }, + { + "epoch": 4.679983693436609, + "grad_norm": 0.8350760340690613, + "learning_rate": 0.00041522285636635416, + "loss": 3.3834, + "step": 68880 + }, + { + "epoch": 4.68032341350727, + "grad_norm": 0.8309686183929443, + "learning_rate": 0.0004151803913575214, + "loss": 3.428, + "step": 68885 + }, + { + "epoch": 4.680663133577932, + "grad_norm": 1.202967643737793, + "learning_rate": 0.00041513792634868867, + "loss": 3.5556, + "step": 68890 + }, + { + "epoch": 4.681002853648594, + "grad_norm": 1.0246570110321045, + "learning_rate": 0.000415095461339856, + "loss": 3.4083, + "step": 68895 + }, + { + "epoch": 4.681342573719255, + "grad_norm": 1.1056689023971558, + "learning_rate": 0.00041505299633102323, + "loss": 3.4493, + "step": 68900 + }, + { + "epoch": 4.681682293789917, + "grad_norm": 1.0936998128890991, + "learning_rate": 0.0004150105313221905, + "loss": 3.5161, + "step": 68905 + }, + { + "epoch": 4.682022013860579, + "grad_norm": 0.9661248326301575, + "learning_rate": 0.00041496806631335784, + "loss": 3.3486, + "step": 68910 + }, + { + "epoch": 4.68236173393124, + "grad_norm": 1.0056967735290527, + "learning_rate": 0.00041492560130452507, + "loss": 3.4742, + "step": 68915 + }, + { + "epoch": 4.6827014540019025, + "grad_norm": 1.0155545473098755, + "learning_rate": 0.00041488313629569235, + "loss": 3.7154, + "step": 68920 + }, + { + "epoch": 4.683041174072565, + "grad_norm": 1.4421604871749878, + "learning_rate": 0.00041484067128685963, + "loss": 3.6463, + "step": 68925 + }, + { + "epoch": 4.683380894143226, + "grad_norm": 0.8410888314247131, + "learning_rate": 0.0004147982062780269, + "loss": 3.3986, + "step": 68930 + }, + { + "epoch": 4.683720614213888, + "grad_norm": 1.0440857410430908, + "learning_rate": 0.0004147557412691942, + "loss": 3.1263, + "step": 68935 + }, + { + "epoch": 4.684060334284549, + "grad_norm": 0.9956788420677185, + "learning_rate": 0.00041471327626036147, + "loss": 3.6773, + "step": 68940 + }, + { + "epoch": 4.684400054355211, + "grad_norm": 0.7611450552940369, + "learning_rate": 0.00041467081125152875, + "loss": 3.3464, + "step": 68945 + }, + { + "epoch": 4.684739774425873, + "grad_norm": 0.8789675235748291, + "learning_rate": 0.00041462834624269603, + "loss": 3.4656, + "step": 68950 + }, + { + "epoch": 4.685079494496534, + "grad_norm": 0.9582851529121399, + "learning_rate": 0.0004145858812338633, + "loss": 3.4781, + "step": 68955 + }, + { + "epoch": 4.6854192145671965, + "grad_norm": 1.4130141735076904, + "learning_rate": 0.00041454341622503053, + "loss": 3.438, + "step": 68960 + }, + { + "epoch": 4.6857589346378585, + "grad_norm": 0.9046201705932617, + "learning_rate": 0.00041450095121619787, + "loss": 3.4526, + "step": 68965 + }, + { + "epoch": 4.68609865470852, + "grad_norm": 0.9121639132499695, + "learning_rate": 0.00041445848620736515, + "loss": 3.2492, + "step": 68970 + }, + { + "epoch": 4.686438374779182, + "grad_norm": 0.9174379110336304, + "learning_rate": 0.00041441602119853243, + "loss": 3.5205, + "step": 68975 + }, + { + "epoch": 4.686778094849844, + "grad_norm": 0.7370476722717285, + "learning_rate": 0.0004143735561896997, + "loss": 3.5175, + "step": 68980 + }, + { + "epoch": 4.687117814920505, + "grad_norm": 0.9719595909118652, + "learning_rate": 0.000414331091180867, + "loss": 3.1739, + "step": 68985 + }, + { + "epoch": 4.687457534991167, + "grad_norm": 2.2770755290985107, + "learning_rate": 0.00041428862617203427, + "loss": 3.536, + "step": 68990 + }, + { + "epoch": 4.687797255061829, + "grad_norm": 0.938651978969574, + "learning_rate": 0.0004142461611632015, + "loss": 3.6113, + "step": 68995 + }, + { + "epoch": 4.68813697513249, + "grad_norm": 1.0261788368225098, + "learning_rate": 0.00041420369615436883, + "loss": 3.3636, + "step": 69000 + }, + { + "epoch": 4.6884766952031525, + "grad_norm": 0.9363745450973511, + "learning_rate": 0.0004141612311455361, + "loss": 3.664, + "step": 69005 + }, + { + "epoch": 4.6888164152738145, + "grad_norm": 0.924619734287262, + "learning_rate": 0.00041411876613670334, + "loss": 3.6627, + "step": 69010 + }, + { + "epoch": 4.689156135344476, + "grad_norm": 0.8507902026176453, + "learning_rate": 0.00041407630112787067, + "loss": 3.7528, + "step": 69015 + }, + { + "epoch": 4.689495855415138, + "grad_norm": 1.027483582496643, + "learning_rate": 0.00041403383611903795, + "loss": 3.534, + "step": 69020 + }, + { + "epoch": 4.6898355754858, + "grad_norm": 1.1089626550674438, + "learning_rate": 0.0004139913711102052, + "loss": 3.5485, + "step": 69025 + }, + { + "epoch": 4.690175295556461, + "grad_norm": 0.8208482265472412, + "learning_rate": 0.00041394890610137246, + "loss": 3.7775, + "step": 69030 + }, + { + "epoch": 4.690515015627123, + "grad_norm": 0.7648712396621704, + "learning_rate": 0.0004139064410925398, + "loss": 3.3837, + "step": 69035 + }, + { + "epoch": 4.690854735697785, + "grad_norm": 0.8592596054077148, + "learning_rate": 0.000413863976083707, + "loss": 3.4065, + "step": 69040 + }, + { + "epoch": 4.691194455768446, + "grad_norm": 1.2705016136169434, + "learning_rate": 0.0004138215110748743, + "loss": 3.456, + "step": 69045 + }, + { + "epoch": 4.6915341758391085, + "grad_norm": 2.088160753250122, + "learning_rate": 0.00041377904606604163, + "loss": 3.4951, + "step": 69050 + }, + { + "epoch": 4.6918738959097706, + "grad_norm": 1.0803329944610596, + "learning_rate": 0.00041373658105720886, + "loss": 3.6121, + "step": 69055 + }, + { + "epoch": 4.692213615980432, + "grad_norm": 1.150571584701538, + "learning_rate": 0.00041369411604837614, + "loss": 3.5977, + "step": 69060 + }, + { + "epoch": 4.692553336051094, + "grad_norm": 0.8030391931533813, + "learning_rate": 0.0004136516510395434, + "loss": 3.4896, + "step": 69065 + }, + { + "epoch": 4.692893056121756, + "grad_norm": 0.9197236895561218, + "learning_rate": 0.0004136091860307107, + "loss": 3.6468, + "step": 69070 + }, + { + "epoch": 4.693232776192417, + "grad_norm": 1.0581278800964355, + "learning_rate": 0.000413566721021878, + "loss": 3.8452, + "step": 69075 + }, + { + "epoch": 4.693572496263079, + "grad_norm": 0.76227867603302, + "learning_rate": 0.00041352425601304526, + "loss": 3.5314, + "step": 69080 + }, + { + "epoch": 4.693912216333741, + "grad_norm": 0.8748906850814819, + "learning_rate": 0.00041348179100421254, + "loss": 3.7236, + "step": 69085 + }, + { + "epoch": 4.694251936404402, + "grad_norm": 0.9207110404968262, + "learning_rate": 0.0004134393259953798, + "loss": 3.1599, + "step": 69090 + }, + { + "epoch": 4.6945916564750645, + "grad_norm": 0.7538648247718811, + "learning_rate": 0.0004133968609865471, + "loss": 3.6057, + "step": 69095 + }, + { + "epoch": 4.694931376545727, + "grad_norm": 1.0337698459625244, + "learning_rate": 0.0004133543959777143, + "loss": 3.5138, + "step": 69100 + }, + { + "epoch": 4.695271096616388, + "grad_norm": 0.8189862966537476, + "learning_rate": 0.00041331193096888166, + "loss": 3.6365, + "step": 69105 + }, + { + "epoch": 4.69561081668705, + "grad_norm": 0.8686023950576782, + "learning_rate": 0.00041326946596004894, + "loss": 3.4618, + "step": 69110 + }, + { + "epoch": 4.695950536757712, + "grad_norm": 0.9767278432846069, + "learning_rate": 0.00041322700095121616, + "loss": 3.5134, + "step": 69115 + }, + { + "epoch": 4.696290256828373, + "grad_norm": 0.7752298712730408, + "learning_rate": 0.0004131845359423835, + "loss": 3.654, + "step": 69120 + }, + { + "epoch": 4.696629976899035, + "grad_norm": 0.8428330421447754, + "learning_rate": 0.0004131420709335508, + "loss": 3.4803, + "step": 69125 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 1.1484829187393188, + "learning_rate": 0.000413099605924718, + "loss": 3.2131, + "step": 69130 + }, + { + "epoch": 4.697309417040358, + "grad_norm": 0.7619966268539429, + "learning_rate": 0.0004130571409158853, + "loss": 3.5849, + "step": 69135 + }, + { + "epoch": 4.6976491371110205, + "grad_norm": 0.9202275276184082, + "learning_rate": 0.0004130146759070526, + "loss": 3.4537, + "step": 69140 + }, + { + "epoch": 4.697988857181683, + "grad_norm": 0.8107428550720215, + "learning_rate": 0.0004129722108982199, + "loss": 3.5029, + "step": 69145 + }, + { + "epoch": 4.698328577252344, + "grad_norm": 0.9080563187599182, + "learning_rate": 0.0004129297458893871, + "loss": 3.6235, + "step": 69150 + }, + { + "epoch": 4.698668297323006, + "grad_norm": 1.0622740983963013, + "learning_rate": 0.00041288728088055446, + "loss": 3.5784, + "step": 69155 + }, + { + "epoch": 4.699008017393668, + "grad_norm": 0.78920978307724, + "learning_rate": 0.00041284481587172174, + "loss": 3.632, + "step": 69160 + }, + { + "epoch": 4.699347737464329, + "grad_norm": 0.9653229117393494, + "learning_rate": 0.00041280235086288896, + "loss": 3.3258, + "step": 69165 + }, + { + "epoch": 4.699687457534991, + "grad_norm": 0.8599035739898682, + "learning_rate": 0.0004127598858540563, + "loss": 3.5223, + "step": 69170 + }, + { + "epoch": 4.700027177605653, + "grad_norm": 1.0004385709762573, + "learning_rate": 0.0004127174208452236, + "loss": 3.4521, + "step": 69175 + }, + { + "epoch": 4.700366897676314, + "grad_norm": 0.9432724118232727, + "learning_rate": 0.0004126749558363908, + "loss": 3.3499, + "step": 69180 + }, + { + "epoch": 4.7007066177469765, + "grad_norm": 0.9100037217140198, + "learning_rate": 0.0004126324908275581, + "loss": 3.4295, + "step": 69185 + }, + { + "epoch": 4.701046337817639, + "grad_norm": 1.0370256900787354, + "learning_rate": 0.0004125900258187254, + "loss": 3.1991, + "step": 69190 + }, + { + "epoch": 4.7013860578883, + "grad_norm": 0.886771559715271, + "learning_rate": 0.00041254756080989264, + "loss": 3.7169, + "step": 69195 + }, + { + "epoch": 4.701725777958962, + "grad_norm": 0.9950128197669983, + "learning_rate": 0.0004125050958010599, + "loss": 3.5246, + "step": 69200 + }, + { + "epoch": 4.702065498029624, + "grad_norm": 0.873160719871521, + "learning_rate": 0.00041246263079222726, + "loss": 3.5537, + "step": 69205 + }, + { + "epoch": 4.702405218100285, + "grad_norm": 0.9359835386276245, + "learning_rate": 0.0004124201657833945, + "loss": 3.4535, + "step": 69210 + }, + { + "epoch": 4.702744938170947, + "grad_norm": 1.4792603254318237, + "learning_rate": 0.00041237770077456176, + "loss": 3.4774, + "step": 69215 + }, + { + "epoch": 4.703084658241609, + "grad_norm": 1.2348146438598633, + "learning_rate": 0.00041233523576572904, + "loss": 3.5892, + "step": 69220 + }, + { + "epoch": 4.7034243783122704, + "grad_norm": 0.9451727867126465, + "learning_rate": 0.0004122927707568963, + "loss": 3.5503, + "step": 69225 + }, + { + "epoch": 4.7037640983829325, + "grad_norm": 0.7396966218948364, + "learning_rate": 0.0004122503057480636, + "loss": 3.6825, + "step": 69230 + }, + { + "epoch": 4.704103818453595, + "grad_norm": 0.9278928637504578, + "learning_rate": 0.0004122078407392309, + "loss": 3.4822, + "step": 69235 + }, + { + "epoch": 4.704443538524256, + "grad_norm": 0.8786912560462952, + "learning_rate": 0.00041216537573039816, + "loss": 3.498, + "step": 69240 + }, + { + "epoch": 4.704783258594918, + "grad_norm": 0.8967496752738953, + "learning_rate": 0.00041212291072156544, + "loss": 3.589, + "step": 69245 + }, + { + "epoch": 4.70512297866558, + "grad_norm": 0.9595003724098206, + "learning_rate": 0.0004120804457127327, + "loss": 3.5231, + "step": 69250 + }, + { + "epoch": 4.705462698736241, + "grad_norm": 0.9805569052696228, + "learning_rate": 0.00041203798070389995, + "loss": 3.5817, + "step": 69255 + }, + { + "epoch": 4.705802418806903, + "grad_norm": 1.042433500289917, + "learning_rate": 0.0004119955156950673, + "loss": 3.5397, + "step": 69260 + }, + { + "epoch": 4.706142138877565, + "grad_norm": 0.9085493087768555, + "learning_rate": 0.00041195305068623457, + "loss": 3.3466, + "step": 69265 + }, + { + "epoch": 4.7064818589482265, + "grad_norm": 0.7201143503189087, + "learning_rate": 0.0004119105856774018, + "loss": 3.5289, + "step": 69270 + }, + { + "epoch": 4.7068215790188885, + "grad_norm": 0.8668369054794312, + "learning_rate": 0.0004118681206685691, + "loss": 3.3125, + "step": 69275 + }, + { + "epoch": 4.707161299089551, + "grad_norm": 0.8340924382209778, + "learning_rate": 0.0004118256556597364, + "loss": 3.3237, + "step": 69280 + }, + { + "epoch": 4.707501019160212, + "grad_norm": 1.3947975635528564, + "learning_rate": 0.00041178319065090363, + "loss": 3.4015, + "step": 69285 + }, + { + "epoch": 4.707840739230874, + "grad_norm": 1.2803535461425781, + "learning_rate": 0.0004117407256420709, + "loss": 3.3123, + "step": 69290 + }, + { + "epoch": 4.708180459301536, + "grad_norm": 1.0236245393753052, + "learning_rate": 0.00041169826063323825, + "loss": 3.4137, + "step": 69295 + }, + { + "epoch": 4.708520179372197, + "grad_norm": 0.9299820065498352, + "learning_rate": 0.00041165579562440547, + "loss": 3.7213, + "step": 69300 + }, + { + "epoch": 4.708859899442859, + "grad_norm": 0.8146981596946716, + "learning_rate": 0.00041161333061557275, + "loss": 3.6286, + "step": 69305 + }, + { + "epoch": 4.709199619513521, + "grad_norm": 1.0629218816757202, + "learning_rate": 0.0004115708656067401, + "loss": 3.4441, + "step": 69310 + }, + { + "epoch": 4.7095393395841825, + "grad_norm": 0.6920353770256042, + "learning_rate": 0.00041152840059790737, + "loss": 3.5137, + "step": 69315 + }, + { + "epoch": 4.7098790596548445, + "grad_norm": 0.9011183381080627, + "learning_rate": 0.0004114859355890746, + "loss": 3.5492, + "step": 69320 + }, + { + "epoch": 4.710218779725507, + "grad_norm": 0.7653124332427979, + "learning_rate": 0.00041144347058024187, + "loss": 3.4338, + "step": 69325 + }, + { + "epoch": 4.710558499796168, + "grad_norm": 1.2043757438659668, + "learning_rate": 0.0004114010055714092, + "loss": 3.4366, + "step": 69330 + }, + { + "epoch": 4.71089821986683, + "grad_norm": 0.9090380668640137, + "learning_rate": 0.00041135854056257643, + "loss": 3.3432, + "step": 69335 + }, + { + "epoch": 4.711237939937492, + "grad_norm": 0.8514343500137329, + "learning_rate": 0.0004113160755537437, + "loss": 3.898, + "step": 69340 + }, + { + "epoch": 4.711577660008153, + "grad_norm": 1.0044324398040771, + "learning_rate": 0.00041127361054491105, + "loss": 3.5371, + "step": 69345 + }, + { + "epoch": 4.711917380078815, + "grad_norm": 0.9042927622795105, + "learning_rate": 0.00041123114553607827, + "loss": 3.3053, + "step": 69350 + }, + { + "epoch": 4.712257100149477, + "grad_norm": 0.9744037389755249, + "learning_rate": 0.00041118868052724555, + "loss": 3.3031, + "step": 69355 + }, + { + "epoch": 4.7125968202201385, + "grad_norm": 0.7237083911895752, + "learning_rate": 0.00041114621551841283, + "loss": 3.485, + "step": 69360 + }, + { + "epoch": 4.712936540290801, + "grad_norm": 0.6866466403007507, + "learning_rate": 0.0004111037505095801, + "loss": 3.5617, + "step": 69365 + }, + { + "epoch": 4.713276260361463, + "grad_norm": 0.9360725283622742, + "learning_rate": 0.0004110612855007474, + "loss": 3.4339, + "step": 69370 + }, + { + "epoch": 4.713615980432124, + "grad_norm": 1.0577508211135864, + "learning_rate": 0.00041101882049191467, + "loss": 3.3926, + "step": 69375 + }, + { + "epoch": 4.713955700502786, + "grad_norm": 0.7326856851577759, + "learning_rate": 0.00041097635548308195, + "loss": 3.4204, + "step": 69380 + }, + { + "epoch": 4.714295420573448, + "grad_norm": 0.8062983751296997, + "learning_rate": 0.00041093389047424923, + "loss": 3.3098, + "step": 69385 + }, + { + "epoch": 4.714635140644109, + "grad_norm": 0.8984755277633667, + "learning_rate": 0.0004108914254654165, + "loss": 3.2538, + "step": 69390 + }, + { + "epoch": 4.714974860714771, + "grad_norm": 0.8935202956199646, + "learning_rate": 0.00041084896045658374, + "loss": 3.5943, + "step": 69395 + }, + { + "epoch": 4.715314580785432, + "grad_norm": 0.9000974297523499, + "learning_rate": 0.0004108064954477511, + "loss": 3.5617, + "step": 69400 + }, + { + "epoch": 4.7156543008560945, + "grad_norm": 1.2094624042510986, + "learning_rate": 0.00041076403043891835, + "loss": 3.6264, + "step": 69405 + }, + { + "epoch": 4.715994020926757, + "grad_norm": 1.0527187585830688, + "learning_rate": 0.0004107215654300856, + "loss": 3.5593, + "step": 69410 + }, + { + "epoch": 4.716333740997418, + "grad_norm": 0.8783236742019653, + "learning_rate": 0.0004106791004212529, + "loss": 3.2297, + "step": 69415 + }, + { + "epoch": 4.71667346106808, + "grad_norm": 1.2283588647842407, + "learning_rate": 0.0004106366354124202, + "loss": 3.3098, + "step": 69420 + }, + { + "epoch": 4.717013181138742, + "grad_norm": 0.8305366039276123, + "learning_rate": 0.0004105941704035874, + "loss": 3.6067, + "step": 69425 + }, + { + "epoch": 4.717352901209403, + "grad_norm": 0.722089409828186, + "learning_rate": 0.00041055170539475475, + "loss": 3.4856, + "step": 69430 + }, + { + "epoch": 4.717692621280065, + "grad_norm": 0.7544430494308472, + "learning_rate": 0.00041050924038592203, + "loss": 3.471, + "step": 69435 + }, + { + "epoch": 4.718032341350727, + "grad_norm": 0.9398439526557922, + "learning_rate": 0.00041046677537708926, + "loss": 3.2922, + "step": 69440 + }, + { + "epoch": 4.718372061421388, + "grad_norm": 1.0087188482284546, + "learning_rate": 0.00041042431036825654, + "loss": 3.5108, + "step": 69445 + }, + { + "epoch": 4.7187117814920505, + "grad_norm": 1.0991499423980713, + "learning_rate": 0.0004103818453594239, + "loss": 3.5671, + "step": 69450 + }, + { + "epoch": 4.719051501562713, + "grad_norm": 0.9303971529006958, + "learning_rate": 0.0004103393803505911, + "loss": 3.1105, + "step": 69455 + }, + { + "epoch": 4.719391221633374, + "grad_norm": 1.1387780904769897, + "learning_rate": 0.0004102969153417584, + "loss": 3.4478, + "step": 69460 + }, + { + "epoch": 4.719730941704036, + "grad_norm": 1.056408405303955, + "learning_rate": 0.0004102544503329257, + "loss": 3.6072, + "step": 69465 + }, + { + "epoch": 4.720070661774698, + "grad_norm": 1.0209388732910156, + "learning_rate": 0.00041021198532409294, + "loss": 3.5423, + "step": 69470 + }, + { + "epoch": 4.720410381845359, + "grad_norm": 1.493380069732666, + "learning_rate": 0.0004101695203152602, + "loss": 3.3595, + "step": 69475 + }, + { + "epoch": 4.720750101916021, + "grad_norm": 0.7706539034843445, + "learning_rate": 0.0004101270553064275, + "loss": 3.5306, + "step": 69480 + }, + { + "epoch": 4.721089821986683, + "grad_norm": 0.9722647070884705, + "learning_rate": 0.00041008459029759483, + "loss": 3.2002, + "step": 69485 + }, + { + "epoch": 4.721429542057344, + "grad_norm": 2.343932628631592, + "learning_rate": 0.00041004212528876206, + "loss": 3.4669, + "step": 69490 + }, + { + "epoch": 4.7217692621280065, + "grad_norm": 1.002375841140747, + "learning_rate": 0.00040999966027992934, + "loss": 3.6347, + "step": 69495 + }, + { + "epoch": 4.722108982198669, + "grad_norm": 0.92393958568573, + "learning_rate": 0.0004099571952710967, + "loss": 3.6469, + "step": 69500 + }, + { + "epoch": 4.72244870226933, + "grad_norm": 0.8304163813591003, + "learning_rate": 0.0004099147302622639, + "loss": 3.5686, + "step": 69505 + }, + { + "epoch": 4.722788422339992, + "grad_norm": 1.0055514574050903, + "learning_rate": 0.0004098722652534312, + "loss": 3.5573, + "step": 69510 + }, + { + "epoch": 4.723128142410654, + "grad_norm": 1.0248374938964844, + "learning_rate": 0.00040982980024459846, + "loss": 3.4539, + "step": 69515 + }, + { + "epoch": 4.723467862481315, + "grad_norm": 0.9339898228645325, + "learning_rate": 0.00040978733523576574, + "loss": 3.5153, + "step": 69520 + }, + { + "epoch": 4.723807582551977, + "grad_norm": 0.8309392929077148, + "learning_rate": 0.000409744870226933, + "loss": 3.4249, + "step": 69525 + }, + { + "epoch": 4.724147302622639, + "grad_norm": 1.079878568649292, + "learning_rate": 0.0004097024052181003, + "loss": 3.5313, + "step": 69530 + }, + { + "epoch": 4.7244870226933005, + "grad_norm": 1.0695475339889526, + "learning_rate": 0.0004096599402092676, + "loss": 3.2531, + "step": 69535 + }, + { + "epoch": 4.7248267427639625, + "grad_norm": 1.1379320621490479, + "learning_rate": 0.00040961747520043486, + "loss": 3.4041, + "step": 69540 + }, + { + "epoch": 4.725166462834625, + "grad_norm": 1.009966254234314, + "learning_rate": 0.00040957501019160214, + "loss": 3.602, + "step": 69545 + }, + { + "epoch": 4.725506182905286, + "grad_norm": 0.6881663203239441, + "learning_rate": 0.00040953254518276937, + "loss": 3.9406, + "step": 69550 + }, + { + "epoch": 4.725845902975948, + "grad_norm": 0.7333729863166809, + "learning_rate": 0.0004094900801739367, + "loss": 3.3185, + "step": 69555 + }, + { + "epoch": 4.72618562304661, + "grad_norm": 1.497066617012024, + "learning_rate": 0.000409447615165104, + "loss": 3.1743, + "step": 69560 + }, + { + "epoch": 4.726525343117271, + "grad_norm": 1.089799165725708, + "learning_rate": 0.0004094051501562712, + "loss": 3.4764, + "step": 69565 + }, + { + "epoch": 4.726865063187933, + "grad_norm": 0.8782235980033875, + "learning_rate": 0.00040936268514743854, + "loss": 3.5389, + "step": 69570 + }, + { + "epoch": 4.727204783258595, + "grad_norm": 1.2074804306030273, + "learning_rate": 0.0004093202201386058, + "loss": 3.2767, + "step": 69575 + }, + { + "epoch": 4.7275445033292565, + "grad_norm": 1.063856840133667, + "learning_rate": 0.00040927775512977305, + "loss": 3.4847, + "step": 69580 + }, + { + "epoch": 4.7278842233999185, + "grad_norm": 1.061047077178955, + "learning_rate": 0.00040923529012094033, + "loss": 3.381, + "step": 69585 + }, + { + "epoch": 4.728223943470581, + "grad_norm": 0.9687809348106384, + "learning_rate": 0.00040919282511210766, + "loss": 3.5443, + "step": 69590 + }, + { + "epoch": 4.728563663541242, + "grad_norm": 0.9501528143882751, + "learning_rate": 0.0004091503601032749, + "loss": 3.4657, + "step": 69595 + }, + { + "epoch": 4.728903383611904, + "grad_norm": 1.0584858655929565, + "learning_rate": 0.00040910789509444217, + "loss": 3.4576, + "step": 69600 + }, + { + "epoch": 4.729243103682566, + "grad_norm": 0.9016488790512085, + "learning_rate": 0.0004090654300856095, + "loss": 3.4288, + "step": 69605 + }, + { + "epoch": 4.729582823753227, + "grad_norm": 0.7905771732330322, + "learning_rate": 0.00040902296507677673, + "loss": 3.7033, + "step": 69610 + }, + { + "epoch": 4.729922543823889, + "grad_norm": 1.1306062936782837, + "learning_rate": 0.000408980500067944, + "loss": 3.2668, + "step": 69615 + }, + { + "epoch": 4.73026226389455, + "grad_norm": 0.8357083797454834, + "learning_rate": 0.0004089380350591113, + "loss": 3.467, + "step": 69620 + }, + { + "epoch": 4.7306019839652125, + "grad_norm": 0.8403435349464417, + "learning_rate": 0.00040889557005027857, + "loss": 3.5184, + "step": 69625 + }, + { + "epoch": 4.7309417040358746, + "grad_norm": 0.9138261675834656, + "learning_rate": 0.00040885310504144585, + "loss": 3.5619, + "step": 69630 + }, + { + "epoch": 4.731281424106536, + "grad_norm": 0.917419970035553, + "learning_rate": 0.00040881064003261313, + "loss": 3.3362, + "step": 69635 + }, + { + "epoch": 4.731621144177198, + "grad_norm": 0.9246631264686584, + "learning_rate": 0.0004087681750237804, + "loss": 3.6141, + "step": 69640 + }, + { + "epoch": 4.73196086424786, + "grad_norm": 0.9522258639335632, + "learning_rate": 0.0004087257100149477, + "loss": 3.3835, + "step": 69645 + }, + { + "epoch": 4.732300584318521, + "grad_norm": 0.9418515563011169, + "learning_rate": 0.00040868324500611497, + "loss": 3.3496, + "step": 69650 + }, + { + "epoch": 4.732640304389183, + "grad_norm": 0.9382460713386536, + "learning_rate": 0.00040864077999728225, + "loss": 3.5809, + "step": 69655 + }, + { + "epoch": 4.732980024459845, + "grad_norm": 0.9743805527687073, + "learning_rate": 0.00040859831498844953, + "loss": 3.4562, + "step": 69660 + }, + { + "epoch": 4.733319744530506, + "grad_norm": 0.8578090071678162, + "learning_rate": 0.0004085558499796168, + "loss": 3.1526, + "step": 69665 + }, + { + "epoch": 4.7336594646011685, + "grad_norm": 0.8038434982299805, + "learning_rate": 0.0004085133849707841, + "loss": 3.6163, + "step": 69670 + }, + { + "epoch": 4.733999184671831, + "grad_norm": 1.0043584108352661, + "learning_rate": 0.00040847091996195137, + "loss": 3.5717, + "step": 69675 + }, + { + "epoch": 4.734338904742492, + "grad_norm": 0.8762568831443787, + "learning_rate": 0.00040842845495311865, + "loss": 3.5378, + "step": 69680 + }, + { + "epoch": 4.734678624813154, + "grad_norm": 0.8073022961616516, + "learning_rate": 0.00040838598994428593, + "loss": 3.477, + "step": 69685 + }, + { + "epoch": 4.735018344883816, + "grad_norm": 0.9552071690559387, + "learning_rate": 0.00040834352493545315, + "loss": 3.7551, + "step": 69690 + }, + { + "epoch": 4.735358064954477, + "grad_norm": 0.9925696849822998, + "learning_rate": 0.0004083010599266205, + "loss": 3.4117, + "step": 69695 + }, + { + "epoch": 4.735697785025139, + "grad_norm": 0.8923359513282776, + "learning_rate": 0.00040825859491778777, + "loss": 3.2676, + "step": 69700 + }, + { + "epoch": 4.736037505095801, + "grad_norm": 0.8663620352745056, + "learning_rate": 0.000408216129908955, + "loss": 3.3519, + "step": 69705 + }, + { + "epoch": 4.736377225166462, + "grad_norm": 1.1418766975402832, + "learning_rate": 0.00040817366490012233, + "loss": 3.5682, + "step": 69710 + }, + { + "epoch": 4.7367169452371245, + "grad_norm": 1.4117382764816284, + "learning_rate": 0.0004081311998912896, + "loss": 3.5193, + "step": 69715 + }, + { + "epoch": 4.737056665307787, + "grad_norm": 0.9089451432228088, + "learning_rate": 0.00040808873488245684, + "loss": 3.0171, + "step": 69720 + }, + { + "epoch": 4.737396385378448, + "grad_norm": 0.7812069058418274, + "learning_rate": 0.00040804626987362417, + "loss": 3.8175, + "step": 69725 + }, + { + "epoch": 4.73773610544911, + "grad_norm": 1.0893961191177368, + "learning_rate": 0.00040800380486479145, + "loss": 3.4373, + "step": 69730 + }, + { + "epoch": 4.738075825519772, + "grad_norm": 0.8251974582672119, + "learning_rate": 0.0004079613398559587, + "loss": 3.3316, + "step": 69735 + }, + { + "epoch": 4.738415545590433, + "grad_norm": 0.8916765451431274, + "learning_rate": 0.00040791887484712596, + "loss": 3.6795, + "step": 69740 + }, + { + "epoch": 4.738755265661095, + "grad_norm": 1.0584638118743896, + "learning_rate": 0.0004078764098382933, + "loss": 3.5646, + "step": 69745 + }, + { + "epoch": 4.739094985731757, + "grad_norm": 0.8497293591499329, + "learning_rate": 0.0004078339448294605, + "loss": 3.6483, + "step": 69750 + }, + { + "epoch": 4.739434705802418, + "grad_norm": 1.0508346557617188, + "learning_rate": 0.0004077914798206278, + "loss": 3.6635, + "step": 69755 + }, + { + "epoch": 4.7397744258730805, + "grad_norm": 1.8096421957015991, + "learning_rate": 0.00040774901481179513, + "loss": 3.4127, + "step": 69760 + }, + { + "epoch": 4.740114145943743, + "grad_norm": 0.8721823692321777, + "learning_rate": 0.00040770654980296236, + "loss": 3.6277, + "step": 69765 + }, + { + "epoch": 4.740453866014404, + "grad_norm": 0.812736451625824, + "learning_rate": 0.00040766408479412964, + "loss": 3.4318, + "step": 69770 + }, + { + "epoch": 4.740793586085066, + "grad_norm": 0.9979515075683594, + "learning_rate": 0.0004076216197852969, + "loss": 3.3972, + "step": 69775 + }, + { + "epoch": 4.741133306155728, + "grad_norm": 1.2400060892105103, + "learning_rate": 0.0004075791547764642, + "loss": 3.4446, + "step": 69780 + }, + { + "epoch": 4.741473026226389, + "grad_norm": 0.931250274181366, + "learning_rate": 0.0004075366897676315, + "loss": 3.5123, + "step": 69785 + }, + { + "epoch": 4.741812746297051, + "grad_norm": 1.1109479665756226, + "learning_rate": 0.00040749422475879876, + "loss": 3.5016, + "step": 69790 + }, + { + "epoch": 4.742152466367713, + "grad_norm": 0.8184496760368347, + "learning_rate": 0.00040745175974996604, + "loss": 3.746, + "step": 69795 + }, + { + "epoch": 4.7424921864383744, + "grad_norm": 1.0017801523208618, + "learning_rate": 0.0004074092947411333, + "loss": 3.3659, + "step": 69800 + }, + { + "epoch": 4.7428319065090365, + "grad_norm": 0.9311455488204956, + "learning_rate": 0.0004073668297323006, + "loss": 3.333, + "step": 69805 + }, + { + "epoch": 4.743171626579699, + "grad_norm": 1.0535578727722168, + "learning_rate": 0.0004073243647234678, + "loss": 3.4727, + "step": 69810 + }, + { + "epoch": 4.74351134665036, + "grad_norm": 1.155904769897461, + "learning_rate": 0.00040728189971463516, + "loss": 2.9714, + "step": 69815 + }, + { + "epoch": 4.743851066721022, + "grad_norm": 0.8782064318656921, + "learning_rate": 0.00040723943470580244, + "loss": 3.5545, + "step": 69820 + }, + { + "epoch": 4.744190786791684, + "grad_norm": 0.7951642870903015, + "learning_rate": 0.0004071969696969697, + "loss": 3.5009, + "step": 69825 + }, + { + "epoch": 4.744530506862345, + "grad_norm": 0.8597425222396851, + "learning_rate": 0.000407154504688137, + "loss": 3.5068, + "step": 69830 + }, + { + "epoch": 4.744870226933007, + "grad_norm": 1.0049335956573486, + "learning_rate": 0.0004071120396793043, + "loss": 3.6691, + "step": 69835 + }, + { + "epoch": 4.745209947003669, + "grad_norm": 0.936811625957489, + "learning_rate": 0.00040706957467047156, + "loss": 3.2968, + "step": 69840 + }, + { + "epoch": 4.7455496670743305, + "grad_norm": 1.8954524993896484, + "learning_rate": 0.0004070271096616388, + "loss": 3.5088, + "step": 69845 + }, + { + "epoch": 4.7458893871449925, + "grad_norm": 0.8869929909706116, + "learning_rate": 0.0004069846446528061, + "loss": 3.645, + "step": 69850 + }, + { + "epoch": 4.746229107215655, + "grad_norm": 0.9415448904037476, + "learning_rate": 0.0004069421796439734, + "loss": 3.7322, + "step": 69855 + }, + { + "epoch": 4.746568827286316, + "grad_norm": 1.1199277639389038, + "learning_rate": 0.0004068997146351406, + "loss": 3.2288, + "step": 69860 + }, + { + "epoch": 4.746908547356978, + "grad_norm": 0.8030219674110413, + "learning_rate": 0.00040685724962630796, + "loss": 3.6351, + "step": 69865 + }, + { + "epoch": 4.74724826742764, + "grad_norm": 1.1165502071380615, + "learning_rate": 0.00040681478461747524, + "loss": 3.2518, + "step": 69870 + }, + { + "epoch": 4.747587987498301, + "grad_norm": 0.9954916834831238, + "learning_rate": 0.00040677231960864246, + "loss": 3.4161, + "step": 69875 + }, + { + "epoch": 4.747927707568963, + "grad_norm": 1.1825116872787476, + "learning_rate": 0.00040672985459980974, + "loss": 3.6027, + "step": 69880 + }, + { + "epoch": 4.748267427639625, + "grad_norm": 1.483061671257019, + "learning_rate": 0.0004066873895909771, + "loss": 3.5523, + "step": 69885 + }, + { + "epoch": 4.7486071477102865, + "grad_norm": 0.9154840111732483, + "learning_rate": 0.0004066449245821443, + "loss": 3.7033, + "step": 69890 + }, + { + "epoch": 4.7489468677809485, + "grad_norm": 1.0458025932312012, + "learning_rate": 0.0004066024595733116, + "loss": 3.5413, + "step": 69895 + }, + { + "epoch": 4.749286587851611, + "grad_norm": 0.9702135324478149, + "learning_rate": 0.0004065599945644789, + "loss": 3.5298, + "step": 69900 + }, + { + "epoch": 4.749626307922272, + "grad_norm": 1.0128929615020752, + "learning_rate": 0.00040651752955564614, + "loss": 3.3264, + "step": 69905 + }, + { + "epoch": 4.749966027992934, + "grad_norm": 0.7060816884040833, + "learning_rate": 0.0004064750645468134, + "loss": 3.5484, + "step": 69910 + }, + { + "epoch": 4.750305748063596, + "grad_norm": 0.7376797199249268, + "learning_rate": 0.0004064325995379807, + "loss": 3.6693, + "step": 69915 + }, + { + "epoch": 4.750645468134257, + "grad_norm": 0.9364362955093384, + "learning_rate": 0.000406390134529148, + "loss": 3.4386, + "step": 69920 + }, + { + "epoch": 4.750985188204919, + "grad_norm": 0.8784198760986328, + "learning_rate": 0.00040634766952031526, + "loss": 3.37, + "step": 69925 + }, + { + "epoch": 4.751324908275581, + "grad_norm": 0.8886426687240601, + "learning_rate": 0.00040630520451148254, + "loss": 3.3463, + "step": 69930 + }, + { + "epoch": 4.7516646283462425, + "grad_norm": 0.9895469546318054, + "learning_rate": 0.0004062627395026498, + "loss": 3.1473, + "step": 69935 + }, + { + "epoch": 4.7520043484169046, + "grad_norm": 0.7948732376098633, + "learning_rate": 0.0004062202744938171, + "loss": 3.4975, + "step": 69940 + }, + { + "epoch": 4.752344068487567, + "grad_norm": 1.0006366968154907, + "learning_rate": 0.0004061778094849844, + "loss": 3.8377, + "step": 69945 + }, + { + "epoch": 4.752683788558228, + "grad_norm": 0.8659425973892212, + "learning_rate": 0.0004061353444761516, + "loss": 3.3307, + "step": 69950 + }, + { + "epoch": 4.75302350862889, + "grad_norm": 0.7717373967170715, + "learning_rate": 0.00040609287946731894, + "loss": 3.4198, + "step": 69955 + }, + { + "epoch": 4.753363228699552, + "grad_norm": 1.911049723625183, + "learning_rate": 0.0004060504144584862, + "loss": 3.5772, + "step": 69960 + }, + { + "epoch": 4.753702948770213, + "grad_norm": 0.9027948379516602, + "learning_rate": 0.00040600794944965345, + "loss": 3.6782, + "step": 69965 + }, + { + "epoch": 4.754042668840875, + "grad_norm": 0.8574250340461731, + "learning_rate": 0.0004059654844408208, + "loss": 3.5374, + "step": 69970 + }, + { + "epoch": 4.754382388911537, + "grad_norm": 1.2639585733413696, + "learning_rate": 0.00040592301943198807, + "loss": 3.2083, + "step": 69975 + }, + { + "epoch": 4.7547221089821985, + "grad_norm": 0.995083749294281, + "learning_rate": 0.0004058805544231553, + "loss": 3.531, + "step": 69980 + }, + { + "epoch": 4.755061829052861, + "grad_norm": 1.0490994453430176, + "learning_rate": 0.00040583808941432257, + "loss": 3.4044, + "step": 69985 + }, + { + "epoch": 4.755401549123523, + "grad_norm": 0.8625929355621338, + "learning_rate": 0.0004057956244054899, + "loss": 3.402, + "step": 69990 + }, + { + "epoch": 4.755741269194184, + "grad_norm": 1.001082181930542, + "learning_rate": 0.0004057531593966572, + "loss": 3.2871, + "step": 69995 + }, + { + "epoch": 4.756080989264846, + "grad_norm": 1.2514638900756836, + "learning_rate": 0.0004057106943878244, + "loss": 3.4937, + "step": 70000 + }, + { + "epoch": 4.756420709335508, + "grad_norm": 1.107264518737793, + "learning_rate": 0.00040566822937899175, + "loss": 3.8079, + "step": 70005 + }, + { + "epoch": 4.756760429406169, + "grad_norm": 0.8619604110717773, + "learning_rate": 0.000405625764370159, + "loss": 3.5046, + "step": 70010 + }, + { + "epoch": 4.757100149476831, + "grad_norm": 1.052803874015808, + "learning_rate": 0.00040558329936132625, + "loss": 3.4577, + "step": 70015 + }, + { + "epoch": 4.757439869547493, + "grad_norm": 0.8060438632965088, + "learning_rate": 0.0004055408343524936, + "loss": 3.4292, + "step": 70020 + }, + { + "epoch": 4.7577795896181545, + "grad_norm": 0.9801464080810547, + "learning_rate": 0.00040549836934366087, + "loss": 3.4248, + "step": 70025 + }, + { + "epoch": 4.758119309688817, + "grad_norm": 0.9717452526092529, + "learning_rate": 0.0004054559043348281, + "loss": 3.452, + "step": 70030 + }, + { + "epoch": 4.758459029759479, + "grad_norm": 0.8775578141212463, + "learning_rate": 0.00040541343932599537, + "loss": 3.6844, + "step": 70035 + }, + { + "epoch": 4.75879874983014, + "grad_norm": 0.8441234230995178, + "learning_rate": 0.0004053709743171627, + "loss": 3.5734, + "step": 70040 + }, + { + "epoch": 4.759138469900802, + "grad_norm": 0.8365767002105713, + "learning_rate": 0.00040532850930832993, + "loss": 3.6528, + "step": 70045 + }, + { + "epoch": 4.759478189971464, + "grad_norm": 0.7439895868301392, + "learning_rate": 0.0004052860442994972, + "loss": 3.0339, + "step": 70050 + }, + { + "epoch": 4.759817910042125, + "grad_norm": 0.693715512752533, + "learning_rate": 0.00040524357929066455, + "loss": 3.6697, + "step": 70055 + }, + { + "epoch": 4.760157630112787, + "grad_norm": 0.7161167860031128, + "learning_rate": 0.00040520111428183177, + "loss": 3.2555, + "step": 70060 + }, + { + "epoch": 4.760497350183449, + "grad_norm": 1.0640653371810913, + "learning_rate": 0.00040515864927299905, + "loss": 3.3708, + "step": 70065 + }, + { + "epoch": 4.7608370702541105, + "grad_norm": 0.9384024143218994, + "learning_rate": 0.00040511618426416633, + "loss": 3.7375, + "step": 70070 + }, + { + "epoch": 4.761176790324773, + "grad_norm": 0.9993557333946228, + "learning_rate": 0.0004050737192553336, + "loss": 3.3771, + "step": 70075 + }, + { + "epoch": 4.761516510395434, + "grad_norm": 0.8719512820243835, + "learning_rate": 0.0004050312542465009, + "loss": 3.6066, + "step": 70080 + }, + { + "epoch": 4.761856230466096, + "grad_norm": 0.8744362592697144, + "learning_rate": 0.00040498878923766817, + "loss": 3.4291, + "step": 70085 + }, + { + "epoch": 4.762195950536758, + "grad_norm": 2.1583023071289062, + "learning_rate": 0.00040494632422883545, + "loss": 3.049, + "step": 70090 + }, + { + "epoch": 4.762535670607419, + "grad_norm": 1.001440167427063, + "learning_rate": 0.00040490385922000273, + "loss": 3.3462, + "step": 70095 + }, + { + "epoch": 4.762875390678081, + "grad_norm": 0.8132185935974121, + "learning_rate": 0.00040486139421117, + "loss": 3.5902, + "step": 70100 + }, + { + "epoch": 4.763215110748743, + "grad_norm": 0.710800051689148, + "learning_rate": 0.00040481892920233724, + "loss": 3.4773, + "step": 70105 + }, + { + "epoch": 4.7635548308194045, + "grad_norm": 0.8062109351158142, + "learning_rate": 0.0004047764641935046, + "loss": 2.9979, + "step": 70110 + }, + { + "epoch": 4.7638945508900665, + "grad_norm": 1.2540242671966553, + "learning_rate": 0.00040473399918467185, + "loss": 3.3028, + "step": 70115 + }, + { + "epoch": 4.764234270960729, + "grad_norm": 0.9580163359642029, + "learning_rate": 0.0004046915341758391, + "loss": 3.4495, + "step": 70120 + }, + { + "epoch": 4.76457399103139, + "grad_norm": 0.8417928218841553, + "learning_rate": 0.0004046490691670064, + "loss": 3.5444, + "step": 70125 + }, + { + "epoch": 4.764913711102052, + "grad_norm": 0.9825800061225891, + "learning_rate": 0.0004046066041581737, + "loss": 3.3175, + "step": 70130 + }, + { + "epoch": 4.765253431172714, + "grad_norm": 0.893513023853302, + "learning_rate": 0.0004045641391493409, + "loss": 3.637, + "step": 70135 + }, + { + "epoch": 4.765593151243375, + "grad_norm": 0.9716838002204895, + "learning_rate": 0.0004045216741405082, + "loss": 3.5041, + "step": 70140 + }, + { + "epoch": 4.765932871314037, + "grad_norm": 0.831158459186554, + "learning_rate": 0.00040447920913167553, + "loss": 3.4177, + "step": 70145 + }, + { + "epoch": 4.766272591384699, + "grad_norm": 0.9289102554321289, + "learning_rate": 0.00040443674412284276, + "loss": 3.3409, + "step": 70150 + }, + { + "epoch": 4.7666123114553605, + "grad_norm": 0.8639213442802429, + "learning_rate": 0.00040439427911401004, + "loss": 3.3352, + "step": 70155 + }, + { + "epoch": 4.7669520315260225, + "grad_norm": 1.266225814819336, + "learning_rate": 0.0004043518141051774, + "loss": 3.6745, + "step": 70160 + }, + { + "epoch": 4.767291751596685, + "grad_norm": 0.937574028968811, + "learning_rate": 0.00040430934909634465, + "loss": 3.6245, + "step": 70165 + }, + { + "epoch": 4.767631471667346, + "grad_norm": 1.1318204402923584, + "learning_rate": 0.0004042668840875119, + "loss": 3.549, + "step": 70170 + }, + { + "epoch": 4.767971191738008, + "grad_norm": 0.7939515709877014, + "learning_rate": 0.00040422441907867916, + "loss": 3.4073, + "step": 70175 + }, + { + "epoch": 4.76831091180867, + "grad_norm": 1.242926836013794, + "learning_rate": 0.0004041819540698465, + "loss": 3.7735, + "step": 70180 + }, + { + "epoch": 4.768650631879331, + "grad_norm": 0.8212417960166931, + "learning_rate": 0.0004041394890610137, + "loss": 3.5301, + "step": 70185 + }, + { + "epoch": 4.768990351949993, + "grad_norm": 0.7971827983856201, + "learning_rate": 0.000404097024052181, + "loss": 3.6283, + "step": 70190 + }, + { + "epoch": 4.769330072020655, + "grad_norm": 0.8526086807250977, + "learning_rate": 0.00040405455904334833, + "loss": 3.4423, + "step": 70195 + }, + { + "epoch": 4.7696697920913165, + "grad_norm": 0.821765661239624, + "learning_rate": 0.00040401209403451556, + "loss": 3.4067, + "step": 70200 + }, + { + "epoch": 4.7700095121619785, + "grad_norm": 0.78155517578125, + "learning_rate": 0.00040396962902568284, + "loss": 3.5241, + "step": 70205 + }, + { + "epoch": 4.770349232232641, + "grad_norm": 0.7334286570549011, + "learning_rate": 0.0004039271640168501, + "loss": 3.4577, + "step": 70210 + }, + { + "epoch": 4.770688952303302, + "grad_norm": 0.8580557703971863, + "learning_rate": 0.0004038846990080174, + "loss": 3.4354, + "step": 70215 + }, + { + "epoch": 4.771028672373964, + "grad_norm": 0.8526396155357361, + "learning_rate": 0.0004038422339991847, + "loss": 3.5078, + "step": 70220 + }, + { + "epoch": 4.771368392444626, + "grad_norm": 0.9157712459564209, + "learning_rate": 0.00040379976899035196, + "loss": 3.4086, + "step": 70225 + }, + { + "epoch": 4.771708112515287, + "grad_norm": 0.8810611367225647, + "learning_rate": 0.00040375730398151924, + "loss": 3.6025, + "step": 70230 + }, + { + "epoch": 4.772047832585949, + "grad_norm": 0.9696812033653259, + "learning_rate": 0.0004037148389726865, + "loss": 3.3575, + "step": 70235 + }, + { + "epoch": 4.772387552656611, + "grad_norm": 0.8329572677612305, + "learning_rate": 0.0004036723739638538, + "loss": 3.2537, + "step": 70240 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 2.50813364982605, + "learning_rate": 0.000403629908955021, + "loss": 3.4859, + "step": 70245 + }, + { + "epoch": 4.773066992797935, + "grad_norm": 0.7980586886405945, + "learning_rate": 0.00040358744394618836, + "loss": 3.506, + "step": 70250 + }, + { + "epoch": 4.773406712868597, + "grad_norm": 0.7793160676956177, + "learning_rate": 0.00040354497893735564, + "loss": 3.5113, + "step": 70255 + }, + { + "epoch": 4.773746432939258, + "grad_norm": 0.9271100759506226, + "learning_rate": 0.00040350251392852287, + "loss": 3.5235, + "step": 70260 + }, + { + "epoch": 4.77408615300992, + "grad_norm": 1.1717387437820435, + "learning_rate": 0.0004034600489196902, + "loss": 3.4002, + "step": 70265 + }, + { + "epoch": 4.774425873080582, + "grad_norm": 1.0959041118621826, + "learning_rate": 0.0004034175839108575, + "loss": 3.44, + "step": 70270 + }, + { + "epoch": 4.774765593151243, + "grad_norm": 0.7792422771453857, + "learning_rate": 0.0004033751189020247, + "loss": 3.7154, + "step": 70275 + }, + { + "epoch": 4.775105313221905, + "grad_norm": 0.9272069334983826, + "learning_rate": 0.000403332653893192, + "loss": 3.3525, + "step": 70280 + }, + { + "epoch": 4.775445033292567, + "grad_norm": 0.820048451423645, + "learning_rate": 0.0004032901888843593, + "loss": 3.7379, + "step": 70285 + }, + { + "epoch": 4.7757847533632285, + "grad_norm": 0.9239283204078674, + "learning_rate": 0.00040324772387552655, + "loss": 3.4974, + "step": 70290 + }, + { + "epoch": 4.776124473433891, + "grad_norm": 1.0926564931869507, + "learning_rate": 0.00040320525886669383, + "loss": 3.6238, + "step": 70295 + }, + { + "epoch": 4.776464193504552, + "grad_norm": 1.0045926570892334, + "learning_rate": 0.00040316279385786116, + "loss": 3.6772, + "step": 70300 + }, + { + "epoch": 4.776803913575214, + "grad_norm": 0.9715753197669983, + "learning_rate": 0.0004031203288490284, + "loss": 3.7596, + "step": 70305 + }, + { + "epoch": 4.777143633645876, + "grad_norm": 1.235514760017395, + "learning_rate": 0.00040307786384019567, + "loss": 3.3151, + "step": 70310 + }, + { + "epoch": 4.777483353716537, + "grad_norm": 0.9131484031677246, + "learning_rate": 0.000403035398831363, + "loss": 3.3941, + "step": 70315 + }, + { + "epoch": 4.777823073787199, + "grad_norm": 0.8391175270080566, + "learning_rate": 0.00040299293382253023, + "loss": 3.571, + "step": 70320 + }, + { + "epoch": 4.778162793857861, + "grad_norm": 0.8662473559379578, + "learning_rate": 0.0004029504688136975, + "loss": 3.4645, + "step": 70325 + }, + { + "epoch": 4.778502513928522, + "grad_norm": 0.8971375226974487, + "learning_rate": 0.0004029080038048648, + "loss": 3.3633, + "step": 70330 + }, + { + "epoch": 4.7788422339991845, + "grad_norm": 0.8582561612129211, + "learning_rate": 0.0004028655387960321, + "loss": 3.4816, + "step": 70335 + }, + { + "epoch": 4.779181954069847, + "grad_norm": 0.9009100794792175, + "learning_rate": 0.00040282307378719935, + "loss": 3.2908, + "step": 70340 + }, + { + "epoch": 4.779521674140508, + "grad_norm": 1.053267478942871, + "learning_rate": 0.00040278060877836663, + "loss": 3.4571, + "step": 70345 + }, + { + "epoch": 4.77986139421117, + "grad_norm": 0.9728791117668152, + "learning_rate": 0.00040273814376953396, + "loss": 3.2918, + "step": 70350 + }, + { + "epoch": 4.780201114281832, + "grad_norm": 0.8260645270347595, + "learning_rate": 0.0004026956787607012, + "loss": 3.7085, + "step": 70355 + }, + { + "epoch": 4.780540834352493, + "grad_norm": 0.6608973741531372, + "learning_rate": 0.00040265321375186847, + "loss": 3.558, + "step": 70360 + }, + { + "epoch": 4.780880554423155, + "grad_norm": 1.1728826761245728, + "learning_rate": 0.00040261074874303575, + "loss": 3.2725, + "step": 70365 + }, + { + "epoch": 4.781220274493817, + "grad_norm": 0.8080207705497742, + "learning_rate": 0.00040256828373420303, + "loss": 3.6162, + "step": 70370 + }, + { + "epoch": 4.7815599945644784, + "grad_norm": 0.962407648563385, + "learning_rate": 0.0004025258187253703, + "loss": 3.5778, + "step": 70375 + }, + { + "epoch": 4.7818997146351405, + "grad_norm": 1.1161731481552124, + "learning_rate": 0.0004024833537165376, + "loss": 3.3908, + "step": 70380 + }, + { + "epoch": 4.782239434705803, + "grad_norm": 0.8287981152534485, + "learning_rate": 0.00040244088870770487, + "loss": 3.6904, + "step": 70385 + }, + { + "epoch": 4.782579154776464, + "grad_norm": 0.9440880417823792, + "learning_rate": 0.00040239842369887215, + "loss": 3.4216, + "step": 70390 + }, + { + "epoch": 4.782918874847126, + "grad_norm": 0.9340777397155762, + "learning_rate": 0.00040235595869003943, + "loss": 3.4076, + "step": 70395 + }, + { + "epoch": 4.783258594917788, + "grad_norm": 0.913470983505249, + "learning_rate": 0.00040231349368120665, + "loss": 3.3025, + "step": 70400 + }, + { + "epoch": 4.783598314988449, + "grad_norm": 1.2260994911193848, + "learning_rate": 0.000402271028672374, + "loss": 3.5742, + "step": 70405 + }, + { + "epoch": 4.783938035059111, + "grad_norm": 0.8173726201057434, + "learning_rate": 0.00040222856366354127, + "loss": 3.6769, + "step": 70410 + }, + { + "epoch": 4.784277755129773, + "grad_norm": 0.9800576567649841, + "learning_rate": 0.0004021860986547085, + "loss": 3.7129, + "step": 70415 + }, + { + "epoch": 4.7846174752004345, + "grad_norm": 1.1857936382293701, + "learning_rate": 0.00040214363364587583, + "loss": 3.4074, + "step": 70420 + }, + { + "epoch": 4.7849571952710965, + "grad_norm": 0.8994945287704468, + "learning_rate": 0.0004021011686370431, + "loss": 3.2559, + "step": 70425 + }, + { + "epoch": 4.785296915341759, + "grad_norm": 2.2135043144226074, + "learning_rate": 0.00040205870362821034, + "loss": 3.6353, + "step": 70430 + }, + { + "epoch": 4.78563663541242, + "grad_norm": 0.733504593372345, + "learning_rate": 0.0004020162386193776, + "loss": 3.4394, + "step": 70435 + }, + { + "epoch": 4.785976355483082, + "grad_norm": 1.0649021863937378, + "learning_rate": 0.00040197377361054495, + "loss": 3.3112, + "step": 70440 + }, + { + "epoch": 4.786316075553744, + "grad_norm": 0.8857606053352356, + "learning_rate": 0.0004019313086017122, + "loss": 3.427, + "step": 70445 + }, + { + "epoch": 4.786655795624405, + "grad_norm": 0.8734515905380249, + "learning_rate": 0.00040188884359287946, + "loss": 3.5705, + "step": 70450 + }, + { + "epoch": 4.786995515695067, + "grad_norm": 0.9998297691345215, + "learning_rate": 0.0004018463785840468, + "loss": 3.2476, + "step": 70455 + }, + { + "epoch": 4.787335235765729, + "grad_norm": 0.9271976351737976, + "learning_rate": 0.000401803913575214, + "loss": 3.4479, + "step": 70460 + }, + { + "epoch": 4.7876749558363905, + "grad_norm": 0.8889431953430176, + "learning_rate": 0.0004017614485663813, + "loss": 3.4007, + "step": 70465 + }, + { + "epoch": 4.7880146759070525, + "grad_norm": 1.1757175922393799, + "learning_rate": 0.0004017189835575486, + "loss": 3.709, + "step": 70470 + }, + { + "epoch": 4.788354395977715, + "grad_norm": 1.2908788919448853, + "learning_rate": 0.00040167651854871586, + "loss": 3.5364, + "step": 70475 + }, + { + "epoch": 4.788694116048376, + "grad_norm": 0.8177440762519836, + "learning_rate": 0.00040163405353988314, + "loss": 3.3743, + "step": 70480 + }, + { + "epoch": 4.789033836119038, + "grad_norm": 0.8458428382873535, + "learning_rate": 0.0004015915885310504, + "loss": 3.4072, + "step": 70485 + }, + { + "epoch": 4.7893735561897, + "grad_norm": 0.7606944441795349, + "learning_rate": 0.0004015491235222177, + "loss": 3.3909, + "step": 70490 + }, + { + "epoch": 4.789713276260361, + "grad_norm": 0.9418500065803528, + "learning_rate": 0.000401506658513385, + "loss": 3.3958, + "step": 70495 + }, + { + "epoch": 4.790052996331023, + "grad_norm": 1.0297702550888062, + "learning_rate": 0.00040146419350455226, + "loss": 3.7557, + "step": 70500 + }, + { + "epoch": 4.790392716401685, + "grad_norm": 0.7198707461357117, + "learning_rate": 0.00040142172849571954, + "loss": 3.3048, + "step": 70505 + }, + { + "epoch": 4.7907324364723465, + "grad_norm": 0.7754819393157959, + "learning_rate": 0.0004013792634868868, + "loss": 3.4914, + "step": 70510 + }, + { + "epoch": 4.7910721565430086, + "grad_norm": 0.9045870900154114, + "learning_rate": 0.0004013367984780541, + "loss": 3.3912, + "step": 70515 + }, + { + "epoch": 4.791411876613671, + "grad_norm": 0.7294789552688599, + "learning_rate": 0.0004012943334692214, + "loss": 3.7871, + "step": 70520 + }, + { + "epoch": 4.791751596684332, + "grad_norm": 0.7881842255592346, + "learning_rate": 0.00040125186846038866, + "loss": 3.5098, + "step": 70525 + }, + { + "epoch": 4.792091316754994, + "grad_norm": 1.120352864265442, + "learning_rate": 0.00040120940345155594, + "loss": 3.3382, + "step": 70530 + }, + { + "epoch": 4.792431036825656, + "grad_norm": 1.005694031715393, + "learning_rate": 0.0004011669384427232, + "loss": 3.5737, + "step": 70535 + }, + { + "epoch": 4.792770756896317, + "grad_norm": 1.128570795059204, + "learning_rate": 0.00040112447343389044, + "loss": 3.2717, + "step": 70540 + }, + { + "epoch": 4.793110476966979, + "grad_norm": 1.1145176887512207, + "learning_rate": 0.0004010820084250578, + "loss": 3.3536, + "step": 70545 + }, + { + "epoch": 4.793450197037641, + "grad_norm": 1.1116024255752563, + "learning_rate": 0.00040103954341622506, + "loss": 3.5413, + "step": 70550 + }, + { + "epoch": 4.7937899171083025, + "grad_norm": 0.8806758522987366, + "learning_rate": 0.0004009970784073923, + "loss": 3.7109, + "step": 70555 + }, + { + "epoch": 4.794129637178965, + "grad_norm": 0.8022595643997192, + "learning_rate": 0.0004009546133985596, + "loss": 3.7076, + "step": 70560 + }, + { + "epoch": 4.794469357249627, + "grad_norm": 1.0097354650497437, + "learning_rate": 0.0004009121483897269, + "loss": 3.5832, + "step": 70565 + }, + { + "epoch": 4.794809077320288, + "grad_norm": 1.043013572692871, + "learning_rate": 0.0004008696833808941, + "loss": 3.7485, + "step": 70570 + }, + { + "epoch": 4.79514879739095, + "grad_norm": 1.1443777084350586, + "learning_rate": 0.00040082721837206146, + "loss": 3.3118, + "step": 70575 + }, + { + "epoch": 4.795488517461612, + "grad_norm": 0.9649503827095032, + "learning_rate": 0.00040078475336322874, + "loss": 3.392, + "step": 70580 + }, + { + "epoch": 4.795828237532273, + "grad_norm": 0.9693946838378906, + "learning_rate": 0.00040074228835439596, + "loss": 3.4047, + "step": 70585 + }, + { + "epoch": 4.796167957602935, + "grad_norm": 0.9457901120185852, + "learning_rate": 0.00040069982334556324, + "loss": 3.5851, + "step": 70590 + }, + { + "epoch": 4.796507677673597, + "grad_norm": 1.1099580526351929, + "learning_rate": 0.0004006573583367306, + "loss": 3.3784, + "step": 70595 + }, + { + "epoch": 4.7968473977442585, + "grad_norm": 0.7914828062057495, + "learning_rate": 0.0004006148933278978, + "loss": 3.4554, + "step": 70600 + }, + { + "epoch": 4.797187117814921, + "grad_norm": 0.7389926314353943, + "learning_rate": 0.0004005724283190651, + "loss": 3.5413, + "step": 70605 + }, + { + "epoch": 4.797526837885583, + "grad_norm": 0.8208965063095093, + "learning_rate": 0.0004005299633102324, + "loss": 3.541, + "step": 70610 + }, + { + "epoch": 4.797866557956244, + "grad_norm": 0.9029210805892944, + "learning_rate": 0.00040048749830139964, + "loss": 3.3959, + "step": 70615 + }, + { + "epoch": 4.798206278026906, + "grad_norm": 0.8446347713470459, + "learning_rate": 0.0004004450332925669, + "loss": 3.661, + "step": 70620 + }, + { + "epoch": 4.798545998097568, + "grad_norm": 1.103885531425476, + "learning_rate": 0.0004004025682837342, + "loss": 3.5562, + "step": 70625 + }, + { + "epoch": 4.798885718168229, + "grad_norm": 0.8147714138031006, + "learning_rate": 0.0004003601032749015, + "loss": 3.2248, + "step": 70630 + }, + { + "epoch": 4.799225438238891, + "grad_norm": 0.8847523927688599, + "learning_rate": 0.00040031763826606876, + "loss": 3.481, + "step": 70635 + }, + { + "epoch": 4.799565158309553, + "grad_norm": 0.9900251030921936, + "learning_rate": 0.00040027517325723604, + "loss": 3.4791, + "step": 70640 + }, + { + "epoch": 4.7999048783802145, + "grad_norm": 0.9976971745491028, + "learning_rate": 0.0004002327082484033, + "loss": 3.2292, + "step": 70645 + }, + { + "epoch": 4.800244598450877, + "grad_norm": 1.3184161186218262, + "learning_rate": 0.0004001902432395706, + "loss": 3.4613, + "step": 70650 + }, + { + "epoch": 4.800584318521539, + "grad_norm": 0.9441972970962524, + "learning_rate": 0.0004001477782307379, + "loss": 3.5047, + "step": 70655 + }, + { + "epoch": 4.8009240385922, + "grad_norm": 1.0068618059158325, + "learning_rate": 0.0004001053132219051, + "loss": 3.5668, + "step": 70660 + }, + { + "epoch": 4.801263758662862, + "grad_norm": 0.7266855239868164, + "learning_rate": 0.00040006284821307244, + "loss": 3.6534, + "step": 70665 + }, + { + "epoch": 4.801603478733524, + "grad_norm": 1.0084290504455566, + "learning_rate": 0.0004000203832042397, + "loss": 3.2699, + "step": 70670 + }, + { + "epoch": 4.801943198804185, + "grad_norm": 0.9120093584060669, + "learning_rate": 0.000399977918195407, + "loss": 3.8263, + "step": 70675 + }, + { + "epoch": 4.802282918874847, + "grad_norm": 0.9629181027412415, + "learning_rate": 0.0003999354531865743, + "loss": 3.5054, + "step": 70680 + }, + { + "epoch": 4.802622638945509, + "grad_norm": 0.7449311017990112, + "learning_rate": 0.00039989298817774156, + "loss": 3.4293, + "step": 70685 + }, + { + "epoch": 4.8029623590161705, + "grad_norm": 1.0928714275360107, + "learning_rate": 0.00039985052316890885, + "loss": 3.4782, + "step": 70690 + }, + { + "epoch": 4.803302079086833, + "grad_norm": 0.9108014106750488, + "learning_rate": 0.00039980805816007607, + "loss": 3.6105, + "step": 70695 + }, + { + "epoch": 4.803641799157495, + "grad_norm": 0.9724355936050415, + "learning_rate": 0.0003997655931512434, + "loss": 3.3713, + "step": 70700 + }, + { + "epoch": 4.803981519228156, + "grad_norm": 0.92947918176651, + "learning_rate": 0.0003997231281424107, + "loss": 3.4354, + "step": 70705 + }, + { + "epoch": 4.804321239298818, + "grad_norm": 0.9115689992904663, + "learning_rate": 0.0003996806631335779, + "loss": 3.5414, + "step": 70710 + }, + { + "epoch": 4.80466095936948, + "grad_norm": 0.9166790843009949, + "learning_rate": 0.00039963819812474525, + "loss": 3.4396, + "step": 70715 + }, + { + "epoch": 4.805000679440141, + "grad_norm": 0.6971699595451355, + "learning_rate": 0.0003995957331159125, + "loss": 3.3474, + "step": 70720 + }, + { + "epoch": 4.805340399510803, + "grad_norm": 0.7654916644096375, + "learning_rate": 0.00039955326810707975, + "loss": 3.3307, + "step": 70725 + }, + { + "epoch": 4.805680119581465, + "grad_norm": 0.9090659618377686, + "learning_rate": 0.00039951080309824703, + "loss": 3.5628, + "step": 70730 + }, + { + "epoch": 4.8060198396521265, + "grad_norm": 0.8206060528755188, + "learning_rate": 0.00039946833808941437, + "loss": 3.5372, + "step": 70735 + }, + { + "epoch": 4.806359559722789, + "grad_norm": 0.9293666481971741, + "learning_rate": 0.0003994258730805816, + "loss": 3.4678, + "step": 70740 + }, + { + "epoch": 4.806699279793451, + "grad_norm": 0.8692308068275452, + "learning_rate": 0.00039938340807174887, + "loss": 3.5414, + "step": 70745 + }, + { + "epoch": 4.807038999864112, + "grad_norm": 0.9480046033859253, + "learning_rate": 0.0003993409430629162, + "loss": 3.6496, + "step": 70750 + }, + { + "epoch": 4.807378719934774, + "grad_norm": 0.7501437664031982, + "learning_rate": 0.00039929847805408343, + "loss": 3.5312, + "step": 70755 + }, + { + "epoch": 4.807718440005435, + "grad_norm": 1.1891744136810303, + "learning_rate": 0.0003992560130452507, + "loss": 3.3598, + "step": 70760 + }, + { + "epoch": 4.808058160076097, + "grad_norm": 0.7522063255310059, + "learning_rate": 0.000399213548036418, + "loss": 3.6162, + "step": 70765 + }, + { + "epoch": 4.808397880146759, + "grad_norm": 1.2199472188949585, + "learning_rate": 0.00039917108302758527, + "loss": 3.4997, + "step": 70770 + }, + { + "epoch": 4.8087376002174205, + "grad_norm": 0.8001447319984436, + "learning_rate": 0.00039912861801875255, + "loss": 3.576, + "step": 70775 + }, + { + "epoch": 4.8090773202880825, + "grad_norm": 0.8568519949913025, + "learning_rate": 0.00039908615300991983, + "loss": 3.2623, + "step": 70780 + }, + { + "epoch": 4.809417040358745, + "grad_norm": 0.713882565498352, + "learning_rate": 0.0003990436880010871, + "loss": 3.4663, + "step": 70785 + }, + { + "epoch": 4.809756760429406, + "grad_norm": 0.8788935542106628, + "learning_rate": 0.0003990012229922544, + "loss": 3.3359, + "step": 70790 + }, + { + "epoch": 4.810096480500068, + "grad_norm": 0.987219512462616, + "learning_rate": 0.00039895875798342167, + "loss": 3.3427, + "step": 70795 + }, + { + "epoch": 4.81043620057073, + "grad_norm": 0.8513180017471313, + "learning_rate": 0.0003989162929745889, + "loss": 3.4085, + "step": 70800 + }, + { + "epoch": 4.810775920641391, + "grad_norm": 0.8717772960662842, + "learning_rate": 0.00039887382796575623, + "loss": 3.4728, + "step": 70805 + }, + { + "epoch": 4.811115640712053, + "grad_norm": 0.8682042360305786, + "learning_rate": 0.0003988313629569235, + "loss": 3.6443, + "step": 70810 + }, + { + "epoch": 4.811455360782715, + "grad_norm": 0.9448670148849487, + "learning_rate": 0.00039878889794809074, + "loss": 3.6996, + "step": 70815 + }, + { + "epoch": 4.8117950808533765, + "grad_norm": 1.1857585906982422, + "learning_rate": 0.00039874643293925807, + "loss": 3.4621, + "step": 70820 + }, + { + "epoch": 4.812134800924039, + "grad_norm": 0.9601569771766663, + "learning_rate": 0.00039870396793042535, + "loss": 3.5606, + "step": 70825 + }, + { + "epoch": 4.812474520994701, + "grad_norm": 0.8571251034736633, + "learning_rate": 0.0003986615029215926, + "loss": 3.6147, + "step": 70830 + }, + { + "epoch": 4.812814241065362, + "grad_norm": 1.1373811960220337, + "learning_rate": 0.00039861903791275986, + "loss": 3.558, + "step": 70835 + }, + { + "epoch": 4.813153961136024, + "grad_norm": 1.1855220794677734, + "learning_rate": 0.0003985765729039272, + "loss": 3.5463, + "step": 70840 + }, + { + "epoch": 4.813493681206686, + "grad_norm": 1.0148098468780518, + "learning_rate": 0.0003985341078950945, + "loss": 3.9963, + "step": 70845 + }, + { + "epoch": 4.813833401277347, + "grad_norm": 0.8095639944076538, + "learning_rate": 0.0003984916428862617, + "loss": 3.4624, + "step": 70850 + }, + { + "epoch": 4.814173121348009, + "grad_norm": 1.883031964302063, + "learning_rate": 0.00039844917787742903, + "loss": 3.7426, + "step": 70855 + }, + { + "epoch": 4.814512841418671, + "grad_norm": 0.6871235966682434, + "learning_rate": 0.0003984067128685963, + "loss": 3.3553, + "step": 70860 + }, + { + "epoch": 4.8148525614893325, + "grad_norm": 0.857749342918396, + "learning_rate": 0.00039836424785976354, + "loss": 3.5103, + "step": 70865 + }, + { + "epoch": 4.815192281559995, + "grad_norm": 0.8908435106277466, + "learning_rate": 0.0003983217828509309, + "loss": 3.3724, + "step": 70870 + }, + { + "epoch": 4.815532001630657, + "grad_norm": 0.885162889957428, + "learning_rate": 0.00039827931784209815, + "loss": 3.3721, + "step": 70875 + }, + { + "epoch": 4.815871721701318, + "grad_norm": 0.9193497896194458, + "learning_rate": 0.0003982368528332654, + "loss": 3.7423, + "step": 70880 + }, + { + "epoch": 4.81621144177198, + "grad_norm": 0.7935445308685303, + "learning_rate": 0.00039819438782443266, + "loss": 3.496, + "step": 70885 + }, + { + "epoch": 4.816551161842642, + "grad_norm": 1.1224976778030396, + "learning_rate": 0.0003981519228156, + "loss": 3.6204, + "step": 70890 + }, + { + "epoch": 4.816890881913303, + "grad_norm": 0.9045135974884033, + "learning_rate": 0.0003981094578067672, + "loss": 3.6623, + "step": 70895 + }, + { + "epoch": 4.817230601983965, + "grad_norm": 0.9652150869369507, + "learning_rate": 0.0003980669927979345, + "loss": 3.6629, + "step": 70900 + }, + { + "epoch": 4.817570322054627, + "grad_norm": 1.0509644746780396, + "learning_rate": 0.00039802452778910183, + "loss": 3.7244, + "step": 70905 + }, + { + "epoch": 4.8179100421252885, + "grad_norm": 0.9712699055671692, + "learning_rate": 0.00039798206278026906, + "loss": 3.7698, + "step": 70910 + }, + { + "epoch": 4.818249762195951, + "grad_norm": 0.9808831810951233, + "learning_rate": 0.00039793959777143634, + "loss": 3.4287, + "step": 70915 + }, + { + "epoch": 4.818589482266613, + "grad_norm": 0.7956730127334595, + "learning_rate": 0.0003978971327626036, + "loss": 3.3016, + "step": 70920 + }, + { + "epoch": 4.818929202337274, + "grad_norm": 1.037026047706604, + "learning_rate": 0.0003978546677537709, + "loss": 3.4582, + "step": 70925 + }, + { + "epoch": 4.819268922407936, + "grad_norm": 0.9771406054496765, + "learning_rate": 0.0003978122027449382, + "loss": 3.473, + "step": 70930 + }, + { + "epoch": 4.819608642478598, + "grad_norm": 0.9202039837837219, + "learning_rate": 0.00039776973773610546, + "loss": 3.4383, + "step": 70935 + }, + { + "epoch": 4.819948362549259, + "grad_norm": 0.8714622259140015, + "learning_rate": 0.00039772727272727274, + "loss": 3.6978, + "step": 70940 + }, + { + "epoch": 4.820288082619921, + "grad_norm": 0.8862742781639099, + "learning_rate": 0.00039768480771844, + "loss": 3.779, + "step": 70945 + }, + { + "epoch": 4.820627802690583, + "grad_norm": 0.9422771334648132, + "learning_rate": 0.0003976423427096073, + "loss": 3.471, + "step": 70950 + }, + { + "epoch": 4.8209675227612445, + "grad_norm": 0.8343178033828735, + "learning_rate": 0.0003975998777007745, + "loss": 3.3618, + "step": 70955 + }, + { + "epoch": 4.821307242831907, + "grad_norm": 0.8073381185531616, + "learning_rate": 0.00039755741269194186, + "loss": 3.5272, + "step": 70960 + }, + { + "epoch": 4.821646962902569, + "grad_norm": 0.933560311794281, + "learning_rate": 0.00039751494768310914, + "loss": 3.7033, + "step": 70965 + }, + { + "epoch": 4.82198668297323, + "grad_norm": 0.8543426990509033, + "learning_rate": 0.00039747248267427637, + "loss": 3.6562, + "step": 70970 + }, + { + "epoch": 4.822326403043892, + "grad_norm": 0.7579144835472107, + "learning_rate": 0.0003974300176654437, + "loss": 3.4159, + "step": 70975 + }, + { + "epoch": 4.822666123114553, + "grad_norm": 0.7482693791389465, + "learning_rate": 0.000397387552656611, + "loss": 3.2259, + "step": 70980 + }, + { + "epoch": 4.823005843185215, + "grad_norm": 4.8461079597473145, + "learning_rate": 0.0003973450876477782, + "loss": 3.4983, + "step": 70985 + }, + { + "epoch": 4.823345563255877, + "grad_norm": 0.8982952237129211, + "learning_rate": 0.0003973026226389455, + "loss": 3.4098, + "step": 70990 + }, + { + "epoch": 4.8236852833265385, + "grad_norm": 1.1731888055801392, + "learning_rate": 0.0003972601576301128, + "loss": 3.5188, + "step": 70995 + }, + { + "epoch": 4.8240250033972005, + "grad_norm": 1.0816080570220947, + "learning_rate": 0.00039721769262128005, + "loss": 3.5488, + "step": 71000 + }, + { + "epoch": 4.824364723467863, + "grad_norm": 0.868992030620575, + "learning_rate": 0.00039717522761244733, + "loss": 3.4631, + "step": 71005 + }, + { + "epoch": 4.824704443538524, + "grad_norm": 1.1523455381393433, + "learning_rate": 0.00039713276260361466, + "loss": 3.5184, + "step": 71010 + }, + { + "epoch": 4.825044163609186, + "grad_norm": 0.822525680065155, + "learning_rate": 0.00039709029759478194, + "loss": 3.0018, + "step": 71015 + }, + { + "epoch": 4.825383883679848, + "grad_norm": 0.7927701473236084, + "learning_rate": 0.00039704783258594917, + "loss": 3.374, + "step": 71020 + }, + { + "epoch": 4.825723603750509, + "grad_norm": 0.9470824003219604, + "learning_rate": 0.00039700536757711645, + "loss": 3.5883, + "step": 71025 + }, + { + "epoch": 4.826063323821171, + "grad_norm": 1.0016555786132812, + "learning_rate": 0.0003969629025682838, + "loss": 3.6152, + "step": 71030 + }, + { + "epoch": 4.826403043891833, + "grad_norm": 1.0169398784637451, + "learning_rate": 0.000396920437559451, + "loss": 3.5645, + "step": 71035 + }, + { + "epoch": 4.8267427639624945, + "grad_norm": 0.9602264761924744, + "learning_rate": 0.0003968779725506183, + "loss": 3.3711, + "step": 71040 + }, + { + "epoch": 4.8270824840331565, + "grad_norm": 0.9066042900085449, + "learning_rate": 0.0003968355075417856, + "loss": 3.3141, + "step": 71045 + }, + { + "epoch": 4.827422204103819, + "grad_norm": 0.8322476148605347, + "learning_rate": 0.00039679304253295285, + "loss": 3.6205, + "step": 71050 + }, + { + "epoch": 4.82776192417448, + "grad_norm": 1.06336510181427, + "learning_rate": 0.00039675057752412013, + "loss": 3.4474, + "step": 71055 + }, + { + "epoch": 4.828101644245142, + "grad_norm": 12.803571701049805, + "learning_rate": 0.0003967081125152874, + "loss": 3.5448, + "step": 71060 + }, + { + "epoch": 4.828441364315804, + "grad_norm": 1.0238789319992065, + "learning_rate": 0.0003966656475064547, + "loss": 3.6302, + "step": 71065 + }, + { + "epoch": 4.828781084386465, + "grad_norm": 0.8726906776428223, + "learning_rate": 0.00039662318249762197, + "loss": 3.4473, + "step": 71070 + }, + { + "epoch": 4.829120804457127, + "grad_norm": 0.9747965931892395, + "learning_rate": 0.00039658071748878925, + "loss": 3.4315, + "step": 71075 + }, + { + "epoch": 4.829460524527789, + "grad_norm": 0.8402705788612366, + "learning_rate": 0.00039653825247995653, + "loss": 3.4744, + "step": 71080 + }, + { + "epoch": 4.8298002445984505, + "grad_norm": 0.7554439902305603, + "learning_rate": 0.0003964957874711238, + "loss": 3.6345, + "step": 71085 + }, + { + "epoch": 4.8301399646691126, + "grad_norm": 0.7433812022209167, + "learning_rate": 0.0003964533224622911, + "loss": 3.6533, + "step": 71090 + }, + { + "epoch": 4.830479684739775, + "grad_norm": 1.048396348953247, + "learning_rate": 0.0003964108574534583, + "loss": 3.4474, + "step": 71095 + }, + { + "epoch": 4.830819404810436, + "grad_norm": 1.8106375932693481, + "learning_rate": 0.00039636839244462565, + "loss": 3.5479, + "step": 71100 + }, + { + "epoch": 4.831159124881098, + "grad_norm": 0.9991978406906128, + "learning_rate": 0.00039632592743579293, + "loss": 3.4235, + "step": 71105 + }, + { + "epoch": 4.83149884495176, + "grad_norm": 1.0404540300369263, + "learning_rate": 0.00039628346242696015, + "loss": 3.6267, + "step": 71110 + }, + { + "epoch": 4.831838565022421, + "grad_norm": 0.857841968536377, + "learning_rate": 0.0003962409974181275, + "loss": 3.6042, + "step": 71115 + }, + { + "epoch": 4.832178285093083, + "grad_norm": 0.7388150095939636, + "learning_rate": 0.00039619853240929477, + "loss": 3.3191, + "step": 71120 + }, + { + "epoch": 4.832518005163745, + "grad_norm": 0.8850029706954956, + "learning_rate": 0.000396156067400462, + "loss": 3.5982, + "step": 71125 + }, + { + "epoch": 4.8328577252344065, + "grad_norm": 0.8312262296676636, + "learning_rate": 0.0003961136023916293, + "loss": 3.2964, + "step": 71130 + }, + { + "epoch": 4.833197445305069, + "grad_norm": 0.7024043798446655, + "learning_rate": 0.0003960711373827966, + "loss": 3.6816, + "step": 71135 + }, + { + "epoch": 4.833537165375731, + "grad_norm": 0.8636791110038757, + "learning_rate": 0.00039602867237396384, + "loss": 3.2056, + "step": 71140 + }, + { + "epoch": 4.833876885446392, + "grad_norm": 0.9131413102149963, + "learning_rate": 0.0003959862073651311, + "loss": 3.4385, + "step": 71145 + }, + { + "epoch": 4.834216605517054, + "grad_norm": 0.7394030690193176, + "learning_rate": 0.00039594374235629845, + "loss": 3.5554, + "step": 71150 + }, + { + "epoch": 4.834556325587716, + "grad_norm": 0.9875873327255249, + "learning_rate": 0.0003959012773474657, + "loss": 3.4805, + "step": 71155 + }, + { + "epoch": 4.834896045658377, + "grad_norm": 0.9506178498268127, + "learning_rate": 0.00039585881233863296, + "loss": 3.3773, + "step": 71160 + }, + { + "epoch": 4.835235765729039, + "grad_norm": 0.9853305816650391, + "learning_rate": 0.0003958163473298003, + "loss": 3.6641, + "step": 71165 + }, + { + "epoch": 4.835575485799701, + "grad_norm": 0.7909849286079407, + "learning_rate": 0.0003957738823209675, + "loss": 3.5329, + "step": 71170 + }, + { + "epoch": 4.8359152058703625, + "grad_norm": 1.0765265226364136, + "learning_rate": 0.0003957314173121348, + "loss": 3.4067, + "step": 71175 + }, + { + "epoch": 4.836254925941025, + "grad_norm": 0.6848616003990173, + "learning_rate": 0.0003956889523033021, + "loss": 3.4663, + "step": 71180 + }, + { + "epoch": 4.836594646011687, + "grad_norm": 1.0184352397918701, + "learning_rate": 0.0003956464872944694, + "loss": 3.6003, + "step": 71185 + }, + { + "epoch": 4.836934366082348, + "grad_norm": 0.8530946373939514, + "learning_rate": 0.00039560402228563664, + "loss": 3.389, + "step": 71190 + }, + { + "epoch": 4.83727408615301, + "grad_norm": 0.8850993514060974, + "learning_rate": 0.0003955615572768039, + "loss": 3.4482, + "step": 71195 + }, + { + "epoch": 4.837613806223672, + "grad_norm": 1.027437686920166, + "learning_rate": 0.00039551909226797125, + "loss": 3.4834, + "step": 71200 + }, + { + "epoch": 4.837953526294333, + "grad_norm": 0.9675962924957275, + "learning_rate": 0.0003954766272591385, + "loss": 3.4704, + "step": 71205 + }, + { + "epoch": 4.838293246364995, + "grad_norm": 1.0145937204360962, + "learning_rate": 0.00039543416225030576, + "loss": 3.5571, + "step": 71210 + }, + { + "epoch": 4.838632966435657, + "grad_norm": 0.8983915448188782, + "learning_rate": 0.00039539169724147304, + "loss": 3.5281, + "step": 71215 + }, + { + "epoch": 4.8389726865063185, + "grad_norm": 0.8906980156898499, + "learning_rate": 0.0003953492322326403, + "loss": 3.4546, + "step": 71220 + }, + { + "epoch": 4.839312406576981, + "grad_norm": 0.8135682940483093, + "learning_rate": 0.0003953067672238076, + "loss": 3.3413, + "step": 71225 + }, + { + "epoch": 4.839652126647643, + "grad_norm": 0.7624861598014832, + "learning_rate": 0.0003952643022149749, + "loss": 3.1632, + "step": 71230 + }, + { + "epoch": 4.839991846718304, + "grad_norm": 0.8589698076248169, + "learning_rate": 0.00039522183720614216, + "loss": 3.4635, + "step": 71235 + }, + { + "epoch": 4.840331566788966, + "grad_norm": 0.8466506004333496, + "learning_rate": 0.00039517937219730944, + "loss": 3.6362, + "step": 71240 + }, + { + "epoch": 4.840671286859628, + "grad_norm": 0.9901653528213501, + "learning_rate": 0.0003951369071884767, + "loss": 3.3883, + "step": 71245 + }, + { + "epoch": 4.841011006930289, + "grad_norm": 0.7635117173194885, + "learning_rate": 0.00039509444217964394, + "loss": 3.5695, + "step": 71250 + }, + { + "epoch": 4.841350727000951, + "grad_norm": 0.8886563777923584, + "learning_rate": 0.0003950519771708113, + "loss": 3.5531, + "step": 71255 + }, + { + "epoch": 4.841690447071613, + "grad_norm": 0.8161185383796692, + "learning_rate": 0.00039500951216197856, + "loss": 3.4164, + "step": 71260 + }, + { + "epoch": 4.8420301671422745, + "grad_norm": 0.9381132125854492, + "learning_rate": 0.0003949670471531458, + "loss": 3.5204, + "step": 71265 + }, + { + "epoch": 4.842369887212937, + "grad_norm": 0.7493913769721985, + "learning_rate": 0.0003949245821443131, + "loss": 3.3353, + "step": 71270 + }, + { + "epoch": 4.842709607283599, + "grad_norm": 0.8516833782196045, + "learning_rate": 0.0003948821171354804, + "loss": 3.5583, + "step": 71275 + }, + { + "epoch": 4.84304932735426, + "grad_norm": 0.8884556889533997, + "learning_rate": 0.0003948396521266476, + "loss": 3.4925, + "step": 71280 + }, + { + "epoch": 4.843389047424922, + "grad_norm": 0.9411657452583313, + "learning_rate": 0.0003947971871178149, + "loss": 3.4557, + "step": 71285 + }, + { + "epoch": 4.843728767495584, + "grad_norm": 0.9855687618255615, + "learning_rate": 0.00039475472210898224, + "loss": 3.4519, + "step": 71290 + }, + { + "epoch": 4.844068487566245, + "grad_norm": 0.7584624290466309, + "learning_rate": 0.00039471225710014946, + "loss": 3.4461, + "step": 71295 + }, + { + "epoch": 4.844408207636907, + "grad_norm": 0.7659784555435181, + "learning_rate": 0.00039466979209131674, + "loss": 3.199, + "step": 71300 + }, + { + "epoch": 4.844747927707569, + "grad_norm": 1.610106348991394, + "learning_rate": 0.0003946273270824841, + "loss": 3.636, + "step": 71305 + }, + { + "epoch": 4.8450876477782305, + "grad_norm": 1.1047072410583496, + "learning_rate": 0.0003945848620736513, + "loss": 3.3576, + "step": 71310 + }, + { + "epoch": 4.845427367848893, + "grad_norm": 1.084025263786316, + "learning_rate": 0.0003945423970648186, + "loss": 3.5552, + "step": 71315 + }, + { + "epoch": 4.845767087919555, + "grad_norm": 0.8473870158195496, + "learning_rate": 0.00039449993205598586, + "loss": 3.778, + "step": 71320 + }, + { + "epoch": 4.846106807990216, + "grad_norm": 0.8838701844215393, + "learning_rate": 0.00039445746704715314, + "loss": 3.6097, + "step": 71325 + }, + { + "epoch": 4.846446528060878, + "grad_norm": 3.5867583751678467, + "learning_rate": 0.0003944150020383204, + "loss": 3.627, + "step": 71330 + }, + { + "epoch": 4.84678624813154, + "grad_norm": 1.1372102499008179, + "learning_rate": 0.0003943725370294877, + "loss": 3.6092, + "step": 71335 + }, + { + "epoch": 4.847125968202201, + "grad_norm": 1.0246628522872925, + "learning_rate": 0.000394330072020655, + "loss": 3.3384, + "step": 71340 + }, + { + "epoch": 4.847465688272863, + "grad_norm": 0.8294450044631958, + "learning_rate": 0.00039428760701182226, + "loss": 3.6227, + "step": 71345 + }, + { + "epoch": 4.847805408343525, + "grad_norm": 0.9367049932479858, + "learning_rate": 0.00039424514200298954, + "loss": 3.5085, + "step": 71350 + }, + { + "epoch": 4.8481451284141865, + "grad_norm": 1.283703327178955, + "learning_rate": 0.0003942026769941568, + "loss": 3.7882, + "step": 71355 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.7247530817985535, + "learning_rate": 0.0003941602119853241, + "loss": 3.287, + "step": 71360 + }, + { + "epoch": 4.848824568555511, + "grad_norm": 0.8477506637573242, + "learning_rate": 0.0003941177469764914, + "loss": 3.5753, + "step": 71365 + }, + { + "epoch": 4.849164288626172, + "grad_norm": 0.8587186932563782, + "learning_rate": 0.00039407528196765866, + "loss": 3.2608, + "step": 71370 + }, + { + "epoch": 4.849504008696834, + "grad_norm": 0.8510515093803406, + "learning_rate": 0.00039403281695882594, + "loss": 3.4441, + "step": 71375 + }, + { + "epoch": 4.849843728767496, + "grad_norm": 0.9366026520729065, + "learning_rate": 0.0003939903519499932, + "loss": 3.5082, + "step": 71380 + }, + { + "epoch": 4.850183448838157, + "grad_norm": 1.2395544052124023, + "learning_rate": 0.0003939478869411605, + "loss": 3.3451, + "step": 71385 + }, + { + "epoch": 4.850523168908819, + "grad_norm": 0.8000422716140747, + "learning_rate": 0.00039390542193232773, + "loss": 3.5251, + "step": 71390 + }, + { + "epoch": 4.850862888979481, + "grad_norm": 0.8992046117782593, + "learning_rate": 0.00039386295692349506, + "loss": 3.2743, + "step": 71395 + }, + { + "epoch": 4.8512026090501426, + "grad_norm": 0.7928590774536133, + "learning_rate": 0.00039382049191466234, + "loss": 3.4928, + "step": 71400 + }, + { + "epoch": 4.851542329120805, + "grad_norm": 1.0892409086227417, + "learning_rate": 0.00039377802690582957, + "loss": 3.4757, + "step": 71405 + }, + { + "epoch": 4.851882049191467, + "grad_norm": 1.062013030052185, + "learning_rate": 0.0003937355618969969, + "loss": 3.3835, + "step": 71410 + }, + { + "epoch": 4.852221769262128, + "grad_norm": 0.8623493313789368, + "learning_rate": 0.0003936930968881642, + "loss": 3.5521, + "step": 71415 + }, + { + "epoch": 4.85256148933279, + "grad_norm": 1.0173346996307373, + "learning_rate": 0.0003936506318793314, + "loss": 3.307, + "step": 71420 + }, + { + "epoch": 4.852901209403452, + "grad_norm": 0.776207685470581, + "learning_rate": 0.00039360816687049875, + "loss": 3.383, + "step": 71425 + }, + { + "epoch": 4.853240929474113, + "grad_norm": 0.9538078308105469, + "learning_rate": 0.000393565701861666, + "loss": 3.657, + "step": 71430 + }, + { + "epoch": 4.853580649544775, + "grad_norm": 0.8916516900062561, + "learning_rate": 0.00039352323685283325, + "loss": 3.5432, + "step": 71435 + }, + { + "epoch": 4.8539203696154365, + "grad_norm": 0.860891580581665, + "learning_rate": 0.00039348077184400053, + "loss": 3.7033, + "step": 71440 + }, + { + "epoch": 4.854260089686099, + "grad_norm": 1.0825690031051636, + "learning_rate": 0.00039343830683516787, + "loss": 3.2504, + "step": 71445 + }, + { + "epoch": 4.854599809756761, + "grad_norm": 0.7629599571228027, + "learning_rate": 0.0003933958418263351, + "loss": 3.4018, + "step": 71450 + }, + { + "epoch": 4.854939529827422, + "grad_norm": 0.9384850859642029, + "learning_rate": 0.00039335337681750237, + "loss": 3.39, + "step": 71455 + }, + { + "epoch": 4.855279249898084, + "grad_norm": 0.9430871605873108, + "learning_rate": 0.0003933109118086697, + "loss": 3.408, + "step": 71460 + }, + { + "epoch": 4.855618969968746, + "grad_norm": 0.9833781123161316, + "learning_rate": 0.00039326844679983693, + "loss": 3.436, + "step": 71465 + }, + { + "epoch": 4.855958690039407, + "grad_norm": 1.018206238746643, + "learning_rate": 0.0003932259817910042, + "loss": 3.475, + "step": 71470 + }, + { + "epoch": 4.856298410110069, + "grad_norm": 1.2605363130569458, + "learning_rate": 0.0003931835167821715, + "loss": 3.6734, + "step": 71475 + }, + { + "epoch": 4.856638130180731, + "grad_norm": 0.8456436991691589, + "learning_rate": 0.00039314105177333877, + "loss": 3.2504, + "step": 71480 + }, + { + "epoch": 4.8569778502513925, + "grad_norm": 0.8546353578567505, + "learning_rate": 0.00039309858676450605, + "loss": 3.7728, + "step": 71485 + }, + { + "epoch": 4.857317570322055, + "grad_norm": 0.9890176653862, + "learning_rate": 0.00039305612175567333, + "loss": 3.616, + "step": 71490 + }, + { + "epoch": 4.857657290392717, + "grad_norm": 1.2920981645584106, + "learning_rate": 0.0003930136567468406, + "loss": 3.3002, + "step": 71495 + }, + { + "epoch": 4.857997010463378, + "grad_norm": 0.820393443107605, + "learning_rate": 0.0003929711917380079, + "loss": 3.681, + "step": 71500 + }, + { + "epoch": 4.85833673053404, + "grad_norm": 1.141140341758728, + "learning_rate": 0.00039292872672917517, + "loss": 3.1819, + "step": 71505 + }, + { + "epoch": 4.858676450604702, + "grad_norm": 0.8667577505111694, + "learning_rate": 0.0003928862617203424, + "loss": 3.2559, + "step": 71510 + }, + { + "epoch": 4.859016170675363, + "grad_norm": 0.9907949566841125, + "learning_rate": 0.00039284379671150973, + "loss": 3.31, + "step": 71515 + }, + { + "epoch": 4.859355890746025, + "grad_norm": 0.7735624313354492, + "learning_rate": 0.000392801331702677, + "loss": 3.3489, + "step": 71520 + }, + { + "epoch": 4.859695610816687, + "grad_norm": 1.0069260597229004, + "learning_rate": 0.0003927588666938443, + "loss": 3.493, + "step": 71525 + }, + { + "epoch": 4.8600353308873485, + "grad_norm": 0.928533673286438, + "learning_rate": 0.00039271640168501157, + "loss": 3.4108, + "step": 71530 + }, + { + "epoch": 4.860375050958011, + "grad_norm": 1.0607649087905884, + "learning_rate": 0.00039267393667617885, + "loss": 3.431, + "step": 71535 + }, + { + "epoch": 4.860714771028673, + "grad_norm": 0.965529203414917, + "learning_rate": 0.00039263147166734613, + "loss": 3.5174, + "step": 71540 + }, + { + "epoch": 4.861054491099334, + "grad_norm": 0.7889527082443237, + "learning_rate": 0.00039258900665851336, + "loss": 3.7822, + "step": 71545 + }, + { + "epoch": 4.861394211169996, + "grad_norm": 0.7861848473548889, + "learning_rate": 0.0003925465416496807, + "loss": 3.4793, + "step": 71550 + }, + { + "epoch": 4.861733931240658, + "grad_norm": 1.0139328241348267, + "learning_rate": 0.000392504076640848, + "loss": 3.6884, + "step": 71555 + }, + { + "epoch": 4.862073651311319, + "grad_norm": 0.9447523355484009, + "learning_rate": 0.0003924616116320152, + "loss": 3.2767, + "step": 71560 + }, + { + "epoch": 4.862413371381981, + "grad_norm": 1.2079130411148071, + "learning_rate": 0.00039241914662318253, + "loss": 3.4133, + "step": 71565 + }, + { + "epoch": 4.862753091452643, + "grad_norm": 1.0133838653564453, + "learning_rate": 0.0003923766816143498, + "loss": 3.3767, + "step": 71570 + }, + { + "epoch": 4.8630928115233045, + "grad_norm": 0.8459023237228394, + "learning_rate": 0.00039233421660551704, + "loss": 3.5578, + "step": 71575 + }, + { + "epoch": 4.863432531593967, + "grad_norm": 0.970865786075592, + "learning_rate": 0.0003922917515966843, + "loss": 3.5272, + "step": 71580 + }, + { + "epoch": 4.863772251664629, + "grad_norm": 0.8361061215400696, + "learning_rate": 0.00039224928658785165, + "loss": 3.7356, + "step": 71585 + }, + { + "epoch": 4.86411197173529, + "grad_norm": 0.8000577688217163, + "learning_rate": 0.0003922068215790189, + "loss": 3.3026, + "step": 71590 + }, + { + "epoch": 4.864451691805952, + "grad_norm": 0.7858565449714661, + "learning_rate": 0.00039216435657018616, + "loss": 3.5635, + "step": 71595 + }, + { + "epoch": 4.864791411876614, + "grad_norm": 0.6775853037834167, + "learning_rate": 0.0003921218915613535, + "loss": 3.3149, + "step": 71600 + }, + { + "epoch": 4.865131131947275, + "grad_norm": 0.7447283267974854, + "learning_rate": 0.0003920794265525207, + "loss": 3.5977, + "step": 71605 + }, + { + "epoch": 4.865470852017937, + "grad_norm": 1.0092582702636719, + "learning_rate": 0.000392036961543688, + "loss": 3.3182, + "step": 71610 + }, + { + "epoch": 4.865810572088599, + "grad_norm": 0.9396032094955444, + "learning_rate": 0.0003919944965348553, + "loss": 3.3599, + "step": 71615 + }, + { + "epoch": 4.8661502921592605, + "grad_norm": 1.2044275999069214, + "learning_rate": 0.00039195203152602256, + "loss": 3.4551, + "step": 71620 + }, + { + "epoch": 4.866490012229923, + "grad_norm": 0.846407949924469, + "learning_rate": 0.00039190956651718984, + "loss": 3.6777, + "step": 71625 + }, + { + "epoch": 4.866829732300585, + "grad_norm": 0.9076966047286987, + "learning_rate": 0.0003918671015083571, + "loss": 3.4465, + "step": 71630 + }, + { + "epoch": 4.867169452371246, + "grad_norm": 0.7897721529006958, + "learning_rate": 0.0003918246364995244, + "loss": 3.4282, + "step": 71635 + }, + { + "epoch": 4.867509172441908, + "grad_norm": 0.7562487721443176, + "learning_rate": 0.0003917821714906917, + "loss": 3.5387, + "step": 71640 + }, + { + "epoch": 4.86784889251257, + "grad_norm": 1.0832679271697998, + "learning_rate": 0.00039173970648185896, + "loss": 3.739, + "step": 71645 + }, + { + "epoch": 4.868188612583231, + "grad_norm": 0.7358027100563049, + "learning_rate": 0.0003916972414730262, + "loss": 3.387, + "step": 71650 + }, + { + "epoch": 4.868528332653893, + "grad_norm": 0.8585151433944702, + "learning_rate": 0.0003916547764641935, + "loss": 3.5369, + "step": 71655 + }, + { + "epoch": 4.8688680527245545, + "grad_norm": 1.0067412853240967, + "learning_rate": 0.0003916123114553608, + "loss": 3.3063, + "step": 71660 + }, + { + "epoch": 4.8692077727952165, + "grad_norm": 0.7679489254951477, + "learning_rate": 0.000391569846446528, + "loss": 3.5759, + "step": 71665 + }, + { + "epoch": 4.869547492865879, + "grad_norm": 0.7471771836280823, + "learning_rate": 0.00039152738143769536, + "loss": 3.8085, + "step": 71670 + }, + { + "epoch": 4.86988721293654, + "grad_norm": 0.8263405561447144, + "learning_rate": 0.00039148491642886264, + "loss": 3.5458, + "step": 71675 + }, + { + "epoch": 4.870226933007202, + "grad_norm": 0.9125627875328064, + "learning_rate": 0.00039144245142002987, + "loss": 3.5194, + "step": 71680 + }, + { + "epoch": 4.870566653077864, + "grad_norm": 1.1084539890289307, + "learning_rate": 0.00039139998641119715, + "loss": 3.1894, + "step": 71685 + }, + { + "epoch": 4.870906373148525, + "grad_norm": 0.8582249879837036, + "learning_rate": 0.0003913575214023645, + "loss": 3.4442, + "step": 71690 + }, + { + "epoch": 4.871246093219187, + "grad_norm": 1.0106711387634277, + "learning_rate": 0.00039131505639353176, + "loss": 3.3668, + "step": 71695 + }, + { + "epoch": 4.871585813289849, + "grad_norm": 1.2190543413162231, + "learning_rate": 0.000391272591384699, + "loss": 3.4497, + "step": 71700 + }, + { + "epoch": 4.8719255333605105, + "grad_norm": 0.7255255579948425, + "learning_rate": 0.0003912301263758663, + "loss": 3.3239, + "step": 71705 + }, + { + "epoch": 4.872265253431173, + "grad_norm": 0.923798680305481, + "learning_rate": 0.0003911876613670336, + "loss": 3.0957, + "step": 71710 + }, + { + "epoch": 4.872604973501835, + "grad_norm": 0.9152931571006775, + "learning_rate": 0.00039114519635820083, + "loss": 3.2401, + "step": 71715 + }, + { + "epoch": 4.872944693572496, + "grad_norm": 0.7888710498809814, + "learning_rate": 0.00039110273134936816, + "loss": 3.7321, + "step": 71720 + }, + { + "epoch": 4.873284413643158, + "grad_norm": 7.542074680328369, + "learning_rate": 0.00039106026634053544, + "loss": 3.5092, + "step": 71725 + }, + { + "epoch": 4.87362413371382, + "grad_norm": 0.7895801663398743, + "learning_rate": 0.00039101780133170267, + "loss": 3.6462, + "step": 71730 + }, + { + "epoch": 4.873963853784481, + "grad_norm": 0.8703444600105286, + "learning_rate": 0.00039097533632286995, + "loss": 3.5422, + "step": 71735 + }, + { + "epoch": 4.874303573855143, + "grad_norm": 0.771831750869751, + "learning_rate": 0.0003909328713140373, + "loss": 3.4975, + "step": 71740 + }, + { + "epoch": 4.874643293925805, + "grad_norm": 2.12913179397583, + "learning_rate": 0.0003908904063052045, + "loss": 3.6183, + "step": 71745 + }, + { + "epoch": 4.8749830139964665, + "grad_norm": 0.8164390921592712, + "learning_rate": 0.0003908479412963718, + "loss": 3.4346, + "step": 71750 + }, + { + "epoch": 4.875322734067129, + "grad_norm": 0.9011436700820923, + "learning_rate": 0.0003908054762875391, + "loss": 3.3434, + "step": 71755 + }, + { + "epoch": 4.875662454137791, + "grad_norm": 0.7469122409820557, + "learning_rate": 0.00039076301127870635, + "loss": 3.4448, + "step": 71760 + }, + { + "epoch": 4.876002174208452, + "grad_norm": 0.6908722519874573, + "learning_rate": 0.00039072054626987363, + "loss": 3.6994, + "step": 71765 + }, + { + "epoch": 4.876341894279114, + "grad_norm": 1.0258781909942627, + "learning_rate": 0.0003906780812610409, + "loss": 3.3758, + "step": 71770 + }, + { + "epoch": 4.876681614349776, + "grad_norm": 0.9265233278274536, + "learning_rate": 0.0003906356162522082, + "loss": 3.5897, + "step": 71775 + }, + { + "epoch": 4.877021334420437, + "grad_norm": 0.7283373475074768, + "learning_rate": 0.00039059315124337547, + "loss": 3.6856, + "step": 71780 + }, + { + "epoch": 4.877361054491099, + "grad_norm": 0.7199022173881531, + "learning_rate": 0.00039055068623454275, + "loss": 3.4881, + "step": 71785 + }, + { + "epoch": 4.877700774561761, + "grad_norm": 1.0024833679199219, + "learning_rate": 0.00039050822122571003, + "loss": 3.5375, + "step": 71790 + }, + { + "epoch": 4.8780404946324225, + "grad_norm": 0.8350037932395935, + "learning_rate": 0.0003904657562168773, + "loss": 3.2246, + "step": 71795 + }, + { + "epoch": 4.878380214703085, + "grad_norm": 0.722672164440155, + "learning_rate": 0.0003904232912080446, + "loss": 3.4951, + "step": 71800 + }, + { + "epoch": 4.878719934773747, + "grad_norm": 1.0324609279632568, + "learning_rate": 0.0003903808261992118, + "loss": 3.2946, + "step": 71805 + }, + { + "epoch": 4.879059654844408, + "grad_norm": 0.9949887990951538, + "learning_rate": 0.00039033836119037915, + "loss": 3.6435, + "step": 71810 + }, + { + "epoch": 4.87939937491507, + "grad_norm": 1.737104058265686, + "learning_rate": 0.00039029589618154643, + "loss": 3.3974, + "step": 71815 + }, + { + "epoch": 4.879739094985732, + "grad_norm": 0.9656334519386292, + "learning_rate": 0.00039025343117271365, + "loss": 3.4567, + "step": 71820 + }, + { + "epoch": 4.880078815056393, + "grad_norm": 0.8947311639785767, + "learning_rate": 0.000390210966163881, + "loss": 3.4973, + "step": 71825 + }, + { + "epoch": 4.880418535127055, + "grad_norm": 1.5078785419464111, + "learning_rate": 0.00039016850115504827, + "loss": 3.6141, + "step": 71830 + }, + { + "epoch": 4.880758255197717, + "grad_norm": 0.8987067341804504, + "learning_rate": 0.0003901260361462155, + "loss": 3.6868, + "step": 71835 + }, + { + "epoch": 4.8810979752683785, + "grad_norm": 0.7814134955406189, + "learning_rate": 0.0003900835711373828, + "loss": 3.5008, + "step": 71840 + }, + { + "epoch": 4.881437695339041, + "grad_norm": 0.9610578417778015, + "learning_rate": 0.0003900411061285501, + "loss": 3.5073, + "step": 71845 + }, + { + "epoch": 4.881777415409703, + "grad_norm": 1.20891273021698, + "learning_rate": 0.00038999864111971734, + "loss": 3.4743, + "step": 71850 + }, + { + "epoch": 4.882117135480364, + "grad_norm": 0.7818129062652588, + "learning_rate": 0.0003899561761108846, + "loss": 3.4505, + "step": 71855 + }, + { + "epoch": 4.882456855551026, + "grad_norm": 0.7846366763114929, + "learning_rate": 0.00038991371110205195, + "loss": 3.5997, + "step": 71860 + }, + { + "epoch": 4.882796575621688, + "grad_norm": 0.8860562443733215, + "learning_rate": 0.00038987124609321923, + "loss": 3.7024, + "step": 71865 + }, + { + "epoch": 4.883136295692349, + "grad_norm": 0.7810195088386536, + "learning_rate": 0.00038982878108438646, + "loss": 3.6904, + "step": 71870 + }, + { + "epoch": 4.883476015763011, + "grad_norm": 1.2419137954711914, + "learning_rate": 0.00038978631607555374, + "loss": 3.4962, + "step": 71875 + }, + { + "epoch": 4.883815735833673, + "grad_norm": 0.9972943067550659, + "learning_rate": 0.00038974385106672107, + "loss": 3.6723, + "step": 71880 + }, + { + "epoch": 4.8841554559043345, + "grad_norm": 0.9468991160392761, + "learning_rate": 0.0003897013860578883, + "loss": 3.469, + "step": 71885 + }, + { + "epoch": 4.884495175974997, + "grad_norm": 0.8898295164108276, + "learning_rate": 0.0003896589210490556, + "loss": 3.4321, + "step": 71890 + }, + { + "epoch": 4.884834896045659, + "grad_norm": 0.9151332378387451, + "learning_rate": 0.0003896164560402229, + "loss": 3.4475, + "step": 71895 + }, + { + "epoch": 4.88517461611632, + "grad_norm": 1.0645049810409546, + "learning_rate": 0.00038957399103139014, + "loss": 3.3354, + "step": 71900 + }, + { + "epoch": 4.885514336186982, + "grad_norm": 0.7655720710754395, + "learning_rate": 0.0003895315260225574, + "loss": 3.5479, + "step": 71905 + }, + { + "epoch": 4.885854056257644, + "grad_norm": 0.9879159927368164, + "learning_rate": 0.0003894890610137247, + "loss": 3.364, + "step": 71910 + }, + { + "epoch": 4.886193776328305, + "grad_norm": 0.7424384355545044, + "learning_rate": 0.000389446596004892, + "loss": 3.374, + "step": 71915 + }, + { + "epoch": 4.886533496398967, + "grad_norm": 0.8333654403686523, + "learning_rate": 0.00038940413099605926, + "loss": 3.5015, + "step": 71920 + }, + { + "epoch": 4.886873216469629, + "grad_norm": 0.9502548575401306, + "learning_rate": 0.00038936166598722654, + "loss": 3.3997, + "step": 71925 + }, + { + "epoch": 4.8872129365402905, + "grad_norm": 0.9110161066055298, + "learning_rate": 0.0003893192009783938, + "loss": 3.5531, + "step": 71930 + }, + { + "epoch": 4.887552656610953, + "grad_norm": 0.9182873368263245, + "learning_rate": 0.0003892767359695611, + "loss": 3.4585, + "step": 71935 + }, + { + "epoch": 4.887892376681615, + "grad_norm": 0.7941346764564514, + "learning_rate": 0.0003892342709607284, + "loss": 3.7745, + "step": 71940 + }, + { + "epoch": 4.888232096752276, + "grad_norm": 0.8665228486061096, + "learning_rate": 0.0003891918059518956, + "loss": 3.4898, + "step": 71945 + }, + { + "epoch": 4.888571816822938, + "grad_norm": 0.8560140132904053, + "learning_rate": 0.00038914934094306294, + "loss": 3.5415, + "step": 71950 + }, + { + "epoch": 4.8889115368936, + "grad_norm": 0.9188718199729919, + "learning_rate": 0.0003891068759342302, + "loss": 3.7266, + "step": 71955 + }, + { + "epoch": 4.889251256964261, + "grad_norm": 0.9035201668739319, + "learning_rate": 0.00038906441092539744, + "loss": 3.1157, + "step": 71960 + }, + { + "epoch": 4.889590977034923, + "grad_norm": 0.8998892307281494, + "learning_rate": 0.0003890219459165648, + "loss": 3.3155, + "step": 71965 + }, + { + "epoch": 4.889930697105585, + "grad_norm": 0.9042481184005737, + "learning_rate": 0.00038897948090773206, + "loss": 3.3214, + "step": 71970 + }, + { + "epoch": 4.8902704171762466, + "grad_norm": 0.8381983041763306, + "learning_rate": 0.0003889370158988993, + "loss": 3.5819, + "step": 71975 + }, + { + "epoch": 4.890610137246909, + "grad_norm": 1.075136423110962, + "learning_rate": 0.00038889455089006656, + "loss": 3.5005, + "step": 71980 + }, + { + "epoch": 4.890949857317571, + "grad_norm": 0.8215078711509705, + "learning_rate": 0.0003888520858812339, + "loss": 3.7118, + "step": 71985 + }, + { + "epoch": 4.891289577388232, + "grad_norm": 0.9779772162437439, + "learning_rate": 0.0003888096208724011, + "loss": 3.608, + "step": 71990 + }, + { + "epoch": 4.891629297458894, + "grad_norm": 0.9269939661026001, + "learning_rate": 0.0003887671558635684, + "loss": 3.6556, + "step": 71995 + }, + { + "epoch": 4.891969017529556, + "grad_norm": 0.7972735166549683, + "learning_rate": 0.00038872469085473574, + "loss": 3.5449, + "step": 72000 + }, + { + "epoch": 4.892308737600217, + "grad_norm": 0.8890233039855957, + "learning_rate": 0.00038868222584590296, + "loss": 3.5073, + "step": 72005 + }, + { + "epoch": 4.892648457670879, + "grad_norm": 1.0043126344680786, + "learning_rate": 0.00038863976083707024, + "loss": 3.2737, + "step": 72010 + }, + { + "epoch": 4.892988177741541, + "grad_norm": 1.182923674583435, + "learning_rate": 0.0003885972958282376, + "loss": 3.6959, + "step": 72015 + }, + { + "epoch": 4.893327897812203, + "grad_norm": 1.0786304473876953, + "learning_rate": 0.0003885548308194048, + "loss": 3.6218, + "step": 72020 + }, + { + "epoch": 4.893667617882865, + "grad_norm": 0.8181334733963013, + "learning_rate": 0.0003885123658105721, + "loss": 3.4688, + "step": 72025 + }, + { + "epoch": 4.894007337953527, + "grad_norm": 0.9745379686355591, + "learning_rate": 0.00038846990080173936, + "loss": 3.7262, + "step": 72030 + }, + { + "epoch": 4.894347058024188, + "grad_norm": 0.7434333562850952, + "learning_rate": 0.0003884274357929067, + "loss": 3.1519, + "step": 72035 + }, + { + "epoch": 4.89468677809485, + "grad_norm": 1.0815131664276123, + "learning_rate": 0.0003883849707840739, + "loss": 3.7665, + "step": 72040 + }, + { + "epoch": 4.895026498165512, + "grad_norm": 1.7085148096084595, + "learning_rate": 0.0003883425057752412, + "loss": 3.5492, + "step": 72045 + }, + { + "epoch": 4.895366218236173, + "grad_norm": 0.8909322023391724, + "learning_rate": 0.00038830004076640854, + "loss": 3.542, + "step": 72050 + }, + { + "epoch": 4.895705938306835, + "grad_norm": 0.8655850291252136, + "learning_rate": 0.00038825757575757576, + "loss": 3.6096, + "step": 72055 + }, + { + "epoch": 4.896045658377497, + "grad_norm": 1.2099684476852417, + "learning_rate": 0.00038821511074874304, + "loss": 3.4775, + "step": 72060 + }, + { + "epoch": 4.896385378448159, + "grad_norm": 0.8427212834358215, + "learning_rate": 0.0003881726457399103, + "loss": 3.5552, + "step": 72065 + }, + { + "epoch": 4.896725098518821, + "grad_norm": 0.99402916431427, + "learning_rate": 0.0003881301807310776, + "loss": 3.5348, + "step": 72070 + }, + { + "epoch": 4.897064818589483, + "grad_norm": 0.7190234661102295, + "learning_rate": 0.0003880877157222449, + "loss": 3.5272, + "step": 72075 + }, + { + "epoch": 4.897404538660144, + "grad_norm": 0.9836555123329163, + "learning_rate": 0.00038804525071341216, + "loss": 3.4836, + "step": 72080 + }, + { + "epoch": 4.897744258730806, + "grad_norm": 0.8361469507217407, + "learning_rate": 0.00038800278570457944, + "loss": 3.6603, + "step": 72085 + }, + { + "epoch": 4.898083978801468, + "grad_norm": 1.0264861583709717, + "learning_rate": 0.0003879603206957467, + "loss": 3.6288, + "step": 72090 + }, + { + "epoch": 4.898423698872129, + "grad_norm": 1.0591360330581665, + "learning_rate": 0.000387917855686914, + "loss": 3.4018, + "step": 72095 + }, + { + "epoch": 4.898763418942791, + "grad_norm": 0.8438334465026855, + "learning_rate": 0.00038787539067808123, + "loss": 3.4538, + "step": 72100 + }, + { + "epoch": 4.899103139013453, + "grad_norm": 0.937085747718811, + "learning_rate": 0.00038783292566924856, + "loss": 3.6464, + "step": 72105 + }, + { + "epoch": 4.899442859084115, + "grad_norm": 1.1227306127548218, + "learning_rate": 0.00038779046066041584, + "loss": 3.6236, + "step": 72110 + }, + { + "epoch": 4.899782579154777, + "grad_norm": 0.929801344871521, + "learning_rate": 0.00038774799565158307, + "loss": 3.1102, + "step": 72115 + }, + { + "epoch": 4.900122299225439, + "grad_norm": 0.8780380487442017, + "learning_rate": 0.0003877055306427504, + "loss": 3.4621, + "step": 72120 + }, + { + "epoch": 4.9004620192961, + "grad_norm": 0.7343317270278931, + "learning_rate": 0.0003876630656339177, + "loss": 3.3289, + "step": 72125 + }, + { + "epoch": 4.900801739366762, + "grad_norm": 0.9572729468345642, + "learning_rate": 0.0003876206006250849, + "loss": 3.6754, + "step": 72130 + }, + { + "epoch": 4.901141459437423, + "grad_norm": 0.7457481622695923, + "learning_rate": 0.0003875781356162522, + "loss": 3.4987, + "step": 72135 + }, + { + "epoch": 4.901481179508085, + "grad_norm": 1.2815136909484863, + "learning_rate": 0.0003875356706074195, + "loss": 3.3631, + "step": 72140 + }, + { + "epoch": 4.901820899578747, + "grad_norm": 0.9035515189170837, + "learning_rate": 0.00038749320559858675, + "loss": 3.5255, + "step": 72145 + }, + { + "epoch": 4.9021606196494085, + "grad_norm": 0.8505612015724182, + "learning_rate": 0.00038745074058975403, + "loss": 3.7001, + "step": 72150 + }, + { + "epoch": 4.902500339720071, + "grad_norm": 0.9595489501953125, + "learning_rate": 0.00038740827558092137, + "loss": 3.3757, + "step": 72155 + }, + { + "epoch": 4.902840059790733, + "grad_norm": 0.7919760942459106, + "learning_rate": 0.0003873658105720886, + "loss": 3.516, + "step": 72160 + }, + { + "epoch": 4.903179779861394, + "grad_norm": 0.8245599269866943, + "learning_rate": 0.00038732334556325587, + "loss": 3.4039, + "step": 72165 + }, + { + "epoch": 4.903519499932056, + "grad_norm": 1.0217649936676025, + "learning_rate": 0.00038728088055442315, + "loss": 3.467, + "step": 72170 + }, + { + "epoch": 4.903859220002718, + "grad_norm": 0.8319564461708069, + "learning_rate": 0.00038723841554559043, + "loss": 3.5086, + "step": 72175 + }, + { + "epoch": 4.904198940073379, + "grad_norm": 0.7178353667259216, + "learning_rate": 0.0003871959505367577, + "loss": 3.5914, + "step": 72180 + }, + { + "epoch": 4.904538660144041, + "grad_norm": 0.8826897740364075, + "learning_rate": 0.000387153485527925, + "loss": 3.6292, + "step": 72185 + }, + { + "epoch": 4.904878380214703, + "grad_norm": 1.071850299835205, + "learning_rate": 0.00038711102051909227, + "loss": 3.7807, + "step": 72190 + }, + { + "epoch": 4.9052181002853645, + "grad_norm": 0.9524065256118774, + "learning_rate": 0.00038706855551025955, + "loss": 3.4402, + "step": 72195 + }, + { + "epoch": 4.905557820356027, + "grad_norm": 0.9583002924919128, + "learning_rate": 0.00038702609050142683, + "loss": 3.519, + "step": 72200 + }, + { + "epoch": 4.905897540426689, + "grad_norm": 0.8821239471435547, + "learning_rate": 0.0003869836254925941, + "loss": 3.4864, + "step": 72205 + }, + { + "epoch": 4.90623726049735, + "grad_norm": 1.650545358657837, + "learning_rate": 0.0003869411604837614, + "loss": 3.5942, + "step": 72210 + }, + { + "epoch": 4.906576980568012, + "grad_norm": 0.6966699361801147, + "learning_rate": 0.00038689869547492867, + "loss": 3.4357, + "step": 72215 + }, + { + "epoch": 4.906916700638674, + "grad_norm": 1.020283579826355, + "learning_rate": 0.00038685623046609595, + "loss": 3.3769, + "step": 72220 + }, + { + "epoch": 4.907256420709335, + "grad_norm": 0.9300582408905029, + "learning_rate": 0.00038681376545726323, + "loss": 3.4875, + "step": 72225 + }, + { + "epoch": 4.907596140779997, + "grad_norm": 1.0021885633468628, + "learning_rate": 0.0003867713004484305, + "loss": 3.2438, + "step": 72230 + }, + { + "epoch": 4.907935860850659, + "grad_norm": 0.810500979423523, + "learning_rate": 0.0003867288354395978, + "loss": 3.2897, + "step": 72235 + }, + { + "epoch": 4.9082755809213205, + "grad_norm": 0.8771494030952454, + "learning_rate": 0.000386686370430765, + "loss": 3.6044, + "step": 72240 + }, + { + "epoch": 4.908615300991983, + "grad_norm": 0.903542160987854, + "learning_rate": 0.00038664390542193235, + "loss": 3.6281, + "step": 72245 + }, + { + "epoch": 4.908955021062645, + "grad_norm": 0.9107587933540344, + "learning_rate": 0.00038660144041309963, + "loss": 3.3875, + "step": 72250 + }, + { + "epoch": 4.909294741133306, + "grad_norm": 1.2749990224838257, + "learning_rate": 0.00038655897540426686, + "loss": 3.4948, + "step": 72255 + }, + { + "epoch": 4.909634461203968, + "grad_norm": 0.959922194480896, + "learning_rate": 0.0003865165103954342, + "loss": 3.6322, + "step": 72260 + }, + { + "epoch": 4.90997418127463, + "grad_norm": 0.8880477547645569, + "learning_rate": 0.0003864740453866015, + "loss": 3.4912, + "step": 72265 + }, + { + "epoch": 4.910313901345291, + "grad_norm": 0.9978618025779724, + "learning_rate": 0.0003864315803777687, + "loss": 3.2866, + "step": 72270 + }, + { + "epoch": 4.910653621415953, + "grad_norm": 1.2684142589569092, + "learning_rate": 0.00038638911536893603, + "loss": 3.4616, + "step": 72275 + }, + { + "epoch": 4.910993341486615, + "grad_norm": 0.8254991769790649, + "learning_rate": 0.0003863466503601033, + "loss": 3.395, + "step": 72280 + }, + { + "epoch": 4.911333061557277, + "grad_norm": 1.0260909795761108, + "learning_rate": 0.00038630418535127054, + "loss": 3.467, + "step": 72285 + }, + { + "epoch": 4.911672781627939, + "grad_norm": 0.7375059723854065, + "learning_rate": 0.0003862617203424378, + "loss": 3.0623, + "step": 72290 + }, + { + "epoch": 4.912012501698601, + "grad_norm": 0.9302277565002441, + "learning_rate": 0.00038621925533360515, + "loss": 3.1987, + "step": 72295 + }, + { + "epoch": 4.912352221769262, + "grad_norm": 0.7349042296409607, + "learning_rate": 0.0003861767903247724, + "loss": 3.3461, + "step": 72300 + }, + { + "epoch": 4.912691941839924, + "grad_norm": 0.7494734525680542, + "learning_rate": 0.00038613432531593966, + "loss": 3.6637, + "step": 72305 + }, + { + "epoch": 4.913031661910586, + "grad_norm": 0.9692125916481018, + "learning_rate": 0.000386091860307107, + "loss": 3.4952, + "step": 72310 + }, + { + "epoch": 4.913371381981247, + "grad_norm": 1.1955313682556152, + "learning_rate": 0.0003860493952982742, + "loss": 3.375, + "step": 72315 + }, + { + "epoch": 4.913711102051909, + "grad_norm": 0.850462019443512, + "learning_rate": 0.0003860069302894415, + "loss": 3.5269, + "step": 72320 + }, + { + "epoch": 4.914050822122571, + "grad_norm": 0.9245151877403259, + "learning_rate": 0.0003859644652806088, + "loss": 3.5452, + "step": 72325 + }, + { + "epoch": 4.914390542193233, + "grad_norm": 0.8447486758232117, + "learning_rate": 0.00038592200027177606, + "loss": 3.2361, + "step": 72330 + }, + { + "epoch": 4.914730262263895, + "grad_norm": 1.0529649257659912, + "learning_rate": 0.00038587953526294334, + "loss": 3.4632, + "step": 72335 + }, + { + "epoch": 4.915069982334556, + "grad_norm": 0.8441202044487, + "learning_rate": 0.0003858370702541106, + "loss": 3.5498, + "step": 72340 + }, + { + "epoch": 4.915409702405218, + "grad_norm": 1.1533490419387817, + "learning_rate": 0.0003857946052452779, + "loss": 3.5131, + "step": 72345 + }, + { + "epoch": 4.91574942247588, + "grad_norm": 1.1208429336547852, + "learning_rate": 0.0003857521402364452, + "loss": 3.4475, + "step": 72350 + }, + { + "epoch": 4.916089142546541, + "grad_norm": 0.7976725697517395, + "learning_rate": 0.00038570967522761246, + "loss": 3.5278, + "step": 72355 + }, + { + "epoch": 4.916428862617203, + "grad_norm": 0.8748093843460083, + "learning_rate": 0.0003856672102187797, + "loss": 3.4622, + "step": 72360 + }, + { + "epoch": 4.916768582687865, + "grad_norm": 0.874478280544281, + "learning_rate": 0.000385624745209947, + "loss": 3.5412, + "step": 72365 + }, + { + "epoch": 4.9171083027585265, + "grad_norm": 0.8676944375038147, + "learning_rate": 0.0003855822802011143, + "loss": 3.3899, + "step": 72370 + }, + { + "epoch": 4.917448022829189, + "grad_norm": 0.7888920903205872, + "learning_rate": 0.0003855398151922816, + "loss": 3.5877, + "step": 72375 + }, + { + "epoch": 4.917787742899851, + "grad_norm": 1.1511616706848145, + "learning_rate": 0.00038549735018344886, + "loss": 3.4899, + "step": 72380 + }, + { + "epoch": 4.918127462970512, + "grad_norm": 1.7583162784576416, + "learning_rate": 0.00038545488517461614, + "loss": 3.4269, + "step": 72385 + }, + { + "epoch": 4.918467183041174, + "grad_norm": 0.9110392928123474, + "learning_rate": 0.0003854124201657834, + "loss": 3.5544, + "step": 72390 + }, + { + "epoch": 4.918806903111836, + "grad_norm": 1.0830185413360596, + "learning_rate": 0.00038536995515695065, + "loss": 3.5099, + "step": 72395 + }, + { + "epoch": 4.919146623182497, + "grad_norm": 0.7331733107566833, + "learning_rate": 0.000385327490148118, + "loss": 3.5913, + "step": 72400 + }, + { + "epoch": 4.919486343253159, + "grad_norm": 1.2669568061828613, + "learning_rate": 0.00038528502513928526, + "loss": 3.3574, + "step": 72405 + }, + { + "epoch": 4.919826063323821, + "grad_norm": 1.0812487602233887, + "learning_rate": 0.0003852425601304525, + "loss": 3.5169, + "step": 72410 + }, + { + "epoch": 4.9201657833944825, + "grad_norm": 3.6566271781921387, + "learning_rate": 0.0003852000951216198, + "loss": 3.4467, + "step": 72415 + }, + { + "epoch": 4.920505503465145, + "grad_norm": 0.7579536437988281, + "learning_rate": 0.0003851576301127871, + "loss": 3.4467, + "step": 72420 + }, + { + "epoch": 4.920845223535807, + "grad_norm": 1.003059983253479, + "learning_rate": 0.0003851151651039543, + "loss": 3.2586, + "step": 72425 + }, + { + "epoch": 4.921184943606468, + "grad_norm": 1.0004786252975464, + "learning_rate": 0.0003850727000951216, + "loss": 3.3623, + "step": 72430 + }, + { + "epoch": 4.92152466367713, + "grad_norm": 0.7935636043548584, + "learning_rate": 0.00038503023508628894, + "loss": 3.2634, + "step": 72435 + }, + { + "epoch": 4.921864383747792, + "grad_norm": 0.854377031326294, + "learning_rate": 0.00038498777007745617, + "loss": 3.4699, + "step": 72440 + }, + { + "epoch": 4.922204103818453, + "grad_norm": 1.6159389019012451, + "learning_rate": 0.00038494530506862345, + "loss": 3.5778, + "step": 72445 + }, + { + "epoch": 4.922543823889115, + "grad_norm": 0.9797263145446777, + "learning_rate": 0.0003849028400597908, + "loss": 3.2481, + "step": 72450 + }, + { + "epoch": 4.922883543959777, + "grad_norm": 1.0452929735183716, + "learning_rate": 0.000384860375050958, + "loss": 3.4738, + "step": 72455 + }, + { + "epoch": 4.9232232640304385, + "grad_norm": 0.6572943329811096, + "learning_rate": 0.0003848179100421253, + "loss": 3.5115, + "step": 72460 + }, + { + "epoch": 4.923562984101101, + "grad_norm": 0.8405246734619141, + "learning_rate": 0.00038477544503329257, + "loss": 3.5012, + "step": 72465 + }, + { + "epoch": 4.923902704171763, + "grad_norm": 0.6848536133766174, + "learning_rate": 0.00038473298002445985, + "loss": 3.6651, + "step": 72470 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.9563995003700256, + "learning_rate": 0.00038469051501562713, + "loss": 3.6374, + "step": 72475 + }, + { + "epoch": 4.924582144313086, + "grad_norm": 0.6934095621109009, + "learning_rate": 0.0003846480500067944, + "loss": 3.3282, + "step": 72480 + }, + { + "epoch": 4.924921864383748, + "grad_norm": 1.3187470436096191, + "learning_rate": 0.0003846055849979617, + "loss": 3.3913, + "step": 72485 + }, + { + "epoch": 4.925261584454409, + "grad_norm": 0.9115572571754456, + "learning_rate": 0.00038456311998912897, + "loss": 3.332, + "step": 72490 + }, + { + "epoch": 4.925601304525071, + "grad_norm": 1.207271933555603, + "learning_rate": 0.00038452065498029625, + "loss": 3.349, + "step": 72495 + }, + { + "epoch": 4.925941024595733, + "grad_norm": 0.8519386649131775, + "learning_rate": 0.0003844781899714635, + "loss": 3.4024, + "step": 72500 + }, + { + "epoch": 4.9262807446663945, + "grad_norm": 1.124508261680603, + "learning_rate": 0.0003844357249626308, + "loss": 3.3839, + "step": 72505 + }, + { + "epoch": 4.926620464737057, + "grad_norm": 1.0344734191894531, + "learning_rate": 0.0003843932599537981, + "loss": 3.6792, + "step": 72510 + }, + { + "epoch": 4.926960184807719, + "grad_norm": 0.9427729845046997, + "learning_rate": 0.0003843507949449653, + "loss": 3.7843, + "step": 72515 + }, + { + "epoch": 4.92729990487838, + "grad_norm": 0.8796576857566833, + "learning_rate": 0.00038430832993613265, + "loss": 3.5801, + "step": 72520 + }, + { + "epoch": 4.927639624949042, + "grad_norm": 0.9738473892211914, + "learning_rate": 0.00038426586492729993, + "loss": 3.4272, + "step": 72525 + }, + { + "epoch": 4.927979345019704, + "grad_norm": 0.820087194442749, + "learning_rate": 0.00038422339991846715, + "loss": 3.278, + "step": 72530 + }, + { + "epoch": 4.928319065090365, + "grad_norm": 0.7423975467681885, + "learning_rate": 0.00038418093490963443, + "loss": 3.3874, + "step": 72535 + }, + { + "epoch": 4.928658785161027, + "grad_norm": 0.7717877626419067, + "learning_rate": 0.00038413846990080177, + "loss": 3.7349, + "step": 72540 + }, + { + "epoch": 4.928998505231689, + "grad_norm": 1.0511821508407593, + "learning_rate": 0.00038409600489196905, + "loss": 3.3543, + "step": 72545 + }, + { + "epoch": 4.9293382253023506, + "grad_norm": 1.0428640842437744, + "learning_rate": 0.0003840535398831363, + "loss": 3.455, + "step": 72550 + }, + { + "epoch": 4.929677945373013, + "grad_norm": 0.7440029978752136, + "learning_rate": 0.0003840110748743036, + "loss": 3.5437, + "step": 72555 + }, + { + "epoch": 4.930017665443675, + "grad_norm": 0.7974912524223328, + "learning_rate": 0.0003839686098654709, + "loss": 3.4152, + "step": 72560 + }, + { + "epoch": 4.930357385514336, + "grad_norm": 1.000136137008667, + "learning_rate": 0.0003839261448566381, + "loss": 3.5191, + "step": 72565 + }, + { + "epoch": 4.930697105584998, + "grad_norm": 0.9665020108222961, + "learning_rate": 0.00038388367984780545, + "loss": 3.5109, + "step": 72570 + }, + { + "epoch": 4.93103682565566, + "grad_norm": 1.1322526931762695, + "learning_rate": 0.00038384121483897273, + "loss": 3.6932, + "step": 72575 + }, + { + "epoch": 4.931376545726321, + "grad_norm": 0.9334116578102112, + "learning_rate": 0.00038379874983013996, + "loss": 3.5653, + "step": 72580 + }, + { + "epoch": 4.931716265796983, + "grad_norm": 0.9704157710075378, + "learning_rate": 0.00038375628482130724, + "loss": 3.5451, + "step": 72585 + }, + { + "epoch": 4.932055985867645, + "grad_norm": 1.001556158065796, + "learning_rate": 0.00038371381981247457, + "loss": 3.4303, + "step": 72590 + }, + { + "epoch": 4.932395705938307, + "grad_norm": 0.8553072810173035, + "learning_rate": 0.0003836713548036418, + "loss": 3.6261, + "step": 72595 + }, + { + "epoch": 4.932735426008969, + "grad_norm": 1.017979621887207, + "learning_rate": 0.0003836288897948091, + "loss": 3.5795, + "step": 72600 + }, + { + "epoch": 4.933075146079631, + "grad_norm": 0.7722117900848389, + "learning_rate": 0.0003835864247859764, + "loss": 3.7922, + "step": 72605 + }, + { + "epoch": 4.933414866150292, + "grad_norm": 0.8845933675765991, + "learning_rate": 0.00038354395977714364, + "loss": 3.6063, + "step": 72610 + }, + { + "epoch": 4.933754586220954, + "grad_norm": 0.9131596684455872, + "learning_rate": 0.0003835014947683109, + "loss": 3.4292, + "step": 72615 + }, + { + "epoch": 4.934094306291616, + "grad_norm": 0.7675774097442627, + "learning_rate": 0.0003834590297594782, + "loss": 3.7118, + "step": 72620 + }, + { + "epoch": 4.934434026362277, + "grad_norm": 0.7413609027862549, + "learning_rate": 0.0003834165647506455, + "loss": 3.1166, + "step": 72625 + }, + { + "epoch": 4.934773746432939, + "grad_norm": 0.9664254188537598, + "learning_rate": 0.00038337409974181276, + "loss": 3.5698, + "step": 72630 + }, + { + "epoch": 4.935113466503601, + "grad_norm": 0.8135935068130493, + "learning_rate": 0.00038333163473298004, + "loss": 3.1499, + "step": 72635 + }, + { + "epoch": 4.935453186574263, + "grad_norm": 0.8166611790657043, + "learning_rate": 0.0003832891697241473, + "loss": 3.4697, + "step": 72640 + }, + { + "epoch": 4.935792906644925, + "grad_norm": 0.9220448732376099, + "learning_rate": 0.0003832467047153146, + "loss": 3.4521, + "step": 72645 + }, + { + "epoch": 4.936132626715587, + "grad_norm": 0.9558717012405396, + "learning_rate": 0.0003832042397064819, + "loss": 3.4659, + "step": 72650 + }, + { + "epoch": 4.936472346786248, + "grad_norm": 0.8334888815879822, + "learning_rate": 0.0003831617746976491, + "loss": 3.3027, + "step": 72655 + }, + { + "epoch": 4.93681206685691, + "grad_norm": 0.8469164967536926, + "learning_rate": 0.00038311930968881644, + "loss": 3.63, + "step": 72660 + }, + { + "epoch": 4.937151786927572, + "grad_norm": 0.9805964231491089, + "learning_rate": 0.0003830768446799837, + "loss": 3.5173, + "step": 72665 + }, + { + "epoch": 4.937491506998233, + "grad_norm": 0.7419153451919556, + "learning_rate": 0.00038303437967115094, + "loss": 3.3796, + "step": 72670 + }, + { + "epoch": 4.937831227068895, + "grad_norm": 0.9005613923072815, + "learning_rate": 0.0003829919146623183, + "loss": 3.4179, + "step": 72675 + }, + { + "epoch": 4.938170947139557, + "grad_norm": 0.6911476850509644, + "learning_rate": 0.00038294944965348556, + "loss": 3.3704, + "step": 72680 + }, + { + "epoch": 4.938510667210219, + "grad_norm": 0.7690288424491882, + "learning_rate": 0.0003829069846446528, + "loss": 3.2209, + "step": 72685 + }, + { + "epoch": 4.938850387280881, + "grad_norm": 0.8175159692764282, + "learning_rate": 0.00038286451963582006, + "loss": 3.4632, + "step": 72690 + }, + { + "epoch": 4.939190107351543, + "grad_norm": 1.0454809665679932, + "learning_rate": 0.0003828220546269874, + "loss": 3.1991, + "step": 72695 + }, + { + "epoch": 4.939529827422204, + "grad_norm": 0.9375357031822205, + "learning_rate": 0.0003827795896181546, + "loss": 3.5183, + "step": 72700 + }, + { + "epoch": 4.939869547492866, + "grad_norm": 1.1896196603775024, + "learning_rate": 0.0003827371246093219, + "loss": 3.4359, + "step": 72705 + }, + { + "epoch": 4.940209267563528, + "grad_norm": 0.9040124416351318, + "learning_rate": 0.00038269465960048924, + "loss": 3.2681, + "step": 72710 + }, + { + "epoch": 4.940548987634189, + "grad_norm": 0.8235114812850952, + "learning_rate": 0.0003826521945916565, + "loss": 3.503, + "step": 72715 + }, + { + "epoch": 4.940888707704851, + "grad_norm": 0.9028295278549194, + "learning_rate": 0.00038260972958282374, + "loss": 3.595, + "step": 72720 + }, + { + "epoch": 4.941228427775513, + "grad_norm": 0.7822396755218506, + "learning_rate": 0.000382567264573991, + "loss": 3.5893, + "step": 72725 + }, + { + "epoch": 4.941568147846175, + "grad_norm": 0.9134349822998047, + "learning_rate": 0.00038252479956515836, + "loss": 3.2941, + "step": 72730 + }, + { + "epoch": 4.941907867916837, + "grad_norm": 0.8677173852920532, + "learning_rate": 0.0003824823345563256, + "loss": 3.4234, + "step": 72735 + }, + { + "epoch": 4.942247587987499, + "grad_norm": 1.32822585105896, + "learning_rate": 0.00038243986954749286, + "loss": 3.6051, + "step": 72740 + }, + { + "epoch": 4.94258730805816, + "grad_norm": 0.968186616897583, + "learning_rate": 0.0003823974045386602, + "loss": 3.3858, + "step": 72745 + }, + { + "epoch": 4.942927028128822, + "grad_norm": 0.6857550144195557, + "learning_rate": 0.0003823549395298274, + "loss": 3.3158, + "step": 72750 + }, + { + "epoch": 4.943266748199484, + "grad_norm": 1.1124213933944702, + "learning_rate": 0.0003823124745209947, + "loss": 3.5817, + "step": 72755 + }, + { + "epoch": 4.943606468270145, + "grad_norm": 0.9625373482704163, + "learning_rate": 0.000382270009512162, + "loss": 3.4445, + "step": 72760 + }, + { + "epoch": 4.943946188340807, + "grad_norm": 0.8662756681442261, + "learning_rate": 0.00038222754450332926, + "loss": 3.6254, + "step": 72765 + }, + { + "epoch": 4.944285908411469, + "grad_norm": 0.9250845313072205, + "learning_rate": 0.00038218507949449654, + "loss": 3.4, + "step": 72770 + }, + { + "epoch": 4.944625628482131, + "grad_norm": 0.870151937007904, + "learning_rate": 0.0003821426144856638, + "loss": 3.5803, + "step": 72775 + }, + { + "epoch": 4.944965348552793, + "grad_norm": 0.8889309763908386, + "learning_rate": 0.0003821001494768311, + "loss": 3.7393, + "step": 72780 + }, + { + "epoch": 4.945305068623455, + "grad_norm": 0.8762021064758301, + "learning_rate": 0.0003820576844679984, + "loss": 3.2555, + "step": 72785 + }, + { + "epoch": 4.945644788694116, + "grad_norm": 1.034146785736084, + "learning_rate": 0.00038201521945916566, + "loss": 3.6384, + "step": 72790 + }, + { + "epoch": 4.945984508764778, + "grad_norm": 0.699321448802948, + "learning_rate": 0.0003819727544503329, + "loss": 3.5025, + "step": 72795 + }, + { + "epoch": 4.94632422883544, + "grad_norm": 1.0628716945648193, + "learning_rate": 0.0003819302894415002, + "loss": 3.7025, + "step": 72800 + }, + { + "epoch": 4.946663948906101, + "grad_norm": 0.8917919397354126, + "learning_rate": 0.0003818878244326675, + "loss": 3.36, + "step": 72805 + }, + { + "epoch": 4.947003668976763, + "grad_norm": 0.9161941409111023, + "learning_rate": 0.00038184535942383473, + "loss": 3.3529, + "step": 72810 + }, + { + "epoch": 4.9473433890474245, + "grad_norm": 0.8078793883323669, + "learning_rate": 0.00038180289441500206, + "loss": 3.5771, + "step": 72815 + }, + { + "epoch": 4.947683109118087, + "grad_norm": 0.7155930995941162, + "learning_rate": 0.00038176042940616934, + "loss": 3.136, + "step": 72820 + }, + { + "epoch": 4.948022829188749, + "grad_norm": 0.865337610244751, + "learning_rate": 0.00038171796439733657, + "loss": 3.3826, + "step": 72825 + }, + { + "epoch": 4.94836254925941, + "grad_norm": 1.1696631908416748, + "learning_rate": 0.00038167549938850385, + "loss": 3.3613, + "step": 72830 + }, + { + "epoch": 4.948702269330072, + "grad_norm": 1.1409913301467896, + "learning_rate": 0.0003816330343796712, + "loss": 3.6192, + "step": 72835 + }, + { + "epoch": 4.949041989400734, + "grad_norm": 1.0848907232284546, + "learning_rate": 0.0003815905693708384, + "loss": 3.5694, + "step": 72840 + }, + { + "epoch": 4.949381709471395, + "grad_norm": 0.9731229543685913, + "learning_rate": 0.0003815481043620057, + "loss": 3.4651, + "step": 72845 + }, + { + "epoch": 4.949721429542057, + "grad_norm": 0.8717184066772461, + "learning_rate": 0.000381505639353173, + "loss": 3.5657, + "step": 72850 + }, + { + "epoch": 4.950061149612719, + "grad_norm": 0.8764669299125671, + "learning_rate": 0.00038146317434434025, + "loss": 3.0773, + "step": 72855 + }, + { + "epoch": 4.9504008696833806, + "grad_norm": 0.8264338374137878, + "learning_rate": 0.00038142070933550753, + "loss": 3.538, + "step": 72860 + }, + { + "epoch": 4.950740589754043, + "grad_norm": 0.9257292747497559, + "learning_rate": 0.00038137824432667487, + "loss": 3.6684, + "step": 72865 + }, + { + "epoch": 4.951080309824705, + "grad_norm": 0.8162530660629272, + "learning_rate": 0.0003813357793178421, + "loss": 3.6401, + "step": 72870 + }, + { + "epoch": 4.951420029895366, + "grad_norm": 1.1220721006393433, + "learning_rate": 0.00038129331430900937, + "loss": 3.5668, + "step": 72875 + }, + { + "epoch": 4.951759749966028, + "grad_norm": 0.7995182871818542, + "learning_rate": 0.00038125084930017665, + "loss": 3.4374, + "step": 72880 + }, + { + "epoch": 4.95209947003669, + "grad_norm": 0.812667727470398, + "learning_rate": 0.000381208384291344, + "loss": 3.5779, + "step": 72885 + }, + { + "epoch": 4.952439190107351, + "grad_norm": 0.8270485997200012, + "learning_rate": 0.0003811659192825112, + "loss": 3.4452, + "step": 72890 + }, + { + "epoch": 4.952778910178013, + "grad_norm": 0.7757332921028137, + "learning_rate": 0.0003811234542736785, + "loss": 3.596, + "step": 72895 + }, + { + "epoch": 4.953118630248675, + "grad_norm": 0.9907073974609375, + "learning_rate": 0.0003810809892648458, + "loss": 3.2804, + "step": 72900 + }, + { + "epoch": 4.953458350319337, + "grad_norm": 0.9051772356033325, + "learning_rate": 0.00038103852425601305, + "loss": 3.2993, + "step": 72905 + }, + { + "epoch": 4.953798070389999, + "grad_norm": 0.8264038562774658, + "learning_rate": 0.00038099605924718033, + "loss": 3.2598, + "step": 72910 + }, + { + "epoch": 4.954137790460661, + "grad_norm": 0.7962325215339661, + "learning_rate": 0.0003809535942383476, + "loss": 3.6567, + "step": 72915 + }, + { + "epoch": 4.954477510531322, + "grad_norm": 2.271636724472046, + "learning_rate": 0.0003809111292295149, + "loss": 3.5567, + "step": 72920 + }, + { + "epoch": 4.954817230601984, + "grad_norm": 0.9740408658981323, + "learning_rate": 0.00038086866422068217, + "loss": 3.595, + "step": 72925 + }, + { + "epoch": 4.955156950672646, + "grad_norm": 0.8294957876205444, + "learning_rate": 0.00038082619921184945, + "loss": 3.5551, + "step": 72930 + }, + { + "epoch": 4.955496670743307, + "grad_norm": 1.0682748556137085, + "learning_rate": 0.00038078373420301673, + "loss": 3.3595, + "step": 72935 + }, + { + "epoch": 4.955836390813969, + "grad_norm": 0.9658530354499817, + "learning_rate": 0.000380741269194184, + "loss": 3.697, + "step": 72940 + }, + { + "epoch": 4.956176110884631, + "grad_norm": 1.1217141151428223, + "learning_rate": 0.0003806988041853513, + "loss": 3.3308, + "step": 72945 + }, + { + "epoch": 4.956515830955293, + "grad_norm": 1.3348183631896973, + "learning_rate": 0.0003806563391765185, + "loss": 3.5389, + "step": 72950 + }, + { + "epoch": 4.956855551025955, + "grad_norm": 0.9575929045677185, + "learning_rate": 0.00038061387416768585, + "loss": 3.3615, + "step": 72955 + }, + { + "epoch": 4.957195271096617, + "grad_norm": 0.7427817583084106, + "learning_rate": 0.00038057140915885313, + "loss": 3.6188, + "step": 72960 + }, + { + "epoch": 4.957534991167278, + "grad_norm": 1.1634316444396973, + "learning_rate": 0.00038052894415002036, + "loss": 3.6763, + "step": 72965 + }, + { + "epoch": 4.95787471123794, + "grad_norm": 0.8442378044128418, + "learning_rate": 0.0003804864791411877, + "loss": 3.5869, + "step": 72970 + }, + { + "epoch": 4.958214431308602, + "grad_norm": 0.756823718547821, + "learning_rate": 0.000380444014132355, + "loss": 3.6683, + "step": 72975 + }, + { + "epoch": 4.958554151379263, + "grad_norm": 0.8192638158798218, + "learning_rate": 0.0003804015491235222, + "loss": 3.5285, + "step": 72980 + }, + { + "epoch": 4.958893871449925, + "grad_norm": 0.8839911222457886, + "learning_rate": 0.0003803590841146895, + "loss": 3.5194, + "step": 72985 + }, + { + "epoch": 4.959233591520587, + "grad_norm": 1.027857780456543, + "learning_rate": 0.0003803166191058568, + "loss": 3.3389, + "step": 72990 + }, + { + "epoch": 4.959573311591249, + "grad_norm": 0.9517308473587036, + "learning_rate": 0.00038027415409702404, + "loss": 3.3992, + "step": 72995 + }, + { + "epoch": 4.959913031661911, + "grad_norm": 1.0318973064422607, + "learning_rate": 0.0003802316890881913, + "loss": 3.4311, + "step": 73000 + }, + { + "epoch": 4.960252751732573, + "grad_norm": 0.7531245946884155, + "learning_rate": 0.00038018922407935865, + "loss": 3.3476, + "step": 73005 + }, + { + "epoch": 4.960592471803234, + "grad_norm": 0.8224403858184814, + "learning_rate": 0.0003801467590705259, + "loss": 3.6405, + "step": 73010 + }, + { + "epoch": 4.960932191873896, + "grad_norm": 0.9825751185417175, + "learning_rate": 0.00038010429406169316, + "loss": 3.3046, + "step": 73015 + }, + { + "epoch": 4.961271911944557, + "grad_norm": 0.8589398860931396, + "learning_rate": 0.000380070322054627, + "loss": 3.7297, + "step": 73020 + }, + { + "epoch": 4.961611632015219, + "grad_norm": 0.8707638382911682, + "learning_rate": 0.00038002785704579426, + "loss": 3.418, + "step": 73025 + }, + { + "epoch": 4.961951352085881, + "grad_norm": 0.802861213684082, + "learning_rate": 0.00037998539203696154, + "loss": 3.4982, + "step": 73030 + }, + { + "epoch": 4.9622910721565425, + "grad_norm": 0.9617286324501038, + "learning_rate": 0.0003799429270281288, + "loss": 3.693, + "step": 73035 + }, + { + "epoch": 4.962630792227205, + "grad_norm": 0.8438165187835693, + "learning_rate": 0.0003799004620192961, + "loss": 3.6013, + "step": 73040 + }, + { + "epoch": 4.962970512297867, + "grad_norm": 0.9196261763572693, + "learning_rate": 0.0003798579970104634, + "loss": 3.6706, + "step": 73045 + }, + { + "epoch": 4.963310232368528, + "grad_norm": 0.9793634414672852, + "learning_rate": 0.00037981553200163066, + "loss": 3.4627, + "step": 73050 + }, + { + "epoch": 4.96364995243919, + "grad_norm": 0.7381039261817932, + "learning_rate": 0.00037977306699279794, + "loss": 3.2338, + "step": 73055 + }, + { + "epoch": 4.963989672509852, + "grad_norm": 1.0071227550506592, + "learning_rate": 0.00037973060198396517, + "loss": 3.4134, + "step": 73060 + }, + { + "epoch": 4.964329392580513, + "grad_norm": 0.9565443992614746, + "learning_rate": 0.0003796881369751325, + "loss": 3.3272, + "step": 73065 + }, + { + "epoch": 4.964669112651175, + "grad_norm": 1.1846388578414917, + "learning_rate": 0.0003796456719662998, + "loss": 3.6374, + "step": 73070 + }, + { + "epoch": 4.965008832721837, + "grad_norm": 0.8797850012779236, + "learning_rate": 0.000379603206957467, + "loss": 3.4257, + "step": 73075 + }, + { + "epoch": 4.9653485527924985, + "grad_norm": 0.8766632080078125, + "learning_rate": 0.00037956074194863434, + "loss": 3.4989, + "step": 73080 + }, + { + "epoch": 4.965688272863161, + "grad_norm": 0.8836286664009094, + "learning_rate": 0.0003795182769398016, + "loss": 3.6529, + "step": 73085 + }, + { + "epoch": 4.966027992933823, + "grad_norm": 0.8127000331878662, + "learning_rate": 0.0003794758119309689, + "loss": 3.531, + "step": 73090 + }, + { + "epoch": 4.966367713004484, + "grad_norm": 0.8703115582466125, + "learning_rate": 0.0003794333469221362, + "loss": 3.3593, + "step": 73095 + }, + { + "epoch": 4.966707433075146, + "grad_norm": 1.0239789485931396, + "learning_rate": 0.00037939088191330346, + "loss": 3.2908, + "step": 73100 + }, + { + "epoch": 4.967047153145808, + "grad_norm": 0.6369106769561768, + "learning_rate": 0.00037934841690447074, + "loss": 3.5464, + "step": 73105 + }, + { + "epoch": 4.967386873216469, + "grad_norm": 0.8755888938903809, + "learning_rate": 0.00037930595189563797, + "loss": 3.4081, + "step": 73110 + }, + { + "epoch": 4.967726593287131, + "grad_norm": 0.7980126738548279, + "learning_rate": 0.0003792634868868053, + "loss": 3.3769, + "step": 73115 + }, + { + "epoch": 4.968066313357793, + "grad_norm": 0.8070387244224548, + "learning_rate": 0.0003792210218779726, + "loss": 3.7639, + "step": 73120 + }, + { + "epoch": 4.9684060334284545, + "grad_norm": 0.9779368042945862, + "learning_rate": 0.0003791785568691398, + "loss": 3.6151, + "step": 73125 + }, + { + "epoch": 4.968745753499117, + "grad_norm": 0.8079439401626587, + "learning_rate": 0.00037913609186030715, + "loss": 3.4603, + "step": 73130 + }, + { + "epoch": 4.969085473569779, + "grad_norm": 0.7508350014686584, + "learning_rate": 0.0003790936268514744, + "loss": 3.2525, + "step": 73135 + }, + { + "epoch": 4.96942519364044, + "grad_norm": 0.821510910987854, + "learning_rate": 0.00037905116184264165, + "loss": 3.5646, + "step": 73140 + }, + { + "epoch": 4.969764913711102, + "grad_norm": 0.7953730821609497, + "learning_rate": 0.00037900869683380893, + "loss": 3.273, + "step": 73145 + }, + { + "epoch": 4.970104633781764, + "grad_norm": 0.9620274901390076, + "learning_rate": 0.00037896623182497627, + "loss": 3.7268, + "step": 73150 + }, + { + "epoch": 4.970444353852425, + "grad_norm": 0.8195778131484985, + "learning_rate": 0.0003789237668161435, + "loss": 3.3992, + "step": 73155 + }, + { + "epoch": 4.970784073923087, + "grad_norm": 0.6574294567108154, + "learning_rate": 0.00037888130180731077, + "loss": 3.6755, + "step": 73160 + }, + { + "epoch": 4.971123793993749, + "grad_norm": 0.9386349320411682, + "learning_rate": 0.0003788388367984781, + "loss": 3.6215, + "step": 73165 + }, + { + "epoch": 4.971463514064411, + "grad_norm": 0.7361937761306763, + "learning_rate": 0.00037879637178964533, + "loss": 3.4542, + "step": 73170 + }, + { + "epoch": 4.971803234135073, + "grad_norm": 0.9517983794212341, + "learning_rate": 0.0003787539067808126, + "loss": 3.6316, + "step": 73175 + }, + { + "epoch": 4.972142954205735, + "grad_norm": 0.8266351819038391, + "learning_rate": 0.0003787114417719799, + "loss": 3.3636, + "step": 73180 + }, + { + "epoch": 4.972482674276396, + "grad_norm": 0.8275221586227417, + "learning_rate": 0.00037866897676314717, + "loss": 3.3745, + "step": 73185 + }, + { + "epoch": 4.972822394347058, + "grad_norm": 0.7640926241874695, + "learning_rate": 0.00037862651175431445, + "loss": 3.6032, + "step": 73190 + }, + { + "epoch": 4.97316211441772, + "grad_norm": 0.9339492917060852, + "learning_rate": 0.00037858404674548173, + "loss": 3.6623, + "step": 73195 + }, + { + "epoch": 4.973501834488381, + "grad_norm": 1.0526384115219116, + "learning_rate": 0.000378541581736649, + "loss": 3.5645, + "step": 73200 + }, + { + "epoch": 4.973841554559043, + "grad_norm": 0.8323665261268616, + "learning_rate": 0.0003784991167278163, + "loss": 3.6161, + "step": 73205 + }, + { + "epoch": 4.974181274629705, + "grad_norm": 0.8113769888877869, + "learning_rate": 0.00037845665171898357, + "loss": 3.536, + "step": 73210 + }, + { + "epoch": 4.974520994700367, + "grad_norm": 0.7971802353858948, + "learning_rate": 0.0003784141867101508, + "loss": 3.4684, + "step": 73215 + }, + { + "epoch": 4.974860714771029, + "grad_norm": 0.812390923500061, + "learning_rate": 0.00037837172170131813, + "loss": 3.4672, + "step": 73220 + }, + { + "epoch": 4.975200434841691, + "grad_norm": 0.8449340462684631, + "learning_rate": 0.0003783292566924854, + "loss": 3.4544, + "step": 73225 + }, + { + "epoch": 4.975540154912352, + "grad_norm": 0.8342920541763306, + "learning_rate": 0.00037828679168365264, + "loss": 3.107, + "step": 73230 + }, + { + "epoch": 4.975879874983014, + "grad_norm": 0.8234513998031616, + "learning_rate": 0.00037824432667481997, + "loss": 3.4255, + "step": 73235 + }, + { + "epoch": 4.976219595053676, + "grad_norm": 0.9016555547714233, + "learning_rate": 0.00037820186166598725, + "loss": 3.2343, + "step": 73240 + }, + { + "epoch": 4.976559315124337, + "grad_norm": 1.0710186958312988, + "learning_rate": 0.0003781593966571545, + "loss": 3.6409, + "step": 73245 + }, + { + "epoch": 4.976899035194999, + "grad_norm": 0.807847261428833, + "learning_rate": 0.00037811693164832176, + "loss": 3.7425, + "step": 73250 + }, + { + "epoch": 4.977238755265661, + "grad_norm": 0.681605339050293, + "learning_rate": 0.0003780744666394891, + "loss": 3.5728, + "step": 73255 + }, + { + "epoch": 4.977578475336323, + "grad_norm": 0.7357401847839355, + "learning_rate": 0.0003780320016306564, + "loss": 3.4506, + "step": 73260 + }, + { + "epoch": 4.977918195406985, + "grad_norm": 0.8085784316062927, + "learning_rate": 0.0003779895366218236, + "loss": 3.6868, + "step": 73265 + }, + { + "epoch": 4.978257915477647, + "grad_norm": 0.8739722967147827, + "learning_rate": 0.00037794707161299093, + "loss": 3.4922, + "step": 73270 + }, + { + "epoch": 4.978597635548308, + "grad_norm": 1.052246332168579, + "learning_rate": 0.0003779046066041582, + "loss": 3.5771, + "step": 73275 + }, + { + "epoch": 4.97893735561897, + "grad_norm": 0.845543384552002, + "learning_rate": 0.00037786214159532544, + "loss": 3.3368, + "step": 73280 + }, + { + "epoch": 4.979277075689632, + "grad_norm": 0.8477576375007629, + "learning_rate": 0.0003778196765864927, + "loss": 3.3259, + "step": 73285 + }, + { + "epoch": 4.979616795760293, + "grad_norm": 0.8069630861282349, + "learning_rate": 0.00037777721157766005, + "loss": 3.1542, + "step": 73290 + }, + { + "epoch": 4.979956515830955, + "grad_norm": 2.2534751892089844, + "learning_rate": 0.0003777347465688273, + "loss": 3.4752, + "step": 73295 + }, + { + "epoch": 4.980296235901617, + "grad_norm": 0.869499921798706, + "learning_rate": 0.00037769228155999456, + "loss": 3.2494, + "step": 73300 + }, + { + "epoch": 4.980635955972279, + "grad_norm": 0.9795598387718201, + "learning_rate": 0.0003776498165511619, + "loss": 3.5712, + "step": 73305 + }, + { + "epoch": 4.980975676042941, + "grad_norm": 0.7508470416069031, + "learning_rate": 0.0003776073515423291, + "loss": 3.5615, + "step": 73310 + }, + { + "epoch": 4.981315396113603, + "grad_norm": 1.120951771736145, + "learning_rate": 0.0003775648865334964, + "loss": 3.5468, + "step": 73315 + }, + { + "epoch": 4.981655116184264, + "grad_norm": 1.0279345512390137, + "learning_rate": 0.0003775224215246637, + "loss": 3.4769, + "step": 73320 + }, + { + "epoch": 4.981994836254926, + "grad_norm": 0.7992268204689026, + "learning_rate": 0.00037747995651583096, + "loss": 3.4494, + "step": 73325 + }, + { + "epoch": 4.982334556325588, + "grad_norm": 0.8894575834274292, + "learning_rate": 0.00037743749150699824, + "loss": 3.1991, + "step": 73330 + }, + { + "epoch": 4.982674276396249, + "grad_norm": 0.7867279052734375, + "learning_rate": 0.0003773950264981655, + "loss": 3.6071, + "step": 73335 + }, + { + "epoch": 4.983013996466911, + "grad_norm": 0.854415237903595, + "learning_rate": 0.0003773525614893328, + "loss": 3.6371, + "step": 73340 + }, + { + "epoch": 4.983353716537573, + "grad_norm": 1.2216944694519043, + "learning_rate": 0.0003773100964805001, + "loss": 3.2501, + "step": 73345 + }, + { + "epoch": 4.983693436608235, + "grad_norm": 0.8059180974960327, + "learning_rate": 0.00037726763147166736, + "loss": 3.5009, + "step": 73350 + }, + { + "epoch": 4.984033156678897, + "grad_norm": 0.8056294322013855, + "learning_rate": 0.0003772251664628346, + "loss": 3.3596, + "step": 73355 + }, + { + "epoch": 4.984372876749559, + "grad_norm": 0.9536705017089844, + "learning_rate": 0.0003771827014540019, + "loss": 3.3777, + "step": 73360 + }, + { + "epoch": 4.98471259682022, + "grad_norm": 0.8590909242630005, + "learning_rate": 0.0003771402364451692, + "loss": 3.6966, + "step": 73365 + }, + { + "epoch": 4.985052316890882, + "grad_norm": 0.7205674648284912, + "learning_rate": 0.0003770977714363364, + "loss": 3.515, + "step": 73370 + }, + { + "epoch": 4.985392036961544, + "grad_norm": 0.8265754580497742, + "learning_rate": 0.00037705530642750376, + "loss": 3.5864, + "step": 73375 + }, + { + "epoch": 4.985731757032205, + "grad_norm": 0.9633201360702515, + "learning_rate": 0.00037701284141867104, + "loss": 3.3909, + "step": 73380 + }, + { + "epoch": 4.986071477102867, + "grad_norm": 1.175659418106079, + "learning_rate": 0.00037697037640983827, + "loss": 3.2769, + "step": 73385 + }, + { + "epoch": 4.986411197173529, + "grad_norm": 0.7886924743652344, + "learning_rate": 0.0003769279114010056, + "loss": 3.4543, + "step": 73390 + }, + { + "epoch": 4.986750917244191, + "grad_norm": 0.791894257068634, + "learning_rate": 0.0003768854463921729, + "loss": 3.3131, + "step": 73395 + }, + { + "epoch": 4.987090637314853, + "grad_norm": 0.9024456143379211, + "learning_rate": 0.0003768429813833401, + "loss": 3.4628, + "step": 73400 + }, + { + "epoch": 4.987430357385515, + "grad_norm": 0.8091867566108704, + "learning_rate": 0.0003768005163745074, + "loss": 3.5109, + "step": 73405 + }, + { + "epoch": 4.987770077456176, + "grad_norm": 0.8446680903434753, + "learning_rate": 0.0003767580513656747, + "loss": 3.5669, + "step": 73410 + }, + { + "epoch": 4.988109797526838, + "grad_norm": 1.0075198411941528, + "learning_rate": 0.00037671558635684195, + "loss": 3.301, + "step": 73415 + }, + { + "epoch": 4.9884495175975, + "grad_norm": 0.8329077959060669, + "learning_rate": 0.0003766731213480092, + "loss": 3.5417, + "step": 73420 + }, + { + "epoch": 4.988789237668161, + "grad_norm": 0.9297465682029724, + "learning_rate": 0.00037663065633917656, + "loss": 3.6382, + "step": 73425 + }, + { + "epoch": 4.989128957738823, + "grad_norm": 1.0710428953170776, + "learning_rate": 0.00037658819133034384, + "loss": 3.574, + "step": 73430 + }, + { + "epoch": 4.989468677809485, + "grad_norm": 0.9128989577293396, + "learning_rate": 0.00037654572632151107, + "loss": 3.6721, + "step": 73435 + }, + { + "epoch": 4.989808397880147, + "grad_norm": 0.6970356106758118, + "learning_rate": 0.00037650326131267835, + "loss": 3.5561, + "step": 73440 + }, + { + "epoch": 4.990148117950809, + "grad_norm": 1.047363519668579, + "learning_rate": 0.0003764607963038457, + "loss": 3.6877, + "step": 73445 + }, + { + "epoch": 4.990487838021471, + "grad_norm": 0.6922537088394165, + "learning_rate": 0.0003764183312950129, + "loss": 3.2988, + "step": 73450 + }, + { + "epoch": 4.990827558092132, + "grad_norm": 1.1932212114334106, + "learning_rate": 0.0003763758662861802, + "loss": 3.356, + "step": 73455 + }, + { + "epoch": 4.991167278162794, + "grad_norm": 0.6997671127319336, + "learning_rate": 0.0003763334012773475, + "loss": 3.5577, + "step": 73460 + }, + { + "epoch": 4.991506998233456, + "grad_norm": 1.0073438882827759, + "learning_rate": 0.00037629093626851475, + "loss": 3.2479, + "step": 73465 + }, + { + "epoch": 4.991846718304117, + "grad_norm": 0.9190857410430908, + "learning_rate": 0.00037624847125968203, + "loss": 3.479, + "step": 73470 + }, + { + "epoch": 4.992186438374779, + "grad_norm": 0.915298342704773, + "learning_rate": 0.0003762060062508493, + "loss": 3.5352, + "step": 73475 + }, + { + "epoch": 4.9925261584454415, + "grad_norm": 0.7476702928543091, + "learning_rate": 0.0003761635412420166, + "loss": 3.532, + "step": 73480 + }, + { + "epoch": 4.992865878516103, + "grad_norm": 0.7395258545875549, + "learning_rate": 0.00037612107623318387, + "loss": 3.5379, + "step": 73485 + }, + { + "epoch": 4.993205598586765, + "grad_norm": 0.9907160401344299, + "learning_rate": 0.00037607861122435115, + "loss": 3.5372, + "step": 73490 + }, + { + "epoch": 4.993545318657426, + "grad_norm": 0.7768539190292358, + "learning_rate": 0.00037603614621551843, + "loss": 3.61, + "step": 73495 + }, + { + "epoch": 4.993885038728088, + "grad_norm": 0.8661043047904968, + "learning_rate": 0.0003759936812066857, + "loss": 3.3821, + "step": 73500 + }, + { + "epoch": 4.99422475879875, + "grad_norm": 0.841312825679779, + "learning_rate": 0.000375951216197853, + "loss": 3.3864, + "step": 73505 + }, + { + "epoch": 4.994564478869411, + "grad_norm": 0.8835954070091248, + "learning_rate": 0.0003759087511890202, + "loss": 3.6711, + "step": 73510 + }, + { + "epoch": 4.994904198940073, + "grad_norm": 1.0256164073944092, + "learning_rate": 0.00037586628618018755, + "loss": 3.5179, + "step": 73515 + }, + { + "epoch": 4.995243919010735, + "grad_norm": 0.9863247275352478, + "learning_rate": 0.00037582382117135483, + "loss": 3.4976, + "step": 73520 + }, + { + "epoch": 4.995583639081397, + "grad_norm": 1.2267128229141235, + "learning_rate": 0.00037578135616252205, + "loss": 3.312, + "step": 73525 + }, + { + "epoch": 4.995923359152059, + "grad_norm": 0.9240493774414062, + "learning_rate": 0.0003757388911536894, + "loss": 3.4785, + "step": 73530 + }, + { + "epoch": 4.996263079222721, + "grad_norm": 0.9256787896156311, + "learning_rate": 0.00037569642614485667, + "loss": 3.3641, + "step": 73535 + }, + { + "epoch": 4.996602799293382, + "grad_norm": 0.8959862589836121, + "learning_rate": 0.0003756539611360239, + "loss": 3.5026, + "step": 73540 + }, + { + "epoch": 4.996942519364044, + "grad_norm": 0.8777661919593811, + "learning_rate": 0.0003756114961271912, + "loss": 3.6058, + "step": 73545 + }, + { + "epoch": 4.997282239434706, + "grad_norm": 0.9687397480010986, + "learning_rate": 0.0003755690311183585, + "loss": 3.4929, + "step": 73550 + }, + { + "epoch": 4.997621959505367, + "grad_norm": 0.8348364233970642, + "learning_rate": 0.00037552656610952573, + "loss": 3.5241, + "step": 73555 + }, + { + "epoch": 4.997961679576029, + "grad_norm": 0.8352223038673401, + "learning_rate": 0.000375484101100693, + "loss": 3.6137, + "step": 73560 + }, + { + "epoch": 4.998301399646691, + "grad_norm": 0.9314370155334473, + "learning_rate": 0.00037544163609186035, + "loss": 3.2595, + "step": 73565 + }, + { + "epoch": 4.998641119717353, + "grad_norm": 0.7257234454154968, + "learning_rate": 0.0003753991710830276, + "loss": 3.4883, + "step": 73570 + }, + { + "epoch": 4.998980839788015, + "grad_norm": 0.8355661630630493, + "learning_rate": 0.00037535670607419486, + "loss": 3.3348, + "step": 73575 + }, + { + "epoch": 4.999320559858677, + "grad_norm": 0.7806727886199951, + "learning_rate": 0.00037531424106536214, + "loss": 3.4873, + "step": 73580 + }, + { + "epoch": 4.999660279929338, + "grad_norm": 1.0444079637527466, + "learning_rate": 0.0003752717760565294, + "loss": 3.6871, + "step": 73585 + }, + { + "epoch": 5.0, + "grad_norm": 15.952349662780762, + "learning_rate": 0.0003752293110476967, + "loss": 3.4057, + "step": 73590 + }, + { + "epoch": 5.0, + "eval_bertscore": { + "f1": 0.8398914781868116, + "precision": 0.8441804538853052, + "recall": 0.8364045665596608 + }, + "eval_bleu_4": 0.015792577734129523, + "eval_exact_match": 0.00019381723035177828, + "eval_loss": 3.4122190475463867, + "eval_meteor": 0.08528680118964778, + "eval_rouge": { + "rouge1": 0.12317198497260273, + "rouge2": 0.01733294876550417, + "rougeL": 0.10626811005235914, + "rougeLsum": 0.10625852396916953 + }, + "eval_runtime": 1435.9184, + "eval_samples_per_second": 7.186, + "eval_steps_per_second": 0.898, + "step": 73590 + }, + { + "epoch": 5.000339720070662, + "grad_norm": 0.9994232654571533, + "learning_rate": 0.000375186846038864, + "loss": 3.4601, + "step": 73595 + }, + { + "epoch": 5.000679440141323, + "grad_norm": 0.9486339092254639, + "learning_rate": 0.0003751443810300313, + "loss": 3.364, + "step": 73600 + }, + { + "epoch": 5.001019160211985, + "grad_norm": 0.8389306664466858, + "learning_rate": 0.00037510191602119854, + "loss": 3.6097, + "step": 73605 + }, + { + "epoch": 5.001358880282647, + "grad_norm": 1.2887734174728394, + "learning_rate": 0.0003750594510123658, + "loss": 3.5944, + "step": 73610 + }, + { + "epoch": 5.001698600353309, + "grad_norm": 0.915189266204834, + "learning_rate": 0.0003750169860035331, + "loss": 3.4356, + "step": 73615 + }, + { + "epoch": 5.002038320423971, + "grad_norm": 0.922826886177063, + "learning_rate": 0.0003749745209947004, + "loss": 3.4161, + "step": 73620 + }, + { + "epoch": 5.002378040494633, + "grad_norm": 0.8105789422988892, + "learning_rate": 0.00037493205598586766, + "loss": 3.54, + "step": 73625 + }, + { + "epoch": 5.002717760565294, + "grad_norm": 0.8324738144874573, + "learning_rate": 0.00037488959097703494, + "loss": 3.2008, + "step": 73630 + }, + { + "epoch": 5.003057480635956, + "grad_norm": 0.7788756489753723, + "learning_rate": 0.0003748471259682022, + "loss": 3.5862, + "step": 73635 + }, + { + "epoch": 5.003397200706618, + "grad_norm": 0.9470195770263672, + "learning_rate": 0.0003748046609593695, + "loss": 3.2738, + "step": 73640 + }, + { + "epoch": 5.003736920777279, + "grad_norm": 0.8502275943756104, + "learning_rate": 0.0003747621959505368, + "loss": 3.3517, + "step": 73645 + }, + { + "epoch": 5.004076640847941, + "grad_norm": 1.0197906494140625, + "learning_rate": 0.000374719730941704, + "loss": 3.4757, + "step": 73650 + }, + { + "epoch": 5.004416360918603, + "grad_norm": 0.7774593234062195, + "learning_rate": 0.00037467726593287134, + "loss": 3.4927, + "step": 73655 + }, + { + "epoch": 5.004756080989265, + "grad_norm": 1.0251638889312744, + "learning_rate": 0.0003746348009240386, + "loss": 3.5666, + "step": 73660 + }, + { + "epoch": 5.005095801059927, + "grad_norm": 0.9299939870834351, + "learning_rate": 0.00037459233591520584, + "loss": 3.5064, + "step": 73665 + }, + { + "epoch": 5.005435521130589, + "grad_norm": 0.9357452988624573, + "learning_rate": 0.0003745498709063732, + "loss": 3.2613, + "step": 73670 + }, + { + "epoch": 5.00577524120125, + "grad_norm": 0.692859947681427, + "learning_rate": 0.00037450740589754046, + "loss": 3.5372, + "step": 73675 + }, + { + "epoch": 5.006114961271912, + "grad_norm": 0.7752829790115356, + "learning_rate": 0.0003744649408887077, + "loss": 3.4152, + "step": 73680 + }, + { + "epoch": 5.006454681342574, + "grad_norm": 0.9712240099906921, + "learning_rate": 0.000374422475879875, + "loss": 3.5696, + "step": 73685 + }, + { + "epoch": 5.006794401413235, + "grad_norm": 0.9418588280677795, + "learning_rate": 0.0003743800108710423, + "loss": 3.3617, + "step": 73690 + }, + { + "epoch": 5.007134121483897, + "grad_norm": 0.9460843205451965, + "learning_rate": 0.0003743375458622095, + "loss": 3.4308, + "step": 73695 + }, + { + "epoch": 5.007473841554559, + "grad_norm": 0.8249228000640869, + "learning_rate": 0.0003742950808533768, + "loss": 3.3089, + "step": 73700 + }, + { + "epoch": 5.007813561625221, + "grad_norm": 0.8295456767082214, + "learning_rate": 0.00037425261584454414, + "loss": 3.5134, + "step": 73705 + }, + { + "epoch": 5.008153281695883, + "grad_norm": 0.9643674492835999, + "learning_rate": 0.00037421015083571136, + "loss": 3.5055, + "step": 73710 + }, + { + "epoch": 5.008493001766545, + "grad_norm": 1.0379256010055542, + "learning_rate": 0.00037416768582687864, + "loss": 3.6052, + "step": 73715 + }, + { + "epoch": 5.008832721837206, + "grad_norm": 0.7906454801559448, + "learning_rate": 0.000374125220818046, + "loss": 3.4178, + "step": 73720 + }, + { + "epoch": 5.009172441907868, + "grad_norm": 0.7784892916679382, + "learning_rate": 0.0003740827558092132, + "loss": 3.261, + "step": 73725 + }, + { + "epoch": 5.00951216197853, + "grad_norm": 0.9405249357223511, + "learning_rate": 0.0003740402908003805, + "loss": 3.2724, + "step": 73730 + }, + { + "epoch": 5.009851882049191, + "grad_norm": 0.8090588450431824, + "learning_rate": 0.00037399782579154776, + "loss": 3.4607, + "step": 73735 + }, + { + "epoch": 5.010191602119853, + "grad_norm": 0.8078798055648804, + "learning_rate": 0.00037395536078271504, + "loss": 3.4377, + "step": 73740 + }, + { + "epoch": 5.0105313221905154, + "grad_norm": 0.7545223236083984, + "learning_rate": 0.0003739128957738823, + "loss": 3.205, + "step": 73745 + }, + { + "epoch": 5.010871042261177, + "grad_norm": 0.8076714873313904, + "learning_rate": 0.0003738704307650496, + "loss": 3.1831, + "step": 73750 + }, + { + "epoch": 5.011210762331839, + "grad_norm": 0.9578872919082642, + "learning_rate": 0.0003738279657562169, + "loss": 3.389, + "step": 73755 + }, + { + "epoch": 5.0115504824025, + "grad_norm": 0.962840735912323, + "learning_rate": 0.00037378550074738416, + "loss": 3.4191, + "step": 73760 + }, + { + "epoch": 5.011890202473162, + "grad_norm": 2.0843331813812256, + "learning_rate": 0.00037374303573855144, + "loss": 3.4794, + "step": 73765 + }, + { + "epoch": 5.012229922543824, + "grad_norm": 1.0201191902160645, + "learning_rate": 0.00037370057072971867, + "loss": 3.1952, + "step": 73770 + }, + { + "epoch": 5.012569642614485, + "grad_norm": 0.904174268245697, + "learning_rate": 0.000373658105720886, + "loss": 3.3297, + "step": 73775 + }, + { + "epoch": 5.012909362685147, + "grad_norm": 0.9298562407493591, + "learning_rate": 0.0003736156407120533, + "loss": 3.5011, + "step": 73780 + }, + { + "epoch": 5.013249082755809, + "grad_norm": 0.7149190306663513, + "learning_rate": 0.00037357317570322056, + "loss": 3.485, + "step": 73785 + }, + { + "epoch": 5.013588802826471, + "grad_norm": 0.8251997828483582, + "learning_rate": 0.00037353071069438784, + "loss": 3.3702, + "step": 73790 + }, + { + "epoch": 5.013928522897133, + "grad_norm": 3.364637613296509, + "learning_rate": 0.0003734882456855551, + "loss": 3.5662, + "step": 73795 + }, + { + "epoch": 5.014268242967795, + "grad_norm": 0.800084114074707, + "learning_rate": 0.0003734457806767224, + "loss": 3.4805, + "step": 73800 + }, + { + "epoch": 5.014607963038456, + "grad_norm": 0.8326594829559326, + "learning_rate": 0.00037340331566788963, + "loss": 3.1259, + "step": 73805 + }, + { + "epoch": 5.014947683109118, + "grad_norm": 0.7391427755355835, + "learning_rate": 0.00037336085065905696, + "loss": 3.4942, + "step": 73810 + }, + { + "epoch": 5.01528740317978, + "grad_norm": 1.1283173561096191, + "learning_rate": 0.00037331838565022424, + "loss": 3.3695, + "step": 73815 + }, + { + "epoch": 5.015627123250441, + "grad_norm": 1.0340348482131958, + "learning_rate": 0.00037327592064139147, + "loss": 3.5047, + "step": 73820 + }, + { + "epoch": 5.015966843321103, + "grad_norm": 0.9628918766975403, + "learning_rate": 0.0003732334556325588, + "loss": 3.2661, + "step": 73825 + }, + { + "epoch": 5.016306563391765, + "grad_norm": 0.872589111328125, + "learning_rate": 0.0003731909906237261, + "loss": 3.5156, + "step": 73830 + }, + { + "epoch": 5.016646283462427, + "grad_norm": 0.8294765949249268, + "learning_rate": 0.0003731485256148933, + "loss": 3.4694, + "step": 73835 + }, + { + "epoch": 5.016986003533089, + "grad_norm": 0.8212862610816956, + "learning_rate": 0.0003731060606060606, + "loss": 3.3008, + "step": 73840 + }, + { + "epoch": 5.017325723603751, + "grad_norm": 0.8345354199409485, + "learning_rate": 0.0003730635955972279, + "loss": 3.1798, + "step": 73845 + }, + { + "epoch": 5.017665443674412, + "grad_norm": 0.882144570350647, + "learning_rate": 0.00037302113058839515, + "loss": 3.361, + "step": 73850 + }, + { + "epoch": 5.018005163745074, + "grad_norm": 0.9927316904067993, + "learning_rate": 0.00037297866557956243, + "loss": 3.4227, + "step": 73855 + }, + { + "epoch": 5.018344883815736, + "grad_norm": 0.7955802083015442, + "learning_rate": 0.00037293620057072977, + "loss": 3.2914, + "step": 73860 + }, + { + "epoch": 5.018684603886397, + "grad_norm": 1.051182508468628, + "learning_rate": 0.000372893735561897, + "loss": 3.4238, + "step": 73865 + }, + { + "epoch": 5.019024323957059, + "grad_norm": 0.8550714254379272, + "learning_rate": 0.00037285127055306427, + "loss": 3.4838, + "step": 73870 + }, + { + "epoch": 5.019364044027721, + "grad_norm": 0.8220783472061157, + "learning_rate": 0.00037280880554423155, + "loss": 3.5348, + "step": 73875 + }, + { + "epoch": 5.019703764098383, + "grad_norm": 0.8828471899032593, + "learning_rate": 0.00037276634053539883, + "loss": 3.3527, + "step": 73880 + }, + { + "epoch": 5.020043484169045, + "grad_norm": 1.1163197755813599, + "learning_rate": 0.0003727238755265661, + "loss": 3.3666, + "step": 73885 + }, + { + "epoch": 5.020383204239707, + "grad_norm": 1.0556244850158691, + "learning_rate": 0.0003726814105177334, + "loss": 3.5452, + "step": 73890 + }, + { + "epoch": 5.020722924310368, + "grad_norm": 0.9323630332946777, + "learning_rate": 0.00037263894550890067, + "loss": 3.2959, + "step": 73895 + }, + { + "epoch": 5.02106264438103, + "grad_norm": 1.2798514366149902, + "learning_rate": 0.00037259648050006795, + "loss": 3.4831, + "step": 73900 + }, + { + "epoch": 5.021402364451692, + "grad_norm": 0.8313497304916382, + "learning_rate": 0.00037255401549123523, + "loss": 3.4798, + "step": 73905 + }, + { + "epoch": 5.021742084522353, + "grad_norm": 1.0122069120407104, + "learning_rate": 0.00037251155048240246, + "loss": 3.4343, + "step": 73910 + }, + { + "epoch": 5.022081804593015, + "grad_norm": 1.2634543180465698, + "learning_rate": 0.0003724690854735698, + "loss": 3.4824, + "step": 73915 + }, + { + "epoch": 5.022421524663677, + "grad_norm": 0.8093626499176025, + "learning_rate": 0.00037242662046473707, + "loss": 3.3749, + "step": 73920 + }, + { + "epoch": 5.022761244734339, + "grad_norm": 0.9433859586715698, + "learning_rate": 0.0003723841554559043, + "loss": 3.4069, + "step": 73925 + }, + { + "epoch": 5.023100964805001, + "grad_norm": 0.7309264540672302, + "learning_rate": 0.00037234169044707163, + "loss": 3.3713, + "step": 73930 + }, + { + "epoch": 5.023440684875663, + "grad_norm": 0.9086424708366394, + "learning_rate": 0.0003722992254382389, + "loss": 3.7081, + "step": 73935 + }, + { + "epoch": 5.023780404946324, + "grad_norm": 0.7914588451385498, + "learning_rate": 0.00037225676042940614, + "loss": 3.2824, + "step": 73940 + }, + { + "epoch": 5.024120125016986, + "grad_norm": 0.6827050447463989, + "learning_rate": 0.00037221429542057347, + "loss": 3.2269, + "step": 73945 + }, + { + "epoch": 5.024459845087648, + "grad_norm": 0.7827907800674438, + "learning_rate": 0.00037217183041174075, + "loss": 3.2104, + "step": 73950 + }, + { + "epoch": 5.024799565158309, + "grad_norm": 0.8546298146247864, + "learning_rate": 0.00037212936540290803, + "loss": 3.2791, + "step": 73955 + }, + { + "epoch": 5.025139285228971, + "grad_norm": 0.769433856010437, + "learning_rate": 0.00037208690039407526, + "loss": 3.4841, + "step": 73960 + }, + { + "epoch": 5.025479005299633, + "grad_norm": 0.916776716709137, + "learning_rate": 0.0003720444353852426, + "loss": 3.6621, + "step": 73965 + }, + { + "epoch": 5.025818725370295, + "grad_norm": 0.8575485944747925, + "learning_rate": 0.00037200197037640987, + "loss": 3.5787, + "step": 73970 + }, + { + "epoch": 5.026158445440957, + "grad_norm": 0.7219715714454651, + "learning_rate": 0.0003719595053675771, + "loss": 3.4126, + "step": 73975 + }, + { + "epoch": 5.026498165511619, + "grad_norm": 0.8852742314338684, + "learning_rate": 0.00037191704035874443, + "loss": 3.4364, + "step": 73980 + }, + { + "epoch": 5.02683788558228, + "grad_norm": 0.7971519827842712, + "learning_rate": 0.0003718745753499117, + "loss": 3.3037, + "step": 73985 + }, + { + "epoch": 5.027177605652942, + "grad_norm": 0.8093227744102478, + "learning_rate": 0.00037183211034107894, + "loss": 3.365, + "step": 73990 + }, + { + "epoch": 5.027517325723604, + "grad_norm": 0.6934683322906494, + "learning_rate": 0.0003717896453322462, + "loss": 3.349, + "step": 73995 + }, + { + "epoch": 5.027857045794265, + "grad_norm": 0.810848593711853, + "learning_rate": 0.00037174718032341355, + "loss": 3.2756, + "step": 74000 + }, + { + "epoch": 5.028196765864927, + "grad_norm": 0.9274951219558716, + "learning_rate": 0.0003717047153145808, + "loss": 3.6305, + "step": 74005 + }, + { + "epoch": 5.028536485935589, + "grad_norm": 0.9303903579711914, + "learning_rate": 0.00037166225030574806, + "loss": 3.4136, + "step": 74010 + }, + { + "epoch": 5.028876206006251, + "grad_norm": 1.301264762878418, + "learning_rate": 0.0003716197852969154, + "loss": 3.3685, + "step": 74015 + }, + { + "epoch": 5.029215926076913, + "grad_norm": 1.0389009714126587, + "learning_rate": 0.0003715773202880826, + "loss": 3.2973, + "step": 74020 + }, + { + "epoch": 5.029555646147575, + "grad_norm": 1.0551624298095703, + "learning_rate": 0.0003715348552792499, + "loss": 3.6446, + "step": 74025 + }, + { + "epoch": 5.029895366218236, + "grad_norm": 0.9462447762489319, + "learning_rate": 0.0003714923902704172, + "loss": 3.3543, + "step": 74030 + }, + { + "epoch": 5.030235086288898, + "grad_norm": 0.9538938999176025, + "learning_rate": 0.00037144992526158446, + "loss": 3.4077, + "step": 74035 + }, + { + "epoch": 5.03057480635956, + "grad_norm": 0.9427371621131897, + "learning_rate": 0.00037140746025275174, + "loss": 3.3914, + "step": 74040 + }, + { + "epoch": 5.030914526430221, + "grad_norm": 1.897518515586853, + "learning_rate": 0.000371364995243919, + "loss": 3.1834, + "step": 74045 + }, + { + "epoch": 5.031254246500883, + "grad_norm": 1.1013680696487427, + "learning_rate": 0.0003713225302350863, + "loss": 3.6813, + "step": 74050 + }, + { + "epoch": 5.0315939665715455, + "grad_norm": 1.1735268831253052, + "learning_rate": 0.0003712800652262536, + "loss": 3.0994, + "step": 74055 + }, + { + "epoch": 5.031933686642207, + "grad_norm": 1.4102859497070312, + "learning_rate": 0.00037123760021742086, + "loss": 3.1744, + "step": 74060 + }, + { + "epoch": 5.032273406712869, + "grad_norm": 0.8009237051010132, + "learning_rate": 0.0003711951352085881, + "loss": 3.5815, + "step": 74065 + }, + { + "epoch": 5.032613126783531, + "grad_norm": 1.0834287405014038, + "learning_rate": 0.0003711526701997554, + "loss": 3.2265, + "step": 74070 + }, + { + "epoch": 5.032952846854192, + "grad_norm": 1.1820052862167358, + "learning_rate": 0.0003711102051909227, + "loss": 3.3397, + "step": 74075 + }, + { + "epoch": 5.033292566924854, + "grad_norm": 1.1438591480255127, + "learning_rate": 0.0003710677401820899, + "loss": 3.1517, + "step": 74080 + }, + { + "epoch": 5.033632286995516, + "grad_norm": 1.0315282344818115, + "learning_rate": 0.00037102527517325726, + "loss": 3.5182, + "step": 74085 + }, + { + "epoch": 5.033972007066177, + "grad_norm": 1.09751558303833, + "learning_rate": 0.00037098281016442454, + "loss": 3.3236, + "step": 74090 + }, + { + "epoch": 5.034311727136839, + "grad_norm": 0.8128194212913513, + "learning_rate": 0.00037094034515559177, + "loss": 3.7643, + "step": 74095 + }, + { + "epoch": 5.0346514472075015, + "grad_norm": 0.8694228529930115, + "learning_rate": 0.00037089788014675905, + "loss": 3.5288, + "step": 74100 + }, + { + "epoch": 5.034991167278163, + "grad_norm": 0.9522294998168945, + "learning_rate": 0.0003708554151379264, + "loss": 3.483, + "step": 74105 + }, + { + "epoch": 5.035330887348825, + "grad_norm": 1.090871810913086, + "learning_rate": 0.0003708129501290936, + "loss": 3.4496, + "step": 74110 + }, + { + "epoch": 5.035670607419486, + "grad_norm": 0.8297543525695801, + "learning_rate": 0.0003707704851202609, + "loss": 3.8216, + "step": 74115 + }, + { + "epoch": 5.036010327490148, + "grad_norm": 0.7101438641548157, + "learning_rate": 0.0003707280201114282, + "loss": 3.1768, + "step": 74120 + }, + { + "epoch": 5.03635004756081, + "grad_norm": 0.9635698795318604, + "learning_rate": 0.0003706855551025955, + "loss": 3.5372, + "step": 74125 + }, + { + "epoch": 5.036689767631471, + "grad_norm": 0.9348769187927246, + "learning_rate": 0.0003706430900937627, + "loss": 3.5914, + "step": 74130 + }, + { + "epoch": 5.037029487702133, + "grad_norm": 1.078817367553711, + "learning_rate": 0.00037060062508493, + "loss": 3.3153, + "step": 74135 + }, + { + "epoch": 5.037369207772795, + "grad_norm": 0.876028299331665, + "learning_rate": 0.00037055816007609734, + "loss": 3.6241, + "step": 74140 + }, + { + "epoch": 5.037708927843457, + "grad_norm": 0.9226345419883728, + "learning_rate": 0.00037051569506726457, + "loss": 3.5665, + "step": 74145 + }, + { + "epoch": 5.038048647914119, + "grad_norm": 1.1517977714538574, + "learning_rate": 0.00037047323005843185, + "loss": 3.4293, + "step": 74150 + }, + { + "epoch": 5.038388367984781, + "grad_norm": 0.7589874863624573, + "learning_rate": 0.0003704307650495992, + "loss": 3.5015, + "step": 74155 + }, + { + "epoch": 5.038728088055442, + "grad_norm": 1.1262942552566528, + "learning_rate": 0.0003703883000407664, + "loss": 3.3061, + "step": 74160 + }, + { + "epoch": 5.039067808126104, + "grad_norm": 1.181709885597229, + "learning_rate": 0.0003703458350319337, + "loss": 3.8682, + "step": 74165 + }, + { + "epoch": 5.039407528196766, + "grad_norm": 1.9769432544708252, + "learning_rate": 0.00037030337002310097, + "loss": 3.4371, + "step": 74170 + }, + { + "epoch": 5.039747248267427, + "grad_norm": 1.0707076787948608, + "learning_rate": 0.00037026090501426825, + "loss": 3.7277, + "step": 74175 + }, + { + "epoch": 5.040086968338089, + "grad_norm": 0.8418802618980408, + "learning_rate": 0.00037021844000543553, + "loss": 3.4404, + "step": 74180 + }, + { + "epoch": 5.040426688408751, + "grad_norm": 0.7402486801147461, + "learning_rate": 0.0003701759749966028, + "loss": 3.3841, + "step": 74185 + }, + { + "epoch": 5.040766408479413, + "grad_norm": 0.795915424823761, + "learning_rate": 0.0003701335099877701, + "loss": 3.5653, + "step": 74190 + }, + { + "epoch": 5.041106128550075, + "grad_norm": 0.8609259128570557, + "learning_rate": 0.00037009104497893737, + "loss": 3.4427, + "step": 74195 + }, + { + "epoch": 5.041445848620737, + "grad_norm": 0.9427459239959717, + "learning_rate": 0.00037004857997010465, + "loss": 3.6664, + "step": 74200 + }, + { + "epoch": 5.041785568691398, + "grad_norm": 0.7916103005409241, + "learning_rate": 0.0003700061149612719, + "loss": 3.3437, + "step": 74205 + }, + { + "epoch": 5.04212528876206, + "grad_norm": 1.0997225046157837, + "learning_rate": 0.0003699636499524392, + "loss": 3.78, + "step": 74210 + }, + { + "epoch": 5.042465008832722, + "grad_norm": 0.8461731672286987, + "learning_rate": 0.0003699211849436065, + "loss": 3.4848, + "step": 74215 + }, + { + "epoch": 5.042804728903383, + "grad_norm": 0.8070161938667297, + "learning_rate": 0.0003698787199347737, + "loss": 3.6364, + "step": 74220 + }, + { + "epoch": 5.043144448974045, + "grad_norm": 1.0020973682403564, + "learning_rate": 0.00036983625492594105, + "loss": 3.3674, + "step": 74225 + }, + { + "epoch": 5.043484169044707, + "grad_norm": 0.9731824994087219, + "learning_rate": 0.00036979378991710833, + "loss": 3.6054, + "step": 74230 + }, + { + "epoch": 5.043823889115369, + "grad_norm": 0.8446808457374573, + "learning_rate": 0.00036975132490827555, + "loss": 3.2864, + "step": 74235 + }, + { + "epoch": 5.044163609186031, + "grad_norm": 0.9603143930435181, + "learning_rate": 0.0003697088598994429, + "loss": 3.0696, + "step": 74240 + }, + { + "epoch": 5.044503329256693, + "grad_norm": 1.1697968244552612, + "learning_rate": 0.00036966639489061017, + "loss": 3.3659, + "step": 74245 + }, + { + "epoch": 5.044843049327354, + "grad_norm": 0.9583245515823364, + "learning_rate": 0.0003696239298817774, + "loss": 3.3855, + "step": 74250 + }, + { + "epoch": 5.045182769398016, + "grad_norm": 0.9555863738059998, + "learning_rate": 0.0003695814648729447, + "loss": 3.6217, + "step": 74255 + }, + { + "epoch": 5.045522489468678, + "grad_norm": 0.9887585043907166, + "learning_rate": 0.000369538999864112, + "loss": 3.4507, + "step": 74260 + }, + { + "epoch": 5.045862209539339, + "grad_norm": 1.0837663412094116, + "learning_rate": 0.00036949653485527923, + "loss": 3.6401, + "step": 74265 + }, + { + "epoch": 5.046201929610001, + "grad_norm": 0.7356453537940979, + "learning_rate": 0.0003694540698464465, + "loss": 3.4637, + "step": 74270 + }, + { + "epoch": 5.046541649680663, + "grad_norm": 0.8341341614723206, + "learning_rate": 0.00036941160483761385, + "loss": 3.4636, + "step": 74275 + }, + { + "epoch": 5.046881369751325, + "grad_norm": 0.9536346793174744, + "learning_rate": 0.0003693691398287811, + "loss": 3.1188, + "step": 74280 + }, + { + "epoch": 5.047221089821987, + "grad_norm": 0.9065497517585754, + "learning_rate": 0.00036932667481994836, + "loss": 3.249, + "step": 74285 + }, + { + "epoch": 5.047560809892649, + "grad_norm": 0.8214880228042603, + "learning_rate": 0.00036928420981111564, + "loss": 3.4559, + "step": 74290 + }, + { + "epoch": 5.04790052996331, + "grad_norm": 0.8957664966583252, + "learning_rate": 0.00036924174480228297, + "loss": 3.3154, + "step": 74295 + }, + { + "epoch": 5.048240250033972, + "grad_norm": 0.8157066702842712, + "learning_rate": 0.0003691992797934502, + "loss": 3.6853, + "step": 74300 + }, + { + "epoch": 5.048579970104634, + "grad_norm": 1.2623722553253174, + "learning_rate": 0.0003691568147846175, + "loss": 3.5726, + "step": 74305 + }, + { + "epoch": 5.048919690175295, + "grad_norm": 0.872540295124054, + "learning_rate": 0.0003691143497757848, + "loss": 3.8638, + "step": 74310 + }, + { + "epoch": 5.049259410245957, + "grad_norm": 1.0257303714752197, + "learning_rate": 0.00036907188476695204, + "loss": 3.4807, + "step": 74315 + }, + { + "epoch": 5.0495991303166194, + "grad_norm": 0.8790091276168823, + "learning_rate": 0.0003690294197581193, + "loss": 3.6646, + "step": 74320 + }, + { + "epoch": 5.049938850387281, + "grad_norm": 0.6399126052856445, + "learning_rate": 0.0003689869547492866, + "loss": 3.3938, + "step": 74325 + }, + { + "epoch": 5.050278570457943, + "grad_norm": 0.8638678789138794, + "learning_rate": 0.0003689444897404539, + "loss": 3.1564, + "step": 74330 + }, + { + "epoch": 5.050618290528605, + "grad_norm": 0.9551567435264587, + "learning_rate": 0.00036890202473162116, + "loss": 3.5693, + "step": 74335 + }, + { + "epoch": 5.050958010599266, + "grad_norm": 0.9173234701156616, + "learning_rate": 0.00036885955972278844, + "loss": 3.2978, + "step": 74340 + }, + { + "epoch": 5.051297730669928, + "grad_norm": 1.0111420154571533, + "learning_rate": 0.0003688170947139557, + "loss": 3.1567, + "step": 74345 + }, + { + "epoch": 5.05163745074059, + "grad_norm": 0.872836709022522, + "learning_rate": 0.000368774629705123, + "loss": 3.4207, + "step": 74350 + }, + { + "epoch": 5.051977170811251, + "grad_norm": 0.7614759802818298, + "learning_rate": 0.0003687321646962903, + "loss": 3.5087, + "step": 74355 + }, + { + "epoch": 5.052316890881913, + "grad_norm": 0.8034939169883728, + "learning_rate": 0.0003686896996874575, + "loss": 3.4054, + "step": 74360 + }, + { + "epoch": 5.0526566109525755, + "grad_norm": 0.759128749370575, + "learning_rate": 0.00036864723467862484, + "loss": 3.2621, + "step": 74365 + }, + { + "epoch": 5.052996331023237, + "grad_norm": 1.0296958684921265, + "learning_rate": 0.0003686047696697921, + "loss": 3.4314, + "step": 74370 + }, + { + "epoch": 5.053336051093899, + "grad_norm": 0.9124054312705994, + "learning_rate": 0.00036856230466095934, + "loss": 3.3913, + "step": 74375 + }, + { + "epoch": 5.053675771164561, + "grad_norm": 0.7844187021255493, + "learning_rate": 0.0003685198396521267, + "loss": 3.2008, + "step": 74380 + }, + { + "epoch": 5.054015491235222, + "grad_norm": 1.3641693592071533, + "learning_rate": 0.00036847737464329396, + "loss": 3.2634, + "step": 74385 + }, + { + "epoch": 5.054355211305884, + "grad_norm": 0.9494420886039734, + "learning_rate": 0.0003684349096344612, + "loss": 3.6212, + "step": 74390 + }, + { + "epoch": 5.054694931376546, + "grad_norm": 0.8123714923858643, + "learning_rate": 0.00036839244462562846, + "loss": 3.3918, + "step": 74395 + }, + { + "epoch": 5.055034651447207, + "grad_norm": 0.9888006448745728, + "learning_rate": 0.0003683499796167958, + "loss": 3.1779, + "step": 74400 + }, + { + "epoch": 5.055374371517869, + "grad_norm": 0.9614259600639343, + "learning_rate": 0.000368307514607963, + "loss": 3.3813, + "step": 74405 + }, + { + "epoch": 5.0557140915885315, + "grad_norm": 0.7652382850646973, + "learning_rate": 0.0003682650495991303, + "loss": 3.3582, + "step": 74410 + }, + { + "epoch": 5.056053811659193, + "grad_norm": 0.8448954820632935, + "learning_rate": 0.00036822258459029764, + "loss": 3.4592, + "step": 74415 + }, + { + "epoch": 5.056393531729855, + "grad_norm": 1.0771404504776, + "learning_rate": 0.00036818011958146486, + "loss": 3.5676, + "step": 74420 + }, + { + "epoch": 5.056733251800517, + "grad_norm": 0.8102321624755859, + "learning_rate": 0.00036813765457263214, + "loss": 3.7734, + "step": 74425 + }, + { + "epoch": 5.057072971871178, + "grad_norm": 0.8213083744049072, + "learning_rate": 0.0003680951895637994, + "loss": 3.5371, + "step": 74430 + }, + { + "epoch": 5.05741269194184, + "grad_norm": 0.920207679271698, + "learning_rate": 0.0003680527245549667, + "loss": 3.3838, + "step": 74435 + }, + { + "epoch": 5.057752412012501, + "grad_norm": 0.8273656368255615, + "learning_rate": 0.000368010259546134, + "loss": 3.4367, + "step": 74440 + }, + { + "epoch": 5.058092132083163, + "grad_norm": 0.8773515224456787, + "learning_rate": 0.00036796779453730126, + "loss": 3.444, + "step": 74445 + }, + { + "epoch": 5.058431852153825, + "grad_norm": 1.0294030904769897, + "learning_rate": 0.00036792532952846854, + "loss": 3.209, + "step": 74450 + }, + { + "epoch": 5.058771572224487, + "grad_norm": 0.8051391839981079, + "learning_rate": 0.0003678828645196358, + "loss": 3.4353, + "step": 74455 + }, + { + "epoch": 5.059111292295149, + "grad_norm": 0.9460633397102356, + "learning_rate": 0.0003678403995108031, + "loss": 3.4577, + "step": 74460 + }, + { + "epoch": 5.059451012365811, + "grad_norm": 1.1404725313186646, + "learning_rate": 0.0003677979345019704, + "loss": 3.4684, + "step": 74465 + }, + { + "epoch": 5.059790732436472, + "grad_norm": 0.8290247917175293, + "learning_rate": 0.00036775546949313766, + "loss": 3.2333, + "step": 74470 + }, + { + "epoch": 5.060130452507134, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.00036771300448430494, + "loss": 3.5049, + "step": 74475 + }, + { + "epoch": 5.060470172577796, + "grad_norm": 1.1340327262878418, + "learning_rate": 0.0003676705394754722, + "loss": 3.643, + "step": 74480 + }, + { + "epoch": 5.060809892648457, + "grad_norm": 0.8549513220787048, + "learning_rate": 0.0003676280744666395, + "loss": 3.6543, + "step": 74485 + }, + { + "epoch": 5.061149612719119, + "grad_norm": 0.9902439117431641, + "learning_rate": 0.0003675856094578068, + "loss": 3.312, + "step": 74490 + }, + { + "epoch": 5.061489332789781, + "grad_norm": 1.4202297925949097, + "learning_rate": 0.00036754314444897406, + "loss": 3.4748, + "step": 74495 + }, + { + "epoch": 5.061829052860443, + "grad_norm": 1.0835224390029907, + "learning_rate": 0.0003675006794401413, + "loss": 3.6543, + "step": 74500 + }, + { + "epoch": 5.062168772931105, + "grad_norm": 0.8172304630279541, + "learning_rate": 0.0003674582144313086, + "loss": 3.2156, + "step": 74505 + }, + { + "epoch": 5.062508493001767, + "grad_norm": 0.9089840054512024, + "learning_rate": 0.0003674157494224759, + "loss": 3.437, + "step": 74510 + }, + { + "epoch": 5.062848213072428, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.00036737328441364313, + "loss": 3.4751, + "step": 74515 + }, + { + "epoch": 5.06318793314309, + "grad_norm": 0.9386318325996399, + "learning_rate": 0.00036733081940481046, + "loss": 3.5439, + "step": 74520 + }, + { + "epoch": 5.063527653213752, + "grad_norm": 0.8436791300773621, + "learning_rate": 0.00036728835439597774, + "loss": 3.1149, + "step": 74525 + }, + { + "epoch": 5.063867373284413, + "grad_norm": 0.9431077837944031, + "learning_rate": 0.00036724588938714497, + "loss": 3.6387, + "step": 74530 + }, + { + "epoch": 5.064207093355075, + "grad_norm": 0.8757582306861877, + "learning_rate": 0.0003672034243783123, + "loss": 3.4406, + "step": 74535 + }, + { + "epoch": 5.064546813425737, + "grad_norm": 1.1502429246902466, + "learning_rate": 0.0003671609593694796, + "loss": 3.378, + "step": 74540 + }, + { + "epoch": 5.064886533496399, + "grad_norm": 0.9777489304542542, + "learning_rate": 0.0003671184943606468, + "loss": 3.449, + "step": 74545 + }, + { + "epoch": 5.065226253567061, + "grad_norm": 0.8395383358001709, + "learning_rate": 0.0003670760293518141, + "loss": 3.3354, + "step": 74550 + }, + { + "epoch": 5.065565973637723, + "grad_norm": 0.9629839062690735, + "learning_rate": 0.0003670335643429814, + "loss": 3.4805, + "step": 74555 + }, + { + "epoch": 5.065905693708384, + "grad_norm": 0.9035612940788269, + "learning_rate": 0.00036699109933414865, + "loss": 3.3223, + "step": 74560 + }, + { + "epoch": 5.066245413779046, + "grad_norm": 0.9889609217643738, + "learning_rate": 0.00036694863432531593, + "loss": 3.5542, + "step": 74565 + }, + { + "epoch": 5.066585133849708, + "grad_norm": 1.0771567821502686, + "learning_rate": 0.00036690616931648327, + "loss": 3.4837, + "step": 74570 + }, + { + "epoch": 5.066924853920369, + "grad_norm": 1.0657587051391602, + "learning_rate": 0.0003668637043076505, + "loss": 3.6936, + "step": 74575 + }, + { + "epoch": 5.067264573991031, + "grad_norm": 0.9667888879776001, + "learning_rate": 0.00036682123929881777, + "loss": 3.1935, + "step": 74580 + }, + { + "epoch": 5.067604294061693, + "grad_norm": 1.0443179607391357, + "learning_rate": 0.00036677877428998505, + "loss": 3.6469, + "step": 74585 + }, + { + "epoch": 5.067944014132355, + "grad_norm": 0.8967350125312805, + "learning_rate": 0.00036673630928115233, + "loss": 3.4716, + "step": 74590 + }, + { + "epoch": 5.068283734203017, + "grad_norm": 3.704216241836548, + "learning_rate": 0.0003666938442723196, + "loss": 3.5075, + "step": 74595 + }, + { + "epoch": 5.068623454273679, + "grad_norm": 1.155657172203064, + "learning_rate": 0.0003666513792634869, + "loss": 3.5802, + "step": 74600 + }, + { + "epoch": 5.06896317434434, + "grad_norm": 0.8169151544570923, + "learning_rate": 0.00036660891425465417, + "loss": 3.4818, + "step": 74605 + }, + { + "epoch": 5.069302894415002, + "grad_norm": 1.0104279518127441, + "learning_rate": 0.00036656644924582145, + "loss": 3.7537, + "step": 74610 + }, + { + "epoch": 5.069642614485664, + "grad_norm": 0.6890705227851868, + "learning_rate": 0.00036652398423698873, + "loss": 3.5497, + "step": 74615 + }, + { + "epoch": 5.069982334556325, + "grad_norm": 0.772115170955658, + "learning_rate": 0.00036648151922815596, + "loss": 3.5674, + "step": 74620 + }, + { + "epoch": 5.070322054626987, + "grad_norm": 1.3289175033569336, + "learning_rate": 0.0003664390542193233, + "loss": 3.1963, + "step": 74625 + }, + { + "epoch": 5.0706617746976494, + "grad_norm": 1.0464763641357422, + "learning_rate": 0.00036639658921049057, + "loss": 3.3565, + "step": 74630 + }, + { + "epoch": 5.071001494768311, + "grad_norm": 0.9222560524940491, + "learning_rate": 0.00036635412420165785, + "loss": 3.5191, + "step": 74635 + }, + { + "epoch": 5.071341214838973, + "grad_norm": 0.8830601572990417, + "learning_rate": 0.00036631165919282513, + "loss": 3.5708, + "step": 74640 + }, + { + "epoch": 5.071680934909635, + "grad_norm": 1.0648162364959717, + "learning_rate": 0.0003662691941839924, + "loss": 3.3411, + "step": 74645 + }, + { + "epoch": 5.072020654980296, + "grad_norm": 0.8010802268981934, + "learning_rate": 0.0003662267291751597, + "loss": 3.3141, + "step": 74650 + }, + { + "epoch": 5.072360375050958, + "grad_norm": 0.8223400712013245, + "learning_rate": 0.0003661842641663269, + "loss": 3.4016, + "step": 74655 + }, + { + "epoch": 5.07270009512162, + "grad_norm": 1.0708882808685303, + "learning_rate": 0.00036614179915749425, + "loss": 3.3853, + "step": 74660 + }, + { + "epoch": 5.073039815192281, + "grad_norm": 0.7843606472015381, + "learning_rate": 0.00036609933414866153, + "loss": 3.6688, + "step": 74665 + }, + { + "epoch": 5.073379535262943, + "grad_norm": 0.9151174426078796, + "learning_rate": 0.00036605686913982876, + "loss": 3.8937, + "step": 74670 + }, + { + "epoch": 5.0737192553336055, + "grad_norm": 0.9959307909011841, + "learning_rate": 0.0003660144041309961, + "loss": 3.2297, + "step": 74675 + }, + { + "epoch": 5.074058975404267, + "grad_norm": 0.9152815937995911, + "learning_rate": 0.00036597193912216337, + "loss": 3.3294, + "step": 74680 + }, + { + "epoch": 5.074398695474929, + "grad_norm": 0.868704080581665, + "learning_rate": 0.0003659294741133306, + "loss": 3.5137, + "step": 74685 + }, + { + "epoch": 5.074738415545591, + "grad_norm": 0.8252987861633301, + "learning_rate": 0.0003658870091044979, + "loss": 3.1831, + "step": 74690 + }, + { + "epoch": 5.075078135616252, + "grad_norm": 0.9079264402389526, + "learning_rate": 0.0003658445440956652, + "loss": 3.1945, + "step": 74695 + }, + { + "epoch": 5.075417855686914, + "grad_norm": 0.7481065392494202, + "learning_rate": 0.00036580207908683244, + "loss": 3.2899, + "step": 74700 + }, + { + "epoch": 5.075757575757576, + "grad_norm": 0.9381486177444458, + "learning_rate": 0.0003657596140779997, + "loss": 3.3995, + "step": 74705 + }, + { + "epoch": 5.076097295828237, + "grad_norm": 0.8992714285850525, + "learning_rate": 0.00036571714906916705, + "loss": 3.4581, + "step": 74710 + }, + { + "epoch": 5.076437015898899, + "grad_norm": 0.9679471254348755, + "learning_rate": 0.0003656746840603343, + "loss": 3.581, + "step": 74715 + }, + { + "epoch": 5.0767767359695615, + "grad_norm": 0.9061264991760254, + "learning_rate": 0.00036563221905150156, + "loss": 3.5342, + "step": 74720 + }, + { + "epoch": 5.077116456040223, + "grad_norm": 0.9209926128387451, + "learning_rate": 0.00036558975404266884, + "loss": 3.5252, + "step": 74725 + }, + { + "epoch": 5.077456176110885, + "grad_norm": 0.9399899244308472, + "learning_rate": 0.0003655472890338361, + "loss": 3.6672, + "step": 74730 + }, + { + "epoch": 5.077795896181547, + "grad_norm": 0.8676060438156128, + "learning_rate": 0.0003655048240250034, + "loss": 3.3217, + "step": 74735 + }, + { + "epoch": 5.078135616252208, + "grad_norm": 2.676459550857544, + "learning_rate": 0.0003654623590161707, + "loss": 3.4911, + "step": 74740 + }, + { + "epoch": 5.07847533632287, + "grad_norm": 0.9713585376739502, + "learning_rate": 0.00036541989400733796, + "loss": 3.3449, + "step": 74745 + }, + { + "epoch": 5.078815056393532, + "grad_norm": 0.9820237755775452, + "learning_rate": 0.00036537742899850524, + "loss": 3.3205, + "step": 74750 + }, + { + "epoch": 5.079154776464193, + "grad_norm": 1.3356568813323975, + "learning_rate": 0.0003653349639896725, + "loss": 3.6737, + "step": 74755 + }, + { + "epoch": 5.079494496534855, + "grad_norm": 0.9465750455856323, + "learning_rate": 0.00036529249898083975, + "loss": 3.5338, + "step": 74760 + }, + { + "epoch": 5.0798342166055175, + "grad_norm": 1.1982115507125854, + "learning_rate": 0.0003652500339720071, + "loss": 3.4976, + "step": 74765 + }, + { + "epoch": 5.080173936676179, + "grad_norm": 0.9430028200149536, + "learning_rate": 0.00036520756896317436, + "loss": 3.5166, + "step": 74770 + }, + { + "epoch": 5.080513656746841, + "grad_norm": 0.9437617063522339, + "learning_rate": 0.0003651651039543416, + "loss": 3.5767, + "step": 74775 + }, + { + "epoch": 5.080853376817503, + "grad_norm": 0.9998571872711182, + "learning_rate": 0.0003651226389455089, + "loss": 3.3809, + "step": 74780 + }, + { + "epoch": 5.081193096888164, + "grad_norm": 1.5069353580474854, + "learning_rate": 0.0003650801739366762, + "loss": 3.4007, + "step": 74785 + }, + { + "epoch": 5.081532816958826, + "grad_norm": 1.0099660158157349, + "learning_rate": 0.0003650377089278434, + "loss": 3.3683, + "step": 74790 + }, + { + "epoch": 5.081872537029487, + "grad_norm": 0.9140014052391052, + "learning_rate": 0.00036499524391901076, + "loss": 3.3773, + "step": 74795 + }, + { + "epoch": 5.082212257100149, + "grad_norm": 0.7512241005897522, + "learning_rate": 0.00036495277891017804, + "loss": 3.3953, + "step": 74800 + }, + { + "epoch": 5.082551977170811, + "grad_norm": 0.8145073652267456, + "learning_rate": 0.0003649103139013453, + "loss": 3.3753, + "step": 74805 + }, + { + "epoch": 5.082891697241473, + "grad_norm": 0.93300461769104, + "learning_rate": 0.00036486784889251255, + "loss": 3.4258, + "step": 74810 + }, + { + "epoch": 5.083231417312135, + "grad_norm": 1.1951708793640137, + "learning_rate": 0.0003648253838836799, + "loss": 3.5958, + "step": 74815 + }, + { + "epoch": 5.083571137382797, + "grad_norm": 1.0483087301254272, + "learning_rate": 0.00036478291887484716, + "loss": 3.422, + "step": 74820 + }, + { + "epoch": 5.083910857453458, + "grad_norm": 0.9316043853759766, + "learning_rate": 0.0003647404538660144, + "loss": 3.6435, + "step": 74825 + }, + { + "epoch": 5.08425057752412, + "grad_norm": 1.0635319948196411, + "learning_rate": 0.0003646979888571817, + "loss": 3.5867, + "step": 74830 + }, + { + "epoch": 5.084590297594782, + "grad_norm": 0.8153219819068909, + "learning_rate": 0.000364655523848349, + "loss": 3.4164, + "step": 74835 + }, + { + "epoch": 5.084930017665443, + "grad_norm": 1.0799212455749512, + "learning_rate": 0.0003646130588395162, + "loss": 3.3653, + "step": 74840 + }, + { + "epoch": 5.085269737736105, + "grad_norm": 1.0873308181762695, + "learning_rate": 0.0003645705938306835, + "loss": 3.7847, + "step": 74845 + }, + { + "epoch": 5.085609457806767, + "grad_norm": 0.764984667301178, + "learning_rate": 0.00036452812882185084, + "loss": 3.5612, + "step": 74850 + }, + { + "epoch": 5.085949177877429, + "grad_norm": 0.9413536787033081, + "learning_rate": 0.00036448566381301807, + "loss": 3.2767, + "step": 74855 + }, + { + "epoch": 5.086288897948091, + "grad_norm": 0.6952675580978394, + "learning_rate": 0.00036444319880418535, + "loss": 3.4394, + "step": 74860 + }, + { + "epoch": 5.086628618018753, + "grad_norm": 0.8785685300827026, + "learning_rate": 0.0003644007337953527, + "loss": 3.435, + "step": 74865 + }, + { + "epoch": 5.086968338089414, + "grad_norm": 0.8914371132850647, + "learning_rate": 0.0003643582687865199, + "loss": 3.2377, + "step": 74870 + }, + { + "epoch": 5.087308058160076, + "grad_norm": 1.2207525968551636, + "learning_rate": 0.0003643158037776872, + "loss": 3.3802, + "step": 74875 + }, + { + "epoch": 5.087647778230738, + "grad_norm": 0.7919811606407166, + "learning_rate": 0.00036427333876885447, + "loss": 3.6427, + "step": 74880 + }, + { + "epoch": 5.087987498301399, + "grad_norm": 0.8844937086105347, + "learning_rate": 0.00036423087376002175, + "loss": 3.3036, + "step": 74885 + }, + { + "epoch": 5.088327218372061, + "grad_norm": 1.0923155546188354, + "learning_rate": 0.00036418840875118903, + "loss": 3.3693, + "step": 74890 + }, + { + "epoch": 5.088666938442723, + "grad_norm": 0.7732470631599426, + "learning_rate": 0.0003641459437423563, + "loss": 3.1763, + "step": 74895 + }, + { + "epoch": 5.089006658513385, + "grad_norm": 1.0991963148117065, + "learning_rate": 0.0003641034787335236, + "loss": 3.5752, + "step": 74900 + }, + { + "epoch": 5.089346378584047, + "grad_norm": 0.8838290572166443, + "learning_rate": 0.00036406101372469087, + "loss": 3.3803, + "step": 74905 + }, + { + "epoch": 5.089686098654709, + "grad_norm": 0.9321802258491516, + "learning_rate": 0.00036401854871585815, + "loss": 3.2742, + "step": 74910 + }, + { + "epoch": 5.09002581872537, + "grad_norm": 0.7832319736480713, + "learning_rate": 0.0003639760837070254, + "loss": 3.4251, + "step": 74915 + }, + { + "epoch": 5.090365538796032, + "grad_norm": 0.7962220907211304, + "learning_rate": 0.0003639336186981927, + "loss": 3.4491, + "step": 74920 + }, + { + "epoch": 5.090705258866694, + "grad_norm": 0.9230138659477234, + "learning_rate": 0.00036389115368936, + "loss": 3.2937, + "step": 74925 + }, + { + "epoch": 5.091044978937355, + "grad_norm": 0.833032488822937, + "learning_rate": 0.0003638486886805272, + "loss": 3.3197, + "step": 74930 + }, + { + "epoch": 5.091384699008017, + "grad_norm": 0.9291242957115173, + "learning_rate": 0.00036380622367169455, + "loss": 3.4146, + "step": 74935 + }, + { + "epoch": 5.0917244190786795, + "grad_norm": 0.8552272915840149, + "learning_rate": 0.00036376375866286183, + "loss": 3.6439, + "step": 74940 + }, + { + "epoch": 5.092064139149341, + "grad_norm": 0.877326250076294, + "learning_rate": 0.00036372129365402905, + "loss": 3.3217, + "step": 74945 + }, + { + "epoch": 5.092403859220003, + "grad_norm": 0.9269464612007141, + "learning_rate": 0.00036367882864519633, + "loss": 3.1888, + "step": 74950 + }, + { + "epoch": 5.092743579290665, + "grad_norm": 0.8496392369270325, + "learning_rate": 0.00036363636363636367, + "loss": 3.5922, + "step": 74955 + }, + { + "epoch": 5.093083299361326, + "grad_norm": 0.8064510226249695, + "learning_rate": 0.0003635938986275309, + "loss": 3.4349, + "step": 74960 + }, + { + "epoch": 5.093423019431988, + "grad_norm": 0.6948370933532715, + "learning_rate": 0.0003635514336186982, + "loss": 3.5361, + "step": 74965 + }, + { + "epoch": 5.09376273950265, + "grad_norm": 0.7466139793395996, + "learning_rate": 0.0003635089686098655, + "loss": 3.6369, + "step": 74970 + }, + { + "epoch": 5.094102459573311, + "grad_norm": 1.0478534698486328, + "learning_rate": 0.0003634665036010328, + "loss": 3.5093, + "step": 74975 + }, + { + "epoch": 5.094442179643973, + "grad_norm": 1.0302352905273438, + "learning_rate": 0.0003634240385922, + "loss": 3.2157, + "step": 74980 + }, + { + "epoch": 5.0947818997146355, + "grad_norm": 1.0532633066177368, + "learning_rate": 0.0003633815735833673, + "loss": 3.516, + "step": 74985 + }, + { + "epoch": 5.095121619785297, + "grad_norm": 1.052453637123108, + "learning_rate": 0.00036333910857453463, + "loss": 3.6019, + "step": 74990 + }, + { + "epoch": 5.095461339855959, + "grad_norm": 0.7548352479934692, + "learning_rate": 0.00036329664356570185, + "loss": 3.7434, + "step": 74995 + }, + { + "epoch": 5.095801059926621, + "grad_norm": 0.8936693668365479, + "learning_rate": 0.00036325417855686914, + "loss": 3.5746, + "step": 75000 + }, + { + "epoch": 5.096140779997282, + "grad_norm": 0.87913578748703, + "learning_rate": 0.00036321171354803647, + "loss": 3.3616, + "step": 75005 + }, + { + "epoch": 5.096480500067944, + "grad_norm": 0.8990415334701538, + "learning_rate": 0.0003631692485392037, + "loss": 3.57, + "step": 75010 + }, + { + "epoch": 5.096820220138606, + "grad_norm": 0.7970677018165588, + "learning_rate": 0.000363126783530371, + "loss": 3.6775, + "step": 75015 + }, + { + "epoch": 5.097159940209267, + "grad_norm": 1.561854362487793, + "learning_rate": 0.00036308431852153826, + "loss": 3.4605, + "step": 75020 + }, + { + "epoch": 5.097499660279929, + "grad_norm": 0.8032172322273254, + "learning_rate": 0.00036304185351270554, + "loss": 3.5228, + "step": 75025 + }, + { + "epoch": 5.0978393803505915, + "grad_norm": 1.2674400806427002, + "learning_rate": 0.0003629993885038728, + "loss": 3.315, + "step": 75030 + }, + { + "epoch": 5.098179100421253, + "grad_norm": 1.0019713640213013, + "learning_rate": 0.0003629569234950401, + "loss": 3.3241, + "step": 75035 + }, + { + "epoch": 5.098518820491915, + "grad_norm": 0.8493909239768982, + "learning_rate": 0.0003629144584862074, + "loss": 3.4035, + "step": 75040 + }, + { + "epoch": 5.098858540562577, + "grad_norm": 0.7503010034561157, + "learning_rate": 0.00036287199347737466, + "loss": 3.5897, + "step": 75045 + }, + { + "epoch": 5.099198260633238, + "grad_norm": 1.068415880203247, + "learning_rate": 0.00036282952846854194, + "loss": 3.1859, + "step": 75050 + }, + { + "epoch": 5.0995379807039, + "grad_norm": 0.9574511051177979, + "learning_rate": 0.00036278706345970916, + "loss": 3.2244, + "step": 75055 + }, + { + "epoch": 5.099877700774562, + "grad_norm": 0.8176553249359131, + "learning_rate": 0.0003627445984508765, + "loss": 3.4798, + "step": 75060 + }, + { + "epoch": 5.100217420845223, + "grad_norm": 0.7115208506584167, + "learning_rate": 0.0003627021334420438, + "loss": 3.3215, + "step": 75065 + }, + { + "epoch": 5.100557140915885, + "grad_norm": 0.9752081632614136, + "learning_rate": 0.000362659668433211, + "loss": 3.5148, + "step": 75070 + }, + { + "epoch": 5.1008968609865475, + "grad_norm": 1.0558444261550903, + "learning_rate": 0.00036261720342437834, + "loss": 3.5295, + "step": 75075 + }, + { + "epoch": 5.101236581057209, + "grad_norm": 0.8523917198181152, + "learning_rate": 0.0003625747384155456, + "loss": 3.5114, + "step": 75080 + }, + { + "epoch": 5.101576301127871, + "grad_norm": 1.117894172668457, + "learning_rate": 0.00036253227340671284, + "loss": 3.4171, + "step": 75085 + }, + { + "epoch": 5.101916021198533, + "grad_norm": 0.8902376294136047, + "learning_rate": 0.0003624898083978802, + "loss": 3.6221, + "step": 75090 + }, + { + "epoch": 5.102255741269194, + "grad_norm": 0.8768068552017212, + "learning_rate": 0.00036244734338904746, + "loss": 3.3729, + "step": 75095 + }, + { + "epoch": 5.102595461339856, + "grad_norm": 3.3800156116485596, + "learning_rate": 0.0003624048783802147, + "loss": 3.4743, + "step": 75100 + }, + { + "epoch": 5.102935181410518, + "grad_norm": 0.9786582589149475, + "learning_rate": 0.00036236241337138196, + "loss": 3.4943, + "step": 75105 + }, + { + "epoch": 5.103274901481179, + "grad_norm": 1.0812629461288452, + "learning_rate": 0.0003623199483625493, + "loss": 3.3419, + "step": 75110 + }, + { + "epoch": 5.103614621551841, + "grad_norm": 0.8443645238876343, + "learning_rate": 0.0003622774833537165, + "loss": 3.4681, + "step": 75115 + }, + { + "epoch": 5.103954341622503, + "grad_norm": 0.8232329487800598, + "learning_rate": 0.0003622350183448838, + "loss": 3.374, + "step": 75120 + }, + { + "epoch": 5.104294061693165, + "grad_norm": 0.9142671227455139, + "learning_rate": 0.00036219255333605114, + "loss": 3.3595, + "step": 75125 + }, + { + "epoch": 5.104633781763827, + "grad_norm": 2.5695884227752686, + "learning_rate": 0.00036215008832721836, + "loss": 3.4539, + "step": 75130 + }, + { + "epoch": 5.104973501834488, + "grad_norm": 2.2469587326049805, + "learning_rate": 0.00036210762331838564, + "loss": 3.3255, + "step": 75135 + }, + { + "epoch": 5.10531322190515, + "grad_norm": 1.1142516136169434, + "learning_rate": 0.0003620651583095529, + "loss": 3.1886, + "step": 75140 + }, + { + "epoch": 5.105652941975812, + "grad_norm": 2.7696444988250732, + "learning_rate": 0.00036202269330072026, + "loss": 3.21, + "step": 75145 + }, + { + "epoch": 5.105992662046473, + "grad_norm": 1.2185921669006348, + "learning_rate": 0.0003619802282918875, + "loss": 3.5297, + "step": 75150 + }, + { + "epoch": 5.106332382117135, + "grad_norm": 1.0559622049331665, + "learning_rate": 0.00036193776328305476, + "loss": 3.4726, + "step": 75155 + }, + { + "epoch": 5.106672102187797, + "grad_norm": 1.2602914571762085, + "learning_rate": 0.0003618952982742221, + "loss": 3.2971, + "step": 75160 + }, + { + "epoch": 5.107011822258459, + "grad_norm": 1.0155549049377441, + "learning_rate": 0.0003618528332653893, + "loss": 3.7329, + "step": 75165 + }, + { + "epoch": 5.107351542329121, + "grad_norm": 0.7548108696937561, + "learning_rate": 0.0003618103682565566, + "loss": 3.0906, + "step": 75170 + }, + { + "epoch": 5.107691262399783, + "grad_norm": 0.8115633726119995, + "learning_rate": 0.0003617679032477239, + "loss": 3.5282, + "step": 75175 + }, + { + "epoch": 5.108030982470444, + "grad_norm": 0.896251380443573, + "learning_rate": 0.00036172543823889116, + "loss": 3.2473, + "step": 75180 + }, + { + "epoch": 5.108370702541106, + "grad_norm": 1.1997495889663696, + "learning_rate": 0.00036168297323005844, + "loss": 3.3777, + "step": 75185 + }, + { + "epoch": 5.108710422611768, + "grad_norm": 0.7781342267990112, + "learning_rate": 0.0003616405082212257, + "loss": 3.325, + "step": 75190 + }, + { + "epoch": 5.109050142682429, + "grad_norm": 1.2607507705688477, + "learning_rate": 0.000361598043212393, + "loss": 3.4467, + "step": 75195 + }, + { + "epoch": 5.109389862753091, + "grad_norm": 1.0934783220291138, + "learning_rate": 0.0003615555782035603, + "loss": 3.3917, + "step": 75200 + }, + { + "epoch": 5.1097295828237534, + "grad_norm": 0.8321346044540405, + "learning_rate": 0.00036151311319472756, + "loss": 3.4522, + "step": 75205 + }, + { + "epoch": 5.110069302894415, + "grad_norm": 1.1476589441299438, + "learning_rate": 0.0003614706481858948, + "loss": 3.5991, + "step": 75210 + }, + { + "epoch": 5.110409022965077, + "grad_norm": 1.115465760231018, + "learning_rate": 0.0003614281831770621, + "loss": 3.3386, + "step": 75215 + }, + { + "epoch": 5.110748743035739, + "grad_norm": 1.0848848819732666, + "learning_rate": 0.0003613857181682294, + "loss": 3.4735, + "step": 75220 + }, + { + "epoch": 5.1110884631064, + "grad_norm": 0.7689398527145386, + "learning_rate": 0.00036134325315939663, + "loss": 3.5541, + "step": 75225 + }, + { + "epoch": 5.111428183177062, + "grad_norm": 1.1542972326278687, + "learning_rate": 0.00036130078815056396, + "loss": 3.5869, + "step": 75230 + }, + { + "epoch": 5.111767903247724, + "grad_norm": 0.9087128043174744, + "learning_rate": 0.00036125832314173124, + "loss": 3.4538, + "step": 75235 + }, + { + "epoch": 5.112107623318385, + "grad_norm": 0.961708128452301, + "learning_rate": 0.00036121585813289847, + "loss": 3.3405, + "step": 75240 + }, + { + "epoch": 5.112447343389047, + "grad_norm": 2.5039546489715576, + "learning_rate": 0.00036117339312406575, + "loss": 3.3355, + "step": 75245 + }, + { + "epoch": 5.1127870634597095, + "grad_norm": 1.0458950996398926, + "learning_rate": 0.0003611309281152331, + "loss": 3.642, + "step": 75250 + }, + { + "epoch": 5.113126783530371, + "grad_norm": 0.9664716124534607, + "learning_rate": 0.0003610884631064003, + "loss": 3.1792, + "step": 75255 + }, + { + "epoch": 5.113466503601033, + "grad_norm": 0.8738074898719788, + "learning_rate": 0.0003610459980975676, + "loss": 3.3441, + "step": 75260 + }, + { + "epoch": 5.113806223671695, + "grad_norm": 1.2631560564041138, + "learning_rate": 0.0003610035330887349, + "loss": 3.488, + "step": 75265 + }, + { + "epoch": 5.114145943742356, + "grad_norm": 0.853336751461029, + "learning_rate": 0.00036096106807990215, + "loss": 3.5346, + "step": 75270 + }, + { + "epoch": 5.114485663813018, + "grad_norm": 0.913227915763855, + "learning_rate": 0.00036091860307106943, + "loss": 3.4258, + "step": 75275 + }, + { + "epoch": 5.11482538388368, + "grad_norm": 0.7793901562690735, + "learning_rate": 0.0003608761380622367, + "loss": 3.5371, + "step": 75280 + }, + { + "epoch": 5.115165103954341, + "grad_norm": 0.977526068687439, + "learning_rate": 0.000360833673053404, + "loss": 3.5755, + "step": 75285 + }, + { + "epoch": 5.115504824025003, + "grad_norm": 1.0466021299362183, + "learning_rate": 0.00036079120804457127, + "loss": 3.3294, + "step": 75290 + }, + { + "epoch": 5.1158445440956655, + "grad_norm": 1.2900959253311157, + "learning_rate": 0.00036074874303573855, + "loss": 3.515, + "step": 75295 + }, + { + "epoch": 5.116184264166327, + "grad_norm": 1.031498908996582, + "learning_rate": 0.00036070627802690583, + "loss": 3.5061, + "step": 75300 + }, + { + "epoch": 5.116523984236989, + "grad_norm": 1.1877766847610474, + "learning_rate": 0.0003606638130180731, + "loss": 3.1995, + "step": 75305 + }, + { + "epoch": 5.116863704307651, + "grad_norm": 1.4812793731689453, + "learning_rate": 0.0003606213480092404, + "loss": 3.6763, + "step": 75310 + }, + { + "epoch": 5.117203424378312, + "grad_norm": 1.0896186828613281, + "learning_rate": 0.00036057888300040767, + "loss": 3.4065, + "step": 75315 + }, + { + "epoch": 5.117543144448974, + "grad_norm": 0.7313472628593445, + "learning_rate": 0.00036053641799157495, + "loss": 3.4666, + "step": 75320 + }, + { + "epoch": 5.117882864519636, + "grad_norm": 0.8708968758583069, + "learning_rate": 0.00036049395298274223, + "loss": 3.523, + "step": 75325 + }, + { + "epoch": 5.118222584590297, + "grad_norm": 0.9839439392089844, + "learning_rate": 0.0003604514879739095, + "loss": 3.4319, + "step": 75330 + }, + { + "epoch": 5.118562304660959, + "grad_norm": 0.9997403621673584, + "learning_rate": 0.0003604090229650768, + "loss": 3.436, + "step": 75335 + }, + { + "epoch": 5.1189020247316215, + "grad_norm": 1.0291342735290527, + "learning_rate": 0.00036036655795624407, + "loss": 3.51, + "step": 75340 + }, + { + "epoch": 5.119241744802283, + "grad_norm": 1.1835917234420776, + "learning_rate": 0.00036032409294741135, + "loss": 3.3013, + "step": 75345 + }, + { + "epoch": 5.119581464872945, + "grad_norm": 0.7763064503669739, + "learning_rate": 0.0003602816279385786, + "loss": 3.7148, + "step": 75350 + }, + { + "epoch": 5.119921184943607, + "grad_norm": 0.8296974301338196, + "learning_rate": 0.0003602391629297459, + "loss": 3.3785, + "step": 75355 + }, + { + "epoch": 5.120260905014268, + "grad_norm": 0.8377522826194763, + "learning_rate": 0.0003601966979209132, + "loss": 3.1773, + "step": 75360 + }, + { + "epoch": 5.12060062508493, + "grad_norm": 0.8788779973983765, + "learning_rate": 0.0003601542329120804, + "loss": 3.4834, + "step": 75365 + }, + { + "epoch": 5.120940345155592, + "grad_norm": 1.4539577960968018, + "learning_rate": 0.00036011176790324775, + "loss": 3.5867, + "step": 75370 + }, + { + "epoch": 5.121280065226253, + "grad_norm": 1.067986249923706, + "learning_rate": 0.00036006930289441503, + "loss": 3.4971, + "step": 75375 + }, + { + "epoch": 5.121619785296915, + "grad_norm": 0.991457998752594, + "learning_rate": 0.00036002683788558226, + "loss": 3.3861, + "step": 75380 + }, + { + "epoch": 5.1219595053675775, + "grad_norm": 1.0350656509399414, + "learning_rate": 0.0003599843728767496, + "loss": 3.1379, + "step": 75385 + }, + { + "epoch": 5.122299225438239, + "grad_norm": 0.7893446087837219, + "learning_rate": 0.00035994190786791687, + "loss": 3.1719, + "step": 75390 + }, + { + "epoch": 5.122638945508901, + "grad_norm": 1.1286197900772095, + "learning_rate": 0.0003598994428590841, + "loss": 3.2899, + "step": 75395 + }, + { + "epoch": 5.122978665579563, + "grad_norm": 0.9397675395011902, + "learning_rate": 0.0003598569778502514, + "loss": 3.505, + "step": 75400 + }, + { + "epoch": 5.123318385650224, + "grad_norm": 0.9369810223579407, + "learning_rate": 0.0003598145128414187, + "loss": 3.4832, + "step": 75405 + }, + { + "epoch": 5.123658105720886, + "grad_norm": 1.0305153131484985, + "learning_rate": 0.00035977204783258594, + "loss": 3.565, + "step": 75410 + }, + { + "epoch": 5.123997825791548, + "grad_norm": 1.0589401721954346, + "learning_rate": 0.0003597295828237532, + "loss": 3.2967, + "step": 75415 + }, + { + "epoch": 5.124337545862209, + "grad_norm": 0.7920261025428772, + "learning_rate": 0.00035968711781492055, + "loss": 3.4406, + "step": 75420 + }, + { + "epoch": 5.124677265932871, + "grad_norm": 0.7843968272209167, + "learning_rate": 0.0003596446528060878, + "loss": 3.264, + "step": 75425 + }, + { + "epoch": 5.1250169860035335, + "grad_norm": 0.8364913463592529, + "learning_rate": 0.00035960218779725506, + "loss": 3.5544, + "step": 75430 + }, + { + "epoch": 5.125356706074195, + "grad_norm": 1.0090137720108032, + "learning_rate": 0.00035955972278842234, + "loss": 3.3887, + "step": 75435 + }, + { + "epoch": 5.125696426144857, + "grad_norm": 1.0300874710083008, + "learning_rate": 0.0003595172577795896, + "loss": 3.5062, + "step": 75440 + }, + { + "epoch": 5.126036146215519, + "grad_norm": 0.8499267101287842, + "learning_rate": 0.0003594747927707569, + "loss": 3.6993, + "step": 75445 + }, + { + "epoch": 5.12637586628618, + "grad_norm": 0.9229161739349365, + "learning_rate": 0.0003594323277619242, + "loss": 3.4558, + "step": 75450 + }, + { + "epoch": 5.126715586356842, + "grad_norm": 0.8303118348121643, + "learning_rate": 0.00035938986275309146, + "loss": 3.6055, + "step": 75455 + }, + { + "epoch": 5.127055306427504, + "grad_norm": 1.050044298171997, + "learning_rate": 0.00035934739774425874, + "loss": 3.2816, + "step": 75460 + }, + { + "epoch": 5.127395026498165, + "grad_norm": 1.3023356199264526, + "learning_rate": 0.000359304932735426, + "loss": 3.3916, + "step": 75465 + }, + { + "epoch": 5.127734746568827, + "grad_norm": 1.05077064037323, + "learning_rate": 0.00035926246772659325, + "loss": 3.2726, + "step": 75470 + }, + { + "epoch": 5.1280744666394895, + "grad_norm": 0.9377334713935852, + "learning_rate": 0.0003592200027177606, + "loss": 2.977, + "step": 75475 + }, + { + "epoch": 5.128414186710151, + "grad_norm": 1.027722716331482, + "learning_rate": 0.00035917753770892786, + "loss": 3.3697, + "step": 75480 + }, + { + "epoch": 5.128753906780813, + "grad_norm": 1.1096478700637817, + "learning_rate": 0.00035913507270009514, + "loss": 3.3054, + "step": 75485 + }, + { + "epoch": 5.129093626851474, + "grad_norm": 0.7367983460426331, + "learning_rate": 0.0003591011006930289, + "loss": 3.3695, + "step": 75490 + }, + { + "epoch": 5.129433346922136, + "grad_norm": 1.0401781797409058, + "learning_rate": 0.00035905863568419624, + "loss": 3.5574, + "step": 75495 + }, + { + "epoch": 5.129773066992798, + "grad_norm": 0.7621700167655945, + "learning_rate": 0.0003590161706753635, + "loss": 3.3016, + "step": 75500 + }, + { + "epoch": 5.130112787063459, + "grad_norm": 0.9435403347015381, + "learning_rate": 0.00035897370566653075, + "loss": 3.4503, + "step": 75505 + }, + { + "epoch": 5.130452507134121, + "grad_norm": 0.9207668900489807, + "learning_rate": 0.00035893124065769803, + "loss": 3.4868, + "step": 75510 + }, + { + "epoch": 5.1307922272047835, + "grad_norm": 1.0668619871139526, + "learning_rate": 0.00035888877564886536, + "loss": 3.6211, + "step": 75515 + }, + { + "epoch": 5.131131947275445, + "grad_norm": 1.0035346746444702, + "learning_rate": 0.00035884631064003264, + "loss": 3.3853, + "step": 75520 + }, + { + "epoch": 5.131471667346107, + "grad_norm": 0.7235339879989624, + "learning_rate": 0.00035880384563119987, + "loss": 3.307, + "step": 75525 + }, + { + "epoch": 5.131811387416769, + "grad_norm": 0.8016841411590576, + "learning_rate": 0.0003587613806223672, + "loss": 3.3267, + "step": 75530 + }, + { + "epoch": 5.13215110748743, + "grad_norm": 1.57282292842865, + "learning_rate": 0.0003587189156135345, + "loss": 3.3387, + "step": 75535 + }, + { + "epoch": 5.132490827558092, + "grad_norm": 0.9315999150276184, + "learning_rate": 0.0003586764506047017, + "loss": 3.5075, + "step": 75540 + }, + { + "epoch": 5.132830547628754, + "grad_norm": 0.6545839309692383, + "learning_rate": 0.000358633985595869, + "loss": 3.8851, + "step": 75545 + }, + { + "epoch": 5.133170267699415, + "grad_norm": 0.9061064124107361, + "learning_rate": 0.0003585915205870363, + "loss": 3.2867, + "step": 75550 + }, + { + "epoch": 5.133509987770077, + "grad_norm": 0.7675952315330505, + "learning_rate": 0.00035854905557820355, + "loss": 3.4448, + "step": 75555 + }, + { + "epoch": 5.1338497078407395, + "grad_norm": 0.99594646692276, + "learning_rate": 0.00035850659056937083, + "loss": 3.5863, + "step": 75560 + }, + { + "epoch": 5.134189427911401, + "grad_norm": 0.7751418948173523, + "learning_rate": 0.00035846412556053817, + "loss": 3.5607, + "step": 75565 + }, + { + "epoch": 5.134529147982063, + "grad_norm": 0.8438655138015747, + "learning_rate": 0.0003584216605517054, + "loss": 3.4041, + "step": 75570 + }, + { + "epoch": 5.134868868052725, + "grad_norm": 0.8589345216751099, + "learning_rate": 0.00035837919554287267, + "loss": 3.4219, + "step": 75575 + }, + { + "epoch": 5.135208588123386, + "grad_norm": 0.9799549579620361, + "learning_rate": 0.00035833673053404, + "loss": 3.4131, + "step": 75580 + }, + { + "epoch": 5.135548308194048, + "grad_norm": 0.7581786513328552, + "learning_rate": 0.00035829426552520723, + "loss": 3.6269, + "step": 75585 + }, + { + "epoch": 5.13588802826471, + "grad_norm": 1.0855883359909058, + "learning_rate": 0.0003582518005163745, + "loss": 3.2005, + "step": 75590 + }, + { + "epoch": 5.136227748335371, + "grad_norm": 1.0746545791625977, + "learning_rate": 0.0003582093355075418, + "loss": 3.5084, + "step": 75595 + }, + { + "epoch": 5.136567468406033, + "grad_norm": 1.0121632814407349, + "learning_rate": 0.00035816687049870907, + "loss": 3.4055, + "step": 75600 + }, + { + "epoch": 5.1369071884766955, + "grad_norm": 1.0005230903625488, + "learning_rate": 0.00035812440548987635, + "loss": 3.6438, + "step": 75605 + }, + { + "epoch": 5.137246908547357, + "grad_norm": 0.7701160311698914, + "learning_rate": 0.00035808194048104363, + "loss": 3.5337, + "step": 75610 + }, + { + "epoch": 5.137586628618019, + "grad_norm": 0.8466830849647522, + "learning_rate": 0.0003580394754722109, + "loss": 3.5234, + "step": 75615 + }, + { + "epoch": 5.137926348688681, + "grad_norm": 0.9489638209342957, + "learning_rate": 0.0003579970104633782, + "loss": 3.9219, + "step": 75620 + }, + { + "epoch": 5.138266068759342, + "grad_norm": 0.9682037234306335, + "learning_rate": 0.00035795454545454547, + "loss": 3.6823, + "step": 75625 + }, + { + "epoch": 5.138605788830004, + "grad_norm": 1.1045693159103394, + "learning_rate": 0.0003579120804457127, + "loss": 3.4165, + "step": 75630 + }, + { + "epoch": 5.138945508900666, + "grad_norm": 0.7934451103210449, + "learning_rate": 0.00035786961543688003, + "loss": 3.5365, + "step": 75635 + }, + { + "epoch": 5.139285228971327, + "grad_norm": 0.9305216670036316, + "learning_rate": 0.0003578271504280473, + "loss": 3.2086, + "step": 75640 + }, + { + "epoch": 5.139624949041989, + "grad_norm": 3.5662033557891846, + "learning_rate": 0.00035778468541921454, + "loss": 3.393, + "step": 75645 + }, + { + "epoch": 5.1399646691126515, + "grad_norm": 0.9127552509307861, + "learning_rate": 0.00035774222041038187, + "loss": 3.5444, + "step": 75650 + }, + { + "epoch": 5.140304389183313, + "grad_norm": 1.7454509735107422, + "learning_rate": 0.00035769975540154915, + "loss": 3.543, + "step": 75655 + }, + { + "epoch": 5.140644109253975, + "grad_norm": 0.8440763354301453, + "learning_rate": 0.0003576572903927164, + "loss": 3.3662, + "step": 75660 + }, + { + "epoch": 5.140983829324637, + "grad_norm": 0.8017412424087524, + "learning_rate": 0.00035761482538388366, + "loss": 3.5833, + "step": 75665 + }, + { + "epoch": 5.141323549395298, + "grad_norm": 0.9223544001579285, + "learning_rate": 0.000357572360375051, + "loss": 3.4566, + "step": 75670 + }, + { + "epoch": 5.14166326946596, + "grad_norm": 0.8816435933113098, + "learning_rate": 0.0003575298953662182, + "loss": 3.7289, + "step": 75675 + }, + { + "epoch": 5.142002989536622, + "grad_norm": 0.7520071268081665, + "learning_rate": 0.0003574874303573855, + "loss": 3.4157, + "step": 75680 + }, + { + "epoch": 5.142342709607283, + "grad_norm": 0.7445632219314575, + "learning_rate": 0.00035744496534855283, + "loss": 3.4046, + "step": 75685 + }, + { + "epoch": 5.142682429677945, + "grad_norm": 0.8384628891944885, + "learning_rate": 0.0003574025003397201, + "loss": 3.4699, + "step": 75690 + }, + { + "epoch": 5.1430221497486075, + "grad_norm": 1.0319843292236328, + "learning_rate": 0.00035736003533088734, + "loss": 3.4349, + "step": 75695 + }, + { + "epoch": 5.143361869819269, + "grad_norm": 0.8538283109664917, + "learning_rate": 0.0003573175703220546, + "loss": 3.5156, + "step": 75700 + }, + { + "epoch": 5.143701589889931, + "grad_norm": 0.7731644511222839, + "learning_rate": 0.00035727510531322195, + "loss": 3.6807, + "step": 75705 + }, + { + "epoch": 5.144041309960593, + "grad_norm": 1.0195730924606323, + "learning_rate": 0.0003572326403043892, + "loss": 3.5144, + "step": 75710 + }, + { + "epoch": 5.144381030031254, + "grad_norm": 0.9497541785240173, + "learning_rate": 0.00035719017529555646, + "loss": 3.5505, + "step": 75715 + }, + { + "epoch": 5.144720750101916, + "grad_norm": 0.845953106880188, + "learning_rate": 0.0003571477102867238, + "loss": 3.5737, + "step": 75720 + }, + { + "epoch": 5.145060470172578, + "grad_norm": 1.7132205963134766, + "learning_rate": 0.000357105245277891, + "loss": 3.6705, + "step": 75725 + }, + { + "epoch": 5.145400190243239, + "grad_norm": 1.1021482944488525, + "learning_rate": 0.0003570627802690583, + "loss": 3.4928, + "step": 75730 + }, + { + "epoch": 5.145739910313901, + "grad_norm": 0.9758666753768921, + "learning_rate": 0.0003570203152602256, + "loss": 3.3507, + "step": 75735 + }, + { + "epoch": 5.1460796303845635, + "grad_norm": 0.7958225011825562, + "learning_rate": 0.00035697785025139286, + "loss": 3.3516, + "step": 75740 + }, + { + "epoch": 5.146419350455225, + "grad_norm": 1.0058013200759888, + "learning_rate": 0.00035693538524256014, + "loss": 3.3862, + "step": 75745 + }, + { + "epoch": 5.146759070525887, + "grad_norm": 0.9670622944831848, + "learning_rate": 0.0003568929202337274, + "loss": 3.6564, + "step": 75750 + }, + { + "epoch": 5.147098790596549, + "grad_norm": 0.861303985118866, + "learning_rate": 0.0003568504552248947, + "loss": 3.3129, + "step": 75755 + }, + { + "epoch": 5.14743851066721, + "grad_norm": 1.1970033645629883, + "learning_rate": 0.000356807990216062, + "loss": 3.3072, + "step": 75760 + }, + { + "epoch": 5.147778230737872, + "grad_norm": 0.9224584698677063, + "learning_rate": 0.00035676552520722926, + "loss": 3.7965, + "step": 75765 + }, + { + "epoch": 5.148117950808534, + "grad_norm": 1.0663291215896606, + "learning_rate": 0.0003567230601983965, + "loss": 3.4682, + "step": 75770 + }, + { + "epoch": 5.148457670879195, + "grad_norm": 1.0709235668182373, + "learning_rate": 0.0003566805951895638, + "loss": 3.4422, + "step": 75775 + }, + { + "epoch": 5.1487973909498574, + "grad_norm": 0.894195020198822, + "learning_rate": 0.0003566381301807311, + "loss": 3.3104, + "step": 75780 + }, + { + "epoch": 5.1491371110205195, + "grad_norm": 1.3014155626296997, + "learning_rate": 0.0003565956651718983, + "loss": 3.3609, + "step": 75785 + }, + { + "epoch": 5.149476831091181, + "grad_norm": 0.8666042685508728, + "learning_rate": 0.00035655320016306566, + "loss": 3.4669, + "step": 75790 + }, + { + "epoch": 5.149816551161843, + "grad_norm": 0.8061203956604004, + "learning_rate": 0.00035651073515423294, + "loss": 3.4226, + "step": 75795 + }, + { + "epoch": 5.150156271232504, + "grad_norm": 0.8202029466629028, + "learning_rate": 0.00035646827014540017, + "loss": 3.5009, + "step": 75800 + }, + { + "epoch": 5.150495991303166, + "grad_norm": 0.903713583946228, + "learning_rate": 0.00035642580513656745, + "loss": 3.2124, + "step": 75805 + }, + { + "epoch": 5.150835711373828, + "grad_norm": 0.7601439952850342, + "learning_rate": 0.0003563833401277348, + "loss": 3.6192, + "step": 75810 + }, + { + "epoch": 5.151175431444489, + "grad_norm": 0.9633428454399109, + "learning_rate": 0.000356340875118902, + "loss": 3.2655, + "step": 75815 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 1.1564083099365234, + "learning_rate": 0.0003562984101100693, + "loss": 3.5244, + "step": 75820 + }, + { + "epoch": 5.1518548715858135, + "grad_norm": 1.2485640048980713, + "learning_rate": 0.0003562559451012366, + "loss": 3.3888, + "step": 75825 + }, + { + "epoch": 5.152194591656475, + "grad_norm": 0.8334026336669922, + "learning_rate": 0.00035621348009240385, + "loss": 3.6467, + "step": 75830 + }, + { + "epoch": 5.152534311727137, + "grad_norm": 0.7886650562286377, + "learning_rate": 0.0003561710150835711, + "loss": 3.6473, + "step": 75835 + }, + { + "epoch": 5.152874031797799, + "grad_norm": 0.9635425806045532, + "learning_rate": 0.0003561285500747384, + "loss": 3.4983, + "step": 75840 + }, + { + "epoch": 5.15321375186846, + "grad_norm": 0.8134226202964783, + "learning_rate": 0.0003560860850659057, + "loss": 3.2854, + "step": 75845 + }, + { + "epoch": 5.153553471939122, + "grad_norm": 1.065832257270813, + "learning_rate": 0.00035604362005707297, + "loss": 3.48, + "step": 75850 + }, + { + "epoch": 5.153893192009784, + "grad_norm": 1.0040004253387451, + "learning_rate": 0.00035600115504824025, + "loss": 3.446, + "step": 75855 + }, + { + "epoch": 5.154232912080445, + "grad_norm": 0.8424885869026184, + "learning_rate": 0.0003559586900394076, + "loss": 3.5443, + "step": 75860 + }, + { + "epoch": 5.154572632151107, + "grad_norm": 1.0970627069473267, + "learning_rate": 0.0003559162250305748, + "loss": 3.5166, + "step": 75865 + }, + { + "epoch": 5.1549123522217695, + "grad_norm": 1.1280426979064941, + "learning_rate": 0.0003558737600217421, + "loss": 3.2641, + "step": 75870 + }, + { + "epoch": 5.155252072292431, + "grad_norm": 1.0118502378463745, + "learning_rate": 0.0003558312950129094, + "loss": 3.4987, + "step": 75875 + }, + { + "epoch": 5.155591792363093, + "grad_norm": 0.9337117075920105, + "learning_rate": 0.00035578883000407665, + "loss": 3.3677, + "step": 75880 + }, + { + "epoch": 5.155931512433755, + "grad_norm": 1.0728553533554077, + "learning_rate": 0.00035574636499524393, + "loss": 3.4773, + "step": 75885 + }, + { + "epoch": 5.156271232504416, + "grad_norm": 0.9852176308631897, + "learning_rate": 0.0003557038999864112, + "loss": 3.4705, + "step": 75890 + }, + { + "epoch": 5.156610952575078, + "grad_norm": 1.066967487335205, + "learning_rate": 0.0003556614349775785, + "loss": 3.2903, + "step": 75895 + }, + { + "epoch": 5.15695067264574, + "grad_norm": 0.8087648749351501, + "learning_rate": 0.00035561896996874577, + "loss": 3.4417, + "step": 75900 + }, + { + "epoch": 5.157290392716401, + "grad_norm": 0.8266345262527466, + "learning_rate": 0.00035557650495991305, + "loss": 3.2797, + "step": 75905 + }, + { + "epoch": 5.157630112787063, + "grad_norm": 0.8810130953788757, + "learning_rate": 0.00035553403995108033, + "loss": 3.4957, + "step": 75910 + }, + { + "epoch": 5.1579698328577255, + "grad_norm": 0.7827169895172119, + "learning_rate": 0.0003554915749422476, + "loss": 2.8925, + "step": 75915 + }, + { + "epoch": 5.158309552928387, + "grad_norm": 1.0547292232513428, + "learning_rate": 0.0003554491099334149, + "loss": 3.5164, + "step": 75920 + }, + { + "epoch": 5.158649272999049, + "grad_norm": 0.7813152074813843, + "learning_rate": 0.0003554066449245821, + "loss": 3.5344, + "step": 75925 + }, + { + "epoch": 5.158988993069711, + "grad_norm": 0.8361643552780151, + "learning_rate": 0.00035536417991574945, + "loss": 3.3044, + "step": 75930 + }, + { + "epoch": 5.159328713140372, + "grad_norm": 2.0263147354125977, + "learning_rate": 0.00035532171490691673, + "loss": 3.3253, + "step": 75935 + }, + { + "epoch": 5.159668433211034, + "grad_norm": 0.934095561504364, + "learning_rate": 0.00035527924989808395, + "loss": 3.5024, + "step": 75940 + }, + { + "epoch": 5.160008153281696, + "grad_norm": 1.0925078392028809, + "learning_rate": 0.0003552367848892513, + "loss": 3.424, + "step": 75945 + }, + { + "epoch": 5.160347873352357, + "grad_norm": 0.7248360514640808, + "learning_rate": 0.00035519431988041857, + "loss": 3.4061, + "step": 75950 + }, + { + "epoch": 5.160687593423019, + "grad_norm": 0.8645932078361511, + "learning_rate": 0.0003551518548715858, + "loss": 3.2823, + "step": 75955 + }, + { + "epoch": 5.1610273134936815, + "grad_norm": 1.0302553176879883, + "learning_rate": 0.0003551093898627531, + "loss": 3.5246, + "step": 75960 + }, + { + "epoch": 5.161367033564343, + "grad_norm": 1.3541146516799927, + "learning_rate": 0.0003550669248539204, + "loss": 3.6436, + "step": 75965 + }, + { + "epoch": 5.161706753635005, + "grad_norm": 0.9448859691619873, + "learning_rate": 0.00035502445984508763, + "loss": 3.3827, + "step": 75970 + }, + { + "epoch": 5.162046473705667, + "grad_norm": 0.8917394876480103, + "learning_rate": 0.0003549819948362549, + "loss": 3.3193, + "step": 75975 + }, + { + "epoch": 5.162386193776328, + "grad_norm": 1.0374557971954346, + "learning_rate": 0.00035493952982742225, + "loss": 3.681, + "step": 75980 + }, + { + "epoch": 5.16272591384699, + "grad_norm": 0.7501043677330017, + "learning_rate": 0.0003548970648185895, + "loss": 3.2679, + "step": 75985 + }, + { + "epoch": 5.163065633917652, + "grad_norm": 0.8722904920578003, + "learning_rate": 0.00035485459980975675, + "loss": 3.4513, + "step": 75990 + }, + { + "epoch": 5.163405353988313, + "grad_norm": 0.9900466203689575, + "learning_rate": 0.00035481213480092403, + "loss": 3.5611, + "step": 75995 + }, + { + "epoch": 5.163745074058975, + "grad_norm": 0.6715785264968872, + "learning_rate": 0.0003547696697920913, + "loss": 3.6714, + "step": 76000 + }, + { + "epoch": 5.1640847941296375, + "grad_norm": 0.7456715106964111, + "learning_rate": 0.0003547272047832586, + "loss": 3.5963, + "step": 76005 + }, + { + "epoch": 5.164424514200299, + "grad_norm": 1.163414716720581, + "learning_rate": 0.0003546847397744259, + "loss": 3.3925, + "step": 76010 + }, + { + "epoch": 5.164764234270961, + "grad_norm": 0.7402921915054321, + "learning_rate": 0.00035464227476559316, + "loss": 3.5438, + "step": 76015 + }, + { + "epoch": 5.165103954341623, + "grad_norm": 0.9611417055130005, + "learning_rate": 0.00035459980975676044, + "loss": 3.6154, + "step": 76020 + }, + { + "epoch": 5.165443674412284, + "grad_norm": 1.0484379529953003, + "learning_rate": 0.0003545573447479277, + "loss": 3.2555, + "step": 76025 + }, + { + "epoch": 5.165783394482946, + "grad_norm": 1.2378052473068237, + "learning_rate": 0.000354514879739095, + "loss": 3.296, + "step": 76030 + }, + { + "epoch": 5.166123114553608, + "grad_norm": 1.1221626996994019, + "learning_rate": 0.0003544724147302623, + "loss": 3.5875, + "step": 76035 + }, + { + "epoch": 5.166462834624269, + "grad_norm": 1.1053109169006348, + "learning_rate": 0.00035442994972142956, + "loss": 3.3923, + "step": 76040 + }, + { + "epoch": 5.166802554694931, + "grad_norm": 1.1508893966674805, + "learning_rate": 0.00035438748471259684, + "loss": 3.3654, + "step": 76045 + }, + { + "epoch": 5.1671422747655935, + "grad_norm": 0.8927268385887146, + "learning_rate": 0.0003543450197037641, + "loss": 3.5275, + "step": 76050 + }, + { + "epoch": 5.167481994836255, + "grad_norm": 1.174953579902649, + "learning_rate": 0.0003543025546949314, + "loss": 3.3907, + "step": 76055 + }, + { + "epoch": 5.167821714906917, + "grad_norm": 1.313240647315979, + "learning_rate": 0.0003542600896860987, + "loss": 3.2279, + "step": 76060 + }, + { + "epoch": 5.168161434977579, + "grad_norm": 0.7984179854393005, + "learning_rate": 0.0003542176246772659, + "loss": 3.546, + "step": 76065 + }, + { + "epoch": 5.16850115504824, + "grad_norm": 0.7838951945304871, + "learning_rate": 0.00035417515966843324, + "loss": 3.7721, + "step": 76070 + }, + { + "epoch": 5.168840875118902, + "grad_norm": 1.962845802307129, + "learning_rate": 0.0003541326946596005, + "loss": 3.3604, + "step": 76075 + }, + { + "epoch": 5.169180595189564, + "grad_norm": 1.2212457656860352, + "learning_rate": 0.00035409022965076774, + "loss": 3.4653, + "step": 76080 + }, + { + "epoch": 5.169520315260225, + "grad_norm": 0.9575892686843872, + "learning_rate": 0.0003540477646419351, + "loss": 3.6377, + "step": 76085 + }, + { + "epoch": 5.1698600353308874, + "grad_norm": 1.2181637287139893, + "learning_rate": 0.00035400529963310236, + "loss": 3.6425, + "step": 76090 + }, + { + "epoch": 5.1701997554015495, + "grad_norm": 0.9032516479492188, + "learning_rate": 0.0003539628346242696, + "loss": 3.5015, + "step": 76095 + }, + { + "epoch": 5.170539475472211, + "grad_norm": 1.5478798151016235, + "learning_rate": 0.00035392036961543686, + "loss": 3.4909, + "step": 76100 + }, + { + "epoch": 5.170879195542873, + "grad_norm": 0.837245762348175, + "learning_rate": 0.0003538779046066042, + "loss": 3.3792, + "step": 76105 + }, + { + "epoch": 5.171218915613535, + "grad_norm": 1.1613460779190063, + "learning_rate": 0.0003538354395977714, + "loss": 3.3183, + "step": 76110 + }, + { + "epoch": 5.171558635684196, + "grad_norm": 0.9528232216835022, + "learning_rate": 0.0003537929745889387, + "loss": 3.6115, + "step": 76115 + }, + { + "epoch": 5.171898355754858, + "grad_norm": 0.8013635873794556, + "learning_rate": 0.00035375050958010604, + "loss": 3.6119, + "step": 76120 + }, + { + "epoch": 5.17223807582552, + "grad_norm": 0.8213931918144226, + "learning_rate": 0.00035370804457127326, + "loss": 3.2099, + "step": 76125 + }, + { + "epoch": 5.172577795896181, + "grad_norm": 0.8673731684684753, + "learning_rate": 0.00035366557956244054, + "loss": 3.4492, + "step": 76130 + }, + { + "epoch": 5.1729175159668435, + "grad_norm": 1.152968406677246, + "learning_rate": 0.0003536231145536078, + "loss": 3.5246, + "step": 76135 + }, + { + "epoch": 5.1732572360375055, + "grad_norm": 0.8475187420845032, + "learning_rate": 0.0003535806495447751, + "loss": 3.3028, + "step": 76140 + }, + { + "epoch": 5.173596956108167, + "grad_norm": 0.9546343088150024, + "learning_rate": 0.0003535381845359424, + "loss": 3.288, + "step": 76145 + }, + { + "epoch": 5.173936676178829, + "grad_norm": 0.7820631265640259, + "learning_rate": 0.00035349571952710966, + "loss": 3.3752, + "step": 76150 + }, + { + "epoch": 5.174276396249491, + "grad_norm": 0.9394371509552002, + "learning_rate": 0.00035345325451827694, + "loss": 3.4208, + "step": 76155 + }, + { + "epoch": 5.174616116320152, + "grad_norm": 0.8498660922050476, + "learning_rate": 0.0003534107895094442, + "loss": 3.6099, + "step": 76160 + }, + { + "epoch": 5.174955836390814, + "grad_norm": 1.0075654983520508, + "learning_rate": 0.0003533683245006115, + "loss": 3.5154, + "step": 76165 + }, + { + "epoch": 5.175295556461475, + "grad_norm": 0.8904738426208496, + "learning_rate": 0.00035332585949177873, + "loss": 3.212, + "step": 76170 + }, + { + "epoch": 5.175635276532137, + "grad_norm": 1.0199486017227173, + "learning_rate": 0.00035328339448294606, + "loss": 3.5839, + "step": 76175 + }, + { + "epoch": 5.1759749966027995, + "grad_norm": 0.907651960849762, + "learning_rate": 0.00035324092947411334, + "loss": 3.5482, + "step": 76180 + }, + { + "epoch": 5.176314716673461, + "grad_norm": 0.7857928276062012, + "learning_rate": 0.00035319846446528057, + "loss": 3.6274, + "step": 76185 + }, + { + "epoch": 5.176654436744123, + "grad_norm": 1.0497698783874512, + "learning_rate": 0.0003531559994564479, + "loss": 3.3069, + "step": 76190 + }, + { + "epoch": 5.176994156814785, + "grad_norm": 0.9278537034988403, + "learning_rate": 0.0003531135344476152, + "loss": 3.5036, + "step": 76195 + }, + { + "epoch": 5.177333876885446, + "grad_norm": 1.3838635683059692, + "learning_rate": 0.00035307106943878246, + "loss": 3.7359, + "step": 76200 + }, + { + "epoch": 5.177673596956108, + "grad_norm": 1.186934232711792, + "learning_rate": 0.00035302860442994974, + "loss": 3.352, + "step": 76205 + }, + { + "epoch": 5.17801331702677, + "grad_norm": 0.913405179977417, + "learning_rate": 0.000352986139421117, + "loss": 3.5184, + "step": 76210 + }, + { + "epoch": 5.178353037097431, + "grad_norm": 1.0284614562988281, + "learning_rate": 0.0003529436744122843, + "loss": 3.5175, + "step": 76215 + }, + { + "epoch": 5.178692757168093, + "grad_norm": 0.8065379858016968, + "learning_rate": 0.00035290120940345153, + "loss": 3.8729, + "step": 76220 + }, + { + "epoch": 5.1790324772387555, + "grad_norm": 0.855293869972229, + "learning_rate": 0.00035285874439461886, + "loss": 3.7456, + "step": 76225 + }, + { + "epoch": 5.179372197309417, + "grad_norm": 1.1450179815292358, + "learning_rate": 0.00035281627938578614, + "loss": 3.3136, + "step": 76230 + }, + { + "epoch": 5.179711917380079, + "grad_norm": 1.7858816385269165, + "learning_rate": 0.00035277381437695337, + "loss": 3.2339, + "step": 76235 + }, + { + "epoch": 5.180051637450741, + "grad_norm": 1.0796592235565186, + "learning_rate": 0.0003527313493681207, + "loss": 3.5143, + "step": 76240 + }, + { + "epoch": 5.180391357521402, + "grad_norm": 0.8058212399482727, + "learning_rate": 0.000352688884359288, + "loss": 3.4837, + "step": 76245 + }, + { + "epoch": 5.180731077592064, + "grad_norm": 0.8155843615531921, + "learning_rate": 0.0003526464193504552, + "loss": 3.6257, + "step": 76250 + }, + { + "epoch": 5.181070797662726, + "grad_norm": 0.9111512303352356, + "learning_rate": 0.0003526039543416225, + "loss": 3.2847, + "step": 76255 + }, + { + "epoch": 5.181410517733387, + "grad_norm": 1.0625380277633667, + "learning_rate": 0.0003525614893327898, + "loss": 3.4358, + "step": 76260 + }, + { + "epoch": 5.181750237804049, + "grad_norm": 1.009197473526001, + "learning_rate": 0.00035251902432395705, + "loss": 3.4684, + "step": 76265 + }, + { + "epoch": 5.1820899578747115, + "grad_norm": 1.2381291389465332, + "learning_rate": 0.00035247655931512433, + "loss": 3.3957, + "step": 76270 + }, + { + "epoch": 5.182429677945373, + "grad_norm": 1.1153761148452759, + "learning_rate": 0.00035243409430629167, + "loss": 3.5469, + "step": 76275 + }, + { + "epoch": 5.182769398016035, + "grad_norm": 0.967791736125946, + "learning_rate": 0.0003523916292974589, + "loss": 3.4105, + "step": 76280 + }, + { + "epoch": 5.183109118086697, + "grad_norm": 1.1468486785888672, + "learning_rate": 0.00035234916428862617, + "loss": 3.3239, + "step": 76285 + }, + { + "epoch": 5.183448838157358, + "grad_norm": 0.729259192943573, + "learning_rate": 0.00035230669927979345, + "loss": 3.6601, + "step": 76290 + }, + { + "epoch": 5.18378855822802, + "grad_norm": 1.1746844053268433, + "learning_rate": 0.00035226423427096073, + "loss": 3.0863, + "step": 76295 + }, + { + "epoch": 5.184128278298682, + "grad_norm": 1.3127202987670898, + "learning_rate": 0.000352221769262128, + "loss": 3.4318, + "step": 76300 + }, + { + "epoch": 5.184467998369343, + "grad_norm": 0.9289271235466003, + "learning_rate": 0.0003521793042532953, + "loss": 3.7284, + "step": 76305 + }, + { + "epoch": 5.184807718440005, + "grad_norm": 1.0185413360595703, + "learning_rate": 0.00035213683924446257, + "loss": 3.4989, + "step": 76310 + }, + { + "epoch": 5.1851474385106675, + "grad_norm": 0.8239532113075256, + "learning_rate": 0.00035209437423562985, + "loss": 3.4571, + "step": 76315 + }, + { + "epoch": 5.185487158581329, + "grad_norm": 0.9071556925773621, + "learning_rate": 0.00035205190922679713, + "loss": 3.3946, + "step": 76320 + }, + { + "epoch": 5.185826878651991, + "grad_norm": 1.123165249824524, + "learning_rate": 0.00035200944421796436, + "loss": 3.3954, + "step": 76325 + }, + { + "epoch": 5.186166598722653, + "grad_norm": 1.0267144441604614, + "learning_rate": 0.0003519669792091317, + "loss": 3.4589, + "step": 76330 + }, + { + "epoch": 5.186506318793314, + "grad_norm": 0.8810275197029114, + "learning_rate": 0.00035192451420029897, + "loss": 3.4723, + "step": 76335 + }, + { + "epoch": 5.186846038863976, + "grad_norm": 1.2737574577331543, + "learning_rate": 0.0003518820491914662, + "loss": 3.3711, + "step": 76340 + }, + { + "epoch": 5.187185758934638, + "grad_norm": 0.9376014471054077, + "learning_rate": 0.00035183958418263353, + "loss": 3.5216, + "step": 76345 + }, + { + "epoch": 5.187525479005299, + "grad_norm": 1.0005861520767212, + "learning_rate": 0.0003517971191738008, + "loss": 3.4645, + "step": 76350 + }, + { + "epoch": 5.187865199075961, + "grad_norm": 1.1232060194015503, + "learning_rate": 0.00035175465416496804, + "loss": 3.5584, + "step": 76355 + }, + { + "epoch": 5.1882049191466235, + "grad_norm": 0.8296521902084351, + "learning_rate": 0.0003517121891561353, + "loss": 3.3432, + "step": 76360 + }, + { + "epoch": 5.188544639217285, + "grad_norm": 0.8388288021087646, + "learning_rate": 0.00035166972414730265, + "loss": 3.4388, + "step": 76365 + }, + { + "epoch": 5.188884359287947, + "grad_norm": 0.7373752593994141, + "learning_rate": 0.00035162725913846993, + "loss": 3.6649, + "step": 76370 + }, + { + "epoch": 5.189224079358609, + "grad_norm": 0.8929604291915894, + "learning_rate": 0.00035158479412963716, + "loss": 3.3549, + "step": 76375 + }, + { + "epoch": 5.18956379942927, + "grad_norm": 0.7801471948623657, + "learning_rate": 0.0003515423291208045, + "loss": 3.225, + "step": 76380 + }, + { + "epoch": 5.189903519499932, + "grad_norm": 1.0942977666854858, + "learning_rate": 0.00035149986411197177, + "loss": 3.577, + "step": 76385 + }, + { + "epoch": 5.190243239570594, + "grad_norm": 1.0681802034378052, + "learning_rate": 0.000351457399103139, + "loss": 3.5511, + "step": 76390 + }, + { + "epoch": 5.190582959641255, + "grad_norm": 0.9680463075637817, + "learning_rate": 0.0003514149340943063, + "loss": 3.7487, + "step": 76395 + }, + { + "epoch": 5.1909226797119175, + "grad_norm": 0.8330009579658508, + "learning_rate": 0.0003513724690854736, + "loss": 3.3869, + "step": 76400 + }, + { + "epoch": 5.1912623997825795, + "grad_norm": 0.7445343136787415, + "learning_rate": 0.00035133000407664084, + "loss": 3.3669, + "step": 76405 + }, + { + "epoch": 5.191602119853241, + "grad_norm": 1.4376360177993774, + "learning_rate": 0.0003512875390678081, + "loss": 3.5814, + "step": 76410 + }, + { + "epoch": 5.191941839923903, + "grad_norm": 0.719276487827301, + "learning_rate": 0.00035124507405897545, + "loss": 3.1876, + "step": 76415 + }, + { + "epoch": 5.192281559994565, + "grad_norm": 0.7470962405204773, + "learning_rate": 0.0003512026090501427, + "loss": 3.1002, + "step": 76420 + }, + { + "epoch": 5.192621280065226, + "grad_norm": 1.5123597383499146, + "learning_rate": 0.00035116014404130996, + "loss": 3.7106, + "step": 76425 + }, + { + "epoch": 5.192961000135888, + "grad_norm": 0.9330747723579407, + "learning_rate": 0.0003511176790324773, + "loss": 3.337, + "step": 76430 + }, + { + "epoch": 5.19330072020655, + "grad_norm": 0.8314443230628967, + "learning_rate": 0.0003510752140236445, + "loss": 3.3864, + "step": 76435 + }, + { + "epoch": 5.193640440277211, + "grad_norm": 0.918666660785675, + "learning_rate": 0.0003510327490148118, + "loss": 3.3867, + "step": 76440 + }, + { + "epoch": 5.1939801603478735, + "grad_norm": 0.8609732985496521, + "learning_rate": 0.0003509902840059791, + "loss": 3.7045, + "step": 76445 + }, + { + "epoch": 5.1943198804185355, + "grad_norm": 0.7392222285270691, + "learning_rate": 0.00035094781899714636, + "loss": 3.5925, + "step": 76450 + }, + { + "epoch": 5.194659600489197, + "grad_norm": 0.9171335101127625, + "learning_rate": 0.00035090535398831364, + "loss": 3.4914, + "step": 76455 + }, + { + "epoch": 5.194999320559859, + "grad_norm": 0.7219635248184204, + "learning_rate": 0.0003508628889794809, + "loss": 3.6316, + "step": 76460 + }, + { + "epoch": 5.195339040630521, + "grad_norm": 0.9819028377532959, + "learning_rate": 0.0003508204239706482, + "loss": 3.5283, + "step": 76465 + }, + { + "epoch": 5.195678760701182, + "grad_norm": 0.8742094039916992, + "learning_rate": 0.0003507779589618155, + "loss": 3.4547, + "step": 76470 + }, + { + "epoch": 5.196018480771844, + "grad_norm": 0.9559829235076904, + "learning_rate": 0.00035073549395298276, + "loss": 3.473, + "step": 76475 + }, + { + "epoch": 5.196358200842505, + "grad_norm": 0.7612019777297974, + "learning_rate": 0.00035069302894415, + "loss": 3.5545, + "step": 76480 + }, + { + "epoch": 5.196697920913167, + "grad_norm": 1.3907430171966553, + "learning_rate": 0.0003506505639353173, + "loss": 3.5216, + "step": 76485 + }, + { + "epoch": 5.1970376409838295, + "grad_norm": 0.9297182559967041, + "learning_rate": 0.0003506080989264846, + "loss": 3.5989, + "step": 76490 + }, + { + "epoch": 5.197377361054491, + "grad_norm": 1.1308543682098389, + "learning_rate": 0.0003505656339176518, + "loss": 3.3873, + "step": 76495 + }, + { + "epoch": 5.197717081125153, + "grad_norm": 1.2078535556793213, + "learning_rate": 0.00035052316890881916, + "loss": 3.3858, + "step": 76500 + }, + { + "epoch": 5.198056801195815, + "grad_norm": 1.144275188446045, + "learning_rate": 0.00035048070389998644, + "loss": 3.3743, + "step": 76505 + }, + { + "epoch": 5.198396521266476, + "grad_norm": 0.9776083827018738, + "learning_rate": 0.00035043823889115367, + "loss": 3.4567, + "step": 76510 + }, + { + "epoch": 5.198736241337138, + "grad_norm": 0.7686120867729187, + "learning_rate": 0.00035039577388232095, + "loss": 3.7641, + "step": 76515 + }, + { + "epoch": 5.1990759614078, + "grad_norm": 0.8416746854782104, + "learning_rate": 0.0003503533088734883, + "loss": 3.3129, + "step": 76520 + }, + { + "epoch": 5.199415681478461, + "grad_norm": 1.1315481662750244, + "learning_rate": 0.0003503108438646555, + "loss": 3.4665, + "step": 76525 + }, + { + "epoch": 5.199755401549123, + "grad_norm": 0.829027533531189, + "learning_rate": 0.0003502683788558228, + "loss": 3.4969, + "step": 76530 + }, + { + "epoch": 5.2000951216197855, + "grad_norm": 0.9309937953948975, + "learning_rate": 0.0003502259138469901, + "loss": 3.6943, + "step": 76535 + }, + { + "epoch": 5.200434841690447, + "grad_norm": 1.3628058433532715, + "learning_rate": 0.0003501834488381574, + "loss": 3.5997, + "step": 76540 + }, + { + "epoch": 5.200774561761109, + "grad_norm": 0.9830437898635864, + "learning_rate": 0.0003501409838293246, + "loss": 3.4912, + "step": 76545 + }, + { + "epoch": 5.201114281831771, + "grad_norm": 1.1168453693389893, + "learning_rate": 0.0003500985188204919, + "loss": 3.4876, + "step": 76550 + }, + { + "epoch": 5.201454001902432, + "grad_norm": 0.9512060880661011, + "learning_rate": 0.00035005605381165924, + "loss": 3.2739, + "step": 76555 + }, + { + "epoch": 5.201793721973094, + "grad_norm": 0.8930521011352539, + "learning_rate": 0.00035001358880282647, + "loss": 3.4285, + "step": 76560 + }, + { + "epoch": 5.202133442043756, + "grad_norm": 1.4606044292449951, + "learning_rate": 0.00034997112379399375, + "loss": 3.4997, + "step": 76565 + }, + { + "epoch": 5.202473162114417, + "grad_norm": 1.1546412706375122, + "learning_rate": 0.0003499286587851611, + "loss": 3.3278, + "step": 76570 + }, + { + "epoch": 5.202812882185079, + "grad_norm": 0.8892627358436584, + "learning_rate": 0.0003498861937763283, + "loss": 3.5452, + "step": 76575 + }, + { + "epoch": 5.2031526022557415, + "grad_norm": 1.0346076488494873, + "learning_rate": 0.0003498437287674956, + "loss": 3.6376, + "step": 76580 + }, + { + "epoch": 5.203492322326403, + "grad_norm": 0.7711248993873596, + "learning_rate": 0.00034980126375866287, + "loss": 3.2516, + "step": 76585 + }, + { + "epoch": 5.203832042397065, + "grad_norm": 0.8648890256881714, + "learning_rate": 0.00034975879874983015, + "loss": 3.4845, + "step": 76590 + }, + { + "epoch": 5.204171762467727, + "grad_norm": 0.7444353103637695, + "learning_rate": 0.00034971633374099743, + "loss": 3.3752, + "step": 76595 + }, + { + "epoch": 5.204511482538388, + "grad_norm": 1.4762388467788696, + "learning_rate": 0.0003496738687321647, + "loss": 3.5636, + "step": 76600 + }, + { + "epoch": 5.20485120260905, + "grad_norm": 0.8475094437599182, + "learning_rate": 0.000349631403723332, + "loss": 3.2789, + "step": 76605 + }, + { + "epoch": 5.205190922679712, + "grad_norm": 1.7200127840042114, + "learning_rate": 0.00034958893871449927, + "loss": 3.3733, + "step": 76610 + }, + { + "epoch": 5.205530642750373, + "grad_norm": 2.6658642292022705, + "learning_rate": 0.00034954647370566655, + "loss": 3.5134, + "step": 76615 + }, + { + "epoch": 5.205870362821035, + "grad_norm": 1.3578640222549438, + "learning_rate": 0.0003495040086968338, + "loss": 3.5637, + "step": 76620 + }, + { + "epoch": 5.2062100828916975, + "grad_norm": 1.3043696880340576, + "learning_rate": 0.0003494615436880011, + "loss": 3.6872, + "step": 76625 + }, + { + "epoch": 5.206549802962359, + "grad_norm": 0.9613557457923889, + "learning_rate": 0.0003494190786791684, + "loss": 3.4695, + "step": 76630 + }, + { + "epoch": 5.206889523033021, + "grad_norm": 1.0480520725250244, + "learning_rate": 0.0003493766136703356, + "loss": 3.6395, + "step": 76635 + }, + { + "epoch": 5.207229243103683, + "grad_norm": 0.9128376245498657, + "learning_rate": 0.00034933414866150295, + "loss": 3.4602, + "step": 76640 + }, + { + "epoch": 5.207568963174344, + "grad_norm": 0.8964043259620667, + "learning_rate": 0.00034929168365267023, + "loss": 3.2087, + "step": 76645 + }, + { + "epoch": 5.207908683245006, + "grad_norm": 0.9308962821960449, + "learning_rate": 0.00034924921864383745, + "loss": 3.3199, + "step": 76650 + }, + { + "epoch": 5.208248403315668, + "grad_norm": 0.8795655369758606, + "learning_rate": 0.00034920675363500473, + "loss": 3.6545, + "step": 76655 + }, + { + "epoch": 5.208588123386329, + "grad_norm": 0.9301890730857849, + "learning_rate": 0.00034916428862617207, + "loss": 3.285, + "step": 76660 + }, + { + "epoch": 5.2089278434569914, + "grad_norm": 1.395858883857727, + "learning_rate": 0.0003491218236173393, + "loss": 3.4258, + "step": 76665 + }, + { + "epoch": 5.2092675635276535, + "grad_norm": 1.00365149974823, + "learning_rate": 0.0003490793586085066, + "loss": 3.2663, + "step": 76670 + }, + { + "epoch": 5.209607283598315, + "grad_norm": 0.8913069367408752, + "learning_rate": 0.0003490368935996739, + "loss": 3.5053, + "step": 76675 + }, + { + "epoch": 5.209947003668977, + "grad_norm": 0.9287757277488708, + "learning_rate": 0.00034899442859084113, + "loss": 3.4717, + "step": 76680 + }, + { + "epoch": 5.210286723739639, + "grad_norm": 1.5536363124847412, + "learning_rate": 0.0003489519635820084, + "loss": 3.3555, + "step": 76685 + }, + { + "epoch": 5.2106264438103, + "grad_norm": 0.912068784236908, + "learning_rate": 0.0003489094985731757, + "loss": 3.286, + "step": 76690 + }, + { + "epoch": 5.210966163880962, + "grad_norm": 0.9146206974983215, + "learning_rate": 0.000348867033564343, + "loss": 3.4692, + "step": 76695 + }, + { + "epoch": 5.211305883951624, + "grad_norm": 0.727143406867981, + "learning_rate": 0.00034882456855551025, + "loss": 3.5165, + "step": 76700 + }, + { + "epoch": 5.211645604022285, + "grad_norm": 0.8445073366165161, + "learning_rate": 0.00034878210354667753, + "loss": 3.3729, + "step": 76705 + }, + { + "epoch": 5.2119853240929475, + "grad_norm": 0.9613944888114929, + "learning_rate": 0.00034873963853784487, + "loss": 3.484, + "step": 76710 + }, + { + "epoch": 5.2123250441636095, + "grad_norm": 1.1473444700241089, + "learning_rate": 0.0003486971735290121, + "loss": 3.3755, + "step": 76715 + }, + { + "epoch": 5.212664764234271, + "grad_norm": 1.7418212890625, + "learning_rate": 0.0003486547085201794, + "loss": 3.6503, + "step": 76720 + }, + { + "epoch": 5.213004484304933, + "grad_norm": 0.9513466358184814, + "learning_rate": 0.0003486122435113467, + "loss": 3.4967, + "step": 76725 + }, + { + "epoch": 5.213344204375595, + "grad_norm": 2.739348888397217, + "learning_rate": 0.00034856977850251394, + "loss": 3.4276, + "step": 76730 + }, + { + "epoch": 5.213683924446256, + "grad_norm": 0.8947828412055969, + "learning_rate": 0.0003485273134936812, + "loss": 3.5697, + "step": 76735 + }, + { + "epoch": 5.214023644516918, + "grad_norm": 1.0734071731567383, + "learning_rate": 0.0003484848484848485, + "loss": 3.4957, + "step": 76740 + }, + { + "epoch": 5.21436336458758, + "grad_norm": 1.102291464805603, + "learning_rate": 0.0003484423834760158, + "loss": 3.4426, + "step": 76745 + }, + { + "epoch": 5.214703084658241, + "grad_norm": 1.0937780141830444, + "learning_rate": 0.00034839991846718306, + "loss": 3.2797, + "step": 76750 + }, + { + "epoch": 5.2150428047289035, + "grad_norm": 1.2172300815582275, + "learning_rate": 0.00034835745345835034, + "loss": 3.2783, + "step": 76755 + }, + { + "epoch": 5.2153825247995655, + "grad_norm": 1.0502691268920898, + "learning_rate": 0.0003483149884495176, + "loss": 3.4964, + "step": 76760 + }, + { + "epoch": 5.215722244870227, + "grad_norm": 1.1333132982254028, + "learning_rate": 0.0003482725234406849, + "loss": 3.4329, + "step": 76765 + }, + { + "epoch": 5.216061964940889, + "grad_norm": 1.1777031421661377, + "learning_rate": 0.0003482300584318522, + "loss": 3.3089, + "step": 76770 + }, + { + "epoch": 5.216401685011551, + "grad_norm": 0.8515138030052185, + "learning_rate": 0.0003481875934230194, + "loss": 3.3071, + "step": 76775 + }, + { + "epoch": 5.216741405082212, + "grad_norm": 1.0677005052566528, + "learning_rate": 0.00034814512841418674, + "loss": 3.3706, + "step": 76780 + }, + { + "epoch": 5.217081125152874, + "grad_norm": 1.095780611038208, + "learning_rate": 0.000348102663405354, + "loss": 3.4924, + "step": 76785 + }, + { + "epoch": 5.217420845223536, + "grad_norm": 1.1595251560211182, + "learning_rate": 0.00034806019839652124, + "loss": 3.7585, + "step": 76790 + }, + { + "epoch": 5.217760565294197, + "grad_norm": 0.9632075428962708, + "learning_rate": 0.0003480177333876886, + "loss": 3.3417, + "step": 76795 + }, + { + "epoch": 5.2181002853648595, + "grad_norm": 0.9305838942527771, + "learning_rate": 0.00034797526837885586, + "loss": 3.3985, + "step": 76800 + }, + { + "epoch": 5.2184400054355216, + "grad_norm": 0.7110072374343872, + "learning_rate": 0.0003479328033700231, + "loss": 3.6286, + "step": 76805 + }, + { + "epoch": 5.218779725506183, + "grad_norm": 1.010613203048706, + "learning_rate": 0.00034789033836119036, + "loss": 3.4049, + "step": 76810 + }, + { + "epoch": 5.219119445576845, + "grad_norm": 0.6793950796127319, + "learning_rate": 0.0003478478733523577, + "loss": 3.4446, + "step": 76815 + }, + { + "epoch": 5.219459165647507, + "grad_norm": 0.9115724563598633, + "learning_rate": 0.0003478054083435249, + "loss": 3.3107, + "step": 76820 + }, + { + "epoch": 5.219798885718168, + "grad_norm": 0.8952100276947021, + "learning_rate": 0.0003477629433346922, + "loss": 3.4927, + "step": 76825 + }, + { + "epoch": 5.22013860578883, + "grad_norm": 0.8970851898193359, + "learning_rate": 0.00034772047832585954, + "loss": 3.3244, + "step": 76830 + }, + { + "epoch": 5.220478325859492, + "grad_norm": 0.8214719295501709, + "learning_rate": 0.00034767801331702676, + "loss": 3.2155, + "step": 76835 + }, + { + "epoch": 5.220818045930153, + "grad_norm": 1.022929072380066, + "learning_rate": 0.00034763554830819404, + "loss": 3.4113, + "step": 76840 + }, + { + "epoch": 5.2211577660008155, + "grad_norm": 0.7492793798446655, + "learning_rate": 0.0003475930832993613, + "loss": 3.3445, + "step": 76845 + }, + { + "epoch": 5.221497486071477, + "grad_norm": 0.8960014581680298, + "learning_rate": 0.0003475506182905286, + "loss": 3.5645, + "step": 76850 + }, + { + "epoch": 5.221837206142139, + "grad_norm": 0.8171675801277161, + "learning_rate": 0.0003475081532816959, + "loss": 3.3881, + "step": 76855 + }, + { + "epoch": 5.222176926212801, + "grad_norm": 1.1860538721084595, + "learning_rate": 0.00034746568827286316, + "loss": 3.5893, + "step": 76860 + }, + { + "epoch": 5.222516646283462, + "grad_norm": 0.9685695767402649, + "learning_rate": 0.00034742322326403044, + "loss": 3.5616, + "step": 76865 + }, + { + "epoch": 5.222856366354124, + "grad_norm": 0.9258379936218262, + "learning_rate": 0.0003473807582551977, + "loss": 3.5321, + "step": 76870 + }, + { + "epoch": 5.223196086424786, + "grad_norm": 1.0897870063781738, + "learning_rate": 0.000347338293246365, + "loss": 3.4739, + "step": 76875 + }, + { + "epoch": 5.223535806495447, + "grad_norm": 0.8085293173789978, + "learning_rate": 0.0003472958282375323, + "loss": 3.2027, + "step": 76880 + }, + { + "epoch": 5.223875526566109, + "grad_norm": 0.8994132876396179, + "learning_rate": 0.00034725336322869956, + "loss": 3.3379, + "step": 76885 + }, + { + "epoch": 5.2242152466367715, + "grad_norm": 0.8360452055931091, + "learning_rate": 0.00034721089821986684, + "loss": 3.6016, + "step": 76890 + }, + { + "epoch": 5.224554966707433, + "grad_norm": 1.6288801431655884, + "learning_rate": 0.0003471684332110341, + "loss": 3.2643, + "step": 76895 + }, + { + "epoch": 5.224894686778095, + "grad_norm": 0.9811125993728638, + "learning_rate": 0.0003471259682022014, + "loss": 3.4744, + "step": 76900 + }, + { + "epoch": 5.225234406848757, + "grad_norm": 0.9402899742126465, + "learning_rate": 0.0003470835031933687, + "loss": 3.679, + "step": 76905 + }, + { + "epoch": 5.225574126919418, + "grad_norm": 0.9598615169525146, + "learning_rate": 0.00034704103818453596, + "loss": 3.4397, + "step": 76910 + }, + { + "epoch": 5.22591384699008, + "grad_norm": 1.683327078819275, + "learning_rate": 0.0003469985731757032, + "loss": 3.4616, + "step": 76915 + }, + { + "epoch": 5.226253567060742, + "grad_norm": 1.1773000955581665, + "learning_rate": 0.0003469561081668705, + "loss": 3.5357, + "step": 76920 + }, + { + "epoch": 5.226593287131403, + "grad_norm": 0.8486369252204895, + "learning_rate": 0.0003469136431580378, + "loss": 3.5246, + "step": 76925 + }, + { + "epoch": 5.226933007202065, + "grad_norm": 1.231277346611023, + "learning_rate": 0.00034687117814920503, + "loss": 3.6515, + "step": 76930 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 0.921932578086853, + "learning_rate": 0.00034682871314037236, + "loss": 3.2551, + "step": 76935 + }, + { + "epoch": 5.227612447343389, + "grad_norm": 0.8024895191192627, + "learning_rate": 0.00034678624813153964, + "loss": 3.3273, + "step": 76940 + }, + { + "epoch": 5.227952167414051, + "grad_norm": 0.9568153619766235, + "learning_rate": 0.00034674378312270687, + "loss": 3.4941, + "step": 76945 + }, + { + "epoch": 5.228291887484713, + "grad_norm": 0.8378103375434875, + "learning_rate": 0.00034670131811387415, + "loss": 3.4296, + "step": 76950 + }, + { + "epoch": 5.228631607555374, + "grad_norm": 0.8377346992492676, + "learning_rate": 0.0003466588531050415, + "loss": 3.5721, + "step": 76955 + }, + { + "epoch": 5.228971327626036, + "grad_norm": 1.1548608541488647, + "learning_rate": 0.0003466163880962087, + "loss": 3.4687, + "step": 76960 + }, + { + "epoch": 5.229311047696698, + "grad_norm": 0.8465765714645386, + "learning_rate": 0.000346573923087376, + "loss": 3.4147, + "step": 76965 + }, + { + "epoch": 5.229650767767359, + "grad_norm": 0.9042304158210754, + "learning_rate": 0.0003465314580785433, + "loss": 3.5192, + "step": 76970 + }, + { + "epoch": 5.2299904878380215, + "grad_norm": 0.8461657166481018, + "learning_rate": 0.00034648899306971055, + "loss": 3.6457, + "step": 76975 + }, + { + "epoch": 5.2303302079086835, + "grad_norm": 0.8847707509994507, + "learning_rate": 0.00034644652806087783, + "loss": 3.5735, + "step": 76980 + }, + { + "epoch": 5.230669927979345, + "grad_norm": 1.0302486419677734, + "learning_rate": 0.0003464040630520451, + "loss": 3.4018, + "step": 76985 + }, + { + "epoch": 5.231009648050007, + "grad_norm": 1.020413875579834, + "learning_rate": 0.0003463615980432124, + "loss": 3.3788, + "step": 76990 + }, + { + "epoch": 5.231349368120669, + "grad_norm": 0.9015278816223145, + "learning_rate": 0.00034631913303437967, + "loss": 3.5581, + "step": 76995 + }, + { + "epoch": 5.23168908819133, + "grad_norm": 0.8782800436019897, + "learning_rate": 0.00034627666802554695, + "loss": 3.4466, + "step": 77000 + }, + { + "epoch": 5.232028808261992, + "grad_norm": 0.8526657223701477, + "learning_rate": 0.00034623420301671423, + "loss": 3.4156, + "step": 77005 + }, + { + "epoch": 5.232368528332654, + "grad_norm": 1.3350859880447388, + "learning_rate": 0.0003461917380078815, + "loss": 3.275, + "step": 77010 + }, + { + "epoch": 5.232708248403315, + "grad_norm": 0.8574503660202026, + "learning_rate": 0.0003461492729990488, + "loss": 3.3705, + "step": 77015 + }, + { + "epoch": 5.2330479684739775, + "grad_norm": 1.023664116859436, + "learning_rate": 0.000346106807990216, + "loss": 3.3802, + "step": 77020 + }, + { + "epoch": 5.2333876885446395, + "grad_norm": 0.9111095070838928, + "learning_rate": 0.00034606434298138335, + "loss": 3.5266, + "step": 77025 + }, + { + "epoch": 5.233727408615301, + "grad_norm": 1.0308899879455566, + "learning_rate": 0.00034602187797255063, + "loss": 3.547, + "step": 77030 + }, + { + "epoch": 5.234067128685963, + "grad_norm": 1.5204520225524902, + "learning_rate": 0.00034597941296371786, + "loss": 3.5495, + "step": 77035 + }, + { + "epoch": 5.234406848756625, + "grad_norm": 0.8277630805969238, + "learning_rate": 0.0003459369479548852, + "loss": 3.4858, + "step": 77040 + }, + { + "epoch": 5.234746568827286, + "grad_norm": 1.011932134628296, + "learning_rate": 0.00034589448294605247, + "loss": 3.3973, + "step": 77045 + }, + { + "epoch": 5.235086288897948, + "grad_norm": 0.8339505791664124, + "learning_rate": 0.00034585201793721975, + "loss": 3.5591, + "step": 77050 + }, + { + "epoch": 5.23542600896861, + "grad_norm": 1.0865505933761597, + "learning_rate": 0.00034580955292838703, + "loss": 3.4137, + "step": 77055 + }, + { + "epoch": 5.235765729039271, + "grad_norm": 1.1691852807998657, + "learning_rate": 0.0003457670879195543, + "loss": 3.3196, + "step": 77060 + }, + { + "epoch": 5.2361054491099335, + "grad_norm": 1.1138931512832642, + "learning_rate": 0.0003457246229107216, + "loss": 3.484, + "step": 77065 + }, + { + "epoch": 5.2364451691805955, + "grad_norm": 0.9311533570289612, + "learning_rate": 0.0003456821579018888, + "loss": 3.5514, + "step": 77070 + }, + { + "epoch": 5.236784889251257, + "grad_norm": 0.8000608682632446, + "learning_rate": 0.00034563969289305615, + "loss": 3.4278, + "step": 77075 + }, + { + "epoch": 5.237124609321919, + "grad_norm": 0.8512917160987854, + "learning_rate": 0.00034559722788422343, + "loss": 3.4647, + "step": 77080 + }, + { + "epoch": 5.237464329392581, + "grad_norm": 1.0472298860549927, + "learning_rate": 0.00034555476287539066, + "loss": 3.6068, + "step": 77085 + }, + { + "epoch": 5.237804049463242, + "grad_norm": 1.7385145425796509, + "learning_rate": 0.000345512297866558, + "loss": 3.4064, + "step": 77090 + }, + { + "epoch": 5.238143769533904, + "grad_norm": 1.0124698877334595, + "learning_rate": 0.00034546983285772527, + "loss": 3.6638, + "step": 77095 + }, + { + "epoch": 5.238483489604566, + "grad_norm": 0.759158194065094, + "learning_rate": 0.0003454273678488925, + "loss": 3.4839, + "step": 77100 + }, + { + "epoch": 5.238823209675227, + "grad_norm": 0.8190465569496155, + "learning_rate": 0.0003453849028400598, + "loss": 3.3904, + "step": 77105 + }, + { + "epoch": 5.2391629297458895, + "grad_norm": 1.044790267944336, + "learning_rate": 0.0003453424378312271, + "loss": 3.4299, + "step": 77110 + }, + { + "epoch": 5.239502649816552, + "grad_norm": 0.9401665925979614, + "learning_rate": 0.00034529997282239434, + "loss": 3.3778, + "step": 77115 + }, + { + "epoch": 5.239842369887213, + "grad_norm": 0.8525695204734802, + "learning_rate": 0.0003452575078135616, + "loss": 3.4953, + "step": 77120 + }, + { + "epoch": 5.240182089957875, + "grad_norm": 0.9375599026679993, + "learning_rate": 0.00034521504280472895, + "loss": 3.3271, + "step": 77125 + }, + { + "epoch": 5.240521810028537, + "grad_norm": 1.37141752243042, + "learning_rate": 0.0003451725777958962, + "loss": 3.5444, + "step": 77130 + }, + { + "epoch": 5.240861530099198, + "grad_norm": 1.1798548698425293, + "learning_rate": 0.00034513011278706346, + "loss": 3.4497, + "step": 77135 + }, + { + "epoch": 5.24120125016986, + "grad_norm": 0.9912571310997009, + "learning_rate": 0.00034508764777823074, + "loss": 3.622, + "step": 77140 + }, + { + "epoch": 5.241540970240522, + "grad_norm": 0.9016405344009399, + "learning_rate": 0.000345045182769398, + "loss": 3.4281, + "step": 77145 + }, + { + "epoch": 5.241880690311183, + "grad_norm": 0.8471721410751343, + "learning_rate": 0.0003450027177605653, + "loss": 3.593, + "step": 77150 + }, + { + "epoch": 5.2422204103818455, + "grad_norm": 0.790213942527771, + "learning_rate": 0.0003449602527517326, + "loss": 3.3862, + "step": 77155 + }, + { + "epoch": 5.242560130452507, + "grad_norm": 1.419467568397522, + "learning_rate": 0.00034491778774289986, + "loss": 3.3953, + "step": 77160 + }, + { + "epoch": 5.242899850523169, + "grad_norm": 4.708017826080322, + "learning_rate": 0.00034487532273406714, + "loss": 3.5286, + "step": 77165 + }, + { + "epoch": 5.243239570593831, + "grad_norm": 0.899353563785553, + "learning_rate": 0.0003448328577252344, + "loss": 3.5834, + "step": 77170 + }, + { + "epoch": 5.243579290664492, + "grad_norm": 1.2048900127410889, + "learning_rate": 0.00034479039271640165, + "loss": 3.3049, + "step": 77175 + }, + { + "epoch": 5.243919010735154, + "grad_norm": 1.023018479347229, + "learning_rate": 0.000344747927707569, + "loss": 3.491, + "step": 77180 + }, + { + "epoch": 5.244258730805816, + "grad_norm": 0.9381595849990845, + "learning_rate": 0.00034470546269873626, + "loss": 3.3962, + "step": 77185 + }, + { + "epoch": 5.244598450876477, + "grad_norm": 0.9759712815284729, + "learning_rate": 0.0003446629976899035, + "loss": 3.1223, + "step": 77190 + }, + { + "epoch": 5.244938170947139, + "grad_norm": 0.9797585606575012, + "learning_rate": 0.0003446205326810708, + "loss": 3.6612, + "step": 77195 + }, + { + "epoch": 5.2452778910178015, + "grad_norm": 0.7988614439964294, + "learning_rate": 0.0003445780676722381, + "loss": 3.4737, + "step": 77200 + }, + { + "epoch": 5.245617611088463, + "grad_norm": 0.9903556108474731, + "learning_rate": 0.0003445356026634053, + "loss": 3.5872, + "step": 77205 + }, + { + "epoch": 5.245957331159125, + "grad_norm": 0.9549646377563477, + "learning_rate": 0.0003444931376545726, + "loss": 3.3148, + "step": 77210 + }, + { + "epoch": 5.246297051229787, + "grad_norm": 0.9528343081474304, + "learning_rate": 0.00034445067264573994, + "loss": 3.326, + "step": 77215 + }, + { + "epoch": 5.246636771300448, + "grad_norm": 1.0039889812469482, + "learning_rate": 0.0003444082076369072, + "loss": 3.5785, + "step": 77220 + }, + { + "epoch": 5.24697649137111, + "grad_norm": 1.0323238372802734, + "learning_rate": 0.00034436574262807445, + "loss": 3.3208, + "step": 77225 + }, + { + "epoch": 5.247316211441772, + "grad_norm": 1.3218953609466553, + "learning_rate": 0.0003443232776192418, + "loss": 3.5493, + "step": 77230 + }, + { + "epoch": 5.247655931512433, + "grad_norm": 0.8242223858833313, + "learning_rate": 0.00034428081261040906, + "loss": 3.3512, + "step": 77235 + }, + { + "epoch": 5.2479956515830954, + "grad_norm": 1.173876166343689, + "learning_rate": 0.0003442383476015763, + "loss": 3.5831, + "step": 77240 + }, + { + "epoch": 5.2483353716537575, + "grad_norm": 1.0365004539489746, + "learning_rate": 0.00034419588259274357, + "loss": 3.5955, + "step": 77245 + }, + { + "epoch": 5.248675091724419, + "grad_norm": 0.9519453048706055, + "learning_rate": 0.0003441534175839109, + "loss": 3.4835, + "step": 77250 + }, + { + "epoch": 5.249014811795081, + "grad_norm": 0.9389734268188477, + "learning_rate": 0.0003441109525750781, + "loss": 3.2457, + "step": 77255 + }, + { + "epoch": 5.249354531865743, + "grad_norm": 0.7807303667068481, + "learning_rate": 0.0003440684875662454, + "loss": 3.5083, + "step": 77260 + }, + { + "epoch": 5.249694251936404, + "grad_norm": 1.0312633514404297, + "learning_rate": 0.00034402602255741274, + "loss": 3.5344, + "step": 77265 + }, + { + "epoch": 5.250033972007066, + "grad_norm": 0.9758066534996033, + "learning_rate": 0.00034398355754857997, + "loss": 3.3804, + "step": 77270 + }, + { + "epoch": 5.250373692077728, + "grad_norm": 0.978251039981842, + "learning_rate": 0.00034394109253974725, + "loss": 3.4509, + "step": 77275 + }, + { + "epoch": 5.250713412148389, + "grad_norm": 0.937860906124115, + "learning_rate": 0.0003438986275309146, + "loss": 3.6064, + "step": 77280 + }, + { + "epoch": 5.2510531322190515, + "grad_norm": 1.0998567342758179, + "learning_rate": 0.0003438561625220818, + "loss": 3.4544, + "step": 77285 + }, + { + "epoch": 5.2513928522897135, + "grad_norm": 0.7865886688232422, + "learning_rate": 0.0003438136975132491, + "loss": 3.4571, + "step": 77290 + }, + { + "epoch": 5.251732572360375, + "grad_norm": 0.8413904309272766, + "learning_rate": 0.00034377123250441637, + "loss": 3.4124, + "step": 77295 + }, + { + "epoch": 5.252072292431037, + "grad_norm": 0.997648298740387, + "learning_rate": 0.00034372876749558365, + "loss": 3.1539, + "step": 77300 + }, + { + "epoch": 5.252412012501699, + "grad_norm": 0.7821124196052551, + "learning_rate": 0.00034368630248675093, + "loss": 3.404, + "step": 77305 + }, + { + "epoch": 5.25275173257236, + "grad_norm": 0.982426106929779, + "learning_rate": 0.0003436438374779182, + "loss": 3.5464, + "step": 77310 + }, + { + "epoch": 5.253091452643022, + "grad_norm": 0.7602556347846985, + "learning_rate": 0.00034360137246908543, + "loss": 3.5318, + "step": 77315 + }, + { + "epoch": 5.253431172713684, + "grad_norm": 0.8127565979957581, + "learning_rate": 0.00034355890746025277, + "loss": 3.3794, + "step": 77320 + }, + { + "epoch": 5.253770892784345, + "grad_norm": 0.8601808547973633, + "learning_rate": 0.00034351644245142005, + "loss": 3.6449, + "step": 77325 + }, + { + "epoch": 5.2541106128550075, + "grad_norm": 0.9603238105773926, + "learning_rate": 0.0003434739774425873, + "loss": 3.4579, + "step": 77330 + }, + { + "epoch": 5.2544503329256695, + "grad_norm": 0.8392713069915771, + "learning_rate": 0.0003434315124337546, + "loss": 3.4735, + "step": 77335 + }, + { + "epoch": 5.254790052996331, + "grad_norm": 0.6616888642311096, + "learning_rate": 0.0003433890474249219, + "loss": 3.5723, + "step": 77340 + }, + { + "epoch": 5.255129773066993, + "grad_norm": 0.8381593823432922, + "learning_rate": 0.0003433465824160891, + "loss": 3.6266, + "step": 77345 + }, + { + "epoch": 5.255469493137655, + "grad_norm": 0.9013186097145081, + "learning_rate": 0.00034330411740725645, + "loss": 3.5225, + "step": 77350 + }, + { + "epoch": 5.255809213208316, + "grad_norm": 0.9206399917602539, + "learning_rate": 0.00034326165239842373, + "loss": 3.4362, + "step": 77355 + }, + { + "epoch": 5.256148933278978, + "grad_norm": 0.9012489914894104, + "learning_rate": 0.00034321918738959095, + "loss": 3.4049, + "step": 77360 + }, + { + "epoch": 5.25648865334964, + "grad_norm": 0.8417568206787109, + "learning_rate": 0.00034317672238075823, + "loss": 3.6158, + "step": 77365 + }, + { + "epoch": 5.256828373420301, + "grad_norm": 0.7164590358734131, + "learning_rate": 0.00034313425737192557, + "loss": 3.6824, + "step": 77370 + }, + { + "epoch": 5.2571680934909635, + "grad_norm": 0.8478497862815857, + "learning_rate": 0.0003430917923630928, + "loss": 3.4573, + "step": 77375 + }, + { + "epoch": 5.2575078135616256, + "grad_norm": 1.1225422620773315, + "learning_rate": 0.0003430493273542601, + "loss": 3.5263, + "step": 77380 + }, + { + "epoch": 5.257847533632287, + "grad_norm": 1.1010841131210327, + "learning_rate": 0.0003430068623454274, + "loss": 3.3588, + "step": 77385 + }, + { + "epoch": 5.258187253702949, + "grad_norm": 0.9214475154876709, + "learning_rate": 0.0003429643973365947, + "loss": 3.5449, + "step": 77390 + }, + { + "epoch": 5.258526973773611, + "grad_norm": 0.9849879741668701, + "learning_rate": 0.0003429219323277619, + "loss": 3.5708, + "step": 77395 + }, + { + "epoch": 5.258866693844272, + "grad_norm": 0.7890661358833313, + "learning_rate": 0.0003428794673189292, + "loss": 3.2855, + "step": 77400 + }, + { + "epoch": 5.259206413914934, + "grad_norm": 0.9314683079719543, + "learning_rate": 0.00034283700231009653, + "loss": 3.5623, + "step": 77405 + }, + { + "epoch": 5.259546133985596, + "grad_norm": 0.7384652495384216, + "learning_rate": 0.00034279453730126375, + "loss": 3.5619, + "step": 77410 + }, + { + "epoch": 5.259885854056257, + "grad_norm": 0.9943201541900635, + "learning_rate": 0.00034275207229243103, + "loss": 3.4696, + "step": 77415 + }, + { + "epoch": 5.2602255741269195, + "grad_norm": 1.0340702533721924, + "learning_rate": 0.00034270960728359837, + "loss": 3.7012, + "step": 77420 + }, + { + "epoch": 5.260565294197582, + "grad_norm": 1.1956580877304077, + "learning_rate": 0.0003426671422747656, + "loss": 3.2982, + "step": 77425 + }, + { + "epoch": 5.260905014268243, + "grad_norm": 1.1408084630966187, + "learning_rate": 0.0003426246772659329, + "loss": 3.5638, + "step": 77430 + }, + { + "epoch": 5.261244734338905, + "grad_norm": 1.0091361999511719, + "learning_rate": 0.00034258221225710016, + "loss": 3.4856, + "step": 77435 + }, + { + "epoch": 5.261584454409567, + "grad_norm": 1.6551138162612915, + "learning_rate": 0.00034253974724826744, + "loss": 3.5529, + "step": 77440 + }, + { + "epoch": 5.261924174480228, + "grad_norm": 1.3936779499053955, + "learning_rate": 0.0003424972822394347, + "loss": 3.392, + "step": 77445 + }, + { + "epoch": 5.26226389455089, + "grad_norm": 0.9671104550361633, + "learning_rate": 0.000342454817230602, + "loss": 3.4088, + "step": 77450 + }, + { + "epoch": 5.262603614621552, + "grad_norm": 1.1720342636108398, + "learning_rate": 0.0003424123522217693, + "loss": 3.5177, + "step": 77455 + }, + { + "epoch": 5.262943334692213, + "grad_norm": 1.009598970413208, + "learning_rate": 0.00034236988721293656, + "loss": 3.5083, + "step": 77460 + }, + { + "epoch": 5.2632830547628755, + "grad_norm": 0.7564134001731873, + "learning_rate": 0.00034232742220410384, + "loss": 3.6063, + "step": 77465 + }, + { + "epoch": 5.263622774833538, + "grad_norm": 1.0341084003448486, + "learning_rate": 0.00034228495719527106, + "loss": 3.2952, + "step": 77470 + }, + { + "epoch": 5.263962494904199, + "grad_norm": 1.2899399995803833, + "learning_rate": 0.0003422424921864384, + "loss": 3.7059, + "step": 77475 + }, + { + "epoch": 5.264302214974861, + "grad_norm": 1.1316176652908325, + "learning_rate": 0.0003422000271776057, + "loss": 3.5913, + "step": 77480 + }, + { + "epoch": 5.264641935045523, + "grad_norm": 0.8020249605178833, + "learning_rate": 0.0003421575621687729, + "loss": 3.3605, + "step": 77485 + }, + { + "epoch": 5.264981655116184, + "grad_norm": 0.7822467684745789, + "learning_rate": 0.00034211509715994024, + "loss": 3.3426, + "step": 77490 + }, + { + "epoch": 5.265321375186846, + "grad_norm": 0.9902470111846924, + "learning_rate": 0.0003420726321511075, + "loss": 3.3789, + "step": 77495 + }, + { + "epoch": 5.265661095257508, + "grad_norm": 0.8548924922943115, + "learning_rate": 0.00034203016714227474, + "loss": 3.599, + "step": 77500 + }, + { + "epoch": 5.266000815328169, + "grad_norm": 0.9187685251235962, + "learning_rate": 0.000341987702133442, + "loss": 3.4624, + "step": 77505 + }, + { + "epoch": 5.2663405353988315, + "grad_norm": 0.841669499874115, + "learning_rate": 0.00034194523712460936, + "loss": 3.4062, + "step": 77510 + }, + { + "epoch": 5.266680255469494, + "grad_norm": 0.8491201400756836, + "learning_rate": 0.0003419027721157766, + "loss": 3.5345, + "step": 77515 + }, + { + "epoch": 5.267019975540155, + "grad_norm": 1.4098798036575317, + "learning_rate": 0.00034186030710694386, + "loss": 3.5526, + "step": 77520 + }, + { + "epoch": 5.267359695610817, + "grad_norm": 0.7796936631202698, + "learning_rate": 0.0003418178420981112, + "loss": 3.4492, + "step": 77525 + }, + { + "epoch": 5.267699415681479, + "grad_norm": 1.135048270225525, + "learning_rate": 0.0003417753770892784, + "loss": 3.3257, + "step": 77530 + }, + { + "epoch": 5.26803913575214, + "grad_norm": 1.084131121635437, + "learning_rate": 0.0003417329120804457, + "loss": 3.4288, + "step": 77535 + }, + { + "epoch": 5.268378855822802, + "grad_norm": 0.9565548896789551, + "learning_rate": 0.000341690447071613, + "loss": 3.4354, + "step": 77540 + }, + { + "epoch": 5.268718575893463, + "grad_norm": 0.7867613434791565, + "learning_rate": 0.00034164798206278026, + "loss": 3.4915, + "step": 77545 + }, + { + "epoch": 5.2690582959641254, + "grad_norm": 0.8224363327026367, + "learning_rate": 0.00034160551705394754, + "loss": 3.4744, + "step": 77550 + }, + { + "epoch": 5.2693980160347875, + "grad_norm": 0.8810397386550903, + "learning_rate": 0.0003415630520451148, + "loss": 3.373, + "step": 77555 + }, + { + "epoch": 5.269737736105449, + "grad_norm": 0.818778395652771, + "learning_rate": 0.00034152058703628216, + "loss": 3.4015, + "step": 77560 + }, + { + "epoch": 5.270077456176111, + "grad_norm": 0.9685967564582825, + "learning_rate": 0.0003414781220274494, + "loss": 3.6615, + "step": 77565 + }, + { + "epoch": 5.270417176246773, + "grad_norm": 0.8377653956413269, + "learning_rate": 0.00034143565701861666, + "loss": 3.6177, + "step": 77570 + }, + { + "epoch": 5.270756896317434, + "grad_norm": 0.8556618690490723, + "learning_rate": 0.000341393192009784, + "loss": 3.7659, + "step": 77575 + }, + { + "epoch": 5.271096616388096, + "grad_norm": 1.1136298179626465, + "learning_rate": 0.0003413507270009512, + "loss": 3.4517, + "step": 77580 + }, + { + "epoch": 5.271436336458758, + "grad_norm": 0.9531322121620178, + "learning_rate": 0.0003413082619921185, + "loss": 3.4269, + "step": 77585 + }, + { + "epoch": 5.271776056529419, + "grad_norm": 0.7756415605545044, + "learning_rate": 0.0003412657969832858, + "loss": 3.1834, + "step": 77590 + }, + { + "epoch": 5.2721157766000815, + "grad_norm": 0.8196192383766174, + "learning_rate": 0.00034122333197445306, + "loss": 3.6032, + "step": 77595 + }, + { + "epoch": 5.2724554966707435, + "grad_norm": 0.9683305025100708, + "learning_rate": 0.00034118086696562034, + "loss": 3.5046, + "step": 77600 + }, + { + "epoch": 5.272795216741405, + "grad_norm": 0.9356143474578857, + "learning_rate": 0.0003411384019567876, + "loss": 3.3129, + "step": 77605 + }, + { + "epoch": 5.273134936812067, + "grad_norm": 1.1505295038223267, + "learning_rate": 0.0003410959369479549, + "loss": 3.3694, + "step": 77610 + }, + { + "epoch": 5.273474656882729, + "grad_norm": 0.8563144207000732, + "learning_rate": 0.0003410534719391222, + "loss": 3.5046, + "step": 77615 + }, + { + "epoch": 5.27381437695339, + "grad_norm": 0.9647845029830933, + "learning_rate": 0.00034101100693028946, + "loss": 3.5461, + "step": 77620 + }, + { + "epoch": 5.274154097024052, + "grad_norm": 1.1294902563095093, + "learning_rate": 0.0003409685419214567, + "loss": 3.3653, + "step": 77625 + }, + { + "epoch": 5.274493817094714, + "grad_norm": 0.8563801050186157, + "learning_rate": 0.000340926076912624, + "loss": 3.523, + "step": 77630 + }, + { + "epoch": 5.274833537165375, + "grad_norm": 0.9082407355308533, + "learning_rate": 0.0003408836119037913, + "loss": 3.5993, + "step": 77635 + }, + { + "epoch": 5.2751732572360375, + "grad_norm": 0.8560264706611633, + "learning_rate": 0.00034084114689495853, + "loss": 3.4995, + "step": 77640 + }, + { + "epoch": 5.2755129773066995, + "grad_norm": 0.88304603099823, + "learning_rate": 0.00034079868188612586, + "loss": 3.3728, + "step": 77645 + }, + { + "epoch": 5.275852697377361, + "grad_norm": 1.0334898233413696, + "learning_rate": 0.00034075621687729314, + "loss": 3.3861, + "step": 77650 + }, + { + "epoch": 5.276192417448023, + "grad_norm": 0.8470972180366516, + "learning_rate": 0.00034071375186846037, + "loss": 3.4206, + "step": 77655 + }, + { + "epoch": 5.276532137518685, + "grad_norm": 0.9669126272201538, + "learning_rate": 0.00034067128685962765, + "loss": 3.5531, + "step": 77660 + }, + { + "epoch": 5.276871857589346, + "grad_norm": 1.0127102136611938, + "learning_rate": 0.000340628821850795, + "loss": 3.3182, + "step": 77665 + }, + { + "epoch": 5.277211577660008, + "grad_norm": 0.8244888186454773, + "learning_rate": 0.0003405863568419622, + "loss": 3.5884, + "step": 77670 + }, + { + "epoch": 5.27755129773067, + "grad_norm": 2.008202075958252, + "learning_rate": 0.0003405438918331295, + "loss": 3.2762, + "step": 77675 + }, + { + "epoch": 5.277891017801331, + "grad_norm": 1.0937964916229248, + "learning_rate": 0.0003405014268242968, + "loss": 3.1554, + "step": 77680 + }, + { + "epoch": 5.2782307378719935, + "grad_norm": 0.8983838558197021, + "learning_rate": 0.00034045896181546405, + "loss": 3.5146, + "step": 77685 + }, + { + "epoch": 5.278570457942656, + "grad_norm": 1.0878580808639526, + "learning_rate": 0.00034041649680663133, + "loss": 3.3844, + "step": 77690 + }, + { + "epoch": 5.278910178013317, + "grad_norm": 0.9766399264335632, + "learning_rate": 0.0003403740317977986, + "loss": 3.629, + "step": 77695 + }, + { + "epoch": 5.279249898083979, + "grad_norm": 1.190747618675232, + "learning_rate": 0.0003403315667889659, + "loss": 3.252, + "step": 77700 + }, + { + "epoch": 5.279589618154641, + "grad_norm": 0.8323453068733215, + "learning_rate": 0.00034028910178013317, + "loss": 3.7146, + "step": 77705 + }, + { + "epoch": 5.279929338225302, + "grad_norm": 1.0267424583435059, + "learning_rate": 0.00034024663677130045, + "loss": 3.2103, + "step": 77710 + }, + { + "epoch": 5.280269058295964, + "grad_norm": 0.852499783039093, + "learning_rate": 0.00034020417176246773, + "loss": 3.4258, + "step": 77715 + }, + { + "epoch": 5.280608778366626, + "grad_norm": 1.014878511428833, + "learning_rate": 0.000340161706753635, + "loss": 3.4758, + "step": 77720 + }, + { + "epoch": 5.280948498437287, + "grad_norm": 1.0260480642318726, + "learning_rate": 0.0003401192417448023, + "loss": 3.5203, + "step": 77725 + }, + { + "epoch": 5.2812882185079495, + "grad_norm": 1.038825273513794, + "learning_rate": 0.00034007677673596957, + "loss": 3.3704, + "step": 77730 + }, + { + "epoch": 5.281627938578612, + "grad_norm": 0.9486488103866577, + "learning_rate": 0.00034003431172713685, + "loss": 3.576, + "step": 77735 + }, + { + "epoch": 5.281967658649273, + "grad_norm": 0.9681752324104309, + "learning_rate": 0.00033999184671830413, + "loss": 3.4043, + "step": 77740 + }, + { + "epoch": 5.282307378719935, + "grad_norm": 1.0173265933990479, + "learning_rate": 0.0003399493817094714, + "loss": 3.4377, + "step": 77745 + }, + { + "epoch": 5.282647098790597, + "grad_norm": 0.7983232140541077, + "learning_rate": 0.0003399069167006387, + "loss": 3.2282, + "step": 77750 + }, + { + "epoch": 5.282986818861258, + "grad_norm": 0.819148063659668, + "learning_rate": 0.00033986445169180597, + "loss": 3.5411, + "step": 77755 + }, + { + "epoch": 5.28332653893192, + "grad_norm": 1.0374865531921387, + "learning_rate": 0.00033982198668297325, + "loss": 3.5313, + "step": 77760 + }, + { + "epoch": 5.283666259002582, + "grad_norm": 0.8209851980209351, + "learning_rate": 0.0003397795216741405, + "loss": 2.9935, + "step": 77765 + }, + { + "epoch": 5.284005979073243, + "grad_norm": 0.855811357498169, + "learning_rate": 0.0003397370566653078, + "loss": 3.4552, + "step": 77770 + }, + { + "epoch": 5.2843456991439055, + "grad_norm": 0.9454516768455505, + "learning_rate": 0.0003396945916564751, + "loss": 3.504, + "step": 77775 + }, + { + "epoch": 5.284685419214568, + "grad_norm": 1.0162608623504639, + "learning_rate": 0.0003396521266476423, + "loss": 3.2887, + "step": 77780 + }, + { + "epoch": 5.285025139285229, + "grad_norm": 1.0283983945846558, + "learning_rate": 0.00033960966163880965, + "loss": 3.4291, + "step": 77785 + }, + { + "epoch": 5.285364859355891, + "grad_norm": 0.9946487545967102, + "learning_rate": 0.00033956719662997693, + "loss": 3.5485, + "step": 77790 + }, + { + "epoch": 5.285704579426553, + "grad_norm": 1.018020749092102, + "learning_rate": 0.00033952473162114416, + "loss": 3.3561, + "step": 77795 + }, + { + "epoch": 5.286044299497214, + "grad_norm": 0.9395130276679993, + "learning_rate": 0.00033948226661231144, + "loss": 3.3451, + "step": 77800 + }, + { + "epoch": 5.286384019567876, + "grad_norm": 0.8248791694641113, + "learning_rate": 0.00033943980160347877, + "loss": 3.5081, + "step": 77805 + }, + { + "epoch": 5.286723739638538, + "grad_norm": 0.9247918128967285, + "learning_rate": 0.000339397336594646, + "loss": 3.5172, + "step": 77810 + }, + { + "epoch": 5.287063459709199, + "grad_norm": 1.033839225769043, + "learning_rate": 0.0003393548715858133, + "loss": 3.4182, + "step": 77815 + }, + { + "epoch": 5.2874031797798615, + "grad_norm": 0.9611290097236633, + "learning_rate": 0.0003393124065769806, + "loss": 3.491, + "step": 77820 + }, + { + "epoch": 5.287742899850523, + "grad_norm": 0.8374946713447571, + "learning_rate": 0.00033926994156814784, + "loss": 3.5437, + "step": 77825 + }, + { + "epoch": 5.288082619921185, + "grad_norm": 0.8906586766242981, + "learning_rate": 0.0003392274765593151, + "loss": 3.2555, + "step": 77830 + }, + { + "epoch": 5.288422339991847, + "grad_norm": 0.9277933239936829, + "learning_rate": 0.0003391850115504824, + "loss": 3.4711, + "step": 77835 + }, + { + "epoch": 5.288762060062508, + "grad_norm": 0.8573673367500305, + "learning_rate": 0.0003391425465416497, + "loss": 3.7297, + "step": 77840 + }, + { + "epoch": 5.28910178013317, + "grad_norm": 1.0617949962615967, + "learning_rate": 0.00033910008153281696, + "loss": 3.4283, + "step": 77845 + }, + { + "epoch": 5.289441500203832, + "grad_norm": 0.997910737991333, + "learning_rate": 0.00033905761652398424, + "loss": 3.3728, + "step": 77850 + }, + { + "epoch": 5.289781220274493, + "grad_norm": 1.1394004821777344, + "learning_rate": 0.0003390151515151515, + "loss": 3.5479, + "step": 77855 + }, + { + "epoch": 5.2901209403451555, + "grad_norm": 0.8915480971336365, + "learning_rate": 0.0003389726865063188, + "loss": 3.5544, + "step": 77860 + }, + { + "epoch": 5.2904606604158175, + "grad_norm": 0.9711739420890808, + "learning_rate": 0.0003389302214974861, + "loss": 3.2986, + "step": 77865 + }, + { + "epoch": 5.290800380486479, + "grad_norm": 0.6624096035957336, + "learning_rate": 0.0003388877564886533, + "loss": 3.3391, + "step": 77870 + }, + { + "epoch": 5.291140100557141, + "grad_norm": 0.8566524982452393, + "learning_rate": 0.00033884529147982064, + "loss": 3.708, + "step": 77875 + }, + { + "epoch": 5.291479820627803, + "grad_norm": 1.1753623485565186, + "learning_rate": 0.0003388028264709879, + "loss": 3.2751, + "step": 77880 + }, + { + "epoch": 5.291819540698464, + "grad_norm": 0.764839768409729, + "learning_rate": 0.00033876036146215515, + "loss": 3.3728, + "step": 77885 + }, + { + "epoch": 5.292159260769126, + "grad_norm": 0.980332612991333, + "learning_rate": 0.0003387178964533225, + "loss": 3.5351, + "step": 77890 + }, + { + "epoch": 5.292498980839788, + "grad_norm": 1.048262357711792, + "learning_rate": 0.00033867543144448976, + "loss": 3.4116, + "step": 77895 + }, + { + "epoch": 5.292838700910449, + "grad_norm": 0.9497652053833008, + "learning_rate": 0.00033863296643565704, + "loss": 3.4195, + "step": 77900 + }, + { + "epoch": 5.2931784209811115, + "grad_norm": 1.4416178464889526, + "learning_rate": 0.0003385905014268243, + "loss": 3.3018, + "step": 77905 + }, + { + "epoch": 5.2935181410517735, + "grad_norm": 1.1065051555633545, + "learning_rate": 0.0003385480364179916, + "loss": 3.3928, + "step": 77910 + }, + { + "epoch": 5.293857861122435, + "grad_norm": 0.9605606198310852, + "learning_rate": 0.0003385055714091589, + "loss": 3.4984, + "step": 77915 + }, + { + "epoch": 5.294197581193097, + "grad_norm": 1.1090437173843384, + "learning_rate": 0.0003384631064003261, + "loss": 3.3957, + "step": 77920 + }, + { + "epoch": 5.294537301263759, + "grad_norm": 0.9717448949813843, + "learning_rate": 0.00033842064139149344, + "loss": 3.2781, + "step": 77925 + }, + { + "epoch": 5.29487702133442, + "grad_norm": 0.9759958386421204, + "learning_rate": 0.0003383781763826607, + "loss": 3.4287, + "step": 77930 + }, + { + "epoch": 5.295216741405082, + "grad_norm": 0.8043806552886963, + "learning_rate": 0.00033833571137382795, + "loss": 3.6856, + "step": 77935 + }, + { + "epoch": 5.295556461475744, + "grad_norm": 1.0052684545516968, + "learning_rate": 0.0003382932463649953, + "loss": 3.3423, + "step": 77940 + }, + { + "epoch": 5.295896181546405, + "grad_norm": 1.0062370300292969, + "learning_rate": 0.00033825078135616256, + "loss": 3.5298, + "step": 77945 + }, + { + "epoch": 5.2962359016170675, + "grad_norm": 0.780600905418396, + "learning_rate": 0.0003382083163473298, + "loss": 3.6105, + "step": 77950 + }, + { + "epoch": 5.2965756216877296, + "grad_norm": 0.8797338604927063, + "learning_rate": 0.00033816585133849707, + "loss": 3.5322, + "step": 77955 + }, + { + "epoch": 5.296915341758391, + "grad_norm": 0.8921968340873718, + "learning_rate": 0.0003381233863296644, + "loss": 3.3453, + "step": 77960 + }, + { + "epoch": 5.297255061829053, + "grad_norm": 0.9763129949569702, + "learning_rate": 0.0003380809213208316, + "loss": 3.4308, + "step": 77965 + }, + { + "epoch": 5.297594781899715, + "grad_norm": 0.8423880338668823, + "learning_rate": 0.0003380384563119989, + "loss": 3.4529, + "step": 77970 + }, + { + "epoch": 5.297934501970376, + "grad_norm": 1.0107553005218506, + "learning_rate": 0.00033799599130316624, + "loss": 3.3699, + "step": 77975 + }, + { + "epoch": 5.298274222041038, + "grad_norm": 0.9197039604187012, + "learning_rate": 0.00033795352629433347, + "loss": 3.7275, + "step": 77980 + }, + { + "epoch": 5.2986139421117, + "grad_norm": 0.8877140283584595, + "learning_rate": 0.00033791106128550075, + "loss": 3.4336, + "step": 77985 + }, + { + "epoch": 5.298953662182361, + "grad_norm": 0.7776240110397339, + "learning_rate": 0.000337868596276668, + "loss": 3.4861, + "step": 77990 + }, + { + "epoch": 5.2992933822530235, + "grad_norm": 1.3042383193969727, + "learning_rate": 0.0003378261312678353, + "loss": 3.2738, + "step": 77995 + }, + { + "epoch": 5.299633102323686, + "grad_norm": 1.7553373575210571, + "learning_rate": 0.0003377836662590026, + "loss": 3.3622, + "step": 78000 + }, + { + "epoch": 5.299972822394347, + "grad_norm": 1.4395217895507812, + "learning_rate": 0.00033774120125016987, + "loss": 3.3614, + "step": 78005 + }, + { + "epoch": 5.300312542465009, + "grad_norm": 0.9228554964065552, + "learning_rate": 0.00033769873624133715, + "loss": 3.179, + "step": 78010 + }, + { + "epoch": 5.300652262535671, + "grad_norm": 1.0710654258728027, + "learning_rate": 0.00033765627123250443, + "loss": 3.5226, + "step": 78015 + }, + { + "epoch": 5.300991982606332, + "grad_norm": 0.9453474879264832, + "learning_rate": 0.0003376138062236717, + "loss": 3.3552, + "step": 78020 + }, + { + "epoch": 5.301331702676994, + "grad_norm": 1.018923044204712, + "learning_rate": 0.00033757134121483893, + "loss": 3.4294, + "step": 78025 + }, + { + "epoch": 5.301671422747656, + "grad_norm": 0.8575799465179443, + "learning_rate": 0.00033752887620600627, + "loss": 3.3757, + "step": 78030 + }, + { + "epoch": 5.302011142818317, + "grad_norm": 1.2365413904190063, + "learning_rate": 0.00033748641119717355, + "loss": 3.5129, + "step": 78035 + }, + { + "epoch": 5.3023508628889795, + "grad_norm": 0.8244345784187317, + "learning_rate": 0.0003374439461883408, + "loss": 3.4893, + "step": 78040 + }, + { + "epoch": 5.302690582959642, + "grad_norm": 0.7201393842697144, + "learning_rate": 0.0003374014811795081, + "loss": 3.5262, + "step": 78045 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 1.2410898208618164, + "learning_rate": 0.0003373590161706754, + "loss": 3.2518, + "step": 78050 + }, + { + "epoch": 5.303370023100965, + "grad_norm": 1.19711434841156, + "learning_rate": 0.0003373165511618426, + "loss": 3.6069, + "step": 78055 + }, + { + "epoch": 5.303709743171627, + "grad_norm": 0.9960489273071289, + "learning_rate": 0.0003372740861530099, + "loss": 3.4211, + "step": 78060 + }, + { + "epoch": 5.304049463242288, + "grad_norm": 0.9364823698997498, + "learning_rate": 0.00033723162114417723, + "loss": 3.6464, + "step": 78065 + }, + { + "epoch": 5.30438918331295, + "grad_norm": 0.9833688735961914, + "learning_rate": 0.0003371891561353445, + "loss": 3.3518, + "step": 78070 + }, + { + "epoch": 5.304728903383612, + "grad_norm": 1.2660419940948486, + "learning_rate": 0.00033714669112651173, + "loss": 3.267, + "step": 78075 + }, + { + "epoch": 5.305068623454273, + "grad_norm": 1.4324074983596802, + "learning_rate": 0.00033710422611767907, + "loss": 3.5646, + "step": 78080 + }, + { + "epoch": 5.3054083435249355, + "grad_norm": 0.9092172384262085, + "learning_rate": 0.00033706176110884635, + "loss": 3.2636, + "step": 78085 + }, + { + "epoch": 5.305748063595598, + "grad_norm": 0.9912005662918091, + "learning_rate": 0.0003370192961000136, + "loss": 3.4694, + "step": 78090 + }, + { + "epoch": 5.306087783666259, + "grad_norm": 0.8318581581115723, + "learning_rate": 0.00033697683109118085, + "loss": 3.5408, + "step": 78095 + }, + { + "epoch": 5.306427503736921, + "grad_norm": 0.9152544736862183, + "learning_rate": 0.0003369343660823482, + "loss": 3.2865, + "step": 78100 + }, + { + "epoch": 5.306767223807583, + "grad_norm": 0.7828632593154907, + "learning_rate": 0.0003368919010735154, + "loss": 3.6311, + "step": 78105 + }, + { + "epoch": 5.307106943878244, + "grad_norm": 1.2817453145980835, + "learning_rate": 0.0003368494360646827, + "loss": 3.8332, + "step": 78110 + }, + { + "epoch": 5.307446663948906, + "grad_norm": 0.7643349766731262, + "learning_rate": 0.00033680697105585003, + "loss": 3.6305, + "step": 78115 + }, + { + "epoch": 5.307786384019568, + "grad_norm": 0.7174769043922424, + "learning_rate": 0.00033676450604701725, + "loss": 3.3231, + "step": 78120 + }, + { + "epoch": 5.3081261040902294, + "grad_norm": 0.7830010652542114, + "learning_rate": 0.00033672204103818453, + "loss": 3.2419, + "step": 78125 + }, + { + "epoch": 5.3084658241608915, + "grad_norm": 0.9582051038742065, + "learning_rate": 0.0003366795760293518, + "loss": 3.621, + "step": 78130 + }, + { + "epoch": 5.308805544231554, + "grad_norm": 0.795814037322998, + "learning_rate": 0.0003366371110205191, + "loss": 3.2449, + "step": 78135 + }, + { + "epoch": 5.309145264302215, + "grad_norm": 0.7611916065216064, + "learning_rate": 0.0003365946460116864, + "loss": 3.3537, + "step": 78140 + }, + { + "epoch": 5.309484984372877, + "grad_norm": 1.138769268989563, + "learning_rate": 0.00033655218100285365, + "loss": 3.2362, + "step": 78145 + }, + { + "epoch": 5.309824704443539, + "grad_norm": 1.124953269958496, + "learning_rate": 0.00033650971599402094, + "loss": 3.5712, + "step": 78150 + }, + { + "epoch": 5.3101644245142, + "grad_norm": 0.8033455610275269, + "learning_rate": 0.0003364672509851882, + "loss": 3.5699, + "step": 78155 + }, + { + "epoch": 5.310504144584862, + "grad_norm": 0.890675961971283, + "learning_rate": 0.0003364247859763555, + "loss": 3.5521, + "step": 78160 + }, + { + "epoch": 5.310843864655524, + "grad_norm": 0.7583604454994202, + "learning_rate": 0.0003363823209675227, + "loss": 3.8427, + "step": 78165 + }, + { + "epoch": 5.3111835847261855, + "grad_norm": 2.3161354064941406, + "learning_rate": 0.00033633985595869006, + "loss": 3.3898, + "step": 78170 + }, + { + "epoch": 5.3115233047968475, + "grad_norm": 0.8743608593940735, + "learning_rate": 0.00033629739094985734, + "loss": 3.6014, + "step": 78175 + }, + { + "epoch": 5.31186302486751, + "grad_norm": 1.187766671180725, + "learning_rate": 0.00033625492594102456, + "loss": 3.8347, + "step": 78180 + }, + { + "epoch": 5.312202744938171, + "grad_norm": 0.7637351155281067, + "learning_rate": 0.0003362124609321919, + "loss": 3.0764, + "step": 78185 + }, + { + "epoch": 5.312542465008833, + "grad_norm": 0.869805634021759, + "learning_rate": 0.0003361699959233592, + "loss": 3.2322, + "step": 78190 + }, + { + "epoch": 5.312882185079495, + "grad_norm": 0.8887575268745422, + "learning_rate": 0.0003361275309145264, + "loss": 3.3648, + "step": 78195 + }, + { + "epoch": 5.313221905150156, + "grad_norm": 0.9522185325622559, + "learning_rate": 0.00033608506590569374, + "loss": 3.5246, + "step": 78200 + }, + { + "epoch": 5.313561625220818, + "grad_norm": NaN, + "learning_rate": 0.00033605109389862756, + "loss": 3.7893, + "step": 78205 + }, + { + "epoch": 5.31390134529148, + "grad_norm": 1.0350196361541748, + "learning_rate": 0.00033600862888979484, + "loss": 3.5861, + "step": 78210 + }, + { + "epoch": 5.3142410653621415, + "grad_norm": 1.2102874517440796, + "learning_rate": 0.00033596616388096207, + "loss": 3.2102, + "step": 78215 + }, + { + "epoch": 5.3145807854328035, + "grad_norm": 0.8044596910476685, + "learning_rate": 0.00033592369887212935, + "loss": 3.4944, + "step": 78220 + }, + { + "epoch": 5.314920505503465, + "grad_norm": 1.3589829206466675, + "learning_rate": 0.0003358812338632967, + "loss": 3.6346, + "step": 78225 + }, + { + "epoch": 5.315260225574127, + "grad_norm": 1.163985252380371, + "learning_rate": 0.0003358387688544639, + "loss": 3.4381, + "step": 78230 + }, + { + "epoch": 5.315599945644789, + "grad_norm": 0.8058573603630066, + "learning_rate": 0.0003357963038456312, + "loss": 3.2557, + "step": 78235 + }, + { + "epoch": 5.31593966571545, + "grad_norm": 0.8002357482910156, + "learning_rate": 0.0003357538388367985, + "loss": 3.6057, + "step": 78240 + }, + { + "epoch": 5.316279385786112, + "grad_norm": 0.8772565722465515, + "learning_rate": 0.00033571137382796575, + "loss": 3.4565, + "step": 78245 + }, + { + "epoch": 5.316619105856774, + "grad_norm": 0.9587078094482422, + "learning_rate": 0.000335668908819133, + "loss": 3.3946, + "step": 78250 + }, + { + "epoch": 5.316958825927435, + "grad_norm": 1.2248965501785278, + "learning_rate": 0.0003356264438103003, + "loss": 3.6027, + "step": 78255 + }, + { + "epoch": 5.3172985459980975, + "grad_norm": 0.7917762994766235, + "learning_rate": 0.0003355839788014676, + "loss": 3.5382, + "step": 78260 + }, + { + "epoch": 5.31763826606876, + "grad_norm": 0.7721677422523499, + "learning_rate": 0.00033554151379263487, + "loss": 3.2177, + "step": 78265 + }, + { + "epoch": 5.317977986139421, + "grad_norm": 0.8288191556930542, + "learning_rate": 0.00033549904878380215, + "loss": 3.4848, + "step": 78270 + }, + { + "epoch": 5.318317706210083, + "grad_norm": 0.8727341890335083, + "learning_rate": 0.0003354565837749695, + "loss": 3.4304, + "step": 78275 + }, + { + "epoch": 5.318657426280745, + "grad_norm": 0.8614826202392578, + "learning_rate": 0.0003354141187661367, + "loss": 3.1654, + "step": 78280 + }, + { + "epoch": 5.318997146351406, + "grad_norm": 1.2600387334823608, + "learning_rate": 0.000335371653757304, + "loss": 3.329, + "step": 78285 + }, + { + "epoch": 5.319336866422068, + "grad_norm": 0.757792592048645, + "learning_rate": 0.00033532918874847127, + "loss": 3.459, + "step": 78290 + }, + { + "epoch": 5.31967658649273, + "grad_norm": 0.9871798753738403, + "learning_rate": 0.00033528672373963855, + "loss": 3.5363, + "step": 78295 + }, + { + "epoch": 5.320016306563391, + "grad_norm": 0.8874430060386658, + "learning_rate": 0.00033524425873080583, + "loss": 3.2621, + "step": 78300 + }, + { + "epoch": 5.3203560266340535, + "grad_norm": 0.8635753989219666, + "learning_rate": 0.0003352017937219731, + "loss": 3.4438, + "step": 78305 + }, + { + "epoch": 5.320695746704716, + "grad_norm": 0.9109507203102112, + "learning_rate": 0.0003351593287131404, + "loss": 3.2877, + "step": 78310 + }, + { + "epoch": 5.321035466775377, + "grad_norm": 1.007425308227539, + "learning_rate": 0.00033511686370430767, + "loss": 3.6404, + "step": 78315 + }, + { + "epoch": 5.321375186846039, + "grad_norm": 1.0662721395492554, + "learning_rate": 0.00033507439869547495, + "loss": 3.2819, + "step": 78320 + }, + { + "epoch": 5.321714906916701, + "grad_norm": 0.8646244406700134, + "learning_rate": 0.0003350319336866422, + "loss": 3.3596, + "step": 78325 + }, + { + "epoch": 5.322054626987362, + "grad_norm": 0.817894458770752, + "learning_rate": 0.0003349894686778095, + "loss": 3.2898, + "step": 78330 + }, + { + "epoch": 5.322394347058024, + "grad_norm": 0.8746413588523865, + "learning_rate": 0.0003349470036689768, + "loss": 3.4253, + "step": 78335 + }, + { + "epoch": 5.322734067128686, + "grad_norm": 0.9179010987281799, + "learning_rate": 0.000334904538660144, + "loss": 3.57, + "step": 78340 + }, + { + "epoch": 5.323073787199347, + "grad_norm": 0.9317873120307922, + "learning_rate": 0.00033486207365131135, + "loss": 3.5576, + "step": 78345 + }, + { + "epoch": 5.3234135072700095, + "grad_norm": 0.8391185998916626, + "learning_rate": 0.00033481960864247863, + "loss": 3.3881, + "step": 78350 + }, + { + "epoch": 5.323753227340672, + "grad_norm": 0.9344938397407532, + "learning_rate": 0.00033477714363364585, + "loss": 3.5244, + "step": 78355 + }, + { + "epoch": 5.324092947411333, + "grad_norm": 0.968170166015625, + "learning_rate": 0.00033473467862481313, + "loss": 3.4562, + "step": 78360 + }, + { + "epoch": 5.324432667481995, + "grad_norm": 0.9829947352409363, + "learning_rate": 0.00033469221361598047, + "loss": 3.407, + "step": 78365 + }, + { + "epoch": 5.324772387552657, + "grad_norm": 1.0028263330459595, + "learning_rate": 0.0003346497486071477, + "loss": 3.431, + "step": 78370 + }, + { + "epoch": 5.325112107623318, + "grad_norm": 0.7823069095611572, + "learning_rate": 0.000334607283598315, + "loss": 3.4042, + "step": 78375 + }, + { + "epoch": 5.32545182769398, + "grad_norm": 0.8772959113121033, + "learning_rate": 0.0003345648185894823, + "loss": 3.6243, + "step": 78380 + }, + { + "epoch": 5.325791547764642, + "grad_norm": 1.294337272644043, + "learning_rate": 0.00033452235358064953, + "loss": 3.6019, + "step": 78385 + }, + { + "epoch": 5.326131267835303, + "grad_norm": 0.850843608379364, + "learning_rate": 0.0003344798885718168, + "loss": 3.3374, + "step": 78390 + }, + { + "epoch": 5.3264709879059655, + "grad_norm": 0.9707168340682983, + "learning_rate": 0.00033443742356298415, + "loss": 3.5074, + "step": 78395 + }, + { + "epoch": 5.326810707976628, + "grad_norm": 1.803490161895752, + "learning_rate": 0.0003343949585541514, + "loss": 3.2931, + "step": 78400 + }, + { + "epoch": 5.327150428047289, + "grad_norm": 0.9284553527832031, + "learning_rate": 0.00033435249354531865, + "loss": 3.4106, + "step": 78405 + }, + { + "epoch": 5.327490148117951, + "grad_norm": 0.7270012497901917, + "learning_rate": 0.00033431002853648593, + "loss": 3.4438, + "step": 78410 + }, + { + "epoch": 5.327829868188613, + "grad_norm": 1.0198653936386108, + "learning_rate": 0.0003342675635276532, + "loss": 3.4079, + "step": 78415 + }, + { + "epoch": 5.328169588259274, + "grad_norm": 1.0706558227539062, + "learning_rate": 0.0003342250985188205, + "loss": 3.1516, + "step": 78420 + }, + { + "epoch": 5.328509308329936, + "grad_norm": 0.9104152321815491, + "learning_rate": 0.0003341826335099878, + "loss": 3.3566, + "step": 78425 + }, + { + "epoch": 5.328849028400598, + "grad_norm": 0.7641459107398987, + "learning_rate": 0.00033414016850115505, + "loss": 3.3699, + "step": 78430 + }, + { + "epoch": 5.3291887484712595, + "grad_norm": 0.8400526642799377, + "learning_rate": 0.00033409770349232234, + "loss": 3.2467, + "step": 78435 + }, + { + "epoch": 5.3295284685419215, + "grad_norm": 1.0850653648376465, + "learning_rate": 0.0003340552384834896, + "loss": 3.6192, + "step": 78440 + }, + { + "epoch": 5.329868188612584, + "grad_norm": 0.9061029553413391, + "learning_rate": 0.0003340127734746569, + "loss": 3.352, + "step": 78445 + }, + { + "epoch": 5.330207908683245, + "grad_norm": 1.2331809997558594, + "learning_rate": 0.0003339703084658242, + "loss": 3.5302, + "step": 78450 + }, + { + "epoch": 5.330547628753907, + "grad_norm": 0.9855996966362, + "learning_rate": 0.00033392784345699146, + "loss": 3.5792, + "step": 78455 + }, + { + "epoch": 5.330887348824569, + "grad_norm": 0.9157211184501648, + "learning_rate": 0.00033388537844815874, + "loss": 3.6585, + "step": 78460 + }, + { + "epoch": 5.33122706889523, + "grad_norm": 0.920341968536377, + "learning_rate": 0.000333842913439326, + "loss": 3.5693, + "step": 78465 + }, + { + "epoch": 5.331566788965892, + "grad_norm": 0.9692850708961487, + "learning_rate": 0.0003338004484304933, + "loss": 3.272, + "step": 78470 + }, + { + "epoch": 5.331906509036554, + "grad_norm": 0.9119105935096741, + "learning_rate": 0.0003337579834216606, + "loss": 3.4253, + "step": 78475 + }, + { + "epoch": 5.3322462291072155, + "grad_norm": 0.8390273451805115, + "learning_rate": 0.0003337155184128278, + "loss": 3.6279, + "step": 78480 + }, + { + "epoch": 5.3325859491778775, + "grad_norm": 1.1716893911361694, + "learning_rate": 0.00033367305340399514, + "loss": 3.465, + "step": 78485 + }, + { + "epoch": 5.33292566924854, + "grad_norm": 1.1058151721954346, + "learning_rate": 0.0003336305883951624, + "loss": 3.4018, + "step": 78490 + }, + { + "epoch": 5.333265389319201, + "grad_norm": 0.9083455204963684, + "learning_rate": 0.00033358812338632964, + "loss": 3.2907, + "step": 78495 + }, + { + "epoch": 5.333605109389863, + "grad_norm": 0.7768117785453796, + "learning_rate": 0.000333545658377497, + "loss": 3.3384, + "step": 78500 + }, + { + "epoch": 5.333944829460524, + "grad_norm": 0.7943568825721741, + "learning_rate": 0.00033350319336866426, + "loss": 3.6466, + "step": 78505 + }, + { + "epoch": 5.334284549531186, + "grad_norm": 0.9749590754508972, + "learning_rate": 0.0003334607283598315, + "loss": 3.3015, + "step": 78510 + }, + { + "epoch": 5.334624269601848, + "grad_norm": 1.175009846687317, + "learning_rate": 0.00033341826335099876, + "loss": 3.3082, + "step": 78515 + }, + { + "epoch": 5.334963989672509, + "grad_norm": 0.8843885660171509, + "learning_rate": 0.0003333757983421661, + "loss": 3.5673, + "step": 78520 + }, + { + "epoch": 5.3353037097431715, + "grad_norm": 0.7179445028305054, + "learning_rate": 0.0003333333333333333, + "loss": 3.7616, + "step": 78525 + }, + { + "epoch": 5.3356434298138335, + "grad_norm": 0.7520729303359985, + "learning_rate": 0.0003332908683245006, + "loss": 3.5719, + "step": 78530 + }, + { + "epoch": 5.335983149884495, + "grad_norm": 1.1650919914245605, + "learning_rate": 0.00033324840331566794, + "loss": 3.542, + "step": 78535 + }, + { + "epoch": 5.336322869955157, + "grad_norm": 0.9286103844642639, + "learning_rate": 0.00033320593830683516, + "loss": 3.1065, + "step": 78540 + }, + { + "epoch": 5.336662590025819, + "grad_norm": 0.8398768901824951, + "learning_rate": 0.00033316347329800244, + "loss": 3.3752, + "step": 78545 + }, + { + "epoch": 5.33700231009648, + "grad_norm": 0.8590371012687683, + "learning_rate": 0.0003331210082891697, + "loss": 3.2166, + "step": 78550 + }, + { + "epoch": 5.337342030167142, + "grad_norm": 0.7811344265937805, + "learning_rate": 0.000333078543280337, + "loss": 3.3657, + "step": 78555 + }, + { + "epoch": 5.337681750237804, + "grad_norm": 0.8904007077217102, + "learning_rate": 0.0003330360782715043, + "loss": 3.7048, + "step": 78560 + }, + { + "epoch": 5.338021470308465, + "grad_norm": 0.9846480488777161, + "learning_rate": 0.00033299361326267156, + "loss": 3.2483, + "step": 78565 + }, + { + "epoch": 5.3383611903791275, + "grad_norm": 1.0743763446807861, + "learning_rate": 0.00033295114825383884, + "loss": 3.1703, + "step": 78570 + }, + { + "epoch": 5.33870091044979, + "grad_norm": 0.8142232298851013, + "learning_rate": 0.0003329086832450061, + "loss": 3.1464, + "step": 78575 + }, + { + "epoch": 5.339040630520451, + "grad_norm": 0.9354940056800842, + "learning_rate": 0.0003328662182361734, + "loss": 3.4084, + "step": 78580 + }, + { + "epoch": 5.339380350591113, + "grad_norm": 1.184295654296875, + "learning_rate": 0.00033282375322734063, + "loss": 3.092, + "step": 78585 + }, + { + "epoch": 5.339720070661775, + "grad_norm": 1.00796639919281, + "learning_rate": 0.00033278128821850796, + "loss": 3.2445, + "step": 78590 + }, + { + "epoch": 5.340059790732436, + "grad_norm": 0.9670786261558533, + "learning_rate": 0.00033273882320967524, + "loss": 3.4869, + "step": 78595 + }, + { + "epoch": 5.340399510803098, + "grad_norm": 0.9666523933410645, + "learning_rate": 0.00033269635820084247, + "loss": 3.5436, + "step": 78600 + }, + { + "epoch": 5.34073923087376, + "grad_norm": 0.8219179511070251, + "learning_rate": 0.0003326538931920098, + "loss": 3.3761, + "step": 78605 + }, + { + "epoch": 5.341078950944421, + "grad_norm": 1.0661697387695312, + "learning_rate": 0.0003326114281831771, + "loss": 3.374, + "step": 78610 + }, + { + "epoch": 5.3414186710150835, + "grad_norm": 1.0598089694976807, + "learning_rate": 0.00033256896317434436, + "loss": 3.399, + "step": 78615 + }, + { + "epoch": 5.341758391085746, + "grad_norm": 1.0059072971343994, + "learning_rate": 0.0003325264981655116, + "loss": 3.6183, + "step": 78620 + }, + { + "epoch": 5.342098111156407, + "grad_norm": 1.179978609085083, + "learning_rate": 0.0003324840331566789, + "loss": 3.3524, + "step": 78625 + }, + { + "epoch": 5.342437831227069, + "grad_norm": 0.9605768918991089, + "learning_rate": 0.0003324415681478462, + "loss": 3.7746, + "step": 78630 + }, + { + "epoch": 5.342777551297731, + "grad_norm": 1.127455711364746, + "learning_rate": 0.00033239910313901343, + "loss": 3.4889, + "step": 78635 + }, + { + "epoch": 5.343117271368392, + "grad_norm": 0.7527157068252563, + "learning_rate": 0.00033235663813018076, + "loss": 3.1625, + "step": 78640 + }, + { + "epoch": 5.343456991439054, + "grad_norm": 1.131860613822937, + "learning_rate": 0.00033231417312134804, + "loss": 3.5543, + "step": 78645 + }, + { + "epoch": 5.343796711509716, + "grad_norm": 1.3852944374084473, + "learning_rate": 0.00033227170811251527, + "loss": 3.3299, + "step": 78650 + }, + { + "epoch": 5.344136431580377, + "grad_norm": 0.9040126204490662, + "learning_rate": 0.00033222924310368255, + "loss": 3.3819, + "step": 78655 + }, + { + "epoch": 5.3444761516510395, + "grad_norm": 1.0137277841567993, + "learning_rate": 0.0003321867780948499, + "loss": 3.6864, + "step": 78660 + }, + { + "epoch": 5.344815871721702, + "grad_norm": 1.104650855064392, + "learning_rate": 0.0003321443130860171, + "loss": 3.436, + "step": 78665 + }, + { + "epoch": 5.345155591792363, + "grad_norm": 0.9470736384391785, + "learning_rate": 0.0003321018480771844, + "loss": 3.5134, + "step": 78670 + }, + { + "epoch": 5.345495311863025, + "grad_norm": 1.6437859535217285, + "learning_rate": 0.0003320593830683517, + "loss": 3.4246, + "step": 78675 + }, + { + "epoch": 5.345835031933687, + "grad_norm": 1.1332060098648071, + "learning_rate": 0.00033201691805951895, + "loss": 3.414, + "step": 78680 + }, + { + "epoch": 5.346174752004348, + "grad_norm": 0.8300628066062927, + "learning_rate": 0.00033197445305068623, + "loss": 3.3875, + "step": 78685 + }, + { + "epoch": 5.34651447207501, + "grad_norm": 0.7411965727806091, + "learning_rate": 0.00033193198804185356, + "loss": 3.5505, + "step": 78690 + }, + { + "epoch": 5.346854192145672, + "grad_norm": 0.8105598092079163, + "learning_rate": 0.0003318895230330208, + "loss": 3.6493, + "step": 78695 + }, + { + "epoch": 5.3471939122163334, + "grad_norm": 0.8811694979667664, + "learning_rate": 0.00033184705802418807, + "loss": 3.5908, + "step": 78700 + }, + { + "epoch": 5.3475336322869955, + "grad_norm": 0.858921229839325, + "learning_rate": 0.00033180459301535535, + "loss": 3.5255, + "step": 78705 + }, + { + "epoch": 5.347873352357658, + "grad_norm": 0.8268376588821411, + "learning_rate": 0.00033176212800652263, + "loss": 3.1009, + "step": 78710 + }, + { + "epoch": 5.348213072428319, + "grad_norm": 0.7393038272857666, + "learning_rate": 0.0003317196629976899, + "loss": 3.3183, + "step": 78715 + }, + { + "epoch": 5.348552792498981, + "grad_norm": 0.9884183406829834, + "learning_rate": 0.0003316771979888572, + "loss": 3.3217, + "step": 78720 + }, + { + "epoch": 5.348892512569643, + "grad_norm": 0.8622164130210876, + "learning_rate": 0.00033163473298002447, + "loss": 3.6003, + "step": 78725 + }, + { + "epoch": 5.349232232640304, + "grad_norm": 1.0201623439788818, + "learning_rate": 0.00033159226797119175, + "loss": 3.5925, + "step": 78730 + }, + { + "epoch": 5.349571952710966, + "grad_norm": 0.7636402249336243, + "learning_rate": 0.00033154980296235903, + "loss": 3.4099, + "step": 78735 + }, + { + "epoch": 5.349911672781628, + "grad_norm": 0.8646628260612488, + "learning_rate": 0.00033150733795352626, + "loss": 3.4994, + "step": 78740 + }, + { + "epoch": 5.3502513928522895, + "grad_norm": 1.102747917175293, + "learning_rate": 0.0003314648729446936, + "loss": 3.3343, + "step": 78745 + }, + { + "epoch": 5.3505911129229515, + "grad_norm": 0.9709671139717102, + "learning_rate": 0.00033142240793586087, + "loss": 3.4933, + "step": 78750 + }, + { + "epoch": 5.350930832993614, + "grad_norm": 0.8226383924484253, + "learning_rate": 0.0003313799429270281, + "loss": 3.4992, + "step": 78755 + }, + { + "epoch": 5.351270553064275, + "grad_norm": 0.8482345938682556, + "learning_rate": 0.00033133747791819543, + "loss": 3.4772, + "step": 78760 + }, + { + "epoch": 5.351610273134937, + "grad_norm": 0.9817516803741455, + "learning_rate": 0.0003312950129093627, + "loss": 3.6463, + "step": 78765 + }, + { + "epoch": 5.351949993205599, + "grad_norm": 0.886222779750824, + "learning_rate": 0.00033125254790052994, + "loss": 3.2949, + "step": 78770 + }, + { + "epoch": 5.35228971327626, + "grad_norm": 0.916619598865509, + "learning_rate": 0.0003312100828916972, + "loss": 3.3728, + "step": 78775 + }, + { + "epoch": 5.352629433346922, + "grad_norm": 0.9743863344192505, + "learning_rate": 0.00033116761788286455, + "loss": 3.5101, + "step": 78780 + }, + { + "epoch": 5.352969153417584, + "grad_norm": 1.0635727643966675, + "learning_rate": 0.00033112515287403183, + "loss": 3.2133, + "step": 78785 + }, + { + "epoch": 5.3533088734882455, + "grad_norm": 0.7889314889907837, + "learning_rate": 0.00033108268786519906, + "loss": 3.5372, + "step": 78790 + }, + { + "epoch": 5.3536485935589075, + "grad_norm": 0.8080573081970215, + "learning_rate": 0.0003310402228563664, + "loss": 3.3119, + "step": 78795 + }, + { + "epoch": 5.35398831362957, + "grad_norm": 1.0627039670944214, + "learning_rate": 0.00033099775784753367, + "loss": 3.1856, + "step": 78800 + }, + { + "epoch": 5.354328033700231, + "grad_norm": 1.350366473197937, + "learning_rate": 0.0003309552928387009, + "loss": 3.4229, + "step": 78805 + }, + { + "epoch": 5.354667753770893, + "grad_norm": 1.057413935661316, + "learning_rate": 0.0003309128278298682, + "loss": 3.4054, + "step": 78810 + }, + { + "epoch": 5.355007473841555, + "grad_norm": 0.7883201241493225, + "learning_rate": 0.0003308703628210355, + "loss": 3.3768, + "step": 78815 + }, + { + "epoch": 5.355347193912216, + "grad_norm": 0.8580082058906555, + "learning_rate": 0.00033082789781220274, + "loss": 3.5674, + "step": 78820 + }, + { + "epoch": 5.355686913982878, + "grad_norm": 0.7184182405471802, + "learning_rate": 0.00033078543280337, + "loss": 3.4447, + "step": 78825 + }, + { + "epoch": 5.35602663405354, + "grad_norm": 0.8567744493484497, + "learning_rate": 0.00033074296779453735, + "loss": 3.3725, + "step": 78830 + }, + { + "epoch": 5.3563663541242015, + "grad_norm": 1.2165744304656982, + "learning_rate": 0.0003307005027857046, + "loss": 3.3954, + "step": 78835 + }, + { + "epoch": 5.3567060741948636, + "grad_norm": 0.9639005661010742, + "learning_rate": 0.00033065803777687186, + "loss": 3.4746, + "step": 78840 + }, + { + "epoch": 5.357045794265526, + "grad_norm": 0.8944782018661499, + "learning_rate": 0.00033061557276803914, + "loss": 3.4347, + "step": 78845 + }, + { + "epoch": 5.357385514336187, + "grad_norm": 0.7790409922599792, + "learning_rate": 0.0003305731077592064, + "loss": 3.4299, + "step": 78850 + }, + { + "epoch": 5.357725234406849, + "grad_norm": 0.92762690782547, + "learning_rate": 0.0003305306427503737, + "loss": 3.5652, + "step": 78855 + }, + { + "epoch": 5.358064954477511, + "grad_norm": 1.1336801052093506, + "learning_rate": 0.000330488177741541, + "loss": 3.2862, + "step": 78860 + }, + { + "epoch": 5.358404674548172, + "grad_norm": 0.8653033375740051, + "learning_rate": 0.00033044571273270826, + "loss": 3.4073, + "step": 78865 + }, + { + "epoch": 5.358744394618834, + "grad_norm": 0.7597814202308655, + "learning_rate": 0.00033040324772387554, + "loss": 3.3576, + "step": 78870 + }, + { + "epoch": 5.359084114689496, + "grad_norm": 0.8331955075263977, + "learning_rate": 0.0003303607827150428, + "loss": 3.506, + "step": 78875 + }, + { + "epoch": 5.3594238347601575, + "grad_norm": 0.8696445226669312, + "learning_rate": 0.00033031831770621005, + "loss": 3.5598, + "step": 78880 + }, + { + "epoch": 5.35976355483082, + "grad_norm": 0.8037549257278442, + "learning_rate": 0.0003302758526973774, + "loss": 3.4669, + "step": 78885 + }, + { + "epoch": 5.360103274901482, + "grad_norm": 0.8894295692443848, + "learning_rate": 0.00033023338768854466, + "loss": 3.6241, + "step": 78890 + }, + { + "epoch": 5.360442994972143, + "grad_norm": 0.822829008102417, + "learning_rate": 0.0003301909226797119, + "loss": 3.4626, + "step": 78895 + }, + { + "epoch": 5.360782715042805, + "grad_norm": 0.840218722820282, + "learning_rate": 0.0003301484576708792, + "loss": 3.3036, + "step": 78900 + }, + { + "epoch": 5.361122435113466, + "grad_norm": 1.8218754529953003, + "learning_rate": 0.0003301059926620465, + "loss": 3.4395, + "step": 78905 + }, + { + "epoch": 5.361462155184128, + "grad_norm": 0.879086434841156, + "learning_rate": 0.0003300635276532137, + "loss": 3.247, + "step": 78910 + }, + { + "epoch": 5.36180187525479, + "grad_norm": 1.1511181592941284, + "learning_rate": 0.000330021062644381, + "loss": 3.4256, + "step": 78915 + }, + { + "epoch": 5.362141595325451, + "grad_norm": 0.9569038152694702, + "learning_rate": 0.00032997859763554834, + "loss": 3.5775, + "step": 78920 + }, + { + "epoch": 5.3624813153961135, + "grad_norm": 0.9520998001098633, + "learning_rate": 0.00032993613262671557, + "loss": 3.5025, + "step": 78925 + }, + { + "epoch": 5.362821035466776, + "grad_norm": 0.7456569671630859, + "learning_rate": 0.00032989366761788285, + "loss": 3.3934, + "step": 78930 + }, + { + "epoch": 5.363160755537437, + "grad_norm": 1.2182466983795166, + "learning_rate": 0.0003298512026090502, + "loss": 3.5138, + "step": 78935 + }, + { + "epoch": 5.363500475608099, + "grad_norm": 1.1037970781326294, + "learning_rate": 0.0003298087376002174, + "loss": 3.4385, + "step": 78940 + }, + { + "epoch": 5.363840195678761, + "grad_norm": 1.632307529449463, + "learning_rate": 0.0003297662725913847, + "loss": 3.4059, + "step": 78945 + }, + { + "epoch": 5.364179915749422, + "grad_norm": 0.8596802353858948, + "learning_rate": 0.000329723807582552, + "loss": 3.5392, + "step": 78950 + }, + { + "epoch": 5.364519635820084, + "grad_norm": 1.5340478420257568, + "learning_rate": 0.0003296813425737193, + "loss": 3.4994, + "step": 78955 + }, + { + "epoch": 5.364859355890746, + "grad_norm": 1.4278537034988403, + "learning_rate": 0.0003296388775648865, + "loss": 3.2344, + "step": 78960 + }, + { + "epoch": 5.365199075961407, + "grad_norm": 0.9255120158195496, + "learning_rate": 0.0003295964125560538, + "loss": 3.3485, + "step": 78965 + }, + { + "epoch": 5.3655387960320695, + "grad_norm": 1.0969109535217285, + "learning_rate": 0.00032955394754722114, + "loss": 3.4262, + "step": 78970 + }, + { + "epoch": 5.365878516102732, + "grad_norm": 0.7705106139183044, + "learning_rate": 0.00032951148253838837, + "loss": 3.7405, + "step": 78975 + }, + { + "epoch": 5.366218236173393, + "grad_norm": 0.8592711687088013, + "learning_rate": 0.00032946901752955565, + "loss": 3.3097, + "step": 78980 + }, + { + "epoch": 5.366557956244055, + "grad_norm": 0.8942554593086243, + "learning_rate": 0.000329426552520723, + "loss": 3.367, + "step": 78985 + }, + { + "epoch": 5.366897676314717, + "grad_norm": 0.7926554083824158, + "learning_rate": 0.0003293840875118902, + "loss": 3.2311, + "step": 78990 + }, + { + "epoch": 5.367237396385378, + "grad_norm": 2.9825799465179443, + "learning_rate": 0.0003293416225030575, + "loss": 3.4589, + "step": 78995 + }, + { + "epoch": 5.36757711645604, + "grad_norm": 1.0915566682815552, + "learning_rate": 0.00032929915749422477, + "loss": 3.3362, + "step": 79000 + }, + { + "epoch": 5.367916836526702, + "grad_norm": 1.2022655010223389, + "learning_rate": 0.00032925669248539205, + "loss": 3.4205, + "step": 79005 + }, + { + "epoch": 5.3682565565973634, + "grad_norm": 0.7981449365615845, + "learning_rate": 0.0003292142274765593, + "loss": 3.6998, + "step": 79010 + }, + { + "epoch": 5.3685962766680255, + "grad_norm": 1.0801926851272583, + "learning_rate": 0.0003291717624677266, + "loss": 3.3759, + "step": 79015 + }, + { + "epoch": 5.368935996738688, + "grad_norm": 1.1026370525360107, + "learning_rate": 0.0003291292974588939, + "loss": 3.5109, + "step": 79020 + }, + { + "epoch": 5.369275716809349, + "grad_norm": 0.8695186972618103, + "learning_rate": 0.00032908683245006117, + "loss": 3.3207, + "step": 79025 + }, + { + "epoch": 5.369615436880011, + "grad_norm": 1.1116480827331543, + "learning_rate": 0.00032904436744122845, + "loss": 3.5346, + "step": 79030 + }, + { + "epoch": 5.369955156950673, + "grad_norm": 0.7934876680374146, + "learning_rate": 0.0003290019024323957, + "loss": 3.5627, + "step": 79035 + }, + { + "epoch": 5.370294877021334, + "grad_norm": 0.9421831965446472, + "learning_rate": 0.000328959437423563, + "loss": 3.5485, + "step": 79040 + }, + { + "epoch": 5.370634597091996, + "grad_norm": 1.0624295473098755, + "learning_rate": 0.0003289169724147303, + "loss": 3.4266, + "step": 79045 + }, + { + "epoch": 5.370974317162658, + "grad_norm": 1.0138667821884155, + "learning_rate": 0.0003288745074058975, + "loss": 3.3154, + "step": 79050 + }, + { + "epoch": 5.3713140372333195, + "grad_norm": 1.1131486892700195, + "learning_rate": 0.00032883204239706485, + "loss": 3.4986, + "step": 79055 + }, + { + "epoch": 5.3716537573039815, + "grad_norm": 0.8036146759986877, + "learning_rate": 0.00032878957738823213, + "loss": 3.3918, + "step": 79060 + }, + { + "epoch": 5.371993477374644, + "grad_norm": 0.9234992861747742, + "learning_rate": 0.00032874711237939935, + "loss": 3.5502, + "step": 79065 + }, + { + "epoch": 5.372333197445305, + "grad_norm": 1.0803871154785156, + "learning_rate": 0.00032870464737056663, + "loss": 3.2993, + "step": 79070 + }, + { + "epoch": 5.372672917515967, + "grad_norm": 0.7172015905380249, + "learning_rate": 0.00032866218236173397, + "loss": 3.364, + "step": 79075 + }, + { + "epoch": 5.373012637586629, + "grad_norm": 0.7015817165374756, + "learning_rate": 0.0003286197173529012, + "loss": 3.345, + "step": 79080 + }, + { + "epoch": 5.37335235765729, + "grad_norm": 0.8331137895584106, + "learning_rate": 0.0003285772523440685, + "loss": 3.3763, + "step": 79085 + }, + { + "epoch": 5.373692077727952, + "grad_norm": 1.0210866928100586, + "learning_rate": 0.0003285347873352358, + "loss": 3.3008, + "step": 79090 + }, + { + "epoch": 5.374031797798614, + "grad_norm": 0.8869075775146484, + "learning_rate": 0.00032849232232640303, + "loss": 3.264, + "step": 79095 + }, + { + "epoch": 5.3743715178692755, + "grad_norm": 0.8397893905639648, + "learning_rate": 0.0003284498573175703, + "loss": 3.2819, + "step": 79100 + }, + { + "epoch": 5.3747112379399375, + "grad_norm": 0.9354531168937683, + "learning_rate": 0.0003284073923087376, + "loss": 3.4059, + "step": 79105 + }, + { + "epoch": 5.3750509580106, + "grad_norm": 0.897051990032196, + "learning_rate": 0.0003283649272999049, + "loss": 3.3516, + "step": 79110 + }, + { + "epoch": 5.375390678081261, + "grad_norm": 0.994745135307312, + "learning_rate": 0.00032832246229107215, + "loss": 3.2713, + "step": 79115 + }, + { + "epoch": 5.375730398151923, + "grad_norm": 1.1210886240005493, + "learning_rate": 0.00032827999728223943, + "loss": 3.5028, + "step": 79120 + }, + { + "epoch": 5.376070118222585, + "grad_norm": 0.9184995293617249, + "learning_rate": 0.00032823753227340677, + "loss": 3.3855, + "step": 79125 + }, + { + "epoch": 5.376409838293246, + "grad_norm": 0.8700675368309021, + "learning_rate": 0.000328195067264574, + "loss": 3.556, + "step": 79130 + }, + { + "epoch": 5.376749558363908, + "grad_norm": 1.1067755222320557, + "learning_rate": 0.0003281526022557413, + "loss": 3.3437, + "step": 79135 + }, + { + "epoch": 5.37708927843457, + "grad_norm": 1.0544168949127197, + "learning_rate": 0.00032811013724690855, + "loss": 3.4814, + "step": 79140 + }, + { + "epoch": 5.3774289985052315, + "grad_norm": 0.9017446637153625, + "learning_rate": 0.00032806767223807583, + "loss": 3.6591, + "step": 79145 + }, + { + "epoch": 5.377768718575894, + "grad_norm": 0.8998450636863708, + "learning_rate": 0.0003280252072292431, + "loss": 3.4248, + "step": 79150 + }, + { + "epoch": 5.378108438646556, + "grad_norm": 0.902768611907959, + "learning_rate": 0.0003279827422204104, + "loss": 3.4161, + "step": 79155 + }, + { + "epoch": 5.378448158717217, + "grad_norm": 0.709985613822937, + "learning_rate": 0.0003279402772115777, + "loss": 3.4708, + "step": 79160 + }, + { + "epoch": 5.378787878787879, + "grad_norm": 0.9652303457260132, + "learning_rate": 0.00032789781220274496, + "loss": 3.3609, + "step": 79165 + }, + { + "epoch": 5.379127598858541, + "grad_norm": 0.7969284057617188, + "learning_rate": 0.00032785534719391224, + "loss": 3.6727, + "step": 79170 + }, + { + "epoch": 5.379467318929202, + "grad_norm": 1.0158381462097168, + "learning_rate": 0.00032781288218507946, + "loss": 3.2884, + "step": 79175 + }, + { + "epoch": 5.379807038999864, + "grad_norm": 0.7873596549034119, + "learning_rate": 0.0003277704171762468, + "loss": 3.6113, + "step": 79180 + }, + { + "epoch": 5.380146759070525, + "grad_norm": 1.774013876914978, + "learning_rate": 0.0003277279521674141, + "loss": 3.452, + "step": 79185 + }, + { + "epoch": 5.3804864791411875, + "grad_norm": 0.7472483515739441, + "learning_rate": 0.0003276854871585813, + "loss": 3.4575, + "step": 79190 + }, + { + "epoch": 5.38082619921185, + "grad_norm": 0.7570422291755676, + "learning_rate": 0.00032764302214974864, + "loss": 3.4994, + "step": 79195 + }, + { + "epoch": 5.381165919282511, + "grad_norm": 0.9990218281745911, + "learning_rate": 0.0003276005571409159, + "loss": 3.5564, + "step": 79200 + }, + { + "epoch": 5.381505639353173, + "grad_norm": 0.7530079483985901, + "learning_rate": 0.00032755809213208314, + "loss": 3.3967, + "step": 79205 + }, + { + "epoch": 5.381845359423835, + "grad_norm": 0.8793720602989197, + "learning_rate": 0.0003275156271232504, + "loss": 3.432, + "step": 79210 + }, + { + "epoch": 5.382185079494496, + "grad_norm": 1.0459480285644531, + "learning_rate": 0.00032747316211441776, + "loss": 3.3804, + "step": 79215 + }, + { + "epoch": 5.382524799565158, + "grad_norm": 1.154422402381897, + "learning_rate": 0.000327430697105585, + "loss": 3.4179, + "step": 79220 + }, + { + "epoch": 5.38286451963582, + "grad_norm": 0.8266650438308716, + "learning_rate": 0.00032738823209675226, + "loss": 3.4334, + "step": 79225 + }, + { + "epoch": 5.383204239706481, + "grad_norm": 1.174237847328186, + "learning_rate": 0.0003273457670879196, + "loss": 3.6486, + "step": 79230 + }, + { + "epoch": 5.3835439597771435, + "grad_norm": 1.1758134365081787, + "learning_rate": 0.0003273033020790868, + "loss": 3.6416, + "step": 79235 + }, + { + "epoch": 5.383883679847806, + "grad_norm": 0.9150899648666382, + "learning_rate": 0.0003272608370702541, + "loss": 3.5861, + "step": 79240 + }, + { + "epoch": 5.384223399918467, + "grad_norm": 0.8898830413818359, + "learning_rate": 0.00032721837206142144, + "loss": 3.3488, + "step": 79245 + }, + { + "epoch": 5.384563119989129, + "grad_norm": 0.8721166253089905, + "learning_rate": 0.00032717590705258866, + "loss": 3.2601, + "step": 79250 + }, + { + "epoch": 5.384902840059791, + "grad_norm": 1.0810602903366089, + "learning_rate": 0.00032713344204375594, + "loss": 3.2251, + "step": 79255 + }, + { + "epoch": 5.385242560130452, + "grad_norm": 0.8562744855880737, + "learning_rate": 0.0003270909770349232, + "loss": 3.4984, + "step": 79260 + }, + { + "epoch": 5.385582280201114, + "grad_norm": 0.9606202244758606, + "learning_rate": 0.0003270485120260905, + "loss": 3.4845, + "step": 79265 + }, + { + "epoch": 5.385922000271776, + "grad_norm": 0.8403765559196472, + "learning_rate": 0.0003270060470172578, + "loss": 3.3168, + "step": 79270 + }, + { + "epoch": 5.386261720342437, + "grad_norm": 0.9014598727226257, + "learning_rate": 0.00032696358200842506, + "loss": 3.4889, + "step": 79275 + }, + { + "epoch": 5.3866014404130995, + "grad_norm": 1.2438316345214844, + "learning_rate": 0.00032692111699959234, + "loss": 3.4513, + "step": 79280 + }, + { + "epoch": 5.386941160483762, + "grad_norm": 0.7887360453605652, + "learning_rate": 0.0003268786519907596, + "loss": 3.1854, + "step": 79285 + }, + { + "epoch": 5.387280880554423, + "grad_norm": 0.7481051087379456, + "learning_rate": 0.0003268361869819269, + "loss": 3.3765, + "step": 79290 + }, + { + "epoch": 5.387620600625085, + "grad_norm": 0.7148723602294922, + "learning_rate": 0.0003267937219730942, + "loss": 3.4214, + "step": 79295 + }, + { + "epoch": 5.387960320695747, + "grad_norm": 0.8343608379364014, + "learning_rate": 0.00032675125696426146, + "loss": 3.2288, + "step": 79300 + }, + { + "epoch": 5.388300040766408, + "grad_norm": 1.1041756868362427, + "learning_rate": 0.00032670879195542874, + "loss": 3.3668, + "step": 79305 + }, + { + "epoch": 5.38863976083707, + "grad_norm": 0.7464832067489624, + "learning_rate": 0.000326666326946596, + "loss": 3.4726, + "step": 79310 + }, + { + "epoch": 5.388979480907732, + "grad_norm": 0.9252157211303711, + "learning_rate": 0.0003266238619377633, + "loss": 3.4088, + "step": 79315 + }, + { + "epoch": 5.3893192009783935, + "grad_norm": 0.890361487865448, + "learning_rate": 0.0003265813969289306, + "loss": 3.4516, + "step": 79320 + }, + { + "epoch": 5.3896589210490555, + "grad_norm": 0.9441397190093994, + "learning_rate": 0.00032653893192009786, + "loss": 3.6149, + "step": 79325 + }, + { + "epoch": 5.389998641119718, + "grad_norm": 0.7902988791465759, + "learning_rate": 0.0003264964669112651, + "loss": 3.5519, + "step": 79330 + }, + { + "epoch": 5.390338361190379, + "grad_norm": 1.0195708274841309, + "learning_rate": 0.0003264540019024324, + "loss": 3.3648, + "step": 79335 + }, + { + "epoch": 5.390678081261041, + "grad_norm": 0.6792661547660828, + "learning_rate": 0.0003264115368935997, + "loss": 3.5259, + "step": 79340 + }, + { + "epoch": 5.391017801331703, + "grad_norm": 1.0039857625961304, + "learning_rate": 0.00032636907188476693, + "loss": 3.3804, + "step": 79345 + }, + { + "epoch": 5.391357521402364, + "grad_norm": 1.2649227380752563, + "learning_rate": 0.00032632660687593426, + "loss": 3.5525, + "step": 79350 + }, + { + "epoch": 5.391697241473026, + "grad_norm": 0.8730764985084534, + "learning_rate": 0.00032628414186710154, + "loss": 3.3534, + "step": 79355 + }, + { + "epoch": 5.392036961543688, + "grad_norm": 1.1407568454742432, + "learning_rate": 0.00032624167685826877, + "loss": 3.287, + "step": 79360 + }, + { + "epoch": 5.3923766816143495, + "grad_norm": 1.034912109375, + "learning_rate": 0.00032619921184943605, + "loss": 3.4236, + "step": 79365 + }, + { + "epoch": 5.3927164016850115, + "grad_norm": 0.9126209020614624, + "learning_rate": 0.0003261567468406034, + "loss": 3.1933, + "step": 79370 + }, + { + "epoch": 5.393056121755674, + "grad_norm": 0.8266832232475281, + "learning_rate": 0.0003261142818317706, + "loss": 3.4376, + "step": 79375 + }, + { + "epoch": 5.393395841826335, + "grad_norm": 0.8672918081283569, + "learning_rate": 0.0003260718168229379, + "loss": 3.1671, + "step": 79380 + }, + { + "epoch": 5.393735561896997, + "grad_norm": 0.9795860648155212, + "learning_rate": 0.0003260293518141052, + "loss": 3.4217, + "step": 79385 + }, + { + "epoch": 5.394075281967659, + "grad_norm": 1.2965126037597656, + "learning_rate": 0.00032598688680527245, + "loss": 3.5942, + "step": 79390 + }, + { + "epoch": 5.39441500203832, + "grad_norm": 1.0755366086959839, + "learning_rate": 0.00032594442179643973, + "loss": 3.3525, + "step": 79395 + }, + { + "epoch": 5.394754722108982, + "grad_norm": 0.9428107142448425, + "learning_rate": 0.000325901956787607, + "loss": 3.4125, + "step": 79400 + }, + { + "epoch": 5.395094442179644, + "grad_norm": 0.8409997224807739, + "learning_rate": 0.0003258594917787743, + "loss": 3.287, + "step": 79405 + }, + { + "epoch": 5.3954341622503055, + "grad_norm": 1.0464175939559937, + "learning_rate": 0.00032581702676994157, + "loss": 3.6794, + "step": 79410 + }, + { + "epoch": 5.3957738823209676, + "grad_norm": 1.104807734489441, + "learning_rate": 0.00032577456176110885, + "loss": 3.5223, + "step": 79415 + }, + { + "epoch": 5.39611360239163, + "grad_norm": 0.6611825823783875, + "learning_rate": 0.00032573209675227613, + "loss": 3.5091, + "step": 79420 + }, + { + "epoch": 5.396453322462291, + "grad_norm": 0.8177070021629333, + "learning_rate": 0.0003256896317434434, + "loss": 3.4203, + "step": 79425 + }, + { + "epoch": 5.396793042532953, + "grad_norm": 0.8809230923652649, + "learning_rate": 0.0003256471667346107, + "loss": 3.4187, + "step": 79430 + }, + { + "epoch": 5.397132762603615, + "grad_norm": 1.0695096254348755, + "learning_rate": 0.0003256047017257779, + "loss": 3.5207, + "step": 79435 + }, + { + "epoch": 5.397472482674276, + "grad_norm": 0.8588560223579407, + "learning_rate": 0.00032556223671694525, + "loss": 3.7525, + "step": 79440 + }, + { + "epoch": 5.397812202744938, + "grad_norm": 0.8181442022323608, + "learning_rate": 0.00032551977170811253, + "loss": 3.4306, + "step": 79445 + }, + { + "epoch": 5.3981519228156, + "grad_norm": 1.0193049907684326, + "learning_rate": 0.00032547730669927976, + "loss": 3.2742, + "step": 79450 + }, + { + "epoch": 5.3984916428862615, + "grad_norm": 1.1851600408554077, + "learning_rate": 0.0003254348416904471, + "loss": 3.262, + "step": 79455 + }, + { + "epoch": 5.398831362956924, + "grad_norm": 0.8023043274879456, + "learning_rate": 0.00032539237668161437, + "loss": 3.6453, + "step": 79460 + }, + { + "epoch": 5.399171083027586, + "grad_norm": 1.0766974687576294, + "learning_rate": 0.00032534991167278165, + "loss": 3.0738, + "step": 79465 + }, + { + "epoch": 5.399510803098247, + "grad_norm": 0.9304689168930054, + "learning_rate": 0.0003253074466639489, + "loss": 3.4352, + "step": 79470 + }, + { + "epoch": 5.399850523168909, + "grad_norm": 1.017843246459961, + "learning_rate": 0.0003252649816551162, + "loss": 3.5047, + "step": 79475 + }, + { + "epoch": 5.400190243239571, + "grad_norm": 0.8220357298851013, + "learning_rate": 0.0003252225166462835, + "loss": 3.4874, + "step": 79480 + }, + { + "epoch": 5.400529963310232, + "grad_norm": 0.8341916799545288, + "learning_rate": 0.0003251800516374507, + "loss": 3.1201, + "step": 79485 + }, + { + "epoch": 5.400869683380894, + "grad_norm": 0.8705910444259644, + "learning_rate": 0.00032513758662861805, + "loss": 3.5782, + "step": 79490 + }, + { + "epoch": 5.401209403451556, + "grad_norm": 1.1191139221191406, + "learning_rate": 0.00032509512161978533, + "loss": 3.5304, + "step": 79495 + }, + { + "epoch": 5.4015491235222175, + "grad_norm": 0.9812166690826416, + "learning_rate": 0.00032505265661095256, + "loss": 3.4781, + "step": 79500 + }, + { + "epoch": 5.40188884359288, + "grad_norm": 1.0357614755630493, + "learning_rate": 0.00032501019160211984, + "loss": 3.329, + "step": 79505 + }, + { + "epoch": 5.402228563663542, + "grad_norm": 0.9081376791000366, + "learning_rate": 0.00032496772659328717, + "loss": 3.2647, + "step": 79510 + }, + { + "epoch": 5.402568283734203, + "grad_norm": 1.1135876178741455, + "learning_rate": 0.0003249252615844544, + "loss": 3.4323, + "step": 79515 + }, + { + "epoch": 5.402908003804865, + "grad_norm": 0.9309114813804626, + "learning_rate": 0.0003248827965756217, + "loss": 3.4122, + "step": 79520 + }, + { + "epoch": 5.403247723875527, + "grad_norm": 1.024735927581787, + "learning_rate": 0.000324840331566789, + "loss": 3.5349, + "step": 79525 + }, + { + "epoch": 5.403587443946188, + "grad_norm": 1.0466909408569336, + "learning_rate": 0.00032479786655795624, + "loss": 3.5096, + "step": 79530 + }, + { + "epoch": 5.40392716401685, + "grad_norm": 0.9750074148178101, + "learning_rate": 0.0003247554015491235, + "loss": 3.2277, + "step": 79535 + }, + { + "epoch": 5.404266884087512, + "grad_norm": 1.0501703023910522, + "learning_rate": 0.00032471293654029085, + "loss": 3.3912, + "step": 79540 + }, + { + "epoch": 5.4046066041581735, + "grad_norm": 0.7479121088981628, + "learning_rate": 0.0003246704715314581, + "loss": 3.4324, + "step": 79545 + }, + { + "epoch": 5.404946324228836, + "grad_norm": 0.8439898490905762, + "learning_rate": 0.00032462800652262536, + "loss": 3.5186, + "step": 79550 + }, + { + "epoch": 5.405286044299498, + "grad_norm": 0.8997906446456909, + "learning_rate": 0.00032458554151379264, + "loss": 3.6254, + "step": 79555 + }, + { + "epoch": 5.405625764370159, + "grad_norm": 0.8601919412612915, + "learning_rate": 0.0003245430765049599, + "loss": 3.5469, + "step": 79560 + }, + { + "epoch": 5.405965484440821, + "grad_norm": 0.8941689729690552, + "learning_rate": 0.0003245006114961272, + "loss": 3.3692, + "step": 79565 + }, + { + "epoch": 5.406305204511483, + "grad_norm": 0.8017120361328125, + "learning_rate": 0.0003244581464872945, + "loss": 3.5665, + "step": 79570 + }, + { + "epoch": 5.406644924582144, + "grad_norm": 1.2222740650177002, + "learning_rate": 0.00032441568147846176, + "loss": 3.3488, + "step": 79575 + }, + { + "epoch": 5.406984644652806, + "grad_norm": 1.0860592126846313, + "learning_rate": 0.00032437321646962904, + "loss": 3.5306, + "step": 79580 + }, + { + "epoch": 5.4073243647234674, + "grad_norm": 0.9382442235946655, + "learning_rate": 0.0003243307514607963, + "loss": 3.2403, + "step": 79585 + }, + { + "epoch": 5.4076640847941295, + "grad_norm": 0.8088377714157104, + "learning_rate": 0.00032428828645196354, + "loss": 3.3619, + "step": 79590 + }, + { + "epoch": 5.408003804864792, + "grad_norm": 1.0085145235061646, + "learning_rate": 0.0003242458214431309, + "loss": 3.1582, + "step": 79595 + }, + { + "epoch": 5.408343524935453, + "grad_norm": 0.8397640585899353, + "learning_rate": 0.00032420335643429816, + "loss": 3.8477, + "step": 79600 + }, + { + "epoch": 5.408683245006115, + "grad_norm": 0.8200675249099731, + "learning_rate": 0.0003241608914254654, + "loss": 3.3303, + "step": 79605 + }, + { + "epoch": 5.409022965076777, + "grad_norm": 0.8448841571807861, + "learning_rate": 0.0003241184264166327, + "loss": 3.3727, + "step": 79610 + }, + { + "epoch": 5.409362685147438, + "grad_norm": 0.8934252262115479, + "learning_rate": 0.0003240759614078, + "loss": 3.1237, + "step": 79615 + }, + { + "epoch": 5.4097024052181, + "grad_norm": 0.8694664239883423, + "learning_rate": 0.0003240334963989672, + "loss": 3.1951, + "step": 79620 + }, + { + "epoch": 5.410042125288762, + "grad_norm": 0.8353589177131653, + "learning_rate": 0.0003239910313901345, + "loss": 3.634, + "step": 79625 + }, + { + "epoch": 5.4103818453594235, + "grad_norm": 1.1166372299194336, + "learning_rate": 0.00032394856638130184, + "loss": 3.4221, + "step": 79630 + }, + { + "epoch": 5.4107215654300855, + "grad_norm": 0.8261678218841553, + "learning_rate": 0.0003239061013724691, + "loss": 3.568, + "step": 79635 + }, + { + "epoch": 5.411061285500748, + "grad_norm": 1.3792684078216553, + "learning_rate": 0.00032386363636363635, + "loss": 3.2759, + "step": 79640 + }, + { + "epoch": 5.411401005571409, + "grad_norm": 0.8805740475654602, + "learning_rate": 0.0003238211713548037, + "loss": 3.3441, + "step": 79645 + }, + { + "epoch": 5.411740725642071, + "grad_norm": 0.7955598831176758, + "learning_rate": 0.00032377870634597096, + "loss": 3.185, + "step": 79650 + }, + { + "epoch": 5.412080445712733, + "grad_norm": 0.7447164058685303, + "learning_rate": 0.0003237362413371382, + "loss": 3.5561, + "step": 79655 + }, + { + "epoch": 5.412420165783394, + "grad_norm": 0.8554501533508301, + "learning_rate": 0.00032369377632830547, + "loss": 3.2734, + "step": 79660 + }, + { + "epoch": 5.412759885854056, + "grad_norm": 0.968463122844696, + "learning_rate": 0.0003236513113194728, + "loss": 3.5891, + "step": 79665 + }, + { + "epoch": 5.413099605924718, + "grad_norm": 1.1137666702270508, + "learning_rate": 0.00032360884631064, + "loss": 3.4277, + "step": 79670 + }, + { + "epoch": 5.4134393259953795, + "grad_norm": 0.8419466018676758, + "learning_rate": 0.0003235663813018073, + "loss": 3.5262, + "step": 79675 + }, + { + "epoch": 5.4137790460660415, + "grad_norm": 1.024114727973938, + "learning_rate": 0.00032352391629297464, + "loss": 3.329, + "step": 79680 + }, + { + "epoch": 5.414118766136704, + "grad_norm": 0.8377326726913452, + "learning_rate": 0.00032348145128414187, + "loss": 3.3484, + "step": 79685 + }, + { + "epoch": 5.414458486207365, + "grad_norm": 0.9777465462684631, + "learning_rate": 0.00032343898627530915, + "loss": 3.4264, + "step": 79690 + }, + { + "epoch": 5.414798206278027, + "grad_norm": 0.8329743146896362, + "learning_rate": 0.0003233965212664764, + "loss": 3.4722, + "step": 79695 + }, + { + "epoch": 5.415137926348689, + "grad_norm": 8.56298828125, + "learning_rate": 0.0003233540562576437, + "loss": 3.3883, + "step": 79700 + }, + { + "epoch": 5.41547764641935, + "grad_norm": 0.7169463634490967, + "learning_rate": 0.000323311591248811, + "loss": 3.472, + "step": 79705 + }, + { + "epoch": 5.415817366490012, + "grad_norm": 1.0703269243240356, + "learning_rate": 0.00032326912623997827, + "loss": 3.41, + "step": 79710 + }, + { + "epoch": 5.416157086560674, + "grad_norm": 1.1895880699157715, + "learning_rate": 0.00032322666123114555, + "loss": 3.4052, + "step": 79715 + }, + { + "epoch": 5.4164968066313355, + "grad_norm": 0.8967957496643066, + "learning_rate": 0.0003231841962223128, + "loss": 3.4227, + "step": 79720 + }, + { + "epoch": 5.416836526701998, + "grad_norm": 0.931761622428894, + "learning_rate": 0.0003231417312134801, + "loss": 3.4961, + "step": 79725 + }, + { + "epoch": 5.41717624677266, + "grad_norm": 1.0917631387710571, + "learning_rate": 0.00032309926620464733, + "loss": 3.6167, + "step": 79730 + }, + { + "epoch": 5.417515966843321, + "grad_norm": 0.8824374675750732, + "learning_rate": 0.00032305680119581467, + "loss": 3.3864, + "step": 79735 + }, + { + "epoch": 5.417855686913983, + "grad_norm": 1.0541365146636963, + "learning_rate": 0.00032301433618698195, + "loss": 3.3796, + "step": 79740 + }, + { + "epoch": 5.418195406984645, + "grad_norm": 1.0509617328643799, + "learning_rate": 0.0003229718711781492, + "loss": 3.3797, + "step": 79745 + }, + { + "epoch": 5.418535127055306, + "grad_norm": 1.0851020812988281, + "learning_rate": 0.0003229294061693165, + "loss": 3.5051, + "step": 79750 + }, + { + "epoch": 5.418874847125968, + "grad_norm": 1.251941204071045, + "learning_rate": 0.0003228869411604838, + "loss": 3.6763, + "step": 79755 + }, + { + "epoch": 5.41921456719663, + "grad_norm": 2.5995147228240967, + "learning_rate": 0.000322844476151651, + "loss": 3.1671, + "step": 79760 + }, + { + "epoch": 5.4195542872672915, + "grad_norm": 0.951636791229248, + "learning_rate": 0.0003228020111428183, + "loss": 3.3671, + "step": 79765 + }, + { + "epoch": 5.419894007337954, + "grad_norm": 0.9047222137451172, + "learning_rate": 0.00032275954613398563, + "loss": 3.3713, + "step": 79770 + }, + { + "epoch": 5.420233727408616, + "grad_norm": 0.8243939280509949, + "learning_rate": 0.00032271708112515285, + "loss": 3.5212, + "step": 79775 + }, + { + "epoch": 5.420573447479277, + "grad_norm": 0.8358378410339355, + "learning_rate": 0.00032267461611632013, + "loss": 3.6642, + "step": 79780 + }, + { + "epoch": 5.420913167549939, + "grad_norm": 0.7819998860359192, + "learning_rate": 0.00032263215110748747, + "loss": 3.591, + "step": 79785 + }, + { + "epoch": 5.421252887620601, + "grad_norm": 0.9887744188308716, + "learning_rate": 0.0003225896860986547, + "loss": 3.4534, + "step": 79790 + }, + { + "epoch": 5.421592607691262, + "grad_norm": 0.8979799747467041, + "learning_rate": 0.000322547221089822, + "loss": 3.2341, + "step": 79795 + }, + { + "epoch": 5.421932327761924, + "grad_norm": 1.0473697185516357, + "learning_rate": 0.00032250475608098925, + "loss": 3.283, + "step": 79800 + }, + { + "epoch": 5.422272047832586, + "grad_norm": 0.7699300050735474, + "learning_rate": 0.0003224622910721566, + "loss": 3.2072, + "step": 79805 + }, + { + "epoch": 5.4226117679032475, + "grad_norm": 1.1058954000473022, + "learning_rate": 0.0003224198260633238, + "loss": 3.3664, + "step": 79810 + }, + { + "epoch": 5.42295148797391, + "grad_norm": 1.3493869304656982, + "learning_rate": 0.0003223773610544911, + "loss": 3.629, + "step": 79815 + }, + { + "epoch": 5.423291208044572, + "grad_norm": 0.9128247499465942, + "learning_rate": 0.00032233489604565843, + "loss": 3.4751, + "step": 79820 + }, + { + "epoch": 5.423630928115233, + "grad_norm": 0.9727969765663147, + "learning_rate": 0.00032229243103682565, + "loss": 3.5558, + "step": 79825 + }, + { + "epoch": 5.423970648185895, + "grad_norm": 0.9157310724258423, + "learning_rate": 0.00032224996602799293, + "loss": 3.5619, + "step": 79830 + }, + { + "epoch": 5.424310368256557, + "grad_norm": 0.8300975561141968, + "learning_rate": 0.00032220750101916027, + "loss": 3.6871, + "step": 79835 + }, + { + "epoch": 5.424650088327218, + "grad_norm": 1.4264094829559326, + "learning_rate": 0.0003221650360103275, + "loss": 3.5198, + "step": 79840 + }, + { + "epoch": 5.42498980839788, + "grad_norm": 0.8003417253494263, + "learning_rate": 0.0003221225710014948, + "loss": 3.6028, + "step": 79845 + }, + { + "epoch": 5.425329528468542, + "grad_norm": 0.6717928647994995, + "learning_rate": 0.00032208010599266205, + "loss": 3.4583, + "step": 79850 + }, + { + "epoch": 5.4256692485392035, + "grad_norm": 0.8655832409858704, + "learning_rate": 0.00032203764098382933, + "loss": 3.5582, + "step": 79855 + }, + { + "epoch": 5.426008968609866, + "grad_norm": 1.2960829734802246, + "learning_rate": 0.0003219951759749966, + "loss": 3.3233, + "step": 79860 + }, + { + "epoch": 5.426348688680527, + "grad_norm": 0.965732753276825, + "learning_rate": 0.0003219527109661639, + "loss": 3.5743, + "step": 79865 + }, + { + "epoch": 5.426688408751189, + "grad_norm": 0.9134685397148132, + "learning_rate": 0.0003219102459573312, + "loss": 3.1765, + "step": 79870 + }, + { + "epoch": 5.427028128821851, + "grad_norm": 1.1891931295394897, + "learning_rate": 0.00032186778094849846, + "loss": 3.4048, + "step": 79875 + }, + { + "epoch": 5.427367848892512, + "grad_norm": 0.6897513270378113, + "learning_rate": 0.00032182531593966574, + "loss": 3.2867, + "step": 79880 + }, + { + "epoch": 5.427707568963174, + "grad_norm": 0.7765058279037476, + "learning_rate": 0.00032178285093083296, + "loss": 3.2024, + "step": 79885 + }, + { + "epoch": 5.428047289033836, + "grad_norm": 0.872805118560791, + "learning_rate": 0.0003217403859220003, + "loss": 3.5585, + "step": 79890 + }, + { + "epoch": 5.4283870091044975, + "grad_norm": 0.79768967628479, + "learning_rate": 0.0003216979209131676, + "loss": 3.6066, + "step": 79895 + }, + { + "epoch": 5.4287267291751595, + "grad_norm": 1.2816226482391357, + "learning_rate": 0.0003216554559043348, + "loss": 3.4536, + "step": 79900 + }, + { + "epoch": 5.429066449245822, + "grad_norm": 0.8321355581283569, + "learning_rate": 0.00032161299089550214, + "loss": 3.4451, + "step": 79905 + }, + { + "epoch": 5.429406169316483, + "grad_norm": 0.9831712245941162, + "learning_rate": 0.0003215705258866694, + "loss": 3.554, + "step": 79910 + }, + { + "epoch": 5.429745889387145, + "grad_norm": 1.0875723361968994, + "learning_rate": 0.00032152806087783664, + "loss": 3.5383, + "step": 79915 + }, + { + "epoch": 5.430085609457807, + "grad_norm": 2.8549723625183105, + "learning_rate": 0.0003214855958690039, + "loss": 2.9509, + "step": 79920 + }, + { + "epoch": 5.430425329528468, + "grad_norm": 0.7393620610237122, + "learning_rate": 0.00032144313086017126, + "loss": 3.4701, + "step": 79925 + }, + { + "epoch": 5.43076504959913, + "grad_norm": 0.9477068185806274, + "learning_rate": 0.0003214006658513385, + "loss": 3.3145, + "step": 79930 + }, + { + "epoch": 5.431104769669792, + "grad_norm": 0.8904054164886475, + "learning_rate": 0.00032135820084250576, + "loss": 3.0997, + "step": 79935 + }, + { + "epoch": 5.4314444897404535, + "grad_norm": 1.0474299192428589, + "learning_rate": 0.0003213157358336731, + "loss": 3.5179, + "step": 79940 + }, + { + "epoch": 5.4317842098111155, + "grad_norm": 0.9332869648933411, + "learning_rate": 0.0003212732708248403, + "loss": 3.4598, + "step": 79945 + }, + { + "epoch": 5.432123929881778, + "grad_norm": 0.7845218777656555, + "learning_rate": 0.0003212308058160076, + "loss": 3.3015, + "step": 79950 + }, + { + "epoch": 5.432463649952439, + "grad_norm": 0.8861736059188843, + "learning_rate": 0.0003211883408071749, + "loss": 3.4138, + "step": 79955 + }, + { + "epoch": 5.432803370023101, + "grad_norm": 1.256034255027771, + "learning_rate": 0.00032114587579834216, + "loss": 3.5259, + "step": 79960 + }, + { + "epoch": 5.433143090093763, + "grad_norm": 1.0277302265167236, + "learning_rate": 0.00032110341078950944, + "loss": 3.5023, + "step": 79965 + }, + { + "epoch": 5.433482810164424, + "grad_norm": 0.8671965599060059, + "learning_rate": 0.0003210609457806767, + "loss": 3.5146, + "step": 79970 + }, + { + "epoch": 5.433822530235086, + "grad_norm": 0.862398624420166, + "learning_rate": 0.00032101848077184406, + "loss": 3.3211, + "step": 79975 + }, + { + "epoch": 5.434162250305748, + "grad_norm": 0.9296576380729675, + "learning_rate": 0.0003209760157630113, + "loss": 3.26, + "step": 79980 + }, + { + "epoch": 5.4345019703764095, + "grad_norm": 0.8994754552841187, + "learning_rate": 0.00032093355075417856, + "loss": 3.3386, + "step": 79985 + }, + { + "epoch": 5.4348416904470715, + "grad_norm": 0.7039737105369568, + "learning_rate": 0.00032089108574534584, + "loss": 3.6183, + "step": 79990 + }, + { + "epoch": 5.435181410517734, + "grad_norm": 0.8544374108314514, + "learning_rate": 0.0003208486207365131, + "loss": 3.2972, + "step": 79995 + }, + { + "epoch": 5.435521130588395, + "grad_norm": 0.8900360465049744, + "learning_rate": 0.0003208061557276804, + "loss": 3.4646, + "step": 80000 + }, + { + "epoch": 5.435860850659057, + "grad_norm": 0.8716821670532227, + "learning_rate": 0.0003207636907188477, + "loss": 3.4876, + "step": 80005 + }, + { + "epoch": 5.436200570729719, + "grad_norm": 0.8669501543045044, + "learning_rate": 0.00032072122571001496, + "loss": 3.4888, + "step": 80010 + }, + { + "epoch": 5.43654029080038, + "grad_norm": 0.888417661190033, + "learning_rate": 0.00032067876070118224, + "loss": 3.5112, + "step": 80015 + }, + { + "epoch": 5.436880010871042, + "grad_norm": 1.274094581604004, + "learning_rate": 0.0003206362956923495, + "loss": 3.6114, + "step": 80020 + }, + { + "epoch": 5.437219730941704, + "grad_norm": 0.9025928974151611, + "learning_rate": 0.00032059383068351675, + "loss": 3.258, + "step": 80025 + }, + { + "epoch": 5.4375594510123655, + "grad_norm": 1.1116725206375122, + "learning_rate": 0.0003205513656746841, + "loss": 3.6949, + "step": 80030 + }, + { + "epoch": 5.437899171083028, + "grad_norm": 0.7715994715690613, + "learning_rate": 0.00032050890066585136, + "loss": 3.5648, + "step": 80035 + }, + { + "epoch": 5.43823889115369, + "grad_norm": 0.8373642563819885, + "learning_rate": 0.0003204664356570186, + "loss": 3.4171, + "step": 80040 + }, + { + "epoch": 5.438578611224351, + "grad_norm": 0.8001989126205444, + "learning_rate": 0.0003204239706481859, + "loss": 3.2914, + "step": 80045 + }, + { + "epoch": 5.438918331295013, + "grad_norm": 0.9441534280776978, + "learning_rate": 0.0003203815056393532, + "loss": 3.498, + "step": 80050 + }, + { + "epoch": 5.439258051365675, + "grad_norm": 0.9471800923347473, + "learning_rate": 0.00032033904063052043, + "loss": 3.4935, + "step": 80055 + }, + { + "epoch": 5.439597771436336, + "grad_norm": 0.7910758852958679, + "learning_rate": 0.0003202965756216877, + "loss": 3.3353, + "step": 80060 + }, + { + "epoch": 5.439937491506998, + "grad_norm": 1.2238199710845947, + "learning_rate": 0.00032025411061285504, + "loss": 3.2325, + "step": 80065 + }, + { + "epoch": 5.44027721157766, + "grad_norm": 0.9310084581375122, + "learning_rate": 0.00032021164560402227, + "loss": 3.2377, + "step": 80070 + }, + { + "epoch": 5.4406169316483215, + "grad_norm": 0.8661611080169678, + "learning_rate": 0.00032016918059518955, + "loss": 3.3066, + "step": 80075 + }, + { + "epoch": 5.440956651718984, + "grad_norm": 1.0226691961288452, + "learning_rate": 0.0003201267155863569, + "loss": 3.4883, + "step": 80080 + }, + { + "epoch": 5.441296371789646, + "grad_norm": 0.8285424709320068, + "learning_rate": 0.0003200842505775241, + "loss": 3.6289, + "step": 80085 + }, + { + "epoch": 5.441636091860307, + "grad_norm": 0.9715149998664856, + "learning_rate": 0.0003200417855686914, + "loss": 3.1928, + "step": 80090 + }, + { + "epoch": 5.441975811930969, + "grad_norm": 0.9621602296829224, + "learning_rate": 0.0003199993205598587, + "loss": 3.4768, + "step": 80095 + }, + { + "epoch": 5.442315532001631, + "grad_norm": 1.1797081232070923, + "learning_rate": 0.00031995685555102595, + "loss": 3.3975, + "step": 80100 + }, + { + "epoch": 5.442655252072292, + "grad_norm": 3.7835307121276855, + "learning_rate": 0.00031991439054219323, + "loss": 3.3846, + "step": 80105 + }, + { + "epoch": 5.442994972142954, + "grad_norm": 1.0029194355010986, + "learning_rate": 0.0003198719255333605, + "loss": 3.4271, + "step": 80110 + }, + { + "epoch": 5.443334692213616, + "grad_norm": 0.8474113941192627, + "learning_rate": 0.0003198294605245278, + "loss": 3.5805, + "step": 80115 + }, + { + "epoch": 5.4436744122842775, + "grad_norm": 0.9101787805557251, + "learning_rate": 0.00031978699551569507, + "loss": 3.469, + "step": 80120 + }, + { + "epoch": 5.44401413235494, + "grad_norm": 0.9559616446495056, + "learning_rate": 0.00031974453050686235, + "loss": 3.2089, + "step": 80125 + }, + { + "epoch": 5.444353852425602, + "grad_norm": 0.7701300382614136, + "learning_rate": 0.00031970206549802963, + "loss": 3.6512, + "step": 80130 + }, + { + "epoch": 5.444693572496263, + "grad_norm": 0.9561347365379333, + "learning_rate": 0.0003196596004891969, + "loss": 3.4624, + "step": 80135 + }, + { + "epoch": 5.445033292566925, + "grad_norm": 1.1839675903320312, + "learning_rate": 0.0003196171354803642, + "loss": 3.8247, + "step": 80140 + }, + { + "epoch": 5.445373012637587, + "grad_norm": 1.602795958518982, + "learning_rate": 0.00031957467047153147, + "loss": 3.6378, + "step": 80145 + }, + { + "epoch": 5.445712732708248, + "grad_norm": 0.9458635449409485, + "learning_rate": 0.00031953220546269875, + "loss": 3.4715, + "step": 80150 + }, + { + "epoch": 5.44605245277891, + "grad_norm": 0.9211256504058838, + "learning_rate": 0.00031948974045386603, + "loss": 3.3905, + "step": 80155 + }, + { + "epoch": 5.446392172849572, + "grad_norm": 0.8921207785606384, + "learning_rate": 0.0003194472754450333, + "loss": 3.3903, + "step": 80160 + }, + { + "epoch": 5.4467318929202335, + "grad_norm": 0.7493058443069458, + "learning_rate": 0.0003194048104362006, + "loss": 3.6341, + "step": 80165 + }, + { + "epoch": 5.447071612990896, + "grad_norm": 1.5679749250411987, + "learning_rate": 0.00031936234542736787, + "loss": 3.5733, + "step": 80170 + }, + { + "epoch": 5.447411333061558, + "grad_norm": 1.0827479362487793, + "learning_rate": 0.00031931988041853515, + "loss": 3.541, + "step": 80175 + }, + { + "epoch": 5.447751053132219, + "grad_norm": 1.1507035493850708, + "learning_rate": 0.0003192774154097024, + "loss": 3.3185, + "step": 80180 + }, + { + "epoch": 5.448090773202881, + "grad_norm": 0.8664204478263855, + "learning_rate": 0.0003192349504008697, + "loss": 3.5362, + "step": 80185 + }, + { + "epoch": 5.448430493273543, + "grad_norm": 0.7773880958557129, + "learning_rate": 0.000319192485392037, + "loss": 3.544, + "step": 80190 + }, + { + "epoch": 5.448770213344204, + "grad_norm": 1.1592998504638672, + "learning_rate": 0.0003191500203832042, + "loss": 3.4382, + "step": 80195 + }, + { + "epoch": 5.449109933414866, + "grad_norm": 1.045702576637268, + "learning_rate": 0.00031910755537437155, + "loss": 3.43, + "step": 80200 + }, + { + "epoch": 5.449449653485528, + "grad_norm": 0.8382076025009155, + "learning_rate": 0.00031906509036553883, + "loss": 3.3986, + "step": 80205 + }, + { + "epoch": 5.4497893735561895, + "grad_norm": 1.0348231792449951, + "learning_rate": 0.00031902262535670606, + "loss": 3.3022, + "step": 80210 + }, + { + "epoch": 5.450129093626852, + "grad_norm": 1.4955037832260132, + "learning_rate": 0.00031898016034787334, + "loss": 3.7348, + "step": 80215 + }, + { + "epoch": 5.450468813697514, + "grad_norm": 0.8799879550933838, + "learning_rate": 0.00031893769533904067, + "loss": 3.3235, + "step": 80220 + }, + { + "epoch": 5.450808533768175, + "grad_norm": 1.0107990503311157, + "learning_rate": 0.0003188952303302079, + "loss": 3.2902, + "step": 80225 + }, + { + "epoch": 5.451148253838837, + "grad_norm": 1.0452646017074585, + "learning_rate": 0.0003188527653213752, + "loss": 3.4594, + "step": 80230 + }, + { + "epoch": 5.451487973909499, + "grad_norm": 0.796984076499939, + "learning_rate": 0.0003188103003125425, + "loss": 3.5612, + "step": 80235 + }, + { + "epoch": 5.45182769398016, + "grad_norm": 0.7372075319290161, + "learning_rate": 0.00031876783530370974, + "loss": 3.3261, + "step": 80240 + }, + { + "epoch": 5.452167414050822, + "grad_norm": 0.9485595226287842, + "learning_rate": 0.000318725370294877, + "loss": 3.3881, + "step": 80245 + }, + { + "epoch": 5.452507134121484, + "grad_norm": 0.9232583045959473, + "learning_rate": 0.0003186829052860443, + "loss": 3.2895, + "step": 80250 + }, + { + "epoch": 5.4528468541921455, + "grad_norm": 0.7641134858131409, + "learning_rate": 0.0003186404402772116, + "loss": 3.7243, + "step": 80255 + }, + { + "epoch": 5.453186574262808, + "grad_norm": 0.9967696666717529, + "learning_rate": 0.00031859797526837886, + "loss": 3.4379, + "step": 80260 + }, + { + "epoch": 5.45352629433347, + "grad_norm": 0.9608823657035828, + "learning_rate": 0.00031855551025954614, + "loss": 3.3987, + "step": 80265 + }, + { + "epoch": 5.453866014404131, + "grad_norm": 0.8756109476089478, + "learning_rate": 0.0003185130452507134, + "loss": 3.4477, + "step": 80270 + }, + { + "epoch": 5.454205734474793, + "grad_norm": 1.002604603767395, + "learning_rate": 0.0003184705802418807, + "loss": 3.3144, + "step": 80275 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 1.0856528282165527, + "learning_rate": 0.000318428115233048, + "loss": 3.4157, + "step": 80280 + }, + { + "epoch": 5.454885174616116, + "grad_norm": 1.090226173400879, + "learning_rate": 0.0003183856502242152, + "loss": 3.423, + "step": 80285 + }, + { + "epoch": 5.455224894686778, + "grad_norm": 1.1059675216674805, + "learning_rate": 0.00031834318521538254, + "loss": 3.2635, + "step": 80290 + }, + { + "epoch": 5.4555646147574395, + "grad_norm": 0.7887905240058899, + "learning_rate": 0.0003183007202065498, + "loss": 3.664, + "step": 80295 + }, + { + "epoch": 5.4559043348281016, + "grad_norm": 0.8990795612335205, + "learning_rate": 0.00031825825519771704, + "loss": 3.6689, + "step": 80300 + }, + { + "epoch": 5.456244054898764, + "grad_norm": 1.0197972059249878, + "learning_rate": 0.0003182157901888844, + "loss": 3.5105, + "step": 80305 + }, + { + "epoch": 5.456583774969425, + "grad_norm": 0.9389737248420715, + "learning_rate": 0.00031817332518005166, + "loss": 3.4477, + "step": 80310 + }, + { + "epoch": 5.456923495040087, + "grad_norm": 0.9955779910087585, + "learning_rate": 0.00031813086017121894, + "loss": 3.3387, + "step": 80315 + }, + { + "epoch": 5.457263215110749, + "grad_norm": 0.8759332299232483, + "learning_rate": 0.00031808839516238617, + "loss": 3.277, + "step": 80320 + }, + { + "epoch": 5.45760293518141, + "grad_norm": 1.0105565786361694, + "learning_rate": 0.0003180459301535535, + "loss": 3.4141, + "step": 80325 + }, + { + "epoch": 5.457942655252072, + "grad_norm": 0.9125223755836487, + "learning_rate": 0.0003180034651447208, + "loss": 3.7221, + "step": 80330 + }, + { + "epoch": 5.458282375322734, + "grad_norm": 0.7112265825271606, + "learning_rate": 0.000317961000135888, + "loss": 3.5627, + "step": 80335 + }, + { + "epoch": 5.4586220953933955, + "grad_norm": 0.9322915077209473, + "learning_rate": 0.00031791853512705534, + "loss": 3.3503, + "step": 80340 + }, + { + "epoch": 5.458961815464058, + "grad_norm": 1.089666485786438, + "learning_rate": 0.0003178760701182226, + "loss": 3.5583, + "step": 80345 + }, + { + "epoch": 5.45930153553472, + "grad_norm": 0.9125679135322571, + "learning_rate": 0.00031783360510938985, + "loss": 3.3778, + "step": 80350 + }, + { + "epoch": 5.459641255605381, + "grad_norm": 0.9331650733947754, + "learning_rate": 0.0003177911401005571, + "loss": 3.5461, + "step": 80355 + }, + { + "epoch": 5.459980975676043, + "grad_norm": 0.8801083564758301, + "learning_rate": 0.00031774867509172446, + "loss": 3.2255, + "step": 80360 + }, + { + "epoch": 5.460320695746705, + "grad_norm": 1.0674937963485718, + "learning_rate": 0.0003177062100828917, + "loss": 3.5677, + "step": 80365 + }, + { + "epoch": 5.460660415817366, + "grad_norm": 7.992137432098389, + "learning_rate": 0.00031766374507405897, + "loss": 3.4095, + "step": 80370 + }, + { + "epoch": 5.461000135888028, + "grad_norm": 0.8660706877708435, + "learning_rate": 0.0003176212800652263, + "loss": 3.5965, + "step": 80375 + }, + { + "epoch": 5.46133985595869, + "grad_norm": 1.0018383264541626, + "learning_rate": 0.0003175788150563935, + "loss": 3.534, + "step": 80380 + }, + { + "epoch": 5.4616795760293515, + "grad_norm": 0.9591741561889648, + "learning_rate": 0.0003175363500475608, + "loss": 3.3897, + "step": 80385 + }, + { + "epoch": 5.462019296100014, + "grad_norm": 0.9587635397911072, + "learning_rate": 0.00031749388503872814, + "loss": 3.6208, + "step": 80390 + }, + { + "epoch": 5.462359016170676, + "grad_norm": 0.8461111783981323, + "learning_rate": 0.00031745142002989537, + "loss": 3.529, + "step": 80395 + }, + { + "epoch": 5.462698736241337, + "grad_norm": 0.8670841455459595, + "learning_rate": 0.00031740895502106265, + "loss": 3.471, + "step": 80400 + }, + { + "epoch": 5.463038456311999, + "grad_norm": 1.0339136123657227, + "learning_rate": 0.0003173664900122299, + "loss": 3.4844, + "step": 80405 + }, + { + "epoch": 5.463378176382661, + "grad_norm": 0.8602668046951294, + "learning_rate": 0.0003173240250033972, + "loss": 3.5697, + "step": 80410 + }, + { + "epoch": 5.463717896453322, + "grad_norm": 1.1924785375595093, + "learning_rate": 0.0003172815599945645, + "loss": 3.6711, + "step": 80415 + }, + { + "epoch": 5.464057616523984, + "grad_norm": 0.9488211870193481, + "learning_rate": 0.00031723909498573177, + "loss": 3.4497, + "step": 80420 + }, + { + "epoch": 5.464397336594646, + "grad_norm": 0.7675157785415649, + "learning_rate": 0.00031719662997689905, + "loss": 3.063, + "step": 80425 + }, + { + "epoch": 5.4647370566653075, + "grad_norm": 1.0077201128005981, + "learning_rate": 0.0003171541649680663, + "loss": 3.5055, + "step": 80430 + }, + { + "epoch": 5.46507677673597, + "grad_norm": 0.7233545184135437, + "learning_rate": 0.0003171116999592336, + "loss": 3.6267, + "step": 80435 + }, + { + "epoch": 5.465416496806632, + "grad_norm": 0.9181723594665527, + "learning_rate": 0.00031706923495040083, + "loss": 3.5837, + "step": 80440 + }, + { + "epoch": 5.465756216877293, + "grad_norm": 1.2666128873825073, + "learning_rate": 0.00031702676994156817, + "loss": 3.7931, + "step": 80445 + }, + { + "epoch": 5.466095936947955, + "grad_norm": 1.1057716608047485, + "learning_rate": 0.00031698430493273545, + "loss": 3.7831, + "step": 80450 + }, + { + "epoch": 5.466435657018617, + "grad_norm": 0.7375536561012268, + "learning_rate": 0.0003169418399239027, + "loss": 3.482, + "step": 80455 + }, + { + "epoch": 5.466775377089278, + "grad_norm": 0.9361556172370911, + "learning_rate": 0.00031689937491507, + "loss": 3.6089, + "step": 80460 + }, + { + "epoch": 5.46711509715994, + "grad_norm": 1.1935408115386963, + "learning_rate": 0.0003168569099062373, + "loss": 3.7373, + "step": 80465 + }, + { + "epoch": 5.467454817230602, + "grad_norm": 1.022929310798645, + "learning_rate": 0.0003168144448974045, + "loss": 3.3234, + "step": 80470 + }, + { + "epoch": 5.4677945373012635, + "grad_norm": 0.9962301850318909, + "learning_rate": 0.0003167719798885718, + "loss": 3.3716, + "step": 80475 + }, + { + "epoch": 5.468134257371926, + "grad_norm": 0.8599129915237427, + "learning_rate": 0.00031672951487973913, + "loss": 3.5238, + "step": 80480 + }, + { + "epoch": 5.468473977442588, + "grad_norm": 0.7768940925598145, + "learning_rate": 0.0003166870498709064, + "loss": 3.2286, + "step": 80485 + }, + { + "epoch": 5.468813697513249, + "grad_norm": 1.1042178869247437, + "learning_rate": 0.00031664458486207363, + "loss": 3.4785, + "step": 80490 + }, + { + "epoch": 5.469153417583911, + "grad_norm": 1.279759407043457, + "learning_rate": 0.00031660211985324097, + "loss": 3.3287, + "step": 80495 + }, + { + "epoch": 5.469493137654573, + "grad_norm": 0.9756912589073181, + "learning_rate": 0.00031655965484440825, + "loss": 3.2746, + "step": 80500 + }, + { + "epoch": 5.469832857725234, + "grad_norm": 1.5159380435943604, + "learning_rate": 0.0003165171898355755, + "loss": 3.4729, + "step": 80505 + }, + { + "epoch": 5.470172577795896, + "grad_norm": 0.9007816314697266, + "learning_rate": 0.00031647472482674275, + "loss": 3.4252, + "step": 80510 + }, + { + "epoch": 5.470512297866558, + "grad_norm": 1.1283482313156128, + "learning_rate": 0.0003164322598179101, + "loss": 3.5506, + "step": 80515 + }, + { + "epoch": 5.4708520179372195, + "grad_norm": 1.1231189966201782, + "learning_rate": 0.0003163897948090773, + "loss": 3.2234, + "step": 80520 + }, + { + "epoch": 5.471191738007882, + "grad_norm": 1.0220391750335693, + "learning_rate": 0.0003163473298002446, + "loss": 3.3291, + "step": 80525 + }, + { + "epoch": 5.471531458078544, + "grad_norm": 0.7220825552940369, + "learning_rate": 0.00031630486479141193, + "loss": 3.5011, + "step": 80530 + }, + { + "epoch": 5.471871178149205, + "grad_norm": 0.7479972839355469, + "learning_rate": 0.00031626239978257915, + "loss": 3.2896, + "step": 80535 + }, + { + "epoch": 5.472210898219867, + "grad_norm": 1.2463107109069824, + "learning_rate": 0.00031621993477374643, + "loss": 3.4313, + "step": 80540 + }, + { + "epoch": 5.472550618290528, + "grad_norm": 1.1685484647750854, + "learning_rate": 0.0003161774697649137, + "loss": 3.5699, + "step": 80545 + }, + { + "epoch": 5.47289033836119, + "grad_norm": 0.9255625605583191, + "learning_rate": 0.000316135004756081, + "loss": 3.5232, + "step": 80550 + }, + { + "epoch": 5.473230058431852, + "grad_norm": 0.7525144219398499, + "learning_rate": 0.0003160925397472483, + "loss": 3.4836, + "step": 80555 + }, + { + "epoch": 5.4735697785025135, + "grad_norm": 0.9239212274551392, + "learning_rate": 0.00031605007473841555, + "loss": 3.4078, + "step": 80560 + }, + { + "epoch": 5.4739094985731755, + "grad_norm": 1.1576951742172241, + "learning_rate": 0.00031600760972958283, + "loss": 3.3302, + "step": 80565 + }, + { + "epoch": 5.474249218643838, + "grad_norm": 0.7894527316093445, + "learning_rate": 0.0003159651447207501, + "loss": 3.5752, + "step": 80570 + }, + { + "epoch": 5.474588938714499, + "grad_norm": 0.8381906151771545, + "learning_rate": 0.0003159226797119174, + "loss": 3.4243, + "step": 80575 + }, + { + "epoch": 5.474928658785161, + "grad_norm": 1.0875244140625, + "learning_rate": 0.0003158802147030846, + "loss": 3.7276, + "step": 80580 + }, + { + "epoch": 5.475268378855823, + "grad_norm": 1.0241554975509644, + "learning_rate": 0.00031583774969425196, + "loss": 3.6242, + "step": 80585 + }, + { + "epoch": 5.475608098926484, + "grad_norm": 0.8830274343490601, + "learning_rate": 0.00031579528468541924, + "loss": 3.6314, + "step": 80590 + }, + { + "epoch": 5.475947818997146, + "grad_norm": 1.097192645072937, + "learning_rate": 0.00031575281967658646, + "loss": 3.5781, + "step": 80595 + }, + { + "epoch": 5.476287539067808, + "grad_norm": 0.7599223256111145, + "learning_rate": 0.0003157103546677538, + "loss": 3.6199, + "step": 80600 + }, + { + "epoch": 5.4766272591384695, + "grad_norm": 0.925335705280304, + "learning_rate": 0.0003156678896589211, + "loss": 3.2831, + "step": 80605 + }, + { + "epoch": 5.476966979209132, + "grad_norm": 0.7598815560340881, + "learning_rate": 0.0003156254246500883, + "loss": 3.364, + "step": 80610 + }, + { + "epoch": 5.477306699279794, + "grad_norm": 1.2204970121383667, + "learning_rate": 0.0003155829596412556, + "loss": 3.6531, + "step": 80615 + }, + { + "epoch": 5.477646419350455, + "grad_norm": 0.7642669081687927, + "learning_rate": 0.0003155404946324229, + "loss": 3.725, + "step": 80620 + }, + { + "epoch": 5.477986139421117, + "grad_norm": 0.8724303841590881, + "learning_rate": 0.00031549802962359014, + "loss": 3.7188, + "step": 80625 + }, + { + "epoch": 5.478325859491779, + "grad_norm": 0.9082561731338501, + "learning_rate": 0.0003154555646147574, + "loss": 3.2944, + "step": 80630 + }, + { + "epoch": 5.47866557956244, + "grad_norm": 0.7944132089614868, + "learning_rate": 0.00031541309960592476, + "loss": 3.3412, + "step": 80635 + }, + { + "epoch": 5.479005299633102, + "grad_norm": 0.8999765515327454, + "learning_rate": 0.000315370634597092, + "loss": 3.6561, + "step": 80640 + }, + { + "epoch": 5.479345019703764, + "grad_norm": 0.9144772291183472, + "learning_rate": 0.00031532816958825926, + "loss": 3.3773, + "step": 80645 + }, + { + "epoch": 5.4796847397744255, + "grad_norm": 0.7646811604499817, + "learning_rate": 0.00031528570457942654, + "loss": 3.5353, + "step": 80650 + }, + { + "epoch": 5.480024459845088, + "grad_norm": 0.9247301816940308, + "learning_rate": 0.0003152432395705939, + "loss": 3.5329, + "step": 80655 + }, + { + "epoch": 5.48036417991575, + "grad_norm": 1.0358062982559204, + "learning_rate": 0.0003152007745617611, + "loss": 3.3274, + "step": 80660 + }, + { + "epoch": 5.480703899986411, + "grad_norm": 1.2111520767211914, + "learning_rate": 0.0003151583095529284, + "loss": 3.3824, + "step": 80665 + }, + { + "epoch": 5.481043620057073, + "grad_norm": 0.8121327757835388, + "learning_rate": 0.0003151158445440957, + "loss": 3.4946, + "step": 80670 + }, + { + "epoch": 5.481383340127735, + "grad_norm": 1.0313273668289185, + "learning_rate": 0.00031507337953526294, + "loss": 3.646, + "step": 80675 + }, + { + "epoch": 5.481723060198396, + "grad_norm": 0.87859708070755, + "learning_rate": 0.0003150309145264302, + "loss": 3.4527, + "step": 80680 + }, + { + "epoch": 5.482062780269058, + "grad_norm": 0.9963703751564026, + "learning_rate": 0.00031498844951759756, + "loss": 3.5841, + "step": 80685 + }, + { + "epoch": 5.48240250033972, + "grad_norm": 0.7737547755241394, + "learning_rate": 0.0003149459845087648, + "loss": 3.2492, + "step": 80690 + }, + { + "epoch": 5.4827422204103815, + "grad_norm": 1.0888996124267578, + "learning_rate": 0.00031490351949993206, + "loss": 3.6334, + "step": 80695 + }, + { + "epoch": 5.483081940481044, + "grad_norm": 0.7301745414733887, + "learning_rate": 0.00031486105449109934, + "loss": 3.5221, + "step": 80700 + }, + { + "epoch": 5.483421660551706, + "grad_norm": 0.6617140769958496, + "learning_rate": 0.0003148185894822666, + "loss": 3.2421, + "step": 80705 + }, + { + "epoch": 5.483761380622367, + "grad_norm": 0.7258843183517456, + "learning_rate": 0.0003147761244734339, + "loss": 3.5016, + "step": 80710 + }, + { + "epoch": 5.484101100693029, + "grad_norm": 0.8673818707466125, + "learning_rate": 0.0003147336594646012, + "loss": 3.421, + "step": 80715 + }, + { + "epoch": 5.484440820763691, + "grad_norm": 0.9263697862625122, + "learning_rate": 0.00031469119445576846, + "loss": 3.6724, + "step": 80720 + }, + { + "epoch": 5.484780540834352, + "grad_norm": 3.000908374786377, + "learning_rate": 0.00031464872944693574, + "loss": 3.3973, + "step": 80725 + }, + { + "epoch": 5.485120260905014, + "grad_norm": 0.6787800192832947, + "learning_rate": 0.000314606264438103, + "loss": 3.6729, + "step": 80730 + }, + { + "epoch": 5.485459980975676, + "grad_norm": 2.5421805381774902, + "learning_rate": 0.00031456379942927025, + "loss": 3.5727, + "step": 80735 + }, + { + "epoch": 5.4857997010463375, + "grad_norm": 0.8370576500892639, + "learning_rate": 0.0003145213344204376, + "loss": 3.3453, + "step": 80740 + }, + { + "epoch": 5.486139421117, + "grad_norm": 0.81061190366745, + "learning_rate": 0.00031447886941160486, + "loss": 3.1989, + "step": 80745 + }, + { + "epoch": 5.486479141187662, + "grad_norm": 0.8545049428939819, + "learning_rate": 0.0003144364044027721, + "loss": 3.588, + "step": 80750 + }, + { + "epoch": 5.486818861258323, + "grad_norm": 0.9116913080215454, + "learning_rate": 0.0003143939393939394, + "loss": 3.4451, + "step": 80755 + }, + { + "epoch": 5.487158581328985, + "grad_norm": 0.8620434403419495, + "learning_rate": 0.0003143514743851067, + "loss": 3.2858, + "step": 80760 + }, + { + "epoch": 5.487498301399647, + "grad_norm": 0.9107691645622253, + "learning_rate": 0.00031430900937627393, + "loss": 3.2893, + "step": 80765 + }, + { + "epoch": 5.487838021470308, + "grad_norm": 0.8186777234077454, + "learning_rate": 0.0003142665443674412, + "loss": 3.6795, + "step": 80770 + }, + { + "epoch": 5.48817774154097, + "grad_norm": 0.9910987615585327, + "learning_rate": 0.00031422407935860854, + "loss": 3.0753, + "step": 80775 + }, + { + "epoch": 5.488517461611632, + "grad_norm": 1.1468720436096191, + "learning_rate": 0.00031418161434977577, + "loss": 3.3977, + "step": 80780 + }, + { + "epoch": 5.4888571816822935, + "grad_norm": 0.9198011159896851, + "learning_rate": 0.00031413914934094305, + "loss": 3.357, + "step": 80785 + }, + { + "epoch": 5.489196901752956, + "grad_norm": 0.7730534076690674, + "learning_rate": 0.0003140966843321104, + "loss": 3.4022, + "step": 80790 + }, + { + "epoch": 5.489536621823618, + "grad_norm": 0.7917429208755493, + "learning_rate": 0.0003140542193232776, + "loss": 3.4998, + "step": 80795 + }, + { + "epoch": 5.489876341894279, + "grad_norm": 0.8717261552810669, + "learning_rate": 0.0003140117543144449, + "loss": 3.2743, + "step": 80800 + }, + { + "epoch": 5.490216061964941, + "grad_norm": 1.3928253650665283, + "learning_rate": 0.00031396928930561217, + "loss": 3.4933, + "step": 80805 + }, + { + "epoch": 5.490555782035603, + "grad_norm": 1.3269171714782715, + "learning_rate": 0.00031392682429677945, + "loss": 3.6632, + "step": 80810 + }, + { + "epoch": 5.490895502106264, + "grad_norm": 0.8876561522483826, + "learning_rate": 0.00031388435928794673, + "loss": 3.5284, + "step": 80815 + }, + { + "epoch": 5.491235222176926, + "grad_norm": 0.9903690218925476, + "learning_rate": 0.000313841894279114, + "loss": 3.2676, + "step": 80820 + }, + { + "epoch": 5.491574942247588, + "grad_norm": 1.1255865097045898, + "learning_rate": 0.00031379942927028134, + "loss": 3.4305, + "step": 80825 + }, + { + "epoch": 5.4919146623182495, + "grad_norm": 0.9743989706039429, + "learning_rate": 0.00031375696426144857, + "loss": 3.4474, + "step": 80830 + }, + { + "epoch": 5.492254382388912, + "grad_norm": 0.7623120546340942, + "learning_rate": 0.00031371449925261585, + "loss": 3.3995, + "step": 80835 + }, + { + "epoch": 5.492594102459574, + "grad_norm": 0.8176437616348267, + "learning_rate": 0.00031367203424378313, + "loss": 3.6135, + "step": 80840 + }, + { + "epoch": 5.492933822530235, + "grad_norm": 1.0991687774658203, + "learning_rate": 0.0003136295692349504, + "loss": 3.1712, + "step": 80845 + }, + { + "epoch": 5.493273542600897, + "grad_norm": 0.8191214203834534, + "learning_rate": 0.0003135871042261177, + "loss": 3.3585, + "step": 80850 + }, + { + "epoch": 5.493613262671559, + "grad_norm": 0.923830509185791, + "learning_rate": 0.00031354463921728497, + "loss": 3.321, + "step": 80855 + }, + { + "epoch": 5.49395298274222, + "grad_norm": 0.7593967914581299, + "learning_rate": 0.00031350217420845225, + "loss": 3.6663, + "step": 80860 + }, + { + "epoch": 5.494292702812882, + "grad_norm": 0.9454252123832703, + "learning_rate": 0.00031345970919961953, + "loss": 3.4001, + "step": 80865 + }, + { + "epoch": 5.494632422883544, + "grad_norm": 0.8815407156944275, + "learning_rate": 0.0003134172441907868, + "loss": 3.7083, + "step": 80870 + }, + { + "epoch": 5.4949721429542056, + "grad_norm": 1.0383402109146118, + "learning_rate": 0.00031337477918195404, + "loss": 3.5339, + "step": 80875 + }, + { + "epoch": 5.495311863024868, + "grad_norm": 1.3942880630493164, + "learning_rate": 0.00031333231417312137, + "loss": 3.6135, + "step": 80880 + }, + { + "epoch": 5.49565158309553, + "grad_norm": 0.8521626591682434, + "learning_rate": 0.00031328984916428865, + "loss": 3.2226, + "step": 80885 + }, + { + "epoch": 5.495991303166191, + "grad_norm": 0.9190009236335754, + "learning_rate": 0.0003132473841554559, + "loss": 3.4594, + "step": 80890 + }, + { + "epoch": 5.496331023236853, + "grad_norm": 0.8421842455863953, + "learning_rate": 0.0003132049191466232, + "loss": 3.4799, + "step": 80895 + }, + { + "epoch": 5.496670743307515, + "grad_norm": 0.9567166566848755, + "learning_rate": 0.0003131624541377905, + "loss": 3.1126, + "step": 80900 + }, + { + "epoch": 5.497010463378176, + "grad_norm": 0.9814320802688599, + "learning_rate": 0.0003131199891289577, + "loss": 3.7253, + "step": 80905 + }, + { + "epoch": 5.497350183448838, + "grad_norm": 1.0834457874298096, + "learning_rate": 0.000313077524120125, + "loss": 3.3353, + "step": 80910 + }, + { + "epoch": 5.4976899035195, + "grad_norm": 0.9021211266517639, + "learning_rate": 0.00031303505911129233, + "loss": 3.7895, + "step": 80915 + }, + { + "epoch": 5.498029623590162, + "grad_norm": 0.7210484743118286, + "learning_rate": 0.00031299259410245956, + "loss": 3.4123, + "step": 80920 + }, + { + "epoch": 5.498369343660824, + "grad_norm": 0.9869989156723022, + "learning_rate": 0.00031295012909362684, + "loss": 3.4433, + "step": 80925 + }, + { + "epoch": 5.498709063731486, + "grad_norm": 1.4483628273010254, + "learning_rate": 0.00031290766408479417, + "loss": 3.7488, + "step": 80930 + }, + { + "epoch": 5.499048783802147, + "grad_norm": 1.3793151378631592, + "learning_rate": 0.0003128651990759614, + "loss": 3.5094, + "step": 80935 + }, + { + "epoch": 5.499388503872809, + "grad_norm": 0.6922215819358826, + "learning_rate": 0.0003128227340671287, + "loss": 3.3461, + "step": 80940 + }, + { + "epoch": 5.499728223943471, + "grad_norm": 0.7561509013175964, + "learning_rate": 0.000312780269058296, + "loss": 3.5206, + "step": 80945 + }, + { + "epoch": 5.500067944014132, + "grad_norm": 0.8769567012786865, + "learning_rate": 0.00031273780404946324, + "loss": 3.3935, + "step": 80950 + }, + { + "epoch": 5.500407664084794, + "grad_norm": 1.0237146615982056, + "learning_rate": 0.0003126953390406305, + "loss": 3.5021, + "step": 80955 + }, + { + "epoch": 5.500747384155456, + "grad_norm": 3.762659788131714, + "learning_rate": 0.0003126528740317978, + "loss": 3.5166, + "step": 80960 + }, + { + "epoch": 5.501087104226118, + "grad_norm": 0.839933454990387, + "learning_rate": 0.0003126104090229651, + "loss": 3.1661, + "step": 80965 + }, + { + "epoch": 5.50142682429678, + "grad_norm": 0.8734918236732483, + "learning_rate": 0.00031256794401413236, + "loss": 3.5654, + "step": 80970 + }, + { + "epoch": 5.501766544367442, + "grad_norm": 0.7974902987480164, + "learning_rate": 0.00031252547900529964, + "loss": 3.3355, + "step": 80975 + }, + { + "epoch": 5.502106264438103, + "grad_norm": 0.8493919372558594, + "learning_rate": 0.0003124830139964669, + "loss": 3.6327, + "step": 80980 + }, + { + "epoch": 5.502445984508765, + "grad_norm": 0.9078546762466431, + "learning_rate": 0.0003124405489876342, + "loss": 3.4379, + "step": 80985 + }, + { + "epoch": 5.502785704579426, + "grad_norm": 1.092421054840088, + "learning_rate": 0.0003123980839788015, + "loss": 3.2082, + "step": 80990 + }, + { + "epoch": 5.503125424650088, + "grad_norm": 0.8897440433502197, + "learning_rate": 0.0003123556189699687, + "loss": 3.0752, + "step": 80995 + }, + { + "epoch": 5.50346514472075, + "grad_norm": 0.821748673915863, + "learning_rate": 0.00031231315396113604, + "loss": 3.5559, + "step": 81000 + }, + { + "epoch": 5.5038048647914115, + "grad_norm": 0.8444026708602905, + "learning_rate": 0.0003122706889523033, + "loss": 3.5565, + "step": 81005 + }, + { + "epoch": 5.504144584862074, + "grad_norm": 0.7252334952354431, + "learning_rate": 0.0003122282239434706, + "loss": 3.2271, + "step": 81010 + }, + { + "epoch": 5.504484304932736, + "grad_norm": 1.0538671016693115, + "learning_rate": 0.0003121857589346379, + "loss": 3.3762, + "step": 81015 + }, + { + "epoch": 5.504824025003397, + "grad_norm": 0.7518550157546997, + "learning_rate": 0.00031214329392580516, + "loss": 3.4973, + "step": 81020 + }, + { + "epoch": 5.505163745074059, + "grad_norm": 0.9393234848976135, + "learning_rate": 0.00031210082891697244, + "loss": 3.3255, + "step": 81025 + }, + { + "epoch": 5.505503465144721, + "grad_norm": 0.6716040372848511, + "learning_rate": 0.00031205836390813967, + "loss": 3.3672, + "step": 81030 + }, + { + "epoch": 5.505843185215382, + "grad_norm": 0.9096150994300842, + "learning_rate": 0.000312015898899307, + "loss": 3.3978, + "step": 81035 + }, + { + "epoch": 5.506182905286044, + "grad_norm": 0.8751334547996521, + "learning_rate": 0.0003119734338904743, + "loss": 3.4776, + "step": 81040 + }, + { + "epoch": 5.506522625356706, + "grad_norm": 0.999354898929596, + "learning_rate": 0.0003119309688816415, + "loss": 3.5068, + "step": 81045 + }, + { + "epoch": 5.5068623454273675, + "grad_norm": 0.7124093174934387, + "learning_rate": 0.00031188850387280884, + "loss": 3.4729, + "step": 81050 + }, + { + "epoch": 5.50720206549803, + "grad_norm": 0.873681902885437, + "learning_rate": 0.0003118460388639761, + "loss": 3.5881, + "step": 81055 + }, + { + "epoch": 5.507541785568692, + "grad_norm": 0.9823644161224365, + "learning_rate": 0.00031180357385514335, + "loss": 3.3324, + "step": 81060 + }, + { + "epoch": 5.507881505639353, + "grad_norm": 0.8041965961456299, + "learning_rate": 0.0003117611088463106, + "loss": 3.4833, + "step": 81065 + }, + { + "epoch": 5.508221225710015, + "grad_norm": 0.7784940004348755, + "learning_rate": 0.00031171864383747796, + "loss": 3.5162, + "step": 81070 + }, + { + "epoch": 5.508560945780677, + "grad_norm": 0.9759957194328308, + "learning_rate": 0.0003116761788286452, + "loss": 3.4366, + "step": 81075 + }, + { + "epoch": 5.508900665851338, + "grad_norm": 0.8699195981025696, + "learning_rate": 0.00031163371381981247, + "loss": 3.1995, + "step": 81080 + }, + { + "epoch": 5.509240385922, + "grad_norm": 1.0761523246765137, + "learning_rate": 0.0003115912488109798, + "loss": 3.4627, + "step": 81085 + }, + { + "epoch": 5.509580105992662, + "grad_norm": 0.8605514168739319, + "learning_rate": 0.000311548783802147, + "loss": 3.3942, + "step": 81090 + }, + { + "epoch": 5.5099198260633235, + "grad_norm": 1.0430785417556763, + "learning_rate": 0.0003115063187933143, + "loss": 3.5121, + "step": 81095 + }, + { + "epoch": 5.510259546133986, + "grad_norm": 0.9115331768989563, + "learning_rate": 0.0003114638537844816, + "loss": 3.5402, + "step": 81100 + }, + { + "epoch": 5.510599266204648, + "grad_norm": 1.2525904178619385, + "learning_rate": 0.00031142138877564887, + "loss": 3.5883, + "step": 81105 + }, + { + "epoch": 5.510938986275309, + "grad_norm": 0.9731706976890564, + "learning_rate": 0.00031137892376681615, + "loss": 3.5908, + "step": 81110 + }, + { + "epoch": 5.511278706345971, + "grad_norm": 0.794813871383667, + "learning_rate": 0.0003113364587579834, + "loss": 3.3418, + "step": 81115 + }, + { + "epoch": 5.511618426416633, + "grad_norm": 0.6943268775939941, + "learning_rate": 0.0003112939937491507, + "loss": 3.4381, + "step": 81120 + }, + { + "epoch": 5.511958146487294, + "grad_norm": 0.8844769597053528, + "learning_rate": 0.000311251528740318, + "loss": 3.2225, + "step": 81125 + }, + { + "epoch": 5.512297866557956, + "grad_norm": 1.0031296014785767, + "learning_rate": 0.00031120906373148527, + "loss": 3.5803, + "step": 81130 + }, + { + "epoch": 5.512637586628618, + "grad_norm": 0.902060329914093, + "learning_rate": 0.0003111665987226525, + "loss": 3.449, + "step": 81135 + }, + { + "epoch": 5.5129773066992795, + "grad_norm": 1.8491265773773193, + "learning_rate": 0.0003111241337138198, + "loss": 3.3932, + "step": 81140 + }, + { + "epoch": 5.513317026769942, + "grad_norm": 1.419859766960144, + "learning_rate": 0.0003110816687049871, + "loss": 3.6403, + "step": 81145 + }, + { + "epoch": 5.513656746840604, + "grad_norm": 0.8864592909812927, + "learning_rate": 0.00031103920369615433, + "loss": 3.5775, + "step": 81150 + }, + { + "epoch": 5.513996466911265, + "grad_norm": 0.7180608510971069, + "learning_rate": 0.00031099673868732167, + "loss": 3.35, + "step": 81155 + }, + { + "epoch": 5.514336186981927, + "grad_norm": 1.10251784324646, + "learning_rate": 0.00031095427367848895, + "loss": 3.7659, + "step": 81160 + }, + { + "epoch": 5.514675907052589, + "grad_norm": 0.9915873408317566, + "learning_rate": 0.0003109118086696562, + "loss": 3.6078, + "step": 81165 + }, + { + "epoch": 5.51501562712325, + "grad_norm": 1.0368765592575073, + "learning_rate": 0.00031086934366082345, + "loss": 3.5125, + "step": 81170 + }, + { + "epoch": 5.515355347193912, + "grad_norm": 1.0795496702194214, + "learning_rate": 0.0003108268786519908, + "loss": 3.3183, + "step": 81175 + }, + { + "epoch": 5.515695067264574, + "grad_norm": 0.893660306930542, + "learning_rate": 0.00031078441364315807, + "loss": 3.5459, + "step": 81180 + }, + { + "epoch": 5.516034787335236, + "grad_norm": 1.097823977470398, + "learning_rate": 0.0003107419486343253, + "loss": 3.5103, + "step": 81185 + }, + { + "epoch": 5.516374507405898, + "grad_norm": 0.8254541754722595, + "learning_rate": 0.00031069948362549263, + "loss": 3.5669, + "step": 81190 + }, + { + "epoch": 5.516714227476559, + "grad_norm": 0.9592100381851196, + "learning_rate": 0.0003106570186166599, + "loss": 3.4527, + "step": 81195 + }, + { + "epoch": 5.517053947547221, + "grad_norm": 0.7741908431053162, + "learning_rate": 0.00031061455360782713, + "loss": 3.5368, + "step": 81200 + }, + { + "epoch": 5.517393667617883, + "grad_norm": 1.1800304651260376, + "learning_rate": 0.0003105720885989944, + "loss": 3.3974, + "step": 81205 + }, + { + "epoch": 5.517733387688544, + "grad_norm": 0.9996441006660461, + "learning_rate": 0.00031052962359016175, + "loss": 3.63, + "step": 81210 + }, + { + "epoch": 5.518073107759206, + "grad_norm": 0.802487850189209, + "learning_rate": 0.000310487158581329, + "loss": 3.5226, + "step": 81215 + }, + { + "epoch": 5.518412827829868, + "grad_norm": 0.7857689261436462, + "learning_rate": 0.00031044469357249625, + "loss": 3.5699, + "step": 81220 + }, + { + "epoch": 5.5187525479005295, + "grad_norm": 0.9140170812606812, + "learning_rate": 0.0003104022285636636, + "loss": 3.7535, + "step": 81225 + }, + { + "epoch": 5.519092267971192, + "grad_norm": 0.9124502539634705, + "learning_rate": 0.0003103597635548308, + "loss": 3.3696, + "step": 81230 + }, + { + "epoch": 5.519431988041854, + "grad_norm": 1.9604969024658203, + "learning_rate": 0.0003103172985459981, + "loss": 3.6893, + "step": 81235 + }, + { + "epoch": 5.519771708112515, + "grad_norm": 0.7832944989204407, + "learning_rate": 0.00031027483353716543, + "loss": 3.6859, + "step": 81240 + }, + { + "epoch": 5.520111428183177, + "grad_norm": 0.7533808350563049, + "learning_rate": 0.00031023236852833265, + "loss": 3.6967, + "step": 81245 + }, + { + "epoch": 5.520451148253839, + "grad_norm": 0.8609068393707275, + "learning_rate": 0.00031018990351949993, + "loss": 3.7202, + "step": 81250 + }, + { + "epoch": 5.5207908683245, + "grad_norm": 0.8382969498634338, + "learning_rate": 0.0003101474385106672, + "loss": 3.3727, + "step": 81255 + }, + { + "epoch": 5.521130588395162, + "grad_norm": 0.9900043606758118, + "learning_rate": 0.0003101049735018345, + "loss": 3.7391, + "step": 81260 + }, + { + "epoch": 5.521470308465824, + "grad_norm": 0.9239123463630676, + "learning_rate": 0.0003100625084930018, + "loss": 3.4028, + "step": 81265 + }, + { + "epoch": 5.5218100285364855, + "grad_norm": 0.9753579497337341, + "learning_rate": 0.00031002004348416905, + "loss": 3.3947, + "step": 81270 + }, + { + "epoch": 5.522149748607148, + "grad_norm": 1.0288466215133667, + "learning_rate": 0.00030997757847533633, + "loss": 3.3427, + "step": 81275 + }, + { + "epoch": 5.52248946867781, + "grad_norm": 0.9127027988433838, + "learning_rate": 0.0003099351134665036, + "loss": 3.5782, + "step": 81280 + }, + { + "epoch": 5.522829188748471, + "grad_norm": 0.9700169563293457, + "learning_rate": 0.0003098926484576709, + "loss": 3.6307, + "step": 81285 + }, + { + "epoch": 5.523168908819133, + "grad_norm": 1.2641404867172241, + "learning_rate": 0.0003098501834488381, + "loss": 3.7075, + "step": 81290 + }, + { + "epoch": 5.523508628889795, + "grad_norm": 1.0381836891174316, + "learning_rate": 0.00030980771844000545, + "loss": 3.3933, + "step": 81295 + }, + { + "epoch": 5.523848348960456, + "grad_norm": 0.7891045212745667, + "learning_rate": 0.00030976525343117274, + "loss": 3.485, + "step": 81300 + }, + { + "epoch": 5.524188069031118, + "grad_norm": 0.8103420734405518, + "learning_rate": 0.00030972278842233996, + "loss": 3.5202, + "step": 81305 + }, + { + "epoch": 5.52452778910178, + "grad_norm": 1.2312097549438477, + "learning_rate": 0.0003096803234135073, + "loss": 3.5838, + "step": 81310 + }, + { + "epoch": 5.5248675091724415, + "grad_norm": 0.9017577171325684, + "learning_rate": 0.0003096378584046746, + "loss": 3.4031, + "step": 81315 + }, + { + "epoch": 5.525207229243104, + "grad_norm": 1.0606924295425415, + "learning_rate": 0.0003095953933958418, + "loss": 3.2913, + "step": 81320 + }, + { + "epoch": 5.525546949313766, + "grad_norm": 0.8020796179771423, + "learning_rate": 0.0003095529283870091, + "loss": 3.3846, + "step": 81325 + }, + { + "epoch": 5.525886669384427, + "grad_norm": 0.7487274408340454, + "learning_rate": 0.0003095104633781764, + "loss": 3.385, + "step": 81330 + }, + { + "epoch": 5.526226389455089, + "grad_norm": 0.9573579430580139, + "learning_rate": 0.00030946799836934364, + "loss": 3.7265, + "step": 81335 + }, + { + "epoch": 5.526566109525751, + "grad_norm": 0.763799786567688, + "learning_rate": 0.0003094255333605109, + "loss": 3.5656, + "step": 81340 + }, + { + "epoch": 5.526905829596412, + "grad_norm": 1.0272263288497925, + "learning_rate": 0.00030938306835167826, + "loss": 3.4466, + "step": 81345 + }, + { + "epoch": 5.527245549667074, + "grad_norm": 0.9075291156768799, + "learning_rate": 0.00030934060334284554, + "loss": 3.6394, + "step": 81350 + }, + { + "epoch": 5.527585269737736, + "grad_norm": 0.9421831369400024, + "learning_rate": 0.00030929813833401276, + "loss": 3.5803, + "step": 81355 + }, + { + "epoch": 5.5279249898083975, + "grad_norm": 0.8914426565170288, + "learning_rate": 0.00030925567332518004, + "loss": 3.2537, + "step": 81360 + }, + { + "epoch": 5.52826470987906, + "grad_norm": 0.8479793071746826, + "learning_rate": 0.0003092132083163474, + "loss": 3.4557, + "step": 81365 + }, + { + "epoch": 5.528604429949722, + "grad_norm": 1.0412880182266235, + "learning_rate": 0.0003091707433075146, + "loss": 3.1173, + "step": 81370 + }, + { + "epoch": 5.528944150020383, + "grad_norm": 0.9087055325508118, + "learning_rate": 0.0003091282782986819, + "loss": 3.434, + "step": 81375 + }, + { + "epoch": 5.529283870091045, + "grad_norm": 1.0271732807159424, + "learning_rate": 0.0003090858132898492, + "loss": 3.4359, + "step": 81380 + }, + { + "epoch": 5.529623590161707, + "grad_norm": 0.9190521836280823, + "learning_rate": 0.00030904334828101644, + "loss": 3.5741, + "step": 81385 + }, + { + "epoch": 5.529963310232368, + "grad_norm": 0.7449003458023071, + "learning_rate": 0.0003090008832721837, + "loss": 3.4989, + "step": 81390 + }, + { + "epoch": 5.53030303030303, + "grad_norm": 0.8472681641578674, + "learning_rate": 0.000308958418263351, + "loss": 3.3112, + "step": 81395 + }, + { + "epoch": 5.530642750373692, + "grad_norm": 1.0192617177963257, + "learning_rate": 0.0003089159532545183, + "loss": 3.3724, + "step": 81400 + }, + { + "epoch": 5.5309824704443535, + "grad_norm": 0.7930430769920349, + "learning_rate": 0.00030887348824568556, + "loss": 3.523, + "step": 81405 + }, + { + "epoch": 5.531322190515016, + "grad_norm": 1.006048321723938, + "learning_rate": 0.00030883102323685284, + "loss": 3.4587, + "step": 81410 + }, + { + "epoch": 5.531661910585678, + "grad_norm": 0.9342740178108215, + "learning_rate": 0.0003087885582280201, + "loss": 3.608, + "step": 81415 + }, + { + "epoch": 5.532001630656339, + "grad_norm": 0.7891435027122498, + "learning_rate": 0.0003087460932191874, + "loss": 3.6028, + "step": 81420 + }, + { + "epoch": 5.532341350727001, + "grad_norm": 1.5499881505966187, + "learning_rate": 0.0003087036282103547, + "loss": 3.3014, + "step": 81425 + }, + { + "epoch": 5.532681070797663, + "grad_norm": 0.7744536995887756, + "learning_rate": 0.0003086611632015219, + "loss": 3.3221, + "step": 81430 + }, + { + "epoch": 5.533020790868324, + "grad_norm": 0.8496066331863403, + "learning_rate": 0.00030861869819268924, + "loss": 3.5811, + "step": 81435 + }, + { + "epoch": 5.533360510938986, + "grad_norm": 0.9043992161750793, + "learning_rate": 0.0003085762331838565, + "loss": 3.5282, + "step": 81440 + }, + { + "epoch": 5.533700231009648, + "grad_norm": 0.9672937989234924, + "learning_rate": 0.00030853376817502375, + "loss": 3.5742, + "step": 81445 + }, + { + "epoch": 5.5340399510803095, + "grad_norm": 0.9414442181587219, + "learning_rate": 0.0003084913031661911, + "loss": 3.2642, + "step": 81450 + }, + { + "epoch": 5.534379671150972, + "grad_norm": 0.8413162231445312, + "learning_rate": 0.00030844883815735836, + "loss": 3.4864, + "step": 81455 + }, + { + "epoch": 5.534719391221634, + "grad_norm": 1.8403575420379639, + "learning_rate": 0.0003084063731485256, + "loss": 3.4197, + "step": 81460 + }, + { + "epoch": 5.535059111292295, + "grad_norm": 1.1641454696655273, + "learning_rate": 0.00030836390813969287, + "loss": 3.5294, + "step": 81465 + }, + { + "epoch": 5.535398831362957, + "grad_norm": 0.9248160123825073, + "learning_rate": 0.0003083214431308602, + "loss": 3.3538, + "step": 81470 + }, + { + "epoch": 5.535738551433619, + "grad_norm": 0.785529613494873, + "learning_rate": 0.00030827897812202743, + "loss": 3.6154, + "step": 81475 + }, + { + "epoch": 5.53607827150428, + "grad_norm": 0.9424094557762146, + "learning_rate": 0.0003082365131131947, + "loss": 3.5091, + "step": 81480 + }, + { + "epoch": 5.536417991574942, + "grad_norm": 0.9527498483657837, + "learning_rate": 0.00030819404810436204, + "loss": 3.4758, + "step": 81485 + }, + { + "epoch": 5.536757711645604, + "grad_norm": 0.7874264121055603, + "learning_rate": 0.00030815158309552927, + "loss": 3.3896, + "step": 81490 + }, + { + "epoch": 5.537097431716266, + "grad_norm": 0.8474417328834534, + "learning_rate": 0.00030810911808669655, + "loss": 3.6189, + "step": 81495 + }, + { + "epoch": 5.537437151786928, + "grad_norm": 0.8775928020477295, + "learning_rate": 0.00030806665307786383, + "loss": 3.367, + "step": 81500 + }, + { + "epoch": 5.53777687185759, + "grad_norm": 0.8484158515930176, + "learning_rate": 0.0003080241880690311, + "loss": 3.8098, + "step": 81505 + }, + { + "epoch": 5.538116591928251, + "grad_norm": 0.8923918604850769, + "learning_rate": 0.0003079817230601984, + "loss": 3.2615, + "step": 81510 + }, + { + "epoch": 5.538456311998913, + "grad_norm": 1.045936107635498, + "learning_rate": 0.00030793925805136567, + "loss": 3.4486, + "step": 81515 + }, + { + "epoch": 5.538796032069575, + "grad_norm": 1.0422921180725098, + "learning_rate": 0.000307896793042533, + "loss": 3.467, + "step": 81520 + }, + { + "epoch": 5.539135752140236, + "grad_norm": 0.7557669878005981, + "learning_rate": 0.00030785432803370023, + "loss": 3.5995, + "step": 81525 + }, + { + "epoch": 5.539475472210898, + "grad_norm": 0.9232848882675171, + "learning_rate": 0.0003078118630248675, + "loss": 3.605, + "step": 81530 + }, + { + "epoch": 5.53981519228156, + "grad_norm": 0.8446678519248962, + "learning_rate": 0.00030776939801603484, + "loss": 3.5914, + "step": 81535 + }, + { + "epoch": 5.540154912352222, + "grad_norm": 0.9507452249526978, + "learning_rate": 0.00030772693300720207, + "loss": 3.5796, + "step": 81540 + }, + { + "epoch": 5.540494632422884, + "grad_norm": 0.6620997786521912, + "learning_rate": 0.00030768446799836935, + "loss": 3.4908, + "step": 81545 + }, + { + "epoch": 5.540834352493546, + "grad_norm": 0.8895394206047058, + "learning_rate": 0.00030764200298953663, + "loss": 3.5473, + "step": 81550 + }, + { + "epoch": 5.541174072564207, + "grad_norm": 0.8932436108589172, + "learning_rate": 0.0003075995379807039, + "loss": 3.4816, + "step": 81555 + }, + { + "epoch": 5.541513792634869, + "grad_norm": 0.7118507027626038, + "learning_rate": 0.0003075570729718712, + "loss": 3.637, + "step": 81560 + }, + { + "epoch": 5.541853512705531, + "grad_norm": 0.969048023223877, + "learning_rate": 0.00030751460796303847, + "loss": 3.2136, + "step": 81565 + }, + { + "epoch": 5.542193232776192, + "grad_norm": 0.9098450541496277, + "learning_rate": 0.00030747214295420575, + "loss": 3.4478, + "step": 81570 + }, + { + "epoch": 5.542532952846854, + "grad_norm": 1.1106027364730835, + "learning_rate": 0.00030742967794537303, + "loss": 3.2368, + "step": 81575 + }, + { + "epoch": 5.542872672917516, + "grad_norm": 1.1201722621917725, + "learning_rate": 0.0003073872129365403, + "loss": 3.5099, + "step": 81580 + }, + { + "epoch": 5.543212392988178, + "grad_norm": 1.0455268621444702, + "learning_rate": 0.00030734474792770754, + "loss": 3.5766, + "step": 81585 + }, + { + "epoch": 5.54355211305884, + "grad_norm": 1.2117973566055298, + "learning_rate": 0.00030730228291887487, + "loss": 3.4078, + "step": 81590 + }, + { + "epoch": 5.543891833129502, + "grad_norm": 0.8742362260818481, + "learning_rate": 0.00030725981791004215, + "loss": 3.3199, + "step": 81595 + }, + { + "epoch": 5.544231553200163, + "grad_norm": 0.8285057544708252, + "learning_rate": 0.0003072173529012094, + "loss": 3.4184, + "step": 81600 + }, + { + "epoch": 5.544571273270825, + "grad_norm": 0.815168559551239, + "learning_rate": 0.0003071748878923767, + "loss": 3.5534, + "step": 81605 + }, + { + "epoch": 5.544910993341487, + "grad_norm": 0.7863364219665527, + "learning_rate": 0.000307132422883544, + "loss": 3.5428, + "step": 81610 + }, + { + "epoch": 5.545250713412148, + "grad_norm": 1.2266688346862793, + "learning_rate": 0.0003070899578747112, + "loss": 3.279, + "step": 81615 + }, + { + "epoch": 5.54559043348281, + "grad_norm": 1.3250237703323364, + "learning_rate": 0.0003070474928658785, + "loss": 3.246, + "step": 81620 + }, + { + "epoch": 5.545930153553472, + "grad_norm": 0.9749814867973328, + "learning_rate": 0.00030700502785704583, + "loss": 3.5012, + "step": 81625 + }, + { + "epoch": 5.546269873624134, + "grad_norm": 1.2728488445281982, + "learning_rate": 0.00030696256284821306, + "loss": 3.4301, + "step": 81630 + }, + { + "epoch": 5.546609593694796, + "grad_norm": 0.748591423034668, + "learning_rate": 0.00030692009783938034, + "loss": 3.2344, + "step": 81635 + }, + { + "epoch": 5.546949313765458, + "grad_norm": 1.1629027128219604, + "learning_rate": 0.00030687763283054767, + "loss": 3.0585, + "step": 81640 + }, + { + "epoch": 5.547289033836119, + "grad_norm": 0.8770295977592468, + "learning_rate": 0.0003068351678217149, + "loss": 3.357, + "step": 81645 + }, + { + "epoch": 5.547628753906781, + "grad_norm": 0.6806021332740784, + "learning_rate": 0.0003067927028128822, + "loss": 3.7234, + "step": 81650 + }, + { + "epoch": 5.547968473977443, + "grad_norm": 0.96271812915802, + "learning_rate": 0.00030675023780404946, + "loss": 3.1494, + "step": 81655 + }, + { + "epoch": 5.548308194048104, + "grad_norm": 0.7079302668571472, + "learning_rate": 0.00030670777279521674, + "loss": 3.5321, + "step": 81660 + }, + { + "epoch": 5.548647914118766, + "grad_norm": 0.8815109729766846, + "learning_rate": 0.000306665307786384, + "loss": 3.5055, + "step": 81665 + }, + { + "epoch": 5.5489876341894275, + "grad_norm": 1.019953966140747, + "learning_rate": 0.0003066228427775513, + "loss": 3.542, + "step": 81670 + }, + { + "epoch": 5.54932735426009, + "grad_norm": 0.8190523386001587, + "learning_rate": 0.0003065803777687186, + "loss": 3.1779, + "step": 81675 + }, + { + "epoch": 5.549667074330752, + "grad_norm": 1.111685037612915, + "learning_rate": 0.00030653791275988586, + "loss": 3.4705, + "step": 81680 + }, + { + "epoch": 5.550006794401413, + "grad_norm": 0.802570641040802, + "learning_rate": 0.00030649544775105314, + "loss": 3.3957, + "step": 81685 + }, + { + "epoch": 5.550346514472075, + "grad_norm": 0.8658155798912048, + "learning_rate": 0.0003064529827422204, + "loss": 3.4071, + "step": 81690 + }, + { + "epoch": 5.550686234542737, + "grad_norm": 1.0032360553741455, + "learning_rate": 0.0003064105177333877, + "loss": 3.4026, + "step": 81695 + }, + { + "epoch": 5.551025954613398, + "grad_norm": 0.9982784390449524, + "learning_rate": 0.000306368052724555, + "loss": 3.5603, + "step": 81700 + }, + { + "epoch": 5.55136567468406, + "grad_norm": 0.9116156101226807, + "learning_rate": 0.00030632558771572226, + "loss": 3.4395, + "step": 81705 + }, + { + "epoch": 5.551705394754722, + "grad_norm": 0.9459748268127441, + "learning_rate": 0.00030628312270688954, + "loss": 3.1819, + "step": 81710 + }, + { + "epoch": 5.5520451148253835, + "grad_norm": 0.9306715130805969, + "learning_rate": 0.0003062406576980568, + "loss": 3.5058, + "step": 81715 + }, + { + "epoch": 5.552384834896046, + "grad_norm": 1.0625226497650146, + "learning_rate": 0.0003061981926892241, + "loss": 3.4991, + "step": 81720 + }, + { + "epoch": 5.552724554966708, + "grad_norm": 1.025296926498413, + "learning_rate": 0.0003061557276803913, + "loss": 3.6969, + "step": 81725 + }, + { + "epoch": 5.553064275037369, + "grad_norm": 0.9476326704025269, + "learning_rate": 0.00030611326267155866, + "loss": 3.3019, + "step": 81730 + }, + { + "epoch": 5.553403995108031, + "grad_norm": 1.0592613220214844, + "learning_rate": 0.00030607079766272594, + "loss": 3.562, + "step": 81735 + }, + { + "epoch": 5.553743715178693, + "grad_norm": 0.9117239713668823, + "learning_rate": 0.00030602833265389317, + "loss": 3.6799, + "step": 81740 + }, + { + "epoch": 5.554083435249354, + "grad_norm": 0.7504327893257141, + "learning_rate": 0.0003059858676450605, + "loss": 3.5539, + "step": 81745 + }, + { + "epoch": 5.554423155320016, + "grad_norm": 1.0611305236816406, + "learning_rate": 0.0003059434026362278, + "loss": 3.256, + "step": 81750 + }, + { + "epoch": 5.554762875390678, + "grad_norm": 0.7392900586128235, + "learning_rate": 0.000305900937627395, + "loss": 3.3849, + "step": 81755 + }, + { + "epoch": 5.5551025954613396, + "grad_norm": 0.789951503276825, + "learning_rate": 0.0003058584726185623, + "loss": 3.4227, + "step": 81760 + }, + { + "epoch": 5.555442315532002, + "grad_norm": 1.1229350566864014, + "learning_rate": 0.0003058160076097296, + "loss": 3.3243, + "step": 81765 + }, + { + "epoch": 5.555782035602664, + "grad_norm": 0.953586757183075, + "learning_rate": 0.00030577354260089685, + "loss": 3.314, + "step": 81770 + }, + { + "epoch": 5.556121755673325, + "grad_norm": 1.139670729637146, + "learning_rate": 0.0003057310775920641, + "loss": 3.5697, + "step": 81775 + }, + { + "epoch": 5.556461475743987, + "grad_norm": 0.9670082330703735, + "learning_rate": 0.00030568861258323146, + "loss": 3.5419, + "step": 81780 + }, + { + "epoch": 5.556801195814649, + "grad_norm": 0.8223721981048584, + "learning_rate": 0.0003056461475743987, + "loss": 3.6881, + "step": 81785 + }, + { + "epoch": 5.55714091588531, + "grad_norm": 0.9354435801506042, + "learning_rate": 0.00030560368256556597, + "loss": 3.3864, + "step": 81790 + }, + { + "epoch": 5.557480635955972, + "grad_norm": 0.705143392086029, + "learning_rate": 0.0003055612175567333, + "loss": 3.2138, + "step": 81795 + }, + { + "epoch": 5.557820356026634, + "grad_norm": 0.9220160841941833, + "learning_rate": 0.0003055187525479005, + "loss": 3.3868, + "step": 81800 + }, + { + "epoch": 5.558160076097296, + "grad_norm": 0.8663750290870667, + "learning_rate": 0.0003054762875390678, + "loss": 3.528, + "step": 81805 + }, + { + "epoch": 5.558499796167958, + "grad_norm": 0.9897984862327576, + "learning_rate": 0.0003054338225302351, + "loss": 3.4112, + "step": 81810 + }, + { + "epoch": 5.55883951623862, + "grad_norm": 0.9100580215454102, + "learning_rate": 0.00030539135752140237, + "loss": 3.5451, + "step": 81815 + }, + { + "epoch": 5.559179236309281, + "grad_norm": 1.0126265287399292, + "learning_rate": 0.00030534889251256965, + "loss": 3.5437, + "step": 81820 + }, + { + "epoch": 5.559518956379943, + "grad_norm": 0.8965676426887512, + "learning_rate": 0.0003053064275037369, + "loss": 3.4497, + "step": 81825 + }, + { + "epoch": 5.559858676450605, + "grad_norm": 0.9969977736473083, + "learning_rate": 0.0003052639624949042, + "loss": 3.4081, + "step": 81830 + }, + { + "epoch": 5.560198396521266, + "grad_norm": 0.8342288136482239, + "learning_rate": 0.0003052214974860715, + "loss": 3.473, + "step": 81835 + }, + { + "epoch": 5.560538116591928, + "grad_norm": 0.9324702620506287, + "learning_rate": 0.00030517903247723877, + "loss": 3.515, + "step": 81840 + }, + { + "epoch": 5.56087783666259, + "grad_norm": 0.9207886457443237, + "learning_rate": 0.000305136567468406, + "loss": 3.5227, + "step": 81845 + }, + { + "epoch": 5.561217556733252, + "grad_norm": 0.8494712710380554, + "learning_rate": 0.0003050941024595733, + "loss": 3.3718, + "step": 81850 + }, + { + "epoch": 5.561557276803914, + "grad_norm": 0.9318746328353882, + "learning_rate": 0.0003050516374507406, + "loss": 3.642, + "step": 81855 + }, + { + "epoch": 5.561896996874576, + "grad_norm": 0.8409349322319031, + "learning_rate": 0.0003050091724419079, + "loss": 3.4012, + "step": 81860 + }, + { + "epoch": 5.562236716945237, + "grad_norm": 0.961229681968689, + "learning_rate": 0.00030496670743307517, + "loss": 3.0365, + "step": 81865 + }, + { + "epoch": 5.562576437015899, + "grad_norm": 0.8935351967811584, + "learning_rate": 0.00030492424242424245, + "loss": 3.5586, + "step": 81870 + }, + { + "epoch": 5.56291615708656, + "grad_norm": 0.786618709564209, + "learning_rate": 0.0003048817774154097, + "loss": 3.2604, + "step": 81875 + }, + { + "epoch": 5.563255877157222, + "grad_norm": 0.9865716695785522, + "learning_rate": 0.00030483931240657695, + "loss": 3.1426, + "step": 81880 + }, + { + "epoch": 5.563595597227884, + "grad_norm": 0.9849636554718018, + "learning_rate": 0.0003047968473977443, + "loss": 3.2376, + "step": 81885 + }, + { + "epoch": 5.5639353172985455, + "grad_norm": 0.8008187413215637, + "learning_rate": 0.00030475438238891157, + "loss": 3.4317, + "step": 81890 + }, + { + "epoch": 5.564275037369208, + "grad_norm": 0.9079995155334473, + "learning_rate": 0.0003047119173800788, + "loss": 3.2408, + "step": 81895 + }, + { + "epoch": 5.56461475743987, + "grad_norm": 0.927918553352356, + "learning_rate": 0.00030466945237124613, + "loss": 3.4009, + "step": 81900 + }, + { + "epoch": 5.564954477510531, + "grad_norm": 0.9902480244636536, + "learning_rate": 0.0003046269873624134, + "loss": 3.4936, + "step": 81905 + }, + { + "epoch": 5.565294197581193, + "grad_norm": 0.7465091943740845, + "learning_rate": 0.00030458452235358063, + "loss": 3.5292, + "step": 81910 + }, + { + "epoch": 5.565633917651855, + "grad_norm": 0.8548529744148254, + "learning_rate": 0.0003045420573447479, + "loss": 3.4853, + "step": 81915 + }, + { + "epoch": 5.565973637722516, + "grad_norm": 0.9467124938964844, + "learning_rate": 0.00030449959233591525, + "loss": 2.9834, + "step": 81920 + }, + { + "epoch": 5.566313357793178, + "grad_norm": 0.8567249774932861, + "learning_rate": 0.0003044571273270825, + "loss": 3.4746, + "step": 81925 + }, + { + "epoch": 5.56665307786384, + "grad_norm": 0.8060634136199951, + "learning_rate": 0.00030441466231824975, + "loss": 3.1917, + "step": 81930 + }, + { + "epoch": 5.5669927979345015, + "grad_norm": 1.08177649974823, + "learning_rate": 0.0003043721973094171, + "loss": 3.4593, + "step": 81935 + }, + { + "epoch": 5.567332518005164, + "grad_norm": 0.6874261498451233, + "learning_rate": 0.0003043297323005843, + "loss": 3.7289, + "step": 81940 + }, + { + "epoch": 5.567672238075826, + "grad_norm": 0.9873265027999878, + "learning_rate": 0.0003042872672917516, + "loss": 3.3614, + "step": 81945 + }, + { + "epoch": 5.568011958146487, + "grad_norm": 0.9214231371879578, + "learning_rate": 0.0003042448022829189, + "loss": 3.6432, + "step": 81950 + }, + { + "epoch": 5.568351678217149, + "grad_norm": 0.8076217770576477, + "learning_rate": 0.00030420233727408615, + "loss": 3.2581, + "step": 81955 + }, + { + "epoch": 5.568691398287811, + "grad_norm": 0.8961169123649597, + "learning_rate": 0.00030415987226525343, + "loss": 3.4718, + "step": 81960 + }, + { + "epoch": 5.569031118358472, + "grad_norm": 0.8468266129493713, + "learning_rate": 0.0003041174072564207, + "loss": 3.4319, + "step": 81965 + }, + { + "epoch": 5.569370838429134, + "grad_norm": 0.7869751453399658, + "learning_rate": 0.000304074942247588, + "loss": 3.6136, + "step": 81970 + }, + { + "epoch": 5.569710558499796, + "grad_norm": 4.091273784637451, + "learning_rate": 0.0003040324772387553, + "loss": 3.4811, + "step": 81975 + }, + { + "epoch": 5.5700502785704575, + "grad_norm": 1.3834716081619263, + "learning_rate": 0.00030399001222992255, + "loss": 3.4788, + "step": 81980 + }, + { + "epoch": 5.57038999864112, + "grad_norm": 0.8581205606460571, + "learning_rate": 0.0003039475472210898, + "loss": 3.5418, + "step": 81985 + }, + { + "epoch": 5.570729718711782, + "grad_norm": 0.948347270488739, + "learning_rate": 0.0003039050822122571, + "loss": 3.2235, + "step": 81990 + }, + { + "epoch": 5.571069438782443, + "grad_norm": 0.8277897238731384, + "learning_rate": 0.0003038626172034244, + "loss": 3.514, + "step": 81995 + }, + { + "epoch": 5.571409158853105, + "grad_norm": 1.3615957498550415, + "learning_rate": 0.0003038201521945916, + "loss": 3.4601, + "step": 82000 + }, + { + "epoch": 5.571748878923767, + "grad_norm": 0.8798730373382568, + "learning_rate": 0.00030377768718575895, + "loss": 3.2984, + "step": 82005 + }, + { + "epoch": 5.572088598994428, + "grad_norm": 0.964838981628418, + "learning_rate": 0.00030373522217692623, + "loss": 3.4246, + "step": 82010 + }, + { + "epoch": 5.57242831906509, + "grad_norm": 1.1229451894760132, + "learning_rate": 0.00030369275716809346, + "loss": 3.5748, + "step": 82015 + }, + { + "epoch": 5.572768039135752, + "grad_norm": 0.964598536491394, + "learning_rate": 0.00030365029215926074, + "loss": 3.3403, + "step": 82020 + }, + { + "epoch": 5.5731077592064135, + "grad_norm": 0.747417151927948, + "learning_rate": 0.0003036078271504281, + "loss": 3.6245, + "step": 82025 + }, + { + "epoch": 5.573447479277076, + "grad_norm": 0.8244746327400208, + "learning_rate": 0.00030356536214159536, + "loss": 3.6656, + "step": 82030 + }, + { + "epoch": 5.573787199347738, + "grad_norm": 0.8747579455375671, + "learning_rate": 0.0003035228971327626, + "loss": 3.7941, + "step": 82035 + }, + { + "epoch": 5.574126919418399, + "grad_norm": 0.8820817470550537, + "learning_rate": 0.0003034804321239299, + "loss": 3.6213, + "step": 82040 + }, + { + "epoch": 5.574466639489061, + "grad_norm": 0.7854366898536682, + "learning_rate": 0.0003034379671150972, + "loss": 3.4266, + "step": 82045 + }, + { + "epoch": 5.574806359559723, + "grad_norm": 0.9381937980651855, + "learning_rate": 0.0003033955021062644, + "loss": 3.5324, + "step": 82050 + }, + { + "epoch": 5.575146079630384, + "grad_norm": 0.8447019457817078, + "learning_rate": 0.0003033530370974317, + "loss": 3.3755, + "step": 82055 + }, + { + "epoch": 5.575485799701046, + "grad_norm": 1.0231516361236572, + "learning_rate": 0.00030331057208859904, + "loss": 3.4423, + "step": 82060 + }, + { + "epoch": 5.575825519771708, + "grad_norm": 0.8613697290420532, + "learning_rate": 0.00030326810707976626, + "loss": 3.2757, + "step": 82065 + }, + { + "epoch": 5.57616523984237, + "grad_norm": 0.7503589391708374, + "learning_rate": 0.00030322564207093354, + "loss": 3.4373, + "step": 82070 + }, + { + "epoch": 5.576504959913032, + "grad_norm": 1.1392546892166138, + "learning_rate": 0.0003031831770621009, + "loss": 3.3122, + "step": 82075 + }, + { + "epoch": 5.576844679983694, + "grad_norm": 0.8246403932571411, + "learning_rate": 0.0003031407120532681, + "loss": 3.5311, + "step": 82080 + }, + { + "epoch": 5.577184400054355, + "grad_norm": 0.9468549489974976, + "learning_rate": 0.0003030982470444354, + "loss": 3.6399, + "step": 82085 + }, + { + "epoch": 5.577524120125017, + "grad_norm": 0.7528762221336365, + "learning_rate": 0.0003030557820356027, + "loss": 3.6624, + "step": 82090 + }, + { + "epoch": 5.577863840195679, + "grad_norm": 0.9800229668617249, + "learning_rate": 0.00030301331702676994, + "loss": 3.39, + "step": 82095 + }, + { + "epoch": 5.57820356026634, + "grad_norm": 1.2957720756530762, + "learning_rate": 0.0003029708520179372, + "loss": 3.6897, + "step": 82100 + }, + { + "epoch": 5.578543280337002, + "grad_norm": 0.8108674883842468, + "learning_rate": 0.0003029283870091045, + "loss": 3.5175, + "step": 82105 + }, + { + "epoch": 5.578883000407664, + "grad_norm": 0.8029197454452515, + "learning_rate": 0.0003028859220002718, + "loss": 3.462, + "step": 82110 + }, + { + "epoch": 5.579222720478326, + "grad_norm": 1.0418628454208374, + "learning_rate": 0.00030284345699143906, + "loss": 3.2249, + "step": 82115 + }, + { + "epoch": 5.579562440548988, + "grad_norm": 0.9687813520431519, + "learning_rate": 0.00030280099198260634, + "loss": 3.5883, + "step": 82120 + }, + { + "epoch": 5.57990216061965, + "grad_norm": 1.0037375688552856, + "learning_rate": 0.0003027585269737736, + "loss": 3.3479, + "step": 82125 + }, + { + "epoch": 5.580241880690311, + "grad_norm": 1.1772832870483398, + "learning_rate": 0.0003027160619649409, + "loss": 3.3549, + "step": 82130 + }, + { + "epoch": 5.580581600760973, + "grad_norm": 1.099798321723938, + "learning_rate": 0.0003026735969561082, + "loss": 3.4906, + "step": 82135 + }, + { + "epoch": 5.580921320831635, + "grad_norm": 1.3639968633651733, + "learning_rate": 0.0003026311319472754, + "loss": 3.4729, + "step": 82140 + }, + { + "epoch": 5.581261040902296, + "grad_norm": 1.0824909210205078, + "learning_rate": 0.00030258866693844274, + "loss": 3.765, + "step": 82145 + }, + { + "epoch": 5.581600760972958, + "grad_norm": 0.9254806041717529, + "learning_rate": 0.00030254620192961, + "loss": 3.6803, + "step": 82150 + }, + { + "epoch": 5.58194048104362, + "grad_norm": 0.861976683139801, + "learning_rate": 0.00030250373692077725, + "loss": 3.5285, + "step": 82155 + }, + { + "epoch": 5.582280201114282, + "grad_norm": 0.9835058450698853, + "learning_rate": 0.0003024612719119446, + "loss": 3.7135, + "step": 82160 + }, + { + "epoch": 5.582619921184944, + "grad_norm": 0.8155087828636169, + "learning_rate": 0.00030241880690311186, + "loss": 3.4021, + "step": 82165 + }, + { + "epoch": 5.582959641255606, + "grad_norm": 1.021843433380127, + "learning_rate": 0.0003023763418942791, + "loss": 3.5024, + "step": 82170 + }, + { + "epoch": 5.583299361326267, + "grad_norm": 0.988379180431366, + "learning_rate": 0.00030233387688544637, + "loss": 3.6498, + "step": 82175 + }, + { + "epoch": 5.583639081396929, + "grad_norm": 1.031922698020935, + "learning_rate": 0.0003022914118766137, + "loss": 3.1922, + "step": 82180 + }, + { + "epoch": 5.583978801467591, + "grad_norm": 0.843747615814209, + "learning_rate": 0.00030224894686778093, + "loss": 3.2862, + "step": 82185 + }, + { + "epoch": 5.584318521538252, + "grad_norm": 0.9582408666610718, + "learning_rate": 0.0003022064818589482, + "loss": 3.4335, + "step": 82190 + }, + { + "epoch": 5.584658241608914, + "grad_norm": 0.903764009475708, + "learning_rate": 0.00030216401685011554, + "loss": 3.281, + "step": 82195 + }, + { + "epoch": 5.584997961679576, + "grad_norm": 0.7197360396385193, + "learning_rate": 0.0003021215518412828, + "loss": 3.6987, + "step": 82200 + }, + { + "epoch": 5.585337681750238, + "grad_norm": 0.7903319597244263, + "learning_rate": 0.00030207908683245005, + "loss": 3.4503, + "step": 82205 + }, + { + "epoch": 5.5856774018209, + "grad_norm": 0.8506431579589844, + "learning_rate": 0.00030203662182361733, + "loss": 3.1156, + "step": 82210 + }, + { + "epoch": 5.586017121891562, + "grad_norm": 0.8456606864929199, + "learning_rate": 0.00030199415681478466, + "loss": 3.5798, + "step": 82215 + }, + { + "epoch": 5.586356841962223, + "grad_norm": 1.1572433710098267, + "learning_rate": 0.0003019516918059519, + "loss": 3.3169, + "step": 82220 + }, + { + "epoch": 5.586696562032885, + "grad_norm": 1.3299356698989868, + "learning_rate": 0.00030190922679711917, + "loss": 3.5426, + "step": 82225 + }, + { + "epoch": 5.587036282103547, + "grad_norm": 0.9224075078964233, + "learning_rate": 0.0003018667617882865, + "loss": 3.4459, + "step": 82230 + }, + { + "epoch": 5.587376002174208, + "grad_norm": 0.8886950016021729, + "learning_rate": 0.00030182429677945373, + "loss": 3.6313, + "step": 82235 + }, + { + "epoch": 5.58771572224487, + "grad_norm": 1.0885686874389648, + "learning_rate": 0.000301781831770621, + "loss": 3.2142, + "step": 82240 + }, + { + "epoch": 5.588055442315532, + "grad_norm": 0.8239755630493164, + "learning_rate": 0.0003017393667617883, + "loss": 3.5698, + "step": 82245 + }, + { + "epoch": 5.588395162386194, + "grad_norm": 0.8759142160415649, + "learning_rate": 0.00030169690175295557, + "loss": 3.4423, + "step": 82250 + }, + { + "epoch": 5.588734882456856, + "grad_norm": 0.7899795174598694, + "learning_rate": 0.00030165443674412285, + "loss": 3.594, + "step": 82255 + }, + { + "epoch": 5.589074602527518, + "grad_norm": 0.79869544506073, + "learning_rate": 0.00030161197173529013, + "loss": 3.7246, + "step": 82260 + }, + { + "epoch": 5.589414322598179, + "grad_norm": 0.8189754486083984, + "learning_rate": 0.0003015695067264574, + "loss": 3.3592, + "step": 82265 + }, + { + "epoch": 5.589754042668841, + "grad_norm": 1.019060492515564, + "learning_rate": 0.0003015270417176247, + "loss": 3.5434, + "step": 82270 + }, + { + "epoch": 5.590093762739503, + "grad_norm": 0.9712685942649841, + "learning_rate": 0.00030148457670879197, + "loss": 3.3284, + "step": 82275 + }, + { + "epoch": 5.590433482810164, + "grad_norm": 1.0486847162246704, + "learning_rate": 0.0003014421116999592, + "loss": 3.3406, + "step": 82280 + }, + { + "epoch": 5.590773202880826, + "grad_norm": 0.8356035947799683, + "learning_rate": 0.00030139964669112653, + "loss": 3.0995, + "step": 82285 + }, + { + "epoch": 5.591112922951488, + "grad_norm": 0.9599518775939941, + "learning_rate": 0.0003013571816822938, + "loss": 3.4411, + "step": 82290 + }, + { + "epoch": 5.59145264302215, + "grad_norm": 1.0497380495071411, + "learning_rate": 0.00030131471667346104, + "loss": 3.3112, + "step": 82295 + }, + { + "epoch": 5.591792363092812, + "grad_norm": 0.8319849967956543, + "learning_rate": 0.00030127225166462837, + "loss": 3.5724, + "step": 82300 + }, + { + "epoch": 5.592132083163474, + "grad_norm": 0.9133396148681641, + "learning_rate": 0.00030122978665579565, + "loss": 3.426, + "step": 82305 + }, + { + "epoch": 5.592471803234135, + "grad_norm": 0.8198358416557312, + "learning_rate": 0.0003011873216469629, + "loss": 3.4275, + "step": 82310 + }, + { + "epoch": 5.592811523304797, + "grad_norm": 1.144156813621521, + "learning_rate": 0.00030114485663813016, + "loss": 3.4857, + "step": 82315 + }, + { + "epoch": 5.593151243375459, + "grad_norm": 0.8157926201820374, + "learning_rate": 0.0003011023916292975, + "loss": 3.3695, + "step": 82320 + }, + { + "epoch": 5.59349096344612, + "grad_norm": 1.0066725015640259, + "learning_rate": 0.0003010599266204647, + "loss": 3.3882, + "step": 82325 + }, + { + "epoch": 5.593830683516782, + "grad_norm": 1.1774667501449585, + "learning_rate": 0.000301017461611632, + "loss": 3.6133, + "step": 82330 + }, + { + "epoch": 5.594170403587444, + "grad_norm": 0.8732501864433289, + "learning_rate": 0.00030097499660279933, + "loss": 3.4499, + "step": 82335 + }, + { + "epoch": 5.594510123658106, + "grad_norm": 1.1965855360031128, + "learning_rate": 0.00030093253159396656, + "loss": 3.6535, + "step": 82340 + }, + { + "epoch": 5.594849843728768, + "grad_norm": 1.2349088191986084, + "learning_rate": 0.00030089006658513384, + "loss": 3.5009, + "step": 82345 + }, + { + "epoch": 5.595189563799429, + "grad_norm": 0.795509934425354, + "learning_rate": 0.0003008476015763011, + "loss": 3.1068, + "step": 82350 + }, + { + "epoch": 5.595529283870091, + "grad_norm": 0.9838965535163879, + "learning_rate": 0.0003008051365674684, + "loss": 3.8401, + "step": 82355 + }, + { + "epoch": 5.595869003940753, + "grad_norm": 0.8306987285614014, + "learning_rate": 0.0003007626715586357, + "loss": 3.5999, + "step": 82360 + }, + { + "epoch": 5.596208724011414, + "grad_norm": 0.9013603925704956, + "learning_rate": 0.00030072020654980296, + "loss": 3.7418, + "step": 82365 + }, + { + "epoch": 5.596548444082076, + "grad_norm": 1.0191317796707153, + "learning_rate": 0.0003006777415409703, + "loss": 3.5537, + "step": 82370 + }, + { + "epoch": 5.596888164152738, + "grad_norm": 0.8368712067604065, + "learning_rate": 0.0003006352765321375, + "loss": 3.3374, + "step": 82375 + }, + { + "epoch": 5.5972278842234, + "grad_norm": 1.0695958137512207, + "learning_rate": 0.0003005928115233048, + "loss": 3.5775, + "step": 82380 + }, + { + "epoch": 5.597567604294062, + "grad_norm": 1.6139408349990845, + "learning_rate": 0.00030055034651447213, + "loss": 3.2801, + "step": 82385 + }, + { + "epoch": 5.597907324364724, + "grad_norm": 0.958324134349823, + "learning_rate": 0.00030050788150563936, + "loss": 3.457, + "step": 82390 + }, + { + "epoch": 5.598247044435385, + "grad_norm": 0.7392140030860901, + "learning_rate": 0.00030046541649680664, + "loss": 3.4693, + "step": 82395 + }, + { + "epoch": 5.598586764506047, + "grad_norm": 0.8974740505218506, + "learning_rate": 0.0003004229514879739, + "loss": 3.5134, + "step": 82400 + }, + { + "epoch": 5.598926484576709, + "grad_norm": 0.915385901927948, + "learning_rate": 0.0003003804864791412, + "loss": 3.2957, + "step": 82405 + }, + { + "epoch": 5.59926620464737, + "grad_norm": 0.68277907371521, + "learning_rate": 0.0003003380214703085, + "loss": 3.3873, + "step": 82410 + }, + { + "epoch": 5.599605924718032, + "grad_norm": 1.0916481018066406, + "learning_rate": 0.00030029555646147576, + "loss": 3.6142, + "step": 82415 + }, + { + "epoch": 5.599945644788694, + "grad_norm": 2.5263521671295166, + "learning_rate": 0.00030025309145264304, + "loss": 3.59, + "step": 82420 + }, + { + "epoch": 5.600285364859356, + "grad_norm": 0.8443775177001953, + "learning_rate": 0.0003002106264438103, + "loss": 3.4595, + "step": 82425 + }, + { + "epoch": 5.600625084930018, + "grad_norm": 0.8210853934288025, + "learning_rate": 0.0003001681614349776, + "loss": 3.4907, + "step": 82430 + }, + { + "epoch": 5.60096480500068, + "grad_norm": 0.9868175387382507, + "learning_rate": 0.0003001256964261448, + "loss": 3.3893, + "step": 82435 + }, + { + "epoch": 5.601304525071341, + "grad_norm": 0.8070982694625854, + "learning_rate": 0.00030008323141731216, + "loss": 3.3311, + "step": 82440 + }, + { + "epoch": 5.601644245142003, + "grad_norm": 0.7585562467575073, + "learning_rate": 0.00030004076640847944, + "loss": 3.597, + "step": 82445 + }, + { + "epoch": 5.601983965212665, + "grad_norm": 0.9856508374214172, + "learning_rate": 0.00029999830139964666, + "loss": 3.4421, + "step": 82450 + }, + { + "epoch": 5.602323685283326, + "grad_norm": 0.975025475025177, + "learning_rate": 0.000299955836390814, + "loss": 3.5962, + "step": 82455 + }, + { + "epoch": 5.602663405353988, + "grad_norm": 1.0496126413345337, + "learning_rate": 0.0002999133713819813, + "loss": 3.3714, + "step": 82460 + }, + { + "epoch": 5.60300312542465, + "grad_norm": 1.108263373374939, + "learning_rate": 0.0002998709063731485, + "loss": 3.4329, + "step": 82465 + }, + { + "epoch": 5.603342845495312, + "grad_norm": 1.0169235467910767, + "learning_rate": 0.0002998284413643158, + "loss": 3.6333, + "step": 82470 + }, + { + "epoch": 5.603682565565974, + "grad_norm": 1.0286551713943481, + "learning_rate": 0.0002997859763554831, + "loss": 3.509, + "step": 82475 + }, + { + "epoch": 5.604022285636636, + "grad_norm": 0.8635029792785645, + "learning_rate": 0.00029974351134665035, + "loss": 3.4111, + "step": 82480 + }, + { + "epoch": 5.604362005707297, + "grad_norm": 0.7629952430725098, + "learning_rate": 0.0002997010463378176, + "loss": 3.3078, + "step": 82485 + }, + { + "epoch": 5.604701725777959, + "grad_norm": 0.829544723033905, + "learning_rate": 0.00029965858132898496, + "loss": 3.2476, + "step": 82490 + }, + { + "epoch": 5.605041445848621, + "grad_norm": 1.324083685874939, + "learning_rate": 0.0002996161163201522, + "loss": 3.3614, + "step": 82495 + }, + { + "epoch": 5.605381165919282, + "grad_norm": 0.9480860233306885, + "learning_rate": 0.00029957365131131947, + "loss": 3.4118, + "step": 82500 + }, + { + "epoch": 5.605720885989944, + "grad_norm": 1.1419827938079834, + "learning_rate": 0.00029953118630248675, + "loss": 3.5418, + "step": 82505 + }, + { + "epoch": 5.606060606060606, + "grad_norm": 0.9095557332038879, + "learning_rate": 0.000299488721293654, + "loss": 3.4919, + "step": 82510 + }, + { + "epoch": 5.606400326131268, + "grad_norm": 0.7148863077163696, + "learning_rate": 0.0002994462562848213, + "loss": 3.5781, + "step": 82515 + }, + { + "epoch": 5.60674004620193, + "grad_norm": 0.7570147514343262, + "learning_rate": 0.0002994037912759886, + "loss": 3.5266, + "step": 82520 + }, + { + "epoch": 5.607079766272592, + "grad_norm": 0.7395424246788025, + "learning_rate": 0.00029936132626715587, + "loss": 3.324, + "step": 82525 + }, + { + "epoch": 5.607419486343253, + "grad_norm": 1.159304141998291, + "learning_rate": 0.00029931886125832315, + "loss": 3.6356, + "step": 82530 + }, + { + "epoch": 5.607759206413915, + "grad_norm": 0.6632686853408813, + "learning_rate": 0.0002992763962494904, + "loss": 3.4288, + "step": 82535 + }, + { + "epoch": 5.608098926484577, + "grad_norm": 1.5349422693252563, + "learning_rate": 0.0002992339312406577, + "loss": 3.4123, + "step": 82540 + }, + { + "epoch": 5.608438646555238, + "grad_norm": 0.777924120426178, + "learning_rate": 0.000299191466231825, + "loss": 3.5523, + "step": 82545 + }, + { + "epoch": 5.6087783666259, + "grad_norm": 1.5168615579605103, + "learning_rate": 0.00029914900122299227, + "loss": 3.6415, + "step": 82550 + }, + { + "epoch": 5.6091180866965615, + "grad_norm": 1.2412922382354736, + "learning_rate": 0.00029910653621415955, + "loss": 3.1805, + "step": 82555 + }, + { + "epoch": 5.609457806767224, + "grad_norm": 0.674479067325592, + "learning_rate": 0.0002990640712053268, + "loss": 3.3702, + "step": 82560 + }, + { + "epoch": 5.609797526837886, + "grad_norm": 0.831429660320282, + "learning_rate": 0.0002990216061964941, + "loss": 3.4645, + "step": 82565 + }, + { + "epoch": 5.610137246908547, + "grad_norm": 0.8258492350578308, + "learning_rate": 0.0002989791411876614, + "loss": 3.5172, + "step": 82570 + }, + { + "epoch": 5.610476966979209, + "grad_norm": 1.225560188293457, + "learning_rate": 0.0002989366761788286, + "loss": 3.2328, + "step": 82575 + }, + { + "epoch": 5.610816687049871, + "grad_norm": 0.8803064823150635, + "learning_rate": 0.00029889421116999595, + "loss": 3.4655, + "step": 82580 + }, + { + "epoch": 5.611156407120532, + "grad_norm": 0.9098411202430725, + "learning_rate": 0.0002988517461611632, + "loss": 3.5917, + "step": 82585 + }, + { + "epoch": 5.611496127191194, + "grad_norm": 1.0054348707199097, + "learning_rate": 0.00029880928115233045, + "loss": 3.3794, + "step": 82590 + }, + { + "epoch": 5.611835847261856, + "grad_norm": 1.0529571771621704, + "learning_rate": 0.0002987668161434978, + "loss": 3.4082, + "step": 82595 + }, + { + "epoch": 5.6121755673325175, + "grad_norm": 0.9421250820159912, + "learning_rate": 0.00029872435113466507, + "loss": 3.255, + "step": 82600 + }, + { + "epoch": 5.61251528740318, + "grad_norm": 1.0112226009368896, + "learning_rate": 0.0002986818861258323, + "loss": 3.3739, + "step": 82605 + }, + { + "epoch": 5.612855007473842, + "grad_norm": 0.9568010568618774, + "learning_rate": 0.0002986394211169996, + "loss": 3.2898, + "step": 82610 + }, + { + "epoch": 5.613194727544503, + "grad_norm": 1.7519389390945435, + "learning_rate": 0.0002985969561081669, + "loss": 3.4198, + "step": 82615 + }, + { + "epoch": 5.613534447615165, + "grad_norm": 1.1059998273849487, + "learning_rate": 0.00029855449109933413, + "loss": 3.475, + "step": 82620 + }, + { + "epoch": 5.613874167685827, + "grad_norm": 1.2380024194717407, + "learning_rate": 0.0002985120260905014, + "loss": 3.6412, + "step": 82625 + }, + { + "epoch": 5.614213887756488, + "grad_norm": 1.0683209896087646, + "learning_rate": 0.00029846956108166875, + "loss": 3.5694, + "step": 82630 + }, + { + "epoch": 5.61455360782715, + "grad_norm": 0.7356081604957581, + "learning_rate": 0.000298427096072836, + "loss": 3.294, + "step": 82635 + }, + { + "epoch": 5.614893327897812, + "grad_norm": 0.7848540544509888, + "learning_rate": 0.00029838463106400325, + "loss": 3.1767, + "step": 82640 + }, + { + "epoch": 5.615233047968474, + "grad_norm": 0.8614038228988647, + "learning_rate": 0.00029834216605517053, + "loss": 3.3995, + "step": 82645 + }, + { + "epoch": 5.615572768039136, + "grad_norm": 0.8450446724891663, + "learning_rate": 0.0002982997010463378, + "loss": 3.445, + "step": 82650 + }, + { + "epoch": 5.615912488109798, + "grad_norm": 1.2327396869659424, + "learning_rate": 0.0002982572360375051, + "loss": 3.444, + "step": 82655 + }, + { + "epoch": 5.616252208180459, + "grad_norm": 0.7348139882087708, + "learning_rate": 0.0002982147710286724, + "loss": 3.55, + "step": 82660 + }, + { + "epoch": 5.616591928251121, + "grad_norm": 1.086801528930664, + "learning_rate": 0.00029817230601983965, + "loss": 3.3675, + "step": 82665 + }, + { + "epoch": 5.616931648321783, + "grad_norm": 2.773688316345215, + "learning_rate": 0.00029812984101100693, + "loss": 3.1877, + "step": 82670 + }, + { + "epoch": 5.617271368392444, + "grad_norm": 0.8606184720993042, + "learning_rate": 0.0002980873760021742, + "loss": 3.4862, + "step": 82675 + }, + { + "epoch": 5.617611088463106, + "grad_norm": 0.9149243235588074, + "learning_rate": 0.00029804491099334144, + "loss": 3.4309, + "step": 82680 + }, + { + "epoch": 5.617950808533768, + "grad_norm": 1.0936967134475708, + "learning_rate": 0.0002980024459845088, + "loss": 3.5057, + "step": 82685 + }, + { + "epoch": 5.61829052860443, + "grad_norm": 1.0084660053253174, + "learning_rate": 0.0002979684739774426, + "loss": 3.3385, + "step": 82690 + }, + { + "epoch": 5.618630248675092, + "grad_norm": 0.7227598428726196, + "learning_rate": 0.0002979260089686099, + "loss": 3.4787, + "step": 82695 + }, + { + "epoch": 5.618969968745754, + "grad_norm": 1.0506643056869507, + "learning_rate": 0.0002978835439597771, + "loss": 3.7436, + "step": 82700 + }, + { + "epoch": 5.619309688816415, + "grad_norm": 1.0549341440200806, + "learning_rate": 0.00029784107895094444, + "loss": 3.5461, + "step": 82705 + }, + { + "epoch": 5.619649408887077, + "grad_norm": 1.047432541847229, + "learning_rate": 0.0002977986139421117, + "loss": 3.5065, + "step": 82710 + }, + { + "epoch": 5.619989128957739, + "grad_norm": 0.8499552607536316, + "learning_rate": 0.00029775614893327894, + "loss": 3.3617, + "step": 82715 + }, + { + "epoch": 5.6203288490284, + "grad_norm": 0.8752502202987671, + "learning_rate": 0.0002977136839244463, + "loss": 3.183, + "step": 82720 + }, + { + "epoch": 5.620668569099062, + "grad_norm": 0.719285786151886, + "learning_rate": 0.00029767121891561356, + "loss": 3.633, + "step": 82725 + }, + { + "epoch": 5.621008289169724, + "grad_norm": 1.1122639179229736, + "learning_rate": 0.0002976287539067808, + "loss": 3.1813, + "step": 82730 + }, + { + "epoch": 5.621348009240386, + "grad_norm": 0.7415134906768799, + "learning_rate": 0.00029758628889794806, + "loss": 3.6081, + "step": 82735 + }, + { + "epoch": 5.621687729311048, + "grad_norm": 0.7810465693473816, + "learning_rate": 0.0002975438238891154, + "loss": 3.5904, + "step": 82740 + }, + { + "epoch": 5.62202744938171, + "grad_norm": 0.8887830972671509, + "learning_rate": 0.0002975013588802827, + "loss": 3.357, + "step": 82745 + }, + { + "epoch": 5.622367169452371, + "grad_norm": 0.735896110534668, + "learning_rate": 0.0002974588938714499, + "loss": 3.466, + "step": 82750 + }, + { + "epoch": 5.622706889523033, + "grad_norm": 1.042852520942688, + "learning_rate": 0.00029741642886261724, + "loss": 3.5442, + "step": 82755 + }, + { + "epoch": 5.623046609593695, + "grad_norm": 0.8833327293395996, + "learning_rate": 0.0002973739638537845, + "loss": 3.5125, + "step": 82760 + }, + { + "epoch": 5.623386329664356, + "grad_norm": 1.0001680850982666, + "learning_rate": 0.00029733149884495175, + "loss": 3.448, + "step": 82765 + }, + { + "epoch": 5.623726049735018, + "grad_norm": 1.005846381187439, + "learning_rate": 0.000297289033836119, + "loss": 3.4266, + "step": 82770 + }, + { + "epoch": 5.62406576980568, + "grad_norm": 0.9192900061607361, + "learning_rate": 0.00029724656882728636, + "loss": 3.3241, + "step": 82775 + }, + { + "epoch": 5.624405489876342, + "grad_norm": 1.220954179763794, + "learning_rate": 0.0002972041038184536, + "loss": 3.576, + "step": 82780 + }, + { + "epoch": 5.624745209947004, + "grad_norm": 0.9629876613616943, + "learning_rate": 0.00029716163880962087, + "loss": 3.8158, + "step": 82785 + }, + { + "epoch": 5.625084930017666, + "grad_norm": 0.8216454386711121, + "learning_rate": 0.0002971191738007882, + "loss": 3.4106, + "step": 82790 + }, + { + "epoch": 5.625424650088327, + "grad_norm": 1.386764645576477, + "learning_rate": 0.0002970767087919554, + "loss": 3.4562, + "step": 82795 + }, + { + "epoch": 5.625764370158989, + "grad_norm": 1.0348938703536987, + "learning_rate": 0.0002970342437831227, + "loss": 3.6719, + "step": 82800 + }, + { + "epoch": 5.626104090229651, + "grad_norm": 0.9282650351524353, + "learning_rate": 0.00029699177877429, + "loss": 3.4849, + "step": 82805 + }, + { + "epoch": 5.626443810300312, + "grad_norm": 1.0235950946807861, + "learning_rate": 0.00029694931376545727, + "loss": 3.3095, + "step": 82810 + }, + { + "epoch": 5.626783530370974, + "grad_norm": 1.1868515014648438, + "learning_rate": 0.00029690684875662455, + "loss": 3.4831, + "step": 82815 + }, + { + "epoch": 5.627123250441636, + "grad_norm": 1.0348695516586304, + "learning_rate": 0.0002968643837477918, + "loss": 3.2895, + "step": 82820 + }, + { + "epoch": 5.627462970512298, + "grad_norm": 0.9190675616264343, + "learning_rate": 0.0002968219187389591, + "loss": 3.39, + "step": 82825 + }, + { + "epoch": 5.62780269058296, + "grad_norm": 0.9336915612220764, + "learning_rate": 0.0002967794537301264, + "loss": 3.5135, + "step": 82830 + }, + { + "epoch": 5.628142410653622, + "grad_norm": 0.8426969051361084, + "learning_rate": 0.00029673698872129367, + "loss": 3.241, + "step": 82835 + }, + { + "epoch": 5.628482130724283, + "grad_norm": 0.8052544593811035, + "learning_rate": 0.0002966945237124609, + "loss": 3.5897, + "step": 82840 + }, + { + "epoch": 5.628821850794945, + "grad_norm": 0.7809633016586304, + "learning_rate": 0.0002966520587036282, + "loss": 3.5164, + "step": 82845 + }, + { + "epoch": 5.629161570865607, + "grad_norm": 0.81617271900177, + "learning_rate": 0.0002966095936947955, + "loss": 3.3611, + "step": 82850 + }, + { + "epoch": 5.629501290936268, + "grad_norm": 1.1365035772323608, + "learning_rate": 0.00029656712868596273, + "loss": 3.8565, + "step": 82855 + }, + { + "epoch": 5.62984101100693, + "grad_norm": 1.0559958219528198, + "learning_rate": 0.00029652466367713007, + "loss": 3.4681, + "step": 82860 + }, + { + "epoch": 5.630180731077592, + "grad_norm": 0.6634359955787659, + "learning_rate": 0.00029648219866829735, + "loss": 3.2709, + "step": 82865 + }, + { + "epoch": 5.630520451148254, + "grad_norm": 0.811455249786377, + "learning_rate": 0.00029643973365946457, + "loss": 3.3871, + "step": 82870 + }, + { + "epoch": 5.630860171218916, + "grad_norm": 0.9984935522079468, + "learning_rate": 0.00029639726865063185, + "loss": 3.3186, + "step": 82875 + }, + { + "epoch": 5.631199891289578, + "grad_norm": 1.1625847816467285, + "learning_rate": 0.0002963548036417992, + "loss": 3.6196, + "step": 82880 + }, + { + "epoch": 5.631539611360239, + "grad_norm": 0.9148594737052917, + "learning_rate": 0.0002963123386329664, + "loss": 3.3848, + "step": 82885 + }, + { + "epoch": 5.631879331430901, + "grad_norm": 1.3087009191513062, + "learning_rate": 0.0002962698736241337, + "loss": 3.5903, + "step": 82890 + }, + { + "epoch": 5.632219051501563, + "grad_norm": 1.252044677734375, + "learning_rate": 0.00029622740861530103, + "loss": 3.3171, + "step": 82895 + }, + { + "epoch": 5.632558771572224, + "grad_norm": 0.898587703704834, + "learning_rate": 0.00029618494360646825, + "loss": 3.3769, + "step": 82900 + }, + { + "epoch": 5.632898491642886, + "grad_norm": 0.623556911945343, + "learning_rate": 0.00029614247859763553, + "loss": 3.6967, + "step": 82905 + }, + { + "epoch": 5.633238211713548, + "grad_norm": 0.9160165190696716, + "learning_rate": 0.00029610001358880287, + "loss": 3.4872, + "step": 82910 + }, + { + "epoch": 5.63357793178421, + "grad_norm": 0.8547165393829346, + "learning_rate": 0.00029605754857997015, + "loss": 3.6375, + "step": 82915 + }, + { + "epoch": 5.633917651854872, + "grad_norm": 0.933552622795105, + "learning_rate": 0.0002960150835711374, + "loss": 3.3863, + "step": 82920 + }, + { + "epoch": 5.634257371925534, + "grad_norm": 0.9109623432159424, + "learning_rate": 0.00029597261856230465, + "loss": 3.3101, + "step": 82925 + }, + { + "epoch": 5.634597091996195, + "grad_norm": 1.0880091190338135, + "learning_rate": 0.000295930153553472, + "loss": 3.4209, + "step": 82930 + }, + { + "epoch": 5.634936812066857, + "grad_norm": 1.155125617980957, + "learning_rate": 0.0002958876885446392, + "loss": 3.4784, + "step": 82935 + }, + { + "epoch": 5.635276532137519, + "grad_norm": 0.7522550821304321, + "learning_rate": 0.0002958452235358065, + "loss": 2.9639, + "step": 82940 + }, + { + "epoch": 5.63561625220818, + "grad_norm": 1.0621402263641357, + "learning_rate": 0.00029580275852697383, + "loss": 3.421, + "step": 82945 + }, + { + "epoch": 5.635955972278842, + "grad_norm": 0.9183138608932495, + "learning_rate": 0.00029576029351814105, + "loss": 3.463, + "step": 82950 + }, + { + "epoch": 5.6362956923495044, + "grad_norm": 0.7272627949714661, + "learning_rate": 0.00029571782850930833, + "loss": 3.4928, + "step": 82955 + }, + { + "epoch": 5.636635412420166, + "grad_norm": 1.0552297830581665, + "learning_rate": 0.0002956753635004756, + "loss": 3.677, + "step": 82960 + }, + { + "epoch": 5.636975132490828, + "grad_norm": 0.7517940998077393, + "learning_rate": 0.0002956328984916429, + "loss": 3.3028, + "step": 82965 + }, + { + "epoch": 5.63731485256149, + "grad_norm": 0.7649659514427185, + "learning_rate": 0.0002955904334828102, + "loss": 3.4234, + "step": 82970 + }, + { + "epoch": 5.637654572632151, + "grad_norm": 0.9010518193244934, + "learning_rate": 0.00029554796847397745, + "loss": 3.8423, + "step": 82975 + }, + { + "epoch": 5.637994292702813, + "grad_norm": 0.9676318764686584, + "learning_rate": 0.00029550550346514473, + "loss": 3.3305, + "step": 82980 + }, + { + "epoch": 5.638334012773475, + "grad_norm": 1.158852219581604, + "learning_rate": 0.000295463038456312, + "loss": 3.5065, + "step": 82985 + }, + { + "epoch": 5.638673732844136, + "grad_norm": 1.0062724351882935, + "learning_rate": 0.0002954205734474793, + "loss": 3.5292, + "step": 82990 + }, + { + "epoch": 5.639013452914798, + "grad_norm": 0.7771356105804443, + "learning_rate": 0.0002953781084386465, + "loss": 3.2335, + "step": 82995 + }, + { + "epoch": 5.6393531729854605, + "grad_norm": 0.8035603165626526, + "learning_rate": 0.00029533564342981385, + "loss": 3.5616, + "step": 83000 + }, + { + "epoch": 5.639692893056122, + "grad_norm": 1.1244747638702393, + "learning_rate": 0.00029529317842098113, + "loss": 3.395, + "step": 83005 + }, + { + "epoch": 5.640032613126784, + "grad_norm": 0.9025858640670776, + "learning_rate": 0.00029525071341214836, + "loss": 3.667, + "step": 83010 + }, + { + "epoch": 5.640372333197446, + "grad_norm": 1.0146806240081787, + "learning_rate": 0.0002952082484033157, + "loss": 3.5117, + "step": 83015 + }, + { + "epoch": 5.640712053268107, + "grad_norm": 1.086349606513977, + "learning_rate": 0.000295165783394483, + "loss": 3.2558, + "step": 83020 + }, + { + "epoch": 5.641051773338769, + "grad_norm": 0.9843370318412781, + "learning_rate": 0.0002951233183856502, + "loss": 3.6393, + "step": 83025 + }, + { + "epoch": 5.64139149340943, + "grad_norm": 0.8231114149093628, + "learning_rate": 0.0002950808533768175, + "loss": 3.4701, + "step": 83030 + }, + { + "epoch": 5.641731213480092, + "grad_norm": 0.8402571082115173, + "learning_rate": 0.0002950383883679848, + "loss": 3.438, + "step": 83035 + }, + { + "epoch": 5.642070933550754, + "grad_norm": 0.8465688228607178, + "learning_rate": 0.00029499592335915204, + "loss": 3.5766, + "step": 83040 + }, + { + "epoch": 5.642410653621416, + "grad_norm": 1.5020369291305542, + "learning_rate": 0.0002949534583503193, + "loss": 3.3472, + "step": 83045 + }, + { + "epoch": 5.642750373692078, + "grad_norm": 0.8210878372192383, + "learning_rate": 0.00029491099334148666, + "loss": 3.5349, + "step": 83050 + }, + { + "epoch": 5.64309009376274, + "grad_norm": 0.9857664108276367, + "learning_rate": 0.0002948685283326539, + "loss": 3.537, + "step": 83055 + }, + { + "epoch": 5.643429813833401, + "grad_norm": 1.0040420293807983, + "learning_rate": 0.00029482606332382116, + "loss": 3.5304, + "step": 83060 + }, + { + "epoch": 5.643769533904063, + "grad_norm": 0.7423401474952698, + "learning_rate": 0.00029478359831498844, + "loss": 3.5346, + "step": 83065 + }, + { + "epoch": 5.644109253974725, + "grad_norm": 0.9391701817512512, + "learning_rate": 0.0002947411333061557, + "loss": 3.3915, + "step": 83070 + }, + { + "epoch": 5.644448974045386, + "grad_norm": 0.7893316149711609, + "learning_rate": 0.000294698668297323, + "loss": 3.347, + "step": 83075 + }, + { + "epoch": 5.644788694116048, + "grad_norm": 0.8247819542884827, + "learning_rate": 0.0002946562032884903, + "loss": 3.2101, + "step": 83080 + }, + { + "epoch": 5.64512841418671, + "grad_norm": 2.3108251094818115, + "learning_rate": 0.0002946137382796576, + "loss": 3.3703, + "step": 83085 + }, + { + "epoch": 5.645468134257372, + "grad_norm": 0.9561464190483093, + "learning_rate": 0.00029457127327082484, + "loss": 3.4698, + "step": 83090 + }, + { + "epoch": 5.645807854328034, + "grad_norm": 0.8971894383430481, + "learning_rate": 0.0002945288082619921, + "loss": 3.438, + "step": 83095 + }, + { + "epoch": 5.646147574398696, + "grad_norm": 1.016218662261963, + "learning_rate": 0.0002944863432531594, + "loss": 3.5235, + "step": 83100 + }, + { + "epoch": 5.646487294469357, + "grad_norm": 1.3055081367492676, + "learning_rate": 0.0002944438782443267, + "loss": 3.3482, + "step": 83105 + }, + { + "epoch": 5.646827014540019, + "grad_norm": 0.6970291137695312, + "learning_rate": 0.00029440141323549396, + "loss": 3.3575, + "step": 83110 + }, + { + "epoch": 5.647166734610681, + "grad_norm": 1.1152212619781494, + "learning_rate": 0.00029435894822666124, + "loss": 3.7124, + "step": 83115 + }, + { + "epoch": 5.647506454681342, + "grad_norm": 0.8771040439605713, + "learning_rate": 0.0002943164832178285, + "loss": 3.4522, + "step": 83120 + }, + { + "epoch": 5.647846174752004, + "grad_norm": 1.005172848701477, + "learning_rate": 0.0002942740182089958, + "loss": 3.3761, + "step": 83125 + }, + { + "epoch": 5.648185894822666, + "grad_norm": 1.0246533155441284, + "learning_rate": 0.0002942315532001631, + "loss": 3.5619, + "step": 83130 + }, + { + "epoch": 5.648525614893328, + "grad_norm": 0.8693761825561523, + "learning_rate": 0.0002941890881913303, + "loss": 3.4222, + "step": 83135 + }, + { + "epoch": 5.64886533496399, + "grad_norm": 0.8457663655281067, + "learning_rate": 0.00029414662318249764, + "loss": 3.4338, + "step": 83140 + }, + { + "epoch": 5.649205055034652, + "grad_norm": 0.892007052898407, + "learning_rate": 0.0002941041581736649, + "loss": 3.178, + "step": 83145 + }, + { + "epoch": 5.649544775105313, + "grad_norm": 0.7931923270225525, + "learning_rate": 0.00029406169316483215, + "loss": 3.5269, + "step": 83150 + }, + { + "epoch": 5.649884495175975, + "grad_norm": 0.9847087860107422, + "learning_rate": 0.0002940192281559995, + "loss": 3.4299, + "step": 83155 + }, + { + "epoch": 5.650224215246637, + "grad_norm": 1.161384105682373, + "learning_rate": 0.00029397676314716676, + "loss": 3.7265, + "step": 83160 + }, + { + "epoch": 5.650563935317298, + "grad_norm": 0.9489477872848511, + "learning_rate": 0.000293934298138334, + "loss": 3.5146, + "step": 83165 + }, + { + "epoch": 5.65090365538796, + "grad_norm": 0.8438978791236877, + "learning_rate": 0.00029389183312950127, + "loss": 3.4176, + "step": 83170 + }, + { + "epoch": 5.651243375458622, + "grad_norm": 1.0882558822631836, + "learning_rate": 0.0002938493681206686, + "loss": 3.5438, + "step": 83175 + }, + { + "epoch": 5.651583095529284, + "grad_norm": 1.0530973672866821, + "learning_rate": 0.00029380690311183583, + "loss": 3.2199, + "step": 83180 + }, + { + "epoch": 5.651922815599946, + "grad_norm": 1.1972612142562866, + "learning_rate": 0.0002937644381030031, + "loss": 3.4545, + "step": 83185 + }, + { + "epoch": 5.652262535670608, + "grad_norm": 1.0582321882247925, + "learning_rate": 0.00029372197309417044, + "loss": 3.2883, + "step": 83190 + }, + { + "epoch": 5.652602255741269, + "grad_norm": 0.8236549496650696, + "learning_rate": 0.00029367950808533767, + "loss": 3.3984, + "step": 83195 + }, + { + "epoch": 5.652941975811931, + "grad_norm": 0.7398567795753479, + "learning_rate": 0.00029363704307650495, + "loss": 3.4388, + "step": 83200 + }, + { + "epoch": 5.653281695882593, + "grad_norm": 0.864092230796814, + "learning_rate": 0.0002935945780676723, + "loss": 3.3111, + "step": 83205 + }, + { + "epoch": 5.653621415953254, + "grad_norm": 1.8370178937911987, + "learning_rate": 0.0002935521130588395, + "loss": 3.6851, + "step": 83210 + }, + { + "epoch": 5.653961136023916, + "grad_norm": 0.9882895946502686, + "learning_rate": 0.0002935096480500068, + "loss": 3.404, + "step": 83215 + }, + { + "epoch": 5.654300856094578, + "grad_norm": 0.7590557932853699, + "learning_rate": 0.00029346718304117407, + "loss": 3.5843, + "step": 83220 + }, + { + "epoch": 5.65464057616524, + "grad_norm": 0.8677561283111572, + "learning_rate": 0.00029342471803234135, + "loss": 3.4577, + "step": 83225 + }, + { + "epoch": 5.654980296235902, + "grad_norm": 0.9132842421531677, + "learning_rate": 0.00029338225302350863, + "loss": 3.3153, + "step": 83230 + }, + { + "epoch": 5.655320016306564, + "grad_norm": 0.9157471656799316, + "learning_rate": 0.0002933397880146759, + "loss": 3.3665, + "step": 83235 + }, + { + "epoch": 5.655659736377225, + "grad_norm": 0.9550807476043701, + "learning_rate": 0.0002932973230058432, + "loss": 3.449, + "step": 83240 + }, + { + "epoch": 5.655999456447887, + "grad_norm": 0.8664529919624329, + "learning_rate": 0.00029325485799701047, + "loss": 3.3538, + "step": 83245 + }, + { + "epoch": 5.656339176518548, + "grad_norm": 0.7395876049995422, + "learning_rate": 0.00029321239298817775, + "loss": 3.169, + "step": 83250 + }, + { + "epoch": 5.65667889658921, + "grad_norm": 1.4137564897537231, + "learning_rate": 0.00029316992797934503, + "loss": 3.7043, + "step": 83255 + }, + { + "epoch": 5.657018616659872, + "grad_norm": 1.1369050741195679, + "learning_rate": 0.0002931274629705123, + "loss": 3.2827, + "step": 83260 + }, + { + "epoch": 5.657358336730534, + "grad_norm": 0.9533289670944214, + "learning_rate": 0.0002930849979616796, + "loss": 3.6102, + "step": 83265 + }, + { + "epoch": 5.657698056801196, + "grad_norm": 0.7966260313987732, + "learning_rate": 0.00029304253295284687, + "loss": 3.7055, + "step": 83270 + }, + { + "epoch": 5.658037776871858, + "grad_norm": 0.856463611125946, + "learning_rate": 0.00029300006794401415, + "loss": 3.1695, + "step": 83275 + }, + { + "epoch": 5.658377496942519, + "grad_norm": 0.8741515278816223, + "learning_rate": 0.00029295760293518143, + "loss": 3.3929, + "step": 83280 + }, + { + "epoch": 5.658717217013181, + "grad_norm": 0.9062851071357727, + "learning_rate": 0.0002929151379263487, + "loss": 3.2124, + "step": 83285 + }, + { + "epoch": 5.659056937083843, + "grad_norm": 0.8651259541511536, + "learning_rate": 0.00029287267291751594, + "loss": 3.5312, + "step": 83290 + }, + { + "epoch": 5.659396657154504, + "grad_norm": 1.1057521104812622, + "learning_rate": 0.00029283020790868327, + "loss": 3.0417, + "step": 83295 + }, + { + "epoch": 5.659736377225166, + "grad_norm": 2.340284824371338, + "learning_rate": 0.00029278774289985055, + "loss": 3.3235, + "step": 83300 + }, + { + "epoch": 5.660076097295828, + "grad_norm": 0.9036365747451782, + "learning_rate": 0.0002927452778910178, + "loss": 3.3134, + "step": 83305 + }, + { + "epoch": 5.66041581736649, + "grad_norm": 1.1283159255981445, + "learning_rate": 0.0002927028128821851, + "loss": 3.2205, + "step": 83310 + }, + { + "epoch": 5.660755537437152, + "grad_norm": 0.712291955947876, + "learning_rate": 0.0002926603478733524, + "loss": 3.3603, + "step": 83315 + }, + { + "epoch": 5.661095257507814, + "grad_norm": 1.0397887229919434, + "learning_rate": 0.0002926178828645196, + "loss": 3.5416, + "step": 83320 + }, + { + "epoch": 5.661434977578475, + "grad_norm": 1.0280518531799316, + "learning_rate": 0.0002925754178556869, + "loss": 3.4767, + "step": 83325 + }, + { + "epoch": 5.661774697649137, + "grad_norm": 0.9437503814697266, + "learning_rate": 0.00029253295284685423, + "loss": 3.5976, + "step": 83330 + }, + { + "epoch": 5.662114417719799, + "grad_norm": 0.7488273978233337, + "learning_rate": 0.00029249048783802146, + "loss": 3.2904, + "step": 83335 + }, + { + "epoch": 5.66245413779046, + "grad_norm": 0.7392702102661133, + "learning_rate": 0.00029244802282918874, + "loss": 3.5964, + "step": 83340 + }, + { + "epoch": 5.662793857861122, + "grad_norm": 0.8745890855789185, + "learning_rate": 0.00029240555782035607, + "loss": 3.5058, + "step": 83345 + }, + { + "epoch": 5.663133577931784, + "grad_norm": 1.1320058107376099, + "learning_rate": 0.0002923630928115233, + "loss": 3.4007, + "step": 83350 + }, + { + "epoch": 5.663473298002446, + "grad_norm": 0.9163044095039368, + "learning_rate": 0.0002923206278026906, + "loss": 3.6109, + "step": 83355 + }, + { + "epoch": 5.663813018073108, + "grad_norm": 0.9028212428092957, + "learning_rate": 0.00029227816279385786, + "loss": 3.6068, + "step": 83360 + }, + { + "epoch": 5.66415273814377, + "grad_norm": 1.0310269594192505, + "learning_rate": 0.00029223569778502514, + "loss": 3.3513, + "step": 83365 + }, + { + "epoch": 5.664492458214431, + "grad_norm": 0.8864743113517761, + "learning_rate": 0.0002921932327761924, + "loss": 3.2587, + "step": 83370 + }, + { + "epoch": 5.664832178285093, + "grad_norm": 0.7554112672805786, + "learning_rate": 0.0002921507677673597, + "loss": 3.365, + "step": 83375 + }, + { + "epoch": 5.665171898355755, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.000292108302758527, + "loss": 3.3158, + "step": 83380 + }, + { + "epoch": 5.665511618426416, + "grad_norm": 0.9823889136314392, + "learning_rate": 0.00029206583774969426, + "loss": 3.2225, + "step": 83385 + }, + { + "epoch": 5.665851338497078, + "grad_norm": 0.8162235021591187, + "learning_rate": 0.00029202337274086154, + "loss": 3.6471, + "step": 83390 + }, + { + "epoch": 5.66619105856774, + "grad_norm": 0.851116955280304, + "learning_rate": 0.00029198090773202876, + "loss": 3.4475, + "step": 83395 + }, + { + "epoch": 5.666530778638402, + "grad_norm": 2.529934883117676, + "learning_rate": 0.0002919384427231961, + "loss": 3.1704, + "step": 83400 + }, + { + "epoch": 5.666870498709064, + "grad_norm": 0.9855534434318542, + "learning_rate": 0.0002918959777143634, + "loss": 3.3386, + "step": 83405 + }, + { + "epoch": 5.667210218779726, + "grad_norm": 0.7247567176818848, + "learning_rate": 0.0002918535127055306, + "loss": 3.2524, + "step": 83410 + }, + { + "epoch": 5.667549938850387, + "grad_norm": 0.788919985294342, + "learning_rate": 0.00029181104769669794, + "loss": 3.6158, + "step": 83415 + }, + { + "epoch": 5.667889658921049, + "grad_norm": 0.9521941542625427, + "learning_rate": 0.0002917685826878652, + "loss": 3.447, + "step": 83420 + }, + { + "epoch": 5.668229378991711, + "grad_norm": 0.8882750868797302, + "learning_rate": 0.0002917261176790325, + "loss": 3.3779, + "step": 83425 + }, + { + "epoch": 5.668569099062372, + "grad_norm": 1.0108823776245117, + "learning_rate": 0.0002916836526701997, + "loss": 3.4692, + "step": 83430 + }, + { + "epoch": 5.668908819133034, + "grad_norm": 0.8170201182365417, + "learning_rate": 0.00029164118766136706, + "loss": 3.618, + "step": 83435 + }, + { + "epoch": 5.669248539203696, + "grad_norm": 1.3918834924697876, + "learning_rate": 0.00029159872265253434, + "loss": 3.366, + "step": 83440 + }, + { + "epoch": 5.669588259274358, + "grad_norm": 0.9038988947868347, + "learning_rate": 0.00029155625764370156, + "loss": 3.5486, + "step": 83445 + }, + { + "epoch": 5.66992797934502, + "grad_norm": 0.9005204439163208, + "learning_rate": 0.0002915137926348689, + "loss": 3.2578, + "step": 83450 + }, + { + "epoch": 5.670267699415682, + "grad_norm": 0.7744598388671875, + "learning_rate": 0.0002914713276260362, + "loss": 3.4516, + "step": 83455 + }, + { + "epoch": 5.670607419486343, + "grad_norm": 1.6715221405029297, + "learning_rate": 0.0002914288626172034, + "loss": 3.5058, + "step": 83460 + }, + { + "epoch": 5.670947139557005, + "grad_norm": 0.877471923828125, + "learning_rate": 0.00029138639760837074, + "loss": 3.7015, + "step": 83465 + }, + { + "epoch": 5.671286859627667, + "grad_norm": 1.5308631658554077, + "learning_rate": 0.000291343932599538, + "loss": 3.4442, + "step": 83470 + }, + { + "epoch": 5.671626579698328, + "grad_norm": 0.8762699365615845, + "learning_rate": 0.00029130146759070525, + "loss": 3.1735, + "step": 83475 + }, + { + "epoch": 5.67196629976899, + "grad_norm": 2.1269843578338623, + "learning_rate": 0.0002912590025818725, + "loss": 3.2728, + "step": 83480 + }, + { + "epoch": 5.672306019839652, + "grad_norm": 0.972895085811615, + "learning_rate": 0.00029121653757303986, + "loss": 3.4321, + "step": 83485 + }, + { + "epoch": 5.672645739910314, + "grad_norm": 0.8060019612312317, + "learning_rate": 0.0002911740725642071, + "loss": 3.613, + "step": 83490 + }, + { + "epoch": 5.672985459980976, + "grad_norm": 1.1519427299499512, + "learning_rate": 0.00029113160755537437, + "loss": 3.364, + "step": 83495 + }, + { + "epoch": 5.673325180051638, + "grad_norm": 0.7765557765960693, + "learning_rate": 0.0002910891425465417, + "loss": 3.5248, + "step": 83500 + }, + { + "epoch": 5.673664900122299, + "grad_norm": 1.1052685976028442, + "learning_rate": 0.0002910466775377089, + "loss": 3.3766, + "step": 83505 + }, + { + "epoch": 5.674004620192961, + "grad_norm": 1.0152758359909058, + "learning_rate": 0.0002910042125288762, + "loss": 3.4405, + "step": 83510 + }, + { + "epoch": 5.674344340263623, + "grad_norm": 1.0194087028503418, + "learning_rate": 0.0002909617475200435, + "loss": 3.5255, + "step": 83515 + }, + { + "epoch": 5.674684060334284, + "grad_norm": 0.8403129577636719, + "learning_rate": 0.00029091928251121077, + "loss": 3.4917, + "step": 83520 + }, + { + "epoch": 5.675023780404946, + "grad_norm": 0.7378539443016052, + "learning_rate": 0.00029087681750237805, + "loss": 3.1929, + "step": 83525 + }, + { + "epoch": 5.6753635004756084, + "grad_norm": 0.9433521032333374, + "learning_rate": 0.0002908343524935453, + "loss": 3.6423, + "step": 83530 + }, + { + "epoch": 5.67570322054627, + "grad_norm": 1.1976068019866943, + "learning_rate": 0.0002907918874847126, + "loss": 3.4661, + "step": 83535 + }, + { + "epoch": 5.676042940616932, + "grad_norm": 0.9202266335487366, + "learning_rate": 0.0002907494224758799, + "loss": 3.5073, + "step": 83540 + }, + { + "epoch": 5.676382660687594, + "grad_norm": 0.9232271909713745, + "learning_rate": 0.00029070695746704717, + "loss": 3.335, + "step": 83545 + }, + { + "epoch": 5.676722380758255, + "grad_norm": 1.2885308265686035, + "learning_rate": 0.0002906644924582144, + "loss": 3.1531, + "step": 83550 + }, + { + "epoch": 5.677062100828917, + "grad_norm": 0.8193087577819824, + "learning_rate": 0.0002906220274493817, + "loss": 3.4005, + "step": 83555 + }, + { + "epoch": 5.677401820899579, + "grad_norm": 0.8035110235214233, + "learning_rate": 0.000290579562440549, + "loss": 3.3662, + "step": 83560 + }, + { + "epoch": 5.67774154097024, + "grad_norm": 0.9333395957946777, + "learning_rate": 0.00029053709743171623, + "loss": 3.4637, + "step": 83565 + }, + { + "epoch": 5.678081261040902, + "grad_norm": 1.2696019411087036, + "learning_rate": 0.00029049463242288357, + "loss": 3.5137, + "step": 83570 + }, + { + "epoch": 5.6784209811115645, + "grad_norm": 0.9691367745399475, + "learning_rate": 0.00029045216741405085, + "loss": 3.4342, + "step": 83575 + }, + { + "epoch": 5.678760701182226, + "grad_norm": 0.8282463550567627, + "learning_rate": 0.00029040970240521807, + "loss": 3.3283, + "step": 83580 + }, + { + "epoch": 5.679100421252888, + "grad_norm": 0.9359847903251648, + "learning_rate": 0.00029036723739638535, + "loss": 3.3857, + "step": 83585 + }, + { + "epoch": 5.67944014132355, + "grad_norm": 0.7729727625846863, + "learning_rate": 0.0002903247723875527, + "loss": 3.44, + "step": 83590 + }, + { + "epoch": 5.679779861394211, + "grad_norm": 0.8071892857551575, + "learning_rate": 0.00029028230737871997, + "loss": 3.3342, + "step": 83595 + }, + { + "epoch": 5.680119581464873, + "grad_norm": 0.9142494797706604, + "learning_rate": 0.0002902398423698872, + "loss": 3.5375, + "step": 83600 + }, + { + "epoch": 5.680459301535535, + "grad_norm": 1.0777134895324707, + "learning_rate": 0.00029019737736105453, + "loss": 3.4867, + "step": 83605 + }, + { + "epoch": 5.680799021606196, + "grad_norm": 0.8296523094177246, + "learning_rate": 0.0002901549123522218, + "loss": 3.6162, + "step": 83610 + }, + { + "epoch": 5.681138741676858, + "grad_norm": 0.901035487651825, + "learning_rate": 0.00029011244734338903, + "loss": 3.6028, + "step": 83615 + }, + { + "epoch": 5.6814784617475205, + "grad_norm": 0.7166416049003601, + "learning_rate": 0.0002900699823345563, + "loss": 3.4074, + "step": 83620 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 0.8282741904258728, + "learning_rate": 0.00029002751732572365, + "loss": 3.5845, + "step": 83625 + }, + { + "epoch": 5.682157901888844, + "grad_norm": 0.972112238407135, + "learning_rate": 0.0002899850523168909, + "loss": 3.5149, + "step": 83630 + }, + { + "epoch": 5.682497621959506, + "grad_norm": 0.7597414255142212, + "learning_rate": 0.00028994258730805815, + "loss": 3.4149, + "step": 83635 + }, + { + "epoch": 5.682837342030167, + "grad_norm": 1.1291098594665527, + "learning_rate": 0.0002899001222992255, + "loss": 3.6694, + "step": 83640 + }, + { + "epoch": 5.683177062100829, + "grad_norm": 0.7365636825561523, + "learning_rate": 0.0002898576572903927, + "loss": 3.5827, + "step": 83645 + }, + { + "epoch": 5.683516782171491, + "grad_norm": 0.84773188829422, + "learning_rate": 0.00028981519228156, + "loss": 3.1137, + "step": 83650 + }, + { + "epoch": 5.683856502242152, + "grad_norm": 1.1980736255645752, + "learning_rate": 0.0002897727272727273, + "loss": 3.1264, + "step": 83655 + }, + { + "epoch": 5.684196222312814, + "grad_norm": 0.6134248971939087, + "learning_rate": 0.00028973026226389455, + "loss": 3.6452, + "step": 83660 + }, + { + "epoch": 5.6845359423834765, + "grad_norm": 0.9611337780952454, + "learning_rate": 0.00028968779725506183, + "loss": 3.2734, + "step": 83665 + }, + { + "epoch": 5.684875662454138, + "grad_norm": 0.8040307760238647, + "learning_rate": 0.0002896453322462291, + "loss": 3.4302, + "step": 83670 + }, + { + "epoch": 5.6852153825248, + "grad_norm": 0.8691861033439636, + "learning_rate": 0.0002896028672373964, + "loss": 3.3818, + "step": 83675 + }, + { + "epoch": 5.685555102595462, + "grad_norm": 1.210279107093811, + "learning_rate": 0.0002895604022285637, + "loss": 3.6235, + "step": 83680 + }, + { + "epoch": 5.685894822666123, + "grad_norm": 0.8031282424926758, + "learning_rate": 0.00028951793721973095, + "loss": 3.5773, + "step": 83685 + }, + { + "epoch": 5.686234542736785, + "grad_norm": 1.041133165359497, + "learning_rate": 0.0002894754722108982, + "loss": 3.5104, + "step": 83690 + }, + { + "epoch": 5.686574262807447, + "grad_norm": 1.1003351211547852, + "learning_rate": 0.0002894330072020655, + "loss": 3.4663, + "step": 83695 + }, + { + "epoch": 5.686913982878108, + "grad_norm": 0.7298412322998047, + "learning_rate": 0.0002893905421932328, + "loss": 3.5192, + "step": 83700 + }, + { + "epoch": 5.68725370294877, + "grad_norm": 0.9523268938064575, + "learning_rate": 0.0002893480771844, + "loss": 3.3549, + "step": 83705 + }, + { + "epoch": 5.687593423019432, + "grad_norm": 0.8936564326286316, + "learning_rate": 0.00028930561217556735, + "loss": 3.1649, + "step": 83710 + }, + { + "epoch": 5.687933143090094, + "grad_norm": 0.9024315476417542, + "learning_rate": 0.00028926314716673463, + "loss": 3.356, + "step": 83715 + }, + { + "epoch": 5.688272863160756, + "grad_norm": 0.8056045174598694, + "learning_rate": 0.00028922068215790186, + "loss": 3.5041, + "step": 83720 + }, + { + "epoch": 5.688612583231417, + "grad_norm": 0.9626389741897583, + "learning_rate": 0.00028917821714906914, + "loss": 3.4953, + "step": 83725 + }, + { + "epoch": 5.688952303302079, + "grad_norm": 0.8947411775588989, + "learning_rate": 0.0002891357521402365, + "loss": 3.5996, + "step": 83730 + }, + { + "epoch": 5.689292023372741, + "grad_norm": 0.8357754945755005, + "learning_rate": 0.0002890932871314037, + "loss": 3.3353, + "step": 83735 + }, + { + "epoch": 5.689631743443402, + "grad_norm": 1.2133411169052124, + "learning_rate": 0.000289050822122571, + "loss": 3.4188, + "step": 83740 + }, + { + "epoch": 5.689971463514064, + "grad_norm": 0.865189790725708, + "learning_rate": 0.0002890083571137383, + "loss": 3.4933, + "step": 83745 + }, + { + "epoch": 5.690311183584726, + "grad_norm": 0.8070453405380249, + "learning_rate": 0.00028896589210490554, + "loss": 3.467, + "step": 83750 + }, + { + "epoch": 5.690650903655388, + "grad_norm": 0.9795275926589966, + "learning_rate": 0.0002889234270960728, + "loss": 3.5636, + "step": 83755 + }, + { + "epoch": 5.69099062372605, + "grad_norm": 0.6925147771835327, + "learning_rate": 0.00028888096208724016, + "loss": 3.5007, + "step": 83760 + }, + { + "epoch": 5.691330343796712, + "grad_norm": 0.9845404624938965, + "learning_rate": 0.00028883849707840744, + "loss": 3.5379, + "step": 83765 + }, + { + "epoch": 5.691670063867373, + "grad_norm": 0.9568605422973633, + "learning_rate": 0.00028879603206957466, + "loss": 3.3494, + "step": 83770 + }, + { + "epoch": 5.692009783938035, + "grad_norm": 0.9877433180809021, + "learning_rate": 0.00028875356706074194, + "loss": 3.4354, + "step": 83775 + }, + { + "epoch": 5.692349504008697, + "grad_norm": 0.8218603134155273, + "learning_rate": 0.0002887111020519093, + "loss": 3.4498, + "step": 83780 + }, + { + "epoch": 5.692689224079358, + "grad_norm": 0.9249626994132996, + "learning_rate": 0.0002886686370430765, + "loss": 3.4925, + "step": 83785 + }, + { + "epoch": 5.69302894415002, + "grad_norm": 1.116148591041565, + "learning_rate": 0.0002886261720342438, + "loss": 3.2019, + "step": 83790 + }, + { + "epoch": 5.693368664220682, + "grad_norm": 0.8623901605606079, + "learning_rate": 0.0002885837070254111, + "loss": 3.599, + "step": 83795 + }, + { + "epoch": 5.693708384291344, + "grad_norm": 0.9894263744354248, + "learning_rate": 0.00028854124201657834, + "loss": 3.4809, + "step": 83800 + }, + { + "epoch": 5.694048104362006, + "grad_norm": 0.7963728904724121, + "learning_rate": 0.0002884987770077456, + "loss": 3.2485, + "step": 83805 + }, + { + "epoch": 5.694387824432668, + "grad_norm": 1.0630016326904297, + "learning_rate": 0.0002884563119989129, + "loss": 3.4365, + "step": 83810 + }, + { + "epoch": 5.694727544503329, + "grad_norm": 1.060658574104309, + "learning_rate": 0.0002884138469900802, + "loss": 3.5702, + "step": 83815 + }, + { + "epoch": 5.695067264573991, + "grad_norm": 1.0556249618530273, + "learning_rate": 0.00028837138198124746, + "loss": 3.4608, + "step": 83820 + }, + { + "epoch": 5.695406984644653, + "grad_norm": 0.8964207172393799, + "learning_rate": 0.00028832891697241474, + "loss": 3.275, + "step": 83825 + }, + { + "epoch": 5.695746704715314, + "grad_norm": 1.1023313999176025, + "learning_rate": 0.000288286451963582, + "loss": 3.5213, + "step": 83830 + }, + { + "epoch": 5.696086424785976, + "grad_norm": 0.8793622255325317, + "learning_rate": 0.0002882439869547493, + "loss": 3.7469, + "step": 83835 + }, + { + "epoch": 5.6964261448566385, + "grad_norm": 0.9231306910514832, + "learning_rate": 0.0002882015219459166, + "loss": 3.3097, + "step": 83840 + }, + { + "epoch": 5.6967658649273, + "grad_norm": 1.1015775203704834, + "learning_rate": 0.0002881590569370838, + "loss": 3.3817, + "step": 83845 + }, + { + "epoch": 5.697105584997962, + "grad_norm": 0.8196941614151001, + "learning_rate": 0.00028811659192825114, + "loss": 3.436, + "step": 83850 + }, + { + "epoch": 5.697445305068624, + "grad_norm": 0.9774232506752014, + "learning_rate": 0.0002880741269194184, + "loss": 3.39, + "step": 83855 + }, + { + "epoch": 5.697785025139285, + "grad_norm": 1.0966999530792236, + "learning_rate": 0.00028803166191058565, + "loss": 3.417, + "step": 83860 + }, + { + "epoch": 5.698124745209947, + "grad_norm": 1.1334071159362793, + "learning_rate": 0.000287989196901753, + "loss": 3.3902, + "step": 83865 + }, + { + "epoch": 5.698464465280609, + "grad_norm": 0.953660249710083, + "learning_rate": 0.00028794673189292026, + "loss": 3.388, + "step": 83870 + }, + { + "epoch": 5.69880418535127, + "grad_norm": 0.9211793541908264, + "learning_rate": 0.0002879042668840875, + "loss": 3.5135, + "step": 83875 + }, + { + "epoch": 5.699143905421932, + "grad_norm": 1.0495069026947021, + "learning_rate": 0.00028786180187525477, + "loss": 3.5445, + "step": 83880 + }, + { + "epoch": 5.6994836254925945, + "grad_norm": 1.0265876054763794, + "learning_rate": 0.0002878193368664221, + "loss": 3.2873, + "step": 83885 + }, + { + "epoch": 5.699823345563256, + "grad_norm": 0.784022867679596, + "learning_rate": 0.00028777687185758933, + "loss": 3.3167, + "step": 83890 + }, + { + "epoch": 5.700163065633918, + "grad_norm": 1.3390392065048218, + "learning_rate": 0.0002877344068487566, + "loss": 3.3866, + "step": 83895 + }, + { + "epoch": 5.70050278570458, + "grad_norm": 0.9145408868789673, + "learning_rate": 0.00028769194183992394, + "loss": 3.3546, + "step": 83900 + }, + { + "epoch": 5.700842505775241, + "grad_norm": 0.9205167889595032, + "learning_rate": 0.00028764947683109117, + "loss": 3.4892, + "step": 83905 + }, + { + "epoch": 5.701182225845903, + "grad_norm": 1.324546456336975, + "learning_rate": 0.00028760701182225845, + "loss": 3.5786, + "step": 83910 + }, + { + "epoch": 5.701521945916565, + "grad_norm": 0.7949965596199036, + "learning_rate": 0.00028756454681342573, + "loss": 3.5556, + "step": 83915 + }, + { + "epoch": 5.701861665987226, + "grad_norm": 1.213868260383606, + "learning_rate": 0.000287522081804593, + "loss": 3.4114, + "step": 83920 + }, + { + "epoch": 5.702201386057888, + "grad_norm": 0.8094156384468079, + "learning_rate": 0.0002874796167957603, + "loss": 3.3168, + "step": 83925 + }, + { + "epoch": 5.70254110612855, + "grad_norm": 0.9252081513404846, + "learning_rate": 0.00028743715178692757, + "loss": 3.3603, + "step": 83930 + }, + { + "epoch": 5.702880826199212, + "grad_norm": 0.7807667851448059, + "learning_rate": 0.0002873946867780949, + "loss": 3.3351, + "step": 83935 + }, + { + "epoch": 5.703220546269874, + "grad_norm": 0.8328498005867004, + "learning_rate": 0.00028735222176926213, + "loss": 3.2183, + "step": 83940 + }, + { + "epoch": 5.703560266340535, + "grad_norm": 0.9249861836433411, + "learning_rate": 0.0002873097567604294, + "loss": 3.6401, + "step": 83945 + }, + { + "epoch": 5.703899986411197, + "grad_norm": 1.1098711490631104, + "learning_rate": 0.0002872672917515967, + "loss": 3.2397, + "step": 83950 + }, + { + "epoch": 5.704239706481859, + "grad_norm": 1.0246496200561523, + "learning_rate": 0.00028722482674276397, + "loss": 3.4206, + "step": 83955 + }, + { + "epoch": 5.70457942655252, + "grad_norm": 1.0861350297927856, + "learning_rate": 0.00028718236173393125, + "loss": 3.3089, + "step": 83960 + }, + { + "epoch": 5.704919146623182, + "grad_norm": 0.8780844807624817, + "learning_rate": 0.00028713989672509853, + "loss": 3.3392, + "step": 83965 + }, + { + "epoch": 5.705258866693844, + "grad_norm": 0.9942272305488586, + "learning_rate": 0.0002870974317162658, + "loss": 3.4557, + "step": 83970 + }, + { + "epoch": 5.705598586764506, + "grad_norm": 1.5698579549789429, + "learning_rate": 0.0002870549667074331, + "loss": 3.5614, + "step": 83975 + }, + { + "epoch": 5.705938306835168, + "grad_norm": 0.7918470501899719, + "learning_rate": 0.00028701250169860037, + "loss": 3.6235, + "step": 83980 + }, + { + "epoch": 5.70627802690583, + "grad_norm": 0.8190701603889465, + "learning_rate": 0.0002869700366897676, + "loss": 3.4348, + "step": 83985 + }, + { + "epoch": 5.706617746976491, + "grad_norm": 0.7898657321929932, + "learning_rate": 0.00028692757168093493, + "loss": 3.3262, + "step": 83990 + }, + { + "epoch": 5.706957467047153, + "grad_norm": 0.9372881650924683, + "learning_rate": 0.0002868851066721022, + "loss": 3.4982, + "step": 83995 + }, + { + "epoch": 5.707297187117815, + "grad_norm": 0.8472697138786316, + "learning_rate": 0.00028684264166326944, + "loss": 3.3549, + "step": 84000 + }, + { + "epoch": 5.707636907188476, + "grad_norm": 0.9722793698310852, + "learning_rate": 0.00028680017665443677, + "loss": 3.514, + "step": 84005 + }, + { + "epoch": 5.707976627259138, + "grad_norm": 0.8146842122077942, + "learning_rate": 0.00028675771164560405, + "loss": 3.5236, + "step": 84010 + }, + { + "epoch": 5.7083163473298, + "grad_norm": 0.9873725771903992, + "learning_rate": 0.0002867152466367713, + "loss": 3.2656, + "step": 84015 + }, + { + "epoch": 5.708656067400462, + "grad_norm": 0.8779329061508179, + "learning_rate": 0.00028667278162793856, + "loss": 3.4456, + "step": 84020 + }, + { + "epoch": 5.708995787471124, + "grad_norm": 0.8179624676704407, + "learning_rate": 0.0002866303166191059, + "loss": 3.2703, + "step": 84025 + }, + { + "epoch": 5.709335507541786, + "grad_norm": 1.0017788410186768, + "learning_rate": 0.0002865878516102731, + "loss": 3.2542, + "step": 84030 + }, + { + "epoch": 5.709675227612447, + "grad_norm": 0.8930956125259399, + "learning_rate": 0.0002865453866014404, + "loss": 3.4006, + "step": 84035 + }, + { + "epoch": 5.710014947683109, + "grad_norm": 0.9024474620819092, + "learning_rate": 0.00028650292159260773, + "loss": 3.5658, + "step": 84040 + }, + { + "epoch": 5.710354667753771, + "grad_norm": 0.9011237621307373, + "learning_rate": 0.00028646045658377496, + "loss": 3.5541, + "step": 84045 + }, + { + "epoch": 5.710694387824432, + "grad_norm": 0.8061602711677551, + "learning_rate": 0.00028641799157494224, + "loss": 3.7223, + "step": 84050 + }, + { + "epoch": 5.711034107895094, + "grad_norm": 0.9095404744148254, + "learning_rate": 0.00028637552656610957, + "loss": 3.5228, + "step": 84055 + }, + { + "epoch": 5.711373827965756, + "grad_norm": 0.8982224464416504, + "learning_rate": 0.0002863330615572768, + "loss": 3.4623, + "step": 84060 + }, + { + "epoch": 5.711713548036418, + "grad_norm": 1.2277770042419434, + "learning_rate": 0.0002862905965484441, + "loss": 3.3218, + "step": 84065 + }, + { + "epoch": 5.71205326810708, + "grad_norm": 1.017465353012085, + "learning_rate": 0.00028624813153961136, + "loss": 3.5488, + "step": 84070 + }, + { + "epoch": 5.712392988177742, + "grad_norm": 1.1421034336090088, + "learning_rate": 0.00028620566653077864, + "loss": 3.5783, + "step": 84075 + }, + { + "epoch": 5.712732708248403, + "grad_norm": 0.9392196536064148, + "learning_rate": 0.0002861632015219459, + "loss": 3.6789, + "step": 84080 + }, + { + "epoch": 5.713072428319065, + "grad_norm": 0.7899647951126099, + "learning_rate": 0.0002861207365131132, + "loss": 3.357, + "step": 84085 + }, + { + "epoch": 5.713412148389727, + "grad_norm": 0.9580755233764648, + "learning_rate": 0.0002860782715042805, + "loss": 3.2536, + "step": 84090 + }, + { + "epoch": 5.713751868460388, + "grad_norm": 0.7306355834007263, + "learning_rate": 0.00028603580649544776, + "loss": 3.4857, + "step": 84095 + }, + { + "epoch": 5.71409158853105, + "grad_norm": 0.7009839415550232, + "learning_rate": 0.00028599334148661504, + "loss": 3.3278, + "step": 84100 + }, + { + "epoch": 5.7144313086017124, + "grad_norm": 0.8571102023124695, + "learning_rate": 0.0002859508764777823, + "loss": 3.4634, + "step": 84105 + }, + { + "epoch": 5.714771028672374, + "grad_norm": 0.7968100905418396, + "learning_rate": 0.0002859084114689496, + "loss": 3.4157, + "step": 84110 + }, + { + "epoch": 5.715110748743036, + "grad_norm": 1.0608696937561035, + "learning_rate": 0.0002858659464601169, + "loss": 3.397, + "step": 84115 + }, + { + "epoch": 5.715450468813698, + "grad_norm": 0.9491149187088013, + "learning_rate": 0.00028582348145128416, + "loss": 3.5418, + "step": 84120 + }, + { + "epoch": 5.715790188884359, + "grad_norm": 1.1561461687088013, + "learning_rate": 0.00028578101644245144, + "loss": 3.4238, + "step": 84125 + }, + { + "epoch": 5.716129908955021, + "grad_norm": 1.0358370542526245, + "learning_rate": 0.0002857385514336187, + "loss": 3.2545, + "step": 84130 + }, + { + "epoch": 5.716469629025683, + "grad_norm": 0.8536635637283325, + "learning_rate": 0.000285696086424786, + "loss": 3.505, + "step": 84135 + }, + { + "epoch": 5.716809349096344, + "grad_norm": 0.835204005241394, + "learning_rate": 0.0002856536214159532, + "loss": 3.6877, + "step": 84140 + }, + { + "epoch": 5.717149069167006, + "grad_norm": 1.2492343187332153, + "learning_rate": 0.00028561115640712056, + "loss": 3.6294, + "step": 84145 + }, + { + "epoch": 5.7174887892376685, + "grad_norm": 0.6579792499542236, + "learning_rate": 0.00028556869139828784, + "loss": 3.4733, + "step": 84150 + }, + { + "epoch": 5.71782850930833, + "grad_norm": 0.9526249766349792, + "learning_rate": 0.00028552622638945506, + "loss": 3.7688, + "step": 84155 + }, + { + "epoch": 5.718168229378992, + "grad_norm": 1.0695613622665405, + "learning_rate": 0.0002854837613806224, + "loss": 3.4056, + "step": 84160 + }, + { + "epoch": 5.718507949449654, + "grad_norm": 1.0571779012680054, + "learning_rate": 0.0002854412963717897, + "loss": 3.5737, + "step": 84165 + }, + { + "epoch": 5.718847669520315, + "grad_norm": 0.8048433661460876, + "learning_rate": 0.0002853988313629569, + "loss": 3.6341, + "step": 84170 + }, + { + "epoch": 5.719187389590977, + "grad_norm": 1.2929494380950928, + "learning_rate": 0.0002853563663541242, + "loss": 3.1933, + "step": 84175 + }, + { + "epoch": 5.719527109661639, + "grad_norm": 0.9100107550621033, + "learning_rate": 0.0002853139013452915, + "loss": 3.2724, + "step": 84180 + }, + { + "epoch": 5.7198668297323, + "grad_norm": 0.778846800327301, + "learning_rate": 0.00028527143633645875, + "loss": 3.548, + "step": 84185 + }, + { + "epoch": 5.720206549802962, + "grad_norm": 0.7225214838981628, + "learning_rate": 0.000285228971327626, + "loss": 3.2935, + "step": 84190 + }, + { + "epoch": 5.7205462698736245, + "grad_norm": 0.9138268828392029, + "learning_rate": 0.00028518650631879336, + "loss": 3.3533, + "step": 84195 + }, + { + "epoch": 5.720885989944286, + "grad_norm": 1.3907766342163086, + "learning_rate": 0.0002851440413099606, + "loss": 3.3998, + "step": 84200 + }, + { + "epoch": 5.721225710014948, + "grad_norm": 0.8027796149253845, + "learning_rate": 0.00028510157630112787, + "loss": 3.4134, + "step": 84205 + }, + { + "epoch": 5.72156543008561, + "grad_norm": 0.8229761123657227, + "learning_rate": 0.00028505911129229515, + "loss": 3.5, + "step": 84210 + }, + { + "epoch": 5.721905150156271, + "grad_norm": 0.8670302629470825, + "learning_rate": 0.0002850166462834624, + "loss": 3.5143, + "step": 84215 + }, + { + "epoch": 5.722244870226933, + "grad_norm": 1.0904825925827026, + "learning_rate": 0.0002849741812746297, + "loss": 3.577, + "step": 84220 + }, + { + "epoch": 5.722584590297595, + "grad_norm": 0.7785070538520813, + "learning_rate": 0.000284931716265797, + "loss": 3.3959, + "step": 84225 + }, + { + "epoch": 5.722924310368256, + "grad_norm": 0.7457568645477295, + "learning_rate": 0.00028488925125696427, + "loss": 3.3767, + "step": 84230 + }, + { + "epoch": 5.723264030438918, + "grad_norm": 0.7534506916999817, + "learning_rate": 0.00028484678624813155, + "loss": 3.2443, + "step": 84235 + }, + { + "epoch": 5.7236037505095805, + "grad_norm": 0.7830812335014343, + "learning_rate": 0.0002848043212392988, + "loss": 3.5741, + "step": 84240 + }, + { + "epoch": 5.723943470580242, + "grad_norm": 0.9062910676002502, + "learning_rate": 0.00028476185623046605, + "loss": 3.4315, + "step": 84245 + }, + { + "epoch": 5.724283190650904, + "grad_norm": 0.9664648175239563, + "learning_rate": 0.0002847193912216334, + "loss": 3.4936, + "step": 84250 + }, + { + "epoch": 5.724622910721566, + "grad_norm": 1.032411813735962, + "learning_rate": 0.00028467692621280067, + "loss": 3.2719, + "step": 84255 + }, + { + "epoch": 5.724962630792227, + "grad_norm": 0.8423478007316589, + "learning_rate": 0.0002846344612039679, + "loss": 3.0675, + "step": 84260 + }, + { + "epoch": 5.725302350862889, + "grad_norm": 2.505166530609131, + "learning_rate": 0.0002845919961951352, + "loss": 3.5142, + "step": 84265 + }, + { + "epoch": 5.725642070933551, + "grad_norm": 0.8692846894264221, + "learning_rate": 0.0002845495311863025, + "loss": 3.4742, + "step": 84270 + }, + { + "epoch": 5.725981791004212, + "grad_norm": 0.822528064250946, + "learning_rate": 0.0002845070661774698, + "loss": 3.5366, + "step": 84275 + }, + { + "epoch": 5.726321511074874, + "grad_norm": 0.9878360629081726, + "learning_rate": 0.000284464601168637, + "loss": 3.429, + "step": 84280 + }, + { + "epoch": 5.7266612311455365, + "grad_norm": 0.8152016401290894, + "learning_rate": 0.00028442213615980435, + "loss": 3.1965, + "step": 84285 + }, + { + "epoch": 5.727000951216198, + "grad_norm": 0.710882842540741, + "learning_rate": 0.0002843796711509716, + "loss": 3.338, + "step": 84290 + }, + { + "epoch": 5.72734067128686, + "grad_norm": 0.8270421624183655, + "learning_rate": 0.00028433720614213885, + "loss": 3.1344, + "step": 84295 + }, + { + "epoch": 5.727680391357522, + "grad_norm": 0.7645787596702576, + "learning_rate": 0.0002842947411333062, + "loss": 3.4459, + "step": 84300 + }, + { + "epoch": 5.728020111428183, + "grad_norm": 0.7773895859718323, + "learning_rate": 0.00028425227612447347, + "loss": 3.4657, + "step": 84305 + }, + { + "epoch": 5.728359831498845, + "grad_norm": 1.1334983110427856, + "learning_rate": 0.0002842098111156407, + "loss": 3.4099, + "step": 84310 + }, + { + "epoch": 5.728699551569507, + "grad_norm": 1.126188039779663, + "learning_rate": 0.00028416734610680803, + "loss": 3.4231, + "step": 84315 + }, + { + "epoch": 5.729039271640168, + "grad_norm": 1.0259454250335693, + "learning_rate": 0.0002841248810979753, + "loss": 3.3725, + "step": 84320 + }, + { + "epoch": 5.72937899171083, + "grad_norm": 0.9307862520217896, + "learning_rate": 0.00028408241608914253, + "loss": 3.1687, + "step": 84325 + }, + { + "epoch": 5.7297187117814925, + "grad_norm": 1.0933241844177246, + "learning_rate": 0.0002840399510803098, + "loss": 3.3471, + "step": 84330 + }, + { + "epoch": 5.730058431852154, + "grad_norm": 0.9303666353225708, + "learning_rate": 0.00028399748607147715, + "loss": 3.2778, + "step": 84335 + }, + { + "epoch": 5.730398151922816, + "grad_norm": 0.8520998954772949, + "learning_rate": 0.0002839550210626444, + "loss": 3.3819, + "step": 84340 + }, + { + "epoch": 5.730737871993478, + "grad_norm": 1.3583306074142456, + "learning_rate": 0.00028391255605381165, + "loss": 3.5571, + "step": 84345 + }, + { + "epoch": 5.731077592064139, + "grad_norm": 0.9360685348510742, + "learning_rate": 0.000283870091044979, + "loss": 3.2983, + "step": 84350 + }, + { + "epoch": 5.731417312134801, + "grad_norm": 0.8228017091751099, + "learning_rate": 0.0002838276260361462, + "loss": 3.5527, + "step": 84355 + }, + { + "epoch": 5.731757032205463, + "grad_norm": 0.8810756802558899, + "learning_rate": 0.0002837851610273135, + "loss": 3.2763, + "step": 84360 + }, + { + "epoch": 5.732096752276124, + "grad_norm": 1.084437370300293, + "learning_rate": 0.0002837426960184808, + "loss": 3.4021, + "step": 84365 + }, + { + "epoch": 5.732436472346786, + "grad_norm": 0.9288226366043091, + "learning_rate": 0.00028370023100964805, + "loss": 3.3483, + "step": 84370 + }, + { + "epoch": 5.7327761924174485, + "grad_norm": 0.8223653435707092, + "learning_rate": 0.00028365776600081533, + "loss": 3.2707, + "step": 84375 + }, + { + "epoch": 5.73311591248811, + "grad_norm": 0.9286640286445618, + "learning_rate": 0.0002836153009919826, + "loss": 3.3724, + "step": 84380 + }, + { + "epoch": 5.733455632558772, + "grad_norm": 0.8790549039840698, + "learning_rate": 0.0002835728359831499, + "loss": 3.293, + "step": 84385 + }, + { + "epoch": 5.733795352629433, + "grad_norm": 0.9599015116691589, + "learning_rate": 0.0002835303709743172, + "loss": 3.5252, + "step": 84390 + }, + { + "epoch": 5.734135072700095, + "grad_norm": 1.0945180654525757, + "learning_rate": 0.00028348790596548445, + "loss": 3.8197, + "step": 84395 + }, + { + "epoch": 5.734474792770757, + "grad_norm": 0.8682452440261841, + "learning_rate": 0.0002834454409566517, + "loss": 3.3548, + "step": 84400 + }, + { + "epoch": 5.734814512841418, + "grad_norm": 0.9676696062088013, + "learning_rate": 0.000283402975947819, + "loss": 3.1914, + "step": 84405 + }, + { + "epoch": 5.73515423291208, + "grad_norm": 0.7740125060081482, + "learning_rate": 0.0002833605109389863, + "loss": 3.6758, + "step": 84410 + }, + { + "epoch": 5.7354939529827424, + "grad_norm": 0.987575113773346, + "learning_rate": 0.0002833180459301535, + "loss": 3.4431, + "step": 84415 + }, + { + "epoch": 5.735833673053404, + "grad_norm": 0.9242709279060364, + "learning_rate": 0.00028327558092132085, + "loss": 3.5015, + "step": 84420 + }, + { + "epoch": 5.736173393124066, + "grad_norm": 0.8475847244262695, + "learning_rate": 0.00028323311591248813, + "loss": 3.5808, + "step": 84425 + }, + { + "epoch": 5.736513113194728, + "grad_norm": 0.8192474246025085, + "learning_rate": 0.00028319065090365536, + "loss": 3.4186, + "step": 84430 + }, + { + "epoch": 5.736852833265389, + "grad_norm": 1.0076948404312134, + "learning_rate": 0.00028314818589482264, + "loss": 3.3856, + "step": 84435 + }, + { + "epoch": 5.737192553336051, + "grad_norm": 0.7039334177970886, + "learning_rate": 0.00028310572088599, + "loss": 3.4993, + "step": 84440 + }, + { + "epoch": 5.737532273406713, + "grad_norm": 0.9806506037712097, + "learning_rate": 0.00028306325587715725, + "loss": 3.1446, + "step": 84445 + }, + { + "epoch": 5.737871993477374, + "grad_norm": 0.9400902986526489, + "learning_rate": 0.0002830207908683245, + "loss": 3.6152, + "step": 84450 + }, + { + "epoch": 5.738211713548036, + "grad_norm": 0.7554712891578674, + "learning_rate": 0.0002829783258594918, + "loss": 3.3467, + "step": 84455 + }, + { + "epoch": 5.7385514336186985, + "grad_norm": 0.7873030304908752, + "learning_rate": 0.0002829358608506591, + "loss": 3.3276, + "step": 84460 + }, + { + "epoch": 5.73889115368936, + "grad_norm": 1.0474143028259277, + "learning_rate": 0.0002828933958418263, + "loss": 3.4904, + "step": 84465 + }, + { + "epoch": 5.739230873760022, + "grad_norm": 1.0394130945205688, + "learning_rate": 0.0002828509308329936, + "loss": 3.6153, + "step": 84470 + }, + { + "epoch": 5.739570593830684, + "grad_norm": 0.7784074544906616, + "learning_rate": 0.00028280846582416094, + "loss": 3.5683, + "step": 84475 + }, + { + "epoch": 5.739910313901345, + "grad_norm": 1.381017804145813, + "learning_rate": 0.00028276600081532816, + "loss": 3.4012, + "step": 84480 + }, + { + "epoch": 5.740250033972007, + "grad_norm": 0.8107795119285583, + "learning_rate": 0.00028272353580649544, + "loss": 3.3843, + "step": 84485 + }, + { + "epoch": 5.740589754042669, + "grad_norm": 1.0040910243988037, + "learning_rate": 0.0002826810707976628, + "loss": 3.6437, + "step": 84490 + }, + { + "epoch": 5.74092947411333, + "grad_norm": 0.6511548757553101, + "learning_rate": 0.00028263860578883, + "loss": 3.54, + "step": 84495 + }, + { + "epoch": 5.741269194183992, + "grad_norm": 1.0013830661773682, + "learning_rate": 0.0002825961407799973, + "loss": 3.3924, + "step": 84500 + }, + { + "epoch": 5.7416089142546545, + "grad_norm": 0.8224095702171326, + "learning_rate": 0.00028255367577116456, + "loss": 3.0987, + "step": 84505 + }, + { + "epoch": 5.741948634325316, + "grad_norm": 0.9141286611557007, + "learning_rate": 0.00028251121076233184, + "loss": 3.1612, + "step": 84510 + }, + { + "epoch": 5.742288354395978, + "grad_norm": 0.8334749341011047, + "learning_rate": 0.0002824687457534991, + "loss": 3.4992, + "step": 84515 + }, + { + "epoch": 5.74262807446664, + "grad_norm": 0.8866061568260193, + "learning_rate": 0.0002824262807446664, + "loss": 3.5478, + "step": 84520 + }, + { + "epoch": 5.742967794537301, + "grad_norm": 1.1317301988601685, + "learning_rate": 0.0002823838157358337, + "loss": 3.5257, + "step": 84525 + }, + { + "epoch": 5.743307514607963, + "grad_norm": 0.6831353306770325, + "learning_rate": 0.00028234135072700096, + "loss": 3.532, + "step": 84530 + }, + { + "epoch": 5.743647234678625, + "grad_norm": 1.019716739654541, + "learning_rate": 0.00028229888571816824, + "loss": 3.4412, + "step": 84535 + }, + { + "epoch": 5.743986954749286, + "grad_norm": 0.8241785168647766, + "learning_rate": 0.00028225642070933547, + "loss": 3.5818, + "step": 84540 + }, + { + "epoch": 5.744326674819948, + "grad_norm": 1.0471223592758179, + "learning_rate": 0.0002822139557005028, + "loss": 3.4987, + "step": 84545 + }, + { + "epoch": 5.7446663948906105, + "grad_norm": 0.9474467039108276, + "learning_rate": 0.0002821714906916701, + "loss": 3.4042, + "step": 84550 + }, + { + "epoch": 5.745006114961272, + "grad_norm": 1.12886643409729, + "learning_rate": 0.0002821290256828373, + "loss": 3.2678, + "step": 84555 + }, + { + "epoch": 5.745345835031934, + "grad_norm": 0.7743617296218872, + "learning_rate": 0.00028208656067400464, + "loss": 3.4937, + "step": 84560 + }, + { + "epoch": 5.745685555102596, + "grad_norm": 1.1200528144836426, + "learning_rate": 0.0002820440956651719, + "loss": 3.379, + "step": 84565 + }, + { + "epoch": 5.746025275173257, + "grad_norm": 0.886712908744812, + "learning_rate": 0.00028200163065633915, + "loss": 3.5364, + "step": 84570 + }, + { + "epoch": 5.746364995243919, + "grad_norm": 1.003408432006836, + "learning_rate": 0.00028195916564750643, + "loss": 3.5456, + "step": 84575 + }, + { + "epoch": 5.746704715314581, + "grad_norm": 0.9175798892974854, + "learning_rate": 0.00028191670063867376, + "loss": 3.2701, + "step": 84580 + }, + { + "epoch": 5.747044435385242, + "grad_norm": 0.9334120750427246, + "learning_rate": 0.000281874235629841, + "loss": 3.527, + "step": 84585 + }, + { + "epoch": 5.747384155455904, + "grad_norm": 0.9059342741966248, + "learning_rate": 0.00028183177062100827, + "loss": 3.6673, + "step": 84590 + }, + { + "epoch": 5.7477238755265665, + "grad_norm": 0.9571492671966553, + "learning_rate": 0.0002817893056121756, + "loss": 3.3541, + "step": 84595 + }, + { + "epoch": 5.748063595597228, + "grad_norm": 0.8485720753669739, + "learning_rate": 0.00028174684060334283, + "loss": 3.1857, + "step": 84600 + }, + { + "epoch": 5.74840331566789, + "grad_norm": 0.8107588291168213, + "learning_rate": 0.0002817043755945101, + "loss": 3.583, + "step": 84605 + }, + { + "epoch": 5.748743035738551, + "grad_norm": 1.0185202360153198, + "learning_rate": 0.00028166191058567744, + "loss": 3.4507, + "step": 84610 + }, + { + "epoch": 5.749082755809213, + "grad_norm": 0.8092616200447083, + "learning_rate": 0.0002816194455768447, + "loss": 3.3184, + "step": 84615 + }, + { + "epoch": 5.749422475879875, + "grad_norm": 0.8934595584869385, + "learning_rate": 0.00028157698056801195, + "loss": 3.3966, + "step": 84620 + }, + { + "epoch": 5.749762195950536, + "grad_norm": 0.7827594876289368, + "learning_rate": 0.00028153451555917923, + "loss": 3.4473, + "step": 84625 + }, + { + "epoch": 5.750101916021198, + "grad_norm": 1.0654170513153076, + "learning_rate": 0.00028149205055034656, + "loss": 3.3413, + "step": 84630 + }, + { + "epoch": 5.75044163609186, + "grad_norm": 0.8667079210281372, + "learning_rate": 0.0002814495855415138, + "loss": 3.3158, + "step": 84635 + }, + { + "epoch": 5.750781356162522, + "grad_norm": 0.7611660361289978, + "learning_rate": 0.00028140712053268107, + "loss": 3.3284, + "step": 84640 + }, + { + "epoch": 5.751121076233184, + "grad_norm": 0.675838053226471, + "learning_rate": 0.0002813646555238484, + "loss": 3.3424, + "step": 84645 + }, + { + "epoch": 5.751460796303846, + "grad_norm": 0.7194709777832031, + "learning_rate": 0.00028132219051501563, + "loss": 3.3273, + "step": 84650 + }, + { + "epoch": 5.751800516374507, + "grad_norm": 0.8566758632659912, + "learning_rate": 0.0002812797255061829, + "loss": 3.4183, + "step": 84655 + }, + { + "epoch": 5.752140236445169, + "grad_norm": 0.7404763698577881, + "learning_rate": 0.0002812372604973502, + "loss": 3.6421, + "step": 84660 + }, + { + "epoch": 5.752479956515831, + "grad_norm": 1.1754703521728516, + "learning_rate": 0.00028119479548851747, + "loss": 3.3586, + "step": 84665 + }, + { + "epoch": 5.752819676586492, + "grad_norm": 0.8850026726722717, + "learning_rate": 0.00028115233047968475, + "loss": 3.477, + "step": 84670 + }, + { + "epoch": 5.753159396657154, + "grad_norm": 1.0192652940750122, + "learning_rate": 0.00028110986547085203, + "loss": 3.356, + "step": 84675 + }, + { + "epoch": 5.753499116727816, + "grad_norm": 0.9972853064537048, + "learning_rate": 0.0002810674004620193, + "loss": 3.5749, + "step": 84680 + }, + { + "epoch": 5.753838836798478, + "grad_norm": 0.8074849843978882, + "learning_rate": 0.0002810249354531866, + "loss": 3.2701, + "step": 84685 + }, + { + "epoch": 5.75417855686914, + "grad_norm": 1.0023555755615234, + "learning_rate": 0.00028098247044435387, + "loss": 3.5231, + "step": 84690 + }, + { + "epoch": 5.754518276939802, + "grad_norm": 1.3214921951293945, + "learning_rate": 0.0002809400054355211, + "loss": 3.5521, + "step": 84695 + }, + { + "epoch": 5.754857997010463, + "grad_norm": 1.0308172702789307, + "learning_rate": 0.00028089754042668843, + "loss": 3.2048, + "step": 84700 + }, + { + "epoch": 5.755197717081125, + "grad_norm": 1.0497404336929321, + "learning_rate": 0.0002808550754178557, + "loss": 3.3417, + "step": 84705 + }, + { + "epoch": 5.755537437151787, + "grad_norm": 0.9844977855682373, + "learning_rate": 0.00028081261040902294, + "loss": 3.2804, + "step": 84710 + }, + { + "epoch": 5.755877157222448, + "grad_norm": 1.122767686843872, + "learning_rate": 0.00028077014540019027, + "loss": 3.0089, + "step": 84715 + }, + { + "epoch": 5.75621687729311, + "grad_norm": 0.9535915851593018, + "learning_rate": 0.00028072768039135755, + "loss": 3.3589, + "step": 84720 + }, + { + "epoch": 5.7565565973637725, + "grad_norm": 0.9867868423461914, + "learning_rate": 0.0002806852153825248, + "loss": 3.3441, + "step": 84725 + }, + { + "epoch": 5.756896317434434, + "grad_norm": 0.9792026281356812, + "learning_rate": 0.00028064275037369206, + "loss": 3.2376, + "step": 84730 + }, + { + "epoch": 5.757236037505096, + "grad_norm": 0.820782482624054, + "learning_rate": 0.0002806002853648594, + "loss": 3.2819, + "step": 84735 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 1.0022656917572021, + "learning_rate": 0.0002805578203560266, + "loss": 3.4269, + "step": 84740 + }, + { + "epoch": 5.757915477646419, + "grad_norm": 1.4341785907745361, + "learning_rate": 0.0002805153553471939, + "loss": 3.3146, + "step": 84745 + }, + { + "epoch": 5.758255197717081, + "grad_norm": 0.7614864110946655, + "learning_rate": 0.00028047289033836123, + "loss": 3.4474, + "step": 84750 + }, + { + "epoch": 5.758594917787743, + "grad_norm": 0.9292521476745605, + "learning_rate": 0.00028043042532952846, + "loss": 3.6708, + "step": 84755 + }, + { + "epoch": 5.758934637858404, + "grad_norm": 1.0267651081085205, + "learning_rate": 0.00028038796032069574, + "loss": 3.3376, + "step": 84760 + }, + { + "epoch": 5.759274357929066, + "grad_norm": 0.9289642572402954, + "learning_rate": 0.000280345495311863, + "loss": 3.752, + "step": 84765 + }, + { + "epoch": 5.7596140779997285, + "grad_norm": 1.022080898284912, + "learning_rate": 0.0002803030303030303, + "loss": 3.3865, + "step": 84770 + }, + { + "epoch": 5.75995379807039, + "grad_norm": 1.18377685546875, + "learning_rate": 0.0002802605652941976, + "loss": 3.3742, + "step": 84775 + }, + { + "epoch": 5.760293518141052, + "grad_norm": 0.9211857318878174, + "learning_rate": 0.00028021810028536486, + "loss": 3.65, + "step": 84780 + }, + { + "epoch": 5.760633238211714, + "grad_norm": 0.9545742869377136, + "learning_rate": 0.0002801756352765322, + "loss": 3.4602, + "step": 84785 + }, + { + "epoch": 5.760972958282375, + "grad_norm": 0.9244253635406494, + "learning_rate": 0.0002801331702676994, + "loss": 3.3587, + "step": 84790 + }, + { + "epoch": 5.761312678353037, + "grad_norm": 0.9159384369850159, + "learning_rate": 0.0002800907052588667, + "loss": 3.486, + "step": 84795 + }, + { + "epoch": 5.761652398423699, + "grad_norm": 0.9811659455299377, + "learning_rate": 0.000280048240250034, + "loss": 3.3703, + "step": 84800 + }, + { + "epoch": 5.76199211849436, + "grad_norm": 0.8544828295707703, + "learning_rate": 0.00028000577524120126, + "loss": 2.9801, + "step": 84805 + }, + { + "epoch": 5.762331838565022, + "grad_norm": 0.8166537880897522, + "learning_rate": 0.00027996331023236854, + "loss": 3.3157, + "step": 84810 + }, + { + "epoch": 5.7626715586356845, + "grad_norm": 0.8933308720588684, + "learning_rate": 0.0002799208452235358, + "loss": 3.6497, + "step": 84815 + }, + { + "epoch": 5.763011278706346, + "grad_norm": 0.8971887826919556, + "learning_rate": 0.0002798783802147031, + "loss": 3.6176, + "step": 84820 + }, + { + "epoch": 5.763350998777008, + "grad_norm": 0.8819124698638916, + "learning_rate": 0.0002798359152058704, + "loss": 3.4074, + "step": 84825 + }, + { + "epoch": 5.76369071884767, + "grad_norm": 1.1164460182189941, + "learning_rate": 0.00027979345019703766, + "loss": 3.5276, + "step": 84830 + }, + { + "epoch": 5.764030438918331, + "grad_norm": 0.8393356800079346, + "learning_rate": 0.0002797509851882049, + "loss": 3.4456, + "step": 84835 + }, + { + "epoch": 5.764370158988993, + "grad_norm": 0.8412635922431946, + "learning_rate": 0.0002797085201793722, + "loss": 3.2694, + "step": 84840 + }, + { + "epoch": 5.764709879059655, + "grad_norm": 1.0012216567993164, + "learning_rate": 0.0002796660551705395, + "loss": 3.4542, + "step": 84845 + }, + { + "epoch": 5.765049599130316, + "grad_norm": 0.8667099475860596, + "learning_rate": 0.0002796235901617067, + "loss": 3.1774, + "step": 84850 + }, + { + "epoch": 5.765389319200978, + "grad_norm": 0.9090390801429749, + "learning_rate": 0.00027958112515287406, + "loss": 3.5171, + "step": 84855 + }, + { + "epoch": 5.7657290392716405, + "grad_norm": 0.8180267214775085, + "learning_rate": 0.00027953866014404134, + "loss": 3.4268, + "step": 84860 + }, + { + "epoch": 5.766068759342302, + "grad_norm": 0.7228946685791016, + "learning_rate": 0.00027949619513520856, + "loss": 3.4299, + "step": 84865 + }, + { + "epoch": 5.766408479412964, + "grad_norm": 0.7036243677139282, + "learning_rate": 0.00027945373012637584, + "loss": 3.4997, + "step": 84870 + }, + { + "epoch": 5.766748199483626, + "grad_norm": 0.9595456719398499, + "learning_rate": 0.0002794112651175432, + "loss": 3.5157, + "step": 84875 + }, + { + "epoch": 5.767087919554287, + "grad_norm": 0.8089964389801025, + "learning_rate": 0.0002793688001087104, + "loss": 3.3745, + "step": 84880 + }, + { + "epoch": 5.767427639624949, + "grad_norm": 1.8148776292800903, + "learning_rate": 0.0002793263350998777, + "loss": 3.3066, + "step": 84885 + }, + { + "epoch": 5.767767359695611, + "grad_norm": 0.9661250710487366, + "learning_rate": 0.000279283870091045, + "loss": 3.5721, + "step": 84890 + }, + { + "epoch": 5.768107079766272, + "grad_norm": 0.8969875574111938, + "learning_rate": 0.00027924140508221225, + "loss": 3.4119, + "step": 84895 + }, + { + "epoch": 5.768446799836934, + "grad_norm": 0.7666468024253845, + "learning_rate": 0.0002791989400733795, + "loss": 3.5905, + "step": 84900 + }, + { + "epoch": 5.7687865199075965, + "grad_norm": 0.8262254595756531, + "learning_rate": 0.00027915647506454686, + "loss": 3.6349, + "step": 84905 + }, + { + "epoch": 5.769126239978258, + "grad_norm": 0.768191397190094, + "learning_rate": 0.0002791140100557141, + "loss": 3.5335, + "step": 84910 + }, + { + "epoch": 5.76946596004892, + "grad_norm": 1.1899518966674805, + "learning_rate": 0.00027907154504688137, + "loss": 3.4166, + "step": 84915 + }, + { + "epoch": 5.769805680119582, + "grad_norm": 0.838577151298523, + "learning_rate": 0.00027902908003804865, + "loss": 3.3546, + "step": 84920 + }, + { + "epoch": 5.770145400190243, + "grad_norm": 1.1114656925201416, + "learning_rate": 0.0002789866150292159, + "loss": 3.6022, + "step": 84925 + }, + { + "epoch": 5.770485120260905, + "grad_norm": 0.773032009601593, + "learning_rate": 0.0002789441500203832, + "loss": 3.5781, + "step": 84930 + }, + { + "epoch": 5.770824840331567, + "grad_norm": 0.8706827759742737, + "learning_rate": 0.0002789016850115505, + "loss": 3.6457, + "step": 84935 + }, + { + "epoch": 5.771164560402228, + "grad_norm": 0.8521878719329834, + "learning_rate": 0.00027885922000271777, + "loss": 3.4727, + "step": 84940 + }, + { + "epoch": 5.77150428047289, + "grad_norm": 0.779400110244751, + "learning_rate": 0.00027881675499388505, + "loss": 3.4204, + "step": 84945 + }, + { + "epoch": 5.7718440005435525, + "grad_norm": 0.9508264660835266, + "learning_rate": 0.0002787742899850523, + "loss": 3.4524, + "step": 84950 + }, + { + "epoch": 5.772183720614214, + "grad_norm": 0.7061075568199158, + "learning_rate": 0.0002787318249762196, + "loss": 3.5179, + "step": 84955 + }, + { + "epoch": 5.772523440684876, + "grad_norm": 0.7639891505241394, + "learning_rate": 0.0002786893599673869, + "loss": 3.4342, + "step": 84960 + }, + { + "epoch": 5.772863160755538, + "grad_norm": 0.8772944808006287, + "learning_rate": 0.00027864689495855417, + "loss": 3.3587, + "step": 84965 + }, + { + "epoch": 5.773202880826199, + "grad_norm": 0.7489306330680847, + "learning_rate": 0.00027860442994972145, + "loss": 3.4108, + "step": 84970 + }, + { + "epoch": 5.773542600896861, + "grad_norm": 0.945115327835083, + "learning_rate": 0.0002785619649408887, + "loss": 3.387, + "step": 84975 + }, + { + "epoch": 5.773882320967523, + "grad_norm": 0.9723930358886719, + "learning_rate": 0.000278519499932056, + "loss": 3.3204, + "step": 84980 + }, + { + "epoch": 5.774222041038184, + "grad_norm": 0.7629793882369995, + "learning_rate": 0.0002784770349232233, + "loss": 3.2069, + "step": 84985 + }, + { + "epoch": 5.7745617611088464, + "grad_norm": 0.9073855876922607, + "learning_rate": 0.0002784345699143905, + "loss": 3.6164, + "step": 84990 + }, + { + "epoch": 5.7749014811795085, + "grad_norm": 1.022373914718628, + "learning_rate": 0.00027839210490555785, + "loss": 3.5023, + "step": 84995 + }, + { + "epoch": 5.77524120125017, + "grad_norm": 0.6948070526123047, + "learning_rate": 0.0002783496398967251, + "loss": 3.4319, + "step": 85000 + }, + { + "epoch": 5.775580921320832, + "grad_norm": 0.803390622138977, + "learning_rate": 0.00027830717488789235, + "loss": 3.2668, + "step": 85005 + }, + { + "epoch": 5.775920641391494, + "grad_norm": 0.9695560932159424, + "learning_rate": 0.0002782647098790597, + "loss": 3.4588, + "step": 85010 + }, + { + "epoch": 5.776260361462155, + "grad_norm": 0.7705345153808594, + "learning_rate": 0.00027822224487022697, + "loss": 3.6114, + "step": 85015 + }, + { + "epoch": 5.776600081532817, + "grad_norm": 0.8456597328186035, + "learning_rate": 0.0002781797798613942, + "loss": 3.445, + "step": 85020 + }, + { + "epoch": 5.776939801603479, + "grad_norm": 0.8580629229545593, + "learning_rate": 0.00027813731485256147, + "loss": 3.3209, + "step": 85025 + }, + { + "epoch": 5.77727952167414, + "grad_norm": 0.7933374047279358, + "learning_rate": 0.0002780948498437288, + "loss": 3.4385, + "step": 85030 + }, + { + "epoch": 5.7776192417448025, + "grad_norm": 0.8703150749206543, + "learning_rate": 0.00027805238483489603, + "loss": 3.5549, + "step": 85035 + }, + { + "epoch": 5.7779589618154645, + "grad_norm": 1.0002001523971558, + "learning_rate": 0.0002780099198260633, + "loss": 3.3039, + "step": 85040 + }, + { + "epoch": 5.778298681886126, + "grad_norm": 0.777171790599823, + "learning_rate": 0.00027796745481723065, + "loss": 3.8013, + "step": 85045 + }, + { + "epoch": 5.778638401956788, + "grad_norm": 1.0654836893081665, + "learning_rate": 0.0002779249898083979, + "loss": 3.2762, + "step": 85050 + }, + { + "epoch": 5.77897812202745, + "grad_norm": 0.7646058201789856, + "learning_rate": 0.00027788252479956515, + "loss": 3.4802, + "step": 85055 + }, + { + "epoch": 5.779317842098111, + "grad_norm": 1.1060969829559326, + "learning_rate": 0.00027784005979073243, + "loss": 3.3288, + "step": 85060 + }, + { + "epoch": 5.779657562168773, + "grad_norm": 0.843159019947052, + "learning_rate": 0.0002777975947818997, + "loss": 3.6004, + "step": 85065 + }, + { + "epoch": 5.779997282239434, + "grad_norm": 0.8048436641693115, + "learning_rate": 0.000277755129773067, + "loss": 3.2696, + "step": 85070 + }, + { + "epoch": 5.780337002310096, + "grad_norm": 1.1550689935684204, + "learning_rate": 0.0002777126647642343, + "loss": 3.3093, + "step": 85075 + }, + { + "epoch": 5.7806767223807585, + "grad_norm": 0.8909990787506104, + "learning_rate": 0.00027767019975540155, + "loss": 3.4689, + "step": 85080 + }, + { + "epoch": 5.78101644245142, + "grad_norm": 0.8575339913368225, + "learning_rate": 0.00027762773474656883, + "loss": 3.4054, + "step": 85085 + }, + { + "epoch": 5.781356162522082, + "grad_norm": 0.8623911142349243, + "learning_rate": 0.0002775852697377361, + "loss": 3.4541, + "step": 85090 + }, + { + "epoch": 5.781695882592744, + "grad_norm": 0.8108147382736206, + "learning_rate": 0.00027754280472890334, + "loss": 3.5366, + "step": 85095 + }, + { + "epoch": 5.782035602663405, + "grad_norm": 0.9422404766082764, + "learning_rate": 0.0002775003397200707, + "loss": 3.2841, + "step": 85100 + }, + { + "epoch": 5.782375322734067, + "grad_norm": 1.1559394598007202, + "learning_rate": 0.00027745787471123795, + "loss": 3.1353, + "step": 85105 + }, + { + "epoch": 5.782715042804729, + "grad_norm": 0.7180761694908142, + "learning_rate": 0.0002774154097024052, + "loss": 3.4899, + "step": 85110 + }, + { + "epoch": 5.78305476287539, + "grad_norm": 0.839855432510376, + "learning_rate": 0.0002773729446935725, + "loss": 3.4894, + "step": 85115 + }, + { + "epoch": 5.783394482946052, + "grad_norm": 1.0017191171646118, + "learning_rate": 0.0002773304796847398, + "loss": 3.1628, + "step": 85120 + }, + { + "epoch": 5.7837342030167145, + "grad_norm": 0.8165546655654907, + "learning_rate": 0.0002772880146759071, + "loss": 3.4312, + "step": 85125 + }, + { + "epoch": 5.784073923087376, + "grad_norm": 0.9815808534622192, + "learning_rate": 0.0002772455496670743, + "loss": 3.2882, + "step": 85130 + }, + { + "epoch": 5.784413643158038, + "grad_norm": 0.810731053352356, + "learning_rate": 0.00027720308465824163, + "loss": 3.6855, + "step": 85135 + }, + { + "epoch": 5.7847533632287, + "grad_norm": 0.7853988409042358, + "learning_rate": 0.0002771606196494089, + "loss": 3.337, + "step": 85140 + }, + { + "epoch": 5.785093083299361, + "grad_norm": 0.8760092258453369, + "learning_rate": 0.00027711815464057614, + "loss": 3.5156, + "step": 85145 + }, + { + "epoch": 5.785432803370023, + "grad_norm": 0.9232132434844971, + "learning_rate": 0.0002770756896317435, + "loss": 3.4071, + "step": 85150 + }, + { + "epoch": 5.785772523440685, + "grad_norm": 0.9695355296134949, + "learning_rate": 0.00027703322462291075, + "loss": 3.3758, + "step": 85155 + }, + { + "epoch": 5.786112243511346, + "grad_norm": 0.8862046599388123, + "learning_rate": 0.000276990759614078, + "loss": 3.4341, + "step": 85160 + }, + { + "epoch": 5.786451963582008, + "grad_norm": 0.8917404413223267, + "learning_rate": 0.00027694829460524526, + "loss": 3.4408, + "step": 85165 + }, + { + "epoch": 5.7867916836526705, + "grad_norm": 0.854174792766571, + "learning_rate": 0.0002769058295964126, + "loss": 3.5451, + "step": 85170 + }, + { + "epoch": 5.787131403723332, + "grad_norm": 1.3359681367874146, + "learning_rate": 0.0002768633645875798, + "loss": 3.405, + "step": 85175 + }, + { + "epoch": 5.787471123793994, + "grad_norm": 0.9163963198661804, + "learning_rate": 0.0002768208995787471, + "loss": 3.7444, + "step": 85180 + }, + { + "epoch": 5.787810843864656, + "grad_norm": 0.783859133720398, + "learning_rate": 0.00027677843456991444, + "loss": 3.4492, + "step": 85185 + }, + { + "epoch": 5.788150563935317, + "grad_norm": 0.830437421798706, + "learning_rate": 0.00027673596956108166, + "loss": 3.6456, + "step": 85190 + }, + { + "epoch": 5.788490284005979, + "grad_norm": 0.8571285605430603, + "learning_rate": 0.00027669350455224894, + "loss": 3.4765, + "step": 85195 + }, + { + "epoch": 5.788830004076641, + "grad_norm": 1.1131328344345093, + "learning_rate": 0.0002766510395434163, + "loss": 3.3494, + "step": 85200 + }, + { + "epoch": 5.789169724147302, + "grad_norm": 0.9242942333221436, + "learning_rate": 0.0002766085745345835, + "loss": 3.5041, + "step": 85205 + }, + { + "epoch": 5.789509444217964, + "grad_norm": 0.9489637613296509, + "learning_rate": 0.0002765661095257508, + "loss": 3.5862, + "step": 85210 + }, + { + "epoch": 5.7898491642886265, + "grad_norm": 0.8635234832763672, + "learning_rate": 0.00027652364451691806, + "loss": 3.2555, + "step": 85215 + }, + { + "epoch": 5.790188884359288, + "grad_norm": 1.3633110523223877, + "learning_rate": 0.00027648117950808534, + "loss": 3.5194, + "step": 85220 + }, + { + "epoch": 5.79052860442995, + "grad_norm": 1.0149911642074585, + "learning_rate": 0.0002764387144992526, + "loss": 3.2628, + "step": 85225 + }, + { + "epoch": 5.790868324500612, + "grad_norm": 0.8866867423057556, + "learning_rate": 0.0002763962494904199, + "loss": 3.5456, + "step": 85230 + }, + { + "epoch": 5.791208044571273, + "grad_norm": 1.0468628406524658, + "learning_rate": 0.0002763537844815872, + "loss": 3.3359, + "step": 85235 + }, + { + "epoch": 5.791547764641935, + "grad_norm": 0.9685946106910706, + "learning_rate": 0.00027631131947275446, + "loss": 3.382, + "step": 85240 + }, + { + "epoch": 5.791887484712597, + "grad_norm": 0.9827118515968323, + "learning_rate": 0.00027626885446392174, + "loss": 3.5233, + "step": 85245 + }, + { + "epoch": 5.792227204783258, + "grad_norm": 3.432516098022461, + "learning_rate": 0.00027622638945508897, + "loss": 3.4853, + "step": 85250 + }, + { + "epoch": 5.79256692485392, + "grad_norm": 1.001778483390808, + "learning_rate": 0.0002761839244462563, + "loss": 3.3503, + "step": 85255 + }, + { + "epoch": 5.7929066449245825, + "grad_norm": 1.013781189918518, + "learning_rate": 0.0002761414594374236, + "loss": 3.6431, + "step": 85260 + }, + { + "epoch": 5.793246364995244, + "grad_norm": 0.8559560775756836, + "learning_rate": 0.0002760989944285908, + "loss": 3.3218, + "step": 85265 + }, + { + "epoch": 5.793586085065906, + "grad_norm": 1.380214810371399, + "learning_rate": 0.00027605652941975814, + "loss": 3.302, + "step": 85270 + }, + { + "epoch": 5.793925805136568, + "grad_norm": 0.7724699378013611, + "learning_rate": 0.0002760140644109254, + "loss": 3.5282, + "step": 85275 + }, + { + "epoch": 5.794265525207229, + "grad_norm": 0.8812540769577026, + "learning_rate": 0.00027597159940209265, + "loss": 3.719, + "step": 85280 + }, + { + "epoch": 5.794605245277891, + "grad_norm": 0.9294450879096985, + "learning_rate": 0.00027592913439325993, + "loss": 3.5885, + "step": 85285 + }, + { + "epoch": 5.794944965348552, + "grad_norm": 0.7558821439743042, + "learning_rate": 0.00027588666938442726, + "loss": 3.4517, + "step": 85290 + }, + { + "epoch": 5.795284685419214, + "grad_norm": 0.9568177461624146, + "learning_rate": 0.00027584420437559454, + "loss": 3.5415, + "step": 85295 + }, + { + "epoch": 5.7956244054898765, + "grad_norm": 0.8929890394210815, + "learning_rate": 0.00027580173936676177, + "loss": 3.4361, + "step": 85300 + }, + { + "epoch": 5.795964125560538, + "grad_norm": 0.984186589717865, + "learning_rate": 0.0002757592743579291, + "loss": 3.5241, + "step": 85305 + }, + { + "epoch": 5.7963038456312, + "grad_norm": 0.9198573231697083, + "learning_rate": 0.0002757168093490964, + "loss": 3.6123, + "step": 85310 + }, + { + "epoch": 5.796643565701862, + "grad_norm": 0.7941446900367737, + "learning_rate": 0.0002756743443402636, + "loss": 3.3759, + "step": 85315 + }, + { + "epoch": 5.796983285772523, + "grad_norm": 1.0163958072662354, + "learning_rate": 0.0002756318793314309, + "loss": 3.144, + "step": 85320 + }, + { + "epoch": 5.797323005843185, + "grad_norm": 0.7657999396324158, + "learning_rate": 0.0002755894143225982, + "loss": 3.2427, + "step": 85325 + }, + { + "epoch": 5.797662725913847, + "grad_norm": 0.8563111424446106, + "learning_rate": 0.00027554694931376545, + "loss": 3.536, + "step": 85330 + }, + { + "epoch": 5.798002445984508, + "grad_norm": 0.6837221384048462, + "learning_rate": 0.00027550448430493273, + "loss": 3.581, + "step": 85335 + }, + { + "epoch": 5.79834216605517, + "grad_norm": 1.2807585000991821, + "learning_rate": 0.00027546201929610006, + "loss": 3.3842, + "step": 85340 + }, + { + "epoch": 5.7986818861258325, + "grad_norm": 1.5165210962295532, + "learning_rate": 0.0002754195542872673, + "loss": 3.3318, + "step": 85345 + }, + { + "epoch": 5.799021606196494, + "grad_norm": 1.1337095499038696, + "learning_rate": 0.00027537708927843457, + "loss": 3.4679, + "step": 85350 + }, + { + "epoch": 5.799361326267156, + "grad_norm": 0.811368465423584, + "learning_rate": 0.00027533462426960185, + "loss": 3.4101, + "step": 85355 + }, + { + "epoch": 5.799701046337818, + "grad_norm": 1.0859991312026978, + "learning_rate": 0.00027529215926076913, + "loss": 3.401, + "step": 85360 + }, + { + "epoch": 5.800040766408479, + "grad_norm": 0.8833967447280884, + "learning_rate": 0.0002752496942519364, + "loss": 3.3243, + "step": 85365 + }, + { + "epoch": 5.800380486479141, + "grad_norm": 1.0116569995880127, + "learning_rate": 0.0002752072292431037, + "loss": 2.9823, + "step": 85370 + }, + { + "epoch": 5.800720206549803, + "grad_norm": 1.0019696950912476, + "learning_rate": 0.00027516476423427097, + "loss": 3.3779, + "step": 85375 + }, + { + "epoch": 5.801059926620464, + "grad_norm": 0.8002239465713501, + "learning_rate": 0.00027512229922543825, + "loss": 3.3028, + "step": 85380 + }, + { + "epoch": 5.801399646691126, + "grad_norm": 0.8214337825775146, + "learning_rate": 0.00027507983421660553, + "loss": 3.4845, + "step": 85385 + }, + { + "epoch": 5.8017393667617885, + "grad_norm": 0.8741536140441895, + "learning_rate": 0.00027503736920777276, + "loss": 3.3163, + "step": 85390 + }, + { + "epoch": 5.80207908683245, + "grad_norm": 0.8808944821357727, + "learning_rate": 0.0002749949041989401, + "loss": 3.6349, + "step": 85395 + }, + { + "epoch": 5.802418806903112, + "grad_norm": 0.9486581087112427, + "learning_rate": 0.00027495243919010737, + "loss": 3.4922, + "step": 85400 + }, + { + "epoch": 5.802758526973774, + "grad_norm": 0.9147966504096985, + "learning_rate": 0.0002749099741812746, + "loss": 3.5344, + "step": 85405 + }, + { + "epoch": 5.803098247044435, + "grad_norm": 1.0907738208770752, + "learning_rate": 0.00027486750917244193, + "loss": 3.4951, + "step": 85410 + }, + { + "epoch": 5.803437967115097, + "grad_norm": 1.1645057201385498, + "learning_rate": 0.0002748250441636092, + "loss": 3.2516, + "step": 85415 + }, + { + "epoch": 5.803777687185759, + "grad_norm": 0.7734910249710083, + "learning_rate": 0.00027478257915477644, + "loss": 3.3471, + "step": 85420 + }, + { + "epoch": 5.80411740725642, + "grad_norm": 1.045120120048523, + "learning_rate": 0.0002747401141459437, + "loss": 3.1837, + "step": 85425 + }, + { + "epoch": 5.804457127327082, + "grad_norm": 1.0016454458236694, + "learning_rate": 0.00027469764913711105, + "loss": 3.2341, + "step": 85430 + }, + { + "epoch": 5.8047968473977445, + "grad_norm": 1.0934362411499023, + "learning_rate": 0.0002746551841282783, + "loss": 3.7125, + "step": 85435 + }, + { + "epoch": 5.805136567468406, + "grad_norm": 1.031195878982544, + "learning_rate": 0.00027461271911944556, + "loss": 3.2646, + "step": 85440 + }, + { + "epoch": 5.805476287539068, + "grad_norm": 1.3409125804901123, + "learning_rate": 0.0002745702541106129, + "loss": 3.5212, + "step": 85445 + }, + { + "epoch": 5.80581600760973, + "grad_norm": 1.1695574522018433, + "learning_rate": 0.0002745277891017801, + "loss": 3.4205, + "step": 85450 + }, + { + "epoch": 5.806155727680391, + "grad_norm": 1.0356390476226807, + "learning_rate": 0.0002744853240929474, + "loss": 3.5075, + "step": 85455 + }, + { + "epoch": 5.806495447751053, + "grad_norm": 0.8575956225395203, + "learning_rate": 0.00027444285908411473, + "loss": 3.3658, + "step": 85460 + }, + { + "epoch": 5.806835167821715, + "grad_norm": 0.7914144992828369, + "learning_rate": 0.000274400394075282, + "loss": 3.5102, + "step": 85465 + }, + { + "epoch": 5.807174887892376, + "grad_norm": 0.9491885304450989, + "learning_rate": 0.00027435792906644924, + "loss": 3.4829, + "step": 85470 + }, + { + "epoch": 5.807514607963038, + "grad_norm": 0.8398022055625916, + "learning_rate": 0.0002743154640576165, + "loss": 3.4574, + "step": 85475 + }, + { + "epoch": 5.8078543280337005, + "grad_norm": 0.7210383415222168, + "learning_rate": 0.00027427299904878385, + "loss": 3.4694, + "step": 85480 + }, + { + "epoch": 5.808194048104362, + "grad_norm": 0.9783635139465332, + "learning_rate": 0.0002742305340399511, + "loss": 3.5397, + "step": 85485 + }, + { + "epoch": 5.808533768175024, + "grad_norm": 0.7893935441970825, + "learning_rate": 0.00027418806903111836, + "loss": 3.5413, + "step": 85490 + }, + { + "epoch": 5.808873488245686, + "grad_norm": 1.0629544258117676, + "learning_rate": 0.0002741456040222857, + "loss": 3.3512, + "step": 85495 + }, + { + "epoch": 5.809213208316347, + "grad_norm": 0.9241312742233276, + "learning_rate": 0.0002741031390134529, + "loss": 3.4363, + "step": 85500 + }, + { + "epoch": 5.809552928387009, + "grad_norm": 0.943058967590332, + "learning_rate": 0.0002740606740046202, + "loss": 3.3459, + "step": 85505 + }, + { + "epoch": 5.809892648457671, + "grad_norm": 0.7687779068946838, + "learning_rate": 0.0002740182089957875, + "loss": 3.3756, + "step": 85510 + }, + { + "epoch": 5.810232368528332, + "grad_norm": 1.0439605712890625, + "learning_rate": 0.00027397574398695476, + "loss": 3.4162, + "step": 85515 + }, + { + "epoch": 5.810572088598994, + "grad_norm": 0.8171853423118591, + "learning_rate": 0.00027393327897812204, + "loss": 3.3947, + "step": 85520 + }, + { + "epoch": 5.8109118086696565, + "grad_norm": 0.8713304400444031, + "learning_rate": 0.0002738908139692893, + "loss": 3.7344, + "step": 85525 + }, + { + "epoch": 5.811251528740318, + "grad_norm": 0.950101375579834, + "learning_rate": 0.0002738483489604566, + "loss": 3.5229, + "step": 85530 + }, + { + "epoch": 5.81159124881098, + "grad_norm": 0.824985146522522, + "learning_rate": 0.0002738058839516239, + "loss": 3.5835, + "step": 85535 + }, + { + "epoch": 5.811930968881642, + "grad_norm": 1.5317907333374023, + "learning_rate": 0.00027376341894279116, + "loss": 3.2508, + "step": 85540 + }, + { + "epoch": 5.812270688952303, + "grad_norm": 0.7840662002563477, + "learning_rate": 0.0002737209539339584, + "loss": 3.4444, + "step": 85545 + }, + { + "epoch": 5.812610409022965, + "grad_norm": 0.9082853198051453, + "learning_rate": 0.0002736784889251257, + "loss": 3.315, + "step": 85550 + }, + { + "epoch": 5.812950129093627, + "grad_norm": 0.9790573716163635, + "learning_rate": 0.000273636023916293, + "loss": 3.5261, + "step": 85555 + }, + { + "epoch": 5.813289849164288, + "grad_norm": 0.7621351480484009, + "learning_rate": 0.0002735935589074602, + "loss": 3.4466, + "step": 85560 + }, + { + "epoch": 5.8136295692349504, + "grad_norm": 0.657404363155365, + "learning_rate": 0.00027355109389862756, + "loss": 3.5114, + "step": 85565 + }, + { + "epoch": 5.8139692893056125, + "grad_norm": 0.8356375694274902, + "learning_rate": 0.00027350862888979484, + "loss": 3.3621, + "step": 85570 + }, + { + "epoch": 5.814309009376274, + "grad_norm": 0.9051974415779114, + "learning_rate": 0.00027346616388096206, + "loss": 3.5166, + "step": 85575 + }, + { + "epoch": 5.814648729446936, + "grad_norm": 1.048669457435608, + "learning_rate": 0.00027342369887212934, + "loss": 3.5788, + "step": 85580 + }, + { + "epoch": 5.814988449517598, + "grad_norm": 0.822951078414917, + "learning_rate": 0.0002733812338632967, + "loss": 3.5148, + "step": 85585 + }, + { + "epoch": 5.815328169588259, + "grad_norm": 1.0010145902633667, + "learning_rate": 0.0002733387688544639, + "loss": 3.2095, + "step": 85590 + }, + { + "epoch": 5.815667889658921, + "grad_norm": 1.1512166261672974, + "learning_rate": 0.0002732963038456312, + "loss": 3.4813, + "step": 85595 + }, + { + "epoch": 5.816007609729583, + "grad_norm": 0.9535001516342163, + "learning_rate": 0.0002732538388367985, + "loss": 3.4815, + "step": 85600 + }, + { + "epoch": 5.816347329800244, + "grad_norm": 0.823553740978241, + "learning_rate": 0.00027321137382796574, + "loss": 3.3985, + "step": 85605 + }, + { + "epoch": 5.8166870498709065, + "grad_norm": 1.202459692955017, + "learning_rate": 0.000273168908819133, + "loss": 3.3477, + "step": 85610 + }, + { + "epoch": 5.8170267699415685, + "grad_norm": 1.0121946334838867, + "learning_rate": 0.0002731264438103003, + "loss": 3.3096, + "step": 85615 + }, + { + "epoch": 5.81736649001223, + "grad_norm": 0.8445636630058289, + "learning_rate": 0.0002730839788014676, + "loss": 3.2654, + "step": 85620 + }, + { + "epoch": 5.817706210082892, + "grad_norm": 1.229304313659668, + "learning_rate": 0.00027304151379263487, + "loss": 3.3372, + "step": 85625 + }, + { + "epoch": 5.818045930153554, + "grad_norm": 0.80705326795578, + "learning_rate": 0.00027299904878380215, + "loss": 3.4465, + "step": 85630 + }, + { + "epoch": 5.818385650224215, + "grad_norm": 0.9996903538703918, + "learning_rate": 0.0002729565837749695, + "loss": 3.4632, + "step": 85635 + }, + { + "epoch": 5.818725370294877, + "grad_norm": 0.9494014382362366, + "learning_rate": 0.0002729141187661367, + "loss": 3.4355, + "step": 85640 + }, + { + "epoch": 5.819065090365539, + "grad_norm": 1.1306864023208618, + "learning_rate": 0.000272871653757304, + "loss": 3.3732, + "step": 85645 + }, + { + "epoch": 5.8194048104362, + "grad_norm": 1.064032793045044, + "learning_rate": 0.00027282918874847127, + "loss": 3.528, + "step": 85650 + }, + { + "epoch": 5.8197445305068625, + "grad_norm": 0.7719119787216187, + "learning_rate": 0.00027278672373963855, + "loss": 3.5414, + "step": 85655 + }, + { + "epoch": 5.8200842505775245, + "grad_norm": 0.9863925576210022, + "learning_rate": 0.0002727442587308058, + "loss": 3.5355, + "step": 85660 + }, + { + "epoch": 5.820423970648186, + "grad_norm": 0.8343253135681152, + "learning_rate": 0.0002727017937219731, + "loss": 3.5537, + "step": 85665 + }, + { + "epoch": 5.820763690718848, + "grad_norm": 0.886970043182373, + "learning_rate": 0.0002726593287131404, + "loss": 3.4982, + "step": 85670 + }, + { + "epoch": 5.82110341078951, + "grad_norm": 0.8007820248603821, + "learning_rate": 0.00027261686370430767, + "loss": 3.4414, + "step": 85675 + }, + { + "epoch": 5.821443130860171, + "grad_norm": 0.8986972570419312, + "learning_rate": 0.00027257439869547495, + "loss": 3.4704, + "step": 85680 + }, + { + "epoch": 5.821782850930833, + "grad_norm": 0.7778869867324829, + "learning_rate": 0.00027253193368664217, + "loss": 3.2434, + "step": 85685 + }, + { + "epoch": 5.822122571001495, + "grad_norm": 0.8440612554550171, + "learning_rate": 0.0002724894686778095, + "loss": 3.3359, + "step": 85690 + }, + { + "epoch": 5.822462291072156, + "grad_norm": 0.930967390537262, + "learning_rate": 0.0002724470036689768, + "loss": 3.5938, + "step": 85695 + }, + { + "epoch": 5.8228020111428185, + "grad_norm": 0.8988779187202454, + "learning_rate": 0.000272404538660144, + "loss": 3.3772, + "step": 85700 + }, + { + "epoch": 5.8231417312134806, + "grad_norm": 1.0116018056869507, + "learning_rate": 0.00027236207365131135, + "loss": 3.3669, + "step": 85705 + }, + { + "epoch": 5.823481451284142, + "grad_norm": 0.8337979316711426, + "learning_rate": 0.0002723196086424786, + "loss": 3.3797, + "step": 85710 + }, + { + "epoch": 5.823821171354804, + "grad_norm": 0.9278810024261475, + "learning_rate": 0.00027227714363364585, + "loss": 3.185, + "step": 85715 + }, + { + "epoch": 5.824160891425466, + "grad_norm": 1.001962423324585, + "learning_rate": 0.00027223467862481313, + "loss": 3.5888, + "step": 85720 + }, + { + "epoch": 5.824500611496127, + "grad_norm": 1.2054061889648438, + "learning_rate": 0.00027219221361598047, + "loss": 3.4313, + "step": 85725 + }, + { + "epoch": 5.824840331566789, + "grad_norm": 1.0082054138183594, + "learning_rate": 0.0002721497486071477, + "loss": 3.4865, + "step": 85730 + }, + { + "epoch": 5.825180051637451, + "grad_norm": 1.1160482168197632, + "learning_rate": 0.00027210728359831497, + "loss": 3.2638, + "step": 85735 + }, + { + "epoch": 5.825519771708112, + "grad_norm": 0.876923680305481, + "learning_rate": 0.0002720648185894823, + "loss": 3.3704, + "step": 85740 + }, + { + "epoch": 5.8258594917787745, + "grad_norm": 0.9848538637161255, + "learning_rate": 0.00027202235358064953, + "loss": 3.4257, + "step": 85745 + }, + { + "epoch": 5.826199211849436, + "grad_norm": 0.9340847730636597, + "learning_rate": 0.0002719798885718168, + "loss": 3.5443, + "step": 85750 + }, + { + "epoch": 5.826538931920098, + "grad_norm": 1.0072225332260132, + "learning_rate": 0.00027193742356298415, + "loss": 3.306, + "step": 85755 + }, + { + "epoch": 5.82687865199076, + "grad_norm": 0.856212317943573, + "learning_rate": 0.0002718949585541514, + "loss": 3.189, + "step": 85760 + }, + { + "epoch": 5.827218372061421, + "grad_norm": 0.7648259997367859, + "learning_rate": 0.00027185249354531865, + "loss": 3.6671, + "step": 85765 + }, + { + "epoch": 5.827558092132083, + "grad_norm": 1.0310956239700317, + "learning_rate": 0.00027181002853648593, + "loss": 3.3332, + "step": 85770 + }, + { + "epoch": 5.827897812202745, + "grad_norm": 0.9745451211929321, + "learning_rate": 0.0002717675635276532, + "loss": 3.153, + "step": 85775 + }, + { + "epoch": 5.828237532273406, + "grad_norm": 0.7667463421821594, + "learning_rate": 0.0002717250985188205, + "loss": 3.4433, + "step": 85780 + }, + { + "epoch": 5.828577252344068, + "grad_norm": 0.8918489217758179, + "learning_rate": 0.0002716826335099878, + "loss": 3.5023, + "step": 85785 + }, + { + "epoch": 5.8289169724147305, + "grad_norm": 0.8896180987358093, + "learning_rate": 0.00027164016850115505, + "loss": 3.5919, + "step": 85790 + }, + { + "epoch": 5.829256692485392, + "grad_norm": 0.689349353313446, + "learning_rate": 0.00027159770349232233, + "loss": 3.4804, + "step": 85795 + }, + { + "epoch": 5.829596412556054, + "grad_norm": 0.8774466514587402, + "learning_rate": 0.0002715552384834896, + "loss": 3.2648, + "step": 85800 + }, + { + "epoch": 5.829936132626716, + "grad_norm": 1.1038342714309692, + "learning_rate": 0.0002715127734746569, + "loss": 3.3377, + "step": 85805 + }, + { + "epoch": 5.830275852697377, + "grad_norm": 0.8872410655021667, + "learning_rate": 0.0002714703084658242, + "loss": 3.5323, + "step": 85810 + }, + { + "epoch": 5.830615572768039, + "grad_norm": 1.0103442668914795, + "learning_rate": 0.00027142784345699145, + "loss": 3.2287, + "step": 85815 + }, + { + "epoch": 5.830955292838701, + "grad_norm": 0.7691941857337952, + "learning_rate": 0.00027138537844815873, + "loss": 3.443, + "step": 85820 + }, + { + "epoch": 5.831295012909362, + "grad_norm": 0.8993629217147827, + "learning_rate": 0.000271342913439326, + "loss": 3.6297, + "step": 85825 + }, + { + "epoch": 5.831634732980024, + "grad_norm": 0.9167149066925049, + "learning_rate": 0.0002713004484304933, + "loss": 3.4515, + "step": 85830 + }, + { + "epoch": 5.8319744530506865, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002712579834216606, + "loss": 3.7221, + "step": 85835 + }, + { + "epoch": 5.832314173121348, + "grad_norm": 0.7262733578681946, + "learning_rate": 0.0002712155184128278, + "loss": 3.6246, + "step": 85840 + }, + { + "epoch": 5.83265389319201, + "grad_norm": 0.9179102778434753, + "learning_rate": 0.00027117305340399513, + "loss": 3.4507, + "step": 85845 + }, + { + "epoch": 5.832993613262672, + "grad_norm": 0.902248203754425, + "learning_rate": 0.0002711305883951624, + "loss": 3.4139, + "step": 85850 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.8453141450881958, + "learning_rate": 0.00027108812338632964, + "loss": 3.6762, + "step": 85855 + }, + { + "epoch": 5.833673053403995, + "grad_norm": 0.8040580749511719, + "learning_rate": 0.000271045658377497, + "loss": 3.5212, + "step": 85860 + }, + { + "epoch": 5.834012773474657, + "grad_norm": 0.947470486164093, + "learning_rate": 0.00027100319336866425, + "loss": 3.3452, + "step": 85865 + }, + { + "epoch": 5.834352493545318, + "grad_norm": 1.0370835065841675, + "learning_rate": 0.0002709607283598315, + "loss": 3.5386, + "step": 85870 + }, + { + "epoch": 5.8346922136159804, + "grad_norm": 0.9472407102584839, + "learning_rate": 0.00027091826335099876, + "loss": 3.2859, + "step": 85875 + }, + { + "epoch": 5.8350319336866425, + "grad_norm": 1.6536996364593506, + "learning_rate": 0.0002708757983421661, + "loss": 3.4583, + "step": 85880 + }, + { + "epoch": 5.835371653757304, + "grad_norm": 0.7231073379516602, + "learning_rate": 0.0002708333333333333, + "loss": 3.2972, + "step": 85885 + }, + { + "epoch": 5.835711373827966, + "grad_norm": 0.9917052388191223, + "learning_rate": 0.0002707908683245006, + "loss": 3.6698, + "step": 85890 + }, + { + "epoch": 5.836051093898628, + "grad_norm": 1.342458724975586, + "learning_rate": 0.00027074840331566794, + "loss": 3.4437, + "step": 85895 + }, + { + "epoch": 5.836390813969289, + "grad_norm": 0.9054881930351257, + "learning_rate": 0.00027070593830683516, + "loss": 3.323, + "step": 85900 + }, + { + "epoch": 5.836730534039951, + "grad_norm": 0.9519320130348206, + "learning_rate": 0.00027066347329800244, + "loss": 3.3563, + "step": 85905 + }, + { + "epoch": 5.837070254110613, + "grad_norm": 0.9664747714996338, + "learning_rate": 0.0002706210082891697, + "loss": 3.5183, + "step": 85910 + }, + { + "epoch": 5.837409974181274, + "grad_norm": 0.8269459009170532, + "learning_rate": 0.000270578543280337, + "loss": 3.3623, + "step": 85915 + }, + { + "epoch": 5.8377496942519365, + "grad_norm": 0.8527933955192566, + "learning_rate": 0.0002705360782715043, + "loss": 3.579, + "step": 85920 + }, + { + "epoch": 5.8380894143225985, + "grad_norm": 0.7117810845375061, + "learning_rate": 0.0002705021062644381, + "loss": 3.3925, + "step": 85925 + }, + { + "epoch": 5.83842913439326, + "grad_norm": 0.8804981112480164, + "learning_rate": 0.0002704596412556054, + "loss": 3.627, + "step": 85930 + }, + { + "epoch": 5.838768854463922, + "grad_norm": 1.1434721946716309, + "learning_rate": 0.00027041717624677267, + "loss": 3.2384, + "step": 85935 + }, + { + "epoch": 5.839108574534584, + "grad_norm": 0.7131012678146362, + "learning_rate": 0.00027037471123793995, + "loss": 3.1806, + "step": 85940 + }, + { + "epoch": 5.839448294605245, + "grad_norm": 0.8278761506080627, + "learning_rate": 0.0002703322462291072, + "loss": 3.4977, + "step": 85945 + }, + { + "epoch": 5.839788014675907, + "grad_norm": 0.856231689453125, + "learning_rate": 0.00027028978122027445, + "loss": 3.6983, + "step": 85950 + }, + { + "epoch": 5.840127734746569, + "grad_norm": 0.8632441759109497, + "learning_rate": 0.0002702473162114418, + "loss": 3.5011, + "step": 85955 + }, + { + "epoch": 5.84046745481723, + "grad_norm": 0.8508334159851074, + "learning_rate": 0.00027020485120260907, + "loss": 3.3592, + "step": 85960 + }, + { + "epoch": 5.8408071748878925, + "grad_norm": 0.8594121932983398, + "learning_rate": 0.0002701623861937763, + "loss": 3.2995, + "step": 85965 + }, + { + "epoch": 5.841146894958554, + "grad_norm": 1.7162503004074097, + "learning_rate": 0.0002701199211849436, + "loss": 3.1554, + "step": 85970 + }, + { + "epoch": 5.841486615029216, + "grad_norm": 0.8897128701210022, + "learning_rate": 0.0002700774561761109, + "loss": 3.5605, + "step": 85975 + }, + { + "epoch": 5.841826335099878, + "grad_norm": 0.8521721959114075, + "learning_rate": 0.00027003499116727813, + "loss": 3.3733, + "step": 85980 + }, + { + "epoch": 5.842166055170539, + "grad_norm": 1.081356167793274, + "learning_rate": 0.00026999252615844547, + "loss": 3.872, + "step": 85985 + }, + { + "epoch": 5.842505775241201, + "grad_norm": 1.1414093971252441, + "learning_rate": 0.00026995006114961275, + "loss": 3.4899, + "step": 85990 + }, + { + "epoch": 5.842845495311863, + "grad_norm": 0.7409201860427856, + "learning_rate": 0.00026990759614077997, + "loss": 3.424, + "step": 85995 + }, + { + "epoch": 5.843185215382524, + "grad_norm": 0.9080458879470825, + "learning_rate": 0.00026986513113194725, + "loss": 3.1323, + "step": 86000 + }, + { + "epoch": 5.843524935453186, + "grad_norm": 0.9385801553726196, + "learning_rate": 0.0002698226661231146, + "loss": 3.376, + "step": 86005 + }, + { + "epoch": 5.8438646555238485, + "grad_norm": 1.0017622709274292, + "learning_rate": 0.00026978020111428187, + "loss": 3.5508, + "step": 86010 + }, + { + "epoch": 5.84420437559451, + "grad_norm": 0.9929341077804565, + "learning_rate": 0.0002697377361054491, + "loss": 3.4166, + "step": 86015 + }, + { + "epoch": 5.844544095665172, + "grad_norm": 0.846314549446106, + "learning_rate": 0.0002696952710966164, + "loss": 3.3793, + "step": 86020 + }, + { + "epoch": 5.844883815735834, + "grad_norm": 0.9961134791374207, + "learning_rate": 0.0002696528060877837, + "loss": 3.5212, + "step": 86025 + }, + { + "epoch": 5.845223535806495, + "grad_norm": 1.146599531173706, + "learning_rate": 0.00026961034107895093, + "loss": 3.5736, + "step": 86030 + }, + { + "epoch": 5.845563255877157, + "grad_norm": 0.9524893760681152, + "learning_rate": 0.0002695678760701182, + "loss": 3.4581, + "step": 86035 + }, + { + "epoch": 5.845902975947819, + "grad_norm": 1.032321572303772, + "learning_rate": 0.00026952541106128555, + "loss": 3.4487, + "step": 86040 + }, + { + "epoch": 5.84624269601848, + "grad_norm": 0.9524300694465637, + "learning_rate": 0.0002694829460524528, + "loss": 3.4059, + "step": 86045 + }, + { + "epoch": 5.846582416089142, + "grad_norm": 0.9169803261756897, + "learning_rate": 0.00026944048104362005, + "loss": 3.2868, + "step": 86050 + }, + { + "epoch": 5.8469221361598045, + "grad_norm": 0.9699530005455017, + "learning_rate": 0.0002693980160347874, + "loss": 3.2499, + "step": 86055 + }, + { + "epoch": 5.847261856230466, + "grad_norm": 0.8540475368499756, + "learning_rate": 0.0002693555510259546, + "loss": 3.4537, + "step": 86060 + }, + { + "epoch": 5.847601576301128, + "grad_norm": 0.8074814677238464, + "learning_rate": 0.0002693130860171219, + "loss": 3.2293, + "step": 86065 + }, + { + "epoch": 5.84794129637179, + "grad_norm": 0.9232829213142395, + "learning_rate": 0.0002692706210082892, + "loss": 3.2547, + "step": 86070 + }, + { + "epoch": 5.848281016442451, + "grad_norm": 0.8689764142036438, + "learning_rate": 0.00026922815599945645, + "loss": 3.4021, + "step": 86075 + }, + { + "epoch": 5.848620736513113, + "grad_norm": 1.014462947845459, + "learning_rate": 0.00026918569099062373, + "loss": 3.28, + "step": 86080 + }, + { + "epoch": 5.848960456583775, + "grad_norm": 0.8579389452934265, + "learning_rate": 0.000269143225981791, + "loss": 3.4006, + "step": 86085 + }, + { + "epoch": 5.849300176654436, + "grad_norm": 0.8454967141151428, + "learning_rate": 0.0002691007609729583, + "loss": 3.5293, + "step": 86090 + }, + { + "epoch": 5.849639896725098, + "grad_norm": 0.9118895530700684, + "learning_rate": 0.0002690582959641256, + "loss": 3.3605, + "step": 86095 + }, + { + "epoch": 5.8499796167957605, + "grad_norm": 1.1769922971725464, + "learning_rate": 0.00026901583095529285, + "loss": 3.1935, + "step": 86100 + }, + { + "epoch": 5.850319336866422, + "grad_norm": 0.7723960280418396, + "learning_rate": 0.0002689733659464601, + "loss": 3.4319, + "step": 86105 + }, + { + "epoch": 5.850659056937084, + "grad_norm": 0.81398606300354, + "learning_rate": 0.0002689309009376274, + "loss": 3.3631, + "step": 86110 + }, + { + "epoch": 5.850998777007746, + "grad_norm": 0.7734928131103516, + "learning_rate": 0.0002688884359287947, + "loss": 3.2667, + "step": 86115 + }, + { + "epoch": 5.851338497078407, + "grad_norm": 1.0510709285736084, + "learning_rate": 0.0002688459709199619, + "loss": 3.4113, + "step": 86120 + }, + { + "epoch": 5.851678217149069, + "grad_norm": 0.8106723427772522, + "learning_rate": 0.00026880350591112925, + "loss": 3.2034, + "step": 86125 + }, + { + "epoch": 5.852017937219731, + "grad_norm": 0.7515668272972107, + "learning_rate": 0.00026876104090229653, + "loss": 3.4243, + "step": 86130 + }, + { + "epoch": 5.852357657290392, + "grad_norm": 1.0409852266311646, + "learning_rate": 0.00026871857589346376, + "loss": 3.3842, + "step": 86135 + }, + { + "epoch": 5.852697377361054, + "grad_norm": 0.773460328578949, + "learning_rate": 0.00026867611088463104, + "loss": 3.4616, + "step": 86140 + }, + { + "epoch": 5.8530370974317165, + "grad_norm": 0.7904223203659058, + "learning_rate": 0.0002686336458757984, + "loss": 3.5271, + "step": 86145 + }, + { + "epoch": 5.853376817502378, + "grad_norm": 0.7308520078659058, + "learning_rate": 0.0002685911808669656, + "loss": 3.4609, + "step": 86150 + }, + { + "epoch": 5.85371653757304, + "grad_norm": 0.8999838829040527, + "learning_rate": 0.0002685487158581329, + "loss": 3.4168, + "step": 86155 + }, + { + "epoch": 5.854056257643702, + "grad_norm": 0.7455321550369263, + "learning_rate": 0.0002685062508493002, + "loss": 3.2149, + "step": 86160 + }, + { + "epoch": 5.854395977714363, + "grad_norm": 0.8829158544540405, + "learning_rate": 0.00026846378584046744, + "loss": 3.5983, + "step": 86165 + }, + { + "epoch": 5.854735697785025, + "grad_norm": 1.497408390045166, + "learning_rate": 0.0002684213208316347, + "loss": 3.5042, + "step": 86170 + }, + { + "epoch": 5.855075417855687, + "grad_norm": 0.986340343952179, + "learning_rate": 0.000268378855822802, + "loss": 3.4139, + "step": 86175 + }, + { + "epoch": 5.855415137926348, + "grad_norm": 0.8681049942970276, + "learning_rate": 0.00026833639081396934, + "loss": 3.5741, + "step": 86180 + }, + { + "epoch": 5.8557548579970105, + "grad_norm": 0.9097377061843872, + "learning_rate": 0.00026829392580513656, + "loss": 3.5196, + "step": 86185 + }, + { + "epoch": 5.8560945780676725, + "grad_norm": 0.8823378682136536, + "learning_rate": 0.00026825146079630384, + "loss": 3.4557, + "step": 86190 + }, + { + "epoch": 5.856434298138334, + "grad_norm": 0.6795923709869385, + "learning_rate": 0.0002682089957874712, + "loss": 3.4685, + "step": 86195 + }, + { + "epoch": 5.856774018208996, + "grad_norm": 0.8642271161079407, + "learning_rate": 0.0002681665307786384, + "loss": 3.3359, + "step": 86200 + }, + { + "epoch": 5.857113738279658, + "grad_norm": 1.0377018451690674, + "learning_rate": 0.0002681240657698057, + "loss": 3.517, + "step": 86205 + }, + { + "epoch": 5.857453458350319, + "grad_norm": 1.006542682647705, + "learning_rate": 0.00026808160076097296, + "loss": 3.3948, + "step": 86210 + }, + { + "epoch": 5.857793178420981, + "grad_norm": 2.9211783409118652, + "learning_rate": 0.00026803913575214024, + "loss": 3.3608, + "step": 86215 + }, + { + "epoch": 5.858132898491643, + "grad_norm": 0.7408518195152283, + "learning_rate": 0.0002679966707433075, + "loss": 3.4321, + "step": 86220 + }, + { + "epoch": 5.858472618562304, + "grad_norm": 0.8106531500816345, + "learning_rate": 0.0002679542057344748, + "loss": 3.2606, + "step": 86225 + }, + { + "epoch": 5.8588123386329665, + "grad_norm": 0.9199251532554626, + "learning_rate": 0.0002679117407256421, + "loss": 3.6131, + "step": 86230 + }, + { + "epoch": 5.8591520587036285, + "grad_norm": 2.767080545425415, + "learning_rate": 0.00026786927571680936, + "loss": 3.2219, + "step": 86235 + }, + { + "epoch": 5.85949177877429, + "grad_norm": 0.82436603307724, + "learning_rate": 0.00026782681070797664, + "loss": 3.3334, + "step": 86240 + }, + { + "epoch": 5.859831498844952, + "grad_norm": 0.7270861268043518, + "learning_rate": 0.00026778434569914387, + "loss": 3.5791, + "step": 86245 + }, + { + "epoch": 5.860171218915614, + "grad_norm": 1.398419976234436, + "learning_rate": 0.0002677418806903112, + "loss": 3.3687, + "step": 86250 + }, + { + "epoch": 5.860510938986275, + "grad_norm": 0.8210367560386658, + "learning_rate": 0.0002676994156814785, + "loss": 3.1861, + "step": 86255 + }, + { + "epoch": 5.860850659056937, + "grad_norm": 0.9818872213363647, + "learning_rate": 0.0002676569506726457, + "loss": 3.3963, + "step": 86260 + }, + { + "epoch": 5.861190379127599, + "grad_norm": 0.8567278981208801, + "learning_rate": 0.00026761448566381304, + "loss": 3.3536, + "step": 86265 + }, + { + "epoch": 5.86153009919826, + "grad_norm": 0.9595933556556702, + "learning_rate": 0.0002675720206549803, + "loss": 3.5343, + "step": 86270 + }, + { + "epoch": 5.8618698192689225, + "grad_norm": 0.9132570028305054, + "learning_rate": 0.00026752955564614755, + "loss": 3.5047, + "step": 86275 + }, + { + "epoch": 5.8622095393395846, + "grad_norm": 0.7381733655929565, + "learning_rate": 0.0002674870906373149, + "loss": 3.4392, + "step": 86280 + }, + { + "epoch": 5.862549259410246, + "grad_norm": 0.9164617657661438, + "learning_rate": 0.00026744462562848216, + "loss": 3.0251, + "step": 86285 + }, + { + "epoch": 5.862888979480908, + "grad_norm": 0.8019139766693115, + "learning_rate": 0.0002674021606196494, + "loss": 3.2512, + "step": 86290 + }, + { + "epoch": 5.86322869955157, + "grad_norm": 0.8563025593757629, + "learning_rate": 0.00026735969561081667, + "loss": 3.596, + "step": 86295 + }, + { + "epoch": 5.863568419622231, + "grad_norm": 0.7289330363273621, + "learning_rate": 0.000267317230601984, + "loss": 3.2338, + "step": 86300 + }, + { + "epoch": 5.863908139692893, + "grad_norm": 0.7861294746398926, + "learning_rate": 0.00026727476559315123, + "loss": 3.6479, + "step": 86305 + }, + { + "epoch": 5.864247859763555, + "grad_norm": 1.1015769243240356, + "learning_rate": 0.0002672323005843185, + "loss": 3.7782, + "step": 86310 + }, + { + "epoch": 5.864587579834216, + "grad_norm": 0.8051776885986328, + "learning_rate": 0.00026718983557548584, + "loss": 3.6983, + "step": 86315 + }, + { + "epoch": 5.8649272999048785, + "grad_norm": 0.8252801895141602, + "learning_rate": 0.00026714737056665307, + "loss": 3.6472, + "step": 86320 + }, + { + "epoch": 5.865267019975541, + "grad_norm": 0.8720535635948181, + "learning_rate": 0.00026710490555782035, + "loss": 3.3102, + "step": 86325 + }, + { + "epoch": 5.865606740046202, + "grad_norm": 0.8332847952842712, + "learning_rate": 0.00026706244054898763, + "loss": 3.5447, + "step": 86330 + }, + { + "epoch": 5.865946460116864, + "grad_norm": 0.9563583731651306, + "learning_rate": 0.0002670199755401549, + "loss": 3.4064, + "step": 86335 + }, + { + "epoch": 5.866286180187526, + "grad_norm": 0.7094854116439819, + "learning_rate": 0.0002669775105313222, + "loss": 3.6079, + "step": 86340 + }, + { + "epoch": 5.866625900258187, + "grad_norm": 1.1143832206726074, + "learning_rate": 0.00026693504552248947, + "loss": 3.1087, + "step": 86345 + }, + { + "epoch": 5.866965620328849, + "grad_norm": 0.8786844611167908, + "learning_rate": 0.0002668925805136568, + "loss": 3.4041, + "step": 86350 + }, + { + "epoch": 5.867305340399511, + "grad_norm": 0.9052826762199402, + "learning_rate": 0.00026685011550482403, + "loss": 3.347, + "step": 86355 + }, + { + "epoch": 5.867645060470172, + "grad_norm": 0.8842107057571411, + "learning_rate": 0.0002668076504959913, + "loss": 3.2645, + "step": 86360 + }, + { + "epoch": 5.8679847805408345, + "grad_norm": 0.8783309459686279, + "learning_rate": 0.0002667651854871586, + "loss": 3.521, + "step": 86365 + }, + { + "epoch": 5.868324500611497, + "grad_norm": 5.126203536987305, + "learning_rate": 0.00026672272047832587, + "loss": 3.3901, + "step": 86370 + }, + { + "epoch": 5.868664220682158, + "grad_norm": 0.9004598259925842, + "learning_rate": 0.00026668025546949315, + "loss": 3.7889, + "step": 86375 + }, + { + "epoch": 5.86900394075282, + "grad_norm": 1.0173715353012085, + "learning_rate": 0.00026663779046066043, + "loss": 3.4043, + "step": 86380 + }, + { + "epoch": 5.869343660823482, + "grad_norm": 0.9580851793289185, + "learning_rate": 0.0002665953254518277, + "loss": 3.3848, + "step": 86385 + }, + { + "epoch": 5.869683380894143, + "grad_norm": 0.9193467497825623, + "learning_rate": 0.000266552860442995, + "loss": 3.5887, + "step": 86390 + }, + { + "epoch": 5.870023100964805, + "grad_norm": 0.9518941640853882, + "learning_rate": 0.00026651039543416227, + "loss": 3.3378, + "step": 86395 + }, + { + "epoch": 5.870362821035467, + "grad_norm": 0.8353771567344666, + "learning_rate": 0.0002664679304253295, + "loss": 3.5532, + "step": 86400 + }, + { + "epoch": 5.870702541106128, + "grad_norm": 0.9603943228721619, + "learning_rate": 0.00026642546541649683, + "loss": 3.433, + "step": 86405 + }, + { + "epoch": 5.8710422611767905, + "grad_norm": 1.1699985265731812, + "learning_rate": 0.0002663830004076641, + "loss": 3.4043, + "step": 86410 + }, + { + "epoch": 5.871381981247453, + "grad_norm": 0.7131714224815369, + "learning_rate": 0.00026634053539883134, + "loss": 3.5599, + "step": 86415 + }, + { + "epoch": 5.871721701318114, + "grad_norm": 0.8245424032211304, + "learning_rate": 0.00026629807038999867, + "loss": 3.4363, + "step": 86420 + }, + { + "epoch": 5.872061421388776, + "grad_norm": 1.0683494806289673, + "learning_rate": 0.00026625560538116595, + "loss": 3.2427, + "step": 86425 + }, + { + "epoch": 5.872401141459437, + "grad_norm": 0.8901724815368652, + "learning_rate": 0.0002662131403723332, + "loss": 3.4132, + "step": 86430 + }, + { + "epoch": 5.872740861530099, + "grad_norm": 0.8202028274536133, + "learning_rate": 0.00026617067536350046, + "loss": 3.5764, + "step": 86435 + }, + { + "epoch": 5.873080581600761, + "grad_norm": 0.8818259835243225, + "learning_rate": 0.0002661282103546678, + "loss": 3.1972, + "step": 86440 + }, + { + "epoch": 5.873420301671422, + "grad_norm": 0.847334086894989, + "learning_rate": 0.000266085745345835, + "loss": 3.4352, + "step": 86445 + }, + { + "epoch": 5.8737600217420844, + "grad_norm": 0.8709720969200134, + "learning_rate": 0.0002660432803370023, + "loss": 3.6704, + "step": 86450 + }, + { + "epoch": 5.8740997418127465, + "grad_norm": 1.0228006839752197, + "learning_rate": 0.00026600081532816963, + "loss": 3.4589, + "step": 86455 + }, + { + "epoch": 5.874439461883408, + "grad_norm": 0.8871210217475891, + "learning_rate": 0.00026595835031933686, + "loss": 3.4706, + "step": 86460 + }, + { + "epoch": 5.87477918195407, + "grad_norm": 0.9307692050933838, + "learning_rate": 0.00026591588531050414, + "loss": 3.264, + "step": 86465 + }, + { + "epoch": 5.875118902024732, + "grad_norm": 0.9189106822013855, + "learning_rate": 0.0002658734203016714, + "loss": 3.4733, + "step": 86470 + }, + { + "epoch": 5.875458622095393, + "grad_norm": 0.7206747531890869, + "learning_rate": 0.0002658309552928387, + "loss": 3.6019, + "step": 86475 + }, + { + "epoch": 5.875798342166055, + "grad_norm": 0.6723261475563049, + "learning_rate": 0.000265788490284006, + "loss": 3.4902, + "step": 86480 + }, + { + "epoch": 5.876138062236717, + "grad_norm": 0.8568747639656067, + "learning_rate": 0.00026574602527517326, + "loss": 3.3928, + "step": 86485 + }, + { + "epoch": 5.876477782307378, + "grad_norm": 0.9626859426498413, + "learning_rate": 0.00026570356026634054, + "loss": 3.3789, + "step": 86490 + }, + { + "epoch": 5.8768175023780405, + "grad_norm": 0.7932118773460388, + "learning_rate": 0.0002656610952575078, + "loss": 3.3378, + "step": 86495 + }, + { + "epoch": 5.8771572224487025, + "grad_norm": 1.1334195137023926, + "learning_rate": 0.0002656186302486751, + "loss": 3.4276, + "step": 86500 + }, + { + "epoch": 5.877496942519364, + "grad_norm": 1.2824655771255493, + "learning_rate": 0.0002655761652398423, + "loss": 3.2653, + "step": 86505 + }, + { + "epoch": 5.877836662590026, + "grad_norm": 0.8707414269447327, + "learning_rate": 0.00026553370023100966, + "loss": 3.4084, + "step": 86510 + }, + { + "epoch": 5.878176382660688, + "grad_norm": 0.7619236707687378, + "learning_rate": 0.00026549123522217694, + "loss": 3.4442, + "step": 86515 + }, + { + "epoch": 5.878516102731349, + "grad_norm": 1.1393473148345947, + "learning_rate": 0.0002654487702133442, + "loss": 3.4899, + "step": 86520 + }, + { + "epoch": 5.878855822802011, + "grad_norm": 0.990719735622406, + "learning_rate": 0.0002654063052045115, + "loss": 3.5318, + "step": 86525 + }, + { + "epoch": 5.879195542872673, + "grad_norm": 0.803991973400116, + "learning_rate": 0.0002653638401956788, + "loss": 3.0146, + "step": 86530 + }, + { + "epoch": 5.879535262943334, + "grad_norm": 0.8346924185752869, + "learning_rate": 0.00026532137518684606, + "loss": 3.4468, + "step": 86535 + }, + { + "epoch": 5.8798749830139965, + "grad_norm": 1.2626184225082397, + "learning_rate": 0.0002652789101780133, + "loss": 3.501, + "step": 86540 + }, + { + "epoch": 5.8802147030846585, + "grad_norm": 0.7619519233703613, + "learning_rate": 0.0002652364451691806, + "loss": 3.3185, + "step": 86545 + }, + { + "epoch": 5.88055442315532, + "grad_norm": 0.8874446749687195, + "learning_rate": 0.0002651939801603479, + "loss": 3.5685, + "step": 86550 + }, + { + "epoch": 5.880894143225982, + "grad_norm": 1.1227364540100098, + "learning_rate": 0.0002651515151515151, + "loss": 3.511, + "step": 86555 + }, + { + "epoch": 5.881233863296644, + "grad_norm": 1.0537521839141846, + "learning_rate": 0.00026510905014268246, + "loss": 3.558, + "step": 86560 + }, + { + "epoch": 5.881573583367305, + "grad_norm": 1.0293253660202026, + "learning_rate": 0.00026506658513384974, + "loss": 3.4533, + "step": 86565 + }, + { + "epoch": 5.881913303437967, + "grad_norm": 0.7562568783760071, + "learning_rate": 0.00026502412012501696, + "loss": 3.5104, + "step": 86570 + }, + { + "epoch": 5.882253023508629, + "grad_norm": 0.9520260691642761, + "learning_rate": 0.0002649816551161843, + "loss": 3.3878, + "step": 86575 + }, + { + "epoch": 5.88259274357929, + "grad_norm": 1.3104500770568848, + "learning_rate": 0.0002649391901073516, + "loss": 3.4505, + "step": 86580 + }, + { + "epoch": 5.8829324636499525, + "grad_norm": 0.8337045907974243, + "learning_rate": 0.0002648967250985188, + "loss": 3.6537, + "step": 86585 + }, + { + "epoch": 5.883272183720615, + "grad_norm": 1.0300835371017456, + "learning_rate": 0.0002648542600896861, + "loss": 3.5426, + "step": 86590 + }, + { + "epoch": 5.883611903791276, + "grad_norm": 1.095925211906433, + "learning_rate": 0.0002648117950808534, + "loss": 3.1029, + "step": 86595 + }, + { + "epoch": 5.883951623861938, + "grad_norm": 0.9421355128288269, + "learning_rate": 0.00026476933007202064, + "loss": 3.4633, + "step": 86600 + }, + { + "epoch": 5.8842913439326, + "grad_norm": 1.0269948244094849, + "learning_rate": 0.0002647268650631879, + "loss": 3.3523, + "step": 86605 + }, + { + "epoch": 5.884631064003261, + "grad_norm": 1.4429559707641602, + "learning_rate": 0.00026468440005435526, + "loss": 3.1757, + "step": 86610 + }, + { + "epoch": 5.884970784073923, + "grad_norm": 0.8259848952293396, + "learning_rate": 0.0002646419350455225, + "loss": 3.5232, + "step": 86615 + }, + { + "epoch": 5.885310504144585, + "grad_norm": 0.7626441121101379, + "learning_rate": 0.00026459947003668977, + "loss": 3.4651, + "step": 86620 + }, + { + "epoch": 5.885650224215246, + "grad_norm": 0.8608167767524719, + "learning_rate": 0.00026455700502785705, + "loss": 3.2879, + "step": 86625 + }, + { + "epoch": 5.8859899442859085, + "grad_norm": 1.2798395156860352, + "learning_rate": 0.0002645145400190243, + "loss": 3.4237, + "step": 86630 + }, + { + "epoch": 5.886329664356571, + "grad_norm": 0.8479122519493103, + "learning_rate": 0.0002644720750101916, + "loss": 3.3456, + "step": 86635 + }, + { + "epoch": 5.886669384427232, + "grad_norm": 1.088969111442566, + "learning_rate": 0.0002644296100013589, + "loss": 3.4282, + "step": 86640 + }, + { + "epoch": 5.887009104497894, + "grad_norm": 1.1120307445526123, + "learning_rate": 0.00026438714499252617, + "loss": 3.3597, + "step": 86645 + }, + { + "epoch": 5.887348824568555, + "grad_norm": 0.9269734025001526, + "learning_rate": 0.00026434467998369345, + "loss": 3.5402, + "step": 86650 + }, + { + "epoch": 5.887688544639217, + "grad_norm": 0.8134316802024841, + "learning_rate": 0.0002643022149748607, + "loss": 3.541, + "step": 86655 + }, + { + "epoch": 5.888028264709879, + "grad_norm": 0.9266580939292908, + "learning_rate": 0.00026425974996602795, + "loss": 3.4924, + "step": 86660 + }, + { + "epoch": 5.88836798478054, + "grad_norm": 0.8075083494186401, + "learning_rate": 0.0002642172849571953, + "loss": 3.6724, + "step": 86665 + }, + { + "epoch": 5.888707704851202, + "grad_norm": 1.1349248886108398, + "learning_rate": 0.00026417481994836257, + "loss": 3.389, + "step": 86670 + }, + { + "epoch": 5.8890474249218645, + "grad_norm": 0.9456443786621094, + "learning_rate": 0.0002641323549395298, + "loss": 3.3019, + "step": 86675 + }, + { + "epoch": 5.889387144992526, + "grad_norm": 0.9228094220161438, + "learning_rate": 0.0002640898899306971, + "loss": 3.2039, + "step": 86680 + }, + { + "epoch": 5.889726865063188, + "grad_norm": 0.7995375990867615, + "learning_rate": 0.0002640474249218644, + "loss": 3.5504, + "step": 86685 + }, + { + "epoch": 5.89006658513385, + "grad_norm": 0.7504515647888184, + "learning_rate": 0.0002640049599130317, + "loss": 3.2858, + "step": 86690 + }, + { + "epoch": 5.890406305204511, + "grad_norm": 0.8209716081619263, + "learning_rate": 0.0002639624949041989, + "loss": 3.4779, + "step": 86695 + }, + { + "epoch": 5.890746025275173, + "grad_norm": 2.3663249015808105, + "learning_rate": 0.00026392002989536625, + "loss": 3.2809, + "step": 86700 + }, + { + "epoch": 5.891085745345835, + "grad_norm": 1.0148179531097412, + "learning_rate": 0.0002638775648865335, + "loss": 3.3648, + "step": 86705 + }, + { + "epoch": 5.891425465416496, + "grad_norm": 1.951947569847107, + "learning_rate": 0.00026383509987770075, + "loss": 3.58, + "step": 86710 + }, + { + "epoch": 5.891765185487158, + "grad_norm": 1.0822138786315918, + "learning_rate": 0.0002637926348688681, + "loss": 3.3501, + "step": 86715 + }, + { + "epoch": 5.8921049055578205, + "grad_norm": 0.8705980181694031, + "learning_rate": 0.00026375016986003537, + "loss": 3.31, + "step": 86720 + }, + { + "epoch": 5.892444625628482, + "grad_norm": 0.8680037260055542, + "learning_rate": 0.0002637077048512026, + "loss": 3.7852, + "step": 86725 + }, + { + "epoch": 5.892784345699144, + "grad_norm": 1.068090796470642, + "learning_rate": 0.00026366523984236987, + "loss": 3.4858, + "step": 86730 + }, + { + "epoch": 5.893124065769806, + "grad_norm": 0.8169301152229309, + "learning_rate": 0.0002636227748335372, + "loss": 3.3259, + "step": 86735 + }, + { + "epoch": 5.893463785840467, + "grad_norm": 0.839977502822876, + "learning_rate": 0.00026358030982470443, + "loss": 3.6125, + "step": 86740 + }, + { + "epoch": 5.893803505911129, + "grad_norm": 1.2998368740081787, + "learning_rate": 0.0002635378448158717, + "loss": 3.2385, + "step": 86745 + }, + { + "epoch": 5.894143225981791, + "grad_norm": 0.9597717523574829, + "learning_rate": 0.00026349537980703905, + "loss": 3.4447, + "step": 86750 + }, + { + "epoch": 5.894482946052452, + "grad_norm": 0.8671619296073914, + "learning_rate": 0.0002634529147982063, + "loss": 3.49, + "step": 86755 + }, + { + "epoch": 5.8948226661231145, + "grad_norm": 0.8931105732917786, + "learning_rate": 0.00026341044978937355, + "loss": 3.4284, + "step": 86760 + }, + { + "epoch": 5.8951623861937765, + "grad_norm": 0.9297800660133362, + "learning_rate": 0.00026336798478054083, + "loss": 3.225, + "step": 86765 + }, + { + "epoch": 5.895502106264438, + "grad_norm": 0.7962076663970947, + "learning_rate": 0.0002633255197717081, + "loss": 3.5311, + "step": 86770 + }, + { + "epoch": 5.8958418263351, + "grad_norm": 0.8681319355964661, + "learning_rate": 0.0002632830547628754, + "loss": 3.4835, + "step": 86775 + }, + { + "epoch": 5.896181546405762, + "grad_norm": 0.9529136419296265, + "learning_rate": 0.0002632405897540427, + "loss": 3.3185, + "step": 86780 + }, + { + "epoch": 5.896521266476423, + "grad_norm": 0.9875779747962952, + "learning_rate": 0.00026319812474520995, + "loss": 3.6356, + "step": 86785 + }, + { + "epoch": 5.896860986547085, + "grad_norm": 1.159195899963379, + "learning_rate": 0.00026315565973637723, + "loss": 3.4579, + "step": 86790 + }, + { + "epoch": 5.897200706617747, + "grad_norm": 0.974744975566864, + "learning_rate": 0.0002631131947275445, + "loss": 3.3325, + "step": 86795 + }, + { + "epoch": 5.897540426688408, + "grad_norm": 0.8333086967468262, + "learning_rate": 0.00026307072971871174, + "loss": 3.348, + "step": 86800 + }, + { + "epoch": 5.8978801467590705, + "grad_norm": 1.243043303489685, + "learning_rate": 0.0002630282647098791, + "loss": 3.5183, + "step": 86805 + }, + { + "epoch": 5.8982198668297325, + "grad_norm": 0.8995612263679504, + "learning_rate": 0.00026298579970104635, + "loss": 3.4454, + "step": 86810 + }, + { + "epoch": 5.898559586900394, + "grad_norm": 0.9952684044837952, + "learning_rate": 0.0002629433346922136, + "loss": 3.3127, + "step": 86815 + }, + { + "epoch": 5.898899306971056, + "grad_norm": 0.8556402921676636, + "learning_rate": 0.0002629008696833809, + "loss": 3.5834, + "step": 86820 + }, + { + "epoch": 5.899239027041718, + "grad_norm": 0.708816647529602, + "learning_rate": 0.0002628584046745482, + "loss": 3.4974, + "step": 86825 + }, + { + "epoch": 5.899578747112379, + "grad_norm": 2.3004868030548096, + "learning_rate": 0.0002628159396657154, + "loss": 3.2669, + "step": 86830 + }, + { + "epoch": 5.899918467183041, + "grad_norm": 0.7914449572563171, + "learning_rate": 0.00026277347465688275, + "loss": 3.3325, + "step": 86835 + }, + { + "epoch": 5.900258187253703, + "grad_norm": 0.8589985966682434, + "learning_rate": 0.00026273100964805003, + "loss": 3.399, + "step": 86840 + }, + { + "epoch": 5.900597907324364, + "grad_norm": 0.9387797713279724, + "learning_rate": 0.00026268854463921726, + "loss": 3.604, + "step": 86845 + }, + { + "epoch": 5.9009376273950265, + "grad_norm": 0.8169525265693665, + "learning_rate": 0.00026264607963038454, + "loss": 3.3751, + "step": 86850 + }, + { + "epoch": 5.9012773474656885, + "grad_norm": 1.0765163898468018, + "learning_rate": 0.0002626036146215519, + "loss": 3.2339, + "step": 86855 + }, + { + "epoch": 5.90161706753635, + "grad_norm": 0.7464698553085327, + "learning_rate": 0.00026256114961271915, + "loss": 3.6253, + "step": 86860 + }, + { + "epoch": 5.901956787607012, + "grad_norm": 0.8235451579093933, + "learning_rate": 0.0002625186846038864, + "loss": 3.4092, + "step": 86865 + }, + { + "epoch": 5.902296507677674, + "grad_norm": 1.0528082847595215, + "learning_rate": 0.0002624762195950537, + "loss": 3.5906, + "step": 86870 + }, + { + "epoch": 5.902636227748335, + "grad_norm": 0.868760347366333, + "learning_rate": 0.000262433754586221, + "loss": 3.4225, + "step": 86875 + }, + { + "epoch": 5.902975947818997, + "grad_norm": 1.0837578773498535, + "learning_rate": 0.0002623912895773882, + "loss": 3.4533, + "step": 86880 + }, + { + "epoch": 5.903315667889659, + "grad_norm": 0.9398670196533203, + "learning_rate": 0.0002623488245685555, + "loss": 3.4249, + "step": 86885 + }, + { + "epoch": 5.90365538796032, + "grad_norm": 0.8957585096359253, + "learning_rate": 0.00026230635955972284, + "loss": 3.5816, + "step": 86890 + }, + { + "epoch": 5.9039951080309825, + "grad_norm": 0.8304343223571777, + "learning_rate": 0.00026226389455089006, + "loss": 3.4553, + "step": 86895 + }, + { + "epoch": 5.904334828101645, + "grad_norm": 1.7944316864013672, + "learning_rate": 0.00026222142954205734, + "loss": 3.4434, + "step": 86900 + }, + { + "epoch": 5.904674548172306, + "grad_norm": 1.0046963691711426, + "learning_rate": 0.0002621789645332247, + "loss": 3.4787, + "step": 86905 + }, + { + "epoch": 5.905014268242968, + "grad_norm": 0.79020094871521, + "learning_rate": 0.0002621364995243919, + "loss": 3.1879, + "step": 86910 + }, + { + "epoch": 5.90535398831363, + "grad_norm": 0.9317235946655273, + "learning_rate": 0.0002620940345155592, + "loss": 3.355, + "step": 86915 + }, + { + "epoch": 5.905693708384291, + "grad_norm": 0.8564223647117615, + "learning_rate": 0.00026205156950672646, + "loss": 3.2635, + "step": 86920 + }, + { + "epoch": 5.906033428454953, + "grad_norm": 1.055840015411377, + "learning_rate": 0.00026200910449789374, + "loss": 3.4557, + "step": 86925 + }, + { + "epoch": 5.906373148525615, + "grad_norm": 0.9973794221878052, + "learning_rate": 0.000261966639489061, + "loss": 3.3785, + "step": 86930 + }, + { + "epoch": 5.906712868596276, + "grad_norm": 0.6936120986938477, + "learning_rate": 0.0002619241744802283, + "loss": 3.105, + "step": 86935 + }, + { + "epoch": 5.9070525886669385, + "grad_norm": 1.0391945838928223, + "learning_rate": 0.0002618817094713956, + "loss": 3.1209, + "step": 86940 + }, + { + "epoch": 5.907392308737601, + "grad_norm": 0.893470287322998, + "learning_rate": 0.00026183924446256286, + "loss": 3.3094, + "step": 86945 + }, + { + "epoch": 5.907732028808262, + "grad_norm": 1.0276168584823608, + "learning_rate": 0.00026179677945373014, + "loss": 3.694, + "step": 86950 + }, + { + "epoch": 5.908071748878924, + "grad_norm": 0.8118465542793274, + "learning_rate": 0.00026175431444489737, + "loss": 3.5316, + "step": 86955 + }, + { + "epoch": 5.908411468949586, + "grad_norm": 0.8243783116340637, + "learning_rate": 0.0002617118494360647, + "loss": 3.3234, + "step": 86960 + }, + { + "epoch": 5.908751189020247, + "grad_norm": 1.0331975221633911, + "learning_rate": 0.000261669384427232, + "loss": 3.5842, + "step": 86965 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 0.8877561688423157, + "learning_rate": 0.0002616269194183992, + "loss": 3.421, + "step": 86970 + }, + { + "epoch": 5.909430629161571, + "grad_norm": 0.8891770243644714, + "learning_rate": 0.00026158445440956654, + "loss": 3.7298, + "step": 86975 + }, + { + "epoch": 5.909770349232232, + "grad_norm": 0.8355556130409241, + "learning_rate": 0.0002615419894007338, + "loss": 3.4218, + "step": 86980 + }, + { + "epoch": 5.9101100693028945, + "grad_norm": 0.7937032580375671, + "learning_rate": 0.00026149952439190105, + "loss": 3.3011, + "step": 86985 + }, + { + "epoch": 5.910449789373557, + "grad_norm": 0.8990845084190369, + "learning_rate": 0.00026145705938306833, + "loss": 3.3992, + "step": 86990 + }, + { + "epoch": 5.910789509444218, + "grad_norm": 0.8384902477264404, + "learning_rate": 0.00026141459437423566, + "loss": 3.5586, + "step": 86995 + }, + { + "epoch": 5.91112922951488, + "grad_norm": 0.9573510885238647, + "learning_rate": 0.0002613721293654029, + "loss": 3.4892, + "step": 87000 + }, + { + "epoch": 5.911468949585542, + "grad_norm": 0.8281229734420776, + "learning_rate": 0.00026132966435657017, + "loss": 3.5543, + "step": 87005 + }, + { + "epoch": 5.911808669656203, + "grad_norm": 0.9700893759727478, + "learning_rate": 0.0002612871993477375, + "loss": 3.1885, + "step": 87010 + }, + { + "epoch": 5.912148389726865, + "grad_norm": 0.8416017293930054, + "learning_rate": 0.00026124473433890473, + "loss": 3.3587, + "step": 87015 + }, + { + "epoch": 5.912488109797527, + "grad_norm": 0.8035040497779846, + "learning_rate": 0.000261202269330072, + "loss": 3.4583, + "step": 87020 + }, + { + "epoch": 5.9128278298681884, + "grad_norm": 0.9929309487342834, + "learning_rate": 0.0002611598043212393, + "loss": 3.6649, + "step": 87025 + }, + { + "epoch": 5.9131675499388505, + "grad_norm": 0.8687652945518494, + "learning_rate": 0.0002611173393124066, + "loss": 3.7609, + "step": 87030 + }, + { + "epoch": 5.913507270009513, + "grad_norm": 0.8855723142623901, + "learning_rate": 0.00026107487430357385, + "loss": 3.6028, + "step": 87035 + }, + { + "epoch": 5.913846990080174, + "grad_norm": 0.9196783900260925, + "learning_rate": 0.00026103240929474113, + "loss": 3.4008, + "step": 87040 + }, + { + "epoch": 5.914186710150836, + "grad_norm": 0.8940955400466919, + "learning_rate": 0.00026098994428590846, + "loss": 3.1507, + "step": 87045 + }, + { + "epoch": 5.914526430221498, + "grad_norm": 0.8061543107032776, + "learning_rate": 0.0002609474792770757, + "loss": 3.4413, + "step": 87050 + }, + { + "epoch": 5.914866150292159, + "grad_norm": 1.1648595333099365, + "learning_rate": 0.00026090501426824297, + "loss": 3.1861, + "step": 87055 + }, + { + "epoch": 5.915205870362821, + "grad_norm": 0.8928185701370239, + "learning_rate": 0.00026086254925941025, + "loss": 3.3152, + "step": 87060 + }, + { + "epoch": 5.915545590433483, + "grad_norm": 0.8327751159667969, + "learning_rate": 0.00026082008425057753, + "loss": 3.4412, + "step": 87065 + }, + { + "epoch": 5.9158853105041445, + "grad_norm": 0.9127298593521118, + "learning_rate": 0.0002607776192417448, + "loss": 3.3233, + "step": 87070 + }, + { + "epoch": 5.9162250305748065, + "grad_norm": 0.7545013427734375, + "learning_rate": 0.0002607351542329121, + "loss": 3.5371, + "step": 87075 + }, + { + "epoch": 5.916564750645469, + "grad_norm": 0.9608603715896606, + "learning_rate": 0.00026069268922407937, + "loss": 3.5592, + "step": 87080 + }, + { + "epoch": 5.91690447071613, + "grad_norm": 1.0239756107330322, + "learning_rate": 0.00026065022421524665, + "loss": 3.2858, + "step": 87085 + }, + { + "epoch": 5.917244190786792, + "grad_norm": 0.8383137583732605, + "learning_rate": 0.00026060775920641393, + "loss": 3.7643, + "step": 87090 + }, + { + "epoch": 5.917583910857454, + "grad_norm": 1.015661358833313, + "learning_rate": 0.00026056529419758116, + "loss": 3.2802, + "step": 87095 + }, + { + "epoch": 5.917923630928115, + "grad_norm": 0.7962061762809753, + "learning_rate": 0.0002605228291887485, + "loss": 3.4882, + "step": 87100 + }, + { + "epoch": 5.918263350998777, + "grad_norm": 0.822151243686676, + "learning_rate": 0.00026048036417991577, + "loss": 3.6033, + "step": 87105 + }, + { + "epoch": 5.918603071069439, + "grad_norm": 0.8710185289382935, + "learning_rate": 0.000260437899171083, + "loss": 3.3813, + "step": 87110 + }, + { + "epoch": 5.9189427911401005, + "grad_norm": 0.8527087569236755, + "learning_rate": 0.00026039543416225033, + "loss": 3.4853, + "step": 87115 + }, + { + "epoch": 5.9192825112107625, + "grad_norm": 0.8425455093383789, + "learning_rate": 0.0002603529691534176, + "loss": 3.2583, + "step": 87120 + }, + { + "epoch": 5.919622231281424, + "grad_norm": 0.8958030343055725, + "learning_rate": 0.00026031050414458484, + "loss": 3.526, + "step": 87125 + }, + { + "epoch": 5.919961951352086, + "grad_norm": 0.8853068351745605, + "learning_rate": 0.00026026803913575217, + "loss": 3.1069, + "step": 87130 + }, + { + "epoch": 5.920301671422748, + "grad_norm": 1.8608351945877075, + "learning_rate": 0.00026022557412691945, + "loss": 3.5177, + "step": 87135 + }, + { + "epoch": 5.920641391493409, + "grad_norm": 0.7724325060844421, + "learning_rate": 0.0002601831091180867, + "loss": 3.4227, + "step": 87140 + }, + { + "epoch": 5.920981111564071, + "grad_norm": 1.204099178314209, + "learning_rate": 0.00026014064410925396, + "loss": 3.5353, + "step": 87145 + }, + { + "epoch": 5.921320831634733, + "grad_norm": 0.8287742733955383, + "learning_rate": 0.0002600981791004213, + "loss": 3.456, + "step": 87150 + }, + { + "epoch": 5.921660551705394, + "grad_norm": 0.8621846437454224, + "learning_rate": 0.0002600557140915885, + "loss": 3.4033, + "step": 87155 + }, + { + "epoch": 5.9220002717760565, + "grad_norm": 0.8368261456489563, + "learning_rate": 0.0002600132490827558, + "loss": 3.3735, + "step": 87160 + }, + { + "epoch": 5.9223399918467186, + "grad_norm": 0.9940850734710693, + "learning_rate": 0.00025997078407392313, + "loss": 3.762, + "step": 87165 + }, + { + "epoch": 5.92267971191738, + "grad_norm": 0.9455165266990662, + "learning_rate": 0.00025992831906509036, + "loss": 3.5601, + "step": 87170 + }, + { + "epoch": 5.923019431988042, + "grad_norm": 0.8573393821716309, + "learning_rate": 0.00025988585405625764, + "loss": 3.4057, + "step": 87175 + }, + { + "epoch": 5.923359152058704, + "grad_norm": 0.7993945479393005, + "learning_rate": 0.0002598433890474249, + "loss": 3.5569, + "step": 87180 + }, + { + "epoch": 5.923698872129365, + "grad_norm": 0.7324861884117126, + "learning_rate": 0.0002598009240385922, + "loss": 3.5174, + "step": 87185 + }, + { + "epoch": 5.924038592200027, + "grad_norm": 1.0687042474746704, + "learning_rate": 0.0002597584590297595, + "loss": 3.4761, + "step": 87190 + }, + { + "epoch": 5.924378312270689, + "grad_norm": 0.8822992444038391, + "learning_rate": 0.00025971599402092676, + "loss": 3.6604, + "step": 87195 + }, + { + "epoch": 5.92471803234135, + "grad_norm": 1.1003047227859497, + "learning_rate": 0.0002596735290120941, + "loss": 3.7235, + "step": 87200 + }, + { + "epoch": 5.9250577524120125, + "grad_norm": 0.863612949848175, + "learning_rate": 0.0002596310640032613, + "loss": 3.3171, + "step": 87205 + }, + { + "epoch": 5.925397472482675, + "grad_norm": 1.1000735759735107, + "learning_rate": 0.0002595885989944286, + "loss": 3.4114, + "step": 87210 + }, + { + "epoch": 5.925737192553336, + "grad_norm": 1.0025889873504639, + "learning_rate": 0.0002595461339855959, + "loss": 3.5527, + "step": 87215 + }, + { + "epoch": 5.926076912623998, + "grad_norm": 0.9169624447822571, + "learning_rate": 0.00025950366897676316, + "loss": 3.3941, + "step": 87220 + }, + { + "epoch": 5.92641663269466, + "grad_norm": 0.9119603633880615, + "learning_rate": 0.00025946120396793044, + "loss": 3.31, + "step": 87225 + }, + { + "epoch": 5.926756352765321, + "grad_norm": 1.0673167705535889, + "learning_rate": 0.0002594187389590977, + "loss": 3.4004, + "step": 87230 + }, + { + "epoch": 5.927096072835983, + "grad_norm": 0.7262454628944397, + "learning_rate": 0.000259376273950265, + "loss": 3.7155, + "step": 87235 + }, + { + "epoch": 5.927435792906645, + "grad_norm": 0.7208853960037231, + "learning_rate": 0.0002593338089414323, + "loss": 3.3828, + "step": 87240 + }, + { + "epoch": 5.927775512977306, + "grad_norm": 0.7661446928977966, + "learning_rate": 0.0002592998369343661, + "loss": 3.0308, + "step": 87245 + }, + { + "epoch": 5.9281152330479685, + "grad_norm": 0.9347319006919861, + "learning_rate": 0.0002592573719255334, + "loss": 3.5024, + "step": 87250 + }, + { + "epoch": 5.928454953118631, + "grad_norm": 1.0295777320861816, + "learning_rate": 0.0002592149069167006, + "loss": 3.3241, + "step": 87255 + }, + { + "epoch": 5.928794673189292, + "grad_norm": 1.023579478263855, + "learning_rate": 0.00025917244190786794, + "loss": 3.5432, + "step": 87260 + }, + { + "epoch": 5.929134393259954, + "grad_norm": 1.0716497898101807, + "learning_rate": 0.0002591299768990352, + "loss": 3.304, + "step": 87265 + }, + { + "epoch": 5.929474113330616, + "grad_norm": 0.9037529230117798, + "learning_rate": 0.00025908751189020245, + "loss": 3.6416, + "step": 87270 + }, + { + "epoch": 5.929813833401277, + "grad_norm": 13.710335731506348, + "learning_rate": 0.0002590450468813698, + "loss": 3.3151, + "step": 87275 + }, + { + "epoch": 5.930153553471939, + "grad_norm": 0.8618022799491882, + "learning_rate": 0.00025900258187253706, + "loss": 3.3288, + "step": 87280 + }, + { + "epoch": 5.930493273542601, + "grad_norm": 0.9251920580863953, + "learning_rate": 0.0002589601168637043, + "loss": 3.4242, + "step": 87285 + }, + { + "epoch": 5.930832993613262, + "grad_norm": 0.8978553414344788, + "learning_rate": 0.00025891765185487157, + "loss": 3.4317, + "step": 87290 + }, + { + "epoch": 5.9311727136839245, + "grad_norm": 0.9681868553161621, + "learning_rate": 0.0002588751868460389, + "loss": 3.4826, + "step": 87295 + }, + { + "epoch": 5.931512433754587, + "grad_norm": 4.869377613067627, + "learning_rate": 0.00025883272183720613, + "loss": 3.3921, + "step": 87300 + }, + { + "epoch": 5.931852153825248, + "grad_norm": 1.019148588180542, + "learning_rate": 0.0002587902568283734, + "loss": 3.4553, + "step": 87305 + }, + { + "epoch": 5.93219187389591, + "grad_norm": 0.9622085094451904, + "learning_rate": 0.00025874779181954074, + "loss": 3.3943, + "step": 87310 + }, + { + "epoch": 5.932531593966572, + "grad_norm": 0.879891037940979, + "learning_rate": 0.00025870532681070797, + "loss": 3.5496, + "step": 87315 + }, + { + "epoch": 5.932871314037233, + "grad_norm": 0.7299776077270508, + "learning_rate": 0.00025866286180187525, + "loss": 3.4, + "step": 87320 + }, + { + "epoch": 5.933211034107895, + "grad_norm": 0.8822351694107056, + "learning_rate": 0.00025862039679304253, + "loss": 3.4823, + "step": 87325 + }, + { + "epoch": 5.933550754178556, + "grad_norm": 0.969819188117981, + "learning_rate": 0.0002585779317842098, + "loss": 3.4178, + "step": 87330 + }, + { + "epoch": 5.9338904742492184, + "grad_norm": 1.1901196241378784, + "learning_rate": 0.0002585354667753771, + "loss": 3.515, + "step": 87335 + }, + { + "epoch": 5.9342301943198805, + "grad_norm": 0.853628396987915, + "learning_rate": 0.00025849300176654437, + "loss": 3.7457, + "step": 87340 + }, + { + "epoch": 5.934569914390542, + "grad_norm": 0.830877423286438, + "learning_rate": 0.00025845053675771165, + "loss": 3.3073, + "step": 87345 + }, + { + "epoch": 5.934909634461204, + "grad_norm": 0.9942733645439148, + "learning_rate": 0.00025840807174887893, + "loss": 3.4437, + "step": 87350 + }, + { + "epoch": 5.935249354531866, + "grad_norm": 0.8680742979049683, + "learning_rate": 0.0002583656067400462, + "loss": 3.4135, + "step": 87355 + }, + { + "epoch": 5.935589074602527, + "grad_norm": 1.0217350721359253, + "learning_rate": 0.00025832314173121344, + "loss": 3.4429, + "step": 87360 + }, + { + "epoch": 5.935928794673189, + "grad_norm": 0.7823540568351746, + "learning_rate": 0.00025828067672238077, + "loss": 3.3188, + "step": 87365 + }, + { + "epoch": 5.936268514743851, + "grad_norm": 0.8331390023231506, + "learning_rate": 0.00025823821171354805, + "loss": 3.4444, + "step": 87370 + }, + { + "epoch": 5.936608234814512, + "grad_norm": 0.9083067178726196, + "learning_rate": 0.0002581957467047153, + "loss": 3.186, + "step": 87375 + }, + { + "epoch": 5.9369479548851745, + "grad_norm": 0.9259858727455139, + "learning_rate": 0.0002581532816958826, + "loss": 3.4001, + "step": 87380 + }, + { + "epoch": 5.9372876749558365, + "grad_norm": 1.0337470769882202, + "learning_rate": 0.0002581108166870499, + "loss": 3.4656, + "step": 87385 + }, + { + "epoch": 5.937627395026498, + "grad_norm": 0.8176271915435791, + "learning_rate": 0.0002580683516782171, + "loss": 3.3112, + "step": 87390 + }, + { + "epoch": 5.93796711509716, + "grad_norm": 1.1741375923156738, + "learning_rate": 0.00025802588666938445, + "loss": 3.1358, + "step": 87395 + }, + { + "epoch": 5.938306835167822, + "grad_norm": 0.9300533533096313, + "learning_rate": 0.00025798342166055173, + "loss": 3.0622, + "step": 87400 + }, + { + "epoch": 5.938646555238483, + "grad_norm": 1.109507441520691, + "learning_rate": 0.000257940956651719, + "loss": 3.3859, + "step": 87405 + }, + { + "epoch": 5.938986275309145, + "grad_norm": 1.1454970836639404, + "learning_rate": 0.00025789849164288624, + "loss": 3.5865, + "step": 87410 + }, + { + "epoch": 5.939325995379807, + "grad_norm": 0.9350774884223938, + "learning_rate": 0.00025785602663405357, + "loss": 3.2065, + "step": 87415 + }, + { + "epoch": 5.939665715450468, + "grad_norm": 1.0308334827423096, + "learning_rate": 0.00025781356162522085, + "loss": 3.387, + "step": 87420 + }, + { + "epoch": 5.9400054355211305, + "grad_norm": 0.8213961124420166, + "learning_rate": 0.0002577710966163881, + "loss": 3.1875, + "step": 87425 + }, + { + "epoch": 5.9403451555917925, + "grad_norm": 0.6979351043701172, + "learning_rate": 0.0002577286316075554, + "loss": 3.252, + "step": 87430 + }, + { + "epoch": 5.940684875662454, + "grad_norm": 0.8709296584129333, + "learning_rate": 0.0002576861665987227, + "loss": 3.5936, + "step": 87435 + }, + { + "epoch": 5.941024595733116, + "grad_norm": 0.8492176532745361, + "learning_rate": 0.0002576437015898899, + "loss": 3.3563, + "step": 87440 + }, + { + "epoch": 5.941364315803778, + "grad_norm": 0.8691036105155945, + "learning_rate": 0.0002576012365810572, + "loss": 3.2763, + "step": 87445 + }, + { + "epoch": 5.941704035874439, + "grad_norm": 1.2808903455734253, + "learning_rate": 0.00025755877157222453, + "loss": 3.439, + "step": 87450 + }, + { + "epoch": 5.942043755945101, + "grad_norm": 0.8880999684333801, + "learning_rate": 0.00025751630656339176, + "loss": 3.4582, + "step": 87455 + }, + { + "epoch": 5.942383476015763, + "grad_norm": 0.8245532512664795, + "learning_rate": 0.00025747384155455904, + "loss": 3.5298, + "step": 87460 + }, + { + "epoch": 5.942723196086424, + "grad_norm": 0.9818351864814758, + "learning_rate": 0.00025743137654572637, + "loss": 3.2919, + "step": 87465 + }, + { + "epoch": 5.9430629161570865, + "grad_norm": 0.7561081647872925, + "learning_rate": 0.0002573889115368936, + "loss": 3.2671, + "step": 87470 + }, + { + "epoch": 5.943402636227749, + "grad_norm": 1.0153244733810425, + "learning_rate": 0.0002573464465280609, + "loss": 3.5446, + "step": 87475 + }, + { + "epoch": 5.94374235629841, + "grad_norm": 0.8600819110870361, + "learning_rate": 0.00025730398151922816, + "loss": 3.5694, + "step": 87480 + }, + { + "epoch": 5.944082076369072, + "grad_norm": 0.8303184509277344, + "learning_rate": 0.00025726151651039544, + "loss": 3.5344, + "step": 87485 + }, + { + "epoch": 5.944421796439734, + "grad_norm": 0.8291602730751038, + "learning_rate": 0.0002572190515015627, + "loss": 3.3607, + "step": 87490 + }, + { + "epoch": 5.944761516510395, + "grad_norm": 0.9906383752822876, + "learning_rate": 0.00025717658649273, + "loss": 3.2498, + "step": 87495 + }, + { + "epoch": 5.945101236581057, + "grad_norm": 1.0506237745285034, + "learning_rate": 0.0002571341214838973, + "loss": 3.599, + "step": 87500 + }, + { + "epoch": 5.945440956651719, + "grad_norm": 0.890946626663208, + "learning_rate": 0.00025709165647506456, + "loss": 3.4714, + "step": 87505 + }, + { + "epoch": 5.94578067672238, + "grad_norm": 0.8295615315437317, + "learning_rate": 0.00025704919146623184, + "loss": 3.2971, + "step": 87510 + }, + { + "epoch": 5.9461203967930425, + "grad_norm": 0.7587732076644897, + "learning_rate": 0.00025700672645739906, + "loss": 3.5896, + "step": 87515 + }, + { + "epoch": 5.946460116863705, + "grad_norm": 1.0273265838623047, + "learning_rate": 0.0002569642614485664, + "loss": 3.5832, + "step": 87520 + }, + { + "epoch": 5.946799836934366, + "grad_norm": 0.8584988117218018, + "learning_rate": 0.0002569217964397337, + "loss": 3.6005, + "step": 87525 + }, + { + "epoch": 5.947139557005028, + "grad_norm": 0.9107551574707031, + "learning_rate": 0.0002568793314309009, + "loss": 3.232, + "step": 87530 + }, + { + "epoch": 5.94747927707569, + "grad_norm": 0.8768028616905212, + "learning_rate": 0.00025683686642206824, + "loss": 3.5846, + "step": 87535 + }, + { + "epoch": 5.947818997146351, + "grad_norm": 0.814556896686554, + "learning_rate": 0.0002567944014132355, + "loss": 3.4046, + "step": 87540 + }, + { + "epoch": 5.948158717217013, + "grad_norm": 1.1070818901062012, + "learning_rate": 0.00025675193640440274, + "loss": 3.4018, + "step": 87545 + }, + { + "epoch": 5.948498437287675, + "grad_norm": 0.8664720058441162, + "learning_rate": 0.00025670947139557, + "loss": 3.2525, + "step": 87550 + }, + { + "epoch": 5.948838157358336, + "grad_norm": 1.1122792959213257, + "learning_rate": 0.00025666700638673736, + "loss": 3.697, + "step": 87555 + }, + { + "epoch": 5.9491778774289985, + "grad_norm": 0.7186791896820068, + "learning_rate": 0.0002566245413779046, + "loss": 3.1844, + "step": 87560 + }, + { + "epoch": 5.949517597499661, + "grad_norm": 0.8669803142547607, + "learning_rate": 0.00025658207636907186, + "loss": 3.3063, + "step": 87565 + }, + { + "epoch": 5.949857317570322, + "grad_norm": 0.9959481954574585, + "learning_rate": 0.0002565396113602392, + "loss": 3.4692, + "step": 87570 + }, + { + "epoch": 5.950197037640984, + "grad_norm": 0.8121986389160156, + "learning_rate": 0.0002564971463514065, + "loss": 3.2514, + "step": 87575 + }, + { + "epoch": 5.950536757711646, + "grad_norm": 1.2832014560699463, + "learning_rate": 0.0002564546813425737, + "loss": 3.4432, + "step": 87580 + }, + { + "epoch": 5.950876477782307, + "grad_norm": 0.7834803462028503, + "learning_rate": 0.000256412216333741, + "loss": 3.5945, + "step": 87585 + }, + { + "epoch": 5.951216197852969, + "grad_norm": 1.1629315614700317, + "learning_rate": 0.0002563697513249083, + "loss": 3.3294, + "step": 87590 + }, + { + "epoch": 5.951555917923631, + "grad_norm": 0.994955837726593, + "learning_rate": 0.00025632728631607554, + "loss": 3.5415, + "step": 87595 + }, + { + "epoch": 5.951895637994292, + "grad_norm": 0.8421398401260376, + "learning_rate": 0.0002562848213072428, + "loss": 3.4058, + "step": 87600 + }, + { + "epoch": 5.9522353580649545, + "grad_norm": 1.109228491783142, + "learning_rate": 0.00025624235629841016, + "loss": 3.5581, + "step": 87605 + }, + { + "epoch": 5.952575078135617, + "grad_norm": 0.858466386795044, + "learning_rate": 0.0002561998912895774, + "loss": 3.6875, + "step": 87610 + }, + { + "epoch": 5.952914798206278, + "grad_norm": 1.1190904378890991, + "learning_rate": 0.00025615742628074467, + "loss": 3.6751, + "step": 87615 + }, + { + "epoch": 5.95325451827694, + "grad_norm": 0.9409007430076599, + "learning_rate": 0.000256114961271912, + "loss": 3.3409, + "step": 87620 + }, + { + "epoch": 5.953594238347602, + "grad_norm": 1.0708022117614746, + "learning_rate": 0.0002560724962630792, + "loss": 3.2962, + "step": 87625 + }, + { + "epoch": 5.953933958418263, + "grad_norm": 0.9211446642875671, + "learning_rate": 0.0002560300312542465, + "loss": 3.6328, + "step": 87630 + }, + { + "epoch": 5.954273678488925, + "grad_norm": 0.8534998297691345, + "learning_rate": 0.0002559875662454138, + "loss": 3.4303, + "step": 87635 + }, + { + "epoch": 5.954613398559587, + "grad_norm": 0.7810120582580566, + "learning_rate": 0.00025594510123658107, + "loss": 3.3127, + "step": 87640 + }, + { + "epoch": 5.9549531186302485, + "grad_norm": 0.8890312314033508, + "learning_rate": 0.00025590263622774835, + "loss": 3.4002, + "step": 87645 + }, + { + "epoch": 5.9552928387009105, + "grad_norm": 0.8582366108894348, + "learning_rate": 0.0002558601712189156, + "loss": 3.5398, + "step": 87650 + }, + { + "epoch": 5.955632558771573, + "grad_norm": 1.101770043373108, + "learning_rate": 0.0002558177062100829, + "loss": 3.4661, + "step": 87655 + }, + { + "epoch": 5.955972278842234, + "grad_norm": 0.7853280305862427, + "learning_rate": 0.0002557752412012502, + "loss": 3.4261, + "step": 87660 + }, + { + "epoch": 5.956311998912896, + "grad_norm": 1.173232078552246, + "learning_rate": 0.00025573277619241747, + "loss": 3.499, + "step": 87665 + }, + { + "epoch": 5.956651718983558, + "grad_norm": 1.2011194229125977, + "learning_rate": 0.0002556903111835847, + "loss": 3.3928, + "step": 87670 + }, + { + "epoch": 5.956991439054219, + "grad_norm": 0.9078764915466309, + "learning_rate": 0.000255647846174752, + "loss": 3.4452, + "step": 87675 + }, + { + "epoch": 5.957331159124881, + "grad_norm": 0.8929280638694763, + "learning_rate": 0.0002556053811659193, + "loss": 3.7086, + "step": 87680 + }, + { + "epoch": 5.957670879195543, + "grad_norm": 1.0009629726409912, + "learning_rate": 0.00025556291615708653, + "loss": 3.5002, + "step": 87685 + }, + { + "epoch": 5.9580105992662045, + "grad_norm": 1.2639460563659668, + "learning_rate": 0.00025552045114825387, + "loss": 3.3811, + "step": 87690 + }, + { + "epoch": 5.9583503193368665, + "grad_norm": 0.8047736287117004, + "learning_rate": 0.00025547798613942115, + "loss": 3.3775, + "step": 87695 + }, + { + "epoch": 5.958690039407529, + "grad_norm": 0.9053967595100403, + "learning_rate": 0.00025543552113058837, + "loss": 3.7094, + "step": 87700 + }, + { + "epoch": 5.95902975947819, + "grad_norm": 0.9053887724876404, + "learning_rate": 0.00025539305612175565, + "loss": 3.5563, + "step": 87705 + }, + { + "epoch": 5.959369479548852, + "grad_norm": 0.9383725523948669, + "learning_rate": 0.000255350591112923, + "loss": 3.4727, + "step": 87710 + }, + { + "epoch": 5.959709199619514, + "grad_norm": 0.833698570728302, + "learning_rate": 0.0002553081261040902, + "loss": 3.4187, + "step": 87715 + }, + { + "epoch": 5.960048919690175, + "grad_norm": 0.8406358957290649, + "learning_rate": 0.0002552656610952575, + "loss": 3.5216, + "step": 87720 + }, + { + "epoch": 5.960388639760837, + "grad_norm": 0.8009905219078064, + "learning_rate": 0.0002552231960864248, + "loss": 3.4432, + "step": 87725 + }, + { + "epoch": 5.960728359831499, + "grad_norm": 0.8529903292655945, + "learning_rate": 0.00025518073107759205, + "loss": 3.7745, + "step": 87730 + }, + { + "epoch": 5.9610680799021605, + "grad_norm": 0.847158670425415, + "learning_rate": 0.00025513826606875933, + "loss": 3.5308, + "step": 87735 + }, + { + "epoch": 5.9614077999728226, + "grad_norm": 0.8853094577789307, + "learning_rate": 0.0002550958010599266, + "loss": 3.3524, + "step": 87740 + }, + { + "epoch": 5.961747520043485, + "grad_norm": 1.1828546524047852, + "learning_rate": 0.00025505333605109395, + "loss": 3.193, + "step": 87745 + }, + { + "epoch": 5.962087240114146, + "grad_norm": 0.9513552784919739, + "learning_rate": 0.0002550108710422612, + "loss": 3.833, + "step": 87750 + }, + { + "epoch": 5.962426960184808, + "grad_norm": 0.8209564089775085, + "learning_rate": 0.00025496840603342845, + "loss": 3.4537, + "step": 87755 + }, + { + "epoch": 5.96276668025547, + "grad_norm": 0.8446710109710693, + "learning_rate": 0.0002549259410245958, + "loss": 3.5412, + "step": 87760 + }, + { + "epoch": 5.963106400326131, + "grad_norm": 0.8388610482215881, + "learning_rate": 0.000254883476015763, + "loss": 3.7551, + "step": 87765 + }, + { + "epoch": 5.963446120396793, + "grad_norm": 1.1486258506774902, + "learning_rate": 0.0002548410110069303, + "loss": 3.5956, + "step": 87770 + }, + { + "epoch": 5.963785840467455, + "grad_norm": 1.4008855819702148, + "learning_rate": 0.0002547985459980976, + "loss": 3.3362, + "step": 87775 + }, + { + "epoch": 5.9641255605381165, + "grad_norm": 0.7611139416694641, + "learning_rate": 0.00025475608098926485, + "loss": 3.4685, + "step": 87780 + }, + { + "epoch": 5.964465280608779, + "grad_norm": 0.9360001683235168, + "learning_rate": 0.00025471361598043213, + "loss": 3.6552, + "step": 87785 + }, + { + "epoch": 5.964805000679441, + "grad_norm": 0.9566380977630615, + "learning_rate": 0.0002546711509715994, + "loss": 3.582, + "step": 87790 + }, + { + "epoch": 5.965144720750102, + "grad_norm": 0.7862052321434021, + "learning_rate": 0.0002546286859627667, + "loss": 3.1291, + "step": 87795 + }, + { + "epoch": 5.965484440820764, + "grad_norm": 0.8824952244758606, + "learning_rate": 0.000254586220953934, + "loss": 3.545, + "step": 87800 + }, + { + "epoch": 5.965824160891425, + "grad_norm": 1.0531044006347656, + "learning_rate": 0.00025454375594510125, + "loss": 3.3186, + "step": 87805 + }, + { + "epoch": 5.966163880962087, + "grad_norm": 0.8734419345855713, + "learning_rate": 0.0002545012909362685, + "loss": 3.6398, + "step": 87810 + }, + { + "epoch": 5.966503601032749, + "grad_norm": 0.9279415607452393, + "learning_rate": 0.0002544588259274358, + "loss": 3.7741, + "step": 87815 + }, + { + "epoch": 5.96684332110341, + "grad_norm": 0.8120086789131165, + "learning_rate": 0.0002544163609186031, + "loss": 3.3409, + "step": 87820 + }, + { + "epoch": 5.9671830411740725, + "grad_norm": 0.7611218690872192, + "learning_rate": 0.0002543738959097703, + "loss": 3.4591, + "step": 87825 + }, + { + "epoch": 5.967522761244735, + "grad_norm": 0.9114909768104553, + "learning_rate": 0.00025433143090093765, + "loss": 3.6715, + "step": 87830 + }, + { + "epoch": 5.967862481315396, + "grad_norm": 0.9281557202339172, + "learning_rate": 0.00025428896589210493, + "loss": 3.5032, + "step": 87835 + }, + { + "epoch": 5.968202201386058, + "grad_norm": 1.099713921546936, + "learning_rate": 0.00025424650088327216, + "loss": 3.4286, + "step": 87840 + }, + { + "epoch": 5.96854192145672, + "grad_norm": 0.7874103784561157, + "learning_rate": 0.00025420403587443944, + "loss": 3.5774, + "step": 87845 + }, + { + "epoch": 5.968881641527381, + "grad_norm": 0.8104667067527771, + "learning_rate": 0.0002541615708656068, + "loss": 3.3867, + "step": 87850 + }, + { + "epoch": 5.969221361598043, + "grad_norm": 0.8483915328979492, + "learning_rate": 0.000254119105856774, + "loss": 3.5704, + "step": 87855 + }, + { + "epoch": 5.969561081668705, + "grad_norm": 0.9675617814064026, + "learning_rate": 0.0002540766408479413, + "loss": 3.4079, + "step": 87860 + }, + { + "epoch": 5.969900801739366, + "grad_norm": 0.7876060009002686, + "learning_rate": 0.0002540341758391086, + "loss": 3.5209, + "step": 87865 + }, + { + "epoch": 5.9702405218100285, + "grad_norm": 0.842583954334259, + "learning_rate": 0.00025399171083027584, + "loss": 3.4159, + "step": 87870 + }, + { + "epoch": 5.970580241880691, + "grad_norm": 0.8975914120674133, + "learning_rate": 0.0002539492458214431, + "loss": 3.3594, + "step": 87875 + }, + { + "epoch": 5.970919961951352, + "grad_norm": 0.9659945964813232, + "learning_rate": 0.0002539067808126104, + "loss": 3.3833, + "step": 87880 + }, + { + "epoch": 5.971259682022014, + "grad_norm": 0.759992778301239, + "learning_rate": 0.0002538643158037777, + "loss": 3.6166, + "step": 87885 + }, + { + "epoch": 5.971599402092676, + "grad_norm": 1.244498372077942, + "learning_rate": 0.00025382185079494496, + "loss": 3.1945, + "step": 87890 + }, + { + "epoch": 5.971939122163337, + "grad_norm": 0.8708879947662354, + "learning_rate": 0.00025377938578611224, + "loss": 3.5443, + "step": 87895 + }, + { + "epoch": 5.972278842233999, + "grad_norm": 0.9715770483016968, + "learning_rate": 0.0002537369207772795, + "loss": 3.2743, + "step": 87900 + }, + { + "epoch": 5.972618562304661, + "grad_norm": 0.7683677673339844, + "learning_rate": 0.0002536944557684468, + "loss": 3.3208, + "step": 87905 + }, + { + "epoch": 5.9729582823753224, + "grad_norm": 0.9235039949417114, + "learning_rate": 0.0002536519907596141, + "loss": 3.5952, + "step": 87910 + }, + { + "epoch": 5.9732980024459845, + "grad_norm": 0.7747000455856323, + "learning_rate": 0.0002536095257507814, + "loss": 3.5168, + "step": 87915 + }, + { + "epoch": 5.973637722516647, + "grad_norm": 0.9501825571060181, + "learning_rate": 0.00025356706074194864, + "loss": 3.4786, + "step": 87920 + }, + { + "epoch": 5.973977442587308, + "grad_norm": 0.9825599789619446, + "learning_rate": 0.0002535245957331159, + "loss": 3.4249, + "step": 87925 + }, + { + "epoch": 5.97431716265797, + "grad_norm": 0.8727504014968872, + "learning_rate": 0.0002534821307242832, + "loss": 3.5232, + "step": 87930 + }, + { + "epoch": 5.974656882728632, + "grad_norm": 0.9273239970207214, + "learning_rate": 0.0002534396657154505, + "loss": 3.5326, + "step": 87935 + }, + { + "epoch": 5.974996602799293, + "grad_norm": 0.7662596702575684, + "learning_rate": 0.00025339720070661776, + "loss": 3.1811, + "step": 87940 + }, + { + "epoch": 5.975336322869955, + "grad_norm": 0.7651946544647217, + "learning_rate": 0.00025335473569778504, + "loss": 3.4896, + "step": 87945 + }, + { + "epoch": 5.975676042940617, + "grad_norm": 0.8154082894325256, + "learning_rate": 0.0002533122706889523, + "loss": 3.2642, + "step": 87950 + }, + { + "epoch": 5.9760157630112785, + "grad_norm": 0.9031043648719788, + "learning_rate": 0.0002532698056801196, + "loss": 3.2916, + "step": 87955 + }, + { + "epoch": 5.9763554830819405, + "grad_norm": 0.9209145307540894, + "learning_rate": 0.0002532273406712869, + "loss": 3.6234, + "step": 87960 + }, + { + "epoch": 5.976695203152603, + "grad_norm": 0.7985276579856873, + "learning_rate": 0.0002531848756624541, + "loss": 3.3728, + "step": 87965 + }, + { + "epoch": 5.977034923223264, + "grad_norm": 0.8207338452339172, + "learning_rate": 0.00025314241065362144, + "loss": 3.5354, + "step": 87970 + }, + { + "epoch": 5.977374643293926, + "grad_norm": 0.8272246718406677, + "learning_rate": 0.0002530999456447887, + "loss": 3.7067, + "step": 87975 + }, + { + "epoch": 5.977714363364588, + "grad_norm": 0.8883314728736877, + "learning_rate": 0.00025305748063595595, + "loss": 3.4655, + "step": 87980 + }, + { + "epoch": 5.978054083435249, + "grad_norm": 0.6860286593437195, + "learning_rate": 0.0002530150156271233, + "loss": 3.6401, + "step": 87985 + }, + { + "epoch": 5.978393803505911, + "grad_norm": 0.9194698929786682, + "learning_rate": 0.0002529810436200571, + "loss": 3.2107, + "step": 87990 + }, + { + "epoch": 5.978733523576573, + "grad_norm": 0.9548642635345459, + "learning_rate": 0.0002529385786112244, + "loss": 3.6215, + "step": 87995 + }, + { + "epoch": 5.9790732436472345, + "grad_norm": 1.038710355758667, + "learning_rate": 0.0002528961136023916, + "loss": 3.4081, + "step": 88000 + }, + { + "epoch": 5.9794129637178965, + "grad_norm": 0.8906978964805603, + "learning_rate": 0.0002528536485935589, + "loss": 3.3865, + "step": 88005 + }, + { + "epoch": 5.979752683788558, + "grad_norm": 0.8326139450073242, + "learning_rate": 0.0002528111835847262, + "loss": 3.234, + "step": 88010 + }, + { + "epoch": 5.98009240385922, + "grad_norm": 1.0762054920196533, + "learning_rate": 0.00025276871857589345, + "loss": 3.4484, + "step": 88015 + }, + { + "epoch": 5.980432123929882, + "grad_norm": 0.8042022585868835, + "learning_rate": 0.00025272625356706073, + "loss": 3.5216, + "step": 88020 + }, + { + "epoch": 5.980771844000543, + "grad_norm": 0.7954539656639099, + "learning_rate": 0.00025268378855822807, + "loss": 3.332, + "step": 88025 + }, + { + "epoch": 5.981111564071205, + "grad_norm": 0.8406203389167786, + "learning_rate": 0.0002526413235493953, + "loss": 3.5524, + "step": 88030 + }, + { + "epoch": 5.981451284141867, + "grad_norm": 1.0832786560058594, + "learning_rate": 0.0002525988585405626, + "loss": 3.6081, + "step": 88035 + }, + { + "epoch": 5.981791004212528, + "grad_norm": 0.9715307354927063, + "learning_rate": 0.00025255639353172985, + "loss": 3.5758, + "step": 88040 + }, + { + "epoch": 5.9821307242831905, + "grad_norm": 0.8702751398086548, + "learning_rate": 0.00025251392852289713, + "loss": 3.4057, + "step": 88045 + }, + { + "epoch": 5.982470444353853, + "grad_norm": 0.8646028637886047, + "learning_rate": 0.0002524714635140644, + "loss": 3.4467, + "step": 88050 + }, + { + "epoch": 5.982810164424514, + "grad_norm": 0.8343466520309448, + "learning_rate": 0.0002524289985052317, + "loss": 3.3252, + "step": 88055 + }, + { + "epoch": 5.983149884495176, + "grad_norm": 0.7321310639381409, + "learning_rate": 0.000252386533496399, + "loss": 3.5563, + "step": 88060 + }, + { + "epoch": 5.983489604565838, + "grad_norm": 0.8596400618553162, + "learning_rate": 0.00025234406848756625, + "loss": 3.5436, + "step": 88065 + }, + { + "epoch": 5.983829324636499, + "grad_norm": 0.8361667990684509, + "learning_rate": 0.00025230160347873353, + "loss": 3.6726, + "step": 88070 + }, + { + "epoch": 5.984169044707161, + "grad_norm": 0.8429216146469116, + "learning_rate": 0.00025225913846990076, + "loss": 3.174, + "step": 88075 + }, + { + "epoch": 5.984508764777823, + "grad_norm": 1.2502994537353516, + "learning_rate": 0.0002522166734610681, + "loss": 3.6062, + "step": 88080 + }, + { + "epoch": 5.984848484848484, + "grad_norm": 1.1297961473464966, + "learning_rate": 0.0002521742084522354, + "loss": 3.1664, + "step": 88085 + }, + { + "epoch": 5.9851882049191465, + "grad_norm": 0.833631694316864, + "learning_rate": 0.0002521317434434026, + "loss": 3.5316, + "step": 88090 + }, + { + "epoch": 5.985527924989809, + "grad_norm": 0.8750869035720825, + "learning_rate": 0.00025208927843456993, + "loss": 3.1809, + "step": 88095 + }, + { + "epoch": 5.98586764506047, + "grad_norm": 0.944252610206604, + "learning_rate": 0.0002520468134257372, + "loss": 3.5263, + "step": 88100 + }, + { + "epoch": 5.986207365131132, + "grad_norm": 0.7126943469047546, + "learning_rate": 0.00025200434841690444, + "loss": 3.4033, + "step": 88105 + }, + { + "epoch": 5.986547085201794, + "grad_norm": 0.9707626700401306, + "learning_rate": 0.0002519618834080717, + "loss": 3.6014, + "step": 88110 + }, + { + "epoch": 5.986886805272455, + "grad_norm": 0.7775417566299438, + "learning_rate": 0.00025191941839923905, + "loss": 3.5936, + "step": 88115 + }, + { + "epoch": 5.987226525343117, + "grad_norm": 1.0168333053588867, + "learning_rate": 0.00025187695339040633, + "loss": 3.3602, + "step": 88120 + }, + { + "epoch": 5.987566245413779, + "grad_norm": 0.8879374265670776, + "learning_rate": 0.00025183448838157356, + "loss": 3.2943, + "step": 88125 + }, + { + "epoch": 5.98790596548444, + "grad_norm": 1.0209028720855713, + "learning_rate": 0.0002517920233727409, + "loss": 3.1494, + "step": 88130 + }, + { + "epoch": 5.9882456855551025, + "grad_norm": 0.9140170216560364, + "learning_rate": 0.0002517495583639082, + "loss": 3.198, + "step": 88135 + }, + { + "epoch": 5.988585405625765, + "grad_norm": 0.9820375442504883, + "learning_rate": 0.0002517070933550754, + "loss": 3.4194, + "step": 88140 + }, + { + "epoch": 5.988925125696426, + "grad_norm": 0.9559626579284668, + "learning_rate": 0.00025166462834624273, + "loss": 3.3013, + "step": 88145 + }, + { + "epoch": 5.989264845767088, + "grad_norm": 0.9492443203926086, + "learning_rate": 0.00025162216333741, + "loss": 3.3457, + "step": 88150 + }, + { + "epoch": 5.98960456583775, + "grad_norm": 0.9223001599311829, + "learning_rate": 0.00025157969832857724, + "loss": 3.5156, + "step": 88155 + }, + { + "epoch": 5.989944285908411, + "grad_norm": 1.3432555198669434, + "learning_rate": 0.0002515372333197445, + "loss": 3.3384, + "step": 88160 + }, + { + "epoch": 5.990284005979073, + "grad_norm": 0.9420061111450195, + "learning_rate": 0.00025149476831091185, + "loss": 3.233, + "step": 88165 + }, + { + "epoch": 5.990623726049735, + "grad_norm": 1.2693272829055786, + "learning_rate": 0.0002514523033020791, + "loss": 3.7534, + "step": 88170 + }, + { + "epoch": 5.990963446120396, + "grad_norm": 1.2724741697311401, + "learning_rate": 0.00025140983829324636, + "loss": 3.3797, + "step": 88175 + }, + { + "epoch": 5.9913031661910585, + "grad_norm": 1.3410239219665527, + "learning_rate": 0.0002513673732844137, + "loss": 3.675, + "step": 88180 + }, + { + "epoch": 5.991642886261721, + "grad_norm": 0.8829385638237, + "learning_rate": 0.0002513249082755809, + "loss": 3.4706, + "step": 88185 + }, + { + "epoch": 5.991982606332382, + "grad_norm": 0.8474529981613159, + "learning_rate": 0.0002512824432667482, + "loss": 3.3388, + "step": 88190 + }, + { + "epoch": 5.992322326403044, + "grad_norm": 1.9708219766616821, + "learning_rate": 0.0002512399782579155, + "loss": 3.1731, + "step": 88195 + }, + { + "epoch": 5.992662046473706, + "grad_norm": 0.8354642987251282, + "learning_rate": 0.00025119751324908276, + "loss": 3.5463, + "step": 88200 + }, + { + "epoch": 5.993001766544367, + "grad_norm": 0.8855651617050171, + "learning_rate": 0.00025115504824025004, + "loss": 3.3253, + "step": 88205 + }, + { + "epoch": 5.993341486615029, + "grad_norm": 0.8354741334915161, + "learning_rate": 0.0002511125832314173, + "loss": 3.4699, + "step": 88210 + }, + { + "epoch": 5.993681206685691, + "grad_norm": 0.8131734728813171, + "learning_rate": 0.0002510701182225846, + "loss": 3.2271, + "step": 88215 + }, + { + "epoch": 5.9940209267563525, + "grad_norm": 0.7838398814201355, + "learning_rate": 0.0002510276532137519, + "loss": 3.3182, + "step": 88220 + }, + { + "epoch": 5.9943606468270145, + "grad_norm": 0.8467880487442017, + "learning_rate": 0.00025098518820491916, + "loss": 3.5594, + "step": 88225 + }, + { + "epoch": 5.994700366897677, + "grad_norm": 1.1047694683074951, + "learning_rate": 0.0002509427231960864, + "loss": 3.6227, + "step": 88230 + }, + { + "epoch": 5.995040086968338, + "grad_norm": 1.0649546384811401, + "learning_rate": 0.0002509002581872537, + "loss": 3.6608, + "step": 88235 + }, + { + "epoch": 5.995379807039, + "grad_norm": 0.8566672801971436, + "learning_rate": 0.000250857793178421, + "loss": 3.582, + "step": 88240 + }, + { + "epoch": 5.995719527109662, + "grad_norm": 0.8908790946006775, + "learning_rate": 0.00025081532816958823, + "loss": 3.3867, + "step": 88245 + }, + { + "epoch": 5.996059247180323, + "grad_norm": 1.0597162246704102, + "learning_rate": 0.00025077286316075556, + "loss": 3.6339, + "step": 88250 + }, + { + "epoch": 5.996398967250985, + "grad_norm": 0.9008331894874573, + "learning_rate": 0.00025073039815192284, + "loss": 3.2315, + "step": 88255 + }, + { + "epoch": 5.996738687321647, + "grad_norm": 0.8482336401939392, + "learning_rate": 0.00025068793314309007, + "loss": 3.3914, + "step": 88260 + }, + { + "epoch": 5.9970784073923085, + "grad_norm": 0.999190092086792, + "learning_rate": 0.00025064546813425735, + "loss": 3.5736, + "step": 88265 + }, + { + "epoch": 5.9974181274629705, + "grad_norm": 1.0302492380142212, + "learning_rate": 0.0002506030031254247, + "loss": 3.3391, + "step": 88270 + }, + { + "epoch": 5.997757847533633, + "grad_norm": 0.8267307877540588, + "learning_rate": 0.0002505605381165919, + "loss": 3.3493, + "step": 88275 + }, + { + "epoch": 5.998097567604294, + "grad_norm": 0.810225248336792, + "learning_rate": 0.0002505180731077592, + "loss": 3.5699, + "step": 88280 + }, + { + "epoch": 5.998437287674956, + "grad_norm": 1.0729035139083862, + "learning_rate": 0.0002504756080989265, + "loss": 3.5198, + "step": 88285 + }, + { + "epoch": 5.998777007745618, + "grad_norm": 0.8030059337615967, + "learning_rate": 0.0002504331430900938, + "loss": 3.4882, + "step": 88290 + }, + { + "epoch": 5.999116727816279, + "grad_norm": 0.8446468114852905, + "learning_rate": 0.00025039067808126103, + "loss": 3.3903, + "step": 88295 + }, + { + "epoch": 5.999456447886941, + "grad_norm": 1.3059229850769043, + "learning_rate": 0.0002503482130724283, + "loss": 3.5163, + "step": 88300 + }, + { + "epoch": 5.999796167957603, + "grad_norm": 1.1307041645050049, + "learning_rate": 0.00025030574806359564, + "loss": 3.5515, + "step": 88305 + }, + { + "epoch": 6.0, + "eval_bertscore": { + "f1": 0.8409054707517364, + "precision": 0.840920518137524, + "recall": 0.841719837761258 + }, + "eval_bleu_4": 0.02277402812413442, + "eval_exact_match": 0.00038763446070355656, + "eval_loss": 3.3996222019195557, + "eval_meteor": 0.09072239913814335, + "eval_rouge": { + "rouge1": 0.12903632333092827, + "rouge2": 0.02082259019116116, + "rougeL": 0.11232715034286006, + "rougeLsum": 0.11236009995375415 + }, + "eval_runtime": 1556.6431, + "eval_samples_per_second": 6.629, + "eval_steps_per_second": 0.829, + "step": 88308 + }, + { + "epoch": 6.0001358880282645, + "grad_norm": 0.8001552820205688, + "learning_rate": 0.00025026328305476287, + "loss": 3.371, + "step": 88310 + }, + { + "epoch": 6.0004756080989265, + "grad_norm": 0.7927175164222717, + "learning_rate": 0.00025022081804593015, + "loss": 3.3147, + "step": 88315 + }, + { + "epoch": 6.000815328169589, + "grad_norm": 0.8131561279296875, + "learning_rate": 0.0002501783530370975, + "loss": 3.3562, + "step": 88320 + }, + { + "epoch": 6.00115504824025, + "grad_norm": 0.94084233045578, + "learning_rate": 0.0002501358880282647, + "loss": 3.2606, + "step": 88325 + }, + { + "epoch": 6.001494768310912, + "grad_norm": 0.8574494123458862, + "learning_rate": 0.000250093423019432, + "loss": 3.336, + "step": 88330 + }, + { + "epoch": 6.001834488381574, + "grad_norm": 0.845425009727478, + "learning_rate": 0.00025005095801059927, + "loss": 3.4527, + "step": 88335 + }, + { + "epoch": 6.002174208452235, + "grad_norm": 1.0180166959762573, + "learning_rate": 0.00025000849300176655, + "loss": 3.4245, + "step": 88340 + }, + { + "epoch": 6.002513928522897, + "grad_norm": 0.8292003870010376, + "learning_rate": 0.00024996602799293383, + "loss": 3.5514, + "step": 88345 + }, + { + "epoch": 6.002853648593559, + "grad_norm": 0.8941911458969116, + "learning_rate": 0.0002499235629841011, + "loss": 3.4236, + "step": 88350 + }, + { + "epoch": 6.0031933686642205, + "grad_norm": 1.017426609992981, + "learning_rate": 0.0002498810979752684, + "loss": 3.5037, + "step": 88355 + }, + { + "epoch": 6.003533088734883, + "grad_norm": 0.8921874165534973, + "learning_rate": 0.00024983863296643567, + "loss": 3.3229, + "step": 88360 + }, + { + "epoch": 6.003872808805545, + "grad_norm": 0.945702075958252, + "learning_rate": 0.00024979616795760295, + "loss": 3.1406, + "step": 88365 + }, + { + "epoch": 6.004212528876206, + "grad_norm": 1.9268730878829956, + "learning_rate": 0.00024975370294877023, + "loss": 3.053, + "step": 88370 + }, + { + "epoch": 6.004552248946868, + "grad_norm": 0.7531015276908875, + "learning_rate": 0.0002497112379399375, + "loss": 3.4423, + "step": 88375 + }, + { + "epoch": 6.00489196901753, + "grad_norm": 1.0322219133377075, + "learning_rate": 0.0002496687729311048, + "loss": 3.3351, + "step": 88380 + }, + { + "epoch": 6.005231689088191, + "grad_norm": 0.9508810639381409, + "learning_rate": 0.00024962630792227207, + "loss": 3.5382, + "step": 88385 + }, + { + "epoch": 6.005571409158853, + "grad_norm": 0.9619696140289307, + "learning_rate": 0.00024958384291343935, + "loss": 3.424, + "step": 88390 + }, + { + "epoch": 6.005911129229515, + "grad_norm": 0.9977752566337585, + "learning_rate": 0.0002495413779046066, + "loss": 3.5039, + "step": 88395 + }, + { + "epoch": 6.0062508493001765, + "grad_norm": 0.9954617619514465, + "learning_rate": 0.0002494989128957739, + "loss": 3.3856, + "step": 88400 + }, + { + "epoch": 6.006590569370839, + "grad_norm": 1.0331206321716309, + "learning_rate": 0.00024945644788694114, + "loss": 3.5664, + "step": 88405 + }, + { + "epoch": 6.006930289441501, + "grad_norm": 1.0138330459594727, + "learning_rate": 0.0002494139828781084, + "loss": 3.3078, + "step": 88410 + }, + { + "epoch": 6.007270009512162, + "grad_norm": 0.8681838512420654, + "learning_rate": 0.00024937151786927575, + "loss": 3.3079, + "step": 88415 + }, + { + "epoch": 6.007609729582824, + "grad_norm": 0.8768112659454346, + "learning_rate": 0.000249329052860443, + "loss": 3.4055, + "step": 88420 + }, + { + "epoch": 6.007949449653485, + "grad_norm": 1.146081805229187, + "learning_rate": 0.0002492865878516103, + "loss": 3.4119, + "step": 88425 + }, + { + "epoch": 6.008289169724147, + "grad_norm": 1.0852971076965332, + "learning_rate": 0.00024924412284277754, + "loss": 3.495, + "step": 88430 + }, + { + "epoch": 6.008628889794809, + "grad_norm": 0.9482365846633911, + "learning_rate": 0.0002492016578339448, + "loss": 3.2655, + "step": 88435 + }, + { + "epoch": 6.00896860986547, + "grad_norm": 0.9991101622581482, + "learning_rate": 0.00024915919282511215, + "loss": 3.2838, + "step": 88440 + }, + { + "epoch": 6.0093083299361325, + "grad_norm": 1.0607714653015137, + "learning_rate": 0.0002491167278162794, + "loss": 3.6085, + "step": 88445 + }, + { + "epoch": 6.009648050006795, + "grad_norm": 0.849619448184967, + "learning_rate": 0.00024907426280744666, + "loss": 3.2264, + "step": 88450 + }, + { + "epoch": 6.009987770077456, + "grad_norm": 0.9807196259498596, + "learning_rate": 0.00024903179779861394, + "loss": 3.3918, + "step": 88455 + }, + { + "epoch": 6.010327490148118, + "grad_norm": 0.8930837512016296, + "learning_rate": 0.0002489893327897812, + "loss": 3.1776, + "step": 88460 + }, + { + "epoch": 6.01066721021878, + "grad_norm": 0.8067898750305176, + "learning_rate": 0.0002489468677809485, + "loss": 3.4328, + "step": 88465 + }, + { + "epoch": 6.011006930289441, + "grad_norm": 0.8880143165588379, + "learning_rate": 0.0002489044027721158, + "loss": 3.5848, + "step": 88470 + }, + { + "epoch": 6.011346650360103, + "grad_norm": 0.7689340114593506, + "learning_rate": 0.00024886193776328306, + "loss": 3.6577, + "step": 88475 + }, + { + "epoch": 6.011686370430765, + "grad_norm": 0.9106751084327698, + "learning_rate": 0.00024881947275445034, + "loss": 3.246, + "step": 88480 + }, + { + "epoch": 6.0120260905014264, + "grad_norm": 1.1155726909637451, + "learning_rate": 0.0002487770077456176, + "loss": 3.3501, + "step": 88485 + }, + { + "epoch": 6.0123658105720885, + "grad_norm": 0.9749777913093567, + "learning_rate": 0.0002487345427367849, + "loss": 3.2847, + "step": 88490 + }, + { + "epoch": 6.012705530642751, + "grad_norm": 1.8290605545043945, + "learning_rate": 0.0002486920777279522, + "loss": 3.5712, + "step": 88495 + }, + { + "epoch": 6.013045250713412, + "grad_norm": 0.838805079460144, + "learning_rate": 0.00024864961271911946, + "loss": 3.3748, + "step": 88500 + }, + { + "epoch": 6.013384970784074, + "grad_norm": 1.0287470817565918, + "learning_rate": 0.00024860714771028674, + "loss": 3.4696, + "step": 88505 + }, + { + "epoch": 6.013724690854736, + "grad_norm": 1.026421308517456, + "learning_rate": 0.000248564682701454, + "loss": 3.397, + "step": 88510 + }, + { + "epoch": 6.014064410925397, + "grad_norm": 1.0823415517807007, + "learning_rate": 0.0002485222176926213, + "loss": 3.3464, + "step": 88515 + }, + { + "epoch": 6.014404130996059, + "grad_norm": 3.4461562633514404, + "learning_rate": 0.0002484797526837886, + "loss": 3.4062, + "step": 88520 + }, + { + "epoch": 6.014743851066721, + "grad_norm": 1.0962553024291992, + "learning_rate": 0.00024843728767495586, + "loss": 3.4066, + "step": 88525 + }, + { + "epoch": 6.0150835711373825, + "grad_norm": 1.0828511714935303, + "learning_rate": 0.00024839482266612314, + "loss": 3.4167, + "step": 88530 + }, + { + "epoch": 6.0154232912080445, + "grad_norm": 0.8911029696464539, + "learning_rate": 0.00024835235765729036, + "loss": 3.4463, + "step": 88535 + }, + { + "epoch": 6.015763011278707, + "grad_norm": 0.9592869877815247, + "learning_rate": 0.0002483098926484577, + "loss": 3.2063, + "step": 88540 + }, + { + "epoch": 6.016102731349368, + "grad_norm": 0.8495718836784363, + "learning_rate": 0.000248267427639625, + "loss": 3.4378, + "step": 88545 + }, + { + "epoch": 6.01644245142003, + "grad_norm": 0.8940118551254272, + "learning_rate": 0.0002482249626307922, + "loss": 3.4892, + "step": 88550 + }, + { + "epoch": 6.016782171490692, + "grad_norm": 1.10381281375885, + "learning_rate": 0.00024818249762195954, + "loss": 3.2447, + "step": 88555 + }, + { + "epoch": 6.017121891561353, + "grad_norm": 2.1057958602905273, + "learning_rate": 0.00024814003261312676, + "loss": 3.0825, + "step": 88560 + }, + { + "epoch": 6.017461611632015, + "grad_norm": 0.912414014339447, + "learning_rate": 0.00024809756760429404, + "loss": 3.4193, + "step": 88565 + }, + { + "epoch": 6.017801331702677, + "grad_norm": 1.0370008945465088, + "learning_rate": 0.0002480551025954614, + "loss": 3.4188, + "step": 88570 + }, + { + "epoch": 6.0181410517733385, + "grad_norm": 0.7614076137542725, + "learning_rate": 0.0002480126375866286, + "loss": 3.4193, + "step": 88575 + }, + { + "epoch": 6.0184807718440005, + "grad_norm": 0.7710705995559692, + "learning_rate": 0.0002479701725777959, + "loss": 3.5619, + "step": 88580 + }, + { + "epoch": 6.018820491914663, + "grad_norm": 0.7662572860717773, + "learning_rate": 0.00024792770756896316, + "loss": 3.3617, + "step": 88585 + }, + { + "epoch": 6.019160211985324, + "grad_norm": 1.0136221647262573, + "learning_rate": 0.00024788524256013044, + "loss": 3.5203, + "step": 88590 + }, + { + "epoch": 6.019499932055986, + "grad_norm": 0.9013885855674744, + "learning_rate": 0.0002478427775512977, + "loss": 3.0238, + "step": 88595 + }, + { + "epoch": 6.019839652126648, + "grad_norm": 1.0165934562683105, + "learning_rate": 0.000247800312542465, + "loss": 3.2437, + "step": 88600 + }, + { + "epoch": 6.020179372197309, + "grad_norm": 1.0680326223373413, + "learning_rate": 0.0002477578475336323, + "loss": 3.4413, + "step": 88605 + }, + { + "epoch": 6.020519092267971, + "grad_norm": 0.9787543416023254, + "learning_rate": 0.00024771538252479956, + "loss": 3.3029, + "step": 88610 + }, + { + "epoch": 6.020858812338633, + "grad_norm": 0.9909275770187378, + "learning_rate": 0.00024767291751596685, + "loss": 3.3974, + "step": 88615 + }, + { + "epoch": 6.0211985324092945, + "grad_norm": 0.8164178729057312, + "learning_rate": 0.0002476304525071341, + "loss": 3.3898, + "step": 88620 + }, + { + "epoch": 6.0215382524799566, + "grad_norm": 1.0357847213745117, + "learning_rate": 0.0002475879874983014, + "loss": 3.308, + "step": 88625 + }, + { + "epoch": 6.021877972550619, + "grad_norm": 1.1276334524154663, + "learning_rate": 0.0002475455224894687, + "loss": 3.4865, + "step": 88630 + }, + { + "epoch": 6.02221769262128, + "grad_norm": 0.7901747226715088, + "learning_rate": 0.00024750305748063597, + "loss": 3.3529, + "step": 88635 + }, + { + "epoch": 6.022557412691942, + "grad_norm": 0.860526978969574, + "learning_rate": 0.00024746059247180325, + "loss": 3.4515, + "step": 88640 + }, + { + "epoch": 6.022897132762604, + "grad_norm": 0.9673693180084229, + "learning_rate": 0.0002474181274629705, + "loss": 3.4337, + "step": 88645 + }, + { + "epoch": 6.023236852833265, + "grad_norm": 1.1064844131469727, + "learning_rate": 0.0002473756624541378, + "loss": 3.2377, + "step": 88650 + }, + { + "epoch": 6.023576572903927, + "grad_norm": 0.8649743795394897, + "learning_rate": 0.0002473331974453051, + "loss": 3.2103, + "step": 88655 + }, + { + "epoch": 6.023916292974589, + "grad_norm": 0.9882295727729797, + "learning_rate": 0.00024729073243647237, + "loss": 3.4383, + "step": 88660 + }, + { + "epoch": 6.0242560130452505, + "grad_norm": 0.8523057699203491, + "learning_rate": 0.0002472482674276396, + "loss": 3.0984, + "step": 88665 + }, + { + "epoch": 6.024595733115913, + "grad_norm": 0.8327500224113464, + "learning_rate": 0.0002472058024188069, + "loss": 3.6834, + "step": 88670 + }, + { + "epoch": 6.024935453186575, + "grad_norm": 0.8182424306869507, + "learning_rate": 0.0002471633374099742, + "loss": 3.4545, + "step": 88675 + }, + { + "epoch": 6.025275173257236, + "grad_norm": 0.9546188712120056, + "learning_rate": 0.0002471208724011415, + "loss": 3.5024, + "step": 88680 + }, + { + "epoch": 6.025614893327898, + "grad_norm": 1.0095925331115723, + "learning_rate": 0.00024707840739230877, + "loss": 3.3137, + "step": 88685 + }, + { + "epoch": 6.02595461339856, + "grad_norm": 1.2199410200119019, + "learning_rate": 0.000247035942383476, + "loss": 3.5644, + "step": 88690 + }, + { + "epoch": 6.026294333469221, + "grad_norm": 0.7411443591117859, + "learning_rate": 0.0002469934773746433, + "loss": 3.192, + "step": 88695 + }, + { + "epoch": 6.026634053539883, + "grad_norm": 0.7406821250915527, + "learning_rate": 0.00024695101236581055, + "loss": 3.7222, + "step": 88700 + }, + { + "epoch": 6.026973773610545, + "grad_norm": 0.9329227209091187, + "learning_rate": 0.00024690854735697783, + "loss": 3.2138, + "step": 88705 + }, + { + "epoch": 6.0273134936812065, + "grad_norm": 1.3504077196121216, + "learning_rate": 0.00024686608234814517, + "loss": 3.2234, + "step": 88710 + }, + { + "epoch": 6.027653213751869, + "grad_norm": 0.9415284395217896, + "learning_rate": 0.0002468236173393124, + "loss": 3.493, + "step": 88715 + }, + { + "epoch": 6.027992933822531, + "grad_norm": 0.9807319045066833, + "learning_rate": 0.00024678115233047967, + "loss": 3.3982, + "step": 88720 + }, + { + "epoch": 6.028332653893192, + "grad_norm": 0.8650000095367432, + "learning_rate": 0.00024673868732164695, + "loss": 3.5186, + "step": 88725 + }, + { + "epoch": 6.028672373963854, + "grad_norm": 0.7741013169288635, + "learning_rate": 0.00024669622231281423, + "loss": 3.6557, + "step": 88730 + }, + { + "epoch": 6.029012094034516, + "grad_norm": 0.8857645392417908, + "learning_rate": 0.0002466537573039815, + "loss": 3.3756, + "step": 88735 + }, + { + "epoch": 6.029351814105177, + "grad_norm": 1.1910061836242676, + "learning_rate": 0.0002466112922951488, + "loss": 3.6713, + "step": 88740 + }, + { + "epoch": 6.029691534175839, + "grad_norm": 0.7780575156211853, + "learning_rate": 0.00024656882728631607, + "loss": 3.4837, + "step": 88745 + }, + { + "epoch": 6.0300312542465, + "grad_norm": 0.9930034279823303, + "learning_rate": 0.00024652636227748335, + "loss": 3.3522, + "step": 88750 + }, + { + "epoch": 6.0303709743171625, + "grad_norm": 1.0212513208389282, + "learning_rate": 0.00024648389726865063, + "loss": 3.5093, + "step": 88755 + }, + { + "epoch": 6.030710694387825, + "grad_norm": 0.8842883110046387, + "learning_rate": 0.0002464414322598179, + "loss": 3.1826, + "step": 88760 + }, + { + "epoch": 6.031050414458486, + "grad_norm": 1.0912601947784424, + "learning_rate": 0.0002463989672509852, + "loss": 3.3856, + "step": 88765 + }, + { + "epoch": 6.031390134529148, + "grad_norm": 0.9846763610839844, + "learning_rate": 0.0002463565022421525, + "loss": 3.04, + "step": 88770 + }, + { + "epoch": 6.03172985459981, + "grad_norm": 0.7125519514083862, + "learning_rate": 0.00024631403723331975, + "loss": 3.491, + "step": 88775 + }, + { + "epoch": 6.032069574670471, + "grad_norm": 0.910378098487854, + "learning_rate": 0.00024627157222448703, + "loss": 3.2749, + "step": 88780 + }, + { + "epoch": 6.032409294741133, + "grad_norm": 1.0363065004348755, + "learning_rate": 0.0002462291072156543, + "loss": 3.227, + "step": 88785 + }, + { + "epoch": 6.032749014811795, + "grad_norm": 1.0911445617675781, + "learning_rate": 0.0002461866422068216, + "loss": 3.4401, + "step": 88790 + }, + { + "epoch": 6.0330887348824564, + "grad_norm": 0.9450706243515015, + "learning_rate": 0.0002461441771979889, + "loss": 3.5266, + "step": 88795 + }, + { + "epoch": 6.0334284549531185, + "grad_norm": 1.0906778573989868, + "learning_rate": 0.00024610171218915615, + "loss": 3.4927, + "step": 88800 + }, + { + "epoch": 6.033768175023781, + "grad_norm": 1.01165771484375, + "learning_rate": 0.00024605924718032343, + "loss": 3.4689, + "step": 88805 + }, + { + "epoch": 6.034107895094442, + "grad_norm": 2.972412347793579, + "learning_rate": 0.0002460167821714907, + "loss": 3.4766, + "step": 88810 + }, + { + "epoch": 6.034447615165104, + "grad_norm": 2.028269052505493, + "learning_rate": 0.000245974317162658, + "loss": 3.2718, + "step": 88815 + }, + { + "epoch": 6.034787335235766, + "grad_norm": 0.9604483842849731, + "learning_rate": 0.0002459318521538252, + "loss": 3.5883, + "step": 88820 + }, + { + "epoch": 6.035127055306427, + "grad_norm": 0.8923960328102112, + "learning_rate": 0.00024588938714499255, + "loss": 3.453, + "step": 88825 + }, + { + "epoch": 6.035466775377089, + "grad_norm": 0.8483250737190247, + "learning_rate": 0.0002458469221361598, + "loss": 3.4678, + "step": 88830 + }, + { + "epoch": 6.035806495447751, + "grad_norm": 0.9134661555290222, + "learning_rate": 0.00024580445712732706, + "loss": 3.3578, + "step": 88835 + }, + { + "epoch": 6.0361462155184125, + "grad_norm": 0.8445764183998108, + "learning_rate": 0.0002457619921184944, + "loss": 3.2592, + "step": 88840 + }, + { + "epoch": 6.0364859355890745, + "grad_norm": 1.4153802394866943, + "learning_rate": 0.0002457195271096616, + "loss": 3.1708, + "step": 88845 + }, + { + "epoch": 6.036825655659737, + "grad_norm": 0.8495588898658752, + "learning_rate": 0.00024567706210082895, + "loss": 3.4495, + "step": 88850 + }, + { + "epoch": 6.037165375730398, + "grad_norm": 0.9415349364280701, + "learning_rate": 0.0002456345970919962, + "loss": 3.4749, + "step": 88855 + }, + { + "epoch": 6.03750509580106, + "grad_norm": 0.8008081912994385, + "learning_rate": 0.00024559213208316346, + "loss": 3.742, + "step": 88860 + }, + { + "epoch": 6.037844815871722, + "grad_norm": 1.0233426094055176, + "learning_rate": 0.0002455496670743308, + "loss": 3.1142, + "step": 88865 + }, + { + "epoch": 6.038184535942383, + "grad_norm": 0.9339156746864319, + "learning_rate": 0.000245507202065498, + "loss": 3.5513, + "step": 88870 + }, + { + "epoch": 6.038524256013045, + "grad_norm": 0.9131945967674255, + "learning_rate": 0.0002454647370566653, + "loss": 3.2481, + "step": 88875 + }, + { + "epoch": 6.038863976083707, + "grad_norm": 0.8608031868934631, + "learning_rate": 0.0002454222720478326, + "loss": 3.6222, + "step": 88880 + }, + { + "epoch": 6.0392036961543685, + "grad_norm": 0.9536044597625732, + "learning_rate": 0.00024537980703899986, + "loss": 3.4106, + "step": 88885 + }, + { + "epoch": 6.0395434162250305, + "grad_norm": 0.9635094404220581, + "learning_rate": 0.00024533734203016714, + "loss": 3.2046, + "step": 88890 + }, + { + "epoch": 6.039883136295693, + "grad_norm": 0.9643108248710632, + "learning_rate": 0.0002452948770213344, + "loss": 3.2877, + "step": 88895 + }, + { + "epoch": 6.040222856366354, + "grad_norm": 0.9812986850738525, + "learning_rate": 0.0002452524120125017, + "loss": 3.4762, + "step": 88900 + }, + { + "epoch": 6.040562576437016, + "grad_norm": 1.2169638872146606, + "learning_rate": 0.000245209947003669, + "loss": 3.5047, + "step": 88905 + }, + { + "epoch": 6.040902296507678, + "grad_norm": 1.3882557153701782, + "learning_rate": 0.00024516748199483626, + "loss": 3.3692, + "step": 88910 + }, + { + "epoch": 6.041242016578339, + "grad_norm": 1.0936378240585327, + "learning_rate": 0.00024512501698600354, + "loss": 3.4188, + "step": 88915 + }, + { + "epoch": 6.041581736649001, + "grad_norm": 1.0647841691970825, + "learning_rate": 0.0002450825519771708, + "loss": 3.2968, + "step": 88920 + }, + { + "epoch": 6.041921456719663, + "grad_norm": 0.8748718500137329, + "learning_rate": 0.0002450400869683381, + "loss": 3.2957, + "step": 88925 + }, + { + "epoch": 6.0422611767903245, + "grad_norm": 0.8016204237937927, + "learning_rate": 0.0002449976219595054, + "loss": 3.4752, + "step": 88930 + }, + { + "epoch": 6.042600896860987, + "grad_norm": 0.7530916333198547, + "learning_rate": 0.00024495515695067266, + "loss": 3.6672, + "step": 88935 + }, + { + "epoch": 6.042940616931649, + "grad_norm": 0.9171333312988281, + "learning_rate": 0.00024491269194183994, + "loss": 3.2678, + "step": 88940 + }, + { + "epoch": 6.04328033700231, + "grad_norm": 0.8440061211585999, + "learning_rate": 0.0002448702269330072, + "loss": 3.4089, + "step": 88945 + }, + { + "epoch": 6.043620057072972, + "grad_norm": 0.9164229035377502, + "learning_rate": 0.0002448277619241745, + "loss": 3.5204, + "step": 88950 + }, + { + "epoch": 6.043959777143634, + "grad_norm": 1.0612329244613647, + "learning_rate": 0.0002447852969153418, + "loss": 3.5191, + "step": 88955 + }, + { + "epoch": 6.044299497214295, + "grad_norm": 1.0325781106948853, + "learning_rate": 0.000244742831906509, + "loss": 3.6503, + "step": 88960 + }, + { + "epoch": 6.044639217284957, + "grad_norm": 0.9817315340042114, + "learning_rate": 0.00024470036689767634, + "loss": 3.3874, + "step": 88965 + }, + { + "epoch": 6.044978937355619, + "grad_norm": 1.1854314804077148, + "learning_rate": 0.0002446579018888436, + "loss": 3.6864, + "step": 88970 + }, + { + "epoch": 6.0453186574262805, + "grad_norm": 0.8340064883232117, + "learning_rate": 0.00024461543688001085, + "loss": 3.2132, + "step": 88975 + }, + { + "epoch": 6.045658377496943, + "grad_norm": 0.8724546432495117, + "learning_rate": 0.0002445729718711782, + "loss": 3.4248, + "step": 88980 + }, + { + "epoch": 6.045998097567605, + "grad_norm": 0.9860439300537109, + "learning_rate": 0.0002445305068623454, + "loss": 3.4035, + "step": 88985 + }, + { + "epoch": 6.046337817638266, + "grad_norm": 0.918147087097168, + "learning_rate": 0.0002444880418535127, + "loss": 3.1019, + "step": 88990 + }, + { + "epoch": 6.046677537708928, + "grad_norm": 1.0381423234939575, + "learning_rate": 0.00024444557684468, + "loss": 3.3789, + "step": 88995 + }, + { + "epoch": 6.04701725777959, + "grad_norm": 0.9091445207595825, + "learning_rate": 0.00024440311183584725, + "loss": 3.0442, + "step": 89000 + }, + { + "epoch": 6.047356977850251, + "grad_norm": 1.1229406595230103, + "learning_rate": 0.00024436064682701453, + "loss": 3.3973, + "step": 89005 + }, + { + "epoch": 6.047696697920913, + "grad_norm": 1.0033973455429077, + "learning_rate": 0.0002443181818181818, + "loss": 3.2998, + "step": 89010 + }, + { + "epoch": 6.048036417991575, + "grad_norm": 0.9292895197868347, + "learning_rate": 0.0002442757168093491, + "loss": 3.3114, + "step": 89015 + }, + { + "epoch": 6.0483761380622365, + "grad_norm": 0.8865324258804321, + "learning_rate": 0.00024423325180051637, + "loss": 3.108, + "step": 89020 + }, + { + "epoch": 6.048715858132899, + "grad_norm": 1.2329400777816772, + "learning_rate": 0.00024419078679168365, + "loss": 3.3678, + "step": 89025 + }, + { + "epoch": 6.049055578203561, + "grad_norm": 0.8195637464523315, + "learning_rate": 0.00024414832178285093, + "loss": 3.3457, + "step": 89030 + }, + { + "epoch": 6.049395298274222, + "grad_norm": 0.9356868863105774, + "learning_rate": 0.00024410585677401824, + "loss": 3.4682, + "step": 89035 + }, + { + "epoch": 6.049735018344884, + "grad_norm": 1.0034458637237549, + "learning_rate": 0.0002440633917651855, + "loss": 3.4979, + "step": 89040 + }, + { + "epoch": 6.050074738415546, + "grad_norm": 0.9576014876365662, + "learning_rate": 0.00024402092675635277, + "loss": 3.436, + "step": 89045 + }, + { + "epoch": 6.050414458486207, + "grad_norm": 0.8276261687278748, + "learning_rate": 0.00024397846174752005, + "loss": 3.3299, + "step": 89050 + }, + { + "epoch": 6.050754178556869, + "grad_norm": 0.9674467444419861, + "learning_rate": 0.00024393599673868733, + "loss": 3.4606, + "step": 89055 + }, + { + "epoch": 6.051093898627531, + "grad_norm": 0.8928089141845703, + "learning_rate": 0.00024389353172985458, + "loss": 3.3701, + "step": 89060 + }, + { + "epoch": 6.0514336186981925, + "grad_norm": 0.9952036142349243, + "learning_rate": 0.0002438510667210219, + "loss": 3.4161, + "step": 89065 + }, + { + "epoch": 6.051773338768855, + "grad_norm": 1.0343154668807983, + "learning_rate": 0.00024380860171218917, + "loss": 3.5244, + "step": 89070 + }, + { + "epoch": 6.052113058839517, + "grad_norm": 1.1524487733840942, + "learning_rate": 0.00024376613670335642, + "loss": 3.1798, + "step": 89075 + }, + { + "epoch": 6.052452778910178, + "grad_norm": 1.0093181133270264, + "learning_rate": 0.00024372367169452373, + "loss": 3.4605, + "step": 89080 + }, + { + "epoch": 6.05279249898084, + "grad_norm": 0.9704719185829163, + "learning_rate": 0.00024368120668569098, + "loss": 3.3206, + "step": 89085 + }, + { + "epoch": 6.053132219051502, + "grad_norm": 0.7421999573707581, + "learning_rate": 0.00024363874167685826, + "loss": 3.3476, + "step": 89090 + }, + { + "epoch": 6.053471939122163, + "grad_norm": 0.9087968468666077, + "learning_rate": 0.00024359627666802557, + "loss": 3.2944, + "step": 89095 + }, + { + "epoch": 6.053811659192825, + "grad_norm": 0.721286952495575, + "learning_rate": 0.00024355381165919282, + "loss": 3.3327, + "step": 89100 + }, + { + "epoch": 6.0541513792634865, + "grad_norm": 0.8031777739524841, + "learning_rate": 0.00024351134665036013, + "loss": 3.4558, + "step": 89105 + }, + { + "epoch": 6.0544910993341485, + "grad_norm": 0.7720203995704651, + "learning_rate": 0.00024346888164152738, + "loss": 3.3876, + "step": 89110 + }, + { + "epoch": 6.054830819404811, + "grad_norm": 0.8133941292762756, + "learning_rate": 0.00024342641663269466, + "loss": 2.8757, + "step": 89115 + }, + { + "epoch": 6.055170539475472, + "grad_norm": 0.8198811411857605, + "learning_rate": 0.00024338395162386194, + "loss": 3.5672, + "step": 89120 + }, + { + "epoch": 6.055510259546134, + "grad_norm": 1.1686360836029053, + "learning_rate": 0.00024334148661502922, + "loss": 3.4415, + "step": 89125 + }, + { + "epoch": 6.055849979616796, + "grad_norm": 0.9816054105758667, + "learning_rate": 0.0002432990216061965, + "loss": 3.1015, + "step": 89130 + }, + { + "epoch": 6.056189699687457, + "grad_norm": 0.7571449875831604, + "learning_rate": 0.00024325655659736378, + "loss": 3.5625, + "step": 89135 + }, + { + "epoch": 6.056529419758119, + "grad_norm": 0.7015106678009033, + "learning_rate": 0.00024321409158853106, + "loss": 3.2169, + "step": 89140 + }, + { + "epoch": 6.056869139828781, + "grad_norm": 0.9037936925888062, + "learning_rate": 0.00024317162657969832, + "loss": 3.3217, + "step": 89145 + }, + { + "epoch": 6.0572088598994425, + "grad_norm": 1.3237744569778442, + "learning_rate": 0.00024312916157086562, + "loss": 3.2841, + "step": 89150 + }, + { + "epoch": 6.0575485799701045, + "grad_norm": 0.8824396729469299, + "learning_rate": 0.00024308669656203288, + "loss": 3.4998, + "step": 89155 + }, + { + "epoch": 6.057888300040767, + "grad_norm": 0.8492057919502258, + "learning_rate": 0.00024304423155320016, + "loss": 3.1769, + "step": 89160 + }, + { + "epoch": 6.058228020111428, + "grad_norm": 1.092672348022461, + "learning_rate": 0.00024300176654436746, + "loss": 3.4665, + "step": 89165 + }, + { + "epoch": 6.05856774018209, + "grad_norm": 0.897320568561554, + "learning_rate": 0.00024295930153553472, + "loss": 3.4058, + "step": 89170 + }, + { + "epoch": 6.058907460252752, + "grad_norm": 0.8992919325828552, + "learning_rate": 0.000242916836526702, + "loss": 3.7639, + "step": 89175 + }, + { + "epoch": 6.059247180323413, + "grad_norm": 1.0586401224136353, + "learning_rate": 0.00024287437151786928, + "loss": 3.4719, + "step": 89180 + }, + { + "epoch": 6.059586900394075, + "grad_norm": 0.90134197473526, + "learning_rate": 0.00024283190650903656, + "loss": 3.4981, + "step": 89185 + }, + { + "epoch": 6.059926620464737, + "grad_norm": 0.8016781210899353, + "learning_rate": 0.00024278944150020384, + "loss": 3.5237, + "step": 89190 + }, + { + "epoch": 6.0602663405353985, + "grad_norm": 0.7436591386795044, + "learning_rate": 0.00024274697649137112, + "loss": 3.3414, + "step": 89195 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 1.2227129936218262, + "learning_rate": 0.0002427045114825384, + "loss": 3.51, + "step": 89200 + }, + { + "epoch": 6.060945780676723, + "grad_norm": 0.7111827731132507, + "learning_rate": 0.00024266204647370568, + "loss": 3.226, + "step": 89205 + }, + { + "epoch": 6.061285500747384, + "grad_norm": 1.043121337890625, + "learning_rate": 0.00024261958146487296, + "loss": 3.3582, + "step": 89210 + }, + { + "epoch": 6.061625220818046, + "grad_norm": 0.8063682913780212, + "learning_rate": 0.0002425771164560402, + "loss": 3.1208, + "step": 89215 + }, + { + "epoch": 6.061964940888708, + "grad_norm": 1.0155919790267944, + "learning_rate": 0.00024253465144720752, + "loss": 3.4669, + "step": 89220 + }, + { + "epoch": 6.062304660959369, + "grad_norm": 1.0140587091445923, + "learning_rate": 0.0002424921864383748, + "loss": 3.3796, + "step": 89225 + }, + { + "epoch": 6.062644381030031, + "grad_norm": 0.9438390135765076, + "learning_rate": 0.00024244972142954205, + "loss": 3.5309, + "step": 89230 + }, + { + "epoch": 6.062984101100693, + "grad_norm": 0.8342059254646301, + "learning_rate": 0.00024240725642070936, + "loss": 3.6858, + "step": 89235 + }, + { + "epoch": 6.0633238211713545, + "grad_norm": 0.9709253907203674, + "learning_rate": 0.0002423647914118766, + "loss": 3.3177, + "step": 89240 + }, + { + "epoch": 6.063663541242017, + "grad_norm": 0.8643320202827454, + "learning_rate": 0.0002423223264030439, + "loss": 3.4796, + "step": 89245 + }, + { + "epoch": 6.064003261312679, + "grad_norm": 0.9507293701171875, + "learning_rate": 0.00024227986139421117, + "loss": 3.5716, + "step": 89250 + }, + { + "epoch": 6.06434298138334, + "grad_norm": 0.8297502994537354, + "learning_rate": 0.00024223739638537845, + "loss": 3.3674, + "step": 89255 + }, + { + "epoch": 6.064682701454002, + "grad_norm": 0.8066967725753784, + "learning_rate": 0.00024219493137654573, + "loss": 3.0615, + "step": 89260 + }, + { + "epoch": 6.065022421524664, + "grad_norm": 0.7287917733192444, + "learning_rate": 0.000242152466367713, + "loss": 3.4968, + "step": 89265 + }, + { + "epoch": 6.065362141595325, + "grad_norm": 0.7989723682403564, + "learning_rate": 0.0002421100013588803, + "loss": 3.5964, + "step": 89270 + }, + { + "epoch": 6.065701861665987, + "grad_norm": 1.2523809671401978, + "learning_rate": 0.00024206753635004757, + "loss": 3.397, + "step": 89275 + }, + { + "epoch": 6.066041581736649, + "grad_norm": 0.995922863483429, + "learning_rate": 0.00024202507134121485, + "loss": 3.2501, + "step": 89280 + }, + { + "epoch": 6.0663813018073105, + "grad_norm": 0.90363609790802, + "learning_rate": 0.0002419826063323821, + "loss": 3.4151, + "step": 89285 + }, + { + "epoch": 6.066721021877973, + "grad_norm": 1.439242959022522, + "learning_rate": 0.0002419401413235494, + "loss": 3.7186, + "step": 89290 + }, + { + "epoch": 6.067060741948635, + "grad_norm": 0.8925469517707825, + "learning_rate": 0.0002418976763147167, + "loss": 3.7499, + "step": 89295 + }, + { + "epoch": 6.067400462019296, + "grad_norm": 0.9683470726013184, + "learning_rate": 0.00024185521130588394, + "loss": 3.5247, + "step": 89300 + }, + { + "epoch": 6.067740182089958, + "grad_norm": 0.9636355042457581, + "learning_rate": 0.00024181274629705125, + "loss": 3.2763, + "step": 89305 + }, + { + "epoch": 6.06807990216062, + "grad_norm": 1.0355736017227173, + "learning_rate": 0.0002417702812882185, + "loss": 3.3833, + "step": 89310 + }, + { + "epoch": 6.068419622231281, + "grad_norm": 0.982419490814209, + "learning_rate": 0.00024172781627938578, + "loss": 3.3247, + "step": 89315 + }, + { + "epoch": 6.068759342301943, + "grad_norm": 1.5573205947875977, + "learning_rate": 0.00024168535127055306, + "loss": 3.1015, + "step": 89320 + }, + { + "epoch": 6.069099062372605, + "grad_norm": 0.8347210884094238, + "learning_rate": 0.00024164288626172034, + "loss": 3.7181, + "step": 89325 + }, + { + "epoch": 6.0694387824432665, + "grad_norm": 1.1063815355300903, + "learning_rate": 0.00024160042125288763, + "loss": 3.4398, + "step": 89330 + }, + { + "epoch": 6.069778502513929, + "grad_norm": 0.8224183320999146, + "learning_rate": 0.0002415579562440549, + "loss": 3.5141, + "step": 89335 + }, + { + "epoch": 6.070118222584591, + "grad_norm": 0.8666048645973206, + "learning_rate": 0.00024151549123522219, + "loss": 3.7251, + "step": 89340 + }, + { + "epoch": 6.070457942655252, + "grad_norm": 0.8607798218727112, + "learning_rate": 0.00024147302622638944, + "loss": 3.1762, + "step": 89345 + }, + { + "epoch": 6.070797662725914, + "grad_norm": 0.8859748840332031, + "learning_rate": 0.00024143056121755675, + "loss": 3.4099, + "step": 89350 + }, + { + "epoch": 6.071137382796576, + "grad_norm": 1.1247646808624268, + "learning_rate": 0.000241388096208724, + "loss": 3.4187, + "step": 89355 + }, + { + "epoch": 6.071477102867237, + "grad_norm": 1.0231808423995972, + "learning_rate": 0.0002413456311998913, + "loss": 3.3635, + "step": 89360 + }, + { + "epoch": 6.071816822937899, + "grad_norm": 0.8988510370254517, + "learning_rate": 0.00024130316619105859, + "loss": 3.6061, + "step": 89365 + }, + { + "epoch": 6.072156543008561, + "grad_norm": 0.8118703961372375, + "learning_rate": 0.00024126070118222584, + "loss": 3.5407, + "step": 89370 + }, + { + "epoch": 6.0724962630792225, + "grad_norm": 0.9035707712173462, + "learning_rate": 0.00024121823617339315, + "loss": 3.2998, + "step": 89375 + }, + { + "epoch": 6.072835983149885, + "grad_norm": 0.7201623916625977, + "learning_rate": 0.0002411757711645604, + "loss": 3.3982, + "step": 89380 + }, + { + "epoch": 6.073175703220547, + "grad_norm": 0.816529393196106, + "learning_rate": 0.00024113330615572768, + "loss": 3.4442, + "step": 89385 + }, + { + "epoch": 6.073515423291208, + "grad_norm": 0.7196905612945557, + "learning_rate": 0.00024109084114689499, + "loss": 3.6262, + "step": 89390 + }, + { + "epoch": 6.07385514336187, + "grad_norm": 1.2915639877319336, + "learning_rate": 0.00024104837613806224, + "loss": 3.5283, + "step": 89395 + }, + { + "epoch": 6.074194863432532, + "grad_norm": 0.8260509967803955, + "learning_rate": 0.00024100591112922952, + "loss": 3.048, + "step": 89400 + }, + { + "epoch": 6.074534583503193, + "grad_norm": 0.9176071882247925, + "learning_rate": 0.0002409634461203968, + "loss": 3.5217, + "step": 89405 + }, + { + "epoch": 6.074874303573855, + "grad_norm": 1.0493543148040771, + "learning_rate": 0.00024092098111156408, + "loss": 3.5786, + "step": 89410 + }, + { + "epoch": 6.075214023644517, + "grad_norm": 0.9957193732261658, + "learning_rate": 0.00024087851610273133, + "loss": 3.4221, + "step": 89415 + }, + { + "epoch": 6.0755537437151785, + "grad_norm": 0.7936264276504517, + "learning_rate": 0.00024083605109389864, + "loss": 3.2955, + "step": 89420 + }, + { + "epoch": 6.075893463785841, + "grad_norm": 0.9925267100334167, + "learning_rate": 0.00024079358608506592, + "loss": 3.514, + "step": 89425 + }, + { + "epoch": 6.076233183856502, + "grad_norm": 0.7647408246994019, + "learning_rate": 0.00024075112107623317, + "loss": 3.6312, + "step": 89430 + }, + { + "epoch": 6.076572903927164, + "grad_norm": 1.0322074890136719, + "learning_rate": 0.00024070865606740048, + "loss": 3.4951, + "step": 89435 + }, + { + "epoch": 6.076912623997826, + "grad_norm": 0.9511871337890625, + "learning_rate": 0.00024066619105856773, + "loss": 3.6225, + "step": 89440 + }, + { + "epoch": 6.077252344068487, + "grad_norm": 0.9675427675247192, + "learning_rate": 0.00024062372604973504, + "loss": 3.617, + "step": 89445 + }, + { + "epoch": 6.077592064139149, + "grad_norm": 0.9409381151199341, + "learning_rate": 0.0002405812610409023, + "loss": 3.2952, + "step": 89450 + }, + { + "epoch": 6.077931784209811, + "grad_norm": 0.8897232413291931, + "learning_rate": 0.00024053879603206957, + "loss": 3.4699, + "step": 89455 + }, + { + "epoch": 6.0782715042804725, + "grad_norm": 1.0082066059112549, + "learning_rate": 0.00024049633102323688, + "loss": 3.5309, + "step": 89460 + }, + { + "epoch": 6.0786112243511345, + "grad_norm": 0.8560577034950256, + "learning_rate": 0.00024045386601440413, + "loss": 3.2755, + "step": 89465 + }, + { + "epoch": 6.078950944421797, + "grad_norm": 0.9418835043907166, + "learning_rate": 0.0002404114010055714, + "loss": 3.5896, + "step": 89470 + }, + { + "epoch": 6.079290664492458, + "grad_norm": 0.9550533294677734, + "learning_rate": 0.0002403689359967387, + "loss": 3.3116, + "step": 89475 + }, + { + "epoch": 6.07963038456312, + "grad_norm": 0.8384203314781189, + "learning_rate": 0.00024032647098790597, + "loss": 3.4597, + "step": 89480 + }, + { + "epoch": 6.079970104633782, + "grad_norm": 0.9413471817970276, + "learning_rate": 0.00024028400597907323, + "loss": 3.4365, + "step": 89485 + }, + { + "epoch": 6.080309824704443, + "grad_norm": 0.833048403263092, + "learning_rate": 0.00024024154097024053, + "loss": 3.4509, + "step": 89490 + }, + { + "epoch": 6.080649544775105, + "grad_norm": 0.97544264793396, + "learning_rate": 0.0002401990759614078, + "loss": 3.4719, + "step": 89495 + }, + { + "epoch": 6.080989264845767, + "grad_norm": 0.8623266816139221, + "learning_rate": 0.00024015661095257507, + "loss": 3.363, + "step": 89500 + }, + { + "epoch": 6.0813289849164285, + "grad_norm": 0.8876325488090515, + "learning_rate": 0.00024011414594374237, + "loss": 3.5002, + "step": 89505 + }, + { + "epoch": 6.081668704987091, + "grad_norm": 1.056256890296936, + "learning_rate": 0.00024007168093490963, + "loss": 3.6571, + "step": 89510 + }, + { + "epoch": 6.082008425057753, + "grad_norm": 0.9711539149284363, + "learning_rate": 0.0002400292159260769, + "loss": 3.4311, + "step": 89515 + }, + { + "epoch": 6.082348145128414, + "grad_norm": 0.6937743425369263, + "learning_rate": 0.00023998675091724421, + "loss": 3.4258, + "step": 89520 + }, + { + "epoch": 6.082687865199076, + "grad_norm": 0.8067362308502197, + "learning_rate": 0.00023994428590841147, + "loss": 3.4812, + "step": 89525 + }, + { + "epoch": 6.083027585269738, + "grad_norm": 0.9986014366149902, + "learning_rate": 0.00023990182089957877, + "loss": 3.446, + "step": 89530 + }, + { + "epoch": 6.083367305340399, + "grad_norm": 0.9533737301826477, + "learning_rate": 0.00023985935589074603, + "loss": 3.5102, + "step": 89535 + }, + { + "epoch": 6.083707025411061, + "grad_norm": 1.210567831993103, + "learning_rate": 0.0002398168908819133, + "loss": 3.131, + "step": 89540 + }, + { + "epoch": 6.084046745481723, + "grad_norm": 2.299793004989624, + "learning_rate": 0.0002397744258730806, + "loss": 3.4043, + "step": 89545 + }, + { + "epoch": 6.0843864655523845, + "grad_norm": 0.926751434803009, + "learning_rate": 0.00023973196086424787, + "loss": 3.3454, + "step": 89550 + }, + { + "epoch": 6.084726185623047, + "grad_norm": 0.7953591346740723, + "learning_rate": 0.00023968949585541515, + "loss": 3.2401, + "step": 89555 + }, + { + "epoch": 6.085065905693709, + "grad_norm": 0.8341919779777527, + "learning_rate": 0.00023964703084658243, + "loss": 3.0875, + "step": 89560 + }, + { + "epoch": 6.08540562576437, + "grad_norm": 1.231141448020935, + "learning_rate": 0.0002396045658377497, + "loss": 3.2441, + "step": 89565 + }, + { + "epoch": 6.085745345835032, + "grad_norm": 0.8381528854370117, + "learning_rate": 0.00023956210082891696, + "loss": 3.4211, + "step": 89570 + }, + { + "epoch": 6.086085065905694, + "grad_norm": 0.8929653763771057, + "learning_rate": 0.00023951963582008427, + "loss": 3.3763, + "step": 89575 + }, + { + "epoch": 6.086424785976355, + "grad_norm": 0.8979724645614624, + "learning_rate": 0.00023947717081125152, + "loss": 3.418, + "step": 89580 + }, + { + "epoch": 6.086764506047017, + "grad_norm": 0.8010327816009521, + "learning_rate": 0.0002394347058024188, + "loss": 3.1064, + "step": 89585 + }, + { + "epoch": 6.087104226117679, + "grad_norm": 0.8157447576522827, + "learning_rate": 0.0002393922407935861, + "loss": 3.264, + "step": 89590 + }, + { + "epoch": 6.0874439461883405, + "grad_norm": 0.9010993838310242, + "learning_rate": 0.00023934977578475336, + "loss": 3.474, + "step": 89595 + }, + { + "epoch": 6.087783666259003, + "grad_norm": 0.7772591710090637, + "learning_rate": 0.00023930731077592064, + "loss": 3.5609, + "step": 89600 + }, + { + "epoch": 6.088123386329665, + "grad_norm": 0.9807549118995667, + "learning_rate": 0.00023926484576708792, + "loss": 3.1446, + "step": 89605 + }, + { + "epoch": 6.088463106400326, + "grad_norm": 0.8930133581161499, + "learning_rate": 0.0002392223807582552, + "loss": 3.4786, + "step": 89610 + }, + { + "epoch": 6.088802826470988, + "grad_norm": 0.9945114254951477, + "learning_rate": 0.00023917991574942248, + "loss": 3.4104, + "step": 89615 + }, + { + "epoch": 6.08914254654165, + "grad_norm": 0.8664473295211792, + "learning_rate": 0.00023913745074058976, + "loss": 3.5886, + "step": 89620 + }, + { + "epoch": 6.089482266612311, + "grad_norm": 0.9921711087226868, + "learning_rate": 0.00023909498573175704, + "loss": 3.6421, + "step": 89625 + }, + { + "epoch": 6.089821986682973, + "grad_norm": 1.1646782159805298, + "learning_rate": 0.00023905252072292432, + "loss": 3.4709, + "step": 89630 + }, + { + "epoch": 6.090161706753635, + "grad_norm": 0.8571188449859619, + "learning_rate": 0.0002390100557140916, + "loss": 3.0508, + "step": 89635 + }, + { + "epoch": 6.0905014268242965, + "grad_norm": 0.8587504029273987, + "learning_rate": 0.00023896759070525885, + "loss": 3.3994, + "step": 89640 + }, + { + "epoch": 6.090841146894959, + "grad_norm": 0.9491881132125854, + "learning_rate": 0.00023892512569642616, + "loss": 3.2661, + "step": 89645 + }, + { + "epoch": 6.091180866965621, + "grad_norm": 0.9883736371994019, + "learning_rate": 0.00023888266068759344, + "loss": 3.2173, + "step": 89650 + }, + { + "epoch": 6.091520587036282, + "grad_norm": 0.8071476817131042, + "learning_rate": 0.0002388401956787607, + "loss": 3.5372, + "step": 89655 + }, + { + "epoch": 6.091860307106944, + "grad_norm": 1.1306140422821045, + "learning_rate": 0.000238797730669928, + "loss": 3.5343, + "step": 89660 + }, + { + "epoch": 6.092200027177606, + "grad_norm": 0.9191763401031494, + "learning_rate": 0.00023875526566109525, + "loss": 3.1938, + "step": 89665 + }, + { + "epoch": 6.092539747248267, + "grad_norm": 1.026047706604004, + "learning_rate": 0.00023871280065226253, + "loss": 3.57, + "step": 89670 + }, + { + "epoch": 6.092879467318929, + "grad_norm": 1.1005947589874268, + "learning_rate": 0.00023867033564342981, + "loss": 3.2776, + "step": 89675 + }, + { + "epoch": 6.093219187389591, + "grad_norm": 0.9493575692176819, + "learning_rate": 0.0002386278706345971, + "loss": 3.3707, + "step": 89680 + }, + { + "epoch": 6.0935589074602525, + "grad_norm": 1.0281580686569214, + "learning_rate": 0.00023858540562576437, + "loss": 3.3289, + "step": 89685 + }, + { + "epoch": 6.093898627530915, + "grad_norm": 0.7669827342033386, + "learning_rate": 0.00023854294061693166, + "loss": 3.3381, + "step": 89690 + }, + { + "epoch": 6.094238347601577, + "grad_norm": 0.9432578086853027, + "learning_rate": 0.00023850047560809894, + "loss": 3.6068, + "step": 89695 + }, + { + "epoch": 6.094578067672238, + "grad_norm": 0.9143058061599731, + "learning_rate": 0.00023845801059926622, + "loss": 3.3694, + "step": 89700 + }, + { + "epoch": 6.0949177877429, + "grad_norm": 0.9199538230895996, + "learning_rate": 0.0002384155455904335, + "loss": 3.3729, + "step": 89705 + }, + { + "epoch": 6.095257507813562, + "grad_norm": 1.0170427560806274, + "learning_rate": 0.00023837308058160075, + "loss": 3.4647, + "step": 89710 + }, + { + "epoch": 6.095597227884223, + "grad_norm": 0.8266448974609375, + "learning_rate": 0.00023833061557276806, + "loss": 3.3229, + "step": 89715 + }, + { + "epoch": 6.095936947954885, + "grad_norm": 1.296685814857483, + "learning_rate": 0.00023828815056393534, + "loss": 3.6765, + "step": 89720 + }, + { + "epoch": 6.096276668025547, + "grad_norm": 1.0427742004394531, + "learning_rate": 0.0002382456855551026, + "loss": 3.5795, + "step": 89725 + }, + { + "epoch": 6.0966163880962085, + "grad_norm": 0.8655601143836975, + "learning_rate": 0.0002382032205462699, + "loss": 3.1336, + "step": 89730 + }, + { + "epoch": 6.096956108166871, + "grad_norm": 0.7043294310569763, + "learning_rate": 0.00023816075553743715, + "loss": 3.1608, + "step": 89735 + }, + { + "epoch": 6.097295828237533, + "grad_norm": 1.220400333404541, + "learning_rate": 0.00023811829052860443, + "loss": 3.3524, + "step": 89740 + }, + { + "epoch": 6.097635548308194, + "grad_norm": 0.9819849133491516, + "learning_rate": 0.0002380758255197717, + "loss": 3.5237, + "step": 89745 + }, + { + "epoch": 6.097975268378856, + "grad_norm": 0.9709330797195435, + "learning_rate": 0.000238033360510939, + "loss": 3.3666, + "step": 89750 + }, + { + "epoch": 6.098314988449518, + "grad_norm": 1.0230010747909546, + "learning_rate": 0.00023799089550210627, + "loss": 3.276, + "step": 89755 + }, + { + "epoch": 6.098654708520179, + "grad_norm": 0.7139751315116882, + "learning_rate": 0.00023794843049327355, + "loss": 3.5123, + "step": 89760 + }, + { + "epoch": 6.098994428590841, + "grad_norm": 0.8478607535362244, + "learning_rate": 0.00023790596548444083, + "loss": 3.5448, + "step": 89765 + }, + { + "epoch": 6.099334148661503, + "grad_norm": 0.9809176921844482, + "learning_rate": 0.00023786350047560808, + "loss": 3.3766, + "step": 89770 + }, + { + "epoch": 6.0996738687321646, + "grad_norm": 1.0053985118865967, + "learning_rate": 0.0002378210354667754, + "loss": 3.5424, + "step": 89775 + }, + { + "epoch": 6.100013588802827, + "grad_norm": 1.150930404663086, + "learning_rate": 0.00023777857045794264, + "loss": 3.5259, + "step": 89780 + }, + { + "epoch": 6.100353308873488, + "grad_norm": 1.0789923667907715, + "learning_rate": 0.00023773610544910995, + "loss": 3.3931, + "step": 89785 + }, + { + "epoch": 6.10069302894415, + "grad_norm": 0.7247753739356995, + "learning_rate": 0.00023769364044027723, + "loss": 3.3313, + "step": 89790 + }, + { + "epoch": 6.101032749014812, + "grad_norm": 0.7197237610816956, + "learning_rate": 0.00023765117543144448, + "loss": 3.3852, + "step": 89795 + }, + { + "epoch": 6.101372469085473, + "grad_norm": 1.1519163846969604, + "learning_rate": 0.0002376087104226118, + "loss": 3.4067, + "step": 89800 + }, + { + "epoch": 6.101712189156135, + "grad_norm": 1.621678113937378, + "learning_rate": 0.00023756624541377904, + "loss": 3.5165, + "step": 89805 + }, + { + "epoch": 6.102051909226797, + "grad_norm": 0.9972145557403564, + "learning_rate": 0.00023752378040494632, + "loss": 3.2578, + "step": 89810 + }, + { + "epoch": 6.1023916292974585, + "grad_norm": 0.9117636680603027, + "learning_rate": 0.00023748131539611363, + "loss": 3.32, + "step": 89815 + }, + { + "epoch": 6.102731349368121, + "grad_norm": 3.0444281101226807, + "learning_rate": 0.00023743885038728088, + "loss": 3.4827, + "step": 89820 + }, + { + "epoch": 6.103071069438783, + "grad_norm": 1.0014978647232056, + "learning_rate": 0.00023739638537844816, + "loss": 3.6674, + "step": 89825 + }, + { + "epoch": 6.103410789509444, + "grad_norm": 0.9817042350769043, + "learning_rate": 0.00023735392036961544, + "loss": 3.3623, + "step": 89830 + }, + { + "epoch": 6.103750509580106, + "grad_norm": 0.9422460794448853, + "learning_rate": 0.00023731145536078272, + "loss": 3.4626, + "step": 89835 + }, + { + "epoch": 6.104090229650768, + "grad_norm": 0.8330566883087158, + "learning_rate": 0.00023726899035194998, + "loss": 3.2086, + "step": 89840 + }, + { + "epoch": 6.104429949721429, + "grad_norm": 1.0338433980941772, + "learning_rate": 0.00023722652534311728, + "loss": 3.3653, + "step": 89845 + }, + { + "epoch": 6.104769669792091, + "grad_norm": 0.8379228115081787, + "learning_rate": 0.00023718406033428456, + "loss": 3.3685, + "step": 89850 + }, + { + "epoch": 6.105109389862753, + "grad_norm": 0.8007422685623169, + "learning_rate": 0.00023714159532545182, + "loss": 3.27, + "step": 89855 + }, + { + "epoch": 6.1054491099334145, + "grad_norm": 1.1232210397720337, + "learning_rate": 0.00023709913031661912, + "loss": 3.4607, + "step": 89860 + }, + { + "epoch": 6.105788830004077, + "grad_norm": 1.4204939603805542, + "learning_rate": 0.00023705666530778638, + "loss": 3.3954, + "step": 89865 + }, + { + "epoch": 6.106128550074739, + "grad_norm": 1.079100489616394, + "learning_rate": 0.00023701420029895368, + "loss": 3.5977, + "step": 89870 + }, + { + "epoch": 6.1064682701454, + "grad_norm": 0.8442457914352417, + "learning_rate": 0.00023697173529012094, + "loss": 3.5613, + "step": 89875 + }, + { + "epoch": 6.106807990216062, + "grad_norm": 1.0540432929992676, + "learning_rate": 0.00023692927028128822, + "loss": 3.4818, + "step": 89880 + }, + { + "epoch": 6.107147710286724, + "grad_norm": 1.2026652097702026, + "learning_rate": 0.00023688680527245552, + "loss": 3.577, + "step": 89885 + }, + { + "epoch": 6.107487430357385, + "grad_norm": 0.7903327345848083, + "learning_rate": 0.00023684434026362278, + "loss": 3.3914, + "step": 89890 + }, + { + "epoch": 6.107827150428047, + "grad_norm": 0.8507814407348633, + "learning_rate": 0.0002368103682565566, + "loss": 3.3738, + "step": 89895 + }, + { + "epoch": 6.108166870498709, + "grad_norm": 1.0412254333496094, + "learning_rate": 0.00023676790324772388, + "loss": 3.4512, + "step": 89900 + }, + { + "epoch": 6.1085065905693705, + "grad_norm": 0.9235323071479797, + "learning_rate": 0.00023672543823889116, + "loss": 3.5456, + "step": 89905 + }, + { + "epoch": 6.108846310640033, + "grad_norm": 1.0059643983840942, + "learning_rate": 0.00023668297323005844, + "loss": 3.1934, + "step": 89910 + }, + { + "epoch": 6.109186030710695, + "grad_norm": 0.9098361730575562, + "learning_rate": 0.00023664050822122572, + "loss": 3.4899, + "step": 89915 + }, + { + "epoch": 6.109525750781356, + "grad_norm": 1.0852019786834717, + "learning_rate": 0.000236598043212393, + "loss": 3.5206, + "step": 89920 + }, + { + "epoch": 6.109865470852018, + "grad_norm": 0.9385517239570618, + "learning_rate": 0.00023655557820356028, + "loss": 3.2912, + "step": 89925 + }, + { + "epoch": 6.11020519092268, + "grad_norm": 0.7474623322486877, + "learning_rate": 0.00023651311319472753, + "loss": 3.3479, + "step": 89930 + }, + { + "epoch": 6.110544910993341, + "grad_norm": 0.8018480539321899, + "learning_rate": 0.00023647064818589484, + "loss": 3.2246, + "step": 89935 + }, + { + "epoch": 6.110884631064003, + "grad_norm": 0.9714968204498291, + "learning_rate": 0.0002364281831770621, + "loss": 3.7216, + "step": 89940 + }, + { + "epoch": 6.111224351134665, + "grad_norm": 0.8560486435890198, + "learning_rate": 0.00023638571816822937, + "loss": 3.4431, + "step": 89945 + }, + { + "epoch": 6.1115640712053265, + "grad_norm": 0.8294529318809509, + "learning_rate": 0.00023634325315939668, + "loss": 3.5207, + "step": 89950 + }, + { + "epoch": 6.111903791275989, + "grad_norm": 0.8902685642242432, + "learning_rate": 0.00023630078815056393, + "loss": 3.3238, + "step": 89955 + }, + { + "epoch": 6.112243511346651, + "grad_norm": 1.0956848859786987, + "learning_rate": 0.00023625832314173121, + "loss": 3.3415, + "step": 89960 + }, + { + "epoch": 6.112583231417312, + "grad_norm": 0.87883460521698, + "learning_rate": 0.0002362158581328985, + "loss": 3.3736, + "step": 89965 + }, + { + "epoch": 6.112922951487974, + "grad_norm": 0.7746434211730957, + "learning_rate": 0.00023617339312406577, + "loss": 3.709, + "step": 89970 + }, + { + "epoch": 6.113262671558636, + "grad_norm": 1.4564226865768433, + "learning_rate": 0.00023613092811523303, + "loss": 3.5469, + "step": 89975 + }, + { + "epoch": 6.113602391629297, + "grad_norm": 0.7477574944496155, + "learning_rate": 0.00023608846310640034, + "loss": 3.4568, + "step": 89980 + }, + { + "epoch": 6.113942111699959, + "grad_norm": 0.9856157302856445, + "learning_rate": 0.00023604599809756762, + "loss": 3.3907, + "step": 89985 + }, + { + "epoch": 6.114281831770621, + "grad_norm": 0.8908491134643555, + "learning_rate": 0.0002360035330887349, + "loss": 3.2539, + "step": 89990 + }, + { + "epoch": 6.1146215518412825, + "grad_norm": 1.0649430751800537, + "learning_rate": 0.00023596106807990218, + "loss": 3.4686, + "step": 89995 + }, + { + "epoch": 6.114961271911945, + "grad_norm": 0.9617429971694946, + "learning_rate": 0.00023591860307106943, + "loss": 3.54, + "step": 90000 + }, + { + "epoch": 6.115300991982607, + "grad_norm": 0.7305599451065063, + "learning_rate": 0.00023587613806223674, + "loss": 3.4346, + "step": 90005 + }, + { + "epoch": 6.115640712053268, + "grad_norm": 0.9022901654243469, + "learning_rate": 0.00023583367305340402, + "loss": 3.2583, + "step": 90010 + }, + { + "epoch": 6.11598043212393, + "grad_norm": 0.8445898294448853, + "learning_rate": 0.00023579120804457127, + "loss": 3.7051, + "step": 90015 + }, + { + "epoch": 6.116320152194592, + "grad_norm": 1.0040825605392456, + "learning_rate": 0.00023574874303573858, + "loss": 3.2994, + "step": 90020 + }, + { + "epoch": 6.116659872265253, + "grad_norm": 0.8104782104492188, + "learning_rate": 0.00023570627802690583, + "loss": 3.6155, + "step": 90025 + }, + { + "epoch": 6.116999592335915, + "grad_norm": 0.9047041535377502, + "learning_rate": 0.0002356638130180731, + "loss": 3.2358, + "step": 90030 + }, + { + "epoch": 6.117339312406577, + "grad_norm": 1.078276515007019, + "learning_rate": 0.0002356213480092404, + "loss": 3.484, + "step": 90035 + }, + { + "epoch": 6.1176790324772385, + "grad_norm": 0.975533127784729, + "learning_rate": 0.00023557888300040767, + "loss": 3.536, + "step": 90040 + }, + { + "epoch": 6.118018752547901, + "grad_norm": 0.9504879713058472, + "learning_rate": 0.00023553641799157495, + "loss": 3.4918, + "step": 90045 + }, + { + "epoch": 6.118358472618563, + "grad_norm": 0.9456842541694641, + "learning_rate": 0.00023549395298274223, + "loss": 3.8149, + "step": 90050 + }, + { + "epoch": 6.118698192689224, + "grad_norm": 0.852038562297821, + "learning_rate": 0.0002354514879739095, + "loss": 3.5143, + "step": 90055 + }, + { + "epoch": 6.119037912759886, + "grad_norm": 0.9891812205314636, + "learning_rate": 0.00023540902296507676, + "loss": 3.5382, + "step": 90060 + }, + { + "epoch": 6.119377632830548, + "grad_norm": 0.9981206655502319, + "learning_rate": 0.00023536655795624407, + "loss": 3.4551, + "step": 90065 + }, + { + "epoch": 6.119717352901209, + "grad_norm": 1.0230919122695923, + "learning_rate": 0.00023532409294741132, + "loss": 3.2697, + "step": 90070 + }, + { + "epoch": 6.120057072971871, + "grad_norm": 0.8503521084785461, + "learning_rate": 0.00023528162793857863, + "loss": 3.4795, + "step": 90075 + }, + { + "epoch": 6.120396793042533, + "grad_norm": 0.8362390398979187, + "learning_rate": 0.0002352391629297459, + "loss": 3.4232, + "step": 90080 + }, + { + "epoch": 6.1207365131131946, + "grad_norm": 1.0246142148971558, + "learning_rate": 0.00023519669792091316, + "loss": 3.1814, + "step": 90085 + }, + { + "epoch": 6.121076233183857, + "grad_norm": 0.7161303162574768, + "learning_rate": 0.00023515423291208047, + "loss": 3.3335, + "step": 90090 + }, + { + "epoch": 6.121415953254519, + "grad_norm": 0.8561065793037415, + "learning_rate": 0.00023511176790324772, + "loss": 3.4759, + "step": 90095 + }, + { + "epoch": 6.12175567332518, + "grad_norm": 0.8644846081733704, + "learning_rate": 0.000235069302894415, + "loss": 3.4385, + "step": 90100 + }, + { + "epoch": 6.122095393395842, + "grad_norm": 1.0103040933609009, + "learning_rate": 0.00023502683788558228, + "loss": 3.4347, + "step": 90105 + }, + { + "epoch": 6.122435113466503, + "grad_norm": 0.9716213345527649, + "learning_rate": 0.00023498437287674956, + "loss": 3.222, + "step": 90110 + }, + { + "epoch": 6.122774833537165, + "grad_norm": 0.9857015609741211, + "learning_rate": 0.00023494190786791684, + "loss": 3.4682, + "step": 90115 + }, + { + "epoch": 6.123114553607827, + "grad_norm": 1.0973578691482544, + "learning_rate": 0.00023489944285908412, + "loss": 3.4616, + "step": 90120 + }, + { + "epoch": 6.1234542736784885, + "grad_norm": 0.9589556455612183, + "learning_rate": 0.0002348569778502514, + "loss": 3.2049, + "step": 90125 + }, + { + "epoch": 6.123793993749151, + "grad_norm": 0.8410229682922363, + "learning_rate": 0.00023481451284141866, + "loss": 3.4329, + "step": 90130 + }, + { + "epoch": 6.124133713819813, + "grad_norm": 0.9294925928115845, + "learning_rate": 0.00023477204783258596, + "loss": 3.2322, + "step": 90135 + }, + { + "epoch": 6.124473433890474, + "grad_norm": 0.9210629463195801, + "learning_rate": 0.00023472958282375322, + "loss": 3.5359, + "step": 90140 + }, + { + "epoch": 6.124813153961136, + "grad_norm": 0.8097776770591736, + "learning_rate": 0.0002346871178149205, + "loss": 3.4323, + "step": 90145 + }, + { + "epoch": 6.125152874031798, + "grad_norm": 1.0091338157653809, + "learning_rate": 0.0002346446528060878, + "loss": 3.4513, + "step": 90150 + }, + { + "epoch": 6.125492594102459, + "grad_norm": 1.0216542482376099, + "learning_rate": 0.00023460218779725506, + "loss": 3.5998, + "step": 90155 + }, + { + "epoch": 6.125832314173121, + "grad_norm": 1.1595304012298584, + "learning_rate": 0.00023455972278842236, + "loss": 3.324, + "step": 90160 + }, + { + "epoch": 6.126172034243783, + "grad_norm": 1.0061644315719604, + "learning_rate": 0.00023451725777958962, + "loss": 3.5768, + "step": 90165 + }, + { + "epoch": 6.1265117543144445, + "grad_norm": 1.037075161933899, + "learning_rate": 0.0002344747927707569, + "loss": 3.4689, + "step": 90170 + }, + { + "epoch": 6.126851474385107, + "grad_norm": 0.822518527507782, + "learning_rate": 0.0002344323277619242, + "loss": 3.4018, + "step": 90175 + }, + { + "epoch": 6.127191194455769, + "grad_norm": 0.862923800945282, + "learning_rate": 0.00023438986275309146, + "loss": 3.3014, + "step": 90180 + }, + { + "epoch": 6.12753091452643, + "grad_norm": 0.871720552444458, + "learning_rate": 0.00023434739774425874, + "loss": 3.3707, + "step": 90185 + }, + { + "epoch": 6.127870634597092, + "grad_norm": 0.8224053978919983, + "learning_rate": 0.00023430493273542602, + "loss": 3.6047, + "step": 90190 + }, + { + "epoch": 6.128210354667754, + "grad_norm": 1.0486068725585938, + "learning_rate": 0.0002342624677265933, + "loss": 3.5709, + "step": 90195 + }, + { + "epoch": 6.128550074738415, + "grad_norm": 0.8073180317878723, + "learning_rate": 0.00023422000271776055, + "loss": 3.7388, + "step": 90200 + }, + { + "epoch": 6.128889794809077, + "grad_norm": 0.8771747350692749, + "learning_rate": 0.00023417753770892786, + "loss": 3.6161, + "step": 90205 + }, + { + "epoch": 6.129229514879739, + "grad_norm": 1.026686429977417, + "learning_rate": 0.00023413507270009514, + "loss": 3.4491, + "step": 90210 + }, + { + "epoch": 6.1295692349504005, + "grad_norm": 0.771115779876709, + "learning_rate": 0.0002340926076912624, + "loss": 3.3296, + "step": 90215 + }, + { + "epoch": 6.129908955021063, + "grad_norm": 0.7180362343788147, + "learning_rate": 0.0002340501426824297, + "loss": 3.2318, + "step": 90220 + }, + { + "epoch": 6.130248675091725, + "grad_norm": 0.716742217540741, + "learning_rate": 0.00023400767767359695, + "loss": 3.2355, + "step": 90225 + }, + { + "epoch": 6.130588395162386, + "grad_norm": 0.9254169464111328, + "learning_rate": 0.00023396521266476423, + "loss": 3.0895, + "step": 90230 + }, + { + "epoch": 6.130928115233048, + "grad_norm": 0.8667013645172119, + "learning_rate": 0.0002339227476559315, + "loss": 3.4085, + "step": 90235 + }, + { + "epoch": 6.13126783530371, + "grad_norm": 0.8898740410804749, + "learning_rate": 0.0002338802826470988, + "loss": 3.538, + "step": 90240 + }, + { + "epoch": 6.131607555374371, + "grad_norm": 0.8101511001586914, + "learning_rate": 0.0002338378176382661, + "loss": 3.4677, + "step": 90245 + }, + { + "epoch": 6.131947275445033, + "grad_norm": 1.0501036643981934, + "learning_rate": 0.00023379535262943335, + "loss": 3.1889, + "step": 90250 + }, + { + "epoch": 6.132286995515695, + "grad_norm": 0.8773878812789917, + "learning_rate": 0.00023375288762060063, + "loss": 3.4407, + "step": 90255 + }, + { + "epoch": 6.1326267155863565, + "grad_norm": 0.9624944925308228, + "learning_rate": 0.0002337104226117679, + "loss": 3.5539, + "step": 90260 + }, + { + "epoch": 6.132966435657019, + "grad_norm": 0.9814286231994629, + "learning_rate": 0.0002336679576029352, + "loss": 3.4872, + "step": 90265 + }, + { + "epoch": 6.133306155727681, + "grad_norm": 1.7191568613052368, + "learning_rate": 0.00023362549259410244, + "loss": 3.139, + "step": 90270 + }, + { + "epoch": 6.133645875798342, + "grad_norm": 1.1183573007583618, + "learning_rate": 0.00023358302758526975, + "loss": 3.6399, + "step": 90275 + }, + { + "epoch": 6.133985595869004, + "grad_norm": 0.8793410658836365, + "learning_rate": 0.00023354056257643703, + "loss": 3.5968, + "step": 90280 + }, + { + "epoch": 6.134325315939666, + "grad_norm": 1.149998426437378, + "learning_rate": 0.00023349809756760428, + "loss": 3.6586, + "step": 90285 + }, + { + "epoch": 6.134665036010327, + "grad_norm": 0.9086042046546936, + "learning_rate": 0.0002334556325587716, + "loss": 3.3807, + "step": 90290 + }, + { + "epoch": 6.135004756080989, + "grad_norm": 0.8896855115890503, + "learning_rate": 0.00023341316754993884, + "loss": 3.5823, + "step": 90295 + }, + { + "epoch": 6.135344476151651, + "grad_norm": 0.8417021036148071, + "learning_rate": 0.00023337070254110612, + "loss": 3.2062, + "step": 90300 + }, + { + "epoch": 6.1356841962223125, + "grad_norm": 0.778768002986908, + "learning_rate": 0.00023332823753227343, + "loss": 3.4025, + "step": 90305 + }, + { + "epoch": 6.136023916292975, + "grad_norm": 1.114112377166748, + "learning_rate": 0.00023328577252344068, + "loss": 3.4231, + "step": 90310 + }, + { + "epoch": 6.136363636363637, + "grad_norm": 0.9743187427520752, + "learning_rate": 0.00023324330751460796, + "loss": 3.4139, + "step": 90315 + }, + { + "epoch": 6.136703356434298, + "grad_norm": 0.8965110182762146, + "learning_rate": 0.00023320084250577524, + "loss": 3.3126, + "step": 90320 + }, + { + "epoch": 6.13704307650496, + "grad_norm": 0.9704491496086121, + "learning_rate": 0.00023315837749694252, + "loss": 3.2263, + "step": 90325 + }, + { + "epoch": 6.137382796575622, + "grad_norm": 0.8891574144363403, + "learning_rate": 0.0002331159124881098, + "loss": 3.428, + "step": 90330 + }, + { + "epoch": 6.137722516646283, + "grad_norm": 0.8460766077041626, + "learning_rate": 0.00023307344747927709, + "loss": 3.5842, + "step": 90335 + }, + { + "epoch": 6.138062236716945, + "grad_norm": 0.7908870577812195, + "learning_rate": 0.00023303098247044437, + "loss": 3.4221, + "step": 90340 + }, + { + "epoch": 6.138401956787607, + "grad_norm": 1.0319411754608154, + "learning_rate": 0.00023298851746161165, + "loss": 3.7213, + "step": 90345 + }, + { + "epoch": 6.1387416768582685, + "grad_norm": 0.8656967282295227, + "learning_rate": 0.00023294605245277893, + "loss": 3.5285, + "step": 90350 + }, + { + "epoch": 6.139081396928931, + "grad_norm": 1.0073609352111816, + "learning_rate": 0.00023290358744394618, + "loss": 3.522, + "step": 90355 + }, + { + "epoch": 6.139421116999593, + "grad_norm": 1.0595897436141968, + "learning_rate": 0.00023286112243511349, + "loss": 3.4984, + "step": 90360 + }, + { + "epoch": 6.139760837070254, + "grad_norm": 1.1431704759597778, + "learning_rate": 0.00023281865742628074, + "loss": 3.4302, + "step": 90365 + }, + { + "epoch": 6.140100557140916, + "grad_norm": 1.0238914489746094, + "learning_rate": 0.00023277619241744802, + "loss": 3.345, + "step": 90370 + }, + { + "epoch": 6.140440277211578, + "grad_norm": 0.9941591620445251, + "learning_rate": 0.00023273372740861533, + "loss": 3.334, + "step": 90375 + }, + { + "epoch": 6.140779997282239, + "grad_norm": 1.1233950853347778, + "learning_rate": 0.00023269126239978258, + "loss": 3.6266, + "step": 90380 + }, + { + "epoch": 6.141119717352901, + "grad_norm": 1.6776909828186035, + "learning_rate": 0.00023264879739094986, + "loss": 3.5715, + "step": 90385 + }, + { + "epoch": 6.141459437423563, + "grad_norm": 0.9772237539291382, + "learning_rate": 0.00023260633238211714, + "loss": 3.4355, + "step": 90390 + }, + { + "epoch": 6.141799157494225, + "grad_norm": 1.2850950956344604, + "learning_rate": 0.00023256386737328442, + "loss": 3.251, + "step": 90395 + }, + { + "epoch": 6.142138877564887, + "grad_norm": 1.0111651420593262, + "learning_rate": 0.00023252140236445167, + "loss": 3.4611, + "step": 90400 + }, + { + "epoch": 6.142478597635549, + "grad_norm": 0.7644152641296387, + "learning_rate": 0.00023247893735561898, + "loss": 3.5345, + "step": 90405 + }, + { + "epoch": 6.14281831770621, + "grad_norm": 0.9131135940551758, + "learning_rate": 0.00023243647234678626, + "loss": 3.3584, + "step": 90410 + }, + { + "epoch": 6.143158037776872, + "grad_norm": 1.1693344116210938, + "learning_rate": 0.00023239400733795354, + "loss": 3.5985, + "step": 90415 + }, + { + "epoch": 6.143497757847534, + "grad_norm": 0.8119051456451416, + "learning_rate": 0.00023235154232912082, + "loss": 3.48, + "step": 90420 + }, + { + "epoch": 6.143837477918195, + "grad_norm": 0.8844112157821655, + "learning_rate": 0.00023230907732028807, + "loss": 3.504, + "step": 90425 + }, + { + "epoch": 6.144177197988857, + "grad_norm": 0.7681549787521362, + "learning_rate": 0.00023226661231145538, + "loss": 3.3415, + "step": 90430 + }, + { + "epoch": 6.144516918059519, + "grad_norm": 0.7326382994651794, + "learning_rate": 0.00023222414730262263, + "loss": 3.4444, + "step": 90435 + }, + { + "epoch": 6.144856638130181, + "grad_norm": 1.0108392238616943, + "learning_rate": 0.0002321816822937899, + "loss": 3.6864, + "step": 90440 + }, + { + "epoch": 6.145196358200843, + "grad_norm": 0.902328610420227, + "learning_rate": 0.00023213921728495722, + "loss": 3.5069, + "step": 90445 + }, + { + "epoch": 6.145536078271505, + "grad_norm": 0.8741894364356995, + "learning_rate": 0.00023209675227612447, + "loss": 3.3462, + "step": 90450 + }, + { + "epoch": 6.145875798342166, + "grad_norm": 0.8783340454101562, + "learning_rate": 0.00023205428726729175, + "loss": 3.625, + "step": 90455 + }, + { + "epoch": 6.146215518412828, + "grad_norm": 1.0624834299087524, + "learning_rate": 0.00023201182225845903, + "loss": 3.5716, + "step": 90460 + }, + { + "epoch": 6.14655523848349, + "grad_norm": 0.6574058532714844, + "learning_rate": 0.0002319693572496263, + "loss": 3.5463, + "step": 90465 + }, + { + "epoch": 6.146894958554151, + "grad_norm": 0.869644820690155, + "learning_rate": 0.0002319268922407936, + "loss": 3.4381, + "step": 90470 + }, + { + "epoch": 6.147234678624813, + "grad_norm": 0.8387452960014343, + "learning_rate": 0.00023188442723196087, + "loss": 3.3741, + "step": 90475 + }, + { + "epoch": 6.1475743986954745, + "grad_norm": 1.0560303926467896, + "learning_rate": 0.00023184196222312815, + "loss": 3.3704, + "step": 90480 + }, + { + "epoch": 6.147914118766137, + "grad_norm": 1.0607514381408691, + "learning_rate": 0.0002317994972142954, + "loss": 3.6498, + "step": 90485 + }, + { + "epoch": 6.148253838836799, + "grad_norm": 1.331167459487915, + "learning_rate": 0.0002317570322054627, + "loss": 3.3397, + "step": 90490 + }, + { + "epoch": 6.14859355890746, + "grad_norm": 0.8498597741127014, + "learning_rate": 0.00023171456719662997, + "loss": 3.4781, + "step": 90495 + }, + { + "epoch": 6.148933278978122, + "grad_norm": 1.0037049055099487, + "learning_rate": 0.00023167210218779727, + "loss": 3.3271, + "step": 90500 + }, + { + "epoch": 6.149272999048784, + "grad_norm": 1.1485509872436523, + "learning_rate": 0.00023162963717896455, + "loss": 3.4062, + "step": 90505 + }, + { + "epoch": 6.149612719119445, + "grad_norm": 0.8581130504608154, + "learning_rate": 0.0002315871721701318, + "loss": 3.2565, + "step": 90510 + }, + { + "epoch": 6.149952439190107, + "grad_norm": 0.9088388085365295, + "learning_rate": 0.00023154470716129911, + "loss": 3.5316, + "step": 90515 + }, + { + "epoch": 6.150292159260769, + "grad_norm": 4.437614917755127, + "learning_rate": 0.00023150224215246637, + "loss": 3.3596, + "step": 90520 + }, + { + "epoch": 6.1506318793314305, + "grad_norm": 0.949091911315918, + "learning_rate": 0.00023145977714363365, + "loss": 3.5726, + "step": 90525 + }, + { + "epoch": 6.150971599402093, + "grad_norm": 1.0450406074523926, + "learning_rate": 0.00023141731213480093, + "loss": 3.577, + "step": 90530 + }, + { + "epoch": 6.151311319472755, + "grad_norm": 0.8934120535850525, + "learning_rate": 0.0002313748471259682, + "loss": 3.5448, + "step": 90535 + }, + { + "epoch": 6.151651039543416, + "grad_norm": 1.0016552209854126, + "learning_rate": 0.0002313323821171355, + "loss": 3.5594, + "step": 90540 + }, + { + "epoch": 6.151990759614078, + "grad_norm": 0.9877389073371887, + "learning_rate": 0.00023128991710830277, + "loss": 3.4613, + "step": 90545 + }, + { + "epoch": 6.15233047968474, + "grad_norm": 1.4112948179244995, + "learning_rate": 0.00023124745209947005, + "loss": 3.4205, + "step": 90550 + }, + { + "epoch": 6.152670199755401, + "grad_norm": 1.0125194787979126, + "learning_rate": 0.0002312049870906373, + "loss": 3.5932, + "step": 90555 + }, + { + "epoch": 6.153009919826063, + "grad_norm": 0.8219852447509766, + "learning_rate": 0.0002311625220818046, + "loss": 3.5234, + "step": 90560 + }, + { + "epoch": 6.153349639896725, + "grad_norm": 0.7920364141464233, + "learning_rate": 0.00023112005707297186, + "loss": 3.4846, + "step": 90565 + }, + { + "epoch": 6.1536893599673865, + "grad_norm": 1.078342318534851, + "learning_rate": 0.00023107759206413914, + "loss": 3.5273, + "step": 90570 + }, + { + "epoch": 6.154029080038049, + "grad_norm": 0.9423194527626038, + "learning_rate": 0.00023103512705530645, + "loss": 3.3705, + "step": 90575 + }, + { + "epoch": 6.154368800108711, + "grad_norm": 0.8764362335205078, + "learning_rate": 0.0002309926620464737, + "loss": 3.4694, + "step": 90580 + }, + { + "epoch": 6.154708520179372, + "grad_norm": 1.2585350275039673, + "learning_rate": 0.000230950197037641, + "loss": 3.1733, + "step": 90585 + }, + { + "epoch": 6.155048240250034, + "grad_norm": 0.9567163586616516, + "learning_rate": 0.00023090773202880826, + "loss": 3.4716, + "step": 90590 + }, + { + "epoch": 6.155387960320696, + "grad_norm": 0.8430368900299072, + "learning_rate": 0.00023086526701997554, + "loss": 3.221, + "step": 90595 + }, + { + "epoch": 6.155727680391357, + "grad_norm": 0.9047728180885315, + "learning_rate": 0.00023082280201114285, + "loss": 3.5541, + "step": 90600 + }, + { + "epoch": 6.156067400462019, + "grad_norm": 0.8744259476661682, + "learning_rate": 0.0002307803370023101, + "loss": 3.5373, + "step": 90605 + }, + { + "epoch": 6.156407120532681, + "grad_norm": 1.1616214513778687, + "learning_rate": 0.00023073787199347738, + "loss": 3.6096, + "step": 90610 + }, + { + "epoch": 6.1567468406033425, + "grad_norm": 0.7606600522994995, + "learning_rate": 0.00023069540698464466, + "loss": 3.4228, + "step": 90615 + }, + { + "epoch": 6.157086560674005, + "grad_norm": 0.8272355794906616, + "learning_rate": 0.00023065294197581194, + "loss": 3.6095, + "step": 90620 + }, + { + "epoch": 6.157426280744667, + "grad_norm": 0.9792121052742004, + "learning_rate": 0.0002306104769669792, + "loss": 3.4555, + "step": 90625 + }, + { + "epoch": 6.157766000815328, + "grad_norm": 0.9982336759567261, + "learning_rate": 0.0002305680119581465, + "loss": 3.4277, + "step": 90630 + }, + { + "epoch": 6.15810572088599, + "grad_norm": 1.0333020687103271, + "learning_rate": 0.00023052554694931378, + "loss": 3.3785, + "step": 90635 + }, + { + "epoch": 6.158445440956652, + "grad_norm": 0.7904447913169861, + "learning_rate": 0.00023048308194048103, + "loss": 3.4182, + "step": 90640 + }, + { + "epoch": 6.158785161027313, + "grad_norm": 0.9048055410385132, + "learning_rate": 0.00023044061693164834, + "loss": 3.2692, + "step": 90645 + }, + { + "epoch": 6.159124881097975, + "grad_norm": 0.8061763048171997, + "learning_rate": 0.0002303981519228156, + "loss": 3.5729, + "step": 90650 + }, + { + "epoch": 6.159464601168637, + "grad_norm": 0.7651428580284119, + "learning_rate": 0.00023035568691398287, + "loss": 3.319, + "step": 90655 + }, + { + "epoch": 6.1598043212392986, + "grad_norm": 0.8166138529777527, + "learning_rate": 0.00023031322190515015, + "loss": 3.6396, + "step": 90660 + }, + { + "epoch": 6.160144041309961, + "grad_norm": 1.7771589756011963, + "learning_rate": 0.00023027075689631743, + "loss": 3.1168, + "step": 90665 + }, + { + "epoch": 6.160483761380623, + "grad_norm": 0.8057602643966675, + "learning_rate": 0.00023022829188748474, + "loss": 3.6705, + "step": 90670 + }, + { + "epoch": 6.160823481451284, + "grad_norm": 0.8061012029647827, + "learning_rate": 0.000230185826878652, + "loss": 3.3819, + "step": 90675 + }, + { + "epoch": 6.161163201521946, + "grad_norm": 0.9756184816360474, + "learning_rate": 0.00023014336186981927, + "loss": 3.7223, + "step": 90680 + }, + { + "epoch": 6.161502921592608, + "grad_norm": 1.3420512676239014, + "learning_rate": 0.00023010089686098655, + "loss": 3.4818, + "step": 90685 + }, + { + "epoch": 6.161842641663269, + "grad_norm": 0.9083986878395081, + "learning_rate": 0.00023005843185215383, + "loss": 3.3552, + "step": 90690 + }, + { + "epoch": 6.162182361733931, + "grad_norm": 0.9159302711486816, + "learning_rate": 0.0002300159668433211, + "loss": 3.6149, + "step": 90695 + }, + { + "epoch": 6.162522081804593, + "grad_norm": 1.3849040269851685, + "learning_rate": 0.0002299735018344884, + "loss": 3.3813, + "step": 90700 + }, + { + "epoch": 6.162861801875255, + "grad_norm": 0.8239520192146301, + "learning_rate": 0.00022993103682565568, + "loss": 3.6545, + "step": 90705 + }, + { + "epoch": 6.163201521945917, + "grad_norm": 1.6869722604751587, + "learning_rate": 0.00022988857181682293, + "loss": 3.476, + "step": 90710 + }, + { + "epoch": 6.163541242016579, + "grad_norm": 1.015556812286377, + "learning_rate": 0.00022984610680799024, + "loss": 3.4166, + "step": 90715 + }, + { + "epoch": 6.16388096208724, + "grad_norm": 0.972341775894165, + "learning_rate": 0.0002298036417991575, + "loss": 3.3475, + "step": 90720 + }, + { + "epoch": 6.164220682157902, + "grad_norm": 1.2568366527557373, + "learning_rate": 0.00022976117679032477, + "loss": 3.5222, + "step": 90725 + }, + { + "epoch": 6.164560402228564, + "grad_norm": 1.0953946113586426, + "learning_rate": 0.00022971871178149208, + "loss": 3.4936, + "step": 90730 + }, + { + "epoch": 6.164900122299225, + "grad_norm": 0.7961805462837219, + "learning_rate": 0.00022967624677265933, + "loss": 3.5694, + "step": 90735 + }, + { + "epoch": 6.165239842369887, + "grad_norm": 0.951301097869873, + "learning_rate": 0.0002296337817638266, + "loss": 3.4474, + "step": 90740 + }, + { + "epoch": 6.165579562440549, + "grad_norm": 1.1029353141784668, + "learning_rate": 0.0002295913167549939, + "loss": 3.3873, + "step": 90745 + }, + { + "epoch": 6.165919282511211, + "grad_norm": 1.2205414772033691, + "learning_rate": 0.00022954885174616117, + "loss": 3.4611, + "step": 90750 + }, + { + "epoch": 6.166259002581873, + "grad_norm": 0.8361888527870178, + "learning_rate": 0.00022950638673732845, + "loss": 3.4205, + "step": 90755 + }, + { + "epoch": 6.166598722652535, + "grad_norm": 0.8308649063110352, + "learning_rate": 0.00022946392172849573, + "loss": 3.197, + "step": 90760 + }, + { + "epoch": 6.166938442723196, + "grad_norm": 0.8065077662467957, + "learning_rate": 0.000229421456719663, + "loss": 3.2888, + "step": 90765 + }, + { + "epoch": 6.167278162793858, + "grad_norm": 1.0449708700180054, + "learning_rate": 0.0002293789917108303, + "loss": 3.3911, + "step": 90770 + }, + { + "epoch": 6.16761788286452, + "grad_norm": 0.9078471660614014, + "learning_rate": 0.00022933652670199757, + "loss": 3.3426, + "step": 90775 + }, + { + "epoch": 6.167957602935181, + "grad_norm": 1.2381271123886108, + "learning_rate": 0.00022929406169316482, + "loss": 3.6533, + "step": 90780 + }, + { + "epoch": 6.168297323005843, + "grad_norm": 0.609876275062561, + "learning_rate": 0.00022925159668433213, + "loss": 3.4738, + "step": 90785 + }, + { + "epoch": 6.1686370430765045, + "grad_norm": 1.072131872177124, + "learning_rate": 0.00022920913167549938, + "loss": 3.7083, + "step": 90790 + }, + { + "epoch": 6.168976763147167, + "grad_norm": 1.6489967107772827, + "learning_rate": 0.00022916666666666666, + "loss": 3.424, + "step": 90795 + }, + { + "epoch": 6.169316483217829, + "grad_norm": 0.8023058176040649, + "learning_rate": 0.00022912420165783397, + "loss": 3.3838, + "step": 90800 + }, + { + "epoch": 6.16965620328849, + "grad_norm": 0.792776882648468, + "learning_rate": 0.00022908173664900122, + "loss": 3.3896, + "step": 90805 + }, + { + "epoch": 6.169995923359152, + "grad_norm": 1.2686444520950317, + "learning_rate": 0.0002290392716401685, + "loss": 3.6213, + "step": 90810 + }, + { + "epoch": 6.170335643429814, + "grad_norm": 0.9108410477638245, + "learning_rate": 0.00022899680663133578, + "loss": 3.5843, + "step": 90815 + }, + { + "epoch": 6.170675363500475, + "grad_norm": 0.9510552287101746, + "learning_rate": 0.00022895434162250306, + "loss": 3.383, + "step": 90820 + }, + { + "epoch": 6.171015083571137, + "grad_norm": 0.7781757116317749, + "learning_rate": 0.00022891187661367032, + "loss": 3.2231, + "step": 90825 + }, + { + "epoch": 6.171354803641799, + "grad_norm": 0.8636781573295593, + "learning_rate": 0.00022886941160483762, + "loss": 3.2765, + "step": 90830 + }, + { + "epoch": 6.1716945237124605, + "grad_norm": 1.0530990362167358, + "learning_rate": 0.0002288269465960049, + "loss": 3.3472, + "step": 90835 + }, + { + "epoch": 6.172034243783123, + "grad_norm": 0.9812278747558594, + "learning_rate": 0.00022878448158717218, + "loss": 3.4397, + "step": 90840 + }, + { + "epoch": 6.172373963853785, + "grad_norm": 1.0921605825424194, + "learning_rate": 0.00022874201657833946, + "loss": 3.2746, + "step": 90845 + }, + { + "epoch": 6.172713683924446, + "grad_norm": 0.8617043495178223, + "learning_rate": 0.00022869955156950672, + "loss": 3.4125, + "step": 90850 + }, + { + "epoch": 6.173053403995108, + "grad_norm": 0.8676621913909912, + "learning_rate": 0.00022865708656067402, + "loss": 3.3734, + "step": 90855 + }, + { + "epoch": 6.17339312406577, + "grad_norm": 0.8875109553337097, + "learning_rate": 0.00022861462155184128, + "loss": 3.2334, + "step": 90860 + }, + { + "epoch": 6.173732844136431, + "grad_norm": 0.8561227917671204, + "learning_rate": 0.00022857215654300856, + "loss": 3.4377, + "step": 90865 + }, + { + "epoch": 6.174072564207093, + "grad_norm": 0.8149018883705139, + "learning_rate": 0.00022852969153417586, + "loss": 3.3183, + "step": 90870 + }, + { + "epoch": 6.174412284277755, + "grad_norm": 0.7822860479354858, + "learning_rate": 0.00022848722652534312, + "loss": 3.0844, + "step": 90875 + }, + { + "epoch": 6.1747520043484165, + "grad_norm": 1.504045844078064, + "learning_rate": 0.0002284447615165104, + "loss": 3.3865, + "step": 90880 + }, + { + "epoch": 6.175091724419079, + "grad_norm": 1.1955277919769287, + "learning_rate": 0.00022840229650767768, + "loss": 3.4579, + "step": 90885 + }, + { + "epoch": 6.175431444489741, + "grad_norm": 0.8750812411308289, + "learning_rate": 0.00022835983149884496, + "loss": 3.3873, + "step": 90890 + }, + { + "epoch": 6.175771164560402, + "grad_norm": 0.9748710989952087, + "learning_rate": 0.00022831736649001224, + "loss": 3.4608, + "step": 90895 + }, + { + "epoch": 6.176110884631064, + "grad_norm": 0.9226523041725159, + "learning_rate": 0.00022827490148117952, + "loss": 3.3679, + "step": 90900 + }, + { + "epoch": 6.176450604701726, + "grad_norm": 0.8842809796333313, + "learning_rate": 0.0002282324364723468, + "loss": 3.2521, + "step": 90905 + }, + { + "epoch": 6.176790324772387, + "grad_norm": 0.806590735912323, + "learning_rate": 0.00022818997146351405, + "loss": 3.3674, + "step": 90910 + }, + { + "epoch": 6.177130044843049, + "grad_norm": 0.8449601531028748, + "learning_rate": 0.00022814750645468136, + "loss": 3.3228, + "step": 90915 + }, + { + "epoch": 6.177469764913711, + "grad_norm": 0.9815288782119751, + "learning_rate": 0.0002281050414458486, + "loss": 3.4448, + "step": 90920 + }, + { + "epoch": 6.1778094849843725, + "grad_norm": 0.9330326914787292, + "learning_rate": 0.00022806257643701592, + "loss": 3.3845, + "step": 90925 + }, + { + "epoch": 6.178149205055035, + "grad_norm": 0.8131840229034424, + "learning_rate": 0.0002280201114281832, + "loss": 3.4934, + "step": 90930 + }, + { + "epoch": 6.178488925125697, + "grad_norm": 1.0041674375534058, + "learning_rate": 0.00022797764641935045, + "loss": 3.4795, + "step": 90935 + }, + { + "epoch": 6.178828645196358, + "grad_norm": 0.7152838110923767, + "learning_rate": 0.00022793518141051776, + "loss": 3.489, + "step": 90940 + }, + { + "epoch": 6.17916836526702, + "grad_norm": 0.9374152421951294, + "learning_rate": 0.000227892716401685, + "loss": 3.3096, + "step": 90945 + }, + { + "epoch": 6.179508085337682, + "grad_norm": 1.0595180988311768, + "learning_rate": 0.0002278502513928523, + "loss": 3.2689, + "step": 90950 + }, + { + "epoch": 6.179847805408343, + "grad_norm": 2.7342751026153564, + "learning_rate": 0.00022780778638401957, + "loss": 3.4046, + "step": 90955 + }, + { + "epoch": 6.180187525479005, + "grad_norm": 1.331857681274414, + "learning_rate": 0.00022776532137518685, + "loss": 3.3797, + "step": 90960 + }, + { + "epoch": 6.180527245549667, + "grad_norm": 1.067376971244812, + "learning_rate": 0.00022772285636635413, + "loss": 3.3421, + "step": 90965 + }, + { + "epoch": 6.180866965620329, + "grad_norm": 1.1251713037490845, + "learning_rate": 0.0002276803913575214, + "loss": 3.6443, + "step": 90970 + }, + { + "epoch": 6.181206685690991, + "grad_norm": 0.877314567565918, + "learning_rate": 0.0002276379263486887, + "loss": 3.4889, + "step": 90975 + }, + { + "epoch": 6.181546405761653, + "grad_norm": 1.1965522766113281, + "learning_rate": 0.00022759546133985594, + "loss": 3.7139, + "step": 90980 + }, + { + "epoch": 6.181886125832314, + "grad_norm": 0.9120901226997375, + "learning_rate": 0.00022755299633102325, + "loss": 3.1575, + "step": 90985 + }, + { + "epoch": 6.182225845902976, + "grad_norm": 0.9394571185112, + "learning_rate": 0.0002275105313221905, + "loss": 3.503, + "step": 90990 + }, + { + "epoch": 6.182565565973638, + "grad_norm": 0.9059333801269531, + "learning_rate": 0.00022746806631335778, + "loss": 3.4965, + "step": 90995 + }, + { + "epoch": 6.182905286044299, + "grad_norm": 1.1654589176177979, + "learning_rate": 0.0002274256013045251, + "loss": 3.6565, + "step": 91000 + }, + { + "epoch": 6.183245006114961, + "grad_norm": 0.8577856421470642, + "learning_rate": 0.00022738313629569234, + "loss": 3.3023, + "step": 91005 + }, + { + "epoch": 6.183584726185623, + "grad_norm": 0.8289652466773987, + "learning_rate": 0.00022734067128685965, + "loss": 3.3903, + "step": 91010 + }, + { + "epoch": 6.183924446256285, + "grad_norm": 0.8599313497543335, + "learning_rate": 0.0002272982062780269, + "loss": 3.5129, + "step": 91015 + }, + { + "epoch": 6.184264166326947, + "grad_norm": 0.7330402135848999, + "learning_rate": 0.00022725574126919418, + "loss": 3.2703, + "step": 91020 + }, + { + "epoch": 6.184603886397609, + "grad_norm": 0.8935593962669373, + "learning_rate": 0.0002272132762603615, + "loss": 3.5292, + "step": 91025 + }, + { + "epoch": 6.18494360646827, + "grad_norm": 1.134338617324829, + "learning_rate": 0.00022717081125152874, + "loss": 3.3835, + "step": 91030 + }, + { + "epoch": 6.185283326538932, + "grad_norm": 0.8753663897514343, + "learning_rate": 0.00022712834624269602, + "loss": 3.3454, + "step": 91035 + }, + { + "epoch": 6.185623046609594, + "grad_norm": 1.0329738855361938, + "learning_rate": 0.0002270858812338633, + "loss": 3.2147, + "step": 91040 + }, + { + "epoch": 6.185962766680255, + "grad_norm": 0.9739900231361389, + "learning_rate": 0.00022704341622503058, + "loss": 3.48, + "step": 91045 + }, + { + "epoch": 6.186302486750917, + "grad_norm": 0.6155090928077698, + "learning_rate": 0.00022700095121619784, + "loss": 3.6013, + "step": 91050 + }, + { + "epoch": 6.186642206821579, + "grad_norm": 0.9083206057548523, + "learning_rate": 0.00022695848620736515, + "loss": 3.7035, + "step": 91055 + }, + { + "epoch": 6.186981926892241, + "grad_norm": 0.8156462907791138, + "learning_rate": 0.00022691602119853243, + "loss": 3.1165, + "step": 91060 + }, + { + "epoch": 6.187321646962903, + "grad_norm": 1.180245280265808, + "learning_rate": 0.00022687355618969968, + "loss": 3.5456, + "step": 91065 + }, + { + "epoch": 6.187661367033565, + "grad_norm": 0.7402189373970032, + "learning_rate": 0.00022683109118086699, + "loss": 3.27, + "step": 91070 + }, + { + "epoch": 6.188001087104226, + "grad_norm": 0.9720277190208435, + "learning_rate": 0.00022678862617203424, + "loss": 3.3372, + "step": 91075 + }, + { + "epoch": 6.188340807174888, + "grad_norm": 0.7642938494682312, + "learning_rate": 0.00022674616116320152, + "loss": 3.4366, + "step": 91080 + }, + { + "epoch": 6.18868052724555, + "grad_norm": 1.2094368934631348, + "learning_rate": 0.0002267036961543688, + "loss": 3.3638, + "step": 91085 + }, + { + "epoch": 6.189020247316211, + "grad_norm": 1.2439758777618408, + "learning_rate": 0.00022666123114553608, + "loss": 3.4708, + "step": 91090 + }, + { + "epoch": 6.189359967386873, + "grad_norm": 0.7652945518493652, + "learning_rate": 0.00022661876613670339, + "loss": 3.6123, + "step": 91095 + }, + { + "epoch": 6.189699687457535, + "grad_norm": 1.1804139614105225, + "learning_rate": 0.00022657630112787064, + "loss": 3.3041, + "step": 91100 + }, + { + "epoch": 6.190039407528197, + "grad_norm": 1.068899393081665, + "learning_rate": 0.00022653383611903792, + "loss": 3.3226, + "step": 91105 + }, + { + "epoch": 6.190379127598859, + "grad_norm": 0.7237693667411804, + "learning_rate": 0.0002264913711102052, + "loss": 3.6683, + "step": 91110 + }, + { + "epoch": 6.190718847669521, + "grad_norm": 0.9038435220718384, + "learning_rate": 0.00022644890610137248, + "loss": 3.625, + "step": 91115 + }, + { + "epoch": 6.191058567740182, + "grad_norm": 1.148708462715149, + "learning_rate": 0.00022640644109253973, + "loss": 3.4798, + "step": 91120 + }, + { + "epoch": 6.191398287810844, + "grad_norm": 1.1421542167663574, + "learning_rate": 0.00022636397608370704, + "loss": 3.4862, + "step": 91125 + }, + { + "epoch": 6.191738007881506, + "grad_norm": 0.987495481967926, + "learning_rate": 0.00022632151107487432, + "loss": 3.2702, + "step": 91130 + }, + { + "epoch": 6.192077727952167, + "grad_norm": 0.8045031428337097, + "learning_rate": 0.00022627904606604157, + "loss": 3.6225, + "step": 91135 + }, + { + "epoch": 6.192417448022829, + "grad_norm": 1.147830843925476, + "learning_rate": 0.00022623658105720888, + "loss": 3.6958, + "step": 91140 + }, + { + "epoch": 6.192757168093491, + "grad_norm": 1.1632329225540161, + "learning_rate": 0.00022619411604837613, + "loss": 3.03, + "step": 91145 + }, + { + "epoch": 6.193096888164153, + "grad_norm": 0.7061001658439636, + "learning_rate": 0.0002261516510395434, + "loss": 3.2914, + "step": 91150 + }, + { + "epoch": 6.193436608234815, + "grad_norm": 0.839911162853241, + "learning_rate": 0.00022610918603071072, + "loss": 3.5329, + "step": 91155 + }, + { + "epoch": 6.193776328305476, + "grad_norm": 0.8845669627189636, + "learning_rate": 0.00022606672102187797, + "loss": 3.456, + "step": 91160 + }, + { + "epoch": 6.194116048376138, + "grad_norm": 0.9027566313743591, + "learning_rate": 0.00022602425601304525, + "loss": 3.3874, + "step": 91165 + }, + { + "epoch": 6.1944557684468, + "grad_norm": 0.7994449138641357, + "learning_rate": 0.00022598179100421253, + "loss": 3.5082, + "step": 91170 + }, + { + "epoch": 6.194795488517461, + "grad_norm": 0.9012688994407654, + "learning_rate": 0.0002259393259953798, + "loss": 3.2746, + "step": 91175 + }, + { + "epoch": 6.195135208588123, + "grad_norm": 0.878523588180542, + "learning_rate": 0.0002258968609865471, + "loss": 3.6349, + "step": 91180 + }, + { + "epoch": 6.195474928658785, + "grad_norm": 1.2839632034301758, + "learning_rate": 0.00022585439597771437, + "loss": 3.405, + "step": 91185 + }, + { + "epoch": 6.1958146487294465, + "grad_norm": 0.8296350240707397, + "learning_rate": 0.00022581193096888165, + "loss": 3.4263, + "step": 91190 + }, + { + "epoch": 6.196154368800109, + "grad_norm": 0.7981099486351013, + "learning_rate": 0.00022576946596004893, + "loss": 3.3874, + "step": 91195 + }, + { + "epoch": 6.196494088870771, + "grad_norm": 0.9060725569725037, + "learning_rate": 0.0002257270009512162, + "loss": 3.6813, + "step": 91200 + }, + { + "epoch": 6.196833808941432, + "grad_norm": 0.8987566232681274, + "learning_rate": 0.00022568453594238347, + "loss": 3.3216, + "step": 91205 + }, + { + "epoch": 6.197173529012094, + "grad_norm": 1.1570595502853394, + "learning_rate": 0.00022564207093355077, + "loss": 3.2614, + "step": 91210 + }, + { + "epoch": 6.197513249082756, + "grad_norm": 0.846757173538208, + "learning_rate": 0.00022559960592471803, + "loss": 3.4625, + "step": 91215 + }, + { + "epoch": 6.197852969153417, + "grad_norm": 0.9718948006629944, + "learning_rate": 0.0002255571409158853, + "loss": 3.2769, + "step": 91220 + }, + { + "epoch": 6.198192689224079, + "grad_norm": 0.9171184301376343, + "learning_rate": 0.0002255146759070526, + "loss": 3.5414, + "step": 91225 + }, + { + "epoch": 6.198532409294741, + "grad_norm": 0.9366427063941956, + "learning_rate": 0.00022547221089821987, + "loss": 3.4699, + "step": 91230 + }, + { + "epoch": 6.1988721293654026, + "grad_norm": 0.853413999080658, + "learning_rate": 0.00022542974588938715, + "loss": 3.3494, + "step": 91235 + }, + { + "epoch": 6.199211849436065, + "grad_norm": 0.9134336709976196, + "learning_rate": 0.00022538728088055443, + "loss": 3.416, + "step": 91240 + }, + { + "epoch": 6.199551569506727, + "grad_norm": 1.0363861322402954, + "learning_rate": 0.0002253448158717217, + "loss": 3.4914, + "step": 91245 + }, + { + "epoch": 6.199891289577388, + "grad_norm": 1.0172148942947388, + "learning_rate": 0.00022530235086288896, + "loss": 3.4491, + "step": 91250 + }, + { + "epoch": 6.20023100964805, + "grad_norm": 0.8406175374984741, + "learning_rate": 0.00022525988585405627, + "loss": 3.1341, + "step": 91255 + }, + { + "epoch": 6.200570729718712, + "grad_norm": 0.9764630198478699, + "learning_rate": 0.00022521742084522355, + "loss": 3.3033, + "step": 91260 + }, + { + "epoch": 6.200910449789373, + "grad_norm": 0.8379940390586853, + "learning_rate": 0.00022517495583639083, + "loss": 3.4601, + "step": 91265 + }, + { + "epoch": 6.201250169860035, + "grad_norm": 1.06005859375, + "learning_rate": 0.0002251324908275581, + "loss": 3.3426, + "step": 91270 + }, + { + "epoch": 6.201589889930697, + "grad_norm": 0.7448114156723022, + "learning_rate": 0.00022509002581872536, + "loss": 3.3472, + "step": 91275 + }, + { + "epoch": 6.201929610001359, + "grad_norm": 0.6807951927185059, + "learning_rate": 0.00022504756080989267, + "loss": 3.4719, + "step": 91280 + }, + { + "epoch": 6.202269330072021, + "grad_norm": 0.8865213990211487, + "learning_rate": 0.00022500509580105992, + "loss": 3.1203, + "step": 91285 + }, + { + "epoch": 6.202609050142683, + "grad_norm": 0.8428248763084412, + "learning_rate": 0.0002249626307922272, + "loss": 3.4328, + "step": 91290 + }, + { + "epoch": 6.202948770213344, + "grad_norm": 0.9051187038421631, + "learning_rate": 0.0002249201657833945, + "loss": 3.6077, + "step": 91295 + }, + { + "epoch": 6.203288490284006, + "grad_norm": 1.202512502670288, + "learning_rate": 0.00022487770077456176, + "loss": 3.02, + "step": 91300 + }, + { + "epoch": 6.203628210354668, + "grad_norm": 0.8291865587234497, + "learning_rate": 0.00022483523576572904, + "loss": 3.4273, + "step": 91305 + }, + { + "epoch": 6.203967930425329, + "grad_norm": 0.8538410067558289, + "learning_rate": 0.00022479277075689632, + "loss": 3.3488, + "step": 91310 + }, + { + "epoch": 6.204307650495991, + "grad_norm": 0.7664047479629517, + "learning_rate": 0.0002247503057480636, + "loss": 3.2914, + "step": 91315 + }, + { + "epoch": 6.204647370566653, + "grad_norm": 0.9875375628471375, + "learning_rate": 0.00022470784073923088, + "loss": 3.5663, + "step": 91320 + }, + { + "epoch": 6.204987090637315, + "grad_norm": 1.0755549669265747, + "learning_rate": 0.00022466537573039816, + "loss": 3.2113, + "step": 91325 + }, + { + "epoch": 6.205326810707977, + "grad_norm": 0.7628702521324158, + "learning_rate": 0.00022462291072156544, + "loss": 3.4394, + "step": 91330 + }, + { + "epoch": 6.205666530778639, + "grad_norm": 0.8117073774337769, + "learning_rate": 0.0002245804457127327, + "loss": 3.4195, + "step": 91335 + }, + { + "epoch": 6.2060062508493, + "grad_norm": 0.834444522857666, + "learning_rate": 0.0002245379807039, + "loss": 3.3218, + "step": 91340 + }, + { + "epoch": 6.206345970919962, + "grad_norm": 1.2743384838104248, + "learning_rate": 0.00022449551569506725, + "loss": 3.4749, + "step": 91345 + }, + { + "epoch": 6.206685690990624, + "grad_norm": 1.0079421997070312, + "learning_rate": 0.00022445305068623456, + "loss": 3.1897, + "step": 91350 + }, + { + "epoch": 6.207025411061285, + "grad_norm": 1.0584826469421387, + "learning_rate": 0.00022441058567740184, + "loss": 3.3838, + "step": 91355 + }, + { + "epoch": 6.207365131131947, + "grad_norm": 0.8246873617172241, + "learning_rate": 0.0002243681206685691, + "loss": 3.3076, + "step": 91360 + }, + { + "epoch": 6.207704851202609, + "grad_norm": 1.0455621480941772, + "learning_rate": 0.0002243256556597364, + "loss": 3.3057, + "step": 91365 + }, + { + "epoch": 6.208044571273271, + "grad_norm": 1.035314679145813, + "learning_rate": 0.00022428319065090365, + "loss": 3.6494, + "step": 91370 + }, + { + "epoch": 6.208384291343933, + "grad_norm": 0.825702965259552, + "learning_rate": 0.00022424072564207093, + "loss": 3.6726, + "step": 91375 + }, + { + "epoch": 6.208724011414595, + "grad_norm": 1.3194923400878906, + "learning_rate": 0.00022419826063323821, + "loss": 3.4597, + "step": 91380 + }, + { + "epoch": 6.209063731485256, + "grad_norm": 0.8092346787452698, + "learning_rate": 0.0002241557956244055, + "loss": 3.3854, + "step": 91385 + }, + { + "epoch": 6.209403451555918, + "grad_norm": 0.8579967021942139, + "learning_rate": 0.00022411333061557277, + "loss": 3.3632, + "step": 91390 + }, + { + "epoch": 6.20974317162658, + "grad_norm": 0.8843951225280762, + "learning_rate": 0.00022407086560674005, + "loss": 3.3738, + "step": 91395 + }, + { + "epoch": 6.210082891697241, + "grad_norm": 0.7258933782577515, + "learning_rate": 0.00022402840059790733, + "loss": 3.3564, + "step": 91400 + }, + { + "epoch": 6.210422611767903, + "grad_norm": 0.7307767271995544, + "learning_rate": 0.0002239859355890746, + "loss": 3.4801, + "step": 91405 + }, + { + "epoch": 6.210762331838565, + "grad_norm": 0.7627232670783997, + "learning_rate": 0.0002239434705802419, + "loss": 3.2701, + "step": 91410 + }, + { + "epoch": 6.211102051909227, + "grad_norm": 0.9305933713912964, + "learning_rate": 0.00022390100557140915, + "loss": 3.4801, + "step": 91415 + }, + { + "epoch": 6.211441771979889, + "grad_norm": 0.8837218284606934, + "learning_rate": 0.00022385854056257643, + "loss": 3.5138, + "step": 91420 + }, + { + "epoch": 6.211781492050551, + "grad_norm": 0.8861519694328308, + "learning_rate": 0.00022381607555374374, + "loss": 3.4366, + "step": 91425 + }, + { + "epoch": 6.212121212121212, + "grad_norm": 1.0739165544509888, + "learning_rate": 0.000223773610544911, + "loss": 3.2385, + "step": 91430 + }, + { + "epoch": 6.212460932191874, + "grad_norm": 0.8095664978027344, + "learning_rate": 0.0002237311455360783, + "loss": 3.5851, + "step": 91435 + }, + { + "epoch": 6.212800652262536, + "grad_norm": 0.8961520791053772, + "learning_rate": 0.00022368868052724555, + "loss": 3.6856, + "step": 91440 + }, + { + "epoch": 6.213140372333197, + "grad_norm": 0.7930666208267212, + "learning_rate": 0.00022364621551841283, + "loss": 3.3943, + "step": 91445 + }, + { + "epoch": 6.213480092403859, + "grad_norm": 0.9365860819816589, + "learning_rate": 0.00022360375050958014, + "loss": 3.4487, + "step": 91450 + }, + { + "epoch": 6.213819812474521, + "grad_norm": 1.2209022045135498, + "learning_rate": 0.0002235612855007474, + "loss": 3.5706, + "step": 91455 + }, + { + "epoch": 6.214159532545183, + "grad_norm": 0.885322630405426, + "learning_rate": 0.00022351882049191467, + "loss": 3.6734, + "step": 91460 + }, + { + "epoch": 6.214499252615845, + "grad_norm": 1.0646679401397705, + "learning_rate": 0.00022347635548308195, + "loss": 3.4408, + "step": 91465 + }, + { + "epoch": 6.214838972686506, + "grad_norm": 1.0244619846343994, + "learning_rate": 0.00022343389047424923, + "loss": 3.3037, + "step": 91470 + }, + { + "epoch": 6.215178692757168, + "grad_norm": 0.7880463004112244, + "learning_rate": 0.00022339142546541648, + "loss": 3.3484, + "step": 91475 + }, + { + "epoch": 6.21551841282783, + "grad_norm": 0.8781746625900269, + "learning_rate": 0.0002233489604565838, + "loss": 3.2149, + "step": 91480 + }, + { + "epoch": 6.215858132898491, + "grad_norm": 1.1935125589370728, + "learning_rate": 0.00022330649544775107, + "loss": 3.4278, + "step": 91485 + }, + { + "epoch": 6.216197852969153, + "grad_norm": 1.0864572525024414, + "learning_rate": 0.00022326403043891832, + "loss": 3.6548, + "step": 91490 + }, + { + "epoch": 6.216537573039815, + "grad_norm": 0.8211222887039185, + "learning_rate": 0.00022322156543008563, + "loss": 3.557, + "step": 91495 + }, + { + "epoch": 6.2168772931104765, + "grad_norm": 0.8641924858093262, + "learning_rate": 0.00022317910042125288, + "loss": 3.4254, + "step": 91500 + }, + { + "epoch": 6.217217013181139, + "grad_norm": 0.7857369184494019, + "learning_rate": 0.00022313663541242016, + "loss": 3.3709, + "step": 91505 + }, + { + "epoch": 6.217556733251801, + "grad_norm": 1.0769623517990112, + "learning_rate": 0.00022309417040358744, + "loss": 3.056, + "step": 91510 + }, + { + "epoch": 6.217896453322462, + "grad_norm": 0.9507484436035156, + "learning_rate": 0.00022305170539475472, + "loss": 3.1083, + "step": 91515 + }, + { + "epoch": 6.218236173393124, + "grad_norm": 0.9253087043762207, + "learning_rate": 0.00022300924038592203, + "loss": 3.1753, + "step": 91520 + }, + { + "epoch": 6.218575893463786, + "grad_norm": 1.1956148147583008, + "learning_rate": 0.00022296677537708928, + "loss": 3.5123, + "step": 91525 + }, + { + "epoch": 6.218915613534447, + "grad_norm": 1.0152621269226074, + "learning_rate": 0.00022292431036825656, + "loss": 3.4738, + "step": 91530 + }, + { + "epoch": 6.219255333605109, + "grad_norm": 1.0135000944137573, + "learning_rate": 0.00022288184535942384, + "loss": 3.2429, + "step": 91535 + }, + { + "epoch": 6.219595053675771, + "grad_norm": 0.9694605469703674, + "learning_rate": 0.00022283938035059112, + "loss": 3.5334, + "step": 91540 + }, + { + "epoch": 6.2199347737464326, + "grad_norm": 0.9992201924324036, + "learning_rate": 0.00022279691534175838, + "loss": 3.365, + "step": 91545 + }, + { + "epoch": 6.220274493817095, + "grad_norm": 0.798359751701355, + "learning_rate": 0.00022275445033292568, + "loss": 3.3922, + "step": 91550 + }, + { + "epoch": 6.220614213887757, + "grad_norm": 0.8703809976577759, + "learning_rate": 0.00022271198532409296, + "loss": 3.5445, + "step": 91555 + }, + { + "epoch": 6.220953933958418, + "grad_norm": 0.8258002400398254, + "learning_rate": 0.00022266952031526022, + "loss": 3.5229, + "step": 91560 + }, + { + "epoch": 6.22129365402908, + "grad_norm": 0.8651502728462219, + "learning_rate": 0.00022262705530642752, + "loss": 3.4356, + "step": 91565 + }, + { + "epoch": 6.221633374099742, + "grad_norm": 0.969124972820282, + "learning_rate": 0.00022258459029759478, + "loss": 3.29, + "step": 91570 + }, + { + "epoch": 6.221973094170403, + "grad_norm": 0.8256683349609375, + "learning_rate": 0.00022254212528876206, + "loss": 3.4097, + "step": 91575 + }, + { + "epoch": 6.222312814241065, + "grad_norm": 1.092862606048584, + "learning_rate": 0.00022249966027992936, + "loss": 3.5636, + "step": 91580 + }, + { + "epoch": 6.222652534311727, + "grad_norm": 0.9983499050140381, + "learning_rate": 0.00022245719527109662, + "loss": 3.349, + "step": 91585 + }, + { + "epoch": 6.222992254382389, + "grad_norm": 0.8028711676597595, + "learning_rate": 0.0002224147302622639, + "loss": 3.297, + "step": 91590 + }, + { + "epoch": 6.223331974453051, + "grad_norm": 0.7972126603126526, + "learning_rate": 0.00022237226525343118, + "loss": 3.3017, + "step": 91595 + }, + { + "epoch": 6.223671694523713, + "grad_norm": 0.9556319713592529, + "learning_rate": 0.00022232980024459846, + "loss": 3.4634, + "step": 91600 + }, + { + "epoch": 6.224011414594374, + "grad_norm": 0.8550904393196106, + "learning_rate": 0.00022228733523576574, + "loss": 3.339, + "step": 91605 + }, + { + "epoch": 6.224351134665036, + "grad_norm": 0.9310145974159241, + "learning_rate": 0.00022224487022693302, + "loss": 3.1853, + "step": 91610 + }, + { + "epoch": 6.224690854735698, + "grad_norm": 0.9198402762413025, + "learning_rate": 0.0002222024052181003, + "loss": 3.654, + "step": 91615 + }, + { + "epoch": 6.225030574806359, + "grad_norm": 1.0754395723342896, + "learning_rate": 0.00022215994020926758, + "loss": 3.3831, + "step": 91620 + }, + { + "epoch": 6.225370294877021, + "grad_norm": 0.934924840927124, + "learning_rate": 0.00022211747520043486, + "loss": 3.6669, + "step": 91625 + }, + { + "epoch": 6.225710014947683, + "grad_norm": 0.822730302810669, + "learning_rate": 0.0002220750101916021, + "loss": 3.2849, + "step": 91630 + }, + { + "epoch": 6.226049735018345, + "grad_norm": 0.6813287138938904, + "learning_rate": 0.00022203254518276942, + "loss": 3.482, + "step": 91635 + }, + { + "epoch": 6.226389455089007, + "grad_norm": 1.1668916940689087, + "learning_rate": 0.00022199008017393667, + "loss": 3.3086, + "step": 91640 + }, + { + "epoch": 6.226729175159669, + "grad_norm": 0.9124662280082703, + "learning_rate": 0.00022194761516510395, + "loss": 3.734, + "step": 91645 + }, + { + "epoch": 6.22706889523033, + "grad_norm": 0.87676602602005, + "learning_rate": 0.00022190515015627126, + "loss": 3.1595, + "step": 91650 + }, + { + "epoch": 6.227408615300992, + "grad_norm": 4.999734878540039, + "learning_rate": 0.0002218626851474385, + "loss": 3.1705, + "step": 91655 + }, + { + "epoch": 6.227748335371654, + "grad_norm": 0.9556847214698792, + "learning_rate": 0.0002218202201386058, + "loss": 3.3433, + "step": 91660 + }, + { + "epoch": 6.228088055442315, + "grad_norm": 0.841624915599823, + "learning_rate": 0.00022177775512977307, + "loss": 3.4259, + "step": 91665 + }, + { + "epoch": 6.228427775512977, + "grad_norm": 0.9791463017463684, + "learning_rate": 0.00022173529012094035, + "loss": 3.456, + "step": 91670 + }, + { + "epoch": 6.228767495583639, + "grad_norm": 1.0932642221450806, + "learning_rate": 0.0002216928251121076, + "loss": 3.3746, + "step": 91675 + }, + { + "epoch": 6.229107215654301, + "grad_norm": 0.9656425714492798, + "learning_rate": 0.0002216503601032749, + "loss": 3.3377, + "step": 91680 + }, + { + "epoch": 6.229446935724963, + "grad_norm": 0.893280029296875, + "learning_rate": 0.0002216078950944422, + "loss": 3.4617, + "step": 91685 + }, + { + "epoch": 6.229786655795625, + "grad_norm": 1.011065125465393, + "learning_rate": 0.00022156543008560947, + "loss": 3.448, + "step": 91690 + }, + { + "epoch": 6.230126375866286, + "grad_norm": 1.1081904172897339, + "learning_rate": 0.00022152296507677675, + "loss": 3.3425, + "step": 91695 + }, + { + "epoch": 6.230466095936948, + "grad_norm": 0.8494454622268677, + "learning_rate": 0.000221480500067944, + "loss": 3.3138, + "step": 91700 + }, + { + "epoch": 6.23080581600761, + "grad_norm": 0.8265113830566406, + "learning_rate": 0.0002214380350591113, + "loss": 3.2435, + "step": 91705 + }, + { + "epoch": 6.231145536078271, + "grad_norm": 0.9951688647270203, + "learning_rate": 0.00022139557005027856, + "loss": 3.3001, + "step": 91710 + }, + { + "epoch": 6.231485256148933, + "grad_norm": 0.8099470138549805, + "learning_rate": 0.00022135310504144584, + "loss": 3.7488, + "step": 91715 + }, + { + "epoch": 6.231824976219595, + "grad_norm": 1.5504064559936523, + "learning_rate": 0.00022131064003261315, + "loss": 3.4587, + "step": 91720 + }, + { + "epoch": 6.232164696290257, + "grad_norm": 0.8678792715072632, + "learning_rate": 0.0002212681750237804, + "loss": 3.3922, + "step": 91725 + }, + { + "epoch": 6.232504416360919, + "grad_norm": 0.7987314462661743, + "learning_rate": 0.00022122571001494768, + "loss": 3.1839, + "step": 91730 + }, + { + "epoch": 6.232844136431581, + "grad_norm": 1.0242409706115723, + "learning_rate": 0.00022118324500611496, + "loss": 3.6237, + "step": 91735 + }, + { + "epoch": 6.233183856502242, + "grad_norm": 0.9599498510360718, + "learning_rate": 0.00022114077999728224, + "loss": 3.4062, + "step": 91740 + }, + { + "epoch": 6.233523576572904, + "grad_norm": 1.1206934452056885, + "learning_rate": 0.00022109831498844952, + "loss": 3.4262, + "step": 91745 + }, + { + "epoch": 6.233863296643566, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002210558499796168, + "loss": 3.4235, + "step": 91750 + }, + { + "epoch": 6.234203016714227, + "grad_norm": 0.8300129175186157, + "learning_rate": 0.00022101338497078408, + "loss": 3.3684, + "step": 91755 + }, + { + "epoch": 6.234542736784889, + "grad_norm": 0.9084644317626953, + "learning_rate": 0.00022097091996195134, + "loss": 3.4445, + "step": 91760 + }, + { + "epoch": 6.234882456855551, + "grad_norm": 0.9162256717681885, + "learning_rate": 0.00022092845495311865, + "loss": 3.2965, + "step": 91765 + }, + { + "epoch": 6.235222176926213, + "grad_norm": 0.7736665606498718, + "learning_rate": 0.0002208859899442859, + "loss": 3.4637, + "step": 91770 + }, + { + "epoch": 6.235561896996875, + "grad_norm": 1.0706946849822998, + "learning_rate": 0.0002208435249354532, + "loss": 3.4026, + "step": 91775 + }, + { + "epoch": 6.235901617067537, + "grad_norm": 1.0208252668380737, + "learning_rate": 0.00022080105992662049, + "loss": 3.6736, + "step": 91780 + }, + { + "epoch": 6.236241337138198, + "grad_norm": 0.8872512578964233, + "learning_rate": 0.00022075859491778774, + "loss": 3.2905, + "step": 91785 + }, + { + "epoch": 6.23658105720886, + "grad_norm": 1.3399590253829956, + "learning_rate": 0.00022071612990895505, + "loss": 3.4121, + "step": 91790 + }, + { + "epoch": 6.236920777279522, + "grad_norm": 0.8424526453018188, + "learning_rate": 0.0002206736649001223, + "loss": 3.3709, + "step": 91795 + }, + { + "epoch": 6.237260497350183, + "grad_norm": 0.8454508781433105, + "learning_rate": 0.00022063119989128958, + "loss": 3.7407, + "step": 91800 + }, + { + "epoch": 6.237600217420845, + "grad_norm": 0.8919732570648193, + "learning_rate": 0.00022058873488245686, + "loss": 3.5869, + "step": 91805 + }, + { + "epoch": 6.237939937491507, + "grad_norm": 1.158919334411621, + "learning_rate": 0.00022054626987362414, + "loss": 3.3823, + "step": 91810 + }, + { + "epoch": 6.238279657562169, + "grad_norm": 0.9865304231643677, + "learning_rate": 0.00022050380486479142, + "loss": 3.4827, + "step": 91815 + }, + { + "epoch": 6.238619377632831, + "grad_norm": 1.0933358669281006, + "learning_rate": 0.0002204613398559587, + "loss": 3.375, + "step": 91820 + }, + { + "epoch": 6.238959097703493, + "grad_norm": 0.7884363532066345, + "learning_rate": 0.00022041887484712598, + "loss": 3.2793, + "step": 91825 + }, + { + "epoch": 6.239298817774154, + "grad_norm": 0.6851263046264648, + "learning_rate": 0.00022037640983829323, + "loss": 3.0372, + "step": 91830 + }, + { + "epoch": 6.239638537844816, + "grad_norm": 0.8951693177223206, + "learning_rate": 0.00022033394482946054, + "loss": 3.2119, + "step": 91835 + }, + { + "epoch": 6.239978257915477, + "grad_norm": 1.0628917217254639, + "learning_rate": 0.0002202914798206278, + "loss": 3.2184, + "step": 91840 + }, + { + "epoch": 6.240317977986139, + "grad_norm": 0.8804115056991577, + "learning_rate": 0.00022024901481179507, + "loss": 3.5346, + "step": 91845 + }, + { + "epoch": 6.240657698056801, + "grad_norm": 0.8688399791717529, + "learning_rate": 0.00022020654980296238, + "loss": 3.4398, + "step": 91850 + }, + { + "epoch": 6.240997418127463, + "grad_norm": 1.098998785018921, + "learning_rate": 0.00022016408479412963, + "loss": 3.385, + "step": 91855 + }, + { + "epoch": 6.241337138198125, + "grad_norm": 1.6745902299880981, + "learning_rate": 0.00022012161978529694, + "loss": 3.5234, + "step": 91860 + }, + { + "epoch": 6.241676858268787, + "grad_norm": 1.017745018005371, + "learning_rate": 0.0002200791547764642, + "loss": 3.4028, + "step": 91865 + }, + { + "epoch": 6.242016578339448, + "grad_norm": 0.9590489268302917, + "learning_rate": 0.00022003668976763147, + "loss": 3.173, + "step": 91870 + }, + { + "epoch": 6.24235629841011, + "grad_norm": 1.4086601734161377, + "learning_rate": 0.00021999422475879878, + "loss": 3.2985, + "step": 91875 + }, + { + "epoch": 6.242696018480772, + "grad_norm": 0.7889175415039062, + "learning_rate": 0.00021995175974996603, + "loss": 3.3966, + "step": 91880 + }, + { + "epoch": 6.243035738551433, + "grad_norm": 0.9530604481697083, + "learning_rate": 0.0002199092947411333, + "loss": 3.3126, + "step": 91885 + }, + { + "epoch": 6.243375458622095, + "grad_norm": 0.8661965727806091, + "learning_rate": 0.0002198668297323006, + "loss": 3.2869, + "step": 91890 + }, + { + "epoch": 6.243715178692757, + "grad_norm": 0.8435271382331848, + "learning_rate": 0.00021982436472346787, + "loss": 3.4711, + "step": 91895 + }, + { + "epoch": 6.244054898763419, + "grad_norm": 1.1924580335617065, + "learning_rate": 0.00021978189971463513, + "loss": 3.4362, + "step": 91900 + }, + { + "epoch": 6.244394618834081, + "grad_norm": 0.9536303281784058, + "learning_rate": 0.00021973943470580243, + "loss": 3.4968, + "step": 91905 + }, + { + "epoch": 6.244734338904743, + "grad_norm": 2.147371530532837, + "learning_rate": 0.0002196969696969697, + "loss": 3.579, + "step": 91910 + }, + { + "epoch": 6.245074058975404, + "grad_norm": 0.8375000357627869, + "learning_rate": 0.00021965450468813697, + "loss": 3.3237, + "step": 91915 + }, + { + "epoch": 6.245413779046066, + "grad_norm": 0.9423466920852661, + "learning_rate": 0.00021961203967930427, + "loss": 3.5078, + "step": 91920 + }, + { + "epoch": 6.245753499116728, + "grad_norm": 1.180526852607727, + "learning_rate": 0.00021956957467047153, + "loss": 3.5066, + "step": 91925 + }, + { + "epoch": 6.246093219187389, + "grad_norm": 0.9579654335975647, + "learning_rate": 0.0002195271096616388, + "loss": 3.502, + "step": 91930 + }, + { + "epoch": 6.246432939258051, + "grad_norm": 0.9896241426467896, + "learning_rate": 0.00021948464465280609, + "loss": 3.1423, + "step": 91935 + }, + { + "epoch": 6.246772659328713, + "grad_norm": 0.8582212328910828, + "learning_rate": 0.00021944217964397337, + "loss": 3.4461, + "step": 91940 + }, + { + "epoch": 6.247112379399375, + "grad_norm": 1.1284035444259644, + "learning_rate": 0.00021939971463514067, + "loss": 3.6303, + "step": 91945 + }, + { + "epoch": 6.247452099470037, + "grad_norm": 1.210508942604065, + "learning_rate": 0.00021935724962630793, + "loss": 3.2738, + "step": 91950 + }, + { + "epoch": 6.247791819540699, + "grad_norm": 0.8198754191398621, + "learning_rate": 0.0002193147846174752, + "loss": 3.5033, + "step": 91955 + }, + { + "epoch": 6.24813153961136, + "grad_norm": 0.8445462584495544, + "learning_rate": 0.0002192723196086425, + "loss": 3.4672, + "step": 91960 + }, + { + "epoch": 6.248471259682022, + "grad_norm": 0.9900342226028442, + "learning_rate": 0.00021922985459980977, + "loss": 3.4461, + "step": 91965 + }, + { + "epoch": 6.248810979752684, + "grad_norm": 0.8994378447532654, + "learning_rate": 0.00021918738959097702, + "loss": 3.3613, + "step": 91970 + }, + { + "epoch": 6.249150699823345, + "grad_norm": 1.357681393623352, + "learning_rate": 0.00021914492458214433, + "loss": 3.4556, + "step": 91975 + }, + { + "epoch": 6.249490419894007, + "grad_norm": 1.1628652811050415, + "learning_rate": 0.0002191024595733116, + "loss": 3.3058, + "step": 91980 + }, + { + "epoch": 6.249830139964669, + "grad_norm": 0.8636434078216553, + "learning_rate": 0.00021905999456447886, + "loss": 3.1622, + "step": 91985 + }, + { + "epoch": 6.250169860035331, + "grad_norm": 0.8736677169799805, + "learning_rate": 0.00021901752955564617, + "loss": 3.3596, + "step": 91990 + }, + { + "epoch": 6.250509580105993, + "grad_norm": 0.8160125613212585, + "learning_rate": 0.00021897506454681342, + "loss": 3.4312, + "step": 91995 + }, + { + "epoch": 6.250849300176655, + "grad_norm": 0.980208694934845, + "learning_rate": 0.0002189325995379807, + "loss": 3.4697, + "step": 92000 + }, + { + "epoch": 6.251189020247316, + "grad_norm": 0.6977919340133667, + "learning_rate": 0.000218890134529148, + "loss": 3.6686, + "step": 92005 + }, + { + "epoch": 6.251528740317978, + "grad_norm": 0.794808030128479, + "learning_rate": 0.00021884766952031526, + "loss": 3.4232, + "step": 92010 + }, + { + "epoch": 6.25186846038864, + "grad_norm": 0.7790091037750244, + "learning_rate": 0.00021880520451148254, + "loss": 3.1264, + "step": 92015 + }, + { + "epoch": 6.252208180459301, + "grad_norm": 1.073154330253601, + "learning_rate": 0.00021876273950264982, + "loss": 3.3403, + "step": 92020 + }, + { + "epoch": 6.252547900529963, + "grad_norm": 0.9541229605674744, + "learning_rate": 0.0002187202744938171, + "loss": 3.3338, + "step": 92025 + }, + { + "epoch": 6.252887620600625, + "grad_norm": 0.8615903854370117, + "learning_rate": 0.00021867780948498435, + "loss": 3.294, + "step": 92030 + }, + { + "epoch": 6.253227340671287, + "grad_norm": 0.7582383155822754, + "learning_rate": 0.00021863534447615166, + "loss": 3.4748, + "step": 92035 + }, + { + "epoch": 6.253567060741949, + "grad_norm": 0.8691375851631165, + "learning_rate": 0.00021859287946731894, + "loss": 3.2605, + "step": 92040 + }, + { + "epoch": 6.253906780812611, + "grad_norm": 0.8322954773902893, + "learning_rate": 0.00021855041445848622, + "loss": 3.5319, + "step": 92045 + }, + { + "epoch": 6.254246500883272, + "grad_norm": 0.6929288506507874, + "learning_rate": 0.0002185079494496535, + "loss": 3.4441, + "step": 92050 + }, + { + "epoch": 6.254586220953934, + "grad_norm": 1.0421327352523804, + "learning_rate": 0.00021846548444082075, + "loss": 3.3924, + "step": 92055 + }, + { + "epoch": 6.254925941024596, + "grad_norm": 0.8356699347496033, + "learning_rate": 0.00021842301943198806, + "loss": 3.4284, + "step": 92060 + }, + { + "epoch": 6.255265661095257, + "grad_norm": 0.9860181212425232, + "learning_rate": 0.00021838055442315531, + "loss": 3.2901, + "step": 92065 + }, + { + "epoch": 6.255605381165919, + "grad_norm": 0.9146344661712646, + "learning_rate": 0.0002183380894143226, + "loss": 3.1794, + "step": 92070 + }, + { + "epoch": 6.255945101236581, + "grad_norm": 1.0534597635269165, + "learning_rate": 0.0002182956244054899, + "loss": 3.6445, + "step": 92075 + }, + { + "epoch": 6.256284821307243, + "grad_norm": 0.7238872051239014, + "learning_rate": 0.00021825315939665715, + "loss": 3.296, + "step": 92080 + }, + { + "epoch": 6.256624541377905, + "grad_norm": 1.2465555667877197, + "learning_rate": 0.00021821069438782443, + "loss": 3.4234, + "step": 92085 + }, + { + "epoch": 6.256964261448567, + "grad_norm": 0.6970124840736389, + "learning_rate": 0.00021816822937899171, + "loss": 3.3886, + "step": 92090 + }, + { + "epoch": 6.257303981519228, + "grad_norm": 1.6295738220214844, + "learning_rate": 0.000218125764370159, + "loss": 3.0617, + "step": 92095 + }, + { + "epoch": 6.25764370158989, + "grad_norm": 0.808445394039154, + "learning_rate": 0.00021808329936132625, + "loss": 3.4133, + "step": 92100 + }, + { + "epoch": 6.257983421660552, + "grad_norm": 0.8607593774795532, + "learning_rate": 0.00021804083435249355, + "loss": 3.3229, + "step": 92105 + }, + { + "epoch": 6.258323141731213, + "grad_norm": 1.0411845445632935, + "learning_rate": 0.00021799836934366083, + "loss": 3.4816, + "step": 92110 + }, + { + "epoch": 6.258662861801875, + "grad_norm": 1.1325201988220215, + "learning_rate": 0.0002179559043348281, + "loss": 3.5406, + "step": 92115 + }, + { + "epoch": 6.259002581872537, + "grad_norm": 0.7312073111534119, + "learning_rate": 0.0002179134393259954, + "loss": 3.5115, + "step": 92120 + }, + { + "epoch": 6.259342301943199, + "grad_norm": 1.2715916633605957, + "learning_rate": 0.00021787097431716265, + "loss": 3.1975, + "step": 92125 + }, + { + "epoch": 6.259682022013861, + "grad_norm": 1.3265085220336914, + "learning_rate": 0.00021782850930832996, + "loss": 3.6448, + "step": 92130 + }, + { + "epoch": 6.260021742084522, + "grad_norm": 0.7430132627487183, + "learning_rate": 0.0002177860442994972, + "loss": 3.3866, + "step": 92135 + }, + { + "epoch": 6.260361462155184, + "grad_norm": 0.8148967027664185, + "learning_rate": 0.0002177435792906645, + "loss": 3.2889, + "step": 92140 + }, + { + "epoch": 6.260701182225846, + "grad_norm": 0.814887523651123, + "learning_rate": 0.0002177011142818318, + "loss": 3.4208, + "step": 92145 + }, + { + "epoch": 6.261040902296507, + "grad_norm": 0.8204250931739807, + "learning_rate": 0.00021765864927299905, + "loss": 3.4615, + "step": 92150 + }, + { + "epoch": 6.261380622367169, + "grad_norm": 1.1504651308059692, + "learning_rate": 0.00021761618426416633, + "loss": 3.4018, + "step": 92155 + }, + { + "epoch": 6.261720342437831, + "grad_norm": 0.8495885133743286, + "learning_rate": 0.0002175737192553336, + "loss": 3.5901, + "step": 92160 + }, + { + "epoch": 6.262060062508493, + "grad_norm": 0.9920921921730042, + "learning_rate": 0.0002175312542465009, + "loss": 3.3602, + "step": 92165 + }, + { + "epoch": 6.262399782579155, + "grad_norm": 0.8056337833404541, + "learning_rate": 0.00021748878923766817, + "loss": 3.4785, + "step": 92170 + }, + { + "epoch": 6.262739502649817, + "grad_norm": 0.8164330720901489, + "learning_rate": 0.00021744632422883545, + "loss": 3.5208, + "step": 92175 + }, + { + "epoch": 6.263079222720478, + "grad_norm": 1.1252822875976562, + "learning_rate": 0.00021740385922000273, + "loss": 3.2747, + "step": 92180 + }, + { + "epoch": 6.26341894279114, + "grad_norm": 0.9147183299064636, + "learning_rate": 0.00021736139421116998, + "loss": 3.4402, + "step": 92185 + }, + { + "epoch": 6.263758662861802, + "grad_norm": 0.98427414894104, + "learning_rate": 0.0002173189292023373, + "loss": 3.4588, + "step": 92190 + }, + { + "epoch": 6.264098382932463, + "grad_norm": 0.9163640737533569, + "learning_rate": 0.00021727646419350454, + "loss": 3.4458, + "step": 92195 + }, + { + "epoch": 6.264438103003125, + "grad_norm": 1.0080606937408447, + "learning_rate": 0.00021723399918467182, + "loss": 3.4818, + "step": 92200 + }, + { + "epoch": 6.264777823073787, + "grad_norm": 0.9550666213035583, + "learning_rate": 0.00021719153417583913, + "loss": 3.2159, + "step": 92205 + }, + { + "epoch": 6.265117543144449, + "grad_norm": 0.9890584945678711, + "learning_rate": 0.00021714906916700638, + "loss": 3.4368, + "step": 92210 + }, + { + "epoch": 6.265457263215111, + "grad_norm": 0.9854526519775391, + "learning_rate": 0.0002171066041581737, + "loss": 3.5375, + "step": 92215 + }, + { + "epoch": 6.265796983285773, + "grad_norm": 1.0921417474746704, + "learning_rate": 0.00021706413914934094, + "loss": 3.4533, + "step": 92220 + }, + { + "epoch": 6.266136703356434, + "grad_norm": 0.8992187976837158, + "learning_rate": 0.00021702167414050822, + "loss": 3.251, + "step": 92225 + }, + { + "epoch": 6.266476423427096, + "grad_norm": 1.6140226125717163, + "learning_rate": 0.0002169792091316755, + "loss": 3.0432, + "step": 92230 + }, + { + "epoch": 6.266816143497758, + "grad_norm": 1.8707948923110962, + "learning_rate": 0.00021693674412284278, + "loss": 3.3182, + "step": 92235 + }, + { + "epoch": 6.267155863568419, + "grad_norm": 1.071128487586975, + "learning_rate": 0.00021689427911401006, + "loss": 3.4108, + "step": 92240 + }, + { + "epoch": 6.267495583639081, + "grad_norm": 0.9297143816947937, + "learning_rate": 0.00021685181410517734, + "loss": 3.5517, + "step": 92245 + }, + { + "epoch": 6.267835303709743, + "grad_norm": 1.318426251411438, + "learning_rate": 0.00021680934909634462, + "loss": 3.5617, + "step": 92250 + }, + { + "epoch": 6.268175023780405, + "grad_norm": 1.0574225187301636, + "learning_rate": 0.00021676688408751188, + "loss": 3.5436, + "step": 92255 + }, + { + "epoch": 6.268514743851067, + "grad_norm": 0.7268333435058594, + "learning_rate": 0.00021672441907867918, + "loss": 3.6928, + "step": 92260 + }, + { + "epoch": 6.268854463921729, + "grad_norm": 0.9315637350082397, + "learning_rate": 0.00021668195406984644, + "loss": 3.4182, + "step": 92265 + }, + { + "epoch": 6.26919418399239, + "grad_norm": 1.408661127090454, + "learning_rate": 0.00021663948906101372, + "loss": 3.2596, + "step": 92270 + }, + { + "epoch": 6.269533904063052, + "grad_norm": 1.0921976566314697, + "learning_rate": 0.00021659702405218102, + "loss": 3.3842, + "step": 92275 + }, + { + "epoch": 6.269873624133714, + "grad_norm": 0.9193692207336426, + "learning_rate": 0.00021655455904334828, + "loss": 3.1838, + "step": 92280 + }, + { + "epoch": 6.270213344204375, + "grad_norm": 0.9661255478858948, + "learning_rate": 0.00021651209403451556, + "loss": 3.5694, + "step": 92285 + }, + { + "epoch": 6.270553064275037, + "grad_norm": 1.2487494945526123, + "learning_rate": 0.00021646962902568284, + "loss": 3.5063, + "step": 92290 + }, + { + "epoch": 6.270892784345699, + "grad_norm": 0.9467548131942749, + "learning_rate": 0.00021642716401685012, + "loss": 3.2953, + "step": 92295 + }, + { + "epoch": 6.271232504416361, + "grad_norm": 1.4129971265792847, + "learning_rate": 0.00021638469900801742, + "loss": 3.2333, + "step": 92300 + }, + { + "epoch": 6.271572224487023, + "grad_norm": 0.774587869644165, + "learning_rate": 0.00021634223399918468, + "loss": 3.2962, + "step": 92305 + }, + { + "epoch": 6.271911944557685, + "grad_norm": 0.7873556613922119, + "learning_rate": 0.00021629976899035196, + "loss": 3.7436, + "step": 92310 + }, + { + "epoch": 6.272251664628346, + "grad_norm": 0.9960774779319763, + "learning_rate": 0.00021625730398151924, + "loss": 3.4785, + "step": 92315 + }, + { + "epoch": 6.272591384699008, + "grad_norm": 1.1945762634277344, + "learning_rate": 0.00021621483897268652, + "loss": 3.4338, + "step": 92320 + }, + { + "epoch": 6.27293110476967, + "grad_norm": 1.508719801902771, + "learning_rate": 0.00021617237396385377, + "loss": 3.5207, + "step": 92325 + }, + { + "epoch": 6.273270824840331, + "grad_norm": 0.8906660676002502, + "learning_rate": 0.00021612990895502108, + "loss": 3.479, + "step": 92330 + }, + { + "epoch": 6.273610544910993, + "grad_norm": 0.891615629196167, + "learning_rate": 0.00021608744394618836, + "loss": 3.3743, + "step": 92335 + }, + { + "epoch": 6.273950264981655, + "grad_norm": 0.9958972334861755, + "learning_rate": 0.0002160449789373556, + "loss": 3.5917, + "step": 92340 + }, + { + "epoch": 6.274289985052317, + "grad_norm": 0.8327465057373047, + "learning_rate": 0.00021600251392852292, + "loss": 3.297, + "step": 92345 + }, + { + "epoch": 6.274629705122979, + "grad_norm": 1.961403727531433, + "learning_rate": 0.00021596004891969017, + "loss": 3.4837, + "step": 92350 + }, + { + "epoch": 6.274969425193641, + "grad_norm": 1.0492795705795288, + "learning_rate": 0.00021591758391085745, + "loss": 3.145, + "step": 92355 + }, + { + "epoch": 6.275309145264302, + "grad_norm": 0.8519936203956604, + "learning_rate": 0.00021587511890202473, + "loss": 3.2747, + "step": 92360 + }, + { + "epoch": 6.275648865334964, + "grad_norm": 1.4631980657577515, + "learning_rate": 0.000215832653893192, + "loss": 3.2822, + "step": 92365 + }, + { + "epoch": 6.275988585405626, + "grad_norm": 0.8807467222213745, + "learning_rate": 0.0002157901888843593, + "loss": 3.1735, + "step": 92370 + }, + { + "epoch": 6.276328305476287, + "grad_norm": 1.0209991931915283, + "learning_rate": 0.00021574772387552657, + "loss": 3.3517, + "step": 92375 + }, + { + "epoch": 6.276668025546949, + "grad_norm": 0.9097675085067749, + "learning_rate": 0.00021570525886669385, + "loss": 3.2356, + "step": 92380 + }, + { + "epoch": 6.277007745617611, + "grad_norm": 0.7463132739067078, + "learning_rate": 0.00021566279385786113, + "loss": 3.4451, + "step": 92385 + }, + { + "epoch": 6.277347465688273, + "grad_norm": 2.143911838531494, + "learning_rate": 0.0002156203288490284, + "loss": 3.337, + "step": 92390 + }, + { + "epoch": 6.277687185758935, + "grad_norm": 0.670665979385376, + "learning_rate": 0.00021557786384019566, + "loss": 3.2631, + "step": 92395 + }, + { + "epoch": 6.278026905829597, + "grad_norm": 0.8689778447151184, + "learning_rate": 0.00021553539883136297, + "loss": 3.1749, + "step": 92400 + }, + { + "epoch": 6.278366625900258, + "grad_norm": 0.8998492956161499, + "learning_rate": 0.00021549293382253025, + "loss": 3.4869, + "step": 92405 + }, + { + "epoch": 6.27870634597092, + "grad_norm": 0.8617187738418579, + "learning_rate": 0.0002154504688136975, + "loss": 3.3714, + "step": 92410 + }, + { + "epoch": 6.279046066041582, + "grad_norm": 1.9732807874679565, + "learning_rate": 0.0002154080038048648, + "loss": 3.3294, + "step": 92415 + }, + { + "epoch": 6.279385786112243, + "grad_norm": 0.9632378816604614, + "learning_rate": 0.00021536553879603206, + "loss": 3.2453, + "step": 92420 + }, + { + "epoch": 6.279725506182905, + "grad_norm": 0.7191166281700134, + "learning_rate": 0.00021532307378719934, + "loss": 3.6382, + "step": 92425 + }, + { + "epoch": 6.2800652262535674, + "grad_norm": 0.9467539191246033, + "learning_rate": 0.00021528060877836665, + "loss": 3.2985, + "step": 92430 + }, + { + "epoch": 6.280404946324229, + "grad_norm": 1.0081626176834106, + "learning_rate": 0.0002152381437695339, + "loss": 3.6084, + "step": 92435 + }, + { + "epoch": 6.280744666394891, + "grad_norm": 0.8425177931785583, + "learning_rate": 0.00021519567876070118, + "loss": 3.4089, + "step": 92440 + }, + { + "epoch": 6.281084386465553, + "grad_norm": 1.1721769571304321, + "learning_rate": 0.00021515321375186846, + "loss": 3.5439, + "step": 92445 + }, + { + "epoch": 6.281424106536214, + "grad_norm": 1.0399198532104492, + "learning_rate": 0.00021511074874303574, + "loss": 3.407, + "step": 92450 + }, + { + "epoch": 6.281763826606876, + "grad_norm": 1.027409315109253, + "learning_rate": 0.000215068283734203, + "loss": 3.5026, + "step": 92455 + }, + { + "epoch": 6.282103546677538, + "grad_norm": 0.8410511016845703, + "learning_rate": 0.0002150258187253703, + "loss": 3.6541, + "step": 92460 + }, + { + "epoch": 6.282443266748199, + "grad_norm": 1.926013708114624, + "learning_rate": 0.00021498335371653758, + "loss": 3.2219, + "step": 92465 + }, + { + "epoch": 6.282782986818861, + "grad_norm": 1.0170265436172485, + "learning_rate": 0.00021494088870770486, + "loss": 3.4565, + "step": 92470 + }, + { + "epoch": 6.2831227068895235, + "grad_norm": 1.445117473602295, + "learning_rate": 0.00021489842369887214, + "loss": 3.3978, + "step": 92475 + }, + { + "epoch": 6.283462426960185, + "grad_norm": 3.2904105186462402, + "learning_rate": 0.0002148559586900394, + "loss": 3.5373, + "step": 92480 + }, + { + "epoch": 6.283802147030847, + "grad_norm": 0.8443289995193481, + "learning_rate": 0.0002148134936812067, + "loss": 3.3488, + "step": 92485 + }, + { + "epoch": 6.284141867101509, + "grad_norm": 1.0340161323547363, + "learning_rate": 0.00021477102867237396, + "loss": 3.0166, + "step": 92490 + }, + { + "epoch": 6.28448158717217, + "grad_norm": 1.0220314264297485, + "learning_rate": 0.00021472856366354124, + "loss": 3.3283, + "step": 92495 + }, + { + "epoch": 6.284821307242832, + "grad_norm": 1.2971878051757812, + "learning_rate": 0.00021468609865470855, + "loss": 3.3442, + "step": 92500 + }, + { + "epoch": 6.285161027313494, + "grad_norm": 1.0069273710250854, + "learning_rate": 0.0002146436336458758, + "loss": 3.6294, + "step": 92505 + }, + { + "epoch": 6.285500747384155, + "grad_norm": 0.8589316010475159, + "learning_rate": 0.00021460116863704308, + "loss": 3.3008, + "step": 92510 + }, + { + "epoch": 6.285840467454817, + "grad_norm": 1.4306741952896118, + "learning_rate": 0.00021455870362821036, + "loss": 3.3391, + "step": 92515 + }, + { + "epoch": 6.2861801875254795, + "grad_norm": 0.9015608429908752, + "learning_rate": 0.00021451623861937764, + "loss": 3.2791, + "step": 92520 + }, + { + "epoch": 6.286519907596141, + "grad_norm": 1.2210463285446167, + "learning_rate": 0.0002144737736105449, + "loss": 3.3928, + "step": 92525 + }, + { + "epoch": 6.286859627666803, + "grad_norm": 0.9308770895004272, + "learning_rate": 0.0002144313086017122, + "loss": 3.6225, + "step": 92530 + }, + { + "epoch": 6.287199347737464, + "grad_norm": 0.8208233118057251, + "learning_rate": 0.00021438884359287948, + "loss": 3.2984, + "step": 92535 + }, + { + "epoch": 6.287539067808126, + "grad_norm": 1.2824894189834595, + "learning_rate": 0.00021434637858404673, + "loss": 3.5822, + "step": 92540 + }, + { + "epoch": 6.287878787878788, + "grad_norm": 1.7821168899536133, + "learning_rate": 0.00021430391357521404, + "loss": 3.3257, + "step": 92545 + }, + { + "epoch": 6.288218507949449, + "grad_norm": 0.7548096776008606, + "learning_rate": 0.0002142614485663813, + "loss": 3.1282, + "step": 92550 + }, + { + "epoch": 6.288558228020111, + "grad_norm": 0.7664828896522522, + "learning_rate": 0.0002142189835575486, + "loss": 3.4337, + "step": 92555 + }, + { + "epoch": 6.288897948090773, + "grad_norm": 0.9647935628890991, + "learning_rate": 0.00021417651854871585, + "loss": 3.528, + "step": 92560 + }, + { + "epoch": 6.289237668161435, + "grad_norm": 0.9085045456886292, + "learning_rate": 0.00021413405353988313, + "loss": 3.3372, + "step": 92565 + }, + { + "epoch": 6.289577388232097, + "grad_norm": 1.1475566625595093, + "learning_rate": 0.00021409158853105044, + "loss": 3.5548, + "step": 92570 + }, + { + "epoch": 6.289917108302759, + "grad_norm": 1.0225703716278076, + "learning_rate": 0.0002140491235222177, + "loss": 3.5798, + "step": 92575 + }, + { + "epoch": 6.29025682837342, + "grad_norm": 1.1198973655700684, + "learning_rate": 0.00021400665851338497, + "loss": 3.2204, + "step": 92580 + }, + { + "epoch": 6.290596548444082, + "grad_norm": 0.8476110100746155, + "learning_rate": 0.00021396419350455225, + "loss": 3.4002, + "step": 92585 + }, + { + "epoch": 6.290936268514744, + "grad_norm": 0.9396489262580872, + "learning_rate": 0.00021392172849571953, + "loss": 3.6171, + "step": 92590 + }, + { + "epoch": 6.291275988585405, + "grad_norm": 1.2501368522644043, + "learning_rate": 0.0002138792634868868, + "loss": 3.281, + "step": 92595 + }, + { + "epoch": 6.291615708656067, + "grad_norm": 1.2537686824798584, + "learning_rate": 0.0002138367984780541, + "loss": 3.5217, + "step": 92600 + }, + { + "epoch": 6.291955428726729, + "grad_norm": 0.9871388673782349, + "learning_rate": 0.00021379433346922137, + "loss": 3.5881, + "step": 92605 + }, + { + "epoch": 6.292295148797391, + "grad_norm": 0.8937054872512817, + "learning_rate": 0.00021375186846038863, + "loss": 3.3112, + "step": 92610 + }, + { + "epoch": 6.292634868868053, + "grad_norm": 0.9142409563064575, + "learning_rate": 0.00021370940345155593, + "loss": 3.568, + "step": 92615 + }, + { + "epoch": 6.292974588938715, + "grad_norm": 0.8676510453224182, + "learning_rate": 0.00021366693844272319, + "loss": 3.4146, + "step": 92620 + }, + { + "epoch": 6.293314309009376, + "grad_norm": 0.8822487592697144, + "learning_rate": 0.00021362447343389047, + "loss": 3.3693, + "step": 92625 + }, + { + "epoch": 6.293654029080038, + "grad_norm": 1.054497480392456, + "learning_rate": 0.00021358200842505777, + "loss": 3.3521, + "step": 92630 + }, + { + "epoch": 6.2939937491507, + "grad_norm": 0.9493991136550903, + "learning_rate": 0.00021353954341622503, + "loss": 3.4481, + "step": 92635 + }, + { + "epoch": 6.294333469221361, + "grad_norm": 0.8792350888252258, + "learning_rate": 0.00021349707840739233, + "loss": 3.3214, + "step": 92640 + }, + { + "epoch": 6.294673189292023, + "grad_norm": 0.8034452199935913, + "learning_rate": 0.00021345461339855959, + "loss": 3.36, + "step": 92645 + }, + { + "epoch": 6.295012909362685, + "grad_norm": 0.8201754689216614, + "learning_rate": 0.00021341214838972687, + "loss": 3.4844, + "step": 92650 + }, + { + "epoch": 6.295352629433347, + "grad_norm": 0.827086865901947, + "learning_rate": 0.00021336968338089415, + "loss": 3.5794, + "step": 92655 + }, + { + "epoch": 6.295692349504009, + "grad_norm": 0.7797857522964478, + "learning_rate": 0.00021332721837206143, + "loss": 3.3988, + "step": 92660 + }, + { + "epoch": 6.296032069574671, + "grad_norm": 0.8980403542518616, + "learning_rate": 0.0002132847533632287, + "loss": 3.4059, + "step": 92665 + }, + { + "epoch": 6.296371789645332, + "grad_norm": 0.8725829124450684, + "learning_rate": 0.000213242288354396, + "loss": 3.2937, + "step": 92670 + }, + { + "epoch": 6.296711509715994, + "grad_norm": 0.7492392063140869, + "learning_rate": 0.00021319982334556327, + "loss": 3.3501, + "step": 92675 + }, + { + "epoch": 6.297051229786656, + "grad_norm": 0.9160549640655518, + "learning_rate": 0.00021315735833673052, + "loss": 3.5569, + "step": 92680 + }, + { + "epoch": 6.297390949857317, + "grad_norm": 0.9781050682067871, + "learning_rate": 0.00021311489332789783, + "loss": 3.356, + "step": 92685 + }, + { + "epoch": 6.297730669927979, + "grad_norm": 0.8828866481781006, + "learning_rate": 0.00021307242831906508, + "loss": 3.4179, + "step": 92690 + }, + { + "epoch": 6.298070389998641, + "grad_norm": 1.159969687461853, + "learning_rate": 0.00021302996331023236, + "loss": 3.374, + "step": 92695 + }, + { + "epoch": 6.298410110069303, + "grad_norm": 0.9505277276039124, + "learning_rate": 0.00021298749830139967, + "loss": 3.3432, + "step": 92700 + }, + { + "epoch": 6.298749830139965, + "grad_norm": 1.1586238145828247, + "learning_rate": 0.00021294503329256692, + "loss": 3.2001, + "step": 92705 + }, + { + "epoch": 6.299089550210627, + "grad_norm": 1.1055439710617065, + "learning_rate": 0.0002129025682837342, + "loss": 3.4191, + "step": 92710 + }, + { + "epoch": 6.299429270281288, + "grad_norm": 0.997038722038269, + "learning_rate": 0.00021286010327490148, + "loss": 3.8369, + "step": 92715 + }, + { + "epoch": 6.29976899035195, + "grad_norm": 0.9799054265022278, + "learning_rate": 0.00021281763826606876, + "loss": 3.617, + "step": 92720 + }, + { + "epoch": 6.300108710422612, + "grad_norm": 0.9111483097076416, + "learning_rate": 0.00021277517325723607, + "loss": 3.4831, + "step": 92725 + }, + { + "epoch": 6.300448430493273, + "grad_norm": 0.7973201274871826, + "learning_rate": 0.00021273270824840332, + "loss": 3.5135, + "step": 92730 + }, + { + "epoch": 6.300788150563935, + "grad_norm": 0.8003806471824646, + "learning_rate": 0.0002126902432395706, + "loss": 3.4741, + "step": 92735 + }, + { + "epoch": 6.3011278706345974, + "grad_norm": 1.0934759378433228, + "learning_rate": 0.00021264777823073788, + "loss": 3.4278, + "step": 92740 + }, + { + "epoch": 6.301467590705259, + "grad_norm": 0.8525242209434509, + "learning_rate": 0.00021260531322190516, + "loss": 3.3158, + "step": 92745 + }, + { + "epoch": 6.301807310775921, + "grad_norm": 0.7172042727470398, + "learning_rate": 0.00021256284821307241, + "loss": 3.127, + "step": 92750 + }, + { + "epoch": 6.302147030846583, + "grad_norm": 0.791644275188446, + "learning_rate": 0.00021252038320423972, + "loss": 3.1364, + "step": 92755 + }, + { + "epoch": 6.302486750917244, + "grad_norm": 0.8202689290046692, + "learning_rate": 0.000212477918195407, + "loss": 3.3756, + "step": 92760 + }, + { + "epoch": 6.302826470987906, + "grad_norm": 2.3117570877075195, + "learning_rate": 0.00021243545318657425, + "loss": 3.3037, + "step": 92765 + }, + { + "epoch": 6.303166191058568, + "grad_norm": 0.7466692924499512, + "learning_rate": 0.00021239298817774156, + "loss": 3.5111, + "step": 92770 + }, + { + "epoch": 6.303505911129229, + "grad_norm": 1.043246865272522, + "learning_rate": 0.00021235052316890881, + "loss": 3.4677, + "step": 92775 + }, + { + "epoch": 6.303845631199891, + "grad_norm": 1.0588111877441406, + "learning_rate": 0.0002123080581600761, + "loss": 3.4729, + "step": 92780 + }, + { + "epoch": 6.3041853512705535, + "grad_norm": 1.0339211225509644, + "learning_rate": 0.00021226559315124337, + "loss": 3.3942, + "step": 92785 + }, + { + "epoch": 6.304525071341215, + "grad_norm": 1.0313736200332642, + "learning_rate": 0.00021222312814241065, + "loss": 3.5167, + "step": 92790 + }, + { + "epoch": 6.304864791411877, + "grad_norm": 0.7647556066513062, + "learning_rate": 0.00021218066313357793, + "loss": 3.451, + "step": 92795 + }, + { + "epoch": 6.305204511482539, + "grad_norm": 1.1933778524398804, + "learning_rate": 0.00021213819812474521, + "loss": 3.4146, + "step": 92800 + }, + { + "epoch": 6.3055442315532, + "grad_norm": 0.8214246034622192, + "learning_rate": 0.0002120957331159125, + "loss": 3.4164, + "step": 92805 + }, + { + "epoch": 6.305883951623862, + "grad_norm": 1.0044604539871216, + "learning_rate": 0.00021205326810707977, + "loss": 3.4062, + "step": 92810 + }, + { + "epoch": 6.306223671694523, + "grad_norm": 1.553910255432129, + "learning_rate": 0.00021201080309824705, + "loss": 3.447, + "step": 92815 + }, + { + "epoch": 6.306563391765185, + "grad_norm": 0.8568450212478638, + "learning_rate": 0.0002119683380894143, + "loss": 3.2264, + "step": 92820 + }, + { + "epoch": 6.306903111835847, + "grad_norm": 0.7531583309173584, + "learning_rate": 0.00021192587308058161, + "loss": 3.5557, + "step": 92825 + }, + { + "epoch": 6.307242831906509, + "grad_norm": 0.9520260095596313, + "learning_rate": 0.0002118834080717489, + "loss": 3.5686, + "step": 92830 + }, + { + "epoch": 6.307582551977171, + "grad_norm": 1.0216044187545776, + "learning_rate": 0.00021184094306291615, + "loss": 3.5598, + "step": 92835 + }, + { + "epoch": 6.307922272047833, + "grad_norm": 0.9096834063529968, + "learning_rate": 0.00021179847805408346, + "loss": 3.5012, + "step": 92840 + }, + { + "epoch": 6.308261992118494, + "grad_norm": 1.407360315322876, + "learning_rate": 0.0002117560130452507, + "loss": 3.3846, + "step": 92845 + }, + { + "epoch": 6.308601712189156, + "grad_norm": 0.917797327041626, + "learning_rate": 0.000211713548036418, + "loss": 3.4475, + "step": 92850 + }, + { + "epoch": 6.308941432259818, + "grad_norm": 1.0142866373062134, + "learning_rate": 0.00021167108302758527, + "loss": 3.4576, + "step": 92855 + }, + { + "epoch": 6.309281152330479, + "grad_norm": 1.1542587280273438, + "learning_rate": 0.00021162861801875255, + "loss": 3.4407, + "step": 92860 + }, + { + "epoch": 6.309620872401141, + "grad_norm": 1.0851068496704102, + "learning_rate": 0.00021158615300991983, + "loss": 3.6002, + "step": 92865 + }, + { + "epoch": 6.309960592471803, + "grad_norm": 1.1213420629501343, + "learning_rate": 0.0002115436880010871, + "loss": 3.3948, + "step": 92870 + }, + { + "epoch": 6.310300312542465, + "grad_norm": 0.8915587067604065, + "learning_rate": 0.0002115012229922544, + "loss": 3.4897, + "step": 92875 + }, + { + "epoch": 6.310640032613127, + "grad_norm": 0.976908266544342, + "learning_rate": 0.00021145875798342164, + "loss": 3.2598, + "step": 92880 + }, + { + "epoch": 6.310979752683789, + "grad_norm": 2.6699681282043457, + "learning_rate": 0.00021141629297458895, + "loss": 3.4181, + "step": 92885 + }, + { + "epoch": 6.31131947275445, + "grad_norm": 1.0936843156814575, + "learning_rate": 0.00021137382796575623, + "loss": 3.3339, + "step": 92890 + }, + { + "epoch": 6.311659192825112, + "grad_norm": 0.9381501078605652, + "learning_rate": 0.0002113313629569235, + "loss": 3.3357, + "step": 92895 + }, + { + "epoch": 6.311998912895774, + "grad_norm": 0.9472915530204773, + "learning_rate": 0.0002112888979480908, + "loss": 3.4837, + "step": 92900 + }, + { + "epoch": 6.312338632966435, + "grad_norm": 1.2470818758010864, + "learning_rate": 0.00021124643293925804, + "loss": 3.3109, + "step": 92905 + }, + { + "epoch": 6.312678353037097, + "grad_norm": 0.8875870704650879, + "learning_rate": 0.00021120396793042535, + "loss": 3.3945, + "step": 92910 + }, + { + "epoch": 6.313018073107759, + "grad_norm": 1.0797326564788818, + "learning_rate": 0.0002111615029215926, + "loss": 3.3903, + "step": 92915 + }, + { + "epoch": 6.313357793178421, + "grad_norm": 1.3786321878433228, + "learning_rate": 0.00021111903791275988, + "loss": 3.32, + "step": 92920 + }, + { + "epoch": 6.313697513249083, + "grad_norm": 1.1075400114059448, + "learning_rate": 0.0002110765729039272, + "loss": 3.343, + "step": 92925 + }, + { + "epoch": 6.314037233319745, + "grad_norm": 2.112837076187134, + "learning_rate": 0.00021103410789509444, + "loss": 3.3572, + "step": 92930 + }, + { + "epoch": 6.314376953390406, + "grad_norm": 1.0456658601760864, + "learning_rate": 0.00021099164288626172, + "loss": 3.2938, + "step": 92935 + }, + { + "epoch": 6.314716673461068, + "grad_norm": 0.9371612668037415, + "learning_rate": 0.000210949177877429, + "loss": 3.288, + "step": 92940 + }, + { + "epoch": 6.31505639353173, + "grad_norm": 0.8013536334037781, + "learning_rate": 0.00021090671286859628, + "loss": 3.4867, + "step": 92945 + }, + { + "epoch": 6.315396113602391, + "grad_norm": 0.9737503528594971, + "learning_rate": 0.00021086424785976354, + "loss": 3.3532, + "step": 92950 + }, + { + "epoch": 6.315735833673053, + "grad_norm": 1.00124990940094, + "learning_rate": 0.00021082178285093084, + "loss": 3.413, + "step": 92955 + }, + { + "epoch": 6.316075553743715, + "grad_norm": 0.8112602233886719, + "learning_rate": 0.00021077931784209812, + "loss": 3.5416, + "step": 92960 + }, + { + "epoch": 6.316415273814377, + "grad_norm": 0.7758859992027283, + "learning_rate": 0.00021073685283326538, + "loss": 3.2272, + "step": 92965 + }, + { + "epoch": 6.316754993885039, + "grad_norm": 0.9911605715751648, + "learning_rate": 0.00021069438782443268, + "loss": 3.3351, + "step": 92970 + }, + { + "epoch": 6.317094713955701, + "grad_norm": 0.8621547818183899, + "learning_rate": 0.00021065192281559994, + "loss": 3.4519, + "step": 92975 + }, + { + "epoch": 6.317434434026362, + "grad_norm": 1.0482172966003418, + "learning_rate": 0.00021060945780676724, + "loss": 3.4194, + "step": 92980 + }, + { + "epoch": 6.317774154097024, + "grad_norm": 0.8945565223693848, + "learning_rate": 0.0002105669927979345, + "loss": 3.3026, + "step": 92985 + }, + { + "epoch": 6.318113874167686, + "grad_norm": 0.7768453359603882, + "learning_rate": 0.00021052452778910178, + "loss": 3.63, + "step": 92990 + }, + { + "epoch": 6.318453594238347, + "grad_norm": 1.0830323696136475, + "learning_rate": 0.00021048206278026908, + "loss": 3.4609, + "step": 92995 + }, + { + "epoch": 6.318793314309009, + "grad_norm": 1.1656113862991333, + "learning_rate": 0.00021043959777143634, + "loss": 3.3012, + "step": 93000 + }, + { + "epoch": 6.319133034379671, + "grad_norm": 0.8868629932403564, + "learning_rate": 0.00021039713276260362, + "loss": 3.5319, + "step": 93005 + }, + { + "epoch": 6.319472754450333, + "grad_norm": 0.7960383892059326, + "learning_rate": 0.0002103546677537709, + "loss": 3.5288, + "step": 93010 + }, + { + "epoch": 6.319812474520995, + "grad_norm": 1.0050671100616455, + "learning_rate": 0.00021031220274493818, + "loss": 3.1867, + "step": 93015 + }, + { + "epoch": 6.320152194591657, + "grad_norm": 1.5381766557693481, + "learning_rate": 0.00021026973773610546, + "loss": 3.3371, + "step": 93020 + }, + { + "epoch": 6.320491914662318, + "grad_norm": 0.9666773676872253, + "learning_rate": 0.00021022727272727274, + "loss": 3.3821, + "step": 93025 + }, + { + "epoch": 6.32083163473298, + "grad_norm": 0.9862014651298523, + "learning_rate": 0.00021018480771844002, + "loss": 3.3966, + "step": 93030 + }, + { + "epoch": 6.321171354803642, + "grad_norm": 0.8618085980415344, + "learning_rate": 0.00021014234270960727, + "loss": 3.3175, + "step": 93035 + }, + { + "epoch": 6.321511074874303, + "grad_norm": 0.7671900987625122, + "learning_rate": 0.00021009987770077458, + "loss": 3.2977, + "step": 93040 + }, + { + "epoch": 6.321850794944965, + "grad_norm": 0.9140551090240479, + "learning_rate": 0.00021005741269194183, + "loss": 3.268, + "step": 93045 + }, + { + "epoch": 6.3221905150156275, + "grad_norm": 1.3420991897583008, + "learning_rate": 0.0002100149476831091, + "loss": 3.2544, + "step": 93050 + }, + { + "epoch": 6.322530235086289, + "grad_norm": 1.652020812034607, + "learning_rate": 0.00020997248267427642, + "loss": 3.1485, + "step": 93055 + }, + { + "epoch": 6.322869955156951, + "grad_norm": 1.24189293384552, + "learning_rate": 0.00020993001766544367, + "loss": 3.1959, + "step": 93060 + }, + { + "epoch": 6.323209675227613, + "grad_norm": 0.9282287955284119, + "learning_rate": 0.00020988755265661098, + "loss": 3.261, + "step": 93065 + }, + { + "epoch": 6.323549395298274, + "grad_norm": 1.1351302862167358, + "learning_rate": 0.00020984508764777823, + "loss": 3.4605, + "step": 93070 + }, + { + "epoch": 6.323889115368936, + "grad_norm": 1.136211633682251, + "learning_rate": 0.0002098026226389455, + "loss": 3.5935, + "step": 93075 + }, + { + "epoch": 6.324228835439598, + "grad_norm": 0.9300395250320435, + "learning_rate": 0.0002097601576301128, + "loss": 3.474, + "step": 93080 + }, + { + "epoch": 6.324568555510259, + "grad_norm": 1.052943229675293, + "learning_rate": 0.00020971769262128007, + "loss": 3.273, + "step": 93085 + }, + { + "epoch": 6.324908275580921, + "grad_norm": 1.0090116262435913, + "learning_rate": 0.00020967522761244735, + "loss": 3.7892, + "step": 93090 + }, + { + "epoch": 6.3252479956515835, + "grad_norm": 2.6566550731658936, + "learning_rate": 0.00020963276260361463, + "loss": 3.3422, + "step": 93095 + }, + { + "epoch": 6.325587715722245, + "grad_norm": 0.8890952467918396, + "learning_rate": 0.0002095902975947819, + "loss": 3.2033, + "step": 93100 + }, + { + "epoch": 6.325927435792907, + "grad_norm": 0.7992276549339294, + "learning_rate": 0.00020954783258594916, + "loss": 3.4467, + "step": 93105 + }, + { + "epoch": 6.326267155863569, + "grad_norm": 0.9008855819702148, + "learning_rate": 0.00020950536757711647, + "loss": 3.4235, + "step": 93110 + }, + { + "epoch": 6.32660687593423, + "grad_norm": 0.8437705039978027, + "learning_rate": 0.00020946290256828372, + "loss": 3.3948, + "step": 93115 + }, + { + "epoch": 6.326946596004892, + "grad_norm": 0.8326396942138672, + "learning_rate": 0.000209420437559451, + "loss": 3.5228, + "step": 93120 + }, + { + "epoch": 6.327286316075554, + "grad_norm": 0.6847007274627686, + "learning_rate": 0.0002093779725506183, + "loss": 3.4202, + "step": 93125 + }, + { + "epoch": 6.327626036146215, + "grad_norm": 1.0278617143630981, + "learning_rate": 0.00020933550754178556, + "loss": 3.4782, + "step": 93130 + }, + { + "epoch": 6.327965756216877, + "grad_norm": 1.1295886039733887, + "learning_rate": 0.00020929304253295284, + "loss": 3.3781, + "step": 93135 + }, + { + "epoch": 6.3283054762875395, + "grad_norm": 1.0862926244735718, + "learning_rate": 0.00020925057752412012, + "loss": 3.4469, + "step": 93140 + }, + { + "epoch": 6.328645196358201, + "grad_norm": 1.3979166746139526, + "learning_rate": 0.0002092081125152874, + "loss": 3.1465, + "step": 93145 + }, + { + "epoch": 6.328984916428863, + "grad_norm": 0.9991622567176819, + "learning_rate": 0.0002091656475064547, + "loss": 3.4088, + "step": 93150 + }, + { + "epoch": 6.329324636499525, + "grad_norm": 0.8538918495178223, + "learning_rate": 0.00020912318249762196, + "loss": 3.4404, + "step": 93155 + }, + { + "epoch": 6.329664356570186, + "grad_norm": 1.2563645839691162, + "learning_rate": 0.00020908071748878924, + "loss": 3.5469, + "step": 93160 + }, + { + "epoch": 6.330004076640848, + "grad_norm": 0.9576836228370667, + "learning_rate": 0.00020903825247995652, + "loss": 3.3592, + "step": 93165 + }, + { + "epoch": 6.33034379671151, + "grad_norm": 0.8904599547386169, + "learning_rate": 0.0002089957874711238, + "loss": 3.2915, + "step": 93170 + }, + { + "epoch": 6.330683516782171, + "grad_norm": 0.9043847918510437, + "learning_rate": 0.00020895332246229106, + "loss": 3.1318, + "step": 93175 + }, + { + "epoch": 6.331023236852833, + "grad_norm": 0.9230297803878784, + "learning_rate": 0.00020891085745345836, + "loss": 3.4047, + "step": 93180 + }, + { + "epoch": 6.3313629569234955, + "grad_norm": 1.252243161201477, + "learning_rate": 0.00020886839244462564, + "loss": 3.2863, + "step": 93185 + }, + { + "epoch": 6.331702676994157, + "grad_norm": 0.9324498176574707, + "learning_rate": 0.0002088259274357929, + "loss": 3.435, + "step": 93190 + }, + { + "epoch": 6.332042397064819, + "grad_norm": 0.8474945425987244, + "learning_rate": 0.0002087834624269602, + "loss": 3.6786, + "step": 93195 + }, + { + "epoch": 6.332382117135481, + "grad_norm": 1.0068756341934204, + "learning_rate": 0.00020874099741812746, + "loss": 3.3365, + "step": 93200 + }, + { + "epoch": 6.332721837206142, + "grad_norm": 0.7839400768280029, + "learning_rate": 0.00020869853240929474, + "loss": 3.1369, + "step": 93205 + }, + { + "epoch": 6.333061557276804, + "grad_norm": 1.0236002206802368, + "learning_rate": 0.00020865606740046202, + "loss": 3.3558, + "step": 93210 + }, + { + "epoch": 6.333401277347465, + "grad_norm": 0.9158685803413391, + "learning_rate": 0.0002086136023916293, + "loss": 3.4576, + "step": 93215 + }, + { + "epoch": 6.333740997418127, + "grad_norm": 0.9387865662574768, + "learning_rate": 0.00020857113738279658, + "loss": 3.4051, + "step": 93220 + }, + { + "epoch": 6.334080717488789, + "grad_norm": 0.8800220489501953, + "learning_rate": 0.00020852867237396386, + "loss": 3.2327, + "step": 93225 + }, + { + "epoch": 6.334420437559451, + "grad_norm": 0.793998658657074, + "learning_rate": 0.00020848620736513114, + "loss": 3.3568, + "step": 93230 + }, + { + "epoch": 6.334760157630113, + "grad_norm": 0.9760531783103943, + "learning_rate": 0.00020844374235629842, + "loss": 3.7384, + "step": 93235 + }, + { + "epoch": 6.335099877700775, + "grad_norm": 0.9919188618659973, + "learning_rate": 0.0002084012773474657, + "loss": 3.384, + "step": 93240 + }, + { + "epoch": 6.335439597771436, + "grad_norm": 0.7417827844619751, + "learning_rate": 0.00020835881233863295, + "loss": 3.3959, + "step": 93245 + }, + { + "epoch": 6.335779317842098, + "grad_norm": 0.9556022882461548, + "learning_rate": 0.00020831634732980026, + "loss": 3.4897, + "step": 93250 + }, + { + "epoch": 6.33611903791276, + "grad_norm": 0.9978426098823547, + "learning_rate": 0.00020827388232096754, + "loss": 3.3583, + "step": 93255 + }, + { + "epoch": 6.336458757983421, + "grad_norm": 0.9090107679367065, + "learning_rate": 0.0002082314173121348, + "loss": 3.4698, + "step": 93260 + }, + { + "epoch": 6.336798478054083, + "grad_norm": 0.8269804120063782, + "learning_rate": 0.0002081889523033021, + "loss": 3.4824, + "step": 93265 + }, + { + "epoch": 6.337138198124745, + "grad_norm": 1.0559688806533813, + "learning_rate": 0.00020814648729446935, + "loss": 3.2938, + "step": 93270 + }, + { + "epoch": 6.337477918195407, + "grad_norm": 1.1664726734161377, + "learning_rate": 0.00020810402228563663, + "loss": 3.663, + "step": 93275 + }, + { + "epoch": 6.337817638266069, + "grad_norm": 1.3126660585403442, + "learning_rate": 0.0002080615572768039, + "loss": 3.5617, + "step": 93280 + }, + { + "epoch": 6.338157358336731, + "grad_norm": 1.090058445930481, + "learning_rate": 0.0002080190922679712, + "loss": 3.3676, + "step": 93285 + }, + { + "epoch": 6.338497078407392, + "grad_norm": 0.8884878158569336, + "learning_rate": 0.00020797662725913847, + "loss": 3.5021, + "step": 93290 + }, + { + "epoch": 6.338836798478054, + "grad_norm": 0.7857224941253662, + "learning_rate": 0.00020793416225030575, + "loss": 3.4956, + "step": 93295 + }, + { + "epoch": 6.339176518548716, + "grad_norm": 0.7349330186843872, + "learning_rate": 0.00020789169724147303, + "loss": 3.3038, + "step": 93300 + }, + { + "epoch": 6.339516238619377, + "grad_norm": 1.0087381601333618, + "learning_rate": 0.00020784923223264029, + "loss": 3.5547, + "step": 93305 + }, + { + "epoch": 6.339855958690039, + "grad_norm": 0.9086246490478516, + "learning_rate": 0.0002078067672238076, + "loss": 3.3715, + "step": 93310 + }, + { + "epoch": 6.3401956787607014, + "grad_norm": 1.1462993621826172, + "learning_rate": 0.00020776430221497487, + "loss": 3.1395, + "step": 93315 + }, + { + "epoch": 6.340535398831363, + "grad_norm": 1.9170321226119995, + "learning_rate": 0.00020772183720614215, + "loss": 3.5413, + "step": 93320 + }, + { + "epoch": 6.340875118902025, + "grad_norm": 0.8643113374710083, + "learning_rate": 0.00020767937219730943, + "loss": 3.3607, + "step": 93325 + }, + { + "epoch": 6.341214838972687, + "grad_norm": 0.9495186805725098, + "learning_rate": 0.00020763690718847669, + "loss": 3.4614, + "step": 93330 + }, + { + "epoch": 6.341554559043348, + "grad_norm": 0.8364635705947876, + "learning_rate": 0.000207594442179644, + "loss": 3.1323, + "step": 93335 + }, + { + "epoch": 6.34189427911401, + "grad_norm": 1.0455431938171387, + "learning_rate": 0.00020755197717081125, + "loss": 3.2826, + "step": 93340 + }, + { + "epoch": 6.342233999184672, + "grad_norm": 0.8381140828132629, + "learning_rate": 0.00020750951216197853, + "loss": 3.5118, + "step": 93345 + }, + { + "epoch": 6.342573719255333, + "grad_norm": 0.7886340022087097, + "learning_rate": 0.00020746704715314583, + "loss": 3.2852, + "step": 93350 + }, + { + "epoch": 6.342913439325995, + "grad_norm": 0.8454186916351318, + "learning_rate": 0.00020742458214431309, + "loss": 3.463, + "step": 93355 + }, + { + "epoch": 6.3432531593966575, + "grad_norm": 0.7664470672607422, + "learning_rate": 0.00020738211713548037, + "loss": 3.2103, + "step": 93360 + }, + { + "epoch": 6.343592879467319, + "grad_norm": 1.031011939048767, + "learning_rate": 0.00020733965212664765, + "loss": 3.3781, + "step": 93365 + }, + { + "epoch": 6.343932599537981, + "grad_norm": 1.106801986694336, + "learning_rate": 0.00020729718711781493, + "loss": 3.4215, + "step": 93370 + }, + { + "epoch": 6.344272319608643, + "grad_norm": 0.9409727454185486, + "learning_rate": 0.00020725472210898218, + "loss": 3.5239, + "step": 93375 + }, + { + "epoch": 6.344612039679304, + "grad_norm": 0.8359094262123108, + "learning_rate": 0.0002072122571001495, + "loss": 3.4383, + "step": 93380 + }, + { + "epoch": 6.344951759749966, + "grad_norm": 1.1221867799758911, + "learning_rate": 0.00020716979209131677, + "loss": 3.554, + "step": 93385 + }, + { + "epoch": 6.345291479820628, + "grad_norm": 1.0289158821105957, + "learning_rate": 0.00020712732708248402, + "loss": 3.3138, + "step": 93390 + }, + { + "epoch": 6.345631199891289, + "grad_norm": 0.7355784177780151, + "learning_rate": 0.00020708486207365133, + "loss": 3.5178, + "step": 93395 + }, + { + "epoch": 6.345970919961951, + "grad_norm": 0.9779805541038513, + "learning_rate": 0.00020704239706481858, + "loss": 3.2452, + "step": 93400 + }, + { + "epoch": 6.3463106400326135, + "grad_norm": 1.1004780530929565, + "learning_rate": 0.0002069999320559859, + "loss": 3.3506, + "step": 93405 + }, + { + "epoch": 6.346650360103275, + "grad_norm": 0.8549122214317322, + "learning_rate": 0.00020695746704715314, + "loss": 3.4006, + "step": 93410 + }, + { + "epoch": 6.346990080173937, + "grad_norm": 0.8494618535041809, + "learning_rate": 0.00020691500203832042, + "loss": 3.334, + "step": 93415 + }, + { + "epoch": 6.347329800244599, + "grad_norm": 1.0448490381240845, + "learning_rate": 0.00020687253702948773, + "loss": 3.4056, + "step": 93420 + }, + { + "epoch": 6.34766952031526, + "grad_norm": 0.8983119130134583, + "learning_rate": 0.00020683007202065498, + "loss": 3.5206, + "step": 93425 + }, + { + "epoch": 6.348009240385922, + "grad_norm": 1.0215389728546143, + "learning_rate": 0.00020678760701182226, + "loss": 3.269, + "step": 93430 + }, + { + "epoch": 6.348348960456584, + "grad_norm": 0.8797133564949036, + "learning_rate": 0.00020674514200298954, + "loss": 3.2472, + "step": 93435 + }, + { + "epoch": 6.348688680527245, + "grad_norm": 1.0345747470855713, + "learning_rate": 0.00020670267699415682, + "loss": 3.3832, + "step": 93440 + }, + { + "epoch": 6.349028400597907, + "grad_norm": 0.9211626648902893, + "learning_rate": 0.0002066602119853241, + "loss": 3.4703, + "step": 93445 + }, + { + "epoch": 6.3493681206685695, + "grad_norm": 0.8019527196884155, + "learning_rate": 0.00020661774697649138, + "loss": 3.5561, + "step": 93450 + }, + { + "epoch": 6.349707840739231, + "grad_norm": 0.8479011058807373, + "learning_rate": 0.00020657528196765866, + "loss": 3.5688, + "step": 93455 + }, + { + "epoch": 6.350047560809893, + "grad_norm": 0.9467070698738098, + "learning_rate": 0.00020653281695882591, + "loss": 3.2161, + "step": 93460 + }, + { + "epoch": 6.350387280880555, + "grad_norm": 1.0323090553283691, + "learning_rate": 0.00020649035194999322, + "loss": 3.3966, + "step": 93465 + }, + { + "epoch": 6.350727000951216, + "grad_norm": 1.012424349784851, + "learning_rate": 0.00020644788694116047, + "loss": 3.4683, + "step": 93470 + }, + { + "epoch": 6.351066721021878, + "grad_norm": 1.1126595735549927, + "learning_rate": 0.00020640542193232775, + "loss": 3.7197, + "step": 93475 + }, + { + "epoch": 6.35140644109254, + "grad_norm": 1.0461549758911133, + "learning_rate": 0.00020636295692349506, + "loss": 3.4677, + "step": 93480 + }, + { + "epoch": 6.351746161163201, + "grad_norm": 0.9027420878410339, + "learning_rate": 0.00020632049191466231, + "loss": 3.3841, + "step": 93485 + }, + { + "epoch": 6.352085881233863, + "grad_norm": 0.7944022417068481, + "learning_rate": 0.00020627802690582962, + "loss": 3.1492, + "step": 93490 + }, + { + "epoch": 6.352425601304525, + "grad_norm": 0.8830223083496094, + "learning_rate": 0.00020623556189699687, + "loss": 3.2933, + "step": 93495 + }, + { + "epoch": 6.352765321375187, + "grad_norm": 0.8116979598999023, + "learning_rate": 0.00020619309688816415, + "loss": 3.535, + "step": 93500 + }, + { + "epoch": 6.353105041445849, + "grad_norm": 0.76048743724823, + "learning_rate": 0.00020615063187933143, + "loss": 3.279, + "step": 93505 + }, + { + "epoch": 6.35344476151651, + "grad_norm": 0.8971797227859497, + "learning_rate": 0.00020610816687049871, + "loss": 3.3393, + "step": 93510 + }, + { + "epoch": 6.353784481587172, + "grad_norm": 0.9180324673652649, + "learning_rate": 0.000206065701861666, + "loss": 3.4895, + "step": 93515 + }, + { + "epoch": 6.354124201657834, + "grad_norm": 1.0295530557632446, + "learning_rate": 0.00020602323685283327, + "loss": 3.1847, + "step": 93520 + }, + { + "epoch": 6.354463921728495, + "grad_norm": 0.7035699486732483, + "learning_rate": 0.00020598077184400055, + "loss": 3.4781, + "step": 93525 + }, + { + "epoch": 6.354803641799157, + "grad_norm": 0.8369618058204651, + "learning_rate": 0.0002059383068351678, + "loss": 3.4746, + "step": 93530 + }, + { + "epoch": 6.355143361869819, + "grad_norm": 1.6016539335250854, + "learning_rate": 0.00020589584182633511, + "loss": 3.5846, + "step": 93535 + }, + { + "epoch": 6.355483081940481, + "grad_norm": 1.1114078760147095, + "learning_rate": 0.00020585337681750237, + "loss": 3.265, + "step": 93540 + }, + { + "epoch": 6.355822802011143, + "grad_norm": 1.562318205833435, + "learning_rate": 0.00020581091180866965, + "loss": 3.7183, + "step": 93545 + }, + { + "epoch": 6.356162522081805, + "grad_norm": 1.0365420579910278, + "learning_rate": 0.00020576844679983695, + "loss": 3.4182, + "step": 93550 + }, + { + "epoch": 6.356502242152466, + "grad_norm": 1.0414812564849854, + "learning_rate": 0.0002057259817910042, + "loss": 3.474, + "step": 93555 + }, + { + "epoch": 6.356841962223128, + "grad_norm": 1.0579203367233276, + "learning_rate": 0.0002056835167821715, + "loss": 3.0974, + "step": 93560 + }, + { + "epoch": 6.35718168229379, + "grad_norm": 1.0080183744430542, + "learning_rate": 0.00020564105177333877, + "loss": 3.1677, + "step": 93565 + }, + { + "epoch": 6.357521402364451, + "grad_norm": 0.7289566397666931, + "learning_rate": 0.00020559858676450605, + "loss": 3.3741, + "step": 93570 + }, + { + "epoch": 6.357861122435113, + "grad_norm": 0.9932776689529419, + "learning_rate": 0.00020555612175567336, + "loss": 3.4883, + "step": 93575 + }, + { + "epoch": 6.358200842505775, + "grad_norm": 0.8867397904396057, + "learning_rate": 0.0002055136567468406, + "loss": 3.5209, + "step": 93580 + }, + { + "epoch": 6.358540562576437, + "grad_norm": 0.8506049513816833, + "learning_rate": 0.0002054711917380079, + "loss": 3.4024, + "step": 93585 + }, + { + "epoch": 6.358880282647099, + "grad_norm": 1.133637547492981, + "learning_rate": 0.00020542872672917517, + "loss": 3.363, + "step": 93590 + }, + { + "epoch": 6.359220002717761, + "grad_norm": 0.8854036927223206, + "learning_rate": 0.00020538626172034245, + "loss": 3.3426, + "step": 93595 + }, + { + "epoch": 6.359559722788422, + "grad_norm": 0.7757336497306824, + "learning_rate": 0.0002053437967115097, + "loss": 3.6872, + "step": 93600 + }, + { + "epoch": 6.359899442859084, + "grad_norm": 0.8562105298042297, + "learning_rate": 0.000205301331702677, + "loss": 3.1608, + "step": 93605 + }, + { + "epoch": 6.360239162929746, + "grad_norm": 0.7608985304832458, + "learning_rate": 0.0002052588666938443, + "loss": 3.5693, + "step": 93610 + }, + { + "epoch": 6.360578883000407, + "grad_norm": 0.9016426205635071, + "learning_rate": 0.00020521640168501154, + "loss": 3.4766, + "step": 93615 + }, + { + "epoch": 6.360918603071069, + "grad_norm": 0.7544326186180115, + "learning_rate": 0.00020517393667617885, + "loss": 3.3496, + "step": 93620 + }, + { + "epoch": 6.3612583231417315, + "grad_norm": 0.8396128416061401, + "learning_rate": 0.0002051314716673461, + "loss": 3.483, + "step": 93625 + }, + { + "epoch": 6.361598043212393, + "grad_norm": 2.500155210494995, + "learning_rate": 0.00020508900665851338, + "loss": 3.2149, + "step": 93630 + }, + { + "epoch": 6.361937763283055, + "grad_norm": 0.7796416878700256, + "learning_rate": 0.00020504654164968066, + "loss": 3.4957, + "step": 93635 + }, + { + "epoch": 6.362277483353717, + "grad_norm": 1.8002458810806274, + "learning_rate": 0.00020500407664084794, + "loss": 3.428, + "step": 93640 + }, + { + "epoch": 6.362617203424378, + "grad_norm": 0.9591827988624573, + "learning_rate": 0.00020496161163201522, + "loss": 3.6118, + "step": 93645 + }, + { + "epoch": 6.36295692349504, + "grad_norm": 0.8259029388427734, + "learning_rate": 0.0002049191466231825, + "loss": 3.2933, + "step": 93650 + }, + { + "epoch": 6.363296643565702, + "grad_norm": 0.8730987906455994, + "learning_rate": 0.00020487668161434978, + "loss": 3.2076, + "step": 93655 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 0.8388166427612305, + "learning_rate": 0.00020483421660551706, + "loss": 3.211, + "step": 93660 + }, + { + "epoch": 6.363976083707025, + "grad_norm": 0.819059431552887, + "learning_rate": 0.00020479175159668434, + "loss": 3.3719, + "step": 93665 + }, + { + "epoch": 6.3643158037776875, + "grad_norm": 1.1819251775741577, + "learning_rate": 0.0002047492865878516, + "loss": 3.4893, + "step": 93670 + }, + { + "epoch": 6.364655523848349, + "grad_norm": 0.7535378932952881, + "learning_rate": 0.0002047068215790189, + "loss": 3.4392, + "step": 93675 + }, + { + "epoch": 6.364995243919011, + "grad_norm": 1.1734732389450073, + "learning_rate": 0.00020466435657018618, + "loss": 3.1515, + "step": 93680 + }, + { + "epoch": 6.365334963989673, + "grad_norm": 1.7322012186050415, + "learning_rate": 0.00020462189156135344, + "loss": 3.6378, + "step": 93685 + }, + { + "epoch": 6.365674684060334, + "grad_norm": 0.8815013766288757, + "learning_rate": 0.00020457942655252074, + "loss": 3.6492, + "step": 93690 + }, + { + "epoch": 6.366014404130996, + "grad_norm": 0.8546686172485352, + "learning_rate": 0.000204536961543688, + "loss": 3.3871, + "step": 93695 + }, + { + "epoch": 6.366354124201658, + "grad_norm": 0.7245252132415771, + "learning_rate": 0.00020449449653485528, + "loss": 3.4778, + "step": 93700 + }, + { + "epoch": 6.366693844272319, + "grad_norm": 0.8878322839736938, + "learning_rate": 0.00020445203152602256, + "loss": 3.0522, + "step": 93705 + }, + { + "epoch": 6.367033564342981, + "grad_norm": 1.0090646743774414, + "learning_rate": 0.00020440956651718984, + "loss": 3.5393, + "step": 93710 + }, + { + "epoch": 6.3673732844136435, + "grad_norm": 0.9384252429008484, + "learning_rate": 0.00020436710150835712, + "loss": 3.4158, + "step": 93715 + }, + { + "epoch": 6.367713004484305, + "grad_norm": 1.0485002994537354, + "learning_rate": 0.0002043246364995244, + "loss": 3.2112, + "step": 93720 + }, + { + "epoch": 6.368052724554967, + "grad_norm": 0.8687781691551208, + "learning_rate": 0.00020428217149069168, + "loss": 3.5131, + "step": 93725 + }, + { + "epoch": 6.368392444625629, + "grad_norm": 0.9001986384391785, + "learning_rate": 0.00020423970648185893, + "loss": 3.5068, + "step": 93730 + }, + { + "epoch": 6.36873216469629, + "grad_norm": 0.8505859971046448, + "learning_rate": 0.00020419724147302624, + "loss": 3.3022, + "step": 93735 + }, + { + "epoch": 6.369071884766952, + "grad_norm": 2.7646405696868896, + "learning_rate": 0.00020415477646419352, + "loss": 3.3316, + "step": 93740 + }, + { + "epoch": 6.369411604837614, + "grad_norm": 0.9440542459487915, + "learning_rate": 0.0002041123114553608, + "loss": 3.6525, + "step": 93745 + }, + { + "epoch": 6.369751324908275, + "grad_norm": 0.9235779047012329, + "learning_rate": 0.00020406984644652808, + "loss": 3.4896, + "step": 93750 + }, + { + "epoch": 6.370091044978937, + "grad_norm": 1.074874997138977, + "learning_rate": 0.00020402738143769533, + "loss": 3.418, + "step": 93755 + }, + { + "epoch": 6.3704307650495995, + "grad_norm": 0.7949504256248474, + "learning_rate": 0.00020398491642886264, + "loss": 3.4688, + "step": 93760 + }, + { + "epoch": 6.370770485120261, + "grad_norm": 0.9134211540222168, + "learning_rate": 0.0002039424514200299, + "loss": 3.4949, + "step": 93765 + }, + { + "epoch": 6.371110205190923, + "grad_norm": 0.7407535910606384, + "learning_rate": 0.00020389998641119717, + "loss": 3.4782, + "step": 93770 + }, + { + "epoch": 6.371449925261585, + "grad_norm": 0.833960235118866, + "learning_rate": 0.00020385752140236448, + "loss": 3.3878, + "step": 93775 + }, + { + "epoch": 6.371789645332246, + "grad_norm": 1.493695616722107, + "learning_rate": 0.00020381505639353173, + "loss": 3.4569, + "step": 93780 + }, + { + "epoch": 6.372129365402908, + "grad_norm": 0.8918629288673401, + "learning_rate": 0.000203772591384699, + "loss": 3.3434, + "step": 93785 + }, + { + "epoch": 6.37246908547357, + "grad_norm": 0.792733907699585, + "learning_rate": 0.0002037301263758663, + "loss": 3.387, + "step": 93790 + }, + { + "epoch": 6.372808805544231, + "grad_norm": 0.7395930886268616, + "learning_rate": 0.00020368766136703357, + "loss": 3.4733, + "step": 93795 + }, + { + "epoch": 6.373148525614893, + "grad_norm": 0.8922614455223083, + "learning_rate": 0.00020364519635820082, + "loss": 3.2431, + "step": 93800 + }, + { + "epoch": 6.3734882456855555, + "grad_norm": 0.8762533664703369, + "learning_rate": 0.00020360273134936813, + "loss": 3.4839, + "step": 93805 + }, + { + "epoch": 6.373827965756217, + "grad_norm": 0.9175192713737488, + "learning_rate": 0.0002035602663405354, + "loss": 3.5579, + "step": 93810 + }, + { + "epoch": 6.374167685826879, + "grad_norm": 0.9735671281814575, + "learning_rate": 0.00020351780133170266, + "loss": 3.4218, + "step": 93815 + }, + { + "epoch": 6.374507405897541, + "grad_norm": 0.8489055037498474, + "learning_rate": 0.00020347533632286997, + "loss": 3.4316, + "step": 93820 + }, + { + "epoch": 6.374847125968202, + "grad_norm": 0.9271124601364136, + "learning_rate": 0.00020343287131403722, + "loss": 3.3534, + "step": 93825 + }, + { + "epoch": 6.375186846038864, + "grad_norm": 1.012939214706421, + "learning_rate": 0.00020339040630520453, + "loss": 3.5432, + "step": 93830 + }, + { + "epoch": 6.375526566109526, + "grad_norm": 0.9126704931259155, + "learning_rate": 0.00020334794129637178, + "loss": 3.3296, + "step": 93835 + }, + { + "epoch": 6.375866286180187, + "grad_norm": 0.8263074159622192, + "learning_rate": 0.00020330547628753906, + "loss": 3.4192, + "step": 93840 + }, + { + "epoch": 6.376206006250849, + "grad_norm": 0.9870043396949768, + "learning_rate": 0.00020326301127870637, + "loss": 3.4496, + "step": 93845 + }, + { + "epoch": 6.3765457263215115, + "grad_norm": 0.8970112204551697, + "learning_rate": 0.00020322054626987362, + "loss": 3.3731, + "step": 93850 + }, + { + "epoch": 6.376885446392173, + "grad_norm": 1.3440756797790527, + "learning_rate": 0.0002031780812610409, + "loss": 3.3268, + "step": 93855 + }, + { + "epoch": 6.377225166462835, + "grad_norm": 0.6801615357398987, + "learning_rate": 0.00020313561625220818, + "loss": 3.6715, + "step": 93860 + }, + { + "epoch": 6.377564886533497, + "grad_norm": 0.9496816396713257, + "learning_rate": 0.00020309315124337546, + "loss": 3.3828, + "step": 93865 + }, + { + "epoch": 6.377904606604158, + "grad_norm": 0.8262764811515808, + "learning_rate": 0.00020305068623454272, + "loss": 3.3932, + "step": 93870 + }, + { + "epoch": 6.37824432667482, + "grad_norm": 0.7926307916641235, + "learning_rate": 0.00020300822122571002, + "loss": 3.3357, + "step": 93875 + }, + { + "epoch": 6.378584046745482, + "grad_norm": 1.1899144649505615, + "learning_rate": 0.0002029657562168773, + "loss": 3.431, + "step": 93880 + }, + { + "epoch": 6.378923766816143, + "grad_norm": 1.064333200454712, + "learning_rate": 0.00020292329120804456, + "loss": 3.1262, + "step": 93885 + }, + { + "epoch": 6.3792634868868054, + "grad_norm": 0.978529155254364, + "learning_rate": 0.00020288082619921186, + "loss": 3.1959, + "step": 93890 + }, + { + "epoch": 6.379603206957467, + "grad_norm": 0.8881118893623352, + "learning_rate": 0.00020283836119037912, + "loss": 3.0375, + "step": 93895 + }, + { + "epoch": 6.379942927028129, + "grad_norm": 1.1383733749389648, + "learning_rate": 0.0002027958961815464, + "loss": 3.4464, + "step": 93900 + }, + { + "epoch": 6.380282647098791, + "grad_norm": 0.8759476542472839, + "learning_rate": 0.0002027534311727137, + "loss": 3.3581, + "step": 93905 + }, + { + "epoch": 6.380622367169452, + "grad_norm": 0.842117190361023, + "learning_rate": 0.00020271096616388096, + "loss": 3.0634, + "step": 93910 + }, + { + "epoch": 6.380962087240114, + "grad_norm": 0.872198760509491, + "learning_rate": 0.00020266850115504827, + "loss": 3.642, + "step": 93915 + }, + { + "epoch": 6.381301807310776, + "grad_norm": 0.9458057880401611, + "learning_rate": 0.00020262603614621552, + "loss": 2.7015, + "step": 93920 + }, + { + "epoch": 6.381641527381437, + "grad_norm": 0.8200951218605042, + "learning_rate": 0.0002025835711373828, + "loss": 3.4453, + "step": 93925 + }, + { + "epoch": 6.381981247452099, + "grad_norm": 0.8155049085617065, + "learning_rate": 0.00020254110612855008, + "loss": 3.6861, + "step": 93930 + }, + { + "epoch": 6.3823209675227615, + "grad_norm": 0.9199687838554382, + "learning_rate": 0.00020249864111971736, + "loss": 3.2121, + "step": 93935 + }, + { + "epoch": 6.382660687593423, + "grad_norm": 0.8288843035697937, + "learning_rate": 0.00020245617611088464, + "loss": 3.4542, + "step": 93940 + }, + { + "epoch": 6.383000407664085, + "grad_norm": 0.8874051570892334, + "learning_rate": 0.00020241371110205192, + "loss": 3.4849, + "step": 93945 + }, + { + "epoch": 6.383340127734747, + "grad_norm": 0.8851376175880432, + "learning_rate": 0.0002023712460932192, + "loss": 3.2481, + "step": 93950 + }, + { + "epoch": 6.383679847805408, + "grad_norm": 1.0003366470336914, + "learning_rate": 0.00020232878108438645, + "loss": 3.4645, + "step": 93955 + }, + { + "epoch": 6.38401956787607, + "grad_norm": 1.3147093057632446, + "learning_rate": 0.00020228631607555376, + "loss": 3.3053, + "step": 93960 + }, + { + "epoch": 6.384359287946732, + "grad_norm": 0.9427880048751831, + "learning_rate": 0.000202243851066721, + "loss": 3.4863, + "step": 93965 + }, + { + "epoch": 6.384699008017393, + "grad_norm": 1.0159236192703247, + "learning_rate": 0.0002022013860578883, + "loss": 3.7126, + "step": 93970 + }, + { + "epoch": 6.385038728088055, + "grad_norm": 0.8038498163223267, + "learning_rate": 0.0002021589210490556, + "loss": 3.3866, + "step": 93975 + }, + { + "epoch": 6.3853784481587175, + "grad_norm": 1.0278221368789673, + "learning_rate": 0.00020211645604022285, + "loss": 3.4342, + "step": 93980 + }, + { + "epoch": 6.385718168229379, + "grad_norm": 1.4688745737075806, + "learning_rate": 0.00020207399103139013, + "loss": 3.386, + "step": 93985 + }, + { + "epoch": 6.386057888300041, + "grad_norm": 0.7173513770103455, + "learning_rate": 0.0002020315260225574, + "loss": 3.3789, + "step": 93990 + }, + { + "epoch": 6.386397608370703, + "grad_norm": 1.058261513710022, + "learning_rate": 0.0002019890610137247, + "loss": 3.2531, + "step": 93995 + }, + { + "epoch": 6.386737328441364, + "grad_norm": 0.9783673286437988, + "learning_rate": 0.000201946596004892, + "loss": 3.2302, + "step": 94000 + }, + { + "epoch": 6.387077048512026, + "grad_norm": 1.03232741355896, + "learning_rate": 0.00020190413099605925, + "loss": 3.1362, + "step": 94005 + }, + { + "epoch": 6.387416768582688, + "grad_norm": 1.03545081615448, + "learning_rate": 0.00020186166598722653, + "loss": 3.6451, + "step": 94010 + }, + { + "epoch": 6.387756488653349, + "grad_norm": 0.8408604264259338, + "learning_rate": 0.0002018192009783938, + "loss": 3.5417, + "step": 94015 + }, + { + "epoch": 6.388096208724011, + "grad_norm": 0.8448161482810974, + "learning_rate": 0.0002017767359695611, + "loss": 3.3989, + "step": 94020 + }, + { + "epoch": 6.3884359287946735, + "grad_norm": 0.806337296962738, + "learning_rate": 0.00020173427096072835, + "loss": 3.5644, + "step": 94025 + }, + { + "epoch": 6.388775648865335, + "grad_norm": 0.9825842380523682, + "learning_rate": 0.00020169180595189565, + "loss": 3.4977, + "step": 94030 + }, + { + "epoch": 6.389115368935997, + "grad_norm": 11.634439468383789, + "learning_rate": 0.00020164934094306293, + "loss": 3.4658, + "step": 94035 + }, + { + "epoch": 6.389455089006659, + "grad_norm": 0.9492741227149963, + "learning_rate": 0.00020160687593423019, + "loss": 3.4897, + "step": 94040 + }, + { + "epoch": 6.38979480907732, + "grad_norm": 0.8777870535850525, + "learning_rate": 0.0002015644109253975, + "loss": 3.5515, + "step": 94045 + }, + { + "epoch": 6.390134529147982, + "grad_norm": 0.8293880224227905, + "learning_rate": 0.00020152194591656475, + "loss": 3.5404, + "step": 94050 + }, + { + "epoch": 6.390474249218644, + "grad_norm": 0.8477573990821838, + "learning_rate": 0.00020147948090773203, + "loss": 3.0974, + "step": 94055 + }, + { + "epoch": 6.390813969289305, + "grad_norm": 1.024011254310608, + "learning_rate": 0.0002014370158988993, + "loss": 3.286, + "step": 94060 + }, + { + "epoch": 6.391153689359967, + "grad_norm": 0.7434585094451904, + "learning_rate": 0.00020139455089006659, + "loss": 3.3946, + "step": 94065 + }, + { + "epoch": 6.3914934094306295, + "grad_norm": 0.9182376265525818, + "learning_rate": 0.00020135208588123387, + "loss": 3.744, + "step": 94070 + }, + { + "epoch": 6.391833129501291, + "grad_norm": 1.0079782009124756, + "learning_rate": 0.00020130962087240115, + "loss": 3.5311, + "step": 94075 + }, + { + "epoch": 6.392172849571953, + "grad_norm": 0.8505976796150208, + "learning_rate": 0.00020126715586356843, + "loss": 3.519, + "step": 94080 + }, + { + "epoch": 6.392512569642615, + "grad_norm": 0.8715249300003052, + "learning_rate": 0.0002012246908547357, + "loss": 3.5081, + "step": 94085 + }, + { + "epoch": 6.392852289713276, + "grad_norm": 1.099316954612732, + "learning_rate": 0.00020118222584590299, + "loss": 3.6629, + "step": 94090 + }, + { + "epoch": 6.393192009783938, + "grad_norm": 0.7680906057357788, + "learning_rate": 0.00020113976083707024, + "loss": 3.3254, + "step": 94095 + }, + { + "epoch": 6.3935317298546, + "grad_norm": 0.6995541453361511, + "learning_rate": 0.00020109729582823755, + "loss": 3.3422, + "step": 94100 + }, + { + "epoch": 6.393871449925261, + "grad_norm": 0.9848539233207703, + "learning_rate": 0.00020105483081940483, + "loss": 3.5219, + "step": 94105 + }, + { + "epoch": 6.394211169995923, + "grad_norm": 0.7631400227546692, + "learning_rate": 0.00020101236581057208, + "loss": 3.5439, + "step": 94110 + }, + { + "epoch": 6.3945508900665855, + "grad_norm": 1.1540635824203491, + "learning_rate": 0.0002009699008017394, + "loss": 3.189, + "step": 94115 + }, + { + "epoch": 6.394890610137247, + "grad_norm": 0.9479895234107971, + "learning_rate": 0.00020092743579290664, + "loss": 3.2747, + "step": 94120 + }, + { + "epoch": 6.395230330207909, + "grad_norm": 0.9890660047531128, + "learning_rate": 0.00020088497078407392, + "loss": 3.3745, + "step": 94125 + }, + { + "epoch": 6.395570050278571, + "grad_norm": 0.9767805337905884, + "learning_rate": 0.0002008425057752412, + "loss": 3.6068, + "step": 94130 + }, + { + "epoch": 6.395909770349232, + "grad_norm": 0.8647972941398621, + "learning_rate": 0.00020080004076640848, + "loss": 3.3334, + "step": 94135 + }, + { + "epoch": 6.396249490419894, + "grad_norm": 0.8816936016082764, + "learning_rate": 0.00020075757575757576, + "loss": 3.3211, + "step": 94140 + }, + { + "epoch": 6.396589210490556, + "grad_norm": 1.1049442291259766, + "learning_rate": 0.00020071511074874304, + "loss": 3.2005, + "step": 94145 + }, + { + "epoch": 6.396928930561217, + "grad_norm": 0.9697532057762146, + "learning_rate": 0.00020067264573991032, + "loss": 3.4596, + "step": 94150 + }, + { + "epoch": 6.397268650631879, + "grad_norm": 0.9228510856628418, + "learning_rate": 0.00020063018073107757, + "loss": 3.4283, + "step": 94155 + }, + { + "epoch": 6.3976083707025415, + "grad_norm": 1.001485824584961, + "learning_rate": 0.00020058771572224488, + "loss": 3.3977, + "step": 94160 + }, + { + "epoch": 6.397948090773203, + "grad_norm": 0.792380154132843, + "learning_rate": 0.00020054525071341216, + "loss": 3.3108, + "step": 94165 + }, + { + "epoch": 6.398287810843865, + "grad_norm": 0.7904065251350403, + "learning_rate": 0.00020050278570457944, + "loss": 3.4586, + "step": 94170 + }, + { + "epoch": 6.398627530914526, + "grad_norm": 0.8862085938453674, + "learning_rate": 0.00020046032069574672, + "loss": 3.3124, + "step": 94175 + }, + { + "epoch": 6.398967250985188, + "grad_norm": 0.8967287540435791, + "learning_rate": 0.00020041785568691397, + "loss": 3.3458, + "step": 94180 + }, + { + "epoch": 6.39930697105585, + "grad_norm": 0.9162827134132385, + "learning_rate": 0.00020037539067808128, + "loss": 3.3717, + "step": 94185 + }, + { + "epoch": 6.399646691126511, + "grad_norm": 0.8068088889122009, + "learning_rate": 0.00020033292566924853, + "loss": 3.6826, + "step": 94190 + }, + { + "epoch": 6.399986411197173, + "grad_norm": 0.9186222553253174, + "learning_rate": 0.00020029046066041581, + "loss": 3.5024, + "step": 94195 + }, + { + "epoch": 6.4003261312678354, + "grad_norm": 0.8167315125465393, + "learning_rate": 0.00020024799565158312, + "loss": 3.3473, + "step": 94200 + }, + { + "epoch": 6.400665851338497, + "grad_norm": 0.9966377019882202, + "learning_rate": 0.00020020553064275037, + "loss": 3.389, + "step": 94205 + }, + { + "epoch": 6.401005571409159, + "grad_norm": 0.8933510184288025, + "learning_rate": 0.00020016306563391765, + "loss": 3.2734, + "step": 94210 + }, + { + "epoch": 6.401345291479821, + "grad_norm": 1.0128064155578613, + "learning_rate": 0.00020012060062508493, + "loss": 3.4719, + "step": 94215 + }, + { + "epoch": 6.401685011550482, + "grad_norm": 0.7827800512313843, + "learning_rate": 0.00020007813561625221, + "loss": 3.5582, + "step": 94220 + }, + { + "epoch": 6.402024731621144, + "grad_norm": 1.0339906215667725, + "learning_rate": 0.00020003567060741947, + "loss": 3.2654, + "step": 94225 + }, + { + "epoch": 6.402364451691806, + "grad_norm": 0.9826642274856567, + "learning_rate": 0.00019999320559858677, + "loss": 3.2578, + "step": 94230 + }, + { + "epoch": 6.402704171762467, + "grad_norm": 0.913548469543457, + "learning_rate": 0.00019995074058975405, + "loss": 3.5637, + "step": 94235 + }, + { + "epoch": 6.403043891833129, + "grad_norm": 0.8933494687080383, + "learning_rate": 0.0001999082755809213, + "loss": 3.2403, + "step": 94240 + }, + { + "epoch": 6.4033836119037915, + "grad_norm": 1.0014725923538208, + "learning_rate": 0.00019986581057208861, + "loss": 3.2728, + "step": 94245 + }, + { + "epoch": 6.403723331974453, + "grad_norm": 1.0998022556304932, + "learning_rate": 0.00019982334556325587, + "loss": 3.572, + "step": 94250 + }, + { + "epoch": 6.404063052045115, + "grad_norm": 0.9520360827445984, + "learning_rate": 0.00019978088055442317, + "loss": 3.4612, + "step": 94255 + }, + { + "epoch": 6.404402772115777, + "grad_norm": 1.0365772247314453, + "learning_rate": 0.00019973841554559043, + "loss": 3.2811, + "step": 94260 + }, + { + "epoch": 6.404742492186438, + "grad_norm": 1.087678074836731, + "learning_rate": 0.0001996959505367577, + "loss": 3.4852, + "step": 94265 + }, + { + "epoch": 6.4050822122571, + "grad_norm": 1.0540409088134766, + "learning_rate": 0.00019965348552792502, + "loss": 3.4559, + "step": 94270 + }, + { + "epoch": 6.405421932327762, + "grad_norm": 1.6650466918945312, + "learning_rate": 0.00019961102051909227, + "loss": 3.2625, + "step": 94275 + }, + { + "epoch": 6.405761652398423, + "grad_norm": 0.8692065477371216, + "learning_rate": 0.00019956855551025955, + "loss": 3.3956, + "step": 94280 + }, + { + "epoch": 6.406101372469085, + "grad_norm": 0.8651995062828064, + "learning_rate": 0.00019952609050142683, + "loss": 3.3942, + "step": 94285 + }, + { + "epoch": 6.4064410925397475, + "grad_norm": 1.0013427734375, + "learning_rate": 0.0001994836254925941, + "loss": 3.4748, + "step": 94290 + }, + { + "epoch": 6.406780812610409, + "grad_norm": 1.3414863348007202, + "learning_rate": 0.00019944116048376136, + "loss": 3.2206, + "step": 94295 + }, + { + "epoch": 6.407120532681071, + "grad_norm": 1.03749680519104, + "learning_rate": 0.00019939869547492867, + "loss": 3.4578, + "step": 94300 + }, + { + "epoch": 6.407460252751733, + "grad_norm": 0.8394484519958496, + "learning_rate": 0.00019935623046609595, + "loss": 3.3997, + "step": 94305 + }, + { + "epoch": 6.407799972822394, + "grad_norm": 1.0638684034347534, + "learning_rate": 0.0001993137654572632, + "loss": 3.251, + "step": 94310 + }, + { + "epoch": 6.408139692893056, + "grad_norm": 0.8558035492897034, + "learning_rate": 0.0001992713004484305, + "loss": 3.2169, + "step": 94315 + }, + { + "epoch": 6.408479412963718, + "grad_norm": 1.0449111461639404, + "learning_rate": 0.00019922883543959776, + "loss": 3.4892, + "step": 94320 + }, + { + "epoch": 6.408819133034379, + "grad_norm": 1.139414668083191, + "learning_rate": 0.00019918637043076504, + "loss": 3.2536, + "step": 94325 + }, + { + "epoch": 6.409158853105041, + "grad_norm": 0.9507228136062622, + "learning_rate": 0.00019914390542193235, + "loss": 3.5573, + "step": 94330 + }, + { + "epoch": 6.4094985731757035, + "grad_norm": 0.8693867921829224, + "learning_rate": 0.0001991014404130996, + "loss": 3.3747, + "step": 94335 + }, + { + "epoch": 6.409838293246365, + "grad_norm": 1.0359972715377808, + "learning_rate": 0.0001990589754042669, + "loss": 3.2857, + "step": 94340 + }, + { + "epoch": 6.410178013317027, + "grad_norm": 0.8623818159103394, + "learning_rate": 0.00019901651039543416, + "loss": 3.6227, + "step": 94345 + }, + { + "epoch": 6.410517733387689, + "grad_norm": 0.9224463701248169, + "learning_rate": 0.00019897404538660144, + "loss": 3.5634, + "step": 94350 + }, + { + "epoch": 6.41085745345835, + "grad_norm": 0.8919700384140015, + "learning_rate": 0.00019893158037776872, + "loss": 3.3613, + "step": 94355 + }, + { + "epoch": 6.411197173529012, + "grad_norm": 0.8870021104812622, + "learning_rate": 0.000198889115368936, + "loss": 3.4866, + "step": 94360 + }, + { + "epoch": 6.411536893599674, + "grad_norm": 1.0078240633010864, + "learning_rate": 0.00019884665036010328, + "loss": 3.4207, + "step": 94365 + }, + { + "epoch": 6.411876613670335, + "grad_norm": 0.7913521528244019, + "learning_rate": 0.00019880418535127056, + "loss": 3.3045, + "step": 94370 + }, + { + "epoch": 6.412216333740997, + "grad_norm": 1.0346261262893677, + "learning_rate": 0.00019876172034243784, + "loss": 3.4587, + "step": 94375 + }, + { + "epoch": 6.4125560538116595, + "grad_norm": 0.7425000667572021, + "learning_rate": 0.0001987192553336051, + "loss": 3.4621, + "step": 94380 + }, + { + "epoch": 6.412895773882321, + "grad_norm": 0.7761990427970886, + "learning_rate": 0.0001986767903247724, + "loss": 3.3482, + "step": 94385 + }, + { + "epoch": 6.413235493952983, + "grad_norm": 0.7466638684272766, + "learning_rate": 0.00019863432531593966, + "loss": 3.4305, + "step": 94390 + }, + { + "epoch": 6.413575214023645, + "grad_norm": 1.2272168397903442, + "learning_rate": 0.00019859186030710694, + "loss": 3.3305, + "step": 94395 + }, + { + "epoch": 6.413914934094306, + "grad_norm": 0.7048075795173645, + "learning_rate": 0.00019854939529827424, + "loss": 3.2739, + "step": 94400 + }, + { + "epoch": 6.414254654164968, + "grad_norm": 0.9455487132072449, + "learning_rate": 0.0001985069302894415, + "loss": 3.4341, + "step": 94405 + }, + { + "epoch": 6.41459437423563, + "grad_norm": 0.8157252073287964, + "learning_rate": 0.00019846446528060878, + "loss": 3.4725, + "step": 94410 + }, + { + "epoch": 6.414934094306291, + "grad_norm": 1.045296549797058, + "learning_rate": 0.00019842200027177606, + "loss": 3.5112, + "step": 94415 + }, + { + "epoch": 6.415273814376953, + "grad_norm": 1.2207250595092773, + "learning_rate": 0.00019837953526294334, + "loss": 3.3534, + "step": 94420 + }, + { + "epoch": 6.4156135344476155, + "grad_norm": 0.8350090980529785, + "learning_rate": 0.00019833707025411064, + "loss": 3.4256, + "step": 94425 + }, + { + "epoch": 6.415953254518277, + "grad_norm": 0.8450993895530701, + "learning_rate": 0.0001982946052452779, + "loss": 3.6429, + "step": 94430 + }, + { + "epoch": 6.416292974588939, + "grad_norm": 0.8426149487495422, + "learning_rate": 0.00019825214023644518, + "loss": 3.1216, + "step": 94435 + }, + { + "epoch": 6.416632694659601, + "grad_norm": 0.9330060482025146, + "learning_rate": 0.00019820967522761246, + "loss": 3.3005, + "step": 94440 + }, + { + "epoch": 6.416972414730262, + "grad_norm": 0.8725901246070862, + "learning_rate": 0.00019816721021877974, + "loss": 3.57, + "step": 94445 + }, + { + "epoch": 6.417312134800924, + "grad_norm": 0.874678909778595, + "learning_rate": 0.000198124745209947, + "loss": 3.3391, + "step": 94450 + }, + { + "epoch": 6.417651854871586, + "grad_norm": 1.0435091257095337, + "learning_rate": 0.0001980822802011143, + "loss": 3.4923, + "step": 94455 + }, + { + "epoch": 6.417991574942247, + "grad_norm": 1.1334142684936523, + "learning_rate": 0.00019803981519228158, + "loss": 3.4892, + "step": 94460 + }, + { + "epoch": 6.418331295012909, + "grad_norm": 1.1735962629318237, + "learning_rate": 0.00019799735018344883, + "loss": 3.48, + "step": 94465 + }, + { + "epoch": 6.4186710150835715, + "grad_norm": 0.8648676872253418, + "learning_rate": 0.00019795488517461614, + "loss": 3.3853, + "step": 94470 + }, + { + "epoch": 6.419010735154233, + "grad_norm": 0.8716633319854736, + "learning_rate": 0.0001979124201657834, + "loss": 3.2979, + "step": 94475 + }, + { + "epoch": 6.419350455224895, + "grad_norm": 0.7664101719856262, + "learning_rate": 0.00019786995515695067, + "loss": 3.65, + "step": 94480 + }, + { + "epoch": 6.419690175295557, + "grad_norm": 0.9068096876144409, + "learning_rate": 0.00019782749014811795, + "loss": 3.3993, + "step": 94485 + }, + { + "epoch": 6.420029895366218, + "grad_norm": 1.0152082443237305, + "learning_rate": 0.00019778502513928523, + "loss": 3.3501, + "step": 94490 + }, + { + "epoch": 6.42036961543688, + "grad_norm": 1.041167140007019, + "learning_rate": 0.0001977425601304525, + "loss": 3.2831, + "step": 94495 + }, + { + "epoch": 6.420709335507542, + "grad_norm": 0.8602945804595947, + "learning_rate": 0.0001977000951216198, + "loss": 3.5582, + "step": 94500 + }, + { + "epoch": 6.421049055578203, + "grad_norm": 1.1913834810256958, + "learning_rate": 0.00019765763011278707, + "loss": 3.2457, + "step": 94505 + }, + { + "epoch": 6.4213887756488655, + "grad_norm": 0.9753863215446472, + "learning_rate": 0.00019761516510395435, + "loss": 3.1149, + "step": 94510 + }, + { + "epoch": 6.4217284957195275, + "grad_norm": 0.9330111742019653, + "learning_rate": 0.00019757270009512163, + "loss": 3.5943, + "step": 94515 + }, + { + "epoch": 6.422068215790189, + "grad_norm": 0.8313294053077698, + "learning_rate": 0.00019753023508628888, + "loss": 3.2806, + "step": 94520 + }, + { + "epoch": 6.422407935860851, + "grad_norm": 0.8280326724052429, + "learning_rate": 0.0001974877700774562, + "loss": 3.0886, + "step": 94525 + }, + { + "epoch": 6.422747655931513, + "grad_norm": 0.950943112373352, + "learning_rate": 0.00019744530506862347, + "loss": 3.4884, + "step": 94530 + }, + { + "epoch": 6.423087376002174, + "grad_norm": 0.8252728581428528, + "learning_rate": 0.00019740284005979072, + "loss": 3.4664, + "step": 94535 + }, + { + "epoch": 6.423427096072836, + "grad_norm": 0.8062745928764343, + "learning_rate": 0.00019736037505095803, + "loss": 3.6135, + "step": 94540 + }, + { + "epoch": 6.423766816143498, + "grad_norm": 0.8277910947799683, + "learning_rate": 0.00019731791004212528, + "loss": 3.3689, + "step": 94545 + }, + { + "epoch": 6.424106536214159, + "grad_norm": 0.9258710741996765, + "learning_rate": 0.00019727544503329256, + "loss": 3.2405, + "step": 94550 + }, + { + "epoch": 6.4244462562848215, + "grad_norm": 0.975869357585907, + "learning_rate": 0.00019723298002445984, + "loss": 3.5102, + "step": 94555 + }, + { + "epoch": 6.4247859763554835, + "grad_norm": 0.8562180399894714, + "learning_rate": 0.00019719051501562712, + "loss": 3.6875, + "step": 94560 + }, + { + "epoch": 6.425125696426145, + "grad_norm": 1.1240854263305664, + "learning_rate": 0.0001971480500067944, + "loss": 3.4024, + "step": 94565 + }, + { + "epoch": 6.425465416496807, + "grad_norm": 1.01423180103302, + "learning_rate": 0.00019710558499796168, + "loss": 3.4278, + "step": 94570 + }, + { + "epoch": 6.425805136567468, + "grad_norm": 1.382393717765808, + "learning_rate": 0.00019706311998912896, + "loss": 3.042, + "step": 94575 + }, + { + "epoch": 6.42614485663813, + "grad_norm": 1.0421712398529053, + "learning_rate": 0.00019702065498029622, + "loss": 3.3173, + "step": 94580 + }, + { + "epoch": 6.426484576708792, + "grad_norm": 0.971376895904541, + "learning_rate": 0.00019697818997146352, + "loss": 3.567, + "step": 94585 + }, + { + "epoch": 6.426824296779453, + "grad_norm": 1.0744518041610718, + "learning_rate": 0.0001969357249626308, + "loss": 3.4329, + "step": 94590 + }, + { + "epoch": 6.427164016850115, + "grad_norm": 0.9336216449737549, + "learning_rate": 0.00019689325995379808, + "loss": 3.6383, + "step": 94595 + }, + { + "epoch": 6.4275037369207775, + "grad_norm": 1.1287577152252197, + "learning_rate": 0.00019685079494496536, + "loss": 3.5574, + "step": 94600 + }, + { + "epoch": 6.427843456991439, + "grad_norm": 0.8879938125610352, + "learning_rate": 0.00019680832993613262, + "loss": 3.3999, + "step": 94605 + }, + { + "epoch": 6.428183177062101, + "grad_norm": 0.7783915996551514, + "learning_rate": 0.00019676586492729992, + "loss": 3.3925, + "step": 94610 + }, + { + "epoch": 6.428522897132763, + "grad_norm": 0.9805938601493835, + "learning_rate": 0.00019672339991846718, + "loss": 3.2917, + "step": 94615 + }, + { + "epoch": 6.428862617203424, + "grad_norm": 1.012231469154358, + "learning_rate": 0.00019668093490963446, + "loss": 3.6166, + "step": 94620 + }, + { + "epoch": 6.429202337274086, + "grad_norm": 0.9937028288841248, + "learning_rate": 0.00019663846990080176, + "loss": 3.4022, + "step": 94625 + }, + { + "epoch": 6.429542057344748, + "grad_norm": 2.1070988178253174, + "learning_rate": 0.00019659600489196902, + "loss": 3.7426, + "step": 94630 + }, + { + "epoch": 6.429881777415409, + "grad_norm": 1.268039345741272, + "learning_rate": 0.0001965535398831363, + "loss": 3.4526, + "step": 94635 + }, + { + "epoch": 6.430221497486071, + "grad_norm": 1.0635013580322266, + "learning_rate": 0.00019651107487430358, + "loss": 3.5305, + "step": 94640 + }, + { + "epoch": 6.4305612175567335, + "grad_norm": 1.0786659717559814, + "learning_rate": 0.00019646860986547086, + "loss": 3.4249, + "step": 94645 + }, + { + "epoch": 6.430900937627395, + "grad_norm": 0.8952003121376038, + "learning_rate": 0.0001964261448566381, + "loss": 3.5908, + "step": 94650 + }, + { + "epoch": 6.431240657698057, + "grad_norm": 1.0519901514053345, + "learning_rate": 0.00019638367984780542, + "loss": 3.506, + "step": 94655 + }, + { + "epoch": 6.431580377768719, + "grad_norm": 0.8088323473930359, + "learning_rate": 0.0001963412148389727, + "loss": 3.4206, + "step": 94660 + }, + { + "epoch": 6.43192009783938, + "grad_norm": 0.8199892640113831, + "learning_rate": 0.00019629874983013995, + "loss": 3.5615, + "step": 94665 + }, + { + "epoch": 6.432259817910042, + "grad_norm": 0.9024670720100403, + "learning_rate": 0.00019625628482130726, + "loss": 3.3123, + "step": 94670 + }, + { + "epoch": 6.432599537980704, + "grad_norm": 0.7761546969413757, + "learning_rate": 0.0001962138198124745, + "loss": 3.6568, + "step": 94675 + }, + { + "epoch": 6.432939258051365, + "grad_norm": 0.8709329962730408, + "learning_rate": 0.00019617135480364182, + "loss": 3.3171, + "step": 94680 + }, + { + "epoch": 6.433278978122027, + "grad_norm": 1.0211219787597656, + "learning_rate": 0.00019612888979480907, + "loss": 3.417, + "step": 94685 + }, + { + "epoch": 6.4336186981926895, + "grad_norm": 0.9394358396530151, + "learning_rate": 0.00019608642478597635, + "loss": 3.4286, + "step": 94690 + }, + { + "epoch": 6.433958418263351, + "grad_norm": 1.0012856721878052, + "learning_rate": 0.00019604395977714366, + "loss": 3.383, + "step": 94695 + }, + { + "epoch": 6.434298138334013, + "grad_norm": 0.8491132855415344, + "learning_rate": 0.0001960014947683109, + "loss": 3.2411, + "step": 94700 + }, + { + "epoch": 6.434637858404675, + "grad_norm": 1.01179838180542, + "learning_rate": 0.0001959590297594782, + "loss": 3.5188, + "step": 94705 + }, + { + "epoch": 6.434977578475336, + "grad_norm": 1.2951503992080688, + "learning_rate": 0.00019591656475064547, + "loss": 3.4554, + "step": 94710 + }, + { + "epoch": 6.435317298545998, + "grad_norm": 1.1845979690551758, + "learning_rate": 0.00019587409974181275, + "loss": 3.2418, + "step": 94715 + }, + { + "epoch": 6.43565701861666, + "grad_norm": 1.0468412637710571, + "learning_rate": 0.00019583163473298, + "loss": 3.5119, + "step": 94720 + }, + { + "epoch": 6.435996738687321, + "grad_norm": 0.8095134496688843, + "learning_rate": 0.0001957891697241473, + "loss": 3.3808, + "step": 94725 + }, + { + "epoch": 6.436336458757983, + "grad_norm": 1.1069422960281372, + "learning_rate": 0.0001957467047153146, + "loss": 3.3242, + "step": 94730 + }, + { + "epoch": 6.4366761788286455, + "grad_norm": 0.9704550504684448, + "learning_rate": 0.00019570423970648185, + "loss": 3.3381, + "step": 94735 + }, + { + "epoch": 6.437015898899307, + "grad_norm": 1.030583143234253, + "learning_rate": 0.00019566177469764915, + "loss": 3.4499, + "step": 94740 + }, + { + "epoch": 6.437355618969969, + "grad_norm": 0.9812760353088379, + "learning_rate": 0.0001956193096888164, + "loss": 3.3542, + "step": 94745 + }, + { + "epoch": 6.437695339040631, + "grad_norm": 2.1758439540863037, + "learning_rate": 0.00019557684467998369, + "loss": 3.4768, + "step": 94750 + }, + { + "epoch": 6.438035059111292, + "grad_norm": 1.013737678527832, + "learning_rate": 0.000195534379671151, + "loss": 3.434, + "step": 94755 + }, + { + "epoch": 6.438374779181954, + "grad_norm": 0.8206879496574402, + "learning_rate": 0.00019549191466231825, + "loss": 3.6361, + "step": 94760 + }, + { + "epoch": 6.438714499252616, + "grad_norm": 1.1182254552841187, + "learning_rate": 0.00019544944965348555, + "loss": 3.5007, + "step": 94765 + }, + { + "epoch": 6.439054219323277, + "grad_norm": 1.1457555294036865, + "learning_rate": 0.0001954069846446528, + "loss": 3.3641, + "step": 94770 + }, + { + "epoch": 6.4393939393939394, + "grad_norm": 1.7558225393295288, + "learning_rate": 0.00019536451963582009, + "loss": 3.4268, + "step": 94775 + }, + { + "epoch": 6.4397336594646015, + "grad_norm": 1.0449498891830444, + "learning_rate": 0.00019532205462698737, + "loss": 3.4796, + "step": 94780 + }, + { + "epoch": 6.440073379535263, + "grad_norm": 1.0826019048690796, + "learning_rate": 0.00019527958961815465, + "loss": 3.4871, + "step": 94785 + }, + { + "epoch": 6.440413099605925, + "grad_norm": 1.0423697233200073, + "learning_rate": 0.00019523712460932193, + "loss": 3.2044, + "step": 94790 + }, + { + "epoch": 6.440752819676587, + "grad_norm": 1.0904805660247803, + "learning_rate": 0.0001951946596004892, + "loss": 3.2461, + "step": 94795 + }, + { + "epoch": 6.441092539747248, + "grad_norm": 1.004456639289856, + "learning_rate": 0.00019515219459165649, + "loss": 3.4056, + "step": 94800 + }, + { + "epoch": 6.44143225981791, + "grad_norm": 1.1042959690093994, + "learning_rate": 0.00019510972958282374, + "loss": 3.3614, + "step": 94805 + }, + { + "epoch": 6.441771979888572, + "grad_norm": 0.7255584001541138, + "learning_rate": 0.00019506726457399105, + "loss": 3.3444, + "step": 94810 + }, + { + "epoch": 6.442111699959233, + "grad_norm": 0.9718999266624451, + "learning_rate": 0.0001950247995651583, + "loss": 3.4669, + "step": 94815 + }, + { + "epoch": 6.4424514200298955, + "grad_norm": 1.0626156330108643, + "learning_rate": 0.00019498233455632558, + "loss": 3.3923, + "step": 94820 + }, + { + "epoch": 6.4427911401005575, + "grad_norm": 1.083943486213684, + "learning_rate": 0.0001949398695474929, + "loss": 3.3814, + "step": 94825 + }, + { + "epoch": 6.443130860171219, + "grad_norm": 0.8836004734039307, + "learning_rate": 0.00019489740453866014, + "loss": 3.5408, + "step": 94830 + }, + { + "epoch": 6.443470580241881, + "grad_norm": 1.1654422283172607, + "learning_rate": 0.00019485493952982742, + "loss": 3.2213, + "step": 94835 + }, + { + "epoch": 6.443810300312543, + "grad_norm": 1.0636954307556152, + "learning_rate": 0.0001948124745209947, + "loss": 3.5579, + "step": 94840 + }, + { + "epoch": 6.444150020383204, + "grad_norm": 0.9995970129966736, + "learning_rate": 0.00019477000951216198, + "loss": 3.4296, + "step": 94845 + }, + { + "epoch": 6.444489740453866, + "grad_norm": 0.8815613985061646, + "learning_rate": 0.0001947275445033293, + "loss": 3.3576, + "step": 94850 + }, + { + "epoch": 6.444829460524527, + "grad_norm": 1.0060689449310303, + "learning_rate": 0.00019468507949449654, + "loss": 3.1379, + "step": 94855 + }, + { + "epoch": 6.445169180595189, + "grad_norm": 1.2370949983596802, + "learning_rate": 0.00019464261448566382, + "loss": 3.3748, + "step": 94860 + }, + { + "epoch": 6.4455089006658515, + "grad_norm": 0.8567712903022766, + "learning_rate": 0.0001946001494768311, + "loss": 3.5806, + "step": 94865 + }, + { + "epoch": 6.445848620736513, + "grad_norm": 0.8181199431419373, + "learning_rate": 0.00019455768446799838, + "loss": 3.4808, + "step": 94870 + }, + { + "epoch": 6.446188340807175, + "grad_norm": 0.9634509682655334, + "learning_rate": 0.00019451521945916563, + "loss": 3.407, + "step": 94875 + }, + { + "epoch": 6.446528060877837, + "grad_norm": 0.809435248374939, + "learning_rate": 0.00019447275445033294, + "loss": 3.4462, + "step": 94880 + }, + { + "epoch": 6.446867780948498, + "grad_norm": 0.9818933010101318, + "learning_rate": 0.00019443028944150022, + "loss": 3.5544, + "step": 94885 + }, + { + "epoch": 6.44720750101916, + "grad_norm": 1.1728428602218628, + "learning_rate": 0.00019438782443266747, + "loss": 3.2298, + "step": 94890 + }, + { + "epoch": 6.447547221089822, + "grad_norm": 0.7968688011169434, + "learning_rate": 0.00019434535942383478, + "loss": 3.3267, + "step": 94895 + }, + { + "epoch": 6.447886941160483, + "grad_norm": 1.1043007373809814, + "learning_rate": 0.00019430289441500203, + "loss": 3.3031, + "step": 94900 + }, + { + "epoch": 6.448226661231145, + "grad_norm": 0.95989990234375, + "learning_rate": 0.00019426042940616931, + "loss": 3.2197, + "step": 94905 + }, + { + "epoch": 6.4485663813018075, + "grad_norm": 1.0222551822662354, + "learning_rate": 0.0001942179643973366, + "loss": 3.4456, + "step": 94910 + }, + { + "epoch": 6.448906101372469, + "grad_norm": 0.9901083707809448, + "learning_rate": 0.00019417549938850387, + "loss": 3.3382, + "step": 94915 + }, + { + "epoch": 6.449245821443131, + "grad_norm": 0.8229459524154663, + "learning_rate": 0.00019413303437967115, + "loss": 3.626, + "step": 94920 + }, + { + "epoch": 6.449585541513793, + "grad_norm": 0.8338135480880737, + "learning_rate": 0.00019409056937083843, + "loss": 3.1762, + "step": 94925 + }, + { + "epoch": 6.449925261584454, + "grad_norm": 1.0015732049942017, + "learning_rate": 0.00019404810436200571, + "loss": 3.242, + "step": 94930 + }, + { + "epoch": 6.450264981655116, + "grad_norm": 0.9174976348876953, + "learning_rate": 0.000194005639353173, + "loss": 3.4382, + "step": 94935 + }, + { + "epoch": 6.450604701725778, + "grad_norm": 1.2017250061035156, + "learning_rate": 0.00019396317434434027, + "loss": 3.5696, + "step": 94940 + }, + { + "epoch": 6.450944421796439, + "grad_norm": 0.7660757899284363, + "learning_rate": 0.00019392070933550753, + "loss": 3.4599, + "step": 94945 + }, + { + "epoch": 6.451284141867101, + "grad_norm": 0.8123950362205505, + "learning_rate": 0.00019387824432667483, + "loss": 3.4121, + "step": 94950 + }, + { + "epoch": 6.4516238619377635, + "grad_norm": 0.9965104460716248, + "learning_rate": 0.00019383577931784211, + "loss": 3.473, + "step": 94955 + }, + { + "epoch": 6.451963582008425, + "grad_norm": 1.0425920486450195, + "learning_rate": 0.00019379331430900937, + "loss": 3.4634, + "step": 94960 + }, + { + "epoch": 6.452303302079087, + "grad_norm": 1.1207724809646606, + "learning_rate": 0.00019375084930017667, + "loss": 3.4401, + "step": 94965 + }, + { + "epoch": 6.452643022149749, + "grad_norm": 1.3108630180358887, + "learning_rate": 0.00019370838429134393, + "loss": 3.7231, + "step": 94970 + }, + { + "epoch": 6.45298274222041, + "grad_norm": 0.9221240282058716, + "learning_rate": 0.0001936659192825112, + "loss": 3.3276, + "step": 94975 + }, + { + "epoch": 6.453322462291072, + "grad_norm": 0.8348662853240967, + "learning_rate": 0.0001936234542736785, + "loss": 3.1578, + "step": 94980 + }, + { + "epoch": 6.453662182361734, + "grad_norm": 1.2945259809494019, + "learning_rate": 0.00019358098926484577, + "loss": 3.3924, + "step": 94985 + }, + { + "epoch": 6.454001902432395, + "grad_norm": 0.8067662715911865, + "learning_rate": 0.00019353852425601305, + "loss": 3.361, + "step": 94990 + }, + { + "epoch": 6.454341622503057, + "grad_norm": 0.9001979231834412, + "learning_rate": 0.00019349605924718033, + "loss": 3.4132, + "step": 94995 + }, + { + "epoch": 6.4546813425737195, + "grad_norm": 0.7703158855438232, + "learning_rate": 0.0001934535942383476, + "loss": 3.1781, + "step": 95000 + }, + { + "epoch": 6.455021062644381, + "grad_norm": 5.120008945465088, + "learning_rate": 0.00019341112922951486, + "loss": 3.5041, + "step": 95005 + }, + { + "epoch": 6.455360782715043, + "grad_norm": 0.9432382583618164, + "learning_rate": 0.00019336866422068217, + "loss": 3.4068, + "step": 95010 + }, + { + "epoch": 6.455700502785705, + "grad_norm": 0.8029004335403442, + "learning_rate": 0.00019332619921184945, + "loss": 3.333, + "step": 95015 + }, + { + "epoch": 6.456040222856366, + "grad_norm": 0.9195808172225952, + "learning_rate": 0.00019328373420301673, + "loss": 3.5602, + "step": 95020 + }, + { + "epoch": 6.456379942927028, + "grad_norm": 0.7447998523712158, + "learning_rate": 0.000193241269194184, + "loss": 3.6495, + "step": 95025 + }, + { + "epoch": 6.45671966299769, + "grad_norm": 0.8869537115097046, + "learning_rate": 0.00019319880418535126, + "loss": 3.3166, + "step": 95030 + }, + { + "epoch": 6.457059383068351, + "grad_norm": 0.9278919696807861, + "learning_rate": 0.00019315633917651857, + "loss": 3.2959, + "step": 95035 + }, + { + "epoch": 6.457399103139013, + "grad_norm": 1.0051767826080322, + "learning_rate": 0.00019311387416768582, + "loss": 3.3565, + "step": 95040 + }, + { + "epoch": 6.4577388232096755, + "grad_norm": 0.8326539397239685, + "learning_rate": 0.0001930714091588531, + "loss": 3.3084, + "step": 95045 + }, + { + "epoch": 6.458078543280337, + "grad_norm": 0.770872950553894, + "learning_rate": 0.0001930289441500204, + "loss": 3.4025, + "step": 95050 + }, + { + "epoch": 6.458418263350999, + "grad_norm": 1.1909724473953247, + "learning_rate": 0.00019298647914118766, + "loss": 3.0784, + "step": 95055 + }, + { + "epoch": 6.458757983421661, + "grad_norm": 1.1410443782806396, + "learning_rate": 0.00019294401413235494, + "loss": 3.6249, + "step": 95060 + }, + { + "epoch": 6.459097703492322, + "grad_norm": 0.9180636405944824, + "learning_rate": 0.00019290154912352222, + "loss": 3.3644, + "step": 95065 + }, + { + "epoch": 6.459437423562984, + "grad_norm": 0.9848012924194336, + "learning_rate": 0.0001928590841146895, + "loss": 3.2579, + "step": 95070 + }, + { + "epoch": 6.459777143633646, + "grad_norm": 0.9879682064056396, + "learning_rate": 0.00019281661910585676, + "loss": 3.6292, + "step": 95075 + }, + { + "epoch": 6.460116863704307, + "grad_norm": 1.012701392173767, + "learning_rate": 0.00019277415409702406, + "loss": 3.5855, + "step": 95080 + }, + { + "epoch": 6.4604565837749695, + "grad_norm": 1.0478826761245728, + "learning_rate": 0.00019273168908819134, + "loss": 3.3796, + "step": 95085 + }, + { + "epoch": 6.4607963038456315, + "grad_norm": 0.9429601430892944, + "learning_rate": 0.0001926892240793586, + "loss": 3.5861, + "step": 95090 + }, + { + "epoch": 6.461136023916293, + "grad_norm": 0.8791939616203308, + "learning_rate": 0.0001926467590705259, + "loss": 3.4997, + "step": 95095 + }, + { + "epoch": 6.461475743986955, + "grad_norm": 0.960331916809082, + "learning_rate": 0.00019260429406169316, + "loss": 3.267, + "step": 95100 + }, + { + "epoch": 6.461815464057617, + "grad_norm": 1.0078376531600952, + "learning_rate": 0.00019256182905286046, + "loss": 3.2967, + "step": 95105 + }, + { + "epoch": 6.462155184128278, + "grad_norm": 0.9064356088638306, + "learning_rate": 0.00019251936404402772, + "loss": 3.0844, + "step": 95110 + }, + { + "epoch": 6.46249490419894, + "grad_norm": 0.6725652813911438, + "learning_rate": 0.000192476899035195, + "loss": 3.5069, + "step": 95115 + }, + { + "epoch": 6.462834624269602, + "grad_norm": 0.7967776656150818, + "learning_rate": 0.0001924344340263623, + "loss": 3.3095, + "step": 95120 + }, + { + "epoch": 6.463174344340263, + "grad_norm": 0.7468894720077515, + "learning_rate": 0.00019239196901752956, + "loss": 3.5696, + "step": 95125 + }, + { + "epoch": 6.4635140644109255, + "grad_norm": 0.8900094628334045, + "learning_rate": 0.00019234950400869684, + "loss": 3.3097, + "step": 95130 + }, + { + "epoch": 6.4638537844815875, + "grad_norm": 0.8229110240936279, + "learning_rate": 0.00019230703899986412, + "loss": 3.1652, + "step": 95135 + }, + { + "epoch": 6.464193504552249, + "grad_norm": 0.9019662141799927, + "learning_rate": 0.0001922645739910314, + "loss": 3.5814, + "step": 95140 + }, + { + "epoch": 6.464533224622911, + "grad_norm": 0.7617384195327759, + "learning_rate": 0.00019222210898219865, + "loss": 3.5804, + "step": 95145 + }, + { + "epoch": 6.464872944693573, + "grad_norm": 0.8993804454803467, + "learning_rate": 0.00019217964397336596, + "loss": 3.518, + "step": 95150 + }, + { + "epoch": 6.465212664764234, + "grad_norm": 0.7766099572181702, + "learning_rate": 0.00019213717896453324, + "loss": 3.3929, + "step": 95155 + }, + { + "epoch": 6.465552384834896, + "grad_norm": 0.939617395401001, + "learning_rate": 0.0001920947139557005, + "loss": 3.558, + "step": 95160 + }, + { + "epoch": 6.465892104905558, + "grad_norm": 0.9207808375358582, + "learning_rate": 0.0001920522489468678, + "loss": 3.3484, + "step": 95165 + }, + { + "epoch": 6.466231824976219, + "grad_norm": 0.9287243485450745, + "learning_rate": 0.00019200978393803505, + "loss": 3.3401, + "step": 95170 + }, + { + "epoch": 6.4665715450468815, + "grad_norm": 0.8734697699546814, + "learning_rate": 0.00019196731892920233, + "loss": 3.5008, + "step": 95175 + }, + { + "epoch": 6.4669112651175436, + "grad_norm": 1.1486153602600098, + "learning_rate": 0.00019192485392036964, + "loss": 3.2369, + "step": 95180 + }, + { + "epoch": 6.467250985188205, + "grad_norm": 1.3601654767990112, + "learning_rate": 0.0001918823889115369, + "loss": 3.3051, + "step": 95185 + }, + { + "epoch": 6.467590705258867, + "grad_norm": 1.0693988800048828, + "learning_rate": 0.0001918399239027042, + "loss": 3.3464, + "step": 95190 + }, + { + "epoch": 6.467930425329529, + "grad_norm": 0.9903508424758911, + "learning_rate": 0.00019179745889387145, + "loss": 3.4653, + "step": 95195 + }, + { + "epoch": 6.46827014540019, + "grad_norm": 1.0776758193969727, + "learning_rate": 0.00019175499388503873, + "loss": 3.5931, + "step": 95200 + }, + { + "epoch": 6.468609865470852, + "grad_norm": 0.9282914996147156, + "learning_rate": 0.000191712528876206, + "loss": 3.4593, + "step": 95205 + }, + { + "epoch": 6.468949585541514, + "grad_norm": 1.1329785585403442, + "learning_rate": 0.0001916700638673733, + "loss": 3.3764, + "step": 95210 + }, + { + "epoch": 6.469289305612175, + "grad_norm": 0.9217056035995483, + "learning_rate": 0.00019162759885854057, + "loss": 3.4584, + "step": 95215 + }, + { + "epoch": 6.4696290256828375, + "grad_norm": 1.060081124305725, + "learning_rate": 0.00019158513384970785, + "loss": 3.3145, + "step": 95220 + }, + { + "epoch": 6.4699687457535, + "grad_norm": 1.1126649379730225, + "learning_rate": 0.00019154266884087513, + "loss": 3.217, + "step": 95225 + }, + { + "epoch": 6.470308465824161, + "grad_norm": 0.9214702248573303, + "learning_rate": 0.00019150020383204238, + "loss": 3.6282, + "step": 95230 + }, + { + "epoch": 6.470648185894823, + "grad_norm": 0.9422798156738281, + "learning_rate": 0.0001914577388232097, + "loss": 3.8048, + "step": 95235 + }, + { + "epoch": 6.470987905965485, + "grad_norm": 0.880190908908844, + "learning_rate": 0.00019141527381437694, + "loss": 3.6735, + "step": 95240 + }, + { + "epoch": 6.471327626036146, + "grad_norm": 1.0619105100631714, + "learning_rate": 0.00019137280880554422, + "loss": 3.6905, + "step": 95245 + }, + { + "epoch": 6.471667346106808, + "grad_norm": 0.806422233581543, + "learning_rate": 0.00019133034379671153, + "loss": 3.3441, + "step": 95250 + }, + { + "epoch": 6.47200706617747, + "grad_norm": 1.3992887735366821, + "learning_rate": 0.00019128787878787878, + "loss": 3.3616, + "step": 95255 + }, + { + "epoch": 6.472346786248131, + "grad_norm": 0.8253423571586609, + "learning_rate": 0.00019124541377904606, + "loss": 3.6129, + "step": 95260 + }, + { + "epoch": 6.4726865063187935, + "grad_norm": 0.7161591053009033, + "learning_rate": 0.00019120294877021334, + "loss": 3.2701, + "step": 95265 + }, + { + "epoch": 6.473026226389455, + "grad_norm": 0.6935116648674011, + "learning_rate": 0.00019116048376138062, + "loss": 3.3783, + "step": 95270 + }, + { + "epoch": 6.473365946460117, + "grad_norm": 0.8991944789886475, + "learning_rate": 0.0001911180187525479, + "loss": 3.237, + "step": 95275 + }, + { + "epoch": 6.473705666530779, + "grad_norm": 0.9060302376747131, + "learning_rate": 0.00019107555374371518, + "loss": 3.6069, + "step": 95280 + }, + { + "epoch": 6.47404538660144, + "grad_norm": 0.8818085789680481, + "learning_rate": 0.00019103308873488246, + "loss": 3.3516, + "step": 95285 + }, + { + "epoch": 6.474385106672102, + "grad_norm": 1.0138254165649414, + "learning_rate": 0.00019099062372604974, + "loss": 3.4711, + "step": 95290 + }, + { + "epoch": 6.474724826742764, + "grad_norm": 1.0988712310791016, + "learning_rate": 0.00019094815871721702, + "loss": 3.3096, + "step": 95295 + }, + { + "epoch": 6.475064546813425, + "grad_norm": 1.0335016250610352, + "learning_rate": 0.00019090569370838428, + "loss": 3.4352, + "step": 95300 + }, + { + "epoch": 6.475404266884087, + "grad_norm": 1.4598848819732666, + "learning_rate": 0.00019086322869955158, + "loss": 3.5082, + "step": 95305 + }, + { + "epoch": 6.4757439869547495, + "grad_norm": 0.7935274243354797, + "learning_rate": 0.00019082076369071886, + "loss": 3.6391, + "step": 95310 + }, + { + "epoch": 6.476083707025411, + "grad_norm": 0.8398244380950928, + "learning_rate": 0.00019077829868188612, + "loss": 3.3633, + "step": 95315 + }, + { + "epoch": 6.476423427096073, + "grad_norm": 0.9282000064849854, + "learning_rate": 0.00019073583367305342, + "loss": 3.3055, + "step": 95320 + }, + { + "epoch": 6.476763147166735, + "grad_norm": 0.9980384707450867, + "learning_rate": 0.00019069336866422068, + "loss": 3.2943, + "step": 95325 + }, + { + "epoch": 6.477102867237396, + "grad_norm": 0.8438000082969666, + "learning_rate": 0.00019065090365538796, + "loss": 3.4174, + "step": 95330 + }, + { + "epoch": 6.477442587308058, + "grad_norm": 0.711867094039917, + "learning_rate": 0.00019060843864655524, + "loss": 3.6082, + "step": 95335 + }, + { + "epoch": 6.47778230737872, + "grad_norm": 1.1828209161758423, + "learning_rate": 0.00019056597363772252, + "loss": 3.3596, + "step": 95340 + }, + { + "epoch": 6.478122027449381, + "grad_norm": 2.6631126403808594, + "learning_rate": 0.0001905235086288898, + "loss": 3.3155, + "step": 95345 + }, + { + "epoch": 6.4784617475200434, + "grad_norm": 0.7481948137283325, + "learning_rate": 0.00019048104362005708, + "loss": 3.4214, + "step": 95350 + }, + { + "epoch": 6.4788014675907055, + "grad_norm": 1.289524793624878, + "learning_rate": 0.00019043857861122436, + "loss": 3.2577, + "step": 95355 + }, + { + "epoch": 6.479141187661367, + "grad_norm": 1.1773314476013184, + "learning_rate": 0.00019039611360239164, + "loss": 3.2192, + "step": 95360 + }, + { + "epoch": 6.479480907732029, + "grad_norm": 0.8845437169075012, + "learning_rate": 0.00019035364859355892, + "loss": 3.283, + "step": 95365 + }, + { + "epoch": 6.479820627802691, + "grad_norm": 1.0854707956314087, + "learning_rate": 0.00019031118358472617, + "loss": 3.4431, + "step": 95370 + }, + { + "epoch": 6.480160347873352, + "grad_norm": 1.3702934980392456, + "learning_rate": 0.00019026871857589348, + "loss": 2.9813, + "step": 95375 + }, + { + "epoch": 6.480500067944014, + "grad_norm": 1.1520466804504395, + "learning_rate": 0.00019022625356706076, + "loss": 3.5564, + "step": 95380 + }, + { + "epoch": 6.480839788014676, + "grad_norm": 0.7996057868003845, + "learning_rate": 0.000190183788558228, + "loss": 3.4701, + "step": 95385 + }, + { + "epoch": 6.481179508085337, + "grad_norm": 0.9568181037902832, + "learning_rate": 0.00019014132354939532, + "loss": 3.3767, + "step": 95390 + }, + { + "epoch": 6.4815192281559995, + "grad_norm": 0.8376504778862, + "learning_rate": 0.00019009885854056257, + "loss": 3.3229, + "step": 95395 + }, + { + "epoch": 6.4818589482266615, + "grad_norm": 1.0341641902923584, + "learning_rate": 0.00019005639353172985, + "loss": 3.561, + "step": 95400 + }, + { + "epoch": 6.482198668297323, + "grad_norm": 0.9371166825294495, + "learning_rate": 0.00019001392852289713, + "loss": 3.4358, + "step": 95405 + }, + { + "epoch": 6.482538388367985, + "grad_norm": 0.8412254452705383, + "learning_rate": 0.0001899714635140644, + "loss": 3.4394, + "step": 95410 + }, + { + "epoch": 6.482878108438647, + "grad_norm": 0.8774293065071106, + "learning_rate": 0.0001899289985052317, + "loss": 3.558, + "step": 95415 + }, + { + "epoch": 6.483217828509308, + "grad_norm": 0.9511817693710327, + "learning_rate": 0.00018988653349639897, + "loss": 3.5748, + "step": 95420 + }, + { + "epoch": 6.48355754857997, + "grad_norm": 0.7412865161895752, + "learning_rate": 0.00018984406848756625, + "loss": 3.3265, + "step": 95425 + }, + { + "epoch": 6.483897268650632, + "grad_norm": 0.9831663966178894, + "learning_rate": 0.0001898016034787335, + "loss": 3.3883, + "step": 95430 + }, + { + "epoch": 6.484236988721293, + "grad_norm": 1.3002641201019287, + "learning_rate": 0.0001897591384699008, + "loss": 3.4653, + "step": 95435 + }, + { + "epoch": 6.4845767087919555, + "grad_norm": 1.1476348638534546, + "learning_rate": 0.0001897166734610681, + "loss": 3.3609, + "step": 95440 + }, + { + "epoch": 6.4849164288626175, + "grad_norm": 0.8590561151504517, + "learning_rate": 0.00018967420845223537, + "loss": 3.4435, + "step": 95445 + }, + { + "epoch": 6.485256148933279, + "grad_norm": 1.0579421520233154, + "learning_rate": 0.00018963174344340265, + "loss": 3.5271, + "step": 95450 + }, + { + "epoch": 6.485595869003941, + "grad_norm": 0.8247973322868347, + "learning_rate": 0.0001895892784345699, + "loss": 3.3423, + "step": 95455 + }, + { + "epoch": 6.485935589074603, + "grad_norm": 0.8785625100135803, + "learning_rate": 0.0001895468134257372, + "loss": 3.522, + "step": 95460 + }, + { + "epoch": 6.486275309145264, + "grad_norm": 1.026502013206482, + "learning_rate": 0.00018950434841690447, + "loss": 3.6657, + "step": 95465 + }, + { + "epoch": 6.486615029215926, + "grad_norm": 0.8423032164573669, + "learning_rate": 0.00018946188340807175, + "loss": 3.531, + "step": 95470 + }, + { + "epoch": 6.486954749286588, + "grad_norm": 1.080918788909912, + "learning_rate": 0.00018941941839923905, + "loss": 3.6238, + "step": 95475 + }, + { + "epoch": 6.487294469357249, + "grad_norm": 0.8240137696266174, + "learning_rate": 0.0001893769533904063, + "loss": 3.2168, + "step": 95480 + }, + { + "epoch": 6.4876341894279115, + "grad_norm": 0.9028398394584656, + "learning_rate": 0.00018933448838157359, + "loss": 3.5696, + "step": 95485 + }, + { + "epoch": 6.4879739094985736, + "grad_norm": 0.9979042410850525, + "learning_rate": 0.00018929202337274087, + "loss": 3.4687, + "step": 95490 + }, + { + "epoch": 6.488313629569235, + "grad_norm": 0.8346744775772095, + "learning_rate": 0.00018924955836390815, + "loss": 3.3747, + "step": 95495 + }, + { + "epoch": 6.488653349639897, + "grad_norm": 1.0547581911087036, + "learning_rate": 0.0001892070933550754, + "loss": 3.4293, + "step": 95500 + }, + { + "epoch": 6.488993069710559, + "grad_norm": 0.9370664954185486, + "learning_rate": 0.0001891646283462427, + "loss": 3.412, + "step": 95505 + }, + { + "epoch": 6.48933278978122, + "grad_norm": 0.7343831062316895, + "learning_rate": 0.00018912216333740999, + "loss": 3.3525, + "step": 95510 + }, + { + "epoch": 6.489672509851882, + "grad_norm": 0.9967712163925171, + "learning_rate": 0.00018907969832857724, + "loss": 3.5946, + "step": 95515 + }, + { + "epoch": 6.490012229922544, + "grad_norm": 0.9314888715744019, + "learning_rate": 0.00018903723331974455, + "loss": 3.6919, + "step": 95520 + }, + { + "epoch": 6.490351949993205, + "grad_norm": 1.116249442100525, + "learning_rate": 0.0001889947683109118, + "loss": 3.1923, + "step": 95525 + }, + { + "epoch": 6.4906916700638675, + "grad_norm": 0.8066285252571106, + "learning_rate": 0.0001889523033020791, + "loss": 3.2435, + "step": 95530 + }, + { + "epoch": 6.491031390134529, + "grad_norm": 0.9123119711875916, + "learning_rate": 0.00018890983829324636, + "loss": 3.171, + "step": 95535 + }, + { + "epoch": 6.491371110205191, + "grad_norm": 0.9060139060020447, + "learning_rate": 0.00018886737328441364, + "loss": 3.6088, + "step": 95540 + }, + { + "epoch": 6.491710830275853, + "grad_norm": 1.0434660911560059, + "learning_rate": 0.00018882490827558095, + "loss": 3.3557, + "step": 95545 + }, + { + "epoch": 6.492050550346514, + "grad_norm": 0.9602596759796143, + "learning_rate": 0.0001887824432667482, + "loss": 3.2124, + "step": 95550 + }, + { + "epoch": 6.492390270417176, + "grad_norm": 1.130320429801941, + "learning_rate": 0.00018873997825791548, + "loss": 3.4236, + "step": 95555 + }, + { + "epoch": 6.492729990487838, + "grad_norm": 1.0064582824707031, + "learning_rate": 0.00018869751324908276, + "loss": 3.4967, + "step": 95560 + }, + { + "epoch": 6.493069710558499, + "grad_norm": 0.8523055911064148, + "learning_rate": 0.00018865504824025004, + "loss": 3.5577, + "step": 95565 + }, + { + "epoch": 6.493409430629161, + "grad_norm": 1.1265896558761597, + "learning_rate": 0.0001886125832314173, + "loss": 3.2853, + "step": 95570 + }, + { + "epoch": 6.4937491506998235, + "grad_norm": 0.797695517539978, + "learning_rate": 0.0001885701182225846, + "loss": 3.5637, + "step": 95575 + }, + { + "epoch": 6.494088870770485, + "grad_norm": 1.1196115016937256, + "learning_rate": 0.00018852765321375188, + "loss": 3.3375, + "step": 95580 + }, + { + "epoch": 6.494428590841147, + "grad_norm": 0.8812901377677917, + "learning_rate": 0.00018848518820491913, + "loss": 3.4296, + "step": 95585 + }, + { + "epoch": 6.494768310911809, + "grad_norm": 0.952398419380188, + "learning_rate": 0.00018844272319608644, + "loss": 3.2116, + "step": 95590 + }, + { + "epoch": 6.49510803098247, + "grad_norm": 0.8549224138259888, + "learning_rate": 0.0001884002581872537, + "loss": 3.2713, + "step": 95595 + }, + { + "epoch": 6.495447751053132, + "grad_norm": 0.7400305271148682, + "learning_rate": 0.00018835779317842097, + "loss": 3.3102, + "step": 95600 + }, + { + "epoch": 6.495787471123794, + "grad_norm": 0.7714257836341858, + "learning_rate": 0.00018831532816958828, + "loss": 3.3375, + "step": 95605 + }, + { + "epoch": 6.496127191194455, + "grad_norm": 1.04389488697052, + "learning_rate": 0.00018827286316075553, + "loss": 3.3298, + "step": 95610 + }, + { + "epoch": 6.496466911265117, + "grad_norm": 0.6871538758277893, + "learning_rate": 0.00018823039815192284, + "loss": 3.5538, + "step": 95615 + }, + { + "epoch": 6.4968066313357795, + "grad_norm": 0.9808608889579773, + "learning_rate": 0.0001881879331430901, + "loss": 3.5144, + "step": 95620 + }, + { + "epoch": 6.497146351406441, + "grad_norm": 1.3798158168792725, + "learning_rate": 0.00018814546813425737, + "loss": 3.4458, + "step": 95625 + }, + { + "epoch": 6.497486071477103, + "grad_norm": 0.9423761367797852, + "learning_rate": 0.00018810300312542465, + "loss": 3.5182, + "step": 95630 + }, + { + "epoch": 6.497825791547765, + "grad_norm": 0.8347110748291016, + "learning_rate": 0.00018806053811659193, + "loss": 3.6654, + "step": 95635 + }, + { + "epoch": 6.498165511618426, + "grad_norm": 0.9606052041053772, + "learning_rate": 0.00018801807310775921, + "loss": 3.3998, + "step": 95640 + }, + { + "epoch": 6.498505231689088, + "grad_norm": 0.9659754633903503, + "learning_rate": 0.0001879756080989265, + "loss": 3.463, + "step": 95645 + }, + { + "epoch": 6.49884495175975, + "grad_norm": 1.514802098274231, + "learning_rate": 0.00018793314309009377, + "loss": 3.7092, + "step": 95650 + }, + { + "epoch": 6.499184671830411, + "grad_norm": 1.055043339729309, + "learning_rate": 0.00018789067808126103, + "loss": 3.5272, + "step": 95655 + }, + { + "epoch": 6.4995243919010735, + "grad_norm": 1.127803087234497, + "learning_rate": 0.00018784821307242833, + "loss": 3.3229, + "step": 95660 + }, + { + "epoch": 6.4998641119717355, + "grad_norm": 0.9043490886688232, + "learning_rate": 0.0001878057480635956, + "loss": 3.6339, + "step": 95665 + }, + { + "epoch": 6.500203832042397, + "grad_norm": 1.5116510391235352, + "learning_rate": 0.00018776328305476287, + "loss": 3.4085, + "step": 95670 + }, + { + "epoch": 6.500543552113059, + "grad_norm": 1.4145241975784302, + "learning_rate": 0.00018772081804593017, + "loss": 3.5351, + "step": 95675 + }, + { + "epoch": 6.500883272183721, + "grad_norm": 0.9642938375473022, + "learning_rate": 0.00018767835303709743, + "loss": 3.0051, + "step": 95680 + }, + { + "epoch": 6.501222992254382, + "grad_norm": 0.7900421619415283, + "learning_rate": 0.0001876358880282647, + "loss": 3.2956, + "step": 95685 + }, + { + "epoch": 6.501562712325044, + "grad_norm": 0.8734384179115295, + "learning_rate": 0.000187593423019432, + "loss": 3.523, + "step": 95690 + }, + { + "epoch": 6.501902432395706, + "grad_norm": 0.9130914807319641, + "learning_rate": 0.00018755095801059927, + "loss": 3.5209, + "step": 95695 + }, + { + "epoch": 6.502242152466367, + "grad_norm": 0.8703104257583618, + "learning_rate": 0.00018750849300176655, + "loss": 3.5576, + "step": 95700 + }, + { + "epoch": 6.5025818725370295, + "grad_norm": 1.5011425018310547, + "learning_rate": 0.00018746602799293383, + "loss": 3.5532, + "step": 95705 + }, + { + "epoch": 6.5029215926076915, + "grad_norm": 0.9496307969093323, + "learning_rate": 0.0001874235629841011, + "loss": 3.441, + "step": 95710 + }, + { + "epoch": 6.503261312678353, + "grad_norm": 0.9794162511825562, + "learning_rate": 0.0001873810979752684, + "loss": 3.3511, + "step": 95715 + }, + { + "epoch": 6.503601032749015, + "grad_norm": 1.0702646970748901, + "learning_rate": 0.00018733863296643567, + "loss": 3.4833, + "step": 95720 + }, + { + "epoch": 6.503940752819677, + "grad_norm": 0.7905886173248291, + "learning_rate": 0.00018729616795760292, + "loss": 3.5973, + "step": 95725 + }, + { + "epoch": 6.504280472890338, + "grad_norm": 1.3467390537261963, + "learning_rate": 0.00018725370294877023, + "loss": 3.4608, + "step": 95730 + }, + { + "epoch": 6.504620192961, + "grad_norm": 0.9962143301963806, + "learning_rate": 0.0001872112379399375, + "loss": 3.2786, + "step": 95735 + }, + { + "epoch": 6.504959913031662, + "grad_norm": 0.8511714339256287, + "learning_rate": 0.00018716877293110476, + "loss": 3.4186, + "step": 95740 + }, + { + "epoch": 6.505299633102323, + "grad_norm": 0.850841224193573, + "learning_rate": 0.00018712630792227207, + "loss": 3.4791, + "step": 95745 + }, + { + "epoch": 6.5056393531729855, + "grad_norm": 0.8430094718933105, + "learning_rate": 0.00018708384291343932, + "loss": 3.2618, + "step": 95750 + }, + { + "epoch": 6.5059790732436475, + "grad_norm": 1.056355595588684, + "learning_rate": 0.0001870413779046066, + "loss": 3.4905, + "step": 95755 + }, + { + "epoch": 6.506318793314309, + "grad_norm": 0.8001751899719238, + "learning_rate": 0.00018699891289577388, + "loss": 3.4393, + "step": 95760 + }, + { + "epoch": 6.506658513384971, + "grad_norm": 0.8658973574638367, + "learning_rate": 0.00018695644788694116, + "loss": 3.4117, + "step": 95765 + }, + { + "epoch": 6.506998233455633, + "grad_norm": 0.8401905298233032, + "learning_rate": 0.00018691398287810844, + "loss": 3.6479, + "step": 95770 + }, + { + "epoch": 6.507337953526294, + "grad_norm": 0.9163071513175964, + "learning_rate": 0.00018687151786927572, + "loss": 3.3371, + "step": 95775 + }, + { + "epoch": 6.507677673596956, + "grad_norm": 0.7827601432800293, + "learning_rate": 0.000186829052860443, + "loss": 3.6226, + "step": 95780 + }, + { + "epoch": 6.508017393667618, + "grad_norm": 0.7466697096824646, + "learning_rate": 0.00018678658785161028, + "loss": 3.6367, + "step": 95785 + }, + { + "epoch": 6.508357113738279, + "grad_norm": 0.8387410044670105, + "learning_rate": 0.00018674412284277756, + "loss": 3.4125, + "step": 95790 + }, + { + "epoch": 6.5086968338089415, + "grad_norm": 0.9494337439537048, + "learning_rate": 0.00018670165783394482, + "loss": 3.3766, + "step": 95795 + }, + { + "epoch": 6.509036553879604, + "grad_norm": 0.9123794436454773, + "learning_rate": 0.00018665919282511212, + "loss": 3.3212, + "step": 95800 + }, + { + "epoch": 6.509376273950265, + "grad_norm": 1.0091737508773804, + "learning_rate": 0.0001866167278162794, + "loss": 3.3707, + "step": 95805 + }, + { + "epoch": 6.509715994020927, + "grad_norm": 1.1203184127807617, + "learning_rate": 0.00018657426280744666, + "loss": 3.396, + "step": 95810 + }, + { + "epoch": 6.510055714091589, + "grad_norm": 0.8704003691673279, + "learning_rate": 0.00018653179779861396, + "loss": 3.3233, + "step": 95815 + }, + { + "epoch": 6.51039543416225, + "grad_norm": 0.8455916047096252, + "learning_rate": 0.00018648933278978122, + "loss": 3.6406, + "step": 95820 + }, + { + "epoch": 6.510735154232912, + "grad_norm": 0.9203803539276123, + "learning_rate": 0.0001864468677809485, + "loss": 3.266, + "step": 95825 + }, + { + "epoch": 6.511074874303574, + "grad_norm": 1.024174690246582, + "learning_rate": 0.00018640440277211578, + "loss": 3.3732, + "step": 95830 + }, + { + "epoch": 6.511414594374235, + "grad_norm": 0.8556417226791382, + "learning_rate": 0.00018636193776328306, + "loss": 3.2832, + "step": 95835 + }, + { + "epoch": 6.5117543144448975, + "grad_norm": 1.0097578763961792, + "learning_rate": 0.00018631947275445034, + "loss": 3.3253, + "step": 95840 + }, + { + "epoch": 6.51209403451556, + "grad_norm": 0.7711024880409241, + "learning_rate": 0.00018627700774561762, + "loss": 3.4607, + "step": 95845 + }, + { + "epoch": 6.512433754586221, + "grad_norm": 0.7733234167098999, + "learning_rate": 0.0001862345427367849, + "loss": 3.6474, + "step": 95850 + }, + { + "epoch": 6.512773474656883, + "grad_norm": 1.0084670782089233, + "learning_rate": 0.00018619207772795215, + "loss": 3.4477, + "step": 95855 + }, + { + "epoch": 6.513113194727545, + "grad_norm": 1.0807054042816162, + "learning_rate": 0.00018614961271911946, + "loss": 3.3897, + "step": 95860 + }, + { + "epoch": 6.513452914798206, + "grad_norm": 0.9128042459487915, + "learning_rate": 0.00018610714771028674, + "loss": 3.2625, + "step": 95865 + }, + { + "epoch": 6.513792634868868, + "grad_norm": 1.1220717430114746, + "learning_rate": 0.00018606468270145402, + "loss": 3.3262, + "step": 95870 + }, + { + "epoch": 6.51413235493953, + "grad_norm": 0.9851590991020203, + "learning_rate": 0.0001860222176926213, + "loss": 3.3197, + "step": 95875 + }, + { + "epoch": 6.514472075010191, + "grad_norm": 0.9820395708084106, + "learning_rate": 0.00018597975268378855, + "loss": 3.551, + "step": 95880 + }, + { + "epoch": 6.5148117950808535, + "grad_norm": 0.9387549161911011, + "learning_rate": 0.00018593728767495586, + "loss": 3.2445, + "step": 95885 + }, + { + "epoch": 6.515151515151516, + "grad_norm": 0.9645631909370422, + "learning_rate": 0.0001858948226661231, + "loss": 3.5508, + "step": 95890 + }, + { + "epoch": 6.515491235222177, + "grad_norm": 0.9772468209266663, + "learning_rate": 0.0001858523576572904, + "loss": 3.4193, + "step": 95895 + }, + { + "epoch": 6.515830955292839, + "grad_norm": 4.54783821105957, + "learning_rate": 0.0001858098926484577, + "loss": 3.6387, + "step": 95900 + }, + { + "epoch": 6.516170675363501, + "grad_norm": 0.9560899138450623, + "learning_rate": 0.00018576742763962495, + "loss": 3.3537, + "step": 95905 + }, + { + "epoch": 6.516510395434162, + "grad_norm": 1.238062858581543, + "learning_rate": 0.00018572496263079223, + "loss": 3.5162, + "step": 95910 + }, + { + "epoch": 6.516850115504824, + "grad_norm": 0.9157900214195251, + "learning_rate": 0.0001856824976219595, + "loss": 3.7314, + "step": 95915 + }, + { + "epoch": 6.517189835575486, + "grad_norm": 0.8137756586074829, + "learning_rate": 0.0001856400326131268, + "loss": 3.4383, + "step": 95920 + }, + { + "epoch": 6.517529555646147, + "grad_norm": 1.0134735107421875, + "learning_rate": 0.00018559756760429404, + "loss": 3.3311, + "step": 95925 + }, + { + "epoch": 6.5178692757168095, + "grad_norm": 0.8799773454666138, + "learning_rate": 0.00018555510259546135, + "loss": 3.2075, + "step": 95930 + }, + { + "epoch": 6.518208995787472, + "grad_norm": 0.9057723879814148, + "learning_rate": 0.00018551263758662863, + "loss": 3.3728, + "step": 95935 + }, + { + "epoch": 6.518548715858133, + "grad_norm": 1.1941752433776855, + "learning_rate": 0.00018547017257779588, + "loss": 3.4923, + "step": 95940 + }, + { + "epoch": 6.518888435928795, + "grad_norm": 0.88994300365448, + "learning_rate": 0.0001854277075689632, + "loss": 3.712, + "step": 95945 + }, + { + "epoch": 6.519228155999457, + "grad_norm": 0.8618754744529724, + "learning_rate": 0.00018538524256013044, + "loss": 3.5257, + "step": 95950 + }, + { + "epoch": 6.519567876070118, + "grad_norm": 0.9943764805793762, + "learning_rate": 0.00018534277755129775, + "loss": 3.4379, + "step": 95955 + }, + { + "epoch": 6.51990759614078, + "grad_norm": 0.924781084060669, + "learning_rate": 0.000185300312542465, + "loss": 3.3942, + "step": 95960 + }, + { + "epoch": 6.520247316211442, + "grad_norm": 0.9491227865219116, + "learning_rate": 0.00018525784753363228, + "loss": 3.1504, + "step": 95965 + }, + { + "epoch": 6.5205870362821035, + "grad_norm": 0.7962191104888916, + "learning_rate": 0.0001852153825247996, + "loss": 3.5032, + "step": 95970 + }, + { + "epoch": 6.5209267563527655, + "grad_norm": 0.8114104866981506, + "learning_rate": 0.00018517291751596684, + "loss": 3.2658, + "step": 95975 + }, + { + "epoch": 6.521266476423427, + "grad_norm": 1.0158394575119019, + "learning_rate": 0.00018513045250713412, + "loss": 3.1869, + "step": 95980 + }, + { + "epoch": 6.521606196494089, + "grad_norm": 0.7069998383522034, + "learning_rate": 0.0001850879874983014, + "loss": 3.384, + "step": 95985 + }, + { + "epoch": 6.521945916564751, + "grad_norm": 0.8713618516921997, + "learning_rate": 0.00018504552248946868, + "loss": 3.229, + "step": 95990 + }, + { + "epoch": 6.522285636635412, + "grad_norm": 0.9024876356124878, + "learning_rate": 0.00018500305748063594, + "loss": 3.4663, + "step": 95995 + }, + { + "epoch": 6.522625356706074, + "grad_norm": 0.827683687210083, + "learning_rate": 0.00018496059247180324, + "loss": 3.2828, + "step": 96000 + }, + { + "epoch": 6.522965076776736, + "grad_norm": 0.8496642112731934, + "learning_rate": 0.00018491812746297052, + "loss": 3.1836, + "step": 96005 + }, + { + "epoch": 6.523304796847397, + "grad_norm": 0.9325906038284302, + "learning_rate": 0.00018487566245413778, + "loss": 3.2727, + "step": 96010 + }, + { + "epoch": 6.5236445169180595, + "grad_norm": 1.1120365858078003, + "learning_rate": 0.00018483319744530508, + "loss": 3.4801, + "step": 96015 + }, + { + "epoch": 6.5239842369887215, + "grad_norm": 0.9433359503746033, + "learning_rate": 0.00018479073243647234, + "loss": 3.3838, + "step": 96020 + }, + { + "epoch": 6.524323957059383, + "grad_norm": 0.8275235891342163, + "learning_rate": 0.00018474826742763962, + "loss": 3.6085, + "step": 96025 + }, + { + "epoch": 6.524663677130045, + "grad_norm": 1.4381300210952759, + "learning_rate": 0.00018470580241880692, + "loss": 3.4209, + "step": 96030 + }, + { + "epoch": 6.525003397200707, + "grad_norm": 1.1553945541381836, + "learning_rate": 0.00018466333740997418, + "loss": 3.3604, + "step": 96035 + }, + { + "epoch": 6.525343117271368, + "grad_norm": 1.0173001289367676, + "learning_rate": 0.00018462087240114148, + "loss": 3.3346, + "step": 96040 + }, + { + "epoch": 6.52568283734203, + "grad_norm": 0.9590101838111877, + "learning_rate": 0.00018457840739230874, + "loss": 3.58, + "step": 96045 + }, + { + "epoch": 6.526022557412692, + "grad_norm": 1.1312143802642822, + "learning_rate": 0.00018453594238347602, + "loss": 3.429, + "step": 96050 + }, + { + "epoch": 6.526362277483353, + "grad_norm": 1.0752983093261719, + "learning_rate": 0.0001844934773746433, + "loss": 3.705, + "step": 96055 + }, + { + "epoch": 6.5267019975540155, + "grad_norm": 0.9084679484367371, + "learning_rate": 0.00018445101236581058, + "loss": 3.5946, + "step": 96060 + }, + { + "epoch": 6.5270417176246776, + "grad_norm": 0.8572028279304504, + "learning_rate": 0.00018440854735697786, + "loss": 3.3135, + "step": 96065 + }, + { + "epoch": 6.527381437695339, + "grad_norm": 0.8700775504112244, + "learning_rate": 0.00018436608234814514, + "loss": 3.4346, + "step": 96070 + }, + { + "epoch": 6.527721157766001, + "grad_norm": 0.7865277528762817, + "learning_rate": 0.00018432361733931242, + "loss": 3.4332, + "step": 96075 + }, + { + "epoch": 6.528060877836663, + "grad_norm": 1.3459291458129883, + "learning_rate": 0.00018428115233047967, + "loss": 3.6328, + "step": 96080 + }, + { + "epoch": 6.528400597907324, + "grad_norm": 1.0076532363891602, + "learning_rate": 0.00018423868732164698, + "loss": 3.646, + "step": 96085 + }, + { + "epoch": 6.528740317977986, + "grad_norm": 1.0843451023101807, + "learning_rate": 0.00018419622231281423, + "loss": 3.2381, + "step": 96090 + }, + { + "epoch": 6.529080038048648, + "grad_norm": 0.8586568236351013, + "learning_rate": 0.0001841537573039815, + "loss": 3.3545, + "step": 96095 + }, + { + "epoch": 6.529419758119309, + "grad_norm": 1.3085286617279053, + "learning_rate": 0.00018411129229514882, + "loss": 3.4132, + "step": 96100 + }, + { + "epoch": 6.5297594781899715, + "grad_norm": 0.8271479606628418, + "learning_rate": 0.00018406882728631607, + "loss": 3.3236, + "step": 96105 + }, + { + "epoch": 6.530099198260634, + "grad_norm": 1.203324317932129, + "learning_rate": 0.00018402636227748335, + "loss": 3.6716, + "step": 96110 + }, + { + "epoch": 6.530438918331295, + "grad_norm": 1.1159731149673462, + "learning_rate": 0.00018398389726865063, + "loss": 3.6416, + "step": 96115 + }, + { + "epoch": 6.530778638401957, + "grad_norm": 0.7409222722053528, + "learning_rate": 0.0001839414322598179, + "loss": 3.4597, + "step": 96120 + }, + { + "epoch": 6.531118358472619, + "grad_norm": 0.8900206685066223, + "learning_rate": 0.0001838989672509852, + "loss": 3.3953, + "step": 96125 + }, + { + "epoch": 6.53145807854328, + "grad_norm": 0.9867332577705383, + "learning_rate": 0.00018385650224215247, + "loss": 3.4964, + "step": 96130 + }, + { + "epoch": 6.531797798613942, + "grad_norm": 1.0780344009399414, + "learning_rate": 0.00018381403723331975, + "loss": 3.5695, + "step": 96135 + }, + { + "epoch": 6.532137518684604, + "grad_norm": 0.8417288661003113, + "learning_rate": 0.00018377157222448703, + "loss": 3.0318, + "step": 96140 + }, + { + "epoch": 6.532477238755265, + "grad_norm": 0.817300021648407, + "learning_rate": 0.0001837291072156543, + "loss": 3.5008, + "step": 96145 + }, + { + "epoch": 6.5328169588259275, + "grad_norm": 1.201766848564148, + "learning_rate": 0.00018368664220682157, + "loss": 3.4777, + "step": 96150 + }, + { + "epoch": 6.53315667889659, + "grad_norm": 0.8473197817802429, + "learning_rate": 0.00018364417719798887, + "loss": 3.3686, + "step": 96155 + }, + { + "epoch": 6.533496398967251, + "grad_norm": 1.0453999042510986, + "learning_rate": 0.00018360171218915615, + "loss": 3.4495, + "step": 96160 + }, + { + "epoch": 6.533836119037913, + "grad_norm": 0.7251853346824646, + "learning_rate": 0.0001835592471803234, + "loss": 3.4704, + "step": 96165 + }, + { + "epoch": 6.534175839108575, + "grad_norm": 1.1921378374099731, + "learning_rate": 0.0001835167821714907, + "loss": 3.2462, + "step": 96170 + }, + { + "epoch": 6.534515559179236, + "grad_norm": 0.7300398349761963, + "learning_rate": 0.00018347431716265797, + "loss": 3.6445, + "step": 96175 + }, + { + "epoch": 6.534855279249898, + "grad_norm": 0.8843969702720642, + "learning_rate": 0.00018343185215382525, + "loss": 3.2474, + "step": 96180 + }, + { + "epoch": 6.535194999320559, + "grad_norm": 0.8546652793884277, + "learning_rate": 0.00018338938714499253, + "loss": 3.5698, + "step": 96185 + }, + { + "epoch": 6.535534719391221, + "grad_norm": 0.9253738522529602, + "learning_rate": 0.0001833469221361598, + "loss": 3.468, + "step": 96190 + }, + { + "epoch": 6.5358744394618835, + "grad_norm": 1.0977026224136353, + "learning_rate": 0.00018330445712732709, + "loss": 3.3468, + "step": 96195 + }, + { + "epoch": 6.536214159532545, + "grad_norm": 1.3615666627883911, + "learning_rate": 0.00018326199211849437, + "loss": 3.2111, + "step": 96200 + }, + { + "epoch": 6.536553879603207, + "grad_norm": 0.8098323941230774, + "learning_rate": 0.00018321952710966165, + "loss": 3.5543, + "step": 96205 + }, + { + "epoch": 6.536893599673869, + "grad_norm": 0.9509373307228088, + "learning_rate": 0.00018317706210082893, + "loss": 3.4527, + "step": 96210 + }, + { + "epoch": 6.53723331974453, + "grad_norm": 0.9498730301856995, + "learning_rate": 0.0001831345970919962, + "loss": 3.5283, + "step": 96215 + }, + { + "epoch": 6.537573039815192, + "grad_norm": 1.3253573179244995, + "learning_rate": 0.00018309213208316346, + "loss": 3.2857, + "step": 96220 + }, + { + "epoch": 6.537912759885854, + "grad_norm": 1.1458903551101685, + "learning_rate": 0.00018304966707433077, + "loss": 3.243, + "step": 96225 + }, + { + "epoch": 6.538252479956515, + "grad_norm": 1.5632668733596802, + "learning_rate": 0.00018300720206549805, + "loss": 3.1482, + "step": 96230 + }, + { + "epoch": 6.5385922000271774, + "grad_norm": 0.8477184772491455, + "learning_rate": 0.0001829647370566653, + "loss": 3.3295, + "step": 96235 + }, + { + "epoch": 6.5389319200978395, + "grad_norm": 0.8998572826385498, + "learning_rate": 0.0001829222720478326, + "loss": 3.3327, + "step": 96240 + }, + { + "epoch": 6.539271640168501, + "grad_norm": 0.8991907238960266, + "learning_rate": 0.00018287980703899986, + "loss": 3.5971, + "step": 96245 + }, + { + "epoch": 6.539611360239163, + "grad_norm": 0.9429214596748352, + "learning_rate": 0.00018283734203016714, + "loss": 3.1701, + "step": 96250 + }, + { + "epoch": 6.539951080309825, + "grad_norm": 0.8215212225914001, + "learning_rate": 0.00018279487702133442, + "loss": 3.2887, + "step": 96255 + }, + { + "epoch": 6.540290800380486, + "grad_norm": 0.9331828951835632, + "learning_rate": 0.0001827524120125017, + "loss": 3.3743, + "step": 96260 + }, + { + "epoch": 6.540630520451148, + "grad_norm": 0.7267237901687622, + "learning_rate": 0.00018270994700366898, + "loss": 3.6185, + "step": 96265 + }, + { + "epoch": 6.54097024052181, + "grad_norm": 0.8114087581634521, + "learning_rate": 0.00018266748199483626, + "loss": 3.313, + "step": 96270 + }, + { + "epoch": 6.541309960592471, + "grad_norm": 0.9055354595184326, + "learning_rate": 0.00018262501698600354, + "loss": 3.5677, + "step": 96275 + }, + { + "epoch": 6.5416496806631335, + "grad_norm": 0.915196418762207, + "learning_rate": 0.0001825825519771708, + "loss": 3.0155, + "step": 96280 + }, + { + "epoch": 6.5419894007337955, + "grad_norm": 0.7566527128219604, + "learning_rate": 0.0001825400869683381, + "loss": 3.3382, + "step": 96285 + }, + { + "epoch": 6.542329120804457, + "grad_norm": 0.8054450750350952, + "learning_rate": 0.00018249762195950538, + "loss": 3.6478, + "step": 96290 + }, + { + "epoch": 6.542668840875119, + "grad_norm": 0.979840099811554, + "learning_rate": 0.00018245515695067266, + "loss": 3.5757, + "step": 96295 + }, + { + "epoch": 6.543008560945781, + "grad_norm": 0.8012291193008423, + "learning_rate": 0.00018241269194183994, + "loss": 3.4448, + "step": 96300 + }, + { + "epoch": 6.543348281016442, + "grad_norm": 0.8896718621253967, + "learning_rate": 0.0001823702269330072, + "loss": 3.3061, + "step": 96305 + }, + { + "epoch": 6.543688001087104, + "grad_norm": 0.8241873383522034, + "learning_rate": 0.0001823277619241745, + "loss": 3.2888, + "step": 96310 + }, + { + "epoch": 6.544027721157766, + "grad_norm": 0.9818737506866455, + "learning_rate": 0.00018228529691534175, + "loss": 3.7128, + "step": 96315 + }, + { + "epoch": 6.544367441228427, + "grad_norm": 0.9102694988250732, + "learning_rate": 0.00018224283190650903, + "loss": 3.2752, + "step": 96320 + }, + { + "epoch": 6.5447071612990895, + "grad_norm": 1.2717020511627197, + "learning_rate": 0.00018220036689767634, + "loss": 3.3806, + "step": 96325 + }, + { + "epoch": 6.5450468813697515, + "grad_norm": 1.1085553169250488, + "learning_rate": 0.0001821579018888436, + "loss": 3.1313, + "step": 96330 + }, + { + "epoch": 6.545386601440413, + "grad_norm": 0.9430322051048279, + "learning_rate": 0.00018211543688001087, + "loss": 3.4662, + "step": 96335 + }, + { + "epoch": 6.545726321511075, + "grad_norm": 0.7601369023323059, + "learning_rate": 0.00018207297187117815, + "loss": 3.6747, + "step": 96340 + }, + { + "epoch": 6.546066041581737, + "grad_norm": 0.9288267493247986, + "learning_rate": 0.00018203050686234543, + "loss": 3.3786, + "step": 96345 + }, + { + "epoch": 6.546405761652398, + "grad_norm": 0.9109987020492554, + "learning_rate": 0.0001819880418535127, + "loss": 3.3358, + "step": 96350 + }, + { + "epoch": 6.54674548172306, + "grad_norm": 0.8768603205680847, + "learning_rate": 0.00018194557684468, + "loss": 3.4238, + "step": 96355 + }, + { + "epoch": 6.547085201793722, + "grad_norm": 1.1437697410583496, + "learning_rate": 0.00018190311183584727, + "loss": 3.323, + "step": 96360 + }, + { + "epoch": 6.547424921864383, + "grad_norm": 0.8928757905960083, + "learning_rate": 0.00018186064682701453, + "loss": 3.4146, + "step": 96365 + }, + { + "epoch": 6.5477646419350455, + "grad_norm": 0.7943240404129028, + "learning_rate": 0.00018181818181818183, + "loss": 3.5289, + "step": 96370 + }, + { + "epoch": 6.548104362005708, + "grad_norm": 1.0137412548065186, + "learning_rate": 0.0001817757168093491, + "loss": 3.2348, + "step": 96375 + }, + { + "epoch": 6.548444082076369, + "grad_norm": 0.9413430690765381, + "learning_rate": 0.0001817332518005164, + "loss": 3.4291, + "step": 96380 + }, + { + "epoch": 6.548783802147031, + "grad_norm": 0.8432650566101074, + "learning_rate": 0.00018169078679168365, + "loss": 3.2677, + "step": 96385 + }, + { + "epoch": 6.549123522217693, + "grad_norm": 0.9156467318534851, + "learning_rate": 0.00018164832178285093, + "loss": 3.4647, + "step": 96390 + }, + { + "epoch": 6.549463242288354, + "grad_norm": 0.8254164457321167, + "learning_rate": 0.00018160585677401823, + "loss": 3.4339, + "step": 96395 + }, + { + "epoch": 6.549802962359016, + "grad_norm": 0.9429871439933777, + "learning_rate": 0.0001815633917651855, + "loss": 3.4431, + "step": 96400 + }, + { + "epoch": 6.550142682429678, + "grad_norm": 0.8502214550971985, + "learning_rate": 0.00018152092675635277, + "loss": 3.291, + "step": 96405 + }, + { + "epoch": 6.550482402500339, + "grad_norm": 0.9537201523780823, + "learning_rate": 0.00018147846174752005, + "loss": 3.5285, + "step": 96410 + }, + { + "epoch": 6.5508221225710015, + "grad_norm": 0.9920289516448975, + "learning_rate": 0.00018143599673868733, + "loss": 3.5119, + "step": 96415 + }, + { + "epoch": 6.551161842641664, + "grad_norm": 0.825298547744751, + "learning_rate": 0.00018139353172985458, + "loss": 3.3484, + "step": 96420 + }, + { + "epoch": 6.551501562712325, + "grad_norm": 1.0800122022628784, + "learning_rate": 0.0001813510667210219, + "loss": 3.5711, + "step": 96425 + }, + { + "epoch": 6.551841282782987, + "grad_norm": 0.7928586006164551, + "learning_rate": 0.00018130860171218917, + "loss": 3.4899, + "step": 96430 + }, + { + "epoch": 6.552181002853649, + "grad_norm": 0.9259544014930725, + "learning_rate": 0.00018126613670335642, + "loss": 3.4966, + "step": 96435 + }, + { + "epoch": 6.55252072292431, + "grad_norm": 0.7172062993049622, + "learning_rate": 0.00018122367169452373, + "loss": 3.2445, + "step": 96440 + }, + { + "epoch": 6.552860442994972, + "grad_norm": 0.9667514562606812, + "learning_rate": 0.00018118120668569098, + "loss": 3.2039, + "step": 96445 + }, + { + "epoch": 6.553200163065634, + "grad_norm": 1.1942930221557617, + "learning_rate": 0.00018113874167685826, + "loss": 3.4296, + "step": 96450 + }, + { + "epoch": 6.553539883136295, + "grad_norm": 0.9152824878692627, + "learning_rate": 0.00018109627666802557, + "loss": 3.5018, + "step": 96455 + }, + { + "epoch": 6.5538796032069575, + "grad_norm": 1.2814470529556274, + "learning_rate": 0.00018105381165919282, + "loss": 3.1384, + "step": 96460 + }, + { + "epoch": 6.55421932327762, + "grad_norm": 0.7930164337158203, + "learning_rate": 0.00018101134665036013, + "loss": 3.29, + "step": 96465 + }, + { + "epoch": 6.554559043348281, + "grad_norm": 0.8283624053001404, + "learning_rate": 0.00018096888164152738, + "loss": 3.6658, + "step": 96470 + }, + { + "epoch": 6.554898763418943, + "grad_norm": 1.1580358743667603, + "learning_rate": 0.00018092641663269466, + "loss": 3.5278, + "step": 96475 + }, + { + "epoch": 6.555238483489605, + "grad_norm": 0.8476174473762512, + "learning_rate": 0.00018088395162386194, + "loss": 3.3957, + "step": 96480 + }, + { + "epoch": 6.555578203560266, + "grad_norm": 0.8010099530220032, + "learning_rate": 0.00018084148661502922, + "loss": 3.3573, + "step": 96485 + }, + { + "epoch": 6.555917923630928, + "grad_norm": 1.0258667469024658, + "learning_rate": 0.0001807990216061965, + "loss": 3.4307, + "step": 96490 + }, + { + "epoch": 6.55625764370159, + "grad_norm": 1.2952287197113037, + "learning_rate": 0.00018075655659736378, + "loss": 3.0565, + "step": 96495 + }, + { + "epoch": 6.556597363772251, + "grad_norm": 1.3069231510162354, + "learning_rate": 0.00018071409158853106, + "loss": 3.5, + "step": 96500 + }, + { + "epoch": 6.5569370838429135, + "grad_norm": 0.8769518136978149, + "learning_rate": 0.00018067162657969832, + "loss": 3.2552, + "step": 96505 + }, + { + "epoch": 6.557276803913576, + "grad_norm": 0.9203097224235535, + "learning_rate": 0.00018062916157086562, + "loss": 3.4012, + "step": 96510 + }, + { + "epoch": 6.557616523984237, + "grad_norm": 3.309804916381836, + "learning_rate": 0.00018058669656203288, + "loss": 3.3541, + "step": 96515 + }, + { + "epoch": 6.557956244054899, + "grad_norm": 0.7756957411766052, + "learning_rate": 0.00018054423155320016, + "loss": 3.4447, + "step": 96520 + }, + { + "epoch": 6.558295964125561, + "grad_norm": 1.0988537073135376, + "learning_rate": 0.00018050176654436746, + "loss": 3.4951, + "step": 96525 + }, + { + "epoch": 6.558635684196222, + "grad_norm": 0.8514907956123352, + "learning_rate": 0.00018045930153553472, + "loss": 3.5909, + "step": 96530 + }, + { + "epoch": 6.558975404266884, + "grad_norm": 1.1035823822021484, + "learning_rate": 0.000180416836526702, + "loss": 3.6258, + "step": 96535 + }, + { + "epoch": 6.559315124337546, + "grad_norm": 0.8167989253997803, + "learning_rate": 0.00018037437151786928, + "loss": 3.3255, + "step": 96540 + }, + { + "epoch": 6.5596548444082075, + "grad_norm": 0.8429747223854065, + "learning_rate": 0.00018033190650903656, + "loss": 3.4035, + "step": 96545 + }, + { + "epoch": 6.5599945644788695, + "grad_norm": 0.9145278334617615, + "learning_rate": 0.00018028944150020384, + "loss": 3.5605, + "step": 96550 + }, + { + "epoch": 6.560334284549532, + "grad_norm": 1.1124495267868042, + "learning_rate": 0.00018024697649137112, + "loss": 3.3203, + "step": 96555 + }, + { + "epoch": 6.560674004620193, + "grad_norm": 0.8458141088485718, + "learning_rate": 0.0001802045114825384, + "loss": 3.4201, + "step": 96560 + }, + { + "epoch": 6.561013724690855, + "grad_norm": 1.1716632843017578, + "learning_rate": 0.00018016204647370568, + "loss": 3.4054, + "step": 96565 + }, + { + "epoch": 6.561353444761517, + "grad_norm": 0.868791401386261, + "learning_rate": 0.00018011958146487296, + "loss": 3.168, + "step": 96570 + }, + { + "epoch": 6.561693164832178, + "grad_norm": 0.969434916973114, + "learning_rate": 0.0001800771164560402, + "loss": 3.3689, + "step": 96575 + }, + { + "epoch": 6.56203288490284, + "grad_norm": 0.6956519484519958, + "learning_rate": 0.00018003465144720752, + "loss": 3.422, + "step": 96580 + }, + { + "epoch": 6.562372604973502, + "grad_norm": 1.052335500717163, + "learning_rate": 0.0001799921864383748, + "loss": 3.1929, + "step": 96585 + }, + { + "epoch": 6.5627123250441635, + "grad_norm": 1.0469460487365723, + "learning_rate": 0.00017994972142954205, + "loss": 3.2609, + "step": 96590 + }, + { + "epoch": 6.5630520451148255, + "grad_norm": 0.9247040748596191, + "learning_rate": 0.00017990725642070936, + "loss": 3.2811, + "step": 96595 + }, + { + "epoch": 6.563391765185488, + "grad_norm": 1.020613431930542, + "learning_rate": 0.0001798647914118766, + "loss": 3.4064, + "step": 96600 + }, + { + "epoch": 6.563731485256149, + "grad_norm": 1.0119798183441162, + "learning_rate": 0.0001798223264030439, + "loss": 3.2697, + "step": 96605 + }, + { + "epoch": 6.564071205326811, + "grad_norm": 0.7283795475959778, + "learning_rate": 0.00017977986139421117, + "loss": 3.4144, + "step": 96610 + }, + { + "epoch": 6.564410925397473, + "grad_norm": 0.8270854353904724, + "learning_rate": 0.00017973739638537845, + "loss": 3.5867, + "step": 96615 + }, + { + "epoch": 6.564750645468134, + "grad_norm": 0.9614652395248413, + "learning_rate": 0.00017969493137654573, + "loss": 3.2434, + "step": 96620 + }, + { + "epoch": 6.565090365538796, + "grad_norm": 1.0461007356643677, + "learning_rate": 0.000179652466367713, + "loss": 3.449, + "step": 96625 + }, + { + "epoch": 6.565430085609458, + "grad_norm": NaN, + "learning_rate": 0.00017961849436064683, + "loss": 3.331, + "step": 96630 + }, + { + "epoch": 6.5657698056801195, + "grad_norm": 0.9014115333557129, + "learning_rate": 0.00017957602935181411, + "loss": 3.3965, + "step": 96635 + }, + { + "epoch": 6.5661095257507816, + "grad_norm": 0.9101060628890991, + "learning_rate": 0.00017953356434298137, + "loss": 3.2178, + "step": 96640 + }, + { + "epoch": 6.566449245821444, + "grad_norm": 1.05948805809021, + "learning_rate": 0.00017949109933414867, + "loss": 3.3203, + "step": 96645 + }, + { + "epoch": 6.566788965892105, + "grad_norm": 1.0018463134765625, + "learning_rate": 0.00017944863432531593, + "loss": 3.6907, + "step": 96650 + }, + { + "epoch": 6.567128685962767, + "grad_norm": 0.8236015439033508, + "learning_rate": 0.0001794061693164832, + "loss": 3.6045, + "step": 96655 + }, + { + "epoch": 6.567468406033428, + "grad_norm": 0.8386440277099609, + "learning_rate": 0.00017936370430765051, + "loss": 3.5379, + "step": 96660 + }, + { + "epoch": 6.56780812610409, + "grad_norm": 1.0769314765930176, + "learning_rate": 0.00017932123929881777, + "loss": 3.5158, + "step": 96665 + }, + { + "epoch": 6.568147846174752, + "grad_norm": 0.9659611582756042, + "learning_rate": 0.00017927877428998507, + "loss": 3.4886, + "step": 96670 + }, + { + "epoch": 6.568487566245413, + "grad_norm": 0.7674791216850281, + "learning_rate": 0.00017923630928115233, + "loss": 3.6044, + "step": 96675 + }, + { + "epoch": 6.5688272863160755, + "grad_norm": 0.7495800852775574, + "learning_rate": 0.0001791938442723196, + "loss": 3.4666, + "step": 96680 + }, + { + "epoch": 6.569167006386738, + "grad_norm": 1.00981605052948, + "learning_rate": 0.00017915137926348691, + "loss": 3.5036, + "step": 96685 + }, + { + "epoch": 6.569506726457399, + "grad_norm": 1.033519983291626, + "learning_rate": 0.00017910891425465417, + "loss": 3.2244, + "step": 96690 + }, + { + "epoch": 6.569846446528061, + "grad_norm": 1.0277893543243408, + "learning_rate": 0.00017906644924582145, + "loss": 3.2532, + "step": 96695 + }, + { + "epoch": 6.570186166598723, + "grad_norm": 0.8053072690963745, + "learning_rate": 0.00017902398423698873, + "loss": 3.3109, + "step": 96700 + }, + { + "epoch": 6.570525886669384, + "grad_norm": 0.9955875277519226, + "learning_rate": 0.000178981519228156, + "loss": 3.0978, + "step": 96705 + }, + { + "epoch": 6.570865606740046, + "grad_norm": 0.8383739590644836, + "learning_rate": 0.00017893905421932326, + "loss": 3.2923, + "step": 96710 + }, + { + "epoch": 6.571205326810708, + "grad_norm": 1.0197174549102783, + "learning_rate": 0.00017889658921049057, + "loss": 3.3436, + "step": 96715 + }, + { + "epoch": 6.571545046881369, + "grad_norm": 1.0184698104858398, + "learning_rate": 0.00017885412420165785, + "loss": 3.4628, + "step": 96720 + }, + { + "epoch": 6.5718847669520315, + "grad_norm": 0.731654703617096, + "learning_rate": 0.0001788116591928251, + "loss": 3.1274, + "step": 96725 + }, + { + "epoch": 6.572224487022694, + "grad_norm": 0.7793183922767639, + "learning_rate": 0.0001787691941839924, + "loss": 3.1946, + "step": 96730 + }, + { + "epoch": 6.572564207093355, + "grad_norm": 1.5434343814849854, + "learning_rate": 0.00017872672917515966, + "loss": 3.3356, + "step": 96735 + }, + { + "epoch": 6.572903927164017, + "grad_norm": 0.874599814414978, + "learning_rate": 0.00017868426416632694, + "loss": 3.3817, + "step": 96740 + }, + { + "epoch": 6.573243647234679, + "grad_norm": 0.9410642385482788, + "learning_rate": 0.00017864179915749422, + "loss": 3.2192, + "step": 96745 + }, + { + "epoch": 6.57358336730534, + "grad_norm": 0.9363478422164917, + "learning_rate": 0.0001785993341486615, + "loss": 3.7294, + "step": 96750 + }, + { + "epoch": 6.573923087376002, + "grad_norm": 1.0657933950424194, + "learning_rate": 0.0001785568691398288, + "loss": 3.3913, + "step": 96755 + }, + { + "epoch": 6.574262807446664, + "grad_norm": 1.1897368431091309, + "learning_rate": 0.00017851440413099606, + "loss": 3.3594, + "step": 96760 + }, + { + "epoch": 6.574602527517325, + "grad_norm": 0.9002661108970642, + "learning_rate": 0.00017847193912216334, + "loss": 3.5323, + "step": 96765 + }, + { + "epoch": 6.5749422475879875, + "grad_norm": 1.1734447479248047, + "learning_rate": 0.00017842947411333062, + "loss": 3.4404, + "step": 96770 + }, + { + "epoch": 6.57528196765865, + "grad_norm": 0.840173602104187, + "learning_rate": 0.0001783870091044979, + "loss": 3.3467, + "step": 96775 + }, + { + "epoch": 6.575621687729311, + "grad_norm": 0.9750733375549316, + "learning_rate": 0.00017834454409566515, + "loss": 3.2421, + "step": 96780 + }, + { + "epoch": 6.575961407799973, + "grad_norm": 0.8345876336097717, + "learning_rate": 0.00017830207908683246, + "loss": 3.3547, + "step": 96785 + }, + { + "epoch": 6.576301127870635, + "grad_norm": 0.9092654585838318, + "learning_rate": 0.00017825961407799974, + "loss": 3.4095, + "step": 96790 + }, + { + "epoch": 6.576640847941296, + "grad_norm": 0.9484195709228516, + "learning_rate": 0.000178217149069167, + "loss": 3.4782, + "step": 96795 + }, + { + "epoch": 6.576980568011958, + "grad_norm": 0.8600103855133057, + "learning_rate": 0.0001781746840603343, + "loss": 3.5205, + "step": 96800 + }, + { + "epoch": 6.57732028808262, + "grad_norm": 1.0801165103912354, + "learning_rate": 0.00017813221905150156, + "loss": 3.514, + "step": 96805 + }, + { + "epoch": 6.5776600081532814, + "grad_norm": 0.7489414811134338, + "learning_rate": 0.00017808975404266884, + "loss": 3.6452, + "step": 96810 + }, + { + "epoch": 6.5779997282239435, + "grad_norm": 0.960313618183136, + "learning_rate": 0.00017804728903383614, + "loss": 3.4956, + "step": 96815 + }, + { + "epoch": 6.578339448294606, + "grad_norm": 0.9917598962783813, + "learning_rate": 0.0001780048240250034, + "loss": 3.3003, + "step": 96820 + }, + { + "epoch": 6.578679168365267, + "grad_norm": 0.9402817487716675, + "learning_rate": 0.00017796235901617068, + "loss": 3.6825, + "step": 96825 + }, + { + "epoch": 6.579018888435929, + "grad_norm": 1.0749881267547607, + "learning_rate": 0.00017791989400733796, + "loss": 3.1593, + "step": 96830 + }, + { + "epoch": 6.579358608506591, + "grad_norm": 1.2331064939498901, + "learning_rate": 0.00017787742899850524, + "loss": 3.4764, + "step": 96835 + }, + { + "epoch": 6.579698328577252, + "grad_norm": 1.0095975399017334, + "learning_rate": 0.00017783496398967252, + "loss": 3.3883, + "step": 96840 + }, + { + "epoch": 6.580038048647914, + "grad_norm": 1.4322572946548462, + "learning_rate": 0.0001777924989808398, + "loss": 3.2271, + "step": 96845 + }, + { + "epoch": 6.580377768718576, + "grad_norm": 0.8068368434906006, + "learning_rate": 0.00017775003397200708, + "loss": 3.495, + "step": 96850 + }, + { + "epoch": 6.5807174887892375, + "grad_norm": 1.2056032419204712, + "learning_rate": 0.00017770756896317436, + "loss": 3.2836, + "step": 96855 + }, + { + "epoch": 6.5810572088598995, + "grad_norm": 1.516167163848877, + "learning_rate": 0.00017766510395434164, + "loss": 3.6031, + "step": 96860 + }, + { + "epoch": 6.581396928930561, + "grad_norm": 1.320302128791809, + "learning_rate": 0.0001776226389455089, + "loss": 3.5592, + "step": 96865 + }, + { + "epoch": 6.581736649001223, + "grad_norm": 1.0717660188674927, + "learning_rate": 0.0001775801739366762, + "loss": 3.2174, + "step": 96870 + }, + { + "epoch": 6.582076369071885, + "grad_norm": 0.7529323101043701, + "learning_rate": 0.00017753770892784345, + "loss": 3.5978, + "step": 96875 + }, + { + "epoch": 6.582416089142546, + "grad_norm": 0.8591926097869873, + "learning_rate": 0.00017749524391901073, + "loss": 3.4744, + "step": 96880 + }, + { + "epoch": 6.582755809213208, + "grad_norm": 0.9473296403884888, + "learning_rate": 0.00017745277891017804, + "loss": 3.7466, + "step": 96885 + }, + { + "epoch": 6.58309552928387, + "grad_norm": 1.0455561876296997, + "learning_rate": 0.0001774103139013453, + "loss": 3.2125, + "step": 96890 + }, + { + "epoch": 6.583435249354531, + "grad_norm": 1.0314862728118896, + "learning_rate": 0.00017736784889251257, + "loss": 3.5445, + "step": 96895 + }, + { + "epoch": 6.5837749694251935, + "grad_norm": 1.0254491567611694, + "learning_rate": 0.00017732538388367985, + "loss": 3.5492, + "step": 96900 + }, + { + "epoch": 6.5841146894958555, + "grad_norm": 0.7772113084793091, + "learning_rate": 0.00017728291887484713, + "loss": 3.4589, + "step": 96905 + }, + { + "epoch": 6.584454409566517, + "grad_norm": 0.8909747004508972, + "learning_rate": 0.00017724045386601438, + "loss": 3.4227, + "step": 96910 + }, + { + "epoch": 6.584794129637179, + "grad_norm": 0.8504826426506042, + "learning_rate": 0.0001771979888571817, + "loss": 3.3662, + "step": 96915 + }, + { + "epoch": 6.585133849707841, + "grad_norm": 1.2438416481018066, + "learning_rate": 0.00017715552384834897, + "loss": 3.5082, + "step": 96920 + }, + { + "epoch": 6.585473569778502, + "grad_norm": 1.1575578451156616, + "learning_rate": 0.00017711305883951625, + "loss": 3.2003, + "step": 96925 + }, + { + "epoch": 6.585813289849164, + "grad_norm": 0.7021375298500061, + "learning_rate": 0.00017707059383068353, + "loss": 3.5174, + "step": 96930 + }, + { + "epoch": 6.586153009919826, + "grad_norm": 1.0073963403701782, + "learning_rate": 0.00017702812882185078, + "loss": 3.1854, + "step": 96935 + }, + { + "epoch": 6.586492729990487, + "grad_norm": 1.061810851097107, + "learning_rate": 0.0001769856638130181, + "loss": 3.4536, + "step": 96940 + }, + { + "epoch": 6.5868324500611495, + "grad_norm": 0.8373082280158997, + "learning_rate": 0.00017694319880418537, + "loss": 3.2719, + "step": 96945 + }, + { + "epoch": 6.5871721701318116, + "grad_norm": 0.8319278359413147, + "learning_rate": 0.00017690073379535262, + "loss": 3.2153, + "step": 96950 + }, + { + "epoch": 6.587511890202473, + "grad_norm": 0.9663111567497253, + "learning_rate": 0.00017685826878651993, + "loss": 3.1971, + "step": 96955 + }, + { + "epoch": 6.587851610273135, + "grad_norm": 1.0163092613220215, + "learning_rate": 0.00017681580377768718, + "loss": 3.6218, + "step": 96960 + }, + { + "epoch": 6.588191330343797, + "grad_norm": 0.8834219574928284, + "learning_rate": 0.00017677333876885446, + "loss": 3.356, + "step": 96965 + }, + { + "epoch": 6.588531050414458, + "grad_norm": 0.8760011792182922, + "learning_rate": 0.00017673087376002174, + "loss": 3.6722, + "step": 96970 + }, + { + "epoch": 6.58887077048512, + "grad_norm": 1.012588381767273, + "learning_rate": 0.00017668840875118902, + "loss": 2.9831, + "step": 96975 + }, + { + "epoch": 6.589210490555782, + "grad_norm": 5.585391044616699, + "learning_rate": 0.0001766459437423563, + "loss": 3.3806, + "step": 96980 + }, + { + "epoch": 6.589550210626443, + "grad_norm": 1.2364397048950195, + "learning_rate": 0.00017660347873352358, + "loss": 3.4226, + "step": 96985 + }, + { + "epoch": 6.5898899306971055, + "grad_norm": 1.0011404752731323, + "learning_rate": 0.00017656101372469086, + "loss": 3.4993, + "step": 96990 + }, + { + "epoch": 6.590229650767768, + "grad_norm": 0.8533453941345215, + "learning_rate": 0.00017651854871585812, + "loss": 3.5788, + "step": 96995 + }, + { + "epoch": 6.590569370838429, + "grad_norm": 1.1065516471862793, + "learning_rate": 0.00017647608370702542, + "loss": 3.5473, + "step": 97000 + }, + { + "epoch": 6.590909090909091, + "grad_norm": 0.9194850325584412, + "learning_rate": 0.00017643361869819268, + "loss": 3.4491, + "step": 97005 + }, + { + "epoch": 6.591248810979753, + "grad_norm": 0.7665842771530151, + "learning_rate": 0.00017639115368935998, + "loss": 3.2799, + "step": 97010 + }, + { + "epoch": 6.591588531050414, + "grad_norm": 0.8762804865837097, + "learning_rate": 0.00017634868868052726, + "loss": 3.3425, + "step": 97015 + }, + { + "epoch": 6.591928251121076, + "grad_norm": 0.9808079600334167, + "learning_rate": 0.00017630622367169452, + "loss": 3.4573, + "step": 97020 + }, + { + "epoch": 6.592267971191738, + "grad_norm": 1.0717189311981201, + "learning_rate": 0.00017626375866286182, + "loss": 3.3793, + "step": 97025 + }, + { + "epoch": 6.592607691262399, + "grad_norm": 0.8091614246368408, + "learning_rate": 0.00017622129365402908, + "loss": 3.5041, + "step": 97030 + }, + { + "epoch": 6.5929474113330615, + "grad_norm": 1.1031352281570435, + "learning_rate": 0.00017617882864519636, + "loss": 3.1948, + "step": 97035 + }, + { + "epoch": 6.593287131403724, + "grad_norm": 0.9383870959281921, + "learning_rate": 0.00017613636363636364, + "loss": 3.122, + "step": 97040 + }, + { + "epoch": 6.593626851474385, + "grad_norm": 0.8861058950424194, + "learning_rate": 0.00017609389862753092, + "loss": 3.3272, + "step": 97045 + }, + { + "epoch": 6.593966571545047, + "grad_norm": 10.930349349975586, + "learning_rate": 0.0001760514336186982, + "loss": 3.2925, + "step": 97050 + }, + { + "epoch": 6.594306291615709, + "grad_norm": 1.034330129623413, + "learning_rate": 0.00017600896860986548, + "loss": 3.3403, + "step": 97055 + }, + { + "epoch": 6.59464601168637, + "grad_norm": 1.439183235168457, + "learning_rate": 0.00017596650360103276, + "loss": 3.327, + "step": 97060 + }, + { + "epoch": 6.594985731757032, + "grad_norm": 1.050272822380066, + "learning_rate": 0.0001759240385922, + "loss": 3.2628, + "step": 97065 + }, + { + "epoch": 6.595325451827694, + "grad_norm": 0.8950603008270264, + "learning_rate": 0.00017588157358336732, + "loss": 3.304, + "step": 97070 + }, + { + "epoch": 6.595665171898355, + "grad_norm": 0.9465873837471008, + "learning_rate": 0.00017583910857453457, + "loss": 3.5159, + "step": 97075 + }, + { + "epoch": 6.5960048919690175, + "grad_norm": 0.959265947341919, + "learning_rate": 0.00017579664356570185, + "loss": 3.2158, + "step": 97080 + }, + { + "epoch": 6.59634461203968, + "grad_norm": 0.8519126176834106, + "learning_rate": 0.00017575417855686916, + "loss": 3.4831, + "step": 97085 + }, + { + "epoch": 6.596684332110341, + "grad_norm": 0.7518671751022339, + "learning_rate": 0.0001757117135480364, + "loss": 3.4255, + "step": 97090 + }, + { + "epoch": 6.597024052181003, + "grad_norm": 1.0099091529846191, + "learning_rate": 0.00017566924853920372, + "loss": 3.5593, + "step": 97095 + }, + { + "epoch": 6.597363772251665, + "grad_norm": 1.9636856317520142, + "learning_rate": 0.00017562678353037097, + "loss": 3.4155, + "step": 97100 + }, + { + "epoch": 6.597703492322326, + "grad_norm": 1.1085718870162964, + "learning_rate": 0.00017558431852153825, + "loss": 3.4311, + "step": 97105 + }, + { + "epoch": 6.598043212392988, + "grad_norm": 0.941981852054596, + "learning_rate": 0.00017554185351270556, + "loss": 3.5354, + "step": 97110 + }, + { + "epoch": 6.59838293246365, + "grad_norm": 1.2091301679611206, + "learning_rate": 0.0001754993885038728, + "loss": 3.373, + "step": 97115 + }, + { + "epoch": 6.5987226525343115, + "grad_norm": 0.8865528106689453, + "learning_rate": 0.0001754569234950401, + "loss": 3.3821, + "step": 97120 + }, + { + "epoch": 6.5990623726049735, + "grad_norm": 0.8358374834060669, + "learning_rate": 0.00017541445848620737, + "loss": 3.2673, + "step": 97125 + }, + { + "epoch": 6.599402092675636, + "grad_norm": 0.860821545124054, + "learning_rate": 0.00017537199347737465, + "loss": 3.3208, + "step": 97130 + }, + { + "epoch": 6.599741812746297, + "grad_norm": 0.7560829520225525, + "learning_rate": 0.0001753295284685419, + "loss": 3.2561, + "step": 97135 + }, + { + "epoch": 6.600081532816959, + "grad_norm": 0.907918393611908, + "learning_rate": 0.0001752870634597092, + "loss": 3.3599, + "step": 97140 + }, + { + "epoch": 6.600421252887621, + "grad_norm": 1.2077957391738892, + "learning_rate": 0.0001752445984508765, + "loss": 3.4178, + "step": 97145 + }, + { + "epoch": 6.600760972958282, + "grad_norm": 0.921823263168335, + "learning_rate": 0.00017520213344204375, + "loss": 3.4489, + "step": 97150 + }, + { + "epoch": 6.601100693028944, + "grad_norm": 0.7021950483322144, + "learning_rate": 0.00017515966843321105, + "loss": 3.3465, + "step": 97155 + }, + { + "epoch": 6.601440413099606, + "grad_norm": 0.8576568365097046, + "learning_rate": 0.0001751172034243783, + "loss": 3.6863, + "step": 97160 + }, + { + "epoch": 6.6017801331702675, + "grad_norm": 0.9732618927955627, + "learning_rate": 0.00017507473841554559, + "loss": 3.7209, + "step": 97165 + }, + { + "epoch": 6.6021198532409295, + "grad_norm": 0.8388785719871521, + "learning_rate": 0.00017503227340671287, + "loss": 3.4848, + "step": 97170 + }, + { + "epoch": 6.602459573311592, + "grad_norm": 0.9404033422470093, + "learning_rate": 0.00017498980839788015, + "loss": 3.2712, + "step": 97175 + }, + { + "epoch": 6.602799293382253, + "grad_norm": 0.9860801696777344, + "learning_rate": 0.00017494734338904745, + "loss": 3.5108, + "step": 97180 + }, + { + "epoch": 6.603139013452915, + "grad_norm": 0.7402017712593079, + "learning_rate": 0.0001749048783802147, + "loss": 3.5025, + "step": 97185 + }, + { + "epoch": 6.603478733523577, + "grad_norm": 0.7529060244560242, + "learning_rate": 0.00017486241337138199, + "loss": 3.5313, + "step": 97190 + }, + { + "epoch": 6.603818453594238, + "grad_norm": 0.9717360734939575, + "learning_rate": 0.00017481994836254927, + "loss": 3.5744, + "step": 97195 + }, + { + "epoch": 6.6041581736649, + "grad_norm": 1.1876686811447144, + "learning_rate": 0.00017477748335371655, + "loss": 3.7107, + "step": 97200 + }, + { + "epoch": 6.604497893735562, + "grad_norm": 0.9006242752075195, + "learning_rate": 0.0001747350183448838, + "loss": 3.3389, + "step": 97205 + }, + { + "epoch": 6.6048376138062235, + "grad_norm": 1.0071220397949219, + "learning_rate": 0.0001746925533360511, + "loss": 3.1763, + "step": 97210 + }, + { + "epoch": 6.6051773338768855, + "grad_norm": 1.0945372581481934, + "learning_rate": 0.00017465008832721839, + "loss": 3.4015, + "step": 97215 + }, + { + "epoch": 6.605517053947548, + "grad_norm": 1.0903353691101074, + "learning_rate": 0.00017460762331838564, + "loss": 3.4699, + "step": 97220 + }, + { + "epoch": 6.605856774018209, + "grad_norm": 0.932127058506012, + "learning_rate": 0.00017456515830955295, + "loss": 3.3569, + "step": 97225 + }, + { + "epoch": 6.606196494088871, + "grad_norm": 0.7911741733551025, + "learning_rate": 0.0001745226933007202, + "loss": 3.4522, + "step": 97230 + }, + { + "epoch": 6.606536214159533, + "grad_norm": 0.7468881607055664, + "learning_rate": 0.00017448022829188748, + "loss": 3.482, + "step": 97235 + }, + { + "epoch": 6.606875934230194, + "grad_norm": 0.9693372249603271, + "learning_rate": 0.00017443776328305479, + "loss": 3.4571, + "step": 97240 + }, + { + "epoch": 6.607215654300856, + "grad_norm": 1.262694001197815, + "learning_rate": 0.00017439529827422204, + "loss": 3.4294, + "step": 97245 + }, + { + "epoch": 6.607555374371518, + "grad_norm": 0.7966423630714417, + "learning_rate": 0.00017435283326538932, + "loss": 3.1288, + "step": 97250 + }, + { + "epoch": 6.6078950944421795, + "grad_norm": 0.9072123765945435, + "learning_rate": 0.0001743103682565566, + "loss": 3.1822, + "step": 97255 + }, + { + "epoch": 6.608234814512842, + "grad_norm": 0.8354119658470154, + "learning_rate": 0.00017426790324772388, + "loss": 3.6282, + "step": 97260 + }, + { + "epoch": 6.608574534583504, + "grad_norm": 0.7677922248840332, + "learning_rate": 0.00017422543823889116, + "loss": 3.4465, + "step": 97265 + }, + { + "epoch": 6.608914254654165, + "grad_norm": 0.9761457443237305, + "learning_rate": 0.00017418297323005844, + "loss": 3.1972, + "step": 97270 + }, + { + "epoch": 6.609253974724827, + "grad_norm": 0.9753945469856262, + "learning_rate": 0.00017414050822122572, + "loss": 3.6194, + "step": 97275 + }, + { + "epoch": 6.609593694795489, + "grad_norm": 0.725884735584259, + "learning_rate": 0.000174098043212393, + "loss": 3.6028, + "step": 97280 + }, + { + "epoch": 6.60993341486615, + "grad_norm": 0.9694748520851135, + "learning_rate": 0.00017405557820356028, + "loss": 3.6003, + "step": 97285 + }, + { + "epoch": 6.610273134936812, + "grad_norm": 0.8608989119529724, + "learning_rate": 0.00017401311319472753, + "loss": 3.3581, + "step": 97290 + }, + { + "epoch": 6.610612855007474, + "grad_norm": 0.8268758058547974, + "learning_rate": 0.00017397064818589484, + "loss": 3.3196, + "step": 97295 + }, + { + "epoch": 6.6109525750781355, + "grad_norm": 0.7886155843734741, + "learning_rate": 0.0001739281831770621, + "loss": 3.7137, + "step": 97300 + }, + { + "epoch": 6.611292295148798, + "grad_norm": 0.8483842611312866, + "learning_rate": 0.00017388571816822937, + "loss": 3.209, + "step": 97305 + }, + { + "epoch": 6.61163201521946, + "grad_norm": 1.0079290866851807, + "learning_rate": 0.00017384325315939668, + "loss": 3.3241, + "step": 97310 + }, + { + "epoch": 6.611971735290121, + "grad_norm": 0.8289422392845154, + "learning_rate": 0.00017380078815056393, + "loss": 3.5296, + "step": 97315 + }, + { + "epoch": 6.612311455360783, + "grad_norm": 1.2378263473510742, + "learning_rate": 0.0001737583231417312, + "loss": 3.4026, + "step": 97320 + }, + { + "epoch": 6.612651175431445, + "grad_norm": 0.9780288338661194, + "learning_rate": 0.0001737158581328985, + "loss": 3.4556, + "step": 97325 + }, + { + "epoch": 6.612990895502106, + "grad_norm": 0.9569134712219238, + "learning_rate": 0.00017367339312406577, + "loss": 3.294, + "step": 97330 + }, + { + "epoch": 6.613330615572768, + "grad_norm": 0.8619605898857117, + "learning_rate": 0.00017363092811523303, + "loss": 3.2738, + "step": 97335 + }, + { + "epoch": 6.613670335643429, + "grad_norm": 1.0977410078048706, + "learning_rate": 0.00017358846310640033, + "loss": 3.4321, + "step": 97340 + }, + { + "epoch": 6.6140100557140915, + "grad_norm": 0.9106045961380005, + "learning_rate": 0.00017354599809756761, + "loss": 3.4917, + "step": 97345 + }, + { + "epoch": 6.614349775784754, + "grad_norm": 1.1789740324020386, + "learning_rate": 0.0001735035330887349, + "loss": 3.3843, + "step": 97350 + }, + { + "epoch": 6.614689495855415, + "grad_norm": 0.9657354354858398, + "learning_rate": 0.00017346106807990217, + "loss": 3.1964, + "step": 97355 + }, + { + "epoch": 6.615029215926077, + "grad_norm": 1.0349256992340088, + "learning_rate": 0.00017341860307106943, + "loss": 3.1048, + "step": 97360 + }, + { + "epoch": 6.615368935996739, + "grad_norm": 1.0161707401275635, + "learning_rate": 0.00017337613806223673, + "loss": 3.4412, + "step": 97365 + }, + { + "epoch": 6.6157086560674, + "grad_norm": 0.9693883061408997, + "learning_rate": 0.00017333367305340401, + "loss": 3.4174, + "step": 97370 + }, + { + "epoch": 6.616048376138062, + "grad_norm": 0.8241753578186035, + "learning_rate": 0.00017329120804457127, + "loss": 3.0443, + "step": 97375 + }, + { + "epoch": 6.616388096208724, + "grad_norm": 0.8563506007194519, + "learning_rate": 0.00017324874303573857, + "loss": 3.3314, + "step": 97380 + }, + { + "epoch": 6.616727816279385, + "grad_norm": 1.4663656949996948, + "learning_rate": 0.00017320627802690583, + "loss": 3.6082, + "step": 97385 + }, + { + "epoch": 6.6170675363500475, + "grad_norm": 0.9157328009605408, + "learning_rate": 0.0001731638130180731, + "loss": 3.0908, + "step": 97390 + }, + { + "epoch": 6.61740725642071, + "grad_norm": 0.8001793026924133, + "learning_rate": 0.0001731213480092404, + "loss": 3.4469, + "step": 97395 + }, + { + "epoch": 6.617746976491371, + "grad_norm": 1.3075228929519653, + "learning_rate": 0.00017307888300040767, + "loss": 3.3845, + "step": 97400 + }, + { + "epoch": 6.618086696562033, + "grad_norm": 0.994766891002655, + "learning_rate": 0.00017303641799157495, + "loss": 3.5061, + "step": 97405 + }, + { + "epoch": 6.618426416632695, + "grad_norm": 0.9018204808235168, + "learning_rate": 0.00017299395298274223, + "loss": 3.5387, + "step": 97410 + }, + { + "epoch": 6.618766136703356, + "grad_norm": 0.938644289970398, + "learning_rate": 0.0001729514879739095, + "loss": 3.4206, + "step": 97415 + }, + { + "epoch": 6.619105856774018, + "grad_norm": 0.9247003793716431, + "learning_rate": 0.00017290902296507676, + "loss": 3.4177, + "step": 97420 + }, + { + "epoch": 6.61944557684468, + "grad_norm": 0.7563278079032898, + "learning_rate": 0.00017286655795624407, + "loss": 3.6109, + "step": 97425 + }, + { + "epoch": 6.6197852969153415, + "grad_norm": 0.9414361119270325, + "learning_rate": 0.00017282409294741132, + "loss": 3.5296, + "step": 97430 + }, + { + "epoch": 6.6201250169860035, + "grad_norm": 0.8838784694671631, + "learning_rate": 0.00017278162793857863, + "loss": 3.2119, + "step": 97435 + }, + { + "epoch": 6.620464737056666, + "grad_norm": 1.0114789009094238, + "learning_rate": 0.0001727391629297459, + "loss": 3.3369, + "step": 97440 + }, + { + "epoch": 6.620804457127327, + "grad_norm": 0.8282676935195923, + "learning_rate": 0.00017269669792091316, + "loss": 3.1221, + "step": 97445 + }, + { + "epoch": 6.621144177197989, + "grad_norm": 0.8039588332176208, + "learning_rate": 0.00017265423291208047, + "loss": 3.6945, + "step": 97450 + }, + { + "epoch": 6.621483897268651, + "grad_norm": 0.9895963668823242, + "learning_rate": 0.00017261176790324772, + "loss": 3.3688, + "step": 97455 + }, + { + "epoch": 6.621823617339312, + "grad_norm": 1.0057264566421509, + "learning_rate": 0.000172569302894415, + "loss": 3.598, + "step": 97460 + }, + { + "epoch": 6.622163337409974, + "grad_norm": 0.8915064334869385, + "learning_rate": 0.00017252683788558228, + "loss": 3.3479, + "step": 97465 + }, + { + "epoch": 6.622503057480636, + "grad_norm": 0.9491124153137207, + "learning_rate": 0.00017248437287674956, + "loss": 3.398, + "step": 97470 + }, + { + "epoch": 6.6228427775512975, + "grad_norm": 1.1207575798034668, + "learning_rate": 0.00017244190786791684, + "loss": 3.3689, + "step": 97475 + }, + { + "epoch": 6.6231824976219595, + "grad_norm": 0.6909717321395874, + "learning_rate": 0.00017239944285908412, + "loss": 3.2953, + "step": 97480 + }, + { + "epoch": 6.623522217692622, + "grad_norm": 0.7603974342346191, + "learning_rate": 0.0001723569778502514, + "loss": 3.7216, + "step": 97485 + }, + { + "epoch": 6.623861937763283, + "grad_norm": 1.118270754814148, + "learning_rate": 0.00017231451284141865, + "loss": 3.4732, + "step": 97490 + }, + { + "epoch": 6.624201657833945, + "grad_norm": 1.0803996324539185, + "learning_rate": 0.00017227204783258596, + "loss": 3.4671, + "step": 97495 + }, + { + "epoch": 6.624541377904607, + "grad_norm": 1.1441253423690796, + "learning_rate": 0.00017222958282375321, + "loss": 3.305, + "step": 97500 + }, + { + "epoch": 6.624881097975268, + "grad_norm": 1.082458257675171, + "learning_rate": 0.0001721871178149205, + "loss": 3.4361, + "step": 97505 + }, + { + "epoch": 6.62522081804593, + "grad_norm": 1.0085406303405762, + "learning_rate": 0.0001721446528060878, + "loss": 3.5255, + "step": 97510 + }, + { + "epoch": 6.625560538116592, + "grad_norm": 0.9876922965049744, + "learning_rate": 0.00017210218779725506, + "loss": 3.5167, + "step": 97515 + }, + { + "epoch": 6.6259002581872535, + "grad_norm": 0.8283570408821106, + "learning_rate": 0.00017205972278842236, + "loss": 3.7494, + "step": 97520 + }, + { + "epoch": 6.6262399782579156, + "grad_norm": 1.06893789768219, + "learning_rate": 0.00017201725777958962, + "loss": 3.1974, + "step": 97525 + }, + { + "epoch": 6.626579698328578, + "grad_norm": 1.0116472244262695, + "learning_rate": 0.0001719747927707569, + "loss": 3.5986, + "step": 97530 + }, + { + "epoch": 6.626919418399239, + "grad_norm": 0.9640796184539795, + "learning_rate": 0.0001719323277619242, + "loss": 3.493, + "step": 97535 + }, + { + "epoch": 6.627259138469901, + "grad_norm": 0.8987517952919006, + "learning_rate": 0.00017188986275309146, + "loss": 3.4964, + "step": 97540 + }, + { + "epoch": 6.627598858540563, + "grad_norm": 0.9849269390106201, + "learning_rate": 0.00017184739774425874, + "loss": 3.3079, + "step": 97545 + }, + { + "epoch": 6.627938578611224, + "grad_norm": 0.8897499442100525, + "learning_rate": 0.00017180493273542602, + "loss": 3.3859, + "step": 97550 + }, + { + "epoch": 6.628278298681886, + "grad_norm": 0.8455703854560852, + "learning_rate": 0.0001717624677265933, + "loss": 3.4362, + "step": 97555 + }, + { + "epoch": 6.628618018752547, + "grad_norm": 0.8071821928024292, + "learning_rate": 0.00017172000271776055, + "loss": 3.2802, + "step": 97560 + }, + { + "epoch": 6.6289577388232095, + "grad_norm": 0.7855571508407593, + "learning_rate": 0.00017167753770892786, + "loss": 3.2799, + "step": 97565 + }, + { + "epoch": 6.629297458893872, + "grad_norm": 0.9016988277435303, + "learning_rate": 0.00017163507270009514, + "loss": 3.3627, + "step": 97570 + }, + { + "epoch": 6.629637178964533, + "grad_norm": 1.0193394422531128, + "learning_rate": 0.0001715926076912624, + "loss": 3.4315, + "step": 97575 + }, + { + "epoch": 6.629976899035195, + "grad_norm": 0.8409348130226135, + "learning_rate": 0.0001715501426824297, + "loss": 3.5128, + "step": 97580 + }, + { + "epoch": 6.630316619105857, + "grad_norm": 1.2693926095962524, + "learning_rate": 0.00017150767767359695, + "loss": 3.2542, + "step": 97585 + }, + { + "epoch": 6.630656339176518, + "grad_norm": 2.1919076442718506, + "learning_rate": 0.00017146521266476423, + "loss": 3.2158, + "step": 97590 + }, + { + "epoch": 6.63099605924718, + "grad_norm": 0.7783676981925964, + "learning_rate": 0.0001714227476559315, + "loss": 3.5259, + "step": 97595 + }, + { + "epoch": 6.631335779317842, + "grad_norm": 0.7983313798904419, + "learning_rate": 0.0001713802826470988, + "loss": 3.4466, + "step": 97600 + }, + { + "epoch": 6.631675499388503, + "grad_norm": 1.0393949747085571, + "learning_rate": 0.0001713378176382661, + "loss": 3.7836, + "step": 97605 + }, + { + "epoch": 6.6320152194591655, + "grad_norm": 1.1082388162612915, + "learning_rate": 0.00017129535262943335, + "loss": 3.6338, + "step": 97610 + }, + { + "epoch": 6.632354939529828, + "grad_norm": 0.8994917869567871, + "learning_rate": 0.00017125288762060063, + "loss": 3.372, + "step": 97615 + }, + { + "epoch": 6.632694659600489, + "grad_norm": 0.8466517329216003, + "learning_rate": 0.0001712104226117679, + "loss": 3.1729, + "step": 97620 + }, + { + "epoch": 6.633034379671151, + "grad_norm": 1.1107527017593384, + "learning_rate": 0.0001711679576029352, + "loss": 3.1826, + "step": 97625 + }, + { + "epoch": 6.633374099741813, + "grad_norm": 0.7957565188407898, + "learning_rate": 0.00017112549259410244, + "loss": 3.2673, + "step": 97630 + }, + { + "epoch": 6.633713819812474, + "grad_norm": 0.829077422618866, + "learning_rate": 0.00017108302758526975, + "loss": 3.2907, + "step": 97635 + }, + { + "epoch": 6.634053539883136, + "grad_norm": 0.8060550093650818, + "learning_rate": 0.00017104056257643703, + "loss": 3.3674, + "step": 97640 + }, + { + "epoch": 6.634393259953798, + "grad_norm": 1.7067309617996216, + "learning_rate": 0.00017099809756760428, + "loss": 3.5444, + "step": 97645 + }, + { + "epoch": 6.634732980024459, + "grad_norm": 0.9251945614814758, + "learning_rate": 0.0001709556325587716, + "loss": 3.5088, + "step": 97650 + }, + { + "epoch": 6.6350727000951215, + "grad_norm": 0.747983455657959, + "learning_rate": 0.00017091316754993884, + "loss": 3.533, + "step": 97655 + }, + { + "epoch": 6.635412420165784, + "grad_norm": 0.7596400380134583, + "learning_rate": 0.00017087070254110612, + "loss": 3.5677, + "step": 97660 + }, + { + "epoch": 6.635752140236445, + "grad_norm": 0.7568084597587585, + "learning_rate": 0.00017082823753227343, + "loss": 3.1662, + "step": 97665 + }, + { + "epoch": 6.636091860307107, + "grad_norm": 0.9004007577896118, + "learning_rate": 0.00017078577252344068, + "loss": 3.4819, + "step": 97670 + }, + { + "epoch": 6.636431580377769, + "grad_norm": 0.8326224088668823, + "learning_rate": 0.00017074330751460796, + "loss": 3.7075, + "step": 97675 + }, + { + "epoch": 6.63677130044843, + "grad_norm": 0.8776364326477051, + "learning_rate": 0.00017070084250577524, + "loss": 3.4565, + "step": 97680 + }, + { + "epoch": 6.637111020519092, + "grad_norm": 1.0337562561035156, + "learning_rate": 0.00017065837749694252, + "loss": 3.159, + "step": 97685 + }, + { + "epoch": 6.637450740589754, + "grad_norm": 0.838013768196106, + "learning_rate": 0.0001706159124881098, + "loss": 3.6621, + "step": 97690 + }, + { + "epoch": 6.6377904606604154, + "grad_norm": 0.8837043046951294, + "learning_rate": 0.00017057344747927708, + "loss": 3.6236, + "step": 97695 + }, + { + "epoch": 6.6381301807310775, + "grad_norm": 1.016863226890564, + "learning_rate": 0.00017053098247044436, + "loss": 3.3677, + "step": 97700 + }, + { + "epoch": 6.63846990080174, + "grad_norm": 0.8041388988494873, + "learning_rate": 0.00017048851746161164, + "loss": 3.2512, + "step": 97705 + }, + { + "epoch": 6.638809620872401, + "grad_norm": 0.7744637727737427, + "learning_rate": 0.00017044605245277892, + "loss": 3.315, + "step": 97710 + }, + { + "epoch": 6.639149340943063, + "grad_norm": 0.7198867201805115, + "learning_rate": 0.00017040358744394618, + "loss": 3.5196, + "step": 97715 + }, + { + "epoch": 6.639489061013725, + "grad_norm": 0.8510762453079224, + "learning_rate": 0.00017036112243511348, + "loss": 3.4242, + "step": 97720 + }, + { + "epoch": 6.639828781084386, + "grad_norm": 0.8607950210571289, + "learning_rate": 0.00017031865742628074, + "loss": 3.2216, + "step": 97725 + }, + { + "epoch": 6.640168501155048, + "grad_norm": 0.8303046226501465, + "learning_rate": 0.00017027619241744802, + "loss": 3.4818, + "step": 97730 + }, + { + "epoch": 6.64050822122571, + "grad_norm": 1.4850255250930786, + "learning_rate": 0.00017023372740861532, + "loss": 3.4023, + "step": 97735 + }, + { + "epoch": 6.6408479412963715, + "grad_norm": 1.0447502136230469, + "learning_rate": 0.00017019126239978258, + "loss": 3.4453, + "step": 97740 + }, + { + "epoch": 6.6411876613670335, + "grad_norm": 1.1482064723968506, + "learning_rate": 0.00017014879739094986, + "loss": 3.5619, + "step": 97745 + }, + { + "epoch": 6.641527381437696, + "grad_norm": 3.958329916000366, + "learning_rate": 0.00017010633238211714, + "loss": 3.4568, + "step": 97750 + }, + { + "epoch": 6.641867101508357, + "grad_norm": 1.0906254053115845, + "learning_rate": 0.00017006386737328442, + "loss": 3.3501, + "step": 97755 + }, + { + "epoch": 6.642206821579019, + "grad_norm": 1.1116394996643066, + "learning_rate": 0.00017002140236445167, + "loss": 3.366, + "step": 97760 + }, + { + "epoch": 6.642546541649681, + "grad_norm": 0.9731511473655701, + "learning_rate": 0.00016997893735561898, + "loss": 3.5677, + "step": 97765 + }, + { + "epoch": 6.642886261720342, + "grad_norm": 0.8359944820404053, + "learning_rate": 0.00016993647234678626, + "loss": 3.2804, + "step": 97770 + }, + { + "epoch": 6.643225981791004, + "grad_norm": 0.9282190203666687, + "learning_rate": 0.00016989400733795354, + "loss": 3.3645, + "step": 97775 + }, + { + "epoch": 6.643565701861666, + "grad_norm": 0.8600487112998962, + "learning_rate": 0.00016985154232912082, + "loss": 3.2084, + "step": 97780 + }, + { + "epoch": 6.6439054219323275, + "grad_norm": 0.843222975730896, + "learning_rate": 0.00016980907732028807, + "loss": 3.552, + "step": 97785 + }, + { + "epoch": 6.6442451420029895, + "grad_norm": 1.0064126253128052, + "learning_rate": 0.00016976661231145538, + "loss": 3.4592, + "step": 97790 + }, + { + "epoch": 6.644584862073652, + "grad_norm": 0.8339983224868774, + "learning_rate": 0.00016972414730262263, + "loss": 3.3421, + "step": 97795 + }, + { + "epoch": 6.644924582144313, + "grad_norm": 1.0243897438049316, + "learning_rate": 0.0001696816822937899, + "loss": 3.2763, + "step": 97800 + }, + { + "epoch": 6.645264302214975, + "grad_norm": 1.051186442375183, + "learning_rate": 0.00016963921728495722, + "loss": 3.4548, + "step": 97805 + }, + { + "epoch": 6.645604022285637, + "grad_norm": 1.0996973514556885, + "learning_rate": 0.00016959675227612447, + "loss": 3.2904, + "step": 97810 + }, + { + "epoch": 6.645943742356298, + "grad_norm": 0.9041186571121216, + "learning_rate": 0.00016955428726729175, + "loss": 3.2799, + "step": 97815 + }, + { + "epoch": 6.64628346242696, + "grad_norm": 0.8120580315589905, + "learning_rate": 0.00016951182225845903, + "loss": 3.1828, + "step": 97820 + }, + { + "epoch": 6.646623182497622, + "grad_norm": 0.8602385520935059, + "learning_rate": 0.0001694693572496263, + "loss": 3.4435, + "step": 97825 + }, + { + "epoch": 6.6469629025682835, + "grad_norm": 1.0370663404464722, + "learning_rate": 0.0001694268922407936, + "loss": 3.2199, + "step": 97830 + }, + { + "epoch": 6.647302622638946, + "grad_norm": 0.957510769367218, + "learning_rate": 0.00016938442723196087, + "loss": 3.4996, + "step": 97835 + }, + { + "epoch": 6.647642342709608, + "grad_norm": 0.7957910299301147, + "learning_rate": 0.00016934196222312815, + "loss": 3.6426, + "step": 97840 + }, + { + "epoch": 6.647982062780269, + "grad_norm": 0.79746413230896, + "learning_rate": 0.0001692994972142954, + "loss": 3.3605, + "step": 97845 + }, + { + "epoch": 6.648321782850931, + "grad_norm": 0.9436831474304199, + "learning_rate": 0.0001692570322054627, + "loss": 3.4755, + "step": 97850 + }, + { + "epoch": 6.648661502921593, + "grad_norm": 0.9917492866516113, + "learning_rate": 0.00016921456719662996, + "loss": 3.1904, + "step": 97855 + }, + { + "epoch": 6.649001222992254, + "grad_norm": 0.9780381917953491, + "learning_rate": 0.00016917210218779727, + "loss": 3.5358, + "step": 97860 + }, + { + "epoch": 6.649340943062916, + "grad_norm": 0.9417676329612732, + "learning_rate": 0.00016912963717896455, + "loss": 3.55, + "step": 97865 + }, + { + "epoch": 6.649680663133578, + "grad_norm": 0.9698737859725952, + "learning_rate": 0.0001690871721701318, + "loss": 3.2392, + "step": 97870 + }, + { + "epoch": 6.6500203832042395, + "grad_norm": 0.8189160227775574, + "learning_rate": 0.0001690447071612991, + "loss": 3.1716, + "step": 97875 + }, + { + "epoch": 6.650360103274902, + "grad_norm": 0.9630095958709717, + "learning_rate": 0.00016900224215246637, + "loss": 3.2744, + "step": 97880 + }, + { + "epoch": 6.650699823345564, + "grad_norm": 0.8976494073867798, + "learning_rate": 0.00016895977714363365, + "loss": 3.5439, + "step": 97885 + }, + { + "epoch": 6.651039543416225, + "grad_norm": 0.998091995716095, + "learning_rate": 0.00016891731213480093, + "loss": 3.5922, + "step": 97890 + }, + { + "epoch": 6.651379263486887, + "grad_norm": 0.9102874994277954, + "learning_rate": 0.0001688748471259682, + "loss": 3.5268, + "step": 97895 + }, + { + "epoch": 6.651718983557549, + "grad_norm": 0.8651131391525269, + "learning_rate": 0.00016883238211713549, + "loss": 3.3522, + "step": 97900 + }, + { + "epoch": 6.65205870362821, + "grad_norm": 1.028525710105896, + "learning_rate": 0.00016878991710830277, + "loss": 3.3502, + "step": 97905 + }, + { + "epoch": 6.652398423698872, + "grad_norm": 0.8791601061820984, + "learning_rate": 0.00016874745209947005, + "loss": 3.4672, + "step": 97910 + }, + { + "epoch": 6.652738143769534, + "grad_norm": 0.8360100984573364, + "learning_rate": 0.0001687049870906373, + "loss": 3.2886, + "step": 97915 + }, + { + "epoch": 6.6530778638401955, + "grad_norm": 1.0295522212982178, + "learning_rate": 0.0001686625220818046, + "loss": 3.3638, + "step": 97920 + }, + { + "epoch": 6.653417583910858, + "grad_norm": 1.0001438856124878, + "learning_rate": 0.00016862005707297186, + "loss": 3.4761, + "step": 97925 + }, + { + "epoch": 6.65375730398152, + "grad_norm": 0.7576950192451477, + "learning_rate": 0.00016857759206413914, + "loss": 3.2794, + "step": 97930 + }, + { + "epoch": 6.654097024052181, + "grad_norm": 0.9330813884735107, + "learning_rate": 0.00016853512705530645, + "loss": 3.3691, + "step": 97935 + }, + { + "epoch": 6.654436744122843, + "grad_norm": 0.9808968901634216, + "learning_rate": 0.0001684926620464737, + "loss": 3.2473, + "step": 97940 + }, + { + "epoch": 6.654776464193505, + "grad_norm": 1.1326847076416016, + "learning_rate": 0.000168450197037641, + "loss": 3.4128, + "step": 97945 + }, + { + "epoch": 6.655116184264166, + "grad_norm": 1.053780436515808, + "learning_rate": 0.00016840773202880826, + "loss": 3.4635, + "step": 97950 + }, + { + "epoch": 6.655455904334828, + "grad_norm": 0.810038149356842, + "learning_rate": 0.00016836526701997554, + "loss": 3.4938, + "step": 97955 + }, + { + "epoch": 6.65579562440549, + "grad_norm": 0.7497294545173645, + "learning_rate": 0.00016832280201114285, + "loss": 3.0402, + "step": 97960 + }, + { + "epoch": 6.6561353444761515, + "grad_norm": 1.0224865674972534, + "learning_rate": 0.0001682803370023101, + "loss": 3.4778, + "step": 97965 + }, + { + "epoch": 6.656475064546814, + "grad_norm": 0.9350208044052124, + "learning_rate": 0.00016823787199347738, + "loss": 3.4913, + "step": 97970 + }, + { + "epoch": 6.656814784617476, + "grad_norm": 0.9633120894432068, + "learning_rate": 0.00016819540698464466, + "loss": 3.6574, + "step": 97975 + }, + { + "epoch": 6.657154504688137, + "grad_norm": 0.8217949271202087, + "learning_rate": 0.00016815294197581194, + "loss": 3.283, + "step": 97980 + }, + { + "epoch": 6.657494224758799, + "grad_norm": 0.7431110739707947, + "learning_rate": 0.0001681104769669792, + "loss": 3.3092, + "step": 97985 + }, + { + "epoch": 6.657833944829461, + "grad_norm": 0.7402657866477966, + "learning_rate": 0.0001680680119581465, + "loss": 3.578, + "step": 97990 + }, + { + "epoch": 6.658173664900122, + "grad_norm": 0.9713103771209717, + "learning_rate": 0.00016802554694931378, + "loss": 3.4513, + "step": 97995 + }, + { + "epoch": 6.658513384970784, + "grad_norm": 0.923119068145752, + "learning_rate": 0.00016798308194048103, + "loss": 3.3508, + "step": 98000 + }, + { + "epoch": 6.658853105041446, + "grad_norm": 0.9656590819358826, + "learning_rate": 0.00016794061693164834, + "loss": 3.4885, + "step": 98005 + }, + { + "epoch": 6.6591928251121075, + "grad_norm": 0.9947888255119324, + "learning_rate": 0.0001678981519228156, + "loss": 3.155, + "step": 98010 + }, + { + "epoch": 6.65953254518277, + "grad_norm": 0.8325534462928772, + "learning_rate": 0.00016785568691398287, + "loss": 3.6804, + "step": 98015 + }, + { + "epoch": 6.659872265253431, + "grad_norm": 1.080372929573059, + "learning_rate": 0.00016781322190515015, + "loss": 3.4088, + "step": 98020 + }, + { + "epoch": 6.660211985324093, + "grad_norm": 0.8472287654876709, + "learning_rate": 0.00016777075689631743, + "loss": 3.4001, + "step": 98025 + }, + { + "epoch": 6.660551705394755, + "grad_norm": 1.0463734865188599, + "learning_rate": 0.00016772829188748474, + "loss": 3.5897, + "step": 98030 + }, + { + "epoch": 6.660891425465416, + "grad_norm": 0.873298168182373, + "learning_rate": 0.000167685826878652, + "loss": 3.6807, + "step": 98035 + }, + { + "epoch": 6.661231145536078, + "grad_norm": 0.9479320049285889, + "learning_rate": 0.00016764336186981927, + "loss": 3.4274, + "step": 98040 + }, + { + "epoch": 6.66157086560674, + "grad_norm": 1.0318418741226196, + "learning_rate": 0.00016760089686098655, + "loss": 3.3632, + "step": 98045 + }, + { + "epoch": 6.6619105856774015, + "grad_norm": 0.8544460535049438, + "learning_rate": 0.00016755843185215383, + "loss": 3.3841, + "step": 98050 + }, + { + "epoch": 6.6622503057480635, + "grad_norm": 0.9721531867980957, + "learning_rate": 0.0001675159668433211, + "loss": 3.5218, + "step": 98055 + }, + { + "epoch": 6.662590025818726, + "grad_norm": 1.4163718223571777, + "learning_rate": 0.0001674735018344884, + "loss": 3.465, + "step": 98060 + }, + { + "epoch": 6.662929745889387, + "grad_norm": 0.9289585947990417, + "learning_rate": 0.00016743103682565567, + "loss": 3.1837, + "step": 98065 + }, + { + "epoch": 6.663269465960049, + "grad_norm": 0.838818371295929, + "learning_rate": 0.00016738857181682293, + "loss": 3.3745, + "step": 98070 + }, + { + "epoch": 6.663609186030711, + "grad_norm": 0.9752063751220703, + "learning_rate": 0.00016734610680799023, + "loss": 3.4853, + "step": 98075 + }, + { + "epoch": 6.663948906101372, + "grad_norm": 1.3361579179763794, + "learning_rate": 0.0001673036417991575, + "loss": 3.2289, + "step": 98080 + }, + { + "epoch": 6.664288626172034, + "grad_norm": 1.192015528678894, + "learning_rate": 0.00016726117679032477, + "loss": 3.3583, + "step": 98085 + }, + { + "epoch": 6.664628346242696, + "grad_norm": 0.8683301210403442, + "learning_rate": 0.00016721871178149207, + "loss": 3.4214, + "step": 98090 + }, + { + "epoch": 6.6649680663133575, + "grad_norm": 0.9193955063819885, + "learning_rate": 0.00016717624677265933, + "loss": 3.3772, + "step": 98095 + }, + { + "epoch": 6.6653077863840196, + "grad_norm": 0.7600099444389343, + "learning_rate": 0.0001671337817638266, + "loss": 3.2806, + "step": 98100 + }, + { + "epoch": 6.665647506454682, + "grad_norm": 0.8559929132461548, + "learning_rate": 0.0001670913167549939, + "loss": 3.3906, + "step": 98105 + }, + { + "epoch": 6.665987226525343, + "grad_norm": 0.8463395833969116, + "learning_rate": 0.00016704885174616117, + "loss": 3.1453, + "step": 98110 + }, + { + "epoch": 6.666326946596005, + "grad_norm": 0.7919742465019226, + "learning_rate": 0.00016700638673732845, + "loss": 3.4062, + "step": 98115 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.1364693641662598, + "learning_rate": 0.00016696392172849573, + "loss": 3.6269, + "step": 98120 + }, + { + "epoch": 6.667006386737328, + "grad_norm": 0.7206830382347107, + "learning_rate": 0.000166921456719663, + "loss": 3.3562, + "step": 98125 + }, + { + "epoch": 6.66734610680799, + "grad_norm": 0.7655106782913208, + "learning_rate": 0.0001668789917108303, + "loss": 3.5453, + "step": 98130 + }, + { + "epoch": 6.667685826878652, + "grad_norm": 0.8620410561561584, + "learning_rate": 0.00016683652670199757, + "loss": 3.362, + "step": 98135 + }, + { + "epoch": 6.6680255469493135, + "grad_norm": 0.9144266843795776, + "learning_rate": 0.00016679406169316482, + "loss": 3.2651, + "step": 98140 + }, + { + "epoch": 6.668365267019976, + "grad_norm": 0.9132775664329529, + "learning_rate": 0.00016675159668433213, + "loss": 3.3852, + "step": 98145 + }, + { + "epoch": 6.668704987090638, + "grad_norm": 1.1079875230789185, + "learning_rate": 0.00016670913167549938, + "loss": 3.6245, + "step": 98150 + }, + { + "epoch": 6.669044707161299, + "grad_norm": 0.968901515007019, + "learning_rate": 0.00016666666666666666, + "loss": 3.3746, + "step": 98155 + }, + { + "epoch": 6.669384427231961, + "grad_norm": 1.1180707216262817, + "learning_rate": 0.00016662420165783397, + "loss": 3.4171, + "step": 98160 + }, + { + "epoch": 6.669724147302623, + "grad_norm": 0.92876136302948, + "learning_rate": 0.00016658173664900122, + "loss": 3.2494, + "step": 98165 + }, + { + "epoch": 6.670063867373284, + "grad_norm": 0.7565241456031799, + "learning_rate": 0.0001665392716401685, + "loss": 3.4942, + "step": 98170 + }, + { + "epoch": 6.670403587443946, + "grad_norm": 1.056593418121338, + "learning_rate": 0.00016649680663133578, + "loss": 3.3125, + "step": 98175 + }, + { + "epoch": 6.670743307514608, + "grad_norm": 1.1373618841171265, + "learning_rate": 0.00016645434162250306, + "loss": 3.6168, + "step": 98180 + }, + { + "epoch": 6.6710830275852695, + "grad_norm": 1.1004126071929932, + "learning_rate": 0.00016641187661367031, + "loss": 3.3534, + "step": 98185 + }, + { + "epoch": 6.671422747655932, + "grad_norm": 0.908135175704956, + "learning_rate": 0.00016636941160483762, + "loss": 3.5142, + "step": 98190 + }, + { + "epoch": 6.671762467726594, + "grad_norm": 0.8068974614143372, + "learning_rate": 0.0001663269465960049, + "loss": 3.3623, + "step": 98195 + }, + { + "epoch": 6.672102187797255, + "grad_norm": 0.8736423254013062, + "learning_rate": 0.00016628448158717218, + "loss": 3.4383, + "step": 98200 + }, + { + "epoch": 6.672441907867917, + "grad_norm": 0.7805429697036743, + "learning_rate": 0.00016624201657833946, + "loss": 3.3132, + "step": 98205 + }, + { + "epoch": 6.672781627938579, + "grad_norm": 1.2048476934432983, + "learning_rate": 0.00016619955156950671, + "loss": 3.6073, + "step": 98210 + }, + { + "epoch": 6.67312134800924, + "grad_norm": 0.9551158547401428, + "learning_rate": 0.00016615708656067402, + "loss": 3.2659, + "step": 98215 + }, + { + "epoch": 6.673461068079902, + "grad_norm": 0.8667959570884705, + "learning_rate": 0.00016611462155184127, + "loss": 3.4866, + "step": 98220 + }, + { + "epoch": 6.673800788150564, + "grad_norm": 0.8548638224601746, + "learning_rate": 0.00016607215654300856, + "loss": 3.185, + "step": 98225 + }, + { + "epoch": 6.6741405082212255, + "grad_norm": 0.9422394633293152, + "learning_rate": 0.00016602969153417586, + "loss": 3.2962, + "step": 98230 + }, + { + "epoch": 6.674480228291888, + "grad_norm": 1.4089834690093994, + "learning_rate": 0.00016598722652534312, + "loss": 3.5131, + "step": 98235 + }, + { + "epoch": 6.674819948362549, + "grad_norm": 1.1290463209152222, + "learning_rate": 0.0001659447615165104, + "loss": 3.159, + "step": 98240 + }, + { + "epoch": 6.675159668433211, + "grad_norm": 0.7826477289199829, + "learning_rate": 0.00016590229650767768, + "loss": 3.4939, + "step": 98245 + }, + { + "epoch": 6.675499388503873, + "grad_norm": 1.0197573900222778, + "learning_rate": 0.00016585983149884496, + "loss": 3.5746, + "step": 98250 + }, + { + "epoch": 6.675839108574534, + "grad_norm": 1.043168306350708, + "learning_rate": 0.00016581736649001224, + "loss": 3.0077, + "step": 98255 + }, + { + "epoch": 6.676178828645196, + "grad_norm": 0.8759874105453491, + "learning_rate": 0.00016577490148117952, + "loss": 3.4576, + "step": 98260 + }, + { + "epoch": 6.676518548715858, + "grad_norm": 2.3850486278533936, + "learning_rate": 0.0001657324364723468, + "loss": 3.3285, + "step": 98265 + }, + { + "epoch": 6.6768582687865194, + "grad_norm": 0.8540140390396118, + "learning_rate": 0.00016568997146351405, + "loss": 3.2631, + "step": 98270 + }, + { + "epoch": 6.6771979888571815, + "grad_norm": 1.0398250818252563, + "learning_rate": 0.00016564750645468136, + "loss": 3.1429, + "step": 98275 + }, + { + "epoch": 6.677537708927844, + "grad_norm": 0.6876682043075562, + "learning_rate": 0.0001656050414458486, + "loss": 3.5349, + "step": 98280 + }, + { + "epoch": 6.677877428998505, + "grad_norm": 0.7170277833938599, + "learning_rate": 0.00016556257643701592, + "loss": 3.6081, + "step": 98285 + }, + { + "epoch": 6.678217149069167, + "grad_norm": 1.0126115083694458, + "learning_rate": 0.0001655201114281832, + "loss": 3.4819, + "step": 98290 + }, + { + "epoch": 6.678556869139829, + "grad_norm": 1.089142084121704, + "learning_rate": 0.00016547764641935045, + "loss": 3.3609, + "step": 98295 + }, + { + "epoch": 6.67889658921049, + "grad_norm": 0.8695634603500366, + "learning_rate": 0.00016543518141051776, + "loss": 3.5378, + "step": 98300 + }, + { + "epoch": 6.679236309281152, + "grad_norm": 0.8295530080795288, + "learning_rate": 0.000165392716401685, + "loss": 3.5459, + "step": 98305 + }, + { + "epoch": 6.679576029351814, + "grad_norm": 0.8194022178649902, + "learning_rate": 0.0001653502513928523, + "loss": 3.291, + "step": 98310 + }, + { + "epoch": 6.6799157494224755, + "grad_norm": 0.7229406833648682, + "learning_rate": 0.00016530778638401957, + "loss": 3.5842, + "step": 98315 + }, + { + "epoch": 6.6802554694931375, + "grad_norm": 0.9167991280555725, + "learning_rate": 0.00016526532137518685, + "loss": 3.6547, + "step": 98320 + }, + { + "epoch": 6.6805951895638, + "grad_norm": 0.9328945875167847, + "learning_rate": 0.00016522285636635413, + "loss": 3.3491, + "step": 98325 + }, + { + "epoch": 6.680934909634461, + "grad_norm": 0.8952375650405884, + "learning_rate": 0.0001651803913575214, + "loss": 3.554, + "step": 98330 + }, + { + "epoch": 6.681274629705123, + "grad_norm": 1.1260261535644531, + "learning_rate": 0.0001651379263486887, + "loss": 3.2917, + "step": 98335 + }, + { + "epoch": 6.681614349775785, + "grad_norm": 0.7306327223777771, + "learning_rate": 0.00016509546133985594, + "loss": 3.5961, + "step": 98340 + }, + { + "epoch": 6.681954069846446, + "grad_norm": 0.8760716915130615, + "learning_rate": 0.00016505299633102325, + "loss": 3.5669, + "step": 98345 + }, + { + "epoch": 6.682293789917108, + "grad_norm": 1.0379985570907593, + "learning_rate": 0.0001650105313221905, + "loss": 3.4443, + "step": 98350 + }, + { + "epoch": 6.68263350998777, + "grad_norm": 0.8523532748222351, + "learning_rate": 0.00016496806631335778, + "loss": 3.5515, + "step": 98355 + }, + { + "epoch": 6.6829732300584315, + "grad_norm": 1.3686655759811401, + "learning_rate": 0.0001649256013045251, + "loss": 3.5671, + "step": 98360 + }, + { + "epoch": 6.6833129501290935, + "grad_norm": 1.258134126663208, + "learning_rate": 0.00016488313629569234, + "loss": 3.4683, + "step": 98365 + }, + { + "epoch": 6.683652670199756, + "grad_norm": 0.7801007628440857, + "learning_rate": 0.00016484067128685965, + "loss": 3.1704, + "step": 98370 + }, + { + "epoch": 6.683992390270417, + "grad_norm": 0.9494209289550781, + "learning_rate": 0.0001647982062780269, + "loss": 3.232, + "step": 98375 + }, + { + "epoch": 6.684332110341079, + "grad_norm": 1.0138437747955322, + "learning_rate": 0.00016475574126919418, + "loss": 3.4274, + "step": 98380 + }, + { + "epoch": 6.684671830411741, + "grad_norm": 1.124443769454956, + "learning_rate": 0.0001647132762603615, + "loss": 3.6709, + "step": 98385 + }, + { + "epoch": 6.685011550482402, + "grad_norm": 1.0498533248901367, + "learning_rate": 0.00016467081125152874, + "loss": 3.2556, + "step": 98390 + }, + { + "epoch": 6.685351270553064, + "grad_norm": 0.821378767490387, + "learning_rate": 0.00016462834624269602, + "loss": 3.6292, + "step": 98395 + }, + { + "epoch": 6.685690990623726, + "grad_norm": 1.2545686960220337, + "learning_rate": 0.0001645858812338633, + "loss": 3.3159, + "step": 98400 + }, + { + "epoch": 6.6860307106943875, + "grad_norm": 1.0100566148757935, + "learning_rate": 0.00016454341622503058, + "loss": 3.5368, + "step": 98405 + }, + { + "epoch": 6.6863704307650496, + "grad_norm": 0.7250462770462036, + "learning_rate": 0.00016450095121619784, + "loss": 3.249, + "step": 98410 + }, + { + "epoch": 6.686710150835712, + "grad_norm": 1.3126660585403442, + "learning_rate": 0.00016445848620736514, + "loss": 3.4612, + "step": 98415 + }, + { + "epoch": 6.687049870906373, + "grad_norm": 0.7628910541534424, + "learning_rate": 0.00016441602119853242, + "loss": 3.2679, + "step": 98420 + }, + { + "epoch": 6.687389590977035, + "grad_norm": 0.7969615459442139, + "learning_rate": 0.00016437355618969968, + "loss": 3.3995, + "step": 98425 + }, + { + "epoch": 6.687729311047697, + "grad_norm": 0.9647709131240845, + "learning_rate": 0.00016433109118086698, + "loss": 3.1991, + "step": 98430 + }, + { + "epoch": 6.688069031118358, + "grad_norm": 0.9542722702026367, + "learning_rate": 0.00016428862617203424, + "loss": 3.404, + "step": 98435 + }, + { + "epoch": 6.68840875118902, + "grad_norm": 0.8979702591896057, + "learning_rate": 0.00016424616116320152, + "loss": 3.3823, + "step": 98440 + }, + { + "epoch": 6.688748471259682, + "grad_norm": 0.8935280442237854, + "learning_rate": 0.0001642036961543688, + "loss": 3.4111, + "step": 98445 + }, + { + "epoch": 6.6890881913303435, + "grad_norm": 0.8110547661781311, + "learning_rate": 0.00016416123114553608, + "loss": 3.8337, + "step": 98450 + }, + { + "epoch": 6.689427911401006, + "grad_norm": 0.8435448408126831, + "learning_rate": 0.00016411876613670338, + "loss": 3.3816, + "step": 98455 + }, + { + "epoch": 6.689767631471668, + "grad_norm": 0.8521839380264282, + "learning_rate": 0.00016407630112787064, + "loss": 3.547, + "step": 98460 + }, + { + "epoch": 6.690107351542329, + "grad_norm": 0.9286388754844666, + "learning_rate": 0.00016403383611903792, + "loss": 3.436, + "step": 98465 + }, + { + "epoch": 6.690447071612991, + "grad_norm": 0.7298128008842468, + "learning_rate": 0.0001639913711102052, + "loss": 3.212, + "step": 98470 + }, + { + "epoch": 6.690786791683653, + "grad_norm": 0.7352144718170166, + "learning_rate": 0.00016394890610137248, + "loss": 3.4235, + "step": 98475 + }, + { + "epoch": 6.691126511754314, + "grad_norm": 0.855454683303833, + "learning_rate": 0.00016390644109253973, + "loss": 3.4705, + "step": 98480 + }, + { + "epoch": 6.691466231824976, + "grad_norm": 0.7420979738235474, + "learning_rate": 0.00016386397608370704, + "loss": 3.6049, + "step": 98485 + }, + { + "epoch": 6.691805951895638, + "grad_norm": 2.581716775894165, + "learning_rate": 0.00016382151107487432, + "loss": 3.2784, + "step": 98490 + }, + { + "epoch": 6.6921456719662995, + "grad_norm": 0.8522012829780579, + "learning_rate": 0.00016377904606604157, + "loss": 3.4109, + "step": 98495 + }, + { + "epoch": 6.692485392036962, + "grad_norm": 1.1210006475448608, + "learning_rate": 0.00016373658105720888, + "loss": 3.561, + "step": 98500 + }, + { + "epoch": 6.692825112107624, + "grad_norm": 1.0153539180755615, + "learning_rate": 0.00016369411604837613, + "loss": 3.1609, + "step": 98505 + }, + { + "epoch": 6.693164832178285, + "grad_norm": 1.0870707035064697, + "learning_rate": 0.0001636516510395434, + "loss": 3.4721, + "step": 98510 + }, + { + "epoch": 6.693504552248947, + "grad_norm": 0.9687951803207397, + "learning_rate": 0.00016360918603071072, + "loss": 3.4453, + "step": 98515 + }, + { + "epoch": 6.693844272319609, + "grad_norm": 0.9883599877357483, + "learning_rate": 0.00016356672102187797, + "loss": 3.1629, + "step": 98520 + }, + { + "epoch": 6.69418399239027, + "grad_norm": 0.8377922177314758, + "learning_rate": 0.00016352425601304525, + "loss": 3.0542, + "step": 98525 + }, + { + "epoch": 6.694523712460932, + "grad_norm": 0.9856077432632446, + "learning_rate": 0.00016348179100421253, + "loss": 3.4726, + "step": 98530 + }, + { + "epoch": 6.694863432531594, + "grad_norm": 0.8836296796798706, + "learning_rate": 0.0001634393259953798, + "loss": 3.3039, + "step": 98535 + }, + { + "epoch": 6.6952031526022555, + "grad_norm": 0.955392599105835, + "learning_rate": 0.0001633968609865471, + "loss": 3.4081, + "step": 98540 + }, + { + "epoch": 6.695542872672918, + "grad_norm": 0.7265942692756653, + "learning_rate": 0.00016335439597771437, + "loss": 3.5816, + "step": 98545 + }, + { + "epoch": 6.69588259274358, + "grad_norm": 1.0213125944137573, + "learning_rate": 0.00016331193096888165, + "loss": 3.4554, + "step": 98550 + }, + { + "epoch": 6.696222312814241, + "grad_norm": 1.2002778053283691, + "learning_rate": 0.00016326946596004893, + "loss": 3.3618, + "step": 98555 + }, + { + "epoch": 6.696562032884903, + "grad_norm": 0.9647334218025208, + "learning_rate": 0.0001632270009512162, + "loss": 3.4863, + "step": 98560 + }, + { + "epoch": 6.696901752955565, + "grad_norm": 0.9818304181098938, + "learning_rate": 0.00016318453594238346, + "loss": 3.3946, + "step": 98565 + }, + { + "epoch": 6.697241473026226, + "grad_norm": 0.9457587599754333, + "learning_rate": 0.00016314207093355077, + "loss": 3.3338, + "step": 98570 + }, + { + "epoch": 6.697581193096888, + "grad_norm": 0.8006077408790588, + "learning_rate": 0.00016309960592471802, + "loss": 3.2942, + "step": 98575 + }, + { + "epoch": 6.69792091316755, + "grad_norm": 1.1933196783065796, + "learning_rate": 0.0001630571409158853, + "loss": 3.377, + "step": 98580 + }, + { + "epoch": 6.6982606332382115, + "grad_norm": 0.793740451335907, + "learning_rate": 0.0001630146759070526, + "loss": 3.2172, + "step": 98585 + }, + { + "epoch": 6.698600353308874, + "grad_norm": 0.9804584980010986, + "learning_rate": 0.00016297221089821987, + "loss": 3.2577, + "step": 98590 + }, + { + "epoch": 6.698940073379536, + "grad_norm": 0.8064772486686707, + "learning_rate": 0.00016292974588938715, + "loss": 3.1375, + "step": 98595 + }, + { + "epoch": 6.699279793450197, + "grad_norm": 1.0115559101104736, + "learning_rate": 0.00016288728088055443, + "loss": 3.4619, + "step": 98600 + }, + { + "epoch": 6.699619513520859, + "grad_norm": 0.959906280040741, + "learning_rate": 0.0001628448158717217, + "loss": 3.5712, + "step": 98605 + }, + { + "epoch": 6.699959233591521, + "grad_norm": 1.2197072505950928, + "learning_rate": 0.00016280235086288896, + "loss": 3.3566, + "step": 98610 + }, + { + "epoch": 6.700298953662182, + "grad_norm": 0.8607329726219177, + "learning_rate": 0.00016275988585405627, + "loss": 3.2707, + "step": 98615 + }, + { + "epoch": 6.700638673732844, + "grad_norm": 1.0839892625808716, + "learning_rate": 0.00016271742084522355, + "loss": 3.478, + "step": 98620 + }, + { + "epoch": 6.700978393803506, + "grad_norm": 0.9005405306816101, + "learning_rate": 0.00016267495583639083, + "loss": 3.3365, + "step": 98625 + }, + { + "epoch": 6.7013181138741675, + "grad_norm": 0.8043229579925537, + "learning_rate": 0.0001626324908275581, + "loss": 3.2992, + "step": 98630 + }, + { + "epoch": 6.70165783394483, + "grad_norm": 0.918701171875, + "learning_rate": 0.00016259002581872536, + "loss": 3.4053, + "step": 98635 + }, + { + "epoch": 6.701997554015492, + "grad_norm": 1.1564093828201294, + "learning_rate": 0.00016254756080989267, + "loss": 3.6604, + "step": 98640 + }, + { + "epoch": 6.702337274086153, + "grad_norm": 0.9652135372161865, + "learning_rate": 0.00016250509580105992, + "loss": 3.3474, + "step": 98645 + }, + { + "epoch": 6.702676994156815, + "grad_norm": 0.8718267679214478, + "learning_rate": 0.0001624626307922272, + "loss": 3.2753, + "step": 98650 + }, + { + "epoch": 6.703016714227477, + "grad_norm": 0.9352936744689941, + "learning_rate": 0.00016242865878516102, + "loss": 3.2372, + "step": 98655 + }, + { + "epoch": 6.703356434298138, + "grad_norm": 0.7798987627029419, + "learning_rate": 0.00016238619377632833, + "loss": 3.4495, + "step": 98660 + }, + { + "epoch": 6.7036961543688, + "grad_norm": 0.8886719942092896, + "learning_rate": 0.00016234372876749558, + "loss": 3.428, + "step": 98665 + }, + { + "epoch": 6.704035874439462, + "grad_norm": 0.8472762107849121, + "learning_rate": 0.00016230126375866286, + "loss": 3.3834, + "step": 98670 + }, + { + "epoch": 6.7043755945101235, + "grad_norm": 0.8518813848495483, + "learning_rate": 0.00016225879874983014, + "loss": 3.3123, + "step": 98675 + }, + { + "epoch": 6.704715314580786, + "grad_norm": 0.8442111611366272, + "learning_rate": 0.00016221633374099742, + "loss": 3.3555, + "step": 98680 + }, + { + "epoch": 6.705055034651448, + "grad_norm": 0.7006939053535461, + "learning_rate": 0.0001621738687321647, + "loss": 3.4006, + "step": 98685 + }, + { + "epoch": 6.705394754722109, + "grad_norm": 0.9155606031417847, + "learning_rate": 0.00016213140372333198, + "loss": 3.325, + "step": 98690 + }, + { + "epoch": 6.705734474792771, + "grad_norm": 1.041456937789917, + "learning_rate": 0.00016208893871449926, + "loss": 3.478, + "step": 98695 + }, + { + "epoch": 6.706074194863432, + "grad_norm": 1.2873154878616333, + "learning_rate": 0.00016204647370566652, + "loss": 3.3568, + "step": 98700 + }, + { + "epoch": 6.706413914934094, + "grad_norm": 1.0741592645645142, + "learning_rate": 0.00016200400869683382, + "loss": 3.453, + "step": 98705 + }, + { + "epoch": 6.706753635004756, + "grad_norm": 0.7925326824188232, + "learning_rate": 0.00016196154368800108, + "loss": 3.4709, + "step": 98710 + }, + { + "epoch": 6.7070933550754175, + "grad_norm": 1.1573970317840576, + "learning_rate": 0.00016191907867916836, + "loss": 3.4495, + "step": 98715 + }, + { + "epoch": 6.70743307514608, + "grad_norm": 0.8025238513946533, + "learning_rate": 0.00016187661367033566, + "loss": 3.405, + "step": 98720 + }, + { + "epoch": 6.707772795216742, + "grad_norm": 0.9312659502029419, + "learning_rate": 0.00016183414866150292, + "loss": 3.3855, + "step": 98725 + }, + { + "epoch": 6.708112515287403, + "grad_norm": 1.0057957172393799, + "learning_rate": 0.0001617916836526702, + "loss": 3.4515, + "step": 98730 + }, + { + "epoch": 6.708452235358065, + "grad_norm": 0.9164432883262634, + "learning_rate": 0.00016174921864383748, + "loss": 3.3262, + "step": 98735 + }, + { + "epoch": 6.708791955428727, + "grad_norm": 1.068711280822754, + "learning_rate": 0.00016170675363500476, + "loss": 3.2586, + "step": 98740 + }, + { + "epoch": 6.709131675499388, + "grad_norm": 0.7593467235565186, + "learning_rate": 0.00016166428862617206, + "loss": 3.4467, + "step": 98745 + }, + { + "epoch": 6.70947139557005, + "grad_norm": 0.8433524966239929, + "learning_rate": 0.00016162182361733932, + "loss": 3.4562, + "step": 98750 + }, + { + "epoch": 6.709811115640712, + "grad_norm": 0.7758840322494507, + "learning_rate": 0.0001615793586085066, + "loss": 3.7652, + "step": 98755 + }, + { + "epoch": 6.7101508357113735, + "grad_norm": 0.9563224911689758, + "learning_rate": 0.00016153689359967388, + "loss": 3.5482, + "step": 98760 + }, + { + "epoch": 6.710490555782036, + "grad_norm": 0.8923773765563965, + "learning_rate": 0.00016149442859084116, + "loss": 3.7013, + "step": 98765 + }, + { + "epoch": 6.710830275852698, + "grad_norm": 0.816126823425293, + "learning_rate": 0.0001614519635820084, + "loss": 3.595, + "step": 98770 + }, + { + "epoch": 6.711169995923359, + "grad_norm": 0.9694762825965881, + "learning_rate": 0.00016140949857317572, + "loss": 3.314, + "step": 98775 + }, + { + "epoch": 6.711509715994021, + "grad_norm": 0.9916516542434692, + "learning_rate": 0.000161367033564343, + "loss": 3.4855, + "step": 98780 + }, + { + "epoch": 6.711849436064683, + "grad_norm": 0.9196588397026062, + "learning_rate": 0.00016132456855551025, + "loss": 3.363, + "step": 98785 + }, + { + "epoch": 6.712189156135344, + "grad_norm": 1.0423885583877563, + "learning_rate": 0.00016128210354667756, + "loss": 3.4202, + "step": 98790 + }, + { + "epoch": 6.712528876206006, + "grad_norm": 0.824612557888031, + "learning_rate": 0.0001612396385378448, + "loss": 3.4479, + "step": 98795 + }, + { + "epoch": 6.712868596276668, + "grad_norm": 0.9485159516334534, + "learning_rate": 0.0001611971735290121, + "loss": 3.3223, + "step": 98800 + }, + { + "epoch": 6.7132083163473295, + "grad_norm": 0.8723904490470886, + "learning_rate": 0.00016115470852017937, + "loss": 3.365, + "step": 98805 + }, + { + "epoch": 6.713548036417992, + "grad_norm": 1.1419036388397217, + "learning_rate": 0.00016111224351134665, + "loss": 3.4399, + "step": 98810 + }, + { + "epoch": 6.713887756488654, + "grad_norm": 0.9376690983772278, + "learning_rate": 0.00016106977850251393, + "loss": 3.4114, + "step": 98815 + }, + { + "epoch": 6.714227476559315, + "grad_norm": 0.9508131146430969, + "learning_rate": 0.0001610273134936812, + "loss": 3.4999, + "step": 98820 + }, + { + "epoch": 6.714567196629977, + "grad_norm": 1.5072084665298462, + "learning_rate": 0.0001609848484848485, + "loss": 3.4162, + "step": 98825 + }, + { + "epoch": 6.714906916700639, + "grad_norm": 0.7700008153915405, + "learning_rate": 0.00016094238347601577, + "loss": 3.2982, + "step": 98830 + }, + { + "epoch": 6.7152466367713, + "grad_norm": 0.9153805375099182, + "learning_rate": 0.00016089991846718305, + "loss": 3.4555, + "step": 98835 + }, + { + "epoch": 6.715586356841962, + "grad_norm": 1.0082863569259644, + "learning_rate": 0.0001608574534583503, + "loss": 3.5111, + "step": 98840 + }, + { + "epoch": 6.715926076912624, + "grad_norm": 1.0456372499465942, + "learning_rate": 0.0001608149884495176, + "loss": 3.0508, + "step": 98845 + }, + { + "epoch": 6.7162657969832855, + "grad_norm": 1.2236535549163818, + "learning_rate": 0.0001607725234406849, + "loss": 3.2075, + "step": 98850 + }, + { + "epoch": 6.716605517053948, + "grad_norm": 1.4232794046401978, + "learning_rate": 0.00016073005843185214, + "loss": 3.4274, + "step": 98855 + }, + { + "epoch": 6.71694523712461, + "grad_norm": 0.8002721667289734, + "learning_rate": 0.00016068759342301945, + "loss": 3.2837, + "step": 98860 + }, + { + "epoch": 6.717284957195271, + "grad_norm": 0.7814368605613708, + "learning_rate": 0.0001606451284141867, + "loss": 3.6857, + "step": 98865 + }, + { + "epoch": 6.717624677265933, + "grad_norm": 0.8337963819503784, + "learning_rate": 0.00016060266340535399, + "loss": 3.5538, + "step": 98870 + }, + { + "epoch": 6.717964397336595, + "grad_norm": 0.7850040197372437, + "learning_rate": 0.00016056019839652127, + "loss": 3.2849, + "step": 98875 + }, + { + "epoch": 6.718304117407256, + "grad_norm": 1.0921510457992554, + "learning_rate": 0.00016051773338768855, + "loss": 3.6373, + "step": 98880 + }, + { + "epoch": 6.718643837477918, + "grad_norm": 0.8511355519294739, + "learning_rate": 0.00016047526837885583, + "loss": 3.25, + "step": 98885 + }, + { + "epoch": 6.71898355754858, + "grad_norm": 0.9531211853027344, + "learning_rate": 0.0001604328033700231, + "loss": 3.37, + "step": 98890 + }, + { + "epoch": 6.7193232776192415, + "grad_norm": 2.1028852462768555, + "learning_rate": 0.00016039033836119039, + "loss": 3.2028, + "step": 98895 + }, + { + "epoch": 6.719662997689904, + "grad_norm": 0.8577432632446289, + "learning_rate": 0.00016034787335235764, + "loss": 3.3154, + "step": 98900 + }, + { + "epoch": 6.720002717760566, + "grad_norm": 0.7837623357772827, + "learning_rate": 0.00016030540834352495, + "loss": 3.3513, + "step": 98905 + }, + { + "epoch": 6.720342437831227, + "grad_norm": 0.7990435361862183, + "learning_rate": 0.00016026294333469223, + "loss": 3.5132, + "step": 98910 + }, + { + "epoch": 6.720682157901889, + "grad_norm": 1.0887930393218994, + "learning_rate": 0.0001602204783258595, + "loss": 3.3267, + "step": 98915 + }, + { + "epoch": 6.72102187797255, + "grad_norm": 0.7607721090316772, + "learning_rate": 0.00016017801331702679, + "loss": 3.4977, + "step": 98920 + }, + { + "epoch": 6.721361598043212, + "grad_norm": 0.7968986630439758, + "learning_rate": 0.00016013554830819404, + "loss": 3.5199, + "step": 98925 + }, + { + "epoch": 6.721701318113874, + "grad_norm": 0.733620285987854, + "learning_rate": 0.00016009308329936135, + "loss": 3.5179, + "step": 98930 + }, + { + "epoch": 6.7220410381845355, + "grad_norm": 0.9957525134086609, + "learning_rate": 0.0001600506182905286, + "loss": 3.5692, + "step": 98935 + }, + { + "epoch": 6.7223807582551975, + "grad_norm": 0.9561794400215149, + "learning_rate": 0.00016000815328169588, + "loss": 3.4537, + "step": 98940 + }, + { + "epoch": 6.72272047832586, + "grad_norm": 0.806732714176178, + "learning_rate": 0.00015996568827286319, + "loss": 3.3909, + "step": 98945 + }, + { + "epoch": 6.723060198396521, + "grad_norm": 0.9083614349365234, + "learning_rate": 0.00015992322326403044, + "loss": 3.4523, + "step": 98950 + }, + { + "epoch": 6.723399918467183, + "grad_norm": 0.8041409850120544, + "learning_rate": 0.00015988075825519772, + "loss": 3.4779, + "step": 98955 + }, + { + "epoch": 6.723739638537845, + "grad_norm": 0.9328835606575012, + "learning_rate": 0.000159838293246365, + "loss": 3.3637, + "step": 98960 + }, + { + "epoch": 6.724079358608506, + "grad_norm": 0.8063403367996216, + "learning_rate": 0.00015979582823753228, + "loss": 3.6228, + "step": 98965 + }, + { + "epoch": 6.724419078679168, + "grad_norm": 1.092283010482788, + "learning_rate": 0.00015975336322869953, + "loss": 3.471, + "step": 98970 + }, + { + "epoch": 6.72475879874983, + "grad_norm": 0.9127610325813293, + "learning_rate": 0.00015971089821986684, + "loss": 3.21, + "step": 98975 + }, + { + "epoch": 6.7250985188204915, + "grad_norm": 0.9555726647377014, + "learning_rate": 0.00015966843321103412, + "loss": 3.4693, + "step": 98980 + }, + { + "epoch": 6.7254382388911536, + "grad_norm": 0.7800223231315613, + "learning_rate": 0.00015962596820220137, + "loss": 3.3989, + "step": 98985 + }, + { + "epoch": 6.725777958961816, + "grad_norm": 1.1895684003829956, + "learning_rate": 0.00015958350319336868, + "loss": 3.6315, + "step": 98990 + }, + { + "epoch": 6.726117679032477, + "grad_norm": 0.9006454348564148, + "learning_rate": 0.00015954103818453593, + "loss": 3.5911, + "step": 98995 + }, + { + "epoch": 6.726457399103139, + "grad_norm": 0.8103139400482178, + "learning_rate": 0.00015949857317570324, + "loss": 3.4194, + "step": 99000 + }, + { + "epoch": 6.726797119173801, + "grad_norm": 0.7655391097068787, + "learning_rate": 0.0001594561081668705, + "loss": 3.6028, + "step": 99005 + }, + { + "epoch": 6.727136839244462, + "grad_norm": 0.9657906293869019, + "learning_rate": 0.00015941364315803777, + "loss": 3.2092, + "step": 99010 + }, + { + "epoch": 6.727476559315124, + "grad_norm": 0.8685822486877441, + "learning_rate": 0.00015937117814920508, + "loss": 3.5432, + "step": 99015 + }, + { + "epoch": 6.727816279385786, + "grad_norm": 0.883739709854126, + "learning_rate": 0.00015932871314037233, + "loss": 3.3897, + "step": 99020 + }, + { + "epoch": 6.7281559994564475, + "grad_norm": 1.0785949230194092, + "learning_rate": 0.0001592862481315396, + "loss": 3.5518, + "step": 99025 + }, + { + "epoch": 6.72849571952711, + "grad_norm": 1.0391777753829956, + "learning_rate": 0.0001592437831227069, + "loss": 3.5218, + "step": 99030 + }, + { + "epoch": 6.728835439597772, + "grad_norm": 0.9160169363021851, + "learning_rate": 0.00015920131811387417, + "loss": 3.3146, + "step": 99035 + }, + { + "epoch": 6.729175159668433, + "grad_norm": 1.021264672279358, + "learning_rate": 0.00015915885310504145, + "loss": 3.529, + "step": 99040 + }, + { + "epoch": 6.729514879739095, + "grad_norm": 0.837809681892395, + "learning_rate": 0.00015911638809620873, + "loss": 3.4308, + "step": 99045 + }, + { + "epoch": 6.729854599809757, + "grad_norm": 1.2425042390823364, + "learning_rate": 0.00015907392308737601, + "loss": 3.3049, + "step": 99050 + }, + { + "epoch": 6.730194319880418, + "grad_norm": 0.9624091386795044, + "learning_rate": 0.00015903145807854327, + "loss": 3.4267, + "step": 99055 + }, + { + "epoch": 6.73053403995108, + "grad_norm": 1.043908715248108, + "learning_rate": 0.00015898899306971057, + "loss": 3.3324, + "step": 99060 + }, + { + "epoch": 6.730873760021742, + "grad_norm": 1.018049955368042, + "learning_rate": 0.00015894652806087783, + "loss": 3.1328, + "step": 99065 + }, + { + "epoch": 6.7312134800924035, + "grad_norm": 0.7019802331924438, + "learning_rate": 0.0001589040630520451, + "loss": 3.7339, + "step": 99070 + }, + { + "epoch": 6.731553200163066, + "grad_norm": 0.9862908720970154, + "learning_rate": 0.00015886159804321241, + "loss": 3.3199, + "step": 99075 + }, + { + "epoch": 6.731892920233728, + "grad_norm": 0.9101529717445374, + "learning_rate": 0.00015881913303437967, + "loss": 3.3753, + "step": 99080 + }, + { + "epoch": 6.732232640304389, + "grad_norm": 0.7992237210273743, + "learning_rate": 0.00015877666802554697, + "loss": 3.4442, + "step": 99085 + }, + { + "epoch": 6.732572360375051, + "grad_norm": 0.9311121106147766, + "learning_rate": 0.00015873420301671423, + "loss": 3.4429, + "step": 99090 + }, + { + "epoch": 6.732912080445713, + "grad_norm": 0.9180771708488464, + "learning_rate": 0.0001586917380078815, + "loss": 3.3179, + "step": 99095 + }, + { + "epoch": 6.733251800516374, + "grad_norm": 0.7562750577926636, + "learning_rate": 0.0001586492729990488, + "loss": 3.3197, + "step": 99100 + }, + { + "epoch": 6.733591520587036, + "grad_norm": 2.518690824508667, + "learning_rate": 0.00015860680799021607, + "loss": 3.2559, + "step": 99105 + }, + { + "epoch": 6.733931240657698, + "grad_norm": 0.8028576374053955, + "learning_rate": 0.00015856434298138335, + "loss": 3.2954, + "step": 99110 + }, + { + "epoch": 6.7342709607283595, + "grad_norm": 1.0125763416290283, + "learning_rate": 0.00015852187797255063, + "loss": 3.412, + "step": 99115 + }, + { + "epoch": 6.734610680799022, + "grad_norm": 1.006847858428955, + "learning_rate": 0.0001584794129637179, + "loss": 3.3032, + "step": 99120 + }, + { + "epoch": 6.734950400869684, + "grad_norm": 0.7614300847053528, + "learning_rate": 0.00015843694795488516, + "loss": 3.2698, + "step": 99125 + }, + { + "epoch": 6.735290120940345, + "grad_norm": 0.9560384750366211, + "learning_rate": 0.00015839448294605247, + "loss": 3.6878, + "step": 99130 + }, + { + "epoch": 6.735629841011007, + "grad_norm": 0.768836498260498, + "learning_rate": 0.00015835201793721972, + "loss": 3.2315, + "step": 99135 + }, + { + "epoch": 6.735969561081669, + "grad_norm": 0.7976542115211487, + "learning_rate": 0.000158309552928387, + "loss": 3.398, + "step": 99140 + }, + { + "epoch": 6.73630928115233, + "grad_norm": 0.9499739408493042, + "learning_rate": 0.0001582670879195543, + "loss": 3.3635, + "step": 99145 + }, + { + "epoch": 6.736649001222992, + "grad_norm": 0.8811370134353638, + "learning_rate": 0.00015822462291072156, + "loss": 3.41, + "step": 99150 + }, + { + "epoch": 6.736988721293654, + "grad_norm": 0.9721860289573669, + "learning_rate": 0.00015818215790188884, + "loss": 3.293, + "step": 99155 + }, + { + "epoch": 6.7373284413643155, + "grad_norm": 0.836473286151886, + "learning_rate": 0.00015813969289305612, + "loss": 3.3199, + "step": 99160 + }, + { + "epoch": 6.737668161434978, + "grad_norm": 0.8722245693206787, + "learning_rate": 0.0001580972278842234, + "loss": 3.4767, + "step": 99165 + }, + { + "epoch": 6.73800788150564, + "grad_norm": 0.9849978685379028, + "learning_rate": 0.0001580547628753907, + "loss": 3.2335, + "step": 99170 + }, + { + "epoch": 6.738347601576301, + "grad_norm": 0.9061353802680969, + "learning_rate": 0.00015801229786655796, + "loss": 3.4497, + "step": 99175 + }, + { + "epoch": 6.738687321646963, + "grad_norm": 0.9000132083892822, + "learning_rate": 0.00015796983285772524, + "loss": 3.4753, + "step": 99180 + }, + { + "epoch": 6.739027041717625, + "grad_norm": 1.0538355112075806, + "learning_rate": 0.00015792736784889252, + "loss": 3.4445, + "step": 99185 + }, + { + "epoch": 6.739366761788286, + "grad_norm": 1.0339523553848267, + "learning_rate": 0.0001578849028400598, + "loss": 3.4798, + "step": 99190 + }, + { + "epoch": 6.739706481858948, + "grad_norm": 0.9389954209327698, + "learning_rate": 0.00015784243783122705, + "loss": 3.4646, + "step": 99195 + }, + { + "epoch": 6.74004620192961, + "grad_norm": 0.7810302376747131, + "learning_rate": 0.00015779997282239436, + "loss": 3.5288, + "step": 99200 + }, + { + "epoch": 6.7403859220002715, + "grad_norm": 1.3385590314865112, + "learning_rate": 0.00015775750781356164, + "loss": 3.661, + "step": 99205 + }, + { + "epoch": 6.740725642070934, + "grad_norm": 0.9591642618179321, + "learning_rate": 0.0001577150428047289, + "loss": 3.4946, + "step": 99210 + }, + { + "epoch": 6.741065362141596, + "grad_norm": 0.9973788857460022, + "learning_rate": 0.0001576725777958962, + "loss": 3.2561, + "step": 99215 + }, + { + "epoch": 6.741405082212257, + "grad_norm": 1.0520778894424438, + "learning_rate": 0.00015763011278706345, + "loss": 3.3806, + "step": 99220 + }, + { + "epoch": 6.741744802282919, + "grad_norm": 0.797200620174408, + "learning_rate": 0.00015758764777823074, + "loss": 3.2874, + "step": 99225 + }, + { + "epoch": 6.742084522353581, + "grad_norm": 1.044211506843567, + "learning_rate": 0.00015754518276939802, + "loss": 3.4458, + "step": 99230 + }, + { + "epoch": 6.742424242424242, + "grad_norm": 1.1632505655288696, + "learning_rate": 0.0001575027177605653, + "loss": 3.3983, + "step": 99235 + }, + { + "epoch": 6.742763962494904, + "grad_norm": 0.9533296823501587, + "learning_rate": 0.00015746025275173258, + "loss": 3.5407, + "step": 99240 + }, + { + "epoch": 6.743103682565566, + "grad_norm": 0.9739797711372375, + "learning_rate": 0.00015741778774289986, + "loss": 3.2676, + "step": 99245 + }, + { + "epoch": 6.7434434026362275, + "grad_norm": 0.8055427074432373, + "learning_rate": 0.00015737532273406714, + "loss": 3.3675, + "step": 99250 + }, + { + "epoch": 6.74378312270689, + "grad_norm": 0.9558871388435364, + "learning_rate": 0.00015733285772523442, + "loss": 3.6743, + "step": 99255 + }, + { + "epoch": 6.744122842777552, + "grad_norm": 0.8368589282035828, + "learning_rate": 0.0001572903927164017, + "loss": 3.4837, + "step": 99260 + }, + { + "epoch": 6.744462562848213, + "grad_norm": 0.9640116095542908, + "learning_rate": 0.00015724792770756895, + "loss": 3.3405, + "step": 99265 + }, + { + "epoch": 6.744802282918875, + "grad_norm": 0.9620677828788757, + "learning_rate": 0.00015720546269873626, + "loss": 3.2916, + "step": 99270 + }, + { + "epoch": 6.745142002989537, + "grad_norm": 1.0166664123535156, + "learning_rate": 0.00015716299768990354, + "loss": 3.2452, + "step": 99275 + }, + { + "epoch": 6.745481723060198, + "grad_norm": 0.7248257398605347, + "learning_rate": 0.0001571205326810708, + "loss": 3.4057, + "step": 99280 + }, + { + "epoch": 6.74582144313086, + "grad_norm": 1.02186119556427, + "learning_rate": 0.0001570780676722381, + "loss": 3.3902, + "step": 99285 + }, + { + "epoch": 6.746161163201522, + "grad_norm": 0.8176487684249878, + "learning_rate": 0.00015703560266340535, + "loss": 3.1367, + "step": 99290 + }, + { + "epoch": 6.746500883272184, + "grad_norm": 1.1704028844833374, + "learning_rate": 0.00015699313765457263, + "loss": 3.2011, + "step": 99295 + }, + { + "epoch": 6.746840603342846, + "grad_norm": 0.8349586725234985, + "learning_rate": 0.0001569506726457399, + "loss": 3.3644, + "step": 99300 + }, + { + "epoch": 6.747180323413508, + "grad_norm": 0.8781894445419312, + "learning_rate": 0.0001569082076369072, + "loss": 3.4919, + "step": 99305 + }, + { + "epoch": 6.747520043484169, + "grad_norm": 1.0179299116134644, + "learning_rate": 0.00015686574262807447, + "loss": 3.1627, + "step": 99310 + }, + { + "epoch": 6.747859763554831, + "grad_norm": 0.9348723292350769, + "learning_rate": 0.00015682327761924175, + "loss": 3.3024, + "step": 99315 + }, + { + "epoch": 6.748199483625493, + "grad_norm": 0.8475903868675232, + "learning_rate": 0.00015678081261040903, + "loss": 3.4279, + "step": 99320 + }, + { + "epoch": 6.748539203696154, + "grad_norm": 0.898897111415863, + "learning_rate": 0.00015673834760157628, + "loss": 3.5854, + "step": 99325 + }, + { + "epoch": 6.748878923766816, + "grad_norm": 1.0103563070297241, + "learning_rate": 0.0001566958825927436, + "loss": 3.5856, + "step": 99330 + }, + { + "epoch": 6.749218643837478, + "grad_norm": 0.9919679760932922, + "learning_rate": 0.00015665341758391087, + "loss": 3.311, + "step": 99335 + }, + { + "epoch": 6.74955836390814, + "grad_norm": 1.2419114112854004, + "learning_rate": 0.00015661095257507815, + "loss": 3.1216, + "step": 99340 + }, + { + "epoch": 6.749898083978802, + "grad_norm": 1.0059775114059448, + "learning_rate": 0.00015656848756624543, + "loss": 3.7045, + "step": 99345 + }, + { + "epoch": 6.750237804049464, + "grad_norm": 0.9499325752258301, + "learning_rate": 0.00015652602255741268, + "loss": 3.3738, + "step": 99350 + }, + { + "epoch": 6.750577524120125, + "grad_norm": 0.8138713240623474, + "learning_rate": 0.00015648355754858, + "loss": 3.2927, + "step": 99355 + }, + { + "epoch": 6.750917244190787, + "grad_norm": 0.8737850785255432, + "learning_rate": 0.00015644109253974724, + "loss": 3.3905, + "step": 99360 + }, + { + "epoch": 6.751256964261449, + "grad_norm": 0.7393068075180054, + "learning_rate": 0.00015639862753091452, + "loss": 3.3987, + "step": 99365 + }, + { + "epoch": 6.75159668433211, + "grad_norm": 0.8466947078704834, + "learning_rate": 0.00015635616252208183, + "loss": 3.4636, + "step": 99370 + }, + { + "epoch": 6.751936404402772, + "grad_norm": 1.1604455709457397, + "learning_rate": 0.00015631369751324908, + "loss": 3.4032, + "step": 99375 + }, + { + "epoch": 6.7522761244734335, + "grad_norm": 0.9197470545768738, + "learning_rate": 0.00015627123250441636, + "loss": 3.3958, + "step": 99380 + }, + { + "epoch": 6.752615844544096, + "grad_norm": 0.7357714176177979, + "learning_rate": 0.00015622876749558364, + "loss": 3.4996, + "step": 99385 + }, + { + "epoch": 6.752955564614758, + "grad_norm": 1.3699454069137573, + "learning_rate": 0.00015618630248675092, + "loss": 3.5719, + "step": 99390 + }, + { + "epoch": 6.753295284685419, + "grad_norm": 0.8163264393806458, + "learning_rate": 0.00015614383747791818, + "loss": 3.3417, + "step": 99395 + }, + { + "epoch": 6.753635004756081, + "grad_norm": 0.9174010753631592, + "learning_rate": 0.00015610137246908548, + "loss": 3.5255, + "step": 99400 + }, + { + "epoch": 6.753974724826743, + "grad_norm": 0.9022483825683594, + "learning_rate": 0.00015605890746025276, + "loss": 3.4284, + "step": 99405 + }, + { + "epoch": 6.754314444897404, + "grad_norm": 0.8588606715202332, + "learning_rate": 0.00015601644245142002, + "loss": 3.438, + "step": 99410 + }, + { + "epoch": 6.754654164968066, + "grad_norm": 0.8588494658470154, + "learning_rate": 0.00015597397744258732, + "loss": 3.4582, + "step": 99415 + }, + { + "epoch": 6.754993885038728, + "grad_norm": 1.0916669368743896, + "learning_rate": 0.00015593151243375458, + "loss": 3.2998, + "step": 99420 + }, + { + "epoch": 6.7553336051093895, + "grad_norm": 1.0189363956451416, + "learning_rate": 0.00015588904742492186, + "loss": 3.6598, + "step": 99425 + }, + { + "epoch": 6.755673325180052, + "grad_norm": 0.9380828142166138, + "learning_rate": 0.00015584658241608914, + "loss": 3.4474, + "step": 99430 + }, + { + "epoch": 6.756013045250714, + "grad_norm": 0.8701227903366089, + "learning_rate": 0.00015580411740725642, + "loss": 3.495, + "step": 99435 + }, + { + "epoch": 6.756352765321375, + "grad_norm": 1.0082908868789673, + "learning_rate": 0.00015576165239842372, + "loss": 3.4755, + "step": 99440 + }, + { + "epoch": 6.756692485392037, + "grad_norm": 1.0186843872070312, + "learning_rate": 0.00015571918738959098, + "loss": 3.5695, + "step": 99445 + }, + { + "epoch": 6.757032205462699, + "grad_norm": 0.7347564697265625, + "learning_rate": 0.00015567672238075826, + "loss": 3.5651, + "step": 99450 + }, + { + "epoch": 6.75737192553336, + "grad_norm": 0.737755298614502, + "learning_rate": 0.00015563425737192554, + "loss": 3.529, + "step": 99455 + }, + { + "epoch": 6.757711645604022, + "grad_norm": 0.8336412310600281, + "learning_rate": 0.00015559179236309282, + "loss": 3.5093, + "step": 99460 + }, + { + "epoch": 6.758051365674684, + "grad_norm": 1.131439208984375, + "learning_rate": 0.0001555493273542601, + "loss": 3.3722, + "step": 99465 + }, + { + "epoch": 6.7583910857453455, + "grad_norm": 1.0015525817871094, + "learning_rate": 0.00015550686234542738, + "loss": 3.3597, + "step": 99470 + }, + { + "epoch": 6.758730805816008, + "grad_norm": 0.8953506946563721, + "learning_rate": 0.00015546439733659466, + "loss": 3.2639, + "step": 99475 + }, + { + "epoch": 6.75907052588667, + "grad_norm": 0.791253387928009, + "learning_rate": 0.0001554219323277619, + "loss": 3.266, + "step": 99480 + }, + { + "epoch": 6.759410245957331, + "grad_norm": 0.7567099332809448, + "learning_rate": 0.00015537946731892922, + "loss": 3.6335, + "step": 99485 + }, + { + "epoch": 6.759749966027993, + "grad_norm": 1.3261525630950928, + "learning_rate": 0.00015533700231009647, + "loss": 3.6302, + "step": 99490 + }, + { + "epoch": 6.760089686098655, + "grad_norm": 0.8904715180397034, + "learning_rate": 0.00015529453730126375, + "loss": 3.3143, + "step": 99495 + }, + { + "epoch": 6.760429406169316, + "grad_norm": 1.0711661577224731, + "learning_rate": 0.00015525207229243106, + "loss": 3.4614, + "step": 99500 + }, + { + "epoch": 6.760769126239978, + "grad_norm": 0.8257405757904053, + "learning_rate": 0.0001552096072835983, + "loss": 3.6356, + "step": 99505 + }, + { + "epoch": 6.76110884631064, + "grad_norm": 0.8089847564697266, + "learning_rate": 0.0001551671422747656, + "loss": 3.3763, + "step": 99510 + }, + { + "epoch": 6.7614485663813015, + "grad_norm": 0.9259394407272339, + "learning_rate": 0.00015512467726593287, + "loss": 3.481, + "step": 99515 + }, + { + "epoch": 6.761788286451964, + "grad_norm": 0.9311196804046631, + "learning_rate": 0.00015508221225710015, + "loss": 3.3136, + "step": 99520 + }, + { + "epoch": 6.762128006522626, + "grad_norm": 2.017256259918213, + "learning_rate": 0.00015503974724826743, + "loss": 3.6008, + "step": 99525 + }, + { + "epoch": 6.762467726593287, + "grad_norm": 0.9098808169364929, + "learning_rate": 0.0001549972822394347, + "loss": 3.4878, + "step": 99530 + }, + { + "epoch": 6.762807446663949, + "grad_norm": 0.947381317615509, + "learning_rate": 0.000154954817230602, + "loss": 3.7571, + "step": 99535 + }, + { + "epoch": 6.763147166734611, + "grad_norm": 0.8928861021995544, + "learning_rate": 0.00015491235222176927, + "loss": 3.6058, + "step": 99540 + }, + { + "epoch": 6.763486886805272, + "grad_norm": 0.8412185311317444, + "learning_rate": 0.00015486988721293655, + "loss": 3.5083, + "step": 99545 + }, + { + "epoch": 6.763826606875934, + "grad_norm": 0.908613383769989, + "learning_rate": 0.0001548274222041038, + "loss": 3.2709, + "step": 99550 + }, + { + "epoch": 6.764166326946596, + "grad_norm": 0.8073624968528748, + "learning_rate": 0.0001547849571952711, + "loss": 3.4463, + "step": 99555 + }, + { + "epoch": 6.7645060470172576, + "grad_norm": 1.0071794986724854, + "learning_rate": 0.00015474249218643836, + "loss": 3.0823, + "step": 99560 + }, + { + "epoch": 6.76484576708792, + "grad_norm": 0.8446254134178162, + "learning_rate": 0.00015470002717760564, + "loss": 3.5425, + "step": 99565 + }, + { + "epoch": 6.765185487158582, + "grad_norm": 0.9868767261505127, + "learning_rate": 0.00015465756216877295, + "loss": 3.2921, + "step": 99570 + }, + { + "epoch": 6.765525207229243, + "grad_norm": 0.723699688911438, + "learning_rate": 0.0001546150971599402, + "loss": 3.4396, + "step": 99575 + }, + { + "epoch": 6.765864927299905, + "grad_norm": 0.9399039149284363, + "learning_rate": 0.00015457263215110748, + "loss": 3.3189, + "step": 99580 + }, + { + "epoch": 6.766204647370567, + "grad_norm": 0.9933343529701233, + "learning_rate": 0.00015453016714227477, + "loss": 3.2236, + "step": 99585 + }, + { + "epoch": 6.766544367441228, + "grad_norm": 0.9418555498123169, + "learning_rate": 0.00015448770213344205, + "loss": 3.3099, + "step": 99590 + }, + { + "epoch": 6.76688408751189, + "grad_norm": 0.9392416477203369, + "learning_rate": 0.0001544452371246093, + "loss": 3.5237, + "step": 99595 + }, + { + "epoch": 6.7672238075825515, + "grad_norm": 0.949647843837738, + "learning_rate": 0.0001544027721157766, + "loss": 3.5321, + "step": 99600 + }, + { + "epoch": 6.767563527653214, + "grad_norm": 0.7271689772605896, + "learning_rate": 0.00015436030710694389, + "loss": 3.2795, + "step": 99605 + }, + { + "epoch": 6.767903247723876, + "grad_norm": 0.9366714358329773, + "learning_rate": 0.00015431784209811117, + "loss": 3.3417, + "step": 99610 + }, + { + "epoch": 6.768242967794537, + "grad_norm": 1.402651309967041, + "learning_rate": 0.00015427537708927845, + "loss": 3.439, + "step": 99615 + }, + { + "epoch": 6.768582687865199, + "grad_norm": 1.0825341939926147, + "learning_rate": 0.0001542329120804457, + "loss": 3.3367, + "step": 99620 + }, + { + "epoch": 6.768922407935861, + "grad_norm": 0.7089253664016724, + "learning_rate": 0.000154190447071613, + "loss": 3.2108, + "step": 99625 + }, + { + "epoch": 6.769262128006522, + "grad_norm": 1.0681735277175903, + "learning_rate": 0.00015414798206278029, + "loss": 3.469, + "step": 99630 + }, + { + "epoch": 6.769601848077184, + "grad_norm": 0.7780787944793701, + "learning_rate": 0.00015410551705394754, + "loss": 3.2684, + "step": 99635 + }, + { + "epoch": 6.769941568147846, + "grad_norm": 0.7982339262962341, + "learning_rate": 0.00015406305204511485, + "loss": 3.5265, + "step": 99640 + }, + { + "epoch": 6.7702812882185075, + "grad_norm": 0.9487870931625366, + "learning_rate": 0.0001540205870362821, + "loss": 3.3078, + "step": 99645 + }, + { + "epoch": 6.77062100828917, + "grad_norm": 0.7311505079269409, + "learning_rate": 0.00015397812202744938, + "loss": 3.545, + "step": 99650 + }, + { + "epoch": 6.770960728359832, + "grad_norm": 0.9573057889938354, + "learning_rate": 0.00015393565701861666, + "loss": 3.5757, + "step": 99655 + }, + { + "epoch": 6.771300448430493, + "grad_norm": 0.9366477727890015, + "learning_rate": 0.00015389319200978394, + "loss": 3.4297, + "step": 99660 + }, + { + "epoch": 6.771640168501155, + "grad_norm": 0.6975128054618835, + "learning_rate": 0.00015385072700095122, + "loss": 3.1683, + "step": 99665 + }, + { + "epoch": 6.771979888571817, + "grad_norm": 1.0055570602416992, + "learning_rate": 0.0001538082619921185, + "loss": 3.4233, + "step": 99670 + }, + { + "epoch": 6.772319608642478, + "grad_norm": 1.1706628799438477, + "learning_rate": 0.00015376579698328578, + "loss": 3.5206, + "step": 99675 + }, + { + "epoch": 6.77265932871314, + "grad_norm": 0.9133226275444031, + "learning_rate": 0.00015372333197445303, + "loss": 3.42, + "step": 99680 + }, + { + "epoch": 6.772999048783802, + "grad_norm": 0.8033782243728638, + "learning_rate": 0.00015368086696562034, + "loss": 3.415, + "step": 99685 + }, + { + "epoch": 6.7733387688544635, + "grad_norm": 0.8779062628746033, + "learning_rate": 0.0001536384019567876, + "loss": 3.564, + "step": 99690 + }, + { + "epoch": 6.773678488925126, + "grad_norm": 0.8924756050109863, + "learning_rate": 0.0001535959369479549, + "loss": 3.4829, + "step": 99695 + }, + { + "epoch": 6.774018208995788, + "grad_norm": 0.708207368850708, + "learning_rate": 0.00015355347193912218, + "loss": 3.4709, + "step": 99700 + }, + { + "epoch": 6.774357929066449, + "grad_norm": 0.7807380557060242, + "learning_rate": 0.00015351100693028943, + "loss": 3.5777, + "step": 99705 + }, + { + "epoch": 6.774697649137111, + "grad_norm": 1.0833865404129028, + "learning_rate": 0.00015346854192145674, + "loss": 3.2175, + "step": 99710 + }, + { + "epoch": 6.775037369207773, + "grad_norm": 0.8937413096427917, + "learning_rate": 0.000153426076912624, + "loss": 3.2826, + "step": 99715 + }, + { + "epoch": 6.775377089278434, + "grad_norm": 0.8186259865760803, + "learning_rate": 0.00015338361190379127, + "loss": 3.2033, + "step": 99720 + }, + { + "epoch": 6.775716809349096, + "grad_norm": 0.9195008873939514, + "learning_rate": 0.00015334114689495855, + "loss": 3.4725, + "step": 99725 + }, + { + "epoch": 6.776056529419758, + "grad_norm": 1.0089986324310303, + "learning_rate": 0.00015329868188612583, + "loss": 3.5285, + "step": 99730 + }, + { + "epoch": 6.7763962494904195, + "grad_norm": 0.972164511680603, + "learning_rate": 0.0001532562168772931, + "loss": 3.4737, + "step": 99735 + }, + { + "epoch": 6.776735969561082, + "grad_norm": 0.9489824175834656, + "learning_rate": 0.0001532137518684604, + "loss": 3.0764, + "step": 99740 + }, + { + "epoch": 6.777075689631744, + "grad_norm": 0.8547676205635071, + "learning_rate": 0.00015317128685962767, + "loss": 3.0454, + "step": 99745 + }, + { + "epoch": 6.777415409702405, + "grad_norm": 0.9494107365608215, + "learning_rate": 0.00015312882185079493, + "loss": 3.0861, + "step": 99750 + }, + { + "epoch": 6.777755129773067, + "grad_norm": 0.7300184965133667, + "learning_rate": 0.00015308635684196223, + "loss": 3.4069, + "step": 99755 + }, + { + "epoch": 6.778094849843729, + "grad_norm": 0.9758285284042358, + "learning_rate": 0.00015304389183312951, + "loss": 3.3287, + "step": 99760 + }, + { + "epoch": 6.77843456991439, + "grad_norm": 0.9524635076522827, + "learning_rate": 0.00015300142682429677, + "loss": 3.3326, + "step": 99765 + }, + { + "epoch": 6.778774289985052, + "grad_norm": 1.126616358757019, + "learning_rate": 0.00015295896181546407, + "loss": 3.2188, + "step": 99770 + }, + { + "epoch": 6.779114010055714, + "grad_norm": 1.0267877578735352, + "learning_rate": 0.00015291649680663133, + "loss": 3.2606, + "step": 99775 + }, + { + "epoch": 6.7794537301263755, + "grad_norm": 0.8376936912536621, + "learning_rate": 0.00015287403179779863, + "loss": 3.3461, + "step": 99780 + }, + { + "epoch": 6.779793450197038, + "grad_norm": 0.9266044497489929, + "learning_rate": 0.0001528315667889659, + "loss": 3.2116, + "step": 99785 + }, + { + "epoch": 6.7801331702677, + "grad_norm": 0.8980103731155396, + "learning_rate": 0.00015278910178013317, + "loss": 3.3398, + "step": 99790 + }, + { + "epoch": 6.780472890338361, + "grad_norm": 0.9216050505638123, + "learning_rate": 0.00015274663677130047, + "loss": 3.706, + "step": 99795 + }, + { + "epoch": 6.780812610409023, + "grad_norm": 0.8985738158226013, + "learning_rate": 0.00015271266476423427, + "loss": 3.414, + "step": 99800 + }, + { + "epoch": 6.781152330479685, + "grad_norm": 1.2880796194076538, + "learning_rate": 0.00015267019975540155, + "loss": 3.3615, + "step": 99805 + }, + { + "epoch": 6.781492050550346, + "grad_norm": 0.9406355619430542, + "learning_rate": 0.00015262773474656883, + "loss": 3.2704, + "step": 99810 + }, + { + "epoch": 6.781831770621008, + "grad_norm": 0.8824648857116699, + "learning_rate": 0.0001525852697377361, + "loss": 3.2038, + "step": 99815 + }, + { + "epoch": 6.78217149069167, + "grad_norm": 0.883246660232544, + "learning_rate": 0.0001525428047289034, + "loss": 3.5545, + "step": 99820 + }, + { + "epoch": 6.7825112107623315, + "grad_norm": 1.0154708623886108, + "learning_rate": 0.00015250033972007064, + "loss": 3.3939, + "step": 99825 + }, + { + "epoch": 6.782850930832994, + "grad_norm": 1.0537503957748413, + "learning_rate": 0.00015245787471123795, + "loss": 3.4098, + "step": 99830 + }, + { + "epoch": 6.783190650903656, + "grad_norm": 0.7733186483383179, + "learning_rate": 0.00015241540970240523, + "loss": 3.3876, + "step": 99835 + }, + { + "epoch": 6.783530370974317, + "grad_norm": 0.9893706440925598, + "learning_rate": 0.00015237294469357248, + "loss": 3.4257, + "step": 99840 + }, + { + "epoch": 6.783870091044979, + "grad_norm": 0.8108477592468262, + "learning_rate": 0.0001523304796847398, + "loss": 3.4585, + "step": 99845 + }, + { + "epoch": 6.784209811115641, + "grad_norm": 0.8617035746574402, + "learning_rate": 0.00015228801467590704, + "loss": 3.5092, + "step": 99850 + }, + { + "epoch": 6.784549531186302, + "grad_norm": 0.6740385890007019, + "learning_rate": 0.00015224554966707432, + "loss": 3.4499, + "step": 99855 + }, + { + "epoch": 6.784889251256964, + "grad_norm": 0.9042198061943054, + "learning_rate": 0.00015220308465824163, + "loss": 3.3318, + "step": 99860 + }, + { + "epoch": 6.785228971327626, + "grad_norm": 0.8554350137710571, + "learning_rate": 0.00015216061964940888, + "loss": 3.5081, + "step": 99865 + }, + { + "epoch": 6.7855686913982876, + "grad_norm": 1.2798124551773071, + "learning_rate": 0.00015211815464057617, + "loss": 3.4596, + "step": 99870 + }, + { + "epoch": 6.78590841146895, + "grad_norm": 0.9165914058685303, + "learning_rate": 0.00015207568963174345, + "loss": 3.6377, + "step": 99875 + }, + { + "epoch": 6.786248131539612, + "grad_norm": 0.8815146684646606, + "learning_rate": 0.00015203322462291073, + "loss": 3.4396, + "step": 99880 + }, + { + "epoch": 6.786587851610273, + "grad_norm": 0.9334374070167542, + "learning_rate": 0.00015199075961407798, + "loss": 3.2823, + "step": 99885 + }, + { + "epoch": 6.786927571680935, + "grad_norm": 0.8635432124137878, + "learning_rate": 0.00015194829460524529, + "loss": 3.396, + "step": 99890 + }, + { + "epoch": 6.787267291751597, + "grad_norm": 0.8524395227432251, + "learning_rate": 0.00015190582959641257, + "loss": 3.7712, + "step": 99895 + }, + { + "epoch": 6.787607011822258, + "grad_norm": 0.8600781559944153, + "learning_rate": 0.00015186336458757985, + "loss": 3.4597, + "step": 99900 + }, + { + "epoch": 6.78794673189292, + "grad_norm": 0.7553004026412964, + "learning_rate": 0.00015182089957874713, + "loss": 3.2803, + "step": 99905 + }, + { + "epoch": 6.788286451963582, + "grad_norm": 0.9859738349914551, + "learning_rate": 0.00015177843456991438, + "loss": 3.6419, + "step": 99910 + }, + { + "epoch": 6.788626172034244, + "grad_norm": 1.0495774745941162, + "learning_rate": 0.00015173596956108169, + "loss": 3.1768, + "step": 99915 + }, + { + "epoch": 6.788965892104906, + "grad_norm": 0.8597832918167114, + "learning_rate": 0.00015169350455224894, + "loss": 3.4945, + "step": 99920 + }, + { + "epoch": 6.789305612175568, + "grad_norm": 1.0919893980026245, + "learning_rate": 0.00015165103954341622, + "loss": 3.2225, + "step": 99925 + }, + { + "epoch": 6.789645332246229, + "grad_norm": 0.7507445812225342, + "learning_rate": 0.00015160857453458353, + "loss": 3.4459, + "step": 99930 + }, + { + "epoch": 6.789985052316891, + "grad_norm": 0.7487512230873108, + "learning_rate": 0.00015156610952575078, + "loss": 3.3713, + "step": 99935 + }, + { + "epoch": 6.790324772387553, + "grad_norm": 0.969224750995636, + "learning_rate": 0.00015152364451691806, + "loss": 3.5019, + "step": 99940 + }, + { + "epoch": 6.790664492458214, + "grad_norm": 1.0895661115646362, + "learning_rate": 0.00015148117950808534, + "loss": 2.7633, + "step": 99945 + }, + { + "epoch": 6.791004212528876, + "grad_norm": 0.9935546517372131, + "learning_rate": 0.00015143871449925262, + "loss": 3.2483, + "step": 99950 + }, + { + "epoch": 6.791343932599538, + "grad_norm": 0.958808183670044, + "learning_rate": 0.00015139624949041987, + "loss": 3.4622, + "step": 99955 + }, + { + "epoch": 6.7916836526702, + "grad_norm": 0.9813346862792969, + "learning_rate": 0.00015135378448158718, + "loss": 3.4075, + "step": 99960 + }, + { + "epoch": 6.792023372740862, + "grad_norm": 0.8626198172569275, + "learning_rate": 0.00015131131947275446, + "loss": 3.2004, + "step": 99965 + }, + { + "epoch": 6.792363092811524, + "grad_norm": 0.9570123553276062, + "learning_rate": 0.0001512688544639217, + "loss": 3.3623, + "step": 99970 + }, + { + "epoch": 6.792702812882185, + "grad_norm": 0.9155128002166748, + "learning_rate": 0.00015122638945508902, + "loss": 3.3667, + "step": 99975 + }, + { + "epoch": 6.793042532952847, + "grad_norm": 0.9886416792869568, + "learning_rate": 0.00015118392444625627, + "loss": 3.5573, + "step": 99980 + }, + { + "epoch": 6.793382253023509, + "grad_norm": 0.929578423500061, + "learning_rate": 0.00015114145943742358, + "loss": 3.4091, + "step": 99985 + }, + { + "epoch": 6.79372197309417, + "grad_norm": 1.0227792263031006, + "learning_rate": 0.00015109899442859086, + "loss": 3.2338, + "step": 99990 + }, + { + "epoch": 6.794061693164832, + "grad_norm": 1.0230348110198975, + "learning_rate": 0.0001510565294197581, + "loss": 3.0648, + "step": 99995 + }, + { + "epoch": 6.794401413235494, + "grad_norm": 0.9698412418365479, + "learning_rate": 0.00015101406441092542, + "loss": 3.5649, + "step": 100000 + }, + { + "epoch": 6.794741133306156, + "grad_norm": 0.8850202560424805, + "learning_rate": 0.00015097159940209267, + "loss": 3.4714, + "step": 100005 + }, + { + "epoch": 6.795080853376818, + "grad_norm": 0.9423929452896118, + "learning_rate": 0.00015092913439325995, + "loss": 3.286, + "step": 100010 + }, + { + "epoch": 6.79542057344748, + "grad_norm": 0.9693055748939514, + "learning_rate": 0.00015088666938442723, + "loss": 3.5454, + "step": 100015 + }, + { + "epoch": 6.795760293518141, + "grad_norm": 0.8728309869766235, + "learning_rate": 0.0001508442043755945, + "loss": 3.367, + "step": 100020 + }, + { + "epoch": 6.796100013588803, + "grad_norm": 0.7811325192451477, + "learning_rate": 0.0001508017393667618, + "loss": 3.4386, + "step": 100025 + }, + { + "epoch": 6.796439733659465, + "grad_norm": 0.9261370897293091, + "learning_rate": 0.00015075927435792907, + "loss": 3.2956, + "step": 100030 + }, + { + "epoch": 6.796779453730126, + "grad_norm": 0.9465555548667908, + "learning_rate": 0.00015071680934909635, + "loss": 3.5067, + "step": 100035 + }, + { + "epoch": 6.797119173800788, + "grad_norm": 1.0268023014068604, + "learning_rate": 0.0001506743443402636, + "loss": 3.0962, + "step": 100040 + }, + { + "epoch": 6.79745889387145, + "grad_norm": 0.7617166042327881, + "learning_rate": 0.0001506318793314309, + "loss": 3.5035, + "step": 100045 + }, + { + "epoch": 6.797798613942112, + "grad_norm": 0.8714850544929504, + "learning_rate": 0.00015058941432259817, + "loss": 3.4455, + "step": 100050 + }, + { + "epoch": 6.798138334012774, + "grad_norm": 0.8794868588447571, + "learning_rate": 0.00015054694931376545, + "loss": 3.4805, + "step": 100055 + }, + { + "epoch": 6.798478054083435, + "grad_norm": 1.007005214691162, + "learning_rate": 0.00015050448430493275, + "loss": 3.5155, + "step": 100060 + }, + { + "epoch": 6.798817774154097, + "grad_norm": 0.8111362457275391, + "learning_rate": 0.0001504620192961, + "loss": 3.6832, + "step": 100065 + }, + { + "epoch": 6.799157494224759, + "grad_norm": 1.5250481367111206, + "learning_rate": 0.00015041955428726731, + "loss": 3.5325, + "step": 100070 + }, + { + "epoch": 6.79949721429542, + "grad_norm": 1.0006043910980225, + "learning_rate": 0.00015037708927843457, + "loss": 3.2719, + "step": 100075 + }, + { + "epoch": 6.799836934366082, + "grad_norm": 1.0502606630325317, + "learning_rate": 0.00015033462426960185, + "loss": 3.4078, + "step": 100080 + }, + { + "epoch": 6.800176654436744, + "grad_norm": 0.9251105189323425, + "learning_rate": 0.00015029215926076913, + "loss": 3.3848, + "step": 100085 + }, + { + "epoch": 6.8005163745074055, + "grad_norm": 0.8924077749252319, + "learning_rate": 0.0001502496942519364, + "loss": 3.355, + "step": 100090 + }, + { + "epoch": 6.800856094578068, + "grad_norm": 0.7776904702186584, + "learning_rate": 0.0001502072292431037, + "loss": 3.4754, + "step": 100095 + }, + { + "epoch": 6.80119581464873, + "grad_norm": 0.8770089745521545, + "learning_rate": 0.00015016476423427097, + "loss": 3.3253, + "step": 100100 + }, + { + "epoch": 6.801535534719391, + "grad_norm": 0.844292163848877, + "learning_rate": 0.00015012229922543825, + "loss": 3.5728, + "step": 100105 + }, + { + "epoch": 6.801875254790053, + "grad_norm": 1.0784648656845093, + "learning_rate": 0.0001500798342166055, + "loss": 3.2693, + "step": 100110 + }, + { + "epoch": 6.802214974860715, + "grad_norm": 0.9382765293121338, + "learning_rate": 0.0001500373692077728, + "loss": 3.5095, + "step": 100115 + }, + { + "epoch": 6.802554694931376, + "grad_norm": 2.917717933654785, + "learning_rate": 0.0001499949041989401, + "loss": 3.2796, + "step": 100120 + }, + { + "epoch": 6.802894415002038, + "grad_norm": 1.6799073219299316, + "learning_rate": 0.00014995243919010734, + "loss": 3.6869, + "step": 100125 + }, + { + "epoch": 6.8032341350727, + "grad_norm": 0.9990962743759155, + "learning_rate": 0.00014990997418127465, + "loss": 3.3485, + "step": 100130 + }, + { + "epoch": 6.8035738551433615, + "grad_norm": 0.9669924378395081, + "learning_rate": 0.0001498675091724419, + "loss": 3.3269, + "step": 100135 + }, + { + "epoch": 6.803913575214024, + "grad_norm": 0.8286922574043274, + "learning_rate": 0.00014982504416360918, + "loss": 3.5273, + "step": 100140 + }, + { + "epoch": 6.804253295284686, + "grad_norm": 0.8613579273223877, + "learning_rate": 0.00014978257915477646, + "loss": 3.295, + "step": 100145 + }, + { + "epoch": 6.804593015355347, + "grad_norm": 1.0185835361480713, + "learning_rate": 0.00014974011414594374, + "loss": 3.3965, + "step": 100150 + }, + { + "epoch": 6.804932735426009, + "grad_norm": 0.8730132579803467, + "learning_rate": 0.00014969764913711105, + "loss": 3.6556, + "step": 100155 + }, + { + "epoch": 6.805272455496671, + "grad_norm": 0.8498442769050598, + "learning_rate": 0.0001496551841282783, + "loss": 3.2642, + "step": 100160 + }, + { + "epoch": 6.805612175567332, + "grad_norm": 1.0635689496994019, + "learning_rate": 0.00014961271911944558, + "loss": 3.5644, + "step": 100165 + }, + { + "epoch": 6.805951895637994, + "grad_norm": 0.9519302248954773, + "learning_rate": 0.00014957025411061286, + "loss": 3.0487, + "step": 100170 + }, + { + "epoch": 6.806291615708656, + "grad_norm": 0.8312503695487976, + "learning_rate": 0.00014952778910178014, + "loss": 3.3807, + "step": 100175 + }, + { + "epoch": 6.806631335779318, + "grad_norm": 1.214416742324829, + "learning_rate": 0.0001494853240929474, + "loss": 3.5404, + "step": 100180 + }, + { + "epoch": 6.80697105584998, + "grad_norm": 0.9354555606842041, + "learning_rate": 0.0001494428590841147, + "loss": 3.672, + "step": 100185 + }, + { + "epoch": 6.807310775920642, + "grad_norm": 0.8880597949028015, + "learning_rate": 0.00014940039407528198, + "loss": 3.3479, + "step": 100190 + }, + { + "epoch": 6.807650495991303, + "grad_norm": 1.1932209730148315, + "learning_rate": 0.00014935792906644923, + "loss": 3.6662, + "step": 100195 + }, + { + "epoch": 6.807990216061965, + "grad_norm": 1.0487912893295288, + "learning_rate": 0.00014931546405761654, + "loss": 3.3287, + "step": 100200 + }, + { + "epoch": 6.808329936132627, + "grad_norm": 0.9184759259223938, + "learning_rate": 0.0001492729990487838, + "loss": 3.3564, + "step": 100205 + }, + { + "epoch": 6.808669656203288, + "grad_norm": 0.8722283840179443, + "learning_rate": 0.00014923053403995107, + "loss": 3.2843, + "step": 100210 + }, + { + "epoch": 6.80900937627395, + "grad_norm": 0.965593159198761, + "learning_rate": 0.00014918806903111835, + "loss": 3.3767, + "step": 100215 + }, + { + "epoch": 6.809349096344612, + "grad_norm": 0.9436777234077454, + "learning_rate": 0.00014914560402228563, + "loss": 3.4474, + "step": 100220 + }, + { + "epoch": 6.809688816415274, + "grad_norm": 1.4164432287216187, + "learning_rate": 0.00014910313901345291, + "loss": 3.2413, + "step": 100225 + }, + { + "epoch": 6.810028536485936, + "grad_norm": 1.1015056371688843, + "learning_rate": 0.0001490606740046202, + "loss": 3.2987, + "step": 100230 + }, + { + "epoch": 6.810368256556598, + "grad_norm": 1.291273832321167, + "learning_rate": 0.00014901820899578748, + "loss": 3.4748, + "step": 100235 + }, + { + "epoch": 6.810707976627259, + "grad_norm": 0.8958809971809387, + "learning_rate": 0.00014897574398695476, + "loss": 3.3948, + "step": 100240 + }, + { + "epoch": 6.811047696697921, + "grad_norm": 1.0511677265167236, + "learning_rate": 0.00014893327897812204, + "loss": 3.6964, + "step": 100245 + }, + { + "epoch": 6.811387416768583, + "grad_norm": 3.6667072772979736, + "learning_rate": 0.0001488908139692893, + "loss": 3.1188, + "step": 100250 + }, + { + "epoch": 6.811727136839244, + "grad_norm": 1.0545520782470703, + "learning_rate": 0.0001488483489604566, + "loss": 3.2956, + "step": 100255 + }, + { + "epoch": 6.812066856909906, + "grad_norm": 1.0573341846466064, + "learning_rate": 0.00014880588395162388, + "loss": 3.2198, + "step": 100260 + }, + { + "epoch": 6.812406576980568, + "grad_norm": 1.0564751625061035, + "learning_rate": 0.00014876341894279113, + "loss": 3.4756, + "step": 100265 + }, + { + "epoch": 6.81274629705123, + "grad_norm": 0.8238232731819153, + "learning_rate": 0.00014872095393395844, + "loss": 3.3986, + "step": 100270 + }, + { + "epoch": 6.813086017121892, + "grad_norm": 1.1928757429122925, + "learning_rate": 0.0001486784889251257, + "loss": 3.4632, + "step": 100275 + }, + { + "epoch": 6.813425737192553, + "grad_norm": 1.0622563362121582, + "learning_rate": 0.00014863602391629297, + "loss": 3.3895, + "step": 100280 + }, + { + "epoch": 6.813765457263215, + "grad_norm": 1.9619560241699219, + "learning_rate": 0.00014859355890746028, + "loss": 3.1179, + "step": 100285 + }, + { + "epoch": 6.814105177333877, + "grad_norm": 0.8727439045906067, + "learning_rate": 0.00014855109389862753, + "loss": 3.2348, + "step": 100290 + }, + { + "epoch": 6.814444897404538, + "grad_norm": 0.860860288143158, + "learning_rate": 0.0001485086288897948, + "loss": 3.3362, + "step": 100295 + }, + { + "epoch": 6.8147846174752, + "grad_norm": 0.8383142352104187, + "learning_rate": 0.0001484661638809621, + "loss": 3.5421, + "step": 100300 + }, + { + "epoch": 6.815124337545862, + "grad_norm": 1.1463592052459717, + "learning_rate": 0.00014842369887212937, + "loss": 2.9161, + "step": 100305 + }, + { + "epoch": 6.8154640576165235, + "grad_norm": 0.8215956687927246, + "learning_rate": 0.00014838123386329662, + "loss": 3.3479, + "step": 100310 + }, + { + "epoch": 6.815803777687186, + "grad_norm": 1.0743534564971924, + "learning_rate": 0.00014833876885446393, + "loss": 3.3354, + "step": 100315 + }, + { + "epoch": 6.816143497757848, + "grad_norm": 0.8926742672920227, + "learning_rate": 0.0001482963038456312, + "loss": 3.2743, + "step": 100320 + }, + { + "epoch": 6.816483217828509, + "grad_norm": 0.993188738822937, + "learning_rate": 0.0001482538388367985, + "loss": 3.3866, + "step": 100325 + }, + { + "epoch": 6.816822937899171, + "grad_norm": 0.871647834777832, + "learning_rate": 0.00014821137382796577, + "loss": 3.2592, + "step": 100330 + }, + { + "epoch": 6.817162657969833, + "grad_norm": 0.956098735332489, + "learning_rate": 0.00014816890881913302, + "loss": 3.5428, + "step": 100335 + }, + { + "epoch": 6.817502378040494, + "grad_norm": 1.0310859680175781, + "learning_rate": 0.00014812644381030033, + "loss": 3.4424, + "step": 100340 + }, + { + "epoch": 6.817842098111156, + "grad_norm": 1.2162631750106812, + "learning_rate": 0.00014808397880146758, + "loss": 3.3407, + "step": 100345 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 0.8254159688949585, + "learning_rate": 0.00014804151379263486, + "loss": 3.5449, + "step": 100350 + }, + { + "epoch": 6.8185215382524795, + "grad_norm": 0.8968541622161865, + "learning_rate": 0.00014799904878380217, + "loss": 3.38, + "step": 100355 + }, + { + "epoch": 6.818861258323142, + "grad_norm": 0.8716548085212708, + "learning_rate": 0.00014795658377496942, + "loss": 3.582, + "step": 100360 + }, + { + "epoch": 6.819200978393804, + "grad_norm": 0.8513797521591187, + "learning_rate": 0.0001479141187661367, + "loss": 3.5626, + "step": 100365 + }, + { + "epoch": 6.819540698464465, + "grad_norm": 1.0409544706344604, + "learning_rate": 0.00014787165375730398, + "loss": 3.4477, + "step": 100370 + }, + { + "epoch": 6.819880418535127, + "grad_norm": 1.2058031558990479, + "learning_rate": 0.00014782918874847126, + "loss": 3.1774, + "step": 100375 + }, + { + "epoch": 6.820220138605789, + "grad_norm": 0.8941798806190491, + "learning_rate": 0.00014778672373963852, + "loss": 3.5776, + "step": 100380 + }, + { + "epoch": 6.82055985867645, + "grad_norm": 0.9369202852249146, + "learning_rate": 0.00014774425873080582, + "loss": 3.398, + "step": 100385 + }, + { + "epoch": 6.820899578747112, + "grad_norm": 0.8939765095710754, + "learning_rate": 0.0001477017937219731, + "loss": 3.3754, + "step": 100390 + }, + { + "epoch": 6.821239298817774, + "grad_norm": 0.8321282863616943, + "learning_rate": 0.00014765932871314036, + "loss": 3.6064, + "step": 100395 + }, + { + "epoch": 6.8215790188884355, + "grad_norm": 1.4228143692016602, + "learning_rate": 0.00014761686370430766, + "loss": 3.4389, + "step": 100400 + }, + { + "epoch": 6.821918738959098, + "grad_norm": 0.9241313934326172, + "learning_rate": 0.00014757439869547492, + "loss": 3.4024, + "step": 100405 + }, + { + "epoch": 6.82225845902976, + "grad_norm": 0.9288254976272583, + "learning_rate": 0.00014753193368664222, + "loss": 3.5809, + "step": 100410 + }, + { + "epoch": 6.822598179100421, + "grad_norm": 0.7728114128112793, + "learning_rate": 0.0001474894686778095, + "loss": 3.5825, + "step": 100415 + }, + { + "epoch": 6.822937899171083, + "grad_norm": 0.837644100189209, + "learning_rate": 0.00014744700366897676, + "loss": 3.438, + "step": 100420 + }, + { + "epoch": 6.823277619241745, + "grad_norm": 0.7819501757621765, + "learning_rate": 0.00014740453866014406, + "loss": 3.3664, + "step": 100425 + }, + { + "epoch": 6.823617339312406, + "grad_norm": 0.9539788365364075, + "learning_rate": 0.00014736207365131132, + "loss": 3.4268, + "step": 100430 + }, + { + "epoch": 6.823957059383068, + "grad_norm": 0.7768871188163757, + "learning_rate": 0.0001473196086424786, + "loss": 3.3018, + "step": 100435 + }, + { + "epoch": 6.82429677945373, + "grad_norm": 0.9526313543319702, + "learning_rate": 0.00014727714363364588, + "loss": 3.2566, + "step": 100440 + }, + { + "epoch": 6.8246364995243916, + "grad_norm": 0.8297504186630249, + "learning_rate": 0.00014723467862481316, + "loss": 3.5678, + "step": 100445 + }, + { + "epoch": 6.824976219595054, + "grad_norm": 0.8598738312721252, + "learning_rate": 0.00014719221361598044, + "loss": 3.2591, + "step": 100450 + }, + { + "epoch": 6.825315939665716, + "grad_norm": 0.8851190209388733, + "learning_rate": 0.00014714974860714772, + "loss": 3.4818, + "step": 100455 + }, + { + "epoch": 6.825655659736377, + "grad_norm": 0.7176364064216614, + "learning_rate": 0.000147107283598315, + "loss": 3.5307, + "step": 100460 + }, + { + "epoch": 6.825995379807039, + "grad_norm": 0.8685081005096436, + "learning_rate": 0.00014706481858948225, + "loss": 3.6001, + "step": 100465 + }, + { + "epoch": 6.826335099877701, + "grad_norm": 1.1548997163772583, + "learning_rate": 0.00014702235358064956, + "loss": 3.4213, + "step": 100470 + }, + { + "epoch": 6.826674819948362, + "grad_norm": 0.8683525919914246, + "learning_rate": 0.0001469798885718168, + "loss": 3.5969, + "step": 100475 + }, + { + "epoch": 6.827014540019024, + "grad_norm": 0.9472466707229614, + "learning_rate": 0.0001469374235629841, + "loss": 3.3485, + "step": 100480 + }, + { + "epoch": 6.827354260089686, + "grad_norm": 0.8800037503242493, + "learning_rate": 0.0001468949585541514, + "loss": 3.2151, + "step": 100485 + }, + { + "epoch": 6.827693980160348, + "grad_norm": 0.9475527405738831, + "learning_rate": 0.00014685249354531865, + "loss": 3.1891, + "step": 100490 + }, + { + "epoch": 6.82803370023101, + "grad_norm": 0.74416583776474, + "learning_rate": 0.00014681002853648596, + "loss": 3.4575, + "step": 100495 + }, + { + "epoch": 6.828373420301672, + "grad_norm": 1.0797321796417236, + "learning_rate": 0.0001467675635276532, + "loss": 3.3908, + "step": 100500 + }, + { + "epoch": 6.828713140372333, + "grad_norm": 1.1292005777359009, + "learning_rate": 0.0001467250985188205, + "loss": 3.6382, + "step": 100505 + }, + { + "epoch": 6.829052860442995, + "grad_norm": 1.5509438514709473, + "learning_rate": 0.00014668263350998777, + "loss": 3.5232, + "step": 100510 + }, + { + "epoch": 6.829392580513657, + "grad_norm": 0.9029313325881958, + "learning_rate": 0.00014664016850115505, + "loss": 3.4356, + "step": 100515 + }, + { + "epoch": 6.829732300584318, + "grad_norm": 0.9006472229957581, + "learning_rate": 0.00014659770349232233, + "loss": 3.403, + "step": 100520 + }, + { + "epoch": 6.83007202065498, + "grad_norm": 0.9215940237045288, + "learning_rate": 0.0001465552384834896, + "loss": 3.3982, + "step": 100525 + }, + { + "epoch": 6.830411740725642, + "grad_norm": 0.8295223712921143, + "learning_rate": 0.0001465127734746569, + "loss": 3.4291, + "step": 100530 + }, + { + "epoch": 6.830751460796304, + "grad_norm": 0.9379692077636719, + "learning_rate": 0.00014647030846582414, + "loss": 3.4455, + "step": 100535 + }, + { + "epoch": 6.831091180866966, + "grad_norm": 0.9655840992927551, + "learning_rate": 0.00014642784345699145, + "loss": 3.5376, + "step": 100540 + }, + { + "epoch": 6.831430900937628, + "grad_norm": 1.219753623008728, + "learning_rate": 0.00014638537844815873, + "loss": 3.4854, + "step": 100545 + }, + { + "epoch": 6.831770621008289, + "grad_norm": 1.8619489669799805, + "learning_rate": 0.00014634291343932598, + "loss": 3.5385, + "step": 100550 + }, + { + "epoch": 6.832110341078951, + "grad_norm": 1.0519357919692993, + "learning_rate": 0.0001463004484304933, + "loss": 3.4131, + "step": 100555 + }, + { + "epoch": 6.832450061149613, + "grad_norm": 1.3346304893493652, + "learning_rate": 0.00014625798342166054, + "loss": 3.2633, + "step": 100560 + }, + { + "epoch": 6.832789781220274, + "grad_norm": 0.7911186814308167, + "learning_rate": 0.00014621551841282782, + "loss": 3.521, + "step": 100565 + }, + { + "epoch": 6.833129501290936, + "grad_norm": 0.9499264359474182, + "learning_rate": 0.0001461730534039951, + "loss": 3.3946, + "step": 100570 + }, + { + "epoch": 6.833469221361598, + "grad_norm": 0.8374705910682678, + "learning_rate": 0.00014613058839516238, + "loss": 3.576, + "step": 100575 + }, + { + "epoch": 6.83380894143226, + "grad_norm": 0.9505161046981812, + "learning_rate": 0.0001460881233863297, + "loss": 3.2027, + "step": 100580 + }, + { + "epoch": 6.834148661502922, + "grad_norm": 1.039169192314148, + "learning_rate": 0.00014604565837749695, + "loss": 3.4302, + "step": 100585 + }, + { + "epoch": 6.834488381573584, + "grad_norm": 0.7770078182220459, + "learning_rate": 0.00014600319336866423, + "loss": 3.5877, + "step": 100590 + }, + { + "epoch": 6.834828101644245, + "grad_norm": 0.8669039607048035, + "learning_rate": 0.0001459607283598315, + "loss": 3.5918, + "step": 100595 + }, + { + "epoch": 6.835167821714907, + "grad_norm": 0.8030334711074829, + "learning_rate": 0.00014591826335099879, + "loss": 3.3575, + "step": 100600 + }, + { + "epoch": 6.835507541785569, + "grad_norm": 0.8888726234436035, + "learning_rate": 0.00014587579834216604, + "loss": 3.4777, + "step": 100605 + }, + { + "epoch": 6.83584726185623, + "grad_norm": 0.8327375054359436, + "learning_rate": 0.00014583333333333335, + "loss": 3.2317, + "step": 100610 + }, + { + "epoch": 6.836186981926892, + "grad_norm": 0.7742469310760498, + "learning_rate": 0.00014579086832450063, + "loss": 3.1883, + "step": 100615 + }, + { + "epoch": 6.836526701997554, + "grad_norm": 0.8814956545829773, + "learning_rate": 0.00014574840331566788, + "loss": 3.3437, + "step": 100620 + }, + { + "epoch": 6.836866422068216, + "grad_norm": 5.384000778198242, + "learning_rate": 0.00014570593830683519, + "loss": 3.1311, + "step": 100625 + }, + { + "epoch": 6.837206142138878, + "grad_norm": 0.9037419557571411, + "learning_rate": 0.00014566347329800244, + "loss": 3.2839, + "step": 100630 + }, + { + "epoch": 6.83754586220954, + "grad_norm": 0.8623989820480347, + "learning_rate": 0.00014562100828916972, + "loss": 3.3781, + "step": 100635 + }, + { + "epoch": 6.837885582280201, + "grad_norm": 0.7991964817047119, + "learning_rate": 0.000145578543280337, + "loss": 3.4076, + "step": 100640 + }, + { + "epoch": 6.838225302350863, + "grad_norm": 1.0682063102722168, + "learning_rate": 0.00014553607827150428, + "loss": 3.0691, + "step": 100645 + }, + { + "epoch": 6.838565022421525, + "grad_norm": 0.9958236813545227, + "learning_rate": 0.00014549361326267156, + "loss": 3.4586, + "step": 100650 + }, + { + "epoch": 6.838904742492186, + "grad_norm": 1.0269113779067993, + "learning_rate": 0.00014545114825383884, + "loss": 3.3446, + "step": 100655 + }, + { + "epoch": 6.839244462562848, + "grad_norm": 0.7753036618232727, + "learning_rate": 0.00014540868324500612, + "loss": 3.5333, + "step": 100660 + }, + { + "epoch": 6.83958418263351, + "grad_norm": 0.7716336846351624, + "learning_rate": 0.0001453662182361734, + "loss": 3.5162, + "step": 100665 + }, + { + "epoch": 6.839923902704172, + "grad_norm": 1.0808064937591553, + "learning_rate": 0.00014532375322734068, + "loss": 3.5195, + "step": 100670 + }, + { + "epoch": 6.840263622774834, + "grad_norm": 1.0524535179138184, + "learning_rate": 0.00014528128821850793, + "loss": 3.4166, + "step": 100675 + }, + { + "epoch": 6.840603342845496, + "grad_norm": 0.9189282059669495, + "learning_rate": 0.00014523882320967524, + "loss": 3.3256, + "step": 100680 + }, + { + "epoch": 6.840943062916157, + "grad_norm": 1.2636674642562866, + "learning_rate": 0.00014519635820084252, + "loss": 3.4994, + "step": 100685 + }, + { + "epoch": 6.841282782986819, + "grad_norm": 0.9481356143951416, + "learning_rate": 0.00014515389319200977, + "loss": 3.6425, + "step": 100690 + }, + { + "epoch": 6.841622503057481, + "grad_norm": 1.2471166849136353, + "learning_rate": 0.00014511142818317708, + "loss": 3.4651, + "step": 100695 + }, + { + "epoch": 6.841962223128142, + "grad_norm": 0.867953360080719, + "learning_rate": 0.00014506896317434433, + "loss": 3.4292, + "step": 100700 + }, + { + "epoch": 6.842301943198804, + "grad_norm": 1.024450421333313, + "learning_rate": 0.0001450264981655116, + "loss": 3.5599, + "step": 100705 + }, + { + "epoch": 6.842641663269466, + "grad_norm": 1.0250396728515625, + "learning_rate": 0.00014498403315667892, + "loss": 3.3821, + "step": 100710 + }, + { + "epoch": 6.842981383340128, + "grad_norm": 0.8503158688545227, + "learning_rate": 0.00014494156814784617, + "loss": 3.4047, + "step": 100715 + }, + { + "epoch": 6.84332110341079, + "grad_norm": 0.9060848951339722, + "learning_rate": 0.00014489910313901345, + "loss": 3.4563, + "step": 100720 + }, + { + "epoch": 6.843660823481452, + "grad_norm": 0.9839969277381897, + "learning_rate": 0.00014485663813018073, + "loss": 3.261, + "step": 100725 + }, + { + "epoch": 6.844000543552113, + "grad_norm": 1.0185240507125854, + "learning_rate": 0.000144814173121348, + "loss": 3.3956, + "step": 100730 + }, + { + "epoch": 6.844340263622775, + "grad_norm": 0.9988280534744263, + "learning_rate": 0.00014477170811251527, + "loss": 3.2614, + "step": 100735 + }, + { + "epoch": 6.844679983693436, + "grad_norm": 0.7813850045204163, + "learning_rate": 0.00014472924310368257, + "loss": 3.2433, + "step": 100740 + }, + { + "epoch": 6.845019703764098, + "grad_norm": 1.005468487739563, + "learning_rate": 0.00014468677809484985, + "loss": 3.1975, + "step": 100745 + }, + { + "epoch": 6.84535942383476, + "grad_norm": 0.9203284382820129, + "learning_rate": 0.00014464431308601713, + "loss": 3.6079, + "step": 100750 + }, + { + "epoch": 6.845699143905422, + "grad_norm": 0.8210524320602417, + "learning_rate": 0.0001446018480771844, + "loss": 3.4371, + "step": 100755 + }, + { + "epoch": 6.846038863976084, + "grad_norm": 0.7782256007194519, + "learning_rate": 0.00014455938306835167, + "loss": 3.4375, + "step": 100760 + }, + { + "epoch": 6.846378584046746, + "grad_norm": 0.9096581339836121, + "learning_rate": 0.00014451691805951897, + "loss": 3.6538, + "step": 100765 + }, + { + "epoch": 6.846718304117407, + "grad_norm": 0.783846914768219, + "learning_rate": 0.00014447445305068623, + "loss": 3.4577, + "step": 100770 + }, + { + "epoch": 6.847058024188069, + "grad_norm": 0.9606424570083618, + "learning_rate": 0.0001444319880418535, + "loss": 3.2967, + "step": 100775 + }, + { + "epoch": 6.847397744258731, + "grad_norm": 0.9680413007736206, + "learning_rate": 0.00014438952303302081, + "loss": 3.4984, + "step": 100780 + }, + { + "epoch": 6.847737464329392, + "grad_norm": 0.8972719311714172, + "learning_rate": 0.00014434705802418807, + "loss": 3.6751, + "step": 100785 + }, + { + "epoch": 6.848077184400054, + "grad_norm": 0.9071564674377441, + "learning_rate": 0.00014430459301535535, + "loss": 3.2852, + "step": 100790 + }, + { + "epoch": 6.848416904470716, + "grad_norm": 0.7532631754875183, + "learning_rate": 0.00014426212800652263, + "loss": 3.3821, + "step": 100795 + }, + { + "epoch": 6.848756624541378, + "grad_norm": 0.8622666001319885, + "learning_rate": 0.0001442196629976899, + "loss": 3.18, + "step": 100800 + }, + { + "epoch": 6.84909634461204, + "grad_norm": 1.0347177982330322, + "learning_rate": 0.00014417719798885716, + "loss": 3.3357, + "step": 100805 + }, + { + "epoch": 6.849436064682702, + "grad_norm": 1.0447789430618286, + "learning_rate": 0.00014413473298002447, + "loss": 3.47, + "step": 100810 + }, + { + "epoch": 6.849775784753363, + "grad_norm": 0.8838251829147339, + "learning_rate": 0.00014409226797119175, + "loss": 3.6601, + "step": 100815 + }, + { + "epoch": 6.850115504824025, + "grad_norm": 1.0178883075714111, + "learning_rate": 0.000144049802962359, + "loss": 3.337, + "step": 100820 + }, + { + "epoch": 6.850455224894687, + "grad_norm": 1.1297966241836548, + "learning_rate": 0.0001440073379535263, + "loss": 3.3742, + "step": 100825 + }, + { + "epoch": 6.850794944965348, + "grad_norm": 0.8663142323493958, + "learning_rate": 0.00014396487294469356, + "loss": 3.3309, + "step": 100830 + }, + { + "epoch": 6.85113466503601, + "grad_norm": 0.914840579032898, + "learning_rate": 0.00014392240793586087, + "loss": 3.4583, + "step": 100835 + }, + { + "epoch": 6.851474385106672, + "grad_norm": 0.9217373132705688, + "learning_rate": 0.00014387994292702815, + "loss": 3.5524, + "step": 100840 + }, + { + "epoch": 6.851814105177334, + "grad_norm": 0.7700374126434326, + "learning_rate": 0.0001438374779181954, + "loss": 3.3281, + "step": 100845 + }, + { + "epoch": 6.852153825247996, + "grad_norm": 0.8155715465545654, + "learning_rate": 0.0001437950129093627, + "loss": 3.2973, + "step": 100850 + }, + { + "epoch": 6.852493545318658, + "grad_norm": 0.8968302607536316, + "learning_rate": 0.00014375254790052996, + "loss": 3.5273, + "step": 100855 + }, + { + "epoch": 6.852833265389319, + "grad_norm": 0.8967390060424805, + "learning_rate": 0.00014371008289169724, + "loss": 3.5197, + "step": 100860 + }, + { + "epoch": 6.853172985459981, + "grad_norm": 1.1333025693893433, + "learning_rate": 0.00014366761788286452, + "loss": 3.2928, + "step": 100865 + }, + { + "epoch": 6.853512705530643, + "grad_norm": 0.8469271063804626, + "learning_rate": 0.0001436251528740318, + "loss": 3.2615, + "step": 100870 + }, + { + "epoch": 6.853852425601304, + "grad_norm": 1.0033729076385498, + "learning_rate": 0.00014358268786519908, + "loss": 3.7662, + "step": 100875 + }, + { + "epoch": 6.854192145671966, + "grad_norm": 0.7947058081626892, + "learning_rate": 0.00014354022285636636, + "loss": 3.4192, + "step": 100880 + }, + { + "epoch": 6.854531865742628, + "grad_norm": 0.8499600291252136, + "learning_rate": 0.00014349775784753364, + "loss": 3.4748, + "step": 100885 + }, + { + "epoch": 6.85487158581329, + "grad_norm": 0.8053815960884094, + "learning_rate": 0.0001434552928387009, + "loss": 3.3958, + "step": 100890 + }, + { + "epoch": 6.855211305883952, + "grad_norm": 0.9572646617889404, + "learning_rate": 0.0001434128278298682, + "loss": 3.2565, + "step": 100895 + }, + { + "epoch": 6.855551025954614, + "grad_norm": 2.1924071311950684, + "learning_rate": 0.00014337036282103545, + "loss": 3.5375, + "step": 100900 + }, + { + "epoch": 6.855890746025275, + "grad_norm": 0.9553772807121277, + "learning_rate": 0.00014332789781220273, + "loss": 3.1635, + "step": 100905 + }, + { + "epoch": 6.856230466095937, + "grad_norm": 1.0709450244903564, + "learning_rate": 0.00014328543280337004, + "loss": 3.325, + "step": 100910 + }, + { + "epoch": 6.856570186166599, + "grad_norm": 0.9797815680503845, + "learning_rate": 0.0001432429677945373, + "loss": 3.3183, + "step": 100915 + }, + { + "epoch": 6.85690990623726, + "grad_norm": 1.0561468601226807, + "learning_rate": 0.0001432005027857046, + "loss": 3.4245, + "step": 100920 + }, + { + "epoch": 6.857249626307922, + "grad_norm": 0.775399923324585, + "learning_rate": 0.00014315803777687185, + "loss": 3.6002, + "step": 100925 + }, + { + "epoch": 6.857589346378584, + "grad_norm": 0.7595136165618896, + "learning_rate": 0.00014311557276803913, + "loss": 3.3867, + "step": 100930 + }, + { + "epoch": 6.857929066449246, + "grad_norm": 1.1113492250442505, + "learning_rate": 0.00014307310775920641, + "loss": 3.5604, + "step": 100935 + }, + { + "epoch": 6.858268786519908, + "grad_norm": 1.0987749099731445, + "learning_rate": 0.0001430306427503737, + "loss": 3.2858, + "step": 100940 + }, + { + "epoch": 6.85860850659057, + "grad_norm": 0.8912937045097351, + "learning_rate": 0.00014298817774154098, + "loss": 3.5565, + "step": 100945 + }, + { + "epoch": 6.858948226661231, + "grad_norm": 0.8381293416023254, + "learning_rate": 0.00014294571273270826, + "loss": 3.3, + "step": 100950 + }, + { + "epoch": 6.859287946731893, + "grad_norm": 0.790020227432251, + "learning_rate": 0.00014290324772387554, + "loss": 3.1598, + "step": 100955 + }, + { + "epoch": 6.859627666802554, + "grad_norm": 0.8211153745651245, + "learning_rate": 0.0001428607827150428, + "loss": 3.2366, + "step": 100960 + }, + { + "epoch": 6.859967386873216, + "grad_norm": 0.8839123845100403, + "learning_rate": 0.0001428183177062101, + "loss": 3.5279, + "step": 100965 + }, + { + "epoch": 6.860307106943878, + "grad_norm": 1.1718555688858032, + "learning_rate": 0.00014277585269737738, + "loss": 3.2686, + "step": 100970 + }, + { + "epoch": 6.8606468270145395, + "grad_norm": 0.8766233325004578, + "learning_rate": 0.00014273338768854463, + "loss": 3.4381, + "step": 100975 + }, + { + "epoch": 6.860986547085202, + "grad_norm": 6.469608306884766, + "learning_rate": 0.00014269092267971194, + "loss": 3.4044, + "step": 100980 + }, + { + "epoch": 6.861326267155864, + "grad_norm": 1.4574661254882812, + "learning_rate": 0.0001426484576708792, + "loss": 3.2278, + "step": 100985 + }, + { + "epoch": 6.861665987226525, + "grad_norm": 1.2090644836425781, + "learning_rate": 0.00014260599266204647, + "loss": 3.308, + "step": 100990 + }, + { + "epoch": 6.862005707297187, + "grad_norm": 0.9074265956878662, + "learning_rate": 0.00014256352765321375, + "loss": 3.672, + "step": 100995 + }, + { + "epoch": 6.862345427367849, + "grad_norm": 1.6997849941253662, + "learning_rate": 0.00014252106264438103, + "loss": 3.5547, + "step": 101000 + }, + { + "epoch": 6.86268514743851, + "grad_norm": 0.8809224367141724, + "learning_rate": 0.00014247859763554834, + "loss": 3.4505, + "step": 101005 + }, + { + "epoch": 6.863024867509172, + "grad_norm": 0.7211024165153503, + "learning_rate": 0.0001424361326267156, + "loss": 3.2673, + "step": 101010 + }, + { + "epoch": 6.863364587579834, + "grad_norm": 0.8326045274734497, + "learning_rate": 0.00014239366761788287, + "loss": 3.2284, + "step": 101015 + }, + { + "epoch": 6.8637043076504956, + "grad_norm": 0.7737773656845093, + "learning_rate": 0.00014235120260905015, + "loss": 3.6636, + "step": 101020 + }, + { + "epoch": 6.864044027721158, + "grad_norm": 1.3001669645309448, + "learning_rate": 0.00014230873760021743, + "loss": 3.4287, + "step": 101025 + }, + { + "epoch": 6.86438374779182, + "grad_norm": 0.7855926156044006, + "learning_rate": 0.00014226627259138468, + "loss": 3.4564, + "step": 101030 + }, + { + "epoch": 6.864723467862481, + "grad_norm": 0.857282817363739, + "learning_rate": 0.000142223807582552, + "loss": 3.7188, + "step": 101035 + }, + { + "epoch": 6.865063187933143, + "grad_norm": 1.151602029800415, + "learning_rate": 0.00014218134257371927, + "loss": 3.3067, + "step": 101040 + }, + { + "epoch": 6.865402908003805, + "grad_norm": 0.9900321364402771, + "learning_rate": 0.00014213887756488652, + "loss": 3.4488, + "step": 101045 + }, + { + "epoch": 6.865742628074466, + "grad_norm": 0.9885067343711853, + "learning_rate": 0.00014209641255605383, + "loss": 3.5081, + "step": 101050 + }, + { + "epoch": 6.866082348145128, + "grad_norm": 1.1168675422668457, + "learning_rate": 0.00014205394754722108, + "loss": 3.2844, + "step": 101055 + }, + { + "epoch": 6.86642206821579, + "grad_norm": 1.2498767375946045, + "learning_rate": 0.00014201148253838836, + "loss": 3.5683, + "step": 101060 + }, + { + "epoch": 6.866761788286452, + "grad_norm": 0.7961181402206421, + "learning_rate": 0.00014196901752955564, + "loss": 3.3437, + "step": 101065 + }, + { + "epoch": 6.867101508357114, + "grad_norm": 0.8169506192207336, + "learning_rate": 0.00014192655252072292, + "loss": 3.4872, + "step": 101070 + }, + { + "epoch": 6.867441228427776, + "grad_norm": 0.9942353963851929, + "learning_rate": 0.0001418840875118902, + "loss": 3.3333, + "step": 101075 + }, + { + "epoch": 6.867780948498437, + "grad_norm": 0.8404791355133057, + "learning_rate": 0.00014184162250305748, + "loss": 3.5007, + "step": 101080 + }, + { + "epoch": 6.868120668569099, + "grad_norm": 1.0032029151916504, + "learning_rate": 0.00014179915749422476, + "loss": 3.0833, + "step": 101085 + }, + { + "epoch": 6.868460388639761, + "grad_norm": 0.9548054337501526, + "learning_rate": 0.00014175669248539204, + "loss": 3.5011, + "step": 101090 + }, + { + "epoch": 6.868800108710422, + "grad_norm": 1.138546347618103, + "learning_rate": 0.00014171422747655932, + "loss": 3.257, + "step": 101095 + }, + { + "epoch": 6.869139828781084, + "grad_norm": 0.7231637239456177, + "learning_rate": 0.00014167176246772658, + "loss": 3.3651, + "step": 101100 + }, + { + "epoch": 6.869479548851746, + "grad_norm": 0.9551514983177185, + "learning_rate": 0.00014162929745889388, + "loss": 3.2057, + "step": 101105 + }, + { + "epoch": 6.869819268922408, + "grad_norm": 0.8009248971939087, + "learning_rate": 0.00014158683245006116, + "loss": 3.6916, + "step": 101110 + }, + { + "epoch": 6.87015898899307, + "grad_norm": 0.7679635286331177, + "learning_rate": 0.00014154436744122842, + "loss": 3.2552, + "step": 101115 + }, + { + "epoch": 6.870498709063732, + "grad_norm": 1.009892463684082, + "learning_rate": 0.00014150190243239572, + "loss": 3.4282, + "step": 101120 + }, + { + "epoch": 6.870838429134393, + "grad_norm": 0.8812618851661682, + "learning_rate": 0.00014145943742356298, + "loss": 3.474, + "step": 101125 + }, + { + "epoch": 6.871178149205055, + "grad_norm": 1.0081170797348022, + "learning_rate": 0.00014141697241473026, + "loss": 3.3406, + "step": 101130 + }, + { + "epoch": 6.871517869275717, + "grad_norm": 1.0054600238800049, + "learning_rate": 0.00014137450740589756, + "loss": 3.3839, + "step": 101135 + }, + { + "epoch": 6.871857589346378, + "grad_norm": 1.1273664236068726, + "learning_rate": 0.00014133204239706482, + "loss": 3.6108, + "step": 101140 + }, + { + "epoch": 6.87219730941704, + "grad_norm": 0.9133273959159851, + "learning_rate": 0.0001412895773882321, + "loss": 3.6181, + "step": 101145 + }, + { + "epoch": 6.872537029487702, + "grad_norm": 1.1208043098449707, + "learning_rate": 0.00014124711237939938, + "loss": 3.7039, + "step": 101150 + }, + { + "epoch": 6.872876749558364, + "grad_norm": 0.7569783329963684, + "learning_rate": 0.00014120464737056666, + "loss": 3.5428, + "step": 101155 + }, + { + "epoch": 6.873216469629026, + "grad_norm": 0.8607423305511475, + "learning_rate": 0.0001411621823617339, + "loss": 3.3137, + "step": 101160 + }, + { + "epoch": 6.873556189699688, + "grad_norm": 1.1044596433639526, + "learning_rate": 0.00014111971735290122, + "loss": 3.4552, + "step": 101165 + }, + { + "epoch": 6.873895909770349, + "grad_norm": 0.8518256545066833, + "learning_rate": 0.0001410772523440685, + "loss": 3.5203, + "step": 101170 + }, + { + "epoch": 6.874235629841011, + "grad_norm": 1.1475825309753418, + "learning_rate": 0.00014103478733523578, + "loss": 3.4585, + "step": 101175 + }, + { + "epoch": 6.874575349911673, + "grad_norm": 0.8740882277488708, + "learning_rate": 0.00014099232232640306, + "loss": 3.4844, + "step": 101180 + }, + { + "epoch": 6.874915069982334, + "grad_norm": 0.9412821531295776, + "learning_rate": 0.0001409498573175703, + "loss": 3.6011, + "step": 101185 + }, + { + "epoch": 6.875254790052996, + "grad_norm": 1.128775715827942, + "learning_rate": 0.00014090739230873762, + "loss": 3.1923, + "step": 101190 + }, + { + "epoch": 6.875594510123658, + "grad_norm": 0.8854586482048035, + "learning_rate": 0.00014086492729990487, + "loss": 3.3154, + "step": 101195 + }, + { + "epoch": 6.87593423019432, + "grad_norm": 0.8445804715156555, + "learning_rate": 0.00014082246229107215, + "loss": 3.281, + "step": 101200 + }, + { + "epoch": 6.876273950264982, + "grad_norm": 1.010047435760498, + "learning_rate": 0.00014077999728223946, + "loss": 3.3043, + "step": 101205 + }, + { + "epoch": 6.876613670335644, + "grad_norm": 0.7471079230308533, + "learning_rate": 0.0001407375322734067, + "loss": 3.2745, + "step": 101210 + }, + { + "epoch": 6.876953390406305, + "grad_norm": 1.2332203388214111, + "learning_rate": 0.000140695067264574, + "loss": 3.7253, + "step": 101215 + }, + { + "epoch": 6.877293110476967, + "grad_norm": 0.9984679222106934, + "learning_rate": 0.00014065260225574127, + "loss": 3.2229, + "step": 101220 + }, + { + "epoch": 6.877632830547629, + "grad_norm": 0.8730010390281677, + "learning_rate": 0.00014061013724690855, + "loss": 3.5342, + "step": 101225 + }, + { + "epoch": 6.87797255061829, + "grad_norm": 0.9811608791351318, + "learning_rate": 0.0001405676722380758, + "loss": 3.6507, + "step": 101230 + }, + { + "epoch": 6.878312270688952, + "grad_norm": 1.6773959398269653, + "learning_rate": 0.0001405252072292431, + "loss": 3.5234, + "step": 101235 + }, + { + "epoch": 6.878651990759614, + "grad_norm": 1.005483627319336, + "learning_rate": 0.0001404827422204104, + "loss": 3.338, + "step": 101240 + }, + { + "epoch": 6.878991710830276, + "grad_norm": 0.9407474994659424, + "learning_rate": 0.00014044027721157764, + "loss": 3.4132, + "step": 101245 + }, + { + "epoch": 6.879331430900938, + "grad_norm": 1.0900827646255493, + "learning_rate": 0.00014039781220274495, + "loss": 3.3894, + "step": 101250 + }, + { + "epoch": 6.8796711509716, + "grad_norm": 0.7724384069442749, + "learning_rate": 0.0001403553471939122, + "loss": 3.5824, + "step": 101255 + }, + { + "epoch": 6.880010871042261, + "grad_norm": 1.5992456674575806, + "learning_rate": 0.0001403128821850795, + "loss": 3.4357, + "step": 101260 + }, + { + "epoch": 6.880350591112923, + "grad_norm": 1.1048970222473145, + "learning_rate": 0.0001402704171762468, + "loss": 3.4591, + "step": 101265 + }, + { + "epoch": 6.880690311183585, + "grad_norm": 0.7961332201957703, + "learning_rate": 0.00014022795216741404, + "loss": 3.2793, + "step": 101270 + }, + { + "epoch": 6.881030031254246, + "grad_norm": 0.9676252007484436, + "learning_rate": 0.00014018548715858135, + "loss": 3.7215, + "step": 101275 + }, + { + "epoch": 6.881369751324908, + "grad_norm": 1.0573689937591553, + "learning_rate": 0.0001401430221497486, + "loss": 3.3419, + "step": 101280 + }, + { + "epoch": 6.88170947139557, + "grad_norm": 0.8015015721321106, + "learning_rate": 0.00014010055714091588, + "loss": 3.3179, + "step": 101285 + }, + { + "epoch": 6.882049191466232, + "grad_norm": 0.8858278393745422, + "learning_rate": 0.00014005809213208316, + "loss": 3.4305, + "step": 101290 + }, + { + "epoch": 6.882388911536894, + "grad_norm": 0.9058115482330322, + "learning_rate": 0.00014001562712325044, + "loss": 3.3668, + "step": 101295 + }, + { + "epoch": 6.882728631607556, + "grad_norm": 0.8838462233543396, + "learning_rate": 0.00013997316211441773, + "loss": 3.5456, + "step": 101300 + }, + { + "epoch": 6.883068351678217, + "grad_norm": 0.926675021648407, + "learning_rate": 0.000139930697105585, + "loss": 3.3022, + "step": 101305 + }, + { + "epoch": 6.883408071748879, + "grad_norm": 0.743073582649231, + "learning_rate": 0.00013988823209675229, + "loss": 3.3013, + "step": 101310 + }, + { + "epoch": 6.883747791819541, + "grad_norm": 1.0894198417663574, + "learning_rate": 0.00013984576708791954, + "loss": 3.3591, + "step": 101315 + }, + { + "epoch": 6.884087511890202, + "grad_norm": 3.0897367000579834, + "learning_rate": 0.00013980330207908685, + "loss": 3.2738, + "step": 101320 + }, + { + "epoch": 6.884427231960864, + "grad_norm": 0.9849005937576294, + "learning_rate": 0.0001397608370702541, + "loss": 3.5635, + "step": 101325 + }, + { + "epoch": 6.884766952031526, + "grad_norm": 0.7650162577629089, + "learning_rate": 0.00013971837206142138, + "loss": 3.2213, + "step": 101330 + }, + { + "epoch": 6.885106672102188, + "grad_norm": 0.7840611934661865, + "learning_rate": 0.00013967590705258869, + "loss": 3.4228, + "step": 101335 + }, + { + "epoch": 6.88544639217285, + "grad_norm": 0.8601134419441223, + "learning_rate": 0.00013963344204375594, + "loss": 3.2514, + "step": 101340 + }, + { + "epoch": 6.885786112243512, + "grad_norm": 1.0822794437408447, + "learning_rate": 0.00013959097703492325, + "loss": 3.589, + "step": 101345 + }, + { + "epoch": 6.886125832314173, + "grad_norm": 0.8670470118522644, + "learning_rate": 0.0001395485120260905, + "loss": 3.5429, + "step": 101350 + }, + { + "epoch": 6.886465552384835, + "grad_norm": 0.8551841378211975, + "learning_rate": 0.00013950604701725778, + "loss": 3.4189, + "step": 101355 + }, + { + "epoch": 6.886805272455497, + "grad_norm": 0.7800451517105103, + "learning_rate": 0.00013946358200842506, + "loss": 3.305, + "step": 101360 + }, + { + "epoch": 6.887144992526158, + "grad_norm": 0.7777644395828247, + "learning_rate": 0.00013942111699959234, + "loss": 3.4994, + "step": 101365 + }, + { + "epoch": 6.88748471259682, + "grad_norm": 0.9371853470802307, + "learning_rate": 0.00013937865199075962, + "loss": 3.3491, + "step": 101370 + }, + { + "epoch": 6.8878244326674825, + "grad_norm": 1.1401671171188354, + "learning_rate": 0.0001393361869819269, + "loss": 3.2705, + "step": 101375 + }, + { + "epoch": 6.888164152738144, + "grad_norm": 0.8999339938163757, + "learning_rate": 0.00013929372197309418, + "loss": 3.4378, + "step": 101380 + }, + { + "epoch": 6.888503872808806, + "grad_norm": 0.9235441088676453, + "learning_rate": 0.00013925125696426143, + "loss": 3.3706, + "step": 101385 + }, + { + "epoch": 6.888843592879468, + "grad_norm": 0.9213147759437561, + "learning_rate": 0.00013920879195542874, + "loss": 3.476, + "step": 101390 + }, + { + "epoch": 6.889183312950129, + "grad_norm": 1.114593744277954, + "learning_rate": 0.000139166326946596, + "loss": 3.6385, + "step": 101395 + }, + { + "epoch": 6.889523033020791, + "grad_norm": 0.7589915990829468, + "learning_rate": 0.00013912386193776327, + "loss": 3.2428, + "step": 101400 + }, + { + "epoch": 6.889862753091453, + "grad_norm": 0.979759931564331, + "learning_rate": 0.00013908139692893058, + "loss": 3.4265, + "step": 101405 + }, + { + "epoch": 6.890202473162114, + "grad_norm": 1.1800709962844849, + "learning_rate": 0.00013903893192009783, + "loss": 3.4784, + "step": 101410 + }, + { + "epoch": 6.890542193232776, + "grad_norm": 0.8441157937049866, + "learning_rate": 0.0001389964669112651, + "loss": 3.2615, + "step": 101415 + }, + { + "epoch": 6.8908819133034385, + "grad_norm": 1.218435287475586, + "learning_rate": 0.0001389540019024324, + "loss": 3.2036, + "step": 101420 + }, + { + "epoch": 6.8912216333741, + "grad_norm": 1.0090484619140625, + "learning_rate": 0.00013891153689359967, + "loss": 3.4897, + "step": 101425 + }, + { + "epoch": 6.891561353444762, + "grad_norm": 1.1030536890029907, + "learning_rate": 0.00013886907188476698, + "loss": 3.499, + "step": 101430 + }, + { + "epoch": 6.891901073515423, + "grad_norm": 1.0406455993652344, + "learning_rate": 0.00013882660687593423, + "loss": 3.2973, + "step": 101435 + }, + { + "epoch": 6.892240793586085, + "grad_norm": 1.2590299844741821, + "learning_rate": 0.0001387841418671015, + "loss": 3.8024, + "step": 101440 + }, + { + "epoch": 6.892580513656747, + "grad_norm": 0.6762768626213074, + "learning_rate": 0.0001387416768582688, + "loss": 3.3836, + "step": 101445 + }, + { + "epoch": 6.892920233727408, + "grad_norm": 0.860887348651886, + "learning_rate": 0.00013869921184943607, + "loss": 3.3621, + "step": 101450 + }, + { + "epoch": 6.89325995379807, + "grad_norm": 0.8948068618774414, + "learning_rate": 0.00013865674684060333, + "loss": 3.4077, + "step": 101455 + }, + { + "epoch": 6.893599673868732, + "grad_norm": 0.9776368737220764, + "learning_rate": 0.00013861428183177063, + "loss": 3.4465, + "step": 101460 + }, + { + "epoch": 6.893939393939394, + "grad_norm": 1.0009042024612427, + "learning_rate": 0.0001385718168229379, + "loss": 3.2022, + "step": 101465 + }, + { + "epoch": 6.894279114010056, + "grad_norm": 0.7697983980178833, + "learning_rate": 0.00013852935181410517, + "loss": 3.6686, + "step": 101470 + }, + { + "epoch": 6.894618834080718, + "grad_norm": 1.3811979293823242, + "learning_rate": 0.00013848688680527247, + "loss": 3.4602, + "step": 101475 + }, + { + "epoch": 6.894958554151379, + "grad_norm": 0.8307890892028809, + "learning_rate": 0.00013844442179643973, + "loss": 3.1179, + "step": 101480 + }, + { + "epoch": 6.895298274222041, + "grad_norm": 0.904189944267273, + "learning_rate": 0.000138401956787607, + "loss": 3.1802, + "step": 101485 + }, + { + "epoch": 6.895637994292703, + "grad_norm": 0.8173717260360718, + "learning_rate": 0.0001383594917787743, + "loss": 3.2303, + "step": 101490 + }, + { + "epoch": 6.895977714363364, + "grad_norm": 1.0800262689590454, + "learning_rate": 0.00013831702676994157, + "loss": 3.3792, + "step": 101495 + }, + { + "epoch": 6.896317434434026, + "grad_norm": 0.8306785225868225, + "learning_rate": 0.00013827456176110885, + "loss": 3.3252, + "step": 101500 + }, + { + "epoch": 6.896657154504688, + "grad_norm": 0.8672257661819458, + "learning_rate": 0.00013823209675227613, + "loss": 3.4673, + "step": 101505 + }, + { + "epoch": 6.89699687457535, + "grad_norm": 1.0456115007400513, + "learning_rate": 0.0001381896317434434, + "loss": 3.4109, + "step": 101510 + }, + { + "epoch": 6.897336594646012, + "grad_norm": 1.0217704772949219, + "learning_rate": 0.0001381471667346107, + "loss": 3.5299, + "step": 101515 + }, + { + "epoch": 6.897676314716674, + "grad_norm": 1.0737559795379639, + "learning_rate": 0.00013810470172577797, + "loss": 3.0561, + "step": 101520 + }, + { + "epoch": 6.898016034787335, + "grad_norm": 1.1597833633422852, + "learning_rate": 0.00013806223671694522, + "loss": 3.4705, + "step": 101525 + }, + { + "epoch": 6.898355754857997, + "grad_norm": 0.8341037631034851, + "learning_rate": 0.00013801977170811253, + "loss": 3.2232, + "step": 101530 + }, + { + "epoch": 6.898695474928659, + "grad_norm": 1.0014547109603882, + "learning_rate": 0.0001379773066992798, + "loss": 3.2566, + "step": 101535 + }, + { + "epoch": 6.89903519499932, + "grad_norm": 0.749308168888092, + "learning_rate": 0.00013793484169044706, + "loss": 3.4066, + "step": 101540 + }, + { + "epoch": 6.899374915069982, + "grad_norm": 0.8159890174865723, + "learning_rate": 0.00013789237668161437, + "loss": 3.233, + "step": 101545 + }, + { + "epoch": 6.899714635140644, + "grad_norm": 0.8408530354499817, + "learning_rate": 0.00013784991167278162, + "loss": 3.4752, + "step": 101550 + }, + { + "epoch": 6.900054355211306, + "grad_norm": 0.8725390434265137, + "learning_rate": 0.0001378074466639489, + "loss": 3.4541, + "step": 101555 + }, + { + "epoch": 6.900394075281968, + "grad_norm": 0.9349693059921265, + "learning_rate": 0.0001377649816551162, + "loss": 3.5814, + "step": 101560 + }, + { + "epoch": 6.90073379535263, + "grad_norm": 0.7430732846260071, + "learning_rate": 0.00013772251664628346, + "loss": 3.4056, + "step": 101565 + }, + { + "epoch": 6.901073515423291, + "grad_norm": 1.0483983755111694, + "learning_rate": 0.00013768005163745074, + "loss": 3.3211, + "step": 101570 + }, + { + "epoch": 6.901413235493953, + "grad_norm": 0.7922653555870056, + "learning_rate": 0.00013763758662861802, + "loss": 3.3475, + "step": 101575 + }, + { + "epoch": 6.901752955564615, + "grad_norm": 1.356041431427002, + "learning_rate": 0.0001375951216197853, + "loss": 3.1292, + "step": 101580 + }, + { + "epoch": 6.902092675635276, + "grad_norm": 0.9688220620155334, + "learning_rate": 0.00013755265661095255, + "loss": 3.4707, + "step": 101585 + }, + { + "epoch": 6.902432395705938, + "grad_norm": 0.8608810305595398, + "learning_rate": 0.00013751019160211986, + "loss": 3.2843, + "step": 101590 + }, + { + "epoch": 6.9027721157766, + "grad_norm": 1.0149985551834106, + "learning_rate": 0.00013746772659328714, + "loss": 3.2355, + "step": 101595 + }, + { + "epoch": 6.903111835847262, + "grad_norm": 0.8432103395462036, + "learning_rate": 0.00013742526158445442, + "loss": 3.175, + "step": 101600 + }, + { + "epoch": 6.903451555917924, + "grad_norm": 0.6639972925186157, + "learning_rate": 0.0001373827965756217, + "loss": 3.6131, + "step": 101605 + }, + { + "epoch": 6.903791275988586, + "grad_norm": 1.108114242553711, + "learning_rate": 0.00013734033156678895, + "loss": 3.2453, + "step": 101610 + }, + { + "epoch": 6.904130996059247, + "grad_norm": 0.8363702893257141, + "learning_rate": 0.00013729786655795626, + "loss": 3.4085, + "step": 101615 + }, + { + "epoch": 6.904470716129909, + "grad_norm": 0.9337568879127502, + "learning_rate": 0.00013725540154912351, + "loss": 3.7288, + "step": 101620 + }, + { + "epoch": 6.904810436200571, + "grad_norm": 0.8618992567062378, + "learning_rate": 0.0001372129365402908, + "loss": 3.6314, + "step": 101625 + }, + { + "epoch": 6.905150156271232, + "grad_norm": 1.1925410032272339, + "learning_rate": 0.0001371704715314581, + "loss": 3.4431, + "step": 101630 + }, + { + "epoch": 6.905489876341894, + "grad_norm": 1.0381656885147095, + "learning_rate": 0.00013712800652262535, + "loss": 3.4254, + "step": 101635 + }, + { + "epoch": 6.905829596412556, + "grad_norm": 0.8680646419525146, + "learning_rate": 0.00013708554151379263, + "loss": 3.5436, + "step": 101640 + }, + { + "epoch": 6.906169316483218, + "grad_norm": 0.9252715110778809, + "learning_rate": 0.00013704307650495991, + "loss": 3.1256, + "step": 101645 + }, + { + "epoch": 6.90650903655388, + "grad_norm": 0.8928626775741577, + "learning_rate": 0.0001370006114961272, + "loss": 3.7216, + "step": 101650 + }, + { + "epoch": 6.906848756624541, + "grad_norm": 1.2761050462722778, + "learning_rate": 0.00013695814648729445, + "loss": 3.6199, + "step": 101655 + }, + { + "epoch": 6.907188476695203, + "grad_norm": 0.8234912753105164, + "learning_rate": 0.00013691568147846176, + "loss": 3.5191, + "step": 101660 + }, + { + "epoch": 6.907528196765865, + "grad_norm": 1.0390076637268066, + "learning_rate": 0.00013687321646962904, + "loss": 3.419, + "step": 101665 + }, + { + "epoch": 6.907867916836526, + "grad_norm": 0.8124832510948181, + "learning_rate": 0.0001368307514607963, + "loss": 3.4909, + "step": 101670 + }, + { + "epoch": 6.908207636907188, + "grad_norm": 0.9963427186012268, + "learning_rate": 0.0001367882864519636, + "loss": 3.3178, + "step": 101675 + }, + { + "epoch": 6.90854735697785, + "grad_norm": 1.0200154781341553, + "learning_rate": 0.00013674582144313085, + "loss": 3.5745, + "step": 101680 + }, + { + "epoch": 6.908887077048512, + "grad_norm": 1.0017101764678955, + "learning_rate": 0.00013670335643429816, + "loss": 3.4782, + "step": 101685 + }, + { + "epoch": 6.909226797119174, + "grad_norm": 0.8568342328071594, + "learning_rate": 0.00013666089142546544, + "loss": 3.2799, + "step": 101690 + }, + { + "epoch": 6.909566517189836, + "grad_norm": 0.7986143231391907, + "learning_rate": 0.0001366184264166327, + "loss": 3.1372, + "step": 101695 + }, + { + "epoch": 6.909906237260497, + "grad_norm": 0.9638754725456238, + "learning_rate": 0.0001365759614078, + "loss": 3.3091, + "step": 101700 + }, + { + "epoch": 6.910245957331159, + "grad_norm": 0.9278609156608582, + "learning_rate": 0.00013653349639896725, + "loss": 3.3208, + "step": 101705 + }, + { + "epoch": 6.910585677401821, + "grad_norm": 1.0245764255523682, + "learning_rate": 0.00013649103139013453, + "loss": 3.562, + "step": 101710 + }, + { + "epoch": 6.910925397472482, + "grad_norm": 0.7166638970375061, + "learning_rate": 0.0001364485663813018, + "loss": 3.5176, + "step": 101715 + }, + { + "epoch": 6.911265117543144, + "grad_norm": 0.7545394897460938, + "learning_rate": 0.0001364061013724691, + "loss": 3.3825, + "step": 101720 + }, + { + "epoch": 6.911604837613806, + "grad_norm": 0.9454861879348755, + "learning_rate": 0.00013636363636363637, + "loss": 3.5513, + "step": 101725 + }, + { + "epoch": 6.911944557684468, + "grad_norm": 0.8949702978134155, + "learning_rate": 0.00013632117135480365, + "loss": 3.4028, + "step": 101730 + }, + { + "epoch": 6.91228427775513, + "grad_norm": 0.7769314050674438, + "learning_rate": 0.00013627870634597093, + "loss": 3.5976, + "step": 101735 + }, + { + "epoch": 6.912623997825792, + "grad_norm": 1.130757212638855, + "learning_rate": 0.00013623624133713818, + "loss": 3.3475, + "step": 101740 + }, + { + "epoch": 6.912963717896453, + "grad_norm": 0.809402346611023, + "learning_rate": 0.0001361937763283055, + "loss": 3.4437, + "step": 101745 + }, + { + "epoch": 6.913303437967115, + "grad_norm": 1.0489157438278198, + "learning_rate": 0.00013615131131947274, + "loss": 3.2381, + "step": 101750 + }, + { + "epoch": 6.913643158037777, + "grad_norm": 0.9783424735069275, + "learning_rate": 0.00013610884631064002, + "loss": 3.4828, + "step": 101755 + }, + { + "epoch": 6.913982878108438, + "grad_norm": 1.1001558303833008, + "learning_rate": 0.00013606638130180733, + "loss": 3.3447, + "step": 101760 + }, + { + "epoch": 6.9143225981791, + "grad_norm": 0.9369451999664307, + "learning_rate": 0.00013602391629297458, + "loss": 3.4063, + "step": 101765 + }, + { + "epoch": 6.914662318249762, + "grad_norm": 0.744550883769989, + "learning_rate": 0.0001359814512841419, + "loss": 3.3557, + "step": 101770 + }, + { + "epoch": 6.915002038320424, + "grad_norm": 0.9673347473144531, + "learning_rate": 0.00013593898627530914, + "loss": 3.3152, + "step": 101775 + }, + { + "epoch": 6.915341758391086, + "grad_norm": 0.9447105526924133, + "learning_rate": 0.00013589652126647642, + "loss": 3.7255, + "step": 101780 + }, + { + "epoch": 6.915681478461748, + "grad_norm": 0.9968202710151672, + "learning_rate": 0.0001358540562576437, + "loss": 3.1588, + "step": 101785 + }, + { + "epoch": 6.916021198532409, + "grad_norm": 1.0124483108520508, + "learning_rate": 0.00013581159124881098, + "loss": 3.4815, + "step": 101790 + }, + { + "epoch": 6.916360918603071, + "grad_norm": 0.8896495699882507, + "learning_rate": 0.00013576912623997826, + "loss": 3.2584, + "step": 101795 + }, + { + "epoch": 6.916700638673733, + "grad_norm": 0.9369792938232422, + "learning_rate": 0.00013572666123114554, + "loss": 3.4085, + "step": 101800 + }, + { + "epoch": 6.917040358744394, + "grad_norm": 1.3567321300506592, + "learning_rate": 0.00013568419622231282, + "loss": 3.2105, + "step": 101805 + }, + { + "epoch": 6.917380078815056, + "grad_norm": 1.033433198928833, + "learning_rate": 0.00013564173121348008, + "loss": 3.5937, + "step": 101810 + }, + { + "epoch": 6.917719798885718, + "grad_norm": 0.8144243359565735, + "learning_rate": 0.00013559926620464738, + "loss": 3.3262, + "step": 101815 + }, + { + "epoch": 6.91805951895638, + "grad_norm": 0.8449638485908508, + "learning_rate": 0.00013555680119581464, + "loss": 3.5335, + "step": 101820 + }, + { + "epoch": 6.918399239027042, + "grad_norm": 0.8937896490097046, + "learning_rate": 0.00013551433618698192, + "loss": 3.4555, + "step": 101825 + }, + { + "epoch": 6.918738959097704, + "grad_norm": 0.7438507080078125, + "learning_rate": 0.00013547187117814922, + "loss": 3.4622, + "step": 101830 + }, + { + "epoch": 6.919078679168365, + "grad_norm": 1.0141891241073608, + "learning_rate": 0.00013542940616931648, + "loss": 3.3706, + "step": 101835 + }, + { + "epoch": 6.919418399239027, + "grad_norm": 0.7204291820526123, + "learning_rate": 0.00013538694116048376, + "loss": 3.45, + "step": 101840 + }, + { + "epoch": 6.919758119309689, + "grad_norm": 0.9937227368354797, + "learning_rate": 0.00013534447615165104, + "loss": 3.5231, + "step": 101845 + }, + { + "epoch": 6.92009783938035, + "grad_norm": 0.965023398399353, + "learning_rate": 0.00013530201114281832, + "loss": 3.3019, + "step": 101850 + }, + { + "epoch": 6.920437559451012, + "grad_norm": 0.8535993099212646, + "learning_rate": 0.00013525954613398562, + "loss": 3.631, + "step": 101855 + }, + { + "epoch": 6.920777279521674, + "grad_norm": 0.758928656578064, + "learning_rate": 0.00013521708112515288, + "loss": 3.3621, + "step": 101860 + }, + { + "epoch": 6.921116999592336, + "grad_norm": 0.9658979177474976, + "learning_rate": 0.00013517461611632016, + "loss": 3.4179, + "step": 101865 + }, + { + "epoch": 6.921456719662998, + "grad_norm": 0.7522683143615723, + "learning_rate": 0.00013513215110748744, + "loss": 3.3173, + "step": 101870 + }, + { + "epoch": 6.92179643973366, + "grad_norm": 0.9729161858558655, + "learning_rate": 0.00013508968609865472, + "loss": 3.2, + "step": 101875 + }, + { + "epoch": 6.922136159804321, + "grad_norm": 0.9346545934677124, + "learning_rate": 0.00013504722108982197, + "loss": 3.3301, + "step": 101880 + }, + { + "epoch": 6.922475879874983, + "grad_norm": 0.8761724233627319, + "learning_rate": 0.00013500475608098928, + "loss": 3.2279, + "step": 101885 + }, + { + "epoch": 6.922815599945645, + "grad_norm": 0.9154172539710999, + "learning_rate": 0.00013496229107215656, + "loss": 3.3716, + "step": 101890 + }, + { + "epoch": 6.923155320016306, + "grad_norm": 1.0609614849090576, + "learning_rate": 0.0001349198260633238, + "loss": 3.5875, + "step": 101895 + }, + { + "epoch": 6.923495040086968, + "grad_norm": 0.7676581144332886, + "learning_rate": 0.00013487736105449112, + "loss": 3.2403, + "step": 101900 + }, + { + "epoch": 6.92383476015763, + "grad_norm": 1.1316487789154053, + "learning_rate": 0.00013483489604565837, + "loss": 3.3028, + "step": 101905 + }, + { + "epoch": 6.924174480228292, + "grad_norm": 0.8912098407745361, + "learning_rate": 0.00013479243103682565, + "loss": 3.2848, + "step": 101910 + }, + { + "epoch": 6.924514200298954, + "grad_norm": 1.044866681098938, + "learning_rate": 0.00013474996602799293, + "loss": 3.2892, + "step": 101915 + }, + { + "epoch": 6.924853920369616, + "grad_norm": 1.0733081102371216, + "learning_rate": 0.0001347075010191602, + "loss": 3.5258, + "step": 101920 + }, + { + "epoch": 6.925193640440277, + "grad_norm": 1.068678379058838, + "learning_rate": 0.0001346650360103275, + "loss": 3.4723, + "step": 101925 + }, + { + "epoch": 6.925533360510939, + "grad_norm": 0.8167450428009033, + "learning_rate": 0.00013462257100149477, + "loss": 3.2226, + "step": 101930 + }, + { + "epoch": 6.925873080581601, + "grad_norm": 0.9636564254760742, + "learning_rate": 0.00013458010599266205, + "loss": 3.2777, + "step": 101935 + }, + { + "epoch": 6.926212800652262, + "grad_norm": 1.0492509603500366, + "learning_rate": 0.00013453764098382933, + "loss": 3.3526, + "step": 101940 + }, + { + "epoch": 6.926552520722924, + "grad_norm": 1.020589828491211, + "learning_rate": 0.0001344951759749966, + "loss": 3.3805, + "step": 101945 + }, + { + "epoch": 6.9268922407935865, + "grad_norm": 1.0626919269561768, + "learning_rate": 0.00013445271096616386, + "loss": 3.4659, + "step": 101950 + }, + { + "epoch": 6.927231960864248, + "grad_norm": 0.9594429135322571, + "learning_rate": 0.00013441024595733117, + "loss": 3.2597, + "step": 101955 + }, + { + "epoch": 6.92757168093491, + "grad_norm": 0.7763850688934326, + "learning_rate": 0.00013436778094849845, + "loss": 3.5713, + "step": 101960 + }, + { + "epoch": 6.927911401005572, + "grad_norm": 0.7813297510147095, + "learning_rate": 0.0001343253159396657, + "loss": 3.4897, + "step": 101965 + }, + { + "epoch": 6.928251121076233, + "grad_norm": 0.8046717643737793, + "learning_rate": 0.000134282850930833, + "loss": 3.6982, + "step": 101970 + }, + { + "epoch": 6.928590841146895, + "grad_norm": 0.819007396697998, + "learning_rate": 0.00013424038592200026, + "loss": 3.5786, + "step": 101975 + }, + { + "epoch": 6.928930561217557, + "grad_norm": 0.854299008846283, + "learning_rate": 0.00013419792091316754, + "loss": 3.3935, + "step": 101980 + }, + { + "epoch": 6.929270281288218, + "grad_norm": 0.9743382334709167, + "learning_rate": 0.00013415545590433485, + "loss": 3.3934, + "step": 101985 + }, + { + "epoch": 6.92961000135888, + "grad_norm": 1.2586052417755127, + "learning_rate": 0.0001341129908955021, + "loss": 3.281, + "step": 101990 + }, + { + "epoch": 6.9299497214295425, + "grad_norm": 0.980699896812439, + "learning_rate": 0.00013407052588666938, + "loss": 3.354, + "step": 101995 + }, + { + "epoch": 6.930289441500204, + "grad_norm": 0.8982512950897217, + "learning_rate": 0.00013402806087783666, + "loss": 3.6212, + "step": 102000 + }, + { + "epoch": 6.930629161570866, + "grad_norm": 1.002622365951538, + "learning_rate": 0.00013398559586900394, + "loss": 3.5075, + "step": 102005 + }, + { + "epoch": 6.930968881641528, + "grad_norm": 1.0574074983596802, + "learning_rate": 0.0001339431308601712, + "loss": 3.4601, + "step": 102010 + }, + { + "epoch": 6.931308601712189, + "grad_norm": 0.8902122974395752, + "learning_rate": 0.0001339006658513385, + "loss": 3.4164, + "step": 102015 + }, + { + "epoch": 6.931648321782851, + "grad_norm": 0.7388010621070862, + "learning_rate": 0.00013385820084250579, + "loss": 3.4387, + "step": 102020 + }, + { + "epoch": 6.931988041853513, + "grad_norm": 0.9660131335258484, + "learning_rate": 0.00013381573583367307, + "loss": 3.4071, + "step": 102025 + }, + { + "epoch": 6.932327761924174, + "grad_norm": 1.1422232389450073, + "learning_rate": 0.00013377327082484035, + "loss": 3.2064, + "step": 102030 + }, + { + "epoch": 6.932667481994836, + "grad_norm": 1.2569959163665771, + "learning_rate": 0.0001337308058160076, + "loss": 3.5616, + "step": 102035 + }, + { + "epoch": 6.9330072020654985, + "grad_norm": 0.7490068674087524, + "learning_rate": 0.0001336883408071749, + "loss": 3.6036, + "step": 102040 + }, + { + "epoch": 6.93334692213616, + "grad_norm": 0.9678553938865662, + "learning_rate": 0.00013364587579834216, + "loss": 3.3167, + "step": 102045 + }, + { + "epoch": 6.933686642206822, + "grad_norm": 1.3724262714385986, + "learning_rate": 0.00013360341078950944, + "loss": 3.4481, + "step": 102050 + }, + { + "epoch": 6.934026362277484, + "grad_norm": 1.1262446641921997, + "learning_rate": 0.00013356094578067675, + "loss": 3.4861, + "step": 102055 + }, + { + "epoch": 6.934366082348145, + "grad_norm": 1.1713731288909912, + "learning_rate": 0.000133518480771844, + "loss": 3.3313, + "step": 102060 + }, + { + "epoch": 6.934705802418807, + "grad_norm": 1.1278321743011475, + "learning_rate": 0.00013347601576301128, + "loss": 3.3702, + "step": 102065 + }, + { + "epoch": 6.935045522489469, + "grad_norm": 0.9783502221107483, + "learning_rate": 0.00013343355075417856, + "loss": 3.3397, + "step": 102070 + }, + { + "epoch": 6.93538524256013, + "grad_norm": 0.9296241402626038, + "learning_rate": 0.00013339108574534584, + "loss": 3.3885, + "step": 102075 + }, + { + "epoch": 6.935724962630792, + "grad_norm": 1.125178337097168, + "learning_rate": 0.0001333486207365131, + "loss": 3.3185, + "step": 102080 + }, + { + "epoch": 6.9360646827014545, + "grad_norm": 1.0808025598526, + "learning_rate": 0.0001333061557276804, + "loss": 3.4959, + "step": 102085 + }, + { + "epoch": 6.936404402772116, + "grad_norm": 1.0967048406600952, + "learning_rate": 0.00013326369071884768, + "loss": 3.4468, + "step": 102090 + }, + { + "epoch": 6.936744122842778, + "grad_norm": 0.9539487957954407, + "learning_rate": 0.00013322122571001493, + "loss": 3.2682, + "step": 102095 + }, + { + "epoch": 6.93708384291344, + "grad_norm": 1.0207056999206543, + "learning_rate": 0.00013317876070118224, + "loss": 3.0855, + "step": 102100 + }, + { + "epoch": 6.937423562984101, + "grad_norm": 0.8900434970855713, + "learning_rate": 0.0001331362956923495, + "loss": 3.2611, + "step": 102105 + }, + { + "epoch": 6.937763283054763, + "grad_norm": 0.93927401304245, + "learning_rate": 0.0001330938306835168, + "loss": 3.1942, + "step": 102110 + }, + { + "epoch": 6.938103003125424, + "grad_norm": 0.8895729184150696, + "learning_rate": 0.00013305136567468408, + "loss": 3.2159, + "step": 102115 + }, + { + "epoch": 6.938442723196086, + "grad_norm": 0.7885521650314331, + "learning_rate": 0.00013300890066585133, + "loss": 3.5357, + "step": 102120 + }, + { + "epoch": 6.938782443266748, + "grad_norm": 0.7752097845077515, + "learning_rate": 0.00013296643565701864, + "loss": 3.3885, + "step": 102125 + }, + { + "epoch": 6.93912216333741, + "grad_norm": 0.7895439267158508, + "learning_rate": 0.0001329239706481859, + "loss": 3.5794, + "step": 102130 + }, + { + "epoch": 6.939461883408072, + "grad_norm": 0.8720569610595703, + "learning_rate": 0.00013288150563935317, + "loss": 3.3677, + "step": 102135 + }, + { + "epoch": 6.939801603478734, + "grad_norm": 0.893625020980835, + "learning_rate": 0.00013283904063052045, + "loss": 3.4987, + "step": 102140 + }, + { + "epoch": 6.940141323549395, + "grad_norm": 1.1010740995407104, + "learning_rate": 0.00013279657562168773, + "loss": 3.4048, + "step": 102145 + }, + { + "epoch": 6.940481043620057, + "grad_norm": 0.7879207730293274, + "learning_rate": 0.000132754110612855, + "loss": 3.477, + "step": 102150 + }, + { + "epoch": 6.940820763690719, + "grad_norm": 0.9344617128372192, + "learning_rate": 0.0001327116456040223, + "loss": 3.4097, + "step": 102155 + }, + { + "epoch": 6.94116048376138, + "grad_norm": 0.9030152559280396, + "learning_rate": 0.00013266918059518957, + "loss": 3.2163, + "step": 102160 + }, + { + "epoch": 6.941500203832042, + "grad_norm": 0.897499680519104, + "learning_rate": 0.00013262671558635683, + "loss": 3.4205, + "step": 102165 + }, + { + "epoch": 6.941839923902704, + "grad_norm": 0.752012312412262, + "learning_rate": 0.00013258425057752413, + "loss": 3.2359, + "step": 102170 + }, + { + "epoch": 6.942179643973366, + "grad_norm": 1.0045627355575562, + "learning_rate": 0.00013254178556869139, + "loss": 3.5393, + "step": 102175 + }, + { + "epoch": 6.942519364044028, + "grad_norm": 0.7809278964996338, + "learning_rate": 0.00013249932055985867, + "loss": 3.2269, + "step": 102180 + }, + { + "epoch": 6.94285908411469, + "grad_norm": 1.0669599771499634, + "learning_rate": 0.00013245685555102597, + "loss": 3.5346, + "step": 102185 + }, + { + "epoch": 6.943198804185351, + "grad_norm": 0.8395273089408875, + "learning_rate": 0.00013241439054219323, + "loss": 3.3179, + "step": 102190 + }, + { + "epoch": 6.943538524256013, + "grad_norm": 0.8671204447746277, + "learning_rate": 0.00013237192553336053, + "loss": 3.4715, + "step": 102195 + }, + { + "epoch": 6.943878244326675, + "grad_norm": 1.1640512943267822, + "learning_rate": 0.0001323294605245278, + "loss": 3.4383, + "step": 102200 + }, + { + "epoch": 6.944217964397336, + "grad_norm": 0.7464726567268372, + "learning_rate": 0.00013228699551569507, + "loss": 3.5368, + "step": 102205 + }, + { + "epoch": 6.944557684467998, + "grad_norm": 1.0066461563110352, + "learning_rate": 0.00013224453050686235, + "loss": 3.2335, + "step": 102210 + }, + { + "epoch": 6.9448974045386604, + "grad_norm": 0.9477661848068237, + "learning_rate": 0.00013220206549802963, + "loss": 3.4541, + "step": 102215 + }, + { + "epoch": 6.945237124609322, + "grad_norm": 0.7732639312744141, + "learning_rate": 0.0001321596004891969, + "loss": 3.3861, + "step": 102220 + }, + { + "epoch": 6.945576844679984, + "grad_norm": 0.809258222579956, + "learning_rate": 0.0001321171354803642, + "loss": 3.5271, + "step": 102225 + }, + { + "epoch": 6.945916564750646, + "grad_norm": 0.8210856318473816, + "learning_rate": 0.00013207467047153147, + "loss": 3.5824, + "step": 102230 + }, + { + "epoch": 6.946256284821307, + "grad_norm": 0.8387512564659119, + "learning_rate": 0.00013203220546269872, + "loss": 3.4902, + "step": 102235 + }, + { + "epoch": 6.946596004891969, + "grad_norm": 0.9851078987121582, + "learning_rate": 0.00013198974045386603, + "loss": 3.5778, + "step": 102240 + }, + { + "epoch": 6.946935724962631, + "grad_norm": 1.2654155492782593, + "learning_rate": 0.00013194727544503328, + "loss": 3.2705, + "step": 102245 + }, + { + "epoch": 6.947275445033292, + "grad_norm": 0.9821637272834778, + "learning_rate": 0.00013190481043620056, + "loss": 3.6665, + "step": 102250 + }, + { + "epoch": 6.947615165103954, + "grad_norm": 0.8480929136276245, + "learning_rate": 0.00013186234542736787, + "loss": 3.581, + "step": 102255 + }, + { + "epoch": 6.9479548851746165, + "grad_norm": 0.8530550599098206, + "learning_rate": 0.00013181988041853512, + "loss": 3.4374, + "step": 102260 + }, + { + "epoch": 6.948294605245278, + "grad_norm": 0.9060875773429871, + "learning_rate": 0.0001317774154097024, + "loss": 3.4741, + "step": 102265 + }, + { + "epoch": 6.94863432531594, + "grad_norm": 1.0098378658294678, + "learning_rate": 0.00013173495040086968, + "loss": 3.2702, + "step": 102270 + }, + { + "epoch": 6.948974045386602, + "grad_norm": 1.043103575706482, + "learning_rate": 0.00013169248539203696, + "loss": 3.3366, + "step": 102275 + }, + { + "epoch": 6.949313765457263, + "grad_norm": 0.7007728815078735, + "learning_rate": 0.00013165002038320427, + "loss": 3.3939, + "step": 102280 + }, + { + "epoch": 6.949653485527925, + "grad_norm": 0.8544524908065796, + "learning_rate": 0.00013160755537437152, + "loss": 3.2628, + "step": 102285 + }, + { + "epoch": 6.949993205598587, + "grad_norm": 0.8837591409683228, + "learning_rate": 0.0001315650903655388, + "loss": 3.3441, + "step": 102290 + }, + { + "epoch": 6.950332925669248, + "grad_norm": 0.8941274881362915, + "learning_rate": 0.00013152262535670608, + "loss": 3.4093, + "step": 102295 + }, + { + "epoch": 6.95067264573991, + "grad_norm": 0.9659095406532288, + "learning_rate": 0.00013148016034787336, + "loss": 3.2753, + "step": 102300 + }, + { + "epoch": 6.9510123658105725, + "grad_norm": 0.8392763137817383, + "learning_rate": 0.00013143769533904061, + "loss": 3.5322, + "step": 102305 + }, + { + "epoch": 6.951352085881234, + "grad_norm": 0.9862014055252075, + "learning_rate": 0.00013139523033020792, + "loss": 3.5296, + "step": 102310 + }, + { + "epoch": 6.951691805951896, + "grad_norm": 1.00225031375885, + "learning_rate": 0.0001313527653213752, + "loss": 3.3087, + "step": 102315 + }, + { + "epoch": 6.952031526022557, + "grad_norm": 0.8040391802787781, + "learning_rate": 0.00013131030031254245, + "loss": 3.4837, + "step": 102320 + }, + { + "epoch": 6.952371246093219, + "grad_norm": 0.899168848991394, + "learning_rate": 0.00013126783530370976, + "loss": 3.3656, + "step": 102325 + }, + { + "epoch": 6.952710966163881, + "grad_norm": 0.8816724419593811, + "learning_rate": 0.00013122537029487701, + "loss": 3.5532, + "step": 102330 + }, + { + "epoch": 6.953050686234542, + "grad_norm": 1.1571314334869385, + "learning_rate": 0.0001311829052860443, + "loss": 3.3685, + "step": 102335 + }, + { + "epoch": 6.953390406305204, + "grad_norm": 1.017233967781067, + "learning_rate": 0.00013114044027721157, + "loss": 3.4096, + "step": 102340 + }, + { + "epoch": 6.953730126375866, + "grad_norm": 0.707598865032196, + "learning_rate": 0.00013109797526837885, + "loss": 3.7172, + "step": 102345 + }, + { + "epoch": 6.954069846446528, + "grad_norm": 0.8995300531387329, + "learning_rate": 0.00013105551025954613, + "loss": 3.7187, + "step": 102350 + }, + { + "epoch": 6.95440956651719, + "grad_norm": 0.8146638870239258, + "learning_rate": 0.00013101304525071341, + "loss": 3.3553, + "step": 102355 + }, + { + "epoch": 6.954749286587852, + "grad_norm": 0.8475928902626038, + "learning_rate": 0.0001309705802418807, + "loss": 3.1638, + "step": 102360 + }, + { + "epoch": 6.955089006658513, + "grad_norm": 1.1715526580810547, + "learning_rate": 0.00013092811523304797, + "loss": 3.5027, + "step": 102365 + }, + { + "epoch": 6.955428726729175, + "grad_norm": 0.7969035506248474, + "learning_rate": 0.00013088565022421525, + "loss": 3.5229, + "step": 102370 + }, + { + "epoch": 6.955768446799837, + "grad_norm": 1.1158770322799683, + "learning_rate": 0.0001308431852153825, + "loss": 3.4989, + "step": 102375 + }, + { + "epoch": 6.956108166870498, + "grad_norm": 1.0479294061660767, + "learning_rate": 0.00013080072020654982, + "loss": 3.5175, + "step": 102380 + }, + { + "epoch": 6.95644788694116, + "grad_norm": 3.4568865299224854, + "learning_rate": 0.0001307582551977171, + "loss": 3.4656, + "step": 102385 + }, + { + "epoch": 6.956787607011822, + "grad_norm": 0.830379843711853, + "learning_rate": 0.00013071579018888435, + "loss": 3.4718, + "step": 102390 + }, + { + "epoch": 6.957127327082484, + "grad_norm": 0.8327118158340454, + "learning_rate": 0.00013067332518005166, + "loss": 3.0171, + "step": 102395 + }, + { + "epoch": 6.957467047153146, + "grad_norm": 1.0082664489746094, + "learning_rate": 0.0001306308601712189, + "loss": 3.5038, + "step": 102400 + }, + { + "epoch": 6.957806767223808, + "grad_norm": 1.0330051183700562, + "learning_rate": 0.0001305883951623862, + "loss": 3.5968, + "step": 102405 + }, + { + "epoch": 6.958146487294469, + "grad_norm": 0.7879788875579834, + "learning_rate": 0.0001305459301535535, + "loss": 3.3811, + "step": 102410 + }, + { + "epoch": 6.958486207365131, + "grad_norm": 0.7551015615463257, + "learning_rate": 0.00013050346514472075, + "loss": 3.1285, + "step": 102415 + }, + { + "epoch": 6.958825927435793, + "grad_norm": 0.7993594408035278, + "learning_rate": 0.00013046100013588803, + "loss": 3.3362, + "step": 102420 + }, + { + "epoch": 6.959165647506454, + "grad_norm": 0.8568546175956726, + "learning_rate": 0.0001304185351270553, + "loss": 3.4916, + "step": 102425 + }, + { + "epoch": 6.959505367577116, + "grad_norm": 0.9519447088241577, + "learning_rate": 0.0001303760701182226, + "loss": 3.4851, + "step": 102430 + }, + { + "epoch": 6.959845087647778, + "grad_norm": 1.021573543548584, + "learning_rate": 0.00013033360510938984, + "loss": 3.494, + "step": 102435 + }, + { + "epoch": 6.96018480771844, + "grad_norm": 0.910977840423584, + "learning_rate": 0.00013029114010055715, + "loss": 3.4864, + "step": 102440 + }, + { + "epoch": 6.960524527789102, + "grad_norm": 0.8750789761543274, + "learning_rate": 0.00013024867509172443, + "loss": 3.5076, + "step": 102445 + }, + { + "epoch": 6.960864247859764, + "grad_norm": 1.852509617805481, + "learning_rate": 0.0001302062100828917, + "loss": 3.2381, + "step": 102450 + }, + { + "epoch": 6.961203967930425, + "grad_norm": 1.0032416582107544, + "learning_rate": 0.000130163745074059, + "loss": 3.184, + "step": 102455 + }, + { + "epoch": 6.961543688001087, + "grad_norm": 0.7986338138580322, + "learning_rate": 0.00013012128006522624, + "loss": 3.5692, + "step": 102460 + }, + { + "epoch": 6.961883408071749, + "grad_norm": 0.8209830522537231, + "learning_rate": 0.00013007881505639355, + "loss": 3.475, + "step": 102465 + }, + { + "epoch": 6.96222312814241, + "grad_norm": 0.7160298228263855, + "learning_rate": 0.0001300363500475608, + "loss": 3.0609, + "step": 102470 + }, + { + "epoch": 6.962562848213072, + "grad_norm": 0.7791823148727417, + "learning_rate": 0.00012999388503872808, + "loss": 3.0785, + "step": 102475 + }, + { + "epoch": 6.962902568283734, + "grad_norm": 1.007145881652832, + "learning_rate": 0.0001299514200298954, + "loss": 3.248, + "step": 102480 + }, + { + "epoch": 6.963242288354396, + "grad_norm": 0.7008261680603027, + "learning_rate": 0.00012990895502106264, + "loss": 3.5449, + "step": 102485 + }, + { + "epoch": 6.963582008425058, + "grad_norm": 1.0033284425735474, + "learning_rate": 0.00012986649001222992, + "loss": 3.5022, + "step": 102490 + }, + { + "epoch": 6.96392172849572, + "grad_norm": 0.8580614924430847, + "learning_rate": 0.0001298240250033972, + "loss": 3.5858, + "step": 102495 + }, + { + "epoch": 6.964261448566381, + "grad_norm": 1.0260552167892456, + "learning_rate": 0.00012978155999456448, + "loss": 3.2434, + "step": 102500 + }, + { + "epoch": 6.964601168637043, + "grad_norm": 1.0337709188461304, + "learning_rate": 0.00012973909498573174, + "loss": 3.4596, + "step": 102505 + }, + { + "epoch": 6.964940888707705, + "grad_norm": 0.6910926699638367, + "learning_rate": 0.00012969662997689904, + "loss": 3.2782, + "step": 102510 + }, + { + "epoch": 6.965280608778366, + "grad_norm": 0.9006554484367371, + "learning_rate": 0.00012965416496806632, + "loss": 3.4819, + "step": 102515 + }, + { + "epoch": 6.965620328849028, + "grad_norm": 0.7217125296592712, + "learning_rate": 0.00012961169995923358, + "loss": 3.3097, + "step": 102520 + }, + { + "epoch": 6.9659600489196905, + "grad_norm": 1.0307234525680542, + "learning_rate": 0.00012956923495040088, + "loss": 3.5249, + "step": 102525 + }, + { + "epoch": 6.966299768990352, + "grad_norm": 0.8895589113235474, + "learning_rate": 0.00012952676994156814, + "loss": 3.3273, + "step": 102530 + }, + { + "epoch": 6.966639489061014, + "grad_norm": 0.9642895460128784, + "learning_rate": 0.00012948430493273544, + "loss": 3.5377, + "step": 102535 + }, + { + "epoch": 6.966979209131676, + "grad_norm": 0.9881765842437744, + "learning_rate": 0.00012944183992390272, + "loss": 3.787, + "step": 102540 + }, + { + "epoch": 6.967318929202337, + "grad_norm": 0.9436110854148865, + "learning_rate": 0.00012939937491506998, + "loss": 3.4603, + "step": 102545 + }, + { + "epoch": 6.967658649272999, + "grad_norm": 1.0582362413406372, + "learning_rate": 0.00012935690990623728, + "loss": 3.4626, + "step": 102550 + }, + { + "epoch": 6.967998369343661, + "grad_norm": 0.8246753811836243, + "learning_rate": 0.00012931444489740454, + "loss": 3.4864, + "step": 102555 + }, + { + "epoch": 6.968338089414322, + "grad_norm": 1.0772790908813477, + "learning_rate": 0.00012927197988857182, + "loss": 3.5399, + "step": 102560 + }, + { + "epoch": 6.968677809484984, + "grad_norm": 0.848492443561554, + "learning_rate": 0.0001292295148797391, + "loss": 3.49, + "step": 102565 + }, + { + "epoch": 6.9690175295556465, + "grad_norm": 1.088506817817688, + "learning_rate": 0.00012918704987090638, + "loss": 3.297, + "step": 102570 + }, + { + "epoch": 6.969357249626308, + "grad_norm": 1.1164307594299316, + "learning_rate": 0.00012914458486207366, + "loss": 3.3827, + "step": 102575 + }, + { + "epoch": 6.96969696969697, + "grad_norm": 0.967898428440094, + "learning_rate": 0.00012910211985324094, + "loss": 3.583, + "step": 102580 + }, + { + "epoch": 6.970036689767632, + "grad_norm": 0.9936785697937012, + "learning_rate": 0.00012905965484440822, + "loss": 3.5221, + "step": 102585 + }, + { + "epoch": 6.970376409838293, + "grad_norm": 0.9138872623443604, + "learning_rate": 0.00012901718983557547, + "loss": 3.1026, + "step": 102590 + }, + { + "epoch": 6.970716129908955, + "grad_norm": 0.8333055377006531, + "learning_rate": 0.00012897472482674278, + "loss": 3.1672, + "step": 102595 + }, + { + "epoch": 6.971055849979617, + "grad_norm": 1.099650502204895, + "learning_rate": 0.00012893225981791003, + "loss": 3.358, + "step": 102600 + }, + { + "epoch": 6.971395570050278, + "grad_norm": 1.0702528953552246, + "learning_rate": 0.0001288897948090773, + "loss": 3.57, + "step": 102605 + }, + { + "epoch": 6.97173529012094, + "grad_norm": 0.9989902377128601, + "learning_rate": 0.00012884732980024462, + "loss": 3.1306, + "step": 102610 + }, + { + "epoch": 6.9720750101916025, + "grad_norm": 0.905956506729126, + "learning_rate": 0.00012880486479141187, + "loss": 3.2142, + "step": 102615 + }, + { + "epoch": 6.972414730262264, + "grad_norm": 0.8885861039161682, + "learning_rate": 0.00012876239978257918, + "loss": 3.6637, + "step": 102620 + }, + { + "epoch": 6.972754450332926, + "grad_norm": 0.864136815071106, + "learning_rate": 0.00012871993477374643, + "loss": 3.3735, + "step": 102625 + }, + { + "epoch": 6.973094170403588, + "grad_norm": 0.9632852673530579, + "learning_rate": 0.0001286774697649137, + "loss": 3.4906, + "step": 102630 + }, + { + "epoch": 6.973433890474249, + "grad_norm": 1.0074524879455566, + "learning_rate": 0.000128635004756081, + "loss": 3.2698, + "step": 102635 + }, + { + "epoch": 6.973773610544911, + "grad_norm": 0.9106332063674927, + "learning_rate": 0.00012859253974724827, + "loss": 3.4535, + "step": 102640 + }, + { + "epoch": 6.974113330615573, + "grad_norm": 0.964585542678833, + "learning_rate": 0.00012855007473841555, + "loss": 3.3845, + "step": 102645 + }, + { + "epoch": 6.974453050686234, + "grad_norm": 0.9156357049942017, + "learning_rate": 0.00012850760972958283, + "loss": 3.2727, + "step": 102650 + }, + { + "epoch": 6.974792770756896, + "grad_norm": 1.192270040512085, + "learning_rate": 0.0001284651447207501, + "loss": 3.4565, + "step": 102655 + }, + { + "epoch": 6.9751324908275585, + "grad_norm": 1.1999056339263916, + "learning_rate": 0.00012842267971191736, + "loss": 2.9372, + "step": 102660 + }, + { + "epoch": 6.97547221089822, + "grad_norm": 0.742716372013092, + "learning_rate": 0.00012838021470308467, + "loss": 3.3113, + "step": 102665 + }, + { + "epoch": 6.975811930968882, + "grad_norm": 0.7429617643356323, + "learning_rate": 0.00012833774969425192, + "loss": 3.3855, + "step": 102670 + }, + { + "epoch": 6.976151651039544, + "grad_norm": 1.028943419456482, + "learning_rate": 0.0001282952846854192, + "loss": 3.379, + "step": 102675 + }, + { + "epoch": 6.976491371110205, + "grad_norm": 0.9542559385299683, + "learning_rate": 0.0001282528196765865, + "loss": 3.2855, + "step": 102680 + }, + { + "epoch": 6.976831091180867, + "grad_norm": 0.8617198467254639, + "learning_rate": 0.00012821035466775376, + "loss": 3.4735, + "step": 102685 + }, + { + "epoch": 6.977170811251529, + "grad_norm": 0.9068993330001831, + "learning_rate": 0.00012816788965892104, + "loss": 3.1781, + "step": 102690 + }, + { + "epoch": 6.97751053132219, + "grad_norm": 0.9208261966705322, + "learning_rate": 0.00012812542465008832, + "loss": 3.4286, + "step": 102695 + }, + { + "epoch": 6.977850251392852, + "grad_norm": 0.902022659778595, + "learning_rate": 0.0001280829596412556, + "loss": 3.3364, + "step": 102700 + }, + { + "epoch": 6.9781899714635145, + "grad_norm": 0.8053880929946899, + "learning_rate": 0.0001280404946324229, + "loss": 3.4722, + "step": 102705 + }, + { + "epoch": 6.978529691534176, + "grad_norm": 1.168947696685791, + "learning_rate": 0.00012799802962359016, + "loss": 3.2711, + "step": 102710 + }, + { + "epoch": 6.978869411604838, + "grad_norm": 1.064668893814087, + "learning_rate": 0.00012795556461475744, + "loss": 3.3192, + "step": 102715 + }, + { + "epoch": 6.9792091316755, + "grad_norm": 0.9961381554603577, + "learning_rate": 0.00012791309960592472, + "loss": 3.2475, + "step": 102720 + }, + { + "epoch": 6.979548851746161, + "grad_norm": 1.8191231489181519, + "learning_rate": 0.000127870634597092, + "loss": 3.393, + "step": 102725 + }, + { + "epoch": 6.979888571816823, + "grad_norm": 0.685160756111145, + "learning_rate": 0.00012782816958825926, + "loss": 3.478, + "step": 102730 + }, + { + "epoch": 6.980228291887485, + "grad_norm": 0.8706306219100952, + "learning_rate": 0.00012778570457942657, + "loss": 3.4035, + "step": 102735 + }, + { + "epoch": 6.980568011958146, + "grad_norm": 1.0184518098831177, + "learning_rate": 0.00012774323957059385, + "loss": 3.3262, + "step": 102740 + }, + { + "epoch": 6.980907732028808, + "grad_norm": 0.9392253160476685, + "learning_rate": 0.0001277007745617611, + "loss": 3.5872, + "step": 102745 + }, + { + "epoch": 6.9812474520994705, + "grad_norm": 0.9201531410217285, + "learning_rate": 0.0001276583095529284, + "loss": 3.4561, + "step": 102750 + }, + { + "epoch": 6.981587172170132, + "grad_norm": 0.934482753276825, + "learning_rate": 0.00012761584454409566, + "loss": 3.5641, + "step": 102755 + }, + { + "epoch": 6.981926892240794, + "grad_norm": 0.9570853114128113, + "learning_rate": 0.00012757337953526294, + "loss": 3.2201, + "step": 102760 + }, + { + "epoch": 6.982266612311456, + "grad_norm": 1.0656483173370361, + "learning_rate": 0.00012753091452643022, + "loss": 3.4206, + "step": 102765 + }, + { + "epoch": 6.982606332382117, + "grad_norm": 0.8148189783096313, + "learning_rate": 0.0001274884495175975, + "loss": 3.4978, + "step": 102770 + }, + { + "epoch": 6.982946052452779, + "grad_norm": 1.0373625755310059, + "learning_rate": 0.00012744598450876478, + "loss": 3.38, + "step": 102775 + }, + { + "epoch": 6.983285772523441, + "grad_norm": 0.9783996939659119, + "learning_rate": 0.00012740351949993206, + "loss": 3.3764, + "step": 102780 + }, + { + "epoch": 6.983625492594102, + "grad_norm": 0.6255814433097839, + "learning_rate": 0.00012736105449109934, + "loss": 3.3713, + "step": 102785 + }, + { + "epoch": 6.983965212664764, + "grad_norm": 0.9827499985694885, + "learning_rate": 0.00012731858948226662, + "loss": 3.1967, + "step": 102790 + }, + { + "epoch": 6.984304932735426, + "grad_norm": 0.9475609660148621, + "learning_rate": 0.0001272761244734339, + "loss": 3.4631, + "step": 102795 + }, + { + "epoch": 6.984644652806088, + "grad_norm": 0.7990660667419434, + "learning_rate": 0.00012723365946460115, + "loss": 3.5147, + "step": 102800 + }, + { + "epoch": 6.98498437287675, + "grad_norm": 0.8953558802604675, + "learning_rate": 0.00012719119445576846, + "loss": 3.447, + "step": 102805 + }, + { + "epoch": 6.985324092947411, + "grad_norm": 0.6906647682189941, + "learning_rate": 0.00012714872944693574, + "loss": 3.5912, + "step": 102810 + }, + { + "epoch": 6.985663813018073, + "grad_norm": 0.9048526883125305, + "learning_rate": 0.000127106264438103, + "loss": 3.4445, + "step": 102815 + }, + { + "epoch": 6.986003533088735, + "grad_norm": 0.8636841177940369, + "learning_rate": 0.0001270637994292703, + "loss": 3.4337, + "step": 102820 + }, + { + "epoch": 6.986343253159396, + "grad_norm": 0.9227892756462097, + "learning_rate": 0.00012702133442043755, + "loss": 3.4754, + "step": 102825 + }, + { + "epoch": 6.986682973230058, + "grad_norm": 1.03024423122406, + "learning_rate": 0.00012697886941160483, + "loss": 3.6044, + "step": 102830 + }, + { + "epoch": 6.9870226933007205, + "grad_norm": 1.0649397373199463, + "learning_rate": 0.00012693640440277214, + "loss": 3.4388, + "step": 102835 + }, + { + "epoch": 6.987362413371382, + "grad_norm": 1.1352953910827637, + "learning_rate": 0.0001268939393939394, + "loss": 3.4933, + "step": 102840 + }, + { + "epoch": 6.987702133442044, + "grad_norm": 0.9809253811836243, + "learning_rate": 0.00012685147438510667, + "loss": 3.4604, + "step": 102845 + }, + { + "epoch": 6.988041853512706, + "grad_norm": 0.7057732343673706, + "learning_rate": 0.00012680900937627395, + "loss": 3.201, + "step": 102850 + }, + { + "epoch": 6.988381573583367, + "grad_norm": 1.0814056396484375, + "learning_rate": 0.00012676654436744123, + "loss": 3.0497, + "step": 102855 + }, + { + "epoch": 6.988721293654029, + "grad_norm": 1.0254279375076294, + "learning_rate": 0.00012672407935860849, + "loss": 3.4374, + "step": 102860 + }, + { + "epoch": 6.989061013724691, + "grad_norm": 0.8107612133026123, + "learning_rate": 0.0001266816143497758, + "loss": 3.2367, + "step": 102865 + }, + { + "epoch": 6.989400733795352, + "grad_norm": 0.9246805906295776, + "learning_rate": 0.00012663914934094307, + "loss": 3.3233, + "step": 102870 + }, + { + "epoch": 6.989740453866014, + "grad_norm": 1.0539519786834717, + "learning_rate": 0.00012659668433211035, + "loss": 3.1801, + "step": 102875 + }, + { + "epoch": 6.9900801739366765, + "grad_norm": 0.6385786533355713, + "learning_rate": 0.00012655421932327763, + "loss": 3.4621, + "step": 102880 + }, + { + "epoch": 6.990419894007338, + "grad_norm": 1.002603530883789, + "learning_rate": 0.00012651175431444489, + "loss": 3.4752, + "step": 102885 + }, + { + "epoch": 6.990759614078, + "grad_norm": 0.754588782787323, + "learning_rate": 0.0001264692893056122, + "loss": 3.2592, + "step": 102890 + }, + { + "epoch": 6.991099334148662, + "grad_norm": 0.9514551162719727, + "learning_rate": 0.00012642682429677945, + "loss": 3.2049, + "step": 102895 + }, + { + "epoch": 6.991439054219323, + "grad_norm": 1.1115648746490479, + "learning_rate": 0.00012638435928794673, + "loss": 3.4671, + "step": 102900 + }, + { + "epoch": 6.991778774289985, + "grad_norm": 1.0391383171081543, + "learning_rate": 0.00012634189427911403, + "loss": 3.2937, + "step": 102905 + }, + { + "epoch": 6.992118494360647, + "grad_norm": 0.9697072505950928, + "learning_rate": 0.0001262994292702813, + "loss": 3.7651, + "step": 102910 + }, + { + "epoch": 6.992458214431308, + "grad_norm": 1.0919603109359741, + "learning_rate": 0.00012625696426144857, + "loss": 3.637, + "step": 102915 + }, + { + "epoch": 6.99279793450197, + "grad_norm": 0.9303723573684692, + "learning_rate": 0.00012621449925261585, + "loss": 3.4518, + "step": 102920 + }, + { + "epoch": 6.9931376545726325, + "grad_norm": 1.3677812814712524, + "learning_rate": 0.00012617203424378313, + "loss": 3.6038, + "step": 102925 + }, + { + "epoch": 6.993477374643294, + "grad_norm": 0.8555642366409302, + "learning_rate": 0.00012612956923495038, + "loss": 3.6696, + "step": 102930 + }, + { + "epoch": 6.993817094713956, + "grad_norm": 0.8358707427978516, + "learning_rate": 0.0001260871042261177, + "loss": 3.4416, + "step": 102935 + }, + { + "epoch": 6.994156814784618, + "grad_norm": 0.8988481163978577, + "learning_rate": 0.00012604463921728497, + "loss": 3.3804, + "step": 102940 + }, + { + "epoch": 6.994496534855279, + "grad_norm": 1.002144694328308, + "learning_rate": 0.00012600217420845222, + "loss": 3.583, + "step": 102945 + }, + { + "epoch": 6.994836254925941, + "grad_norm": 3.0091536045074463, + "learning_rate": 0.00012595970919961953, + "loss": 3.3873, + "step": 102950 + }, + { + "epoch": 6.995175974996603, + "grad_norm": 0.7576066255569458, + "learning_rate": 0.00012591724419078678, + "loss": 3.3722, + "step": 102955 + }, + { + "epoch": 6.995515695067264, + "grad_norm": 0.874691903591156, + "learning_rate": 0.0001258747791819541, + "loss": 3.5099, + "step": 102960 + }, + { + "epoch": 6.995855415137926, + "grad_norm": 0.8107845187187195, + "learning_rate": 0.00012583231417312137, + "loss": 3.3109, + "step": 102965 + }, + { + "epoch": 6.9961951352085885, + "grad_norm": 0.830906093120575, + "learning_rate": 0.00012578984916428862, + "loss": 3.5246, + "step": 102970 + }, + { + "epoch": 6.99653485527925, + "grad_norm": 0.8777279257774353, + "learning_rate": 0.00012574738415545593, + "loss": 3.5342, + "step": 102975 + }, + { + "epoch": 6.996874575349912, + "grad_norm": 0.8058253526687622, + "learning_rate": 0.00012570491914662318, + "loss": 3.577, + "step": 102980 + }, + { + "epoch": 6.997214295420574, + "grad_norm": 1.0206857919692993, + "learning_rate": 0.00012566245413779046, + "loss": 3.5982, + "step": 102985 + }, + { + "epoch": 6.997554015491235, + "grad_norm": 0.785498321056366, + "learning_rate": 0.00012561998912895774, + "loss": 3.4333, + "step": 102990 + }, + { + "epoch": 6.997893735561897, + "grad_norm": 0.9486989974975586, + "learning_rate": 0.00012557752412012502, + "loss": 3.3731, + "step": 102995 + }, + { + "epoch": 6.998233455632558, + "grad_norm": 1.162947177886963, + "learning_rate": 0.0001255350591112923, + "loss": 3.5173, + "step": 103000 + }, + { + "epoch": 6.99857317570322, + "grad_norm": 1.0782839059829712, + "learning_rate": 0.00012549259410245958, + "loss": 3.3926, + "step": 103005 + }, + { + "epoch": 6.998912895773882, + "grad_norm": 1.0429353713989258, + "learning_rate": 0.00012545012909362686, + "loss": 3.295, + "step": 103010 + }, + { + "epoch": 6.999252615844544, + "grad_norm": 2.5663046836853027, + "learning_rate": 0.00012540766408479411, + "loss": 3.4912, + "step": 103015 + }, + { + "epoch": 6.999592335915206, + "grad_norm": 0.8547271490097046, + "learning_rate": 0.00012536519907596142, + "loss": 3.5293, + "step": 103020 + }, + { + "epoch": 6.999932055985868, + "grad_norm": 0.9447717070579529, + "learning_rate": 0.00012532273406712867, + "loss": 3.2238, + "step": 103025 + }, + { + "epoch": 7.0, + "eval_bertscore": { + "f1": 0.8440153400372807, + "precision": 0.8494711717046824, + "recall": 0.8393967118659723 + }, + "eval_bleu_4": 0.015639508986722127, + "eval_exact_match": 0.0005814516910553348, + "eval_loss": 3.388423204421997, + "eval_meteor": 0.088602209343675, + "eval_rouge": { + "rouge1": 0.12826986894804754, + "rouge2": 0.019076561631196358, + "rougeL": 0.11114144387436872, + "rougeLsum": 0.11117711203673963 + }, + "eval_runtime": 1382.5893, + "eval_samples_per_second": 7.464, + "eval_steps_per_second": 0.933, + "step": 103026 + }, + { + "epoch": 7.00027177605653, + "grad_norm": 0.7759246230125427, + "learning_rate": 0.00012528026905829595, + "loss": 3.4051, + "step": 103030 + }, + { + "epoch": 7.000611496127191, + "grad_norm": 1.4343584775924683, + "learning_rate": 0.00012523780404946326, + "loss": 3.3993, + "step": 103035 + }, + { + "epoch": 7.000951216197853, + "grad_norm": 0.993942379951477, + "learning_rate": 0.00012519533904063051, + "loss": 3.3805, + "step": 103040 + }, + { + "epoch": 7.001290936268515, + "grad_norm": 1.018912672996521, + "learning_rate": 0.00012515287403179782, + "loss": 3.4084, + "step": 103045 + }, + { + "epoch": 7.001630656339176, + "grad_norm": 0.9391670823097229, + "learning_rate": 0.00012511040902296507, + "loss": 3.4214, + "step": 103050 + }, + { + "epoch": 7.001970376409838, + "grad_norm": 1.004220962524414, + "learning_rate": 0.00012506794401413235, + "loss": 3.4688, + "step": 103055 + }, + { + "epoch": 7.0023100964805, + "grad_norm": 1.065028190612793, + "learning_rate": 0.00012502547900529963, + "loss": 3.4085, + "step": 103060 + }, + { + "epoch": 7.002649816551162, + "grad_norm": 0.9808433055877686, + "learning_rate": 0.00012498301399646691, + "loss": 3.5356, + "step": 103065 + }, + { + "epoch": 7.002989536621824, + "grad_norm": 1.026971459388733, + "learning_rate": 0.0001249405489876342, + "loss": 3.1777, + "step": 103070 + }, + { + "epoch": 7.003329256692485, + "grad_norm": 0.9549520015716553, + "learning_rate": 0.00012489808397880147, + "loss": 3.2531, + "step": 103075 + }, + { + "epoch": 7.003668976763147, + "grad_norm": 1.0053322315216064, + "learning_rate": 0.00012485561896996875, + "loss": 3.0856, + "step": 103080 + }, + { + "epoch": 7.004008696833809, + "grad_norm": 0.9951996207237244, + "learning_rate": 0.00012481315396113603, + "loss": 3.5602, + "step": 103085 + }, + { + "epoch": 7.00434841690447, + "grad_norm": 1.2788749933242798, + "learning_rate": 0.0001247706889523033, + "loss": 3.4129, + "step": 103090 + }, + { + "epoch": 7.004688136975132, + "grad_norm": 0.8180457353591919, + "learning_rate": 0.00012472822394347057, + "loss": 3.0867, + "step": 103095 + }, + { + "epoch": 7.0050278570457944, + "grad_norm": 1.2722073793411255, + "learning_rate": 0.00012468575893463788, + "loss": 3.4666, + "step": 103100 + }, + { + "epoch": 7.005367577116456, + "grad_norm": 0.8063511252403259, + "learning_rate": 0.00012464329392580516, + "loss": 3.3839, + "step": 103105 + }, + { + "epoch": 7.005707297187118, + "grad_norm": 1.2188656330108643, + "learning_rate": 0.0001246008289169724, + "loss": 3.3243, + "step": 103110 + }, + { + "epoch": 7.00604701725778, + "grad_norm": 1.157443881034851, + "learning_rate": 0.0001245583639081397, + "loss": 3.4903, + "step": 103115 + }, + { + "epoch": 7.006386737328441, + "grad_norm": 1.1380871534347534, + "learning_rate": 0.00012451589889930697, + "loss": 3.306, + "step": 103120 + }, + { + "epoch": 7.006726457399103, + "grad_norm": 0.8036501407623291, + "learning_rate": 0.00012447343389047425, + "loss": 3.2287, + "step": 103125 + }, + { + "epoch": 7.007066177469765, + "grad_norm": 1.1250271797180176, + "learning_rate": 0.00012443096888164153, + "loss": 3.5258, + "step": 103130 + }, + { + "epoch": 7.007405897540426, + "grad_norm": 0.8113721013069153, + "learning_rate": 0.0001243885038728088, + "loss": 3.3703, + "step": 103135 + }, + { + "epoch": 7.007745617611088, + "grad_norm": 0.8709874153137207, + "learning_rate": 0.0001243460388639761, + "loss": 3.335, + "step": 103140 + }, + { + "epoch": 7.0080853376817505, + "grad_norm": 1.575575351715088, + "learning_rate": 0.00012430357385514337, + "loss": 3.4549, + "step": 103145 + }, + { + "epoch": 7.008425057752412, + "grad_norm": 0.8987162709236145, + "learning_rate": 0.00012426110884631065, + "loss": 3.2957, + "step": 103150 + }, + { + "epoch": 7.008764777823074, + "grad_norm": 0.7744829654693604, + "learning_rate": 0.00012421864383747793, + "loss": 3.4124, + "step": 103155 + }, + { + "epoch": 7.009104497893736, + "grad_norm": 1.1189078092575073, + "learning_rate": 0.00012417617882864518, + "loss": 3.3465, + "step": 103160 + }, + { + "epoch": 7.009444217964397, + "grad_norm": 0.7199158668518066, + "learning_rate": 0.0001241337138198125, + "loss": 3.3789, + "step": 103165 + }, + { + "epoch": 7.009783938035059, + "grad_norm": 0.838915228843689, + "learning_rate": 0.00012409124881097977, + "loss": 3.4678, + "step": 103170 + }, + { + "epoch": 7.010123658105721, + "grad_norm": 0.7981073260307312, + "learning_rate": 0.00012404878380214702, + "loss": 2.9858, + "step": 103175 + }, + { + "epoch": 7.010463378176382, + "grad_norm": 1.2254197597503662, + "learning_rate": 0.0001240063187933143, + "loss": 3.5885, + "step": 103180 + }, + { + "epoch": 7.010803098247044, + "grad_norm": 0.9203154444694519, + "learning_rate": 0.00012396385378448158, + "loss": 3.2966, + "step": 103185 + }, + { + "epoch": 7.0111428183177065, + "grad_norm": 1.0959376096725464, + "learning_rate": 0.00012392138877564886, + "loss": 3.4774, + "step": 103190 + }, + { + "epoch": 7.011482538388368, + "grad_norm": 0.8107547163963318, + "learning_rate": 0.00012387892376681614, + "loss": 3.3467, + "step": 103195 + }, + { + "epoch": 7.01182225845903, + "grad_norm": 1.111763834953308, + "learning_rate": 0.00012383645875798342, + "loss": 3.5672, + "step": 103200 + }, + { + "epoch": 7.012161978529692, + "grad_norm": 3.0614633560180664, + "learning_rate": 0.0001237939937491507, + "loss": 3.4699, + "step": 103205 + }, + { + "epoch": 7.012501698600353, + "grad_norm": 0.7558282017707825, + "learning_rate": 0.00012375152874031798, + "loss": 3.3427, + "step": 103210 + }, + { + "epoch": 7.012841418671015, + "grad_norm": 0.8599579334259033, + "learning_rate": 0.00012370906373148526, + "loss": 3.4506, + "step": 103215 + }, + { + "epoch": 7.013181138741677, + "grad_norm": 0.8602533936500549, + "learning_rate": 0.00012366659872265254, + "loss": 3.1647, + "step": 103220 + }, + { + "epoch": 7.013520858812338, + "grad_norm": 0.7207866907119751, + "learning_rate": 0.0001236241337138198, + "loss": 3.4523, + "step": 103225 + }, + { + "epoch": 7.013860578883, + "grad_norm": 1.0116716623306274, + "learning_rate": 0.0001235816687049871, + "loss": 3.3119, + "step": 103230 + }, + { + "epoch": 7.0142002989536625, + "grad_norm": 0.9430643916130066, + "learning_rate": 0.00012353920369615438, + "loss": 3.3976, + "step": 103235 + }, + { + "epoch": 7.014540019024324, + "grad_norm": 0.8497433066368103, + "learning_rate": 0.00012349673868732166, + "loss": 3.3666, + "step": 103240 + }, + { + "epoch": 7.014879739094986, + "grad_norm": 1.263410210609436, + "learning_rate": 0.00012345427367848892, + "loss": 3.4572, + "step": 103245 + }, + { + "epoch": 7.015219459165648, + "grad_norm": 0.8900768160820007, + "learning_rate": 0.0001234118086696562, + "loss": 3.3421, + "step": 103250 + }, + { + "epoch": 7.015559179236309, + "grad_norm": 1.0139445066452026, + "learning_rate": 0.00012336934366082348, + "loss": 3.3844, + "step": 103255 + }, + { + "epoch": 7.015898899306971, + "grad_norm": 0.8417357802391052, + "learning_rate": 0.00012332687865199076, + "loss": 3.653, + "step": 103260 + }, + { + "epoch": 7.016238619377633, + "grad_norm": 0.9816830158233643, + "learning_rate": 0.00012328441364315804, + "loss": 3.4372, + "step": 103265 + }, + { + "epoch": 7.016578339448294, + "grad_norm": 0.9039369225502014, + "learning_rate": 0.00012324194863432532, + "loss": 3.369, + "step": 103270 + }, + { + "epoch": 7.016918059518956, + "grad_norm": 0.9708408117294312, + "learning_rate": 0.0001231994836254926, + "loss": 3.2978, + "step": 103275 + }, + { + "epoch": 7.0172577795896185, + "grad_norm": 1.483079195022583, + "learning_rate": 0.00012315701861665988, + "loss": 3.3466, + "step": 103280 + }, + { + "epoch": 7.01759749966028, + "grad_norm": 0.9622177481651306, + "learning_rate": 0.00012311455360782716, + "loss": 3.2743, + "step": 103285 + }, + { + "epoch": 7.017937219730942, + "grad_norm": 0.9764963388442993, + "learning_rate": 0.00012307208859899444, + "loss": 3.4556, + "step": 103290 + }, + { + "epoch": 7.018276939801604, + "grad_norm": 0.820844829082489, + "learning_rate": 0.00012302962359016172, + "loss": 3.044, + "step": 103295 + }, + { + "epoch": 7.018616659872265, + "grad_norm": 0.7901012301445007, + "learning_rate": 0.000122987158581329, + "loss": 3.1619, + "step": 103300 + }, + { + "epoch": 7.018956379942927, + "grad_norm": 0.9813311100006104, + "learning_rate": 0.00012294469357249628, + "loss": 3.5438, + "step": 103305 + }, + { + "epoch": 7.019296100013589, + "grad_norm": 1.1725399494171143, + "learning_rate": 0.00012290222856366353, + "loss": 3.1521, + "step": 103310 + }, + { + "epoch": 7.01963582008425, + "grad_norm": 0.8368315100669861, + "learning_rate": 0.0001228597635548308, + "loss": 3.0703, + "step": 103315 + }, + { + "epoch": 7.019975540154912, + "grad_norm": 0.9761486053466797, + "learning_rate": 0.0001228172985459981, + "loss": 3.248, + "step": 103320 + }, + { + "epoch": 7.0203152602255745, + "grad_norm": 0.8730729818344116, + "learning_rate": 0.0001227748335371654, + "loss": 3.637, + "step": 103325 + }, + { + "epoch": 7.020654980296236, + "grad_norm": 1.0443464517593384, + "learning_rate": 0.00012273236852833265, + "loss": 3.6815, + "step": 103330 + }, + { + "epoch": 7.020994700366898, + "grad_norm": 1.1633753776550293, + "learning_rate": 0.00012268990351949993, + "loss": 3.4037, + "step": 103335 + }, + { + "epoch": 7.02133442043756, + "grad_norm": 1.0507506132125854, + "learning_rate": 0.0001226474385106672, + "loss": 3.4701, + "step": 103340 + }, + { + "epoch": 7.021674140508221, + "grad_norm": 0.8971210718154907, + "learning_rate": 0.0001226049735018345, + "loss": 3.3345, + "step": 103345 + }, + { + "epoch": 7.022013860578883, + "grad_norm": 1.1650457382202148, + "learning_rate": 0.00012256250849300177, + "loss": 3.5113, + "step": 103350 + }, + { + "epoch": 7.022353580649545, + "grad_norm": 1.0187745094299316, + "learning_rate": 0.00012252004348416905, + "loss": 3.3068, + "step": 103355 + }, + { + "epoch": 7.022693300720206, + "grad_norm": 0.7960084676742554, + "learning_rate": 0.00012247757847533633, + "loss": 3.4729, + "step": 103360 + }, + { + "epoch": 7.023033020790868, + "grad_norm": 0.7628434896469116, + "learning_rate": 0.0001224351134665036, + "loss": 3.2529, + "step": 103365 + }, + { + "epoch": 7.0233727408615305, + "grad_norm": 0.8763605356216431, + "learning_rate": 0.0001223926484576709, + "loss": 3.4551, + "step": 103370 + }, + { + "epoch": 7.023712460932192, + "grad_norm": 0.8353583216667175, + "learning_rate": 0.00012235018344883817, + "loss": 3.1582, + "step": 103375 + }, + { + "epoch": 7.024052181002854, + "grad_norm": 0.8912373185157776, + "learning_rate": 0.00012230771844000542, + "loss": 3.2754, + "step": 103380 + }, + { + "epoch": 7.024391901073516, + "grad_norm": 0.9821440577507019, + "learning_rate": 0.0001222652534311727, + "loss": 3.5228, + "step": 103385 + }, + { + "epoch": 7.024731621144177, + "grad_norm": 0.7651262879371643, + "learning_rate": 0.00012222278842234, + "loss": 3.3235, + "step": 103390 + }, + { + "epoch": 7.025071341214839, + "grad_norm": 0.8399121761322021, + "learning_rate": 0.00012218032341350726, + "loss": 3.1755, + "step": 103395 + }, + { + "epoch": 7.025411061285501, + "grad_norm": 0.9679678082466125, + "learning_rate": 0.00012213785840467454, + "loss": 3.3499, + "step": 103400 + }, + { + "epoch": 7.025750781356162, + "grad_norm": 0.722880482673645, + "learning_rate": 0.00012209539339584182, + "loss": 3.4296, + "step": 103405 + }, + { + "epoch": 7.0260905014268245, + "grad_norm": 1.105354905128479, + "learning_rate": 0.00012205292838700912, + "loss": 3.5414, + "step": 103410 + }, + { + "epoch": 7.026430221497486, + "grad_norm": 0.9689380526542664, + "learning_rate": 0.00012201046337817638, + "loss": 3.4686, + "step": 103415 + }, + { + "epoch": 7.026769941568148, + "grad_norm": 0.8568660616874695, + "learning_rate": 0.00012196799836934366, + "loss": 3.4082, + "step": 103420 + }, + { + "epoch": 7.02710966163881, + "grad_norm": 0.9366888403892517, + "learning_rate": 0.00012192553336051094, + "loss": 3.2866, + "step": 103425 + }, + { + "epoch": 7.027449381709471, + "grad_norm": 0.8999916315078735, + "learning_rate": 0.00012188306835167821, + "loss": 3.4381, + "step": 103430 + }, + { + "epoch": 7.027789101780133, + "grad_norm": 2.666276454925537, + "learning_rate": 0.00012184060334284549, + "loss": 3.5063, + "step": 103435 + }, + { + "epoch": 7.028128821850795, + "grad_norm": 1.1867799758911133, + "learning_rate": 0.00012179813833401278, + "loss": 3.4214, + "step": 103440 + }, + { + "epoch": 7.028468541921456, + "grad_norm": 0.8475272059440613, + "learning_rate": 0.00012175567332518006, + "loss": 3.54, + "step": 103445 + }, + { + "epoch": 7.028808261992118, + "grad_norm": 0.9603546261787415, + "learning_rate": 0.00012171320831634733, + "loss": 3.2372, + "step": 103450 + }, + { + "epoch": 7.0291479820627805, + "grad_norm": 0.9016897678375244, + "learning_rate": 0.00012167074330751461, + "loss": 3.2932, + "step": 103455 + }, + { + "epoch": 7.029487702133442, + "grad_norm": 1.0100300312042236, + "learning_rate": 0.00012162827829868189, + "loss": 3.4995, + "step": 103460 + }, + { + "epoch": 7.029827422204104, + "grad_norm": 1.1568406820297241, + "learning_rate": 0.00012158581328984916, + "loss": 3.5416, + "step": 103465 + }, + { + "epoch": 7.030167142274766, + "grad_norm": 0.9766241908073425, + "learning_rate": 0.00012154334828101644, + "loss": 3.4725, + "step": 103470 + }, + { + "epoch": 7.030506862345427, + "grad_norm": 0.8022316694259644, + "learning_rate": 0.00012150088327218373, + "loss": 3.224, + "step": 103475 + }, + { + "epoch": 7.030846582416089, + "grad_norm": 1.1728277206420898, + "learning_rate": 0.000121458418263351, + "loss": 3.3851, + "step": 103480 + }, + { + "epoch": 7.031186302486751, + "grad_norm": 0.9686022996902466, + "learning_rate": 0.00012141595325451828, + "loss": 3.407, + "step": 103485 + }, + { + "epoch": 7.031526022557412, + "grad_norm": 1.2470797300338745, + "learning_rate": 0.00012137348824568556, + "loss": 3.2771, + "step": 103490 + }, + { + "epoch": 7.031865742628074, + "grad_norm": 1.024888277053833, + "learning_rate": 0.00012133102323685284, + "loss": 3.2888, + "step": 103495 + }, + { + "epoch": 7.0322054626987365, + "grad_norm": 1.7769429683685303, + "learning_rate": 0.0001212885582280201, + "loss": 3.4427, + "step": 103500 + }, + { + "epoch": 7.032545182769398, + "grad_norm": 0.7823207378387451, + "learning_rate": 0.0001212460932191874, + "loss": 3.2229, + "step": 103505 + }, + { + "epoch": 7.03288490284006, + "grad_norm": 0.7473933100700378, + "learning_rate": 0.00012120362821035468, + "loss": 3.5153, + "step": 103510 + }, + { + "epoch": 7.033224622910722, + "grad_norm": 0.6877932548522949, + "learning_rate": 0.00012116116320152195, + "loss": 3.4639, + "step": 103515 + }, + { + "epoch": 7.033564342981383, + "grad_norm": 1.065366506576538, + "learning_rate": 0.00012111869819268923, + "loss": 3.2853, + "step": 103520 + }, + { + "epoch": 7.033904063052045, + "grad_norm": 0.792716383934021, + "learning_rate": 0.0001210762331838565, + "loss": 3.2543, + "step": 103525 + }, + { + "epoch": 7.034243783122707, + "grad_norm": 0.8847819566726685, + "learning_rate": 0.00012103376817502379, + "loss": 3.6008, + "step": 103530 + }, + { + "epoch": 7.034583503193368, + "grad_norm": 0.8971681594848633, + "learning_rate": 0.00012099130316619105, + "loss": 3.4075, + "step": 103535 + }, + { + "epoch": 7.03492322326403, + "grad_norm": 0.876962423324585, + "learning_rate": 0.00012094883815735835, + "loss": 3.6214, + "step": 103540 + }, + { + "epoch": 7.0352629433346925, + "grad_norm": 1.0735880136489868, + "learning_rate": 0.00012090637314852563, + "loss": 3.279, + "step": 103545 + }, + { + "epoch": 7.035602663405354, + "grad_norm": 0.7778129577636719, + "learning_rate": 0.00012086390813969289, + "loss": 3.1964, + "step": 103550 + }, + { + "epoch": 7.035942383476016, + "grad_norm": 0.7224797010421753, + "learning_rate": 0.00012082144313086017, + "loss": 3.6324, + "step": 103555 + }, + { + "epoch": 7.036282103546678, + "grad_norm": 0.88723224401474, + "learning_rate": 0.00012077897812202745, + "loss": 3.7399, + "step": 103560 + }, + { + "epoch": 7.036621823617339, + "grad_norm": 0.8379307389259338, + "learning_rate": 0.00012073651311319472, + "loss": 3.5074, + "step": 103565 + }, + { + "epoch": 7.036961543688001, + "grad_norm": 0.8079169392585754, + "learning_rate": 0.000120694048104362, + "loss": 3.3683, + "step": 103570 + }, + { + "epoch": 7.037301263758663, + "grad_norm": 0.7536318302154541, + "learning_rate": 0.00012065158309552929, + "loss": 3.5045, + "step": 103575 + }, + { + "epoch": 7.037640983829324, + "grad_norm": 0.9148828983306885, + "learning_rate": 0.00012060911808669657, + "loss": 3.2355, + "step": 103580 + }, + { + "epoch": 7.037980703899986, + "grad_norm": 0.853591799736023, + "learning_rate": 0.00012056665307786384, + "loss": 3.4664, + "step": 103585 + }, + { + "epoch": 7.0383204239706485, + "grad_norm": 1.0232127904891968, + "learning_rate": 0.00012052418806903112, + "loss": 3.1598, + "step": 103590 + }, + { + "epoch": 7.03866014404131, + "grad_norm": 0.808077335357666, + "learning_rate": 0.0001204817230601984, + "loss": 3.6302, + "step": 103595 + }, + { + "epoch": 7.038999864111972, + "grad_norm": 1.4870327711105347, + "learning_rate": 0.00012043925805136567, + "loss": 3.277, + "step": 103600 + }, + { + "epoch": 7.039339584182634, + "grad_norm": 0.9838422536849976, + "learning_rate": 0.00012039679304253296, + "loss": 3.5181, + "step": 103605 + }, + { + "epoch": 7.039679304253295, + "grad_norm": 1.0182397365570068, + "learning_rate": 0.00012035432803370024, + "loss": 3.6325, + "step": 103610 + }, + { + "epoch": 7.040019024323957, + "grad_norm": 0.9720102548599243, + "learning_rate": 0.00012031186302486752, + "loss": 3.4862, + "step": 103615 + }, + { + "epoch": 7.040358744394619, + "grad_norm": 0.8283244371414185, + "learning_rate": 0.00012026939801603479, + "loss": 3.4679, + "step": 103620 + }, + { + "epoch": 7.04069846446528, + "grad_norm": 0.8802581429481506, + "learning_rate": 0.00012022693300720207, + "loss": 3.4911, + "step": 103625 + }, + { + "epoch": 7.041038184535942, + "grad_norm": 1.0007350444793701, + "learning_rate": 0.00012018446799836935, + "loss": 3.3033, + "step": 103630 + }, + { + "epoch": 7.0413779046066045, + "grad_norm": 0.7451424598693848, + "learning_rate": 0.00012014200298953661, + "loss": 3.4119, + "step": 103635 + }, + { + "epoch": 7.041717624677266, + "grad_norm": 0.9568672776222229, + "learning_rate": 0.0001200995379807039, + "loss": 3.3084, + "step": 103640 + }, + { + "epoch": 7.042057344747928, + "grad_norm": 0.9085438847541809, + "learning_rate": 0.00012005707297187119, + "loss": 3.4814, + "step": 103645 + }, + { + "epoch": 7.04239706481859, + "grad_norm": 1.1396970748901367, + "learning_rate": 0.00012001460796303845, + "loss": 3.5196, + "step": 103650 + }, + { + "epoch": 7.042736784889251, + "grad_norm": 1.2549079656600952, + "learning_rate": 0.00011997214295420573, + "loss": 3.8441, + "step": 103655 + }, + { + "epoch": 7.043076504959913, + "grad_norm": 0.8730775713920593, + "learning_rate": 0.00011992967794537301, + "loss": 3.3899, + "step": 103660 + }, + { + "epoch": 7.043416225030575, + "grad_norm": 0.8653447031974792, + "learning_rate": 0.0001198872129365403, + "loss": 3.4319, + "step": 103665 + }, + { + "epoch": 7.043755945101236, + "grad_norm": 0.8586915731430054, + "learning_rate": 0.00011984474792770757, + "loss": 3.577, + "step": 103670 + }, + { + "epoch": 7.0440956651718984, + "grad_norm": 1.2046120166778564, + "learning_rate": 0.00011980228291887485, + "loss": 3.5164, + "step": 103675 + }, + { + "epoch": 7.0444353852425605, + "grad_norm": 1.101643443107605, + "learning_rate": 0.00011975981791004213, + "loss": 3.3946, + "step": 103680 + }, + { + "epoch": 7.044775105313222, + "grad_norm": 0.9673594832420349, + "learning_rate": 0.0001197173529012094, + "loss": 3.5824, + "step": 103685 + }, + { + "epoch": 7.045114825383884, + "grad_norm": 0.983705997467041, + "learning_rate": 0.00011967488789237668, + "loss": 3.1683, + "step": 103690 + }, + { + "epoch": 7.045454545454546, + "grad_norm": 0.7853025794029236, + "learning_rate": 0.00011963242288354396, + "loss": 3.2209, + "step": 103695 + }, + { + "epoch": 7.045794265525207, + "grad_norm": 0.9548130631446838, + "learning_rate": 0.00011958995787471124, + "loss": 3.3516, + "step": 103700 + }, + { + "epoch": 7.046133985595869, + "grad_norm": 1.0840785503387451, + "learning_rate": 0.00011954749286587852, + "loss": 3.5275, + "step": 103705 + }, + { + "epoch": 7.046473705666531, + "grad_norm": 0.9790171980857849, + "learning_rate": 0.0001195050278570458, + "loss": 3.6172, + "step": 103710 + }, + { + "epoch": 7.046813425737192, + "grad_norm": 0.8907310962677002, + "learning_rate": 0.00011946256284821308, + "loss": 3.25, + "step": 103715 + }, + { + "epoch": 7.0471531458078545, + "grad_norm": 0.9802649617195129, + "learning_rate": 0.00011942009783938035, + "loss": 3.324, + "step": 103720 + }, + { + "epoch": 7.0474928658785165, + "grad_norm": 1.7781360149383545, + "learning_rate": 0.00011937763283054763, + "loss": 3.6526, + "step": 103725 + }, + { + "epoch": 7.047832585949178, + "grad_norm": 0.926274836063385, + "learning_rate": 0.00011933516782171491, + "loss": 3.6022, + "step": 103730 + }, + { + "epoch": 7.04817230601984, + "grad_norm": 0.9488686323165894, + "learning_rate": 0.00011929270281288219, + "loss": 3.3701, + "step": 103735 + }, + { + "epoch": 7.048512026090501, + "grad_norm": 0.8529803156852722, + "learning_rate": 0.00011925023780404947, + "loss": 3.6147, + "step": 103740 + }, + { + "epoch": 7.048851746161163, + "grad_norm": 0.8580235838890076, + "learning_rate": 0.00011920777279521675, + "loss": 3.6058, + "step": 103745 + }, + { + "epoch": 7.049191466231825, + "grad_norm": 0.8492461442947388, + "learning_rate": 0.00011916530778638403, + "loss": 3.283, + "step": 103750 + }, + { + "epoch": 7.049531186302486, + "grad_norm": 1.1053059101104736, + "learning_rate": 0.0001191228427775513, + "loss": 3.5617, + "step": 103755 + }, + { + "epoch": 7.049870906373148, + "grad_norm": 1.0680055618286133, + "learning_rate": 0.00011908037776871857, + "loss": 3.171, + "step": 103760 + }, + { + "epoch": 7.0502106264438105, + "grad_norm": 1.1177725791931152, + "learning_rate": 0.00011903791275988585, + "loss": 3.0962, + "step": 103765 + }, + { + "epoch": 7.050550346514472, + "grad_norm": 0.8498919606208801, + "learning_rate": 0.00011899544775105313, + "loss": 3.4736, + "step": 103770 + }, + { + "epoch": 7.050890066585134, + "grad_norm": 0.7526494860649109, + "learning_rate": 0.00011895298274222041, + "loss": 3.3037, + "step": 103775 + }, + { + "epoch": 7.051229786655796, + "grad_norm": 0.951774537563324, + "learning_rate": 0.0001189105177333877, + "loss": 3.2637, + "step": 103780 + }, + { + "epoch": 7.051569506726457, + "grad_norm": 0.8113771677017212, + "learning_rate": 0.00011886805272455497, + "loss": 3.5281, + "step": 103785 + }, + { + "epoch": 7.051909226797119, + "grad_norm": 0.9663901925086975, + "learning_rate": 0.00011882558771572224, + "loss": 3.2836, + "step": 103790 + }, + { + "epoch": 7.052248946867781, + "grad_norm": 0.7112084627151489, + "learning_rate": 0.00011878312270688952, + "loss": 3.4475, + "step": 103795 + }, + { + "epoch": 7.052588666938442, + "grad_norm": 1.0196384191513062, + "learning_rate": 0.00011874065769805681, + "loss": 3.4312, + "step": 103800 + }, + { + "epoch": 7.052928387009104, + "grad_norm": 0.7302797436714172, + "learning_rate": 0.00011869819268922408, + "loss": 3.2683, + "step": 103805 + }, + { + "epoch": 7.0532681070797665, + "grad_norm": 1.1464961767196655, + "learning_rate": 0.00011865572768039136, + "loss": 3.2657, + "step": 103810 + }, + { + "epoch": 7.053607827150428, + "grad_norm": 0.964594841003418, + "learning_rate": 0.00011861326267155864, + "loss": 3.5473, + "step": 103815 + }, + { + "epoch": 7.05394754722109, + "grad_norm": 0.8336894512176514, + "learning_rate": 0.00011857079766272591, + "loss": 3.2141, + "step": 103820 + }, + { + "epoch": 7.054287267291752, + "grad_norm": 1.3233637809753418, + "learning_rate": 0.00011852833265389319, + "loss": 3.2371, + "step": 103825 + }, + { + "epoch": 7.054626987362413, + "grad_norm": 0.9058504700660706, + "learning_rate": 0.00011848586764506047, + "loss": 3.3432, + "step": 103830 + }, + { + "epoch": 7.054966707433075, + "grad_norm": 1.2610450983047485, + "learning_rate": 0.00011844340263622776, + "loss": 3.3962, + "step": 103835 + }, + { + "epoch": 7.055306427503737, + "grad_norm": 1.2502490282058716, + "learning_rate": 0.00011840093762739503, + "loss": 3.1271, + "step": 103840 + }, + { + "epoch": 7.055646147574398, + "grad_norm": 0.8292462825775146, + "learning_rate": 0.00011835847261856231, + "loss": 3.4098, + "step": 103845 + }, + { + "epoch": 7.05598586764506, + "grad_norm": 0.90743488073349, + "learning_rate": 0.00011831600760972959, + "loss": 3.425, + "step": 103850 + }, + { + "epoch": 7.0563255877157225, + "grad_norm": 0.8987111449241638, + "learning_rate": 0.00011827354260089686, + "loss": 3.5824, + "step": 103855 + }, + { + "epoch": 7.056665307786384, + "grad_norm": 1.455331802368164, + "learning_rate": 0.00011823107759206414, + "loss": 3.5599, + "step": 103860 + }, + { + "epoch": 7.057005027857046, + "grad_norm": 0.9033448696136475, + "learning_rate": 0.00011818861258323143, + "loss": 3.3676, + "step": 103865 + }, + { + "epoch": 7.057344747927708, + "grad_norm": 1.1001006364822388, + "learning_rate": 0.00011814614757439871, + "loss": 3.5112, + "step": 103870 + }, + { + "epoch": 7.057684467998369, + "grad_norm": 1.2981910705566406, + "learning_rate": 0.00011810368256556598, + "loss": 3.4444, + "step": 103875 + }, + { + "epoch": 7.058024188069031, + "grad_norm": 1.1378637552261353, + "learning_rate": 0.00011806121755673326, + "loss": 2.805, + "step": 103880 + }, + { + "epoch": 7.058363908139693, + "grad_norm": 0.924365758895874, + "learning_rate": 0.00011801875254790054, + "loss": 3.3647, + "step": 103885 + }, + { + "epoch": 7.058703628210354, + "grad_norm": 0.89938884973526, + "learning_rate": 0.0001179762875390678, + "loss": 3.593, + "step": 103890 + }, + { + "epoch": 7.059043348281016, + "grad_norm": 0.7457945346832275, + "learning_rate": 0.00011793382253023508, + "loss": 3.3527, + "step": 103895 + }, + { + "epoch": 7.0593830683516785, + "grad_norm": 1.0490291118621826, + "learning_rate": 0.00011789135752140238, + "loss": 3.4452, + "step": 103900 + }, + { + "epoch": 7.05972278842234, + "grad_norm": 0.8919139504432678, + "learning_rate": 0.00011784889251256964, + "loss": 3.5852, + "step": 103905 + }, + { + "epoch": 7.060062508493002, + "grad_norm": 0.8256655335426331, + "learning_rate": 0.00011780642750373692, + "loss": 3.5237, + "step": 103910 + }, + { + "epoch": 7.060402228563664, + "grad_norm": 0.8167633414268494, + "learning_rate": 0.0001177639624949042, + "loss": 3.0751, + "step": 103915 + }, + { + "epoch": 7.060741948634325, + "grad_norm": 1.2284481525421143, + "learning_rate": 0.00011772149748607148, + "loss": 3.263, + "step": 103920 + }, + { + "epoch": 7.061081668704987, + "grad_norm": 0.9197717308998108, + "learning_rate": 0.00011767903247723875, + "loss": 3.4573, + "step": 103925 + }, + { + "epoch": 7.061421388775649, + "grad_norm": 1.088013768196106, + "learning_rate": 0.00011763656746840604, + "loss": 3.4062, + "step": 103930 + }, + { + "epoch": 7.06176110884631, + "grad_norm": 0.802029013633728, + "learning_rate": 0.00011759410245957332, + "loss": 3.6171, + "step": 103935 + }, + { + "epoch": 7.062100828916972, + "grad_norm": 0.9430985450744629, + "learning_rate": 0.00011755163745074059, + "loss": 3.426, + "step": 103940 + }, + { + "epoch": 7.0624405489876345, + "grad_norm": 0.8136422634124756, + "learning_rate": 0.00011750917244190787, + "loss": 3.4842, + "step": 103945 + }, + { + "epoch": 7.062780269058296, + "grad_norm": 1.1100726127624512, + "learning_rate": 0.00011746670743307515, + "loss": 3.2277, + "step": 103950 + }, + { + "epoch": 7.063119989128958, + "grad_norm": 0.9441293478012085, + "learning_rate": 0.00011742424242424243, + "loss": 3.5212, + "step": 103955 + }, + { + "epoch": 7.06345970919962, + "grad_norm": 0.8779008388519287, + "learning_rate": 0.0001173817774154097, + "loss": 3.3871, + "step": 103960 + }, + { + "epoch": 7.063799429270281, + "grad_norm": 0.9114820957183838, + "learning_rate": 0.00011733931240657699, + "loss": 3.5325, + "step": 103965 + }, + { + "epoch": 7.064139149340943, + "grad_norm": 1.0565574169158936, + "learning_rate": 0.00011729684739774427, + "loss": 3.5381, + "step": 103970 + }, + { + "epoch": 7.064478869411605, + "grad_norm": 0.7217946648597717, + "learning_rate": 0.00011725438238891154, + "loss": 3.5152, + "step": 103975 + }, + { + "epoch": 7.064818589482266, + "grad_norm": 0.976195216178894, + "learning_rate": 0.00011721191738007882, + "loss": 3.3391, + "step": 103980 + }, + { + "epoch": 7.0651583095529285, + "grad_norm": 0.9735282063484192, + "learning_rate": 0.0001171694523712461, + "loss": 3.1585, + "step": 103985 + }, + { + "epoch": 7.0654980296235905, + "grad_norm": 0.864251971244812, + "learning_rate": 0.00011712698736241336, + "loss": 3.1774, + "step": 103990 + }, + { + "epoch": 7.065837749694252, + "grad_norm": 0.9424875974655151, + "learning_rate": 0.00011708452235358064, + "loss": 3.3383, + "step": 103995 + }, + { + "epoch": 7.066177469764914, + "grad_norm": 0.9848553538322449, + "learning_rate": 0.00011704205734474794, + "loss": 3.512, + "step": 104000 + }, + { + "epoch": 7.066517189835576, + "grad_norm": 0.747008740901947, + "learning_rate": 0.00011699959233591522, + "loss": 3.2819, + "step": 104005 + }, + { + "epoch": 7.066856909906237, + "grad_norm": 0.9827887415885925, + "learning_rate": 0.00011695712732708248, + "loss": 3.3291, + "step": 104010 + }, + { + "epoch": 7.067196629976899, + "grad_norm": 1.0619357824325562, + "learning_rate": 0.00011691466231824976, + "loss": 3.3052, + "step": 104015 + }, + { + "epoch": 7.067536350047561, + "grad_norm": 1.0860077142715454, + "learning_rate": 0.00011687219730941704, + "loss": 3.6197, + "step": 104020 + }, + { + "epoch": 7.067876070118222, + "grad_norm": 1.0262280702590942, + "learning_rate": 0.00011682973230058431, + "loss": 3.2379, + "step": 104025 + }, + { + "epoch": 7.0682157901888845, + "grad_norm": 0.8033637404441833, + "learning_rate": 0.0001167872672917516, + "loss": 3.4037, + "step": 104030 + }, + { + "epoch": 7.0685555102595465, + "grad_norm": 0.7231157422065735, + "learning_rate": 0.00011674480228291888, + "loss": 3.4717, + "step": 104035 + }, + { + "epoch": 7.068895230330208, + "grad_norm": 0.9587050676345825, + "learning_rate": 0.00011670233727408616, + "loss": 3.5326, + "step": 104040 + }, + { + "epoch": 7.06923495040087, + "grad_norm": 0.8588607311248779, + "learning_rate": 0.00011665987226525343, + "loss": 3.5375, + "step": 104045 + }, + { + "epoch": 7.069574670471532, + "grad_norm": 0.8311889171600342, + "learning_rate": 0.00011661740725642071, + "loss": 3.4033, + "step": 104050 + }, + { + "epoch": 7.069914390542193, + "grad_norm": 1.4294191598892212, + "learning_rate": 0.00011657494224758799, + "loss": 3.1779, + "step": 104055 + }, + { + "epoch": 7.070254110612855, + "grad_norm": 0.7866834998130798, + "learning_rate": 0.00011653247723875526, + "loss": 3.6106, + "step": 104060 + }, + { + "epoch": 7.070593830683517, + "grad_norm": 0.9904270172119141, + "learning_rate": 0.00011649001222992255, + "loss": 3.1793, + "step": 104065 + }, + { + "epoch": 7.070933550754178, + "grad_norm": 0.9291178584098816, + "learning_rate": 0.00011644754722108983, + "loss": 3.3888, + "step": 104070 + }, + { + "epoch": 7.0712732708248405, + "grad_norm": 1.6459388732910156, + "learning_rate": 0.0001164050822122571, + "loss": 3.323, + "step": 104075 + }, + { + "epoch": 7.0716129908955025, + "grad_norm": 1.1255115270614624, + "learning_rate": 0.00011636261720342438, + "loss": 3.1814, + "step": 104080 + }, + { + "epoch": 7.071952710966164, + "grad_norm": 0.9261187314987183, + "learning_rate": 0.00011632015219459166, + "loss": 3.4634, + "step": 104085 + }, + { + "epoch": 7.072292431036826, + "grad_norm": 1.000814437866211, + "learning_rate": 0.00011627768718575894, + "loss": 3.3516, + "step": 104090 + }, + { + "epoch": 7.072632151107487, + "grad_norm": 0.9281924962997437, + "learning_rate": 0.00011623522217692622, + "loss": 3.3841, + "step": 104095 + }, + { + "epoch": 7.072971871178149, + "grad_norm": 1.2089521884918213, + "learning_rate": 0.0001161927571680935, + "loss": 3.2986, + "step": 104100 + }, + { + "epoch": 7.073311591248811, + "grad_norm": 0.9928988814353943, + "learning_rate": 0.00011615029215926078, + "loss": 3.4061, + "step": 104105 + }, + { + "epoch": 7.073651311319472, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.00011610782715042804, + "loss": 3.3411, + "step": 104110 + }, + { + "epoch": 7.073991031390134, + "grad_norm": 0.9076882600784302, + "learning_rate": 0.00011606536214159532, + "loss": 3.5007, + "step": 104115 + }, + { + "epoch": 7.0743307514607965, + "grad_norm": 0.9266732335090637, + "learning_rate": 0.0001160228971327626, + "loss": 3.4302, + "step": 104120 + }, + { + "epoch": 7.074670471531458, + "grad_norm": 1.3790770769119263, + "learning_rate": 0.00011598043212392988, + "loss": 3.294, + "step": 104125 + }, + { + "epoch": 7.07501019160212, + "grad_norm": 0.8632657527923584, + "learning_rate": 0.00011593796711509716, + "loss": 3.4597, + "step": 104130 + }, + { + "epoch": 7.075349911672782, + "grad_norm": 0.9229718446731567, + "learning_rate": 0.00011589550210626444, + "loss": 3.4392, + "step": 104135 + }, + { + "epoch": 7.075689631743443, + "grad_norm": 0.7536664009094238, + "learning_rate": 0.00011585303709743172, + "loss": 3.2781, + "step": 104140 + }, + { + "epoch": 7.076029351814105, + "grad_norm": 0.7467969059944153, + "learning_rate": 0.00011581057208859899, + "loss": 3.3798, + "step": 104145 + }, + { + "epoch": 7.076369071884767, + "grad_norm": 0.8969287872314453, + "learning_rate": 0.00011576810707976627, + "loss": 3.447, + "step": 104150 + }, + { + "epoch": 7.076708791955428, + "grad_norm": 0.9934607148170471, + "learning_rate": 0.00011572564207093355, + "loss": 3.4706, + "step": 104155 + }, + { + "epoch": 7.07704851202609, + "grad_norm": 1.024375557899475, + "learning_rate": 0.00011568317706210083, + "loss": 3.4044, + "step": 104160 + }, + { + "epoch": 7.0773882320967525, + "grad_norm": 1.84824800491333, + "learning_rate": 0.00011564071205326811, + "loss": 3.2102, + "step": 104165 + }, + { + "epoch": 7.077727952167414, + "grad_norm": 1.355490803718567, + "learning_rate": 0.00011559824704443539, + "loss": 3.199, + "step": 104170 + }, + { + "epoch": 7.078067672238076, + "grad_norm": 1.1342170238494873, + "learning_rate": 0.00011555578203560267, + "loss": 3.5123, + "step": 104175 + }, + { + "epoch": 7.078407392308738, + "grad_norm": 1.732767105102539, + "learning_rate": 0.00011551331702676994, + "loss": 3.3199, + "step": 104180 + }, + { + "epoch": 7.078747112379399, + "grad_norm": 0.9353762269020081, + "learning_rate": 0.00011547085201793722, + "loss": 3.48, + "step": 104185 + }, + { + "epoch": 7.079086832450061, + "grad_norm": 0.9843038320541382, + "learning_rate": 0.0001154283870091045, + "loss": 3.3661, + "step": 104190 + }, + { + "epoch": 7.079426552520723, + "grad_norm": 1.0421202182769775, + "learning_rate": 0.00011538592200027178, + "loss": 3.5135, + "step": 104195 + }, + { + "epoch": 7.079766272591384, + "grad_norm": 0.8943115472793579, + "learning_rate": 0.00011534345699143906, + "loss": 3.3794, + "step": 104200 + }, + { + "epoch": 7.080105992662046, + "grad_norm": 0.6536949872970581, + "learning_rate": 0.00011530099198260634, + "loss": 3.2156, + "step": 104205 + }, + { + "epoch": 7.0804457127327085, + "grad_norm": 1.1114728450775146, + "learning_rate": 0.00011525852697377362, + "loss": 3.5299, + "step": 104210 + }, + { + "epoch": 7.08078543280337, + "grad_norm": 0.9908339977264404, + "learning_rate": 0.00011521606196494089, + "loss": 3.4236, + "step": 104215 + }, + { + "epoch": 7.081125152874032, + "grad_norm": 0.8799673318862915, + "learning_rate": 0.00011517359695610817, + "loss": 3.4124, + "step": 104220 + }, + { + "epoch": 7.081464872944694, + "grad_norm": 0.765829861164093, + "learning_rate": 0.00011513113194727546, + "loss": 3.2238, + "step": 104225 + }, + { + "epoch": 7.081804593015355, + "grad_norm": 1.0194196701049805, + "learning_rate": 0.00011508866693844273, + "loss": 3.1749, + "step": 104230 + }, + { + "epoch": 7.082144313086017, + "grad_norm": 1.2206164598464966, + "learning_rate": 0.00011504620192961, + "loss": 3.1725, + "step": 104235 + }, + { + "epoch": 7.082484033156679, + "grad_norm": 0.9589239358901978, + "learning_rate": 0.00011500373692077729, + "loss": 3.2093, + "step": 104240 + }, + { + "epoch": 7.08282375322734, + "grad_norm": 0.7979382276535034, + "learning_rate": 0.00011496127191194455, + "loss": 3.4209, + "step": 104245 + }, + { + "epoch": 7.083163473298002, + "grad_norm": 1.152674674987793, + "learning_rate": 0.00011491880690311183, + "loss": 3.4288, + "step": 104250 + }, + { + "epoch": 7.0835031933686645, + "grad_norm": 0.8104459643363953, + "learning_rate": 0.00011487634189427911, + "loss": 3.3798, + "step": 104255 + }, + { + "epoch": 7.083842913439326, + "grad_norm": 0.8000004887580872, + "learning_rate": 0.0001148338768854464, + "loss": 3.5753, + "step": 104260 + }, + { + "epoch": 7.084182633509988, + "grad_norm": 1.0810353755950928, + "learning_rate": 0.00011479141187661367, + "loss": 3.3155, + "step": 104265 + }, + { + "epoch": 7.08452235358065, + "grad_norm": 0.6650676131248474, + "learning_rate": 0.00011474894686778095, + "loss": 3.3608, + "step": 104270 + }, + { + "epoch": 7.084862073651311, + "grad_norm": 0.8622875213623047, + "learning_rate": 0.00011470648185894823, + "loss": 3.3735, + "step": 104275 + }, + { + "epoch": 7.085201793721973, + "grad_norm": 0.7658130526542664, + "learning_rate": 0.0001146640168501155, + "loss": 3.5446, + "step": 104280 + }, + { + "epoch": 7.085541513792635, + "grad_norm": 1.0679903030395508, + "learning_rate": 0.00011462155184128278, + "loss": 3.2719, + "step": 104285 + }, + { + "epoch": 7.085881233863296, + "grad_norm": 1.099033236503601, + "learning_rate": 0.00011457908683245007, + "loss": 3.6487, + "step": 104290 + }, + { + "epoch": 7.0862209539339585, + "grad_norm": 0.8948459625244141, + "learning_rate": 0.00011453662182361735, + "loss": 3.6083, + "step": 104295 + }, + { + "epoch": 7.0865606740046205, + "grad_norm": 0.8178374767303467, + "learning_rate": 0.00011449415681478462, + "loss": 3.4731, + "step": 104300 + }, + { + "epoch": 7.086900394075282, + "grad_norm": 0.7628286480903625, + "learning_rate": 0.0001144516918059519, + "loss": 3.3858, + "step": 104305 + }, + { + "epoch": 7.087240114145944, + "grad_norm": 1.1509989500045776, + "learning_rate": 0.00011440922679711918, + "loss": 3.2411, + "step": 104310 + }, + { + "epoch": 7.087579834216606, + "grad_norm": 0.7682275176048279, + "learning_rate": 0.00011436676178828645, + "loss": 3.3316, + "step": 104315 + }, + { + "epoch": 7.087919554287267, + "grad_norm": 0.984397292137146, + "learning_rate": 0.00011432429677945373, + "loss": 3.3803, + "step": 104320 + }, + { + "epoch": 7.088259274357929, + "grad_norm": 0.9718103408813477, + "learning_rate": 0.00011428183177062102, + "loss": 3.6995, + "step": 104325 + }, + { + "epoch": 7.088598994428591, + "grad_norm": 0.9029846787452698, + "learning_rate": 0.00011423936676178829, + "loss": 3.4332, + "step": 104330 + }, + { + "epoch": 7.088938714499252, + "grad_norm": 0.9939478039741516, + "learning_rate": 0.00011419690175295557, + "loss": 3.4134, + "step": 104335 + }, + { + "epoch": 7.0892784345699145, + "grad_norm": 0.8481655716896057, + "learning_rate": 0.00011415443674412285, + "loss": 3.2403, + "step": 104340 + }, + { + "epoch": 7.0896181546405765, + "grad_norm": 0.957554817199707, + "learning_rate": 0.00011411197173529013, + "loss": 3.3809, + "step": 104345 + }, + { + "epoch": 7.089957874711238, + "grad_norm": 0.9866841435432434, + "learning_rate": 0.00011406950672645739, + "loss": 3.2015, + "step": 104350 + }, + { + "epoch": 7.0902975947819, + "grad_norm": 0.7408316135406494, + "learning_rate": 0.00011402704171762469, + "loss": 3.4243, + "step": 104355 + }, + { + "epoch": 7.090637314852562, + "grad_norm": 1.0829781293869019, + "learning_rate": 0.00011398457670879197, + "loss": 3.4824, + "step": 104360 + }, + { + "epoch": 7.090977034923223, + "grad_norm": 0.7531844973564148, + "learning_rate": 0.00011394211169995923, + "loss": 3.4734, + "step": 104365 + }, + { + "epoch": 7.091316754993885, + "grad_norm": 0.8292893767356873, + "learning_rate": 0.00011389964669112651, + "loss": 3.4812, + "step": 104370 + }, + { + "epoch": 7.091656475064547, + "grad_norm": 0.731326699256897, + "learning_rate": 0.0001138571816822938, + "loss": 3.503, + "step": 104375 + }, + { + "epoch": 7.091996195135208, + "grad_norm": 0.9331722259521484, + "learning_rate": 0.00011381471667346107, + "loss": 3.2977, + "step": 104380 + }, + { + "epoch": 7.0923359152058705, + "grad_norm": 0.8937399983406067, + "learning_rate": 0.00011377225166462834, + "loss": 3.1681, + "step": 104385 + }, + { + "epoch": 7.0926756352765326, + "grad_norm": 0.9695686101913452, + "learning_rate": 0.00011372978665579563, + "loss": 3.4581, + "step": 104390 + }, + { + "epoch": 7.093015355347194, + "grad_norm": 0.9388794898986816, + "learning_rate": 0.00011368732164696291, + "loss": 3.509, + "step": 104395 + }, + { + "epoch": 7.093355075417856, + "grad_norm": 0.8248666524887085, + "learning_rate": 0.00011364485663813018, + "loss": 3.33, + "step": 104400 + }, + { + "epoch": 7.093694795488518, + "grad_norm": 0.9024440050125122, + "learning_rate": 0.00011360239162929746, + "loss": 3.2819, + "step": 104405 + }, + { + "epoch": 7.094034515559179, + "grad_norm": 0.9828065037727356, + "learning_rate": 0.00011355992662046474, + "loss": 3.3787, + "step": 104410 + }, + { + "epoch": 7.094374235629841, + "grad_norm": 0.8965364098548889, + "learning_rate": 0.00011351746161163201, + "loss": 3.3242, + "step": 104415 + }, + { + "epoch": 7.094713955700502, + "grad_norm": 1.090340495109558, + "learning_rate": 0.00011347499660279929, + "loss": 3.23, + "step": 104420 + }, + { + "epoch": 7.095053675771164, + "grad_norm": 0.8266969323158264, + "learning_rate": 0.00011343253159396658, + "loss": 3.3239, + "step": 104425 + }, + { + "epoch": 7.0953933958418265, + "grad_norm": 0.7782180905342102, + "learning_rate": 0.00011339006658513386, + "loss": 3.2283, + "step": 104430 + }, + { + "epoch": 7.095733115912488, + "grad_norm": 0.8258267045021057, + "learning_rate": 0.00011334760157630113, + "loss": 3.4793, + "step": 104435 + }, + { + "epoch": 7.09607283598315, + "grad_norm": 0.8219000101089478, + "learning_rate": 0.00011330513656746841, + "loss": 3.2901, + "step": 104440 + }, + { + "epoch": 7.096412556053812, + "grad_norm": 0.8641642332077026, + "learning_rate": 0.00011326267155863569, + "loss": 3.3648, + "step": 104445 + }, + { + "epoch": 7.096752276124473, + "grad_norm": 1.17420494556427, + "learning_rate": 0.00011322020654980295, + "loss": 3.2344, + "step": 104450 + }, + { + "epoch": 7.097091996195135, + "grad_norm": 1.0017011165618896, + "learning_rate": 0.00011317774154097025, + "loss": 3.255, + "step": 104455 + }, + { + "epoch": 7.097431716265797, + "grad_norm": 0.9874993562698364, + "learning_rate": 0.00011313527653213753, + "loss": 3.4152, + "step": 104460 + }, + { + "epoch": 7.097771436336458, + "grad_norm": 1.0167880058288574, + "learning_rate": 0.00011309281152330481, + "loss": 3.4983, + "step": 104465 + }, + { + "epoch": 7.09811115640712, + "grad_norm": 0.8137903213500977, + "learning_rate": 0.00011305034651447207, + "loss": 3.0657, + "step": 104470 + }, + { + "epoch": 7.0984508764777825, + "grad_norm": 0.8214836716651917, + "learning_rate": 0.00011300788150563935, + "loss": 3.222, + "step": 104475 + }, + { + "epoch": 7.098790596548444, + "grad_norm": 0.8453896641731262, + "learning_rate": 0.00011296541649680663, + "loss": 3.5558, + "step": 104480 + }, + { + "epoch": 7.099130316619106, + "grad_norm": 1.064696192741394, + "learning_rate": 0.0001129229514879739, + "loss": 3.167, + "step": 104485 + }, + { + "epoch": 7.099470036689768, + "grad_norm": 0.7510692477226257, + "learning_rate": 0.0001128804864791412, + "loss": 3.5045, + "step": 104490 + }, + { + "epoch": 7.099809756760429, + "grad_norm": 0.9763411283493042, + "learning_rate": 0.00011283802147030847, + "loss": 3.0679, + "step": 104495 + }, + { + "epoch": 7.100149476831091, + "grad_norm": 1.0035704374313354, + "learning_rate": 0.00011279555646147574, + "loss": 3.4712, + "step": 104500 + }, + { + "epoch": 7.100489196901753, + "grad_norm": 1.0123237371444702, + "learning_rate": 0.00011275309145264302, + "loss": 3.2746, + "step": 104505 + }, + { + "epoch": 7.100828916972414, + "grad_norm": 0.9989948272705078, + "learning_rate": 0.0001127106264438103, + "loss": 3.4391, + "step": 104510 + }, + { + "epoch": 7.101168637043076, + "grad_norm": 0.851763129234314, + "learning_rate": 0.00011266816143497758, + "loss": 3.4921, + "step": 104515 + }, + { + "epoch": 7.1015083571137385, + "grad_norm": 2.5475528240203857, + "learning_rate": 0.00011262569642614486, + "loss": 3.2614, + "step": 104520 + }, + { + "epoch": 7.1018480771844, + "grad_norm": 1.1967190504074097, + "learning_rate": 0.00011258323141731214, + "loss": 3.1669, + "step": 104525 + }, + { + "epoch": 7.102187797255062, + "grad_norm": 1.246982455253601, + "learning_rate": 0.00011254076640847942, + "loss": 3.3498, + "step": 104530 + }, + { + "epoch": 7.102527517325724, + "grad_norm": 0.8950657248497009, + "learning_rate": 0.00011249830139964669, + "loss": 3.188, + "step": 104535 + }, + { + "epoch": 7.102867237396385, + "grad_norm": 0.9605151414871216, + "learning_rate": 0.00011245583639081397, + "loss": 3.3514, + "step": 104540 + }, + { + "epoch": 7.103206957467047, + "grad_norm": 1.0323870182037354, + "learning_rate": 0.00011241337138198125, + "loss": 3.524, + "step": 104545 + }, + { + "epoch": 7.103546677537709, + "grad_norm": 0.886607825756073, + "learning_rate": 0.00011237090637314853, + "loss": 3.3912, + "step": 104550 + }, + { + "epoch": 7.10388639760837, + "grad_norm": 1.070296049118042, + "learning_rate": 0.00011232844136431581, + "loss": 3.324, + "step": 104555 + }, + { + "epoch": 7.1042261176790324, + "grad_norm": 0.9121344089508057, + "learning_rate": 0.00011228597635548309, + "loss": 3.3014, + "step": 104560 + }, + { + "epoch": 7.1045658377496945, + "grad_norm": 0.9322543144226074, + "learning_rate": 0.00011224351134665037, + "loss": 3.1821, + "step": 104565 + }, + { + "epoch": 7.104905557820356, + "grad_norm": 0.7935618162155151, + "learning_rate": 0.00011220104633781764, + "loss": 3.3836, + "step": 104570 + }, + { + "epoch": 7.105245277891018, + "grad_norm": 1.1654421091079712, + "learning_rate": 0.00011215858132898492, + "loss": 3.3422, + "step": 104575 + }, + { + "epoch": 7.10558499796168, + "grad_norm": 1.1908732652664185, + "learning_rate": 0.0001121161163201522, + "loss": 3.3114, + "step": 104580 + }, + { + "epoch": 7.105924718032341, + "grad_norm": 1.2614853382110596, + "learning_rate": 0.00011207365131131948, + "loss": 3.4882, + "step": 104585 + }, + { + "epoch": 7.106264438103003, + "grad_norm": 0.8872395753860474, + "learning_rate": 0.00011203118630248676, + "loss": 3.4798, + "step": 104590 + }, + { + "epoch": 7.106604158173665, + "grad_norm": 1.19614839553833, + "learning_rate": 0.00011198872129365404, + "loss": 3.1945, + "step": 104595 + }, + { + "epoch": 7.106943878244326, + "grad_norm": 0.9494301676750183, + "learning_rate": 0.00011194625628482132, + "loss": 3.332, + "step": 104600 + }, + { + "epoch": 7.1072835983149885, + "grad_norm": 1.285488247871399, + "learning_rate": 0.00011190379127598858, + "loss": 3.6261, + "step": 104605 + }, + { + "epoch": 7.1076233183856505, + "grad_norm": 0.9555674195289612, + "learning_rate": 0.00011186132626715586, + "loss": 3.0921, + "step": 104610 + }, + { + "epoch": 7.107963038456312, + "grad_norm": 0.9263401031494141, + "learning_rate": 0.00011181886125832314, + "loss": 3.3443, + "step": 104615 + }, + { + "epoch": 7.108302758526974, + "grad_norm": 1.0956751108169556, + "learning_rate": 0.00011177639624949042, + "loss": 3.4758, + "step": 104620 + }, + { + "epoch": 7.108642478597636, + "grad_norm": 0.8740381598472595, + "learning_rate": 0.0001117339312406577, + "loss": 3.3299, + "step": 104625 + }, + { + "epoch": 7.108982198668297, + "grad_norm": 0.8836825489997864, + "learning_rate": 0.00011169146623182498, + "loss": 3.45, + "step": 104630 + }, + { + "epoch": 7.109321918738959, + "grad_norm": 1.2167656421661377, + "learning_rate": 0.00011164900122299226, + "loss": 3.3182, + "step": 104635 + }, + { + "epoch": 7.109661638809621, + "grad_norm": 0.9786140322685242, + "learning_rate": 0.00011160653621415953, + "loss": 3.4729, + "step": 104640 + }, + { + "epoch": 7.110001358880282, + "grad_norm": 0.8961608409881592, + "learning_rate": 0.00011156407120532681, + "loss": 3.2599, + "step": 104645 + }, + { + "epoch": 7.1103410789509445, + "grad_norm": 0.9996669292449951, + "learning_rate": 0.0001115216061964941, + "loss": 3.2176, + "step": 104650 + }, + { + "epoch": 7.1106807990216065, + "grad_norm": 1.056275725364685, + "learning_rate": 0.00011147914118766137, + "loss": 3.2512, + "step": 104655 + }, + { + "epoch": 7.111020519092268, + "grad_norm": 0.9229202270507812, + "learning_rate": 0.00011143667617882865, + "loss": 3.5043, + "step": 104660 + }, + { + "epoch": 7.11136023916293, + "grad_norm": 1.2269407510757446, + "learning_rate": 0.00011139421116999593, + "loss": 3.4222, + "step": 104665 + }, + { + "epoch": 7.111699959233592, + "grad_norm": 0.8689697980880737, + "learning_rate": 0.0001113517461611632, + "loss": 3.5455, + "step": 104670 + }, + { + "epoch": 7.112039679304253, + "grad_norm": 1.3533518314361572, + "learning_rate": 0.00011130928115233048, + "loss": 3.6041, + "step": 104675 + }, + { + "epoch": 7.112379399374915, + "grad_norm": 0.9843369722366333, + "learning_rate": 0.00011126681614349776, + "loss": 3.449, + "step": 104680 + }, + { + "epoch": 7.112719119445577, + "grad_norm": 0.9516898989677429, + "learning_rate": 0.00011122435113466505, + "loss": 3.4874, + "step": 104685 + }, + { + "epoch": 7.113058839516238, + "grad_norm": 0.986560046672821, + "learning_rate": 0.00011118188612583232, + "loss": 3.4506, + "step": 104690 + }, + { + "epoch": 7.1133985595869005, + "grad_norm": 0.8106385469436646, + "learning_rate": 0.0001111394211169996, + "loss": 3.443, + "step": 104695 + }, + { + "epoch": 7.113738279657563, + "grad_norm": 0.9073916673660278, + "learning_rate": 0.00011109695610816688, + "loss": 3.3447, + "step": 104700 + }, + { + "epoch": 7.114077999728224, + "grad_norm": 0.9261428713798523, + "learning_rate": 0.00011105449109933414, + "loss": 3.5438, + "step": 104705 + }, + { + "epoch": 7.114417719798886, + "grad_norm": 0.9988160133361816, + "learning_rate": 0.00011101202609050142, + "loss": 3.3468, + "step": 104710 + }, + { + "epoch": 7.114757439869548, + "grad_norm": 0.8005990386009216, + "learning_rate": 0.00011096956108166872, + "loss": 3.5456, + "step": 104715 + }, + { + "epoch": 7.115097159940209, + "grad_norm": 0.7878611087799072, + "learning_rate": 0.000110927096072836, + "loss": 3.3026, + "step": 104720 + }, + { + "epoch": 7.115436880010871, + "grad_norm": 1.0218263864517212, + "learning_rate": 0.00011088463106400326, + "loss": 3.0243, + "step": 104725 + }, + { + "epoch": 7.115776600081533, + "grad_norm": 0.8984653353691101, + "learning_rate": 0.00011084216605517054, + "loss": 3.4234, + "step": 104730 + }, + { + "epoch": 7.116116320152194, + "grad_norm": 0.938751757144928, + "learning_rate": 0.00011079970104633782, + "loss": 3.3107, + "step": 104735 + }, + { + "epoch": 7.1164560402228565, + "grad_norm": 4.718700885772705, + "learning_rate": 0.00011075723603750509, + "loss": 3.3857, + "step": 104740 + }, + { + "epoch": 7.116795760293519, + "grad_norm": 1.7811745405197144, + "learning_rate": 0.00011071477102867237, + "loss": 3.3491, + "step": 104745 + }, + { + "epoch": 7.11713548036418, + "grad_norm": 1.5978891849517822, + "learning_rate": 0.00011067230601983966, + "loss": 3.2684, + "step": 104750 + }, + { + "epoch": 7.117475200434842, + "grad_norm": 0.8635345697402954, + "learning_rate": 0.00011062984101100693, + "loss": 3.4326, + "step": 104755 + }, + { + "epoch": 7.117814920505504, + "grad_norm": 0.9781230688095093, + "learning_rate": 0.00011058737600217421, + "loss": 3.2188, + "step": 104760 + }, + { + "epoch": 7.118154640576165, + "grad_norm": 0.9331554770469666, + "learning_rate": 0.00011054491099334149, + "loss": 3.3751, + "step": 104765 + }, + { + "epoch": 7.118494360646827, + "grad_norm": 0.8037042021751404, + "learning_rate": 0.00011050244598450877, + "loss": 3.5703, + "step": 104770 + }, + { + "epoch": 7.118834080717488, + "grad_norm": 0.7340946197509766, + "learning_rate": 0.00011045998097567604, + "loss": 3.5717, + "step": 104775 + }, + { + "epoch": 7.11917380078815, + "grad_norm": 0.8703202605247498, + "learning_rate": 0.00011041751596684332, + "loss": 3.4655, + "step": 104780 + }, + { + "epoch": 7.1195135208588125, + "grad_norm": 0.8534095883369446, + "learning_rate": 0.00011037505095801061, + "loss": 3.4804, + "step": 104785 + }, + { + "epoch": 7.119853240929474, + "grad_norm": 0.815967857837677, + "learning_rate": 0.00011033258594917788, + "loss": 3.3053, + "step": 104790 + }, + { + "epoch": 7.120192961000136, + "grad_norm": 0.7952294945716858, + "learning_rate": 0.00011029012094034516, + "loss": 3.3386, + "step": 104795 + }, + { + "epoch": 7.120532681070798, + "grad_norm": 1.2886563539505005, + "learning_rate": 0.00011024765593151244, + "loss": 3.4754, + "step": 104800 + }, + { + "epoch": 7.120872401141459, + "grad_norm": 0.8265258073806763, + "learning_rate": 0.00011020519092267972, + "loss": 3.6369, + "step": 104805 + }, + { + "epoch": 7.121212121212121, + "grad_norm": 1.2805049419403076, + "learning_rate": 0.00011016272591384698, + "loss": 3.1628, + "step": 104810 + }, + { + "epoch": 7.121551841282783, + "grad_norm": 0.910995602607727, + "learning_rate": 0.00011012026090501428, + "loss": 3.5945, + "step": 104815 + }, + { + "epoch": 7.121891561353444, + "grad_norm": 1.2289241552352905, + "learning_rate": 0.00011007779589618156, + "loss": 3.2441, + "step": 104820 + }, + { + "epoch": 7.122231281424106, + "grad_norm": 0.9354240894317627, + "learning_rate": 0.00011003533088734882, + "loss": 3.2476, + "step": 104825 + }, + { + "epoch": 7.1225710014947685, + "grad_norm": 1.0676519870758057, + "learning_rate": 0.0001099928658785161, + "loss": 3.3309, + "step": 104830 + }, + { + "epoch": 7.12291072156543, + "grad_norm": 0.8422080874443054, + "learning_rate": 0.00010995040086968338, + "loss": 3.7286, + "step": 104835 + }, + { + "epoch": 7.123250441636092, + "grad_norm": 0.8465490937232971, + "learning_rate": 0.00010990793586085065, + "loss": 3.2746, + "step": 104840 + }, + { + "epoch": 7.123590161706754, + "grad_norm": 0.8787097334861755, + "learning_rate": 0.00010986547085201793, + "loss": 3.2838, + "step": 104845 + }, + { + "epoch": 7.123929881777415, + "grad_norm": 0.8060733079910278, + "learning_rate": 0.00010982300584318522, + "loss": 3.5394, + "step": 104850 + }, + { + "epoch": 7.124269601848077, + "grad_norm": 0.8431591987609863, + "learning_rate": 0.0001097805408343525, + "loss": 3.2905, + "step": 104855 + }, + { + "epoch": 7.124609321918739, + "grad_norm": 0.976470410823822, + "learning_rate": 0.00010973807582551977, + "loss": 3.2487, + "step": 104860 + }, + { + "epoch": 7.1249490419894, + "grad_norm": 0.8880183100700378, + "learning_rate": 0.00010969561081668705, + "loss": 3.4056, + "step": 104865 + }, + { + "epoch": 7.1252887620600625, + "grad_norm": 0.8917927145957947, + "learning_rate": 0.00010965314580785433, + "loss": 3.223, + "step": 104870 + }, + { + "epoch": 7.1256284821307245, + "grad_norm": 0.9994983077049255, + "learning_rate": 0.0001096106807990216, + "loss": 3.1872, + "step": 104875 + }, + { + "epoch": 7.125968202201386, + "grad_norm": 0.8650099635124207, + "learning_rate": 0.00010956821579018889, + "loss": 3.3701, + "step": 104880 + }, + { + "epoch": 7.126307922272048, + "grad_norm": 0.9256086945533752, + "learning_rate": 0.00010953424378312272, + "loss": 3.5283, + "step": 104885 + }, + { + "epoch": 7.12664764234271, + "grad_norm": 0.8395077586174011, + "learning_rate": 0.00010949177877429, + "loss": 3.6141, + "step": 104890 + }, + { + "epoch": 7.126987362413371, + "grad_norm": 0.755560040473938, + "learning_rate": 0.00010944931376545726, + "loss": 3.0743, + "step": 104895 + }, + { + "epoch": 7.127327082484033, + "grad_norm": 0.8628045916557312, + "learning_rate": 0.00010940684875662454, + "loss": 3.5376, + "step": 104900 + }, + { + "epoch": 7.127666802554695, + "grad_norm": 0.8038767576217651, + "learning_rate": 0.00010936438374779182, + "loss": 3.6044, + "step": 104905 + }, + { + "epoch": 7.128006522625356, + "grad_norm": 0.7387892007827759, + "learning_rate": 0.00010932191873895909, + "loss": 3.3344, + "step": 104910 + }, + { + "epoch": 7.1283462426960185, + "grad_norm": 1.108657717704773, + "learning_rate": 0.00010927945373012638, + "loss": 3.4901, + "step": 104915 + }, + { + "epoch": 7.1286859627666805, + "grad_norm": 0.9704428911209106, + "learning_rate": 0.00010923698872129366, + "loss": 3.597, + "step": 104920 + }, + { + "epoch": 7.129025682837342, + "grad_norm": 0.860019326210022, + "learning_rate": 0.00010919452371246093, + "loss": 3.5384, + "step": 104925 + }, + { + "epoch": 7.129365402908004, + "grad_norm": 0.8933203816413879, + "learning_rate": 0.00010915205870362821, + "loss": 3.1262, + "step": 104930 + }, + { + "epoch": 7.129705122978666, + "grad_norm": 0.9804382920265198, + "learning_rate": 0.00010910959369479549, + "loss": 3.393, + "step": 104935 + }, + { + "epoch": 7.130044843049327, + "grad_norm": 0.8204399943351746, + "learning_rate": 0.00010906712868596277, + "loss": 3.4209, + "step": 104940 + }, + { + "epoch": 7.130384563119989, + "grad_norm": 0.9388198852539062, + "learning_rate": 0.00010902466367713005, + "loss": 3.5174, + "step": 104945 + }, + { + "epoch": 7.130724283190651, + "grad_norm": 0.9600579738616943, + "learning_rate": 0.00010898219866829733, + "loss": 3.3345, + "step": 104950 + }, + { + "epoch": 7.131064003261312, + "grad_norm": 0.8652578592300415, + "learning_rate": 0.00010893973365946461, + "loss": 3.5306, + "step": 104955 + }, + { + "epoch": 7.1314037233319745, + "grad_norm": 0.9647805094718933, + "learning_rate": 0.00010889726865063188, + "loss": 3.2405, + "step": 104960 + }, + { + "epoch": 7.1317434434026366, + "grad_norm": 0.9356905817985535, + "learning_rate": 0.00010885480364179916, + "loss": 3.3313, + "step": 104965 + }, + { + "epoch": 7.132083163473298, + "grad_norm": 0.814176082611084, + "learning_rate": 0.00010881233863296644, + "loss": 3.4351, + "step": 104970 + }, + { + "epoch": 7.13242288354396, + "grad_norm": 1.0121484994888306, + "learning_rate": 0.00010876987362413372, + "loss": 3.4172, + "step": 104975 + }, + { + "epoch": 7.132762603614622, + "grad_norm": 0.7524013519287109, + "learning_rate": 0.000108727408615301, + "loss": 3.3813, + "step": 104980 + }, + { + "epoch": 7.133102323685283, + "grad_norm": 2.2925045490264893, + "learning_rate": 0.00010868494360646828, + "loss": 3.351, + "step": 104985 + }, + { + "epoch": 7.133442043755945, + "grad_norm": 0.9110906720161438, + "learning_rate": 0.00010864247859763556, + "loss": 3.407, + "step": 104990 + }, + { + "epoch": 7.133781763826607, + "grad_norm": 2.6466150283813477, + "learning_rate": 0.00010860001358880282, + "loss": 3.2798, + "step": 104995 + }, + { + "epoch": 7.134121483897268, + "grad_norm": 1.2646700143814087, + "learning_rate": 0.0001085575485799701, + "loss": 3.3696, + "step": 105000 + }, + { + "epoch": 7.1344612039679305, + "grad_norm": 0.8108121156692505, + "learning_rate": 0.00010851508357113738, + "loss": 3.3155, + "step": 105005 + }, + { + "epoch": 7.134800924038593, + "grad_norm": 1.4224773645401, + "learning_rate": 0.00010847261856230465, + "loss": 3.4374, + "step": 105010 + }, + { + "epoch": 7.135140644109254, + "grad_norm": 0.6651644110679626, + "learning_rate": 0.00010843015355347194, + "loss": 3.3554, + "step": 105015 + }, + { + "epoch": 7.135480364179916, + "grad_norm": 0.9939013719558716, + "learning_rate": 0.00010838768854463922, + "loss": 3.2211, + "step": 105020 + }, + { + "epoch": 7.135820084250578, + "grad_norm": 1.0087828636169434, + "learning_rate": 0.0001083452235358065, + "loss": 3.345, + "step": 105025 + }, + { + "epoch": 7.136159804321239, + "grad_norm": 1.3392380475997925, + "learning_rate": 0.00010830275852697377, + "loss": 3.5823, + "step": 105030 + }, + { + "epoch": 7.136499524391901, + "grad_norm": 0.804026186466217, + "learning_rate": 0.00010826029351814105, + "loss": 3.2856, + "step": 105035 + }, + { + "epoch": 7.136839244462563, + "grad_norm": 1.2105201482772827, + "learning_rate": 0.00010821782850930833, + "loss": 3.3893, + "step": 105040 + }, + { + "epoch": 7.137178964533224, + "grad_norm": 0.7534107565879822, + "learning_rate": 0.00010817536350047561, + "loss": 3.3887, + "step": 105045 + }, + { + "epoch": 7.1375186846038865, + "grad_norm": 0.9769415259361267, + "learning_rate": 0.00010813289849164289, + "loss": 3.622, + "step": 105050 + }, + { + "epoch": 7.137858404674549, + "grad_norm": 1.1568392515182495, + "learning_rate": 0.00010809043348281017, + "loss": 3.4141, + "step": 105055 + }, + { + "epoch": 7.13819812474521, + "grad_norm": 4.317326545715332, + "learning_rate": 0.00010804796847397745, + "loss": 3.4681, + "step": 105060 + }, + { + "epoch": 7.138537844815872, + "grad_norm": 1.0183095932006836, + "learning_rate": 0.00010800550346514472, + "loss": 3.2759, + "step": 105065 + }, + { + "epoch": 7.138877564886534, + "grad_norm": 1.3384324312210083, + "learning_rate": 0.000107963038456312, + "loss": 3.3195, + "step": 105070 + }, + { + "epoch": 7.139217284957195, + "grad_norm": 1.0516295433044434, + "learning_rate": 0.00010792057344747928, + "loss": 3.568, + "step": 105075 + }, + { + "epoch": 7.139557005027857, + "grad_norm": 0.9116994738578796, + "learning_rate": 0.00010787810843864656, + "loss": 3.1154, + "step": 105080 + }, + { + "epoch": 7.139896725098519, + "grad_norm": 1.4439798593521118, + "learning_rate": 0.00010783564342981384, + "loss": 3.069, + "step": 105085 + }, + { + "epoch": 7.14023644516918, + "grad_norm": 1.1724140644073486, + "learning_rate": 0.00010779317842098112, + "loss": 3.413, + "step": 105090 + }, + { + "epoch": 7.1405761652398425, + "grad_norm": 0.9942414164543152, + "learning_rate": 0.00010775071341214838, + "loss": 3.1245, + "step": 105095 + }, + { + "epoch": 7.140915885310504, + "grad_norm": 0.8253317475318909, + "learning_rate": 0.00010770824840331566, + "loss": 3.8213, + "step": 105100 + }, + { + "epoch": 7.141255605381166, + "grad_norm": 1.1108438968658447, + "learning_rate": 0.00010766578339448294, + "loss": 3.3602, + "step": 105105 + }, + { + "epoch": 7.141595325451828, + "grad_norm": 0.9836533069610596, + "learning_rate": 0.00010762331838565024, + "loss": 3.3992, + "step": 105110 + }, + { + "epoch": 7.141935045522489, + "grad_norm": 1.0808693170547485, + "learning_rate": 0.0001075808533768175, + "loss": 3.2452, + "step": 105115 + }, + { + "epoch": 7.142274765593151, + "grad_norm": 0.9066566228866577, + "learning_rate": 0.00010753838836798478, + "loss": 3.3281, + "step": 105120 + }, + { + "epoch": 7.142614485663813, + "grad_norm": 0.7254949808120728, + "learning_rate": 0.00010749592335915206, + "loss": 3.3559, + "step": 105125 + }, + { + "epoch": 7.142954205734474, + "grad_norm": 0.8576305508613586, + "learning_rate": 0.00010745345835031933, + "loss": 3.3469, + "step": 105130 + }, + { + "epoch": 7.1432939258051364, + "grad_norm": 1.1533105373382568, + "learning_rate": 0.00010741099334148661, + "loss": 3.2709, + "step": 105135 + }, + { + "epoch": 7.1436336458757985, + "grad_norm": 1.2859631776809692, + "learning_rate": 0.00010736852833265389, + "loss": 3.411, + "step": 105140 + }, + { + "epoch": 7.14397336594646, + "grad_norm": 1.0376222133636475, + "learning_rate": 0.00010732606332382118, + "loss": 3.0643, + "step": 105145 + }, + { + "epoch": 7.144313086017122, + "grad_norm": 0.8646969199180603, + "learning_rate": 0.00010728359831498845, + "loss": 3.2295, + "step": 105150 + }, + { + "epoch": 7.144652806087784, + "grad_norm": 0.8820927143096924, + "learning_rate": 0.00010724113330615573, + "loss": 3.588, + "step": 105155 + }, + { + "epoch": 7.144992526158445, + "grad_norm": 0.8223441243171692, + "learning_rate": 0.00010719866829732301, + "loss": 3.4208, + "step": 105160 + }, + { + "epoch": 7.145332246229107, + "grad_norm": 0.820364773273468, + "learning_rate": 0.00010715620328849028, + "loss": 3.4133, + "step": 105165 + }, + { + "epoch": 7.145671966299769, + "grad_norm": 0.7258740663528442, + "learning_rate": 0.00010711373827965756, + "loss": 3.5802, + "step": 105170 + }, + { + "epoch": 7.14601168637043, + "grad_norm": 0.9230836629867554, + "learning_rate": 0.00010707127327082485, + "loss": 3.361, + "step": 105175 + }, + { + "epoch": 7.1463514064410925, + "grad_norm": 1.1392924785614014, + "learning_rate": 0.00010702880826199212, + "loss": 3.3171, + "step": 105180 + }, + { + "epoch": 7.1466911265117545, + "grad_norm": 1.0021556615829468, + "learning_rate": 0.0001069863432531594, + "loss": 3.3812, + "step": 105185 + }, + { + "epoch": 7.147030846582416, + "grad_norm": 0.8043031096458435, + "learning_rate": 0.00010694387824432668, + "loss": 3.4951, + "step": 105190 + }, + { + "epoch": 7.147370566653078, + "grad_norm": 1.2412991523742676, + "learning_rate": 0.00010690141323549396, + "loss": 3.6022, + "step": 105195 + }, + { + "epoch": 7.14771028672374, + "grad_norm": 1.0292308330535889, + "learning_rate": 0.00010685894822666122, + "loss": 3.3064, + "step": 105200 + }, + { + "epoch": 7.148050006794401, + "grad_norm": 0.7949464917182922, + "learning_rate": 0.0001068164832178285, + "loss": 3.2412, + "step": 105205 + }, + { + "epoch": 7.148389726865063, + "grad_norm": 0.9690940976142883, + "learning_rate": 0.0001067740182089958, + "loss": 3.3937, + "step": 105210 + }, + { + "epoch": 7.148729446935725, + "grad_norm": 0.8031835556030273, + "learning_rate": 0.00010673155320016307, + "loss": 3.1118, + "step": 105215 + }, + { + "epoch": 7.149069167006386, + "grad_norm": 0.8738428950309753, + "learning_rate": 0.00010668908819133035, + "loss": 3.1964, + "step": 105220 + }, + { + "epoch": 7.1494088870770485, + "grad_norm": 0.960422694683075, + "learning_rate": 0.00010664662318249763, + "loss": 3.5761, + "step": 105225 + }, + { + "epoch": 7.1497486071477105, + "grad_norm": 1.0057868957519531, + "learning_rate": 0.0001066041581736649, + "loss": 3.3453, + "step": 105230 + }, + { + "epoch": 7.150088327218372, + "grad_norm": 1.123608946800232, + "learning_rate": 0.00010656169316483217, + "loss": 3.5057, + "step": 105235 + }, + { + "epoch": 7.150428047289034, + "grad_norm": 0.8599390387535095, + "learning_rate": 0.00010651922815599947, + "loss": 3.2258, + "step": 105240 + }, + { + "epoch": 7.150767767359696, + "grad_norm": 0.9360896348953247, + "learning_rate": 0.00010647676314716675, + "loss": 3.5169, + "step": 105245 + }, + { + "epoch": 7.151107487430357, + "grad_norm": 0.720926821231842, + "learning_rate": 0.00010643429813833401, + "loss": 3.5282, + "step": 105250 + }, + { + "epoch": 7.151447207501019, + "grad_norm": 0.8682109117507935, + "learning_rate": 0.00010639183312950129, + "loss": 3.5712, + "step": 105255 + }, + { + "epoch": 7.151786927571681, + "grad_norm": 1.1363420486450195, + "learning_rate": 0.00010634936812066857, + "loss": 3.4119, + "step": 105260 + }, + { + "epoch": 7.152126647642342, + "grad_norm": 0.9654146432876587, + "learning_rate": 0.00010630690311183584, + "loss": 3.3126, + "step": 105265 + }, + { + "epoch": 7.1524663677130045, + "grad_norm": 0.7556343078613281, + "learning_rate": 0.00010626443810300312, + "loss": 3.5313, + "step": 105270 + }, + { + "epoch": 7.1528060877836666, + "grad_norm": 1.0621938705444336, + "learning_rate": 0.00010622197309417041, + "loss": 3.4682, + "step": 105275 + }, + { + "epoch": 7.153145807854328, + "grad_norm": 1.0070481300354004, + "learning_rate": 0.00010617950808533769, + "loss": 3.2999, + "step": 105280 + }, + { + "epoch": 7.15348552792499, + "grad_norm": 0.8678222894668579, + "learning_rate": 0.00010613704307650496, + "loss": 3.4528, + "step": 105285 + }, + { + "epoch": 7.153825247995652, + "grad_norm": 0.9561582803726196, + "learning_rate": 0.00010609457806767224, + "loss": 3.5586, + "step": 105290 + }, + { + "epoch": 7.154164968066313, + "grad_norm": 1.0777051448822021, + "learning_rate": 0.00010605211305883952, + "loss": 3.4695, + "step": 105295 + }, + { + "epoch": 7.154504688136975, + "grad_norm": 0.7587683796882629, + "learning_rate": 0.00010600964805000679, + "loss": 3.5542, + "step": 105300 + }, + { + "epoch": 7.154844408207637, + "grad_norm": 0.8261883854866028, + "learning_rate": 0.00010596718304117408, + "loss": 3.3795, + "step": 105305 + }, + { + "epoch": 7.155184128278298, + "grad_norm": 0.9929502010345459, + "learning_rate": 0.00010592471803234136, + "loss": 3.6222, + "step": 105310 + }, + { + "epoch": 7.1555238483489605, + "grad_norm": 1.1584417819976807, + "learning_rate": 0.00010588225302350864, + "loss": 3.2738, + "step": 105315 + }, + { + "epoch": 7.155863568419623, + "grad_norm": 0.9308335185050964, + "learning_rate": 0.0001058397880146759, + "loss": 3.486, + "step": 105320 + }, + { + "epoch": 7.156203288490284, + "grad_norm": 0.8380085825920105, + "learning_rate": 0.00010579732300584319, + "loss": 3.3795, + "step": 105325 + }, + { + "epoch": 7.156543008560946, + "grad_norm": 1.024419903755188, + "learning_rate": 0.00010575485799701047, + "loss": 3.3874, + "step": 105330 + }, + { + "epoch": 7.156882728631608, + "grad_norm": 0.8761345148086548, + "learning_rate": 0.00010571239298817773, + "loss": 3.3725, + "step": 105335 + }, + { + "epoch": 7.157222448702269, + "grad_norm": 1.0028109550476074, + "learning_rate": 0.00010566992797934503, + "loss": 3.2645, + "step": 105340 + }, + { + "epoch": 7.157562168772931, + "grad_norm": 0.8255127668380737, + "learning_rate": 0.0001056274629705123, + "loss": 3.2826, + "step": 105345 + }, + { + "epoch": 7.157901888843593, + "grad_norm": 0.8247517943382263, + "learning_rate": 0.00010558499796167957, + "loss": 3.5102, + "step": 105350 + }, + { + "epoch": 7.158241608914254, + "grad_norm": 1.5735113620758057, + "learning_rate": 0.00010554253295284685, + "loss": 3.4068, + "step": 105355 + }, + { + "epoch": 7.1585813289849165, + "grad_norm": 0.9724668860435486, + "learning_rate": 0.00010550006794401413, + "loss": 3.3139, + "step": 105360 + }, + { + "epoch": 7.158921049055579, + "grad_norm": 0.8628373742103577, + "learning_rate": 0.00010545760293518141, + "loss": 3.4211, + "step": 105365 + }, + { + "epoch": 7.15926076912624, + "grad_norm": 0.8813592791557312, + "learning_rate": 0.00010541513792634868, + "loss": 3.282, + "step": 105370 + }, + { + "epoch": 7.159600489196902, + "grad_norm": 0.9656916260719299, + "learning_rate": 0.00010537267291751597, + "loss": 3.5366, + "step": 105375 + }, + { + "epoch": 7.159940209267564, + "grad_norm": 1.7500241994857788, + "learning_rate": 0.00010533020790868325, + "loss": 3.6289, + "step": 105380 + }, + { + "epoch": 7.160279929338225, + "grad_norm": 1.019623875617981, + "learning_rate": 0.00010528774289985052, + "loss": 3.5896, + "step": 105385 + }, + { + "epoch": 7.160619649408887, + "grad_norm": 0.772437572479248, + "learning_rate": 0.0001052452778910178, + "loss": 3.4947, + "step": 105390 + }, + { + "epoch": 7.160959369479549, + "grad_norm": 0.6835080981254578, + "learning_rate": 0.00010520281288218508, + "loss": 3.2871, + "step": 105395 + }, + { + "epoch": 7.16129908955021, + "grad_norm": 0.9106978178024292, + "learning_rate": 0.00010516034787335236, + "loss": 3.3891, + "step": 105400 + }, + { + "epoch": 7.1616388096208725, + "grad_norm": 0.8045798540115356, + "learning_rate": 0.00010511788286451964, + "loss": 3.1907, + "step": 105405 + }, + { + "epoch": 7.161978529691535, + "grad_norm": 1.0239566564559937, + "learning_rate": 0.00010507541785568692, + "loss": 3.4245, + "step": 105410 + }, + { + "epoch": 7.162318249762196, + "grad_norm": 0.9733526110649109, + "learning_rate": 0.0001050329528468542, + "loss": 3.2813, + "step": 105415 + }, + { + "epoch": 7.162657969832858, + "grad_norm": 0.896955668926239, + "learning_rate": 0.00010499048783802147, + "loss": 3.3217, + "step": 105420 + }, + { + "epoch": 7.16299768990352, + "grad_norm": 1.258553385734558, + "learning_rate": 0.00010494802282918875, + "loss": 3.448, + "step": 105425 + }, + { + "epoch": 7.163337409974181, + "grad_norm": 0.8774502277374268, + "learning_rate": 0.00010490555782035603, + "loss": 3.4578, + "step": 105430 + }, + { + "epoch": 7.163677130044843, + "grad_norm": 0.8652628064155579, + "learning_rate": 0.0001048630928115233, + "loss": 3.2031, + "step": 105435 + }, + { + "epoch": 7.164016850115505, + "grad_norm": 0.938737154006958, + "learning_rate": 0.00010482062780269059, + "loss": 3.5154, + "step": 105440 + }, + { + "epoch": 7.1643565701861665, + "grad_norm": 0.843633770942688, + "learning_rate": 0.00010477816279385787, + "loss": 3.3979, + "step": 105445 + }, + { + "epoch": 7.1646962902568285, + "grad_norm": 1.0410627126693726, + "learning_rate": 0.00010473569778502515, + "loss": 3.4508, + "step": 105450 + }, + { + "epoch": 7.165036010327491, + "grad_norm": 1.033007025718689, + "learning_rate": 0.00010469323277619241, + "loss": 3.2745, + "step": 105455 + }, + { + "epoch": 7.165375730398152, + "grad_norm": 0.8961345553398132, + "learning_rate": 0.0001046507677673597, + "loss": 3.3697, + "step": 105460 + }, + { + "epoch": 7.165715450468814, + "grad_norm": 1.0767568349838257, + "learning_rate": 0.00010460830275852697, + "loss": 2.9353, + "step": 105465 + }, + { + "epoch": 7.166055170539475, + "grad_norm": 1.1282695531845093, + "learning_rate": 0.00010456583774969425, + "loss": 3.3141, + "step": 105470 + }, + { + "epoch": 7.166394890610137, + "grad_norm": 0.962043046951294, + "learning_rate": 0.00010452337274086153, + "loss": 3.4509, + "step": 105475 + }, + { + "epoch": 7.166734610680799, + "grad_norm": 0.9006996750831604, + "learning_rate": 0.00010448090773202881, + "loss": 3.2334, + "step": 105480 + }, + { + "epoch": 7.16707433075146, + "grad_norm": 1.0780112743377686, + "learning_rate": 0.0001044384427231961, + "loss": 3.1415, + "step": 105485 + }, + { + "epoch": 7.1674140508221225, + "grad_norm": 0.8114528656005859, + "learning_rate": 0.00010439597771436336, + "loss": 3.2711, + "step": 105490 + }, + { + "epoch": 7.1677537708927845, + "grad_norm": 0.9072496891021729, + "learning_rate": 0.00010435351270553064, + "loss": 3.4811, + "step": 105495 + }, + { + "epoch": 7.168093490963446, + "grad_norm": 0.9659896492958069, + "learning_rate": 0.00010431104769669792, + "loss": 3.4984, + "step": 105500 + }, + { + "epoch": 7.168433211034108, + "grad_norm": 0.8328889608383179, + "learning_rate": 0.0001042685826878652, + "loss": 3.4969, + "step": 105505 + }, + { + "epoch": 7.16877293110477, + "grad_norm": 1.1639823913574219, + "learning_rate": 0.00010422611767903248, + "loss": 3.1068, + "step": 105510 + }, + { + "epoch": 7.169112651175431, + "grad_norm": 1.225456714630127, + "learning_rate": 0.00010418365267019976, + "loss": 3.3664, + "step": 105515 + }, + { + "epoch": 7.169452371246093, + "grad_norm": 0.9109500646591187, + "learning_rate": 0.00010414118766136703, + "loss": 3.3674, + "step": 105520 + }, + { + "epoch": 7.169792091316755, + "grad_norm": 0.8904948234558105, + "learning_rate": 0.00010409872265253431, + "loss": 3.5797, + "step": 105525 + }, + { + "epoch": 7.170131811387416, + "grad_norm": 0.9696316719055176, + "learning_rate": 0.00010405625764370159, + "loss": 3.2142, + "step": 105530 + }, + { + "epoch": 7.1704715314580785, + "grad_norm": 0.7601168155670166, + "learning_rate": 0.00010401379263486888, + "loss": 3.4039, + "step": 105535 + }, + { + "epoch": 7.1708112515287405, + "grad_norm": 1.3341087102890015, + "learning_rate": 0.00010397132762603615, + "loss": 3.4486, + "step": 105540 + }, + { + "epoch": 7.171150971599402, + "grad_norm": 1.2195806503295898, + "learning_rate": 0.00010392886261720343, + "loss": 3.6005, + "step": 105545 + }, + { + "epoch": 7.171490691670064, + "grad_norm": 0.8797705769538879, + "learning_rate": 0.00010388639760837071, + "loss": 3.4904, + "step": 105550 + }, + { + "epoch": 7.171830411740726, + "grad_norm": 0.7608579993247986, + "learning_rate": 0.00010384393259953797, + "loss": 3.1839, + "step": 105555 + }, + { + "epoch": 7.172170131811387, + "grad_norm": 1.3380630016326904, + "learning_rate": 0.00010380146759070525, + "loss": 3.4708, + "step": 105560 + }, + { + "epoch": 7.172509851882049, + "grad_norm": 1.0529872179031372, + "learning_rate": 0.00010375900258187253, + "loss": 3.4871, + "step": 105565 + }, + { + "epoch": 7.172849571952711, + "grad_norm": 0.8039199113845825, + "learning_rate": 0.00010371653757303983, + "loss": 3.5036, + "step": 105570 + }, + { + "epoch": 7.173189292023372, + "grad_norm": 0.9003735184669495, + "learning_rate": 0.0001036740725642071, + "loss": 3.2196, + "step": 105575 + }, + { + "epoch": 7.1735290120940345, + "grad_norm": 0.9107008576393127, + "learning_rate": 0.00010363160755537438, + "loss": 3.2986, + "step": 105580 + }, + { + "epoch": 7.173868732164697, + "grad_norm": 1.2710726261138916, + "learning_rate": 0.00010358914254654166, + "loss": 3.4912, + "step": 105585 + }, + { + "epoch": 7.174208452235358, + "grad_norm": 0.7939351797103882, + "learning_rate": 0.00010354667753770892, + "loss": 3.3627, + "step": 105590 + }, + { + "epoch": 7.17454817230602, + "grad_norm": 0.894990086555481, + "learning_rate": 0.0001035042125288762, + "loss": 3.5454, + "step": 105595 + }, + { + "epoch": 7.174887892376682, + "grad_norm": 0.9599953293800354, + "learning_rate": 0.0001034617475200435, + "loss": 3.468, + "step": 105600 + }, + { + "epoch": 7.175227612447343, + "grad_norm": 1.0498688220977783, + "learning_rate": 0.00010341928251121076, + "loss": 3.7425, + "step": 105605 + }, + { + "epoch": 7.175567332518005, + "grad_norm": 0.9856375455856323, + "learning_rate": 0.00010337681750237804, + "loss": 3.5106, + "step": 105610 + }, + { + "epoch": 7.175907052588667, + "grad_norm": 1.228522539138794, + "learning_rate": 0.00010333435249354532, + "loss": 3.4404, + "step": 105615 + }, + { + "epoch": 7.176246772659328, + "grad_norm": 0.7776727676391602, + "learning_rate": 0.0001032918874847126, + "loss": 3.1914, + "step": 105620 + }, + { + "epoch": 7.1765864927299905, + "grad_norm": 0.7926812767982483, + "learning_rate": 0.00010324942247587987, + "loss": 3.099, + "step": 105625 + }, + { + "epoch": 7.176926212800653, + "grad_norm": 0.8764440417289734, + "learning_rate": 0.00010320695746704715, + "loss": 3.4635, + "step": 105630 + }, + { + "epoch": 7.177265932871314, + "grad_norm": 0.9349136352539062, + "learning_rate": 0.00010316449245821444, + "loss": 3.4385, + "step": 105635 + }, + { + "epoch": 7.177605652941976, + "grad_norm": 1.2566790580749512, + "learning_rate": 0.00010312202744938171, + "loss": 3.3289, + "step": 105640 + }, + { + "epoch": 7.177945373012638, + "grad_norm": 1.450919508934021, + "learning_rate": 0.00010307956244054899, + "loss": 3.3523, + "step": 105645 + }, + { + "epoch": 7.178285093083299, + "grad_norm": 0.8772846460342407, + "learning_rate": 0.00010303709743171627, + "loss": 3.8517, + "step": 105650 + }, + { + "epoch": 7.178624813153961, + "grad_norm": 0.8679059147834778, + "learning_rate": 0.00010299463242288355, + "loss": 3.4083, + "step": 105655 + }, + { + "epoch": 7.178964533224623, + "grad_norm": 0.7658941745758057, + "learning_rate": 0.00010295216741405082, + "loss": 3.3671, + "step": 105660 + }, + { + "epoch": 7.179304253295284, + "grad_norm": 1.2649730443954468, + "learning_rate": 0.00010290970240521811, + "loss": 3.4664, + "step": 105665 + }, + { + "epoch": 7.1796439733659465, + "grad_norm": 0.8454555869102478, + "learning_rate": 0.00010286723739638539, + "loss": 3.2442, + "step": 105670 + }, + { + "epoch": 7.179983693436609, + "grad_norm": 0.8890806436538696, + "learning_rate": 0.00010282477238755266, + "loss": 3.4798, + "step": 105675 + }, + { + "epoch": 7.18032341350727, + "grad_norm": 0.657062292098999, + "learning_rate": 0.00010278230737871994, + "loss": 3.4193, + "step": 105680 + }, + { + "epoch": 7.180663133577932, + "grad_norm": 0.818314254283905, + "learning_rate": 0.00010273984236988722, + "loss": 3.5121, + "step": 105685 + }, + { + "epoch": 7.181002853648594, + "grad_norm": 1.0364009141921997, + "learning_rate": 0.00010269737736105448, + "loss": 3.7785, + "step": 105690 + }, + { + "epoch": 7.181342573719255, + "grad_norm": 0.9383648037910461, + "learning_rate": 0.00010265491235222176, + "loss": 3.295, + "step": 105695 + }, + { + "epoch": 7.181682293789917, + "grad_norm": 1.5904285907745361, + "learning_rate": 0.00010261244734338906, + "loss": 3.5186, + "step": 105700 + }, + { + "epoch": 7.182022013860579, + "grad_norm": 1.3556724786758423, + "learning_rate": 0.00010256998233455634, + "loss": 3.1599, + "step": 105705 + }, + { + "epoch": 7.18236173393124, + "grad_norm": 1.0006781816482544, + "learning_rate": 0.0001025275173257236, + "loss": 3.3554, + "step": 105710 + }, + { + "epoch": 7.1827014540019025, + "grad_norm": 1.1575984954833984, + "learning_rate": 0.00010248505231689088, + "loss": 3.4319, + "step": 105715 + }, + { + "epoch": 7.183041174072565, + "grad_norm": 1.025034785270691, + "learning_rate": 0.00010244258730805816, + "loss": 3.3948, + "step": 105720 + }, + { + "epoch": 7.183380894143226, + "grad_norm": 0.990044891834259, + "learning_rate": 0.00010240012229922543, + "loss": 3.2141, + "step": 105725 + }, + { + "epoch": 7.183720614213888, + "grad_norm": 1.0106562376022339, + "learning_rate": 0.00010235765729039272, + "loss": 3.3413, + "step": 105730 + }, + { + "epoch": 7.18406033428455, + "grad_norm": 0.9884135127067566, + "learning_rate": 0.00010231519228156, + "loss": 3.1874, + "step": 105735 + }, + { + "epoch": 7.184400054355211, + "grad_norm": 0.8787037134170532, + "learning_rate": 0.00010227272727272728, + "loss": 3.4062, + "step": 105740 + }, + { + "epoch": 7.184739774425873, + "grad_norm": 1.2505663633346558, + "learning_rate": 0.00010223026226389455, + "loss": 3.098, + "step": 105745 + }, + { + "epoch": 7.185079494496535, + "grad_norm": 1.0332282781600952, + "learning_rate": 0.00010218779725506183, + "loss": 3.2158, + "step": 105750 + }, + { + "epoch": 7.1854192145671965, + "grad_norm": 0.8181195259094238, + "learning_rate": 0.00010214533224622911, + "loss": 3.5059, + "step": 105755 + }, + { + "epoch": 7.1857589346378585, + "grad_norm": 0.8261857032775879, + "learning_rate": 0.00010210286723739638, + "loss": 3.404, + "step": 105760 + }, + { + "epoch": 7.186098654708521, + "grad_norm": 0.9887767434120178, + "learning_rate": 0.00010206040222856367, + "loss": 3.3395, + "step": 105765 + }, + { + "epoch": 7.186438374779182, + "grad_norm": 0.9890621900558472, + "learning_rate": 0.00010201793721973095, + "loss": 3.203, + "step": 105770 + }, + { + "epoch": 7.186778094849844, + "grad_norm": 0.8823869824409485, + "learning_rate": 0.00010197547221089822, + "loss": 3.267, + "step": 105775 + }, + { + "epoch": 7.187117814920505, + "grad_norm": 1.0168191194534302, + "learning_rate": 0.0001019330072020655, + "loss": 3.4096, + "step": 105780 + }, + { + "epoch": 7.187457534991167, + "grad_norm": 1.112205982208252, + "learning_rate": 0.00010189054219323278, + "loss": 3.4098, + "step": 105785 + }, + { + "epoch": 7.187797255061829, + "grad_norm": 0.7387718558311462, + "learning_rate": 0.00010184807718440006, + "loss": 3.4519, + "step": 105790 + }, + { + "epoch": 7.18813697513249, + "grad_norm": 1.3145238161087036, + "learning_rate": 0.00010180561217556732, + "loss": 3.3771, + "step": 105795 + }, + { + "epoch": 7.1884766952031525, + "grad_norm": 0.9423953294754028, + "learning_rate": 0.00010176314716673462, + "loss": 3.173, + "step": 105800 + }, + { + "epoch": 7.1888164152738145, + "grad_norm": 1.1451388597488403, + "learning_rate": 0.0001017206821579019, + "loss": 3.4246, + "step": 105805 + }, + { + "epoch": 7.189156135344476, + "grad_norm": 0.7169680595397949, + "learning_rate": 0.00010167821714906916, + "loss": 3.1838, + "step": 105810 + }, + { + "epoch": 7.189495855415138, + "grad_norm": 0.8871379494667053, + "learning_rate": 0.00010163575214023644, + "loss": 3.6158, + "step": 105815 + }, + { + "epoch": 7.1898355754858, + "grad_norm": 0.8777378797531128, + "learning_rate": 0.00010159328713140372, + "loss": 3.4311, + "step": 105820 + }, + { + "epoch": 7.190175295556461, + "grad_norm": 0.8748310208320618, + "learning_rate": 0.000101550822122571, + "loss": 3.5946, + "step": 105825 + }, + { + "epoch": 7.190515015627123, + "grad_norm": 0.9894733428955078, + "learning_rate": 0.00010150835711373828, + "loss": 3.5901, + "step": 105830 + }, + { + "epoch": 7.190854735697785, + "grad_norm": 1.2364580631256104, + "learning_rate": 0.00010146589210490556, + "loss": 3.2465, + "step": 105835 + }, + { + "epoch": 7.191194455768446, + "grad_norm": 1.0961805582046509, + "learning_rate": 0.00010142342709607284, + "loss": 3.2909, + "step": 105840 + }, + { + "epoch": 7.1915341758391085, + "grad_norm": 0.8453742861747742, + "learning_rate": 0.00010138096208724011, + "loss": 3.5552, + "step": 105845 + }, + { + "epoch": 7.1918738959097706, + "grad_norm": 0.8191595673561096, + "learning_rate": 0.00010133849707840739, + "loss": 3.459, + "step": 105850 + }, + { + "epoch": 7.192213615980432, + "grad_norm": 1.074611783027649, + "learning_rate": 0.00010129603206957467, + "loss": 3.4814, + "step": 105855 + }, + { + "epoch": 7.192553336051094, + "grad_norm": 0.9485097527503967, + "learning_rate": 0.00010125356706074194, + "loss": 3.1623, + "step": 105860 + }, + { + "epoch": 7.192893056121756, + "grad_norm": 1.026902437210083, + "learning_rate": 0.00010121110205190923, + "loss": 3.4341, + "step": 105865 + }, + { + "epoch": 7.193232776192417, + "grad_norm": 0.786159336566925, + "learning_rate": 0.00010116863704307651, + "loss": 3.1914, + "step": 105870 + }, + { + "epoch": 7.193572496263079, + "grad_norm": 0.9644454121589661, + "learning_rate": 0.00010112617203424379, + "loss": 3.2402, + "step": 105875 + }, + { + "epoch": 7.193912216333741, + "grad_norm": 0.9747393727302551, + "learning_rate": 0.00010108370702541106, + "loss": 3.2562, + "step": 105880 + }, + { + "epoch": 7.194251936404402, + "grad_norm": 1.622343897819519, + "learning_rate": 0.00010104124201657834, + "loss": 3.3164, + "step": 105885 + }, + { + "epoch": 7.1945916564750645, + "grad_norm": 1.0604039430618286, + "learning_rate": 0.00010099877700774562, + "loss": 3.4249, + "step": 105890 + }, + { + "epoch": 7.194931376545727, + "grad_norm": 0.933817982673645, + "learning_rate": 0.0001009563119989129, + "loss": 3.5608, + "step": 105895 + }, + { + "epoch": 7.195271096616388, + "grad_norm": 0.8159639239311218, + "learning_rate": 0.00010091384699008018, + "loss": 3.5169, + "step": 105900 + }, + { + "epoch": 7.19561081668705, + "grad_norm": 0.9693737030029297, + "learning_rate": 0.00010087138198124746, + "loss": 3.2533, + "step": 105905 + }, + { + "epoch": 7.195950536757712, + "grad_norm": 1.1001238822937012, + "learning_rate": 0.00010082891697241474, + "loss": 3.361, + "step": 105910 + }, + { + "epoch": 7.196290256828373, + "grad_norm": 0.8479357957839966, + "learning_rate": 0.000100786451963582, + "loss": 3.4364, + "step": 105915 + }, + { + "epoch": 7.196629976899035, + "grad_norm": 0.7353842258453369, + "learning_rate": 0.00010074398695474928, + "loss": 3.2965, + "step": 105920 + }, + { + "epoch": 7.196969696969697, + "grad_norm": 0.8670721650123596, + "learning_rate": 0.00010070152194591657, + "loss": 3.3837, + "step": 105925 + }, + { + "epoch": 7.197309417040358, + "grad_norm": 1.0912238359451294, + "learning_rate": 0.00010065905693708385, + "loss": 3.201, + "step": 105930 + }, + { + "epoch": 7.1976491371110205, + "grad_norm": 1.337511658668518, + "learning_rate": 0.00010061659192825113, + "loss": 3.3548, + "step": 105935 + }, + { + "epoch": 7.197988857181683, + "grad_norm": 0.8750711679458618, + "learning_rate": 0.0001005741269194184, + "loss": 3.6856, + "step": 105940 + }, + { + "epoch": 7.198328577252344, + "grad_norm": 0.9216896295547485, + "learning_rate": 0.00010053166191058567, + "loss": 3.3611, + "step": 105945 + }, + { + "epoch": 7.198668297323006, + "grad_norm": 0.8362811803817749, + "learning_rate": 0.00010048919690175295, + "loss": 3.7368, + "step": 105950 + }, + { + "epoch": 7.199008017393668, + "grad_norm": 1.0284980535507202, + "learning_rate": 0.00010044673189292023, + "loss": 3.2915, + "step": 105955 + }, + { + "epoch": 7.199347737464329, + "grad_norm": 0.9385945200920105, + "learning_rate": 0.00010040426688408753, + "loss": 3.4402, + "step": 105960 + }, + { + "epoch": 7.199687457534991, + "grad_norm": 1.194718360900879, + "learning_rate": 0.00010036180187525479, + "loss": 3.2461, + "step": 105965 + }, + { + "epoch": 7.200027177605653, + "grad_norm": 0.9096370339393616, + "learning_rate": 0.00010031933686642207, + "loss": 3.5104, + "step": 105970 + }, + { + "epoch": 7.200366897676314, + "grad_norm": 0.7551174163818359, + "learning_rate": 0.00010027687185758935, + "loss": 3.4193, + "step": 105975 + }, + { + "epoch": 7.2007066177469765, + "grad_norm": 1.1275147199630737, + "learning_rate": 0.00010023440684875662, + "loss": 3.4536, + "step": 105980 + }, + { + "epoch": 7.201046337817639, + "grad_norm": 0.8481507897377014, + "learning_rate": 0.0001001919418399239, + "loss": 3.2388, + "step": 105985 + }, + { + "epoch": 7.2013860578883, + "grad_norm": 1.004298448562622, + "learning_rate": 0.00010014947683109118, + "loss": 3.3738, + "step": 105990 + }, + { + "epoch": 7.201725777958962, + "grad_norm": 1.0815188884735107, + "learning_rate": 0.00010010701182225847, + "loss": 3.3806, + "step": 105995 + }, + { + "epoch": 7.202065498029624, + "grad_norm": 1.0948508977890015, + "learning_rate": 0.00010006454681342574, + "loss": 3.4468, + "step": 106000 + }, + { + "epoch": 7.202405218100285, + "grad_norm": 0.7768076062202454, + "learning_rate": 0.00010002208180459302, + "loss": 3.455, + "step": 106005 + }, + { + "epoch": 7.202744938170947, + "grad_norm": 0.9626686573028564, + "learning_rate": 9.99796167957603e-05, + "loss": 3.2635, + "step": 106010 + }, + { + "epoch": 7.203084658241609, + "grad_norm": 0.9164626002311707, + "learning_rate": 9.993715178692757e-05, + "loss": 3.5476, + "step": 106015 + }, + { + "epoch": 7.2034243783122704, + "grad_norm": 0.962905764579773, + "learning_rate": 9.989468677809485e-05, + "loss": 3.4282, + "step": 106020 + }, + { + "epoch": 7.2037640983829325, + "grad_norm": 0.8809643983840942, + "learning_rate": 9.985222176926214e-05, + "loss": 3.3252, + "step": 106025 + }, + { + "epoch": 7.204103818453595, + "grad_norm": 0.7884936928749084, + "learning_rate": 9.98097567604294e-05, + "loss": 3.4126, + "step": 106030 + }, + { + "epoch": 7.204443538524256, + "grad_norm": 1.089401125907898, + "learning_rate": 9.976729175159669e-05, + "loss": 3.5377, + "step": 106035 + }, + { + "epoch": 7.204783258594918, + "grad_norm": 1.0285054445266724, + "learning_rate": 9.972482674276397e-05, + "loss": 3.111, + "step": 106040 + }, + { + "epoch": 7.20512297866558, + "grad_norm": 0.9894689321517944, + "learning_rate": 9.968236173393125e-05, + "loss": 3.5037, + "step": 106045 + }, + { + "epoch": 7.205462698736241, + "grad_norm": 0.947315514087677, + "learning_rate": 9.963989672509851e-05, + "loss": 3.5439, + "step": 106050 + }, + { + "epoch": 7.205802418806903, + "grad_norm": 0.8051893711090088, + "learning_rate": 9.959743171626579e-05, + "loss": 3.4466, + "step": 106055 + }, + { + "epoch": 7.206142138877565, + "grad_norm": 0.8053247332572937, + "learning_rate": 9.955496670743309e-05, + "loss": 3.6972, + "step": 106060 + }, + { + "epoch": 7.2064818589482265, + "grad_norm": 0.9979872703552246, + "learning_rate": 9.951250169860035e-05, + "loss": 3.3728, + "step": 106065 + }, + { + "epoch": 7.2068215790188885, + "grad_norm": 0.736204206943512, + "learning_rate": 9.947003668976763e-05, + "loss": 3.5166, + "step": 106070 + }, + { + "epoch": 7.207161299089551, + "grad_norm": 0.9073604941368103, + "learning_rate": 9.942757168093491e-05, + "loss": 3.2956, + "step": 106075 + }, + { + "epoch": 7.207501019160212, + "grad_norm": 0.7775717377662659, + "learning_rate": 9.938510667210219e-05, + "loss": 3.3075, + "step": 106080 + }, + { + "epoch": 7.207840739230874, + "grad_norm": 0.89478999376297, + "learning_rate": 9.934264166326946e-05, + "loss": 3.5192, + "step": 106085 + }, + { + "epoch": 7.208180459301536, + "grad_norm": 0.8570777773857117, + "learning_rate": 9.930017665443675e-05, + "loss": 3.2282, + "step": 106090 + }, + { + "epoch": 7.208520179372197, + "grad_norm": 1.2446560859680176, + "learning_rate": 9.925771164560403e-05, + "loss": 3.4575, + "step": 106095 + }, + { + "epoch": 7.208859899442859, + "grad_norm": 0.9317611455917358, + "learning_rate": 9.92152466367713e-05, + "loss": 3.6254, + "step": 106100 + }, + { + "epoch": 7.209199619513521, + "grad_norm": 0.8992774486541748, + "learning_rate": 9.917278162793858e-05, + "loss": 3.5488, + "step": 106105 + }, + { + "epoch": 7.2095393395841825, + "grad_norm": 0.7610717415809631, + "learning_rate": 9.913031661910586e-05, + "loss": 3.5235, + "step": 106110 + }, + { + "epoch": 7.2098790596548445, + "grad_norm": 0.8336702585220337, + "learning_rate": 9.908785161027313e-05, + "loss": 3.2311, + "step": 106115 + }, + { + "epoch": 7.210218779725507, + "grad_norm": 0.9159334301948547, + "learning_rate": 9.90453866014404e-05, + "loss": 3.5003, + "step": 106120 + }, + { + "epoch": 7.210558499796168, + "grad_norm": 1.0960251092910767, + "learning_rate": 9.90029215926077e-05, + "loss": 3.5527, + "step": 106125 + }, + { + "epoch": 7.21089821986683, + "grad_norm": 0.8495272397994995, + "learning_rate": 9.896045658377498e-05, + "loss": 3.4274, + "step": 106130 + }, + { + "epoch": 7.211237939937492, + "grad_norm": 1.0788837671279907, + "learning_rate": 9.891799157494225e-05, + "loss": 3.3676, + "step": 106135 + }, + { + "epoch": 7.211577660008153, + "grad_norm": 0.9917789101600647, + "learning_rate": 9.887552656610953e-05, + "loss": 3.3265, + "step": 106140 + }, + { + "epoch": 7.211917380078815, + "grad_norm": 0.857916533946991, + "learning_rate": 9.883306155727681e-05, + "loss": 3.3082, + "step": 106145 + }, + { + "epoch": 7.212257100149476, + "grad_norm": 0.7578067779541016, + "learning_rate": 9.879059654844407e-05, + "loss": 3.3472, + "step": 106150 + }, + { + "epoch": 7.2125968202201385, + "grad_norm": 0.7997801899909973, + "learning_rate": 9.874813153961137e-05, + "loss": 3.3752, + "step": 106155 + }, + { + "epoch": 7.212936540290801, + "grad_norm": 1.1222922801971436, + "learning_rate": 9.870566653077865e-05, + "loss": 3.2959, + "step": 106160 + }, + { + "epoch": 7.213276260361462, + "grad_norm": 0.9397960305213928, + "learning_rate": 9.866320152194593e-05, + "loss": 3.3362, + "step": 106165 + }, + { + "epoch": 7.213615980432124, + "grad_norm": 0.7420213222503662, + "learning_rate": 9.86207365131132e-05, + "loss": 3.4165, + "step": 106170 + }, + { + "epoch": 7.213955700502786, + "grad_norm": 0.8429858088493347, + "learning_rate": 9.857827150428047e-05, + "loss": 3.2478, + "step": 106175 + }, + { + "epoch": 7.214295420573447, + "grad_norm": 0.9535784125328064, + "learning_rate": 9.853580649544775e-05, + "loss": 3.3543, + "step": 106180 + }, + { + "epoch": 7.214635140644109, + "grad_norm": 0.9485892653465271, + "learning_rate": 9.849334148661502e-05, + "loss": 3.3998, + "step": 106185 + }, + { + "epoch": 7.214974860714771, + "grad_norm": 0.8095501661300659, + "learning_rate": 9.845087647778231e-05, + "loss": 2.9809, + "step": 106190 + }, + { + "epoch": 7.215314580785432, + "grad_norm": 0.8296616673469543, + "learning_rate": 9.84084114689496e-05, + "loss": 3.2925, + "step": 106195 + }, + { + "epoch": 7.2156543008560945, + "grad_norm": 0.8173223733901978, + "learning_rate": 9.836594646011686e-05, + "loss": 3.2549, + "step": 106200 + }, + { + "epoch": 7.215994020926757, + "grad_norm": 0.7383130788803101, + "learning_rate": 9.832348145128414e-05, + "loss": 3.6557, + "step": 106205 + }, + { + "epoch": 7.216333740997418, + "grad_norm": 0.9921514391899109, + "learning_rate": 9.828101644245142e-05, + "loss": 3.3346, + "step": 106210 + }, + { + "epoch": 7.21667346106808, + "grad_norm": 0.9259402751922607, + "learning_rate": 9.82385514336187e-05, + "loss": 3.2521, + "step": 106215 + }, + { + "epoch": 7.217013181138742, + "grad_norm": 1.2341504096984863, + "learning_rate": 9.819608642478597e-05, + "loss": 3.2132, + "step": 106220 + }, + { + "epoch": 7.217352901209403, + "grad_norm": 0.9733263254165649, + "learning_rate": 9.815362141595326e-05, + "loss": 3.2797, + "step": 106225 + }, + { + "epoch": 7.217692621280065, + "grad_norm": 1.3429620265960693, + "learning_rate": 9.811115640712054e-05, + "loss": 3.6212, + "step": 106230 + }, + { + "epoch": 7.218032341350727, + "grad_norm": 1.0732076168060303, + "learning_rate": 9.806869139828781e-05, + "loss": 3.5815, + "step": 106235 + }, + { + "epoch": 7.218372061421388, + "grad_norm": 1.0165232419967651, + "learning_rate": 9.802622638945509e-05, + "loss": 3.3321, + "step": 106240 + }, + { + "epoch": 7.2187117814920505, + "grad_norm": 2.6232073307037354, + "learning_rate": 9.798376138062237e-05, + "loss": 3.0016, + "step": 106245 + }, + { + "epoch": 7.219051501562713, + "grad_norm": 1.303659439086914, + "learning_rate": 9.794129637178965e-05, + "loss": 3.3417, + "step": 106250 + }, + { + "epoch": 7.219391221633374, + "grad_norm": 1.0008296966552734, + "learning_rate": 9.789883136295693e-05, + "loss": 3.4142, + "step": 106255 + }, + { + "epoch": 7.219730941704036, + "grad_norm": 0.877399742603302, + "learning_rate": 9.785636635412421e-05, + "loss": 3.5448, + "step": 106260 + }, + { + "epoch": 7.220070661774698, + "grad_norm": 1.4533665180206299, + "learning_rate": 9.781390134529149e-05, + "loss": 3.5567, + "step": 106265 + }, + { + "epoch": 7.220410381845359, + "grad_norm": 0.8439891934394836, + "learning_rate": 9.777143633645875e-05, + "loss": 3.4337, + "step": 106270 + }, + { + "epoch": 7.220750101916021, + "grad_norm": 1.045285701751709, + "learning_rate": 9.772897132762603e-05, + "loss": 3.4536, + "step": 106275 + }, + { + "epoch": 7.221089821986683, + "grad_norm": 1.2341760396957397, + "learning_rate": 9.768650631879331e-05, + "loss": 3.5573, + "step": 106280 + }, + { + "epoch": 7.221429542057344, + "grad_norm": 1.2884063720703125, + "learning_rate": 9.764404130996058e-05, + "loss": 3.5334, + "step": 106285 + }, + { + "epoch": 7.2217692621280065, + "grad_norm": 1.4611653089523315, + "learning_rate": 9.760157630112788e-05, + "loss": 3.3116, + "step": 106290 + }, + { + "epoch": 7.222108982198669, + "grad_norm": 0.8890812397003174, + "learning_rate": 9.755911129229516e-05, + "loss": 3.5857, + "step": 106295 + }, + { + "epoch": 7.22244870226933, + "grad_norm": 0.8539170026779175, + "learning_rate": 9.751664628346244e-05, + "loss": 3.2378, + "step": 106300 + }, + { + "epoch": 7.222788422339992, + "grad_norm": 1.016847848892212, + "learning_rate": 9.74741812746297e-05, + "loss": 3.3675, + "step": 106305 + }, + { + "epoch": 7.223128142410654, + "grad_norm": 0.9195106029510498, + "learning_rate": 9.743171626579698e-05, + "loss": 3.6428, + "step": 106310 + }, + { + "epoch": 7.223467862481315, + "grad_norm": 0.9813041687011719, + "learning_rate": 9.738925125696426e-05, + "loss": 3.3169, + "step": 106315 + }, + { + "epoch": 7.223807582551977, + "grad_norm": 1.1619354486465454, + "learning_rate": 9.734678624813154e-05, + "loss": 3.5701, + "step": 106320 + }, + { + "epoch": 7.224147302622639, + "grad_norm": 0.8879547715187073, + "learning_rate": 9.730432123929882e-05, + "loss": 3.4706, + "step": 106325 + }, + { + "epoch": 7.2244870226933005, + "grad_norm": 1.3446524143218994, + "learning_rate": 9.72618562304661e-05, + "loss": 3.1458, + "step": 106330 + }, + { + "epoch": 7.2248267427639625, + "grad_norm": 0.9754709005355835, + "learning_rate": 9.721939122163338e-05, + "loss": 3.2904, + "step": 106335 + }, + { + "epoch": 7.225166462834625, + "grad_norm": 0.8374881744384766, + "learning_rate": 9.717692621280065e-05, + "loss": 3.5345, + "step": 106340 + }, + { + "epoch": 7.225506182905286, + "grad_norm": 1.397585391998291, + "learning_rate": 9.713446120396793e-05, + "loss": 3.5922, + "step": 106345 + }, + { + "epoch": 7.225845902975948, + "grad_norm": 0.7331221103668213, + "learning_rate": 9.709199619513521e-05, + "loss": 3.4385, + "step": 106350 + }, + { + "epoch": 7.22618562304661, + "grad_norm": 1.1514402627944946, + "learning_rate": 9.704953118630249e-05, + "loss": 3.2042, + "step": 106355 + }, + { + "epoch": 7.226525343117271, + "grad_norm": 1.010617971420288, + "learning_rate": 9.700706617746977e-05, + "loss": 3.3854, + "step": 106360 + }, + { + "epoch": 7.226865063187933, + "grad_norm": 0.9634455442428589, + "learning_rate": 9.696460116863705e-05, + "loss": 3.3934, + "step": 106365 + }, + { + "epoch": 7.227204783258595, + "grad_norm": 0.8654137849807739, + "learning_rate": 9.692213615980432e-05, + "loss": 3.7926, + "step": 106370 + }, + { + "epoch": 7.2275445033292565, + "grad_norm": 0.9829028844833374, + "learning_rate": 9.68796711509716e-05, + "loss": 3.2316, + "step": 106375 + }, + { + "epoch": 7.2278842233999185, + "grad_norm": 1.067372441291809, + "learning_rate": 9.683720614213888e-05, + "loss": 3.696, + "step": 106380 + }, + { + "epoch": 7.228223943470581, + "grad_norm": 1.0262326002120972, + "learning_rate": 9.679474113330617e-05, + "loss": 2.9469, + "step": 106385 + }, + { + "epoch": 7.228563663541242, + "grad_norm": 1.119954228401184, + "learning_rate": 9.675227612447344e-05, + "loss": 3.3832, + "step": 106390 + }, + { + "epoch": 7.228903383611904, + "grad_norm": 1.2161709070205688, + "learning_rate": 9.670981111564072e-05, + "loss": 3.4103, + "step": 106395 + }, + { + "epoch": 7.229243103682566, + "grad_norm": 0.7750277519226074, + "learning_rate": 9.6667346106808e-05, + "loss": 3.3465, + "step": 106400 + }, + { + "epoch": 7.229582823753227, + "grad_norm": 1.0107624530792236, + "learning_rate": 9.662488109797526e-05, + "loss": 3.3612, + "step": 106405 + }, + { + "epoch": 7.229922543823889, + "grad_norm": 1.5383999347686768, + "learning_rate": 9.658241608914254e-05, + "loss": 3.3031, + "step": 106410 + }, + { + "epoch": 7.230262263894551, + "grad_norm": 0.860011637210846, + "learning_rate": 9.653995108030982e-05, + "loss": 3.3497, + "step": 106415 + }, + { + "epoch": 7.2306019839652125, + "grad_norm": 1.05092191696167, + "learning_rate": 9.649748607147712e-05, + "loss": 3.462, + "step": 106420 + }, + { + "epoch": 7.2309417040358746, + "grad_norm": 0.8414567112922668, + "learning_rate": 9.645502106264438e-05, + "loss": 3.3697, + "step": 106425 + }, + { + "epoch": 7.231281424106537, + "grad_norm": 1.022850751876831, + "learning_rate": 9.641255605381166e-05, + "loss": 3.4121, + "step": 106430 + }, + { + "epoch": 7.231621144177198, + "grad_norm": 0.734934389591217, + "learning_rate": 9.637009104497894e-05, + "loss": 3.4257, + "step": 106435 + }, + { + "epoch": 7.23196086424786, + "grad_norm": 0.9525325298309326, + "learning_rate": 9.632762603614621e-05, + "loss": 3.6304, + "step": 106440 + }, + { + "epoch": 7.232300584318522, + "grad_norm": 0.7540212869644165, + "learning_rate": 9.628516102731349e-05, + "loss": 3.3755, + "step": 106445 + }, + { + "epoch": 7.232640304389183, + "grad_norm": 0.786838948726654, + "learning_rate": 9.624269601848078e-05, + "loss": 3.5743, + "step": 106450 + }, + { + "epoch": 7.232980024459845, + "grad_norm": 0.9002817869186401, + "learning_rate": 9.620023100964805e-05, + "loss": 3.1108, + "step": 106455 + }, + { + "epoch": 7.233319744530506, + "grad_norm": 1.0550581216812134, + "learning_rate": 9.615776600081533e-05, + "loss": 3.2617, + "step": 106460 + }, + { + "epoch": 7.2336594646011685, + "grad_norm": 0.8753000497817993, + "learning_rate": 9.611530099198261e-05, + "loss": 3.4307, + "step": 106465 + }, + { + "epoch": 7.233999184671831, + "grad_norm": 1.0713354349136353, + "learning_rate": 9.607283598314989e-05, + "loss": 3.3357, + "step": 106470 + }, + { + "epoch": 7.234338904742492, + "grad_norm": 1.197746992111206, + "learning_rate": 9.603037097431716e-05, + "loss": 3.3544, + "step": 106475 + }, + { + "epoch": 7.234678624813154, + "grad_norm": 0.8419172167778015, + "learning_rate": 9.598790596548444e-05, + "loss": 3.3231, + "step": 106480 + }, + { + "epoch": 7.235018344883816, + "grad_norm": 1.0014857053756714, + "learning_rate": 9.594544095665173e-05, + "loss": 3.2585, + "step": 106485 + }, + { + "epoch": 7.235358064954477, + "grad_norm": 0.9966953992843628, + "learning_rate": 9.5902975947819e-05, + "loss": 3.0708, + "step": 106490 + }, + { + "epoch": 7.235697785025139, + "grad_norm": 1.0715240240097046, + "learning_rate": 9.586051093898628e-05, + "loss": 3.6321, + "step": 106495 + }, + { + "epoch": 7.236037505095801, + "grad_norm": 0.8368849158287048, + "learning_rate": 9.581804593015356e-05, + "loss": 3.3356, + "step": 106500 + }, + { + "epoch": 7.236377225166462, + "grad_norm": 1.103559970855713, + "learning_rate": 9.577558092132084e-05, + "loss": 3.3086, + "step": 106505 + }, + { + "epoch": 7.2367169452371245, + "grad_norm": 0.998490571975708, + "learning_rate": 9.57331159124881e-05, + "loss": 3.3435, + "step": 106510 + }, + { + "epoch": 7.237056665307787, + "grad_norm": 1.066719651222229, + "learning_rate": 9.56906509036554e-05, + "loss": 3.2668, + "step": 106515 + }, + { + "epoch": 7.237396385378448, + "grad_norm": 0.9685758352279663, + "learning_rate": 9.564818589482268e-05, + "loss": 3.4909, + "step": 106520 + }, + { + "epoch": 7.23773610544911, + "grad_norm": 0.6913326978683472, + "learning_rate": 9.560572088598994e-05, + "loss": 3.0747, + "step": 106525 + }, + { + "epoch": 7.238075825519772, + "grad_norm": 0.9756535887718201, + "learning_rate": 9.556325587715722e-05, + "loss": 3.2414, + "step": 106530 + }, + { + "epoch": 7.238415545590433, + "grad_norm": 0.8253628611564636, + "learning_rate": 9.55207908683245e-05, + "loss": 3.3439, + "step": 106535 + }, + { + "epoch": 7.238755265661095, + "grad_norm": 0.8633694648742676, + "learning_rate": 9.547832585949177e-05, + "loss": 3.5786, + "step": 106540 + }, + { + "epoch": 7.239094985731757, + "grad_norm": 1.1053664684295654, + "learning_rate": 9.543586085065905e-05, + "loss": 3.5573, + "step": 106545 + }, + { + "epoch": 7.239434705802418, + "grad_norm": 0.9998781085014343, + "learning_rate": 9.539339584182634e-05, + "loss": 3.1868, + "step": 106550 + }, + { + "epoch": 7.2397744258730805, + "grad_norm": 0.9551767110824585, + "learning_rate": 9.535093083299362e-05, + "loss": 3.4156, + "step": 106555 + }, + { + "epoch": 7.240114145943743, + "grad_norm": 0.9673648476600647, + "learning_rate": 9.530846582416089e-05, + "loss": 3.3833, + "step": 106560 + }, + { + "epoch": 7.240453866014404, + "grad_norm": 0.9218423962593079, + "learning_rate": 9.526600081532817e-05, + "loss": 3.2316, + "step": 106565 + }, + { + "epoch": 7.240793586085066, + "grad_norm": 0.8201649785041809, + "learning_rate": 9.522353580649545e-05, + "loss": 3.5998, + "step": 106570 + }, + { + "epoch": 7.241133306155728, + "grad_norm": 1.4717475175857544, + "learning_rate": 9.518107079766272e-05, + "loss": 3.5036, + "step": 106575 + }, + { + "epoch": 7.241473026226389, + "grad_norm": 0.899878740310669, + "learning_rate": 9.513860578883e-05, + "loss": 3.4023, + "step": 106580 + }, + { + "epoch": 7.241812746297051, + "grad_norm": 1.018218994140625, + "learning_rate": 9.509614077999729e-05, + "loss": 3.4759, + "step": 106585 + }, + { + "epoch": 7.242152466367713, + "grad_norm": 0.6936570405960083, + "learning_rate": 9.505367577116457e-05, + "loss": 3.4834, + "step": 106590 + }, + { + "epoch": 7.2424921864383744, + "grad_norm": 1.02244234085083, + "learning_rate": 9.501121076233184e-05, + "loss": 3.234, + "step": 106595 + }, + { + "epoch": 7.2428319065090365, + "grad_norm": 0.9275456070899963, + "learning_rate": 9.496874575349912e-05, + "loss": 3.5703, + "step": 106600 + }, + { + "epoch": 7.243171626579699, + "grad_norm": 1.1174226999282837, + "learning_rate": 9.49262807446664e-05, + "loss": 3.3607, + "step": 106605 + }, + { + "epoch": 7.24351134665036, + "grad_norm": 0.7595512866973877, + "learning_rate": 9.488381573583366e-05, + "loss": 3.3554, + "step": 106610 + }, + { + "epoch": 7.243851066721022, + "grad_norm": 1.3963921070098877, + "learning_rate": 9.484135072700096e-05, + "loss": 3.5334, + "step": 106615 + }, + { + "epoch": 7.244190786791684, + "grad_norm": 0.9463827610015869, + "learning_rate": 9.479888571816824e-05, + "loss": 3.1866, + "step": 106620 + }, + { + "epoch": 7.244530506862345, + "grad_norm": 0.7838232517242432, + "learning_rate": 9.47564207093355e-05, + "loss": 3.2337, + "step": 106625 + }, + { + "epoch": 7.244870226933007, + "grad_norm": 0.9513334035873413, + "learning_rate": 9.471395570050278e-05, + "loss": 3.4162, + "step": 106630 + }, + { + "epoch": 7.245209947003669, + "grad_norm": 0.8248580098152161, + "learning_rate": 9.467149069167006e-05, + "loss": 3.4947, + "step": 106635 + }, + { + "epoch": 7.2455496670743305, + "grad_norm": 0.7253133058547974, + "learning_rate": 9.462902568283735e-05, + "loss": 3.3126, + "step": 106640 + }, + { + "epoch": 7.2458893871449925, + "grad_norm": 0.955199658870697, + "learning_rate": 9.458656067400461e-05, + "loss": 3.3405, + "step": 106645 + }, + { + "epoch": 7.246229107215655, + "grad_norm": 0.7419239282608032, + "learning_rate": 9.45440956651719e-05, + "loss": 3.6082, + "step": 106650 + }, + { + "epoch": 7.246568827286316, + "grad_norm": 1.0378450155258179, + "learning_rate": 9.450163065633919e-05, + "loss": 3.0617, + "step": 106655 + }, + { + "epoch": 7.246908547356978, + "grad_norm": 0.7422586679458618, + "learning_rate": 9.445916564750645e-05, + "loss": 3.257, + "step": 106660 + }, + { + "epoch": 7.24724826742764, + "grad_norm": 0.8105832934379578, + "learning_rate": 9.441670063867373e-05, + "loss": 3.4892, + "step": 106665 + }, + { + "epoch": 7.247587987498301, + "grad_norm": 0.9796565175056458, + "learning_rate": 9.437423562984101e-05, + "loss": 3.2195, + "step": 106670 + }, + { + "epoch": 7.247927707568963, + "grad_norm": 0.8121803998947144, + "learning_rate": 9.433177062100829e-05, + "loss": 3.3814, + "step": 106675 + }, + { + "epoch": 7.248267427639625, + "grad_norm": 0.9639015197753906, + "learning_rate": 9.428930561217557e-05, + "loss": 3.3467, + "step": 106680 + }, + { + "epoch": 7.2486071477102865, + "grad_norm": 1.131103277206421, + "learning_rate": 9.424684060334285e-05, + "loss": 3.4953, + "step": 106685 + }, + { + "epoch": 7.2489468677809485, + "grad_norm": 1.1780000925064087, + "learning_rate": 9.420437559451013e-05, + "loss": 3.2784, + "step": 106690 + }, + { + "epoch": 7.249286587851611, + "grad_norm": 0.8641454577445984, + "learning_rate": 9.41619105856774e-05, + "loss": 3.4028, + "step": 106695 + }, + { + "epoch": 7.249626307922272, + "grad_norm": 0.9258278012275696, + "learning_rate": 9.411944557684468e-05, + "loss": 3.328, + "step": 106700 + }, + { + "epoch": 7.249966027992934, + "grad_norm": 0.8853586912155151, + "learning_rate": 9.407698056801196e-05, + "loss": 3.6102, + "step": 106705 + }, + { + "epoch": 7.250305748063596, + "grad_norm": 0.7012967467308044, + "learning_rate": 9.403451555917923e-05, + "loss": 3.5348, + "step": 106710 + }, + { + "epoch": 7.250645468134257, + "grad_norm": 0.8248225450515747, + "learning_rate": 9.399205055034652e-05, + "loss": 3.4303, + "step": 106715 + }, + { + "epoch": 7.250985188204919, + "grad_norm": 1.0346137285232544, + "learning_rate": 9.39495855415138e-05, + "loss": 3.4192, + "step": 106720 + }, + { + "epoch": 7.251324908275581, + "grad_norm": 0.936991810798645, + "learning_rate": 9.390712053268108e-05, + "loss": 3.4858, + "step": 106725 + }, + { + "epoch": 7.2516646283462425, + "grad_norm": 0.895368754863739, + "learning_rate": 9.386465552384835e-05, + "loss": 3.3159, + "step": 106730 + }, + { + "epoch": 7.2520043484169046, + "grad_norm": 1.110459804534912, + "learning_rate": 9.382219051501563e-05, + "loss": 3.3019, + "step": 106735 + }, + { + "epoch": 7.252344068487567, + "grad_norm": 0.8580955862998962, + "learning_rate": 9.37797255061829e-05, + "loss": 3.3905, + "step": 106740 + }, + { + "epoch": 7.252683788558228, + "grad_norm": 0.9586470723152161, + "learning_rate": 9.373726049735019e-05, + "loss": 3.3679, + "step": 106745 + }, + { + "epoch": 7.25302350862889, + "grad_norm": 1.316164493560791, + "learning_rate": 9.369479548851747e-05, + "loss": 3.2334, + "step": 106750 + }, + { + "epoch": 7.253363228699552, + "grad_norm": 1.0969911813735962, + "learning_rate": 9.365233047968475e-05, + "loss": 2.8933, + "step": 106755 + }, + { + "epoch": 7.253702948770213, + "grad_norm": 0.9315330386161804, + "learning_rate": 9.360986547085203e-05, + "loss": 3.4559, + "step": 106760 + }, + { + "epoch": 7.254042668840875, + "grad_norm": 1.3063770532608032, + "learning_rate": 9.356740046201929e-05, + "loss": 3.1112, + "step": 106765 + }, + { + "epoch": 7.254382388911537, + "grad_norm": 1.024181842803955, + "learning_rate": 9.352493545318657e-05, + "loss": 3.2864, + "step": 106770 + }, + { + "epoch": 7.2547221089821985, + "grad_norm": 1.055405855178833, + "learning_rate": 9.348247044435385e-05, + "loss": 3.4355, + "step": 106775 + }, + { + "epoch": 7.255061829052861, + "grad_norm": 1.0097079277038574, + "learning_rate": 9.344000543552113e-05, + "loss": 3.4052, + "step": 106780 + }, + { + "epoch": 7.255401549123523, + "grad_norm": 0.8935850262641907, + "learning_rate": 9.339754042668841e-05, + "loss": 3.2644, + "step": 106785 + }, + { + "epoch": 7.255741269194184, + "grad_norm": 0.9412526488304138, + "learning_rate": 9.335507541785569e-05, + "loss": 3.3963, + "step": 106790 + }, + { + "epoch": 7.256080989264846, + "grad_norm": 0.9440111517906189, + "learning_rate": 9.331261040902296e-05, + "loss": 3.3124, + "step": 106795 + }, + { + "epoch": 7.256420709335508, + "grad_norm": 0.8392987251281738, + "learning_rate": 9.327014540019024e-05, + "loss": 3.3978, + "step": 106800 + }, + { + "epoch": 7.256760429406169, + "grad_norm": 0.8836671710014343, + "learning_rate": 9.322768039135752e-05, + "loss": 3.2866, + "step": 106805 + }, + { + "epoch": 7.257100149476831, + "grad_norm": 1.088700294494629, + "learning_rate": 9.318521538252481e-05, + "loss": 3.2035, + "step": 106810 + }, + { + "epoch": 7.257439869547493, + "grad_norm": 1.3906748294830322, + "learning_rate": 9.314275037369208e-05, + "loss": 3.6051, + "step": 106815 + }, + { + "epoch": 7.2577795896181545, + "grad_norm": 0.9321019053459167, + "learning_rate": 9.310028536485936e-05, + "loss": 3.4965, + "step": 106820 + }, + { + "epoch": 7.258119309688817, + "grad_norm": 1.4776277542114258, + "learning_rate": 9.305782035602664e-05, + "loss": 3.4533, + "step": 106825 + }, + { + "epoch": 7.258459029759479, + "grad_norm": 0.7699559330940247, + "learning_rate": 9.30153553471939e-05, + "loss": 3.4855, + "step": 106830 + }, + { + "epoch": 7.25879874983014, + "grad_norm": 0.9702872633934021, + "learning_rate": 9.297289033836119e-05, + "loss": 3.5923, + "step": 106835 + }, + { + "epoch": 7.259138469900802, + "grad_norm": 1.1445820331573486, + "learning_rate": 9.293042532952847e-05, + "loss": 3.3295, + "step": 106840 + }, + { + "epoch": 7.259478189971463, + "grad_norm": 0.940119206905365, + "learning_rate": 9.288796032069576e-05, + "loss": 3.5763, + "step": 106845 + }, + { + "epoch": 7.259817910042125, + "grad_norm": 0.7063393592834473, + "learning_rate": 9.284549531186303e-05, + "loss": 3.1937, + "step": 106850 + }, + { + "epoch": 7.260157630112787, + "grad_norm": 0.9142401814460754, + "learning_rate": 9.280303030303031e-05, + "loss": 3.2926, + "step": 106855 + }, + { + "epoch": 7.260497350183448, + "grad_norm": 0.954307496547699, + "learning_rate": 9.276056529419759e-05, + "loss": 3.3387, + "step": 106860 + }, + { + "epoch": 7.2608370702541105, + "grad_norm": 1.2589070796966553, + "learning_rate": 9.271810028536485e-05, + "loss": 3.1954, + "step": 106865 + }, + { + "epoch": 7.261176790324773, + "grad_norm": 0.8197320699691772, + "learning_rate": 9.267563527653213e-05, + "loss": 3.3158, + "step": 106870 + }, + { + "epoch": 7.261516510395434, + "grad_norm": 0.6727901697158813, + "learning_rate": 9.263317026769943e-05, + "loss": 3.2703, + "step": 106875 + }, + { + "epoch": 7.261856230466096, + "grad_norm": 0.9791354537010193, + "learning_rate": 9.25907052588667e-05, + "loss": 3.4526, + "step": 106880 + }, + { + "epoch": 7.262195950536758, + "grad_norm": 1.930832862854004, + "learning_rate": 9.254824025003397e-05, + "loss": 3.8219, + "step": 106885 + }, + { + "epoch": 7.262535670607419, + "grad_norm": 1.1681394577026367, + "learning_rate": 9.250577524120125e-05, + "loss": 3.1535, + "step": 106890 + }, + { + "epoch": 7.262875390678081, + "grad_norm": 0.8689091205596924, + "learning_rate": 9.246331023236853e-05, + "loss": 3.3653, + "step": 106895 + }, + { + "epoch": 7.263215110748743, + "grad_norm": 1.114701271057129, + "learning_rate": 9.24208452235358e-05, + "loss": 3.3484, + "step": 106900 + }, + { + "epoch": 7.2635548308194045, + "grad_norm": 1.149871587753296, + "learning_rate": 9.237838021470308e-05, + "loss": 3.3289, + "step": 106905 + }, + { + "epoch": 7.2638945508900665, + "grad_norm": 0.9220776557922363, + "learning_rate": 9.233591520587037e-05, + "loss": 3.3478, + "step": 106910 + }, + { + "epoch": 7.264234270960729, + "grad_norm": 0.848659873008728, + "learning_rate": 9.229345019703764e-05, + "loss": 3.4696, + "step": 106915 + }, + { + "epoch": 7.26457399103139, + "grad_norm": 2.1254217624664307, + "learning_rate": 9.225098518820492e-05, + "loss": 3.4793, + "step": 106920 + }, + { + "epoch": 7.264913711102052, + "grad_norm": 0.9105680584907532, + "learning_rate": 9.22085201793722e-05, + "loss": 3.4182, + "step": 106925 + }, + { + "epoch": 7.265253431172714, + "grad_norm": 0.9147411584854126, + "learning_rate": 9.216605517053948e-05, + "loss": 3.2244, + "step": 106930 + }, + { + "epoch": 7.265593151243375, + "grad_norm": 1.1112929582595825, + "learning_rate": 9.212359016170675e-05, + "loss": 3.3884, + "step": 106935 + }, + { + "epoch": 7.265932871314037, + "grad_norm": 0.8753441572189331, + "learning_rate": 9.208112515287404e-05, + "loss": 3.3987, + "step": 106940 + }, + { + "epoch": 7.266272591384699, + "grad_norm": 0.8476969599723816, + "learning_rate": 9.203866014404132e-05, + "loss": 3.4701, + "step": 106945 + }, + { + "epoch": 7.2666123114553605, + "grad_norm": 1.0897724628448486, + "learning_rate": 9.199619513520859e-05, + "loss": 3.5274, + "step": 106950 + }, + { + "epoch": 7.2669520315260225, + "grad_norm": 0.8795422315597534, + "learning_rate": 9.195373012637587e-05, + "loss": 3.4216, + "step": 106955 + }, + { + "epoch": 7.267291751596685, + "grad_norm": 0.8821689486503601, + "learning_rate": 9.191126511754315e-05, + "loss": 3.4322, + "step": 106960 + }, + { + "epoch": 7.267631471667346, + "grad_norm": 1.1592833995819092, + "learning_rate": 9.186880010871041e-05, + "loss": 3.444, + "step": 106965 + }, + { + "epoch": 7.267971191738008, + "grad_norm": 0.9927697777748108, + "learning_rate": 9.18263350998777e-05, + "loss": 3.2447, + "step": 106970 + }, + { + "epoch": 7.26831091180867, + "grad_norm": 1.121121883392334, + "learning_rate": 9.178387009104499e-05, + "loss": 3.4484, + "step": 106975 + }, + { + "epoch": 7.268650631879331, + "grad_norm": 1.0082286596298218, + "learning_rate": 9.174140508221227e-05, + "loss": 3.4965, + "step": 106980 + }, + { + "epoch": 7.268990351949993, + "grad_norm": 1.1908787488937378, + "learning_rate": 9.169894007337953e-05, + "loss": 3.3513, + "step": 106985 + }, + { + "epoch": 7.269330072020655, + "grad_norm": 0.9220174551010132, + "learning_rate": 9.165647506454681e-05, + "loss": 3.548, + "step": 106990 + }, + { + "epoch": 7.2696697920913165, + "grad_norm": 1.2632396221160889, + "learning_rate": 9.16140100557141e-05, + "loss": 3.4461, + "step": 106995 + }, + { + "epoch": 7.2700095121619785, + "grad_norm": 0.8750635981559753, + "learning_rate": 9.157154504688136e-05, + "loss": 3.4266, + "step": 107000 + }, + { + "epoch": 7.270349232232641, + "grad_norm": 0.8411234617233276, + "learning_rate": 9.152908003804864e-05, + "loss": 3.3911, + "step": 107005 + }, + { + "epoch": 7.270688952303302, + "grad_norm": 1.0020290613174438, + "learning_rate": 9.148661502921594e-05, + "loss": 3.3994, + "step": 107010 + }, + { + "epoch": 7.271028672373964, + "grad_norm": 0.8932426571846008, + "learning_rate": 9.144415002038322e-05, + "loss": 3.3615, + "step": 107015 + }, + { + "epoch": 7.271368392444626, + "grad_norm": 0.9097530245780945, + "learning_rate": 9.140168501155048e-05, + "loss": 3.3822, + "step": 107020 + }, + { + "epoch": 7.271708112515287, + "grad_norm": 1.1096588373184204, + "learning_rate": 9.135922000271776e-05, + "loss": 3.5909, + "step": 107025 + }, + { + "epoch": 7.272047832585949, + "grad_norm": 0.9274539947509766, + "learning_rate": 9.131675499388504e-05, + "loss": 3.2669, + "step": 107030 + }, + { + "epoch": 7.272387552656611, + "grad_norm": 0.8819828629493713, + "learning_rate": 9.127428998505231e-05, + "loss": 3.2755, + "step": 107035 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 1.1294366121292114, + "learning_rate": 9.12318249762196e-05, + "loss": 3.3599, + "step": 107040 + }, + { + "epoch": 7.273066992797935, + "grad_norm": 0.9703618288040161, + "learning_rate": 9.118935996738688e-05, + "loss": 3.5264, + "step": 107045 + }, + { + "epoch": 7.273406712868597, + "grad_norm": 0.8017876148223877, + "learning_rate": 9.114689495855415e-05, + "loss": 3.1404, + "step": 107050 + }, + { + "epoch": 7.273746432939258, + "grad_norm": 1.0033574104309082, + "learning_rate": 9.110442994972143e-05, + "loss": 3.3755, + "step": 107055 + }, + { + "epoch": 7.27408615300992, + "grad_norm": 0.9356278777122498, + "learning_rate": 9.106196494088871e-05, + "loss": 3.5385, + "step": 107060 + }, + { + "epoch": 7.274425873080582, + "grad_norm": 0.9361753463745117, + "learning_rate": 9.101949993205599e-05, + "loss": 3.3074, + "step": 107065 + }, + { + "epoch": 7.274765593151243, + "grad_norm": 0.7625493407249451, + "learning_rate": 9.097703492322326e-05, + "loss": 3.3219, + "step": 107070 + }, + { + "epoch": 7.275105313221905, + "grad_norm": 0.8126778602600098, + "learning_rate": 9.093456991439055e-05, + "loss": 3.4049, + "step": 107075 + }, + { + "epoch": 7.275445033292567, + "grad_norm": 0.8722221255302429, + "learning_rate": 9.089210490555783e-05, + "loss": 3.5159, + "step": 107080 + }, + { + "epoch": 7.2757847533632285, + "grad_norm": 0.7434508800506592, + "learning_rate": 9.08496398967251e-05, + "loss": 3.3816, + "step": 107085 + }, + { + "epoch": 7.276124473433891, + "grad_norm": 1.027294635772705, + "learning_rate": 9.080717488789238e-05, + "loss": 3.4073, + "step": 107090 + }, + { + "epoch": 7.276464193504553, + "grad_norm": 1.0637012720108032, + "learning_rate": 9.076470987905966e-05, + "loss": 3.3412, + "step": 107095 + }, + { + "epoch": 7.276803913575214, + "grad_norm": 0.960479199886322, + "learning_rate": 9.072224487022694e-05, + "loss": 3.3122, + "step": 107100 + }, + { + "epoch": 7.277143633645876, + "grad_norm": 1.2564722299575806, + "learning_rate": 9.067977986139422e-05, + "loss": 3.6348, + "step": 107105 + }, + { + "epoch": 7.277483353716538, + "grad_norm": 0.6841046810150146, + "learning_rate": 9.06373148525615e-05, + "loss": 3.4669, + "step": 107110 + }, + { + "epoch": 7.277823073787199, + "grad_norm": 0.9158651828765869, + "learning_rate": 9.059484984372878e-05, + "loss": 3.6915, + "step": 107115 + }, + { + "epoch": 7.278162793857861, + "grad_norm": 0.8700653910636902, + "learning_rate": 9.055238483489604e-05, + "loss": 3.3406, + "step": 107120 + }, + { + "epoch": 7.278502513928522, + "grad_norm": 2.616039514541626, + "learning_rate": 9.050991982606332e-05, + "loss": 3.3759, + "step": 107125 + }, + { + "epoch": 7.2788422339991845, + "grad_norm": 0.9320615530014038, + "learning_rate": 9.04674548172306e-05, + "loss": 3.4856, + "step": 107130 + }, + { + "epoch": 7.279181954069847, + "grad_norm": 0.9373964071273804, + "learning_rate": 9.042498980839787e-05, + "loss": 3.1934, + "step": 107135 + }, + { + "epoch": 7.279521674140508, + "grad_norm": 1.0204733610153198, + "learning_rate": 9.038252479956516e-05, + "loss": 3.1431, + "step": 107140 + }, + { + "epoch": 7.27986139421117, + "grad_norm": 0.8938197493553162, + "learning_rate": 9.034005979073244e-05, + "loss": 3.3705, + "step": 107145 + }, + { + "epoch": 7.280201114281832, + "grad_norm": 1.2687907218933105, + "learning_rate": 9.029759478189972e-05, + "loss": 3.3389, + "step": 107150 + }, + { + "epoch": 7.280540834352493, + "grad_norm": 0.8161853551864624, + "learning_rate": 9.025512977306699e-05, + "loss": 3.2223, + "step": 107155 + }, + { + "epoch": 7.280880554423155, + "grad_norm": 0.8815144896507263, + "learning_rate": 9.021266476423427e-05, + "loss": 3.2846, + "step": 107160 + }, + { + "epoch": 7.281220274493817, + "grad_norm": 0.9384636282920837, + "learning_rate": 9.017019975540155e-05, + "loss": 3.3587, + "step": 107165 + }, + { + "epoch": 7.2815599945644784, + "grad_norm": 1.075742483139038, + "learning_rate": 9.012773474656883e-05, + "loss": 3.2334, + "step": 107170 + }, + { + "epoch": 7.2818997146351405, + "grad_norm": 1.0741455554962158, + "learning_rate": 9.008526973773611e-05, + "loss": 3.4734, + "step": 107175 + }, + { + "epoch": 7.282239434705803, + "grad_norm": 0.9290394186973572, + "learning_rate": 9.004280472890339e-05, + "loss": 3.3803, + "step": 107180 + }, + { + "epoch": 7.282579154776464, + "grad_norm": 0.972166895866394, + "learning_rate": 9.000033972007067e-05, + "loss": 3.4297, + "step": 107185 + }, + { + "epoch": 7.282918874847126, + "grad_norm": 1.1440999507904053, + "learning_rate": 8.995787471123794e-05, + "loss": 3.3538, + "step": 107190 + }, + { + "epoch": 7.283258594917788, + "grad_norm": 0.9842727780342102, + "learning_rate": 8.991540970240522e-05, + "loss": 3.278, + "step": 107195 + }, + { + "epoch": 7.283598314988449, + "grad_norm": 0.9229713678359985, + "learning_rate": 8.98729446935725e-05, + "loss": 3.4394, + "step": 107200 + }, + { + "epoch": 7.283938035059111, + "grad_norm": 0.988594114780426, + "learning_rate": 8.983047968473978e-05, + "loss": 3.2025, + "step": 107205 + }, + { + "epoch": 7.284277755129773, + "grad_norm": 0.8584747910499573, + "learning_rate": 8.978801467590706e-05, + "loss": 3.3395, + "step": 107210 + }, + { + "epoch": 7.2846174752004345, + "grad_norm": 0.8938697576522827, + "learning_rate": 8.974554966707434e-05, + "loss": 3.1333, + "step": 107215 + }, + { + "epoch": 7.2849571952710965, + "grad_norm": 1.1987535953521729, + "learning_rate": 8.97030846582416e-05, + "loss": 3.2478, + "step": 107220 + }, + { + "epoch": 7.285296915341759, + "grad_norm": 0.9493235945701599, + "learning_rate": 8.966061964940888e-05, + "loss": 3.3741, + "step": 107225 + }, + { + "epoch": 7.28563663541242, + "grad_norm": 1.581229329109192, + "learning_rate": 8.961815464057616e-05, + "loss": 3.7693, + "step": 107230 + }, + { + "epoch": 7.285976355483082, + "grad_norm": 1.2339736223220825, + "learning_rate": 8.957568963174346e-05, + "loss": 3.2022, + "step": 107235 + }, + { + "epoch": 7.286316075553744, + "grad_norm": 1.2398124933242798, + "learning_rate": 8.953322462291072e-05, + "loss": 3.2002, + "step": 107240 + }, + { + "epoch": 7.286655795624405, + "grad_norm": 1.2006813287734985, + "learning_rate": 8.9490759614078e-05, + "loss": 3.254, + "step": 107245 + }, + { + "epoch": 7.286995515695067, + "grad_norm": 1.0216896533966064, + "learning_rate": 8.944829460524528e-05, + "loss": 3.1712, + "step": 107250 + }, + { + "epoch": 7.287335235765729, + "grad_norm": 1.0015507936477661, + "learning_rate": 8.940582959641255e-05, + "loss": 3.5012, + "step": 107255 + }, + { + "epoch": 7.2876749558363905, + "grad_norm": 0.8238083124160767, + "learning_rate": 8.936336458757983e-05, + "loss": 3.5109, + "step": 107260 + }, + { + "epoch": 7.2880146759070525, + "grad_norm": 0.9421113133430481, + "learning_rate": 8.932089957874711e-05, + "loss": 3.2485, + "step": 107265 + }, + { + "epoch": 7.288354395977715, + "grad_norm": 0.8317088484764099, + "learning_rate": 8.92784345699144e-05, + "loss": 3.3968, + "step": 107270 + }, + { + "epoch": 7.288694116048376, + "grad_norm": 1.0804107189178467, + "learning_rate": 8.923596956108167e-05, + "loss": 3.341, + "step": 107275 + }, + { + "epoch": 7.289033836119038, + "grad_norm": 0.9724596738815308, + "learning_rate": 8.919350455224895e-05, + "loss": 3.0717, + "step": 107280 + }, + { + "epoch": 7.2893735561897, + "grad_norm": 0.8983426094055176, + "learning_rate": 8.915103954341623e-05, + "loss": 3.5236, + "step": 107285 + }, + { + "epoch": 7.289713276260361, + "grad_norm": 1.069204330444336, + "learning_rate": 8.91085745345835e-05, + "loss": 3.4165, + "step": 107290 + }, + { + "epoch": 7.290052996331023, + "grad_norm": 1.1735159158706665, + "learning_rate": 8.906610952575078e-05, + "loss": 3.4313, + "step": 107295 + }, + { + "epoch": 7.290392716401685, + "grad_norm": 1.1217705011367798, + "learning_rate": 8.902364451691807e-05, + "loss": 3.7031, + "step": 107300 + }, + { + "epoch": 7.2907324364723465, + "grad_norm": 0.9486626982688904, + "learning_rate": 8.898117950808534e-05, + "loss": 3.3322, + "step": 107305 + }, + { + "epoch": 7.2910721565430086, + "grad_norm": 0.8767917156219482, + "learning_rate": 8.893871449925262e-05, + "loss": 3.3699, + "step": 107310 + }, + { + "epoch": 7.291411876613671, + "grad_norm": 0.8593475222587585, + "learning_rate": 8.88962494904199e-05, + "loss": 3.5109, + "step": 107315 + }, + { + "epoch": 7.291751596684332, + "grad_norm": 0.9458880424499512, + "learning_rate": 8.885378448158718e-05, + "loss": 3.4451, + "step": 107320 + }, + { + "epoch": 7.292091316754994, + "grad_norm": 1.1174410581588745, + "learning_rate": 8.881131947275444e-05, + "loss": 3.5441, + "step": 107325 + }, + { + "epoch": 7.292431036825656, + "grad_norm": 0.9063006043434143, + "learning_rate": 8.876885446392172e-05, + "loss": 3.5722, + "step": 107330 + }, + { + "epoch": 7.292770756896317, + "grad_norm": 0.8687930107116699, + "learning_rate": 8.872638945508902e-05, + "loss": 3.2806, + "step": 107335 + }, + { + "epoch": 7.293110476966979, + "grad_norm": 1.653217077255249, + "learning_rate": 8.868392444625628e-05, + "loss": 3.5805, + "step": 107340 + }, + { + "epoch": 7.293450197037641, + "grad_norm": 0.8219819068908691, + "learning_rate": 8.864145943742356e-05, + "loss": 3.7473, + "step": 107345 + }, + { + "epoch": 7.2937899171083025, + "grad_norm": 0.8829936981201172, + "learning_rate": 8.859899442859084e-05, + "loss": 3.5984, + "step": 107350 + }, + { + "epoch": 7.294129637178965, + "grad_norm": 1.067057490348816, + "learning_rate": 8.855652941975813e-05, + "loss": 3.176, + "step": 107355 + }, + { + "epoch": 7.294469357249627, + "grad_norm": 0.7854785323143005, + "learning_rate": 8.851406441092539e-05, + "loss": 3.3656, + "step": 107360 + }, + { + "epoch": 7.294809077320288, + "grad_norm": 0.7951167225837708, + "learning_rate": 8.847159940209269e-05, + "loss": 3.5432, + "step": 107365 + }, + { + "epoch": 7.29514879739095, + "grad_norm": 1.014304280281067, + "learning_rate": 8.842913439325997e-05, + "loss": 3.4173, + "step": 107370 + }, + { + "epoch": 7.295488517461612, + "grad_norm": 0.8325261473655701, + "learning_rate": 8.838666938442723e-05, + "loss": 3.3426, + "step": 107375 + }, + { + "epoch": 7.295828237532273, + "grad_norm": 1.0489182472229004, + "learning_rate": 8.834420437559451e-05, + "loss": 3.3377, + "step": 107380 + }, + { + "epoch": 7.296167957602935, + "grad_norm": 0.9725972414016724, + "learning_rate": 8.830173936676179e-05, + "loss": 3.3835, + "step": 107385 + }, + { + "epoch": 7.296507677673597, + "grad_norm": 0.8716647028923035, + "learning_rate": 8.825927435792906e-05, + "loss": 3.5738, + "step": 107390 + }, + { + "epoch": 7.2968473977442585, + "grad_norm": 0.9838634729385376, + "learning_rate": 8.821680934909634e-05, + "loss": 3.3804, + "step": 107395 + }, + { + "epoch": 7.297187117814921, + "grad_norm": 0.9392914772033691, + "learning_rate": 8.817434434026363e-05, + "loss": 3.5634, + "step": 107400 + }, + { + "epoch": 7.297526837885583, + "grad_norm": 1.542887568473816, + "learning_rate": 8.813187933143091e-05, + "loss": 3.3807, + "step": 107405 + }, + { + "epoch": 7.297866557956244, + "grad_norm": 0.9785023331642151, + "learning_rate": 8.808941432259818e-05, + "loss": 3.4887, + "step": 107410 + }, + { + "epoch": 7.298206278026906, + "grad_norm": 0.8856414556503296, + "learning_rate": 8.804694931376546e-05, + "loss": 3.4202, + "step": 107415 + }, + { + "epoch": 7.298545998097568, + "grad_norm": 1.0327752828598022, + "learning_rate": 8.800448430493274e-05, + "loss": 3.4438, + "step": 107420 + }, + { + "epoch": 7.298885718168229, + "grad_norm": 0.8928911685943604, + "learning_rate": 8.79620192961e-05, + "loss": 3.3863, + "step": 107425 + }, + { + "epoch": 7.299225438238891, + "grad_norm": 0.8369243741035461, + "learning_rate": 8.791955428726729e-05, + "loss": 3.4243, + "step": 107430 + }, + { + "epoch": 7.299565158309553, + "grad_norm": 0.9383416175842285, + "learning_rate": 8.787708927843458e-05, + "loss": 3.1707, + "step": 107435 + }, + { + "epoch": 7.2999048783802145, + "grad_norm": 0.8058355450630188, + "learning_rate": 8.783462426960186e-05, + "loss": 3.5005, + "step": 107440 + }, + { + "epoch": 7.300244598450877, + "grad_norm": 0.8540806770324707, + "learning_rate": 8.779215926076913e-05, + "loss": 3.0953, + "step": 107445 + }, + { + "epoch": 7.300584318521539, + "grad_norm": 0.9630956053733826, + "learning_rate": 8.77496942519364e-05, + "loss": 3.4383, + "step": 107450 + }, + { + "epoch": 7.3009240385922, + "grad_norm": 0.7776601910591125, + "learning_rate": 8.770722924310369e-05, + "loss": 3.2743, + "step": 107455 + }, + { + "epoch": 7.301263758662862, + "grad_norm": 0.9591835141181946, + "learning_rate": 8.766476423427095e-05, + "loss": 3.4567, + "step": 107460 + }, + { + "epoch": 7.301603478733524, + "grad_norm": 0.8215896487236023, + "learning_rate": 8.762229922543825e-05, + "loss": 3.3089, + "step": 107465 + }, + { + "epoch": 7.301943198804185, + "grad_norm": 1.2328009605407715, + "learning_rate": 8.757983421660553e-05, + "loss": 3.5177, + "step": 107470 + }, + { + "epoch": 7.302282918874847, + "grad_norm": 1.0653401613235474, + "learning_rate": 8.753736920777279e-05, + "loss": 3.3057, + "step": 107475 + }, + { + "epoch": 7.302622638945509, + "grad_norm": 1.1605507135391235, + "learning_rate": 8.749490419894007e-05, + "loss": 3.4816, + "step": 107480 + }, + { + "epoch": 7.3029623590161705, + "grad_norm": 0.9589990973472595, + "learning_rate": 8.745243919010735e-05, + "loss": 3.4111, + "step": 107485 + }, + { + "epoch": 7.303302079086833, + "grad_norm": 0.8885660767555237, + "learning_rate": 8.740997418127463e-05, + "loss": 3.1911, + "step": 107490 + }, + { + "epoch": 7.303641799157495, + "grad_norm": 1.0296975374221802, + "learning_rate": 8.73675091724419e-05, + "loss": 3.1227, + "step": 107495 + }, + { + "epoch": 7.303981519228156, + "grad_norm": 1.073764443397522, + "learning_rate": 8.732504416360919e-05, + "loss": 3.2239, + "step": 107500 + }, + { + "epoch": 7.304321239298818, + "grad_norm": 0.8247879147529602, + "learning_rate": 8.728257915477647e-05, + "loss": 3.5568, + "step": 107505 + }, + { + "epoch": 7.30466095936948, + "grad_norm": 1.02059805393219, + "learning_rate": 8.724011414594374e-05, + "loss": 2.9982, + "step": 107510 + }, + { + "epoch": 7.305000679440141, + "grad_norm": 1.1185179948806763, + "learning_rate": 8.719764913711102e-05, + "loss": 3.3208, + "step": 107515 + }, + { + "epoch": 7.305340399510803, + "grad_norm": 0.8709661364555359, + "learning_rate": 8.71551841282783e-05, + "loss": 3.5211, + "step": 107520 + }, + { + "epoch": 7.3056801195814645, + "grad_norm": 1.037195086479187, + "learning_rate": 8.711271911944558e-05, + "loss": 3.2598, + "step": 107525 + }, + { + "epoch": 7.3060198396521265, + "grad_norm": 0.8449952602386475, + "learning_rate": 8.707025411061286e-05, + "loss": 3.4189, + "step": 107530 + }, + { + "epoch": 7.306359559722789, + "grad_norm": 0.9405316114425659, + "learning_rate": 8.702778910178014e-05, + "loss": 3.6862, + "step": 107535 + }, + { + "epoch": 7.30669927979345, + "grad_norm": 1.1354985237121582, + "learning_rate": 8.698532409294742e-05, + "loss": 3.2079, + "step": 107540 + }, + { + "epoch": 7.307038999864112, + "grad_norm": 0.8565885424613953, + "learning_rate": 8.694285908411469e-05, + "loss": 3.1249, + "step": 107545 + }, + { + "epoch": 7.307378719934774, + "grad_norm": 1.0312999486923218, + "learning_rate": 8.690039407528197e-05, + "loss": 3.4763, + "step": 107550 + }, + { + "epoch": 7.307718440005435, + "grad_norm": 0.9970328211784363, + "learning_rate": 8.685792906644925e-05, + "loss": 3.6991, + "step": 107555 + }, + { + "epoch": 7.308058160076097, + "grad_norm": 0.9780045747756958, + "learning_rate": 8.681546405761651e-05, + "loss": 3.5609, + "step": 107560 + }, + { + "epoch": 7.308397880146759, + "grad_norm": 0.7496859431266785, + "learning_rate": 8.678149205055035e-05, + "loss": 3.3253, + "step": 107565 + }, + { + "epoch": 7.3087376002174205, + "grad_norm": 1.0353810787200928, + "learning_rate": 8.673902704171763e-05, + "loss": 3.7142, + "step": 107570 + }, + { + "epoch": 7.3090773202880825, + "grad_norm": 1.1686391830444336, + "learning_rate": 8.669656203288491e-05, + "loss": 3.2744, + "step": 107575 + }, + { + "epoch": 7.309417040358745, + "grad_norm": 0.9290786385536194, + "learning_rate": 8.665409702405218e-05, + "loss": 3.4293, + "step": 107580 + }, + { + "epoch": 7.309756760429406, + "grad_norm": 1.8293100595474243, + "learning_rate": 8.661163201521946e-05, + "loss": 3.5056, + "step": 107585 + }, + { + "epoch": 7.310096480500068, + "grad_norm": 0.8697346448898315, + "learning_rate": 8.656916700638674e-05, + "loss": 3.3101, + "step": 107590 + }, + { + "epoch": 7.31043620057073, + "grad_norm": 1.232410192489624, + "learning_rate": 8.6526701997554e-05, + "loss": 3.4023, + "step": 107595 + }, + { + "epoch": 7.310775920641391, + "grad_norm": 0.9468633532524109, + "learning_rate": 8.64842369887213e-05, + "loss": 3.5182, + "step": 107600 + }, + { + "epoch": 7.311115640712053, + "grad_norm": 0.9186557531356812, + "learning_rate": 8.644177197988858e-05, + "loss": 3.3705, + "step": 107605 + }, + { + "epoch": 7.311455360782715, + "grad_norm": 0.9161437749862671, + "learning_rate": 8.639930697105586e-05, + "loss": 3.557, + "step": 107610 + }, + { + "epoch": 7.3117950808533765, + "grad_norm": 0.9443641901016235, + "learning_rate": 8.635684196222312e-05, + "loss": 3.4499, + "step": 107615 + }, + { + "epoch": 7.312134800924039, + "grad_norm": 0.8891390562057495, + "learning_rate": 8.63143769533904e-05, + "loss": 3.4067, + "step": 107620 + }, + { + "epoch": 7.312474520994701, + "grad_norm": 0.8889767527580261, + "learning_rate": 8.627191194455768e-05, + "loss": 3.3173, + "step": 107625 + }, + { + "epoch": 7.312814241065362, + "grad_norm": 0.9084475040435791, + "learning_rate": 8.622944693572496e-05, + "loss": 3.5436, + "step": 107630 + }, + { + "epoch": 7.313153961136024, + "grad_norm": 0.7716243863105774, + "learning_rate": 8.618698192689224e-05, + "loss": 3.3487, + "step": 107635 + }, + { + "epoch": 7.313493681206686, + "grad_norm": 1.1844761371612549, + "learning_rate": 8.614451691805952e-05, + "loss": 3.2306, + "step": 107640 + }, + { + "epoch": 7.313833401277347, + "grad_norm": 0.779236376285553, + "learning_rate": 8.61020519092268e-05, + "loss": 3.4464, + "step": 107645 + }, + { + "epoch": 7.314173121348009, + "grad_norm": 0.9062373042106628, + "learning_rate": 8.605958690039407e-05, + "loss": 3.3036, + "step": 107650 + }, + { + "epoch": 7.314512841418671, + "grad_norm": 1.0305867195129395, + "learning_rate": 8.601712189156135e-05, + "loss": 3.2223, + "step": 107655 + }, + { + "epoch": 7.3148525614893325, + "grad_norm": 0.8595685362815857, + "learning_rate": 8.597465688272865e-05, + "loss": 3.1088, + "step": 107660 + }, + { + "epoch": 7.315192281559995, + "grad_norm": 1.1234357357025146, + "learning_rate": 8.593219187389591e-05, + "loss": 3.4283, + "step": 107665 + }, + { + "epoch": 7.315532001630657, + "grad_norm": 0.8695744872093201, + "learning_rate": 8.588972686506319e-05, + "loss": 3.5954, + "step": 107670 + }, + { + "epoch": 7.315871721701318, + "grad_norm": 2.712257146835327, + "learning_rate": 8.584726185623047e-05, + "loss": 3.6663, + "step": 107675 + }, + { + "epoch": 7.31621144177198, + "grad_norm": 0.8600691556930542, + "learning_rate": 8.580479684739774e-05, + "loss": 3.423, + "step": 107680 + }, + { + "epoch": 7.316551161842642, + "grad_norm": 0.7528800368309021, + "learning_rate": 8.576233183856502e-05, + "loss": 3.1854, + "step": 107685 + }, + { + "epoch": 7.316890881913303, + "grad_norm": 0.8991795182228088, + "learning_rate": 8.57198668297323e-05, + "loss": 3.5679, + "step": 107690 + }, + { + "epoch": 7.317230601983965, + "grad_norm": 0.8111313581466675, + "learning_rate": 8.567740182089959e-05, + "loss": 3.2664, + "step": 107695 + }, + { + "epoch": 7.317570322054627, + "grad_norm": 0.8345276713371277, + "learning_rate": 8.563493681206686e-05, + "loss": 3.3775, + "step": 107700 + }, + { + "epoch": 7.3179100421252885, + "grad_norm": 1.0650897026062012, + "learning_rate": 8.559247180323414e-05, + "loss": 3.6308, + "step": 107705 + }, + { + "epoch": 7.318249762195951, + "grad_norm": 1.287412405014038, + "learning_rate": 8.555000679440142e-05, + "loss": 3.3055, + "step": 107710 + }, + { + "epoch": 7.318589482266613, + "grad_norm": 1.070698618888855, + "learning_rate": 8.550754178556869e-05, + "loss": 3.5762, + "step": 107715 + }, + { + "epoch": 7.318929202337274, + "grad_norm": 0.9314174056053162, + "learning_rate": 8.546507677673597e-05, + "loss": 3.6921, + "step": 107720 + }, + { + "epoch": 7.319268922407936, + "grad_norm": 0.8944305777549744, + "learning_rate": 8.542261176790325e-05, + "loss": 3.3632, + "step": 107725 + }, + { + "epoch": 7.319608642478598, + "grad_norm": 1.0062872171401978, + "learning_rate": 8.538014675907054e-05, + "loss": 3.4221, + "step": 107730 + }, + { + "epoch": 7.319948362549259, + "grad_norm": 1.0917764902114868, + "learning_rate": 8.53376817502378e-05, + "loss": 3.2598, + "step": 107735 + }, + { + "epoch": 7.320288082619921, + "grad_norm": 1.1222434043884277, + "learning_rate": 8.529521674140509e-05, + "loss": 3.4245, + "step": 107740 + }, + { + "epoch": 7.320627802690583, + "grad_norm": 0.8587983846664429, + "learning_rate": 8.525275173257237e-05, + "loss": 3.3712, + "step": 107745 + }, + { + "epoch": 7.3209675227612445, + "grad_norm": 0.9645965695381165, + "learning_rate": 8.521028672373963e-05, + "loss": 3.5109, + "step": 107750 + }, + { + "epoch": 7.321307242831907, + "grad_norm": 16.520050048828125, + "learning_rate": 8.516782171490691e-05, + "loss": 3.0273, + "step": 107755 + }, + { + "epoch": 7.321646962902569, + "grad_norm": 0.7892425060272217, + "learning_rate": 8.51253567060742e-05, + "loss": 3.3453, + "step": 107760 + }, + { + "epoch": 7.32198668297323, + "grad_norm": 0.8537120223045349, + "learning_rate": 8.508289169724147e-05, + "loss": 3.2771, + "step": 107765 + }, + { + "epoch": 7.322326403043892, + "grad_norm": 0.9980531930923462, + "learning_rate": 8.504042668840875e-05, + "loss": 3.2724, + "step": 107770 + }, + { + "epoch": 7.322666123114554, + "grad_norm": 0.9071036577224731, + "learning_rate": 8.499796167957603e-05, + "loss": 3.2189, + "step": 107775 + }, + { + "epoch": 7.323005843185215, + "grad_norm": 0.8919150233268738, + "learning_rate": 8.495549667074331e-05, + "loss": 3.5768, + "step": 107780 + }, + { + "epoch": 7.323345563255877, + "grad_norm": 2.142904281616211, + "learning_rate": 8.491303166191058e-05, + "loss": 3.3968, + "step": 107785 + }, + { + "epoch": 7.323685283326539, + "grad_norm": 1.0454962253570557, + "learning_rate": 8.487056665307786e-05, + "loss": 3.1896, + "step": 107790 + }, + { + "epoch": 7.3240250033972005, + "grad_norm": 0.8359812498092651, + "learning_rate": 8.482810164424515e-05, + "loss": 3.418, + "step": 107795 + }, + { + "epoch": 7.324364723467863, + "grad_norm": 1.1642664670944214, + "learning_rate": 8.478563663541242e-05, + "loss": 3.4431, + "step": 107800 + }, + { + "epoch": 7.324704443538524, + "grad_norm": 0.9888619780540466, + "learning_rate": 8.47431716265797e-05, + "loss": 3.3975, + "step": 107805 + }, + { + "epoch": 7.325044163609186, + "grad_norm": 0.8617981672286987, + "learning_rate": 8.470070661774698e-05, + "loss": 3.5446, + "step": 107810 + }, + { + "epoch": 7.325383883679848, + "grad_norm": 0.8279331922531128, + "learning_rate": 8.465824160891426e-05, + "loss": 3.4235, + "step": 107815 + }, + { + "epoch": 7.325723603750509, + "grad_norm": 0.8541170954704285, + "learning_rate": 8.461577660008153e-05, + "loss": 3.352, + "step": 107820 + }, + { + "epoch": 7.326063323821171, + "grad_norm": 0.791886568069458, + "learning_rate": 8.457331159124882e-05, + "loss": 3.3005, + "step": 107825 + }, + { + "epoch": 7.326403043891833, + "grad_norm": 0.953693151473999, + "learning_rate": 8.45308465824161e-05, + "loss": 3.4491, + "step": 107830 + }, + { + "epoch": 7.3267427639624945, + "grad_norm": 0.8389368653297424, + "learning_rate": 8.448838157358337e-05, + "loss": 3.5244, + "step": 107835 + }, + { + "epoch": 7.3270824840331565, + "grad_norm": 0.9229384064674377, + "learning_rate": 8.444591656475065e-05, + "loss": 3.3353, + "step": 107840 + }, + { + "epoch": 7.327422204103819, + "grad_norm": 0.8707290887832642, + "learning_rate": 8.440345155591793e-05, + "loss": 3.2628, + "step": 107845 + }, + { + "epoch": 7.32776192417448, + "grad_norm": 0.9839906096458435, + "learning_rate": 8.43609865470852e-05, + "loss": 3.5098, + "step": 107850 + }, + { + "epoch": 7.328101644245142, + "grad_norm": 0.946179986000061, + "learning_rate": 8.431852153825247e-05, + "loss": 3.486, + "step": 107855 + }, + { + "epoch": 7.328441364315804, + "grad_norm": 0.7631790041923523, + "learning_rate": 8.427605652941977e-05, + "loss": 3.406, + "step": 107860 + }, + { + "epoch": 7.328781084386465, + "grad_norm": 0.8271942734718323, + "learning_rate": 8.423359152058705e-05, + "loss": 3.47, + "step": 107865 + }, + { + "epoch": 7.329120804457127, + "grad_norm": 1.0625298023223877, + "learning_rate": 8.419112651175431e-05, + "loss": 3.525, + "step": 107870 + }, + { + "epoch": 7.329460524527789, + "grad_norm": 0.8718134164810181, + "learning_rate": 8.41486615029216e-05, + "loss": 3.4798, + "step": 107875 + }, + { + "epoch": 7.3298002445984505, + "grad_norm": 0.9799123406410217, + "learning_rate": 8.410619649408887e-05, + "loss": 3.4352, + "step": 107880 + }, + { + "epoch": 7.3301399646691126, + "grad_norm": 0.996215283870697, + "learning_rate": 8.406373148525614e-05, + "loss": 3.5329, + "step": 107885 + }, + { + "epoch": 7.330479684739775, + "grad_norm": 0.7794317007064819, + "learning_rate": 8.402126647642343e-05, + "loss": 3.5537, + "step": 107890 + }, + { + "epoch": 7.330819404810436, + "grad_norm": 0.8613622188568115, + "learning_rate": 8.397880146759071e-05, + "loss": 3.3274, + "step": 107895 + }, + { + "epoch": 7.331159124881098, + "grad_norm": 0.9069505333900452, + "learning_rate": 8.3936336458758e-05, + "loss": 3.6471, + "step": 107900 + }, + { + "epoch": 7.33149884495176, + "grad_norm": 0.9127135276794434, + "learning_rate": 8.389387144992526e-05, + "loss": 3.3637, + "step": 107905 + }, + { + "epoch": 7.331838565022421, + "grad_norm": 0.9266355633735657, + "learning_rate": 8.385140644109254e-05, + "loss": 3.4665, + "step": 107910 + }, + { + "epoch": 7.332178285093083, + "grad_norm": 1.0371819734573364, + "learning_rate": 8.380894143225982e-05, + "loss": 3.4656, + "step": 107915 + }, + { + "epoch": 7.332518005163745, + "grad_norm": 1.0362948179244995, + "learning_rate": 8.376647642342709e-05, + "loss": 3.3731, + "step": 107920 + }, + { + "epoch": 7.3328577252344065, + "grad_norm": 0.8675819635391235, + "learning_rate": 8.372401141459438e-05, + "loss": 3.2323, + "step": 107925 + }, + { + "epoch": 7.333197445305069, + "grad_norm": 0.9931822419166565, + "learning_rate": 8.368154640576166e-05, + "loss": 3.2307, + "step": 107930 + }, + { + "epoch": 7.333537165375731, + "grad_norm": 0.9308160543441772, + "learning_rate": 8.363908139692893e-05, + "loss": 3.5938, + "step": 107935 + }, + { + "epoch": 7.333876885446392, + "grad_norm": 0.9822877049446106, + "learning_rate": 8.359661638809621e-05, + "loss": 3.508, + "step": 107940 + }, + { + "epoch": 7.334216605517054, + "grad_norm": 0.9348020553588867, + "learning_rate": 8.355415137926349e-05, + "loss": 3.5109, + "step": 107945 + }, + { + "epoch": 7.334556325587716, + "grad_norm": 0.9548328518867493, + "learning_rate": 8.351168637043077e-05, + "loss": 3.3717, + "step": 107950 + }, + { + "epoch": 7.334896045658377, + "grad_norm": 0.8203341960906982, + "learning_rate": 8.346922136159805e-05, + "loss": 3.4749, + "step": 107955 + }, + { + "epoch": 7.335235765729039, + "grad_norm": 0.8391079306602478, + "learning_rate": 8.342675635276533e-05, + "loss": 3.2306, + "step": 107960 + }, + { + "epoch": 7.335575485799701, + "grad_norm": 0.9815139174461365, + "learning_rate": 8.338429134393261e-05, + "loss": 3.2942, + "step": 107965 + }, + { + "epoch": 7.3359152058703625, + "grad_norm": 1.025807499885559, + "learning_rate": 8.334182633509987e-05, + "loss": 3.0352, + "step": 107970 + }, + { + "epoch": 7.336254925941025, + "grad_norm": 0.9787268042564392, + "learning_rate": 8.329936132626715e-05, + "loss": 3.1916, + "step": 107975 + }, + { + "epoch": 7.336594646011687, + "grad_norm": 1.003245234489441, + "learning_rate": 8.325689631743443e-05, + "loss": 3.388, + "step": 107980 + }, + { + "epoch": 7.336934366082348, + "grad_norm": 0.9315221905708313, + "learning_rate": 8.321443130860171e-05, + "loss": 3.7112, + "step": 107985 + }, + { + "epoch": 7.33727408615301, + "grad_norm": 0.7620629072189331, + "learning_rate": 8.3171966299769e-05, + "loss": 3.27, + "step": 107990 + }, + { + "epoch": 7.337613806223672, + "grad_norm": 1.524202585220337, + "learning_rate": 8.312950129093627e-05, + "loss": 3.4938, + "step": 107995 + }, + { + "epoch": 7.337953526294333, + "grad_norm": 1.0228829383850098, + "learning_rate": 8.308703628210355e-05, + "loss": 3.3656, + "step": 108000 + }, + { + "epoch": 7.338293246364995, + "grad_norm": 1.0066057443618774, + "learning_rate": 8.304457127327082e-05, + "loss": 3.5127, + "step": 108005 + }, + { + "epoch": 7.338632966435657, + "grad_norm": 2.894225597381592, + "learning_rate": 8.30021062644381e-05, + "loss": 3.4752, + "step": 108010 + }, + { + "epoch": 7.3389726865063185, + "grad_norm": 0.7255954742431641, + "learning_rate": 8.295964125560538e-05, + "loss": 3.5582, + "step": 108015 + }, + { + "epoch": 7.339312406576981, + "grad_norm": 1.1371551752090454, + "learning_rate": 8.291717624677265e-05, + "loss": 3.4148, + "step": 108020 + }, + { + "epoch": 7.339652126647643, + "grad_norm": 0.8952188491821289, + "learning_rate": 8.287471123793994e-05, + "loss": 3.4829, + "step": 108025 + }, + { + "epoch": 7.339991846718304, + "grad_norm": 0.9371165037155151, + "learning_rate": 8.283224622910722e-05, + "loss": 3.6008, + "step": 108030 + }, + { + "epoch": 7.340331566788966, + "grad_norm": 1.0306569337844849, + "learning_rate": 8.27897812202745e-05, + "loss": 3.3636, + "step": 108035 + }, + { + "epoch": 7.340671286859628, + "grad_norm": 0.9362478256225586, + "learning_rate": 8.274731621144177e-05, + "loss": 3.4185, + "step": 108040 + }, + { + "epoch": 7.341011006930289, + "grad_norm": 1.1194589138031006, + "learning_rate": 8.270485120260905e-05, + "loss": 3.7121, + "step": 108045 + }, + { + "epoch": 7.341350727000951, + "grad_norm": 1.1808515787124634, + "learning_rate": 8.266238619377633e-05, + "loss": 3.2927, + "step": 108050 + }, + { + "epoch": 7.341690447071613, + "grad_norm": 0.8906660079956055, + "learning_rate": 8.261992118494361e-05, + "loss": 3.4385, + "step": 108055 + }, + { + "epoch": 7.3420301671422745, + "grad_norm": 0.8188360333442688, + "learning_rate": 8.257745617611089e-05, + "loss": 3.511, + "step": 108060 + }, + { + "epoch": 7.342369887212937, + "grad_norm": 0.8082699775695801, + "learning_rate": 8.253499116727817e-05, + "loss": 3.3914, + "step": 108065 + }, + { + "epoch": 7.342709607283599, + "grad_norm": 0.7165114879608154, + "learning_rate": 8.249252615844545e-05, + "loss": 3.5545, + "step": 108070 + }, + { + "epoch": 7.34304932735426, + "grad_norm": 0.864155650138855, + "learning_rate": 8.245006114961272e-05, + "loss": 3.3111, + "step": 108075 + }, + { + "epoch": 7.343389047424922, + "grad_norm": 0.7930484414100647, + "learning_rate": 8.240759614078e-05, + "loss": 3.4395, + "step": 108080 + }, + { + "epoch": 7.343728767495584, + "grad_norm": 0.765216052532196, + "learning_rate": 8.236513113194728e-05, + "loss": 3.4655, + "step": 108085 + }, + { + "epoch": 7.344068487566245, + "grad_norm": 0.9517468810081482, + "learning_rate": 8.232266612311456e-05, + "loss": 3.3079, + "step": 108090 + }, + { + "epoch": 7.344408207636907, + "grad_norm": 0.9252774715423584, + "learning_rate": 8.228020111428184e-05, + "loss": 3.2603, + "step": 108095 + }, + { + "epoch": 7.344747927707569, + "grad_norm": 0.770166277885437, + "learning_rate": 8.223773610544912e-05, + "loss": 3.1298, + "step": 108100 + }, + { + "epoch": 7.3450876477782305, + "grad_norm": 0.7698420882225037, + "learning_rate": 8.219527109661638e-05, + "loss": 3.4351, + "step": 108105 + }, + { + "epoch": 7.345427367848893, + "grad_norm": 0.7821729183197021, + "learning_rate": 8.215280608778366e-05, + "loss": 3.8083, + "step": 108110 + }, + { + "epoch": 7.345767087919555, + "grad_norm": 0.6487552523612976, + "learning_rate": 8.211034107895094e-05, + "loss": 3.4037, + "step": 108115 + }, + { + "epoch": 7.346106807990216, + "grad_norm": 0.9583954811096191, + "learning_rate": 8.206787607011824e-05, + "loss": 3.0173, + "step": 108120 + }, + { + "epoch": 7.346446528060878, + "grad_norm": 1.0351208448410034, + "learning_rate": 8.20254110612855e-05, + "loss": 3.5311, + "step": 108125 + }, + { + "epoch": 7.34678624813154, + "grad_norm": 1.1160820722579956, + "learning_rate": 8.198294605245278e-05, + "loss": 3.3393, + "step": 108130 + }, + { + "epoch": 7.347125968202201, + "grad_norm": 0.9219271540641785, + "learning_rate": 8.194048104362006e-05, + "loss": 3.1356, + "step": 108135 + }, + { + "epoch": 7.347465688272863, + "grad_norm": 0.8252049088478088, + "learning_rate": 8.189801603478733e-05, + "loss": 3.2276, + "step": 108140 + }, + { + "epoch": 7.347805408343525, + "grad_norm": 3.945964813232422, + "learning_rate": 8.185555102595461e-05, + "loss": 3.1186, + "step": 108145 + }, + { + "epoch": 7.3481451284141865, + "grad_norm": 1.0410008430480957, + "learning_rate": 8.181308601712189e-05, + "loss": 3.4825, + "step": 108150 + }, + { + "epoch": 7.348484848484849, + "grad_norm": 1.02815580368042, + "learning_rate": 8.177062100828918e-05, + "loss": 3.3653, + "step": 108155 + }, + { + "epoch": 7.348824568555511, + "grad_norm": 0.8430343270301819, + "learning_rate": 8.172815599945645e-05, + "loss": 3.5745, + "step": 108160 + }, + { + "epoch": 7.349164288626172, + "grad_norm": 1.0164819955825806, + "learning_rate": 8.168569099062373e-05, + "loss": 3.5167, + "step": 108165 + }, + { + "epoch": 7.349504008696834, + "grad_norm": 0.9788842797279358, + "learning_rate": 8.164322598179101e-05, + "loss": 3.3443, + "step": 108170 + }, + { + "epoch": 7.349843728767496, + "grad_norm": 0.8705429434776306, + "learning_rate": 8.160076097295828e-05, + "loss": 3.5047, + "step": 108175 + }, + { + "epoch": 7.350183448838157, + "grad_norm": 0.8463244438171387, + "learning_rate": 8.155829596412556e-05, + "loss": 3.6705, + "step": 108180 + }, + { + "epoch": 7.350523168908819, + "grad_norm": 0.8032101988792419, + "learning_rate": 8.151583095529285e-05, + "loss": 3.2361, + "step": 108185 + }, + { + "epoch": 7.350862888979481, + "grad_norm": 0.9860422611236572, + "learning_rate": 8.147336594646012e-05, + "loss": 3.4855, + "step": 108190 + }, + { + "epoch": 7.3512026090501426, + "grad_norm": 0.9160711169242859, + "learning_rate": 8.14309009376274e-05, + "loss": 3.4277, + "step": 108195 + }, + { + "epoch": 7.351542329120805, + "grad_norm": 1.1140798330307007, + "learning_rate": 8.138843592879468e-05, + "loss": 3.1075, + "step": 108200 + }, + { + "epoch": 7.351882049191466, + "grad_norm": 1.0478521585464478, + "learning_rate": 8.134597091996196e-05, + "loss": 3.2777, + "step": 108205 + }, + { + "epoch": 7.352221769262128, + "grad_norm": 0.8166481852531433, + "learning_rate": 8.130350591112922e-05, + "loss": 3.4482, + "step": 108210 + }, + { + "epoch": 7.35256148933279, + "grad_norm": 0.7869060039520264, + "learning_rate": 8.12610409022965e-05, + "loss": 3.621, + "step": 108215 + }, + { + "epoch": 7.352901209403451, + "grad_norm": 0.8418645858764648, + "learning_rate": 8.12185758934638e-05, + "loss": 3.3947, + "step": 108220 + }, + { + "epoch": 7.353240929474113, + "grad_norm": 0.8825634717941284, + "learning_rate": 8.117611088463106e-05, + "loss": 3.4847, + "step": 108225 + }, + { + "epoch": 7.353580649544775, + "grad_norm": 0.8441269397735596, + "learning_rate": 8.113364587579834e-05, + "loss": 3.4592, + "step": 108230 + }, + { + "epoch": 7.3539203696154365, + "grad_norm": 1.7965517044067383, + "learning_rate": 8.109118086696562e-05, + "loss": 3.2989, + "step": 108235 + }, + { + "epoch": 7.354260089686099, + "grad_norm": 0.6669885516166687, + "learning_rate": 8.10487158581329e-05, + "loss": 3.4361, + "step": 108240 + }, + { + "epoch": 7.354599809756761, + "grad_norm": 0.909227728843689, + "learning_rate": 8.100625084930017e-05, + "loss": 3.6899, + "step": 108245 + }, + { + "epoch": 7.354939529827422, + "grad_norm": 0.7583549618721008, + "learning_rate": 8.096378584046746e-05, + "loss": 3.4796, + "step": 108250 + }, + { + "epoch": 7.355279249898084, + "grad_norm": 1.0463563203811646, + "learning_rate": 8.092132083163474e-05, + "loss": 3.3769, + "step": 108255 + }, + { + "epoch": 7.355618969968746, + "grad_norm": 0.874703586101532, + "learning_rate": 8.087885582280201e-05, + "loss": 3.4979, + "step": 108260 + }, + { + "epoch": 7.355958690039407, + "grad_norm": 1.2408183813095093, + "learning_rate": 8.083639081396929e-05, + "loss": 3.4978, + "step": 108265 + }, + { + "epoch": 7.356298410110069, + "grad_norm": 0.9038196802139282, + "learning_rate": 8.079392580513657e-05, + "loss": 3.3954, + "step": 108270 + }, + { + "epoch": 7.356638130180731, + "grad_norm": 0.9715887308120728, + "learning_rate": 8.075146079630384e-05, + "loss": 3.4897, + "step": 108275 + }, + { + "epoch": 7.3569778502513925, + "grad_norm": 0.7339851260185242, + "learning_rate": 8.070899578747112e-05, + "loss": 3.5255, + "step": 108280 + }, + { + "epoch": 7.357317570322055, + "grad_norm": 1.0252959728240967, + "learning_rate": 8.066653077863841e-05, + "loss": 3.5272, + "step": 108285 + }, + { + "epoch": 7.357657290392717, + "grad_norm": 1.041770100593567, + "learning_rate": 8.062406576980569e-05, + "loss": 3.2334, + "step": 108290 + }, + { + "epoch": 7.357997010463378, + "grad_norm": 0.9462423324584961, + "learning_rate": 8.058160076097296e-05, + "loss": 3.511, + "step": 108295 + }, + { + "epoch": 7.35833673053404, + "grad_norm": 0.8526864051818848, + "learning_rate": 8.053913575214024e-05, + "loss": 3.2567, + "step": 108300 + }, + { + "epoch": 7.358676450604702, + "grad_norm": 1.0511627197265625, + "learning_rate": 8.049667074330752e-05, + "loss": 3.515, + "step": 108305 + }, + { + "epoch": 7.359016170675363, + "grad_norm": 0.8668769001960754, + "learning_rate": 8.045420573447478e-05, + "loss": 3.4346, + "step": 108310 + }, + { + "epoch": 7.359355890746025, + "grad_norm": 0.7760682106018066, + "learning_rate": 8.041174072564208e-05, + "loss": 3.354, + "step": 108315 + }, + { + "epoch": 7.359695610816687, + "grad_norm": 1.0884753465652466, + "learning_rate": 8.036927571680936e-05, + "loss": 3.3554, + "step": 108320 + }, + { + "epoch": 7.3600353308873485, + "grad_norm": 1.089479684829712, + "learning_rate": 8.032681070797664e-05, + "loss": 3.4246, + "step": 108325 + }, + { + "epoch": 7.360375050958011, + "grad_norm": 0.7651086449623108, + "learning_rate": 8.02843456991439e-05, + "loss": 3.3048, + "step": 108330 + }, + { + "epoch": 7.360714771028673, + "grad_norm": 0.8419424295425415, + "learning_rate": 8.024188069031118e-05, + "loss": 3.6906, + "step": 108335 + }, + { + "epoch": 7.361054491099334, + "grad_norm": 1.10274076461792, + "learning_rate": 8.019941568147846e-05, + "loss": 3.456, + "step": 108340 + }, + { + "epoch": 7.361394211169996, + "grad_norm": 0.9409751296043396, + "learning_rate": 8.015695067264573e-05, + "loss": 3.2227, + "step": 108345 + }, + { + "epoch": 7.361733931240658, + "grad_norm": 1.2417738437652588, + "learning_rate": 8.011448566381302e-05, + "loss": 3.1178, + "step": 108350 + }, + { + "epoch": 7.362073651311319, + "grad_norm": 1.0204612016677856, + "learning_rate": 8.00720206549803e-05, + "loss": 3.2669, + "step": 108355 + }, + { + "epoch": 7.362413371381981, + "grad_norm": 0.9966930747032166, + "learning_rate": 8.002955564614757e-05, + "loss": 3.3555, + "step": 108360 + }, + { + "epoch": 7.362753091452643, + "grad_norm": 0.9278699159622192, + "learning_rate": 7.998709063731485e-05, + "loss": 3.4864, + "step": 108365 + }, + { + "epoch": 7.3630928115233045, + "grad_norm": 1.0128856897354126, + "learning_rate": 7.994462562848213e-05, + "loss": 3.4005, + "step": 108370 + }, + { + "epoch": 7.363432531593967, + "grad_norm": 1.10188889503479, + "learning_rate": 7.990216061964941e-05, + "loss": 3.3984, + "step": 108375 + }, + { + "epoch": 7.363772251664629, + "grad_norm": 0.8811441659927368, + "learning_rate": 7.985969561081668e-05, + "loss": 3.6409, + "step": 108380 + }, + { + "epoch": 7.36411197173529, + "grad_norm": 1.1458157300949097, + "learning_rate": 7.981723060198397e-05, + "loss": 3.5367, + "step": 108385 + }, + { + "epoch": 7.364451691805952, + "grad_norm": 1.1762100458145142, + "learning_rate": 7.977476559315125e-05, + "loss": 2.9716, + "step": 108390 + }, + { + "epoch": 7.364791411876614, + "grad_norm": 1.140358328819275, + "learning_rate": 7.973230058431852e-05, + "loss": 3.3275, + "step": 108395 + }, + { + "epoch": 7.365131131947275, + "grad_norm": 1.005632758140564, + "learning_rate": 7.96898355754858e-05, + "loss": 3.1831, + "step": 108400 + }, + { + "epoch": 7.365470852017937, + "grad_norm": 0.8336877822875977, + "learning_rate": 7.964737056665308e-05, + "loss": 3.2862, + "step": 108405 + }, + { + "epoch": 7.365810572088599, + "grad_norm": 1.0532371997833252, + "learning_rate": 7.960490555782036e-05, + "loss": 3.1467, + "step": 108410 + }, + { + "epoch": 7.3661502921592605, + "grad_norm": 0.8804168701171875, + "learning_rate": 7.956244054898764e-05, + "loss": 3.1982, + "step": 108415 + }, + { + "epoch": 7.366490012229923, + "grad_norm": 1.0313595533370972, + "learning_rate": 7.951997554015492e-05, + "loss": 3.2265, + "step": 108420 + }, + { + "epoch": 7.366829732300585, + "grad_norm": 1.063122034072876, + "learning_rate": 7.94775105313222e-05, + "loss": 3.5862, + "step": 108425 + }, + { + "epoch": 7.367169452371246, + "grad_norm": 0.9502562284469604, + "learning_rate": 7.943504552248947e-05, + "loss": 3.1652, + "step": 108430 + }, + { + "epoch": 7.367509172441908, + "grad_norm": 0.9603378176689148, + "learning_rate": 7.939258051365675e-05, + "loss": 3.6104, + "step": 108435 + }, + { + "epoch": 7.36784889251257, + "grad_norm": 0.9209380149841309, + "learning_rate": 7.935011550482403e-05, + "loss": 3.5541, + "step": 108440 + }, + { + "epoch": 7.368188612583231, + "grad_norm": 1.0916991233825684, + "learning_rate": 7.930765049599129e-05, + "loss": 3.274, + "step": 108445 + }, + { + "epoch": 7.368528332653893, + "grad_norm": 0.908411979675293, + "learning_rate": 7.926518548715859e-05, + "loss": 3.4925, + "step": 108450 + }, + { + "epoch": 7.368868052724555, + "grad_norm": 0.9224674701690674, + "learning_rate": 7.922272047832587e-05, + "loss": 3.4421, + "step": 108455 + }, + { + "epoch": 7.3692077727952165, + "grad_norm": 1.209481120109558, + "learning_rate": 7.918025546949315e-05, + "loss": 3.3837, + "step": 108460 + }, + { + "epoch": 7.369547492865879, + "grad_norm": 0.9018449783325195, + "learning_rate": 7.913779046066041e-05, + "loss": 3.2802, + "step": 108465 + }, + { + "epoch": 7.369887212936541, + "grad_norm": 0.8408250212669373, + "learning_rate": 7.909532545182769e-05, + "loss": 3.3142, + "step": 108470 + }, + { + "epoch": 7.370226933007202, + "grad_norm": 0.7119000554084778, + "learning_rate": 7.905286044299497e-05, + "loss": 3.3084, + "step": 108475 + }, + { + "epoch": 7.370566653077864, + "grad_norm": 0.8446618318557739, + "learning_rate": 7.901039543416225e-05, + "loss": 3.4015, + "step": 108480 + }, + { + "epoch": 7.370906373148525, + "grad_norm": 0.9615287780761719, + "learning_rate": 7.896793042532953e-05, + "loss": 3.5359, + "step": 108485 + }, + { + "epoch": 7.371246093219187, + "grad_norm": 1.0711029767990112, + "learning_rate": 7.892546541649681e-05, + "loss": 3.3175, + "step": 108490 + }, + { + "epoch": 7.371585813289849, + "grad_norm": 1.2759901285171509, + "learning_rate": 7.888300040766409e-05, + "loss": 3.3278, + "step": 108495 + }, + { + "epoch": 7.3719255333605105, + "grad_norm": 0.8576691150665283, + "learning_rate": 7.884053539883136e-05, + "loss": 3.3793, + "step": 108500 + }, + { + "epoch": 7.372265253431173, + "grad_norm": 0.7970983386039734, + "learning_rate": 7.879807038999864e-05, + "loss": 3.4483, + "step": 108505 + }, + { + "epoch": 7.372604973501835, + "grad_norm": 0.9503864645957947, + "learning_rate": 7.875560538116592e-05, + "loss": 3.458, + "step": 108510 + }, + { + "epoch": 7.372944693572496, + "grad_norm": 0.8051579594612122, + "learning_rate": 7.87131403723332e-05, + "loss": 3.6375, + "step": 108515 + }, + { + "epoch": 7.373284413643158, + "grad_norm": 0.7895700335502625, + "learning_rate": 7.867067536350048e-05, + "loss": 3.6899, + "step": 108520 + }, + { + "epoch": 7.37362413371382, + "grad_norm": 0.9901759624481201, + "learning_rate": 7.862821035466776e-05, + "loss": 3.3347, + "step": 108525 + }, + { + "epoch": 7.373963853784481, + "grad_norm": 1.2165303230285645, + "learning_rate": 7.858574534583503e-05, + "loss": 3.2309, + "step": 108530 + }, + { + "epoch": 7.374303573855143, + "grad_norm": 0.8499707579612732, + "learning_rate": 7.85432803370023e-05, + "loss": 3.4173, + "step": 108535 + }, + { + "epoch": 7.374643293925805, + "grad_norm": 0.883696973323822, + "learning_rate": 7.850081532816959e-05, + "loss": 3.3737, + "step": 108540 + }, + { + "epoch": 7.3749830139964665, + "grad_norm": 0.7455485463142395, + "learning_rate": 7.845835031933688e-05, + "loss": 3.3612, + "step": 108545 + }, + { + "epoch": 7.375322734067129, + "grad_norm": 0.8792511820793152, + "learning_rate": 7.841588531050415e-05, + "loss": 3.1996, + "step": 108550 + }, + { + "epoch": 7.375662454137791, + "grad_norm": 0.7742661237716675, + "learning_rate": 7.837342030167143e-05, + "loss": 3.6207, + "step": 108555 + }, + { + "epoch": 7.376002174208452, + "grad_norm": 1.1773477792739868, + "learning_rate": 7.833095529283871e-05, + "loss": 3.5099, + "step": 108560 + }, + { + "epoch": 7.376341894279114, + "grad_norm": 0.9379788637161255, + "learning_rate": 7.828849028400597e-05, + "loss": 3.381, + "step": 108565 + }, + { + "epoch": 7.376681614349776, + "grad_norm": 0.8596869111061096, + "learning_rate": 7.824602527517325e-05, + "loss": 3.4787, + "step": 108570 + }, + { + "epoch": 7.377021334420437, + "grad_norm": 1.0831236839294434, + "learning_rate": 7.820356026634053e-05, + "loss": 3.5385, + "step": 108575 + }, + { + "epoch": 7.377361054491099, + "grad_norm": 1.009730577468872, + "learning_rate": 7.816109525750783e-05, + "loss": 3.423, + "step": 108580 + }, + { + "epoch": 7.377700774561761, + "grad_norm": 0.9685721397399902, + "learning_rate": 7.81186302486751e-05, + "loss": 3.4852, + "step": 108585 + }, + { + "epoch": 7.3780404946324225, + "grad_norm": 0.9676198363304138, + "learning_rate": 7.807616523984237e-05, + "loss": 3.4998, + "step": 108590 + }, + { + "epoch": 7.378380214703085, + "grad_norm": 1.0064194202423096, + "learning_rate": 7.803370023100965e-05, + "loss": 3.5932, + "step": 108595 + }, + { + "epoch": 7.378719934773747, + "grad_norm": 0.7691531777381897, + "learning_rate": 7.799123522217692e-05, + "loss": 3.2816, + "step": 108600 + }, + { + "epoch": 7.379059654844408, + "grad_norm": 0.9243648052215576, + "learning_rate": 7.79487702133442e-05, + "loss": 3.2969, + "step": 108605 + }, + { + "epoch": 7.37939937491507, + "grad_norm": 0.8425473570823669, + "learning_rate": 7.79063052045115e-05, + "loss": 3.244, + "step": 108610 + }, + { + "epoch": 7.379739094985732, + "grad_norm": 0.8655633926391602, + "learning_rate": 7.786384019567876e-05, + "loss": 3.7258, + "step": 108615 + }, + { + "epoch": 7.380078815056393, + "grad_norm": 0.90301513671875, + "learning_rate": 7.782137518684604e-05, + "loss": 3.4117, + "step": 108620 + }, + { + "epoch": 7.380418535127055, + "grad_norm": 1.2357665300369263, + "learning_rate": 7.777891017801332e-05, + "loss": 3.5223, + "step": 108625 + }, + { + "epoch": 7.380758255197717, + "grad_norm": 0.8828724026679993, + "learning_rate": 7.77364451691806e-05, + "loss": 3.3361, + "step": 108630 + }, + { + "epoch": 7.3810979752683785, + "grad_norm": 0.9043753743171692, + "learning_rate": 7.769398016034787e-05, + "loss": 3.272, + "step": 108635 + }, + { + "epoch": 7.381437695339041, + "grad_norm": 0.9881519675254822, + "learning_rate": 7.765151515151515e-05, + "loss": 3.5728, + "step": 108640 + }, + { + "epoch": 7.381777415409703, + "grad_norm": 0.9170758128166199, + "learning_rate": 7.760905014268244e-05, + "loss": 3.5431, + "step": 108645 + }, + { + "epoch": 7.382117135480364, + "grad_norm": 0.845940887928009, + "learning_rate": 7.756658513384971e-05, + "loss": 3.3817, + "step": 108650 + }, + { + "epoch": 7.382456855551026, + "grad_norm": 1.1515021324157715, + "learning_rate": 7.752412012501699e-05, + "loss": 3.3801, + "step": 108655 + }, + { + "epoch": 7.382796575621688, + "grad_norm": 0.8191408514976501, + "learning_rate": 7.748165511618427e-05, + "loss": 3.459, + "step": 108660 + }, + { + "epoch": 7.383136295692349, + "grad_norm": 1.0081161260604858, + "learning_rate": 7.743919010735153e-05, + "loss": 3.0892, + "step": 108665 + }, + { + "epoch": 7.383476015763011, + "grad_norm": 0.8673043251037598, + "learning_rate": 7.739672509851881e-05, + "loss": 3.2621, + "step": 108670 + }, + { + "epoch": 7.383815735833673, + "grad_norm": 0.8846167922019958, + "learning_rate": 7.735426008968611e-05, + "loss": 3.3335, + "step": 108675 + }, + { + "epoch": 7.3841554559043345, + "grad_norm": 0.9759306311607361, + "learning_rate": 7.731179508085339e-05, + "loss": 3.3425, + "step": 108680 + }, + { + "epoch": 7.384495175974997, + "grad_norm": 0.7957445979118347, + "learning_rate": 7.726933007202065e-05, + "loss": 3.7325, + "step": 108685 + }, + { + "epoch": 7.384834896045659, + "grad_norm": 0.8391867876052856, + "learning_rate": 7.722686506318793e-05, + "loss": 3.3635, + "step": 108690 + }, + { + "epoch": 7.38517461611632, + "grad_norm": 1.0814059972763062, + "learning_rate": 7.718440005435521e-05, + "loss": 3.2838, + "step": 108695 + }, + { + "epoch": 7.385514336186982, + "grad_norm": 0.9408888816833496, + "learning_rate": 7.714193504552248e-05, + "loss": 3.2474, + "step": 108700 + }, + { + "epoch": 7.385854056257644, + "grad_norm": 1.479493498802185, + "learning_rate": 7.709947003668976e-05, + "loss": 3.4372, + "step": 108705 + }, + { + "epoch": 7.386193776328305, + "grad_norm": 0.8255301713943481, + "learning_rate": 7.705700502785705e-05, + "loss": 3.4233, + "step": 108710 + }, + { + "epoch": 7.386533496398967, + "grad_norm": 0.9327268600463867, + "learning_rate": 7.701454001902433e-05, + "loss": 3.298, + "step": 108715 + }, + { + "epoch": 7.386873216469629, + "grad_norm": 0.9907144904136658, + "learning_rate": 7.69720750101916e-05, + "loss": 3.5066, + "step": 108720 + }, + { + "epoch": 7.3872129365402905, + "grad_norm": 0.8732564449310303, + "learning_rate": 7.692961000135888e-05, + "loss": 3.4916, + "step": 108725 + }, + { + "epoch": 7.387552656610953, + "grad_norm": 1.1318023204803467, + "learning_rate": 7.688714499252616e-05, + "loss": 3.3757, + "step": 108730 + }, + { + "epoch": 7.387892376681615, + "grad_norm": 1.1583155393600464, + "learning_rate": 7.684467998369343e-05, + "loss": 3.5328, + "step": 108735 + }, + { + "epoch": 7.388232096752276, + "grad_norm": 1.1294537782669067, + "learning_rate": 7.680221497486072e-05, + "loss": 3.5066, + "step": 108740 + }, + { + "epoch": 7.388571816822938, + "grad_norm": 0.8529323935508728, + "learning_rate": 7.6759749966028e-05, + "loss": 3.4332, + "step": 108745 + }, + { + "epoch": 7.3889115368936, + "grad_norm": 0.8567370176315308, + "learning_rate": 7.671728495719527e-05, + "loss": 3.3423, + "step": 108750 + }, + { + "epoch": 7.389251256964261, + "grad_norm": 0.8277938961982727, + "learning_rate": 7.667481994836255e-05, + "loss": 3.6297, + "step": 108755 + }, + { + "epoch": 7.389590977034923, + "grad_norm": 1.0433034896850586, + "learning_rate": 7.663235493952983e-05, + "loss": 3.2809, + "step": 108760 + }, + { + "epoch": 7.389930697105585, + "grad_norm": 1.164507508277893, + "learning_rate": 7.658988993069711e-05, + "loss": 3.3057, + "step": 108765 + }, + { + "epoch": 7.3902704171762466, + "grad_norm": 1.0003825426101685, + "learning_rate": 7.654742492186438e-05, + "loss": 3.4393, + "step": 108770 + }, + { + "epoch": 7.390610137246909, + "grad_norm": 0.9151121377944946, + "learning_rate": 7.650495991303167e-05, + "loss": 3.4438, + "step": 108775 + }, + { + "epoch": 7.390949857317571, + "grad_norm": 1.076714277267456, + "learning_rate": 7.646249490419895e-05, + "loss": 3.7354, + "step": 108780 + }, + { + "epoch": 7.391289577388232, + "grad_norm": 0.8840180039405823, + "learning_rate": 7.642002989536622e-05, + "loss": 3.5172, + "step": 108785 + }, + { + "epoch": 7.391629297458894, + "grad_norm": 1.0768052339553833, + "learning_rate": 7.63775648865335e-05, + "loss": 3.4811, + "step": 108790 + }, + { + "epoch": 7.391969017529556, + "grad_norm": 1.0272971391677856, + "learning_rate": 7.633509987770078e-05, + "loss": 3.3605, + "step": 108795 + }, + { + "epoch": 7.392308737600217, + "grad_norm": 0.9474733471870422, + "learning_rate": 7.629263486886806e-05, + "loss": 3.4183, + "step": 108800 + }, + { + "epoch": 7.392648457670879, + "grad_norm": 1.061570405960083, + "learning_rate": 7.625016986003532e-05, + "loss": 3.6389, + "step": 108805 + }, + { + "epoch": 7.392988177741541, + "grad_norm": 0.9925691485404968, + "learning_rate": 7.620770485120262e-05, + "loss": 3.3807, + "step": 108810 + }, + { + "epoch": 7.393327897812203, + "grad_norm": 0.7889589071273804, + "learning_rate": 7.61652398423699e-05, + "loss": 3.344, + "step": 108815 + }, + { + "epoch": 7.393667617882865, + "grad_norm": 1.0123953819274902, + "learning_rate": 7.612277483353716e-05, + "loss": 3.662, + "step": 108820 + }, + { + "epoch": 7.394007337953527, + "grad_norm": 0.8001521825790405, + "learning_rate": 7.608030982470444e-05, + "loss": 3.498, + "step": 108825 + }, + { + "epoch": 7.394347058024188, + "grad_norm": 0.7446021437644958, + "learning_rate": 7.603784481587172e-05, + "loss": 3.6969, + "step": 108830 + }, + { + "epoch": 7.39468677809485, + "grad_norm": 0.9414898157119751, + "learning_rate": 7.599537980703899e-05, + "loss": 3.2846, + "step": 108835 + }, + { + "epoch": 7.395026498165512, + "grad_norm": 0.8337817192077637, + "learning_rate": 7.595291479820628e-05, + "loss": 3.3903, + "step": 108840 + }, + { + "epoch": 7.395366218236173, + "grad_norm": 0.9020292162895203, + "learning_rate": 7.591044978937356e-05, + "loss": 3.5499, + "step": 108845 + }, + { + "epoch": 7.395705938306835, + "grad_norm": 0.8478841185569763, + "learning_rate": 7.586798478054084e-05, + "loss": 3.2023, + "step": 108850 + }, + { + "epoch": 7.396045658377497, + "grad_norm": 0.877508282661438, + "learning_rate": 7.582551977170811e-05, + "loss": 3.4114, + "step": 108855 + }, + { + "epoch": 7.396385378448159, + "grad_norm": 0.8729053139686584, + "learning_rate": 7.578305476287539e-05, + "loss": 3.3891, + "step": 108860 + }, + { + "epoch": 7.396725098518821, + "grad_norm": 1.1704154014587402, + "learning_rate": 7.574058975404267e-05, + "loss": 3.2604, + "step": 108865 + }, + { + "epoch": 7.397064818589483, + "grad_norm": 0.844127357006073, + "learning_rate": 7.569812474520994e-05, + "loss": 3.3151, + "step": 108870 + }, + { + "epoch": 7.397404538660144, + "grad_norm": 0.8729225993156433, + "learning_rate": 7.565565973637723e-05, + "loss": 3.5894, + "step": 108875 + }, + { + "epoch": 7.397744258730806, + "grad_norm": 0.8720885515213013, + "learning_rate": 7.561319472754451e-05, + "loss": 3.5777, + "step": 108880 + }, + { + "epoch": 7.398083978801467, + "grad_norm": 1.1539244651794434, + "learning_rate": 7.557072971871179e-05, + "loss": 3.4184, + "step": 108885 + }, + { + "epoch": 7.398423698872129, + "grad_norm": 0.8540691137313843, + "learning_rate": 7.552826470987906e-05, + "loss": 3.3898, + "step": 108890 + }, + { + "epoch": 7.398763418942791, + "grad_norm": 0.7917319536209106, + "learning_rate": 7.548579970104634e-05, + "loss": 3.1666, + "step": 108895 + }, + { + "epoch": 7.3991031390134525, + "grad_norm": 0.9998136758804321, + "learning_rate": 7.544333469221362e-05, + "loss": 3.2508, + "step": 108900 + }, + { + "epoch": 7.399442859084115, + "grad_norm": 4.335847854614258, + "learning_rate": 7.54008696833809e-05, + "loss": 3.4101, + "step": 108905 + }, + { + "epoch": 7.399782579154777, + "grad_norm": 0.7362374663352966, + "learning_rate": 7.535840467454818e-05, + "loss": 3.3132, + "step": 108910 + }, + { + "epoch": 7.400122299225438, + "grad_norm": 1.0943665504455566, + "learning_rate": 7.531593966571546e-05, + "loss": 3.307, + "step": 108915 + }, + { + "epoch": 7.4004620192961, + "grad_norm": 1.0437105894088745, + "learning_rate": 7.527347465688272e-05, + "loss": 3.2246, + "step": 108920 + }, + { + "epoch": 7.400801739366762, + "grad_norm": 1.3250950574874878, + "learning_rate": 7.523100964805e-05, + "loss": 3.4602, + "step": 108925 + }, + { + "epoch": 7.401141459437423, + "grad_norm": 0.977540910243988, + "learning_rate": 7.518854463921728e-05, + "loss": 3.4146, + "step": 108930 + }, + { + "epoch": 7.401481179508085, + "grad_norm": 0.9714841842651367, + "learning_rate": 7.514607963038456e-05, + "loss": 3.416, + "step": 108935 + }, + { + "epoch": 7.401820899578747, + "grad_norm": 0.8881797194480896, + "learning_rate": 7.510361462155184e-05, + "loss": 3.4521, + "step": 108940 + }, + { + "epoch": 7.4021606196494085, + "grad_norm": 1.0970607995986938, + "learning_rate": 7.506114961271912e-05, + "loss": 3.1919, + "step": 108945 + }, + { + "epoch": 7.402500339720071, + "grad_norm": 0.8946642279624939, + "learning_rate": 7.50186846038864e-05, + "loss": 3.3827, + "step": 108950 + }, + { + "epoch": 7.402840059790733, + "grad_norm": 0.9237583875656128, + "learning_rate": 7.497621959505367e-05, + "loss": 3.3908, + "step": 108955 + }, + { + "epoch": 7.403179779861394, + "grad_norm": 0.9120374321937561, + "learning_rate": 7.493375458622095e-05, + "loss": 3.5913, + "step": 108960 + }, + { + "epoch": 7.403519499932056, + "grad_norm": 0.8552541732788086, + "learning_rate": 7.489128957738823e-05, + "loss": 3.443, + "step": 108965 + }, + { + "epoch": 7.403859220002718, + "grad_norm": 0.978038489818573, + "learning_rate": 7.484882456855552e-05, + "loss": 3.3192, + "step": 108970 + }, + { + "epoch": 7.404198940073379, + "grad_norm": 1.0296014547348022, + "learning_rate": 7.480635955972279e-05, + "loss": 3.4533, + "step": 108975 + }, + { + "epoch": 7.404538660144041, + "grad_norm": 1.0469553470611572, + "learning_rate": 7.476389455089007e-05, + "loss": 3.205, + "step": 108980 + }, + { + "epoch": 7.404878380214703, + "grad_norm": 1.0136126279830933, + "learning_rate": 7.472142954205735e-05, + "loss": 3.26, + "step": 108985 + }, + { + "epoch": 7.4052181002853645, + "grad_norm": 0.9125833511352539, + "learning_rate": 7.467896453322462e-05, + "loss": 3.4258, + "step": 108990 + }, + { + "epoch": 7.405557820356027, + "grad_norm": 1.0132085084915161, + "learning_rate": 7.46364995243919e-05, + "loss": 3.3039, + "step": 108995 + }, + { + "epoch": 7.405897540426689, + "grad_norm": 0.8686308264732361, + "learning_rate": 7.459403451555918e-05, + "loss": 3.3148, + "step": 109000 + }, + { + "epoch": 7.40623726049735, + "grad_norm": 0.956834077835083, + "learning_rate": 7.455156950672646e-05, + "loss": 3.4122, + "step": 109005 + }, + { + "epoch": 7.406576980568012, + "grad_norm": 0.8685609698295593, + "learning_rate": 7.450910449789374e-05, + "loss": 3.3649, + "step": 109010 + }, + { + "epoch": 7.406916700638674, + "grad_norm": 1.0353368520736694, + "learning_rate": 7.446663948906102e-05, + "loss": 3.3508, + "step": 109015 + }, + { + "epoch": 7.407256420709335, + "grad_norm": 1.2913082838058472, + "learning_rate": 7.44241744802283e-05, + "loss": 3.2508, + "step": 109020 + }, + { + "epoch": 7.407596140779997, + "grad_norm": 1.3797696828842163, + "learning_rate": 7.438170947139556e-05, + "loss": 3.3364, + "step": 109025 + }, + { + "epoch": 7.407935860850659, + "grad_norm": 4.794374465942383, + "learning_rate": 7.433924446256284e-05, + "loss": 3.5367, + "step": 109030 + }, + { + "epoch": 7.4082755809213205, + "grad_norm": 0.810980498790741, + "learning_rate": 7.429677945373014e-05, + "loss": 3.2636, + "step": 109035 + }, + { + "epoch": 7.408615300991983, + "grad_norm": 0.9132948517799377, + "learning_rate": 7.42543144448974e-05, + "loss": 3.075, + "step": 109040 + }, + { + "epoch": 7.408955021062645, + "grad_norm": 0.9274193048477173, + "learning_rate": 7.421184943606468e-05, + "loss": 3.2977, + "step": 109045 + }, + { + "epoch": 7.409294741133306, + "grad_norm": 0.9688493013381958, + "learning_rate": 7.416938442723196e-05, + "loss": 3.183, + "step": 109050 + }, + { + "epoch": 7.409634461203968, + "grad_norm": 0.9379900097846985, + "learning_rate": 7.412691941839924e-05, + "loss": 3.25, + "step": 109055 + }, + { + "epoch": 7.40997418127463, + "grad_norm": 0.9871506094932556, + "learning_rate": 7.408445440956651e-05, + "loss": 3.2084, + "step": 109060 + }, + { + "epoch": 7.410313901345291, + "grad_norm": 1.0878899097442627, + "learning_rate": 7.404198940073379e-05, + "loss": 3.4717, + "step": 109065 + }, + { + "epoch": 7.410653621415953, + "grad_norm": 1.062902808189392, + "learning_rate": 7.399952439190108e-05, + "loss": 3.2346, + "step": 109070 + }, + { + "epoch": 7.410993341486615, + "grad_norm": 1.261297583580017, + "learning_rate": 7.395705938306835e-05, + "loss": 3.2549, + "step": 109075 + }, + { + "epoch": 7.411333061557277, + "grad_norm": 0.8901593089103699, + "learning_rate": 7.391459437423563e-05, + "loss": 3.5008, + "step": 109080 + }, + { + "epoch": 7.411672781627939, + "grad_norm": 1.0293625593185425, + "learning_rate": 7.387212936540291e-05, + "loss": 3.2535, + "step": 109085 + }, + { + "epoch": 7.412012501698601, + "grad_norm": 1.0863763093948364, + "learning_rate": 7.382966435657018e-05, + "loss": 3.2335, + "step": 109090 + }, + { + "epoch": 7.412352221769262, + "grad_norm": 0.9990018010139465, + "learning_rate": 7.378719934773746e-05, + "loss": 3.3314, + "step": 109095 + }, + { + "epoch": 7.412691941839924, + "grad_norm": 0.8639346361160278, + "learning_rate": 7.374473433890475e-05, + "loss": 3.2164, + "step": 109100 + }, + { + "epoch": 7.413031661910586, + "grad_norm": 1.3476876020431519, + "learning_rate": 7.370226933007203e-05, + "loss": 3.3737, + "step": 109105 + }, + { + "epoch": 7.413371381981247, + "grad_norm": 1.0796478986740112, + "learning_rate": 7.36598043212393e-05, + "loss": 3.2951, + "step": 109110 + }, + { + "epoch": 7.413711102051909, + "grad_norm": 1.2691730260849, + "learning_rate": 7.361733931240658e-05, + "loss": 3.2377, + "step": 109115 + }, + { + "epoch": 7.414050822122571, + "grad_norm": 0.7170352935791016, + "learning_rate": 7.357487430357386e-05, + "loss": 3.5373, + "step": 109120 + }, + { + "epoch": 7.414390542193233, + "grad_norm": 0.8876254558563232, + "learning_rate": 7.353240929474113e-05, + "loss": 3.5582, + "step": 109125 + }, + { + "epoch": 7.414730262263895, + "grad_norm": 0.795817494392395, + "learning_rate": 7.34899442859084e-05, + "loss": 3.5331, + "step": 109130 + }, + { + "epoch": 7.415069982334557, + "grad_norm": 1.0976141691207886, + "learning_rate": 7.34474792770757e-05, + "loss": 3.6216, + "step": 109135 + }, + { + "epoch": 7.415409702405218, + "grad_norm": 0.7282344102859497, + "learning_rate": 7.340501426824298e-05, + "loss": 3.3587, + "step": 109140 + }, + { + "epoch": 7.41574942247588, + "grad_norm": 1.762641191482544, + "learning_rate": 7.336254925941025e-05, + "loss": 3.3769, + "step": 109145 + }, + { + "epoch": 7.416089142546542, + "grad_norm": 0.982059121131897, + "learning_rate": 7.332008425057753e-05, + "loss": 2.875, + "step": 109150 + }, + { + "epoch": 7.416428862617203, + "grad_norm": 0.7667697072029114, + "learning_rate": 7.32776192417448e-05, + "loss": 3.353, + "step": 109155 + }, + { + "epoch": 7.416768582687865, + "grad_norm": 0.7989778518676758, + "learning_rate": 7.323515423291207e-05, + "loss": 3.2472, + "step": 109160 + }, + { + "epoch": 7.4171083027585265, + "grad_norm": 0.9729012846946716, + "learning_rate": 7.319268922407937e-05, + "loss": 3.2826, + "step": 109165 + }, + { + "epoch": 7.417448022829189, + "grad_norm": 0.8627350926399231, + "learning_rate": 7.315022421524665e-05, + "loss": 3.3013, + "step": 109170 + }, + { + "epoch": 7.417787742899851, + "grad_norm": 0.8184252381324768, + "learning_rate": 7.310775920641391e-05, + "loss": 3.5334, + "step": 109175 + }, + { + "epoch": 7.418127462970512, + "grad_norm": 0.959801435470581, + "learning_rate": 7.306529419758119e-05, + "loss": 3.335, + "step": 109180 + }, + { + "epoch": 7.418467183041174, + "grad_norm": 0.7720160484313965, + "learning_rate": 7.302282918874847e-05, + "loss": 3.3567, + "step": 109185 + }, + { + "epoch": 7.418806903111836, + "grad_norm": 0.8661257028579712, + "learning_rate": 7.298036417991575e-05, + "loss": 3.4177, + "step": 109190 + }, + { + "epoch": 7.419146623182497, + "grad_norm": 1.0161147117614746, + "learning_rate": 7.293789917108302e-05, + "loss": 3.4449, + "step": 109195 + }, + { + "epoch": 7.419486343253159, + "grad_norm": 0.9587270021438599, + "learning_rate": 7.289543416225031e-05, + "loss": 3.2777, + "step": 109200 + }, + { + "epoch": 7.419826063323821, + "grad_norm": 0.9024713635444641, + "learning_rate": 7.285296915341759e-05, + "loss": 3.363, + "step": 109205 + }, + { + "epoch": 7.4201657833944825, + "grad_norm": 1.2454334497451782, + "learning_rate": 7.281050414458486e-05, + "loss": 3.1345, + "step": 109210 + }, + { + "epoch": 7.420505503465145, + "grad_norm": 0.8636432886123657, + "learning_rate": 7.276803913575214e-05, + "loss": 3.2915, + "step": 109215 + }, + { + "epoch": 7.420845223535807, + "grad_norm": 6.972426891326904, + "learning_rate": 7.272557412691942e-05, + "loss": 3.2552, + "step": 109220 + }, + { + "epoch": 7.421184943606468, + "grad_norm": 0.8944876194000244, + "learning_rate": 7.26831091180867e-05, + "loss": 3.6191, + "step": 109225 + }, + { + "epoch": 7.42152466367713, + "grad_norm": 1.096196174621582, + "learning_rate": 7.264064410925397e-05, + "loss": 3.366, + "step": 109230 + }, + { + "epoch": 7.421864383747792, + "grad_norm": 0.7586780786514282, + "learning_rate": 7.259817910042126e-05, + "loss": 3.4533, + "step": 109235 + }, + { + "epoch": 7.422204103818453, + "grad_norm": 0.839377224445343, + "learning_rate": 7.255571409158854e-05, + "loss": 3.5665, + "step": 109240 + }, + { + "epoch": 7.422543823889115, + "grad_norm": 1.064310908317566, + "learning_rate": 7.25132490827558e-05, + "loss": 3.4732, + "step": 109245 + }, + { + "epoch": 7.422883543959777, + "grad_norm": 0.9780547618865967, + "learning_rate": 7.247078407392309e-05, + "loss": 3.3281, + "step": 109250 + }, + { + "epoch": 7.4232232640304385, + "grad_norm": 0.935157835483551, + "learning_rate": 7.242831906509037e-05, + "loss": 3.3341, + "step": 109255 + }, + { + "epoch": 7.423562984101101, + "grad_norm": 0.9636282324790955, + "learning_rate": 7.238585405625763e-05, + "loss": 3.2415, + "step": 109260 + }, + { + "epoch": 7.423902704171763, + "grad_norm": 0.8679943680763245, + "learning_rate": 7.234338904742493e-05, + "loss": 3.528, + "step": 109265 + }, + { + "epoch": 7.424242424242424, + "grad_norm": 0.9341953992843628, + "learning_rate": 7.23009240385922e-05, + "loss": 3.6682, + "step": 109270 + }, + { + "epoch": 7.424582144313086, + "grad_norm": 7.118320465087891, + "learning_rate": 7.225845902975949e-05, + "loss": 3.5418, + "step": 109275 + }, + { + "epoch": 7.424921864383748, + "grad_norm": 1.0145213603973389, + "learning_rate": 7.221599402092675e-05, + "loss": 3.4339, + "step": 109280 + }, + { + "epoch": 7.425261584454409, + "grad_norm": 0.8610491156578064, + "learning_rate": 7.217352901209403e-05, + "loss": 3.5538, + "step": 109285 + }, + { + "epoch": 7.425601304525071, + "grad_norm": 0.8987185955047607, + "learning_rate": 7.213106400326131e-05, + "loss": 3.4555, + "step": 109290 + }, + { + "epoch": 7.425941024595733, + "grad_norm": 1.035756230354309, + "learning_rate": 7.208859899442858e-05, + "loss": 3.0633, + "step": 109295 + }, + { + "epoch": 7.4262807446663945, + "grad_norm": 0.9255558848381042, + "learning_rate": 7.204613398559587e-05, + "loss": 3.5348, + "step": 109300 + }, + { + "epoch": 7.426620464737057, + "grad_norm": 0.9350107908248901, + "learning_rate": 7.200366897676315e-05, + "loss": 3.1839, + "step": 109305 + }, + { + "epoch": 7.426960184807719, + "grad_norm": 1.1564383506774902, + "learning_rate": 7.196120396793043e-05, + "loss": 3.2623, + "step": 109310 + }, + { + "epoch": 7.42729990487838, + "grad_norm": 0.9734782576560974, + "learning_rate": 7.19187389590977e-05, + "loss": 3.452, + "step": 109315 + }, + { + "epoch": 7.427639624949042, + "grad_norm": 1.065194010734558, + "learning_rate": 7.187627395026498e-05, + "loss": 3.5167, + "step": 109320 + }, + { + "epoch": 7.427979345019704, + "grad_norm": 0.9454578161239624, + "learning_rate": 7.183380894143226e-05, + "loss": 3.4183, + "step": 109325 + }, + { + "epoch": 7.428319065090365, + "grad_norm": 0.6983144283294678, + "learning_rate": 7.179134393259954e-05, + "loss": 3.378, + "step": 109330 + }, + { + "epoch": 7.428658785161027, + "grad_norm": 0.9117900729179382, + "learning_rate": 7.174887892376682e-05, + "loss": 3.2525, + "step": 109335 + }, + { + "epoch": 7.428998505231689, + "grad_norm": 0.8100894689559937, + "learning_rate": 7.17064139149341e-05, + "loss": 3.577, + "step": 109340 + }, + { + "epoch": 7.4293382253023506, + "grad_norm": 0.9076022505760193, + "learning_rate": 7.166394890610137e-05, + "loss": 3.3046, + "step": 109345 + }, + { + "epoch": 7.429677945373013, + "grad_norm": 0.9915136098861694, + "learning_rate": 7.162148389726865e-05, + "loss": 3.5229, + "step": 109350 + }, + { + "epoch": 7.430017665443675, + "grad_norm": 0.9381319880485535, + "learning_rate": 7.157901888843593e-05, + "loss": 3.2609, + "step": 109355 + }, + { + "epoch": 7.430357385514336, + "grad_norm": 0.9287576079368591, + "learning_rate": 7.153655387960321e-05, + "loss": 3.4439, + "step": 109360 + }, + { + "epoch": 7.430697105584998, + "grad_norm": 1.0224312543869019, + "learning_rate": 7.149408887077049e-05, + "loss": 3.5826, + "step": 109365 + }, + { + "epoch": 7.43103682565566, + "grad_norm": 0.9616596698760986, + "learning_rate": 7.145162386193777e-05, + "loss": 3.5115, + "step": 109370 + }, + { + "epoch": 7.431376545726321, + "grad_norm": 1.0234184265136719, + "learning_rate": 7.140915885310505e-05, + "loss": 3.4193, + "step": 109375 + }, + { + "epoch": 7.431716265796983, + "grad_norm": 0.9249810576438904, + "learning_rate": 7.136669384427231e-05, + "loss": 3.535, + "step": 109380 + }, + { + "epoch": 7.432055985867645, + "grad_norm": 0.9546009302139282, + "learning_rate": 7.13242288354396e-05, + "loss": 3.3453, + "step": 109385 + }, + { + "epoch": 7.432395705938307, + "grad_norm": 0.8849051594734192, + "learning_rate": 7.128176382660687e-05, + "loss": 3.4396, + "step": 109390 + }, + { + "epoch": 7.432735426008969, + "grad_norm": 0.9772620797157288, + "learning_rate": 7.123929881777417e-05, + "loss": 3.2888, + "step": 109395 + }, + { + "epoch": 7.433075146079631, + "grad_norm": 1.0872286558151245, + "learning_rate": 7.119683380894143e-05, + "loss": 3.4394, + "step": 109400 + }, + { + "epoch": 7.433414866150292, + "grad_norm": 0.9369063377380371, + "learning_rate": 7.115436880010871e-05, + "loss": 3.4409, + "step": 109405 + }, + { + "epoch": 7.433754586220954, + "grad_norm": 0.855851948261261, + "learning_rate": 7.1111903791276e-05, + "loss": 3.4137, + "step": 109410 + }, + { + "epoch": 7.434094306291616, + "grad_norm": 0.7228081226348877, + "learning_rate": 7.106943878244326e-05, + "loss": 3.3832, + "step": 109415 + }, + { + "epoch": 7.434434026362277, + "grad_norm": 1.0059621334075928, + "learning_rate": 7.102697377361054e-05, + "loss": 3.5134, + "step": 109420 + }, + { + "epoch": 7.434773746432939, + "grad_norm": 1.200427532196045, + "learning_rate": 7.098450876477782e-05, + "loss": 3.5511, + "step": 109425 + }, + { + "epoch": 7.435113466503601, + "grad_norm": 1.1306483745574951, + "learning_rate": 7.09420437559451e-05, + "loss": 3.2236, + "step": 109430 + }, + { + "epoch": 7.435453186574263, + "grad_norm": 0.7725550532341003, + "learning_rate": 7.089957874711238e-05, + "loss": 3.1332, + "step": 109435 + }, + { + "epoch": 7.435792906644925, + "grad_norm": 0.728381335735321, + "learning_rate": 7.085711373827966e-05, + "loss": 3.0541, + "step": 109440 + }, + { + "epoch": 7.436132626715587, + "grad_norm": 0.8653856515884399, + "learning_rate": 7.081464872944694e-05, + "loss": 3.4009, + "step": 109445 + }, + { + "epoch": 7.436472346786248, + "grad_norm": 0.7724325656890869, + "learning_rate": 7.077218372061421e-05, + "loss": 3.573, + "step": 109450 + }, + { + "epoch": 7.43681206685691, + "grad_norm": 0.9161696434020996, + "learning_rate": 7.072971871178149e-05, + "loss": 3.3921, + "step": 109455 + }, + { + "epoch": 7.437151786927572, + "grad_norm": 0.967287540435791, + "learning_rate": 7.068725370294878e-05, + "loss": 3.1704, + "step": 109460 + }, + { + "epoch": 7.437491506998233, + "grad_norm": 1.0498316287994385, + "learning_rate": 7.064478869411605e-05, + "loss": 3.1598, + "step": 109465 + }, + { + "epoch": 7.437831227068895, + "grad_norm": 0.952092170715332, + "learning_rate": 7.060232368528333e-05, + "loss": 3.429, + "step": 109470 + }, + { + "epoch": 7.438170947139557, + "grad_norm": 1.0715686082839966, + "learning_rate": 7.055985867645061e-05, + "loss": 3.4248, + "step": 109475 + }, + { + "epoch": 7.438510667210219, + "grad_norm": 1.0522185564041138, + "learning_rate": 7.051739366761789e-05, + "loss": 3.612, + "step": 109480 + }, + { + "epoch": 7.438850387280881, + "grad_norm": 0.8112723231315613, + "learning_rate": 7.047492865878516e-05, + "loss": 3.3989, + "step": 109485 + }, + { + "epoch": 7.439190107351543, + "grad_norm": 0.9382035732269287, + "learning_rate": 7.043246364995244e-05, + "loss": 3.4241, + "step": 109490 + }, + { + "epoch": 7.439529827422204, + "grad_norm": 0.8587118983268738, + "learning_rate": 7.038999864111973e-05, + "loss": 3.5506, + "step": 109495 + }, + { + "epoch": 7.439869547492866, + "grad_norm": 1.2270114421844482, + "learning_rate": 7.0347533632287e-05, + "loss": 3.2953, + "step": 109500 + }, + { + "epoch": 7.440209267563528, + "grad_norm": 1.0293775796890259, + "learning_rate": 7.030506862345428e-05, + "loss": 3.2263, + "step": 109505 + }, + { + "epoch": 7.440548987634189, + "grad_norm": 0.8477956056594849, + "learning_rate": 7.026260361462156e-05, + "loss": 3.1888, + "step": 109510 + }, + { + "epoch": 7.440888707704851, + "grad_norm": 1.0047045946121216, + "learning_rate": 7.022013860578882e-05, + "loss": 3.3954, + "step": 109515 + }, + { + "epoch": 7.441228427775513, + "grad_norm": 0.8298618197441101, + "learning_rate": 7.01776735969561e-05, + "loss": 3.2939, + "step": 109520 + }, + { + "epoch": 7.441568147846175, + "grad_norm": 0.9316758513450623, + "learning_rate": 7.01352085881234e-05, + "loss": 3.3248, + "step": 109525 + }, + { + "epoch": 7.441907867916837, + "grad_norm": 1.354490876197815, + "learning_rate": 7.009274357929068e-05, + "loss": 3.4576, + "step": 109530 + }, + { + "epoch": 7.442247587987499, + "grad_norm": 0.9229007363319397, + "learning_rate": 7.005027857045794e-05, + "loss": 3.3847, + "step": 109535 + }, + { + "epoch": 7.44258730805816, + "grad_norm": 0.841850996017456, + "learning_rate": 7.000781356162522e-05, + "loss": 3.33, + "step": 109540 + }, + { + "epoch": 7.442927028128822, + "grad_norm": 0.850897490978241, + "learning_rate": 6.99653485527925e-05, + "loss": 3.2405, + "step": 109545 + }, + { + "epoch": 7.443266748199484, + "grad_norm": 1.0524327754974365, + "learning_rate": 6.992288354395977e-05, + "loss": 3.6495, + "step": 109550 + }, + { + "epoch": 7.443606468270145, + "grad_norm": 0.8506093621253967, + "learning_rate": 6.988041853512705e-05, + "loss": 3.4188, + "step": 109555 + }, + { + "epoch": 7.443946188340807, + "grad_norm": 1.234908103942871, + "learning_rate": 6.983795352629434e-05, + "loss": 3.3571, + "step": 109560 + }, + { + "epoch": 7.444285908411469, + "grad_norm": 0.9827096462249756, + "learning_rate": 6.979548851746162e-05, + "loss": 3.3132, + "step": 109565 + }, + { + "epoch": 7.444625628482131, + "grad_norm": 0.9539719223976135, + "learning_rate": 6.975302350862889e-05, + "loss": 3.564, + "step": 109570 + }, + { + "epoch": 7.444965348552793, + "grad_norm": 0.7894200682640076, + "learning_rate": 6.971055849979617e-05, + "loss": 3.3442, + "step": 109575 + }, + { + "epoch": 7.445305068623454, + "grad_norm": 1.0193971395492554, + "learning_rate": 6.966809349096345e-05, + "loss": 3.585, + "step": 109580 + }, + { + "epoch": 7.445644788694116, + "grad_norm": 1.4858921766281128, + "learning_rate": 6.962562848213072e-05, + "loss": 3.4865, + "step": 109585 + }, + { + "epoch": 7.445984508764778, + "grad_norm": 0.8466053009033203, + "learning_rate": 6.9583163473298e-05, + "loss": 3.2853, + "step": 109590 + }, + { + "epoch": 7.446324228835439, + "grad_norm": 0.9157701730728149, + "learning_rate": 6.954069846446529e-05, + "loss": 3.264, + "step": 109595 + }, + { + "epoch": 7.446663948906101, + "grad_norm": 1.334599256515503, + "learning_rate": 6.949823345563256e-05, + "loss": 3.2725, + "step": 109600 + }, + { + "epoch": 7.447003668976763, + "grad_norm": 1.1273620128631592, + "learning_rate": 6.945576844679984e-05, + "loss": 3.4014, + "step": 109605 + }, + { + "epoch": 7.4473433890474245, + "grad_norm": 1.0137178897857666, + "learning_rate": 6.941330343796712e-05, + "loss": 3.3216, + "step": 109610 + }, + { + "epoch": 7.447683109118087, + "grad_norm": 1.0781277418136597, + "learning_rate": 6.93708384291344e-05, + "loss": 3.341, + "step": 109615 + }, + { + "epoch": 7.448022829188749, + "grad_norm": 0.9857994318008423, + "learning_rate": 6.932837342030166e-05, + "loss": 3.602, + "step": 109620 + }, + { + "epoch": 7.44836254925941, + "grad_norm": 1.305351734161377, + "learning_rate": 6.928590841146896e-05, + "loss": 3.3032, + "step": 109625 + }, + { + "epoch": 7.448702269330072, + "grad_norm": 1.1199474334716797, + "learning_rate": 6.924344340263624e-05, + "loss": 3.5072, + "step": 109630 + }, + { + "epoch": 7.449041989400734, + "grad_norm": 1.29483962059021, + "learning_rate": 6.92009783938035e-05, + "loss": 3.4925, + "step": 109635 + }, + { + "epoch": 7.449381709471395, + "grad_norm": 0.8053165078163147, + "learning_rate": 6.915851338497078e-05, + "loss": 3.4511, + "step": 109640 + }, + { + "epoch": 7.449721429542057, + "grad_norm": 1.049609899520874, + "learning_rate": 6.911604837613806e-05, + "loss": 3.5036, + "step": 109645 + }, + { + "epoch": 7.450061149612719, + "grad_norm": 0.8823279738426208, + "learning_rate": 6.907358336730534e-05, + "loss": 3.3418, + "step": 109650 + }, + { + "epoch": 7.4504008696833806, + "grad_norm": 1.054550051689148, + "learning_rate": 6.903111835847261e-05, + "loss": 3.0599, + "step": 109655 + }, + { + "epoch": 7.450740589754043, + "grad_norm": 0.8168779611587524, + "learning_rate": 6.89886533496399e-05, + "loss": 3.3815, + "step": 109660 + }, + { + "epoch": 7.451080309824705, + "grad_norm": 0.8225720524787903, + "learning_rate": 6.894618834080718e-05, + "loss": 3.2571, + "step": 109665 + }, + { + "epoch": 7.451420029895366, + "grad_norm": 1.1443933248519897, + "learning_rate": 6.890372333197445e-05, + "loss": 3.4238, + "step": 109670 + }, + { + "epoch": 7.451759749966028, + "grad_norm": 0.8045722246170044, + "learning_rate": 6.886125832314173e-05, + "loss": 3.4203, + "step": 109675 + }, + { + "epoch": 7.45209947003669, + "grad_norm": 0.7904484272003174, + "learning_rate": 6.881879331430901e-05, + "loss": 3.26, + "step": 109680 + }, + { + "epoch": 7.452439190107351, + "grad_norm": 1.021471381187439, + "learning_rate": 6.877632830547628e-05, + "loss": 3.2899, + "step": 109685 + }, + { + "epoch": 7.452778910178013, + "grad_norm": 1.0008741617202759, + "learning_rate": 6.873386329664357e-05, + "loss": 3.4129, + "step": 109690 + }, + { + "epoch": 7.453118630248675, + "grad_norm": 0.8198463320732117, + "learning_rate": 6.869139828781085e-05, + "loss": 3.3742, + "step": 109695 + }, + { + "epoch": 7.453458350319337, + "grad_norm": 1.160305380821228, + "learning_rate": 6.864893327897813e-05, + "loss": 3.4106, + "step": 109700 + }, + { + "epoch": 7.453798070389999, + "grad_norm": 0.8872312903404236, + "learning_rate": 6.86064682701454e-05, + "loss": 3.2756, + "step": 109705 + }, + { + "epoch": 7.454137790460661, + "grad_norm": 0.8332380652427673, + "learning_rate": 6.856400326131268e-05, + "loss": 3.2526, + "step": 109710 + }, + { + "epoch": 7.454477510531322, + "grad_norm": 1.1261932849884033, + "learning_rate": 6.852153825247996e-05, + "loss": 3.3134, + "step": 109715 + }, + { + "epoch": 7.454817230601984, + "grad_norm": 0.9240607023239136, + "learning_rate": 6.847907324364722e-05, + "loss": 3.336, + "step": 109720 + }, + { + "epoch": 7.455156950672646, + "grad_norm": 0.9032028317451477, + "learning_rate": 6.843660823481452e-05, + "loss": 3.4328, + "step": 109725 + }, + { + "epoch": 7.455496670743307, + "grad_norm": 0.9004048705101013, + "learning_rate": 6.83941432259818e-05, + "loss": 3.1449, + "step": 109730 + }, + { + "epoch": 7.455836390813969, + "grad_norm": 0.8477425575256348, + "learning_rate": 6.835167821714908e-05, + "loss": 3.4696, + "step": 109735 + }, + { + "epoch": 7.456176110884631, + "grad_norm": 0.884273886680603, + "learning_rate": 6.830921320831634e-05, + "loss": 3.3586, + "step": 109740 + }, + { + "epoch": 7.456515830955293, + "grad_norm": 0.8278295397758484, + "learning_rate": 6.826674819948362e-05, + "loss": 3.3019, + "step": 109745 + }, + { + "epoch": 7.456855551025955, + "grad_norm": 0.7165254354476929, + "learning_rate": 6.82242831906509e-05, + "loss": 3.3925, + "step": 109750 + }, + { + "epoch": 7.457195271096617, + "grad_norm": 0.8751130104064941, + "learning_rate": 6.818181818181818e-05, + "loss": 3.4616, + "step": 109755 + }, + { + "epoch": 7.457534991167278, + "grad_norm": 0.7926779985427856, + "learning_rate": 6.813935317298546e-05, + "loss": 3.3619, + "step": 109760 + }, + { + "epoch": 7.45787471123794, + "grad_norm": 0.9780829548835754, + "learning_rate": 6.809688816415274e-05, + "loss": 3.4306, + "step": 109765 + }, + { + "epoch": 7.458214431308602, + "grad_norm": 0.9999364018440247, + "learning_rate": 6.805442315532001e-05, + "loss": 3.3326, + "step": 109770 + }, + { + "epoch": 7.458554151379263, + "grad_norm": 1.1841404438018799, + "learning_rate": 6.801195814648729e-05, + "loss": 3.4264, + "step": 109775 + }, + { + "epoch": 7.458893871449925, + "grad_norm": 0.9400631785392761, + "learning_rate": 6.796949313765457e-05, + "loss": 3.1453, + "step": 109780 + }, + { + "epoch": 7.459233591520587, + "grad_norm": 1.934386968612671, + "learning_rate": 6.792702812882185e-05, + "loss": 3.3967, + "step": 109785 + }, + { + "epoch": 7.459573311591249, + "grad_norm": 1.2256841659545898, + "learning_rate": 6.788456311998913e-05, + "loss": 3.2883, + "step": 109790 + }, + { + "epoch": 7.459913031661911, + "grad_norm": 1.1404412984848022, + "learning_rate": 6.784209811115641e-05, + "loss": 3.5765, + "step": 109795 + }, + { + "epoch": 7.460252751732573, + "grad_norm": 1.0708494186401367, + "learning_rate": 6.779963310232369e-05, + "loss": 3.3029, + "step": 109800 + }, + { + "epoch": 7.460592471803234, + "grad_norm": 0.7909883260726929, + "learning_rate": 6.775716809349096e-05, + "loss": 3.5219, + "step": 109805 + }, + { + "epoch": 7.460932191873896, + "grad_norm": 0.9253078103065491, + "learning_rate": 6.771470308465824e-05, + "loss": 3.4657, + "step": 109810 + }, + { + "epoch": 7.461271911944558, + "grad_norm": 0.8484600782394409, + "learning_rate": 6.767223807582552e-05, + "loss": 3.6171, + "step": 109815 + }, + { + "epoch": 7.461611632015219, + "grad_norm": 1.2871206998825073, + "learning_rate": 6.762977306699281e-05, + "loss": 3.4874, + "step": 109820 + }, + { + "epoch": 7.461951352085881, + "grad_norm": 0.8492790460586548, + "learning_rate": 6.758730805816008e-05, + "loss": 3.3121, + "step": 109825 + }, + { + "epoch": 7.462291072156543, + "grad_norm": 0.8479198217391968, + "learning_rate": 6.754484304932736e-05, + "loss": 3.751, + "step": 109830 + }, + { + "epoch": 7.462630792227205, + "grad_norm": 0.7617112398147583, + "learning_rate": 6.750237804049464e-05, + "loss": 3.28, + "step": 109835 + }, + { + "epoch": 7.462970512297867, + "grad_norm": 0.9921170473098755, + "learning_rate": 6.74599130316619e-05, + "loss": 3.273, + "step": 109840 + }, + { + "epoch": 7.463310232368528, + "grad_norm": 0.8351657390594482, + "learning_rate": 6.741744802282919e-05, + "loss": 3.3818, + "step": 109845 + }, + { + "epoch": 7.46364995243919, + "grad_norm": 0.7631083726882935, + "learning_rate": 6.737498301399647e-05, + "loss": 3.2056, + "step": 109850 + }, + { + "epoch": 7.463989672509852, + "grad_norm": 1.0327320098876953, + "learning_rate": 6.733251800516375e-05, + "loss": 3.1805, + "step": 109855 + }, + { + "epoch": 7.464329392580513, + "grad_norm": 0.9662609100341797, + "learning_rate": 6.729005299633103e-05, + "loss": 3.52, + "step": 109860 + }, + { + "epoch": 7.464669112651175, + "grad_norm": 0.8133007287979126, + "learning_rate": 6.72475879874983e-05, + "loss": 3.2733, + "step": 109865 + }, + { + "epoch": 7.465008832721837, + "grad_norm": 1.1472079753875732, + "learning_rate": 6.720512297866559e-05, + "loss": 2.9847, + "step": 109870 + }, + { + "epoch": 7.4653485527924985, + "grad_norm": 0.8852643370628357, + "learning_rate": 6.716265796983285e-05, + "loss": 3.6352, + "step": 109875 + }, + { + "epoch": 7.465688272863161, + "grad_norm": 1.08835768699646, + "learning_rate": 6.712019296100013e-05, + "loss": 3.3001, + "step": 109880 + }, + { + "epoch": 7.466027992933823, + "grad_norm": 1.2011148929595947, + "learning_rate": 6.707772795216743e-05, + "loss": 3.4198, + "step": 109885 + }, + { + "epoch": 7.466367713004484, + "grad_norm": 1.0302355289459229, + "learning_rate": 6.703526294333469e-05, + "loss": 3.3298, + "step": 109890 + }, + { + "epoch": 7.466707433075146, + "grad_norm": 0.9402418732643127, + "learning_rate": 6.699279793450197e-05, + "loss": 3.6035, + "step": 109895 + }, + { + "epoch": 7.467047153145808, + "grad_norm": 0.9001951217651367, + "learning_rate": 6.695033292566925e-05, + "loss": 3.4884, + "step": 109900 + }, + { + "epoch": 7.467386873216469, + "grad_norm": 0.9956760406494141, + "learning_rate": 6.690786791683653e-05, + "loss": 3.4153, + "step": 109905 + }, + { + "epoch": 7.467726593287131, + "grad_norm": 0.7506144046783447, + "learning_rate": 6.68654029080038e-05, + "loss": 3.1821, + "step": 109910 + }, + { + "epoch": 7.468066313357793, + "grad_norm": 0.9583525657653809, + "learning_rate": 6.682293789917108e-05, + "loss": 3.4792, + "step": 109915 + }, + { + "epoch": 7.4684060334284545, + "grad_norm": 1.006805181503296, + "learning_rate": 6.678047289033837e-05, + "loss": 3.4, + "step": 109920 + }, + { + "epoch": 7.468745753499117, + "grad_norm": 0.9080003499984741, + "learning_rate": 6.673800788150564e-05, + "loss": 3.2873, + "step": 109925 + }, + { + "epoch": 7.469085473569779, + "grad_norm": 0.8374408483505249, + "learning_rate": 6.669554287267292e-05, + "loss": 3.3529, + "step": 109930 + }, + { + "epoch": 7.46942519364044, + "grad_norm": 1.1424434185028076, + "learning_rate": 6.66530778638402e-05, + "loss": 3.278, + "step": 109935 + }, + { + "epoch": 7.469764913711102, + "grad_norm": 0.8935762643814087, + "learning_rate": 6.661061285500747e-05, + "loss": 3.2772, + "step": 109940 + }, + { + "epoch": 7.470104633781764, + "grad_norm": 1.5608117580413818, + "learning_rate": 6.656814784617475e-05, + "loss": 3.1656, + "step": 109945 + }, + { + "epoch": 7.470444353852425, + "grad_norm": 1.138751745223999, + "learning_rate": 6.652568283734204e-05, + "loss": 3.3257, + "step": 109950 + }, + { + "epoch": 7.470784073923087, + "grad_norm": 1.1901954412460327, + "learning_rate": 6.648321782850932e-05, + "loss": 3.1478, + "step": 109955 + }, + { + "epoch": 7.471123793993749, + "grad_norm": 1.2032108306884766, + "learning_rate": 6.644075281967659e-05, + "loss": 3.6173, + "step": 109960 + }, + { + "epoch": 7.471463514064411, + "grad_norm": 1.1860231161117554, + "learning_rate": 6.639828781084387e-05, + "loss": 3.3746, + "step": 109965 + }, + { + "epoch": 7.471803234135073, + "grad_norm": 1.0324697494506836, + "learning_rate": 6.635582280201115e-05, + "loss": 3.3768, + "step": 109970 + }, + { + "epoch": 7.472142954205735, + "grad_norm": 1.2527673244476318, + "learning_rate": 6.631335779317841e-05, + "loss": 3.3899, + "step": 109975 + }, + { + "epoch": 7.472482674276396, + "grad_norm": 1.1943002939224243, + "learning_rate": 6.627089278434569e-05, + "loss": 3.5601, + "step": 109980 + }, + { + "epoch": 7.472822394347058, + "grad_norm": 0.9139949679374695, + "learning_rate": 6.622842777551299e-05, + "loss": 3.623, + "step": 109985 + }, + { + "epoch": 7.47316211441772, + "grad_norm": 1.0549315214157104, + "learning_rate": 6.618596276668027e-05, + "loss": 3.2047, + "step": 109990 + }, + { + "epoch": 7.473501834488381, + "grad_norm": 1.0948214530944824, + "learning_rate": 6.614349775784753e-05, + "loss": 3.3871, + "step": 109995 + }, + { + "epoch": 7.473841554559043, + "grad_norm": 1.0261125564575195, + "learning_rate": 6.610103274901481e-05, + "loss": 3.2009, + "step": 110000 + }, + { + "epoch": 7.474181274629705, + "grad_norm": 0.850376307964325, + "learning_rate": 6.60585677401821e-05, + "loss": 3.495, + "step": 110005 + }, + { + "epoch": 7.474520994700367, + "grad_norm": 0.8808454871177673, + "learning_rate": 6.601610273134936e-05, + "loss": 3.375, + "step": 110010 + }, + { + "epoch": 7.474860714771029, + "grad_norm": 0.9570554494857788, + "learning_rate": 6.597363772251664e-05, + "loss": 3.3343, + "step": 110015 + }, + { + "epoch": 7.475200434841691, + "grad_norm": 0.9391192197799683, + "learning_rate": 6.593117271368393e-05, + "loss": 3.0241, + "step": 110020 + }, + { + "epoch": 7.475540154912352, + "grad_norm": 1.1603697538375854, + "learning_rate": 6.58887077048512e-05, + "loss": 3.302, + "step": 110025 + }, + { + "epoch": 7.475879874983014, + "grad_norm": 0.8558090329170227, + "learning_rate": 6.584624269601848e-05, + "loss": 3.361, + "step": 110030 + }, + { + "epoch": 7.476219595053676, + "grad_norm": 0.9230468273162842, + "learning_rate": 6.580377768718576e-05, + "loss": 3.2455, + "step": 110035 + }, + { + "epoch": 7.476559315124337, + "grad_norm": 1.1281931400299072, + "learning_rate": 6.576131267835304e-05, + "loss": 3.1194, + "step": 110040 + }, + { + "epoch": 7.476899035194999, + "grad_norm": 1.0995992422103882, + "learning_rate": 6.571884766952031e-05, + "loss": 3.0604, + "step": 110045 + }, + { + "epoch": 7.477238755265661, + "grad_norm": 0.8667719960212708, + "learning_rate": 6.56763826606876e-05, + "loss": 3.4217, + "step": 110050 + }, + { + "epoch": 7.477578475336323, + "grad_norm": 0.9218230247497559, + "learning_rate": 6.563391765185488e-05, + "loss": 3.3552, + "step": 110055 + }, + { + "epoch": 7.477918195406985, + "grad_norm": 0.9446624517440796, + "learning_rate": 6.559145264302215e-05, + "loss": 3.1525, + "step": 110060 + }, + { + "epoch": 7.478257915477647, + "grad_norm": 0.7893321514129639, + "learning_rate": 6.554898763418943e-05, + "loss": 3.6283, + "step": 110065 + }, + { + "epoch": 7.478597635548308, + "grad_norm": 1.2252265214920044, + "learning_rate": 6.550652262535671e-05, + "loss": 3.2197, + "step": 110070 + }, + { + "epoch": 7.47893735561897, + "grad_norm": 0.8652254939079285, + "learning_rate": 6.546405761652399e-05, + "loss": 3.4786, + "step": 110075 + }, + { + "epoch": 7.479277075689632, + "grad_norm": 1.1065337657928467, + "learning_rate": 6.542159260769125e-05, + "loss": 3.3893, + "step": 110080 + }, + { + "epoch": 7.479616795760293, + "grad_norm": 0.9452478289604187, + "learning_rate": 6.537912759885855e-05, + "loss": 3.2384, + "step": 110085 + }, + { + "epoch": 7.479956515830955, + "grad_norm": 0.8873727321624756, + "learning_rate": 6.533666259002583e-05, + "loss": 3.2044, + "step": 110090 + }, + { + "epoch": 7.480296235901617, + "grad_norm": 0.8793472051620483, + "learning_rate": 6.52941975811931e-05, + "loss": 3.5747, + "step": 110095 + }, + { + "epoch": 7.480635955972279, + "grad_norm": 1.0128443241119385, + "learning_rate": 6.525173257236037e-05, + "loss": 3.2702, + "step": 110100 + }, + { + "epoch": 7.480975676042941, + "grad_norm": 0.9126684069633484, + "learning_rate": 6.520926756352765e-05, + "loss": 3.2167, + "step": 110105 + }, + { + "epoch": 7.481315396113603, + "grad_norm": 0.9881851077079773, + "learning_rate": 6.516680255469492e-05, + "loss": 3.3914, + "step": 110110 + }, + { + "epoch": 7.481655116184264, + "grad_norm": 0.8274767398834229, + "learning_rate": 6.512433754586221e-05, + "loss": 3.4857, + "step": 110115 + }, + { + "epoch": 7.481994836254926, + "grad_norm": 1.1022952795028687, + "learning_rate": 6.50818725370295e-05, + "loss": 3.4737, + "step": 110120 + }, + { + "epoch": 7.482334556325588, + "grad_norm": 0.8168429136276245, + "learning_rate": 6.503940752819677e-05, + "loss": 3.1942, + "step": 110125 + }, + { + "epoch": 7.482674276396249, + "grad_norm": 1.0126115083694458, + "learning_rate": 6.499694251936404e-05, + "loss": 3.2962, + "step": 110130 + }, + { + "epoch": 7.483013996466911, + "grad_norm": 1.074960708618164, + "learning_rate": 6.495447751053132e-05, + "loss": 3.3303, + "step": 110135 + }, + { + "epoch": 7.483353716537573, + "grad_norm": 0.9823753237724304, + "learning_rate": 6.49120125016986e-05, + "loss": 3.3658, + "step": 110140 + }, + { + "epoch": 7.483693436608235, + "grad_norm": 0.7667866349220276, + "learning_rate": 6.486954749286587e-05, + "loss": 3.3906, + "step": 110145 + }, + { + "epoch": 7.484033156678897, + "grad_norm": 0.7583798766136169, + "learning_rate": 6.482708248403316e-05, + "loss": 3.4407, + "step": 110150 + }, + { + "epoch": 7.484372876749559, + "grad_norm": 0.8881388306617737, + "learning_rate": 6.478461747520044e-05, + "loss": 3.417, + "step": 110155 + }, + { + "epoch": 7.48471259682022, + "grad_norm": 0.8834562301635742, + "learning_rate": 6.474215246636772e-05, + "loss": 3.194, + "step": 110160 + }, + { + "epoch": 7.485052316890882, + "grad_norm": 0.9494379162788391, + "learning_rate": 6.469968745753499e-05, + "loss": 3.34, + "step": 110165 + }, + { + "epoch": 7.485392036961544, + "grad_norm": 0.9479822516441345, + "learning_rate": 6.465722244870227e-05, + "loss": 3.3816, + "step": 110170 + }, + { + "epoch": 7.485731757032205, + "grad_norm": 0.86037677526474, + "learning_rate": 6.461475743986955e-05, + "loss": 3.526, + "step": 110175 + }, + { + "epoch": 7.486071477102867, + "grad_norm": 1.1689033508300781, + "learning_rate": 6.457229243103683e-05, + "loss": 3.3081, + "step": 110180 + }, + { + "epoch": 7.486411197173529, + "grad_norm": 0.8895187377929688, + "learning_rate": 6.452982742220411e-05, + "loss": 3.3107, + "step": 110185 + }, + { + "epoch": 7.486750917244191, + "grad_norm": 1.0216306447982788, + "learning_rate": 6.448736241337139e-05, + "loss": 3.4308, + "step": 110190 + }, + { + "epoch": 7.487090637314853, + "grad_norm": 0.7717465162277222, + "learning_rate": 6.444489740453866e-05, + "loss": 3.3958, + "step": 110195 + }, + { + "epoch": 7.487430357385515, + "grad_norm": 1.0906111001968384, + "learning_rate": 6.440243239570594e-05, + "loss": 3.0709, + "step": 110200 + }, + { + "epoch": 7.487770077456176, + "grad_norm": 1.001355528831482, + "learning_rate": 6.435996738687322e-05, + "loss": 3.4327, + "step": 110205 + }, + { + "epoch": 7.488109797526838, + "grad_norm": 1.2515660524368286, + "learning_rate": 6.43175023780405e-05, + "loss": 3.5871, + "step": 110210 + }, + { + "epoch": 7.4884495175975, + "grad_norm": 0.837287425994873, + "learning_rate": 6.427503736920778e-05, + "loss": 3.5901, + "step": 110215 + }, + { + "epoch": 7.488789237668161, + "grad_norm": 1.035495638847351, + "learning_rate": 6.423257236037506e-05, + "loss": 3.0856, + "step": 110220 + }, + { + "epoch": 7.489128957738823, + "grad_norm": 0.9369212985038757, + "learning_rate": 6.419010735154234e-05, + "loss": 3.4359, + "step": 110225 + }, + { + "epoch": 7.489468677809485, + "grad_norm": 0.9273168444633484, + "learning_rate": 6.41476423427096e-05, + "loss": 3.3936, + "step": 110230 + }, + { + "epoch": 7.489808397880147, + "grad_norm": 0.7566055655479431, + "learning_rate": 6.410517733387688e-05, + "loss": 3.4655, + "step": 110235 + }, + { + "epoch": 7.490148117950809, + "grad_norm": 0.8990804553031921, + "learning_rate": 6.406271232504416e-05, + "loss": 3.4301, + "step": 110240 + }, + { + "epoch": 7.490487838021471, + "grad_norm": 1.023098111152649, + "learning_rate": 6.402024731621146e-05, + "loss": 3.2066, + "step": 110245 + }, + { + "epoch": 7.490827558092132, + "grad_norm": 0.9973464012145996, + "learning_rate": 6.397778230737872e-05, + "loss": 3.3566, + "step": 110250 + }, + { + "epoch": 7.491167278162794, + "grad_norm": 1.1611179113388062, + "learning_rate": 6.3935317298546e-05, + "loss": 3.3808, + "step": 110255 + }, + { + "epoch": 7.491506998233455, + "grad_norm": 0.9120360016822815, + "learning_rate": 6.389285228971328e-05, + "loss": 3.4391, + "step": 110260 + }, + { + "epoch": 7.491846718304117, + "grad_norm": 1.0286924839019775, + "learning_rate": 6.385038728088055e-05, + "loss": 3.5121, + "step": 110265 + }, + { + "epoch": 7.492186438374779, + "grad_norm": 0.9827174544334412, + "learning_rate": 6.380792227204783e-05, + "loss": 3.4288, + "step": 110270 + }, + { + "epoch": 7.492526158445441, + "grad_norm": 0.880236029624939, + "learning_rate": 6.376545726321511e-05, + "loss": 3.4302, + "step": 110275 + }, + { + "epoch": 7.492865878516103, + "grad_norm": 1.096649408340454, + "learning_rate": 6.372299225438239e-05, + "loss": 3.245, + "step": 110280 + }, + { + "epoch": 7.493205598586765, + "grad_norm": 0.8872920274734497, + "learning_rate": 6.368052724554967e-05, + "loss": 3.3979, + "step": 110285 + }, + { + "epoch": 7.493545318657426, + "grad_norm": 0.9359025359153748, + "learning_rate": 6.363806223671695e-05, + "loss": 3.1619, + "step": 110290 + }, + { + "epoch": 7.493885038728088, + "grad_norm": 0.8797444105148315, + "learning_rate": 6.359559722788423e-05, + "loss": 3.3735, + "step": 110295 + }, + { + "epoch": 7.49422475879875, + "grad_norm": 1.035375714302063, + "learning_rate": 6.35531322190515e-05, + "loss": 3.3602, + "step": 110300 + }, + { + "epoch": 7.494564478869411, + "grad_norm": 0.9037024974822998, + "learning_rate": 6.351066721021878e-05, + "loss": 3.2976, + "step": 110305 + }, + { + "epoch": 7.494904198940073, + "grad_norm": 0.7767401933670044, + "learning_rate": 6.346820220138607e-05, + "loss": 3.2696, + "step": 110310 + }, + { + "epoch": 7.495243919010735, + "grad_norm": 1.2030326128005981, + "learning_rate": 6.342573719255334e-05, + "loss": 3.685, + "step": 110315 + }, + { + "epoch": 7.495583639081397, + "grad_norm": 0.8481517434120178, + "learning_rate": 6.338327218372062e-05, + "loss": 3.3921, + "step": 110320 + }, + { + "epoch": 7.495923359152059, + "grad_norm": 1.1049492359161377, + "learning_rate": 6.33408071748879e-05, + "loss": 3.2148, + "step": 110325 + }, + { + "epoch": 7.496263079222721, + "grad_norm": 0.8636043667793274, + "learning_rate": 6.329834216605518e-05, + "loss": 3.2136, + "step": 110330 + }, + { + "epoch": 7.496602799293382, + "grad_norm": 0.8711188435554504, + "learning_rate": 6.325587715722244e-05, + "loss": 3.4559, + "step": 110335 + }, + { + "epoch": 7.496942519364044, + "grad_norm": 0.8619717955589294, + "learning_rate": 6.321341214838972e-05, + "loss": 3.3034, + "step": 110340 + }, + { + "epoch": 7.497282239434706, + "grad_norm": 1.1422981023788452, + "learning_rate": 6.317094713955702e-05, + "loss": 3.3743, + "step": 110345 + }, + { + "epoch": 7.497621959505367, + "grad_norm": 0.7423680424690247, + "learning_rate": 6.312848213072428e-05, + "loss": 3.498, + "step": 110350 + }, + { + "epoch": 7.497961679576029, + "grad_norm": 1.0266309976577759, + "learning_rate": 6.308601712189156e-05, + "loss": 3.528, + "step": 110355 + }, + { + "epoch": 7.498301399646691, + "grad_norm": 0.9704815745353699, + "learning_rate": 6.304355211305884e-05, + "loss": 3.5483, + "step": 110360 + }, + { + "epoch": 7.498641119717353, + "grad_norm": 0.9125679135322571, + "learning_rate": 6.300108710422611e-05, + "loss": 3.3519, + "step": 110365 + }, + { + "epoch": 7.498980839788015, + "grad_norm": 1.139822244644165, + "learning_rate": 6.295862209539339e-05, + "loss": 3.4585, + "step": 110370 + }, + { + "epoch": 7.499320559858677, + "grad_norm": 0.9856573939323425, + "learning_rate": 6.291615708656068e-05, + "loss": 3.3339, + "step": 110375 + }, + { + "epoch": 7.499660279929338, + "grad_norm": 0.7942637205123901, + "learning_rate": 6.287369207772796e-05, + "loss": 3.5557, + "step": 110380 + }, + { + "epoch": 7.5, + "grad_norm": 0.9916671514511108, + "learning_rate": 6.283122706889523e-05, + "loss": 3.7491, + "step": 110385 + }, + { + "epoch": 7.500339720070662, + "grad_norm": 0.7415621876716614, + "learning_rate": 6.278876206006251e-05, + "loss": 3.4172, + "step": 110390 + }, + { + "epoch": 7.500679440141323, + "grad_norm": 0.8131020069122314, + "learning_rate": 6.274629705122979e-05, + "loss": 3.3942, + "step": 110395 + }, + { + "epoch": 7.501019160211985, + "grad_norm": 0.896213710308075, + "learning_rate": 6.270383204239706e-05, + "loss": 3.5058, + "step": 110400 + }, + { + "epoch": 7.501358880282647, + "grad_norm": 1.0481654405593872, + "learning_rate": 6.266136703356434e-05, + "loss": 3.2166, + "step": 110405 + }, + { + "epoch": 7.501698600353309, + "grad_norm": 0.9154471755027771, + "learning_rate": 6.261890202473163e-05, + "loss": 3.0462, + "step": 110410 + }, + { + "epoch": 7.502038320423971, + "grad_norm": 1.1406673192977905, + "learning_rate": 6.257643701589891e-05, + "loss": 3.0918, + "step": 110415 + }, + { + "epoch": 7.502378040494633, + "grad_norm": 0.9988096952438354, + "learning_rate": 6.253397200706618e-05, + "loss": 3.5199, + "step": 110420 + }, + { + "epoch": 7.502717760565294, + "grad_norm": 0.8388963937759399, + "learning_rate": 6.249150699823346e-05, + "loss": 3.4736, + "step": 110425 + }, + { + "epoch": 7.503057480635956, + "grad_norm": 0.8421375155448914, + "learning_rate": 6.244904198940074e-05, + "loss": 3.0672, + "step": 110430 + }, + { + "epoch": 7.503397200706618, + "grad_norm": 0.9604227542877197, + "learning_rate": 6.240657698056802e-05, + "loss": 3.6945, + "step": 110435 + }, + { + "epoch": 7.503736920777279, + "grad_norm": 0.951640248298645, + "learning_rate": 6.236411197173528e-05, + "loss": 3.0939, + "step": 110440 + }, + { + "epoch": 7.504076640847941, + "grad_norm": 0.7867865562438965, + "learning_rate": 6.232164696290258e-05, + "loss": 3.5141, + "step": 110445 + }, + { + "epoch": 7.504416360918603, + "grad_norm": 0.9086334109306335, + "learning_rate": 6.227918195406984e-05, + "loss": 3.4537, + "step": 110450 + }, + { + "epoch": 7.504756080989265, + "grad_norm": 0.7599518299102783, + "learning_rate": 6.223671694523712e-05, + "loss": 3.3693, + "step": 110455 + }, + { + "epoch": 7.505095801059927, + "grad_norm": 1.0128785371780396, + "learning_rate": 6.21942519364044e-05, + "loss": 2.902, + "step": 110460 + }, + { + "epoch": 7.505435521130589, + "grad_norm": 1.1277470588684082, + "learning_rate": 6.215178692757168e-05, + "loss": 3.3402, + "step": 110465 + }, + { + "epoch": 7.50577524120125, + "grad_norm": 0.8316221833229065, + "learning_rate": 6.210932191873896e-05, + "loss": 3.1388, + "step": 110470 + }, + { + "epoch": 7.506114961271912, + "grad_norm": 0.8446305394172668, + "learning_rate": 6.206685690990624e-05, + "loss": 3.4985, + "step": 110475 + }, + { + "epoch": 7.506454681342574, + "grad_norm": 0.8173556327819824, + "learning_rate": 6.202439190107351e-05, + "loss": 3.4591, + "step": 110480 + }, + { + "epoch": 7.506794401413235, + "grad_norm": 1.0729273557662964, + "learning_rate": 6.198192689224079e-05, + "loss": 3.4198, + "step": 110485 + }, + { + "epoch": 7.507134121483897, + "grad_norm": 0.8930612802505493, + "learning_rate": 6.193946188340807e-05, + "loss": 3.2589, + "step": 110490 + }, + { + "epoch": 7.5074738415545585, + "grad_norm": 0.8629117012023926, + "learning_rate": 6.189699687457535e-05, + "loss": 3.1707, + "step": 110495 + }, + { + "epoch": 7.507813561625221, + "grad_norm": 0.870652437210083, + "learning_rate": 6.185453186574263e-05, + "loss": 3.2528, + "step": 110500 + }, + { + "epoch": 7.508153281695883, + "grad_norm": 0.9957869052886963, + "learning_rate": 6.18120668569099e-05, + "loss": 3.5155, + "step": 110505 + }, + { + "epoch": 7.508493001766544, + "grad_norm": 0.911189079284668, + "learning_rate": 6.176960184807719e-05, + "loss": 3.4134, + "step": 110510 + }, + { + "epoch": 7.508832721837206, + "grad_norm": 1.0229395627975464, + "learning_rate": 6.172713683924446e-05, + "loss": 3.2755, + "step": 110515 + }, + { + "epoch": 7.509172441907868, + "grad_norm": 1.187605619430542, + "learning_rate": 6.168467183041174e-05, + "loss": 3.197, + "step": 110520 + }, + { + "epoch": 7.509512161978529, + "grad_norm": 1.0108492374420166, + "learning_rate": 6.164220682157902e-05, + "loss": 3.231, + "step": 110525 + }, + { + "epoch": 7.509851882049191, + "grad_norm": 1.4034297466278076, + "learning_rate": 6.15997418127463e-05, + "loss": 3.3995, + "step": 110530 + }, + { + "epoch": 7.510191602119853, + "grad_norm": 0.798566460609436, + "learning_rate": 6.155727680391358e-05, + "loss": 3.2372, + "step": 110535 + }, + { + "epoch": 7.510531322190515, + "grad_norm": 0.7469354867935181, + "learning_rate": 6.151481179508086e-05, + "loss": 3.5561, + "step": 110540 + }, + { + "epoch": 7.510871042261177, + "grad_norm": 1.2719950675964355, + "learning_rate": 6.147234678624814e-05, + "loss": 3.4162, + "step": 110545 + }, + { + "epoch": 7.511210762331839, + "grad_norm": 1.0321216583251953, + "learning_rate": 6.14298817774154e-05, + "loss": 3.4673, + "step": 110550 + }, + { + "epoch": 7.5115504824025, + "grad_norm": 1.3944116830825806, + "learning_rate": 6.13874167685827e-05, + "loss": 3.2426, + "step": 110555 + }, + { + "epoch": 7.511890202473162, + "grad_norm": 0.8952900767326355, + "learning_rate": 6.134495175974997e-05, + "loss": 3.5649, + "step": 110560 + }, + { + "epoch": 7.512229922543824, + "grad_norm": 1.205167293548584, + "learning_rate": 6.130248675091725e-05, + "loss": 3.6433, + "step": 110565 + }, + { + "epoch": 7.512569642614485, + "grad_norm": 0.802108108997345, + "learning_rate": 6.126002174208453e-05, + "loss": 3.2227, + "step": 110570 + }, + { + "epoch": 7.512909362685147, + "grad_norm": 1.3564757108688354, + "learning_rate": 6.12175567332518e-05, + "loss": 3.5059, + "step": 110575 + }, + { + "epoch": 7.513249082755809, + "grad_norm": 0.7106591463088989, + "learning_rate": 6.117509172441909e-05, + "loss": 3.6922, + "step": 110580 + }, + { + "epoch": 7.513588802826471, + "grad_norm": 1.012741208076477, + "learning_rate": 6.113262671558635e-05, + "loss": 3.5396, + "step": 110585 + }, + { + "epoch": 7.513928522897133, + "grad_norm": 1.0532689094543457, + "learning_rate": 6.109016170675363e-05, + "loss": 3.4035, + "step": 110590 + }, + { + "epoch": 7.514268242967795, + "grad_norm": 0.9844326972961426, + "learning_rate": 6.104769669792091e-05, + "loss": 3.1922, + "step": 110595 + }, + { + "epoch": 7.514607963038456, + "grad_norm": 0.8460561633110046, + "learning_rate": 6.100523168908819e-05, + "loss": 3.4807, + "step": 110600 + }, + { + "epoch": 7.514947683109118, + "grad_norm": 0.9211511611938477, + "learning_rate": 6.096276668025547e-05, + "loss": 3.3229, + "step": 110605 + }, + { + "epoch": 7.51528740317978, + "grad_norm": 1.0602363348007202, + "learning_rate": 6.0920301671422746e-05, + "loss": 3.4725, + "step": 110610 + }, + { + "epoch": 7.515627123250441, + "grad_norm": 0.9387175440788269, + "learning_rate": 6.087783666259003e-05, + "loss": 3.2633, + "step": 110615 + }, + { + "epoch": 7.515966843321103, + "grad_norm": 0.8427199721336365, + "learning_rate": 6.0835371653757306e-05, + "loss": 3.4414, + "step": 110620 + }, + { + "epoch": 7.516306563391765, + "grad_norm": 0.8096643090248108, + "learning_rate": 6.079290664492458e-05, + "loss": 3.2723, + "step": 110625 + }, + { + "epoch": 7.516646283462427, + "grad_norm": 0.8248152136802673, + "learning_rate": 6.0750441636091866e-05, + "loss": 3.4056, + "step": 110630 + }, + { + "epoch": 7.516986003533089, + "grad_norm": 0.835559070110321, + "learning_rate": 6.070797662725914e-05, + "loss": 3.1849, + "step": 110635 + }, + { + "epoch": 7.517325723603751, + "grad_norm": 0.9605305790901184, + "learning_rate": 6.066551161842642e-05, + "loss": 3.3084, + "step": 110640 + }, + { + "epoch": 7.517665443674412, + "grad_norm": 1.1913032531738281, + "learning_rate": 6.06230466095937e-05, + "loss": 3.3058, + "step": 110645 + }, + { + "epoch": 7.518005163745074, + "grad_norm": 1.1537522077560425, + "learning_rate": 6.058058160076097e-05, + "loss": 3.4338, + "step": 110650 + }, + { + "epoch": 7.518344883815736, + "grad_norm": 1.5156059265136719, + "learning_rate": 6.053811659192825e-05, + "loss": 3.4001, + "step": 110655 + }, + { + "epoch": 7.518684603886397, + "grad_norm": 0.7772120833396912, + "learning_rate": 6.0495651583095526e-05, + "loss": 3.2215, + "step": 110660 + }, + { + "epoch": 7.519024323957059, + "grad_norm": 0.9350413680076599, + "learning_rate": 6.045318657426281e-05, + "loss": 3.4903, + "step": 110665 + }, + { + "epoch": 7.519364044027721, + "grad_norm": 0.8059254884719849, + "learning_rate": 6.0410721565430086e-05, + "loss": 3.4313, + "step": 110670 + }, + { + "epoch": 7.519703764098383, + "grad_norm": 0.9018340706825256, + "learning_rate": 6.036825655659736e-05, + "loss": 3.1238, + "step": 110675 + }, + { + "epoch": 7.520043484169045, + "grad_norm": 0.956717848777771, + "learning_rate": 6.0325791547764646e-05, + "loss": 3.5442, + "step": 110680 + }, + { + "epoch": 7.520383204239707, + "grad_norm": 0.920565664768219, + "learning_rate": 6.028332653893192e-05, + "loss": 3.3921, + "step": 110685 + }, + { + "epoch": 7.520722924310368, + "grad_norm": 0.8320326209068298, + "learning_rate": 6.02408615300992e-05, + "loss": 3.3651, + "step": 110690 + }, + { + "epoch": 7.52106264438103, + "grad_norm": 0.9225030541419983, + "learning_rate": 6.019839652126648e-05, + "loss": 3.3828, + "step": 110695 + }, + { + "epoch": 7.521402364451692, + "grad_norm": 0.8586440682411194, + "learning_rate": 6.0164424514200304e-05, + "loss": 3.3369, + "step": 110700 + }, + { + "epoch": 7.521742084522353, + "grad_norm": 0.9437689185142517, + "learning_rate": 6.012195950536758e-05, + "loss": 3.2419, + "step": 110705 + }, + { + "epoch": 7.522081804593015, + "grad_norm": 0.9920357465744019, + "learning_rate": 6.007949449653485e-05, + "loss": 3.8357, + "step": 110710 + }, + { + "epoch": 7.522421524663677, + "grad_norm": 0.9931831955909729, + "learning_rate": 6.003702948770214e-05, + "loss": 3.6689, + "step": 110715 + }, + { + "epoch": 7.522761244734339, + "grad_norm": 0.9885028600692749, + "learning_rate": 5.999456447886941e-05, + "loss": 3.3589, + "step": 110720 + }, + { + "epoch": 7.523100964805001, + "grad_norm": 1.2361721992492676, + "learning_rate": 5.995209947003669e-05, + "loss": 3.4219, + "step": 110725 + }, + { + "epoch": 7.523440684875663, + "grad_norm": 1.0717439651489258, + "learning_rate": 5.990963446120397e-05, + "loss": 3.6992, + "step": 110730 + }, + { + "epoch": 7.523780404946324, + "grad_norm": 0.9085184335708618, + "learning_rate": 5.986716945237125e-05, + "loss": 3.5773, + "step": 110735 + }, + { + "epoch": 7.524120125016986, + "grad_norm": 1.004119634628296, + "learning_rate": 5.9824704443538524e-05, + "loss": 3.329, + "step": 110740 + }, + { + "epoch": 7.524459845087648, + "grad_norm": 0.7617980241775513, + "learning_rate": 5.9782239434705804e-05, + "loss": 3.2446, + "step": 110745 + }, + { + "epoch": 7.524799565158309, + "grad_norm": 1.0238404273986816, + "learning_rate": 5.9739774425873084e-05, + "loss": 3.2497, + "step": 110750 + }, + { + "epoch": 7.525139285228971, + "grad_norm": 0.9376500844955444, + "learning_rate": 5.969730941704036e-05, + "loss": 3.4396, + "step": 110755 + }, + { + "epoch": 7.525479005299633, + "grad_norm": 1.0900242328643799, + "learning_rate": 5.965484440820764e-05, + "loss": 3.28, + "step": 110760 + }, + { + "epoch": 7.525818725370295, + "grad_norm": 0.7351537346839905, + "learning_rate": 5.961237939937492e-05, + "loss": 3.4303, + "step": 110765 + }, + { + "epoch": 7.526158445440957, + "grad_norm": 0.8709346652030945, + "learning_rate": 5.956991439054219e-05, + "loss": 3.2901, + "step": 110770 + }, + { + "epoch": 7.526498165511619, + "grad_norm": 0.8774198293685913, + "learning_rate": 5.952744938170947e-05, + "loss": 3.5547, + "step": 110775 + }, + { + "epoch": 7.52683788558228, + "grad_norm": 1.2342363595962524, + "learning_rate": 5.948498437287675e-05, + "loss": 3.3735, + "step": 110780 + }, + { + "epoch": 7.527177605652942, + "grad_norm": 0.8935549259185791, + "learning_rate": 5.944251936404403e-05, + "loss": 3.3572, + "step": 110785 + }, + { + "epoch": 7.527517325723604, + "grad_norm": 1.2053388357162476, + "learning_rate": 5.9400054355211305e-05, + "loss": 3.2562, + "step": 110790 + }, + { + "epoch": 7.527857045794265, + "grad_norm": 0.7832306623458862, + "learning_rate": 5.9357589346378585e-05, + "loss": 3.5175, + "step": 110795 + }, + { + "epoch": 7.528196765864927, + "grad_norm": 1.8053730726242065, + "learning_rate": 5.9315124337545865e-05, + "loss": 3.6061, + "step": 110800 + }, + { + "epoch": 7.528536485935589, + "grad_norm": 0.8570696711540222, + "learning_rate": 5.927265932871314e-05, + "loss": 3.6088, + "step": 110805 + }, + { + "epoch": 7.528876206006251, + "grad_norm": 0.803434431552887, + "learning_rate": 5.9230194319880425e-05, + "loss": 3.2566, + "step": 110810 + }, + { + "epoch": 7.529215926076913, + "grad_norm": 1.1454607248306274, + "learning_rate": 5.91877293110477e-05, + "loss": 3.296, + "step": 110815 + }, + { + "epoch": 7.529555646147575, + "grad_norm": 1.0361809730529785, + "learning_rate": 5.914526430221498e-05, + "loss": 3.328, + "step": 110820 + }, + { + "epoch": 7.529895366218236, + "grad_norm": 0.842930018901825, + "learning_rate": 5.910279929338225e-05, + "loss": 3.1794, + "step": 110825 + }, + { + "epoch": 7.530235086288898, + "grad_norm": 1.8623378276824951, + "learning_rate": 5.906033428454953e-05, + "loss": 3.284, + "step": 110830 + }, + { + "epoch": 7.53057480635956, + "grad_norm": 0.7943780422210693, + "learning_rate": 5.901786927571681e-05, + "loss": 3.3899, + "step": 110835 + }, + { + "epoch": 7.530914526430221, + "grad_norm": 0.9994096159934998, + "learning_rate": 5.8975404266884085e-05, + "loss": 3.1665, + "step": 110840 + }, + { + "epoch": 7.531254246500883, + "grad_norm": 1.3846967220306396, + "learning_rate": 5.893293925805137e-05, + "loss": 3.3919, + "step": 110845 + }, + { + "epoch": 7.5315939665715455, + "grad_norm": 1.3458116054534912, + "learning_rate": 5.8890474249218645e-05, + "loss": 3.5961, + "step": 110850 + }, + { + "epoch": 7.531933686642207, + "grad_norm": 0.9948827624320984, + "learning_rate": 5.884800924038592e-05, + "loss": 3.2376, + "step": 110855 + }, + { + "epoch": 7.532273406712869, + "grad_norm": 1.366356611251831, + "learning_rate": 5.8805544231553205e-05, + "loss": 3.2612, + "step": 110860 + }, + { + "epoch": 7.532613126783531, + "grad_norm": 0.853091299533844, + "learning_rate": 5.876307922272048e-05, + "loss": 3.3448, + "step": 110865 + }, + { + "epoch": 7.532952846854192, + "grad_norm": 1.1183748245239258, + "learning_rate": 5.872061421388776e-05, + "loss": 3.2767, + "step": 110870 + }, + { + "epoch": 7.533292566924854, + "grad_norm": 0.7931148409843445, + "learning_rate": 5.867814920505504e-05, + "loss": 3.4608, + "step": 110875 + }, + { + "epoch": 7.533632286995516, + "grad_norm": 0.7956960797309875, + "learning_rate": 5.863568419622231e-05, + "loss": 3.2945, + "step": 110880 + }, + { + "epoch": 7.533972007066177, + "grad_norm": 0.9264063239097595, + "learning_rate": 5.859321918738959e-05, + "loss": 3.4772, + "step": 110885 + }, + { + "epoch": 7.534311727136839, + "grad_norm": 0.8222202062606812, + "learning_rate": 5.8550754178556866e-05, + "loss": 3.5438, + "step": 110890 + }, + { + "epoch": 7.5346514472075015, + "grad_norm": 0.9292075037956238, + "learning_rate": 5.850828916972415e-05, + "loss": 3.7885, + "step": 110895 + }, + { + "epoch": 7.534991167278163, + "grad_norm": 0.9123685956001282, + "learning_rate": 5.8465824160891426e-05, + "loss": 3.1663, + "step": 110900 + }, + { + "epoch": 7.535330887348825, + "grad_norm": 0.8801208734512329, + "learning_rate": 5.8423359152058706e-05, + "loss": 3.3379, + "step": 110905 + }, + { + "epoch": 7.535670607419487, + "grad_norm": 1.0789026021957397, + "learning_rate": 5.8380894143225986e-05, + "loss": 3.5919, + "step": 110910 + }, + { + "epoch": 7.536010327490148, + "grad_norm": 1.2814037799835205, + "learning_rate": 5.833842913439326e-05, + "loss": 3.5719, + "step": 110915 + }, + { + "epoch": 7.53635004756081, + "grad_norm": 0.8850480318069458, + "learning_rate": 5.829596412556054e-05, + "loss": 3.3842, + "step": 110920 + }, + { + "epoch": 7.536689767631472, + "grad_norm": 0.9480291604995728, + "learning_rate": 5.825349911672782e-05, + "loss": 3.1364, + "step": 110925 + }, + { + "epoch": 7.537029487702133, + "grad_norm": 0.9737577438354492, + "learning_rate": 5.82110341078951e-05, + "loss": 3.3555, + "step": 110930 + }, + { + "epoch": 7.537369207772795, + "grad_norm": 0.8187976479530334, + "learning_rate": 5.816856909906237e-05, + "loss": 3.5642, + "step": 110935 + }, + { + "epoch": 7.5377089278434575, + "grad_norm": 0.7971140742301941, + "learning_rate": 5.8126104090229646e-05, + "loss": 3.6754, + "step": 110940 + }, + { + "epoch": 7.538048647914119, + "grad_norm": 0.9135612845420837, + "learning_rate": 5.808363908139693e-05, + "loss": 3.6089, + "step": 110945 + }, + { + "epoch": 7.538388367984781, + "grad_norm": 1.0390366315841675, + "learning_rate": 5.8041174072564206e-05, + "loss": 3.2882, + "step": 110950 + }, + { + "epoch": 7.538728088055443, + "grad_norm": 1.2540656328201294, + "learning_rate": 5.7998709063731486e-05, + "loss": 3.5145, + "step": 110955 + }, + { + "epoch": 7.539067808126104, + "grad_norm": 0.8345974683761597, + "learning_rate": 5.7956244054898766e-05, + "loss": 3.2083, + "step": 110960 + }, + { + "epoch": 7.539407528196766, + "grad_norm": 0.867470383644104, + "learning_rate": 5.791377904606604e-05, + "loss": 3.3734, + "step": 110965 + }, + { + "epoch": 7.539747248267427, + "grad_norm": 0.8344741463661194, + "learning_rate": 5.787131403723332e-05, + "loss": 3.0591, + "step": 110970 + }, + { + "epoch": 7.540086968338089, + "grad_norm": 1.2050118446350098, + "learning_rate": 5.78288490284006e-05, + "loss": 3.4223, + "step": 110975 + }, + { + "epoch": 7.540426688408751, + "grad_norm": 1.0050663948059082, + "learning_rate": 5.778638401956788e-05, + "loss": 3.1616, + "step": 110980 + }, + { + "epoch": 7.540766408479413, + "grad_norm": 0.6758597493171692, + "learning_rate": 5.774391901073515e-05, + "loss": 3.354, + "step": 110985 + }, + { + "epoch": 7.541106128550075, + "grad_norm": 0.843966007232666, + "learning_rate": 5.770145400190244e-05, + "loss": 3.4655, + "step": 110990 + }, + { + "epoch": 7.541445848620737, + "grad_norm": 4.489269256591797, + "learning_rate": 5.765898899306971e-05, + "loss": 3.219, + "step": 110995 + }, + { + "epoch": 7.541785568691398, + "grad_norm": 1.0495887994766235, + "learning_rate": 5.7616523984236987e-05, + "loss": 3.3754, + "step": 111000 + }, + { + "epoch": 7.54212528876206, + "grad_norm": 1.0655415058135986, + "learning_rate": 5.757405897540427e-05, + "loss": 3.329, + "step": 111005 + }, + { + "epoch": 7.542465008832722, + "grad_norm": 0.9531832337379456, + "learning_rate": 5.753159396657155e-05, + "loss": 3.5093, + "step": 111010 + }, + { + "epoch": 7.542804728903383, + "grad_norm": 0.8792481422424316, + "learning_rate": 5.748912895773883e-05, + "loss": 3.3045, + "step": 111015 + }, + { + "epoch": 7.543144448974045, + "grad_norm": 0.7427186965942383, + "learning_rate": 5.74466639489061e-05, + "loss": 3.4255, + "step": 111020 + }, + { + "epoch": 7.543484169044707, + "grad_norm": 0.9242883324623108, + "learning_rate": 5.740419894007338e-05, + "loss": 3.306, + "step": 111025 + }, + { + "epoch": 7.543823889115369, + "grad_norm": 1.1567984819412231, + "learning_rate": 5.736173393124066e-05, + "loss": 3.2987, + "step": 111030 + }, + { + "epoch": 7.544163609186031, + "grad_norm": 0.9089877605438232, + "learning_rate": 5.7319268922407934e-05, + "loss": 3.3105, + "step": 111035 + }, + { + "epoch": 7.544503329256693, + "grad_norm": 1.1531163454055786, + "learning_rate": 5.727680391357522e-05, + "loss": 3.4237, + "step": 111040 + }, + { + "epoch": 7.544843049327354, + "grad_norm": 0.625120222568512, + "learning_rate": 5.7234338904742494e-05, + "loss": 3.1689, + "step": 111045 + }, + { + "epoch": 7.545182769398016, + "grad_norm": 0.8082393407821655, + "learning_rate": 5.719187389590977e-05, + "loss": 3.5555, + "step": 111050 + }, + { + "epoch": 7.545522489468678, + "grad_norm": 1.0093564987182617, + "learning_rate": 5.7149408887077054e-05, + "loss": 3.5367, + "step": 111055 + }, + { + "epoch": 7.545862209539339, + "grad_norm": 0.8401378989219666, + "learning_rate": 5.710694387824433e-05, + "loss": 3.2074, + "step": 111060 + }, + { + "epoch": 7.546201929610001, + "grad_norm": 0.9936879277229309, + "learning_rate": 5.706447886941161e-05, + "loss": 3.2405, + "step": 111065 + }, + { + "epoch": 7.546541649680663, + "grad_norm": 0.9898437261581421, + "learning_rate": 5.702201386057888e-05, + "loss": 3.3214, + "step": 111070 + }, + { + "epoch": 7.546881369751325, + "grad_norm": 1.016484260559082, + "learning_rate": 5.697954885174617e-05, + "loss": 3.5475, + "step": 111075 + }, + { + "epoch": 7.547221089821987, + "grad_norm": 0.8189226984977722, + "learning_rate": 5.693708384291344e-05, + "loss": 3.2665, + "step": 111080 + }, + { + "epoch": 7.547560809892649, + "grad_norm": 1.0417810678482056, + "learning_rate": 5.6894618834080714e-05, + "loss": 3.3395, + "step": 111085 + }, + { + "epoch": 7.54790052996331, + "grad_norm": 0.8991899490356445, + "learning_rate": 5.6852153825248e-05, + "loss": 3.5301, + "step": 111090 + }, + { + "epoch": 7.548240250033972, + "grad_norm": 0.9534642100334167, + "learning_rate": 5.6809688816415274e-05, + "loss": 3.5118, + "step": 111095 + }, + { + "epoch": 7.548579970104634, + "grad_norm": 0.863837718963623, + "learning_rate": 5.6767223807582554e-05, + "loss": 3.6653, + "step": 111100 + }, + { + "epoch": 7.548919690175295, + "grad_norm": 1.28590989112854, + "learning_rate": 5.6724758798749834e-05, + "loss": 3.4676, + "step": 111105 + }, + { + "epoch": 7.549259410245957, + "grad_norm": 0.9508373141288757, + "learning_rate": 5.668229378991711e-05, + "loss": 3.2652, + "step": 111110 + }, + { + "epoch": 7.5495991303166194, + "grad_norm": 1.00570809841156, + "learning_rate": 5.663982878108439e-05, + "loss": 3.1338, + "step": 111115 + }, + { + "epoch": 7.549938850387281, + "grad_norm": 0.9225771427154541, + "learning_rate": 5.659736377225166e-05, + "loss": 3.6795, + "step": 111120 + }, + { + "epoch": 7.550278570457943, + "grad_norm": 0.9781825542449951, + "learning_rate": 5.655489876341895e-05, + "loss": 3.3615, + "step": 111125 + }, + { + "epoch": 7.550618290528605, + "grad_norm": 0.7540979981422424, + "learning_rate": 5.651243375458622e-05, + "loss": 3.3011, + "step": 111130 + }, + { + "epoch": 7.550958010599266, + "grad_norm": 1.0694115161895752, + "learning_rate": 5.6469968745753494e-05, + "loss": 3.1466, + "step": 111135 + }, + { + "epoch": 7.551297730669928, + "grad_norm": 1.000286340713501, + "learning_rate": 5.642750373692078e-05, + "loss": 3.2748, + "step": 111140 + }, + { + "epoch": 7.55163745074059, + "grad_norm": 0.8480379581451416, + "learning_rate": 5.6385038728088055e-05, + "loss": 3.2217, + "step": 111145 + }, + { + "epoch": 7.551977170811251, + "grad_norm": 0.8654393553733826, + "learning_rate": 5.6342573719255335e-05, + "loss": 3.5001, + "step": 111150 + }, + { + "epoch": 7.552316890881913, + "grad_norm": 0.943507194519043, + "learning_rate": 5.6300108710422615e-05, + "loss": 3.4672, + "step": 111155 + }, + { + "epoch": 7.5526566109525755, + "grad_norm": 1.0692079067230225, + "learning_rate": 5.6257643701589895e-05, + "loss": 3.3204, + "step": 111160 + }, + { + "epoch": 7.552996331023237, + "grad_norm": 0.80223548412323, + "learning_rate": 5.621517869275717e-05, + "loss": 3.3976, + "step": 111165 + }, + { + "epoch": 7.553336051093899, + "grad_norm": 1.0105695724487305, + "learning_rate": 5.617271368392445e-05, + "loss": 3.4274, + "step": 111170 + }, + { + "epoch": 7.55367577116456, + "grad_norm": 0.7826060056686401, + "learning_rate": 5.613024867509173e-05, + "loss": 3.3007, + "step": 111175 + }, + { + "epoch": 7.554015491235222, + "grad_norm": 0.9786580801010132, + "learning_rate": 5.6087783666259e-05, + "loss": 3.351, + "step": 111180 + }, + { + "epoch": 7.554355211305884, + "grad_norm": 0.9193581342697144, + "learning_rate": 5.604531865742628e-05, + "loss": 3.3902, + "step": 111185 + }, + { + "epoch": 7.554694931376545, + "grad_norm": 0.9430224299430847, + "learning_rate": 5.600285364859356e-05, + "loss": 3.5122, + "step": 111190 + }, + { + "epoch": 7.555034651447207, + "grad_norm": 1.3244264125823975, + "learning_rate": 5.5960388639760835e-05, + "loss": 3.2166, + "step": 111195 + }, + { + "epoch": 7.555374371517869, + "grad_norm": 0.8973139524459839, + "learning_rate": 5.5917923630928115e-05, + "loss": 3.5163, + "step": 111200 + }, + { + "epoch": 7.555714091588531, + "grad_norm": 0.8164151310920715, + "learning_rate": 5.5875458622095395e-05, + "loss": 3.4583, + "step": 111205 + }, + { + "epoch": 7.556053811659193, + "grad_norm": 0.8496857285499573, + "learning_rate": 5.5832993613262675e-05, + "loss": 3.4694, + "step": 111210 + }, + { + "epoch": 7.556393531729855, + "grad_norm": 1.0808261632919312, + "learning_rate": 5.579052860442995e-05, + "loss": 3.131, + "step": 111215 + }, + { + "epoch": 7.556733251800516, + "grad_norm": 0.8490496277809143, + "learning_rate": 5.574806359559723e-05, + "loss": 3.2893, + "step": 111220 + }, + { + "epoch": 7.557072971871178, + "grad_norm": 0.8560026288032532, + "learning_rate": 5.570559858676451e-05, + "loss": 3.2903, + "step": 111225 + }, + { + "epoch": 7.55741269194184, + "grad_norm": 1.0869386196136475, + "learning_rate": 5.566313357793178e-05, + "loss": 3.291, + "step": 111230 + }, + { + "epoch": 7.557752412012501, + "grad_norm": 0.9720499515533447, + "learning_rate": 5.562066856909907e-05, + "loss": 3.1908, + "step": 111235 + }, + { + "epoch": 7.558092132083163, + "grad_norm": 0.8887346982955933, + "learning_rate": 5.557820356026634e-05, + "loss": 3.4955, + "step": 111240 + }, + { + "epoch": 7.558431852153825, + "grad_norm": 1.7528939247131348, + "learning_rate": 5.553573855143362e-05, + "loss": 3.438, + "step": 111245 + }, + { + "epoch": 7.558771572224487, + "grad_norm": 0.9449172019958496, + "learning_rate": 5.5493273542600896e-05, + "loss": 3.6225, + "step": 111250 + }, + { + "epoch": 7.559111292295149, + "grad_norm": 0.9753671288490295, + "learning_rate": 5.5450808533768176e-05, + "loss": 3.4991, + "step": 111255 + }, + { + "epoch": 7.559451012365811, + "grad_norm": 0.84330153465271, + "learning_rate": 5.5408343524935456e-05, + "loss": 3.4765, + "step": 111260 + }, + { + "epoch": 7.559790732436472, + "grad_norm": 0.8053187131881714, + "learning_rate": 5.536587851610273e-05, + "loss": 3.4732, + "step": 111265 + }, + { + "epoch": 7.560130452507134, + "grad_norm": 0.9485358595848083, + "learning_rate": 5.5323413507270016e-05, + "loss": 3.1892, + "step": 111270 + }, + { + "epoch": 7.560470172577796, + "grad_norm": 0.8992294669151306, + "learning_rate": 5.528094849843729e-05, + "loss": 3.4286, + "step": 111275 + }, + { + "epoch": 7.560809892648457, + "grad_norm": 0.774539589881897, + "learning_rate": 5.523848348960456e-05, + "loss": 3.4999, + "step": 111280 + }, + { + "epoch": 7.561149612719119, + "grad_norm": 0.8762194514274597, + "learning_rate": 5.519601848077185e-05, + "loss": 3.2983, + "step": 111285 + }, + { + "epoch": 7.561489332789781, + "grad_norm": 0.9701195955276489, + "learning_rate": 5.515355347193912e-05, + "loss": 3.5157, + "step": 111290 + }, + { + "epoch": 7.561829052860443, + "grad_norm": 0.8302828073501587, + "learning_rate": 5.51110884631064e-05, + "loss": 3.3158, + "step": 111295 + }, + { + "epoch": 7.562168772931105, + "grad_norm": 0.8757392168045044, + "learning_rate": 5.506862345427368e-05, + "loss": 3.7282, + "step": 111300 + }, + { + "epoch": 7.562508493001767, + "grad_norm": 0.9234779477119446, + "learning_rate": 5.5026158445440956e-05, + "loss": 3.3421, + "step": 111305 + }, + { + "epoch": 7.562848213072428, + "grad_norm": 0.918983519077301, + "learning_rate": 5.4983693436608236e-05, + "loss": 3.595, + "step": 111310 + }, + { + "epoch": 7.56318793314309, + "grad_norm": 1.3373476266860962, + "learning_rate": 5.494122842777551e-05, + "loss": 3.4707, + "step": 111315 + }, + { + "epoch": 7.563527653213752, + "grad_norm": 0.8898845911026001, + "learning_rate": 5.4898763418942796e-05, + "loss": 3.3766, + "step": 111320 + }, + { + "epoch": 7.563867373284413, + "grad_norm": 1.1520800590515137, + "learning_rate": 5.485629841011007e-05, + "loss": 3.3581, + "step": 111325 + }, + { + "epoch": 7.564207093355075, + "grad_norm": 0.9496495723724365, + "learning_rate": 5.481383340127735e-05, + "loss": 3.3219, + "step": 111330 + }, + { + "epoch": 7.564546813425737, + "grad_norm": 1.2607035636901855, + "learning_rate": 5.477136839244463e-05, + "loss": 3.4954, + "step": 111335 + }, + { + "epoch": 7.564886533496399, + "grad_norm": 0.9642074108123779, + "learning_rate": 5.47289033836119e-05, + "loss": 3.3991, + "step": 111340 + }, + { + "epoch": 7.565226253567061, + "grad_norm": 0.8146254420280457, + "learning_rate": 5.468643837477918e-05, + "loss": 3.5031, + "step": 111345 + }, + { + "epoch": 7.565565973637723, + "grad_norm": 1.0160869359970093, + "learning_rate": 5.464397336594646e-05, + "loss": 3.5101, + "step": 111350 + }, + { + "epoch": 7.565905693708384, + "grad_norm": 1.0078233480453491, + "learning_rate": 5.460150835711374e-05, + "loss": 3.2805, + "step": 111355 + }, + { + "epoch": 7.566245413779046, + "grad_norm": 0.9245506525039673, + "learning_rate": 5.4559043348281017e-05, + "loss": 3.368, + "step": 111360 + }, + { + "epoch": 7.566585133849708, + "grad_norm": 0.8413960337638855, + "learning_rate": 5.451657833944829e-05, + "loss": 3.3881, + "step": 111365 + }, + { + "epoch": 7.566924853920369, + "grad_norm": 0.9986164569854736, + "learning_rate": 5.447411333061558e-05, + "loss": 3.5234, + "step": 111370 + }, + { + "epoch": 7.567264573991031, + "grad_norm": 0.9047693610191345, + "learning_rate": 5.443164832178285e-05, + "loss": 3.4148, + "step": 111375 + }, + { + "epoch": 7.567604294061693, + "grad_norm": 1.645825982093811, + "learning_rate": 5.438918331295013e-05, + "loss": 3.4277, + "step": 111380 + }, + { + "epoch": 7.567944014132355, + "grad_norm": 0.8419675230979919, + "learning_rate": 5.434671830411741e-05, + "loss": 3.405, + "step": 111385 + }, + { + "epoch": 7.568283734203017, + "grad_norm": 0.9045330882072449, + "learning_rate": 5.4304253295284684e-05, + "loss": 3.4877, + "step": 111390 + }, + { + "epoch": 7.568623454273679, + "grad_norm": 0.9136260151863098, + "learning_rate": 5.4261788286451964e-05, + "loss": 3.5268, + "step": 111395 + }, + { + "epoch": 7.56896317434434, + "grad_norm": 0.9433954358100891, + "learning_rate": 5.4219323277619244e-05, + "loss": 3.5794, + "step": 111400 + }, + { + "epoch": 7.569302894415002, + "grad_norm": 3.7134499549865723, + "learning_rate": 5.4176858268786524e-05, + "loss": 3.4299, + "step": 111405 + }, + { + "epoch": 7.569642614485664, + "grad_norm": 0.772043764591217, + "learning_rate": 5.41343932599538e-05, + "loss": 3.3842, + "step": 111410 + }, + { + "epoch": 7.569982334556325, + "grad_norm": 0.8751406073570251, + "learning_rate": 5.409192825112108e-05, + "loss": 2.933, + "step": 111415 + }, + { + "epoch": 7.570322054626987, + "grad_norm": 0.7845754623413086, + "learning_rate": 5.404946324228836e-05, + "loss": 3.2955, + "step": 111420 + }, + { + "epoch": 7.5706617746976494, + "grad_norm": 0.9565984606742859, + "learning_rate": 5.400699823345563e-05, + "loss": 3.5833, + "step": 111425 + }, + { + "epoch": 7.571001494768311, + "grad_norm": 0.8577227592468262, + "learning_rate": 5.396453322462291e-05, + "loss": 3.5287, + "step": 111430 + }, + { + "epoch": 7.571341214838973, + "grad_norm": 1.1477410793304443, + "learning_rate": 5.392206821579019e-05, + "loss": 3.2326, + "step": 111435 + }, + { + "epoch": 7.571680934909635, + "grad_norm": 0.7696582674980164, + "learning_rate": 5.387960320695747e-05, + "loss": 3.4233, + "step": 111440 + }, + { + "epoch": 7.572020654980296, + "grad_norm": 1.6161779165267944, + "learning_rate": 5.3837138198124744e-05, + "loss": 3.6339, + "step": 111445 + }, + { + "epoch": 7.572360375050958, + "grad_norm": 0.8754577040672302, + "learning_rate": 5.3794673189292024e-05, + "loss": 3.1502, + "step": 111450 + }, + { + "epoch": 7.57270009512162, + "grad_norm": 0.8094170689582825, + "learning_rate": 5.3752208180459304e-05, + "loss": 3.5133, + "step": 111455 + }, + { + "epoch": 7.573039815192281, + "grad_norm": 0.8884057998657227, + "learning_rate": 5.370974317162658e-05, + "loss": 3.2627, + "step": 111460 + }, + { + "epoch": 7.573379535262943, + "grad_norm": 1.0266739130020142, + "learning_rate": 5.3667278162793864e-05, + "loss": 3.3535, + "step": 111465 + }, + { + "epoch": 7.5737192553336055, + "grad_norm": 1.0699812173843384, + "learning_rate": 5.362481315396114e-05, + "loss": 3.4099, + "step": 111470 + }, + { + "epoch": 7.574058975404267, + "grad_norm": 0.9310716986656189, + "learning_rate": 5.358234814512841e-05, + "loss": 3.227, + "step": 111475 + }, + { + "epoch": 7.574398695474929, + "grad_norm": 0.8921904563903809, + "learning_rate": 5.35398831362957e-05, + "loss": 3.6456, + "step": 111480 + }, + { + "epoch": 7.574738415545591, + "grad_norm": 0.7298526167869568, + "learning_rate": 5.349741812746297e-05, + "loss": 3.2223, + "step": 111485 + }, + { + "epoch": 7.575078135616252, + "grad_norm": 0.8072844743728638, + "learning_rate": 5.345495311863025e-05, + "loss": 3.4938, + "step": 111490 + }, + { + "epoch": 7.575417855686914, + "grad_norm": 1.1010662317276, + "learning_rate": 5.3412488109797524e-05, + "loss": 3.5153, + "step": 111495 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 0.8986167907714844, + "learning_rate": 5.3370023100964805e-05, + "loss": 3.1707, + "step": 111500 + }, + { + "epoch": 7.576097295828237, + "grad_norm": 0.9021571278572083, + "learning_rate": 5.3327558092132085e-05, + "loss": 3.4502, + "step": 111505 + }, + { + "epoch": 7.576437015898899, + "grad_norm": 1.212934970855713, + "learning_rate": 5.328509308329936e-05, + "loss": 3.4933, + "step": 111510 + }, + { + "epoch": 7.5767767359695615, + "grad_norm": 0.8588468432426453, + "learning_rate": 5.3242628074466645e-05, + "loss": 3.3089, + "step": 111515 + }, + { + "epoch": 7.577116456040223, + "grad_norm": 1.2466322183609009, + "learning_rate": 5.320016306563392e-05, + "loss": 3.5388, + "step": 111520 + }, + { + "epoch": 7.577456176110885, + "grad_norm": 0.8825560808181763, + "learning_rate": 5.31576980568012e-05, + "loss": 3.3369, + "step": 111525 + }, + { + "epoch": 7.577795896181547, + "grad_norm": 0.9915660619735718, + "learning_rate": 5.311523304796848e-05, + "loss": 3.4688, + "step": 111530 + }, + { + "epoch": 7.578135616252208, + "grad_norm": 0.9905868172645569, + "learning_rate": 5.307276803913575e-05, + "loss": 3.461, + "step": 111535 + }, + { + "epoch": 7.57847533632287, + "grad_norm": 1.0113292932510376, + "learning_rate": 5.303030303030303e-05, + "loss": 3.1873, + "step": 111540 + }, + { + "epoch": 7.578815056393532, + "grad_norm": 0.8788331151008606, + "learning_rate": 5.2987838021470305e-05, + "loss": 3.2923, + "step": 111545 + }, + { + "epoch": 7.579154776464193, + "grad_norm": 0.7085466980934143, + "learning_rate": 5.294537301263759e-05, + "loss": 3.2894, + "step": 111550 + }, + { + "epoch": 7.579494496534855, + "grad_norm": 1.0307868719100952, + "learning_rate": 5.2902908003804865e-05, + "loss": 3.3827, + "step": 111555 + }, + { + "epoch": 7.5798342166055175, + "grad_norm": 1.0198594331741333, + "learning_rate": 5.286044299497214e-05, + "loss": 3.2289, + "step": 111560 + }, + { + "epoch": 7.580173936676179, + "grad_norm": 1.0383124351501465, + "learning_rate": 5.2817977986139425e-05, + "loss": 3.3749, + "step": 111565 + }, + { + "epoch": 7.580513656746841, + "grad_norm": 0.8877254128456116, + "learning_rate": 5.27755129773067e-05, + "loss": 2.9964, + "step": 111570 + }, + { + "epoch": 7.580853376817503, + "grad_norm": 1.0877083539962769, + "learning_rate": 5.273304796847398e-05, + "loss": 3.3038, + "step": 111575 + }, + { + "epoch": 7.581193096888164, + "grad_norm": 0.8188464045524597, + "learning_rate": 5.269058295964126e-05, + "loss": 3.1898, + "step": 111580 + }, + { + "epoch": 7.581532816958826, + "grad_norm": 6.561679363250732, + "learning_rate": 5.264811795080853e-05, + "loss": 3.2926, + "step": 111585 + }, + { + "epoch": 7.581872537029488, + "grad_norm": 0.8498920798301697, + "learning_rate": 5.260565294197581e-05, + "loss": 3.4546, + "step": 111590 + }, + { + "epoch": 7.582212257100149, + "grad_norm": 0.8228119611740112, + "learning_rate": 5.256318793314309e-05, + "loss": 3.5209, + "step": 111595 + }, + { + "epoch": 7.582551977170811, + "grad_norm": 0.9705032110214233, + "learning_rate": 5.252072292431037e-05, + "loss": 3.4155, + "step": 111600 + }, + { + "epoch": 7.5828916972414735, + "grad_norm": 1.3414093255996704, + "learning_rate": 5.2478257915477646e-05, + "loss": 3.5394, + "step": 111605 + }, + { + "epoch": 7.583231417312135, + "grad_norm": 0.8677010536193848, + "learning_rate": 5.2435792906644926e-05, + "loss": 3.3525, + "step": 111610 + }, + { + "epoch": 7.583571137382797, + "grad_norm": 1.2310662269592285, + "learning_rate": 5.2393327897812206e-05, + "loss": 3.4963, + "step": 111615 + }, + { + "epoch": 7.583910857453459, + "grad_norm": 0.9524437785148621, + "learning_rate": 5.235086288897948e-05, + "loss": 3.3884, + "step": 111620 + }, + { + "epoch": 7.58425057752412, + "grad_norm": 0.9103891849517822, + "learning_rate": 5.230839788014676e-05, + "loss": 3.3761, + "step": 111625 + }, + { + "epoch": 7.584590297594782, + "grad_norm": 0.9338932633399963, + "learning_rate": 5.226593287131404e-05, + "loss": 3.6262, + "step": 111630 + }, + { + "epoch": 7.584930017665444, + "grad_norm": 0.9937441945075989, + "learning_rate": 5.222346786248132e-05, + "loss": 3.4482, + "step": 111635 + }, + { + "epoch": 7.585269737736105, + "grad_norm": 1.1434228420257568, + "learning_rate": 5.218100285364859e-05, + "loss": 3.2708, + "step": 111640 + }, + { + "epoch": 7.585609457806767, + "grad_norm": 0.8877071738243103, + "learning_rate": 5.213853784481587e-05, + "loss": 3.3725, + "step": 111645 + }, + { + "epoch": 7.585949177877429, + "grad_norm": 0.9027047157287598, + "learning_rate": 5.209607283598315e-05, + "loss": 3.2268, + "step": 111650 + }, + { + "epoch": 7.586288897948091, + "grad_norm": 1.3866093158721924, + "learning_rate": 5.2053607827150426e-05, + "loss": 3.403, + "step": 111655 + }, + { + "epoch": 7.586628618018753, + "grad_norm": 1.0705687999725342, + "learning_rate": 5.201114281831771e-05, + "loss": 3.5179, + "step": 111660 + }, + { + "epoch": 7.586968338089414, + "grad_norm": 1.1225770711898804, + "learning_rate": 5.1968677809484986e-05, + "loss": 3.4071, + "step": 111665 + }, + { + "epoch": 7.587308058160076, + "grad_norm": 2.525092601776123, + "learning_rate": 5.192621280065226e-05, + "loss": 3.0531, + "step": 111670 + }, + { + "epoch": 7.587647778230738, + "grad_norm": 0.847682774066925, + "learning_rate": 5.188374779181954e-05, + "loss": 3.4203, + "step": 111675 + }, + { + "epoch": 7.587987498301399, + "grad_norm": 0.7483988404273987, + "learning_rate": 5.184128278298682e-05, + "loss": 3.2988, + "step": 111680 + }, + { + "epoch": 7.588327218372061, + "grad_norm": 0.960634171962738, + "learning_rate": 5.17988177741541e-05, + "loss": 3.2916, + "step": 111685 + }, + { + "epoch": 7.588666938442723, + "grad_norm": 1.2309622764587402, + "learning_rate": 5.175635276532137e-05, + "loss": 3.3625, + "step": 111690 + }, + { + "epoch": 7.589006658513385, + "grad_norm": 1.1913366317749023, + "learning_rate": 5.171388775648866e-05, + "loss": 3.5785, + "step": 111695 + }, + { + "epoch": 7.589346378584047, + "grad_norm": 1.0453379154205322, + "learning_rate": 5.167142274765593e-05, + "loss": 3.2043, + "step": 111700 + }, + { + "epoch": 7.589686098654709, + "grad_norm": 0.8563163876533508, + "learning_rate": 5.1628957738823206e-05, + "loss": 3.4855, + "step": 111705 + }, + { + "epoch": 7.59002581872537, + "grad_norm": 0.8499231338500977, + "learning_rate": 5.158649272999049e-05, + "loss": 3.3411, + "step": 111710 + }, + { + "epoch": 7.590365538796032, + "grad_norm": 1.494600534439087, + "learning_rate": 5.1544027721157767e-05, + "loss": 3.3487, + "step": 111715 + }, + { + "epoch": 7.590705258866694, + "grad_norm": 4.613491535186768, + "learning_rate": 5.150156271232505e-05, + "loss": 3.5825, + "step": 111720 + }, + { + "epoch": 7.591044978937355, + "grad_norm": 0.8938926458358765, + "learning_rate": 5.145909770349232e-05, + "loss": 3.2963, + "step": 111725 + }, + { + "epoch": 7.591384699008017, + "grad_norm": 1.0224815607070923, + "learning_rate": 5.14166326946596e-05, + "loss": 3.15, + "step": 111730 + }, + { + "epoch": 7.5917244190786795, + "grad_norm": 1.1838222742080688, + "learning_rate": 5.137416768582688e-05, + "loss": 3.5697, + "step": 111735 + }, + { + "epoch": 7.592064139149341, + "grad_norm": 1.2228208780288696, + "learning_rate": 5.133170267699415e-05, + "loss": 3.4762, + "step": 111740 + }, + { + "epoch": 7.592403859220003, + "grad_norm": 1.0901402235031128, + "learning_rate": 5.128923766816144e-05, + "loss": 3.28, + "step": 111745 + }, + { + "epoch": 7.592743579290665, + "grad_norm": 1.1495977640151978, + "learning_rate": 5.1246772659328714e-05, + "loss": 3.288, + "step": 111750 + }, + { + "epoch": 7.593083299361326, + "grad_norm": 0.8784831166267395, + "learning_rate": 5.120430765049599e-05, + "loss": 3.3648, + "step": 111755 + }, + { + "epoch": 7.593423019431988, + "grad_norm": 0.8467528223991394, + "learning_rate": 5.1161842641663274e-05, + "loss": 3.2977, + "step": 111760 + }, + { + "epoch": 7.59376273950265, + "grad_norm": 0.8605762720108032, + "learning_rate": 5.111937763283055e-05, + "loss": 3.3139, + "step": 111765 + }, + { + "epoch": 7.594102459573311, + "grad_norm": 0.862257182598114, + "learning_rate": 5.107691262399783e-05, + "loss": 3.3829, + "step": 111770 + }, + { + "epoch": 7.594442179643973, + "grad_norm": 0.8904305696487427, + "learning_rate": 5.103444761516511e-05, + "loss": 3.45, + "step": 111775 + }, + { + "epoch": 7.5947818997146355, + "grad_norm": 1.91378915309906, + "learning_rate": 5.099198260633239e-05, + "loss": 3.359, + "step": 111780 + }, + { + "epoch": 7.595121619785297, + "grad_norm": 1.0890718698501587, + "learning_rate": 5.094951759749966e-05, + "loss": 3.3729, + "step": 111785 + }, + { + "epoch": 7.595461339855959, + "grad_norm": 1.4305864572525024, + "learning_rate": 5.0907052588666934e-05, + "loss": 3.2832, + "step": 111790 + }, + { + "epoch": 7.595801059926621, + "grad_norm": 0.8973947167396545, + "learning_rate": 5.086458757983422e-05, + "loss": 3.415, + "step": 111795 + }, + { + "epoch": 7.596140779997282, + "grad_norm": 0.9007610082626343, + "learning_rate": 5.0822122571001494e-05, + "loss": 3.5778, + "step": 111800 + }, + { + "epoch": 7.596480500067944, + "grad_norm": 1.0537998676300049, + "learning_rate": 5.0779657562168774e-05, + "loss": 3.3746, + "step": 111805 + }, + { + "epoch": 7.596820220138606, + "grad_norm": 1.0346812009811401, + "learning_rate": 5.0737192553336054e-05, + "loss": 3.4241, + "step": 111810 + }, + { + "epoch": 7.597159940209267, + "grad_norm": 0.8788004517555237, + "learning_rate": 5.069472754450333e-05, + "loss": 3.1497, + "step": 111815 + }, + { + "epoch": 7.597499660279929, + "grad_norm": 0.821405827999115, + "learning_rate": 5.065226253567061e-05, + "loss": 3.2077, + "step": 111820 + }, + { + "epoch": 7.5978393803505915, + "grad_norm": 1.0745303630828857, + "learning_rate": 5.060979752683789e-05, + "loss": 3.3639, + "step": 111825 + }, + { + "epoch": 7.598179100421253, + "grad_norm": 1.07894766330719, + "learning_rate": 5.056733251800517e-05, + "loss": 3.5371, + "step": 111830 + }, + { + "epoch": 7.598518820491915, + "grad_norm": 0.8472136855125427, + "learning_rate": 5.052486750917244e-05, + "loss": 3.5005, + "step": 111835 + }, + { + "epoch": 7.598858540562577, + "grad_norm": 0.7411198616027832, + "learning_rate": 5.048240250033972e-05, + "loss": 3.3937, + "step": 111840 + }, + { + "epoch": 7.599198260633238, + "grad_norm": 0.7545201182365417, + "learning_rate": 5.0439937491507e-05, + "loss": 3.2988, + "step": 111845 + }, + { + "epoch": 7.5995379807039, + "grad_norm": 1.2005561590194702, + "learning_rate": 5.0397472482674274e-05, + "loss": 3.0267, + "step": 111850 + }, + { + "epoch": 7.599877700774561, + "grad_norm": 1.0094791650772095, + "learning_rate": 5.0355007473841555e-05, + "loss": 3.3768, + "step": 111855 + }, + { + "epoch": 7.600217420845223, + "grad_norm": 0.9300236105918884, + "learning_rate": 5.0312542465008835e-05, + "loss": 3.4615, + "step": 111860 + }, + { + "epoch": 7.600557140915885, + "grad_norm": 1.022845983505249, + "learning_rate": 5.0270077456176115e-05, + "loss": 3.3383, + "step": 111865 + }, + { + "epoch": 7.600896860986547, + "grad_norm": 0.8161834478378296, + "learning_rate": 5.022761244734339e-05, + "loss": 3.3673, + "step": 111870 + }, + { + "epoch": 7.601236581057209, + "grad_norm": 0.8159050941467285, + "learning_rate": 5.018514743851067e-05, + "loss": 3.1513, + "step": 111875 + }, + { + "epoch": 7.601576301127871, + "grad_norm": 0.8434037566184998, + "learning_rate": 5.014268242967795e-05, + "loss": 3.5074, + "step": 111880 + }, + { + "epoch": 7.601916021198532, + "grad_norm": 0.9081565141677856, + "learning_rate": 5.010021742084522e-05, + "loss": 3.283, + "step": 111885 + }, + { + "epoch": 7.602255741269194, + "grad_norm": 1.1408895254135132, + "learning_rate": 5.005775241201251e-05, + "loss": 3.3213, + "step": 111890 + }, + { + "epoch": 7.602595461339856, + "grad_norm": 0.8830049633979797, + "learning_rate": 5.001528740317978e-05, + "loss": 3.3994, + "step": 111895 + }, + { + "epoch": 7.602935181410517, + "grad_norm": 0.8445941805839539, + "learning_rate": 4.9972822394347055e-05, + "loss": 3.2621, + "step": 111900 + }, + { + "epoch": 7.603274901481179, + "grad_norm": 0.9483040571212769, + "learning_rate": 4.993035738551434e-05, + "loss": 3.2662, + "step": 111905 + }, + { + "epoch": 7.603614621551841, + "grad_norm": 0.8904713988304138, + "learning_rate": 4.9887892376681615e-05, + "loss": 3.436, + "step": 111910 + }, + { + "epoch": 7.603954341622503, + "grad_norm": 1.455727458000183, + "learning_rate": 4.9845427367848895e-05, + "loss": 3.4396, + "step": 111915 + }, + { + "epoch": 7.604294061693165, + "grad_norm": 1.1814323663711548, + "learning_rate": 4.980296235901617e-05, + "loss": 3.4392, + "step": 111920 + }, + { + "epoch": 7.604633781763827, + "grad_norm": 4.007590293884277, + "learning_rate": 4.976049735018345e-05, + "loss": 3.3229, + "step": 111925 + }, + { + "epoch": 7.604973501834488, + "grad_norm": 0.9399138689041138, + "learning_rate": 4.971803234135073e-05, + "loss": 3.1534, + "step": 111930 + }, + { + "epoch": 7.60531322190515, + "grad_norm": 0.9473491311073303, + "learning_rate": 4.9675567332518e-05, + "loss": 3.2898, + "step": 111935 + }, + { + "epoch": 7.605652941975812, + "grad_norm": 0.9623449444770813, + "learning_rate": 4.963310232368529e-05, + "loss": 3.6166, + "step": 111940 + }, + { + "epoch": 7.605992662046473, + "grad_norm": 0.8717129826545715, + "learning_rate": 4.959063731485256e-05, + "loss": 3.5477, + "step": 111945 + }, + { + "epoch": 7.606332382117135, + "grad_norm": 0.8019974827766418, + "learning_rate": 4.954817230601984e-05, + "loss": 3.5078, + "step": 111950 + }, + { + "epoch": 7.606672102187797, + "grad_norm": 0.8498746752738953, + "learning_rate": 4.950570729718712e-05, + "loss": 3.4053, + "step": 111955 + }, + { + "epoch": 7.607011822258459, + "grad_norm": 0.7374544143676758, + "learning_rate": 4.9463242288354395e-05, + "loss": 3.1994, + "step": 111960 + }, + { + "epoch": 7.607351542329121, + "grad_norm": 0.7348116040229797, + "learning_rate": 4.9420777279521676e-05, + "loss": 3.3038, + "step": 111965 + }, + { + "epoch": 7.607691262399783, + "grad_norm": 0.9048084020614624, + "learning_rate": 4.937831227068895e-05, + "loss": 3.4312, + "step": 111970 + }, + { + "epoch": 7.608030982470444, + "grad_norm": 1.3439435958862305, + "learning_rate": 4.9335847261856236e-05, + "loss": 3.5021, + "step": 111975 + }, + { + "epoch": 7.608370702541106, + "grad_norm": 0.8323404788970947, + "learning_rate": 4.929338225302351e-05, + "loss": 3.2929, + "step": 111980 + }, + { + "epoch": 7.608710422611768, + "grad_norm": 1.1947228908538818, + "learning_rate": 4.925091724419078e-05, + "loss": 3.1677, + "step": 111985 + }, + { + "epoch": 7.609050142682429, + "grad_norm": 0.8015844821929932, + "learning_rate": 4.920845223535807e-05, + "loss": 3.4433, + "step": 111990 + }, + { + "epoch": 7.609389862753091, + "grad_norm": 0.7414008378982544, + "learning_rate": 4.916598722652534e-05, + "loss": 3.5866, + "step": 111995 + }, + { + "epoch": 7.6097295828237534, + "grad_norm": 0.9283313155174255, + "learning_rate": 4.912352221769262e-05, + "loss": 3.5361, + "step": 112000 + }, + { + "epoch": 7.610069302894415, + "grad_norm": 0.8616315722465515, + "learning_rate": 4.90810572088599e-05, + "loss": 3.326, + "step": 112005 + }, + { + "epoch": 7.610409022965077, + "grad_norm": 0.874891996383667, + "learning_rate": 4.9038592200027176e-05, + "loss": 3.5032, + "step": 112010 + }, + { + "epoch": 7.610748743035739, + "grad_norm": 0.9518083930015564, + "learning_rate": 4.8996127191194456e-05, + "loss": 2.8619, + "step": 112015 + }, + { + "epoch": 7.6110884631064, + "grad_norm": 0.876089870929718, + "learning_rate": 4.8953662182361736e-05, + "loss": 3.4652, + "step": 112020 + }, + { + "epoch": 7.611428183177062, + "grad_norm": 0.7142054438591003, + "learning_rate": 4.8911197173529016e-05, + "loss": 3.4936, + "step": 112025 + }, + { + "epoch": 7.611767903247724, + "grad_norm": 0.8993443250656128, + "learning_rate": 4.886873216469629e-05, + "loss": 3.5088, + "step": 112030 + }, + { + "epoch": 7.612107623318385, + "grad_norm": 0.8617344498634338, + "learning_rate": 4.882626715586357e-05, + "loss": 3.4827, + "step": 112035 + }, + { + "epoch": 7.612447343389047, + "grad_norm": 1.1400223970413208, + "learning_rate": 4.878380214703085e-05, + "loss": 3.1715, + "step": 112040 + }, + { + "epoch": 7.6127870634597095, + "grad_norm": 0.920464813709259, + "learning_rate": 4.874133713819812e-05, + "loss": 3.1511, + "step": 112045 + }, + { + "epoch": 7.613126783530371, + "grad_norm": 0.7666418552398682, + "learning_rate": 4.86988721293654e-05, + "loss": 3.3644, + "step": 112050 + }, + { + "epoch": 7.613466503601033, + "grad_norm": 0.9095754623413086, + "learning_rate": 4.865640712053268e-05, + "loss": 3.442, + "step": 112055 + }, + { + "epoch": 7.613806223671695, + "grad_norm": 0.9867074489593506, + "learning_rate": 4.861394211169996e-05, + "loss": 3.2286, + "step": 112060 + }, + { + "epoch": 7.614145943742356, + "grad_norm": 1.0355843305587769, + "learning_rate": 4.8571477102867236e-05, + "loss": 3.22, + "step": 112065 + }, + { + "epoch": 7.614485663813018, + "grad_norm": 0.986709475517273, + "learning_rate": 4.8529012094034517e-05, + "loss": 3.2804, + "step": 112070 + }, + { + "epoch": 7.61482538388368, + "grad_norm": 0.9287840127944946, + "learning_rate": 4.8486547085201797e-05, + "loss": 3.4674, + "step": 112075 + }, + { + "epoch": 7.615165103954341, + "grad_norm": 1.060194969177246, + "learning_rate": 4.844408207636907e-05, + "loss": 3.3764, + "step": 112080 + }, + { + "epoch": 7.615504824025003, + "grad_norm": 1.0524369478225708, + "learning_rate": 4.840161706753636e-05, + "loss": 3.4292, + "step": 112085 + }, + { + "epoch": 7.6158445440956655, + "grad_norm": 1.0536998510360718, + "learning_rate": 4.835915205870363e-05, + "loss": 3.4648, + "step": 112090 + }, + { + "epoch": 7.616184264166327, + "grad_norm": 0.8025508522987366, + "learning_rate": 4.83166870498709e-05, + "loss": 3.4927, + "step": 112095 + }, + { + "epoch": 7.616523984236989, + "grad_norm": 0.9857558608055115, + "learning_rate": 4.8274222041038183e-05, + "loss": 3.4723, + "step": 112100 + }, + { + "epoch": 7.616863704307651, + "grad_norm": 1.5598760843276978, + "learning_rate": 4.8231757032205463e-05, + "loss": 3.5294, + "step": 112105 + }, + { + "epoch": 7.617203424378312, + "grad_norm": 0.9723787903785706, + "learning_rate": 4.8189292023372744e-05, + "loss": 3.3691, + "step": 112110 + }, + { + "epoch": 7.617543144448974, + "grad_norm": 0.7758461833000183, + "learning_rate": 4.814682701454002e-05, + "loss": 3.3908, + "step": 112115 + }, + { + "epoch": 7.617882864519636, + "grad_norm": 0.7799140214920044, + "learning_rate": 4.8104362005707304e-05, + "loss": 3.4423, + "step": 112120 + }, + { + "epoch": 7.618222584590297, + "grad_norm": 0.7767892479896545, + "learning_rate": 4.806189699687458e-05, + "loss": 3.6456, + "step": 112125 + }, + { + "epoch": 7.618562304660959, + "grad_norm": 1.1116639375686646, + "learning_rate": 4.801943198804185e-05, + "loss": 3.2572, + "step": 112130 + }, + { + "epoch": 7.6189020247316215, + "grad_norm": 1.1703358888626099, + "learning_rate": 4.797696697920914e-05, + "loss": 3.636, + "step": 112135 + }, + { + "epoch": 7.619241744802283, + "grad_norm": 1.163792610168457, + "learning_rate": 4.793450197037641e-05, + "loss": 3.1832, + "step": 112140 + }, + { + "epoch": 7.619581464872945, + "grad_norm": 0.9477721452713013, + "learning_rate": 4.789203696154369e-05, + "loss": 3.4531, + "step": 112145 + }, + { + "epoch": 7.619921184943607, + "grad_norm": 1.1266587972640991, + "learning_rate": 4.7849571952710964e-05, + "loss": 3.2142, + "step": 112150 + }, + { + "epoch": 7.620260905014268, + "grad_norm": 1.0216593742370605, + "learning_rate": 4.7807106943878244e-05, + "loss": 3.4564, + "step": 112155 + }, + { + "epoch": 7.62060062508493, + "grad_norm": 0.9754804372787476, + "learning_rate": 4.7764641935045524e-05, + "loss": 3.4914, + "step": 112160 + }, + { + "epoch": 7.620940345155592, + "grad_norm": 1.0810072422027588, + "learning_rate": 4.77221769262128e-05, + "loss": 3.4988, + "step": 112165 + }, + { + "epoch": 7.621280065226253, + "grad_norm": 1.1578364372253418, + "learning_rate": 4.7679711917380084e-05, + "loss": 3.3172, + "step": 112170 + }, + { + "epoch": 7.621619785296915, + "grad_norm": 1.3192267417907715, + "learning_rate": 4.763724690854736e-05, + "loss": 3.4313, + "step": 112175 + }, + { + "epoch": 7.6219595053675775, + "grad_norm": 1.3605619668960571, + "learning_rate": 4.759478189971463e-05, + "loss": 3.2525, + "step": 112180 + }, + { + "epoch": 7.622299225438239, + "grad_norm": 0.8362947106361389, + "learning_rate": 4.755231689088192e-05, + "loss": 3.5242, + "step": 112185 + }, + { + "epoch": 7.622638945508901, + "grad_norm": 0.9542124271392822, + "learning_rate": 4.750985188204919e-05, + "loss": 3.5344, + "step": 112190 + }, + { + "epoch": 7.622978665579563, + "grad_norm": 0.9299943447113037, + "learning_rate": 4.746738687321647e-05, + "loss": 3.2205, + "step": 112195 + }, + { + "epoch": 7.623318385650224, + "grad_norm": 0.7342591285705566, + "learning_rate": 4.742492186438375e-05, + "loss": 3.383, + "step": 112200 + }, + { + "epoch": 7.623658105720886, + "grad_norm": 0.987119734287262, + "learning_rate": 4.738245685555103e-05, + "loss": 3.2459, + "step": 112205 + }, + { + "epoch": 7.623997825791548, + "grad_norm": 0.872288703918457, + "learning_rate": 4.7339991846718304e-05, + "loss": 3.1964, + "step": 112210 + }, + { + "epoch": 7.624337545862209, + "grad_norm": 1.005126953125, + "learning_rate": 4.729752683788558e-05, + "loss": 3.4557, + "step": 112215 + }, + { + "epoch": 7.624677265932871, + "grad_norm": 1.091690182685852, + "learning_rate": 4.7255061829052865e-05, + "loss": 3.3182, + "step": 112220 + }, + { + "epoch": 7.6250169860035335, + "grad_norm": 0.9722214341163635, + "learning_rate": 4.721259682022014e-05, + "loss": 3.3407, + "step": 112225 + }, + { + "epoch": 7.625356706074195, + "grad_norm": 0.9731531143188477, + "learning_rate": 4.717013181138742e-05, + "loss": 3.6553, + "step": 112230 + }, + { + "epoch": 7.625696426144857, + "grad_norm": 0.7474367022514343, + "learning_rate": 4.71276668025547e-05, + "loss": 3.4957, + "step": 112235 + }, + { + "epoch": 7.626036146215519, + "grad_norm": 0.9273325204849243, + "learning_rate": 4.708520179372197e-05, + "loss": 3.4212, + "step": 112240 + }, + { + "epoch": 7.62637586628618, + "grad_norm": 0.847760796546936, + "learning_rate": 4.704273678488925e-05, + "loss": 3.5361, + "step": 112245 + }, + { + "epoch": 7.626715586356842, + "grad_norm": 0.9177217483520508, + "learning_rate": 4.700027177605653e-05, + "loss": 3.5133, + "step": 112250 + }, + { + "epoch": 7.627055306427504, + "grad_norm": 1.049443006515503, + "learning_rate": 4.695780676722381e-05, + "loss": 3.1329, + "step": 112255 + }, + { + "epoch": 7.627395026498165, + "grad_norm": 0.7060555815696716, + "learning_rate": 4.6915341758391085e-05, + "loss": 3.4187, + "step": 112260 + }, + { + "epoch": 7.627734746568827, + "grad_norm": 1.1033433675765991, + "learning_rate": 4.6872876749558365e-05, + "loss": 3.4996, + "step": 112265 + }, + { + "epoch": 7.6280744666394895, + "grad_norm": 0.9744884371757507, + "learning_rate": 4.6830411740725645e-05, + "loss": 3.3258, + "step": 112270 + }, + { + "epoch": 7.628414186710151, + "grad_norm": 1.0072442293167114, + "learning_rate": 4.678794673189292e-05, + "loss": 3.2909, + "step": 112275 + }, + { + "epoch": 7.628753906780813, + "grad_norm": 1.1407057046890259, + "learning_rate": 4.67454817230602e-05, + "loss": 3.438, + "step": 112280 + }, + { + "epoch": 7.629093626851475, + "grad_norm": 0.9727713465690613, + "learning_rate": 4.670301671422748e-05, + "loss": 3.563, + "step": 112285 + }, + { + "epoch": 7.629433346922136, + "grad_norm": 0.915163516998291, + "learning_rate": 4.666055170539476e-05, + "loss": 3.1873, + "step": 112290 + }, + { + "epoch": 7.629773066992798, + "grad_norm": 1.052696943283081, + "learning_rate": 4.661808669656203e-05, + "loss": 3.3897, + "step": 112295 + }, + { + "epoch": 7.63011278706346, + "grad_norm": 1.9420828819274902, + "learning_rate": 4.657562168772931e-05, + "loss": 3.4814, + "step": 112300 + }, + { + "epoch": 7.630452507134121, + "grad_norm": 1.0550487041473389, + "learning_rate": 4.653315667889659e-05, + "loss": 3.3032, + "step": 112305 + }, + { + "epoch": 7.6307922272047835, + "grad_norm": 1.3471428155899048, + "learning_rate": 4.6490691670063865e-05, + "loss": 3.4322, + "step": 112310 + }, + { + "epoch": 7.6311319472754455, + "grad_norm": 1.2736315727233887, + "learning_rate": 4.644822666123115e-05, + "loss": 3.3725, + "step": 112315 + }, + { + "epoch": 7.631471667346107, + "grad_norm": 0.9450377821922302, + "learning_rate": 4.6405761652398426e-05, + "loss": 3.5209, + "step": 112320 + }, + { + "epoch": 7.631811387416769, + "grad_norm": 0.8291178941726685, + "learning_rate": 4.63632966435657e-05, + "loss": 3.3429, + "step": 112325 + }, + { + "epoch": 7.63215110748743, + "grad_norm": 1.7491694688796997, + "learning_rate": 4.632083163473298e-05, + "loss": 3.5902, + "step": 112330 + }, + { + "epoch": 7.632490827558092, + "grad_norm": 0.9486700296401978, + "learning_rate": 4.627836662590026e-05, + "loss": 3.439, + "step": 112335 + }, + { + "epoch": 7.632830547628754, + "grad_norm": 0.8513317704200745, + "learning_rate": 4.623590161706754e-05, + "loss": 3.4126, + "step": 112340 + }, + { + "epoch": 7.633170267699415, + "grad_norm": 1.0288217067718506, + "learning_rate": 4.619343660823481e-05, + "loss": 3.4264, + "step": 112345 + }, + { + "epoch": 7.633509987770077, + "grad_norm": 0.9110191464424133, + "learning_rate": 4.615097159940209e-05, + "loss": 3.4257, + "step": 112350 + }, + { + "epoch": 7.6338497078407395, + "grad_norm": 0.9484977126121521, + "learning_rate": 4.610850659056937e-05, + "loss": 3.3842, + "step": 112355 + }, + { + "epoch": 7.634189427911401, + "grad_norm": 1.1111654043197632, + "learning_rate": 4.6066041581736646e-05, + "loss": 3.4654, + "step": 112360 + }, + { + "epoch": 7.634529147982063, + "grad_norm": 0.8591316938400269, + "learning_rate": 4.602357657290393e-05, + "loss": 3.1863, + "step": 112365 + }, + { + "epoch": 7.634868868052725, + "grad_norm": 0.9713543057441711, + "learning_rate": 4.5981111564071206e-05, + "loss": 3.4315, + "step": 112370 + }, + { + "epoch": 7.635208588123386, + "grad_norm": 0.8042253851890564, + "learning_rate": 4.5938646555238486e-05, + "loss": 3.5013, + "step": 112375 + }, + { + "epoch": 7.635548308194048, + "grad_norm": 1.5437589883804321, + "learning_rate": 4.5896181546405766e-05, + "loss": 3.5092, + "step": 112380 + }, + { + "epoch": 7.63588802826471, + "grad_norm": 1.5969674587249756, + "learning_rate": 4.585371653757304e-05, + "loss": 3.2842, + "step": 112385 + }, + { + "epoch": 7.636227748335371, + "grad_norm": 0.9451215863227844, + "learning_rate": 4.581125152874032e-05, + "loss": 3.1548, + "step": 112390 + }, + { + "epoch": 7.636567468406033, + "grad_norm": 0.9443419575691223, + "learning_rate": 4.576878651990759e-05, + "loss": 3.4615, + "step": 112395 + }, + { + "epoch": 7.6369071884766955, + "grad_norm": 1.018707513809204, + "learning_rate": 4.572632151107488e-05, + "loss": 3.158, + "step": 112400 + }, + { + "epoch": 7.637246908547357, + "grad_norm": 1.0428764820098877, + "learning_rate": 4.568385650224215e-05, + "loss": 3.5943, + "step": 112405 + }, + { + "epoch": 7.637586628618019, + "grad_norm": 0.8654230237007141, + "learning_rate": 4.5641391493409426e-05, + "loss": 3.3703, + "step": 112410 + }, + { + "epoch": 7.637926348688681, + "grad_norm": 0.896723747253418, + "learning_rate": 4.559892648457671e-05, + "loss": 3.2491, + "step": 112415 + }, + { + "epoch": 7.638266068759342, + "grad_norm": 0.918653666973114, + "learning_rate": 4.5556461475743986e-05, + "loss": 3.393, + "step": 112420 + }, + { + "epoch": 7.638605788830004, + "grad_norm": 1.0434272289276123, + "learning_rate": 4.5513996466911266e-05, + "loss": 3.4244, + "step": 112425 + }, + { + "epoch": 7.638945508900666, + "grad_norm": 0.894179105758667, + "learning_rate": 4.5471531458078547e-05, + "loss": 3.479, + "step": 112430 + }, + { + "epoch": 7.639285228971327, + "grad_norm": 0.8864219784736633, + "learning_rate": 4.542906644924582e-05, + "loss": 3.1746, + "step": 112435 + }, + { + "epoch": 7.639624949041989, + "grad_norm": 1.1889787912368774, + "learning_rate": 4.53866014404131e-05, + "loss": 3.4651, + "step": 112440 + }, + { + "epoch": 7.6399646691126515, + "grad_norm": 0.9762519598007202, + "learning_rate": 4.534413643158038e-05, + "loss": 3.504, + "step": 112445 + }, + { + "epoch": 7.640304389183313, + "grad_norm": 1.4742532968521118, + "learning_rate": 4.530167142274766e-05, + "loss": 3.4249, + "step": 112450 + }, + { + "epoch": 7.640644109253975, + "grad_norm": 0.8919811844825745, + "learning_rate": 4.525920641391493e-05, + "loss": 3.3695, + "step": 112455 + }, + { + "epoch": 7.640983829324637, + "grad_norm": 1.308939814567566, + "learning_rate": 4.5216741405082213e-05, + "loss": 3.2338, + "step": 112460 + }, + { + "epoch": 7.641323549395298, + "grad_norm": 0.9449737668037415, + "learning_rate": 4.5174276396249494e-05, + "loss": 3.3733, + "step": 112465 + }, + { + "epoch": 7.64166326946596, + "grad_norm": 0.9351301193237305, + "learning_rate": 4.513181138741677e-05, + "loss": 3.3421, + "step": 112470 + }, + { + "epoch": 7.642002989536622, + "grad_norm": 0.8596168756484985, + "learning_rate": 4.508934637858405e-05, + "loss": 3.4448, + "step": 112475 + }, + { + "epoch": 7.642342709607283, + "grad_norm": 0.9862120747566223, + "learning_rate": 4.504688136975133e-05, + "loss": 3.4419, + "step": 112480 + }, + { + "epoch": 7.642682429677945, + "grad_norm": 1.1253989934921265, + "learning_rate": 4.500441636091861e-05, + "loss": 3.4772, + "step": 112485 + }, + { + "epoch": 7.6430221497486075, + "grad_norm": 4.240890979766846, + "learning_rate": 4.496195135208588e-05, + "loss": 2.9634, + "step": 112490 + }, + { + "epoch": 7.643361869819269, + "grad_norm": 1.0301920175552368, + "learning_rate": 4.491948634325316e-05, + "loss": 3.3386, + "step": 112495 + }, + { + "epoch": 7.643701589889931, + "grad_norm": 0.7789915800094604, + "learning_rate": 4.487702133442044e-05, + "loss": 3.5837, + "step": 112500 + }, + { + "epoch": 7.644041309960593, + "grad_norm": 0.850872278213501, + "learning_rate": 4.4834556325587714e-05, + "loss": 3.4934, + "step": 112505 + }, + { + "epoch": 7.644381030031254, + "grad_norm": 0.707161545753479, + "learning_rate": 4.4792091316755e-05, + "loss": 3.4338, + "step": 112510 + }, + { + "epoch": 7.644720750101916, + "grad_norm": 0.7604393362998962, + "learning_rate": 4.4749626307922274e-05, + "loss": 3.4184, + "step": 112515 + }, + { + "epoch": 7.645060470172578, + "grad_norm": 0.9857166409492493, + "learning_rate": 4.470716129908955e-05, + "loss": 3.3164, + "step": 112520 + }, + { + "epoch": 7.645400190243239, + "grad_norm": 1.016910195350647, + "learning_rate": 4.466469629025683e-05, + "loss": 3.5492, + "step": 112525 + }, + { + "epoch": 7.645739910313901, + "grad_norm": 1.1485960483551025, + "learning_rate": 4.462223128142411e-05, + "loss": 3.3615, + "step": 112530 + }, + { + "epoch": 7.6460796303845635, + "grad_norm": 0.7990475296974182, + "learning_rate": 4.457976627259139e-05, + "loss": 3.5327, + "step": 112535 + }, + { + "epoch": 7.646419350455225, + "grad_norm": 0.8459970951080322, + "learning_rate": 4.453730126375866e-05, + "loss": 3.1178, + "step": 112540 + }, + { + "epoch": 7.646759070525887, + "grad_norm": 1.0571739673614502, + "learning_rate": 4.449483625492595e-05, + "loss": 3.4907, + "step": 112545 + }, + { + "epoch": 7.647098790596548, + "grad_norm": 0.8733468055725098, + "learning_rate": 4.445237124609322e-05, + "loss": 3.6334, + "step": 112550 + }, + { + "epoch": 7.64743851066721, + "grad_norm": 1.0533652305603027, + "learning_rate": 4.4409906237260494e-05, + "loss": 3.1691, + "step": 112555 + }, + { + "epoch": 7.647778230737872, + "grad_norm": 0.7356808185577393, + "learning_rate": 4.436744122842778e-05, + "loss": 3.4243, + "step": 112560 + }, + { + "epoch": 7.648117950808533, + "grad_norm": 1.3763104677200317, + "learning_rate": 4.4324976219595054e-05, + "loss": 3.1269, + "step": 112565 + }, + { + "epoch": 7.648457670879195, + "grad_norm": 0.819363534450531, + "learning_rate": 4.4282511210762334e-05, + "loss": 3.3679, + "step": 112570 + }, + { + "epoch": 7.6487973909498574, + "grad_norm": 0.878258466720581, + "learning_rate": 4.424004620192961e-05, + "loss": 3.3776, + "step": 112575 + }, + { + "epoch": 7.649137111020519, + "grad_norm": 0.976390540599823, + "learning_rate": 4.419758119309689e-05, + "loss": 3.4717, + "step": 112580 + }, + { + "epoch": 7.649476831091181, + "grad_norm": 0.8392893075942993, + "learning_rate": 4.415511618426417e-05, + "loss": 3.3623, + "step": 112585 + }, + { + "epoch": 7.649816551161843, + "grad_norm": 0.957385241985321, + "learning_rate": 4.411265117543144e-05, + "loss": 3.57, + "step": 112590 + }, + { + "epoch": 7.650156271232504, + "grad_norm": 0.8398355841636658, + "learning_rate": 4.407018616659873e-05, + "loss": 3.6076, + "step": 112595 + }, + { + "epoch": 7.650495991303166, + "grad_norm": 1.0080901384353638, + "learning_rate": 4.4027721157766e-05, + "loss": 3.2721, + "step": 112600 + }, + { + "epoch": 7.650835711373828, + "grad_norm": 3.352134943008423, + "learning_rate": 4.3985256148933275e-05, + "loss": 3.4258, + "step": 112605 + }, + { + "epoch": 7.651175431444489, + "grad_norm": 0.7516959309577942, + "learning_rate": 4.394279114010056e-05, + "loss": 3.1632, + "step": 112610 + }, + { + "epoch": 7.651515151515151, + "grad_norm": 0.8226331472396851, + "learning_rate": 4.3900326131267835e-05, + "loss": 3.5364, + "step": 112615 + }, + { + "epoch": 7.6518548715858135, + "grad_norm": 0.8597267270088196, + "learning_rate": 4.3857861122435115e-05, + "loss": 3.0733, + "step": 112620 + }, + { + "epoch": 7.652194591656475, + "grad_norm": 1.3243303298950195, + "learning_rate": 4.3815396113602395e-05, + "loss": 3.4285, + "step": 112625 + }, + { + "epoch": 7.652534311727137, + "grad_norm": 0.9364092350006104, + "learning_rate": 4.3772931104769675e-05, + "loss": 3.4681, + "step": 112630 + }, + { + "epoch": 7.652874031797799, + "grad_norm": 1.0284308195114136, + "learning_rate": 4.373046609593695e-05, + "loss": 3.2223, + "step": 112635 + }, + { + "epoch": 7.65321375186846, + "grad_norm": 1.0039489269256592, + "learning_rate": 4.368800108710422e-05, + "loss": 3.572, + "step": 112640 + }, + { + "epoch": 7.653553471939122, + "grad_norm": 0.9975035786628723, + "learning_rate": 4.364553607827151e-05, + "loss": 3.3938, + "step": 112645 + }, + { + "epoch": 7.653893192009784, + "grad_norm": 1.1554728746414185, + "learning_rate": 4.360307106943878e-05, + "loss": 3.5087, + "step": 112650 + }, + { + "epoch": 7.654232912080445, + "grad_norm": 0.973671555519104, + "learning_rate": 4.356060606060606e-05, + "loss": 3.4153, + "step": 112655 + }, + { + "epoch": 7.654572632151107, + "grad_norm": 1.0698546171188354, + "learning_rate": 4.351814105177334e-05, + "loss": 3.2998, + "step": 112660 + }, + { + "epoch": 7.6549123522217695, + "grad_norm": 1.1218070983886719, + "learning_rate": 4.3475676042940615e-05, + "loss": 3.2038, + "step": 112665 + }, + { + "epoch": 7.655252072292431, + "grad_norm": 0.9450260996818542, + "learning_rate": 4.3433211034107895e-05, + "loss": 3.0995, + "step": 112670 + }, + { + "epoch": 7.655591792363093, + "grad_norm": 0.932754635810852, + "learning_rate": 4.3390746025275175e-05, + "loss": 3.6155, + "step": 112675 + }, + { + "epoch": 7.655931512433755, + "grad_norm": 1.2132648229599, + "learning_rate": 4.3348281016442456e-05, + "loss": 3.3786, + "step": 112680 + }, + { + "epoch": 7.656271232504416, + "grad_norm": 1.0078388452529907, + "learning_rate": 4.330581600760973e-05, + "loss": 3.2622, + "step": 112685 + }, + { + "epoch": 7.656610952575078, + "grad_norm": 0.8480902314186096, + "learning_rate": 4.3263350998777e-05, + "loss": 3.3761, + "step": 112690 + }, + { + "epoch": 7.65695067264574, + "grad_norm": 1.091733455657959, + "learning_rate": 4.322088598994429e-05, + "loss": 3.3946, + "step": 112695 + }, + { + "epoch": 7.657290392716401, + "grad_norm": 0.8963373303413391, + "learning_rate": 4.317842098111156e-05, + "loss": 3.2568, + "step": 112700 + }, + { + "epoch": 7.657630112787063, + "grad_norm": 1.3333582878112793, + "learning_rate": 4.313595597227884e-05, + "loss": 3.3039, + "step": 112705 + }, + { + "epoch": 7.6579698328577255, + "grad_norm": 0.9199469089508057, + "learning_rate": 4.309349096344612e-05, + "loss": 3.3972, + "step": 112710 + }, + { + "epoch": 7.658309552928387, + "grad_norm": 1.0723237991333008, + "learning_rate": 4.30510259546134e-05, + "loss": 3.5547, + "step": 112715 + }, + { + "epoch": 7.658649272999049, + "grad_norm": 1.2305182218551636, + "learning_rate": 4.3008560945780676e-05, + "loss": 3.7139, + "step": 112720 + }, + { + "epoch": 7.658988993069711, + "grad_norm": 0.9661731719970703, + "learning_rate": 4.2966095936947956e-05, + "loss": 3.1741, + "step": 112725 + }, + { + "epoch": 7.659328713140372, + "grad_norm": 1.031707525253296, + "learning_rate": 4.2923630928115236e-05, + "loss": 3.215, + "step": 112730 + }, + { + "epoch": 7.659668433211034, + "grad_norm": 1.0153179168701172, + "learning_rate": 4.288116591928251e-05, + "loss": 3.4102, + "step": 112735 + }, + { + "epoch": 7.660008153281696, + "grad_norm": 1.136208176612854, + "learning_rate": 4.2838700910449796e-05, + "loss": 3.6313, + "step": 112740 + }, + { + "epoch": 7.660347873352357, + "grad_norm": 0.9128114581108093, + "learning_rate": 4.279623590161707e-05, + "loss": 3.639, + "step": 112745 + }, + { + "epoch": 7.660687593423019, + "grad_norm": 0.9586009979248047, + "learning_rate": 4.275377089278434e-05, + "loss": 3.3664, + "step": 112750 + }, + { + "epoch": 7.6610273134936815, + "grad_norm": 1.0555378198623657, + "learning_rate": 4.271130588395162e-05, + "loss": 3.5643, + "step": 112755 + }, + { + "epoch": 7.661367033564343, + "grad_norm": 1.8690283298492432, + "learning_rate": 4.26688408751189e-05, + "loss": 3.2544, + "step": 112760 + }, + { + "epoch": 7.661706753635005, + "grad_norm": 0.8876920342445374, + "learning_rate": 4.262637586628618e-05, + "loss": 3.2734, + "step": 112765 + }, + { + "epoch": 7.662046473705667, + "grad_norm": 0.8330904245376587, + "learning_rate": 4.2583910857453456e-05, + "loss": 3.389, + "step": 112770 + }, + { + "epoch": 7.662386193776328, + "grad_norm": 0.7301376461982727, + "learning_rate": 4.2541445848620736e-05, + "loss": 3.5636, + "step": 112775 + }, + { + "epoch": 7.66272591384699, + "grad_norm": 0.8928303718566895, + "learning_rate": 4.2498980839788016e-05, + "loss": 3.2915, + "step": 112780 + }, + { + "epoch": 7.663065633917652, + "grad_norm": 0.9029821753501892, + "learning_rate": 4.245651583095529e-05, + "loss": 3.2941, + "step": 112785 + }, + { + "epoch": 7.663405353988313, + "grad_norm": 0.9035652279853821, + "learning_rate": 4.2414050822122577e-05, + "loss": 3.3708, + "step": 112790 + }, + { + "epoch": 7.663745074058975, + "grad_norm": 1.0718276500701904, + "learning_rate": 4.237158581328985e-05, + "loss": 3.4418, + "step": 112795 + }, + { + "epoch": 7.6640847941296375, + "grad_norm": 0.8075514435768127, + "learning_rate": 4.232912080445713e-05, + "loss": 3.4775, + "step": 112800 + }, + { + "epoch": 7.664424514200299, + "grad_norm": 0.9213892817497253, + "learning_rate": 4.228665579562441e-05, + "loss": 3.5593, + "step": 112805 + }, + { + "epoch": 7.664764234270961, + "grad_norm": 0.9576359391212463, + "learning_rate": 4.224419078679168e-05, + "loss": 3.1486, + "step": 112810 + }, + { + "epoch": 7.665103954341623, + "grad_norm": 1.2321370840072632, + "learning_rate": 4.2201725777958963e-05, + "loss": 3.4101, + "step": 112815 + }, + { + "epoch": 7.665443674412284, + "grad_norm": 0.9551894664764404, + "learning_rate": 4.215926076912624e-05, + "loss": 3.3776, + "step": 112820 + }, + { + "epoch": 7.665783394482946, + "grad_norm": 0.8403181433677673, + "learning_rate": 4.2116795760293524e-05, + "loss": 3.2923, + "step": 112825 + }, + { + "epoch": 7.666123114553608, + "grad_norm": 0.9445260763168335, + "learning_rate": 4.20743307514608e-05, + "loss": 3.0693, + "step": 112830 + }, + { + "epoch": 7.666462834624269, + "grad_norm": 0.777859628200531, + "learning_rate": 4.203186574262807e-05, + "loss": 3.5267, + "step": 112835 + }, + { + "epoch": 7.666802554694931, + "grad_norm": 0.8916822075843811, + "learning_rate": 4.198940073379536e-05, + "loss": 3.4932, + "step": 112840 + }, + { + "epoch": 7.6671422747655935, + "grad_norm": 0.8733360767364502, + "learning_rate": 4.194693572496263e-05, + "loss": 3.4036, + "step": 112845 + }, + { + "epoch": 7.667481994836255, + "grad_norm": 0.8671924471855164, + "learning_rate": 4.190447071612991e-05, + "loss": 3.2843, + "step": 112850 + }, + { + "epoch": 7.667821714906917, + "grad_norm": 0.8920717835426331, + "learning_rate": 4.186200570729719e-05, + "loss": 3.0527, + "step": 112855 + }, + { + "epoch": 7.668161434977579, + "grad_norm": 1.174516201019287, + "learning_rate": 4.1819540698464464e-05, + "loss": 3.4222, + "step": 112860 + }, + { + "epoch": 7.66850115504824, + "grad_norm": 1.0901272296905518, + "learning_rate": 4.1777075689631744e-05, + "loss": 3.465, + "step": 112865 + }, + { + "epoch": 7.668840875118902, + "grad_norm": 1.360144853591919, + "learning_rate": 4.1734610680799024e-05, + "loss": 3.2923, + "step": 112870 + }, + { + "epoch": 7.669180595189564, + "grad_norm": 0.9995095729827881, + "learning_rate": 4.1692145671966304e-05, + "loss": 3.4649, + "step": 112875 + }, + { + "epoch": 7.669520315260225, + "grad_norm": 1.0363340377807617, + "learning_rate": 4.164968066313358e-05, + "loss": 3.3011, + "step": 112880 + }, + { + "epoch": 7.6698600353308874, + "grad_norm": 1.1341469287872314, + "learning_rate": 4.160721565430086e-05, + "loss": 3.4051, + "step": 112885 + }, + { + "epoch": 7.6701997554015495, + "grad_norm": 0.8289105296134949, + "learning_rate": 4.156475064546814e-05, + "loss": 3.4918, + "step": 112890 + }, + { + "epoch": 7.670539475472211, + "grad_norm": 0.9683694839477539, + "learning_rate": 4.152228563663541e-05, + "loss": 3.48, + "step": 112895 + }, + { + "epoch": 7.670879195542873, + "grad_norm": 0.814932107925415, + "learning_rate": 4.147982062780269e-05, + "loss": 3.3926, + "step": 112900 + }, + { + "epoch": 7.671218915613535, + "grad_norm": 0.9518396258354187, + "learning_rate": 4.143735561896997e-05, + "loss": 3.0654, + "step": 112905 + }, + { + "epoch": 7.671558635684196, + "grad_norm": 1.0044931173324585, + "learning_rate": 4.139489061013725e-05, + "loss": 3.2266, + "step": 112910 + }, + { + "epoch": 7.671898355754858, + "grad_norm": 0.9125984907150269, + "learning_rate": 4.1352425601304524e-05, + "loss": 3.5538, + "step": 112915 + }, + { + "epoch": 7.67223807582552, + "grad_norm": 0.9238818287849426, + "learning_rate": 4.1309960592471804e-05, + "loss": 3.5695, + "step": 112920 + }, + { + "epoch": 7.672577795896181, + "grad_norm": 1.0175492763519287, + "learning_rate": 4.1267495583639084e-05, + "loss": 3.5015, + "step": 112925 + }, + { + "epoch": 7.6729175159668435, + "grad_norm": 1.8322104215621948, + "learning_rate": 4.122503057480636e-05, + "loss": 3.3712, + "step": 112930 + }, + { + "epoch": 7.6732572360375055, + "grad_norm": 1.1931766271591187, + "learning_rate": 4.118256556597364e-05, + "loss": 3.2023, + "step": 112935 + }, + { + "epoch": 7.673596956108167, + "grad_norm": 0.857399582862854, + "learning_rate": 4.114010055714092e-05, + "loss": 3.255, + "step": 112940 + }, + { + "epoch": 7.673936676178829, + "grad_norm": 0.8397573232650757, + "learning_rate": 4.109763554830819e-05, + "loss": 3.0632, + "step": 112945 + }, + { + "epoch": 7.674276396249491, + "grad_norm": 0.8912802338600159, + "learning_rate": 4.105517053947547e-05, + "loss": 3.5373, + "step": 112950 + }, + { + "epoch": 7.674616116320152, + "grad_norm": 1.235787034034729, + "learning_rate": 4.101270553064275e-05, + "loss": 3.4509, + "step": 112955 + }, + { + "epoch": 7.674955836390814, + "grad_norm": 1.3107123374938965, + "learning_rate": 4.097024052181003e-05, + "loss": 3.299, + "step": 112960 + }, + { + "epoch": 7.675295556461476, + "grad_norm": 0.9448580145835876, + "learning_rate": 4.0927775512977305e-05, + "loss": 3.2885, + "step": 112965 + }, + { + "epoch": 7.675635276532137, + "grad_norm": 1.1512495279312134, + "learning_rate": 4.088531050414459e-05, + "loss": 3.2997, + "step": 112970 + }, + { + "epoch": 7.6759749966027995, + "grad_norm": 0.8451576232910156, + "learning_rate": 4.0842845495311865e-05, + "loss": 3.1879, + "step": 112975 + }, + { + "epoch": 7.6763147166734615, + "grad_norm": 1.4072993993759155, + "learning_rate": 4.080038048647914e-05, + "loss": 3.4831, + "step": 112980 + }, + { + "epoch": 7.676654436744123, + "grad_norm": 1.0157920122146606, + "learning_rate": 4.0757915477646425e-05, + "loss": 3.2441, + "step": 112985 + }, + { + "epoch": 7.676994156814785, + "grad_norm": 1.0246602296829224, + "learning_rate": 4.07154504688137e-05, + "loss": 3.3296, + "step": 112990 + }, + { + "epoch": 7.677333876885447, + "grad_norm": 0.9796385765075684, + "learning_rate": 4.067298545998098e-05, + "loss": 3.4867, + "step": 112995 + }, + { + "epoch": 7.677673596956108, + "grad_norm": 0.7980547547340393, + "learning_rate": 4.063052045114825e-05, + "loss": 3.1946, + "step": 113000 + }, + { + "epoch": 7.67801331702677, + "grad_norm": 0.8483682870864868, + "learning_rate": 4.058805544231553e-05, + "loss": 3.486, + "step": 113005 + }, + { + "epoch": 7.678353037097431, + "grad_norm": 0.9986150860786438, + "learning_rate": 4.054559043348281e-05, + "loss": 3.3023, + "step": 113010 + }, + { + "epoch": 7.678692757168093, + "grad_norm": 0.928788959980011, + "learning_rate": 4.0503125424650085e-05, + "loss": 3.4921, + "step": 113015 + }, + { + "epoch": 7.6790324772387555, + "grad_norm": 0.9477344155311584, + "learning_rate": 4.046066041581737e-05, + "loss": 3.2937, + "step": 113020 + }, + { + "epoch": 7.679372197309417, + "grad_norm": 0.8922954201698303, + "learning_rate": 4.0418195406984645e-05, + "loss": 3.2674, + "step": 113025 + }, + { + "epoch": 7.679711917380079, + "grad_norm": 0.8210869431495667, + "learning_rate": 4.037573039815192e-05, + "loss": 3.2998, + "step": 113030 + }, + { + "epoch": 7.680051637450741, + "grad_norm": 1.149309515953064, + "learning_rate": 4.0333265389319205e-05, + "loss": 3.4171, + "step": 113035 + }, + { + "epoch": 7.680391357521402, + "grad_norm": 0.9490342140197754, + "learning_rate": 4.029080038048648e-05, + "loss": 3.4797, + "step": 113040 + }, + { + "epoch": 7.680731077592064, + "grad_norm": 0.9403334259986877, + "learning_rate": 4.024833537165376e-05, + "loss": 3.2798, + "step": 113045 + }, + { + "epoch": 7.681070797662726, + "grad_norm": 0.939726710319519, + "learning_rate": 4.020587036282104e-05, + "loss": 3.3073, + "step": 113050 + }, + { + "epoch": 7.681410517733387, + "grad_norm": 0.9404070973396301, + "learning_rate": 4.016340535398832e-05, + "loss": 3.3941, + "step": 113055 + }, + { + "epoch": 7.681750237804049, + "grad_norm": 0.9614078998565674, + "learning_rate": 4.012094034515559e-05, + "loss": 3.4664, + "step": 113060 + }, + { + "epoch": 7.6820899578747115, + "grad_norm": 0.9895490407943726, + "learning_rate": 4.0078475336322866e-05, + "loss": 3.3842, + "step": 113065 + }, + { + "epoch": 7.682429677945373, + "grad_norm": 1.038095235824585, + "learning_rate": 4.003601032749015e-05, + "loss": 3.5857, + "step": 113070 + }, + { + "epoch": 7.682769398016035, + "grad_norm": 0.9369269013404846, + "learning_rate": 3.9993545318657426e-05, + "loss": 3.3932, + "step": 113075 + }, + { + "epoch": 7.683109118086697, + "grad_norm": 0.9462903141975403, + "learning_rate": 3.9951080309824706e-05, + "loss": 3.3195, + "step": 113080 + }, + { + "epoch": 7.683448838157358, + "grad_norm": 0.8750750422477722, + "learning_rate": 3.9908615300991986e-05, + "loss": 3.5872, + "step": 113085 + }, + { + "epoch": 7.68378855822802, + "grad_norm": 1.0248388051986694, + "learning_rate": 3.986615029215926e-05, + "loss": 3.2959, + "step": 113090 + }, + { + "epoch": 7.684128278298682, + "grad_norm": 0.9286130666732788, + "learning_rate": 3.982368528332654e-05, + "loss": 3.4287, + "step": 113095 + }, + { + "epoch": 7.684467998369343, + "grad_norm": 0.900779664516449, + "learning_rate": 3.978122027449382e-05, + "loss": 3.1682, + "step": 113100 + }, + { + "epoch": 7.684807718440005, + "grad_norm": 1.2934554815292358, + "learning_rate": 3.97387552656611e-05, + "loss": 3.3543, + "step": 113105 + }, + { + "epoch": 7.6851474385106675, + "grad_norm": 0.9959975481033325, + "learning_rate": 3.969629025682837e-05, + "loss": 3.2576, + "step": 113110 + }, + { + "epoch": 7.685487158581329, + "grad_norm": 1.2035260200500488, + "learning_rate": 3.9653825247995646e-05, + "loss": 3.388, + "step": 113115 + }, + { + "epoch": 7.685826878651991, + "grad_norm": 1.0120289325714111, + "learning_rate": 3.961136023916293e-05, + "loss": 3.2993, + "step": 113120 + }, + { + "epoch": 7.686166598722653, + "grad_norm": 1.5670620203018188, + "learning_rate": 3.9568895230330206e-05, + "loss": 3.3283, + "step": 113125 + }, + { + "epoch": 7.686506318793314, + "grad_norm": 0.9593925476074219, + "learning_rate": 3.9526430221497486e-05, + "loss": 3.2616, + "step": 113130 + }, + { + "epoch": 7.686846038863976, + "grad_norm": 1.2878154516220093, + "learning_rate": 3.9483965212664766e-05, + "loss": 3.0274, + "step": 113135 + }, + { + "epoch": 7.687185758934638, + "grad_norm": 0.9538037776947021, + "learning_rate": 3.9441500203832046e-05, + "loss": 3.456, + "step": 113140 + }, + { + "epoch": 7.687525479005299, + "grad_norm": 0.8095225095748901, + "learning_rate": 3.939903519499932e-05, + "loss": 3.2501, + "step": 113145 + }, + { + "epoch": 7.687865199075961, + "grad_norm": 1.2917673587799072, + "learning_rate": 3.93565701861666e-05, + "loss": 3.4888, + "step": 113150 + }, + { + "epoch": 7.6882049191466235, + "grad_norm": 0.8511213660240173, + "learning_rate": 3.931410517733388e-05, + "loss": 3.5501, + "step": 113155 + }, + { + "epoch": 7.688544639217285, + "grad_norm": 1.3562476634979248, + "learning_rate": 3.927164016850115e-05, + "loss": 3.2988, + "step": 113160 + }, + { + "epoch": 7.688884359287947, + "grad_norm": 0.9853160381317139, + "learning_rate": 3.922917515966844e-05, + "loss": 3.2819, + "step": 113165 + }, + { + "epoch": 7.689224079358609, + "grad_norm": 1.134445309638977, + "learning_rate": 3.918671015083571e-05, + "loss": 3.332, + "step": 113170 + }, + { + "epoch": 7.68956379942927, + "grad_norm": 0.8684777021408081, + "learning_rate": 3.914424514200299e-05, + "loss": 3.5523, + "step": 113175 + }, + { + "epoch": 7.689903519499932, + "grad_norm": 0.8354480266571045, + "learning_rate": 3.910178013317027e-05, + "loss": 3.6025, + "step": 113180 + }, + { + "epoch": 7.690243239570594, + "grad_norm": 0.9430517554283142, + "learning_rate": 3.905931512433755e-05, + "loss": 3.31, + "step": 113185 + }, + { + "epoch": 7.690582959641255, + "grad_norm": 0.9144879579544067, + "learning_rate": 3.901685011550483e-05, + "loss": 3.4369, + "step": 113190 + }, + { + "epoch": 7.6909226797119175, + "grad_norm": 1.0657507181167603, + "learning_rate": 3.89743851066721e-05, + "loss": 3.2388, + "step": 113195 + }, + { + "epoch": 7.6912623997825795, + "grad_norm": 0.8273041248321533, + "learning_rate": 3.893192009783938e-05, + "loss": 3.0971, + "step": 113200 + }, + { + "epoch": 7.691602119853241, + "grad_norm": 1.4062763452529907, + "learning_rate": 3.888945508900666e-05, + "loss": 3.2315, + "step": 113205 + }, + { + "epoch": 7.691941839923903, + "grad_norm": 1.0692888498306274, + "learning_rate": 3.8846990080173934e-05, + "loss": 3.4034, + "step": 113210 + }, + { + "epoch": 7.692281559994565, + "grad_norm": 0.8667792081832886, + "learning_rate": 3.880452507134122e-05, + "loss": 3.3264, + "step": 113215 + }, + { + "epoch": 7.692621280065226, + "grad_norm": 1.0293762683868408, + "learning_rate": 3.8762060062508494e-05, + "loss": 3.5407, + "step": 113220 + }, + { + "epoch": 7.692961000135888, + "grad_norm": 0.9623515009880066, + "learning_rate": 3.871959505367577e-05, + "loss": 3.3736, + "step": 113225 + }, + { + "epoch": 7.693300720206549, + "grad_norm": 0.8669413328170776, + "learning_rate": 3.8677130044843054e-05, + "loss": 3.3059, + "step": 113230 + }, + { + "epoch": 7.693640440277211, + "grad_norm": 0.8650485873222351, + "learning_rate": 3.863466503601033e-05, + "loss": 3.2607, + "step": 113235 + }, + { + "epoch": 7.6939801603478735, + "grad_norm": 1.044545292854309, + "learning_rate": 3.859220002717761e-05, + "loss": 3.5635, + "step": 113240 + }, + { + "epoch": 7.694319880418535, + "grad_norm": 1.1584144830703735, + "learning_rate": 3.854973501834488e-05, + "loss": 3.5005, + "step": 113245 + }, + { + "epoch": 7.694659600489197, + "grad_norm": 1.0153440237045288, + "learning_rate": 3.850727000951217e-05, + "loss": 3.1795, + "step": 113250 + }, + { + "epoch": 7.694999320559859, + "grad_norm": 0.8653181195259094, + "learning_rate": 3.846480500067944e-05, + "loss": 3.2539, + "step": 113255 + }, + { + "epoch": 7.69533904063052, + "grad_norm": 0.9077422618865967, + "learning_rate": 3.8422339991846714e-05, + "loss": 3.2715, + "step": 113260 + }, + { + "epoch": 7.695678760701182, + "grad_norm": 1.2083402872085571, + "learning_rate": 3.8379874983014e-05, + "loss": 3.2622, + "step": 113265 + }, + { + "epoch": 7.696018480771844, + "grad_norm": 1.1180464029312134, + "learning_rate": 3.8337409974181274e-05, + "loss": 3.3639, + "step": 113270 + }, + { + "epoch": 7.696358200842505, + "grad_norm": 0.8795891404151917, + "learning_rate": 3.8294944965348554e-05, + "loss": 3.5851, + "step": 113275 + }, + { + "epoch": 7.696697920913167, + "grad_norm": 1.128968358039856, + "learning_rate": 3.8252479956515834e-05, + "loss": 3.497, + "step": 113280 + }, + { + "epoch": 7.6970376409838295, + "grad_norm": 0.9003632068634033, + "learning_rate": 3.821001494768311e-05, + "loss": 3.3919, + "step": 113285 + }, + { + "epoch": 7.697377361054491, + "grad_norm": 1.3509232997894287, + "learning_rate": 3.816754993885039e-05, + "loss": 3.3478, + "step": 113290 + }, + { + "epoch": 7.697717081125153, + "grad_norm": 0.9917703866958618, + "learning_rate": 3.812508493001766e-05, + "loss": 3.2531, + "step": 113295 + }, + { + "epoch": 7.698056801195815, + "grad_norm": 0.7568669319152832, + "learning_rate": 3.808261992118495e-05, + "loss": 3.1931, + "step": 113300 + }, + { + "epoch": 7.698396521266476, + "grad_norm": 1.2081174850463867, + "learning_rate": 3.804015491235222e-05, + "loss": 3.3819, + "step": 113305 + }, + { + "epoch": 7.698736241337138, + "grad_norm": 1.0999376773834229, + "learning_rate": 3.7997689903519495e-05, + "loss": 3.3612, + "step": 113310 + }, + { + "epoch": 7.6990759614078, + "grad_norm": 0.9313483238220215, + "learning_rate": 3.795522489468678e-05, + "loss": 3.4227, + "step": 113315 + }, + { + "epoch": 7.699415681478461, + "grad_norm": 1.0805236101150513, + "learning_rate": 3.7912759885854055e-05, + "loss": 3.282, + "step": 113320 + }, + { + "epoch": 7.699755401549123, + "grad_norm": 0.8462879657745361, + "learning_rate": 3.7870294877021335e-05, + "loss": 3.4327, + "step": 113325 + }, + { + "epoch": 7.7000951216197855, + "grad_norm": 0.9555248022079468, + "learning_rate": 3.7827829868188615e-05, + "loss": 3.3129, + "step": 113330 + }, + { + "epoch": 7.700434841690447, + "grad_norm": 0.9631436467170715, + "learning_rate": 3.7785364859355895e-05, + "loss": 3.4142, + "step": 113335 + }, + { + "epoch": 7.700774561761109, + "grad_norm": 0.8791877627372742, + "learning_rate": 3.774289985052317e-05, + "loss": 3.2875, + "step": 113340 + }, + { + "epoch": 7.701114281831771, + "grad_norm": 1.4759364128112793, + "learning_rate": 3.770043484169045e-05, + "loss": 3.3838, + "step": 113345 + }, + { + "epoch": 7.701454001902432, + "grad_norm": 1.328500509262085, + "learning_rate": 3.765796983285773e-05, + "loss": 3.5052, + "step": 113350 + }, + { + "epoch": 7.701793721973094, + "grad_norm": 0.925998866558075, + "learning_rate": 3.7615504824025e-05, + "loss": 3.58, + "step": 113355 + }, + { + "epoch": 7.702133442043756, + "grad_norm": 0.8228958249092102, + "learning_rate": 3.757303981519228e-05, + "loss": 3.31, + "step": 113360 + }, + { + "epoch": 7.702473162114417, + "grad_norm": 1.0621278285980225, + "learning_rate": 3.753057480635956e-05, + "loss": 2.9731, + "step": 113365 + }, + { + "epoch": 7.702812882185079, + "grad_norm": 1.2556978464126587, + "learning_rate": 3.7488109797526835e-05, + "loss": 3.535, + "step": 113370 + }, + { + "epoch": 7.7031526022557415, + "grad_norm": 0.94618159532547, + "learning_rate": 3.7445644788694115e-05, + "loss": 3.3438, + "step": 113375 + }, + { + "epoch": 7.703492322326403, + "grad_norm": 0.9277669191360474, + "learning_rate": 3.7403179779861395e-05, + "loss": 3.4661, + "step": 113380 + }, + { + "epoch": 7.703832042397065, + "grad_norm": 1.2715619802474976, + "learning_rate": 3.7360714771028675e-05, + "loss": 3.6947, + "step": 113385 + }, + { + "epoch": 7.704171762467727, + "grad_norm": 0.9162868857383728, + "learning_rate": 3.731824976219595e-05, + "loss": 3.6956, + "step": 113390 + }, + { + "epoch": 7.704511482538388, + "grad_norm": 0.8811008334159851, + "learning_rate": 3.727578475336323e-05, + "loss": 3.3296, + "step": 113395 + }, + { + "epoch": 7.70485120260905, + "grad_norm": 0.8755283355712891, + "learning_rate": 3.723331974453051e-05, + "loss": 3.3265, + "step": 113400 + }, + { + "epoch": 7.705190922679712, + "grad_norm": 0.9015448689460754, + "learning_rate": 3.719085473569778e-05, + "loss": 3.4949, + "step": 113405 + }, + { + "epoch": 7.705530642750373, + "grad_norm": 1.084763765335083, + "learning_rate": 3.714838972686507e-05, + "loss": 3.5328, + "step": 113410 + }, + { + "epoch": 7.705870362821035, + "grad_norm": 1.1303619146347046, + "learning_rate": 3.710592471803234e-05, + "loss": 3.228, + "step": 113415 + }, + { + "epoch": 7.7062100828916975, + "grad_norm": 0.8070182204246521, + "learning_rate": 3.706345970919962e-05, + "loss": 3.2668, + "step": 113420 + }, + { + "epoch": 7.706549802962359, + "grad_norm": 1.2447941303253174, + "learning_rate": 3.7020994700366896e-05, + "loss": 3.3759, + "step": 113425 + }, + { + "epoch": 7.706889523033021, + "grad_norm": 1.0630418062210083, + "learning_rate": 3.6978529691534176e-05, + "loss": 3.2132, + "step": 113430 + }, + { + "epoch": 7.707229243103683, + "grad_norm": 0.9184804558753967, + "learning_rate": 3.6936064682701456e-05, + "loss": 3.3173, + "step": 113435 + }, + { + "epoch": 7.707568963174344, + "grad_norm": 1.1968096494674683, + "learning_rate": 3.689359967386873e-05, + "loss": 3.1554, + "step": 113440 + }, + { + "epoch": 7.707908683245006, + "grad_norm": 1.0963420867919922, + "learning_rate": 3.6851134665036016e-05, + "loss": 3.3316, + "step": 113445 + }, + { + "epoch": 7.708248403315668, + "grad_norm": 0.7918722033500671, + "learning_rate": 3.680866965620329e-05, + "loss": 3.2082, + "step": 113450 + }, + { + "epoch": 7.708588123386329, + "grad_norm": 0.7739001512527466, + "learning_rate": 3.676620464737056e-05, + "loss": 3.2103, + "step": 113455 + }, + { + "epoch": 7.7089278434569914, + "grad_norm": 1.1090219020843506, + "learning_rate": 3.672373963853785e-05, + "loss": 3.3691, + "step": 113460 + }, + { + "epoch": 7.7092675635276535, + "grad_norm": 1.0036929845809937, + "learning_rate": 3.668127462970512e-05, + "loss": 3.3464, + "step": 113465 + }, + { + "epoch": 7.709607283598315, + "grad_norm": 0.7356132864952087, + "learning_rate": 3.66388096208724e-05, + "loss": 3.492, + "step": 113470 + }, + { + "epoch": 7.709947003668977, + "grad_norm": 0.9294981956481934, + "learning_rate": 3.659634461203968e-05, + "loss": 3.4, + "step": 113475 + }, + { + "epoch": 7.710286723739639, + "grad_norm": 0.9712327122688293, + "learning_rate": 3.6553879603206956e-05, + "loss": 3.379, + "step": 113480 + }, + { + "epoch": 7.7106264438103, + "grad_norm": 0.8874353170394897, + "learning_rate": 3.6511414594374236e-05, + "loss": 3.3524, + "step": 113485 + }, + { + "epoch": 7.710966163880962, + "grad_norm": 0.9258385896682739, + "learning_rate": 3.646894958554151e-05, + "loss": 3.3368, + "step": 113490 + }, + { + "epoch": 7.711305883951624, + "grad_norm": 0.7252011299133301, + "learning_rate": 3.6426484576708796e-05, + "loss": 3.4513, + "step": 113495 + }, + { + "epoch": 7.711645604022285, + "grad_norm": 1.7623268365859985, + "learning_rate": 3.638401956787607e-05, + "loss": 3.4273, + "step": 113500 + }, + { + "epoch": 7.7119853240929475, + "grad_norm": 0.8585666418075562, + "learning_rate": 3.634155455904335e-05, + "loss": 3.658, + "step": 113505 + }, + { + "epoch": 7.7123250441636095, + "grad_norm": 0.8745517134666443, + "learning_rate": 3.629908955021063e-05, + "loss": 3.4039, + "step": 113510 + }, + { + "epoch": 7.712664764234271, + "grad_norm": 0.9303756952285767, + "learning_rate": 3.62566245413779e-05, + "loss": 3.344, + "step": 113515 + }, + { + "epoch": 7.713004484304933, + "grad_norm": 1.126086711883545, + "learning_rate": 3.621415953254518e-05, + "loss": 3.205, + "step": 113520 + }, + { + "epoch": 7.713344204375595, + "grad_norm": 0.8994361162185669, + "learning_rate": 3.617169452371246e-05, + "loss": 3.3487, + "step": 113525 + }, + { + "epoch": 7.713683924446256, + "grad_norm": 0.9609485268592834, + "learning_rate": 3.6129229514879743e-05, + "loss": 3.3984, + "step": 113530 + }, + { + "epoch": 7.714023644516918, + "grad_norm": 1.0669132471084595, + "learning_rate": 3.608676450604702e-05, + "loss": 3.2388, + "step": 113535 + }, + { + "epoch": 7.71436336458758, + "grad_norm": 0.8950393199920654, + "learning_rate": 3.604429949721429e-05, + "loss": 3.1876, + "step": 113540 + }, + { + "epoch": 7.714703084658241, + "grad_norm": 1.0720142126083374, + "learning_rate": 3.600183448838158e-05, + "loss": 3.048, + "step": 113545 + }, + { + "epoch": 7.7150428047289035, + "grad_norm": 1.1508668661117554, + "learning_rate": 3.595936947954885e-05, + "loss": 3.4041, + "step": 113550 + }, + { + "epoch": 7.7153825247995655, + "grad_norm": 1.168928623199463, + "learning_rate": 3.591690447071613e-05, + "loss": 3.2452, + "step": 113555 + }, + { + "epoch": 7.715722244870227, + "grad_norm": 0.9314920902252197, + "learning_rate": 3.587443946188341e-05, + "loss": 3.2822, + "step": 113560 + }, + { + "epoch": 7.716061964940889, + "grad_norm": 0.7975327968597412, + "learning_rate": 3.5831974453050684e-05, + "loss": 3.1543, + "step": 113565 + }, + { + "epoch": 7.716401685011551, + "grad_norm": 0.967461347579956, + "learning_rate": 3.5789509444217964e-05, + "loss": 3.2457, + "step": 113570 + }, + { + "epoch": 7.716741405082212, + "grad_norm": 0.8836188912391663, + "learning_rate": 3.5747044435385244e-05, + "loss": 3.1456, + "step": 113575 + }, + { + "epoch": 7.717081125152874, + "grad_norm": 1.1287009716033936, + "learning_rate": 3.5704579426552524e-05, + "loss": 3.3148, + "step": 113580 + }, + { + "epoch": 7.717420845223536, + "grad_norm": 1.275459885597229, + "learning_rate": 3.56621144177198e-05, + "loss": 3.3183, + "step": 113585 + }, + { + "epoch": 7.717760565294197, + "grad_norm": 1.1226565837860107, + "learning_rate": 3.5619649408887084e-05, + "loss": 3.1987, + "step": 113590 + }, + { + "epoch": 7.7181002853648595, + "grad_norm": 1.1230636835098267, + "learning_rate": 3.557718440005436e-05, + "loss": 3.1719, + "step": 113595 + }, + { + "epoch": 7.7184400054355216, + "grad_norm": 0.9895016551017761, + "learning_rate": 3.553471939122163e-05, + "loss": 2.978, + "step": 113600 + }, + { + "epoch": 7.718779725506183, + "grad_norm": 0.8563648462295532, + "learning_rate": 3.549225438238891e-05, + "loss": 3.2933, + "step": 113605 + }, + { + "epoch": 7.719119445576845, + "grad_norm": 1.2055106163024902, + "learning_rate": 3.544978937355619e-05, + "loss": 3.3055, + "step": 113610 + }, + { + "epoch": 7.719459165647507, + "grad_norm": 0.7970181107521057, + "learning_rate": 3.540732436472347e-05, + "loss": 3.1742, + "step": 113615 + }, + { + "epoch": 7.719798885718168, + "grad_norm": 0.9165317416191101, + "learning_rate": 3.5364859355890744e-05, + "loss": 3.397, + "step": 113620 + }, + { + "epoch": 7.72013860578883, + "grad_norm": 0.9574131965637207, + "learning_rate": 3.5322394347058024e-05, + "loss": 3.2904, + "step": 113625 + }, + { + "epoch": 7.720478325859492, + "grad_norm": 0.7498705983161926, + "learning_rate": 3.5279929338225304e-05, + "loss": 3.4889, + "step": 113630 + }, + { + "epoch": 7.720818045930153, + "grad_norm": 0.9995712637901306, + "learning_rate": 3.523746432939258e-05, + "loss": 3.3568, + "step": 113635 + }, + { + "epoch": 7.7211577660008155, + "grad_norm": 0.8766238689422607, + "learning_rate": 3.5194999320559864e-05, + "loss": 3.3012, + "step": 113640 + }, + { + "epoch": 7.721497486071478, + "grad_norm": 2.872417688369751, + "learning_rate": 3.515253431172714e-05, + "loss": 3.3497, + "step": 113645 + }, + { + "epoch": 7.721837206142139, + "grad_norm": 1.1823569536209106, + "learning_rate": 3.511006930289441e-05, + "loss": 3.3815, + "step": 113650 + }, + { + "epoch": 7.722176926212801, + "grad_norm": 0.855802059173584, + "learning_rate": 3.50676042940617e-05, + "loss": 3.2864, + "step": 113655 + }, + { + "epoch": 7.722516646283463, + "grad_norm": 1.169682502746582, + "learning_rate": 3.502513928522897e-05, + "loss": 3.4953, + "step": 113660 + }, + { + "epoch": 7.722856366354124, + "grad_norm": 0.976779580116272, + "learning_rate": 3.498267427639625e-05, + "loss": 3.2917, + "step": 113665 + }, + { + "epoch": 7.723196086424786, + "grad_norm": 0.9961483478546143, + "learning_rate": 3.4940209267563525e-05, + "loss": 3.2535, + "step": 113670 + }, + { + "epoch": 7.723535806495448, + "grad_norm": 1.012447714805603, + "learning_rate": 3.489774425873081e-05, + "loss": 3.6198, + "step": 113675 + }, + { + "epoch": 7.723875526566109, + "grad_norm": 1.6397122144699097, + "learning_rate": 3.4855279249898085e-05, + "loss": 3.5713, + "step": 113680 + }, + { + "epoch": 7.7242152466367715, + "grad_norm": 0.8482804894447327, + "learning_rate": 3.481281424106536e-05, + "loss": 3.559, + "step": 113685 + }, + { + "epoch": 7.724554966707433, + "grad_norm": 1.0066263675689697, + "learning_rate": 3.4770349232232645e-05, + "loss": 3.3657, + "step": 113690 + }, + { + "epoch": 7.724894686778095, + "grad_norm": 0.9522129893302917, + "learning_rate": 3.472788422339992e-05, + "loss": 3.452, + "step": 113695 + }, + { + "epoch": 7.725234406848757, + "grad_norm": 1.050275206565857, + "learning_rate": 3.46854192145672e-05, + "loss": 3.3537, + "step": 113700 + }, + { + "epoch": 7.725574126919418, + "grad_norm": 1.0867496728897095, + "learning_rate": 3.464295420573448e-05, + "loss": 3.55, + "step": 113705 + }, + { + "epoch": 7.72591384699008, + "grad_norm": 1.2007735967636108, + "learning_rate": 3.460048919690175e-05, + "loss": 3.2697, + "step": 113710 + }, + { + "epoch": 7.726253567060742, + "grad_norm": 0.8445066809654236, + "learning_rate": 3.455802418806903e-05, + "loss": 3.4228, + "step": 113715 + }, + { + "epoch": 7.726593287131403, + "grad_norm": 0.8794454336166382, + "learning_rate": 3.4515559179236305e-05, + "loss": 3.378, + "step": 113720 + }, + { + "epoch": 7.726933007202065, + "grad_norm": 1.1100859642028809, + "learning_rate": 3.447309417040359e-05, + "loss": 3.2541, + "step": 113725 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 1.1142739057540894, + "learning_rate": 3.4430629161570865e-05, + "loss": 3.4308, + "step": 113730 + }, + { + "epoch": 7.727612447343389, + "grad_norm": 0.990254819393158, + "learning_rate": 3.438816415273814e-05, + "loss": 3.5016, + "step": 113735 + }, + { + "epoch": 7.727952167414051, + "grad_norm": 0.7733102440834045, + "learning_rate": 3.4345699143905425e-05, + "loss": 3.5198, + "step": 113740 + }, + { + "epoch": 7.728291887484713, + "grad_norm": 0.8994688391685486, + "learning_rate": 3.43032341350727e-05, + "loss": 3.4877, + "step": 113745 + }, + { + "epoch": 7.728631607555374, + "grad_norm": 1.0677324533462524, + "learning_rate": 3.426076912623998e-05, + "loss": 3.2871, + "step": 113750 + }, + { + "epoch": 7.728971327626036, + "grad_norm": 1.048244595527649, + "learning_rate": 3.421830411740726e-05, + "loss": 3.3121, + "step": 113755 + }, + { + "epoch": 7.729311047696698, + "grad_norm": 0.9953640699386597, + "learning_rate": 3.417583910857454e-05, + "loss": 3.3373, + "step": 113760 + }, + { + "epoch": 7.729650767767359, + "grad_norm": 0.8759520649909973, + "learning_rate": 3.413337409974181e-05, + "loss": 3.5366, + "step": 113765 + }, + { + "epoch": 7.7299904878380215, + "grad_norm": 1.037048578262329, + "learning_rate": 3.409090909090909e-05, + "loss": 3.312, + "step": 113770 + }, + { + "epoch": 7.7303302079086835, + "grad_norm": 1.0922091007232666, + "learning_rate": 3.404844408207637e-05, + "loss": 3.3098, + "step": 113775 + }, + { + "epoch": 7.730669927979345, + "grad_norm": 0.7941230535507202, + "learning_rate": 3.4005979073243646e-05, + "loss": 3.3747, + "step": 113780 + }, + { + "epoch": 7.731009648050007, + "grad_norm": 1.7117406129837036, + "learning_rate": 3.3963514064410926e-05, + "loss": 3.3047, + "step": 113785 + }, + { + "epoch": 7.731349368120669, + "grad_norm": 0.8689830899238586, + "learning_rate": 3.3921049055578206e-05, + "loss": 3.3152, + "step": 113790 + }, + { + "epoch": 7.73168908819133, + "grad_norm": 0.9037043452262878, + "learning_rate": 3.387858404674548e-05, + "loss": 3.2833, + "step": 113795 + }, + { + "epoch": 7.732028808261992, + "grad_norm": 0.9110051393508911, + "learning_rate": 3.383611903791276e-05, + "loss": 3.4605, + "step": 113800 + }, + { + "epoch": 7.732368528332654, + "grad_norm": 0.8880904912948608, + "learning_rate": 3.379365402908004e-05, + "loss": 3.4588, + "step": 113805 + }, + { + "epoch": 7.732708248403315, + "grad_norm": 1.0021089315414429, + "learning_rate": 3.375118902024732e-05, + "loss": 2.999, + "step": 113810 + }, + { + "epoch": 7.7330479684739775, + "grad_norm": 0.8469573855400085, + "learning_rate": 3.370872401141459e-05, + "loss": 3.439, + "step": 113815 + }, + { + "epoch": 7.7333876885446395, + "grad_norm": 0.9436444044113159, + "learning_rate": 3.366625900258187e-05, + "loss": 3.3781, + "step": 113820 + }, + { + "epoch": 7.733727408615301, + "grad_norm": 0.8222303986549377, + "learning_rate": 3.362379399374915e-05, + "loss": 3.4722, + "step": 113825 + }, + { + "epoch": 7.734067128685963, + "grad_norm": 0.8674531579017639, + "learning_rate": 3.3581328984916426e-05, + "loss": 3.2906, + "step": 113830 + }, + { + "epoch": 7.734406848756625, + "grad_norm": 1.1442385911941528, + "learning_rate": 3.353886397608371e-05, + "loss": 3.7947, + "step": 113835 + }, + { + "epoch": 7.734746568827286, + "grad_norm": 0.8223207592964172, + "learning_rate": 3.3496398967250986e-05, + "loss": 3.5522, + "step": 113840 + }, + { + "epoch": 7.735086288897948, + "grad_norm": 1.0152440071105957, + "learning_rate": 3.3453933958418266e-05, + "loss": 3.3792, + "step": 113845 + }, + { + "epoch": 7.73542600896861, + "grad_norm": 0.994615375995636, + "learning_rate": 3.341146894958554e-05, + "loss": 3.3044, + "step": 113850 + }, + { + "epoch": 7.735765729039271, + "grad_norm": 1.0107558965682983, + "learning_rate": 3.336900394075282e-05, + "loss": 3.379, + "step": 113855 + }, + { + "epoch": 7.7361054491099335, + "grad_norm": 0.8716590404510498, + "learning_rate": 3.33265389319201e-05, + "loss": 3.0792, + "step": 113860 + }, + { + "epoch": 7.7364451691805955, + "grad_norm": 1.06198251247406, + "learning_rate": 3.328407392308737e-05, + "loss": 3.6234, + "step": 113865 + }, + { + "epoch": 7.736784889251257, + "grad_norm": 0.9045277237892151, + "learning_rate": 3.324160891425466e-05, + "loss": 3.4071, + "step": 113870 + }, + { + "epoch": 7.737124609321919, + "grad_norm": 0.7207725644111633, + "learning_rate": 3.319914390542193e-05, + "loss": 3.4502, + "step": 113875 + }, + { + "epoch": 7.737464329392581, + "grad_norm": 0.8476120233535767, + "learning_rate": 3.3156678896589206e-05, + "loss": 3.5472, + "step": 113880 + }, + { + "epoch": 7.737804049463242, + "grad_norm": 1.243977665901184, + "learning_rate": 3.311421388775649e-05, + "loss": 3.2009, + "step": 113885 + }, + { + "epoch": 7.738143769533904, + "grad_norm": 1.0567848682403564, + "learning_rate": 3.307174887892377e-05, + "loss": 3.3471, + "step": 113890 + }, + { + "epoch": 7.738483489604566, + "grad_norm": 0.9303759336471558, + "learning_rate": 3.302928387009105e-05, + "loss": 3.434, + "step": 113895 + }, + { + "epoch": 7.738823209675227, + "grad_norm": 1.5821069478988647, + "learning_rate": 3.298681886125832e-05, + "loss": 3.3902, + "step": 113900 + }, + { + "epoch": 7.7391629297458895, + "grad_norm": 0.8893117308616638, + "learning_rate": 3.29443538524256e-05, + "loss": 3.3929, + "step": 113905 + }, + { + "epoch": 7.739502649816551, + "grad_norm": 0.82630455493927, + "learning_rate": 3.290188884359288e-05, + "loss": 3.3655, + "step": 113910 + }, + { + "epoch": 7.739842369887213, + "grad_norm": 1.013189435005188, + "learning_rate": 3.2859423834760153e-05, + "loss": 3.4151, + "step": 113915 + }, + { + "epoch": 7.740182089957875, + "grad_norm": 0.8102506399154663, + "learning_rate": 3.281695882592744e-05, + "loss": 3.3331, + "step": 113920 + }, + { + "epoch": 7.740521810028536, + "grad_norm": 1.1748411655426025, + "learning_rate": 3.2774493817094714e-05, + "loss": 3.5756, + "step": 113925 + }, + { + "epoch": 7.740861530099198, + "grad_norm": 0.8616225719451904, + "learning_rate": 3.2732028808261994e-05, + "loss": 3.6197, + "step": 113930 + }, + { + "epoch": 7.74120125016986, + "grad_norm": 0.8957856893539429, + "learning_rate": 3.2689563799429274e-05, + "loss": 3.2921, + "step": 113935 + }, + { + "epoch": 7.741540970240521, + "grad_norm": 0.9503731727600098, + "learning_rate": 3.264709879059655e-05, + "loss": 3.4607, + "step": 113940 + }, + { + "epoch": 7.741880690311183, + "grad_norm": 0.8069412112236023, + "learning_rate": 3.260463378176383e-05, + "loss": 3.3933, + "step": 113945 + }, + { + "epoch": 7.7422204103818455, + "grad_norm": 0.9264793395996094, + "learning_rate": 3.256216877293111e-05, + "loss": 3.2986, + "step": 113950 + }, + { + "epoch": 7.742560130452507, + "grad_norm": 0.8725402355194092, + "learning_rate": 3.251970376409839e-05, + "loss": 3.3548, + "step": 113955 + }, + { + "epoch": 7.742899850523169, + "grad_norm": 0.9168331027030945, + "learning_rate": 3.247723875526566e-05, + "loss": 3.509, + "step": 113960 + }, + { + "epoch": 7.743239570593831, + "grad_norm": 1.0535017251968384, + "learning_rate": 3.2434773746432934e-05, + "loss": 3.3669, + "step": 113965 + }, + { + "epoch": 7.743579290664492, + "grad_norm": 0.9821929335594177, + "learning_rate": 3.239230873760022e-05, + "loss": 3.4161, + "step": 113970 + }, + { + "epoch": 7.743919010735154, + "grad_norm": 0.8167740106582642, + "learning_rate": 3.2349843728767494e-05, + "loss": 3.4341, + "step": 113975 + }, + { + "epoch": 7.744258730805816, + "grad_norm": 0.8642147183418274, + "learning_rate": 3.2307378719934774e-05, + "loss": 3.4938, + "step": 113980 + }, + { + "epoch": 7.744598450876477, + "grad_norm": 1.063920259475708, + "learning_rate": 3.2264913711102054e-05, + "loss": 3.2683, + "step": 113985 + }, + { + "epoch": 7.744938170947139, + "grad_norm": 0.7174834609031677, + "learning_rate": 3.222244870226933e-05, + "loss": 3.1929, + "step": 113990 + }, + { + "epoch": 7.7452778910178015, + "grad_norm": 0.8174612522125244, + "learning_rate": 3.217998369343661e-05, + "loss": 3.3392, + "step": 113995 + }, + { + "epoch": 7.745617611088463, + "grad_norm": 0.9200714826583862, + "learning_rate": 3.213751868460389e-05, + "loss": 3.665, + "step": 114000 + }, + { + "epoch": 7.745957331159125, + "grad_norm": 1.0506377220153809, + "learning_rate": 3.209505367577117e-05, + "loss": 3.2048, + "step": 114005 + }, + { + "epoch": 7.746297051229787, + "grad_norm": 0.8155113458633423, + "learning_rate": 3.205258866693844e-05, + "loss": 3.3459, + "step": 114010 + }, + { + "epoch": 7.746636771300448, + "grad_norm": 1.02433443069458, + "learning_rate": 3.201012365810573e-05, + "loss": 3.4451, + "step": 114015 + }, + { + "epoch": 7.74697649137111, + "grad_norm": 0.8931393027305603, + "learning_rate": 3.1967658649273e-05, + "loss": 3.2128, + "step": 114020 + }, + { + "epoch": 7.747316211441772, + "grad_norm": 0.9836956858634949, + "learning_rate": 3.1925193640440275e-05, + "loss": 3.4311, + "step": 114025 + }, + { + "epoch": 7.747655931512433, + "grad_norm": 1.1126338243484497, + "learning_rate": 3.1882728631607555e-05, + "loss": 3.4099, + "step": 114030 + }, + { + "epoch": 7.7479956515830954, + "grad_norm": 0.8375820517539978, + "learning_rate": 3.1840263622774835e-05, + "loss": 3.3243, + "step": 114035 + }, + { + "epoch": 7.7483353716537575, + "grad_norm": 0.8992963433265686, + "learning_rate": 3.1797798613942115e-05, + "loss": 3.5431, + "step": 114040 + }, + { + "epoch": 7.748675091724419, + "grad_norm": 0.6159864664077759, + "learning_rate": 3.175533360510939e-05, + "loss": 3.3466, + "step": 114045 + }, + { + "epoch": 7.749014811795081, + "grad_norm": 1.03946053981781, + "learning_rate": 3.171286859627667e-05, + "loss": 3.3341, + "step": 114050 + }, + { + "epoch": 7.749354531865743, + "grad_norm": 1.0002676248550415, + "learning_rate": 3.167040358744395e-05, + "loss": 3.3847, + "step": 114055 + }, + { + "epoch": 7.749694251936404, + "grad_norm": 1.4619200229644775, + "learning_rate": 3.162793857861122e-05, + "loss": 3.2545, + "step": 114060 + }, + { + "epoch": 7.750033972007066, + "grad_norm": 1.0052818059921265, + "learning_rate": 3.158547356977851e-05, + "loss": 3.4253, + "step": 114065 + }, + { + "epoch": 7.750373692077728, + "grad_norm": 0.9495962262153625, + "learning_rate": 3.154300856094578e-05, + "loss": 3.3695, + "step": 114070 + }, + { + "epoch": 7.750713412148389, + "grad_norm": 1.3361760377883911, + "learning_rate": 3.1500543552113055e-05, + "loss": 3.2797, + "step": 114075 + }, + { + "epoch": 7.7510531322190515, + "grad_norm": 0.9635332822799683, + "learning_rate": 3.145807854328034e-05, + "loss": 3.3314, + "step": 114080 + }, + { + "epoch": 7.7513928522897135, + "grad_norm": 1.2480236291885376, + "learning_rate": 3.1415613534447615e-05, + "loss": 3.2972, + "step": 114085 + }, + { + "epoch": 7.751732572360375, + "grad_norm": 0.8277528882026672, + "learning_rate": 3.1373148525614895e-05, + "loss": 3.4079, + "step": 114090 + }, + { + "epoch": 7.752072292431037, + "grad_norm": 0.9295924305915833, + "learning_rate": 3.133068351678217e-05, + "loss": 3.5515, + "step": 114095 + }, + { + "epoch": 7.752412012501699, + "grad_norm": 0.6973501443862915, + "learning_rate": 3.1288218507949455e-05, + "loss": 3.3319, + "step": 114100 + }, + { + "epoch": 7.75275173257236, + "grad_norm": 1.9449423551559448, + "learning_rate": 3.124575349911673e-05, + "loss": 3.2709, + "step": 114105 + }, + { + "epoch": 7.753091452643022, + "grad_norm": 0.8060148358345032, + "learning_rate": 3.120328849028401e-05, + "loss": 3.3351, + "step": 114110 + }, + { + "epoch": 7.753431172713684, + "grad_norm": 1.1277720928192139, + "learning_rate": 3.116082348145129e-05, + "loss": 3.107, + "step": 114115 + }, + { + "epoch": 7.753770892784345, + "grad_norm": 1.1118638515472412, + "learning_rate": 3.111835847261856e-05, + "loss": 3.7592, + "step": 114120 + }, + { + "epoch": 7.7541106128550075, + "grad_norm": 0.8273407220840454, + "learning_rate": 3.107589346378584e-05, + "loss": 3.4057, + "step": 114125 + }, + { + "epoch": 7.7544503329256695, + "grad_norm": 1.3753070831298828, + "learning_rate": 3.103342845495312e-05, + "loss": 3.3704, + "step": 114130 + }, + { + "epoch": 7.754790052996331, + "grad_norm": 0.94412761926651, + "learning_rate": 3.0990963446120396e-05, + "loss": 3.2776, + "step": 114135 + }, + { + "epoch": 7.755129773066993, + "grad_norm": 1.3363081216812134, + "learning_rate": 3.0948498437287676e-05, + "loss": 3.4289, + "step": 114140 + }, + { + "epoch": 7.755469493137655, + "grad_norm": 0.7877116799354553, + "learning_rate": 3.090603342845495e-05, + "loss": 3.3554, + "step": 114145 + }, + { + "epoch": 7.755809213208316, + "grad_norm": 0.8770577907562256, + "learning_rate": 3.086356841962223e-05, + "loss": 3.3638, + "step": 114150 + }, + { + "epoch": 7.756148933278978, + "grad_norm": 0.7986202239990234, + "learning_rate": 3.082110341078951e-05, + "loss": 3.365, + "step": 114155 + }, + { + "epoch": 7.75648865334964, + "grad_norm": 0.9816272258758545, + "learning_rate": 3.077863840195679e-05, + "loss": 3.5379, + "step": 114160 + }, + { + "epoch": 7.756828373420301, + "grad_norm": 0.8911651372909546, + "learning_rate": 3.073617339312407e-05, + "loss": 3.0491, + "step": 114165 + }, + { + "epoch": 7.7571680934909635, + "grad_norm": 0.9941518306732178, + "learning_rate": 3.069370838429135e-05, + "loss": 3.3774, + "step": 114170 + }, + { + "epoch": 7.7575078135616256, + "grad_norm": 0.9578038454055786, + "learning_rate": 3.065124337545862e-05, + "loss": 3.6632, + "step": 114175 + }, + { + "epoch": 7.757847533632287, + "grad_norm": 0.9765973091125488, + "learning_rate": 3.06087783666259e-05, + "loss": 3.4947, + "step": 114180 + }, + { + "epoch": 7.758187253702949, + "grad_norm": 0.9921910762786865, + "learning_rate": 3.0566313357793176e-05, + "loss": 3.6425, + "step": 114185 + }, + { + "epoch": 7.758526973773611, + "grad_norm": 0.8782354593276978, + "learning_rate": 3.0523848348960456e-05, + "loss": 3.4128, + "step": 114190 + }, + { + "epoch": 7.758866693844272, + "grad_norm": 0.8422530293464661, + "learning_rate": 3.0481383340127736e-05, + "loss": 3.3771, + "step": 114195 + }, + { + "epoch": 7.759206413914934, + "grad_norm": 0.8212481141090393, + "learning_rate": 3.0438918331295016e-05, + "loss": 3.4073, + "step": 114200 + }, + { + "epoch": 7.759546133985596, + "grad_norm": 1.3212831020355225, + "learning_rate": 3.039645332246229e-05, + "loss": 3.4114, + "step": 114205 + }, + { + "epoch": 7.759885854056257, + "grad_norm": 1.0676252841949463, + "learning_rate": 3.035398831362957e-05, + "loss": 3.4085, + "step": 114210 + }, + { + "epoch": 7.7602255741269195, + "grad_norm": 0.8105381727218628, + "learning_rate": 3.031152330479685e-05, + "loss": 3.3619, + "step": 114215 + }, + { + "epoch": 7.760565294197582, + "grad_norm": 1.041892170906067, + "learning_rate": 3.0269058295964126e-05, + "loss": 3.2182, + "step": 114220 + }, + { + "epoch": 7.760905014268243, + "grad_norm": 1.0578950643539429, + "learning_rate": 3.0226593287131406e-05, + "loss": 3.3967, + "step": 114225 + }, + { + "epoch": 7.761244734338905, + "grad_norm": 0.785068690776825, + "learning_rate": 3.018412827829868e-05, + "loss": 3.687, + "step": 114230 + }, + { + "epoch": 7.761584454409567, + "grad_norm": 2.141361713409424, + "learning_rate": 3.014166326946596e-05, + "loss": 3.1739, + "step": 114235 + }, + { + "epoch": 7.761924174480228, + "grad_norm": 1.8218547105789185, + "learning_rate": 3.009919826063324e-05, + "loss": 3.3121, + "step": 114240 + }, + { + "epoch": 7.76226389455089, + "grad_norm": 0.9618586897850037, + "learning_rate": 3.0056733251800517e-05, + "loss": 3.2205, + "step": 114245 + }, + { + "epoch": 7.762603614621552, + "grad_norm": 0.765064001083374, + "learning_rate": 3.0014268242967797e-05, + "loss": 3.2082, + "step": 114250 + }, + { + "epoch": 7.762943334692213, + "grad_norm": 0.9602835178375244, + "learning_rate": 2.9971803234135073e-05, + "loss": 3.5096, + "step": 114255 + }, + { + "epoch": 7.7632830547628755, + "grad_norm": 0.7666296362876892, + "learning_rate": 2.992933822530235e-05, + "loss": 3.6504, + "step": 114260 + }, + { + "epoch": 7.763622774833538, + "grad_norm": 0.9383266568183899, + "learning_rate": 2.988687321646963e-05, + "loss": 3.4665, + "step": 114265 + }, + { + "epoch": 7.763962494904199, + "grad_norm": 0.9209392070770264, + "learning_rate": 2.9844408207636907e-05, + "loss": 3.1655, + "step": 114270 + }, + { + "epoch": 7.764302214974861, + "grad_norm": 0.9681779146194458, + "learning_rate": 2.9801943198804187e-05, + "loss": 3.2924, + "step": 114275 + }, + { + "epoch": 7.764641935045523, + "grad_norm": 0.9818360805511475, + "learning_rate": 2.9759478189971464e-05, + "loss": 3.3542, + "step": 114280 + }, + { + "epoch": 7.764981655116184, + "grad_norm": 0.8852611780166626, + "learning_rate": 2.9717013181138744e-05, + "loss": 3.4773, + "step": 114285 + }, + { + "epoch": 7.765321375186846, + "grad_norm": 1.142445683479309, + "learning_rate": 2.967454817230602e-05, + "loss": 3.5811, + "step": 114290 + }, + { + "epoch": 7.765661095257508, + "grad_norm": 0.8374162316322327, + "learning_rate": 2.9632083163473297e-05, + "loss": 3.3696, + "step": 114295 + }, + { + "epoch": 7.766000815328169, + "grad_norm": 1.069589614868164, + "learning_rate": 2.9589618154640577e-05, + "loss": 3.5116, + "step": 114300 + }, + { + "epoch": 7.7663405353988315, + "grad_norm": 0.9420245289802551, + "learning_rate": 2.9547153145807857e-05, + "loss": 3.4398, + "step": 114305 + }, + { + "epoch": 7.766680255469494, + "grad_norm": 0.8117449879646301, + "learning_rate": 2.9504688136975134e-05, + "loss": 3.3571, + "step": 114310 + }, + { + "epoch": 7.767019975540155, + "grad_norm": 0.9687461256980896, + "learning_rate": 2.946222312814241e-05, + "loss": 3.2384, + "step": 114315 + }, + { + "epoch": 7.767359695610817, + "grad_norm": 0.8947687745094299, + "learning_rate": 2.9419758119309687e-05, + "loss": 3.574, + "step": 114320 + }, + { + "epoch": 7.767699415681479, + "grad_norm": 0.8817046880722046, + "learning_rate": 2.9377293110476967e-05, + "loss": 3.5716, + "step": 114325 + }, + { + "epoch": 7.76803913575214, + "grad_norm": 1.0266635417938232, + "learning_rate": 2.9334828101644247e-05, + "loss": 3.242, + "step": 114330 + }, + { + "epoch": 7.768378855822802, + "grad_norm": 0.9241191744804382, + "learning_rate": 2.9292363092811524e-05, + "loss": 3.1904, + "step": 114335 + }, + { + "epoch": 7.768718575893464, + "grad_norm": 0.7929470539093018, + "learning_rate": 2.9249898083978804e-05, + "loss": 3.3902, + "step": 114340 + }, + { + "epoch": 7.7690582959641254, + "grad_norm": 0.7349050045013428, + "learning_rate": 2.9207433075146077e-05, + "loss": 3.3447, + "step": 114345 + }, + { + "epoch": 7.7693980160347875, + "grad_norm": 0.931790828704834, + "learning_rate": 2.9164968066313358e-05, + "loss": 3.4449, + "step": 114350 + }, + { + "epoch": 7.76973773610545, + "grad_norm": 0.8067919611930847, + "learning_rate": 2.9122503057480638e-05, + "loss": 3.274, + "step": 114355 + }, + { + "epoch": 7.770077456176111, + "grad_norm": 0.9432344436645508, + "learning_rate": 2.9080038048647914e-05, + "loss": 3.354, + "step": 114360 + }, + { + "epoch": 7.770417176246773, + "grad_norm": 0.9624553322792053, + "learning_rate": 2.9037573039815194e-05, + "loss": 3.4279, + "step": 114365 + }, + { + "epoch": 7.770756896317434, + "grad_norm": 1.0710785388946533, + "learning_rate": 2.899510803098247e-05, + "loss": 3.2949, + "step": 114370 + }, + { + "epoch": 7.771096616388096, + "grad_norm": 0.8870888948440552, + "learning_rate": 2.8952643022149748e-05, + "loss": 3.3774, + "step": 114375 + }, + { + "epoch": 7.771436336458758, + "grad_norm": 0.9693521857261658, + "learning_rate": 2.8910178013317028e-05, + "loss": 3.3384, + "step": 114380 + }, + { + "epoch": 7.771776056529419, + "grad_norm": 0.9772228002548218, + "learning_rate": 2.8867713004484305e-05, + "loss": 3.4412, + "step": 114385 + }, + { + "epoch": 7.7721157766000815, + "grad_norm": 0.8488404750823975, + "learning_rate": 2.8825247995651585e-05, + "loss": 3.2342, + "step": 114390 + }, + { + "epoch": 7.7724554966707435, + "grad_norm": 0.8604421615600586, + "learning_rate": 2.8782782986818865e-05, + "loss": 3.4073, + "step": 114395 + }, + { + "epoch": 7.772795216741405, + "grad_norm": 0.9154587984085083, + "learning_rate": 2.8740317977986138e-05, + "loss": 3.2079, + "step": 114400 + }, + { + "epoch": 7.773134936812067, + "grad_norm": 0.9787241220474243, + "learning_rate": 2.8697852969153418e-05, + "loss": 3.3301, + "step": 114405 + }, + { + "epoch": 7.773474656882729, + "grad_norm": 0.7979158163070679, + "learning_rate": 2.8655387960320695e-05, + "loss": 3.1491, + "step": 114410 + }, + { + "epoch": 7.77381437695339, + "grad_norm": 1.018354058265686, + "learning_rate": 2.8612922951487975e-05, + "loss": 3.4901, + "step": 114415 + }, + { + "epoch": 7.774154097024052, + "grad_norm": 0.7715879082679749, + "learning_rate": 2.8570457942655255e-05, + "loss": 3.5022, + "step": 114420 + }, + { + "epoch": 7.774493817094714, + "grad_norm": 0.9157072305679321, + "learning_rate": 2.852799293382253e-05, + "loss": 3.209, + "step": 114425 + }, + { + "epoch": 7.774833537165375, + "grad_norm": 1.1103029251098633, + "learning_rate": 2.848552792498981e-05, + "loss": 3.2784, + "step": 114430 + }, + { + "epoch": 7.7751732572360375, + "grad_norm": 0.8523648977279663, + "learning_rate": 2.8443062916157085e-05, + "loss": 3.4879, + "step": 114435 + }, + { + "epoch": 7.7755129773066995, + "grad_norm": 0.8694077134132385, + "learning_rate": 2.8400597907324365e-05, + "loss": 3.6583, + "step": 114440 + }, + { + "epoch": 7.775852697377361, + "grad_norm": 0.9091561436653137, + "learning_rate": 2.8358132898491645e-05, + "loss": 3.2064, + "step": 114445 + }, + { + "epoch": 7.776192417448023, + "grad_norm": 1.0071734189987183, + "learning_rate": 2.8315667889658922e-05, + "loss": 3.5534, + "step": 114450 + }, + { + "epoch": 7.776532137518685, + "grad_norm": 0.8647490739822388, + "learning_rate": 2.8273202880826202e-05, + "loss": 3.1632, + "step": 114455 + }, + { + "epoch": 7.776871857589346, + "grad_norm": 0.8949958682060242, + "learning_rate": 2.8230737871993475e-05, + "loss": 3.2705, + "step": 114460 + }, + { + "epoch": 7.777211577660008, + "grad_norm": 0.9421919584274292, + "learning_rate": 2.8188272863160755e-05, + "loss": 3.3164, + "step": 114465 + }, + { + "epoch": 7.77755129773067, + "grad_norm": 1.2471528053283691, + "learning_rate": 2.8145807854328035e-05, + "loss": 3.4329, + "step": 114470 + }, + { + "epoch": 7.777891017801331, + "grad_norm": 1.238612413406372, + "learning_rate": 2.8103342845495312e-05, + "loss": 3.2139, + "step": 114475 + }, + { + "epoch": 7.7782307378719935, + "grad_norm": 1.258226752281189, + "learning_rate": 2.8060877836662592e-05, + "loss": 3.1993, + "step": 114480 + }, + { + "epoch": 7.778570457942656, + "grad_norm": 0.7969419956207275, + "learning_rate": 2.801841282782987e-05, + "loss": 3.5687, + "step": 114485 + }, + { + "epoch": 7.778910178013317, + "grad_norm": 1.2255609035491943, + "learning_rate": 2.7975947818997146e-05, + "loss": 3.7024, + "step": 114490 + }, + { + "epoch": 7.779249898083979, + "grad_norm": 0.7739019989967346, + "learning_rate": 2.7933482810164426e-05, + "loss": 3.6452, + "step": 114495 + }, + { + "epoch": 7.779589618154641, + "grad_norm": 0.8732863664627075, + "learning_rate": 2.7891017801331702e-05, + "loss": 2.9937, + "step": 114500 + }, + { + "epoch": 7.779929338225302, + "grad_norm": 0.9874916076660156, + "learning_rate": 2.7848552792498982e-05, + "loss": 3.1004, + "step": 114505 + }, + { + "epoch": 7.780269058295964, + "grad_norm": 0.8769840002059937, + "learning_rate": 2.7806087783666262e-05, + "loss": 3.6054, + "step": 114510 + }, + { + "epoch": 7.780608778366626, + "grad_norm": 0.7716265916824341, + "learning_rate": 2.7763622774833536e-05, + "loss": 3.3922, + "step": 114515 + }, + { + "epoch": 7.780948498437287, + "grad_norm": 1.018960952758789, + "learning_rate": 2.7721157766000816e-05, + "loss": 3.32, + "step": 114520 + }, + { + "epoch": 7.7812882185079495, + "grad_norm": 5.744914531707764, + "learning_rate": 2.7678692757168093e-05, + "loss": 3.4612, + "step": 114525 + }, + { + "epoch": 7.781627938578612, + "grad_norm": 0.8144775032997131, + "learning_rate": 2.7636227748335373e-05, + "loss": 3.4021, + "step": 114530 + }, + { + "epoch": 7.781967658649273, + "grad_norm": 1.2974331378936768, + "learning_rate": 2.7593762739502653e-05, + "loss": 3.4037, + "step": 114535 + }, + { + "epoch": 7.782307378719935, + "grad_norm": 0.889894962310791, + "learning_rate": 2.755129773066993e-05, + "loss": 3.1712, + "step": 114540 + }, + { + "epoch": 7.782647098790597, + "grad_norm": 0.9259335398674011, + "learning_rate": 2.7508832721837206e-05, + "loss": 3.4631, + "step": 114545 + }, + { + "epoch": 7.782986818861258, + "grad_norm": 1.0196936130523682, + "learning_rate": 2.7466367713004483e-05, + "loss": 3.5457, + "step": 114550 + }, + { + "epoch": 7.78332653893192, + "grad_norm": 0.8183448910713196, + "learning_rate": 2.7423902704171763e-05, + "loss": 3.2443, + "step": 114555 + }, + { + "epoch": 7.783666259002582, + "grad_norm": 0.7521885633468628, + "learning_rate": 2.7381437695339043e-05, + "loss": 3.3958, + "step": 114560 + }, + { + "epoch": 7.784005979073243, + "grad_norm": 0.974395751953125, + "learning_rate": 2.733897268650632e-05, + "loss": 3.5684, + "step": 114565 + }, + { + "epoch": 7.7843456991439055, + "grad_norm": 1.1015626192092896, + "learning_rate": 2.7296507677673596e-05, + "loss": 3.1054, + "step": 114570 + }, + { + "epoch": 7.784685419214568, + "grad_norm": 0.8330518007278442, + "learning_rate": 2.7254042668840876e-05, + "loss": 3.344, + "step": 114575 + }, + { + "epoch": 7.785025139285229, + "grad_norm": 0.9915731549263, + "learning_rate": 2.7211577660008153e-05, + "loss": 3.4699, + "step": 114580 + }, + { + "epoch": 7.785364859355891, + "grad_norm": 0.8348602652549744, + "learning_rate": 2.7169112651175433e-05, + "loss": 3.2023, + "step": 114585 + }, + { + "epoch": 7.785704579426552, + "grad_norm": 1.0071414709091187, + "learning_rate": 2.712664764234271e-05, + "loss": 3.3019, + "step": 114590 + }, + { + "epoch": 7.786044299497214, + "grad_norm": 1.7085261344909668, + "learning_rate": 2.708418263350999e-05, + "loss": 3.3611, + "step": 114595 + }, + { + "epoch": 7.786384019567876, + "grad_norm": 1.0824153423309326, + "learning_rate": 2.7041717624677267e-05, + "loss": 3.4091, + "step": 114600 + }, + { + "epoch": 7.786723739638537, + "grad_norm": 0.8706474304199219, + "learning_rate": 2.6999252615844543e-05, + "loss": 3.4865, + "step": 114605 + }, + { + "epoch": 7.787063459709199, + "grad_norm": 1.0800302028656006, + "learning_rate": 2.6956787607011823e-05, + "loss": 3.3801, + "step": 114610 + }, + { + "epoch": 7.7874031797798615, + "grad_norm": 0.9151641130447388, + "learning_rate": 2.69143225981791e-05, + "loss": 3.4142, + "step": 114615 + }, + { + "epoch": 7.787742899850523, + "grad_norm": 0.7080878019332886, + "learning_rate": 2.687185758934638e-05, + "loss": 3.3995, + "step": 114620 + }, + { + "epoch": 7.788082619921185, + "grad_norm": 0.983941376209259, + "learning_rate": 2.6829392580513657e-05, + "loss": 3.335, + "step": 114625 + }, + { + "epoch": 7.788422339991847, + "grad_norm": 0.9120297431945801, + "learning_rate": 2.6786927571680933e-05, + "loss": 3.2833, + "step": 114630 + }, + { + "epoch": 7.788762060062508, + "grad_norm": 0.7770398855209351, + "learning_rate": 2.6744462562848214e-05, + "loss": 3.4352, + "step": 114635 + }, + { + "epoch": 7.78910178013317, + "grad_norm": 0.7387987971305847, + "learning_rate": 2.670199755401549e-05, + "loss": 3.5066, + "step": 114640 + }, + { + "epoch": 7.789441500203832, + "grad_norm": 0.7633771896362305, + "learning_rate": 2.665953254518277e-05, + "loss": 3.1292, + "step": 114645 + }, + { + "epoch": 7.789781220274493, + "grad_norm": 0.9765605330467224, + "learning_rate": 2.661706753635005e-05, + "loss": 3.3882, + "step": 114650 + }, + { + "epoch": 7.7901209403451555, + "grad_norm": 1.0390033721923828, + "learning_rate": 2.6574602527517324e-05, + "loss": 3.2869, + "step": 114655 + }, + { + "epoch": 7.7904606604158175, + "grad_norm": 1.1453986167907715, + "learning_rate": 2.6532137518684604e-05, + "loss": 3.3299, + "step": 114660 + }, + { + "epoch": 7.790800380486479, + "grad_norm": 1.3327174186706543, + "learning_rate": 2.6489672509851884e-05, + "loss": 3.468, + "step": 114665 + }, + { + "epoch": 7.791140100557141, + "grad_norm": 0.8416317105293274, + "learning_rate": 2.644720750101916e-05, + "loss": 3.2507, + "step": 114670 + }, + { + "epoch": 7.791479820627803, + "grad_norm": 0.9647867679595947, + "learning_rate": 2.640474249218644e-05, + "loss": 3.4216, + "step": 114675 + }, + { + "epoch": 7.791819540698464, + "grad_norm": 0.8482579588890076, + "learning_rate": 2.6362277483353717e-05, + "loss": 3.3156, + "step": 114680 + }, + { + "epoch": 7.792159260769126, + "grad_norm": 0.9288334846496582, + "learning_rate": 2.6319812474520994e-05, + "loss": 3.1549, + "step": 114685 + }, + { + "epoch": 7.792498980839788, + "grad_norm": 0.9336647391319275, + "learning_rate": 2.6277347465688274e-05, + "loss": 3.2736, + "step": 114690 + }, + { + "epoch": 7.792838700910449, + "grad_norm": 0.8427070379257202, + "learning_rate": 2.623488245685555e-05, + "loss": 3.137, + "step": 114695 + }, + { + "epoch": 7.7931784209811115, + "grad_norm": 0.974307119846344, + "learning_rate": 2.619241744802283e-05, + "loss": 3.4824, + "step": 114700 + }, + { + "epoch": 7.7935181410517735, + "grad_norm": 0.8001380562782288, + "learning_rate": 2.6149952439190108e-05, + "loss": 3.1584, + "step": 114705 + }, + { + "epoch": 7.793857861122435, + "grad_norm": 0.8384062647819519, + "learning_rate": 2.6107487430357384e-05, + "loss": 2.9744, + "step": 114710 + }, + { + "epoch": 7.794197581193097, + "grad_norm": 1.0486671924591064, + "learning_rate": 2.6065022421524664e-05, + "loss": 3.2343, + "step": 114715 + }, + { + "epoch": 7.794537301263759, + "grad_norm": 1.116713285446167, + "learning_rate": 2.602255741269194e-05, + "loss": 3.3192, + "step": 114720 + }, + { + "epoch": 7.79487702133442, + "grad_norm": 0.8093993663787842, + "learning_rate": 2.598009240385922e-05, + "loss": 3.3503, + "step": 114725 + }, + { + "epoch": 7.795216741405082, + "grad_norm": 2.4967007637023926, + "learning_rate": 2.59376273950265e-05, + "loss": 3.2991, + "step": 114730 + }, + { + "epoch": 7.795556461475744, + "grad_norm": 1.0481281280517578, + "learning_rate": 2.5895162386193778e-05, + "loss": 3.0571, + "step": 114735 + }, + { + "epoch": 7.795896181546405, + "grad_norm": 0.9707038402557373, + "learning_rate": 2.5852697377361055e-05, + "loss": 3.3645, + "step": 114740 + }, + { + "epoch": 7.7962359016170675, + "grad_norm": 0.9576956629753113, + "learning_rate": 2.581023236852833e-05, + "loss": 3.6815, + "step": 114745 + }, + { + "epoch": 7.7965756216877296, + "grad_norm": 1.0857588052749634, + "learning_rate": 2.576776735969561e-05, + "loss": 3.5253, + "step": 114750 + }, + { + "epoch": 7.796915341758391, + "grad_norm": 1.2983111143112183, + "learning_rate": 2.572530235086289e-05, + "loss": 3.172, + "step": 114755 + }, + { + "epoch": 7.797255061829053, + "grad_norm": 0.8391571640968323, + "learning_rate": 2.5682837342030168e-05, + "loss": 3.3401, + "step": 114760 + }, + { + "epoch": 7.797594781899715, + "grad_norm": 0.7726710438728333, + "learning_rate": 2.5640372333197448e-05, + "loss": 3.514, + "step": 114765 + }, + { + "epoch": 7.797934501970376, + "grad_norm": 0.8680242896080017, + "learning_rate": 2.559790732436472e-05, + "loss": 3.4014, + "step": 114770 + }, + { + "epoch": 7.798274222041038, + "grad_norm": 0.7496702075004578, + "learning_rate": 2.5555442315532e-05, + "loss": 3.3258, + "step": 114775 + }, + { + "epoch": 7.7986139421117, + "grad_norm": 0.9320528507232666, + "learning_rate": 2.551297730669928e-05, + "loss": 3.604, + "step": 114780 + }, + { + "epoch": 7.798953662182361, + "grad_norm": 0.8442209959030151, + "learning_rate": 2.5470512297866558e-05, + "loss": 3.2327, + "step": 114785 + }, + { + "epoch": 7.7992933822530235, + "grad_norm": 0.8600015044212341, + "learning_rate": 2.542804728903384e-05, + "loss": 3.2696, + "step": 114790 + }, + { + "epoch": 7.799633102323686, + "grad_norm": 0.8862429261207581, + "learning_rate": 2.538558228020111e-05, + "loss": 3.5162, + "step": 114795 + }, + { + "epoch": 7.799972822394347, + "grad_norm": 0.8216318488121033, + "learning_rate": 2.5343117271368392e-05, + "loss": 3.3707, + "step": 114800 + }, + { + "epoch": 7.800312542465009, + "grad_norm": 0.8099024295806885, + "learning_rate": 2.5300652262535672e-05, + "loss": 3.6698, + "step": 114805 + }, + { + "epoch": 7.800652262535671, + "grad_norm": 0.7379819750785828, + "learning_rate": 2.525818725370295e-05, + "loss": 3.4943, + "step": 114810 + }, + { + "epoch": 7.800991982606332, + "grad_norm": 0.9993781447410583, + "learning_rate": 2.521572224487023e-05, + "loss": 3.2989, + "step": 114815 + }, + { + "epoch": 7.801331702676994, + "grad_norm": 1.4173696041107178, + "learning_rate": 2.517325723603751e-05, + "loss": 3.3691, + "step": 114820 + }, + { + "epoch": 7.801671422747656, + "grad_norm": 0.9001297354698181, + "learning_rate": 2.5130792227204782e-05, + "loss": 3.3834, + "step": 114825 + }, + { + "epoch": 7.802011142818317, + "grad_norm": 1.2459286451339722, + "learning_rate": 2.5088327218372062e-05, + "loss": 3.495, + "step": 114830 + }, + { + "epoch": 7.8023508628889795, + "grad_norm": 0.7495852708816528, + "learning_rate": 2.504586220953934e-05, + "loss": 3.4124, + "step": 114835 + }, + { + "epoch": 7.802690582959642, + "grad_norm": 0.8684486746788025, + "learning_rate": 2.500339720070662e-05, + "loss": 3.2718, + "step": 114840 + }, + { + "epoch": 7.803030303030303, + "grad_norm": 0.8637984395027161, + "learning_rate": 2.49609321918739e-05, + "loss": 3.4264, + "step": 114845 + }, + { + "epoch": 7.803370023100965, + "grad_norm": 0.8598664402961731, + "learning_rate": 2.4918467183041176e-05, + "loss": 3.2286, + "step": 114850 + }, + { + "epoch": 7.803709743171627, + "grad_norm": 0.8838856816291809, + "learning_rate": 2.4876002174208452e-05, + "loss": 3.3924, + "step": 114855 + }, + { + "epoch": 7.804049463242288, + "grad_norm": 0.8088594675064087, + "learning_rate": 2.483353716537573e-05, + "loss": 3.0996, + "step": 114860 + }, + { + "epoch": 7.80438918331295, + "grad_norm": 1.0199369192123413, + "learning_rate": 2.479107215654301e-05, + "loss": 3.6097, + "step": 114865 + }, + { + "epoch": 7.804728903383612, + "grad_norm": 1.1169832944869995, + "learning_rate": 2.474860714771029e-05, + "loss": 3.2063, + "step": 114870 + }, + { + "epoch": 7.805068623454273, + "grad_norm": 0.8371045589447021, + "learning_rate": 2.4706142138877566e-05, + "loss": 3.2409, + "step": 114875 + }, + { + "epoch": 7.8054083435249355, + "grad_norm": 0.8798542618751526, + "learning_rate": 2.4663677130044842e-05, + "loss": 3.2079, + "step": 114880 + }, + { + "epoch": 7.805748063595598, + "grad_norm": 0.9018483757972717, + "learning_rate": 2.462121212121212e-05, + "loss": 3.4802, + "step": 114885 + }, + { + "epoch": 7.806087783666259, + "grad_norm": 0.9338012337684631, + "learning_rate": 2.45787471123794e-05, + "loss": 3.1375, + "step": 114890 + }, + { + "epoch": 7.806427503736921, + "grad_norm": 0.8320783972740173, + "learning_rate": 2.453628210354668e-05, + "loss": 3.4149, + "step": 114895 + }, + { + "epoch": 7.806767223807583, + "grad_norm": 0.9642854928970337, + "learning_rate": 2.4493817094713956e-05, + "loss": 3.4169, + "step": 114900 + }, + { + "epoch": 7.807106943878244, + "grad_norm": 0.9193230867385864, + "learning_rate": 2.4451352085881236e-05, + "loss": 2.9667, + "step": 114905 + }, + { + "epoch": 7.807446663948906, + "grad_norm": 0.8379111289978027, + "learning_rate": 2.4408887077048513e-05, + "loss": 3.356, + "step": 114910 + }, + { + "epoch": 7.807786384019568, + "grad_norm": 1.1043237447738647, + "learning_rate": 2.436642206821579e-05, + "loss": 3.4469, + "step": 114915 + }, + { + "epoch": 7.8081261040902294, + "grad_norm": 0.938105046749115, + "learning_rate": 2.432395705938307e-05, + "loss": 3.1953, + "step": 114920 + }, + { + "epoch": 7.8084658241608915, + "grad_norm": 1.0255717039108276, + "learning_rate": 2.4281492050550346e-05, + "loss": 3.1401, + "step": 114925 + }, + { + "epoch": 7.808805544231554, + "grad_norm": 1.3805586099624634, + "learning_rate": 2.4239027041717626e-05, + "loss": 3.5166, + "step": 114930 + }, + { + "epoch": 7.809145264302215, + "grad_norm": 1.082837700843811, + "learning_rate": 2.4196562032884906e-05, + "loss": 3.3566, + "step": 114935 + }, + { + "epoch": 7.809484984372877, + "grad_norm": 0.952633798122406, + "learning_rate": 2.415409702405218e-05, + "loss": 3.4528, + "step": 114940 + }, + { + "epoch": 7.809824704443539, + "grad_norm": 0.8667873740196228, + "learning_rate": 2.411163201521946e-05, + "loss": 3.2687, + "step": 114945 + }, + { + "epoch": 7.8101644245142, + "grad_norm": 0.8527999520301819, + "learning_rate": 2.4069167006386736e-05, + "loss": 3.341, + "step": 114950 + }, + { + "epoch": 7.810504144584862, + "grad_norm": 1.0143927335739136, + "learning_rate": 2.4026701997554017e-05, + "loss": 3.4682, + "step": 114955 + }, + { + "epoch": 7.810843864655524, + "grad_norm": 0.960868239402771, + "learning_rate": 2.3984236988721297e-05, + "loss": 3.5559, + "step": 114960 + }, + { + "epoch": 7.8111835847261855, + "grad_norm": 1.0682742595672607, + "learning_rate": 2.394177197988857e-05, + "loss": 3.3869, + "step": 114965 + }, + { + "epoch": 7.8115233047968475, + "grad_norm": 0.8660510778427124, + "learning_rate": 2.389930697105585e-05, + "loss": 3.5093, + "step": 114970 + }, + { + "epoch": 7.81186302486751, + "grad_norm": 0.9177801609039307, + "learning_rate": 2.3856841962223127e-05, + "loss": 3.2909, + "step": 114975 + }, + { + "epoch": 7.812202744938171, + "grad_norm": 1.1258420944213867, + "learning_rate": 2.3814376953390407e-05, + "loss": 3.0058, + "step": 114980 + }, + { + "epoch": 7.812542465008833, + "grad_norm": 1.0479464530944824, + "learning_rate": 2.3771911944557687e-05, + "loss": 3.3088, + "step": 114985 + }, + { + "epoch": 7.812882185079495, + "grad_norm": 0.9610199332237244, + "learning_rate": 2.3729446935724964e-05, + "loss": 3.4592, + "step": 114990 + }, + { + "epoch": 7.813221905150156, + "grad_norm": 0.8305081725120544, + "learning_rate": 2.368698192689224e-05, + "loss": 3.4938, + "step": 114995 + }, + { + "epoch": 7.813561625220818, + "grad_norm": 1.146561861038208, + "learning_rate": 2.364451691805952e-05, + "loss": 3.3849, + "step": 115000 + }, + { + "epoch": 7.81390134529148, + "grad_norm": 0.748786211013794, + "learning_rate": 2.3602051909226797e-05, + "loss": 3.2878, + "step": 115005 + }, + { + "epoch": 7.8142410653621415, + "grad_norm": 0.9911561608314514, + "learning_rate": 2.3559586900394077e-05, + "loss": 3.3827, + "step": 115010 + }, + { + "epoch": 7.8145807854328035, + "grad_norm": 0.7786740660667419, + "learning_rate": 2.3517121891561354e-05, + "loss": 3.3221, + "step": 115015 + }, + { + "epoch": 7.814920505503466, + "grad_norm": 0.8932788968086243, + "learning_rate": 2.3474656882728634e-05, + "loss": 3.4815, + "step": 115020 + }, + { + "epoch": 7.815260225574127, + "grad_norm": 1.3246700763702393, + "learning_rate": 2.343219187389591e-05, + "loss": 3.1471, + "step": 115025 + }, + { + "epoch": 7.815599945644789, + "grad_norm": 0.8441046476364136, + "learning_rate": 2.3389726865063187e-05, + "loss": 3.1224, + "step": 115030 + }, + { + "epoch": 7.815939665715451, + "grad_norm": 0.9021058082580566, + "learning_rate": 2.3347261856230467e-05, + "loss": 3.5143, + "step": 115035 + }, + { + "epoch": 7.816279385786112, + "grad_norm": 0.8236507177352905, + "learning_rate": 2.3304796847397744e-05, + "loss": 3.173, + "step": 115040 + }, + { + "epoch": 7.816619105856774, + "grad_norm": 1.0150883197784424, + "learning_rate": 2.3262331838565024e-05, + "loss": 3.3913, + "step": 115045 + }, + { + "epoch": 7.816958825927435, + "grad_norm": 0.9967908263206482, + "learning_rate": 2.32198668297323e-05, + "loss": 3.1198, + "step": 115050 + }, + { + "epoch": 7.8172985459980975, + "grad_norm": 1.1206148862838745, + "learning_rate": 2.3177401820899577e-05, + "loss": 3.5795, + "step": 115055 + }, + { + "epoch": 7.81763826606876, + "grad_norm": 0.9987223148345947, + "learning_rate": 2.3134936812066857e-05, + "loss": 3.7193, + "step": 115060 + }, + { + "epoch": 7.817977986139421, + "grad_norm": 0.7998812794685364, + "learning_rate": 2.3092471803234134e-05, + "loss": 3.5321, + "step": 115065 + }, + { + "epoch": 7.818317706210083, + "grad_norm": 0.9092327356338501, + "learning_rate": 2.3050006794401414e-05, + "loss": 3.527, + "step": 115070 + }, + { + "epoch": 7.818657426280745, + "grad_norm": 0.89759761095047, + "learning_rate": 2.3007541785568694e-05, + "loss": 3.052, + "step": 115075 + }, + { + "epoch": 7.818997146351406, + "grad_norm": 1.0275319814682007, + "learning_rate": 2.2965076776735968e-05, + "loss": 3.3613, + "step": 115080 + }, + { + "epoch": 7.819336866422068, + "grad_norm": 0.9302978515625, + "learning_rate": 2.2922611767903248e-05, + "loss": 3.2525, + "step": 115085 + }, + { + "epoch": 7.81967658649273, + "grad_norm": 0.8509906530380249, + "learning_rate": 2.2880146759070528e-05, + "loss": 3.3656, + "step": 115090 + }, + { + "epoch": 7.820016306563391, + "grad_norm": 1.0535659790039062, + "learning_rate": 2.2837681750237804e-05, + "loss": 3.6091, + "step": 115095 + }, + { + "epoch": 7.8203560266340535, + "grad_norm": 0.7847927808761597, + "learning_rate": 2.2795216741405085e-05, + "loss": 3.2903, + "step": 115100 + }, + { + "epoch": 7.820695746704716, + "grad_norm": 0.9870132803916931, + "learning_rate": 2.275275173257236e-05, + "loss": 3.2922, + "step": 115105 + }, + { + "epoch": 7.821035466775377, + "grad_norm": 0.8476266860961914, + "learning_rate": 2.2710286723739638e-05, + "loss": 3.5365, + "step": 115110 + }, + { + "epoch": 7.821375186846039, + "grad_norm": 0.8929209113121033, + "learning_rate": 2.2667821714906918e-05, + "loss": 3.6126, + "step": 115115 + }, + { + "epoch": 7.821714906916701, + "grad_norm": 0.9587222337722778, + "learning_rate": 2.2625356706074195e-05, + "loss": 3.3633, + "step": 115120 + }, + { + "epoch": 7.822054626987362, + "grad_norm": 1.119791030883789, + "learning_rate": 2.2582891697241475e-05, + "loss": 3.1411, + "step": 115125 + }, + { + "epoch": 7.822394347058024, + "grad_norm": 1.1090699434280396, + "learning_rate": 2.254042668840875e-05, + "loss": 3.2981, + "step": 115130 + }, + { + "epoch": 7.822734067128686, + "grad_norm": 0.8886882662773132, + "learning_rate": 2.2497961679576028e-05, + "loss": 3.3716, + "step": 115135 + }, + { + "epoch": 7.823073787199347, + "grad_norm": 0.7693060040473938, + "learning_rate": 2.2455496670743308e-05, + "loss": 3.2953, + "step": 115140 + }, + { + "epoch": 7.8234135072700095, + "grad_norm": 0.8658074736595154, + "learning_rate": 2.2413031661910585e-05, + "loss": 3.2157, + "step": 115145 + }, + { + "epoch": 7.823753227340672, + "grad_norm": 0.8098118901252747, + "learning_rate": 2.2370566653077865e-05, + "loss": 3.3691, + "step": 115150 + }, + { + "epoch": 7.824092947411333, + "grad_norm": 1.1397113800048828, + "learning_rate": 2.232810164424514e-05, + "loss": 3.2667, + "step": 115155 + }, + { + "epoch": 7.824432667481995, + "grad_norm": 0.9787867665290833, + "learning_rate": 2.2285636635412422e-05, + "loss": 3.2436, + "step": 115160 + }, + { + "epoch": 7.824772387552657, + "grad_norm": 0.8168771862983704, + "learning_rate": 2.22431716265797e-05, + "loss": 3.2678, + "step": 115165 + }, + { + "epoch": 7.825112107623318, + "grad_norm": 0.885113000869751, + "learning_rate": 2.2200706617746975e-05, + "loss": 3.3541, + "step": 115170 + }, + { + "epoch": 7.82545182769398, + "grad_norm": 0.8719485998153687, + "learning_rate": 2.2158241608914255e-05, + "loss": 3.2808, + "step": 115175 + }, + { + "epoch": 7.825791547764642, + "grad_norm": 0.7836947441101074, + "learning_rate": 2.2115776600081535e-05, + "loss": 3.4836, + "step": 115180 + }, + { + "epoch": 7.826131267835303, + "grad_norm": 0.82196444272995, + "learning_rate": 2.2073311591248812e-05, + "loss": 3.3105, + "step": 115185 + }, + { + "epoch": 7.8264709879059655, + "grad_norm": 0.9389337301254272, + "learning_rate": 2.2030846582416092e-05, + "loss": 3.4929, + "step": 115190 + }, + { + "epoch": 7.826810707976628, + "grad_norm": 0.875511884689331, + "learning_rate": 2.1988381573583365e-05, + "loss": 3.5482, + "step": 115195 + }, + { + "epoch": 7.827150428047289, + "grad_norm": 0.894795835018158, + "learning_rate": 2.1945916564750645e-05, + "loss": 3.4688, + "step": 115200 + }, + { + "epoch": 7.827490148117951, + "grad_norm": 1.1291640996932983, + "learning_rate": 2.1903451555917926e-05, + "loss": 3.3348, + "step": 115205 + }, + { + "epoch": 7.827829868188613, + "grad_norm": 1.176121473312378, + "learning_rate": 2.1860986547085202e-05, + "loss": 3.36, + "step": 115210 + }, + { + "epoch": 7.828169588259274, + "grad_norm": 0.9450066089630127, + "learning_rate": 2.1818521538252482e-05, + "loss": 3.3753, + "step": 115215 + }, + { + "epoch": 7.828509308329936, + "grad_norm": 1.1251667737960815, + "learning_rate": 2.1776056529419756e-05, + "loss": 3.3728, + "step": 115220 + }, + { + "epoch": 7.828849028400598, + "grad_norm": 1.0882141590118408, + "learning_rate": 2.1733591520587036e-05, + "loss": 3.423, + "step": 115225 + }, + { + "epoch": 7.8291887484712595, + "grad_norm": 0.9595118761062622, + "learning_rate": 2.1691126511754316e-05, + "loss": 3.3104, + "step": 115230 + }, + { + "epoch": 7.8295284685419215, + "grad_norm": 0.8911934494972229, + "learning_rate": 2.1648661502921592e-05, + "loss": 3.3018, + "step": 115235 + }, + { + "epoch": 7.829868188612584, + "grad_norm": 1.114537239074707, + "learning_rate": 2.1606196494088873e-05, + "loss": 3.4952, + "step": 115240 + }, + { + "epoch": 7.830207908683245, + "grad_norm": 1.291375756263733, + "learning_rate": 2.156373148525615e-05, + "loss": 3.4724, + "step": 115245 + }, + { + "epoch": 7.830547628753907, + "grad_norm": 0.9357790350914001, + "learning_rate": 2.1521266476423426e-05, + "loss": 3.4078, + "step": 115250 + }, + { + "epoch": 7.830887348824569, + "grad_norm": 0.8006625771522522, + "learning_rate": 2.1478801467590706e-05, + "loss": 3.2916, + "step": 115255 + }, + { + "epoch": 7.83122706889523, + "grad_norm": 1.1116278171539307, + "learning_rate": 2.1436336458757983e-05, + "loss": 3.3249, + "step": 115260 + }, + { + "epoch": 7.831566788965892, + "grad_norm": 0.8354714512825012, + "learning_rate": 2.1393871449925263e-05, + "loss": 3.3166, + "step": 115265 + }, + { + "epoch": 7.831906509036553, + "grad_norm": 0.9888442754745483, + "learning_rate": 2.1351406441092543e-05, + "loss": 3.6857, + "step": 115270 + }, + { + "epoch": 7.8322462291072155, + "grad_norm": 0.7826831936836243, + "learning_rate": 2.130894143225982e-05, + "loss": 3.3129, + "step": 115275 + }, + { + "epoch": 7.8325859491778775, + "grad_norm": 0.9525880217552185, + "learning_rate": 2.1266476423427096e-05, + "loss": 3.4752, + "step": 115280 + }, + { + "epoch": 7.832925669248539, + "grad_norm": 1.0015250444412231, + "learning_rate": 2.1224011414594373e-05, + "loss": 3.4013, + "step": 115285 + }, + { + "epoch": 7.833265389319201, + "grad_norm": 0.709562361240387, + "learning_rate": 2.1181546405761653e-05, + "loss": 3.293, + "step": 115290 + }, + { + "epoch": 7.833605109389863, + "grad_norm": 0.9379487633705139, + "learning_rate": 2.1139081396928933e-05, + "loss": 3.3454, + "step": 115295 + }, + { + "epoch": 7.833944829460524, + "grad_norm": 0.9840753674507141, + "learning_rate": 2.109661638809621e-05, + "loss": 3.1557, + "step": 115300 + }, + { + "epoch": 7.834284549531186, + "grad_norm": 0.7465277910232544, + "learning_rate": 2.1054151379263486e-05, + "loss": 3.2136, + "step": 115305 + }, + { + "epoch": 7.834624269601848, + "grad_norm": 0.8196090459823608, + "learning_rate": 2.1011686370430763e-05, + "loss": 3.5758, + "step": 115310 + }, + { + "epoch": 7.834963989672509, + "grad_norm": 1.1604788303375244, + "learning_rate": 2.0969221361598043e-05, + "loss": 3.1693, + "step": 115315 + }, + { + "epoch": 7.8353037097431715, + "grad_norm": 1.0070806741714478, + "learning_rate": 2.0926756352765323e-05, + "loss": 3.5242, + "step": 115320 + }, + { + "epoch": 7.8356434298138335, + "grad_norm": 0.8670352697372437, + "learning_rate": 2.08842913439326e-05, + "loss": 3.4432, + "step": 115325 + }, + { + "epoch": 7.835983149884495, + "grad_norm": 0.8637135028839111, + "learning_rate": 2.084182633509988e-05, + "loss": 3.4229, + "step": 115330 + }, + { + "epoch": 7.836322869955157, + "grad_norm": 0.856228232383728, + "learning_rate": 2.0799361326267153e-05, + "loss": 3.4249, + "step": 115335 + }, + { + "epoch": 7.836662590025819, + "grad_norm": 0.8524559736251831, + "learning_rate": 2.0756896317434433e-05, + "loss": 3.3485, + "step": 115340 + }, + { + "epoch": 7.83700231009648, + "grad_norm": 0.9412023425102234, + "learning_rate": 2.0714431308601713e-05, + "loss": 3.4962, + "step": 115345 + }, + { + "epoch": 7.837342030167142, + "grad_norm": 1.0184285640716553, + "learning_rate": 2.067196629976899e-05, + "loss": 3.278, + "step": 115350 + }, + { + "epoch": 7.837681750237804, + "grad_norm": 0.7907654643058777, + "learning_rate": 2.062950129093627e-05, + "loss": 3.4292, + "step": 115355 + }, + { + "epoch": 7.838021470308465, + "grad_norm": 0.9181801676750183, + "learning_rate": 2.058703628210355e-05, + "loss": 3.4317, + "step": 115360 + }, + { + "epoch": 7.8383611903791275, + "grad_norm": 1.0042554140090942, + "learning_rate": 2.0544571273270824e-05, + "loss": 3.5528, + "step": 115365 + }, + { + "epoch": 7.83870091044979, + "grad_norm": 1.1264654397964478, + "learning_rate": 2.0502106264438104e-05, + "loss": 3.3182, + "step": 115370 + }, + { + "epoch": 7.839040630520451, + "grad_norm": 2.915147304534912, + "learning_rate": 2.045964125560538e-05, + "loss": 3.2023, + "step": 115375 + }, + { + "epoch": 7.839380350591113, + "grad_norm": 0.9946007132530212, + "learning_rate": 2.041717624677266e-05, + "loss": 3.5816, + "step": 115380 + }, + { + "epoch": 7.839720070661775, + "grad_norm": 0.9481021165847778, + "learning_rate": 2.037471123793994e-05, + "loss": 3.4615, + "step": 115385 + }, + { + "epoch": 7.840059790732436, + "grad_norm": 1.0133622884750366, + "learning_rate": 2.0332246229107214e-05, + "loss": 3.3114, + "step": 115390 + }, + { + "epoch": 7.840399510803098, + "grad_norm": 0.8761985898017883, + "learning_rate": 2.0289781220274494e-05, + "loss": 3.3664, + "step": 115395 + }, + { + "epoch": 7.84073923087376, + "grad_norm": 0.9696314334869385, + "learning_rate": 2.024731621144177e-05, + "loss": 3.3486, + "step": 115400 + }, + { + "epoch": 7.841078950944421, + "grad_norm": 0.9740232229232788, + "learning_rate": 2.020485120260905e-05, + "loss": 3.1948, + "step": 115405 + }, + { + "epoch": 7.8414186710150835, + "grad_norm": 1.1077817678451538, + "learning_rate": 2.016238619377633e-05, + "loss": 3.2221, + "step": 115410 + }, + { + "epoch": 7.841758391085746, + "grad_norm": 0.8868404030799866, + "learning_rate": 2.0119921184943607e-05, + "loss": 3.3191, + "step": 115415 + }, + { + "epoch": 7.842098111156407, + "grad_norm": 0.8251886963844299, + "learning_rate": 2.0077456176110884e-05, + "loss": 3.3638, + "step": 115420 + }, + { + "epoch": 7.842437831227069, + "grad_norm": 0.9401786923408508, + "learning_rate": 2.003499116727816e-05, + "loss": 3.51, + "step": 115425 + }, + { + "epoch": 7.842777551297731, + "grad_norm": 0.7644994854927063, + "learning_rate": 1.999252615844544e-05, + "loss": 3.3378, + "step": 115430 + }, + { + "epoch": 7.843117271368392, + "grad_norm": 0.840512752532959, + "learning_rate": 1.995006114961272e-05, + "loss": 3.4859, + "step": 115435 + }, + { + "epoch": 7.843456991439054, + "grad_norm": 1.0155402421951294, + "learning_rate": 1.9907596140779998e-05, + "loss": 3.3496, + "step": 115440 + }, + { + "epoch": 7.843796711509716, + "grad_norm": 1.1321113109588623, + "learning_rate": 1.9865131131947278e-05, + "loss": 3.4601, + "step": 115445 + }, + { + "epoch": 7.844136431580377, + "grad_norm": 0.8812751770019531, + "learning_rate": 1.9822666123114554e-05, + "loss": 3.4909, + "step": 115450 + }, + { + "epoch": 7.8444761516510395, + "grad_norm": 0.7174862027168274, + "learning_rate": 1.978020111428183e-05, + "loss": 3.1217, + "step": 115455 + }, + { + "epoch": 7.844815871721702, + "grad_norm": 0.8532968163490295, + "learning_rate": 1.973773610544911e-05, + "loss": 3.2315, + "step": 115460 + }, + { + "epoch": 7.845155591792363, + "grad_norm": 0.8116024732589722, + "learning_rate": 1.9695271096616388e-05, + "loss": 3.3982, + "step": 115465 + }, + { + "epoch": 7.845495311863025, + "grad_norm": 0.9731085896492004, + "learning_rate": 1.9652806087783668e-05, + "loss": 3.5791, + "step": 115470 + }, + { + "epoch": 7.845835031933687, + "grad_norm": 0.7584823966026306, + "learning_rate": 1.9610341078950945e-05, + "loss": 3.1778, + "step": 115475 + }, + { + "epoch": 7.846174752004348, + "grad_norm": 1.066107988357544, + "learning_rate": 1.956787607011822e-05, + "loss": 3.2984, + "step": 115480 + }, + { + "epoch": 7.84651447207501, + "grad_norm": 0.8375856280326843, + "learning_rate": 1.95254110612855e-05, + "loss": 3.4025, + "step": 115485 + }, + { + "epoch": 7.846854192145672, + "grad_norm": 1.0503791570663452, + "learning_rate": 1.9482946052452778e-05, + "loss": 3.3876, + "step": 115490 + }, + { + "epoch": 7.8471939122163334, + "grad_norm": 1.2511892318725586, + "learning_rate": 1.9440481043620058e-05, + "loss": 3.5138, + "step": 115495 + }, + { + "epoch": 7.8475336322869955, + "grad_norm": 0.8136301636695862, + "learning_rate": 1.9398016034787338e-05, + "loss": 3.7106, + "step": 115500 + }, + { + "epoch": 7.847873352357658, + "grad_norm": 1.129090666770935, + "learning_rate": 1.935555102595461e-05, + "loss": 3.3818, + "step": 115505 + }, + { + "epoch": 7.848213072428319, + "grad_norm": 1.2548298835754395, + "learning_rate": 1.931308601712189e-05, + "loss": 3.2576, + "step": 115510 + }, + { + "epoch": 7.848552792498981, + "grad_norm": 0.8671340942382812, + "learning_rate": 1.9270621008289172e-05, + "loss": 3.0321, + "step": 115515 + }, + { + "epoch": 7.848892512569643, + "grad_norm": 0.8285448551177979, + "learning_rate": 1.922815599945645e-05, + "loss": 3.515, + "step": 115520 + }, + { + "epoch": 7.849232232640304, + "grad_norm": 0.89328932762146, + "learning_rate": 1.918569099062373e-05, + "loss": 3.1042, + "step": 115525 + }, + { + "epoch": 7.849571952710966, + "grad_norm": 1.1066603660583496, + "learning_rate": 1.9143225981791002e-05, + "loss": 3.1217, + "step": 115530 + }, + { + "epoch": 7.849911672781628, + "grad_norm": 0.905995786190033, + "learning_rate": 1.9100760972958282e-05, + "loss": 3.3114, + "step": 115535 + }, + { + "epoch": 7.8502513928522895, + "grad_norm": 0.8740086555480957, + "learning_rate": 1.9058295964125562e-05, + "loss": 3.4861, + "step": 115540 + }, + { + "epoch": 7.8505911129229515, + "grad_norm": 1.110846996307373, + "learning_rate": 1.901583095529284e-05, + "loss": 3.5955, + "step": 115545 + }, + { + "epoch": 7.850930832993614, + "grad_norm": 0.8884345889091492, + "learning_rate": 1.897336594646012e-05, + "loss": 3.4902, + "step": 115550 + }, + { + "epoch": 7.851270553064275, + "grad_norm": 0.9815927147865295, + "learning_rate": 1.8930900937627395e-05, + "loss": 3.2067, + "step": 115555 + }, + { + "epoch": 7.851610273134937, + "grad_norm": 0.7342246174812317, + "learning_rate": 1.8888435928794672e-05, + "loss": 3.1898, + "step": 115560 + }, + { + "epoch": 7.851949993205599, + "grad_norm": 0.8323040008544922, + "learning_rate": 1.8845970919961952e-05, + "loss": 3.3552, + "step": 115565 + }, + { + "epoch": 7.85228971327626, + "grad_norm": 0.8886663913726807, + "learning_rate": 1.880350591112923e-05, + "loss": 3.2868, + "step": 115570 + }, + { + "epoch": 7.852629433346922, + "grad_norm": 0.8110567331314087, + "learning_rate": 1.876104090229651e-05, + "loss": 3.6252, + "step": 115575 + }, + { + "epoch": 7.852969153417584, + "grad_norm": 0.7803608775138855, + "learning_rate": 1.8718575893463786e-05, + "loss": 3.3761, + "step": 115580 + }, + { + "epoch": 7.8533088734882455, + "grad_norm": 0.7669162154197693, + "learning_rate": 1.8676110884631066e-05, + "loss": 3.214, + "step": 115585 + }, + { + "epoch": 7.8536485935589075, + "grad_norm": 0.938956081867218, + "learning_rate": 1.8633645875798342e-05, + "loss": 3.3063, + "step": 115590 + }, + { + "epoch": 7.85398831362957, + "grad_norm": 0.8600935339927673, + "learning_rate": 1.859118086696562e-05, + "loss": 3.355, + "step": 115595 + }, + { + "epoch": 7.854328033700231, + "grad_norm": 0.8332198262214661, + "learning_rate": 1.85487158581329e-05, + "loss": 3.3406, + "step": 115600 + }, + { + "epoch": 7.854667753770893, + "grad_norm": 1.007500171661377, + "learning_rate": 1.850625084930018e-05, + "loss": 3.3389, + "step": 115605 + }, + { + "epoch": 7.855007473841555, + "grad_norm": 0.9508211016654968, + "learning_rate": 1.8463785840467456e-05, + "loss": 3.2591, + "step": 115610 + }, + { + "epoch": 7.855347193912216, + "grad_norm": 0.8223510384559631, + "learning_rate": 1.8421320831634733e-05, + "loss": 3.3566, + "step": 115615 + }, + { + "epoch": 7.855686913982878, + "grad_norm": 0.9014521837234497, + "learning_rate": 1.837885582280201e-05, + "loss": 3.1086, + "step": 115620 + }, + { + "epoch": 7.85602663405354, + "grad_norm": 0.7935341000556946, + "learning_rate": 1.833639081396929e-05, + "loss": 3.5296, + "step": 115625 + }, + { + "epoch": 7.8563663541242015, + "grad_norm": 0.9205112457275391, + "learning_rate": 1.830241880690311e-05, + "loss": 3.2844, + "step": 115630 + }, + { + "epoch": 7.8567060741948636, + "grad_norm": 0.9138445258140564, + "learning_rate": 1.825995379807039e-05, + "loss": 3.4373, + "step": 115635 + }, + { + "epoch": 7.857045794265526, + "grad_norm": 0.7892048954963684, + "learning_rate": 1.821748878923767e-05, + "loss": 3.3962, + "step": 115640 + }, + { + "epoch": 7.857385514336187, + "grad_norm": 0.9354141354560852, + "learning_rate": 1.8175023780404947e-05, + "loss": 3.1393, + "step": 115645 + }, + { + "epoch": 7.857725234406849, + "grad_norm": 1.0639134645462036, + "learning_rate": 1.8132558771572224e-05, + "loss": 3.3969, + "step": 115650 + }, + { + "epoch": 7.858064954477511, + "grad_norm": 0.8901233077049255, + "learning_rate": 1.80900937627395e-05, + "loss": 3.2991, + "step": 115655 + }, + { + "epoch": 7.858404674548172, + "grad_norm": 1.1621290445327759, + "learning_rate": 1.804762875390678e-05, + "loss": 3.243, + "step": 115660 + }, + { + "epoch": 7.858744394618834, + "grad_norm": 0.8484512567520142, + "learning_rate": 1.800516374507406e-05, + "loss": 3.2482, + "step": 115665 + }, + { + "epoch": 7.859084114689496, + "grad_norm": 0.9921923875808716, + "learning_rate": 1.7962698736241337e-05, + "loss": 3.3942, + "step": 115670 + }, + { + "epoch": 7.8594238347601575, + "grad_norm": 0.8568032383918762, + "learning_rate": 1.7920233727408617e-05, + "loss": 3.3213, + "step": 115675 + }, + { + "epoch": 7.85976355483082, + "grad_norm": 0.8839669227600098, + "learning_rate": 1.7877768718575894e-05, + "loss": 3.41, + "step": 115680 + }, + { + "epoch": 7.860103274901482, + "grad_norm": 0.9469376802444458, + "learning_rate": 1.783530370974317e-05, + "loss": 3.5532, + "step": 115685 + }, + { + "epoch": 7.860442994972143, + "grad_norm": 0.8245354294776917, + "learning_rate": 1.779283870091045e-05, + "loss": 3.4691, + "step": 115690 + }, + { + "epoch": 7.860782715042805, + "grad_norm": 0.8928731679916382, + "learning_rate": 1.7750373692077727e-05, + "loss": 3.1903, + "step": 115695 + }, + { + "epoch": 7.861122435113467, + "grad_norm": 0.8753934502601624, + "learning_rate": 1.7707908683245007e-05, + "loss": 3.3669, + "step": 115700 + }, + { + "epoch": 7.861462155184128, + "grad_norm": 0.8334084749221802, + "learning_rate": 1.7665443674412287e-05, + "loss": 3.2339, + "step": 115705 + }, + { + "epoch": 7.86180187525479, + "grad_norm": 0.8848863244056702, + "learning_rate": 1.762297866557956e-05, + "loss": 3.3011, + "step": 115710 + }, + { + "epoch": 7.862141595325452, + "grad_norm": 2.0837557315826416, + "learning_rate": 1.758051365674684e-05, + "loss": 3.2205, + "step": 115715 + }, + { + "epoch": 7.8624813153961135, + "grad_norm": 1.0469070672988892, + "learning_rate": 1.7538048647914118e-05, + "loss": 3.2568, + "step": 115720 + }, + { + "epoch": 7.862821035466776, + "grad_norm": 1.0611906051635742, + "learning_rate": 1.7495583639081398e-05, + "loss": 3.2648, + "step": 115725 + }, + { + "epoch": 7.863160755537437, + "grad_norm": 0.8425038456916809, + "learning_rate": 1.7453118630248678e-05, + "loss": 3.3372, + "step": 115730 + }, + { + "epoch": 7.863500475608099, + "grad_norm": 0.8558586835861206, + "learning_rate": 1.741065362141595e-05, + "loss": 3.2257, + "step": 115735 + }, + { + "epoch": 7.863840195678761, + "grad_norm": 0.8687008619308472, + "learning_rate": 1.736818861258323e-05, + "loss": 3.3775, + "step": 115740 + }, + { + "epoch": 7.864179915749422, + "grad_norm": 0.8687962889671326, + "learning_rate": 1.732572360375051e-05, + "loss": 3.4126, + "step": 115745 + }, + { + "epoch": 7.864519635820084, + "grad_norm": 0.9146246910095215, + "learning_rate": 1.7283258594917788e-05, + "loss": 3.2539, + "step": 115750 + }, + { + "epoch": 7.864859355890746, + "grad_norm": 1.1964025497436523, + "learning_rate": 1.7240793586085068e-05, + "loss": 3.4797, + "step": 115755 + }, + { + "epoch": 7.865199075961407, + "grad_norm": 0.800546407699585, + "learning_rate": 1.7198328577252345e-05, + "loss": 3.4701, + "step": 115760 + }, + { + "epoch": 7.8655387960320695, + "grad_norm": 0.8247764110565186, + "learning_rate": 1.715586356841962e-05, + "loss": 3.4535, + "step": 115765 + }, + { + "epoch": 7.865878516102732, + "grad_norm": 0.8254044055938721, + "learning_rate": 1.71133985595869e-05, + "loss": 3.4564, + "step": 115770 + }, + { + "epoch": 7.866218236173393, + "grad_norm": 0.9693357944488525, + "learning_rate": 1.7070933550754178e-05, + "loss": 3.3074, + "step": 115775 + }, + { + "epoch": 7.866557956244055, + "grad_norm": 0.9011774659156799, + "learning_rate": 1.7028468541921458e-05, + "loss": 3.5164, + "step": 115780 + }, + { + "epoch": 7.866897676314717, + "grad_norm": 0.9043723940849304, + "learning_rate": 1.6986003533088735e-05, + "loss": 3.3937, + "step": 115785 + }, + { + "epoch": 7.867237396385378, + "grad_norm": 0.9332377314567566, + "learning_rate": 1.6943538524256015e-05, + "loss": 3.5897, + "step": 115790 + }, + { + "epoch": 7.86757711645604, + "grad_norm": 1.0422720909118652, + "learning_rate": 1.690107351542329e-05, + "loss": 3.5924, + "step": 115795 + }, + { + "epoch": 7.867916836526702, + "grad_norm": 0.8413017988204956, + "learning_rate": 1.6858608506590568e-05, + "loss": 3.3169, + "step": 115800 + }, + { + "epoch": 7.8682565565973634, + "grad_norm": 0.9427221417427063, + "learning_rate": 1.681614349775785e-05, + "loss": 3.4272, + "step": 115805 + }, + { + "epoch": 7.8685962766680255, + "grad_norm": 1.1696702241897583, + "learning_rate": 1.6773678488925125e-05, + "loss": 3.494, + "step": 115810 + }, + { + "epoch": 7.868935996738688, + "grad_norm": 0.89406418800354, + "learning_rate": 1.6731213480092405e-05, + "loss": 3.4295, + "step": 115815 + }, + { + "epoch": 7.869275716809349, + "grad_norm": 0.7606706619262695, + "learning_rate": 1.6688748471259682e-05, + "loss": 3.4219, + "step": 115820 + }, + { + "epoch": 7.869615436880011, + "grad_norm": 0.8837304711341858, + "learning_rate": 1.664628346242696e-05, + "loss": 3.4906, + "step": 115825 + }, + { + "epoch": 7.869955156950673, + "grad_norm": 0.8934082984924316, + "learning_rate": 1.660381845359424e-05, + "loss": 3.6389, + "step": 115830 + }, + { + "epoch": 7.870294877021334, + "grad_norm": 0.9416075348854065, + "learning_rate": 1.656135344476152e-05, + "loss": 3.1413, + "step": 115835 + }, + { + "epoch": 7.870634597091996, + "grad_norm": 0.77569580078125, + "learning_rate": 1.652738143769534e-05, + "loss": 3.2829, + "step": 115840 + }, + { + "epoch": 7.870974317162658, + "grad_norm": 0.9895443916320801, + "learning_rate": 1.648491642886262e-05, + "loss": 3.3575, + "step": 115845 + }, + { + "epoch": 7.8713140372333195, + "grad_norm": 1.1527740955352783, + "learning_rate": 1.6442451420029896e-05, + "loss": 3.3036, + "step": 115850 + }, + { + "epoch": 7.8716537573039815, + "grad_norm": 0.9520873427391052, + "learning_rate": 1.6399986411197173e-05, + "loss": 3.2384, + "step": 115855 + }, + { + "epoch": 7.871993477374644, + "grad_norm": 0.9333670735359192, + "learning_rate": 1.635752140236445e-05, + "loss": 3.4773, + "step": 115860 + }, + { + "epoch": 7.872333197445305, + "grad_norm": 0.8543186187744141, + "learning_rate": 1.631505639353173e-05, + "loss": 3.5229, + "step": 115865 + }, + { + "epoch": 7.872672917515967, + "grad_norm": 0.9801069498062134, + "learning_rate": 1.627259138469901e-05, + "loss": 3.4141, + "step": 115870 + }, + { + "epoch": 7.873012637586629, + "grad_norm": 0.8658555746078491, + "learning_rate": 1.6230126375866286e-05, + "loss": 3.5771, + "step": 115875 + }, + { + "epoch": 7.87335235765729, + "grad_norm": 0.7764684557914734, + "learning_rate": 1.6187661367033566e-05, + "loss": 3.4007, + "step": 115880 + }, + { + "epoch": 7.873692077727952, + "grad_norm": 0.9760209321975708, + "learning_rate": 1.614519635820084e-05, + "loss": 3.5608, + "step": 115885 + }, + { + "epoch": 7.874031797798614, + "grad_norm": 0.8917196393013, + "learning_rate": 1.610273134936812e-05, + "loss": 3.3551, + "step": 115890 + }, + { + "epoch": 7.8743715178692755, + "grad_norm": 1.0000526905059814, + "learning_rate": 1.60602663405354e-05, + "loss": 3.3097, + "step": 115895 + }, + { + "epoch": 7.8747112379399375, + "grad_norm": 0.9060320854187012, + "learning_rate": 1.6017801331702677e-05, + "loss": 3.249, + "step": 115900 + }, + { + "epoch": 7.8750509580106, + "grad_norm": 0.8220974206924438, + "learning_rate": 1.5975336322869957e-05, + "loss": 3.2578, + "step": 115905 + }, + { + "epoch": 7.875390678081261, + "grad_norm": 0.8271099328994751, + "learning_rate": 1.5932871314037237e-05, + "loss": 3.5195, + "step": 115910 + }, + { + "epoch": 7.875730398151923, + "grad_norm": 1.2816189527511597, + "learning_rate": 1.589040630520451e-05, + "loss": 3.39, + "step": 115915 + }, + { + "epoch": 7.876070118222585, + "grad_norm": 0.8445116281509399, + "learning_rate": 1.584794129637179e-05, + "loss": 3.2271, + "step": 115920 + }, + { + "epoch": 7.876409838293246, + "grad_norm": 0.8475092053413391, + "learning_rate": 1.5805476287539067e-05, + "loss": 3.5336, + "step": 115925 + }, + { + "epoch": 7.876749558363908, + "grad_norm": 0.9283754229545593, + "learning_rate": 1.5763011278706347e-05, + "loss": 3.4419, + "step": 115930 + }, + { + "epoch": 7.87708927843457, + "grad_norm": 1.075298547744751, + "learning_rate": 1.5720546269873627e-05, + "loss": 3.0582, + "step": 115935 + }, + { + "epoch": 7.8774289985052315, + "grad_norm": 1.5916749238967896, + "learning_rate": 1.56780812610409e-05, + "loss": 3.143, + "step": 115940 + }, + { + "epoch": 7.877768718575894, + "grad_norm": 0.705812931060791, + "learning_rate": 1.563561625220818e-05, + "loss": 3.4484, + "step": 115945 + }, + { + "epoch": 7.878108438646555, + "grad_norm": 0.8702048063278198, + "learning_rate": 1.5593151243375457e-05, + "loss": 3.3862, + "step": 115950 + }, + { + "epoch": 7.878448158717217, + "grad_norm": 1.1620341539382935, + "learning_rate": 1.5550686234542737e-05, + "loss": 3.2319, + "step": 115955 + }, + { + "epoch": 7.878787878787879, + "grad_norm": 0.8756722807884216, + "learning_rate": 1.5508221225710017e-05, + "loss": 3.4652, + "step": 115960 + }, + { + "epoch": 7.87912759885854, + "grad_norm": 0.8820767998695374, + "learning_rate": 1.5465756216877294e-05, + "loss": 3.3999, + "step": 115965 + }, + { + "epoch": 7.879467318929202, + "grad_norm": 0.9631357789039612, + "learning_rate": 1.542329120804457e-05, + "loss": 3.3179, + "step": 115970 + }, + { + "epoch": 7.879807038999864, + "grad_norm": 1.1747891902923584, + "learning_rate": 1.538082619921185e-05, + "loss": 3.5356, + "step": 115975 + }, + { + "epoch": 7.880146759070525, + "grad_norm": 0.8561417460441589, + "learning_rate": 1.5338361190379127e-05, + "loss": 3.4685, + "step": 115980 + }, + { + "epoch": 7.8804864791411875, + "grad_norm": 0.9082193374633789, + "learning_rate": 1.5295896181546407e-05, + "loss": 3.2852, + "step": 115985 + }, + { + "epoch": 7.88082619921185, + "grad_norm": 1.529134750366211, + "learning_rate": 1.5253431172713684e-05, + "loss": 3.4249, + "step": 115990 + }, + { + "epoch": 7.881165919282511, + "grad_norm": 1.0026991367340088, + "learning_rate": 1.5210966163880962e-05, + "loss": 3.3695, + "step": 115995 + }, + { + "epoch": 7.881505639353173, + "grad_norm": 1.4689871072769165, + "learning_rate": 1.516850115504824e-05, + "loss": 3.3577, + "step": 116000 + }, + { + "epoch": 7.881845359423835, + "grad_norm": 1.0231854915618896, + "learning_rate": 1.5126036146215518e-05, + "loss": 3.2692, + "step": 116005 + }, + { + "epoch": 7.882185079494496, + "grad_norm": 0.8574801087379456, + "learning_rate": 1.5083571137382798e-05, + "loss": 3.3862, + "step": 116010 + }, + { + "epoch": 7.882524799565158, + "grad_norm": 1.1495572328567505, + "learning_rate": 1.5041106128550076e-05, + "loss": 3.4951, + "step": 116015 + }, + { + "epoch": 7.88286451963582, + "grad_norm": 1.1564090251922607, + "learning_rate": 1.4998641119717353e-05, + "loss": 3.5231, + "step": 116020 + }, + { + "epoch": 7.883204239706481, + "grad_norm": 0.9969608783721924, + "learning_rate": 1.4956176110884631e-05, + "loss": 3.4707, + "step": 116025 + }, + { + "epoch": 7.8835439597771435, + "grad_norm": 0.9387454986572266, + "learning_rate": 1.491371110205191e-05, + "loss": 3.5012, + "step": 116030 + }, + { + "epoch": 7.883883679847806, + "grad_norm": 0.773318350315094, + "learning_rate": 1.4871246093219188e-05, + "loss": 3.4622, + "step": 116035 + }, + { + "epoch": 7.884223399918467, + "grad_norm": 0.8296008110046387, + "learning_rate": 1.4828781084386466e-05, + "loss": 3.5587, + "step": 116040 + }, + { + "epoch": 7.884563119989129, + "grad_norm": 9.56624984741211, + "learning_rate": 1.4786316075553745e-05, + "loss": 3.1208, + "step": 116045 + }, + { + "epoch": 7.884902840059791, + "grad_norm": 1.1115461587905884, + "learning_rate": 1.4743851066721021e-05, + "loss": 3.5906, + "step": 116050 + }, + { + "epoch": 7.885242560130452, + "grad_norm": 0.7923240661621094, + "learning_rate": 1.4701386057888301e-05, + "loss": 3.4435, + "step": 116055 + }, + { + "epoch": 7.885582280201114, + "grad_norm": 0.9213826656341553, + "learning_rate": 1.4658921049055578e-05, + "loss": 3.4292, + "step": 116060 + }, + { + "epoch": 7.885922000271776, + "grad_norm": 1.0419042110443115, + "learning_rate": 1.4616456040222856e-05, + "loss": 3.4981, + "step": 116065 + }, + { + "epoch": 7.886261720342437, + "grad_norm": 0.7935518622398376, + "learning_rate": 1.4573991031390135e-05, + "loss": 3.4757, + "step": 116070 + }, + { + "epoch": 7.8866014404130995, + "grad_norm": 0.9395286440849304, + "learning_rate": 1.4531526022557411e-05, + "loss": 3.2499, + "step": 116075 + }, + { + "epoch": 7.886941160483762, + "grad_norm": 1.1352423429489136, + "learning_rate": 1.4489061013724692e-05, + "loss": 3.3179, + "step": 116080 + }, + { + "epoch": 7.887280880554423, + "grad_norm": 1.0177313089370728, + "learning_rate": 1.444659600489197e-05, + "loss": 3.3241, + "step": 116085 + }, + { + "epoch": 7.887620600625085, + "grad_norm": 1.155542254447937, + "learning_rate": 1.4404130996059247e-05, + "loss": 3.3368, + "step": 116090 + }, + { + "epoch": 7.887960320695747, + "grad_norm": 0.8992282152175903, + "learning_rate": 1.4361665987226525e-05, + "loss": 3.2998, + "step": 116095 + }, + { + "epoch": 7.888300040766408, + "grad_norm": 1.0456116199493408, + "learning_rate": 1.4319200978393805e-05, + "loss": 3.4776, + "step": 116100 + }, + { + "epoch": 7.88863976083707, + "grad_norm": 0.8862426280975342, + "learning_rate": 1.4276735969561082e-05, + "loss": 3.4085, + "step": 116105 + }, + { + "epoch": 7.888979480907732, + "grad_norm": 0.8835179805755615, + "learning_rate": 1.423427096072836e-05, + "loss": 3.0222, + "step": 116110 + }, + { + "epoch": 7.8893192009783935, + "grad_norm": 0.9822314381599426, + "learning_rate": 1.4191805951895639e-05, + "loss": 3.3996, + "step": 116115 + }, + { + "epoch": 7.8896589210490555, + "grad_norm": 0.8998411297798157, + "learning_rate": 1.4149340943062915e-05, + "loss": 3.2017, + "step": 116120 + }, + { + "epoch": 7.889998641119718, + "grad_norm": 0.8986609578132629, + "learning_rate": 1.4106875934230195e-05, + "loss": 3.286, + "step": 116125 + }, + { + "epoch": 7.890338361190379, + "grad_norm": 0.863797128200531, + "learning_rate": 1.4064410925397474e-05, + "loss": 3.5556, + "step": 116130 + }, + { + "epoch": 7.890678081261041, + "grad_norm": 1.2542353868484497, + "learning_rate": 1.402194591656475e-05, + "loss": 3.5218, + "step": 116135 + }, + { + "epoch": 7.891017801331703, + "grad_norm": 1.0090429782867432, + "learning_rate": 1.3979480907732029e-05, + "loss": 3.1725, + "step": 116140 + }, + { + "epoch": 7.891357521402364, + "grad_norm": 0.9362375140190125, + "learning_rate": 1.3937015898899307e-05, + "loss": 3.1688, + "step": 116145 + }, + { + "epoch": 7.891697241473026, + "grad_norm": 1.3953783512115479, + "learning_rate": 1.3894550890066586e-05, + "loss": 3.3381, + "step": 116150 + }, + { + "epoch": 7.892036961543688, + "grad_norm": 0.8514189124107361, + "learning_rate": 1.3852085881233864e-05, + "loss": 3.288, + "step": 116155 + }, + { + "epoch": 7.8923766816143495, + "grad_norm": 0.9701269268989563, + "learning_rate": 1.380962087240114e-05, + "loss": 3.406, + "step": 116160 + }, + { + "epoch": 7.8927164016850115, + "grad_norm": 1.1062361001968384, + "learning_rate": 1.376715586356842e-05, + "loss": 3.4375, + "step": 116165 + }, + { + "epoch": 7.893056121755674, + "grad_norm": 1.6524564027786255, + "learning_rate": 1.3724690854735699e-05, + "loss": 3.4097, + "step": 116170 + }, + { + "epoch": 7.893395841826335, + "grad_norm": 0.8115729689598083, + "learning_rate": 1.3682225845902976e-05, + "loss": 3.4997, + "step": 116175 + }, + { + "epoch": 7.893735561896997, + "grad_norm": 0.86532062292099, + "learning_rate": 1.3639760837070254e-05, + "loss": 3.4106, + "step": 116180 + }, + { + "epoch": 7.894075281967659, + "grad_norm": 3.409167528152466, + "learning_rate": 1.3597295828237533e-05, + "loss": 3.5168, + "step": 116185 + }, + { + "epoch": 7.89441500203832, + "grad_norm": 0.8829941153526306, + "learning_rate": 1.3554830819404811e-05, + "loss": 3.4062, + "step": 116190 + }, + { + "epoch": 7.894754722108982, + "grad_norm": 1.1478184461593628, + "learning_rate": 1.351236581057209e-05, + "loss": 3.2423, + "step": 116195 + }, + { + "epoch": 7.895094442179644, + "grad_norm": 0.9997457265853882, + "learning_rate": 1.3469900801739368e-05, + "loss": 3.3064, + "step": 116200 + }, + { + "epoch": 7.8954341622503055, + "grad_norm": 0.9861294031143188, + "learning_rate": 1.3427435792906644e-05, + "loss": 3.4837, + "step": 116205 + }, + { + "epoch": 7.8957738823209676, + "grad_norm": 0.8822082877159119, + "learning_rate": 1.3384970784073924e-05, + "loss": 3.1523, + "step": 116210 + }, + { + "epoch": 7.89611360239163, + "grad_norm": 0.8061407804489136, + "learning_rate": 1.3342505775241201e-05, + "loss": 3.4936, + "step": 116215 + }, + { + "epoch": 7.896453322462291, + "grad_norm": 0.9887561798095703, + "learning_rate": 1.330004076640848e-05, + "loss": 3.2344, + "step": 116220 + }, + { + "epoch": 7.896793042532953, + "grad_norm": 0.813563883304596, + "learning_rate": 1.3257575757575758e-05, + "loss": 3.3716, + "step": 116225 + }, + { + "epoch": 7.897132762603615, + "grad_norm": 1.1505757570266724, + "learning_rate": 1.3215110748743035e-05, + "loss": 3.2167, + "step": 116230 + }, + { + "epoch": 7.897472482674276, + "grad_norm": 1.0675026178359985, + "learning_rate": 1.3172645739910315e-05, + "loss": 3.22, + "step": 116235 + }, + { + "epoch": 7.897812202744938, + "grad_norm": 0.9299976229667664, + "learning_rate": 1.3130180731077593e-05, + "loss": 3.1814, + "step": 116240 + }, + { + "epoch": 7.8981519228156, + "grad_norm": 0.99269038438797, + "learning_rate": 1.308771572224487e-05, + "loss": 3.4057, + "step": 116245 + }, + { + "epoch": 7.8984916428862615, + "grad_norm": 0.8486061692237854, + "learning_rate": 1.3045250713412148e-05, + "loss": 3.308, + "step": 116250 + }, + { + "epoch": 7.898831362956924, + "grad_norm": 0.9759187698364258, + "learning_rate": 1.3002785704579428e-05, + "loss": 3.2602, + "step": 116255 + }, + { + "epoch": 7.899171083027586, + "grad_norm": 0.8054320812225342, + "learning_rate": 1.2960320695746705e-05, + "loss": 3.0677, + "step": 116260 + }, + { + "epoch": 7.899510803098247, + "grad_norm": 0.8101117014884949, + "learning_rate": 1.2917855686913983e-05, + "loss": 3.5219, + "step": 116265 + }, + { + "epoch": 7.899850523168909, + "grad_norm": 1.2950226068496704, + "learning_rate": 1.2875390678081262e-05, + "loss": 3.5692, + "step": 116270 + }, + { + "epoch": 7.900190243239571, + "grad_norm": 1.232377052307129, + "learning_rate": 1.2832925669248538e-05, + "loss": 3.3244, + "step": 116275 + }, + { + "epoch": 7.900529963310232, + "grad_norm": 1.040297031402588, + "learning_rate": 1.2790460660415818e-05, + "loss": 3.3742, + "step": 116280 + }, + { + "epoch": 7.900869683380894, + "grad_norm": 1.1206929683685303, + "learning_rate": 1.2747995651583097e-05, + "loss": 3.4134, + "step": 116285 + }, + { + "epoch": 7.901209403451556, + "grad_norm": 0.8778587579727173, + "learning_rate": 1.2705530642750373e-05, + "loss": 3.4974, + "step": 116290 + }, + { + "epoch": 7.9015491235222175, + "grad_norm": 1.0118733644485474, + "learning_rate": 1.2663065633917652e-05, + "loss": 3.3696, + "step": 116295 + }, + { + "epoch": 7.90188884359288, + "grad_norm": 1.5946348905563354, + "learning_rate": 1.262060062508493e-05, + "loss": 3.3226, + "step": 116300 + }, + { + "epoch": 7.902228563663542, + "grad_norm": 0.7987099885940552, + "learning_rate": 1.2578135616252209e-05, + "loss": 3.1337, + "step": 116305 + }, + { + "epoch": 7.902568283734203, + "grad_norm": 1.2427043914794922, + "learning_rate": 1.2535670607419487e-05, + "loss": 3.3197, + "step": 116310 + }, + { + "epoch": 7.902908003804865, + "grad_norm": 0.8773234486579895, + "learning_rate": 1.2493205598586764e-05, + "loss": 3.3388, + "step": 116315 + }, + { + "epoch": 7.903247723875527, + "grad_norm": 0.8489192724227905, + "learning_rate": 1.2450740589754042e-05, + "loss": 3.5432, + "step": 116320 + }, + { + "epoch": 7.903587443946188, + "grad_norm": 0.8989433646202087, + "learning_rate": 1.2408275580921322e-05, + "loss": 3.4745, + "step": 116325 + }, + { + "epoch": 7.90392716401685, + "grad_norm": 0.7398304343223572, + "learning_rate": 1.2365810572088599e-05, + "loss": 2.8999, + "step": 116330 + }, + { + "epoch": 7.904266884087512, + "grad_norm": 0.7912055850028992, + "learning_rate": 1.2323345563255877e-05, + "loss": 3.4253, + "step": 116335 + }, + { + "epoch": 7.9046066041581735, + "grad_norm": 0.9548945426940918, + "learning_rate": 1.2280880554423156e-05, + "loss": 3.4316, + "step": 116340 + }, + { + "epoch": 7.904946324228836, + "grad_norm": 0.8963494896888733, + "learning_rate": 1.2238415545590434e-05, + "loss": 3.1191, + "step": 116345 + }, + { + "epoch": 7.905286044299498, + "grad_norm": 1.045539140701294, + "learning_rate": 1.2195950536757712e-05, + "loss": 3.5947, + "step": 116350 + }, + { + "epoch": 7.905625764370159, + "grad_norm": 0.9093453288078308, + "learning_rate": 1.215348552792499e-05, + "loss": 3.4045, + "step": 116355 + }, + { + "epoch": 7.905965484440821, + "grad_norm": 0.7456111311912537, + "learning_rate": 1.2111020519092267e-05, + "loss": 3.328, + "step": 116360 + }, + { + "epoch": 7.906305204511483, + "grad_norm": 1.0074299573898315, + "learning_rate": 1.2068555510259546e-05, + "loss": 3.5903, + "step": 116365 + }, + { + "epoch": 7.906644924582144, + "grad_norm": 0.8131290078163147, + "learning_rate": 1.2026090501426826e-05, + "loss": 3.496, + "step": 116370 + }, + { + "epoch": 7.906984644652806, + "grad_norm": 0.9192919731140137, + "learning_rate": 1.1983625492594103e-05, + "loss": 3.1376, + "step": 116375 + }, + { + "epoch": 7.907324364723468, + "grad_norm": 0.9107369184494019, + "learning_rate": 1.1941160483761381e-05, + "loss": 3.2107, + "step": 116380 + }, + { + "epoch": 7.9076640847941295, + "grad_norm": 1.0383825302124023, + "learning_rate": 1.1898695474928658e-05, + "loss": 3.4205, + "step": 116385 + }, + { + "epoch": 7.908003804864792, + "grad_norm": 0.9023416638374329, + "learning_rate": 1.1856230466095938e-05, + "loss": 3.2973, + "step": 116390 + }, + { + "epoch": 7.908343524935454, + "grad_norm": 0.8243381381034851, + "learning_rate": 1.1813765457263216e-05, + "loss": 3.3769, + "step": 116395 + }, + { + "epoch": 7.908683245006115, + "grad_norm": 0.9647794961929321, + "learning_rate": 1.1771300448430493e-05, + "loss": 3.296, + "step": 116400 + }, + { + "epoch": 7.909022965076777, + "grad_norm": 0.8837306499481201, + "learning_rate": 1.1728835439597771e-05, + "loss": 3.3775, + "step": 116405 + }, + { + "epoch": 7.909362685147439, + "grad_norm": 1.136675238609314, + "learning_rate": 1.168637043076505e-05, + "loss": 3.3494, + "step": 116410 + }, + { + "epoch": 7.9097024052181, + "grad_norm": 0.7471441626548767, + "learning_rate": 1.1643905421932328e-05, + "loss": 3.2899, + "step": 116415 + }, + { + "epoch": 7.910042125288762, + "grad_norm": 1.0410605669021606, + "learning_rate": 1.1601440413099606e-05, + "loss": 3.3419, + "step": 116420 + }, + { + "epoch": 7.9103818453594235, + "grad_norm": 1.0671144723892212, + "learning_rate": 1.1558975404266885e-05, + "loss": 3.4184, + "step": 116425 + }, + { + "epoch": 7.9107215654300855, + "grad_norm": 1.2348099946975708, + "learning_rate": 1.1516510395434161e-05, + "loss": 3.5978, + "step": 116430 + }, + { + "epoch": 7.911061285500748, + "grad_norm": 0.9575271606445312, + "learning_rate": 1.1474045386601442e-05, + "loss": 3.4619, + "step": 116435 + }, + { + "epoch": 7.911401005571409, + "grad_norm": 0.8175496459007263, + "learning_rate": 1.143158037776872e-05, + "loss": 3.4682, + "step": 116440 + }, + { + "epoch": 7.911740725642071, + "grad_norm": 0.9048704504966736, + "learning_rate": 1.1389115368935997e-05, + "loss": 3.2874, + "step": 116445 + }, + { + "epoch": 7.912080445712733, + "grad_norm": 0.8583383560180664, + "learning_rate": 1.1346650360103275e-05, + "loss": 3.3702, + "step": 116450 + }, + { + "epoch": 7.912420165783394, + "grad_norm": 0.9579060673713684, + "learning_rate": 1.1304185351270553e-05, + "loss": 3.5554, + "step": 116455 + }, + { + "epoch": 7.912759885854056, + "grad_norm": 0.9150121808052063, + "learning_rate": 1.1261720342437832e-05, + "loss": 3.483, + "step": 116460 + }, + { + "epoch": 7.913099605924718, + "grad_norm": 0.9384187459945679, + "learning_rate": 1.121925533360511e-05, + "loss": 3.2537, + "step": 116465 + }, + { + "epoch": 7.9134393259953795, + "grad_norm": 1.0819907188415527, + "learning_rate": 1.1176790324772387e-05, + "loss": 3.3222, + "step": 116470 + }, + { + "epoch": 7.9137790460660415, + "grad_norm": 0.8277508020401001, + "learning_rate": 1.1134325315939665e-05, + "loss": 3.6726, + "step": 116475 + }, + { + "epoch": 7.914118766136704, + "grad_norm": 0.7710220813751221, + "learning_rate": 1.1091860307106945e-05, + "loss": 3.2573, + "step": 116480 + }, + { + "epoch": 7.914458486207365, + "grad_norm": 0.9885514378547668, + "learning_rate": 1.1049395298274222e-05, + "loss": 3.5507, + "step": 116485 + }, + { + "epoch": 7.914798206278027, + "grad_norm": 1.3520911931991577, + "learning_rate": 1.10069302894415e-05, + "loss": 3.4371, + "step": 116490 + }, + { + "epoch": 7.915137926348689, + "grad_norm": 0.8003328442573547, + "learning_rate": 1.0964465280608779e-05, + "loss": 3.1059, + "step": 116495 + }, + { + "epoch": 7.91547764641935, + "grad_norm": 1.609450101852417, + "learning_rate": 1.0922000271776055e-05, + "loss": 3.3608, + "step": 116500 + }, + { + "epoch": 7.915817366490012, + "grad_norm": 1.2455931901931763, + "learning_rate": 1.0879535262943336e-05, + "loss": 3.4081, + "step": 116505 + }, + { + "epoch": 7.916157086560674, + "grad_norm": 0.777170717716217, + "learning_rate": 1.0837070254110614e-05, + "loss": 3.2106, + "step": 116510 + }, + { + "epoch": 7.9164968066313355, + "grad_norm": 0.9499818086624146, + "learning_rate": 1.079460524527789e-05, + "loss": 3.4399, + "step": 116515 + }, + { + "epoch": 7.916836526701998, + "grad_norm": 0.9296178221702576, + "learning_rate": 1.0752140236445169e-05, + "loss": 3.4312, + "step": 116520 + }, + { + "epoch": 7.91717624677266, + "grad_norm": 1.4534413814544678, + "learning_rate": 1.0709675227612449e-05, + "loss": 3.4814, + "step": 116525 + }, + { + "epoch": 7.917515966843321, + "grad_norm": 1.0560541152954102, + "learning_rate": 1.0667210218779726e-05, + "loss": 3.3642, + "step": 116530 + }, + { + "epoch": 7.917855686913983, + "grad_norm": 0.9030450582504272, + "learning_rate": 1.0624745209947004e-05, + "loss": 3.6405, + "step": 116535 + }, + { + "epoch": 7.918195406984645, + "grad_norm": 0.9173261523246765, + "learning_rate": 1.0582280201114282e-05, + "loss": 3.4985, + "step": 116540 + }, + { + "epoch": 7.918535127055306, + "grad_norm": 0.8283436298370361, + "learning_rate": 1.053981519228156e-05, + "loss": 3.5945, + "step": 116545 + }, + { + "epoch": 7.918874847125968, + "grad_norm": 0.7974158525466919, + "learning_rate": 1.049735018344884e-05, + "loss": 3.2663, + "step": 116550 + }, + { + "epoch": 7.91921456719663, + "grad_norm": 0.8235711455345154, + "learning_rate": 1.0454885174616116e-05, + "loss": 3.3347, + "step": 116555 + }, + { + "epoch": 7.9195542872672915, + "grad_norm": 1.0174522399902344, + "learning_rate": 1.0412420165783394e-05, + "loss": 3.3559, + "step": 116560 + }, + { + "epoch": 7.919894007337954, + "grad_norm": 1.1860289573669434, + "learning_rate": 1.0369955156950673e-05, + "loss": 3.3535, + "step": 116565 + }, + { + "epoch": 7.920233727408616, + "grad_norm": 1.0423609018325806, + "learning_rate": 1.0327490148117951e-05, + "loss": 3.276, + "step": 116570 + }, + { + "epoch": 7.920573447479277, + "grad_norm": 0.8379596471786499, + "learning_rate": 1.028502513928523e-05, + "loss": 3.4954, + "step": 116575 + }, + { + "epoch": 7.920913167549939, + "grad_norm": 0.9458230137825012, + "learning_rate": 1.0242560130452508e-05, + "loss": 3.1673, + "step": 116580 + }, + { + "epoch": 7.921252887620601, + "grad_norm": 1.1277341842651367, + "learning_rate": 1.0200095121619785e-05, + "loss": 3.4712, + "step": 116585 + }, + { + "epoch": 7.921592607691262, + "grad_norm": 1.2280609607696533, + "learning_rate": 1.0157630112787063e-05, + "loss": 3.2707, + "step": 116590 + }, + { + "epoch": 7.921932327761924, + "grad_norm": 0.8649842143058777, + "learning_rate": 1.0115165103954343e-05, + "loss": 3.4351, + "step": 116595 + }, + { + "epoch": 7.922272047832586, + "grad_norm": 0.7720298171043396, + "learning_rate": 1.007270009512162e-05, + "loss": 3.378, + "step": 116600 + }, + { + "epoch": 7.9226117679032475, + "grad_norm": 0.7366422414779663, + "learning_rate": 1.0030235086288898e-05, + "loss": 3.3717, + "step": 116605 + }, + { + "epoch": 7.92295148797391, + "grad_norm": 1.1566712856292725, + "learning_rate": 9.987770077456176e-06, + "loss": 3.1406, + "step": 116610 + }, + { + "epoch": 7.923291208044572, + "grad_norm": 0.978537917137146, + "learning_rate": 9.945305068623455e-06, + "loss": 3.4263, + "step": 116615 + }, + { + "epoch": 7.923630928115233, + "grad_norm": 0.927959680557251, + "learning_rate": 9.902840059790733e-06, + "loss": 3.613, + "step": 116620 + }, + { + "epoch": 7.923970648185895, + "grad_norm": 1.3105332851409912, + "learning_rate": 9.860375050958012e-06, + "loss": 3.2402, + "step": 116625 + }, + { + "epoch": 7.924310368256556, + "grad_norm": 0.888448178768158, + "learning_rate": 9.817910042125288e-06, + "loss": 3.1411, + "step": 116630 + }, + { + "epoch": 7.924650088327218, + "grad_norm": 0.9930823445320129, + "learning_rate": 9.775445033292567e-06, + "loss": 3.3792, + "step": 116635 + }, + { + "epoch": 7.92498980839788, + "grad_norm": 0.9135282039642334, + "learning_rate": 9.732980024459845e-06, + "loss": 3.6153, + "step": 116640 + }, + { + "epoch": 7.925329528468541, + "grad_norm": 0.8279620409011841, + "learning_rate": 9.690515015627123e-06, + "loss": 3.5242, + "step": 116645 + }, + { + "epoch": 7.9256692485392035, + "grad_norm": 0.8971065878868103, + "learning_rate": 9.648050006794402e-06, + "loss": 3.4379, + "step": 116650 + }, + { + "epoch": 7.926008968609866, + "grad_norm": 1.209789752960205, + "learning_rate": 9.605584997961679e-06, + "loss": 3.5518, + "step": 116655 + }, + { + "epoch": 7.926348688680527, + "grad_norm": 1.0281981229782104, + "learning_rate": 9.563119989128959e-06, + "loss": 3.3034, + "step": 116660 + }, + { + "epoch": 7.926688408751189, + "grad_norm": 0.8628694415092468, + "learning_rate": 9.520654980296237e-06, + "loss": 3.4435, + "step": 116665 + }, + { + "epoch": 7.927028128821851, + "grad_norm": 0.8030472993850708, + "learning_rate": 9.478189971463514e-06, + "loss": 3.4537, + "step": 116670 + }, + { + "epoch": 7.927367848892512, + "grad_norm": 0.9167085289955139, + "learning_rate": 9.435724962630792e-06, + "loss": 3.5065, + "step": 116675 + }, + { + "epoch": 7.927707568963174, + "grad_norm": 1.1988354921340942, + "learning_rate": 9.39325995379807e-06, + "loss": 3.3443, + "step": 116680 + }, + { + "epoch": 7.928047289033836, + "grad_norm": 0.9318909645080566, + "learning_rate": 9.350794944965349e-06, + "loss": 3.3203, + "step": 116685 + }, + { + "epoch": 7.9283870091044975, + "grad_norm": 1.1624102592468262, + "learning_rate": 9.308329936132627e-06, + "loss": 3.5067, + "step": 116690 + }, + { + "epoch": 7.9287267291751595, + "grad_norm": 1.1477880477905273, + "learning_rate": 9.265864927299906e-06, + "loss": 3.1371, + "step": 116695 + }, + { + "epoch": 7.929066449245822, + "grad_norm": 1.110440731048584, + "learning_rate": 9.223399918467182e-06, + "loss": 3.3949, + "step": 116700 + }, + { + "epoch": 7.929406169316483, + "grad_norm": 1.1568299531936646, + "learning_rate": 9.180934909634462e-06, + "loss": 3.3126, + "step": 116705 + }, + { + "epoch": 7.929745889387145, + "grad_norm": 0.9406833052635193, + "learning_rate": 9.138469900801739e-06, + "loss": 3.398, + "step": 116710 + }, + { + "epoch": 7.930085609457807, + "grad_norm": 0.9667701125144958, + "learning_rate": 9.096004891969017e-06, + "loss": 3.6192, + "step": 116715 + }, + { + "epoch": 7.930425329528468, + "grad_norm": 0.9343003034591675, + "learning_rate": 9.053539883136296e-06, + "loss": 3.1685, + "step": 116720 + }, + { + "epoch": 7.93076504959913, + "grad_norm": 0.9776849746704102, + "learning_rate": 9.011074874303572e-06, + "loss": 3.5765, + "step": 116725 + }, + { + "epoch": 7.931104769669792, + "grad_norm": 0.9502739310264587, + "learning_rate": 8.968609865470853e-06, + "loss": 3.298, + "step": 116730 + }, + { + "epoch": 7.9314444897404535, + "grad_norm": 1.06642484664917, + "learning_rate": 8.926144856638131e-06, + "loss": 3.1604, + "step": 116735 + }, + { + "epoch": 7.9317842098111155, + "grad_norm": 0.9050705432891846, + "learning_rate": 8.883679847805408e-06, + "loss": 3.2817, + "step": 116740 + }, + { + "epoch": 7.932123929881778, + "grad_norm": 0.9114601016044617, + "learning_rate": 8.841214838972686e-06, + "loss": 3.4405, + "step": 116745 + }, + { + "epoch": 7.932463649952439, + "grad_norm": 1.0566595792770386, + "learning_rate": 8.798749830139966e-06, + "loss": 3.1199, + "step": 116750 + }, + { + "epoch": 7.932803370023101, + "grad_norm": 1.07633376121521, + "learning_rate": 8.756284821307243e-06, + "loss": 3.3921, + "step": 116755 + }, + { + "epoch": 7.933143090093763, + "grad_norm": 0.929010808467865, + "learning_rate": 8.713819812474521e-06, + "loss": 3.4359, + "step": 116760 + }, + { + "epoch": 7.933482810164424, + "grad_norm": 10.345086097717285, + "learning_rate": 8.6713548036418e-06, + "loss": 3.4244, + "step": 116765 + }, + { + "epoch": 7.933822530235086, + "grad_norm": 0.985912561416626, + "learning_rate": 8.628889794809076e-06, + "loss": 3.6452, + "step": 116770 + }, + { + "epoch": 7.934162250305748, + "grad_norm": 0.9113600850105286, + "learning_rate": 8.586424785976356e-06, + "loss": 3.3824, + "step": 116775 + }, + { + "epoch": 7.9345019703764095, + "grad_norm": 0.9682573676109314, + "learning_rate": 8.543959777143635e-06, + "loss": 3.0158, + "step": 116780 + }, + { + "epoch": 7.9348416904470715, + "grad_norm": 1.0318443775177002, + "learning_rate": 8.501494768310911e-06, + "loss": 3.5689, + "step": 116785 + }, + { + "epoch": 7.935181410517734, + "grad_norm": 1.0325835943222046, + "learning_rate": 8.45902975947819e-06, + "loss": 3.3327, + "step": 116790 + }, + { + "epoch": 7.935521130588395, + "grad_norm": 0.8859418630599976, + "learning_rate": 8.416564750645468e-06, + "loss": 3.4312, + "step": 116795 + }, + { + "epoch": 7.935860850659057, + "grad_norm": 0.957879364490509, + "learning_rate": 8.374099741812747e-06, + "loss": 3.2207, + "step": 116800 + }, + { + "epoch": 7.936200570729719, + "grad_norm": 1.1967977285385132, + "learning_rate": 8.331634732980025e-06, + "loss": 3.4674, + "step": 116805 + }, + { + "epoch": 7.93654029080038, + "grad_norm": 0.7823821902275085, + "learning_rate": 8.289169724147302e-06, + "loss": 3.3843, + "step": 116810 + }, + { + "epoch": 7.936880010871042, + "grad_norm": 0.9404852986335754, + "learning_rate": 8.24670471531458e-06, + "loss": 3.4461, + "step": 116815 + }, + { + "epoch": 7.937219730941704, + "grad_norm": 0.8960379958152771, + "learning_rate": 8.20423970648186e-06, + "loss": 3.2473, + "step": 116820 + }, + { + "epoch": 7.9375594510123655, + "grad_norm": 1.426465392112732, + "learning_rate": 8.161774697649137e-06, + "loss": 3.3317, + "step": 116825 + }, + { + "epoch": 7.937899171083028, + "grad_norm": 0.7441895008087158, + "learning_rate": 8.119309688816415e-06, + "loss": 3.2811, + "step": 116830 + }, + { + "epoch": 7.93823889115369, + "grad_norm": 0.9563423991203308, + "learning_rate": 8.076844679983694e-06, + "loss": 3.6676, + "step": 116835 + }, + { + "epoch": 7.938578611224351, + "grad_norm": 0.8903767466545105, + "learning_rate": 8.034379671150972e-06, + "loss": 3.1385, + "step": 116840 + }, + { + "epoch": 7.938918331295013, + "grad_norm": 0.9169220328330994, + "learning_rate": 7.99191466231825e-06, + "loss": 3.4576, + "step": 116845 + }, + { + "epoch": 7.939258051365675, + "grad_norm": 1.0446540117263794, + "learning_rate": 7.949449653485529e-06, + "loss": 3.4191, + "step": 116850 + }, + { + "epoch": 7.939597771436336, + "grad_norm": 0.9285815358161926, + "learning_rate": 7.906984644652805e-06, + "loss": 3.1302, + "step": 116855 + }, + { + "epoch": 7.939937491506998, + "grad_norm": 0.8801335692405701, + "learning_rate": 7.864519635820085e-06, + "loss": 3.5371, + "step": 116860 + }, + { + "epoch": 7.94027721157766, + "grad_norm": 1.1792230606079102, + "learning_rate": 7.822054626987364e-06, + "loss": 3.4455, + "step": 116865 + }, + { + "epoch": 7.9406169316483215, + "grad_norm": 0.9104101657867432, + "learning_rate": 7.77958961815464e-06, + "loss": 3.3556, + "step": 116870 + }, + { + "epoch": 7.940956651718984, + "grad_norm": 1.02010178565979, + "learning_rate": 7.737124609321919e-06, + "loss": 3.5455, + "step": 116875 + }, + { + "epoch": 7.941296371789646, + "grad_norm": 0.9076337218284607, + "learning_rate": 7.694659600489197e-06, + "loss": 3.6044, + "step": 116880 + }, + { + "epoch": 7.941636091860307, + "grad_norm": 1.0605700016021729, + "learning_rate": 7.652194591656476e-06, + "loss": 3.4545, + "step": 116885 + }, + { + "epoch": 7.941975811930969, + "grad_norm": 0.8869123458862305, + "learning_rate": 7.609729582823754e-06, + "loss": 3.6093, + "step": 116890 + }, + { + "epoch": 7.942315532001631, + "grad_norm": 1.1121597290039062, + "learning_rate": 7.567264573991032e-06, + "loss": 3.2774, + "step": 116895 + }, + { + "epoch": 7.942655252072292, + "grad_norm": 1.8815683126449585, + "learning_rate": 7.52479956515831e-06, + "loss": 3.4219, + "step": 116900 + }, + { + "epoch": 7.942994972142954, + "grad_norm": 0.785434365272522, + "learning_rate": 7.4823345563255875e-06, + "loss": 3.4682, + "step": 116905 + }, + { + "epoch": 7.943334692213616, + "grad_norm": 0.7085667252540588, + "learning_rate": 7.439869547492866e-06, + "loss": 3.356, + "step": 116910 + }, + { + "epoch": 7.9436744122842775, + "grad_norm": 1.1977427005767822, + "learning_rate": 7.397404538660144e-06, + "loss": 3.4995, + "step": 116915 + }, + { + "epoch": 7.94401413235494, + "grad_norm": 0.9996548295021057, + "learning_rate": 7.354939529827422e-06, + "loss": 3.3888, + "step": 116920 + }, + { + "epoch": 7.944353852425602, + "grad_norm": 0.8569489121437073, + "learning_rate": 7.312474520994701e-06, + "loss": 3.4396, + "step": 116925 + }, + { + "epoch": 7.944693572496263, + "grad_norm": 0.8829942941665649, + "learning_rate": 7.270009512161979e-06, + "loss": 3.2292, + "step": 116930 + }, + { + "epoch": 7.945033292566925, + "grad_norm": 0.8755810260772705, + "learning_rate": 7.227544503329257e-06, + "loss": 3.6013, + "step": 116935 + }, + { + "epoch": 7.945373012637587, + "grad_norm": 0.8394572138786316, + "learning_rate": 7.1850794944965345e-06, + "loss": 3.2441, + "step": 116940 + }, + { + "epoch": 7.945712732708248, + "grad_norm": 0.8988709449768066, + "learning_rate": 7.142614485663814e-06, + "loss": 3.4798, + "step": 116945 + }, + { + "epoch": 7.94605245277891, + "grad_norm": 1.1230111122131348, + "learning_rate": 7.100149476831091e-06, + "loss": 3.4555, + "step": 116950 + }, + { + "epoch": 7.946392172849572, + "grad_norm": 1.0216529369354248, + "learning_rate": 7.057684467998369e-06, + "loss": 3.4682, + "step": 116955 + }, + { + "epoch": 7.9467318929202335, + "grad_norm": 0.9499970078468323, + "learning_rate": 7.015219459165648e-06, + "loss": 3.4844, + "step": 116960 + }, + { + "epoch": 7.947071612990896, + "grad_norm": 0.799590528011322, + "learning_rate": 6.9727544503329256e-06, + "loss": 3.3449, + "step": 116965 + }, + { + "epoch": 7.947411333061558, + "grad_norm": 1.2945332527160645, + "learning_rate": 6.930289441500204e-06, + "loss": 3.3166, + "step": 116970 + }, + { + "epoch": 7.947751053132219, + "grad_norm": 0.941944420337677, + "learning_rate": 6.887824432667482e-06, + "loss": 3.1497, + "step": 116975 + }, + { + "epoch": 7.948090773202881, + "grad_norm": 0.961362898349762, + "learning_rate": 6.845359423834761e-06, + "loss": 3.521, + "step": 116980 + }, + { + "epoch": 7.948430493273543, + "grad_norm": 1.1206685304641724, + "learning_rate": 6.802894415002038e-06, + "loss": 3.4088, + "step": 116985 + }, + { + "epoch": 7.948770213344204, + "grad_norm": 1.3242701292037964, + "learning_rate": 6.760429406169317e-06, + "loss": 3.5296, + "step": 116990 + }, + { + "epoch": 7.949109933414866, + "grad_norm": 1.2758419513702393, + "learning_rate": 6.717964397336595e-06, + "loss": 3.382, + "step": 116995 + }, + { + "epoch": 7.949449653485528, + "grad_norm": 0.9132824540138245, + "learning_rate": 6.6754993885038726e-06, + "loss": 3.4177, + "step": 117000 + }, + { + "epoch": 7.9497893735561895, + "grad_norm": 0.8920568227767944, + "learning_rate": 6.633034379671151e-06, + "loss": 3.3601, + "step": 117005 + }, + { + "epoch": 7.950129093626852, + "grad_norm": 5.019270420074463, + "learning_rate": 6.590569370838429e-06, + "loss": 3.2153, + "step": 117010 + }, + { + "epoch": 7.950468813697514, + "grad_norm": 1.0341037511825562, + "learning_rate": 6.548104362005708e-06, + "loss": 3.2989, + "step": 117015 + }, + { + "epoch": 7.950808533768175, + "grad_norm": 0.7298821210861206, + "learning_rate": 6.505639353172985e-06, + "loss": 3.1297, + "step": 117020 + }, + { + "epoch": 7.951148253838837, + "grad_norm": 0.9584304094314575, + "learning_rate": 6.463174344340264e-06, + "loss": 2.904, + "step": 117025 + }, + { + "epoch": 7.951487973909499, + "grad_norm": 1.3317363262176514, + "learning_rate": 6.420709335507542e-06, + "loss": 3.3658, + "step": 117030 + }, + { + "epoch": 7.95182769398016, + "grad_norm": 0.9715455174446106, + "learning_rate": 6.37824432667482e-06, + "loss": 3.4245, + "step": 117035 + }, + { + "epoch": 7.952167414050822, + "grad_norm": 0.9651104807853699, + "learning_rate": 6.335779317842098e-06, + "loss": 3.3816, + "step": 117040 + }, + { + "epoch": 7.952507134121484, + "grad_norm": 1.1670786142349243, + "learning_rate": 6.293314309009377e-06, + "loss": 3.5226, + "step": 117045 + }, + { + "epoch": 7.9528468541921455, + "grad_norm": 1.5975185632705688, + "learning_rate": 6.250849300176655e-06, + "loss": 3.3393, + "step": 117050 + }, + { + "epoch": 7.953186574262808, + "grad_norm": 0.9534032344818115, + "learning_rate": 6.208384291343932e-06, + "loss": 3.3217, + "step": 117055 + }, + { + "epoch": 7.95352629433347, + "grad_norm": 0.9489267468452454, + "learning_rate": 6.165919282511211e-06, + "loss": 3.314, + "step": 117060 + }, + { + "epoch": 7.953866014404131, + "grad_norm": 0.7998537421226501, + "learning_rate": 6.123454273678489e-06, + "loss": 3.1665, + "step": 117065 + }, + { + "epoch": 7.954205734474793, + "grad_norm": 0.8890878558158875, + "learning_rate": 6.080989264845767e-06, + "loss": 3.3961, + "step": 117070 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 0.8904721140861511, + "learning_rate": 6.038524256013045e-06, + "loss": 3.1856, + "step": 117075 + }, + { + "epoch": 7.954885174616116, + "grad_norm": 1.2795671224594116, + "learning_rate": 5.996059247180324e-06, + "loss": 3.3654, + "step": 117080 + }, + { + "epoch": 7.955224894686778, + "grad_norm": 1.0729902982711792, + "learning_rate": 5.953594238347602e-06, + "loss": 3.235, + "step": 117085 + }, + { + "epoch": 7.95556461475744, + "grad_norm": 1.082217812538147, + "learning_rate": 5.91112922951488e-06, + "loss": 3.2508, + "step": 117090 + }, + { + "epoch": 7.9559043348281016, + "grad_norm": 0.7448486089706421, + "learning_rate": 5.8686642206821585e-06, + "loss": 3.5403, + "step": 117095 + }, + { + "epoch": 7.956244054898764, + "grad_norm": 0.7800440192222595, + "learning_rate": 5.826199211849436e-06, + "loss": 3.5026, + "step": 117100 + }, + { + "epoch": 7.956583774969425, + "grad_norm": 0.7328072786331177, + "learning_rate": 5.783734203016714e-06, + "loss": 3.2931, + "step": 117105 + }, + { + "epoch": 7.956923495040087, + "grad_norm": 0.9910997748374939, + "learning_rate": 5.741269194183992e-06, + "loss": 3.4043, + "step": 117110 + }, + { + "epoch": 7.957263215110749, + "grad_norm": 0.9993313550949097, + "learning_rate": 5.698804185351271e-06, + "loss": 3.325, + "step": 117115 + }, + { + "epoch": 7.95760293518141, + "grad_norm": 1.2610512971878052, + "learning_rate": 5.656339176518549e-06, + "loss": 3.2648, + "step": 117120 + }, + { + "epoch": 7.957942655252072, + "grad_norm": 2.517399311065674, + "learning_rate": 5.613874167685827e-06, + "loss": 3.2516, + "step": 117125 + }, + { + "epoch": 7.958282375322734, + "grad_norm": 2.255922794342041, + "learning_rate": 5.5714091588531054e-06, + "loss": 3.0886, + "step": 117130 + }, + { + "epoch": 7.9586220953933955, + "grad_norm": 1.0058856010437012, + "learning_rate": 5.528944150020384e-06, + "loss": 3.3352, + "step": 117135 + }, + { + "epoch": 7.958961815464058, + "grad_norm": 0.8048667907714844, + "learning_rate": 5.486479141187661e-06, + "loss": 3.5782, + "step": 117140 + }, + { + "epoch": 7.95930153553472, + "grad_norm": 0.9396070241928101, + "learning_rate": 5.444014132354939e-06, + "loss": 3.4374, + "step": 117145 + }, + { + "epoch": 7.959641255605381, + "grad_norm": 1.3895988464355469, + "learning_rate": 5.401549123522218e-06, + "loss": 3.3937, + "step": 117150 + }, + { + "epoch": 7.959980975676043, + "grad_norm": 1.0638508796691895, + "learning_rate": 5.359084114689496e-06, + "loss": 3.4075, + "step": 117155 + }, + { + "epoch": 7.960320695746705, + "grad_norm": 1.147978663444519, + "learning_rate": 5.316619105856774e-06, + "loss": 3.5914, + "step": 117160 + }, + { + "epoch": 7.960660415817366, + "grad_norm": 0.7572131752967834, + "learning_rate": 5.2741540970240524e-06, + "loss": 3.4647, + "step": 117165 + }, + { + "epoch": 7.961000135888028, + "grad_norm": 1.3496909141540527, + "learning_rate": 5.231689088191331e-06, + "loss": 3.3251, + "step": 117170 + }, + { + "epoch": 7.96133985595869, + "grad_norm": 1.0912625789642334, + "learning_rate": 5.189224079358608e-06, + "loss": 3.3868, + "step": 117175 + }, + { + "epoch": 7.9616795760293515, + "grad_norm": 0.8769623637199402, + "learning_rate": 5.146759070525888e-06, + "loss": 3.4198, + "step": 117180 + }, + { + "epoch": 7.962019296100014, + "grad_norm": 0.8366495370864868, + "learning_rate": 5.104294061693165e-06, + "loss": 3.258, + "step": 117185 + }, + { + "epoch": 7.962359016170676, + "grad_norm": 0.9728685617446899, + "learning_rate": 5.061829052860443e-06, + "loss": 3.2902, + "step": 117190 + }, + { + "epoch": 7.962698736241337, + "grad_norm": 1.3270682096481323, + "learning_rate": 5.019364044027721e-06, + "loss": 3.1826, + "step": 117195 + }, + { + "epoch": 7.963038456311999, + "grad_norm": 0.8425725102424622, + "learning_rate": 4.976899035194999e-06, + "loss": 3.49, + "step": 117200 + }, + { + "epoch": 7.963378176382661, + "grad_norm": 0.9526097178459167, + "learning_rate": 4.934434026362278e-06, + "loss": 3.3132, + "step": 117205 + }, + { + "epoch": 7.963717896453322, + "grad_norm": 1.0207871198654175, + "learning_rate": 4.891969017529555e-06, + "loss": 3.5368, + "step": 117210 + }, + { + "epoch": 7.964057616523984, + "grad_norm": 0.9663781523704529, + "learning_rate": 4.8495040086968346e-06, + "loss": 3.5094, + "step": 117215 + }, + { + "epoch": 7.964397336594646, + "grad_norm": 1.1090892553329468, + "learning_rate": 4.807038999864112e-06, + "loss": 3.3642, + "step": 117220 + }, + { + "epoch": 7.9647370566653075, + "grad_norm": 1.1921916007995605, + "learning_rate": 4.7645739910313905e-06, + "loss": 3.3907, + "step": 117225 + }, + { + "epoch": 7.96507677673597, + "grad_norm": 0.7731561064720154, + "learning_rate": 4.722108982198668e-06, + "loss": 3.4243, + "step": 117230 + }, + { + "epoch": 7.965416496806632, + "grad_norm": 0.908453106880188, + "learning_rate": 4.679643973365946e-06, + "loss": 3.2411, + "step": 117235 + }, + { + "epoch": 7.965756216877293, + "grad_norm": 0.7578496336936951, + "learning_rate": 4.637178964533225e-06, + "loss": 3.5029, + "step": 117240 + }, + { + "epoch": 7.966095936947955, + "grad_norm": 0.9767783284187317, + "learning_rate": 4.594713955700502e-06, + "loss": 3.579, + "step": 117245 + }, + { + "epoch": 7.966435657018617, + "grad_norm": 0.7536844611167908, + "learning_rate": 4.5522489468677816e-06, + "loss": 3.5188, + "step": 117250 + }, + { + "epoch": 7.966775377089278, + "grad_norm": 0.865977942943573, + "learning_rate": 4.509783938035059e-06, + "loss": 3.3329, + "step": 117255 + }, + { + "epoch": 7.96711509715994, + "grad_norm": 0.8407233953475952, + "learning_rate": 4.4673189292023375e-06, + "loss": 3.0612, + "step": 117260 + }, + { + "epoch": 7.967454817230602, + "grad_norm": 0.9029647707939148, + "learning_rate": 4.424853920369615e-06, + "loss": 3.3338, + "step": 117265 + }, + { + "epoch": 7.9677945373012635, + "grad_norm": 0.9360986948013306, + "learning_rate": 4.382388911536894e-06, + "loss": 3.443, + "step": 117270 + }, + { + "epoch": 7.968134257371926, + "grad_norm": 1.0597829818725586, + "learning_rate": 4.339923902704172e-06, + "loss": 3.399, + "step": 117275 + }, + { + "epoch": 7.968473977442588, + "grad_norm": 1.011165738105774, + "learning_rate": 4.297458893871449e-06, + "loss": 3.2867, + "step": 117280 + }, + { + "epoch": 7.968813697513249, + "grad_norm": 1.0258355140686035, + "learning_rate": 4.2549938850387285e-06, + "loss": 3.2981, + "step": 117285 + }, + { + "epoch": 7.969153417583911, + "grad_norm": 0.7790448069572449, + "learning_rate": 4.212528876206006e-06, + "loss": 3.6032, + "step": 117290 + }, + { + "epoch": 7.969493137654573, + "grad_norm": 0.7645205855369568, + "learning_rate": 4.1700638673732845e-06, + "loss": 3.5353, + "step": 117295 + }, + { + "epoch": 7.969832857725234, + "grad_norm": 1.0478875637054443, + "learning_rate": 4.127598858540563e-06, + "loss": 3.3268, + "step": 117300 + }, + { + "epoch": 7.970172577795896, + "grad_norm": 1.0692400932312012, + "learning_rate": 4.085133849707841e-06, + "loss": 3.54, + "step": 117305 + }, + { + "epoch": 7.9705122978665575, + "grad_norm": 0.903150737285614, + "learning_rate": 4.042668840875119e-06, + "loss": 3.2467, + "step": 117310 + }, + { + "epoch": 7.9708520179372195, + "grad_norm": 0.9086745977401733, + "learning_rate": 4.000203832042397e-06, + "loss": 3.1188, + "step": 117315 + }, + { + "epoch": 7.971191738007882, + "grad_norm": 1.2438172101974487, + "learning_rate": 3.9577388232096755e-06, + "loss": 3.3454, + "step": 117320 + }, + { + "epoch": 7.971531458078543, + "grad_norm": 0.9926016330718994, + "learning_rate": 3.915273814376953e-06, + "loss": 3.2936, + "step": 117325 + }, + { + "epoch": 7.971871178149205, + "grad_norm": 1.0362221002578735, + "learning_rate": 3.8728088055442314e-06, + "loss": 3.3306, + "step": 117330 + }, + { + "epoch": 7.972210898219867, + "grad_norm": 0.9202038049697876, + "learning_rate": 3.83034379671151e-06, + "loss": 3.4943, + "step": 117335 + }, + { + "epoch": 7.972550618290528, + "grad_norm": 0.974084734916687, + "learning_rate": 3.7878787878787882e-06, + "loss": 3.5161, + "step": 117340 + }, + { + "epoch": 7.97289033836119, + "grad_norm": 1.2423197031021118, + "learning_rate": 3.745413779046066e-06, + "loss": 3.4119, + "step": 117345 + }, + { + "epoch": 7.973230058431852, + "grad_norm": 0.9911630749702454, + "learning_rate": 3.702948770213344e-06, + "loss": 3.0941, + "step": 117350 + }, + { + "epoch": 7.9735697785025135, + "grad_norm": 0.953568696975708, + "learning_rate": 3.6604837613806225e-06, + "loss": 3.4726, + "step": 117355 + }, + { + "epoch": 7.9739094985731755, + "grad_norm": 0.9334720969200134, + "learning_rate": 3.6180187525479005e-06, + "loss": 3.4326, + "step": 117360 + }, + { + "epoch": 7.974249218643838, + "grad_norm": 0.9428009390830994, + "learning_rate": 3.575553743715179e-06, + "loss": 3.3214, + "step": 117365 + }, + { + "epoch": 7.974588938714499, + "grad_norm": 1.4598023891448975, + "learning_rate": 3.533088734882457e-06, + "loss": 3.4391, + "step": 117370 + }, + { + "epoch": 7.974928658785161, + "grad_norm": 1.016463279724121, + "learning_rate": 3.490623726049735e-06, + "loss": 3.5056, + "step": 117375 + }, + { + "epoch": 7.975268378855823, + "grad_norm": 0.9509491920471191, + "learning_rate": 3.448158717217013e-06, + "loss": 3.4045, + "step": 117380 + }, + { + "epoch": 7.975608098926484, + "grad_norm": 0.7301925420761108, + "learning_rate": 3.4056937083842915e-06, + "loss": 3.41, + "step": 117385 + }, + { + "epoch": 7.975947818997146, + "grad_norm": 0.8806939721107483, + "learning_rate": 3.36322869955157e-06, + "loss": 3.4951, + "step": 117390 + }, + { + "epoch": 7.976287539067808, + "grad_norm": 0.907806932926178, + "learning_rate": 3.3207636907188475e-06, + "loss": 3.4397, + "step": 117395 + }, + { + "epoch": 7.9766272591384695, + "grad_norm": 0.9362978339195251, + "learning_rate": 3.278298681886126e-06, + "loss": 3.2552, + "step": 117400 + }, + { + "epoch": 7.976966979209132, + "grad_norm": 0.8269045948982239, + "learning_rate": 3.235833673053404e-06, + "loss": 3.4186, + "step": 117405 + }, + { + "epoch": 7.977306699279794, + "grad_norm": 0.8900313973426819, + "learning_rate": 3.193368664220682e-06, + "loss": 3.4061, + "step": 117410 + }, + { + "epoch": 7.977646419350455, + "grad_norm": 0.8181596994400024, + "learning_rate": 3.1509036553879606e-06, + "loss": 3.2702, + "step": 117415 + }, + { + "epoch": 7.977986139421117, + "grad_norm": 0.8245812058448792, + "learning_rate": 3.1084386465552385e-06, + "loss": 3.3604, + "step": 117420 + }, + { + "epoch": 7.978325859491779, + "grad_norm": 0.7542943954467773, + "learning_rate": 3.065973637722517e-06, + "loss": 3.7126, + "step": 117425 + }, + { + "epoch": 7.97866557956244, + "grad_norm": 1.1748710870742798, + "learning_rate": 3.023508628889795e-06, + "loss": 3.3777, + "step": 117430 + }, + { + "epoch": 7.979005299633102, + "grad_norm": 1.201479196548462, + "learning_rate": 2.9810436200570733e-06, + "loss": 3.6021, + "step": 117435 + }, + { + "epoch": 7.979345019703764, + "grad_norm": 0.9979589581489563, + "learning_rate": 2.938578611224351e-06, + "loss": 3.5157, + "step": 117440 + }, + { + "epoch": 7.9796847397744255, + "grad_norm": 0.9610357880592346, + "learning_rate": 2.896113602391629e-06, + "loss": 3.6827, + "step": 117445 + }, + { + "epoch": 7.980024459845088, + "grad_norm": 1.2221890687942505, + "learning_rate": 2.8536485935589076e-06, + "loss": 3.2539, + "step": 117450 + }, + { + "epoch": 7.98036417991575, + "grad_norm": 0.8058366179466248, + "learning_rate": 2.8111835847261855e-06, + "loss": 3.1573, + "step": 117455 + }, + { + "epoch": 7.980703899986411, + "grad_norm": 0.9379357695579529, + "learning_rate": 2.768718575893464e-06, + "loss": 3.5204, + "step": 117460 + }, + { + "epoch": 7.981043620057073, + "grad_norm": 0.7994109392166138, + "learning_rate": 2.726253567060742e-06, + "loss": 3.3457, + "step": 117465 + }, + { + "epoch": 7.981383340127735, + "grad_norm": 1.0193010568618774, + "learning_rate": 2.6837885582280202e-06, + "loss": 3.2785, + "step": 117470 + }, + { + "epoch": 7.981723060198396, + "grad_norm": 1.0214210748672485, + "learning_rate": 2.6413235493952986e-06, + "loss": 3.5272, + "step": 117475 + }, + { + "epoch": 7.982062780269058, + "grad_norm": 0.8707491159439087, + "learning_rate": 2.5988585405625766e-06, + "loss": 3.4596, + "step": 117480 + }, + { + "epoch": 7.98240250033972, + "grad_norm": 0.7753697633743286, + "learning_rate": 2.556393531729855e-06, + "loss": 3.3915, + "step": 117485 + }, + { + "epoch": 7.9827422204103815, + "grad_norm": 0.9439900517463684, + "learning_rate": 2.5139285228971325e-06, + "loss": 3.617, + "step": 117490 + }, + { + "epoch": 7.983081940481044, + "grad_norm": 0.8206765651702881, + "learning_rate": 2.471463514064411e-06, + "loss": 3.1006, + "step": 117495 + }, + { + "epoch": 7.983421660551706, + "grad_norm": 0.8284202814102173, + "learning_rate": 2.428998505231689e-06, + "loss": 3.4138, + "step": 117500 + }, + { + "epoch": 7.983761380622367, + "grad_norm": 1.546726942062378, + "learning_rate": 2.3865334963989672e-06, + "loss": 3.2988, + "step": 117505 + }, + { + "epoch": 7.984101100693029, + "grad_norm": 0.9728623628616333, + "learning_rate": 2.3440684875662456e-06, + "loss": 3.7281, + "step": 117510 + }, + { + "epoch": 7.984440820763691, + "grad_norm": 0.8122462034225464, + "learning_rate": 2.3016034787335236e-06, + "loss": 3.3975, + "step": 117515 + }, + { + "epoch": 7.984780540834352, + "grad_norm": 0.7745659351348877, + "learning_rate": 2.259138469900802e-06, + "loss": 3.514, + "step": 117520 + }, + { + "epoch": 7.985120260905014, + "grad_norm": 0.7590939998626709, + "learning_rate": 2.21667346106808e-06, + "loss": 3.3353, + "step": 117525 + }, + { + "epoch": 7.985459980975676, + "grad_norm": 0.9289600849151611, + "learning_rate": 2.1742084522353583e-06, + "loss": 2.9935, + "step": 117530 + }, + { + "epoch": 7.9857997010463375, + "grad_norm": 0.7830320000648499, + "learning_rate": 2.1317434434026363e-06, + "loss": 3.4058, + "step": 117535 + }, + { + "epoch": 7.986139421117, + "grad_norm": 0.8799627423286438, + "learning_rate": 2.0892784345699142e-06, + "loss": 3.3539, + "step": 117540 + }, + { + "epoch": 7.986479141187662, + "grad_norm": 0.8360272645950317, + "learning_rate": 2.0468134257371926e-06, + "loss": 3.3765, + "step": 117545 + }, + { + "epoch": 7.986818861258323, + "grad_norm": 0.8870963454246521, + "learning_rate": 2.0043484169044706e-06, + "loss": 3.3201, + "step": 117550 + }, + { + "epoch": 7.987158581328985, + "grad_norm": 0.9705449938774109, + "learning_rate": 1.961883408071749e-06, + "loss": 3.5543, + "step": 117555 + }, + { + "epoch": 7.987498301399647, + "grad_norm": 0.8764634132385254, + "learning_rate": 1.919418399239027e-06, + "loss": 3.5448, + "step": 117560 + }, + { + "epoch": 7.987838021470308, + "grad_norm": 0.9476521015167236, + "learning_rate": 1.8769533904063053e-06, + "loss": 3.5386, + "step": 117565 + }, + { + "epoch": 7.98817774154097, + "grad_norm": 0.9111322164535522, + "learning_rate": 1.8344883815735833e-06, + "loss": 3.2171, + "step": 117570 + }, + { + "epoch": 7.988517461611632, + "grad_norm": 1.0649820566177368, + "learning_rate": 1.7920233727408616e-06, + "loss": 3.2485, + "step": 117575 + }, + { + "epoch": 7.9888571816822935, + "grad_norm": 0.9265506267547607, + "learning_rate": 1.7495583639081398e-06, + "loss": 3.1963, + "step": 117580 + }, + { + "epoch": 7.989196901752956, + "grad_norm": 0.9048522710800171, + "learning_rate": 1.707093355075418e-06, + "loss": 3.1793, + "step": 117585 + }, + { + "epoch": 7.989536621823618, + "grad_norm": 1.2579624652862549, + "learning_rate": 1.664628346242696e-06, + "loss": 3.3287, + "step": 117590 + }, + { + "epoch": 7.989876341894279, + "grad_norm": 0.8823444843292236, + "learning_rate": 1.6221633374099741e-06, + "loss": 3.3976, + "step": 117595 + }, + { + "epoch": 7.990216061964941, + "grad_norm": 0.837339460849762, + "learning_rate": 1.5796983285772523e-06, + "loss": 3.514, + "step": 117600 + }, + { + "epoch": 7.990555782035603, + "grad_norm": 0.9399139881134033, + "learning_rate": 1.5372333197445307e-06, + "loss": 3.3297, + "step": 117605 + }, + { + "epoch": 7.990895502106264, + "grad_norm": 0.9182848930358887, + "learning_rate": 1.4947683109118088e-06, + "loss": 3.3511, + "step": 117610 + }, + { + "epoch": 7.991235222176926, + "grad_norm": 0.9416151642799377, + "learning_rate": 1.4523033020790868e-06, + "loss": 3.2493, + "step": 117615 + }, + { + "epoch": 7.991574942247588, + "grad_norm": 0.9170828461647034, + "learning_rate": 1.409838293246365e-06, + "loss": 3.3751, + "step": 117620 + }, + { + "epoch": 7.9919146623182495, + "grad_norm": 0.906395673751831, + "learning_rate": 1.3673732844136431e-06, + "loss": 3.5663, + "step": 117625 + }, + { + "epoch": 7.992254382388912, + "grad_norm": 0.7348623275756836, + "learning_rate": 1.3249082755809213e-06, + "loss": 3.4219, + "step": 117630 + }, + { + "epoch": 7.992594102459574, + "grad_norm": 1.050638198852539, + "learning_rate": 1.2824432667481997e-06, + "loss": 3.338, + "step": 117635 + }, + { + "epoch": 7.992933822530235, + "grad_norm": 0.9951655864715576, + "learning_rate": 1.2399782579154776e-06, + "loss": 3.537, + "step": 117640 + }, + { + "epoch": 7.993273542600897, + "grad_norm": 7.599734783172607, + "learning_rate": 1.1975132490827558e-06, + "loss": 3.1919, + "step": 117645 + }, + { + "epoch": 7.993613262671559, + "grad_norm": 0.9461233615875244, + "learning_rate": 1.155048240250034e-06, + "loss": 3.2409, + "step": 117650 + }, + { + "epoch": 7.99395298274222, + "grad_norm": 0.887936532497406, + "learning_rate": 1.1125832314173122e-06, + "loss": 3.3637, + "step": 117655 + }, + { + "epoch": 7.994292702812882, + "grad_norm": 0.8758895397186279, + "learning_rate": 1.0701182225845901e-06, + "loss": 3.235, + "step": 117660 + }, + { + "epoch": 7.994632422883544, + "grad_norm": 1.3600155115127563, + "learning_rate": 1.0276532137518685e-06, + "loss": 3.2492, + "step": 117665 + }, + { + "epoch": 7.9949721429542056, + "grad_norm": 1.0242259502410889, + "learning_rate": 9.851882049191467e-07, + "loss": 3.188, + "step": 117670 + }, + { + "epoch": 7.995311863024868, + "grad_norm": 0.7220985293388367, + "learning_rate": 9.427231960864248e-07, + "loss": 3.2295, + "step": 117675 + }, + { + "epoch": 7.99565158309553, + "grad_norm": 0.8266955018043518, + "learning_rate": 9.002581872537029e-07, + "loss": 3.1913, + "step": 117680 + }, + { + "epoch": 7.995991303166191, + "grad_norm": 0.8832756876945496, + "learning_rate": 8.577931784209812e-07, + "loss": 3.4606, + "step": 117685 + }, + { + "epoch": 7.996331023236853, + "grad_norm": 0.9660047292709351, + "learning_rate": 8.153281695882593e-07, + "loss": 3.4482, + "step": 117690 + }, + { + "epoch": 7.996670743307515, + "grad_norm": 0.6685022711753845, + "learning_rate": 7.728631607555374e-07, + "loss": 3.4469, + "step": 117695 + }, + { + "epoch": 7.997010463378176, + "grad_norm": 0.931858479976654, + "learning_rate": 7.303981519228156e-07, + "loss": 3.436, + "step": 117700 + }, + { + "epoch": 7.997350183448838, + "grad_norm": 0.6957717537879944, + "learning_rate": 6.879331430900938e-07, + "loss": 3.2059, + "step": 117705 + }, + { + "epoch": 7.9976899035195, + "grad_norm": 0.9909206628799438, + "learning_rate": 6.454681342573719e-07, + "loss": 3.2846, + "step": 117710 + }, + { + "epoch": 7.998029623590162, + "grad_norm": 0.92824786901474, + "learning_rate": 6.030031254246501e-07, + "loss": 3.3024, + "step": 117715 + }, + { + "epoch": 7.998369343660824, + "grad_norm": 1.0434094667434692, + "learning_rate": 5.605381165919283e-07, + "loss": 3.2656, + "step": 117720 + }, + { + "epoch": 7.998709063731486, + "grad_norm": 0.9917938113212585, + "learning_rate": 5.180731077592064e-07, + "loss": 3.4668, + "step": 117725 + }, + { + "epoch": 7.999048783802147, + "grad_norm": 1.1903223991394043, + "learning_rate": 4.7560809892648463e-07, + "loss": 3.5708, + "step": 117730 + }, + { + "epoch": 7.999388503872809, + "grad_norm": 1.0263484716415405, + "learning_rate": 4.3314309009376275e-07, + "loss": 3.2523, + "step": 117735 + }, + { + "epoch": 7.999728223943471, + "grad_norm": 0.9721146821975708, + "learning_rate": 3.906780812610409e-07, + "loss": 3.5283, + "step": 117740 + }, + { + "epoch": 8.0, + "eval_bertscore": { + "f1": 0.8403179171947398, + "precision": 0.8415759222338118, + "recall": 0.8398933725332103 + }, + "eval_bleu_4": 0.019936774468180853, + "eval_exact_match": 0.00048454307587944567, + "eval_loss": 3.384040117263794, + "eval_meteor": 0.08849174462183192, + "eval_rouge": { + "rouge1": 0.12682105956311823, + "rouge2": 0.019784283803210855, + "rougeL": 0.10996252730894909, + "rougeLsum": 0.11009915135779794 + }, + "eval_runtime": 1569.3433, + "eval_samples_per_second": 6.575, + "eval_steps_per_second": 0.822, + "step": 117744 + } + ], + "logging_steps": 5, + "max_steps": 117744, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.787048068020961e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}