diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,33252 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.998419971559487, + "eval_steps": 500, + "global_step": 47460, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002106704587349239, + "grad_norm": 1.0077738761901855, + "learning_rate": 0.00019999997809141818, + "loss": 2.2638, + "step": 10 + }, + { + "epoch": 0.004213409174698478, + "grad_norm": 0.7950726747512817, + "learning_rate": 0.00019999991236568227, + "loss": 2.0614, + "step": 20 + }, + { + "epoch": 0.006320113762047717, + "grad_norm": 0.8017520308494568, + "learning_rate": 0.0001999998028228211, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.008426818349396957, + "grad_norm": 0.7065390944480896, + "learning_rate": 0.00019999964946288266, + "loss": 2.0147, + "step": 40 + }, + { + "epoch": 0.010533522936746195, + "grad_norm": 0.7359225749969482, + "learning_rate": 0.00019999945228593418, + "loss": 2.0113, + "step": 50 + }, + { + "epoch": 0.012640227524095434, + "grad_norm": 0.7210670113563538, + "learning_rate": 0.000199999211292062, + "loss": 1.9811, + "step": 60 + }, + { + "epoch": 0.014746932111444672, + "grad_norm": 0.6569874286651611, + "learning_rate": 0.00019999892648137174, + "loss": 1.9632, + "step": 70 + }, + { + "epoch": 0.016853636698793913, + "grad_norm": 0.6518391966819763, + "learning_rate": 0.0001999985978539882, + "loss": 1.9901, + "step": 80 + }, + { + "epoch": 0.01896034128614315, + "grad_norm": 0.6628450155258179, + "learning_rate": 0.00019999822541005537, + "loss": 1.9529, + "step": 90 + }, + { + "epoch": 0.02106704587349239, + "grad_norm": 0.6857717037200928, + "learning_rate": 0.00019999780914973646, + "loss": 1.9697, + "step": 100 + }, + { + "epoch": 0.02317375046084163, + "grad_norm": 0.7159650325775146, + "learning_rate": 0.00019999734907321385, + "loss": 1.9493, + "step": 110 + }, + { + "epoch": 0.025280455048190868, + "grad_norm": 0.6563044786453247, + "learning_rate": 0.00019999684518068916, + "loss": 1.9791, + "step": 120 + }, + { + "epoch": 0.027387159635540108, + "grad_norm": 0.6270883083343506, + "learning_rate": 0.0001999962974723831, + "loss": 1.9267, + "step": 130 + }, + { + "epoch": 0.029493864222889344, + "grad_norm": 0.6255055069923401, + "learning_rate": 0.00019999570594853575, + "loss": 1.9642, + "step": 140 + }, + { + "epoch": 0.03160056881023859, + "grad_norm": 0.6271708011627197, + "learning_rate": 0.00019999507060940625, + "loss": 1.9391, + "step": 150 + }, + { + "epoch": 0.03370727339758783, + "grad_norm": 0.641325056552887, + "learning_rate": 0.00019999439145527303, + "loss": 1.9436, + "step": 160 + }, + { + "epoch": 0.03581397798493706, + "grad_norm": 0.6543128490447998, + "learning_rate": 0.00019999366848643364, + "loss": 1.9596, + "step": 170 + }, + { + "epoch": 0.0379206825722863, + "grad_norm": 0.634831964969635, + "learning_rate": 0.00019999290170320485, + "loss": 1.9341, + "step": 180 + }, + { + "epoch": 0.04002738715963554, + "grad_norm": 0.6254514455795288, + "learning_rate": 0.0001999920911059227, + "loss": 1.9737, + "step": 190 + }, + { + "epoch": 0.04213409174698478, + "grad_norm": 0.6236160397529602, + "learning_rate": 0.0001999912366949423, + "loss": 1.9373, + "step": 200 + }, + { + "epoch": 0.04424079633433402, + "grad_norm": 0.6445717215538025, + "learning_rate": 0.00019999033847063811, + "loss": 1.928, + "step": 210 + }, + { + "epoch": 0.04634750092168326, + "grad_norm": 0.6292468905448914, + "learning_rate": 0.00019998939643340365, + "loss": 1.9223, + "step": 220 + }, + { + "epoch": 0.0484542055090325, + "grad_norm": 0.6503478288650513, + "learning_rate": 0.00019998841058365167, + "loss": 1.9438, + "step": 230 + }, + { + "epoch": 0.050560910096381737, + "grad_norm": 0.6267730593681335, + "learning_rate": 0.00019998738092181421, + "loss": 1.9529, + "step": 240 + }, + { + "epoch": 0.052667614683730976, + "grad_norm": 0.6830586791038513, + "learning_rate": 0.00019998630744834243, + "loss": 1.9406, + "step": 250 + }, + { + "epoch": 0.054774319271080216, + "grad_norm": 0.6437792181968689, + "learning_rate": 0.0001999851901637066, + "loss": 1.985, + "step": 260 + }, + { + "epoch": 0.05688102385842945, + "grad_norm": 0.6661207675933838, + "learning_rate": 0.00019998402906839643, + "loss": 1.8874, + "step": 270 + }, + { + "epoch": 0.05898772844577869, + "grad_norm": 0.6596685647964478, + "learning_rate": 0.00019998282416292055, + "loss": 1.8807, + "step": 280 + }, + { + "epoch": 0.06109443303312793, + "grad_norm": 0.6325543522834778, + "learning_rate": 0.00019998157544780698, + "loss": 1.9462, + "step": 290 + }, + { + "epoch": 0.06320113762047717, + "grad_norm": 0.7503716349601746, + "learning_rate": 0.00019998028292360286, + "loss": 1.9079, + "step": 300 + }, + { + "epoch": 0.06530784220782641, + "grad_norm": 0.6907737255096436, + "learning_rate": 0.00019997894659087457, + "loss": 1.8995, + "step": 310 + }, + { + "epoch": 0.06741454679517565, + "grad_norm": 0.6293074488639832, + "learning_rate": 0.00019997756645020757, + "loss": 1.9193, + "step": 320 + }, + { + "epoch": 0.06952125138252488, + "grad_norm": 0.6826409101486206, + "learning_rate": 0.0001999761425022067, + "loss": 1.9169, + "step": 330 + }, + { + "epoch": 0.07162795596987412, + "grad_norm": 0.6254687309265137, + "learning_rate": 0.0001999746747474958, + "loss": 1.8822, + "step": 340 + }, + { + "epoch": 0.07373466055722336, + "grad_norm": 0.6506015658378601, + "learning_rate": 0.00019997316318671806, + "loss": 1.8803, + "step": 350 + }, + { + "epoch": 0.0758413651445726, + "grad_norm": 0.639499306678772, + "learning_rate": 0.00019997160782053578, + "loss": 1.8657, + "step": 360 + }, + { + "epoch": 0.07794806973192184, + "grad_norm": 0.603563666343689, + "learning_rate": 0.0001999700086496305, + "loss": 1.9138, + "step": 370 + }, + { + "epoch": 0.08005477431927108, + "grad_norm": 0.6491252183914185, + "learning_rate": 0.0001999683656747029, + "loss": 1.945, + "step": 380 + }, + { + "epoch": 0.08216147890662032, + "grad_norm": 0.6904931664466858, + "learning_rate": 0.00019996667889647288, + "loss": 1.9429, + "step": 390 + }, + { + "epoch": 0.08426818349396956, + "grad_norm": 0.6672365069389343, + "learning_rate": 0.00019996494831567958, + "loss": 1.8993, + "step": 400 + }, + { + "epoch": 0.0863748880813188, + "grad_norm": 0.635400116443634, + "learning_rate": 0.00019996317393308126, + "loss": 1.8849, + "step": 410 + }, + { + "epoch": 0.08848159266866804, + "grad_norm": 0.6338510513305664, + "learning_rate": 0.00019996135574945544, + "loss": 1.9232, + "step": 420 + }, + { + "epoch": 0.09058829725601728, + "grad_norm": 0.718330979347229, + "learning_rate": 0.00019995949376559874, + "loss": 1.9022, + "step": 430 + }, + { + "epoch": 0.09269500184336651, + "grad_norm": 0.6823340058326721, + "learning_rate": 0.00019995758798232704, + "loss": 1.9344, + "step": 440 + }, + { + "epoch": 0.09480170643071575, + "grad_norm": 0.6402202248573303, + "learning_rate": 0.00019995563840047542, + "loss": 1.9258, + "step": 450 + }, + { + "epoch": 0.096908411018065, + "grad_norm": 0.5626136660575867, + "learning_rate": 0.00019995364502089813, + "loss": 1.8337, + "step": 460 + }, + { + "epoch": 0.09901511560541423, + "grad_norm": 0.6894583702087402, + "learning_rate": 0.00019995160784446863, + "loss": 1.9246, + "step": 470 + }, + { + "epoch": 0.10112182019276347, + "grad_norm": 0.6594771146774292, + "learning_rate": 0.00019994952687207954, + "loss": 1.9241, + "step": 480 + }, + { + "epoch": 0.10322852478011271, + "grad_norm": 0.624462902545929, + "learning_rate": 0.00019994740210464268, + "loss": 1.9486, + "step": 490 + }, + { + "epoch": 0.10533522936746195, + "grad_norm": 0.6428165435791016, + "learning_rate": 0.00019994523354308904, + "loss": 1.8711, + "step": 500 + }, + { + "epoch": 0.10744193395481119, + "grad_norm": 0.626460075378418, + "learning_rate": 0.00019994302118836883, + "loss": 1.8453, + "step": 510 + }, + { + "epoch": 0.10954863854216043, + "grad_norm": 0.5991859436035156, + "learning_rate": 0.00019994076504145148, + "loss": 1.8691, + "step": 520 + }, + { + "epoch": 0.11165534312950967, + "grad_norm": 0.6435895562171936, + "learning_rate": 0.00019993846510332552, + "loss": 1.9216, + "step": 530 + }, + { + "epoch": 0.1137620477168589, + "grad_norm": 0.6111456751823425, + "learning_rate": 0.00019993612137499876, + "loss": 1.846, + "step": 540 + }, + { + "epoch": 0.11586875230420814, + "grad_norm": 0.6688899993896484, + "learning_rate": 0.0001999337338574981, + "loss": 1.8676, + "step": 550 + }, + { + "epoch": 0.11797545689155738, + "grad_norm": 0.6680476069450378, + "learning_rate": 0.00019993130255186977, + "loss": 1.8763, + "step": 560 + }, + { + "epoch": 0.12008216147890662, + "grad_norm": 0.6650174856185913, + "learning_rate": 0.00019992882745917902, + "loss": 1.8757, + "step": 570 + }, + { + "epoch": 0.12218886606625586, + "grad_norm": 0.6582191586494446, + "learning_rate": 0.0001999263085805104, + "loss": 1.921, + "step": 580 + }, + { + "epoch": 0.1242955706536051, + "grad_norm": 0.6733909249305725, + "learning_rate": 0.00019992374591696761, + "loss": 1.9209, + "step": 590 + }, + { + "epoch": 0.12640227524095435, + "grad_norm": 0.6196283102035522, + "learning_rate": 0.00019992113946967353, + "loss": 1.8838, + "step": 600 + }, + { + "epoch": 0.1285089798283036, + "grad_norm": 0.6418552398681641, + "learning_rate": 0.00019991848923977022, + "loss": 1.9044, + "step": 610 + }, + { + "epoch": 0.13061568441565283, + "grad_norm": 0.6484676599502563, + "learning_rate": 0.00019991579522841892, + "loss": 1.8783, + "step": 620 + }, + { + "epoch": 0.13272238900300207, + "grad_norm": 0.6386022567749023, + "learning_rate": 0.00019991305743680013, + "loss": 1.9412, + "step": 630 + }, + { + "epoch": 0.1348290935903513, + "grad_norm": 0.7087893486022949, + "learning_rate": 0.00019991027586611343, + "loss": 1.955, + "step": 640 + }, + { + "epoch": 0.13693579817770052, + "grad_norm": 0.6675140857696533, + "learning_rate": 0.00019990745051757765, + "loss": 1.9606, + "step": 650 + }, + { + "epoch": 0.13904250276504976, + "grad_norm": 0.6183488965034485, + "learning_rate": 0.00019990458139243077, + "loss": 1.8419, + "step": 660 + }, + { + "epoch": 0.141149207352399, + "grad_norm": 0.6724327206611633, + "learning_rate": 0.00019990166849192994, + "loss": 1.9343, + "step": 670 + }, + { + "epoch": 0.14325591193974824, + "grad_norm": 0.650532066822052, + "learning_rate": 0.00019989871181735148, + "loss": 1.8345, + "step": 680 + }, + { + "epoch": 0.14536261652709748, + "grad_norm": 0.608699381351471, + "learning_rate": 0.000199895711369991, + "loss": 1.9123, + "step": 690 + }, + { + "epoch": 0.14746932111444672, + "grad_norm": 0.6671069264411926, + "learning_rate": 0.00019989266715116316, + "loss": 1.8696, + "step": 700 + }, + { + "epoch": 0.14957602570179596, + "grad_norm": 0.619253396987915, + "learning_rate": 0.0001998895791622019, + "loss": 1.9529, + "step": 710 + }, + { + "epoch": 0.1516827302891452, + "grad_norm": 0.6708328127861023, + "learning_rate": 0.00019988644740446022, + "loss": 1.9158, + "step": 720 + }, + { + "epoch": 0.15378943487649444, + "grad_norm": 0.6542178392410278, + "learning_rate": 0.0001998832718793104, + "loss": 1.8742, + "step": 730 + }, + { + "epoch": 0.15589613946384367, + "grad_norm": 0.6505009531974792, + "learning_rate": 0.00019988005258814387, + "loss": 1.8314, + "step": 740 + }, + { + "epoch": 0.15800284405119291, + "grad_norm": 0.6684021353721619, + "learning_rate": 0.00019987678953237127, + "loss": 1.8651, + "step": 750 + }, + { + "epoch": 0.16010954863854215, + "grad_norm": 0.6631053686141968, + "learning_rate": 0.00019987348271342228, + "loss": 1.8857, + "step": 760 + }, + { + "epoch": 0.1622162532258914, + "grad_norm": 0.7295246124267578, + "learning_rate": 0.00019987013213274593, + "loss": 1.9163, + "step": 770 + }, + { + "epoch": 0.16432295781324063, + "grad_norm": 0.6868222951889038, + "learning_rate": 0.00019986673779181033, + "loss": 1.8221, + "step": 780 + }, + { + "epoch": 0.16642966240058987, + "grad_norm": 0.6467426419258118, + "learning_rate": 0.00019986329969210278, + "loss": 1.8954, + "step": 790 + }, + { + "epoch": 0.1685363669879391, + "grad_norm": 0.6961290240287781, + "learning_rate": 0.00019985981783512976, + "loss": 1.9336, + "step": 800 + }, + { + "epoch": 0.17064307157528835, + "grad_norm": 0.6626701951026917, + "learning_rate": 0.00019985629222241694, + "loss": 1.8577, + "step": 810 + }, + { + "epoch": 0.1727497761626376, + "grad_norm": 0.6900150775909424, + "learning_rate": 0.0001998527228555091, + "loss": 1.8339, + "step": 820 + }, + { + "epoch": 0.17485648074998683, + "grad_norm": 0.6740383505821228, + "learning_rate": 0.00019984910973597027, + "loss": 1.8174, + "step": 830 + }, + { + "epoch": 0.17696318533733607, + "grad_norm": 0.6543059945106506, + "learning_rate": 0.0001998454528653836, + "loss": 1.8637, + "step": 840 + }, + { + "epoch": 0.1790698899246853, + "grad_norm": 0.652992308139801, + "learning_rate": 0.00019984175224535146, + "loss": 1.888, + "step": 850 + }, + { + "epoch": 0.18117659451203455, + "grad_norm": 0.6753103137016296, + "learning_rate": 0.00019983800787749532, + "loss": 1.9282, + "step": 860 + }, + { + "epoch": 0.1832832990993838, + "grad_norm": 0.6452760696411133, + "learning_rate": 0.00019983421976345586, + "loss": 1.846, + "step": 870 + }, + { + "epoch": 0.18539000368673303, + "grad_norm": 0.7148582339286804, + "learning_rate": 0.00019983038790489296, + "loss": 1.8172, + "step": 880 + }, + { + "epoch": 0.18749670827408227, + "grad_norm": 0.644650399684906, + "learning_rate": 0.00019982651230348556, + "loss": 1.8717, + "step": 890 + }, + { + "epoch": 0.1896034128614315, + "grad_norm": 0.6961220502853394, + "learning_rate": 0.0001998225929609319, + "loss": 1.85, + "step": 900 + }, + { + "epoch": 0.19171011744878075, + "grad_norm": 0.5916398763656616, + "learning_rate": 0.00019981862987894934, + "loss": 1.8181, + "step": 910 + }, + { + "epoch": 0.19381682203613, + "grad_norm": 0.697644829750061, + "learning_rate": 0.0001998146230592743, + "loss": 1.8774, + "step": 920 + }, + { + "epoch": 0.19592352662347923, + "grad_norm": 0.6732763648033142, + "learning_rate": 0.00019981057250366253, + "loss": 1.8786, + "step": 930 + }, + { + "epoch": 0.19803023121082847, + "grad_norm": 0.6978752613067627, + "learning_rate": 0.00019980647821388886, + "loss": 1.8784, + "step": 940 + }, + { + "epoch": 0.2001369357981777, + "grad_norm": 0.7141478657722473, + "learning_rate": 0.00019980234019174729, + "loss": 1.847, + "step": 950 + }, + { + "epoch": 0.20224364038552695, + "grad_norm": 0.6787569522857666, + "learning_rate": 0.00019979815843905097, + "loss": 1.8903, + "step": 960 + }, + { + "epoch": 0.20435034497287619, + "grad_norm": 0.6798018217086792, + "learning_rate": 0.0001997939329576322, + "loss": 1.9078, + "step": 970 + }, + { + "epoch": 0.20645704956022543, + "grad_norm": 0.6333872675895691, + "learning_rate": 0.00019978966374934254, + "loss": 1.8625, + "step": 980 + }, + { + "epoch": 0.20856375414757466, + "grad_norm": 0.706005871295929, + "learning_rate": 0.0001997853508160526, + "loss": 1.8134, + "step": 990 + }, + { + "epoch": 0.2106704587349239, + "grad_norm": 0.6944781541824341, + "learning_rate": 0.00019978099415965213, + "loss": 1.8938, + "step": 1000 + }, + { + "epoch": 0.21277716332227314, + "grad_norm": 0.7021796107292175, + "learning_rate": 0.00019977659378205015, + "loss": 1.8474, + "step": 1010 + }, + { + "epoch": 0.21488386790962238, + "grad_norm": 0.6823903918266296, + "learning_rate": 0.0001997721496851748, + "loss": 1.883, + "step": 1020 + }, + { + "epoch": 0.21699057249697162, + "grad_norm": 0.6894837617874146, + "learning_rate": 0.0001997676618709733, + "loss": 1.8624, + "step": 1030 + }, + { + "epoch": 0.21909727708432086, + "grad_norm": 0.7047960758209229, + "learning_rate": 0.00019976313034141213, + "loss": 1.8421, + "step": 1040 + }, + { + "epoch": 0.2212039816716701, + "grad_norm": 0.6393168568611145, + "learning_rate": 0.00019975855509847686, + "loss": 1.8069, + "step": 1050 + }, + { + "epoch": 0.22331068625901934, + "grad_norm": 0.6958315372467041, + "learning_rate": 0.00019975393614417224, + "loss": 1.8484, + "step": 1060 + }, + { + "epoch": 0.22541739084636858, + "grad_norm": 0.716953694820404, + "learning_rate": 0.00019974927348052215, + "loss": 1.8619, + "step": 1070 + }, + { + "epoch": 0.2275240954337178, + "grad_norm": 0.7667638659477234, + "learning_rate": 0.00019974456710956964, + "loss": 1.8564, + "step": 1080 + }, + { + "epoch": 0.22963080002106703, + "grad_norm": 0.6643611192703247, + "learning_rate": 0.0001997398170333769, + "loss": 1.8949, + "step": 1090 + }, + { + "epoch": 0.23173750460841627, + "grad_norm": 0.7317732572555542, + "learning_rate": 0.00019973502325402532, + "loss": 1.8381, + "step": 1100 + }, + { + "epoch": 0.2338442091957655, + "grad_norm": 0.7321386933326721, + "learning_rate": 0.00019973018577361536, + "loss": 1.8545, + "step": 1110 + }, + { + "epoch": 0.23595091378311475, + "grad_norm": 0.6243638396263123, + "learning_rate": 0.00019972530459426663, + "loss": 1.8488, + "step": 1120 + }, + { + "epoch": 0.238057618370464, + "grad_norm": 0.6985125541687012, + "learning_rate": 0.00019972037971811802, + "loss": 1.8695, + "step": 1130 + }, + { + "epoch": 0.24016432295781323, + "grad_norm": 0.6656931042671204, + "learning_rate": 0.00019971541114732741, + "loss": 1.8663, + "step": 1140 + }, + { + "epoch": 0.24227102754516247, + "grad_norm": 0.6580804586410522, + "learning_rate": 0.00019971039888407187, + "loss": 1.8632, + "step": 1150 + }, + { + "epoch": 0.2443777321325117, + "grad_norm": 0.6782211661338806, + "learning_rate": 0.0001997053429305477, + "loss": 1.876, + "step": 1160 + }, + { + "epoch": 0.24648443671986095, + "grad_norm": 0.6574906706809998, + "learning_rate": 0.00019970024328897022, + "loss": 1.8532, + "step": 1170 + }, + { + "epoch": 0.2485911413072102, + "grad_norm": 0.7327693104743958, + "learning_rate": 0.00019969509996157396, + "loss": 1.8929, + "step": 1180 + }, + { + "epoch": 0.25069784589455946, + "grad_norm": 0.7190077900886536, + "learning_rate": 0.0001996899129506126, + "loss": 1.9107, + "step": 1190 + }, + { + "epoch": 0.2528045504819087, + "grad_norm": 0.7218247652053833, + "learning_rate": 0.0001996846822583589, + "loss": 1.9151, + "step": 1200 + }, + { + "epoch": 0.25491125506925794, + "grad_norm": 0.7065020799636841, + "learning_rate": 0.0001996794078871048, + "loss": 1.8707, + "step": 1210 + }, + { + "epoch": 0.2570179596566072, + "grad_norm": 0.688412606716156, + "learning_rate": 0.00019967408983916145, + "loss": 1.8346, + "step": 1220 + }, + { + "epoch": 0.2591246642439564, + "grad_norm": 0.6871249079704285, + "learning_rate": 0.000199668728116859, + "loss": 1.825, + "step": 1230 + }, + { + "epoch": 0.26123136883130565, + "grad_norm": 0.6805330514907837, + "learning_rate": 0.00019966332272254684, + "loss": 1.8608, + "step": 1240 + }, + { + "epoch": 0.2633380734186549, + "grad_norm": 0.6569370627403259, + "learning_rate": 0.00019965787365859344, + "loss": 1.8489, + "step": 1250 + }, + { + "epoch": 0.26544477800600413, + "grad_norm": 0.6645459532737732, + "learning_rate": 0.00019965238092738643, + "loss": 1.8218, + "step": 1260 + }, + { + "epoch": 0.2675514825933534, + "grad_norm": 0.6701253056526184, + "learning_rate": 0.0001996468445313326, + "loss": 1.8862, + "step": 1270 + }, + { + "epoch": 0.2696581871807026, + "grad_norm": 0.7218180298805237, + "learning_rate": 0.00019964126447285778, + "loss": 1.9143, + "step": 1280 + }, + { + "epoch": 0.2717648917680518, + "grad_norm": 0.8006751537322998, + "learning_rate": 0.00019963564075440703, + "loss": 1.8406, + "step": 1290 + }, + { + "epoch": 0.27387159635540104, + "grad_norm": 0.7272390127182007, + "learning_rate": 0.00019962997337844452, + "loss": 1.8889, + "step": 1300 + }, + { + "epoch": 0.2759783009427503, + "grad_norm": 0.6612052321434021, + "learning_rate": 0.0001996242623474535, + "loss": 1.8533, + "step": 1310 + }, + { + "epoch": 0.2780850055300995, + "grad_norm": 0.7371575236320496, + "learning_rate": 0.0001996185076639364, + "loss": 1.8072, + "step": 1320 + }, + { + "epoch": 0.28019171011744876, + "grad_norm": 0.6426770687103271, + "learning_rate": 0.00019961270933041477, + "loss": 1.8731, + "step": 1330 + }, + { + "epoch": 0.282298414704798, + "grad_norm": 0.6850872039794922, + "learning_rate": 0.00019960686734942922, + "loss": 1.9205, + "step": 1340 + }, + { + "epoch": 0.28440511929214723, + "grad_norm": 0.7293111681938171, + "learning_rate": 0.00019960098172353962, + "loss": 1.8028, + "step": 1350 + }, + { + "epoch": 0.2865118238794965, + "grad_norm": 0.6498401761054993, + "learning_rate": 0.0001995950524553248, + "loss": 1.7996, + "step": 1360 + }, + { + "epoch": 0.2886185284668457, + "grad_norm": 0.7127597332000732, + "learning_rate": 0.00019958907954738288, + "loss": 1.8245, + "step": 1370 + }, + { + "epoch": 0.29072523305419495, + "grad_norm": 0.6359906196594238, + "learning_rate": 0.00019958306300233098, + "loss": 1.8264, + "step": 1380 + }, + { + "epoch": 0.2928319376415442, + "grad_norm": 0.7264698147773743, + "learning_rate": 0.00019957700282280537, + "loss": 1.8531, + "step": 1390 + }, + { + "epoch": 0.29493864222889343, + "grad_norm": 0.6813586354255676, + "learning_rate": 0.00019957089901146148, + "loss": 1.8633, + "step": 1400 + }, + { + "epoch": 0.2970453468162427, + "grad_norm": 0.6944628357887268, + "learning_rate": 0.00019956475157097378, + "loss": 1.8383, + "step": 1410 + }, + { + "epoch": 0.2991520514035919, + "grad_norm": 0.6405615210533142, + "learning_rate": 0.00019955856050403594, + "loss": 1.8386, + "step": 1420 + }, + { + "epoch": 0.30125875599094115, + "grad_norm": 0.6815128326416016, + "learning_rate": 0.0001995523258133607, + "loss": 1.92, + "step": 1430 + }, + { + "epoch": 0.3033654605782904, + "grad_norm": 0.6566641330718994, + "learning_rate": 0.00019954604750167993, + "loss": 1.898, + "step": 1440 + }, + { + "epoch": 0.30547216516563963, + "grad_norm": 0.7111663222312927, + "learning_rate": 0.0001995397255717446, + "loss": 1.8138, + "step": 1450 + }, + { + "epoch": 0.30757886975298887, + "grad_norm": 0.7194084525108337, + "learning_rate": 0.0001995333600263248, + "loss": 1.8619, + "step": 1460 + }, + { + "epoch": 0.3096855743403381, + "grad_norm": 0.7005071640014648, + "learning_rate": 0.00019952695086820975, + "loss": 1.8261, + "step": 1470 + }, + { + "epoch": 0.31179227892768735, + "grad_norm": 0.7344840168952942, + "learning_rate": 0.00019952049810020771, + "loss": 1.8191, + "step": 1480 + }, + { + "epoch": 0.3138989835150366, + "grad_norm": 0.692154049873352, + "learning_rate": 0.00019951400172514618, + "loss": 1.8496, + "step": 1490 + }, + { + "epoch": 0.31600568810238583, + "grad_norm": 0.7312874794006348, + "learning_rate": 0.00019950746174587163, + "loss": 1.8621, + "step": 1500 + }, + { + "epoch": 0.31811239268973507, + "grad_norm": 0.7678685188293457, + "learning_rate": 0.0001995008781652497, + "loss": 1.8418, + "step": 1510 + }, + { + "epoch": 0.3202190972770843, + "grad_norm": 0.7713690400123596, + "learning_rate": 0.00019949425098616513, + "loss": 1.853, + "step": 1520 + }, + { + "epoch": 0.32232580186443355, + "grad_norm": 0.7388377785682678, + "learning_rate": 0.0001994875802115218, + "loss": 1.8708, + "step": 1530 + }, + { + "epoch": 0.3244325064517828, + "grad_norm": 0.6781003475189209, + "learning_rate": 0.00019948086584424256, + "loss": 1.8333, + "step": 1540 + }, + { + "epoch": 0.326539211039132, + "grad_norm": 0.7277612686157227, + "learning_rate": 0.00019947410788726956, + "loss": 1.7997, + "step": 1550 + }, + { + "epoch": 0.32864591562648127, + "grad_norm": 0.7328895926475525, + "learning_rate": 0.0001994673063435639, + "loss": 1.8654, + "step": 1560 + }, + { + "epoch": 0.3307526202138305, + "grad_norm": 0.779045820236206, + "learning_rate": 0.00019946046121610583, + "loss": 1.8928, + "step": 1570 + }, + { + "epoch": 0.33285932480117975, + "grad_norm": 0.7018063068389893, + "learning_rate": 0.00019945357250789468, + "loss": 1.8213, + "step": 1580 + }, + { + "epoch": 0.334966029388529, + "grad_norm": 0.6804325580596924, + "learning_rate": 0.00019944664022194885, + "loss": 1.8256, + "step": 1590 + }, + { + "epoch": 0.3370727339758782, + "grad_norm": 0.6910491585731506, + "learning_rate": 0.00019943966436130597, + "loss": 1.8203, + "step": 1600 + }, + { + "epoch": 0.33917943856322746, + "grad_norm": 0.6203540563583374, + "learning_rate": 0.00019943264492902258, + "loss": 1.8504, + "step": 1610 + }, + { + "epoch": 0.3412861431505767, + "grad_norm": 0.6889249682426453, + "learning_rate": 0.0001994255819281744, + "loss": 1.8939, + "step": 1620 + }, + { + "epoch": 0.34339284773792594, + "grad_norm": 0.7516829967498779, + "learning_rate": 0.00019941847536185633, + "loss": 1.8601, + "step": 1630 + }, + { + "epoch": 0.3454995523252752, + "grad_norm": 0.6868574619293213, + "learning_rate": 0.00019941132523318216, + "loss": 1.8381, + "step": 1640 + }, + { + "epoch": 0.3476062569126244, + "grad_norm": 0.7086262106895447, + "learning_rate": 0.0001994041315452849, + "loss": 1.8145, + "step": 1650 + }, + { + "epoch": 0.34971296149997366, + "grad_norm": 0.7216698527336121, + "learning_rate": 0.00019939689430131666, + "loss": 1.8728, + "step": 1660 + }, + { + "epoch": 0.3518196660873229, + "grad_norm": 0.6712501645088196, + "learning_rate": 0.00019938961350444854, + "loss": 1.8133, + "step": 1670 + }, + { + "epoch": 0.35392637067467214, + "grad_norm": 0.64311683177948, + "learning_rate": 0.0001993822891578708, + "loss": 1.8468, + "step": 1680 + }, + { + "epoch": 0.3560330752620214, + "grad_norm": 0.7252877354621887, + "learning_rate": 0.00019937492126479277, + "loss": 1.8592, + "step": 1690 + }, + { + "epoch": 0.3581397798493706, + "grad_norm": 0.6842496991157532, + "learning_rate": 0.00019936750982844283, + "loss": 1.8267, + "step": 1700 + }, + { + "epoch": 0.36024648443671986, + "grad_norm": 0.7731478214263916, + "learning_rate": 0.00019936005485206851, + "loss": 1.8334, + "step": 1710 + }, + { + "epoch": 0.3623531890240691, + "grad_norm": 2.383643865585327, + "learning_rate": 0.00019935255633893632, + "loss": 1.7964, + "step": 1720 + }, + { + "epoch": 0.36445989361141834, + "grad_norm": 0.7606905698776245, + "learning_rate": 0.0001993450142923319, + "loss": 1.7642, + "step": 1730 + }, + { + "epoch": 0.3665665981987676, + "grad_norm": 0.748777449131012, + "learning_rate": 0.00019933742871556, + "loss": 1.8177, + "step": 1740 + }, + { + "epoch": 0.3686733027861168, + "grad_norm": 0.6804649829864502, + "learning_rate": 0.00019932979961194435, + "loss": 1.7606, + "step": 1750 + }, + { + "epoch": 0.37078000737346606, + "grad_norm": 0.6883798837661743, + "learning_rate": 0.00019932212698482786, + "loss": 1.8529, + "step": 1760 + }, + { + "epoch": 0.3728867119608153, + "grad_norm": 0.6991207003593445, + "learning_rate": 0.00019931441083757245, + "loss": 1.842, + "step": 1770 + }, + { + "epoch": 0.37499341654816454, + "grad_norm": 0.8275269269943237, + "learning_rate": 0.00019930665117355906, + "loss": 1.7748, + "step": 1780 + }, + { + "epoch": 0.3771001211355138, + "grad_norm": 0.740403950214386, + "learning_rate": 0.0001992988479961878, + "loss": 1.8512, + "step": 1790 + }, + { + "epoch": 0.379206825722863, + "grad_norm": 0.7021561861038208, + "learning_rate": 0.00019929100130887782, + "loss": 1.7875, + "step": 1800 + }, + { + "epoch": 0.38131353031021226, + "grad_norm": 0.6865299344062805, + "learning_rate": 0.00019928311111506726, + "loss": 1.7992, + "step": 1810 + }, + { + "epoch": 0.3834202348975615, + "grad_norm": 0.7252370119094849, + "learning_rate": 0.00019927517741821343, + "loss": 1.8761, + "step": 1820 + }, + { + "epoch": 0.38552693948491074, + "grad_norm": 0.7330300211906433, + "learning_rate": 0.0001992672002217926, + "loss": 1.8956, + "step": 1830 + }, + { + "epoch": 0.38763364407226, + "grad_norm": 0.6942179203033447, + "learning_rate": 0.00019925917952930022, + "loss": 1.8465, + "step": 1840 + }, + { + "epoch": 0.3897403486596092, + "grad_norm": 0.7793520092964172, + "learning_rate": 0.00019925111534425068, + "loss": 1.7717, + "step": 1850 + }, + { + "epoch": 0.39184705324695845, + "grad_norm": 0.7142687439918518, + "learning_rate": 0.0001992430076701775, + "loss": 1.7862, + "step": 1860 + }, + { + "epoch": 0.3939537578343077, + "grad_norm": 0.6944627165794373, + "learning_rate": 0.00019923485651063318, + "loss": 1.8167, + "step": 1870 + }, + { + "epoch": 0.39606046242165693, + "grad_norm": 0.7327067852020264, + "learning_rate": 0.0001992266618691894, + "loss": 1.8431, + "step": 1880 + }, + { + "epoch": 0.3981671670090062, + "grad_norm": 0.7131524682044983, + "learning_rate": 0.0001992184237494368, + "loss": 1.8566, + "step": 1890 + }, + { + "epoch": 0.4002738715963554, + "grad_norm": 0.7777301073074341, + "learning_rate": 0.00019921014215498506, + "loss": 1.8612, + "step": 1900 + }, + { + "epoch": 0.40238057618370465, + "grad_norm": 0.684638500213623, + "learning_rate": 0.00019920181708946296, + "loss": 1.8838, + "step": 1910 + }, + { + "epoch": 0.4044872807710539, + "grad_norm": 0.7215536236763, + "learning_rate": 0.00019919344855651833, + "loss": 1.8362, + "step": 1920 + }, + { + "epoch": 0.40659398535840313, + "grad_norm": 0.7496854066848755, + "learning_rate": 0.00019918503655981797, + "loss": 1.8092, + "step": 1930 + }, + { + "epoch": 0.40870068994575237, + "grad_norm": 0.7422535419464111, + "learning_rate": 0.00019917658110304783, + "loss": 1.8325, + "step": 1940 + }, + { + "epoch": 0.4108073945331016, + "grad_norm": 0.7733063101768494, + "learning_rate": 0.0001991680821899128, + "loss": 1.7758, + "step": 1950 + }, + { + "epoch": 0.41291409912045085, + "grad_norm": 0.662631094455719, + "learning_rate": 0.0001991595398241369, + "loss": 1.7993, + "step": 1960 + }, + { + "epoch": 0.4150208037078001, + "grad_norm": 0.7115535140037537, + "learning_rate": 0.00019915095400946319, + "loss": 1.8554, + "step": 1970 + }, + { + "epoch": 0.41712750829514933, + "grad_norm": 0.6702825427055359, + "learning_rate": 0.00019914232474965365, + "loss": 1.808, + "step": 1980 + }, + { + "epoch": 0.41923421288249857, + "grad_norm": 0.6962928771972656, + "learning_rate": 0.00019913365204848939, + "loss": 1.7712, + "step": 1990 + }, + { + "epoch": 0.4213409174698478, + "grad_norm": 0.7009411454200745, + "learning_rate": 0.0001991249359097706, + "loss": 1.8069, + "step": 2000 + }, + { + "epoch": 0.42344762205719705, + "grad_norm": 0.7244513630867004, + "learning_rate": 0.00019911617633731638, + "loss": 1.8486, + "step": 2010 + }, + { + "epoch": 0.4255543266445463, + "grad_norm": 0.7286465764045715, + "learning_rate": 0.00019910737333496498, + "loss": 1.7949, + "step": 2020 + }, + { + "epoch": 0.4276610312318955, + "grad_norm": 0.686680018901825, + "learning_rate": 0.00019909852690657359, + "loss": 1.811, + "step": 2030 + }, + { + "epoch": 0.42976773581924477, + "grad_norm": 0.7796157002449036, + "learning_rate": 0.00019908963705601846, + "loss": 1.8168, + "step": 2040 + }, + { + "epoch": 0.431874440406594, + "grad_norm": 0.6823425889015198, + "learning_rate": 0.00019908070378719492, + "loss": 1.7983, + "step": 2050 + }, + { + "epoch": 0.43398114499394325, + "grad_norm": 0.7402235269546509, + "learning_rate": 0.00019907172710401723, + "loss": 1.8272, + "step": 2060 + }, + { + "epoch": 0.4360878495812925, + "grad_norm": 0.7159191370010376, + "learning_rate": 0.0001990627070104187, + "loss": 1.8816, + "step": 2070 + }, + { + "epoch": 0.4381945541686417, + "grad_norm": 0.7955963015556335, + "learning_rate": 0.00019905364351035173, + "loss": 1.8434, + "step": 2080 + }, + { + "epoch": 0.44030125875599097, + "grad_norm": 0.6838492751121521, + "learning_rate": 0.0001990445366077877, + "loss": 1.8187, + "step": 2090 + }, + { + "epoch": 0.4424079633433402, + "grad_norm": 0.7640077471733093, + "learning_rate": 0.0001990353863067169, + "loss": 1.8395, + "step": 2100 + }, + { + "epoch": 0.44451466793068944, + "grad_norm": 0.8374277353286743, + "learning_rate": 0.0001990261926111488, + "loss": 1.813, + "step": 2110 + }, + { + "epoch": 0.4466213725180387, + "grad_norm": 0.7484899759292603, + "learning_rate": 0.00019901695552511183, + "loss": 1.8341, + "step": 2120 + }, + { + "epoch": 0.4487280771053879, + "grad_norm": 0.7072715759277344, + "learning_rate": 0.0001990076750526534, + "loss": 1.8297, + "step": 2130 + }, + { + "epoch": 0.45083478169273716, + "grad_norm": 0.6923781633377075, + "learning_rate": 0.00019899835119783997, + "loss": 1.8126, + "step": 2140 + }, + { + "epoch": 0.45294148628008635, + "grad_norm": 0.8046467304229736, + "learning_rate": 0.00019898898396475694, + "loss": 1.8306, + "step": 2150 + }, + { + "epoch": 0.4550481908674356, + "grad_norm": 0.7048581838607788, + "learning_rate": 0.00019897957335750878, + "loss": 1.8026, + "step": 2160 + }, + { + "epoch": 0.4571548954547848, + "grad_norm": 0.6879782676696777, + "learning_rate": 0.000198970119380219, + "loss": 1.8211, + "step": 2170 + }, + { + "epoch": 0.45926160004213407, + "grad_norm": 0.7135127782821655, + "learning_rate": 0.00019896062203703002, + "loss": 1.8329, + "step": 2180 + }, + { + "epoch": 0.4613683046294833, + "grad_norm": 0.7031205892562866, + "learning_rate": 0.00019895108133210335, + "loss": 1.7945, + "step": 2190 + }, + { + "epoch": 0.46347500921683255, + "grad_norm": 0.7131096720695496, + "learning_rate": 0.00019894149726961937, + "loss": 1.8469, + "step": 2200 + }, + { + "epoch": 0.4655817138041818, + "grad_norm": 0.7171880006790161, + "learning_rate": 0.00019893186985377761, + "loss": 1.8642, + "step": 2210 + }, + { + "epoch": 0.467688418391531, + "grad_norm": 0.8296772241592407, + "learning_rate": 0.00019892219908879653, + "loss": 1.8306, + "step": 2220 + }, + { + "epoch": 0.46979512297888026, + "grad_norm": 0.7418431639671326, + "learning_rate": 0.0001989124849789136, + "loss": 1.8617, + "step": 2230 + }, + { + "epoch": 0.4719018275662295, + "grad_norm": 0.6901628971099854, + "learning_rate": 0.00019890272752838518, + "loss": 1.7787, + "step": 2240 + }, + { + "epoch": 0.47400853215357874, + "grad_norm": 0.7321175336837769, + "learning_rate": 0.00019889292674148682, + "loss": 1.7973, + "step": 2250 + }, + { + "epoch": 0.476115236740928, + "grad_norm": 0.7562801837921143, + "learning_rate": 0.00019888308262251285, + "loss": 1.908, + "step": 2260 + }, + { + "epoch": 0.4782219413282772, + "grad_norm": 0.7009602785110474, + "learning_rate": 0.0001988731951757768, + "loss": 1.8732, + "step": 2270 + }, + { + "epoch": 0.48032864591562646, + "grad_norm": 0.7295694947242737, + "learning_rate": 0.00019886326440561093, + "loss": 1.875, + "step": 2280 + }, + { + "epoch": 0.4824353505029757, + "grad_norm": 0.7495467662811279, + "learning_rate": 0.0001988532903163667, + "loss": 1.8208, + "step": 2290 + }, + { + "epoch": 0.48454205509032494, + "grad_norm": 0.7268190979957581, + "learning_rate": 0.00019884327291241446, + "loss": 1.8246, + "step": 2300 + }, + { + "epoch": 0.4866487596776742, + "grad_norm": 0.671644926071167, + "learning_rate": 0.0001988332121981436, + "loss": 1.8348, + "step": 2310 + }, + { + "epoch": 0.4887554642650234, + "grad_norm": 0.7536865472793579, + "learning_rate": 0.00019882310817796235, + "loss": 1.7981, + "step": 2320 + }, + { + "epoch": 0.49086216885237266, + "grad_norm": 0.8593438863754272, + "learning_rate": 0.00019881296085629807, + "loss": 1.8558, + "step": 2330 + }, + { + "epoch": 0.4929688734397219, + "grad_norm": 0.7384650707244873, + "learning_rate": 0.00019880277023759702, + "loss": 1.8659, + "step": 2340 + }, + { + "epoch": 0.49507557802707114, + "grad_norm": 0.7357537150382996, + "learning_rate": 0.0001987925363263244, + "loss": 1.7887, + "step": 2350 + }, + { + "epoch": 0.4971822826144204, + "grad_norm": 0.7231765389442444, + "learning_rate": 0.00019878225912696446, + "loss": 1.8046, + "step": 2360 + }, + { + "epoch": 0.4992889872017696, + "grad_norm": 0.7143809199333191, + "learning_rate": 0.00019877193864402038, + "loss": 1.8156, + "step": 2370 + }, + { + "epoch": 0.5013956917891189, + "grad_norm": 0.7593514323234558, + "learning_rate": 0.00019876157488201424, + "loss": 1.8299, + "step": 2380 + }, + { + "epoch": 0.5035023963764681, + "grad_norm": 0.7489436864852905, + "learning_rate": 0.00019875116784548723, + "loss": 1.7569, + "step": 2390 + }, + { + "epoch": 0.5056091009638174, + "grad_norm": 0.7284204363822937, + "learning_rate": 0.0001987407175389994, + "loss": 1.8094, + "step": 2400 + }, + { + "epoch": 0.5077158055511666, + "grad_norm": 0.7947770357131958, + "learning_rate": 0.00019873022396712972, + "loss": 1.8225, + "step": 2410 + }, + { + "epoch": 0.5098225101385159, + "grad_norm": 0.6892939209938049, + "learning_rate": 0.00019871968713447625, + "loss": 1.7943, + "step": 2420 + }, + { + "epoch": 0.5119292147258651, + "grad_norm": 0.7589520215988159, + "learning_rate": 0.00019870910704565588, + "loss": 1.774, + "step": 2430 + }, + { + "epoch": 0.5140359193132144, + "grad_norm": 0.7198192477226257, + "learning_rate": 0.00019869848370530452, + "loss": 1.7809, + "step": 2440 + }, + { + "epoch": 0.5161426239005635, + "grad_norm": 0.7147354483604431, + "learning_rate": 0.00019868781711807705, + "loss": 1.8524, + "step": 2450 + }, + { + "epoch": 0.5182493284879128, + "grad_norm": 0.8181036710739136, + "learning_rate": 0.0001986771072886472, + "loss": 1.8625, + "step": 2460 + }, + { + "epoch": 0.520356033075262, + "grad_norm": 0.7021554112434387, + "learning_rate": 0.00019866635422170775, + "loss": 1.8081, + "step": 2470 + }, + { + "epoch": 0.5224627376626113, + "grad_norm": 0.7589981555938721, + "learning_rate": 0.00019865555792197042, + "loss": 1.8463, + "step": 2480 + }, + { + "epoch": 0.5245694422499605, + "grad_norm": 0.734856128692627, + "learning_rate": 0.00019864471839416576, + "loss": 1.852, + "step": 2490 + }, + { + "epoch": 0.5266761468373098, + "grad_norm": 0.7465302348136902, + "learning_rate": 0.00019863383564304346, + "loss": 1.809, + "step": 2500 + }, + { + "epoch": 0.528782851424659, + "grad_norm": 0.7393603324890137, + "learning_rate": 0.00019862290967337192, + "loss": 1.842, + "step": 2510 + }, + { + "epoch": 0.5308895560120083, + "grad_norm": 0.7171728610992432, + "learning_rate": 0.00019861194048993863, + "loss": 1.7747, + "step": 2520 + }, + { + "epoch": 0.5329962605993575, + "grad_norm": 0.7123204469680786, + "learning_rate": 0.00019860092809755, + "loss": 1.8511, + "step": 2530 + }, + { + "epoch": 0.5351029651867067, + "grad_norm": 0.8142136931419373, + "learning_rate": 0.00019858987250103132, + "loss": 1.8167, + "step": 2540 + }, + { + "epoch": 0.5372096697740559, + "grad_norm": 0.7203823924064636, + "learning_rate": 0.00019857877370522685, + "loss": 1.8407, + "step": 2550 + }, + { + "epoch": 0.5393163743614052, + "grad_norm": 0.7605892419815063, + "learning_rate": 0.00019856763171499978, + "loss": 1.8308, + "step": 2560 + }, + { + "epoch": 0.5414230789487544, + "grad_norm": 0.735731840133667, + "learning_rate": 0.00019855644653523217, + "loss": 1.808, + "step": 2570 + }, + { + "epoch": 0.5435297835361036, + "grad_norm": 0.7104590535163879, + "learning_rate": 0.0001985452181708251, + "loss": 1.7968, + "step": 2580 + }, + { + "epoch": 0.5456364881234529, + "grad_norm": 0.734384298324585, + "learning_rate": 0.00019853394662669847, + "loss": 1.8222, + "step": 2590 + }, + { + "epoch": 0.5477431927108021, + "grad_norm": 0.7150630950927734, + "learning_rate": 0.00019852263190779122, + "loss": 1.8305, + "step": 2600 + }, + { + "epoch": 0.5498498972981514, + "grad_norm": 0.7069328427314758, + "learning_rate": 0.0001985112740190611, + "loss": 1.8839, + "step": 2610 + }, + { + "epoch": 0.5519566018855006, + "grad_norm": 0.8343731164932251, + "learning_rate": 0.00019849987296548477, + "loss": 1.8147, + "step": 2620 + }, + { + "epoch": 0.5540633064728498, + "grad_norm": 0.74041748046875, + "learning_rate": 0.00019848842875205792, + "loss": 1.8146, + "step": 2630 + }, + { + "epoch": 0.556170011060199, + "grad_norm": 0.7421287298202515, + "learning_rate": 0.00019847694138379506, + "loss": 1.7814, + "step": 2640 + }, + { + "epoch": 0.5582767156475483, + "grad_norm": 0.7727276682853699, + "learning_rate": 0.0001984654108657296, + "loss": 1.82, + "step": 2650 + }, + { + "epoch": 0.5603834202348975, + "grad_norm": 0.7713900208473206, + "learning_rate": 0.00019845383720291392, + "loss": 1.8117, + "step": 2660 + }, + { + "epoch": 0.5624901248222468, + "grad_norm": 0.7313315272331238, + "learning_rate": 0.00019844222040041928, + "loss": 1.8402, + "step": 2670 + }, + { + "epoch": 0.564596829409596, + "grad_norm": 0.7062333226203918, + "learning_rate": 0.00019843056046333577, + "loss": 1.8251, + "step": 2680 + }, + { + "epoch": 0.5667035339969453, + "grad_norm": 0.7643407583236694, + "learning_rate": 0.00019841885739677251, + "loss": 1.8659, + "step": 2690 + }, + { + "epoch": 0.5688102385842945, + "grad_norm": 0.7446190714836121, + "learning_rate": 0.0001984071112058574, + "loss": 1.8089, + "step": 2700 + }, + { + "epoch": 0.5709169431716438, + "grad_norm": 0.7492159605026245, + "learning_rate": 0.00019839532189573733, + "loss": 1.7645, + "step": 2710 + }, + { + "epoch": 0.573023647758993, + "grad_norm": 0.707715630531311, + "learning_rate": 0.00019838348947157804, + "loss": 1.7683, + "step": 2720 + }, + { + "epoch": 0.5751303523463422, + "grad_norm": 0.7064851522445679, + "learning_rate": 0.0001983716139385641, + "loss": 1.8434, + "step": 2730 + }, + { + "epoch": 0.5772370569336914, + "grad_norm": 0.6879447102546692, + "learning_rate": 0.00019835969530189912, + "loss": 1.8176, + "step": 2740 + }, + { + "epoch": 0.5793437615210407, + "grad_norm": 0.7283410429954529, + "learning_rate": 0.00019834773356680547, + "loss": 1.8101, + "step": 2750 + }, + { + "epoch": 0.5814504661083899, + "grad_norm": 0.7824443578720093, + "learning_rate": 0.00019833572873852444, + "loss": 1.8454, + "step": 2760 + }, + { + "epoch": 0.5835571706957392, + "grad_norm": 0.7730831503868103, + "learning_rate": 0.0001983236808223162, + "loss": 1.8517, + "step": 2770 + }, + { + "epoch": 0.5856638752830884, + "grad_norm": 0.7812346816062927, + "learning_rate": 0.0001983115898234598, + "loss": 1.8139, + "step": 2780 + }, + { + "epoch": 0.5877705798704377, + "grad_norm": 0.7695793509483337, + "learning_rate": 0.0001982994557472532, + "loss": 1.8299, + "step": 2790 + }, + { + "epoch": 0.5898772844577869, + "grad_norm": 0.7079188227653503, + "learning_rate": 0.00019828727859901317, + "loss": 1.7954, + "step": 2800 + }, + { + "epoch": 0.5919839890451362, + "grad_norm": 0.9112276434898376, + "learning_rate": 0.00019827505838407544, + "loss": 1.8438, + "step": 2810 + }, + { + "epoch": 0.5940906936324853, + "grad_norm": 0.7448698282241821, + "learning_rate": 0.00019826279510779454, + "loss": 1.7831, + "step": 2820 + }, + { + "epoch": 0.5961973982198346, + "grad_norm": 0.8215844035148621, + "learning_rate": 0.00019825048877554385, + "loss": 1.8194, + "step": 2830 + }, + { + "epoch": 0.5983041028071838, + "grad_norm": 0.7165418863296509, + "learning_rate": 0.0001982381393927157, + "loss": 1.8716, + "step": 2840 + }, + { + "epoch": 0.6004108073945331, + "grad_norm": 0.7175021171569824, + "learning_rate": 0.00019822574696472126, + "loss": 1.8433, + "step": 2850 + }, + { + "epoch": 0.6025175119818823, + "grad_norm": 0.8004611730575562, + "learning_rate": 0.00019821331149699048, + "loss": 1.8758, + "step": 2860 + }, + { + "epoch": 0.6046242165692316, + "grad_norm": 0.731736958026886, + "learning_rate": 0.00019820083299497228, + "loss": 1.7633, + "step": 2870 + }, + { + "epoch": 0.6067309211565808, + "grad_norm": 0.7083932757377625, + "learning_rate": 0.00019818831146413434, + "loss": 1.7994, + "step": 2880 + }, + { + "epoch": 0.6088376257439301, + "grad_norm": 0.740691065788269, + "learning_rate": 0.0001981757469099633, + "loss": 1.7944, + "step": 2890 + }, + { + "epoch": 0.6109443303312793, + "grad_norm": 0.7464807629585266, + "learning_rate": 0.0001981631393379645, + "loss": 1.8093, + "step": 2900 + }, + { + "epoch": 0.6130510349186286, + "grad_norm": 0.7055980563163757, + "learning_rate": 0.00019815048875366234, + "loss": 1.8142, + "step": 2910 + }, + { + "epoch": 0.6151577395059777, + "grad_norm": 0.7619740962982178, + "learning_rate": 0.00019813779516259986, + "loss": 1.7805, + "step": 2920 + }, + { + "epoch": 0.617264444093327, + "grad_norm": 0.8591639995574951, + "learning_rate": 0.00019812505857033904, + "loss": 1.8294, + "step": 2930 + }, + { + "epoch": 0.6193711486806762, + "grad_norm": 0.7257503867149353, + "learning_rate": 0.0001981122789824607, + "loss": 1.7968, + "step": 2940 + }, + { + "epoch": 0.6214778532680255, + "grad_norm": 0.8203638195991516, + "learning_rate": 0.00019809945640456453, + "loss": 1.8513, + "step": 2950 + }, + { + "epoch": 0.6235845578553747, + "grad_norm": 0.7143609523773193, + "learning_rate": 0.000198086590842269, + "loss": 1.7997, + "step": 2960 + }, + { + "epoch": 0.625691262442724, + "grad_norm": 0.7339126467704773, + "learning_rate": 0.0001980736823012114, + "loss": 1.8356, + "step": 2970 + }, + { + "epoch": 0.6277979670300732, + "grad_norm": 0.755310595035553, + "learning_rate": 0.0001980607307870479, + "loss": 1.771, + "step": 2980 + }, + { + "epoch": 0.6299046716174225, + "grad_norm": 0.7657110691070557, + "learning_rate": 0.00019804773630545353, + "loss": 1.8006, + "step": 2990 + }, + { + "epoch": 0.6320113762047717, + "grad_norm": 0.7267753481864929, + "learning_rate": 0.0001980346988621221, + "loss": 1.895, + "step": 3000 + }, + { + "epoch": 0.634118080792121, + "grad_norm": 0.7973232865333557, + "learning_rate": 0.00019802161846276615, + "loss": 1.8724, + "step": 3010 + }, + { + "epoch": 0.6362247853794701, + "grad_norm": 0.7586084008216858, + "learning_rate": 0.00019800849511311726, + "loss": 1.8659, + "step": 3020 + }, + { + "epoch": 0.6383314899668194, + "grad_norm": 0.7559179663658142, + "learning_rate": 0.00019799532881892564, + "loss": 1.8208, + "step": 3030 + }, + { + "epoch": 0.6404381945541686, + "grad_norm": 0.7547093629837036, + "learning_rate": 0.0001979821195859604, + "loss": 1.7948, + "step": 3040 + }, + { + "epoch": 0.6425448991415179, + "grad_norm": 0.7194349765777588, + "learning_rate": 0.0001979688674200095, + "loss": 1.795, + "step": 3050 + }, + { + "epoch": 0.6446516037288671, + "grad_norm": 0.7290440797805786, + "learning_rate": 0.00019795557232687956, + "loss": 1.8149, + "step": 3060 + }, + { + "epoch": 0.6467583083162164, + "grad_norm": 0.7215458750724792, + "learning_rate": 0.0001979422343123962, + "loss": 1.8181, + "step": 3070 + }, + { + "epoch": 0.6488650129035656, + "grad_norm": 0.7262182831764221, + "learning_rate": 0.00019792885338240374, + "loss": 1.7897, + "step": 3080 + }, + { + "epoch": 0.6509717174909149, + "grad_norm": 0.7412195205688477, + "learning_rate": 0.0001979154295427653, + "loss": 1.8387, + "step": 3090 + }, + { + "epoch": 0.653078422078264, + "grad_norm": 0.7855926156044006, + "learning_rate": 0.00019790196279936286, + "loss": 1.8165, + "step": 3100 + }, + { + "epoch": 0.6551851266656133, + "grad_norm": 0.7217236161231995, + "learning_rate": 0.00019788845315809713, + "loss": 1.7652, + "step": 3110 + }, + { + "epoch": 0.6572918312529625, + "grad_norm": 0.7485338449478149, + "learning_rate": 0.0001978749006248877, + "loss": 1.7949, + "step": 3120 + }, + { + "epoch": 0.6593985358403118, + "grad_norm": 0.7767829895019531, + "learning_rate": 0.00019786130520567285, + "loss": 1.802, + "step": 3130 + }, + { + "epoch": 0.661505240427661, + "grad_norm": 0.7325339317321777, + "learning_rate": 0.00019784766690640975, + "loss": 1.7796, + "step": 3140 + }, + { + "epoch": 0.6636119450150103, + "grad_norm": 0.7544266581535339, + "learning_rate": 0.00019783398573307428, + "loss": 1.8225, + "step": 3150 + }, + { + "epoch": 0.6657186496023595, + "grad_norm": 1.14772367477417, + "learning_rate": 0.00019782026169166118, + "loss": 1.8231, + "step": 3160 + }, + { + "epoch": 0.6678253541897088, + "grad_norm": 0.7514392733573914, + "learning_rate": 0.0001978064947881839, + "loss": 1.8369, + "step": 3170 + }, + { + "epoch": 0.669932058777058, + "grad_norm": 0.7889571785926819, + "learning_rate": 0.00019779268502867473, + "loss": 1.779, + "step": 3180 + }, + { + "epoch": 0.6720387633644073, + "grad_norm": 0.7332549095153809, + "learning_rate": 0.00019777883241918468, + "loss": 1.8519, + "step": 3190 + }, + { + "epoch": 0.6741454679517564, + "grad_norm": 0.7040186524391174, + "learning_rate": 0.00019776493696578365, + "loss": 1.7989, + "step": 3200 + }, + { + "epoch": 0.6762521725391057, + "grad_norm": 0.7270476818084717, + "learning_rate": 0.00019775099867456013, + "loss": 1.8423, + "step": 3210 + }, + { + "epoch": 0.6783588771264549, + "grad_norm": 0.7685533165931702, + "learning_rate": 0.00019773701755162158, + "loss": 1.7827, + "step": 3220 + }, + { + "epoch": 0.6804655817138042, + "grad_norm": 0.6836286187171936, + "learning_rate": 0.00019772299360309406, + "loss": 1.8167, + "step": 3230 + }, + { + "epoch": 0.6825722863011534, + "grad_norm": 0.7396418452262878, + "learning_rate": 0.0001977089268351225, + "loss": 1.8, + "step": 3240 + }, + { + "epoch": 0.6846789908885027, + "grad_norm": 0.7958587408065796, + "learning_rate": 0.00019769481725387053, + "loss": 1.8342, + "step": 3250 + }, + { + "epoch": 0.6867856954758519, + "grad_norm": 0.7690047025680542, + "learning_rate": 0.00019768066486552065, + "loss": 1.8248, + "step": 3260 + }, + { + "epoch": 0.6888924000632012, + "grad_norm": 0.7547836303710938, + "learning_rate": 0.0001976664696762739, + "loss": 1.8619, + "step": 3270 + }, + { + "epoch": 0.6909991046505504, + "grad_norm": 0.8090780377388, + "learning_rate": 0.00019765223169235035, + "loss": 1.8464, + "step": 3280 + }, + { + "epoch": 0.6931058092378997, + "grad_norm": 0.7340956926345825, + "learning_rate": 0.00019763795091998858, + "loss": 1.8414, + "step": 3290 + }, + { + "epoch": 0.6952125138252488, + "grad_norm": 0.7637834548950195, + "learning_rate": 0.00019762362736544607, + "loss": 1.775, + "step": 3300 + }, + { + "epoch": 0.6973192184125981, + "grad_norm": 0.7355379462242126, + "learning_rate": 0.00019760926103499897, + "loss": 1.8039, + "step": 3310 + }, + { + "epoch": 0.6994259229999473, + "grad_norm": 0.7598356008529663, + "learning_rate": 0.0001975948519349422, + "loss": 1.7958, + "step": 3320 + }, + { + "epoch": 0.7015326275872966, + "grad_norm": 0.6928766369819641, + "learning_rate": 0.00019758040007158948, + "loss": 1.7421, + "step": 3330 + }, + { + "epoch": 0.7036393321746458, + "grad_norm": 0.7078458666801453, + "learning_rate": 0.00019756590545127312, + "loss": 1.8463, + "step": 3340 + }, + { + "epoch": 0.7057460367619951, + "grad_norm": 0.7719635963439941, + "learning_rate": 0.00019755136808034425, + "loss": 1.7731, + "step": 3350 + }, + { + "epoch": 0.7078527413493443, + "grad_norm": 0.7915549278259277, + "learning_rate": 0.00019753678796517282, + "loss": 1.8302, + "step": 3360 + }, + { + "epoch": 0.7099594459366936, + "grad_norm": 0.7216295599937439, + "learning_rate": 0.00019752216511214737, + "loss": 1.8054, + "step": 3370 + }, + { + "epoch": 0.7120661505240428, + "grad_norm": 0.7865151762962341, + "learning_rate": 0.0001975074995276752, + "loss": 1.8076, + "step": 3380 + }, + { + "epoch": 0.7141728551113921, + "grad_norm": 0.7931540608406067, + "learning_rate": 0.00019749279121818235, + "loss": 1.8389, + "step": 3390 + }, + { + "epoch": 0.7162795596987412, + "grad_norm": 0.7390506267547607, + "learning_rate": 0.00019747804019011367, + "loss": 1.7799, + "step": 3400 + }, + { + "epoch": 0.7183862642860905, + "grad_norm": 0.7065187096595764, + "learning_rate": 0.00019746324644993255, + "loss": 1.7853, + "step": 3410 + }, + { + "epoch": 0.7204929688734397, + "grad_norm": 0.7845907211303711, + "learning_rate": 0.00019744841000412123, + "loss": 1.8037, + "step": 3420 + }, + { + "epoch": 0.722599673460789, + "grad_norm": 0.7040785551071167, + "learning_rate": 0.0001974335308591806, + "loss": 1.7391, + "step": 3430 + }, + { + "epoch": 0.7247063780481382, + "grad_norm": 0.7547016739845276, + "learning_rate": 0.00019741860902163029, + "loss": 1.8104, + "step": 3440 + }, + { + "epoch": 0.7268130826354874, + "grad_norm": 0.7265505790710449, + "learning_rate": 0.0001974036444980086, + "loss": 1.8016, + "step": 3450 + }, + { + "epoch": 0.7289197872228367, + "grad_norm": 0.6882880926132202, + "learning_rate": 0.0001973886372948726, + "loss": 1.7781, + "step": 3460 + }, + { + "epoch": 0.7310264918101859, + "grad_norm": 0.7181904911994934, + "learning_rate": 0.00019737358741879802, + "loss": 1.8162, + "step": 3470 + }, + { + "epoch": 0.7331331963975352, + "grad_norm": 0.7640955448150635, + "learning_rate": 0.00019735849487637929, + "loss": 1.7987, + "step": 3480 + }, + { + "epoch": 0.7352399009848843, + "grad_norm": 0.7534769773483276, + "learning_rate": 0.00019734335967422947, + "loss": 1.7754, + "step": 3490 + }, + { + "epoch": 0.7373466055722336, + "grad_norm": 0.7754014730453491, + "learning_rate": 0.00019732818181898045, + "loss": 1.827, + "step": 3500 + }, + { + "epoch": 0.7394533101595828, + "grad_norm": 0.6991292834281921, + "learning_rate": 0.0001973129613172827, + "loss": 1.8061, + "step": 3510 + }, + { + "epoch": 0.7415600147469321, + "grad_norm": 0.7496746182441711, + "learning_rate": 0.00019729769817580542, + "loss": 1.8001, + "step": 3520 + }, + { + "epoch": 0.7436667193342813, + "grad_norm": 0.7338647246360779, + "learning_rate": 0.0001972823924012365, + "loss": 1.8216, + "step": 3530 + }, + { + "epoch": 0.7457734239216306, + "grad_norm": 0.742842435836792, + "learning_rate": 0.0001972670440002825, + "loss": 1.7468, + "step": 3540 + }, + { + "epoch": 0.7478801285089798, + "grad_norm": 0.7462572455406189, + "learning_rate": 0.00019725165297966859, + "loss": 1.7863, + "step": 3550 + }, + { + "epoch": 0.7499868330963291, + "grad_norm": 0.7699198126792908, + "learning_rate": 0.00019723621934613874, + "loss": 1.7659, + "step": 3560 + }, + { + "epoch": 0.7520935376836783, + "grad_norm": 0.7163071632385254, + "learning_rate": 0.00019722074310645553, + "loss": 1.8067, + "step": 3570 + }, + { + "epoch": 0.7542002422710276, + "grad_norm": 0.7273010015487671, + "learning_rate": 0.0001972052242674002, + "loss": 1.8497, + "step": 3580 + }, + { + "epoch": 0.7563069468583767, + "grad_norm": 0.7326186895370483, + "learning_rate": 0.00019718966283577264, + "loss": 1.7821, + "step": 3590 + }, + { + "epoch": 0.758413651445726, + "grad_norm": 0.7297202944755554, + "learning_rate": 0.00019717405881839145, + "loss": 1.8288, + "step": 3600 + }, + { + "epoch": 0.7605203560330752, + "grad_norm": 0.7100248336791992, + "learning_rate": 0.00019715841222209387, + "loss": 1.7949, + "step": 3610 + }, + { + "epoch": 0.7626270606204245, + "grad_norm": 0.7175480723381042, + "learning_rate": 0.0001971427230537358, + "loss": 1.807, + "step": 3620 + }, + { + "epoch": 0.7647337652077737, + "grad_norm": 0.7053601741790771, + "learning_rate": 0.0001971269913201918, + "loss": 1.8223, + "step": 3630 + }, + { + "epoch": 0.766840469795123, + "grad_norm": 0.7398414611816406, + "learning_rate": 0.00019711121702835504, + "loss": 1.7607, + "step": 3640 + }, + { + "epoch": 0.7689471743824722, + "grad_norm": 0.7213431596755981, + "learning_rate": 0.00019709540018513736, + "loss": 1.7927, + "step": 3650 + }, + { + "epoch": 0.7710538789698215, + "grad_norm": 0.7234872579574585, + "learning_rate": 0.00019707954079746927, + "loss": 1.8027, + "step": 3660 + }, + { + "epoch": 0.7731605835571707, + "grad_norm": 0.8808572292327881, + "learning_rate": 0.0001970636388722999, + "loss": 1.7833, + "step": 3670 + }, + { + "epoch": 0.77526728814452, + "grad_norm": 0.7086265683174133, + "learning_rate": 0.00019704769441659703, + "loss": 1.7386, + "step": 3680 + }, + { + "epoch": 0.7773739927318691, + "grad_norm": 0.7601612210273743, + "learning_rate": 0.00019703170743734706, + "loss": 1.7816, + "step": 3690 + }, + { + "epoch": 0.7794806973192184, + "grad_norm": 0.7857274413108826, + "learning_rate": 0.000197015677941555, + "loss": 1.7497, + "step": 3700 + }, + { + "epoch": 0.7815874019065676, + "grad_norm": 0.7167443633079529, + "learning_rate": 0.00019699960593624462, + "loss": 1.7963, + "step": 3710 + }, + { + "epoch": 0.7836941064939169, + "grad_norm": 0.6912304759025574, + "learning_rate": 0.00019698349142845814, + "loss": 1.738, + "step": 3720 + }, + { + "epoch": 0.7858008110812661, + "grad_norm": 0.7966861724853516, + "learning_rate": 0.00019696733442525646, + "loss": 1.8344, + "step": 3730 + }, + { + "epoch": 0.7879075156686154, + "grad_norm": 0.8376454710960388, + "learning_rate": 0.00019695113493371918, + "loss": 1.7743, + "step": 3740 + }, + { + "epoch": 0.7900142202559646, + "grad_norm": 0.7219945192337036, + "learning_rate": 0.00019693489296094443, + "loss": 1.8116, + "step": 3750 + }, + { + "epoch": 0.7921209248433139, + "grad_norm": 0.7250286936759949, + "learning_rate": 0.00019691860851404897, + "loss": 1.8044, + "step": 3760 + }, + { + "epoch": 0.794227629430663, + "grad_norm": 0.8389397859573364, + "learning_rate": 0.00019690228160016817, + "loss": 1.7662, + "step": 3770 + }, + { + "epoch": 0.7963343340180123, + "grad_norm": 0.8005920052528381, + "learning_rate": 0.00019688591222645607, + "loss": 1.8361, + "step": 3780 + }, + { + "epoch": 0.7984410386053615, + "grad_norm": 0.7611145377159119, + "learning_rate": 0.00019686950040008526, + "loss": 1.8219, + "step": 3790 + }, + { + "epoch": 0.8005477431927108, + "grad_norm": 0.7690491080284119, + "learning_rate": 0.0001968530461282469, + "loss": 1.78, + "step": 3800 + }, + { + "epoch": 0.80265444778006, + "grad_norm": 0.7202954292297363, + "learning_rate": 0.00019683654941815077, + "loss": 1.8325, + "step": 3810 + }, + { + "epoch": 0.8047611523674093, + "grad_norm": 0.7846165299415588, + "learning_rate": 0.00019682001027702533, + "loss": 1.7509, + "step": 3820 + }, + { + "epoch": 0.8068678569547585, + "grad_norm": 0.7994365096092224, + "learning_rate": 0.00019680342871211752, + "loss": 1.8076, + "step": 3830 + }, + { + "epoch": 0.8089745615421078, + "grad_norm": 0.854131281375885, + "learning_rate": 0.00019678680473069293, + "loss": 1.8089, + "step": 3840 + }, + { + "epoch": 0.811081266129457, + "grad_norm": 0.7317761778831482, + "learning_rate": 0.0001967701383400357, + "loss": 1.8047, + "step": 3850 + }, + { + "epoch": 0.8131879707168063, + "grad_norm": 0.7426530718803406, + "learning_rate": 0.00019675342954744853, + "loss": 1.7675, + "step": 3860 + }, + { + "epoch": 0.8152946753041554, + "grad_norm": 0.7657327055931091, + "learning_rate": 0.00019673667836025283, + "loss": 1.8319, + "step": 3870 + }, + { + "epoch": 0.8174013798915047, + "grad_norm": 0.773003339767456, + "learning_rate": 0.00019671988478578843, + "loss": 1.8963, + "step": 3880 + }, + { + "epoch": 0.8195080844788539, + "grad_norm": 0.7750910520553589, + "learning_rate": 0.00019670304883141382, + "loss": 1.8312, + "step": 3890 + }, + { + "epoch": 0.8216147890662032, + "grad_norm": 0.7256291508674622, + "learning_rate": 0.00019668617050450603, + "loss": 1.7888, + "step": 3900 + }, + { + "epoch": 0.8237214936535524, + "grad_norm": 0.7944165468215942, + "learning_rate": 0.00019666924981246066, + "loss": 1.8584, + "step": 3910 + }, + { + "epoch": 0.8258281982409017, + "grad_norm": 0.7136659622192383, + "learning_rate": 0.0001966522867626919, + "loss": 1.8315, + "step": 3920 + }, + { + "epoch": 0.8279349028282509, + "grad_norm": 0.8189762830734253, + "learning_rate": 0.00019663528136263246, + "loss": 1.8193, + "step": 3930 + }, + { + "epoch": 0.8300416074156002, + "grad_norm": 0.7890966534614563, + "learning_rate": 0.0001966182336197336, + "loss": 1.8204, + "step": 3940 + }, + { + "epoch": 0.8321483120029494, + "grad_norm": 0.8017906546592712, + "learning_rate": 0.00019660114354146525, + "loss": 1.7891, + "step": 3950 + }, + { + "epoch": 0.8342550165902987, + "grad_norm": 0.8445520997047424, + "learning_rate": 0.00019658401113531565, + "loss": 1.8108, + "step": 3960 + }, + { + "epoch": 0.8363617211776478, + "grad_norm": 0.7506213188171387, + "learning_rate": 0.00019656683640879185, + "loss": 1.7811, + "step": 3970 + }, + { + "epoch": 0.8384684257649971, + "grad_norm": 0.7670649290084839, + "learning_rate": 0.00019654961936941932, + "loss": 1.8, + "step": 3980 + }, + { + "epoch": 0.8405751303523463, + "grad_norm": 0.86253422498703, + "learning_rate": 0.000196532360024742, + "loss": 1.7841, + "step": 3990 + }, + { + "epoch": 0.8426818349396956, + "grad_norm": 0.8307934999465942, + "learning_rate": 0.00019651505838232255, + "loss": 1.7813, + "step": 4000 + }, + { + "epoch": 0.8447885395270448, + "grad_norm": 0.7400214076042175, + "learning_rate": 0.00019649771444974197, + "loss": 1.7305, + "step": 4010 + }, + { + "epoch": 0.8468952441143941, + "grad_norm": 0.7180853486061096, + "learning_rate": 0.00019648032823459994, + "loss": 1.7569, + "step": 4020 + }, + { + "epoch": 0.8490019487017433, + "grad_norm": 0.7013576626777649, + "learning_rate": 0.00019646289974451455, + "loss": 1.796, + "step": 4030 + }, + { + "epoch": 0.8511086532890926, + "grad_norm": 0.7643212676048279, + "learning_rate": 0.00019644542898712252, + "loss": 1.7791, + "step": 4040 + }, + { + "epoch": 0.8532153578764418, + "grad_norm": 0.7359760999679565, + "learning_rate": 0.00019642791597007902, + "loss": 1.7836, + "step": 4050 + }, + { + "epoch": 0.855322062463791, + "grad_norm": 0.7171246409416199, + "learning_rate": 0.00019641036070105778, + "loss": 1.7218, + "step": 4060 + }, + { + "epoch": 0.8574287670511402, + "grad_norm": 0.7775237560272217, + "learning_rate": 0.000196392763187751, + "loss": 1.7838, + "step": 4070 + }, + { + "epoch": 0.8595354716384895, + "grad_norm": 0.687788188457489, + "learning_rate": 0.00019637512343786937, + "loss": 1.8259, + "step": 4080 + }, + { + "epoch": 0.8616421762258387, + "grad_norm": 0.7279845476150513, + "learning_rate": 0.00019635744145914222, + "loss": 1.755, + "step": 4090 + }, + { + "epoch": 0.863748880813188, + "grad_norm": 0.8731548190116882, + "learning_rate": 0.0001963397172593172, + "loss": 1.8539, + "step": 4100 + }, + { + "epoch": 0.8658555854005372, + "grad_norm": 0.7026403546333313, + "learning_rate": 0.00019632195084616063, + "loss": 1.7384, + "step": 4110 + }, + { + "epoch": 0.8679622899878865, + "grad_norm": 0.7707716226577759, + "learning_rate": 0.0001963041422274572, + "loss": 1.821, + "step": 4120 + }, + { + "epoch": 0.8700689945752357, + "grad_norm": 0.7273789644241333, + "learning_rate": 0.00019628629141101012, + "loss": 1.8124, + "step": 4130 + }, + { + "epoch": 0.872175699162585, + "grad_norm": 0.7353452444076538, + "learning_rate": 0.00019626839840464119, + "loss": 1.7925, + "step": 4140 + }, + { + "epoch": 0.8742824037499342, + "grad_norm": 0.7055097818374634, + "learning_rate": 0.00019625046321619053, + "loss": 1.8403, + "step": 4150 + }, + { + "epoch": 0.8763891083372835, + "grad_norm": 0.8632143139839172, + "learning_rate": 0.0001962324858535169, + "loss": 1.78, + "step": 4160 + }, + { + "epoch": 0.8784958129246326, + "grad_norm": 0.7952572107315063, + "learning_rate": 0.00019621446632449744, + "loss": 1.7858, + "step": 4170 + }, + { + "epoch": 0.8806025175119819, + "grad_norm": 0.7177860736846924, + "learning_rate": 0.00019619640463702779, + "loss": 1.7926, + "step": 4180 + }, + { + "epoch": 0.8827092220993311, + "grad_norm": 0.7487980723381042, + "learning_rate": 0.0001961783007990221, + "loss": 1.7805, + "step": 4190 + }, + { + "epoch": 0.8848159266866804, + "grad_norm": 0.8172057271003723, + "learning_rate": 0.0001961601548184129, + "loss": 1.7751, + "step": 4200 + }, + { + "epoch": 0.8869226312740296, + "grad_norm": 0.7768619656562805, + "learning_rate": 0.00019614196670315133, + "loss": 1.7962, + "step": 4210 + }, + { + "epoch": 0.8890293358613789, + "grad_norm": 0.7874531149864197, + "learning_rate": 0.00019612373646120683, + "loss": 1.7485, + "step": 4220 + }, + { + "epoch": 0.8911360404487281, + "grad_norm": 0.7843292355537415, + "learning_rate": 0.0001961054641005674, + "loss": 1.8358, + "step": 4230 + }, + { + "epoch": 0.8932427450360774, + "grad_norm": 0.7430649995803833, + "learning_rate": 0.00019608714962923948, + "loss": 1.8252, + "step": 4240 + }, + { + "epoch": 0.8953494496234266, + "grad_norm": 0.7348790764808655, + "learning_rate": 0.00019606879305524794, + "loss": 1.8759, + "step": 4250 + }, + { + "epoch": 0.8974561542107758, + "grad_norm": 0.7689461708068848, + "learning_rate": 0.00019605039438663614, + "loss": 1.8029, + "step": 4260 + }, + { + "epoch": 0.899562858798125, + "grad_norm": 0.7884606719017029, + "learning_rate": 0.0001960319536314658, + "loss": 1.7901, + "step": 4270 + }, + { + "epoch": 0.9016695633854743, + "grad_norm": 0.7943284511566162, + "learning_rate": 0.0001960134707978172, + "loss": 1.7719, + "step": 4280 + }, + { + "epoch": 0.9037762679728235, + "grad_norm": 0.9755631685256958, + "learning_rate": 0.0001959949458937889, + "loss": 1.8401, + "step": 4290 + }, + { + "epoch": 0.9058829725601727, + "grad_norm": 0.7475715279579163, + "learning_rate": 0.0001959763789274981, + "loss": 1.7304, + "step": 4300 + }, + { + "epoch": 0.907989677147522, + "grad_norm": 0.731900155544281, + "learning_rate": 0.00019595776990708022, + "loss": 1.779, + "step": 4310 + }, + { + "epoch": 0.9100963817348712, + "grad_norm": 0.7448343634605408, + "learning_rate": 0.0001959391188406893, + "loss": 1.7347, + "step": 4320 + }, + { + "epoch": 0.9122030863222205, + "grad_norm": 0.7453986406326294, + "learning_rate": 0.00019592042573649763, + "loss": 1.7833, + "step": 4330 + }, + { + "epoch": 0.9143097909095697, + "grad_norm": 0.7423632144927979, + "learning_rate": 0.00019590169060269602, + "loss": 1.7099, + "step": 4340 + }, + { + "epoch": 0.916416495496919, + "grad_norm": 0.7491478323936462, + "learning_rate": 0.0001958829134474937, + "loss": 1.7697, + "step": 4350 + }, + { + "epoch": 0.9185232000842681, + "grad_norm": 0.7573466300964355, + "learning_rate": 0.00019586409427911825, + "loss": 1.8192, + "step": 4360 + }, + { + "epoch": 0.9206299046716174, + "grad_norm": 0.7873802185058594, + "learning_rate": 0.00019584523310581573, + "loss": 1.7532, + "step": 4370 + }, + { + "epoch": 0.9227366092589666, + "grad_norm": 0.782875120639801, + "learning_rate": 0.00019582632993585052, + "loss": 1.7863, + "step": 4380 + }, + { + "epoch": 0.9248433138463159, + "grad_norm": 0.7400102615356445, + "learning_rate": 0.00019580738477750553, + "loss": 1.8116, + "step": 4390 + }, + { + "epoch": 0.9269500184336651, + "grad_norm": 0.7774225473403931, + "learning_rate": 0.00019578839763908192, + "loss": 1.7869, + "step": 4400 + }, + { + "epoch": 0.9290567230210144, + "grad_norm": 0.7710514068603516, + "learning_rate": 0.00019576936852889936, + "loss": 1.7993, + "step": 4410 + }, + { + "epoch": 0.9311634276083636, + "grad_norm": 0.6935902833938599, + "learning_rate": 0.00019575029745529582, + "loss": 1.7855, + "step": 4420 + }, + { + "epoch": 0.9332701321957129, + "grad_norm": 0.7460892796516418, + "learning_rate": 0.00019573118442662776, + "loss": 1.8266, + "step": 4430 + }, + { + "epoch": 0.935376836783062, + "grad_norm": 0.8303509950637817, + "learning_rate": 0.00019571202945126994, + "loss": 1.7451, + "step": 4440 + }, + { + "epoch": 0.9374835413704113, + "grad_norm": 0.7582190632820129, + "learning_rate": 0.00019569283253761553, + "loss": 1.7904, + "step": 4450 + }, + { + "epoch": 0.9395902459577605, + "grad_norm": 0.7921029329299927, + "learning_rate": 0.00019567359369407605, + "loss": 1.7872, + "step": 4460 + }, + { + "epoch": 0.9416969505451098, + "grad_norm": 0.7588982582092285, + "learning_rate": 0.00019565431292908146, + "loss": 1.8489, + "step": 4470 + }, + { + "epoch": 0.943803655132459, + "grad_norm": 0.7762647867202759, + "learning_rate": 0.00019563499025107998, + "loss": 1.8482, + "step": 4480 + }, + { + "epoch": 0.9459103597198083, + "grad_norm": 0.7484056949615479, + "learning_rate": 0.00019561562566853836, + "loss": 1.7979, + "step": 4490 + }, + { + "epoch": 0.9480170643071575, + "grad_norm": 0.7211093902587891, + "learning_rate": 0.0001955962191899415, + "loss": 1.7849, + "step": 4500 + }, + { + "epoch": 0.9501237688945068, + "grad_norm": 0.7542027235031128, + "learning_rate": 0.00019557677082379286, + "loss": 1.7533, + "step": 4510 + }, + { + "epoch": 0.952230473481856, + "grad_norm": 0.7238128185272217, + "learning_rate": 0.0001955572805786141, + "loss": 1.8403, + "step": 4520 + }, + { + "epoch": 0.9543371780692053, + "grad_norm": 0.7738656997680664, + "learning_rate": 0.0001955377484629453, + "loss": 1.8056, + "step": 4530 + }, + { + "epoch": 0.9564438826565544, + "grad_norm": 0.7569396495819092, + "learning_rate": 0.0001955181744853449, + "loss": 1.8334, + "step": 4540 + }, + { + "epoch": 0.9585505872439037, + "grad_norm": 0.7401841878890991, + "learning_rate": 0.00019549855865438965, + "loss": 1.8209, + "step": 4550 + }, + { + "epoch": 0.9606572918312529, + "grad_norm": 0.8001863956451416, + "learning_rate": 0.00019547890097867468, + "loss": 1.7749, + "step": 4560 + }, + { + "epoch": 0.9627639964186022, + "grad_norm": 0.8146688938140869, + "learning_rate": 0.00019545920146681338, + "loss": 1.7778, + "step": 4570 + }, + { + "epoch": 0.9648707010059514, + "grad_norm": 0.7350276708602905, + "learning_rate": 0.00019543946012743756, + "loss": 1.7766, + "step": 4580 + }, + { + "epoch": 0.9669774055933007, + "grad_norm": 0.7190563082695007, + "learning_rate": 0.0001954196769691973, + "loss": 1.7803, + "step": 4590 + }, + { + "epoch": 0.9690841101806499, + "grad_norm": 0.7367439866065979, + "learning_rate": 0.00019539985200076098, + "loss": 1.7911, + "step": 4600 + }, + { + "epoch": 0.9711908147679992, + "grad_norm": 0.760611355304718, + "learning_rate": 0.0001953799852308154, + "loss": 1.8031, + "step": 4610 + }, + { + "epoch": 0.9732975193553484, + "grad_norm": 0.7771376967430115, + "learning_rate": 0.00019536007666806556, + "loss": 1.764, + "step": 4620 + }, + { + "epoch": 0.9754042239426977, + "grad_norm": 0.7223167419433594, + "learning_rate": 0.00019534012632123484, + "loss": 1.7894, + "step": 4630 + }, + { + "epoch": 0.9775109285300468, + "grad_norm": 0.72389155626297, + "learning_rate": 0.00019532013419906497, + "loss": 1.8032, + "step": 4640 + }, + { + "epoch": 0.9796176331173961, + "grad_norm": 0.7463617324829102, + "learning_rate": 0.00019530010031031586, + "loss": 1.7631, + "step": 4650 + }, + { + "epoch": 0.9817243377047453, + "grad_norm": 0.7694106698036194, + "learning_rate": 0.00019528002466376586, + "loss": 1.7619, + "step": 4660 + }, + { + "epoch": 0.9838310422920946, + "grad_norm": 0.7413203120231628, + "learning_rate": 0.0001952599072682115, + "loss": 1.7901, + "step": 4670 + }, + { + "epoch": 0.9859377468794438, + "grad_norm": 0.7745965123176575, + "learning_rate": 0.00019523974813246767, + "loss": 1.7842, + "step": 4680 + }, + { + "epoch": 0.9880444514667931, + "grad_norm": 0.7581157088279724, + "learning_rate": 0.0001952195472653675, + "loss": 1.8425, + "step": 4690 + }, + { + "epoch": 0.9901511560541423, + "grad_norm": 0.7829221487045288, + "learning_rate": 0.00019519930467576247, + "loss": 1.8011, + "step": 4700 + }, + { + "epoch": 0.9922578606414916, + "grad_norm": 0.7388656735420227, + "learning_rate": 0.0001951790203725223, + "loss": 1.7878, + "step": 4710 + }, + { + "epoch": 0.9943645652288408, + "grad_norm": 1.5744229555130005, + "learning_rate": 0.00019515869436453502, + "loss": 1.8004, + "step": 4720 + }, + { + "epoch": 0.99647126981619, + "grad_norm": 0.787341296672821, + "learning_rate": 0.00019513832666070687, + "loss": 1.7781, + "step": 4730 + }, + { + "epoch": 0.9985779744035392, + "grad_norm": 0.8090167045593262, + "learning_rate": 0.00019511791726996243, + "loss": 1.8597, + "step": 4740 + }, + { + "epoch": 1.0006846789908885, + "grad_norm": 0.7438858151435852, + "learning_rate": 0.00019509746620124447, + "loss": 1.7137, + "step": 4750 + }, + { + "epoch": 1.0027913835782378, + "grad_norm": 0.7524532079696655, + "learning_rate": 0.00019507697346351414, + "loss": 1.7519, + "step": 4760 + }, + { + "epoch": 1.004898088165587, + "grad_norm": 0.7667739391326904, + "learning_rate": 0.00019505643906575073, + "loss": 1.7693, + "step": 4770 + }, + { + "epoch": 1.0070047927529362, + "grad_norm": 0.8733193874359131, + "learning_rate": 0.00019503586301695183, + "loss": 1.7616, + "step": 4780 + }, + { + "epoch": 1.0091114973402855, + "grad_norm": 0.7520270943641663, + "learning_rate": 0.00019501524532613328, + "loss": 1.7154, + "step": 4790 + }, + { + "epoch": 1.0112182019276348, + "grad_norm": 0.8560175895690918, + "learning_rate": 0.0001949945860023292, + "loss": 1.7157, + "step": 4800 + }, + { + "epoch": 1.0133249065149839, + "grad_norm": 0.7532521486282349, + "learning_rate": 0.00019497388505459188, + "loss": 1.7118, + "step": 4810 + }, + { + "epoch": 1.0154316111023332, + "grad_norm": 0.7637246251106262, + "learning_rate": 0.00019495314249199194, + "loss": 1.7412, + "step": 4820 + }, + { + "epoch": 1.0175383156896824, + "grad_norm": 0.789354681968689, + "learning_rate": 0.0001949323583236181, + "loss": 1.739, + "step": 4830 + }, + { + "epoch": 1.0196450202770317, + "grad_norm": 0.8701115846633911, + "learning_rate": 0.00019491153255857748, + "loss": 1.7189, + "step": 4840 + }, + { + "epoch": 1.0217517248643808, + "grad_norm": 0.7776268720626831, + "learning_rate": 0.0001948906652059953, + "loss": 1.7785, + "step": 4850 + }, + { + "epoch": 1.0238584294517301, + "grad_norm": 0.8140941858291626, + "learning_rate": 0.00019486975627501502, + "loss": 1.7281, + "step": 4860 + }, + { + "epoch": 1.0259651340390794, + "grad_norm": 0.8021364808082581, + "learning_rate": 0.00019484880577479835, + "loss": 1.7601, + "step": 4870 + }, + { + "epoch": 1.0280718386264287, + "grad_norm": 0.8303508758544922, + "learning_rate": 0.00019482781371452524, + "loss": 1.7261, + "step": 4880 + }, + { + "epoch": 1.0301785432137778, + "grad_norm": 0.7315002083778381, + "learning_rate": 0.0001948067801033938, + "loss": 1.7464, + "step": 4890 + }, + { + "epoch": 1.032285247801127, + "grad_norm": 0.8632476329803467, + "learning_rate": 0.00019478570495062037, + "loss": 1.772, + "step": 4900 + }, + { + "epoch": 1.0343919523884764, + "grad_norm": 0.7853039503097534, + "learning_rate": 0.00019476458826543945, + "loss": 1.7331, + "step": 4910 + }, + { + "epoch": 1.0364986569758257, + "grad_norm": 0.7495045065879822, + "learning_rate": 0.0001947434300571038, + "loss": 1.7609, + "step": 4920 + }, + { + "epoch": 1.0386053615631747, + "grad_norm": 0.8120619654655457, + "learning_rate": 0.00019472223033488431, + "loss": 1.7377, + "step": 4930 + }, + { + "epoch": 1.040712066150524, + "grad_norm": 0.7205514311790466, + "learning_rate": 0.00019470098910807015, + "loss": 1.7342, + "step": 4940 + }, + { + "epoch": 1.0428187707378733, + "grad_norm": 0.8088648319244385, + "learning_rate": 0.0001946797063859686, + "loss": 1.7257, + "step": 4950 + }, + { + "epoch": 1.0449254753252226, + "grad_norm": 0.7667573094367981, + "learning_rate": 0.00019465838217790516, + "loss": 1.7089, + "step": 4960 + }, + { + "epoch": 1.0470321799125717, + "grad_norm": 0.8235226273536682, + "learning_rate": 0.00019463701649322343, + "loss": 1.7422, + "step": 4970 + }, + { + "epoch": 1.049138884499921, + "grad_norm": 0.8100693225860596, + "learning_rate": 0.00019461560934128533, + "loss": 1.7587, + "step": 4980 + }, + { + "epoch": 1.0512455890872703, + "grad_norm": 0.7639106512069702, + "learning_rate": 0.00019459416073147083, + "loss": 1.775, + "step": 4990 + }, + { + "epoch": 1.0533522936746196, + "grad_norm": 0.7470769286155701, + "learning_rate": 0.00019457267067317808, + "loss": 1.7528, + "step": 5000 + }, + { + "epoch": 1.0554589982619687, + "grad_norm": 0.7330575585365295, + "learning_rate": 0.00019455113917582346, + "loss": 1.725, + "step": 5010 + }, + { + "epoch": 1.057565702849318, + "grad_norm": 0.8469204902648926, + "learning_rate": 0.00019452956624884144, + "loss": 1.7515, + "step": 5020 + }, + { + "epoch": 1.0596724074366672, + "grad_norm": 0.8233253359794617, + "learning_rate": 0.00019450795190168466, + "loss": 1.7137, + "step": 5030 + }, + { + "epoch": 1.0617791120240165, + "grad_norm": 0.7793976664543152, + "learning_rate": 0.0001944862961438239, + "loss": 1.7661, + "step": 5040 + }, + { + "epoch": 1.0638858166113656, + "grad_norm": 0.748579204082489, + "learning_rate": 0.00019446459898474813, + "loss": 1.7186, + "step": 5050 + }, + { + "epoch": 1.065992521198715, + "grad_norm": 0.8824768662452698, + "learning_rate": 0.00019444286043396444, + "loss": 1.7756, + "step": 5060 + }, + { + "epoch": 1.0680992257860642, + "grad_norm": 0.8221313953399658, + "learning_rate": 0.000194421080500998, + "loss": 1.7871, + "step": 5070 + }, + { + "epoch": 1.0702059303734135, + "grad_norm": 0.7716003060340881, + "learning_rate": 0.0001943992591953922, + "loss": 1.7361, + "step": 5080 + }, + { + "epoch": 1.0723126349607626, + "grad_norm": 0.7575770616531372, + "learning_rate": 0.00019437739652670847, + "loss": 1.7479, + "step": 5090 + }, + { + "epoch": 1.0744193395481119, + "grad_norm": 0.8454105854034424, + "learning_rate": 0.00019435549250452645, + "loss": 1.7889, + "step": 5100 + }, + { + "epoch": 1.0765260441354612, + "grad_norm": 0.809290885925293, + "learning_rate": 0.00019433354713844386, + "loss": 1.8065, + "step": 5110 + }, + { + "epoch": 1.0786327487228102, + "grad_norm": 0.8279988765716553, + "learning_rate": 0.00019431156043807652, + "loss": 1.7569, + "step": 5120 + }, + { + "epoch": 1.0807394533101595, + "grad_norm": 0.8030558824539185, + "learning_rate": 0.00019428953241305838, + "loss": 1.7719, + "step": 5130 + }, + { + "epoch": 1.0828461578975088, + "grad_norm": 0.7470248341560364, + "learning_rate": 0.00019426746307304153, + "loss": 1.7697, + "step": 5140 + }, + { + "epoch": 1.0849528624848581, + "grad_norm": 0.8372868299484253, + "learning_rate": 0.00019424535242769606, + "loss": 1.7387, + "step": 5150 + }, + { + "epoch": 1.0870595670722074, + "grad_norm": 0.74366694688797, + "learning_rate": 0.0001942232004867103, + "loss": 1.7477, + "step": 5160 + }, + { + "epoch": 1.0891662716595565, + "grad_norm": 0.9802844524383545, + "learning_rate": 0.00019420100725979058, + "loss": 1.6741, + "step": 5170 + }, + { + "epoch": 1.0912729762469058, + "grad_norm": 0.9001051187515259, + "learning_rate": 0.0001941787727566613, + "loss": 1.7635, + "step": 5180 + }, + { + "epoch": 1.093379680834255, + "grad_norm": 0.8312892317771912, + "learning_rate": 0.00019415649698706507, + "loss": 1.7846, + "step": 5190 + }, + { + "epoch": 1.0954863854216041, + "grad_norm": 0.7780260443687439, + "learning_rate": 0.0001941341799607624, + "loss": 1.7114, + "step": 5200 + }, + { + "epoch": 1.0975930900089534, + "grad_norm": 0.7878116965293884, + "learning_rate": 0.00019411182168753205, + "loss": 1.7928, + "step": 5210 + }, + { + "epoch": 1.0996997945963027, + "grad_norm": 0.7883005738258362, + "learning_rate": 0.0001940894221771708, + "loss": 1.7648, + "step": 5220 + }, + { + "epoch": 1.101806499183652, + "grad_norm": 0.7752858400344849, + "learning_rate": 0.00019406698143949338, + "loss": 1.73, + "step": 5230 + }, + { + "epoch": 1.1039132037710013, + "grad_norm": 0.9342918992042542, + "learning_rate": 0.00019404449948433276, + "loss": 1.783, + "step": 5240 + }, + { + "epoch": 1.1060199083583504, + "grad_norm": 0.7494181394577026, + "learning_rate": 0.00019402197632153992, + "loss": 1.7905, + "step": 5250 + }, + { + "epoch": 1.1081266129456997, + "grad_norm": 0.7930409908294678, + "learning_rate": 0.0001939994119609838, + "loss": 1.7753, + "step": 5260 + }, + { + "epoch": 1.110233317533049, + "grad_norm": 0.7505367994308472, + "learning_rate": 0.0001939768064125515, + "loss": 1.7385, + "step": 5270 + }, + { + "epoch": 1.112340022120398, + "grad_norm": 0.912246823310852, + "learning_rate": 0.00019395415968614813, + "loss": 1.7465, + "step": 5280 + }, + { + "epoch": 1.1144467267077474, + "grad_norm": 0.8005505204200745, + "learning_rate": 0.00019393147179169685, + "loss": 1.7409, + "step": 5290 + }, + { + "epoch": 1.1165534312950967, + "grad_norm": 0.811210036277771, + "learning_rate": 0.00019390874273913884, + "loss": 1.7744, + "step": 5300 + }, + { + "epoch": 1.118660135882446, + "grad_norm": 0.7788394093513489, + "learning_rate": 0.00019388597253843334, + "loss": 1.78, + "step": 5310 + }, + { + "epoch": 1.120766840469795, + "grad_norm": 0.7960549592971802, + "learning_rate": 0.00019386316119955756, + "loss": 1.7695, + "step": 5320 + }, + { + "epoch": 1.1228735450571443, + "grad_norm": 0.7453601956367493, + "learning_rate": 0.00019384030873250682, + "loss": 1.7466, + "step": 5330 + }, + { + "epoch": 1.1249802496444936, + "grad_norm": 0.7400311231613159, + "learning_rate": 0.00019381741514729443, + "loss": 1.7426, + "step": 5340 + }, + { + "epoch": 1.127086954231843, + "grad_norm": 0.7591160535812378, + "learning_rate": 0.00019379448045395167, + "loss": 1.7722, + "step": 5350 + }, + { + "epoch": 1.129193658819192, + "grad_norm": 0.7780054211616516, + "learning_rate": 0.00019377150466252797, + "loss": 1.756, + "step": 5360 + }, + { + "epoch": 1.1313003634065413, + "grad_norm": 0.8573557734489441, + "learning_rate": 0.00019374848778309055, + "loss": 1.751, + "step": 5370 + }, + { + "epoch": 1.1334070679938906, + "grad_norm": 0.8030732870101929, + "learning_rate": 0.0001937254298257248, + "loss": 1.7632, + "step": 5380 + }, + { + "epoch": 1.1355137725812399, + "grad_norm": 0.8074278831481934, + "learning_rate": 0.00019370233080053407, + "loss": 1.7779, + "step": 5390 + }, + { + "epoch": 1.137620477168589, + "grad_norm": 0.7617718577384949, + "learning_rate": 0.0001936791907176397, + "loss": 1.7212, + "step": 5400 + }, + { + "epoch": 1.1397271817559382, + "grad_norm": 0.9997255802154541, + "learning_rate": 0.000193656009587181, + "loss": 1.684, + "step": 5410 + }, + { + "epoch": 1.1418338863432875, + "grad_norm": 0.9460029006004333, + "learning_rate": 0.0001936327874193153, + "loss": 1.8257, + "step": 5420 + }, + { + "epoch": 1.1439405909306368, + "grad_norm": 0.8431273102760315, + "learning_rate": 0.00019360952422421793, + "loss": 1.7567, + "step": 5430 + }, + { + "epoch": 1.146047295517986, + "grad_norm": 0.7903361320495605, + "learning_rate": 0.00019358622001208205, + "loss": 1.7924, + "step": 5440 + }, + { + "epoch": 1.1481540001053352, + "grad_norm": 0.8683080673217773, + "learning_rate": 0.00019356287479311903, + "loss": 1.8095, + "step": 5450 + }, + { + "epoch": 1.1502607046926845, + "grad_norm": 0.8753440976142883, + "learning_rate": 0.00019353948857755803, + "loss": 1.7625, + "step": 5460 + }, + { + "epoch": 1.1523674092800338, + "grad_norm": 0.9305017590522766, + "learning_rate": 0.0001935160613756462, + "loss": 1.7848, + "step": 5470 + }, + { + "epoch": 1.1544741138673829, + "grad_norm": 0.8456583023071289, + "learning_rate": 0.00019349259319764874, + "loss": 1.7629, + "step": 5480 + }, + { + "epoch": 1.1565808184547322, + "grad_norm": 0.8240710496902466, + "learning_rate": 0.00019346908405384867, + "loss": 1.7405, + "step": 5490 + }, + { + "epoch": 1.1586875230420814, + "grad_norm": 0.8952575922012329, + "learning_rate": 0.00019344553395454707, + "loss": 1.7578, + "step": 5500 + }, + { + "epoch": 1.1607942276294307, + "grad_norm": 0.8688398599624634, + "learning_rate": 0.00019342194291006295, + "loss": 1.7185, + "step": 5510 + }, + { + "epoch": 1.1629009322167798, + "grad_norm": 0.8205084800720215, + "learning_rate": 0.00019339831093073318, + "loss": 1.7478, + "step": 5520 + }, + { + "epoch": 1.165007636804129, + "grad_norm": 0.8122640252113342, + "learning_rate": 0.00019337463802691264, + "loss": 1.76, + "step": 5530 + }, + { + "epoch": 1.1671143413914784, + "grad_norm": 0.8044484257698059, + "learning_rate": 0.00019335092420897417, + "loss": 1.7624, + "step": 5540 + }, + { + "epoch": 1.1692210459788277, + "grad_norm": 0.8380444049835205, + "learning_rate": 0.0001933271694873084, + "loss": 1.7228, + "step": 5550 + }, + { + "epoch": 1.1713277505661768, + "grad_norm": 0.8047568202018738, + "learning_rate": 0.00019330337387232408, + "loss": 1.7534, + "step": 5560 + }, + { + "epoch": 1.173434455153526, + "grad_norm": 0.8417034149169922, + "learning_rate": 0.00019327953737444768, + "loss": 1.7706, + "step": 5570 + }, + { + "epoch": 1.1755411597408754, + "grad_norm": 0.8022201657295227, + "learning_rate": 0.00019325566000412376, + "loss": 1.7586, + "step": 5580 + }, + { + "epoch": 1.1776478643282247, + "grad_norm": 0.7806428074836731, + "learning_rate": 0.00019323174177181463, + "loss": 1.7009, + "step": 5590 + }, + { + "epoch": 1.1797545689155737, + "grad_norm": 0.7925698757171631, + "learning_rate": 0.00019320778268800066, + "loss": 1.724, + "step": 5600 + }, + { + "epoch": 1.181861273502923, + "grad_norm": 0.7943428158760071, + "learning_rate": 0.00019318378276318, + "loss": 1.7897, + "step": 5610 + }, + { + "epoch": 1.1839679780902723, + "grad_norm": 0.8270592093467712, + "learning_rate": 0.0001931597420078687, + "loss": 1.7325, + "step": 5620 + }, + { + "epoch": 1.1860746826776216, + "grad_norm": 0.7894768118858337, + "learning_rate": 0.00019313566043260082, + "loss": 1.7382, + "step": 5630 + }, + { + "epoch": 1.1881813872649707, + "grad_norm": 0.9318056702613831, + "learning_rate": 0.0001931115380479281, + "loss": 1.7403, + "step": 5640 + }, + { + "epoch": 1.19028809185232, + "grad_norm": 0.8299248218536377, + "learning_rate": 0.00019308737486442045, + "loss": 1.7449, + "step": 5650 + }, + { + "epoch": 1.1923947964396693, + "grad_norm": 0.8641102910041809, + "learning_rate": 0.00019306317089266535, + "loss": 1.7894, + "step": 5660 + }, + { + "epoch": 1.1945015010270186, + "grad_norm": 0.8760716319084167, + "learning_rate": 0.00019303892614326836, + "loss": 1.8106, + "step": 5670 + }, + { + "epoch": 1.1966082056143676, + "grad_norm": 0.8323030471801758, + "learning_rate": 0.00019301464062685284, + "loss": 1.6792, + "step": 5680 + }, + { + "epoch": 1.198714910201717, + "grad_norm": 0.8567124009132385, + "learning_rate": 0.00019299031435406, + "loss": 1.7333, + "step": 5690 + }, + { + "epoch": 1.2008216147890662, + "grad_norm": 0.8274356722831726, + "learning_rate": 0.00019296594733554892, + "loss": 1.7383, + "step": 5700 + }, + { + "epoch": 1.2029283193764155, + "grad_norm": 0.8282963037490845, + "learning_rate": 0.0001929415395819965, + "loss": 1.7545, + "step": 5710 + }, + { + "epoch": 1.2050350239637646, + "grad_norm": 0.9670590162277222, + "learning_rate": 0.00019291709110409762, + "loss": 1.7156, + "step": 5720 + }, + { + "epoch": 1.207141728551114, + "grad_norm": 0.892984926700592, + "learning_rate": 0.00019289260191256483, + "loss": 1.7138, + "step": 5730 + }, + { + "epoch": 1.2092484331384632, + "grad_norm": 0.8116103410720825, + "learning_rate": 0.00019286807201812867, + "loss": 1.7758, + "step": 5740 + }, + { + "epoch": 1.2113551377258125, + "grad_norm": 0.9112514853477478, + "learning_rate": 0.00019284350143153737, + "loss": 1.7703, + "step": 5750 + }, + { + "epoch": 1.2134618423131616, + "grad_norm": 0.9463765025138855, + "learning_rate": 0.0001928188901635571, + "loss": 1.7667, + "step": 5760 + }, + { + "epoch": 1.2155685469005109, + "grad_norm": 0.9577860832214355, + "learning_rate": 0.0001927942382249718, + "loss": 1.6904, + "step": 5770 + }, + { + "epoch": 1.2176752514878602, + "grad_norm": 0.7787550687789917, + "learning_rate": 0.00019276954562658327, + "loss": 1.7321, + "step": 5780 + }, + { + "epoch": 1.2197819560752095, + "grad_norm": 0.8212473392486572, + "learning_rate": 0.00019274481237921114, + "loss": 1.6719, + "step": 5790 + }, + { + "epoch": 1.2218886606625585, + "grad_norm": 0.8447032570838928, + "learning_rate": 0.00019272003849369273, + "loss": 1.7175, + "step": 5800 + }, + { + "epoch": 1.2239953652499078, + "grad_norm": 0.8335479497909546, + "learning_rate": 0.00019269522398088332, + "loss": 1.8072, + "step": 5810 + }, + { + "epoch": 1.2261020698372571, + "grad_norm": 0.8188406825065613, + "learning_rate": 0.00019267036885165588, + "loss": 1.8029, + "step": 5820 + }, + { + "epoch": 1.2282087744246062, + "grad_norm": 0.9411261677742004, + "learning_rate": 0.00019264547311690128, + "loss": 1.6936, + "step": 5830 + }, + { + "epoch": 1.2303154790119555, + "grad_norm": 0.8654270768165588, + "learning_rate": 0.0001926205367875281, + "loss": 1.74, + "step": 5840 + }, + { + "epoch": 1.2324221835993048, + "grad_norm": 0.8441559672355652, + "learning_rate": 0.0001925955598744627, + "loss": 1.7549, + "step": 5850 + }, + { + "epoch": 1.234528888186654, + "grad_norm": 0.820061206817627, + "learning_rate": 0.0001925705423886493, + "loss": 1.6773, + "step": 5860 + }, + { + "epoch": 1.2366355927740034, + "grad_norm": 0.7828805446624756, + "learning_rate": 0.00019254548434104985, + "loss": 1.7523, + "step": 5870 + }, + { + "epoch": 1.2387422973613524, + "grad_norm": 0.8304197192192078, + "learning_rate": 0.00019252038574264405, + "loss": 1.7367, + "step": 5880 + }, + { + "epoch": 1.2408490019487017, + "grad_norm": 0.8142679333686829, + "learning_rate": 0.0001924952466044294, + "loss": 1.7818, + "step": 5890 + }, + { + "epoch": 1.242955706536051, + "grad_norm": 0.8129520416259766, + "learning_rate": 0.00019247006693742113, + "loss": 1.783, + "step": 5900 + }, + { + "epoch": 1.2450624111234, + "grad_norm": 0.9476183652877808, + "learning_rate": 0.00019244484675265232, + "loss": 1.7761, + "step": 5910 + }, + { + "epoch": 1.2471691157107494, + "grad_norm": 0.8139955997467041, + "learning_rate": 0.00019241958606117373, + "loss": 1.7095, + "step": 5920 + }, + { + "epoch": 1.2492758202980987, + "grad_norm": 0.774530291557312, + "learning_rate": 0.00019239428487405382, + "loss": 1.7884, + "step": 5930 + }, + { + "epoch": 1.251382524885448, + "grad_norm": 0.7705311179161072, + "learning_rate": 0.00019236894320237894, + "loss": 1.7389, + "step": 5940 + }, + { + "epoch": 1.2534892294727973, + "grad_norm": 0.8137152194976807, + "learning_rate": 0.00019234356105725297, + "loss": 1.7329, + "step": 5950 + }, + { + "epoch": 1.2555959340601464, + "grad_norm": 0.8168460726737976, + "learning_rate": 0.00019231813844979777, + "loss": 1.7827, + "step": 5960 + }, + { + "epoch": 1.2577026386474957, + "grad_norm": 0.7712127566337585, + "learning_rate": 0.0001922926753911527, + "loss": 1.7593, + "step": 5970 + }, + { + "epoch": 1.259809343234845, + "grad_norm": 0.8201771974563599, + "learning_rate": 0.00019226717189247503, + "loss": 1.7471, + "step": 5980 + }, + { + "epoch": 1.261916047822194, + "grad_norm": 0.8837641477584839, + "learning_rate": 0.00019224162796493968, + "loss": 1.7401, + "step": 5990 + }, + { + "epoch": 1.2640227524095433, + "grad_norm": 0.8871500492095947, + "learning_rate": 0.00019221604361973919, + "loss": 1.7056, + "step": 6000 + }, + { + "epoch": 1.2661294569968926, + "grad_norm": 0.8061213493347168, + "learning_rate": 0.00019219041886808392, + "loss": 1.7369, + "step": 6010 + }, + { + "epoch": 1.268236161584242, + "grad_norm": 0.8275548219680786, + "learning_rate": 0.00019216475372120197, + "loss": 1.7914, + "step": 6020 + }, + { + "epoch": 1.2703428661715912, + "grad_norm": 0.8748604655265808, + "learning_rate": 0.00019213904819033903, + "loss": 1.696, + "step": 6030 + }, + { + "epoch": 1.2724495707589403, + "grad_norm": 0.9023202061653137, + "learning_rate": 0.00019211330228675855, + "loss": 1.7967, + "step": 6040 + }, + { + "epoch": 1.2745562753462896, + "grad_norm": 0.8412946462631226, + "learning_rate": 0.00019208751602174163, + "loss": 1.7548, + "step": 6050 + }, + { + "epoch": 1.2766629799336389, + "grad_norm": 0.8570536971092224, + "learning_rate": 0.00019206168940658712, + "loss": 1.7311, + "step": 6060 + }, + { + "epoch": 1.278769684520988, + "grad_norm": 0.8899838924407959, + "learning_rate": 0.00019203582245261148, + "loss": 1.7736, + "step": 6070 + }, + { + "epoch": 1.2808763891083372, + "grad_norm": 0.828941285610199, + "learning_rate": 0.0001920099151711489, + "loss": 1.7372, + "step": 6080 + }, + { + "epoch": 1.2829830936956865, + "grad_norm": 0.8117388486862183, + "learning_rate": 0.00019198396757355118, + "loss": 1.8239, + "step": 6090 + }, + { + "epoch": 1.2850897982830358, + "grad_norm": 0.9626308679580688, + "learning_rate": 0.00019195797967118785, + "loss": 1.7581, + "step": 6100 + }, + { + "epoch": 1.2871965028703851, + "grad_norm": 0.7775184512138367, + "learning_rate": 0.00019193195147544607, + "loss": 1.7391, + "step": 6110 + }, + { + "epoch": 1.2893032074577342, + "grad_norm": 0.7835120558738708, + "learning_rate": 0.00019190588299773062, + "loss": 1.7508, + "step": 6120 + }, + { + "epoch": 1.2914099120450835, + "grad_norm": 0.8474375605583191, + "learning_rate": 0.00019187977424946405, + "loss": 1.8031, + "step": 6130 + }, + { + "epoch": 1.2935166166324328, + "grad_norm": 0.8936663269996643, + "learning_rate": 0.0001918536252420864, + "loss": 1.6969, + "step": 6140 + }, + { + "epoch": 1.2956233212197819, + "grad_norm": 0.7543231844902039, + "learning_rate": 0.00019182743598705542, + "loss": 1.7573, + "step": 6150 + }, + { + "epoch": 1.2977300258071311, + "grad_norm": 0.7518043518066406, + "learning_rate": 0.00019180120649584653, + "loss": 1.7773, + "step": 6160 + }, + { + "epoch": 1.2998367303944804, + "grad_norm": 0.7887030243873596, + "learning_rate": 0.00019177493677995276, + "loss": 1.7501, + "step": 6170 + }, + { + "epoch": 1.3019434349818297, + "grad_norm": 0.7868638038635254, + "learning_rate": 0.00019174862685088472, + "loss": 1.7665, + "step": 6180 + }, + { + "epoch": 1.304050139569179, + "grad_norm": 0.8161253929138184, + "learning_rate": 0.0001917222767201707, + "loss": 1.76, + "step": 6190 + }, + { + "epoch": 1.306156844156528, + "grad_norm": 0.8441837430000305, + "learning_rate": 0.00019169588639935658, + "loss": 1.7396, + "step": 6200 + }, + { + "epoch": 1.3082635487438774, + "grad_norm": 0.8492400050163269, + "learning_rate": 0.00019166945590000584, + "loss": 1.7393, + "step": 6210 + }, + { + "epoch": 1.3103702533312267, + "grad_norm": 0.8949893712997437, + "learning_rate": 0.00019164298523369956, + "loss": 1.7993, + "step": 6220 + }, + { + "epoch": 1.3124769579185758, + "grad_norm": 0.8137794733047485, + "learning_rate": 0.00019161647441203646, + "loss": 1.7052, + "step": 6230 + }, + { + "epoch": 1.314583662505925, + "grad_norm": 0.8320265412330627, + "learning_rate": 0.0001915899234466328, + "loss": 1.7183, + "step": 6240 + }, + { + "epoch": 1.3166903670932744, + "grad_norm": 0.759221613407135, + "learning_rate": 0.00019156333234912252, + "loss": 1.7243, + "step": 6250 + }, + { + "epoch": 1.3187970716806237, + "grad_norm": 0.8010006546974182, + "learning_rate": 0.00019153670113115703, + "loss": 1.7496, + "step": 6260 + }, + { + "epoch": 1.320903776267973, + "grad_norm": 0.8241715431213379, + "learning_rate": 0.0001915100298044054, + "loss": 1.7967, + "step": 6270 + }, + { + "epoch": 1.323010480855322, + "grad_norm": 0.9702495336532593, + "learning_rate": 0.00019148331838055423, + "loss": 1.7791, + "step": 6280 + }, + { + "epoch": 1.3251171854426713, + "grad_norm": 0.814001739025116, + "learning_rate": 0.0001914565668713077, + "loss": 1.7507, + "step": 6290 + }, + { + "epoch": 1.3272238900300206, + "grad_norm": 0.8167542815208435, + "learning_rate": 0.00019142977528838762, + "loss": 1.7606, + "step": 6300 + }, + { + "epoch": 1.3293305946173697, + "grad_norm": 0.8223824501037598, + "learning_rate": 0.00019140294364353324, + "loss": 1.6936, + "step": 6310 + }, + { + "epoch": 1.331437299204719, + "grad_norm": 0.8717813491821289, + "learning_rate": 0.00019137607194850146, + "loss": 1.7051, + "step": 6320 + }, + { + "epoch": 1.3335440037920683, + "grad_norm": 0.8642485737800598, + "learning_rate": 0.00019134916021506666, + "loss": 1.7115, + "step": 6330 + }, + { + "epoch": 1.3356507083794176, + "grad_norm": 0.851425290107727, + "learning_rate": 0.00019132220845502086, + "loss": 1.7759, + "step": 6340 + }, + { + "epoch": 1.3377574129667669, + "grad_norm": 0.8192858099937439, + "learning_rate": 0.00019129521668017347, + "loss": 1.7782, + "step": 6350 + }, + { + "epoch": 1.339864117554116, + "grad_norm": 0.8982135057449341, + "learning_rate": 0.0001912681849023516, + "loss": 1.7843, + "step": 6360 + }, + { + "epoch": 1.3419708221414652, + "grad_norm": 0.8544314503669739, + "learning_rate": 0.00019124111313339976, + "loss": 1.7804, + "step": 6370 + }, + { + "epoch": 1.3440775267288145, + "grad_norm": 0.8088465929031372, + "learning_rate": 0.00019121400138518007, + "loss": 1.7797, + "step": 6380 + }, + { + "epoch": 1.3461842313161636, + "grad_norm": 0.8711520433425903, + "learning_rate": 0.00019118684966957207, + "loss": 1.6855, + "step": 6390 + }, + { + "epoch": 1.348290935903513, + "grad_norm": 0.8697302937507629, + "learning_rate": 0.00019115965799847292, + "loss": 1.7711, + "step": 6400 + }, + { + "epoch": 1.3503976404908622, + "grad_norm": 0.7881453037261963, + "learning_rate": 0.00019113242638379725, + "loss": 1.7652, + "step": 6410 + }, + { + "epoch": 1.3525043450782115, + "grad_norm": 0.8739869594573975, + "learning_rate": 0.00019110515483747716, + "loss": 1.7741, + "step": 6420 + }, + { + "epoch": 1.3546110496655608, + "grad_norm": 0.9500364661216736, + "learning_rate": 0.0001910778433714622, + "loss": 1.7113, + "step": 6430 + }, + { + "epoch": 1.3567177542529099, + "grad_norm": 0.8580523729324341, + "learning_rate": 0.00019105049199771962, + "loss": 1.7662, + "step": 6440 + }, + { + "epoch": 1.3588244588402592, + "grad_norm": 0.7940788865089417, + "learning_rate": 0.00019102310072823393, + "loss": 1.7336, + "step": 6450 + }, + { + "epoch": 1.3609311634276082, + "grad_norm": 0.80447918176651, + "learning_rate": 0.0001909956695750072, + "loss": 1.6681, + "step": 6460 + }, + { + "epoch": 1.3630378680149575, + "grad_norm": 0.8043936491012573, + "learning_rate": 0.000190968198550059, + "loss": 1.7039, + "step": 6470 + }, + { + "epoch": 1.3651445726023068, + "grad_norm": 0.8359491229057312, + "learning_rate": 0.0001909406876654264, + "loss": 1.711, + "step": 6480 + }, + { + "epoch": 1.3672512771896561, + "grad_norm": 0.8909677267074585, + "learning_rate": 0.00019091313693316383, + "loss": 1.7488, + "step": 6490 + }, + { + "epoch": 1.3693579817770054, + "grad_norm": 0.9276780486106873, + "learning_rate": 0.00019088554636534323, + "loss": 1.7168, + "step": 6500 + }, + { + "epoch": 1.3714646863643545, + "grad_norm": 0.8368661403656006, + "learning_rate": 0.00019085791597405404, + "loss": 1.8085, + "step": 6510 + }, + { + "epoch": 1.3735713909517038, + "grad_norm": 1.0867797136306763, + "learning_rate": 0.0001908302457714031, + "loss": 1.7569, + "step": 6520 + }, + { + "epoch": 1.375678095539053, + "grad_norm": 0.7681411504745483, + "learning_rate": 0.00019080253576951473, + "loss": 1.7583, + "step": 6530 + }, + { + "epoch": 1.3777848001264021, + "grad_norm": 0.8406849503517151, + "learning_rate": 0.00019077478598053063, + "loss": 1.7632, + "step": 6540 + }, + { + "epoch": 1.3798915047137514, + "grad_norm": 0.771959125995636, + "learning_rate": 0.00019074699641661, + "loss": 1.6977, + "step": 6550 + }, + { + "epoch": 1.3819982093011007, + "grad_norm": 0.8810305595397949, + "learning_rate": 0.00019071916708992943, + "loss": 1.7648, + "step": 6560 + }, + { + "epoch": 1.38410491388845, + "grad_norm": 0.7824996709823608, + "learning_rate": 0.00019069129801268294, + "loss": 1.8061, + "step": 6570 + }, + { + "epoch": 1.3862116184757993, + "grad_norm": 0.822420060634613, + "learning_rate": 0.00019066338919708197, + "loss": 1.7184, + "step": 6580 + }, + { + "epoch": 1.3883183230631484, + "grad_norm": 0.8344022035598755, + "learning_rate": 0.00019063544065535534, + "loss": 1.7274, + "step": 6590 + }, + { + "epoch": 1.3904250276504977, + "grad_norm": 0.8080721497535706, + "learning_rate": 0.00019060745239974936, + "loss": 1.7137, + "step": 6600 + }, + { + "epoch": 1.392531732237847, + "grad_norm": 0.8242797255516052, + "learning_rate": 0.0001905794244425277, + "loss": 1.7193, + "step": 6610 + }, + { + "epoch": 1.394638436825196, + "grad_norm": 0.8020373582839966, + "learning_rate": 0.00019055135679597136, + "loss": 1.7942, + "step": 6620 + }, + { + "epoch": 1.3967451414125454, + "grad_norm": 0.8053699731826782, + "learning_rate": 0.0001905232494723788, + "loss": 1.7781, + "step": 6630 + }, + { + "epoch": 1.3988518459998946, + "grad_norm": 0.8109655380249023, + "learning_rate": 0.00019049510248406586, + "loss": 1.7311, + "step": 6640 + }, + { + "epoch": 1.400958550587244, + "grad_norm": 0.8344601392745972, + "learning_rate": 0.00019046691584336577, + "loss": 1.7843, + "step": 6650 + }, + { + "epoch": 1.4030652551745932, + "grad_norm": 0.7590084075927734, + "learning_rate": 0.0001904386895626291, + "loss": 1.7729, + "step": 6660 + }, + { + "epoch": 1.4051719597619423, + "grad_norm": 0.7660609483718872, + "learning_rate": 0.0001904104236542238, + "loss": 1.7351, + "step": 6670 + }, + { + "epoch": 1.4072786643492916, + "grad_norm": 0.8946515917778015, + "learning_rate": 0.0001903821181305352, + "loss": 1.7527, + "step": 6680 + }, + { + "epoch": 1.409385368936641, + "grad_norm": 0.8283473253250122, + "learning_rate": 0.00019035377300396597, + "loss": 1.7567, + "step": 6690 + }, + { + "epoch": 1.41149207352399, + "grad_norm": 0.7549485564231873, + "learning_rate": 0.00019032538828693616, + "loss": 1.7559, + "step": 6700 + }, + { + "epoch": 1.4135987781113393, + "grad_norm": 0.8404794931411743, + "learning_rate": 0.0001902969639918831, + "loss": 1.7181, + "step": 6710 + }, + { + "epoch": 1.4157054826986886, + "grad_norm": 0.826153576374054, + "learning_rate": 0.00019026850013126157, + "loss": 1.7724, + "step": 6720 + }, + { + "epoch": 1.4178121872860379, + "grad_norm": 0.8652869462966919, + "learning_rate": 0.0001902399967175436, + "loss": 1.6797, + "step": 6730 + }, + { + "epoch": 1.4199188918733872, + "grad_norm": 0.8194015622138977, + "learning_rate": 0.00019021145376321857, + "loss": 1.7175, + "step": 6740 + }, + { + "epoch": 1.4220255964607362, + "grad_norm": 0.8753089904785156, + "learning_rate": 0.0001901828712807932, + "loss": 1.756, + "step": 6750 + }, + { + "epoch": 1.4241323010480855, + "grad_norm": 0.7799382209777832, + "learning_rate": 0.0001901542492827915, + "loss": 1.7679, + "step": 6760 + }, + { + "epoch": 1.4262390056354348, + "grad_norm": 0.805819571018219, + "learning_rate": 0.00019012558778175485, + "loss": 1.7729, + "step": 6770 + }, + { + "epoch": 1.428345710222784, + "grad_norm": 0.7864932417869568, + "learning_rate": 0.0001900968867902419, + "loss": 1.7852, + "step": 6780 + }, + { + "epoch": 1.4304524148101332, + "grad_norm": 0.8187804222106934, + "learning_rate": 0.00019006814632082863, + "loss": 1.6702, + "step": 6790 + }, + { + "epoch": 1.4325591193974825, + "grad_norm": 0.7939268946647644, + "learning_rate": 0.00019003936638610828, + "loss": 1.6653, + "step": 6800 + }, + { + "epoch": 1.4346658239848318, + "grad_norm": 0.8125925660133362, + "learning_rate": 0.00019001054699869133, + "loss": 1.7386, + "step": 6810 + }, + { + "epoch": 1.436772528572181, + "grad_norm": 0.8372388482093811, + "learning_rate": 0.00018998168817120577, + "loss": 1.7444, + "step": 6820 + }, + { + "epoch": 1.4388792331595301, + "grad_norm": 0.83929842710495, + "learning_rate": 0.00018995278991629658, + "loss": 1.751, + "step": 6830 + }, + { + "epoch": 1.4409859377468794, + "grad_norm": 0.7986190319061279, + "learning_rate": 0.00018992385224662623, + "loss": 1.746, + "step": 6840 + }, + { + "epoch": 1.4430926423342287, + "grad_norm": 0.8739672303199768, + "learning_rate": 0.00018989487517487435, + "loss": 1.7579, + "step": 6850 + }, + { + "epoch": 1.4451993469215778, + "grad_norm": 0.9071808457374573, + "learning_rate": 0.0001898658587137379, + "loss": 1.7034, + "step": 6860 + }, + { + "epoch": 1.447306051508927, + "grad_norm": 0.8445550799369812, + "learning_rate": 0.00018983680287593105, + "loss": 1.7639, + "step": 6870 + }, + { + "epoch": 1.4494127560962764, + "grad_norm": 0.8497149348258972, + "learning_rate": 0.00018980770767418526, + "loss": 1.7695, + "step": 6880 + }, + { + "epoch": 1.4515194606836257, + "grad_norm": 0.7921140789985657, + "learning_rate": 0.00018977857312124923, + "loss": 1.7391, + "step": 6890 + }, + { + "epoch": 1.453626165270975, + "grad_norm": 0.7450403571128845, + "learning_rate": 0.00018974939922988883, + "loss": 1.7423, + "step": 6900 + }, + { + "epoch": 1.455732869858324, + "grad_norm": 0.8325957655906677, + "learning_rate": 0.0001897201860128873, + "loss": 1.79, + "step": 6910 + }, + { + "epoch": 1.4578395744456734, + "grad_norm": 0.9251476526260376, + "learning_rate": 0.00018969093348304505, + "loss": 1.7301, + "step": 6920 + }, + { + "epoch": 1.4599462790330227, + "grad_norm": 0.8292146325111389, + "learning_rate": 0.00018966164165317966, + "loss": 1.7738, + "step": 6930 + }, + { + "epoch": 1.4620529836203717, + "grad_norm": 0.8368434309959412, + "learning_rate": 0.000189632310536126, + "loss": 1.7129, + "step": 6940 + }, + { + "epoch": 1.464159688207721, + "grad_norm": 0.8482025861740112, + "learning_rate": 0.00018960294014473615, + "loss": 1.778, + "step": 6950 + }, + { + "epoch": 1.4662663927950703, + "grad_norm": 0.823621392250061, + "learning_rate": 0.00018957353049187936, + "loss": 1.7401, + "step": 6960 + }, + { + "epoch": 1.4683730973824196, + "grad_norm": 0.7859732508659363, + "learning_rate": 0.0001895440815904421, + "loss": 1.7523, + "step": 6970 + }, + { + "epoch": 1.470479801969769, + "grad_norm": 0.9153429865837097, + "learning_rate": 0.00018951459345332807, + "loss": 1.7763, + "step": 6980 + }, + { + "epoch": 1.472586506557118, + "grad_norm": 0.8432143926620483, + "learning_rate": 0.00018948506609345813, + "loss": 1.733, + "step": 6990 + }, + { + "epoch": 1.4746932111444673, + "grad_norm": 0.8629891276359558, + "learning_rate": 0.0001894554995237703, + "loss": 1.7545, + "step": 7000 + }, + { + "epoch": 1.4767999157318166, + "grad_norm": 0.8362427353858948, + "learning_rate": 0.00018942589375721985, + "loss": 1.7524, + "step": 7010 + }, + { + "epoch": 1.4789066203191656, + "grad_norm": 0.7721174359321594, + "learning_rate": 0.00018939624880677918, + "loss": 1.7503, + "step": 7020 + }, + { + "epoch": 1.481013324906515, + "grad_norm": 0.8513792753219604, + "learning_rate": 0.00018936656468543784, + "loss": 1.7612, + "step": 7030 + }, + { + "epoch": 1.4831200294938642, + "grad_norm": 0.804923415184021, + "learning_rate": 0.00018933684140620257, + "loss": 1.74, + "step": 7040 + }, + { + "epoch": 1.4852267340812135, + "grad_norm": 0.88444584608078, + "learning_rate": 0.00018930707898209733, + "loss": 1.7494, + "step": 7050 + }, + { + "epoch": 1.4873334386685628, + "grad_norm": 0.8222391605377197, + "learning_rate": 0.00018927727742616313, + "loss": 1.7207, + "step": 7060 + }, + { + "epoch": 1.489440143255912, + "grad_norm": 0.8316041231155396, + "learning_rate": 0.00018924743675145813, + "loss": 1.7651, + "step": 7070 + }, + { + "epoch": 1.4915468478432612, + "grad_norm": 0.847623348236084, + "learning_rate": 0.0001892175569710577, + "loss": 1.8181, + "step": 7080 + }, + { + "epoch": 1.4936535524306105, + "grad_norm": 0.8623965382575989, + "learning_rate": 0.00018918763809805435, + "loss": 1.6863, + "step": 7090 + }, + { + "epoch": 1.4957602570179596, + "grad_norm": 0.882828950881958, + "learning_rate": 0.00018915768014555762, + "loss": 1.7619, + "step": 7100 + }, + { + "epoch": 1.4978669616053089, + "grad_norm": 0.9039724469184875, + "learning_rate": 0.00018912768312669424, + "loss": 1.7102, + "step": 7110 + }, + { + "epoch": 1.4999736661926582, + "grad_norm": 0.7769089341163635, + "learning_rate": 0.0001890976470546081, + "loss": 1.7189, + "step": 7120 + }, + { + "epoch": 1.5020803707800074, + "grad_norm": 0.7899691462516785, + "learning_rate": 0.00018906757194246012, + "loss": 1.7211, + "step": 7130 + }, + { + "epoch": 1.5041870753673567, + "grad_norm": 0.7718929052352905, + "learning_rate": 0.00018903745780342839, + "loss": 1.6544, + "step": 7140 + }, + { + "epoch": 1.5062937799547058, + "grad_norm": 0.8393471837043762, + "learning_rate": 0.00018900730465070802, + "loss": 1.7581, + "step": 7150 + }, + { + "epoch": 1.508400484542055, + "grad_norm": 0.845622181892395, + "learning_rate": 0.0001889771124975113, + "loss": 1.6994, + "step": 7160 + }, + { + "epoch": 1.5105071891294042, + "grad_norm": 0.7832208275794983, + "learning_rate": 0.0001889468813570676, + "loss": 1.7505, + "step": 7170 + }, + { + "epoch": 1.5126138937167535, + "grad_norm": 0.8510347604751587, + "learning_rate": 0.00018891661124262327, + "loss": 1.7472, + "step": 7180 + }, + { + "epoch": 1.5147205983041028, + "grad_norm": 0.8204689621925354, + "learning_rate": 0.00018888630216744193, + "loss": 1.7411, + "step": 7190 + }, + { + "epoch": 1.516827302891452, + "grad_norm": 0.852558434009552, + "learning_rate": 0.00018885595414480405, + "loss": 1.7592, + "step": 7200 + }, + { + "epoch": 1.5189340074788014, + "grad_norm": 0.8407373428344727, + "learning_rate": 0.0001888255671880073, + "loss": 1.7416, + "step": 7210 + }, + { + "epoch": 1.5210407120661507, + "grad_norm": 0.7926289439201355, + "learning_rate": 0.00018879514131036644, + "loss": 1.7593, + "step": 7220 + }, + { + "epoch": 1.5231474166534997, + "grad_norm": 0.9050236940383911, + "learning_rate": 0.00018876467652521317, + "loss": 1.7416, + "step": 7230 + }, + { + "epoch": 1.525254121240849, + "grad_norm": 0.9141523241996765, + "learning_rate": 0.00018873417284589629, + "loss": 1.7551, + "step": 7240 + }, + { + "epoch": 1.527360825828198, + "grad_norm": 0.8591190576553345, + "learning_rate": 0.00018870363028578168, + "loss": 1.7524, + "step": 7250 + }, + { + "epoch": 1.5294675304155474, + "grad_norm": 0.8687646389007568, + "learning_rate": 0.0001886730488582522, + "loss": 1.6681, + "step": 7260 + }, + { + "epoch": 1.5315742350028967, + "grad_norm": 0.8512703776359558, + "learning_rate": 0.0001886424285767078, + "loss": 1.8276, + "step": 7270 + }, + { + "epoch": 1.533680939590246, + "grad_norm": 0.8147086501121521, + "learning_rate": 0.0001886117694545654, + "loss": 1.7959, + "step": 7280 + }, + { + "epoch": 1.5357876441775953, + "grad_norm": 0.8092646598815918, + "learning_rate": 0.0001885810715052589, + "loss": 1.7359, + "step": 7290 + }, + { + "epoch": 1.5378943487649446, + "grad_norm": 0.8645735383033752, + "learning_rate": 0.00018855033474223936, + "loss": 1.7274, + "step": 7300 + }, + { + "epoch": 1.5400010533522936, + "grad_norm": 0.7911597490310669, + "learning_rate": 0.0001885195591789747, + "loss": 1.7509, + "step": 7310 + }, + { + "epoch": 1.542107757939643, + "grad_norm": 0.8070617914199829, + "learning_rate": 0.00018848874482894993, + "loss": 1.8309, + "step": 7320 + }, + { + "epoch": 1.544214462526992, + "grad_norm": 0.8313998579978943, + "learning_rate": 0.00018845789170566702, + "loss": 1.776, + "step": 7330 + }, + { + "epoch": 1.5463211671143413, + "grad_norm": 0.7963941097259521, + "learning_rate": 0.00018842699982264492, + "loss": 1.7448, + "step": 7340 + }, + { + "epoch": 1.5484278717016906, + "grad_norm": 0.8401038646697998, + "learning_rate": 0.0001883960691934196, + "loss": 1.7501, + "step": 7350 + }, + { + "epoch": 1.55053457628904, + "grad_norm": 0.8527158498764038, + "learning_rate": 0.0001883650998315439, + "loss": 1.734, + "step": 7360 + }, + { + "epoch": 1.5526412808763892, + "grad_norm": 0.8697671294212341, + "learning_rate": 0.00018833409175058786, + "loss": 1.7513, + "step": 7370 + }, + { + "epoch": 1.5547479854637385, + "grad_norm": 0.7973361015319824, + "learning_rate": 0.00018830304496413822, + "loss": 1.7337, + "step": 7380 + }, + { + "epoch": 1.5568546900510876, + "grad_norm": 0.9423846006393433, + "learning_rate": 0.00018827195948579886, + "loss": 1.6702, + "step": 7390 + }, + { + "epoch": 1.5589613946384369, + "grad_norm": 0.8513593673706055, + "learning_rate": 0.00018824083532919056, + "loss": 1.7395, + "step": 7400 + }, + { + "epoch": 1.561068099225786, + "grad_norm": 0.9370805025100708, + "learning_rate": 0.000188209672507951, + "loss": 1.7967, + "step": 7410 + }, + { + "epoch": 1.5631748038131352, + "grad_norm": 0.8711183667182922, + "learning_rate": 0.00018817847103573486, + "loss": 1.7464, + "step": 7420 + }, + { + "epoch": 1.5652815084004845, + "grad_norm": 0.8438371419906616, + "learning_rate": 0.00018814723092621375, + "loss": 1.7417, + "step": 7430 + }, + { + "epoch": 1.5673882129878338, + "grad_norm": 0.7928584218025208, + "learning_rate": 0.00018811595219307622, + "loss": 1.729, + "step": 7440 + }, + { + "epoch": 1.5694949175751831, + "grad_norm": 1.1181422472000122, + "learning_rate": 0.00018808463485002767, + "loss": 1.784, + "step": 7450 + }, + { + "epoch": 1.5716016221625324, + "grad_norm": 0.7837269902229309, + "learning_rate": 0.00018805327891079055, + "loss": 1.7821, + "step": 7460 + }, + { + "epoch": 1.5737083267498815, + "grad_norm": 0.8478876948356628, + "learning_rate": 0.00018802188438910405, + "loss": 1.7542, + "step": 7470 + }, + { + "epoch": 1.5758150313372308, + "grad_norm": 0.8075322508811951, + "learning_rate": 0.0001879904512987244, + "loss": 1.7268, + "step": 7480 + }, + { + "epoch": 1.5779217359245798, + "grad_norm": 1.1477035284042358, + "learning_rate": 0.00018795897965342474, + "loss": 1.759, + "step": 7490 + }, + { + "epoch": 1.5800284405119291, + "grad_norm": 0.9190003275871277, + "learning_rate": 0.000187927469466995, + "loss": 1.8034, + "step": 7500 + }, + { + "epoch": 1.5821351450992784, + "grad_norm": 0.8050284385681152, + "learning_rate": 0.00018789592075324203, + "loss": 1.7384, + "step": 7510 + }, + { + "epoch": 1.5842418496866277, + "grad_norm": 0.9296058416366577, + "learning_rate": 0.0001878643335259896, + "loss": 1.747, + "step": 7520 + }, + { + "epoch": 1.586348554273977, + "grad_norm": 0.8164471983909607, + "learning_rate": 0.00018783270779907838, + "loss": 1.7186, + "step": 7530 + }, + { + "epoch": 1.5884552588613263, + "grad_norm": 0.8429021239280701, + "learning_rate": 0.0001878010435863658, + "loss": 1.7842, + "step": 7540 + }, + { + "epoch": 1.5905619634486754, + "grad_norm": 0.8415732979774475, + "learning_rate": 0.00018776934090172627, + "loss": 1.7409, + "step": 7550 + }, + { + "epoch": 1.5926686680360247, + "grad_norm": 0.8081998229026794, + "learning_rate": 0.00018773759975905098, + "loss": 1.7561, + "step": 7560 + }, + { + "epoch": 1.5947753726233738, + "grad_norm": 0.8396217226982117, + "learning_rate": 0.00018770582017224802, + "loss": 1.7832, + "step": 7570 + }, + { + "epoch": 1.596882077210723, + "grad_norm": 0.9396729469299316, + "learning_rate": 0.0001876740021552423, + "loss": 1.7721, + "step": 7580 + }, + { + "epoch": 1.5989887817980724, + "grad_norm": 0.8377107977867126, + "learning_rate": 0.00018764214572197552, + "loss": 1.7757, + "step": 7590 + }, + { + "epoch": 1.6010954863854217, + "grad_norm": 0.8287121653556824, + "learning_rate": 0.00018761025088640632, + "loss": 1.7319, + "step": 7600 + }, + { + "epoch": 1.603202190972771, + "grad_norm": 0.9031893610954285, + "learning_rate": 0.00018757831766251016, + "loss": 1.7783, + "step": 7610 + }, + { + "epoch": 1.6053088955601202, + "grad_norm": 0.8598982095718384, + "learning_rate": 0.00018754634606427914, + "loss": 1.8231, + "step": 7620 + }, + { + "epoch": 1.6074156001474693, + "grad_norm": 0.8003443479537964, + "learning_rate": 0.00018751433610572242, + "loss": 1.7699, + "step": 7630 + }, + { + "epoch": 1.6095223047348186, + "grad_norm": 0.8151317834854126, + "learning_rate": 0.00018748228780086579, + "loss": 1.765, + "step": 7640 + }, + { + "epoch": 1.6116290093221677, + "grad_norm": 0.825056791305542, + "learning_rate": 0.00018745020116375197, + "loss": 1.7585, + "step": 7650 + }, + { + "epoch": 1.613735713909517, + "grad_norm": 0.8297179937362671, + "learning_rate": 0.00018741807620844037, + "loss": 1.7263, + "step": 7660 + }, + { + "epoch": 1.6158424184968663, + "grad_norm": 0.8049089312553406, + "learning_rate": 0.0001873859129490072, + "loss": 1.7333, + "step": 7670 + }, + { + "epoch": 1.6179491230842156, + "grad_norm": 0.817482590675354, + "learning_rate": 0.00018735371139954558, + "loss": 1.6743, + "step": 7680 + }, + { + "epoch": 1.6200558276715649, + "grad_norm": 0.8606418371200562, + "learning_rate": 0.00018732147157416527, + "loss": 1.7371, + "step": 7690 + }, + { + "epoch": 1.6221625322589142, + "grad_norm": 0.8428048491477966, + "learning_rate": 0.00018728919348699283, + "loss": 1.7423, + "step": 7700 + }, + { + "epoch": 1.6242692368462632, + "grad_norm": 0.9453518986701965, + "learning_rate": 0.00018725687715217163, + "loss": 1.7329, + "step": 7710 + }, + { + "epoch": 1.6263759414336125, + "grad_norm": 0.8569479584693909, + "learning_rate": 0.0001872245225838617, + "loss": 1.7658, + "step": 7720 + }, + { + "epoch": 1.6284826460209616, + "grad_norm": 0.8008100390434265, + "learning_rate": 0.00018719212979624, + "loss": 1.7586, + "step": 7730 + }, + { + "epoch": 1.630589350608311, + "grad_norm": 0.8791933655738831, + "learning_rate": 0.0001871596988035001, + "loss": 1.8345, + "step": 7740 + }, + { + "epoch": 1.6326960551956602, + "grad_norm": 0.848496675491333, + "learning_rate": 0.0001871272296198523, + "loss": 1.7011, + "step": 7750 + }, + { + "epoch": 1.6348027597830095, + "grad_norm": 0.8242083191871643, + "learning_rate": 0.0001870947222595237, + "loss": 1.7555, + "step": 7760 + }, + { + "epoch": 1.6369094643703588, + "grad_norm": 0.8481186032295227, + "learning_rate": 0.00018706217673675811, + "loss": 1.7048, + "step": 7770 + }, + { + "epoch": 1.6390161689577079, + "grad_norm": 0.8180390000343323, + "learning_rate": 0.00018702959306581604, + "loss": 1.6859, + "step": 7780 + }, + { + "epoch": 1.6411228735450571, + "grad_norm": 0.8697353601455688, + "learning_rate": 0.00018699697126097476, + "loss": 1.7215, + "step": 7790 + }, + { + "epoch": 1.6432295781324062, + "grad_norm": 0.8706187605857849, + "learning_rate": 0.00018696431133652817, + "loss": 1.6866, + "step": 7800 + }, + { + "epoch": 1.6453362827197555, + "grad_norm": 0.8431823253631592, + "learning_rate": 0.00018693161330678696, + "loss": 1.675, + "step": 7810 + }, + { + "epoch": 1.6474429873071048, + "grad_norm": 0.8259839415550232, + "learning_rate": 0.0001868988771860785, + "loss": 1.7079, + "step": 7820 + }, + { + "epoch": 1.649549691894454, + "grad_norm": 0.805834949016571, + "learning_rate": 0.00018686610298874676, + "loss": 1.7089, + "step": 7830 + }, + { + "epoch": 1.6516563964818034, + "grad_norm": 0.7511940598487854, + "learning_rate": 0.00018683329072915252, + "loss": 1.719, + "step": 7840 + }, + { + "epoch": 1.6537631010691527, + "grad_norm": 0.8287107348442078, + "learning_rate": 0.00018680044042167318, + "loss": 1.7804, + "step": 7850 + }, + { + "epoch": 1.6558698056565018, + "grad_norm": 0.8526592254638672, + "learning_rate": 0.00018676755208070275, + "loss": 1.7506, + "step": 7860 + }, + { + "epoch": 1.657976510243851, + "grad_norm": 0.8048674464225769, + "learning_rate": 0.00018673462572065205, + "loss": 1.7624, + "step": 7870 + }, + { + "epoch": 1.6600832148312001, + "grad_norm": 0.8441417813301086, + "learning_rate": 0.0001867016613559484, + "loss": 1.7617, + "step": 7880 + }, + { + "epoch": 1.6621899194185494, + "grad_norm": 0.8121191263198853, + "learning_rate": 0.00018666865900103597, + "loss": 1.7087, + "step": 7890 + }, + { + "epoch": 1.6642966240058987, + "grad_norm": 0.8484655618667603, + "learning_rate": 0.00018663561867037534, + "loss": 1.7656, + "step": 7900 + }, + { + "epoch": 1.666403328593248, + "grad_norm": 0.9858533143997192, + "learning_rate": 0.00018660254037844388, + "loss": 1.6809, + "step": 7910 + }, + { + "epoch": 1.6685100331805973, + "grad_norm": 0.7125934362411499, + "learning_rate": 0.00018656942413973555, + "loss": 1.7411, + "step": 7920 + }, + { + "epoch": 1.6706167377679466, + "grad_norm": 0.9220843315124512, + "learning_rate": 0.000186536269968761, + "loss": 1.8265, + "step": 7930 + }, + { + "epoch": 1.6727234423552957, + "grad_norm": 0.934085488319397, + "learning_rate": 0.00018650307788004735, + "loss": 1.7728, + "step": 7940 + }, + { + "epoch": 1.674830146942645, + "grad_norm": 0.8358553647994995, + "learning_rate": 0.00018646984788813856, + "loss": 1.7261, + "step": 7950 + }, + { + "epoch": 1.676936851529994, + "grad_norm": 0.8404299020767212, + "learning_rate": 0.00018643658000759493, + "loss": 1.7528, + "step": 7960 + }, + { + "epoch": 1.6790435561173433, + "grad_norm": 0.8132160305976868, + "learning_rate": 0.00018640327425299363, + "loss": 1.7522, + "step": 7970 + }, + { + "epoch": 1.6811502607046926, + "grad_norm": 0.826701819896698, + "learning_rate": 0.0001863699306389282, + "loss": 1.7083, + "step": 7980 + }, + { + "epoch": 1.683256965292042, + "grad_norm": 0.899210512638092, + "learning_rate": 0.00018633654918000892, + "loss": 1.7517, + "step": 7990 + }, + { + "epoch": 1.6853636698793912, + "grad_norm": 0.7902103662490845, + "learning_rate": 0.00018630312989086257, + "loss": 1.7133, + "step": 8000 + }, + { + "epoch": 1.6874703744667405, + "grad_norm": 0.8423092365264893, + "learning_rate": 0.00018626967278613253, + "loss": 1.7686, + "step": 8010 + }, + { + "epoch": 1.6895770790540896, + "grad_norm": 0.8402092456817627, + "learning_rate": 0.0001862361778804788, + "loss": 1.737, + "step": 8020 + }, + { + "epoch": 1.691683783641439, + "grad_norm": 0.8797973394393921, + "learning_rate": 0.00018620264518857786, + "loss": 1.7052, + "step": 8030 + }, + { + "epoch": 1.693790488228788, + "grad_norm": 0.8236359357833862, + "learning_rate": 0.0001861690747251228, + "loss": 1.7043, + "step": 8040 + }, + { + "epoch": 1.6958971928161373, + "grad_norm": 0.8558099269866943, + "learning_rate": 0.00018613546650482322, + "loss": 1.7806, + "step": 8050 + }, + { + "epoch": 1.6980038974034866, + "grad_norm": 0.8771076798439026, + "learning_rate": 0.00018610182054240533, + "loss": 1.7177, + "step": 8060 + }, + { + "epoch": 1.7001106019908359, + "grad_norm": 0.8418282270431519, + "learning_rate": 0.0001860681368526118, + "loss": 1.7573, + "step": 8070 + }, + { + "epoch": 1.7022173065781852, + "grad_norm": 0.8841100931167603, + "learning_rate": 0.00018603441545020187, + "loss": 1.7451, + "step": 8080 + }, + { + "epoch": 1.7043240111655344, + "grad_norm": 0.8179870247840881, + "learning_rate": 0.00018600065634995135, + "loss": 1.7527, + "step": 8090 + }, + { + "epoch": 1.7064307157528835, + "grad_norm": 0.7647489905357361, + "learning_rate": 0.00018596685956665245, + "loss": 1.7789, + "step": 8100 + }, + { + "epoch": 1.7085374203402328, + "grad_norm": 0.8438899517059326, + "learning_rate": 0.000185933025115114, + "loss": 1.7211, + "step": 8110 + }, + { + "epoch": 1.7106441249275819, + "grad_norm": 0.8097826242446899, + "learning_rate": 0.0001858991530101613, + "loss": 1.7124, + "step": 8120 + }, + { + "epoch": 1.7127508295149312, + "grad_norm": 0.902351438999176, + "learning_rate": 0.00018586524326663615, + "loss": 1.7037, + "step": 8130 + }, + { + "epoch": 1.7148575341022805, + "grad_norm": 0.806341290473938, + "learning_rate": 0.0001858312958993968, + "loss": 1.7221, + "step": 8140 + }, + { + "epoch": 1.7169642386896298, + "grad_norm": 0.8351038098335266, + "learning_rate": 0.00018579731092331807, + "loss": 1.6798, + "step": 8150 + }, + { + "epoch": 1.719070943276979, + "grad_norm": 0.8565513491630554, + "learning_rate": 0.00018576328835329117, + "loss": 1.7356, + "step": 8160 + }, + { + "epoch": 1.7211776478643284, + "grad_norm": 0.8509708046913147, + "learning_rate": 0.00018572922820422387, + "loss": 1.7857, + "step": 8170 + }, + { + "epoch": 1.7232843524516774, + "grad_norm": 0.8077223300933838, + "learning_rate": 0.00018569513049104033, + "loss": 1.7137, + "step": 8180 + }, + { + "epoch": 1.7253910570390267, + "grad_norm": 0.9218313097953796, + "learning_rate": 0.00018566099522868119, + "loss": 1.7443, + "step": 8190 + }, + { + "epoch": 1.7274977616263758, + "grad_norm": 0.828586995601654, + "learning_rate": 0.00018562682243210358, + "loss": 1.7148, + "step": 8200 + }, + { + "epoch": 1.729604466213725, + "grad_norm": 0.966937243938446, + "learning_rate": 0.00018559261211628108, + "loss": 1.7608, + "step": 8210 + }, + { + "epoch": 1.7317111708010744, + "grad_norm": 0.8322204351425171, + "learning_rate": 0.00018555836429620358, + "loss": 1.7076, + "step": 8220 + }, + { + "epoch": 1.7338178753884237, + "grad_norm": 0.8872451782226562, + "learning_rate": 0.00018552407898687762, + "loss": 1.7599, + "step": 8230 + }, + { + "epoch": 1.735924579975773, + "grad_norm": 0.8485720753669739, + "learning_rate": 0.00018548975620332598, + "loss": 1.6958, + "step": 8240 + }, + { + "epoch": 1.7380312845631223, + "grad_norm": 0.839801549911499, + "learning_rate": 0.00018545539596058795, + "loss": 1.7315, + "step": 8250 + }, + { + "epoch": 1.7401379891504714, + "grad_norm": 0.8265219926834106, + "learning_rate": 0.0001854209982737192, + "loss": 1.7349, + "step": 8260 + }, + { + "epoch": 1.7422446937378206, + "grad_norm": 0.8101875185966492, + "learning_rate": 0.00018538656315779183, + "loss": 1.7198, + "step": 8270 + }, + { + "epoch": 1.7443513983251697, + "grad_norm": 0.8307563662528992, + "learning_rate": 0.00018535209062789433, + "loss": 1.7425, + "step": 8280 + }, + { + "epoch": 1.746458102912519, + "grad_norm": 0.8362199068069458, + "learning_rate": 0.00018531758069913158, + "loss": 1.7387, + "step": 8290 + }, + { + "epoch": 1.7485648074998683, + "grad_norm": 0.8315666913986206, + "learning_rate": 0.00018528303338662488, + "loss": 1.7549, + "step": 8300 + }, + { + "epoch": 1.7506715120872176, + "grad_norm": 0.9023050665855408, + "learning_rate": 0.00018524844870551185, + "loss": 1.7211, + "step": 8310 + }, + { + "epoch": 1.752778216674567, + "grad_norm": 0.8308394551277161, + "learning_rate": 0.00018521382667094656, + "loss": 1.7989, + "step": 8320 + }, + { + "epoch": 1.7548849212619162, + "grad_norm": 0.8728174567222595, + "learning_rate": 0.0001851791672980993, + "loss": 1.7407, + "step": 8330 + }, + { + "epoch": 1.7569916258492653, + "grad_norm": 0.8924462795257568, + "learning_rate": 0.00018514447060215698, + "loss": 1.7964, + "step": 8340 + }, + { + "epoch": 1.7590983304366146, + "grad_norm": 0.8875755071640015, + "learning_rate": 0.00018510973659832257, + "loss": 1.7254, + "step": 8350 + }, + { + "epoch": 1.7612050350239636, + "grad_norm": 0.9017901420593262, + "learning_rate": 0.0001850749653018156, + "loss": 1.7919, + "step": 8360 + }, + { + "epoch": 1.763311739611313, + "grad_norm": 0.8241222500801086, + "learning_rate": 0.00018504015672787184, + "loss": 1.7515, + "step": 8370 + }, + { + "epoch": 1.7654184441986622, + "grad_norm": 0.8747454881668091, + "learning_rate": 0.00018500531089174341, + "loss": 1.6982, + "step": 8380 + }, + { + "epoch": 1.7675251487860115, + "grad_norm": 0.8906333446502686, + "learning_rate": 0.00018497042780869883, + "loss": 1.7056, + "step": 8390 + }, + { + "epoch": 1.7696318533733608, + "grad_norm": 0.797450840473175, + "learning_rate": 0.00018493550749402278, + "loss": 1.7383, + "step": 8400 + }, + { + "epoch": 1.7717385579607101, + "grad_norm": 0.813721239566803, + "learning_rate": 0.0001849005499630164, + "loss": 1.8067, + "step": 8410 + }, + { + "epoch": 1.7738452625480592, + "grad_norm": 0.8121708631515503, + "learning_rate": 0.00018486555523099712, + "loss": 1.7114, + "step": 8420 + }, + { + "epoch": 1.7759519671354085, + "grad_norm": 0.8621687293052673, + "learning_rate": 0.00018483052331329857, + "loss": 1.6977, + "step": 8430 + }, + { + "epoch": 1.7780586717227576, + "grad_norm": 0.8001519441604614, + "learning_rate": 0.00018479545422527084, + "loss": 1.7607, + "step": 8440 + }, + { + "epoch": 1.7801653763101069, + "grad_norm": 0.8188821077346802, + "learning_rate": 0.0001847603479822801, + "loss": 1.7479, + "step": 8450 + }, + { + "epoch": 1.7822720808974561, + "grad_norm": 0.8462778329849243, + "learning_rate": 0.00018472520459970898, + "loss": 1.719, + "step": 8460 + }, + { + "epoch": 1.7843787854848054, + "grad_norm": 0.8654899597167969, + "learning_rate": 0.00018469002409295628, + "loss": 1.7461, + "step": 8470 + }, + { + "epoch": 1.7864854900721547, + "grad_norm": 0.8919845819473267, + "learning_rate": 0.0001846548064774371, + "loss": 1.7215, + "step": 8480 + }, + { + "epoch": 1.788592194659504, + "grad_norm": 0.8800879120826721, + "learning_rate": 0.00018461955176858285, + "loss": 1.7195, + "step": 8490 + }, + { + "epoch": 1.790698899246853, + "grad_norm": 0.7641480565071106, + "learning_rate": 0.00018458425998184113, + "loss": 1.6894, + "step": 8500 + }, + { + "epoch": 1.7928056038342024, + "grad_norm": 0.7839869856834412, + "learning_rate": 0.00018454893113267572, + "loss": 1.6647, + "step": 8510 + }, + { + "epoch": 1.7949123084215515, + "grad_norm": 0.8127973079681396, + "learning_rate": 0.0001845135652365668, + "loss": 1.8154, + "step": 8520 + }, + { + "epoch": 1.7970190130089008, + "grad_norm": 0.8238587379455566, + "learning_rate": 0.00018447816230901068, + "loss": 1.6919, + "step": 8530 + }, + { + "epoch": 1.79912571759625, + "grad_norm": 0.9203738570213318, + "learning_rate": 0.0001844427223655199, + "loss": 1.761, + "step": 8540 + }, + { + "epoch": 1.8012324221835994, + "grad_norm": 0.8604192137718201, + "learning_rate": 0.00018440724542162328, + "loss": 1.8097, + "step": 8550 + }, + { + "epoch": 1.8033391267709487, + "grad_norm": 0.8560807704925537, + "learning_rate": 0.0001843717314928658, + "loss": 1.7408, + "step": 8560 + }, + { + "epoch": 1.805445831358298, + "grad_norm": 0.8069592118263245, + "learning_rate": 0.00018433618059480864, + "loss": 1.7691, + "step": 8570 + }, + { + "epoch": 1.807552535945647, + "grad_norm": 0.8375011086463928, + "learning_rate": 0.00018430059274302917, + "loss": 1.7214, + "step": 8580 + }, + { + "epoch": 1.8096592405329963, + "grad_norm": 0.8408818244934082, + "learning_rate": 0.000184264967953121, + "loss": 1.7175, + "step": 8590 + }, + { + "epoch": 1.8117659451203454, + "grad_norm": 0.8032023310661316, + "learning_rate": 0.00018422930624069396, + "loss": 1.7363, + "step": 8600 + }, + { + "epoch": 1.8138726497076947, + "grad_norm": 0.8840620517730713, + "learning_rate": 0.00018419360762137395, + "loss": 1.7359, + "step": 8610 + }, + { + "epoch": 1.815979354295044, + "grad_norm": 0.869219958782196, + "learning_rate": 0.00018415787211080304, + "loss": 1.7537, + "step": 8620 + }, + { + "epoch": 1.8180860588823933, + "grad_norm": 0.8339371085166931, + "learning_rate": 0.0001841220997246396, + "loss": 1.7803, + "step": 8630 + }, + { + "epoch": 1.8201927634697426, + "grad_norm": 0.8503332734107971, + "learning_rate": 0.00018408629047855804, + "loss": 1.764, + "step": 8640 + }, + { + "epoch": 1.8222994680570916, + "grad_norm": 0.8225341439247131, + "learning_rate": 0.00018405044438824897, + "loss": 1.7486, + "step": 8650 + }, + { + "epoch": 1.824406172644441, + "grad_norm": 0.8652873635292053, + "learning_rate": 0.00018401456146941908, + "loss": 1.6888, + "step": 8660 + }, + { + "epoch": 1.82651287723179, + "grad_norm": 0.8248763084411621, + "learning_rate": 0.00018397864173779133, + "loss": 1.7245, + "step": 8670 + }, + { + "epoch": 1.8286195818191393, + "grad_norm": 0.8583594560623169, + "learning_rate": 0.00018394268520910466, + "loss": 1.7382, + "step": 8680 + }, + { + "epoch": 1.8307262864064886, + "grad_norm": 0.8216450214385986, + "learning_rate": 0.00018390669189911427, + "loss": 1.7708, + "step": 8690 + }, + { + "epoch": 1.832832990993838, + "grad_norm": 0.8396652340888977, + "learning_rate": 0.00018387066182359133, + "loss": 1.7057, + "step": 8700 + }, + { + "epoch": 1.8349396955811872, + "grad_norm": 0.8246045708656311, + "learning_rate": 0.00018383459499832322, + "loss": 1.7709, + "step": 8710 + }, + { + "epoch": 1.8370464001685365, + "grad_norm": 0.8746206164360046, + "learning_rate": 0.00018379849143911343, + "loss": 1.7462, + "step": 8720 + }, + { + "epoch": 1.8391531047558856, + "grad_norm": 1.081039547920227, + "learning_rate": 0.00018376235116178148, + "loss": 1.7569, + "step": 8730 + }, + { + "epoch": 1.8412598093432349, + "grad_norm": 0.9471474289894104, + "learning_rate": 0.00018372617418216307, + "loss": 1.7173, + "step": 8740 + }, + { + "epoch": 1.843366513930584, + "grad_norm": 0.848264217376709, + "learning_rate": 0.00018368996051610986, + "loss": 1.7653, + "step": 8750 + }, + { + "epoch": 1.8454732185179332, + "grad_norm": 0.8978562951087952, + "learning_rate": 0.00018365371017948964, + "loss": 1.7484, + "step": 8760 + }, + { + "epoch": 1.8475799231052825, + "grad_norm": 0.8349031209945679, + "learning_rate": 0.00018361742318818638, + "loss": 1.7595, + "step": 8770 + }, + { + "epoch": 1.8496866276926318, + "grad_norm": 0.8227035403251648, + "learning_rate": 0.00018358109955809993, + "loss": 1.7236, + "step": 8780 + }, + { + "epoch": 1.851793332279981, + "grad_norm": 0.7719805240631104, + "learning_rate": 0.0001835447393051463, + "loss": 1.7278, + "step": 8790 + }, + { + "epoch": 1.8539000368673304, + "grad_norm": 1.0519185066223145, + "learning_rate": 0.00018350834244525749, + "loss": 1.7136, + "step": 8800 + }, + { + "epoch": 1.8560067414546795, + "grad_norm": 0.9757561683654785, + "learning_rate": 0.00018347190899438158, + "loss": 1.7359, + "step": 8810 + }, + { + "epoch": 1.8581134460420288, + "grad_norm": 0.7864118814468384, + "learning_rate": 0.00018343543896848273, + "loss": 1.6817, + "step": 8820 + }, + { + "epoch": 1.8602201506293778, + "grad_norm": 0.8366915583610535, + "learning_rate": 0.000183398932383541, + "loss": 1.7644, + "step": 8830 + }, + { + "epoch": 1.8623268552167271, + "grad_norm": 0.8808780312538147, + "learning_rate": 0.00018336238925555263, + "loss": 1.7644, + "step": 8840 + }, + { + "epoch": 1.8644335598040764, + "grad_norm": 0.8752515316009521, + "learning_rate": 0.00018332580960052965, + "loss": 1.7058, + "step": 8850 + }, + { + "epoch": 1.8665402643914257, + "grad_norm": 0.8687223792076111, + "learning_rate": 0.00018328919343450035, + "loss": 1.7352, + "step": 8860 + }, + { + "epoch": 1.868646968978775, + "grad_norm": 0.9593966603279114, + "learning_rate": 0.00018325254077350883, + "loss": 1.7985, + "step": 8870 + }, + { + "epoch": 1.8707536735661243, + "grad_norm": 0.8275451064109802, + "learning_rate": 0.00018321585163361527, + "loss": 1.7322, + "step": 8880 + }, + { + "epoch": 1.8728603781534734, + "grad_norm": 0.8880124092102051, + "learning_rate": 0.0001831791260308958, + "loss": 1.7701, + "step": 8890 + }, + { + "epoch": 1.8749670827408227, + "grad_norm": 0.7356016635894775, + "learning_rate": 0.00018314236398144255, + "loss": 1.6926, + "step": 8900 + }, + { + "epoch": 1.8770737873281718, + "grad_norm": 0.869474470615387, + "learning_rate": 0.00018310556550136357, + "loss": 1.7184, + "step": 8910 + }, + { + "epoch": 1.879180491915521, + "grad_norm": 0.8896689414978027, + "learning_rate": 0.00018306873060678296, + "loss": 1.6887, + "step": 8920 + }, + { + "epoch": 1.8812871965028704, + "grad_norm": 0.9539408683776855, + "learning_rate": 0.0001830318593138407, + "loss": 1.7284, + "step": 8930 + }, + { + "epoch": 1.8833939010902196, + "grad_norm": 0.8492235541343689, + "learning_rate": 0.00018299495163869275, + "loss": 1.7262, + "step": 8940 + }, + { + "epoch": 1.885500605677569, + "grad_norm": 0.8203418850898743, + "learning_rate": 0.00018295800759751102, + "loss": 1.7769, + "step": 8950 + }, + { + "epoch": 1.8876073102649182, + "grad_norm": 0.7786878943443298, + "learning_rate": 0.00018292102720648333, + "loss": 1.7299, + "step": 8960 + }, + { + "epoch": 1.8897140148522673, + "grad_norm": 0.8030612468719482, + "learning_rate": 0.0001828840104818134, + "loss": 1.7691, + "step": 8970 + }, + { + "epoch": 1.8918207194396166, + "grad_norm": 0.9526339769363403, + "learning_rate": 0.00018284695743972095, + "loss": 1.739, + "step": 8980 + }, + { + "epoch": 1.8939274240269657, + "grad_norm": 0.8131811022758484, + "learning_rate": 0.00018280986809644158, + "loss": 1.7498, + "step": 8990 + }, + { + "epoch": 1.896034128614315, + "grad_norm": 0.8120954632759094, + "learning_rate": 0.0001827727424682268, + "loss": 1.7462, + "step": 9000 + }, + { + "epoch": 1.8981408332016643, + "grad_norm": 0.7906435132026672, + "learning_rate": 0.00018273558057134393, + "loss": 1.7311, + "step": 9010 + }, + { + "epoch": 1.9002475377890136, + "grad_norm": 0.9683308601379395, + "learning_rate": 0.00018269838242207636, + "loss": 1.7781, + "step": 9020 + }, + { + "epoch": 1.9023542423763629, + "grad_norm": 0.8596441745758057, + "learning_rate": 0.00018266114803672318, + "loss": 1.7434, + "step": 9030 + }, + { + "epoch": 1.9044609469637122, + "grad_norm": 0.7930140495300293, + "learning_rate": 0.0001826238774315995, + "loss": 1.7232, + "step": 9040 + }, + { + "epoch": 1.9065676515510612, + "grad_norm": 0.8552681803703308, + "learning_rate": 0.00018258657062303623, + "loss": 1.8003, + "step": 9050 + }, + { + "epoch": 1.9086743561384105, + "grad_norm": 0.954117476940155, + "learning_rate": 0.00018254922762738008, + "loss": 1.7098, + "step": 9060 + }, + { + "epoch": 1.9107810607257596, + "grad_norm": 0.9021719098091125, + "learning_rate": 0.00018251184846099382, + "loss": 1.7276, + "step": 9070 + }, + { + "epoch": 1.912887765313109, + "grad_norm": 0.8462482690811157, + "learning_rate": 0.00018247443314025583, + "loss": 1.6948, + "step": 9080 + }, + { + "epoch": 1.9149944699004582, + "grad_norm": 0.8283345699310303, + "learning_rate": 0.00018243698168156054, + "loss": 1.6674, + "step": 9090 + }, + { + "epoch": 1.9171011744878075, + "grad_norm": 0.8391633629798889, + "learning_rate": 0.00018239949410131802, + "loss": 1.7516, + "step": 9100 + }, + { + "epoch": 1.9192078790751568, + "grad_norm": 0.7927471995353699, + "learning_rate": 0.00018236197041595432, + "loss": 1.7364, + "step": 9110 + }, + { + "epoch": 1.921314583662506, + "grad_norm": 0.8896411061286926, + "learning_rate": 0.00018232441064191125, + "loss": 1.7549, + "step": 9120 + }, + { + "epoch": 1.9234212882498551, + "grad_norm": 0.8045653700828552, + "learning_rate": 0.00018228681479564644, + "loss": 1.7284, + "step": 9130 + }, + { + "epoch": 1.9255279928372044, + "grad_norm": 0.8324005603790283, + "learning_rate": 0.0001822491828936333, + "loss": 1.6741, + "step": 9140 + }, + { + "epoch": 1.9276346974245535, + "grad_norm": 0.7971271276473999, + "learning_rate": 0.0001822115149523611, + "loss": 1.765, + "step": 9150 + }, + { + "epoch": 1.9297414020119028, + "grad_norm": 0.892197847366333, + "learning_rate": 0.0001821738109883348, + "loss": 1.7358, + "step": 9160 + }, + { + "epoch": 1.931848106599252, + "grad_norm": 0.8694189786911011, + "learning_rate": 0.00018213607101807527, + "loss": 1.7133, + "step": 9170 + }, + { + "epoch": 1.9339548111866014, + "grad_norm": 0.8469682931900024, + "learning_rate": 0.0001820982950581191, + "loss": 1.7176, + "step": 9180 + }, + { + "epoch": 1.9360615157739507, + "grad_norm": 0.8897395730018616, + "learning_rate": 0.0001820604831250186, + "loss": 1.6831, + "step": 9190 + }, + { + "epoch": 1.9381682203613, + "grad_norm": 0.8338301777839661, + "learning_rate": 0.0001820226352353419, + "loss": 1.7822, + "step": 9200 + }, + { + "epoch": 1.940274924948649, + "grad_norm": 0.8942866921424866, + "learning_rate": 0.00018198475140567287, + "loss": 1.7657, + "step": 9210 + }, + { + "epoch": 1.9423816295359984, + "grad_norm": 0.8633554577827454, + "learning_rate": 0.00018194683165261114, + "loss": 1.7695, + "step": 9220 + }, + { + "epoch": 1.9444883341233474, + "grad_norm": 0.8235793709754944, + "learning_rate": 0.00018190887599277207, + "loss": 1.8101, + "step": 9230 + }, + { + "epoch": 1.9465950387106967, + "grad_norm": 0.8241968154907227, + "learning_rate": 0.00018187088444278674, + "loss": 1.7691, + "step": 9240 + }, + { + "epoch": 1.948701743298046, + "grad_norm": 0.8452653288841248, + "learning_rate": 0.000181832857019302, + "loss": 1.8259, + "step": 9250 + }, + { + "epoch": 1.9508084478853953, + "grad_norm": 0.862368643283844, + "learning_rate": 0.00018179479373898035, + "loss": 1.7402, + "step": 9260 + }, + { + "epoch": 1.9529151524727446, + "grad_norm": 0.7917757034301758, + "learning_rate": 0.00018175669461850005, + "loss": 1.7411, + "step": 9270 + }, + { + "epoch": 1.955021857060094, + "grad_norm": 0.9397031664848328, + "learning_rate": 0.00018171855967455506, + "loss": 1.6667, + "step": 9280 + }, + { + "epoch": 1.957128561647443, + "grad_norm": 0.8887811303138733, + "learning_rate": 0.00018168038892385507, + "loss": 1.6761, + "step": 9290 + }, + { + "epoch": 1.9592352662347923, + "grad_norm": 0.8086221814155579, + "learning_rate": 0.00018164218238312535, + "loss": 1.7965, + "step": 9300 + }, + { + "epoch": 1.9613419708221413, + "grad_norm": 0.8723207712173462, + "learning_rate": 0.00018160394006910694, + "loss": 1.801, + "step": 9310 + }, + { + "epoch": 1.9634486754094906, + "grad_norm": 0.8716481328010559, + "learning_rate": 0.00018156566199855657, + "loss": 1.689, + "step": 9320 + }, + { + "epoch": 1.96555537999684, + "grad_norm": 0.8738270401954651, + "learning_rate": 0.00018152734818824658, + "loss": 1.7629, + "step": 9330 + }, + { + "epoch": 1.9676620845841892, + "grad_norm": 0.8747379779815674, + "learning_rate": 0.00018148899865496503, + "loss": 1.712, + "step": 9340 + }, + { + "epoch": 1.9697687891715385, + "grad_norm": 0.9044255018234253, + "learning_rate": 0.00018145061341551553, + "loss": 1.7433, + "step": 9350 + }, + { + "epoch": 1.9718754937588878, + "grad_norm": 0.8422555923461914, + "learning_rate": 0.00018141219248671745, + "loss": 1.7475, + "step": 9360 + }, + { + "epoch": 1.973982198346237, + "grad_norm": 0.9065391421318054, + "learning_rate": 0.00018137373588540578, + "loss": 1.777, + "step": 9370 + }, + { + "epoch": 1.9760889029335862, + "grad_norm": 0.8669577240943909, + "learning_rate": 0.00018133524362843104, + "loss": 1.7281, + "step": 9380 + }, + { + "epoch": 1.9781956075209353, + "grad_norm": 0.8608076572418213, + "learning_rate": 0.0001812967157326595, + "loss": 1.7726, + "step": 9390 + }, + { + "epoch": 1.9803023121082846, + "grad_norm": 0.8125661015510559, + "learning_rate": 0.00018125815221497294, + "loss": 1.7158, + "step": 9400 + }, + { + "epoch": 1.9824090166956339, + "grad_norm": 0.92203289270401, + "learning_rate": 0.00018121955309226886, + "loss": 1.7336, + "step": 9410 + }, + { + "epoch": 1.9845157212829831, + "grad_norm": 0.8154654502868652, + "learning_rate": 0.00018118091838146029, + "loss": 1.7337, + "step": 9420 + }, + { + "epoch": 1.9866224258703324, + "grad_norm": 0.8414307832717896, + "learning_rate": 0.00018114224809947583, + "loss": 1.7626, + "step": 9430 + }, + { + "epoch": 1.9887291304576817, + "grad_norm": 0.9456828832626343, + "learning_rate": 0.0001811035422632597, + "loss": 1.7152, + "step": 9440 + }, + { + "epoch": 1.9908358350450308, + "grad_norm": 0.8258214592933655, + "learning_rate": 0.00018106480088977172, + "loss": 1.7039, + "step": 9450 + }, + { + "epoch": 1.9929425396323799, + "grad_norm": 0.8530787229537964, + "learning_rate": 0.00018102602399598728, + "loss": 1.7288, + "step": 9460 + }, + { + "epoch": 1.9950492442197292, + "grad_norm": 0.8169214129447937, + "learning_rate": 0.00018098721159889728, + "loss": 1.7202, + "step": 9470 + }, + { + "epoch": 1.9971559488070785, + "grad_norm": 0.868748664855957, + "learning_rate": 0.00018094836371550824, + "loss": 1.711, + "step": 9480 + }, + { + "epoch": 1.9992626533944278, + "grad_norm": 0.7722933888435364, + "learning_rate": 0.00018090948036284215, + "loss": 1.6874, + "step": 9490 + }, + { + "epoch": 2.001369357981777, + "grad_norm": 0.7461708188056946, + "learning_rate": 0.0001808705615579367, + "loss": 1.6395, + "step": 9500 + }, + { + "epoch": 2.0034760625691264, + "grad_norm": 0.9301486015319824, + "learning_rate": 0.00018083160731784486, + "loss": 1.6459, + "step": 9510 + }, + { + "epoch": 2.0055827671564757, + "grad_norm": 0.834187924861908, + "learning_rate": 0.00018079261765963537, + "loss": 1.6873, + "step": 9520 + }, + { + "epoch": 2.0076894717438245, + "grad_norm": 0.8011589646339417, + "learning_rate": 0.0001807535926003924, + "loss": 1.6617, + "step": 9530 + }, + { + "epoch": 2.009796176331174, + "grad_norm": 0.8627424836158752, + "learning_rate": 0.00018071453215721554, + "loss": 1.7085, + "step": 9540 + }, + { + "epoch": 2.011902880918523, + "grad_norm": 0.9021561145782471, + "learning_rate": 0.00018067543634722006, + "loss": 1.6688, + "step": 9550 + }, + { + "epoch": 2.0140095855058724, + "grad_norm": 0.8814119100570679, + "learning_rate": 0.00018063630518753662, + "loss": 1.7258, + "step": 9560 + }, + { + "epoch": 2.0161162900932217, + "grad_norm": 0.9105542898178101, + "learning_rate": 0.0001805971386953113, + "loss": 1.6976, + "step": 9570 + }, + { + "epoch": 2.018222994680571, + "grad_norm": 0.8711789846420288, + "learning_rate": 0.00018055793688770587, + "loss": 1.7255, + "step": 9580 + }, + { + "epoch": 2.0203296992679203, + "grad_norm": 0.8914005160331726, + "learning_rate": 0.00018051869978189731, + "loss": 1.7591, + "step": 9590 + }, + { + "epoch": 2.0224364038552696, + "grad_norm": 0.874055802822113, + "learning_rate": 0.00018047942739507836, + "loss": 1.626, + "step": 9600 + }, + { + "epoch": 2.0245431084426184, + "grad_norm": 0.823403537273407, + "learning_rate": 0.00018044011974445697, + "loss": 1.7155, + "step": 9610 + }, + { + "epoch": 2.0266498130299677, + "grad_norm": 0.859565258026123, + "learning_rate": 0.00018040077684725666, + "loss": 1.7357, + "step": 9620 + }, + { + "epoch": 2.028756517617317, + "grad_norm": 0.8823413252830505, + "learning_rate": 0.0001803613987207163, + "loss": 1.673, + "step": 9630 + }, + { + "epoch": 2.0308632222046663, + "grad_norm": 0.903207540512085, + "learning_rate": 0.00018032198538209043, + "loss": 1.708, + "step": 9640 + }, + { + "epoch": 2.0329699267920156, + "grad_norm": 0.9172302484512329, + "learning_rate": 0.0001802825368486487, + "loss": 1.7111, + "step": 9650 + }, + { + "epoch": 2.035076631379365, + "grad_norm": 0.8453816175460815, + "learning_rate": 0.00018024305313767646, + "loss": 1.6574, + "step": 9660 + }, + { + "epoch": 2.037183335966714, + "grad_norm": 0.8865382671356201, + "learning_rate": 0.00018020353426647428, + "loss": 1.7316, + "step": 9670 + }, + { + "epoch": 2.0392900405540635, + "grad_norm": 0.9413029551506042, + "learning_rate": 0.0001801639802523582, + "loss": 1.7115, + "step": 9680 + }, + { + "epoch": 2.0413967451414123, + "grad_norm": 0.8472778797149658, + "learning_rate": 0.00018012439111265974, + "loss": 1.6621, + "step": 9690 + }, + { + "epoch": 2.0435034497287616, + "grad_norm": 0.8548828363418579, + "learning_rate": 0.00018008476686472564, + "loss": 1.7178, + "step": 9700 + }, + { + "epoch": 2.045610154316111, + "grad_norm": 0.9240100979804993, + "learning_rate": 0.0001800451075259182, + "loss": 1.6855, + "step": 9710 + }, + { + "epoch": 2.0477168589034602, + "grad_norm": 0.8424657583236694, + "learning_rate": 0.000180005413113615, + "loss": 1.6294, + "step": 9720 + }, + { + "epoch": 2.0498235634908095, + "grad_norm": 0.8709080815315247, + "learning_rate": 0.00017996568364520897, + "loss": 1.68, + "step": 9730 + }, + { + "epoch": 2.051930268078159, + "grad_norm": 0.8857349753379822, + "learning_rate": 0.00017992591913810845, + "loss": 1.7878, + "step": 9740 + }, + { + "epoch": 2.054036972665508, + "grad_norm": 0.928957462310791, + "learning_rate": 0.00017988611960973713, + "loss": 1.6697, + "step": 9750 + }, + { + "epoch": 2.0561436772528574, + "grad_norm": 0.8724343776702881, + "learning_rate": 0.00017984628507753406, + "loss": 1.6534, + "step": 9760 + }, + { + "epoch": 2.0582503818402063, + "grad_norm": 0.9669876098632812, + "learning_rate": 0.00017980641555895356, + "loss": 1.6382, + "step": 9770 + }, + { + "epoch": 2.0603570864275556, + "grad_norm": 0.853915274143219, + "learning_rate": 0.00017976651107146533, + "loss": 1.7274, + "step": 9780 + }, + { + "epoch": 2.062463791014905, + "grad_norm": 0.8353849053382874, + "learning_rate": 0.00017972657163255442, + "loss": 1.6684, + "step": 9790 + }, + { + "epoch": 2.064570495602254, + "grad_norm": 0.8487059473991394, + "learning_rate": 0.00017968659725972112, + "loss": 1.7395, + "step": 9800 + }, + { + "epoch": 2.0666772001896034, + "grad_norm": 0.8349198698997498, + "learning_rate": 0.00017964658797048108, + "loss": 1.7004, + "step": 9810 + }, + { + "epoch": 2.0687839047769527, + "grad_norm": 0.866244375705719, + "learning_rate": 0.0001796065437823652, + "loss": 1.6692, + "step": 9820 + }, + { + "epoch": 2.070890609364302, + "grad_norm": 0.9268345236778259, + "learning_rate": 0.0001795664647129198, + "loss": 1.7057, + "step": 9830 + }, + { + "epoch": 2.0729973139516513, + "grad_norm": 0.803345263004303, + "learning_rate": 0.0001795263507797063, + "loss": 1.6856, + "step": 9840 + }, + { + "epoch": 2.075104018539, + "grad_norm": 0.8338049054145813, + "learning_rate": 0.00017948620200030152, + "loss": 1.7012, + "step": 9850 + }, + { + "epoch": 2.0772107231263495, + "grad_norm": 0.9227036833763123, + "learning_rate": 0.00017944601839229755, + "loss": 1.6416, + "step": 9860 + }, + { + "epoch": 2.0793174277136988, + "grad_norm": 1.008952021598816, + "learning_rate": 0.00017940579997330165, + "loss": 1.6402, + "step": 9870 + }, + { + "epoch": 2.081424132301048, + "grad_norm": 0.9596250653266907, + "learning_rate": 0.0001793655467609364, + "loss": 1.6992, + "step": 9880 + }, + { + "epoch": 2.0835308368883974, + "grad_norm": 0.8340634107589722, + "learning_rate": 0.00017932525877283964, + "loss": 1.7393, + "step": 9890 + }, + { + "epoch": 2.0856375414757466, + "grad_norm": 0.8354076743125916, + "learning_rate": 0.00017928493602666445, + "loss": 1.7162, + "step": 9900 + }, + { + "epoch": 2.087744246063096, + "grad_norm": 0.8993865847587585, + "learning_rate": 0.00017924457854007902, + "loss": 1.6699, + "step": 9910 + }, + { + "epoch": 2.0898509506504452, + "grad_norm": 0.9259373545646667, + "learning_rate": 0.00017920418633076698, + "loss": 1.6892, + "step": 9920 + }, + { + "epoch": 2.091957655237794, + "grad_norm": 0.9740672707557678, + "learning_rate": 0.0001791637594164269, + "loss": 1.7243, + "step": 9930 + }, + { + "epoch": 2.0940643598251434, + "grad_norm": 0.8578957319259644, + "learning_rate": 0.00017912329781477287, + "loss": 1.6387, + "step": 9940 + }, + { + "epoch": 2.0961710644124927, + "grad_norm": 0.8971062302589417, + "learning_rate": 0.00017908280154353392, + "loss": 1.7315, + "step": 9950 + }, + { + "epoch": 2.098277768999842, + "grad_norm": 0.8996395468711853, + "learning_rate": 0.00017904227062045437, + "loss": 1.7068, + "step": 9960 + }, + { + "epoch": 2.1003844735871913, + "grad_norm": 0.858382523059845, + "learning_rate": 0.00017900170506329372, + "loss": 1.7447, + "step": 9970 + }, + { + "epoch": 2.1024911781745406, + "grad_norm": 0.8814343810081482, + "learning_rate": 0.00017896110488982672, + "loss": 1.6968, + "step": 9980 + }, + { + "epoch": 2.10459788276189, + "grad_norm": 0.9278027415275574, + "learning_rate": 0.00017892047011784312, + "loss": 1.7095, + "step": 9990 + }, + { + "epoch": 2.106704587349239, + "grad_norm": 0.8749867081642151, + "learning_rate": 0.00017887980076514796, + "loss": 1.7037, + "step": 10000 + }, + { + "epoch": 2.108811291936588, + "grad_norm": 0.8612695932388306, + "learning_rate": 0.0001788390968495614, + "loss": 1.6536, + "step": 10010 + }, + { + "epoch": 2.1109179965239373, + "grad_norm": 0.8968421220779419, + "learning_rate": 0.00017879835838891875, + "loss": 1.6926, + "step": 10020 + }, + { + "epoch": 2.1130247011112866, + "grad_norm": 0.9752348065376282, + "learning_rate": 0.00017875758540107043, + "loss": 1.6765, + "step": 10030 + }, + { + "epoch": 2.115131405698636, + "grad_norm": 0.8976007699966431, + "learning_rate": 0.00017871677790388203, + "loss": 1.7059, + "step": 10040 + }, + { + "epoch": 2.117238110285985, + "grad_norm": 0.8147952556610107, + "learning_rate": 0.00017867593591523422, + "loss": 1.7046, + "step": 10050 + }, + { + "epoch": 2.1193448148733345, + "grad_norm": 0.9791306853294373, + "learning_rate": 0.00017863505945302279, + "loss": 1.6575, + "step": 10060 + }, + { + "epoch": 2.1214515194606838, + "grad_norm": 0.8902132511138916, + "learning_rate": 0.0001785941485351587, + "loss": 1.6564, + "step": 10070 + }, + { + "epoch": 2.123558224048033, + "grad_norm": 0.8692643642425537, + "learning_rate": 0.00017855320317956784, + "loss": 1.7202, + "step": 10080 + }, + { + "epoch": 2.125664928635382, + "grad_norm": 0.8463397026062012, + "learning_rate": 0.00017851222340419144, + "loss": 1.7316, + "step": 10090 + }, + { + "epoch": 2.127771633222731, + "grad_norm": 0.9453892707824707, + "learning_rate": 0.00017847120922698562, + "loss": 1.6316, + "step": 10100 + }, + { + "epoch": 2.1298783378100805, + "grad_norm": 0.8649837970733643, + "learning_rate": 0.00017843016066592158, + "loss": 1.5966, + "step": 10110 + }, + { + "epoch": 2.13198504239743, + "grad_norm": 0.8016608357429504, + "learning_rate": 0.0001783890777389857, + "loss": 1.7382, + "step": 10120 + }, + { + "epoch": 2.134091746984779, + "grad_norm": 0.879031777381897, + "learning_rate": 0.00017834796046417933, + "loss": 1.6988, + "step": 10130 + }, + { + "epoch": 2.1361984515721284, + "grad_norm": 0.9154124855995178, + "learning_rate": 0.00017830680885951887, + "loss": 1.6508, + "step": 10140 + }, + { + "epoch": 2.1383051561594777, + "grad_norm": 0.8642693758010864, + "learning_rate": 0.00017826562294303585, + "loss": 1.7181, + "step": 10150 + }, + { + "epoch": 2.140411860746827, + "grad_norm": 0.9341524243354797, + "learning_rate": 0.00017822440273277673, + "loss": 1.6984, + "step": 10160 + }, + { + "epoch": 2.142518565334176, + "grad_norm": 0.8300173282623291, + "learning_rate": 0.000178183148246803, + "loss": 1.7304, + "step": 10170 + }, + { + "epoch": 2.144625269921525, + "grad_norm": 0.9484438896179199, + "learning_rate": 0.00017814185950319126, + "loss": 1.722, + "step": 10180 + }, + { + "epoch": 2.1467319745088744, + "grad_norm": 0.9994838833808899, + "learning_rate": 0.00017810053652003304, + "loss": 1.7375, + "step": 10190 + }, + { + "epoch": 2.1488386790962237, + "grad_norm": 0.9783572554588318, + "learning_rate": 0.00017805917931543492, + "loss": 1.6866, + "step": 10200 + }, + { + "epoch": 2.150945383683573, + "grad_norm": 1.022603154182434, + "learning_rate": 0.00017801778790751843, + "loss": 1.715, + "step": 10210 + }, + { + "epoch": 2.1530520882709223, + "grad_norm": 0.9454113245010376, + "learning_rate": 0.00017797636231442016, + "loss": 1.738, + "step": 10220 + }, + { + "epoch": 2.1551587928582716, + "grad_norm": 0.9445421099662781, + "learning_rate": 0.00017793490255429157, + "loss": 1.6642, + "step": 10230 + }, + { + "epoch": 2.1572654974456205, + "grad_norm": 0.9507448077201843, + "learning_rate": 0.00017789340864529917, + "loss": 1.6549, + "step": 10240 + }, + { + "epoch": 2.1593722020329698, + "grad_norm": 0.9919808506965637, + "learning_rate": 0.00017785188060562442, + "loss": 1.6856, + "step": 10250 + }, + { + "epoch": 2.161478906620319, + "grad_norm": 0.8635092973709106, + "learning_rate": 0.00017781031845346375, + "loss": 1.669, + "step": 10260 + }, + { + "epoch": 2.1635856112076683, + "grad_norm": 0.9286298155784607, + "learning_rate": 0.00017776872220702847, + "loss": 1.7009, + "step": 10270 + }, + { + "epoch": 2.1656923157950176, + "grad_norm": 0.8796800374984741, + "learning_rate": 0.0001777270918845449, + "loss": 1.6913, + "step": 10280 + }, + { + "epoch": 2.167799020382367, + "grad_norm": 0.9373218417167664, + "learning_rate": 0.00017768542750425426, + "loss": 1.7442, + "step": 10290 + }, + { + "epoch": 2.1699057249697162, + "grad_norm": 0.8646446466445923, + "learning_rate": 0.00017764372908441275, + "loss": 1.7052, + "step": 10300 + }, + { + "epoch": 2.1720124295570655, + "grad_norm": 0.9008393287658691, + "learning_rate": 0.00017760199664329136, + "loss": 1.7049, + "step": 10310 + }, + { + "epoch": 2.174119134144415, + "grad_norm": 0.9161930084228516, + "learning_rate": 0.00017756023019917607, + "loss": 1.6693, + "step": 10320 + }, + { + "epoch": 2.1762258387317637, + "grad_norm": 0.9321724772453308, + "learning_rate": 0.0001775184297703678, + "loss": 1.6694, + "step": 10330 + }, + { + "epoch": 2.178332543319113, + "grad_norm": 1.1692795753479004, + "learning_rate": 0.0001774765953751823, + "loss": 1.7455, + "step": 10340 + }, + { + "epoch": 2.1804392479064623, + "grad_norm": 0.8697549104690552, + "learning_rate": 0.00017743472703195015, + "loss": 1.6826, + "step": 10350 + }, + { + "epoch": 2.1825459524938116, + "grad_norm": 0.9908336400985718, + "learning_rate": 0.000177392824759017, + "loss": 1.7432, + "step": 10360 + }, + { + "epoch": 2.184652657081161, + "grad_norm": 0.9399738311767578, + "learning_rate": 0.0001773508885747431, + "loss": 1.6717, + "step": 10370 + }, + { + "epoch": 2.18675936166851, + "grad_norm": 1.0489552021026611, + "learning_rate": 0.00017730891849750377, + "loss": 1.7272, + "step": 10380 + }, + { + "epoch": 2.1888660662558594, + "grad_norm": 0.9378235340118408, + "learning_rate": 0.00017726691454568908, + "loss": 1.6757, + "step": 10390 + }, + { + "epoch": 2.1909727708432083, + "grad_norm": 1.0289270877838135, + "learning_rate": 0.000177224876737704, + "loss": 1.6845, + "step": 10400 + }, + { + "epoch": 2.1930794754305576, + "grad_norm": 0.8981749415397644, + "learning_rate": 0.00017718280509196828, + "loss": 1.7542, + "step": 10410 + }, + { + "epoch": 2.195186180017907, + "grad_norm": 0.8597977161407471, + "learning_rate": 0.00017714069962691657, + "loss": 1.664, + "step": 10420 + }, + { + "epoch": 2.197292884605256, + "grad_norm": 0.8526979088783264, + "learning_rate": 0.0001770985603609982, + "loss": 1.7156, + "step": 10430 + }, + { + "epoch": 2.1993995891926055, + "grad_norm": 0.9204460382461548, + "learning_rate": 0.0001770563873126775, + "loss": 1.6903, + "step": 10440 + }, + { + "epoch": 2.2015062937799548, + "grad_norm": 0.8693051934242249, + "learning_rate": 0.00017701418050043342, + "loss": 1.7246, + "step": 10450 + }, + { + "epoch": 2.203612998367304, + "grad_norm": 0.8942760825157166, + "learning_rate": 0.00017697193994275983, + "loss": 1.7206, + "step": 10460 + }, + { + "epoch": 2.2057197029546534, + "grad_norm": 0.8524116277694702, + "learning_rate": 0.00017692966565816532, + "loss": 1.7519, + "step": 10470 + }, + { + "epoch": 2.2078264075420027, + "grad_norm": 0.9318793416023254, + "learning_rate": 0.00017688735766517333, + "loss": 1.7078, + "step": 10480 + }, + { + "epoch": 2.2099331121293515, + "grad_norm": 0.8922185301780701, + "learning_rate": 0.00017684501598232198, + "loss": 1.7328, + "step": 10490 + }, + { + "epoch": 2.212039816716701, + "grad_norm": 0.8484925627708435, + "learning_rate": 0.0001768026406281642, + "loss": 1.7063, + "step": 10500 + }, + { + "epoch": 2.21414652130405, + "grad_norm": 0.907193660736084, + "learning_rate": 0.00017676023162126772, + "loss": 1.714, + "step": 10510 + }, + { + "epoch": 2.2162532258913994, + "grad_norm": 0.9571147561073303, + "learning_rate": 0.00017671778898021488, + "loss": 1.6853, + "step": 10520 + }, + { + "epoch": 2.2183599304787487, + "grad_norm": 0.8519482016563416, + "learning_rate": 0.0001766753127236029, + "loss": 1.6397, + "step": 10530 + }, + { + "epoch": 2.220466635066098, + "grad_norm": 0.8464012742042542, + "learning_rate": 0.00017663280287004364, + "loss": 1.787, + "step": 10540 + }, + { + "epoch": 2.2225733396534473, + "grad_norm": 1.021557092666626, + "learning_rate": 0.00017659025943816373, + "loss": 1.6816, + "step": 10550 + }, + { + "epoch": 2.224680044240796, + "grad_norm": 0.9452894926071167, + "learning_rate": 0.00017654768244660448, + "loss": 1.6548, + "step": 10560 + }, + { + "epoch": 2.2267867488281454, + "grad_norm": 0.8741638660430908, + "learning_rate": 0.00017650507191402194, + "loss": 1.6928, + "step": 10570 + }, + { + "epoch": 2.2288934534154947, + "grad_norm": 0.9022242426872253, + "learning_rate": 0.00017646242785908682, + "loss": 1.6819, + "step": 10580 + }, + { + "epoch": 2.231000158002844, + "grad_norm": 0.9004740118980408, + "learning_rate": 0.00017641975030048454, + "loss": 1.693, + "step": 10590 + }, + { + "epoch": 2.2331068625901933, + "grad_norm": 0.9306367635726929, + "learning_rate": 0.0001763770392569152, + "loss": 1.7281, + "step": 10600 + }, + { + "epoch": 2.2352135671775426, + "grad_norm": 0.9229249358177185, + "learning_rate": 0.00017633429474709356, + "loss": 1.6821, + "step": 10610 + }, + { + "epoch": 2.237320271764892, + "grad_norm": 0.9667083024978638, + "learning_rate": 0.00017629151678974907, + "loss": 1.6875, + "step": 10620 + }, + { + "epoch": 2.239426976352241, + "grad_norm": 0.8459573984146118, + "learning_rate": 0.0001762487054036258, + "loss": 1.7336, + "step": 10630 + }, + { + "epoch": 2.24153368093959, + "grad_norm": 0.9185715913772583, + "learning_rate": 0.00017620586060748252, + "loss": 1.6614, + "step": 10640 + }, + { + "epoch": 2.2436403855269393, + "grad_norm": 0.9300093650817871, + "learning_rate": 0.00017616298242009251, + "loss": 1.6556, + "step": 10650 + }, + { + "epoch": 2.2457470901142886, + "grad_norm": 0.8929982781410217, + "learning_rate": 0.0001761200708602439, + "loss": 1.6979, + "step": 10660 + }, + { + "epoch": 2.247853794701638, + "grad_norm": 0.9404104351997375, + "learning_rate": 0.00017607712594673922, + "loss": 1.7029, + "step": 10670 + }, + { + "epoch": 2.2499604992889872, + "grad_norm": 0.9714049696922302, + "learning_rate": 0.00017603414769839577, + "loss": 1.7188, + "step": 10680 + }, + { + "epoch": 2.2520672038763365, + "grad_norm": 0.8535948991775513, + "learning_rate": 0.00017599113613404538, + "loss": 1.6941, + "step": 10690 + }, + { + "epoch": 2.254173908463686, + "grad_norm": 0.9558704495429993, + "learning_rate": 0.0001759480912725345, + "loss": 1.6855, + "step": 10700 + }, + { + "epoch": 2.256280613051035, + "grad_norm": 0.9051923155784607, + "learning_rate": 0.00017590501313272415, + "loss": 1.7355, + "step": 10710 + }, + { + "epoch": 2.258387317638384, + "grad_norm": 0.8770729899406433, + "learning_rate": 0.00017586190173348996, + "loss": 1.7381, + "step": 10720 + }, + { + "epoch": 2.2604940222257333, + "grad_norm": 0.8900496363639832, + "learning_rate": 0.00017581875709372216, + "loss": 1.7573, + "step": 10730 + }, + { + "epoch": 2.2626007268130826, + "grad_norm": 0.8546925187110901, + "learning_rate": 0.00017577557923232546, + "loss": 1.7518, + "step": 10740 + }, + { + "epoch": 2.264707431400432, + "grad_norm": 1.025599718093872, + "learning_rate": 0.0001757323681682192, + "loss": 1.6861, + "step": 10750 + }, + { + "epoch": 2.266814135987781, + "grad_norm": 0.8811286687850952, + "learning_rate": 0.0001756891239203372, + "loss": 1.7308, + "step": 10760 + }, + { + "epoch": 2.2689208405751304, + "grad_norm": 0.9262977242469788, + "learning_rate": 0.00017564584650762793, + "loss": 1.6927, + "step": 10770 + }, + { + "epoch": 2.2710275451624797, + "grad_norm": 1.1650166511535645, + "learning_rate": 0.00017560253594905425, + "loss": 1.7301, + "step": 10780 + }, + { + "epoch": 2.2731342497498286, + "grad_norm": 0.8260765671730042, + "learning_rate": 0.0001755591922635937, + "loss": 1.6494, + "step": 10790 + }, + { + "epoch": 2.275240954337178, + "grad_norm": 0.9326931238174438, + "learning_rate": 0.00017551581547023819, + "loss": 1.7466, + "step": 10800 + }, + { + "epoch": 2.277347658924527, + "grad_norm": 0.8884145021438599, + "learning_rate": 0.0001754724055879942, + "loss": 1.7261, + "step": 10810 + }, + { + "epoch": 2.2794543635118765, + "grad_norm": 0.907815158367157, + "learning_rate": 0.00017542896263588275, + "loss": 1.6978, + "step": 10820 + }, + { + "epoch": 2.2815610680992258, + "grad_norm": 0.9290655255317688, + "learning_rate": 0.0001753854866329393, + "loss": 1.6516, + "step": 10830 + }, + { + "epoch": 2.283667772686575, + "grad_norm": 0.9227153062820435, + "learning_rate": 0.00017534197759821378, + "loss": 1.6579, + "step": 10840 + }, + { + "epoch": 2.2857744772739244, + "grad_norm": 0.8803461194038391, + "learning_rate": 0.00017529843555077066, + "loss": 1.6537, + "step": 10850 + }, + { + "epoch": 2.2878811818612736, + "grad_norm": 1.0009804964065552, + "learning_rate": 0.00017525486050968875, + "loss": 1.7348, + "step": 10860 + }, + { + "epoch": 2.289987886448623, + "grad_norm": 0.8984347581863403, + "learning_rate": 0.00017521125249406145, + "loss": 1.6616, + "step": 10870 + }, + { + "epoch": 2.292094591035972, + "grad_norm": 0.8679141402244568, + "learning_rate": 0.00017516761152299657, + "loss": 1.6898, + "step": 10880 + }, + { + "epoch": 2.294201295623321, + "grad_norm": 1.022753357887268, + "learning_rate": 0.00017512393761561632, + "loss": 1.6664, + "step": 10890 + }, + { + "epoch": 2.2963080002106704, + "grad_norm": 0.919381856918335, + "learning_rate": 0.00017508023079105736, + "loss": 1.7005, + "step": 10900 + }, + { + "epoch": 2.2984147047980197, + "grad_norm": 0.9226942658424377, + "learning_rate": 0.0001750364910684708, + "loss": 1.7123, + "step": 10910 + }, + { + "epoch": 2.300521409385369, + "grad_norm": 0.9635823369026184, + "learning_rate": 0.00017499271846702213, + "loss": 1.7071, + "step": 10920 + }, + { + "epoch": 2.3026281139727183, + "grad_norm": 0.9143548011779785, + "learning_rate": 0.00017494891300589131, + "loss": 1.6471, + "step": 10930 + }, + { + "epoch": 2.3047348185600676, + "grad_norm": 0.8815865516662598, + "learning_rate": 0.00017490507470427257, + "loss": 1.7108, + "step": 10940 + }, + { + "epoch": 2.3068415231474164, + "grad_norm": 0.8886958956718445, + "learning_rate": 0.0001748612035813747, + "loss": 1.6857, + "step": 10950 + }, + { + "epoch": 2.3089482277347657, + "grad_norm": 0.9607595801353455, + "learning_rate": 0.00017481729965642065, + "loss": 1.8113, + "step": 10960 + }, + { + "epoch": 2.311054932322115, + "grad_norm": 0.8694586157798767, + "learning_rate": 0.00017477336294864805, + "loss": 1.7077, + "step": 10970 + }, + { + "epoch": 2.3131616369094643, + "grad_norm": 0.931908130645752, + "learning_rate": 0.00017472939347730856, + "loss": 1.7296, + "step": 10980 + }, + { + "epoch": 2.3152683414968136, + "grad_norm": 0.8658048510551453, + "learning_rate": 0.00017468539126166846, + "loss": 1.7228, + "step": 10990 + }, + { + "epoch": 2.317375046084163, + "grad_norm": 0.8600513935089111, + "learning_rate": 0.00017464135632100825, + "loss": 1.7335, + "step": 11000 + }, + { + "epoch": 2.319481750671512, + "grad_norm": 0.8735826015472412, + "learning_rate": 0.00017459728867462275, + "loss": 1.6597, + "step": 11010 + }, + { + "epoch": 2.3215884552588615, + "grad_norm": 0.9487072229385376, + "learning_rate": 0.00017455318834182118, + "loss": 1.7637, + "step": 11020 + }, + { + "epoch": 2.323695159846211, + "grad_norm": 0.9656599164009094, + "learning_rate": 0.00017450905534192708, + "loss": 1.705, + "step": 11030 + }, + { + "epoch": 2.3258018644335596, + "grad_norm": 0.8537636995315552, + "learning_rate": 0.0001744648896942782, + "loss": 1.6545, + "step": 11040 + }, + { + "epoch": 2.327908569020909, + "grad_norm": 0.8958091139793396, + "learning_rate": 0.0001744206914182268, + "loss": 1.7148, + "step": 11050 + }, + { + "epoch": 2.330015273608258, + "grad_norm": 0.8656923770904541, + "learning_rate": 0.0001743764605331392, + "loss": 1.7047, + "step": 11060 + }, + { + "epoch": 2.3321219781956075, + "grad_norm": 0.8860281109809875, + "learning_rate": 0.00017433219705839616, + "loss": 1.7217, + "step": 11070 + }, + { + "epoch": 2.334228682782957, + "grad_norm": 0.9574180841445923, + "learning_rate": 0.0001742879010133927, + "loss": 1.7557, + "step": 11080 + }, + { + "epoch": 2.336335387370306, + "grad_norm": 0.9499590992927551, + "learning_rate": 0.00017424357241753807, + "loss": 1.7174, + "step": 11090 + }, + { + "epoch": 2.3384420919576554, + "grad_norm": 0.8961427211761475, + "learning_rate": 0.00017419921129025576, + "loss": 1.762, + "step": 11100 + }, + { + "epoch": 2.3405487965450043, + "grad_norm": 0.910950243473053, + "learning_rate": 0.00017415481765098364, + "loss": 1.7514, + "step": 11110 + }, + { + "epoch": 2.3426555011323535, + "grad_norm": 0.9184181690216064, + "learning_rate": 0.0001741103915191737, + "loss": 1.6957, + "step": 11120 + }, + { + "epoch": 2.344762205719703, + "grad_norm": 0.9001920819282532, + "learning_rate": 0.00017406593291429217, + "loss": 1.6939, + "step": 11130 + }, + { + "epoch": 2.346868910307052, + "grad_norm": 0.8733050227165222, + "learning_rate": 0.00017402144185581965, + "loss": 1.7023, + "step": 11140 + }, + { + "epoch": 2.3489756148944014, + "grad_norm": 0.8722548484802246, + "learning_rate": 0.0001739769183632508, + "loss": 1.7007, + "step": 11150 + }, + { + "epoch": 2.3510823194817507, + "grad_norm": 0.8651793599128723, + "learning_rate": 0.0001739323624560945, + "loss": 1.7203, + "step": 11160 + }, + { + "epoch": 2.3531890240691, + "grad_norm": 0.960211992263794, + "learning_rate": 0.000173887774153874, + "loss": 1.7257, + "step": 11170 + }, + { + "epoch": 2.3552957286564493, + "grad_norm": 0.8777878284454346, + "learning_rate": 0.00017384315347612655, + "loss": 1.667, + "step": 11180 + }, + { + "epoch": 2.3574024332437986, + "grad_norm": 0.9975689649581909, + "learning_rate": 0.00017379850044240368, + "loss": 1.6877, + "step": 11190 + }, + { + "epoch": 2.3595091378311475, + "grad_norm": 0.9246053695678711, + "learning_rate": 0.00017375381507227108, + "loss": 1.6909, + "step": 11200 + }, + { + "epoch": 2.3616158424184968, + "grad_norm": 0.8531624674797058, + "learning_rate": 0.00017370909738530864, + "loss": 1.6837, + "step": 11210 + }, + { + "epoch": 2.363722547005846, + "grad_norm": 0.8880084753036499, + "learning_rate": 0.00017366434740111037, + "loss": 1.7014, + "step": 11220 + }, + { + "epoch": 2.3658292515931953, + "grad_norm": 0.8885220289230347, + "learning_rate": 0.0001736195651392844, + "loss": 1.7163, + "step": 11230 + }, + { + "epoch": 2.3679359561805446, + "grad_norm": 0.9168665409088135, + "learning_rate": 0.0001735747506194531, + "loss": 1.7306, + "step": 11240 + }, + { + "epoch": 2.370042660767894, + "grad_norm": 0.8922172784805298, + "learning_rate": 0.00017352990386125292, + "loss": 1.714, + "step": 11250 + }, + { + "epoch": 2.3721493653552432, + "grad_norm": 0.8480457067489624, + "learning_rate": 0.0001734850248843344, + "loss": 1.6654, + "step": 11260 + }, + { + "epoch": 2.374256069942592, + "grad_norm": 0.862450897693634, + "learning_rate": 0.00017344011370836227, + "loss": 1.645, + "step": 11270 + }, + { + "epoch": 2.3763627745299414, + "grad_norm": 0.9275450706481934, + "learning_rate": 0.00017339517035301532, + "loss": 1.7187, + "step": 11280 + }, + { + "epoch": 2.3784694791172907, + "grad_norm": 0.9133449792861938, + "learning_rate": 0.00017335019483798644, + "loss": 1.7186, + "step": 11290 + }, + { + "epoch": 2.38057618370464, + "grad_norm": 0.915347695350647, + "learning_rate": 0.00017330518718298264, + "loss": 1.7369, + "step": 11300 + }, + { + "epoch": 2.3826828882919893, + "grad_norm": 1.0750889778137207, + "learning_rate": 0.000173260147407725, + "loss": 1.6625, + "step": 11310 + }, + { + "epoch": 2.3847895928793386, + "grad_norm": 0.9652765393257141, + "learning_rate": 0.00017321507553194867, + "loss": 1.6807, + "step": 11320 + }, + { + "epoch": 2.386896297466688, + "grad_norm": 0.9385799169540405, + "learning_rate": 0.00017316997157540288, + "loss": 1.7278, + "step": 11330 + }, + { + "epoch": 2.389003002054037, + "grad_norm": 0.8610515594482422, + "learning_rate": 0.00017312483555785086, + "loss": 1.6791, + "step": 11340 + }, + { + "epoch": 2.3911097066413864, + "grad_norm": 0.9047316908836365, + "learning_rate": 0.00017307966749907, + "loss": 1.7269, + "step": 11350 + }, + { + "epoch": 2.3932164112287353, + "grad_norm": 0.7869266867637634, + "learning_rate": 0.0001730344674188516, + "loss": 1.6686, + "step": 11360 + }, + { + "epoch": 2.3953231158160846, + "grad_norm": 0.8471513390541077, + "learning_rate": 0.00017298923533700107, + "loss": 1.7153, + "step": 11370 + }, + { + "epoch": 2.397429820403434, + "grad_norm": 1.0041714906692505, + "learning_rate": 0.00017294397127333785, + "loss": 1.6957, + "step": 11380 + }, + { + "epoch": 2.399536524990783, + "grad_norm": 0.8737631440162659, + "learning_rate": 0.00017289867524769537, + "loss": 1.7366, + "step": 11390 + }, + { + "epoch": 2.4016432295781325, + "grad_norm": 0.9573279619216919, + "learning_rate": 0.000172853347279921, + "loss": 1.7048, + "step": 11400 + }, + { + "epoch": 2.4037499341654818, + "grad_norm": 0.8812615275382996, + "learning_rate": 0.00017280798738987624, + "loss": 1.7328, + "step": 11410 + }, + { + "epoch": 2.405856638752831, + "grad_norm": 0.8517493605613708, + "learning_rate": 0.0001727625955974365, + "loss": 1.6838, + "step": 11420 + }, + { + "epoch": 2.40796334334018, + "grad_norm": 0.9157832264900208, + "learning_rate": 0.00017271717192249116, + "loss": 1.6942, + "step": 11430 + }, + { + "epoch": 2.410070047927529, + "grad_norm": 0.9208922982215881, + "learning_rate": 0.00017267171638494358, + "loss": 1.6941, + "step": 11440 + }, + { + "epoch": 2.4121767525148785, + "grad_norm": 0.896136462688446, + "learning_rate": 0.00017262622900471105, + "loss": 1.714, + "step": 11450 + }, + { + "epoch": 2.414283457102228, + "grad_norm": 0.8526272177696228, + "learning_rate": 0.00017258070980172494, + "loss": 1.6841, + "step": 11460 + }, + { + "epoch": 2.416390161689577, + "grad_norm": 0.9494383335113525, + "learning_rate": 0.00017253515879593043, + "loss": 1.7294, + "step": 11470 + }, + { + "epoch": 2.4184968662769264, + "grad_norm": 0.907871663570404, + "learning_rate": 0.00017248957600728664, + "loss": 1.7119, + "step": 11480 + }, + { + "epoch": 2.4206035708642757, + "grad_norm": 1.0190140008926392, + "learning_rate": 0.00017244396145576672, + "loss": 1.6932, + "step": 11490 + }, + { + "epoch": 2.422710275451625, + "grad_norm": 0.8717211484909058, + "learning_rate": 0.0001723983151613576, + "loss": 1.6928, + "step": 11500 + }, + { + "epoch": 2.4248169800389743, + "grad_norm": 0.8648276329040527, + "learning_rate": 0.0001723526371440603, + "loss": 1.7075, + "step": 11510 + }, + { + "epoch": 2.426923684626323, + "grad_norm": 0.8947681784629822, + "learning_rate": 0.0001723069274238895, + "loss": 1.6637, + "step": 11520 + }, + { + "epoch": 2.4290303892136724, + "grad_norm": 0.8753421306610107, + "learning_rate": 0.000172261186020874, + "loss": 1.7371, + "step": 11530 + }, + { + "epoch": 2.4311370938010217, + "grad_norm": 0.8403404951095581, + "learning_rate": 0.00017221541295505636, + "loss": 1.6949, + "step": 11540 + }, + { + "epoch": 2.433243798388371, + "grad_norm": 0.9043484926223755, + "learning_rate": 0.00017216960824649303, + "loss": 1.6836, + "step": 11550 + }, + { + "epoch": 2.4353505029757203, + "grad_norm": 0.8930955529212952, + "learning_rate": 0.00017212377191525434, + "loss": 1.7813, + "step": 11560 + }, + { + "epoch": 2.4374572075630696, + "grad_norm": 0.9738860130310059, + "learning_rate": 0.00017207790398142446, + "loss": 1.6875, + "step": 11570 + }, + { + "epoch": 2.439563912150419, + "grad_norm": 0.9277392029762268, + "learning_rate": 0.0001720320044651014, + "loss": 1.6789, + "step": 11580 + }, + { + "epoch": 2.4416706167377678, + "grad_norm": 0.8932106494903564, + "learning_rate": 0.0001719860733863971, + "loss": 1.7109, + "step": 11590 + }, + { + "epoch": 2.443777321325117, + "grad_norm": 0.9118972420692444, + "learning_rate": 0.00017194011076543717, + "loss": 1.7243, + "step": 11600 + }, + { + "epoch": 2.4458840259124663, + "grad_norm": 0.8895049691200256, + "learning_rate": 0.0001718941166223612, + "loss": 1.7096, + "step": 11610 + }, + { + "epoch": 2.4479907304998156, + "grad_norm": 0.8548287153244019, + "learning_rate": 0.00017184809097732246, + "loss": 1.6847, + "step": 11620 + }, + { + "epoch": 2.450097435087165, + "grad_norm": 0.8783696293830872, + "learning_rate": 0.00017180203385048812, + "loss": 1.7226, + "step": 11630 + }, + { + "epoch": 2.4522041396745142, + "grad_norm": 0.8651697039604187, + "learning_rate": 0.00017175594526203905, + "loss": 1.7217, + "step": 11640 + }, + { + "epoch": 2.4543108442618635, + "grad_norm": 0.9524984359741211, + "learning_rate": 0.00017170982523217004, + "loss": 1.7337, + "step": 11650 + }, + { + "epoch": 2.4564175488492124, + "grad_norm": 0.8341417908668518, + "learning_rate": 0.00017166367378108953, + "loss": 1.7415, + "step": 11660 + }, + { + "epoch": 2.4585242534365617, + "grad_norm": 0.9668021202087402, + "learning_rate": 0.00017161749092901984, + "loss": 1.6845, + "step": 11670 + }, + { + "epoch": 2.460630958023911, + "grad_norm": 1.0953508615493774, + "learning_rate": 0.00017157127669619688, + "loss": 1.7007, + "step": 11680 + }, + { + "epoch": 2.4627376626112603, + "grad_norm": 0.9357819557189941, + "learning_rate": 0.00017152503110287048, + "loss": 1.7105, + "step": 11690 + }, + { + "epoch": 2.4648443671986096, + "grad_norm": 1.1564433574676514, + "learning_rate": 0.00017147875416930416, + "loss": 1.6699, + "step": 11700 + }, + { + "epoch": 2.466951071785959, + "grad_norm": 0.9135658740997314, + "learning_rate": 0.00017143244591577515, + "loss": 1.6882, + "step": 11710 + }, + { + "epoch": 2.469057776373308, + "grad_norm": 0.9228572249412537, + "learning_rate": 0.00017138610636257436, + "loss": 1.6808, + "step": 11720 + }, + { + "epoch": 2.4711644809606574, + "grad_norm": 0.914090096950531, + "learning_rate": 0.00017133973553000654, + "loss": 1.6233, + "step": 11730 + }, + { + "epoch": 2.4732711855480067, + "grad_norm": 0.9056432843208313, + "learning_rate": 0.00017129333343839003, + "loss": 1.7001, + "step": 11740 + }, + { + "epoch": 2.4753778901353556, + "grad_norm": 0.9140064120292664, + "learning_rate": 0.00017124690010805692, + "loss": 1.7155, + "step": 11750 + }, + { + "epoch": 2.477484594722705, + "grad_norm": 0.8333889245986938, + "learning_rate": 0.00017120043555935298, + "loss": 1.6708, + "step": 11760 + }, + { + "epoch": 2.479591299310054, + "grad_norm": 0.8906539678573608, + "learning_rate": 0.00017115393981263768, + "loss": 1.694, + "step": 11770 + }, + { + "epoch": 2.4816980038974035, + "grad_norm": 0.8971114158630371, + "learning_rate": 0.0001711074128882841, + "loss": 1.7009, + "step": 11780 + }, + { + "epoch": 2.4838047084847528, + "grad_norm": 0.856102466583252, + "learning_rate": 0.00017106085480667903, + "loss": 1.7115, + "step": 11790 + }, + { + "epoch": 2.485911413072102, + "grad_norm": 0.9299646019935608, + "learning_rate": 0.00017101426558822292, + "loss": 1.7067, + "step": 11800 + }, + { + "epoch": 2.4880181176594514, + "grad_norm": 0.9112943410873413, + "learning_rate": 0.00017096764525332986, + "loss": 1.6979, + "step": 11810 + }, + { + "epoch": 2.4901248222468, + "grad_norm": 0.917447566986084, + "learning_rate": 0.00017092099382242748, + "loss": 1.7305, + "step": 11820 + }, + { + "epoch": 2.4922315268341495, + "grad_norm": 0.9493505358695984, + "learning_rate": 0.0001708743113159572, + "loss": 1.6831, + "step": 11830 + }, + { + "epoch": 2.494338231421499, + "grad_norm": 0.9228153824806213, + "learning_rate": 0.0001708275977543739, + "loss": 1.6944, + "step": 11840 + }, + { + "epoch": 2.496444936008848, + "grad_norm": 0.8501965999603271, + "learning_rate": 0.0001707808531581462, + "loss": 1.7237, + "step": 11850 + }, + { + "epoch": 2.4985516405961974, + "grad_norm": 0.9229296445846558, + "learning_rate": 0.00017073407754775622, + "loss": 1.7281, + "step": 11860 + }, + { + "epoch": 2.5006583451835467, + "grad_norm": 0.9164074659347534, + "learning_rate": 0.0001706872709436997, + "loss": 1.7504, + "step": 11870 + }, + { + "epoch": 2.502765049770896, + "grad_norm": 0.9984068274497986, + "learning_rate": 0.00017064043336648599, + "loss": 1.7848, + "step": 11880 + }, + { + "epoch": 2.5048717543582453, + "grad_norm": 0.9881888031959534, + "learning_rate": 0.00017059356483663796, + "loss": 1.6548, + "step": 11890 + }, + { + "epoch": 2.5069784589455946, + "grad_norm": 0.8894940614700317, + "learning_rate": 0.00017054666537469213, + "loss": 1.6994, + "step": 11900 + }, + { + "epoch": 2.5090851635329434, + "grad_norm": 0.93770432472229, + "learning_rate": 0.00017049973500119845, + "loss": 1.6519, + "step": 11910 + }, + { + "epoch": 2.5111918681202927, + "grad_norm": 0.9530473351478577, + "learning_rate": 0.00017045277373672047, + "loss": 1.7667, + "step": 11920 + }, + { + "epoch": 2.513298572707642, + "grad_norm": 1.0557433366775513, + "learning_rate": 0.00017040578160183536, + "loss": 1.6852, + "step": 11930 + }, + { + "epoch": 2.5154052772949913, + "grad_norm": 0.9705979228019714, + "learning_rate": 0.0001703587586171337, + "loss": 1.7019, + "step": 11940 + }, + { + "epoch": 2.5175119818823406, + "grad_norm": 0.9046064019203186, + "learning_rate": 0.0001703117048032196, + "loss": 1.7381, + "step": 11950 + }, + { + "epoch": 2.51961868646969, + "grad_norm": 0.9303194880485535, + "learning_rate": 0.0001702646201807107, + "loss": 1.692, + "step": 11960 + }, + { + "epoch": 2.521725391057039, + "grad_norm": 0.903297483921051, + "learning_rate": 0.0001702175047702382, + "loss": 1.6716, + "step": 11970 + }, + { + "epoch": 2.523832095644388, + "grad_norm": 0.8236584067344666, + "learning_rate": 0.00017017035859244673, + "loss": 1.7018, + "step": 11980 + }, + { + "epoch": 2.525938800231738, + "grad_norm": 0.8487248420715332, + "learning_rate": 0.00017012318166799437, + "loss": 1.7035, + "step": 11990 + }, + { + "epoch": 2.5280455048190866, + "grad_norm": 0.8535507917404175, + "learning_rate": 0.00017007597401755276, + "loss": 1.6756, + "step": 12000 + }, + { + "epoch": 2.530152209406436, + "grad_norm": 0.8676731586456299, + "learning_rate": 0.00017002873566180688, + "loss": 1.7124, + "step": 12010 + }, + { + "epoch": 2.532258913993785, + "grad_norm": 0.9095235466957092, + "learning_rate": 0.0001699814666214553, + "loss": 1.6922, + "step": 12020 + }, + { + "epoch": 2.5343656185811345, + "grad_norm": 0.8557278513908386, + "learning_rate": 0.00016993416691720998, + "loss": 1.6995, + "step": 12030 + }, + { + "epoch": 2.536472323168484, + "grad_norm": 0.9567210674285889, + "learning_rate": 0.00016988683656979624, + "loss": 1.6733, + "step": 12040 + }, + { + "epoch": 2.5385790277558327, + "grad_norm": 0.8367411494255066, + "learning_rate": 0.00016983947559995297, + "loss": 1.7573, + "step": 12050 + }, + { + "epoch": 2.5406857323431824, + "grad_norm": 0.9761552810668945, + "learning_rate": 0.00016979208402843237, + "loss": 1.7031, + "step": 12060 + }, + { + "epoch": 2.5427924369305313, + "grad_norm": 0.9026761651039124, + "learning_rate": 0.0001697446618760001, + "loss": 1.6968, + "step": 12070 + }, + { + "epoch": 2.5448991415178805, + "grad_norm": 0.8959203362464905, + "learning_rate": 0.00016969720916343515, + "loss": 1.6517, + "step": 12080 + }, + { + "epoch": 2.54700584610523, + "grad_norm": 0.8353654742240906, + "learning_rate": 0.00016964972591153, + "loss": 1.6856, + "step": 12090 + }, + { + "epoch": 2.549112550692579, + "grad_norm": 1.0780528783798218, + "learning_rate": 0.00016960221214109045, + "loss": 1.6545, + "step": 12100 + }, + { + "epoch": 2.5512192552799284, + "grad_norm": 0.8808447122573853, + "learning_rate": 0.00016955466787293576, + "loss": 1.6997, + "step": 12110 + }, + { + "epoch": 2.5533259598672777, + "grad_norm": 0.9192379117012024, + "learning_rate": 0.00016950709312789833, + "loss": 1.7434, + "step": 12120 + }, + { + "epoch": 2.555432664454627, + "grad_norm": 0.9900842905044556, + "learning_rate": 0.00016945948792682417, + "loss": 1.7091, + "step": 12130 + }, + { + "epoch": 2.557539369041976, + "grad_norm": 0.9383872747421265, + "learning_rate": 0.0001694118522905725, + "loss": 1.6993, + "step": 12140 + }, + { + "epoch": 2.559646073629325, + "grad_norm": 0.9453622102737427, + "learning_rate": 0.00016936418624001592, + "loss": 1.7373, + "step": 12150 + }, + { + "epoch": 2.5617527782166745, + "grad_norm": 0.8687297701835632, + "learning_rate": 0.0001693164897960403, + "loss": 1.7023, + "step": 12160 + }, + { + "epoch": 2.5638594828040238, + "grad_norm": 0.8873741030693054, + "learning_rate": 0.00016926876297954492, + "loss": 1.6679, + "step": 12170 + }, + { + "epoch": 2.565966187391373, + "grad_norm": 0.9118444919586182, + "learning_rate": 0.00016922100581144228, + "loss": 1.702, + "step": 12180 + }, + { + "epoch": 2.5680728919787223, + "grad_norm": 0.9042273759841919, + "learning_rate": 0.00016917321831265826, + "loss": 1.7259, + "step": 12190 + }, + { + "epoch": 2.5701795965660716, + "grad_norm": 0.9207640886306763, + "learning_rate": 0.00016912540050413195, + "loss": 1.6712, + "step": 12200 + }, + { + "epoch": 2.5722863011534205, + "grad_norm": 0.9605659246444702, + "learning_rate": 0.00016907755240681577, + "loss": 1.7248, + "step": 12210 + }, + { + "epoch": 2.5743930057407702, + "grad_norm": 0.9034086465835571, + "learning_rate": 0.00016902967404167538, + "loss": 1.6718, + "step": 12220 + }, + { + "epoch": 2.576499710328119, + "grad_norm": 0.9003251791000366, + "learning_rate": 0.00016898176542968975, + "loss": 1.7107, + "step": 12230 + }, + { + "epoch": 2.5786064149154684, + "grad_norm": 0.9571187496185303, + "learning_rate": 0.00016893382659185105, + "loss": 1.671, + "step": 12240 + }, + { + "epoch": 2.5807131195028177, + "grad_norm": 0.9620792269706726, + "learning_rate": 0.00016888585754916476, + "loss": 1.7082, + "step": 12250 + }, + { + "epoch": 2.582819824090167, + "grad_norm": 0.9372028112411499, + "learning_rate": 0.0001688378583226495, + "loss": 1.7293, + "step": 12260 + }, + { + "epoch": 2.5849265286775163, + "grad_norm": 0.9726297855377197, + "learning_rate": 0.00016878982893333717, + "loss": 1.7104, + "step": 12270 + }, + { + "epoch": 2.5870332332648656, + "grad_norm": 1.0220943689346313, + "learning_rate": 0.00016874176940227296, + "loss": 1.7373, + "step": 12280 + }, + { + "epoch": 2.589139937852215, + "grad_norm": 0.8310849666595459, + "learning_rate": 0.0001686936797505151, + "loss": 1.6991, + "step": 12290 + }, + { + "epoch": 2.5912466424395637, + "grad_norm": 0.8683304190635681, + "learning_rate": 0.00016864555999913518, + "loss": 1.7168, + "step": 12300 + }, + { + "epoch": 2.593353347026913, + "grad_norm": 0.8827438354492188, + "learning_rate": 0.00016859741016921786, + "loss": 1.702, + "step": 12310 + }, + { + "epoch": 2.5954600516142623, + "grad_norm": 0.91736900806427, + "learning_rate": 0.00016854923028186111, + "loss": 1.6877, + "step": 12320 + }, + { + "epoch": 2.5975667562016116, + "grad_norm": 0.8782617449760437, + "learning_rate": 0.00016850102035817588, + "loss": 1.7195, + "step": 12330 + }, + { + "epoch": 2.599673460788961, + "grad_norm": 0.9509816765785217, + "learning_rate": 0.0001684527804192865, + "loss": 1.7482, + "step": 12340 + }, + { + "epoch": 2.60178016537631, + "grad_norm": 0.9325737357139587, + "learning_rate": 0.00016840451048633025, + "loss": 1.6947, + "step": 12350 + }, + { + "epoch": 2.6038868699636595, + "grad_norm": 0.9182351231575012, + "learning_rate": 0.0001683562105804577, + "loss": 1.6918, + "step": 12360 + }, + { + "epoch": 2.6059935745510083, + "grad_norm": 0.919793426990509, + "learning_rate": 0.00016830788072283247, + "loss": 1.7496, + "step": 12370 + }, + { + "epoch": 2.608100279138358, + "grad_norm": 0.8799266219139099, + "learning_rate": 0.00016825952093463135, + "loss": 1.6664, + "step": 12380 + }, + { + "epoch": 2.610206983725707, + "grad_norm": 0.9467419385910034, + "learning_rate": 0.00016821113123704424, + "loss": 1.7167, + "step": 12390 + }, + { + "epoch": 2.612313688313056, + "grad_norm": 0.9151462316513062, + "learning_rate": 0.00016816271165127412, + "loss": 1.7149, + "step": 12400 + }, + { + "epoch": 2.6144203929004055, + "grad_norm": 0.895194947719574, + "learning_rate": 0.00016811426219853702, + "loss": 1.7368, + "step": 12410 + }, + { + "epoch": 2.616527097487755, + "grad_norm": 1.095924735069275, + "learning_rate": 0.00016806578290006225, + "loss": 1.6619, + "step": 12420 + }, + { + "epoch": 2.618633802075104, + "grad_norm": 0.8870509266853333, + "learning_rate": 0.00016801727377709194, + "loss": 1.6918, + "step": 12430 + }, + { + "epoch": 2.6207405066624534, + "grad_norm": 0.967845618724823, + "learning_rate": 0.00016796873485088148, + "loss": 1.7309, + "step": 12440 + }, + { + "epoch": 2.6228472112498027, + "grad_norm": 1.0131421089172363, + "learning_rate": 0.00016792016614269924, + "loss": 1.6849, + "step": 12450 + }, + { + "epoch": 2.6249539158371515, + "grad_norm": 0.9242698550224304, + "learning_rate": 0.00016787156767382659, + "loss": 1.6903, + "step": 12460 + }, + { + "epoch": 2.627060620424501, + "grad_norm": 0.8336485624313354, + "learning_rate": 0.00016782293946555806, + "loss": 1.7431, + "step": 12470 + }, + { + "epoch": 2.62916732501185, + "grad_norm": 0.8880660533905029, + "learning_rate": 0.0001677742815392012, + "loss": 1.6785, + "step": 12480 + }, + { + "epoch": 2.6312740295991994, + "grad_norm": 0.918901264667511, + "learning_rate": 0.00016772559391607642, + "loss": 1.6933, + "step": 12490 + }, + { + "epoch": 2.6333807341865487, + "grad_norm": 1.0502138137817383, + "learning_rate": 0.00016767687661751733, + "loss": 1.7067, + "step": 12500 + }, + { + "epoch": 2.635487438773898, + "grad_norm": 0.842281699180603, + "learning_rate": 0.00016762812966487044, + "loss": 1.7439, + "step": 12510 + }, + { + "epoch": 2.6375941433612473, + "grad_norm": 0.9271451234817505, + "learning_rate": 0.0001675793530794953, + "loss": 1.7082, + "step": 12520 + }, + { + "epoch": 2.639700847948596, + "grad_norm": 0.9502926468849182, + "learning_rate": 0.0001675305468827644, + "loss": 1.7357, + "step": 12530 + }, + { + "epoch": 2.641807552535946, + "grad_norm": 0.9139789342880249, + "learning_rate": 0.00016748171109606328, + "loss": 1.6743, + "step": 12540 + }, + { + "epoch": 2.6439142571232948, + "grad_norm": 0.8416628837585449, + "learning_rate": 0.00016743284574079033, + "loss": 1.7216, + "step": 12550 + }, + { + "epoch": 2.646020961710644, + "grad_norm": 0.8748312592506409, + "learning_rate": 0.000167383950838357, + "loss": 1.678, + "step": 12560 + }, + { + "epoch": 2.6481276662979933, + "grad_norm": 0.8777117729187012, + "learning_rate": 0.00016733502641018766, + "loss": 1.682, + "step": 12570 + }, + { + "epoch": 2.6502343708853426, + "grad_norm": 0.9002516269683838, + "learning_rate": 0.00016728607247771957, + "loss": 1.7076, + "step": 12580 + }, + { + "epoch": 2.652341075472692, + "grad_norm": 0.9774075746536255, + "learning_rate": 0.000167237089062403, + "loss": 1.7174, + "step": 12590 + }, + { + "epoch": 2.6544477800600412, + "grad_norm": 0.8962463140487671, + "learning_rate": 0.00016718807618570106, + "loss": 1.7087, + "step": 12600 + }, + { + "epoch": 2.6565544846473905, + "grad_norm": 0.9497388601303101, + "learning_rate": 0.00016713903386908984, + "loss": 1.7252, + "step": 12610 + }, + { + "epoch": 2.6586611892347394, + "grad_norm": 0.907230794429779, + "learning_rate": 0.00016708996213405826, + "loss": 1.7184, + "step": 12620 + }, + { + "epoch": 2.6607678938220887, + "grad_norm": 0.8758313059806824, + "learning_rate": 0.00016704086100210815, + "loss": 1.7283, + "step": 12630 + }, + { + "epoch": 2.662874598409438, + "grad_norm": 0.9444327354431152, + "learning_rate": 0.00016699173049475425, + "loss": 1.7191, + "step": 12640 + }, + { + "epoch": 2.6649813029967873, + "grad_norm": 0.967747151851654, + "learning_rate": 0.00016694257063352416, + "loss": 1.7209, + "step": 12650 + }, + { + "epoch": 2.6670880075841366, + "grad_norm": 0.8999049067497253, + "learning_rate": 0.00016689338143995833, + "loss": 1.6764, + "step": 12660 + }, + { + "epoch": 2.669194712171486, + "grad_norm": 0.9095073342323303, + "learning_rate": 0.0001668441629356101, + "loss": 1.6978, + "step": 12670 + }, + { + "epoch": 2.671301416758835, + "grad_norm": 0.9646686315536499, + "learning_rate": 0.00016679491514204556, + "loss": 1.6882, + "step": 12680 + }, + { + "epoch": 2.673408121346184, + "grad_norm": 0.9153634309768677, + "learning_rate": 0.00016674563808084377, + "loss": 1.747, + "step": 12690 + }, + { + "epoch": 2.6755148259335337, + "grad_norm": 0.9942224621772766, + "learning_rate": 0.00016669633177359647, + "loss": 1.6708, + "step": 12700 + }, + { + "epoch": 2.6776215305208826, + "grad_norm": 0.9104297757148743, + "learning_rate": 0.0001666469962419083, + "loss": 1.6708, + "step": 12710 + }, + { + "epoch": 2.679728235108232, + "grad_norm": 0.8879741430282593, + "learning_rate": 0.00016659763150739677, + "loss": 1.7089, + "step": 12720 + }, + { + "epoch": 2.681834939695581, + "grad_norm": 0.9687016606330872, + "learning_rate": 0.00016654823759169199, + "loss": 1.7089, + "step": 12730 + }, + { + "epoch": 2.6839416442829305, + "grad_norm": 0.9552226662635803, + "learning_rate": 0.00016649881451643705, + "loss": 1.7382, + "step": 12740 + }, + { + "epoch": 2.6860483488702798, + "grad_norm": 0.9607418775558472, + "learning_rate": 0.0001664493623032877, + "loss": 1.7465, + "step": 12750 + }, + { + "epoch": 2.688155053457629, + "grad_norm": 1.0158402919769287, + "learning_rate": 0.00016639988097391252, + "loss": 1.7237, + "step": 12760 + }, + { + "epoch": 2.6902617580449784, + "grad_norm": 0.9217173457145691, + "learning_rate": 0.00016635037054999285, + "loss": 1.6807, + "step": 12770 + }, + { + "epoch": 2.692368462632327, + "grad_norm": 0.8327500820159912, + "learning_rate": 0.00016630083105322266, + "loss": 1.709, + "step": 12780 + }, + { + "epoch": 2.6944751672196765, + "grad_norm": 0.8606378436088562, + "learning_rate": 0.0001662512625053089, + "loss": 1.685, + "step": 12790 + }, + { + "epoch": 2.696581871807026, + "grad_norm": 1.037510871887207, + "learning_rate": 0.00016620166492797096, + "loss": 1.7485, + "step": 12800 + }, + { + "epoch": 2.698688576394375, + "grad_norm": 0.8210626244544983, + "learning_rate": 0.00016615203834294119, + "loss": 1.6761, + "step": 12810 + }, + { + "epoch": 2.7007952809817244, + "grad_norm": 0.9673810601234436, + "learning_rate": 0.00016610238277196446, + "loss": 1.7397, + "step": 12820 + }, + { + "epoch": 2.7029019855690737, + "grad_norm": 0.9024403691291809, + "learning_rate": 0.00016605269823679851, + "loss": 1.6665, + "step": 12830 + }, + { + "epoch": 2.705008690156423, + "grad_norm": 0.933525800704956, + "learning_rate": 0.00016600298475921365, + "loss": 1.7333, + "step": 12840 + }, + { + "epoch": 2.707115394743772, + "grad_norm": 0.9351423978805542, + "learning_rate": 0.00016595324236099294, + "loss": 1.7284, + "step": 12850 + }, + { + "epoch": 2.7092220993311216, + "grad_norm": 1.0248723030090332, + "learning_rate": 0.0001659034710639321, + "loss": 1.7172, + "step": 12860 + }, + { + "epoch": 2.7113288039184704, + "grad_norm": 0.8728225827217102, + "learning_rate": 0.00016585367088983946, + "loss": 1.6741, + "step": 12870 + }, + { + "epoch": 2.7134355085058197, + "grad_norm": 0.9945250153541565, + "learning_rate": 0.0001658038418605361, + "loss": 1.7471, + "step": 12880 + }, + { + "epoch": 2.715542213093169, + "grad_norm": 0.9469719529151917, + "learning_rate": 0.00016575398399785562, + "loss": 1.7206, + "step": 12890 + }, + { + "epoch": 2.7176489176805183, + "grad_norm": 0.8776702880859375, + "learning_rate": 0.00016570409732364437, + "loss": 1.7085, + "step": 12900 + }, + { + "epoch": 2.7197556222678676, + "grad_norm": 0.8826903104782104, + "learning_rate": 0.00016565418185976127, + "loss": 1.6886, + "step": 12910 + }, + { + "epoch": 2.7218623268552165, + "grad_norm": 0.8356149196624756, + "learning_rate": 0.00016560423762807783, + "loss": 1.7057, + "step": 12920 + }, + { + "epoch": 2.723969031442566, + "grad_norm": 0.8950599431991577, + "learning_rate": 0.00016555426465047823, + "loss": 1.7086, + "step": 12930 + }, + { + "epoch": 2.726075736029915, + "grad_norm": 0.9071326851844788, + "learning_rate": 0.00016550426294885919, + "loss": 1.6641, + "step": 12940 + }, + { + "epoch": 2.7281824406172643, + "grad_norm": 0.9181045293807983, + "learning_rate": 0.00016545423254513004, + "loss": 1.7222, + "step": 12950 + }, + { + "epoch": 2.7302891452046136, + "grad_norm": 0.9889811277389526, + "learning_rate": 0.0001654041734612127, + "loss": 1.6696, + "step": 12960 + }, + { + "epoch": 2.732395849791963, + "grad_norm": 0.9322980046272278, + "learning_rate": 0.00016535408571904164, + "loss": 1.6845, + "step": 12970 + }, + { + "epoch": 2.7345025543793122, + "grad_norm": 0.9056413769721985, + "learning_rate": 0.00016530396934056384, + "loss": 1.738, + "step": 12980 + }, + { + "epoch": 2.7366092589666615, + "grad_norm": 1.0681997537612915, + "learning_rate": 0.00016525382434773894, + "loss": 1.7147, + "step": 12990 + }, + { + "epoch": 2.738715963554011, + "grad_norm": 0.9294453263282776, + "learning_rate": 0.00016520365076253904, + "loss": 1.7205, + "step": 13000 + }, + { + "epoch": 2.7408226681413597, + "grad_norm": 0.950069010257721, + "learning_rate": 0.00016515344860694876, + "loss": 1.7498, + "step": 13010 + }, + { + "epoch": 2.742929372728709, + "grad_norm": 0.9262808561325073, + "learning_rate": 0.00016510321790296525, + "loss": 1.7285, + "step": 13020 + }, + { + "epoch": 2.7450360773160583, + "grad_norm": 0.865739643573761, + "learning_rate": 0.00016505295867259823, + "loss": 1.6656, + "step": 13030 + }, + { + "epoch": 2.7471427819034075, + "grad_norm": 1.1308355331420898, + "learning_rate": 0.0001650026709378698, + "loss": 1.7217, + "step": 13040 + }, + { + "epoch": 2.749249486490757, + "grad_norm": 0.882122814655304, + "learning_rate": 0.00016495235472081468, + "loss": 1.6862, + "step": 13050 + }, + { + "epoch": 2.751356191078106, + "grad_norm": 0.928149402141571, + "learning_rate": 0.00016490201004348, + "loss": 1.7385, + "step": 13060 + }, + { + "epoch": 2.7534628956654554, + "grad_norm": 0.8748145699501038, + "learning_rate": 0.00016485163692792534, + "loss": 1.7096, + "step": 13070 + }, + { + "epoch": 2.7555696002528043, + "grad_norm": 0.8686920404434204, + "learning_rate": 0.00016480123539622281, + "loss": 1.7108, + "step": 13080 + }, + { + "epoch": 2.757676304840154, + "grad_norm": 0.90756756067276, + "learning_rate": 0.00016475080547045687, + "loss": 1.6767, + "step": 13090 + }, + { + "epoch": 2.759783009427503, + "grad_norm": 0.9196756482124329, + "learning_rate": 0.00016470034717272456, + "loss": 1.7034, + "step": 13100 + }, + { + "epoch": 2.761889714014852, + "grad_norm": 0.9286429286003113, + "learning_rate": 0.0001646498605251352, + "loss": 1.6663, + "step": 13110 + }, + { + "epoch": 2.7639964186022015, + "grad_norm": 0.911522388458252, + "learning_rate": 0.00016459934554981066, + "loss": 1.7007, + "step": 13120 + }, + { + "epoch": 2.7661031231895508, + "grad_norm": 0.8197303414344788, + "learning_rate": 0.00016454880226888515, + "loss": 1.6557, + "step": 13130 + }, + { + "epoch": 2.7682098277769, + "grad_norm": 0.9045342206954956, + "learning_rate": 0.00016449823070450531, + "loss": 1.6377, + "step": 13140 + }, + { + "epoch": 2.7703165323642494, + "grad_norm": 0.9467363953590393, + "learning_rate": 0.0001644476308788302, + "loss": 1.6749, + "step": 13150 + }, + { + "epoch": 2.7724232369515986, + "grad_norm": 0.8628239035606384, + "learning_rate": 0.00016439700281403114, + "loss": 1.7064, + "step": 13160 + }, + { + "epoch": 2.7745299415389475, + "grad_norm": 0.8383623957633972, + "learning_rate": 0.00016434634653229199, + "loss": 1.6991, + "step": 13170 + }, + { + "epoch": 2.776636646126297, + "grad_norm": 0.870199978351593, + "learning_rate": 0.00016429566205580884, + "loss": 1.7157, + "step": 13180 + }, + { + "epoch": 2.778743350713646, + "grad_norm": 0.9087043404579163, + "learning_rate": 0.00016424494940679024, + "loss": 1.742, + "step": 13190 + }, + { + "epoch": 2.7808500553009954, + "grad_norm": 0.9809213280677795, + "learning_rate": 0.00016419420860745699, + "loss": 1.7031, + "step": 13200 + }, + { + "epoch": 2.7829567598883447, + "grad_norm": 0.9322521686553955, + "learning_rate": 0.0001641434396800423, + "loss": 1.7097, + "step": 13210 + }, + { + "epoch": 2.785063464475694, + "grad_norm": 0.8767238259315491, + "learning_rate": 0.00016409264264679164, + "loss": 1.641, + "step": 13220 + }, + { + "epoch": 2.7871701690630433, + "grad_norm": 0.9374739527702332, + "learning_rate": 0.00016404181752996289, + "loss": 1.6943, + "step": 13230 + }, + { + "epoch": 2.789276873650392, + "grad_norm": 0.9542452096939087, + "learning_rate": 0.00016399096435182613, + "loss": 1.6838, + "step": 13240 + }, + { + "epoch": 2.791383578237742, + "grad_norm": 0.9805099368095398, + "learning_rate": 0.00016394008313466376, + "loss": 1.7133, + "step": 13250 + }, + { + "epoch": 2.7934902828250907, + "grad_norm": 0.8735339045524597, + "learning_rate": 0.00016388917390077054, + "loss": 1.7007, + "step": 13260 + }, + { + "epoch": 2.79559698741244, + "grad_norm": 0.8826082944869995, + "learning_rate": 0.00016383823667245344, + "loss": 1.7285, + "step": 13270 + }, + { + "epoch": 2.7977036919997893, + "grad_norm": 1.036288857460022, + "learning_rate": 0.00016378727147203166, + "loss": 1.717, + "step": 13280 + }, + { + "epoch": 2.7998103965871386, + "grad_norm": 0.9070268869400024, + "learning_rate": 0.0001637362783218368, + "loss": 1.7483, + "step": 13290 + }, + { + "epoch": 2.801917101174488, + "grad_norm": 0.9468526244163513, + "learning_rate": 0.00016368525724421248, + "loss": 1.716, + "step": 13300 + }, + { + "epoch": 2.804023805761837, + "grad_norm": 0.9099157452583313, + "learning_rate": 0.00016363420826151482, + "loss": 1.7263, + "step": 13310 + }, + { + "epoch": 2.8061305103491865, + "grad_norm": 0.9246546626091003, + "learning_rate": 0.00016358313139611195, + "loss": 1.7236, + "step": 13320 + }, + { + "epoch": 2.8082372149365353, + "grad_norm": 0.9395228624343872, + "learning_rate": 0.00016353202667038433, + "loss": 1.7252, + "step": 13330 + }, + { + "epoch": 2.8103439195238846, + "grad_norm": 0.9023792743682861, + "learning_rate": 0.0001634808941067246, + "loss": 1.7201, + "step": 13340 + }, + { + "epoch": 2.812450624111234, + "grad_norm": 0.9175665974617004, + "learning_rate": 0.0001634297337275376, + "loss": 1.7183, + "step": 13350 + }, + { + "epoch": 2.814557328698583, + "grad_norm": 0.9023197293281555, + "learning_rate": 0.00016337854555524038, + "loss": 1.6694, + "step": 13360 + }, + { + "epoch": 2.8166640332859325, + "grad_norm": 0.9740870594978333, + "learning_rate": 0.0001633273296122621, + "loss": 1.6947, + "step": 13370 + }, + { + "epoch": 2.818770737873282, + "grad_norm": 0.822769820690155, + "learning_rate": 0.0001632760859210442, + "loss": 1.7328, + "step": 13380 + }, + { + "epoch": 2.820877442460631, + "grad_norm": 0.8751346468925476, + "learning_rate": 0.00016322481450404015, + "loss": 1.6763, + "step": 13390 + }, + { + "epoch": 2.82298414704798, + "grad_norm": 0.9259299635887146, + "learning_rate": 0.00016317351538371565, + "loss": 1.6463, + "step": 13400 + }, + { + "epoch": 2.8250908516353297, + "grad_norm": 0.9039295315742493, + "learning_rate": 0.0001631221885825485, + "loss": 1.6898, + "step": 13410 + }, + { + "epoch": 2.8271975562226785, + "grad_norm": 0.8728259801864624, + "learning_rate": 0.0001630708341230287, + "loss": 1.6981, + "step": 13420 + }, + { + "epoch": 2.829304260810028, + "grad_norm": 0.8956354856491089, + "learning_rate": 0.00016301945202765826, + "loss": 1.6968, + "step": 13430 + }, + { + "epoch": 2.831410965397377, + "grad_norm": 1.1208888292312622, + "learning_rate": 0.00016296804231895142, + "loss": 1.7184, + "step": 13440 + }, + { + "epoch": 2.8335176699847264, + "grad_norm": 0.9479233622550964, + "learning_rate": 0.00016291660501943438, + "loss": 1.7048, + "step": 13450 + }, + { + "epoch": 2.8356243745720757, + "grad_norm": 0.8727468848228455, + "learning_rate": 0.00016286514015164557, + "loss": 1.6782, + "step": 13460 + }, + { + "epoch": 2.837731079159425, + "grad_norm": 0.9350339770317078, + "learning_rate": 0.0001628136477381354, + "loss": 1.6597, + "step": 13470 + }, + { + "epoch": 2.8398377837467743, + "grad_norm": 0.8482766151428223, + "learning_rate": 0.00016276212780146637, + "loss": 1.7017, + "step": 13480 + }, + { + "epoch": 2.841944488334123, + "grad_norm": 0.9531964659690857, + "learning_rate": 0.00016271058036421314, + "loss": 1.7145, + "step": 13490 + }, + { + "epoch": 2.8440511929214725, + "grad_norm": 0.8633520007133484, + "learning_rate": 0.00016265900544896225, + "loss": 1.6757, + "step": 13500 + }, + { + "epoch": 2.8461578975088218, + "grad_norm": 0.8866189122200012, + "learning_rate": 0.00016260740307831237, + "loss": 1.6983, + "step": 13510 + }, + { + "epoch": 2.848264602096171, + "grad_norm": 0.8696046471595764, + "learning_rate": 0.00016255577327487425, + "loss": 1.6678, + "step": 13520 + }, + { + "epoch": 2.8503713066835203, + "grad_norm": 0.9444338083267212, + "learning_rate": 0.00016250411606127054, + "loss": 1.7161, + "step": 13530 + }, + { + "epoch": 2.8524780112708696, + "grad_norm": 0.8829008936882019, + "learning_rate": 0.00016245243146013603, + "loss": 1.7357, + "step": 13540 + }, + { + "epoch": 2.854584715858219, + "grad_norm": 0.9542901515960693, + "learning_rate": 0.00016240071949411738, + "loss": 1.7435, + "step": 13550 + }, + { + "epoch": 2.856691420445568, + "grad_norm": 0.9169602990150452, + "learning_rate": 0.00016234898018587337, + "loss": 1.6806, + "step": 13560 + }, + { + "epoch": 2.8587981250329175, + "grad_norm": 0.9551498889923096, + "learning_rate": 0.00016229721355807465, + "loss": 1.6869, + "step": 13570 + }, + { + "epoch": 2.8609048296202664, + "grad_norm": 0.8909421563148499, + "learning_rate": 0.00016224541963340391, + "loss": 1.7237, + "step": 13580 + }, + { + "epoch": 2.8630115342076157, + "grad_norm": 1.0937942266464233, + "learning_rate": 0.00016219359843455577, + "loss": 1.6916, + "step": 13590 + }, + { + "epoch": 2.865118238794965, + "grad_norm": 0.9448012709617615, + "learning_rate": 0.0001621417499842368, + "loss": 1.6715, + "step": 13600 + }, + { + "epoch": 2.8672249433823143, + "grad_norm": 0.8611142039299011, + "learning_rate": 0.0001620898743051656, + "loss": 1.7111, + "step": 13610 + }, + { + "epoch": 2.8693316479696636, + "grad_norm": 0.8860224485397339, + "learning_rate": 0.0001620379714200725, + "loss": 1.7714, + "step": 13620 + }, + { + "epoch": 2.871438352557013, + "grad_norm": 0.9378777742385864, + "learning_rate": 0.0001619860413516999, + "loss": 1.6969, + "step": 13630 + }, + { + "epoch": 2.873545057144362, + "grad_norm": 0.8672656416893005, + "learning_rate": 0.00016193408412280217, + "loss": 1.6637, + "step": 13640 + }, + { + "epoch": 2.875651761731711, + "grad_norm": 0.9400450587272644, + "learning_rate": 0.00016188209975614542, + "loss": 1.6858, + "step": 13650 + }, + { + "epoch": 2.8777584663190603, + "grad_norm": 0.9799240231513977, + "learning_rate": 0.00016183008827450773, + "loss": 1.7139, + "step": 13660 + }, + { + "epoch": 2.8798651709064096, + "grad_norm": 0.8705218434333801, + "learning_rate": 0.00016177804970067907, + "loss": 1.72, + "step": 13670 + }, + { + "epoch": 2.881971875493759, + "grad_norm": 0.9402818083763123, + "learning_rate": 0.00016172598405746124, + "loss": 1.6982, + "step": 13680 + }, + { + "epoch": 2.884078580081108, + "grad_norm": 0.9194636344909668, + "learning_rate": 0.000161673891367668, + "loss": 1.7205, + "step": 13690 + }, + { + "epoch": 2.8861852846684575, + "grad_norm": 0.8392009139060974, + "learning_rate": 0.0001616217716541248, + "loss": 1.7389, + "step": 13700 + }, + { + "epoch": 2.8882919892558068, + "grad_norm": 0.9039947986602783, + "learning_rate": 0.00016156962493966908, + "loss": 1.7311, + "step": 13710 + }, + { + "epoch": 2.8903986938431556, + "grad_norm": 0.9231280088424683, + "learning_rate": 0.00016151745124715002, + "loss": 1.7134, + "step": 13720 + }, + { + "epoch": 2.8925053984305054, + "grad_norm": 0.9273865818977356, + "learning_rate": 0.00016146525059942865, + "loss": 1.6491, + "step": 13730 + }, + { + "epoch": 2.894612103017854, + "grad_norm": 0.8546093106269836, + "learning_rate": 0.00016141302301937786, + "loss": 1.6637, + "step": 13740 + }, + { + "epoch": 2.8967188076052035, + "grad_norm": 0.9186126589775085, + "learning_rate": 0.00016136076852988224, + "loss": 1.694, + "step": 13750 + }, + { + "epoch": 2.898825512192553, + "grad_norm": 0.919446587562561, + "learning_rate": 0.00016130848715383827, + "loss": 1.7034, + "step": 13760 + }, + { + "epoch": 2.900932216779902, + "grad_norm": 0.8708723187446594, + "learning_rate": 0.0001612561789141541, + "loss": 1.7133, + "step": 13770 + }, + { + "epoch": 2.9030389213672514, + "grad_norm": 0.8736213445663452, + "learning_rate": 0.0001612038438337498, + "loss": 1.7206, + "step": 13780 + }, + { + "epoch": 2.9051456259546002, + "grad_norm": 0.993733823299408, + "learning_rate": 0.00016115148193555706, + "loss": 1.7022, + "step": 13790 + }, + { + "epoch": 2.90725233054195, + "grad_norm": 0.8737487196922302, + "learning_rate": 0.0001610990932425194, + "loss": 1.7069, + "step": 13800 + }, + { + "epoch": 2.909359035129299, + "grad_norm": 0.91124427318573, + "learning_rate": 0.00016104667777759206, + "loss": 1.7307, + "step": 13810 + }, + { + "epoch": 2.911465739716648, + "grad_norm": 0.9024822115898132, + "learning_rate": 0.00016099423556374199, + "loss": 1.6838, + "step": 13820 + }, + { + "epoch": 2.9135724443039974, + "grad_norm": 0.9395371079444885, + "learning_rate": 0.00016094176662394792, + "loss": 1.7139, + "step": 13830 + }, + { + "epoch": 2.9156791488913467, + "grad_norm": 0.8873124122619629, + "learning_rate": 0.0001608892709812002, + "loss": 1.6932, + "step": 13840 + }, + { + "epoch": 2.917785853478696, + "grad_norm": 0.9905756115913391, + "learning_rate": 0.00016083674865850097, + "loss": 1.7101, + "step": 13850 + }, + { + "epoch": 2.9198925580660453, + "grad_norm": 0.8979195356369019, + "learning_rate": 0.00016078419967886402, + "loss": 1.6858, + "step": 13860 + }, + { + "epoch": 2.9219992626533946, + "grad_norm": 0.8918625712394714, + "learning_rate": 0.0001607316240653148, + "loss": 1.6297, + "step": 13870 + }, + { + "epoch": 2.9241059672407435, + "grad_norm": 0.9859835505485535, + "learning_rate": 0.00016067902184089048, + "loss": 1.6728, + "step": 13880 + }, + { + "epoch": 2.9262126718280927, + "grad_norm": 0.9373555779457092, + "learning_rate": 0.00016062639302863986, + "loss": 1.7194, + "step": 13890 + }, + { + "epoch": 2.928319376415442, + "grad_norm": 0.9152488112449646, + "learning_rate": 0.00016057373765162333, + "loss": 1.668, + "step": 13900 + }, + { + "epoch": 2.9304260810027913, + "grad_norm": 0.9642083644866943, + "learning_rate": 0.0001605210557329131, + "loss": 1.7473, + "step": 13910 + }, + { + "epoch": 2.9325327855901406, + "grad_norm": 0.9451881647109985, + "learning_rate": 0.0001604683472955928, + "loss": 1.6884, + "step": 13920 + }, + { + "epoch": 2.93463949017749, + "grad_norm": 0.9817609786987305, + "learning_rate": 0.00016041561236275777, + "loss": 1.7305, + "step": 13930 + }, + { + "epoch": 2.9367461947648392, + "grad_norm": 0.923049807548523, + "learning_rate": 0.00016036285095751503, + "loss": 1.7269, + "step": 13940 + }, + { + "epoch": 2.938852899352188, + "grad_norm": 0.9717063903808594, + "learning_rate": 0.00016031006310298306, + "loss": 1.7341, + "step": 13950 + }, + { + "epoch": 2.940959603939538, + "grad_norm": 0.901326060295105, + "learning_rate": 0.00016025724882229208, + "loss": 1.6908, + "step": 13960 + }, + { + "epoch": 2.9430663085268867, + "grad_norm": 0.8968400359153748, + "learning_rate": 0.0001602044081385837, + "loss": 1.6873, + "step": 13970 + }, + { + "epoch": 2.945173013114236, + "grad_norm": 0.8596915006637573, + "learning_rate": 0.00016015154107501133, + "loss": 1.7327, + "step": 13980 + }, + { + "epoch": 2.9472797177015853, + "grad_norm": 0.9334450960159302, + "learning_rate": 0.00016009864765473972, + "loss": 1.6833, + "step": 13990 + }, + { + "epoch": 2.9493864222889346, + "grad_norm": 0.8397333025932312, + "learning_rate": 0.00016004572790094535, + "loss": 1.7331, + "step": 14000 + }, + { + "epoch": 2.951493126876284, + "grad_norm": 0.8748630285263062, + "learning_rate": 0.00015999278183681604, + "loss": 1.7298, + "step": 14010 + }, + { + "epoch": 2.953599831463633, + "grad_norm": 0.9097732901573181, + "learning_rate": 0.0001599398094855514, + "loss": 1.7674, + "step": 14020 + }, + { + "epoch": 2.9557065360509824, + "grad_norm": 0.8727101683616638, + "learning_rate": 0.0001598868108703623, + "loss": 1.7557, + "step": 14030 + }, + { + "epoch": 2.9578132406383313, + "grad_norm": 0.8882416486740112, + "learning_rate": 0.00015983378601447127, + "loss": 1.6351, + "step": 14040 + }, + { + "epoch": 2.9599199452256806, + "grad_norm": 0.9257373213768005, + "learning_rate": 0.00015978073494111227, + "loss": 1.6756, + "step": 14050 + }, + { + "epoch": 2.96202664981303, + "grad_norm": 0.8525111675262451, + "learning_rate": 0.00015972765767353087, + "loss": 1.7364, + "step": 14060 + }, + { + "epoch": 2.964133354400379, + "grad_norm": 0.9729892611503601, + "learning_rate": 0.00015967455423498387, + "loss": 1.7332, + "step": 14070 + }, + { + "epoch": 2.9662400589877285, + "grad_norm": 0.9302247762680054, + "learning_rate": 0.00015962142464873985, + "loss": 1.7507, + "step": 14080 + }, + { + "epoch": 2.9683467635750778, + "grad_norm": 0.8983259201049805, + "learning_rate": 0.00015956826893807855, + "loss": 1.7273, + "step": 14090 + }, + { + "epoch": 2.970453468162427, + "grad_norm": 0.8971543908119202, + "learning_rate": 0.0001595150871262914, + "loss": 1.7365, + "step": 14100 + }, + { + "epoch": 2.972560172749776, + "grad_norm": 0.8822101354598999, + "learning_rate": 0.00015946187923668108, + "loss": 1.6777, + "step": 14110 + }, + { + "epoch": 2.9746668773371256, + "grad_norm": 0.9178860783576965, + "learning_rate": 0.00015940864529256186, + "loss": 1.6907, + "step": 14120 + }, + { + "epoch": 2.9767735819244745, + "grad_norm": 0.9445515275001526, + "learning_rate": 0.00015935538531725927, + "loss": 1.693, + "step": 14130 + }, + { + "epoch": 2.978880286511824, + "grad_norm": 0.8799481987953186, + "learning_rate": 0.00015930209933411036, + "loss": 1.6918, + "step": 14140 + }, + { + "epoch": 2.980986991099173, + "grad_norm": 0.8957604765892029, + "learning_rate": 0.00015924878736646352, + "loss": 1.7036, + "step": 14150 + }, + { + "epoch": 2.9830936956865224, + "grad_norm": 0.8179033398628235, + "learning_rate": 0.00015919544943767856, + "loss": 1.7331, + "step": 14160 + }, + { + "epoch": 2.9852004002738717, + "grad_norm": 1.4999958276748657, + "learning_rate": 0.00015914208557112663, + "loss": 1.6402, + "step": 14170 + }, + { + "epoch": 2.987307104861221, + "grad_norm": 0.9046642780303955, + "learning_rate": 0.00015908869579019027, + "loss": 1.7611, + "step": 14180 + }, + { + "epoch": 2.9894138094485703, + "grad_norm": 0.9739823341369629, + "learning_rate": 0.00015903528011826335, + "loss": 1.7253, + "step": 14190 + }, + { + "epoch": 2.991520514035919, + "grad_norm": 0.9123568534851074, + "learning_rate": 0.00015898183857875116, + "loss": 1.721, + "step": 14200 + }, + { + "epoch": 2.9936272186232684, + "grad_norm": 0.8985586166381836, + "learning_rate": 0.00015892837119507014, + "loss": 1.6525, + "step": 14210 + }, + { + "epoch": 2.9957339232106177, + "grad_norm": 0.9110245108604431, + "learning_rate": 0.00015887487799064838, + "loss": 1.6592, + "step": 14220 + }, + { + "epoch": 2.997840627797967, + "grad_norm": 0.9014399647712708, + "learning_rate": 0.00015882135898892492, + "loss": 1.6392, + "step": 14230 + }, + { + "epoch": 2.9999473323853163, + "grad_norm": 0.9020178914070129, + "learning_rate": 0.0001587678142133503, + "loss": 1.7085, + "step": 14240 + }, + { + "epoch": 3.0020540369726656, + "grad_norm": 1.0644667148590088, + "learning_rate": 0.0001587142436873864, + "loss": 1.6745, + "step": 14250 + }, + { + "epoch": 3.004160741560015, + "grad_norm": 0.9659833312034607, + "learning_rate": 0.00015866064743450618, + "loss": 1.6343, + "step": 14260 + }, + { + "epoch": 3.006267446147364, + "grad_norm": 0.8868164420127869, + "learning_rate": 0.00015860702547819413, + "loss": 1.6598, + "step": 14270 + }, + { + "epoch": 3.008374150734713, + "grad_norm": 0.9452905058860779, + "learning_rate": 0.00015855337784194577, + "loss": 1.7491, + "step": 14280 + }, + { + "epoch": 3.0104808553220623, + "grad_norm": 0.941245973110199, + "learning_rate": 0.00015849970454926804, + "loss": 1.664, + "step": 14290 + }, + { + "epoch": 3.0125875599094116, + "grad_norm": 0.9174633622169495, + "learning_rate": 0.000158446005623679, + "loss": 1.6228, + "step": 14300 + }, + { + "epoch": 3.014694264496761, + "grad_norm": 0.9531957507133484, + "learning_rate": 0.000158392281088708, + "loss": 1.6542, + "step": 14310 + }, + { + "epoch": 3.01680096908411, + "grad_norm": 0.9811040163040161, + "learning_rate": 0.00015833853096789566, + "loss": 1.6483, + "step": 14320 + }, + { + "epoch": 3.0189076736714595, + "grad_norm": 0.8868007659912109, + "learning_rate": 0.00015828475528479368, + "loss": 1.6262, + "step": 14330 + }, + { + "epoch": 3.021014378258809, + "grad_norm": 0.9833335280418396, + "learning_rate": 0.00015823095406296514, + "loss": 1.6906, + "step": 14340 + }, + { + "epoch": 3.023121082846158, + "grad_norm": 0.9299181699752808, + "learning_rate": 0.00015817712732598413, + "loss": 1.6371, + "step": 14350 + }, + { + "epoch": 3.025227787433507, + "grad_norm": 1.0234558582305908, + "learning_rate": 0.00015812327509743602, + "loss": 1.698, + "step": 14360 + }, + { + "epoch": 3.0273344920208562, + "grad_norm": 0.9303063750267029, + "learning_rate": 0.00015806939740091734, + "loss": 1.6533, + "step": 14370 + }, + { + "epoch": 3.0294411966082055, + "grad_norm": 0.8761981725692749, + "learning_rate": 0.00015801549426003577, + "loss": 1.6728, + "step": 14380 + }, + { + "epoch": 3.031547901195555, + "grad_norm": 0.8972175717353821, + "learning_rate": 0.00015796156569841013, + "loss": 1.6416, + "step": 14390 + }, + { + "epoch": 3.033654605782904, + "grad_norm": 1.033262848854065, + "learning_rate": 0.00015790761173967036, + "loss": 1.6115, + "step": 14400 + }, + { + "epoch": 3.0357613103702534, + "grad_norm": 0.9915304780006409, + "learning_rate": 0.00015785363240745757, + "loss": 1.685, + "step": 14410 + }, + { + "epoch": 3.0378680149576027, + "grad_norm": 0.9661949276924133, + "learning_rate": 0.00015779962772542402, + "loss": 1.6853, + "step": 14420 + }, + { + "epoch": 3.039974719544952, + "grad_norm": 1.1520849466323853, + "learning_rate": 0.00015774559771723298, + "loss": 1.6574, + "step": 14430 + }, + { + "epoch": 3.042081424132301, + "grad_norm": 0.8876193165779114, + "learning_rate": 0.00015769154240655885, + "loss": 1.6226, + "step": 14440 + }, + { + "epoch": 3.04418812871965, + "grad_norm": 1.0137007236480713, + "learning_rate": 0.0001576374618170872, + "loss": 1.6743, + "step": 14450 + }, + { + "epoch": 3.0462948333069995, + "grad_norm": 0.8431521058082581, + "learning_rate": 0.00015758335597251458, + "loss": 1.6573, + "step": 14460 + }, + { + "epoch": 3.0484015378943488, + "grad_norm": 1.0139840841293335, + "learning_rate": 0.00015752922489654857, + "loss": 1.6128, + "step": 14470 + }, + { + "epoch": 3.050508242481698, + "grad_norm": 1.04917311668396, + "learning_rate": 0.00015747506861290796, + "loss": 1.6697, + "step": 14480 + }, + { + "epoch": 3.0526149470690473, + "grad_norm": 0.9627788066864014, + "learning_rate": 0.00015742088714532247, + "loss": 1.6761, + "step": 14490 + }, + { + "epoch": 3.0547216516563966, + "grad_norm": 0.926226794719696, + "learning_rate": 0.0001573666805175329, + "loss": 1.6598, + "step": 14500 + }, + { + "epoch": 3.056828356243746, + "grad_norm": 0.99714595079422, + "learning_rate": 0.00015731244875329107, + "loss": 1.6662, + "step": 14510 + }, + { + "epoch": 3.058935060831095, + "grad_norm": 0.960364580154419, + "learning_rate": 0.00015725819187635968, + "loss": 1.6963, + "step": 14520 + }, + { + "epoch": 3.061041765418444, + "grad_norm": 0.8881476521492004, + "learning_rate": 0.00015720390991051268, + "loss": 1.6545, + "step": 14530 + }, + { + "epoch": 3.0631484700057934, + "grad_norm": 0.9406470656394958, + "learning_rate": 0.00015714960287953485, + "loss": 1.584, + "step": 14540 + }, + { + "epoch": 3.0652551745931427, + "grad_norm": 0.9198954105377197, + "learning_rate": 0.00015709527080722202, + "loss": 1.628, + "step": 14550 + }, + { + "epoch": 3.067361879180492, + "grad_norm": 0.9906532764434814, + "learning_rate": 0.0001570409137173809, + "loss": 1.6715, + "step": 14560 + }, + { + "epoch": 3.0694685837678413, + "grad_norm": 0.9512636661529541, + "learning_rate": 0.0001569865316338293, + "loss": 1.6115, + "step": 14570 + }, + { + "epoch": 3.0715752883551906, + "grad_norm": 0.9917583465576172, + "learning_rate": 0.00015693212458039584, + "loss": 1.7425, + "step": 14580 + }, + { + "epoch": 3.0736819929425394, + "grad_norm": 0.9491133093833923, + "learning_rate": 0.00015687769258092017, + "loss": 1.6637, + "step": 14590 + }, + { + "epoch": 3.0757886975298887, + "grad_norm": 0.9375881552696228, + "learning_rate": 0.00015682323565925285, + "loss": 1.6363, + "step": 14600 + }, + { + "epoch": 3.077895402117238, + "grad_norm": 1.0055121183395386, + "learning_rate": 0.00015676875383925534, + "loss": 1.6571, + "step": 14610 + }, + { + "epoch": 3.0800021067045873, + "grad_norm": 1.0039416551589966, + "learning_rate": 0.0001567142471448001, + "loss": 1.6895, + "step": 14620 + }, + { + "epoch": 3.0821088112919366, + "grad_norm": 0.9418062567710876, + "learning_rate": 0.00015665971559977035, + "loss": 1.6936, + "step": 14630 + }, + { + "epoch": 3.084215515879286, + "grad_norm": 0.9344249367713928, + "learning_rate": 0.00015660515922806027, + "loss": 1.6527, + "step": 14640 + }, + { + "epoch": 3.086322220466635, + "grad_norm": 0.9298831224441528, + "learning_rate": 0.00015655057805357493, + "loss": 1.6325, + "step": 14650 + }, + { + "epoch": 3.0884289250539845, + "grad_norm": 0.8685382604598999, + "learning_rate": 0.00015649597210023027, + "loss": 1.7013, + "step": 14660 + }, + { + "epoch": 3.0905356296413338, + "grad_norm": 0.920759916305542, + "learning_rate": 0.000156441341391953, + "loss": 1.6177, + "step": 14670 + }, + { + "epoch": 3.0926423342286826, + "grad_norm": 0.9808409810066223, + "learning_rate": 0.00015638668595268084, + "loss": 1.6857, + "step": 14680 + }, + { + "epoch": 3.094749038816032, + "grad_norm": 1.0194792747497559, + "learning_rate": 0.0001563320058063622, + "loss": 1.6346, + "step": 14690 + }, + { + "epoch": 3.096855743403381, + "grad_norm": 0.9400737881660461, + "learning_rate": 0.00015627730097695638, + "loss": 1.6783, + "step": 14700 + }, + { + "epoch": 3.0989624479907305, + "grad_norm": 0.9842483997344971, + "learning_rate": 0.00015622257148843348, + "loss": 1.6614, + "step": 14710 + }, + { + "epoch": 3.10106915257808, + "grad_norm": 1.000064492225647, + "learning_rate": 0.00015616781736477444, + "loss": 1.66, + "step": 14720 + }, + { + "epoch": 3.103175857165429, + "grad_norm": 0.9329800605773926, + "learning_rate": 0.0001561130386299709, + "loss": 1.5831, + "step": 14730 + }, + { + "epoch": 3.1052825617527784, + "grad_norm": 1.0243964195251465, + "learning_rate": 0.00015605823530802541, + "loss": 1.6751, + "step": 14740 + }, + { + "epoch": 3.1073892663401272, + "grad_norm": 0.9882740378379822, + "learning_rate": 0.0001560034074229512, + "loss": 1.6543, + "step": 14750 + }, + { + "epoch": 3.1094959709274765, + "grad_norm": 0.9630525708198547, + "learning_rate": 0.0001559485549987723, + "loss": 1.6822, + "step": 14760 + }, + { + "epoch": 3.111602675514826, + "grad_norm": 0.9518427848815918, + "learning_rate": 0.00015589367805952348, + "loss": 1.6336, + "step": 14770 + }, + { + "epoch": 3.113709380102175, + "grad_norm": 0.9057418704032898, + "learning_rate": 0.0001558387766292503, + "loss": 1.6206, + "step": 14780 + }, + { + "epoch": 3.1158160846895244, + "grad_norm": 0.9594529271125793, + "learning_rate": 0.00015578385073200895, + "loss": 1.6772, + "step": 14790 + }, + { + "epoch": 3.1179227892768737, + "grad_norm": 0.8876820802688599, + "learning_rate": 0.0001557289003918664, + "loss": 1.5913, + "step": 14800 + }, + { + "epoch": 3.120029493864223, + "grad_norm": 0.977289617061615, + "learning_rate": 0.00015567392563290038, + "loss": 1.6617, + "step": 14810 + }, + { + "epoch": 3.1221361984515723, + "grad_norm": 0.8738213777542114, + "learning_rate": 0.0001556189264791992, + "loss": 1.7153, + "step": 14820 + }, + { + "epoch": 3.124242903038921, + "grad_norm": 1.0732994079589844, + "learning_rate": 0.0001555639029548621, + "loss": 1.664, + "step": 14830 + }, + { + "epoch": 3.1263496076262705, + "grad_norm": 0.9305723309516907, + "learning_rate": 0.00015550885508399856, + "loss": 1.6656, + "step": 14840 + }, + { + "epoch": 3.1284563122136197, + "grad_norm": 1.1515586376190186, + "learning_rate": 0.00015545378289072922, + "loss": 1.71, + "step": 14850 + }, + { + "epoch": 3.130563016800969, + "grad_norm": 0.9483801126480103, + "learning_rate": 0.00015539868639918505, + "loss": 1.6762, + "step": 14860 + }, + { + "epoch": 3.1326697213883183, + "grad_norm": 1.0394940376281738, + "learning_rate": 0.00015534356563350784, + "loss": 1.6828, + "step": 14870 + }, + { + "epoch": 3.1347764259756676, + "grad_norm": 0.9815645217895508, + "learning_rate": 0.0001552884206178498, + "loss": 1.6447, + "step": 14880 + }, + { + "epoch": 3.136883130563017, + "grad_norm": 0.9685027599334717, + "learning_rate": 0.00015523325137637407, + "loss": 1.6087, + "step": 14890 + }, + { + "epoch": 3.1389898351503662, + "grad_norm": 0.9185184240341187, + "learning_rate": 0.0001551780579332542, + "loss": 1.5929, + "step": 14900 + }, + { + "epoch": 3.141096539737715, + "grad_norm": 0.9931472539901733, + "learning_rate": 0.00015512284031267437, + "loss": 1.6585, + "step": 14910 + }, + { + "epoch": 3.1432032443250644, + "grad_norm": 0.9517951607704163, + "learning_rate": 0.00015506759853882934, + "loss": 1.6431, + "step": 14920 + }, + { + "epoch": 3.1453099489124137, + "grad_norm": 0.8860810995101929, + "learning_rate": 0.00015501233263592455, + "loss": 1.697, + "step": 14930 + }, + { + "epoch": 3.147416653499763, + "grad_norm": 0.961342990398407, + "learning_rate": 0.00015495704262817597, + "loss": 1.6574, + "step": 14940 + }, + { + "epoch": 3.1495233580871123, + "grad_norm": 0.9692322611808777, + "learning_rate": 0.00015490172853981004, + "loss": 1.6623, + "step": 14950 + }, + { + "epoch": 3.1516300626744616, + "grad_norm": 0.9061542749404907, + "learning_rate": 0.00015484639039506387, + "loss": 1.6955, + "step": 14960 + }, + { + "epoch": 3.153736767261811, + "grad_norm": 0.8963515162467957, + "learning_rate": 0.00015479102821818507, + "loss": 1.6676, + "step": 14970 + }, + { + "epoch": 3.15584347184916, + "grad_norm": 0.9289860725402832, + "learning_rate": 0.00015473564203343174, + "loss": 1.6048, + "step": 14980 + }, + { + "epoch": 3.157950176436509, + "grad_norm": 1.0143063068389893, + "learning_rate": 0.00015468023186507256, + "loss": 1.717, + "step": 14990 + }, + { + "epoch": 3.1600568810238583, + "grad_norm": 1.0174888372421265, + "learning_rate": 0.0001546247977373867, + "loss": 1.6563, + "step": 15000 + }, + { + "epoch": 3.1621635856112076, + "grad_norm": 1.0211094617843628, + "learning_rate": 0.0001545693396746638, + "loss": 1.6338, + "step": 15010 + }, + { + "epoch": 3.164270290198557, + "grad_norm": 0.966232419013977, + "learning_rate": 0.000154513857701204, + "loss": 1.6749, + "step": 15020 + }, + { + "epoch": 3.166376994785906, + "grad_norm": 0.9910480976104736, + "learning_rate": 0.000154458351841318, + "loss": 1.6248, + "step": 15030 + }, + { + "epoch": 3.1684836993732555, + "grad_norm": 1.0164378881454468, + "learning_rate": 0.00015440282211932682, + "loss": 1.6117, + "step": 15040 + }, + { + "epoch": 3.1705904039606048, + "grad_norm": 0.9360641241073608, + "learning_rate": 0.00015434726855956206, + "loss": 1.6809, + "step": 15050 + }, + { + "epoch": 3.172697108547954, + "grad_norm": 1.1147947311401367, + "learning_rate": 0.00015429169118636566, + "loss": 1.6561, + "step": 15060 + }, + { + "epoch": 3.174803813135303, + "grad_norm": 0.9635496735572815, + "learning_rate": 0.00015423609002409008, + "loss": 1.6598, + "step": 15070 + }, + { + "epoch": 3.176910517722652, + "grad_norm": 0.9060547351837158, + "learning_rate": 0.00015418046509709817, + "loss": 1.6666, + "step": 15080 + }, + { + "epoch": 3.1790172223100015, + "grad_norm": 1.014722228050232, + "learning_rate": 0.00015412481642976318, + "loss": 1.7179, + "step": 15090 + }, + { + "epoch": 3.181123926897351, + "grad_norm": 1.006274700164795, + "learning_rate": 0.00015406914404646882, + "loss": 1.7212, + "step": 15100 + }, + { + "epoch": 3.1832306314847, + "grad_norm": 0.9190952777862549, + "learning_rate": 0.00015401344797160907, + "loss": 1.6335, + "step": 15110 + }, + { + "epoch": 3.1853373360720494, + "grad_norm": 0.9608885049819946, + "learning_rate": 0.00015395772822958845, + "loss": 1.6894, + "step": 15120 + }, + { + "epoch": 3.1874440406593987, + "grad_norm": 1.0324962139129639, + "learning_rate": 0.00015390198484482176, + "loss": 1.656, + "step": 15130 + }, + { + "epoch": 3.189550745246748, + "grad_norm": 0.9675622582435608, + "learning_rate": 0.00015384621784173414, + "loss": 1.7216, + "step": 15140 + }, + { + "epoch": 3.191657449834097, + "grad_norm": 0.9279264211654663, + "learning_rate": 0.0001537904272447611, + "loss": 1.6508, + "step": 15150 + }, + { + "epoch": 3.193764154421446, + "grad_norm": 0.9079473614692688, + "learning_rate": 0.00015373461307834852, + "loss": 1.6871, + "step": 15160 + }, + { + "epoch": 3.1958708590087954, + "grad_norm": 1.0070098638534546, + "learning_rate": 0.0001536787753669526, + "loss": 1.7162, + "step": 15170 + }, + { + "epoch": 3.1979775635961447, + "grad_norm": 0.9919424057006836, + "learning_rate": 0.00015362291413503984, + "loss": 1.6805, + "step": 15180 + }, + { + "epoch": 3.200084268183494, + "grad_norm": 1.0416815280914307, + "learning_rate": 0.000153567029407087, + "loss": 1.7119, + "step": 15190 + }, + { + "epoch": 3.2021909727708433, + "grad_norm": 1.0771572589874268, + "learning_rate": 0.00015351112120758122, + "loss": 1.5851, + "step": 15200 + }, + { + "epoch": 3.2042976773581926, + "grad_norm": 1.0703963041305542, + "learning_rate": 0.0001534551895610199, + "loss": 1.6739, + "step": 15210 + }, + { + "epoch": 3.206404381945542, + "grad_norm": 0.9871819019317627, + "learning_rate": 0.00015339923449191067, + "loss": 1.6918, + "step": 15220 + }, + { + "epoch": 3.2085110865328907, + "grad_norm": 1.037592887878418, + "learning_rate": 0.00015334325602477146, + "loss": 1.6564, + "step": 15230 + }, + { + "epoch": 3.21061779112024, + "grad_norm": 0.9865399599075317, + "learning_rate": 0.00015328725418413045, + "loss": 1.6338, + "step": 15240 + }, + { + "epoch": 3.2127244957075893, + "grad_norm": 0.9120628237724304, + "learning_rate": 0.00015323122899452607, + "loss": 1.6434, + "step": 15250 + }, + { + "epoch": 3.2148312002949386, + "grad_norm": 0.9064800143241882, + "learning_rate": 0.00015317518048050697, + "loss": 1.6631, + "step": 15260 + }, + { + "epoch": 3.216937904882288, + "grad_norm": 0.9162564873695374, + "learning_rate": 0.00015311910866663196, + "loss": 1.6954, + "step": 15270 + }, + { + "epoch": 3.219044609469637, + "grad_norm": 0.9941088557243347, + "learning_rate": 0.00015306301357747022, + "loss": 1.7128, + "step": 15280 + }, + { + "epoch": 3.2211513140569865, + "grad_norm": 0.984843909740448, + "learning_rate": 0.00015300689523760097, + "loss": 1.6372, + "step": 15290 + }, + { + "epoch": 3.2232580186443354, + "grad_norm": 1.007107138633728, + "learning_rate": 0.00015295075367161367, + "loss": 1.6644, + "step": 15300 + }, + { + "epoch": 3.2253647232316847, + "grad_norm": 1.0158659219741821, + "learning_rate": 0.00015289458890410798, + "loss": 1.6354, + "step": 15310 + }, + { + "epoch": 3.227471427819034, + "grad_norm": 0.9480316042900085, + "learning_rate": 0.00015283840095969367, + "loss": 1.6573, + "step": 15320 + }, + { + "epoch": 3.2295781324063833, + "grad_norm": 0.9743335247039795, + "learning_rate": 0.00015278218986299074, + "loss": 1.7287, + "step": 15330 + }, + { + "epoch": 3.2316848369937325, + "grad_norm": 0.9238772392272949, + "learning_rate": 0.00015272595563862933, + "loss": 1.662, + "step": 15340 + }, + { + "epoch": 3.233791541581082, + "grad_norm": 0.9363320469856262, + "learning_rate": 0.00015266969831124962, + "loss": 1.6218, + "step": 15350 + }, + { + "epoch": 3.235898246168431, + "grad_norm": 0.9849210381507874, + "learning_rate": 0.00015261341790550196, + "loss": 1.6596, + "step": 15360 + }, + { + "epoch": 3.2380049507557804, + "grad_norm": 0.9171252250671387, + "learning_rate": 0.00015255711444604693, + "loss": 1.6972, + "step": 15370 + }, + { + "epoch": 3.2401116553431297, + "grad_norm": 1.0775831937789917, + "learning_rate": 0.000152500787957555, + "loss": 1.6343, + "step": 15380 + }, + { + "epoch": 3.2422183599304786, + "grad_norm": 0.9412891864776611, + "learning_rate": 0.0001524444384647069, + "loss": 1.6965, + "step": 15390 + }, + { + "epoch": 3.244325064517828, + "grad_norm": 1.0591611862182617, + "learning_rate": 0.00015238806599219336, + "loss": 1.6702, + "step": 15400 + }, + { + "epoch": 3.246431769105177, + "grad_norm": 0.8861743211746216, + "learning_rate": 0.00015233167056471523, + "loss": 1.708, + "step": 15410 + }, + { + "epoch": 3.2485384736925265, + "grad_norm": 1.0437482595443726, + "learning_rate": 0.0001522752522069833, + "loss": 1.6861, + "step": 15420 + }, + { + "epoch": 3.2506451782798758, + "grad_norm": 0.9577218294143677, + "learning_rate": 0.00015221881094371861, + "loss": 1.6652, + "step": 15430 + }, + { + "epoch": 3.252751882867225, + "grad_norm": 1.0100152492523193, + "learning_rate": 0.00015216234679965205, + "loss": 1.7185, + "step": 15440 + }, + { + "epoch": 3.2548585874545743, + "grad_norm": 0.9381777048110962, + "learning_rate": 0.0001521058597995246, + "loss": 1.6786, + "step": 15450 + }, + { + "epoch": 3.256965292041923, + "grad_norm": 0.9847040772438049, + "learning_rate": 0.00015204934996808727, + "loss": 1.6359, + "step": 15460 + }, + { + "epoch": 3.2590719966292725, + "grad_norm": 1.0387279987335205, + "learning_rate": 0.00015199281733010116, + "loss": 1.639, + "step": 15470 + }, + { + "epoch": 3.261178701216622, + "grad_norm": 0.9832319617271423, + "learning_rate": 0.00015193626191033712, + "loss": 1.6587, + "step": 15480 + }, + { + "epoch": 3.263285405803971, + "grad_norm": 0.9799401760101318, + "learning_rate": 0.00015187968373357619, + "loss": 1.6814, + "step": 15490 + }, + { + "epoch": 3.2653921103913204, + "grad_norm": 1.0971506834030151, + "learning_rate": 0.00015182308282460935, + "loss": 1.6852, + "step": 15500 + }, + { + "epoch": 3.2674988149786697, + "grad_norm": 0.9791223406791687, + "learning_rate": 0.0001517664592082375, + "loss": 1.6598, + "step": 15510 + }, + { + "epoch": 3.269605519566019, + "grad_norm": 1.0151629447937012, + "learning_rate": 0.00015170981290927145, + "loss": 1.6781, + "step": 15520 + }, + { + "epoch": 3.2717122241533683, + "grad_norm": 0.9513714909553528, + "learning_rate": 0.00015165314395253212, + "loss": 1.6378, + "step": 15530 + }, + { + "epoch": 3.2738189287407176, + "grad_norm": 0.9968097805976868, + "learning_rate": 0.0001515964523628501, + "loss": 1.6473, + "step": 15540 + }, + { + "epoch": 3.2759256333280664, + "grad_norm": 1.050000548362732, + "learning_rate": 0.00015153973816506614, + "loss": 1.654, + "step": 15550 + }, + { + "epoch": 3.2780323379154157, + "grad_norm": 1.0254466533660889, + "learning_rate": 0.00015148300138403075, + "loss": 1.6328, + "step": 15560 + }, + { + "epoch": 3.280139042502765, + "grad_norm": 0.9959595203399658, + "learning_rate": 0.00015142624204460435, + "loss": 1.6786, + "step": 15570 + }, + { + "epoch": 3.2822457470901143, + "grad_norm": 1.058243989944458, + "learning_rate": 0.00015136946017165734, + "loss": 1.6118, + "step": 15580 + }, + { + "epoch": 3.2843524516774636, + "grad_norm": 1.0479722023010254, + "learning_rate": 0.0001513126557900699, + "loss": 1.6355, + "step": 15590 + }, + { + "epoch": 3.286459156264813, + "grad_norm": 0.9290971159934998, + "learning_rate": 0.00015125582892473204, + "loss": 1.6835, + "step": 15600 + }, + { + "epoch": 3.288565860852162, + "grad_norm": 0.9415523409843445, + "learning_rate": 0.0001511989796005438, + "loss": 1.6497, + "step": 15610 + }, + { + "epoch": 3.290672565439511, + "grad_norm": 0.9990530610084534, + "learning_rate": 0.00015114210784241482, + "loss": 1.6725, + "step": 15620 + }, + { + "epoch": 3.2927792700268603, + "grad_norm": 0.8826045393943787, + "learning_rate": 0.00015108521367526479, + "loss": 1.6639, + "step": 15630 + }, + { + "epoch": 3.2948859746142096, + "grad_norm": 0.9666330218315125, + "learning_rate": 0.00015102829712402302, + "loss": 1.7046, + "step": 15640 + }, + { + "epoch": 3.296992679201559, + "grad_norm": 0.9691329598426819, + "learning_rate": 0.00015097135821362883, + "loss": 1.714, + "step": 15650 + }, + { + "epoch": 3.299099383788908, + "grad_norm": 0.9856581687927246, + "learning_rate": 0.00015091439696903115, + "loss": 1.653, + "step": 15660 + }, + { + "epoch": 3.3012060883762575, + "grad_norm": 1.028440237045288, + "learning_rate": 0.00015085741341518886, + "loss": 1.6483, + "step": 15670 + }, + { + "epoch": 3.303312792963607, + "grad_norm": 1.0359004735946655, + "learning_rate": 0.00015080040757707046, + "loss": 1.7243, + "step": 15680 + }, + { + "epoch": 3.305419497550956, + "grad_norm": 1.0426095724105835, + "learning_rate": 0.00015074337947965435, + "loss": 1.6496, + "step": 15690 + }, + { + "epoch": 3.3075262021383054, + "grad_norm": 0.9464336633682251, + "learning_rate": 0.0001506863291479286, + "loss": 1.7339, + "step": 15700 + }, + { + "epoch": 3.3096329067256542, + "grad_norm": 0.9904953241348267, + "learning_rate": 0.00015062925660689106, + "loss": 1.6224, + "step": 15710 + }, + { + "epoch": 3.3117396113130035, + "grad_norm": 0.9366286396980286, + "learning_rate": 0.00015057216188154928, + "loss": 1.6543, + "step": 15720 + }, + { + "epoch": 3.313846315900353, + "grad_norm": 0.9428682923316956, + "learning_rate": 0.00015051504499692054, + "loss": 1.6566, + "step": 15730 + }, + { + "epoch": 3.315953020487702, + "grad_norm": 0.914440393447876, + "learning_rate": 0.0001504579059780319, + "loss": 1.6787, + "step": 15740 + }, + { + "epoch": 3.3180597250750514, + "grad_norm": 1.1147249937057495, + "learning_rate": 0.00015040074484992, + "loss": 1.6386, + "step": 15750 + }, + { + "epoch": 3.3201664296624007, + "grad_norm": 0.9929258227348328, + "learning_rate": 0.00015034356163763123, + "loss": 1.6538, + "step": 15760 + }, + { + "epoch": 3.32227313424975, + "grad_norm": 0.9763256907463074, + "learning_rate": 0.00015028635636622165, + "loss": 1.6431, + "step": 15770 + }, + { + "epoch": 3.324379838837099, + "grad_norm": 0.9423137903213501, + "learning_rate": 0.00015022912906075702, + "loss": 1.6739, + "step": 15780 + }, + { + "epoch": 3.326486543424448, + "grad_norm": 1.0271004438400269, + "learning_rate": 0.00015017187974631271, + "loss": 1.7341, + "step": 15790 + }, + { + "epoch": 3.3285932480117975, + "grad_norm": 1.0378199815750122, + "learning_rate": 0.00015011460844797372, + "loss": 1.7028, + "step": 15800 + }, + { + "epoch": 3.3306999525991468, + "grad_norm": 0.9106980562210083, + "learning_rate": 0.0001500573151908347, + "loss": 1.6631, + "step": 15810 + }, + { + "epoch": 3.332806657186496, + "grad_norm": 0.9558465480804443, + "learning_rate": 0.00015000000000000001, + "loss": 1.7068, + "step": 15820 + }, + { + "epoch": 3.3349133617738453, + "grad_norm": 0.9833946824073792, + "learning_rate": 0.0001499426629005835, + "loss": 1.6597, + "step": 15830 + }, + { + "epoch": 3.3370200663611946, + "grad_norm": 0.9429462552070618, + "learning_rate": 0.00014988530391770856, + "loss": 1.6672, + "step": 15840 + }, + { + "epoch": 3.339126770948544, + "grad_norm": 1.01088285446167, + "learning_rate": 0.0001498279230765084, + "loss": 1.6842, + "step": 15850 + }, + { + "epoch": 3.3412334755358932, + "grad_norm": 0.9075558185577393, + "learning_rate": 0.00014977052040212566, + "loss": 1.6968, + "step": 15860 + }, + { + "epoch": 3.343340180123242, + "grad_norm": 1.0675263404846191, + "learning_rate": 0.00014971309591971252, + "loss": 1.6572, + "step": 15870 + }, + { + "epoch": 3.3454468847105914, + "grad_norm": 0.978440523147583, + "learning_rate": 0.00014965564965443079, + "loss": 1.7329, + "step": 15880 + }, + { + "epoch": 3.3475535892979407, + "grad_norm": 0.9912124276161194, + "learning_rate": 0.00014959818163145174, + "loss": 1.6253, + "step": 15890 + }, + { + "epoch": 3.34966029388529, + "grad_norm": 0.9947911500930786, + "learning_rate": 0.00014954069187595633, + "loss": 1.6586, + "step": 15900 + }, + { + "epoch": 3.3517669984726393, + "grad_norm": 0.9598615765571594, + "learning_rate": 0.00014948318041313482, + "loss": 1.6465, + "step": 15910 + }, + { + "epoch": 3.3538737030599886, + "grad_norm": 0.9510218501091003, + "learning_rate": 0.0001494256472681872, + "loss": 1.6517, + "step": 15920 + }, + { + "epoch": 3.355980407647338, + "grad_norm": 1.0019654035568237, + "learning_rate": 0.0001493680924663228, + "loss": 1.6578, + "step": 15930 + }, + { + "epoch": 3.3580871122346867, + "grad_norm": 0.9807612895965576, + "learning_rate": 0.00014931051603276054, + "loss": 1.6576, + "step": 15940 + }, + { + "epoch": 3.360193816822036, + "grad_norm": 0.9929197430610657, + "learning_rate": 0.00014925291799272876, + "loss": 1.6551, + "step": 15950 + }, + { + "epoch": 3.3623005214093853, + "grad_norm": 0.9978349208831787, + "learning_rate": 0.00014919529837146528, + "loss": 1.6679, + "step": 15960 + }, + { + "epoch": 3.3644072259967346, + "grad_norm": 0.8876854777336121, + "learning_rate": 0.00014913765719421743, + "loss": 1.7459, + "step": 15970 + }, + { + "epoch": 3.366513930584084, + "grad_norm": 1.0585439205169678, + "learning_rate": 0.00014907999448624188, + "loss": 1.6814, + "step": 15980 + }, + { + "epoch": 3.368620635171433, + "grad_norm": 1.0199978351593018, + "learning_rate": 0.00014902231027280486, + "loss": 1.6441, + "step": 15990 + }, + { + "epoch": 3.3707273397587825, + "grad_norm": 0.9279028177261353, + "learning_rate": 0.00014896460457918185, + "loss": 1.6734, + "step": 16000 + }, + { + "epoch": 3.3728340443461313, + "grad_norm": 0.9600446820259094, + "learning_rate": 0.00014890687743065794, + "loss": 1.7033, + "step": 16010 + }, + { + "epoch": 3.3749407489334806, + "grad_norm": 0.9573974609375, + "learning_rate": 0.0001488491288525275, + "loss": 1.706, + "step": 16020 + }, + { + "epoch": 3.37704745352083, + "grad_norm": 0.9946306347846985, + "learning_rate": 0.00014879135887009435, + "loss": 1.6575, + "step": 16030 + }, + { + "epoch": 3.379154158108179, + "grad_norm": 0.9504879117012024, + "learning_rate": 0.0001487335675086716, + "loss": 1.6634, + "step": 16040 + }, + { + "epoch": 3.3812608626955285, + "grad_norm": 1.0545927286148071, + "learning_rate": 0.0001486757547935818, + "loss": 1.7045, + "step": 16050 + }, + { + "epoch": 3.383367567282878, + "grad_norm": 1.1232430934906006, + "learning_rate": 0.00014861792075015687, + "loss": 1.6703, + "step": 16060 + }, + { + "epoch": 3.385474271870227, + "grad_norm": 0.9520963430404663, + "learning_rate": 0.00014856006540373806, + "loss": 1.7003, + "step": 16070 + }, + { + "epoch": 3.3875809764575764, + "grad_norm": 1.0433330535888672, + "learning_rate": 0.0001485021887796759, + "loss": 1.6963, + "step": 16080 + }, + { + "epoch": 3.3896876810449257, + "grad_norm": 1.0261874198913574, + "learning_rate": 0.0001484442909033303, + "loss": 1.6862, + "step": 16090 + }, + { + "epoch": 3.3917943856322745, + "grad_norm": 0.8804453015327454, + "learning_rate": 0.00014838637180007047, + "loss": 1.6284, + "step": 16100 + }, + { + "epoch": 3.393901090219624, + "grad_norm": 1.066070795059204, + "learning_rate": 0.0001483284314952749, + "loss": 1.6803, + "step": 16110 + }, + { + "epoch": 3.396007794806973, + "grad_norm": 0.9975647330284119, + "learning_rate": 0.0001482704700143314, + "loss": 1.7284, + "step": 16120 + }, + { + "epoch": 3.3981144993943224, + "grad_norm": 1.016730546951294, + "learning_rate": 0.0001482124873826371, + "loss": 1.6747, + "step": 16130 + }, + { + "epoch": 3.4002212039816717, + "grad_norm": 1.0822088718414307, + "learning_rate": 0.00014815448362559826, + "loss": 1.6058, + "step": 16140 + }, + { + "epoch": 3.402327908569021, + "grad_norm": 0.9539144039154053, + "learning_rate": 0.00014809645876863052, + "loss": 1.6734, + "step": 16150 + }, + { + "epoch": 3.4044346131563703, + "grad_norm": 0.995520830154419, + "learning_rate": 0.0001480384128371587, + "loss": 1.6552, + "step": 16160 + }, + { + "epoch": 3.406541317743719, + "grad_norm": 1.0045973062515259, + "learning_rate": 0.00014798034585661695, + "loss": 1.7047, + "step": 16170 + }, + { + "epoch": 3.4086480223310685, + "grad_norm": 0.9666810035705566, + "learning_rate": 0.0001479222578524485, + "loss": 1.6763, + "step": 16180 + }, + { + "epoch": 3.4107547269184177, + "grad_norm": 0.9643430113792419, + "learning_rate": 0.0001478641488501059, + "loss": 1.6404, + "step": 16190 + }, + { + "epoch": 3.412861431505767, + "grad_norm": 1.0544071197509766, + "learning_rate": 0.00014780601887505088, + "loss": 1.6564, + "step": 16200 + }, + { + "epoch": 3.4149681360931163, + "grad_norm": 0.9909730553627014, + "learning_rate": 0.00014774786795275429, + "loss": 1.6865, + "step": 16210 + }, + { + "epoch": 3.4170748406804656, + "grad_norm": 0.9278872609138489, + "learning_rate": 0.00014768969610869627, + "loss": 1.6599, + "step": 16220 + }, + { + "epoch": 3.419181545267815, + "grad_norm": 0.9268772602081299, + "learning_rate": 0.00014763150336836604, + "loss": 1.6472, + "step": 16230 + }, + { + "epoch": 3.421288249855164, + "grad_norm": 0.9475058317184448, + "learning_rate": 0.00014757328975726207, + "loss": 1.667, + "step": 16240 + }, + { + "epoch": 3.4233949544425135, + "grad_norm": 1.048485279083252, + "learning_rate": 0.0001475150553008918, + "loss": 1.7311, + "step": 16250 + }, + { + "epoch": 3.4255016590298624, + "grad_norm": 0.9860157370567322, + "learning_rate": 0.00014745680002477203, + "loss": 1.6184, + "step": 16260 + }, + { + "epoch": 3.4276083636172117, + "grad_norm": 0.9206960201263428, + "learning_rate": 0.00014739852395442854, + "loss": 1.6858, + "step": 16270 + }, + { + "epoch": 3.429715068204561, + "grad_norm": 0.9898627400398254, + "learning_rate": 0.0001473402271153962, + "loss": 1.6377, + "step": 16280 + }, + { + "epoch": 3.4318217727919103, + "grad_norm": 1.0353739261627197, + "learning_rate": 0.00014728190953321903, + "loss": 1.6448, + "step": 16290 + }, + { + "epoch": 3.4339284773792595, + "grad_norm": 0.9169580340385437, + "learning_rate": 0.00014722357123345023, + "loss": 1.657, + "step": 16300 + }, + { + "epoch": 3.436035181966609, + "grad_norm": 1.0277870893478394, + "learning_rate": 0.00014716521224165192, + "loss": 1.6797, + "step": 16310 + }, + { + "epoch": 3.438141886553958, + "grad_norm": 0.9552232623100281, + "learning_rate": 0.00014710683258339536, + "loss": 1.6266, + "step": 16320 + }, + { + "epoch": 3.440248591141307, + "grad_norm": 0.9857643246650696, + "learning_rate": 0.00014704843228426087, + "loss": 1.6921, + "step": 16330 + }, + { + "epoch": 3.4423552957286563, + "grad_norm": 1.0019571781158447, + "learning_rate": 0.00014699001136983782, + "loss": 1.6315, + "step": 16340 + }, + { + "epoch": 3.4444620003160056, + "grad_norm": 1.0217735767364502, + "learning_rate": 0.00014693156986572456, + "loss": 1.6518, + "step": 16350 + }, + { + "epoch": 3.446568704903355, + "grad_norm": 0.9069836735725403, + "learning_rate": 0.00014687310779752855, + "loss": 1.5829, + "step": 16360 + }, + { + "epoch": 3.448675409490704, + "grad_norm": 1.0425034761428833, + "learning_rate": 0.00014681462519086614, + "loss": 1.7103, + "step": 16370 + }, + { + "epoch": 3.4507821140780535, + "grad_norm": 0.999427318572998, + "learning_rate": 0.0001467561220713628, + "loss": 1.65, + "step": 16380 + }, + { + "epoch": 3.4528888186654028, + "grad_norm": 0.9969028234481812, + "learning_rate": 0.00014669759846465296, + "loss": 1.6209, + "step": 16390 + }, + { + "epoch": 3.454995523252752, + "grad_norm": 1.015284538269043, + "learning_rate": 0.00014663905439637995, + "loss": 1.689, + "step": 16400 + }, + { + "epoch": 3.4571022278401013, + "grad_norm": 0.9351761937141418, + "learning_rate": 0.00014658048989219614, + "loss": 1.6442, + "step": 16410 + }, + { + "epoch": 3.45920893242745, + "grad_norm": 1.002864122390747, + "learning_rate": 0.00014652190497776286, + "loss": 1.635, + "step": 16420 + }, + { + "epoch": 3.4613156370147995, + "grad_norm": 1.0125610828399658, + "learning_rate": 0.00014646329967875032, + "loss": 1.6677, + "step": 16430 + }, + { + "epoch": 3.463422341602149, + "grad_norm": 1.0910165309906006, + "learning_rate": 0.0001464046740208377, + "loss": 1.7004, + "step": 16440 + }, + { + "epoch": 3.465529046189498, + "grad_norm": 0.9675852060317993, + "learning_rate": 0.00014634602802971312, + "loss": 1.667, + "step": 16450 + }, + { + "epoch": 3.4676357507768474, + "grad_norm": 1.0512768030166626, + "learning_rate": 0.00014628736173107363, + "loss": 1.675, + "step": 16460 + }, + { + "epoch": 3.4697424553641967, + "grad_norm": 1.171718716621399, + "learning_rate": 0.00014622867515062503, + "loss": 1.6984, + "step": 16470 + }, + { + "epoch": 3.471849159951546, + "grad_norm": 1.0073360204696655, + "learning_rate": 0.0001461699683140822, + "loss": 1.6667, + "step": 16480 + }, + { + "epoch": 3.473955864538895, + "grad_norm": 0.9846724271774292, + "learning_rate": 0.00014611124124716882, + "loss": 1.6832, + "step": 16490 + }, + { + "epoch": 3.476062569126244, + "grad_norm": 0.9981314539909363, + "learning_rate": 0.00014605249397561736, + "loss": 1.6448, + "step": 16500 + }, + { + "epoch": 3.4781692737135934, + "grad_norm": 0.9036053419113159, + "learning_rate": 0.00014599372652516926, + "loss": 1.6785, + "step": 16510 + }, + { + "epoch": 3.4802759783009427, + "grad_norm": 0.9795652031898499, + "learning_rate": 0.00014593493892157473, + "loss": 1.6672, + "step": 16520 + }, + { + "epoch": 3.482382682888292, + "grad_norm": 1.0752781629562378, + "learning_rate": 0.00014587613119059284, + "loss": 1.6374, + "step": 16530 + }, + { + "epoch": 3.4844893874756413, + "grad_norm": 0.9755626916885376, + "learning_rate": 0.0001458173033579914, + "loss": 1.6375, + "step": 16540 + }, + { + "epoch": 3.4865960920629906, + "grad_norm": 1.0344021320343018, + "learning_rate": 0.00014575845544954725, + "loss": 1.5904, + "step": 16550 + }, + { + "epoch": 3.48870279665034, + "grad_norm": 1.0018855333328247, + "learning_rate": 0.00014569958749104575, + "loss": 1.7213, + "step": 16560 + }, + { + "epoch": 3.490809501237689, + "grad_norm": 1.0331135988235474, + "learning_rate": 0.00014564069950828118, + "loss": 1.6596, + "step": 16570 + }, + { + "epoch": 3.492916205825038, + "grad_norm": 1.0158442258834839, + "learning_rate": 0.0001455817915270566, + "loss": 1.7339, + "step": 16580 + }, + { + "epoch": 3.4950229104123873, + "grad_norm": 1.087720274925232, + "learning_rate": 0.0001455228635731839, + "loss": 1.6767, + "step": 16590 + }, + { + "epoch": 3.4971296149997366, + "grad_norm": 0.95531165599823, + "learning_rate": 0.00014546391567248353, + "loss": 1.6726, + "step": 16600 + }, + { + "epoch": 3.499236319587086, + "grad_norm": 1.0385819673538208, + "learning_rate": 0.00014540494785078478, + "loss": 1.6336, + "step": 16610 + }, + { + "epoch": 3.501343024174435, + "grad_norm": 1.0225718021392822, + "learning_rate": 0.00014534596013392575, + "loss": 1.6422, + "step": 16620 + }, + { + "epoch": 3.5034497287617845, + "grad_norm": 1.0323877334594727, + "learning_rate": 0.00014528695254775316, + "loss": 1.7083, + "step": 16630 + }, + { + "epoch": 3.505556433349134, + "grad_norm": 0.9760758280754089, + "learning_rate": 0.00014522792511812243, + "loss": 1.6618, + "step": 16640 + }, + { + "epoch": 3.5076631379364827, + "grad_norm": 0.9613909721374512, + "learning_rate": 0.00014516887787089774, + "loss": 1.7509, + "step": 16650 + }, + { + "epoch": 3.509769842523832, + "grad_norm": 1.0987319946289062, + "learning_rate": 0.00014510981083195188, + "loss": 1.6929, + "step": 16660 + }, + { + "epoch": 3.5118765471111812, + "grad_norm": 0.8977022767066956, + "learning_rate": 0.00014505072402716643, + "loss": 1.656, + "step": 16670 + }, + { + "epoch": 3.5139832516985305, + "grad_norm": 0.9773346781730652, + "learning_rate": 0.00014499161748243147, + "loss": 1.7439, + "step": 16680 + }, + { + "epoch": 3.51608995628588, + "grad_norm": 0.9943004250526428, + "learning_rate": 0.00014493249122364584, + "loss": 1.664, + "step": 16690 + }, + { + "epoch": 3.518196660873229, + "grad_norm": 1.1163723468780518, + "learning_rate": 0.00014487334527671697, + "loss": 1.6813, + "step": 16700 + }, + { + "epoch": 3.5203033654605784, + "grad_norm": 0.9387895464897156, + "learning_rate": 0.00014481417966756102, + "loss": 1.706, + "step": 16710 + }, + { + "epoch": 3.5224100700479273, + "grad_norm": 0.9459789991378784, + "learning_rate": 0.00014475499442210258, + "loss": 1.725, + "step": 16720 + }, + { + "epoch": 3.524516774635277, + "grad_norm": 0.9388308525085449, + "learning_rate": 0.00014469578956627496, + "loss": 1.6384, + "step": 16730 + }, + { + "epoch": 3.526623479222626, + "grad_norm": 1.0331875085830688, + "learning_rate": 0.0001446365651260201, + "loss": 1.6376, + "step": 16740 + }, + { + "epoch": 3.528730183809975, + "grad_norm": 0.9883841276168823, + "learning_rate": 0.00014457732112728848, + "loss": 1.6539, + "step": 16750 + }, + { + "epoch": 3.5308368883973245, + "grad_norm": 0.9518382549285889, + "learning_rate": 0.00014451805759603908, + "loss": 1.6514, + "step": 16760 + }, + { + "epoch": 3.5329435929846738, + "grad_norm": 1.0333259105682373, + "learning_rate": 0.00014445877455823946, + "loss": 1.7183, + "step": 16770 + }, + { + "epoch": 3.535050297572023, + "grad_norm": 0.9487828016281128, + "learning_rate": 0.0001443994720398659, + "loss": 1.6944, + "step": 16780 + }, + { + "epoch": 3.5371570021593723, + "grad_norm": 0.9641709327697754, + "learning_rate": 0.00014434015006690299, + "loss": 1.6249, + "step": 16790 + }, + { + "epoch": 3.5392637067467216, + "grad_norm": 0.936957597732544, + "learning_rate": 0.00014428080866534396, + "loss": 1.6875, + "step": 16800 + }, + { + "epoch": 3.5413704113340705, + "grad_norm": 0.9620563983917236, + "learning_rate": 0.0001442214478611905, + "loss": 1.6837, + "step": 16810 + }, + { + "epoch": 3.54347711592142, + "grad_norm": 0.9967496395111084, + "learning_rate": 0.00014416206768045288, + "loss": 1.6423, + "step": 16820 + }, + { + "epoch": 3.545583820508769, + "grad_norm": 0.9146559834480286, + "learning_rate": 0.0001441026681491498, + "loss": 1.6752, + "step": 16830 + }, + { + "epoch": 3.5476905250961184, + "grad_norm": 1.0962780714035034, + "learning_rate": 0.0001440432492933084, + "loss": 1.7516, + "step": 16840 + }, + { + "epoch": 3.5497972296834677, + "grad_norm": 1.0509783029556274, + "learning_rate": 0.00014398381113896438, + "loss": 1.6429, + "step": 16850 + }, + { + "epoch": 3.551903934270817, + "grad_norm": 0.9411798119544983, + "learning_rate": 0.00014392435371216185, + "loss": 1.6632, + "step": 16860 + }, + { + "epoch": 3.5540106388581663, + "grad_norm": 1.1193450689315796, + "learning_rate": 0.0001438648770389534, + "loss": 1.7557, + "step": 16870 + }, + { + "epoch": 3.556117343445515, + "grad_norm": 0.9980596303939819, + "learning_rate": 0.00014380538114539996, + "loss": 1.6619, + "step": 16880 + }, + { + "epoch": 3.558224048032865, + "grad_norm": 0.8729853630065918, + "learning_rate": 0.00014374586605757095, + "loss": 1.6667, + "step": 16890 + }, + { + "epoch": 3.5603307526202137, + "grad_norm": 0.9395302534103394, + "learning_rate": 0.00014368633180154424, + "loss": 1.6658, + "step": 16900 + }, + { + "epoch": 3.562437457207563, + "grad_norm": 0.9311736226081848, + "learning_rate": 0.00014362677840340602, + "loss": 1.6424, + "step": 16910 + }, + { + "epoch": 3.5645441617949123, + "grad_norm": 0.8994746208190918, + "learning_rate": 0.0001435672058892509, + "loss": 1.678, + "step": 16920 + }, + { + "epoch": 3.5666508663822616, + "grad_norm": 1.0269373655319214, + "learning_rate": 0.00014350761428518185, + "loss": 1.6777, + "step": 16930 + }, + { + "epoch": 3.568757570969611, + "grad_norm": 1.0147863626480103, + "learning_rate": 0.00014344800361731027, + "loss": 1.6976, + "step": 16940 + }, + { + "epoch": 3.57086427555696, + "grad_norm": 0.9315280318260193, + "learning_rate": 0.00014338837391175582, + "loss": 1.6918, + "step": 16950 + }, + { + "epoch": 3.5729709801443095, + "grad_norm": 0.9913371205329895, + "learning_rate": 0.0001433287251946466, + "loss": 1.7383, + "step": 16960 + }, + { + "epoch": 3.5750776847316583, + "grad_norm": 1.0025800466537476, + "learning_rate": 0.00014326905749211888, + "loss": 1.6843, + "step": 16970 + }, + { + "epoch": 3.5771843893190076, + "grad_norm": 1.001194715499878, + "learning_rate": 0.00014320937083031748, + "loss": 1.7082, + "step": 16980 + }, + { + "epoch": 3.579291093906357, + "grad_norm": 0.9992178082466125, + "learning_rate": 0.0001431496652353953, + "loss": 1.7006, + "step": 16990 + }, + { + "epoch": 3.581397798493706, + "grad_norm": 0.9966685175895691, + "learning_rate": 0.0001430899407335137, + "loss": 1.6599, + "step": 17000 + }, + { + "epoch": 3.5835045030810555, + "grad_norm": 0.9909036755561829, + "learning_rate": 0.00014303019735084226, + "loss": 1.699, + "step": 17010 + }, + { + "epoch": 3.585611207668405, + "grad_norm": 0.8726781010627747, + "learning_rate": 0.0001429704351135588, + "loss": 1.7063, + "step": 17020 + }, + { + "epoch": 3.587717912255754, + "grad_norm": 0.9716811180114746, + "learning_rate": 0.00014291065404784946, + "loss": 1.6646, + "step": 17030 + }, + { + "epoch": 3.589824616843103, + "grad_norm": 1.0037916898727417, + "learning_rate": 0.0001428508541799086, + "loss": 1.6589, + "step": 17040 + }, + { + "epoch": 3.5919313214304527, + "grad_norm": 1.0080369710922241, + "learning_rate": 0.00014279103553593885, + "loss": 1.7106, + "step": 17050 + }, + { + "epoch": 3.5940380260178015, + "grad_norm": 1.0465925931930542, + "learning_rate": 0.00014273119814215102, + "loss": 1.6131, + "step": 17060 + }, + { + "epoch": 3.596144730605151, + "grad_norm": 0.9277071356773376, + "learning_rate": 0.00014267134202476417, + "loss": 1.6802, + "step": 17070 + }, + { + "epoch": 3.5982514351925, + "grad_norm": 0.9744635820388794, + "learning_rate": 0.00014261146721000553, + "loss": 1.6739, + "step": 17080 + }, + { + "epoch": 3.6003581397798494, + "grad_norm": 0.9288362264633179, + "learning_rate": 0.00014255157372411058, + "loss": 1.6332, + "step": 17090 + }, + { + "epoch": 3.6024648443671987, + "grad_norm": 0.9643329381942749, + "learning_rate": 0.0001424916615933229, + "loss": 1.6615, + "step": 17100 + }, + { + "epoch": 3.604571548954548, + "grad_norm": 1.0060192346572876, + "learning_rate": 0.00014243173084389437, + "loss": 1.6714, + "step": 17110 + }, + { + "epoch": 3.6066782535418973, + "grad_norm": 0.9957178831100464, + "learning_rate": 0.00014237178150208486, + "loss": 1.6903, + "step": 17120 + }, + { + "epoch": 3.608784958129246, + "grad_norm": 0.9705983400344849, + "learning_rate": 0.00014231181359416247, + "loss": 1.6499, + "step": 17130 + }, + { + "epoch": 3.6108916627165955, + "grad_norm": 1.016700029373169, + "learning_rate": 0.0001422518271464035, + "loss": 1.7183, + "step": 17140 + }, + { + "epoch": 3.6129983673039447, + "grad_norm": 1.1801646947860718, + "learning_rate": 0.0001421918221850923, + "loss": 1.6735, + "step": 17150 + }, + { + "epoch": 3.615105071891294, + "grad_norm": 1.018476128578186, + "learning_rate": 0.00014213179873652127, + "loss": 1.6662, + "step": 17160 + }, + { + "epoch": 3.6172117764786433, + "grad_norm": 1.0025722980499268, + "learning_rate": 0.00014207175682699107, + "loss": 1.6927, + "step": 17170 + }, + { + "epoch": 3.6193184810659926, + "grad_norm": 0.9412466883659363, + "learning_rate": 0.00014201169648281027, + "loss": 1.7017, + "step": 17180 + }, + { + "epoch": 3.621425185653342, + "grad_norm": 0.997974693775177, + "learning_rate": 0.0001419516177302957, + "loss": 1.6749, + "step": 17190 + }, + { + "epoch": 3.623531890240691, + "grad_norm": 1.0293715000152588, + "learning_rate": 0.00014189152059577214, + "loss": 1.6701, + "step": 17200 + }, + { + "epoch": 3.62563859482804, + "grad_norm": 1.0058647394180298, + "learning_rate": 0.0001418314051055724, + "loss": 1.6771, + "step": 17210 + }, + { + "epoch": 3.6277452994153894, + "grad_norm": 0.9235060214996338, + "learning_rate": 0.00014177127128603745, + "loss": 1.6577, + "step": 17220 + }, + { + "epoch": 3.6298520040027387, + "grad_norm": 1.0182476043701172, + "learning_rate": 0.00014171111916351623, + "loss": 1.786, + "step": 17230 + }, + { + "epoch": 3.631958708590088, + "grad_norm": 1.0080841779708862, + "learning_rate": 0.00014165094876436563, + "loss": 1.6719, + "step": 17240 + }, + { + "epoch": 3.6340654131774373, + "grad_norm": 0.9524149894714355, + "learning_rate": 0.00014159076011495061, + "loss": 1.6081, + "step": 17250 + }, + { + "epoch": 3.6361721177647865, + "grad_norm": 0.9929611086845398, + "learning_rate": 0.00014153055324164418, + "loss": 1.6987, + "step": 17260 + }, + { + "epoch": 3.6382788223521354, + "grad_norm": 0.9325882196426392, + "learning_rate": 0.00014147032817082728, + "loss": 1.6635, + "step": 17270 + }, + { + "epoch": 3.640385526939485, + "grad_norm": 1.0150893926620483, + "learning_rate": 0.0001414100849288888, + "loss": 1.6721, + "step": 17280 + }, + { + "epoch": 3.642492231526834, + "grad_norm": 1.0248265266418457, + "learning_rate": 0.00014134982354222563, + "loss": 1.7073, + "step": 17290 + }, + { + "epoch": 3.6445989361141833, + "grad_norm": 1.0384207963943481, + "learning_rate": 0.0001412895440372426, + "loss": 1.5953, + "step": 17300 + }, + { + "epoch": 3.6467056407015326, + "grad_norm": 0.996492862701416, + "learning_rate": 0.00014122924644035249, + "loss": 1.6656, + "step": 17310 + }, + { + "epoch": 3.648812345288882, + "grad_norm": 0.9860630035400391, + "learning_rate": 0.000141168930777976, + "loss": 1.7256, + "step": 17320 + }, + { + "epoch": 3.650919049876231, + "grad_norm": 0.9836267232894897, + "learning_rate": 0.0001411085970765417, + "loss": 1.6836, + "step": 17330 + }, + { + "epoch": 3.6530257544635805, + "grad_norm": 0.9142735004425049, + "learning_rate": 0.00014104824536248614, + "loss": 1.6744, + "step": 17340 + }, + { + "epoch": 3.6551324590509298, + "grad_norm": 1.0227885246276855, + "learning_rate": 0.00014098787566225375, + "loss": 1.7145, + "step": 17350 + }, + { + "epoch": 3.6572391636382786, + "grad_norm": 1.0329092741012573, + "learning_rate": 0.00014092748800229683, + "loss": 1.6737, + "step": 17360 + }, + { + "epoch": 3.659345868225628, + "grad_norm": 1.0570374727249146, + "learning_rate": 0.00014086708240907542, + "loss": 1.6968, + "step": 17370 + }, + { + "epoch": 3.661452572812977, + "grad_norm": 0.9342901110649109, + "learning_rate": 0.0001408066589090577, + "loss": 1.6643, + "step": 17380 + }, + { + "epoch": 3.6635592774003265, + "grad_norm": 0.9996681809425354, + "learning_rate": 0.00014074621752871943, + "loss": 1.6771, + "step": 17390 + }, + { + "epoch": 3.665665981987676, + "grad_norm": 0.9607349634170532, + "learning_rate": 0.00014068575829454436, + "loss": 1.6859, + "step": 17400 + }, + { + "epoch": 3.667772686575025, + "grad_norm": 1.1797279119491577, + "learning_rate": 0.00014062528123302395, + "loss": 1.6771, + "step": 17410 + }, + { + "epoch": 3.6698793911623744, + "grad_norm": 1.010092854499817, + "learning_rate": 0.00014056478637065761, + "loss": 1.7489, + "step": 17420 + }, + { + "epoch": 3.6719860957497232, + "grad_norm": 0.9948980212211609, + "learning_rate": 0.0001405042737339524, + "loss": 1.6696, + "step": 17430 + }, + { + "epoch": 3.674092800337073, + "grad_norm": 1.0839941501617432, + "learning_rate": 0.00014044374334942333, + "loss": 1.7265, + "step": 17440 + }, + { + "epoch": 3.676199504924422, + "grad_norm": 0.9895996451377869, + "learning_rate": 0.00014038319524359297, + "loss": 1.694, + "step": 17450 + }, + { + "epoch": 3.678306209511771, + "grad_norm": 1.0752763748168945, + "learning_rate": 0.00014032262944299194, + "loss": 1.6722, + "step": 17460 + }, + { + "epoch": 3.6804129140991204, + "grad_norm": 1.049815058708191, + "learning_rate": 0.0001402620459741583, + "loss": 1.6526, + "step": 17470 + }, + { + "epoch": 3.6825196186864697, + "grad_norm": 0.9036969542503357, + "learning_rate": 0.00014020144486363812, + "loss": 1.686, + "step": 17480 + }, + { + "epoch": 3.684626323273819, + "grad_norm": 1.0472948551177979, + "learning_rate": 0.00014014082613798503, + "loss": 1.7052, + "step": 17490 + }, + { + "epoch": 3.6867330278611683, + "grad_norm": 1.0632350444793701, + "learning_rate": 0.00014008018982376044, + "loss": 1.7132, + "step": 17500 + }, + { + "epoch": 3.6888397324485176, + "grad_norm": 0.9786173105239868, + "learning_rate": 0.00014001953594753352, + "loss": 1.6874, + "step": 17510 + }, + { + "epoch": 3.6909464370358664, + "grad_norm": 0.977205753326416, + "learning_rate": 0.00013995886453588104, + "loss": 1.6544, + "step": 17520 + }, + { + "epoch": 3.6930531416232157, + "grad_norm": 0.9975076913833618, + "learning_rate": 0.00013989817561538742, + "loss": 1.6845, + "step": 17530 + }, + { + "epoch": 3.695159846210565, + "grad_norm": 1.027984857559204, + "learning_rate": 0.00013983746921264494, + "loss": 1.6706, + "step": 17540 + }, + { + "epoch": 3.6972665507979143, + "grad_norm": 0.9895164966583252, + "learning_rate": 0.00013977674535425337, + "loss": 1.6418, + "step": 17550 + }, + { + "epoch": 3.6993732553852636, + "grad_norm": 0.8872331380844116, + "learning_rate": 0.0001397160040668202, + "loss": 1.7271, + "step": 17560 + }, + { + "epoch": 3.701479959972613, + "grad_norm": 0.8885982036590576, + "learning_rate": 0.00013965524537696048, + "loss": 1.6964, + "step": 17570 + }, + { + "epoch": 3.703586664559962, + "grad_norm": 0.9368189573287964, + "learning_rate": 0.00013959446931129704, + "loss": 1.6421, + "step": 17580 + }, + { + "epoch": 3.705693369147311, + "grad_norm": 1.0521513223648071, + "learning_rate": 0.00013953367589646015, + "loss": 1.6955, + "step": 17590 + }, + { + "epoch": 3.707800073734661, + "grad_norm": 0.9745957255363464, + "learning_rate": 0.00013947286515908779, + "loss": 1.6645, + "step": 17600 + }, + { + "epoch": 3.7099067783220097, + "grad_norm": 0.8766047358512878, + "learning_rate": 0.00013941203712582553, + "loss": 1.6694, + "step": 17610 + }, + { + "epoch": 3.712013482909359, + "grad_norm": 1.053513526916504, + "learning_rate": 0.00013935119182332642, + "loss": 1.7104, + "step": 17620 + }, + { + "epoch": 3.7141201874967082, + "grad_norm": 0.9564160704612732, + "learning_rate": 0.0001392903292782512, + "loss": 1.6903, + "step": 17630 + }, + { + "epoch": 3.7162268920840575, + "grad_norm": 0.9763982892036438, + "learning_rate": 0.0001392294495172681, + "loss": 1.6762, + "step": 17640 + }, + { + "epoch": 3.718333596671407, + "grad_norm": 0.9978336095809937, + "learning_rate": 0.00013916855256705288, + "loss": 1.6788, + "step": 17650 + }, + { + "epoch": 3.720440301258756, + "grad_norm": 1.0024161338806152, + "learning_rate": 0.0001391076384542889, + "loss": 1.6639, + "step": 17660 + }, + { + "epoch": 3.7225470058461054, + "grad_norm": 1.06181800365448, + "learning_rate": 0.00013904670720566698, + "loss": 1.6124, + "step": 17670 + }, + { + "epoch": 3.7246537104334543, + "grad_norm": 0.9750447273254395, + "learning_rate": 0.00013898575884788543, + "loss": 1.6872, + "step": 17680 + }, + { + "epoch": 3.7267604150208036, + "grad_norm": 0.9319090247154236, + "learning_rate": 0.0001389247934076501, + "loss": 1.6294, + "step": 17690 + }, + { + "epoch": 3.728867119608153, + "grad_norm": 0.9692747592926025, + "learning_rate": 0.0001388638109116744, + "loss": 1.6471, + "step": 17700 + }, + { + "epoch": 3.730973824195502, + "grad_norm": 0.9356905817985535, + "learning_rate": 0.00013880281138667905, + "loss": 1.649, + "step": 17710 + }, + { + "epoch": 3.7330805287828515, + "grad_norm": 0.9623681902885437, + "learning_rate": 0.0001387417948593923, + "loss": 1.7005, + "step": 17720 + }, + { + "epoch": 3.7351872333702008, + "grad_norm": 1.0994905233383179, + "learning_rate": 0.0001386807613565499, + "loss": 1.7227, + "step": 17730 + }, + { + "epoch": 3.73729393795755, + "grad_norm": 1.081142783164978, + "learning_rate": 0.000138619710904895, + "loss": 1.6773, + "step": 17740 + }, + { + "epoch": 3.739400642544899, + "grad_norm": 0.921646237373352, + "learning_rate": 0.00013855864353117816, + "loss": 1.6094, + "step": 17750 + }, + { + "epoch": 3.7415073471322486, + "grad_norm": 0.930452287197113, + "learning_rate": 0.00013849755926215735, + "loss": 1.6182, + "step": 17760 + }, + { + "epoch": 3.7436140517195975, + "grad_norm": 0.939112663269043, + "learning_rate": 0.00013843645812459802, + "loss": 1.6962, + "step": 17770 + }, + { + "epoch": 3.745720756306947, + "grad_norm": 0.9898761510848999, + "learning_rate": 0.0001383753401452729, + "loss": 1.6638, + "step": 17780 + }, + { + "epoch": 3.747827460894296, + "grad_norm": 0.9990235567092896, + "learning_rate": 0.00013831420535096223, + "loss": 1.6496, + "step": 17790 + }, + { + "epoch": 3.7499341654816454, + "grad_norm": 0.9664174318313599, + "learning_rate": 0.00013825305376845347, + "loss": 1.6869, + "step": 17800 + }, + { + "epoch": 3.7520408700689947, + "grad_norm": 0.9983745813369751, + "learning_rate": 0.0001381918854245415, + "loss": 1.6858, + "step": 17810 + }, + { + "epoch": 3.754147574656344, + "grad_norm": 0.9060636162757874, + "learning_rate": 0.00013813070034602863, + "loss": 1.6385, + "step": 17820 + }, + { + "epoch": 3.7562542792436933, + "grad_norm": 0.9503934979438782, + "learning_rate": 0.00013806949855972434, + "loss": 1.6733, + "step": 17830 + }, + { + "epoch": 3.758360983831042, + "grad_norm": 0.9625155925750732, + "learning_rate": 0.00013800828009244559, + "loss": 1.6871, + "step": 17840 + }, + { + "epoch": 3.7604676884183914, + "grad_norm": 0.985414981842041, + "learning_rate": 0.00013794704497101655, + "loss": 1.6641, + "step": 17850 + }, + { + "epoch": 3.7625743930057407, + "grad_norm": 0.9473032355308533, + "learning_rate": 0.00013788579322226868, + "loss": 1.6459, + "step": 17860 + }, + { + "epoch": 3.76468109759309, + "grad_norm": 1.0287021398544312, + "learning_rate": 0.0001378245248730408, + "loss": 1.6515, + "step": 17870 + }, + { + "epoch": 3.7667878021804393, + "grad_norm": 0.9983483552932739, + "learning_rate": 0.00013776323995017898, + "loss": 1.6521, + "step": 17880 + }, + { + "epoch": 3.7688945067677886, + "grad_norm": 1.0303962230682373, + "learning_rate": 0.00013770193848053648, + "loss": 1.7067, + "step": 17890 + }, + { + "epoch": 3.771001211355138, + "grad_norm": 1.0121102333068848, + "learning_rate": 0.00013764062049097389, + "loss": 1.6628, + "step": 17900 + }, + { + "epoch": 3.7731079159424867, + "grad_norm": 0.9781060218811035, + "learning_rate": 0.000137579286008359, + "loss": 1.5913, + "step": 17910 + }, + { + "epoch": 3.7752146205298365, + "grad_norm": 1.03104829788208, + "learning_rate": 0.0001375179350595669, + "loss": 1.675, + "step": 17920 + }, + { + "epoch": 3.7773213251171853, + "grad_norm": 0.9380589127540588, + "learning_rate": 0.00013745656767147978, + "loss": 1.6743, + "step": 17930 + }, + { + "epoch": 3.7794280297045346, + "grad_norm": 0.9748852252960205, + "learning_rate": 0.00013739518387098705, + "loss": 1.6895, + "step": 17940 + }, + { + "epoch": 3.781534734291884, + "grad_norm": 0.973847508430481, + "learning_rate": 0.00013733378368498543, + "loss": 1.6514, + "step": 17950 + }, + { + "epoch": 3.783641438879233, + "grad_norm": 1.1825324296951294, + "learning_rate": 0.00013727236714037872, + "loss": 1.6661, + "step": 17960 + }, + { + "epoch": 3.7857481434665825, + "grad_norm": 0.9843605756759644, + "learning_rate": 0.0001372109342640779, + "loss": 1.6192, + "step": 17970 + }, + { + "epoch": 3.787854848053932, + "grad_norm": 1.0862230062484741, + "learning_rate": 0.0001371494850830011, + "loss": 1.6936, + "step": 17980 + }, + { + "epoch": 3.789961552641281, + "grad_norm": 0.9965257048606873, + "learning_rate": 0.0001370880196240736, + "loss": 1.6191, + "step": 17990 + }, + { + "epoch": 3.79206825722863, + "grad_norm": 1.045751690864563, + "learning_rate": 0.0001370265379142279, + "loss": 1.746, + "step": 18000 + }, + { + "epoch": 3.7941749618159792, + "grad_norm": 0.9319239854812622, + "learning_rate": 0.00013696503998040342, + "loss": 1.7011, + "step": 18010 + }, + { + "epoch": 3.7962816664033285, + "grad_norm": 1.0652892589569092, + "learning_rate": 0.0001369035258495469, + "loss": 1.6326, + "step": 18020 + }, + { + "epoch": 3.798388370990678, + "grad_norm": 0.9452232718467712, + "learning_rate": 0.00013684199554861207, + "loss": 1.7333, + "step": 18030 + }, + { + "epoch": 3.800495075578027, + "grad_norm": 0.9951366186141968, + "learning_rate": 0.00013678044910455975, + "loss": 1.7036, + "step": 18040 + }, + { + "epoch": 3.8026017801653764, + "grad_norm": 1.0448187589645386, + "learning_rate": 0.0001367188865443578, + "loss": 1.6969, + "step": 18050 + }, + { + "epoch": 3.8047084847527257, + "grad_norm": 0.952127993106842, + "learning_rate": 0.0001366573078949813, + "loss": 1.6977, + "step": 18060 + }, + { + "epoch": 3.8068151893400746, + "grad_norm": 0.9487131237983704, + "learning_rate": 0.0001365957131834122, + "loss": 1.6284, + "step": 18070 + }, + { + "epoch": 3.808921893927424, + "grad_norm": 1.0855556726455688, + "learning_rate": 0.00013653410243663952, + "loss": 1.7035, + "step": 18080 + }, + { + "epoch": 3.811028598514773, + "grad_norm": 0.9785754084587097, + "learning_rate": 0.00013647247568165938, + "loss": 1.67, + "step": 18090 + }, + { + "epoch": 3.8131353031021225, + "grad_norm": 0.981380045413971, + "learning_rate": 0.0001364108329454749, + "loss": 1.7031, + "step": 18100 + }, + { + "epoch": 3.8152420076894717, + "grad_norm": 0.9764424562454224, + "learning_rate": 0.00013634917425509616, + "loss": 1.7354, + "step": 18110 + }, + { + "epoch": 3.817348712276821, + "grad_norm": 0.8839856386184692, + "learning_rate": 0.00013628749963754026, + "loss": 1.7048, + "step": 18120 + }, + { + "epoch": 3.8194554168641703, + "grad_norm": 1.0101412534713745, + "learning_rate": 0.0001362258091198312, + "loss": 1.6833, + "step": 18130 + }, + { + "epoch": 3.821562121451519, + "grad_norm": 0.9509012699127197, + "learning_rate": 0.00013616410272900014, + "loss": 1.6583, + "step": 18140 + }, + { + "epoch": 3.823668826038869, + "grad_norm": 0.9501906037330627, + "learning_rate": 0.00013610238049208495, + "loss": 1.6526, + "step": 18150 + }, + { + "epoch": 3.825775530626218, + "grad_norm": 0.9449414610862732, + "learning_rate": 0.0001360406424361306, + "loss": 1.6726, + "step": 18160 + }, + { + "epoch": 3.827882235213567, + "grad_norm": 0.9675168991088867, + "learning_rate": 0.00013597888858818898, + "loss": 1.6955, + "step": 18170 + }, + { + "epoch": 3.8299889398009164, + "grad_norm": 0.9368403553962708, + "learning_rate": 0.0001359171189753189, + "loss": 1.665, + "step": 18180 + }, + { + "epoch": 3.8320956443882657, + "grad_norm": 1.0501976013183594, + "learning_rate": 0.00013585533362458599, + "loss": 1.6632, + "step": 18190 + }, + { + "epoch": 3.834202348975615, + "grad_norm": 0.9386237859725952, + "learning_rate": 0.00013579353256306287, + "loss": 1.6612, + "step": 18200 + }, + { + "epoch": 3.8363090535629643, + "grad_norm": 1.0792053937911987, + "learning_rate": 0.00013573171581782897, + "loss": 1.6528, + "step": 18210 + }, + { + "epoch": 3.8384157581503136, + "grad_norm": 0.9612410664558411, + "learning_rate": 0.00013566988341597068, + "loss": 1.6405, + "step": 18220 + }, + { + "epoch": 3.8405224627376624, + "grad_norm": 1.0245602130889893, + "learning_rate": 0.00013560803538458123, + "loss": 1.6902, + "step": 18230 + }, + { + "epoch": 3.8426291673250117, + "grad_norm": 0.8941143751144409, + "learning_rate": 0.00013554617175076062, + "loss": 1.629, + "step": 18240 + }, + { + "epoch": 3.844735871912361, + "grad_norm": 0.9566497206687927, + "learning_rate": 0.00013548429254161575, + "loss": 1.5743, + "step": 18250 + }, + { + "epoch": 3.8468425764997103, + "grad_norm": 1.0370187759399414, + "learning_rate": 0.00013542239778426034, + "loss": 1.6781, + "step": 18260 + }, + { + "epoch": 3.8489492810870596, + "grad_norm": 0.8610029220581055, + "learning_rate": 0.00013536048750581494, + "loss": 1.681, + "step": 18270 + }, + { + "epoch": 3.851055985674409, + "grad_norm": 0.8663182854652405, + "learning_rate": 0.00013529856173340684, + "loss": 1.6648, + "step": 18280 + }, + { + "epoch": 3.853162690261758, + "grad_norm": 0.9131945371627808, + "learning_rate": 0.00013523662049417015, + "loss": 1.6575, + "step": 18290 + }, + { + "epoch": 3.855269394849107, + "grad_norm": 0.9651349782943726, + "learning_rate": 0.0001351746638152458, + "loss": 1.6545, + "step": 18300 + }, + { + "epoch": 3.8573760994364568, + "grad_norm": 0.8891710042953491, + "learning_rate": 0.00013511269172378147, + "loss": 1.643, + "step": 18310 + }, + { + "epoch": 3.8594828040238056, + "grad_norm": 0.9134824872016907, + "learning_rate": 0.00013505070424693153, + "loss": 1.6766, + "step": 18320 + }, + { + "epoch": 3.861589508611155, + "grad_norm": 1.019539475440979, + "learning_rate": 0.00013498870141185712, + "loss": 1.6787, + "step": 18330 + }, + { + "epoch": 3.863696213198504, + "grad_norm": 1.0163251161575317, + "learning_rate": 0.00013492668324572614, + "loss": 1.7008, + "step": 18340 + }, + { + "epoch": 3.8658029177858535, + "grad_norm": 0.9731863141059875, + "learning_rate": 0.00013486464977571324, + "loss": 1.6363, + "step": 18350 + }, + { + "epoch": 3.867909622373203, + "grad_norm": 0.9799426198005676, + "learning_rate": 0.00013480260102899966, + "loss": 1.6885, + "step": 18360 + }, + { + "epoch": 3.870016326960552, + "grad_norm": 1.055290699005127, + "learning_rate": 0.00013474053703277342, + "loss": 1.7328, + "step": 18370 + }, + { + "epoch": 3.8721230315479014, + "grad_norm": 0.930711030960083, + "learning_rate": 0.00013467845781422924, + "loss": 1.6725, + "step": 18380 + }, + { + "epoch": 3.8742297361352502, + "grad_norm": 0.9686673283576965, + "learning_rate": 0.00013461636340056843, + "loss": 1.6906, + "step": 18390 + }, + { + "epoch": 3.8763364407225995, + "grad_norm": 0.9889995455741882, + "learning_rate": 0.000134554253818999, + "loss": 1.6348, + "step": 18400 + }, + { + "epoch": 3.878443145309949, + "grad_norm": 0.964756965637207, + "learning_rate": 0.00013449212909673563, + "loss": 1.6999, + "step": 18410 + }, + { + "epoch": 3.880549849897298, + "grad_norm": 0.945366621017456, + "learning_rate": 0.0001344299892609996, + "loss": 1.6821, + "step": 18420 + }, + { + "epoch": 3.8826565544846474, + "grad_norm": 0.9812168478965759, + "learning_rate": 0.0001343678343390188, + "loss": 1.6809, + "step": 18430 + }, + { + "epoch": 3.8847632590719967, + "grad_norm": 0.9222221970558167, + "learning_rate": 0.00013430566435802783, + "loss": 1.6884, + "step": 18440 + }, + { + "epoch": 3.886869963659346, + "grad_norm": 1.0188971757888794, + "learning_rate": 0.00013424347934526772, + "loss": 1.6886, + "step": 18450 + }, + { + "epoch": 3.888976668246695, + "grad_norm": 0.8854445815086365, + "learning_rate": 0.00013418127932798623, + "loss": 1.6784, + "step": 18460 + }, + { + "epoch": 3.8910833728340446, + "grad_norm": 0.9683605432510376, + "learning_rate": 0.00013411906433343765, + "loss": 1.6627, + "step": 18470 + }, + { + "epoch": 3.8931900774213934, + "grad_norm": 0.9435099363327026, + "learning_rate": 0.00013405683438888282, + "loss": 1.7143, + "step": 18480 + }, + { + "epoch": 3.8952967820087427, + "grad_norm": 0.8995447158813477, + "learning_rate": 0.0001339945895215891, + "loss": 1.6652, + "step": 18490 + }, + { + "epoch": 3.897403486596092, + "grad_norm": 0.9004771113395691, + "learning_rate": 0.00013393232975883042, + "loss": 1.6349, + "step": 18500 + }, + { + "epoch": 3.8995101911834413, + "grad_norm": 1.0811883211135864, + "learning_rate": 0.00013387005512788733, + "loss": 1.6849, + "step": 18510 + }, + { + "epoch": 3.9016168957707906, + "grad_norm": 0.9638143181800842, + "learning_rate": 0.00013380776565604676, + "loss": 1.6405, + "step": 18520 + }, + { + "epoch": 3.90372360035814, + "grad_norm": 0.9664076566696167, + "learning_rate": 0.00013374546137060212, + "loss": 1.6693, + "step": 18530 + }, + { + "epoch": 3.905830304945489, + "grad_norm": 1.1116750240325928, + "learning_rate": 0.00013368314229885347, + "loss": 1.6814, + "step": 18540 + }, + { + "epoch": 3.907937009532838, + "grad_norm": 1.125832438468933, + "learning_rate": 0.00013362080846810725, + "loss": 1.6283, + "step": 18550 + }, + { + "epoch": 3.9100437141201874, + "grad_norm": 0.9035454988479614, + "learning_rate": 0.00013355845990567635, + "loss": 1.6729, + "step": 18560 + }, + { + "epoch": 3.9121504187075367, + "grad_norm": 1.0243180990219116, + "learning_rate": 0.00013349609663888015, + "loss": 1.6674, + "step": 18570 + }, + { + "epoch": 3.914257123294886, + "grad_norm": 1.1027157306671143, + "learning_rate": 0.00013343371869504444, + "loss": 1.6408, + "step": 18580 + }, + { + "epoch": 3.9163638278822352, + "grad_norm": 0.9812912344932556, + "learning_rate": 0.00013337132610150148, + "loss": 1.7142, + "step": 18590 + }, + { + "epoch": 3.9184705324695845, + "grad_norm": 0.9783474802970886, + "learning_rate": 0.00013330891888559002, + "loss": 1.6257, + "step": 18600 + }, + { + "epoch": 3.920577237056934, + "grad_norm": 1.0197913646697998, + "learning_rate": 0.000133246497074655, + "loss": 1.6701, + "step": 18610 + }, + { + "epoch": 3.9226839416442827, + "grad_norm": 1.0612571239471436, + "learning_rate": 0.00013318406069604794, + "loss": 1.6635, + "step": 18620 + }, + { + "epoch": 3.9247906462316324, + "grad_norm": 0.922271728515625, + "learning_rate": 0.00013312160977712668, + "loss": 1.6573, + "step": 18630 + }, + { + "epoch": 3.9268973508189813, + "grad_norm": 0.9498322010040283, + "learning_rate": 0.00013305914434525552, + "loss": 1.6633, + "step": 18640 + }, + { + "epoch": 3.9290040554063306, + "grad_norm": 0.9405461549758911, + "learning_rate": 0.00013299666442780493, + "loss": 1.6647, + "step": 18650 + }, + { + "epoch": 3.93111075999368, + "grad_norm": 0.9539029002189636, + "learning_rate": 0.00013293417005215188, + "loss": 1.6587, + "step": 18660 + }, + { + "epoch": 3.933217464581029, + "grad_norm": 0.9618557691574097, + "learning_rate": 0.00013287166124567964, + "loss": 1.7109, + "step": 18670 + }, + { + "epoch": 3.9353241691683785, + "grad_norm": 0.9625663161277771, + "learning_rate": 0.0001328091380357778, + "loss": 1.6915, + "step": 18680 + }, + { + "epoch": 3.9374308737557278, + "grad_norm": 0.9376285672187805, + "learning_rate": 0.00013274660044984224, + "loss": 1.7005, + "step": 18690 + }, + { + "epoch": 3.939537578343077, + "grad_norm": 0.8864640593528748, + "learning_rate": 0.00013268404851527518, + "loss": 1.6187, + "step": 18700 + }, + { + "epoch": 3.941644282930426, + "grad_norm": 1.2145129442214966, + "learning_rate": 0.00013262148225948506, + "loss": 1.6385, + "step": 18710 + }, + { + "epoch": 3.943750987517775, + "grad_norm": 0.9888447523117065, + "learning_rate": 0.0001325589017098867, + "loss": 1.64, + "step": 18720 + }, + { + "epoch": 3.9458576921051245, + "grad_norm": 1.1795809268951416, + "learning_rate": 0.0001324963068939011, + "loss": 1.7333, + "step": 18730 + }, + { + "epoch": 3.947964396692474, + "grad_norm": 1.0691869258880615, + "learning_rate": 0.00013243369783895548, + "loss": 1.6156, + "step": 18740 + }, + { + "epoch": 3.950071101279823, + "grad_norm": 0.9917706251144409, + "learning_rate": 0.0001323710745724834, + "loss": 1.5666, + "step": 18750 + }, + { + "epoch": 3.9521778058671724, + "grad_norm": 0.9891237020492554, + "learning_rate": 0.00013230843712192463, + "loss": 1.7003, + "step": 18760 + }, + { + "epoch": 3.9542845104545217, + "grad_norm": 0.9454847574234009, + "learning_rate": 0.0001322457855147251, + "loss": 1.7411, + "step": 18770 + }, + { + "epoch": 3.9563912150418705, + "grad_norm": 0.9509596228599548, + "learning_rate": 0.00013218311977833687, + "loss": 1.6653, + "step": 18780 + }, + { + "epoch": 3.9584979196292203, + "grad_norm": 0.9767935276031494, + "learning_rate": 0.00013212043994021845, + "loss": 1.7047, + "step": 18790 + }, + { + "epoch": 3.960604624216569, + "grad_norm": 0.972263514995575, + "learning_rate": 0.00013205774602783428, + "loss": 1.6834, + "step": 18800 + }, + { + "epoch": 3.9627113288039184, + "grad_norm": 0.9423323273658752, + "learning_rate": 0.00013199503806865504, + "loss": 1.6837, + "step": 18810 + }, + { + "epoch": 3.9648180333912677, + "grad_norm": 0.9944238066673279, + "learning_rate": 0.00013193231609015763, + "loss": 1.6581, + "step": 18820 + }, + { + "epoch": 3.966924737978617, + "grad_norm": 1.011017918586731, + "learning_rate": 0.00013186958011982502, + "loss": 1.657, + "step": 18830 + }, + { + "epoch": 3.9690314425659663, + "grad_norm": 0.9588932991027832, + "learning_rate": 0.0001318068301851463, + "loss": 1.6204, + "step": 18840 + }, + { + "epoch": 3.9711381471533156, + "grad_norm": 0.9741507768630981, + "learning_rate": 0.00013174406631361675, + "loss": 1.663, + "step": 18850 + }, + { + "epoch": 3.973244851740665, + "grad_norm": 0.897010326385498, + "learning_rate": 0.00013168128853273772, + "loss": 1.6428, + "step": 18860 + }, + { + "epoch": 3.9753515563280137, + "grad_norm": 0.9145323634147644, + "learning_rate": 0.00013161849687001666, + "loss": 1.6265, + "step": 18870 + }, + { + "epoch": 3.977458260915363, + "grad_norm": 1.093653678894043, + "learning_rate": 0.00013155569135296703, + "loss": 1.6954, + "step": 18880 + }, + { + "epoch": 3.9795649655027123, + "grad_norm": 0.9764483571052551, + "learning_rate": 0.0001314928720091085, + "loss": 1.6673, + "step": 18890 + }, + { + "epoch": 3.9816716700900616, + "grad_norm": 0.9174908995628357, + "learning_rate": 0.00013143003886596669, + "loss": 1.688, + "step": 18900 + }, + { + "epoch": 3.983778374677411, + "grad_norm": 0.9659301042556763, + "learning_rate": 0.00013136719195107335, + "loss": 1.6748, + "step": 18910 + }, + { + "epoch": 3.98588507926476, + "grad_norm": 0.9585952162742615, + "learning_rate": 0.00013130433129196614, + "loss": 1.6963, + "step": 18920 + }, + { + "epoch": 3.9879917838521095, + "grad_norm": 0.9116827249526978, + "learning_rate": 0.00013124145691618884, + "loss": 1.6635, + "step": 18930 + }, + { + "epoch": 3.9900984884394584, + "grad_norm": 1.0654014348983765, + "learning_rate": 0.00013117856885129126, + "loss": 1.6963, + "step": 18940 + }, + { + "epoch": 3.9922051930268077, + "grad_norm": 1.0164047479629517, + "learning_rate": 0.00013111566712482913, + "loss": 1.7179, + "step": 18950 + }, + { + "epoch": 3.994311897614157, + "grad_norm": 0.9828543066978455, + "learning_rate": 0.0001310527517643642, + "loss": 1.6681, + "step": 18960 + }, + { + "epoch": 3.9964186022015062, + "grad_norm": 0.9909005761146545, + "learning_rate": 0.00013098982279746422, + "loss": 1.6163, + "step": 18970 + }, + { + "epoch": 3.9985253067888555, + "grad_norm": 0.9685956239700317, + "learning_rate": 0.00013092688025170284, + "loss": 1.6844, + "step": 18980 + }, + { + "epoch": 4.000632011376204, + "grad_norm": 0.9608546495437622, + "learning_rate": 0.00013086392415465972, + "loss": 1.6726, + "step": 18990 + }, + { + "epoch": 4.002738715963554, + "grad_norm": 0.9137300848960876, + "learning_rate": 0.0001308009545339205, + "loss": 1.6214, + "step": 19000 + }, + { + "epoch": 4.004845420550903, + "grad_norm": 1.1166579723358154, + "learning_rate": 0.00013073797141707657, + "loss": 1.5999, + "step": 19010 + }, + { + "epoch": 4.006952125138253, + "grad_norm": 0.9817916750907898, + "learning_rate": 0.00013067497483172538, + "loss": 1.6498, + "step": 19020 + }, + { + "epoch": 4.009058829725602, + "grad_norm": 0.9361011981964111, + "learning_rate": 0.0001306119648054703, + "loss": 1.629, + "step": 19030 + }, + { + "epoch": 4.011165534312951, + "grad_norm": 0.9715505242347717, + "learning_rate": 0.00013054894136592052, + "loss": 1.6419, + "step": 19040 + }, + { + "epoch": 4.0132722389003, + "grad_norm": 0.9041184782981873, + "learning_rate": 0.00013048590454069108, + "loss": 1.5842, + "step": 19050 + }, + { + "epoch": 4.015378943487649, + "grad_norm": 0.9362772107124329, + "learning_rate": 0.0001304228543574029, + "loss": 1.575, + "step": 19060 + }, + { + "epoch": 4.017485648074999, + "grad_norm": 1.1021233797073364, + "learning_rate": 0.00013035979084368292, + "loss": 1.6569, + "step": 19070 + }, + { + "epoch": 4.019592352662348, + "grad_norm": 0.9974276423454285, + "learning_rate": 0.00013029671402716366, + "loss": 1.6237, + "step": 19080 + }, + { + "epoch": 4.021699057249697, + "grad_norm": 0.9724931120872498, + "learning_rate": 0.00013023362393548363, + "loss": 1.68, + "step": 19090 + }, + { + "epoch": 4.023805761837046, + "grad_norm": 0.990185558795929, + "learning_rate": 0.0001301705205962871, + "loss": 1.6594, + "step": 19100 + }, + { + "epoch": 4.025912466424396, + "grad_norm": 1.051291584968567, + "learning_rate": 0.0001301074040372242, + "loss": 1.5814, + "step": 19110 + }, + { + "epoch": 4.028019171011745, + "grad_norm": 0.9896820783615112, + "learning_rate": 0.0001300442742859508, + "loss": 1.6006, + "step": 19120 + }, + { + "epoch": 4.0301258755990945, + "grad_norm": 1.0852347612380981, + "learning_rate": 0.00012998113137012855, + "loss": 1.6283, + "step": 19130 + }, + { + "epoch": 4.032232580186443, + "grad_norm": 1.0564210414886475, + "learning_rate": 0.00012991797531742492, + "loss": 1.636, + "step": 19140 + }, + { + "epoch": 4.034339284773792, + "grad_norm": 0.9386522769927979, + "learning_rate": 0.00012985480615551305, + "loss": 1.6024, + "step": 19150 + }, + { + "epoch": 4.036445989361142, + "grad_norm": 1.0262477397918701, + "learning_rate": 0.00012979162391207194, + "loss": 1.6168, + "step": 19160 + }, + { + "epoch": 4.038552693948491, + "grad_norm": 0.9269583225250244, + "learning_rate": 0.00012972842861478618, + "loss": 1.6511, + "step": 19170 + }, + { + "epoch": 4.0406593985358406, + "grad_norm": 1.0529104471206665, + "learning_rate": 0.00012966522029134623, + "loss": 1.6147, + "step": 19180 + }, + { + "epoch": 4.042766103123189, + "grad_norm": 1.0251187086105347, + "learning_rate": 0.00012960199896944815, + "loss": 1.5781, + "step": 19190 + }, + { + "epoch": 4.044872807710539, + "grad_norm": 1.0744245052337646, + "learning_rate": 0.00012953876467679373, + "loss": 1.6442, + "step": 19200 + }, + { + "epoch": 4.046979512297888, + "grad_norm": 0.9448406100273132, + "learning_rate": 0.00012947551744109043, + "loss": 1.6216, + "step": 19210 + }, + { + "epoch": 4.049086216885237, + "grad_norm": 0.9437319040298462, + "learning_rate": 0.00012941225729005143, + "loss": 1.6557, + "step": 19220 + }, + { + "epoch": 4.051192921472587, + "grad_norm": 1.092026710510254, + "learning_rate": 0.0001293489842513955, + "loss": 1.6463, + "step": 19230 + }, + { + "epoch": 4.053299626059935, + "grad_norm": 1.0943164825439453, + "learning_rate": 0.00012928569835284713, + "loss": 1.5869, + "step": 19240 + }, + { + "epoch": 4.055406330647285, + "grad_norm": 0.9766450524330139, + "learning_rate": 0.00012922239962213637, + "loss": 1.6009, + "step": 19250 + }, + { + "epoch": 4.057513035234634, + "grad_norm": 0.9823459982872009, + "learning_rate": 0.00012915908808699893, + "loss": 1.6072, + "step": 19260 + }, + { + "epoch": 4.059619739821984, + "grad_norm": 0.9854733347892761, + "learning_rate": 0.00012909576377517616, + "loss": 1.6373, + "step": 19270 + }, + { + "epoch": 4.061726444409333, + "grad_norm": 1.1235730648040771, + "learning_rate": 0.00012903242671441492, + "loss": 1.6886, + "step": 19280 + }, + { + "epoch": 4.063833148996682, + "grad_norm": 1.0431162118911743, + "learning_rate": 0.0001289690769324678, + "loss": 1.6286, + "step": 19290 + }, + { + "epoch": 4.065939853584031, + "grad_norm": 1.0117398500442505, + "learning_rate": 0.00012890571445709278, + "loss": 1.6423, + "step": 19300 + }, + { + "epoch": 4.06804655817138, + "grad_norm": 1.196179747581482, + "learning_rate": 0.00012884233931605358, + "loss": 1.64, + "step": 19310 + }, + { + "epoch": 4.07015326275873, + "grad_norm": 1.04131019115448, + "learning_rate": 0.00012877895153711935, + "loss": 1.6285, + "step": 19320 + }, + { + "epoch": 4.072259967346079, + "grad_norm": 0.9589956402778625, + "learning_rate": 0.00012871555114806483, + "loss": 1.6349, + "step": 19330 + }, + { + "epoch": 4.074366671933428, + "grad_norm": 0.9664011597633362, + "learning_rate": 0.00012865213817667023, + "loss": 1.585, + "step": 19340 + }, + { + "epoch": 4.076473376520777, + "grad_norm": 0.9522480964660645, + "learning_rate": 0.0001285887126507214, + "loss": 1.641, + "step": 19350 + }, + { + "epoch": 4.078580081108127, + "grad_norm": 1.0472582578659058, + "learning_rate": 0.00012852527459800953, + "loss": 1.676, + "step": 19360 + }, + { + "epoch": 4.080686785695476, + "grad_norm": 0.9799109697341919, + "learning_rate": 0.00012846182404633143, + "loss": 1.618, + "step": 19370 + }, + { + "epoch": 4.082793490282825, + "grad_norm": 0.9914186000823975, + "learning_rate": 0.00012839836102348926, + "loss": 1.6168, + "step": 19380 + }, + { + "epoch": 4.084900194870174, + "grad_norm": 0.9672318696975708, + "learning_rate": 0.0001283348855572908, + "loss": 1.6168, + "step": 19390 + }, + { + "epoch": 4.087006899457523, + "grad_norm": 1.0729410648345947, + "learning_rate": 0.00012827139767554915, + "loss": 1.6477, + "step": 19400 + }, + { + "epoch": 4.089113604044873, + "grad_norm": 1.1007764339447021, + "learning_rate": 0.00012820789740608293, + "loss": 1.5919, + "step": 19410 + }, + { + "epoch": 4.091220308632222, + "grad_norm": 0.9777454733848572, + "learning_rate": 0.0001281443847767161, + "loss": 1.5997, + "step": 19420 + }, + { + "epoch": 4.093327013219572, + "grad_norm": 1.0719486474990845, + "learning_rate": 0.00012808085981527815, + "loss": 1.6411, + "step": 19430 + }, + { + "epoch": 4.0954337178069204, + "grad_norm": 1.0390957593917847, + "learning_rate": 0.00012801732254960388, + "loss": 1.6416, + "step": 19440 + }, + { + "epoch": 4.09754042239427, + "grad_norm": 1.065137505531311, + "learning_rate": 0.00012795377300753357, + "loss": 1.7202, + "step": 19450 + }, + { + "epoch": 4.099647126981619, + "grad_norm": 1.0135070085525513, + "learning_rate": 0.00012789021121691274, + "loss": 1.6056, + "step": 19460 + }, + { + "epoch": 4.101753831568968, + "grad_norm": 0.9829651117324829, + "learning_rate": 0.00012782663720559246, + "loss": 1.5973, + "step": 19470 + }, + { + "epoch": 4.103860536156318, + "grad_norm": 1.0231988430023193, + "learning_rate": 0.00012776305100142897, + "loss": 1.6736, + "step": 19480 + }, + { + "epoch": 4.1059672407436665, + "grad_norm": 0.9512834548950195, + "learning_rate": 0.00012769945263228403, + "loss": 1.6567, + "step": 19490 + }, + { + "epoch": 4.108073945331016, + "grad_norm": 1.0609872341156006, + "learning_rate": 0.00012763584212602453, + "loss": 1.6434, + "step": 19500 + }, + { + "epoch": 4.110180649918365, + "grad_norm": 1.026944637298584, + "learning_rate": 0.0001275722195105229, + "loss": 1.6767, + "step": 19510 + }, + { + "epoch": 4.112287354505715, + "grad_norm": 1.0436118841171265, + "learning_rate": 0.00012750858481365673, + "loss": 1.6491, + "step": 19520 + }, + { + "epoch": 4.114394059093064, + "grad_norm": 1.051046371459961, + "learning_rate": 0.0001274449380633089, + "loss": 1.6575, + "step": 19530 + }, + { + "epoch": 4.1165007636804125, + "grad_norm": 1.0463584661483765, + "learning_rate": 0.00012738127928736765, + "loss": 1.6379, + "step": 19540 + }, + { + "epoch": 4.118607468267762, + "grad_norm": 1.15203058719635, + "learning_rate": 0.00012731760851372644, + "loss": 1.6481, + "step": 19550 + }, + { + "epoch": 4.120714172855111, + "grad_norm": 1.0916799306869507, + "learning_rate": 0.00012725392577028402, + "loss": 1.6209, + "step": 19560 + }, + { + "epoch": 4.122820877442461, + "grad_norm": 1.14989173412323, + "learning_rate": 0.00012719023108494435, + "loss": 1.6247, + "step": 19570 + }, + { + "epoch": 4.12492758202981, + "grad_norm": 0.949806272983551, + "learning_rate": 0.00012712652448561656, + "loss": 1.6374, + "step": 19580 + }, + { + "epoch": 4.127034286617159, + "grad_norm": 1.0403326749801636, + "learning_rate": 0.00012706280600021522, + "loss": 1.6005, + "step": 19590 + }, + { + "epoch": 4.129140991204508, + "grad_norm": 1.0589529275894165, + "learning_rate": 0.00012699907565665982, + "loss": 1.6739, + "step": 19600 + }, + { + "epoch": 4.131247695791858, + "grad_norm": 1.0490063428878784, + "learning_rate": 0.0001269353334828753, + "loss": 1.6425, + "step": 19610 + }, + { + "epoch": 4.133354400379207, + "grad_norm": 1.061720371246338, + "learning_rate": 0.0001268715795067916, + "loss": 1.6322, + "step": 19620 + }, + { + "epoch": 4.135461104966556, + "grad_norm": 0.9578418731689453, + "learning_rate": 0.0001268078137563439, + "loss": 1.6109, + "step": 19630 + }, + { + "epoch": 4.1375678095539055, + "grad_norm": 0.9411417245864868, + "learning_rate": 0.0001267440362594726, + "loss": 1.5956, + "step": 19640 + }, + { + "epoch": 4.139674514141254, + "grad_norm": 1.0242791175842285, + "learning_rate": 0.00012668024704412317, + "loss": 1.6445, + "step": 19650 + }, + { + "epoch": 4.141781218728604, + "grad_norm": 1.0523643493652344, + "learning_rate": 0.0001266164461382462, + "loss": 1.5977, + "step": 19660 + }, + { + "epoch": 4.143887923315953, + "grad_norm": 0.977243959903717, + "learning_rate": 0.00012655263356979747, + "loss": 1.6381, + "step": 19670 + }, + { + "epoch": 4.145994627903303, + "grad_norm": 1.057154893875122, + "learning_rate": 0.00012648880936673787, + "loss": 1.6399, + "step": 19680 + }, + { + "epoch": 4.1481013324906515, + "grad_norm": 1.0574363470077515, + "learning_rate": 0.00012642497355703326, + "loss": 1.6326, + "step": 19690 + }, + { + "epoch": 4.150208037078, + "grad_norm": 1.0001354217529297, + "learning_rate": 0.00012636112616865475, + "loss": 1.5812, + "step": 19700 + }, + { + "epoch": 4.15231474166535, + "grad_norm": 1.0256274938583374, + "learning_rate": 0.00012629726722957846, + "loss": 1.6748, + "step": 19710 + }, + { + "epoch": 4.154421446252699, + "grad_norm": 0.9991124272346497, + "learning_rate": 0.00012623339676778557, + "loss": 1.6085, + "step": 19720 + }, + { + "epoch": 4.156528150840049, + "grad_norm": 0.9248740673065186, + "learning_rate": 0.00012616951481126223, + "loss": 1.5815, + "step": 19730 + }, + { + "epoch": 4.1586348554273975, + "grad_norm": 1.117120385169983, + "learning_rate": 0.00012610562138799978, + "loss": 1.6614, + "step": 19740 + }, + { + "epoch": 4.160741560014747, + "grad_norm": 1.041090488433838, + "learning_rate": 0.00012604171652599448, + "loss": 1.606, + "step": 19750 + }, + { + "epoch": 4.162848264602096, + "grad_norm": 1.0659749507904053, + "learning_rate": 0.00012597780025324764, + "loss": 1.6511, + "step": 19760 + }, + { + "epoch": 4.164954969189445, + "grad_norm": 1.1362602710723877, + "learning_rate": 0.00012591387259776551, + "loss": 1.5923, + "step": 19770 + }, + { + "epoch": 4.167061673776795, + "grad_norm": 0.9490585327148438, + "learning_rate": 0.00012584993358755945, + "loss": 1.6317, + "step": 19780 + }, + { + "epoch": 4.169168378364144, + "grad_norm": 0.9721894860267639, + "learning_rate": 0.00012578598325064565, + "loss": 1.6217, + "step": 19790 + }, + { + "epoch": 4.171275082951493, + "grad_norm": 1.014675498008728, + "learning_rate": 0.00012572202161504543, + "loss": 1.6416, + "step": 19800 + }, + { + "epoch": 4.173381787538842, + "grad_norm": 0.959456205368042, + "learning_rate": 0.00012565804870878484, + "loss": 1.6154, + "step": 19810 + }, + { + "epoch": 4.175488492126192, + "grad_norm": 1.0517370700836182, + "learning_rate": 0.00012559406455989506, + "loss": 1.6004, + "step": 19820 + }, + { + "epoch": 4.177595196713541, + "grad_norm": 1.101207971572876, + "learning_rate": 0.00012553006919641214, + "loss": 1.6393, + "step": 19830 + }, + { + "epoch": 4.1797019013008905, + "grad_norm": 1.0225372314453125, + "learning_rate": 0.00012546606264637699, + "loss": 1.633, + "step": 19840 + }, + { + "epoch": 4.181808605888239, + "grad_norm": 0.9832279682159424, + "learning_rate": 0.00012540204493783553, + "loss": 1.6122, + "step": 19850 + }, + { + "epoch": 4.183915310475588, + "grad_norm": 1.0497034788131714, + "learning_rate": 0.00012533801609883842, + "loss": 1.6318, + "step": 19860 + }, + { + "epoch": 4.186022015062938, + "grad_norm": 0.9535748958587646, + "learning_rate": 0.00012527397615744138, + "loss": 1.6368, + "step": 19870 + }, + { + "epoch": 4.188128719650287, + "grad_norm": 1.0655570030212402, + "learning_rate": 0.0001252099251417048, + "loss": 1.6132, + "step": 19880 + }, + { + "epoch": 4.1902354242376365, + "grad_norm": 1.0989586114883423, + "learning_rate": 0.0001251458630796941, + "loss": 1.6372, + "step": 19890 + }, + { + "epoch": 4.192342128824985, + "grad_norm": 1.0104575157165527, + "learning_rate": 0.00012508178999947936, + "loss": 1.6516, + "step": 19900 + }, + { + "epoch": 4.194448833412335, + "grad_norm": 1.0405486822128296, + "learning_rate": 0.00012501770592913568, + "loss": 1.5935, + "step": 19910 + }, + { + "epoch": 4.196555537999684, + "grad_norm": 1.1122767925262451, + "learning_rate": 0.00012495361089674285, + "loss": 1.6198, + "step": 19920 + }, + { + "epoch": 4.198662242587034, + "grad_norm": 1.0726100206375122, + "learning_rate": 0.00012488950493038552, + "loss": 1.6371, + "step": 19930 + }, + { + "epoch": 4.2007689471743825, + "grad_norm": 1.0831425189971924, + "learning_rate": 0.000124825388058153, + "loss": 1.6321, + "step": 19940 + }, + { + "epoch": 4.202875651761731, + "grad_norm": 1.1081782579421997, + "learning_rate": 0.00012476126030813963, + "loss": 1.5934, + "step": 19950 + }, + { + "epoch": 4.204982356349081, + "grad_norm": 0.9888300895690918, + "learning_rate": 0.0001246971217084443, + "loss": 1.6427, + "step": 19960 + }, + { + "epoch": 4.20708906093643, + "grad_norm": 1.0104364156723022, + "learning_rate": 0.00012463297228717073, + "loss": 1.601, + "step": 19970 + }, + { + "epoch": 4.20919576552378, + "grad_norm": 1.0632632970809937, + "learning_rate": 0.00012456881207242732, + "loss": 1.5513, + "step": 19980 + }, + { + "epoch": 4.211302470111129, + "grad_norm": 1.0740721225738525, + "learning_rate": 0.0001245046410923274, + "loss": 1.69, + "step": 19990 + }, + { + "epoch": 4.213409174698478, + "grad_norm": 1.067266583442688, + "learning_rate": 0.00012444045937498873, + "loss": 1.69, + "step": 20000 + }, + { + "epoch": 4.215515879285827, + "grad_norm": 1.0238685607910156, + "learning_rate": 0.000124376266948534, + "loss": 1.5979, + "step": 20010 + }, + { + "epoch": 4.217622583873176, + "grad_norm": 1.1018869876861572, + "learning_rate": 0.00012431206384109044, + "loss": 1.6022, + "step": 20020 + }, + { + "epoch": 4.219729288460526, + "grad_norm": 1.0108250379562378, + "learning_rate": 0.00012424785008079015, + "loss": 1.6489, + "step": 20030 + }, + { + "epoch": 4.221835993047875, + "grad_norm": 0.9962803721427917, + "learning_rate": 0.00012418362569576965, + "loss": 1.6018, + "step": 20040 + }, + { + "epoch": 4.223942697635224, + "grad_norm": 0.9581331610679626, + "learning_rate": 0.00012411939071417034, + "loss": 1.6362, + "step": 20050 + }, + { + "epoch": 4.226049402222573, + "grad_norm": 1.0649348497390747, + "learning_rate": 0.00012405514516413807, + "loss": 1.6541, + "step": 20060 + }, + { + "epoch": 4.228156106809923, + "grad_norm": 1.1011024713516235, + "learning_rate": 0.0001239908890738235, + "loss": 1.6053, + "step": 20070 + }, + { + "epoch": 4.230262811397272, + "grad_norm": 1.0573521852493286, + "learning_rate": 0.0001239266224713818, + "loss": 1.6766, + "step": 20080 + }, + { + "epoch": 4.232369515984621, + "grad_norm": 1.1805315017700195, + "learning_rate": 0.00012386234538497282, + "loss": 1.6301, + "step": 20090 + }, + { + "epoch": 4.23447622057197, + "grad_norm": 0.9401804208755493, + "learning_rate": 0.00012379805784276082, + "loss": 1.6689, + "step": 20100 + }, + { + "epoch": 4.236582925159319, + "grad_norm": 0.9770873188972473, + "learning_rate": 0.0001237337598729149, + "loss": 1.6291, + "step": 20110 + }, + { + "epoch": 4.238689629746669, + "grad_norm": 1.0193746089935303, + "learning_rate": 0.00012366945150360861, + "loss": 1.6169, + "step": 20120 + }, + { + "epoch": 4.240796334334018, + "grad_norm": 1.0372521877288818, + "learning_rate": 0.00012360513276301997, + "loss": 1.6679, + "step": 20130 + }, + { + "epoch": 4.2429030389213676, + "grad_norm": 1.0554676055908203, + "learning_rate": 0.00012354080367933166, + "loss": 1.6073, + "step": 20140 + }, + { + "epoch": 4.245009743508716, + "grad_norm": 1.1003928184509277, + "learning_rate": 0.0001234764642807309, + "loss": 1.5924, + "step": 20150 + }, + { + "epoch": 4.247116448096066, + "grad_norm": 0.9821735620498657, + "learning_rate": 0.0001234121145954094, + "loss": 1.6366, + "step": 20160 + }, + { + "epoch": 4.249223152683415, + "grad_norm": 1.034835934638977, + "learning_rate": 0.0001233477546515633, + "loss": 1.6446, + "step": 20170 + }, + { + "epoch": 4.251329857270764, + "grad_norm": 1.0512527227401733, + "learning_rate": 0.00012328338447739333, + "loss": 1.6344, + "step": 20180 + }, + { + "epoch": 4.253436561858114, + "grad_norm": 0.9722573161125183, + "learning_rate": 0.00012321900410110464, + "loss": 1.5923, + "step": 20190 + }, + { + "epoch": 4.255543266445462, + "grad_norm": 1.1065928936004639, + "learning_rate": 0.000123154613550907, + "loss": 1.6165, + "step": 20200 + }, + { + "epoch": 4.257649971032812, + "grad_norm": 0.9826769232749939, + "learning_rate": 0.0001230902128550144, + "loss": 1.6689, + "step": 20210 + }, + { + "epoch": 4.259756675620161, + "grad_norm": 1.2589317560195923, + "learning_rate": 0.00012302580204164541, + "loss": 1.6341, + "step": 20220 + }, + { + "epoch": 4.261863380207511, + "grad_norm": 0.9650371670722961, + "learning_rate": 0.00012296138113902308, + "loss": 1.6348, + "step": 20230 + }, + { + "epoch": 4.26397008479486, + "grad_norm": 1.121001124382019, + "learning_rate": 0.00012289695017537485, + "loss": 1.6525, + "step": 20240 + }, + { + "epoch": 4.2660767893822085, + "grad_norm": 1.0059106349945068, + "learning_rate": 0.00012283250917893244, + "loss": 1.6566, + "step": 20250 + }, + { + "epoch": 4.268183493969558, + "grad_norm": 1.1646438837051392, + "learning_rate": 0.00012276805817793208, + "loss": 1.6315, + "step": 20260 + }, + { + "epoch": 4.270290198556907, + "grad_norm": 1.138887882232666, + "learning_rate": 0.00012270359720061445, + "loss": 1.6118, + "step": 20270 + }, + { + "epoch": 4.272396903144257, + "grad_norm": 1.1170200109481812, + "learning_rate": 0.0001226391262752245, + "loss": 1.6587, + "step": 20280 + }, + { + "epoch": 4.274503607731606, + "grad_norm": 1.041637897491455, + "learning_rate": 0.00012257464543001146, + "loss": 1.6728, + "step": 20290 + }, + { + "epoch": 4.276610312318955, + "grad_norm": 1.0330857038497925, + "learning_rate": 0.00012251015469322916, + "loss": 1.6058, + "step": 20300 + }, + { + "epoch": 4.278717016906304, + "grad_norm": 0.9935159087181091, + "learning_rate": 0.00012244565409313547, + "loss": 1.6765, + "step": 20310 + }, + { + "epoch": 4.280823721493654, + "grad_norm": 1.0498121976852417, + "learning_rate": 0.00012238114365799286, + "loss": 1.7234, + "step": 20320 + }, + { + "epoch": 4.282930426081003, + "grad_norm": 1.0286202430725098, + "learning_rate": 0.00012231662341606785, + "loss": 1.6389, + "step": 20330 + }, + { + "epoch": 4.285037130668352, + "grad_norm": 1.294057011604309, + "learning_rate": 0.00012225209339563145, + "loss": 1.6428, + "step": 20340 + }, + { + "epoch": 4.287143835255701, + "grad_norm": 1.061030387878418, + "learning_rate": 0.00012218755362495887, + "loss": 1.6834, + "step": 20350 + }, + { + "epoch": 4.28925053984305, + "grad_norm": 1.0638394355773926, + "learning_rate": 0.00012212300413232962, + "loss": 1.6978, + "step": 20360 + }, + { + "epoch": 4.2913572444304, + "grad_norm": 1.0417766571044922, + "learning_rate": 0.0001220584449460274, + "loss": 1.6336, + "step": 20370 + }, + { + "epoch": 4.293463949017749, + "grad_norm": 0.9782829880714417, + "learning_rate": 0.0001219938760943403, + "loss": 1.6014, + "step": 20380 + }, + { + "epoch": 4.295570653605099, + "grad_norm": 1.0527657270431519, + "learning_rate": 0.00012192929760556049, + "loss": 1.6763, + "step": 20390 + }, + { + "epoch": 4.2976773581924474, + "grad_norm": 1.0444494485855103, + "learning_rate": 0.00012186470950798445, + "loss": 1.6582, + "step": 20400 + }, + { + "epoch": 4.299784062779796, + "grad_norm": 1.0852185487747192, + "learning_rate": 0.00012180011182991289, + "loss": 1.6798, + "step": 20410 + }, + { + "epoch": 4.301890767367146, + "grad_norm": 1.030380129814148, + "learning_rate": 0.00012173550459965062, + "loss": 1.6249, + "step": 20420 + }, + { + "epoch": 4.303997471954495, + "grad_norm": 0.9517531394958496, + "learning_rate": 0.00012167088784550673, + "loss": 1.6335, + "step": 20430 + }, + { + "epoch": 4.306104176541845, + "grad_norm": 1.0121312141418457, + "learning_rate": 0.00012160626159579447, + "loss": 1.6665, + "step": 20440 + }, + { + "epoch": 4.3082108811291935, + "grad_norm": 0.9951829314231873, + "learning_rate": 0.0001215416258788312, + "loss": 1.6113, + "step": 20450 + }, + { + "epoch": 4.310317585716543, + "grad_norm": 1.082280158996582, + "learning_rate": 0.00012147698072293842, + "loss": 1.6288, + "step": 20460 + }, + { + "epoch": 4.312424290303892, + "grad_norm": 0.967485785484314, + "learning_rate": 0.0001214123261564419, + "loss": 1.5958, + "step": 20470 + }, + { + "epoch": 4.314530994891241, + "grad_norm": 1.0343081951141357, + "learning_rate": 0.00012134766220767135, + "loss": 1.6265, + "step": 20480 + }, + { + "epoch": 4.316637699478591, + "grad_norm": 1.054532527923584, + "learning_rate": 0.00012128298890496072, + "loss": 1.6081, + "step": 20490 + }, + { + "epoch": 4.3187444040659395, + "grad_norm": 0.9487448334693909, + "learning_rate": 0.00012121830627664801, + "loss": 1.6646, + "step": 20500 + }, + { + "epoch": 4.320851108653289, + "grad_norm": 1.0977160930633545, + "learning_rate": 0.00012115361435107531, + "loss": 1.6541, + "step": 20510 + }, + { + "epoch": 4.322957813240638, + "grad_norm": 1.0766124725341797, + "learning_rate": 0.00012108891315658879, + "loss": 1.6211, + "step": 20520 + }, + { + "epoch": 4.325064517827988, + "grad_norm": 1.025858998298645, + "learning_rate": 0.00012102420272153869, + "loss": 1.6154, + "step": 20530 + }, + { + "epoch": 4.327171222415337, + "grad_norm": 0.9398080110549927, + "learning_rate": 0.00012095948307427925, + "loss": 1.6605, + "step": 20540 + }, + { + "epoch": 4.329277927002686, + "grad_norm": 1.111257791519165, + "learning_rate": 0.00012089475424316883, + "loss": 1.6486, + "step": 20550 + }, + { + "epoch": 4.331384631590035, + "grad_norm": 0.9591060876846313, + "learning_rate": 0.00012083001625656973, + "loss": 1.6059, + "step": 20560 + }, + { + "epoch": 4.333491336177384, + "grad_norm": 1.1178287267684937, + "learning_rate": 0.00012076526914284833, + "loss": 1.6159, + "step": 20570 + }, + { + "epoch": 4.335598040764734, + "grad_norm": 1.0105968713760376, + "learning_rate": 0.00012070051293037492, + "loss": 1.68, + "step": 20580 + }, + { + "epoch": 4.337704745352083, + "grad_norm": 1.0441944599151611, + "learning_rate": 0.00012063574764752394, + "loss": 1.6522, + "step": 20590 + }, + { + "epoch": 4.3398114499394325, + "grad_norm": 1.0669680833816528, + "learning_rate": 0.00012057097332267359, + "loss": 1.6637, + "step": 20600 + }, + { + "epoch": 4.341918154526781, + "grad_norm": 1.169826626777649, + "learning_rate": 0.00012050618998420624, + "loss": 1.6253, + "step": 20610 + }, + { + "epoch": 4.344024859114131, + "grad_norm": 1.0377858877182007, + "learning_rate": 0.00012044139766050802, + "loss": 1.593, + "step": 20620 + }, + { + "epoch": 4.34613156370148, + "grad_norm": 1.0214862823486328, + "learning_rate": 0.00012037659637996916, + "loss": 1.6451, + "step": 20630 + }, + { + "epoch": 4.34823826828883, + "grad_norm": 1.0646579265594482, + "learning_rate": 0.00012031178617098371, + "loss": 1.6613, + "step": 20640 + }, + { + "epoch": 4.3503449728761785, + "grad_norm": 1.0184786319732666, + "learning_rate": 0.00012024696706194967, + "loss": 1.6069, + "step": 20650 + }, + { + "epoch": 4.352451677463527, + "grad_norm": 1.0893901586532593, + "learning_rate": 0.00012018213908126889, + "loss": 1.6436, + "step": 20660 + }, + { + "epoch": 4.354558382050877, + "grad_norm": 0.9943939447402954, + "learning_rate": 0.00012011730225734723, + "loss": 1.6233, + "step": 20670 + }, + { + "epoch": 4.356665086638226, + "grad_norm": 1.5163390636444092, + "learning_rate": 0.00012005245661859434, + "loss": 1.6874, + "step": 20680 + }, + { + "epoch": 4.358771791225576, + "grad_norm": 1.077407717704773, + "learning_rate": 0.00011998760219342368, + "loss": 1.6384, + "step": 20690 + }, + { + "epoch": 4.3608784958129245, + "grad_norm": 1.0055080652236938, + "learning_rate": 0.00011992273901025269, + "loss": 1.6354, + "step": 20700 + }, + { + "epoch": 4.362985200400274, + "grad_norm": 1.019344449043274, + "learning_rate": 0.00011985786709750251, + "loss": 1.6234, + "step": 20710 + }, + { + "epoch": 4.365091904987623, + "grad_norm": 1.0513646602630615, + "learning_rate": 0.00011979298648359823, + "loss": 1.6259, + "step": 20720 + }, + { + "epoch": 4.367198609574972, + "grad_norm": 1.1279222965240479, + "learning_rate": 0.00011972809719696864, + "loss": 1.6621, + "step": 20730 + }, + { + "epoch": 4.369305314162322, + "grad_norm": 1.0146011114120483, + "learning_rate": 0.00011966319926604641, + "loss": 1.6617, + "step": 20740 + }, + { + "epoch": 4.371412018749671, + "grad_norm": 1.0017142295837402, + "learning_rate": 0.00011959829271926799, + "loss": 1.6856, + "step": 20750 + }, + { + "epoch": 4.37351872333702, + "grad_norm": 0.9613775014877319, + "learning_rate": 0.0001195333775850736, + "loss": 1.6276, + "step": 20760 + }, + { + "epoch": 4.375625427924369, + "grad_norm": 1.031233549118042, + "learning_rate": 0.00011946845389190715, + "loss": 1.6607, + "step": 20770 + }, + { + "epoch": 4.377732132511719, + "grad_norm": 0.905860960483551, + "learning_rate": 0.0001194035216682164, + "loss": 1.7105, + "step": 20780 + }, + { + "epoch": 4.379838837099068, + "grad_norm": 1.1338081359863281, + "learning_rate": 0.00011933858094245281, + "loss": 1.6139, + "step": 20790 + }, + { + "epoch": 4.381945541686417, + "grad_norm": 1.0127646923065186, + "learning_rate": 0.00011927363174307156, + "loss": 1.672, + "step": 20800 + }, + { + "epoch": 4.384052246273766, + "grad_norm": 0.9693343043327332, + "learning_rate": 0.00011920867409853154, + "loss": 1.5391, + "step": 20810 + }, + { + "epoch": 4.386158950861115, + "grad_norm": 1.008926510810852, + "learning_rate": 0.00011914370803729533, + "loss": 1.637, + "step": 20820 + }, + { + "epoch": 4.388265655448465, + "grad_norm": 1.1421765089035034, + "learning_rate": 0.00011907873358782926, + "loss": 1.6087, + "step": 20830 + }, + { + "epoch": 4.390372360035814, + "grad_norm": 1.052461862564087, + "learning_rate": 0.00011901375077860329, + "loss": 1.6072, + "step": 20840 + }, + { + "epoch": 4.3924790646231635, + "grad_norm": 1.0214227437973022, + "learning_rate": 0.00011894875963809098, + "loss": 1.6071, + "step": 20850 + }, + { + "epoch": 4.394585769210512, + "grad_norm": 1.00290846824646, + "learning_rate": 0.00011888376019476966, + "loss": 1.5995, + "step": 20860 + }, + { + "epoch": 4.396692473797862, + "grad_norm": 1.0803271532058716, + "learning_rate": 0.00011881875247712025, + "loss": 1.6456, + "step": 20870 + }, + { + "epoch": 4.398799178385211, + "grad_norm": 1.1548503637313843, + "learning_rate": 0.00011875373651362727, + "loss": 1.6417, + "step": 20880 + }, + { + "epoch": 4.40090588297256, + "grad_norm": 1.0493422746658325, + "learning_rate": 0.00011868871233277884, + "loss": 1.6475, + "step": 20890 + }, + { + "epoch": 4.4030125875599095, + "grad_norm": 0.9755949974060059, + "learning_rate": 0.00011862367996306673, + "loss": 1.6587, + "step": 20900 + }, + { + "epoch": 4.405119292147258, + "grad_norm": 1.0143030881881714, + "learning_rate": 0.00011855863943298631, + "loss": 1.6612, + "step": 20910 + }, + { + "epoch": 4.407225996734608, + "grad_norm": 1.1264982223510742, + "learning_rate": 0.0001184935907710365, + "loss": 1.649, + "step": 20920 + }, + { + "epoch": 4.409332701321957, + "grad_norm": 1.0828158855438232, + "learning_rate": 0.00011842853400571971, + "loss": 1.5933, + "step": 20930 + }, + { + "epoch": 4.411439405909307, + "grad_norm": 1.0389914512634277, + "learning_rate": 0.00011836346916554205, + "loss": 1.6446, + "step": 20940 + }, + { + "epoch": 4.413546110496656, + "grad_norm": 1.0088750123977661, + "learning_rate": 0.00011829839627901302, + "loss": 1.6015, + "step": 20950 + }, + { + "epoch": 4.415652815084005, + "grad_norm": 1.0506969690322876, + "learning_rate": 0.00011823331537464574, + "loss": 1.649, + "step": 20960 + }, + { + "epoch": 4.417759519671354, + "grad_norm": 1.0698243379592896, + "learning_rate": 0.00011816822648095687, + "loss": 1.627, + "step": 20970 + }, + { + "epoch": 4.419866224258703, + "grad_norm": 1.0903515815734863, + "learning_rate": 0.00011810312962646644, + "loss": 1.6891, + "step": 20980 + }, + { + "epoch": 4.421972928846053, + "grad_norm": 1.1337631940841675, + "learning_rate": 0.00011803802483969806, + "loss": 1.6733, + "step": 20990 + }, + { + "epoch": 4.424079633433402, + "grad_norm": 1.6732696294784546, + "learning_rate": 0.00011797291214917881, + "loss": 1.6171, + "step": 21000 + }, + { + "epoch": 4.426186338020751, + "grad_norm": 0.9726201891899109, + "learning_rate": 0.00011790779158343925, + "loss": 1.6015, + "step": 21010 + }, + { + "epoch": 4.4282930426081, + "grad_norm": 1.0177321434020996, + "learning_rate": 0.00011784266317101333, + "loss": 1.6375, + "step": 21020 + }, + { + "epoch": 4.43039974719545, + "grad_norm": 1.0415085554122925, + "learning_rate": 0.00011777752694043849, + "loss": 1.656, + "step": 21030 + }, + { + "epoch": 4.432506451782799, + "grad_norm": 1.0331918001174927, + "learning_rate": 0.00011771238292025558, + "loss": 1.6425, + "step": 21040 + }, + { + "epoch": 4.434613156370148, + "grad_norm": 0.9632758498191833, + "learning_rate": 0.00011764723113900886, + "loss": 1.6347, + "step": 21050 + }, + { + "epoch": 4.436719860957497, + "grad_norm": 1.1020656824111938, + "learning_rate": 0.00011758207162524598, + "loss": 1.657, + "step": 21060 + }, + { + "epoch": 4.438826565544846, + "grad_norm": 1.0980470180511475, + "learning_rate": 0.000117516904407518, + "loss": 1.674, + "step": 21070 + }, + { + "epoch": 4.440933270132196, + "grad_norm": 1.0126473903656006, + "learning_rate": 0.00011745172951437932, + "loss": 1.6464, + "step": 21080 + }, + { + "epoch": 4.443039974719545, + "grad_norm": 0.9396146535873413, + "learning_rate": 0.00011738654697438782, + "loss": 1.6033, + "step": 21090 + }, + { + "epoch": 4.4451466793068946, + "grad_norm": 1.0903655290603638, + "learning_rate": 0.00011732135681610452, + "loss": 1.6508, + "step": 21100 + }, + { + "epoch": 4.447253383894243, + "grad_norm": 1.0132126808166504, + "learning_rate": 0.00011725615906809397, + "loss": 1.6235, + "step": 21110 + }, + { + "epoch": 4.449360088481592, + "grad_norm": 1.0690926313400269, + "learning_rate": 0.00011719095375892396, + "loss": 1.6116, + "step": 21120 + }, + { + "epoch": 4.451466793068942, + "grad_norm": 1.099784255027771, + "learning_rate": 0.00011712574091716563, + "loss": 1.6192, + "step": 21130 + }, + { + "epoch": 4.453573497656291, + "grad_norm": 1.0641016960144043, + "learning_rate": 0.00011706052057139335, + "loss": 1.6692, + "step": 21140 + }, + { + "epoch": 4.455680202243641, + "grad_norm": 0.9757359027862549, + "learning_rate": 0.00011699529275018484, + "loss": 1.6685, + "step": 21150 + }, + { + "epoch": 4.457786906830989, + "grad_norm": 1.322670817375183, + "learning_rate": 0.0001169300574821211, + "loss": 1.6492, + "step": 21160 + }, + { + "epoch": 4.459893611418339, + "grad_norm": 1.0116708278656006, + "learning_rate": 0.0001168648147957864, + "loss": 1.619, + "step": 21170 + }, + { + "epoch": 4.462000316005688, + "grad_norm": 1.0692319869995117, + "learning_rate": 0.00011679956471976814, + "loss": 1.6236, + "step": 21180 + }, + { + "epoch": 4.464107020593037, + "grad_norm": 0.9994955658912659, + "learning_rate": 0.00011673430728265713, + "loss": 1.6552, + "step": 21190 + }, + { + "epoch": 4.466213725180387, + "grad_norm": 1.0054254531860352, + "learning_rate": 0.00011666904251304731, + "loss": 1.6309, + "step": 21200 + }, + { + "epoch": 4.4683204297677355, + "grad_norm": 1.0914092063903809, + "learning_rate": 0.00011660377043953588, + "loss": 1.6467, + "step": 21210 + }, + { + "epoch": 4.470427134355085, + "grad_norm": 0.9891067147254944, + "learning_rate": 0.00011653849109072314, + "loss": 1.6362, + "step": 21220 + }, + { + "epoch": 4.472533838942434, + "grad_norm": 1.1097562313079834, + "learning_rate": 0.00011647320449521268, + "loss": 1.6397, + "step": 21230 + }, + { + "epoch": 4.474640543529784, + "grad_norm": 1.0569071769714355, + "learning_rate": 0.0001164079106816113, + "loss": 1.6645, + "step": 21240 + }, + { + "epoch": 4.476747248117133, + "grad_norm": 1.0638412237167358, + "learning_rate": 0.00011634260967852882, + "loss": 1.6816, + "step": 21250 + }, + { + "epoch": 4.478853952704482, + "grad_norm": 1.0064091682434082, + "learning_rate": 0.00011627730151457829, + "loss": 1.6874, + "step": 21260 + }, + { + "epoch": 4.480960657291831, + "grad_norm": 1.0138236284255981, + "learning_rate": 0.00011621198621837593, + "loss": 1.6736, + "step": 21270 + }, + { + "epoch": 4.48306736187918, + "grad_norm": 1.0266871452331543, + "learning_rate": 0.00011614666381854107, + "loss": 1.638, + "step": 21280 + }, + { + "epoch": 4.48517406646653, + "grad_norm": 1.0543068647384644, + "learning_rate": 0.00011608133434369604, + "loss": 1.6258, + "step": 21290 + }, + { + "epoch": 4.487280771053879, + "grad_norm": 1.0614491701126099, + "learning_rate": 0.00011601599782246646, + "loss": 1.6601, + "step": 21300 + }, + { + "epoch": 4.489387475641228, + "grad_norm": 1.1936269998550415, + "learning_rate": 0.00011595065428348087, + "loss": 1.6566, + "step": 21310 + }, + { + "epoch": 4.491494180228577, + "grad_norm": 1.0331757068634033, + "learning_rate": 0.00011588530375537101, + "loss": 1.6704, + "step": 21320 + }, + { + "epoch": 4.493600884815927, + "grad_norm": 1.0366097688674927, + "learning_rate": 0.0001158199462667716, + "loss": 1.7132, + "step": 21330 + }, + { + "epoch": 4.495707589403276, + "grad_norm": 0.966823935508728, + "learning_rate": 0.00011575458184632044, + "loss": 1.5783, + "step": 21340 + }, + { + "epoch": 4.497814293990626, + "grad_norm": 1.0771065950393677, + "learning_rate": 0.00011568921052265836, + "loss": 1.6208, + "step": 21350 + }, + { + "epoch": 4.4999209985779745, + "grad_norm": 1.0702791213989258, + "learning_rate": 0.00011562383232442926, + "loss": 1.7674, + "step": 21360 + }, + { + "epoch": 4.502027703165323, + "grad_norm": 1.0116688013076782, + "learning_rate": 0.00011555844728027993, + "loss": 1.6573, + "step": 21370 + }, + { + "epoch": 4.504134407752673, + "grad_norm": 0.97417813539505, + "learning_rate": 0.00011549305541886032, + "loss": 1.6242, + "step": 21380 + }, + { + "epoch": 4.506241112340022, + "grad_norm": 1.091288447380066, + "learning_rate": 0.00011542765676882325, + "loss": 1.6407, + "step": 21390 + }, + { + "epoch": 4.508347816927372, + "grad_norm": 1.0396335124969482, + "learning_rate": 0.0001153622513588246, + "loss": 1.6791, + "step": 21400 + }, + { + "epoch": 4.5104545215147205, + "grad_norm": 1.0363614559173584, + "learning_rate": 0.0001152968392175231, + "loss": 1.6589, + "step": 21410 + }, + { + "epoch": 4.51256122610207, + "grad_norm": 0.9808732271194458, + "learning_rate": 0.0001152314203735805, + "loss": 1.6257, + "step": 21420 + }, + { + "epoch": 4.514667930689419, + "grad_norm": 1.0970127582550049, + "learning_rate": 0.00011516599485566153, + "loss": 1.6904, + "step": 21430 + }, + { + "epoch": 4.516774635276768, + "grad_norm": 1.1880615949630737, + "learning_rate": 0.00011510056269243379, + "loss": 1.6835, + "step": 21440 + }, + { + "epoch": 4.518881339864118, + "grad_norm": 1.0334802865982056, + "learning_rate": 0.00011503512391256776, + "loss": 1.6452, + "step": 21450 + }, + { + "epoch": 4.5209880444514665, + "grad_norm": 1.0974247455596924, + "learning_rate": 0.00011496967854473688, + "loss": 1.6913, + "step": 21460 + }, + { + "epoch": 4.523094749038816, + "grad_norm": 1.1479419469833374, + "learning_rate": 0.00011490422661761744, + "loss": 1.6741, + "step": 21470 + }, + { + "epoch": 4.525201453626165, + "grad_norm": 1.1152433156967163, + "learning_rate": 0.00011483876815988867, + "loss": 1.6518, + "step": 21480 + }, + { + "epoch": 4.527308158213515, + "grad_norm": 1.0902926921844482, + "learning_rate": 0.00011477330320023255, + "loss": 1.7099, + "step": 21490 + }, + { + "epoch": 4.529414862800864, + "grad_norm": 1.1328729391098022, + "learning_rate": 0.00011470783176733395, + "loss": 1.6352, + "step": 21500 + }, + { + "epoch": 4.5315215673882125, + "grad_norm": 1.0286601781845093, + "learning_rate": 0.00011464235388988067, + "loss": 1.7122, + "step": 21510 + }, + { + "epoch": 4.533628271975562, + "grad_norm": 1.0103448629379272, + "learning_rate": 0.00011457686959656322, + "loss": 1.6624, + "step": 21520 + }, + { + "epoch": 4.535734976562911, + "grad_norm": 0.9834364056587219, + "learning_rate": 0.00011451137891607495, + "loss": 1.6217, + "step": 21530 + }, + { + "epoch": 4.537841681150261, + "grad_norm": 1.0024162530899048, + "learning_rate": 0.00011444588187711205, + "loss": 1.6639, + "step": 21540 + }, + { + "epoch": 4.53994838573761, + "grad_norm": 1.019455909729004, + "learning_rate": 0.00011438037850837342, + "loss": 1.6285, + "step": 21550 + }, + { + "epoch": 4.5420550903249595, + "grad_norm": 1.0064702033996582, + "learning_rate": 0.00011431486883856082, + "loss": 1.6412, + "step": 21560 + }, + { + "epoch": 4.544161794912308, + "grad_norm": 1.1464800834655762, + "learning_rate": 0.0001142493528963787, + "loss": 1.6867, + "step": 21570 + }, + { + "epoch": 4.546268499499657, + "grad_norm": 0.978492021560669, + "learning_rate": 0.00011418383071053431, + "loss": 1.6104, + "step": 21580 + }, + { + "epoch": 4.548375204087007, + "grad_norm": 1.1636004447937012, + "learning_rate": 0.0001141183023097376, + "loss": 1.6889, + "step": 21590 + }, + { + "epoch": 4.550481908674356, + "grad_norm": 1.0960801839828491, + "learning_rate": 0.00011405276772270126, + "loss": 1.6693, + "step": 21600 + }, + { + "epoch": 4.5525886132617055, + "grad_norm": 1.0665209293365479, + "learning_rate": 0.0001139872269781407, + "loss": 1.6581, + "step": 21610 + }, + { + "epoch": 4.554695317849054, + "grad_norm": 1.0270514488220215, + "learning_rate": 0.00011392168010477398, + "loss": 1.6687, + "step": 21620 + }, + { + "epoch": 4.556802022436404, + "grad_norm": 1.0477863550186157, + "learning_rate": 0.0001138561271313219, + "loss": 1.6623, + "step": 21630 + }, + { + "epoch": 4.558908727023753, + "grad_norm": 1.1489893198013306, + "learning_rate": 0.00011379056808650794, + "loss": 1.6419, + "step": 21640 + }, + { + "epoch": 4.561015431611103, + "grad_norm": 0.995190441608429, + "learning_rate": 0.0001137250029990582, + "loss": 1.6129, + "step": 21650 + }, + { + "epoch": 4.5631221361984515, + "grad_norm": 1.0449438095092773, + "learning_rate": 0.0001136594318977014, + "loss": 1.6834, + "step": 21660 + }, + { + "epoch": 4.565228840785801, + "grad_norm": 0.9984011054039001, + "learning_rate": 0.00011359385481116897, + "loss": 1.6747, + "step": 21670 + }, + { + "epoch": 4.56733554537315, + "grad_norm": 1.0843603610992432, + "learning_rate": 0.00011352827176819496, + "loss": 1.6948, + "step": 21680 + }, + { + "epoch": 4.569442249960499, + "grad_norm": 1.0422046184539795, + "learning_rate": 0.00011346268279751595, + "loss": 1.6508, + "step": 21690 + }, + { + "epoch": 4.571548954547849, + "grad_norm": 0.9797605276107788, + "learning_rate": 0.00011339708792787119, + "loss": 1.5822, + "step": 21700 + }, + { + "epoch": 4.573655659135198, + "grad_norm": 0.9891635179519653, + "learning_rate": 0.00011333148718800248, + "loss": 1.6098, + "step": 21710 + }, + { + "epoch": 4.575762363722547, + "grad_norm": 1.0078808069229126, + "learning_rate": 0.0001132658806066542, + "loss": 1.5623, + "step": 21720 + }, + { + "epoch": 4.577869068309896, + "grad_norm": 0.9395504593849182, + "learning_rate": 0.00011320026821257333, + "loss": 1.6912, + "step": 21730 + }, + { + "epoch": 4.579975772897246, + "grad_norm": 0.9831303954124451, + "learning_rate": 0.00011313465003450931, + "loss": 1.6553, + "step": 21740 + }, + { + "epoch": 4.582082477484595, + "grad_norm": 0.9971016645431519, + "learning_rate": 0.00011306902610121419, + "loss": 1.651, + "step": 21750 + }, + { + "epoch": 4.584189182071944, + "grad_norm": 1.0432978868484497, + "learning_rate": 0.00011300339644144252, + "loss": 1.6727, + "step": 21760 + }, + { + "epoch": 4.586295886659293, + "grad_norm": 1.1212437152862549, + "learning_rate": 0.00011293776108395135, + "loss": 1.6057, + "step": 21770 + }, + { + "epoch": 4.588402591246642, + "grad_norm": 0.9745138883590698, + "learning_rate": 0.00011287212005750024, + "loss": 1.6231, + "step": 21780 + }, + { + "epoch": 4.590509295833992, + "grad_norm": 1.057394027709961, + "learning_rate": 0.00011280647339085118, + "loss": 1.5696, + "step": 21790 + }, + { + "epoch": 4.592616000421341, + "grad_norm": 1.1260936260223389, + "learning_rate": 0.00011274082111276876, + "loss": 1.6752, + "step": 21800 + }, + { + "epoch": 4.5947227050086905, + "grad_norm": 1.133716344833374, + "learning_rate": 0.00011267516325201985, + "loss": 1.6331, + "step": 21810 + }, + { + "epoch": 4.596829409596039, + "grad_norm": 1.1236519813537598, + "learning_rate": 0.00011260949983737398, + "loss": 1.6272, + "step": 21820 + }, + { + "epoch": 4.598936114183388, + "grad_norm": 0.9887076020240784, + "learning_rate": 0.00011254383089760285, + "loss": 1.6449, + "step": 21830 + }, + { + "epoch": 4.601042818770738, + "grad_norm": 1.0048518180847168, + "learning_rate": 0.00011247815646148087, + "loss": 1.6848, + "step": 21840 + }, + { + "epoch": 4.603149523358087, + "grad_norm": 0.9584482312202454, + "learning_rate": 0.00011241247655778464, + "loss": 1.6587, + "step": 21850 + }, + { + "epoch": 4.6052562279454365, + "grad_norm": 1.220842957496643, + "learning_rate": 0.00011234679121529323, + "loss": 1.64, + "step": 21860 + }, + { + "epoch": 4.607362932532785, + "grad_norm": 1.1299933195114136, + "learning_rate": 0.00011228110046278808, + "loss": 1.661, + "step": 21870 + }, + { + "epoch": 4.609469637120135, + "grad_norm": 1.0172736644744873, + "learning_rate": 0.00011221540432905309, + "loss": 1.6298, + "step": 21880 + }, + { + "epoch": 4.611576341707484, + "grad_norm": 1.0505467653274536, + "learning_rate": 0.00011214970284287435, + "loss": 1.668, + "step": 21890 + }, + { + "epoch": 4.613683046294833, + "grad_norm": 0.9825325608253479, + "learning_rate": 0.00011208399603304047, + "loss": 1.5831, + "step": 21900 + }, + { + "epoch": 4.615789750882183, + "grad_norm": 1.0815013647079468, + "learning_rate": 0.00011201828392834223, + "loss": 1.6227, + "step": 21910 + }, + { + "epoch": 4.617896455469531, + "grad_norm": 0.9960325360298157, + "learning_rate": 0.00011195256655757288, + "loss": 1.646, + "step": 21920 + }, + { + "epoch": 4.620003160056881, + "grad_norm": 1.1286170482635498, + "learning_rate": 0.00011188684394952789, + "loss": 1.6757, + "step": 21930 + }, + { + "epoch": 4.62210986464423, + "grad_norm": 0.9996662735939026, + "learning_rate": 0.00011182111613300501, + "loss": 1.6136, + "step": 21940 + }, + { + "epoch": 4.62421656923158, + "grad_norm": 1.0416985750198364, + "learning_rate": 0.00011175538313680431, + "loss": 1.655, + "step": 21950 + }, + { + "epoch": 4.626323273818929, + "grad_norm": 1.0647679567337036, + "learning_rate": 0.00011168964498972818, + "loss": 1.6109, + "step": 21960 + }, + { + "epoch": 4.628429978406278, + "grad_norm": 1.1595183610916138, + "learning_rate": 0.00011162390172058115, + "loss": 1.6105, + "step": 21970 + }, + { + "epoch": 4.630536682993627, + "grad_norm": 1.0993425846099854, + "learning_rate": 0.00011155815335817011, + "loss": 1.6102, + "step": 21980 + }, + { + "epoch": 4.632643387580977, + "grad_norm": 0.9971186518669128, + "learning_rate": 0.00011149239993130403, + "loss": 1.6735, + "step": 21990 + }, + { + "epoch": 4.634750092168326, + "grad_norm": 1.1622364521026611, + "learning_rate": 0.00011142664146879432, + "loss": 1.6381, + "step": 22000 + }, + { + "epoch": 4.636856796755675, + "grad_norm": 1.0967098474502563, + "learning_rate": 0.00011136087799945438, + "loss": 1.6526, + "step": 22010 + }, + { + "epoch": 4.638963501343024, + "grad_norm": 1.0782030820846558, + "learning_rate": 0.00011129510955209996, + "loss": 1.755, + "step": 22020 + }, + { + "epoch": 4.641070205930373, + "grad_norm": 0.965355634689331, + "learning_rate": 0.00011122933615554889, + "loss": 1.6236, + "step": 22030 + }, + { + "epoch": 4.643176910517723, + "grad_norm": 1.0064630508422852, + "learning_rate": 0.00011116355783862122, + "loss": 1.6899, + "step": 22040 + }, + { + "epoch": 4.645283615105072, + "grad_norm": 1.1502881050109863, + "learning_rate": 0.00011109777463013915, + "loss": 1.6342, + "step": 22050 + }, + { + "epoch": 4.647390319692422, + "grad_norm": 1.1655616760253906, + "learning_rate": 0.00011103198655892699, + "loss": 1.636, + "step": 22060 + }, + { + "epoch": 4.64949702427977, + "grad_norm": 1.063859462738037, + "learning_rate": 0.00011096619365381123, + "loss": 1.6424, + "step": 22070 + }, + { + "epoch": 4.651603728867119, + "grad_norm": 1.0649337768554688, + "learning_rate": 0.00011090039594362045, + "loss": 1.6466, + "step": 22080 + }, + { + "epoch": 4.653710433454469, + "grad_norm": 0.9440916180610657, + "learning_rate": 0.00011083459345718535, + "loss": 1.6441, + "step": 22090 + }, + { + "epoch": 4.655817138041818, + "grad_norm": 1.06228768825531, + "learning_rate": 0.00011076878622333868, + "loss": 1.6706, + "step": 22100 + }, + { + "epoch": 4.657923842629168, + "grad_norm": 0.9528666734695435, + "learning_rate": 0.00011070297427091534, + "loss": 1.6179, + "step": 22110 + }, + { + "epoch": 4.660030547216516, + "grad_norm": 1.1522499322891235, + "learning_rate": 0.00011063715762875225, + "loss": 1.6745, + "step": 22120 + }, + { + "epoch": 4.662137251803866, + "grad_norm": 1.049260139465332, + "learning_rate": 0.00011057133632568839, + "loss": 1.6675, + "step": 22130 + }, + { + "epoch": 4.664243956391215, + "grad_norm": 1.0696789026260376, + "learning_rate": 0.00011050551039056479, + "loss": 1.6355, + "step": 22140 + }, + { + "epoch": 4.666350660978564, + "grad_norm": 0.9847993850708008, + "learning_rate": 0.0001104396798522245, + "loss": 1.6592, + "step": 22150 + }, + { + "epoch": 4.668457365565914, + "grad_norm": 1.102442979812622, + "learning_rate": 0.0001103738447395126, + "loss": 1.6874, + "step": 22160 + }, + { + "epoch": 4.6705640701532625, + "grad_norm": 0.9978004097938538, + "learning_rate": 0.0001103080050812762, + "loss": 1.6361, + "step": 22170 + }, + { + "epoch": 4.672670774740612, + "grad_norm": 1.0541409254074097, + "learning_rate": 0.00011024216090636433, + "loss": 1.6731, + "step": 22180 + }, + { + "epoch": 4.674777479327961, + "grad_norm": 1.0716190338134766, + "learning_rate": 0.00011017631224362803, + "loss": 1.6109, + "step": 22190 + }, + { + "epoch": 4.676884183915311, + "grad_norm": 1.0343621969223022, + "learning_rate": 0.00011011045912192035, + "loss": 1.6403, + "step": 22200 + }, + { + "epoch": 4.67899088850266, + "grad_norm": 1.1150816679000854, + "learning_rate": 0.00011004460157009626, + "loss": 1.6232, + "step": 22210 + }, + { + "epoch": 4.6810975930900085, + "grad_norm": 0.9567576050758362, + "learning_rate": 0.00010997873961701266, + "loss": 1.6275, + "step": 22220 + }, + { + "epoch": 4.683204297677358, + "grad_norm": 1.0110678672790527, + "learning_rate": 0.00010991287329152838, + "loss": 1.6055, + "step": 22230 + }, + { + "epoch": 4.685311002264707, + "grad_norm": 1.0686061382293701, + "learning_rate": 0.00010984700262250418, + "loss": 1.6376, + "step": 22240 + }, + { + "epoch": 4.687417706852057, + "grad_norm": 1.069300651550293, + "learning_rate": 0.00010978112763880275, + "loss": 1.6952, + "step": 22250 + }, + { + "epoch": 4.689524411439406, + "grad_norm": 0.9756374955177307, + "learning_rate": 0.0001097152483692886, + "loss": 1.6478, + "step": 22260 + }, + { + "epoch": 4.691631116026755, + "grad_norm": 1.0548217296600342, + "learning_rate": 0.00010964936484282817, + "loss": 1.6522, + "step": 22270 + }, + { + "epoch": 4.693737820614104, + "grad_norm": 1.073805570602417, + "learning_rate": 0.00010958347708828976, + "loss": 1.6336, + "step": 22280 + }, + { + "epoch": 4.695844525201454, + "grad_norm": 1.0033069849014282, + "learning_rate": 0.00010951758513454351, + "loss": 1.6493, + "step": 22290 + }, + { + "epoch": 4.697951229788803, + "grad_norm": 1.0431855916976929, + "learning_rate": 0.00010945168901046139, + "loss": 1.622, + "step": 22300 + }, + { + "epoch": 4.700057934376153, + "grad_norm": 1.0515035390853882, + "learning_rate": 0.00010938578874491722, + "loss": 1.6085, + "step": 22310 + }, + { + "epoch": 4.7021646389635015, + "grad_norm": 0.9794484972953796, + "learning_rate": 0.00010931988436678666, + "loss": 1.664, + "step": 22320 + }, + { + "epoch": 4.70427134355085, + "grad_norm": 1.1576461791992188, + "learning_rate": 0.00010925397590494712, + "loss": 1.7116, + "step": 22330 + }, + { + "epoch": 4.7063780481382, + "grad_norm": 1.0061620473861694, + "learning_rate": 0.00010918806338827778, + "loss": 1.57, + "step": 22340 + }, + { + "epoch": 4.708484752725549, + "grad_norm": 1.0988759994506836, + "learning_rate": 0.00010912214684565967, + "loss": 1.6373, + "step": 22350 + }, + { + "epoch": 4.710591457312899, + "grad_norm": 1.1311630010604858, + "learning_rate": 0.00010905622630597558, + "loss": 1.6238, + "step": 22360 + }, + { + "epoch": 4.7126981619002475, + "grad_norm": 1.1295340061187744, + "learning_rate": 0.00010899030179810997, + "loss": 1.6584, + "step": 22370 + }, + { + "epoch": 4.714804866487597, + "grad_norm": 1.0294084548950195, + "learning_rate": 0.00010892437335094912, + "loss": 1.6811, + "step": 22380 + }, + { + "epoch": 4.716911571074946, + "grad_norm": 1.053384780883789, + "learning_rate": 0.00010885844099338094, + "loss": 1.6385, + "step": 22390 + }, + { + "epoch": 4.719018275662295, + "grad_norm": 1.0761774778366089, + "learning_rate": 0.00010879250475429523, + "loss": 1.6331, + "step": 22400 + }, + { + "epoch": 4.721124980249645, + "grad_norm": 1.0743370056152344, + "learning_rate": 0.00010872656466258328, + "loss": 1.6635, + "step": 22410 + }, + { + "epoch": 4.7232316848369935, + "grad_norm": 0.9481725096702576, + "learning_rate": 0.00010866062074713825, + "loss": 1.6565, + "step": 22420 + }, + { + "epoch": 4.725338389424343, + "grad_norm": 1.148763656616211, + "learning_rate": 0.00010859467303685482, + "loss": 1.6608, + "step": 22430 + }, + { + "epoch": 4.727445094011692, + "grad_norm": 1.1060622930526733, + "learning_rate": 0.00010852872156062946, + "loss": 1.6188, + "step": 22440 + }, + { + "epoch": 4.729551798599042, + "grad_norm": 1.1848762035369873, + "learning_rate": 0.00010846276634736021, + "loss": 1.5904, + "step": 22450 + }, + { + "epoch": 4.731658503186391, + "grad_norm": 0.9851653575897217, + "learning_rate": 0.00010839680742594678, + "loss": 1.6341, + "step": 22460 + }, + { + "epoch": 4.7337652077737395, + "grad_norm": 0.9424974322319031, + "learning_rate": 0.00010833084482529048, + "loss": 1.6531, + "step": 22470 + }, + { + "epoch": 4.735871912361089, + "grad_norm": 0.9752402901649475, + "learning_rate": 0.00010826487857429428, + "loss": 1.6471, + "step": 22480 + }, + { + "epoch": 4.737978616948438, + "grad_norm": 1.0955601930618286, + "learning_rate": 0.00010819890870186271, + "loss": 1.6078, + "step": 22490 + }, + { + "epoch": 4.740085321535788, + "grad_norm": 1.05659818649292, + "learning_rate": 0.00010813293523690191, + "loss": 1.6918, + "step": 22500 + }, + { + "epoch": 4.742192026123137, + "grad_norm": 1.0022358894348145, + "learning_rate": 0.00010806695820831954, + "loss": 1.6364, + "step": 22510 + }, + { + "epoch": 4.7442987307104865, + "grad_norm": 1.1091896295547485, + "learning_rate": 0.00010800097764502491, + "loss": 1.6751, + "step": 22520 + }, + { + "epoch": 4.746405435297835, + "grad_norm": 0.9929733276367188, + "learning_rate": 0.0001079349935759288, + "loss": 1.6447, + "step": 22530 + }, + { + "epoch": 4.748512139885184, + "grad_norm": 1.0926586389541626, + "learning_rate": 0.00010786900602994359, + "loss": 1.7089, + "step": 22540 + }, + { + "epoch": 4.750618844472534, + "grad_norm": 1.0213204622268677, + "learning_rate": 0.00010780301503598306, + "loss": 1.6099, + "step": 22550 + }, + { + "epoch": 4.752725549059883, + "grad_norm": 0.9645788669586182, + "learning_rate": 0.00010773702062296273, + "loss": 1.6107, + "step": 22560 + }, + { + "epoch": 4.7548322536472325, + "grad_norm": 1.0832529067993164, + "learning_rate": 0.00010767102281979939, + "loss": 1.6678, + "step": 22570 + }, + { + "epoch": 4.756938958234581, + "grad_norm": 0.9932439923286438, + "learning_rate": 0.00010760502165541144, + "loss": 1.6744, + "step": 22580 + }, + { + "epoch": 4.759045662821931, + "grad_norm": 1.130690574645996, + "learning_rate": 0.00010753901715871866, + "loss": 1.6977, + "step": 22590 + }, + { + "epoch": 4.76115236740928, + "grad_norm": 1.1236021518707275, + "learning_rate": 0.00010747300935864243, + "loss": 1.6445, + "step": 22600 + }, + { + "epoch": 4.763259071996629, + "grad_norm": 1.0128141641616821, + "learning_rate": 0.00010740699828410545, + "loss": 1.6873, + "step": 22610 + }, + { + "epoch": 4.7653657765839785, + "grad_norm": 0.9882675409317017, + "learning_rate": 0.00010734098396403192, + "loss": 1.5715, + "step": 22620 + }, + { + "epoch": 4.767472481171327, + "grad_norm": 1.1569823026657104, + "learning_rate": 0.0001072749664273474, + "loss": 1.6743, + "step": 22630 + }, + { + "epoch": 4.769579185758677, + "grad_norm": 1.044657588005066, + "learning_rate": 0.00010720894570297897, + "loss": 1.5966, + "step": 22640 + }, + { + "epoch": 4.771685890346026, + "grad_norm": 1.0096189975738525, + "learning_rate": 0.00010714292181985498, + "loss": 1.6633, + "step": 22650 + }, + { + "epoch": 4.773792594933376, + "grad_norm": 1.0168404579162598, + "learning_rate": 0.00010707689480690526, + "loss": 1.6276, + "step": 22660 + }, + { + "epoch": 4.775899299520725, + "grad_norm": 1.054799199104309, + "learning_rate": 0.00010701086469306096, + "loss": 1.6701, + "step": 22670 + }, + { + "epoch": 4.778006004108074, + "grad_norm": 0.9628526568412781, + "learning_rate": 0.00010694483150725458, + "loss": 1.6597, + "step": 22680 + }, + { + "epoch": 4.780112708695423, + "grad_norm": 1.0607578754425049, + "learning_rate": 0.00010687879527842007, + "loss": 1.6841, + "step": 22690 + }, + { + "epoch": 4.782219413282773, + "grad_norm": 1.0231002569198608, + "learning_rate": 0.00010681275603549252, + "loss": 1.6606, + "step": 22700 + }, + { + "epoch": 4.784326117870122, + "grad_norm": 1.0138951539993286, + "learning_rate": 0.00010674671380740851, + "loss": 1.6407, + "step": 22710 + }, + { + "epoch": 4.786432822457471, + "grad_norm": 1.051766276359558, + "learning_rate": 0.00010668066862310589, + "loss": 1.6696, + "step": 22720 + }, + { + "epoch": 4.78853952704482, + "grad_norm": 1.0689328908920288, + "learning_rate": 0.00010661462051152376, + "loss": 1.6823, + "step": 22730 + }, + { + "epoch": 4.790646231632169, + "grad_norm": 1.0172207355499268, + "learning_rate": 0.00010654856950160253, + "loss": 1.6111, + "step": 22740 + }, + { + "epoch": 4.792752936219519, + "grad_norm": 1.068111538887024, + "learning_rate": 0.00010648251562228386, + "loss": 1.6345, + "step": 22750 + }, + { + "epoch": 4.794859640806868, + "grad_norm": 1.1267240047454834, + "learning_rate": 0.00010641645890251075, + "loss": 1.6202, + "step": 22760 + }, + { + "epoch": 4.7969663453942175, + "grad_norm": 1.025101900100708, + "learning_rate": 0.00010635039937122733, + "loss": 1.5892, + "step": 22770 + }, + { + "epoch": 4.799073049981566, + "grad_norm": 1.0108810663223267, + "learning_rate": 0.00010628433705737901, + "loss": 1.6042, + "step": 22780 + }, + { + "epoch": 4.801179754568915, + "grad_norm": 1.0703437328338623, + "learning_rate": 0.00010621827198991245, + "loss": 1.6693, + "step": 22790 + }, + { + "epoch": 4.803286459156265, + "grad_norm": 1.0624349117279053, + "learning_rate": 0.00010615220419777548, + "loss": 1.6896, + "step": 22800 + }, + { + "epoch": 4.805393163743614, + "grad_norm": 1.0351321697235107, + "learning_rate": 0.00010608613370991711, + "loss": 1.6833, + "step": 22810 + }, + { + "epoch": 4.8074998683309635, + "grad_norm": 1.0743846893310547, + "learning_rate": 0.0001060200605552876, + "loss": 1.608, + "step": 22820 + }, + { + "epoch": 4.809606572918312, + "grad_norm": 1.0879895687103271, + "learning_rate": 0.00010595398476283827, + "loss": 1.6452, + "step": 22830 + }, + { + "epoch": 4.811713277505662, + "grad_norm": 1.1424341201782227, + "learning_rate": 0.00010588790636152168, + "loss": 1.7068, + "step": 22840 + }, + { + "epoch": 4.813819982093011, + "grad_norm": 1.0059975385665894, + "learning_rate": 0.00010582182538029157, + "loss": 1.6526, + "step": 22850 + }, + { + "epoch": 4.81592668668036, + "grad_norm": 0.9729864597320557, + "learning_rate": 0.00010575574184810269, + "loss": 1.6288, + "step": 22860 + }, + { + "epoch": 4.81803339126771, + "grad_norm": 1.1198335886001587, + "learning_rate": 0.00010568965579391097, + "loss": 1.589, + "step": 22870 + }, + { + "epoch": 4.820140095855058, + "grad_norm": 1.0577073097229004, + "learning_rate": 0.00010562356724667346, + "loss": 1.5819, + "step": 22880 + }, + { + "epoch": 4.822246800442408, + "grad_norm": 1.1223348379135132, + "learning_rate": 0.00010555747623534831, + "loss": 1.6413, + "step": 22890 + }, + { + "epoch": 4.824353505029757, + "grad_norm": 1.072959065437317, + "learning_rate": 0.00010549138278889468, + "loss": 1.6769, + "step": 22900 + }, + { + "epoch": 4.826460209617107, + "grad_norm": 1.004937767982483, + "learning_rate": 0.00010542528693627287, + "loss": 1.6091, + "step": 22910 + }, + { + "epoch": 4.828566914204456, + "grad_norm": 1.094373345375061, + "learning_rate": 0.0001053591887064442, + "loss": 1.6446, + "step": 22920 + }, + { + "epoch": 4.8306736187918045, + "grad_norm": 1.0144459009170532, + "learning_rate": 0.00010529308812837104, + "loss": 1.6605, + "step": 22930 + }, + { + "epoch": 4.832780323379154, + "grad_norm": 1.0409224033355713, + "learning_rate": 0.00010522698523101682, + "loss": 1.6474, + "step": 22940 + }, + { + "epoch": 4.834887027966503, + "grad_norm": 1.112221598625183, + "learning_rate": 0.0001051608800433459, + "loss": 1.7047, + "step": 22950 + }, + { + "epoch": 4.836993732553853, + "grad_norm": 1.0993469953536987, + "learning_rate": 0.00010509477259432372, + "loss": 1.6449, + "step": 22960 + }, + { + "epoch": 4.839100437141202, + "grad_norm": 1.0196629762649536, + "learning_rate": 0.00010502866291291668, + "loss": 1.7036, + "step": 22970 + }, + { + "epoch": 4.841207141728551, + "grad_norm": 1.0654680728912354, + "learning_rate": 0.00010496255102809223, + "loss": 1.6403, + "step": 22980 + }, + { + "epoch": 4.8433138463159, + "grad_norm": 0.9815942049026489, + "learning_rate": 0.00010489643696881864, + "loss": 1.6527, + "step": 22990 + }, + { + "epoch": 4.84542055090325, + "grad_norm": 1.2717540264129639, + "learning_rate": 0.00010483032076406528, + "loss": 1.6497, + "step": 23000 + }, + { + "epoch": 4.847527255490599, + "grad_norm": 1.1045358180999756, + "learning_rate": 0.00010476420244280232, + "loss": 1.677, + "step": 23010 + }, + { + "epoch": 4.849633960077949, + "grad_norm": 0.9338797926902771, + "learning_rate": 0.00010469808203400102, + "loss": 1.6032, + "step": 23020 + }, + { + "epoch": 4.851740664665297, + "grad_norm": 0.9959046840667725, + "learning_rate": 0.00010463195956663338, + "loss": 1.5908, + "step": 23030 + }, + { + "epoch": 4.853847369252646, + "grad_norm": 1.0763238668441772, + "learning_rate": 0.00010456583506967248, + "loss": 1.635, + "step": 23040 + }, + { + "epoch": 4.855954073839996, + "grad_norm": 1.1013065576553345, + "learning_rate": 0.00010449970857209213, + "loss": 1.6311, + "step": 23050 + }, + { + "epoch": 4.858060778427345, + "grad_norm": 1.1367875337600708, + "learning_rate": 0.00010443358010286714, + "loss": 1.6263, + "step": 23060 + }, + { + "epoch": 4.860167483014695, + "grad_norm": 1.066049337387085, + "learning_rate": 0.00010436744969097306, + "loss": 1.6411, + "step": 23070 + }, + { + "epoch": 4.862274187602043, + "grad_norm": 0.9799880385398865, + "learning_rate": 0.00010430131736538644, + "loss": 1.6492, + "step": 23080 + }, + { + "epoch": 4.864380892189393, + "grad_norm": 1.122512936592102, + "learning_rate": 0.00010423518315508452, + "loss": 1.6537, + "step": 23090 + }, + { + "epoch": 4.866487596776742, + "grad_norm": 0.9705328941345215, + "learning_rate": 0.00010416904708904548, + "loss": 1.6212, + "step": 23100 + }, + { + "epoch": 4.868594301364091, + "grad_norm": 1.0140635967254639, + "learning_rate": 0.00010410290919624824, + "loss": 1.6039, + "step": 23110 + }, + { + "epoch": 4.870701005951441, + "grad_norm": 1.1238670349121094, + "learning_rate": 0.00010403676950567258, + "loss": 1.6271, + "step": 23120 + }, + { + "epoch": 4.8728077105387895, + "grad_norm": 1.032712697982788, + "learning_rate": 0.000103970628046299, + "loss": 1.6567, + "step": 23130 + }, + { + "epoch": 4.874914415126139, + "grad_norm": 1.0045222043991089, + "learning_rate": 0.00010390448484710886, + "loss": 1.6499, + "step": 23140 + }, + { + "epoch": 4.877021119713488, + "grad_norm": 1.1023186445236206, + "learning_rate": 0.00010383833993708416, + "loss": 1.6053, + "step": 23150 + }, + { + "epoch": 4.879127824300838, + "grad_norm": 1.5840126276016235, + "learning_rate": 0.00010377219334520783, + "loss": 1.6386, + "step": 23160 + }, + { + "epoch": 4.881234528888187, + "grad_norm": 1.037644386291504, + "learning_rate": 0.00010370604510046331, + "loss": 1.6591, + "step": 23170 + }, + { + "epoch": 4.8833412334755355, + "grad_norm": 1.0414832830429077, + "learning_rate": 0.00010363989523183495, + "loss": 1.6327, + "step": 23180 + }, + { + "epoch": 4.885447938062885, + "grad_norm": 1.0389552116394043, + "learning_rate": 0.00010357374376830775, + "loss": 1.668, + "step": 23190 + }, + { + "epoch": 4.887554642650234, + "grad_norm": 1.1060019731521606, + "learning_rate": 0.00010350759073886739, + "loss": 1.6629, + "step": 23200 + }, + { + "epoch": 4.889661347237584, + "grad_norm": 0.971713662147522, + "learning_rate": 0.00010344143617250029, + "loss": 1.644, + "step": 23210 + }, + { + "epoch": 4.891768051824933, + "grad_norm": 1.0252668857574463, + "learning_rate": 0.00010337528009819344, + "loss": 1.6615, + "step": 23220 + }, + { + "epoch": 4.893874756412282, + "grad_norm": 0.9966990947723389, + "learning_rate": 0.00010330912254493456, + "loss": 1.6698, + "step": 23230 + }, + { + "epoch": 4.895981460999631, + "grad_norm": 1.0967049598693848, + "learning_rate": 0.00010324296354171207, + "loss": 1.6607, + "step": 23240 + }, + { + "epoch": 4.89808816558698, + "grad_norm": 1.0470962524414062, + "learning_rate": 0.00010317680311751496, + "loss": 1.6537, + "step": 23250 + }, + { + "epoch": 4.90019487017433, + "grad_norm": 1.0356905460357666, + "learning_rate": 0.00010311064130133279, + "loss": 1.6644, + "step": 23260 + }, + { + "epoch": 4.902301574761679, + "grad_norm": 1.1939691305160522, + "learning_rate": 0.00010304447812215582, + "loss": 1.6593, + "step": 23270 + }, + { + "epoch": 4.9044082793490285, + "grad_norm": 1.1555665731430054, + "learning_rate": 0.00010297831360897492, + "loss": 1.7027, + "step": 23280 + }, + { + "epoch": 4.906514983936377, + "grad_norm": 1.1872739791870117, + "learning_rate": 0.00010291214779078149, + "loss": 1.6771, + "step": 23290 + }, + { + "epoch": 4.908621688523727, + "grad_norm": 1.0362814664840698, + "learning_rate": 0.00010284598069656746, + "loss": 1.641, + "step": 23300 + }, + { + "epoch": 4.910728393111076, + "grad_norm": 1.0210367441177368, + "learning_rate": 0.00010277981235532541, + "loss": 1.6396, + "step": 23310 + }, + { + "epoch": 4.912835097698425, + "grad_norm": 1.0176624059677124, + "learning_rate": 0.00010271364279604842, + "loss": 1.6409, + "step": 23320 + }, + { + "epoch": 4.9149418022857745, + "grad_norm": 0.9932822585105896, + "learning_rate": 0.00010264747204773018, + "loss": 1.6659, + "step": 23330 + }, + { + "epoch": 4.917048506873123, + "grad_norm": 1.0647072792053223, + "learning_rate": 0.00010258130013936474, + "loss": 1.6686, + "step": 23340 + }, + { + "epoch": 4.919155211460473, + "grad_norm": 1.2044399976730347, + "learning_rate": 0.0001025151270999468, + "loss": 1.6833, + "step": 23350 + }, + { + "epoch": 4.921261916047822, + "grad_norm": 0.9959510564804077, + "learning_rate": 0.00010244895295847147, + "loss": 1.6623, + "step": 23360 + }, + { + "epoch": 4.923368620635172, + "grad_norm": 1.0841436386108398, + "learning_rate": 0.00010238277774393448, + "loss": 1.6428, + "step": 23370 + }, + { + "epoch": 4.9254753252225205, + "grad_norm": 1.029536247253418, + "learning_rate": 0.00010231660148533183, + "loss": 1.6339, + "step": 23380 + }, + { + "epoch": 4.92758202980987, + "grad_norm": 1.031693458557129, + "learning_rate": 0.0001022504242116601, + "loss": 1.6584, + "step": 23390 + }, + { + "epoch": 4.929688734397219, + "grad_norm": 1.041793942451477, + "learning_rate": 0.00010218424595191631, + "loss": 1.6233, + "step": 23400 + }, + { + "epoch": 4.931795438984569, + "grad_norm": 1.0688579082489014, + "learning_rate": 0.00010211806673509794, + "loss": 1.6484, + "step": 23410 + }, + { + "epoch": 4.933902143571918, + "grad_norm": 0.9542795419692993, + "learning_rate": 0.00010205188659020275, + "loss": 1.6047, + "step": 23420 + }, + { + "epoch": 4.9360088481592665, + "grad_norm": 0.9721381068229675, + "learning_rate": 0.00010198570554622909, + "loss": 1.7019, + "step": 23430 + }, + { + "epoch": 4.938115552746616, + "grad_norm": 1.0544655323028564, + "learning_rate": 0.00010191952363217557, + "loss": 1.6674, + "step": 23440 + }, + { + "epoch": 4.940222257333965, + "grad_norm": 0.9668622612953186, + "learning_rate": 0.00010185334087704124, + "loss": 1.5844, + "step": 23450 + }, + { + "epoch": 4.942328961921315, + "grad_norm": 0.9595541954040527, + "learning_rate": 0.00010178715730982549, + "loss": 1.6182, + "step": 23460 + }, + { + "epoch": 4.944435666508664, + "grad_norm": 1.1048792600631714, + "learning_rate": 0.00010172097295952811, + "loss": 1.6485, + "step": 23470 + }, + { + "epoch": 4.9465423710960135, + "grad_norm": 1.0720999240875244, + "learning_rate": 0.00010165478785514919, + "loss": 1.7186, + "step": 23480 + }, + { + "epoch": 4.948649075683362, + "grad_norm": 1.021140456199646, + "learning_rate": 0.00010158860202568916, + "loss": 1.6569, + "step": 23490 + }, + { + "epoch": 4.950755780270711, + "grad_norm": 1.0183557271957397, + "learning_rate": 0.00010152241550014881, + "loss": 1.6664, + "step": 23500 + }, + { + "epoch": 4.952862484858061, + "grad_norm": 1.0429364442825317, + "learning_rate": 0.00010145622830752912, + "loss": 1.6682, + "step": 23510 + }, + { + "epoch": 4.95496918944541, + "grad_norm": 1.113517165184021, + "learning_rate": 0.00010139004047683151, + "loss": 1.6062, + "step": 23520 + }, + { + "epoch": 4.9570758940327595, + "grad_norm": 1.080854058265686, + "learning_rate": 0.00010132385203705761, + "loss": 1.5665, + "step": 23530 + }, + { + "epoch": 4.959182598620108, + "grad_norm": 1.0921235084533691, + "learning_rate": 0.00010125766301720929, + "loss": 1.6353, + "step": 23540 + }, + { + "epoch": 4.961289303207458, + "grad_norm": 0.9792695641517639, + "learning_rate": 0.0001011914734462887, + "loss": 1.6395, + "step": 23550 + }, + { + "epoch": 4.963396007794807, + "grad_norm": 0.903640866279602, + "learning_rate": 0.00010112528335329823, + "loss": 1.5959, + "step": 23560 + }, + { + "epoch": 4.965502712382156, + "grad_norm": 1.024780511856079, + "learning_rate": 0.0001010590927672405, + "loss": 1.6883, + "step": 23570 + }, + { + "epoch": 4.9676094169695055, + "grad_norm": 0.9267536401748657, + "learning_rate": 0.00010099290171711841, + "loss": 1.6925, + "step": 23580 + }, + { + "epoch": 4.969716121556854, + "grad_norm": 1.0130085945129395, + "learning_rate": 0.00010092671023193491, + "loss": 1.6629, + "step": 23590 + }, + { + "epoch": 4.971822826144204, + "grad_norm": 1.0210721492767334, + "learning_rate": 0.00010086051834069328, + "loss": 1.6247, + "step": 23600 + }, + { + "epoch": 4.973929530731553, + "grad_norm": 1.1113983392715454, + "learning_rate": 0.00010079432607239692, + "loss": 1.6352, + "step": 23610 + }, + { + "epoch": 4.976036235318903, + "grad_norm": 1.0311906337738037, + "learning_rate": 0.00010072813345604941, + "loss": 1.6338, + "step": 23620 + }, + { + "epoch": 4.978142939906252, + "grad_norm": 1.0975770950317383, + "learning_rate": 0.00010066194052065446, + "loss": 1.6354, + "step": 23630 + }, + { + "epoch": 4.9802496444936, + "grad_norm": 1.080855131149292, + "learning_rate": 0.00010059574729521595, + "loss": 1.6562, + "step": 23640 + }, + { + "epoch": 4.98235634908095, + "grad_norm": 1.0517081022262573, + "learning_rate": 0.00010052955380873788, + "loss": 1.6971, + "step": 23650 + }, + { + "epoch": 4.984463053668299, + "grad_norm": 1.005123257637024, + "learning_rate": 0.00010046336009022435, + "loss": 1.6412, + "step": 23660 + }, + { + "epoch": 4.986569758255649, + "grad_norm": 1.0352355241775513, + "learning_rate": 0.00010039716616867957, + "loss": 1.6435, + "step": 23670 + }, + { + "epoch": 4.988676462842998, + "grad_norm": 1.0361087322235107, + "learning_rate": 0.00010033097207310785, + "loss": 1.647, + "step": 23680 + }, + { + "epoch": 4.990783167430347, + "grad_norm": 0.9913681745529175, + "learning_rate": 0.00010026477783251351, + "loss": 1.6346, + "step": 23690 + }, + { + "epoch": 4.992889872017696, + "grad_norm": 1.0833758115768433, + "learning_rate": 0.0001001985834759011, + "loss": 1.6415, + "step": 23700 + }, + { + "epoch": 4.994996576605046, + "grad_norm": 1.0982452630996704, + "learning_rate": 0.00010013238903227499, + "loss": 1.6706, + "step": 23710 + }, + { + "epoch": 4.997103281192395, + "grad_norm": 0.9679703116416931, + "learning_rate": 0.00010006619453063979, + "loss": 1.6587, + "step": 23720 + }, + { + "epoch": 4.9992099857797445, + "grad_norm": 1.0053178071975708, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 23730 + }, + { + "epoch": 5.001316690367093, + "grad_norm": 1.0724413394927979, + "learning_rate": 9.993380546936023e-05, + "loss": 1.6388, + "step": 23740 + }, + { + "epoch": 5.003423394954442, + "grad_norm": 1.3603681325912476, + "learning_rate": 9.986761096772502e-05, + "loss": 1.5901, + "step": 23750 + }, + { + "epoch": 5.005530099541792, + "grad_norm": 1.0234671831130981, + "learning_rate": 9.980141652409895e-05, + "loss": 1.667, + "step": 23760 + }, + { + "epoch": 5.007636804129141, + "grad_norm": 0.9571532607078552, + "learning_rate": 9.97352221674865e-05, + "loss": 1.5851, + "step": 23770 + }, + { + "epoch": 5.0097435087164905, + "grad_norm": 1.0448963642120361, + "learning_rate": 9.966902792689219e-05, + "loss": 1.557, + "step": 23780 + }, + { + "epoch": 5.011850213303839, + "grad_norm": 1.0232895612716675, + "learning_rate": 9.960283383132045e-05, + "loss": 1.6287, + "step": 23790 + }, + { + "epoch": 5.013956917891189, + "grad_norm": 1.0618042945861816, + "learning_rate": 9.953663990977568e-05, + "loss": 1.5744, + "step": 23800 + }, + { + "epoch": 5.016063622478538, + "grad_norm": 1.0534571409225464, + "learning_rate": 9.947044619126216e-05, + "loss": 1.694, + "step": 23810 + }, + { + "epoch": 5.018170327065887, + "grad_norm": 1.0805995464324951, + "learning_rate": 9.940425270478407e-05, + "loss": 1.6219, + "step": 23820 + }, + { + "epoch": 5.020277031653237, + "grad_norm": 0.9920625686645508, + "learning_rate": 9.933805947934555e-05, + "loss": 1.5763, + "step": 23830 + }, + { + "epoch": 5.022383736240585, + "grad_norm": 1.4843225479125977, + "learning_rate": 9.927186654395063e-05, + "loss": 1.6695, + "step": 23840 + }, + { + "epoch": 5.024490440827935, + "grad_norm": 1.0274052619934082, + "learning_rate": 9.920567392760312e-05, + "loss": 1.5961, + "step": 23850 + }, + { + "epoch": 5.026597145415284, + "grad_norm": 1.0526154041290283, + "learning_rate": 9.913948165930676e-05, + "loss": 1.6448, + "step": 23860 + }, + { + "epoch": 5.028703850002634, + "grad_norm": 1.0544629096984863, + "learning_rate": 9.907328976806511e-05, + "loss": 1.6435, + "step": 23870 + }, + { + "epoch": 5.030810554589983, + "grad_norm": 1.0605446100234985, + "learning_rate": 9.900709828288164e-05, + "loss": 1.6284, + "step": 23880 + }, + { + "epoch": 5.0329172591773315, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.894090723275951e-05, + "loss": 1.6044, + "step": 23890 + }, + { + "epoch": 5.035023963764681, + "grad_norm": 1.1093242168426514, + "learning_rate": 9.88747166467018e-05, + "loss": 1.6237, + "step": 23900 + }, + { + "epoch": 5.03713066835203, + "grad_norm": 1.014807939529419, + "learning_rate": 9.880852655371134e-05, + "loss": 1.5961, + "step": 23910 + }, + { + "epoch": 5.03923737293938, + "grad_norm": 1.281245231628418, + "learning_rate": 9.874233698279075e-05, + "loss": 1.5518, + "step": 23920 + }, + { + "epoch": 5.041344077526729, + "grad_norm": 1.0977730751037598, + "learning_rate": 9.867614796294242e-05, + "loss": 1.6002, + "step": 23930 + }, + { + "epoch": 5.043450782114078, + "grad_norm": 1.0470744371414185, + "learning_rate": 9.860995952316851e-05, + "loss": 1.6183, + "step": 23940 + }, + { + "epoch": 5.045557486701427, + "grad_norm": 1.046972393989563, + "learning_rate": 9.854377169247089e-05, + "loss": 1.5417, + "step": 23950 + }, + { + "epoch": 5.047664191288777, + "grad_norm": 1.1596167087554932, + "learning_rate": 9.847758449985124e-05, + "loss": 1.6612, + "step": 23960 + }, + { + "epoch": 5.049770895876126, + "grad_norm": 1.0991291999816895, + "learning_rate": 9.841139797431087e-05, + "loss": 1.5882, + "step": 23970 + }, + { + "epoch": 5.051877600463475, + "grad_norm": 1.0760891437530518, + "learning_rate": 9.834521214485083e-05, + "loss": 1.5909, + "step": 23980 + }, + { + "epoch": 5.053984305050824, + "grad_norm": 0.985348105430603, + "learning_rate": 9.827902704047191e-05, + "loss": 1.587, + "step": 23990 + }, + { + "epoch": 5.056091009638173, + "grad_norm": 1.231652021408081, + "learning_rate": 9.821284269017455e-05, + "loss": 1.6468, + "step": 24000 + }, + { + "epoch": 5.058197714225523, + "grad_norm": 1.148422122001648, + "learning_rate": 9.81466591229588e-05, + "loss": 1.6027, + "step": 24010 + }, + { + "epoch": 5.060304418812872, + "grad_norm": 1.0643984079360962, + "learning_rate": 9.808047636782447e-05, + "loss": 1.6109, + "step": 24020 + }, + { + "epoch": 5.062411123400222, + "grad_norm": 1.1943291425704956, + "learning_rate": 9.801429445377094e-05, + "loss": 1.7016, + "step": 24030 + }, + { + "epoch": 5.06451782798757, + "grad_norm": 1.0921201705932617, + "learning_rate": 9.794811340979724e-05, + "loss": 1.6567, + "step": 24040 + }, + { + "epoch": 5.066624532574919, + "grad_norm": 1.1693758964538574, + "learning_rate": 9.788193326490212e-05, + "loss": 1.6591, + "step": 24050 + }, + { + "epoch": 5.068731237162269, + "grad_norm": 1.1490429639816284, + "learning_rate": 9.781575404808371e-05, + "loss": 1.6554, + "step": 24060 + }, + { + "epoch": 5.070837941749618, + "grad_norm": 1.0068416595458984, + "learning_rate": 9.774957578833993e-05, + "loss": 1.6122, + "step": 24070 + }, + { + "epoch": 5.072944646336968, + "grad_norm": 0.9988663196563721, + "learning_rate": 9.768339851466818e-05, + "loss": 1.604, + "step": 24080 + }, + { + "epoch": 5.0750513509243165, + "grad_norm": 1.1303640604019165, + "learning_rate": 9.761722225606557e-05, + "loss": 1.585, + "step": 24090 + }, + { + "epoch": 5.077158055511666, + "grad_norm": 1.080378770828247, + "learning_rate": 9.755104704152854e-05, + "loss": 1.6062, + "step": 24100 + }, + { + "epoch": 5.079264760099015, + "grad_norm": 1.0893718004226685, + "learning_rate": 9.748487290005324e-05, + "loss": 1.6053, + "step": 24110 + }, + { + "epoch": 5.081371464686365, + "grad_norm": 1.0815832614898682, + "learning_rate": 9.741869986063526e-05, + "loss": 1.6045, + "step": 24120 + }, + { + "epoch": 5.083478169273714, + "grad_norm": 1.0142110586166382, + "learning_rate": 9.735252795226987e-05, + "loss": 1.5406, + "step": 24130 + }, + { + "epoch": 5.0855848738610625, + "grad_norm": 1.080077886581421, + "learning_rate": 9.728635720395159e-05, + "loss": 1.6617, + "step": 24140 + }, + { + "epoch": 5.087691578448412, + "grad_norm": 1.115033507347107, + "learning_rate": 9.722018764467461e-05, + "loss": 1.6468, + "step": 24150 + }, + { + "epoch": 5.089798283035761, + "grad_norm": 1.0328733921051025, + "learning_rate": 9.715401930343254e-05, + "loss": 1.6028, + "step": 24160 + }, + { + "epoch": 5.091904987623111, + "grad_norm": 1.0720160007476807, + "learning_rate": 9.708785220921856e-05, + "loss": 1.5708, + "step": 24170 + }, + { + "epoch": 5.09401169221046, + "grad_norm": 1.17787504196167, + "learning_rate": 9.702168639102509e-05, + "loss": 1.6243, + "step": 24180 + }, + { + "epoch": 5.096118396797809, + "grad_norm": 1.1949994564056396, + "learning_rate": 9.695552187784419e-05, + "loss": 1.651, + "step": 24190 + }, + { + "epoch": 5.098225101385158, + "grad_norm": 0.9967186450958252, + "learning_rate": 9.68893586986672e-05, + "loss": 1.6253, + "step": 24200 + }, + { + "epoch": 5.100331805972507, + "grad_norm": 1.130844235420227, + "learning_rate": 9.682319688248509e-05, + "loss": 1.5494, + "step": 24210 + }, + { + "epoch": 5.102438510559857, + "grad_norm": 1.107001781463623, + "learning_rate": 9.675703645828794e-05, + "loss": 1.6392, + "step": 24220 + }, + { + "epoch": 5.104545215147206, + "grad_norm": 1.0460437536239624, + "learning_rate": 9.669087745506545e-05, + "loss": 1.5718, + "step": 24230 + }, + { + "epoch": 5.1066519197345555, + "grad_norm": 1.0819109678268433, + "learning_rate": 9.662471990180657e-05, + "loss": 1.6356, + "step": 24240 + }, + { + "epoch": 5.108758624321904, + "grad_norm": 1.12892746925354, + "learning_rate": 9.655856382749976e-05, + "loss": 1.6667, + "step": 24250 + }, + { + "epoch": 5.110865328909254, + "grad_norm": 1.0783275365829468, + "learning_rate": 9.649240926113262e-05, + "loss": 1.6301, + "step": 24260 + }, + { + "epoch": 5.112972033496603, + "grad_norm": 1.098510980606079, + "learning_rate": 9.642625623169226e-05, + "loss": 1.5735, + "step": 24270 + }, + { + "epoch": 5.115078738083952, + "grad_norm": 1.0497877597808838, + "learning_rate": 9.636010476816504e-05, + "loss": 1.5672, + "step": 24280 + }, + { + "epoch": 5.1171854426713015, + "grad_norm": 1.0647207498550415, + "learning_rate": 9.629395489953669e-05, + "loss": 1.5622, + "step": 24290 + }, + { + "epoch": 5.11929214725865, + "grad_norm": 1.110742449760437, + "learning_rate": 9.622780665479222e-05, + "loss": 1.5973, + "step": 24300 + }, + { + "epoch": 5.121398851846, + "grad_norm": 1.070984959602356, + "learning_rate": 9.616166006291585e-05, + "loss": 1.6043, + "step": 24310 + }, + { + "epoch": 5.123505556433349, + "grad_norm": 1.0085152387619019, + "learning_rate": 9.609551515289116e-05, + "loss": 1.6405, + "step": 24320 + }, + { + "epoch": 5.125612261020699, + "grad_norm": 1.1093374490737915, + "learning_rate": 9.602937195370099e-05, + "loss": 1.5763, + "step": 24330 + }, + { + "epoch": 5.1277189656080475, + "grad_norm": 1.0516998767852783, + "learning_rate": 9.596323049432746e-05, + "loss": 1.6104, + "step": 24340 + }, + { + "epoch": 5.129825670195397, + "grad_norm": 1.1160739660263062, + "learning_rate": 9.589709080375179e-05, + "loss": 1.5914, + "step": 24350 + }, + { + "epoch": 5.131932374782746, + "grad_norm": 1.3322851657867432, + "learning_rate": 9.583095291095453e-05, + "loss": 1.5964, + "step": 24360 + }, + { + "epoch": 5.134039079370095, + "grad_norm": 1.0118359327316284, + "learning_rate": 9.576481684491548e-05, + "loss": 1.607, + "step": 24370 + }, + { + "epoch": 5.136145783957445, + "grad_norm": 1.0594619512557983, + "learning_rate": 9.569868263461361e-05, + "loss": 1.5388, + "step": 24380 + }, + { + "epoch": 5.1382524885447936, + "grad_norm": 1.0311650037765503, + "learning_rate": 9.563255030902697e-05, + "loss": 1.594, + "step": 24390 + }, + { + "epoch": 5.140359193132143, + "grad_norm": 1.0667462348937988, + "learning_rate": 9.556641989713288e-05, + "loss": 1.5542, + "step": 24400 + }, + { + "epoch": 5.142465897719492, + "grad_norm": 1.0168761014938354, + "learning_rate": 9.550029142790786e-05, + "loss": 1.6031, + "step": 24410 + }, + { + "epoch": 5.144572602306842, + "grad_norm": 1.099887728691101, + "learning_rate": 9.543416493032757e-05, + "loss": 1.6045, + "step": 24420 + }, + { + "epoch": 5.146679306894191, + "grad_norm": 1.058099389076233, + "learning_rate": 9.536804043336664e-05, + "loss": 1.6458, + "step": 24430 + }, + { + "epoch": 5.14878601148154, + "grad_norm": 1.0442464351654053, + "learning_rate": 9.5301917965999e-05, + "loss": 1.5772, + "step": 24440 + }, + { + "epoch": 5.150892716068889, + "grad_norm": 1.1567966938018799, + "learning_rate": 9.523579755719769e-05, + "loss": 1.5836, + "step": 24450 + }, + { + "epoch": 5.152999420656238, + "grad_norm": 1.0755870342254639, + "learning_rate": 9.516967923593479e-05, + "loss": 1.5786, + "step": 24460 + }, + { + "epoch": 5.155106125243588, + "grad_norm": 1.0643812417984009, + "learning_rate": 9.51035630311814e-05, + "loss": 1.6168, + "step": 24470 + }, + { + "epoch": 5.157212829830937, + "grad_norm": 1.075704574584961, + "learning_rate": 9.503744897190778e-05, + "loss": 1.6217, + "step": 24480 + }, + { + "epoch": 5.1593195344182865, + "grad_norm": 1.0636301040649414, + "learning_rate": 9.497133708708331e-05, + "loss": 1.5837, + "step": 24490 + }, + { + "epoch": 5.161426239005635, + "grad_norm": 1.1529415845870972, + "learning_rate": 9.490522740567633e-05, + "loss": 1.6462, + "step": 24500 + }, + { + "epoch": 5.163532943592985, + "grad_norm": 1.1077044010162354, + "learning_rate": 9.483911995665414e-05, + "loss": 1.6843, + "step": 24510 + }, + { + "epoch": 5.165639648180334, + "grad_norm": 1.245644450187683, + "learning_rate": 9.477301476898322e-05, + "loss": 1.6067, + "step": 24520 + }, + { + "epoch": 5.167746352767683, + "grad_norm": 1.0524848699569702, + "learning_rate": 9.470691187162897e-05, + "loss": 1.5735, + "step": 24530 + }, + { + "epoch": 5.1698530573550325, + "grad_norm": 1.0744348764419556, + "learning_rate": 9.464081129355586e-05, + "loss": 1.621, + "step": 24540 + }, + { + "epoch": 5.171959761942381, + "grad_norm": 1.111391544342041, + "learning_rate": 9.457471306372716e-05, + "loss": 1.6196, + "step": 24550 + }, + { + "epoch": 5.174066466529731, + "grad_norm": 1.1415176391601562, + "learning_rate": 9.450861721110534e-05, + "loss": 1.5828, + "step": 24560 + }, + { + "epoch": 5.17617317111708, + "grad_norm": 1.121319055557251, + "learning_rate": 9.444252376465171e-05, + "loss": 1.6659, + "step": 24570 + }, + { + "epoch": 5.17827987570443, + "grad_norm": 1.061152458190918, + "learning_rate": 9.437643275332653e-05, + "loss": 1.6155, + "step": 24580 + }, + { + "epoch": 5.180386580291779, + "grad_norm": 1.0673737525939941, + "learning_rate": 9.431034420608906e-05, + "loss": 1.6082, + "step": 24590 + }, + { + "epoch": 5.182493284879127, + "grad_norm": 1.1684726476669312, + "learning_rate": 9.424425815189733e-05, + "loss": 1.6265, + "step": 24600 + }, + { + "epoch": 5.184599989466477, + "grad_norm": 1.05269455909729, + "learning_rate": 9.417817461970844e-05, + "loss": 1.6044, + "step": 24610 + }, + { + "epoch": 5.186706694053826, + "grad_norm": 1.1382789611816406, + "learning_rate": 9.41120936384783e-05, + "loss": 1.5781, + "step": 24620 + }, + { + "epoch": 5.188813398641176, + "grad_norm": 1.5912727117538452, + "learning_rate": 9.404601523716175e-05, + "loss": 1.6607, + "step": 24630 + }, + { + "epoch": 5.190920103228525, + "grad_norm": 1.1234204769134521, + "learning_rate": 9.397993944471244e-05, + "loss": 1.6193, + "step": 24640 + }, + { + "epoch": 5.193026807815874, + "grad_norm": 1.079758882522583, + "learning_rate": 9.39138662900829e-05, + "loss": 1.6121, + "step": 24650 + }, + { + "epoch": 5.195133512403223, + "grad_norm": 1.2727773189544678, + "learning_rate": 9.384779580222453e-05, + "loss": 1.6537, + "step": 24660 + }, + { + "epoch": 5.197240216990573, + "grad_norm": 1.0580523014068604, + "learning_rate": 9.378172801008757e-05, + "loss": 1.6081, + "step": 24670 + }, + { + "epoch": 5.199346921577922, + "grad_norm": 1.1032179594039917, + "learning_rate": 9.371566294262101e-05, + "loss": 1.6044, + "step": 24680 + }, + { + "epoch": 5.201453626165271, + "grad_norm": 1.1013224124908447, + "learning_rate": 9.364960062877268e-05, + "loss": 1.6322, + "step": 24690 + }, + { + "epoch": 5.20356033075262, + "grad_norm": 1.0431034564971924, + "learning_rate": 9.358354109748926e-05, + "loss": 1.5924, + "step": 24700 + }, + { + "epoch": 5.205667035339969, + "grad_norm": 1.073081374168396, + "learning_rate": 9.351748437771615e-05, + "loss": 1.5925, + "step": 24710 + }, + { + "epoch": 5.207773739927319, + "grad_norm": 1.0321084260940552, + "learning_rate": 9.345143049839749e-05, + "loss": 1.5903, + "step": 24720 + }, + { + "epoch": 5.209880444514668, + "grad_norm": 1.0614155530929565, + "learning_rate": 9.338537948847626e-05, + "loss": 1.5674, + "step": 24730 + }, + { + "epoch": 5.2119871491020175, + "grad_norm": 1.030477523803711, + "learning_rate": 9.331933137689412e-05, + "loss": 1.6419, + "step": 24740 + }, + { + "epoch": 5.214093853689366, + "grad_norm": 1.0811430215835571, + "learning_rate": 9.325328619259151e-05, + "loss": 1.6482, + "step": 24750 + }, + { + "epoch": 5.216200558276715, + "grad_norm": 1.0927019119262695, + "learning_rate": 9.31872439645075e-05, + "loss": 1.5877, + "step": 24760 + }, + { + "epoch": 5.218307262864065, + "grad_norm": 1.027279019355774, + "learning_rate": 9.312120472157997e-05, + "loss": 1.6179, + "step": 24770 + }, + { + "epoch": 5.220413967451414, + "grad_norm": 1.262536644935608, + "learning_rate": 9.305516849274541e-05, + "loss": 1.5903, + "step": 24780 + }, + { + "epoch": 5.222520672038764, + "grad_norm": 1.244111180305481, + "learning_rate": 9.298913530693907e-05, + "loss": 1.6425, + "step": 24790 + }, + { + "epoch": 5.224627376626112, + "grad_norm": 1.0479352474212646, + "learning_rate": 9.292310519309475e-05, + "loss": 1.5862, + "step": 24800 + }, + { + "epoch": 5.226734081213462, + "grad_norm": 1.1215065717697144, + "learning_rate": 9.285707818014502e-05, + "loss": 1.5962, + "step": 24810 + }, + { + "epoch": 5.228840785800811, + "grad_norm": 1.1024200916290283, + "learning_rate": 9.279105429702103e-05, + "loss": 1.6951, + "step": 24820 + }, + { + "epoch": 5.230947490388161, + "grad_norm": 1.1429402828216553, + "learning_rate": 9.272503357265261e-05, + "loss": 1.6295, + "step": 24830 + }, + { + "epoch": 5.23305419497551, + "grad_norm": 1.1070572137832642, + "learning_rate": 9.265901603596811e-05, + "loss": 1.5736, + "step": 24840 + }, + { + "epoch": 5.2351608995628585, + "grad_norm": 1.0643175840377808, + "learning_rate": 9.259300171589456e-05, + "loss": 1.5951, + "step": 24850 + }, + { + "epoch": 5.237267604150208, + "grad_norm": 1.0902659893035889, + "learning_rate": 9.252699064135758e-05, + "loss": 1.6062, + "step": 24860 + }, + { + "epoch": 5.239374308737557, + "grad_norm": 1.0354982614517212, + "learning_rate": 9.246098284128133e-05, + "loss": 1.5459, + "step": 24870 + }, + { + "epoch": 5.241481013324907, + "grad_norm": 1.082728385925293, + "learning_rate": 9.239497834458861e-05, + "loss": 1.6218, + "step": 24880 + }, + { + "epoch": 5.243587717912256, + "grad_norm": 1.2707469463348389, + "learning_rate": 9.232897718020064e-05, + "loss": 1.6518, + "step": 24890 + }, + { + "epoch": 5.245694422499605, + "grad_norm": 1.0314414501190186, + "learning_rate": 9.226297937703728e-05, + "loss": 1.6052, + "step": 24900 + }, + { + "epoch": 5.247801127086954, + "grad_norm": 1.0788559913635254, + "learning_rate": 9.219698496401693e-05, + "loss": 1.5714, + "step": 24910 + }, + { + "epoch": 5.249907831674303, + "grad_norm": 1.1269972324371338, + "learning_rate": 9.213099397005646e-05, + "loss": 1.5576, + "step": 24920 + }, + { + "epoch": 5.252014536261653, + "grad_norm": 1.141788125038147, + "learning_rate": 9.206500642407123e-05, + "loss": 1.6976, + "step": 24930 + }, + { + "epoch": 5.254121240849002, + "grad_norm": 1.159085750579834, + "learning_rate": 9.199902235497511e-05, + "loss": 1.6217, + "step": 24940 + }, + { + "epoch": 5.256227945436351, + "grad_norm": 1.0742684602737427, + "learning_rate": 9.193304179168046e-05, + "loss": 1.6252, + "step": 24950 + }, + { + "epoch": 5.2583346500237, + "grad_norm": 1.231498122215271, + "learning_rate": 9.186706476309812e-05, + "loss": 1.6454, + "step": 24960 + }, + { + "epoch": 5.26044135461105, + "grad_norm": 1.103709101676941, + "learning_rate": 9.180109129813731e-05, + "loss": 1.629, + "step": 24970 + }, + { + "epoch": 5.262548059198399, + "grad_norm": 1.1012301445007324, + "learning_rate": 9.173512142570573e-05, + "loss": 1.6015, + "step": 24980 + }, + { + "epoch": 5.264654763785748, + "grad_norm": 1.0851894617080688, + "learning_rate": 9.166915517470953e-05, + "loss": 1.6225, + "step": 24990 + }, + { + "epoch": 5.266761468373097, + "grad_norm": 1.2022123336791992, + "learning_rate": 9.160319257405326e-05, + "loss": 1.6095, + "step": 25000 + }, + { + "epoch": 5.268868172960446, + "grad_norm": 1.1617381572723389, + "learning_rate": 9.153723365263983e-05, + "loss": 1.6482, + "step": 25010 + }, + { + "epoch": 5.270974877547796, + "grad_norm": 1.0850499868392944, + "learning_rate": 9.147127843937055e-05, + "loss": 1.5923, + "step": 25020 + }, + { + "epoch": 5.273081582135145, + "grad_norm": 1.1439275741577148, + "learning_rate": 9.14053269631452e-05, + "loss": 1.5755, + "step": 25030 + }, + { + "epoch": 5.275188286722495, + "grad_norm": 1.04306960105896, + "learning_rate": 9.13393792528618e-05, + "loss": 1.637, + "step": 25040 + }, + { + "epoch": 5.2772949913098435, + "grad_norm": 1.1085509061813354, + "learning_rate": 9.127343533741673e-05, + "loss": 1.7139, + "step": 25050 + }, + { + "epoch": 5.279401695897193, + "grad_norm": 1.081034779548645, + "learning_rate": 9.12074952457048e-05, + "loss": 1.5843, + "step": 25060 + }, + { + "epoch": 5.281508400484542, + "grad_norm": 1.0647776126861572, + "learning_rate": 9.114155900661906e-05, + "loss": 1.6279, + "step": 25070 + }, + { + "epoch": 5.283615105071891, + "grad_norm": 1.064026117324829, + "learning_rate": 9.107562664905093e-05, + "loss": 1.6156, + "step": 25080 + }, + { + "epoch": 5.285721809659241, + "grad_norm": 1.2240235805511475, + "learning_rate": 9.100969820189006e-05, + "loss": 1.629, + "step": 25090 + }, + { + "epoch": 5.2878285142465895, + "grad_norm": 1.0848407745361328, + "learning_rate": 9.094377369402444e-05, + "loss": 1.6134, + "step": 25100 + }, + { + "epoch": 5.289935218833939, + "grad_norm": 1.1111202239990234, + "learning_rate": 9.087785315434034e-05, + "loss": 1.6009, + "step": 25110 + }, + { + "epoch": 5.292041923421288, + "grad_norm": 1.1308156251907349, + "learning_rate": 9.081193661172224e-05, + "loss": 1.5953, + "step": 25120 + }, + { + "epoch": 5.294148628008638, + "grad_norm": 1.0957039594650269, + "learning_rate": 9.074602409505293e-05, + "loss": 1.59, + "step": 25130 + }, + { + "epoch": 5.296255332595987, + "grad_norm": 1.0466959476470947, + "learning_rate": 9.068011563321336e-05, + "loss": 1.5666, + "step": 25140 + }, + { + "epoch": 5.298362037183336, + "grad_norm": 1.1054922342300415, + "learning_rate": 9.061421125508279e-05, + "loss": 1.6434, + "step": 25150 + }, + { + "epoch": 5.300468741770685, + "grad_norm": 1.0235182046890259, + "learning_rate": 9.054831098953863e-05, + "loss": 1.6244, + "step": 25160 + }, + { + "epoch": 5.302575446358034, + "grad_norm": 1.0452880859375, + "learning_rate": 9.048241486545653e-05, + "loss": 1.6336, + "step": 25170 + }, + { + "epoch": 5.304682150945384, + "grad_norm": 1.25437331199646, + "learning_rate": 9.041652291171028e-05, + "loss": 1.5979, + "step": 25180 + }, + { + "epoch": 5.306788855532733, + "grad_norm": 1.1870005130767822, + "learning_rate": 9.035063515717185e-05, + "loss": 1.582, + "step": 25190 + }, + { + "epoch": 5.3088955601200825, + "grad_norm": 1.1450788974761963, + "learning_rate": 9.028475163071141e-05, + "loss": 1.6209, + "step": 25200 + }, + { + "epoch": 5.311002264707431, + "grad_norm": 1.0536702871322632, + "learning_rate": 9.021887236119729e-05, + "loss": 1.6007, + "step": 25210 + }, + { + "epoch": 5.313108969294781, + "grad_norm": 1.4930493831634521, + "learning_rate": 9.015299737749585e-05, + "loss": 1.627, + "step": 25220 + }, + { + "epoch": 5.31521567388213, + "grad_norm": 1.0780115127563477, + "learning_rate": 9.008712670847164e-05, + "loss": 1.5879, + "step": 25230 + }, + { + "epoch": 5.317322378469479, + "grad_norm": 1.0802395343780518, + "learning_rate": 9.002126038298736e-05, + "loss": 1.6739, + "step": 25240 + }, + { + "epoch": 5.3194290830568285, + "grad_norm": 1.0295484066009521, + "learning_rate": 8.995539842990376e-05, + "loss": 1.6355, + "step": 25250 + }, + { + "epoch": 5.321535787644177, + "grad_norm": 1.0127300024032593, + "learning_rate": 8.988954087807968e-05, + "loss": 1.6036, + "step": 25260 + }, + { + "epoch": 5.323642492231527, + "grad_norm": 1.171979308128357, + "learning_rate": 8.9823687756372e-05, + "loss": 1.5417, + "step": 25270 + }, + { + "epoch": 5.325749196818876, + "grad_norm": 1.1540085077285767, + "learning_rate": 8.975783909363571e-05, + "loss": 1.6326, + "step": 25280 + }, + { + "epoch": 5.327855901406226, + "grad_norm": 1.2152167558670044, + "learning_rate": 8.969199491872384e-05, + "loss": 1.5994, + "step": 25290 + }, + { + "epoch": 5.3299626059935745, + "grad_norm": 1.049820899963379, + "learning_rate": 8.962615526048742e-05, + "loss": 1.6396, + "step": 25300 + }, + { + "epoch": 5.332069310580923, + "grad_norm": 1.0768330097198486, + "learning_rate": 8.956032014777552e-05, + "loss": 1.6109, + "step": 25310 + }, + { + "epoch": 5.334176015168273, + "grad_norm": 1.1724385023117065, + "learning_rate": 8.949448960943524e-05, + "loss": 1.6675, + "step": 25320 + }, + { + "epoch": 5.336282719755622, + "grad_norm": 1.080008625984192, + "learning_rate": 8.942866367431166e-05, + "loss": 1.6312, + "step": 25330 + }, + { + "epoch": 5.338389424342972, + "grad_norm": 1.121904730796814, + "learning_rate": 8.936284237124778e-05, + "loss": 1.6396, + "step": 25340 + }, + { + "epoch": 5.3404961289303206, + "grad_norm": 0.990679144859314, + "learning_rate": 8.929702572908468e-05, + "loss": 1.5981, + "step": 25350 + }, + { + "epoch": 5.34260283351767, + "grad_norm": 1.048987865447998, + "learning_rate": 8.923121377666134e-05, + "loss": 1.6284, + "step": 25360 + }, + { + "epoch": 5.344709538105019, + "grad_norm": 1.0290054082870483, + "learning_rate": 8.916540654281469e-05, + "loss": 1.6364, + "step": 25370 + }, + { + "epoch": 5.346816242692368, + "grad_norm": 1.119303584098816, + "learning_rate": 8.909960405637958e-05, + "loss": 1.6075, + "step": 25380 + }, + { + "epoch": 5.348922947279718, + "grad_norm": 1.2384835481643677, + "learning_rate": 8.90338063461888e-05, + "loss": 1.5558, + "step": 25390 + }, + { + "epoch": 5.351029651867067, + "grad_norm": 1.0883729457855225, + "learning_rate": 8.896801344107302e-05, + "loss": 1.6098, + "step": 25400 + }, + { + "epoch": 5.353136356454416, + "grad_norm": 1.1402106285095215, + "learning_rate": 8.890222536986085e-05, + "loss": 1.6391, + "step": 25410 + }, + { + "epoch": 5.355243061041765, + "grad_norm": 1.0882619619369507, + "learning_rate": 8.88364421613788e-05, + "loss": 1.6, + "step": 25420 + }, + { + "epoch": 5.357349765629115, + "grad_norm": 1.1555756330490112, + "learning_rate": 8.877066384445114e-05, + "loss": 1.6376, + "step": 25430 + }, + { + "epoch": 5.359456470216464, + "grad_norm": 1.139312505722046, + "learning_rate": 8.870489044790006e-05, + "loss": 1.6167, + "step": 25440 + }, + { + "epoch": 5.3615631748038135, + "grad_norm": 1.1492891311645508, + "learning_rate": 8.86391220005456e-05, + "loss": 1.6342, + "step": 25450 + }, + { + "epoch": 5.363669879391162, + "grad_norm": 1.0864403247833252, + "learning_rate": 8.857335853120572e-05, + "loss": 1.6052, + "step": 25460 + }, + { + "epoch": 5.365776583978511, + "grad_norm": 1.119722604751587, + "learning_rate": 8.8507600068696e-05, + "loss": 1.6132, + "step": 25470 + }, + { + "epoch": 5.367883288565861, + "grad_norm": 1.1324105262756348, + "learning_rate": 8.844184664182993e-05, + "loss": 1.6156, + "step": 25480 + }, + { + "epoch": 5.36998999315321, + "grad_norm": 1.0984848737716675, + "learning_rate": 8.837609827941885e-05, + "loss": 1.6018, + "step": 25490 + }, + { + "epoch": 5.3720966977405595, + "grad_norm": 1.1173757314682007, + "learning_rate": 8.831035501027186e-05, + "loss": 1.5513, + "step": 25500 + }, + { + "epoch": 5.374203402327908, + "grad_norm": 1.0673028230667114, + "learning_rate": 8.82446168631957e-05, + "loss": 1.5594, + "step": 25510 + }, + { + "epoch": 5.376310106915258, + "grad_norm": 1.1615711450576782, + "learning_rate": 8.8178883866995e-05, + "loss": 1.6259, + "step": 25520 + }, + { + "epoch": 5.378416811502607, + "grad_norm": 1.045413851737976, + "learning_rate": 8.811315605047212e-05, + "loss": 1.6354, + "step": 25530 + }, + { + "epoch": 5.380523516089957, + "grad_norm": 1.2000606060028076, + "learning_rate": 8.804743344242715e-05, + "loss": 1.6617, + "step": 25540 + }, + { + "epoch": 5.382630220677306, + "grad_norm": 1.1387240886688232, + "learning_rate": 8.798171607165778e-05, + "loss": 1.6394, + "step": 25550 + }, + { + "epoch": 5.384736925264654, + "grad_norm": 1.3192468881607056, + "learning_rate": 8.791600396695954e-05, + "loss": 1.5966, + "step": 25560 + }, + { + "epoch": 5.386843629852004, + "grad_norm": 1.1031746864318848, + "learning_rate": 8.785029715712564e-05, + "loss": 1.5806, + "step": 25570 + }, + { + "epoch": 5.388950334439353, + "grad_norm": 1.1289430856704712, + "learning_rate": 8.778459567094696e-05, + "loss": 1.5852, + "step": 25580 + }, + { + "epoch": 5.391057039026703, + "grad_norm": 1.1894451379776, + "learning_rate": 8.771889953721193e-05, + "loss": 1.6102, + "step": 25590 + }, + { + "epoch": 5.393163743614052, + "grad_norm": 1.0416464805603027, + "learning_rate": 8.765320878470679e-05, + "loss": 1.626, + "step": 25600 + }, + { + "epoch": 5.395270448201401, + "grad_norm": 1.0740944147109985, + "learning_rate": 8.758752344221537e-05, + "loss": 1.6292, + "step": 25610 + }, + { + "epoch": 5.39737715278875, + "grad_norm": 1.1314433813095093, + "learning_rate": 8.752184353851916e-05, + "loss": 1.5899, + "step": 25620 + }, + { + "epoch": 5.399483857376099, + "grad_norm": 1.1938408613204956, + "learning_rate": 8.745616910239716e-05, + "loss": 1.6401, + "step": 25630 + }, + { + "epoch": 5.401590561963449, + "grad_norm": 1.1085186004638672, + "learning_rate": 8.739050016262604e-05, + "loss": 1.6419, + "step": 25640 + }, + { + "epoch": 5.403697266550798, + "grad_norm": 1.1080095767974854, + "learning_rate": 8.732483674798013e-05, + "loss": 1.6605, + "step": 25650 + }, + { + "epoch": 5.405803971138147, + "grad_norm": 1.1335920095443726, + "learning_rate": 8.72591788872313e-05, + "loss": 1.6491, + "step": 25660 + }, + { + "epoch": 5.407910675725496, + "grad_norm": 1.1067054271697998, + "learning_rate": 8.719352660914884e-05, + "loss": 1.6226, + "step": 25670 + }, + { + "epoch": 5.410017380312846, + "grad_norm": 1.1139326095581055, + "learning_rate": 8.712787994249979e-05, + "loss": 1.6116, + "step": 25680 + }, + { + "epoch": 5.412124084900195, + "grad_norm": 1.1228965520858765, + "learning_rate": 8.706223891604866e-05, + "loss": 1.583, + "step": 25690 + }, + { + "epoch": 5.414230789487544, + "grad_norm": 1.0639370679855347, + "learning_rate": 8.699660355855748e-05, + "loss": 1.5674, + "step": 25700 + }, + { + "epoch": 5.416337494074893, + "grad_norm": 1.1020694971084595, + "learning_rate": 8.693097389878584e-05, + "loss": 1.5689, + "step": 25710 + }, + { + "epoch": 5.418444198662242, + "grad_norm": 1.1313263177871704, + "learning_rate": 8.686534996549071e-05, + "loss": 1.6155, + "step": 25720 + }, + { + "epoch": 5.420550903249592, + "grad_norm": 1.2440193891525269, + "learning_rate": 8.679973178742668e-05, + "loss": 1.6607, + "step": 25730 + }, + { + "epoch": 5.422657607836941, + "grad_norm": 1.121762752532959, + "learning_rate": 8.673411939334581e-05, + "loss": 1.5493, + "step": 25740 + }, + { + "epoch": 5.424764312424291, + "grad_norm": 1.0492937564849854, + "learning_rate": 8.666851281199757e-05, + "loss": 1.6558, + "step": 25750 + }, + { + "epoch": 5.426871017011639, + "grad_norm": 1.2097651958465576, + "learning_rate": 8.660291207212882e-05, + "loss": 1.6651, + "step": 25760 + }, + { + "epoch": 5.428977721598989, + "grad_norm": 1.1282267570495605, + "learning_rate": 8.653731720248406e-05, + "loss": 1.603, + "step": 25770 + }, + { + "epoch": 5.431084426186338, + "grad_norm": 1.0867459774017334, + "learning_rate": 8.647172823180505e-05, + "loss": 1.5858, + "step": 25780 + }, + { + "epoch": 5.433191130773687, + "grad_norm": 1.1194344758987427, + "learning_rate": 8.640614518883105e-05, + "loss": 1.6352, + "step": 25790 + }, + { + "epoch": 5.435297835361037, + "grad_norm": 1.0908195972442627, + "learning_rate": 8.634056810229862e-05, + "loss": 1.68, + "step": 25800 + }, + { + "epoch": 5.4374045399483855, + "grad_norm": 1.1710094213485718, + "learning_rate": 8.627499700094183e-05, + "loss": 1.5755, + "step": 25810 + }, + { + "epoch": 5.439511244535735, + "grad_norm": 1.1145213842391968, + "learning_rate": 8.620943191349207e-05, + "loss": 1.6015, + "step": 25820 + }, + { + "epoch": 5.441617949123084, + "grad_norm": 1.1294487714767456, + "learning_rate": 8.614387286867814e-05, + "loss": 1.65, + "step": 25830 + }, + { + "epoch": 5.443724653710434, + "grad_norm": 1.099099040031433, + "learning_rate": 8.607831989522604e-05, + "loss": 1.5673, + "step": 25840 + }, + { + "epoch": 5.445831358297783, + "grad_norm": 1.1329737901687622, + "learning_rate": 8.601277302185932e-05, + "loss": 1.6757, + "step": 25850 + }, + { + "epoch": 5.447938062885132, + "grad_norm": 1.0884069204330444, + "learning_rate": 8.594723227729875e-05, + "loss": 1.5949, + "step": 25860 + }, + { + "epoch": 5.450044767472481, + "grad_norm": 1.1422146558761597, + "learning_rate": 8.588169769026244e-05, + "loss": 1.649, + "step": 25870 + }, + { + "epoch": 5.45215147205983, + "grad_norm": 1.214929223060608, + "learning_rate": 8.581616928946571e-05, + "loss": 1.6687, + "step": 25880 + }, + { + "epoch": 5.45425817664718, + "grad_norm": 0.945561945438385, + "learning_rate": 8.575064710362131e-05, + "loss": 1.6441, + "step": 25890 + }, + { + "epoch": 5.456364881234529, + "grad_norm": 1.2264434099197388, + "learning_rate": 8.568513116143919e-05, + "loss": 1.6985, + "step": 25900 + }, + { + "epoch": 5.458471585821878, + "grad_norm": 1.0190399885177612, + "learning_rate": 8.561962149162662e-05, + "loss": 1.5734, + "step": 25910 + }, + { + "epoch": 5.460578290409227, + "grad_norm": 1.111526608467102, + "learning_rate": 8.555411812288798e-05, + "loss": 1.6245, + "step": 25920 + }, + { + "epoch": 5.462684994996577, + "grad_norm": 1.1148344278335571, + "learning_rate": 8.548862108392506e-05, + "loss": 1.5885, + "step": 25930 + }, + { + "epoch": 5.464791699583926, + "grad_norm": 1.1234092712402344, + "learning_rate": 8.542313040343679e-05, + "loss": 1.6301, + "step": 25940 + }, + { + "epoch": 5.466898404171275, + "grad_norm": 1.1449817419052124, + "learning_rate": 8.535764611011938e-05, + "loss": 1.6448, + "step": 25950 + }, + { + "epoch": 5.4690051087586244, + "grad_norm": 1.0495420694351196, + "learning_rate": 8.529216823266606e-05, + "loss": 1.581, + "step": 25960 + }, + { + "epoch": 5.471111813345973, + "grad_norm": 1.131170630455017, + "learning_rate": 8.522669679976749e-05, + "loss": 1.6223, + "step": 25970 + }, + { + "epoch": 5.473218517933323, + "grad_norm": 1.1759270429611206, + "learning_rate": 8.516123184011135e-05, + "loss": 1.5741, + "step": 25980 + }, + { + "epoch": 5.475325222520672, + "grad_norm": 1.2842549085617065, + "learning_rate": 8.509577338238255e-05, + "loss": 1.5891, + "step": 25990 + }, + { + "epoch": 5.477431927108022, + "grad_norm": 1.1138408184051514, + "learning_rate": 8.503032145526314e-05, + "loss": 1.6362, + "step": 26000 + }, + { + "epoch": 5.4795386316953705, + "grad_norm": 1.0257008075714111, + "learning_rate": 8.496487608743225e-05, + "loss": 1.6618, + "step": 26010 + }, + { + "epoch": 5.481645336282719, + "grad_norm": 1.2054387331008911, + "learning_rate": 8.489943730756622e-05, + "loss": 1.683, + "step": 26020 + }, + { + "epoch": 5.483752040870069, + "grad_norm": 1.1666181087493896, + "learning_rate": 8.483400514433845e-05, + "loss": 1.6326, + "step": 26030 + }, + { + "epoch": 5.485858745457418, + "grad_norm": 1.123583197593689, + "learning_rate": 8.47685796264195e-05, + "loss": 1.6099, + "step": 26040 + }, + { + "epoch": 5.487965450044768, + "grad_norm": 1.0301923751831055, + "learning_rate": 8.470316078247694e-05, + "loss": 1.6046, + "step": 26050 + }, + { + "epoch": 5.4900721546321165, + "grad_norm": 1.0988959074020386, + "learning_rate": 8.463774864117542e-05, + "loss": 1.607, + "step": 26060 + }, + { + "epoch": 5.492178859219466, + "grad_norm": 1.058692216873169, + "learning_rate": 8.457234323117675e-05, + "loss": 1.6333, + "step": 26070 + }, + { + "epoch": 5.494285563806815, + "grad_norm": 1.0051920413970947, + "learning_rate": 8.450694458113969e-05, + "loss": 1.6068, + "step": 26080 + }, + { + "epoch": 5.496392268394165, + "grad_norm": 1.1012892723083496, + "learning_rate": 8.444155271972008e-05, + "loss": 1.6615, + "step": 26090 + }, + { + "epoch": 5.498498972981514, + "grad_norm": 1.1324644088745117, + "learning_rate": 8.437616767557077e-05, + "loss": 1.6613, + "step": 26100 + }, + { + "epoch": 5.5006056775688625, + "grad_norm": 1.0465189218521118, + "learning_rate": 8.431078947734164e-05, + "loss": 1.6052, + "step": 26110 + }, + { + "epoch": 5.502712382156212, + "grad_norm": 1.173836350440979, + "learning_rate": 8.424541815367957e-05, + "loss": 1.579, + "step": 26120 + }, + { + "epoch": 5.504819086743561, + "grad_norm": 1.185910940170288, + "learning_rate": 8.418005373322841e-05, + "loss": 1.5884, + "step": 26130 + }, + { + "epoch": 5.506925791330911, + "grad_norm": 1.1530224084854126, + "learning_rate": 8.4114696244629e-05, + "loss": 1.5754, + "step": 26140 + }, + { + "epoch": 5.50903249591826, + "grad_norm": 1.2175930738449097, + "learning_rate": 8.404934571651913e-05, + "loss": 1.6492, + "step": 26150 + }, + { + "epoch": 5.5111392005056095, + "grad_norm": 1.1386138200759888, + "learning_rate": 8.398400217753357e-05, + "loss": 1.572, + "step": 26160 + }, + { + "epoch": 5.513245905092958, + "grad_norm": 1.1737759113311768, + "learning_rate": 8.391866565630397e-05, + "loss": 1.5755, + "step": 26170 + }, + { + "epoch": 5.515352609680308, + "grad_norm": 0.9875378608703613, + "learning_rate": 8.385333618145896e-05, + "loss": 1.5388, + "step": 26180 + }, + { + "epoch": 5.517459314267657, + "grad_norm": 1.1286633014678955, + "learning_rate": 8.378801378162407e-05, + "loss": 1.5541, + "step": 26190 + }, + { + "epoch": 5.519566018855006, + "grad_norm": 1.0399701595306396, + "learning_rate": 8.372269848542172e-05, + "loss": 1.6503, + "step": 26200 + }, + { + "epoch": 5.5216727234423555, + "grad_norm": 1.1287180185317993, + "learning_rate": 8.36573903214712e-05, + "loss": 1.6099, + "step": 26210 + }, + { + "epoch": 5.523779428029704, + "grad_norm": 1.062724232673645, + "learning_rate": 8.359208931838871e-05, + "loss": 1.6084, + "step": 26220 + }, + { + "epoch": 5.525886132617054, + "grad_norm": 1.0824880599975586, + "learning_rate": 8.35267955047873e-05, + "loss": 1.6435, + "step": 26230 + }, + { + "epoch": 5.527992837204403, + "grad_norm": 1.113713264465332, + "learning_rate": 8.346150890927688e-05, + "loss": 1.6137, + "step": 26240 + }, + { + "epoch": 5.530099541791753, + "grad_norm": 1.0756171941757202, + "learning_rate": 8.339622956046417e-05, + "loss": 1.6287, + "step": 26250 + }, + { + "epoch": 5.5322062463791015, + "grad_norm": 1.0738415718078613, + "learning_rate": 8.333095748695271e-05, + "loss": 1.5887, + "step": 26260 + }, + { + "epoch": 5.53431295096645, + "grad_norm": 1.0821735858917236, + "learning_rate": 8.326569271734287e-05, + "loss": 1.6155, + "step": 26270 + }, + { + "epoch": 5.5364196555538, + "grad_norm": 1.0803221464157104, + "learning_rate": 8.320043528023188e-05, + "loss": 1.6377, + "step": 26280 + }, + { + "epoch": 5.538526360141149, + "grad_norm": 1.025138020515442, + "learning_rate": 8.313518520421366e-05, + "loss": 1.6606, + "step": 26290 + }, + { + "epoch": 5.540633064728499, + "grad_norm": 1.1567028760910034, + "learning_rate": 8.306994251787892e-05, + "loss": 1.6837, + "step": 26300 + }, + { + "epoch": 5.5427397693158476, + "grad_norm": 1.1138674020767212, + "learning_rate": 8.300470724981517e-05, + "loss": 1.6178, + "step": 26310 + }, + { + "epoch": 5.544846473903197, + "grad_norm": 1.135926365852356, + "learning_rate": 8.293947942860666e-05, + "loss": 1.6356, + "step": 26320 + }, + { + "epoch": 5.546953178490546, + "grad_norm": 1.0830113887786865, + "learning_rate": 8.287425908283442e-05, + "loss": 1.567, + "step": 26330 + }, + { + "epoch": 5.549059883077895, + "grad_norm": 1.042052149772644, + "learning_rate": 8.280904624107606e-05, + "loss": 1.6557, + "step": 26340 + }, + { + "epoch": 5.551166587665245, + "grad_norm": 1.1980870962142944, + "learning_rate": 8.274384093190605e-05, + "loss": 1.638, + "step": 26350 + }, + { + "epoch": 5.553273292252594, + "grad_norm": 1.0648488998413086, + "learning_rate": 8.267864318389549e-05, + "loss": 1.6735, + "step": 26360 + }, + { + "epoch": 5.555379996839943, + "grad_norm": 1.1223556995391846, + "learning_rate": 8.261345302561223e-05, + "loss": 1.5817, + "step": 26370 + }, + { + "epoch": 5.557486701427292, + "grad_norm": 1.0523229837417603, + "learning_rate": 8.254827048562069e-05, + "loss": 1.6265, + "step": 26380 + }, + { + "epoch": 5.559593406014642, + "grad_norm": 1.073630928993225, + "learning_rate": 8.248309559248203e-05, + "loss": 1.6052, + "step": 26390 + }, + { + "epoch": 5.561700110601991, + "grad_norm": 1.099674940109253, + "learning_rate": 8.241792837475405e-05, + "loss": 1.6322, + "step": 26400 + }, + { + "epoch": 5.56380681518934, + "grad_norm": 1.1496034860610962, + "learning_rate": 8.235276886099119e-05, + "loss": 1.6381, + "step": 26410 + }, + { + "epoch": 5.565913519776689, + "grad_norm": 1.0635652542114258, + "learning_rate": 8.228761707974445e-05, + "loss": 1.6508, + "step": 26420 + }, + { + "epoch": 5.568020224364038, + "grad_norm": 1.1200566291809082, + "learning_rate": 8.222247305956153e-05, + "loss": 1.6321, + "step": 26430 + }, + { + "epoch": 5.570126928951388, + "grad_norm": 1.3193455934524536, + "learning_rate": 8.215733682898669e-05, + "loss": 1.5639, + "step": 26440 + }, + { + "epoch": 5.572233633538737, + "grad_norm": 1.0898486375808716, + "learning_rate": 8.209220841656078e-05, + "loss": 1.6149, + "step": 26450 + }, + { + "epoch": 5.5743403381260865, + "grad_norm": 1.1833173036575317, + "learning_rate": 8.202708785082121e-05, + "loss": 1.6015, + "step": 26460 + }, + { + "epoch": 5.576447042713435, + "grad_norm": 1.1096851825714111, + "learning_rate": 8.196197516030198e-05, + "loss": 1.6056, + "step": 26470 + }, + { + "epoch": 5.578553747300785, + "grad_norm": 1.1732834577560425, + "learning_rate": 8.18968703735336e-05, + "loss": 1.6239, + "step": 26480 + }, + { + "epoch": 5.580660451888134, + "grad_norm": 1.0716811418533325, + "learning_rate": 8.183177351904318e-05, + "loss": 1.6712, + "step": 26490 + }, + { + "epoch": 5.582767156475484, + "grad_norm": 1.0455117225646973, + "learning_rate": 8.176668462535427e-05, + "loss": 1.6137, + "step": 26500 + }, + { + "epoch": 5.584873861062833, + "grad_norm": 1.180620789527893, + "learning_rate": 8.1701603720987e-05, + "loss": 1.5832, + "step": 26510 + }, + { + "epoch": 5.586980565650181, + "grad_norm": 1.1026450395584106, + "learning_rate": 8.163653083445799e-05, + "loss": 1.6063, + "step": 26520 + }, + { + "epoch": 5.589087270237531, + "grad_norm": 1.145521879196167, + "learning_rate": 8.157146599428028e-05, + "loss": 1.6038, + "step": 26530 + }, + { + "epoch": 5.59119397482488, + "grad_norm": 1.016776204109192, + "learning_rate": 8.150640922896356e-05, + "loss": 1.5613, + "step": 26540 + }, + { + "epoch": 5.59330067941223, + "grad_norm": 1.0819216966629028, + "learning_rate": 8.144136056701371e-05, + "loss": 1.5235, + "step": 26550 + }, + { + "epoch": 5.595407383999579, + "grad_norm": 1.1226952075958252, + "learning_rate": 8.137632003693329e-05, + "loss": 1.6309, + "step": 26560 + }, + { + "epoch": 5.597514088586928, + "grad_norm": 1.149537205696106, + "learning_rate": 8.131128766722117e-05, + "loss": 1.5602, + "step": 26570 + }, + { + "epoch": 5.599620793174277, + "grad_norm": 1.0373797416687012, + "learning_rate": 8.124626348637279e-05, + "loss": 1.6514, + "step": 26580 + }, + { + "epoch": 5.601727497761626, + "grad_norm": 1.1900537014007568, + "learning_rate": 8.118124752287979e-05, + "loss": 1.647, + "step": 26590 + }, + { + "epoch": 5.603834202348976, + "grad_norm": 1.0474778413772583, + "learning_rate": 8.111623980523035e-05, + "loss": 1.6313, + "step": 26600 + }, + { + "epoch": 5.605940906936325, + "grad_norm": 1.0889447927474976, + "learning_rate": 8.105124036190901e-05, + "loss": 1.5919, + "step": 26610 + }, + { + "epoch": 5.608047611523674, + "grad_norm": 1.520419716835022, + "learning_rate": 8.098624922139676e-05, + "loss": 1.6039, + "step": 26620 + }, + { + "epoch": 5.610154316111023, + "grad_norm": 1.1650199890136719, + "learning_rate": 8.092126641217076e-05, + "loss": 1.6202, + "step": 26630 + }, + { + "epoch": 5.612261020698373, + "grad_norm": 1.0928066968917847, + "learning_rate": 8.085629196270469e-05, + "loss": 1.6322, + "step": 26640 + }, + { + "epoch": 5.614367725285722, + "grad_norm": 1.0415284633636475, + "learning_rate": 8.079132590146847e-05, + "loss": 1.61, + "step": 26650 + }, + { + "epoch": 5.616474429873071, + "grad_norm": 1.1244189739227295, + "learning_rate": 8.072636825692849e-05, + "loss": 1.5919, + "step": 26660 + }, + { + "epoch": 5.61858113446042, + "grad_norm": 1.1185684204101562, + "learning_rate": 8.066141905754723e-05, + "loss": 1.6128, + "step": 26670 + }, + { + "epoch": 5.620687839047769, + "grad_norm": 1.0240288972854614, + "learning_rate": 8.059647833178363e-05, + "loss": 1.6269, + "step": 26680 + }, + { + "epoch": 5.622794543635119, + "grad_norm": 1.125137448310852, + "learning_rate": 8.053154610809285e-05, + "loss": 1.6419, + "step": 26690 + }, + { + "epoch": 5.624901248222468, + "grad_norm": 1.057777762413025, + "learning_rate": 8.046662241492645e-05, + "loss": 1.6237, + "step": 26700 + }, + { + "epoch": 5.627007952809818, + "grad_norm": 1.1793187856674194, + "learning_rate": 8.040170728073202e-05, + "loss": 1.6258, + "step": 26710 + }, + { + "epoch": 5.629114657397166, + "grad_norm": 1.0160819292068481, + "learning_rate": 8.03368007339536e-05, + "loss": 1.6346, + "step": 26720 + }, + { + "epoch": 5.631221361984515, + "grad_norm": 1.032878041267395, + "learning_rate": 8.027190280303137e-05, + "loss": 1.6404, + "step": 26730 + }, + { + "epoch": 5.633328066571865, + "grad_norm": 1.0659453868865967, + "learning_rate": 8.020701351640182e-05, + "loss": 1.6393, + "step": 26740 + }, + { + "epoch": 5.635434771159214, + "grad_norm": 1.0905574560165405, + "learning_rate": 8.014213290249751e-05, + "loss": 1.5401, + "step": 26750 + }, + { + "epoch": 5.637541475746564, + "grad_norm": 1.096535563468933, + "learning_rate": 8.007726098974734e-05, + "loss": 1.5796, + "step": 26760 + }, + { + "epoch": 5.6396481803339125, + "grad_norm": 1.1352156400680542, + "learning_rate": 8.00123978065763e-05, + "loss": 1.6769, + "step": 26770 + }, + { + "epoch": 5.641754884921262, + "grad_norm": 1.0784434080123901, + "learning_rate": 7.994754338140569e-05, + "loss": 1.6259, + "step": 26780 + }, + { + "epoch": 5.643861589508611, + "grad_norm": 1.0838322639465332, + "learning_rate": 7.988269774265278e-05, + "loss": 1.6178, + "step": 26790 + }, + { + "epoch": 5.64596829409596, + "grad_norm": 1.0983805656433105, + "learning_rate": 7.981786091873112e-05, + "loss": 1.6762, + "step": 26800 + }, + { + "epoch": 5.64807499868331, + "grad_norm": 1.1972914934158325, + "learning_rate": 7.975303293805035e-05, + "loss": 1.6466, + "step": 26810 + }, + { + "epoch": 5.6501817032706585, + "grad_norm": 1.0667072534561157, + "learning_rate": 7.96882138290163e-05, + "loss": 1.6487, + "step": 26820 + }, + { + "epoch": 5.652288407858008, + "grad_norm": 1.1564662456512451, + "learning_rate": 7.962340362003089e-05, + "loss": 1.5964, + "step": 26830 + }, + { + "epoch": 5.654395112445357, + "grad_norm": 1.1387529373168945, + "learning_rate": 7.9558602339492e-05, + "loss": 1.5973, + "step": 26840 + }, + { + "epoch": 5.656501817032707, + "grad_norm": 1.087855577468872, + "learning_rate": 7.949381001579378e-05, + "loss": 1.5811, + "step": 26850 + }, + { + "epoch": 5.658608521620056, + "grad_norm": 1.3254786729812622, + "learning_rate": 7.942902667732639e-05, + "loss": 1.5901, + "step": 26860 + }, + { + "epoch": 5.660715226207405, + "grad_norm": 1.0886156558990479, + "learning_rate": 7.936425235247611e-05, + "loss": 1.6013, + "step": 26870 + }, + { + "epoch": 5.662821930794754, + "grad_norm": 1.041137456893921, + "learning_rate": 7.929948706962508e-05, + "loss": 1.5307, + "step": 26880 + }, + { + "epoch": 5.664928635382104, + "grad_norm": 1.093061089515686, + "learning_rate": 7.92347308571517e-05, + "loss": 1.5949, + "step": 26890 + }, + { + "epoch": 5.667035339969453, + "grad_norm": 1.2626030445098877, + "learning_rate": 7.916998374343028e-05, + "loss": 1.6477, + "step": 26900 + }, + { + "epoch": 5.669142044556802, + "grad_norm": 1.0964906215667725, + "learning_rate": 7.910524575683122e-05, + "loss": 1.6141, + "step": 26910 + }, + { + "epoch": 5.6712487491441514, + "grad_norm": 1.0444769859313965, + "learning_rate": 7.904051692572079e-05, + "loss": 1.6259, + "step": 26920 + }, + { + "epoch": 5.6733554537315, + "grad_norm": 1.0373730659484863, + "learning_rate": 7.897579727846133e-05, + "loss": 1.6066, + "step": 26930 + }, + { + "epoch": 5.67546215831885, + "grad_norm": 1.141251802444458, + "learning_rate": 7.891108684341121e-05, + "loss": 1.6742, + "step": 26940 + }, + { + "epoch": 5.677568862906199, + "grad_norm": 1.1581555604934692, + "learning_rate": 7.884638564892472e-05, + "loss": 1.6523, + "step": 26950 + }, + { + "epoch": 5.679675567493549, + "grad_norm": 1.0533156394958496, + "learning_rate": 7.8781693723352e-05, + "loss": 1.6443, + "step": 26960 + }, + { + "epoch": 5.6817822720808975, + "grad_norm": 1.1277610063552856, + "learning_rate": 7.871701109503929e-05, + "loss": 1.62, + "step": 26970 + }, + { + "epoch": 5.683888976668246, + "grad_norm": 1.0494487285614014, + "learning_rate": 7.865233779232866e-05, + "loss": 1.6113, + "step": 26980 + }, + { + "epoch": 5.685995681255596, + "grad_norm": 1.0661777257919312, + "learning_rate": 7.858767384355815e-05, + "loss": 1.5868, + "step": 26990 + }, + { + "epoch": 5.688102385842945, + "grad_norm": 1.1448099613189697, + "learning_rate": 7.852301927706159e-05, + "loss": 1.6307, + "step": 27000 + }, + { + "epoch": 5.690209090430295, + "grad_norm": 1.0377132892608643, + "learning_rate": 7.845837412116883e-05, + "loss": 1.5962, + "step": 27010 + }, + { + "epoch": 5.6923157950176435, + "grad_norm": 1.0388470888137817, + "learning_rate": 7.839373840420554e-05, + "loss": 1.5914, + "step": 27020 + }, + { + "epoch": 5.694422499604993, + "grad_norm": 1.1136940717697144, + "learning_rate": 7.83291121544933e-05, + "loss": 1.6221, + "step": 27030 + }, + { + "epoch": 5.696529204192342, + "grad_norm": 1.113476037979126, + "learning_rate": 7.826449540034939e-05, + "loss": 1.6005, + "step": 27040 + }, + { + "epoch": 5.698635908779691, + "grad_norm": 1.1359562873840332, + "learning_rate": 7.819988817008713e-05, + "loss": 1.6283, + "step": 27050 + }, + { + "epoch": 5.700742613367041, + "grad_norm": 1.0916094779968262, + "learning_rate": 7.813529049201556e-05, + "loss": 1.5927, + "step": 27060 + }, + { + "epoch": 5.7028493179543895, + "grad_norm": 1.1190940141677856, + "learning_rate": 7.807070239443957e-05, + "loss": 1.6316, + "step": 27070 + }, + { + "epoch": 5.704956022541739, + "grad_norm": 1.1429566144943237, + "learning_rate": 7.800612390565974e-05, + "loss": 1.6386, + "step": 27080 + }, + { + "epoch": 5.707062727129088, + "grad_norm": 1.0969349145889282, + "learning_rate": 7.794155505397261e-05, + "loss": 1.572, + "step": 27090 + }, + { + "epoch": 5.709169431716438, + "grad_norm": 1.0643476247787476, + "learning_rate": 7.787699586767042e-05, + "loss": 1.5452, + "step": 27100 + }, + { + "epoch": 5.711276136303787, + "grad_norm": 1.1677190065383911, + "learning_rate": 7.781244637504114e-05, + "loss": 1.6587, + "step": 27110 + }, + { + "epoch": 5.713382840891136, + "grad_norm": 1.0919948816299438, + "learning_rate": 7.774790660436858e-05, + "loss": 1.6646, + "step": 27120 + }, + { + "epoch": 5.715489545478485, + "grad_norm": 1.1836225986480713, + "learning_rate": 7.768337658393216e-05, + "loss": 1.6362, + "step": 27130 + }, + { + "epoch": 5.717596250065834, + "grad_norm": 1.017246961593628, + "learning_rate": 7.761885634200717e-05, + "loss": 1.6277, + "step": 27140 + }, + { + "epoch": 5.719702954653184, + "grad_norm": 1.0970447063446045, + "learning_rate": 7.755434590686452e-05, + "loss": 1.5896, + "step": 27150 + }, + { + "epoch": 5.721809659240533, + "grad_norm": 1.150378704071045, + "learning_rate": 7.748984530677089e-05, + "loss": 1.6005, + "step": 27160 + }, + { + "epoch": 5.7239163638278825, + "grad_norm": 1.0526460409164429, + "learning_rate": 7.742535456998853e-05, + "loss": 1.6235, + "step": 27170 + }, + { + "epoch": 5.726023068415231, + "grad_norm": 1.0322532653808594, + "learning_rate": 7.736087372477554e-05, + "loss": 1.6019, + "step": 27180 + }, + { + "epoch": 5.728129773002581, + "grad_norm": 1.119554042816162, + "learning_rate": 7.729640279938555e-05, + "loss": 1.5993, + "step": 27190 + }, + { + "epoch": 5.73023647758993, + "grad_norm": 1.0483386516571045, + "learning_rate": 7.723194182206792e-05, + "loss": 1.6072, + "step": 27200 + }, + { + "epoch": 5.73234318217728, + "grad_norm": 1.065516710281372, + "learning_rate": 7.71674908210676e-05, + "loss": 1.6462, + "step": 27210 + }, + { + "epoch": 5.7344498867646285, + "grad_norm": 1.101328730583191, + "learning_rate": 7.710304982462519e-05, + "loss": 1.6684, + "step": 27220 + }, + { + "epoch": 5.736556591351977, + "grad_norm": 1.0765429735183716, + "learning_rate": 7.70386188609769e-05, + "loss": 1.6649, + "step": 27230 + }, + { + "epoch": 5.738663295939327, + "grad_norm": 1.154428243637085, + "learning_rate": 7.69741979583546e-05, + "loss": 1.6245, + "step": 27240 + }, + { + "epoch": 5.740770000526676, + "grad_norm": 1.134865164756775, + "learning_rate": 7.690978714498563e-05, + "loss": 1.5566, + "step": 27250 + }, + { + "epoch": 5.742876705114026, + "grad_norm": 1.0900312662124634, + "learning_rate": 7.684538644909302e-05, + "loss": 1.6537, + "step": 27260 + }, + { + "epoch": 5.7449834097013746, + "grad_norm": 1.1284072399139404, + "learning_rate": 7.678099589889534e-05, + "loss": 1.6081, + "step": 27270 + }, + { + "epoch": 5.747090114288724, + "grad_norm": 1.0693503618240356, + "learning_rate": 7.671661552260671e-05, + "loss": 1.5738, + "step": 27280 + }, + { + "epoch": 5.749196818876073, + "grad_norm": 1.5038498640060425, + "learning_rate": 7.665224534843673e-05, + "loss": 1.6403, + "step": 27290 + }, + { + "epoch": 5.751303523463422, + "grad_norm": 1.0032192468643188, + "learning_rate": 7.658788540459062e-05, + "loss": 1.6093, + "step": 27300 + }, + { + "epoch": 5.753410228050772, + "grad_norm": 1.174879550933838, + "learning_rate": 7.652353571926908e-05, + "loss": 1.5718, + "step": 27310 + }, + { + "epoch": 5.755516932638121, + "grad_norm": 1.0580400228500366, + "learning_rate": 7.645919632066833e-05, + "loss": 1.6227, + "step": 27320 + }, + { + "epoch": 5.75762363722547, + "grad_norm": 1.119176983833313, + "learning_rate": 7.639486723698006e-05, + "loss": 1.6271, + "step": 27330 + }, + { + "epoch": 5.759730341812819, + "grad_norm": 1.1590195894241333, + "learning_rate": 7.633054849639144e-05, + "loss": 1.6028, + "step": 27340 + }, + { + "epoch": 5.761837046400169, + "grad_norm": 1.1386713981628418, + "learning_rate": 7.62662401270851e-05, + "loss": 1.6229, + "step": 27350 + }, + { + "epoch": 5.763943750987518, + "grad_norm": 1.172624111175537, + "learning_rate": 7.620194215723919e-05, + "loss": 1.564, + "step": 27360 + }, + { + "epoch": 5.766050455574867, + "grad_norm": 1.1533126831054688, + "learning_rate": 7.613765461502724e-05, + "loss": 1.5977, + "step": 27370 + }, + { + "epoch": 5.768157160162216, + "grad_norm": 1.1233863830566406, + "learning_rate": 7.60733775286182e-05, + "loss": 1.616, + "step": 27380 + }, + { + "epoch": 5.770263864749565, + "grad_norm": 1.0440213680267334, + "learning_rate": 7.600911092617651e-05, + "loss": 1.5581, + "step": 27390 + }, + { + "epoch": 5.772370569336915, + "grad_norm": 1.2219536304473877, + "learning_rate": 7.594485483586193e-05, + "loss": 1.651, + "step": 27400 + }, + { + "epoch": 5.774477273924264, + "grad_norm": 1.1317471265792847, + "learning_rate": 7.588060928582971e-05, + "loss": 1.5883, + "step": 27410 + }, + { + "epoch": 5.7765839785116135, + "grad_norm": 1.0306445360183716, + "learning_rate": 7.581637430423037e-05, + "loss": 1.602, + "step": 27420 + }, + { + "epoch": 5.778690683098962, + "grad_norm": 1.0400179624557495, + "learning_rate": 7.575214991920987e-05, + "loss": 1.6392, + "step": 27430 + }, + { + "epoch": 5.780797387686311, + "grad_norm": 1.022694706916809, + "learning_rate": 7.568793615890954e-05, + "loss": 1.6382, + "step": 27440 + }, + { + "epoch": 5.782904092273661, + "grad_norm": 1.1518008708953857, + "learning_rate": 7.562373305146604e-05, + "loss": 1.6253, + "step": 27450 + }, + { + "epoch": 5.78501079686101, + "grad_norm": 1.0711225271224976, + "learning_rate": 7.55595406250113e-05, + "loss": 1.5917, + "step": 27460 + }, + { + "epoch": 5.78711750144836, + "grad_norm": 1.1197304725646973, + "learning_rate": 7.549535890767263e-05, + "loss": 1.6342, + "step": 27470 + }, + { + "epoch": 5.789224206035708, + "grad_norm": 1.1032601594924927, + "learning_rate": 7.543118792757266e-05, + "loss": 1.6164, + "step": 27480 + }, + { + "epoch": 5.791330910623058, + "grad_norm": 1.1401535272598267, + "learning_rate": 7.536702771282932e-05, + "loss": 1.5832, + "step": 27490 + }, + { + "epoch": 5.793437615210407, + "grad_norm": 1.1442997455596924, + "learning_rate": 7.530287829155574e-05, + "loss": 1.6285, + "step": 27500 + }, + { + "epoch": 5.795544319797756, + "grad_norm": 1.0326517820358276, + "learning_rate": 7.523873969186039e-05, + "loss": 1.5764, + "step": 27510 + }, + { + "epoch": 5.797651024385106, + "grad_norm": 1.0888991355895996, + "learning_rate": 7.517461194184699e-05, + "loss": 1.6155, + "step": 27520 + }, + { + "epoch": 5.7997577289724545, + "grad_norm": 1.1346875429153442, + "learning_rate": 7.511049506961454e-05, + "loss": 1.6698, + "step": 27530 + }, + { + "epoch": 5.801864433559804, + "grad_norm": 1.1853289604187012, + "learning_rate": 7.504638910325717e-05, + "loss": 1.6589, + "step": 27540 + }, + { + "epoch": 5.803971138147153, + "grad_norm": 1.0693249702453613, + "learning_rate": 7.498229407086432e-05, + "loss": 1.5959, + "step": 27550 + }, + { + "epoch": 5.806077842734503, + "grad_norm": 1.0473322868347168, + "learning_rate": 7.491821000052064e-05, + "loss": 1.6197, + "step": 27560 + }, + { + "epoch": 5.808184547321852, + "grad_norm": 1.1137909889221191, + "learning_rate": 7.485413692030596e-05, + "loss": 1.6887, + "step": 27570 + }, + { + "epoch": 5.810291251909201, + "grad_norm": 1.1461670398712158, + "learning_rate": 7.479007485829523e-05, + "loss": 1.611, + "step": 27580 + }, + { + "epoch": 5.81239795649655, + "grad_norm": 1.1173228025436401, + "learning_rate": 7.472602384255864e-05, + "loss": 1.5538, + "step": 27590 + }, + { + "epoch": 5.8145046610839, + "grad_norm": 1.0433090925216675, + "learning_rate": 7.466198390116158e-05, + "loss": 1.6569, + "step": 27600 + }, + { + "epoch": 5.816611365671249, + "grad_norm": 1.139930248260498, + "learning_rate": 7.45979550621645e-05, + "loss": 1.591, + "step": 27610 + }, + { + "epoch": 5.818718070258598, + "grad_norm": 1.1254889965057373, + "learning_rate": 7.453393735362302e-05, + "loss": 1.61, + "step": 27620 + }, + { + "epoch": 5.820824774845947, + "grad_norm": 1.256567358970642, + "learning_rate": 7.446993080358789e-05, + "loss": 1.6197, + "step": 27630 + }, + { + "epoch": 5.822931479433296, + "grad_norm": 1.20447838306427, + "learning_rate": 7.440593544010495e-05, + "loss": 1.5921, + "step": 27640 + }, + { + "epoch": 5.825038184020646, + "grad_norm": 1.09376859664917, + "learning_rate": 7.434195129121518e-05, + "loss": 1.642, + "step": 27650 + }, + { + "epoch": 5.827144888607995, + "grad_norm": 1.2998801469802856, + "learning_rate": 7.427797838495463e-05, + "loss": 1.606, + "step": 27660 + }, + { + "epoch": 5.829251593195345, + "grad_norm": 1.0475540161132812, + "learning_rate": 7.421401674935435e-05, + "loss": 1.6485, + "step": 27670 + }, + { + "epoch": 5.831358297782693, + "grad_norm": 1.0838693380355835, + "learning_rate": 7.415006641244057e-05, + "loss": 1.6273, + "step": 27680 + }, + { + "epoch": 5.833465002370042, + "grad_norm": 1.039505958557129, + "learning_rate": 7.408612740223448e-05, + "loss": 1.5898, + "step": 27690 + }, + { + "epoch": 5.835571706957392, + "grad_norm": 1.1330510377883911, + "learning_rate": 7.40221997467524e-05, + "loss": 1.6115, + "step": 27700 + }, + { + "epoch": 5.837678411544741, + "grad_norm": 1.0775504112243652, + "learning_rate": 7.395828347400555e-05, + "loss": 1.6676, + "step": 27710 + }, + { + "epoch": 5.839785116132091, + "grad_norm": 1.0811083316802979, + "learning_rate": 7.389437861200024e-05, + "loss": 1.6371, + "step": 27720 + }, + { + "epoch": 5.8418918207194395, + "grad_norm": 1.005247712135315, + "learning_rate": 7.383048518873778e-05, + "loss": 1.6141, + "step": 27730 + }, + { + "epoch": 5.843998525306789, + "grad_norm": 1.088276982307434, + "learning_rate": 7.376660323221449e-05, + "loss": 1.6159, + "step": 27740 + }, + { + "epoch": 5.846105229894138, + "grad_norm": 1.114585518836975, + "learning_rate": 7.370273277042156e-05, + "loss": 1.6081, + "step": 27750 + }, + { + "epoch": 5.848211934481487, + "grad_norm": 1.0871803760528564, + "learning_rate": 7.363887383134527e-05, + "loss": 1.6395, + "step": 27760 + }, + { + "epoch": 5.850318639068837, + "grad_norm": 1.176871657371521, + "learning_rate": 7.357502644296677e-05, + "loss": 1.601, + "step": 27770 + }, + { + "epoch": 5.8524253436561855, + "grad_norm": 1.0000860691070557, + "learning_rate": 7.35111906332622e-05, + "loss": 1.6131, + "step": 27780 + }, + { + "epoch": 5.854532048243535, + "grad_norm": 1.133975863456726, + "learning_rate": 7.344736643020256e-05, + "loss": 1.5859, + "step": 27790 + }, + { + "epoch": 5.856638752830884, + "grad_norm": 1.1784876585006714, + "learning_rate": 7.338355386175382e-05, + "loss": 1.6133, + "step": 27800 + }, + { + "epoch": 5.858745457418234, + "grad_norm": 1.029212474822998, + "learning_rate": 7.331975295587687e-05, + "loss": 1.6396, + "step": 27810 + }, + { + "epoch": 5.860852162005583, + "grad_norm": 1.0330170392990112, + "learning_rate": 7.325596374052743e-05, + "loss": 1.618, + "step": 27820 + }, + { + "epoch": 5.8629588665929315, + "grad_norm": 1.0614304542541504, + "learning_rate": 7.319218624365613e-05, + "loss": 1.6479, + "step": 27830 + }, + { + "epoch": 5.865065571180281, + "grad_norm": 1.1751939058303833, + "learning_rate": 7.312842049320844e-05, + "loss": 1.6193, + "step": 27840 + }, + { + "epoch": 5.86717227576763, + "grad_norm": 1.1354511976242065, + "learning_rate": 7.306466651712474e-05, + "loss": 1.5616, + "step": 27850 + }, + { + "epoch": 5.86927898035498, + "grad_norm": 1.0025415420532227, + "learning_rate": 7.30009243433402e-05, + "loss": 1.5796, + "step": 27860 + }, + { + "epoch": 5.871385684942329, + "grad_norm": 1.099372148513794, + "learning_rate": 7.293719399978482e-05, + "loss": 1.621, + "step": 27870 + }, + { + "epoch": 5.8734923895296784, + "grad_norm": 1.2273434400558472, + "learning_rate": 7.287347551438344e-05, + "loss": 1.6476, + "step": 27880 + }, + { + "epoch": 5.875599094117027, + "grad_norm": 1.0725783109664917, + "learning_rate": 7.280976891505569e-05, + "loss": 1.6296, + "step": 27890 + }, + { + "epoch": 5.877705798704377, + "grad_norm": 1.0418058633804321, + "learning_rate": 7.2746074229716e-05, + "loss": 1.5929, + "step": 27900 + }, + { + "epoch": 5.879812503291726, + "grad_norm": 1.028952717781067, + "learning_rate": 7.268239148627355e-05, + "loss": 1.6189, + "step": 27910 + }, + { + "epoch": 5.881919207879076, + "grad_norm": 1.1607900857925415, + "learning_rate": 7.261872071263236e-05, + "loss": 1.6088, + "step": 27920 + }, + { + "epoch": 5.8840259124664245, + "grad_norm": 1.19911789894104, + "learning_rate": 7.25550619366911e-05, + "loss": 1.5934, + "step": 27930 + }, + { + "epoch": 5.886132617053773, + "grad_norm": 1.0657827854156494, + "learning_rate": 7.249141518634327e-05, + "loss": 1.6062, + "step": 27940 + }, + { + "epoch": 5.888239321641123, + "grad_norm": 1.0900195837020874, + "learning_rate": 7.242778048947713e-05, + "loss": 1.6429, + "step": 27950 + }, + { + "epoch": 5.890346026228472, + "grad_norm": 1.1822017431259155, + "learning_rate": 7.236415787397548e-05, + "loss": 1.5911, + "step": 27960 + }, + { + "epoch": 5.892452730815822, + "grad_norm": 1.1533838510513306, + "learning_rate": 7.230054736771601e-05, + "loss": 1.619, + "step": 27970 + }, + { + "epoch": 5.8945594354031705, + "grad_norm": 0.9922227263450623, + "learning_rate": 7.223694899857103e-05, + "loss": 1.6274, + "step": 27980 + }, + { + "epoch": 5.89666613999052, + "grad_norm": 1.097437858581543, + "learning_rate": 7.217336279440761e-05, + "loss": 1.6355, + "step": 27990 + }, + { + "epoch": 5.898772844577869, + "grad_norm": 1.1534727811813354, + "learning_rate": 7.210978878308729e-05, + "loss": 1.629, + "step": 28000 + }, + { + "epoch": 5.900879549165218, + "grad_norm": 1.0968070030212402, + "learning_rate": 7.204622699246646e-05, + "loss": 1.5983, + "step": 28010 + }, + { + "epoch": 5.902986253752568, + "grad_norm": 1.0842479467391968, + "learning_rate": 7.198267745039612e-05, + "loss": 1.6278, + "step": 28020 + }, + { + "epoch": 5.9050929583399165, + "grad_norm": 1.136145830154419, + "learning_rate": 7.19191401847219e-05, + "loss": 1.5965, + "step": 28030 + }, + { + "epoch": 5.907199662927266, + "grad_norm": 1.0902760028839111, + "learning_rate": 7.185561522328395e-05, + "loss": 1.6764, + "step": 28040 + }, + { + "epoch": 5.909306367514615, + "grad_norm": 1.0746238231658936, + "learning_rate": 7.179210259391709e-05, + "loss": 1.6268, + "step": 28050 + }, + { + "epoch": 5.911413072101965, + "grad_norm": 0.9946722984313965, + "learning_rate": 7.172860232445084e-05, + "loss": 1.6565, + "step": 28060 + }, + { + "epoch": 5.913519776689314, + "grad_norm": 1.1601204872131348, + "learning_rate": 7.166511444270924e-05, + "loss": 1.6128, + "step": 28070 + }, + { + "epoch": 5.915626481276663, + "grad_norm": 1.1329623460769653, + "learning_rate": 7.160163897651075e-05, + "loss": 1.6341, + "step": 28080 + }, + { + "epoch": 5.917733185864012, + "grad_norm": 1.0780909061431885, + "learning_rate": 7.153817595366858e-05, + "loss": 1.6146, + "step": 28090 + }, + { + "epoch": 5.919839890451361, + "grad_norm": 1.1066025495529175, + "learning_rate": 7.147472540199046e-05, + "loss": 1.613, + "step": 28100 + }, + { + "epoch": 5.921946595038711, + "grad_norm": 1.0269134044647217, + "learning_rate": 7.141128734927863e-05, + "loss": 1.6617, + "step": 28110 + }, + { + "epoch": 5.92405329962606, + "grad_norm": 1.249660849571228, + "learning_rate": 7.134786182332978e-05, + "loss": 1.6327, + "step": 28120 + }, + { + "epoch": 5.9261600042134095, + "grad_norm": 0.9970906972885132, + "learning_rate": 7.128444885193518e-05, + "loss": 1.6346, + "step": 28130 + }, + { + "epoch": 5.928266708800758, + "grad_norm": 1.132813572883606, + "learning_rate": 7.122104846288064e-05, + "loss": 1.6349, + "step": 28140 + }, + { + "epoch": 5.930373413388107, + "grad_norm": 1.0950262546539307, + "learning_rate": 7.115766068394645e-05, + "loss": 1.6456, + "step": 28150 + }, + { + "epoch": 5.932480117975457, + "grad_norm": 1.1574265956878662, + "learning_rate": 7.109428554290725e-05, + "loss": 1.6806, + "step": 28160 + }, + { + "epoch": 5.934586822562806, + "grad_norm": 1.1822057962417603, + "learning_rate": 7.103092306753222e-05, + "loss": 1.6143, + "step": 28170 + }, + { + "epoch": 5.9366935271501555, + "grad_norm": 1.03468656539917, + "learning_rate": 7.096757328558506e-05, + "loss": 1.6355, + "step": 28180 + }, + { + "epoch": 5.938800231737504, + "grad_norm": 1.0852649211883545, + "learning_rate": 7.090423622482389e-05, + "loss": 1.6394, + "step": 28190 + }, + { + "epoch": 5.940906936324854, + "grad_norm": 1.0834746360778809, + "learning_rate": 7.08409119130011e-05, + "loss": 1.6541, + "step": 28200 + }, + { + "epoch": 5.943013640912203, + "grad_norm": 1.068300724029541, + "learning_rate": 7.077760037786365e-05, + "loss": 1.5601, + "step": 28210 + }, + { + "epoch": 5.945120345499553, + "grad_norm": 1.1912277936935425, + "learning_rate": 7.071430164715288e-05, + "loss": 1.6005, + "step": 28220 + }, + { + "epoch": 5.947227050086902, + "grad_norm": 1.164495825767517, + "learning_rate": 7.065101574860449e-05, + "loss": 1.5994, + "step": 28230 + }, + { + "epoch": 5.949333754674251, + "grad_norm": 1.1123220920562744, + "learning_rate": 7.058774270994862e-05, + "loss": 1.6172, + "step": 28240 + }, + { + "epoch": 5.9514404592616, + "grad_norm": 1.1080528497695923, + "learning_rate": 7.052448255890957e-05, + "loss": 1.6239, + "step": 28250 + }, + { + "epoch": 5.953547163848949, + "grad_norm": 1.1013644933700562, + "learning_rate": 7.04612353232063e-05, + "loss": 1.6834, + "step": 28260 + }, + { + "epoch": 5.955653868436299, + "grad_norm": 1.1154600381851196, + "learning_rate": 7.039800103055186e-05, + "loss": 1.5973, + "step": 28270 + }, + { + "epoch": 5.957760573023648, + "grad_norm": 1.1243196725845337, + "learning_rate": 7.033477970865381e-05, + "loss": 1.6159, + "step": 28280 + }, + { + "epoch": 5.959867277610997, + "grad_norm": 1.254236102104187, + "learning_rate": 7.027157138521383e-05, + "loss": 1.6784, + "step": 28290 + }, + { + "epoch": 5.961973982198346, + "grad_norm": 1.1903520822525024, + "learning_rate": 7.02083760879281e-05, + "loss": 1.6246, + "step": 28300 + }, + { + "epoch": 5.964080686785696, + "grad_norm": 1.1204966306686401, + "learning_rate": 7.014519384448696e-05, + "loss": 1.6253, + "step": 28310 + }, + { + "epoch": 5.966187391373045, + "grad_norm": 1.0842779874801636, + "learning_rate": 7.008202468257514e-05, + "loss": 1.6459, + "step": 28320 + }, + { + "epoch": 5.968294095960394, + "grad_norm": 1.201533317565918, + "learning_rate": 7.001886862987147e-05, + "loss": 1.5967, + "step": 28330 + }, + { + "epoch": 5.970400800547743, + "grad_norm": 1.0753484964370728, + "learning_rate": 6.995572571404923e-05, + "loss": 1.6426, + "step": 28340 + }, + { + "epoch": 5.972507505135092, + "grad_norm": 1.1155014038085938, + "learning_rate": 6.989259596277582e-05, + "loss": 1.6384, + "step": 28350 + }, + { + "epoch": 5.974614209722442, + "grad_norm": 1.0992554426193237, + "learning_rate": 6.982947940371293e-05, + "loss": 1.6077, + "step": 28360 + }, + { + "epoch": 5.976720914309791, + "grad_norm": 1.2092218399047852, + "learning_rate": 6.97663760645164e-05, + "loss": 1.6299, + "step": 28370 + }, + { + "epoch": 5.9788276188971405, + "grad_norm": 1.1124134063720703, + "learning_rate": 6.970328597283637e-05, + "loss": 1.629, + "step": 28380 + }, + { + "epoch": 5.980934323484489, + "grad_norm": 1.0230249166488647, + "learning_rate": 6.964020915631711e-05, + "loss": 1.6255, + "step": 28390 + }, + { + "epoch": 5.983041028071838, + "grad_norm": 1.0409824848175049, + "learning_rate": 6.957714564259712e-05, + "loss": 1.5886, + "step": 28400 + }, + { + "epoch": 5.985147732659188, + "grad_norm": 1.25199294090271, + "learning_rate": 6.951409545930895e-05, + "loss": 1.6144, + "step": 28410 + }, + { + "epoch": 5.987254437246537, + "grad_norm": 1.1676099300384521, + "learning_rate": 6.945105863407951e-05, + "loss": 1.5898, + "step": 28420 + }, + { + "epoch": 5.989361141833887, + "grad_norm": 1.007283329963684, + "learning_rate": 6.93880351945297e-05, + "loss": 1.6614, + "step": 28430 + }, + { + "epoch": 5.991467846421235, + "grad_norm": 1.2138159275054932, + "learning_rate": 6.932502516827461e-05, + "loss": 1.6746, + "step": 28440 + }, + { + "epoch": 5.993574551008585, + "grad_norm": 1.0807868242263794, + "learning_rate": 6.926202858292345e-05, + "loss": 1.6564, + "step": 28450 + }, + { + "epoch": 5.995681255595934, + "grad_norm": 1.0263831615447998, + "learning_rate": 6.919904546607954e-05, + "loss": 1.5506, + "step": 28460 + }, + { + "epoch": 5.997787960183283, + "grad_norm": 1.1143971681594849, + "learning_rate": 6.913607584534026e-05, + "loss": 1.6399, + "step": 28470 + }, + { + "epoch": 5.999894664770633, + "grad_norm": 1.2648664712905884, + "learning_rate": 6.907311974829716e-05, + "loss": 1.5995, + "step": 28480 + }, + { + "epoch": 6.0020013693579815, + "grad_norm": 1.0600049495697021, + "learning_rate": 6.901017720253583e-05, + "loss": 1.5734, + "step": 28490 + }, + { + "epoch": 6.004108073945331, + "grad_norm": 1.108899712562561, + "learning_rate": 6.894724823563583e-05, + "loss": 1.5201, + "step": 28500 + }, + { + "epoch": 6.00621477853268, + "grad_norm": 1.150876760482788, + "learning_rate": 6.888433287517088e-05, + "loss": 1.6381, + "step": 28510 + }, + { + "epoch": 6.00832148312003, + "grad_norm": 1.0762908458709717, + "learning_rate": 6.882143114870876e-05, + "loss": 1.5533, + "step": 28520 + }, + { + "epoch": 6.010428187707379, + "grad_norm": 1.0255169868469238, + "learning_rate": 6.875854308381118e-05, + "loss": 1.6005, + "step": 28530 + }, + { + "epoch": 6.012534892294728, + "grad_norm": 1.126582145690918, + "learning_rate": 6.869566870803388e-05, + "loss": 1.5961, + "step": 28540 + }, + { + "epoch": 6.014641596882077, + "grad_norm": 1.1807374954223633, + "learning_rate": 6.863280804892668e-05, + "loss": 1.5693, + "step": 28550 + }, + { + "epoch": 6.016748301469426, + "grad_norm": 1.0278984308242798, + "learning_rate": 6.85699611340333e-05, + "loss": 1.6046, + "step": 28560 + }, + { + "epoch": 6.018855006056776, + "grad_norm": 1.2131428718566895, + "learning_rate": 6.850712799089151e-05, + "loss": 1.527, + "step": 28570 + }, + { + "epoch": 6.020961710644125, + "grad_norm": 1.1807914972305298, + "learning_rate": 6.844430864703298e-05, + "loss": 1.6019, + "step": 28580 + }, + { + "epoch": 6.023068415231474, + "grad_norm": 1.1616885662078857, + "learning_rate": 6.838150312998338e-05, + "loss": 1.6801, + "step": 28590 + }, + { + "epoch": 6.025175119818823, + "grad_norm": 1.0600801706314087, + "learning_rate": 6.831871146726228e-05, + "loss": 1.5839, + "step": 28600 + }, + { + "epoch": 6.027281824406173, + "grad_norm": 1.0691717863082886, + "learning_rate": 6.825593368638327e-05, + "loss": 1.6165, + "step": 28610 + }, + { + "epoch": 6.029388528993522, + "grad_norm": 1.1156972646713257, + "learning_rate": 6.819316981485372e-05, + "loss": 1.5783, + "step": 28620 + }, + { + "epoch": 6.031495233580871, + "grad_norm": 1.1417192220687866, + "learning_rate": 6.813041988017501e-05, + "loss": 1.6113, + "step": 28630 + }, + { + "epoch": 6.03360193816822, + "grad_norm": 1.1809382438659668, + "learning_rate": 6.806768390984237e-05, + "loss": 1.6015, + "step": 28640 + }, + { + "epoch": 6.035708642755569, + "grad_norm": 1.2206168174743652, + "learning_rate": 6.800496193134498e-05, + "loss": 1.5658, + "step": 28650 + }, + { + "epoch": 6.037815347342919, + "grad_norm": 1.0852173566818237, + "learning_rate": 6.794225397216575e-05, + "loss": 1.5586, + "step": 28660 + }, + { + "epoch": 6.039922051930268, + "grad_norm": 1.072217583656311, + "learning_rate": 6.787956005978156e-05, + "loss": 1.5642, + "step": 28670 + }, + { + "epoch": 6.042028756517618, + "grad_norm": 1.183638334274292, + "learning_rate": 6.781688022166311e-05, + "loss": 1.6155, + "step": 28680 + }, + { + "epoch": 6.0441354611049665, + "grad_norm": 1.0388456583023071, + "learning_rate": 6.775421448527496e-05, + "loss": 1.5514, + "step": 28690 + }, + { + "epoch": 6.046242165692316, + "grad_norm": 1.0287905931472778, + "learning_rate": 6.76915628780754e-05, + "loss": 1.5245, + "step": 28700 + }, + { + "epoch": 6.048348870279665, + "grad_norm": 1.1689507961273193, + "learning_rate": 6.76289254275166e-05, + "loss": 1.5968, + "step": 28710 + }, + { + "epoch": 6.050455574867014, + "grad_norm": 1.067297101020813, + "learning_rate": 6.756630216104454e-05, + "loss": 1.6348, + "step": 28720 + }, + { + "epoch": 6.052562279454364, + "grad_norm": 1.0096595287322998, + "learning_rate": 6.750369310609894e-05, + "loss": 1.5733, + "step": 28730 + }, + { + "epoch": 6.0546689840417125, + "grad_norm": 1.132238745689392, + "learning_rate": 6.744109829011332e-05, + "loss": 1.5642, + "step": 28740 + }, + { + "epoch": 6.056775688629062, + "grad_norm": 1.286741018295288, + "learning_rate": 6.737851774051495e-05, + "loss": 1.6235, + "step": 28750 + }, + { + "epoch": 6.058882393216411, + "grad_norm": 1.1896122694015503, + "learning_rate": 6.731595148472485e-05, + "loss": 1.5754, + "step": 28760 + }, + { + "epoch": 6.060989097803761, + "grad_norm": 1.0230028629302979, + "learning_rate": 6.725339955015777e-05, + "loss": 1.5619, + "step": 28770 + }, + { + "epoch": 6.06309580239111, + "grad_norm": 1.1299915313720703, + "learning_rate": 6.719086196422225e-05, + "loss": 1.6199, + "step": 28780 + }, + { + "epoch": 6.0652025069784585, + "grad_norm": 1.1828107833862305, + "learning_rate": 6.712833875432038e-05, + "loss": 1.6065, + "step": 28790 + }, + { + "epoch": 6.067309211565808, + "grad_norm": 1.1760649681091309, + "learning_rate": 6.706582994784814e-05, + "loss": 1.5737, + "step": 28800 + }, + { + "epoch": 6.069415916153157, + "grad_norm": 1.1444995403289795, + "learning_rate": 6.700333557219511e-05, + "loss": 1.5816, + "step": 28810 + }, + { + "epoch": 6.071522620740507, + "grad_norm": 1.1875180006027222, + "learning_rate": 6.694085565474453e-05, + "loss": 1.5932, + "step": 28820 + }, + { + "epoch": 6.073629325327856, + "grad_norm": 1.281719446182251, + "learning_rate": 6.687839022287332e-05, + "loss": 1.6174, + "step": 28830 + }, + { + "epoch": 6.0757360299152054, + "grad_norm": 1.2580798864364624, + "learning_rate": 6.681593930395209e-05, + "loss": 1.5437, + "step": 28840 + }, + { + "epoch": 6.077842734502554, + "grad_norm": 1.21761155128479, + "learning_rate": 6.675350292534504e-05, + "loss": 1.6229, + "step": 28850 + }, + { + "epoch": 6.079949439089904, + "grad_norm": 1.111264944076538, + "learning_rate": 6.669108111441003e-05, + "loss": 1.5935, + "step": 28860 + }, + { + "epoch": 6.082056143677253, + "grad_norm": 1.094279408454895, + "learning_rate": 6.662867389849851e-05, + "loss": 1.5806, + "step": 28870 + }, + { + "epoch": 6.084162848264602, + "grad_norm": 1.3840709924697876, + "learning_rate": 6.656628130495558e-05, + "loss": 1.5542, + "step": 28880 + }, + { + "epoch": 6.0862695528519515, + "grad_norm": 1.1345621347427368, + "learning_rate": 6.650390336111989e-05, + "loss": 1.6027, + "step": 28890 + }, + { + "epoch": 6.0883762574393, + "grad_norm": 1.1738914251327515, + "learning_rate": 6.644154009432369e-05, + "loss": 1.5833, + "step": 28900 + }, + { + "epoch": 6.09048296202665, + "grad_norm": 1.157949686050415, + "learning_rate": 6.637919153189279e-05, + "loss": 1.5879, + "step": 28910 + }, + { + "epoch": 6.092589666613999, + "grad_norm": 1.1940367221832275, + "learning_rate": 6.631685770114654e-05, + "loss": 1.5931, + "step": 28920 + }, + { + "epoch": 6.094696371201349, + "grad_norm": 1.1197195053100586, + "learning_rate": 6.625453862939789e-05, + "loss": 1.6045, + "step": 28930 + }, + { + "epoch": 6.0968030757886975, + "grad_norm": 1.4872801303863525, + "learning_rate": 6.619223434395329e-05, + "loss": 1.5962, + "step": 28940 + }, + { + "epoch": 6.098909780376046, + "grad_norm": 1.1432135105133057, + "learning_rate": 6.61299448721127e-05, + "loss": 1.5848, + "step": 28950 + }, + { + "epoch": 6.101016484963396, + "grad_norm": 1.1360822916030884, + "learning_rate": 6.606767024116957e-05, + "loss": 1.5676, + "step": 28960 + }, + { + "epoch": 6.103123189550745, + "grad_norm": 1.1027390956878662, + "learning_rate": 6.600541047841093e-05, + "loss": 1.6078, + "step": 28970 + }, + { + "epoch": 6.105229894138095, + "grad_norm": 1.1606420278549194, + "learning_rate": 6.594316561111724e-05, + "loss": 1.5931, + "step": 28980 + }, + { + "epoch": 6.1073365987254435, + "grad_norm": 1.0428427457809448, + "learning_rate": 6.588093566656238e-05, + "loss": 1.5535, + "step": 28990 + }, + { + "epoch": 6.109443303312793, + "grad_norm": 1.0354915857315063, + "learning_rate": 6.581872067201378e-05, + "loss": 1.595, + "step": 29000 + }, + { + "epoch": 6.111550007900142, + "grad_norm": 1.0693248510360718, + "learning_rate": 6.57565206547323e-05, + "loss": 1.5438, + "step": 29010 + }, + { + "epoch": 6.113656712487492, + "grad_norm": 1.1684085130691528, + "learning_rate": 6.569433564197222e-05, + "loss": 1.6196, + "step": 29020 + }, + { + "epoch": 6.115763417074841, + "grad_norm": 1.12764310836792, + "learning_rate": 6.563216566098121e-05, + "loss": 1.501, + "step": 29030 + }, + { + "epoch": 6.11787012166219, + "grad_norm": 1.1445949077606201, + "learning_rate": 6.557001073900044e-05, + "loss": 1.5407, + "step": 29040 + }, + { + "epoch": 6.119976826249539, + "grad_norm": 1.206442952156067, + "learning_rate": 6.55078709032644e-05, + "loss": 1.6259, + "step": 29050 + }, + { + "epoch": 6.122083530836888, + "grad_norm": 1.1316516399383545, + "learning_rate": 6.544574618100102e-05, + "loss": 1.6372, + "step": 29060 + }, + { + "epoch": 6.124190235424238, + "grad_norm": 1.1655209064483643, + "learning_rate": 6.538363659943162e-05, + "loss": 1.6235, + "step": 29070 + }, + { + "epoch": 6.126296940011587, + "grad_norm": 1.2157286405563354, + "learning_rate": 6.53215421857708e-05, + "loss": 1.5969, + "step": 29080 + }, + { + "epoch": 6.1284036445989365, + "grad_norm": 1.1558120250701904, + "learning_rate": 6.525946296722659e-05, + "loss": 1.5945, + "step": 29090 + }, + { + "epoch": 6.130510349186285, + "grad_norm": 1.1660462617874146, + "learning_rate": 6.519739897100034e-05, + "loss": 1.6076, + "step": 29100 + }, + { + "epoch": 6.132617053773634, + "grad_norm": 1.1626811027526855, + "learning_rate": 6.51353502242868e-05, + "loss": 1.5999, + "step": 29110 + }, + { + "epoch": 6.134723758360984, + "grad_norm": 1.0529296398162842, + "learning_rate": 6.507331675427387e-05, + "loss": 1.5767, + "step": 29120 + }, + { + "epoch": 6.136830462948333, + "grad_norm": 1.1551191806793213, + "learning_rate": 6.50112985881429e-05, + "loss": 1.6153, + "step": 29130 + }, + { + "epoch": 6.1389371675356825, + "grad_norm": 1.0937594175338745, + "learning_rate": 6.494929575306848e-05, + "loss": 1.5513, + "step": 29140 + }, + { + "epoch": 6.141043872123031, + "grad_norm": 1.1500444412231445, + "learning_rate": 6.488730827621856e-05, + "loss": 1.5965, + "step": 29150 + }, + { + "epoch": 6.143150576710381, + "grad_norm": 1.208426594734192, + "learning_rate": 6.482533618475422e-05, + "loss": 1.4839, + "step": 29160 + }, + { + "epoch": 6.14525728129773, + "grad_norm": 1.1101106405258179, + "learning_rate": 6.476337950582987e-05, + "loss": 1.5582, + "step": 29170 + }, + { + "epoch": 6.147363985885079, + "grad_norm": 1.1369212865829468, + "learning_rate": 6.470143826659317e-05, + "loss": 1.6388, + "step": 29180 + }, + { + "epoch": 6.149470690472429, + "grad_norm": 1.0529881715774536, + "learning_rate": 6.46395124941851e-05, + "loss": 1.6363, + "step": 29190 + }, + { + "epoch": 6.151577395059777, + "grad_norm": 1.0460222959518433, + "learning_rate": 6.457760221573968e-05, + "loss": 1.5687, + "step": 29200 + }, + { + "epoch": 6.153684099647127, + "grad_norm": 1.164514183998108, + "learning_rate": 6.451570745838426e-05, + "loss": 1.5877, + "step": 29210 + }, + { + "epoch": 6.155790804234476, + "grad_norm": 1.1551470756530762, + "learning_rate": 6.445382824923938e-05, + "loss": 1.618, + "step": 29220 + }, + { + "epoch": 6.157897508821826, + "grad_norm": 1.1310980319976807, + "learning_rate": 6.43919646154188e-05, + "loss": 1.5745, + "step": 29230 + }, + { + "epoch": 6.160004213409175, + "grad_norm": 1.1333801746368408, + "learning_rate": 6.433011658402933e-05, + "loss": 1.5899, + "step": 29240 + }, + { + "epoch": 6.162110917996524, + "grad_norm": 1.0840401649475098, + "learning_rate": 6.426828418217104e-05, + "loss": 1.5188, + "step": 29250 + }, + { + "epoch": 6.164217622583873, + "grad_norm": 1.150305986404419, + "learning_rate": 6.420646743693714e-05, + "loss": 1.5772, + "step": 29260 + }, + { + "epoch": 6.166324327171222, + "grad_norm": 1.1294101476669312, + "learning_rate": 6.414466637541405e-05, + "loss": 1.5676, + "step": 29270 + }, + { + "epoch": 6.168431031758572, + "grad_norm": 1.1302536725997925, + "learning_rate": 6.408288102468113e-05, + "loss": 1.6208, + "step": 29280 + }, + { + "epoch": 6.170537736345921, + "grad_norm": 1.1266132593154907, + "learning_rate": 6.402111141181101e-05, + "loss": 1.6378, + "step": 29290 + }, + { + "epoch": 6.17264444093327, + "grad_norm": 1.1256543397903442, + "learning_rate": 6.39593575638694e-05, + "loss": 1.6542, + "step": 29300 + }, + { + "epoch": 6.174751145520619, + "grad_norm": 1.1547867059707642, + "learning_rate": 6.38976195079151e-05, + "loss": 1.6013, + "step": 29310 + }, + { + "epoch": 6.176857850107969, + "grad_norm": 1.083693504333496, + "learning_rate": 6.38358972709999e-05, + "loss": 1.6854, + "step": 29320 + }, + { + "epoch": 6.178964554695318, + "grad_norm": 1.1603076457977295, + "learning_rate": 6.377419088016881e-05, + "loss": 1.6129, + "step": 29330 + }, + { + "epoch": 6.1810712592826675, + "grad_norm": 1.0910979509353638, + "learning_rate": 6.371250036245976e-05, + "loss": 1.5678, + "step": 29340 + }, + { + "epoch": 6.183177963870016, + "grad_norm": 1.075230598449707, + "learning_rate": 6.365082574490384e-05, + "loss": 1.5757, + "step": 29350 + }, + { + "epoch": 6.185284668457365, + "grad_norm": 1.2227821350097656, + "learning_rate": 6.358916705452514e-05, + "loss": 1.5971, + "step": 29360 + }, + { + "epoch": 6.187391373044715, + "grad_norm": 1.148966908454895, + "learning_rate": 6.352752431834063e-05, + "loss": 1.5922, + "step": 29370 + }, + { + "epoch": 6.189498077632064, + "grad_norm": 1.168792963027954, + "learning_rate": 6.34658975633605e-05, + "loss": 1.5615, + "step": 29380 + }, + { + "epoch": 6.191604782219414, + "grad_norm": 1.1150977611541748, + "learning_rate": 6.340428681658783e-05, + "loss": 1.6151, + "step": 29390 + }, + { + "epoch": 6.193711486806762, + "grad_norm": 1.0817590951919556, + "learning_rate": 6.334269210501875e-05, + "loss": 1.6724, + "step": 29400 + }, + { + "epoch": 6.195818191394112, + "grad_norm": 1.126131296157837, + "learning_rate": 6.328111345564221e-05, + "loss": 1.5664, + "step": 29410 + }, + { + "epoch": 6.197924895981461, + "grad_norm": 1.2539746761322021, + "learning_rate": 6.321955089544029e-05, + "loss": 1.5707, + "step": 29420 + }, + { + "epoch": 6.20003160056881, + "grad_norm": 1.1847485303878784, + "learning_rate": 6.315800445138796e-05, + "loss": 1.5636, + "step": 29430 + }, + { + "epoch": 6.20213830515616, + "grad_norm": 1.1532986164093018, + "learning_rate": 6.309647415045315e-05, + "loss": 1.5578, + "step": 29440 + }, + { + "epoch": 6.2042450097435085, + "grad_norm": 1.2231844663619995, + "learning_rate": 6.30349600195966e-05, + "loss": 1.5815, + "step": 29450 + }, + { + "epoch": 6.206351714330858, + "grad_norm": 1.1287641525268555, + "learning_rate": 6.297346208577213e-05, + "loss": 1.5534, + "step": 29460 + }, + { + "epoch": 6.208458418918207, + "grad_norm": 1.108749270439148, + "learning_rate": 6.291198037592639e-05, + "loss": 1.5468, + "step": 29470 + }, + { + "epoch": 6.210565123505557, + "grad_norm": 1.1339576244354248, + "learning_rate": 6.285051491699896e-05, + "loss": 1.5976, + "step": 29480 + }, + { + "epoch": 6.212671828092906, + "grad_norm": 1.1239781379699707, + "learning_rate": 6.278906573592213e-05, + "loss": 1.6005, + "step": 29490 + }, + { + "epoch": 6.2147785326802545, + "grad_norm": 1.057753562927246, + "learning_rate": 6.272763285962129e-05, + "loss": 1.5794, + "step": 29500 + }, + { + "epoch": 6.216885237267604, + "grad_norm": 1.1971989870071411, + "learning_rate": 6.266621631501457e-05, + "loss": 1.574, + "step": 29510 + }, + { + "epoch": 6.218991941854953, + "grad_norm": 1.1916954517364502, + "learning_rate": 6.260481612901299e-05, + "loss": 1.5978, + "step": 29520 + }, + { + "epoch": 6.221098646442303, + "grad_norm": 1.0900837182998657, + "learning_rate": 6.254343232852027e-05, + "loss": 1.5698, + "step": 29530 + }, + { + "epoch": 6.223205351029652, + "grad_norm": 1.1096452474594116, + "learning_rate": 6.248206494043313e-05, + "loss": 1.5639, + "step": 29540 + }, + { + "epoch": 6.225312055617001, + "grad_norm": 1.263198733329773, + "learning_rate": 6.2420713991641e-05, + "loss": 1.5872, + "step": 29550 + }, + { + "epoch": 6.22741876020435, + "grad_norm": 1.0684478282928467, + "learning_rate": 6.235937950902615e-05, + "loss": 1.6383, + "step": 29560 + }, + { + "epoch": 6.2295254647917, + "grad_norm": 1.1642452478408813, + "learning_rate": 6.229806151946353e-05, + "loss": 1.6017, + "step": 29570 + }, + { + "epoch": 6.231632169379049, + "grad_norm": 1.056420922279358, + "learning_rate": 6.223676004982105e-05, + "loss": 1.5695, + "step": 29580 + }, + { + "epoch": 6.233738873966398, + "grad_norm": 1.1780263185501099, + "learning_rate": 6.217547512695919e-05, + "loss": 1.6149, + "step": 29590 + }, + { + "epoch": 6.235845578553747, + "grad_norm": 1.1113979816436768, + "learning_rate": 6.211420677773131e-05, + "loss": 1.6249, + "step": 29600 + }, + { + "epoch": 6.237952283141096, + "grad_norm": 1.2237204313278198, + "learning_rate": 6.205295502898348e-05, + "loss": 1.6159, + "step": 29610 + }, + { + "epoch": 6.240058987728446, + "grad_norm": 1.144103765487671, + "learning_rate": 6.199171990755441e-05, + "loss": 1.6249, + "step": 29620 + }, + { + "epoch": 6.242165692315795, + "grad_norm": 1.0986157655715942, + "learning_rate": 6.193050144027565e-05, + "loss": 1.5917, + "step": 29630 + }, + { + "epoch": 6.244272396903145, + "grad_norm": 1.1357136964797974, + "learning_rate": 6.18692996539714e-05, + "loss": 1.634, + "step": 29640 + }, + { + "epoch": 6.2463791014904935, + "grad_norm": 1.0918197631835938, + "learning_rate": 6.180811457545852e-05, + "loss": 1.5904, + "step": 29650 + }, + { + "epoch": 6.248485806077842, + "grad_norm": 1.162324070930481, + "learning_rate": 6.174694623154658e-05, + "loss": 1.6446, + "step": 29660 + }, + { + "epoch": 6.250592510665192, + "grad_norm": 1.066695213317871, + "learning_rate": 6.168579464903779e-05, + "loss": 1.592, + "step": 29670 + }, + { + "epoch": 6.252699215252541, + "grad_norm": 1.1125965118408203, + "learning_rate": 6.16246598547271e-05, + "loss": 1.5864, + "step": 29680 + }, + { + "epoch": 6.254805919839891, + "grad_norm": 1.1239444017410278, + "learning_rate": 6.1563541875402e-05, + "loss": 1.5655, + "step": 29690 + }, + { + "epoch": 6.2569126244272395, + "grad_norm": 1.1455018520355225, + "learning_rate": 6.150244073784266e-05, + "loss": 1.6228, + "step": 29700 + }, + { + "epoch": 6.259019329014589, + "grad_norm": 1.1525976657867432, + "learning_rate": 6.144135646882188e-05, + "loss": 1.6001, + "step": 29710 + }, + { + "epoch": 6.261126033601938, + "grad_norm": 1.305464506149292, + "learning_rate": 6.138028909510503e-05, + "loss": 1.5917, + "step": 29720 + }, + { + "epoch": 6.263232738189288, + "grad_norm": 1.2044486999511719, + "learning_rate": 6.131923864345012e-05, + "loss": 1.6158, + "step": 29730 + }, + { + "epoch": 6.265339442776637, + "grad_norm": 1.2145870923995972, + "learning_rate": 6.125820514060772e-05, + "loss": 1.6551, + "step": 29740 + }, + { + "epoch": 6.2674461473639855, + "grad_norm": 1.1999444961547852, + "learning_rate": 6.119718861332098e-05, + "loss": 1.6351, + "step": 29750 + }, + { + "epoch": 6.269552851951335, + "grad_norm": 1.1321429014205933, + "learning_rate": 6.113618908832561e-05, + "loss": 1.6331, + "step": 29760 + }, + { + "epoch": 6.271659556538684, + "grad_norm": 1.1454631090164185, + "learning_rate": 6.107520659234988e-05, + "loss": 1.5784, + "step": 29770 + }, + { + "epoch": 6.273766261126034, + "grad_norm": 1.2279682159423828, + "learning_rate": 6.101424115211458e-05, + "loss": 1.5881, + "step": 29780 + }, + { + "epoch": 6.275872965713383, + "grad_norm": 1.215173363685608, + "learning_rate": 6.095329279433304e-05, + "loss": 1.5792, + "step": 29790 + }, + { + "epoch": 6.2779796703007325, + "grad_norm": 1.0732548236846924, + "learning_rate": 6.089236154571109e-05, + "loss": 1.6326, + "step": 29800 + }, + { + "epoch": 6.280086374888081, + "grad_norm": 1.3970751762390137, + "learning_rate": 6.083144743294713e-05, + "loss": 1.5841, + "step": 29810 + }, + { + "epoch": 6.28219307947543, + "grad_norm": 1.1148728132247925, + "learning_rate": 6.0770550482731924e-05, + "loss": 1.6009, + "step": 29820 + }, + { + "epoch": 6.28429978406278, + "grad_norm": 1.209598422050476, + "learning_rate": 6.070967072174881e-05, + "loss": 1.5648, + "step": 29830 + }, + { + "epoch": 6.286406488650129, + "grad_norm": 1.1604822874069214, + "learning_rate": 6.0648808176673586e-05, + "loss": 1.6543, + "step": 29840 + }, + { + "epoch": 6.2885131932374785, + "grad_norm": 1.1224958896636963, + "learning_rate": 6.058796287417451e-05, + "loss": 1.622, + "step": 29850 + }, + { + "epoch": 6.290619897824827, + "grad_norm": 1.227657675743103, + "learning_rate": 6.0527134840912224e-05, + "loss": 1.633, + "step": 29860 + }, + { + "epoch": 6.292726602412177, + "grad_norm": 1.1823822259902954, + "learning_rate": 6.046632410353987e-05, + "loss": 1.5963, + "step": 29870 + }, + { + "epoch": 6.294833306999526, + "grad_norm": 1.0884348154067993, + "learning_rate": 6.0405530688702986e-05, + "loss": 1.5579, + "step": 29880 + }, + { + "epoch": 6.296940011586875, + "grad_norm": 1.2381103038787842, + "learning_rate": 6.034475462303952e-05, + "loss": 1.5499, + "step": 29890 + }, + { + "epoch": 6.2990467161742245, + "grad_norm": 1.1996432542800903, + "learning_rate": 6.028399593317984e-05, + "loss": 1.6038, + "step": 29900 + }, + { + "epoch": 6.301153420761573, + "grad_norm": 1.2356693744659424, + "learning_rate": 6.022325464574665e-05, + "loss": 1.5827, + "step": 29910 + }, + { + "epoch": 6.303260125348923, + "grad_norm": 1.1580696105957031, + "learning_rate": 6.016253078735508e-05, + "loss": 1.5349, + "step": 29920 + }, + { + "epoch": 6.305366829936272, + "grad_norm": 1.0700081586837769, + "learning_rate": 6.010182438461258e-05, + "loss": 1.5857, + "step": 29930 + }, + { + "epoch": 6.307473534523622, + "grad_norm": 1.2955549955368042, + "learning_rate": 6.0041135464119024e-05, + "loss": 1.578, + "step": 29940 + }, + { + "epoch": 6.3095802391109705, + "grad_norm": 1.116965889930725, + "learning_rate": 5.998046405246651e-05, + "loss": 1.5308, + "step": 29950 + }, + { + "epoch": 6.31168694369832, + "grad_norm": 1.232913851737976, + "learning_rate": 5.991981017623955e-05, + "loss": 1.5701, + "step": 29960 + }, + { + "epoch": 6.313793648285669, + "grad_norm": 1.167012333869934, + "learning_rate": 5.9859173862014985e-05, + "loss": 1.645, + "step": 29970 + }, + { + "epoch": 6.315900352873018, + "grad_norm": 1.1298688650131226, + "learning_rate": 5.979855513636192e-05, + "loss": 1.6271, + "step": 29980 + }, + { + "epoch": 6.318007057460368, + "grad_norm": 1.1685657501220703, + "learning_rate": 5.9737954025841725e-05, + "loss": 1.5824, + "step": 29990 + }, + { + "epoch": 6.320113762047717, + "grad_norm": 1.1890677213668823, + "learning_rate": 5.9677370557008104e-05, + "loss": 1.6214, + "step": 30000 + }, + { + "epoch": 6.322220466635066, + "grad_norm": 1.2406339645385742, + "learning_rate": 5.961680475640703e-05, + "loss": 1.5818, + "step": 30010 + }, + { + "epoch": 6.324327171222415, + "grad_norm": 1.1883368492126465, + "learning_rate": 5.955625665057672e-05, + "loss": 1.6185, + "step": 30020 + }, + { + "epoch": 6.326433875809765, + "grad_norm": 1.1744321584701538, + "learning_rate": 5.9495726266047605e-05, + "loss": 1.588, + "step": 30030 + }, + { + "epoch": 6.328540580397114, + "grad_norm": 1.2040607929229736, + "learning_rate": 5.9435213629342416e-05, + "loss": 1.5988, + "step": 30040 + }, + { + "epoch": 6.3306472849844635, + "grad_norm": 1.179335117340088, + "learning_rate": 5.9374718766976043e-05, + "loss": 1.6026, + "step": 30050 + }, + { + "epoch": 6.332753989571812, + "grad_norm": 1.173584222793579, + "learning_rate": 5.9314241705455674e-05, + "loss": 1.5836, + "step": 30060 + }, + { + "epoch": 6.334860694159161, + "grad_norm": 1.1390973329544067, + "learning_rate": 5.9253782471280596e-05, + "loss": 1.6209, + "step": 30070 + }, + { + "epoch": 6.336967398746511, + "grad_norm": 1.0759764909744263, + "learning_rate": 5.919334109094232e-05, + "loss": 1.6303, + "step": 30080 + }, + { + "epoch": 6.33907410333386, + "grad_norm": 1.2041103839874268, + "learning_rate": 5.9132917590924564e-05, + "loss": 1.6344, + "step": 30090 + }, + { + "epoch": 6.3411808079212095, + "grad_norm": 1.1497009992599487, + "learning_rate": 5.9072511997703226e-05, + "loss": 1.5925, + "step": 30100 + }, + { + "epoch": 6.343287512508558, + "grad_norm": 1.1539160013198853, + "learning_rate": 5.901212433774625e-05, + "loss": 1.5589, + "step": 30110 + }, + { + "epoch": 6.345394217095908, + "grad_norm": 1.163013219833374, + "learning_rate": 5.895175463751385e-05, + "loss": 1.5567, + "step": 30120 + }, + { + "epoch": 6.347500921683257, + "grad_norm": 1.2151119709014893, + "learning_rate": 5.889140292345831e-05, + "loss": 1.5016, + "step": 30130 + }, + { + "epoch": 6.349607626270606, + "grad_norm": 1.0841346979141235, + "learning_rate": 5.883106922202405e-05, + "loss": 1.5996, + "step": 30140 + }, + { + "epoch": 6.351714330857956, + "grad_norm": 1.169562578201294, + "learning_rate": 5.877075355964754e-05, + "loss": 1.5329, + "step": 30150 + }, + { + "epoch": 6.353821035445304, + "grad_norm": 1.2437294721603394, + "learning_rate": 5.871045596275742e-05, + "loss": 1.5844, + "step": 30160 + }, + { + "epoch": 6.355927740032654, + "grad_norm": 1.1044965982437134, + "learning_rate": 5.86501764577744e-05, + "loss": 1.586, + "step": 30170 + }, + { + "epoch": 6.358034444620003, + "grad_norm": 1.2414278984069824, + "learning_rate": 5.858991507111122e-05, + "loss": 1.6109, + "step": 30180 + }, + { + "epoch": 6.360141149207353, + "grad_norm": 1.134537935256958, + "learning_rate": 5.852967182917276e-05, + "loss": 1.5789, + "step": 30190 + }, + { + "epoch": 6.362247853794702, + "grad_norm": 1.0381525754928589, + "learning_rate": 5.846944675835584e-05, + "loss": 1.596, + "step": 30200 + }, + { + "epoch": 6.36435455838205, + "grad_norm": 1.9543304443359375, + "learning_rate": 5.84092398850494e-05, + "loss": 1.6616, + "step": 30210 + }, + { + "epoch": 6.3664612629694, + "grad_norm": 1.0826011896133423, + "learning_rate": 5.834905123563441e-05, + "loss": 1.5841, + "step": 30220 + }, + { + "epoch": 6.368567967556749, + "grad_norm": 1.1161965131759644, + "learning_rate": 5.828888083648382e-05, + "loss": 1.6423, + "step": 30230 + }, + { + "epoch": 6.370674672144099, + "grad_norm": 1.0613067150115967, + "learning_rate": 5.8228728713962543e-05, + "loss": 1.5799, + "step": 30240 + }, + { + "epoch": 6.372781376731448, + "grad_norm": 1.1326632499694824, + "learning_rate": 5.81685948944276e-05, + "loss": 1.5887, + "step": 30250 + }, + { + "epoch": 6.374888081318797, + "grad_norm": 1.1356877088546753, + "learning_rate": 5.8108479404227857e-05, + "loss": 1.5807, + "step": 30260 + }, + { + "epoch": 6.376994785906146, + "grad_norm": 1.159637212753296, + "learning_rate": 5.8048382269704305e-05, + "loss": 1.6054, + "step": 30270 + }, + { + "epoch": 6.379101490493496, + "grad_norm": 1.1319040060043335, + "learning_rate": 5.798830351718975e-05, + "loss": 1.5625, + "step": 30280 + }, + { + "epoch": 6.381208195080845, + "grad_norm": 1.3133894205093384, + "learning_rate": 5.7928243173008956e-05, + "loss": 1.5916, + "step": 30290 + }, + { + "epoch": 6.383314899668194, + "grad_norm": 1.1023930311203003, + "learning_rate": 5.786820126347876e-05, + "loss": 1.5471, + "step": 30300 + }, + { + "epoch": 6.385421604255543, + "grad_norm": 1.0441666841506958, + "learning_rate": 5.780817781490777e-05, + "loss": 1.5525, + "step": 30310 + }, + { + "epoch": 6.387528308842892, + "grad_norm": 1.1849002838134766, + "learning_rate": 5.7748172853596504e-05, + "loss": 1.6018, + "step": 30320 + }, + { + "epoch": 6.389635013430242, + "grad_norm": 1.1467089653015137, + "learning_rate": 5.768818640583755e-05, + "loss": 1.5891, + "step": 30330 + }, + { + "epoch": 6.391741718017591, + "grad_norm": 1.1927987337112427, + "learning_rate": 5.762821849791515e-05, + "loss": 1.6288, + "step": 30340 + }, + { + "epoch": 6.393848422604941, + "grad_norm": 1.230624794960022, + "learning_rate": 5.7568269156105656e-05, + "loss": 1.5764, + "step": 30350 + }, + { + "epoch": 6.395955127192289, + "grad_norm": 1.03030264377594, + "learning_rate": 5.750833840667711e-05, + "loss": 1.6052, + "step": 30360 + }, + { + "epoch": 6.398061831779639, + "grad_norm": 1.1722559928894043, + "learning_rate": 5.744842627588942e-05, + "loss": 1.5977, + "step": 30370 + }, + { + "epoch": 6.400168536366988, + "grad_norm": 1.084792137145996, + "learning_rate": 5.7388532789994476e-05, + "loss": 1.5983, + "step": 30380 + }, + { + "epoch": 6.402275240954337, + "grad_norm": 1.12361478805542, + "learning_rate": 5.7328657975235864e-05, + "loss": 1.5713, + "step": 30390 + }, + { + "epoch": 6.404381945541687, + "grad_norm": 1.1321178674697876, + "learning_rate": 5.7268801857848974e-05, + "loss": 1.6019, + "step": 30400 + }, + { + "epoch": 6.4064886501290355, + "grad_norm": 1.0965027809143066, + "learning_rate": 5.7208964464061165e-05, + "loss": 1.5976, + "step": 30410 + }, + { + "epoch": 6.408595354716385, + "grad_norm": 1.1020747423171997, + "learning_rate": 5.7149145820091385e-05, + "loss": 1.568, + "step": 30420 + }, + { + "epoch": 6.410702059303734, + "grad_norm": 1.088662028312683, + "learning_rate": 5.7089345952150555e-05, + "loss": 1.6088, + "step": 30430 + }, + { + "epoch": 6.412808763891084, + "grad_norm": 1.1830599308013916, + "learning_rate": 5.7029564886441245e-05, + "loss": 1.5691, + "step": 30440 + }, + { + "epoch": 6.414915468478433, + "grad_norm": 1.2777645587921143, + "learning_rate": 5.696980264915777e-05, + "loss": 1.6209, + "step": 30450 + }, + { + "epoch": 6.4170221730657815, + "grad_norm": 1.0497311353683472, + "learning_rate": 5.691005926648631e-05, + "loss": 1.5402, + "step": 30460 + }, + { + "epoch": 6.419128877653131, + "grad_norm": 1.1205629110336304, + "learning_rate": 5.685033476460471e-05, + "loss": 1.6328, + "step": 30470 + }, + { + "epoch": 6.42123558224048, + "grad_norm": 1.097438097000122, + "learning_rate": 5.6790629169682564e-05, + "loss": 1.586, + "step": 30480 + }, + { + "epoch": 6.42334228682783, + "grad_norm": 1.1904405355453491, + "learning_rate": 5.673094250788115e-05, + "loss": 1.5376, + "step": 30490 + }, + { + "epoch": 6.425448991415179, + "grad_norm": 1.2280725240707397, + "learning_rate": 5.6671274805353434e-05, + "loss": 1.601, + "step": 30500 + }, + { + "epoch": 6.427555696002528, + "grad_norm": 1.2686785459518433, + "learning_rate": 5.6611626088244194e-05, + "loss": 1.6357, + "step": 30510 + }, + { + "epoch": 6.429662400589877, + "grad_norm": 1.2746089696884155, + "learning_rate": 5.6551996382689776e-05, + "loss": 1.6072, + "step": 30520 + }, + { + "epoch": 6.431769105177226, + "grad_norm": 1.127000093460083, + "learning_rate": 5.649238571481815e-05, + "loss": 1.6258, + "step": 30530 + }, + { + "epoch": 6.433875809764576, + "grad_norm": 1.1913783550262451, + "learning_rate": 5.6432794110749134e-05, + "loss": 1.5714, + "step": 30540 + }, + { + "epoch": 6.435982514351925, + "grad_norm": 1.0561548471450806, + "learning_rate": 5.6373221596594e-05, + "loss": 1.528, + "step": 30550 + }, + { + "epoch": 6.438089218939274, + "grad_norm": 1.189866304397583, + "learning_rate": 5.631366819845578e-05, + "loss": 1.5789, + "step": 30560 + }, + { + "epoch": 6.440195923526623, + "grad_norm": 1.1098788976669312, + "learning_rate": 5.625413394242907e-05, + "loss": 1.5776, + "step": 30570 + }, + { + "epoch": 6.442302628113973, + "grad_norm": 1.271070122718811, + "learning_rate": 5.6194618854600057e-05, + "loss": 1.5436, + "step": 30580 + }, + { + "epoch": 6.444409332701322, + "grad_norm": 1.1668862104415894, + "learning_rate": 5.613512296104663e-05, + "loss": 1.6552, + "step": 30590 + }, + { + "epoch": 6.446516037288671, + "grad_norm": 1.2380750179290771, + "learning_rate": 5.607564628783817e-05, + "loss": 1.5845, + "step": 30600 + }, + { + "epoch": 6.4486227418760205, + "grad_norm": 1.224056601524353, + "learning_rate": 5.601618886103561e-05, + "loss": 1.6456, + "step": 30610 + }, + { + "epoch": 6.450729446463369, + "grad_norm": 1.0602716207504272, + "learning_rate": 5.595675070669162e-05, + "loss": 1.5599, + "step": 30620 + }, + { + "epoch": 6.452836151050719, + "grad_norm": 1.1630531549453735, + "learning_rate": 5.589733185085022e-05, + "loss": 1.5894, + "step": 30630 + }, + { + "epoch": 6.454942855638068, + "grad_norm": 1.0980303287506104, + "learning_rate": 5.583793231954713e-05, + "loss": 1.6006, + "step": 30640 + }, + { + "epoch": 6.457049560225418, + "grad_norm": 1.1695753335952759, + "learning_rate": 5.577855213880951e-05, + "loss": 1.598, + "step": 30650 + }, + { + "epoch": 6.4591562648127665, + "grad_norm": 1.0704690217971802, + "learning_rate": 5.571919133465605e-05, + "loss": 1.6008, + "step": 30660 + }, + { + "epoch": 6.461262969400116, + "grad_norm": 1.102163314819336, + "learning_rate": 5.565984993309703e-05, + "loss": 1.5416, + "step": 30670 + }, + { + "epoch": 6.463369673987465, + "grad_norm": 1.143849492073059, + "learning_rate": 5.560052796013413e-05, + "loss": 1.5771, + "step": 30680 + }, + { + "epoch": 6.465476378574814, + "grad_norm": 1.03353750705719, + "learning_rate": 5.5541225441760524e-05, + "loss": 1.6104, + "step": 30690 + }, + { + "epoch": 6.467583083162164, + "grad_norm": 1.1685138940811157, + "learning_rate": 5.5481942403960986e-05, + "loss": 1.6455, + "step": 30700 + }, + { + "epoch": 6.4696897877495125, + "grad_norm": 1.1921008825302124, + "learning_rate": 5.542267887271155e-05, + "loss": 1.6081, + "step": 30710 + }, + { + "epoch": 6.471796492336862, + "grad_norm": 1.1776456832885742, + "learning_rate": 5.5363434873979903e-05, + "loss": 1.6246, + "step": 30720 + }, + { + "epoch": 6.473903196924211, + "grad_norm": 1.1411466598510742, + "learning_rate": 5.530421043372507e-05, + "loss": 1.6324, + "step": 30730 + }, + { + "epoch": 6.476009901511561, + "grad_norm": 1.1445951461791992, + "learning_rate": 5.524500557789745e-05, + "loss": 1.6065, + "step": 30740 + }, + { + "epoch": 6.47811660609891, + "grad_norm": 1.1429086923599243, + "learning_rate": 5.518582033243902e-05, + "loss": 1.5875, + "step": 30750 + }, + { + "epoch": 6.4802233106862595, + "grad_norm": 1.1539630889892578, + "learning_rate": 5.512665472328302e-05, + "loss": 1.6545, + "step": 30760 + }, + { + "epoch": 6.482330015273608, + "grad_norm": 1.1232472658157349, + "learning_rate": 5.506750877635418e-05, + "loss": 1.5867, + "step": 30770 + }, + { + "epoch": 6.484436719860957, + "grad_norm": 1.2769269943237305, + "learning_rate": 5.500838251756857e-05, + "loss": 1.6165, + "step": 30780 + }, + { + "epoch": 6.486543424448307, + "grad_norm": 1.1701322793960571, + "learning_rate": 5.4949275972833594e-05, + "loss": 1.5832, + "step": 30790 + }, + { + "epoch": 6.488650129035656, + "grad_norm": 1.1360645294189453, + "learning_rate": 5.489018916804813e-05, + "loss": 1.6098, + "step": 30800 + }, + { + "epoch": 6.4907568336230055, + "grad_norm": 1.127044677734375, + "learning_rate": 5.4831122129102307e-05, + "loss": 1.6443, + "step": 30810 + }, + { + "epoch": 6.492863538210354, + "grad_norm": 1.1451194286346436, + "learning_rate": 5.477207488187759e-05, + "loss": 1.6208, + "step": 30820 + }, + { + "epoch": 6.494970242797704, + "grad_norm": 1.1845287084579468, + "learning_rate": 5.471304745224689e-05, + "loss": 1.6055, + "step": 30830 + }, + { + "epoch": 6.497076947385053, + "grad_norm": 1.208879828453064, + "learning_rate": 5.465403986607426e-05, + "loss": 1.6061, + "step": 30840 + }, + { + "epoch": 6.499183651972402, + "grad_norm": 1.2800372838974, + "learning_rate": 5.4595052149215246e-05, + "loss": 1.5853, + "step": 30850 + }, + { + "epoch": 6.5012903565597515, + "grad_norm": 1.1612744331359863, + "learning_rate": 5.4536084327516535e-05, + "loss": 1.6532, + "step": 30860 + }, + { + "epoch": 6.5033970611471, + "grad_norm": 1.0994377136230469, + "learning_rate": 5.447713642681612e-05, + "loss": 1.6062, + "step": 30870 + }, + { + "epoch": 6.50550376573445, + "grad_norm": 1.1329749822616577, + "learning_rate": 5.441820847294339e-05, + "loss": 1.5962, + "step": 30880 + }, + { + "epoch": 6.507610470321799, + "grad_norm": 1.0713906288146973, + "learning_rate": 5.435930049171885e-05, + "loss": 1.5756, + "step": 30890 + }, + { + "epoch": 6.509717174909149, + "grad_norm": 1.1019048690795898, + "learning_rate": 5.430041250895428e-05, + "loss": 1.5566, + "step": 30900 + }, + { + "epoch": 6.5118238794964975, + "grad_norm": 1.2118427753448486, + "learning_rate": 5.424154455045278e-05, + "loss": 1.5868, + "step": 30910 + }, + { + "epoch": 6.513930584083846, + "grad_norm": 1.2621134519577026, + "learning_rate": 5.418269664200857e-05, + "loss": 1.5591, + "step": 30920 + }, + { + "epoch": 6.516037288671196, + "grad_norm": 1.1320388317108154, + "learning_rate": 5.4123868809407206e-05, + "loss": 1.6229, + "step": 30930 + }, + { + "epoch": 6.518143993258545, + "grad_norm": 1.1174076795578003, + "learning_rate": 5.4065061078425315e-05, + "loss": 1.6151, + "step": 30940 + }, + { + "epoch": 6.520250697845895, + "grad_norm": 1.1697906255722046, + "learning_rate": 5.400627347483076e-05, + "loss": 1.6131, + "step": 30950 + }, + { + "epoch": 6.522357402433244, + "grad_norm": 1.0805909633636475, + "learning_rate": 5.3947506024382665e-05, + "loss": 1.5527, + "step": 30960 + }, + { + "epoch": 6.524464107020593, + "grad_norm": 1.077239990234375, + "learning_rate": 5.388875875283124e-05, + "loss": 1.6083, + "step": 30970 + }, + { + "epoch": 6.526570811607942, + "grad_norm": 1.1366991996765137, + "learning_rate": 5.3830031685917803e-05, + "loss": 1.5768, + "step": 30980 + }, + { + "epoch": 6.528677516195291, + "grad_norm": 1.2108395099639893, + "learning_rate": 5.377132484937499e-05, + "loss": 1.6281, + "step": 30990 + }, + { + "epoch": 6.530784220782641, + "grad_norm": 1.1738390922546387, + "learning_rate": 5.3712638268926397e-05, + "loss": 1.6515, + "step": 31000 + }, + { + "epoch": 6.53289092536999, + "grad_norm": 1.0865222215652466, + "learning_rate": 5.365397197028685e-05, + "loss": 1.5456, + "step": 31010 + }, + { + "epoch": 6.534997629957339, + "grad_norm": 1.1172924041748047, + "learning_rate": 5.359532597916233e-05, + "loss": 1.5758, + "step": 31020 + }, + { + "epoch": 6.537104334544688, + "grad_norm": 1.1861040592193604, + "learning_rate": 5.35367003212497e-05, + "loss": 1.5852, + "step": 31030 + }, + { + "epoch": 6.539211039132038, + "grad_norm": 1.1553261280059814, + "learning_rate": 5.3478095022237175e-05, + "loss": 1.5824, + "step": 31040 + }, + { + "epoch": 6.541317743719387, + "grad_norm": 1.1746402978897095, + "learning_rate": 5.341951010780386e-05, + "loss": 1.5837, + "step": 31050 + }, + { + "epoch": 6.5434244483067365, + "grad_norm": 1.233399748802185, + "learning_rate": 5.336094560362006e-05, + "loss": 1.5422, + "step": 31060 + }, + { + "epoch": 6.545531152894085, + "grad_norm": 1.1761796474456787, + "learning_rate": 5.330240153534707e-05, + "loss": 1.5942, + "step": 31070 + }, + { + "epoch": 6.547637857481435, + "grad_norm": 1.077040195465088, + "learning_rate": 5.324387792863719e-05, + "loss": 1.5745, + "step": 31080 + }, + { + "epoch": 6.549744562068784, + "grad_norm": 1.251773476600647, + "learning_rate": 5.3185374809133837e-05, + "loss": 1.6185, + "step": 31090 + }, + { + "epoch": 6.551851266656133, + "grad_norm": 1.1122722625732422, + "learning_rate": 5.312689220247151e-05, + "loss": 1.5961, + "step": 31100 + }, + { + "epoch": 6.553957971243483, + "grad_norm": 1.1091094017028809, + "learning_rate": 5.306843013427545e-05, + "loss": 1.6158, + "step": 31110 + }, + { + "epoch": 6.556064675830831, + "grad_norm": 1.1521689891815186, + "learning_rate": 5.300998863016222e-05, + "loss": 1.6193, + "step": 31120 + }, + { + "epoch": 6.558171380418181, + "grad_norm": 1.0843865871429443, + "learning_rate": 5.2951567715739126e-05, + "loss": 1.5983, + "step": 31130 + }, + { + "epoch": 6.56027808500553, + "grad_norm": 1.2138440608978271, + "learning_rate": 5.289316741660466e-05, + "loss": 1.5899, + "step": 31140 + }, + { + "epoch": 6.56238478959288, + "grad_norm": 1.0254400968551636, + "learning_rate": 5.283478775834811e-05, + "loss": 1.5816, + "step": 31150 + }, + { + "epoch": 6.564491494180229, + "grad_norm": 1.0957165956497192, + "learning_rate": 5.277642876654978e-05, + "loss": 1.6152, + "step": 31160 + }, + { + "epoch": 6.566598198767577, + "grad_norm": 1.1528323888778687, + "learning_rate": 5.271809046678094e-05, + "loss": 1.6044, + "step": 31170 + }, + { + "epoch": 6.568704903354927, + "grad_norm": 1.1122441291809082, + "learning_rate": 5.265977288460386e-05, + "loss": 1.5604, + "step": 31180 + }, + { + "epoch": 6.570811607942276, + "grad_norm": 1.1756147146224976, + "learning_rate": 5.2601476045571506e-05, + "loss": 1.6111, + "step": 31190 + }, + { + "epoch": 6.572918312529626, + "grad_norm": 1.2942123413085938, + "learning_rate": 5.254319997522796e-05, + "loss": 1.61, + "step": 31200 + }, + { + "epoch": 6.575025017116975, + "grad_norm": 1.1158119440078735, + "learning_rate": 5.2484944699108194e-05, + "loss": 1.6006, + "step": 31210 + }, + { + "epoch": 6.577131721704324, + "grad_norm": 1.1101962327957153, + "learning_rate": 5.242671024273798e-05, + "loss": 1.6298, + "step": 31220 + }, + { + "epoch": 6.579238426291673, + "grad_norm": 1.0920320749282837, + "learning_rate": 5.236849663163399e-05, + "loss": 1.6053, + "step": 31230 + }, + { + "epoch": 6.581345130879022, + "grad_norm": 1.1226073503494263, + "learning_rate": 5.231030389130375e-05, + "loss": 1.6229, + "step": 31240 + }, + { + "epoch": 6.583451835466372, + "grad_norm": 1.1309608221054077, + "learning_rate": 5.2252132047245704e-05, + "loss": 1.5947, + "step": 31250 + }, + { + "epoch": 6.585558540053721, + "grad_norm": 1.190983533859253, + "learning_rate": 5.21939811249492e-05, + "loss": 1.6526, + "step": 31260 + }, + { + "epoch": 6.58766524464107, + "grad_norm": 1.2079119682312012, + "learning_rate": 5.2135851149894124e-05, + "loss": 1.6248, + "step": 31270 + }, + { + "epoch": 6.589771949228419, + "grad_norm": 1.0407904386520386, + "learning_rate": 5.20777421475515e-05, + "loss": 1.5605, + "step": 31280 + }, + { + "epoch": 6.591878653815769, + "grad_norm": 1.154831886291504, + "learning_rate": 5.201965414338308e-05, + "loss": 1.6149, + "step": 31290 + }, + { + "epoch": 6.593985358403118, + "grad_norm": 1.1011749505996704, + "learning_rate": 5.196158716284128e-05, + "loss": 1.5944, + "step": 31300 + }, + { + "epoch": 6.596092062990467, + "grad_norm": 1.1772890090942383, + "learning_rate": 5.190354123136954e-05, + "loss": 1.604, + "step": 31310 + }, + { + "epoch": 6.598198767577816, + "grad_norm": 1.1958460807800293, + "learning_rate": 5.1845516374401784e-05, + "loss": 1.5671, + "step": 31320 + }, + { + "epoch": 6.600305472165165, + "grad_norm": 1.1239957809448242, + "learning_rate": 5.178751261736292e-05, + "loss": 1.5596, + "step": 31330 + }, + { + "epoch": 6.602412176752515, + "grad_norm": 1.0745184421539307, + "learning_rate": 5.1729529985668604e-05, + "loss": 1.5746, + "step": 31340 + }, + { + "epoch": 6.604518881339864, + "grad_norm": 1.1670557260513306, + "learning_rate": 5.1671568504725135e-05, + "loss": 1.5606, + "step": 31350 + }, + { + "epoch": 6.606625585927214, + "grad_norm": 1.1216731071472168, + "learning_rate": 5.1613628199929544e-05, + "loss": 1.6375, + "step": 31360 + }, + { + "epoch": 6.6087322905145625, + "grad_norm": 1.1803592443466187, + "learning_rate": 5.1555709096669725e-05, + "loss": 1.6, + "step": 31370 + }, + { + "epoch": 6.610838995101912, + "grad_norm": 1.1835579872131348, + "learning_rate": 5.14978112203241e-05, + "loss": 1.6252, + "step": 31380 + }, + { + "epoch": 6.612945699689261, + "grad_norm": 1.231615424156189, + "learning_rate": 5.1439934596261994e-05, + "loss": 1.6111, + "step": 31390 + }, + { + "epoch": 6.615052404276611, + "grad_norm": 1.2632482051849365, + "learning_rate": 5.138207924984313e-05, + "loss": 1.6047, + "step": 31400 + }, + { + "epoch": 6.61715910886396, + "grad_norm": 1.2507926225662231, + "learning_rate": 5.1324245206418184e-05, + "loss": 1.646, + "step": 31410 + }, + { + "epoch": 6.6192658134513085, + "grad_norm": 1.119822382926941, + "learning_rate": 5.126643249132843e-05, + "loss": 1.5353, + "step": 31420 + }, + { + "epoch": 6.621372518038658, + "grad_norm": 1.2157727479934692, + "learning_rate": 5.120864112990569e-05, + "loss": 1.6133, + "step": 31430 + }, + { + "epoch": 6.623479222626007, + "grad_norm": 1.1493059396743774, + "learning_rate": 5.11508711474725e-05, + "loss": 1.6899, + "step": 31440 + }, + { + "epoch": 6.625585927213357, + "grad_norm": 1.209083080291748, + "learning_rate": 5.109312256934208e-05, + "loss": 1.5788, + "step": 31450 + }, + { + "epoch": 6.627692631800706, + "grad_norm": 1.1497858762741089, + "learning_rate": 5.103539542081814e-05, + "loss": 1.5901, + "step": 31460 + }, + { + "epoch": 6.629799336388055, + "grad_norm": 1.1102197170257568, + "learning_rate": 5.097768972719522e-05, + "loss": 1.6064, + "step": 31470 + }, + { + "epoch": 6.631906040975404, + "grad_norm": 1.158963918685913, + "learning_rate": 5.092000551375814e-05, + "loss": 1.5992, + "step": 31480 + }, + { + "epoch": 6.634012745562753, + "grad_norm": 1.145154356956482, + "learning_rate": 5.086234280578257e-05, + "loss": 1.5399, + "step": 31490 + }, + { + "epoch": 6.636119450150103, + "grad_norm": 1.1013636589050293, + "learning_rate": 5.080470162853472e-05, + "loss": 1.6289, + "step": 31500 + }, + { + "epoch": 6.638226154737452, + "grad_norm": 1.1045435667037964, + "learning_rate": 5.0747082007271275e-05, + "loss": 1.5798, + "step": 31510 + }, + { + "epoch": 6.640332859324801, + "grad_norm": 1.1121597290039062, + "learning_rate": 5.068948396723947e-05, + "loss": 1.5345, + "step": 31520 + }, + { + "epoch": 6.64243956391215, + "grad_norm": 1.1220855712890625, + "learning_rate": 5.063190753367721e-05, + "loss": 1.6155, + "step": 31530 + }, + { + "epoch": 6.6445462684995, + "grad_norm": 1.2580904960632324, + "learning_rate": 5.0574352731812814e-05, + "loss": 1.5651, + "step": 31540 + }, + { + "epoch": 6.646652973086849, + "grad_norm": 1.054614782333374, + "learning_rate": 5.051681958686518e-05, + "loss": 1.6003, + "step": 31550 + }, + { + "epoch": 6.648759677674198, + "grad_norm": 1.0666719675064087, + "learning_rate": 5.0459308124043715e-05, + "loss": 1.5902, + "step": 31560 + }, + { + "epoch": 6.6508663822615475, + "grad_norm": 1.1629925966262817, + "learning_rate": 5.040181836854825e-05, + "loss": 1.659, + "step": 31570 + }, + { + "epoch": 6.652973086848896, + "grad_norm": 5.463768005371094, + "learning_rate": 5.0344350345569244e-05, + "loss": 1.5968, + "step": 31580 + }, + { + "epoch": 6.655079791436246, + "grad_norm": 1.0833951234817505, + "learning_rate": 5.028690408028748e-05, + "loss": 1.6454, + "step": 31590 + }, + { + "epoch": 6.657186496023595, + "grad_norm": 1.0549983978271484, + "learning_rate": 5.022947959787435e-05, + "loss": 1.6268, + "step": 31600 + }, + { + "epoch": 6.659293200610945, + "grad_norm": 1.0697112083435059, + "learning_rate": 5.0172076923491604e-05, + "loss": 1.546, + "step": 31610 + }, + { + "epoch": 6.6613999051982935, + "grad_norm": 1.2887377738952637, + "learning_rate": 5.0114696082291425e-05, + "loss": 1.5367, + "step": 31620 + }, + { + "epoch": 6.663506609785642, + "grad_norm": 1.2095539569854736, + "learning_rate": 5.0057337099416556e-05, + "loss": 1.6219, + "step": 31630 + }, + { + "epoch": 6.665613314372992, + "grad_norm": 1.0067800283432007, + "learning_rate": 5.000000000000002e-05, + "loss": 1.5961, + "step": 31640 + }, + { + "epoch": 6.667720018960341, + "grad_norm": 1.167069673538208, + "learning_rate": 4.9942684809165284e-05, + "loss": 1.5955, + "step": 31650 + }, + { + "epoch": 6.669826723547691, + "grad_norm": 1.0418312549591064, + "learning_rate": 4.9885391552026304e-05, + "loss": 1.5472, + "step": 31660 + }, + { + "epoch": 6.6719334281350395, + "grad_norm": 1.0489190816879272, + "learning_rate": 4.9828120253687296e-05, + "loss": 1.5671, + "step": 31670 + }, + { + "epoch": 6.674040132722389, + "grad_norm": 1.3005365133285522, + "learning_rate": 4.9770870939242986e-05, + "loss": 1.6017, + "step": 31680 + }, + { + "epoch": 6.676146837309738, + "grad_norm": 1.2087478637695312, + "learning_rate": 4.971364363377837e-05, + "loss": 1.5988, + "step": 31690 + }, + { + "epoch": 6.678253541897088, + "grad_norm": 1.241007924079895, + "learning_rate": 4.9656438362368784e-05, + "loss": 1.5843, + "step": 31700 + }, + { + "epoch": 6.680360246484437, + "grad_norm": 1.1099778413772583, + "learning_rate": 4.959925515008002e-05, + "loss": 1.656, + "step": 31710 + }, + { + "epoch": 6.6824669510717865, + "grad_norm": 1.1651355028152466, + "learning_rate": 4.954209402196813e-05, + "loss": 1.5808, + "step": 31720 + }, + { + "epoch": 6.684573655659135, + "grad_norm": 1.303068995475769, + "learning_rate": 4.948495500307945e-05, + "loss": 1.5864, + "step": 31730 + }, + { + "epoch": 6.686680360246484, + "grad_norm": 1.1450005769729614, + "learning_rate": 4.942783811845074e-05, + "loss": 1.6218, + "step": 31740 + }, + { + "epoch": 6.688787064833834, + "grad_norm": 1.2237937450408936, + "learning_rate": 4.937074339310894e-05, + "loss": 1.6114, + "step": 31750 + }, + { + "epoch": 6.690893769421183, + "grad_norm": 1.1926014423370361, + "learning_rate": 4.931367085207142e-05, + "loss": 1.6181, + "step": 31760 + }, + { + "epoch": 6.6930004740085325, + "grad_norm": 1.0968434810638428, + "learning_rate": 4.9256620520345675e-05, + "loss": 1.5742, + "step": 31770 + }, + { + "epoch": 6.695107178595881, + "grad_norm": 1.1037589311599731, + "learning_rate": 4.919959242292954e-05, + "loss": 1.5769, + "step": 31780 + }, + { + "epoch": 6.697213883183231, + "grad_norm": 1.150773286819458, + "learning_rate": 4.9142586584811165e-05, + "loss": 1.5985, + "step": 31790 + }, + { + "epoch": 6.69932058777058, + "grad_norm": 1.1506338119506836, + "learning_rate": 4.908560303096887e-05, + "loss": 1.5338, + "step": 31800 + }, + { + "epoch": 6.701427292357929, + "grad_norm": 1.2946168184280396, + "learning_rate": 4.90286417863712e-05, + "loss": 1.5999, + "step": 31810 + }, + { + "epoch": 6.7035339969452785, + "grad_norm": 1.1435717344284058, + "learning_rate": 4.8971702875977e-05, + "loss": 1.6167, + "step": 31820 + }, + { + "epoch": 6.705640701532627, + "grad_norm": 1.048068881034851, + "learning_rate": 4.891478632473524e-05, + "loss": 1.5388, + "step": 31830 + }, + { + "epoch": 6.707747406119977, + "grad_norm": 1.086499571800232, + "learning_rate": 4.88578921575852e-05, + "loss": 1.6239, + "step": 31840 + }, + { + "epoch": 6.709854110707326, + "grad_norm": 1.2164744138717651, + "learning_rate": 4.880102039945624e-05, + "loss": 1.646, + "step": 31850 + }, + { + "epoch": 6.711960815294676, + "grad_norm": 1.0961980819702148, + "learning_rate": 4.874417107526795e-05, + "loss": 1.5828, + "step": 31860 + }, + { + "epoch": 6.7140675198820245, + "grad_norm": 1.094759225845337, + "learning_rate": 4.868734420993014e-05, + "loss": 1.6037, + "step": 31870 + }, + { + "epoch": 6.716174224469373, + "grad_norm": 1.0369664430618286, + "learning_rate": 4.863053982834266e-05, + "loss": 1.5872, + "step": 31880 + }, + { + "epoch": 6.718280929056723, + "grad_norm": 1.1314085721969604, + "learning_rate": 4.857375795539566e-05, + "loss": 1.5673, + "step": 31890 + }, + { + "epoch": 6.720387633644072, + "grad_norm": 1.1117151975631714, + "learning_rate": 4.85169986159693e-05, + "loss": 1.6008, + "step": 31900 + }, + { + "epoch": 6.722494338231422, + "grad_norm": 1.1076699495315552, + "learning_rate": 4.8460261834933875e-05, + "loss": 1.6054, + "step": 31910 + }, + { + "epoch": 6.724601042818771, + "grad_norm": 1.090351939201355, + "learning_rate": 4.840354763714991e-05, + "loss": 1.5609, + "step": 31920 + }, + { + "epoch": 6.72670774740612, + "grad_norm": 1.0670236349105835, + "learning_rate": 4.834685604746794e-05, + "loss": 1.5798, + "step": 31930 + }, + { + "epoch": 6.728814451993469, + "grad_norm": 1.1509038209915161, + "learning_rate": 4.829018709072854e-05, + "loss": 1.645, + "step": 31940 + }, + { + "epoch": 6.730921156580818, + "grad_norm": 1.1492196321487427, + "learning_rate": 4.823354079176253e-05, + "loss": 1.5266, + "step": 31950 + }, + { + "epoch": 6.733027861168168, + "grad_norm": 1.3231983184814453, + "learning_rate": 4.8176917175390656e-05, + "loss": 1.5827, + "step": 31960 + }, + { + "epoch": 6.735134565755517, + "grad_norm": 1.2020155191421509, + "learning_rate": 4.812031626642382e-05, + "loss": 1.6198, + "step": 31970 + }, + { + "epoch": 6.737241270342866, + "grad_norm": 1.140032172203064, + "learning_rate": 4.8063738089662926e-05, + "loss": 1.5159, + "step": 31980 + }, + { + "epoch": 6.739347974930215, + "grad_norm": 1.1477890014648438, + "learning_rate": 4.800718266989888e-05, + "loss": 1.6024, + "step": 31990 + }, + { + "epoch": 6.741454679517565, + "grad_norm": 1.2770425081253052, + "learning_rate": 4.795065003191272e-05, + "loss": 1.5814, + "step": 32000 + }, + { + "epoch": 6.743561384104914, + "grad_norm": 1.1991952657699585, + "learning_rate": 4.7894140200475435e-05, + "loss": 1.5764, + "step": 32010 + }, + { + "epoch": 6.745668088692263, + "grad_norm": 1.2961416244506836, + "learning_rate": 4.7837653200347974e-05, + "loss": 1.5989, + "step": 32020 + }, + { + "epoch": 6.747774793279612, + "grad_norm": 1.1892178058624268, + "learning_rate": 4.7781189056281415e-05, + "loss": 1.6455, + "step": 32030 + }, + { + "epoch": 6.749881497866961, + "grad_norm": 1.1718902587890625, + "learning_rate": 4.772474779301669e-05, + "loss": 1.5826, + "step": 32040 + }, + { + "epoch": 6.751988202454311, + "grad_norm": 1.1832486391067505, + "learning_rate": 4.766832943528481e-05, + "loss": 1.6021, + "step": 32050 + }, + { + "epoch": 6.75409490704166, + "grad_norm": 1.1368720531463623, + "learning_rate": 4.7611934007806666e-05, + "loss": 1.5537, + "step": 32060 + }, + { + "epoch": 6.75620161162901, + "grad_norm": 1.0927391052246094, + "learning_rate": 4.755556153529311e-05, + "loss": 1.617, + "step": 32070 + }, + { + "epoch": 6.758308316216358, + "grad_norm": 1.3450697660446167, + "learning_rate": 4.749921204244503e-05, + "loss": 1.6012, + "step": 32080 + }, + { + "epoch": 6.760415020803708, + "grad_norm": 1.1041196584701538, + "learning_rate": 4.744288555395313e-05, + "loss": 1.6142, + "step": 32090 + }, + { + "epoch": 6.762521725391057, + "grad_norm": 1.079490065574646, + "learning_rate": 4.738658209449805e-05, + "loss": 1.6022, + "step": 32100 + }, + { + "epoch": 6.764628429978407, + "grad_norm": 1.116524338722229, + "learning_rate": 4.7330301688750434e-05, + "loss": 1.5029, + "step": 32110 + }, + { + "epoch": 6.766735134565756, + "grad_norm": 1.13874351978302, + "learning_rate": 4.72740443613707e-05, + "loss": 1.6044, + "step": 32120 + }, + { + "epoch": 6.7688418391531044, + "grad_norm": 1.1345837116241455, + "learning_rate": 4.7217810137009274e-05, + "loss": 1.5789, + "step": 32130 + }, + { + "epoch": 6.770948543740454, + "grad_norm": 1.2009543180465698, + "learning_rate": 4.716159904030637e-05, + "loss": 1.5548, + "step": 32140 + }, + { + "epoch": 6.773055248327803, + "grad_norm": 1.1193791627883911, + "learning_rate": 4.710541109589205e-05, + "loss": 1.612, + "step": 32150 + }, + { + "epoch": 6.775161952915153, + "grad_norm": 1.2925307750701904, + "learning_rate": 4.704924632838636e-05, + "loss": 1.604, + "step": 32160 + }, + { + "epoch": 6.777268657502502, + "grad_norm": 1.1835635900497437, + "learning_rate": 4.699310476239904e-05, + "loss": 1.6282, + "step": 32170 + }, + { + "epoch": 6.779375362089851, + "grad_norm": 1.1469124555587769, + "learning_rate": 4.693698642252979e-05, + "loss": 1.5664, + "step": 32180 + }, + { + "epoch": 6.7814820666772, + "grad_norm": 1.1878842115402222, + "learning_rate": 4.688089133336805e-05, + "loss": 1.6328, + "step": 32190 + }, + { + "epoch": 6.783588771264549, + "grad_norm": 1.113439679145813, + "learning_rate": 4.6824819519493057e-05, + "loss": 1.5955, + "step": 32200 + }, + { + "epoch": 6.785695475851899, + "grad_norm": 1.1598047018051147, + "learning_rate": 4.676877100547392e-05, + "loss": 1.5929, + "step": 32210 + }, + { + "epoch": 6.787802180439248, + "grad_norm": 1.0954216718673706, + "learning_rate": 4.671274581586958e-05, + "loss": 1.5691, + "step": 32220 + }, + { + "epoch": 6.789908885026597, + "grad_norm": 1.2070858478546143, + "learning_rate": 4.665674397522856e-05, + "loss": 1.6073, + "step": 32230 + }, + { + "epoch": 6.792015589613946, + "grad_norm": 1.273911476135254, + "learning_rate": 4.660076550808936e-05, + "loss": 1.5836, + "step": 32240 + }, + { + "epoch": 6.794122294201296, + "grad_norm": 1.1469873189926147, + "learning_rate": 4.654481043898011e-05, + "loss": 1.6065, + "step": 32250 + }, + { + "epoch": 6.796228998788645, + "grad_norm": 1.175328254699707, + "learning_rate": 4.648887879241879e-05, + "loss": 1.5451, + "step": 32260 + }, + { + "epoch": 6.798335703375994, + "grad_norm": 1.2798956632614136, + "learning_rate": 4.6432970592913026e-05, + "loss": 1.5988, + "step": 32270 + }, + { + "epoch": 6.800442407963343, + "grad_norm": 1.1209592819213867, + "learning_rate": 4.637708586496018e-05, + "loss": 1.5729, + "step": 32280 + }, + { + "epoch": 6.802549112550692, + "grad_norm": 1.106839656829834, + "learning_rate": 4.6321224633047365e-05, + "loss": 1.5888, + "step": 32290 + }, + { + "epoch": 6.804655817138042, + "grad_norm": 1.256089687347412, + "learning_rate": 4.6265386921651496e-05, + "loss": 1.6005, + "step": 32300 + }, + { + "epoch": 6.806762521725391, + "grad_norm": 1.0970877408981323, + "learning_rate": 4.6209572755238905e-05, + "loss": 1.5179, + "step": 32310 + }, + { + "epoch": 6.808869226312741, + "grad_norm": 1.0892575979232788, + "learning_rate": 4.61537821582659e-05, + "loss": 1.6311, + "step": 32320 + }, + { + "epoch": 6.8109759309000895, + "grad_norm": 1.0509310960769653, + "learning_rate": 4.609801515517825e-05, + "loss": 1.5343, + "step": 32330 + }, + { + "epoch": 6.813082635487438, + "grad_norm": 1.2261005640029907, + "learning_rate": 4.604227177041156e-05, + "loss": 1.6152, + "step": 32340 + }, + { + "epoch": 6.815189340074788, + "grad_norm": 1.4062305688858032, + "learning_rate": 4.598655202839096e-05, + "loss": 1.6324, + "step": 32350 + }, + { + "epoch": 6.817296044662137, + "grad_norm": 1.1476296186447144, + "learning_rate": 4.5930855953531214e-05, + "loss": 1.6057, + "step": 32360 + }, + { + "epoch": 6.819402749249487, + "grad_norm": 1.1502786874771118, + "learning_rate": 4.5875183570236815e-05, + "loss": 1.6032, + "step": 32370 + }, + { + "epoch": 6.8215094538368355, + "grad_norm": 1.0922356843948364, + "learning_rate": 4.581953490290188e-05, + "loss": 1.5566, + "step": 32380 + }, + { + "epoch": 6.823616158424185, + "grad_norm": 1.1090188026428223, + "learning_rate": 4.576390997590996e-05, + "loss": 1.6288, + "step": 32390 + }, + { + "epoch": 6.825722863011534, + "grad_norm": 1.0930324792861938, + "learning_rate": 4.570830881363439e-05, + "loss": 1.6241, + "step": 32400 + }, + { + "epoch": 6.827829567598884, + "grad_norm": 1.1673924922943115, + "learning_rate": 4.5652731440437965e-05, + "loss": 1.6006, + "step": 32410 + }, + { + "epoch": 6.829936272186233, + "grad_norm": 1.2876216173171997, + "learning_rate": 4.559717788067316e-05, + "loss": 1.5865, + "step": 32420 + }, + { + "epoch": 6.832042976773582, + "grad_norm": 1.2427315711975098, + "learning_rate": 4.554164815868204e-05, + "loss": 1.5477, + "step": 32430 + }, + { + "epoch": 6.834149681360931, + "grad_norm": 1.2781567573547363, + "learning_rate": 4.5486142298795995e-05, + "loss": 1.6095, + "step": 32440 + }, + { + "epoch": 6.83625638594828, + "grad_norm": 1.0563685894012451, + "learning_rate": 4.54306603253362e-05, + "loss": 1.5988, + "step": 32450 + }, + { + "epoch": 6.83836309053563, + "grad_norm": 1.2373830080032349, + "learning_rate": 4.537520226261333e-05, + "loss": 1.5737, + "step": 32460 + }, + { + "epoch": 6.840469795122979, + "grad_norm": 1.2991282939910889, + "learning_rate": 4.531976813492747e-05, + "loss": 1.6012, + "step": 32470 + }, + { + "epoch": 6.842576499710328, + "grad_norm": 1.3080326318740845, + "learning_rate": 4.5264357966568306e-05, + "loss": 1.5993, + "step": 32480 + }, + { + "epoch": 6.844683204297677, + "grad_norm": 1.129129409790039, + "learning_rate": 4.5208971781814955e-05, + "loss": 1.5381, + "step": 32490 + }, + { + "epoch": 6.846789908885027, + "grad_norm": 1.1796236038208008, + "learning_rate": 4.515360960493612e-05, + "loss": 1.5905, + "step": 32500 + }, + { + "epoch": 6.848896613472376, + "grad_norm": 1.5889365673065186, + "learning_rate": 4.509827146019e-05, + "loss": 1.5976, + "step": 32510 + }, + { + "epoch": 6.851003318059725, + "grad_norm": 1.1488497257232666, + "learning_rate": 4.5042957371824057e-05, + "loss": 1.6156, + "step": 32520 + }, + { + "epoch": 6.8531100226470745, + "grad_norm": 1.2863367795944214, + "learning_rate": 4.498766736407543e-05, + "loss": 1.5985, + "step": 32530 + }, + { + "epoch": 6.855216727234423, + "grad_norm": 1.0283994674682617, + "learning_rate": 4.493240146117066e-05, + "loss": 1.5567, + "step": 32540 + }, + { + "epoch": 6.857323431821773, + "grad_norm": 1.2300434112548828, + "learning_rate": 4.487715968732568e-05, + "loss": 1.5876, + "step": 32550 + }, + { + "epoch": 6.859430136409122, + "grad_norm": 1.172739863395691, + "learning_rate": 4.482194206674585e-05, + "loss": 1.5768, + "step": 32560 + }, + { + "epoch": 6.861536840996472, + "grad_norm": 1.1315293312072754, + "learning_rate": 4.476674862362593e-05, + "loss": 1.6282, + "step": 32570 + }, + { + "epoch": 6.8636435455838205, + "grad_norm": 1.1859827041625977, + "learning_rate": 4.471157938215017e-05, + "loss": 1.6081, + "step": 32580 + }, + { + "epoch": 6.865750250171169, + "grad_norm": 1.262385368347168, + "learning_rate": 4.465643436649224e-05, + "loss": 1.6119, + "step": 32590 + }, + { + "epoch": 6.867856954758519, + "grad_norm": 1.146864891052246, + "learning_rate": 4.460131360081496e-05, + "loss": 1.6026, + "step": 32600 + }, + { + "epoch": 6.869963659345868, + "grad_norm": 1.1739929914474487, + "learning_rate": 4.454621710927077e-05, + "loss": 1.6119, + "step": 32610 + }, + { + "epoch": 6.872070363933218, + "grad_norm": 1.1527085304260254, + "learning_rate": 4.4491144916001425e-05, + "loss": 1.6168, + "step": 32620 + }, + { + "epoch": 6.8741770685205665, + "grad_norm": 1.122193455696106, + "learning_rate": 4.443609704513797e-05, + "loss": 1.6375, + "step": 32630 + }, + { + "epoch": 6.876283773107916, + "grad_norm": 1.2170180082321167, + "learning_rate": 4.438107352080076e-05, + "loss": 1.6198, + "step": 32640 + }, + { + "epoch": 6.878390477695265, + "grad_norm": 1.1985820531845093, + "learning_rate": 4.4326074367099646e-05, + "loss": 1.5268, + "step": 32650 + }, + { + "epoch": 6.880497182282614, + "grad_norm": 1.2520807981491089, + "learning_rate": 4.42710996081336e-05, + "loss": 1.6396, + "step": 32660 + }, + { + "epoch": 6.882603886869964, + "grad_norm": 1.1947911977767944, + "learning_rate": 4.421614926799108e-05, + "loss": 1.5729, + "step": 32670 + }, + { + "epoch": 6.884710591457313, + "grad_norm": 1.089781403541565, + "learning_rate": 4.4161223370749746e-05, + "loss": 1.5976, + "step": 32680 + }, + { + "epoch": 6.886817296044662, + "grad_norm": 1.0784406661987305, + "learning_rate": 4.4106321940476516e-05, + "loss": 1.5902, + "step": 32690 + }, + { + "epoch": 6.888924000632011, + "grad_norm": 1.2805781364440918, + "learning_rate": 4.405144500122772e-05, + "loss": 1.6525, + "step": 32700 + }, + { + "epoch": 6.891030705219361, + "grad_norm": 1.1548502445220947, + "learning_rate": 4.399659257704879e-05, + "loss": 1.6348, + "step": 32710 + }, + { + "epoch": 6.89313740980671, + "grad_norm": 1.220477819442749, + "learning_rate": 4.3941764691974596e-05, + "loss": 1.57, + "step": 32720 + }, + { + "epoch": 6.895244114394059, + "grad_norm": 1.1432626247406006, + "learning_rate": 4.388696137002911e-05, + "loss": 1.5568, + "step": 32730 + }, + { + "epoch": 6.897350818981408, + "grad_norm": 1.245499849319458, + "learning_rate": 4.383218263522556e-05, + "loss": 1.6399, + "step": 32740 + }, + { + "epoch": 6.899457523568757, + "grad_norm": 1.1206386089324951, + "learning_rate": 4.377742851156652e-05, + "loss": 1.5825, + "step": 32750 + }, + { + "epoch": 6.901564228156107, + "grad_norm": 1.1021487712860107, + "learning_rate": 4.372269902304363e-05, + "loss": 1.6142, + "step": 32760 + }, + { + "epoch": 6.903670932743456, + "grad_norm": 1.198765754699707, + "learning_rate": 4.3667994193637796e-05, + "loss": 1.6203, + "step": 32770 + }, + { + "epoch": 6.9057776373308055, + "grad_norm": 1.1611610651016235, + "learning_rate": 4.3613314047319167e-05, + "loss": 1.6046, + "step": 32780 + }, + { + "epoch": 6.907884341918154, + "grad_norm": 1.2177382707595825, + "learning_rate": 4.355865860804698e-05, + "loss": 1.5561, + "step": 32790 + }, + { + "epoch": 6.909991046505504, + "grad_norm": 1.2026368379592896, + "learning_rate": 4.350402789976975e-05, + "loss": 1.6029, + "step": 32800 + }, + { + "epoch": 6.912097751092853, + "grad_norm": 1.1011264324188232, + "learning_rate": 4.3449421946425096e-05, + "loss": 1.5106, + "step": 32810 + }, + { + "epoch": 6.914204455680203, + "grad_norm": 1.1586644649505615, + "learning_rate": 4.339484077193974e-05, + "loss": 1.5585, + "step": 32820 + }, + { + "epoch": 6.9163111602675515, + "grad_norm": 1.0997034311294556, + "learning_rate": 4.3340284400229666e-05, + "loss": 1.6231, + "step": 32830 + }, + { + "epoch": 6.9184178648549, + "grad_norm": 1.154043197631836, + "learning_rate": 4.328575285519994e-05, + "loss": 1.6233, + "step": 32840 + }, + { + "epoch": 6.92052456944225, + "grad_norm": 1.0947542190551758, + "learning_rate": 4.323124616074464e-05, + "loss": 1.5705, + "step": 32850 + }, + { + "epoch": 6.922631274029599, + "grad_norm": 1.0261285305023193, + "learning_rate": 4.3176764340747177e-05, + "loss": 1.6055, + "step": 32860 + }, + { + "epoch": 6.924737978616949, + "grad_norm": 1.13263738155365, + "learning_rate": 4.312230741907984e-05, + "loss": 1.5702, + "step": 32870 + }, + { + "epoch": 6.926844683204298, + "grad_norm": 1.1369476318359375, + "learning_rate": 4.3067875419604184e-05, + "loss": 1.6287, + "step": 32880 + }, + { + "epoch": 6.928951387791647, + "grad_norm": 1.2614531517028809, + "learning_rate": 4.301346836617074e-05, + "loss": 1.6592, + "step": 32890 + }, + { + "epoch": 6.931058092378996, + "grad_norm": 1.1027562618255615, + "learning_rate": 4.29590862826191e-05, + "loss": 1.6164, + "step": 32900 + }, + { + "epoch": 6.933164796966345, + "grad_norm": 1.1705682277679443, + "learning_rate": 4.2904729192778006e-05, + "loss": 1.5558, + "step": 32910 + }, + { + "epoch": 6.935271501553695, + "grad_norm": 1.180618405342102, + "learning_rate": 4.285039712046517e-05, + "loss": 1.5074, + "step": 32920 + }, + { + "epoch": 6.937378206141044, + "grad_norm": 1.125746250152588, + "learning_rate": 4.279609008948732e-05, + "loss": 1.5831, + "step": 32930 + }, + { + "epoch": 6.939484910728393, + "grad_norm": 1.092772126197815, + "learning_rate": 4.2741808123640335e-05, + "loss": 1.6018, + "step": 32940 + }, + { + "epoch": 6.941591615315742, + "grad_norm": 1.1880598068237305, + "learning_rate": 4.2687551246708965e-05, + "loss": 1.6156, + "step": 32950 + }, + { + "epoch": 6.943698319903092, + "grad_norm": 1.1571134328842163, + "learning_rate": 4.26333194824671e-05, + "loss": 1.6002, + "step": 32960 + }, + { + "epoch": 6.945805024490441, + "grad_norm": 1.2684969902038574, + "learning_rate": 4.257911285467754e-05, + "loss": 1.6134, + "step": 32970 + }, + { + "epoch": 6.94791172907779, + "grad_norm": 1.1462793350219727, + "learning_rate": 4.252493138709204e-05, + "loss": 1.5812, + "step": 32980 + }, + { + "epoch": 6.950018433665139, + "grad_norm": 1.157240629196167, + "learning_rate": 4.2470775103451446e-05, + "loss": 1.634, + "step": 32990 + }, + { + "epoch": 6.952125138252488, + "grad_norm": 1.1271111965179443, + "learning_rate": 4.241664402748544e-05, + "loss": 1.5717, + "step": 33000 + }, + { + "epoch": 6.954231842839838, + "grad_norm": 1.1427202224731445, + "learning_rate": 4.236253818291281e-05, + "loss": 1.6627, + "step": 33010 + }, + { + "epoch": 6.956338547427187, + "grad_norm": 1.1169521808624268, + "learning_rate": 4.230845759344116e-05, + "loss": 1.589, + "step": 33020 + }, + { + "epoch": 6.958445252014537, + "grad_norm": 1.222920298576355, + "learning_rate": 4.2254402282767034e-05, + "loss": 1.6033, + "step": 33030 + }, + { + "epoch": 6.960551956601885, + "grad_norm": 1.2239333391189575, + "learning_rate": 4.2200372274576e-05, + "loss": 1.5999, + "step": 33040 + }, + { + "epoch": 6.962658661189234, + "grad_norm": 1.1619553565979004, + "learning_rate": 4.2146367592542444e-05, + "loss": 1.6088, + "step": 33050 + }, + { + "epoch": 6.964765365776584, + "grad_norm": 1.230373501777649, + "learning_rate": 4.209238826032965e-05, + "loss": 1.5548, + "step": 33060 + }, + { + "epoch": 6.966872070363933, + "grad_norm": 1.0696500539779663, + "learning_rate": 4.203843430158991e-05, + "loss": 1.6181, + "step": 33070 + }, + { + "epoch": 6.968978774951283, + "grad_norm": 1.0919783115386963, + "learning_rate": 4.198450573996423e-05, + "loss": 1.6078, + "step": 33080 + }, + { + "epoch": 6.9710854795386314, + "grad_norm": 1.0691747665405273, + "learning_rate": 4.1930602599082666e-05, + "loss": 1.6057, + "step": 33090 + }, + { + "epoch": 6.973192184125981, + "grad_norm": 1.1408584117889404, + "learning_rate": 4.1876724902564004e-05, + "loss": 1.607, + "step": 33100 + }, + { + "epoch": 6.97529888871333, + "grad_norm": 1.2705100774765015, + "learning_rate": 4.182287267401587e-05, + "loss": 1.5754, + "step": 33110 + }, + { + "epoch": 6.97740559330068, + "grad_norm": 1.117083191871643, + "learning_rate": 4.1769045937034876e-05, + "loss": 1.5889, + "step": 33120 + }, + { + "epoch": 6.979512297888029, + "grad_norm": 1.1689233779907227, + "learning_rate": 4.171524471520633e-05, + "loss": 1.5913, + "step": 33130 + }, + { + "epoch": 6.981619002475378, + "grad_norm": 1.0697065591812134, + "learning_rate": 4.166146903210436e-05, + "loss": 1.5788, + "step": 33140 + }, + { + "epoch": 6.983725707062727, + "grad_norm": 1.1633777618408203, + "learning_rate": 4.1607718911292025e-05, + "loss": 1.5982, + "step": 33150 + }, + { + "epoch": 6.985832411650076, + "grad_norm": 1.1077229976654053, + "learning_rate": 4.1553994376321023e-05, + "loss": 1.5866, + "step": 33160 + }, + { + "epoch": 6.987939116237426, + "grad_norm": 1.1923149824142456, + "learning_rate": 4.1500295450731994e-05, + "loss": 1.6038, + "step": 33170 + }, + { + "epoch": 6.990045820824775, + "grad_norm": 1.184695839881897, + "learning_rate": 4.144662215805426e-05, + "loss": 1.5184, + "step": 33180 + }, + { + "epoch": 6.992152525412124, + "grad_norm": 1.1566641330718994, + "learning_rate": 4.139297452180588e-05, + "loss": 1.6186, + "step": 33190 + }, + { + "epoch": 6.994259229999473, + "grad_norm": 1.1046571731567383, + "learning_rate": 4.133935256549383e-05, + "loss": 1.6273, + "step": 33200 + }, + { + "epoch": 6.996365934586823, + "grad_norm": 1.2560962438583374, + "learning_rate": 4.1285756312613654e-05, + "loss": 1.5725, + "step": 33210 + }, + { + "epoch": 6.998472639174172, + "grad_norm": 1.1830743551254272, + "learning_rate": 4.1232185786649704e-05, + "loss": 1.5894, + "step": 33220 + }, + { + "epoch": 7.000579343761521, + "grad_norm": 1.2797951698303223, + "learning_rate": 4.1178641011075116e-05, + "loss": 1.5852, + "step": 33230 + }, + { + "epoch": 7.00268604834887, + "grad_norm": 1.0687123537063599, + "learning_rate": 4.1125122009351634e-05, + "loss": 1.5228, + "step": 33240 + }, + { + "epoch": 7.004792752936219, + "grad_norm": 1.206784963607788, + "learning_rate": 4.107162880492984e-05, + "loss": 1.5989, + "step": 33250 + }, + { + "epoch": 7.006899457523569, + "grad_norm": 1.2578917741775513, + "learning_rate": 4.1018161421248905e-05, + "loss": 1.5553, + "step": 33260 + }, + { + "epoch": 7.009006162110918, + "grad_norm": 1.4243639707565308, + "learning_rate": 4.096471988173667e-05, + "loss": 1.5727, + "step": 33270 + }, + { + "epoch": 7.011112866698268, + "grad_norm": 1.162205696105957, + "learning_rate": 4.0911304209809776e-05, + "loss": 1.525, + "step": 33280 + }, + { + "epoch": 7.0132195712856165, + "grad_norm": 1.0842784643173218, + "learning_rate": 4.08579144288734e-05, + "loss": 1.5675, + "step": 33290 + }, + { + "epoch": 7.015326275872965, + "grad_norm": 1.1000008583068848, + "learning_rate": 4.080455056232147e-05, + "loss": 1.5373, + "step": 33300 + }, + { + "epoch": 7.017432980460315, + "grad_norm": 1.184962511062622, + "learning_rate": 4.075121263353653e-05, + "loss": 1.532, + "step": 33310 + }, + { + "epoch": 7.019539685047664, + "grad_norm": 1.0851813554763794, + "learning_rate": 4.069790066588967e-05, + "loss": 1.5424, + "step": 33320 + }, + { + "epoch": 7.021646389635014, + "grad_norm": 1.2001564502716064, + "learning_rate": 4.064461468274077e-05, + "loss": 1.5278, + "step": 33330 + }, + { + "epoch": 7.0237530942223625, + "grad_norm": 1.15140962600708, + "learning_rate": 4.05913547074382e-05, + "loss": 1.5639, + "step": 33340 + }, + { + "epoch": 7.025859798809712, + "grad_norm": 1.1785743236541748, + "learning_rate": 4.053812076331893e-05, + "loss": 1.5807, + "step": 33350 + }, + { + "epoch": 7.027966503397061, + "grad_norm": 1.2136023044586182, + "learning_rate": 4.048491287370863e-05, + "loss": 1.5895, + "step": 33360 + }, + { + "epoch": 7.030073207984411, + "grad_norm": 1.275848150253296, + "learning_rate": 4.043173106192145e-05, + "loss": 1.5946, + "step": 33370 + }, + { + "epoch": 7.03217991257176, + "grad_norm": 1.157455325126648, + "learning_rate": 4.0378575351260184e-05, + "loss": 1.5566, + "step": 33380 + }, + { + "epoch": 7.0342866171591085, + "grad_norm": 1.1953786611557007, + "learning_rate": 4.0325445765016145e-05, + "loss": 1.5714, + "step": 33390 + }, + { + "epoch": 7.036393321746458, + "grad_norm": 1.1416175365447998, + "learning_rate": 4.0272342326469157e-05, + "loss": 1.6163, + "step": 33400 + }, + { + "epoch": 7.038500026333807, + "grad_norm": 1.0863345861434937, + "learning_rate": 4.021926505888774e-05, + "loss": 1.5635, + "step": 33410 + }, + { + "epoch": 7.040606730921157, + "grad_norm": 1.1222482919692993, + "learning_rate": 4.016621398552877e-05, + "loss": 1.5723, + "step": 33420 + }, + { + "epoch": 7.042713435508506, + "grad_norm": 1.2037105560302734, + "learning_rate": 4.011318912963772e-05, + "loss": 1.5243, + "step": 33430 + }, + { + "epoch": 7.044820140095855, + "grad_norm": 1.1907230615615845, + "learning_rate": 4.006019051444864e-05, + "loss": 1.5661, + "step": 33440 + }, + { + "epoch": 7.046926844683204, + "grad_norm": 1.17070734500885, + "learning_rate": 4.000721816318395e-05, + "loss": 1.5788, + "step": 33450 + }, + { + "epoch": 7.049033549270553, + "grad_norm": 1.2719948291778564, + "learning_rate": 3.995427209905469e-05, + "loss": 1.5891, + "step": 33460 + }, + { + "epoch": 7.051140253857903, + "grad_norm": 1.1980891227722168, + "learning_rate": 3.99013523452603e-05, + "loss": 1.569, + "step": 33470 + }, + { + "epoch": 7.053246958445252, + "grad_norm": 1.1968605518341064, + "learning_rate": 3.9848458924988684e-05, + "loss": 1.5259, + "step": 33480 + }, + { + "epoch": 7.0553536630326015, + "grad_norm": 1.1652711629867554, + "learning_rate": 3.9795591861416316e-05, + "loss": 1.5396, + "step": 33490 + }, + { + "epoch": 7.05746036761995, + "grad_norm": 1.15365469455719, + "learning_rate": 3.974275117770798e-05, + "loss": 1.5551, + "step": 33500 + }, + { + "epoch": 7.0595670722073, + "grad_norm": 1.1745774745941162, + "learning_rate": 3.9689936897016944e-05, + "loss": 1.5099, + "step": 33510 + }, + { + "epoch": 7.061673776794649, + "grad_norm": 1.1286360025405884, + "learning_rate": 3.963714904248501e-05, + "loss": 1.5246, + "step": 33520 + }, + { + "epoch": 7.063780481381999, + "grad_norm": 1.1611193418502808, + "learning_rate": 3.958438763724224e-05, + "loss": 1.5001, + "step": 33530 + }, + { + "epoch": 7.0658871859693475, + "grad_norm": 1.140989065170288, + "learning_rate": 3.953165270440721e-05, + "loss": 1.5921, + "step": 33540 + }, + { + "epoch": 7.067993890556696, + "grad_norm": 1.2701148986816406, + "learning_rate": 3.947894426708696e-05, + "loss": 1.5792, + "step": 33550 + }, + { + "epoch": 7.070100595144046, + "grad_norm": 1.1872613430023193, + "learning_rate": 3.942626234837668e-05, + "loss": 1.5927, + "step": 33560 + }, + { + "epoch": 7.072207299731395, + "grad_norm": 1.215997338294983, + "learning_rate": 3.937360697136019e-05, + "loss": 1.5151, + "step": 33570 + }, + { + "epoch": 7.074314004318745, + "grad_norm": 1.2439547777175903, + "learning_rate": 3.9320978159109533e-05, + "loss": 1.5755, + "step": 33580 + }, + { + "epoch": 7.0764207089060935, + "grad_norm": 1.2074193954467773, + "learning_rate": 3.926837593468522e-05, + "loss": 1.5721, + "step": 33590 + }, + { + "epoch": 7.078527413493443, + "grad_norm": 1.0745258331298828, + "learning_rate": 3.921580032113602e-05, + "loss": 1.5437, + "step": 33600 + }, + { + "epoch": 7.080634118080792, + "grad_norm": 1.3252061605453491, + "learning_rate": 3.916325134149904e-05, + "loss": 1.5626, + "step": 33610 + }, + { + "epoch": 7.082740822668141, + "grad_norm": 1.164564609527588, + "learning_rate": 3.9110729018799785e-05, + "loss": 1.5461, + "step": 33620 + }, + { + "epoch": 7.084847527255491, + "grad_norm": 1.2133631706237793, + "learning_rate": 3.905823337605213e-05, + "loss": 1.5484, + "step": 33630 + }, + { + "epoch": 7.08695423184284, + "grad_norm": 1.2363920211791992, + "learning_rate": 3.900576443625803e-05, + "loss": 1.5937, + "step": 33640 + }, + { + "epoch": 7.089060936430189, + "grad_norm": 1.2874761819839478, + "learning_rate": 3.895332222240794e-05, + "loss": 1.4771, + "step": 33650 + }, + { + "epoch": 7.091167641017538, + "grad_norm": 1.1343390941619873, + "learning_rate": 3.8900906757480614e-05, + "loss": 1.5485, + "step": 33660 + }, + { + "epoch": 7.093274345604888, + "grad_norm": 1.2385504245758057, + "learning_rate": 3.884851806444296e-05, + "loss": 1.5506, + "step": 33670 + }, + { + "epoch": 7.095381050192237, + "grad_norm": 1.160474181175232, + "learning_rate": 3.879615616625024e-05, + "loss": 1.5962, + "step": 33680 + }, + { + "epoch": 7.097487754779586, + "grad_norm": 1.2261831760406494, + "learning_rate": 3.874382108584591e-05, + "loss": 1.5721, + "step": 33690 + }, + { + "epoch": 7.099594459366935, + "grad_norm": 1.2461113929748535, + "learning_rate": 3.8691512846161737e-05, + "loss": 1.5832, + "step": 33700 + }, + { + "epoch": 7.101701163954284, + "grad_norm": 1.687401294708252, + "learning_rate": 3.86392314701178e-05, + "loss": 1.5928, + "step": 33710 + }, + { + "epoch": 7.103807868541634, + "grad_norm": 1.188767433166504, + "learning_rate": 3.858697698062217e-05, + "loss": 1.6336, + "step": 33720 + }, + { + "epoch": 7.105914573128983, + "grad_norm": 1.3257564306259155, + "learning_rate": 3.8534749400571337e-05, + "loss": 1.5285, + "step": 33730 + }, + { + "epoch": 7.1080212777163325, + "grad_norm": 1.227262258529663, + "learning_rate": 3.848254875285e-05, + "loss": 1.5814, + "step": 33740 + }, + { + "epoch": 7.110127982303681, + "grad_norm": 1.3911917209625244, + "learning_rate": 3.843037506033096e-05, + "loss": 1.6014, + "step": 33750 + }, + { + "epoch": 7.112234686891031, + "grad_norm": 1.1162062883377075, + "learning_rate": 3.8378228345875246e-05, + "loss": 1.5378, + "step": 33760 + }, + { + "epoch": 7.11434139147838, + "grad_norm": 1.1487606763839722, + "learning_rate": 3.832610863233204e-05, + "loss": 1.5834, + "step": 33770 + }, + { + "epoch": 7.116448096065729, + "grad_norm": 1.1570956707000732, + "learning_rate": 3.8274015942538745e-05, + "loss": 1.5618, + "step": 33780 + }, + { + "epoch": 7.1185548006530786, + "grad_norm": 1.1539541482925415, + "learning_rate": 3.822195029932095e-05, + "loss": 1.5992, + "step": 33790 + }, + { + "epoch": 7.120661505240427, + "grad_norm": 1.1285444498062134, + "learning_rate": 3.8169911725492303e-05, + "loss": 1.5704, + "step": 33800 + }, + { + "epoch": 7.122768209827777, + "grad_norm": 1.1300140619277954, + "learning_rate": 3.8117900243854595e-05, + "loss": 1.528, + "step": 33810 + }, + { + "epoch": 7.124874914415126, + "grad_norm": 1.538737416267395, + "learning_rate": 3.806591587719784e-05, + "loss": 1.5414, + "step": 33820 + }, + { + "epoch": 7.126981619002476, + "grad_norm": 1.2989195585250854, + "learning_rate": 3.8013958648300075e-05, + "loss": 1.5495, + "step": 33830 + }, + { + "epoch": 7.129088323589825, + "grad_norm": 1.1311976909637451, + "learning_rate": 3.7962028579927555e-05, + "loss": 1.5997, + "step": 33840 + }, + { + "epoch": 7.131195028177173, + "grad_norm": 1.0657979249954224, + "learning_rate": 3.7910125694834445e-05, + "loss": 1.5286, + "step": 33850 + }, + { + "epoch": 7.133301732764523, + "grad_norm": 1.199328899383545, + "learning_rate": 3.7858250015763174e-05, + "loss": 1.5864, + "step": 33860 + }, + { + "epoch": 7.135408437351872, + "grad_norm": 1.2691328525543213, + "learning_rate": 3.780640156544424e-05, + "loss": 1.5788, + "step": 33870 + }, + { + "epoch": 7.137515141939222, + "grad_norm": 1.2796211242675781, + "learning_rate": 3.7754580366596115e-05, + "loss": 1.6114, + "step": 33880 + }, + { + "epoch": 7.139621846526571, + "grad_norm": 1.1912226676940918, + "learning_rate": 3.7702786441925355e-05, + "loss": 1.5474, + "step": 33890 + }, + { + "epoch": 7.14172855111392, + "grad_norm": 1.2463699579238892, + "learning_rate": 3.7651019814126654e-05, + "loss": 1.5949, + "step": 33900 + }, + { + "epoch": 7.143835255701269, + "grad_norm": 1.1586321592330933, + "learning_rate": 3.7599280505882604e-05, + "loss": 1.5561, + "step": 33910 + }, + { + "epoch": 7.145941960288619, + "grad_norm": 1.2323464155197144, + "learning_rate": 3.7547568539864017e-05, + "loss": 1.5535, + "step": 33920 + }, + { + "epoch": 7.148048664875968, + "grad_norm": 1.402441143989563, + "learning_rate": 3.749588393872947e-05, + "loss": 1.5405, + "step": 33930 + }, + { + "epoch": 7.150155369463317, + "grad_norm": 1.1518326997756958, + "learning_rate": 3.7444226725125764e-05, + "loss": 1.5759, + "step": 33940 + }, + { + "epoch": 7.152262074050666, + "grad_norm": 1.2248179912567139, + "learning_rate": 3.739259692168764e-05, + "loss": 1.5951, + "step": 33950 + }, + { + "epoch": 7.154368778638015, + "grad_norm": 1.2795392274856567, + "learning_rate": 3.734099455103779e-05, + "loss": 1.5979, + "step": 33960 + }, + { + "epoch": 7.156475483225365, + "grad_norm": 1.1059311628341675, + "learning_rate": 3.728941963578687e-05, + "loss": 1.557, + "step": 33970 + }, + { + "epoch": 7.158582187812714, + "grad_norm": 1.275423288345337, + "learning_rate": 3.723787219853363e-05, + "loss": 1.549, + "step": 33980 + }, + { + "epoch": 7.160688892400064, + "grad_norm": 1.1001471281051636, + "learning_rate": 3.71863522618646e-05, + "loss": 1.5314, + "step": 33990 + }, + { + "epoch": 7.162795596987412, + "grad_norm": 1.242405652999878, + "learning_rate": 3.7134859848354485e-05, + "loss": 1.5806, + "step": 34000 + }, + { + "epoch": 7.164902301574761, + "grad_norm": 1.2491405010223389, + "learning_rate": 3.708339498056565e-05, + "loss": 1.543, + "step": 34010 + }, + { + "epoch": 7.167009006162111, + "grad_norm": 1.1283265352249146, + "learning_rate": 3.7031957681048604e-05, + "loss": 1.5682, + "step": 34020 + }, + { + "epoch": 7.16911571074946, + "grad_norm": 1.109413743019104, + "learning_rate": 3.698054797234175e-05, + "loss": 1.5197, + "step": 34030 + }, + { + "epoch": 7.17122241533681, + "grad_norm": 1.24946129322052, + "learning_rate": 3.6929165876971337e-05, + "loss": 1.5934, + "step": 34040 + }, + { + "epoch": 7.1733291199241584, + "grad_norm": 1.0502756834030151, + "learning_rate": 3.68778114174515e-05, + "loss": 1.5576, + "step": 34050 + }, + { + "epoch": 7.175435824511508, + "grad_norm": 1.1192634105682373, + "learning_rate": 3.682648461628439e-05, + "loss": 1.5288, + "step": 34060 + }, + { + "epoch": 7.177542529098857, + "grad_norm": 1.2234610319137573, + "learning_rate": 3.677518549595986e-05, + "loss": 1.5302, + "step": 34070 + }, + { + "epoch": 7.179649233686207, + "grad_norm": 1.157472014427185, + "learning_rate": 3.6723914078955825e-05, + "loss": 1.5859, + "step": 34080 + }, + { + "epoch": 7.181755938273556, + "grad_norm": 1.1958365440368652, + "learning_rate": 3.667267038773791e-05, + "loss": 1.5783, + "step": 34090 + }, + { + "epoch": 7.1838626428609045, + "grad_norm": 1.1906516551971436, + "learning_rate": 3.662145444475963e-05, + "loss": 1.5748, + "step": 34100 + }, + { + "epoch": 7.185969347448254, + "grad_norm": 1.313653588294983, + "learning_rate": 3.65702662724624e-05, + "loss": 1.5847, + "step": 34110 + }, + { + "epoch": 7.188076052035603, + "grad_norm": 1.2378566265106201, + "learning_rate": 3.65191058932754e-05, + "loss": 1.5802, + "step": 34120 + }, + { + "epoch": 7.190182756622953, + "grad_norm": 1.146131992340088, + "learning_rate": 3.646797332961569e-05, + "loss": 1.5321, + "step": 34130 + }, + { + "epoch": 7.192289461210302, + "grad_norm": 1.2050758600234985, + "learning_rate": 3.64168686038881e-05, + "loss": 1.6303, + "step": 34140 + }, + { + "epoch": 7.194396165797651, + "grad_norm": 1.172650933265686, + "learning_rate": 3.636579173848521e-05, + "loss": 1.5502, + "step": 34150 + }, + { + "epoch": 7.196502870385, + "grad_norm": 1.1262861490249634, + "learning_rate": 3.631474275578754e-05, + "loss": 1.5185, + "step": 34160 + }, + { + "epoch": 7.198609574972349, + "grad_norm": 1.1936100721359253, + "learning_rate": 3.626372167816326e-05, + "loss": 1.5495, + "step": 34170 + }, + { + "epoch": 7.200716279559699, + "grad_norm": 1.0827867984771729, + "learning_rate": 3.6212728527968345e-05, + "loss": 1.5777, + "step": 34180 + }, + { + "epoch": 7.202822984147048, + "grad_norm": 1.1810598373413086, + "learning_rate": 3.616176332754659e-05, + "loss": 1.5491, + "step": 34190 + }, + { + "epoch": 7.204929688734397, + "grad_norm": 1.2283319234848022, + "learning_rate": 3.6110826099229453e-05, + "loss": 1.5434, + "step": 34200 + }, + { + "epoch": 7.207036393321746, + "grad_norm": 1.2398056983947754, + "learning_rate": 3.605991686533625e-05, + "loss": 1.5411, + "step": 34210 + }, + { + "epoch": 7.209143097909096, + "grad_norm": 1.17624831199646, + "learning_rate": 3.6009035648173914e-05, + "loss": 1.5901, + "step": 34220 + }, + { + "epoch": 7.211249802496445, + "grad_norm": 1.192043662071228, + "learning_rate": 3.595818247003713e-05, + "loss": 1.5786, + "step": 34230 + }, + { + "epoch": 7.213356507083795, + "grad_norm": 1.3036718368530273, + "learning_rate": 3.590735735320837e-05, + "loss": 1.5868, + "step": 34240 + }, + { + "epoch": 7.2154632116711435, + "grad_norm": 1.1599690914154053, + "learning_rate": 3.5856560319957747e-05, + "loss": 1.5652, + "step": 34250 + }, + { + "epoch": 7.217569916258492, + "grad_norm": 1.0903196334838867, + "learning_rate": 3.580579139254303e-05, + "loss": 1.6047, + "step": 34260 + }, + { + "epoch": 7.219676620845842, + "grad_norm": 1.1610807180404663, + "learning_rate": 3.57550505932098e-05, + "loss": 1.5608, + "step": 34270 + }, + { + "epoch": 7.221783325433191, + "grad_norm": 1.1886292695999146, + "learning_rate": 3.570433794419117e-05, + "loss": 1.5595, + "step": 34280 + }, + { + "epoch": 7.223890030020541, + "grad_norm": 1.1972240209579468, + "learning_rate": 3.565365346770805e-05, + "loss": 1.5889, + "step": 34290 + }, + { + "epoch": 7.2259967346078895, + "grad_norm": 1.13959538936615, + "learning_rate": 3.560299718596889e-05, + "loss": 1.5306, + "step": 34300 + }, + { + "epoch": 7.228103439195239, + "grad_norm": 1.218733549118042, + "learning_rate": 3.5552369121169815e-05, + "loss": 1.5694, + "step": 34310 + }, + { + "epoch": 7.230210143782588, + "grad_norm": 1.1074182987213135, + "learning_rate": 3.550176929549468e-05, + "loss": 1.5485, + "step": 34320 + }, + { + "epoch": 7.232316848369937, + "grad_norm": 1.186241626739502, + "learning_rate": 3.545119773111486e-05, + "loss": 1.5769, + "step": 34330 + }, + { + "epoch": 7.234423552957287, + "grad_norm": 1.160736083984375, + "learning_rate": 3.540065445018933e-05, + "loss": 1.5437, + "step": 34340 + }, + { + "epoch": 7.2365302575446355, + "grad_norm": 1.1813569068908691, + "learning_rate": 3.535013947486481e-05, + "loss": 1.5968, + "step": 34350 + }, + { + "epoch": 7.238636962131985, + "grad_norm": 1.1769226789474487, + "learning_rate": 3.5299652827275455e-05, + "loss": 1.5811, + "step": 34360 + }, + { + "epoch": 7.240743666719334, + "grad_norm": 1.207960605621338, + "learning_rate": 3.5249194529543137e-05, + "loss": 1.5773, + "step": 34370 + }, + { + "epoch": 7.242850371306684, + "grad_norm": 1.3144844770431519, + "learning_rate": 3.5198764603777235e-05, + "loss": 1.613, + "step": 34380 + }, + { + "epoch": 7.244957075894033, + "grad_norm": 1.3451526165008545, + "learning_rate": 3.5148363072074666e-05, + "loss": 1.5226, + "step": 34390 + }, + { + "epoch": 7.247063780481382, + "grad_norm": 1.1638214588165283, + "learning_rate": 3.509798995652002e-05, + "loss": 1.5835, + "step": 34400 + }, + { + "epoch": 7.249170485068731, + "grad_norm": 1.2865285873413086, + "learning_rate": 3.50476452791853e-05, + "loss": 1.5573, + "step": 34410 + }, + { + "epoch": 7.25127718965608, + "grad_norm": 1.2313289642333984, + "learning_rate": 3.49973290621302e-05, + "loss": 1.5818, + "step": 34420 + }, + { + "epoch": 7.25338389424343, + "grad_norm": 1.1138957738876343, + "learning_rate": 3.494704132740181e-05, + "loss": 1.6223, + "step": 34430 + }, + { + "epoch": 7.255490598830779, + "grad_norm": 1.151701807975769, + "learning_rate": 3.489678209703475e-05, + "loss": 1.6069, + "step": 34440 + }, + { + "epoch": 7.2575973034181285, + "grad_norm": 1.2936291694641113, + "learning_rate": 3.4846551393051265e-05, + "loss": 1.541, + "step": 34450 + }, + { + "epoch": 7.259704008005477, + "grad_norm": 1.155973196029663, + "learning_rate": 3.4796349237461e-05, + "loss": 1.5876, + "step": 34460 + }, + { + "epoch": 7.261810712592827, + "grad_norm": 1.261993408203125, + "learning_rate": 3.4746175652261056e-05, + "loss": 1.5971, + "step": 34470 + }, + { + "epoch": 7.263917417180176, + "grad_norm": 1.2483882904052734, + "learning_rate": 3.469603065943617e-05, + "loss": 1.5946, + "step": 34480 + }, + { + "epoch": 7.266024121767525, + "grad_norm": 1.229948878288269, + "learning_rate": 3.464591428095838e-05, + "loss": 1.6399, + "step": 34490 + }, + { + "epoch": 7.2681308263548745, + "grad_norm": 1.2871240377426147, + "learning_rate": 3.459582653878731e-05, + "loss": 1.5594, + "step": 34500 + }, + { + "epoch": 7.270237530942223, + "grad_norm": 1.1843843460083008, + "learning_rate": 3.4545767454869995e-05, + "loss": 1.6033, + "step": 34510 + }, + { + "epoch": 7.272344235529573, + "grad_norm": 1.0694409608840942, + "learning_rate": 3.449573705114082e-05, + "loss": 1.5933, + "step": 34520 + }, + { + "epoch": 7.274450940116922, + "grad_norm": 1.301433801651001, + "learning_rate": 3.44457353495218e-05, + "loss": 1.5651, + "step": 34530 + }, + { + "epoch": 7.276557644704272, + "grad_norm": 1.12484872341156, + "learning_rate": 3.43957623719222e-05, + "loss": 1.6104, + "step": 34540 + }, + { + "epoch": 7.2786643492916205, + "grad_norm": 1.188636064529419, + "learning_rate": 3.434581814023875e-05, + "loss": 1.5948, + "step": 34550 + }, + { + "epoch": 7.28077105387897, + "grad_norm": 1.1812471151351929, + "learning_rate": 3.429590267635565e-05, + "loss": 1.5563, + "step": 34560 + }, + { + "epoch": 7.282877758466319, + "grad_norm": 1.2685455083847046, + "learning_rate": 3.4246016002144377e-05, + "loss": 1.5661, + "step": 34570 + }, + { + "epoch": 7.284984463053668, + "grad_norm": 1.173599123954773, + "learning_rate": 3.4196158139463915e-05, + "loss": 1.6113, + "step": 34580 + }, + { + "epoch": 7.287091167641018, + "grad_norm": 1.1645420789718628, + "learning_rate": 3.414632911016056e-05, + "loss": 1.5558, + "step": 34590 + }, + { + "epoch": 7.289197872228367, + "grad_norm": 1.149623990058899, + "learning_rate": 3.4096528936067905e-05, + "loss": 1.5926, + "step": 34600 + }, + { + "epoch": 7.291304576815716, + "grad_norm": 1.3477739095687866, + "learning_rate": 3.4046757639007066e-05, + "loss": 1.5735, + "step": 34610 + }, + { + "epoch": 7.293411281403065, + "grad_norm": 1.1365127563476562, + "learning_rate": 3.399701524078635e-05, + "loss": 1.5733, + "step": 34620 + }, + { + "epoch": 7.295517985990415, + "grad_norm": 1.2671220302581787, + "learning_rate": 3.394730176320151e-05, + "loss": 1.5625, + "step": 34630 + }, + { + "epoch": 7.297624690577764, + "grad_norm": 1.344285249710083, + "learning_rate": 3.389761722803557e-05, + "loss": 1.5495, + "step": 34640 + }, + { + "epoch": 7.299731395165113, + "grad_norm": 1.2595552206039429, + "learning_rate": 3.3847961657058845e-05, + "loss": 1.5918, + "step": 34650 + }, + { + "epoch": 7.301838099752462, + "grad_norm": 1.2583255767822266, + "learning_rate": 3.379833507202903e-05, + "loss": 1.591, + "step": 34660 + }, + { + "epoch": 7.303944804339811, + "grad_norm": 1.2452776432037354, + "learning_rate": 3.3748737494691153e-05, + "loss": 1.534, + "step": 34670 + }, + { + "epoch": 7.306051508927161, + "grad_norm": 1.0824054479599, + "learning_rate": 3.369916894677733e-05, + "loss": 1.5869, + "step": 34680 + }, + { + "epoch": 7.30815821351451, + "grad_norm": 1.1286417245864868, + "learning_rate": 3.3649629450007195e-05, + "loss": 1.6053, + "step": 34690 + }, + { + "epoch": 7.3102649181018595, + "grad_norm": 1.2368204593658447, + "learning_rate": 3.360011902608747e-05, + "loss": 1.5804, + "step": 34700 + }, + { + "epoch": 7.312371622689208, + "grad_norm": 1.3171190023422241, + "learning_rate": 3.355063769671232e-05, + "loss": 1.5864, + "step": 34710 + }, + { + "epoch": 7.314478327276557, + "grad_norm": 1.1727089881896973, + "learning_rate": 3.3501185483562994e-05, + "loss": 1.6236, + "step": 34720 + }, + { + "epoch": 7.316585031863907, + "grad_norm": 1.1134991645812988, + "learning_rate": 3.345176240830803e-05, + "loss": 1.5589, + "step": 34730 + }, + { + "epoch": 7.318691736451256, + "grad_norm": 1.2899798154830933, + "learning_rate": 3.340236849260324e-05, + "loss": 1.591, + "step": 34740 + }, + { + "epoch": 7.3207984410386056, + "grad_norm": 1.2112675905227661, + "learning_rate": 3.335300375809173e-05, + "loss": 1.6022, + "step": 34750 + }, + { + "epoch": 7.322905145625954, + "grad_norm": 1.1530252695083618, + "learning_rate": 3.330366822640356e-05, + "loss": 1.5894, + "step": 34760 + }, + { + "epoch": 7.325011850213304, + "grad_norm": 1.0778065919876099, + "learning_rate": 3.325436191915628e-05, + "loss": 1.6197, + "step": 34770 + }, + { + "epoch": 7.327118554800653, + "grad_norm": 1.2357008457183838, + "learning_rate": 3.320508485795445e-05, + "loss": 1.5383, + "step": 34780 + }, + { + "epoch": 7.329225259388003, + "grad_norm": 1.2149114608764648, + "learning_rate": 3.315583706438994e-05, + "loss": 1.5355, + "step": 34790 + }, + { + "epoch": 7.331331963975352, + "grad_norm": 1.1368036270141602, + "learning_rate": 3.31066185600417e-05, + "loss": 1.581, + "step": 34800 + }, + { + "epoch": 7.3334386685627, + "grad_norm": 1.1959999799728394, + "learning_rate": 3.305742936647586e-05, + "loss": 1.6053, + "step": 34810 + }, + { + "epoch": 7.33554537315005, + "grad_norm": 1.2553191184997559, + "learning_rate": 3.300826950524575e-05, + "loss": 1.5595, + "step": 34820 + }, + { + "epoch": 7.337652077737399, + "grad_norm": 1.2087421417236328, + "learning_rate": 3.2959138997891905e-05, + "loss": 1.5864, + "step": 34830 + }, + { + "epoch": 7.339758782324749, + "grad_norm": 1.136260986328125, + "learning_rate": 3.291003786594178e-05, + "loss": 1.5502, + "step": 34840 + }, + { + "epoch": 7.341865486912098, + "grad_norm": 1.2127354145050049, + "learning_rate": 3.28609661309102e-05, + "loss": 1.5129, + "step": 34850 + }, + { + "epoch": 7.343972191499447, + "grad_norm": 1.2038772106170654, + "learning_rate": 3.281192381429894e-05, + "loss": 1.5622, + "step": 34860 + }, + { + "epoch": 7.346078896086796, + "grad_norm": 1.3606103658676147, + "learning_rate": 3.276291093759701e-05, + "loss": 1.5531, + "step": 34870 + }, + { + "epoch": 7.348185600674145, + "grad_norm": 1.1576495170593262, + "learning_rate": 3.2713927522280453e-05, + "loss": 1.6216, + "step": 34880 + }, + { + "epoch": 7.350292305261495, + "grad_norm": 1.1671663522720337, + "learning_rate": 3.2664973589812364e-05, + "loss": 1.5243, + "step": 34890 + }, + { + "epoch": 7.352399009848844, + "grad_norm": 1.3078503608703613, + "learning_rate": 3.2616049161643005e-05, + "loss": 1.543, + "step": 34900 + }, + { + "epoch": 7.354505714436193, + "grad_norm": 1.1224507093429565, + "learning_rate": 3.256715425920969e-05, + "loss": 1.5786, + "step": 34910 + }, + { + "epoch": 7.356612419023542, + "grad_norm": 1.191428780555725, + "learning_rate": 3.251828890393677e-05, + "loss": 1.5707, + "step": 34920 + }, + { + "epoch": 7.358719123610892, + "grad_norm": 1.117315411567688, + "learning_rate": 3.246945311723564e-05, + "loss": 1.5758, + "step": 34930 + }, + { + "epoch": 7.360825828198241, + "grad_norm": 1.1541494131088257, + "learning_rate": 3.2420646920504726e-05, + "loss": 1.6113, + "step": 34940 + }, + { + "epoch": 7.362932532785591, + "grad_norm": 1.2011319398880005, + "learning_rate": 3.237187033512956e-05, + "loss": 1.5299, + "step": 34950 + }, + { + "epoch": 7.365039237372939, + "grad_norm": 1.2775424718856812, + "learning_rate": 3.232312338248271e-05, + "loss": 1.598, + "step": 34960 + }, + { + "epoch": 7.367145941960288, + "grad_norm": 1.1767414808273315, + "learning_rate": 3.2274406083923605e-05, + "loss": 1.5929, + "step": 34970 + }, + { + "epoch": 7.369252646547638, + "grad_norm": 1.212234377861023, + "learning_rate": 3.222571846079881e-05, + "loss": 1.5344, + "step": 34980 + }, + { + "epoch": 7.371359351134987, + "grad_norm": 1.1913750171661377, + "learning_rate": 3.217706053444193e-05, + "loss": 1.5611, + "step": 34990 + }, + { + "epoch": 7.373466055722337, + "grad_norm": 1.183814287185669, + "learning_rate": 3.212843232617343e-05, + "loss": 1.5926, + "step": 35000 + }, + { + "epoch": 7.3755727603096854, + "grad_norm": 1.2367053031921387, + "learning_rate": 3.207983385730081e-05, + "loss": 1.5913, + "step": 35010 + }, + { + "epoch": 7.377679464897035, + "grad_norm": 1.2764780521392822, + "learning_rate": 3.203126514911854e-05, + "loss": 1.5774, + "step": 35020 + }, + { + "epoch": 7.379786169484384, + "grad_norm": 1.2614119052886963, + "learning_rate": 3.198272622290804e-05, + "loss": 1.5315, + "step": 35030 + }, + { + "epoch": 7.381892874071733, + "grad_norm": 1.1370763778686523, + "learning_rate": 3.193421709993779e-05, + "loss": 1.5631, + "step": 35040 + }, + { + "epoch": 7.383999578659083, + "grad_norm": 1.2041585445404053, + "learning_rate": 3.188573780146298e-05, + "loss": 1.5932, + "step": 35050 + }, + { + "epoch": 7.3861062832464315, + "grad_norm": 1.2313627004623413, + "learning_rate": 3.1837288348725905e-05, + "loss": 1.6218, + "step": 35060 + }, + { + "epoch": 7.388212987833781, + "grad_norm": 1.1100727319717407, + "learning_rate": 3.178886876295578e-05, + "loss": 1.5867, + "step": 35070 + }, + { + "epoch": 7.39031969242113, + "grad_norm": 1.1173222064971924, + "learning_rate": 3.1740479065368665e-05, + "loss": 1.5747, + "step": 35080 + }, + { + "epoch": 7.39242639700848, + "grad_norm": 1.1595535278320312, + "learning_rate": 3.1692119277167564e-05, + "loss": 1.5614, + "step": 35090 + }, + { + "epoch": 7.394533101595829, + "grad_norm": 1.1855976581573486, + "learning_rate": 3.1643789419542324e-05, + "loss": 1.5725, + "step": 35100 + }, + { + "epoch": 7.3966398061831775, + "grad_norm": 1.3628852367401123, + "learning_rate": 3.159548951366975e-05, + "loss": 1.5792, + "step": 35110 + }, + { + "epoch": 7.398746510770527, + "grad_norm": 1.256475806236267, + "learning_rate": 3.154721958071356e-05, + "loss": 1.5672, + "step": 35120 + }, + { + "epoch": 7.400853215357876, + "grad_norm": 1.1298019886016846, + "learning_rate": 3.149897964182413e-05, + "loss": 1.5738, + "step": 35130 + }, + { + "epoch": 7.402959919945226, + "grad_norm": 1.2207309007644653, + "learning_rate": 3.145076971813891e-05, + "loss": 1.571, + "step": 35140 + }, + { + "epoch": 7.405066624532575, + "grad_norm": 1.2245393991470337, + "learning_rate": 3.140258983078214e-05, + "loss": 1.5812, + "step": 35150 + }, + { + "epoch": 7.407173329119924, + "grad_norm": 1.2725415229797363, + "learning_rate": 3.135444000086485e-05, + "loss": 1.6163, + "step": 35160 + }, + { + "epoch": 7.409280033707273, + "grad_norm": 1.2918919324874878, + "learning_rate": 3.130632024948491e-05, + "loss": 1.5951, + "step": 35170 + }, + { + "epoch": 7.411386738294623, + "grad_norm": 1.187846302986145, + "learning_rate": 3.1258230597727075e-05, + "loss": 1.5676, + "step": 35180 + }, + { + "epoch": 7.413493442881972, + "grad_norm": 1.155684232711792, + "learning_rate": 3.121017106666283e-05, + "loss": 1.5967, + "step": 35190 + }, + { + "epoch": 7.415600147469321, + "grad_norm": 1.173157811164856, + "learning_rate": 3.116214167735053e-05, + "loss": 1.5996, + "step": 35200 + }, + { + "epoch": 7.4177068520566705, + "grad_norm": 1.1492563486099243, + "learning_rate": 3.1114142450835294e-05, + "loss": 1.5971, + "step": 35210 + }, + { + "epoch": 7.419813556644019, + "grad_norm": 1.1434016227722168, + "learning_rate": 3.1066173408148955e-05, + "loss": 1.5462, + "step": 35220 + }, + { + "epoch": 7.421920261231369, + "grad_norm": 1.1040714979171753, + "learning_rate": 3.101823457031028e-05, + "loss": 1.5812, + "step": 35230 + }, + { + "epoch": 7.424026965818718, + "grad_norm": 1.2135083675384521, + "learning_rate": 3.097032595832462e-05, + "loss": 1.6262, + "step": 35240 + }, + { + "epoch": 7.426133670406068, + "grad_norm": 1.2643121480941772, + "learning_rate": 3.092244759318424e-05, + "loss": 1.6215, + "step": 35250 + }, + { + "epoch": 7.4282403749934165, + "grad_norm": 1.105613112449646, + "learning_rate": 3.087459949586807e-05, + "loss": 1.6266, + "step": 35260 + }, + { + "epoch": 7.430347079580766, + "grad_norm": 1.2127161026000977, + "learning_rate": 3.082678168734175e-05, + "loss": 1.5836, + "step": 35270 + }, + { + "epoch": 7.432453784168115, + "grad_norm": 1.2094069719314575, + "learning_rate": 3.077899418855772e-05, + "loss": 1.6044, + "step": 35280 + }, + { + "epoch": 7.434560488755464, + "grad_norm": 1.115659236907959, + "learning_rate": 3.07312370204551e-05, + "loss": 1.5297, + "step": 35290 + }, + { + "epoch": 7.436667193342814, + "grad_norm": 1.180795669555664, + "learning_rate": 3.068351020395971e-05, + "loss": 1.5198, + "step": 35300 + }, + { + "epoch": 7.4387738979301625, + "grad_norm": 1.097103238105774, + "learning_rate": 3.063581375998412e-05, + "loss": 1.6051, + "step": 35310 + }, + { + "epoch": 7.440880602517512, + "grad_norm": 1.161124587059021, + "learning_rate": 3.0588147709427506e-05, + "loss": 1.5481, + "step": 35320 + }, + { + "epoch": 7.442987307104861, + "grad_norm": 1.1073428392410278, + "learning_rate": 3.054051207317585e-05, + "loss": 1.5336, + "step": 35330 + }, + { + "epoch": 7.445094011692211, + "grad_norm": 1.1355111598968506, + "learning_rate": 3.04929068721017e-05, + "loss": 1.5453, + "step": 35340 + }, + { + "epoch": 7.44720071627956, + "grad_norm": 1.302330732345581, + "learning_rate": 3.0445332127064275e-05, + "loss": 1.5737, + "step": 35350 + }, + { + "epoch": 7.449307420866909, + "grad_norm": 1.3458727598190308, + "learning_rate": 3.0397787858909542e-05, + "loss": 1.5543, + "step": 35360 + }, + { + "epoch": 7.451414125454258, + "grad_norm": 1.2425020933151245, + "learning_rate": 3.0350274088470022e-05, + "loss": 1.5909, + "step": 35370 + }, + { + "epoch": 7.453520830041607, + "grad_norm": 1.2361297607421875, + "learning_rate": 3.0302790836564853e-05, + "loss": 1.6337, + "step": 35380 + }, + { + "epoch": 7.455627534628957, + "grad_norm": 1.1364221572875977, + "learning_rate": 3.0255338123999934e-05, + "loss": 1.5521, + "step": 35390 + }, + { + "epoch": 7.457734239216306, + "grad_norm": 1.3635613918304443, + "learning_rate": 3.0207915971567624e-05, + "loss": 1.6241, + "step": 35400 + }, + { + "epoch": 7.4598409438036555, + "grad_norm": 1.1741420030593872, + "learning_rate": 3.016052440004703e-05, + "loss": 1.63, + "step": 35410 + }, + { + "epoch": 7.461947648391004, + "grad_norm": 1.2724038362503052, + "learning_rate": 3.0113163430203772e-05, + "loss": 1.5863, + "step": 35420 + }, + { + "epoch": 7.464054352978353, + "grad_norm": 1.2235639095306396, + "learning_rate": 3.006583308279003e-05, + "loss": 1.5091, + "step": 35430 + }, + { + "epoch": 7.466161057565703, + "grad_norm": 1.1760095357894897, + "learning_rate": 3.001853337854471e-05, + "loss": 1.5496, + "step": 35440 + }, + { + "epoch": 7.468267762153052, + "grad_norm": 1.1114704608917236, + "learning_rate": 2.9971264338193138e-05, + "loss": 1.5444, + "step": 35450 + }, + { + "epoch": 7.4703744667404015, + "grad_norm": 1.2096513509750366, + "learning_rate": 2.992402598244727e-05, + "loss": 1.5935, + "step": 35460 + }, + { + "epoch": 7.47248117132775, + "grad_norm": 1.2605581283569336, + "learning_rate": 2.987681833200565e-05, + "loss": 1.5468, + "step": 35470 + }, + { + "epoch": 7.4745878759151, + "grad_norm": 1.1911935806274414, + "learning_rate": 2.9829641407553276e-05, + "loss": 1.6422, + "step": 35480 + }, + { + "epoch": 7.476694580502449, + "grad_norm": 1.1851458549499512, + "learning_rate": 2.9782495229761808e-05, + "loss": 1.571, + "step": 35490 + }, + { + "epoch": 7.478801285089799, + "grad_norm": 1.1616430282592773, + "learning_rate": 2.973537981928932e-05, + "loss": 1.5879, + "step": 35500 + }, + { + "epoch": 7.4809079896771475, + "grad_norm": 1.434672474861145, + "learning_rate": 2.9688295196780436e-05, + "loss": 1.5506, + "step": 35510 + }, + { + "epoch": 7.483014694264496, + "grad_norm": 1.253493309020996, + "learning_rate": 2.9641241382866348e-05, + "loss": 1.5878, + "step": 35520 + }, + { + "epoch": 7.485121398851846, + "grad_norm": 1.2330995798110962, + "learning_rate": 2.959421839816464e-05, + "loss": 1.5484, + "step": 35530 + }, + { + "epoch": 7.487228103439195, + "grad_norm": 1.020727515220642, + "learning_rate": 2.9547226263279526e-05, + "loss": 1.5971, + "step": 35540 + }, + { + "epoch": 7.489334808026545, + "grad_norm": 1.3436671495437622, + "learning_rate": 2.9500264998801584e-05, + "loss": 1.5988, + "step": 35550 + }, + { + "epoch": 7.491441512613894, + "grad_norm": 1.1431214809417725, + "learning_rate": 2.945333462530788e-05, + "loss": 1.5691, + "step": 35560 + }, + { + "epoch": 7.493548217201243, + "grad_norm": 1.2193323373794556, + "learning_rate": 2.9406435163362033e-05, + "loss": 1.5398, + "step": 35570 + }, + { + "epoch": 7.495654921788592, + "grad_norm": 1.1207623481750488, + "learning_rate": 2.9359566633514037e-05, + "loss": 1.5464, + "step": 35580 + }, + { + "epoch": 7.497761626375942, + "grad_norm": 1.1370456218719482, + "learning_rate": 2.9312729056300302e-05, + "loss": 1.5847, + "step": 35590 + }, + { + "epoch": 7.499868330963291, + "grad_norm": 1.2836565971374512, + "learning_rate": 2.926592245224381e-05, + "loss": 1.5317, + "step": 35600 + }, + { + "epoch": 7.50197503555064, + "grad_norm": 1.125777244567871, + "learning_rate": 2.9219146841853807e-05, + "loss": 1.5978, + "step": 35610 + }, + { + "epoch": 7.504081740137989, + "grad_norm": 1.1385287046432495, + "learning_rate": 2.9172402245626108e-05, + "loss": 1.5838, + "step": 35620 + }, + { + "epoch": 7.506188444725338, + "grad_norm": 1.2350115776062012, + "learning_rate": 2.912568868404284e-05, + "loss": 1.5388, + "step": 35630 + }, + { + "epoch": 7.508295149312688, + "grad_norm": 1.2484310865402222, + "learning_rate": 2.907900617757252e-05, + "loss": 1.5478, + "step": 35640 + }, + { + "epoch": 7.510401853900037, + "grad_norm": 1.2130504846572876, + "learning_rate": 2.9032354746670176e-05, + "loss": 1.5439, + "step": 35650 + }, + { + "epoch": 7.5125085584873865, + "grad_norm": 1.2337069511413574, + "learning_rate": 2.8985734411777097e-05, + "loss": 1.5881, + "step": 35660 + }, + { + "epoch": 7.514615263074735, + "grad_norm": 1.1304142475128174, + "learning_rate": 2.893914519332097e-05, + "loss": 1.5423, + "step": 35670 + }, + { + "epoch": 7.516721967662084, + "grad_norm": 1.271000623703003, + "learning_rate": 2.8892587111715917e-05, + "loss": 1.628, + "step": 35680 + }, + { + "epoch": 7.518828672249434, + "grad_norm": 1.1860854625701904, + "learning_rate": 2.8846060187362335e-05, + "loss": 1.5807, + "step": 35690 + }, + { + "epoch": 7.520935376836783, + "grad_norm": 1.1863863468170166, + "learning_rate": 2.879956444064703e-05, + "loss": 1.5932, + "step": 35700 + }, + { + "epoch": 7.5230420814241326, + "grad_norm": 1.2513294219970703, + "learning_rate": 2.8753099891943113e-05, + "loss": 1.5891, + "step": 35710 + }, + { + "epoch": 7.525148786011481, + "grad_norm": 1.1214491128921509, + "learning_rate": 2.8706666561609998e-05, + "loss": 1.5867, + "step": 35720 + }, + { + "epoch": 7.527255490598831, + "grad_norm": 1.2936240434646606, + "learning_rate": 2.8660264469993502e-05, + "loss": 1.5993, + "step": 35730 + }, + { + "epoch": 7.52936219518618, + "grad_norm": 1.1555366516113281, + "learning_rate": 2.8613893637425647e-05, + "loss": 1.575, + "step": 35740 + }, + { + "epoch": 7.531468899773529, + "grad_norm": 1.1356788873672485, + "learning_rate": 2.856755408422489e-05, + "loss": 1.5665, + "step": 35750 + }, + { + "epoch": 7.533575604360879, + "grad_norm": 1.0721943378448486, + "learning_rate": 2.8521245830695864e-05, + "loss": 1.565, + "step": 35760 + }, + { + "epoch": 7.535682308948227, + "grad_norm": 1.1919306516647339, + "learning_rate": 2.847496889712952e-05, + "loss": 1.5273, + "step": 35770 + }, + { + "epoch": 7.537789013535577, + "grad_norm": 1.2383922338485718, + "learning_rate": 2.842872330380314e-05, + "loss": 1.5732, + "step": 35780 + }, + { + "epoch": 7.539895718122926, + "grad_norm": 1.244070291519165, + "learning_rate": 2.83825090709802e-05, + "loss": 1.6233, + "step": 35790 + }, + { + "epoch": 7.542002422710276, + "grad_norm": 1.201996922492981, + "learning_rate": 2.8336326218910457e-05, + "loss": 1.5927, + "step": 35800 + }, + { + "epoch": 7.544109127297625, + "grad_norm": 1.1840076446533203, + "learning_rate": 2.829017476782997e-05, + "loss": 1.5434, + "step": 35810 + }, + { + "epoch": 7.5462158318849735, + "grad_norm": 1.3222793340682983, + "learning_rate": 2.8244054737960935e-05, + "loss": 1.5198, + "step": 35820 + }, + { + "epoch": 7.548322536472323, + "grad_norm": 1.1209752559661865, + "learning_rate": 2.819796614951191e-05, + "loss": 1.5988, + "step": 35830 + }, + { + "epoch": 7.550429241059672, + "grad_norm": 1.3570189476013184, + "learning_rate": 2.815190902267757e-05, + "loss": 1.5611, + "step": 35840 + }, + { + "epoch": 7.552535945647022, + "grad_norm": 1.2108376026153564, + "learning_rate": 2.810588337763881e-05, + "loss": 1.5437, + "step": 35850 + }, + { + "epoch": 7.554642650234371, + "grad_norm": 1.14738929271698, + "learning_rate": 2.805988923456283e-05, + "loss": 1.5708, + "step": 35860 + }, + { + "epoch": 7.55674935482172, + "grad_norm": 1.3338896036148071, + "learning_rate": 2.8013926613602936e-05, + "loss": 1.5849, + "step": 35870 + }, + { + "epoch": 7.558856059409069, + "grad_norm": 1.2129565477371216, + "learning_rate": 2.7967995534898596e-05, + "loss": 1.5874, + "step": 35880 + }, + { + "epoch": 7.560962763996419, + "grad_norm": 1.1787198781967163, + "learning_rate": 2.7922096018575572e-05, + "loss": 1.5445, + "step": 35890 + }, + { + "epoch": 7.563069468583768, + "grad_norm": 1.1483491659164429, + "learning_rate": 2.787622808474567e-05, + "loss": 1.5665, + "step": 35900 + }, + { + "epoch": 7.565176173171118, + "grad_norm": 1.1970431804656982, + "learning_rate": 2.783039175350699e-05, + "loss": 1.6202, + "step": 35910 + }, + { + "epoch": 7.567282877758466, + "grad_norm": 1.138543963432312, + "learning_rate": 2.7784587044943666e-05, + "loss": 1.6148, + "step": 35920 + }, + { + "epoch": 7.569389582345815, + "grad_norm": 1.1422301530838013, + "learning_rate": 2.7738813979126e-05, + "loss": 1.5764, + "step": 35930 + }, + { + "epoch": 7.571496286933165, + "grad_norm": 1.1119393110275269, + "learning_rate": 2.7693072576110514e-05, + "loss": 1.6223, + "step": 35940 + }, + { + "epoch": 7.573602991520514, + "grad_norm": 1.2076855897903442, + "learning_rate": 2.764736285593975e-05, + "loss": 1.5864, + "step": 35950 + }, + { + "epoch": 7.575709696107864, + "grad_norm": 1.2735531330108643, + "learning_rate": 2.7601684838642405e-05, + "loss": 1.6377, + "step": 35960 + }, + { + "epoch": 7.5778164006952125, + "grad_norm": 1.0892045497894287, + "learning_rate": 2.755603854423332e-05, + "loss": 1.5968, + "step": 35970 + }, + { + "epoch": 7.579923105282562, + "grad_norm": 1.2122892141342163, + "learning_rate": 2.7510423992713374e-05, + "loss": 1.6211, + "step": 35980 + }, + { + "epoch": 7.582029809869911, + "grad_norm": 1.1663323640823364, + "learning_rate": 2.7464841204069614e-05, + "loss": 1.5842, + "step": 35990 + }, + { + "epoch": 7.58413651445726, + "grad_norm": 1.1494828462600708, + "learning_rate": 2.7419290198275095e-05, + "loss": 1.6125, + "step": 36000 + }, + { + "epoch": 7.58624321904461, + "grad_norm": 1.1589758396148682, + "learning_rate": 2.737377099528895e-05, + "loss": 1.6178, + "step": 36010 + }, + { + "epoch": 7.5883499236319585, + "grad_norm": 1.177195429801941, + "learning_rate": 2.7328283615056472e-05, + "loss": 1.535, + "step": 36020 + }, + { + "epoch": 7.590456628219308, + "grad_norm": 1.2202266454696655, + "learning_rate": 2.728282807750886e-05, + "loss": 1.5805, + "step": 36030 + }, + { + "epoch": 7.592563332806657, + "grad_norm": 1.1336926221847534, + "learning_rate": 2.7237404402563517e-05, + "loss": 1.5508, + "step": 36040 + }, + { + "epoch": 7.594670037394007, + "grad_norm": 1.181666612625122, + "learning_rate": 2.7192012610123774e-05, + "loss": 1.6056, + "step": 36050 + }, + { + "epoch": 7.596776741981356, + "grad_norm": 1.1373564004898071, + "learning_rate": 2.7146652720079003e-05, + "loss": 1.5999, + "step": 36060 + }, + { + "epoch": 7.5988834465687045, + "grad_norm": 1.1596566438674927, + "learning_rate": 2.7101324752304635e-05, + "loss": 1.5928, + "step": 36070 + }, + { + "epoch": 7.600990151156054, + "grad_norm": 1.2005223035812378, + "learning_rate": 2.7056028726662176e-05, + "loss": 1.5748, + "step": 36080 + }, + { + "epoch": 7.603096855743403, + "grad_norm": 1.1738914251327515, + "learning_rate": 2.7010764662998933e-05, + "loss": 1.6077, + "step": 36090 + }, + { + "epoch": 7.605203560330753, + "grad_norm": 1.196463704109192, + "learning_rate": 2.69655325811484e-05, + "loss": 1.5577, + "step": 36100 + }, + { + "epoch": 7.607310264918102, + "grad_norm": 1.3594914674758911, + "learning_rate": 2.6920332500930025e-05, + "loss": 1.5649, + "step": 36110 + }, + { + "epoch": 7.609416969505451, + "grad_norm": 1.21880042552948, + "learning_rate": 2.6875164442149147e-05, + "loss": 1.5571, + "step": 36120 + }, + { + "epoch": 7.6115236740928, + "grad_norm": 1.2229676246643066, + "learning_rate": 2.6830028424597165e-05, + "loss": 1.5707, + "step": 36130 + }, + { + "epoch": 7.613630378680149, + "grad_norm": 1.1788535118103027, + "learning_rate": 2.6784924468051342e-05, + "loss": 1.5671, + "step": 36140 + }, + { + "epoch": 7.615737083267499, + "grad_norm": 1.1653674840927124, + "learning_rate": 2.6739852592274995e-05, + "loss": 1.5574, + "step": 36150 + }, + { + "epoch": 7.617843787854848, + "grad_norm": 1.0854195356369019, + "learning_rate": 2.669481281701739e-05, + "loss": 1.5698, + "step": 36160 + }, + { + "epoch": 7.6199504924421975, + "grad_norm": 1.1866564750671387, + "learning_rate": 2.6649805162013585e-05, + "loss": 1.554, + "step": 36170 + }, + { + "epoch": 7.622057197029546, + "grad_norm": 1.1668610572814941, + "learning_rate": 2.6604829646984686e-05, + "loss": 1.5912, + "step": 36180 + }, + { + "epoch": 7.624163901616896, + "grad_norm": 1.1470420360565186, + "learning_rate": 2.6559886291637748e-05, + "loss": 1.584, + "step": 36190 + }, + { + "epoch": 7.626270606204245, + "grad_norm": 1.1592053174972534, + "learning_rate": 2.651497511566562e-05, + "loss": 1.5544, + "step": 36200 + }, + { + "epoch": 7.628377310791594, + "grad_norm": 1.1775624752044678, + "learning_rate": 2.6470096138747126e-05, + "loss": 1.5315, + "step": 36210 + }, + { + "epoch": 7.6304840153789435, + "grad_norm": 1.2434234619140625, + "learning_rate": 2.6425249380546912e-05, + "loss": 1.569, + "step": 36220 + }, + { + "epoch": 7.632590719966292, + "grad_norm": 1.1641994714736938, + "learning_rate": 2.6380434860715598e-05, + "loss": 1.6168, + "step": 36230 + }, + { + "epoch": 7.634697424553642, + "grad_norm": 1.2335554361343384, + "learning_rate": 2.6335652598889683e-05, + "loss": 1.5694, + "step": 36240 + }, + { + "epoch": 7.636804129140991, + "grad_norm": 1.1290301084518433, + "learning_rate": 2.629090261469138e-05, + "loss": 1.5447, + "step": 36250 + }, + { + "epoch": 7.638910833728341, + "grad_norm": 1.1775555610656738, + "learning_rate": 2.624618492772891e-05, + "loss": 1.6394, + "step": 36260 + }, + { + "epoch": 7.6410175383156895, + "grad_norm": 1.2943990230560303, + "learning_rate": 2.620149955759633e-05, + "loss": 1.6198, + "step": 36270 + }, + { + "epoch": 7.643124242903039, + "grad_norm": 1.2637001276016235, + "learning_rate": 2.615684652387348e-05, + "loss": 1.5505, + "step": 36280 + }, + { + "epoch": 7.645230947490388, + "grad_norm": 1.195572018623352, + "learning_rate": 2.6112225846126038e-05, + "loss": 1.5403, + "step": 36290 + }, + { + "epoch": 7.647337652077738, + "grad_norm": 1.2487926483154297, + "learning_rate": 2.60676375439055e-05, + "loss": 1.5534, + "step": 36300 + }, + { + "epoch": 7.649444356665087, + "grad_norm": 1.304735541343689, + "learning_rate": 2.6023081636749225e-05, + "loss": 1.5638, + "step": 36310 + }, + { + "epoch": 7.651551061252436, + "grad_norm": 1.1997491121292114, + "learning_rate": 2.5978558144180363e-05, + "loss": 1.612, + "step": 36320 + }, + { + "epoch": 7.653657765839785, + "grad_norm": 1.226371169090271, + "learning_rate": 2.5934067085707834e-05, + "loss": 1.541, + "step": 36330 + }, + { + "epoch": 7.655764470427134, + "grad_norm": 1.1558582782745361, + "learning_rate": 2.5889608480826322e-05, + "loss": 1.6072, + "step": 36340 + }, + { + "epoch": 7.657871175014484, + "grad_norm": 1.1929916143417358, + "learning_rate": 2.5845182349016384e-05, + "loss": 1.5986, + "step": 36350 + }, + { + "epoch": 7.659977879601833, + "grad_norm": 1.2270394563674927, + "learning_rate": 2.5800788709744227e-05, + "loss": 1.5288, + "step": 36360 + }, + { + "epoch": 7.6620845841891825, + "grad_norm": 1.158097267150879, + "learning_rate": 2.5756427582461996e-05, + "loss": 1.5654, + "step": 36370 + }, + { + "epoch": 7.664191288776531, + "grad_norm": 1.2660669088363647, + "learning_rate": 2.5712098986607326e-05, + "loss": 1.5996, + "step": 36380 + }, + { + "epoch": 7.66629799336388, + "grad_norm": 1.1697864532470703, + "learning_rate": 2.5667802941603834e-05, + "loss": 1.5389, + "step": 36390 + }, + { + "epoch": 7.66840469795123, + "grad_norm": 1.2377358675003052, + "learning_rate": 2.5623539466860813e-05, + "loss": 1.5828, + "step": 36400 + }, + { + "epoch": 7.670511402538579, + "grad_norm": 1.2005665302276611, + "learning_rate": 2.5579308581773232e-05, + "loss": 1.6106, + "step": 36410 + }, + { + "epoch": 7.6726181071259285, + "grad_norm": 1.2478543519973755, + "learning_rate": 2.5535110305721776e-05, + "loss": 1.5887, + "step": 36420 + }, + { + "epoch": 7.674724811713277, + "grad_norm": 1.2146416902542114, + "learning_rate": 2.5490944658072945e-05, + "loss": 1.5917, + "step": 36430 + }, + { + "epoch": 7.676831516300627, + "grad_norm": 1.2361540794372559, + "learning_rate": 2.5446811658178815e-05, + "loss": 1.6649, + "step": 36440 + }, + { + "epoch": 7.678938220887976, + "grad_norm": 1.107953429222107, + "learning_rate": 2.540271132537729e-05, + "loss": 1.6139, + "step": 36450 + }, + { + "epoch": 7.681044925475325, + "grad_norm": 1.2849297523498535, + "learning_rate": 2.5358643678991788e-05, + "loss": 1.562, + "step": 36460 + }, + { + "epoch": 7.6831516300626745, + "grad_norm": 1.2075804471969604, + "learning_rate": 2.5314608738331537e-05, + "loss": 1.5133, + "step": 36470 + }, + { + "epoch": 7.685258334650023, + "grad_norm": 1.1946347951889038, + "learning_rate": 2.5270606522691443e-05, + "loss": 1.6608, + "step": 36480 + }, + { + "epoch": 7.687365039237373, + "grad_norm": 1.1127058267593384, + "learning_rate": 2.5226637051351987e-05, + "loss": 1.5892, + "step": 36490 + }, + { + "epoch": 7.689471743824722, + "grad_norm": 1.2100321054458618, + "learning_rate": 2.5182700343579334e-05, + "loss": 1.4847, + "step": 36500 + }, + { + "epoch": 7.691578448412072, + "grad_norm": 1.1425443887710571, + "learning_rate": 2.5138796418625343e-05, + "loss": 1.5488, + "step": 36510 + }, + { + "epoch": 7.693685152999421, + "grad_norm": 1.0502097606658936, + "learning_rate": 2.5094925295727423e-05, + "loss": 1.5681, + "step": 36520 + }, + { + "epoch": 7.695791857586769, + "grad_norm": 1.2372586727142334, + "learning_rate": 2.5051086994108743e-05, + "loss": 1.5978, + "step": 36530 + }, + { + "epoch": 7.697898562174119, + "grad_norm": 1.2679331302642822, + "learning_rate": 2.500728153297788e-05, + "loss": 1.5652, + "step": 36540 + }, + { + "epoch": 7.700005266761468, + "grad_norm": 1.316920280456543, + "learning_rate": 2.49635089315292e-05, + "loss": 1.5933, + "step": 36550 + }, + { + "epoch": 7.702111971348818, + "grad_norm": 1.1949541568756104, + "learning_rate": 2.4919769208942655e-05, + "loss": 1.5761, + "step": 36560 + }, + { + "epoch": 7.704218675936167, + "grad_norm": 1.2706875801086426, + "learning_rate": 2.4876062384383714e-05, + "loss": 1.5492, + "step": 36570 + }, + { + "epoch": 7.706325380523516, + "grad_norm": 1.247161865234375, + "learning_rate": 2.4832388477003443e-05, + "loss": 1.5714, + "step": 36580 + }, + { + "epoch": 7.708432085110865, + "grad_norm": 1.1570487022399902, + "learning_rate": 2.478874750593856e-05, + "loss": 1.6159, + "step": 36590 + }, + { + "epoch": 7.710538789698215, + "grad_norm": 1.2316917181015015, + "learning_rate": 2.4745139490311254e-05, + "loss": 1.5746, + "step": 36600 + }, + { + "epoch": 7.712645494285564, + "grad_norm": 1.1851727962493896, + "learning_rate": 2.4701564449229374e-05, + "loss": 1.6075, + "step": 36610 + }, + { + "epoch": 7.7147521988729135, + "grad_norm": 1.2722426652908325, + "learning_rate": 2.465802240178624e-05, + "loss": 1.5341, + "step": 36620 + }, + { + "epoch": 7.716858903460262, + "grad_norm": 1.2204933166503906, + "learning_rate": 2.46145133670607e-05, + "loss": 1.5937, + "step": 36630 + }, + { + "epoch": 7.718965608047611, + "grad_norm": 1.2491124868392944, + "learning_rate": 2.4571037364117255e-05, + "loss": 1.5956, + "step": 36640 + }, + { + "epoch": 7.721072312634961, + "grad_norm": 1.1790046691894531, + "learning_rate": 2.4527594412005793e-05, + "loss": 1.6315, + "step": 36650 + }, + { + "epoch": 7.72317901722231, + "grad_norm": 1.1278231143951416, + "learning_rate": 2.4484184529761834e-05, + "loss": 1.57, + "step": 36660 + }, + { + "epoch": 7.72528572180966, + "grad_norm": 1.13362455368042, + "learning_rate": 2.4440807736406335e-05, + "loss": 1.5596, + "step": 36670 + }, + { + "epoch": 7.727392426397008, + "grad_norm": 1.2489476203918457, + "learning_rate": 2.439746405094575e-05, + "loss": 1.5568, + "step": 36680 + }, + { + "epoch": 7.729499130984358, + "grad_norm": 1.2021193504333496, + "learning_rate": 2.43541534923721e-05, + "loss": 1.5935, + "step": 36690 + }, + { + "epoch": 7.731605835571707, + "grad_norm": 1.2708065509796143, + "learning_rate": 2.4310876079662824e-05, + "loss": 1.527, + "step": 36700 + }, + { + "epoch": 7.733712540159056, + "grad_norm": 1.2618707418441772, + "learning_rate": 2.4267631831780824e-05, + "loss": 1.5606, + "step": 36710 + }, + { + "epoch": 7.735819244746406, + "grad_norm": 1.1836202144622803, + "learning_rate": 2.4224420767674562e-05, + "loss": 1.6017, + "step": 36720 + }, + { + "epoch": 7.737925949333754, + "grad_norm": 1.3585106134414673, + "learning_rate": 2.4181242906277833e-05, + "loss": 1.5522, + "step": 36730 + }, + { + "epoch": 7.740032653921104, + "grad_norm": 1.2638752460479736, + "learning_rate": 2.4138098266510033e-05, + "loss": 1.6293, + "step": 36740 + }, + { + "epoch": 7.742139358508453, + "grad_norm": 1.2558181285858154, + "learning_rate": 2.409498686727587e-05, + "loss": 1.5783, + "step": 36750 + }, + { + "epoch": 7.744246063095803, + "grad_norm": 1.243468999862671, + "learning_rate": 2.405190872746551e-05, + "loss": 1.554, + "step": 36760 + }, + { + "epoch": 7.746352767683152, + "grad_norm": 1.2900869846343994, + "learning_rate": 2.4008863865954635e-05, + "loss": 1.6041, + "step": 36770 + }, + { + "epoch": 7.7484594722705005, + "grad_norm": 1.216533899307251, + "learning_rate": 2.3965852301604254e-05, + "loss": 1.5808, + "step": 36780 + }, + { + "epoch": 7.75056617685785, + "grad_norm": 1.1279761791229248, + "learning_rate": 2.392287405326078e-05, + "loss": 1.5403, + "step": 36790 + }, + { + "epoch": 7.752672881445199, + "grad_norm": 1.2638577222824097, + "learning_rate": 2.387992913975613e-05, + "loss": 1.5787, + "step": 36800 + }, + { + "epoch": 7.754779586032549, + "grad_norm": 1.2621861696243286, + "learning_rate": 2.3837017579907472e-05, + "loss": 1.6167, + "step": 36810 + }, + { + "epoch": 7.756886290619898, + "grad_norm": 1.2948744297027588, + "learning_rate": 2.379413939251751e-05, + "loss": 1.6138, + "step": 36820 + }, + { + "epoch": 7.758992995207247, + "grad_norm": 1.2331513166427612, + "learning_rate": 2.375129459637422e-05, + "loss": 1.6155, + "step": 36830 + }, + { + "epoch": 7.761099699794596, + "grad_norm": 1.1364085674285889, + "learning_rate": 2.370848321025093e-05, + "loss": 1.5179, + "step": 36840 + }, + { + "epoch": 7.763206404381945, + "grad_norm": 1.110427737236023, + "learning_rate": 2.3665705252906443e-05, + "loss": 1.6095, + "step": 36850 + }, + { + "epoch": 7.765313108969295, + "grad_norm": 1.229262113571167, + "learning_rate": 2.3622960743084798e-05, + "loss": 1.5977, + "step": 36860 + }, + { + "epoch": 7.767419813556644, + "grad_norm": 1.1048126220703125, + "learning_rate": 2.3580249699515467e-05, + "loss": 1.5216, + "step": 36870 + }, + { + "epoch": 7.769526518143993, + "grad_norm": 1.1986953020095825, + "learning_rate": 2.353757214091321e-05, + "loss": 1.556, + "step": 36880 + }, + { + "epoch": 7.771633222731342, + "grad_norm": 1.2192281484603882, + "learning_rate": 2.3494928085978073e-05, + "loss": 1.5995, + "step": 36890 + }, + { + "epoch": 7.773739927318692, + "grad_norm": 1.184574842453003, + "learning_rate": 2.345231755339554e-05, + "loss": 1.5295, + "step": 36900 + }, + { + "epoch": 7.775846631906041, + "grad_norm": 1.2875125408172607, + "learning_rate": 2.3409740561836313e-05, + "loss": 1.5371, + "step": 36910 + }, + { + "epoch": 7.777953336493391, + "grad_norm": 1.20929753780365, + "learning_rate": 2.3367197129956376e-05, + "loss": 1.5668, + "step": 36920 + }, + { + "epoch": 7.7800600410807395, + "grad_norm": 1.1370899677276611, + "learning_rate": 2.332468727639713e-05, + "loss": 1.5638, + "step": 36930 + }, + { + "epoch": 7.782166745668089, + "grad_norm": 1.2598953247070312, + "learning_rate": 2.3282211019785127e-05, + "loss": 1.5746, + "step": 36940 + }, + { + "epoch": 7.784273450255438, + "grad_norm": 1.2978923320770264, + "learning_rate": 2.3239768378732307e-05, + "loss": 1.5687, + "step": 36950 + }, + { + "epoch": 7.786380154842787, + "grad_norm": 1.1977964639663696, + "learning_rate": 2.3197359371835802e-05, + "loss": 1.552, + "step": 36960 + }, + { + "epoch": 7.788486859430137, + "grad_norm": 1.1075884103775024, + "learning_rate": 2.3154984017678015e-05, + "loss": 1.569, + "step": 36970 + }, + { + "epoch": 7.7905935640174855, + "grad_norm": 1.2914838790893555, + "learning_rate": 2.3112642334826684e-05, + "loss": 1.5667, + "step": 36980 + }, + { + "epoch": 7.792700268604835, + "grad_norm": 1.172547698020935, + "learning_rate": 2.30703343418347e-05, + "loss": 1.5901, + "step": 36990 + }, + { + "epoch": 7.794806973192184, + "grad_norm": 1.1503307819366455, + "learning_rate": 2.3028060057240187e-05, + "loss": 1.5928, + "step": 37000 + }, + { + "epoch": 7.796913677779534, + "grad_norm": 1.278609275817871, + "learning_rate": 2.298581949956662e-05, + "loss": 1.5382, + "step": 37010 + }, + { + "epoch": 7.799020382366883, + "grad_norm": 1.2486605644226074, + "learning_rate": 2.2943612687322525e-05, + "loss": 1.5521, + "step": 37020 + }, + { + "epoch": 7.8011270869542315, + "grad_norm": 1.2521138191223145, + "learning_rate": 2.290143963900181e-05, + "loss": 1.5802, + "step": 37030 + }, + { + "epoch": 7.803233791541581, + "grad_norm": 1.1833715438842773, + "learning_rate": 2.285930037308347e-05, + "loss": 1.5383, + "step": 37040 + }, + { + "epoch": 7.80534049612893, + "grad_norm": 1.2181299924850464, + "learning_rate": 2.2817194908031712e-05, + "loss": 1.5618, + "step": 37050 + }, + { + "epoch": 7.80744720071628, + "grad_norm": 1.2383207082748413, + "learning_rate": 2.2775123262296005e-05, + "loss": 1.6328, + "step": 37060 + }, + { + "epoch": 7.809553905303629, + "grad_norm": 1.2487322092056274, + "learning_rate": 2.2733085454310942e-05, + "loss": 1.5988, + "step": 37070 + }, + { + "epoch": 7.811660609890978, + "grad_norm": 1.2942522764205933, + "learning_rate": 2.2691081502496246e-05, + "loss": 1.5875, + "step": 37080 + }, + { + "epoch": 7.813767314478327, + "grad_norm": 1.1791707277297974, + "learning_rate": 2.264911142525693e-05, + "loss": 1.6108, + "step": 37090 + }, + { + "epoch": 7.815874019065676, + "grad_norm": 1.1863888502120972, + "learning_rate": 2.2607175240983026e-05, + "loss": 1.6131, + "step": 37100 + }, + { + "epoch": 7.817980723653026, + "grad_norm": 1.1202061176300049, + "learning_rate": 2.2565272968049844e-05, + "loss": 1.5871, + "step": 37110 + }, + { + "epoch": 7.820087428240375, + "grad_norm": 1.1665284633636475, + "learning_rate": 2.2523404624817736e-05, + "loss": 1.6289, + "step": 37120 + }, + { + "epoch": 7.8221941328277245, + "grad_norm": 1.2799750566482544, + "learning_rate": 2.2481570229632197e-05, + "loss": 1.5694, + "step": 37130 + }, + { + "epoch": 7.824300837415073, + "grad_norm": 1.236337423324585, + "learning_rate": 2.243976980082394e-05, + "loss": 1.5345, + "step": 37140 + }, + { + "epoch": 7.826407542002423, + "grad_norm": 1.1478630304336548, + "learning_rate": 2.2398003356708654e-05, + "loss": 1.5972, + "step": 37150 + }, + { + "epoch": 7.828514246589772, + "grad_norm": 1.2333014011383057, + "learning_rate": 2.2356270915587274e-05, + "loss": 1.5873, + "step": 37160 + }, + { + "epoch": 7.830620951177121, + "grad_norm": 1.4515944719314575, + "learning_rate": 2.2314572495745746e-05, + "loss": 1.5801, + "step": 37170 + }, + { + "epoch": 7.8327276557644705, + "grad_norm": 1.263717532157898, + "learning_rate": 2.2272908115455104e-05, + "loss": 1.593, + "step": 37180 + }, + { + "epoch": 7.834834360351819, + "grad_norm": 1.1389946937561035, + "learning_rate": 2.2231277792971515e-05, + "loss": 1.6318, + "step": 37190 + }, + { + "epoch": 7.836941064939169, + "grad_norm": 1.3768473863601685, + "learning_rate": 2.218968154653629e-05, + "loss": 1.5971, + "step": 37200 + }, + { + "epoch": 7.839047769526518, + "grad_norm": 1.1650984287261963, + "learning_rate": 2.2148119394375577e-05, + "loss": 1.612, + "step": 37210 + }, + { + "epoch": 7.841154474113868, + "grad_norm": 1.6648105382919312, + "learning_rate": 2.2106591354700845e-05, + "loss": 1.5907, + "step": 37220 + }, + { + "epoch": 7.8432611787012165, + "grad_norm": 1.2210161685943604, + "learning_rate": 2.2065097445708437e-05, + "loss": 1.5393, + "step": 37230 + }, + { + "epoch": 7.845367883288565, + "grad_norm": 1.1931644678115845, + "learning_rate": 2.2023637685579856e-05, + "loss": 1.628, + "step": 37240 + }, + { + "epoch": 7.847474587875915, + "grad_norm": 1.2531075477600098, + "learning_rate": 2.198221209248158e-05, + "loss": 1.5395, + "step": 37250 + }, + { + "epoch": 7.849581292463264, + "grad_norm": 1.1961562633514404, + "learning_rate": 2.194082068456509e-05, + "loss": 1.5847, + "step": 37260 + }, + { + "epoch": 7.851687997050614, + "grad_norm": 1.3024967908859253, + "learning_rate": 2.1899463479966952e-05, + "loss": 1.5841, + "step": 37270 + }, + { + "epoch": 7.853794701637963, + "grad_norm": 1.9943350553512573, + "learning_rate": 2.1858140496808777e-05, + "loss": 1.6065, + "step": 37280 + }, + { + "epoch": 7.855901406225312, + "grad_norm": 1.188504934310913, + "learning_rate": 2.181685175319702e-05, + "loss": 1.5932, + "step": 37290 + }, + { + "epoch": 7.858008110812661, + "grad_norm": 1.0779452323913574, + "learning_rate": 2.1775597267223323e-05, + "loss": 1.5936, + "step": 37300 + }, + { + "epoch": 7.860114815400011, + "grad_norm": 1.3473560810089111, + "learning_rate": 2.1734377056964172e-05, + "loss": 1.5284, + "step": 37310 + }, + { + "epoch": 7.86222151998736, + "grad_norm": 1.1941745281219482, + "learning_rate": 2.169319114048114e-05, + "loss": 1.6274, + "step": 37320 + }, + { + "epoch": 7.8643282245747095, + "grad_norm": 1.1974396705627441, + "learning_rate": 2.1652039535820712e-05, + "loss": 1.533, + "step": 37330 + }, + { + "epoch": 7.866434929162058, + "grad_norm": 1.1820552349090576, + "learning_rate": 2.161092226101432e-05, + "loss": 1.5985, + "step": 37340 + }, + { + "epoch": 7.868541633749407, + "grad_norm": 1.3820282220840454, + "learning_rate": 2.1569839334078422e-05, + "loss": 1.5649, + "step": 37350 + }, + { + "epoch": 7.870648338336757, + "grad_norm": 1.108791708946228, + "learning_rate": 2.152879077301443e-05, + "loss": 1.5098, + "step": 37360 + }, + { + "epoch": 7.872755042924106, + "grad_norm": 1.1899380683898926, + "learning_rate": 2.1487776595808575e-05, + "loss": 1.5486, + "step": 37370 + }, + { + "epoch": 7.8748617475114555, + "grad_norm": 1.275118112564087, + "learning_rate": 2.1446796820432167e-05, + "loss": 1.5725, + "step": 37380 + }, + { + "epoch": 7.876968452098804, + "grad_norm": 1.3126546144485474, + "learning_rate": 2.140585146484133e-05, + "loss": 1.6219, + "step": 37390 + }, + { + "epoch": 7.879075156686154, + "grad_norm": 1.2064414024353027, + "learning_rate": 2.136494054697722e-05, + "loss": 1.5985, + "step": 37400 + }, + { + "epoch": 7.881181861273503, + "grad_norm": 1.1565742492675781, + "learning_rate": 2.1324064084765815e-05, + "loss": 1.615, + "step": 37410 + }, + { + "epoch": 7.883288565860852, + "grad_norm": 1.1512229442596436, + "learning_rate": 2.1283222096117982e-05, + "loss": 1.5886, + "step": 37420 + }, + { + "epoch": 7.8853952704482015, + "grad_norm": 1.209027886390686, + "learning_rate": 2.124241459892955e-05, + "loss": 1.5634, + "step": 37430 + }, + { + "epoch": 7.88750197503555, + "grad_norm": 1.07988703250885, + "learning_rate": 2.1201641611081246e-05, + "loss": 1.546, + "step": 37440 + }, + { + "epoch": 7.8896086796229, + "grad_norm": 1.1130520105361938, + "learning_rate": 2.1160903150438605e-05, + "loss": 1.5706, + "step": 37450 + }, + { + "epoch": 7.891715384210249, + "grad_norm": 1.0994175672531128, + "learning_rate": 2.1120199234852067e-05, + "loss": 1.5346, + "step": 37460 + }, + { + "epoch": 7.893822088797599, + "grad_norm": 1.1334426403045654, + "learning_rate": 2.10795298821569e-05, + "loss": 1.5728, + "step": 37470 + }, + { + "epoch": 7.895928793384948, + "grad_norm": 1.1012332439422607, + "learning_rate": 2.1038895110173283e-05, + "loss": 1.5863, + "step": 37480 + }, + { + "epoch": 7.898035497972296, + "grad_norm": 1.2324810028076172, + "learning_rate": 2.0998294936706288e-05, + "loss": 1.632, + "step": 37490 + }, + { + "epoch": 7.900142202559646, + "grad_norm": 1.2288521528244019, + "learning_rate": 2.0957729379545655e-05, + "loss": 1.6035, + "step": 37500 + }, + { + "epoch": 7.902248907146995, + "grad_norm": 1.2216746807098389, + "learning_rate": 2.091719845646609e-05, + "loss": 1.61, + "step": 37510 + }, + { + "epoch": 7.904355611734345, + "grad_norm": 1.077216386795044, + "learning_rate": 2.0876702185227137e-05, + "loss": 1.5732, + "step": 37520 + }, + { + "epoch": 7.906462316321694, + "grad_norm": 1.1874003410339355, + "learning_rate": 2.0836240583573098e-05, + "loss": 1.5926, + "step": 37530 + }, + { + "epoch": 7.908569020909043, + "grad_norm": 1.2038264274597168, + "learning_rate": 2.079581366923308e-05, + "loss": 1.5913, + "step": 37540 + }, + { + "epoch": 7.910675725496392, + "grad_norm": 1.289731740951538, + "learning_rate": 2.0755421459920986e-05, + "loss": 1.5404, + "step": 37550 + }, + { + "epoch": 7.912782430083741, + "grad_norm": 1.265437126159668, + "learning_rate": 2.0715063973335568e-05, + "loss": 1.616, + "step": 37560 + }, + { + "epoch": 7.914889134671091, + "grad_norm": 1.1244462728500366, + "learning_rate": 2.067474122716039e-05, + "loss": 1.563, + "step": 37570 + }, + { + "epoch": 7.91699583925844, + "grad_norm": 1.1906565427780151, + "learning_rate": 2.0634453239063623e-05, + "loss": 1.5503, + "step": 37580 + }, + { + "epoch": 7.919102543845789, + "grad_norm": 1.2545281648635864, + "learning_rate": 2.0594200026698363e-05, + "loss": 1.5598, + "step": 37590 + }, + { + "epoch": 7.921209248433138, + "grad_norm": 1.2519073486328125, + "learning_rate": 2.0553981607702478e-05, + "loss": 1.5408, + "step": 37600 + }, + { + "epoch": 7.923315953020488, + "grad_norm": 1.3093730211257935, + "learning_rate": 2.051379799969849e-05, + "loss": 1.5862, + "step": 37610 + }, + { + "epoch": 7.925422657607837, + "grad_norm": 1.108938455581665, + "learning_rate": 2.04736492202937e-05, + "loss": 1.6347, + "step": 37620 + }, + { + "epoch": 7.927529362195187, + "grad_norm": 1.1940797567367554, + "learning_rate": 2.0433535287080217e-05, + "loss": 1.5676, + "step": 37630 + }, + { + "epoch": 7.929636066782535, + "grad_norm": 1.052841067314148, + "learning_rate": 2.0393456217634775e-05, + "loss": 1.5539, + "step": 37640 + }, + { + "epoch": 7.931742771369885, + "grad_norm": 1.1526647806167603, + "learning_rate": 2.035341202951897e-05, + "loss": 1.5962, + "step": 37650 + }, + { + "epoch": 7.933849475957234, + "grad_norm": 1.2988382577896118, + "learning_rate": 2.0313402740278908e-05, + "loss": 1.5831, + "step": 37660 + }, + { + "epoch": 7.935956180544583, + "grad_norm": 1.2303566932678223, + "learning_rate": 2.027342836744559e-05, + "loss": 1.6096, + "step": 37670 + }, + { + "epoch": 7.938062885131933, + "grad_norm": 1.2298696041107178, + "learning_rate": 2.0233488928534673e-05, + "loss": 1.5679, + "step": 37680 + }, + { + "epoch": 7.940169589719281, + "grad_norm": 1.2016994953155518, + "learning_rate": 2.0193584441046443e-05, + "loss": 1.5947, + "step": 37690 + }, + { + "epoch": 7.942276294306631, + "grad_norm": 1.2675684690475464, + "learning_rate": 2.015371492246596e-05, + "loss": 1.5852, + "step": 37700 + }, + { + "epoch": 7.94438299889398, + "grad_norm": 1.2211737632751465, + "learning_rate": 2.0113880390262884e-05, + "loss": 1.5966, + "step": 37710 + }, + { + "epoch": 7.94648970348133, + "grad_norm": 1.0325324535369873, + "learning_rate": 2.0074080861891565e-05, + "loss": 1.5497, + "step": 37720 + }, + { + "epoch": 7.948596408068679, + "grad_norm": 1.1817600727081299, + "learning_rate": 2.0034316354791062e-05, + "loss": 1.5462, + "step": 37730 + }, + { + "epoch": 7.9507031126560275, + "grad_norm": 1.1029366254806519, + "learning_rate": 1.9994586886385046e-05, + "loss": 1.5305, + "step": 37740 + }, + { + "epoch": 7.952809817243377, + "grad_norm": 1.2298988103866577, + "learning_rate": 1.995489247408181e-05, + "loss": 1.5816, + "step": 37750 + }, + { + "epoch": 7.954916521830726, + "grad_norm": 1.2545249462127686, + "learning_rate": 1.991523313527437e-05, + "loss": 1.6065, + "step": 37760 + }, + { + "epoch": 7.957023226418076, + "grad_norm": 1.2715802192687988, + "learning_rate": 1.987560888734027e-05, + "loss": 1.6085, + "step": 37770 + }, + { + "epoch": 7.959129931005425, + "grad_norm": 1.146997332572937, + "learning_rate": 1.98360197476418e-05, + "loss": 1.5374, + "step": 37780 + }, + { + "epoch": 7.961236635592774, + "grad_norm": 1.3229591846466064, + "learning_rate": 1.979646573352574e-05, + "loss": 1.6448, + "step": 37790 + }, + { + "epoch": 7.963343340180123, + "grad_norm": 1.1811490058898926, + "learning_rate": 1.9756946862323535e-05, + "loss": 1.5583, + "step": 37800 + }, + { + "epoch": 7.965450044767472, + "grad_norm": 1.248812198638916, + "learning_rate": 1.971746315135129e-05, + "loss": 1.5434, + "step": 37810 + }, + { + "epoch": 7.967556749354822, + "grad_norm": 1.3636667728424072, + "learning_rate": 1.9678014617909602e-05, + "loss": 1.6521, + "step": 37820 + }, + { + "epoch": 7.969663453942171, + "grad_norm": 1.1896461248397827, + "learning_rate": 1.9638601279283684e-05, + "loss": 1.6116, + "step": 37830 + }, + { + "epoch": 7.97177015852952, + "grad_norm": 1.2454185485839844, + "learning_rate": 1.9599223152743395e-05, + "loss": 1.5729, + "step": 37840 + }, + { + "epoch": 7.973876863116869, + "grad_norm": 1.169144630432129, + "learning_rate": 1.955988025554305e-05, + "loss": 1.5811, + "step": 37850 + }, + { + "epoch": 7.975983567704219, + "grad_norm": 1.1805130243301392, + "learning_rate": 1.9520572604921672e-05, + "loss": 1.6041, + "step": 37860 + }, + { + "epoch": 7.978090272291568, + "grad_norm": 1.1419270038604736, + "learning_rate": 1.9481300218102692e-05, + "loss": 1.5693, + "step": 37870 + }, + { + "epoch": 7.980196976878917, + "grad_norm": 1.2116881608963013, + "learning_rate": 1.9442063112294163e-05, + "loss": 1.5413, + "step": 37880 + }, + { + "epoch": 7.9823036814662665, + "grad_norm": 1.3711943626403809, + "learning_rate": 1.9402861304688712e-05, + "loss": 1.5626, + "step": 37890 + }, + { + "epoch": 7.984410386053615, + "grad_norm": 1.208719253540039, + "learning_rate": 1.936369481246344e-05, + "loss": 1.5836, + "step": 37900 + }, + { + "epoch": 7.986517090640965, + "grad_norm": 1.2907274961471558, + "learning_rate": 1.9324563652779947e-05, + "loss": 1.5959, + "step": 37910 + }, + { + "epoch": 7.988623795228314, + "grad_norm": 1.100823163986206, + "learning_rate": 1.9285467842784467e-05, + "loss": 1.597, + "step": 37920 + }, + { + "epoch": 7.990730499815664, + "grad_norm": 1.3616795539855957, + "learning_rate": 1.9246407399607625e-05, + "loss": 1.5545, + "step": 37930 + }, + { + "epoch": 7.9928372044030125, + "grad_norm": 1.2126030921936035, + "learning_rate": 1.9207382340364634e-05, + "loss": 1.5463, + "step": 37940 + }, + { + "epoch": 7.994943908990361, + "grad_norm": 1.2499946355819702, + "learning_rate": 1.9168392682155157e-05, + "loss": 1.5681, + "step": 37950 + }, + { + "epoch": 7.997050613577711, + "grad_norm": 1.1559704542160034, + "learning_rate": 1.912943844206333e-05, + "loss": 1.6395, + "step": 37960 + }, + { + "epoch": 7.99915731816506, + "grad_norm": 1.2376368045806885, + "learning_rate": 1.9090519637157846e-05, + "loss": 1.575, + "step": 37970 + }, + { + "epoch": 8.001264022752409, + "grad_norm": 1.3157291412353516, + "learning_rate": 1.9051636284491757e-05, + "loss": 1.6129, + "step": 37980 + }, + { + "epoch": 8.003370727339759, + "grad_norm": 1.1666113138198853, + "learning_rate": 1.901278840110272e-05, + "loss": 1.5028, + "step": 37990 + }, + { + "epoch": 8.005477431927108, + "grad_norm": 1.3203294277191162, + "learning_rate": 1.897397600401274e-05, + "loss": 1.5457, + "step": 38000 + }, + { + "epoch": 8.007584136514458, + "grad_norm": 1.1368193626403809, + "learning_rate": 1.8935199110228275e-05, + "loss": 1.5735, + "step": 38010 + }, + { + "epoch": 8.009690841101806, + "grad_norm": 1.1702295541763306, + "learning_rate": 1.8896457736740313e-05, + "loss": 1.5446, + "step": 38020 + }, + { + "epoch": 8.011797545689156, + "grad_norm": 1.217568039894104, + "learning_rate": 1.88577519005242e-05, + "loss": 1.5859, + "step": 38030 + }, + { + "epoch": 8.013904250276505, + "grad_norm": 1.2242965698242188, + "learning_rate": 1.8819081618539723e-05, + "loss": 1.6145, + "step": 38040 + }, + { + "epoch": 8.016010954863853, + "grad_norm": 1.059897541999817, + "learning_rate": 1.8780446907731142e-05, + "loss": 1.5619, + "step": 38050 + }, + { + "epoch": 8.018117659451203, + "grad_norm": 1.1061480045318604, + "learning_rate": 1.8741847785027045e-05, + "loss": 1.5779, + "step": 38060 + }, + { + "epoch": 8.020224364038553, + "grad_norm": 1.2350695133209229, + "learning_rate": 1.8703284267340516e-05, + "loss": 1.6109, + "step": 38070 + }, + { + "epoch": 8.022331068625903, + "grad_norm": 1.1782333850860596, + "learning_rate": 1.866475637156898e-05, + "loss": 1.5727, + "step": 38080 + }, + { + "epoch": 8.02443777321325, + "grad_norm": 1.1965060234069824, + "learning_rate": 1.862626411459424e-05, + "loss": 1.6016, + "step": 38090 + }, + { + "epoch": 8.0265444778006, + "grad_norm": 1.1553235054016113, + "learning_rate": 1.858780751328255e-05, + "loss": 1.6156, + "step": 38100 + }, + { + "epoch": 8.02865118238795, + "grad_norm": 1.1190050840377808, + "learning_rate": 1.8549386584484495e-05, + "loss": 1.568, + "step": 38110 + }, + { + "epoch": 8.030757886975298, + "grad_norm": 1.1927499771118164, + "learning_rate": 1.8511001345034994e-05, + "loss": 1.5496, + "step": 38120 + }, + { + "epoch": 8.032864591562648, + "grad_norm": 1.1628167629241943, + "learning_rate": 1.8472651811753428e-05, + "loss": 1.6048, + "step": 38130 + }, + { + "epoch": 8.034971296149997, + "grad_norm": 1.1890445947647095, + "learning_rate": 1.843433800144343e-05, + "loss": 1.5735, + "step": 38140 + }, + { + "epoch": 8.037078000737347, + "grad_norm": 1.137284755706787, + "learning_rate": 1.839605993089307e-05, + "loss": 1.5657, + "step": 38150 + }, + { + "epoch": 8.039184705324695, + "grad_norm": 1.2434216737747192, + "learning_rate": 1.8357817616874694e-05, + "loss": 1.5726, + "step": 38160 + }, + { + "epoch": 8.041291409912045, + "grad_norm": 1.249247670173645, + "learning_rate": 1.831961107614496e-05, + "loss": 1.5715, + "step": 38170 + }, + { + "epoch": 8.043398114499395, + "grad_norm": 1.3489915132522583, + "learning_rate": 1.8281440325444953e-05, + "loss": 1.5904, + "step": 38180 + }, + { + "epoch": 8.045504819086744, + "grad_norm": 1.2363272905349731, + "learning_rate": 1.8243305381499976e-05, + "loss": 1.555, + "step": 38190 + }, + { + "epoch": 8.047611523674092, + "grad_norm": 1.1866079568862915, + "learning_rate": 1.820520626101967e-05, + "loss": 1.5552, + "step": 38200 + }, + { + "epoch": 8.049718228261442, + "grad_norm": 1.2174458503723145, + "learning_rate": 1.816714298069804e-05, + "loss": 1.5627, + "step": 38210 + }, + { + "epoch": 8.051824932848792, + "grad_norm": 1.2444413900375366, + "learning_rate": 1.8129115557213262e-05, + "loss": 1.6285, + "step": 38220 + }, + { + "epoch": 8.05393163743614, + "grad_norm": 1.2618112564086914, + "learning_rate": 1.8091124007227945e-05, + "loss": 1.5893, + "step": 38230 + }, + { + "epoch": 8.05603834202349, + "grad_norm": 1.2796944379806519, + "learning_rate": 1.8053168347388884e-05, + "loss": 1.5575, + "step": 38240 + }, + { + "epoch": 8.05814504661084, + "grad_norm": 1.1825329065322876, + "learning_rate": 1.801524859432714e-05, + "loss": 1.5461, + "step": 38250 + }, + { + "epoch": 8.060251751198189, + "grad_norm": 1.2693184614181519, + "learning_rate": 1.7977364764658122e-05, + "loss": 1.5468, + "step": 38260 + }, + { + "epoch": 8.062358455785537, + "grad_norm": 1.3873083591461182, + "learning_rate": 1.79395168749814e-05, + "loss": 1.6055, + "step": 38270 + }, + { + "epoch": 8.064465160372887, + "grad_norm": 1.1636654138565063, + "learning_rate": 1.7901704941880914e-05, + "loss": 1.5814, + "step": 38280 + }, + { + "epoch": 8.066571864960236, + "grad_norm": 1.1651959419250488, + "learning_rate": 1.7863928981924726e-05, + "loss": 1.5662, + "step": 38290 + }, + { + "epoch": 8.068678569547584, + "grad_norm": 1.3011173009872437, + "learning_rate": 1.7826189011665184e-05, + "loss": 1.5698, + "step": 38300 + }, + { + "epoch": 8.070785274134934, + "grad_norm": 1.2538598775863647, + "learning_rate": 1.7788485047638925e-05, + "loss": 1.5508, + "step": 38310 + }, + { + "epoch": 8.072891978722284, + "grad_norm": 1.1562957763671875, + "learning_rate": 1.7750817106366714e-05, + "loss": 1.5399, + "step": 38320 + }, + { + "epoch": 8.074998683309634, + "grad_norm": 1.2713629007339478, + "learning_rate": 1.7713185204353567e-05, + "loss": 1.5962, + "step": 38330 + }, + { + "epoch": 8.077105387896982, + "grad_norm": 1.2110227346420288, + "learning_rate": 1.7675589358088763e-05, + "loss": 1.5827, + "step": 38340 + }, + { + "epoch": 8.079212092484331, + "grad_norm": 1.2139581441879272, + "learning_rate": 1.763802958404568e-05, + "loss": 1.5289, + "step": 38350 + }, + { + "epoch": 8.081318797071681, + "grad_norm": 1.1466100215911865, + "learning_rate": 1.7600505898681997e-05, + "loss": 1.5669, + "step": 38360 + }, + { + "epoch": 8.083425501659029, + "grad_norm": 1.122790813446045, + "learning_rate": 1.7563018318439496e-05, + "loss": 1.519, + "step": 38370 + }, + { + "epoch": 8.085532206246379, + "grad_norm": 1.6603208780288696, + "learning_rate": 1.7525566859744168e-05, + "loss": 1.609, + "step": 38380 + }, + { + "epoch": 8.087638910833729, + "grad_norm": 1.1294156312942505, + "learning_rate": 1.7488151539006203e-05, + "loss": 1.5371, + "step": 38390 + }, + { + "epoch": 8.089745615421078, + "grad_norm": 1.2689809799194336, + "learning_rate": 1.745077237261994e-05, + "loss": 1.5928, + "step": 38400 + }, + { + "epoch": 8.091852320008426, + "grad_norm": 1.1921470165252686, + "learning_rate": 1.7413429376963808e-05, + "loss": 1.5385, + "step": 38410 + }, + { + "epoch": 8.093959024595776, + "grad_norm": 1.2434748411178589, + "learning_rate": 1.7376122568400532e-05, + "loss": 1.5235, + "step": 38420 + }, + { + "epoch": 8.096065729183126, + "grad_norm": 1.590907335281372, + "learning_rate": 1.7338851963276825e-05, + "loss": 1.5797, + "step": 38430 + }, + { + "epoch": 8.098172433770474, + "grad_norm": 1.129142165184021, + "learning_rate": 1.730161757792367e-05, + "loss": 1.5699, + "step": 38440 + }, + { + "epoch": 8.100279138357823, + "grad_norm": 1.270219326019287, + "learning_rate": 1.7264419428656075e-05, + "loss": 1.5919, + "step": 38450 + }, + { + "epoch": 8.102385842945173, + "grad_norm": 1.2105395793914795, + "learning_rate": 1.7227257531773223e-05, + "loss": 1.5551, + "step": 38460 + }, + { + "epoch": 8.104492547532523, + "grad_norm": 1.2144361734390259, + "learning_rate": 1.7190131903558425e-05, + "loss": 1.5649, + "step": 38470 + }, + { + "epoch": 8.10659925211987, + "grad_norm": 1.217391014099121, + "learning_rate": 1.7153042560279065e-05, + "loss": 1.5267, + "step": 38480 + }, + { + "epoch": 8.10870595670722, + "grad_norm": 1.2412052154541016, + "learning_rate": 1.7115989518186615e-05, + "loss": 1.528, + "step": 38490 + }, + { + "epoch": 8.11081266129457, + "grad_norm": 1.7172247171401978, + "learning_rate": 1.707897279351671e-05, + "loss": 1.5828, + "step": 38500 + }, + { + "epoch": 8.11291936588192, + "grad_norm": 1.2050105333328247, + "learning_rate": 1.7041992402488994e-05, + "loss": 1.5875, + "step": 38510 + }, + { + "epoch": 8.115026070469268, + "grad_norm": 1.7654755115509033, + "learning_rate": 1.7005048361307262e-05, + "loss": 1.5203, + "step": 38520 + }, + { + "epoch": 8.117132775056618, + "grad_norm": 1.2826247215270996, + "learning_rate": 1.6968140686159328e-05, + "loss": 1.5569, + "step": 38530 + }, + { + "epoch": 8.119239479643968, + "grad_norm": 1.1842323541641235, + "learning_rate": 1.693126939321705e-05, + "loss": 1.538, + "step": 38540 + }, + { + "epoch": 8.121346184231315, + "grad_norm": 1.1324920654296875, + "learning_rate": 1.6894434498636446e-05, + "loss": 1.5795, + "step": 38550 + }, + { + "epoch": 8.123452888818665, + "grad_norm": 1.240378975868225, + "learning_rate": 1.6857636018557466e-05, + "loss": 1.5545, + "step": 38560 + }, + { + "epoch": 8.125559593406015, + "grad_norm": 1.0717569589614868, + "learning_rate": 1.682087396910422e-05, + "loss": 1.568, + "step": 38570 + }, + { + "epoch": 8.127666297993365, + "grad_norm": 1.266964077949524, + "learning_rate": 1.6784148366384754e-05, + "loss": 1.5935, + "step": 38580 + }, + { + "epoch": 8.129773002580713, + "grad_norm": 1.1756432056427002, + "learning_rate": 1.674745922649118e-05, + "loss": 1.5398, + "step": 38590 + }, + { + "epoch": 8.131879707168062, + "grad_norm": 1.2381880283355713, + "learning_rate": 1.671080656549965e-05, + "loss": 1.5288, + "step": 38600 + }, + { + "epoch": 8.133986411755412, + "grad_norm": 1.2864570617675781, + "learning_rate": 1.667419039947037e-05, + "loss": 1.5336, + "step": 38610 + }, + { + "epoch": 8.13609311634276, + "grad_norm": 1.2409577369689941, + "learning_rate": 1.6637610744447407e-05, + "loss": 1.5452, + "step": 38620 + }, + { + "epoch": 8.13819982093011, + "grad_norm": 1.2823350429534912, + "learning_rate": 1.6601067616458987e-05, + "loss": 1.5658, + "step": 38630 + }, + { + "epoch": 8.14030652551746, + "grad_norm": 1.2008086442947388, + "learning_rate": 1.656456103151728e-05, + "loss": 1.5276, + "step": 38640 + }, + { + "epoch": 8.14241323010481, + "grad_norm": 1.245653748512268, + "learning_rate": 1.6528091005618428e-05, + "loss": 1.5227, + "step": 38650 + }, + { + "epoch": 8.144519934692157, + "grad_norm": 1.184909701347351, + "learning_rate": 1.6491657554742557e-05, + "loss": 1.5415, + "step": 38660 + }, + { + "epoch": 8.146626639279507, + "grad_norm": 1.199066400527954, + "learning_rate": 1.6455260694853736e-05, + "loss": 1.5847, + "step": 38670 + }, + { + "epoch": 8.148733343866857, + "grad_norm": 1.2071822881698608, + "learning_rate": 1.6418900441900087e-05, + "loss": 1.5406, + "step": 38680 + }, + { + "epoch": 8.150840048454205, + "grad_norm": 1.2048323154449463, + "learning_rate": 1.6382576811813655e-05, + "loss": 1.5376, + "step": 38690 + }, + { + "epoch": 8.152946753041554, + "grad_norm": 1.1268657445907593, + "learning_rate": 1.6346289820510363e-05, + "loss": 1.5304, + "step": 38700 + }, + { + "epoch": 8.155053457628904, + "grad_norm": 1.2604269981384277, + "learning_rate": 1.631003948389016e-05, + "loss": 1.5056, + "step": 38710 + }, + { + "epoch": 8.157160162216254, + "grad_norm": 1.1815440654754639, + "learning_rate": 1.6273825817836963e-05, + "loss": 1.6137, + "step": 38720 + }, + { + "epoch": 8.159266866803602, + "grad_norm": 1.1286557912826538, + "learning_rate": 1.6237648838218532e-05, + "loss": 1.5052, + "step": 38730 + }, + { + "epoch": 8.161373571390952, + "grad_norm": 1.3425495624542236, + "learning_rate": 1.6201508560886602e-05, + "loss": 1.5563, + "step": 38740 + }, + { + "epoch": 8.163480275978301, + "grad_norm": 1.1465646028518677, + "learning_rate": 1.6165405001676793e-05, + "loss": 1.5663, + "step": 38750 + }, + { + "epoch": 8.16558698056565, + "grad_norm": 1.232774019241333, + "learning_rate": 1.612933817640868e-05, + "loss": 1.5836, + "step": 38760 + }, + { + "epoch": 8.167693685152999, + "grad_norm": 1.1636687517166138, + "learning_rate": 1.6093308100885774e-05, + "loss": 1.5692, + "step": 38770 + }, + { + "epoch": 8.169800389740349, + "grad_norm": 1.1775188446044922, + "learning_rate": 1.605731479089534e-05, + "loss": 1.5056, + "step": 38780 + }, + { + "epoch": 8.171907094327699, + "grad_norm": 1.2304803133010864, + "learning_rate": 1.6021358262208665e-05, + "loss": 1.5485, + "step": 38790 + }, + { + "epoch": 8.174013798915047, + "grad_norm": 1.2564809322357178, + "learning_rate": 1.5985438530580908e-05, + "loss": 1.5348, + "step": 38800 + }, + { + "epoch": 8.176120503502396, + "grad_norm": 1.2680792808532715, + "learning_rate": 1.5949555611751044e-05, + "loss": 1.5888, + "step": 38810 + }, + { + "epoch": 8.178227208089746, + "grad_norm": 1.1961019039154053, + "learning_rate": 1.5913709521441988e-05, + "loss": 1.5454, + "step": 38820 + }, + { + "epoch": 8.180333912677096, + "grad_norm": 1.2921595573425293, + "learning_rate": 1.5877900275360412e-05, + "loss": 1.57, + "step": 38830 + }, + { + "epoch": 8.182440617264444, + "grad_norm": 1.3242322206497192, + "learning_rate": 1.5842127889196956e-05, + "loss": 1.601, + "step": 38840 + }, + { + "epoch": 8.184547321851793, + "grad_norm": 1.1462351083755493, + "learning_rate": 1.580639237862608e-05, + "loss": 1.6022, + "step": 38850 + }, + { + "epoch": 8.186654026439143, + "grad_norm": 1.270314335823059, + "learning_rate": 1.5770693759306055e-05, + "loss": 1.6073, + "step": 38860 + }, + { + "epoch": 8.188760731026491, + "grad_norm": 1.2723679542541504, + "learning_rate": 1.5735032046878973e-05, + "loss": 1.5688, + "step": 38870 + }, + { + "epoch": 8.190867435613841, + "grad_norm": 1.1393686532974243, + "learning_rate": 1.5699407256970833e-05, + "loss": 1.5085, + "step": 38880 + }, + { + "epoch": 8.19297414020119, + "grad_norm": 1.1237406730651855, + "learning_rate": 1.5663819405191373e-05, + "loss": 1.5975, + "step": 38890 + }, + { + "epoch": 8.19508084478854, + "grad_norm": 1.1238974332809448, + "learning_rate": 1.5628268507134224e-05, + "loss": 1.517, + "step": 38900 + }, + { + "epoch": 8.197187549375888, + "grad_norm": 1.300145149230957, + "learning_rate": 1.5592754578376724e-05, + "loss": 1.6047, + "step": 38910 + }, + { + "epoch": 8.199294253963238, + "grad_norm": 1.1321696043014526, + "learning_rate": 1.5557277634480083e-05, + "loss": 1.5334, + "step": 38920 + }, + { + "epoch": 8.201400958550588, + "grad_norm": 1.1901246309280396, + "learning_rate": 1.5521837690989338e-05, + "loss": 1.5728, + "step": 38930 + }, + { + "epoch": 8.203507663137936, + "grad_norm": 1.2294230461120605, + "learning_rate": 1.5486434763433222e-05, + "loss": 1.5832, + "step": 38940 + }, + { + "epoch": 8.205614367725286, + "grad_norm": 1.2560628652572632, + "learning_rate": 1.5451068867324293e-05, + "loss": 1.5975, + "step": 38950 + }, + { + "epoch": 8.207721072312635, + "grad_norm": 1.312564492225647, + "learning_rate": 1.541574001815892e-05, + "loss": 1.568, + "step": 38960 + }, + { + "epoch": 8.209827776899985, + "grad_norm": 1.3545145988464355, + "learning_rate": 1.5380448231417144e-05, + "loss": 1.6047, + "step": 38970 + }, + { + "epoch": 8.211934481487333, + "grad_norm": 1.142449975013733, + "learning_rate": 1.5345193522562917e-05, + "loss": 1.5515, + "step": 38980 + }, + { + "epoch": 8.214041186074683, + "grad_norm": 1.1972579956054688, + "learning_rate": 1.530997590704375e-05, + "loss": 1.585, + "step": 38990 + }, + { + "epoch": 8.216147890662032, + "grad_norm": 1.2056550979614258, + "learning_rate": 1.527479540029104e-05, + "loss": 1.5195, + "step": 39000 + }, + { + "epoch": 8.21825459524938, + "grad_norm": 1.3925740718841553, + "learning_rate": 1.5239652017719919e-05, + "loss": 1.6004, + "step": 39010 + }, + { + "epoch": 8.22036129983673, + "grad_norm": 1.4021625518798828, + "learning_rate": 1.5204545774729207e-05, + "loss": 1.5881, + "step": 39020 + }, + { + "epoch": 8.22246800442408, + "grad_norm": 1.195132851600647, + "learning_rate": 1.5169476686701423e-05, + "loss": 1.5715, + "step": 39030 + }, + { + "epoch": 8.22457470901143, + "grad_norm": 1.2492402791976929, + "learning_rate": 1.5134444769002909e-05, + "loss": 1.4855, + "step": 39040 + }, + { + "epoch": 8.226681413598778, + "grad_norm": 1.1475170850753784, + "learning_rate": 1.5099450036983598e-05, + "loss": 1.5519, + "step": 39050 + }, + { + "epoch": 8.228788118186127, + "grad_norm": 1.213303565979004, + "learning_rate": 1.5064492505977234e-05, + "loss": 1.5203, + "step": 39060 + }, + { + "epoch": 8.230894822773477, + "grad_norm": 1.260926604270935, + "learning_rate": 1.5029572191301211e-05, + "loss": 1.5361, + "step": 39070 + }, + { + "epoch": 8.233001527360825, + "grad_norm": 1.3445188999176025, + "learning_rate": 1.4994689108256576e-05, + "loss": 1.5805, + "step": 39080 + }, + { + "epoch": 8.235108231948175, + "grad_norm": 1.1416704654693604, + "learning_rate": 1.4959843272128172e-05, + "loss": 1.5933, + "step": 39090 + }, + { + "epoch": 8.237214936535525, + "grad_norm": 1.2642247676849365, + "learning_rate": 1.4925034698184393e-05, + "loss": 1.5494, + "step": 39100 + }, + { + "epoch": 8.239321641122874, + "grad_norm": 1.2801305055618286, + "learning_rate": 1.4890263401677429e-05, + "loss": 1.6004, + "step": 39110 + }, + { + "epoch": 8.241428345710222, + "grad_norm": 1.2009177207946777, + "learning_rate": 1.4855529397843038e-05, + "loss": 1.5567, + "step": 39120 + }, + { + "epoch": 8.243535050297572, + "grad_norm": 1.2630966901779175, + "learning_rate": 1.4820832701900667e-05, + "loss": 1.5823, + "step": 39130 + }, + { + "epoch": 8.245641754884922, + "grad_norm": 1.2626142501831055, + "learning_rate": 1.4786173329053466e-05, + "loss": 1.567, + "step": 39140 + }, + { + "epoch": 8.24774845947227, + "grad_norm": 1.112261414527893, + "learning_rate": 1.4751551294488154e-05, + "loss": 1.5831, + "step": 39150 + }, + { + "epoch": 8.24985516405962, + "grad_norm": 1.3133232593536377, + "learning_rate": 1.4716966613375116e-05, + "loss": 1.5637, + "step": 39160 + }, + { + "epoch": 8.25196186864697, + "grad_norm": 1.27349853515625, + "learning_rate": 1.4682419300868423e-05, + "loss": 1.5465, + "step": 39170 + }, + { + "epoch": 8.254068573234319, + "grad_norm": 1.2491436004638672, + "learning_rate": 1.4647909372105672e-05, + "loss": 1.5928, + "step": 39180 + }, + { + "epoch": 8.256175277821667, + "grad_norm": 1.2191392183303833, + "learning_rate": 1.4613436842208183e-05, + "loss": 1.5809, + "step": 39190 + }, + { + "epoch": 8.258281982409017, + "grad_norm": 1.204507827758789, + "learning_rate": 1.4579001726280828e-05, + "loss": 1.5881, + "step": 39200 + }, + { + "epoch": 8.260388686996366, + "grad_norm": 1.1342881917953491, + "learning_rate": 1.454460403941207e-05, + "loss": 1.5041, + "step": 39210 + }, + { + "epoch": 8.262495391583716, + "grad_norm": 1.216679573059082, + "learning_rate": 1.451024379667404e-05, + "loss": 1.5633, + "step": 39220 + }, + { + "epoch": 8.264602096171064, + "grad_norm": 1.2390002012252808, + "learning_rate": 1.4475921013122406e-05, + "loss": 1.5611, + "step": 39230 + }, + { + "epoch": 8.266708800758414, + "grad_norm": 1.191501259803772, + "learning_rate": 1.4441635703796408e-05, + "loss": 1.5184, + "step": 39240 + }, + { + "epoch": 8.268815505345763, + "grad_norm": 1.177857756614685, + "learning_rate": 1.4407387883718959e-05, + "loss": 1.604, + "step": 39250 + }, + { + "epoch": 8.270922209933111, + "grad_norm": 1.2561581134796143, + "learning_rate": 1.4373177567896413e-05, + "loss": 1.5755, + "step": 39260 + }, + { + "epoch": 8.273028914520461, + "grad_norm": 1.167371153831482, + "learning_rate": 1.433900477131882e-05, + "loss": 1.5556, + "step": 39270 + }, + { + "epoch": 8.275135619107811, + "grad_norm": 1.2407907247543335, + "learning_rate": 1.4304869508959707e-05, + "loss": 1.5155, + "step": 39280 + }, + { + "epoch": 8.27724232369516, + "grad_norm": 1.125529408454895, + "learning_rate": 1.427077179577615e-05, + "loss": 1.5944, + "step": 39290 + }, + { + "epoch": 8.279349028282509, + "grad_norm": 1.4028643369674683, + "learning_rate": 1.4236711646708844e-05, + "loss": 1.5687, + "step": 39300 + }, + { + "epoch": 8.281455732869858, + "grad_norm": 1.199676275253296, + "learning_rate": 1.4202689076681962e-05, + "loss": 1.5419, + "step": 39310 + }, + { + "epoch": 8.283562437457208, + "grad_norm": 1.2607035636901855, + "learning_rate": 1.4168704100603214e-05, + "loss": 1.5766, + "step": 39320 + }, + { + "epoch": 8.285669142044556, + "grad_norm": 1.2903956174850464, + "learning_rate": 1.4134756733363886e-05, + "loss": 1.5419, + "step": 39330 + }, + { + "epoch": 8.287775846631906, + "grad_norm": 1.4275097846984863, + "learning_rate": 1.41008469898387e-05, + "loss": 1.5228, + "step": 39340 + }, + { + "epoch": 8.289882551219256, + "grad_norm": 1.247402310371399, + "learning_rate": 1.4066974884886008e-05, + "loss": 1.6018, + "step": 39350 + }, + { + "epoch": 8.291989255806605, + "grad_norm": 1.2947194576263428, + "learning_rate": 1.4033140433347569e-05, + "loss": 1.5944, + "step": 39360 + }, + { + "epoch": 8.294095960393953, + "grad_norm": 1.195644736289978, + "learning_rate": 1.3999343650048669e-05, + "loss": 1.5578, + "step": 39370 + }, + { + "epoch": 8.296202664981303, + "grad_norm": 1.1885980367660522, + "learning_rate": 1.396558454979814e-05, + "loss": 1.5829, + "step": 39380 + }, + { + "epoch": 8.298309369568653, + "grad_norm": 1.2197813987731934, + "learning_rate": 1.3931863147388202e-05, + "loss": 1.5968, + "step": 39390 + }, + { + "epoch": 8.300416074156, + "grad_norm": 1.19045090675354, + "learning_rate": 1.3898179457594684e-05, + "loss": 1.549, + "step": 39400 + }, + { + "epoch": 8.30252277874335, + "grad_norm": 1.2025426626205444, + "learning_rate": 1.386453349517679e-05, + "loss": 1.5336, + "step": 39410 + }, + { + "epoch": 8.3046294833307, + "grad_norm": 1.3740406036376953, + "learning_rate": 1.3830925274877216e-05, + "loss": 1.5438, + "step": 39420 + }, + { + "epoch": 8.30673618791805, + "grad_norm": 1.0788160562515259, + "learning_rate": 1.3797354811422158e-05, + "loss": 1.511, + "step": 39430 + }, + { + "epoch": 8.308842892505398, + "grad_norm": 1.213222861289978, + "learning_rate": 1.376382211952123e-05, + "loss": 1.5702, + "step": 39440 + }, + { + "epoch": 8.310949597092748, + "grad_norm": 1.2587624788284302, + "learning_rate": 1.3730327213867478e-05, + "loss": 1.5513, + "step": 39450 + }, + { + "epoch": 8.313056301680097, + "grad_norm": 1.2077735662460327, + "learning_rate": 1.369687010913746e-05, + "loss": 1.5716, + "step": 39460 + }, + { + "epoch": 8.315163006267445, + "grad_norm": 1.3706378936767578, + "learning_rate": 1.3663450819991107e-05, + "loss": 1.5903, + "step": 39470 + }, + { + "epoch": 8.317269710854795, + "grad_norm": 1.298016905784607, + "learning_rate": 1.363006936107183e-05, + "loss": 1.5508, + "step": 39480 + }, + { + "epoch": 8.319376415442145, + "grad_norm": 1.2610794305801392, + "learning_rate": 1.3596725747006411e-05, + "loss": 1.5297, + "step": 39490 + }, + { + "epoch": 8.321483120029495, + "grad_norm": 1.2800568342208862, + "learning_rate": 1.3563419992405068e-05, + "loss": 1.5699, + "step": 39500 + }, + { + "epoch": 8.323589824616842, + "grad_norm": 1.2344589233398438, + "learning_rate": 1.3530152111861483e-05, + "loss": 1.5382, + "step": 39510 + }, + { + "epoch": 8.325696529204192, + "grad_norm": 1.2370203733444214, + "learning_rate": 1.349692211995266e-05, + "loss": 1.5622, + "step": 39520 + }, + { + "epoch": 8.327803233791542, + "grad_norm": 1.173630714416504, + "learning_rate": 1.346373003123903e-05, + "loss": 1.6422, + "step": 39530 + }, + { + "epoch": 8.32990993837889, + "grad_norm": 1.1007499694824219, + "learning_rate": 1.343057586026446e-05, + "loss": 1.5922, + "step": 39540 + }, + { + "epoch": 8.33201664296624, + "grad_norm": 1.3459322452545166, + "learning_rate": 1.339745962155613e-05, + "loss": 1.5034, + "step": 39550 + }, + { + "epoch": 8.33412334755359, + "grad_norm": 1.2300962209701538, + "learning_rate": 1.3364381329624687e-05, + "loss": 1.5769, + "step": 39560 + }, + { + "epoch": 8.33623005214094, + "grad_norm": 1.2691370248794556, + "learning_rate": 1.333134099896406e-05, + "loss": 1.6051, + "step": 39570 + }, + { + "epoch": 8.338336756728287, + "grad_norm": 1.3355259895324707, + "learning_rate": 1.3298338644051578e-05, + "loss": 1.5746, + "step": 39580 + }, + { + "epoch": 8.340443461315637, + "grad_norm": 1.2271134853363037, + "learning_rate": 1.3265374279347975e-05, + "loss": 1.5573, + "step": 39590 + }, + { + "epoch": 8.342550165902987, + "grad_norm": 1.389338731765747, + "learning_rate": 1.3232447919297274e-05, + "loss": 1.564, + "step": 39600 + }, + { + "epoch": 8.344656870490336, + "grad_norm": 1.3343056440353394, + "learning_rate": 1.3199559578326858e-05, + "loss": 1.5654, + "step": 39610 + }, + { + "epoch": 8.346763575077684, + "grad_norm": 1.2581366300582886, + "learning_rate": 1.3166709270847511e-05, + "loss": 1.5501, + "step": 39620 + }, + { + "epoch": 8.348870279665034, + "grad_norm": 1.626578688621521, + "learning_rate": 1.313389701125325e-05, + "loss": 1.5701, + "step": 39630 + }, + { + "epoch": 8.350976984252384, + "grad_norm": 1.2747974395751953, + "learning_rate": 1.3101122813921529e-05, + "loss": 1.5525, + "step": 39640 + }, + { + "epoch": 8.353083688839732, + "grad_norm": 1.28239905834198, + "learning_rate": 1.3068386693213053e-05, + "loss": 1.5239, + "step": 39650 + }, + { + "epoch": 8.355190393427081, + "grad_norm": 1.2086161375045776, + "learning_rate": 1.3035688663471834e-05, + "loss": 1.5576, + "step": 39660 + }, + { + "epoch": 8.357297098014431, + "grad_norm": 1.1600311994552612, + "learning_rate": 1.3003028739025258e-05, + "loss": 1.4787, + "step": 39670 + }, + { + "epoch": 8.359403802601781, + "grad_norm": 1.2429834604263306, + "learning_rate": 1.2970406934183954e-05, + "loss": 1.5686, + "step": 39680 + }, + { + "epoch": 8.361510507189129, + "grad_norm": 1.1623550653457642, + "learning_rate": 1.29378232632419e-05, + "loss": 1.5772, + "step": 39690 + }, + { + "epoch": 8.363617211776479, + "grad_norm": 1.1314207315444946, + "learning_rate": 1.2905277740476318e-05, + "loss": 1.5337, + "step": 39700 + }, + { + "epoch": 8.365723916363828, + "grad_norm": 1.2326688766479492, + "learning_rate": 1.2872770380147703e-05, + "loss": 1.4982, + "step": 39710 + }, + { + "epoch": 8.367830620951176, + "grad_norm": 1.2618603706359863, + "learning_rate": 1.2840301196499893e-05, + "loss": 1.5631, + "step": 39720 + }, + { + "epoch": 8.369937325538526, + "grad_norm": 1.1323174238204956, + "learning_rate": 1.2807870203760009e-05, + "loss": 1.5235, + "step": 39730 + }, + { + "epoch": 8.372044030125876, + "grad_norm": 1.1603468656539917, + "learning_rate": 1.2775477416138294e-05, + "loss": 1.5585, + "step": 39740 + }, + { + "epoch": 8.374150734713226, + "grad_norm": 1.1536343097686768, + "learning_rate": 1.2743122847828415e-05, + "loss": 1.5427, + "step": 39750 + }, + { + "epoch": 8.376257439300574, + "grad_norm": 1.1194301843643188, + "learning_rate": 1.271080651300719e-05, + "loss": 1.5408, + "step": 39760 + }, + { + "epoch": 8.378364143887923, + "grad_norm": 1.503873586654663, + "learning_rate": 1.2678528425834758e-05, + "loss": 1.6087, + "step": 39770 + }, + { + "epoch": 8.380470848475273, + "grad_norm": 1.1918821334838867, + "learning_rate": 1.2646288600454448e-05, + "loss": 1.5488, + "step": 39780 + }, + { + "epoch": 8.382577553062621, + "grad_norm": 1.1944622993469238, + "learning_rate": 1.2614087050992796e-05, + "loss": 1.6118, + "step": 39790 + }, + { + "epoch": 8.38468425764997, + "grad_norm": 1.2867789268493652, + "learning_rate": 1.2581923791559647e-05, + "loss": 1.5397, + "step": 39800 + }, + { + "epoch": 8.38679096223732, + "grad_norm": 1.3712007999420166, + "learning_rate": 1.2549798836248072e-05, + "loss": 1.5778, + "step": 39810 + }, + { + "epoch": 8.38889766682467, + "grad_norm": 1.2475578784942627, + "learning_rate": 1.2517712199134224e-05, + "loss": 1.5816, + "step": 39820 + }, + { + "epoch": 8.391004371412018, + "grad_norm": 1.3664524555206299, + "learning_rate": 1.2485663894277611e-05, + "loss": 1.5837, + "step": 39830 + }, + { + "epoch": 8.393111075999368, + "grad_norm": 1.4213124513626099, + "learning_rate": 1.2453653935720867e-05, + "loss": 1.5526, + "step": 39840 + }, + { + "epoch": 8.395217780586718, + "grad_norm": 1.2256556749343872, + "learning_rate": 1.2421682337489882e-05, + "loss": 1.5103, + "step": 39850 + }, + { + "epoch": 8.397324485174067, + "grad_norm": 1.2881014347076416, + "learning_rate": 1.2389749113593684e-05, + "loss": 1.5409, + "step": 39860 + }, + { + "epoch": 8.399431189761415, + "grad_norm": 1.249493956565857, + "learning_rate": 1.2357854278024484e-05, + "loss": 1.5371, + "step": 39870 + }, + { + "epoch": 8.401537894348765, + "grad_norm": 1.1821091175079346, + "learning_rate": 1.2325997844757719e-05, + "loss": 1.5334, + "step": 39880 + }, + { + "epoch": 8.403644598936115, + "grad_norm": 1.125915288925171, + "learning_rate": 1.2294179827752007e-05, + "loss": 1.6042, + "step": 39890 + }, + { + "epoch": 8.405751303523463, + "grad_norm": 1.2114832401275635, + "learning_rate": 1.2262400240949023e-05, + "loss": 1.5729, + "step": 39900 + }, + { + "epoch": 8.407858008110813, + "grad_norm": 1.1864620447158813, + "learning_rate": 1.2230659098273744e-05, + "loss": 1.5487, + "step": 39910 + }, + { + "epoch": 8.409964712698162, + "grad_norm": 1.3160470724105835, + "learning_rate": 1.2198956413634199e-05, + "loss": 1.5549, + "step": 39920 + }, + { + "epoch": 8.412071417285512, + "grad_norm": 1.2356914281845093, + "learning_rate": 1.216729220092162e-05, + "loss": 1.5617, + "step": 39930 + }, + { + "epoch": 8.41417812187286, + "grad_norm": 1.2617237567901611, + "learning_rate": 1.213566647401041e-05, + "loss": 1.6353, + "step": 39940 + }, + { + "epoch": 8.41628482646021, + "grad_norm": 1.1665171384811401, + "learning_rate": 1.2104079246757993e-05, + "loss": 1.5639, + "step": 39950 + }, + { + "epoch": 8.41839153104756, + "grad_norm": 1.424302101135254, + "learning_rate": 1.2072530533005012e-05, + "loss": 1.5733, + "step": 39960 + }, + { + "epoch": 8.420498235634907, + "grad_norm": 1.1204131841659546, + "learning_rate": 1.2041020346575272e-05, + "loss": 1.5616, + "step": 39970 + }, + { + "epoch": 8.422604940222257, + "grad_norm": 1.3583896160125732, + "learning_rate": 1.2009548701275598e-05, + "loss": 1.5992, + "step": 39980 + }, + { + "epoch": 8.424711644809607, + "grad_norm": 1.1708359718322754, + "learning_rate": 1.197811561089598e-05, + "loss": 1.5822, + "step": 39990 + }, + { + "epoch": 8.426818349396957, + "grad_norm": 1.2146726846694946, + "learning_rate": 1.1946721089209479e-05, + "loss": 1.5704, + "step": 40000 + }, + { + "epoch": 8.428925053984305, + "grad_norm": 1.3206424713134766, + "learning_rate": 1.1915365149972324e-05, + "loss": 1.4784, + "step": 40010 + }, + { + "epoch": 8.431031758571654, + "grad_norm": 1.2263844013214111, + "learning_rate": 1.1884047806923815e-05, + "loss": 1.5193, + "step": 40020 + }, + { + "epoch": 8.433138463159004, + "grad_norm": 1.1511719226837158, + "learning_rate": 1.1852769073786263e-05, + "loss": 1.5587, + "step": 40030 + }, + { + "epoch": 8.435245167746352, + "grad_norm": 1.2120052576065063, + "learning_rate": 1.182152896426515e-05, + "loss": 1.5952, + "step": 40040 + }, + { + "epoch": 8.437351872333702, + "grad_norm": 1.214147925376892, + "learning_rate": 1.1790327492049025e-05, + "loss": 1.6125, + "step": 40050 + }, + { + "epoch": 8.439458576921052, + "grad_norm": 1.2446225881576538, + "learning_rate": 1.1759164670809486e-05, + "loss": 1.5809, + "step": 40060 + }, + { + "epoch": 8.441565281508401, + "grad_norm": 1.2550058364868164, + "learning_rate": 1.1728040514201144e-05, + "loss": 1.5168, + "step": 40070 + }, + { + "epoch": 8.44367198609575, + "grad_norm": 1.2683697938919067, + "learning_rate": 1.169695503586179e-05, + "loss": 1.5667, + "step": 40080 + }, + { + "epoch": 8.445778690683099, + "grad_norm": 1.1972013711929321, + "learning_rate": 1.1665908249412161e-05, + "loss": 1.5599, + "step": 40090 + }, + { + "epoch": 8.447885395270449, + "grad_norm": 1.1732401847839355, + "learning_rate": 1.163490016845611e-05, + "loss": 1.5032, + "step": 40100 + }, + { + "epoch": 8.449992099857797, + "grad_norm": 1.2353123426437378, + "learning_rate": 1.1603930806580444e-05, + "loss": 1.6133, + "step": 40110 + }, + { + "epoch": 8.452098804445146, + "grad_norm": 1.1491025686264038, + "learning_rate": 1.1573000177355086e-05, + "loss": 1.5902, + "step": 40120 + }, + { + "epoch": 8.454205509032496, + "grad_norm": 1.2281687259674072, + "learning_rate": 1.1542108294332998e-05, + "loss": 1.5905, + "step": 40130 + }, + { + "epoch": 8.456312213619846, + "grad_norm": 1.223631501197815, + "learning_rate": 1.1511255171050084e-05, + "loss": 1.5575, + "step": 40140 + }, + { + "epoch": 8.458418918207194, + "grad_norm": 1.2047030925750732, + "learning_rate": 1.1480440821025296e-05, + "loss": 1.5834, + "step": 40150 + }, + { + "epoch": 8.460525622794544, + "grad_norm": 1.2944790124893188, + "learning_rate": 1.1449665257760656e-05, + "loss": 1.5372, + "step": 40160 + }, + { + "epoch": 8.462632327381893, + "grad_norm": 1.3119227886199951, + "learning_rate": 1.1418928494741087e-05, + "loss": 1.5558, + "step": 40170 + }, + { + "epoch": 8.464739031969241, + "grad_norm": 1.5002557039260864, + "learning_rate": 1.1388230545434653e-05, + "loss": 1.5156, + "step": 40180 + }, + { + "epoch": 8.466845736556591, + "grad_norm": 1.2909941673278809, + "learning_rate": 1.1357571423292213e-05, + "loss": 1.6109, + "step": 40190 + }, + { + "epoch": 8.46895244114394, + "grad_norm": 1.1943180561065674, + "learning_rate": 1.1326951141747788e-05, + "loss": 1.548, + "step": 40200 + }, + { + "epoch": 8.47105914573129, + "grad_norm": 1.3649312257766724, + "learning_rate": 1.1296369714218324e-05, + "loss": 1.5584, + "step": 40210 + }, + { + "epoch": 8.473165850318638, + "grad_norm": 1.2651944160461426, + "learning_rate": 1.1265827154103703e-05, + "loss": 1.6027, + "step": 40220 + }, + { + "epoch": 8.475272554905988, + "grad_norm": 1.1884119510650635, + "learning_rate": 1.1235323474786841e-05, + "loss": 1.554, + "step": 40230 + }, + { + "epoch": 8.477379259493338, + "grad_norm": 1.2143009901046753, + "learning_rate": 1.120485868963358e-05, + "loss": 1.5674, + "step": 40240 + }, + { + "epoch": 8.479485964080688, + "grad_norm": 1.1663596630096436, + "learning_rate": 1.1174432811992685e-05, + "loss": 1.5234, + "step": 40250 + }, + { + "epoch": 8.481592668668036, + "grad_norm": 1.1769542694091797, + "learning_rate": 1.1144045855195973e-05, + "loss": 1.5517, + "step": 40260 + }, + { + "epoch": 8.483699373255385, + "grad_norm": 1.2329270839691162, + "learning_rate": 1.1113697832558101e-05, + "loss": 1.5428, + "step": 40270 + }, + { + "epoch": 8.485806077842735, + "grad_norm": 1.2631030082702637, + "learning_rate": 1.1083388757376712e-05, + "loss": 1.5605, + "step": 40280 + }, + { + "epoch": 8.487912782430083, + "grad_norm": 1.1870076656341553, + "learning_rate": 1.1053118642932425e-05, + "loss": 1.5147, + "step": 40290 + }, + { + "epoch": 8.490019487017433, + "grad_norm": 1.1924930810928345, + "learning_rate": 1.1022887502488688e-05, + "loss": 1.5811, + "step": 40300 + }, + { + "epoch": 8.492126191604783, + "grad_norm": 1.269508719444275, + "learning_rate": 1.0992695349291981e-05, + "loss": 1.5839, + "step": 40310 + }, + { + "epoch": 8.494232896192132, + "grad_norm": 1.3270164728164673, + "learning_rate": 1.0962542196571634e-05, + "loss": 1.5849, + "step": 40320 + }, + { + "epoch": 8.49633960077948, + "grad_norm": 1.3483185768127441, + "learning_rate": 1.0932428057539879e-05, + "loss": 1.545, + "step": 40330 + }, + { + "epoch": 8.49844630536683, + "grad_norm": 1.2481123208999634, + "learning_rate": 1.0902352945391903e-05, + "loss": 1.5465, + "step": 40340 + }, + { + "epoch": 8.50055300995418, + "grad_norm": 1.2228620052337646, + "learning_rate": 1.0872316873305766e-05, + "loss": 1.5942, + "step": 40350 + }, + { + "epoch": 8.502659714541528, + "grad_norm": 1.1880698204040527, + "learning_rate": 1.0842319854442395e-05, + "loss": 1.533, + "step": 40360 + }, + { + "epoch": 8.504766419128877, + "grad_norm": 1.247296929359436, + "learning_rate": 1.0812361901945678e-05, + "loss": 1.5614, + "step": 40370 + }, + { + "epoch": 8.506873123716227, + "grad_norm": 1.2996197938919067, + "learning_rate": 1.078244302894229e-05, + "loss": 1.5818, + "step": 40380 + }, + { + "epoch": 8.508979828303577, + "grad_norm": 1.2891303300857544, + "learning_rate": 1.0752563248541891e-05, + "loss": 1.5511, + "step": 40390 + }, + { + "epoch": 8.511086532890925, + "grad_norm": 1.3689333200454712, + "learning_rate": 1.0722722573836907e-05, + "loss": 1.611, + "step": 40400 + }, + { + "epoch": 8.513193237478275, + "grad_norm": 1.157731533050537, + "learning_rate": 1.069292101790268e-05, + "loss": 1.524, + "step": 40410 + }, + { + "epoch": 8.515299942065624, + "grad_norm": 1.2392263412475586, + "learning_rate": 1.0663158593797428e-05, + "loss": 1.5202, + "step": 40420 + }, + { + "epoch": 8.517406646652972, + "grad_norm": 1.198107123374939, + "learning_rate": 1.063343531456219e-05, + "loss": 1.5084, + "step": 40430 + }, + { + "epoch": 8.519513351240322, + "grad_norm": 1.1941255331039429, + "learning_rate": 1.0603751193220846e-05, + "loss": 1.5902, + "step": 40440 + }, + { + "epoch": 8.521620055827672, + "grad_norm": 1.1136577129364014, + "learning_rate": 1.0574106242780179e-05, + "loss": 1.5638, + "step": 40450 + }, + { + "epoch": 8.523726760415022, + "grad_norm": 1.2597277164459229, + "learning_rate": 1.0544500476229713e-05, + "loss": 1.5807, + "step": 40460 + }, + { + "epoch": 8.52583346500237, + "grad_norm": 1.1556949615478516, + "learning_rate": 1.0514933906541901e-05, + "loss": 1.5527, + "step": 40470 + }, + { + "epoch": 8.52794016958972, + "grad_norm": 1.279253363609314, + "learning_rate": 1.048540654667195e-05, + "loss": 1.5874, + "step": 40480 + }, + { + "epoch": 8.530046874177069, + "grad_norm": 1.2556148767471313, + "learning_rate": 1.0455918409557908e-05, + "loss": 1.5802, + "step": 40490 + }, + { + "epoch": 8.532153578764417, + "grad_norm": 1.2259470224380493, + "learning_rate": 1.0426469508120662e-05, + "loss": 1.5214, + "step": 40500 + }, + { + "epoch": 8.534260283351767, + "grad_norm": 1.1735920906066895, + "learning_rate": 1.0397059855263858e-05, + "loss": 1.6112, + "step": 40510 + }, + { + "epoch": 8.536366987939116, + "grad_norm": 1.2973119020462036, + "learning_rate": 1.0367689463874008e-05, + "loss": 1.5514, + "step": 40520 + }, + { + "epoch": 8.538473692526466, + "grad_norm": 1.1571524143218994, + "learning_rate": 1.0338358346820353e-05, + "loss": 1.548, + "step": 40530 + }, + { + "epoch": 8.540580397113814, + "grad_norm": 1.2462031841278076, + "learning_rate": 1.0309066516954958e-05, + "loss": 1.5325, + "step": 40540 + }, + { + "epoch": 8.542687101701164, + "grad_norm": 1.3027243614196777, + "learning_rate": 1.0279813987112696e-05, + "loss": 1.6096, + "step": 40550 + }, + { + "epoch": 8.544793806288514, + "grad_norm": 1.1657319068908691, + "learning_rate": 1.0250600770111185e-05, + "loss": 1.5217, + "step": 40560 + }, + { + "epoch": 8.546900510875862, + "grad_norm": 1.2053802013397217, + "learning_rate": 1.0221426878750805e-05, + "loss": 1.5174, + "step": 40570 + }, + { + "epoch": 8.549007215463211, + "grad_norm": 1.1766663789749146, + "learning_rate": 1.0192292325814756e-05, + "loss": 1.5205, + "step": 40580 + }, + { + "epoch": 8.551113920050561, + "grad_norm": 1.3153703212738037, + "learning_rate": 1.0163197124068957e-05, + "loss": 1.5878, + "step": 40590 + }, + { + "epoch": 8.55322062463791, + "grad_norm": 1.141251564025879, + "learning_rate": 1.013414128626211e-05, + "loss": 1.5941, + "step": 40600 + }, + { + "epoch": 8.555327329225259, + "grad_norm": 1.2303365468978882, + "learning_rate": 1.0105124825125666e-05, + "loss": 1.5336, + "step": 40610 + }, + { + "epoch": 8.557434033812608, + "grad_norm": 1.178717017173767, + "learning_rate": 1.0076147753373789e-05, + "loss": 1.5571, + "step": 40620 + }, + { + "epoch": 8.559540738399958, + "grad_norm": 1.3122880458831787, + "learning_rate": 1.004721008370344e-05, + "loss": 1.6165, + "step": 40630 + }, + { + "epoch": 8.561647442987308, + "grad_norm": 1.1219425201416016, + "learning_rate": 1.0018311828794268e-05, + "loss": 1.5958, + "step": 40640 + }, + { + "epoch": 8.563754147574656, + "grad_norm": 1.148552656173706, + "learning_rate": 9.989453001308657e-06, + "loss": 1.5789, + "step": 40650 + }, + { + "epoch": 8.565860852162006, + "grad_norm": 1.2436506748199463, + "learning_rate": 9.960633613891756e-06, + "loss": 1.5607, + "step": 40660 + }, + { + "epoch": 8.567967556749355, + "grad_norm": 1.113144874572754, + "learning_rate": 9.931853679171377e-06, + "loss": 1.583, + "step": 40670 + }, + { + "epoch": 8.570074261336703, + "grad_norm": 1.1895748376846313, + "learning_rate": 9.903113209758096e-06, + "loss": 1.6034, + "step": 40680 + }, + { + "epoch": 8.572180965924053, + "grad_norm": 1.2008453607559204, + "learning_rate": 9.874412218245155e-06, + "loss": 1.5242, + "step": 40690 + }, + { + "epoch": 8.574287670511403, + "grad_norm": 1.219973087310791, + "learning_rate": 9.845750717208502e-06, + "loss": 1.6285, + "step": 40700 + }, + { + "epoch": 8.576394375098753, + "grad_norm": 1.3711133003234863, + "learning_rate": 9.817128719206825e-06, + "loss": 1.5516, + "step": 40710 + }, + { + "epoch": 8.5785010796861, + "grad_norm": 1.0981019735336304, + "learning_rate": 9.788546236781459e-06, + "loss": 1.5154, + "step": 40720 + }, + { + "epoch": 8.58060778427345, + "grad_norm": 1.314899206161499, + "learning_rate": 9.760003282456409e-06, + "loss": 1.585, + "step": 40730 + }, + { + "epoch": 8.5827144888608, + "grad_norm": 1.2251036167144775, + "learning_rate": 9.731499868738447e-06, + "loss": 1.5973, + "step": 40740 + }, + { + "epoch": 8.584821193448148, + "grad_norm": 1.1139544248580933, + "learning_rate": 9.703036008116895e-06, + "loss": 1.5602, + "step": 40750 + }, + { + "epoch": 8.586927898035498, + "grad_norm": 1.4896215200424194, + "learning_rate": 9.674611713063864e-06, + "loss": 1.5935, + "step": 40760 + }, + { + "epoch": 8.589034602622847, + "grad_norm": 1.1655323505401611, + "learning_rate": 9.646226996034048e-06, + "loss": 1.5746, + "step": 40770 + }, + { + "epoch": 8.591141307210197, + "grad_norm": 1.2914824485778809, + "learning_rate": 9.617881869464807e-06, + "loss": 1.5852, + "step": 40780 + }, + { + "epoch": 8.593248011797545, + "grad_norm": 1.2509586811065674, + "learning_rate": 9.589576345776218e-06, + "loss": 1.5507, + "step": 40790 + }, + { + "epoch": 8.595354716384895, + "grad_norm": 1.3798872232437134, + "learning_rate": 9.561310437370907e-06, + "loss": 1.6163, + "step": 40800 + }, + { + "epoch": 8.597461420972245, + "grad_norm": 1.440629005432129, + "learning_rate": 9.533084156634242e-06, + "loss": 1.5344, + "step": 40810 + }, + { + "epoch": 8.599568125559593, + "grad_norm": 1.2847709655761719, + "learning_rate": 9.504897515934153e-06, + "loss": 1.5853, + "step": 40820 + }, + { + "epoch": 8.601674830146942, + "grad_norm": 1.203344702720642, + "learning_rate": 9.476750527621214e-06, + "loss": 1.5734, + "step": 40830 + }, + { + "epoch": 8.603781534734292, + "grad_norm": 1.2557659149169922, + "learning_rate": 9.448643204028662e-06, + "loss": 1.5918, + "step": 40840 + }, + { + "epoch": 8.605888239321642, + "grad_norm": 1.1673754453659058, + "learning_rate": 9.420575557472333e-06, + "loss": 1.5612, + "step": 40850 + }, + { + "epoch": 8.60799494390899, + "grad_norm": 1.145298957824707, + "learning_rate": 9.392547600250634e-06, + "loss": 1.5774, + "step": 40860 + }, + { + "epoch": 8.61010164849634, + "grad_norm": 1.214869737625122, + "learning_rate": 9.364559344644663e-06, + "loss": 1.5249, + "step": 40870 + }, + { + "epoch": 8.61220835308369, + "grad_norm": 1.2023444175720215, + "learning_rate": 9.336610802918044e-06, + "loss": 1.5852, + "step": 40880 + }, + { + "epoch": 8.614315057671039, + "grad_norm": 1.2107261419296265, + "learning_rate": 9.308701987317081e-06, + "loss": 1.5125, + "step": 40890 + }, + { + "epoch": 8.616421762258387, + "grad_norm": 1.2665753364562988, + "learning_rate": 9.280832910070591e-06, + "loss": 1.5619, + "step": 40900 + }, + { + "epoch": 8.618528466845737, + "grad_norm": 1.1741652488708496, + "learning_rate": 9.253003583390008e-06, + "loss": 1.5431, + "step": 40910 + }, + { + "epoch": 8.620635171433086, + "grad_norm": 1.2381701469421387, + "learning_rate": 9.225214019469385e-06, + "loss": 1.5154, + "step": 40920 + }, + { + "epoch": 8.622741876020434, + "grad_norm": 1.2759102582931519, + "learning_rate": 9.197464230485298e-06, + "loss": 1.566, + "step": 40930 + }, + { + "epoch": 8.624848580607784, + "grad_norm": 1.186509370803833, + "learning_rate": 9.169754228596905e-06, + "loss": 1.5888, + "step": 40940 + }, + { + "epoch": 8.626955285195134, + "grad_norm": 1.198004126548767, + "learning_rate": 9.142084025945984e-06, + "loss": 1.5688, + "step": 40950 + }, + { + "epoch": 8.629061989782482, + "grad_norm": 1.2855273485183716, + "learning_rate": 9.114453634656783e-06, + "loss": 1.5654, + "step": 40960 + }, + { + "epoch": 8.631168694369832, + "grad_norm": 1.3223928213119507, + "learning_rate": 9.086863066836203e-06, + "loss": 1.5425, + "step": 40970 + }, + { + "epoch": 8.633275398957181, + "grad_norm": 1.2067506313323975, + "learning_rate": 9.059312334573633e-06, + "loss": 1.5451, + "step": 40980 + }, + { + "epoch": 8.635382103544531, + "grad_norm": 1.1481016874313354, + "learning_rate": 9.031801449940991e-06, + "loss": 1.578, + "step": 40990 + }, + { + "epoch": 8.637488808131879, + "grad_norm": 1.2925342321395874, + "learning_rate": 9.004330424992813e-06, + "loss": 1.5271, + "step": 41000 + }, + { + "epoch": 8.639595512719229, + "grad_norm": 1.3120172023773193, + "learning_rate": 8.976899271766092e-06, + "loss": 1.5395, + "step": 41010 + }, + { + "epoch": 8.641702217306579, + "grad_norm": 1.2305859327316284, + "learning_rate": 8.949508002280382e-06, + "loss": 1.5323, + "step": 41020 + }, + { + "epoch": 8.643808921893928, + "grad_norm": 1.2865490913391113, + "learning_rate": 8.922156628537792e-06, + "loss": 1.56, + "step": 41030 + }, + { + "epoch": 8.645915626481276, + "grad_norm": 1.1152595281600952, + "learning_rate": 8.89484516252287e-06, + "loss": 1.544, + "step": 41040 + }, + { + "epoch": 8.648022331068626, + "grad_norm": 1.2578926086425781, + "learning_rate": 8.867573616202751e-06, + "loss": 1.5684, + "step": 41050 + }, + { + "epoch": 8.650129035655976, + "grad_norm": 1.2378671169281006, + "learning_rate": 8.840342001527091e-06, + "loss": 1.5555, + "step": 41060 + }, + { + "epoch": 8.652235740243324, + "grad_norm": 1.2699627876281738, + "learning_rate": 8.813150330427945e-06, + "loss": 1.5894, + "step": 41070 + }, + { + "epoch": 8.654342444830673, + "grad_norm": 1.2434086799621582, + "learning_rate": 8.785998614819957e-06, + "loss": 1.4924, + "step": 41080 + }, + { + "epoch": 8.656449149418023, + "grad_norm": 1.194634199142456, + "learning_rate": 8.758886866600257e-06, + "loss": 1.5675, + "step": 41090 + }, + { + "epoch": 8.658555854005373, + "grad_norm": 1.182447910308838, + "learning_rate": 8.731815097648433e-06, + "loss": 1.5356, + "step": 41100 + }, + { + "epoch": 8.66066255859272, + "grad_norm": 1.409224033355713, + "learning_rate": 8.70478331982656e-06, + "loss": 1.5601, + "step": 41110 + }, + { + "epoch": 8.66276926318007, + "grad_norm": 1.3378043174743652, + "learning_rate": 8.677791544979174e-06, + "loss": 1.6083, + "step": 41120 + }, + { + "epoch": 8.66487596776742, + "grad_norm": 1.2825632095336914, + "learning_rate": 8.65083978493334e-06, + "loss": 1.5947, + "step": 41130 + }, + { + "epoch": 8.666982672354768, + "grad_norm": 1.223988652229309, + "learning_rate": 8.623928051498575e-06, + "loss": 1.593, + "step": 41140 + }, + { + "epoch": 8.669089376942118, + "grad_norm": 1.2797350883483887, + "learning_rate": 8.597056356466771e-06, + "loss": 1.5612, + "step": 41150 + }, + { + "epoch": 8.671196081529468, + "grad_norm": 1.1674673557281494, + "learning_rate": 8.570224711612385e-06, + "loss": 1.5863, + "step": 41160 + }, + { + "epoch": 8.673302786116817, + "grad_norm": 1.359400987625122, + "learning_rate": 8.543433128692291e-06, + "loss": 1.5858, + "step": 41170 + }, + { + "epoch": 8.675409490704165, + "grad_norm": 1.2248836755752563, + "learning_rate": 8.516681619445788e-06, + "loss": 1.5338, + "step": 41180 + }, + { + "epoch": 8.677516195291515, + "grad_norm": 1.2979825735092163, + "learning_rate": 8.489970195594632e-06, + "loss": 1.6042, + "step": 41190 + }, + { + "epoch": 8.679622899878865, + "grad_norm": 1.1553622484207153, + "learning_rate": 8.463298868842972e-06, + "loss": 1.5417, + "step": 41200 + }, + { + "epoch": 8.681729604466213, + "grad_norm": 1.1689449548721313, + "learning_rate": 8.436667650877472e-06, + "loss": 1.5669, + "step": 41210 + }, + { + "epoch": 8.683836309053563, + "grad_norm": 1.2584521770477295, + "learning_rate": 8.410076553367208e-06, + "loss": 1.5273, + "step": 41220 + }, + { + "epoch": 8.685943013640912, + "grad_norm": 1.209145188331604, + "learning_rate": 8.383525587963558e-06, + "loss": 1.5444, + "step": 41230 + }, + { + "epoch": 8.688049718228262, + "grad_norm": 1.187502145767212, + "learning_rate": 8.357014766300441e-06, + "loss": 1.5391, + "step": 41240 + }, + { + "epoch": 8.69015642281561, + "grad_norm": 1.2135961055755615, + "learning_rate": 8.330544099994187e-06, + "loss": 1.5121, + "step": 41250 + }, + { + "epoch": 8.69226312740296, + "grad_norm": 1.060791254043579, + "learning_rate": 8.304113600643438e-06, + "loss": 1.5645, + "step": 41260 + }, + { + "epoch": 8.69436983199031, + "grad_norm": 1.342658519744873, + "learning_rate": 8.27772327982932e-06, + "loss": 1.5468, + "step": 41270 + }, + { + "epoch": 8.69647653657766, + "grad_norm": 1.1672006845474243, + "learning_rate": 8.251373149115293e-06, + "loss": 1.5637, + "step": 41280 + }, + { + "epoch": 8.698583241165007, + "grad_norm": 1.3452717065811157, + "learning_rate": 8.225063220047246e-06, + "loss": 1.5927, + "step": 41290 + }, + { + "epoch": 8.700689945752357, + "grad_norm": 1.2547844648361206, + "learning_rate": 8.19879350415349e-06, + "loss": 1.5255, + "step": 41300 + }, + { + "epoch": 8.702796650339707, + "grad_norm": 1.1102582216262817, + "learning_rate": 8.172564012944595e-06, + "loss": 1.4948, + "step": 41310 + }, + { + "epoch": 8.704903354927055, + "grad_norm": 1.167216420173645, + "learning_rate": 8.146374757913622e-06, + "loss": 1.5291, + "step": 41320 + }, + { + "epoch": 8.707010059514404, + "grad_norm": 1.2935529947280884, + "learning_rate": 8.12022575053597e-06, + "loss": 1.582, + "step": 41330 + }, + { + "epoch": 8.709116764101754, + "grad_norm": 1.1825650930404663, + "learning_rate": 8.094117002269363e-06, + "loss": 1.6038, + "step": 41340 + }, + { + "epoch": 8.711223468689102, + "grad_norm": 1.3403667211532593, + "learning_rate": 8.068048524553961e-06, + "loss": 1.5006, + "step": 41350 + }, + { + "epoch": 8.713330173276452, + "grad_norm": 1.3244026899337769, + "learning_rate": 8.042020328812161e-06, + "loss": 1.5853, + "step": 41360 + }, + { + "epoch": 8.715436877863802, + "grad_norm": 1.2212400436401367, + "learning_rate": 8.016032426448817e-06, + "loss": 1.4957, + "step": 41370 + }, + { + "epoch": 8.717543582451151, + "grad_norm": 1.154133915901184, + "learning_rate": 7.990084828851108e-06, + "loss": 1.5267, + "step": 41380 + }, + { + "epoch": 8.7196502870385, + "grad_norm": 1.243924617767334, + "learning_rate": 7.96417754738853e-06, + "loss": 1.6473, + "step": 41390 + }, + { + "epoch": 8.721756991625849, + "grad_norm": 1.189622402191162, + "learning_rate": 7.938310593412879e-06, + "loss": 1.5284, + "step": 41400 + }, + { + "epoch": 8.723863696213199, + "grad_norm": 1.4922682046890259, + "learning_rate": 7.912483978258367e-06, + "loss": 1.6112, + "step": 41410 + }, + { + "epoch": 8.725970400800549, + "grad_norm": 1.2274786233901978, + "learning_rate": 7.886697713241453e-06, + "loss": 1.5704, + "step": 41420 + }, + { + "epoch": 8.728077105387896, + "grad_norm": 1.1772549152374268, + "learning_rate": 7.860951809660989e-06, + "loss": 1.552, + "step": 41430 + }, + { + "epoch": 8.730183809975246, + "grad_norm": 1.0843391418457031, + "learning_rate": 7.835246278798037e-06, + "loss": 1.5429, + "step": 41440 + }, + { + "epoch": 8.732290514562596, + "grad_norm": 1.07290780544281, + "learning_rate": 7.809581131916066e-06, + "loss": 1.5829, + "step": 41450 + }, + { + "epoch": 8.734397219149944, + "grad_norm": 1.1702159643173218, + "learning_rate": 7.783956380260837e-06, + "loss": 1.5034, + "step": 41460 + }, + { + "epoch": 8.736503923737294, + "grad_norm": 1.2474020719528198, + "learning_rate": 7.758372035060357e-06, + "loss": 1.5376, + "step": 41470 + }, + { + "epoch": 8.738610628324643, + "grad_norm": 1.2100849151611328, + "learning_rate": 7.73282810752497e-06, + "loss": 1.5916, + "step": 41480 + }, + { + "epoch": 8.740717332911993, + "grad_norm": 1.3087376356124878, + "learning_rate": 7.70732460884731e-06, + "loss": 1.5164, + "step": 41490 + }, + { + "epoch": 8.742824037499341, + "grad_norm": 1.180748701095581, + "learning_rate": 7.681861550202252e-06, + "loss": 1.5888, + "step": 41500 + }, + { + "epoch": 8.74493074208669, + "grad_norm": 1.3653792142868042, + "learning_rate": 7.656438942747058e-06, + "loss": 1.5426, + "step": 41510 + }, + { + "epoch": 8.74703744667404, + "grad_norm": 1.3318077325820923, + "learning_rate": 7.631056797621106e-06, + "loss": 1.6475, + "step": 41520 + }, + { + "epoch": 8.749144151261389, + "grad_norm": 1.1052581071853638, + "learning_rate": 7.605715125946178e-06, + "loss": 1.4625, + "step": 41530 + }, + { + "epoch": 8.751250855848738, + "grad_norm": 1.2538378238677979, + "learning_rate": 7.5804139388262915e-06, + "loss": 1.5603, + "step": 41540 + }, + { + "epoch": 8.753357560436088, + "grad_norm": 1.4633102416992188, + "learning_rate": 7.5551532473476795e-06, + "loss": 1.5634, + "step": 41550 + }, + { + "epoch": 8.755464265023438, + "grad_norm": 1.199808120727539, + "learning_rate": 7.529933062578864e-06, + "loss": 1.5058, + "step": 41560 + }, + { + "epoch": 8.757570969610786, + "grad_norm": 1.220557689666748, + "learning_rate": 7.504753395570629e-06, + "loss": 1.574, + "step": 41570 + }, + { + "epoch": 8.759677674198135, + "grad_norm": 1.2284119129180908, + "learning_rate": 7.479614257355971e-06, + "loss": 1.4977, + "step": 41580 + }, + { + "epoch": 8.761784378785485, + "grad_norm": 1.1737325191497803, + "learning_rate": 7.454515658950167e-06, + "loss": 1.5178, + "step": 41590 + }, + { + "epoch": 8.763891083372833, + "grad_norm": 1.3059794902801514, + "learning_rate": 7.42945761135071e-06, + "loss": 1.6303, + "step": 41600 + }, + { + "epoch": 8.765997787960183, + "grad_norm": 1.3335444927215576, + "learning_rate": 7.404440125537293e-06, + "loss": 1.501, + "step": 41610 + }, + { + "epoch": 8.768104492547533, + "grad_norm": 1.2088760137557983, + "learning_rate": 7.379463212471915e-06, + "loss": 1.5628, + "step": 41620 + }, + { + "epoch": 8.770211197134882, + "grad_norm": 1.393725037574768, + "learning_rate": 7.354526883098711e-06, + "loss": 1.572, + "step": 41630 + }, + { + "epoch": 8.77231790172223, + "grad_norm": 1.21114182472229, + "learning_rate": 7.329631148344118e-06, + "loss": 1.5486, + "step": 41640 + }, + { + "epoch": 8.77442460630958, + "grad_norm": 1.2755677700042725, + "learning_rate": 7.30477601911671e-06, + "loss": 1.5395, + "step": 41650 + }, + { + "epoch": 8.77653131089693, + "grad_norm": 1.1830251216888428, + "learning_rate": 7.279961506307287e-06, + "loss": 1.5716, + "step": 41660 + }, + { + "epoch": 8.77863801548428, + "grad_norm": 1.2919485569000244, + "learning_rate": 7.255187620788894e-06, + "loss": 1.5423, + "step": 41670 + }, + { + "epoch": 8.780744720071628, + "grad_norm": 1.2150613069534302, + "learning_rate": 7.230454373416739e-06, + "loss": 1.6086, + "step": 41680 + }, + { + "epoch": 8.782851424658977, + "grad_norm": 1.2287827730178833, + "learning_rate": 7.205761775028197e-06, + "loss": 1.6237, + "step": 41690 + }, + { + "epoch": 8.784958129246327, + "grad_norm": 1.1854701042175293, + "learning_rate": 7.181109836442912e-06, + "loss": 1.6204, + "step": 41700 + }, + { + "epoch": 8.787064833833675, + "grad_norm": 1.306795358657837, + "learning_rate": 7.156498568462633e-06, + "loss": 1.5797, + "step": 41710 + }, + { + "epoch": 8.789171538421025, + "grad_norm": 1.2896075248718262, + "learning_rate": 7.1319279818713445e-06, + "loss": 1.5644, + "step": 41720 + }, + { + "epoch": 8.791278243008374, + "grad_norm": 1.2205928564071655, + "learning_rate": 7.1073980874351575e-06, + "loss": 1.5979, + "step": 41730 + }, + { + "epoch": 8.793384947595724, + "grad_norm": 1.1458282470703125, + "learning_rate": 7.082908895902374e-06, + "loss": 1.5617, + "step": 41740 + }, + { + "epoch": 8.795491652183072, + "grad_norm": 1.1576645374298096, + "learning_rate": 7.058460418003488e-06, + "loss": 1.5693, + "step": 41750 + }, + { + "epoch": 8.797598356770422, + "grad_norm": 1.2268850803375244, + "learning_rate": 7.034052664451118e-06, + "loss": 1.5878, + "step": 41760 + }, + { + "epoch": 8.799705061357772, + "grad_norm": 1.1965655088424683, + "learning_rate": 7.009685645940023e-06, + "loss": 1.5876, + "step": 41770 + }, + { + "epoch": 8.80181176594512, + "grad_norm": 1.2396894693374634, + "learning_rate": 6.985359373147182e-06, + "loss": 1.5395, + "step": 41780 + }, + { + "epoch": 8.80391847053247, + "grad_norm": 1.2615655660629272, + "learning_rate": 6.961073856731648e-06, + "loss": 1.5189, + "step": 41790 + }, + { + "epoch": 8.806025175119819, + "grad_norm": 1.1825131177902222, + "learning_rate": 6.936829107334664e-06, + "loss": 1.5358, + "step": 41800 + }, + { + "epoch": 8.808131879707169, + "grad_norm": 1.2486509084701538, + "learning_rate": 6.9126251355795864e-06, + "loss": 1.571, + "step": 41810 + }, + { + "epoch": 8.810238584294517, + "grad_norm": 1.2613776922225952, + "learning_rate": 6.88846195207189e-06, + "loss": 1.5789, + "step": 41820 + }, + { + "epoch": 8.812345288881867, + "grad_norm": 1.1759439706802368, + "learning_rate": 6.864339567399225e-06, + "loss": 1.5835, + "step": 41830 + }, + { + "epoch": 8.814451993469216, + "grad_norm": 1.1128981113433838, + "learning_rate": 6.840257992131316e-06, + "loss": 1.5402, + "step": 41840 + }, + { + "epoch": 8.816558698056564, + "grad_norm": 1.1860219240188599, + "learning_rate": 6.816217236820032e-06, + "loss": 1.5533, + "step": 41850 + }, + { + "epoch": 8.818665402643914, + "grad_norm": 1.1567201614379883, + "learning_rate": 6.7922173119993606e-06, + "loss": 1.5627, + "step": 41860 + }, + { + "epoch": 8.820772107231264, + "grad_norm": 1.254657506942749, + "learning_rate": 6.768258228185353e-06, + "loss": 1.5441, + "step": 41870 + }, + { + "epoch": 8.822878811818613, + "grad_norm": 1.3093767166137695, + "learning_rate": 6.7443399958762584e-06, + "loss": 1.5616, + "step": 41880 + }, + { + "epoch": 8.824985516405961, + "grad_norm": 1.260617733001709, + "learning_rate": 6.720462625552326e-06, + "loss": 1.5495, + "step": 41890 + }, + { + "epoch": 8.827092220993311, + "grad_norm": 1.1566492319107056, + "learning_rate": 6.6966261276759424e-06, + "loss": 1.5999, + "step": 41900 + }, + { + "epoch": 8.82919892558066, + "grad_norm": 1.1674546003341675, + "learning_rate": 6.672830512691608e-06, + "loss": 1.5354, + "step": 41910 + }, + { + "epoch": 8.83130563016801, + "grad_norm": 1.2879528999328613, + "learning_rate": 6.64907579102586e-06, + "loss": 1.5497, + "step": 41920 + }, + { + "epoch": 8.833412334755359, + "grad_norm": 1.2599519491195679, + "learning_rate": 6.625361973087363e-06, + "loss": 1.5339, + "step": 41930 + }, + { + "epoch": 8.835519039342708, + "grad_norm": 1.1604090929031372, + "learning_rate": 6.6016890692668364e-06, + "loss": 1.5772, + "step": 41940 + }, + { + "epoch": 8.837625743930058, + "grad_norm": 1.200273036956787, + "learning_rate": 6.578057089937062e-06, + "loss": 1.505, + "step": 41950 + }, + { + "epoch": 8.839732448517406, + "grad_norm": 1.5028703212738037, + "learning_rate": 6.554466045452923e-06, + "loss": 1.5707, + "step": 41960 + }, + { + "epoch": 8.841839153104756, + "grad_norm": 1.3208073377609253, + "learning_rate": 6.530915946151339e-06, + "loss": 1.5414, + "step": 41970 + }, + { + "epoch": 8.843945857692106, + "grad_norm": 1.1587634086608887, + "learning_rate": 6.507406802351268e-06, + "loss": 1.5389, + "step": 41980 + }, + { + "epoch": 8.846052562279453, + "grad_norm": 1.2481507062911987, + "learning_rate": 6.4839386243538025e-06, + "loss": 1.5671, + "step": 41990 + }, + { + "epoch": 8.848159266866803, + "grad_norm": 1.1359668970108032, + "learning_rate": 6.460511422441984e-06, + "loss": 1.5504, + "step": 42000 + }, + { + "epoch": 8.850265971454153, + "grad_norm": 1.2089869976043701, + "learning_rate": 6.4371252068809786e-06, + "loss": 1.5816, + "step": 42010 + }, + { + "epoch": 8.852372676041503, + "grad_norm": 1.1908295154571533, + "learning_rate": 6.413779987917956e-06, + "loss": 1.6369, + "step": 42020 + }, + { + "epoch": 8.85447938062885, + "grad_norm": 1.216638207435608, + "learning_rate": 6.390475775782101e-06, + "loss": 1.5828, + "step": 42030 + }, + { + "epoch": 8.8565860852162, + "grad_norm": 1.127992868423462, + "learning_rate": 6.367212580684712e-06, + "loss": 1.5546, + "step": 42040 + }, + { + "epoch": 8.85869278980355, + "grad_norm": 1.1806529760360718, + "learning_rate": 6.343990412819023e-06, + "loss": 1.528, + "step": 42050 + }, + { + "epoch": 8.8607994943909, + "grad_norm": 1.1873255968093872, + "learning_rate": 6.320809282360319e-06, + "loss": 1.5194, + "step": 42060 + }, + { + "epoch": 8.862906198978248, + "grad_norm": 1.7294059991836548, + "learning_rate": 6.297669199465961e-06, + "loss": 1.5223, + "step": 42070 + }, + { + "epoch": 8.865012903565598, + "grad_norm": 1.1746385097503662, + "learning_rate": 6.274570174275218e-06, + "loss": 1.4832, + "step": 42080 + }, + { + "epoch": 8.867119608152947, + "grad_norm": 1.351119041442871, + "learning_rate": 6.2515122169094835e-06, + "loss": 1.6232, + "step": 42090 + }, + { + "epoch": 8.869226312740295, + "grad_norm": 1.2291535139083862, + "learning_rate": 6.2284953374720736e-06, + "loss": 1.5923, + "step": 42100 + }, + { + "epoch": 8.871333017327645, + "grad_norm": 1.1676898002624512, + "learning_rate": 6.205519546048322e-06, + "loss": 1.593, + "step": 42110 + }, + { + "epoch": 8.873439721914995, + "grad_norm": 1.1755423545837402, + "learning_rate": 6.1825848527055865e-06, + "loss": 1.5588, + "step": 42120 + }, + { + "epoch": 8.875546426502344, + "grad_norm": 1.2773000001907349, + "learning_rate": 6.159691267493206e-06, + "loss": 1.5397, + "step": 42130 + }, + { + "epoch": 8.877653131089692, + "grad_norm": 1.2411293983459473, + "learning_rate": 6.136838800442457e-06, + "loss": 1.5132, + "step": 42140 + }, + { + "epoch": 8.879759835677042, + "grad_norm": 1.338410496711731, + "learning_rate": 6.114027461566696e-06, + "loss": 1.5821, + "step": 42150 + }, + { + "epoch": 8.881866540264392, + "grad_norm": 1.1013096570968628, + "learning_rate": 6.091257260861172e-06, + "loss": 1.5418, + "step": 42160 + }, + { + "epoch": 8.88397324485174, + "grad_norm": 1.2717993259429932, + "learning_rate": 6.068528208303148e-06, + "loss": 1.6138, + "step": 42170 + }, + { + "epoch": 8.88607994943909, + "grad_norm": 1.244744896888733, + "learning_rate": 6.04584031385188e-06, + "loss": 1.5054, + "step": 42180 + }, + { + "epoch": 8.88818665402644, + "grad_norm": 1.3655146360397339, + "learning_rate": 6.0231935874484945e-06, + "loss": 1.5829, + "step": 42190 + }, + { + "epoch": 8.890293358613789, + "grad_norm": 1.2557286024093628, + "learning_rate": 6.000588039016208e-06, + "loss": 1.5387, + "step": 42200 + }, + { + "epoch": 8.892400063201137, + "grad_norm": 1.334020972251892, + "learning_rate": 5.978023678460099e-06, + "loss": 1.568, + "step": 42210 + }, + { + "epoch": 8.894506767788487, + "grad_norm": 1.276594638824463, + "learning_rate": 5.9555005156672335e-06, + "loss": 1.5658, + "step": 42220 + }, + { + "epoch": 8.896613472375837, + "grad_norm": 1.1930742263793945, + "learning_rate": 5.933018560506643e-06, + "loss": 1.5815, + "step": 42230 + }, + { + "epoch": 8.898720176963185, + "grad_norm": 1.10635507106781, + "learning_rate": 5.910577822829233e-06, + "loss": 1.5378, + "step": 42240 + }, + { + "epoch": 8.900826881550534, + "grad_norm": 1.0961418151855469, + "learning_rate": 5.88817831246794e-06, + "loss": 1.5494, + "step": 42250 + }, + { + "epoch": 8.902933586137884, + "grad_norm": 1.249894142150879, + "learning_rate": 5.865820039237624e-06, + "loss": 1.5608, + "step": 42260 + }, + { + "epoch": 8.905040290725234, + "grad_norm": 1.2282543182373047, + "learning_rate": 5.843503012934959e-06, + "loss": 1.5222, + "step": 42270 + }, + { + "epoch": 8.907146995312582, + "grad_norm": 1.2692981958389282, + "learning_rate": 5.821227243338712e-06, + "loss": 1.5812, + "step": 42280 + }, + { + "epoch": 8.909253699899931, + "grad_norm": 1.2558562755584717, + "learning_rate": 5.798992740209441e-06, + "loss": 1.5812, + "step": 42290 + }, + { + "epoch": 8.911360404487281, + "grad_norm": 1.233802318572998, + "learning_rate": 5.77679951328971e-06, + "loss": 1.5638, + "step": 42300 + }, + { + "epoch": 8.913467109074631, + "grad_norm": 1.3241041898727417, + "learning_rate": 5.75464757230395e-06, + "loss": 1.5635, + "step": 42310 + }, + { + "epoch": 8.915573813661979, + "grad_norm": 1.1713545322418213, + "learning_rate": 5.732536926958487e-06, + "loss": 1.5177, + "step": 42320 + }, + { + "epoch": 8.917680518249329, + "grad_norm": 1.1677508354187012, + "learning_rate": 5.710467586941615e-06, + "loss": 1.5437, + "step": 42330 + }, + { + "epoch": 8.919787222836678, + "grad_norm": 1.3364185094833374, + "learning_rate": 5.6884395619235085e-06, + "loss": 1.5431, + "step": 42340 + }, + { + "epoch": 8.921893927424026, + "grad_norm": 1.236284852027893, + "learning_rate": 5.66645286155616e-06, + "loss": 1.5379, + "step": 42350 + }, + { + "epoch": 8.924000632011376, + "grad_norm": 1.1496565341949463, + "learning_rate": 5.644507495473572e-06, + "loss": 1.5879, + "step": 42360 + }, + { + "epoch": 8.926107336598726, + "grad_norm": 1.2142751216888428, + "learning_rate": 5.622603473291543e-06, + "loss": 1.5205, + "step": 42370 + }, + { + "epoch": 8.928214041186074, + "grad_norm": 1.2585910558700562, + "learning_rate": 5.600740804607829e-06, + "loss": 1.5507, + "step": 42380 + }, + { + "epoch": 8.930320745773423, + "grad_norm": 1.2263305187225342, + "learning_rate": 5.5789194990020225e-06, + "loss": 1.512, + "step": 42390 + }, + { + "epoch": 8.932427450360773, + "grad_norm": 1.1774612665176392, + "learning_rate": 5.557139566035574e-06, + "loss": 1.5541, + "step": 42400 + }, + { + "epoch": 8.934534154948123, + "grad_norm": 1.2883888483047485, + "learning_rate": 5.535401015251851e-06, + "loss": 1.5749, + "step": 42410 + }, + { + "epoch": 8.936640859535471, + "grad_norm": 1.1521271467208862, + "learning_rate": 5.5137038561761115e-06, + "loss": 1.5371, + "step": 42420 + }, + { + "epoch": 8.93874756412282, + "grad_norm": 1.2090059518814087, + "learning_rate": 5.49204809831535e-06, + "loss": 1.5231, + "step": 42430 + }, + { + "epoch": 8.94085426871017, + "grad_norm": 1.3226497173309326, + "learning_rate": 5.470433751158577e-06, + "loss": 1.5888, + "step": 42440 + }, + { + "epoch": 8.94296097329752, + "grad_norm": 1.1438806056976318, + "learning_rate": 5.4488608241765494e-06, + "loss": 1.546, + "step": 42450 + }, + { + "epoch": 8.945067677884868, + "grad_norm": 1.3889515399932861, + "learning_rate": 5.427329326821906e-06, + "loss": 1.6614, + "step": 42460 + }, + { + "epoch": 8.947174382472218, + "grad_norm": 1.2498170137405396, + "learning_rate": 5.4058392685292005e-06, + "loss": 1.6041, + "step": 42470 + }, + { + "epoch": 8.949281087059568, + "grad_norm": 1.1849504709243774, + "learning_rate": 5.3843906587146886e-06, + "loss": 1.5202, + "step": 42480 + }, + { + "epoch": 8.951387791646916, + "grad_norm": 1.1745749711990356, + "learning_rate": 5.362983506776564e-06, + "loss": 1.5924, + "step": 42490 + }, + { + "epoch": 8.953494496234265, + "grad_norm": 1.1476824283599854, + "learning_rate": 5.341617822094869e-06, + "loss": 1.5793, + "step": 42500 + }, + { + "epoch": 8.955601200821615, + "grad_norm": 1.2073520421981812, + "learning_rate": 5.320293614031413e-06, + "loss": 1.6405, + "step": 42510 + }, + { + "epoch": 8.957707905408965, + "grad_norm": 1.2965010404586792, + "learning_rate": 5.299010891929856e-06, + "loss": 1.5816, + "step": 42520 + }, + { + "epoch": 8.959814609996313, + "grad_norm": 1.189401626586914, + "learning_rate": 5.277769665115695e-06, + "loss": 1.5539, + "step": 42530 + }, + { + "epoch": 8.961921314583662, + "grad_norm": 1.3214972019195557, + "learning_rate": 5.256569942896217e-06, + "loss": 1.5813, + "step": 42540 + }, + { + "epoch": 8.964028019171012, + "grad_norm": 1.2079973220825195, + "learning_rate": 5.23541173456058e-06, + "loss": 1.5224, + "step": 42550 + }, + { + "epoch": 8.96613472375836, + "grad_norm": 1.121533989906311, + "learning_rate": 5.214295049379658e-06, + "loss": 1.5912, + "step": 42560 + }, + { + "epoch": 8.96824142834571, + "grad_norm": 1.1464295387268066, + "learning_rate": 5.193219896606194e-06, + "loss": 1.537, + "step": 42570 + }, + { + "epoch": 8.97034813293306, + "grad_norm": 1.2208278179168701, + "learning_rate": 5.172186285474756e-06, + "loss": 1.5689, + "step": 42580 + }, + { + "epoch": 8.97245483752041, + "grad_norm": 1.1732256412506104, + "learning_rate": 5.15119422520165e-06, + "loss": 1.6011, + "step": 42590 + }, + { + "epoch": 8.974561542107757, + "grad_norm": 1.1097040176391602, + "learning_rate": 5.130243724984995e-06, + "loss": 1.5775, + "step": 42600 + }, + { + "epoch": 8.976668246695107, + "grad_norm": 1.2230794429779053, + "learning_rate": 5.1093347940047274e-06, + "loss": 1.5617, + "step": 42610 + }, + { + "epoch": 8.978774951282457, + "grad_norm": 1.1923304796218872, + "learning_rate": 5.0884674414225284e-06, + "loss": 1.5653, + "step": 42620 + }, + { + "epoch": 8.980881655869805, + "grad_norm": 1.2577967643737793, + "learning_rate": 5.067641676381918e-06, + "loss": 1.6404, + "step": 42630 + }, + { + "epoch": 8.982988360457155, + "grad_norm": 1.5352810621261597, + "learning_rate": 5.046857508008085e-06, + "loss": 1.5895, + "step": 42640 + }, + { + "epoch": 8.985095065044504, + "grad_norm": 1.1417043209075928, + "learning_rate": 5.026114945408123e-06, + "loss": 1.565, + "step": 42650 + }, + { + "epoch": 8.987201769631854, + "grad_norm": 1.1386953592300415, + "learning_rate": 5.005413997670816e-06, + "loss": 1.5226, + "step": 42660 + }, + { + "epoch": 8.989308474219202, + "grad_norm": 1.1961097717285156, + "learning_rate": 4.984754673866732e-06, + "loss": 1.5532, + "step": 42670 + }, + { + "epoch": 8.991415178806552, + "grad_norm": 1.2846159934997559, + "learning_rate": 4.964136983048184e-06, + "loss": 1.5413, + "step": 42680 + }, + { + "epoch": 8.993521883393901, + "grad_norm": 1.1944465637207031, + "learning_rate": 4.9435609342493025e-06, + "loss": 1.5316, + "step": 42690 + }, + { + "epoch": 8.995628587981251, + "grad_norm": 1.21571946144104, + "learning_rate": 4.923026536485875e-06, + "loss": 1.5421, + "step": 42700 + }, + { + "epoch": 8.9977352925686, + "grad_norm": 1.240976333618164, + "learning_rate": 4.902533798755548e-06, + "loss": 1.5161, + "step": 42710 + }, + { + "epoch": 8.999841997155949, + "grad_norm": 1.2830826044082642, + "learning_rate": 4.8820827300376075e-06, + "loss": 1.5955, + "step": 42720 + }, + { + "epoch": 9.001948701743299, + "grad_norm": 1.0986697673797607, + "learning_rate": 4.861673339293149e-06, + "loss": 1.5327, + "step": 42730 + }, + { + "epoch": 9.004055406330647, + "grad_norm": 1.1212446689605713, + "learning_rate": 4.84130563546501e-06, + "loss": 1.5482, + "step": 42740 + }, + { + "epoch": 9.006162110917996, + "grad_norm": 1.2350519895553589, + "learning_rate": 4.820979627477706e-06, + "loss": 1.5517, + "step": 42750 + }, + { + "epoch": 9.008268815505346, + "grad_norm": 1.2882555723190308, + "learning_rate": 4.800695324237547e-06, + "loss": 1.5526, + "step": 42760 + }, + { + "epoch": 9.010375520092696, + "grad_norm": 1.3195393085479736, + "learning_rate": 4.780452734632524e-06, + "loss": 1.5865, + "step": 42770 + }, + { + "epoch": 9.012482224680044, + "grad_norm": 1.2144135236740112, + "learning_rate": 4.760251867532362e-06, + "loss": 1.5783, + "step": 42780 + }, + { + "epoch": 9.014588929267394, + "grad_norm": 1.2432124614715576, + "learning_rate": 4.7400927317885256e-06, + "loss": 1.5697, + "step": 42790 + }, + { + "epoch": 9.016695633854743, + "grad_norm": 1.3771353960037231, + "learning_rate": 4.7199753362341614e-06, + "loss": 1.5572, + "step": 42800 + }, + { + "epoch": 9.018802338442091, + "grad_norm": 1.1509369611740112, + "learning_rate": 4.699899689684129e-06, + "loss": 1.5553, + "step": 42810 + }, + { + "epoch": 9.020909043029441, + "grad_norm": 1.2895727157592773, + "learning_rate": 4.679865800935046e-06, + "loss": 1.5799, + "step": 42820 + }, + { + "epoch": 9.02301574761679, + "grad_norm": 1.110680341720581, + "learning_rate": 4.659873678765158e-06, + "loss": 1.5675, + "step": 42830 + }, + { + "epoch": 9.02512245220414, + "grad_norm": 1.3074404001235962, + "learning_rate": 4.639923331934471e-06, + "loss": 1.5556, + "step": 42840 + }, + { + "epoch": 9.027229156791488, + "grad_norm": 1.2942652702331543, + "learning_rate": 4.620014769184644e-06, + "loss": 1.6094, + "step": 42850 + }, + { + "epoch": 9.029335861378838, + "grad_norm": 1.2963061332702637, + "learning_rate": 4.600147999239035e-06, + "loss": 1.6269, + "step": 42860 + }, + { + "epoch": 9.031442565966188, + "grad_norm": 1.1793369054794312, + "learning_rate": 4.5803230308027356e-06, + "loss": 1.5271, + "step": 42870 + }, + { + "epoch": 9.033549270553536, + "grad_norm": 1.3445175886154175, + "learning_rate": 4.560539872562463e-06, + "loss": 1.5342, + "step": 42880 + }, + { + "epoch": 9.035655975140886, + "grad_norm": 1.1091607809066772, + "learning_rate": 4.540798533186619e-06, + "loss": 1.5355, + "step": 42890 + }, + { + "epoch": 9.037762679728235, + "grad_norm": 1.1904314756393433, + "learning_rate": 4.521099021325336e-06, + "loss": 1.5208, + "step": 42900 + }, + { + "epoch": 9.039869384315585, + "grad_norm": 1.192204475402832, + "learning_rate": 4.501441345610347e-06, + "loss": 1.4896, + "step": 42910 + }, + { + "epoch": 9.041976088902933, + "grad_norm": 1.1834639310836792, + "learning_rate": 4.481825514655114e-06, + "loss": 1.5564, + "step": 42920 + }, + { + "epoch": 9.044082793490283, + "grad_norm": 1.3617198467254639, + "learning_rate": 4.462251537054718e-06, + "loss": 1.5556, + "step": 42930 + }, + { + "epoch": 9.046189498077633, + "grad_norm": 1.2874000072479248, + "learning_rate": 4.442719421385922e-06, + "loss": 1.5975, + "step": 42940 + }, + { + "epoch": 9.04829620266498, + "grad_norm": 1.1991859674453735, + "learning_rate": 4.423229176207167e-06, + "loss": 1.6017, + "step": 42950 + }, + { + "epoch": 9.05040290725233, + "grad_norm": 1.1703438758850098, + "learning_rate": 4.403780810058511e-06, + "loss": 1.5776, + "step": 42960 + }, + { + "epoch": 9.05250961183968, + "grad_norm": 1.2019577026367188, + "learning_rate": 4.38437433146166e-06, + "loss": 1.5743, + "step": 42970 + }, + { + "epoch": 9.05461631642703, + "grad_norm": 1.319351315498352, + "learning_rate": 4.365009748920012e-06, + "loss": 1.558, + "step": 42980 + }, + { + "epoch": 9.056723021014378, + "grad_norm": 1.1495599746704102, + "learning_rate": 4.345687070918559e-06, + "loss": 1.576, + "step": 42990 + }, + { + "epoch": 9.058829725601727, + "grad_norm": 1.2733286619186401, + "learning_rate": 4.326406305923958e-06, + "loss": 1.5244, + "step": 43000 + }, + { + "epoch": 9.060936430189077, + "grad_norm": 1.318764090538025, + "learning_rate": 4.307167462384498e-06, + "loss": 1.5456, + "step": 43010 + }, + { + "epoch": 9.063043134776425, + "grad_norm": 1.2200381755828857, + "learning_rate": 4.287970548730069e-06, + "loss": 1.5125, + "step": 43020 + }, + { + "epoch": 9.065149839363775, + "grad_norm": 1.2284795045852661, + "learning_rate": 4.268815573372242e-06, + "loss": 1.5087, + "step": 43030 + }, + { + "epoch": 9.067256543951125, + "grad_norm": 1.171825647354126, + "learning_rate": 4.249702544704171e-06, + "loss": 1.5662, + "step": 43040 + }, + { + "epoch": 9.069363248538474, + "grad_norm": 1.3023587465286255, + "learning_rate": 4.230631471100655e-06, + "loss": 1.5747, + "step": 43050 + }, + { + "epoch": 9.071469953125822, + "grad_norm": 1.1990822553634644, + "learning_rate": 4.211602360918099e-06, + "loss": 1.53, + "step": 43060 + }, + { + "epoch": 9.073576657713172, + "grad_norm": 1.2179408073425293, + "learning_rate": 4.192615222494489e-06, + "loss": 1.5735, + "step": 43070 + }, + { + "epoch": 9.075683362300522, + "grad_norm": 1.1778931617736816, + "learning_rate": 4.173670064149482e-06, + "loss": 1.5302, + "step": 43080 + }, + { + "epoch": 9.077790066887871, + "grad_norm": 1.3378112316131592, + "learning_rate": 4.154766894184292e-06, + "loss": 1.5586, + "step": 43090 + }, + { + "epoch": 9.07989677147522, + "grad_norm": 1.2669785022735596, + "learning_rate": 4.1359057208817605e-06, + "loss": 1.5415, + "step": 43100 + }, + { + "epoch": 9.08200347606257, + "grad_norm": 1.32623291015625, + "learning_rate": 4.117086552506322e-06, + "loss": 1.5398, + "step": 43110 + }, + { + "epoch": 9.084110180649919, + "grad_norm": 1.186213493347168, + "learning_rate": 4.098309397303978e-06, + "loss": 1.5541, + "step": 43120 + }, + { + "epoch": 9.086216885237267, + "grad_norm": 1.2825602293014526, + "learning_rate": 4.079574263502384e-06, + "loss": 1.5482, + "step": 43130 + }, + { + "epoch": 9.088323589824617, + "grad_norm": 1.2611690759658813, + "learning_rate": 4.060881159310725e-06, + "loss": 1.5587, + "step": 43140 + }, + { + "epoch": 9.090430294411966, + "grad_norm": 1.2552509307861328, + "learning_rate": 4.042230092919774e-06, + "loss": 1.5142, + "step": 43150 + }, + { + "epoch": 9.092536998999316, + "grad_norm": 1.2151157855987549, + "learning_rate": 4.023621072501926e-06, + "loss": 1.5874, + "step": 43160 + }, + { + "epoch": 9.094643703586664, + "grad_norm": 1.1732696294784546, + "learning_rate": 4.005054106211104e-06, + "loss": 1.5516, + "step": 43170 + }, + { + "epoch": 9.096750408174014, + "grad_norm": 1.32643461227417, + "learning_rate": 3.986529202182832e-06, + "loss": 1.6017, + "step": 43180 + }, + { + "epoch": 9.098857112761364, + "grad_norm": 1.1873167753219604, + "learning_rate": 3.968046368534217e-06, + "loss": 1.5591, + "step": 43190 + }, + { + "epoch": 9.100963817348712, + "grad_norm": 1.229538917541504, + "learning_rate": 3.949605613363882e-06, + "loss": 1.5554, + "step": 43200 + }, + { + "epoch": 9.103070521936061, + "grad_norm": 1.1299848556518555, + "learning_rate": 3.931206944752064e-06, + "loss": 1.5277, + "step": 43210 + }, + { + "epoch": 9.105177226523411, + "grad_norm": 1.3684306144714355, + "learning_rate": 3.912850370760534e-06, + "loss": 1.5113, + "step": 43220 + }, + { + "epoch": 9.10728393111076, + "grad_norm": 1.313065528869629, + "learning_rate": 3.894535899432606e-06, + "loss": 1.4936, + "step": 43230 + }, + { + "epoch": 9.109390635698109, + "grad_norm": 1.2317758798599243, + "learning_rate": 3.87626353879319e-06, + "loss": 1.5658, + "step": 43240 + }, + { + "epoch": 9.111497340285458, + "grad_norm": 1.3811818361282349, + "learning_rate": 3.8580332968486955e-06, + "loss": 1.6049, + "step": 43250 + }, + { + "epoch": 9.113604044872808, + "grad_norm": 1.2241859436035156, + "learning_rate": 3.839845181587098e-06, + "loss": 1.5565, + "step": 43260 + }, + { + "epoch": 9.115710749460156, + "grad_norm": 1.2962020635604858, + "learning_rate": 3.821699200977924e-06, + "loss": 1.551, + "step": 43270 + }, + { + "epoch": 9.117817454047506, + "grad_norm": 1.173917293548584, + "learning_rate": 3.8035953629722234e-06, + "loss": 1.5632, + "step": 43280 + }, + { + "epoch": 9.119924158634856, + "grad_norm": 1.336701512336731, + "learning_rate": 3.785533675502584e-06, + "loss": 1.5325, + "step": 43290 + }, + { + "epoch": 9.122030863222205, + "grad_norm": 1.2437199354171753, + "learning_rate": 3.76751414648312e-06, + "loss": 1.5471, + "step": 43300 + }, + { + "epoch": 9.124137567809553, + "grad_norm": 1.1865382194519043, + "learning_rate": 3.749536783809482e-06, + "loss": 1.5753, + "step": 43310 + }, + { + "epoch": 9.126244272396903, + "grad_norm": 1.3648736476898193, + "learning_rate": 3.7316015953588467e-06, + "loss": 1.5323, + "step": 43320 + }, + { + "epoch": 9.128350976984253, + "grad_norm": 1.2792103290557861, + "learning_rate": 3.7137085889898947e-06, + "loss": 1.5369, + "step": 43330 + }, + { + "epoch": 9.1304576815716, + "grad_norm": 1.1069854497909546, + "learning_rate": 3.6958577725428433e-06, + "loss": 1.5356, + "step": 43340 + }, + { + "epoch": 9.13256438615895, + "grad_norm": 1.3729944229125977, + "learning_rate": 3.6780491538394025e-06, + "loss": 1.5163, + "step": 43350 + }, + { + "epoch": 9.1346710907463, + "grad_norm": 1.1662119626998901, + "learning_rate": 3.6602827406828076e-06, + "loss": 1.5907, + "step": 43360 + }, + { + "epoch": 9.13677779533365, + "grad_norm": 1.2316755056381226, + "learning_rate": 3.64255854085781e-06, + "loss": 1.6138, + "step": 43370 + }, + { + "epoch": 9.138884499920998, + "grad_norm": 1.2182281017303467, + "learning_rate": 3.6248765621306414e-06, + "loss": 1.5445, + "step": 43380 + }, + { + "epoch": 9.140991204508348, + "grad_norm": 1.2118333578109741, + "learning_rate": 3.6072368122490265e-06, + "loss": 1.6002, + "step": 43390 + }, + { + "epoch": 9.143097909095697, + "grad_norm": 1.1032582521438599, + "learning_rate": 3.5896392989422377e-06, + "loss": 1.5135, + "step": 43400 + }, + { + "epoch": 9.145204613683047, + "grad_norm": 1.1986793279647827, + "learning_rate": 3.5720840299209747e-06, + "loss": 1.5815, + "step": 43410 + }, + { + "epoch": 9.147311318270395, + "grad_norm": 1.3122923374176025, + "learning_rate": 3.5545710128774835e-06, + "loss": 1.5405, + "step": 43420 + }, + { + "epoch": 9.149418022857745, + "grad_norm": 1.2399784326553345, + "learning_rate": 3.5371002554854593e-06, + "loss": 1.5949, + "step": 43430 + }, + { + "epoch": 9.151524727445095, + "grad_norm": 1.2310932874679565, + "learning_rate": 3.519671765400079e-06, + "loss": 1.5763, + "step": 43440 + }, + { + "epoch": 9.153631432032443, + "grad_norm": 1.2371530532836914, + "learning_rate": 3.502285550258044e-06, + "loss": 1.5614, + "step": 43450 + }, + { + "epoch": 9.155738136619792, + "grad_norm": 1.3835184574127197, + "learning_rate": 3.484941617677473e-06, + "loss": 1.6102, + "step": 43460 + }, + { + "epoch": 9.157844841207142, + "grad_norm": 1.3356512784957886, + "learning_rate": 3.467639975257997e-06, + "loss": 1.5885, + "step": 43470 + }, + { + "epoch": 9.159951545794492, + "grad_norm": 1.2307041883468628, + "learning_rate": 3.4503806305807074e-06, + "loss": 1.5764, + "step": 43480 + }, + { + "epoch": 9.16205825038184, + "grad_norm": 1.1840221881866455, + "learning_rate": 3.4331635912081437e-06, + "loss": 1.5634, + "step": 43490 + }, + { + "epoch": 9.16416495496919, + "grad_norm": 1.4013233184814453, + "learning_rate": 3.4159888646843495e-06, + "loss": 1.5445, + "step": 43500 + }, + { + "epoch": 9.16627165955654, + "grad_norm": 1.3255256414413452, + "learning_rate": 3.3988564585347937e-06, + "loss": 1.63, + "step": 43510 + }, + { + "epoch": 9.168378364143887, + "grad_norm": 1.1528620719909668, + "learning_rate": 3.3817663802663935e-06, + "loss": 1.5377, + "step": 43520 + }, + { + "epoch": 9.170485068731237, + "grad_norm": 1.2030662298202515, + "learning_rate": 3.364718637367548e-06, + "loss": 1.5362, + "step": 43530 + }, + { + "epoch": 9.172591773318587, + "grad_norm": 1.131676435470581, + "learning_rate": 3.3477132373081254e-06, + "loss": 1.5372, + "step": 43540 + }, + { + "epoch": 9.174698477905936, + "grad_norm": 1.3713462352752686, + "learning_rate": 3.3307501875393556e-06, + "loss": 1.5488, + "step": 43550 + }, + { + "epoch": 9.176805182493284, + "grad_norm": 1.1891109943389893, + "learning_rate": 3.313829495493992e-06, + "loss": 1.5883, + "step": 43560 + }, + { + "epoch": 9.178911887080634, + "grad_norm": 1.176886796951294, + "learning_rate": 3.2969511685862042e-06, + "loss": 1.5753, + "step": 43570 + }, + { + "epoch": 9.181018591667984, + "grad_norm": 1.270033597946167, + "learning_rate": 3.2801152142115766e-06, + "loss": 1.55, + "step": 43580 + }, + { + "epoch": 9.183125296255332, + "grad_norm": 1.247550368309021, + "learning_rate": 3.2633216397471966e-06, + "loss": 1.5918, + "step": 43590 + }, + { + "epoch": 9.185232000842682, + "grad_norm": 1.1211265325546265, + "learning_rate": 3.2465704525514785e-06, + "loss": 1.5428, + "step": 43600 + }, + { + "epoch": 9.187338705430031, + "grad_norm": 1.260231614112854, + "learning_rate": 3.2298616599643285e-06, + "loss": 1.5487, + "step": 43610 + }, + { + "epoch": 9.189445410017381, + "grad_norm": 1.412537932395935, + "learning_rate": 3.2131952693070898e-06, + "loss": 1.571, + "step": 43620 + }, + { + "epoch": 9.191552114604729, + "grad_norm": 1.1907163858413696, + "learning_rate": 3.196571287882488e-06, + "loss": 1.5942, + "step": 43630 + }, + { + "epoch": 9.193658819192079, + "grad_norm": 1.2359150648117065, + "learning_rate": 3.1799897229746857e-06, + "loss": 1.5795, + "step": 43640 + }, + { + "epoch": 9.195765523779428, + "grad_norm": 1.2746754884719849, + "learning_rate": 3.1634505818492256e-06, + "loss": 1.5895, + "step": 43650 + }, + { + "epoch": 9.197872228366776, + "grad_norm": 1.262993574142456, + "learning_rate": 3.146953871753122e-06, + "loss": 1.5722, + "step": 43660 + }, + { + "epoch": 9.199978932954126, + "grad_norm": 1.1132774353027344, + "learning_rate": 3.1304995999147713e-06, + "loss": 1.5522, + "step": 43670 + }, + { + "epoch": 9.202085637541476, + "grad_norm": 1.0957084894180298, + "learning_rate": 3.1140877735439387e-06, + "loss": 1.57, + "step": 43680 + }, + { + "epoch": 9.204192342128826, + "grad_norm": 1.1807489395141602, + "learning_rate": 3.0977183998318282e-06, + "loss": 1.5659, + "step": 43690 + }, + { + "epoch": 9.206299046716174, + "grad_norm": 1.3260751962661743, + "learning_rate": 3.0813914859510572e-06, + "loss": 1.5896, + "step": 43700 + }, + { + "epoch": 9.208405751303523, + "grad_norm": 1.2571806907653809, + "learning_rate": 3.0651070390556034e-06, + "loss": 1.5722, + "step": 43710 + }, + { + "epoch": 9.210512455890873, + "grad_norm": 1.2674102783203125, + "learning_rate": 3.048865066280848e-06, + "loss": 1.5034, + "step": 43720 + }, + { + "epoch": 9.212619160478223, + "grad_norm": 1.1594977378845215, + "learning_rate": 3.032665574743543e-06, + "loss": 1.5583, + "step": 43730 + }, + { + "epoch": 9.21472586506557, + "grad_norm": 1.2206066846847534, + "learning_rate": 3.0165085715418763e-06, + "loss": 1.5349, + "step": 43740 + }, + { + "epoch": 9.21683256965292, + "grad_norm": 1.282946228981018, + "learning_rate": 3.000394063755396e-06, + "loss": 1.552, + "step": 43750 + }, + { + "epoch": 9.21893927424027, + "grad_norm": 1.2245168685913086, + "learning_rate": 2.984322058444977e-06, + "loss": 1.5567, + "step": 43760 + }, + { + "epoch": 9.221045978827618, + "grad_norm": 1.1592695713043213, + "learning_rate": 2.9682925626529522e-06, + "loss": 1.5166, + "step": 43770 + }, + { + "epoch": 9.223152683414968, + "grad_norm": 1.0854359865188599, + "learning_rate": 2.952305583402981e-06, + "loss": 1.5572, + "step": 43780 + }, + { + "epoch": 9.225259388002318, + "grad_norm": 1.132101058959961, + "learning_rate": 2.9363611277001156e-06, + "loss": 1.5624, + "step": 43790 + }, + { + "epoch": 9.227366092589667, + "grad_norm": 1.1339458227157593, + "learning_rate": 2.9204592025307566e-06, + "loss": 1.4836, + "step": 43800 + }, + { + "epoch": 9.229472797177015, + "grad_norm": 1.1902509927749634, + "learning_rate": 2.904599814862663e-06, + "loss": 1.5311, + "step": 43810 + }, + { + "epoch": 9.231579501764365, + "grad_norm": 1.4040547609329224, + "learning_rate": 2.8887829716449876e-06, + "loss": 1.5296, + "step": 43820 + }, + { + "epoch": 9.233686206351715, + "grad_norm": 1.332128882408142, + "learning_rate": 2.87300867980822e-06, + "loss": 1.5542, + "step": 43830 + }, + { + "epoch": 9.235792910939063, + "grad_norm": 1.20232355594635, + "learning_rate": 2.857276946264198e-06, + "loss": 1.5255, + "step": 43840 + }, + { + "epoch": 9.237899615526413, + "grad_norm": 1.3149586915969849, + "learning_rate": 2.8415877779061182e-06, + "loss": 1.5336, + "step": 43850 + }, + { + "epoch": 9.240006320113762, + "grad_norm": 1.2837882041931152, + "learning_rate": 2.8259411816085492e-06, + "loss": 1.5487, + "step": 43860 + }, + { + "epoch": 9.242113024701112, + "grad_norm": 1.4165587425231934, + "learning_rate": 2.810337164227361e-06, + "loss": 1.5605, + "step": 43870 + }, + { + "epoch": 9.24421972928846, + "grad_norm": 1.1678107976913452, + "learning_rate": 2.79477573259983e-06, + "loss": 1.5262, + "step": 43880 + }, + { + "epoch": 9.24632643387581, + "grad_norm": 1.2908071279525757, + "learning_rate": 2.7792568935444796e-06, + "loss": 1.6054, + "step": 43890 + }, + { + "epoch": 9.24843313846316, + "grad_norm": 1.2241575717926025, + "learning_rate": 2.7637806538612586e-06, + "loss": 1.5645, + "step": 43900 + }, + { + "epoch": 9.250539843050507, + "grad_norm": 1.18755304813385, + "learning_rate": 2.748347020331421e-06, + "loss": 1.5223, + "step": 43910 + }, + { + "epoch": 9.252646547637857, + "grad_norm": 1.396857738494873, + "learning_rate": 2.732955999717546e-06, + "loss": 1.5947, + "step": 43920 + }, + { + "epoch": 9.254753252225207, + "grad_norm": 1.3757117986679077, + "learning_rate": 2.717607598763505e-06, + "loss": 1.5703, + "step": 43930 + }, + { + "epoch": 9.256859956812557, + "grad_norm": 1.3316391706466675, + "learning_rate": 2.702301824194586e-06, + "loss": 1.4793, + "step": 43940 + }, + { + "epoch": 9.258966661399905, + "grad_norm": 1.4265036582946777, + "learning_rate": 2.687038682717302e-06, + "loss": 1.5784, + "step": 43950 + }, + { + "epoch": 9.261073365987254, + "grad_norm": 1.159681797027588, + "learning_rate": 2.6718181810195696e-06, + "loss": 1.5113, + "step": 43960 + }, + { + "epoch": 9.263180070574604, + "grad_norm": 1.3730366230010986, + "learning_rate": 2.656640325770543e-06, + "loss": 1.4718, + "step": 43970 + }, + { + "epoch": 9.265286775161952, + "grad_norm": 1.3480654954910278, + "learning_rate": 2.6415051236207355e-06, + "loss": 1.5478, + "step": 43980 + }, + { + "epoch": 9.267393479749302, + "grad_norm": 1.2406569719314575, + "learning_rate": 2.626412581201987e-06, + "loss": 1.5258, + "step": 43990 + }, + { + "epoch": 9.269500184336652, + "grad_norm": 1.192407250404358, + "learning_rate": 2.6113627051273957e-06, + "loss": 1.591, + "step": 44000 + }, + { + "epoch": 9.271606888924001, + "grad_norm": 1.2141858339309692, + "learning_rate": 2.5963555019913988e-06, + "loss": 1.6122, + "step": 44010 + }, + { + "epoch": 9.27371359351135, + "grad_norm": 1.2186552286148071, + "learning_rate": 2.5813909783697354e-06, + "loss": 1.5491, + "step": 44020 + }, + { + "epoch": 9.275820298098699, + "grad_norm": 1.2834081649780273, + "learning_rate": 2.5664691408194165e-06, + "loss": 1.5743, + "step": 44030 + }, + { + "epoch": 9.277927002686049, + "grad_norm": 1.265353798866272, + "learning_rate": 2.551589995878789e-06, + "loss": 1.5876, + "step": 44040 + }, + { + "epoch": 9.280033707273397, + "grad_norm": 1.170989751815796, + "learning_rate": 2.5367535500674724e-06, + "loss": 1.5179, + "step": 44050 + }, + { + "epoch": 9.282140411860746, + "grad_norm": 1.162787914276123, + "learning_rate": 2.521959809886343e-06, + "loss": 1.5376, + "step": 44060 + }, + { + "epoch": 9.284247116448096, + "grad_norm": 1.1817100048065186, + "learning_rate": 2.5072087818176382e-06, + "loss": 1.5615, + "step": 44070 + }, + { + "epoch": 9.286353821035446, + "grad_norm": 1.217679500579834, + "learning_rate": 2.4925004723248323e-06, + "loss": 1.5643, + "step": 44080 + }, + { + "epoch": 9.288460525622794, + "grad_norm": 1.2032785415649414, + "learning_rate": 2.477834887852659e-06, + "loss": 1.5034, + "step": 44090 + }, + { + "epoch": 9.290567230210144, + "grad_norm": 1.3014835119247437, + "learning_rate": 2.4632120348272003e-06, + "loss": 1.4977, + "step": 44100 + }, + { + "epoch": 9.292673934797493, + "grad_norm": 1.2489004135131836, + "learning_rate": 2.4486319196557416e-06, + "loss": 1.5721, + "step": 44110 + }, + { + "epoch": 9.294780639384843, + "grad_norm": 1.165895700454712, + "learning_rate": 2.434094548726917e-06, + "loss": 1.5625, + "step": 44120 + }, + { + "epoch": 9.296887343972191, + "grad_norm": 1.2659752368927002, + "learning_rate": 2.419599928410554e-06, + "loss": 1.5215, + "step": 44130 + }, + { + "epoch": 9.29899404855954, + "grad_norm": 1.2900304794311523, + "learning_rate": 2.405148065057794e-06, + "loss": 1.6137, + "step": 44140 + }, + { + "epoch": 9.30110075314689, + "grad_norm": 1.1113080978393555, + "learning_rate": 2.390738965001038e-06, + "loss": 1.5197, + "step": 44150 + }, + { + "epoch": 9.303207457734239, + "grad_norm": 1.139173984527588, + "learning_rate": 2.376372634553936e-06, + "loss": 1.5136, + "step": 44160 + }, + { + "epoch": 9.305314162321588, + "grad_norm": 1.209246039390564, + "learning_rate": 2.3620490800114304e-06, + "loss": 1.605, + "step": 44170 + }, + { + "epoch": 9.307420866908938, + "grad_norm": 1.2354110479354858, + "learning_rate": 2.347768307649667e-06, + "loss": 1.5638, + "step": 44180 + }, + { + "epoch": 9.309527571496288, + "grad_norm": 1.3547914028167725, + "learning_rate": 2.3335303237260853e-06, + "loss": 1.6327, + "step": 44190 + }, + { + "epoch": 9.311634276083636, + "grad_norm": 1.3228518962860107, + "learning_rate": 2.3193351344793835e-06, + "loss": 1.5426, + "step": 44200 + }, + { + "epoch": 9.313740980670985, + "grad_norm": 1.3366025686264038, + "learning_rate": 2.3051827461294638e-06, + "loss": 1.5076, + "step": 44210 + }, + { + "epoch": 9.315847685258335, + "grad_norm": 1.2990881204605103, + "learning_rate": 2.291073164877511e-06, + "loss": 1.5914, + "step": 44220 + }, + { + "epoch": 9.317954389845683, + "grad_norm": 1.2209333181381226, + "learning_rate": 2.2770063969059562e-06, + "loss": 1.5379, + "step": 44230 + }, + { + "epoch": 9.320061094433033, + "grad_norm": 1.2527897357940674, + "learning_rate": 2.2629824483784366e-06, + "loss": 1.5866, + "step": 44240 + }, + { + "epoch": 9.322167799020383, + "grad_norm": 1.1806588172912598, + "learning_rate": 2.24900132543987e-06, + "loss": 1.545, + "step": 44250 + }, + { + "epoch": 9.324274503607732, + "grad_norm": 1.2967365980148315, + "learning_rate": 2.235063034216378e-06, + "loss": 1.5978, + "step": 44260 + }, + { + "epoch": 9.32638120819508, + "grad_norm": 1.301858901977539, + "learning_rate": 2.2211675808153198e-06, + "loss": 1.5622, + "step": 44270 + }, + { + "epoch": 9.32848791278243, + "grad_norm": 1.3768510818481445, + "learning_rate": 2.207314971325292e-06, + "loss": 1.5269, + "step": 44280 + }, + { + "epoch": 9.33059461736978, + "grad_norm": 1.2430403232574463, + "learning_rate": 2.193505211816127e-06, + "loss": 1.5703, + "step": 44290 + }, + { + "epoch": 9.332701321957128, + "grad_norm": 1.2545567750930786, + "learning_rate": 2.1797383083388412e-06, + "loss": 1.4986, + "step": 44300 + }, + { + "epoch": 9.334808026544477, + "grad_norm": 1.3329411745071411, + "learning_rate": 2.166014266925731e-06, + "loss": 1.5672, + "step": 44310 + }, + { + "epoch": 9.336914731131827, + "grad_norm": 1.41471266746521, + "learning_rate": 2.152333093590264e-06, + "loss": 1.561, + "step": 44320 + }, + { + "epoch": 9.339021435719177, + "grad_norm": 1.2551592588424683, + "learning_rate": 2.1386947943271562e-06, + "loss": 1.5539, + "step": 44330 + }, + { + "epoch": 9.341128140306525, + "grad_norm": 1.2264280319213867, + "learning_rate": 2.125099375112316e-06, + "loss": 1.5668, + "step": 44340 + }, + { + "epoch": 9.343234844893875, + "grad_norm": 1.3428797721862793, + "learning_rate": 2.1115468419028672e-06, + "loss": 1.5565, + "step": 44350 + }, + { + "epoch": 9.345341549481224, + "grad_norm": 1.0985000133514404, + "learning_rate": 2.0980372006371487e-06, + "loss": 1.5038, + "step": 44360 + }, + { + "epoch": 9.347448254068572, + "grad_norm": 1.2095003128051758, + "learning_rate": 2.0845704572347025e-06, + "loss": 1.5727, + "step": 44370 + }, + { + "epoch": 9.349554958655922, + "grad_norm": 1.3696715831756592, + "learning_rate": 2.0711466175962756e-06, + "loss": 1.5711, + "step": 44380 + }, + { + "epoch": 9.351661663243272, + "grad_norm": 1.1304073333740234, + "learning_rate": 2.0577656876038076e-06, + "loss": 1.5606, + "step": 44390 + }, + { + "epoch": 9.353768367830622, + "grad_norm": 1.2599557638168335, + "learning_rate": 2.0444276731204415e-06, + "loss": 1.5672, + "step": 44400 + }, + { + "epoch": 9.35587507241797, + "grad_norm": 1.1687496900558472, + "learning_rate": 2.0311325799905356e-06, + "loss": 1.5357, + "step": 44410 + }, + { + "epoch": 9.35798177700532, + "grad_norm": 1.28921377658844, + "learning_rate": 2.0178804140396078e-06, + "loss": 1.5935, + "step": 44420 + }, + { + "epoch": 9.360088481592669, + "grad_norm": 1.1254773139953613, + "learning_rate": 2.004671181074369e-06, + "loss": 1.5027, + "step": 44430 + }, + { + "epoch": 9.362195186180017, + "grad_norm": 1.3228946924209595, + "learning_rate": 1.9915048868827558e-06, + "loss": 1.5286, + "step": 44440 + }, + { + "epoch": 9.364301890767367, + "grad_norm": 1.2278170585632324, + "learning_rate": 1.9783815372338423e-06, + "loss": 1.5514, + "step": 44450 + }, + { + "epoch": 9.366408595354716, + "grad_norm": 1.287775993347168, + "learning_rate": 1.9653011378779283e-06, + "loss": 1.5499, + "step": 44460 + }, + { + "epoch": 9.368515299942066, + "grad_norm": 1.283502221107483, + "learning_rate": 1.9522636945464635e-06, + "loss": 1.5364, + "step": 44470 + }, + { + "epoch": 9.370622004529414, + "grad_norm": 1.1731789112091064, + "learning_rate": 1.9392692129520882e-06, + "loss": 1.5227, + "step": 44480 + }, + { + "epoch": 9.372728709116764, + "grad_norm": 1.3029754161834717, + "learning_rate": 1.9263176987886043e-06, + "loss": 1.5858, + "step": 44490 + }, + { + "epoch": 9.374835413704114, + "grad_norm": 1.1961798667907715, + "learning_rate": 1.9134091577310274e-06, + "loss": 1.588, + "step": 44500 + }, + { + "epoch": 9.376942118291463, + "grad_norm": 1.2847782373428345, + "learning_rate": 1.9005435954354667e-06, + "loss": 1.5501, + "step": 44510 + }, + { + "epoch": 9.379048822878811, + "grad_norm": 1.1511763334274292, + "learning_rate": 1.88772101753929e-06, + "loss": 1.5782, + "step": 44520 + }, + { + "epoch": 9.381155527466161, + "grad_norm": 1.1530286073684692, + "learning_rate": 1.8749414296609702e-06, + "loss": 1.5413, + "step": 44530 + }, + { + "epoch": 9.38326223205351, + "grad_norm": 1.1921340227127075, + "learning_rate": 1.862204837400161e-06, + "loss": 1.5548, + "step": 44540 + }, + { + "epoch": 9.385368936640859, + "grad_norm": 1.3235081434249878, + "learning_rate": 1.8495112463376874e-06, + "loss": 1.5859, + "step": 44550 + }, + { + "epoch": 9.387475641228209, + "grad_norm": 1.2756280899047852, + "learning_rate": 1.836860662035489e-06, + "loss": 1.5467, + "step": 44560 + }, + { + "epoch": 9.389582345815558, + "grad_norm": 1.198059320449829, + "learning_rate": 1.8242530900367315e-06, + "loss": 1.6058, + "step": 44570 + }, + { + "epoch": 9.391689050402908, + "grad_norm": 1.1972780227661133, + "learning_rate": 1.8116885358656744e-06, + "loss": 1.5333, + "step": 44580 + }, + { + "epoch": 9.393795754990256, + "grad_norm": 1.2168101072311401, + "learning_rate": 1.7991670050277354e-06, + "loss": 1.5402, + "step": 44590 + }, + { + "epoch": 9.395902459577606, + "grad_norm": 1.1930609941482544, + "learning_rate": 1.786688503009537e-06, + "loss": 1.5392, + "step": 44600 + }, + { + "epoch": 9.398009164164955, + "grad_norm": 1.1218000650405884, + "learning_rate": 1.7742530352787612e-06, + "loss": 1.4655, + "step": 44610 + }, + { + "epoch": 9.400115868752303, + "grad_norm": 1.3308579921722412, + "learning_rate": 1.7618606072842936e-06, + "loss": 1.5094, + "step": 44620 + }, + { + "epoch": 9.402222573339653, + "grad_norm": 1.2352577447891235, + "learning_rate": 1.7495112244561573e-06, + "loss": 1.5612, + "step": 44630 + }, + { + "epoch": 9.404329277927003, + "grad_norm": 1.2200725078582764, + "learning_rate": 1.7372048922054906e-06, + "loss": 1.5206, + "step": 44640 + }, + { + "epoch": 9.406435982514353, + "grad_norm": 1.4431538581848145, + "learning_rate": 1.7249416159245691e-06, + "loss": 1.5482, + "step": 44650 + }, + { + "epoch": 9.4085426871017, + "grad_norm": 1.1892200708389282, + "learning_rate": 1.7127214009868385e-06, + "loss": 1.64, + "step": 44660 + }, + { + "epoch": 9.41064939168905, + "grad_norm": 1.2224235534667969, + "learning_rate": 1.7005442527468163e-06, + "loss": 1.5128, + "step": 44670 + }, + { + "epoch": 9.4127560962764, + "grad_norm": 1.1963303089141846, + "learning_rate": 1.6884101765402116e-06, + "loss": 1.5092, + "step": 44680 + }, + { + "epoch": 9.414862800863748, + "grad_norm": 1.2295386791229248, + "learning_rate": 1.676319177683816e-06, + "loss": 1.5728, + "step": 44690 + }, + { + "epoch": 9.416969505451098, + "grad_norm": 1.390763759613037, + "learning_rate": 1.6642712614755695e-06, + "loss": 1.5631, + "step": 44700 + }, + { + "epoch": 9.419076210038448, + "grad_norm": 1.2150217294692993, + "learning_rate": 1.6522664331945382e-06, + "loss": 1.5224, + "step": 44710 + }, + { + "epoch": 9.421182914625797, + "grad_norm": 1.1678669452667236, + "learning_rate": 1.6403046981008807e-06, + "loss": 1.6289, + "step": 44720 + }, + { + "epoch": 9.423289619213145, + "grad_norm": 1.193841814994812, + "learning_rate": 1.6283860614358936e-06, + "loss": 1.6028, + "step": 44730 + }, + { + "epoch": 9.425396323800495, + "grad_norm": 1.2163934707641602, + "learning_rate": 1.6165105284219772e-06, + "loss": 1.5197, + "step": 44740 + }, + { + "epoch": 9.427503028387845, + "grad_norm": 1.2479548454284668, + "learning_rate": 1.6046781042626802e-06, + "loss": 1.5534, + "step": 44750 + }, + { + "epoch": 9.429609732975194, + "grad_norm": 1.2134898900985718, + "learning_rate": 1.5928887941426107e-06, + "loss": 1.5745, + "step": 44760 + }, + { + "epoch": 9.431716437562542, + "grad_norm": 1.2324718236923218, + "learning_rate": 1.5811426032275146e-06, + "loss": 1.5941, + "step": 44770 + }, + { + "epoch": 9.433823142149892, + "grad_norm": 1.2660291194915771, + "learning_rate": 1.5694395366642411e-06, + "loss": 1.5639, + "step": 44780 + }, + { + "epoch": 9.435929846737242, + "grad_norm": 1.1077207326889038, + "learning_rate": 1.5577795995807554e-06, + "loss": 1.5146, + "step": 44790 + }, + { + "epoch": 9.43803655132459, + "grad_norm": 1.2963979244232178, + "learning_rate": 1.5461627970860814e-06, + "loss": 1.5483, + "step": 44800 + }, + { + "epoch": 9.44014325591194, + "grad_norm": 1.2777230739593506, + "learning_rate": 1.5345891342704033e-06, + "loss": 1.5602, + "step": 44810 + }, + { + "epoch": 9.44224996049929, + "grad_norm": 1.2303513288497925, + "learning_rate": 1.523058616204942e-06, + "loss": 1.5415, + "step": 44820 + }, + { + "epoch": 9.444356665086639, + "grad_norm": 1.3114386796951294, + "learning_rate": 1.5115712479420786e-06, + "loss": 1.6028, + "step": 44830 + }, + { + "epoch": 9.446463369673987, + "grad_norm": 1.1598353385925293, + "learning_rate": 1.5001270345152307e-06, + "loss": 1.5337, + "step": 44840 + }, + { + "epoch": 9.448570074261337, + "grad_norm": 1.2327178716659546, + "learning_rate": 1.4887259809389208e-06, + "loss": 1.5464, + "step": 44850 + }, + { + "epoch": 9.450676778848687, + "grad_norm": 1.2036960124969482, + "learning_rate": 1.4773680922087863e-06, + "loss": 1.5555, + "step": 44860 + }, + { + "epoch": 9.452783483436034, + "grad_norm": 1.258427381515503, + "learning_rate": 1.4660533733015236e-06, + "loss": 1.5747, + "step": 44870 + }, + { + "epoch": 9.454890188023384, + "grad_norm": 1.2606942653656006, + "learning_rate": 1.4547818291749115e-06, + "loss": 1.5928, + "step": 44880 + }, + { + "epoch": 9.456996892610734, + "grad_norm": 1.2755640745162964, + "learning_rate": 1.4435534647678438e-06, + "loss": 1.5394, + "step": 44890 + }, + { + "epoch": 9.459103597198084, + "grad_norm": 1.2024848461151123, + "learning_rate": 1.4323682850002406e-06, + "loss": 1.568, + "step": 44900 + }, + { + "epoch": 9.461210301785432, + "grad_norm": 1.3687775135040283, + "learning_rate": 1.4212262947731703e-06, + "loss": 1.5108, + "step": 44910 + }, + { + "epoch": 9.463317006372781, + "grad_norm": 1.1984235048294067, + "learning_rate": 1.4101274989687053e-06, + "loss": 1.5219, + "step": 44920 + }, + { + "epoch": 9.465423710960131, + "grad_norm": 1.1227729320526123, + "learning_rate": 1.399071902450022e-06, + "loss": 1.6081, + "step": 44930 + }, + { + "epoch": 9.467530415547479, + "grad_norm": 1.3621063232421875, + "learning_rate": 1.3880595100613792e-06, + "loss": 1.5531, + "step": 44940 + }, + { + "epoch": 9.469637120134829, + "grad_norm": 1.2998682260513306, + "learning_rate": 1.3770903266281054e-06, + "loss": 1.5531, + "step": 44950 + }, + { + "epoch": 9.471743824722179, + "grad_norm": 1.3776503801345825, + "learning_rate": 1.3661643569565785e-06, + "loss": 1.5866, + "step": 44960 + }, + { + "epoch": 9.473850529309528, + "grad_norm": 1.2252628803253174, + "learning_rate": 1.3552816058342354e-06, + "loss": 1.5095, + "step": 44970 + }, + { + "epoch": 9.475957233896876, + "grad_norm": 1.3023484945297241, + "learning_rate": 1.3444420780296063e-06, + "loss": 1.5863, + "step": 44980 + }, + { + "epoch": 9.478063938484226, + "grad_norm": 1.2645686864852905, + "learning_rate": 1.3336457782922474e-06, + "loss": 1.5368, + "step": 44990 + }, + { + "epoch": 9.480170643071576, + "grad_norm": 1.1360158920288086, + "learning_rate": 1.3228927113528189e-06, + "loss": 1.5091, + "step": 45000 + }, + { + "epoch": 9.482277347658924, + "grad_norm": 1.1841646432876587, + "learning_rate": 1.3121828819229743e-06, + "loss": 1.5503, + "step": 45010 + }, + { + "epoch": 9.484384052246273, + "grad_norm": 1.1105971336364746, + "learning_rate": 1.3015162946954706e-06, + "loss": 1.5561, + "step": 45020 + }, + { + "epoch": 9.486490756833623, + "grad_norm": 1.1991256475448608, + "learning_rate": 1.290892954344125e-06, + "loss": 1.5536, + "step": 45030 + }, + { + "epoch": 9.488597461420973, + "grad_norm": 1.1073957681655884, + "learning_rate": 1.2803128655237694e-06, + "loss": 1.5014, + "step": 45040 + }, + { + "epoch": 9.490704166008321, + "grad_norm": 1.2259361743927002, + "learning_rate": 1.2697760328702734e-06, + "loss": 1.5671, + "step": 45050 + }, + { + "epoch": 9.49281087059567, + "grad_norm": 1.1295838356018066, + "learning_rate": 1.2592824610006215e-06, + "loss": 1.5262, + "step": 45060 + }, + { + "epoch": 9.49491757518302, + "grad_norm": 1.258636236190796, + "learning_rate": 1.248832154512769e-06, + "loss": 1.55, + "step": 45070 + }, + { + "epoch": 9.497024279770368, + "grad_norm": 1.2501273155212402, + "learning_rate": 1.2384251179857643e-06, + "loss": 1.5668, + "step": 45080 + }, + { + "epoch": 9.499130984357718, + "grad_norm": 1.2161610126495361, + "learning_rate": 1.2280613559796595e-06, + "loss": 1.5267, + "step": 45090 + }, + { + "epoch": 9.501237688945068, + "grad_norm": 1.2681390047073364, + "learning_rate": 1.2177408730355554e-06, + "loss": 1.5289, + "step": 45100 + }, + { + "epoch": 9.503344393532418, + "grad_norm": 1.1886988878250122, + "learning_rate": 1.2074636736756129e-06, + "loss": 1.5128, + "step": 45110 + }, + { + "epoch": 9.505451098119766, + "grad_norm": 1.2830315828323364, + "learning_rate": 1.1972297624030072e-06, + "loss": 1.5249, + "step": 45120 + }, + { + "epoch": 9.507557802707115, + "grad_norm": 1.2447185516357422, + "learning_rate": 1.18703914370194e-06, + "loss": 1.5655, + "step": 45130 + }, + { + "epoch": 9.509664507294465, + "grad_norm": 1.3017278909683228, + "learning_rate": 1.1768918220376624e-06, + "loss": 1.5557, + "step": 45140 + }, + { + "epoch": 9.511771211881815, + "grad_norm": 1.3122484683990479, + "learning_rate": 1.1667878018564171e-06, + "loss": 1.5859, + "step": 45150 + }, + { + "epoch": 9.513877916469163, + "grad_norm": 1.339020848274231, + "learning_rate": 1.1567270875855407e-06, + "loss": 1.5386, + "step": 45160 + }, + { + "epoch": 9.515984621056512, + "grad_norm": 1.227014422416687, + "learning_rate": 1.1467096836333069e-06, + "loss": 1.6457, + "step": 45170 + }, + { + "epoch": 9.518091325643862, + "grad_norm": 1.3103618621826172, + "learning_rate": 1.1367355943890823e-06, + "loss": 1.5938, + "step": 45180 + }, + { + "epoch": 9.52019803023121, + "grad_norm": 1.1503431797027588, + "learning_rate": 1.1268048242232375e-06, + "loss": 1.5428, + "step": 45190 + }, + { + "epoch": 9.52230473481856, + "grad_norm": 1.1854219436645508, + "learning_rate": 1.1169173774871478e-06, + "loss": 1.5368, + "step": 45200 + }, + { + "epoch": 9.52441143940591, + "grad_norm": 1.2122063636779785, + "learning_rate": 1.1070732585132026e-06, + "loss": 1.5124, + "step": 45210 + }, + { + "epoch": 9.52651814399326, + "grad_norm": 1.2711586952209473, + "learning_rate": 1.0972724716148187e-06, + "loss": 1.5631, + "step": 45220 + }, + { + "epoch": 9.528624848580607, + "grad_norm": 1.3009141683578491, + "learning_rate": 1.0875150210864271e-06, + "loss": 1.5276, + "step": 45230 + }, + { + "epoch": 9.530731553167957, + "grad_norm": 1.1954138278961182, + "learning_rate": 1.0778009112034748e-06, + "loss": 1.4954, + "step": 45240 + }, + { + "epoch": 9.532838257755307, + "grad_norm": 1.1968759298324585, + "learning_rate": 1.0681301462223903e-06, + "loss": 1.5634, + "step": 45250 + }, + { + "epoch": 9.534944962342655, + "grad_norm": 1.2859160900115967, + "learning_rate": 1.0585027303806393e-06, + "loss": 1.5997, + "step": 45260 + }, + { + "epoch": 9.537051666930005, + "grad_norm": 1.1200820207595825, + "learning_rate": 1.0489186678966812e-06, + "loss": 1.5602, + "step": 45270 + }, + { + "epoch": 9.539158371517354, + "grad_norm": 1.3090044260025024, + "learning_rate": 1.0393779629699786e-06, + "loss": 1.6024, + "step": 45280 + }, + { + "epoch": 9.541265076104704, + "grad_norm": 1.2200546264648438, + "learning_rate": 1.0298806197809984e-06, + "loss": 1.5606, + "step": 45290 + }, + { + "epoch": 9.543371780692052, + "grad_norm": 1.2394194602966309, + "learning_rate": 1.0204266424912123e-06, + "loss": 1.5037, + "step": 45300 + }, + { + "epoch": 9.545478485279402, + "grad_norm": 1.1983662843704224, + "learning_rate": 1.011016035243073e-06, + "loss": 1.5588, + "step": 45310 + }, + { + "epoch": 9.547585189866751, + "grad_norm": 1.2025030851364136, + "learning_rate": 1.001648802160049e-06, + "loss": 1.5663, + "step": 45320 + }, + { + "epoch": 9.5496918944541, + "grad_norm": 1.2647790908813477, + "learning_rate": 9.923249473466012e-07, + "loss": 1.5482, + "step": 45330 + }, + { + "epoch": 9.55179859904145, + "grad_norm": 1.2974402904510498, + "learning_rate": 9.830444748881728e-07, + "loss": 1.5217, + "step": 45340 + }, + { + "epoch": 9.553905303628799, + "grad_norm": 1.124445915222168, + "learning_rate": 9.738073888512e-07, + "loss": 1.4856, + "step": 45350 + }, + { + "epoch": 9.556012008216149, + "grad_norm": 1.7168290615081787, + "learning_rate": 9.64613693283123e-07, + "loss": 1.5335, + "step": 45360 + }, + { + "epoch": 9.558118712803497, + "grad_norm": 1.2465564012527466, + "learning_rate": 9.554633922123412e-07, + "loss": 1.5456, + "step": 45370 + }, + { + "epoch": 9.560225417390846, + "grad_norm": 1.1842801570892334, + "learning_rate": 9.463564896482813e-07, + "loss": 1.5513, + "step": 45380 + }, + { + "epoch": 9.562332121978196, + "grad_norm": 1.1776254177093506, + "learning_rate": 9.372929895813065e-07, + "loss": 1.5909, + "step": 45390 + }, + { + "epoch": 9.564438826565546, + "grad_norm": 1.1699087619781494, + "learning_rate": 9.282728959827958e-07, + "loss": 1.5508, + "step": 45400 + }, + { + "epoch": 9.566545531152894, + "grad_norm": 1.3377196788787842, + "learning_rate": 9.192962128050986e-07, + "loss": 1.5649, + "step": 45410 + }, + { + "epoch": 9.568652235740243, + "grad_norm": 1.380293369293213, + "learning_rate": 9.103629439815354e-07, + "loss": 1.5433, + "step": 45420 + }, + { + "epoch": 9.570758940327593, + "grad_norm": 1.2865245342254639, + "learning_rate": 9.014730934264192e-07, + "loss": 1.5992, + "step": 45430 + }, + { + "epoch": 9.572865644914941, + "grad_norm": 1.338066816329956, + "learning_rate": 8.926266650350234e-07, + "loss": 1.5397, + "step": 45440 + }, + { + "epoch": 9.574972349502291, + "grad_norm": 1.3003287315368652, + "learning_rate": 8.838236626836138e-07, + "loss": 1.6043, + "step": 45450 + }, + { + "epoch": 9.57707905408964, + "grad_norm": 1.2428746223449707, + "learning_rate": 8.750640902294161e-07, + "loss": 1.6224, + "step": 45460 + }, + { + "epoch": 9.579185758676989, + "grad_norm": 1.2704057693481445, + "learning_rate": 8.663479515106043e-07, + "loss": 1.5504, + "step": 45470 + }, + { + "epoch": 9.581292463264338, + "grad_norm": 1.1228466033935547, + "learning_rate": 8.57675250346368e-07, + "loss": 1.563, + "step": 45480 + }, + { + "epoch": 9.583399167851688, + "grad_norm": 1.1902949810028076, + "learning_rate": 8.490459905368342e-07, + "loss": 1.4802, + "step": 45490 + }, + { + "epoch": 9.585505872439038, + "grad_norm": 1.2982428073883057, + "learning_rate": 8.404601758630892e-07, + "loss": 1.5255, + "step": 45500 + }, + { + "epoch": 9.587612577026386, + "grad_norm": 1.2793209552764893, + "learning_rate": 8.319178100872016e-07, + "loss": 1.5295, + "step": 45510 + }, + { + "epoch": 9.589719281613736, + "grad_norm": 1.2290621995925903, + "learning_rate": 8.234188969521883e-07, + "loss": 1.5463, + "step": 45520 + }, + { + "epoch": 9.591825986201085, + "grad_norm": 1.2028400897979736, + "learning_rate": 8.149634401820372e-07, + "loss": 1.5438, + "step": 45530 + }, + { + "epoch": 9.593932690788435, + "grad_norm": 1.3339173793792725, + "learning_rate": 8.065514434816845e-07, + "loss": 1.5446, + "step": 45540 + }, + { + "epoch": 9.596039395375783, + "grad_norm": 1.1351807117462158, + "learning_rate": 7.981829105370375e-07, + "loss": 1.5374, + "step": 45550 + }, + { + "epoch": 9.598146099963133, + "grad_norm": 1.3664101362228394, + "learning_rate": 7.898578450149407e-07, + "loss": 1.586, + "step": 45560 + }, + { + "epoch": 9.600252804550482, + "grad_norm": 1.2555339336395264, + "learning_rate": 7.815762505632096e-07, + "loss": 1.5537, + "step": 45570 + }, + { + "epoch": 9.60235950913783, + "grad_norm": 1.0828977823257446, + "learning_rate": 7.733381308105969e-07, + "loss": 1.5487, + "step": 45580 + }, + { + "epoch": 9.60446621372518, + "grad_norm": 1.2509876489639282, + "learning_rate": 7.651434893668152e-07, + "loss": 1.5715, + "step": 45590 + }, + { + "epoch": 9.60657291831253, + "grad_norm": 1.2072515487670898, + "learning_rate": 7.569923298225146e-07, + "loss": 1.4886, + "step": 45600 + }, + { + "epoch": 9.60867962289988, + "grad_norm": 1.249969482421875, + "learning_rate": 7.488846557493267e-07, + "loss": 1.5317, + "step": 45610 + }, + { + "epoch": 9.610786327487228, + "grad_norm": 1.304600477218628, + "learning_rate": 7.408204706997879e-07, + "loss": 1.6078, + "step": 45620 + }, + { + "epoch": 9.612893032074577, + "grad_norm": 1.2158945798873901, + "learning_rate": 7.327997782073936e-07, + "loss": 1.5877, + "step": 45630 + }, + { + "epoch": 9.614999736661927, + "grad_norm": 1.1745023727416992, + "learning_rate": 7.248225817865884e-07, + "loss": 1.5379, + "step": 45640 + }, + { + "epoch": 9.617106441249275, + "grad_norm": 1.2478928565979004, + "learning_rate": 7.168888849327426e-07, + "loss": 1.5642, + "step": 45650 + }, + { + "epoch": 9.619213145836625, + "grad_norm": 1.1718308925628662, + "learning_rate": 7.08998691122198e-07, + "loss": 1.5113, + "step": 45660 + }, + { + "epoch": 9.621319850423975, + "grad_norm": 1.2330695390701294, + "learning_rate": 7.01152003812211e-07, + "loss": 1.5686, + "step": 45670 + }, + { + "epoch": 9.623426555011324, + "grad_norm": 1.255374789237976, + "learning_rate": 6.933488264409538e-07, + "loss": 1.5387, + "step": 45680 + }, + { + "epoch": 9.625533259598672, + "grad_norm": 1.1764744520187378, + "learning_rate": 6.855891624275801e-07, + "loss": 1.4828, + "step": 45690 + }, + { + "epoch": 9.627639964186022, + "grad_norm": 1.2803027629852295, + "learning_rate": 6.77873015172148e-07, + "loss": 1.5569, + "step": 45700 + }, + { + "epoch": 9.629746668773372, + "grad_norm": 1.295817255973816, + "learning_rate": 6.702003880556418e-07, + "loss": 1.5269, + "step": 45710 + }, + { + "epoch": 9.63185337336072, + "grad_norm": 1.1505221128463745, + "learning_rate": 6.625712844400056e-07, + "loss": 1.5517, + "step": 45720 + }, + { + "epoch": 9.63396007794807, + "grad_norm": 1.2030140161514282, + "learning_rate": 6.549857076680987e-07, + "loss": 1.5361, + "step": 45730 + }, + { + "epoch": 9.63606678253542, + "grad_norm": 1.2046858072280884, + "learning_rate": 6.474436610636958e-07, + "loss": 1.5525, + "step": 45740 + }, + { + "epoch": 9.638173487122769, + "grad_norm": 1.2719230651855469, + "learning_rate": 6.399451479315088e-07, + "loss": 1.5241, + "step": 45750 + }, + { + "epoch": 9.640280191710117, + "grad_norm": 1.1040761470794678, + "learning_rate": 6.324901715571651e-07, + "loss": 1.5709, + "step": 45760 + }, + { + "epoch": 9.642386896297467, + "grad_norm": 1.313246250152588, + "learning_rate": 6.250787352072518e-07, + "loss": 1.5177, + "step": 45770 + }, + { + "epoch": 9.644493600884816, + "grad_norm": 1.1665434837341309, + "learning_rate": 6.177108421292266e-07, + "loss": 1.4843, + "step": 45780 + }, + { + "epoch": 9.646600305472166, + "grad_norm": 1.19307541847229, + "learning_rate": 6.103864955514849e-07, + "loss": 1.4744, + "step": 45790 + }, + { + "epoch": 9.648707010059514, + "grad_norm": 1.35244882106781, + "learning_rate": 6.031056986833705e-07, + "loss": 1.5368, + "step": 45800 + }, + { + "epoch": 9.650813714646864, + "grad_norm": 1.1607619524002075, + "learning_rate": 5.958684547151095e-07, + "loss": 1.6192, + "step": 45810 + }, + { + "epoch": 9.652920419234214, + "grad_norm": 1.2676018476486206, + "learning_rate": 5.886747668178538e-07, + "loss": 1.5227, + "step": 45820 + }, + { + "epoch": 9.655027123821561, + "grad_norm": 1.2928742170333862, + "learning_rate": 5.815246381436934e-07, + "loss": 1.518, + "step": 45830 + }, + { + "epoch": 9.657133828408911, + "grad_norm": 1.1910942792892456, + "learning_rate": 5.744180718255776e-07, + "loss": 1.5501, + "step": 45840 + }, + { + "epoch": 9.659240532996261, + "grad_norm": 1.215248465538025, + "learning_rate": 5.673550709774267e-07, + "loss": 1.6009, + "step": 45850 + }, + { + "epoch": 9.661347237583609, + "grad_norm": 1.1708420515060425, + "learning_rate": 5.603356386940429e-07, + "loss": 1.5504, + "step": 45860 + }, + { + "epoch": 9.663453942170959, + "grad_norm": 1.3606902360916138, + "learning_rate": 5.533597780511435e-07, + "loss": 1.5371, + "step": 45870 + }, + { + "epoch": 9.665560646758308, + "grad_norm": 1.2347360849380493, + "learning_rate": 5.464274921053503e-07, + "loss": 1.5587, + "step": 45880 + }, + { + "epoch": 9.667667351345658, + "grad_norm": 1.3127825260162354, + "learning_rate": 5.395387838942001e-07, + "loss": 1.5692, + "step": 45890 + }, + { + "epoch": 9.669774055933006, + "grad_norm": 1.1706701517105103, + "learning_rate": 5.326936564361118e-07, + "loss": 1.5369, + "step": 45900 + }, + { + "epoch": 9.671880760520356, + "grad_norm": 1.190574049949646, + "learning_rate": 5.258921127304528e-07, + "loss": 1.4852, + "step": 45910 + }, + { + "epoch": 9.673987465107706, + "grad_norm": 1.3065412044525146, + "learning_rate": 5.191341557574392e-07, + "loss": 1.5335, + "step": 45920 + }, + { + "epoch": 9.676094169695055, + "grad_norm": 1.227949857711792, + "learning_rate": 5.124197884782356e-07, + "loss": 1.5388, + "step": 45930 + }, + { + "epoch": 9.678200874282403, + "grad_norm": 1.2584184408187866, + "learning_rate": 5.057490138348775e-07, + "loss": 1.5407, + "step": 45940 + }, + { + "epoch": 9.680307578869753, + "grad_norm": 1.395398736000061, + "learning_rate": 4.991218347503157e-07, + "loss": 1.5738, + "step": 45950 + }, + { + "epoch": 9.682414283457103, + "grad_norm": 1.1984232664108276, + "learning_rate": 4.92538254128383e-07, + "loss": 1.5051, + "step": 45960 + }, + { + "epoch": 9.68452098804445, + "grad_norm": 1.2473891973495483, + "learning_rate": 4.859982748538272e-07, + "loss": 1.6031, + "step": 45970 + }, + { + "epoch": 9.6866276926318, + "grad_norm": 1.2355321645736694, + "learning_rate": 4.795018997922784e-07, + "loss": 1.5476, + "step": 45980 + }, + { + "epoch": 9.68873439721915, + "grad_norm": 1.1943268775939941, + "learning_rate": 4.7304913179025965e-07, + "loss": 1.566, + "step": 45990 + }, + { + "epoch": 9.6908411018065, + "grad_norm": 1.1601346731185913, + "learning_rate": 4.666399736751981e-07, + "loss": 1.5289, + "step": 46000 + }, + { + "epoch": 9.692947806393848, + "grad_norm": 1.346374750137329, + "learning_rate": 4.602744282554028e-07, + "loss": 1.5774, + "step": 46010 + }, + { + "epoch": 9.695054510981198, + "grad_norm": 1.2781177759170532, + "learning_rate": 4.5395249832007604e-07, + "loss": 1.5595, + "step": 46020 + }, + { + "epoch": 9.697161215568547, + "grad_norm": 1.2155574560165405, + "learning_rate": 4.4767418663930196e-07, + "loss": 1.5051, + "step": 46030 + }, + { + "epoch": 9.699267920155895, + "grad_norm": 1.2750422954559326, + "learning_rate": 4.414394959640578e-07, + "loss": 1.5713, + "step": 46040 + }, + { + "epoch": 9.701374624743245, + "grad_norm": 1.2818180322647095, + "learning_rate": 4.352484290262249e-07, + "loss": 1.5517, + "step": 46050 + }, + { + "epoch": 9.703481329330595, + "grad_norm": 1.320322871208191, + "learning_rate": 4.291009885385333e-07, + "loss": 1.5886, + "step": 46060 + }, + { + "epoch": 9.705588033917945, + "grad_norm": 1.1885708570480347, + "learning_rate": 4.229971771946284e-07, + "loss": 1.5852, + "step": 46070 + }, + { + "epoch": 9.707694738505293, + "grad_norm": 1.2882665395736694, + "learning_rate": 4.1693699766902626e-07, + "loss": 1.5852, + "step": 46080 + }, + { + "epoch": 9.709801443092642, + "grad_norm": 1.1464838981628418, + "learning_rate": 4.10920452617114e-07, + "loss": 1.555, + "step": 46090 + }, + { + "epoch": 9.711908147679992, + "grad_norm": 1.2324957847595215, + "learning_rate": 4.049475446751827e-07, + "loss": 1.5373, + "step": 46100 + }, + { + "epoch": 9.71401485226734, + "grad_norm": 1.1413804292678833, + "learning_rate": 3.9901827646039446e-07, + "loss": 1.5398, + "step": 46110 + }, + { + "epoch": 9.71612155685469, + "grad_norm": 1.084005355834961, + "learning_rate": 3.931326505707822e-07, + "loss": 1.5267, + "step": 46120 + }, + { + "epoch": 9.71822826144204, + "grad_norm": 1.366945505142212, + "learning_rate": 3.872906695852607e-07, + "loss": 1.6069, + "step": 46130 + }, + { + "epoch": 9.72033496602939, + "grad_norm": 1.3505078554153442, + "learning_rate": 3.814923360636158e-07, + "loss": 1.6329, + "step": 46140 + }, + { + "epoch": 9.722441670616737, + "grad_norm": 1.1962380409240723, + "learning_rate": 3.7573765254651504e-07, + "loss": 1.5906, + "step": 46150 + }, + { + "epoch": 9.724548375204087, + "grad_norm": 1.1975687742233276, + "learning_rate": 3.700266215554971e-07, + "loss": 1.5692, + "step": 46160 + }, + { + "epoch": 9.726655079791437, + "grad_norm": 1.2534469366073608, + "learning_rate": 3.643592455929712e-07, + "loss": 1.5563, + "step": 46170 + }, + { + "epoch": 9.728761784378786, + "grad_norm": 1.1433435678482056, + "learning_rate": 3.587355271422288e-07, + "loss": 1.5346, + "step": 46180 + }, + { + "epoch": 9.730868488966134, + "grad_norm": 1.272630214691162, + "learning_rate": 3.53155468667421e-07, + "loss": 1.5332, + "step": 46190 + }, + { + "epoch": 9.732975193553484, + "grad_norm": 1.3106838464736938, + "learning_rate": 3.4761907261356976e-07, + "loss": 1.6156, + "step": 46200 + }, + { + "epoch": 9.735081898140834, + "grad_norm": 1.2662346363067627, + "learning_rate": 3.4212634140656784e-07, + "loss": 1.5496, + "step": 46210 + }, + { + "epoch": 9.737188602728182, + "grad_norm": 1.1517857313156128, + "learning_rate": 3.366772774531679e-07, + "loss": 1.5525, + "step": 46220 + }, + { + "epoch": 9.739295307315532, + "grad_norm": 1.2349926233291626, + "learning_rate": 3.3127188314100444e-07, + "loss": 1.4994, + "step": 46230 + }, + { + "epoch": 9.741402011902881, + "grad_norm": 1.3378150463104248, + "learning_rate": 3.2591016083856064e-07, + "loss": 1.5132, + "step": 46240 + }, + { + "epoch": 9.743508716490231, + "grad_norm": 1.350303053855896, + "learning_rate": 3.205921128952016e-07, + "loss": 1.5775, + "step": 46250 + }, + { + "epoch": 9.745615421077579, + "grad_norm": 1.3889814615249634, + "learning_rate": 3.1531774164111903e-07, + "loss": 1.6269, + "step": 46260 + }, + { + "epoch": 9.747722125664929, + "grad_norm": 1.2698912620544434, + "learning_rate": 3.1008704938743084e-07, + "loss": 1.5344, + "step": 46270 + }, + { + "epoch": 9.749828830252278, + "grad_norm": 1.3080488443374634, + "learning_rate": 3.049000384260592e-07, + "loss": 1.5701, + "step": 46280 + }, + { + "epoch": 9.751935534839626, + "grad_norm": 1.3104041814804077, + "learning_rate": 2.997567110297861e-07, + "loss": 1.6486, + "step": 46290 + }, + { + "epoch": 9.754042239426976, + "grad_norm": 1.277556300163269, + "learning_rate": 2.9465706945230874e-07, + "loss": 1.5088, + "step": 46300 + }, + { + "epoch": 9.756148944014326, + "grad_norm": 1.2666455507278442, + "learning_rate": 2.896011159281176e-07, + "loss": 1.5295, + "step": 46310 + }, + { + "epoch": 9.758255648601676, + "grad_norm": 1.277604103088379, + "learning_rate": 2.8458885267260705e-07, + "loss": 1.5221, + "step": 46320 + }, + { + "epoch": 9.760362353189024, + "grad_norm": 1.2649208307266235, + "learning_rate": 2.7962028188198706e-07, + "loss": 1.555, + "step": 46330 + }, + { + "epoch": 9.762469057776373, + "grad_norm": 1.2684274911880493, + "learning_rate": 2.746954057333606e-07, + "loss": 1.5789, + "step": 46340 + }, + { + "epoch": 9.764575762363723, + "grad_norm": 1.2506211996078491, + "learning_rate": 2.6981422638466814e-07, + "loss": 1.5723, + "step": 46350 + }, + { + "epoch": 9.766682466951071, + "grad_norm": 1.1741187572479248, + "learning_rate": 2.649767459746988e-07, + "loss": 1.5262, + "step": 46360 + }, + { + "epoch": 9.76878917153842, + "grad_norm": 1.1646640300750732, + "learning_rate": 2.601829666231015e-07, + "loss": 1.4826, + "step": 46370 + }, + { + "epoch": 9.77089587612577, + "grad_norm": 1.2875044345855713, + "learning_rate": 2.554328904303738e-07, + "loss": 1.574, + "step": 46380 + }, + { + "epoch": 9.77300258071312, + "grad_norm": 1.1216455698013306, + "learning_rate": 2.5072651947786184e-07, + "loss": 1.534, + "step": 46390 + }, + { + "epoch": 9.775109285300468, + "grad_norm": 1.1633546352386475, + "learning_rate": 2.460638558277606e-07, + "loss": 1.5665, + "step": 46400 + }, + { + "epoch": 9.777215989887818, + "grad_norm": 1.1941152811050415, + "learning_rate": 2.414449015231357e-07, + "loss": 1.5668, + "step": 46410 + }, + { + "epoch": 9.779322694475168, + "grad_norm": 1.2184324264526367, + "learning_rate": 2.3686965858786824e-07, + "loss": 1.5645, + "step": 46420 + }, + { + "epoch": 9.781429399062516, + "grad_norm": 1.2699726819992065, + "learning_rate": 2.3233812902669905e-07, + "loss": 1.5608, + "step": 46430 + }, + { + "epoch": 9.783536103649865, + "grad_norm": 1.1520774364471436, + "learning_rate": 2.2785031482521758e-07, + "loss": 1.5574, + "step": 46440 + }, + { + "epoch": 9.785642808237215, + "grad_norm": 1.4768619537353516, + "learning_rate": 2.2340621794985084e-07, + "loss": 1.5991, + "step": 46450 + }, + { + "epoch": 9.787749512824565, + "grad_norm": 1.1728389263153076, + "learning_rate": 2.1900584034788561e-07, + "loss": 1.505, + "step": 46460 + }, + { + "epoch": 9.789856217411913, + "grad_norm": 1.2773675918579102, + "learning_rate": 2.1464918394743516e-07, + "loss": 1.5297, + "step": 46470 + }, + { + "epoch": 9.791962921999263, + "grad_norm": 1.2537978887557983, + "learning_rate": 2.1033625065747242e-07, + "loss": 1.5844, + "step": 46480 + }, + { + "epoch": 9.794069626586612, + "grad_norm": 1.206842303276062, + "learning_rate": 2.0606704236779683e-07, + "loss": 1.5266, + "step": 46490 + }, + { + "epoch": 9.79617633117396, + "grad_norm": 1.3502146005630493, + "learning_rate": 2.0184156094905648e-07, + "loss": 1.5746, + "step": 46500 + }, + { + "epoch": 9.79828303576131, + "grad_norm": 1.3354579210281372, + "learning_rate": 1.976598082527259e-07, + "loss": 1.5081, + "step": 46510 + }, + { + "epoch": 9.80038974034866, + "grad_norm": 1.1795066595077515, + "learning_rate": 1.9352178611115046e-07, + "loss": 1.5523, + "step": 46520 + }, + { + "epoch": 9.80249644493601, + "grad_norm": 1.2156345844268799, + "learning_rate": 1.894274963374798e-07, + "loss": 1.491, + "step": 46530 + }, + { + "epoch": 9.804603149523357, + "grad_norm": 1.2723069190979004, + "learning_rate": 1.853769407257122e-07, + "loss": 1.5152, + "step": 46540 + }, + { + "epoch": 9.806709854110707, + "grad_norm": 1.303544521331787, + "learning_rate": 1.813701210506946e-07, + "loss": 1.5611, + "step": 46550 + }, + { + "epoch": 9.808816558698057, + "grad_norm": 1.1918131113052368, + "learning_rate": 1.7740703906810042e-07, + "loss": 1.544, + "step": 46560 + }, + { + "epoch": 9.810923263285407, + "grad_norm": 1.2438985109329224, + "learning_rate": 1.7348769651445164e-07, + "loss": 1.554, + "step": 46570 + }, + { + "epoch": 9.813029967872755, + "grad_norm": 1.3150713443756104, + "learning_rate": 1.6961209510707454e-07, + "loss": 1.4749, + "step": 46580 + }, + { + "epoch": 9.815136672460104, + "grad_norm": 1.1011872291564941, + "learning_rate": 1.657802365441441e-07, + "loss": 1.568, + "step": 46590 + }, + { + "epoch": 9.817243377047454, + "grad_norm": 1.401932716369629, + "learning_rate": 1.6199212250469498e-07, + "loss": 1.5435, + "step": 46600 + }, + { + "epoch": 9.819350081634802, + "grad_norm": 1.2226777076721191, + "learning_rate": 1.58247754648555e-07, + "loss": 1.536, + "step": 46610 + }, + { + "epoch": 9.821456786222152, + "grad_norm": 1.3291361331939697, + "learning_rate": 1.545471346164007e-07, + "loss": 1.537, + "step": 46620 + }, + { + "epoch": 9.823563490809502, + "grad_norm": 1.2543821334838867, + "learning_rate": 1.5089026402973493e-07, + "loss": 1.5187, + "step": 46630 + }, + { + "epoch": 9.825670195396851, + "grad_norm": 1.2340812683105469, + "learning_rate": 1.4727714449090935e-07, + "loss": 1.5296, + "step": 46640 + }, + { + "epoch": 9.8277768999842, + "grad_norm": 1.2020738124847412, + "learning_rate": 1.4370777758307974e-07, + "loss": 1.5796, + "step": 46650 + }, + { + "epoch": 9.829883604571549, + "grad_norm": 1.1650514602661133, + "learning_rate": 1.401821648702506e-07, + "loss": 1.5099, + "step": 46660 + }, + { + "epoch": 9.831990309158899, + "grad_norm": 1.2301157712936401, + "learning_rate": 1.3670030789723066e-07, + "loss": 1.5486, + "step": 46670 + }, + { + "epoch": 9.834097013746247, + "grad_norm": 1.166978359222412, + "learning_rate": 1.3326220818968838e-07, + "loss": 1.5177, + "step": 46680 + }, + { + "epoch": 9.836203718333596, + "grad_norm": 1.241199016571045, + "learning_rate": 1.298678672540854e-07, + "loss": 1.6009, + "step": 46690 + }, + { + "epoch": 9.838310422920946, + "grad_norm": 1.2882767915725708, + "learning_rate": 1.2651728657773198e-07, + "loss": 1.579, + "step": 46700 + }, + { + "epoch": 9.840417127508296, + "grad_norm": 1.233310580253601, + "learning_rate": 1.2321046762876487e-07, + "loss": 1.6353, + "step": 46710 + }, + { + "epoch": 9.842523832095644, + "grad_norm": 1.211338758468628, + "learning_rate": 1.1994741185612502e-07, + "loss": 1.5947, + "step": 46720 + }, + { + "epoch": 9.844630536682994, + "grad_norm": 1.278219223022461, + "learning_rate": 1.1672812068960204e-07, + "loss": 1.5459, + "step": 46730 + }, + { + "epoch": 9.846737241270343, + "grad_norm": 1.3067710399627686, + "learning_rate": 1.1355259553978981e-07, + "loss": 1.5399, + "step": 46740 + }, + { + "epoch": 9.848843945857691, + "grad_norm": 1.2565813064575195, + "learning_rate": 1.1042083779811973e-07, + "loss": 1.5436, + "step": 46750 + }, + { + "epoch": 9.850950650445041, + "grad_norm": 1.2496920824050903, + "learning_rate": 1.0733284883682749e-07, + "loss": 1.5691, + "step": 46760 + }, + { + "epoch": 9.85305735503239, + "grad_norm": 1.1770131587982178, + "learning_rate": 1.0428863000899735e-07, + "loss": 1.6555, + "step": 46770 + }, + { + "epoch": 9.85516405961974, + "grad_norm": 1.2371290922164917, + "learning_rate": 1.0128818264851791e-07, + "loss": 1.5574, + "step": 46780 + }, + { + "epoch": 9.857270764207088, + "grad_norm": 1.1748226881027222, + "learning_rate": 9.833150807009306e-08, + "loss": 1.5877, + "step": 46790 + }, + { + "epoch": 9.859377468794438, + "grad_norm": 1.3029236793518066, + "learning_rate": 9.541860756925314e-08, + "loss": 1.5687, + "step": 46800 + }, + { + "epoch": 9.861484173381788, + "grad_norm": 1.1565899848937988, + "learning_rate": 9.254948242235495e-08, + "loss": 1.5244, + "step": 46810 + }, + { + "epoch": 9.863590877969138, + "grad_norm": 1.278841495513916, + "learning_rate": 8.972413388657064e-08, + "loss": 1.5342, + "step": 46820 + }, + { + "epoch": 9.865697582556486, + "grad_norm": 1.2975276708602905, + "learning_rate": 8.694256319987659e-08, + "loss": 1.5355, + "step": 46830 + }, + { + "epoch": 9.867804287143835, + "grad_norm": 1.1724909543991089, + "learning_rate": 8.420477158107565e-08, + "loss": 1.5269, + "step": 46840 + }, + { + "epoch": 9.869910991731185, + "grad_norm": 1.3155508041381836, + "learning_rate": 8.151076022980819e-08, + "loss": 1.5505, + "step": 46850 + }, + { + "epoch": 9.872017696318533, + "grad_norm": 1.251495599746704, + "learning_rate": 7.886053032649665e-08, + "loss": 1.5621, + "step": 46860 + }, + { + "epoch": 9.874124400905883, + "grad_norm": 1.429298996925354, + "learning_rate": 7.62540830324121e-08, + "loss": 1.5938, + "step": 46870 + }, + { + "epoch": 9.876231105493233, + "grad_norm": 1.2667317390441895, + "learning_rate": 7.369141948960767e-08, + "loss": 1.544, + "step": 46880 + }, + { + "epoch": 9.87833781008058, + "grad_norm": 1.188432216644287, + "learning_rate": 7.117254082098512e-08, + "loss": 1.5984, + "step": 46890 + }, + { + "epoch": 9.88044451466793, + "grad_norm": 1.162582516670227, + "learning_rate": 6.869744813023937e-08, + "loss": 1.5626, + "step": 46900 + }, + { + "epoch": 9.88255121925528, + "grad_norm": 1.2468620538711548, + "learning_rate": 6.626614250188068e-08, + "loss": 1.6147, + "step": 46910 + }, + { + "epoch": 9.88465792384263, + "grad_norm": 1.2421079874038696, + "learning_rate": 6.387862500125685e-08, + "loss": 1.5524, + "step": 46920 + }, + { + "epoch": 9.886764628429978, + "grad_norm": 1.3621149063110352, + "learning_rate": 6.153489667448664e-08, + "loss": 1.5235, + "step": 46930 + }, + { + "epoch": 9.888871333017327, + "grad_norm": 1.3506252765655518, + "learning_rate": 5.9234958548537445e-08, + "loss": 1.5795, + "step": 46940 + }, + { + "epoch": 9.890978037604677, + "grad_norm": 1.1938551664352417, + "learning_rate": 5.697881163118091e-08, + "loss": 1.513, + "step": 46950 + }, + { + "epoch": 9.893084742192027, + "grad_norm": 1.2343593835830688, + "learning_rate": 5.476645691098181e-08, + "loss": 1.5676, + "step": 46960 + }, + { + "epoch": 9.895191446779375, + "grad_norm": 1.3996899127960205, + "learning_rate": 5.25978953573536e-08, + "loss": 1.5635, + "step": 46970 + }, + { + "epoch": 9.897298151366725, + "grad_norm": 1.1408532857894897, + "learning_rate": 5.047312792046954e-08, + "loss": 1.5245, + "step": 46980 + }, + { + "epoch": 9.899404855954074, + "grad_norm": 1.3579094409942627, + "learning_rate": 4.839215553137377e-08, + "loss": 1.503, + "step": 46990 + }, + { + "epoch": 9.901511560541422, + "grad_norm": 1.0767526626586914, + "learning_rate": 4.6354979101870254e-08, + "loss": 1.5036, + "step": 47000 + }, + { + "epoch": 9.903618265128772, + "grad_norm": 1.2614816427230835, + "learning_rate": 4.4361599524589406e-08, + "loss": 1.6215, + "step": 47010 + }, + { + "epoch": 9.905724969716122, + "grad_norm": 1.347352147102356, + "learning_rate": 4.241201767298808e-08, + "loss": 1.5626, + "step": 47020 + }, + { + "epoch": 9.907831674303472, + "grad_norm": 1.3003053665161133, + "learning_rate": 4.0506234401305186e-08, + "loss": 1.6067, + "step": 47030 + }, + { + "epoch": 9.90993837889082, + "grad_norm": 1.2332658767700195, + "learning_rate": 3.8644250544594975e-08, + "loss": 1.5574, + "step": 47040 + }, + { + "epoch": 9.91204508347817, + "grad_norm": 1.1698170900344849, + "learning_rate": 3.682606691874924e-08, + "loss": 1.5384, + "step": 47050 + }, + { + "epoch": 9.914151788065519, + "grad_norm": 1.2463845014572144, + "learning_rate": 3.505168432043071e-08, + "loss": 1.5131, + "step": 47060 + }, + { + "epoch": 9.916258492652867, + "grad_norm": 1.2173086404800415, + "learning_rate": 3.332110352712858e-08, + "loss": 1.5339, + "step": 47070 + }, + { + "epoch": 9.918365197240217, + "grad_norm": 1.2471425533294678, + "learning_rate": 3.1634325297114076e-08, + "loss": 1.589, + "step": 47080 + }, + { + "epoch": 9.920471901827566, + "grad_norm": 1.269729733467102, + "learning_rate": 2.999135036951817e-08, + "loss": 1.5994, + "step": 47090 + }, + { + "epoch": 9.922578606414916, + "grad_norm": 1.228976845741272, + "learning_rate": 2.839217946422057e-08, + "loss": 1.5785, + "step": 47100 + }, + { + "epoch": 9.924685311002264, + "grad_norm": 1.2782270908355713, + "learning_rate": 2.6836813281938543e-08, + "loss": 1.4929, + "step": 47110 + }, + { + "epoch": 9.926792015589614, + "grad_norm": 1.2272552251815796, + "learning_rate": 2.53252525042047e-08, + "loss": 1.5998, + "step": 47120 + }, + { + "epoch": 9.928898720176964, + "grad_norm": 1.2180017232894897, + "learning_rate": 2.385749779332258e-08, + "loss": 1.5487, + "step": 47130 + }, + { + "epoch": 9.931005424764312, + "grad_norm": 1.21531081199646, + "learning_rate": 2.243354979242218e-08, + "loss": 1.5415, + "step": 47140 + }, + { + "epoch": 9.933112129351661, + "grad_norm": 1.1451653242111206, + "learning_rate": 2.1053409125448843e-08, + "loss": 1.5198, + "step": 47150 + }, + { + "epoch": 9.935218833939011, + "grad_norm": 1.2328366041183472, + "learning_rate": 1.971707639712994e-08, + "loss": 1.5671, + "step": 47160 + }, + { + "epoch": 9.93732553852636, + "grad_norm": 1.4899163246154785, + "learning_rate": 1.8424552193019308e-08, + "loss": 1.604, + "step": 47170 + }, + { + "epoch": 9.939432243113709, + "grad_norm": 1.3283289670944214, + "learning_rate": 1.7175837079452804e-08, + "loss": 1.5649, + "step": 47180 + }, + { + "epoch": 9.941538947701059, + "grad_norm": 1.2755736112594604, + "learning_rate": 1.5970931603592752e-08, + "loss": 1.5319, + "step": 47190 + }, + { + "epoch": 9.943645652288408, + "grad_norm": 1.3269785642623901, + "learning_rate": 1.4809836293394608e-08, + "loss": 1.5431, + "step": 47200 + }, + { + "epoch": 9.945752356875758, + "grad_norm": 1.18012535572052, + "learning_rate": 1.3692551657595865e-08, + "loss": 1.5568, + "step": 47210 + }, + { + "epoch": 9.947859061463106, + "grad_norm": 1.5265027284622192, + "learning_rate": 1.2619078185793776e-08, + "loss": 1.5734, + "step": 47220 + }, + { + "epoch": 9.949965766050456, + "grad_norm": 1.2340668439865112, + "learning_rate": 1.1589416348323224e-08, + "loss": 1.6149, + "step": 47230 + }, + { + "epoch": 9.952072470637805, + "grad_norm": 1.254093885421753, + "learning_rate": 1.0603566596367742e-08, + "loss": 1.5872, + "step": 47240 + }, + { + "epoch": 9.954179175225153, + "grad_norm": 1.224780797958374, + "learning_rate": 9.661529361892907e-09, + "loss": 1.5545, + "step": 47250 + }, + { + "epoch": 9.956285879812503, + "grad_norm": 1.172776699066162, + "learning_rate": 8.76330505769074e-09, + "loss": 1.5728, + "step": 47260 + }, + { + "epoch": 9.958392584399853, + "grad_norm": 1.1891138553619385, + "learning_rate": 7.908894077301998e-09, + "loss": 1.522, + "step": 47270 + }, + { + "epoch": 9.9604992889872, + "grad_norm": 1.2386298179626465, + "learning_rate": 7.098296795138293e-09, + "loss": 1.5344, + "step": 47280 + }, + { + "epoch": 9.96260599357455, + "grad_norm": 1.303057074546814, + "learning_rate": 6.331513566371072e-09, + "loss": 1.5502, + "step": 47290 + }, + { + "epoch": 9.9647126981619, + "grad_norm": 1.3631552457809448, + "learning_rate": 5.608544726976029e-09, + "loss": 1.5891, + "step": 47300 + }, + { + "epoch": 9.96681940274925, + "grad_norm": 1.2292252779006958, + "learning_rate": 4.929390593744199e-09, + "loss": 1.6031, + "step": 47310 + }, + { + "epoch": 9.968926107336598, + "grad_norm": 1.2337449789047241, + "learning_rate": 4.2940514642597626e-09, + "loss": 1.5657, + "step": 47320 + }, + { + "epoch": 9.971032811923948, + "grad_norm": 1.3655712604522705, + "learning_rate": 3.7025276169000424e-09, + "loss": 1.5833, + "step": 47330 + }, + { + "epoch": 9.973139516511297, + "grad_norm": 1.205053687095642, + "learning_rate": 3.154819310868806e-09, + "loss": 1.5473, + "step": 47340 + }, + { + "epoch": 9.975246221098647, + "grad_norm": 1.3029731512069702, + "learning_rate": 2.650926786151864e-09, + "loss": 1.538, + "step": 47350 + }, + { + "epoch": 9.977352925685995, + "grad_norm": 1.1505680084228516, + "learning_rate": 2.19085026355037e-09, + "loss": 1.5799, + "step": 47360 + }, + { + "epoch": 9.979459630273345, + "grad_norm": 1.2553126811981201, + "learning_rate": 1.7745899446364178e-09, + "loss": 1.5353, + "step": 47370 + }, + { + "epoch": 9.981566334860695, + "grad_norm": 1.342659831047058, + "learning_rate": 1.4021460118085472e-09, + "loss": 1.4656, + "step": 47380 + }, + { + "epoch": 9.983673039448043, + "grad_norm": 1.3252551555633545, + "learning_rate": 1.073518628269543e-09, + "loss": 1.5514, + "step": 47390 + }, + { + "epoch": 9.985779744035392, + "grad_norm": 1.1547834873199463, + "learning_rate": 7.887079380153317e-10, + "loss": 1.5357, + "step": 47400 + }, + { + "epoch": 9.987886448622742, + "grad_norm": 1.2872856855392456, + "learning_rate": 5.477140658349811e-10, + "loss": 1.4944, + "step": 47410 + }, + { + "epoch": 9.989993153210092, + "grad_norm": 1.1849569082260132, + "learning_rate": 3.5053711733290527e-10, + "loss": 1.6425, + "step": 47420 + }, + { + "epoch": 9.99209985779744, + "grad_norm": 1.1810338497161865, + "learning_rate": 1.9717717889555787e-10, + "loss": 1.5292, + "step": 47430 + }, + { + "epoch": 9.99420656238479, + "grad_norm": 1.142162799835205, + "learning_rate": 8.763431773584075e-11, + "loss": 1.5462, + "step": 47440 + }, + { + "epoch": 9.99631326697214, + "grad_norm": 1.3802740573883057, + "learning_rate": 2.1908581837593033e-11, + "loss": 1.5825, + "step": 47450 + }, + { + "epoch": 9.998419971559487, + "grad_norm": 1.2215389013290405, + "learning_rate": 0.0, + "loss": 1.6347, + "step": 47460 + }, + { + "epoch": 9.998419971559487, + "step": 47460, + "total_flos": 1.2118866375632486e+18, + "train_loss": 1.6495774427446894, + "train_runtime": 50831.8889, + "train_samples_per_second": 3.735, + "train_steps_per_second": 0.934 + } + ], + "logging_steps": 10, + "max_steps": 47460, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 1.2118866375632486e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}