diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.963998270928964, + "eval_steps": 500, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016467343200016467, + "grad_norm": 30.985063552856445, + "learning_rate": 3.2906764168190127e-07, + "loss": 1.3972, + "step": 10 + }, + { + "epoch": 0.0032934686400032933, + "grad_norm": 31.839323043823242, + "learning_rate": 6.946983546617917e-07, + "loss": 1.3292, + "step": 20 + }, + { + "epoch": 0.0049402029600049404, + "grad_norm": 34.46781921386719, + "learning_rate": 1.060329067641682e-06, + "loss": 1.2296, + "step": 30 + }, + { + "epoch": 0.006586937280006587, + "grad_norm": 34.32807540893555, + "learning_rate": 1.4259597806215722e-06, + "loss": 1.2755, + "step": 40 + }, + { + "epoch": 0.008233671600008234, + "grad_norm": 24.143030166625977, + "learning_rate": 1.7915904936014627e-06, + "loss": 1.2333, + "step": 50 + }, + { + "epoch": 0.009880405920009881, + "grad_norm": 20.771278381347656, + "learning_rate": 2.157221206581353e-06, + "loss": 1.206, + "step": 60 + }, + { + "epoch": 0.011527140240011528, + "grad_norm": 25.031925201416016, + "learning_rate": 2.5228519195612434e-06, + "loss": 1.2008, + "step": 70 + }, + { + "epoch": 0.013173874560013173, + "grad_norm": 24.008222579956055, + "learning_rate": 2.8884826325411334e-06, + "loss": 1.1446, + "step": 80 + }, + { + "epoch": 0.01482060888001482, + "grad_norm": 23.959667205810547, + "learning_rate": 3.254113345521024e-06, + "loss": 1.2072, + "step": 90 + }, + { + "epoch": 0.016467343200016468, + "grad_norm": 16.00537109375, + "learning_rate": 3.6197440585009143e-06, + "loss": 1.1743, + "step": 100 + }, + { + "epoch": 0.018114077520018115, + "grad_norm": 13.254168510437012, + "learning_rate": 3.985374771480805e-06, + "loss": 1.1374, + "step": 110 + }, + { + "epoch": 0.019760811840019762, + "grad_norm": 14.698610305786133, + "learning_rate": 4.351005484460696e-06, + "loss": 1.1192, + "step": 120 + }, + { + "epoch": 0.02140754616002141, + "grad_norm": 12.052921295166016, + "learning_rate": 4.716636197440586e-06, + "loss": 1.089, + "step": 130 + }, + { + "epoch": 0.023054280480023056, + "grad_norm": 9.61983871459961, + "learning_rate": 5.082266910420476e-06, + "loss": 1.0972, + "step": 140 + }, + { + "epoch": 0.0247010148000247, + "grad_norm": 10.221634864807129, + "learning_rate": 5.447897623400366e-06, + "loss": 1.1082, + "step": 150 + }, + { + "epoch": 0.026347749120026347, + "grad_norm": 14.476886749267578, + "learning_rate": 5.813528336380257e-06, + "loss": 1.1348, + "step": 160 + }, + { + "epoch": 0.027994483440027994, + "grad_norm": 23.15399169921875, + "learning_rate": 6.1791590493601475e-06, + "loss": 1.0889, + "step": 170 + }, + { + "epoch": 0.02964121776002964, + "grad_norm": 16.883651733398438, + "learning_rate": 6.544789762340037e-06, + "loss": 1.1055, + "step": 180 + }, + { + "epoch": 0.03128795208003129, + "grad_norm": 12.747838020324707, + "learning_rate": 6.9104204753199275e-06, + "loss": 1.0862, + "step": 190 + }, + { + "epoch": 0.032934686400032935, + "grad_norm": 11.923359870910645, + "learning_rate": 7.2760511882998175e-06, + "loss": 1.0944, + "step": 200 + }, + { + "epoch": 0.03458142072003458, + "grad_norm": 10.496978759765625, + "learning_rate": 7.641681901279708e-06, + "loss": 1.0962, + "step": 210 + }, + { + "epoch": 0.03622815504003623, + "grad_norm": 8.666916847229004, + "learning_rate": 8.007312614259598e-06, + "loss": 1.0826, + "step": 220 + }, + { + "epoch": 0.037874889360037876, + "grad_norm": 7.300290584564209, + "learning_rate": 8.372943327239488e-06, + "loss": 1.0932, + "step": 230 + }, + { + "epoch": 0.039521623680039523, + "grad_norm": 10.057311058044434, + "learning_rate": 8.73857404021938e-06, + "loss": 1.0466, + "step": 240 + }, + { + "epoch": 0.04116835800004117, + "grad_norm": 11.838115692138672, + "learning_rate": 9.10420475319927e-06, + "loss": 1.0881, + "step": 250 + }, + { + "epoch": 0.04281509232004282, + "grad_norm": 6.902042388916016, + "learning_rate": 9.469835466179161e-06, + "loss": 1.0618, + "step": 260 + }, + { + "epoch": 0.044461826640044465, + "grad_norm": 5.424289703369141, + "learning_rate": 9.83546617915905e-06, + "loss": 1.0146, + "step": 270 + }, + { + "epoch": 0.04610856096004611, + "grad_norm": 5.259349346160889, + "learning_rate": 1.020109689213894e-05, + "loss": 0.9994, + "step": 280 + }, + { + "epoch": 0.04775529528004775, + "grad_norm": 6.474951267242432, + "learning_rate": 1.0566727605118832e-05, + "loss": 1.0003, + "step": 290 + }, + { + "epoch": 0.0494020296000494, + "grad_norm": 7.7709455490112305, + "learning_rate": 1.0932358318098721e-05, + "loss": 1.0263, + "step": 300 + }, + { + "epoch": 0.051048763920051046, + "grad_norm": 8.521011352539062, + "learning_rate": 1.129798903107861e-05, + "loss": 1.0383, + "step": 310 + }, + { + "epoch": 0.05269549824005269, + "grad_norm": 6.253118515014648, + "learning_rate": 1.1663619744058501e-05, + "loss": 1.0609, + "step": 320 + }, + { + "epoch": 0.05434223256005434, + "grad_norm": 6.294135093688965, + "learning_rate": 1.2029250457038392e-05, + "loss": 1.0212, + "step": 330 + }, + { + "epoch": 0.05598896688005599, + "grad_norm": 8.303707122802734, + "learning_rate": 1.2394881170018283e-05, + "loss": 0.9964, + "step": 340 + }, + { + "epoch": 0.057635701200057635, + "grad_norm": 4.852534294128418, + "learning_rate": 1.2760511882998172e-05, + "loss": 0.9875, + "step": 350 + }, + { + "epoch": 0.05928243552005928, + "grad_norm": 6.304441928863525, + "learning_rate": 1.3126142595978065e-05, + "loss": 1.0425, + "step": 360 + }, + { + "epoch": 0.06092916984006093, + "grad_norm": 5.18842887878418, + "learning_rate": 1.3491773308957954e-05, + "loss": 1.0101, + "step": 370 + }, + { + "epoch": 0.06257590416006258, + "grad_norm": 5.784980773925781, + "learning_rate": 1.3857404021937843e-05, + "loss": 1.0046, + "step": 380 + }, + { + "epoch": 0.06422263848006422, + "grad_norm": 8.19119930267334, + "learning_rate": 1.4223034734917734e-05, + "loss": 0.9889, + "step": 390 + }, + { + "epoch": 0.06586937280006587, + "grad_norm": 6.851486682891846, + "learning_rate": 1.4588665447897625e-05, + "loss": 1.0111, + "step": 400 + }, + { + "epoch": 0.06751610712006752, + "grad_norm": 4.777026176452637, + "learning_rate": 1.4954296160877516e-05, + "loss": 0.9922, + "step": 410 + }, + { + "epoch": 0.06916284144006916, + "grad_norm": 5.17830753326416, + "learning_rate": 1.5319926873857403e-05, + "loss": 0.9556, + "step": 420 + }, + { + "epoch": 0.07080957576007081, + "grad_norm": 5.6952080726623535, + "learning_rate": 1.5685557586837297e-05, + "loss": 0.9696, + "step": 430 + }, + { + "epoch": 0.07245631008007246, + "grad_norm": 4.926089286804199, + "learning_rate": 1.6051188299817185e-05, + "loss": 0.9668, + "step": 440 + }, + { + "epoch": 0.0741030444000741, + "grad_norm": 4.33101749420166, + "learning_rate": 1.6416819012797076e-05, + "loss": 0.9884, + "step": 450 + }, + { + "epoch": 0.07574977872007575, + "grad_norm": 3.4968671798706055, + "learning_rate": 1.6782449725776967e-05, + "loss": 0.9852, + "step": 460 + }, + { + "epoch": 0.0773965130400774, + "grad_norm": 3.997958183288574, + "learning_rate": 1.7148080438756858e-05, + "loss": 0.9675, + "step": 470 + }, + { + "epoch": 0.07904324736007905, + "grad_norm": 4.513462066650391, + "learning_rate": 1.751371115173675e-05, + "loss": 0.9886, + "step": 480 + }, + { + "epoch": 0.0806899816800807, + "grad_norm": 4.011235237121582, + "learning_rate": 1.7879341864716636e-05, + "loss": 0.9802, + "step": 490 + }, + { + "epoch": 0.08233671600008234, + "grad_norm": 4.776538848876953, + "learning_rate": 1.8244972577696527e-05, + "loss": 0.928, + "step": 500 + }, + { + "epoch": 0.08398345032008399, + "grad_norm": 4.51215934753418, + "learning_rate": 1.8610603290676418e-05, + "loss": 0.9286, + "step": 510 + }, + { + "epoch": 0.08563018464008564, + "grad_norm": 3.450950860977173, + "learning_rate": 1.897623400365631e-05, + "loss": 0.9485, + "step": 520 + }, + { + "epoch": 0.08727691896008728, + "grad_norm": 3.313753604888916, + "learning_rate": 1.93418647166362e-05, + "loss": 0.9803, + "step": 530 + }, + { + "epoch": 0.08892365328008893, + "grad_norm": 4.609573841094971, + "learning_rate": 1.970749542961609e-05, + "loss": 0.9782, + "step": 540 + }, + { + "epoch": 0.09057038760009058, + "grad_norm": 3.477215528488159, + "learning_rate": 1.9999999367939968e-05, + "loss": 0.9189, + "step": 550 + }, + { + "epoch": 0.09221712192009222, + "grad_norm": 5.594038963317871, + "learning_rate": 1.9999977245847137e-05, + "loss": 0.9146, + "step": 560 + }, + { + "epoch": 0.09386385624009386, + "grad_norm": 2.6363534927368164, + "learning_rate": 1.9999923520832466e-05, + "loss": 0.895, + "step": 570 + }, + { + "epoch": 0.0955105905600955, + "grad_norm": 3.1434268951416016, + "learning_rate": 1.999983819306574e-05, + "loss": 0.9041, + "step": 580 + }, + { + "epoch": 0.09715732488009715, + "grad_norm": 3.609675168991089, + "learning_rate": 1.999972126281662e-05, + "loss": 0.9154, + "step": 590 + }, + { + "epoch": 0.0988040592000988, + "grad_norm": 4.128945350646973, + "learning_rate": 1.9999572730454638e-05, + "loss": 0.9061, + "step": 600 + }, + { + "epoch": 0.10045079352010045, + "grad_norm": 3.6238315105438232, + "learning_rate": 1.999939259644921e-05, + "loss": 0.9326, + "step": 610 + }, + { + "epoch": 0.10209752784010209, + "grad_norm": 3.4944262504577637, + "learning_rate": 1.9999180861369605e-05, + "loss": 0.894, + "step": 620 + }, + { + "epoch": 0.10374426216010374, + "grad_norm": 2.8273308277130127, + "learning_rate": 1.999893752588497e-05, + "loss": 0.9058, + "step": 630 + }, + { + "epoch": 0.10539099648010539, + "grad_norm": 3.1057746410369873, + "learning_rate": 1.999866259076432e-05, + "loss": 0.8943, + "step": 640 + }, + { + "epoch": 0.10703773080010703, + "grad_norm": 4.640377998352051, + "learning_rate": 1.9998356056876532e-05, + "loss": 0.9156, + "step": 650 + }, + { + "epoch": 0.10868446512010868, + "grad_norm": 4.991531848907471, + "learning_rate": 1.9998017925190345e-05, + "loss": 0.9163, + "step": 660 + }, + { + "epoch": 0.11033119944011033, + "grad_norm": 7.1393327713012695, + "learning_rate": 1.9997648196774354e-05, + "loss": 0.8828, + "step": 670 + }, + { + "epoch": 0.11197793376011198, + "grad_norm": 3.893332004547119, + "learning_rate": 1.9997246872797018e-05, + "loss": 0.9226, + "step": 680 + }, + { + "epoch": 0.11362466808011362, + "grad_norm": 3.794612169265747, + "learning_rate": 1.999681395452663e-05, + "loss": 0.8738, + "step": 690 + }, + { + "epoch": 0.11527140240011527, + "grad_norm": 3.3877007961273193, + "learning_rate": 1.9996349443331354e-05, + "loss": 0.8742, + "step": 700 + }, + { + "epoch": 0.11691813672011692, + "grad_norm": 5.569369316101074, + "learning_rate": 1.9995853340679173e-05, + "loss": 0.8674, + "step": 710 + }, + { + "epoch": 0.11856487104011856, + "grad_norm": 3.396794080734253, + "learning_rate": 1.999532564813793e-05, + "loss": 0.914, + "step": 720 + }, + { + "epoch": 0.12021160536012021, + "grad_norm": 5.97723388671875, + "learning_rate": 1.9994766367375283e-05, + "loss": 0.8892, + "step": 730 + }, + { + "epoch": 0.12185833968012186, + "grad_norm": 3.1893229484558105, + "learning_rate": 1.999417550015873e-05, + "loss": 0.8554, + "step": 740 + }, + { + "epoch": 0.1235050740001235, + "grad_norm": 3.7248144149780273, + "learning_rate": 1.999355304835559e-05, + "loss": 0.857, + "step": 750 + }, + { + "epoch": 0.12515180832012515, + "grad_norm": 3.797175645828247, + "learning_rate": 1.9992899013932994e-05, + "loss": 0.8646, + "step": 760 + }, + { + "epoch": 0.1267985426401268, + "grad_norm": 2.7383298873901367, + "learning_rate": 1.999221339895789e-05, + "loss": 0.8749, + "step": 770 + }, + { + "epoch": 0.12844527696012845, + "grad_norm": 2.588918447494507, + "learning_rate": 1.9991496205597023e-05, + "loss": 0.8897, + "step": 780 + }, + { + "epoch": 0.1300920112801301, + "grad_norm": 2.533170223236084, + "learning_rate": 1.999074743611694e-05, + "loss": 0.8611, + "step": 790 + }, + { + "epoch": 0.13173874560013174, + "grad_norm": 2.219616174697876, + "learning_rate": 1.998996709288398e-05, + "loss": 0.8416, + "step": 800 + }, + { + "epoch": 0.1333854799201334, + "grad_norm": 2.3094987869262695, + "learning_rate": 1.9989155178364253e-05, + "loss": 0.8401, + "step": 810 + }, + { + "epoch": 0.13503221424013503, + "grad_norm": 1.8022428750991821, + "learning_rate": 1.998831169512366e-05, + "loss": 0.8049, + "step": 820 + }, + { + "epoch": 0.13667894856013668, + "grad_norm": 1.8802989721298218, + "learning_rate": 1.998743664582786e-05, + "loss": 0.829, + "step": 830 + }, + { + "epoch": 0.13832568288013833, + "grad_norm": 2.1697161197662354, + "learning_rate": 1.9986530033242263e-05, + "loss": 0.8503, + "step": 840 + }, + { + "epoch": 0.13997241720013998, + "grad_norm": 4.128193378448486, + "learning_rate": 1.9985591860232047e-05, + "loss": 0.8472, + "step": 850 + }, + { + "epoch": 0.14161915152014162, + "grad_norm": 3.771169900894165, + "learning_rate": 1.9984622129762116e-05, + "loss": 0.8543, + "step": 860 + }, + { + "epoch": 0.14326588584014327, + "grad_norm": 3.2667131423950195, + "learning_rate": 1.99836208448971e-05, + "loss": 0.8512, + "step": 870 + }, + { + "epoch": 0.14491262016014492, + "grad_norm": 2.6572048664093018, + "learning_rate": 1.9982588008801368e-05, + "loss": 0.8274, + "step": 880 + }, + { + "epoch": 0.14655935448014656, + "grad_norm": 2.6121675968170166, + "learning_rate": 1.998152362473899e-05, + "loss": 0.8373, + "step": 890 + }, + { + "epoch": 0.1482060888001482, + "grad_norm": 3.6851730346679688, + "learning_rate": 1.998042769607374e-05, + "loss": 0.8262, + "step": 900 + }, + { + "epoch": 0.14985282312014986, + "grad_norm": 3.212480306625366, + "learning_rate": 1.9979300226269077e-05, + "loss": 0.8379, + "step": 910 + }, + { + "epoch": 0.1514995574401515, + "grad_norm": 2.4245805740356445, + "learning_rate": 1.9978141218888143e-05, + "loss": 0.8513, + "step": 920 + }, + { + "epoch": 0.15314629176015315, + "grad_norm": 3.501965284347534, + "learning_rate": 1.997695067759375e-05, + "loss": 0.8045, + "step": 930 + }, + { + "epoch": 0.1547930260801548, + "grad_norm": 2.5098695755004883, + "learning_rate": 1.997572860614836e-05, + "loss": 0.8123, + "step": 940 + }, + { + "epoch": 0.15643976040015645, + "grad_norm": 2.632624387741089, + "learning_rate": 1.9974475008414095e-05, + "loss": 0.8261, + "step": 950 + }, + { + "epoch": 0.1580864947201581, + "grad_norm": 2.489697217941284, + "learning_rate": 1.997318988835269e-05, + "loss": 0.8348, + "step": 960 + }, + { + "epoch": 0.15973322904015974, + "grad_norm": 4.488799571990967, + "learning_rate": 1.9971873250025512e-05, + "loss": 0.8684, + "step": 970 + }, + { + "epoch": 0.1613799633601614, + "grad_norm": 2.742992877960205, + "learning_rate": 1.9970525097593537e-05, + "loss": 0.7831, + "step": 980 + }, + { + "epoch": 0.16302669768016304, + "grad_norm": 2.506899118423462, + "learning_rate": 1.996914543531732e-05, + "loss": 0.8285, + "step": 990 + }, + { + "epoch": 0.16467343200016468, + "grad_norm": 3.0873513221740723, + "learning_rate": 1.996773426755702e-05, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.16632016632016633, + "grad_norm": 1.7364745140075684, + "learning_rate": 1.9966291598772335e-05, + "loss": 0.8385, + "step": 1010 + }, + { + "epoch": 0.16796690064016798, + "grad_norm": 2.113279104232788, + "learning_rate": 1.9964817433522537e-05, + "loss": 0.8246, + "step": 1020 + }, + { + "epoch": 0.16961363496016962, + "grad_norm": 2.741102457046509, + "learning_rate": 1.9963311776466435e-05, + "loss": 0.8454, + "step": 1030 + }, + { + "epoch": 0.17126036928017127, + "grad_norm": 2.6016159057617188, + "learning_rate": 1.996177463236235e-05, + "loss": 0.8078, + "step": 1040 + }, + { + "epoch": 0.17290710360017292, + "grad_norm": 2.431082010269165, + "learning_rate": 1.9960206006068116e-05, + "loss": 0.8548, + "step": 1050 + }, + { + "epoch": 0.17455383792017456, + "grad_norm": 2.077765703201294, + "learning_rate": 1.9958605902541065e-05, + "loss": 0.8492, + "step": 1060 + }, + { + "epoch": 0.1762005722401762, + "grad_norm": 3.3199009895324707, + "learning_rate": 1.9956974326838004e-05, + "loss": 0.7955, + "step": 1070 + }, + { + "epoch": 0.17784730656017786, + "grad_norm": 2.1479249000549316, + "learning_rate": 1.9955311284115198e-05, + "loss": 0.7802, + "step": 1080 + }, + { + "epoch": 0.1794940408801795, + "grad_norm": 1.943623661994934, + "learning_rate": 1.9953616779628364e-05, + "loss": 0.8206, + "step": 1090 + }, + { + "epoch": 0.18114077520018115, + "grad_norm": 2.654792070388794, + "learning_rate": 1.995189081873264e-05, + "loss": 0.8148, + "step": 1100 + }, + { + "epoch": 0.1827875095201828, + "grad_norm": 1.9163497686386108, + "learning_rate": 1.9950133406882577e-05, + "loss": 0.7941, + "step": 1110 + }, + { + "epoch": 0.18443424384018445, + "grad_norm": 2.9743542671203613, + "learning_rate": 1.9948344549632124e-05, + "loss": 0.8019, + "step": 1120 + }, + { + "epoch": 0.1860809781601861, + "grad_norm": 2.794917106628418, + "learning_rate": 1.9946524252634612e-05, + "loss": 0.8506, + "step": 1130 + }, + { + "epoch": 0.18772771248018771, + "grad_norm": 2.309389352798462, + "learning_rate": 1.9944672521642715e-05, + "loss": 0.7851, + "step": 1140 + }, + { + "epoch": 0.18937444680018936, + "grad_norm": 1.9234291315078735, + "learning_rate": 1.9942789362508463e-05, + "loss": 0.7941, + "step": 1150 + }, + { + "epoch": 0.191021181120191, + "grad_norm": 2.477848768234253, + "learning_rate": 1.9940874781183203e-05, + "loss": 0.7892, + "step": 1160 + }, + { + "epoch": 0.19266791544019266, + "grad_norm": 2.7016117572784424, + "learning_rate": 1.993892878371758e-05, + "loss": 0.8266, + "step": 1170 + }, + { + "epoch": 0.1943146497601943, + "grad_norm": 1.9896224737167358, + "learning_rate": 1.9936951376261534e-05, + "loss": 0.8227, + "step": 1180 + }, + { + "epoch": 0.19596138408019595, + "grad_norm": 2.1344499588012695, + "learning_rate": 1.993494256506426e-05, + "loss": 0.8019, + "step": 1190 + }, + { + "epoch": 0.1976081184001976, + "grad_norm": 2.2229461669921875, + "learning_rate": 1.9932902356474208e-05, + "loss": 0.7962, + "step": 1200 + }, + { + "epoch": 0.19925485272019924, + "grad_norm": 2.2535150051116943, + "learning_rate": 1.993083075693904e-05, + "loss": 0.8405, + "step": 1210 + }, + { + "epoch": 0.2009015870402009, + "grad_norm": 2.778207778930664, + "learning_rate": 1.9928727773005644e-05, + "loss": 0.8082, + "step": 1220 + }, + { + "epoch": 0.20254832136020254, + "grad_norm": 3.0831124782562256, + "learning_rate": 1.9926593411320064e-05, + "loss": 0.796, + "step": 1230 + }, + { + "epoch": 0.20419505568020418, + "grad_norm": 2.5188772678375244, + "learning_rate": 1.9924427678627533e-05, + "loss": 0.8401, + "step": 1240 + }, + { + "epoch": 0.20584179000020583, + "grad_norm": 2.3578946590423584, + "learning_rate": 1.9922230581772405e-05, + "loss": 0.7995, + "step": 1250 + }, + { + "epoch": 0.20748852432020748, + "grad_norm": 2.7173349857330322, + "learning_rate": 1.992000212769817e-05, + "loss": 0.8271, + "step": 1260 + }, + { + "epoch": 0.20913525864020913, + "grad_norm": 2.0116770267486572, + "learning_rate": 1.9917742323447414e-05, + "loss": 0.7978, + "step": 1270 + }, + { + "epoch": 0.21078199296021077, + "grad_norm": 1.9019718170166016, + "learning_rate": 1.9915451176161788e-05, + "loss": 0.8141, + "step": 1280 + }, + { + "epoch": 0.21242872728021242, + "grad_norm": 2.0212275981903076, + "learning_rate": 1.9913128693082e-05, + "loss": 0.773, + "step": 1290 + }, + { + "epoch": 0.21407546160021407, + "grad_norm": 3.2075860500335693, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.7901, + "step": 1300 + }, + { + "epoch": 0.21572219592021571, + "grad_norm": 2.278242588043213, + "learning_rate": 1.9908389748997937e-05, + "loss": 0.7616, + "step": 1310 + }, + { + "epoch": 0.21736893024021736, + "grad_norm": 1.8260267972946167, + "learning_rate": 1.990597330297014e-05, + "loss": 0.7901, + "step": 1320 + }, + { + "epoch": 0.219015664560219, + "grad_norm": 2.169857978820801, + "learning_rate": 1.9903525551101105e-05, + "loss": 0.7491, + "step": 1330 + }, + { + "epoch": 0.22066239888022066, + "grad_norm": 1.7144447565078735, + "learning_rate": 1.9901046501126454e-05, + "loss": 0.7546, + "step": 1340 + }, + { + "epoch": 0.2223091332002223, + "grad_norm": 1.9034463167190552, + "learning_rate": 1.9898536160880736e-05, + "loss": 0.7679, + "step": 1350 + }, + { + "epoch": 0.22395586752022395, + "grad_norm": 1.7277398109436035, + "learning_rate": 1.989599453829737e-05, + "loss": 0.7254, + "step": 1360 + }, + { + "epoch": 0.2256026018402256, + "grad_norm": 1.7360395193099976, + "learning_rate": 1.989342164140865e-05, + "loss": 0.7861, + "step": 1370 + }, + { + "epoch": 0.22724933616022724, + "grad_norm": 1.7167317867279053, + "learning_rate": 1.98908174783457e-05, + "loss": 0.7765, + "step": 1380 + }, + { + "epoch": 0.2288960704802289, + "grad_norm": 2.2761967182159424, + "learning_rate": 1.988818205733845e-05, + "loss": 0.8211, + "step": 1390 + }, + { + "epoch": 0.23054280480023054, + "grad_norm": 2.0242958068847656, + "learning_rate": 1.9885515386715625e-05, + "loss": 0.7516, + "step": 1400 + }, + { + "epoch": 0.23218953912023219, + "grad_norm": 1.9908565282821655, + "learning_rate": 1.9882817474904697e-05, + "loss": 0.7685, + "step": 1410 + }, + { + "epoch": 0.23383627344023383, + "grad_norm": 2.352423667907715, + "learning_rate": 1.9880088330431883e-05, + "loss": 0.7235, + "step": 1420 + }, + { + "epoch": 0.23548300776023548, + "grad_norm": 1.5180648565292358, + "learning_rate": 1.9877327961922085e-05, + "loss": 0.7557, + "step": 1430 + }, + { + "epoch": 0.23712974208023713, + "grad_norm": 2.784830331802368, + "learning_rate": 1.9874536378098905e-05, + "loss": 0.7864, + "step": 1440 + }, + { + "epoch": 0.23877647640023877, + "grad_norm": 1.8382682800292969, + "learning_rate": 1.987171358778458e-05, + "loss": 0.7465, + "step": 1450 + }, + { + "epoch": 0.24042321072024042, + "grad_norm": 1.4438285827636719, + "learning_rate": 1.986885959989997e-05, + "loss": 0.7892, + "step": 1460 + }, + { + "epoch": 0.24206994504024207, + "grad_norm": 1.4113389253616333, + "learning_rate": 1.986597442346453e-05, + "loss": 0.7586, + "step": 1470 + }, + { + "epoch": 0.24371667936024372, + "grad_norm": 3.4214653968811035, + "learning_rate": 1.9863058067596287e-05, + "loss": 0.7564, + "step": 1480 + }, + { + "epoch": 0.24536341368024536, + "grad_norm": 1.6341257095336914, + "learning_rate": 1.9860110541511792e-05, + "loss": 0.7493, + "step": 1490 + }, + { + "epoch": 0.247010148000247, + "grad_norm": 4.065557956695557, + "learning_rate": 1.9857131854526117e-05, + "loss": 0.768, + "step": 1500 + }, + { + "epoch": 0.24865688232024866, + "grad_norm": 1.5984668731689453, + "learning_rate": 1.9854122016052803e-05, + "loss": 0.7631, + "step": 1510 + }, + { + "epoch": 0.2503036166402503, + "grad_norm": 1.571628212928772, + "learning_rate": 1.9851081035603836e-05, + "loss": 0.7575, + "step": 1520 + }, + { + "epoch": 0.2519503509602519, + "grad_norm": 2.2074406147003174, + "learning_rate": 1.9848008922789625e-05, + "loss": 0.7501, + "step": 1530 + }, + { + "epoch": 0.2535970852802536, + "grad_norm": 1.4130685329437256, + "learning_rate": 1.984490568731897e-05, + "loss": 0.7653, + "step": 1540 + }, + { + "epoch": 0.2552438196002552, + "grad_norm": 1.4988036155700684, + "learning_rate": 1.9841771338999022e-05, + "loss": 0.7544, + "step": 1550 + }, + { + "epoch": 0.2568905539202569, + "grad_norm": 1.799906611442566, + "learning_rate": 1.9838605887735266e-05, + "loss": 0.7264, + "step": 1560 + }, + { + "epoch": 0.2585372882402585, + "grad_norm": 2.0572450160980225, + "learning_rate": 1.9835409343531465e-05, + "loss": 0.7212, + "step": 1570 + }, + { + "epoch": 0.2601840225602602, + "grad_norm": 1.9132790565490723, + "learning_rate": 1.9832181716489664e-05, + "loss": 0.7176, + "step": 1580 + }, + { + "epoch": 0.2618307568802618, + "grad_norm": 1.7707911729812622, + "learning_rate": 1.9828923016810123e-05, + "loss": 0.7647, + "step": 1590 + }, + { + "epoch": 0.2634774912002635, + "grad_norm": 1.9048503637313843, + "learning_rate": 1.9825633254791318e-05, + "loss": 0.7202, + "step": 1600 + }, + { + "epoch": 0.2651242255202651, + "grad_norm": 2.583421230316162, + "learning_rate": 1.9822312440829876e-05, + "loss": 0.73, + "step": 1610 + }, + { + "epoch": 0.2667709598402668, + "grad_norm": 1.651800513267517, + "learning_rate": 1.9818960585420562e-05, + "loss": 0.706, + "step": 1620 + }, + { + "epoch": 0.2684176941602684, + "grad_norm": 2.220543622970581, + "learning_rate": 1.981557769915625e-05, + "loss": 0.7246, + "step": 1630 + }, + { + "epoch": 0.27006442848027007, + "grad_norm": 1.6591778993606567, + "learning_rate": 1.9812163792727864e-05, + "loss": 0.7453, + "step": 1640 + }, + { + "epoch": 0.2717111628002717, + "grad_norm": 1.8537503480911255, + "learning_rate": 1.9808718876924376e-05, + "loss": 0.7483, + "step": 1650 + }, + { + "epoch": 0.27335789712027336, + "grad_norm": 2.1067299842834473, + "learning_rate": 1.9805242962632747e-05, + "loss": 0.7047, + "step": 1660 + }, + { + "epoch": 0.275004631440275, + "grad_norm": 1.5523687601089478, + "learning_rate": 1.9801736060837913e-05, + "loss": 0.7427, + "step": 1670 + }, + { + "epoch": 0.27665136576027666, + "grad_norm": 1.404558777809143, + "learning_rate": 1.9798198182622734e-05, + "loss": 0.7976, + "step": 1680 + }, + { + "epoch": 0.2782981000802783, + "grad_norm": 2.101656675338745, + "learning_rate": 1.979462933916795e-05, + "loss": 0.7621, + "step": 1690 + }, + { + "epoch": 0.27994483440027995, + "grad_norm": 2.0541417598724365, + "learning_rate": 1.9791029541752197e-05, + "loss": 0.7805, + "step": 1700 + }, + { + "epoch": 0.28159156872028157, + "grad_norm": 1.5349130630493164, + "learning_rate": 1.9787398801751895e-05, + "loss": 0.7399, + "step": 1710 + }, + { + "epoch": 0.28323830304028325, + "grad_norm": 2.281158447265625, + "learning_rate": 1.9783737130641272e-05, + "loss": 0.7575, + "step": 1720 + }, + { + "epoch": 0.28488503736028487, + "grad_norm": 2.4197065830230713, + "learning_rate": 1.978004453999231e-05, + "loss": 0.7578, + "step": 1730 + }, + { + "epoch": 0.28653177168028654, + "grad_norm": 2.0881662368774414, + "learning_rate": 1.97763210414747e-05, + "loss": 0.7549, + "step": 1740 + }, + { + "epoch": 0.28817850600028816, + "grad_norm": 2.4676826000213623, + "learning_rate": 1.9772566646855814e-05, + "loss": 0.741, + "step": 1750 + }, + { + "epoch": 0.28982524032028983, + "grad_norm": 1.893269419670105, + "learning_rate": 1.9768781368000658e-05, + "loss": 0.7416, + "step": 1760 + }, + { + "epoch": 0.29147197464029145, + "grad_norm": 1.5994690656661987, + "learning_rate": 1.9764965216871848e-05, + "loss": 0.7181, + "step": 1770 + }, + { + "epoch": 0.29311870896029313, + "grad_norm": 1.4995242357254028, + "learning_rate": 1.9761118205529565e-05, + "loss": 0.7389, + "step": 1780 + }, + { + "epoch": 0.29476544328029475, + "grad_norm": 1.3291387557983398, + "learning_rate": 1.9757240346131517e-05, + "loss": 0.7356, + "step": 1790 + }, + { + "epoch": 0.2964121776002964, + "grad_norm": 1.4342174530029297, + "learning_rate": 1.9753331650932898e-05, + "loss": 0.722, + "step": 1800 + }, + { + "epoch": 0.29805891192029804, + "grad_norm": 1.4941635131835938, + "learning_rate": 1.9749392132286356e-05, + "loss": 0.7491, + "step": 1810 + }, + { + "epoch": 0.2997056462402997, + "grad_norm": 1.4663833379745483, + "learning_rate": 1.974542180264195e-05, + "loss": 0.7724, + "step": 1820 + }, + { + "epoch": 0.30135238056030134, + "grad_norm": 1.420600414276123, + "learning_rate": 1.974142067454711e-05, + "loss": 0.7844, + "step": 1830 + }, + { + "epoch": 0.302999114880303, + "grad_norm": 1.7661523818969727, + "learning_rate": 1.97373887606466e-05, + "loss": 0.7277, + "step": 1840 + }, + { + "epoch": 0.30464584920030463, + "grad_norm": 2.7334187030792236, + "learning_rate": 1.9733326073682475e-05, + "loss": 0.7209, + "step": 1850 + }, + { + "epoch": 0.3062925835203063, + "grad_norm": 1.8105331659317017, + "learning_rate": 1.972923262649404e-05, + "loss": 0.7499, + "step": 1860 + }, + { + "epoch": 0.3079393178403079, + "grad_norm": 1.575403094291687, + "learning_rate": 1.9725108432017812e-05, + "loss": 0.7097, + "step": 1870 + }, + { + "epoch": 0.3095860521603096, + "grad_norm": 1.4154794216156006, + "learning_rate": 1.9720953503287487e-05, + "loss": 0.7141, + "step": 1880 + }, + { + "epoch": 0.3112327864803112, + "grad_norm": 1.4072984457015991, + "learning_rate": 1.9716767853433877e-05, + "loss": 0.7378, + "step": 1890 + }, + { + "epoch": 0.3128795208003129, + "grad_norm": 1.3542795181274414, + "learning_rate": 1.971255149568489e-05, + "loss": 0.7385, + "step": 1900 + }, + { + "epoch": 0.3145262551203145, + "grad_norm": 1.7643039226531982, + "learning_rate": 1.970830444336548e-05, + "loss": 0.7148, + "step": 1910 + }, + { + "epoch": 0.3161729894403162, + "grad_norm": 1.285137414932251, + "learning_rate": 1.9704026709897606e-05, + "loss": 0.6827, + "step": 1920 + }, + { + "epoch": 0.3178197237603178, + "grad_norm": 1.2715585231781006, + "learning_rate": 1.9699718308800182e-05, + "loss": 0.7133, + "step": 1930 + }, + { + "epoch": 0.3194664580803195, + "grad_norm": 1.5506412982940674, + "learning_rate": 1.9695379253689048e-05, + "loss": 0.7238, + "step": 1940 + }, + { + "epoch": 0.3211131924003211, + "grad_norm": 1.3223036527633667, + "learning_rate": 1.9691009558276915e-05, + "loss": 0.7131, + "step": 1950 + }, + { + "epoch": 0.3227599267203228, + "grad_norm": 1.505676031112671, + "learning_rate": 1.9686609236373333e-05, + "loss": 0.7216, + "step": 1960 + }, + { + "epoch": 0.3244066610403244, + "grad_norm": 1.330424427986145, + "learning_rate": 1.9682178301884632e-05, + "loss": 0.7296, + "step": 1970 + }, + { + "epoch": 0.32605339536032607, + "grad_norm": 1.6218839883804321, + "learning_rate": 1.9677716768813893e-05, + "loss": 0.6836, + "step": 1980 + }, + { + "epoch": 0.3277001296803277, + "grad_norm": 1.3749337196350098, + "learning_rate": 1.9673224651260894e-05, + "loss": 0.7039, + "step": 1990 + }, + { + "epoch": 0.32934686400032936, + "grad_norm": 1.3419939279556274, + "learning_rate": 1.9668701963422077e-05, + "loss": 0.7181, + "step": 2000 + }, + { + "epoch": 0.330993598320331, + "grad_norm": 1.531684398651123, + "learning_rate": 1.9664148719590486e-05, + "loss": 0.7263, + "step": 2010 + }, + { + "epoch": 0.33264033264033266, + "grad_norm": 1.6776705980300903, + "learning_rate": 1.9659564934155733e-05, + "loss": 0.7279, + "step": 2020 + }, + { + "epoch": 0.3342870669603343, + "grad_norm": 1.4876450300216675, + "learning_rate": 1.9654950621603955e-05, + "loss": 0.7441, + "step": 2030 + }, + { + "epoch": 0.33593380128033595, + "grad_norm": 1.3358787298202515, + "learning_rate": 1.965030579651776e-05, + "loss": 0.6973, + "step": 2040 + }, + { + "epoch": 0.3375805356003376, + "grad_norm": 1.2709858417510986, + "learning_rate": 1.9645630473576184e-05, + "loss": 0.6895, + "step": 2050 + }, + { + "epoch": 0.33922726992033925, + "grad_norm": 1.523409366607666, + "learning_rate": 1.9640924667554654e-05, + "loss": 0.7273, + "step": 2060 + }, + { + "epoch": 0.34087400424034087, + "grad_norm": 1.1860836744308472, + "learning_rate": 1.9636188393324917e-05, + "loss": 0.6914, + "step": 2070 + }, + { + "epoch": 0.34252073856034254, + "grad_norm": 1.5158321857452393, + "learning_rate": 1.9631421665855023e-05, + "loss": 0.6884, + "step": 2080 + }, + { + "epoch": 0.34416747288034416, + "grad_norm": 1.5086334943771362, + "learning_rate": 1.9626624500209254e-05, + "loss": 0.7352, + "step": 2090 + }, + { + "epoch": 0.34581420720034584, + "grad_norm": 1.5807071924209595, + "learning_rate": 1.9621796911548097e-05, + "loss": 0.6873, + "step": 2100 + }, + { + "epoch": 0.34746094152034745, + "grad_norm": 1.472090244293213, + "learning_rate": 1.961693891512817e-05, + "loss": 0.6839, + "step": 2110 + }, + { + "epoch": 0.34910767584034913, + "grad_norm": 1.2841448783874512, + "learning_rate": 1.9612050526302195e-05, + "loss": 0.7038, + "step": 2120 + }, + { + "epoch": 0.35075441016035075, + "grad_norm": 1.4147202968597412, + "learning_rate": 1.9607131760518952e-05, + "loss": 0.7, + "step": 2130 + }, + { + "epoch": 0.3524011444803524, + "grad_norm": 1.4587483406066895, + "learning_rate": 1.9602182633323205e-05, + "loss": 0.6995, + "step": 2140 + }, + { + "epoch": 0.35404787880035404, + "grad_norm": 1.179916501045227, + "learning_rate": 1.9597203160355684e-05, + "loss": 0.6975, + "step": 2150 + }, + { + "epoch": 0.3556946131203557, + "grad_norm": 1.621735692024231, + "learning_rate": 1.9592193357353012e-05, + "loss": 0.6808, + "step": 2160 + }, + { + "epoch": 0.35734134744035734, + "grad_norm": 1.5990266799926758, + "learning_rate": 1.9587153240147663e-05, + "loss": 0.7072, + "step": 2170 + }, + { + "epoch": 0.358988081760359, + "grad_norm": 1.6757946014404297, + "learning_rate": 1.9582082824667924e-05, + "loss": 0.7017, + "step": 2180 + }, + { + "epoch": 0.36063481608036063, + "grad_norm": 1.4000555276870728, + "learning_rate": 1.957698212693782e-05, + "loss": 0.704, + "step": 2190 + }, + { + "epoch": 0.3622815504003623, + "grad_norm": 1.4697622060775757, + "learning_rate": 1.9571851163077082e-05, + "loss": 0.6987, + "step": 2200 + }, + { + "epoch": 0.3639282847203639, + "grad_norm": 1.608054518699646, + "learning_rate": 1.9566689949301097e-05, + "loss": 0.6947, + "step": 2210 + }, + { + "epoch": 0.3655750190403656, + "grad_norm": 1.957412600517273, + "learning_rate": 1.956149850192084e-05, + "loss": 0.6938, + "step": 2220 + }, + { + "epoch": 0.3672217533603672, + "grad_norm": 1.398135781288147, + "learning_rate": 1.955627683734284e-05, + "loss": 0.6606, + "step": 2230 + }, + { + "epoch": 0.3688684876803689, + "grad_norm": 1.6263389587402344, + "learning_rate": 1.9551024972069127e-05, + "loss": 0.7298, + "step": 2240 + }, + { + "epoch": 0.3705152220003705, + "grad_norm": 1.461220145225525, + "learning_rate": 1.9545742922697157e-05, + "loss": 0.704, + "step": 2250 + }, + { + "epoch": 0.3721619563203722, + "grad_norm": 1.1990222930908203, + "learning_rate": 1.9540430705919798e-05, + "loss": 0.6566, + "step": 2260 + }, + { + "epoch": 0.3738086906403738, + "grad_norm": 1.6441187858581543, + "learning_rate": 1.9535088338525238e-05, + "loss": 0.7184, + "step": 2270 + }, + { + "epoch": 0.37545542496037543, + "grad_norm": 1.2525330781936646, + "learning_rate": 1.9529715837396956e-05, + "loss": 0.7407, + "step": 2280 + }, + { + "epoch": 0.3771021592803771, + "grad_norm": 1.493485927581787, + "learning_rate": 1.952431321951367e-05, + "loss": 0.6515, + "step": 2290 + }, + { + "epoch": 0.3787488936003787, + "grad_norm": 1.203551173210144, + "learning_rate": 1.9518880501949267e-05, + "loss": 0.7195, + "step": 2300 + }, + { + "epoch": 0.3803956279203804, + "grad_norm": 1.6315183639526367, + "learning_rate": 1.9513417701872766e-05, + "loss": 0.6887, + "step": 2310 + }, + { + "epoch": 0.382042362240382, + "grad_norm": 1.2268880605697632, + "learning_rate": 1.9507924836548244e-05, + "loss": 0.7259, + "step": 2320 + }, + { + "epoch": 0.3836890965603837, + "grad_norm": 1.1860028505325317, + "learning_rate": 1.9502401923334798e-05, + "loss": 0.671, + "step": 2330 + }, + { + "epoch": 0.3853358308803853, + "grad_norm": 1.2253532409667969, + "learning_rate": 1.9496848979686493e-05, + "loss": 0.6954, + "step": 2340 + }, + { + "epoch": 0.386982565200387, + "grad_norm": 1.3530570268630981, + "learning_rate": 1.949126602315229e-05, + "loss": 0.7222, + "step": 2350 + }, + { + "epoch": 0.3886292995203886, + "grad_norm": 1.353309988975525, + "learning_rate": 1.9485653071376004e-05, + "loss": 0.7268, + "step": 2360 + }, + { + "epoch": 0.3902760338403903, + "grad_norm": 1.2534314393997192, + "learning_rate": 1.9480010142096245e-05, + "loss": 0.6925, + "step": 2370 + }, + { + "epoch": 0.3919227681603919, + "grad_norm": 1.8099732398986816, + "learning_rate": 1.947433725314636e-05, + "loss": 0.659, + "step": 2380 + }, + { + "epoch": 0.3935695024803936, + "grad_norm": 2.0360233783721924, + "learning_rate": 1.946863442245437e-05, + "loss": 0.6859, + "step": 2390 + }, + { + "epoch": 0.3952162368003952, + "grad_norm": 1.5365337133407593, + "learning_rate": 1.946290166804293e-05, + "loss": 0.6973, + "step": 2400 + }, + { + "epoch": 0.39686297112039687, + "grad_norm": 1.4845314025878906, + "learning_rate": 1.9457139008029263e-05, + "loss": 0.7292, + "step": 2410 + }, + { + "epoch": 0.3985097054403985, + "grad_norm": 1.3728680610656738, + "learning_rate": 1.94513464606251e-05, + "loss": 0.6981, + "step": 2420 + }, + { + "epoch": 0.40015643976040016, + "grad_norm": 1.3709412813186646, + "learning_rate": 1.9445524044136618e-05, + "loss": 0.7105, + "step": 2430 + }, + { + "epoch": 0.4018031740804018, + "grad_norm": 1.321450114250183, + "learning_rate": 1.94396717769644e-05, + "loss": 0.6632, + "step": 2440 + }, + { + "epoch": 0.40344990840040346, + "grad_norm": 1.2133983373641968, + "learning_rate": 1.943378967760337e-05, + "loss": 0.6788, + "step": 2450 + }, + { + "epoch": 0.4050966427204051, + "grad_norm": 1.2709228992462158, + "learning_rate": 1.9427877764642714e-05, + "loss": 0.683, + "step": 2460 + }, + { + "epoch": 0.40674337704040675, + "grad_norm": 1.167478084564209, + "learning_rate": 1.9421936056765847e-05, + "loss": 0.6599, + "step": 2470 + }, + { + "epoch": 0.40839011136040837, + "grad_norm": 1.3339264392852783, + "learning_rate": 1.9415964572750347e-05, + "loss": 0.6807, + "step": 2480 + }, + { + "epoch": 0.41003684568041004, + "grad_norm": 1.1604259014129639, + "learning_rate": 1.9409963331467893e-05, + "loss": 0.6634, + "step": 2490 + }, + { + "epoch": 0.41168358000041166, + "grad_norm": 1.3648931980133057, + "learning_rate": 1.94039323518842e-05, + "loss": 0.6572, + "step": 2500 + }, + { + "epoch": 0.41333031432041334, + "grad_norm": 1.3427473306655884, + "learning_rate": 1.9397871653058974e-05, + "loss": 0.6773, + "step": 2510 + }, + { + "epoch": 0.41497704864041496, + "grad_norm": 1.2733012437820435, + "learning_rate": 1.9391781254145833e-05, + "loss": 0.6822, + "step": 2520 + }, + { + "epoch": 0.41662378296041663, + "grad_norm": 1.1846665143966675, + "learning_rate": 1.9385661174392262e-05, + "loss": 0.6781, + "step": 2530 + }, + { + "epoch": 0.41827051728041825, + "grad_norm": 1.2302825450897217, + "learning_rate": 1.9379511433139547e-05, + "loss": 0.6608, + "step": 2540 + }, + { + "epoch": 0.4199172516004199, + "grad_norm": 1.2838640213012695, + "learning_rate": 1.937333204982271e-05, + "loss": 0.6924, + "step": 2550 + }, + { + "epoch": 0.42156398592042155, + "grad_norm": 1.528942346572876, + "learning_rate": 1.9367123043970452e-05, + "loss": 0.6789, + "step": 2560 + }, + { + "epoch": 0.4232107202404232, + "grad_norm": 1.4241206645965576, + "learning_rate": 1.936088443520509e-05, + "loss": 0.675, + "step": 2570 + }, + { + "epoch": 0.42485745456042484, + "grad_norm": 1.1903736591339111, + "learning_rate": 1.93546162432425e-05, + "loss": 0.6795, + "step": 2580 + }, + { + "epoch": 0.4265041888804265, + "grad_norm": 1.1137593984603882, + "learning_rate": 1.9348318487892036e-05, + "loss": 0.6693, + "step": 2590 + }, + { + "epoch": 0.42815092320042814, + "grad_norm": 1.1012071371078491, + "learning_rate": 1.9341991189056498e-05, + "loss": 0.6982, + "step": 2600 + }, + { + "epoch": 0.4297976575204298, + "grad_norm": 1.1505986452102661, + "learning_rate": 1.9335634366732044e-05, + "loss": 0.664, + "step": 2610 + }, + { + "epoch": 0.43144439184043143, + "grad_norm": 1.229068398475647, + "learning_rate": 1.9329248041008134e-05, + "loss": 0.6789, + "step": 2620 + }, + { + "epoch": 0.4330911261604331, + "grad_norm": 1.1394855976104736, + "learning_rate": 1.9322832232067466e-05, + "loss": 0.6599, + "step": 2630 + }, + { + "epoch": 0.4347378604804347, + "grad_norm": 1.1221281290054321, + "learning_rate": 1.9316386960185922e-05, + "loss": 0.6592, + "step": 2640 + }, + { + "epoch": 0.4363845948004364, + "grad_norm": 1.1771143674850464, + "learning_rate": 1.930991224573249e-05, + "loss": 0.6922, + "step": 2650 + }, + { + "epoch": 0.438031329120438, + "grad_norm": 1.2491823434829712, + "learning_rate": 1.9303408109169205e-05, + "loss": 0.6544, + "step": 2660 + }, + { + "epoch": 0.4396780634404397, + "grad_norm": 1.059578537940979, + "learning_rate": 1.9296874571051084e-05, + "loss": 0.6561, + "step": 2670 + }, + { + "epoch": 0.4413247977604413, + "grad_norm": 1.4251331090927124, + "learning_rate": 1.9290311652026065e-05, + "loss": 0.6322, + "step": 2680 + }, + { + "epoch": 0.442971532080443, + "grad_norm": 1.9921714067459106, + "learning_rate": 1.9283719372834933e-05, + "loss": 0.6859, + "step": 2690 + }, + { + "epoch": 0.4446182664004446, + "grad_norm": 1.2513914108276367, + "learning_rate": 1.9277097754311277e-05, + "loss": 0.6463, + "step": 2700 + }, + { + "epoch": 0.4462650007204463, + "grad_norm": 1.6695165634155273, + "learning_rate": 1.9270446817381377e-05, + "loss": 0.6827, + "step": 2710 + }, + { + "epoch": 0.4479117350404479, + "grad_norm": 1.291625738143921, + "learning_rate": 1.9263766583064193e-05, + "loss": 0.6992, + "step": 2720 + }, + { + "epoch": 0.4495584693604496, + "grad_norm": 1.7672340869903564, + "learning_rate": 1.925705707247127e-05, + "loss": 0.6754, + "step": 2730 + }, + { + "epoch": 0.4512052036804512, + "grad_norm": 3.7783656120300293, + "learning_rate": 1.925031830680666e-05, + "loss": 0.671, + "step": 2740 + }, + { + "epoch": 0.45285193800045287, + "grad_norm": 1.448926568031311, + "learning_rate": 1.9243550307366884e-05, + "loss": 0.6677, + "step": 2750 + }, + { + "epoch": 0.4544986723204545, + "grad_norm": 1.472121238708496, + "learning_rate": 1.923675309554085e-05, + "loss": 0.6609, + "step": 2760 + }, + { + "epoch": 0.45614540664045616, + "grad_norm": 1.7885098457336426, + "learning_rate": 1.9229926692809777e-05, + "loss": 0.6847, + "step": 2770 + }, + { + "epoch": 0.4577921409604578, + "grad_norm": 1.6598893404006958, + "learning_rate": 1.9223071120747145e-05, + "loss": 0.6782, + "step": 2780 + }, + { + "epoch": 0.45943887528045946, + "grad_norm": 1.3006958961486816, + "learning_rate": 1.9216186401018614e-05, + "loss": 0.656, + "step": 2790 + }, + { + "epoch": 0.4610856096004611, + "grad_norm": 1.0863409042358398, + "learning_rate": 1.920927255538196e-05, + "loss": 0.6482, + "step": 2800 + }, + { + "epoch": 0.46273234392046275, + "grad_norm": 1.5282680988311768, + "learning_rate": 1.9202329605687e-05, + "loss": 0.6434, + "step": 2810 + }, + { + "epoch": 0.46437907824046437, + "grad_norm": 1.2323793172836304, + "learning_rate": 1.9195357573875537e-05, + "loss": 0.6177, + "step": 2820 + }, + { + "epoch": 0.46602581256046605, + "grad_norm": 1.1313997507095337, + "learning_rate": 1.918835648198128e-05, + "loss": 0.674, + "step": 2830 + }, + { + "epoch": 0.46767254688046767, + "grad_norm": 1.3586218357086182, + "learning_rate": 1.9181326352129773e-05, + "loss": 0.6722, + "step": 2840 + }, + { + "epoch": 0.46931928120046934, + "grad_norm": 1.0108249187469482, + "learning_rate": 1.9174267206538332e-05, + "loss": 0.6609, + "step": 2850 + }, + { + "epoch": 0.47096601552047096, + "grad_norm": 1.2038466930389404, + "learning_rate": 1.916717906751597e-05, + "loss": 0.6407, + "step": 2860 + }, + { + "epoch": 0.47261274984047263, + "grad_norm": 1.1672009229660034, + "learning_rate": 1.916006195746333e-05, + "loss": 0.6932, + "step": 2870 + }, + { + "epoch": 0.47425948416047425, + "grad_norm": 1.5974833965301514, + "learning_rate": 1.915291589887261e-05, + "loss": 0.6373, + "step": 2880 + }, + { + "epoch": 0.47590621848047593, + "grad_norm": 1.06233811378479, + "learning_rate": 1.914574091432749e-05, + "loss": 0.6429, + "step": 2890 + }, + { + "epoch": 0.47755295280047755, + "grad_norm": 1.8846768140792847, + "learning_rate": 1.9138537026503076e-05, + "loss": 0.6373, + "step": 2900 + }, + { + "epoch": 0.4791996871204792, + "grad_norm": 1.164318323135376, + "learning_rate": 1.913130425816581e-05, + "loss": 0.6639, + "step": 2910 + }, + { + "epoch": 0.48084642144048084, + "grad_norm": 1.1987966299057007, + "learning_rate": 1.9124042632173398e-05, + "loss": 0.6699, + "step": 2920 + }, + { + "epoch": 0.4824931557604825, + "grad_norm": 1.5233405828475952, + "learning_rate": 1.9116752171474754e-05, + "loss": 0.6355, + "step": 2930 + }, + { + "epoch": 0.48413989008048414, + "grad_norm": 1.2364847660064697, + "learning_rate": 1.9109432899109923e-05, + "loss": 0.6616, + "step": 2940 + }, + { + "epoch": 0.4857866244004858, + "grad_norm": 1.3125419616699219, + "learning_rate": 1.9102084838209992e-05, + "loss": 0.6585, + "step": 2950 + }, + { + "epoch": 0.48743335872048743, + "grad_norm": 1.315054178237915, + "learning_rate": 1.9094708011997033e-05, + "loss": 0.6443, + "step": 2960 + }, + { + "epoch": 0.4890800930404891, + "grad_norm": 0.934471070766449, + "learning_rate": 1.908730244378403e-05, + "loss": 0.6209, + "step": 2970 + }, + { + "epoch": 0.4907268273604907, + "grad_norm": 1.1312390565872192, + "learning_rate": 1.9079868156974788e-05, + "loss": 0.6699, + "step": 2980 + }, + { + "epoch": 0.4923735616804924, + "grad_norm": 1.168817162513733, + "learning_rate": 1.9072405175063883e-05, + "loss": 0.6222, + "step": 2990 + }, + { + "epoch": 0.494020296000494, + "grad_norm": 1.1964330673217773, + "learning_rate": 1.9064913521636574e-05, + "loss": 0.6303, + "step": 3000 + }, + { + "epoch": 0.4956670303204957, + "grad_norm": 1.1183303594589233, + "learning_rate": 1.9057393220368722e-05, + "loss": 0.6285, + "step": 3010 + }, + { + "epoch": 0.4973137646404973, + "grad_norm": 1.0846532583236694, + "learning_rate": 1.9049844295026738e-05, + "loss": 0.6463, + "step": 3020 + }, + { + "epoch": 0.498960498960499, + "grad_norm": 1.4997867345809937, + "learning_rate": 1.904226676946748e-05, + "loss": 0.6729, + "step": 3030 + }, + { + "epoch": 0.5006072332805006, + "grad_norm": 1.2239123582839966, + "learning_rate": 1.9034660667638206e-05, + "loss": 0.664, + "step": 3040 + }, + { + "epoch": 0.5022539676005022, + "grad_norm": 1.2806061506271362, + "learning_rate": 1.9027026013576465e-05, + "loss": 0.6568, + "step": 3050 + }, + { + "epoch": 0.5039007019205038, + "grad_norm": 1.2059847116470337, + "learning_rate": 1.9019362831410057e-05, + "loss": 0.6771, + "step": 3060 + }, + { + "epoch": 0.5055474362405056, + "grad_norm": 1.2228035926818848, + "learning_rate": 1.9011671145356926e-05, + "loss": 0.6438, + "step": 3070 + }, + { + "epoch": 0.5071941705605072, + "grad_norm": 1.16281259059906, + "learning_rate": 1.9003950979725103e-05, + "loss": 0.6518, + "step": 3080 + }, + { + "epoch": 0.5088409048805088, + "grad_norm": 1.3429057598114014, + "learning_rate": 1.899620235891263e-05, + "loss": 0.6167, + "step": 3090 + }, + { + "epoch": 0.5104876392005104, + "grad_norm": 1.0990668535232544, + "learning_rate": 1.8988425307407458e-05, + "loss": 0.624, + "step": 3100 + }, + { + "epoch": 0.5121343735205122, + "grad_norm": 1.1892952919006348, + "learning_rate": 1.89806198497874e-05, + "loss": 0.6193, + "step": 3110 + }, + { + "epoch": 0.5137811078405138, + "grad_norm": 1.194745421409607, + "learning_rate": 1.8972786010720046e-05, + "loss": 0.6825, + "step": 3120 + }, + { + "epoch": 0.5154278421605154, + "grad_norm": 1.267279028892517, + "learning_rate": 1.8964923814962672e-05, + "loss": 0.6361, + "step": 3130 + }, + { + "epoch": 0.517074576480517, + "grad_norm": 1.3956196308135986, + "learning_rate": 1.8957033287362167e-05, + "loss": 0.6075, + "step": 3140 + }, + { + "epoch": 0.5187213108005188, + "grad_norm": 1.4350258111953735, + "learning_rate": 1.8949114452854957e-05, + "loss": 0.6427, + "step": 3150 + }, + { + "epoch": 0.5203680451205204, + "grad_norm": 1.3245211839675903, + "learning_rate": 1.8941167336466932e-05, + "loss": 0.6825, + "step": 3160 + }, + { + "epoch": 0.522014779440522, + "grad_norm": 1.3547745943069458, + "learning_rate": 1.893319196331336e-05, + "loss": 0.6353, + "step": 3170 + }, + { + "epoch": 0.5236615137605236, + "grad_norm": 1.0884572267532349, + "learning_rate": 1.8925188358598815e-05, + "loss": 0.6354, + "step": 3180 + }, + { + "epoch": 0.5253082480805253, + "grad_norm": 1.193581223487854, + "learning_rate": 1.8917156547617072e-05, + "loss": 0.651, + "step": 3190 + }, + { + "epoch": 0.526954982400527, + "grad_norm": 1.1126220226287842, + "learning_rate": 1.890909655575106e-05, + "loss": 0.6516, + "step": 3200 + }, + { + "epoch": 0.5286017167205286, + "grad_norm": 1.4914835691452026, + "learning_rate": 1.8901008408472775e-05, + "loss": 0.6346, + "step": 3210 + }, + { + "epoch": 0.5302484510405302, + "grad_norm": 1.3618090152740479, + "learning_rate": 1.8892892131343177e-05, + "loss": 0.6237, + "step": 3220 + }, + { + "epoch": 0.5318951853605319, + "grad_norm": 1.3347357511520386, + "learning_rate": 1.888474775001213e-05, + "loss": 0.6727, + "step": 3230 + }, + { + "epoch": 0.5335419196805335, + "grad_norm": 1.1067010164260864, + "learning_rate": 1.8876575290218323e-05, + "loss": 0.6533, + "step": 3240 + }, + { + "epoch": 0.5351886540005352, + "grad_norm": 1.7199362516403198, + "learning_rate": 1.8868374777789172e-05, + "loss": 0.6546, + "step": 3250 + }, + { + "epoch": 0.5368353883205368, + "grad_norm": 1.990593671798706, + "learning_rate": 1.886014623864075e-05, + "loss": 0.6129, + "step": 3260 + }, + { + "epoch": 0.5384821226405385, + "grad_norm": 1.2225085496902466, + "learning_rate": 1.8851889698777707e-05, + "loss": 0.655, + "step": 3270 + }, + { + "epoch": 0.5401288569605401, + "grad_norm": 1.4789817333221436, + "learning_rate": 1.8843605184293177e-05, + "loss": 0.6274, + "step": 3280 + }, + { + "epoch": 0.5417755912805418, + "grad_norm": 1.4760066270828247, + "learning_rate": 1.8835292721368715e-05, + "loss": 0.649, + "step": 3290 + }, + { + "epoch": 0.5434223256005434, + "grad_norm": 2.949131488800049, + "learning_rate": 1.8826952336274184e-05, + "loss": 0.6358, + "step": 3300 + }, + { + "epoch": 0.5450690599205451, + "grad_norm": 1.5721391439437866, + "learning_rate": 1.88185840553677e-05, + "loss": 0.6461, + "step": 3310 + }, + { + "epoch": 0.5467157942405467, + "grad_norm": 1.620802402496338, + "learning_rate": 1.881018790509553e-05, + "loss": 0.5999, + "step": 3320 + }, + { + "epoch": 0.5483625285605483, + "grad_norm": 1.6917542219161987, + "learning_rate": 1.8801763911992035e-05, + "loss": 0.6141, + "step": 3330 + }, + { + "epoch": 0.55000926288055, + "grad_norm": 1.6579176187515259, + "learning_rate": 1.8793312102679548e-05, + "loss": 0.6865, + "step": 3340 + }, + { + "epoch": 0.5516559972005517, + "grad_norm": 1.5372205972671509, + "learning_rate": 1.8784832503868314e-05, + "loss": 0.607, + "step": 3350 + }, + { + "epoch": 0.5533027315205533, + "grad_norm": 2.1712288856506348, + "learning_rate": 1.8776325142356406e-05, + "loss": 0.6754, + "step": 3360 + }, + { + "epoch": 0.5549494658405549, + "grad_norm": 1.6536438465118408, + "learning_rate": 1.876779004502964e-05, + "loss": 0.6484, + "step": 3370 + }, + { + "epoch": 0.5565962001605566, + "grad_norm": 2.0324020385742188, + "learning_rate": 1.8759227238861467e-05, + "loss": 0.6278, + "step": 3380 + }, + { + "epoch": 0.5582429344805583, + "grad_norm": 2.410161018371582, + "learning_rate": 1.8750636750912927e-05, + "loss": 0.611, + "step": 3390 + }, + { + "epoch": 0.5598896688005599, + "grad_norm": 1.6419649124145508, + "learning_rate": 1.874201860833253e-05, + "loss": 0.643, + "step": 3400 + }, + { + "epoch": 0.5615364031205615, + "grad_norm": 2.4913241863250732, + "learning_rate": 1.873337283835619e-05, + "loss": 0.6535, + "step": 3410 + }, + { + "epoch": 0.5631831374405631, + "grad_norm": 1.8292442560195923, + "learning_rate": 1.8724699468307123e-05, + "loss": 0.643, + "step": 3420 + }, + { + "epoch": 0.5648298717605649, + "grad_norm": 1.3961424827575684, + "learning_rate": 1.8715998525595775e-05, + "loss": 0.6279, + "step": 3430 + }, + { + "epoch": 0.5664766060805665, + "grad_norm": 1.811924695968628, + "learning_rate": 1.8707270037719737e-05, + "loss": 0.6424, + "step": 3440 + }, + { + "epoch": 0.5681233404005681, + "grad_norm": 1.5042741298675537, + "learning_rate": 1.8698514032263636e-05, + "loss": 0.6027, + "step": 3450 + }, + { + "epoch": 0.5697700747205697, + "grad_norm": 1.6137949228286743, + "learning_rate": 1.8689730536899078e-05, + "loss": 0.6294, + "step": 3460 + }, + { + "epoch": 0.5714168090405715, + "grad_norm": 1.3153676986694336, + "learning_rate": 1.868091957938453e-05, + "loss": 0.6277, + "step": 3470 + }, + { + "epoch": 0.5730635433605731, + "grad_norm": 1.300405740737915, + "learning_rate": 1.867208118756526e-05, + "loss": 0.6461, + "step": 3480 + }, + { + "epoch": 0.5747102776805747, + "grad_norm": 1.4557414054870605, + "learning_rate": 1.866321538937323e-05, + "loss": 0.6486, + "step": 3490 + }, + { + "epoch": 0.5763570120005763, + "grad_norm": 1.8575087785720825, + "learning_rate": 1.8654322212827022e-05, + "loss": 0.6732, + "step": 3500 + }, + { + "epoch": 0.578003746320578, + "grad_norm": 1.3033815622329712, + "learning_rate": 1.864540168603173e-05, + "loss": 0.6361, + "step": 3510 + }, + { + "epoch": 0.5796504806405797, + "grad_norm": 2.0723509788513184, + "learning_rate": 1.8636453837178893e-05, + "loss": 0.6148, + "step": 3520 + }, + { + "epoch": 0.5812972149605813, + "grad_norm": 2.9872148036956787, + "learning_rate": 1.8627478694546387e-05, + "loss": 0.6768, + "step": 3530 + }, + { + "epoch": 0.5829439492805829, + "grad_norm": 1.6916303634643555, + "learning_rate": 1.8618476286498356e-05, + "loss": 0.6318, + "step": 3540 + }, + { + "epoch": 0.5845906836005846, + "grad_norm": 2.017186164855957, + "learning_rate": 1.86094466414851e-05, + "loss": 0.6583, + "step": 3550 + }, + { + "epoch": 0.5862374179205863, + "grad_norm": 1.7270339727401733, + "learning_rate": 1.8600389788043003e-05, + "loss": 0.6026, + "step": 3560 + }, + { + "epoch": 0.5878841522405879, + "grad_norm": 1.8550065755844116, + "learning_rate": 1.8591305754794434e-05, + "loss": 0.631, + "step": 3570 + }, + { + "epoch": 0.5895308865605895, + "grad_norm": 2.131333112716675, + "learning_rate": 1.8582194570447654e-05, + "loss": 0.6287, + "step": 3580 + }, + { + "epoch": 0.5911776208805912, + "grad_norm": 2.0691232681274414, + "learning_rate": 1.8573056263796732e-05, + "loss": 0.6032, + "step": 3590 + }, + { + "epoch": 0.5928243552005928, + "grad_norm": 2.4947502613067627, + "learning_rate": 1.856389086372146e-05, + "loss": 0.6571, + "step": 3600 + }, + { + "epoch": 0.5944710895205945, + "grad_norm": 1.4081095457077026, + "learning_rate": 1.8554698399187246e-05, + "loss": 0.649, + "step": 3610 + }, + { + "epoch": 0.5961178238405961, + "grad_norm": 1.7815622091293335, + "learning_rate": 1.854547889924502e-05, + "loss": 0.6437, + "step": 3620 + }, + { + "epoch": 0.5977645581605978, + "grad_norm": 1.900574803352356, + "learning_rate": 1.8536232393031173e-05, + "loss": 0.636, + "step": 3630 + }, + { + "epoch": 0.5994112924805994, + "grad_norm": 1.3070859909057617, + "learning_rate": 1.8526958909767425e-05, + "loss": 0.6325, + "step": 3640 + }, + { + "epoch": 0.601058026800601, + "grad_norm": 1.3566079139709473, + "learning_rate": 1.851765847876076e-05, + "loss": 0.6235, + "step": 3650 + }, + { + "epoch": 0.6027047611206027, + "grad_norm": 1.459912896156311, + "learning_rate": 1.8508331129403333e-05, + "loss": 0.6738, + "step": 3660 + }, + { + "epoch": 0.6043514954406044, + "grad_norm": 1.7209458351135254, + "learning_rate": 1.849897689117235e-05, + "loss": 0.6478, + "step": 3670 + }, + { + "epoch": 0.605998229760606, + "grad_norm": 1.884052038192749, + "learning_rate": 1.848959579363001e-05, + "loss": 0.6695, + "step": 3680 + }, + { + "epoch": 0.6076449640806076, + "grad_norm": 1.8747471570968628, + "learning_rate": 1.8480187866423386e-05, + "loss": 0.6453, + "step": 3690 + }, + { + "epoch": 0.6092916984006093, + "grad_norm": 1.548047423362732, + "learning_rate": 1.8470753139284344e-05, + "loss": 0.6253, + "step": 3700 + }, + { + "epoch": 0.610938432720611, + "grad_norm": 3.6169090270996094, + "learning_rate": 1.8461291642029454e-05, + "loss": 0.6002, + "step": 3710 + }, + { + "epoch": 0.6125851670406126, + "grad_norm": 1.727982997894287, + "learning_rate": 1.8451803404559873e-05, + "loss": 0.6077, + "step": 3720 + }, + { + "epoch": 0.6142319013606142, + "grad_norm": 1.832689881324768, + "learning_rate": 1.844228845686127e-05, + "loss": 0.621, + "step": 3730 + }, + { + "epoch": 0.6158786356806158, + "grad_norm": 1.83828866481781, + "learning_rate": 1.8432746829003732e-05, + "loss": 0.6464, + "step": 3740 + }, + { + "epoch": 0.6175253700006176, + "grad_norm": 2.2948646545410156, + "learning_rate": 1.8423178551141662e-05, + "loss": 0.6234, + "step": 3750 + }, + { + "epoch": 0.6191721043206192, + "grad_norm": 2.7820045948028564, + "learning_rate": 1.841358365351368e-05, + "loss": 0.6334, + "step": 3760 + }, + { + "epoch": 0.6208188386406208, + "grad_norm": 1.8063865900039673, + "learning_rate": 1.8403962166442535e-05, + "loss": 0.6515, + "step": 3770 + }, + { + "epoch": 0.6224655729606224, + "grad_norm": 3.7020092010498047, + "learning_rate": 1.8394314120335002e-05, + "loss": 0.6761, + "step": 3780 + }, + { + "epoch": 0.6241123072806241, + "grad_norm": 3.1100969314575195, + "learning_rate": 1.8384639545681803e-05, + "loss": 0.5896, + "step": 3790 + }, + { + "epoch": 0.6257590416006258, + "grad_norm": 3.011991262435913, + "learning_rate": 1.8374938473057486e-05, + "loss": 0.627, + "step": 3800 + }, + { + "epoch": 0.6274057759206274, + "grad_norm": 1.5754694938659668, + "learning_rate": 1.8365210933120347e-05, + "loss": 0.6245, + "step": 3810 + }, + { + "epoch": 0.629052510240629, + "grad_norm": 1.8275468349456787, + "learning_rate": 1.835545695661232e-05, + "loss": 0.6179, + "step": 3820 + }, + { + "epoch": 0.6306992445606306, + "grad_norm": 2.1361637115478516, + "learning_rate": 1.8345676574358897e-05, + "loss": 0.638, + "step": 3830 + }, + { + "epoch": 0.6323459788806324, + "grad_norm": 1.829188585281372, + "learning_rate": 1.8335869817269006e-05, + "loss": 0.6304, + "step": 3840 + }, + { + "epoch": 0.633992713200634, + "grad_norm": 1.8671929836273193, + "learning_rate": 1.8326036716334942e-05, + "loss": 0.6322, + "step": 3850 + }, + { + "epoch": 0.6356394475206356, + "grad_norm": 1.404921531677246, + "learning_rate": 1.8316177302632248e-05, + "loss": 0.6213, + "step": 3860 + }, + { + "epoch": 0.6372861818406372, + "grad_norm": 1.64621901512146, + "learning_rate": 1.8306291607319618e-05, + "loss": 0.6668, + "step": 3870 + }, + { + "epoch": 0.638932916160639, + "grad_norm": 2.2080841064453125, + "learning_rate": 1.829637966163881e-05, + "loss": 0.6089, + "step": 3880 + }, + { + "epoch": 0.6405796504806406, + "grad_norm": 1.4423542022705078, + "learning_rate": 1.8286441496914545e-05, + "loss": 0.6406, + "step": 3890 + }, + { + "epoch": 0.6422263848006422, + "grad_norm": 1.7559462785720825, + "learning_rate": 1.8276477144554393e-05, + "loss": 0.6371, + "step": 3900 + }, + { + "epoch": 0.6438731191206438, + "grad_norm": 1.5417958498001099, + "learning_rate": 1.826648663604869e-05, + "loss": 0.6262, + "step": 3910 + }, + { + "epoch": 0.6455198534406456, + "grad_norm": 1.899674654006958, + "learning_rate": 1.8256470002970438e-05, + "loss": 0.6192, + "step": 3920 + }, + { + "epoch": 0.6471665877606472, + "grad_norm": 1.715377688407898, + "learning_rate": 1.8246427276975196e-05, + "loss": 0.6189, + "step": 3930 + }, + { + "epoch": 0.6488133220806488, + "grad_norm": 1.4496129751205444, + "learning_rate": 1.823635848980098e-05, + "loss": 0.6147, + "step": 3940 + }, + { + "epoch": 0.6504600564006504, + "grad_norm": 1.5673155784606934, + "learning_rate": 1.822626367326818e-05, + "loss": 0.6159, + "step": 3950 + }, + { + "epoch": 0.6521067907206521, + "grad_norm": 1.1710649728775024, + "learning_rate": 1.8216142859279432e-05, + "loss": 0.5911, + "step": 3960 + }, + { + "epoch": 0.6537535250406538, + "grad_norm": 1.358017921447754, + "learning_rate": 1.8205996079819534e-05, + "loss": 0.6107, + "step": 3970 + }, + { + "epoch": 0.6554002593606554, + "grad_norm": 1.4123727083206177, + "learning_rate": 1.8195823366955356e-05, + "loss": 0.629, + "step": 3980 + }, + { + "epoch": 0.657046993680657, + "grad_norm": 1.9289451837539673, + "learning_rate": 1.8185624752835714e-05, + "loss": 0.6609, + "step": 3990 + }, + { + "epoch": 0.6586937280006587, + "grad_norm": 1.6234312057495117, + "learning_rate": 1.8175400269691278e-05, + "loss": 0.6191, + "step": 4000 + }, + { + "epoch": 0.6603404623206603, + "grad_norm": 1.8857241868972778, + "learning_rate": 1.8165149949834474e-05, + "loss": 0.6466, + "step": 4010 + }, + { + "epoch": 0.661987196640662, + "grad_norm": 2.1216554641723633, + "learning_rate": 1.8154873825659393e-05, + "loss": 0.6257, + "step": 4020 + }, + { + "epoch": 0.6636339309606636, + "grad_norm": 1.483231544494629, + "learning_rate": 1.814457192964165e-05, + "loss": 0.5745, + "step": 4030 + }, + { + "epoch": 0.6652806652806653, + "grad_norm": 1.1727079153060913, + "learning_rate": 1.813424429433833e-05, + "loss": 0.6168, + "step": 4040 + }, + { + "epoch": 0.6669273996006669, + "grad_norm": 1.454728364944458, + "learning_rate": 1.8123890952387848e-05, + "loss": 0.6006, + "step": 4050 + }, + { + "epoch": 0.6685741339206686, + "grad_norm": 1.6542500257492065, + "learning_rate": 1.8113511936509864e-05, + "loss": 0.6296, + "step": 4060 + }, + { + "epoch": 0.6702208682406702, + "grad_norm": 1.3419349193572998, + "learning_rate": 1.8103107279505177e-05, + "loss": 0.6597, + "step": 4070 + }, + { + "epoch": 0.6718676025606719, + "grad_norm": 1.247767686843872, + "learning_rate": 1.809267701425562e-05, + "loss": 0.6231, + "step": 4080 + }, + { + "epoch": 0.6735143368806735, + "grad_norm": 2.062878370285034, + "learning_rate": 1.808222117372395e-05, + "loss": 0.6577, + "step": 4090 + }, + { + "epoch": 0.6751610712006751, + "grad_norm": 1.2169808149337769, + "learning_rate": 1.8071739790953754e-05, + "loss": 0.6052, + "step": 4100 + }, + { + "epoch": 0.6768078055206768, + "grad_norm": 1.3929376602172852, + "learning_rate": 1.806123289906934e-05, + "loss": 0.5777, + "step": 4110 + }, + { + "epoch": 0.6784545398406785, + "grad_norm": 1.6458179950714111, + "learning_rate": 1.8050700531275632e-05, + "loss": 0.6085, + "step": 4120 + }, + { + "epoch": 0.6801012741606801, + "grad_norm": 1.402419090270996, + "learning_rate": 1.8040142720858064e-05, + "loss": 0.6112, + "step": 4130 + }, + { + "epoch": 0.6817480084806817, + "grad_norm": 1.2374345064163208, + "learning_rate": 1.8029559501182482e-05, + "loss": 0.5976, + "step": 4140 + }, + { + "epoch": 0.6833947428006834, + "grad_norm": 1.6818028688430786, + "learning_rate": 1.8018950905695022e-05, + "loss": 0.6461, + "step": 4150 + }, + { + "epoch": 0.6850414771206851, + "grad_norm": 1.5200414657592773, + "learning_rate": 1.8008316967922027e-05, + "loss": 0.6412, + "step": 4160 + }, + { + "epoch": 0.6866882114406867, + "grad_norm": 1.8898037672042847, + "learning_rate": 1.799765772146992e-05, + "loss": 0.6135, + "step": 4170 + }, + { + "epoch": 0.6883349457606883, + "grad_norm": 2.4506630897521973, + "learning_rate": 1.7986973200025115e-05, + "loss": 0.6181, + "step": 4180 + }, + { + "epoch": 0.6899816800806899, + "grad_norm": 1.4668997526168823, + "learning_rate": 1.7976263437353897e-05, + "loss": 0.5981, + "step": 4190 + }, + { + "epoch": 0.6916284144006917, + "grad_norm": 1.4976595640182495, + "learning_rate": 1.796552846730232e-05, + "loss": 0.5983, + "step": 4200 + }, + { + "epoch": 0.6932751487206933, + "grad_norm": 1.2524445056915283, + "learning_rate": 1.7954768323796107e-05, + "loss": 0.6191, + "step": 4210 + }, + { + "epoch": 0.6949218830406949, + "grad_norm": 1.3345338106155396, + "learning_rate": 1.7943983040840527e-05, + "loss": 0.6019, + "step": 4220 + }, + { + "epoch": 0.6965686173606965, + "grad_norm": 1.9218558073043823, + "learning_rate": 1.7933172652520308e-05, + "loss": 0.6084, + "step": 4230 + }, + { + "epoch": 0.6982153516806983, + "grad_norm": 1.2218800783157349, + "learning_rate": 1.7922337192999514e-05, + "loss": 0.6521, + "step": 4240 + }, + { + "epoch": 0.6998620860006999, + "grad_norm": 1.9215497970581055, + "learning_rate": 1.7911476696521437e-05, + "loss": 0.6229, + "step": 4250 + }, + { + "epoch": 0.7015088203207015, + "grad_norm": 1.0203500986099243, + "learning_rate": 1.79005911974085e-05, + "loss": 0.5724, + "step": 4260 + }, + { + "epoch": 0.7031555546407031, + "grad_norm": 1.290940761566162, + "learning_rate": 1.7889680730062137e-05, + "loss": 0.5783, + "step": 4270 + }, + { + "epoch": 0.7048022889607048, + "grad_norm": 1.156741976737976, + "learning_rate": 1.7878745328962696e-05, + "loss": 0.6373, + "step": 4280 + }, + { + "epoch": 0.7064490232807065, + "grad_norm": 1.3677122592926025, + "learning_rate": 1.7867785028669308e-05, + "loss": 0.6251, + "step": 4290 + }, + { + "epoch": 0.7080957576007081, + "grad_norm": 1.15171217918396, + "learning_rate": 1.7856799863819814e-05, + "loss": 0.6389, + "step": 4300 + }, + { + "epoch": 0.7097424919207097, + "grad_norm": 1.243003249168396, + "learning_rate": 1.784578986913062e-05, + "loss": 0.6155, + "step": 4310 + }, + { + "epoch": 0.7113892262407114, + "grad_norm": 1.0966947078704834, + "learning_rate": 1.7834755079396604e-05, + "loss": 0.5783, + "step": 4320 + }, + { + "epoch": 0.7130359605607131, + "grad_norm": 1.2809836864471436, + "learning_rate": 1.782369552949101e-05, + "loss": 0.6119, + "step": 4330 + }, + { + "epoch": 0.7146826948807147, + "grad_norm": 1.148653507232666, + "learning_rate": 1.781261125436532e-05, + "loss": 0.6417, + "step": 4340 + }, + { + "epoch": 0.7163294292007163, + "grad_norm": 1.11336088180542, + "learning_rate": 1.780150228904916e-05, + "loss": 0.6154, + "step": 4350 + }, + { + "epoch": 0.717976163520718, + "grad_norm": 1.1516305208206177, + "learning_rate": 1.779036866865019e-05, + "loss": 0.601, + "step": 4360 + }, + { + "epoch": 0.7196228978407196, + "grad_norm": 1.2385212182998657, + "learning_rate": 1.777921042835399e-05, + "loss": 0.6114, + "step": 4370 + }, + { + "epoch": 0.7212696321607213, + "grad_norm": 1.259101152420044, + "learning_rate": 1.776802760342393e-05, + "loss": 0.5704, + "step": 4380 + }, + { + "epoch": 0.7229163664807229, + "grad_norm": 1.3446874618530273, + "learning_rate": 1.7756820229201092e-05, + "loss": 0.5561, + "step": 4390 + }, + { + "epoch": 0.7245631008007246, + "grad_norm": 1.4307595491409302, + "learning_rate": 1.7745588341104127e-05, + "loss": 0.5923, + "step": 4400 + }, + { + "epoch": 0.7262098351207262, + "grad_norm": 1.4261305332183838, + "learning_rate": 1.7734331974629166e-05, + "loss": 0.6218, + "step": 4410 + }, + { + "epoch": 0.7278565694407279, + "grad_norm": 1.342528223991394, + "learning_rate": 1.77230511653497e-05, + "loss": 0.6146, + "step": 4420 + }, + { + "epoch": 0.7295033037607295, + "grad_norm": 1.4735767841339111, + "learning_rate": 1.7711745948916464e-05, + "loss": 0.6167, + "step": 4430 + }, + { + "epoch": 0.7311500380807312, + "grad_norm": 1.2360775470733643, + "learning_rate": 1.7700416361057322e-05, + "loss": 0.58, + "step": 4440 + }, + { + "epoch": 0.7327967724007328, + "grad_norm": 1.2893751859664917, + "learning_rate": 1.7689062437577165e-05, + "loss": 0.5934, + "step": 4450 + }, + { + "epoch": 0.7344435067207344, + "grad_norm": 1.6743543148040771, + "learning_rate": 1.7677684214357793e-05, + "loss": 0.5796, + "step": 4460 + }, + { + "epoch": 0.7360902410407361, + "grad_norm": 1.218066692352295, + "learning_rate": 1.7666281727357792e-05, + "loss": 0.5908, + "step": 4470 + }, + { + "epoch": 0.7377369753607378, + "grad_norm": 1.4119192361831665, + "learning_rate": 1.7654855012612442e-05, + "loss": 0.6163, + "step": 4480 + }, + { + "epoch": 0.7393837096807394, + "grad_norm": 1.4504175186157227, + "learning_rate": 1.7643404106233573e-05, + "loss": 0.6091, + "step": 4490 + }, + { + "epoch": 0.741030444000741, + "grad_norm": 1.2811555862426758, + "learning_rate": 1.763192904440949e-05, + "loss": 0.5846, + "step": 4500 + }, + { + "epoch": 0.7426771783207426, + "grad_norm": 1.9486907720565796, + "learning_rate": 1.762042986340481e-05, + "loss": 0.6064, + "step": 4510 + }, + { + "epoch": 0.7443239126407444, + "grad_norm": 1.731048822402954, + "learning_rate": 1.76089065995604e-05, + "loss": 0.6002, + "step": 4520 + }, + { + "epoch": 0.745970646960746, + "grad_norm": 1.3845677375793457, + "learning_rate": 1.7597359289293213e-05, + "loss": 0.5834, + "step": 4530 + }, + { + "epoch": 0.7476173812807476, + "grad_norm": 2.7521491050720215, + "learning_rate": 1.758578796909621e-05, + "loss": 0.6001, + "step": 4540 + }, + { + "epoch": 0.7492641156007492, + "grad_norm": 2.141174554824829, + "learning_rate": 1.7574192675538232e-05, + "loss": 0.5961, + "step": 4550 + }, + { + "epoch": 0.7509108499207509, + "grad_norm": 2.0954525470733643, + "learning_rate": 1.756257344526387e-05, + "loss": 0.5862, + "step": 4560 + }, + { + "epoch": 0.7525575842407526, + "grad_norm": 1.395780324935913, + "learning_rate": 1.755093031499338e-05, + "loss": 0.5753, + "step": 4570 + }, + { + "epoch": 0.7542043185607542, + "grad_norm": 1.6621346473693848, + "learning_rate": 1.7539263321522528e-05, + "loss": 0.608, + "step": 4580 + }, + { + "epoch": 0.7558510528807558, + "grad_norm": 1.2174135446548462, + "learning_rate": 1.7527572501722516e-05, + "loss": 0.5927, + "step": 4590 + }, + { + "epoch": 0.7574977872007574, + "grad_norm": 2.0868325233459473, + "learning_rate": 1.7515857892539828e-05, + "loss": 0.5783, + "step": 4600 + }, + { + "epoch": 0.7591445215207592, + "grad_norm": 1.0850328207015991, + "learning_rate": 1.7504119530996138e-05, + "loss": 0.6089, + "step": 4610 + }, + { + "epoch": 0.7607912558407608, + "grad_norm": 0.910921573638916, + "learning_rate": 1.749235745418818e-05, + "loss": 0.5967, + "step": 4620 + }, + { + "epoch": 0.7624379901607624, + "grad_norm": 1.7855496406555176, + "learning_rate": 1.7480571699287647e-05, + "loss": 0.5669, + "step": 4630 + }, + { + "epoch": 0.764084724480764, + "grad_norm": 1.3511872291564941, + "learning_rate": 1.7468762303541044e-05, + "loss": 0.6084, + "step": 4640 + }, + { + "epoch": 0.7657314588007658, + "grad_norm": 1.2458902597427368, + "learning_rate": 1.7456929304269598e-05, + "loss": 0.5668, + "step": 4650 + }, + { + "epoch": 0.7673781931207674, + "grad_norm": 1.1821246147155762, + "learning_rate": 1.7445072738869134e-05, + "loss": 0.6231, + "step": 4660 + }, + { + "epoch": 0.769024927440769, + "grad_norm": 1.6277104616165161, + "learning_rate": 1.7433192644809942e-05, + "loss": 0.6051, + "step": 4670 + }, + { + "epoch": 0.7706716617607706, + "grad_norm": 1.184881329536438, + "learning_rate": 1.742128905963668e-05, + "loss": 0.6102, + "step": 4680 + }, + { + "epoch": 0.7723183960807724, + "grad_norm": 1.4022691249847412, + "learning_rate": 1.7409362020968242e-05, + "loss": 0.6079, + "step": 4690 + }, + { + "epoch": 0.773965130400774, + "grad_norm": 1.271551251411438, + "learning_rate": 1.7397411566497638e-05, + "loss": 0.6041, + "step": 4700 + }, + { + "epoch": 0.7756118647207756, + "grad_norm": 1.211241364479065, + "learning_rate": 1.738543773399188e-05, + "loss": 0.5973, + "step": 4710 + }, + { + "epoch": 0.7772585990407772, + "grad_norm": 1.336120367050171, + "learning_rate": 1.737344056129187e-05, + "loss": 0.6225, + "step": 4720 + }, + { + "epoch": 0.7789053333607789, + "grad_norm": 1.2912676334381104, + "learning_rate": 1.7361420086312255e-05, + "loss": 0.5732, + "step": 4730 + }, + { + "epoch": 0.7805520676807806, + "grad_norm": 1.1799460649490356, + "learning_rate": 1.7349376347041346e-05, + "loss": 0.5675, + "step": 4740 + }, + { + "epoch": 0.7821988020007822, + "grad_norm": 1.8032108545303345, + "learning_rate": 1.7337309381540955e-05, + "loss": 0.5871, + "step": 4750 + }, + { + "epoch": 0.7838455363207838, + "grad_norm": 1.2031786441802979, + "learning_rate": 1.7325219227946314e-05, + "loss": 0.5934, + "step": 4760 + }, + { + "epoch": 0.7854922706407855, + "grad_norm": 1.412174105644226, + "learning_rate": 1.7313105924465923e-05, + "loss": 0.5923, + "step": 4770 + }, + { + "epoch": 0.7871390049607871, + "grad_norm": 1.4549241065979004, + "learning_rate": 1.7300969509381448e-05, + "loss": 0.5747, + "step": 4780 + }, + { + "epoch": 0.7887857392807888, + "grad_norm": 1.831076979637146, + "learning_rate": 1.7288810021047597e-05, + "loss": 0.6102, + "step": 4790 + }, + { + "epoch": 0.7904324736007904, + "grad_norm": 1.530872106552124, + "learning_rate": 1.7276627497891984e-05, + "loss": 0.5789, + "step": 4800 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 1.1748154163360596, + "learning_rate": 1.726442197841504e-05, + "loss": 0.5854, + "step": 4810 + }, + { + "epoch": 0.7937259422407937, + "grad_norm": 1.1575807332992554, + "learning_rate": 1.7252193501189857e-05, + "loss": 0.5712, + "step": 4820 + }, + { + "epoch": 0.7953726765607954, + "grad_norm": 1.2026344537734985, + "learning_rate": 1.723994210486208e-05, + "loss": 0.5471, + "step": 4830 + }, + { + "epoch": 0.797019410880797, + "grad_norm": 1.4718669652938843, + "learning_rate": 1.7227667828149795e-05, + "loss": 0.6096, + "step": 4840 + }, + { + "epoch": 0.7986661452007987, + "grad_norm": 1.6093403100967407, + "learning_rate": 1.721537070984339e-05, + "loss": 0.5773, + "step": 4850 + }, + { + "epoch": 0.8003128795208003, + "grad_norm": 1.2670097351074219, + "learning_rate": 1.720305078880544e-05, + "loss": 0.5763, + "step": 4860 + }, + { + "epoch": 0.8019596138408019, + "grad_norm": 1.2672990560531616, + "learning_rate": 1.719070810397058e-05, + "loss": 0.5859, + "step": 4870 + }, + { + "epoch": 0.8036063481608036, + "grad_norm": 1.4210219383239746, + "learning_rate": 1.7178342694345395e-05, + "loss": 0.5781, + "step": 4880 + }, + { + "epoch": 0.8052530824808053, + "grad_norm": 1.1239454746246338, + "learning_rate": 1.7165954599008275e-05, + "loss": 0.5778, + "step": 4890 + }, + { + "epoch": 0.8068998168008069, + "grad_norm": 1.0624126195907593, + "learning_rate": 1.7153543857109314e-05, + "loss": 0.5804, + "step": 4900 + }, + { + "epoch": 0.8085465511208085, + "grad_norm": 1.0993456840515137, + "learning_rate": 1.7141110507870172e-05, + "loss": 0.5778, + "step": 4910 + }, + { + "epoch": 0.8101932854408102, + "grad_norm": 1.2814329862594604, + "learning_rate": 1.7128654590583953e-05, + "loss": 0.5687, + "step": 4920 + }, + { + "epoch": 0.8118400197608119, + "grad_norm": 1.2554537057876587, + "learning_rate": 1.7116176144615085e-05, + "loss": 0.5967, + "step": 4930 + }, + { + "epoch": 0.8134867540808135, + "grad_norm": 1.1652063131332397, + "learning_rate": 1.7103675209399194e-05, + "loss": 0.5907, + "step": 4940 + }, + { + "epoch": 0.8151334884008151, + "grad_norm": 1.1235425472259521, + "learning_rate": 1.7091151824442978e-05, + "loss": 0.6217, + "step": 4950 + }, + { + "epoch": 0.8167802227208167, + "grad_norm": 1.1538571119308472, + "learning_rate": 1.707860602932408e-05, + "loss": 0.5807, + "step": 4960 + }, + { + "epoch": 0.8184269570408185, + "grad_norm": 1.3833861351013184, + "learning_rate": 1.7066037863690975e-05, + "loss": 0.5916, + "step": 4970 + }, + { + "epoch": 0.8200736913608201, + "grad_norm": 1.847043752670288, + "learning_rate": 1.7053447367262817e-05, + "loss": 0.5662, + "step": 4980 + }, + { + "epoch": 0.8217204256808217, + "grad_norm": 1.4586071968078613, + "learning_rate": 1.7040834579829358e-05, + "loss": 0.5624, + "step": 4990 + }, + { + "epoch": 0.8233671600008233, + "grad_norm": 1.2225836515426636, + "learning_rate": 1.702819954125077e-05, + "loss": 0.5864, + "step": 5000 + }, + { + "epoch": 0.8250138943208251, + "grad_norm": 1.430363655090332, + "learning_rate": 1.7015542291457567e-05, + "loss": 0.5816, + "step": 5010 + }, + { + "epoch": 0.8266606286408267, + "grad_norm": 1.2088559865951538, + "learning_rate": 1.7002862870450446e-05, + "loss": 0.5924, + "step": 5020 + }, + { + "epoch": 0.8283073629608283, + "grad_norm": 1.0974377393722534, + "learning_rate": 1.6990161318300167e-05, + "loss": 0.556, + "step": 5030 + }, + { + "epoch": 0.8299540972808299, + "grad_norm": 1.475848913192749, + "learning_rate": 1.697743767514744e-05, + "loss": 0.5711, + "step": 5040 + }, + { + "epoch": 0.8316008316008316, + "grad_norm": 1.3842908143997192, + "learning_rate": 1.696469198120279e-05, + "loss": 0.5931, + "step": 5050 + }, + { + "epoch": 0.8332475659208333, + "grad_norm": 1.4212431907653809, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.5905, + "step": 5060 + }, + { + "epoch": 0.8348943002408349, + "grad_norm": 1.4288952350616455, + "learning_rate": 1.693913460212811e-05, + "loss": 0.5891, + "step": 5070 + }, + { + "epoch": 0.8365410345608365, + "grad_norm": 1.2996931076049805, + "learning_rate": 1.6926322997767045e-05, + "loss": 0.5876, + "step": 5080 + }, + { + "epoch": 0.8381877688808382, + "grad_norm": 1.6933460235595703, + "learning_rate": 1.6913489504151743e-05, + "loss": 0.5642, + "step": 5090 + }, + { + "epoch": 0.8398345032008399, + "grad_norm": 1.210008144378662, + "learning_rate": 1.690063416183988e-05, + "loss": 0.5543, + "step": 5100 + }, + { + "epoch": 0.8414812375208415, + "grad_norm": 1.4546929597854614, + "learning_rate": 1.6887757011458184e-05, + "loss": 0.5843, + "step": 5110 + }, + { + "epoch": 0.8431279718408431, + "grad_norm": 1.6936150789260864, + "learning_rate": 1.687485809370231e-05, + "loss": 0.5882, + "step": 5120 + }, + { + "epoch": 0.8447747061608448, + "grad_norm": 1.3216242790222168, + "learning_rate": 1.6861937449336697e-05, + "loss": 0.5567, + "step": 5130 + }, + { + "epoch": 0.8464214404808464, + "grad_norm": 1.691382884979248, + "learning_rate": 1.6848995119194453e-05, + "loss": 0.559, + "step": 5140 + }, + { + "epoch": 0.8480681748008481, + "grad_norm": 1.0977565050125122, + "learning_rate": 1.683603114417721e-05, + "loss": 0.5877, + "step": 5150 + }, + { + "epoch": 0.8497149091208497, + "grad_norm": 1.1555675268173218, + "learning_rate": 1.6823045565255018e-05, + "loss": 0.5665, + "step": 5160 + }, + { + "epoch": 0.8513616434408514, + "grad_norm": 1.320023775100708, + "learning_rate": 1.6810038423466184e-05, + "loss": 0.5939, + "step": 5170 + }, + { + "epoch": 0.853008377760853, + "grad_norm": 1.6562938690185547, + "learning_rate": 1.6797009759917177e-05, + "loss": 0.5441, + "step": 5180 + }, + { + "epoch": 0.8546551120808547, + "grad_norm": 1.2160990238189697, + "learning_rate": 1.678395961578247e-05, + "loss": 0.5647, + "step": 5190 + }, + { + "epoch": 0.8563018464008563, + "grad_norm": 1.3416587114334106, + "learning_rate": 1.6770888032304437e-05, + "loss": 0.5696, + "step": 5200 + }, + { + "epoch": 0.857948580720858, + "grad_norm": 1.0583994388580322, + "learning_rate": 1.6757795050793175e-05, + "loss": 0.5509, + "step": 5210 + }, + { + "epoch": 0.8595953150408596, + "grad_norm": 1.2085150480270386, + "learning_rate": 1.674468071262644e-05, + "loss": 0.5405, + "step": 5220 + }, + { + "epoch": 0.8612420493608612, + "grad_norm": 1.6761043071746826, + "learning_rate": 1.6731545059249467e-05, + "loss": 0.5666, + "step": 5230 + }, + { + "epoch": 0.8628887836808629, + "grad_norm": 1.3844279050827026, + "learning_rate": 1.671838813217485e-05, + "loss": 0.5458, + "step": 5240 + }, + { + "epoch": 0.8645355180008646, + "grad_norm": 2.4434731006622314, + "learning_rate": 1.670520997298241e-05, + "loss": 0.5734, + "step": 5250 + }, + { + "epoch": 0.8661822523208662, + "grad_norm": 1.2524642944335938, + "learning_rate": 1.6692010623319087e-05, + "loss": 0.5698, + "step": 5260 + }, + { + "epoch": 0.8678289866408678, + "grad_norm": 1.8466917276382446, + "learning_rate": 1.667879012489877e-05, + "loss": 0.5698, + "step": 5270 + }, + { + "epoch": 0.8694757209608694, + "grad_norm": 1.3722879886627197, + "learning_rate": 1.6665548519502196e-05, + "loss": 0.5402, + "step": 5280 + }, + { + "epoch": 0.8711224552808712, + "grad_norm": 1.4703186750411987, + "learning_rate": 1.66522858489768e-05, + "loss": 0.5886, + "step": 5290 + }, + { + "epoch": 0.8727691896008728, + "grad_norm": 1.5705962181091309, + "learning_rate": 1.663900215523659e-05, + "loss": 0.6006, + "step": 5300 + }, + { + "epoch": 0.8744159239208744, + "grad_norm": 1.7487270832061768, + "learning_rate": 1.662569748026202e-05, + "loss": 0.5421, + "step": 5310 + }, + { + "epoch": 0.876062658240876, + "grad_norm": 1.986843228340149, + "learning_rate": 1.661237186609984e-05, + "loss": 0.5275, + "step": 5320 + }, + { + "epoch": 0.8777093925608777, + "grad_norm": 1.661085844039917, + "learning_rate": 1.6599025354862983e-05, + "loss": 0.5491, + "step": 5330 + }, + { + "epoch": 0.8793561268808794, + "grad_norm": 1.975029706954956, + "learning_rate": 1.6585657988730424e-05, + "loss": 0.5743, + "step": 5340 + }, + { + "epoch": 0.881002861200881, + "grad_norm": 1.8658064603805542, + "learning_rate": 1.6572269809947035e-05, + "loss": 0.5627, + "step": 5350 + }, + { + "epoch": 0.8826495955208826, + "grad_norm": 1.540292739868164, + "learning_rate": 1.6558860860823473e-05, + "loss": 0.5227, + "step": 5360 + }, + { + "epoch": 0.8842963298408842, + "grad_norm": 1.792221188545227, + "learning_rate": 1.654543118373603e-05, + "loss": 0.5766, + "step": 5370 + }, + { + "epoch": 0.885943064160886, + "grad_norm": 1.3535420894622803, + "learning_rate": 1.6531980821126508e-05, + "loss": 0.5803, + "step": 5380 + }, + { + "epoch": 0.8875897984808876, + "grad_norm": 1.7461328506469727, + "learning_rate": 1.651850981550208e-05, + "loss": 0.5511, + "step": 5390 + }, + { + "epoch": 0.8892365328008892, + "grad_norm": 1.6010706424713135, + "learning_rate": 1.6505018209435152e-05, + "loss": 0.5656, + "step": 5400 + }, + { + "epoch": 0.8908832671208908, + "grad_norm": 1.3865113258361816, + "learning_rate": 1.649150604556324e-05, + "loss": 0.57, + "step": 5410 + }, + { + "epoch": 0.8925300014408926, + "grad_norm": 1.609237790107727, + "learning_rate": 1.6477973366588833e-05, + "loss": 0.5566, + "step": 5420 + }, + { + "epoch": 0.8941767357608942, + "grad_norm": 2.0316720008850098, + "learning_rate": 1.6464420215279237e-05, + "loss": 0.5601, + "step": 5430 + }, + { + "epoch": 0.8958234700808958, + "grad_norm": 1.759619951248169, + "learning_rate": 1.6450846634466476e-05, + "loss": 0.5665, + "step": 5440 + }, + { + "epoch": 0.8974702044008974, + "grad_norm": 1.5108795166015625, + "learning_rate": 1.643725266704713e-05, + "loss": 0.5432, + "step": 5450 + }, + { + "epoch": 0.8991169387208992, + "grad_norm": 1.3208014965057373, + "learning_rate": 1.6423638355982202e-05, + "loss": 0.5867, + "step": 5460 + }, + { + "epoch": 0.9007636730409008, + "grad_norm": 1.530247688293457, + "learning_rate": 1.6410003744296984e-05, + "loss": 0.5721, + "step": 5470 + }, + { + "epoch": 0.9024104073609024, + "grad_norm": 2.346730947494507, + "learning_rate": 1.6396348875080945e-05, + "loss": 0.5569, + "step": 5480 + }, + { + "epoch": 0.904057141680904, + "grad_norm": 1.4136897325515747, + "learning_rate": 1.638267379148755e-05, + "loss": 0.5876, + "step": 5490 + }, + { + "epoch": 0.9057038760009057, + "grad_norm": 1.5225974321365356, + "learning_rate": 1.6368978536734162e-05, + "loss": 0.57, + "step": 5500 + }, + { + "epoch": 0.9073506103209074, + "grad_norm": 1.5766575336456299, + "learning_rate": 1.6355263154101884e-05, + "loss": 0.5282, + "step": 5510 + }, + { + "epoch": 0.908997344640909, + "grad_norm": 1.784691333770752, + "learning_rate": 1.634152768693543e-05, + "loss": 0.5805, + "step": 5520 + }, + { + "epoch": 0.9106440789609106, + "grad_norm": 1.479247808456421, + "learning_rate": 1.6327772178642986e-05, + "loss": 0.5525, + "step": 5530 + }, + { + "epoch": 0.9122908132809123, + "grad_norm": 1.4190905094146729, + "learning_rate": 1.6313996672696083e-05, + "loss": 0.5598, + "step": 5540 + }, + { + "epoch": 0.913937547600914, + "grad_norm": 1.5564013719558716, + "learning_rate": 1.6300201212629437e-05, + "loss": 0.5521, + "step": 5550 + }, + { + "epoch": 0.9155842819209156, + "grad_norm": 1.3574228286743164, + "learning_rate": 1.6286385842040843e-05, + "loss": 0.55, + "step": 5560 + }, + { + "epoch": 0.9172310162409172, + "grad_norm": 1.4063721895217896, + "learning_rate": 1.6272550604590993e-05, + "loss": 0.5731, + "step": 5570 + }, + { + "epoch": 0.9188777505609189, + "grad_norm": 1.4861043691635132, + "learning_rate": 1.625869554400339e-05, + "loss": 0.5248, + "step": 5580 + }, + { + "epoch": 0.9205244848809205, + "grad_norm": 1.3798218965530396, + "learning_rate": 1.624482070406417e-05, + "loss": 0.5626, + "step": 5590 + }, + { + "epoch": 0.9221712192009222, + "grad_norm": 1.3654139041900635, + "learning_rate": 1.623092612862198e-05, + "loss": 0.5273, + "step": 5600 + }, + { + "epoch": 0.9238179535209238, + "grad_norm": 1.593929648399353, + "learning_rate": 1.6217011861587834e-05, + "loss": 0.5609, + "step": 5610 + }, + { + "epoch": 0.9254646878409255, + "grad_norm": 1.439780592918396, + "learning_rate": 1.6203077946934992e-05, + "loss": 0.5699, + "step": 5620 + }, + { + "epoch": 0.9271114221609271, + "grad_norm": 2.25289249420166, + "learning_rate": 1.6189124428698787e-05, + "loss": 0.5472, + "step": 5630 + }, + { + "epoch": 0.9287581564809287, + "grad_norm": 1.3383618593215942, + "learning_rate": 1.6175151350976518e-05, + "loss": 0.5551, + "step": 5640 + }, + { + "epoch": 0.9304048908009304, + "grad_norm": 1.414027452468872, + "learning_rate": 1.6161158757927292e-05, + "loss": 0.5793, + "step": 5650 + }, + { + "epoch": 0.9320516251209321, + "grad_norm": 1.126561164855957, + "learning_rate": 1.6147146693771896e-05, + "loss": 0.5704, + "step": 5660 + }, + { + "epoch": 0.9336983594409337, + "grad_norm": 1.2248793840408325, + "learning_rate": 1.6133115202792645e-05, + "loss": 0.5614, + "step": 5670 + }, + { + "epoch": 0.9353450937609353, + "grad_norm": 1.4462449550628662, + "learning_rate": 1.6119064329333248e-05, + "loss": 0.5544, + "step": 5680 + }, + { + "epoch": 0.936991828080937, + "grad_norm": 1.1664077043533325, + "learning_rate": 1.6104994117798674e-05, + "loss": 0.5596, + "step": 5690 + }, + { + "epoch": 0.9386385624009387, + "grad_norm": 1.239491581916809, + "learning_rate": 1.6090904612655007e-05, + "loss": 0.5636, + "step": 5700 + }, + { + "epoch": 0.9402852967209403, + "grad_norm": 1.3634086847305298, + "learning_rate": 1.6076795858429296e-05, + "loss": 0.558, + "step": 5710 + }, + { + "epoch": 0.9419320310409419, + "grad_norm": 1.319319725036621, + "learning_rate": 1.606266789970943e-05, + "loss": 0.5563, + "step": 5720 + }, + { + "epoch": 0.9435787653609435, + "grad_norm": 1.2931233644485474, + "learning_rate": 1.6048520781143988e-05, + "loss": 0.5646, + "step": 5730 + }, + { + "epoch": 0.9452254996809453, + "grad_norm": 1.3727686405181885, + "learning_rate": 1.6034354547442104e-05, + "loss": 0.5288, + "step": 5740 + }, + { + "epoch": 0.9468722340009469, + "grad_norm": 1.4657319784164429, + "learning_rate": 1.6020169243373313e-05, + "loss": 0.5605, + "step": 5750 + }, + { + "epoch": 0.9485189683209485, + "grad_norm": 1.3323101997375488, + "learning_rate": 1.600596491376742e-05, + "loss": 0.5575, + "step": 5760 + }, + { + "epoch": 0.9501657026409501, + "grad_norm": 1.4169237613677979, + "learning_rate": 1.5991741603514367e-05, + "loss": 0.5448, + "step": 5770 + }, + { + "epoch": 0.9518124369609519, + "grad_norm": 2.1234960556030273, + "learning_rate": 1.5977499357564067e-05, + "loss": 0.5453, + "step": 5780 + }, + { + "epoch": 0.9534591712809535, + "grad_norm": 1.3298665285110474, + "learning_rate": 1.596323822092628e-05, + "loss": 0.5205, + "step": 5790 + }, + { + "epoch": 0.9551059056009551, + "grad_norm": 1.7144644260406494, + "learning_rate": 1.594895823867047e-05, + "loss": 0.5333, + "step": 5800 + }, + { + "epoch": 0.9567526399209567, + "grad_norm": 1.2539159059524536, + "learning_rate": 1.5934659455925658e-05, + "loss": 0.552, + "step": 5810 + }, + { + "epoch": 0.9583993742409584, + "grad_norm": 1.7403203248977661, + "learning_rate": 1.5920341917880277e-05, + "loss": 0.5166, + "step": 5820 + }, + { + "epoch": 0.9600461085609601, + "grad_norm": 1.802240252494812, + "learning_rate": 1.5906005669782027e-05, + "loss": 0.5392, + "step": 5830 + }, + { + "epoch": 0.9616928428809617, + "grad_norm": 1.3411508798599243, + "learning_rate": 1.5891650756937755e-05, + "loss": 0.5621, + "step": 5840 + }, + { + "epoch": 0.9633395772009633, + "grad_norm": 1.388187289237976, + "learning_rate": 1.587727722471327e-05, + "loss": 0.5331, + "step": 5850 + }, + { + "epoch": 0.964986311520965, + "grad_norm": 1.2933506965637207, + "learning_rate": 1.5862885118533244e-05, + "loss": 0.5449, + "step": 5860 + }, + { + "epoch": 0.9666330458409667, + "grad_norm": 1.637378454208374, + "learning_rate": 1.5848474483881044e-05, + "loss": 0.5586, + "step": 5870 + }, + { + "epoch": 0.9682797801609683, + "grad_norm": 1.2650068998336792, + "learning_rate": 1.5834045366298593e-05, + "loss": 0.5504, + "step": 5880 + }, + { + "epoch": 0.9699265144809699, + "grad_norm": 1.3587515354156494, + "learning_rate": 1.5819597811386208e-05, + "loss": 0.5494, + "step": 5890 + }, + { + "epoch": 0.9715732488009716, + "grad_norm": 1.1852201223373413, + "learning_rate": 1.5805131864802496e-05, + "loss": 0.5639, + "step": 5900 + }, + { + "epoch": 0.9732199831209732, + "grad_norm": 1.3955892324447632, + "learning_rate": 1.579064757226418e-05, + "loss": 0.5651, + "step": 5910 + }, + { + "epoch": 0.9748667174409749, + "grad_norm": 1.284073829650879, + "learning_rate": 1.5776144979545963e-05, + "loss": 0.5304, + "step": 5920 + }, + { + "epoch": 0.9765134517609765, + "grad_norm": 1.7228704690933228, + "learning_rate": 1.5761624132480372e-05, + "loss": 0.5762, + "step": 5930 + }, + { + "epoch": 0.9781601860809782, + "grad_norm": 1.9546711444854736, + "learning_rate": 1.5747085076957635e-05, + "loss": 0.5715, + "step": 5940 + }, + { + "epoch": 0.9798069204009798, + "grad_norm": 1.2742135524749756, + "learning_rate": 1.5732527858925523e-05, + "loss": 0.5501, + "step": 5950 + }, + { + "epoch": 0.9814536547209814, + "grad_norm": 1.293632984161377, + "learning_rate": 1.57179525243892e-05, + "loss": 0.5593, + "step": 5960 + }, + { + "epoch": 0.9831003890409831, + "grad_norm": 1.244485855102539, + "learning_rate": 1.5703359119411087e-05, + "loss": 0.5229, + "step": 5970 + }, + { + "epoch": 0.9847471233609848, + "grad_norm": 1.1649023294448853, + "learning_rate": 1.5688747690110708e-05, + "loss": 0.5411, + "step": 5980 + }, + { + "epoch": 0.9863938576809864, + "grad_norm": 1.1061643362045288, + "learning_rate": 1.5674118282664563e-05, + "loss": 0.5528, + "step": 5990 + }, + { + "epoch": 0.988040592000988, + "grad_norm": 1.6961778402328491, + "learning_rate": 1.5659470943305956e-05, + "loss": 0.5484, + "step": 6000 + }, + { + "epoch": 0.9896873263209897, + "grad_norm": 1.4628689289093018, + "learning_rate": 1.5644805718324854e-05, + "loss": 0.5446, + "step": 6010 + }, + { + "epoch": 0.9913340606409914, + "grad_norm": 1.3264354467391968, + "learning_rate": 1.5630122654067778e-05, + "loss": 0.5684, + "step": 6020 + }, + { + "epoch": 0.992980794960993, + "grad_norm": 1.5602631568908691, + "learning_rate": 1.5615421796937593e-05, + "loss": 0.5514, + "step": 6030 + }, + { + "epoch": 0.9946275292809946, + "grad_norm": 2.169344663619995, + "learning_rate": 1.560070319339341e-05, + "loss": 0.5681, + "step": 6040 + }, + { + "epoch": 0.9962742636009962, + "grad_norm": 1.190785527229309, + "learning_rate": 1.5585966889950423e-05, + "loss": 0.5266, + "step": 6050 + }, + { + "epoch": 0.997920997920998, + "grad_norm": 1.3038309812545776, + "learning_rate": 1.5571212933179766e-05, + "loss": 0.5326, + "step": 6060 + }, + { + "epoch": 0.9995677322409996, + "grad_norm": 1.3445013761520386, + "learning_rate": 1.5556441369708358e-05, + "loss": 0.5473, + "step": 6070 + }, + { + "epoch": 1.0013173874560013, + "grad_norm": 1.4349257946014404, + "learning_rate": 1.554165224621876e-05, + "loss": 0.5755, + "step": 6080 + }, + { + "epoch": 1.002964121776003, + "grad_norm": 1.7247047424316406, + "learning_rate": 1.552684560944903e-05, + "loss": 0.5148, + "step": 6090 + }, + { + "epoch": 1.0046108560960045, + "grad_norm": 1.518079161643982, + "learning_rate": 1.5512021506192575e-05, + "loss": 0.5156, + "step": 6100 + }, + { + "epoch": 1.0062575904160063, + "grad_norm": 1.1310617923736572, + "learning_rate": 1.5497179983297998e-05, + "loss": 0.4851, + "step": 6110 + }, + { + "epoch": 1.007904324736008, + "grad_norm": 1.521835207939148, + "learning_rate": 1.5482321087668954e-05, + "loss": 0.507, + "step": 6120 + }, + { + "epoch": 1.0095510590560095, + "grad_norm": 1.224595308303833, + "learning_rate": 1.5467444866264e-05, + "loss": 0.5086, + "step": 6130 + }, + { + "epoch": 1.0111977933760112, + "grad_norm": 1.4510592222213745, + "learning_rate": 1.5452551366096457e-05, + "loss": 0.4839, + "step": 6140 + }, + { + "epoch": 1.0128445276960127, + "grad_norm": 1.4431626796722412, + "learning_rate": 1.5437640634234234e-05, + "loss": 0.4992, + "step": 6150 + }, + { + "epoch": 1.0144912620160145, + "grad_norm": 1.131432056427002, + "learning_rate": 1.5422712717799714e-05, + "loss": 0.5107, + "step": 6160 + }, + { + "epoch": 1.0161379963360162, + "grad_norm": 1.4015928506851196, + "learning_rate": 1.5407767663969576e-05, + "loss": 0.5125, + "step": 6170 + }, + { + "epoch": 1.0177847306560177, + "grad_norm": 1.4604285955429077, + "learning_rate": 1.5392805519974678e-05, + "loss": 0.5076, + "step": 6180 + }, + { + "epoch": 1.0194314649760194, + "grad_norm": 1.2894060611724854, + "learning_rate": 1.5377826333099855e-05, + "loss": 0.51, + "step": 6190 + }, + { + "epoch": 1.0210781992960212, + "grad_norm": 1.5654339790344238, + "learning_rate": 1.5362830150683838e-05, + "loss": 0.4946, + "step": 6200 + }, + { + "epoch": 1.0227249336160227, + "grad_norm": 1.7387830018997192, + "learning_rate": 1.5347817020119044e-05, + "loss": 0.5302, + "step": 6210 + }, + { + "epoch": 1.0243716679360244, + "grad_norm": 1.534555196762085, + "learning_rate": 1.5332786988851462e-05, + "loss": 0.5191, + "step": 6220 + }, + { + "epoch": 1.026018402256026, + "grad_norm": 1.6992589235305786, + "learning_rate": 1.531774010438049e-05, + "loss": 0.5076, + "step": 6230 + }, + { + "epoch": 1.0276651365760276, + "grad_norm": 1.5963928699493408, + "learning_rate": 1.530267641425879e-05, + "loss": 0.4864, + "step": 6240 + }, + { + "epoch": 1.0293118708960294, + "grad_norm": 1.390317678451538, + "learning_rate": 1.528759596609213e-05, + "loss": 0.5072, + "step": 6250 + }, + { + "epoch": 1.0309586052160309, + "grad_norm": 1.8305940628051758, + "learning_rate": 1.5272498807539248e-05, + "loss": 0.4892, + "step": 6260 + }, + { + "epoch": 1.0326053395360326, + "grad_norm": 4.596288204193115, + "learning_rate": 1.5257384986311671e-05, + "loss": 0.513, + "step": 6270 + }, + { + "epoch": 1.0342520738560343, + "grad_norm": 1.9707483053207397, + "learning_rate": 1.5242254550173612e-05, + "loss": 0.4815, + "step": 6280 + }, + { + "epoch": 1.0358988081760359, + "grad_norm": 1.657809853553772, + "learning_rate": 1.5227107546941772e-05, + "loss": 0.5244, + "step": 6290 + }, + { + "epoch": 1.0375455424960376, + "grad_norm": 1.530784010887146, + "learning_rate": 1.5211944024485216e-05, + "loss": 0.5041, + "step": 6300 + }, + { + "epoch": 1.039192276816039, + "grad_norm": 1.7985795736312866, + "learning_rate": 1.5196764030725217e-05, + "loss": 0.4922, + "step": 6310 + }, + { + "epoch": 1.0408390111360408, + "grad_norm": 1.5775318145751953, + "learning_rate": 1.5181567613635099e-05, + "loss": 0.5004, + "step": 6320 + }, + { + "epoch": 1.0424857454560426, + "grad_norm": 1.5690876245498657, + "learning_rate": 1.5166354821240093e-05, + "loss": 0.5047, + "step": 6330 + }, + { + "epoch": 1.044132479776044, + "grad_norm": 1.4452539682388306, + "learning_rate": 1.5151125701617168e-05, + "loss": 0.4845, + "step": 6340 + }, + { + "epoch": 1.0457792140960458, + "grad_norm": 2.1906886100769043, + "learning_rate": 1.513588030289491e-05, + "loss": 0.4827, + "step": 6350 + }, + { + "epoch": 1.0474259484160475, + "grad_norm": 2.8921637535095215, + "learning_rate": 1.5120618673253335e-05, + "loss": 0.5144, + "step": 6360 + }, + { + "epoch": 1.049072682736049, + "grad_norm": 2.0884976387023926, + "learning_rate": 1.510534086092377e-05, + "loss": 0.5055, + "step": 6370 + }, + { + "epoch": 1.0507194170560508, + "grad_norm": 2.065589189529419, + "learning_rate": 1.5090046914188672e-05, + "loss": 0.5281, + "step": 6380 + }, + { + "epoch": 1.0523661513760523, + "grad_norm": 2.384566068649292, + "learning_rate": 1.507473688138149e-05, + "loss": 0.5073, + "step": 6390 + }, + { + "epoch": 1.054012885696054, + "grad_norm": 2.007929801940918, + "learning_rate": 1.5059410810886515e-05, + "loss": 0.4977, + "step": 6400 + }, + { + "epoch": 1.0556596200160557, + "grad_norm": 1.7083964347839355, + "learning_rate": 1.5044068751138716e-05, + "loss": 0.5095, + "step": 6410 + }, + { + "epoch": 1.0573063543360572, + "grad_norm": 1.9435292482376099, + "learning_rate": 1.5028710750623592e-05, + "loss": 0.5173, + "step": 6420 + }, + { + "epoch": 1.058953088656059, + "grad_norm": 3.824756145477295, + "learning_rate": 1.5013336857877025e-05, + "loss": 0.482, + "step": 6430 + }, + { + "epoch": 1.0605998229760607, + "grad_norm": 2.91257381439209, + "learning_rate": 1.4997947121485119e-05, + "loss": 0.5235, + "step": 6440 + }, + { + "epoch": 1.0622465572960622, + "grad_norm": 1.5288364887237549, + "learning_rate": 1.4982541590084047e-05, + "loss": 0.4804, + "step": 6450 + }, + { + "epoch": 1.063893291616064, + "grad_norm": 1.5502305030822754, + "learning_rate": 1.4967120312359902e-05, + "loss": 0.516, + "step": 6460 + }, + { + "epoch": 1.0655400259360654, + "grad_norm": 1.62122642993927, + "learning_rate": 1.4951683337048536e-05, + "loss": 0.4812, + "step": 6470 + }, + { + "epoch": 1.0671867602560672, + "grad_norm": 1.508252739906311, + "learning_rate": 1.4936230712935416e-05, + "loss": 0.4897, + "step": 6480 + }, + { + "epoch": 1.068833494576069, + "grad_norm": 1.665816068649292, + "learning_rate": 1.4920762488855457e-05, + "loss": 0.4891, + "step": 6490 + }, + { + "epoch": 1.0704802288960704, + "grad_norm": 1.297685146331787, + "learning_rate": 1.490527871369288e-05, + "loss": 0.4643, + "step": 6500 + }, + { + "epoch": 1.071962289784072, + "grad_norm": 1.9492204189300537, + "learning_rate": 1.4889779436381046e-05, + "loss": 0.5146, + "step": 6510 + }, + { + "epoch": 1.0736090241040737, + "grad_norm": 1.655253291130066, + "learning_rate": 1.4874264705902319e-05, + "loss": 0.5261, + "step": 6520 + }, + { + "epoch": 1.0752557584240752, + "grad_norm": 1.1686067581176758, + "learning_rate": 1.4858734571287885e-05, + "loss": 0.5401, + "step": 6530 + }, + { + "epoch": 1.076902492744077, + "grad_norm": 1.5212386846542358, + "learning_rate": 1.4843189081617622e-05, + "loss": 0.5403, + "step": 6540 + }, + { + "epoch": 1.0785492270640786, + "grad_norm": 1.1677196025848389, + "learning_rate": 1.4827628286019928e-05, + "loss": 0.5261, + "step": 6550 + }, + { + "epoch": 1.0801959613840801, + "grad_norm": 1.2745987176895142, + "learning_rate": 1.4812052233671581e-05, + "loss": 0.5309, + "step": 6560 + }, + { + "epoch": 1.0818426957040819, + "grad_norm": 1.4600034952163696, + "learning_rate": 1.4796460973797566e-05, + "loss": 0.5168, + "step": 6570 + }, + { + "epoch": 1.0834894300240836, + "grad_norm": 1.8619006872177124, + "learning_rate": 1.4780854555670932e-05, + "loss": 0.5423, + "step": 6580 + }, + { + "epoch": 1.085136164344085, + "grad_norm": 1.418540120124817, + "learning_rate": 1.4765233028612633e-05, + "loss": 0.501, + "step": 6590 + }, + { + "epoch": 1.0867828986640868, + "grad_norm": 1.2168614864349365, + "learning_rate": 1.474959644199137e-05, + "loss": 0.5219, + "step": 6600 + }, + { + "epoch": 1.0884296329840883, + "grad_norm": 1.4315396547317505, + "learning_rate": 1.4733944845223441e-05, + "loss": 0.5274, + "step": 6610 + }, + { + "epoch": 1.09007636730409, + "grad_norm": 1.5139554738998413, + "learning_rate": 1.4718278287772574e-05, + "loss": 0.5181, + "step": 6620 + }, + { + "epoch": 1.0917231016240918, + "grad_norm": 1.6579830646514893, + "learning_rate": 1.4702596819149784e-05, + "loss": 0.5303, + "step": 6630 + }, + { + "epoch": 1.0933698359440933, + "grad_norm": 1.6588560342788696, + "learning_rate": 1.46869004889132e-05, + "loss": 0.4853, + "step": 6640 + }, + { + "epoch": 1.095016570264095, + "grad_norm": 1.925136923789978, + "learning_rate": 1.4671189346667933e-05, + "loss": 0.5222, + "step": 6650 + }, + { + "epoch": 1.0966633045840966, + "grad_norm": 1.1029921770095825, + "learning_rate": 1.4655463442065893e-05, + "loss": 0.4927, + "step": 6660 + }, + { + "epoch": 1.0983100389040983, + "grad_norm": 1.2987926006317139, + "learning_rate": 1.4639722824805644e-05, + "loss": 0.5328, + "step": 6670 + }, + { + "epoch": 1.0999567732241, + "grad_norm": 1.4294719696044922, + "learning_rate": 1.4623967544632252e-05, + "loss": 0.5172, + "step": 6680 + }, + { + "epoch": 1.1016035075441015, + "grad_norm": 1.1788091659545898, + "learning_rate": 1.4608197651337117e-05, + "loss": 0.5056, + "step": 6690 + }, + { + "epoch": 1.1032502418641033, + "grad_norm": 1.2116508483886719, + "learning_rate": 1.4592413194757826e-05, + "loss": 0.5182, + "step": 6700 + }, + { + "epoch": 1.104896976184105, + "grad_norm": 1.1964901685714722, + "learning_rate": 1.4576614224777982e-05, + "loss": 0.4942, + "step": 6710 + }, + { + "epoch": 1.1065437105041065, + "grad_norm": 1.5274699926376343, + "learning_rate": 1.4560800791327063e-05, + "loss": 0.5232, + "step": 6720 + }, + { + "epoch": 1.1081904448241082, + "grad_norm": 1.4706414937973022, + "learning_rate": 1.4544972944380256e-05, + "loss": 0.5432, + "step": 6730 + }, + { + "epoch": 1.10983717914411, + "grad_norm": 1.6734999418258667, + "learning_rate": 1.4529130733958292e-05, + "loss": 0.5161, + "step": 6740 + }, + { + "epoch": 1.1114839134641115, + "grad_norm": 1.2489820718765259, + "learning_rate": 1.4513274210127298e-05, + "loss": 0.5232, + "step": 6750 + }, + { + "epoch": 1.1131306477841132, + "grad_norm": 1.5743283033370972, + "learning_rate": 1.4497403422998634e-05, + "loss": 0.5028, + "step": 6760 + }, + { + "epoch": 1.1147773821041147, + "grad_norm": 1.1314998865127563, + "learning_rate": 1.448151842272875e-05, + "loss": 0.5087, + "step": 6770 + }, + { + "epoch": 1.1164241164241164, + "grad_norm": 1.446236491203308, + "learning_rate": 1.446561925951899e-05, + "loss": 0.5107, + "step": 6780 + }, + { + "epoch": 1.1180708507441182, + "grad_norm": 1.3917357921600342, + "learning_rate": 1.4449705983615474e-05, + "loss": 0.5119, + "step": 6790 + }, + { + "epoch": 1.1197175850641197, + "grad_norm": 1.290035605430603, + "learning_rate": 1.443377864530892e-05, + "loss": 0.522, + "step": 6800 + }, + { + "epoch": 1.1213643193841214, + "grad_norm": 1.2674860954284668, + "learning_rate": 1.4417837294934484e-05, + "loss": 0.5249, + "step": 6810 + }, + { + "epoch": 1.123011053704123, + "grad_norm": 1.2753984928131104, + "learning_rate": 1.4401881982871604e-05, + "loss": 0.4972, + "step": 6820 + }, + { + "epoch": 1.1246577880241246, + "grad_norm": 1.801187515258789, + "learning_rate": 1.4385912759543843e-05, + "loss": 0.5043, + "step": 6830 + }, + { + "epoch": 1.1263045223441264, + "grad_norm": 1.2739251852035522, + "learning_rate": 1.436992967541873e-05, + "loss": 0.509, + "step": 6840 + }, + { + "epoch": 1.1279512566641279, + "grad_norm": 1.319750189781189, + "learning_rate": 1.4353932781007594e-05, + "loss": 0.529, + "step": 6850 + }, + { + "epoch": 1.1295979909841296, + "grad_norm": 1.7615410089492798, + "learning_rate": 1.433792212686541e-05, + "loss": 0.5045, + "step": 6860 + }, + { + "epoch": 1.1312447253041313, + "grad_norm": 1.2809779644012451, + "learning_rate": 1.432189776359064e-05, + "loss": 0.4994, + "step": 6870 + }, + { + "epoch": 1.1328914596241328, + "grad_norm": 1.5949429273605347, + "learning_rate": 1.4305859741825068e-05, + "loss": 0.502, + "step": 6880 + }, + { + "epoch": 1.1345381939441346, + "grad_norm": 1.4932994842529297, + "learning_rate": 1.4289808112253643e-05, + "loss": 0.5026, + "step": 6890 + }, + { + "epoch": 1.1361849282641363, + "grad_norm": 1.708443284034729, + "learning_rate": 1.4273742925604322e-05, + "loss": 0.4747, + "step": 6900 + }, + { + "epoch": 1.1378316625841378, + "grad_norm": 1.2732783555984497, + "learning_rate": 1.4257664232647903e-05, + "loss": 0.5154, + "step": 6910 + }, + { + "epoch": 1.1394783969041395, + "grad_norm": 1.4677982330322266, + "learning_rate": 1.424157208419787e-05, + "loss": 0.4879, + "step": 6920 + }, + { + "epoch": 1.141125131224141, + "grad_norm": 1.3541361093521118, + "learning_rate": 1.422546653111023e-05, + "loss": 0.5129, + "step": 6930 + }, + { + "epoch": 1.1427718655441428, + "grad_norm": 1.359041690826416, + "learning_rate": 1.4209347624283352e-05, + "loss": 0.4885, + "step": 6940 + }, + { + "epoch": 1.1444185998641445, + "grad_norm": 1.3765989542007446, + "learning_rate": 1.4193215414657808e-05, + "loss": 0.543, + "step": 6950 + }, + { + "epoch": 1.146065334184146, + "grad_norm": 1.3113765716552734, + "learning_rate": 1.4177069953216207e-05, + "loss": 0.4778, + "step": 6960 + }, + { + "epoch": 1.1477120685041478, + "grad_norm": 1.1242080926895142, + "learning_rate": 1.4160911290983041e-05, + "loss": 0.4809, + "step": 6970 + }, + { + "epoch": 1.1493588028241493, + "grad_norm": 1.536487340927124, + "learning_rate": 1.4144739479024527e-05, + "loss": 0.4818, + "step": 6980 + }, + { + "epoch": 1.151005537144151, + "grad_norm": 1.2508544921875, + "learning_rate": 1.4128554568448425e-05, + "loss": 0.514, + "step": 6990 + }, + { + "epoch": 1.1526522714641527, + "grad_norm": 1.192643404006958, + "learning_rate": 1.4112356610403897e-05, + "loss": 0.4733, + "step": 7000 + }, + { + "epoch": 1.1542990057841542, + "grad_norm": 1.2769783735275269, + "learning_rate": 1.4096145656081347e-05, + "loss": 0.4891, + "step": 7010 + }, + { + "epoch": 1.155945740104156, + "grad_norm": 1.1201382875442505, + "learning_rate": 1.4079921756712238e-05, + "loss": 0.4824, + "step": 7020 + }, + { + "epoch": 1.1575924744241575, + "grad_norm": 1.3404284715652466, + "learning_rate": 1.4063684963568948e-05, + "loss": 0.5242, + "step": 7030 + }, + { + "epoch": 1.1592392087441592, + "grad_norm": 1.3030028343200684, + "learning_rate": 1.4047435327964609e-05, + "loss": 0.5023, + "step": 7040 + }, + { + "epoch": 1.160885943064161, + "grad_norm": 1.7895567417144775, + "learning_rate": 1.4031172901252931e-05, + "loss": 0.4866, + "step": 7050 + }, + { + "epoch": 1.1625326773841624, + "grad_norm": 1.2120519876480103, + "learning_rate": 1.4014897734828055e-05, + "loss": 0.4819, + "step": 7060 + }, + { + "epoch": 1.1641794117041642, + "grad_norm": 1.3895535469055176, + "learning_rate": 1.3998609880124373e-05, + "loss": 0.4955, + "step": 7070 + }, + { + "epoch": 1.165826146024166, + "grad_norm": 1.2869995832443237, + "learning_rate": 1.3982309388616385e-05, + "loss": 0.4857, + "step": 7080 + }, + { + "epoch": 1.1674728803441674, + "grad_norm": 1.2224863767623901, + "learning_rate": 1.3965996311818526e-05, + "loss": 0.5069, + "step": 7090 + }, + { + "epoch": 1.1691196146641691, + "grad_norm": 1.1428492069244385, + "learning_rate": 1.3949670701285e-05, + "loss": 0.4908, + "step": 7100 + }, + { + "epoch": 1.1707663489841709, + "grad_norm": 1.8123971223831177, + "learning_rate": 1.3933332608609624e-05, + "loss": 0.504, + "step": 7110 + }, + { + "epoch": 1.1724130833041724, + "grad_norm": 1.2169560194015503, + "learning_rate": 1.3916982085425663e-05, + "loss": 0.5015, + "step": 7120 + }, + { + "epoch": 1.174059817624174, + "grad_norm": 1.34042489528656, + "learning_rate": 1.3900619183405666e-05, + "loss": 0.5077, + "step": 7130 + }, + { + "epoch": 1.1757065519441756, + "grad_norm": 1.6218396425247192, + "learning_rate": 1.38842439542613e-05, + "loss": 0.5148, + "step": 7140 + }, + { + "epoch": 1.1773532862641773, + "grad_norm": 1.162052869796753, + "learning_rate": 1.3867856449743191e-05, + "loss": 0.4832, + "step": 7150 + }, + { + "epoch": 1.179000020584179, + "grad_norm": 1.2878142595291138, + "learning_rate": 1.3851456721640762e-05, + "loss": 0.5124, + "step": 7160 + }, + { + "epoch": 1.1806467549041806, + "grad_norm": 1.1885998249053955, + "learning_rate": 1.3835044821782063e-05, + "loss": 0.4829, + "step": 7170 + }, + { + "epoch": 1.1822934892241823, + "grad_norm": 1.176771879196167, + "learning_rate": 1.3818620802033609e-05, + "loss": 0.4881, + "step": 7180 + }, + { + "epoch": 1.1839402235441838, + "grad_norm": 1.699350357055664, + "learning_rate": 1.3802184714300219e-05, + "loss": 0.4995, + "step": 7190 + }, + { + "epoch": 1.1855869578641856, + "grad_norm": 1.660423994064331, + "learning_rate": 1.3785736610524854e-05, + "loss": 0.5151, + "step": 7200 + }, + { + "epoch": 1.1872336921841873, + "grad_norm": 1.5366398096084595, + "learning_rate": 1.3769276542688444e-05, + "loss": 0.5002, + "step": 7210 + }, + { + "epoch": 1.1888804265041888, + "grad_norm": 1.597117304801941, + "learning_rate": 1.3752804562809731e-05, + "loss": 0.4737, + "step": 7220 + }, + { + "epoch": 1.1905271608241905, + "grad_norm": 1.2548353672027588, + "learning_rate": 1.3736320722945103e-05, + "loss": 0.4971, + "step": 7230 + }, + { + "epoch": 1.1921738951441923, + "grad_norm": 1.5998353958129883, + "learning_rate": 1.3719825075188427e-05, + "loss": 0.4988, + "step": 7240 + }, + { + "epoch": 1.1938206294641938, + "grad_norm": 1.853366732597351, + "learning_rate": 1.370331767167089e-05, + "loss": 0.5122, + "step": 7250 + }, + { + "epoch": 1.1954673637841955, + "grad_norm": 1.6614432334899902, + "learning_rate": 1.3686798564560831e-05, + "loss": 0.5014, + "step": 7260 + }, + { + "epoch": 1.1971140981041972, + "grad_norm": 1.8630009889602661, + "learning_rate": 1.3670267806063573e-05, + "loss": 0.4648, + "step": 7270 + }, + { + "epoch": 1.1987608324241987, + "grad_norm": 1.5616339445114136, + "learning_rate": 1.3653725448421258e-05, + "loss": 0.5241, + "step": 7280 + }, + { + "epoch": 1.2004075667442005, + "grad_norm": 1.2994410991668701, + "learning_rate": 1.3637171543912693e-05, + "loss": 0.4812, + "step": 7290 + }, + { + "epoch": 1.202054301064202, + "grad_norm": 1.523964524269104, + "learning_rate": 1.362060614485317e-05, + "loss": 0.4898, + "step": 7300 + }, + { + "epoch": 1.2037010353842037, + "grad_norm": 2.751901865005493, + "learning_rate": 1.3604029303594311e-05, + "loss": 0.5339, + "step": 7310 + }, + { + "epoch": 1.2053477697042054, + "grad_norm": 1.832046389579773, + "learning_rate": 1.3587441072523893e-05, + "loss": 0.4728, + "step": 7320 + }, + { + "epoch": 1.206994504024207, + "grad_norm": 1.397538423538208, + "learning_rate": 1.3570841504065695e-05, + "loss": 0.4737, + "step": 7330 + }, + { + "epoch": 1.2086412383442087, + "grad_norm": 2.1878819465637207, + "learning_rate": 1.3554230650679326e-05, + "loss": 0.4977, + "step": 7340 + }, + { + "epoch": 1.2102879726642102, + "grad_norm": 1.9552628993988037, + "learning_rate": 1.3537608564860053e-05, + "loss": 0.4879, + "step": 7350 + }, + { + "epoch": 1.211934706984212, + "grad_norm": 1.4052588939666748, + "learning_rate": 1.3520975299138637e-05, + "loss": 0.4753, + "step": 7360 + }, + { + "epoch": 1.2135814413042136, + "grad_norm": 1.8420672416687012, + "learning_rate": 1.3504330906081187e-05, + "loss": 0.4877, + "step": 7370 + }, + { + "epoch": 1.2152281756242151, + "grad_norm": 1.29952073097229, + "learning_rate": 1.3487675438288965e-05, + "loss": 0.4822, + "step": 7380 + }, + { + "epoch": 1.2168749099442169, + "grad_norm": 1.3250713348388672, + "learning_rate": 1.3471008948398233e-05, + "loss": 0.465, + "step": 7390 + }, + { + "epoch": 1.2185216442642186, + "grad_norm": 1.2507812976837158, + "learning_rate": 1.345433148908009e-05, + "loss": 0.4588, + "step": 7400 + }, + { + "epoch": 1.2201683785842201, + "grad_norm": 1.415877103805542, + "learning_rate": 1.3437643113040302e-05, + "loss": 0.4691, + "step": 7410 + }, + { + "epoch": 1.2218151129042218, + "grad_norm": 1.3288524150848389, + "learning_rate": 1.3420943873019128e-05, + "loss": 0.4664, + "step": 7420 + }, + { + "epoch": 1.2234618472242236, + "grad_norm": 1.4450100660324097, + "learning_rate": 1.3404233821791172e-05, + "loss": 0.4613, + "step": 7430 + }, + { + "epoch": 1.225108581544225, + "grad_norm": 1.3197182416915894, + "learning_rate": 1.3387513012165196e-05, + "loss": 0.485, + "step": 7440 + }, + { + "epoch": 1.2267553158642268, + "grad_norm": 1.192579746246338, + "learning_rate": 1.3370781496983965e-05, + "loss": 0.4967, + "step": 7450 + }, + { + "epoch": 1.2284020501842283, + "grad_norm": 1.670664668083191, + "learning_rate": 1.3354039329124074e-05, + "loss": 0.4973, + "step": 7460 + }, + { + "epoch": 1.23004878450423, + "grad_norm": 1.251954197883606, + "learning_rate": 1.3337286561495788e-05, + "loss": 0.473, + "step": 7470 + }, + { + "epoch": 1.2316955188242318, + "grad_norm": 1.7240633964538574, + "learning_rate": 1.3320523247042867e-05, + "loss": 0.4754, + "step": 7480 + }, + { + "epoch": 1.2333422531442333, + "grad_norm": 1.3650866746902466, + "learning_rate": 1.33037494387424e-05, + "loss": 0.4382, + "step": 7490 + }, + { + "epoch": 1.234988987464235, + "grad_norm": 1.5670567750930786, + "learning_rate": 1.3286965189604648e-05, + "loss": 0.4662, + "step": 7500 + }, + { + "epoch": 1.2366357217842365, + "grad_norm": 1.2606995105743408, + "learning_rate": 1.3270170552672864e-05, + "loss": 0.4672, + "step": 7510 + }, + { + "epoch": 1.2382824561042383, + "grad_norm": 1.3360075950622559, + "learning_rate": 1.3253365581023123e-05, + "loss": 0.4842, + "step": 7520 + }, + { + "epoch": 1.23992919042424, + "grad_norm": 1.5268915891647339, + "learning_rate": 1.3236550327764169e-05, + "loss": 0.4793, + "step": 7530 + }, + { + "epoch": 1.2415759247442415, + "grad_norm": 1.3975640535354614, + "learning_rate": 1.3219724846037237e-05, + "loss": 0.4645, + "step": 7540 + }, + { + "epoch": 1.2432226590642432, + "grad_norm": 1.4461370706558228, + "learning_rate": 1.3202889189015883e-05, + "loss": 0.4699, + "step": 7550 + }, + { + "epoch": 1.244869393384245, + "grad_norm": 1.379309892654419, + "learning_rate": 1.3186043409905831e-05, + "loss": 0.4732, + "step": 7560 + }, + { + "epoch": 1.2465161277042465, + "grad_norm": 1.8799365758895874, + "learning_rate": 1.3169187561944776e-05, + "loss": 0.4716, + "step": 7570 + }, + { + "epoch": 1.2481628620242482, + "grad_norm": 1.2510124444961548, + "learning_rate": 1.3152321698402253e-05, + "loss": 0.4666, + "step": 7580 + }, + { + "epoch": 1.24980959634425, + "grad_norm": 1.4777454137802124, + "learning_rate": 1.3135445872579435e-05, + "loss": 0.4837, + "step": 7590 + }, + { + "epoch": 1.2514563306642514, + "grad_norm": 1.3962723016738892, + "learning_rate": 1.3118560137808985e-05, + "loss": 0.4803, + "step": 7600 + }, + { + "epoch": 1.2531030649842532, + "grad_norm": 1.7384527921676636, + "learning_rate": 1.310166454745488e-05, + "loss": 0.4683, + "step": 7610 + }, + { + "epoch": 1.2547497993042547, + "grad_norm": 1.8363560438156128, + "learning_rate": 1.3084759154912243e-05, + "loss": 0.4645, + "step": 7620 + }, + { + "epoch": 1.2563965336242564, + "grad_norm": 1.5209704637527466, + "learning_rate": 1.3067844013607179e-05, + "loss": 0.4482, + "step": 7630 + }, + { + "epoch": 1.2580432679442581, + "grad_norm": 3.5697286128997803, + "learning_rate": 1.3050919176996592e-05, + "loss": 0.4452, + "step": 7640 + }, + { + "epoch": 1.2596900022642596, + "grad_norm": 1.9864082336425781, + "learning_rate": 1.3033984698568038e-05, + "loss": 0.4683, + "step": 7650 + }, + { + "epoch": 1.2613367365842614, + "grad_norm": 2.205308437347412, + "learning_rate": 1.3017040631839536e-05, + "loss": 0.4605, + "step": 7660 + }, + { + "epoch": 1.2629834709042629, + "grad_norm": 1.6735590696334839, + "learning_rate": 1.3000087030359411e-05, + "loss": 0.4727, + "step": 7670 + }, + { + "epoch": 1.2646302052242646, + "grad_norm": 1.703411340713501, + "learning_rate": 1.298312394770612e-05, + "loss": 0.4644, + "step": 7680 + }, + { + "epoch": 1.2662769395442663, + "grad_norm": 1.6092456579208374, + "learning_rate": 1.2966151437488078e-05, + "loss": 0.4496, + "step": 7690 + }, + { + "epoch": 1.2679236738642679, + "grad_norm": 2.1162524223327637, + "learning_rate": 1.2949169553343504e-05, + "loss": 0.4658, + "step": 7700 + }, + { + "epoch": 1.2695704081842696, + "grad_norm": 1.9072030782699585, + "learning_rate": 1.2932178348940231e-05, + "loss": 0.4503, + "step": 7710 + }, + { + "epoch": 1.271217142504271, + "grad_norm": 1.529270887374878, + "learning_rate": 1.2915177877975556e-05, + "loss": 0.4608, + "step": 7720 + }, + { + "epoch": 1.2728638768242728, + "grad_norm": 1.5956228971481323, + "learning_rate": 1.2898168194176056e-05, + "loss": 0.4385, + "step": 7730 + }, + { + "epoch": 1.2745106111442746, + "grad_norm": 1.6679530143737793, + "learning_rate": 1.288114935129742e-05, + "loss": 0.4593, + "step": 7740 + }, + { + "epoch": 1.2761573454642763, + "grad_norm": 1.8094053268432617, + "learning_rate": 1.2864121403124288e-05, + "loss": 0.4918, + "step": 7750 + }, + { + "epoch": 1.2778040797842778, + "grad_norm": 2.4337899684906006, + "learning_rate": 1.2847084403470076e-05, + "loss": 0.4849, + "step": 7760 + }, + { + "epoch": 1.2794508141042795, + "grad_norm": 1.9572261571884155, + "learning_rate": 1.2830038406176804e-05, + "loss": 0.4923, + "step": 7770 + }, + { + "epoch": 1.281097548424281, + "grad_norm": 1.9552801847457886, + "learning_rate": 1.2812983465114925e-05, + "loss": 0.4959, + "step": 7780 + }, + { + "epoch": 1.2827442827442828, + "grad_norm": 1.6549162864685059, + "learning_rate": 1.2795919634183159e-05, + "loss": 0.4571, + "step": 7790 + }, + { + "epoch": 1.2843910170642845, + "grad_norm": 1.427476167678833, + "learning_rate": 1.277884696730832e-05, + "loss": 0.4831, + "step": 7800 + }, + { + "epoch": 1.286037751384286, + "grad_norm": 2.2045860290527344, + "learning_rate": 1.2761765518445146e-05, + "loss": 0.4709, + "step": 7810 + }, + { + "epoch": 1.2876844857042877, + "grad_norm": 1.556210994720459, + "learning_rate": 1.274467534157613e-05, + "loss": 0.4733, + "step": 7820 + }, + { + "epoch": 1.2893312200242892, + "grad_norm": 1.9513988494873047, + "learning_rate": 1.2727576490711344e-05, + "loss": 0.4755, + "step": 7830 + }, + { + "epoch": 1.290977954344291, + "grad_norm": 1.7663830518722534, + "learning_rate": 1.2710469019888283e-05, + "loss": 0.4407, + "step": 7840 + }, + { + "epoch": 1.2926246886642927, + "grad_norm": 1.8734620809555054, + "learning_rate": 1.2693352983171669e-05, + "loss": 0.4683, + "step": 7850 + }, + { + "epoch": 1.2942714229842942, + "grad_norm": 2.0744926929473877, + "learning_rate": 1.2676228434653307e-05, + "loss": 0.468, + "step": 7860 + }, + { + "epoch": 1.295918157304296, + "grad_norm": 1.9999136924743652, + "learning_rate": 1.2659095428451898e-05, + "loss": 0.4709, + "step": 7870 + }, + { + "epoch": 1.2975648916242974, + "grad_norm": 1.605631709098816, + "learning_rate": 1.2641954018712863e-05, + "loss": 0.439, + "step": 7880 + }, + { + "epoch": 1.2992116259442992, + "grad_norm": 2.568913221359253, + "learning_rate": 1.2624804259608194e-05, + "loss": 0.4881, + "step": 7890 + }, + { + "epoch": 1.300858360264301, + "grad_norm": 2.5562288761138916, + "learning_rate": 1.2607646205336264e-05, + "loss": 0.507, + "step": 7900 + }, + { + "epoch": 1.3025050945843026, + "grad_norm": 2.0905675888061523, + "learning_rate": 1.2590479910121662e-05, + "loss": 0.467, + "step": 7910 + }, + { + "epoch": 1.3041518289043041, + "grad_norm": 2.1957993507385254, + "learning_rate": 1.2573305428215017e-05, + "loss": 0.4565, + "step": 7920 + }, + { + "epoch": 1.3057985632243059, + "grad_norm": 1.6840404272079468, + "learning_rate": 1.2556122813892834e-05, + "loss": 0.4654, + "step": 7930 + }, + { + "epoch": 1.3074452975443074, + "grad_norm": 1.8692106008529663, + "learning_rate": 1.2538932121457322e-05, + "loss": 0.452, + "step": 7940 + }, + { + "epoch": 1.3090920318643091, + "grad_norm": 1.645704746246338, + "learning_rate": 1.2521733405236214e-05, + "loss": 0.451, + "step": 7950 + }, + { + "epoch": 1.3107387661843108, + "grad_norm": 1.3794227838516235, + "learning_rate": 1.2504526719582596e-05, + "loss": 0.479, + "step": 7960 + }, + { + "epoch": 1.3123855005043124, + "grad_norm": 1.594277262687683, + "learning_rate": 1.2487312118874755e-05, + "loss": 0.4686, + "step": 7970 + }, + { + "epoch": 1.314032234824314, + "grad_norm": 2.5194473266601562, + "learning_rate": 1.247008965751598e-05, + "loss": 0.4672, + "step": 7980 + }, + { + "epoch": 1.3156789691443156, + "grad_norm": 1.4308621883392334, + "learning_rate": 1.2452859389934399e-05, + "loss": 0.4429, + "step": 7990 + }, + { + "epoch": 1.3173257034643173, + "grad_norm": 1.6084959506988525, + "learning_rate": 1.2435621370582824e-05, + "loss": 0.4557, + "step": 8000 + }, + { + "epoch": 1.318972437784319, + "grad_norm": 2.098825693130493, + "learning_rate": 1.241837565393855e-05, + "loss": 0.4538, + "step": 8010 + }, + { + "epoch": 1.3206191721043206, + "grad_norm": 1.6159589290618896, + "learning_rate": 1.2401122294503212e-05, + "loss": 0.4345, + "step": 8020 + }, + { + "epoch": 1.3222659064243223, + "grad_norm": 1.534826636314392, + "learning_rate": 1.2383861346802585e-05, + "loss": 0.444, + "step": 8030 + }, + { + "epoch": 1.3239126407443238, + "grad_norm": 1.4128071069717407, + "learning_rate": 1.2366592865386433e-05, + "loss": 0.4529, + "step": 8040 + }, + { + "epoch": 1.3255593750643255, + "grad_norm": 1.3641729354858398, + "learning_rate": 1.2349316904828327e-05, + "loss": 0.4563, + "step": 8050 + }, + { + "epoch": 1.3272061093843273, + "grad_norm": 1.5807952880859375, + "learning_rate": 1.2332033519725474e-05, + "loss": 0.4433, + "step": 8060 + }, + { + "epoch": 1.328852843704329, + "grad_norm": 1.464307188987732, + "learning_rate": 1.231474276469855e-05, + "loss": 0.4405, + "step": 8070 + }, + { + "epoch": 1.3304995780243305, + "grad_norm": 1.6941524744033813, + "learning_rate": 1.2297444694391509e-05, + "loss": 0.4552, + "step": 8080 + }, + { + "epoch": 1.3321463123443322, + "grad_norm": 1.869858741760254, + "learning_rate": 1.2280139363471437e-05, + "loss": 0.4543, + "step": 8090 + }, + { + "epoch": 1.3337930466643337, + "grad_norm": 2.1110851764678955, + "learning_rate": 1.2262826826628357e-05, + "loss": 0.4671, + "step": 8100 + }, + { + "epoch": 1.3354397809843355, + "grad_norm": 1.4754462242126465, + "learning_rate": 1.2245507138575072e-05, + "loss": 0.4646, + "step": 8110 + }, + { + "epoch": 1.3370865153043372, + "grad_norm": 1.5958442687988281, + "learning_rate": 1.2228180354046983e-05, + "loss": 0.4384, + "step": 8120 + }, + { + "epoch": 1.3387332496243387, + "grad_norm": 1.969876766204834, + "learning_rate": 1.2210846527801912e-05, + "loss": 0.4393, + "step": 8130 + }, + { + "epoch": 1.3403799839443404, + "grad_norm": 1.1938165426254272, + "learning_rate": 1.2193505714619937e-05, + "loss": 0.4304, + "step": 8140 + }, + { + "epoch": 1.342026718264342, + "grad_norm": 1.4246577024459839, + "learning_rate": 1.2176157969303227e-05, + "loss": 0.4483, + "step": 8150 + }, + { + "epoch": 1.3436734525843437, + "grad_norm": 1.4756056070327759, + "learning_rate": 1.2158803346675845e-05, + "loss": 0.4525, + "step": 8160 + }, + { + "epoch": 1.3453201869043454, + "grad_norm": 1.381611704826355, + "learning_rate": 1.2141441901583593e-05, + "loss": 0.4407, + "step": 8170 + }, + { + "epoch": 1.346966921224347, + "grad_norm": 1.3701461553573608, + "learning_rate": 1.2124073688893838e-05, + "loss": 0.439, + "step": 8180 + }, + { + "epoch": 1.3486136555443486, + "grad_norm": 2.179842710494995, + "learning_rate": 1.2106698763495333e-05, + "loss": 0.4494, + "step": 8190 + }, + { + "epoch": 1.3502603898643502, + "grad_norm": 1.3610347509384155, + "learning_rate": 1.2089317180298043e-05, + "loss": 0.4301, + "step": 8200 + }, + { + "epoch": 1.3519071241843519, + "grad_norm": 1.461213231086731, + "learning_rate": 1.207192899423297e-05, + "loss": 0.4565, + "step": 8210 + }, + { + "epoch": 1.3535538585043536, + "grad_norm": 1.9947609901428223, + "learning_rate": 1.2054534260251995e-05, + "loss": 0.4513, + "step": 8220 + }, + { + "epoch": 1.3552005928243551, + "grad_norm": 1.3805066347122192, + "learning_rate": 1.2037133033327679e-05, + "loss": 0.4198, + "step": 8230 + }, + { + "epoch": 1.3568473271443569, + "grad_norm": 1.6667447090148926, + "learning_rate": 1.2019725368453111e-05, + "loss": 0.4525, + "step": 8240 + }, + { + "epoch": 1.3584940614643584, + "grad_norm": 1.3600236177444458, + "learning_rate": 1.2002311320641722e-05, + "loss": 0.4311, + "step": 8250 + }, + { + "epoch": 1.36014079578436, + "grad_norm": 1.2351371049880981, + "learning_rate": 1.1984890944927119e-05, + "loss": 0.4625, + "step": 8260 + }, + { + "epoch": 1.3617875301043618, + "grad_norm": 1.8119263648986816, + "learning_rate": 1.1967464296362903e-05, + "loss": 0.4306, + "step": 8270 + }, + { + "epoch": 1.3634342644243636, + "grad_norm": 1.2814642190933228, + "learning_rate": 1.1950031430022499e-05, + "loss": 0.4399, + "step": 8280 + }, + { + "epoch": 1.365080998744365, + "grad_norm": 1.4676977396011353, + "learning_rate": 1.1932592400998985e-05, + "loss": 0.4423, + "step": 8290 + }, + { + "epoch": 1.3667277330643668, + "grad_norm": 1.8431611061096191, + "learning_rate": 1.1915147264404916e-05, + "loss": 0.4298, + "step": 8300 + }, + { + "epoch": 1.3683744673843683, + "grad_norm": 1.3189785480499268, + "learning_rate": 1.1897696075372143e-05, + "loss": 0.4415, + "step": 8310 + }, + { + "epoch": 1.37002120170437, + "grad_norm": 1.5338736772537231, + "learning_rate": 1.1880238889051647e-05, + "loss": 0.4405, + "step": 8320 + }, + { + "epoch": 1.3716679360243718, + "grad_norm": 2.1897244453430176, + "learning_rate": 1.1862775760613365e-05, + "loss": 0.4169, + "step": 8330 + }, + { + "epoch": 1.3733146703443733, + "grad_norm": 1.3329521417617798, + "learning_rate": 1.1845306745246012e-05, + "loss": 0.4433, + "step": 8340 + }, + { + "epoch": 1.374961404664375, + "grad_norm": 1.6167490482330322, + "learning_rate": 1.1827831898156905e-05, + "loss": 0.4594, + "step": 8350 + }, + { + "epoch": 1.3766081389843765, + "grad_norm": 1.357681393623352, + "learning_rate": 1.1810351274571792e-05, + "loss": 0.4196, + "step": 8360 + }, + { + "epoch": 1.3782548733043782, + "grad_norm": 1.3341563940048218, + "learning_rate": 1.1792864929734678e-05, + "loss": 0.4466, + "step": 8370 + }, + { + "epoch": 1.37990160762438, + "grad_norm": 1.418643832206726, + "learning_rate": 1.1775372918907646e-05, + "loss": 0.4178, + "step": 8380 + }, + { + "epoch": 1.3815483419443815, + "grad_norm": 1.5278998613357544, + "learning_rate": 1.1757875297370687e-05, + "loss": 0.4381, + "step": 8390 + }, + { + "epoch": 1.3831950762643832, + "grad_norm": 1.9926173686981201, + "learning_rate": 1.174037212042152e-05, + "loss": 0.4454, + "step": 8400 + }, + { + "epoch": 1.3848418105843847, + "grad_norm": 3.1398065090179443, + "learning_rate": 1.1722863443375437e-05, + "loss": 0.4214, + "step": 8410 + }, + { + "epoch": 1.3864885449043864, + "grad_norm": 1.9283721446990967, + "learning_rate": 1.1705349321565085e-05, + "loss": 0.4275, + "step": 8420 + }, + { + "epoch": 1.3881352792243882, + "grad_norm": 1.2598986625671387, + "learning_rate": 1.1687829810340338e-05, + "loss": 0.4551, + "step": 8430 + }, + { + "epoch": 1.38978201354439, + "grad_norm": 1.5143520832061768, + "learning_rate": 1.1670304965068098e-05, + "loss": 0.4438, + "step": 8440 + }, + { + "epoch": 1.3914287478643914, + "grad_norm": 2.051283597946167, + "learning_rate": 1.165277484113212e-05, + "loss": 0.4186, + "step": 8450 + }, + { + "epoch": 1.3930754821843931, + "grad_norm": 1.9647774696350098, + "learning_rate": 1.1635239493932842e-05, + "loss": 0.4315, + "step": 8460 + }, + { + "epoch": 1.3947222165043947, + "grad_norm": 3.153352737426758, + "learning_rate": 1.1617698978887214e-05, + "loss": 0.4627, + "step": 8470 + }, + { + "epoch": 1.3963689508243964, + "grad_norm": 2.1228764057159424, + "learning_rate": 1.1600153351428516e-05, + "loss": 0.4405, + "step": 8480 + }, + { + "epoch": 1.3980156851443981, + "grad_norm": 1.7035056352615356, + "learning_rate": 1.158260266700618e-05, + "loss": 0.4512, + "step": 8490 + }, + { + "epoch": 1.3996624194643996, + "grad_norm": 2.2831528186798096, + "learning_rate": 1.1565046981085621e-05, + "loss": 0.4442, + "step": 8500 + }, + { + "epoch": 1.4013091537844014, + "grad_norm": 1.483190894126892, + "learning_rate": 1.154748634914807e-05, + "loss": 0.4119, + "step": 8510 + }, + { + "epoch": 1.4029558881044029, + "grad_norm": 1.4663301706314087, + "learning_rate": 1.1529920826690375e-05, + "loss": 0.4234, + "step": 8520 + }, + { + "epoch": 1.4046026224244046, + "grad_norm": 2.566152572631836, + "learning_rate": 1.1512350469224846e-05, + "loss": 0.423, + "step": 8530 + }, + { + "epoch": 1.4062493567444063, + "grad_norm": 1.1131696701049805, + "learning_rate": 1.1494775332279076e-05, + "loss": 0.4087, + "step": 8540 + }, + { + "epoch": 1.4078960910644078, + "grad_norm": 1.7250810861587524, + "learning_rate": 1.1477195471395759e-05, + "loss": 0.4192, + "step": 8550 + }, + { + "epoch": 1.4095428253844096, + "grad_norm": 2.6664037704467773, + "learning_rate": 1.1459610942132513e-05, + "loss": 0.4299, + "step": 8560 + }, + { + "epoch": 1.411189559704411, + "grad_norm": 1.2339338064193726, + "learning_rate": 1.144202180006172e-05, + "loss": 0.3896, + "step": 8570 + }, + { + "epoch": 1.4128362940244128, + "grad_norm": 1.5988686084747314, + "learning_rate": 1.1424428100770333e-05, + "loss": 0.438, + "step": 8580 + }, + { + "epoch": 1.4144830283444145, + "grad_norm": 1.4023103713989258, + "learning_rate": 1.140682989985971e-05, + "loss": 0.4207, + "step": 8590 + }, + { + "epoch": 1.4161297626644163, + "grad_norm": 1.4158636331558228, + "learning_rate": 1.1389227252945434e-05, + "loss": 0.4342, + "step": 8600 + }, + { + "epoch": 1.4177764969844178, + "grad_norm": 1.6897794008255005, + "learning_rate": 1.1371620215657136e-05, + "loss": 0.4199, + "step": 8610 + }, + { + "epoch": 1.4194232313044195, + "grad_norm": 2.0550646781921387, + "learning_rate": 1.1354008843638331e-05, + "loss": 0.411, + "step": 8620 + }, + { + "epoch": 1.421069965624421, + "grad_norm": 1.6463772058486938, + "learning_rate": 1.133639319254622e-05, + "loss": 0.4246, + "step": 8630 + }, + { + "epoch": 1.4227166999444227, + "grad_norm": 1.7673988342285156, + "learning_rate": 1.131877331805154e-05, + "loss": 0.4296, + "step": 8640 + }, + { + "epoch": 1.4243634342644245, + "grad_norm": 1.807904601097107, + "learning_rate": 1.1301149275838363e-05, + "loss": 0.4131, + "step": 8650 + }, + { + "epoch": 1.426010168584426, + "grad_norm": 1.274460792541504, + "learning_rate": 1.1283521121603948e-05, + "loss": 0.4114, + "step": 8660 + }, + { + "epoch": 1.4276569029044277, + "grad_norm": 1.1608306169509888, + "learning_rate": 1.1265888911058532e-05, + "loss": 0.4429, + "step": 8670 + }, + { + "epoch": 1.4293036372244292, + "grad_norm": 1.3403079509735107, + "learning_rate": 1.124825269992518e-05, + "loss": 0.4456, + "step": 8680 + }, + { + "epoch": 1.430950371544431, + "grad_norm": 1.4677191972732544, + "learning_rate": 1.1230612543939603e-05, + "loss": 0.4251, + "step": 8690 + }, + { + "epoch": 1.4325971058644327, + "grad_norm": 1.2995331287384033, + "learning_rate": 1.1212968498849973e-05, + "loss": 0.4305, + "step": 8700 + }, + { + "epoch": 1.4342438401844342, + "grad_norm": 1.7009153366088867, + "learning_rate": 1.119532062041675e-05, + "loss": 0.4071, + "step": 8710 + }, + { + "epoch": 1.435890574504436, + "grad_norm": 1.6432769298553467, + "learning_rate": 1.1177668964412519e-05, + "loss": 0.4388, + "step": 8720 + }, + { + "epoch": 1.4375373088244374, + "grad_norm": 1.4236555099487305, + "learning_rate": 1.1160013586621796e-05, + "loss": 0.4028, + "step": 8730 + }, + { + "epoch": 1.4391840431444392, + "grad_norm": 1.7424023151397705, + "learning_rate": 1.1142354542840859e-05, + "loss": 0.4037, + "step": 8740 + }, + { + "epoch": 1.4408307774644409, + "grad_norm": 1.2468111515045166, + "learning_rate": 1.1124691888877575e-05, + "loss": 0.4084, + "step": 8750 + }, + { + "epoch": 1.4424775117844426, + "grad_norm": 1.6972237825393677, + "learning_rate": 1.1107025680551216e-05, + "loss": 0.4287, + "step": 8760 + }, + { + "epoch": 1.4441242461044441, + "grad_norm": 1.772537112236023, + "learning_rate": 1.1089355973692292e-05, + "loss": 0.4122, + "step": 8770 + }, + { + "epoch": 1.4457709804244459, + "grad_norm": 2.2410008907318115, + "learning_rate": 1.1071682824142365e-05, + "loss": 0.4277, + "step": 8780 + }, + { + "epoch": 1.4474177147444474, + "grad_norm": 1.619391918182373, + "learning_rate": 1.1054006287753876e-05, + "loss": 0.44, + "step": 8790 + }, + { + "epoch": 1.449064449064449, + "grad_norm": 1.8790594339370728, + "learning_rate": 1.1036326420389978e-05, + "loss": 0.4215, + "step": 8800 + }, + { + "epoch": 1.4507111833844508, + "grad_norm": 2.067535877227783, + "learning_rate": 1.1018643277924338e-05, + "loss": 0.4136, + "step": 8810 + }, + { + "epoch": 1.4523579177044523, + "grad_norm": 1.8926339149475098, + "learning_rate": 1.1000956916240985e-05, + "loss": 0.4328, + "step": 8820 + }, + { + "epoch": 1.454004652024454, + "grad_norm": 1.6244444847106934, + "learning_rate": 1.0983267391234113e-05, + "loss": 0.4103, + "step": 8830 + }, + { + "epoch": 1.4556513863444556, + "grad_norm": 1.3383644819259644, + "learning_rate": 1.0965574758807924e-05, + "loss": 0.4274, + "step": 8840 + }, + { + "epoch": 1.4572981206644573, + "grad_norm": 1.4654048681259155, + "learning_rate": 1.0947879074876425e-05, + "loss": 0.4314, + "step": 8850 + }, + { + "epoch": 1.458944854984459, + "grad_norm": 1.5433770418167114, + "learning_rate": 1.0930180395363275e-05, + "loss": 0.439, + "step": 8860 + }, + { + "epoch": 1.4605915893044605, + "grad_norm": 1.5175681114196777, + "learning_rate": 1.0912478776201605e-05, + "loss": 0.4175, + "step": 8870 + }, + { + "epoch": 1.4622383236244623, + "grad_norm": 1.402180552482605, + "learning_rate": 1.089477427333383e-05, + "loss": 0.389, + "step": 8880 + }, + { + "epoch": 1.4638850579444638, + "grad_norm": 1.3952786922454834, + "learning_rate": 1.0877066942711476e-05, + "loss": 0.4107, + "step": 8890 + }, + { + "epoch": 1.4655317922644655, + "grad_norm": 1.4274441003799438, + "learning_rate": 1.0859356840295013e-05, + "loss": 0.4036, + "step": 8900 + }, + { + "epoch": 1.4671785265844672, + "grad_norm": 1.6509827375411987, + "learning_rate": 1.0841644022053662e-05, + "loss": 0.4023, + "step": 8910 + }, + { + "epoch": 1.4688252609044687, + "grad_norm": 1.4822527170181274, + "learning_rate": 1.0823928543965236e-05, + "loss": 0.417, + "step": 8920 + }, + { + "epoch": 1.4704719952244705, + "grad_norm": 1.6752021312713623, + "learning_rate": 1.0806210462015946e-05, + "loss": 0.3962, + "step": 8930 + }, + { + "epoch": 1.472118729544472, + "grad_norm": 2.425339460372925, + "learning_rate": 1.0788489832200237e-05, + "loss": 0.4551, + "step": 8940 + }, + { + "epoch": 1.4737654638644737, + "grad_norm": 1.5440939664840698, + "learning_rate": 1.0770766710520607e-05, + "loss": 0.4006, + "step": 8950 + }, + { + "epoch": 1.4754121981844754, + "grad_norm": 1.8079001903533936, + "learning_rate": 1.075304115298742e-05, + "loss": 0.397, + "step": 8960 + }, + { + "epoch": 1.4770589325044772, + "grad_norm": 1.4747633934020996, + "learning_rate": 1.0735313215618748e-05, + "loss": 0.4002, + "step": 8970 + }, + { + "epoch": 1.4787056668244787, + "grad_norm": 1.5912610292434692, + "learning_rate": 1.071758295444018e-05, + "loss": 0.4153, + "step": 8980 + }, + { + "epoch": 1.4803524011444804, + "grad_norm": 1.4173518419265747, + "learning_rate": 1.069985042548465e-05, + "loss": 0.4142, + "step": 8990 + }, + { + "epoch": 1.481999135464482, + "grad_norm": 1.8204532861709595, + "learning_rate": 1.0682115684792256e-05, + "loss": 0.4103, + "step": 9000 + }, + { + "epoch": 1.4836458697844837, + "grad_norm": 1.718644380569458, + "learning_rate": 1.0664378788410092e-05, + "loss": 0.4099, + "step": 9010 + }, + { + "epoch": 1.4852926041044854, + "grad_norm": 1.6308566331863403, + "learning_rate": 1.0646639792392057e-05, + "loss": 0.4108, + "step": 9020 + }, + { + "epoch": 1.486939338424487, + "grad_norm": 1.60697340965271, + "learning_rate": 1.062889875279869e-05, + "loss": 0.4109, + "step": 9030 + }, + { + "epoch": 1.4885860727444886, + "grad_norm": 1.5269246101379395, + "learning_rate": 1.0611155725696988e-05, + "loss": 0.3809, + "step": 9040 + }, + { + "epoch": 1.4902328070644901, + "grad_norm": 1.802048921585083, + "learning_rate": 1.0593410767160229e-05, + "loss": 0.4015, + "step": 9050 + }, + { + "epoch": 1.4918795413844919, + "grad_norm": 1.5887212753295898, + "learning_rate": 1.0575663933267793e-05, + "loss": 0.3746, + "step": 9060 + }, + { + "epoch": 1.4935262757044936, + "grad_norm": 5.430899620056152, + "learning_rate": 1.0557915280104987e-05, + "loss": 0.3979, + "step": 9070 + }, + { + "epoch": 1.495173010024495, + "grad_norm": 1.8319224119186401, + "learning_rate": 1.0540164863762867e-05, + "loss": 0.3903, + "step": 9080 + }, + { + "epoch": 1.4968197443444968, + "grad_norm": 1.445753574371338, + "learning_rate": 1.0522412740338072e-05, + "loss": 0.3993, + "step": 9090 + }, + { + "epoch": 1.4984664786644983, + "grad_norm": 2.215062141418457, + "learning_rate": 1.0504658965932617e-05, + "loss": 0.4079, + "step": 9100 + }, + { + "epoch": 1.5001132129845, + "grad_norm": 1.9936473369598389, + "learning_rate": 1.0486903596653746e-05, + "loss": 0.4127, + "step": 9110 + }, + { + "epoch": 1.5017599473045018, + "grad_norm": 2.1176092624664307, + "learning_rate": 1.0469146688613744e-05, + "loss": 0.3993, + "step": 9120 + }, + { + "epoch": 1.5034066816245035, + "grad_norm": 2.069631576538086, + "learning_rate": 1.0451388297929757e-05, + "loss": 0.4327, + "step": 9130 + }, + { + "epoch": 1.505053415944505, + "grad_norm": 1.5769414901733398, + "learning_rate": 1.043362848072361e-05, + "loss": 0.3866, + "step": 9140 + }, + { + "epoch": 1.5067001502645065, + "grad_norm": 1.590387225151062, + "learning_rate": 1.041586729312165e-05, + "loss": 0.3923, + "step": 9150 + }, + { + "epoch": 1.5083468845845083, + "grad_norm": 1.8121472597122192, + "learning_rate": 1.0398104791254542e-05, + "loss": 0.3963, + "step": 9160 + }, + { + "epoch": 1.50999361890451, + "grad_norm": 1.914831280708313, + "learning_rate": 1.038034103125711e-05, + "loss": 0.3835, + "step": 9170 + }, + { + "epoch": 1.5116403532245117, + "grad_norm": 1.556657314300537, + "learning_rate": 1.0362576069268156e-05, + "loss": 0.396, + "step": 9180 + }, + { + "epoch": 1.5132870875445132, + "grad_norm": 1.9325206279754639, + "learning_rate": 1.0344809961430277e-05, + "loss": 0.4204, + "step": 9190 + }, + { + "epoch": 1.514933821864515, + "grad_norm": 2.9902329444885254, + "learning_rate": 1.0327042763889692e-05, + "loss": 0.4115, + "step": 9200 + }, + { + "epoch": 1.5165805561845165, + "grad_norm": 1.8379610776901245, + "learning_rate": 1.0309274532796064e-05, + "loss": 0.3794, + "step": 9210 + }, + { + "epoch": 1.5182272905045182, + "grad_norm": 1.8868550062179565, + "learning_rate": 1.0291505324302322e-05, + "loss": 0.4127, + "step": 9220 + }, + { + "epoch": 1.51987402482452, + "grad_norm": 2.223417282104492, + "learning_rate": 1.027373519456449e-05, + "loss": 0.4269, + "step": 9230 + }, + { + "epoch": 1.5215207591445217, + "grad_norm": 2.04238224029541, + "learning_rate": 1.0255964199741488e-05, + "loss": 0.3928, + "step": 9240 + }, + { + "epoch": 1.5231674934645232, + "grad_norm": 2.3193910121917725, + "learning_rate": 1.0238192395994989e-05, + "loss": 0.4056, + "step": 9250 + }, + { + "epoch": 1.5248142277845247, + "grad_norm": 2.5879781246185303, + "learning_rate": 1.022041983948921e-05, + "loss": 0.3993, + "step": 9260 + }, + { + "epoch": 1.5264609621045264, + "grad_norm": 1.9214088916778564, + "learning_rate": 1.020264658639075e-05, + "loss": 0.3947, + "step": 9270 + }, + { + "epoch": 1.5281076964245282, + "grad_norm": 1.8478078842163086, + "learning_rate": 1.0184872692868409e-05, + "loss": 0.4056, + "step": 9280 + }, + { + "epoch": 1.5297544307445299, + "grad_norm": 3.103062629699707, + "learning_rate": 1.016709821509301e-05, + "loss": 0.3801, + "step": 9290 + }, + { + "epoch": 1.5314011650645314, + "grad_norm": 2.530630111694336, + "learning_rate": 1.014932320923723e-05, + "loss": 0.4195, + "step": 9300 + }, + { + "epoch": 1.533047899384533, + "grad_norm": 1.8765909671783447, + "learning_rate": 1.0131547731475401e-05, + "loss": 0.4163, + "step": 9310 + }, + { + "epoch": 1.5346946337045346, + "grad_norm": 1.9875380992889404, + "learning_rate": 1.0113771837983361e-05, + "loss": 0.4135, + "step": 9320 + }, + { + "epoch": 1.5363413680245364, + "grad_norm": 2.6800906658172607, + "learning_rate": 1.0095995584938252e-05, + "loss": 0.3949, + "step": 9330 + }, + { + "epoch": 1.537988102344538, + "grad_norm": 2.1471517086029053, + "learning_rate": 1.0078219028518359e-05, + "loss": 0.4129, + "step": 9340 + }, + { + "epoch": 1.5396348366645396, + "grad_norm": 2.109405517578125, + "learning_rate": 1.0060442224902915e-05, + "loss": 0.404, + "step": 9350 + }, + { + "epoch": 1.5412815709845413, + "grad_norm": 1.336411714553833, + "learning_rate": 1.0042665230271947e-05, + "loss": 0.423, + "step": 9360 + }, + { + "epoch": 1.5429283053045428, + "grad_norm": 1.543381690979004, + "learning_rate": 1.0024888100806079e-05, + "loss": 0.4204, + "step": 9370 + }, + { + "epoch": 1.5445750396245446, + "grad_norm": 1.7605801820755005, + "learning_rate": 1.000711089268636e-05, + "loss": 0.3922, + "step": 9380 + }, + { + "epoch": 1.5462217739445463, + "grad_norm": 1.8124173879623413, + "learning_rate": 9.989333662094092e-06, + "loss": 0.4036, + "step": 9390 + }, + { + "epoch": 1.547868508264548, + "grad_norm": 2.485732316970825, + "learning_rate": 9.971556465210643e-06, + "loss": 0.3985, + "step": 9400 + }, + { + "epoch": 1.5495152425845495, + "grad_norm": 2.339867353439331, + "learning_rate": 9.953779358217281e-06, + "loss": 0.4196, + "step": 9410 + }, + { + "epoch": 1.551161976904551, + "grad_norm": 2.12572979927063, + "learning_rate": 9.93600239729499e-06, + "loss": 0.3888, + "step": 9420 + }, + { + "epoch": 1.5528087112245528, + "grad_norm": 2.382894277572632, + "learning_rate": 9.918225638624276e-06, + "loss": 0.4123, + "step": 9430 + }, + { + "epoch": 1.5544554455445545, + "grad_norm": 2.5262231826782227, + "learning_rate": 9.900449138385026e-06, + "loss": 0.425, + "step": 9440 + }, + { + "epoch": 1.5561021798645562, + "grad_norm": 2.0982043743133545, + "learning_rate": 9.882672952756301e-06, + "loss": 0.398, + "step": 9450 + }, + { + "epoch": 1.5577489141845577, + "grad_norm": 1.5875078439712524, + "learning_rate": 9.86489713791617e-06, + "loss": 0.3729, + "step": 9460 + }, + { + "epoch": 1.5593956485045593, + "grad_norm": 1.499527096748352, + "learning_rate": 9.847121750041532e-06, + "loss": 0.4208, + "step": 9470 + }, + { + "epoch": 1.561042382824561, + "grad_norm": 1.7554186582565308, + "learning_rate": 9.829346845307929e-06, + "loss": 0.4171, + "step": 9480 + }, + { + "epoch": 1.5626891171445627, + "grad_norm": 2.1074600219726562, + "learning_rate": 9.811572479889387e-06, + "loss": 0.4018, + "step": 9490 + }, + { + "epoch": 1.5643358514645644, + "grad_norm": 3.0306382179260254, + "learning_rate": 9.793798709958221e-06, + "loss": 0.4319, + "step": 9500 + }, + { + "epoch": 1.565982585784566, + "grad_norm": 1.5060926675796509, + "learning_rate": 9.77602559168486e-06, + "loss": 0.3968, + "step": 9510 + }, + { + "epoch": 1.5676293201045677, + "grad_norm": 1.7786906957626343, + "learning_rate": 9.75825318123768e-06, + "loss": 0.3867, + "step": 9520 + }, + { + "epoch": 1.5692760544245692, + "grad_norm": 1.4499342441558838, + "learning_rate": 9.740481534782822e-06, + "loss": 0.3877, + "step": 9530 + }, + { + "epoch": 1.570922788744571, + "grad_norm": 1.7993638515472412, + "learning_rate": 9.722710708484009e-06, + "loss": 0.3935, + "step": 9540 + }, + { + "epoch": 1.5725695230645726, + "grad_norm": 1.487587571144104, + "learning_rate": 9.704940758502367e-06, + "loss": 0.4333, + "step": 9550 + }, + { + "epoch": 1.5742162573845744, + "grad_norm": 1.5965005159378052, + "learning_rate": 9.687171740996262e-06, + "loss": 0.3824, + "step": 9560 + }, + { + "epoch": 1.5758629917045759, + "grad_norm": 1.9359855651855469, + "learning_rate": 9.669403712121116e-06, + "loss": 0.4349, + "step": 9570 + }, + { + "epoch": 1.5775097260245774, + "grad_norm": 1.7077593803405762, + "learning_rate": 9.651636728029205e-06, + "loss": 0.4088, + "step": 9580 + }, + { + "epoch": 1.5791564603445791, + "grad_norm": 1.3524852991104126, + "learning_rate": 9.633870844869526e-06, + "loss": 0.3674, + "step": 9590 + }, + { + "epoch": 1.5808031946645809, + "grad_norm": 1.4619336128234863, + "learning_rate": 9.616106118787586e-06, + "loss": 0.4186, + "step": 9600 + }, + { + "epoch": 1.5824499289845826, + "grad_norm": 1.5433001518249512, + "learning_rate": 9.59834260592524e-06, + "loss": 0.3836, + "step": 9610 + }, + { + "epoch": 1.584096663304584, + "grad_norm": 1.562225580215454, + "learning_rate": 9.580580362420505e-06, + "loss": 0.3979, + "step": 9620 + }, + { + "epoch": 1.5857433976245856, + "grad_norm": 1.6424192190170288, + "learning_rate": 9.562819444407389e-06, + "loss": 0.3839, + "step": 9630 + }, + { + "epoch": 1.5873901319445873, + "grad_norm": 1.7441072463989258, + "learning_rate": 9.545059908015713e-06, + "loss": 0.3896, + "step": 9640 + }, + { + "epoch": 1.589036866264589, + "grad_norm": 1.3649961948394775, + "learning_rate": 9.527301809370922e-06, + "loss": 0.3891, + "step": 9650 + }, + { + "epoch": 1.5906836005845908, + "grad_norm": 2.137296676635742, + "learning_rate": 9.509545204593928e-06, + "loss": 0.3771, + "step": 9660 + }, + { + "epoch": 1.5923303349045923, + "grad_norm": 3.6009469032287598, + "learning_rate": 9.491790149800916e-06, + "loss": 0.3759, + "step": 9670 + }, + { + "epoch": 1.5939770692245938, + "grad_norm": 2.4487457275390625, + "learning_rate": 9.474036701103178e-06, + "loss": 0.4081, + "step": 9680 + }, + { + "epoch": 1.5956238035445955, + "grad_norm": 2.1855509281158447, + "learning_rate": 9.456284914606924e-06, + "loss": 0.394, + "step": 9690 + }, + { + "epoch": 1.5972705378645973, + "grad_norm": 1.6992751359939575, + "learning_rate": 9.438534846413115e-06, + "loss": 0.3946, + "step": 9700 + }, + { + "epoch": 1.598917272184599, + "grad_norm": 1.9765753746032715, + "learning_rate": 9.420786552617281e-06, + "loss": 0.3921, + "step": 9710 + }, + { + "epoch": 1.6005640065046005, + "grad_norm": 1.6092065572738647, + "learning_rate": 9.40304008930934e-06, + "loss": 0.3737, + "step": 9720 + }, + { + "epoch": 1.6022107408246022, + "grad_norm": 1.9483388662338257, + "learning_rate": 9.385295512573436e-06, + "loss": 0.4191, + "step": 9730 + }, + { + "epoch": 1.6038574751446038, + "grad_norm": 1.6057438850402832, + "learning_rate": 9.367552878487736e-06, + "loss": 0.4186, + "step": 9740 + }, + { + "epoch": 1.6055042094646055, + "grad_norm": 2.68620228767395, + "learning_rate": 9.34981224312428e-06, + "loss": 0.3937, + "step": 9750 + }, + { + "epoch": 1.6071509437846072, + "grad_norm": 2.7004518508911133, + "learning_rate": 9.332073662548785e-06, + "loss": 0.4057, + "step": 9760 + }, + { + "epoch": 1.608797678104609, + "grad_norm": 2.4203567504882812, + "learning_rate": 9.314337192820477e-06, + "loss": 0.3898, + "step": 9770 + }, + { + "epoch": 1.6104444124246104, + "grad_norm": 1.4761004447937012, + "learning_rate": 9.296602889991914e-06, + "loss": 0.3759, + "step": 9780 + }, + { + "epoch": 1.612091146744612, + "grad_norm": 1.881105899810791, + "learning_rate": 9.278870810108794e-06, + "loss": 0.3927, + "step": 9790 + }, + { + "epoch": 1.6137378810646137, + "grad_norm": 1.8509386777877808, + "learning_rate": 9.261141009209803e-06, + "loss": 0.4013, + "step": 9800 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.8641047477722168, + "learning_rate": 9.24341354332642e-06, + "loss": 0.3911, + "step": 9810 + }, + { + "epoch": 1.6170313497046171, + "grad_norm": 1.479648470878601, + "learning_rate": 9.225688468482743e-06, + "loss": 0.384, + "step": 9820 + }, + { + "epoch": 1.6186780840246187, + "grad_norm": 1.395717740058899, + "learning_rate": 9.207965840695314e-06, + "loss": 0.3862, + "step": 9830 + }, + { + "epoch": 1.6203248183446202, + "grad_norm": 1.989012598991394, + "learning_rate": 9.190245715972946e-06, + "loss": 0.405, + "step": 9840 + }, + { + "epoch": 1.621971552664622, + "grad_norm": 2.0621602535247803, + "learning_rate": 9.172528150316536e-06, + "loss": 0.4259, + "step": 9850 + }, + { + "epoch": 1.6236182869846236, + "grad_norm": 2.660885810852051, + "learning_rate": 9.154813199718893e-06, + "loss": 0.3823, + "step": 9860 + }, + { + "epoch": 1.6252650213046254, + "grad_norm": 2.435926914215088, + "learning_rate": 9.137100920164567e-06, + "loss": 0.3809, + "step": 9870 + }, + { + "epoch": 1.6269117556246269, + "grad_norm": 1.566701054573059, + "learning_rate": 9.119391367629665e-06, + "loss": 0.3987, + "step": 9880 + }, + { + "epoch": 1.6285584899446286, + "grad_norm": 1.947234034538269, + "learning_rate": 9.101684598081672e-06, + "loss": 0.3806, + "step": 9890 + }, + { + "epoch": 1.63020522426463, + "grad_norm": 1.8058415651321411, + "learning_rate": 9.083980667479286e-06, + "loss": 0.412, + "step": 9900 + }, + { + "epoch": 1.6318519585846318, + "grad_norm": 1.4982244968414307, + "learning_rate": 9.066279631772222e-06, + "loss": 0.3814, + "step": 9910 + }, + { + "epoch": 1.6334986929046336, + "grad_norm": 1.4314496517181396, + "learning_rate": 9.048581546901056e-06, + "loss": 0.3928, + "step": 9920 + }, + { + "epoch": 1.6351454272246353, + "grad_norm": 1.6061030626296997, + "learning_rate": 9.03088646879703e-06, + "loss": 0.3695, + "step": 9930 + }, + { + "epoch": 1.6367921615446368, + "grad_norm": 1.627259373664856, + "learning_rate": 9.013194453381892e-06, + "loss": 0.4073, + "step": 9940 + }, + { + "epoch": 1.6384388958646383, + "grad_norm": 1.8757750988006592, + "learning_rate": 8.995505556567707e-06, + "loss": 0.3773, + "step": 9950 + }, + { + "epoch": 1.64008563018464, + "grad_norm": 2.141075372695923, + "learning_rate": 8.977819834256683e-06, + "loss": 0.3893, + "step": 9960 + }, + { + "epoch": 1.6417323645046418, + "grad_norm": 1.8441202640533447, + "learning_rate": 8.960137342340997e-06, + "loss": 0.3921, + "step": 9970 + }, + { + "epoch": 1.6433790988246435, + "grad_norm": 1.523070216178894, + "learning_rate": 8.94245813670262e-06, + "loss": 0.3899, + "step": 9980 + }, + { + "epoch": 1.645025833144645, + "grad_norm": 1.7577736377716064, + "learning_rate": 8.924782273213137e-06, + "loss": 0.3742, + "step": 9990 + }, + { + "epoch": 1.6466725674646465, + "grad_norm": 1.6085615158081055, + "learning_rate": 8.907109807733559e-06, + "loss": 0.3694, + "step": 10000 + }, + { + "epoch": 1.6483193017846482, + "grad_norm": 2.142134666442871, + "learning_rate": 8.889440796114174e-06, + "loss": 0.3847, + "step": 10010 + }, + { + "epoch": 1.64996603610465, + "grad_norm": 1.5217559337615967, + "learning_rate": 8.871775294194346e-06, + "loss": 0.3493, + "step": 10020 + }, + { + "epoch": 1.6516127704246517, + "grad_norm": 2.990302085876465, + "learning_rate": 8.854113357802353e-06, + "loss": 0.367, + "step": 10030 + }, + { + "epoch": 1.6532595047446532, + "grad_norm": 1.62135648727417, + "learning_rate": 8.836455042755197e-06, + "loss": 0.3797, + "step": 10040 + }, + { + "epoch": 1.654906239064655, + "grad_norm": 1.5902869701385498, + "learning_rate": 8.818800404858441e-06, + "loss": 0.3803, + "step": 10050 + }, + { + "epoch": 1.6565529733846565, + "grad_norm": 1.5055309534072876, + "learning_rate": 8.801149499906032e-06, + "loss": 0.379, + "step": 10060 + }, + { + "epoch": 1.6581997077046582, + "grad_norm": 1.5412495136260986, + "learning_rate": 8.78350238368011e-06, + "loss": 0.3718, + "step": 10070 + }, + { + "epoch": 1.65984644202466, + "grad_norm": 1.5508426427841187, + "learning_rate": 8.765859111950842e-06, + "loss": 0.3957, + "step": 10080 + }, + { + "epoch": 1.6614931763446616, + "grad_norm": 1.5197268724441528, + "learning_rate": 8.74821974047625e-06, + "loss": 0.3655, + "step": 10090 + }, + { + "epoch": 1.6631399106646632, + "grad_norm": 2.0279932022094727, + "learning_rate": 8.730584325002031e-06, + "loss": 0.3608, + "step": 10100 + }, + { + "epoch": 1.6647866449846647, + "grad_norm": 1.2136750221252441, + "learning_rate": 8.712952921261377e-06, + "loss": 0.3698, + "step": 10110 + }, + { + "epoch": 1.6664333793046664, + "grad_norm": 1.5392953157424927, + "learning_rate": 8.695325584974802e-06, + "loss": 0.385, + "step": 10120 + }, + { + "epoch": 1.6680801136246681, + "grad_norm": 1.2899067401885986, + "learning_rate": 8.677702371849965e-06, + "loss": 0.3637, + "step": 10130 + }, + { + "epoch": 1.6697268479446699, + "grad_norm": 1.5220967531204224, + "learning_rate": 8.6600833375815e-06, + "loss": 0.3788, + "step": 10140 + }, + { + "epoch": 1.6713735822646714, + "grad_norm": 2.6489806175231934, + "learning_rate": 8.642468537850822e-06, + "loss": 0.3889, + "step": 10150 + }, + { + "epoch": 1.6730203165846729, + "grad_norm": 2.6164515018463135, + "learning_rate": 8.624858028325976e-06, + "loss": 0.384, + "step": 10160 + }, + { + "epoch": 1.6746670509046746, + "grad_norm": 2.1148691177368164, + "learning_rate": 8.607251864661443e-06, + "loss": 0.3684, + "step": 10170 + }, + { + "epoch": 1.6763137852246763, + "grad_norm": 1.1956931352615356, + "learning_rate": 8.589650102497973e-06, + "loss": 0.3512, + "step": 10180 + }, + { + "epoch": 1.677960519544678, + "grad_norm": 1.5633714199066162, + "learning_rate": 8.572052797462403e-06, + "loss": 0.3658, + "step": 10190 + }, + { + "epoch": 1.6796072538646796, + "grad_norm": 1.9797472953796387, + "learning_rate": 8.554460005167483e-06, + "loss": 0.3643, + "step": 10200 + }, + { + "epoch": 1.6812539881846813, + "grad_norm": 1.8225812911987305, + "learning_rate": 8.536871781211711e-06, + "loss": 0.3615, + "step": 10210 + }, + { + "epoch": 1.6829007225046828, + "grad_norm": 2.6055593490600586, + "learning_rate": 8.51928818117913e-06, + "loss": 0.3932, + "step": 10220 + }, + { + "epoch": 1.6845474568246845, + "grad_norm": 2.2658164501190186, + "learning_rate": 8.501709260639187e-06, + "loss": 0.3912, + "step": 10230 + }, + { + "epoch": 1.6861941911446863, + "grad_norm": 2.0631959438323975, + "learning_rate": 8.484135075146528e-06, + "loss": 0.3777, + "step": 10240 + }, + { + "epoch": 1.687840925464688, + "grad_norm": 2.3120977878570557, + "learning_rate": 8.466565680240847e-06, + "loss": 0.3918, + "step": 10250 + }, + { + "epoch": 1.6894876597846895, + "grad_norm": 1.802088737487793, + "learning_rate": 8.449001131446687e-06, + "loss": 0.3785, + "step": 10260 + }, + { + "epoch": 1.691134394104691, + "grad_norm": 2.1044161319732666, + "learning_rate": 8.431441484273282e-06, + "loss": 0.3679, + "step": 10270 + }, + { + "epoch": 1.6927811284246927, + "grad_norm": 2.071812868118286, + "learning_rate": 8.413886794214379e-06, + "loss": 0.3771, + "step": 10280 + }, + { + "epoch": 1.6944278627446945, + "grad_norm": 2.1053099632263184, + "learning_rate": 8.396337116748046e-06, + "loss": 0.3496, + "step": 10290 + }, + { + "epoch": 1.6960745970646962, + "grad_norm": 1.8317806720733643, + "learning_rate": 8.37879250733652e-06, + "loss": 0.3549, + "step": 10300 + }, + { + "epoch": 1.6977213313846977, + "grad_norm": 2.297271728515625, + "learning_rate": 8.361253021426019e-06, + "loss": 0.3801, + "step": 10310 + }, + { + "epoch": 1.6993680657046992, + "grad_norm": 2.0474696159362793, + "learning_rate": 8.343718714446572e-06, + "loss": 0.3814, + "step": 10320 + }, + { + "epoch": 1.701014800024701, + "grad_norm": 1.3918821811676025, + "learning_rate": 8.326189641811835e-06, + "loss": 0.3513, + "step": 10330 + }, + { + "epoch": 1.7026615343447027, + "grad_norm": 1.5274895429611206, + "learning_rate": 8.308665858918928e-06, + "loss": 0.3358, + "step": 10340 + }, + { + "epoch": 1.7043082686647044, + "grad_norm": 2.130195140838623, + "learning_rate": 8.291147421148255e-06, + "loss": 0.3682, + "step": 10350 + }, + { + "epoch": 1.705955002984706, + "grad_norm": 1.9873136281967163, + "learning_rate": 8.273634383863315e-06, + "loss": 0.367, + "step": 10360 + }, + { + "epoch": 1.7076017373047077, + "grad_norm": 1.433440923690796, + "learning_rate": 8.256126802410554e-06, + "loss": 0.3572, + "step": 10370 + }, + { + "epoch": 1.7092484716247092, + "grad_norm": 1.550261378288269, + "learning_rate": 8.238624732119169e-06, + "loss": 0.383, + "step": 10380 + }, + { + "epoch": 1.710895205944711, + "grad_norm": 1.8397653102874756, + "learning_rate": 8.221128228300941e-06, + "loss": 0.3497, + "step": 10390 + }, + { + "epoch": 1.7125419402647126, + "grad_norm": 1.6404129266738892, + "learning_rate": 8.203637346250062e-06, + "loss": 0.3514, + "step": 10400 + }, + { + "epoch": 1.7141886745847141, + "grad_norm": 2.078935146331787, + "learning_rate": 8.186152141242957e-06, + "loss": 0.3694, + "step": 10410 + }, + { + "epoch": 1.7158354089047159, + "grad_norm": 1.4735045433044434, + "learning_rate": 8.16867266853811e-06, + "loss": 0.3602, + "step": 10420 + }, + { + "epoch": 1.7174821432247174, + "grad_norm": 1.6575242280960083, + "learning_rate": 8.15119898337588e-06, + "loss": 0.3569, + "step": 10430 + }, + { + "epoch": 1.719128877544719, + "grad_norm": 1.4052168130874634, + "learning_rate": 8.133731140978347e-06, + "loss": 0.3703, + "step": 10440 + }, + { + "epoch": 1.7207756118647208, + "grad_norm": 2.0821497440338135, + "learning_rate": 8.116269196549124e-06, + "loss": 0.3442, + "step": 10450 + }, + { + "epoch": 1.7224223461847226, + "grad_norm": 1.4449158906936646, + "learning_rate": 8.098813205273183e-06, + "loss": 0.3179, + "step": 10460 + }, + { + "epoch": 1.724069080504724, + "grad_norm": 1.4337159395217896, + "learning_rate": 8.081363222316681e-06, + "loss": 0.3608, + "step": 10470 + }, + { + "epoch": 1.7257158148247256, + "grad_norm": 1.587299108505249, + "learning_rate": 8.063919302826787e-06, + "loss": 0.366, + "step": 10480 + }, + { + "epoch": 1.7273625491447273, + "grad_norm": 1.4321982860565186, + "learning_rate": 8.046481501931516e-06, + "loss": 0.36, + "step": 10490 + }, + { + "epoch": 1.729009283464729, + "grad_norm": 1.3218423128128052, + "learning_rate": 8.02904987473953e-06, + "loss": 0.3678, + "step": 10500 + }, + { + "epoch": 1.7306560177847308, + "grad_norm": 1.4841070175170898, + "learning_rate": 8.011624476339993e-06, + "loss": 0.3565, + "step": 10510 + }, + { + "epoch": 1.7323027521047323, + "grad_norm": 1.6882789134979248, + "learning_rate": 7.99420536180238e-06, + "loss": 0.3598, + "step": 10520 + }, + { + "epoch": 1.7339494864247338, + "grad_norm": 1.6460531949996948, + "learning_rate": 7.976792586176311e-06, + "loss": 0.3601, + "step": 10530 + }, + { + "epoch": 1.7355962207447355, + "grad_norm": 2.165046215057373, + "learning_rate": 7.959386204491365e-06, + "loss": 0.3529, + "step": 10540 + }, + { + "epoch": 1.7372429550647372, + "grad_norm": 2.083591938018799, + "learning_rate": 7.941986271756926e-06, + "loss": 0.3664, + "step": 10550 + }, + { + "epoch": 1.738889689384739, + "grad_norm": 1.4335622787475586, + "learning_rate": 7.924592842961985e-06, + "loss": 0.3742, + "step": 10560 + }, + { + "epoch": 1.7405364237047405, + "grad_norm": 1.9920486211776733, + "learning_rate": 7.907205973074987e-06, + "loss": 0.3652, + "step": 10570 + }, + { + "epoch": 1.7421831580247422, + "grad_norm": 1.6265625953674316, + "learning_rate": 7.889825717043643e-06, + "loss": 0.3714, + "step": 10580 + }, + { + "epoch": 1.7438298923447437, + "grad_norm": 1.6098805665969849, + "learning_rate": 7.872452129794765e-06, + "loss": 0.3434, + "step": 10590 + }, + { + "epoch": 1.7454766266647455, + "grad_norm": 1.5913910865783691, + "learning_rate": 7.855085266234093e-06, + "loss": 0.3601, + "step": 10600 + }, + { + "epoch": 1.7471233609847472, + "grad_norm": 1.3813248872756958, + "learning_rate": 7.837725181246116e-06, + "loss": 0.3658, + "step": 10610 + }, + { + "epoch": 1.748770095304749, + "grad_norm": 1.6209338903427124, + "learning_rate": 7.820371929693894e-06, + "loss": 0.3515, + "step": 10620 + }, + { + "epoch": 1.7504168296247504, + "grad_norm": 1.6621341705322266, + "learning_rate": 7.803025566418904e-06, + "loss": 0.364, + "step": 10630 + }, + { + "epoch": 1.752063563944752, + "grad_norm": 1.360546588897705, + "learning_rate": 7.785686146240844e-06, + "loss": 0.3582, + "step": 10640 + }, + { + "epoch": 1.7537102982647537, + "grad_norm": 2.2365732192993164, + "learning_rate": 7.76835372395747e-06, + "loss": 0.3538, + "step": 10650 + }, + { + "epoch": 1.7553570325847554, + "grad_norm": 1.673392415046692, + "learning_rate": 7.751028354344432e-06, + "loss": 0.3661, + "step": 10660 + }, + { + "epoch": 1.7570037669047571, + "grad_norm": 1.4482479095458984, + "learning_rate": 7.733710092155076e-06, + "loss": 0.3468, + "step": 10670 + }, + { + "epoch": 1.7586505012247586, + "grad_norm": 1.5075653791427612, + "learning_rate": 7.716398992120302e-06, + "loss": 0.3588, + "step": 10680 + }, + { + "epoch": 1.7602972355447601, + "grad_norm": 1.881640911102295, + "learning_rate": 7.699095108948365e-06, + "loss": 0.3538, + "step": 10690 + }, + { + "epoch": 1.7619439698647619, + "grad_norm": 1.933388590812683, + "learning_rate": 7.681798497324717e-06, + "loss": 0.3527, + "step": 10700 + }, + { + "epoch": 1.7635907041847636, + "grad_norm": 1.514220118522644, + "learning_rate": 7.664509211911833e-06, + "loss": 0.3679, + "step": 10710 + }, + { + "epoch": 1.7652374385047653, + "grad_norm": 1.9936243295669556, + "learning_rate": 7.647227307349024e-06, + "loss": 0.3808, + "step": 10720 + }, + { + "epoch": 1.7668841728247668, + "grad_norm": 1.567679762840271, + "learning_rate": 7.629952838252287e-06, + "loss": 0.3515, + "step": 10730 + }, + { + "epoch": 1.7685309071447686, + "grad_norm": 1.4825366735458374, + "learning_rate": 7.612685859214113e-06, + "loss": 0.3677, + "step": 10740 + }, + { + "epoch": 1.77017764146477, + "grad_norm": 1.8481963872909546, + "learning_rate": 7.59542642480333e-06, + "loss": 0.3642, + "step": 10750 + }, + { + "epoch": 1.7718243757847718, + "grad_norm": 1.9915103912353516, + "learning_rate": 7.578174589564911e-06, + "loss": 0.3512, + "step": 10760 + }, + { + "epoch": 1.7734711101047735, + "grad_norm": 2.2940824031829834, + "learning_rate": 7.560930408019823e-06, + "loss": 0.3713, + "step": 10770 + }, + { + "epoch": 1.7751178444247753, + "grad_norm": 1.6668184995651245, + "learning_rate": 7.543693934664846e-06, + "loss": 0.3782, + "step": 10780 + }, + { + "epoch": 1.7767645787447768, + "grad_norm": 1.4784590005874634, + "learning_rate": 7.52646522397239e-06, + "loss": 0.3632, + "step": 10790 + }, + { + "epoch": 1.7784113130647783, + "grad_norm": 1.6654629707336426, + "learning_rate": 7.5092443303903404e-06, + "loss": 0.351, + "step": 10800 + }, + { + "epoch": 1.78005804738478, + "grad_norm": 1.913283109664917, + "learning_rate": 7.492031308341879e-06, + "loss": 0.3557, + "step": 10810 + }, + { + "epoch": 1.7817047817047817, + "grad_norm": 1.9951850175857544, + "learning_rate": 7.474826212225305e-06, + "loss": 0.3358, + "step": 10820 + }, + { + "epoch": 1.7833515160247835, + "grad_norm": 1.5256726741790771, + "learning_rate": 7.457629096413874e-06, + "loss": 0.3418, + "step": 10830 + }, + { + "epoch": 1.784998250344785, + "grad_norm": 1.617591142654419, + "learning_rate": 7.440440015255625e-06, + "loss": 0.3448, + "step": 10840 + }, + { + "epoch": 1.7866449846647865, + "grad_norm": 1.3453478813171387, + "learning_rate": 7.423259023073197e-06, + "loss": 0.3479, + "step": 10850 + }, + { + "epoch": 1.7882917189847882, + "grad_norm": 1.643965721130371, + "learning_rate": 7.406086174163665e-06, + "loss": 0.3634, + "step": 10860 + }, + { + "epoch": 1.78993845330479, + "grad_norm": 1.6207489967346191, + "learning_rate": 7.388921522798376e-06, + "loss": 0.3419, + "step": 10870 + }, + { + "epoch": 1.7915851876247917, + "grad_norm": 2.043084144592285, + "learning_rate": 7.371765123222767e-06, + "loss": 0.348, + "step": 10880 + }, + { + "epoch": 1.7932319219447932, + "grad_norm": 1.35907781124115, + "learning_rate": 7.354617029656198e-06, + "loss": 0.3344, + "step": 10890 + }, + { + "epoch": 1.794878656264795, + "grad_norm": 1.5821566581726074, + "learning_rate": 7.337477296291778e-06, + "loss": 0.3273, + "step": 10900 + }, + { + "epoch": 1.7965253905847964, + "grad_norm": 1.6671669483184814, + "learning_rate": 7.3203459772961924e-06, + "loss": 0.3508, + "step": 10910 + }, + { + "epoch": 1.7981721249047982, + "grad_norm": 1.5016417503356934, + "learning_rate": 7.303223126809546e-06, + "loss": 0.3348, + "step": 10920 + }, + { + "epoch": 1.7998188592248, + "grad_norm": 1.4615623950958252, + "learning_rate": 7.286108798945162e-06, + "loss": 0.3501, + "step": 10930 + }, + { + "epoch": 1.8014655935448016, + "grad_norm": 1.4260656833648682, + "learning_rate": 7.269003047789446e-06, + "loss": 0.338, + "step": 10940 + }, + { + "epoch": 1.8031123278648031, + "grad_norm": 2.318674087524414, + "learning_rate": 7.251905927401691e-06, + "loss": 0.3422, + "step": 10950 + }, + { + "epoch": 1.8047590621848046, + "grad_norm": 1.3945016860961914, + "learning_rate": 7.234817491813917e-06, + "loss": 0.3291, + "step": 10960 + }, + { + "epoch": 1.8064057965048064, + "grad_norm": 2.036652088165283, + "learning_rate": 7.217737795030695e-06, + "loss": 0.3412, + "step": 10970 + }, + { + "epoch": 1.808052530824808, + "grad_norm": 1.9263806343078613, + "learning_rate": 7.200666891028983e-06, + "loss": 0.3341, + "step": 10980 + }, + { + "epoch": 1.8096992651448098, + "grad_norm": 1.7770569324493408, + "learning_rate": 7.183604833757949e-06, + "loss": 0.3375, + "step": 10990 + }, + { + "epoch": 1.8113459994648113, + "grad_norm": 1.8444048166275024, + "learning_rate": 7.166551677138794e-06, + "loss": 0.3298, + "step": 11000 + }, + { + "epoch": 1.8129927337848128, + "grad_norm": 1.5634081363677979, + "learning_rate": 7.149507475064606e-06, + "loss": 0.337, + "step": 11010 + }, + { + "epoch": 1.8146394681048146, + "grad_norm": 2.272538900375366, + "learning_rate": 7.13247228140016e-06, + "loss": 0.3514, + "step": 11020 + }, + { + "epoch": 1.8162862024248163, + "grad_norm": 1.4847793579101562, + "learning_rate": 7.115446149981774e-06, + "loss": 0.3442, + "step": 11030 + }, + { + "epoch": 1.817932936744818, + "grad_norm": 1.7815018892288208, + "learning_rate": 7.098429134617117e-06, + "loss": 0.3527, + "step": 11040 + }, + { + "epoch": 1.8195796710648195, + "grad_norm": 1.8198860883712769, + "learning_rate": 7.081421289085053e-06, + "loss": 0.3289, + "step": 11050 + }, + { + "epoch": 1.8212264053848213, + "grad_norm": 1.5678714513778687, + "learning_rate": 7.06442266713547e-06, + "loss": 0.3212, + "step": 11060 + }, + { + "epoch": 1.8228731397048228, + "grad_norm": 1.6661272048950195, + "learning_rate": 7.047433322489094e-06, + "loss": 0.3511, + "step": 11070 + }, + { + "epoch": 1.8245198740248245, + "grad_norm": 1.59157395362854, + "learning_rate": 7.030453308837344e-06, + "loss": 0.3392, + "step": 11080 + }, + { + "epoch": 1.8261666083448262, + "grad_norm": 1.9493590593338013, + "learning_rate": 7.013482679842145e-06, + "loss": 0.3496, + "step": 11090 + }, + { + "epoch": 1.827813342664828, + "grad_norm": 2.397372245788574, + "learning_rate": 6.996521489135768e-06, + "loss": 0.3428, + "step": 11100 + }, + { + "epoch": 1.8294600769848295, + "grad_norm": 2.058790445327759, + "learning_rate": 6.979569790320653e-06, + "loss": 0.3099, + "step": 11110 + }, + { + "epoch": 1.831106811304831, + "grad_norm": 1.6316676139831543, + "learning_rate": 6.962627636969241e-06, + "loss": 0.3536, + "step": 11120 + }, + { + "epoch": 1.8327535456248327, + "grad_norm": 1.8087751865386963, + "learning_rate": 6.945695082623816e-06, + "loss": 0.3548, + "step": 11130 + }, + { + "epoch": 1.8344002799448345, + "grad_norm": 1.6913039684295654, + "learning_rate": 6.928772180796308e-06, + "loss": 0.3325, + "step": 11140 + }, + { + "epoch": 1.8360470142648362, + "grad_norm": 1.5240042209625244, + "learning_rate": 6.911858984968158e-06, + "loss": 0.3381, + "step": 11150 + }, + { + "epoch": 1.8376937485848377, + "grad_norm": 1.340308427810669, + "learning_rate": 6.894955548590128e-06, + "loss": 0.3293, + "step": 11160 + }, + { + "epoch": 1.8393404829048392, + "grad_norm": 1.640592336654663, + "learning_rate": 6.878061925082138e-06, + "loss": 0.3394, + "step": 11170 + }, + { + "epoch": 1.840987217224841, + "grad_norm": 1.7119625806808472, + "learning_rate": 6.861178167833096e-06, + "loss": 0.3495, + "step": 11180 + }, + { + "epoch": 1.8426339515448427, + "grad_norm": 1.365484356880188, + "learning_rate": 6.844304330200728e-06, + "loss": 0.3353, + "step": 11190 + }, + { + "epoch": 1.8442806858648444, + "grad_norm": 2.2884132862091064, + "learning_rate": 6.827440465511414e-06, + "loss": 0.331, + "step": 11200 + }, + { + "epoch": 1.845927420184846, + "grad_norm": 1.5407003164291382, + "learning_rate": 6.810586627060019e-06, + "loss": 0.3182, + "step": 11210 + }, + { + "epoch": 1.8475741545048474, + "grad_norm": 1.7855136394500732, + "learning_rate": 6.793742868109709e-06, + "loss": 0.3248, + "step": 11220 + }, + { + "epoch": 1.8492208888248491, + "grad_norm": 1.5685489177703857, + "learning_rate": 6.776909241891809e-06, + "loss": 0.3332, + "step": 11230 + }, + { + "epoch": 1.8508676231448509, + "grad_norm": 2.068676233291626, + "learning_rate": 6.76008580160562e-06, + "loss": 0.3421, + "step": 11240 + }, + { + "epoch": 1.8525143574648526, + "grad_norm": 1.9478691816329956, + "learning_rate": 6.743272600418246e-06, + "loss": 0.3194, + "step": 11250 + }, + { + "epoch": 1.854161091784854, + "grad_norm": 1.7436619997024536, + "learning_rate": 6.726469691464439e-06, + "loss": 0.3261, + "step": 11260 + }, + { + "epoch": 1.8558078261048558, + "grad_norm": 1.7236602306365967, + "learning_rate": 6.70967712784642e-06, + "loss": 0.3115, + "step": 11270 + }, + { + "epoch": 1.8574545604248573, + "grad_norm": 1.4117182493209839, + "learning_rate": 6.692894962633722e-06, + "loss": 0.3181, + "step": 11280 + }, + { + "epoch": 1.859101294744859, + "grad_norm": 1.5423839092254639, + "learning_rate": 6.6761232488630046e-06, + "loss": 0.3137, + "step": 11290 + }, + { + "epoch": 1.8607480290648608, + "grad_norm": 1.5166431665420532, + "learning_rate": 6.659362039537907e-06, + "loss": 0.3257, + "step": 11300 + }, + { + "epoch": 1.8623947633848625, + "grad_norm": 1.261831521987915, + "learning_rate": 6.6426113876288665e-06, + "loss": 0.3134, + "step": 11310 + }, + { + "epoch": 1.864041497704864, + "grad_norm": 1.5698233842849731, + "learning_rate": 6.6258713460729604e-06, + "loss": 0.3269, + "step": 11320 + }, + { + "epoch": 1.8656882320248656, + "grad_norm": 1.4800500869750977, + "learning_rate": 6.609141967773733e-06, + "loss": 0.3393, + "step": 11330 + }, + { + "epoch": 1.8673349663448673, + "grad_norm": 1.6218483448028564, + "learning_rate": 6.592423305601025e-06, + "loss": 0.3542, + "step": 11340 + }, + { + "epoch": 1.868981700664869, + "grad_norm": 1.5717345476150513, + "learning_rate": 6.5757154123908185e-06, + "loss": 0.3347, + "step": 11350 + }, + { + "epoch": 1.8706284349848707, + "grad_norm": 2.0220766067504883, + "learning_rate": 6.559018340945051e-06, + "loss": 0.3264, + "step": 11360 + }, + { + "epoch": 1.8722751693048723, + "grad_norm": 1.756557822227478, + "learning_rate": 6.542332144031471e-06, + "loss": 0.3721, + "step": 11370 + }, + { + "epoch": 1.8739219036248738, + "grad_norm": 1.273116946220398, + "learning_rate": 6.525656874383456e-06, + "loss": 0.3272, + "step": 11380 + }, + { + "epoch": 1.8755686379448755, + "grad_norm": 1.276261568069458, + "learning_rate": 6.508992584699849e-06, + "loss": 0.3009, + "step": 11390 + }, + { + "epoch": 1.8772153722648772, + "grad_norm": 1.6424288749694824, + "learning_rate": 6.492339327644797e-06, + "loss": 0.3341, + "step": 11400 + }, + { + "epoch": 1.878862106584879, + "grad_norm": 1.6955491304397583, + "learning_rate": 6.4756971558475755e-06, + "loss": 0.3335, + "step": 11410 + }, + { + "epoch": 1.8805088409048805, + "grad_norm": 1.661267638206482, + "learning_rate": 6.459066121902433e-06, + "loss": 0.3396, + "step": 11420 + }, + { + "epoch": 1.8821555752248822, + "grad_norm": 1.469783067703247, + "learning_rate": 6.442446278368411e-06, + "loss": 0.2996, + "step": 11430 + }, + { + "epoch": 1.8838023095448837, + "grad_norm": 1.665208101272583, + "learning_rate": 6.425837677769191e-06, + "loss": 0.3379, + "step": 11440 + }, + { + "epoch": 1.8854490438648854, + "grad_norm": 1.6089602708816528, + "learning_rate": 6.409240372592926e-06, + "loss": 0.3355, + "step": 11450 + }, + { + "epoch": 1.8870957781848872, + "grad_norm": 1.9932987689971924, + "learning_rate": 6.392654415292068e-06, + "loss": 0.3335, + "step": 11460 + }, + { + "epoch": 1.888742512504889, + "grad_norm": 1.4461263418197632, + "learning_rate": 6.3760798582832065e-06, + "loss": 0.327, + "step": 11470 + }, + { + "epoch": 1.8903892468248904, + "grad_norm": 1.5308934450149536, + "learning_rate": 6.359516753946905e-06, + "loss": 0.3283, + "step": 11480 + }, + { + "epoch": 1.892035981144892, + "grad_norm": 1.4926317930221558, + "learning_rate": 6.342965154627534e-06, + "loss": 0.3308, + "step": 11490 + }, + { + "epoch": 1.8936827154648936, + "grad_norm": 1.491185188293457, + "learning_rate": 6.326425112633097e-06, + "loss": 0.3179, + "step": 11500 + }, + { + "epoch": 1.8953294497848954, + "grad_norm": 1.4022414684295654, + "learning_rate": 6.309896680235082e-06, + "loss": 0.331, + "step": 11510 + }, + { + "epoch": 1.896976184104897, + "grad_norm": 1.5799373388290405, + "learning_rate": 6.293379909668282e-06, + "loss": 0.3099, + "step": 11520 + }, + { + "epoch": 1.8986229184248986, + "grad_norm": 1.7698333263397217, + "learning_rate": 6.276874853130639e-06, + "loss": 0.3459, + "step": 11530 + }, + { + "epoch": 1.9002696527449001, + "grad_norm": 1.4904412031173706, + "learning_rate": 6.2603815627830685e-06, + "loss": 0.3258, + "step": 11540 + }, + { + "epoch": 1.9019163870649018, + "grad_norm": 1.5331462621688843, + "learning_rate": 6.2439000907493105e-06, + "loss": 0.3347, + "step": 11550 + }, + { + "epoch": 1.9035631213849036, + "grad_norm": 1.8723499774932861, + "learning_rate": 6.227430489115751e-06, + "loss": 0.3369, + "step": 11560 + }, + { + "epoch": 1.9052098557049053, + "grad_norm": 1.607606291770935, + "learning_rate": 6.210972809931257e-06, + "loss": 0.3183, + "step": 11570 + }, + { + "epoch": 1.9068565900249068, + "grad_norm": 1.2479761838912964, + "learning_rate": 6.194527105207024e-06, + "loss": 0.3357, + "step": 11580 + }, + { + "epoch": 1.9085033243449085, + "grad_norm": 1.7267532348632812, + "learning_rate": 6.178093426916403e-06, + "loss": 0.3089, + "step": 11590 + }, + { + "epoch": 1.91015005866491, + "grad_norm": 1.616453766822815, + "learning_rate": 6.161671826994739e-06, + "loss": 0.3416, + "step": 11600 + }, + { + "epoch": 1.9117967929849118, + "grad_norm": 1.7664293050765991, + "learning_rate": 6.1452623573392e-06, + "loss": 0.3323, + "step": 11610 + }, + { + "epoch": 1.9134435273049135, + "grad_norm": 1.3341567516326904, + "learning_rate": 6.128865069808625e-06, + "loss": 0.3194, + "step": 11620 + }, + { + "epoch": 1.9150902616249152, + "grad_norm": 1.6267433166503906, + "learning_rate": 6.112480016223352e-06, + "loss": 0.321, + "step": 11630 + }, + { + "epoch": 1.9167369959449168, + "grad_norm": 1.7511682510375977, + "learning_rate": 6.0961072483650526e-06, + "loss": 0.3216, + "step": 11640 + }, + { + "epoch": 1.9183837302649183, + "grad_norm": 1.8487275838851929, + "learning_rate": 6.0797468179765785e-06, + "loss": 0.3226, + "step": 11650 + }, + { + "epoch": 1.92003046458492, + "grad_norm": 1.737802267074585, + "learning_rate": 6.063398776761785e-06, + "loss": 0.3123, + "step": 11660 + }, + { + "epoch": 1.9216771989049217, + "grad_norm": 1.4471075534820557, + "learning_rate": 6.047063176385378e-06, + "loss": 0.2977, + "step": 11670 + }, + { + "epoch": 1.9233239332249235, + "grad_norm": 1.8854256868362427, + "learning_rate": 6.030740068472745e-06, + "loss": 0.3323, + "step": 11680 + }, + { + "epoch": 1.924970667544925, + "grad_norm": 1.4351447820663452, + "learning_rate": 6.014429504609796e-06, + "loss": 0.3164, + "step": 11690 + }, + { + "epoch": 1.9266174018649265, + "grad_norm": 1.6448378562927246, + "learning_rate": 5.998131536342792e-06, + "loss": 0.3268, + "step": 11700 + }, + { + "epoch": 1.9282641361849282, + "grad_norm": 1.5195516347885132, + "learning_rate": 5.981846215178191e-06, + "loss": 0.3031, + "step": 11710 + }, + { + "epoch": 1.92991087050493, + "grad_norm": 1.6448338031768799, + "learning_rate": 5.965573592582488e-06, + "loss": 0.3234, + "step": 11720 + }, + { + "epoch": 1.9315576048249317, + "grad_norm": 1.4545010328292847, + "learning_rate": 5.9493137199820376e-06, + "loss": 0.3354, + "step": 11730 + }, + { + "epoch": 1.9332043391449332, + "grad_norm": 1.2834254503250122, + "learning_rate": 5.933066648762907e-06, + "loss": 0.3229, + "step": 11740 + }, + { + "epoch": 1.934851073464935, + "grad_norm": 1.7601953744888306, + "learning_rate": 5.916832430270705e-06, + "loss": 0.3112, + "step": 11750 + }, + { + "epoch": 1.9364978077849364, + "grad_norm": 1.315958857536316, + "learning_rate": 5.900611115810423e-06, + "loss": 0.3119, + "step": 11760 + }, + { + "epoch": 1.9381445421049381, + "grad_norm": 2.057389259338379, + "learning_rate": 5.884402756646273e-06, + "loss": 0.3206, + "step": 11770 + }, + { + "epoch": 1.9397912764249399, + "grad_norm": 2.368434190750122, + "learning_rate": 5.868207404001518e-06, + "loss": 0.3211, + "step": 11780 + }, + { + "epoch": 1.9414380107449416, + "grad_norm": 1.6587117910385132, + "learning_rate": 5.852025109058321e-06, + "loss": 0.3078, + "step": 11790 + }, + { + "epoch": 1.943084745064943, + "grad_norm": 2.1465649604797363, + "learning_rate": 5.835855922957583e-06, + "loss": 0.3168, + "step": 11800 + }, + { + "epoch": 1.9447314793849446, + "grad_norm": 2.077082395553589, + "learning_rate": 5.819699896798765e-06, + "loss": 0.3214, + "step": 11810 + }, + { + "epoch": 1.9463782137049463, + "grad_norm": 1.8317384719848633, + "learning_rate": 5.803557081639757e-06, + "loss": 0.3085, + "step": 11820 + }, + { + "epoch": 1.948024948024948, + "grad_norm": 1.5140388011932373, + "learning_rate": 5.787427528496676e-06, + "loss": 0.3213, + "step": 11830 + }, + { + "epoch": 1.9496716823449498, + "grad_norm": 1.7246615886688232, + "learning_rate": 5.771311288343748e-06, + "loss": 0.3098, + "step": 11840 + }, + { + "epoch": 1.9513184166649513, + "grad_norm": 2.0293779373168945, + "learning_rate": 5.755208412113116e-06, + "loss": 0.3206, + "step": 11850 + }, + { + "epoch": 1.9529651509849528, + "grad_norm": 1.6586174964904785, + "learning_rate": 5.739118950694684e-06, + "loss": 0.3122, + "step": 11860 + }, + { + "epoch": 1.9546118853049546, + "grad_norm": 1.7157447338104248, + "learning_rate": 5.723042954935968e-06, + "loss": 0.2913, + "step": 11870 + }, + { + "epoch": 1.9562586196249563, + "grad_norm": 1.383748173713684, + "learning_rate": 5.7069804756419326e-06, + "loss": 0.3248, + "step": 11880 + }, + { + "epoch": 1.957905353944958, + "grad_norm": 2.3896443843841553, + "learning_rate": 5.690931563574813e-06, + "loss": 0.3112, + "step": 11890 + }, + { + "epoch": 1.9595520882649595, + "grad_norm": 1.9655721187591553, + "learning_rate": 5.6748962694539855e-06, + "loss": 0.3181, + "step": 11900 + }, + { + "epoch": 1.961198822584961, + "grad_norm": 1.6184086799621582, + "learning_rate": 5.6588746439557706e-06, + "loss": 0.3272, + "step": 11910 + }, + { + "epoch": 1.9628455569049628, + "grad_norm": 1.5995012521743774, + "learning_rate": 5.642866737713311e-06, + "loss": 0.3039, + "step": 11920 + }, + { + "epoch": 1.9644922912249645, + "grad_norm": 1.8000292778015137, + "learning_rate": 5.6268726013163764e-06, + "loss": 0.3061, + "step": 11930 + }, + { + "epoch": 1.9661390255449662, + "grad_norm": 1.651097059249878, + "learning_rate": 5.610892285311229e-06, + "loss": 0.3, + "step": 11940 + }, + { + "epoch": 1.9677857598649677, + "grad_norm": 1.8784924745559692, + "learning_rate": 5.5949258402004446e-06, + "loss": 0.3261, + "step": 11950 + }, + { + "epoch": 1.9694324941849695, + "grad_norm": 1.6601523160934448, + "learning_rate": 5.578973316442779e-06, + "loss": 0.3081, + "step": 11960 + }, + { + "epoch": 1.971079228504971, + "grad_norm": 1.904227614402771, + "learning_rate": 5.563034764452976e-06, + "loss": 0.3211, + "step": 11970 + }, + { + "epoch": 1.9727259628249727, + "grad_norm": 1.6444785594940186, + "learning_rate": 5.5471102346016385e-06, + "loss": 0.3315, + "step": 11980 + }, + { + "epoch": 1.9743726971449744, + "grad_norm": 1.8297940492630005, + "learning_rate": 5.531199777215044e-06, + "loss": 0.2992, + "step": 11990 + }, + { + "epoch": 1.9760194314649762, + "grad_norm": 1.717757225036621, + "learning_rate": 5.515303442574997e-06, + "loss": 0.3323, + "step": 12000 + }, + { + "epoch": 1.9776661657849777, + "grad_norm": 1.6197963953018188, + "learning_rate": 5.499421280918682e-06, + "loss": 0.3401, + "step": 12010 + }, + { + "epoch": 1.9793129001049792, + "grad_norm": 1.5700541734695435, + "learning_rate": 5.4835533424384825e-06, + "loss": 0.3204, + "step": 12020 + }, + { + "epoch": 1.980959634424981, + "grad_norm": 1.5739840269088745, + "learning_rate": 5.467699677281828e-06, + "loss": 0.3115, + "step": 12030 + }, + { + "epoch": 1.9826063687449826, + "grad_norm": 1.8662614822387695, + "learning_rate": 5.451860335551056e-06, + "loss": 0.3207, + "step": 12040 + }, + { + "epoch": 1.9842531030649844, + "grad_norm": 1.725319504737854, + "learning_rate": 5.4360353673032185e-06, + "loss": 0.3038, + "step": 12050 + }, + { + "epoch": 1.9858998373849859, + "grad_norm": 1.7426807880401611, + "learning_rate": 5.420224822549963e-06, + "loss": 0.3012, + "step": 12060 + }, + { + "epoch": 1.9875465717049874, + "grad_norm": 1.5318362712860107, + "learning_rate": 5.404428751257339e-06, + "loss": 0.3037, + "step": 12070 + }, + { + "epoch": 1.9891933060249891, + "grad_norm": 1.514984369277954, + "learning_rate": 5.388647203345659e-06, + "loss": 0.3329, + "step": 12080 + }, + { + "epoch": 1.9908400403449908, + "grad_norm": 1.3293901681900024, + "learning_rate": 5.372880228689341e-06, + "loss": 0.3109, + "step": 12090 + }, + { + "epoch": 1.9924867746649926, + "grad_norm": 1.5703567266464233, + "learning_rate": 5.357127877116743e-06, + "loss": 0.3262, + "step": 12100 + }, + { + "epoch": 1.994133508984994, + "grad_norm": 1.395535945892334, + "learning_rate": 5.3413901984100195e-06, + "loss": 0.3314, + "step": 12110 + }, + { + "epoch": 1.9957802433049958, + "grad_norm": 1.7006887197494507, + "learning_rate": 5.3256672423049396e-06, + "loss": 0.3132, + "step": 12120 + }, + { + "epoch": 1.9974269776249973, + "grad_norm": 1.6310688257217407, + "learning_rate": 5.309959058490754e-06, + "loss": 0.2896, + "step": 12130 + }, + { + "epoch": 1.999073711944999, + "grad_norm": 1.698585033416748, + "learning_rate": 5.294265696610022e-06, + "loss": 0.3006, + "step": 12140 + }, + { + "epoch": 2.0008233671600006, + "grad_norm": 1.5965461730957031, + "learning_rate": 5.2785872062584705e-06, + "loss": 0.3434, + "step": 12150 + }, + { + "epoch": 2.0024701014800024, + "grad_norm": 1.6904765367507935, + "learning_rate": 5.262923636984818e-06, + "loss": 0.2823, + "step": 12160 + }, + { + "epoch": 2.004116835800004, + "grad_norm": 1.2032884359359741, + "learning_rate": 5.24727503829064e-06, + "loss": 0.2712, + "step": 12170 + }, + { + "epoch": 2.005763570120006, + "grad_norm": 1.4353348016738892, + "learning_rate": 5.2316414596301855e-06, + "loss": 0.285, + "step": 12180 + }, + { + "epoch": 2.0074103044400076, + "grad_norm": 1.5126605033874512, + "learning_rate": 5.216022950410251e-06, + "loss": 0.2763, + "step": 12190 + }, + { + "epoch": 2.009057038760009, + "grad_norm": 2.8418431282043457, + "learning_rate": 5.2004195599899966e-06, + "loss": 0.2863, + "step": 12200 + }, + { + "epoch": 2.0107037730800106, + "grad_norm": 2.0845654010772705, + "learning_rate": 5.1848313376808065e-06, + "loss": 0.29, + "step": 12210 + }, + { + "epoch": 2.0123505074000123, + "grad_norm": 1.393932580947876, + "learning_rate": 5.16925833274613e-06, + "loss": 0.2904, + "step": 12220 + }, + { + "epoch": 2.013997241720014, + "grad_norm": 1.4617998600006104, + "learning_rate": 5.153700594401328e-06, + "loss": 0.2663, + "step": 12230 + }, + { + "epoch": 2.0156439760400158, + "grad_norm": 1.865277886390686, + "learning_rate": 5.138158171813507e-06, + "loss": 0.2876, + "step": 12240 + }, + { + "epoch": 2.0172907103600175, + "grad_norm": 1.4628126621246338, + "learning_rate": 5.12263111410138e-06, + "loss": 0.2608, + "step": 12250 + }, + { + "epoch": 2.018937444680019, + "grad_norm": 1.8890234231948853, + "learning_rate": 5.107119470335093e-06, + "loss": 0.2869, + "step": 12260 + }, + { + "epoch": 2.0205841790000205, + "grad_norm": 2.153687000274658, + "learning_rate": 5.091623289536095e-06, + "loss": 0.2778, + "step": 12270 + }, + { + "epoch": 2.0222309133200222, + "grad_norm": 1.9289218187332153, + "learning_rate": 5.076142620676941e-06, + "loss": 0.3025, + "step": 12280 + }, + { + "epoch": 2.023877647640024, + "grad_norm": 1.6347593069076538, + "learning_rate": 5.060677512681187e-06, + "loss": 0.2692, + "step": 12290 + }, + { + "epoch": 2.0255243819600257, + "grad_norm": 2.1839990615844727, + "learning_rate": 5.045228014423203e-06, + "loss": 0.2843, + "step": 12300 + }, + { + "epoch": 2.027171116280027, + "grad_norm": 2.4028539657592773, + "learning_rate": 5.029794174728031e-06, + "loss": 0.2798, + "step": 12310 + }, + { + "epoch": 2.0288178506000287, + "grad_norm": 2.70993709564209, + "learning_rate": 5.014376042371221e-06, + "loss": 0.2773, + "step": 12320 + }, + { + "epoch": 2.0304645849200305, + "grad_norm": 1.8589779138565063, + "learning_rate": 4.998973666078692e-06, + "loss": 0.2775, + "step": 12330 + }, + { + "epoch": 2.032111319240032, + "grad_norm": 1.971356749534607, + "learning_rate": 4.983587094526556e-06, + "loss": 0.2776, + "step": 12340 + }, + { + "epoch": 2.033758053560034, + "grad_norm": 1.5491329431533813, + "learning_rate": 4.9682163763410005e-06, + "loss": 0.2729, + "step": 12350 + }, + { + "epoch": 2.035404787880035, + "grad_norm": 1.7983886003494263, + "learning_rate": 4.952861560098079e-06, + "loss": 0.2835, + "step": 12360 + }, + { + "epoch": 2.037051522200037, + "grad_norm": 2.259243965148926, + "learning_rate": 4.937522694323618e-06, + "loss": 0.3009, + "step": 12370 + }, + { + "epoch": 2.0386982565200387, + "grad_norm": 2.2802488803863525, + "learning_rate": 4.922199827493022e-06, + "loss": 0.2921, + "step": 12380 + }, + { + "epoch": 2.0403449908400404, + "grad_norm": 1.9333369731903076, + "learning_rate": 4.906893008031141e-06, + "loss": 0.2746, + "step": 12390 + }, + { + "epoch": 2.041991725160042, + "grad_norm": 1.7847884893417358, + "learning_rate": 4.8916022843121e-06, + "loss": 0.2816, + "step": 12400 + }, + { + "epoch": 2.043638459480044, + "grad_norm": 1.5014792680740356, + "learning_rate": 4.876327704659172e-06, + "loss": 0.2897, + "step": 12410 + }, + { + "epoch": 2.045285193800045, + "grad_norm": 2.1002280712127686, + "learning_rate": 4.861069317344598e-06, + "loss": 0.2744, + "step": 12420 + }, + { + "epoch": 2.046931928120047, + "grad_norm": 1.5581461191177368, + "learning_rate": 4.845827170589449e-06, + "loss": 0.2886, + "step": 12430 + }, + { + "epoch": 2.0485786624400486, + "grad_norm": 1.6326346397399902, + "learning_rate": 4.830601312563469e-06, + "loss": 0.2859, + "step": 12440 + }, + { + "epoch": 2.0502253967600503, + "grad_norm": 1.5935137271881104, + "learning_rate": 4.815391791384933e-06, + "loss": 0.2932, + "step": 12450 + }, + { + "epoch": 2.051872131080052, + "grad_norm": 1.5060147047042847, + "learning_rate": 4.800198655120478e-06, + "loss": 0.2899, + "step": 12460 + }, + { + "epoch": 2.0535188654000534, + "grad_norm": 1.7694473266601562, + "learning_rate": 4.785021951784967e-06, + "loss": 0.2852, + "step": 12470 + }, + { + "epoch": 2.055165599720055, + "grad_norm": 1.579512596130371, + "learning_rate": 4.76986172934132e-06, + "loss": 0.2837, + "step": 12480 + }, + { + "epoch": 2.056812334040057, + "grad_norm": 1.4137424230575562, + "learning_rate": 4.7547180357003885e-06, + "loss": 0.2793, + "step": 12490 + }, + { + "epoch": 2.0584590683600585, + "grad_norm": 1.5642904043197632, + "learning_rate": 4.739590918720765e-06, + "loss": 0.2766, + "step": 12500 + }, + { + "epoch": 2.0601058026800603, + "grad_norm": 2.033942937850952, + "learning_rate": 4.724480426208678e-06, + "loss": 0.3038, + "step": 12510 + }, + { + "epoch": 2.0617525370000616, + "grad_norm": 1.408429503440857, + "learning_rate": 4.709386605917798e-06, + "loss": 0.2747, + "step": 12520 + }, + { + "epoch": 2.0633992713200633, + "grad_norm": 1.467748761177063, + "learning_rate": 4.694309505549128e-06, + "loss": 0.2833, + "step": 12530 + }, + { + "epoch": 2.065046005640065, + "grad_norm": 2.374821662902832, + "learning_rate": 4.6792491727508076e-06, + "loss": 0.2828, + "step": 12540 + }, + { + "epoch": 2.0666927399600667, + "grad_norm": 2.1544690132141113, + "learning_rate": 4.664205655118006e-06, + "loss": 0.2872, + "step": 12550 + }, + { + "epoch": 2.0683394742800685, + "grad_norm": 1.5849260091781616, + "learning_rate": 4.6491790001927385e-06, + "loss": 0.2748, + "step": 12560 + }, + { + "epoch": 2.0699862086000698, + "grad_norm": 1.403977394104004, + "learning_rate": 4.634169255463734e-06, + "loss": 0.2899, + "step": 12570 + }, + { + "epoch": 2.0716329429200715, + "grad_norm": 1.8115674257278442, + "learning_rate": 4.619176468366274e-06, + "loss": 0.2723, + "step": 12580 + }, + { + "epoch": 2.0732796772400732, + "grad_norm": 1.3875244855880737, + "learning_rate": 4.604200686282063e-06, + "loss": 0.2661, + "step": 12590 + }, + { + "epoch": 2.074926411560075, + "grad_norm": 2.000905990600586, + "learning_rate": 4.5892419565390486e-06, + "loss": 0.2808, + "step": 12600 + }, + { + "epoch": 2.0765731458800767, + "grad_norm": 2.43119740486145, + "learning_rate": 4.5743003264113015e-06, + "loss": 0.2826, + "step": 12610 + }, + { + "epoch": 2.0782198802000784, + "grad_norm": 3.4667115211486816, + "learning_rate": 4.559375843118839e-06, + "loss": 0.2803, + "step": 12620 + }, + { + "epoch": 2.0798666145200797, + "grad_norm": 1.7227445840835571, + "learning_rate": 4.544468553827508e-06, + "loss": 0.2836, + "step": 12630 + }, + { + "epoch": 2.0815133488400814, + "grad_norm": 1.7388190031051636, + "learning_rate": 4.529578505648789e-06, + "loss": 0.288, + "step": 12640 + }, + { + "epoch": 2.083160083160083, + "grad_norm": 1.63683021068573, + "learning_rate": 4.514705745639706e-06, + "loss": 0.2896, + "step": 12650 + }, + { + "epoch": 2.084806817480085, + "grad_norm": 2.0901145935058594, + "learning_rate": 4.499850320802623e-06, + "loss": 0.2793, + "step": 12660 + }, + { + "epoch": 2.0864535518000866, + "grad_norm": 2.0766639709472656, + "learning_rate": 4.485012278085139e-06, + "loss": 0.2731, + "step": 12670 + }, + { + "epoch": 2.088100286120088, + "grad_norm": 1.5726747512817383, + "learning_rate": 4.470191664379903e-06, + "loss": 0.2751, + "step": 12680 + }, + { + "epoch": 2.0897470204400896, + "grad_norm": 2.0181379318237305, + "learning_rate": 4.455388526524498e-06, + "loss": 0.2992, + "step": 12690 + }, + { + "epoch": 2.0913937547600914, + "grad_norm": 2.143312692642212, + "learning_rate": 4.440602911301267e-06, + "loss": 0.2939, + "step": 12700 + }, + { + "epoch": 2.093040489080093, + "grad_norm": 1.3811988830566406, + "learning_rate": 4.425834865437184e-06, + "loss": 0.2795, + "step": 12710 + }, + { + "epoch": 2.094687223400095, + "grad_norm": 1.4296287298202515, + "learning_rate": 4.411084435603688e-06, + "loss": 0.2831, + "step": 12720 + }, + { + "epoch": 2.096333957720096, + "grad_norm": 1.4301252365112305, + "learning_rate": 4.396351668416562e-06, + "loss": 0.2963, + "step": 12730 + }, + { + "epoch": 2.097980692040098, + "grad_norm": 2.8089404106140137, + "learning_rate": 4.3816366104357545e-06, + "loss": 0.2935, + "step": 12740 + }, + { + "epoch": 2.0996274263600996, + "grad_norm": 1.936610460281372, + "learning_rate": 4.366939308165259e-06, + "loss": 0.3008, + "step": 12750 + }, + { + "epoch": 2.1012741606801013, + "grad_norm": 1.634929895401001, + "learning_rate": 4.352259808052944e-06, + "loss": 0.2853, + "step": 12760 + }, + { + "epoch": 2.102920895000103, + "grad_norm": 1.791841745376587, + "learning_rate": 4.337598156490435e-06, + "loss": 0.2583, + "step": 12770 + }, + { + "epoch": 2.1045676293201048, + "grad_norm": 1.6003674268722534, + "learning_rate": 4.322954399812932e-06, + "loss": 0.2787, + "step": 12780 + }, + { + "epoch": 2.106214363640106, + "grad_norm": 1.8749767541885376, + "learning_rate": 4.308328584299092e-06, + "loss": 0.2829, + "step": 12790 + }, + { + "epoch": 2.107861097960108, + "grad_norm": 1.6271084547042847, + "learning_rate": 4.293720756170868e-06, + "loss": 0.2713, + "step": 12800 + }, + { + "epoch": 2.1095078322801095, + "grad_norm": 1.9377316236495972, + "learning_rate": 4.2791309615933764e-06, + "loss": 0.2913, + "step": 12810 + }, + { + "epoch": 2.1111545666001112, + "grad_norm": 1.353582501411438, + "learning_rate": 4.264559246674727e-06, + "loss": 0.2805, + "step": 12820 + }, + { + "epoch": 2.112801300920113, + "grad_norm": 1.5382156372070312, + "learning_rate": 4.2500056574659135e-06, + "loss": 0.2853, + "step": 12830 + }, + { + "epoch": 2.1144480352401143, + "grad_norm": 1.6927893161773682, + "learning_rate": 4.235470239960623e-06, + "loss": 0.2789, + "step": 12840 + }, + { + "epoch": 2.116094769560116, + "grad_norm": 1.7223583459854126, + "learning_rate": 4.2209530400951335e-06, + "loss": 0.2786, + "step": 12850 + }, + { + "epoch": 2.1177415038801177, + "grad_norm": 1.528069257736206, + "learning_rate": 4.206454103748142e-06, + "loss": 0.2854, + "step": 12860 + }, + { + "epoch": 2.1193882382001195, + "grad_norm": 1.9811831712722778, + "learning_rate": 4.191973476740628e-06, + "loss": 0.2687, + "step": 12870 + }, + { + "epoch": 2.121034972520121, + "grad_norm": 1.6877635717391968, + "learning_rate": 4.1775112048357e-06, + "loss": 0.2725, + "step": 12880 + }, + { + "epoch": 2.1226817068401225, + "grad_norm": 1.6502715349197388, + "learning_rate": 4.163067333738479e-06, + "loss": 0.2712, + "step": 12890 + }, + { + "epoch": 2.124328441160124, + "grad_norm": 1.460304617881775, + "learning_rate": 4.148641909095911e-06, + "loss": 0.2653, + "step": 12900 + }, + { + "epoch": 2.125975175480126, + "grad_norm": 1.7706825733184814, + "learning_rate": 4.134234976496666e-06, + "loss": 0.2853, + "step": 12910 + }, + { + "epoch": 2.1276219098001277, + "grad_norm": 1.4802557229995728, + "learning_rate": 4.11984658147096e-06, + "loss": 0.2824, + "step": 12920 + }, + { + "epoch": 2.1292686441201294, + "grad_norm": 1.5747597217559814, + "learning_rate": 4.105476769490424e-06, + "loss": 0.268, + "step": 12930 + }, + { + "epoch": 2.1309153784401307, + "grad_norm": 2.0587172508239746, + "learning_rate": 4.091125585967975e-06, + "loss": 0.2962, + "step": 12940 + }, + { + "epoch": 2.1325621127601324, + "grad_norm": 1.5220030546188354, + "learning_rate": 4.0767930762576415e-06, + "loss": 0.2717, + "step": 12950 + }, + { + "epoch": 2.134208847080134, + "grad_norm": 1.6755307912826538, + "learning_rate": 4.0624792856544505e-06, + "loss": 0.2795, + "step": 12960 + }, + { + "epoch": 2.135855581400136, + "grad_norm": 1.359605073928833, + "learning_rate": 4.0481842593942636e-06, + "loss": 0.2689, + "step": 12970 + }, + { + "epoch": 2.1375023157201376, + "grad_norm": 1.5714139938354492, + "learning_rate": 4.033908042653639e-06, + "loss": 0.2772, + "step": 12980 + }, + { + "epoch": 2.1391490500401393, + "grad_norm": 1.4556095600128174, + "learning_rate": 4.019650680549704e-06, + "loss": 0.276, + "step": 12990 + }, + { + "epoch": 2.1407957843601406, + "grad_norm": 1.8429503440856934, + "learning_rate": 4.005412218139986e-06, + "loss": 0.2829, + "step": 13000 + }, + { + "epoch": 2.1424425186801423, + "grad_norm": 1.6822142601013184, + "learning_rate": 3.991192700422286e-06, + "loss": 0.2829, + "step": 13010 + }, + { + "epoch": 2.144089253000144, + "grad_norm": 1.859412670135498, + "learning_rate": 3.976992172334544e-06, + "loss": 0.2764, + "step": 13020 + }, + { + "epoch": 2.145735987320146, + "grad_norm": 1.6306754350662231, + "learning_rate": 3.962810678754674e-06, + "loss": 0.2731, + "step": 13030 + }, + { + "epoch": 2.1473827216401475, + "grad_norm": 1.5259572267532349, + "learning_rate": 3.948648264500445e-06, + "loss": 0.2608, + "step": 13040 + }, + { + "epoch": 2.149029455960149, + "grad_norm": 1.6222518682479858, + "learning_rate": 3.934504974329326e-06, + "loss": 0.299, + "step": 13050 + }, + { + "epoch": 2.1506761902801506, + "grad_norm": 1.880744457244873, + "learning_rate": 3.920380852938348e-06, + "loss": 0.2744, + "step": 13060 + }, + { + "epoch": 2.1523229246001523, + "grad_norm": 1.6208007335662842, + "learning_rate": 3.906275944963957e-06, + "loss": 0.2817, + "step": 13070 + }, + { + "epoch": 2.153969658920154, + "grad_norm": 1.4789073467254639, + "learning_rate": 3.892190294981893e-06, + "loss": 0.2813, + "step": 13080 + }, + { + "epoch": 2.1556163932401557, + "grad_norm": 1.7686960697174072, + "learning_rate": 3.8781239475070194e-06, + "loss": 0.2651, + "step": 13090 + }, + { + "epoch": 2.1572631275601575, + "grad_norm": 1.6891413927078247, + "learning_rate": 3.864076946993215e-06, + "loss": 0.2503, + "step": 13100 + }, + { + "epoch": 2.1589098618801588, + "grad_norm": 1.6192152500152588, + "learning_rate": 3.850049337833196e-06, + "loss": 0.2693, + "step": 13110 + }, + { + "epoch": 2.1605565962001605, + "grad_norm": 1.6435240507125854, + "learning_rate": 3.836041164358416e-06, + "loss": 0.2769, + "step": 13120 + }, + { + "epoch": 2.1622033305201622, + "grad_norm": 1.7345331907272339, + "learning_rate": 3.822052470838893e-06, + "loss": 0.2804, + "step": 13130 + }, + { + "epoch": 2.163850064840164, + "grad_norm": 1.42905855178833, + "learning_rate": 3.8080833014830865e-06, + "loss": 0.2749, + "step": 13140 + }, + { + "epoch": 2.1654967991601657, + "grad_norm": 1.6597148180007935, + "learning_rate": 3.7941337004377497e-06, + "loss": 0.2748, + "step": 13150 + }, + { + "epoch": 2.167143533480167, + "grad_norm": 1.6788873672485352, + "learning_rate": 3.7802037117878053e-06, + "loss": 0.2832, + "step": 13160 + }, + { + "epoch": 2.1687902678001687, + "grad_norm": 1.761786699295044, + "learning_rate": 3.7662933795561805e-06, + "loss": 0.2943, + "step": 13170 + }, + { + "epoch": 2.1704370021201704, + "grad_norm": 1.6826972961425781, + "learning_rate": 3.7524027477036974e-06, + "loss": 0.2763, + "step": 13180 + }, + { + "epoch": 2.172083736440172, + "grad_norm": 1.4096064567565918, + "learning_rate": 3.7385318601289034e-06, + "loss": 0.2904, + "step": 13190 + }, + { + "epoch": 2.173730470760174, + "grad_norm": 2.226126194000244, + "learning_rate": 3.724680760667967e-06, + "loss": 0.2656, + "step": 13200 + }, + { + "epoch": 2.175377205080175, + "grad_norm": 1.3945817947387695, + "learning_rate": 3.7108494930944937e-06, + "loss": 0.2762, + "step": 13210 + }, + { + "epoch": 2.177023939400177, + "grad_norm": 1.428282618522644, + "learning_rate": 3.697038101119442e-06, + "loss": 0.2728, + "step": 13220 + }, + { + "epoch": 2.1786706737201786, + "grad_norm": 1.9224436283111572, + "learning_rate": 3.6832466283909386e-06, + "loss": 0.2577, + "step": 13230 + }, + { + "epoch": 2.1803174080401804, + "grad_norm": 2.1055612564086914, + "learning_rate": 3.6694751184941712e-06, + "loss": 0.2695, + "step": 13240 + }, + { + "epoch": 2.181964142360182, + "grad_norm": 2.1279563903808594, + "learning_rate": 3.6557236149512276e-06, + "loss": 0.2871, + "step": 13250 + }, + { + "epoch": 2.1836108766801834, + "grad_norm": 2.5193889141082764, + "learning_rate": 3.641992161220983e-06, + "loss": 0.2814, + "step": 13260 + }, + { + "epoch": 2.185257611000185, + "grad_norm": 1.3304851055145264, + "learning_rate": 3.628280800698939e-06, + "loss": 0.2704, + "step": 13270 + }, + { + "epoch": 2.186904345320187, + "grad_norm": 1.5844635963439941, + "learning_rate": 3.6145895767171e-06, + "loss": 0.2627, + "step": 13280 + }, + { + "epoch": 2.1885510796401886, + "grad_norm": 1.840064287185669, + "learning_rate": 3.6009185325438278e-06, + "loss": 0.2649, + "step": 13290 + }, + { + "epoch": 2.1901978139601903, + "grad_norm": 1.7065536975860596, + "learning_rate": 3.5872677113837227e-06, + "loss": 0.2494, + "step": 13300 + }, + { + "epoch": 2.191844548280192, + "grad_norm": 1.525583267211914, + "learning_rate": 3.5736371563774587e-06, + "loss": 0.2716, + "step": 13310 + }, + { + "epoch": 2.1934912826001933, + "grad_norm": 1.5239418745040894, + "learning_rate": 3.560026910601678e-06, + "loss": 0.272, + "step": 13320 + }, + { + "epoch": 2.195138016920195, + "grad_norm": 1.7140849828720093, + "learning_rate": 3.5464370170688244e-06, + "loss": 0.2802, + "step": 13330 + }, + { + "epoch": 2.196784751240197, + "grad_norm": 1.8460406064987183, + "learning_rate": 3.53286751872704e-06, + "loss": 0.2674, + "step": 13340 + }, + { + "epoch": 2.1984314855601985, + "grad_norm": 1.5761222839355469, + "learning_rate": 3.519318458459988e-06, + "loss": 0.2673, + "step": 13350 + }, + { + "epoch": 2.2000782198802002, + "grad_norm": 1.5991119146347046, + "learning_rate": 3.5057898790867673e-06, + "loss": 0.2904, + "step": 13360 + }, + { + "epoch": 2.2017249542002015, + "grad_norm": 1.726907730102539, + "learning_rate": 3.4922818233617295e-06, + "loss": 0.2729, + "step": 13370 + }, + { + "epoch": 2.2033716885202033, + "grad_norm": 1.3864545822143555, + "learning_rate": 3.4787943339743836e-06, + "loss": 0.2753, + "step": 13380 + }, + { + "epoch": 2.205018422840205, + "grad_norm": 2.468479871749878, + "learning_rate": 3.4653274535492255e-06, + "loss": 0.2777, + "step": 13390 + }, + { + "epoch": 2.2066651571602067, + "grad_norm": 1.603942632675171, + "learning_rate": 3.451881224645637e-06, + "loss": 0.2721, + "step": 13400 + }, + { + "epoch": 2.2083118914802085, + "grad_norm": 1.7777491807937622, + "learning_rate": 3.4384556897577183e-06, + "loss": 0.2949, + "step": 13410 + }, + { + "epoch": 2.20995862580021, + "grad_norm": 1.8029536008834839, + "learning_rate": 3.425050891314191e-06, + "loss": 0.272, + "step": 13420 + }, + { + "epoch": 2.2116053601202115, + "grad_norm": 1.74064302444458, + "learning_rate": 3.4116668716782164e-06, + "loss": 0.267, + "step": 13430 + }, + { + "epoch": 2.213252094440213, + "grad_norm": 1.70004403591156, + "learning_rate": 3.398303673147314e-06, + "loss": 0.2528, + "step": 13440 + }, + { + "epoch": 2.214898828760215, + "grad_norm": 1.407127022743225, + "learning_rate": 3.3849613379531865e-06, + "loss": 0.27, + "step": 13450 + }, + { + "epoch": 2.2165455630802167, + "grad_norm": 2.015048027038574, + "learning_rate": 3.371639908261611e-06, + "loss": 0.2576, + "step": 13460 + }, + { + "epoch": 2.2181922974002184, + "grad_norm": 2.1229333877563477, + "learning_rate": 3.35833942617229e-06, + "loss": 0.2779, + "step": 13470 + }, + { + "epoch": 2.2198390317202197, + "grad_norm": 1.634856939315796, + "learning_rate": 3.3450599337187326e-06, + "loss": 0.2564, + "step": 13480 + }, + { + "epoch": 2.2214857660402214, + "grad_norm": 1.7512359619140625, + "learning_rate": 3.3318014728681104e-06, + "loss": 0.2773, + "step": 13490 + }, + { + "epoch": 2.223132500360223, + "grad_norm": 1.8369240760803223, + "learning_rate": 3.3185640855211255e-06, + "loss": 0.2501, + "step": 13500 + }, + { + "epoch": 2.224779234680225, + "grad_norm": 2.120023012161255, + "learning_rate": 3.3053478135118845e-06, + "loss": 0.2836, + "step": 13510 + }, + { + "epoch": 2.2264259690002266, + "grad_norm": 1.684515118598938, + "learning_rate": 3.292152698607768e-06, + "loss": 0.2638, + "step": 13520 + }, + { + "epoch": 2.228072703320228, + "grad_norm": 1.8493366241455078, + "learning_rate": 3.2789787825092834e-06, + "loss": 0.2579, + "step": 13530 + }, + { + "epoch": 2.2297194376402296, + "grad_norm": 1.272315502166748, + "learning_rate": 3.2658261068499562e-06, + "loss": 0.2729, + "step": 13540 + }, + { + "epoch": 2.2313661719602313, + "grad_norm": 2.1729161739349365, + "learning_rate": 3.252694713196173e-06, + "loss": 0.275, + "step": 13550 + }, + { + "epoch": 2.233012906280233, + "grad_norm": 1.7917054891586304, + "learning_rate": 3.239584643047078e-06, + "loss": 0.2788, + "step": 13560 + }, + { + "epoch": 2.234659640600235, + "grad_norm": 1.5601428747177124, + "learning_rate": 3.2264959378344053e-06, + "loss": 0.2728, + "step": 13570 + }, + { + "epoch": 2.236306374920236, + "grad_norm": 1.8332723379135132, + "learning_rate": 3.2134286389223913e-06, + "loss": 0.2782, + "step": 13580 + }, + { + "epoch": 2.237953109240238, + "grad_norm": 1.5861481428146362, + "learning_rate": 3.2003827876076066e-06, + "loss": 0.2638, + "step": 13590 + }, + { + "epoch": 2.2395998435602396, + "grad_norm": 1.4836739301681519, + "learning_rate": 3.1873584251188527e-06, + "loss": 0.2454, + "step": 13600 + }, + { + "epoch": 2.2412465778802413, + "grad_norm": 1.721747636795044, + "learning_rate": 3.174355592617008e-06, + "loss": 0.2766, + "step": 13610 + }, + { + "epoch": 2.242893312200243, + "grad_norm": 2.278952121734619, + "learning_rate": 3.161374331194922e-06, + "loss": 0.2613, + "step": 13620 + }, + { + "epoch": 2.2445400465202447, + "grad_norm": 2.0035574436187744, + "learning_rate": 3.1484146818772644e-06, + "loss": 0.2816, + "step": 13630 + }, + { + "epoch": 2.246186780840246, + "grad_norm": 1.6324348449707031, + "learning_rate": 3.1354766856204066e-06, + "loss": 0.2613, + "step": 13640 + }, + { + "epoch": 2.2478335151602478, + "grad_norm": 2.2162437438964844, + "learning_rate": 3.1225603833122866e-06, + "loss": 0.2682, + "step": 13650 + }, + { + "epoch": 2.2494802494802495, + "grad_norm": 1.5400594472885132, + "learning_rate": 3.1096658157722936e-06, + "loss": 0.2776, + "step": 13660 + }, + { + "epoch": 2.2511269838002512, + "grad_norm": 2.152050495147705, + "learning_rate": 3.0967930237511144e-06, + "loss": 0.2573, + "step": 13670 + }, + { + "epoch": 2.252773718120253, + "grad_norm": 1.3941951990127563, + "learning_rate": 3.0839420479306325e-06, + "loss": 0.2738, + "step": 13680 + }, + { + "epoch": 2.2544204524402542, + "grad_norm": 1.7376865148544312, + "learning_rate": 3.071112928923773e-06, + "loss": 0.2625, + "step": 13690 + }, + { + "epoch": 2.256067186760256, + "grad_norm": 2.0723204612731934, + "learning_rate": 3.058305707274396e-06, + "loss": 0.2632, + "step": 13700 + }, + { + "epoch": 2.2577139210802577, + "grad_norm": 2.126526117324829, + "learning_rate": 3.0455204234571568e-06, + "loss": 0.2649, + "step": 13710 + }, + { + "epoch": 2.2593606554002594, + "grad_norm": 2.2294485569000244, + "learning_rate": 3.0327571178773772e-06, + "loss": 0.2506, + "step": 13720 + }, + { + "epoch": 2.261007389720261, + "grad_norm": 1.6885284185409546, + "learning_rate": 3.0200158308709217e-06, + "loss": 0.2715, + "step": 13730 + }, + { + "epoch": 2.262654124040263, + "grad_norm": 1.9802756309509277, + "learning_rate": 3.0072966027040785e-06, + "loss": 0.2647, + "step": 13740 + }, + { + "epoch": 2.264300858360264, + "grad_norm": 1.758371353149414, + "learning_rate": 2.9945994735734085e-06, + "loss": 0.2741, + "step": 13750 + }, + { + "epoch": 2.265947592680266, + "grad_norm": 1.7555452585220337, + "learning_rate": 2.9819244836056482e-06, + "loss": 0.2659, + "step": 13760 + }, + { + "epoch": 2.2675943270002676, + "grad_norm": 1.7351974248886108, + "learning_rate": 2.9692716728575576e-06, + "loss": 0.2482, + "step": 13770 + }, + { + "epoch": 2.2692410613202694, + "grad_norm": 1.67487633228302, + "learning_rate": 2.956641081315803e-06, + "loss": 0.2655, + "step": 13780 + }, + { + "epoch": 2.270887795640271, + "grad_norm": 1.7826485633850098, + "learning_rate": 2.944032748896842e-06, + "loss": 0.2706, + "step": 13790 + }, + { + "epoch": 2.2725345299602724, + "grad_norm": 1.6247366666793823, + "learning_rate": 2.9314467154467728e-06, + "loss": 0.2877, + "step": 13800 + }, + { + "epoch": 2.274181264280274, + "grad_norm": 1.7416200637817383, + "learning_rate": 2.9188830207412355e-06, + "loss": 0.2653, + "step": 13810 + }, + { + "epoch": 2.275827998600276, + "grad_norm": 1.554840087890625, + "learning_rate": 2.9063417044852627e-06, + "loss": 0.2528, + "step": 13820 + }, + { + "epoch": 2.2774747329202776, + "grad_norm": 1.978947639465332, + "learning_rate": 2.8938228063131655e-06, + "loss": 0.2884, + "step": 13830 + }, + { + "epoch": 2.2791214672402793, + "grad_norm": 2.049222946166992, + "learning_rate": 2.881326365788417e-06, + "loss": 0.2699, + "step": 13840 + }, + { + "epoch": 2.2807682015602806, + "grad_norm": 1.588185429573059, + "learning_rate": 2.8688524224035076e-06, + "loss": 0.2725, + "step": 13850 + }, + { + "epoch": 2.2824149358802823, + "grad_norm": 1.8063595294952393, + "learning_rate": 2.856401015579828e-06, + "loss": 0.2775, + "step": 13860 + }, + { + "epoch": 2.284061670200284, + "grad_norm": 1.70321786403656, + "learning_rate": 2.8439721846675595e-06, + "loss": 0.259, + "step": 13870 + }, + { + "epoch": 2.285708404520286, + "grad_norm": 1.4761788845062256, + "learning_rate": 2.83156596894552e-06, + "loss": 0.2497, + "step": 13880 + }, + { + "epoch": 2.2873551388402875, + "grad_norm": 2.2861974239349365, + "learning_rate": 2.819182407621074e-06, + "loss": 0.2603, + "step": 13890 + }, + { + "epoch": 2.289001873160289, + "grad_norm": 2.404402494430542, + "learning_rate": 2.806821539829978e-06, + "loss": 0.2708, + "step": 13900 + }, + { + "epoch": 2.2906486074802905, + "grad_norm": 1.8514249324798584, + "learning_rate": 2.7944834046362755e-06, + "loss": 0.259, + "step": 13910 + }, + { + "epoch": 2.2922953418002923, + "grad_norm": 2.039151191711426, + "learning_rate": 2.7821680410321638e-06, + "loss": 0.2385, + "step": 13920 + }, + { + "epoch": 2.293942076120294, + "grad_norm": 1.4477638006210327, + "learning_rate": 2.7698754879378853e-06, + "loss": 0.2805, + "step": 13930 + }, + { + "epoch": 2.2955888104402957, + "grad_norm": 1.8691905736923218, + "learning_rate": 2.7576057842015823e-06, + "loss": 0.2642, + "step": 13940 + }, + { + "epoch": 2.297235544760297, + "grad_norm": 2.3928122520446777, + "learning_rate": 2.7453589685991964e-06, + "loss": 0.2619, + "step": 13950 + }, + { + "epoch": 2.2988822790802987, + "grad_norm": 1.9695171117782593, + "learning_rate": 2.733135079834327e-06, + "loss": 0.2569, + "step": 13960 + }, + { + "epoch": 2.3005290134003005, + "grad_norm": 1.6952944993972778, + "learning_rate": 2.7209341565381275e-06, + "loss": 0.272, + "step": 13970 + }, + { + "epoch": 2.302175747720302, + "grad_norm": 2.042433500289917, + "learning_rate": 2.7087562372691644e-06, + "loss": 0.2553, + "step": 13980 + }, + { + "epoch": 2.303822482040304, + "grad_norm": 1.8709336519241333, + "learning_rate": 2.696601360513309e-06, + "loss": 0.2592, + "step": 13990 + }, + { + "epoch": 2.3054692163603057, + "grad_norm": 1.952884316444397, + "learning_rate": 2.684469564683608e-06, + "loss": 0.269, + "step": 14000 + }, + { + "epoch": 2.307115950680307, + "grad_norm": 1.885693907737732, + "learning_rate": 2.6723608881201737e-06, + "loss": 0.2986, + "step": 14010 + }, + { + "epoch": 2.3087626850003087, + "grad_norm": 1.9475207328796387, + "learning_rate": 2.660275369090043e-06, + "loss": 0.2396, + "step": 14020 + }, + { + "epoch": 2.3104094193203104, + "grad_norm": 1.693192958831787, + "learning_rate": 2.6482130457870813e-06, + "loss": 0.2611, + "step": 14030 + }, + { + "epoch": 2.312056153640312, + "grad_norm": 1.7588555812835693, + "learning_rate": 2.6361739563318334e-06, + "loss": 0.2704, + "step": 14040 + }, + { + "epoch": 2.313702887960314, + "grad_norm": 2.1660373210906982, + "learning_rate": 2.6241581387714333e-06, + "loss": 0.2869, + "step": 14050 + }, + { + "epoch": 2.315349622280315, + "grad_norm": 1.8832919597625732, + "learning_rate": 2.61216563107946e-06, + "loss": 0.2388, + "step": 14060 + }, + { + "epoch": 2.316996356600317, + "grad_norm": 1.77772855758667, + "learning_rate": 2.6001964711558245e-06, + "loss": 0.2586, + "step": 14070 + }, + { + "epoch": 2.3186430909203186, + "grad_norm": 1.9591187238693237, + "learning_rate": 2.5882506968266564e-06, + "loss": 0.2469, + "step": 14080 + }, + { + "epoch": 2.3202898252403203, + "grad_norm": 1.8513628244400024, + "learning_rate": 2.5763283458441823e-06, + "loss": 0.2728, + "step": 14090 + }, + { + "epoch": 2.321936559560322, + "grad_norm": 1.9876492023468018, + "learning_rate": 2.5644294558865955e-06, + "loss": 0.254, + "step": 14100 + }, + { + "epoch": 2.323583293880324, + "grad_norm": 1.812187910079956, + "learning_rate": 2.5525540645579573e-06, + "loss": 0.2641, + "step": 14110 + }, + { + "epoch": 2.325230028200325, + "grad_norm": 1.6044301986694336, + "learning_rate": 2.540702209388052e-06, + "loss": 0.2712, + "step": 14120 + }, + { + "epoch": 2.326876762520327, + "grad_norm": 1.71054208278656, + "learning_rate": 2.5288739278322992e-06, + "loss": 0.2619, + "step": 14130 + }, + { + "epoch": 2.3285234968403286, + "grad_norm": 1.707785964012146, + "learning_rate": 2.5170692572715983e-06, + "loss": 0.2493, + "step": 14140 + }, + { + "epoch": 2.3301702311603303, + "grad_norm": 1.6551952362060547, + "learning_rate": 2.505288235012251e-06, + "loss": 0.2635, + "step": 14150 + }, + { + "epoch": 2.331816965480332, + "grad_norm": 1.5544058084487915, + "learning_rate": 2.4935308982858097e-06, + "loss": 0.2601, + "step": 14160 + }, + { + "epoch": 2.3334636998003333, + "grad_norm": 2.0415706634521484, + "learning_rate": 2.4817972842489824e-06, + "loss": 0.2691, + "step": 14170 + }, + { + "epoch": 2.335110434120335, + "grad_norm": 1.9987118244171143, + "learning_rate": 2.4700874299834975e-06, + "loss": 0.257, + "step": 14180 + }, + { + "epoch": 2.3367571684403368, + "grad_norm": 2.0765695571899414, + "learning_rate": 2.4584013724960063e-06, + "loss": 0.2277, + "step": 14190 + }, + { + "epoch": 2.3384039027603385, + "grad_norm": 2.2434241771698, + "learning_rate": 2.4467391487179446e-06, + "loss": 0.245, + "step": 14200 + }, + { + "epoch": 2.3400506370803402, + "grad_norm": 1.8256983757019043, + "learning_rate": 2.4351007955054316e-06, + "loss": 0.2637, + "step": 14210 + }, + { + "epoch": 2.3416973714003415, + "grad_norm": 1.6313284635543823, + "learning_rate": 2.4234863496391458e-06, + "loss": 0.2765, + "step": 14220 + }, + { + "epoch": 2.3433441057203432, + "grad_norm": 1.5957192182540894, + "learning_rate": 2.411895847824218e-06, + "loss": 0.2675, + "step": 14230 + }, + { + "epoch": 2.344990840040345, + "grad_norm": 1.8532218933105469, + "learning_rate": 2.4003293266900985e-06, + "loss": 0.2673, + "step": 14240 + }, + { + "epoch": 2.3466375743603467, + "grad_norm": 2.489428997039795, + "learning_rate": 2.388786822790462e-06, + "loss": 0.2629, + "step": 14250 + }, + { + "epoch": 2.3482843086803484, + "grad_norm": 1.6500635147094727, + "learning_rate": 2.377268372603071e-06, + "loss": 0.2726, + "step": 14260 + }, + { + "epoch": 2.3499310430003497, + "grad_norm": 2.1244235038757324, + "learning_rate": 2.3657740125296845e-06, + "loss": 0.2656, + "step": 14270 + }, + { + "epoch": 2.3515777773203514, + "grad_norm": 1.7785730361938477, + "learning_rate": 2.354303778895911e-06, + "loss": 0.2606, + "step": 14280 + }, + { + "epoch": 2.353224511640353, + "grad_norm": 1.852630615234375, + "learning_rate": 2.3428577079511304e-06, + "loss": 0.2656, + "step": 14290 + }, + { + "epoch": 2.354871245960355, + "grad_norm": 1.7617170810699463, + "learning_rate": 2.331435835868349e-06, + "loss": 0.2685, + "step": 14300 + }, + { + "epoch": 2.3565179802803566, + "grad_norm": 1.348222017288208, + "learning_rate": 2.3200381987441067e-06, + "loss": 0.2524, + "step": 14310 + }, + { + "epoch": 2.358164714600358, + "grad_norm": 1.567896842956543, + "learning_rate": 2.308664832598343e-06, + "loss": 0.2574, + "step": 14320 + }, + { + "epoch": 2.3598114489203597, + "grad_norm": 2.113232374191284, + "learning_rate": 2.2973157733743055e-06, + "loss": 0.2589, + "step": 14330 + }, + { + "epoch": 2.3614581832403614, + "grad_norm": 2.2120754718780518, + "learning_rate": 2.285991056938418e-06, + "loss": 0.2562, + "step": 14340 + }, + { + "epoch": 2.363104917560363, + "grad_norm": 1.2926445007324219, + "learning_rate": 2.2746907190801724e-06, + "loss": 0.2383, + "step": 14350 + }, + { + "epoch": 2.364751651880365, + "grad_norm": 2.4468600749969482, + "learning_rate": 2.2634147955120176e-06, + "loss": 0.2621, + "step": 14360 + }, + { + "epoch": 2.3663983862003666, + "grad_norm": 1.7285964488983154, + "learning_rate": 2.252163321869254e-06, + "loss": 0.2619, + "step": 14370 + }, + { + "epoch": 2.368045120520368, + "grad_norm": 1.6784744262695312, + "learning_rate": 2.2409363337099e-06, + "loss": 0.2551, + "step": 14380 + }, + { + "epoch": 2.3696918548403696, + "grad_norm": 1.5659254789352417, + "learning_rate": 2.229733866514605e-06, + "loss": 0.2449, + "step": 14390 + }, + { + "epoch": 2.3713385891603713, + "grad_norm": 1.6228045225143433, + "learning_rate": 2.2185559556865145e-06, + "loss": 0.2598, + "step": 14400 + }, + { + "epoch": 2.372985323480373, + "grad_norm": 1.4725788831710815, + "learning_rate": 2.20740263655118e-06, + "loss": 0.2619, + "step": 14410 + }, + { + "epoch": 2.374632057800375, + "grad_norm": 1.6542408466339111, + "learning_rate": 2.1962739443564196e-06, + "loss": 0.2548, + "step": 14420 + }, + { + "epoch": 2.3762787921203765, + "grad_norm": 1.6933832168579102, + "learning_rate": 2.1851699142722395e-06, + "loss": 0.2413, + "step": 14430 + }, + { + "epoch": 2.377925526440378, + "grad_norm": 1.9051034450531006, + "learning_rate": 2.1740905813906945e-06, + "loss": 0.2533, + "step": 14440 + }, + { + "epoch": 2.3795722607603795, + "grad_norm": 1.8932108879089355, + "learning_rate": 2.1630359807257962e-06, + "loss": 0.2603, + "step": 14450 + }, + { + "epoch": 2.3812189950803813, + "grad_norm": 2.07220721244812, + "learning_rate": 2.1520061472133903e-06, + "loss": 0.2544, + "step": 14460 + }, + { + "epoch": 2.382865729400383, + "grad_norm": 1.947472333908081, + "learning_rate": 2.1410011157110556e-06, + "loss": 0.2462, + "step": 14470 + }, + { + "epoch": 2.3845124637203847, + "grad_norm": 2.6550047397613525, + "learning_rate": 2.130020920997985e-06, + "loss": 0.2725, + "step": 14480 + }, + { + "epoch": 2.386159198040386, + "grad_norm": 1.6511452198028564, + "learning_rate": 2.11906559777488e-06, + "loss": 0.26, + "step": 14490 + }, + { + "epoch": 2.3878059323603877, + "grad_norm": 1.7327511310577393, + "learning_rate": 2.1081351806638395e-06, + "loss": 0.2479, + "step": 14500 + }, + { + "epoch": 2.3892879932483893, + "grad_norm": 1.907187819480896, + "learning_rate": 2.09722970420826e-06, + "loss": 0.2281, + "step": 14510 + }, + { + "epoch": 2.390934727568391, + "grad_norm": 1.4497332572937012, + "learning_rate": 2.086349202872705e-06, + "loss": 0.2465, + "step": 14520 + }, + { + "epoch": 2.3925814618883927, + "grad_norm": 1.7492471933364868, + "learning_rate": 2.075493711042823e-06, + "loss": 0.2407, + "step": 14530 + }, + { + "epoch": 2.3942281962083944, + "grad_norm": 2.5765883922576904, + "learning_rate": 2.0646632630252104e-06, + "loss": 0.2505, + "step": 14540 + }, + { + "epoch": 2.3958749305283957, + "grad_norm": 1.5902353525161743, + "learning_rate": 2.053857893047334e-06, + "loss": 0.2381, + "step": 14550 + }, + { + "epoch": 2.3975216648483975, + "grad_norm": 2.103832721710205, + "learning_rate": 2.0430776352573924e-06, + "loss": 0.2603, + "step": 14560 + }, + { + "epoch": 2.399168399168399, + "grad_norm": 2.955065965652466, + "learning_rate": 2.0323225237242285e-06, + "loss": 0.2414, + "step": 14570 + }, + { + "epoch": 2.400815133488401, + "grad_norm": 1.6870169639587402, + "learning_rate": 2.0215925924372126e-06, + "loss": 0.2196, + "step": 14580 + }, + { + "epoch": 2.4024618678084027, + "grad_norm": 1.891734004020691, + "learning_rate": 2.0108878753061434e-06, + "loss": 0.2278, + "step": 14590 + }, + { + "epoch": 2.404108602128404, + "grad_norm": 2.0214812755584717, + "learning_rate": 2.0002084061611282e-06, + "loss": 0.2463, + "step": 14600 + }, + { + "epoch": 2.4057553364484057, + "grad_norm": 1.8380804061889648, + "learning_rate": 1.9895542187524906e-06, + "loss": 0.2256, + "step": 14610 + }, + { + "epoch": 2.4074020707684074, + "grad_norm": 1.8675373792648315, + "learning_rate": 1.9789253467506474e-06, + "loss": 0.2442, + "step": 14620 + }, + { + "epoch": 2.409048805088409, + "grad_norm": 1.725472092628479, + "learning_rate": 1.9683218237460233e-06, + "loss": 0.2396, + "step": 14630 + }, + { + "epoch": 2.410695539408411, + "grad_norm": 1.398227334022522, + "learning_rate": 1.9577436832489206e-06, + "loss": 0.2201, + "step": 14640 + }, + { + "epoch": 2.412342273728412, + "grad_norm": 2.566589593887329, + "learning_rate": 1.947190958689428e-06, + "loss": 0.2475, + "step": 14650 + }, + { + "epoch": 2.413989008048414, + "grad_norm": 1.501044511795044, + "learning_rate": 1.9366636834173193e-06, + "loss": 0.2488, + "step": 14660 + }, + { + "epoch": 2.4156357423684156, + "grad_norm": 1.5589001178741455, + "learning_rate": 1.926161890701934e-06, + "loss": 0.2375, + "step": 14670 + }, + { + "epoch": 2.4172824766884173, + "grad_norm": 1.3738040924072266, + "learning_rate": 1.915685613732079e-06, + "loss": 0.2353, + "step": 14680 + }, + { + "epoch": 2.418929211008419, + "grad_norm": 1.7642990350723267, + "learning_rate": 1.9052348856159298e-06, + "loss": 0.2333, + "step": 14690 + }, + { + "epoch": 2.4205759453284204, + "grad_norm": 1.651908040046692, + "learning_rate": 1.8948097393809127e-06, + "loss": 0.2347, + "step": 14700 + }, + { + "epoch": 2.422222679648422, + "grad_norm": 1.8113752603530884, + "learning_rate": 1.8844102079736114e-06, + "loss": 0.2444, + "step": 14710 + }, + { + "epoch": 2.423869413968424, + "grad_norm": 1.66728675365448, + "learning_rate": 1.8740363242596605e-06, + "loss": 0.222, + "step": 14720 + }, + { + "epoch": 2.4255161482884255, + "grad_norm": 1.575468897819519, + "learning_rate": 1.8636881210236346e-06, + "loss": 0.2395, + "step": 14730 + }, + { + "epoch": 2.4271628826084273, + "grad_norm": 1.5622098445892334, + "learning_rate": 1.8533656309689584e-06, + "loss": 0.2589, + "step": 14740 + }, + { + "epoch": 2.4288096169284286, + "grad_norm": 1.7539955377578735, + "learning_rate": 1.8430688867177882e-06, + "loss": 0.2442, + "step": 14750 + }, + { + "epoch": 2.4304563512484303, + "grad_norm": 1.620202898979187, + "learning_rate": 1.832797920810917e-06, + "loss": 0.2434, + "step": 14760 + }, + { + "epoch": 2.432103085568432, + "grad_norm": 1.9027595520019531, + "learning_rate": 1.822552765707676e-06, + "loss": 0.2541, + "step": 14770 + }, + { + "epoch": 2.4337498198884338, + "grad_norm": 1.3224104642868042, + "learning_rate": 1.8123334537858195e-06, + "loss": 0.2264, + "step": 14780 + }, + { + "epoch": 2.4353965542084355, + "grad_norm": 2.0354669094085693, + "learning_rate": 1.8021400173414306e-06, + "loss": 0.237, + "step": 14790 + }, + { + "epoch": 2.437043288528437, + "grad_norm": 1.2932946681976318, + "learning_rate": 1.7919724885888256e-06, + "loss": 0.2355, + "step": 14800 + }, + { + "epoch": 2.4386900228484385, + "grad_norm": 1.3264442682266235, + "learning_rate": 1.781830899660434e-06, + "loss": 0.2237, + "step": 14810 + }, + { + "epoch": 2.4403367571684402, + "grad_norm": 1.640587329864502, + "learning_rate": 1.7717152826067175e-06, + "loss": 0.2311, + "step": 14820 + }, + { + "epoch": 2.441983491488442, + "grad_norm": 1.6640249490737915, + "learning_rate": 1.7616256693960532e-06, + "loss": 0.2193, + "step": 14830 + }, + { + "epoch": 2.4436302258084437, + "grad_norm": 1.4918125867843628, + "learning_rate": 1.751562091914637e-06, + "loss": 0.2356, + "step": 14840 + }, + { + "epoch": 2.4452769601284454, + "grad_norm": 1.52755606174469, + "learning_rate": 1.7415245819663861e-06, + "loss": 0.2266, + "step": 14850 + }, + { + "epoch": 2.446923694448447, + "grad_norm": 2.0704216957092285, + "learning_rate": 1.7315131712728417e-06, + "loss": 0.2535, + "step": 14860 + }, + { + "epoch": 2.4485704287684484, + "grad_norm": 1.83346426486969, + "learning_rate": 1.7215278914730527e-06, + "loss": 0.2363, + "step": 14870 + }, + { + "epoch": 2.45021716308845, + "grad_norm": 1.4527473449707031, + "learning_rate": 1.7115687741234987e-06, + "loss": 0.2296, + "step": 14880 + }, + { + "epoch": 2.451863897408452, + "grad_norm": 1.9535012245178223, + "learning_rate": 1.7016358506979657e-06, + "loss": 0.2336, + "step": 14890 + }, + { + "epoch": 2.4535106317284536, + "grad_norm": 2.087193250656128, + "learning_rate": 1.6917291525874723e-06, + "loss": 0.2527, + "step": 14900 + }, + { + "epoch": 2.4551573660484554, + "grad_norm": 1.9018986225128174, + "learning_rate": 1.681848711100146e-06, + "loss": 0.2463, + "step": 14910 + }, + { + "epoch": 2.4568041003684566, + "grad_norm": 1.6955307722091675, + "learning_rate": 1.6719945574611418e-06, + "loss": 0.2348, + "step": 14920 + }, + { + "epoch": 2.4584508346884584, + "grad_norm": 1.538035273551941, + "learning_rate": 1.6621667228125305e-06, + "loss": 0.2511, + "step": 14930 + }, + { + "epoch": 2.46009756900846, + "grad_norm": 1.6792057752609253, + "learning_rate": 1.6523652382132183e-06, + "loss": 0.2592, + "step": 14940 + }, + { + "epoch": 2.461744303328462, + "grad_norm": 1.4391840696334839, + "learning_rate": 1.6425901346388263e-06, + "loss": 0.2233, + "step": 14950 + }, + { + "epoch": 2.4633910376484636, + "grad_norm": 1.333770990371704, + "learning_rate": 1.6328414429816109e-06, + "loss": 0.2303, + "step": 14960 + }, + { + "epoch": 2.465037771968465, + "grad_norm": 2.2043004035949707, + "learning_rate": 1.6231191940503543e-06, + "loss": 0.2294, + "step": 14970 + }, + { + "epoch": 2.4666845062884666, + "grad_norm": 1.5073317289352417, + "learning_rate": 1.613423418570279e-06, + "loss": 0.2368, + "step": 14980 + }, + { + "epoch": 2.4683312406084683, + "grad_norm": 1.6827707290649414, + "learning_rate": 1.6037541471829288e-06, + "loss": 0.238, + "step": 14990 + }, + { + "epoch": 2.46997797492847, + "grad_norm": 1.4171675443649292, + "learning_rate": 1.594111410446104e-06, + "loss": 0.2162, + "step": 15000 + }, + { + "epoch": 2.4716247092484718, + "grad_norm": 1.938513994216919, + "learning_rate": 1.5844952388337332e-06, + "loss": 0.269, + "step": 15010 + }, + { + "epoch": 2.473271443568473, + "grad_norm": 1.7886428833007812, + "learning_rate": 1.574905662735805e-06, + "loss": 0.2393, + "step": 15020 + }, + { + "epoch": 2.474918177888475, + "grad_norm": 1.4953855276107788, + "learning_rate": 1.5653427124582431e-06, + "loss": 0.2265, + "step": 15030 + }, + { + "epoch": 2.4765649122084765, + "grad_norm": 1.8918662071228027, + "learning_rate": 1.5558064182228393e-06, + "loss": 0.2295, + "step": 15040 + }, + { + "epoch": 2.4782116465284783, + "grad_norm": 1.7208409309387207, + "learning_rate": 1.5462968101671361e-06, + "loss": 0.2429, + "step": 15050 + }, + { + "epoch": 2.47985838084848, + "grad_norm": 1.5308380126953125, + "learning_rate": 1.5368139183443421e-06, + "loss": 0.2396, + "step": 15060 + }, + { + "epoch": 2.4815051151684813, + "grad_norm": 1.626708984375, + "learning_rate": 1.5273577727232314e-06, + "loss": 0.2398, + "step": 15070 + }, + { + "epoch": 2.483151849488483, + "grad_norm": 1.9254333972930908, + "learning_rate": 1.5179284031880603e-06, + "loss": 0.2288, + "step": 15080 + }, + { + "epoch": 2.4847985838084847, + "grad_norm": 2.0169408321380615, + "learning_rate": 1.5085258395384538e-06, + "loss": 0.2416, + "step": 15090 + }, + { + "epoch": 2.4864453181284865, + "grad_norm": 1.554512858390808, + "learning_rate": 1.4991501114893336e-06, + "loss": 0.2522, + "step": 15100 + }, + { + "epoch": 2.488092052448488, + "grad_norm": 1.447223424911499, + "learning_rate": 1.4898012486708024e-06, + "loss": 0.2248, + "step": 15110 + }, + { + "epoch": 2.48973878676849, + "grad_norm": 1.233121395111084, + "learning_rate": 1.48047928062807e-06, + "loss": 0.2216, + "step": 15120 + }, + { + "epoch": 2.491385521088491, + "grad_norm": 1.4239838123321533, + "learning_rate": 1.4711842368213437e-06, + "loss": 0.2151, + "step": 15130 + }, + { + "epoch": 2.493032255408493, + "grad_norm": 1.5615428686141968, + "learning_rate": 1.4619161466257459e-06, + "loss": 0.2303, + "step": 15140 + }, + { + "epoch": 2.4946789897284947, + "grad_norm": 1.5079021453857422, + "learning_rate": 1.4526750393312118e-06, + "loss": 0.2295, + "step": 15150 + }, + { + "epoch": 2.4963257240484964, + "grad_norm": 1.7538816928863525, + "learning_rate": 1.443460944142413e-06, + "loss": 0.2289, + "step": 15160 + }, + { + "epoch": 2.497972458368498, + "grad_norm": 1.7975422143936157, + "learning_rate": 1.4342738901786434e-06, + "loss": 0.234, + "step": 15170 + }, + { + "epoch": 2.4996191926885, + "grad_norm": 1.8438105583190918, + "learning_rate": 1.4251139064737485e-06, + "loss": 0.239, + "step": 15180 + }, + { + "epoch": 2.501265927008501, + "grad_norm": 4.315160274505615, + "learning_rate": 1.4159810219760161e-06, + "loss": 0.2261, + "step": 15190 + }, + { + "epoch": 2.502912661328503, + "grad_norm": 1.4963452816009521, + "learning_rate": 1.406875265548101e-06, + "loss": 0.2228, + "step": 15200 + }, + { + "epoch": 2.5045593956485046, + "grad_norm": 1.717155933380127, + "learning_rate": 1.3977966659669096e-06, + "loss": 0.2385, + "step": 15210 + }, + { + "epoch": 2.5062061299685063, + "grad_norm": 1.8200072050094604, + "learning_rate": 1.3887452519235434e-06, + "loss": 0.2179, + "step": 15220 + }, + { + "epoch": 2.507852864288508, + "grad_norm": 1.7720727920532227, + "learning_rate": 1.379721052023174e-06, + "loss": 0.2252, + "step": 15230 + }, + { + "epoch": 2.5094995986085094, + "grad_norm": 1.5537362098693848, + "learning_rate": 1.3707240947849797e-06, + "loss": 0.2273, + "step": 15240 + }, + { + "epoch": 2.511146332928511, + "grad_norm": 1.4824028015136719, + "learning_rate": 1.3617544086420353e-06, + "loss": 0.241, + "step": 15250 + }, + { + "epoch": 2.512793067248513, + "grad_norm": 1.6623420715332031, + "learning_rate": 1.3528120219412377e-06, + "loss": 0.2494, + "step": 15260 + }, + { + "epoch": 2.5144398015685145, + "grad_norm": 1.8645778894424438, + "learning_rate": 1.3438969629432042e-06, + "loss": 0.2561, + "step": 15270 + }, + { + "epoch": 2.5160865358885163, + "grad_norm": 2.436868190765381, + "learning_rate": 1.335009259822191e-06, + "loss": 0.2311, + "step": 15280 + }, + { + "epoch": 2.5177332702085176, + "grad_norm": 1.7088630199432373, + "learning_rate": 1.3261489406659978e-06, + "loss": 0.2494, + "step": 15290 + }, + { + "epoch": 2.5193800045285193, + "grad_norm": 1.6420857906341553, + "learning_rate": 1.3173160334758895e-06, + "loss": 0.2465, + "step": 15300 + }, + { + "epoch": 2.521026738848521, + "grad_norm": 1.4090502262115479, + "learning_rate": 1.3085105661664933e-06, + "loss": 0.2263, + "step": 15310 + }, + { + "epoch": 2.5226734731685228, + "grad_norm": 1.924774169921875, + "learning_rate": 1.2997325665657257e-06, + "loss": 0.2348, + "step": 15320 + }, + { + "epoch": 2.5243202074885245, + "grad_norm": 1.5904576778411865, + "learning_rate": 1.2909820624146908e-06, + "loss": 0.2272, + "step": 15330 + }, + { + "epoch": 2.5259669418085258, + "grad_norm": 1.8772406578063965, + "learning_rate": 1.282259081367606e-06, + "loss": 0.2255, + "step": 15340 + }, + { + "epoch": 2.5276136761285275, + "grad_norm": 1.8594352006912231, + "learning_rate": 1.273563650991696e-06, + "loss": 0.2329, + "step": 15350 + }, + { + "epoch": 2.5292604104485292, + "grad_norm": 1.4280927181243896, + "learning_rate": 1.2648957987671295e-06, + "loss": 0.234, + "step": 15360 + }, + { + "epoch": 2.530907144768531, + "grad_norm": 1.7298115491867065, + "learning_rate": 1.256255552086909e-06, + "loss": 0.2416, + "step": 15370 + }, + { + "epoch": 2.5325538790885327, + "grad_norm": 1.3822236061096191, + "learning_rate": 1.2476429382568067e-06, + "loss": 0.2454, + "step": 15380 + }, + { + "epoch": 2.534200613408534, + "grad_norm": 1.9294795989990234, + "learning_rate": 1.2390579844952565e-06, + "loss": 0.2421, + "step": 15390 + }, + { + "epoch": 2.5358473477285357, + "grad_norm": 1.349674940109253, + "learning_rate": 1.2305007179332851e-06, + "loss": 0.2348, + "step": 15400 + }, + { + "epoch": 2.5374940820485374, + "grad_norm": 1.3522496223449707, + "learning_rate": 1.2219711656144161e-06, + "loss": 0.2404, + "step": 15410 + }, + { + "epoch": 2.539140816368539, + "grad_norm": 1.3507784605026245, + "learning_rate": 1.2134693544945875e-06, + "loss": 0.246, + "step": 15420 + }, + { + "epoch": 2.540787550688541, + "grad_norm": 1.6919246912002563, + "learning_rate": 1.2049953114420654e-06, + "loss": 0.2455, + "step": 15430 + }, + { + "epoch": 2.542434285008542, + "grad_norm": 1.62079918384552, + "learning_rate": 1.1965490632373677e-06, + "loss": 0.2317, + "step": 15440 + }, + { + "epoch": 2.5440810193285444, + "grad_norm": 1.7440752983093262, + "learning_rate": 1.1881306365731638e-06, + "loss": 0.2322, + "step": 15450 + }, + { + "epoch": 2.5457277536485456, + "grad_norm": 1.5291510820388794, + "learning_rate": 1.179740058054204e-06, + "loss": 0.2287, + "step": 15460 + }, + { + "epoch": 2.5473744879685474, + "grad_norm": 1.448723316192627, + "learning_rate": 1.1713773541972263e-06, + "loss": 0.2278, + "step": 15470 + }, + { + "epoch": 2.549021222288549, + "grad_norm": 1.5722907781600952, + "learning_rate": 1.1630425514308819e-06, + "loss": 0.2366, + "step": 15480 + }, + { + "epoch": 2.5506679566085504, + "grad_norm": 1.579613447189331, + "learning_rate": 1.1547356760956397e-06, + "loss": 0.2361, + "step": 15490 + }, + { + "epoch": 2.5523146909285526, + "grad_norm": 1.437633991241455, + "learning_rate": 1.1464567544437144e-06, + "loss": 0.2245, + "step": 15500 + }, + { + "epoch": 2.553961425248554, + "grad_norm": 1.626078724861145, + "learning_rate": 1.138205812638975e-06, + "loss": 0.2505, + "step": 15510 + }, + { + "epoch": 2.5556081595685556, + "grad_norm": 1.7124260663986206, + "learning_rate": 1.12998287675687e-06, + "loss": 0.2494, + "step": 15520 + }, + { + "epoch": 2.5572548938885573, + "grad_norm": 1.749879002571106, + "learning_rate": 1.1217879727843351e-06, + "loss": 0.2194, + "step": 15530 + }, + { + "epoch": 2.558901628208559, + "grad_norm": 2.109128713607788, + "learning_rate": 1.113621126619725e-06, + "loss": 0.2321, + "step": 15540 + }, + { + "epoch": 2.5605483625285608, + "grad_norm": 1.6905772686004639, + "learning_rate": 1.1054823640727163e-06, + "loss": 0.2457, + "step": 15550 + }, + { + "epoch": 2.562195096848562, + "grad_norm": 1.2987953424453735, + "learning_rate": 1.0973717108642323e-06, + "loss": 0.2325, + "step": 15560 + }, + { + "epoch": 2.563841831168564, + "grad_norm": 1.5485819578170776, + "learning_rate": 1.0892891926263703e-06, + "loss": 0.2462, + "step": 15570 + }, + { + "epoch": 2.5654885654885655, + "grad_norm": 1.8916813135147095, + "learning_rate": 1.0812348349023038e-06, + "loss": 0.2335, + "step": 15580 + }, + { + "epoch": 2.5671352998085673, + "grad_norm": 1.2640624046325684, + "learning_rate": 1.073208663146218e-06, + "loss": 0.2301, + "step": 15590 + }, + { + "epoch": 2.568782034128569, + "grad_norm": 1.434807300567627, + "learning_rate": 1.065210702723215e-06, + "loss": 0.2167, + "step": 15600 + }, + { + "epoch": 2.5704287684485703, + "grad_norm": 1.6343811750411987, + "learning_rate": 1.0572409789092452e-06, + "loss": 0.2227, + "step": 15610 + }, + { + "epoch": 2.572075502768572, + "grad_norm": 1.6789041757583618, + "learning_rate": 1.0492995168910225e-06, + "loss": 0.2451, + "step": 15620 + }, + { + "epoch": 2.5737222370885737, + "grad_norm": 1.42600417137146, + "learning_rate": 1.0413863417659454e-06, + "loss": 0.23, + "step": 15630 + }, + { + "epoch": 2.5753689714085755, + "grad_norm": 1.9068646430969238, + "learning_rate": 1.0335014785420128e-06, + "loss": 0.2413, + "step": 15640 + }, + { + "epoch": 2.577015705728577, + "grad_norm": 1.5069235563278198, + "learning_rate": 1.0256449521377565e-06, + "loss": 0.2581, + "step": 15650 + }, + { + "epoch": 2.5786624400485785, + "grad_norm": 1.8914705514907837, + "learning_rate": 1.0178167873821487e-06, + "loss": 0.2205, + "step": 15660 + }, + { + "epoch": 2.58030917436858, + "grad_norm": 1.843311071395874, + "learning_rate": 1.0100170090145379e-06, + "loss": 0.2357, + "step": 15670 + }, + { + "epoch": 2.581955908688582, + "grad_norm": 1.3863564729690552, + "learning_rate": 1.0022456416845561e-06, + "loss": 0.222, + "step": 15680 + }, + { + "epoch": 2.5836026430085837, + "grad_norm": 1.5262445211410522, + "learning_rate": 9.945027099520489e-07, + "loss": 0.2182, + "step": 15690 + }, + { + "epoch": 2.5852493773285854, + "grad_norm": 1.4200090169906616, + "learning_rate": 9.86788238287003e-07, + "loss": 0.2287, + "step": 15700 + }, + { + "epoch": 2.5868961116485867, + "grad_norm": 1.338196039199829, + "learning_rate": 9.79102251069456e-07, + "loss": 0.2213, + "step": 15710 + }, + { + "epoch": 2.5885428459685884, + "grad_norm": 1.5565379858016968, + "learning_rate": 9.71444772589426e-07, + "loss": 0.2243, + "step": 15720 + }, + { + "epoch": 2.59018958028859, + "grad_norm": 1.8993768692016602, + "learning_rate": 9.638158270468423e-07, + "loss": 0.2194, + "step": 15730 + }, + { + "epoch": 2.591836314608592, + "grad_norm": 1.739005446434021, + "learning_rate": 9.56215438551452e-07, + "loss": 0.2241, + "step": 15740 + }, + { + "epoch": 2.5934830489285936, + "grad_norm": 1.7477450370788574, + "learning_rate": 9.486436311227631e-07, + "loss": 0.2443, + "step": 15750 + }, + { + "epoch": 2.595129783248595, + "grad_norm": 1.8742027282714844, + "learning_rate": 9.411004286899495e-07, + "loss": 0.2386, + "step": 15760 + }, + { + "epoch": 2.5967765175685966, + "grad_norm": 1.9991704225540161, + "learning_rate": 9.335858550917942e-07, + "loss": 0.2306, + "step": 15770 + }, + { + "epoch": 2.5984232518885984, + "grad_norm": 1.8734098672866821, + "learning_rate": 9.26099934076593e-07, + "loss": 0.2307, + "step": 15780 + }, + { + "epoch": 2.6000699862086, + "grad_norm": 1.9999672174453735, + "learning_rate": 9.186426893021016e-07, + "loss": 0.2242, + "step": 15790 + }, + { + "epoch": 2.601716720528602, + "grad_norm": 1.490793228149414, + "learning_rate": 9.112141443354439e-07, + "loss": 0.2493, + "step": 15800 + }, + { + "epoch": 2.603363454848603, + "grad_norm": 1.619504690170288, + "learning_rate": 9.038143226530482e-07, + "loss": 0.2458, + "step": 15810 + }, + { + "epoch": 2.6050101891686053, + "grad_norm": 1.3605459928512573, + "learning_rate": 8.964432476405638e-07, + "loss": 0.2279, + "step": 15820 + }, + { + "epoch": 2.6066569234886066, + "grad_norm": 1.5391523838043213, + "learning_rate": 8.891009425927977e-07, + "loss": 0.2332, + "step": 15830 + }, + { + "epoch": 2.6083036578086083, + "grad_norm": 1.6004542112350464, + "learning_rate": 8.817874307136298e-07, + "loss": 0.2404, + "step": 15840 + }, + { + "epoch": 2.60995039212861, + "grad_norm": 1.6447652578353882, + "learning_rate": 8.745027351159486e-07, + "loss": 0.2363, + "step": 15850 + }, + { + "epoch": 2.6115971264486117, + "grad_norm": 2.2156291007995605, + "learning_rate": 8.672468788215682e-07, + "loss": 0.2413, + "step": 15860 + }, + { + "epoch": 2.6132438607686135, + "grad_norm": 1.7267791032791138, + "learning_rate": 8.60019884761173e-07, + "loss": 0.2444, + "step": 15870 + }, + { + "epoch": 2.6148905950886148, + "grad_norm": 1.5263491868972778, + "learning_rate": 8.52821775774223e-07, + "loss": 0.227, + "step": 15880 + }, + { + "epoch": 2.6165373294086165, + "grad_norm": 1.5678350925445557, + "learning_rate": 8.456525746089017e-07, + "loss": 0.2185, + "step": 15890 + }, + { + "epoch": 2.6181840637286182, + "grad_norm": 2.008378744125366, + "learning_rate": 8.385123039220277e-07, + "loss": 0.237, + "step": 15900 + }, + { + "epoch": 2.61983079804862, + "grad_norm": 2.221608877182007, + "learning_rate": 8.314009862789984e-07, + "loss": 0.2433, + "step": 15910 + }, + { + "epoch": 2.6214775323686217, + "grad_norm": 2.2678582668304443, + "learning_rate": 8.243186441536999e-07, + "loss": 0.2457, + "step": 15920 + }, + { + "epoch": 2.623124266688623, + "grad_norm": 1.7668777704238892, + "learning_rate": 8.172652999284592e-07, + "loss": 0.2409, + "step": 15930 + }, + { + "epoch": 2.6247710010086247, + "grad_norm": 1.7608022689819336, + "learning_rate": 8.102409758939522e-07, + "loss": 0.243, + "step": 15940 + }, + { + "epoch": 2.6264177353286264, + "grad_norm": 2.060465097427368, + "learning_rate": 8.032456942491484e-07, + "loss": 0.2452, + "step": 15950 + }, + { + "epoch": 2.628064469648628, + "grad_norm": 1.3911464214324951, + "learning_rate": 7.962794771012284e-07, + "loss": 0.2331, + "step": 15960 + }, + { + "epoch": 2.62971120396863, + "grad_norm": 1.3499740362167358, + "learning_rate": 7.893423464655292e-07, + "loss": 0.2287, + "step": 15970 + }, + { + "epoch": 2.631357938288631, + "grad_norm": 1.5909963846206665, + "learning_rate": 7.824343242654564e-07, + "loss": 0.2338, + "step": 15980 + }, + { + "epoch": 2.633004672608633, + "grad_norm": 1.7845244407653809, + "learning_rate": 7.755554323324299e-07, + "loss": 0.233, + "step": 15990 + }, + { + "epoch": 2.6346514069286346, + "grad_norm": 1.5275846719741821, + "learning_rate": 7.687056924058056e-07, + "loss": 0.2209, + "step": 16000 + }, + { + "epoch": 2.6362981412486364, + "grad_norm": 1.519129753112793, + "learning_rate": 7.618851261328153e-07, + "loss": 0.227, + "step": 16010 + }, + { + "epoch": 2.637944875568638, + "grad_norm": 1.5210860967636108, + "learning_rate": 7.550937550684867e-07, + "loss": 0.2288, + "step": 16020 + }, + { + "epoch": 2.6395916098886394, + "grad_norm": 2.0805931091308594, + "learning_rate": 7.483316006755892e-07, + "loss": 0.234, + "step": 16030 + }, + { + "epoch": 2.641238344208641, + "grad_norm": 1.706473708152771, + "learning_rate": 7.415986843245515e-07, + "loss": 0.2407, + "step": 16040 + }, + { + "epoch": 2.642885078528643, + "grad_norm": 1.63625168800354, + "learning_rate": 7.348950272934107e-07, + "loss": 0.2213, + "step": 16050 + }, + { + "epoch": 2.6445318128486446, + "grad_norm": 1.5497370958328247, + "learning_rate": 7.282206507677225e-07, + "loss": 0.2214, + "step": 16060 + }, + { + "epoch": 2.6461785471686463, + "grad_norm": 1.570217490196228, + "learning_rate": 7.215755758405208e-07, + "loss": 0.2112, + "step": 16070 + }, + { + "epoch": 2.6478252814886476, + "grad_norm": 1.8267185688018799, + "learning_rate": 7.149598235122279e-07, + "loss": 0.224, + "step": 16080 + }, + { + "epoch": 2.6494720158086493, + "grad_norm": 1.6849004030227661, + "learning_rate": 7.08373414690604e-07, + "loss": 0.2273, + "step": 16090 + }, + { + "epoch": 2.651118750128651, + "grad_norm": 1.8440622091293335, + "learning_rate": 7.01816370190671e-07, + "loss": 0.2058, + "step": 16100 + }, + { + "epoch": 2.652765484448653, + "grad_norm": 1.6094481945037842, + "learning_rate": 6.952887107346551e-07, + "loss": 0.2358, + "step": 16110 + }, + { + "epoch": 2.6544122187686545, + "grad_norm": 1.7363183498382568, + "learning_rate": 6.887904569519133e-07, + "loss": 0.2254, + "step": 16120 + }, + { + "epoch": 2.656058953088656, + "grad_norm": 1.7017285823822021, + "learning_rate": 6.823216293788715e-07, + "loss": 0.2234, + "step": 16130 + }, + { + "epoch": 2.657705687408658, + "grad_norm": 1.396016240119934, + "learning_rate": 6.758822484589622e-07, + "loss": 0.232, + "step": 16140 + }, + { + "epoch": 2.6593524217286593, + "grad_norm": 1.758091688156128, + "learning_rate": 6.69472334542558e-07, + "loss": 0.2362, + "step": 16150 + }, + { + "epoch": 2.660999156048661, + "grad_norm": 1.6662510633468628, + "learning_rate": 6.630919078869036e-07, + "loss": 0.2182, + "step": 16160 + }, + { + "epoch": 2.6626458903686627, + "grad_norm": 2.1852259635925293, + "learning_rate": 6.567409886560605e-07, + "loss": 0.2109, + "step": 16170 + }, + { + "epoch": 2.6642926246886645, + "grad_norm": 1.657429575920105, + "learning_rate": 6.504195969208315e-07, + "loss": 0.2273, + "step": 16180 + }, + { + "epoch": 2.665939359008666, + "grad_norm": 2.1529386043548584, + "learning_rate": 6.44127752658712e-07, + "loss": 0.2448, + "step": 16190 + }, + { + "epoch": 2.6675860933286675, + "grad_norm": 1.56782066822052, + "learning_rate": 6.378654757538072e-07, + "loss": 0.2201, + "step": 16200 + }, + { + "epoch": 2.669232827648669, + "grad_norm": 2.715771198272705, + "learning_rate": 6.316327859967907e-07, + "loss": 0.2286, + "step": 16210 + }, + { + "epoch": 2.670879561968671, + "grad_norm": 1.7289623022079468, + "learning_rate": 6.254297030848255e-07, + "loss": 0.2243, + "step": 16220 + }, + { + "epoch": 2.6725262962886727, + "grad_norm": 1.6193941831588745, + "learning_rate": 6.192562466215135e-07, + "loss": 0.2266, + "step": 16230 + }, + { + "epoch": 2.6741730306086744, + "grad_norm": 1.712371826171875, + "learning_rate": 6.131124361168228e-07, + "loss": 0.221, + "step": 16240 + }, + { + "epoch": 2.6758197649286757, + "grad_norm": 1.8381526470184326, + "learning_rate": 6.069982909870376e-07, + "loss": 0.2216, + "step": 16250 + }, + { + "epoch": 2.6774664992486774, + "grad_norm": 2.2945876121520996, + "learning_rate": 6.009138305546813e-07, + "loss": 0.2157, + "step": 16260 + }, + { + "epoch": 2.679113233568679, + "grad_norm": 1.9753563404083252, + "learning_rate": 5.948590740484783e-07, + "loss": 0.2298, + "step": 16270 + }, + { + "epoch": 2.680759967888681, + "grad_norm": 1.4538737535476685, + "learning_rate": 5.888340406032633e-07, + "loss": 0.2151, + "step": 16280 + }, + { + "epoch": 2.6824067022086826, + "grad_norm": 1.7806299924850464, + "learning_rate": 5.828387492599507e-07, + "loss": 0.244, + "step": 16290 + }, + { + "epoch": 2.684053436528684, + "grad_norm": 1.6264704465866089, + "learning_rate": 5.768732189654535e-07, + "loss": 0.2418, + "step": 16300 + }, + { + "epoch": 2.6857001708486856, + "grad_norm": 1.9107524156570435, + "learning_rate": 5.709374685726365e-07, + "loss": 0.2453, + "step": 16310 + }, + { + "epoch": 2.6873469051686873, + "grad_norm": 2.1297032833099365, + "learning_rate": 5.650315168402443e-07, + "loss": 0.2351, + "step": 16320 + }, + { + "epoch": 2.688993639488689, + "grad_norm": 5.344056606292725, + "learning_rate": 5.591553824328555e-07, + "loss": 0.2455, + "step": 16330 + }, + { + "epoch": 2.690640373808691, + "grad_norm": 1.7449675798416138, + "learning_rate": 5.533090839208133e-07, + "loss": 0.2377, + "step": 16340 + }, + { + "epoch": 2.692287108128692, + "grad_norm": 2.327143907546997, + "learning_rate": 5.474926397801705e-07, + "loss": 0.2251, + "step": 16350 + }, + { + "epoch": 2.693933842448694, + "grad_norm": 1.8635077476501465, + "learning_rate": 5.417060683926301e-07, + "loss": 0.2174, + "step": 16360 + }, + { + "epoch": 2.6955805767686956, + "grad_norm": 1.3951900005340576, + "learning_rate": 5.359493880454935e-07, + "loss": 0.2114, + "step": 16370 + }, + { + "epoch": 2.6972273110886973, + "grad_norm": 2.1820077896118164, + "learning_rate": 5.302226169315927e-07, + "loss": 0.2318, + "step": 16380 + }, + { + "epoch": 2.698874045408699, + "grad_norm": 1.5952907800674438, + "learning_rate": 5.245257731492381e-07, + "loss": 0.2344, + "step": 16390 + }, + { + "epoch": 2.7005207797287003, + "grad_norm": 1.400417447090149, + "learning_rate": 5.188588747021628e-07, + "loss": 0.2343, + "step": 16400 + }, + { + "epoch": 2.702167514048702, + "grad_norm": 1.4244686365127563, + "learning_rate": 5.13221939499462e-07, + "loss": 0.2151, + "step": 16410 + }, + { + "epoch": 2.7038142483687038, + "grad_norm": 2.0728249549865723, + "learning_rate": 5.076149853555379e-07, + "loss": 0.2045, + "step": 16420 + }, + { + "epoch": 2.7054609826887055, + "grad_norm": 1.8236521482467651, + "learning_rate": 5.02038029990044e-07, + "loss": 0.2209, + "step": 16430 + }, + { + "epoch": 2.7071077170087072, + "grad_norm": 2.1654088497161865, + "learning_rate": 4.964910910278298e-07, + "loss": 0.2313, + "step": 16440 + }, + { + "epoch": 2.7087544513287085, + "grad_norm": 1.472296953201294, + "learning_rate": 4.909741859988837e-07, + "loss": 0.2372, + "step": 16450 + }, + { + "epoch": 2.7104011856487102, + "grad_norm": 1.6187572479248047, + "learning_rate": 4.854873323382747e-07, + "loss": 0.2295, + "step": 16460 + }, + { + "epoch": 2.712047919968712, + "grad_norm": 1.675091028213501, + "learning_rate": 4.800305473861056e-07, + "loss": 0.2211, + "step": 16470 + }, + { + "epoch": 2.7136946542887137, + "grad_norm": 1.6285738945007324, + "learning_rate": 4.7460384838744934e-07, + "loss": 0.2188, + "step": 16480 + }, + { + "epoch": 2.7153413886087154, + "grad_norm": 1.419481873512268, + "learning_rate": 4.692072524922975e-07, + "loss": 0.2209, + "step": 16490 + }, + { + "epoch": 2.7169881229287167, + "grad_norm": 1.381244421005249, + "learning_rate": 4.638407767555131e-07, + "loss": 0.2193, + "step": 16500 + }, + { + "epoch": 2.718634857248719, + "grad_norm": 1.5735642910003662, + "learning_rate": 4.585044381367609e-07, + "loss": 0.2397, + "step": 16510 + }, + { + "epoch": 2.72028159156872, + "grad_norm": 1.268129825592041, + "learning_rate": 4.531982535004731e-07, + "loss": 0.2109, + "step": 16520 + }, + { + "epoch": 2.721928325888722, + "grad_norm": 1.3103333711624146, + "learning_rate": 4.4792223961578006e-07, + "loss": 0.2041, + "step": 16530 + }, + { + "epoch": 2.7235750602087236, + "grad_norm": 1.9005833864212036, + "learning_rate": 4.4267641315646313e-07, + "loss": 0.2229, + "step": 16540 + }, + { + "epoch": 2.7252217945287254, + "grad_norm": 1.3628641366958618, + "learning_rate": 4.3746079070090765e-07, + "loss": 0.2132, + "step": 16550 + }, + { + "epoch": 2.726868528848727, + "grad_norm": 1.7968915700912476, + "learning_rate": 4.3227538873204076e-07, + "loss": 0.2321, + "step": 16560 + }, + { + "epoch": 2.7285152631687284, + "grad_norm": 1.7925100326538086, + "learning_rate": 4.271202236372829e-07, + "loss": 0.2242, + "step": 16570 + }, + { + "epoch": 2.73016199748873, + "grad_norm": 1.5291099548339844, + "learning_rate": 4.2199531170850296e-07, + "loss": 0.2206, + "step": 16580 + }, + { + "epoch": 2.731808731808732, + "grad_norm": 1.671055793762207, + "learning_rate": 4.1690066914195306e-07, + "loss": 0.2104, + "step": 16590 + }, + { + "epoch": 2.7334554661287336, + "grad_norm": 1.9849021434783936, + "learning_rate": 4.118363120382318e-07, + "loss": 0.2212, + "step": 16600 + }, + { + "epoch": 2.7351022004487353, + "grad_norm": 1.765097975730896, + "learning_rate": 4.0680225640222227e-07, + "loss": 0.2274, + "step": 16610 + }, + { + "epoch": 2.7367489347687366, + "grad_norm": 1.7051899433135986, + "learning_rate": 4.017985181430495e-07, + "loss": 0.2262, + "step": 16620 + }, + { + "epoch": 2.7383956690887383, + "grad_norm": 1.5623761415481567, + "learning_rate": 3.9682511307402083e-07, + "loss": 0.2442, + "step": 16630 + }, + { + "epoch": 2.74004240340874, + "grad_norm": 1.9426796436309814, + "learning_rate": 3.918820569125881e-07, + "loss": 0.2348, + "step": 16640 + }, + { + "epoch": 2.741689137728742, + "grad_norm": 1.656064510345459, + "learning_rate": 3.869693652802864e-07, + "loss": 0.232, + "step": 16650 + }, + { + "epoch": 2.7433358720487435, + "grad_norm": 1.2131900787353516, + "learning_rate": 3.820870537026944e-07, + "loss": 0.2123, + "step": 16660 + }, + { + "epoch": 2.744982606368745, + "grad_norm": 1.733933448791504, + "learning_rate": 3.7723513760937525e-07, + "loss": 0.2327, + "step": 16670 + }, + { + "epoch": 2.7466293406887465, + "grad_norm": 1.47609281539917, + "learning_rate": 3.7241363233384007e-07, + "loss": 0.2185, + "step": 16680 + }, + { + "epoch": 2.7482760750087483, + "grad_norm": 1.770021915435791, + "learning_rate": 3.6762255311348696e-07, + "loss": 0.2237, + "step": 16690 + }, + { + "epoch": 2.74992280932875, + "grad_norm": 1.9828838109970093, + "learning_rate": 3.628619150895607e-07, + "loss": 0.2295, + "step": 16700 + }, + { + "epoch": 2.7515695436487517, + "grad_norm": 2.307547092437744, + "learning_rate": 3.5813173330710215e-07, + "loss": 0.2399, + "step": 16710 + }, + { + "epoch": 2.753216277968753, + "grad_norm": 1.719611644744873, + "learning_rate": 3.534320227149035e-07, + "loss": 0.2246, + "step": 16720 + }, + { + "epoch": 2.7548630122887547, + "grad_norm": 1.925981879234314, + "learning_rate": 3.48762798165454e-07, + "loss": 0.2115, + "step": 16730 + }, + { + "epoch": 2.7565097466087565, + "grad_norm": 1.7162208557128906, + "learning_rate": 3.441240744149055e-07, + "loss": 0.2397, + "step": 16740 + }, + { + "epoch": 2.758156480928758, + "grad_norm": 2.125133991241455, + "learning_rate": 3.3951586612300914e-07, + "loss": 0.2179, + "step": 16750 + }, + { + "epoch": 2.75980321524876, + "grad_norm": 1.792880654335022, + "learning_rate": 3.3493818785308886e-07, + "loss": 0.2181, + "step": 16760 + }, + { + "epoch": 2.761449949568761, + "grad_norm": 1.3869192600250244, + "learning_rate": 3.3039105407197127e-07, + "loss": 0.2357, + "step": 16770 + }, + { + "epoch": 2.763096683888763, + "grad_norm": 2.811720371246338, + "learning_rate": 3.2587447914996463e-07, + "loss": 0.2457, + "step": 16780 + }, + { + "epoch": 2.7647434182087647, + "grad_norm": 1.611673355102539, + "learning_rate": 3.213884773607967e-07, + "loss": 0.2396, + "step": 16790 + }, + { + "epoch": 2.7663901525287664, + "grad_norm": 2.732407808303833, + "learning_rate": 3.1693306288157697e-07, + "loss": 0.2192, + "step": 16800 + }, + { + "epoch": 2.768036886848768, + "grad_norm": 1.6391844749450684, + "learning_rate": 3.1250824979274675e-07, + "loss": 0.2464, + "step": 16810 + }, + { + "epoch": 2.7696836211687694, + "grad_norm": 1.7055604457855225, + "learning_rate": 3.0811405207804456e-07, + "loss": 0.2196, + "step": 16820 + }, + { + "epoch": 2.7713303554887716, + "grad_norm": 1.6378053426742554, + "learning_rate": 3.0375048362444535e-07, + "loss": 0.2241, + "step": 16830 + }, + { + "epoch": 2.772977089808773, + "grad_norm": 1.4006156921386719, + "learning_rate": 2.9941755822213704e-07, + "loss": 0.2291, + "step": 16840 + }, + { + "epoch": 2.7746238241287746, + "grad_norm": 1.677067756652832, + "learning_rate": 2.95115289564456e-07, + "loss": 0.2527, + "step": 16850 + }, + { + "epoch": 2.7762705584487763, + "grad_norm": 1.372916340827942, + "learning_rate": 2.9084369124786293e-07, + "loss": 0.2291, + "step": 16860 + }, + { + "epoch": 2.777917292768778, + "grad_norm": 1.4055744409561157, + "learning_rate": 2.8660277677188487e-07, + "loss": 0.2199, + "step": 16870 + }, + { + "epoch": 2.77956402708878, + "grad_norm": 1.945945382118225, + "learning_rate": 2.8239255953908305e-07, + "loss": 0.2199, + "step": 16880 + }, + { + "epoch": 2.781210761408781, + "grad_norm": 1.2029650211334229, + "learning_rate": 2.782130528550031e-07, + "loss": 0.2292, + "step": 16890 + }, + { + "epoch": 2.782857495728783, + "grad_norm": 2.0258445739746094, + "learning_rate": 2.740642699281382e-07, + "loss": 0.2193, + "step": 16900 + }, + { + "epoch": 2.7845042300487846, + "grad_norm": 1.2839769124984741, + "learning_rate": 2.699462238698847e-07, + "loss": 0.2206, + "step": 16910 + }, + { + "epoch": 2.7861509643687863, + "grad_norm": 1.4832942485809326, + "learning_rate": 2.6585892769450005e-07, + "loss": 0.2196, + "step": 16920 + }, + { + "epoch": 2.787797698688788, + "grad_norm": 1.6463637351989746, + "learning_rate": 2.6180239431906284e-07, + "loss": 0.2127, + "step": 16930 + }, + { + "epoch": 2.7894444330087893, + "grad_norm": 1.6157695055007935, + "learning_rate": 2.57776636563436e-07, + "loss": 0.2241, + "step": 16940 + }, + { + "epoch": 2.791091167328791, + "grad_norm": 1.478996753692627, + "learning_rate": 2.537816671502158e-07, + "loss": 0.2124, + "step": 16950 + }, + { + "epoch": 2.7927379016487928, + "grad_norm": 1.402809739112854, + "learning_rate": 2.498174987047042e-07, + "loss": 0.2311, + "step": 16960 + }, + { + "epoch": 2.7943846359687945, + "grad_norm": 1.756034016609192, + "learning_rate": 2.458841437548587e-07, + "loss": 0.208, + "step": 16970 + }, + { + "epoch": 2.7960313702887962, + "grad_norm": 1.5966026782989502, + "learning_rate": 2.4198161473126147e-07, + "loss": 0.2216, + "step": 16980 + }, + { + "epoch": 2.7976781046087975, + "grad_norm": 1.536316990852356, + "learning_rate": 2.3810992396706812e-07, + "loss": 0.2261, + "step": 16990 + }, + { + "epoch": 2.7993248389287992, + "grad_norm": 2.4802510738372803, + "learning_rate": 2.342690836979833e-07, + "loss": 0.2265, + "step": 17000 + }, + { + "epoch": 2.800971573248801, + "grad_norm": 1.9055966138839722, + "learning_rate": 2.3045910606221078e-07, + "loss": 0.2162, + "step": 17010 + }, + { + "epoch": 2.8026183075688027, + "grad_norm": 1.6244001388549805, + "learning_rate": 2.2668000310042237e-07, + "loss": 0.2292, + "step": 17020 + }, + { + "epoch": 2.8042650418888044, + "grad_norm": 1.7967705726623535, + "learning_rate": 2.2293178675571236e-07, + "loss": 0.2163, + "step": 17030 + }, + { + "epoch": 2.8059117762088057, + "grad_norm": 1.432999610900879, + "learning_rate": 2.1921446887356869e-07, + "loss": 0.2227, + "step": 17040 + }, + { + "epoch": 2.8075585105288074, + "grad_norm": 1.8171266317367554, + "learning_rate": 2.1552806120182734e-07, + "loss": 0.219, + "step": 17050 + }, + { + "epoch": 2.809205244848809, + "grad_norm": 1.6710107326507568, + "learning_rate": 2.1187257539064143e-07, + "loss": 0.2164, + "step": 17060 + }, + { + "epoch": 2.810851979168811, + "grad_norm": 2.254349708557129, + "learning_rate": 2.0824802299243775e-07, + "loss": 0.2159, + "step": 17070 + }, + { + "epoch": 2.8124987134888126, + "grad_norm": 1.3687024116516113, + "learning_rate": 2.0465441546189125e-07, + "loss": 0.2183, + "step": 17080 + }, + { + "epoch": 2.814145447808814, + "grad_norm": 1.6281770467758179, + "learning_rate": 2.0109176415587294e-07, + "loss": 0.2154, + "step": 17090 + }, + { + "epoch": 2.8157921821288157, + "grad_norm": 1.8856794834136963, + "learning_rate": 1.9756008033343211e-07, + "loss": 0.213, + "step": 17100 + }, + { + "epoch": 2.8174389164488174, + "grad_norm": 1.6289527416229248, + "learning_rate": 1.940593751557429e-07, + "loss": 0.2276, + "step": 17110 + }, + { + "epoch": 2.819085650768819, + "grad_norm": 1.4464855194091797, + "learning_rate": 1.9058965968608567e-07, + "loss": 0.2174, + "step": 17120 + }, + { + "epoch": 2.820732385088821, + "grad_norm": 1.4708118438720703, + "learning_rate": 1.8715094488979568e-07, + "loss": 0.2193, + "step": 17130 + }, + { + "epoch": 2.822379119408822, + "grad_norm": 1.6330119371414185, + "learning_rate": 1.837432416342444e-07, + "loss": 0.2153, + "step": 17140 + }, + { + "epoch": 2.824025853728824, + "grad_norm": 1.0777000188827515, + "learning_rate": 1.8036656068879166e-07, + "loss": 0.2284, + "step": 17150 + }, + { + "epoch": 2.8256725880488256, + "grad_norm": 1.7415492534637451, + "learning_rate": 1.770209127247635e-07, + "loss": 0.2217, + "step": 17160 + }, + { + "epoch": 2.8273193223688273, + "grad_norm": 1.8589420318603516, + "learning_rate": 1.7370630831540668e-07, + "loss": 0.2278, + "step": 17170 + }, + { + "epoch": 2.828966056688829, + "grad_norm": 1.300451636314392, + "learning_rate": 1.7042275793586416e-07, + "loss": 0.212, + "step": 17180 + }, + { + "epoch": 2.8306127910088303, + "grad_norm": 1.7537988424301147, + "learning_rate": 1.671702719631374e-07, + "loss": 0.222, + "step": 17190 + }, + { + "epoch": 2.8322595253288325, + "grad_norm": 1.643229365348816, + "learning_rate": 1.6394886067605752e-07, + "loss": 0.2198, + "step": 17200 + }, + { + "epoch": 2.833906259648834, + "grad_norm": 1.942588210105896, + "learning_rate": 1.6075853425524646e-07, + "loss": 0.2279, + "step": 17210 + }, + { + "epoch": 2.8355529939688355, + "grad_norm": 1.780501365661621, + "learning_rate": 1.5759930278309243e-07, + "loss": 0.2243, + "step": 17220 + }, + { + "epoch": 2.8371997282888373, + "grad_norm": 1.2578332424163818, + "learning_rate": 1.5447117624371122e-07, + "loss": 0.2193, + "step": 17230 + }, + { + "epoch": 2.838846462608839, + "grad_norm": 1.4017175436019897, + "learning_rate": 1.5137416452292164e-07, + "loss": 0.1921, + "step": 17240 + }, + { + "epoch": 2.8404931969288407, + "grad_norm": 1.6967748403549194, + "learning_rate": 1.4830827740820453e-07, + "loss": 0.233, + "step": 17250 + }, + { + "epoch": 2.842139931248842, + "grad_norm": 1.5690455436706543, + "learning_rate": 1.4527352458868494e-07, + "loss": 0.2263, + "step": 17260 + }, + { + "epoch": 2.8437866655688437, + "grad_norm": 2.2616426944732666, + "learning_rate": 1.4226991565508662e-07, + "loss": 0.2235, + "step": 17270 + }, + { + "epoch": 2.8454333998888455, + "grad_norm": 2.0758249759674072, + "learning_rate": 1.3929746009971434e-07, + "loss": 0.2137, + "step": 17280 + }, + { + "epoch": 2.847080134208847, + "grad_norm": 1.4178270101547241, + "learning_rate": 1.3635616731641933e-07, + "loss": 0.2044, + "step": 17290 + }, + { + "epoch": 2.848726868528849, + "grad_norm": 1.5788546800613403, + "learning_rate": 1.3344604660056494e-07, + "loss": 0.2198, + "step": 17300 + }, + { + "epoch": 2.85037360284885, + "grad_norm": 1.6083252429962158, + "learning_rate": 1.3056710714900334e-07, + "loss": 0.2158, + "step": 17310 + }, + { + "epoch": 2.852020337168852, + "grad_norm": 1.376654863357544, + "learning_rate": 1.2771935806004776e-07, + "loss": 0.2158, + "step": 17320 + }, + { + "epoch": 2.8536670714888537, + "grad_norm": 1.5676956176757812, + "learning_rate": 1.2490280833343694e-07, + "loss": 0.2127, + "step": 17330 + }, + { + "epoch": 2.8553138058088554, + "grad_norm": 1.510061502456665, + "learning_rate": 1.2211746687030958e-07, + "loss": 0.2066, + "step": 17340 + }, + { + "epoch": 2.856960540128857, + "grad_norm": 1.0092027187347412, + "learning_rate": 1.1936334247318104e-07, + "loss": 0.2072, + "step": 17350 + }, + { + "epoch": 2.8586072744488584, + "grad_norm": 1.4783754348754883, + "learning_rate": 1.1664044384590679e-07, + "loss": 0.2055, + "step": 17360 + }, + { + "epoch": 2.86025400876886, + "grad_norm": 1.3680094480514526, + "learning_rate": 1.1394877959366223e-07, + "loss": 0.2041, + "step": 17370 + }, + { + "epoch": 2.861900743088862, + "grad_norm": 1.7390999794006348, + "learning_rate": 1.1128835822291406e-07, + "loss": 0.2109, + "step": 17380 + }, + { + "epoch": 2.8635474774088636, + "grad_norm": 1.5219812393188477, + "learning_rate": 1.0865918814138677e-07, + "loss": 0.2101, + "step": 17390 + }, + { + "epoch": 2.8651942117288653, + "grad_norm": 1.1699146032333374, + "learning_rate": 1.060612776580483e-07, + "loss": 0.227, + "step": 17400 + }, + { + "epoch": 2.8668409460488666, + "grad_norm": 1.836059331893921, + "learning_rate": 1.0349463498307233e-07, + "loss": 0.2264, + "step": 17410 + }, + { + "epoch": 2.8684876803688684, + "grad_norm": 1.6597439050674438, + "learning_rate": 1.009592682278171e-07, + "loss": 0.2205, + "step": 17420 + }, + { + "epoch": 2.87013441468887, + "grad_norm": 1.5050686597824097, + "learning_rate": 9.845518540480214e-08, + "loss": 0.2203, + "step": 17430 + }, + { + "epoch": 2.871781149008872, + "grad_norm": 1.6144781112670898, + "learning_rate": 9.598239442767721e-08, + "loss": 0.249, + "step": 17440 + }, + { + "epoch": 2.8734278833288736, + "grad_norm": 1.542999029159546, + "learning_rate": 9.354090311120334e-08, + "loss": 0.2278, + "step": 17450 + }, + { + "epoch": 2.875074617648875, + "grad_norm": 1.3249629735946655, + "learning_rate": 9.113071917122407e-08, + "loss": 0.2044, + "step": 17460 + }, + { + "epoch": 2.8767213519688766, + "grad_norm": 1.4665296077728271, + "learning_rate": 8.875185022464094e-08, + "loss": 0.2165, + "step": 17470 + }, + { + "epoch": 2.8783680862888783, + "grad_norm": 1.5788675546646118, + "learning_rate": 8.640430378939246e-08, + "loss": 0.2187, + "step": 17480 + }, + { + "epoch": 2.88001482060888, + "grad_norm": 1.677793025970459, + "learning_rate": 8.408808728442963e-08, + "loss": 0.2058, + "step": 17490 + }, + { + "epoch": 2.8816615549288818, + "grad_norm": 1.5491596460342407, + "learning_rate": 8.180320802968822e-08, + "loss": 0.2036, + "step": 17500 + }, + { + "epoch": 2.883308289248883, + "grad_norm": 1.7089343070983887, + "learning_rate": 7.95496732460721e-08, + "loss": 0.2199, + "step": 17510 + }, + { + "epoch": 2.8849550235688852, + "grad_norm": 1.4340360164642334, + "learning_rate": 7.732749005542439e-08, + "loss": 0.2135, + "step": 17520 + }, + { + "epoch": 2.8866017578888865, + "grad_norm": 1.716632604598999, + "learning_rate": 7.51366654805108e-08, + "loss": 0.2367, + "step": 17530 + }, + { + "epoch": 2.8882484922088882, + "grad_norm": 1.836976170539856, + "learning_rate": 7.297720644499073e-08, + "loss": 0.2203, + "step": 17540 + }, + { + "epoch": 2.88989522652889, + "grad_norm": 1.2886078357696533, + "learning_rate": 7.084911977340404e-08, + "loss": 0.2142, + "step": 17550 + }, + { + "epoch": 2.8915419608488917, + "grad_norm": 1.4025774002075195, + "learning_rate": 6.875241219113982e-08, + "loss": 0.2067, + "step": 17560 + }, + { + "epoch": 2.8931886951688934, + "grad_norm": 1.7300608158111572, + "learning_rate": 6.66870903244221e-08, + "loss": 0.2204, + "step": 17570 + }, + { + "epoch": 2.8948354294888947, + "grad_norm": 1.5215460062026978, + "learning_rate": 6.465316070028538e-08, + "loss": 0.2343, + "step": 17580 + }, + { + "epoch": 2.8964821638088964, + "grad_norm": 1.8203786611557007, + "learning_rate": 6.265062974655789e-08, + "loss": 0.21, + "step": 17590 + }, + { + "epoch": 2.898128898128898, + "grad_norm": 1.4137710332870483, + "learning_rate": 6.067950379183619e-08, + "loss": 0.224, + "step": 17600 + }, + { + "epoch": 2.8997756324489, + "grad_norm": 1.8194389343261719, + "learning_rate": 5.87397890654684e-08, + "loss": 0.2254, + "step": 17610 + }, + { + "epoch": 2.9014223667689016, + "grad_norm": 2.260550022125244, + "learning_rate": 5.683149169753433e-08, + "loss": 0.2237, + "step": 17620 + }, + { + "epoch": 2.903069101088903, + "grad_norm": 1.4916115999221802, + "learning_rate": 5.4954617718823154e-08, + "loss": 0.2235, + "step": 17630 + }, + { + "epoch": 2.9047158354089047, + "grad_norm": 1.691039800643921, + "learning_rate": 5.3109173060820196e-08, + "loss": 0.2222, + "step": 17640 + }, + { + "epoch": 2.9063625697289064, + "grad_norm": 1.4430158138275146, + "learning_rate": 5.129516355568354e-08, + "loss": 0.2101, + "step": 17650 + }, + { + "epoch": 2.908009304048908, + "grad_norm": 1.406510829925537, + "learning_rate": 4.9512594936224065e-08, + "loss": 0.2136, + "step": 17660 + }, + { + "epoch": 2.90965603836891, + "grad_norm": 1.539867639541626, + "learning_rate": 4.776147283589438e-08, + "loss": 0.2305, + "step": 17670 + }, + { + "epoch": 2.911302772688911, + "grad_norm": 1.5464060306549072, + "learning_rate": 4.6041802788762136e-08, + "loss": 0.218, + "step": 17680 + }, + { + "epoch": 2.912949507008913, + "grad_norm": 1.3350013494491577, + "learning_rate": 4.435359022950336e-08, + "loss": 0.2234, + "step": 17690 + }, + { + "epoch": 2.9145962413289146, + "grad_norm": 1.1990008354187012, + "learning_rate": 4.269684049337142e-08, + "loss": 0.2208, + "step": 17700 + }, + { + "epoch": 2.9162429756489163, + "grad_norm": 1.4853509664535522, + "learning_rate": 4.1071558816193626e-08, + "loss": 0.219, + "step": 17710 + }, + { + "epoch": 2.917889709968918, + "grad_norm": 1.4338077306747437, + "learning_rate": 3.947775033434575e-08, + "loss": 0.2077, + "step": 17720 + }, + { + "epoch": 2.9195364442889193, + "grad_norm": 2.2821855545043945, + "learning_rate": 3.7915420084740915e-08, + "loss": 0.2177, + "step": 17730 + }, + { + "epoch": 2.921183178608921, + "grad_norm": 1.7811158895492554, + "learning_rate": 3.6384573004808465e-08, + "loss": 0.2172, + "step": 17740 + }, + { + "epoch": 2.922829912928923, + "grad_norm": 1.8270400762557983, + "learning_rate": 3.488521393248401e-08, + "loss": 0.2127, + "step": 17750 + }, + { + "epoch": 2.9244766472489245, + "grad_norm": 1.455257534980774, + "learning_rate": 3.341734760619275e-08, + "loss": 0.2223, + "step": 17760 + }, + { + "epoch": 2.9261233815689263, + "grad_norm": 1.5582276582717896, + "learning_rate": 3.198097866483063e-08, + "loss": 0.2194, + "step": 17770 + }, + { + "epoch": 2.9277701158889275, + "grad_norm": 1.5785218477249146, + "learning_rate": 3.0576111647752096e-08, + "loss": 0.2032, + "step": 17780 + }, + { + "epoch": 2.9294168502089293, + "grad_norm": 1.5375938415527344, + "learning_rate": 2.920275099476011e-08, + "loss": 0.2294, + "step": 17790 + }, + { + "epoch": 2.931063584528931, + "grad_norm": 2.171330451965332, + "learning_rate": 2.7860901046082856e-08, + "loss": 0.2358, + "step": 17800 + }, + { + "epoch": 2.9327103188489327, + "grad_norm": 1.675736665725708, + "learning_rate": 2.6550566042370386e-08, + "loss": 0.2215, + "step": 17810 + }, + { + "epoch": 2.9343570531689345, + "grad_norm": 2.0025336742401123, + "learning_rate": 2.5271750124672423e-08, + "loss": 0.2151, + "step": 17820 + }, + { + "epoch": 2.9360037874889358, + "grad_norm": 1.4711110591888428, + "learning_rate": 2.4024457334430595e-08, + "loss": 0.2173, + "step": 17830 + }, + { + "epoch": 2.9376505218089375, + "grad_norm": 1.8637895584106445, + "learning_rate": 2.2808691613461776e-08, + "loss": 0.2215, + "step": 17840 + }, + { + "epoch": 2.939297256128939, + "grad_norm": 2.4678914546966553, + "learning_rate": 2.162445680395142e-08, + "loss": 0.2373, + "step": 17850 + }, + { + "epoch": 2.940943990448941, + "grad_norm": 1.4459493160247803, + "learning_rate": 2.0471756648435814e-08, + "loss": 0.2033, + "step": 17860 + }, + { + "epoch": 2.9425907247689427, + "grad_norm": 1.8387855291366577, + "learning_rate": 1.9350594789792064e-08, + "loss": 0.2107, + "step": 17870 + }, + { + "epoch": 2.944237459088944, + "grad_norm": 1.8373688459396362, + "learning_rate": 1.8260974771227015e-08, + "loss": 0.2318, + "step": 17880 + }, + { + "epoch": 2.945884193408946, + "grad_norm": 1.6983997821807861, + "learning_rate": 1.7202900036268343e-08, + "loss": 0.2178, + "step": 17890 + }, + { + "epoch": 2.9475309277289474, + "grad_norm": 1.5288745164871216, + "learning_rate": 1.6176373928745715e-08, + "loss": 0.2205, + "step": 17900 + }, + { + "epoch": 2.949177662048949, + "grad_norm": 1.6072468757629395, + "learning_rate": 1.5181399692790756e-08, + "loss": 0.2076, + "step": 17910 + }, + { + "epoch": 2.950824396368951, + "grad_norm": 3.108124017715454, + "learning_rate": 1.4217980472819304e-08, + "loss": 0.233, + "step": 17920 + }, + { + "epoch": 2.9524711306889526, + "grad_norm": 1.8453097343444824, + "learning_rate": 1.3286119313525858e-08, + "loss": 0.2303, + "step": 17930 + }, + { + "epoch": 2.9541178650089543, + "grad_norm": 2.0312986373901367, + "learning_rate": 1.2385819159869138e-08, + "loss": 0.2122, + "step": 17940 + }, + { + "epoch": 2.9557645993289556, + "grad_norm": 1.338973879814148, + "learning_rate": 1.1517082857067652e-08, + "loss": 0.2248, + "step": 17950 + }, + { + "epoch": 2.9574113336489574, + "grad_norm": 1.7432622909545898, + "learning_rate": 1.0679913150588584e-08, + "loss": 0.2213, + "step": 17960 + }, + { + "epoch": 2.959058067968959, + "grad_norm": 1.9854906797409058, + "learning_rate": 9.87431268613781e-09, + "loss": 0.2312, + "step": 17970 + }, + { + "epoch": 2.960704802288961, + "grad_norm": 1.817718267440796, + "learning_rate": 9.100284009655458e-09, + "loss": 0.2276, + "step": 17980 + }, + { + "epoch": 2.9623515366089626, + "grad_norm": 1.655806064605713, + "learning_rate": 8.357829567302577e-09, + "loss": 0.2261, + "step": 17990 + }, + { + "epoch": 2.963998270928964, + "grad_norm": 1.608216643333435, + "learning_rate": 7.646951705457817e-09, + "loss": 0.2112, + "step": 18000 + } + ], + "logging_steps": 10, + "max_steps": 18219, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2831001863587446e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}