{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.963998270928964, "eval_steps": 500, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016467343200016467, "grad_norm": 30.985063552856445, "learning_rate": 3.2906764168190127e-07, "loss": 1.3972, "step": 10 }, { "epoch": 0.0032934686400032933, "grad_norm": 31.839323043823242, "learning_rate": 6.946983546617917e-07, "loss": 1.3292, "step": 20 }, { "epoch": 0.0049402029600049404, "grad_norm": 34.46781921386719, "learning_rate": 1.060329067641682e-06, "loss": 1.2296, "step": 30 }, { "epoch": 0.006586937280006587, "grad_norm": 34.32807540893555, "learning_rate": 1.4259597806215722e-06, "loss": 1.2755, "step": 40 }, { "epoch": 0.008233671600008234, "grad_norm": 24.143030166625977, "learning_rate": 1.7915904936014627e-06, "loss": 1.2333, "step": 50 }, { "epoch": 0.009880405920009881, "grad_norm": 20.771278381347656, "learning_rate": 2.157221206581353e-06, "loss": 1.206, "step": 60 }, { "epoch": 0.011527140240011528, "grad_norm": 25.031925201416016, "learning_rate": 2.5228519195612434e-06, "loss": 1.2008, "step": 70 }, { "epoch": 0.013173874560013173, "grad_norm": 24.008222579956055, "learning_rate": 2.8884826325411334e-06, "loss": 1.1446, "step": 80 }, { "epoch": 0.01482060888001482, "grad_norm": 23.959667205810547, "learning_rate": 3.254113345521024e-06, "loss": 1.2072, "step": 90 }, { "epoch": 0.016467343200016468, "grad_norm": 16.00537109375, "learning_rate": 3.6197440585009143e-06, "loss": 1.1743, "step": 100 }, { "epoch": 0.018114077520018115, "grad_norm": 13.254168510437012, "learning_rate": 3.985374771480805e-06, "loss": 1.1374, "step": 110 }, { "epoch": 0.019760811840019762, "grad_norm": 14.698610305786133, "learning_rate": 4.351005484460696e-06, "loss": 1.1192, "step": 120 }, { "epoch": 0.02140754616002141, "grad_norm": 12.052921295166016, "learning_rate": 4.716636197440586e-06, "loss": 1.089, "step": 130 }, { "epoch": 0.023054280480023056, "grad_norm": 9.61983871459961, "learning_rate": 5.082266910420476e-06, "loss": 1.0972, "step": 140 }, { "epoch": 0.0247010148000247, "grad_norm": 10.221634864807129, "learning_rate": 5.447897623400366e-06, "loss": 1.1082, "step": 150 }, { "epoch": 0.026347749120026347, "grad_norm": 14.476886749267578, "learning_rate": 5.813528336380257e-06, "loss": 1.1348, "step": 160 }, { "epoch": 0.027994483440027994, "grad_norm": 23.15399169921875, "learning_rate": 6.1791590493601475e-06, "loss": 1.0889, "step": 170 }, { "epoch": 0.02964121776002964, "grad_norm": 16.883651733398438, "learning_rate": 6.544789762340037e-06, "loss": 1.1055, "step": 180 }, { "epoch": 0.03128795208003129, "grad_norm": 12.747838020324707, "learning_rate": 6.9104204753199275e-06, "loss": 1.0862, "step": 190 }, { "epoch": 0.032934686400032935, "grad_norm": 11.923359870910645, "learning_rate": 7.2760511882998175e-06, "loss": 1.0944, "step": 200 }, { "epoch": 0.03458142072003458, "grad_norm": 10.496978759765625, "learning_rate": 7.641681901279708e-06, "loss": 1.0962, "step": 210 }, { "epoch": 0.03622815504003623, "grad_norm": 8.666916847229004, "learning_rate": 8.007312614259598e-06, "loss": 1.0826, "step": 220 }, { "epoch": 0.037874889360037876, "grad_norm": 7.300290584564209, "learning_rate": 8.372943327239488e-06, "loss": 1.0932, "step": 230 }, { "epoch": 0.039521623680039523, "grad_norm": 10.057311058044434, "learning_rate": 8.73857404021938e-06, "loss": 1.0466, "step": 240 }, { "epoch": 0.04116835800004117, "grad_norm": 11.838115692138672, "learning_rate": 9.10420475319927e-06, "loss": 1.0881, "step": 250 }, { "epoch": 0.04281509232004282, "grad_norm": 6.902042388916016, "learning_rate": 9.469835466179161e-06, "loss": 1.0618, "step": 260 }, { "epoch": 0.044461826640044465, "grad_norm": 5.424289703369141, "learning_rate": 9.83546617915905e-06, "loss": 1.0146, "step": 270 }, { "epoch": 0.04610856096004611, "grad_norm": 5.259349346160889, "learning_rate": 1.020109689213894e-05, "loss": 0.9994, "step": 280 }, { "epoch": 0.04775529528004775, "grad_norm": 6.474951267242432, "learning_rate": 1.0566727605118832e-05, "loss": 1.0003, "step": 290 }, { "epoch": 0.0494020296000494, "grad_norm": 7.7709455490112305, "learning_rate": 1.0932358318098721e-05, "loss": 1.0263, "step": 300 }, { "epoch": 0.051048763920051046, "grad_norm": 8.521011352539062, "learning_rate": 1.129798903107861e-05, "loss": 1.0383, "step": 310 }, { "epoch": 0.05269549824005269, "grad_norm": 6.253118515014648, "learning_rate": 1.1663619744058501e-05, "loss": 1.0609, "step": 320 }, { "epoch": 0.05434223256005434, "grad_norm": 6.294135093688965, "learning_rate": 1.2029250457038392e-05, "loss": 1.0212, "step": 330 }, { "epoch": 0.05598896688005599, "grad_norm": 8.303707122802734, "learning_rate": 1.2394881170018283e-05, "loss": 0.9964, "step": 340 }, { "epoch": 0.057635701200057635, "grad_norm": 4.852534294128418, "learning_rate": 1.2760511882998172e-05, "loss": 0.9875, "step": 350 }, { "epoch": 0.05928243552005928, "grad_norm": 6.304441928863525, "learning_rate": 1.3126142595978065e-05, "loss": 1.0425, "step": 360 }, { "epoch": 0.06092916984006093, "grad_norm": 5.18842887878418, "learning_rate": 1.3491773308957954e-05, "loss": 1.0101, "step": 370 }, { "epoch": 0.06257590416006258, "grad_norm": 5.784980773925781, "learning_rate": 1.3857404021937843e-05, "loss": 1.0046, "step": 380 }, { "epoch": 0.06422263848006422, "grad_norm": 8.19119930267334, "learning_rate": 1.4223034734917734e-05, "loss": 0.9889, "step": 390 }, { "epoch": 0.06586937280006587, "grad_norm": 6.851486682891846, "learning_rate": 1.4588665447897625e-05, "loss": 1.0111, "step": 400 }, { "epoch": 0.06751610712006752, "grad_norm": 4.777026176452637, "learning_rate": 1.4954296160877516e-05, "loss": 0.9922, "step": 410 }, { "epoch": 0.06916284144006916, "grad_norm": 5.17830753326416, "learning_rate": 1.5319926873857403e-05, "loss": 0.9556, "step": 420 }, { "epoch": 0.07080957576007081, "grad_norm": 5.6952080726623535, "learning_rate": 1.5685557586837297e-05, "loss": 0.9696, "step": 430 }, { "epoch": 0.07245631008007246, "grad_norm": 4.926089286804199, "learning_rate": 1.6051188299817185e-05, "loss": 0.9668, "step": 440 }, { "epoch": 0.0741030444000741, "grad_norm": 4.33101749420166, "learning_rate": 1.6416819012797076e-05, "loss": 0.9884, "step": 450 }, { "epoch": 0.07574977872007575, "grad_norm": 3.4968671798706055, "learning_rate": 1.6782449725776967e-05, "loss": 0.9852, "step": 460 }, { "epoch": 0.0773965130400774, "grad_norm": 3.997958183288574, "learning_rate": 1.7148080438756858e-05, "loss": 0.9675, "step": 470 }, { "epoch": 0.07904324736007905, "grad_norm": 4.513462066650391, "learning_rate": 1.751371115173675e-05, "loss": 0.9886, "step": 480 }, { "epoch": 0.0806899816800807, "grad_norm": 4.011235237121582, "learning_rate": 1.7879341864716636e-05, "loss": 0.9802, "step": 490 }, { "epoch": 0.08233671600008234, "grad_norm": 4.776538848876953, "learning_rate": 1.8244972577696527e-05, "loss": 0.928, "step": 500 }, { "epoch": 0.08398345032008399, "grad_norm": 4.51215934753418, "learning_rate": 1.8610603290676418e-05, "loss": 0.9286, "step": 510 }, { "epoch": 0.08563018464008564, "grad_norm": 3.450950860977173, "learning_rate": 1.897623400365631e-05, "loss": 0.9485, "step": 520 }, { "epoch": 0.08727691896008728, "grad_norm": 3.313753604888916, "learning_rate": 1.93418647166362e-05, "loss": 0.9803, "step": 530 }, { "epoch": 0.08892365328008893, "grad_norm": 4.609573841094971, "learning_rate": 1.970749542961609e-05, "loss": 0.9782, "step": 540 }, { "epoch": 0.09057038760009058, "grad_norm": 3.477215528488159, "learning_rate": 1.9999999367939968e-05, "loss": 0.9189, "step": 550 }, { "epoch": 0.09221712192009222, "grad_norm": 5.594038963317871, "learning_rate": 1.9999977245847137e-05, "loss": 0.9146, "step": 560 }, { "epoch": 0.09386385624009386, "grad_norm": 2.6363534927368164, "learning_rate": 1.9999923520832466e-05, "loss": 0.895, "step": 570 }, { "epoch": 0.0955105905600955, "grad_norm": 3.1434268951416016, "learning_rate": 1.999983819306574e-05, "loss": 0.9041, "step": 580 }, { "epoch": 0.09715732488009715, "grad_norm": 3.609675168991089, "learning_rate": 1.999972126281662e-05, "loss": 0.9154, "step": 590 }, { "epoch": 0.0988040592000988, "grad_norm": 4.128945350646973, "learning_rate": 1.9999572730454638e-05, "loss": 0.9061, "step": 600 }, { "epoch": 0.10045079352010045, "grad_norm": 3.6238315105438232, "learning_rate": 1.999939259644921e-05, "loss": 0.9326, "step": 610 }, { "epoch": 0.10209752784010209, "grad_norm": 3.4944262504577637, "learning_rate": 1.9999180861369605e-05, "loss": 0.894, "step": 620 }, { "epoch": 0.10374426216010374, "grad_norm": 2.8273308277130127, "learning_rate": 1.999893752588497e-05, "loss": 0.9058, "step": 630 }, { "epoch": 0.10539099648010539, "grad_norm": 3.1057746410369873, "learning_rate": 1.999866259076432e-05, "loss": 0.8943, "step": 640 }, { "epoch": 0.10703773080010703, "grad_norm": 4.640377998352051, "learning_rate": 1.9998356056876532e-05, "loss": 0.9156, "step": 650 }, { "epoch": 0.10868446512010868, "grad_norm": 4.991531848907471, "learning_rate": 1.9998017925190345e-05, "loss": 0.9163, "step": 660 }, { "epoch": 0.11033119944011033, "grad_norm": 7.1393327713012695, "learning_rate": 1.9997648196774354e-05, "loss": 0.8828, "step": 670 }, { "epoch": 0.11197793376011198, "grad_norm": 3.893332004547119, "learning_rate": 1.9997246872797018e-05, "loss": 0.9226, "step": 680 }, { "epoch": 0.11362466808011362, "grad_norm": 3.794612169265747, "learning_rate": 1.999681395452663e-05, "loss": 0.8738, "step": 690 }, { "epoch": 0.11527140240011527, "grad_norm": 3.3877007961273193, "learning_rate": 1.9996349443331354e-05, "loss": 0.8742, "step": 700 }, { "epoch": 0.11691813672011692, "grad_norm": 5.569369316101074, "learning_rate": 1.9995853340679173e-05, "loss": 0.8674, "step": 710 }, { "epoch": 0.11856487104011856, "grad_norm": 3.396794080734253, "learning_rate": 1.999532564813793e-05, "loss": 0.914, "step": 720 }, { "epoch": 0.12021160536012021, "grad_norm": 5.97723388671875, "learning_rate": 1.9994766367375283e-05, "loss": 0.8892, "step": 730 }, { "epoch": 0.12185833968012186, "grad_norm": 3.1893229484558105, "learning_rate": 1.999417550015873e-05, "loss": 0.8554, "step": 740 }, { "epoch": 0.1235050740001235, "grad_norm": 3.7248144149780273, "learning_rate": 1.999355304835559e-05, "loss": 0.857, "step": 750 }, { "epoch": 0.12515180832012515, "grad_norm": 3.797175645828247, "learning_rate": 1.9992899013932994e-05, "loss": 0.8646, "step": 760 }, { "epoch": 0.1267985426401268, "grad_norm": 2.7383298873901367, "learning_rate": 1.999221339895789e-05, "loss": 0.8749, "step": 770 }, { "epoch": 0.12844527696012845, "grad_norm": 2.588918447494507, "learning_rate": 1.9991496205597023e-05, "loss": 0.8897, "step": 780 }, { "epoch": 0.1300920112801301, "grad_norm": 2.533170223236084, "learning_rate": 1.999074743611694e-05, "loss": 0.8611, "step": 790 }, { "epoch": 0.13173874560013174, "grad_norm": 2.219616174697876, "learning_rate": 1.998996709288398e-05, "loss": 0.8416, "step": 800 }, { "epoch": 0.1333854799201334, "grad_norm": 2.3094987869262695, "learning_rate": 1.9989155178364253e-05, "loss": 0.8401, "step": 810 }, { "epoch": 0.13503221424013503, "grad_norm": 1.8022428750991821, "learning_rate": 1.998831169512366e-05, "loss": 0.8049, "step": 820 }, { "epoch": 0.13667894856013668, "grad_norm": 1.8802989721298218, "learning_rate": 1.998743664582786e-05, "loss": 0.829, "step": 830 }, { "epoch": 0.13832568288013833, "grad_norm": 2.1697161197662354, "learning_rate": 1.9986530033242263e-05, "loss": 0.8503, "step": 840 }, { "epoch": 0.13997241720013998, "grad_norm": 4.128193378448486, "learning_rate": 1.9985591860232047e-05, "loss": 0.8472, "step": 850 }, { "epoch": 0.14161915152014162, "grad_norm": 3.771169900894165, "learning_rate": 1.9984622129762116e-05, "loss": 0.8543, "step": 860 }, { "epoch": 0.14326588584014327, "grad_norm": 3.2667131423950195, "learning_rate": 1.99836208448971e-05, "loss": 0.8512, "step": 870 }, { "epoch": 0.14491262016014492, "grad_norm": 2.6572048664093018, "learning_rate": 1.9982588008801368e-05, "loss": 0.8274, "step": 880 }, { "epoch": 0.14655935448014656, "grad_norm": 2.6121675968170166, "learning_rate": 1.998152362473899e-05, "loss": 0.8373, "step": 890 }, { "epoch": 0.1482060888001482, "grad_norm": 3.6851730346679688, "learning_rate": 1.998042769607374e-05, "loss": 0.8262, "step": 900 }, { "epoch": 0.14985282312014986, "grad_norm": 3.212480306625366, "learning_rate": 1.9979300226269077e-05, "loss": 0.8379, "step": 910 }, { "epoch": 0.1514995574401515, "grad_norm": 2.4245805740356445, "learning_rate": 1.9978141218888143e-05, "loss": 0.8513, "step": 920 }, { "epoch": 0.15314629176015315, "grad_norm": 3.501965284347534, "learning_rate": 1.997695067759375e-05, "loss": 0.8045, "step": 930 }, { "epoch": 0.1547930260801548, "grad_norm": 2.5098695755004883, "learning_rate": 1.997572860614836e-05, "loss": 0.8123, "step": 940 }, { "epoch": 0.15643976040015645, "grad_norm": 2.632624387741089, "learning_rate": 1.9974475008414095e-05, "loss": 0.8261, "step": 950 }, { "epoch": 0.1580864947201581, "grad_norm": 2.489697217941284, "learning_rate": 1.997318988835269e-05, "loss": 0.8348, "step": 960 }, { "epoch": 0.15973322904015974, "grad_norm": 4.488799571990967, "learning_rate": 1.9971873250025512e-05, "loss": 0.8684, "step": 970 }, { "epoch": 0.1613799633601614, "grad_norm": 2.742992877960205, "learning_rate": 1.9970525097593537e-05, "loss": 0.7831, "step": 980 }, { "epoch": 0.16302669768016304, "grad_norm": 2.506899118423462, "learning_rate": 1.996914543531732e-05, "loss": 0.8285, "step": 990 }, { "epoch": 0.16467343200016468, "grad_norm": 3.0873513221740723, "learning_rate": 1.996773426755702e-05, "loss": 0.8215, "step": 1000 }, { "epoch": 0.16632016632016633, "grad_norm": 1.7364745140075684, "learning_rate": 1.9966291598772335e-05, "loss": 0.8385, "step": 1010 }, { "epoch": 0.16796690064016798, "grad_norm": 2.113279104232788, "learning_rate": 1.9964817433522537e-05, "loss": 0.8246, "step": 1020 }, { "epoch": 0.16961363496016962, "grad_norm": 2.741102457046509, "learning_rate": 1.9963311776466435e-05, "loss": 0.8454, "step": 1030 }, { "epoch": 0.17126036928017127, "grad_norm": 2.6016159057617188, "learning_rate": 1.996177463236235e-05, "loss": 0.8078, "step": 1040 }, { "epoch": 0.17290710360017292, "grad_norm": 2.431082010269165, "learning_rate": 1.9960206006068116e-05, "loss": 0.8548, "step": 1050 }, { "epoch": 0.17455383792017456, "grad_norm": 2.077765703201294, "learning_rate": 1.9958605902541065e-05, "loss": 0.8492, "step": 1060 }, { "epoch": 0.1762005722401762, "grad_norm": 3.3199009895324707, "learning_rate": 1.9956974326838004e-05, "loss": 0.7955, "step": 1070 }, { "epoch": 0.17784730656017786, "grad_norm": 2.1479249000549316, "learning_rate": 1.9955311284115198e-05, "loss": 0.7802, "step": 1080 }, { "epoch": 0.1794940408801795, "grad_norm": 1.943623661994934, "learning_rate": 1.9953616779628364e-05, "loss": 0.8206, "step": 1090 }, { "epoch": 0.18114077520018115, "grad_norm": 2.654792070388794, "learning_rate": 1.995189081873264e-05, "loss": 0.8148, "step": 1100 }, { "epoch": 0.1827875095201828, "grad_norm": 1.9163497686386108, "learning_rate": 1.9950133406882577e-05, "loss": 0.7941, "step": 1110 }, { "epoch": 0.18443424384018445, "grad_norm": 2.9743542671203613, "learning_rate": 1.9948344549632124e-05, "loss": 0.8019, "step": 1120 }, { "epoch": 0.1860809781601861, "grad_norm": 2.794917106628418, "learning_rate": 1.9946524252634612e-05, "loss": 0.8506, "step": 1130 }, { "epoch": 0.18772771248018771, "grad_norm": 2.309389352798462, "learning_rate": 1.9944672521642715e-05, "loss": 0.7851, "step": 1140 }, { "epoch": 0.18937444680018936, "grad_norm": 1.9234291315078735, "learning_rate": 1.9942789362508463e-05, "loss": 0.7941, "step": 1150 }, { "epoch": 0.191021181120191, "grad_norm": 2.477848768234253, "learning_rate": 1.9940874781183203e-05, "loss": 0.7892, "step": 1160 }, { "epoch": 0.19266791544019266, "grad_norm": 2.7016117572784424, "learning_rate": 1.993892878371758e-05, "loss": 0.8266, "step": 1170 }, { "epoch": 0.1943146497601943, "grad_norm": 1.9896224737167358, "learning_rate": 1.9936951376261534e-05, "loss": 0.8227, "step": 1180 }, { "epoch": 0.19596138408019595, "grad_norm": 2.1344499588012695, "learning_rate": 1.993494256506426e-05, "loss": 0.8019, "step": 1190 }, { "epoch": 0.1976081184001976, "grad_norm": 2.2229461669921875, "learning_rate": 1.9932902356474208e-05, "loss": 0.7962, "step": 1200 }, { "epoch": 0.19925485272019924, "grad_norm": 2.2535150051116943, "learning_rate": 1.993083075693904e-05, "loss": 0.8405, "step": 1210 }, { "epoch": 0.2009015870402009, "grad_norm": 2.778207778930664, "learning_rate": 1.9928727773005644e-05, "loss": 0.8082, "step": 1220 }, { "epoch": 0.20254832136020254, "grad_norm": 3.0831124782562256, "learning_rate": 1.9926593411320064e-05, "loss": 0.796, "step": 1230 }, { "epoch": 0.20419505568020418, "grad_norm": 2.5188772678375244, "learning_rate": 1.9924427678627533e-05, "loss": 0.8401, "step": 1240 }, { "epoch": 0.20584179000020583, "grad_norm": 2.3578946590423584, "learning_rate": 1.9922230581772405e-05, "loss": 0.7995, "step": 1250 }, { "epoch": 0.20748852432020748, "grad_norm": 2.7173349857330322, "learning_rate": 1.992000212769817e-05, "loss": 0.8271, "step": 1260 }, { "epoch": 0.20913525864020913, "grad_norm": 2.0116770267486572, "learning_rate": 1.9917742323447414e-05, "loss": 0.7978, "step": 1270 }, { "epoch": 0.21078199296021077, "grad_norm": 1.9019718170166016, "learning_rate": 1.9915451176161788e-05, "loss": 0.8141, "step": 1280 }, { "epoch": 0.21242872728021242, "grad_norm": 2.0212275981903076, "learning_rate": 1.9913128693082e-05, "loss": 0.773, "step": 1290 }, { "epoch": 0.21407546160021407, "grad_norm": 3.2075860500335693, "learning_rate": 1.9910774881547803e-05, "loss": 0.7901, "step": 1300 }, { "epoch": 0.21572219592021571, "grad_norm": 2.278242588043213, "learning_rate": 1.9908389748997937e-05, "loss": 0.7616, "step": 1310 }, { "epoch": 0.21736893024021736, "grad_norm": 1.8260267972946167, "learning_rate": 1.990597330297014e-05, "loss": 0.7901, "step": 1320 }, { "epoch": 0.219015664560219, "grad_norm": 2.169857978820801, "learning_rate": 1.9903525551101105e-05, "loss": 0.7491, "step": 1330 }, { "epoch": 0.22066239888022066, "grad_norm": 1.7144447565078735, "learning_rate": 1.9901046501126454e-05, "loss": 0.7546, "step": 1340 }, { "epoch": 0.2223091332002223, "grad_norm": 1.9034463167190552, "learning_rate": 1.9898536160880736e-05, "loss": 0.7679, "step": 1350 }, { "epoch": 0.22395586752022395, "grad_norm": 1.7277398109436035, "learning_rate": 1.989599453829737e-05, "loss": 0.7254, "step": 1360 }, { "epoch": 0.2256026018402256, "grad_norm": 1.7360395193099976, "learning_rate": 1.989342164140865e-05, "loss": 0.7861, "step": 1370 }, { "epoch": 0.22724933616022724, "grad_norm": 1.7167317867279053, "learning_rate": 1.98908174783457e-05, "loss": 0.7765, "step": 1380 }, { "epoch": 0.2288960704802289, "grad_norm": 2.2761967182159424, "learning_rate": 1.988818205733845e-05, "loss": 0.8211, "step": 1390 }, { "epoch": 0.23054280480023054, "grad_norm": 2.0242958068847656, "learning_rate": 1.9885515386715625e-05, "loss": 0.7516, "step": 1400 }, { "epoch": 0.23218953912023219, "grad_norm": 1.9908565282821655, "learning_rate": 1.9882817474904697e-05, "loss": 0.7685, "step": 1410 }, { "epoch": 0.23383627344023383, "grad_norm": 2.352423667907715, "learning_rate": 1.9880088330431883e-05, "loss": 0.7235, "step": 1420 }, { "epoch": 0.23548300776023548, "grad_norm": 1.5180648565292358, "learning_rate": 1.9877327961922085e-05, "loss": 0.7557, "step": 1430 }, { "epoch": 0.23712974208023713, "grad_norm": 2.784830331802368, "learning_rate": 1.9874536378098905e-05, "loss": 0.7864, "step": 1440 }, { "epoch": 0.23877647640023877, "grad_norm": 1.8382682800292969, "learning_rate": 1.987171358778458e-05, "loss": 0.7465, "step": 1450 }, { "epoch": 0.24042321072024042, "grad_norm": 1.4438285827636719, "learning_rate": 1.986885959989997e-05, "loss": 0.7892, "step": 1460 }, { "epoch": 0.24206994504024207, "grad_norm": 1.4113389253616333, "learning_rate": 1.986597442346453e-05, "loss": 0.7586, "step": 1470 }, { "epoch": 0.24371667936024372, "grad_norm": 3.4214653968811035, "learning_rate": 1.9863058067596287e-05, "loss": 0.7564, "step": 1480 }, { "epoch": 0.24536341368024536, "grad_norm": 1.6341257095336914, "learning_rate": 1.9860110541511792e-05, "loss": 0.7493, "step": 1490 }, { "epoch": 0.247010148000247, "grad_norm": 4.065557956695557, "learning_rate": 1.9857131854526117e-05, "loss": 0.768, "step": 1500 }, { "epoch": 0.24865688232024866, "grad_norm": 1.5984668731689453, "learning_rate": 1.9854122016052803e-05, "loss": 0.7631, "step": 1510 }, { "epoch": 0.2503036166402503, "grad_norm": 1.571628212928772, "learning_rate": 1.9851081035603836e-05, "loss": 0.7575, "step": 1520 }, { "epoch": 0.2519503509602519, "grad_norm": 2.2074406147003174, "learning_rate": 1.9848008922789625e-05, "loss": 0.7501, "step": 1530 }, { "epoch": 0.2535970852802536, "grad_norm": 1.4130685329437256, "learning_rate": 1.984490568731897e-05, "loss": 0.7653, "step": 1540 }, { "epoch": 0.2552438196002552, "grad_norm": 1.4988036155700684, "learning_rate": 1.9841771338999022e-05, "loss": 0.7544, "step": 1550 }, { "epoch": 0.2568905539202569, "grad_norm": 1.799906611442566, "learning_rate": 1.9838605887735266e-05, "loss": 0.7264, "step": 1560 }, { "epoch": 0.2585372882402585, "grad_norm": 2.0572450160980225, "learning_rate": 1.9835409343531465e-05, "loss": 0.7212, "step": 1570 }, { "epoch": 0.2601840225602602, "grad_norm": 1.9132790565490723, "learning_rate": 1.9832181716489664e-05, "loss": 0.7176, "step": 1580 }, { "epoch": 0.2618307568802618, "grad_norm": 1.7707911729812622, "learning_rate": 1.9828923016810123e-05, "loss": 0.7647, "step": 1590 }, { "epoch": 0.2634774912002635, "grad_norm": 1.9048503637313843, "learning_rate": 1.9825633254791318e-05, "loss": 0.7202, "step": 1600 }, { "epoch": 0.2651242255202651, "grad_norm": 2.583421230316162, "learning_rate": 1.9822312440829876e-05, "loss": 0.73, "step": 1610 }, { "epoch": 0.2667709598402668, "grad_norm": 1.651800513267517, "learning_rate": 1.9818960585420562e-05, "loss": 0.706, "step": 1620 }, { "epoch": 0.2684176941602684, "grad_norm": 2.220543622970581, "learning_rate": 1.981557769915625e-05, "loss": 0.7246, "step": 1630 }, { "epoch": 0.27006442848027007, "grad_norm": 1.6591778993606567, "learning_rate": 1.9812163792727864e-05, "loss": 0.7453, "step": 1640 }, { "epoch": 0.2717111628002717, "grad_norm": 1.8537503480911255, "learning_rate": 1.9808718876924376e-05, "loss": 0.7483, "step": 1650 }, { "epoch": 0.27335789712027336, "grad_norm": 2.1067299842834473, "learning_rate": 1.9805242962632747e-05, "loss": 0.7047, "step": 1660 }, { "epoch": 0.275004631440275, "grad_norm": 1.5523687601089478, "learning_rate": 1.9801736060837913e-05, "loss": 0.7427, "step": 1670 }, { "epoch": 0.27665136576027666, "grad_norm": 1.404558777809143, "learning_rate": 1.9798198182622734e-05, "loss": 0.7976, "step": 1680 }, { "epoch": 0.2782981000802783, "grad_norm": 2.101656675338745, "learning_rate": 1.979462933916795e-05, "loss": 0.7621, "step": 1690 }, { "epoch": 0.27994483440027995, "grad_norm": 2.0541417598724365, "learning_rate": 1.9791029541752197e-05, "loss": 0.7805, "step": 1700 }, { "epoch": 0.28159156872028157, "grad_norm": 1.5349130630493164, "learning_rate": 1.9787398801751895e-05, "loss": 0.7399, "step": 1710 }, { "epoch": 0.28323830304028325, "grad_norm": 2.281158447265625, "learning_rate": 1.9783737130641272e-05, "loss": 0.7575, "step": 1720 }, { "epoch": 0.28488503736028487, "grad_norm": 2.4197065830230713, "learning_rate": 1.978004453999231e-05, "loss": 0.7578, "step": 1730 }, { "epoch": 0.28653177168028654, "grad_norm": 2.0881662368774414, "learning_rate": 1.97763210414747e-05, "loss": 0.7549, "step": 1740 }, { "epoch": 0.28817850600028816, "grad_norm": 2.4676826000213623, "learning_rate": 1.9772566646855814e-05, "loss": 0.741, "step": 1750 }, { "epoch": 0.28982524032028983, "grad_norm": 1.893269419670105, "learning_rate": 1.9768781368000658e-05, "loss": 0.7416, "step": 1760 }, { "epoch": 0.29147197464029145, "grad_norm": 1.5994690656661987, "learning_rate": 1.9764965216871848e-05, "loss": 0.7181, "step": 1770 }, { "epoch": 0.29311870896029313, "grad_norm": 1.4995242357254028, "learning_rate": 1.9761118205529565e-05, "loss": 0.7389, "step": 1780 }, { "epoch": 0.29476544328029475, "grad_norm": 1.3291387557983398, "learning_rate": 1.9757240346131517e-05, "loss": 0.7356, "step": 1790 }, { "epoch": 0.2964121776002964, "grad_norm": 1.4342174530029297, "learning_rate": 1.9753331650932898e-05, "loss": 0.722, "step": 1800 }, { "epoch": 0.29805891192029804, "grad_norm": 1.4941635131835938, "learning_rate": 1.9749392132286356e-05, "loss": 0.7491, "step": 1810 }, { "epoch": 0.2997056462402997, "grad_norm": 1.4663833379745483, "learning_rate": 1.974542180264195e-05, "loss": 0.7724, "step": 1820 }, { "epoch": 0.30135238056030134, "grad_norm": 1.420600414276123, "learning_rate": 1.974142067454711e-05, "loss": 0.7844, "step": 1830 }, { "epoch": 0.302999114880303, "grad_norm": 1.7661523818969727, "learning_rate": 1.97373887606466e-05, "loss": 0.7277, "step": 1840 }, { "epoch": 0.30464584920030463, "grad_norm": 2.7334187030792236, "learning_rate": 1.9733326073682475e-05, "loss": 0.7209, "step": 1850 }, { "epoch": 0.3062925835203063, "grad_norm": 1.8105331659317017, "learning_rate": 1.972923262649404e-05, "loss": 0.7499, "step": 1860 }, { "epoch": 0.3079393178403079, "grad_norm": 1.575403094291687, "learning_rate": 1.9725108432017812e-05, "loss": 0.7097, "step": 1870 }, { "epoch": 0.3095860521603096, "grad_norm": 1.4154794216156006, "learning_rate": 1.9720953503287487e-05, "loss": 0.7141, "step": 1880 }, { "epoch": 0.3112327864803112, "grad_norm": 1.4072984457015991, "learning_rate": 1.9716767853433877e-05, "loss": 0.7378, "step": 1890 }, { "epoch": 0.3128795208003129, "grad_norm": 1.3542795181274414, "learning_rate": 1.971255149568489e-05, "loss": 0.7385, "step": 1900 }, { "epoch": 0.3145262551203145, "grad_norm": 1.7643039226531982, "learning_rate": 1.970830444336548e-05, "loss": 0.7148, "step": 1910 }, { "epoch": 0.3161729894403162, "grad_norm": 1.285137414932251, "learning_rate": 1.9704026709897606e-05, "loss": 0.6827, "step": 1920 }, { "epoch": 0.3178197237603178, "grad_norm": 1.2715585231781006, "learning_rate": 1.9699718308800182e-05, "loss": 0.7133, "step": 1930 }, { "epoch": 0.3194664580803195, "grad_norm": 1.5506412982940674, "learning_rate": 1.9695379253689048e-05, "loss": 0.7238, "step": 1940 }, { "epoch": 0.3211131924003211, "grad_norm": 1.3223036527633667, "learning_rate": 1.9691009558276915e-05, "loss": 0.7131, "step": 1950 }, { "epoch": 0.3227599267203228, "grad_norm": 1.505676031112671, "learning_rate": 1.9686609236373333e-05, "loss": 0.7216, "step": 1960 }, { "epoch": 0.3244066610403244, "grad_norm": 1.330424427986145, "learning_rate": 1.9682178301884632e-05, "loss": 0.7296, "step": 1970 }, { "epoch": 0.32605339536032607, "grad_norm": 1.6218839883804321, "learning_rate": 1.9677716768813893e-05, "loss": 0.6836, "step": 1980 }, { "epoch": 0.3277001296803277, "grad_norm": 1.3749337196350098, "learning_rate": 1.9673224651260894e-05, "loss": 0.7039, "step": 1990 }, { "epoch": 0.32934686400032936, "grad_norm": 1.3419939279556274, "learning_rate": 1.9668701963422077e-05, "loss": 0.7181, "step": 2000 }, { "epoch": 0.330993598320331, "grad_norm": 1.531684398651123, "learning_rate": 1.9664148719590486e-05, "loss": 0.7263, "step": 2010 }, { "epoch": 0.33264033264033266, "grad_norm": 1.6776705980300903, "learning_rate": 1.9659564934155733e-05, "loss": 0.7279, "step": 2020 }, { "epoch": 0.3342870669603343, "grad_norm": 1.4876450300216675, "learning_rate": 1.9654950621603955e-05, "loss": 0.7441, "step": 2030 }, { "epoch": 0.33593380128033595, "grad_norm": 1.3358787298202515, "learning_rate": 1.965030579651776e-05, "loss": 0.6973, "step": 2040 }, { "epoch": 0.3375805356003376, "grad_norm": 1.2709858417510986, "learning_rate": 1.9645630473576184e-05, "loss": 0.6895, "step": 2050 }, { "epoch": 0.33922726992033925, "grad_norm": 1.523409366607666, "learning_rate": 1.9640924667554654e-05, "loss": 0.7273, "step": 2060 }, { "epoch": 0.34087400424034087, "grad_norm": 1.1860836744308472, "learning_rate": 1.9636188393324917e-05, "loss": 0.6914, "step": 2070 }, { "epoch": 0.34252073856034254, "grad_norm": 1.5158321857452393, "learning_rate": 1.9631421665855023e-05, "loss": 0.6884, "step": 2080 }, { "epoch": 0.34416747288034416, "grad_norm": 1.5086334943771362, "learning_rate": 1.9626624500209254e-05, "loss": 0.7352, "step": 2090 }, { "epoch": 0.34581420720034584, "grad_norm": 1.5807071924209595, "learning_rate": 1.9621796911548097e-05, "loss": 0.6873, "step": 2100 }, { "epoch": 0.34746094152034745, "grad_norm": 1.472090244293213, "learning_rate": 1.961693891512817e-05, "loss": 0.6839, "step": 2110 }, { "epoch": 0.34910767584034913, "grad_norm": 1.2841448783874512, "learning_rate": 1.9612050526302195e-05, "loss": 0.7038, "step": 2120 }, { "epoch": 0.35075441016035075, "grad_norm": 1.4147202968597412, "learning_rate": 1.9607131760518952e-05, "loss": 0.7, "step": 2130 }, { "epoch": 0.3524011444803524, "grad_norm": 1.4587483406066895, "learning_rate": 1.9602182633323205e-05, "loss": 0.6995, "step": 2140 }, { "epoch": 0.35404787880035404, "grad_norm": 1.179916501045227, "learning_rate": 1.9597203160355684e-05, "loss": 0.6975, "step": 2150 }, { "epoch": 0.3556946131203557, "grad_norm": 1.621735692024231, "learning_rate": 1.9592193357353012e-05, "loss": 0.6808, "step": 2160 }, { "epoch": 0.35734134744035734, "grad_norm": 1.5990266799926758, "learning_rate": 1.9587153240147663e-05, "loss": 0.7072, "step": 2170 }, { "epoch": 0.358988081760359, "grad_norm": 1.6757946014404297, "learning_rate": 1.9582082824667924e-05, "loss": 0.7017, "step": 2180 }, { "epoch": 0.36063481608036063, "grad_norm": 1.4000555276870728, "learning_rate": 1.957698212693782e-05, "loss": 0.704, "step": 2190 }, { "epoch": 0.3622815504003623, "grad_norm": 1.4697622060775757, "learning_rate": 1.9571851163077082e-05, "loss": 0.6987, "step": 2200 }, { "epoch": 0.3639282847203639, "grad_norm": 1.608054518699646, "learning_rate": 1.9566689949301097e-05, "loss": 0.6947, "step": 2210 }, { "epoch": 0.3655750190403656, "grad_norm": 1.957412600517273, "learning_rate": 1.956149850192084e-05, "loss": 0.6938, "step": 2220 }, { "epoch": 0.3672217533603672, "grad_norm": 1.398135781288147, "learning_rate": 1.955627683734284e-05, "loss": 0.6606, "step": 2230 }, { "epoch": 0.3688684876803689, "grad_norm": 1.6263389587402344, "learning_rate": 1.9551024972069127e-05, "loss": 0.7298, "step": 2240 }, { "epoch": 0.3705152220003705, "grad_norm": 1.461220145225525, "learning_rate": 1.9545742922697157e-05, "loss": 0.704, "step": 2250 }, { "epoch": 0.3721619563203722, "grad_norm": 1.1990222930908203, "learning_rate": 1.9540430705919798e-05, "loss": 0.6566, "step": 2260 }, { "epoch": 0.3738086906403738, "grad_norm": 1.6441187858581543, "learning_rate": 1.9535088338525238e-05, "loss": 0.7184, "step": 2270 }, { "epoch": 0.37545542496037543, "grad_norm": 1.2525330781936646, "learning_rate": 1.9529715837396956e-05, "loss": 0.7407, "step": 2280 }, { "epoch": 0.3771021592803771, "grad_norm": 1.493485927581787, "learning_rate": 1.952431321951367e-05, "loss": 0.6515, "step": 2290 }, { "epoch": 0.3787488936003787, "grad_norm": 1.203551173210144, "learning_rate": 1.9518880501949267e-05, "loss": 0.7195, "step": 2300 }, { "epoch": 0.3803956279203804, "grad_norm": 1.6315183639526367, "learning_rate": 1.9513417701872766e-05, "loss": 0.6887, "step": 2310 }, { "epoch": 0.382042362240382, "grad_norm": 1.2268880605697632, "learning_rate": 1.9507924836548244e-05, "loss": 0.7259, "step": 2320 }, { "epoch": 0.3836890965603837, "grad_norm": 1.1860028505325317, "learning_rate": 1.9502401923334798e-05, "loss": 0.671, "step": 2330 }, { "epoch": 0.3853358308803853, "grad_norm": 1.2253532409667969, "learning_rate": 1.9496848979686493e-05, "loss": 0.6954, "step": 2340 }, { "epoch": 0.386982565200387, "grad_norm": 1.3530570268630981, "learning_rate": 1.949126602315229e-05, "loss": 0.7222, "step": 2350 }, { "epoch": 0.3886292995203886, "grad_norm": 1.353309988975525, "learning_rate": 1.9485653071376004e-05, "loss": 0.7268, "step": 2360 }, { "epoch": 0.3902760338403903, "grad_norm": 1.2534314393997192, "learning_rate": 1.9480010142096245e-05, "loss": 0.6925, "step": 2370 }, { "epoch": 0.3919227681603919, "grad_norm": 1.8099732398986816, "learning_rate": 1.947433725314636e-05, "loss": 0.659, "step": 2380 }, { "epoch": 0.3935695024803936, "grad_norm": 2.0360233783721924, "learning_rate": 1.946863442245437e-05, "loss": 0.6859, "step": 2390 }, { "epoch": 0.3952162368003952, "grad_norm": 1.5365337133407593, "learning_rate": 1.946290166804293e-05, "loss": 0.6973, "step": 2400 }, { "epoch": 0.39686297112039687, "grad_norm": 1.4845314025878906, "learning_rate": 1.9457139008029263e-05, "loss": 0.7292, "step": 2410 }, { "epoch": 0.3985097054403985, "grad_norm": 1.3728680610656738, "learning_rate": 1.94513464606251e-05, "loss": 0.6981, "step": 2420 }, { "epoch": 0.40015643976040016, "grad_norm": 1.3709412813186646, "learning_rate": 1.9445524044136618e-05, "loss": 0.7105, "step": 2430 }, { "epoch": 0.4018031740804018, "grad_norm": 1.321450114250183, "learning_rate": 1.94396717769644e-05, "loss": 0.6632, "step": 2440 }, { "epoch": 0.40344990840040346, "grad_norm": 1.2133983373641968, "learning_rate": 1.943378967760337e-05, "loss": 0.6788, "step": 2450 }, { "epoch": 0.4050966427204051, "grad_norm": 1.2709228992462158, "learning_rate": 1.9427877764642714e-05, "loss": 0.683, "step": 2460 }, { "epoch": 0.40674337704040675, "grad_norm": 1.167478084564209, "learning_rate": 1.9421936056765847e-05, "loss": 0.6599, "step": 2470 }, { "epoch": 0.40839011136040837, "grad_norm": 1.3339264392852783, "learning_rate": 1.9415964572750347e-05, "loss": 0.6807, "step": 2480 }, { "epoch": 0.41003684568041004, "grad_norm": 1.1604259014129639, "learning_rate": 1.9409963331467893e-05, "loss": 0.6634, "step": 2490 }, { "epoch": 0.41168358000041166, "grad_norm": 1.3648931980133057, "learning_rate": 1.94039323518842e-05, "loss": 0.6572, "step": 2500 }, { "epoch": 0.41333031432041334, "grad_norm": 1.3427473306655884, "learning_rate": 1.9397871653058974e-05, "loss": 0.6773, "step": 2510 }, { "epoch": 0.41497704864041496, "grad_norm": 1.2733012437820435, "learning_rate": 1.9391781254145833e-05, "loss": 0.6822, "step": 2520 }, { "epoch": 0.41662378296041663, "grad_norm": 1.1846665143966675, "learning_rate": 1.9385661174392262e-05, "loss": 0.6781, "step": 2530 }, { "epoch": 0.41827051728041825, "grad_norm": 1.2302825450897217, "learning_rate": 1.9379511433139547e-05, "loss": 0.6608, "step": 2540 }, { "epoch": 0.4199172516004199, "grad_norm": 1.2838640213012695, "learning_rate": 1.937333204982271e-05, "loss": 0.6924, "step": 2550 }, { "epoch": 0.42156398592042155, "grad_norm": 1.528942346572876, "learning_rate": 1.9367123043970452e-05, "loss": 0.6789, "step": 2560 }, { "epoch": 0.4232107202404232, "grad_norm": 1.4241206645965576, "learning_rate": 1.936088443520509e-05, "loss": 0.675, "step": 2570 }, { "epoch": 0.42485745456042484, "grad_norm": 1.1903736591339111, "learning_rate": 1.93546162432425e-05, "loss": 0.6795, "step": 2580 }, { "epoch": 0.4265041888804265, "grad_norm": 1.1137593984603882, "learning_rate": 1.9348318487892036e-05, "loss": 0.6693, "step": 2590 }, { "epoch": 0.42815092320042814, "grad_norm": 1.1012071371078491, "learning_rate": 1.9341991189056498e-05, "loss": 0.6982, "step": 2600 }, { "epoch": 0.4297976575204298, "grad_norm": 1.1505986452102661, "learning_rate": 1.9335634366732044e-05, "loss": 0.664, "step": 2610 }, { "epoch": 0.43144439184043143, "grad_norm": 1.229068398475647, "learning_rate": 1.9329248041008134e-05, "loss": 0.6789, "step": 2620 }, { "epoch": 0.4330911261604331, "grad_norm": 1.1394855976104736, "learning_rate": 1.9322832232067466e-05, "loss": 0.6599, "step": 2630 }, { "epoch": 0.4347378604804347, "grad_norm": 1.1221281290054321, "learning_rate": 1.9316386960185922e-05, "loss": 0.6592, "step": 2640 }, { "epoch": 0.4363845948004364, "grad_norm": 1.1771143674850464, "learning_rate": 1.930991224573249e-05, "loss": 0.6922, "step": 2650 }, { "epoch": 0.438031329120438, "grad_norm": 1.2491823434829712, "learning_rate": 1.9303408109169205e-05, "loss": 0.6544, "step": 2660 }, { "epoch": 0.4396780634404397, "grad_norm": 1.059578537940979, "learning_rate": 1.9296874571051084e-05, "loss": 0.6561, "step": 2670 }, { "epoch": 0.4413247977604413, "grad_norm": 1.4251331090927124, "learning_rate": 1.9290311652026065e-05, "loss": 0.6322, "step": 2680 }, { "epoch": 0.442971532080443, "grad_norm": 1.9921714067459106, "learning_rate": 1.9283719372834933e-05, "loss": 0.6859, "step": 2690 }, { "epoch": 0.4446182664004446, "grad_norm": 1.2513914108276367, "learning_rate": 1.9277097754311277e-05, "loss": 0.6463, "step": 2700 }, { "epoch": 0.4462650007204463, "grad_norm": 1.6695165634155273, "learning_rate": 1.9270446817381377e-05, "loss": 0.6827, "step": 2710 }, { "epoch": 0.4479117350404479, "grad_norm": 1.291625738143921, "learning_rate": 1.9263766583064193e-05, "loss": 0.6992, "step": 2720 }, { "epoch": 0.4495584693604496, "grad_norm": 1.7672340869903564, "learning_rate": 1.925705707247127e-05, "loss": 0.6754, "step": 2730 }, { "epoch": 0.4512052036804512, "grad_norm": 3.7783656120300293, "learning_rate": 1.925031830680666e-05, "loss": 0.671, "step": 2740 }, { "epoch": 0.45285193800045287, "grad_norm": 1.448926568031311, "learning_rate": 1.9243550307366884e-05, "loss": 0.6677, "step": 2750 }, { "epoch": 0.4544986723204545, "grad_norm": 1.472121238708496, "learning_rate": 1.923675309554085e-05, "loss": 0.6609, "step": 2760 }, { "epoch": 0.45614540664045616, "grad_norm": 1.7885098457336426, "learning_rate": 1.9229926692809777e-05, "loss": 0.6847, "step": 2770 }, { "epoch": 0.4577921409604578, "grad_norm": 1.6598893404006958, "learning_rate": 1.9223071120747145e-05, "loss": 0.6782, "step": 2780 }, { "epoch": 0.45943887528045946, "grad_norm": 1.3006958961486816, "learning_rate": 1.9216186401018614e-05, "loss": 0.656, "step": 2790 }, { "epoch": 0.4610856096004611, "grad_norm": 1.0863409042358398, "learning_rate": 1.920927255538196e-05, "loss": 0.6482, "step": 2800 }, { "epoch": 0.46273234392046275, "grad_norm": 1.5282680988311768, "learning_rate": 1.9202329605687e-05, "loss": 0.6434, "step": 2810 }, { "epoch": 0.46437907824046437, "grad_norm": 1.2323793172836304, "learning_rate": 1.9195357573875537e-05, "loss": 0.6177, "step": 2820 }, { "epoch": 0.46602581256046605, "grad_norm": 1.1313997507095337, "learning_rate": 1.918835648198128e-05, "loss": 0.674, "step": 2830 }, { "epoch": 0.46767254688046767, "grad_norm": 1.3586218357086182, "learning_rate": 1.9181326352129773e-05, "loss": 0.6722, "step": 2840 }, { "epoch": 0.46931928120046934, "grad_norm": 1.0108249187469482, "learning_rate": 1.9174267206538332e-05, "loss": 0.6609, "step": 2850 }, { "epoch": 0.47096601552047096, "grad_norm": 1.2038466930389404, "learning_rate": 1.916717906751597e-05, "loss": 0.6407, "step": 2860 }, { "epoch": 0.47261274984047263, "grad_norm": 1.1672009229660034, "learning_rate": 1.916006195746333e-05, "loss": 0.6932, "step": 2870 }, { "epoch": 0.47425948416047425, "grad_norm": 1.5974833965301514, "learning_rate": 1.915291589887261e-05, "loss": 0.6373, "step": 2880 }, { "epoch": 0.47590621848047593, "grad_norm": 1.06233811378479, "learning_rate": 1.914574091432749e-05, "loss": 0.6429, "step": 2890 }, { "epoch": 0.47755295280047755, "grad_norm": 1.8846768140792847, "learning_rate": 1.9138537026503076e-05, "loss": 0.6373, "step": 2900 }, { "epoch": 0.4791996871204792, "grad_norm": 1.164318323135376, "learning_rate": 1.913130425816581e-05, "loss": 0.6639, "step": 2910 }, { "epoch": 0.48084642144048084, "grad_norm": 1.1987966299057007, "learning_rate": 1.9124042632173398e-05, "loss": 0.6699, "step": 2920 }, { "epoch": 0.4824931557604825, "grad_norm": 1.5233405828475952, "learning_rate": 1.9116752171474754e-05, "loss": 0.6355, "step": 2930 }, { "epoch": 0.48413989008048414, "grad_norm": 1.2364847660064697, "learning_rate": 1.9109432899109923e-05, "loss": 0.6616, "step": 2940 }, { "epoch": 0.4857866244004858, "grad_norm": 1.3125419616699219, "learning_rate": 1.9102084838209992e-05, "loss": 0.6585, "step": 2950 }, { "epoch": 0.48743335872048743, "grad_norm": 1.315054178237915, "learning_rate": 1.9094708011997033e-05, "loss": 0.6443, "step": 2960 }, { "epoch": 0.4890800930404891, "grad_norm": 0.934471070766449, "learning_rate": 1.908730244378403e-05, "loss": 0.6209, "step": 2970 }, { "epoch": 0.4907268273604907, "grad_norm": 1.1312390565872192, "learning_rate": 1.9079868156974788e-05, "loss": 0.6699, "step": 2980 }, { "epoch": 0.4923735616804924, "grad_norm": 1.168817162513733, "learning_rate": 1.9072405175063883e-05, "loss": 0.6222, "step": 2990 }, { "epoch": 0.494020296000494, "grad_norm": 1.1964330673217773, "learning_rate": 1.9064913521636574e-05, "loss": 0.6303, "step": 3000 }, { "epoch": 0.4956670303204957, "grad_norm": 1.1183303594589233, "learning_rate": 1.9057393220368722e-05, "loss": 0.6285, "step": 3010 }, { "epoch": 0.4973137646404973, "grad_norm": 1.0846532583236694, "learning_rate": 1.9049844295026738e-05, "loss": 0.6463, "step": 3020 }, { "epoch": 0.498960498960499, "grad_norm": 1.4997867345809937, "learning_rate": 1.904226676946748e-05, "loss": 0.6729, "step": 3030 }, { "epoch": 0.5006072332805006, "grad_norm": 1.2239123582839966, "learning_rate": 1.9034660667638206e-05, "loss": 0.664, "step": 3040 }, { "epoch": 0.5022539676005022, "grad_norm": 1.2806061506271362, "learning_rate": 1.9027026013576465e-05, "loss": 0.6568, "step": 3050 }, { "epoch": 0.5039007019205038, "grad_norm": 1.2059847116470337, "learning_rate": 1.9019362831410057e-05, "loss": 0.6771, "step": 3060 }, { "epoch": 0.5055474362405056, "grad_norm": 1.2228035926818848, "learning_rate": 1.9011671145356926e-05, "loss": 0.6438, "step": 3070 }, { "epoch": 0.5071941705605072, "grad_norm": 1.16281259059906, "learning_rate": 1.9003950979725103e-05, "loss": 0.6518, "step": 3080 }, { "epoch": 0.5088409048805088, "grad_norm": 1.3429057598114014, "learning_rate": 1.899620235891263e-05, "loss": 0.6167, "step": 3090 }, { "epoch": 0.5104876392005104, "grad_norm": 1.0990668535232544, "learning_rate": 1.8988425307407458e-05, "loss": 0.624, "step": 3100 }, { "epoch": 0.5121343735205122, "grad_norm": 1.1892952919006348, "learning_rate": 1.89806198497874e-05, "loss": 0.6193, "step": 3110 }, { "epoch": 0.5137811078405138, "grad_norm": 1.194745421409607, "learning_rate": 1.8972786010720046e-05, "loss": 0.6825, "step": 3120 }, { "epoch": 0.5154278421605154, "grad_norm": 1.267279028892517, "learning_rate": 1.8964923814962672e-05, "loss": 0.6361, "step": 3130 }, { "epoch": 0.517074576480517, "grad_norm": 1.3956196308135986, "learning_rate": 1.8957033287362167e-05, "loss": 0.6075, "step": 3140 }, { "epoch": 0.5187213108005188, "grad_norm": 1.4350258111953735, "learning_rate": 1.8949114452854957e-05, "loss": 0.6427, "step": 3150 }, { "epoch": 0.5203680451205204, "grad_norm": 1.3245211839675903, "learning_rate": 1.8941167336466932e-05, "loss": 0.6825, "step": 3160 }, { "epoch": 0.522014779440522, "grad_norm": 1.3547745943069458, "learning_rate": 1.893319196331336e-05, "loss": 0.6353, "step": 3170 }, { "epoch": 0.5236615137605236, "grad_norm": 1.0884572267532349, "learning_rate": 1.8925188358598815e-05, "loss": 0.6354, "step": 3180 }, { "epoch": 0.5253082480805253, "grad_norm": 1.193581223487854, "learning_rate": 1.8917156547617072e-05, "loss": 0.651, "step": 3190 }, { "epoch": 0.526954982400527, "grad_norm": 1.1126220226287842, "learning_rate": 1.890909655575106e-05, "loss": 0.6516, "step": 3200 }, { "epoch": 0.5286017167205286, "grad_norm": 1.4914835691452026, "learning_rate": 1.8901008408472775e-05, "loss": 0.6346, "step": 3210 }, { "epoch": 0.5302484510405302, "grad_norm": 1.3618090152740479, "learning_rate": 1.8892892131343177e-05, "loss": 0.6237, "step": 3220 }, { "epoch": 0.5318951853605319, "grad_norm": 1.3347357511520386, "learning_rate": 1.888474775001213e-05, "loss": 0.6727, "step": 3230 }, { "epoch": 0.5335419196805335, "grad_norm": 1.1067010164260864, "learning_rate": 1.8876575290218323e-05, "loss": 0.6533, "step": 3240 }, { "epoch": 0.5351886540005352, "grad_norm": 1.7199362516403198, "learning_rate": 1.8868374777789172e-05, "loss": 0.6546, "step": 3250 }, { "epoch": 0.5368353883205368, "grad_norm": 1.990593671798706, "learning_rate": 1.886014623864075e-05, "loss": 0.6129, "step": 3260 }, { "epoch": 0.5384821226405385, "grad_norm": 1.2225085496902466, "learning_rate": 1.8851889698777707e-05, "loss": 0.655, "step": 3270 }, { "epoch": 0.5401288569605401, "grad_norm": 1.4789817333221436, "learning_rate": 1.8843605184293177e-05, "loss": 0.6274, "step": 3280 }, { "epoch": 0.5417755912805418, "grad_norm": 1.4760066270828247, "learning_rate": 1.8835292721368715e-05, "loss": 0.649, "step": 3290 }, { "epoch": 0.5434223256005434, "grad_norm": 2.949131488800049, "learning_rate": 1.8826952336274184e-05, "loss": 0.6358, "step": 3300 }, { "epoch": 0.5450690599205451, "grad_norm": 1.5721391439437866, "learning_rate": 1.88185840553677e-05, "loss": 0.6461, "step": 3310 }, { "epoch": 0.5467157942405467, "grad_norm": 1.620802402496338, "learning_rate": 1.881018790509553e-05, "loss": 0.5999, "step": 3320 }, { "epoch": 0.5483625285605483, "grad_norm": 1.6917542219161987, "learning_rate": 1.8801763911992035e-05, "loss": 0.6141, "step": 3330 }, { "epoch": 0.55000926288055, "grad_norm": 1.6579176187515259, "learning_rate": 1.8793312102679548e-05, "loss": 0.6865, "step": 3340 }, { "epoch": 0.5516559972005517, "grad_norm": 1.5372205972671509, "learning_rate": 1.8784832503868314e-05, "loss": 0.607, "step": 3350 }, { "epoch": 0.5533027315205533, "grad_norm": 2.1712288856506348, "learning_rate": 1.8776325142356406e-05, "loss": 0.6754, "step": 3360 }, { "epoch": 0.5549494658405549, "grad_norm": 1.6536438465118408, "learning_rate": 1.876779004502964e-05, "loss": 0.6484, "step": 3370 }, { "epoch": 0.5565962001605566, "grad_norm": 2.0324020385742188, "learning_rate": 1.8759227238861467e-05, "loss": 0.6278, "step": 3380 }, { "epoch": 0.5582429344805583, "grad_norm": 2.410161018371582, "learning_rate": 1.8750636750912927e-05, "loss": 0.611, "step": 3390 }, { "epoch": 0.5598896688005599, "grad_norm": 1.6419649124145508, "learning_rate": 1.874201860833253e-05, "loss": 0.643, "step": 3400 }, { "epoch": 0.5615364031205615, "grad_norm": 2.4913241863250732, "learning_rate": 1.873337283835619e-05, "loss": 0.6535, "step": 3410 }, { "epoch": 0.5631831374405631, "grad_norm": 1.8292442560195923, "learning_rate": 1.8724699468307123e-05, "loss": 0.643, "step": 3420 }, { "epoch": 0.5648298717605649, "grad_norm": 1.3961424827575684, "learning_rate": 1.8715998525595775e-05, "loss": 0.6279, "step": 3430 }, { "epoch": 0.5664766060805665, "grad_norm": 1.811924695968628, "learning_rate": 1.8707270037719737e-05, "loss": 0.6424, "step": 3440 }, { "epoch": 0.5681233404005681, "grad_norm": 1.5042741298675537, "learning_rate": 1.8698514032263636e-05, "loss": 0.6027, "step": 3450 }, { "epoch": 0.5697700747205697, "grad_norm": 1.6137949228286743, "learning_rate": 1.8689730536899078e-05, "loss": 0.6294, "step": 3460 }, { "epoch": 0.5714168090405715, "grad_norm": 1.3153676986694336, "learning_rate": 1.868091957938453e-05, "loss": 0.6277, "step": 3470 }, { "epoch": 0.5730635433605731, "grad_norm": 1.300405740737915, "learning_rate": 1.867208118756526e-05, "loss": 0.6461, "step": 3480 }, { "epoch": 0.5747102776805747, "grad_norm": 1.4557414054870605, "learning_rate": 1.866321538937323e-05, "loss": 0.6486, "step": 3490 }, { "epoch": 0.5763570120005763, "grad_norm": 1.8575087785720825, "learning_rate": 1.8654322212827022e-05, "loss": 0.6732, "step": 3500 }, { "epoch": 0.578003746320578, "grad_norm": 1.3033815622329712, "learning_rate": 1.864540168603173e-05, "loss": 0.6361, "step": 3510 }, { "epoch": 0.5796504806405797, "grad_norm": 2.0723509788513184, "learning_rate": 1.8636453837178893e-05, "loss": 0.6148, "step": 3520 }, { "epoch": 0.5812972149605813, "grad_norm": 2.9872148036956787, "learning_rate": 1.8627478694546387e-05, "loss": 0.6768, "step": 3530 }, { "epoch": 0.5829439492805829, "grad_norm": 1.6916303634643555, "learning_rate": 1.8618476286498356e-05, "loss": 0.6318, "step": 3540 }, { "epoch": 0.5845906836005846, "grad_norm": 2.017186164855957, "learning_rate": 1.86094466414851e-05, "loss": 0.6583, "step": 3550 }, { "epoch": 0.5862374179205863, "grad_norm": 1.7270339727401733, "learning_rate": 1.8600389788043003e-05, "loss": 0.6026, "step": 3560 }, { "epoch": 0.5878841522405879, "grad_norm": 1.8550065755844116, "learning_rate": 1.8591305754794434e-05, "loss": 0.631, "step": 3570 }, { "epoch": 0.5895308865605895, "grad_norm": 2.131333112716675, "learning_rate": 1.8582194570447654e-05, "loss": 0.6287, "step": 3580 }, { "epoch": 0.5911776208805912, "grad_norm": 2.0691232681274414, "learning_rate": 1.8573056263796732e-05, "loss": 0.6032, "step": 3590 }, { "epoch": 0.5928243552005928, "grad_norm": 2.4947502613067627, "learning_rate": 1.856389086372146e-05, "loss": 0.6571, "step": 3600 }, { "epoch": 0.5944710895205945, "grad_norm": 1.4081095457077026, "learning_rate": 1.8554698399187246e-05, "loss": 0.649, "step": 3610 }, { "epoch": 0.5961178238405961, "grad_norm": 1.7815622091293335, "learning_rate": 1.854547889924502e-05, "loss": 0.6437, "step": 3620 }, { "epoch": 0.5977645581605978, "grad_norm": 1.900574803352356, "learning_rate": 1.8536232393031173e-05, "loss": 0.636, "step": 3630 }, { "epoch": 0.5994112924805994, "grad_norm": 1.3070859909057617, "learning_rate": 1.8526958909767425e-05, "loss": 0.6325, "step": 3640 }, { "epoch": 0.601058026800601, "grad_norm": 1.3566079139709473, "learning_rate": 1.851765847876076e-05, "loss": 0.6235, "step": 3650 }, { "epoch": 0.6027047611206027, "grad_norm": 1.459912896156311, "learning_rate": 1.8508331129403333e-05, "loss": 0.6738, "step": 3660 }, { "epoch": 0.6043514954406044, "grad_norm": 1.7209458351135254, "learning_rate": 1.849897689117235e-05, "loss": 0.6478, "step": 3670 }, { "epoch": 0.605998229760606, "grad_norm": 1.884052038192749, "learning_rate": 1.848959579363001e-05, "loss": 0.6695, "step": 3680 }, { "epoch": 0.6076449640806076, "grad_norm": 1.8747471570968628, "learning_rate": 1.8480187866423386e-05, "loss": 0.6453, "step": 3690 }, { "epoch": 0.6092916984006093, "grad_norm": 1.548047423362732, "learning_rate": 1.8470753139284344e-05, "loss": 0.6253, "step": 3700 }, { "epoch": 0.610938432720611, "grad_norm": 3.6169090270996094, "learning_rate": 1.8461291642029454e-05, "loss": 0.6002, "step": 3710 }, { "epoch": 0.6125851670406126, "grad_norm": 1.727982997894287, "learning_rate": 1.8451803404559873e-05, "loss": 0.6077, "step": 3720 }, { "epoch": 0.6142319013606142, "grad_norm": 1.832689881324768, "learning_rate": 1.844228845686127e-05, "loss": 0.621, "step": 3730 }, { "epoch": 0.6158786356806158, "grad_norm": 1.83828866481781, "learning_rate": 1.8432746829003732e-05, "loss": 0.6464, "step": 3740 }, { "epoch": 0.6175253700006176, "grad_norm": 2.2948646545410156, "learning_rate": 1.8423178551141662e-05, "loss": 0.6234, "step": 3750 }, { "epoch": 0.6191721043206192, "grad_norm": 2.7820045948028564, "learning_rate": 1.841358365351368e-05, "loss": 0.6334, "step": 3760 }, { "epoch": 0.6208188386406208, "grad_norm": 1.8063865900039673, "learning_rate": 1.8403962166442535e-05, "loss": 0.6515, "step": 3770 }, { "epoch": 0.6224655729606224, "grad_norm": 3.7020092010498047, "learning_rate": 1.8394314120335002e-05, "loss": 0.6761, "step": 3780 }, { "epoch": 0.6241123072806241, "grad_norm": 3.1100969314575195, "learning_rate": 1.8384639545681803e-05, "loss": 0.5896, "step": 3790 }, { "epoch": 0.6257590416006258, "grad_norm": 3.011991262435913, "learning_rate": 1.8374938473057486e-05, "loss": 0.627, "step": 3800 }, { "epoch": 0.6274057759206274, "grad_norm": 1.5754694938659668, "learning_rate": 1.8365210933120347e-05, "loss": 0.6245, "step": 3810 }, { "epoch": 0.629052510240629, "grad_norm": 1.8275468349456787, "learning_rate": 1.835545695661232e-05, "loss": 0.6179, "step": 3820 }, { "epoch": 0.6306992445606306, "grad_norm": 2.1361637115478516, "learning_rate": 1.8345676574358897e-05, "loss": 0.638, "step": 3830 }, { "epoch": 0.6323459788806324, "grad_norm": 1.829188585281372, "learning_rate": 1.8335869817269006e-05, "loss": 0.6304, "step": 3840 }, { "epoch": 0.633992713200634, "grad_norm": 1.8671929836273193, "learning_rate": 1.8326036716334942e-05, "loss": 0.6322, "step": 3850 }, { "epoch": 0.6356394475206356, "grad_norm": 1.404921531677246, "learning_rate": 1.8316177302632248e-05, "loss": 0.6213, "step": 3860 }, { "epoch": 0.6372861818406372, "grad_norm": 1.64621901512146, "learning_rate": 1.8306291607319618e-05, "loss": 0.6668, "step": 3870 }, { "epoch": 0.638932916160639, "grad_norm": 2.2080841064453125, "learning_rate": 1.829637966163881e-05, "loss": 0.6089, "step": 3880 }, { "epoch": 0.6405796504806406, "grad_norm": 1.4423542022705078, "learning_rate": 1.8286441496914545e-05, "loss": 0.6406, "step": 3890 }, { "epoch": 0.6422263848006422, "grad_norm": 1.7559462785720825, "learning_rate": 1.8276477144554393e-05, "loss": 0.6371, "step": 3900 }, { "epoch": 0.6438731191206438, "grad_norm": 1.5417958498001099, "learning_rate": 1.826648663604869e-05, "loss": 0.6262, "step": 3910 }, { "epoch": 0.6455198534406456, "grad_norm": 1.899674654006958, "learning_rate": 1.8256470002970438e-05, "loss": 0.6192, "step": 3920 }, { "epoch": 0.6471665877606472, "grad_norm": 1.715377688407898, "learning_rate": 1.8246427276975196e-05, "loss": 0.6189, "step": 3930 }, { "epoch": 0.6488133220806488, "grad_norm": 1.4496129751205444, "learning_rate": 1.823635848980098e-05, "loss": 0.6147, "step": 3940 }, { "epoch": 0.6504600564006504, "grad_norm": 1.5673155784606934, "learning_rate": 1.822626367326818e-05, "loss": 0.6159, "step": 3950 }, { "epoch": 0.6521067907206521, "grad_norm": 1.1710649728775024, "learning_rate": 1.8216142859279432e-05, "loss": 0.5911, "step": 3960 }, { "epoch": 0.6537535250406538, "grad_norm": 1.358017921447754, "learning_rate": 1.8205996079819534e-05, "loss": 0.6107, "step": 3970 }, { "epoch": 0.6554002593606554, "grad_norm": 1.4123727083206177, "learning_rate": 1.8195823366955356e-05, "loss": 0.629, "step": 3980 }, { "epoch": 0.657046993680657, "grad_norm": 1.9289451837539673, "learning_rate": 1.8185624752835714e-05, "loss": 0.6609, "step": 3990 }, { "epoch": 0.6586937280006587, "grad_norm": 1.6234312057495117, "learning_rate": 1.8175400269691278e-05, "loss": 0.6191, "step": 4000 }, { "epoch": 0.6603404623206603, "grad_norm": 1.8857241868972778, "learning_rate": 1.8165149949834474e-05, "loss": 0.6466, "step": 4010 }, { "epoch": 0.661987196640662, "grad_norm": 2.1216554641723633, "learning_rate": 1.8154873825659393e-05, "loss": 0.6257, "step": 4020 }, { "epoch": 0.6636339309606636, "grad_norm": 1.483231544494629, "learning_rate": 1.814457192964165e-05, "loss": 0.5745, "step": 4030 }, { "epoch": 0.6652806652806653, "grad_norm": 1.1727079153060913, "learning_rate": 1.813424429433833e-05, "loss": 0.6168, "step": 4040 }, { "epoch": 0.6669273996006669, "grad_norm": 1.454728364944458, "learning_rate": 1.8123890952387848e-05, "loss": 0.6006, "step": 4050 }, { "epoch": 0.6685741339206686, "grad_norm": 1.6542500257492065, "learning_rate": 1.8113511936509864e-05, "loss": 0.6296, "step": 4060 }, { "epoch": 0.6702208682406702, "grad_norm": 1.3419349193572998, "learning_rate": 1.8103107279505177e-05, "loss": 0.6597, "step": 4070 }, { "epoch": 0.6718676025606719, "grad_norm": 1.247767686843872, "learning_rate": 1.809267701425562e-05, "loss": 0.6231, "step": 4080 }, { "epoch": 0.6735143368806735, "grad_norm": 2.062878370285034, "learning_rate": 1.808222117372395e-05, "loss": 0.6577, "step": 4090 }, { "epoch": 0.6751610712006751, "grad_norm": 1.2169808149337769, "learning_rate": 1.8071739790953754e-05, "loss": 0.6052, "step": 4100 }, { "epoch": 0.6768078055206768, "grad_norm": 1.3929376602172852, "learning_rate": 1.806123289906934e-05, "loss": 0.5777, "step": 4110 }, { "epoch": 0.6784545398406785, "grad_norm": 1.6458179950714111, "learning_rate": 1.8050700531275632e-05, "loss": 0.6085, "step": 4120 }, { "epoch": 0.6801012741606801, "grad_norm": 1.402419090270996, "learning_rate": 1.8040142720858064e-05, "loss": 0.6112, "step": 4130 }, { "epoch": 0.6817480084806817, "grad_norm": 1.2374345064163208, "learning_rate": 1.8029559501182482e-05, "loss": 0.5976, "step": 4140 }, { "epoch": 0.6833947428006834, "grad_norm": 1.6818028688430786, "learning_rate": 1.8018950905695022e-05, "loss": 0.6461, "step": 4150 }, { "epoch": 0.6850414771206851, "grad_norm": 1.5200414657592773, "learning_rate": 1.8008316967922027e-05, "loss": 0.6412, "step": 4160 }, { "epoch": 0.6866882114406867, "grad_norm": 1.8898037672042847, "learning_rate": 1.799765772146992e-05, "loss": 0.6135, "step": 4170 }, { "epoch": 0.6883349457606883, "grad_norm": 2.4506630897521973, "learning_rate": 1.7986973200025115e-05, "loss": 0.6181, "step": 4180 }, { "epoch": 0.6899816800806899, "grad_norm": 1.4668997526168823, "learning_rate": 1.7976263437353897e-05, "loss": 0.5981, "step": 4190 }, { "epoch": 0.6916284144006917, "grad_norm": 1.4976595640182495, "learning_rate": 1.796552846730232e-05, "loss": 0.5983, "step": 4200 }, { "epoch": 0.6932751487206933, "grad_norm": 1.2524445056915283, "learning_rate": 1.7954768323796107e-05, "loss": 0.6191, "step": 4210 }, { "epoch": 0.6949218830406949, "grad_norm": 1.3345338106155396, "learning_rate": 1.7943983040840527e-05, "loss": 0.6019, "step": 4220 }, { "epoch": 0.6965686173606965, "grad_norm": 1.9218558073043823, "learning_rate": 1.7933172652520308e-05, "loss": 0.6084, "step": 4230 }, { "epoch": 0.6982153516806983, "grad_norm": 1.2218800783157349, "learning_rate": 1.7922337192999514e-05, "loss": 0.6521, "step": 4240 }, { "epoch": 0.6998620860006999, "grad_norm": 1.9215497970581055, "learning_rate": 1.7911476696521437e-05, "loss": 0.6229, "step": 4250 }, { "epoch": 0.7015088203207015, "grad_norm": 1.0203500986099243, "learning_rate": 1.79005911974085e-05, "loss": 0.5724, "step": 4260 }, { "epoch": 0.7031555546407031, "grad_norm": 1.290940761566162, "learning_rate": 1.7889680730062137e-05, "loss": 0.5783, "step": 4270 }, { "epoch": 0.7048022889607048, "grad_norm": 1.156741976737976, "learning_rate": 1.7878745328962696e-05, "loss": 0.6373, "step": 4280 }, { "epoch": 0.7064490232807065, "grad_norm": 1.3677122592926025, "learning_rate": 1.7867785028669308e-05, "loss": 0.6251, "step": 4290 }, { "epoch": 0.7080957576007081, "grad_norm": 1.15171217918396, "learning_rate": 1.7856799863819814e-05, "loss": 0.6389, "step": 4300 }, { "epoch": 0.7097424919207097, "grad_norm": 1.243003249168396, "learning_rate": 1.784578986913062e-05, "loss": 0.6155, "step": 4310 }, { "epoch": 0.7113892262407114, "grad_norm": 1.0966947078704834, "learning_rate": 1.7834755079396604e-05, "loss": 0.5783, "step": 4320 }, { "epoch": 0.7130359605607131, "grad_norm": 1.2809836864471436, "learning_rate": 1.782369552949101e-05, "loss": 0.6119, "step": 4330 }, { "epoch": 0.7146826948807147, "grad_norm": 1.148653507232666, "learning_rate": 1.781261125436532e-05, "loss": 0.6417, "step": 4340 }, { "epoch": 0.7163294292007163, "grad_norm": 1.11336088180542, "learning_rate": 1.780150228904916e-05, "loss": 0.6154, "step": 4350 }, { "epoch": 0.717976163520718, "grad_norm": 1.1516305208206177, "learning_rate": 1.779036866865019e-05, "loss": 0.601, "step": 4360 }, { "epoch": 0.7196228978407196, "grad_norm": 1.2385212182998657, "learning_rate": 1.777921042835399e-05, "loss": 0.6114, "step": 4370 }, { "epoch": 0.7212696321607213, "grad_norm": 1.259101152420044, "learning_rate": 1.776802760342393e-05, "loss": 0.5704, "step": 4380 }, { "epoch": 0.7229163664807229, "grad_norm": 1.3446874618530273, "learning_rate": 1.7756820229201092e-05, "loss": 0.5561, "step": 4390 }, { "epoch": 0.7245631008007246, "grad_norm": 1.4307595491409302, "learning_rate": 1.7745588341104127e-05, "loss": 0.5923, "step": 4400 }, { "epoch": 0.7262098351207262, "grad_norm": 1.4261305332183838, "learning_rate": 1.7734331974629166e-05, "loss": 0.6218, "step": 4410 }, { "epoch": 0.7278565694407279, "grad_norm": 1.342528223991394, "learning_rate": 1.77230511653497e-05, "loss": 0.6146, "step": 4420 }, { "epoch": 0.7295033037607295, "grad_norm": 1.4735767841339111, "learning_rate": 1.7711745948916464e-05, "loss": 0.6167, "step": 4430 }, { "epoch": 0.7311500380807312, "grad_norm": 1.2360775470733643, "learning_rate": 1.7700416361057322e-05, "loss": 0.58, "step": 4440 }, { "epoch": 0.7327967724007328, "grad_norm": 1.2893751859664917, "learning_rate": 1.7689062437577165e-05, "loss": 0.5934, "step": 4450 }, { "epoch": 0.7344435067207344, "grad_norm": 1.6743543148040771, "learning_rate": 1.7677684214357793e-05, "loss": 0.5796, "step": 4460 }, { "epoch": 0.7360902410407361, "grad_norm": 1.218066692352295, "learning_rate": 1.7666281727357792e-05, "loss": 0.5908, "step": 4470 }, { "epoch": 0.7377369753607378, "grad_norm": 1.4119192361831665, "learning_rate": 1.7654855012612442e-05, "loss": 0.6163, "step": 4480 }, { "epoch": 0.7393837096807394, "grad_norm": 1.4504175186157227, "learning_rate": 1.7643404106233573e-05, "loss": 0.6091, "step": 4490 }, { "epoch": 0.741030444000741, "grad_norm": 1.2811555862426758, "learning_rate": 1.763192904440949e-05, "loss": 0.5846, "step": 4500 }, { "epoch": 0.7426771783207426, "grad_norm": 1.9486907720565796, "learning_rate": 1.762042986340481e-05, "loss": 0.6064, "step": 4510 }, { "epoch": 0.7443239126407444, "grad_norm": 1.731048822402954, "learning_rate": 1.76089065995604e-05, "loss": 0.6002, "step": 4520 }, { "epoch": 0.745970646960746, "grad_norm": 1.3845677375793457, "learning_rate": 1.7597359289293213e-05, "loss": 0.5834, "step": 4530 }, { "epoch": 0.7476173812807476, "grad_norm": 2.7521491050720215, "learning_rate": 1.758578796909621e-05, "loss": 0.6001, "step": 4540 }, { "epoch": 0.7492641156007492, "grad_norm": 2.141174554824829, "learning_rate": 1.7574192675538232e-05, "loss": 0.5961, "step": 4550 }, { "epoch": 0.7509108499207509, "grad_norm": 2.0954525470733643, "learning_rate": 1.756257344526387e-05, "loss": 0.5862, "step": 4560 }, { "epoch": 0.7525575842407526, "grad_norm": 1.395780324935913, "learning_rate": 1.755093031499338e-05, "loss": 0.5753, "step": 4570 }, { "epoch": 0.7542043185607542, "grad_norm": 1.6621346473693848, "learning_rate": 1.7539263321522528e-05, "loss": 0.608, "step": 4580 }, { "epoch": 0.7558510528807558, "grad_norm": 1.2174135446548462, "learning_rate": 1.7527572501722516e-05, "loss": 0.5927, "step": 4590 }, { "epoch": 0.7574977872007574, "grad_norm": 2.0868325233459473, "learning_rate": 1.7515857892539828e-05, "loss": 0.5783, "step": 4600 }, { "epoch": 0.7591445215207592, "grad_norm": 1.0850328207015991, "learning_rate": 1.7504119530996138e-05, "loss": 0.6089, "step": 4610 }, { "epoch": 0.7607912558407608, "grad_norm": 0.910921573638916, "learning_rate": 1.749235745418818e-05, "loss": 0.5967, "step": 4620 }, { "epoch": 0.7624379901607624, "grad_norm": 1.7855496406555176, "learning_rate": 1.7480571699287647e-05, "loss": 0.5669, "step": 4630 }, { "epoch": 0.764084724480764, "grad_norm": 1.3511872291564941, "learning_rate": 1.7468762303541044e-05, "loss": 0.6084, "step": 4640 }, { "epoch": 0.7657314588007658, "grad_norm": 1.2458902597427368, "learning_rate": 1.7456929304269598e-05, "loss": 0.5668, "step": 4650 }, { "epoch": 0.7673781931207674, "grad_norm": 1.1821246147155762, "learning_rate": 1.7445072738869134e-05, "loss": 0.6231, "step": 4660 }, { "epoch": 0.769024927440769, "grad_norm": 1.6277104616165161, "learning_rate": 1.7433192644809942e-05, "loss": 0.6051, "step": 4670 }, { "epoch": 0.7706716617607706, "grad_norm": 1.184881329536438, "learning_rate": 1.742128905963668e-05, "loss": 0.6102, "step": 4680 }, { "epoch": 0.7723183960807724, "grad_norm": 1.4022691249847412, "learning_rate": 1.7409362020968242e-05, "loss": 0.6079, "step": 4690 }, { "epoch": 0.773965130400774, "grad_norm": 1.271551251411438, "learning_rate": 1.7397411566497638e-05, "loss": 0.6041, "step": 4700 }, { "epoch": 0.7756118647207756, "grad_norm": 1.211241364479065, "learning_rate": 1.738543773399188e-05, "loss": 0.5973, "step": 4710 }, { "epoch": 0.7772585990407772, "grad_norm": 1.336120367050171, "learning_rate": 1.737344056129187e-05, "loss": 0.6225, "step": 4720 }, { "epoch": 0.7789053333607789, "grad_norm": 1.2912676334381104, "learning_rate": 1.7361420086312255e-05, "loss": 0.5732, "step": 4730 }, { "epoch": 0.7805520676807806, "grad_norm": 1.1799460649490356, "learning_rate": 1.7349376347041346e-05, "loss": 0.5675, "step": 4740 }, { "epoch": 0.7821988020007822, "grad_norm": 1.8032108545303345, "learning_rate": 1.7337309381540955e-05, "loss": 0.5871, "step": 4750 }, { "epoch": 0.7838455363207838, "grad_norm": 1.2031786441802979, "learning_rate": 1.7325219227946314e-05, "loss": 0.5934, "step": 4760 }, { "epoch": 0.7854922706407855, "grad_norm": 1.412174105644226, "learning_rate": 1.7313105924465923e-05, "loss": 0.5923, "step": 4770 }, { "epoch": 0.7871390049607871, "grad_norm": 1.4549241065979004, "learning_rate": 1.7300969509381448e-05, "loss": 0.5747, "step": 4780 }, { "epoch": 0.7887857392807888, "grad_norm": 1.831076979637146, "learning_rate": 1.7288810021047597e-05, "loss": 0.6102, "step": 4790 }, { "epoch": 0.7904324736007904, "grad_norm": 1.530872106552124, "learning_rate": 1.7276627497891984e-05, "loss": 0.5789, "step": 4800 }, { "epoch": 0.7920792079207921, "grad_norm": 1.1748154163360596, "learning_rate": 1.726442197841504e-05, "loss": 0.5854, "step": 4810 }, { "epoch": 0.7937259422407937, "grad_norm": 1.1575807332992554, "learning_rate": 1.7252193501189857e-05, "loss": 0.5712, "step": 4820 }, { "epoch": 0.7953726765607954, "grad_norm": 1.2026344537734985, "learning_rate": 1.723994210486208e-05, "loss": 0.5471, "step": 4830 }, { "epoch": 0.797019410880797, "grad_norm": 1.4718669652938843, "learning_rate": 1.7227667828149795e-05, "loss": 0.6096, "step": 4840 }, { "epoch": 0.7986661452007987, "grad_norm": 1.6093403100967407, "learning_rate": 1.721537070984339e-05, "loss": 0.5773, "step": 4850 }, { "epoch": 0.8003128795208003, "grad_norm": 1.2670097351074219, "learning_rate": 1.720305078880544e-05, "loss": 0.5763, "step": 4860 }, { "epoch": 0.8019596138408019, "grad_norm": 1.2672990560531616, "learning_rate": 1.719070810397058e-05, "loss": 0.5859, "step": 4870 }, { "epoch": 0.8036063481608036, "grad_norm": 1.4210219383239746, "learning_rate": 1.7178342694345395e-05, "loss": 0.5781, "step": 4880 }, { "epoch": 0.8052530824808053, "grad_norm": 1.1239454746246338, "learning_rate": 1.7165954599008275e-05, "loss": 0.5778, "step": 4890 }, { "epoch": 0.8068998168008069, "grad_norm": 1.0624126195907593, "learning_rate": 1.7153543857109314e-05, "loss": 0.5804, "step": 4900 }, { "epoch": 0.8085465511208085, "grad_norm": 1.0993456840515137, "learning_rate": 1.7141110507870172e-05, "loss": 0.5778, "step": 4910 }, { "epoch": 0.8101932854408102, "grad_norm": 1.2814329862594604, "learning_rate": 1.7128654590583953e-05, "loss": 0.5687, "step": 4920 }, { "epoch": 0.8118400197608119, "grad_norm": 1.2554537057876587, "learning_rate": 1.7116176144615085e-05, "loss": 0.5967, "step": 4930 }, { "epoch": 0.8134867540808135, "grad_norm": 1.1652063131332397, "learning_rate": 1.7103675209399194e-05, "loss": 0.5907, "step": 4940 }, { "epoch": 0.8151334884008151, "grad_norm": 1.1235425472259521, "learning_rate": 1.7091151824442978e-05, "loss": 0.6217, "step": 4950 }, { "epoch": 0.8167802227208167, "grad_norm": 1.1538571119308472, "learning_rate": 1.707860602932408e-05, "loss": 0.5807, "step": 4960 }, { "epoch": 0.8184269570408185, "grad_norm": 1.3833861351013184, "learning_rate": 1.7066037863690975e-05, "loss": 0.5916, "step": 4970 }, { "epoch": 0.8200736913608201, "grad_norm": 1.847043752670288, "learning_rate": 1.7053447367262817e-05, "loss": 0.5662, "step": 4980 }, { "epoch": 0.8217204256808217, "grad_norm": 1.4586071968078613, "learning_rate": 1.7040834579829358e-05, "loss": 0.5624, "step": 4990 }, { "epoch": 0.8233671600008233, "grad_norm": 1.2225836515426636, "learning_rate": 1.702819954125077e-05, "loss": 0.5864, "step": 5000 }, { "epoch": 0.8250138943208251, "grad_norm": 1.430363655090332, "learning_rate": 1.7015542291457567e-05, "loss": 0.5816, "step": 5010 }, { "epoch": 0.8266606286408267, "grad_norm": 1.2088559865951538, "learning_rate": 1.7002862870450446e-05, "loss": 0.5924, "step": 5020 }, { "epoch": 0.8283073629608283, "grad_norm": 1.0974377393722534, "learning_rate": 1.6990161318300167e-05, "loss": 0.556, "step": 5030 }, { "epoch": 0.8299540972808299, "grad_norm": 1.475848913192749, "learning_rate": 1.697743767514744e-05, "loss": 0.5711, "step": 5040 }, { "epoch": 0.8316008316008316, "grad_norm": 1.3842908143997192, "learning_rate": 1.696469198120279e-05, "loss": 0.5931, "step": 5050 }, { "epoch": 0.8332475659208333, "grad_norm": 1.4212431907653809, "learning_rate": 1.6951924276746425e-05, "loss": 0.5905, "step": 5060 }, { "epoch": 0.8348943002408349, "grad_norm": 1.4288952350616455, "learning_rate": 1.693913460212811e-05, "loss": 0.5891, "step": 5070 }, { "epoch": 0.8365410345608365, "grad_norm": 1.2996931076049805, "learning_rate": 1.6926322997767045e-05, "loss": 0.5876, "step": 5080 }, { "epoch": 0.8381877688808382, "grad_norm": 1.6933460235595703, "learning_rate": 1.6913489504151743e-05, "loss": 0.5642, "step": 5090 }, { "epoch": 0.8398345032008399, "grad_norm": 1.210008144378662, "learning_rate": 1.690063416183988e-05, "loss": 0.5543, "step": 5100 }, { "epoch": 0.8414812375208415, "grad_norm": 1.4546929597854614, "learning_rate": 1.6887757011458184e-05, "loss": 0.5843, "step": 5110 }, { "epoch": 0.8431279718408431, "grad_norm": 1.6936150789260864, "learning_rate": 1.687485809370231e-05, "loss": 0.5882, "step": 5120 }, { "epoch": 0.8447747061608448, "grad_norm": 1.3216242790222168, "learning_rate": 1.6861937449336697e-05, "loss": 0.5567, "step": 5130 }, { "epoch": 0.8464214404808464, "grad_norm": 1.691382884979248, "learning_rate": 1.6848995119194453e-05, "loss": 0.559, "step": 5140 }, { "epoch": 0.8480681748008481, "grad_norm": 1.0977565050125122, "learning_rate": 1.683603114417721e-05, "loss": 0.5877, "step": 5150 }, { "epoch": 0.8497149091208497, "grad_norm": 1.1555675268173218, "learning_rate": 1.6823045565255018e-05, "loss": 0.5665, "step": 5160 }, { "epoch": 0.8513616434408514, "grad_norm": 1.320023775100708, "learning_rate": 1.6810038423466184e-05, "loss": 0.5939, "step": 5170 }, { "epoch": 0.853008377760853, "grad_norm": 1.6562938690185547, "learning_rate": 1.6797009759917177e-05, "loss": 0.5441, "step": 5180 }, { "epoch": 0.8546551120808547, "grad_norm": 1.2160990238189697, "learning_rate": 1.678395961578247e-05, "loss": 0.5647, "step": 5190 }, { "epoch": 0.8563018464008563, "grad_norm": 1.3416587114334106, "learning_rate": 1.6770888032304437e-05, "loss": 0.5696, "step": 5200 }, { "epoch": 0.857948580720858, "grad_norm": 1.0583994388580322, "learning_rate": 1.6757795050793175e-05, "loss": 0.5509, "step": 5210 }, { "epoch": 0.8595953150408596, "grad_norm": 1.2085150480270386, "learning_rate": 1.674468071262644e-05, "loss": 0.5405, "step": 5220 }, { "epoch": 0.8612420493608612, "grad_norm": 1.6761043071746826, "learning_rate": 1.6731545059249467e-05, "loss": 0.5666, "step": 5230 }, { "epoch": 0.8628887836808629, "grad_norm": 1.3844279050827026, "learning_rate": 1.671838813217485e-05, "loss": 0.5458, "step": 5240 }, { "epoch": 0.8645355180008646, "grad_norm": 2.4434731006622314, "learning_rate": 1.670520997298241e-05, "loss": 0.5734, "step": 5250 }, { "epoch": 0.8661822523208662, "grad_norm": 1.2524642944335938, "learning_rate": 1.6692010623319087e-05, "loss": 0.5698, "step": 5260 }, { "epoch": 0.8678289866408678, "grad_norm": 1.8466917276382446, "learning_rate": 1.667879012489877e-05, "loss": 0.5698, "step": 5270 }, { "epoch": 0.8694757209608694, "grad_norm": 1.3722879886627197, "learning_rate": 1.6665548519502196e-05, "loss": 0.5402, "step": 5280 }, { "epoch": 0.8711224552808712, "grad_norm": 1.4703186750411987, "learning_rate": 1.66522858489768e-05, "loss": 0.5886, "step": 5290 }, { "epoch": 0.8727691896008728, "grad_norm": 1.5705962181091309, "learning_rate": 1.663900215523659e-05, "loss": 0.6006, "step": 5300 }, { "epoch": 0.8744159239208744, "grad_norm": 1.7487270832061768, "learning_rate": 1.662569748026202e-05, "loss": 0.5421, "step": 5310 }, { "epoch": 0.876062658240876, "grad_norm": 1.986843228340149, "learning_rate": 1.661237186609984e-05, "loss": 0.5275, "step": 5320 }, { "epoch": 0.8777093925608777, "grad_norm": 1.661085844039917, "learning_rate": 1.6599025354862983e-05, "loss": 0.5491, "step": 5330 }, { "epoch": 0.8793561268808794, "grad_norm": 1.975029706954956, "learning_rate": 1.6585657988730424e-05, "loss": 0.5743, "step": 5340 }, { "epoch": 0.881002861200881, "grad_norm": 1.8658064603805542, "learning_rate": 1.6572269809947035e-05, "loss": 0.5627, "step": 5350 }, { "epoch": 0.8826495955208826, "grad_norm": 1.540292739868164, "learning_rate": 1.6558860860823473e-05, "loss": 0.5227, "step": 5360 }, { "epoch": 0.8842963298408842, "grad_norm": 1.792221188545227, "learning_rate": 1.654543118373603e-05, "loss": 0.5766, "step": 5370 }, { "epoch": 0.885943064160886, "grad_norm": 1.3535420894622803, "learning_rate": 1.6531980821126508e-05, "loss": 0.5803, "step": 5380 }, { "epoch": 0.8875897984808876, "grad_norm": 1.7461328506469727, "learning_rate": 1.651850981550208e-05, "loss": 0.5511, "step": 5390 }, { "epoch": 0.8892365328008892, "grad_norm": 1.6010706424713135, "learning_rate": 1.6505018209435152e-05, "loss": 0.5656, "step": 5400 }, { "epoch": 0.8908832671208908, "grad_norm": 1.3865113258361816, "learning_rate": 1.649150604556324e-05, "loss": 0.57, "step": 5410 }, { "epoch": 0.8925300014408926, "grad_norm": 1.609237790107727, "learning_rate": 1.6477973366588833e-05, "loss": 0.5566, "step": 5420 }, { "epoch": 0.8941767357608942, "grad_norm": 2.0316720008850098, "learning_rate": 1.6464420215279237e-05, "loss": 0.5601, "step": 5430 }, { "epoch": 0.8958234700808958, "grad_norm": 1.759619951248169, "learning_rate": 1.6450846634466476e-05, "loss": 0.5665, "step": 5440 }, { "epoch": 0.8974702044008974, "grad_norm": 1.5108795166015625, "learning_rate": 1.643725266704713e-05, "loss": 0.5432, "step": 5450 }, { "epoch": 0.8991169387208992, "grad_norm": 1.3208014965057373, "learning_rate": 1.6423638355982202e-05, "loss": 0.5867, "step": 5460 }, { "epoch": 0.9007636730409008, "grad_norm": 1.530247688293457, "learning_rate": 1.6410003744296984e-05, "loss": 0.5721, "step": 5470 }, { "epoch": 0.9024104073609024, "grad_norm": 2.346730947494507, "learning_rate": 1.6396348875080945e-05, "loss": 0.5569, "step": 5480 }, { "epoch": 0.904057141680904, "grad_norm": 1.4136897325515747, "learning_rate": 1.638267379148755e-05, "loss": 0.5876, "step": 5490 }, { "epoch": 0.9057038760009057, "grad_norm": 1.5225974321365356, "learning_rate": 1.6368978536734162e-05, "loss": 0.57, "step": 5500 }, { "epoch": 0.9073506103209074, "grad_norm": 1.5766575336456299, "learning_rate": 1.6355263154101884e-05, "loss": 0.5282, "step": 5510 }, { "epoch": 0.908997344640909, "grad_norm": 1.784691333770752, "learning_rate": 1.634152768693543e-05, "loss": 0.5805, "step": 5520 }, { "epoch": 0.9106440789609106, "grad_norm": 1.479247808456421, "learning_rate": 1.6327772178642986e-05, "loss": 0.5525, "step": 5530 }, { "epoch": 0.9122908132809123, "grad_norm": 1.4190905094146729, "learning_rate": 1.6313996672696083e-05, "loss": 0.5598, "step": 5540 }, { "epoch": 0.913937547600914, "grad_norm": 1.5564013719558716, "learning_rate": 1.6300201212629437e-05, "loss": 0.5521, "step": 5550 }, { "epoch": 0.9155842819209156, "grad_norm": 1.3574228286743164, "learning_rate": 1.6286385842040843e-05, "loss": 0.55, "step": 5560 }, { "epoch": 0.9172310162409172, "grad_norm": 1.4063721895217896, "learning_rate": 1.6272550604590993e-05, "loss": 0.5731, "step": 5570 }, { "epoch": 0.9188777505609189, "grad_norm": 1.4861043691635132, "learning_rate": 1.625869554400339e-05, "loss": 0.5248, "step": 5580 }, { "epoch": 0.9205244848809205, "grad_norm": 1.3798218965530396, "learning_rate": 1.624482070406417e-05, "loss": 0.5626, "step": 5590 }, { "epoch": 0.9221712192009222, "grad_norm": 1.3654139041900635, "learning_rate": 1.623092612862198e-05, "loss": 0.5273, "step": 5600 }, { "epoch": 0.9238179535209238, "grad_norm": 1.593929648399353, "learning_rate": 1.6217011861587834e-05, "loss": 0.5609, "step": 5610 }, { "epoch": 0.9254646878409255, "grad_norm": 1.439780592918396, "learning_rate": 1.6203077946934992e-05, "loss": 0.5699, "step": 5620 }, { "epoch": 0.9271114221609271, "grad_norm": 2.25289249420166, "learning_rate": 1.6189124428698787e-05, "loss": 0.5472, "step": 5630 }, { "epoch": 0.9287581564809287, "grad_norm": 1.3383618593215942, "learning_rate": 1.6175151350976518e-05, "loss": 0.5551, "step": 5640 }, { "epoch": 0.9304048908009304, "grad_norm": 1.414027452468872, "learning_rate": 1.6161158757927292e-05, "loss": 0.5793, "step": 5650 }, { "epoch": 0.9320516251209321, "grad_norm": 1.126561164855957, "learning_rate": 1.6147146693771896e-05, "loss": 0.5704, "step": 5660 }, { "epoch": 0.9336983594409337, "grad_norm": 1.2248793840408325, "learning_rate": 1.6133115202792645e-05, "loss": 0.5614, "step": 5670 }, { "epoch": 0.9353450937609353, "grad_norm": 1.4462449550628662, "learning_rate": 1.6119064329333248e-05, "loss": 0.5544, "step": 5680 }, { "epoch": 0.936991828080937, "grad_norm": 1.1664077043533325, "learning_rate": 1.6104994117798674e-05, "loss": 0.5596, "step": 5690 }, { "epoch": 0.9386385624009387, "grad_norm": 1.239491581916809, "learning_rate": 1.6090904612655007e-05, "loss": 0.5636, "step": 5700 }, { "epoch": 0.9402852967209403, "grad_norm": 1.3634086847305298, "learning_rate": 1.6076795858429296e-05, "loss": 0.558, "step": 5710 }, { "epoch": 0.9419320310409419, "grad_norm": 1.319319725036621, "learning_rate": 1.606266789970943e-05, "loss": 0.5563, "step": 5720 }, { "epoch": 0.9435787653609435, "grad_norm": 1.2931233644485474, "learning_rate": 1.6048520781143988e-05, "loss": 0.5646, "step": 5730 }, { "epoch": 0.9452254996809453, "grad_norm": 1.3727686405181885, "learning_rate": 1.6034354547442104e-05, "loss": 0.5288, "step": 5740 }, { "epoch": 0.9468722340009469, "grad_norm": 1.4657319784164429, "learning_rate": 1.6020169243373313e-05, "loss": 0.5605, "step": 5750 }, { "epoch": 0.9485189683209485, "grad_norm": 1.3323101997375488, "learning_rate": 1.600596491376742e-05, "loss": 0.5575, "step": 5760 }, { "epoch": 0.9501657026409501, "grad_norm": 1.4169237613677979, "learning_rate": 1.5991741603514367e-05, "loss": 0.5448, "step": 5770 }, { "epoch": 0.9518124369609519, "grad_norm": 2.1234960556030273, "learning_rate": 1.5977499357564067e-05, "loss": 0.5453, "step": 5780 }, { "epoch": 0.9534591712809535, "grad_norm": 1.3298665285110474, "learning_rate": 1.596323822092628e-05, "loss": 0.5205, "step": 5790 }, { "epoch": 0.9551059056009551, "grad_norm": 1.7144644260406494, "learning_rate": 1.594895823867047e-05, "loss": 0.5333, "step": 5800 }, { "epoch": 0.9567526399209567, "grad_norm": 1.2539159059524536, "learning_rate": 1.5934659455925658e-05, "loss": 0.552, "step": 5810 }, { "epoch": 0.9583993742409584, "grad_norm": 1.7403203248977661, "learning_rate": 1.5920341917880277e-05, "loss": 0.5166, "step": 5820 }, { "epoch": 0.9600461085609601, "grad_norm": 1.802240252494812, "learning_rate": 1.5906005669782027e-05, "loss": 0.5392, "step": 5830 }, { "epoch": 0.9616928428809617, "grad_norm": 1.3411508798599243, "learning_rate": 1.5891650756937755e-05, "loss": 0.5621, "step": 5840 }, { "epoch": 0.9633395772009633, "grad_norm": 1.388187289237976, "learning_rate": 1.587727722471327e-05, "loss": 0.5331, "step": 5850 }, { "epoch": 0.964986311520965, "grad_norm": 1.2933506965637207, "learning_rate": 1.5862885118533244e-05, "loss": 0.5449, "step": 5860 }, { "epoch": 0.9666330458409667, "grad_norm": 1.637378454208374, "learning_rate": 1.5848474483881044e-05, "loss": 0.5586, "step": 5870 }, { "epoch": 0.9682797801609683, "grad_norm": 1.2650068998336792, "learning_rate": 1.5834045366298593e-05, "loss": 0.5504, "step": 5880 }, { "epoch": 0.9699265144809699, "grad_norm": 1.3587515354156494, "learning_rate": 1.5819597811386208e-05, "loss": 0.5494, "step": 5890 }, { "epoch": 0.9715732488009716, "grad_norm": 1.1852201223373413, "learning_rate": 1.5805131864802496e-05, "loss": 0.5639, "step": 5900 }, { "epoch": 0.9732199831209732, "grad_norm": 1.3955892324447632, "learning_rate": 1.579064757226418e-05, "loss": 0.5651, "step": 5910 }, { "epoch": 0.9748667174409749, "grad_norm": 1.284073829650879, "learning_rate": 1.5776144979545963e-05, "loss": 0.5304, "step": 5920 }, { "epoch": 0.9765134517609765, "grad_norm": 1.7228704690933228, "learning_rate": 1.5761624132480372e-05, "loss": 0.5762, "step": 5930 }, { "epoch": 0.9781601860809782, "grad_norm": 1.9546711444854736, "learning_rate": 1.5747085076957635e-05, "loss": 0.5715, "step": 5940 }, { "epoch": 0.9798069204009798, "grad_norm": 1.2742135524749756, "learning_rate": 1.5732527858925523e-05, "loss": 0.5501, "step": 5950 }, { "epoch": 0.9814536547209814, "grad_norm": 1.293632984161377, "learning_rate": 1.57179525243892e-05, "loss": 0.5593, "step": 5960 }, { "epoch": 0.9831003890409831, "grad_norm": 1.244485855102539, "learning_rate": 1.5703359119411087e-05, "loss": 0.5229, "step": 5970 }, { "epoch": 0.9847471233609848, "grad_norm": 1.1649023294448853, "learning_rate": 1.5688747690110708e-05, "loss": 0.5411, "step": 5980 }, { "epoch": 0.9863938576809864, "grad_norm": 1.1061643362045288, "learning_rate": 1.5674118282664563e-05, "loss": 0.5528, "step": 5990 }, { "epoch": 0.988040592000988, "grad_norm": 1.6961778402328491, "learning_rate": 1.5659470943305956e-05, "loss": 0.5484, "step": 6000 }, { "epoch": 0.9896873263209897, "grad_norm": 1.4628689289093018, "learning_rate": 1.5644805718324854e-05, "loss": 0.5446, "step": 6010 }, { "epoch": 0.9913340606409914, "grad_norm": 1.3264354467391968, "learning_rate": 1.5630122654067778e-05, "loss": 0.5684, "step": 6020 }, { "epoch": 0.992980794960993, "grad_norm": 1.5602631568908691, "learning_rate": 1.5615421796937593e-05, "loss": 0.5514, "step": 6030 }, { "epoch": 0.9946275292809946, "grad_norm": 2.169344663619995, "learning_rate": 1.560070319339341e-05, "loss": 0.5681, "step": 6040 }, { "epoch": 0.9962742636009962, "grad_norm": 1.190785527229309, "learning_rate": 1.5585966889950423e-05, "loss": 0.5266, "step": 6050 }, { "epoch": 0.997920997920998, "grad_norm": 1.3038309812545776, "learning_rate": 1.5571212933179766e-05, "loss": 0.5326, "step": 6060 }, { "epoch": 0.9995677322409996, "grad_norm": 1.3445013761520386, "learning_rate": 1.5556441369708358e-05, "loss": 0.5473, "step": 6070 }, { "epoch": 1.0013173874560013, "grad_norm": 1.4349257946014404, "learning_rate": 1.554165224621876e-05, "loss": 0.5755, "step": 6080 }, { "epoch": 1.002964121776003, "grad_norm": 1.7247047424316406, "learning_rate": 1.552684560944903e-05, "loss": 0.5148, "step": 6090 }, { "epoch": 1.0046108560960045, "grad_norm": 1.518079161643982, "learning_rate": 1.5512021506192575e-05, "loss": 0.5156, "step": 6100 }, { "epoch": 1.0062575904160063, "grad_norm": 1.1310617923736572, "learning_rate": 1.5497179983297998e-05, "loss": 0.4851, "step": 6110 }, { "epoch": 1.007904324736008, "grad_norm": 1.521835207939148, "learning_rate": 1.5482321087668954e-05, "loss": 0.507, "step": 6120 }, { "epoch": 1.0095510590560095, "grad_norm": 1.224595308303833, "learning_rate": 1.5467444866264e-05, "loss": 0.5086, "step": 6130 }, { "epoch": 1.0111977933760112, "grad_norm": 1.4510592222213745, "learning_rate": 1.5452551366096457e-05, "loss": 0.4839, "step": 6140 }, { "epoch": 1.0128445276960127, "grad_norm": 1.4431626796722412, "learning_rate": 1.5437640634234234e-05, "loss": 0.4992, "step": 6150 }, { "epoch": 1.0144912620160145, "grad_norm": 1.131432056427002, "learning_rate": 1.5422712717799714e-05, "loss": 0.5107, "step": 6160 }, { "epoch": 1.0161379963360162, "grad_norm": 1.4015928506851196, "learning_rate": 1.5407767663969576e-05, "loss": 0.5125, "step": 6170 }, { "epoch": 1.0177847306560177, "grad_norm": 1.4604285955429077, "learning_rate": 1.5392805519974678e-05, "loss": 0.5076, "step": 6180 }, { "epoch": 1.0194314649760194, "grad_norm": 1.2894060611724854, "learning_rate": 1.5377826333099855e-05, "loss": 0.51, "step": 6190 }, { "epoch": 1.0210781992960212, "grad_norm": 1.5654339790344238, "learning_rate": 1.5362830150683838e-05, "loss": 0.4946, "step": 6200 }, { "epoch": 1.0227249336160227, "grad_norm": 1.7387830018997192, "learning_rate": 1.5347817020119044e-05, "loss": 0.5302, "step": 6210 }, { "epoch": 1.0243716679360244, "grad_norm": 1.534555196762085, "learning_rate": 1.5332786988851462e-05, "loss": 0.5191, "step": 6220 }, { "epoch": 1.026018402256026, "grad_norm": 1.6992589235305786, "learning_rate": 1.531774010438049e-05, "loss": 0.5076, "step": 6230 }, { "epoch": 1.0276651365760276, "grad_norm": 1.5963928699493408, "learning_rate": 1.530267641425879e-05, "loss": 0.4864, "step": 6240 }, { "epoch": 1.0293118708960294, "grad_norm": 1.390317678451538, "learning_rate": 1.528759596609213e-05, "loss": 0.5072, "step": 6250 }, { "epoch": 1.0309586052160309, "grad_norm": 1.8305940628051758, "learning_rate": 1.5272498807539248e-05, "loss": 0.4892, "step": 6260 }, { "epoch": 1.0326053395360326, "grad_norm": 4.596288204193115, "learning_rate": 1.5257384986311671e-05, "loss": 0.513, "step": 6270 }, { "epoch": 1.0342520738560343, "grad_norm": 1.9707483053207397, "learning_rate": 1.5242254550173612e-05, "loss": 0.4815, "step": 6280 }, { "epoch": 1.0358988081760359, "grad_norm": 1.657809853553772, "learning_rate": 1.5227107546941772e-05, "loss": 0.5244, "step": 6290 }, { "epoch": 1.0375455424960376, "grad_norm": 1.530784010887146, "learning_rate": 1.5211944024485216e-05, "loss": 0.5041, "step": 6300 }, { "epoch": 1.039192276816039, "grad_norm": 1.7985795736312866, "learning_rate": 1.5196764030725217e-05, "loss": 0.4922, "step": 6310 }, { "epoch": 1.0408390111360408, "grad_norm": 1.5775318145751953, "learning_rate": 1.5181567613635099e-05, "loss": 0.5004, "step": 6320 }, { "epoch": 1.0424857454560426, "grad_norm": 1.5690876245498657, "learning_rate": 1.5166354821240093e-05, "loss": 0.5047, "step": 6330 }, { "epoch": 1.044132479776044, "grad_norm": 1.4452539682388306, "learning_rate": 1.5151125701617168e-05, "loss": 0.4845, "step": 6340 }, { "epoch": 1.0457792140960458, "grad_norm": 2.1906886100769043, "learning_rate": 1.513588030289491e-05, "loss": 0.4827, "step": 6350 }, { "epoch": 1.0474259484160475, "grad_norm": 2.8921637535095215, "learning_rate": 1.5120618673253335e-05, "loss": 0.5144, "step": 6360 }, { "epoch": 1.049072682736049, "grad_norm": 2.0884976387023926, "learning_rate": 1.510534086092377e-05, "loss": 0.5055, "step": 6370 }, { "epoch": 1.0507194170560508, "grad_norm": 2.065589189529419, "learning_rate": 1.5090046914188672e-05, "loss": 0.5281, "step": 6380 }, { "epoch": 1.0523661513760523, "grad_norm": 2.384566068649292, "learning_rate": 1.507473688138149e-05, "loss": 0.5073, "step": 6390 }, { "epoch": 1.054012885696054, "grad_norm": 2.007929801940918, "learning_rate": 1.5059410810886515e-05, "loss": 0.4977, "step": 6400 }, { "epoch": 1.0556596200160557, "grad_norm": 1.7083964347839355, "learning_rate": 1.5044068751138716e-05, "loss": 0.5095, "step": 6410 }, { "epoch": 1.0573063543360572, "grad_norm": 1.9435292482376099, "learning_rate": 1.5028710750623592e-05, "loss": 0.5173, "step": 6420 }, { "epoch": 1.058953088656059, "grad_norm": 3.824756145477295, "learning_rate": 1.5013336857877025e-05, "loss": 0.482, "step": 6430 }, { "epoch": 1.0605998229760607, "grad_norm": 2.91257381439209, "learning_rate": 1.4997947121485119e-05, "loss": 0.5235, "step": 6440 }, { "epoch": 1.0622465572960622, "grad_norm": 1.5288364887237549, "learning_rate": 1.4982541590084047e-05, "loss": 0.4804, "step": 6450 }, { "epoch": 1.063893291616064, "grad_norm": 1.5502305030822754, "learning_rate": 1.4967120312359902e-05, "loss": 0.516, "step": 6460 }, { "epoch": 1.0655400259360654, "grad_norm": 1.62122642993927, "learning_rate": 1.4951683337048536e-05, "loss": 0.4812, "step": 6470 }, { "epoch": 1.0671867602560672, "grad_norm": 1.508252739906311, "learning_rate": 1.4936230712935416e-05, "loss": 0.4897, "step": 6480 }, { "epoch": 1.068833494576069, "grad_norm": 1.665816068649292, "learning_rate": 1.4920762488855457e-05, "loss": 0.4891, "step": 6490 }, { "epoch": 1.0704802288960704, "grad_norm": 1.297685146331787, "learning_rate": 1.490527871369288e-05, "loss": 0.4643, "step": 6500 }, { "epoch": 1.071962289784072, "grad_norm": 1.9492204189300537, "learning_rate": 1.4889779436381046e-05, "loss": 0.5146, "step": 6510 }, { "epoch": 1.0736090241040737, "grad_norm": 1.655253291130066, "learning_rate": 1.4874264705902319e-05, "loss": 0.5261, "step": 6520 }, { "epoch": 1.0752557584240752, "grad_norm": 1.1686067581176758, "learning_rate": 1.4858734571287885e-05, "loss": 0.5401, "step": 6530 }, { "epoch": 1.076902492744077, "grad_norm": 1.5212386846542358, "learning_rate": 1.4843189081617622e-05, "loss": 0.5403, "step": 6540 }, { "epoch": 1.0785492270640786, "grad_norm": 1.1677196025848389, "learning_rate": 1.4827628286019928e-05, "loss": 0.5261, "step": 6550 }, { "epoch": 1.0801959613840801, "grad_norm": 1.2745987176895142, "learning_rate": 1.4812052233671581e-05, "loss": 0.5309, "step": 6560 }, { "epoch": 1.0818426957040819, "grad_norm": 1.4600034952163696, "learning_rate": 1.4796460973797566e-05, "loss": 0.5168, "step": 6570 }, { "epoch": 1.0834894300240836, "grad_norm": 1.8619006872177124, "learning_rate": 1.4780854555670932e-05, "loss": 0.5423, "step": 6580 }, { "epoch": 1.085136164344085, "grad_norm": 1.418540120124817, "learning_rate": 1.4765233028612633e-05, "loss": 0.501, "step": 6590 }, { "epoch": 1.0867828986640868, "grad_norm": 1.2168614864349365, "learning_rate": 1.474959644199137e-05, "loss": 0.5219, "step": 6600 }, { "epoch": 1.0884296329840883, "grad_norm": 1.4315396547317505, "learning_rate": 1.4733944845223441e-05, "loss": 0.5274, "step": 6610 }, { "epoch": 1.09007636730409, "grad_norm": 1.5139554738998413, "learning_rate": 1.4718278287772574e-05, "loss": 0.5181, "step": 6620 }, { "epoch": 1.0917231016240918, "grad_norm": 1.6579830646514893, "learning_rate": 1.4702596819149784e-05, "loss": 0.5303, "step": 6630 }, { "epoch": 1.0933698359440933, "grad_norm": 1.6588560342788696, "learning_rate": 1.46869004889132e-05, "loss": 0.4853, "step": 6640 }, { "epoch": 1.095016570264095, "grad_norm": 1.925136923789978, "learning_rate": 1.4671189346667933e-05, "loss": 0.5222, "step": 6650 }, { "epoch": 1.0966633045840966, "grad_norm": 1.1029921770095825, "learning_rate": 1.4655463442065893e-05, "loss": 0.4927, "step": 6660 }, { "epoch": 1.0983100389040983, "grad_norm": 1.2987926006317139, "learning_rate": 1.4639722824805644e-05, "loss": 0.5328, "step": 6670 }, { "epoch": 1.0999567732241, "grad_norm": 1.4294719696044922, "learning_rate": 1.4623967544632252e-05, "loss": 0.5172, "step": 6680 }, { "epoch": 1.1016035075441015, "grad_norm": 1.1788091659545898, "learning_rate": 1.4608197651337117e-05, "loss": 0.5056, "step": 6690 }, { "epoch": 1.1032502418641033, "grad_norm": 1.2116508483886719, "learning_rate": 1.4592413194757826e-05, "loss": 0.5182, "step": 6700 }, { "epoch": 1.104896976184105, "grad_norm": 1.1964901685714722, "learning_rate": 1.4576614224777982e-05, "loss": 0.4942, "step": 6710 }, { "epoch": 1.1065437105041065, "grad_norm": 1.5274699926376343, "learning_rate": 1.4560800791327063e-05, "loss": 0.5232, "step": 6720 }, { "epoch": 1.1081904448241082, "grad_norm": 1.4706414937973022, "learning_rate": 1.4544972944380256e-05, "loss": 0.5432, "step": 6730 }, { "epoch": 1.10983717914411, "grad_norm": 1.6734999418258667, "learning_rate": 1.4529130733958292e-05, "loss": 0.5161, "step": 6740 }, { "epoch": 1.1114839134641115, "grad_norm": 1.2489820718765259, "learning_rate": 1.4513274210127298e-05, "loss": 0.5232, "step": 6750 }, { "epoch": 1.1131306477841132, "grad_norm": 1.5743283033370972, "learning_rate": 1.4497403422998634e-05, "loss": 0.5028, "step": 6760 }, { "epoch": 1.1147773821041147, "grad_norm": 1.1314998865127563, "learning_rate": 1.448151842272875e-05, "loss": 0.5087, "step": 6770 }, { "epoch": 1.1164241164241164, "grad_norm": 1.446236491203308, "learning_rate": 1.446561925951899e-05, "loss": 0.5107, "step": 6780 }, { "epoch": 1.1180708507441182, "grad_norm": 1.3917357921600342, "learning_rate": 1.4449705983615474e-05, "loss": 0.5119, "step": 6790 }, { "epoch": 1.1197175850641197, "grad_norm": 1.290035605430603, "learning_rate": 1.443377864530892e-05, "loss": 0.522, "step": 6800 }, { "epoch": 1.1213643193841214, "grad_norm": 1.2674860954284668, "learning_rate": 1.4417837294934484e-05, "loss": 0.5249, "step": 6810 }, { "epoch": 1.123011053704123, "grad_norm": 1.2753984928131104, "learning_rate": 1.4401881982871604e-05, "loss": 0.4972, "step": 6820 }, { "epoch": 1.1246577880241246, "grad_norm": 1.801187515258789, "learning_rate": 1.4385912759543843e-05, "loss": 0.5043, "step": 6830 }, { "epoch": 1.1263045223441264, "grad_norm": 1.2739251852035522, "learning_rate": 1.436992967541873e-05, "loss": 0.509, "step": 6840 }, { "epoch": 1.1279512566641279, "grad_norm": 1.319750189781189, "learning_rate": 1.4353932781007594e-05, "loss": 0.529, "step": 6850 }, { "epoch": 1.1295979909841296, "grad_norm": 1.7615410089492798, "learning_rate": 1.433792212686541e-05, "loss": 0.5045, "step": 6860 }, { "epoch": 1.1312447253041313, "grad_norm": 1.2809779644012451, "learning_rate": 1.432189776359064e-05, "loss": 0.4994, "step": 6870 }, { "epoch": 1.1328914596241328, "grad_norm": 1.5949429273605347, "learning_rate": 1.4305859741825068e-05, "loss": 0.502, "step": 6880 }, { "epoch": 1.1345381939441346, "grad_norm": 1.4932994842529297, "learning_rate": 1.4289808112253643e-05, "loss": 0.5026, "step": 6890 }, { "epoch": 1.1361849282641363, "grad_norm": 1.708443284034729, "learning_rate": 1.4273742925604322e-05, "loss": 0.4747, "step": 6900 }, { "epoch": 1.1378316625841378, "grad_norm": 1.2732783555984497, "learning_rate": 1.4257664232647903e-05, "loss": 0.5154, "step": 6910 }, { "epoch": 1.1394783969041395, "grad_norm": 1.4677982330322266, "learning_rate": 1.424157208419787e-05, "loss": 0.4879, "step": 6920 }, { "epoch": 1.141125131224141, "grad_norm": 1.3541361093521118, "learning_rate": 1.422546653111023e-05, "loss": 0.5129, "step": 6930 }, { "epoch": 1.1427718655441428, "grad_norm": 1.359041690826416, "learning_rate": 1.4209347624283352e-05, "loss": 0.4885, "step": 6940 }, { "epoch": 1.1444185998641445, "grad_norm": 1.3765989542007446, "learning_rate": 1.4193215414657808e-05, "loss": 0.543, "step": 6950 }, { "epoch": 1.146065334184146, "grad_norm": 1.3113765716552734, "learning_rate": 1.4177069953216207e-05, "loss": 0.4778, "step": 6960 }, { "epoch": 1.1477120685041478, "grad_norm": 1.1242080926895142, "learning_rate": 1.4160911290983041e-05, "loss": 0.4809, "step": 6970 }, { "epoch": 1.1493588028241493, "grad_norm": 1.536487340927124, "learning_rate": 1.4144739479024527e-05, "loss": 0.4818, "step": 6980 }, { "epoch": 1.151005537144151, "grad_norm": 1.2508544921875, "learning_rate": 1.4128554568448425e-05, "loss": 0.514, "step": 6990 }, { "epoch": 1.1526522714641527, "grad_norm": 1.192643404006958, "learning_rate": 1.4112356610403897e-05, "loss": 0.4733, "step": 7000 }, { "epoch": 1.1542990057841542, "grad_norm": 1.2769783735275269, "learning_rate": 1.4096145656081347e-05, "loss": 0.4891, "step": 7010 }, { "epoch": 1.155945740104156, "grad_norm": 1.1201382875442505, "learning_rate": 1.4079921756712238e-05, "loss": 0.4824, "step": 7020 }, { "epoch": 1.1575924744241575, "grad_norm": 1.3404284715652466, "learning_rate": 1.4063684963568948e-05, "loss": 0.5242, "step": 7030 }, { "epoch": 1.1592392087441592, "grad_norm": 1.3030028343200684, "learning_rate": 1.4047435327964609e-05, "loss": 0.5023, "step": 7040 }, { "epoch": 1.160885943064161, "grad_norm": 1.7895567417144775, "learning_rate": 1.4031172901252931e-05, "loss": 0.4866, "step": 7050 }, { "epoch": 1.1625326773841624, "grad_norm": 1.2120519876480103, "learning_rate": 1.4014897734828055e-05, "loss": 0.4819, "step": 7060 }, { "epoch": 1.1641794117041642, "grad_norm": 1.3895535469055176, "learning_rate": 1.3998609880124373e-05, "loss": 0.4955, "step": 7070 }, { "epoch": 1.165826146024166, "grad_norm": 1.2869995832443237, "learning_rate": 1.3982309388616385e-05, "loss": 0.4857, "step": 7080 }, { "epoch": 1.1674728803441674, "grad_norm": 1.2224863767623901, "learning_rate": 1.3965996311818526e-05, "loss": 0.5069, "step": 7090 }, { "epoch": 1.1691196146641691, "grad_norm": 1.1428492069244385, "learning_rate": 1.3949670701285e-05, "loss": 0.4908, "step": 7100 }, { "epoch": 1.1707663489841709, "grad_norm": 1.8123971223831177, "learning_rate": 1.3933332608609624e-05, "loss": 0.504, "step": 7110 }, { "epoch": 1.1724130833041724, "grad_norm": 1.2169560194015503, "learning_rate": 1.3916982085425663e-05, "loss": 0.5015, "step": 7120 }, { "epoch": 1.174059817624174, "grad_norm": 1.34042489528656, "learning_rate": 1.3900619183405666e-05, "loss": 0.5077, "step": 7130 }, { "epoch": 1.1757065519441756, "grad_norm": 1.6218396425247192, "learning_rate": 1.38842439542613e-05, "loss": 0.5148, "step": 7140 }, { "epoch": 1.1773532862641773, "grad_norm": 1.162052869796753, "learning_rate": 1.3867856449743191e-05, "loss": 0.4832, "step": 7150 }, { "epoch": 1.179000020584179, "grad_norm": 1.2878142595291138, "learning_rate": 1.3851456721640762e-05, "loss": 0.5124, "step": 7160 }, { "epoch": 1.1806467549041806, "grad_norm": 1.1885998249053955, "learning_rate": 1.3835044821782063e-05, "loss": 0.4829, "step": 7170 }, { "epoch": 1.1822934892241823, "grad_norm": 1.176771879196167, "learning_rate": 1.3818620802033609e-05, "loss": 0.4881, "step": 7180 }, { "epoch": 1.1839402235441838, "grad_norm": 1.699350357055664, "learning_rate": 1.3802184714300219e-05, "loss": 0.4995, "step": 7190 }, { "epoch": 1.1855869578641856, "grad_norm": 1.660423994064331, "learning_rate": 1.3785736610524854e-05, "loss": 0.5151, "step": 7200 }, { "epoch": 1.1872336921841873, "grad_norm": 1.5366398096084595, "learning_rate": 1.3769276542688444e-05, "loss": 0.5002, "step": 7210 }, { "epoch": 1.1888804265041888, "grad_norm": 1.597117304801941, "learning_rate": 1.3752804562809731e-05, "loss": 0.4737, "step": 7220 }, { "epoch": 1.1905271608241905, "grad_norm": 1.2548353672027588, "learning_rate": 1.3736320722945103e-05, "loss": 0.4971, "step": 7230 }, { "epoch": 1.1921738951441923, "grad_norm": 1.5998353958129883, "learning_rate": 1.3719825075188427e-05, "loss": 0.4988, "step": 7240 }, { "epoch": 1.1938206294641938, "grad_norm": 1.853366732597351, "learning_rate": 1.370331767167089e-05, "loss": 0.5122, "step": 7250 }, { "epoch": 1.1954673637841955, "grad_norm": 1.6614432334899902, "learning_rate": 1.3686798564560831e-05, "loss": 0.5014, "step": 7260 }, { "epoch": 1.1971140981041972, "grad_norm": 1.8630009889602661, "learning_rate": 1.3670267806063573e-05, "loss": 0.4648, "step": 7270 }, { "epoch": 1.1987608324241987, "grad_norm": 1.5616339445114136, "learning_rate": 1.3653725448421258e-05, "loss": 0.5241, "step": 7280 }, { "epoch": 1.2004075667442005, "grad_norm": 1.2994410991668701, "learning_rate": 1.3637171543912693e-05, "loss": 0.4812, "step": 7290 }, { "epoch": 1.202054301064202, "grad_norm": 1.523964524269104, "learning_rate": 1.362060614485317e-05, "loss": 0.4898, "step": 7300 }, { "epoch": 1.2037010353842037, "grad_norm": 2.751901865005493, "learning_rate": 1.3604029303594311e-05, "loss": 0.5339, "step": 7310 }, { "epoch": 1.2053477697042054, "grad_norm": 1.832046389579773, "learning_rate": 1.3587441072523893e-05, "loss": 0.4728, "step": 7320 }, { "epoch": 1.206994504024207, "grad_norm": 1.397538423538208, "learning_rate": 1.3570841504065695e-05, "loss": 0.4737, "step": 7330 }, { "epoch": 1.2086412383442087, "grad_norm": 2.1878819465637207, "learning_rate": 1.3554230650679326e-05, "loss": 0.4977, "step": 7340 }, { "epoch": 1.2102879726642102, "grad_norm": 1.9552628993988037, "learning_rate": 1.3537608564860053e-05, "loss": 0.4879, "step": 7350 }, { "epoch": 1.211934706984212, "grad_norm": 1.4052588939666748, "learning_rate": 1.3520975299138637e-05, "loss": 0.4753, "step": 7360 }, { "epoch": 1.2135814413042136, "grad_norm": 1.8420672416687012, "learning_rate": 1.3504330906081187e-05, "loss": 0.4877, "step": 7370 }, { "epoch": 1.2152281756242151, "grad_norm": 1.29952073097229, "learning_rate": 1.3487675438288965e-05, "loss": 0.4822, "step": 7380 }, { "epoch": 1.2168749099442169, "grad_norm": 1.3250713348388672, "learning_rate": 1.3471008948398233e-05, "loss": 0.465, "step": 7390 }, { "epoch": 1.2185216442642186, "grad_norm": 1.2507812976837158, "learning_rate": 1.345433148908009e-05, "loss": 0.4588, "step": 7400 }, { "epoch": 1.2201683785842201, "grad_norm": 1.415877103805542, "learning_rate": 1.3437643113040302e-05, "loss": 0.4691, "step": 7410 }, { "epoch": 1.2218151129042218, "grad_norm": 1.3288524150848389, "learning_rate": 1.3420943873019128e-05, "loss": 0.4664, "step": 7420 }, { "epoch": 1.2234618472242236, "grad_norm": 1.4450100660324097, "learning_rate": 1.3404233821791172e-05, "loss": 0.4613, "step": 7430 }, { "epoch": 1.225108581544225, "grad_norm": 1.3197182416915894, "learning_rate": 1.3387513012165196e-05, "loss": 0.485, "step": 7440 }, { "epoch": 1.2267553158642268, "grad_norm": 1.192579746246338, "learning_rate": 1.3370781496983965e-05, "loss": 0.4967, "step": 7450 }, { "epoch": 1.2284020501842283, "grad_norm": 1.670664668083191, "learning_rate": 1.3354039329124074e-05, "loss": 0.4973, "step": 7460 }, { "epoch": 1.23004878450423, "grad_norm": 1.251954197883606, "learning_rate": 1.3337286561495788e-05, "loss": 0.473, "step": 7470 }, { "epoch": 1.2316955188242318, "grad_norm": 1.7240633964538574, "learning_rate": 1.3320523247042867e-05, "loss": 0.4754, "step": 7480 }, { "epoch": 1.2333422531442333, "grad_norm": 1.3650866746902466, "learning_rate": 1.33037494387424e-05, "loss": 0.4382, "step": 7490 }, { "epoch": 1.234988987464235, "grad_norm": 1.5670567750930786, "learning_rate": 1.3286965189604648e-05, "loss": 0.4662, "step": 7500 }, { "epoch": 1.2366357217842365, "grad_norm": 1.2606995105743408, "learning_rate": 1.3270170552672864e-05, "loss": 0.4672, "step": 7510 }, { "epoch": 1.2382824561042383, "grad_norm": 1.3360075950622559, "learning_rate": 1.3253365581023123e-05, "loss": 0.4842, "step": 7520 }, { "epoch": 1.23992919042424, "grad_norm": 1.5268915891647339, "learning_rate": 1.3236550327764169e-05, "loss": 0.4793, "step": 7530 }, { "epoch": 1.2415759247442415, "grad_norm": 1.3975640535354614, "learning_rate": 1.3219724846037237e-05, "loss": 0.4645, "step": 7540 }, { "epoch": 1.2432226590642432, "grad_norm": 1.4461370706558228, "learning_rate": 1.3202889189015883e-05, "loss": 0.4699, "step": 7550 }, { "epoch": 1.244869393384245, "grad_norm": 1.379309892654419, "learning_rate": 1.3186043409905831e-05, "loss": 0.4732, "step": 7560 }, { "epoch": 1.2465161277042465, "grad_norm": 1.8799365758895874, "learning_rate": 1.3169187561944776e-05, "loss": 0.4716, "step": 7570 }, { "epoch": 1.2481628620242482, "grad_norm": 1.2510124444961548, "learning_rate": 1.3152321698402253e-05, "loss": 0.4666, "step": 7580 }, { "epoch": 1.24980959634425, "grad_norm": 1.4777454137802124, "learning_rate": 1.3135445872579435e-05, "loss": 0.4837, "step": 7590 }, { "epoch": 1.2514563306642514, "grad_norm": 1.3962723016738892, "learning_rate": 1.3118560137808985e-05, "loss": 0.4803, "step": 7600 }, { "epoch": 1.2531030649842532, "grad_norm": 1.7384527921676636, "learning_rate": 1.310166454745488e-05, "loss": 0.4683, "step": 7610 }, { "epoch": 1.2547497993042547, "grad_norm": 1.8363560438156128, "learning_rate": 1.3084759154912243e-05, "loss": 0.4645, "step": 7620 }, { "epoch": 1.2563965336242564, "grad_norm": 1.5209704637527466, "learning_rate": 1.3067844013607179e-05, "loss": 0.4482, "step": 7630 }, { "epoch": 1.2580432679442581, "grad_norm": 3.5697286128997803, "learning_rate": 1.3050919176996592e-05, "loss": 0.4452, "step": 7640 }, { "epoch": 1.2596900022642596, "grad_norm": 1.9864082336425781, "learning_rate": 1.3033984698568038e-05, "loss": 0.4683, "step": 7650 }, { "epoch": 1.2613367365842614, "grad_norm": 2.205308437347412, "learning_rate": 1.3017040631839536e-05, "loss": 0.4605, "step": 7660 }, { "epoch": 1.2629834709042629, "grad_norm": 1.6735590696334839, "learning_rate": 1.3000087030359411e-05, "loss": 0.4727, "step": 7670 }, { "epoch": 1.2646302052242646, "grad_norm": 1.703411340713501, "learning_rate": 1.298312394770612e-05, "loss": 0.4644, "step": 7680 }, { "epoch": 1.2662769395442663, "grad_norm": 1.6092456579208374, "learning_rate": 1.2966151437488078e-05, "loss": 0.4496, "step": 7690 }, { "epoch": 1.2679236738642679, "grad_norm": 2.1162524223327637, "learning_rate": 1.2949169553343504e-05, "loss": 0.4658, "step": 7700 }, { "epoch": 1.2695704081842696, "grad_norm": 1.9072030782699585, "learning_rate": 1.2932178348940231e-05, "loss": 0.4503, "step": 7710 }, { "epoch": 1.271217142504271, "grad_norm": 1.529270887374878, "learning_rate": 1.2915177877975556e-05, "loss": 0.4608, "step": 7720 }, { "epoch": 1.2728638768242728, "grad_norm": 1.5956228971481323, "learning_rate": 1.2898168194176056e-05, "loss": 0.4385, "step": 7730 }, { "epoch": 1.2745106111442746, "grad_norm": 1.6679530143737793, "learning_rate": 1.288114935129742e-05, "loss": 0.4593, "step": 7740 }, { "epoch": 1.2761573454642763, "grad_norm": 1.8094053268432617, "learning_rate": 1.2864121403124288e-05, "loss": 0.4918, "step": 7750 }, { "epoch": 1.2778040797842778, "grad_norm": 2.4337899684906006, "learning_rate": 1.2847084403470076e-05, "loss": 0.4849, "step": 7760 }, { "epoch": 1.2794508141042795, "grad_norm": 1.9572261571884155, "learning_rate": 1.2830038406176804e-05, "loss": 0.4923, "step": 7770 }, { "epoch": 1.281097548424281, "grad_norm": 1.9552801847457886, "learning_rate": 1.2812983465114925e-05, "loss": 0.4959, "step": 7780 }, { "epoch": 1.2827442827442828, "grad_norm": 1.6549162864685059, "learning_rate": 1.2795919634183159e-05, "loss": 0.4571, "step": 7790 }, { "epoch": 1.2843910170642845, "grad_norm": 1.427476167678833, "learning_rate": 1.277884696730832e-05, "loss": 0.4831, "step": 7800 }, { "epoch": 1.286037751384286, "grad_norm": 2.2045860290527344, "learning_rate": 1.2761765518445146e-05, "loss": 0.4709, "step": 7810 }, { "epoch": 1.2876844857042877, "grad_norm": 1.556210994720459, "learning_rate": 1.274467534157613e-05, "loss": 0.4733, "step": 7820 }, { "epoch": 1.2893312200242892, "grad_norm": 1.9513988494873047, "learning_rate": 1.2727576490711344e-05, "loss": 0.4755, "step": 7830 }, { "epoch": 1.290977954344291, "grad_norm": 1.7663830518722534, "learning_rate": 1.2710469019888283e-05, "loss": 0.4407, "step": 7840 }, { "epoch": 1.2926246886642927, "grad_norm": 1.8734620809555054, "learning_rate": 1.2693352983171669e-05, "loss": 0.4683, "step": 7850 }, { "epoch": 1.2942714229842942, "grad_norm": 2.0744926929473877, "learning_rate": 1.2676228434653307e-05, "loss": 0.468, "step": 7860 }, { "epoch": 1.295918157304296, "grad_norm": 1.9999136924743652, "learning_rate": 1.2659095428451898e-05, "loss": 0.4709, "step": 7870 }, { "epoch": 1.2975648916242974, "grad_norm": 1.605631709098816, "learning_rate": 1.2641954018712863e-05, "loss": 0.439, "step": 7880 }, { "epoch": 1.2992116259442992, "grad_norm": 2.568913221359253, "learning_rate": 1.2624804259608194e-05, "loss": 0.4881, "step": 7890 }, { "epoch": 1.300858360264301, "grad_norm": 2.5562288761138916, "learning_rate": 1.2607646205336264e-05, "loss": 0.507, "step": 7900 }, { "epoch": 1.3025050945843026, "grad_norm": 2.0905675888061523, "learning_rate": 1.2590479910121662e-05, "loss": 0.467, "step": 7910 }, { "epoch": 1.3041518289043041, "grad_norm": 2.1957993507385254, "learning_rate": 1.2573305428215017e-05, "loss": 0.4565, "step": 7920 }, { "epoch": 1.3057985632243059, "grad_norm": 1.6840404272079468, "learning_rate": 1.2556122813892834e-05, "loss": 0.4654, "step": 7930 }, { "epoch": 1.3074452975443074, "grad_norm": 1.8692106008529663, "learning_rate": 1.2538932121457322e-05, "loss": 0.452, "step": 7940 }, { "epoch": 1.3090920318643091, "grad_norm": 1.645704746246338, "learning_rate": 1.2521733405236214e-05, "loss": 0.451, "step": 7950 }, { "epoch": 1.3107387661843108, "grad_norm": 1.3794227838516235, "learning_rate": 1.2504526719582596e-05, "loss": 0.479, "step": 7960 }, { "epoch": 1.3123855005043124, "grad_norm": 1.594277262687683, "learning_rate": 1.2487312118874755e-05, "loss": 0.4686, "step": 7970 }, { "epoch": 1.314032234824314, "grad_norm": 2.5194473266601562, "learning_rate": 1.247008965751598e-05, "loss": 0.4672, "step": 7980 }, { "epoch": 1.3156789691443156, "grad_norm": 1.4308621883392334, "learning_rate": 1.2452859389934399e-05, "loss": 0.4429, "step": 7990 }, { "epoch": 1.3173257034643173, "grad_norm": 1.6084959506988525, "learning_rate": 1.2435621370582824e-05, "loss": 0.4557, "step": 8000 }, { "epoch": 1.318972437784319, "grad_norm": 2.098825693130493, "learning_rate": 1.241837565393855e-05, "loss": 0.4538, "step": 8010 }, { "epoch": 1.3206191721043206, "grad_norm": 1.6159589290618896, "learning_rate": 1.2401122294503212e-05, "loss": 0.4345, "step": 8020 }, { "epoch": 1.3222659064243223, "grad_norm": 1.534826636314392, "learning_rate": 1.2383861346802585e-05, "loss": 0.444, "step": 8030 }, { "epoch": 1.3239126407443238, "grad_norm": 1.4128071069717407, "learning_rate": 1.2366592865386433e-05, "loss": 0.4529, "step": 8040 }, { "epoch": 1.3255593750643255, "grad_norm": 1.3641729354858398, "learning_rate": 1.2349316904828327e-05, "loss": 0.4563, "step": 8050 }, { "epoch": 1.3272061093843273, "grad_norm": 1.5807952880859375, "learning_rate": 1.2332033519725474e-05, "loss": 0.4433, "step": 8060 }, { "epoch": 1.328852843704329, "grad_norm": 1.464307188987732, "learning_rate": 1.231474276469855e-05, "loss": 0.4405, "step": 8070 }, { "epoch": 1.3304995780243305, "grad_norm": 1.6941524744033813, "learning_rate": 1.2297444694391509e-05, "loss": 0.4552, "step": 8080 }, { "epoch": 1.3321463123443322, "grad_norm": 1.869858741760254, "learning_rate": 1.2280139363471437e-05, "loss": 0.4543, "step": 8090 }, { "epoch": 1.3337930466643337, "grad_norm": 2.1110851764678955, "learning_rate": 1.2262826826628357e-05, "loss": 0.4671, "step": 8100 }, { "epoch": 1.3354397809843355, "grad_norm": 1.4754462242126465, "learning_rate": 1.2245507138575072e-05, "loss": 0.4646, "step": 8110 }, { "epoch": 1.3370865153043372, "grad_norm": 1.5958442687988281, "learning_rate": 1.2228180354046983e-05, "loss": 0.4384, "step": 8120 }, { "epoch": 1.3387332496243387, "grad_norm": 1.969876766204834, "learning_rate": 1.2210846527801912e-05, "loss": 0.4393, "step": 8130 }, { "epoch": 1.3403799839443404, "grad_norm": 1.1938165426254272, "learning_rate": 1.2193505714619937e-05, "loss": 0.4304, "step": 8140 }, { "epoch": 1.342026718264342, "grad_norm": 1.4246577024459839, "learning_rate": 1.2176157969303227e-05, "loss": 0.4483, "step": 8150 }, { "epoch": 1.3436734525843437, "grad_norm": 1.4756056070327759, "learning_rate": 1.2158803346675845e-05, "loss": 0.4525, "step": 8160 }, { "epoch": 1.3453201869043454, "grad_norm": 1.381611704826355, "learning_rate": 1.2141441901583593e-05, "loss": 0.4407, "step": 8170 }, { "epoch": 1.346966921224347, "grad_norm": 1.3701461553573608, "learning_rate": 1.2124073688893838e-05, "loss": 0.439, "step": 8180 }, { "epoch": 1.3486136555443486, "grad_norm": 2.179842710494995, "learning_rate": 1.2106698763495333e-05, "loss": 0.4494, "step": 8190 }, { "epoch": 1.3502603898643502, "grad_norm": 1.3610347509384155, "learning_rate": 1.2089317180298043e-05, "loss": 0.4301, "step": 8200 }, { "epoch": 1.3519071241843519, "grad_norm": 1.461213231086731, "learning_rate": 1.207192899423297e-05, "loss": 0.4565, "step": 8210 }, { "epoch": 1.3535538585043536, "grad_norm": 1.9947609901428223, "learning_rate": 1.2054534260251995e-05, "loss": 0.4513, "step": 8220 }, { "epoch": 1.3552005928243551, "grad_norm": 1.3805066347122192, "learning_rate": 1.2037133033327679e-05, "loss": 0.4198, "step": 8230 }, { "epoch": 1.3568473271443569, "grad_norm": 1.6667447090148926, "learning_rate": 1.2019725368453111e-05, "loss": 0.4525, "step": 8240 }, { "epoch": 1.3584940614643584, "grad_norm": 1.3600236177444458, "learning_rate": 1.2002311320641722e-05, "loss": 0.4311, "step": 8250 }, { "epoch": 1.36014079578436, "grad_norm": 1.2351371049880981, "learning_rate": 1.1984890944927119e-05, "loss": 0.4625, "step": 8260 }, { "epoch": 1.3617875301043618, "grad_norm": 1.8119263648986816, "learning_rate": 1.1967464296362903e-05, "loss": 0.4306, "step": 8270 }, { "epoch": 1.3634342644243636, "grad_norm": 1.2814642190933228, "learning_rate": 1.1950031430022499e-05, "loss": 0.4399, "step": 8280 }, { "epoch": 1.365080998744365, "grad_norm": 1.4676977396011353, "learning_rate": 1.1932592400998985e-05, "loss": 0.4423, "step": 8290 }, { "epoch": 1.3667277330643668, "grad_norm": 1.8431611061096191, "learning_rate": 1.1915147264404916e-05, "loss": 0.4298, "step": 8300 }, { "epoch": 1.3683744673843683, "grad_norm": 1.3189785480499268, "learning_rate": 1.1897696075372143e-05, "loss": 0.4415, "step": 8310 }, { "epoch": 1.37002120170437, "grad_norm": 1.5338736772537231, "learning_rate": 1.1880238889051647e-05, "loss": 0.4405, "step": 8320 }, { "epoch": 1.3716679360243718, "grad_norm": 2.1897244453430176, "learning_rate": 1.1862775760613365e-05, "loss": 0.4169, "step": 8330 }, { "epoch": 1.3733146703443733, "grad_norm": 1.3329521417617798, "learning_rate": 1.1845306745246012e-05, "loss": 0.4433, "step": 8340 }, { "epoch": 1.374961404664375, "grad_norm": 1.6167490482330322, "learning_rate": 1.1827831898156905e-05, "loss": 0.4594, "step": 8350 }, { "epoch": 1.3766081389843765, "grad_norm": 1.357681393623352, "learning_rate": 1.1810351274571792e-05, "loss": 0.4196, "step": 8360 }, { "epoch": 1.3782548733043782, "grad_norm": 1.3341563940048218, "learning_rate": 1.1792864929734678e-05, "loss": 0.4466, "step": 8370 }, { "epoch": 1.37990160762438, "grad_norm": 1.418643832206726, "learning_rate": 1.1775372918907646e-05, "loss": 0.4178, "step": 8380 }, { "epoch": 1.3815483419443815, "grad_norm": 1.5278998613357544, "learning_rate": 1.1757875297370687e-05, "loss": 0.4381, "step": 8390 }, { "epoch": 1.3831950762643832, "grad_norm": 1.9926173686981201, "learning_rate": 1.174037212042152e-05, "loss": 0.4454, "step": 8400 }, { "epoch": 1.3848418105843847, "grad_norm": 3.1398065090179443, "learning_rate": 1.1722863443375437e-05, "loss": 0.4214, "step": 8410 }, { "epoch": 1.3864885449043864, "grad_norm": 1.9283721446990967, "learning_rate": 1.1705349321565085e-05, "loss": 0.4275, "step": 8420 }, { "epoch": 1.3881352792243882, "grad_norm": 1.2598986625671387, "learning_rate": 1.1687829810340338e-05, "loss": 0.4551, "step": 8430 }, { "epoch": 1.38978201354439, "grad_norm": 1.5143520832061768, "learning_rate": 1.1670304965068098e-05, "loss": 0.4438, "step": 8440 }, { "epoch": 1.3914287478643914, "grad_norm": 2.051283597946167, "learning_rate": 1.165277484113212e-05, "loss": 0.4186, "step": 8450 }, { "epoch": 1.3930754821843931, "grad_norm": 1.9647774696350098, "learning_rate": 1.1635239493932842e-05, "loss": 0.4315, "step": 8460 }, { "epoch": 1.3947222165043947, "grad_norm": 3.153352737426758, "learning_rate": 1.1617698978887214e-05, "loss": 0.4627, "step": 8470 }, { "epoch": 1.3963689508243964, "grad_norm": 2.1228764057159424, "learning_rate": 1.1600153351428516e-05, "loss": 0.4405, "step": 8480 }, { "epoch": 1.3980156851443981, "grad_norm": 1.7035056352615356, "learning_rate": 1.158260266700618e-05, "loss": 0.4512, "step": 8490 }, { "epoch": 1.3996624194643996, "grad_norm": 2.2831528186798096, "learning_rate": 1.1565046981085621e-05, "loss": 0.4442, "step": 8500 }, { "epoch": 1.4013091537844014, "grad_norm": 1.483190894126892, "learning_rate": 1.154748634914807e-05, "loss": 0.4119, "step": 8510 }, { "epoch": 1.4029558881044029, "grad_norm": 1.4663301706314087, "learning_rate": 1.1529920826690375e-05, "loss": 0.4234, "step": 8520 }, { "epoch": 1.4046026224244046, "grad_norm": 2.566152572631836, "learning_rate": 1.1512350469224846e-05, "loss": 0.423, "step": 8530 }, { "epoch": 1.4062493567444063, "grad_norm": 1.1131696701049805, "learning_rate": 1.1494775332279076e-05, "loss": 0.4087, "step": 8540 }, { "epoch": 1.4078960910644078, "grad_norm": 1.7250810861587524, "learning_rate": 1.1477195471395759e-05, "loss": 0.4192, "step": 8550 }, { "epoch": 1.4095428253844096, "grad_norm": 2.6664037704467773, "learning_rate": 1.1459610942132513e-05, "loss": 0.4299, "step": 8560 }, { "epoch": 1.411189559704411, "grad_norm": 1.2339338064193726, "learning_rate": 1.144202180006172e-05, "loss": 0.3896, "step": 8570 }, { "epoch": 1.4128362940244128, "grad_norm": 1.5988686084747314, "learning_rate": 1.1424428100770333e-05, "loss": 0.438, "step": 8580 }, { "epoch": 1.4144830283444145, "grad_norm": 1.4023103713989258, "learning_rate": 1.140682989985971e-05, "loss": 0.4207, "step": 8590 }, { "epoch": 1.4161297626644163, "grad_norm": 1.4158636331558228, "learning_rate": 1.1389227252945434e-05, "loss": 0.4342, "step": 8600 }, { "epoch": 1.4177764969844178, "grad_norm": 1.6897794008255005, "learning_rate": 1.1371620215657136e-05, "loss": 0.4199, "step": 8610 }, { "epoch": 1.4194232313044195, "grad_norm": 2.0550646781921387, "learning_rate": 1.1354008843638331e-05, "loss": 0.411, "step": 8620 }, { "epoch": 1.421069965624421, "grad_norm": 1.6463772058486938, "learning_rate": 1.133639319254622e-05, "loss": 0.4246, "step": 8630 }, { "epoch": 1.4227166999444227, "grad_norm": 1.7673988342285156, "learning_rate": 1.131877331805154e-05, "loss": 0.4296, "step": 8640 }, { "epoch": 1.4243634342644245, "grad_norm": 1.807904601097107, "learning_rate": 1.1301149275838363e-05, "loss": 0.4131, "step": 8650 }, { "epoch": 1.426010168584426, "grad_norm": 1.274460792541504, "learning_rate": 1.1283521121603948e-05, "loss": 0.4114, "step": 8660 }, { "epoch": 1.4276569029044277, "grad_norm": 1.1608306169509888, "learning_rate": 1.1265888911058532e-05, "loss": 0.4429, "step": 8670 }, { "epoch": 1.4293036372244292, "grad_norm": 1.3403079509735107, "learning_rate": 1.124825269992518e-05, "loss": 0.4456, "step": 8680 }, { "epoch": 1.430950371544431, "grad_norm": 1.4677191972732544, "learning_rate": 1.1230612543939603e-05, "loss": 0.4251, "step": 8690 }, { "epoch": 1.4325971058644327, "grad_norm": 1.2995331287384033, "learning_rate": 1.1212968498849973e-05, "loss": 0.4305, "step": 8700 }, { "epoch": 1.4342438401844342, "grad_norm": 1.7009153366088867, "learning_rate": 1.119532062041675e-05, "loss": 0.4071, "step": 8710 }, { "epoch": 1.435890574504436, "grad_norm": 1.6432769298553467, "learning_rate": 1.1177668964412519e-05, "loss": 0.4388, "step": 8720 }, { "epoch": 1.4375373088244374, "grad_norm": 1.4236555099487305, "learning_rate": 1.1160013586621796e-05, "loss": 0.4028, "step": 8730 }, { "epoch": 1.4391840431444392, "grad_norm": 1.7424023151397705, "learning_rate": 1.1142354542840859e-05, "loss": 0.4037, "step": 8740 }, { "epoch": 1.4408307774644409, "grad_norm": 1.2468111515045166, "learning_rate": 1.1124691888877575e-05, "loss": 0.4084, "step": 8750 }, { "epoch": 1.4424775117844426, "grad_norm": 1.6972237825393677, "learning_rate": 1.1107025680551216e-05, "loss": 0.4287, "step": 8760 }, { "epoch": 1.4441242461044441, "grad_norm": 1.772537112236023, "learning_rate": 1.1089355973692292e-05, "loss": 0.4122, "step": 8770 }, { "epoch": 1.4457709804244459, "grad_norm": 2.2410008907318115, "learning_rate": 1.1071682824142365e-05, "loss": 0.4277, "step": 8780 }, { "epoch": 1.4474177147444474, "grad_norm": 1.619391918182373, "learning_rate": 1.1054006287753876e-05, "loss": 0.44, "step": 8790 }, { "epoch": 1.449064449064449, "grad_norm": 1.8790594339370728, "learning_rate": 1.1036326420389978e-05, "loss": 0.4215, "step": 8800 }, { "epoch": 1.4507111833844508, "grad_norm": 2.067535877227783, "learning_rate": 1.1018643277924338e-05, "loss": 0.4136, "step": 8810 }, { "epoch": 1.4523579177044523, "grad_norm": 1.8926339149475098, "learning_rate": 1.1000956916240985e-05, "loss": 0.4328, "step": 8820 }, { "epoch": 1.454004652024454, "grad_norm": 1.6244444847106934, "learning_rate": 1.0983267391234113e-05, "loss": 0.4103, "step": 8830 }, { "epoch": 1.4556513863444556, "grad_norm": 1.3383644819259644, "learning_rate": 1.0965574758807924e-05, "loss": 0.4274, "step": 8840 }, { "epoch": 1.4572981206644573, "grad_norm": 1.4654048681259155, "learning_rate": 1.0947879074876425e-05, "loss": 0.4314, "step": 8850 }, { "epoch": 1.458944854984459, "grad_norm": 1.5433770418167114, "learning_rate": 1.0930180395363275e-05, "loss": 0.439, "step": 8860 }, { "epoch": 1.4605915893044605, "grad_norm": 1.5175681114196777, "learning_rate": 1.0912478776201605e-05, "loss": 0.4175, "step": 8870 }, { "epoch": 1.4622383236244623, "grad_norm": 1.402180552482605, "learning_rate": 1.089477427333383e-05, "loss": 0.389, "step": 8880 }, { "epoch": 1.4638850579444638, "grad_norm": 1.3952786922454834, "learning_rate": 1.0877066942711476e-05, "loss": 0.4107, "step": 8890 }, { "epoch": 1.4655317922644655, "grad_norm": 1.4274441003799438, "learning_rate": 1.0859356840295013e-05, "loss": 0.4036, "step": 8900 }, { "epoch": 1.4671785265844672, "grad_norm": 1.6509827375411987, "learning_rate": 1.0841644022053662e-05, "loss": 0.4023, "step": 8910 }, { "epoch": 1.4688252609044687, "grad_norm": 1.4822527170181274, "learning_rate": 1.0823928543965236e-05, "loss": 0.417, "step": 8920 }, { "epoch": 1.4704719952244705, "grad_norm": 1.6752021312713623, "learning_rate": 1.0806210462015946e-05, "loss": 0.3962, "step": 8930 }, { "epoch": 1.472118729544472, "grad_norm": 2.425339460372925, "learning_rate": 1.0788489832200237e-05, "loss": 0.4551, "step": 8940 }, { "epoch": 1.4737654638644737, "grad_norm": 1.5440939664840698, "learning_rate": 1.0770766710520607e-05, "loss": 0.4006, "step": 8950 }, { "epoch": 1.4754121981844754, "grad_norm": 1.8079001903533936, "learning_rate": 1.075304115298742e-05, "loss": 0.397, "step": 8960 }, { "epoch": 1.4770589325044772, "grad_norm": 1.4747633934020996, "learning_rate": 1.0735313215618748e-05, "loss": 0.4002, "step": 8970 }, { "epoch": 1.4787056668244787, "grad_norm": 1.5912610292434692, "learning_rate": 1.071758295444018e-05, "loss": 0.4153, "step": 8980 }, { "epoch": 1.4803524011444804, "grad_norm": 1.4173518419265747, "learning_rate": 1.069985042548465e-05, "loss": 0.4142, "step": 8990 }, { "epoch": 1.481999135464482, "grad_norm": 1.8204532861709595, "learning_rate": 1.0682115684792256e-05, "loss": 0.4103, "step": 9000 }, { "epoch": 1.4836458697844837, "grad_norm": 1.718644380569458, "learning_rate": 1.0664378788410092e-05, "loss": 0.4099, "step": 9010 }, { "epoch": 1.4852926041044854, "grad_norm": 1.6308566331863403, "learning_rate": 1.0646639792392057e-05, "loss": 0.4108, "step": 9020 }, { "epoch": 1.486939338424487, "grad_norm": 1.60697340965271, "learning_rate": 1.062889875279869e-05, "loss": 0.4109, "step": 9030 }, { "epoch": 1.4885860727444886, "grad_norm": 1.5269246101379395, "learning_rate": 1.0611155725696988e-05, "loss": 0.3809, "step": 9040 }, { "epoch": 1.4902328070644901, "grad_norm": 1.802048921585083, "learning_rate": 1.0593410767160229e-05, "loss": 0.4015, "step": 9050 }, { "epoch": 1.4918795413844919, "grad_norm": 1.5887212753295898, "learning_rate": 1.0575663933267793e-05, "loss": 0.3746, "step": 9060 }, { "epoch": 1.4935262757044936, "grad_norm": 5.430899620056152, "learning_rate": 1.0557915280104987e-05, "loss": 0.3979, "step": 9070 }, { "epoch": 1.495173010024495, "grad_norm": 1.8319224119186401, "learning_rate": 1.0540164863762867e-05, "loss": 0.3903, "step": 9080 }, { "epoch": 1.4968197443444968, "grad_norm": 1.445753574371338, "learning_rate": 1.0522412740338072e-05, "loss": 0.3993, "step": 9090 }, { "epoch": 1.4984664786644983, "grad_norm": 2.215062141418457, "learning_rate": 1.0504658965932617e-05, "loss": 0.4079, "step": 9100 }, { "epoch": 1.5001132129845, "grad_norm": 1.9936473369598389, "learning_rate": 1.0486903596653746e-05, "loss": 0.4127, "step": 9110 }, { "epoch": 1.5017599473045018, "grad_norm": 2.1176092624664307, "learning_rate": 1.0469146688613744e-05, "loss": 0.3993, "step": 9120 }, { "epoch": 1.5034066816245035, "grad_norm": 2.069631576538086, "learning_rate": 1.0451388297929757e-05, "loss": 0.4327, "step": 9130 }, { "epoch": 1.505053415944505, "grad_norm": 1.5769414901733398, "learning_rate": 1.043362848072361e-05, "loss": 0.3866, "step": 9140 }, { "epoch": 1.5067001502645065, "grad_norm": 1.590387225151062, "learning_rate": 1.041586729312165e-05, "loss": 0.3923, "step": 9150 }, { "epoch": 1.5083468845845083, "grad_norm": 1.8121472597122192, "learning_rate": 1.0398104791254542e-05, "loss": 0.3963, "step": 9160 }, { "epoch": 1.50999361890451, "grad_norm": 1.914831280708313, "learning_rate": 1.038034103125711e-05, "loss": 0.3835, "step": 9170 }, { "epoch": 1.5116403532245117, "grad_norm": 1.556657314300537, "learning_rate": 1.0362576069268156e-05, "loss": 0.396, "step": 9180 }, { "epoch": 1.5132870875445132, "grad_norm": 1.9325206279754639, "learning_rate": 1.0344809961430277e-05, "loss": 0.4204, "step": 9190 }, { "epoch": 1.514933821864515, "grad_norm": 2.9902329444885254, "learning_rate": 1.0327042763889692e-05, "loss": 0.4115, "step": 9200 }, { "epoch": 1.5165805561845165, "grad_norm": 1.8379610776901245, "learning_rate": 1.0309274532796064e-05, "loss": 0.3794, "step": 9210 }, { "epoch": 1.5182272905045182, "grad_norm": 1.8868550062179565, "learning_rate": 1.0291505324302322e-05, "loss": 0.4127, "step": 9220 }, { "epoch": 1.51987402482452, "grad_norm": 2.223417282104492, "learning_rate": 1.027373519456449e-05, "loss": 0.4269, "step": 9230 }, { "epoch": 1.5215207591445217, "grad_norm": 2.04238224029541, "learning_rate": 1.0255964199741488e-05, "loss": 0.3928, "step": 9240 }, { "epoch": 1.5231674934645232, "grad_norm": 2.3193910121917725, "learning_rate": 1.0238192395994989e-05, "loss": 0.4056, "step": 9250 }, { "epoch": 1.5248142277845247, "grad_norm": 2.5879781246185303, "learning_rate": 1.022041983948921e-05, "loss": 0.3993, "step": 9260 }, { "epoch": 1.5264609621045264, "grad_norm": 1.9214088916778564, "learning_rate": 1.020264658639075e-05, "loss": 0.3947, "step": 9270 }, { "epoch": 1.5281076964245282, "grad_norm": 1.8478078842163086, "learning_rate": 1.0184872692868409e-05, "loss": 0.4056, "step": 9280 }, { "epoch": 1.5297544307445299, "grad_norm": 3.103062629699707, "learning_rate": 1.016709821509301e-05, "loss": 0.3801, "step": 9290 }, { "epoch": 1.5314011650645314, "grad_norm": 2.530630111694336, "learning_rate": 1.014932320923723e-05, "loss": 0.4195, "step": 9300 }, { "epoch": 1.533047899384533, "grad_norm": 1.8765909671783447, "learning_rate": 1.0131547731475401e-05, "loss": 0.4163, "step": 9310 }, { "epoch": 1.5346946337045346, "grad_norm": 1.9875380992889404, "learning_rate": 1.0113771837983361e-05, "loss": 0.4135, "step": 9320 }, { "epoch": 1.5363413680245364, "grad_norm": 2.6800906658172607, "learning_rate": 1.0095995584938252e-05, "loss": 0.3949, "step": 9330 }, { "epoch": 1.537988102344538, "grad_norm": 2.1471517086029053, "learning_rate": 1.0078219028518359e-05, "loss": 0.4129, "step": 9340 }, { "epoch": 1.5396348366645396, "grad_norm": 2.109405517578125, "learning_rate": 1.0060442224902915e-05, "loss": 0.404, "step": 9350 }, { "epoch": 1.5412815709845413, "grad_norm": 1.336411714553833, "learning_rate": 1.0042665230271947e-05, "loss": 0.423, "step": 9360 }, { "epoch": 1.5429283053045428, "grad_norm": 1.543381690979004, "learning_rate": 1.0024888100806079e-05, "loss": 0.4204, "step": 9370 }, { "epoch": 1.5445750396245446, "grad_norm": 1.7605801820755005, "learning_rate": 1.000711089268636e-05, "loss": 0.3922, "step": 9380 }, { "epoch": 1.5462217739445463, "grad_norm": 1.8124173879623413, "learning_rate": 9.989333662094092e-06, "loss": 0.4036, "step": 9390 }, { "epoch": 1.547868508264548, "grad_norm": 2.485732316970825, "learning_rate": 9.971556465210643e-06, "loss": 0.3985, "step": 9400 }, { "epoch": 1.5495152425845495, "grad_norm": 2.339867353439331, "learning_rate": 9.953779358217281e-06, "loss": 0.4196, "step": 9410 }, { "epoch": 1.551161976904551, "grad_norm": 2.12572979927063, "learning_rate": 9.93600239729499e-06, "loss": 0.3888, "step": 9420 }, { "epoch": 1.5528087112245528, "grad_norm": 2.382894277572632, "learning_rate": 9.918225638624276e-06, "loss": 0.4123, "step": 9430 }, { "epoch": 1.5544554455445545, "grad_norm": 2.5262231826782227, "learning_rate": 9.900449138385026e-06, "loss": 0.425, "step": 9440 }, { "epoch": 1.5561021798645562, "grad_norm": 2.0982043743133545, "learning_rate": 9.882672952756301e-06, "loss": 0.398, "step": 9450 }, { "epoch": 1.5577489141845577, "grad_norm": 1.5875078439712524, "learning_rate": 9.86489713791617e-06, "loss": 0.3729, "step": 9460 }, { "epoch": 1.5593956485045593, "grad_norm": 1.499527096748352, "learning_rate": 9.847121750041532e-06, "loss": 0.4208, "step": 9470 }, { "epoch": 1.561042382824561, "grad_norm": 1.7554186582565308, "learning_rate": 9.829346845307929e-06, "loss": 0.4171, "step": 9480 }, { "epoch": 1.5626891171445627, "grad_norm": 2.1074600219726562, "learning_rate": 9.811572479889387e-06, "loss": 0.4018, "step": 9490 }, { "epoch": 1.5643358514645644, "grad_norm": 3.0306382179260254, "learning_rate": 9.793798709958221e-06, "loss": 0.4319, "step": 9500 }, { "epoch": 1.565982585784566, "grad_norm": 1.5060926675796509, "learning_rate": 9.77602559168486e-06, "loss": 0.3968, "step": 9510 }, { "epoch": 1.5676293201045677, "grad_norm": 1.7786906957626343, "learning_rate": 9.75825318123768e-06, "loss": 0.3867, "step": 9520 }, { "epoch": 1.5692760544245692, "grad_norm": 1.4499342441558838, "learning_rate": 9.740481534782822e-06, "loss": 0.3877, "step": 9530 }, { "epoch": 1.570922788744571, "grad_norm": 1.7993638515472412, "learning_rate": 9.722710708484009e-06, "loss": 0.3935, "step": 9540 }, { "epoch": 1.5725695230645726, "grad_norm": 1.487587571144104, "learning_rate": 9.704940758502367e-06, "loss": 0.4333, "step": 9550 }, { "epoch": 1.5742162573845744, "grad_norm": 1.5965005159378052, "learning_rate": 9.687171740996262e-06, "loss": 0.3824, "step": 9560 }, { "epoch": 1.5758629917045759, "grad_norm": 1.9359855651855469, "learning_rate": 9.669403712121116e-06, "loss": 0.4349, "step": 9570 }, { "epoch": 1.5775097260245774, "grad_norm": 1.7077593803405762, "learning_rate": 9.651636728029205e-06, "loss": 0.4088, "step": 9580 }, { "epoch": 1.5791564603445791, "grad_norm": 1.3524852991104126, "learning_rate": 9.633870844869526e-06, "loss": 0.3674, "step": 9590 }, { "epoch": 1.5808031946645809, "grad_norm": 1.4619336128234863, "learning_rate": 9.616106118787586e-06, "loss": 0.4186, "step": 9600 }, { "epoch": 1.5824499289845826, "grad_norm": 1.5433001518249512, "learning_rate": 9.59834260592524e-06, "loss": 0.3836, "step": 9610 }, { "epoch": 1.584096663304584, "grad_norm": 1.562225580215454, "learning_rate": 9.580580362420505e-06, "loss": 0.3979, "step": 9620 }, { "epoch": 1.5857433976245856, "grad_norm": 1.6424192190170288, "learning_rate": 9.562819444407389e-06, "loss": 0.3839, "step": 9630 }, { "epoch": 1.5873901319445873, "grad_norm": 1.7441072463989258, "learning_rate": 9.545059908015713e-06, "loss": 0.3896, "step": 9640 }, { "epoch": 1.589036866264589, "grad_norm": 1.3649961948394775, "learning_rate": 9.527301809370922e-06, "loss": 0.3891, "step": 9650 }, { "epoch": 1.5906836005845908, "grad_norm": 2.137296676635742, "learning_rate": 9.509545204593928e-06, "loss": 0.3771, "step": 9660 }, { "epoch": 1.5923303349045923, "grad_norm": 3.6009469032287598, "learning_rate": 9.491790149800916e-06, "loss": 0.3759, "step": 9670 }, { "epoch": 1.5939770692245938, "grad_norm": 2.4487457275390625, "learning_rate": 9.474036701103178e-06, "loss": 0.4081, "step": 9680 }, { "epoch": 1.5956238035445955, "grad_norm": 2.1855509281158447, "learning_rate": 9.456284914606924e-06, "loss": 0.394, "step": 9690 }, { "epoch": 1.5972705378645973, "grad_norm": 1.6992751359939575, "learning_rate": 9.438534846413115e-06, "loss": 0.3946, "step": 9700 }, { "epoch": 1.598917272184599, "grad_norm": 1.9765753746032715, "learning_rate": 9.420786552617281e-06, "loss": 0.3921, "step": 9710 }, { "epoch": 1.6005640065046005, "grad_norm": 1.6092065572738647, "learning_rate": 9.40304008930934e-06, "loss": 0.3737, "step": 9720 }, { "epoch": 1.6022107408246022, "grad_norm": 1.9483388662338257, "learning_rate": 9.385295512573436e-06, "loss": 0.4191, "step": 9730 }, { "epoch": 1.6038574751446038, "grad_norm": 1.6057438850402832, "learning_rate": 9.367552878487736e-06, "loss": 0.4186, "step": 9740 }, { "epoch": 1.6055042094646055, "grad_norm": 2.68620228767395, "learning_rate": 9.34981224312428e-06, "loss": 0.3937, "step": 9750 }, { "epoch": 1.6071509437846072, "grad_norm": 2.7004518508911133, "learning_rate": 9.332073662548785e-06, "loss": 0.4057, "step": 9760 }, { "epoch": 1.608797678104609, "grad_norm": 2.4203567504882812, "learning_rate": 9.314337192820477e-06, "loss": 0.3898, "step": 9770 }, { "epoch": 1.6104444124246104, "grad_norm": 1.4761004447937012, "learning_rate": 9.296602889991914e-06, "loss": 0.3759, "step": 9780 }, { "epoch": 1.612091146744612, "grad_norm": 1.881105899810791, "learning_rate": 9.278870810108794e-06, "loss": 0.3927, "step": 9790 }, { "epoch": 1.6137378810646137, "grad_norm": 1.8509386777877808, "learning_rate": 9.261141009209803e-06, "loss": 0.4013, "step": 9800 }, { "epoch": 1.6153846153846154, "grad_norm": 1.8641047477722168, "learning_rate": 9.24341354332642e-06, "loss": 0.3911, "step": 9810 }, { "epoch": 1.6170313497046171, "grad_norm": 1.479648470878601, "learning_rate": 9.225688468482743e-06, "loss": 0.384, "step": 9820 }, { "epoch": 1.6186780840246187, "grad_norm": 1.395717740058899, "learning_rate": 9.207965840695314e-06, "loss": 0.3862, "step": 9830 }, { "epoch": 1.6203248183446202, "grad_norm": 1.989012598991394, "learning_rate": 9.190245715972946e-06, "loss": 0.405, "step": 9840 }, { "epoch": 1.621971552664622, "grad_norm": 2.0621602535247803, "learning_rate": 9.172528150316536e-06, "loss": 0.4259, "step": 9850 }, { "epoch": 1.6236182869846236, "grad_norm": 2.660885810852051, "learning_rate": 9.154813199718893e-06, "loss": 0.3823, "step": 9860 }, { "epoch": 1.6252650213046254, "grad_norm": 2.435926914215088, "learning_rate": 9.137100920164567e-06, "loss": 0.3809, "step": 9870 }, { "epoch": 1.6269117556246269, "grad_norm": 1.566701054573059, "learning_rate": 9.119391367629665e-06, "loss": 0.3987, "step": 9880 }, { "epoch": 1.6285584899446286, "grad_norm": 1.947234034538269, "learning_rate": 9.101684598081672e-06, "loss": 0.3806, "step": 9890 }, { "epoch": 1.63020522426463, "grad_norm": 1.8058415651321411, "learning_rate": 9.083980667479286e-06, "loss": 0.412, "step": 9900 }, { "epoch": 1.6318519585846318, "grad_norm": 1.4982244968414307, "learning_rate": 9.066279631772222e-06, "loss": 0.3814, "step": 9910 }, { "epoch": 1.6334986929046336, "grad_norm": 1.4314496517181396, "learning_rate": 9.048581546901056e-06, "loss": 0.3928, "step": 9920 }, { "epoch": 1.6351454272246353, "grad_norm": 1.6061030626296997, "learning_rate": 9.03088646879703e-06, "loss": 0.3695, "step": 9930 }, { "epoch": 1.6367921615446368, "grad_norm": 1.627259373664856, "learning_rate": 9.013194453381892e-06, "loss": 0.4073, "step": 9940 }, { "epoch": 1.6384388958646383, "grad_norm": 1.8757750988006592, "learning_rate": 8.995505556567707e-06, "loss": 0.3773, "step": 9950 }, { "epoch": 1.64008563018464, "grad_norm": 2.141075372695923, "learning_rate": 8.977819834256683e-06, "loss": 0.3893, "step": 9960 }, { "epoch": 1.6417323645046418, "grad_norm": 1.8441202640533447, "learning_rate": 8.960137342340997e-06, "loss": 0.3921, "step": 9970 }, { "epoch": 1.6433790988246435, "grad_norm": 1.523070216178894, "learning_rate": 8.94245813670262e-06, "loss": 0.3899, "step": 9980 }, { "epoch": 1.645025833144645, "grad_norm": 1.7577736377716064, "learning_rate": 8.924782273213137e-06, "loss": 0.3742, "step": 9990 }, { "epoch": 1.6466725674646465, "grad_norm": 1.6085615158081055, "learning_rate": 8.907109807733559e-06, "loss": 0.3694, "step": 10000 }, { "epoch": 1.6483193017846482, "grad_norm": 2.142134666442871, "learning_rate": 8.889440796114174e-06, "loss": 0.3847, "step": 10010 }, { "epoch": 1.64996603610465, "grad_norm": 1.5217559337615967, "learning_rate": 8.871775294194346e-06, "loss": 0.3493, "step": 10020 }, { "epoch": 1.6516127704246517, "grad_norm": 2.990302085876465, "learning_rate": 8.854113357802353e-06, "loss": 0.367, "step": 10030 }, { "epoch": 1.6532595047446532, "grad_norm": 1.62135648727417, "learning_rate": 8.836455042755197e-06, "loss": 0.3797, "step": 10040 }, { "epoch": 1.654906239064655, "grad_norm": 1.5902869701385498, "learning_rate": 8.818800404858441e-06, "loss": 0.3803, "step": 10050 }, { "epoch": 1.6565529733846565, "grad_norm": 1.5055309534072876, "learning_rate": 8.801149499906032e-06, "loss": 0.379, "step": 10060 }, { "epoch": 1.6581997077046582, "grad_norm": 1.5412495136260986, "learning_rate": 8.78350238368011e-06, "loss": 0.3718, "step": 10070 }, { "epoch": 1.65984644202466, "grad_norm": 1.5508426427841187, "learning_rate": 8.765859111950842e-06, "loss": 0.3957, "step": 10080 }, { "epoch": 1.6614931763446616, "grad_norm": 1.5197268724441528, "learning_rate": 8.74821974047625e-06, "loss": 0.3655, "step": 10090 }, { "epoch": 1.6631399106646632, "grad_norm": 2.0279932022094727, "learning_rate": 8.730584325002031e-06, "loss": 0.3608, "step": 10100 }, { "epoch": 1.6647866449846647, "grad_norm": 1.2136750221252441, "learning_rate": 8.712952921261377e-06, "loss": 0.3698, "step": 10110 }, { "epoch": 1.6664333793046664, "grad_norm": 1.5392953157424927, "learning_rate": 8.695325584974802e-06, "loss": 0.385, "step": 10120 }, { "epoch": 1.6680801136246681, "grad_norm": 1.2899067401885986, "learning_rate": 8.677702371849965e-06, "loss": 0.3637, "step": 10130 }, { "epoch": 1.6697268479446699, "grad_norm": 1.5220967531204224, "learning_rate": 8.6600833375815e-06, "loss": 0.3788, "step": 10140 }, { "epoch": 1.6713735822646714, "grad_norm": 2.6489806175231934, "learning_rate": 8.642468537850822e-06, "loss": 0.3889, "step": 10150 }, { "epoch": 1.6730203165846729, "grad_norm": 2.6164515018463135, "learning_rate": 8.624858028325976e-06, "loss": 0.384, "step": 10160 }, { "epoch": 1.6746670509046746, "grad_norm": 2.1148691177368164, "learning_rate": 8.607251864661443e-06, "loss": 0.3684, "step": 10170 }, { "epoch": 1.6763137852246763, "grad_norm": 1.1956931352615356, "learning_rate": 8.589650102497973e-06, "loss": 0.3512, "step": 10180 }, { "epoch": 1.677960519544678, "grad_norm": 1.5633714199066162, "learning_rate": 8.572052797462403e-06, "loss": 0.3658, "step": 10190 }, { "epoch": 1.6796072538646796, "grad_norm": 1.9797472953796387, "learning_rate": 8.554460005167483e-06, "loss": 0.3643, "step": 10200 }, { "epoch": 1.6812539881846813, "grad_norm": 1.8225812911987305, "learning_rate": 8.536871781211711e-06, "loss": 0.3615, "step": 10210 }, { "epoch": 1.6829007225046828, "grad_norm": 2.6055593490600586, "learning_rate": 8.51928818117913e-06, "loss": 0.3932, "step": 10220 }, { "epoch": 1.6845474568246845, "grad_norm": 2.2658164501190186, "learning_rate": 8.501709260639187e-06, "loss": 0.3912, "step": 10230 }, { "epoch": 1.6861941911446863, "grad_norm": 2.0631959438323975, "learning_rate": 8.484135075146528e-06, "loss": 0.3777, "step": 10240 }, { "epoch": 1.687840925464688, "grad_norm": 2.3120977878570557, "learning_rate": 8.466565680240847e-06, "loss": 0.3918, "step": 10250 }, { "epoch": 1.6894876597846895, "grad_norm": 1.802088737487793, "learning_rate": 8.449001131446687e-06, "loss": 0.3785, "step": 10260 }, { "epoch": 1.691134394104691, "grad_norm": 2.1044161319732666, "learning_rate": 8.431441484273282e-06, "loss": 0.3679, "step": 10270 }, { "epoch": 1.6927811284246927, "grad_norm": 2.071812868118286, "learning_rate": 8.413886794214379e-06, "loss": 0.3771, "step": 10280 }, { "epoch": 1.6944278627446945, "grad_norm": 2.1053099632263184, "learning_rate": 8.396337116748046e-06, "loss": 0.3496, "step": 10290 }, { "epoch": 1.6960745970646962, "grad_norm": 1.8317806720733643, "learning_rate": 8.37879250733652e-06, "loss": 0.3549, "step": 10300 }, { "epoch": 1.6977213313846977, "grad_norm": 2.297271728515625, "learning_rate": 8.361253021426019e-06, "loss": 0.3801, "step": 10310 }, { "epoch": 1.6993680657046992, "grad_norm": 2.0474696159362793, "learning_rate": 8.343718714446572e-06, "loss": 0.3814, "step": 10320 }, { "epoch": 1.701014800024701, "grad_norm": 1.3918821811676025, "learning_rate": 8.326189641811835e-06, "loss": 0.3513, "step": 10330 }, { "epoch": 1.7026615343447027, "grad_norm": 1.5274895429611206, "learning_rate": 8.308665858918928e-06, "loss": 0.3358, "step": 10340 }, { "epoch": 1.7043082686647044, "grad_norm": 2.130195140838623, "learning_rate": 8.291147421148255e-06, "loss": 0.3682, "step": 10350 }, { "epoch": 1.705955002984706, "grad_norm": 1.9873136281967163, "learning_rate": 8.273634383863315e-06, "loss": 0.367, "step": 10360 }, { "epoch": 1.7076017373047077, "grad_norm": 1.433440923690796, "learning_rate": 8.256126802410554e-06, "loss": 0.3572, "step": 10370 }, { "epoch": 1.7092484716247092, "grad_norm": 1.550261378288269, "learning_rate": 8.238624732119169e-06, "loss": 0.383, "step": 10380 }, { "epoch": 1.710895205944711, "grad_norm": 1.8397653102874756, "learning_rate": 8.221128228300941e-06, "loss": 0.3497, "step": 10390 }, { "epoch": 1.7125419402647126, "grad_norm": 1.6404129266738892, "learning_rate": 8.203637346250062e-06, "loss": 0.3514, "step": 10400 }, { "epoch": 1.7141886745847141, "grad_norm": 2.078935146331787, "learning_rate": 8.186152141242957e-06, "loss": 0.3694, "step": 10410 }, { "epoch": 1.7158354089047159, "grad_norm": 1.4735045433044434, "learning_rate": 8.16867266853811e-06, "loss": 0.3602, "step": 10420 }, { "epoch": 1.7174821432247174, "grad_norm": 1.6575242280960083, "learning_rate": 8.15119898337588e-06, "loss": 0.3569, "step": 10430 }, { "epoch": 1.719128877544719, "grad_norm": 1.4052168130874634, "learning_rate": 8.133731140978347e-06, "loss": 0.3703, "step": 10440 }, { "epoch": 1.7207756118647208, "grad_norm": 2.0821497440338135, "learning_rate": 8.116269196549124e-06, "loss": 0.3442, "step": 10450 }, { "epoch": 1.7224223461847226, "grad_norm": 1.4449158906936646, "learning_rate": 8.098813205273183e-06, "loss": 0.3179, "step": 10460 }, { "epoch": 1.724069080504724, "grad_norm": 1.4337159395217896, "learning_rate": 8.081363222316681e-06, "loss": 0.3608, "step": 10470 }, { "epoch": 1.7257158148247256, "grad_norm": 1.587299108505249, "learning_rate": 8.063919302826787e-06, "loss": 0.366, "step": 10480 }, { "epoch": 1.7273625491447273, "grad_norm": 1.4321982860565186, "learning_rate": 8.046481501931516e-06, "loss": 0.36, "step": 10490 }, { "epoch": 1.729009283464729, "grad_norm": 1.3218423128128052, "learning_rate": 8.02904987473953e-06, "loss": 0.3678, "step": 10500 }, { "epoch": 1.7306560177847308, "grad_norm": 1.4841070175170898, "learning_rate": 8.011624476339993e-06, "loss": 0.3565, "step": 10510 }, { "epoch": 1.7323027521047323, "grad_norm": 1.6882789134979248, "learning_rate": 7.99420536180238e-06, "loss": 0.3598, "step": 10520 }, { "epoch": 1.7339494864247338, "grad_norm": 1.6460531949996948, "learning_rate": 7.976792586176311e-06, "loss": 0.3601, "step": 10530 }, { "epoch": 1.7355962207447355, "grad_norm": 2.165046215057373, "learning_rate": 7.959386204491365e-06, "loss": 0.3529, "step": 10540 }, { "epoch": 1.7372429550647372, "grad_norm": 2.083591938018799, "learning_rate": 7.941986271756926e-06, "loss": 0.3664, "step": 10550 }, { "epoch": 1.738889689384739, "grad_norm": 1.4335622787475586, "learning_rate": 7.924592842961985e-06, "loss": 0.3742, "step": 10560 }, { "epoch": 1.7405364237047405, "grad_norm": 1.9920486211776733, "learning_rate": 7.907205973074987e-06, "loss": 0.3652, "step": 10570 }, { "epoch": 1.7421831580247422, "grad_norm": 1.6265625953674316, "learning_rate": 7.889825717043643e-06, "loss": 0.3714, "step": 10580 }, { "epoch": 1.7438298923447437, "grad_norm": 1.6098805665969849, "learning_rate": 7.872452129794765e-06, "loss": 0.3434, "step": 10590 }, { "epoch": 1.7454766266647455, "grad_norm": 1.5913910865783691, "learning_rate": 7.855085266234093e-06, "loss": 0.3601, "step": 10600 }, { "epoch": 1.7471233609847472, "grad_norm": 1.3813248872756958, "learning_rate": 7.837725181246116e-06, "loss": 0.3658, "step": 10610 }, { "epoch": 1.748770095304749, "grad_norm": 1.6209338903427124, "learning_rate": 7.820371929693894e-06, "loss": 0.3515, "step": 10620 }, { "epoch": 1.7504168296247504, "grad_norm": 1.6621341705322266, "learning_rate": 7.803025566418904e-06, "loss": 0.364, "step": 10630 }, { "epoch": 1.752063563944752, "grad_norm": 1.360546588897705, "learning_rate": 7.785686146240844e-06, "loss": 0.3582, "step": 10640 }, { "epoch": 1.7537102982647537, "grad_norm": 2.2365732192993164, "learning_rate": 7.76835372395747e-06, "loss": 0.3538, "step": 10650 }, { "epoch": 1.7553570325847554, "grad_norm": 1.673392415046692, "learning_rate": 7.751028354344432e-06, "loss": 0.3661, "step": 10660 }, { "epoch": 1.7570037669047571, "grad_norm": 1.4482479095458984, "learning_rate": 7.733710092155076e-06, "loss": 0.3468, "step": 10670 }, { "epoch": 1.7586505012247586, "grad_norm": 1.5075653791427612, "learning_rate": 7.716398992120302e-06, "loss": 0.3588, "step": 10680 }, { "epoch": 1.7602972355447601, "grad_norm": 1.881640911102295, "learning_rate": 7.699095108948365e-06, "loss": 0.3538, "step": 10690 }, { "epoch": 1.7619439698647619, "grad_norm": 1.933388590812683, "learning_rate": 7.681798497324717e-06, "loss": 0.3527, "step": 10700 }, { "epoch": 1.7635907041847636, "grad_norm": 1.514220118522644, "learning_rate": 7.664509211911833e-06, "loss": 0.3679, "step": 10710 }, { "epoch": 1.7652374385047653, "grad_norm": 1.9936243295669556, "learning_rate": 7.647227307349024e-06, "loss": 0.3808, "step": 10720 }, { "epoch": 1.7668841728247668, "grad_norm": 1.567679762840271, "learning_rate": 7.629952838252287e-06, "loss": 0.3515, "step": 10730 }, { "epoch": 1.7685309071447686, "grad_norm": 1.4825366735458374, "learning_rate": 7.612685859214113e-06, "loss": 0.3677, "step": 10740 }, { "epoch": 1.77017764146477, "grad_norm": 1.8481963872909546, "learning_rate": 7.59542642480333e-06, "loss": 0.3642, "step": 10750 }, { "epoch": 1.7718243757847718, "grad_norm": 1.9915103912353516, "learning_rate": 7.578174589564911e-06, "loss": 0.3512, "step": 10760 }, { "epoch": 1.7734711101047735, "grad_norm": 2.2940824031829834, "learning_rate": 7.560930408019823e-06, "loss": 0.3713, "step": 10770 }, { "epoch": 1.7751178444247753, "grad_norm": 1.6668184995651245, "learning_rate": 7.543693934664846e-06, "loss": 0.3782, "step": 10780 }, { "epoch": 1.7767645787447768, "grad_norm": 1.4784590005874634, "learning_rate": 7.52646522397239e-06, "loss": 0.3632, "step": 10790 }, { "epoch": 1.7784113130647783, "grad_norm": 1.6654629707336426, "learning_rate": 7.5092443303903404e-06, "loss": 0.351, "step": 10800 }, { "epoch": 1.78005804738478, "grad_norm": 1.913283109664917, "learning_rate": 7.492031308341879e-06, "loss": 0.3557, "step": 10810 }, { "epoch": 1.7817047817047817, "grad_norm": 1.9951850175857544, "learning_rate": 7.474826212225305e-06, "loss": 0.3358, "step": 10820 }, { "epoch": 1.7833515160247835, "grad_norm": 1.5256726741790771, "learning_rate": 7.457629096413874e-06, "loss": 0.3418, "step": 10830 }, { "epoch": 1.784998250344785, "grad_norm": 1.617591142654419, "learning_rate": 7.440440015255625e-06, "loss": 0.3448, "step": 10840 }, { "epoch": 1.7866449846647865, "grad_norm": 1.3453478813171387, "learning_rate": 7.423259023073197e-06, "loss": 0.3479, "step": 10850 }, { "epoch": 1.7882917189847882, "grad_norm": 1.643965721130371, "learning_rate": 7.406086174163665e-06, "loss": 0.3634, "step": 10860 }, { "epoch": 1.78993845330479, "grad_norm": 1.6207489967346191, "learning_rate": 7.388921522798376e-06, "loss": 0.3419, "step": 10870 }, { "epoch": 1.7915851876247917, "grad_norm": 2.043084144592285, "learning_rate": 7.371765123222767e-06, "loss": 0.348, "step": 10880 }, { "epoch": 1.7932319219447932, "grad_norm": 1.35907781124115, "learning_rate": 7.354617029656198e-06, "loss": 0.3344, "step": 10890 }, { "epoch": 1.794878656264795, "grad_norm": 1.5821566581726074, "learning_rate": 7.337477296291778e-06, "loss": 0.3273, "step": 10900 }, { "epoch": 1.7965253905847964, "grad_norm": 1.6671669483184814, "learning_rate": 7.3203459772961924e-06, "loss": 0.3508, "step": 10910 }, { "epoch": 1.7981721249047982, "grad_norm": 1.5016417503356934, "learning_rate": 7.303223126809546e-06, "loss": 0.3348, "step": 10920 }, { "epoch": 1.7998188592248, "grad_norm": 1.4615623950958252, "learning_rate": 7.286108798945162e-06, "loss": 0.3501, "step": 10930 }, { "epoch": 1.8014655935448016, "grad_norm": 1.4260656833648682, "learning_rate": 7.269003047789446e-06, "loss": 0.338, "step": 10940 }, { "epoch": 1.8031123278648031, "grad_norm": 2.318674087524414, "learning_rate": 7.251905927401691e-06, "loss": 0.3422, "step": 10950 }, { "epoch": 1.8047590621848046, "grad_norm": 1.3945016860961914, "learning_rate": 7.234817491813917e-06, "loss": 0.3291, "step": 10960 }, { "epoch": 1.8064057965048064, "grad_norm": 2.036652088165283, "learning_rate": 7.217737795030695e-06, "loss": 0.3412, "step": 10970 }, { "epoch": 1.808052530824808, "grad_norm": 1.9263806343078613, "learning_rate": 7.200666891028983e-06, "loss": 0.3341, "step": 10980 }, { "epoch": 1.8096992651448098, "grad_norm": 1.7770569324493408, "learning_rate": 7.183604833757949e-06, "loss": 0.3375, "step": 10990 }, { "epoch": 1.8113459994648113, "grad_norm": 1.8444048166275024, "learning_rate": 7.166551677138794e-06, "loss": 0.3298, "step": 11000 }, { "epoch": 1.8129927337848128, "grad_norm": 1.5634081363677979, "learning_rate": 7.149507475064606e-06, "loss": 0.337, "step": 11010 }, { "epoch": 1.8146394681048146, "grad_norm": 2.272538900375366, "learning_rate": 7.13247228140016e-06, "loss": 0.3514, "step": 11020 }, { "epoch": 1.8162862024248163, "grad_norm": 1.4847793579101562, "learning_rate": 7.115446149981774e-06, "loss": 0.3442, "step": 11030 }, { "epoch": 1.817932936744818, "grad_norm": 1.7815018892288208, "learning_rate": 7.098429134617117e-06, "loss": 0.3527, "step": 11040 }, { "epoch": 1.8195796710648195, "grad_norm": 1.8198860883712769, "learning_rate": 7.081421289085053e-06, "loss": 0.3289, "step": 11050 }, { "epoch": 1.8212264053848213, "grad_norm": 1.5678714513778687, "learning_rate": 7.06442266713547e-06, "loss": 0.3212, "step": 11060 }, { "epoch": 1.8228731397048228, "grad_norm": 1.6661272048950195, "learning_rate": 7.047433322489094e-06, "loss": 0.3511, "step": 11070 }, { "epoch": 1.8245198740248245, "grad_norm": 1.59157395362854, "learning_rate": 7.030453308837344e-06, "loss": 0.3392, "step": 11080 }, { "epoch": 1.8261666083448262, "grad_norm": 1.9493590593338013, "learning_rate": 7.013482679842145e-06, "loss": 0.3496, "step": 11090 }, { "epoch": 1.827813342664828, "grad_norm": 2.397372245788574, "learning_rate": 6.996521489135768e-06, "loss": 0.3428, "step": 11100 }, { "epoch": 1.8294600769848295, "grad_norm": 2.058790445327759, "learning_rate": 6.979569790320653e-06, "loss": 0.3099, "step": 11110 }, { "epoch": 1.831106811304831, "grad_norm": 1.6316676139831543, "learning_rate": 6.962627636969241e-06, "loss": 0.3536, "step": 11120 }, { "epoch": 1.8327535456248327, "grad_norm": 1.8087751865386963, "learning_rate": 6.945695082623816e-06, "loss": 0.3548, "step": 11130 }, { "epoch": 1.8344002799448345, "grad_norm": 1.6913039684295654, "learning_rate": 6.928772180796308e-06, "loss": 0.3325, "step": 11140 }, { "epoch": 1.8360470142648362, "grad_norm": 1.5240042209625244, "learning_rate": 6.911858984968158e-06, "loss": 0.3381, "step": 11150 }, { "epoch": 1.8376937485848377, "grad_norm": 1.340308427810669, "learning_rate": 6.894955548590128e-06, "loss": 0.3293, "step": 11160 }, { "epoch": 1.8393404829048392, "grad_norm": 1.640592336654663, "learning_rate": 6.878061925082138e-06, "loss": 0.3394, "step": 11170 }, { "epoch": 1.840987217224841, "grad_norm": 1.7119625806808472, "learning_rate": 6.861178167833096e-06, "loss": 0.3495, "step": 11180 }, { "epoch": 1.8426339515448427, "grad_norm": 1.365484356880188, "learning_rate": 6.844304330200728e-06, "loss": 0.3353, "step": 11190 }, { "epoch": 1.8442806858648444, "grad_norm": 2.2884132862091064, "learning_rate": 6.827440465511414e-06, "loss": 0.331, "step": 11200 }, { "epoch": 1.845927420184846, "grad_norm": 1.5407003164291382, "learning_rate": 6.810586627060019e-06, "loss": 0.3182, "step": 11210 }, { "epoch": 1.8475741545048474, "grad_norm": 1.7855136394500732, "learning_rate": 6.793742868109709e-06, "loss": 0.3248, "step": 11220 }, { "epoch": 1.8492208888248491, "grad_norm": 1.5685489177703857, "learning_rate": 6.776909241891809e-06, "loss": 0.3332, "step": 11230 }, { "epoch": 1.8508676231448509, "grad_norm": 2.068676233291626, "learning_rate": 6.76008580160562e-06, "loss": 0.3421, "step": 11240 }, { "epoch": 1.8525143574648526, "grad_norm": 1.9478691816329956, "learning_rate": 6.743272600418246e-06, "loss": 0.3194, "step": 11250 }, { "epoch": 1.854161091784854, "grad_norm": 1.7436619997024536, "learning_rate": 6.726469691464439e-06, "loss": 0.3261, "step": 11260 }, { "epoch": 1.8558078261048558, "grad_norm": 1.7236602306365967, "learning_rate": 6.70967712784642e-06, "loss": 0.3115, "step": 11270 }, { "epoch": 1.8574545604248573, "grad_norm": 1.4117182493209839, "learning_rate": 6.692894962633722e-06, "loss": 0.3181, "step": 11280 }, { "epoch": 1.859101294744859, "grad_norm": 1.5423839092254639, "learning_rate": 6.6761232488630046e-06, "loss": 0.3137, "step": 11290 }, { "epoch": 1.8607480290648608, "grad_norm": 1.5166431665420532, "learning_rate": 6.659362039537907e-06, "loss": 0.3257, "step": 11300 }, { "epoch": 1.8623947633848625, "grad_norm": 1.261831521987915, "learning_rate": 6.6426113876288665e-06, "loss": 0.3134, "step": 11310 }, { "epoch": 1.864041497704864, "grad_norm": 1.5698233842849731, "learning_rate": 6.6258713460729604e-06, "loss": 0.3269, "step": 11320 }, { "epoch": 1.8656882320248656, "grad_norm": 1.4800500869750977, "learning_rate": 6.609141967773733e-06, "loss": 0.3393, "step": 11330 }, { "epoch": 1.8673349663448673, "grad_norm": 1.6218483448028564, "learning_rate": 6.592423305601025e-06, "loss": 0.3542, "step": 11340 }, { "epoch": 1.868981700664869, "grad_norm": 1.5717345476150513, "learning_rate": 6.5757154123908185e-06, "loss": 0.3347, "step": 11350 }, { "epoch": 1.8706284349848707, "grad_norm": 2.0220766067504883, "learning_rate": 6.559018340945051e-06, "loss": 0.3264, "step": 11360 }, { "epoch": 1.8722751693048723, "grad_norm": 1.756557822227478, "learning_rate": 6.542332144031471e-06, "loss": 0.3721, "step": 11370 }, { "epoch": 1.8739219036248738, "grad_norm": 1.273116946220398, "learning_rate": 6.525656874383456e-06, "loss": 0.3272, "step": 11380 }, { "epoch": 1.8755686379448755, "grad_norm": 1.276261568069458, "learning_rate": 6.508992584699849e-06, "loss": 0.3009, "step": 11390 }, { "epoch": 1.8772153722648772, "grad_norm": 1.6424288749694824, "learning_rate": 6.492339327644797e-06, "loss": 0.3341, "step": 11400 }, { "epoch": 1.878862106584879, "grad_norm": 1.6955491304397583, "learning_rate": 6.4756971558475755e-06, "loss": 0.3335, "step": 11410 }, { "epoch": 1.8805088409048805, "grad_norm": 1.661267638206482, "learning_rate": 6.459066121902433e-06, "loss": 0.3396, "step": 11420 }, { "epoch": 1.8821555752248822, "grad_norm": 1.469783067703247, "learning_rate": 6.442446278368411e-06, "loss": 0.2996, "step": 11430 }, { "epoch": 1.8838023095448837, "grad_norm": 1.665208101272583, "learning_rate": 6.425837677769191e-06, "loss": 0.3379, "step": 11440 }, { "epoch": 1.8854490438648854, "grad_norm": 1.6089602708816528, "learning_rate": 6.409240372592926e-06, "loss": 0.3355, "step": 11450 }, { "epoch": 1.8870957781848872, "grad_norm": 1.9932987689971924, "learning_rate": 6.392654415292068e-06, "loss": 0.3335, "step": 11460 }, { "epoch": 1.888742512504889, "grad_norm": 1.4461263418197632, "learning_rate": 6.3760798582832065e-06, "loss": 0.327, "step": 11470 }, { "epoch": 1.8903892468248904, "grad_norm": 1.5308934450149536, "learning_rate": 6.359516753946905e-06, "loss": 0.3283, "step": 11480 }, { "epoch": 1.892035981144892, "grad_norm": 1.4926317930221558, "learning_rate": 6.342965154627534e-06, "loss": 0.3308, "step": 11490 }, { "epoch": 1.8936827154648936, "grad_norm": 1.491185188293457, "learning_rate": 6.326425112633097e-06, "loss": 0.3179, "step": 11500 }, { "epoch": 1.8953294497848954, "grad_norm": 1.4022414684295654, "learning_rate": 6.309896680235082e-06, "loss": 0.331, "step": 11510 }, { "epoch": 1.896976184104897, "grad_norm": 1.5799373388290405, "learning_rate": 6.293379909668282e-06, "loss": 0.3099, "step": 11520 }, { "epoch": 1.8986229184248986, "grad_norm": 1.7698333263397217, "learning_rate": 6.276874853130639e-06, "loss": 0.3459, "step": 11530 }, { "epoch": 1.9002696527449001, "grad_norm": 1.4904412031173706, "learning_rate": 6.2603815627830685e-06, "loss": 0.3258, "step": 11540 }, { "epoch": 1.9019163870649018, "grad_norm": 1.5331462621688843, "learning_rate": 6.2439000907493105e-06, "loss": 0.3347, "step": 11550 }, { "epoch": 1.9035631213849036, "grad_norm": 1.8723499774932861, "learning_rate": 6.227430489115751e-06, "loss": 0.3369, "step": 11560 }, { "epoch": 1.9052098557049053, "grad_norm": 1.607606291770935, "learning_rate": 6.210972809931257e-06, "loss": 0.3183, "step": 11570 }, { "epoch": 1.9068565900249068, "grad_norm": 1.2479761838912964, "learning_rate": 6.194527105207024e-06, "loss": 0.3357, "step": 11580 }, { "epoch": 1.9085033243449085, "grad_norm": 1.7267532348632812, "learning_rate": 6.178093426916403e-06, "loss": 0.3089, "step": 11590 }, { "epoch": 1.91015005866491, "grad_norm": 1.616453766822815, "learning_rate": 6.161671826994739e-06, "loss": 0.3416, "step": 11600 }, { "epoch": 1.9117967929849118, "grad_norm": 1.7664293050765991, "learning_rate": 6.1452623573392e-06, "loss": 0.3323, "step": 11610 }, { "epoch": 1.9134435273049135, "grad_norm": 1.3341567516326904, "learning_rate": 6.128865069808625e-06, "loss": 0.3194, "step": 11620 }, { "epoch": 1.9150902616249152, "grad_norm": 1.6267433166503906, "learning_rate": 6.112480016223352e-06, "loss": 0.321, "step": 11630 }, { "epoch": 1.9167369959449168, "grad_norm": 1.7511682510375977, "learning_rate": 6.0961072483650526e-06, "loss": 0.3216, "step": 11640 }, { "epoch": 1.9183837302649183, "grad_norm": 1.8487275838851929, "learning_rate": 6.0797468179765785e-06, "loss": 0.3226, "step": 11650 }, { "epoch": 1.92003046458492, "grad_norm": 1.737802267074585, "learning_rate": 6.063398776761785e-06, "loss": 0.3123, "step": 11660 }, { "epoch": 1.9216771989049217, "grad_norm": 1.4471075534820557, "learning_rate": 6.047063176385378e-06, "loss": 0.2977, "step": 11670 }, { "epoch": 1.9233239332249235, "grad_norm": 1.8854256868362427, "learning_rate": 6.030740068472745e-06, "loss": 0.3323, "step": 11680 }, { "epoch": 1.924970667544925, "grad_norm": 1.4351447820663452, "learning_rate": 6.014429504609796e-06, "loss": 0.3164, "step": 11690 }, { "epoch": 1.9266174018649265, "grad_norm": 1.6448378562927246, "learning_rate": 5.998131536342792e-06, "loss": 0.3268, "step": 11700 }, { "epoch": 1.9282641361849282, "grad_norm": 1.5195516347885132, "learning_rate": 5.981846215178191e-06, "loss": 0.3031, "step": 11710 }, { "epoch": 1.92991087050493, "grad_norm": 1.6448338031768799, "learning_rate": 5.965573592582488e-06, "loss": 0.3234, "step": 11720 }, { "epoch": 1.9315576048249317, "grad_norm": 1.4545010328292847, "learning_rate": 5.9493137199820376e-06, "loss": 0.3354, "step": 11730 }, { "epoch": 1.9332043391449332, "grad_norm": 1.2834254503250122, "learning_rate": 5.933066648762907e-06, "loss": 0.3229, "step": 11740 }, { "epoch": 1.934851073464935, "grad_norm": 1.7601953744888306, "learning_rate": 5.916832430270705e-06, "loss": 0.3112, "step": 11750 }, { "epoch": 1.9364978077849364, "grad_norm": 1.315958857536316, "learning_rate": 5.900611115810423e-06, "loss": 0.3119, "step": 11760 }, { "epoch": 1.9381445421049381, "grad_norm": 2.057389259338379, "learning_rate": 5.884402756646273e-06, "loss": 0.3206, "step": 11770 }, { "epoch": 1.9397912764249399, "grad_norm": 2.368434190750122, "learning_rate": 5.868207404001518e-06, "loss": 0.3211, "step": 11780 }, { "epoch": 1.9414380107449416, "grad_norm": 1.6587117910385132, "learning_rate": 5.852025109058321e-06, "loss": 0.3078, "step": 11790 }, { "epoch": 1.943084745064943, "grad_norm": 2.1465649604797363, "learning_rate": 5.835855922957583e-06, "loss": 0.3168, "step": 11800 }, { "epoch": 1.9447314793849446, "grad_norm": 2.077082395553589, "learning_rate": 5.819699896798765e-06, "loss": 0.3214, "step": 11810 }, { "epoch": 1.9463782137049463, "grad_norm": 1.8317384719848633, "learning_rate": 5.803557081639757e-06, "loss": 0.3085, "step": 11820 }, { "epoch": 1.948024948024948, "grad_norm": 1.5140388011932373, "learning_rate": 5.787427528496676e-06, "loss": 0.3213, "step": 11830 }, { "epoch": 1.9496716823449498, "grad_norm": 1.7246615886688232, "learning_rate": 5.771311288343748e-06, "loss": 0.3098, "step": 11840 }, { "epoch": 1.9513184166649513, "grad_norm": 2.0293779373168945, "learning_rate": 5.755208412113116e-06, "loss": 0.3206, "step": 11850 }, { "epoch": 1.9529651509849528, "grad_norm": 1.6586174964904785, "learning_rate": 5.739118950694684e-06, "loss": 0.3122, "step": 11860 }, { "epoch": 1.9546118853049546, "grad_norm": 1.7157447338104248, "learning_rate": 5.723042954935968e-06, "loss": 0.2913, "step": 11870 }, { "epoch": 1.9562586196249563, "grad_norm": 1.383748173713684, "learning_rate": 5.7069804756419326e-06, "loss": 0.3248, "step": 11880 }, { "epoch": 1.957905353944958, "grad_norm": 2.3896443843841553, "learning_rate": 5.690931563574813e-06, "loss": 0.3112, "step": 11890 }, { "epoch": 1.9595520882649595, "grad_norm": 1.9655721187591553, "learning_rate": 5.6748962694539855e-06, "loss": 0.3181, "step": 11900 }, { "epoch": 1.961198822584961, "grad_norm": 1.6184086799621582, "learning_rate": 5.6588746439557706e-06, "loss": 0.3272, "step": 11910 }, { "epoch": 1.9628455569049628, "grad_norm": 1.5995012521743774, "learning_rate": 5.642866737713311e-06, "loss": 0.3039, "step": 11920 }, { "epoch": 1.9644922912249645, "grad_norm": 1.8000292778015137, "learning_rate": 5.6268726013163764e-06, "loss": 0.3061, "step": 11930 }, { "epoch": 1.9661390255449662, "grad_norm": 1.651097059249878, "learning_rate": 5.610892285311229e-06, "loss": 0.3, "step": 11940 }, { "epoch": 1.9677857598649677, "grad_norm": 1.8784924745559692, "learning_rate": 5.5949258402004446e-06, "loss": 0.3261, "step": 11950 }, { "epoch": 1.9694324941849695, "grad_norm": 1.6601523160934448, "learning_rate": 5.578973316442779e-06, "loss": 0.3081, "step": 11960 }, { "epoch": 1.971079228504971, "grad_norm": 1.904227614402771, "learning_rate": 5.563034764452976e-06, "loss": 0.3211, "step": 11970 }, { "epoch": 1.9727259628249727, "grad_norm": 1.6444785594940186, "learning_rate": 5.5471102346016385e-06, "loss": 0.3315, "step": 11980 }, { "epoch": 1.9743726971449744, "grad_norm": 1.8297940492630005, "learning_rate": 5.531199777215044e-06, "loss": 0.2992, "step": 11990 }, { "epoch": 1.9760194314649762, "grad_norm": 1.717757225036621, "learning_rate": 5.515303442574997e-06, "loss": 0.3323, "step": 12000 }, { "epoch": 1.9776661657849777, "grad_norm": 1.6197963953018188, "learning_rate": 5.499421280918682e-06, "loss": 0.3401, "step": 12010 }, { "epoch": 1.9793129001049792, "grad_norm": 1.5700541734695435, "learning_rate": 5.4835533424384825e-06, "loss": 0.3204, "step": 12020 }, { "epoch": 1.980959634424981, "grad_norm": 1.5739840269088745, "learning_rate": 5.467699677281828e-06, "loss": 0.3115, "step": 12030 }, { "epoch": 1.9826063687449826, "grad_norm": 1.8662614822387695, "learning_rate": 5.451860335551056e-06, "loss": 0.3207, "step": 12040 }, { "epoch": 1.9842531030649844, "grad_norm": 1.725319504737854, "learning_rate": 5.4360353673032185e-06, "loss": 0.3038, "step": 12050 }, { "epoch": 1.9858998373849859, "grad_norm": 1.7426807880401611, "learning_rate": 5.420224822549963e-06, "loss": 0.3012, "step": 12060 }, { "epoch": 1.9875465717049874, "grad_norm": 1.5318362712860107, "learning_rate": 5.404428751257339e-06, "loss": 0.3037, "step": 12070 }, { "epoch": 1.9891933060249891, "grad_norm": 1.514984369277954, "learning_rate": 5.388647203345659e-06, "loss": 0.3329, "step": 12080 }, { "epoch": 1.9908400403449908, "grad_norm": 1.3293901681900024, "learning_rate": 5.372880228689341e-06, "loss": 0.3109, "step": 12090 }, { "epoch": 1.9924867746649926, "grad_norm": 1.5703567266464233, "learning_rate": 5.357127877116743e-06, "loss": 0.3262, "step": 12100 }, { "epoch": 1.994133508984994, "grad_norm": 1.395535945892334, "learning_rate": 5.3413901984100195e-06, "loss": 0.3314, "step": 12110 }, { "epoch": 1.9957802433049958, "grad_norm": 1.7006887197494507, "learning_rate": 5.3256672423049396e-06, "loss": 0.3132, "step": 12120 }, { "epoch": 1.9974269776249973, "grad_norm": 1.6310688257217407, "learning_rate": 5.309959058490754e-06, "loss": 0.2896, "step": 12130 }, { "epoch": 1.999073711944999, "grad_norm": 1.698585033416748, "learning_rate": 5.294265696610022e-06, "loss": 0.3006, "step": 12140 }, { "epoch": 2.0008233671600006, "grad_norm": 1.5965461730957031, "learning_rate": 5.2785872062584705e-06, "loss": 0.3434, "step": 12150 }, { "epoch": 2.0024701014800024, "grad_norm": 1.6904765367507935, "learning_rate": 5.262923636984818e-06, "loss": 0.2823, "step": 12160 }, { "epoch": 2.004116835800004, "grad_norm": 1.2032884359359741, "learning_rate": 5.24727503829064e-06, "loss": 0.2712, "step": 12170 }, { "epoch": 2.005763570120006, "grad_norm": 1.4353348016738892, "learning_rate": 5.2316414596301855e-06, "loss": 0.285, "step": 12180 }, { "epoch": 2.0074103044400076, "grad_norm": 1.5126605033874512, "learning_rate": 5.216022950410251e-06, "loss": 0.2763, "step": 12190 }, { "epoch": 2.009057038760009, "grad_norm": 2.8418431282043457, "learning_rate": 5.2004195599899966e-06, "loss": 0.2863, "step": 12200 }, { "epoch": 2.0107037730800106, "grad_norm": 2.0845654010772705, "learning_rate": 5.1848313376808065e-06, "loss": 0.29, "step": 12210 }, { "epoch": 2.0123505074000123, "grad_norm": 1.393932580947876, "learning_rate": 5.16925833274613e-06, "loss": 0.2904, "step": 12220 }, { "epoch": 2.013997241720014, "grad_norm": 1.4617998600006104, "learning_rate": 5.153700594401328e-06, "loss": 0.2663, "step": 12230 }, { "epoch": 2.0156439760400158, "grad_norm": 1.865277886390686, "learning_rate": 5.138158171813507e-06, "loss": 0.2876, "step": 12240 }, { "epoch": 2.0172907103600175, "grad_norm": 1.4628126621246338, "learning_rate": 5.12263111410138e-06, "loss": 0.2608, "step": 12250 }, { "epoch": 2.018937444680019, "grad_norm": 1.8890234231948853, "learning_rate": 5.107119470335093e-06, "loss": 0.2869, "step": 12260 }, { "epoch": 2.0205841790000205, "grad_norm": 2.153687000274658, "learning_rate": 5.091623289536095e-06, "loss": 0.2778, "step": 12270 }, { "epoch": 2.0222309133200222, "grad_norm": 1.9289218187332153, "learning_rate": 5.076142620676941e-06, "loss": 0.3025, "step": 12280 }, { "epoch": 2.023877647640024, "grad_norm": 1.6347593069076538, "learning_rate": 5.060677512681187e-06, "loss": 0.2692, "step": 12290 }, { "epoch": 2.0255243819600257, "grad_norm": 2.1839990615844727, "learning_rate": 5.045228014423203e-06, "loss": 0.2843, "step": 12300 }, { "epoch": 2.027171116280027, "grad_norm": 2.4028539657592773, "learning_rate": 5.029794174728031e-06, "loss": 0.2798, "step": 12310 }, { "epoch": 2.0288178506000287, "grad_norm": 2.70993709564209, "learning_rate": 5.014376042371221e-06, "loss": 0.2773, "step": 12320 }, { "epoch": 2.0304645849200305, "grad_norm": 1.8589779138565063, "learning_rate": 4.998973666078692e-06, "loss": 0.2775, "step": 12330 }, { "epoch": 2.032111319240032, "grad_norm": 1.971356749534607, "learning_rate": 4.983587094526556e-06, "loss": 0.2776, "step": 12340 }, { "epoch": 2.033758053560034, "grad_norm": 1.5491329431533813, "learning_rate": 4.9682163763410005e-06, "loss": 0.2729, "step": 12350 }, { "epoch": 2.035404787880035, "grad_norm": 1.7983886003494263, "learning_rate": 4.952861560098079e-06, "loss": 0.2835, "step": 12360 }, { "epoch": 2.037051522200037, "grad_norm": 2.259243965148926, "learning_rate": 4.937522694323618e-06, "loss": 0.3009, "step": 12370 }, { "epoch": 2.0386982565200387, "grad_norm": 2.2802488803863525, "learning_rate": 4.922199827493022e-06, "loss": 0.2921, "step": 12380 }, { "epoch": 2.0403449908400404, "grad_norm": 1.9333369731903076, "learning_rate": 4.906893008031141e-06, "loss": 0.2746, "step": 12390 }, { "epoch": 2.041991725160042, "grad_norm": 1.7847884893417358, "learning_rate": 4.8916022843121e-06, "loss": 0.2816, "step": 12400 }, { "epoch": 2.043638459480044, "grad_norm": 1.5014792680740356, "learning_rate": 4.876327704659172e-06, "loss": 0.2897, "step": 12410 }, { "epoch": 2.045285193800045, "grad_norm": 2.1002280712127686, "learning_rate": 4.861069317344598e-06, "loss": 0.2744, "step": 12420 }, { "epoch": 2.046931928120047, "grad_norm": 1.5581461191177368, "learning_rate": 4.845827170589449e-06, "loss": 0.2886, "step": 12430 }, { "epoch": 2.0485786624400486, "grad_norm": 1.6326346397399902, "learning_rate": 4.830601312563469e-06, "loss": 0.2859, "step": 12440 }, { "epoch": 2.0502253967600503, "grad_norm": 1.5935137271881104, "learning_rate": 4.815391791384933e-06, "loss": 0.2932, "step": 12450 }, { "epoch": 2.051872131080052, "grad_norm": 1.5060147047042847, "learning_rate": 4.800198655120478e-06, "loss": 0.2899, "step": 12460 }, { "epoch": 2.0535188654000534, "grad_norm": 1.7694473266601562, "learning_rate": 4.785021951784967e-06, "loss": 0.2852, "step": 12470 }, { "epoch": 2.055165599720055, "grad_norm": 1.579512596130371, "learning_rate": 4.76986172934132e-06, "loss": 0.2837, "step": 12480 }, { "epoch": 2.056812334040057, "grad_norm": 1.4137424230575562, "learning_rate": 4.7547180357003885e-06, "loss": 0.2793, "step": 12490 }, { "epoch": 2.0584590683600585, "grad_norm": 1.5642904043197632, "learning_rate": 4.739590918720765e-06, "loss": 0.2766, "step": 12500 }, { "epoch": 2.0601058026800603, "grad_norm": 2.033942937850952, "learning_rate": 4.724480426208678e-06, "loss": 0.3038, "step": 12510 }, { "epoch": 2.0617525370000616, "grad_norm": 1.408429503440857, "learning_rate": 4.709386605917798e-06, "loss": 0.2747, "step": 12520 }, { "epoch": 2.0633992713200633, "grad_norm": 1.467748761177063, "learning_rate": 4.694309505549128e-06, "loss": 0.2833, "step": 12530 }, { "epoch": 2.065046005640065, "grad_norm": 2.374821662902832, "learning_rate": 4.6792491727508076e-06, "loss": 0.2828, "step": 12540 }, { "epoch": 2.0666927399600667, "grad_norm": 2.1544690132141113, "learning_rate": 4.664205655118006e-06, "loss": 0.2872, "step": 12550 }, { "epoch": 2.0683394742800685, "grad_norm": 1.5849260091781616, "learning_rate": 4.6491790001927385e-06, "loss": 0.2748, "step": 12560 }, { "epoch": 2.0699862086000698, "grad_norm": 1.403977394104004, "learning_rate": 4.634169255463734e-06, "loss": 0.2899, "step": 12570 }, { "epoch": 2.0716329429200715, "grad_norm": 1.8115674257278442, "learning_rate": 4.619176468366274e-06, "loss": 0.2723, "step": 12580 }, { "epoch": 2.0732796772400732, "grad_norm": 1.3875244855880737, "learning_rate": 4.604200686282063e-06, "loss": 0.2661, "step": 12590 }, { "epoch": 2.074926411560075, "grad_norm": 2.000905990600586, "learning_rate": 4.5892419565390486e-06, "loss": 0.2808, "step": 12600 }, { "epoch": 2.0765731458800767, "grad_norm": 2.43119740486145, "learning_rate": 4.5743003264113015e-06, "loss": 0.2826, "step": 12610 }, { "epoch": 2.0782198802000784, "grad_norm": 3.4667115211486816, "learning_rate": 4.559375843118839e-06, "loss": 0.2803, "step": 12620 }, { "epoch": 2.0798666145200797, "grad_norm": 1.7227445840835571, "learning_rate": 4.544468553827508e-06, "loss": 0.2836, "step": 12630 }, { "epoch": 2.0815133488400814, "grad_norm": 1.7388190031051636, "learning_rate": 4.529578505648789e-06, "loss": 0.288, "step": 12640 }, { "epoch": 2.083160083160083, "grad_norm": 1.63683021068573, "learning_rate": 4.514705745639706e-06, "loss": 0.2896, "step": 12650 }, { "epoch": 2.084806817480085, "grad_norm": 2.0901145935058594, "learning_rate": 4.499850320802623e-06, "loss": 0.2793, "step": 12660 }, { "epoch": 2.0864535518000866, "grad_norm": 2.0766639709472656, "learning_rate": 4.485012278085139e-06, "loss": 0.2731, "step": 12670 }, { "epoch": 2.088100286120088, "grad_norm": 1.5726747512817383, "learning_rate": 4.470191664379903e-06, "loss": 0.2751, "step": 12680 }, { "epoch": 2.0897470204400896, "grad_norm": 2.0181379318237305, "learning_rate": 4.455388526524498e-06, "loss": 0.2992, "step": 12690 }, { "epoch": 2.0913937547600914, "grad_norm": 2.143312692642212, "learning_rate": 4.440602911301267e-06, "loss": 0.2939, "step": 12700 }, { "epoch": 2.093040489080093, "grad_norm": 1.3811988830566406, "learning_rate": 4.425834865437184e-06, "loss": 0.2795, "step": 12710 }, { "epoch": 2.094687223400095, "grad_norm": 1.4296287298202515, "learning_rate": 4.411084435603688e-06, "loss": 0.2831, "step": 12720 }, { "epoch": 2.096333957720096, "grad_norm": 1.4301252365112305, "learning_rate": 4.396351668416562e-06, "loss": 0.2963, "step": 12730 }, { "epoch": 2.097980692040098, "grad_norm": 2.8089404106140137, "learning_rate": 4.3816366104357545e-06, "loss": 0.2935, "step": 12740 }, { "epoch": 2.0996274263600996, "grad_norm": 1.936610460281372, "learning_rate": 4.366939308165259e-06, "loss": 0.3008, "step": 12750 }, { "epoch": 2.1012741606801013, "grad_norm": 1.634929895401001, "learning_rate": 4.352259808052944e-06, "loss": 0.2853, "step": 12760 }, { "epoch": 2.102920895000103, "grad_norm": 1.791841745376587, "learning_rate": 4.337598156490435e-06, "loss": 0.2583, "step": 12770 }, { "epoch": 2.1045676293201048, "grad_norm": 1.6003674268722534, "learning_rate": 4.322954399812932e-06, "loss": 0.2787, "step": 12780 }, { "epoch": 2.106214363640106, "grad_norm": 1.8749767541885376, "learning_rate": 4.308328584299092e-06, "loss": 0.2829, "step": 12790 }, { "epoch": 2.107861097960108, "grad_norm": 1.6271084547042847, "learning_rate": 4.293720756170868e-06, "loss": 0.2713, "step": 12800 }, { "epoch": 2.1095078322801095, "grad_norm": 1.9377316236495972, "learning_rate": 4.2791309615933764e-06, "loss": 0.2913, "step": 12810 }, { "epoch": 2.1111545666001112, "grad_norm": 1.353582501411438, "learning_rate": 4.264559246674727e-06, "loss": 0.2805, "step": 12820 }, { "epoch": 2.112801300920113, "grad_norm": 1.5382156372070312, "learning_rate": 4.2500056574659135e-06, "loss": 0.2853, "step": 12830 }, { "epoch": 2.1144480352401143, "grad_norm": 1.6927893161773682, "learning_rate": 4.235470239960623e-06, "loss": 0.2789, "step": 12840 }, { "epoch": 2.116094769560116, "grad_norm": 1.7223583459854126, "learning_rate": 4.2209530400951335e-06, "loss": 0.2786, "step": 12850 }, { "epoch": 2.1177415038801177, "grad_norm": 1.528069257736206, "learning_rate": 4.206454103748142e-06, "loss": 0.2854, "step": 12860 }, { "epoch": 2.1193882382001195, "grad_norm": 1.9811831712722778, "learning_rate": 4.191973476740628e-06, "loss": 0.2687, "step": 12870 }, { "epoch": 2.121034972520121, "grad_norm": 1.6877635717391968, "learning_rate": 4.1775112048357e-06, "loss": 0.2725, "step": 12880 }, { "epoch": 2.1226817068401225, "grad_norm": 1.6502715349197388, "learning_rate": 4.163067333738479e-06, "loss": 0.2712, "step": 12890 }, { "epoch": 2.124328441160124, "grad_norm": 1.460304617881775, "learning_rate": 4.148641909095911e-06, "loss": 0.2653, "step": 12900 }, { "epoch": 2.125975175480126, "grad_norm": 1.7706825733184814, "learning_rate": 4.134234976496666e-06, "loss": 0.2853, "step": 12910 }, { "epoch": 2.1276219098001277, "grad_norm": 1.4802557229995728, "learning_rate": 4.11984658147096e-06, "loss": 0.2824, "step": 12920 }, { "epoch": 2.1292686441201294, "grad_norm": 1.5747597217559814, "learning_rate": 4.105476769490424e-06, "loss": 0.268, "step": 12930 }, { "epoch": 2.1309153784401307, "grad_norm": 2.0587172508239746, "learning_rate": 4.091125585967975e-06, "loss": 0.2962, "step": 12940 }, { "epoch": 2.1325621127601324, "grad_norm": 1.5220030546188354, "learning_rate": 4.0767930762576415e-06, "loss": 0.2717, "step": 12950 }, { "epoch": 2.134208847080134, "grad_norm": 1.6755307912826538, "learning_rate": 4.0624792856544505e-06, "loss": 0.2795, "step": 12960 }, { "epoch": 2.135855581400136, "grad_norm": 1.359605073928833, "learning_rate": 4.0481842593942636e-06, "loss": 0.2689, "step": 12970 }, { "epoch": 2.1375023157201376, "grad_norm": 1.5714139938354492, "learning_rate": 4.033908042653639e-06, "loss": 0.2772, "step": 12980 }, { "epoch": 2.1391490500401393, "grad_norm": 1.4556095600128174, "learning_rate": 4.019650680549704e-06, "loss": 0.276, "step": 12990 }, { "epoch": 2.1407957843601406, "grad_norm": 1.8429503440856934, "learning_rate": 4.005412218139986e-06, "loss": 0.2829, "step": 13000 }, { "epoch": 2.1424425186801423, "grad_norm": 1.6822142601013184, "learning_rate": 3.991192700422286e-06, "loss": 0.2829, "step": 13010 }, { "epoch": 2.144089253000144, "grad_norm": 1.859412670135498, "learning_rate": 3.976992172334544e-06, "loss": 0.2764, "step": 13020 }, { "epoch": 2.145735987320146, "grad_norm": 1.6306754350662231, "learning_rate": 3.962810678754674e-06, "loss": 0.2731, "step": 13030 }, { "epoch": 2.1473827216401475, "grad_norm": 1.5259572267532349, "learning_rate": 3.948648264500445e-06, "loss": 0.2608, "step": 13040 }, { "epoch": 2.149029455960149, "grad_norm": 1.6222518682479858, "learning_rate": 3.934504974329326e-06, "loss": 0.299, "step": 13050 }, { "epoch": 2.1506761902801506, "grad_norm": 1.880744457244873, "learning_rate": 3.920380852938348e-06, "loss": 0.2744, "step": 13060 }, { "epoch": 2.1523229246001523, "grad_norm": 1.6208007335662842, "learning_rate": 3.906275944963957e-06, "loss": 0.2817, "step": 13070 }, { "epoch": 2.153969658920154, "grad_norm": 1.4789073467254639, "learning_rate": 3.892190294981893e-06, "loss": 0.2813, "step": 13080 }, { "epoch": 2.1556163932401557, "grad_norm": 1.7686960697174072, "learning_rate": 3.8781239475070194e-06, "loss": 0.2651, "step": 13090 }, { "epoch": 2.1572631275601575, "grad_norm": 1.6891413927078247, "learning_rate": 3.864076946993215e-06, "loss": 0.2503, "step": 13100 }, { "epoch": 2.1589098618801588, "grad_norm": 1.6192152500152588, "learning_rate": 3.850049337833196e-06, "loss": 0.2693, "step": 13110 }, { "epoch": 2.1605565962001605, "grad_norm": 1.6435240507125854, "learning_rate": 3.836041164358416e-06, "loss": 0.2769, "step": 13120 }, { "epoch": 2.1622033305201622, "grad_norm": 1.7345331907272339, "learning_rate": 3.822052470838893e-06, "loss": 0.2804, "step": 13130 }, { "epoch": 2.163850064840164, "grad_norm": 1.42905855178833, "learning_rate": 3.8080833014830865e-06, "loss": 0.2749, "step": 13140 }, { "epoch": 2.1654967991601657, "grad_norm": 1.6597148180007935, "learning_rate": 3.7941337004377497e-06, "loss": 0.2748, "step": 13150 }, { "epoch": 2.167143533480167, "grad_norm": 1.6788873672485352, "learning_rate": 3.7802037117878053e-06, "loss": 0.2832, "step": 13160 }, { "epoch": 2.1687902678001687, "grad_norm": 1.761786699295044, "learning_rate": 3.7662933795561805e-06, "loss": 0.2943, "step": 13170 }, { "epoch": 2.1704370021201704, "grad_norm": 1.6826972961425781, "learning_rate": 3.7524027477036974e-06, "loss": 0.2763, "step": 13180 }, { "epoch": 2.172083736440172, "grad_norm": 1.4096064567565918, "learning_rate": 3.7385318601289034e-06, "loss": 0.2904, "step": 13190 }, { "epoch": 2.173730470760174, "grad_norm": 2.226126194000244, "learning_rate": 3.724680760667967e-06, "loss": 0.2656, "step": 13200 }, { "epoch": 2.175377205080175, "grad_norm": 1.3945817947387695, "learning_rate": 3.7108494930944937e-06, "loss": 0.2762, "step": 13210 }, { "epoch": 2.177023939400177, "grad_norm": 1.428282618522644, "learning_rate": 3.697038101119442e-06, "loss": 0.2728, "step": 13220 }, { "epoch": 2.1786706737201786, "grad_norm": 1.9224436283111572, "learning_rate": 3.6832466283909386e-06, "loss": 0.2577, "step": 13230 }, { "epoch": 2.1803174080401804, "grad_norm": 2.1055612564086914, "learning_rate": 3.6694751184941712e-06, "loss": 0.2695, "step": 13240 }, { "epoch": 2.181964142360182, "grad_norm": 2.1279563903808594, "learning_rate": 3.6557236149512276e-06, "loss": 0.2871, "step": 13250 }, { "epoch": 2.1836108766801834, "grad_norm": 2.5193889141082764, "learning_rate": 3.641992161220983e-06, "loss": 0.2814, "step": 13260 }, { "epoch": 2.185257611000185, "grad_norm": 1.3304851055145264, "learning_rate": 3.628280800698939e-06, "loss": 0.2704, "step": 13270 }, { "epoch": 2.186904345320187, "grad_norm": 1.5844635963439941, "learning_rate": 3.6145895767171e-06, "loss": 0.2627, "step": 13280 }, { "epoch": 2.1885510796401886, "grad_norm": 1.840064287185669, "learning_rate": 3.6009185325438278e-06, "loss": 0.2649, "step": 13290 }, { "epoch": 2.1901978139601903, "grad_norm": 1.7065536975860596, "learning_rate": 3.5872677113837227e-06, "loss": 0.2494, "step": 13300 }, { "epoch": 2.191844548280192, "grad_norm": 1.525583267211914, "learning_rate": 3.5736371563774587e-06, "loss": 0.2716, "step": 13310 }, { "epoch": 2.1934912826001933, "grad_norm": 1.5239418745040894, "learning_rate": 3.560026910601678e-06, "loss": 0.272, "step": 13320 }, { "epoch": 2.195138016920195, "grad_norm": 1.7140849828720093, "learning_rate": 3.5464370170688244e-06, "loss": 0.2802, "step": 13330 }, { "epoch": 2.196784751240197, "grad_norm": 1.8460406064987183, "learning_rate": 3.53286751872704e-06, "loss": 0.2674, "step": 13340 }, { "epoch": 2.1984314855601985, "grad_norm": 1.5761222839355469, "learning_rate": 3.519318458459988e-06, "loss": 0.2673, "step": 13350 }, { "epoch": 2.2000782198802002, "grad_norm": 1.5991119146347046, "learning_rate": 3.5057898790867673e-06, "loss": 0.2904, "step": 13360 }, { "epoch": 2.2017249542002015, "grad_norm": 1.726907730102539, "learning_rate": 3.4922818233617295e-06, "loss": 0.2729, "step": 13370 }, { "epoch": 2.2033716885202033, "grad_norm": 1.3864545822143555, "learning_rate": 3.4787943339743836e-06, "loss": 0.2753, "step": 13380 }, { "epoch": 2.205018422840205, "grad_norm": 2.468479871749878, "learning_rate": 3.4653274535492255e-06, "loss": 0.2777, "step": 13390 }, { "epoch": 2.2066651571602067, "grad_norm": 1.603942632675171, "learning_rate": 3.451881224645637e-06, "loss": 0.2721, "step": 13400 }, { "epoch": 2.2083118914802085, "grad_norm": 1.7777491807937622, "learning_rate": 3.4384556897577183e-06, "loss": 0.2949, "step": 13410 }, { "epoch": 2.20995862580021, "grad_norm": 1.8029536008834839, "learning_rate": 3.425050891314191e-06, "loss": 0.272, "step": 13420 }, { "epoch": 2.2116053601202115, "grad_norm": 1.74064302444458, "learning_rate": 3.4116668716782164e-06, "loss": 0.267, "step": 13430 }, { "epoch": 2.213252094440213, "grad_norm": 1.70004403591156, "learning_rate": 3.398303673147314e-06, "loss": 0.2528, "step": 13440 }, { "epoch": 2.214898828760215, "grad_norm": 1.407127022743225, "learning_rate": 3.3849613379531865e-06, "loss": 0.27, "step": 13450 }, { "epoch": 2.2165455630802167, "grad_norm": 2.015048027038574, "learning_rate": 3.371639908261611e-06, "loss": 0.2576, "step": 13460 }, { "epoch": 2.2181922974002184, "grad_norm": 2.1229333877563477, "learning_rate": 3.35833942617229e-06, "loss": 0.2779, "step": 13470 }, { "epoch": 2.2198390317202197, "grad_norm": 1.634856939315796, "learning_rate": 3.3450599337187326e-06, "loss": 0.2564, "step": 13480 }, { "epoch": 2.2214857660402214, "grad_norm": 1.7512359619140625, "learning_rate": 3.3318014728681104e-06, "loss": 0.2773, "step": 13490 }, { "epoch": 2.223132500360223, "grad_norm": 1.8369240760803223, "learning_rate": 3.3185640855211255e-06, "loss": 0.2501, "step": 13500 }, { "epoch": 2.224779234680225, "grad_norm": 2.120023012161255, "learning_rate": 3.3053478135118845e-06, "loss": 0.2836, "step": 13510 }, { "epoch": 2.2264259690002266, "grad_norm": 1.684515118598938, "learning_rate": 3.292152698607768e-06, "loss": 0.2638, "step": 13520 }, { "epoch": 2.228072703320228, "grad_norm": 1.8493366241455078, "learning_rate": 3.2789787825092834e-06, "loss": 0.2579, "step": 13530 }, { "epoch": 2.2297194376402296, "grad_norm": 1.272315502166748, "learning_rate": 3.2658261068499562e-06, "loss": 0.2729, "step": 13540 }, { "epoch": 2.2313661719602313, "grad_norm": 2.1729161739349365, "learning_rate": 3.252694713196173e-06, "loss": 0.275, "step": 13550 }, { "epoch": 2.233012906280233, "grad_norm": 1.7917054891586304, "learning_rate": 3.239584643047078e-06, "loss": 0.2788, "step": 13560 }, { "epoch": 2.234659640600235, "grad_norm": 1.5601428747177124, "learning_rate": 3.2264959378344053e-06, "loss": 0.2728, "step": 13570 }, { "epoch": 2.236306374920236, "grad_norm": 1.8332723379135132, "learning_rate": 3.2134286389223913e-06, "loss": 0.2782, "step": 13580 }, { "epoch": 2.237953109240238, "grad_norm": 1.5861481428146362, "learning_rate": 3.2003827876076066e-06, "loss": 0.2638, "step": 13590 }, { "epoch": 2.2395998435602396, "grad_norm": 1.4836739301681519, "learning_rate": 3.1873584251188527e-06, "loss": 0.2454, "step": 13600 }, { "epoch": 2.2412465778802413, "grad_norm": 1.721747636795044, "learning_rate": 3.174355592617008e-06, "loss": 0.2766, "step": 13610 }, { "epoch": 2.242893312200243, "grad_norm": 2.278952121734619, "learning_rate": 3.161374331194922e-06, "loss": 0.2613, "step": 13620 }, { "epoch": 2.2445400465202447, "grad_norm": 2.0035574436187744, "learning_rate": 3.1484146818772644e-06, "loss": 0.2816, "step": 13630 }, { "epoch": 2.246186780840246, "grad_norm": 1.6324348449707031, "learning_rate": 3.1354766856204066e-06, "loss": 0.2613, "step": 13640 }, { "epoch": 2.2478335151602478, "grad_norm": 2.2162437438964844, "learning_rate": 3.1225603833122866e-06, "loss": 0.2682, "step": 13650 }, { "epoch": 2.2494802494802495, "grad_norm": 1.5400594472885132, "learning_rate": 3.1096658157722936e-06, "loss": 0.2776, "step": 13660 }, { "epoch": 2.2511269838002512, "grad_norm": 2.152050495147705, "learning_rate": 3.0967930237511144e-06, "loss": 0.2573, "step": 13670 }, { "epoch": 2.252773718120253, "grad_norm": 1.3941951990127563, "learning_rate": 3.0839420479306325e-06, "loss": 0.2738, "step": 13680 }, { "epoch": 2.2544204524402542, "grad_norm": 1.7376865148544312, "learning_rate": 3.071112928923773e-06, "loss": 0.2625, "step": 13690 }, { "epoch": 2.256067186760256, "grad_norm": 2.0723204612731934, "learning_rate": 3.058305707274396e-06, "loss": 0.2632, "step": 13700 }, { "epoch": 2.2577139210802577, "grad_norm": 2.126526117324829, "learning_rate": 3.0455204234571568e-06, "loss": 0.2649, "step": 13710 }, { "epoch": 2.2593606554002594, "grad_norm": 2.2294485569000244, "learning_rate": 3.0327571178773772e-06, "loss": 0.2506, "step": 13720 }, { "epoch": 2.261007389720261, "grad_norm": 1.6885284185409546, "learning_rate": 3.0200158308709217e-06, "loss": 0.2715, "step": 13730 }, { "epoch": 2.262654124040263, "grad_norm": 1.9802756309509277, "learning_rate": 3.0072966027040785e-06, "loss": 0.2647, "step": 13740 }, { "epoch": 2.264300858360264, "grad_norm": 1.758371353149414, "learning_rate": 2.9945994735734085e-06, "loss": 0.2741, "step": 13750 }, { "epoch": 2.265947592680266, "grad_norm": 1.7555452585220337, "learning_rate": 2.9819244836056482e-06, "loss": 0.2659, "step": 13760 }, { "epoch": 2.2675943270002676, "grad_norm": 1.7351974248886108, "learning_rate": 2.9692716728575576e-06, "loss": 0.2482, "step": 13770 }, { "epoch": 2.2692410613202694, "grad_norm": 1.67487633228302, "learning_rate": 2.956641081315803e-06, "loss": 0.2655, "step": 13780 }, { "epoch": 2.270887795640271, "grad_norm": 1.7826485633850098, "learning_rate": 2.944032748896842e-06, "loss": 0.2706, "step": 13790 }, { "epoch": 2.2725345299602724, "grad_norm": 1.6247366666793823, "learning_rate": 2.9314467154467728e-06, "loss": 0.2877, "step": 13800 }, { "epoch": 2.274181264280274, "grad_norm": 1.7416200637817383, "learning_rate": 2.9188830207412355e-06, "loss": 0.2653, "step": 13810 }, { "epoch": 2.275827998600276, "grad_norm": 1.554840087890625, "learning_rate": 2.9063417044852627e-06, "loss": 0.2528, "step": 13820 }, { "epoch": 2.2774747329202776, "grad_norm": 1.978947639465332, "learning_rate": 2.8938228063131655e-06, "loss": 0.2884, "step": 13830 }, { "epoch": 2.2791214672402793, "grad_norm": 2.049222946166992, "learning_rate": 2.881326365788417e-06, "loss": 0.2699, "step": 13840 }, { "epoch": 2.2807682015602806, "grad_norm": 1.588185429573059, "learning_rate": 2.8688524224035076e-06, "loss": 0.2725, "step": 13850 }, { "epoch": 2.2824149358802823, "grad_norm": 1.8063595294952393, "learning_rate": 2.856401015579828e-06, "loss": 0.2775, "step": 13860 }, { "epoch": 2.284061670200284, "grad_norm": 1.70321786403656, "learning_rate": 2.8439721846675595e-06, "loss": 0.259, "step": 13870 }, { "epoch": 2.285708404520286, "grad_norm": 1.4761788845062256, "learning_rate": 2.83156596894552e-06, "loss": 0.2497, "step": 13880 }, { "epoch": 2.2873551388402875, "grad_norm": 2.2861974239349365, "learning_rate": 2.819182407621074e-06, "loss": 0.2603, "step": 13890 }, { "epoch": 2.289001873160289, "grad_norm": 2.404402494430542, "learning_rate": 2.806821539829978e-06, "loss": 0.2708, "step": 13900 }, { "epoch": 2.2906486074802905, "grad_norm": 1.8514249324798584, "learning_rate": 2.7944834046362755e-06, "loss": 0.259, "step": 13910 }, { "epoch": 2.2922953418002923, "grad_norm": 2.039151191711426, "learning_rate": 2.7821680410321638e-06, "loss": 0.2385, "step": 13920 }, { "epoch": 2.293942076120294, "grad_norm": 1.4477638006210327, "learning_rate": 2.7698754879378853e-06, "loss": 0.2805, "step": 13930 }, { "epoch": 2.2955888104402957, "grad_norm": 1.8691905736923218, "learning_rate": 2.7576057842015823e-06, "loss": 0.2642, "step": 13940 }, { "epoch": 2.297235544760297, "grad_norm": 2.3928122520446777, "learning_rate": 2.7453589685991964e-06, "loss": 0.2619, "step": 13950 }, { "epoch": 2.2988822790802987, "grad_norm": 1.9695171117782593, "learning_rate": 2.733135079834327e-06, "loss": 0.2569, "step": 13960 }, { "epoch": 2.3005290134003005, "grad_norm": 1.6952944993972778, "learning_rate": 2.7209341565381275e-06, "loss": 0.272, "step": 13970 }, { "epoch": 2.302175747720302, "grad_norm": 2.042433500289917, "learning_rate": 2.7087562372691644e-06, "loss": 0.2553, "step": 13980 }, { "epoch": 2.303822482040304, "grad_norm": 1.8709336519241333, "learning_rate": 2.696601360513309e-06, "loss": 0.2592, "step": 13990 }, { "epoch": 2.3054692163603057, "grad_norm": 1.952884316444397, "learning_rate": 2.684469564683608e-06, "loss": 0.269, "step": 14000 }, { "epoch": 2.307115950680307, "grad_norm": 1.885693907737732, "learning_rate": 2.6723608881201737e-06, "loss": 0.2986, "step": 14010 }, { "epoch": 2.3087626850003087, "grad_norm": 1.9475207328796387, "learning_rate": 2.660275369090043e-06, "loss": 0.2396, "step": 14020 }, { "epoch": 2.3104094193203104, "grad_norm": 1.693192958831787, "learning_rate": 2.6482130457870813e-06, "loss": 0.2611, "step": 14030 }, { "epoch": 2.312056153640312, "grad_norm": 1.7588555812835693, "learning_rate": 2.6361739563318334e-06, "loss": 0.2704, "step": 14040 }, { "epoch": 2.313702887960314, "grad_norm": 2.1660373210906982, "learning_rate": 2.6241581387714333e-06, "loss": 0.2869, "step": 14050 }, { "epoch": 2.315349622280315, "grad_norm": 1.8832919597625732, "learning_rate": 2.61216563107946e-06, "loss": 0.2388, "step": 14060 }, { "epoch": 2.316996356600317, "grad_norm": 1.77772855758667, "learning_rate": 2.6001964711558245e-06, "loss": 0.2586, "step": 14070 }, { "epoch": 2.3186430909203186, "grad_norm": 1.9591187238693237, "learning_rate": 2.5882506968266564e-06, "loss": 0.2469, "step": 14080 }, { "epoch": 2.3202898252403203, "grad_norm": 1.8513628244400024, "learning_rate": 2.5763283458441823e-06, "loss": 0.2728, "step": 14090 }, { "epoch": 2.321936559560322, "grad_norm": 1.9876492023468018, "learning_rate": 2.5644294558865955e-06, "loss": 0.254, "step": 14100 }, { "epoch": 2.323583293880324, "grad_norm": 1.812187910079956, "learning_rate": 2.5525540645579573e-06, "loss": 0.2641, "step": 14110 }, { "epoch": 2.325230028200325, "grad_norm": 1.6044301986694336, "learning_rate": 2.540702209388052e-06, "loss": 0.2712, "step": 14120 }, { "epoch": 2.326876762520327, "grad_norm": 1.71054208278656, "learning_rate": 2.5288739278322992e-06, "loss": 0.2619, "step": 14130 }, { "epoch": 2.3285234968403286, "grad_norm": 1.707785964012146, "learning_rate": 2.5170692572715983e-06, "loss": 0.2493, "step": 14140 }, { "epoch": 2.3301702311603303, "grad_norm": 1.6551952362060547, "learning_rate": 2.505288235012251e-06, "loss": 0.2635, "step": 14150 }, { "epoch": 2.331816965480332, "grad_norm": 1.5544058084487915, "learning_rate": 2.4935308982858097e-06, "loss": 0.2601, "step": 14160 }, { "epoch": 2.3334636998003333, "grad_norm": 2.0415706634521484, "learning_rate": 2.4817972842489824e-06, "loss": 0.2691, "step": 14170 }, { "epoch": 2.335110434120335, "grad_norm": 1.9987118244171143, "learning_rate": 2.4700874299834975e-06, "loss": 0.257, "step": 14180 }, { "epoch": 2.3367571684403368, "grad_norm": 2.0765695571899414, "learning_rate": 2.4584013724960063e-06, "loss": 0.2277, "step": 14190 }, { "epoch": 2.3384039027603385, "grad_norm": 2.2434241771698, "learning_rate": 2.4467391487179446e-06, "loss": 0.245, "step": 14200 }, { "epoch": 2.3400506370803402, "grad_norm": 1.8256983757019043, "learning_rate": 2.4351007955054316e-06, "loss": 0.2637, "step": 14210 }, { "epoch": 2.3416973714003415, "grad_norm": 1.6313284635543823, "learning_rate": 2.4234863496391458e-06, "loss": 0.2765, "step": 14220 }, { "epoch": 2.3433441057203432, "grad_norm": 1.5957192182540894, "learning_rate": 2.411895847824218e-06, "loss": 0.2675, "step": 14230 }, { "epoch": 2.344990840040345, "grad_norm": 1.8532218933105469, "learning_rate": 2.4003293266900985e-06, "loss": 0.2673, "step": 14240 }, { "epoch": 2.3466375743603467, "grad_norm": 2.489428997039795, "learning_rate": 2.388786822790462e-06, "loss": 0.2629, "step": 14250 }, { "epoch": 2.3482843086803484, "grad_norm": 1.6500635147094727, "learning_rate": 2.377268372603071e-06, "loss": 0.2726, "step": 14260 }, { "epoch": 2.3499310430003497, "grad_norm": 2.1244235038757324, "learning_rate": 2.3657740125296845e-06, "loss": 0.2656, "step": 14270 }, { "epoch": 2.3515777773203514, "grad_norm": 1.7785730361938477, "learning_rate": 2.354303778895911e-06, "loss": 0.2606, "step": 14280 }, { "epoch": 2.353224511640353, "grad_norm": 1.852630615234375, "learning_rate": 2.3428577079511304e-06, "loss": 0.2656, "step": 14290 }, { "epoch": 2.354871245960355, "grad_norm": 1.7617170810699463, "learning_rate": 2.331435835868349e-06, "loss": 0.2685, "step": 14300 }, { "epoch": 2.3565179802803566, "grad_norm": 1.348222017288208, "learning_rate": 2.3200381987441067e-06, "loss": 0.2524, "step": 14310 }, { "epoch": 2.358164714600358, "grad_norm": 1.567896842956543, "learning_rate": 2.308664832598343e-06, "loss": 0.2574, "step": 14320 }, { "epoch": 2.3598114489203597, "grad_norm": 2.113232374191284, "learning_rate": 2.2973157733743055e-06, "loss": 0.2589, "step": 14330 }, { "epoch": 2.3614581832403614, "grad_norm": 2.2120754718780518, "learning_rate": 2.285991056938418e-06, "loss": 0.2562, "step": 14340 }, { "epoch": 2.363104917560363, "grad_norm": 1.2926445007324219, "learning_rate": 2.2746907190801724e-06, "loss": 0.2383, "step": 14350 }, { "epoch": 2.364751651880365, "grad_norm": 2.4468600749969482, "learning_rate": 2.2634147955120176e-06, "loss": 0.2621, "step": 14360 }, { "epoch": 2.3663983862003666, "grad_norm": 1.7285964488983154, "learning_rate": 2.252163321869254e-06, "loss": 0.2619, "step": 14370 }, { "epoch": 2.368045120520368, "grad_norm": 1.6784744262695312, "learning_rate": 2.2409363337099e-06, "loss": 0.2551, "step": 14380 }, { "epoch": 2.3696918548403696, "grad_norm": 1.5659254789352417, "learning_rate": 2.229733866514605e-06, "loss": 0.2449, "step": 14390 }, { "epoch": 2.3713385891603713, "grad_norm": 1.6228045225143433, "learning_rate": 2.2185559556865145e-06, "loss": 0.2598, "step": 14400 }, { "epoch": 2.372985323480373, "grad_norm": 1.4725788831710815, "learning_rate": 2.20740263655118e-06, "loss": 0.2619, "step": 14410 }, { "epoch": 2.374632057800375, "grad_norm": 1.6542408466339111, "learning_rate": 2.1962739443564196e-06, "loss": 0.2548, "step": 14420 }, { "epoch": 2.3762787921203765, "grad_norm": 1.6933832168579102, "learning_rate": 2.1851699142722395e-06, "loss": 0.2413, "step": 14430 }, { "epoch": 2.377925526440378, "grad_norm": 1.9051034450531006, "learning_rate": 2.1740905813906945e-06, "loss": 0.2533, "step": 14440 }, { "epoch": 2.3795722607603795, "grad_norm": 1.8932108879089355, "learning_rate": 2.1630359807257962e-06, "loss": 0.2603, "step": 14450 }, { "epoch": 2.3812189950803813, "grad_norm": 2.07220721244812, "learning_rate": 2.1520061472133903e-06, "loss": 0.2544, "step": 14460 }, { "epoch": 2.382865729400383, "grad_norm": 1.947472333908081, "learning_rate": 2.1410011157110556e-06, "loss": 0.2462, "step": 14470 }, { "epoch": 2.3845124637203847, "grad_norm": 2.6550047397613525, "learning_rate": 2.130020920997985e-06, "loss": 0.2725, "step": 14480 }, { "epoch": 2.386159198040386, "grad_norm": 1.6511452198028564, "learning_rate": 2.11906559777488e-06, "loss": 0.26, "step": 14490 }, { "epoch": 2.3878059323603877, "grad_norm": 1.7327511310577393, "learning_rate": 2.1081351806638395e-06, "loss": 0.2479, "step": 14500 }, { "epoch": 2.3892879932483893, "grad_norm": 1.907187819480896, "learning_rate": 2.09722970420826e-06, "loss": 0.2281, "step": 14510 }, { "epoch": 2.390934727568391, "grad_norm": 1.4497332572937012, "learning_rate": 2.086349202872705e-06, "loss": 0.2465, "step": 14520 }, { "epoch": 2.3925814618883927, "grad_norm": 1.7492471933364868, "learning_rate": 2.075493711042823e-06, "loss": 0.2407, "step": 14530 }, { "epoch": 2.3942281962083944, "grad_norm": 2.5765883922576904, "learning_rate": 2.0646632630252104e-06, "loss": 0.2505, "step": 14540 }, { "epoch": 2.3958749305283957, "grad_norm": 1.5902353525161743, "learning_rate": 2.053857893047334e-06, "loss": 0.2381, "step": 14550 }, { "epoch": 2.3975216648483975, "grad_norm": 2.103832721710205, "learning_rate": 2.0430776352573924e-06, "loss": 0.2603, "step": 14560 }, { "epoch": 2.399168399168399, "grad_norm": 2.955065965652466, "learning_rate": 2.0323225237242285e-06, "loss": 0.2414, "step": 14570 }, { "epoch": 2.400815133488401, "grad_norm": 1.6870169639587402, "learning_rate": 2.0215925924372126e-06, "loss": 0.2196, "step": 14580 }, { "epoch": 2.4024618678084027, "grad_norm": 1.891734004020691, "learning_rate": 2.0108878753061434e-06, "loss": 0.2278, "step": 14590 }, { "epoch": 2.404108602128404, "grad_norm": 2.0214812755584717, "learning_rate": 2.0002084061611282e-06, "loss": 0.2463, "step": 14600 }, { "epoch": 2.4057553364484057, "grad_norm": 1.8380804061889648, "learning_rate": 1.9895542187524906e-06, "loss": 0.2256, "step": 14610 }, { "epoch": 2.4074020707684074, "grad_norm": 1.8675373792648315, "learning_rate": 1.9789253467506474e-06, "loss": 0.2442, "step": 14620 }, { "epoch": 2.409048805088409, "grad_norm": 1.725472092628479, "learning_rate": 1.9683218237460233e-06, "loss": 0.2396, "step": 14630 }, { "epoch": 2.410695539408411, "grad_norm": 1.398227334022522, "learning_rate": 1.9577436832489206e-06, "loss": 0.2201, "step": 14640 }, { "epoch": 2.412342273728412, "grad_norm": 2.566589593887329, "learning_rate": 1.947190958689428e-06, "loss": 0.2475, "step": 14650 }, { "epoch": 2.413989008048414, "grad_norm": 1.501044511795044, "learning_rate": 1.9366636834173193e-06, "loss": 0.2488, "step": 14660 }, { "epoch": 2.4156357423684156, "grad_norm": 1.5589001178741455, "learning_rate": 1.926161890701934e-06, "loss": 0.2375, "step": 14670 }, { "epoch": 2.4172824766884173, "grad_norm": 1.3738040924072266, "learning_rate": 1.915685613732079e-06, "loss": 0.2353, "step": 14680 }, { "epoch": 2.418929211008419, "grad_norm": 1.7642990350723267, "learning_rate": 1.9052348856159298e-06, "loss": 0.2333, "step": 14690 }, { "epoch": 2.4205759453284204, "grad_norm": 1.651908040046692, "learning_rate": 1.8948097393809127e-06, "loss": 0.2347, "step": 14700 }, { "epoch": 2.422222679648422, "grad_norm": 1.8113752603530884, "learning_rate": 1.8844102079736114e-06, "loss": 0.2444, "step": 14710 }, { "epoch": 2.423869413968424, "grad_norm": 1.66728675365448, "learning_rate": 1.8740363242596605e-06, "loss": 0.222, "step": 14720 }, { "epoch": 2.4255161482884255, "grad_norm": 1.575468897819519, "learning_rate": 1.8636881210236346e-06, "loss": 0.2395, "step": 14730 }, { "epoch": 2.4271628826084273, "grad_norm": 1.5622098445892334, "learning_rate": 1.8533656309689584e-06, "loss": 0.2589, "step": 14740 }, { "epoch": 2.4288096169284286, "grad_norm": 1.7539955377578735, "learning_rate": 1.8430688867177882e-06, "loss": 0.2442, "step": 14750 }, { "epoch": 2.4304563512484303, "grad_norm": 1.620202898979187, "learning_rate": 1.832797920810917e-06, "loss": 0.2434, "step": 14760 }, { "epoch": 2.432103085568432, "grad_norm": 1.9027595520019531, "learning_rate": 1.822552765707676e-06, "loss": 0.2541, "step": 14770 }, { "epoch": 2.4337498198884338, "grad_norm": 1.3224104642868042, "learning_rate": 1.8123334537858195e-06, "loss": 0.2264, "step": 14780 }, { "epoch": 2.4353965542084355, "grad_norm": 2.0354669094085693, "learning_rate": 1.8021400173414306e-06, "loss": 0.237, "step": 14790 }, { "epoch": 2.437043288528437, "grad_norm": 1.2932946681976318, "learning_rate": 1.7919724885888256e-06, "loss": 0.2355, "step": 14800 }, { "epoch": 2.4386900228484385, "grad_norm": 1.3264442682266235, "learning_rate": 1.781830899660434e-06, "loss": 0.2237, "step": 14810 }, { "epoch": 2.4403367571684402, "grad_norm": 1.640587329864502, "learning_rate": 1.7717152826067175e-06, "loss": 0.2311, "step": 14820 }, { "epoch": 2.441983491488442, "grad_norm": 1.6640249490737915, "learning_rate": 1.7616256693960532e-06, "loss": 0.2193, "step": 14830 }, { "epoch": 2.4436302258084437, "grad_norm": 1.4918125867843628, "learning_rate": 1.751562091914637e-06, "loss": 0.2356, "step": 14840 }, { "epoch": 2.4452769601284454, "grad_norm": 1.52755606174469, "learning_rate": 1.7415245819663861e-06, "loss": 0.2266, "step": 14850 }, { "epoch": 2.446923694448447, "grad_norm": 2.0704216957092285, "learning_rate": 1.7315131712728417e-06, "loss": 0.2535, "step": 14860 }, { "epoch": 2.4485704287684484, "grad_norm": 1.83346426486969, "learning_rate": 1.7215278914730527e-06, "loss": 0.2363, "step": 14870 }, { "epoch": 2.45021716308845, "grad_norm": 1.4527473449707031, "learning_rate": 1.7115687741234987e-06, "loss": 0.2296, "step": 14880 }, { "epoch": 2.451863897408452, "grad_norm": 1.9535012245178223, "learning_rate": 1.7016358506979657e-06, "loss": 0.2336, "step": 14890 }, { "epoch": 2.4535106317284536, "grad_norm": 2.087193250656128, "learning_rate": 1.6917291525874723e-06, "loss": 0.2527, "step": 14900 }, { "epoch": 2.4551573660484554, "grad_norm": 1.9018986225128174, "learning_rate": 1.681848711100146e-06, "loss": 0.2463, "step": 14910 }, { "epoch": 2.4568041003684566, "grad_norm": 1.6955307722091675, "learning_rate": 1.6719945574611418e-06, "loss": 0.2348, "step": 14920 }, { "epoch": 2.4584508346884584, "grad_norm": 1.538035273551941, "learning_rate": 1.6621667228125305e-06, "loss": 0.2511, "step": 14930 }, { "epoch": 2.46009756900846, "grad_norm": 1.6792057752609253, "learning_rate": 1.6523652382132183e-06, "loss": 0.2592, "step": 14940 }, { "epoch": 2.461744303328462, "grad_norm": 1.4391840696334839, "learning_rate": 1.6425901346388263e-06, "loss": 0.2233, "step": 14950 }, { "epoch": 2.4633910376484636, "grad_norm": 1.333770990371704, "learning_rate": 1.6328414429816109e-06, "loss": 0.2303, "step": 14960 }, { "epoch": 2.465037771968465, "grad_norm": 2.2043004035949707, "learning_rate": 1.6231191940503543e-06, "loss": 0.2294, "step": 14970 }, { "epoch": 2.4666845062884666, "grad_norm": 1.5073317289352417, "learning_rate": 1.613423418570279e-06, "loss": 0.2368, "step": 14980 }, { "epoch": 2.4683312406084683, "grad_norm": 1.6827707290649414, "learning_rate": 1.6037541471829288e-06, "loss": 0.238, "step": 14990 }, { "epoch": 2.46997797492847, "grad_norm": 1.4171675443649292, "learning_rate": 1.594111410446104e-06, "loss": 0.2162, "step": 15000 }, { "epoch": 2.4716247092484718, "grad_norm": 1.938513994216919, "learning_rate": 1.5844952388337332e-06, "loss": 0.269, "step": 15010 }, { "epoch": 2.473271443568473, "grad_norm": 1.7886428833007812, "learning_rate": 1.574905662735805e-06, "loss": 0.2393, "step": 15020 }, { "epoch": 2.474918177888475, "grad_norm": 1.4953855276107788, "learning_rate": 1.5653427124582431e-06, "loss": 0.2265, "step": 15030 }, { "epoch": 2.4765649122084765, "grad_norm": 1.8918662071228027, "learning_rate": 1.5558064182228393e-06, "loss": 0.2295, "step": 15040 }, { "epoch": 2.4782116465284783, "grad_norm": 1.7208409309387207, "learning_rate": 1.5462968101671361e-06, "loss": 0.2429, "step": 15050 }, { "epoch": 2.47985838084848, "grad_norm": 1.5308380126953125, "learning_rate": 1.5368139183443421e-06, "loss": 0.2396, "step": 15060 }, { "epoch": 2.4815051151684813, "grad_norm": 1.626708984375, "learning_rate": 1.5273577727232314e-06, "loss": 0.2398, "step": 15070 }, { "epoch": 2.483151849488483, "grad_norm": 1.9254333972930908, "learning_rate": 1.5179284031880603e-06, "loss": 0.2288, "step": 15080 }, { "epoch": 2.4847985838084847, "grad_norm": 2.0169408321380615, "learning_rate": 1.5085258395384538e-06, "loss": 0.2416, "step": 15090 }, { "epoch": 2.4864453181284865, "grad_norm": 1.554512858390808, "learning_rate": 1.4991501114893336e-06, "loss": 0.2522, "step": 15100 }, { "epoch": 2.488092052448488, "grad_norm": 1.447223424911499, "learning_rate": 1.4898012486708024e-06, "loss": 0.2248, "step": 15110 }, { "epoch": 2.48973878676849, "grad_norm": 1.233121395111084, "learning_rate": 1.48047928062807e-06, "loss": 0.2216, "step": 15120 }, { "epoch": 2.491385521088491, "grad_norm": 1.4239838123321533, "learning_rate": 1.4711842368213437e-06, "loss": 0.2151, "step": 15130 }, { "epoch": 2.493032255408493, "grad_norm": 1.5615428686141968, "learning_rate": 1.4619161466257459e-06, "loss": 0.2303, "step": 15140 }, { "epoch": 2.4946789897284947, "grad_norm": 1.5079021453857422, "learning_rate": 1.4526750393312118e-06, "loss": 0.2295, "step": 15150 }, { "epoch": 2.4963257240484964, "grad_norm": 1.7538816928863525, "learning_rate": 1.443460944142413e-06, "loss": 0.2289, "step": 15160 }, { "epoch": 2.497972458368498, "grad_norm": 1.7975422143936157, "learning_rate": 1.4342738901786434e-06, "loss": 0.234, "step": 15170 }, { "epoch": 2.4996191926885, "grad_norm": 1.8438105583190918, "learning_rate": 1.4251139064737485e-06, "loss": 0.239, "step": 15180 }, { "epoch": 2.501265927008501, "grad_norm": 4.315160274505615, "learning_rate": 1.4159810219760161e-06, "loss": 0.2261, "step": 15190 }, { "epoch": 2.502912661328503, "grad_norm": 1.4963452816009521, "learning_rate": 1.406875265548101e-06, "loss": 0.2228, "step": 15200 }, { "epoch": 2.5045593956485046, "grad_norm": 1.717155933380127, "learning_rate": 1.3977966659669096e-06, "loss": 0.2385, "step": 15210 }, { "epoch": 2.5062061299685063, "grad_norm": 1.8200072050094604, "learning_rate": 1.3887452519235434e-06, "loss": 0.2179, "step": 15220 }, { "epoch": 2.507852864288508, "grad_norm": 1.7720727920532227, "learning_rate": 1.379721052023174e-06, "loss": 0.2252, "step": 15230 }, { "epoch": 2.5094995986085094, "grad_norm": 1.5537362098693848, "learning_rate": 1.3707240947849797e-06, "loss": 0.2273, "step": 15240 }, { "epoch": 2.511146332928511, "grad_norm": 1.4824028015136719, "learning_rate": 1.3617544086420353e-06, "loss": 0.241, "step": 15250 }, { "epoch": 2.512793067248513, "grad_norm": 1.6623420715332031, "learning_rate": 1.3528120219412377e-06, "loss": 0.2494, "step": 15260 }, { "epoch": 2.5144398015685145, "grad_norm": 1.8645778894424438, "learning_rate": 1.3438969629432042e-06, "loss": 0.2561, "step": 15270 }, { "epoch": 2.5160865358885163, "grad_norm": 2.436868190765381, "learning_rate": 1.335009259822191e-06, "loss": 0.2311, "step": 15280 }, { "epoch": 2.5177332702085176, "grad_norm": 1.7088630199432373, "learning_rate": 1.3261489406659978e-06, "loss": 0.2494, "step": 15290 }, { "epoch": 2.5193800045285193, "grad_norm": 1.6420857906341553, "learning_rate": 1.3173160334758895e-06, "loss": 0.2465, "step": 15300 }, { "epoch": 2.521026738848521, "grad_norm": 1.4090502262115479, "learning_rate": 1.3085105661664933e-06, "loss": 0.2263, "step": 15310 }, { "epoch": 2.5226734731685228, "grad_norm": 1.924774169921875, "learning_rate": 1.2997325665657257e-06, "loss": 0.2348, "step": 15320 }, { "epoch": 2.5243202074885245, "grad_norm": 1.5904576778411865, "learning_rate": 1.2909820624146908e-06, "loss": 0.2272, "step": 15330 }, { "epoch": 2.5259669418085258, "grad_norm": 1.8772406578063965, "learning_rate": 1.282259081367606e-06, "loss": 0.2255, "step": 15340 }, { "epoch": 2.5276136761285275, "grad_norm": 1.8594352006912231, "learning_rate": 1.273563650991696e-06, "loss": 0.2329, "step": 15350 }, { "epoch": 2.5292604104485292, "grad_norm": 1.4280927181243896, "learning_rate": 1.2648957987671295e-06, "loss": 0.234, "step": 15360 }, { "epoch": 2.530907144768531, "grad_norm": 1.7298115491867065, "learning_rate": 1.256255552086909e-06, "loss": 0.2416, "step": 15370 }, { "epoch": 2.5325538790885327, "grad_norm": 1.3822236061096191, "learning_rate": 1.2476429382568067e-06, "loss": 0.2454, "step": 15380 }, { "epoch": 2.534200613408534, "grad_norm": 1.9294795989990234, "learning_rate": 1.2390579844952565e-06, "loss": 0.2421, "step": 15390 }, { "epoch": 2.5358473477285357, "grad_norm": 1.349674940109253, "learning_rate": 1.2305007179332851e-06, "loss": 0.2348, "step": 15400 }, { "epoch": 2.5374940820485374, "grad_norm": 1.3522496223449707, "learning_rate": 1.2219711656144161e-06, "loss": 0.2404, "step": 15410 }, { "epoch": 2.539140816368539, "grad_norm": 1.3507784605026245, "learning_rate": 1.2134693544945875e-06, "loss": 0.246, "step": 15420 }, { "epoch": 2.540787550688541, "grad_norm": 1.6919246912002563, "learning_rate": 1.2049953114420654e-06, "loss": 0.2455, "step": 15430 }, { "epoch": 2.542434285008542, "grad_norm": 1.62079918384552, "learning_rate": 1.1965490632373677e-06, "loss": 0.2317, "step": 15440 }, { "epoch": 2.5440810193285444, "grad_norm": 1.7440752983093262, "learning_rate": 1.1881306365731638e-06, "loss": 0.2322, "step": 15450 }, { "epoch": 2.5457277536485456, "grad_norm": 1.5291510820388794, "learning_rate": 1.179740058054204e-06, "loss": 0.2287, "step": 15460 }, { "epoch": 2.5473744879685474, "grad_norm": 1.448723316192627, "learning_rate": 1.1713773541972263e-06, "loss": 0.2278, "step": 15470 }, { "epoch": 2.549021222288549, "grad_norm": 1.5722907781600952, "learning_rate": 1.1630425514308819e-06, "loss": 0.2366, "step": 15480 }, { "epoch": 2.5506679566085504, "grad_norm": 1.579613447189331, "learning_rate": 1.1547356760956397e-06, "loss": 0.2361, "step": 15490 }, { "epoch": 2.5523146909285526, "grad_norm": 1.437633991241455, "learning_rate": 1.1464567544437144e-06, "loss": 0.2245, "step": 15500 }, { "epoch": 2.553961425248554, "grad_norm": 1.626078724861145, "learning_rate": 1.138205812638975e-06, "loss": 0.2505, "step": 15510 }, { "epoch": 2.5556081595685556, "grad_norm": 1.7124260663986206, "learning_rate": 1.12998287675687e-06, "loss": 0.2494, "step": 15520 }, { "epoch": 2.5572548938885573, "grad_norm": 1.749879002571106, "learning_rate": 1.1217879727843351e-06, "loss": 0.2194, "step": 15530 }, { "epoch": 2.558901628208559, "grad_norm": 2.109128713607788, "learning_rate": 1.113621126619725e-06, "loss": 0.2321, "step": 15540 }, { "epoch": 2.5605483625285608, "grad_norm": 1.6905772686004639, "learning_rate": 1.1054823640727163e-06, "loss": 0.2457, "step": 15550 }, { "epoch": 2.562195096848562, "grad_norm": 1.2987953424453735, "learning_rate": 1.0973717108642323e-06, "loss": 0.2325, "step": 15560 }, { "epoch": 2.563841831168564, "grad_norm": 1.5485819578170776, "learning_rate": 1.0892891926263703e-06, "loss": 0.2462, "step": 15570 }, { "epoch": 2.5654885654885655, "grad_norm": 1.8916813135147095, "learning_rate": 1.0812348349023038e-06, "loss": 0.2335, "step": 15580 }, { "epoch": 2.5671352998085673, "grad_norm": 1.2640624046325684, "learning_rate": 1.073208663146218e-06, "loss": 0.2301, "step": 15590 }, { "epoch": 2.568782034128569, "grad_norm": 1.434807300567627, "learning_rate": 1.065210702723215e-06, "loss": 0.2167, "step": 15600 }, { "epoch": 2.5704287684485703, "grad_norm": 1.6343811750411987, "learning_rate": 1.0572409789092452e-06, "loss": 0.2227, "step": 15610 }, { "epoch": 2.572075502768572, "grad_norm": 1.6789041757583618, "learning_rate": 1.0492995168910225e-06, "loss": 0.2451, "step": 15620 }, { "epoch": 2.5737222370885737, "grad_norm": 1.42600417137146, "learning_rate": 1.0413863417659454e-06, "loss": 0.23, "step": 15630 }, { "epoch": 2.5753689714085755, "grad_norm": 1.9068646430969238, "learning_rate": 1.0335014785420128e-06, "loss": 0.2413, "step": 15640 }, { "epoch": 2.577015705728577, "grad_norm": 1.5069235563278198, "learning_rate": 1.0256449521377565e-06, "loss": 0.2581, "step": 15650 }, { "epoch": 2.5786624400485785, "grad_norm": 1.8914705514907837, "learning_rate": 1.0178167873821487e-06, "loss": 0.2205, "step": 15660 }, { "epoch": 2.58030917436858, "grad_norm": 1.843311071395874, "learning_rate": 1.0100170090145379e-06, "loss": 0.2357, "step": 15670 }, { "epoch": 2.581955908688582, "grad_norm": 1.3863564729690552, "learning_rate": 1.0022456416845561e-06, "loss": 0.222, "step": 15680 }, { "epoch": 2.5836026430085837, "grad_norm": 1.5262445211410522, "learning_rate": 9.945027099520489e-07, "loss": 0.2182, "step": 15690 }, { "epoch": 2.5852493773285854, "grad_norm": 1.4200090169906616, "learning_rate": 9.86788238287003e-07, "loss": 0.2287, "step": 15700 }, { "epoch": 2.5868961116485867, "grad_norm": 1.338196039199829, "learning_rate": 9.79102251069456e-07, "loss": 0.2213, "step": 15710 }, { "epoch": 2.5885428459685884, "grad_norm": 1.5565379858016968, "learning_rate": 9.71444772589426e-07, "loss": 0.2243, "step": 15720 }, { "epoch": 2.59018958028859, "grad_norm": 1.8993768692016602, "learning_rate": 9.638158270468423e-07, "loss": 0.2194, "step": 15730 }, { "epoch": 2.591836314608592, "grad_norm": 1.739005446434021, "learning_rate": 9.56215438551452e-07, "loss": 0.2241, "step": 15740 }, { "epoch": 2.5934830489285936, "grad_norm": 1.7477450370788574, "learning_rate": 9.486436311227631e-07, "loss": 0.2443, "step": 15750 }, { "epoch": 2.595129783248595, "grad_norm": 1.8742027282714844, "learning_rate": 9.411004286899495e-07, "loss": 0.2386, "step": 15760 }, { "epoch": 2.5967765175685966, "grad_norm": 1.9991704225540161, "learning_rate": 9.335858550917942e-07, "loss": 0.2306, "step": 15770 }, { "epoch": 2.5984232518885984, "grad_norm": 1.8734098672866821, "learning_rate": 9.26099934076593e-07, "loss": 0.2307, "step": 15780 }, { "epoch": 2.6000699862086, "grad_norm": 1.9999672174453735, "learning_rate": 9.186426893021016e-07, "loss": 0.2242, "step": 15790 }, { "epoch": 2.601716720528602, "grad_norm": 1.490793228149414, "learning_rate": 9.112141443354439e-07, "loss": 0.2493, "step": 15800 }, { "epoch": 2.603363454848603, "grad_norm": 1.619504690170288, "learning_rate": 9.038143226530482e-07, "loss": 0.2458, "step": 15810 }, { "epoch": 2.6050101891686053, "grad_norm": 1.3605459928512573, "learning_rate": 8.964432476405638e-07, "loss": 0.2279, "step": 15820 }, { "epoch": 2.6066569234886066, "grad_norm": 1.5391523838043213, "learning_rate": 8.891009425927977e-07, "loss": 0.2332, "step": 15830 }, { "epoch": 2.6083036578086083, "grad_norm": 1.6004542112350464, "learning_rate": 8.817874307136298e-07, "loss": 0.2404, "step": 15840 }, { "epoch": 2.60995039212861, "grad_norm": 1.6447652578353882, "learning_rate": 8.745027351159486e-07, "loss": 0.2363, "step": 15850 }, { "epoch": 2.6115971264486117, "grad_norm": 2.2156291007995605, "learning_rate": 8.672468788215682e-07, "loss": 0.2413, "step": 15860 }, { "epoch": 2.6132438607686135, "grad_norm": 1.7267791032791138, "learning_rate": 8.60019884761173e-07, "loss": 0.2444, "step": 15870 }, { "epoch": 2.6148905950886148, "grad_norm": 1.5263491868972778, "learning_rate": 8.52821775774223e-07, "loss": 0.227, "step": 15880 }, { "epoch": 2.6165373294086165, "grad_norm": 1.5678350925445557, "learning_rate": 8.456525746089017e-07, "loss": 0.2185, "step": 15890 }, { "epoch": 2.6181840637286182, "grad_norm": 2.008378744125366, "learning_rate": 8.385123039220277e-07, "loss": 0.237, "step": 15900 }, { "epoch": 2.61983079804862, "grad_norm": 2.221608877182007, "learning_rate": 8.314009862789984e-07, "loss": 0.2433, "step": 15910 }, { "epoch": 2.6214775323686217, "grad_norm": 2.2678582668304443, "learning_rate": 8.243186441536999e-07, "loss": 0.2457, "step": 15920 }, { "epoch": 2.623124266688623, "grad_norm": 1.7668777704238892, "learning_rate": 8.172652999284592e-07, "loss": 0.2409, "step": 15930 }, { "epoch": 2.6247710010086247, "grad_norm": 1.7608022689819336, "learning_rate": 8.102409758939522e-07, "loss": 0.243, "step": 15940 }, { "epoch": 2.6264177353286264, "grad_norm": 2.060465097427368, "learning_rate": 8.032456942491484e-07, "loss": 0.2452, "step": 15950 }, { "epoch": 2.628064469648628, "grad_norm": 1.3911464214324951, "learning_rate": 7.962794771012284e-07, "loss": 0.2331, "step": 15960 }, { "epoch": 2.62971120396863, "grad_norm": 1.3499740362167358, "learning_rate": 7.893423464655292e-07, "loss": 0.2287, "step": 15970 }, { "epoch": 2.631357938288631, "grad_norm": 1.5909963846206665, "learning_rate": 7.824343242654564e-07, "loss": 0.2338, "step": 15980 }, { "epoch": 2.633004672608633, "grad_norm": 1.7845244407653809, "learning_rate": 7.755554323324299e-07, "loss": 0.233, "step": 15990 }, { "epoch": 2.6346514069286346, "grad_norm": 1.5275846719741821, "learning_rate": 7.687056924058056e-07, "loss": 0.2209, "step": 16000 }, { "epoch": 2.6362981412486364, "grad_norm": 1.519129753112793, "learning_rate": 7.618851261328153e-07, "loss": 0.227, "step": 16010 }, { "epoch": 2.637944875568638, "grad_norm": 1.5210860967636108, "learning_rate": 7.550937550684867e-07, "loss": 0.2288, "step": 16020 }, { "epoch": 2.6395916098886394, "grad_norm": 2.0805931091308594, "learning_rate": 7.483316006755892e-07, "loss": 0.234, "step": 16030 }, { "epoch": 2.641238344208641, "grad_norm": 1.706473708152771, "learning_rate": 7.415986843245515e-07, "loss": 0.2407, "step": 16040 }, { "epoch": 2.642885078528643, "grad_norm": 1.63625168800354, "learning_rate": 7.348950272934107e-07, "loss": 0.2213, "step": 16050 }, { "epoch": 2.6445318128486446, "grad_norm": 1.5497370958328247, "learning_rate": 7.282206507677225e-07, "loss": 0.2214, "step": 16060 }, { "epoch": 2.6461785471686463, "grad_norm": 1.570217490196228, "learning_rate": 7.215755758405208e-07, "loss": 0.2112, "step": 16070 }, { "epoch": 2.6478252814886476, "grad_norm": 1.8267185688018799, "learning_rate": 7.149598235122279e-07, "loss": 0.224, "step": 16080 }, { "epoch": 2.6494720158086493, "grad_norm": 1.6849004030227661, "learning_rate": 7.08373414690604e-07, "loss": 0.2273, "step": 16090 }, { "epoch": 2.651118750128651, "grad_norm": 1.8440622091293335, "learning_rate": 7.01816370190671e-07, "loss": 0.2058, "step": 16100 }, { "epoch": 2.652765484448653, "grad_norm": 1.6094481945037842, "learning_rate": 6.952887107346551e-07, "loss": 0.2358, "step": 16110 }, { "epoch": 2.6544122187686545, "grad_norm": 1.7363183498382568, "learning_rate": 6.887904569519133e-07, "loss": 0.2254, "step": 16120 }, { "epoch": 2.656058953088656, "grad_norm": 1.7017285823822021, "learning_rate": 6.823216293788715e-07, "loss": 0.2234, "step": 16130 }, { "epoch": 2.657705687408658, "grad_norm": 1.396016240119934, "learning_rate": 6.758822484589622e-07, "loss": 0.232, "step": 16140 }, { "epoch": 2.6593524217286593, "grad_norm": 1.758091688156128, "learning_rate": 6.69472334542558e-07, "loss": 0.2362, "step": 16150 }, { "epoch": 2.660999156048661, "grad_norm": 1.6662510633468628, "learning_rate": 6.630919078869036e-07, "loss": 0.2182, "step": 16160 }, { "epoch": 2.6626458903686627, "grad_norm": 2.1852259635925293, "learning_rate": 6.567409886560605e-07, "loss": 0.2109, "step": 16170 }, { "epoch": 2.6642926246886645, "grad_norm": 1.657429575920105, "learning_rate": 6.504195969208315e-07, "loss": 0.2273, "step": 16180 }, { "epoch": 2.665939359008666, "grad_norm": 2.1529386043548584, "learning_rate": 6.44127752658712e-07, "loss": 0.2448, "step": 16190 }, { "epoch": 2.6675860933286675, "grad_norm": 1.56782066822052, "learning_rate": 6.378654757538072e-07, "loss": 0.2201, "step": 16200 }, { "epoch": 2.669232827648669, "grad_norm": 2.715771198272705, "learning_rate": 6.316327859967907e-07, "loss": 0.2286, "step": 16210 }, { "epoch": 2.670879561968671, "grad_norm": 1.7289623022079468, "learning_rate": 6.254297030848255e-07, "loss": 0.2243, "step": 16220 }, { "epoch": 2.6725262962886727, "grad_norm": 1.6193941831588745, "learning_rate": 6.192562466215135e-07, "loss": 0.2266, "step": 16230 }, { "epoch": 2.6741730306086744, "grad_norm": 1.712371826171875, "learning_rate": 6.131124361168228e-07, "loss": 0.221, "step": 16240 }, { "epoch": 2.6758197649286757, "grad_norm": 1.8381526470184326, "learning_rate": 6.069982909870376e-07, "loss": 0.2216, "step": 16250 }, { "epoch": 2.6774664992486774, "grad_norm": 2.2945876121520996, "learning_rate": 6.009138305546813e-07, "loss": 0.2157, "step": 16260 }, { "epoch": 2.679113233568679, "grad_norm": 1.9753563404083252, "learning_rate": 5.948590740484783e-07, "loss": 0.2298, "step": 16270 }, { "epoch": 2.680759967888681, "grad_norm": 1.4538737535476685, "learning_rate": 5.888340406032633e-07, "loss": 0.2151, "step": 16280 }, { "epoch": 2.6824067022086826, "grad_norm": 1.7806299924850464, "learning_rate": 5.828387492599507e-07, "loss": 0.244, "step": 16290 }, { "epoch": 2.684053436528684, "grad_norm": 1.6264704465866089, "learning_rate": 5.768732189654535e-07, "loss": 0.2418, "step": 16300 }, { "epoch": 2.6857001708486856, "grad_norm": 1.9107524156570435, "learning_rate": 5.709374685726365e-07, "loss": 0.2453, "step": 16310 }, { "epoch": 2.6873469051686873, "grad_norm": 2.1297032833099365, "learning_rate": 5.650315168402443e-07, "loss": 0.2351, "step": 16320 }, { "epoch": 2.688993639488689, "grad_norm": 5.344056606292725, "learning_rate": 5.591553824328555e-07, "loss": 0.2455, "step": 16330 }, { "epoch": 2.690640373808691, "grad_norm": 1.7449675798416138, "learning_rate": 5.533090839208133e-07, "loss": 0.2377, "step": 16340 }, { "epoch": 2.692287108128692, "grad_norm": 2.327143907546997, "learning_rate": 5.474926397801705e-07, "loss": 0.2251, "step": 16350 }, { "epoch": 2.693933842448694, "grad_norm": 1.8635077476501465, "learning_rate": 5.417060683926301e-07, "loss": 0.2174, "step": 16360 }, { "epoch": 2.6955805767686956, "grad_norm": 1.3951900005340576, "learning_rate": 5.359493880454935e-07, "loss": 0.2114, "step": 16370 }, { "epoch": 2.6972273110886973, "grad_norm": 2.1820077896118164, "learning_rate": 5.302226169315927e-07, "loss": 0.2318, "step": 16380 }, { "epoch": 2.698874045408699, "grad_norm": 1.5952907800674438, "learning_rate": 5.245257731492381e-07, "loss": 0.2344, "step": 16390 }, { "epoch": 2.7005207797287003, "grad_norm": 1.400417447090149, "learning_rate": 5.188588747021628e-07, "loss": 0.2343, "step": 16400 }, { "epoch": 2.702167514048702, "grad_norm": 1.4244686365127563, "learning_rate": 5.13221939499462e-07, "loss": 0.2151, "step": 16410 }, { "epoch": 2.7038142483687038, "grad_norm": 2.0728249549865723, "learning_rate": 5.076149853555379e-07, "loss": 0.2045, "step": 16420 }, { "epoch": 2.7054609826887055, "grad_norm": 1.8236521482467651, "learning_rate": 5.02038029990044e-07, "loss": 0.2209, "step": 16430 }, { "epoch": 2.7071077170087072, "grad_norm": 2.1654088497161865, "learning_rate": 4.964910910278298e-07, "loss": 0.2313, "step": 16440 }, { "epoch": 2.7087544513287085, "grad_norm": 1.472296953201294, "learning_rate": 4.909741859988837e-07, "loss": 0.2372, "step": 16450 }, { "epoch": 2.7104011856487102, "grad_norm": 1.6187572479248047, "learning_rate": 4.854873323382747e-07, "loss": 0.2295, "step": 16460 }, { "epoch": 2.712047919968712, "grad_norm": 1.675091028213501, "learning_rate": 4.800305473861056e-07, "loss": 0.2211, "step": 16470 }, { "epoch": 2.7136946542887137, "grad_norm": 1.6285738945007324, "learning_rate": 4.7460384838744934e-07, "loss": 0.2188, "step": 16480 }, { "epoch": 2.7153413886087154, "grad_norm": 1.419481873512268, "learning_rate": 4.692072524922975e-07, "loss": 0.2209, "step": 16490 }, { "epoch": 2.7169881229287167, "grad_norm": 1.381244421005249, "learning_rate": 4.638407767555131e-07, "loss": 0.2193, "step": 16500 }, { "epoch": 2.718634857248719, "grad_norm": 1.5735642910003662, "learning_rate": 4.585044381367609e-07, "loss": 0.2397, "step": 16510 }, { "epoch": 2.72028159156872, "grad_norm": 1.268129825592041, "learning_rate": 4.531982535004731e-07, "loss": 0.2109, "step": 16520 }, { "epoch": 2.721928325888722, "grad_norm": 1.3103333711624146, "learning_rate": 4.4792223961578006e-07, "loss": 0.2041, "step": 16530 }, { "epoch": 2.7235750602087236, "grad_norm": 1.9005833864212036, "learning_rate": 4.4267641315646313e-07, "loss": 0.2229, "step": 16540 }, { "epoch": 2.7252217945287254, "grad_norm": 1.3628641366958618, "learning_rate": 4.3746079070090765e-07, "loss": 0.2132, "step": 16550 }, { "epoch": 2.726868528848727, "grad_norm": 1.7968915700912476, "learning_rate": 4.3227538873204076e-07, "loss": 0.2321, "step": 16560 }, { "epoch": 2.7285152631687284, "grad_norm": 1.7925100326538086, "learning_rate": 4.271202236372829e-07, "loss": 0.2242, "step": 16570 }, { "epoch": 2.73016199748873, "grad_norm": 1.5291099548339844, "learning_rate": 4.2199531170850296e-07, "loss": 0.2206, "step": 16580 }, { "epoch": 2.731808731808732, "grad_norm": 1.671055793762207, "learning_rate": 4.1690066914195306e-07, "loss": 0.2104, "step": 16590 }, { "epoch": 2.7334554661287336, "grad_norm": 1.9849021434783936, "learning_rate": 4.118363120382318e-07, "loss": 0.2212, "step": 16600 }, { "epoch": 2.7351022004487353, "grad_norm": 1.765097975730896, "learning_rate": 4.0680225640222227e-07, "loss": 0.2274, "step": 16610 }, { "epoch": 2.7367489347687366, "grad_norm": 1.7051899433135986, "learning_rate": 4.017985181430495e-07, "loss": 0.2262, "step": 16620 }, { "epoch": 2.7383956690887383, "grad_norm": 1.5623761415481567, "learning_rate": 3.9682511307402083e-07, "loss": 0.2442, "step": 16630 }, { "epoch": 2.74004240340874, "grad_norm": 1.9426796436309814, "learning_rate": 3.918820569125881e-07, "loss": 0.2348, "step": 16640 }, { "epoch": 2.741689137728742, "grad_norm": 1.656064510345459, "learning_rate": 3.869693652802864e-07, "loss": 0.232, "step": 16650 }, { "epoch": 2.7433358720487435, "grad_norm": 1.2131900787353516, "learning_rate": 3.820870537026944e-07, "loss": 0.2123, "step": 16660 }, { "epoch": 2.744982606368745, "grad_norm": 1.733933448791504, "learning_rate": 3.7723513760937525e-07, "loss": 0.2327, "step": 16670 }, { "epoch": 2.7466293406887465, "grad_norm": 1.47609281539917, "learning_rate": 3.7241363233384007e-07, "loss": 0.2185, "step": 16680 }, { "epoch": 2.7482760750087483, "grad_norm": 1.770021915435791, "learning_rate": 3.6762255311348696e-07, "loss": 0.2237, "step": 16690 }, { "epoch": 2.74992280932875, "grad_norm": 1.9828838109970093, "learning_rate": 3.628619150895607e-07, "loss": 0.2295, "step": 16700 }, { "epoch": 2.7515695436487517, "grad_norm": 2.307547092437744, "learning_rate": 3.5813173330710215e-07, "loss": 0.2399, "step": 16710 }, { "epoch": 2.753216277968753, "grad_norm": 1.719611644744873, "learning_rate": 3.534320227149035e-07, "loss": 0.2246, "step": 16720 }, { "epoch": 2.7548630122887547, "grad_norm": 1.925981879234314, "learning_rate": 3.48762798165454e-07, "loss": 0.2115, "step": 16730 }, { "epoch": 2.7565097466087565, "grad_norm": 1.7162208557128906, "learning_rate": 3.441240744149055e-07, "loss": 0.2397, "step": 16740 }, { "epoch": 2.758156480928758, "grad_norm": 2.125133991241455, "learning_rate": 3.3951586612300914e-07, "loss": 0.2179, "step": 16750 }, { "epoch": 2.75980321524876, "grad_norm": 1.792880654335022, "learning_rate": 3.3493818785308886e-07, "loss": 0.2181, "step": 16760 }, { "epoch": 2.761449949568761, "grad_norm": 1.3869192600250244, "learning_rate": 3.3039105407197127e-07, "loss": 0.2357, "step": 16770 }, { "epoch": 2.763096683888763, "grad_norm": 2.811720371246338, "learning_rate": 3.2587447914996463e-07, "loss": 0.2457, "step": 16780 }, { "epoch": 2.7647434182087647, "grad_norm": 1.611673355102539, "learning_rate": 3.213884773607967e-07, "loss": 0.2396, "step": 16790 }, { "epoch": 2.7663901525287664, "grad_norm": 2.732407808303833, "learning_rate": 3.1693306288157697e-07, "loss": 0.2192, "step": 16800 }, { "epoch": 2.768036886848768, "grad_norm": 1.6391844749450684, "learning_rate": 3.1250824979274675e-07, "loss": 0.2464, "step": 16810 }, { "epoch": 2.7696836211687694, "grad_norm": 1.7055604457855225, "learning_rate": 3.0811405207804456e-07, "loss": 0.2196, "step": 16820 }, { "epoch": 2.7713303554887716, "grad_norm": 1.6378053426742554, "learning_rate": 3.0375048362444535e-07, "loss": 0.2241, "step": 16830 }, { "epoch": 2.772977089808773, "grad_norm": 1.4006156921386719, "learning_rate": 2.9941755822213704e-07, "loss": 0.2291, "step": 16840 }, { "epoch": 2.7746238241287746, "grad_norm": 1.677067756652832, "learning_rate": 2.95115289564456e-07, "loss": 0.2527, "step": 16850 }, { "epoch": 2.7762705584487763, "grad_norm": 1.372916340827942, "learning_rate": 2.9084369124786293e-07, "loss": 0.2291, "step": 16860 }, { "epoch": 2.777917292768778, "grad_norm": 1.4055744409561157, "learning_rate": 2.8660277677188487e-07, "loss": 0.2199, "step": 16870 }, { "epoch": 2.77956402708878, "grad_norm": 1.945945382118225, "learning_rate": 2.8239255953908305e-07, "loss": 0.2199, "step": 16880 }, { "epoch": 2.781210761408781, "grad_norm": 1.2029650211334229, "learning_rate": 2.782130528550031e-07, "loss": 0.2292, "step": 16890 }, { "epoch": 2.782857495728783, "grad_norm": 2.0258445739746094, "learning_rate": 2.740642699281382e-07, "loss": 0.2193, "step": 16900 }, { "epoch": 2.7845042300487846, "grad_norm": 1.2839769124984741, "learning_rate": 2.699462238698847e-07, "loss": 0.2206, "step": 16910 }, { "epoch": 2.7861509643687863, "grad_norm": 1.4832942485809326, "learning_rate": 2.6585892769450005e-07, "loss": 0.2196, "step": 16920 }, { "epoch": 2.787797698688788, "grad_norm": 1.6463637351989746, "learning_rate": 2.6180239431906284e-07, "loss": 0.2127, "step": 16930 }, { "epoch": 2.7894444330087893, "grad_norm": 1.6157695055007935, "learning_rate": 2.57776636563436e-07, "loss": 0.2241, "step": 16940 }, { "epoch": 2.791091167328791, "grad_norm": 1.478996753692627, "learning_rate": 2.537816671502158e-07, "loss": 0.2124, "step": 16950 }, { "epoch": 2.7927379016487928, "grad_norm": 1.402809739112854, "learning_rate": 2.498174987047042e-07, "loss": 0.2311, "step": 16960 }, { "epoch": 2.7943846359687945, "grad_norm": 1.756034016609192, "learning_rate": 2.458841437548587e-07, "loss": 0.208, "step": 16970 }, { "epoch": 2.7960313702887962, "grad_norm": 1.5966026782989502, "learning_rate": 2.4198161473126147e-07, "loss": 0.2216, "step": 16980 }, { "epoch": 2.7976781046087975, "grad_norm": 1.536316990852356, "learning_rate": 2.3810992396706812e-07, "loss": 0.2261, "step": 16990 }, { "epoch": 2.7993248389287992, "grad_norm": 2.4802510738372803, "learning_rate": 2.342690836979833e-07, "loss": 0.2265, "step": 17000 }, { "epoch": 2.800971573248801, "grad_norm": 1.9055966138839722, "learning_rate": 2.3045910606221078e-07, "loss": 0.2162, "step": 17010 }, { "epoch": 2.8026183075688027, "grad_norm": 1.6244001388549805, "learning_rate": 2.2668000310042237e-07, "loss": 0.2292, "step": 17020 }, { "epoch": 2.8042650418888044, "grad_norm": 1.7967705726623535, "learning_rate": 2.2293178675571236e-07, "loss": 0.2163, "step": 17030 }, { "epoch": 2.8059117762088057, "grad_norm": 1.432999610900879, "learning_rate": 2.1921446887356869e-07, "loss": 0.2227, "step": 17040 }, { "epoch": 2.8075585105288074, "grad_norm": 1.8171266317367554, "learning_rate": 2.1552806120182734e-07, "loss": 0.219, "step": 17050 }, { "epoch": 2.809205244848809, "grad_norm": 1.6710107326507568, "learning_rate": 2.1187257539064143e-07, "loss": 0.2164, "step": 17060 }, { "epoch": 2.810851979168811, "grad_norm": 2.254349708557129, "learning_rate": 2.0824802299243775e-07, "loss": 0.2159, "step": 17070 }, { "epoch": 2.8124987134888126, "grad_norm": 1.3687024116516113, "learning_rate": 2.0465441546189125e-07, "loss": 0.2183, "step": 17080 }, { "epoch": 2.814145447808814, "grad_norm": 1.6281770467758179, "learning_rate": 2.0109176415587294e-07, "loss": 0.2154, "step": 17090 }, { "epoch": 2.8157921821288157, "grad_norm": 1.8856794834136963, "learning_rate": 1.9756008033343211e-07, "loss": 0.213, "step": 17100 }, { "epoch": 2.8174389164488174, "grad_norm": 1.6289527416229248, "learning_rate": 1.940593751557429e-07, "loss": 0.2276, "step": 17110 }, { "epoch": 2.819085650768819, "grad_norm": 1.4464855194091797, "learning_rate": 1.9058965968608567e-07, "loss": 0.2174, "step": 17120 }, { "epoch": 2.820732385088821, "grad_norm": 1.4708118438720703, "learning_rate": 1.8715094488979568e-07, "loss": 0.2193, "step": 17130 }, { "epoch": 2.822379119408822, "grad_norm": 1.6330119371414185, "learning_rate": 1.837432416342444e-07, "loss": 0.2153, "step": 17140 }, { "epoch": 2.824025853728824, "grad_norm": 1.0777000188827515, "learning_rate": 1.8036656068879166e-07, "loss": 0.2284, "step": 17150 }, { "epoch": 2.8256725880488256, "grad_norm": 1.7415492534637451, "learning_rate": 1.770209127247635e-07, "loss": 0.2217, "step": 17160 }, { "epoch": 2.8273193223688273, "grad_norm": 1.8589420318603516, "learning_rate": 1.7370630831540668e-07, "loss": 0.2278, "step": 17170 }, { "epoch": 2.828966056688829, "grad_norm": 1.300451636314392, "learning_rate": 1.7042275793586416e-07, "loss": 0.212, "step": 17180 }, { "epoch": 2.8306127910088303, "grad_norm": 1.7537988424301147, "learning_rate": 1.671702719631374e-07, "loss": 0.222, "step": 17190 }, { "epoch": 2.8322595253288325, "grad_norm": 1.643229365348816, "learning_rate": 1.6394886067605752e-07, "loss": 0.2198, "step": 17200 }, { "epoch": 2.833906259648834, "grad_norm": 1.942588210105896, "learning_rate": 1.6075853425524646e-07, "loss": 0.2279, "step": 17210 }, { "epoch": 2.8355529939688355, "grad_norm": 1.780501365661621, "learning_rate": 1.5759930278309243e-07, "loss": 0.2243, "step": 17220 }, { "epoch": 2.8371997282888373, "grad_norm": 1.2578332424163818, "learning_rate": 1.5447117624371122e-07, "loss": 0.2193, "step": 17230 }, { "epoch": 2.838846462608839, "grad_norm": 1.4017175436019897, "learning_rate": 1.5137416452292164e-07, "loss": 0.1921, "step": 17240 }, { "epoch": 2.8404931969288407, "grad_norm": 1.6967748403549194, "learning_rate": 1.4830827740820453e-07, "loss": 0.233, "step": 17250 }, { "epoch": 2.842139931248842, "grad_norm": 1.5690455436706543, "learning_rate": 1.4527352458868494e-07, "loss": 0.2263, "step": 17260 }, { "epoch": 2.8437866655688437, "grad_norm": 2.2616426944732666, "learning_rate": 1.4226991565508662e-07, "loss": 0.2235, "step": 17270 }, { "epoch": 2.8454333998888455, "grad_norm": 2.0758249759674072, "learning_rate": 1.3929746009971434e-07, "loss": 0.2137, "step": 17280 }, { "epoch": 2.847080134208847, "grad_norm": 1.4178270101547241, "learning_rate": 1.3635616731641933e-07, "loss": 0.2044, "step": 17290 }, { "epoch": 2.848726868528849, "grad_norm": 1.5788546800613403, "learning_rate": 1.3344604660056494e-07, "loss": 0.2198, "step": 17300 }, { "epoch": 2.85037360284885, "grad_norm": 1.6083252429962158, "learning_rate": 1.3056710714900334e-07, "loss": 0.2158, "step": 17310 }, { "epoch": 2.852020337168852, "grad_norm": 1.376654863357544, "learning_rate": 1.2771935806004776e-07, "loss": 0.2158, "step": 17320 }, { "epoch": 2.8536670714888537, "grad_norm": 1.5676956176757812, "learning_rate": 1.2490280833343694e-07, "loss": 0.2127, "step": 17330 }, { "epoch": 2.8553138058088554, "grad_norm": 1.510061502456665, "learning_rate": 1.2211746687030958e-07, "loss": 0.2066, "step": 17340 }, { "epoch": 2.856960540128857, "grad_norm": 1.0092027187347412, "learning_rate": 1.1936334247318104e-07, "loss": 0.2072, "step": 17350 }, { "epoch": 2.8586072744488584, "grad_norm": 1.4783754348754883, "learning_rate": 1.1664044384590679e-07, "loss": 0.2055, "step": 17360 }, { "epoch": 2.86025400876886, "grad_norm": 1.3680094480514526, "learning_rate": 1.1394877959366223e-07, "loss": 0.2041, "step": 17370 }, { "epoch": 2.861900743088862, "grad_norm": 1.7390999794006348, "learning_rate": 1.1128835822291406e-07, "loss": 0.2109, "step": 17380 }, { "epoch": 2.8635474774088636, "grad_norm": 1.5219812393188477, "learning_rate": 1.0865918814138677e-07, "loss": 0.2101, "step": 17390 }, { "epoch": 2.8651942117288653, "grad_norm": 1.1699146032333374, "learning_rate": 1.060612776580483e-07, "loss": 0.227, "step": 17400 }, { "epoch": 2.8668409460488666, "grad_norm": 1.836059331893921, "learning_rate": 1.0349463498307233e-07, "loss": 0.2264, "step": 17410 }, { "epoch": 2.8684876803688684, "grad_norm": 1.6597439050674438, "learning_rate": 1.009592682278171e-07, "loss": 0.2205, "step": 17420 }, { "epoch": 2.87013441468887, "grad_norm": 1.5050686597824097, "learning_rate": 9.845518540480214e-08, "loss": 0.2203, "step": 17430 }, { "epoch": 2.871781149008872, "grad_norm": 1.6144781112670898, "learning_rate": 9.598239442767721e-08, "loss": 0.249, "step": 17440 }, { "epoch": 2.8734278833288736, "grad_norm": 1.542999029159546, "learning_rate": 9.354090311120334e-08, "loss": 0.2278, "step": 17450 }, { "epoch": 2.875074617648875, "grad_norm": 1.3249629735946655, "learning_rate": 9.113071917122407e-08, "loss": 0.2044, "step": 17460 }, { "epoch": 2.8767213519688766, "grad_norm": 1.4665296077728271, "learning_rate": 8.875185022464094e-08, "loss": 0.2165, "step": 17470 }, { "epoch": 2.8783680862888783, "grad_norm": 1.5788675546646118, "learning_rate": 8.640430378939246e-08, "loss": 0.2187, "step": 17480 }, { "epoch": 2.88001482060888, "grad_norm": 1.677793025970459, "learning_rate": 8.408808728442963e-08, "loss": 0.2058, "step": 17490 }, { "epoch": 2.8816615549288818, "grad_norm": 1.5491596460342407, "learning_rate": 8.180320802968822e-08, "loss": 0.2036, "step": 17500 }, { "epoch": 2.883308289248883, "grad_norm": 1.7089343070983887, "learning_rate": 7.95496732460721e-08, "loss": 0.2199, "step": 17510 }, { "epoch": 2.8849550235688852, "grad_norm": 1.4340360164642334, "learning_rate": 7.732749005542439e-08, "loss": 0.2135, "step": 17520 }, { "epoch": 2.8866017578888865, "grad_norm": 1.716632604598999, "learning_rate": 7.51366654805108e-08, "loss": 0.2367, "step": 17530 }, { "epoch": 2.8882484922088882, "grad_norm": 1.836976170539856, "learning_rate": 7.297720644499073e-08, "loss": 0.2203, "step": 17540 }, { "epoch": 2.88989522652889, "grad_norm": 1.2886078357696533, "learning_rate": 7.084911977340404e-08, "loss": 0.2142, "step": 17550 }, { "epoch": 2.8915419608488917, "grad_norm": 1.4025774002075195, "learning_rate": 6.875241219113982e-08, "loss": 0.2067, "step": 17560 }, { "epoch": 2.8931886951688934, "grad_norm": 1.7300608158111572, "learning_rate": 6.66870903244221e-08, "loss": 0.2204, "step": 17570 }, { "epoch": 2.8948354294888947, "grad_norm": 1.5215460062026978, "learning_rate": 6.465316070028538e-08, "loss": 0.2343, "step": 17580 }, { "epoch": 2.8964821638088964, "grad_norm": 1.8203786611557007, "learning_rate": 6.265062974655789e-08, "loss": 0.21, "step": 17590 }, { "epoch": 2.898128898128898, "grad_norm": 1.4137710332870483, "learning_rate": 6.067950379183619e-08, "loss": 0.224, "step": 17600 }, { "epoch": 2.8997756324489, "grad_norm": 1.8194389343261719, "learning_rate": 5.87397890654684e-08, "loss": 0.2254, "step": 17610 }, { "epoch": 2.9014223667689016, "grad_norm": 2.260550022125244, "learning_rate": 5.683149169753433e-08, "loss": 0.2237, "step": 17620 }, { "epoch": 2.903069101088903, "grad_norm": 1.4916115999221802, "learning_rate": 5.4954617718823154e-08, "loss": 0.2235, "step": 17630 }, { "epoch": 2.9047158354089047, "grad_norm": 1.691039800643921, "learning_rate": 5.3109173060820196e-08, "loss": 0.2222, "step": 17640 }, { "epoch": 2.9063625697289064, "grad_norm": 1.4430158138275146, "learning_rate": 5.129516355568354e-08, "loss": 0.2101, "step": 17650 }, { "epoch": 2.908009304048908, "grad_norm": 1.406510829925537, "learning_rate": 4.9512594936224065e-08, "loss": 0.2136, "step": 17660 }, { "epoch": 2.90965603836891, "grad_norm": 1.539867639541626, "learning_rate": 4.776147283589438e-08, "loss": 0.2305, "step": 17670 }, { "epoch": 2.911302772688911, "grad_norm": 1.5464060306549072, "learning_rate": 4.6041802788762136e-08, "loss": 0.218, "step": 17680 }, { "epoch": 2.912949507008913, "grad_norm": 1.3350013494491577, "learning_rate": 4.435359022950336e-08, "loss": 0.2234, "step": 17690 }, { "epoch": 2.9145962413289146, "grad_norm": 1.1990008354187012, "learning_rate": 4.269684049337142e-08, "loss": 0.2208, "step": 17700 }, { "epoch": 2.9162429756489163, "grad_norm": 1.4853509664535522, "learning_rate": 4.1071558816193626e-08, "loss": 0.219, "step": 17710 }, { "epoch": 2.917889709968918, "grad_norm": 1.4338077306747437, "learning_rate": 3.947775033434575e-08, "loss": 0.2077, "step": 17720 }, { "epoch": 2.9195364442889193, "grad_norm": 2.2821855545043945, "learning_rate": 3.7915420084740915e-08, "loss": 0.2177, "step": 17730 }, { "epoch": 2.921183178608921, "grad_norm": 1.7811158895492554, "learning_rate": 3.6384573004808465e-08, "loss": 0.2172, "step": 17740 }, { "epoch": 2.922829912928923, "grad_norm": 1.8270400762557983, "learning_rate": 3.488521393248401e-08, "loss": 0.2127, "step": 17750 }, { "epoch": 2.9244766472489245, "grad_norm": 1.455257534980774, "learning_rate": 3.341734760619275e-08, "loss": 0.2223, "step": 17760 }, { "epoch": 2.9261233815689263, "grad_norm": 1.5582276582717896, "learning_rate": 3.198097866483063e-08, "loss": 0.2194, "step": 17770 }, { "epoch": 2.9277701158889275, "grad_norm": 1.5785218477249146, "learning_rate": 3.0576111647752096e-08, "loss": 0.2032, "step": 17780 }, { "epoch": 2.9294168502089293, "grad_norm": 1.5375938415527344, "learning_rate": 2.920275099476011e-08, "loss": 0.2294, "step": 17790 }, { "epoch": 2.931063584528931, "grad_norm": 2.171330451965332, "learning_rate": 2.7860901046082856e-08, "loss": 0.2358, "step": 17800 }, { "epoch": 2.9327103188489327, "grad_norm": 1.675736665725708, "learning_rate": 2.6550566042370386e-08, "loss": 0.2215, "step": 17810 }, { "epoch": 2.9343570531689345, "grad_norm": 2.0025336742401123, "learning_rate": 2.5271750124672423e-08, "loss": 0.2151, "step": 17820 }, { "epoch": 2.9360037874889358, "grad_norm": 1.4711110591888428, "learning_rate": 2.4024457334430595e-08, "loss": 0.2173, "step": 17830 }, { "epoch": 2.9376505218089375, "grad_norm": 1.8637895584106445, "learning_rate": 2.2808691613461776e-08, "loss": 0.2215, "step": 17840 }, { "epoch": 2.939297256128939, "grad_norm": 2.4678914546966553, "learning_rate": 2.162445680395142e-08, "loss": 0.2373, "step": 17850 }, { "epoch": 2.940943990448941, "grad_norm": 1.4459493160247803, "learning_rate": 2.0471756648435814e-08, "loss": 0.2033, "step": 17860 }, { "epoch": 2.9425907247689427, "grad_norm": 1.8387855291366577, "learning_rate": 1.9350594789792064e-08, "loss": 0.2107, "step": 17870 }, { "epoch": 2.944237459088944, "grad_norm": 1.8373688459396362, "learning_rate": 1.8260974771227015e-08, "loss": 0.2318, "step": 17880 }, { "epoch": 2.945884193408946, "grad_norm": 1.6983997821807861, "learning_rate": 1.7202900036268343e-08, "loss": 0.2178, "step": 17890 }, { "epoch": 2.9475309277289474, "grad_norm": 1.5288745164871216, "learning_rate": 1.6176373928745715e-08, "loss": 0.2205, "step": 17900 }, { "epoch": 2.949177662048949, "grad_norm": 1.6072468757629395, "learning_rate": 1.5181399692790756e-08, "loss": 0.2076, "step": 17910 }, { "epoch": 2.950824396368951, "grad_norm": 3.108124017715454, "learning_rate": 1.4217980472819304e-08, "loss": 0.233, "step": 17920 }, { "epoch": 2.9524711306889526, "grad_norm": 1.8453097343444824, "learning_rate": 1.3286119313525858e-08, "loss": 0.2303, "step": 17930 }, { "epoch": 2.9541178650089543, "grad_norm": 2.0312986373901367, "learning_rate": 1.2385819159869138e-08, "loss": 0.2122, "step": 17940 }, { "epoch": 2.9557645993289556, "grad_norm": 1.338973879814148, "learning_rate": 1.1517082857067652e-08, "loss": 0.2248, "step": 17950 }, { "epoch": 2.9574113336489574, "grad_norm": 1.7432622909545898, "learning_rate": 1.0679913150588584e-08, "loss": 0.2213, "step": 17960 }, { "epoch": 2.959058067968959, "grad_norm": 1.9854906797409058, "learning_rate": 9.87431268613781e-09, "loss": 0.2312, "step": 17970 }, { "epoch": 2.960704802288961, "grad_norm": 1.817718267440796, "learning_rate": 9.100284009655458e-09, "loss": 0.2276, "step": 17980 }, { "epoch": 2.9623515366089626, "grad_norm": 1.655806064605713, "learning_rate": 8.357829567302577e-09, "loss": 0.2261, "step": 17990 }, { "epoch": 2.963998270928964, "grad_norm": 1.608216643333435, "learning_rate": 7.646951705457817e-09, "loss": 0.2112, "step": 18000 } ], "logging_steps": 10, "max_steps": 18219, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2831001863587446e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }