| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.225, | |
| "eval_steps": 500, | |
| "global_step": 1800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00125, | |
| "grad_norm": 0.07407524436712265, | |
| "learning_rate": 2.25e-06, | |
| "loss": 2.2311376571655273, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 0.07508661597967148, | |
| "learning_rate": 4.75e-06, | |
| "loss": 2.254044532775879, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.00375, | |
| "grad_norm": 0.08043646067380905, | |
| "learning_rate": 7.25e-06, | |
| "loss": 2.2791187286376955, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.07152171432971954, | |
| "learning_rate": 9.750000000000002e-06, | |
| "loss": 2.29660701751709, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.00625, | |
| "grad_norm": 0.07239466905593872, | |
| "learning_rate": 1.225e-05, | |
| "loss": 2.2157821655273438, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 0.07063177227973938, | |
| "learning_rate": 1.475e-05, | |
| "loss": 2.2415660858154296, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.00875, | |
| "grad_norm": 0.07137205451726913, | |
| "learning_rate": 1.725e-05, | |
| "loss": 2.188554573059082, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.07028814405202866, | |
| "learning_rate": 1.9750000000000002e-05, | |
| "loss": 2.177708053588867, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01125, | |
| "grad_norm": 0.07327590137720108, | |
| "learning_rate": 2.2250000000000002e-05, | |
| "loss": 2.22082405090332, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 0.06442452222108841, | |
| "learning_rate": 2.4750000000000002e-05, | |
| "loss": 2.2364925384521483, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01375, | |
| "grad_norm": 0.06526491045951843, | |
| "learning_rate": 2.725e-05, | |
| "loss": 2.2664451599121094, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.06679920852184296, | |
| "learning_rate": 2.975e-05, | |
| "loss": 2.188519287109375, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.01625, | |
| "grad_norm": 0.07065364718437195, | |
| "learning_rate": 3.2250000000000005e-05, | |
| "loss": 2.270803451538086, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 0.07348085194826126, | |
| "learning_rate": 3.475e-05, | |
| "loss": 2.213222122192383, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.01875, | |
| "grad_norm": 0.0695674866437912, | |
| "learning_rate": 3.7250000000000004e-05, | |
| "loss": 2.208852767944336, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.07005158066749573, | |
| "learning_rate": 3.9750000000000004e-05, | |
| "loss": 2.161180305480957, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.02125, | |
| "grad_norm": 0.06879838556051254, | |
| "learning_rate": 4.2250000000000004e-05, | |
| "loss": 2.1557788848876953, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 0.07037137448787689, | |
| "learning_rate": 4.4750000000000004e-05, | |
| "loss": 2.2376060485839844, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.02375, | |
| "grad_norm": 0.06982705742120743, | |
| "learning_rate": 4.7249999999999997e-05, | |
| "loss": 2.217011260986328, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.06674907356500626, | |
| "learning_rate": 4.975e-05, | |
| "loss": 2.256929397583008, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02625, | |
| "grad_norm": 0.07002667337656021, | |
| "learning_rate": 4.999983575010662e-05, | |
| "loss": 2.179422378540039, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.0717761367559433, | |
| "learning_rate": 4.9999267975468225e-05, | |
| "loss": 2.2249755859375, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.02875, | |
| "grad_norm": 0.07136455923318863, | |
| "learning_rate": 4.9998294657516734e-05, | |
| "loss": 2.222713088989258, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.07328379899263382, | |
| "learning_rate": 4.999691581204152e-05, | |
| "loss": 2.1884521484375, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03125, | |
| "grad_norm": 0.07197773456573486, | |
| "learning_rate": 4.999513146141048e-05, | |
| "loss": 2.2087806701660155, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.07391443103551865, | |
| "learning_rate": 4.999294163456975e-05, | |
| "loss": 2.2307754516601563, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.03375, | |
| "grad_norm": 0.07325068861246109, | |
| "learning_rate": 4.9990346367043114e-05, | |
| "loss": 2.198223114013672, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.07103461772203445, | |
| "learning_rate": 4.9987345700931586e-05, | |
| "loss": 2.228559112548828, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.03625, | |
| "grad_norm": 0.07458475232124329, | |
| "learning_rate": 4.998393968491256e-05, | |
| "loss": 2.1933670043945312, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.06882892549037933, | |
| "learning_rate": 4.9980128374239156e-05, | |
| "loss": 2.2323123931884767, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03875, | |
| "grad_norm": 0.07062669098377228, | |
| "learning_rate": 4.997591183073923e-05, | |
| "loss": 2.2588329315185547, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.07071756571531296, | |
| "learning_rate": 4.997129012281443e-05, | |
| "loss": 2.11534423828125, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.04125, | |
| "grad_norm": 0.06557808071374893, | |
| "learning_rate": 4.996626332543907e-05, | |
| "loss": 2.1684688568115233, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.0680045560002327, | |
| "learning_rate": 4.9960831520158904e-05, | |
| "loss": 2.1954887390136717, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.04375, | |
| "grad_norm": 0.07579149305820465, | |
| "learning_rate": 4.9954994795089786e-05, | |
| "loss": 2.120090103149414, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.07108975201845169, | |
| "learning_rate": 4.994875324491629e-05, | |
| "loss": 2.1955142974853517, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.04625, | |
| "grad_norm": 0.07410898804664612, | |
| "learning_rate": 4.994210697089014e-05, | |
| "loss": 2.2114063262939454, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.06744780391454697, | |
| "learning_rate": 4.993505608082857e-05, | |
| "loss": 2.1475910186767577, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.04875, | |
| "grad_norm": 0.06901328265666962, | |
| "learning_rate": 4.992760068911258e-05, | |
| "loss": 2.2015354156494142, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.0716853216290474, | |
| "learning_rate": 4.991974091668509e-05, | |
| "loss": 2.2013004302978514, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05125, | |
| "grad_norm": 0.07426904886960983, | |
| "learning_rate": 4.991147689104895e-05, | |
| "loss": 2.170287322998047, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.06732422858476639, | |
| "learning_rate": 4.990280874626491e-05, | |
| "loss": 2.1896934509277344, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.05375, | |
| "grad_norm": 0.06560606509447098, | |
| "learning_rate": 4.98937366229494e-05, | |
| "loss": 2.2104034423828125, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.06944538652896881, | |
| "learning_rate": 4.98842606682723e-05, | |
| "loss": 2.126712989807129, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.05625, | |
| "grad_norm": 0.06867734342813492, | |
| "learning_rate": 4.98743810359545e-05, | |
| "loss": 2.234315299987793, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.07116754353046417, | |
| "learning_rate": 4.986409788626546e-05, | |
| "loss": 2.184938430786133, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.05875, | |
| "grad_norm": 0.06651467084884644, | |
| "learning_rate": 4.985341138602056e-05, | |
| "loss": 2.152230453491211, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.06930788606405258, | |
| "learning_rate": 4.984232170857842e-05, | |
| "loss": 2.181766891479492, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.06125, | |
| "grad_norm": 0.07047919183969498, | |
| "learning_rate": 4.9830829033838096e-05, | |
| "loss": 2.187181091308594, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.07572918385267258, | |
| "learning_rate": 4.981893354823614e-05, | |
| "loss": 2.1816110610961914, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06375, | |
| "grad_norm": 0.07347270846366882, | |
| "learning_rate": 4.9806635444743595e-05, | |
| "loss": 2.2242454528808593, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.06983183324337006, | |
| "learning_rate": 4.979393492286284e-05, | |
| "loss": 2.1648427963256838, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.06625, | |
| "grad_norm": 0.06904461979866028, | |
| "learning_rate": 4.9780832188624375e-05, | |
| "loss": 2.1993919372558595, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 0.0692647248506546, | |
| "learning_rate": 4.976732745458348e-05, | |
| "loss": 2.1890403747558596, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.06875, | |
| "grad_norm": 0.06891040503978729, | |
| "learning_rate": 4.975342093981675e-05, | |
| "loss": 2.1854305267333984, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.0714147537946701, | |
| "learning_rate": 4.973911286991856e-05, | |
| "loss": 2.219981002807617, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.07125, | |
| "grad_norm": 0.07621099054813385, | |
| "learning_rate": 4.9724403476997384e-05, | |
| "loss": 2.182698440551758, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.07603556662797928, | |
| "learning_rate": 4.970929299967204e-05, | |
| "loss": 2.1696598052978517, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.07375, | |
| "grad_norm": 0.07282973825931549, | |
| "learning_rate": 4.969378168306784e-05, | |
| "loss": 2.1716108322143555, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.06459035724401474, | |
| "learning_rate": 4.967786977881257e-05, | |
| "loss": 2.1681615829467775, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07625, | |
| "grad_norm": 0.07177098840475082, | |
| "learning_rate": 4.966155754503245e-05, | |
| "loss": 2.175799560546875, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 0.06652142852544785, | |
| "learning_rate": 4.964484524634792e-05, | |
| "loss": 2.1907752990722655, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.07875, | |
| "grad_norm": 0.0727977305650711, | |
| "learning_rate": 4.962773315386935e-05, | |
| "loss": 2.1232975006103514, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.07146522402763367, | |
| "learning_rate": 4.961022154519267e-05, | |
| "loss": 2.1747344970703124, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.08125, | |
| "grad_norm": 0.06721191853284836, | |
| "learning_rate": 4.959231070439482e-05, | |
| "loss": 2.2015512466430662, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 0.06989260762929916, | |
| "learning_rate": 4.957400092202917e-05, | |
| "loss": 2.187305450439453, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.08375, | |
| "grad_norm": 0.06762924790382385, | |
| "learning_rate": 4.955529249512082e-05, | |
| "loss": 2.1369449615478517, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.06524205207824707, | |
| "learning_rate": 4.953618572716175e-05, | |
| "loss": 2.162923049926758, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.08625, | |
| "grad_norm": 0.0713450163602829, | |
| "learning_rate": 4.951668092810593e-05, | |
| "loss": 2.2041025161743164, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.0673929825425148, | |
| "learning_rate": 4.949677841436423e-05, | |
| "loss": 2.124150848388672, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.08875, | |
| "grad_norm": 0.06595680117607117, | |
| "learning_rate": 4.947647850879938e-05, | |
| "loss": 2.1866949081420897, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.06935486942529678, | |
| "learning_rate": 4.945578154072065e-05, | |
| "loss": 2.168051528930664, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.09125, | |
| "grad_norm": 0.07811375707387924, | |
| "learning_rate": 4.943468784587856e-05, | |
| "loss": 2.2397434234619142, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 0.06744036078453064, | |
| "learning_rate": 4.9413197766459394e-05, | |
| "loss": 2.206024169921875, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.09375, | |
| "grad_norm": 0.06797294318675995, | |
| "learning_rate": 4.939131165107969e-05, | |
| "loss": 2.16088981628418, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.06975241005420685, | |
| "learning_rate": 4.936902985478055e-05, | |
| "loss": 2.2246139526367186, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.09625, | |
| "grad_norm": 0.0649324357509613, | |
| "learning_rate": 4.9346352739021895e-05, | |
| "loss": 2.1035717010498045, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 0.0682821199297905, | |
| "learning_rate": 4.93232806716766e-05, | |
| "loss": 2.1523895263671875, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.09875, | |
| "grad_norm": 0.06782221794128418, | |
| "learning_rate": 4.9299814027024536e-05, | |
| "loss": 2.199502182006836, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.06508056074380875, | |
| "learning_rate": 4.927595318574649e-05, | |
| "loss": 2.1637832641601564, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10125, | |
| "grad_norm": 0.06774035096168518, | |
| "learning_rate": 4.9251698534917965e-05, | |
| "loss": 2.227591323852539, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 0.06584357470273972, | |
| "learning_rate": 4.9227050468002954e-05, | |
| "loss": 2.204555130004883, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.10375, | |
| "grad_norm": 0.06837441772222519, | |
| "learning_rate": 4.92020093848475e-05, | |
| "loss": 2.1817317962646485, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.06870482861995697, | |
| "learning_rate": 4.9176575691673265e-05, | |
| "loss": 2.1889968872070313, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.10625, | |
| "grad_norm": 0.06784000247716904, | |
| "learning_rate": 4.9150749801070884e-05, | |
| "loss": 2.1590961456298827, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 0.06860610842704773, | |
| "learning_rate": 4.912453213199332e-05, | |
| "loss": 2.1836381912231446, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.10875, | |
| "grad_norm": 0.06371141225099564, | |
| "learning_rate": 4.909792310974904e-05, | |
| "loss": 2.136852264404297, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.06636381894350052, | |
| "learning_rate": 4.9070923165995135e-05, | |
| "loss": 2.123456573486328, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.11125, | |
| "grad_norm": 0.06704560667276382, | |
| "learning_rate": 4.9043532738730284e-05, | |
| "loss": 2.210599899291992, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.06560536473989487, | |
| "learning_rate": 4.901575227228769e-05, | |
| "loss": 2.1686206817626954, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.11375, | |
| "grad_norm": 0.06711073964834213, | |
| "learning_rate": 4.898758221732783e-05, | |
| "loss": 2.1291219711303713, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.07313625514507294, | |
| "learning_rate": 4.89590230308312e-05, | |
| "loss": 2.1620079040527345, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.11625, | |
| "grad_norm": 0.07415210455656052, | |
| "learning_rate": 4.8930075176090844e-05, | |
| "loss": 2.2015281677246095, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 0.06644522398710251, | |
| "learning_rate": 4.890073912270486e-05, | |
| "loss": 2.154953384399414, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.11875, | |
| "grad_norm": 0.0653102844953537, | |
| "learning_rate": 4.887101534656882e-05, | |
| "loss": 2.186296081542969, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.06608347594738007, | |
| "learning_rate": 4.8840904329867955e-05, | |
| "loss": 2.1376876831054688, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.12125, | |
| "grad_norm": 0.06143832579255104, | |
| "learning_rate": 4.881040656106944e-05, | |
| "loss": 2.1134971618652343, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 0.06736340373754501, | |
| "learning_rate": 4.8779522534914414e-05, | |
| "loss": 2.1506475448608398, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.12375, | |
| "grad_norm": 0.06936616450548172, | |
| "learning_rate": 4.874825275240996e-05, | |
| "loss": 2.125407028198242, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.07099438458681107, | |
| "learning_rate": 4.871659772082097e-05, | |
| "loss": 2.2163812637329103, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.12625, | |
| "grad_norm": 0.06724845618009567, | |
| "learning_rate": 4.868455795366195e-05, | |
| "loss": 2.1106163024902345, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 0.0672367662191391, | |
| "learning_rate": 4.8652133970688636e-05, | |
| "loss": 2.071784019470215, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.12875, | |
| "grad_norm": 0.06996046751737595, | |
| "learning_rate": 4.861932629788962e-05, | |
| "loss": 2.198385238647461, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.06979460269212723, | |
| "learning_rate": 4.858613546747777e-05, | |
| "loss": 2.111052894592285, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.13125, | |
| "grad_norm": 0.06372442096471786, | |
| "learning_rate": 4.8552562017881634e-05, | |
| "loss": 2.133512496948242, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 0.06758279353380203, | |
| "learning_rate": 4.851860649373666e-05, | |
| "loss": 2.1648277282714843, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.13375, | |
| "grad_norm": 0.07092203199863434, | |
| "learning_rate": 4.848426944587644e-05, | |
| "loss": 2.1862049102783203, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 0.06371688097715378, | |
| "learning_rate": 4.844955143132366e-05, | |
| "loss": 2.197888946533203, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.13625, | |
| "grad_norm": 0.06636802852153778, | |
| "learning_rate": 4.841445301328117e-05, | |
| "loss": 2.172176742553711, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 0.06878487765789032, | |
| "learning_rate": 4.8378974761122794e-05, | |
| "loss": 2.2152605056762695, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.13875, | |
| "grad_norm": 0.06840436160564423, | |
| "learning_rate": 4.834311725038409e-05, | |
| "loss": 2.2093944549560547, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.06905412673950195, | |
| "learning_rate": 4.830688106275305e-05, | |
| "loss": 2.1413171768188475, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.14125, | |
| "grad_norm": 0.06591488420963287, | |
| "learning_rate": 4.827026678606063e-05, | |
| "loss": 2.2281288146972655, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 0.06814192980527878, | |
| "learning_rate": 4.823327501427121e-05, | |
| "loss": 2.241679000854492, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.14375, | |
| "grad_norm": 0.06960905343294144, | |
| "learning_rate": 4.8195906347473e-05, | |
| "loss": 2.1834861755371096, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 0.07025773823261261, | |
| "learning_rate": 4.8158161391868276e-05, | |
| "loss": 2.2492515563964846, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.14625, | |
| "grad_norm": 0.06904685497283936, | |
| "learning_rate": 4.812004075976352e-05, | |
| "loss": 2.1875804901123046, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 0.06693335622549057, | |
| "learning_rate": 4.808154506955955e-05, | |
| "loss": 2.23748779296875, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.14875, | |
| "grad_norm": 0.06956436485052109, | |
| "learning_rate": 4.804267494574145e-05, | |
| "loss": 2.2373470306396483, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.07944408059120178, | |
| "learning_rate": 4.800343101886842e-05, | |
| "loss": 2.2595752716064452, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.15125, | |
| "grad_norm": 0.06570149958133698, | |
| "learning_rate": 4.7963813925563586e-05, | |
| "loss": 2.2755199432373048, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 0.06871885061264038, | |
| "learning_rate": 4.792382430850366e-05, | |
| "loss": 2.2775409698486326, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.15375, | |
| "grad_norm": 0.06904297322034836, | |
| "learning_rate": 4.788346281640852e-05, | |
| "loss": 2.2760135650634767, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 0.06859485059976578, | |
| "learning_rate": 4.784273010403065e-05, | |
| "loss": 2.259684753417969, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 0.06464747339487076, | |
| "learning_rate": 4.780162683214457e-05, | |
| "loss": 2.308213806152344, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 0.06627296656370163, | |
| "learning_rate": 4.77601536675361e-05, | |
| "loss": 2.3249114990234374, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.15875, | |
| "grad_norm": 0.06741499155759811, | |
| "learning_rate": 4.77183112829915e-05, | |
| "loss": 2.350922393798828, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.06964308768510818, | |
| "learning_rate": 4.7676100357286624e-05, | |
| "loss": 2.244986915588379, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.16125, | |
| "grad_norm": 0.0634695515036583, | |
| "learning_rate": 4.7633521575175874e-05, | |
| "loss": 2.2805355072021483, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.06915149837732315, | |
| "learning_rate": 4.759057562738109e-05, | |
| "loss": 2.3919076919555664, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.16375, | |
| "grad_norm": 0.06478839367628098, | |
| "learning_rate": 4.754726321058034e-05, | |
| "loss": 2.259754180908203, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.06912291049957275, | |
| "learning_rate": 4.7503585027396646e-05, | |
| "loss": 2.2800014495849608, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.16625, | |
| "grad_norm": 0.06265991181135178, | |
| "learning_rate": 4.745954178638656e-05, | |
| "loss": 2.291925811767578, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 0.06898439675569534, | |
| "learning_rate": 4.741513420202867e-05, | |
| "loss": 2.360172080993652, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.16875, | |
| "grad_norm": 0.06620923429727554, | |
| "learning_rate": 4.7370362994712036e-05, | |
| "loss": 2.368211364746094, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.07054638862609863, | |
| "learning_rate": 4.732522889072447e-05, | |
| "loss": 2.3170215606689455, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.17125, | |
| "grad_norm": 0.07074211537837982, | |
| "learning_rate": 4.7279732622240766e-05, | |
| "loss": 2.276025581359863, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 0.06905434280633926, | |
| "learning_rate": 4.723387492731084e-05, | |
| "loss": 2.2982337951660154, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.17375, | |
| "grad_norm": 0.06408198177814484, | |
| "learning_rate": 4.718765654984773e-05, | |
| "loss": 2.3407756805419924, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.06612099707126617, | |
| "learning_rate": 4.7141078239615566e-05, | |
| "loss": 2.2696388244628904, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.17625, | |
| "grad_norm": 0.06839219480752945, | |
| "learning_rate": 4.709414075221734e-05, | |
| "loss": 2.227347755432129, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 0.06748591363430023, | |
| "learning_rate": 4.7046844849082725e-05, | |
| "loss": 2.310345458984375, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.17875, | |
| "grad_norm": 0.06979987025260925, | |
| "learning_rate": 4.699919129745567e-05, | |
| "loss": 2.3062816619873048, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.06655360758304596, | |
| "learning_rate": 4.695118087038198e-05, | |
| "loss": 2.30033016204834, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.18125, | |
| "grad_norm": 0.0673886388540268, | |
| "learning_rate": 4.690281434669677e-05, | |
| "loss": 2.3160564422607424, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1825, | |
| "grad_norm": 0.0680936723947525, | |
| "learning_rate": 4.685409251101183e-05, | |
| "loss": 2.378013801574707, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.18375, | |
| "grad_norm": 0.07122587412595749, | |
| "learning_rate": 4.680501615370288e-05, | |
| "loss": 2.2907699584960937, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 0.07071765512228012, | |
| "learning_rate": 4.675558607089677e-05, | |
| "loss": 2.3167457580566406, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.18625, | |
| "grad_norm": 0.07135665416717529, | |
| "learning_rate": 4.6705803064458575e-05, | |
| "loss": 2.36193962097168, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.07396227866411209, | |
| "learning_rate": 4.665566794197854e-05, | |
| "loss": 2.3736957550048827, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.18875, | |
| "grad_norm": 0.06626343727111816, | |
| "learning_rate": 4.6605181516759047e-05, | |
| "loss": 2.3442031860351564, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.06523022055625916, | |
| "learning_rate": 4.6554344607801335e-05, | |
| "loss": 2.3258100509643556, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.19125, | |
| "grad_norm": 0.06743918359279633, | |
| "learning_rate": 4.6503158039792324e-05, | |
| "loss": 2.286594009399414, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.1925, | |
| "grad_norm": 0.06837300956249237, | |
| "learning_rate": 4.645162264309112e-05, | |
| "loss": 2.320156288146973, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.19375, | |
| "grad_norm": 0.06472747027873993, | |
| "learning_rate": 4.639973925371562e-05, | |
| "loss": 2.39737548828125, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 0.06730551272630692, | |
| "learning_rate": 4.634750871332896e-05, | |
| "loss": 2.3250579833984375, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.19625, | |
| "grad_norm": 0.06307782232761383, | |
| "learning_rate": 4.6294931869225774e-05, | |
| "loss": 2.300210189819336, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.1975, | |
| "grad_norm": 0.06609486788511276, | |
| "learning_rate": 4.6242009574318554e-05, | |
| "loss": 2.2991945266723635, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.19875, | |
| "grad_norm": 0.0668780580163002, | |
| "learning_rate": 4.6188742687123754e-05, | |
| "loss": 2.2811546325683594, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.07088489830493927, | |
| "learning_rate": 4.6135132071747864e-05, | |
| "loss": 2.32708740234375, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.20125, | |
| "grad_norm": 0.06599114835262299, | |
| "learning_rate": 4.608117859787342e-05, | |
| "loss": 2.26273250579834, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2025, | |
| "grad_norm": 0.06767764687538147, | |
| "learning_rate": 4.602688314074487e-05, | |
| "loss": 2.3097578048706056, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.20375, | |
| "grad_norm": 0.06805308163166046, | |
| "learning_rate": 4.597224658115438e-05, | |
| "loss": 2.322467231750488, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 0.06872619688510895, | |
| "learning_rate": 4.5917269805427567e-05, | |
| "loss": 2.303860092163086, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.20625, | |
| "grad_norm": 0.06846166402101517, | |
| "learning_rate": 4.5861953705409086e-05, | |
| "loss": 2.2648983001708984, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2075, | |
| "grad_norm": 0.06966902315616608, | |
| "learning_rate": 4.58062991784482e-05, | |
| "loss": 2.3156864166259767, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.20875, | |
| "grad_norm": 0.06632312387228012, | |
| "learning_rate": 4.575030712738419e-05, | |
| "loss": 2.302859878540039, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.07102814316749573, | |
| "learning_rate": 4.5693978460531725e-05, | |
| "loss": 2.3624195098876952, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.21125, | |
| "grad_norm": 0.06857810169458389, | |
| "learning_rate": 4.563731409166615e-05, | |
| "loss": 2.360994338989258, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 0.06877714395523071, | |
| "learning_rate": 4.558031494000858e-05, | |
| "loss": 2.322928237915039, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.21375, | |
| "grad_norm": 0.07134252786636353, | |
| "learning_rate": 4.5522981930211114e-05, | |
| "loss": 2.2678380966186524, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 0.07036204636096954, | |
| "learning_rate": 4.546531599234173e-05, | |
| "loss": 2.314472770690918, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.21625, | |
| "grad_norm": 0.06794637441635132, | |
| "learning_rate": 4.540731806186922e-05, | |
| "loss": 2.385559844970703, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2175, | |
| "grad_norm": 0.06780719757080078, | |
| "learning_rate": 4.5348989079648065e-05, | |
| "loss": 2.261928939819336, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.21875, | |
| "grad_norm": 0.06747139990329742, | |
| "learning_rate": 4.5290329991903115e-05, | |
| "loss": 2.295556831359863, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.06255707144737244, | |
| "learning_rate": 4.5231341750214256e-05, | |
| "loss": 2.338383102416992, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.22125, | |
| "grad_norm": 0.06699945777654648, | |
| "learning_rate": 4.5172025311501004e-05, | |
| "loss": 2.2839550018310546, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.2225, | |
| "grad_norm": 0.06443452835083008, | |
| "learning_rate": 4.511238163800692e-05, | |
| "loss": 2.3070682525634765, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.22375, | |
| "grad_norm": 0.06801515817642212, | |
| "learning_rate": 4.505241169728407e-05, | |
| "loss": 2.313974952697754, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.06966620683670044, | |
| "learning_rate": 4.499211646217727e-05, | |
| "loss": 2.2580539703369142, | |
| "step": 1800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1075411496665088e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |