| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9980430528375734, | |
| "eval_steps": 500, | |
| "global_step": 1149, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02609262883235486, | |
| "grad_norm": 2.5580028899176965, | |
| "learning_rate": 5e-06, | |
| "loss": 0.9679, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05218525766470972, | |
| "grad_norm": 1.2382684301314397, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8485, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07827788649706457, | |
| "grad_norm": 1.7244888669737717, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8086, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10437051532941943, | |
| "grad_norm": 0.6684245608414933, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7921, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1304631441617743, | |
| "grad_norm": 1.2452393350160769, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7827, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15655577299412915, | |
| "grad_norm": 0.596876900380745, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7727, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.182648401826484, | |
| "grad_norm": 0.6733648796408476, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7657, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.20874103065883887, | |
| "grad_norm": 0.6054360604030639, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7592, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.23483365949119372, | |
| "grad_norm": 0.6078628726535039, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7575, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2609262883235486, | |
| "grad_norm": 0.667808601099263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7535, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.28701891715590344, | |
| "grad_norm": 0.5998036644971939, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7505, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3131115459882583, | |
| "grad_norm": 0.601234098332995, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7462, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.33920417482061316, | |
| "grad_norm": 0.5470502921448889, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7464, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.365296803652968, | |
| "grad_norm": 0.5354881588471849, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7448, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3913894324853229, | |
| "grad_norm": 0.638179541327912, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7403, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.41748206131767773, | |
| "grad_norm": 0.7539597801432362, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7415, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4435746901500326, | |
| "grad_norm": 0.6349380651529639, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7397, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.46966731898238745, | |
| "grad_norm": 0.6486113350477142, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7346, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4957599478147423, | |
| "grad_norm": 0.524423192769518, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7336, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5218525766470972, | |
| "grad_norm": 0.8675137881298488, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7357, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.547945205479452, | |
| "grad_norm": 0.6181294717672231, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7324, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5740378343118069, | |
| "grad_norm": 0.5310035054090366, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7311, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6001304631441617, | |
| "grad_norm": 0.5616827688931022, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7286, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6262230919765166, | |
| "grad_norm": 0.6811648368494987, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7291, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6523157208088715, | |
| "grad_norm": 0.7733201970691932, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7276, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6784083496412263, | |
| "grad_norm": 0.6192873191830528, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7272, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7045009784735812, | |
| "grad_norm": 0.8068236017156095, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7249, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.730593607305936, | |
| "grad_norm": 0.8150623115239886, | |
| "learning_rate": 5e-06, | |
| "loss": 0.728, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7566862361382909, | |
| "grad_norm": 0.724911564188439, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7264, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7827788649706457, | |
| "grad_norm": 0.6359822287899792, | |
| "learning_rate": 5e-06, | |
| "loss": 0.723, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8088714938030006, | |
| "grad_norm": 0.5527636979800452, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7232, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8349641226353555, | |
| "grad_norm": 0.499616870483793, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7222, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8610567514677103, | |
| "grad_norm": 0.5494308637029189, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7174, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8871493803000652, | |
| "grad_norm": 0.5742968590351128, | |
| "learning_rate": 5e-06, | |
| "loss": 0.719, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.91324200913242, | |
| "grad_norm": 0.48431807220252887, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7206, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9393346379647749, | |
| "grad_norm": 0.4853602229330187, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7178, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9654272667971298, | |
| "grad_norm": 0.501484126053991, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7204, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9915198956294846, | |
| "grad_norm": 0.5012192063059191, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7179, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.0176125244618395, | |
| "grad_norm": 0.9524671132017011, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7451, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0437051532941943, | |
| "grad_norm": 0.524028424392148, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6803, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0697977821265492, | |
| "grad_norm": 0.5407200294759114, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6793, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.095890410958904, | |
| "grad_norm": 0.5857287578156233, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6811, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.121983039791259, | |
| "grad_norm": 0.5280070429598902, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6807, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.1480756686236138, | |
| "grad_norm": 0.4857639927609277, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6776, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1741682974559686, | |
| "grad_norm": 0.5945511698424941, | |
| "learning_rate": 5e-06, | |
| "loss": 0.68, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.2002609262883235, | |
| "grad_norm": 0.589811732149492, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6831, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.2263535551206783, | |
| "grad_norm": 0.5534380098945336, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6821, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.2524461839530332, | |
| "grad_norm": 0.5126525326289294, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6798, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.278538812785388, | |
| "grad_norm": 0.6254986498705398, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6797, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.304631441617743, | |
| "grad_norm": 0.49035273013831415, | |
| "learning_rate": 5e-06, | |
| "loss": 0.683, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.3307240704500978, | |
| "grad_norm": 0.5403945589242636, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6769, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.3568166992824526, | |
| "grad_norm": 0.5501681060254049, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6807, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.3829093281148075, | |
| "grad_norm": 0.6772693119372506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6806, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.4090019569471623, | |
| "grad_norm": 0.5937727651699936, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6797, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.4350945857795172, | |
| "grad_norm": 0.5931432683082334, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6769, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.461187214611872, | |
| "grad_norm": 0.6044257552885106, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6821, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.487279843444227, | |
| "grad_norm": 0.5487167986355509, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6779, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.5133724722765818, | |
| "grad_norm": 0.5046084173932228, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6797, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.5394651011089366, | |
| "grad_norm": 0.588333435552722, | |
| "learning_rate": 5e-06, | |
| "loss": 0.681, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.5655577299412915, | |
| "grad_norm": 0.5857191319203284, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6821, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5916503587736464, | |
| "grad_norm": 0.5197393978903909, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6815, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.6177429876060012, | |
| "grad_norm": 0.5171612878613464, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6786, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.643835616438356, | |
| "grad_norm": 0.566201693304294, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6793, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.669928245270711, | |
| "grad_norm": 0.5044017255239589, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6776, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.6960208741030658, | |
| "grad_norm": 0.478490790469601, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6754, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.7221135029354206, | |
| "grad_norm": 0.4963811798560764, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6798, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.7482061317677755, | |
| "grad_norm": 0.5278784589567511, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6768, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.7742987606001304, | |
| "grad_norm": 0.5387531965847336, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6795, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.8003913894324852, | |
| "grad_norm": 0.5875937914583538, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6781, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.82648401826484, | |
| "grad_norm": 0.5355905144164954, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6792, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.852576647097195, | |
| "grad_norm": 0.6338025629920409, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6821, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.8786692759295498, | |
| "grad_norm": 0.5291801310990664, | |
| "learning_rate": 5e-06, | |
| "loss": 0.677, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.9047619047619047, | |
| "grad_norm": 0.590990170736626, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6766, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.9308545335942595, | |
| "grad_norm": 0.5482515525172131, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6751, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.9569471624266144, | |
| "grad_norm": 0.620356822725272, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6765, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.9830397912589692, | |
| "grad_norm": 0.5642132027456009, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6776, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.009132420091324, | |
| "grad_norm": 0.7543688135393036, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7153, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.035225048923679, | |
| "grad_norm": 0.5581679413180287, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6374, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.061317677756034, | |
| "grad_norm": 0.6393564816609882, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.0874103065883887, | |
| "grad_norm": 0.614009057667238, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6402, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.1135029354207435, | |
| "grad_norm": 0.5346940357958085, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6407, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.1395955642530984, | |
| "grad_norm": 0.559106756693009, | |
| "learning_rate": 5e-06, | |
| "loss": 0.638, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.1656881930854532, | |
| "grad_norm": 0.603074822031344, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6363, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.191780821917808, | |
| "grad_norm": 0.6669696137807871, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6394, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.217873450750163, | |
| "grad_norm": 0.6196755868842616, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6407, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.243966079582518, | |
| "grad_norm": 0.5462481539172109, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6407, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.2700587084148727, | |
| "grad_norm": 0.5378190251307421, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6401, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.2961513372472275, | |
| "grad_norm": 0.5143159135941981, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6403, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.3222439660795824, | |
| "grad_norm": 0.514592343488391, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6412, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.3483365949119372, | |
| "grad_norm": 0.5516158921785709, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6434, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.374429223744292, | |
| "grad_norm": 0.669009619898686, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6386, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.400521852576647, | |
| "grad_norm": 0.6127707444182544, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6378, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.426614481409002, | |
| "grad_norm": 0.6147599175693709, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6411, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.4527071102413567, | |
| "grad_norm": 0.5183387934748579, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6431, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.4787997390737115, | |
| "grad_norm": 0.5481797536089282, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6415, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.5048923679060664, | |
| "grad_norm": 0.49281844671077113, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6416, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.5309849967384213, | |
| "grad_norm": 0.5590165679106437, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6413, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.557077625570776, | |
| "grad_norm": 0.4928056280476181, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6398, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.583170254403131, | |
| "grad_norm": 0.6836197415170101, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6458, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.609262883235486, | |
| "grad_norm": 0.5754142213185701, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6391, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.6353555120678407, | |
| "grad_norm": 0.5860168635077211, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6412, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.6614481409001955, | |
| "grad_norm": 0.7689811537018695, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6419, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.6875407697325504, | |
| "grad_norm": 0.7522119854978059, | |
| "learning_rate": 5e-06, | |
| "loss": 0.643, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.7136333985649053, | |
| "grad_norm": 0.5487355635946237, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6395, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.73972602739726, | |
| "grad_norm": 0.6331210725666454, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6409, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.765818656229615, | |
| "grad_norm": 0.5745683262495273, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6447, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.79191128506197, | |
| "grad_norm": 0.5686919776335425, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6396, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.8180039138943247, | |
| "grad_norm": 0.6015098918868974, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6414, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.8440965427266796, | |
| "grad_norm": 0.4915249083089861, | |
| "learning_rate": 5e-06, | |
| "loss": 0.641, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.8701891715590344, | |
| "grad_norm": 0.5923242648103054, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6418, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.8962818003913893, | |
| "grad_norm": 0.5453451599354968, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6435, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.922374429223744, | |
| "grad_norm": 0.6120521256139003, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6448, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.948467058056099, | |
| "grad_norm": 0.5414568129256186, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6423, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.974559686888454, | |
| "grad_norm": 0.5372353896161117, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6412, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.9980430528375734, | |
| "step": 1149, | |
| "total_flos": 1924199035699200.0, | |
| "train_loss": 0.6906216756689538, | |
| "train_runtime": 17455.8647, | |
| "train_samples_per_second": 33.704, | |
| "train_steps_per_second": 0.066 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1149, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1924199035699200.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |