| { | |
| "best_metric": 0.030188467353582382, | |
| "best_model_checkpoint": "saves/psy-course/MentaLLaMA-chat-7B/train/fold3/checkpoint-1950", | |
| "epoch": 4.9976479443033215, | |
| "eval_steps": 50, | |
| "global_step": 3320, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.015053156458744943, | |
| "grad_norm": 1.822564721107483, | |
| "learning_rate": 3.0120481927710846e-06, | |
| "loss": 1.7049, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.030106312917489886, | |
| "grad_norm": 2.226895570755005, | |
| "learning_rate": 6.024096385542169e-06, | |
| "loss": 1.6672, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04515946937623483, | |
| "grad_norm": 2.538222551345825, | |
| "learning_rate": 9.036144578313253e-06, | |
| "loss": 1.615, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06021262583497977, | |
| "grad_norm": 3.4118905067443848, | |
| "learning_rate": 1.2048192771084338e-05, | |
| "loss": 1.4092, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.07526578229372471, | |
| "grad_norm": 0.845978856086731, | |
| "learning_rate": 1.5060240963855424e-05, | |
| "loss": 0.8176, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07526578229372471, | |
| "eval_loss": 0.6253208518028259, | |
| "eval_runtime": 178.2787, | |
| "eval_samples_per_second": 6.624, | |
| "eval_steps_per_second": 6.624, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09031893875246966, | |
| "grad_norm": 0.6405627727508545, | |
| "learning_rate": 1.8072289156626505e-05, | |
| "loss": 0.5568, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1053720952112146, | |
| "grad_norm": 0.9309800267219543, | |
| "learning_rate": 2.1084337349397593e-05, | |
| "loss": 0.4914, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.12042525166995954, | |
| "grad_norm": 0.7481735348701477, | |
| "learning_rate": 2.4096385542168677e-05, | |
| "loss": 0.3461, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1354784081287045, | |
| "grad_norm": 0.840279757976532, | |
| "learning_rate": 2.7108433734939758e-05, | |
| "loss": 0.173, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.15053156458744943, | |
| "grad_norm": 0.6460654735565186, | |
| "learning_rate": 3.012048192771085e-05, | |
| "loss": 0.149, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.15053156458744943, | |
| "eval_loss": 0.12084457278251648, | |
| "eval_runtime": 178.2793, | |
| "eval_samples_per_second": 6.624, | |
| "eval_steps_per_second": 6.624, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16558472104619437, | |
| "grad_norm": 0.8036267757415771, | |
| "learning_rate": 3.313253012048193e-05, | |
| "loss": 0.1049, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.18063787750493931, | |
| "grad_norm": 0.4858762323856354, | |
| "learning_rate": 3.614457831325301e-05, | |
| "loss": 0.1118, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.19569103396368426, | |
| "grad_norm": 0.5866274833679199, | |
| "learning_rate": 3.91566265060241e-05, | |
| "loss": 0.091, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2107441904224292, | |
| "grad_norm": 0.6353292465209961, | |
| "learning_rate": 4.2168674698795186e-05, | |
| "loss": 0.0815, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.22579734688117414, | |
| "grad_norm": 0.8361664414405823, | |
| "learning_rate": 4.5180722891566266e-05, | |
| "loss": 0.0858, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22579734688117414, | |
| "eval_loss": 0.0719056949019432, | |
| "eval_runtime": 178.4648, | |
| "eval_samples_per_second": 6.618, | |
| "eval_steps_per_second": 6.618, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2408505033399191, | |
| "grad_norm": 0.5231442451477051, | |
| "learning_rate": 4.8192771084337354e-05, | |
| "loss": 0.076, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.25590365979866403, | |
| "grad_norm": 0.7356946468353271, | |
| "learning_rate": 5.120481927710844e-05, | |
| "loss": 0.0844, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.270956816257409, | |
| "grad_norm": 0.7905665636062622, | |
| "learning_rate": 5.4216867469879516e-05, | |
| "loss": 0.0734, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2860099727161539, | |
| "grad_norm": 0.9920386075973511, | |
| "learning_rate": 5.72289156626506e-05, | |
| "loss": 0.0687, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.30106312917489886, | |
| "grad_norm": 0.8998428583145142, | |
| "learning_rate": 6.02409638554217e-05, | |
| "loss": 0.0728, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.30106312917489886, | |
| "eval_loss": 0.056333187967538834, | |
| "eval_runtime": 178.4366, | |
| "eval_samples_per_second": 6.619, | |
| "eval_steps_per_second": 6.619, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3161162856336438, | |
| "grad_norm": 0.517510175704956, | |
| "learning_rate": 6.325301204819278e-05, | |
| "loss": 0.0601, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.33116944209238874, | |
| "grad_norm": 0.5527799725532532, | |
| "learning_rate": 6.626506024096386e-05, | |
| "loss": 0.0636, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3462225985511337, | |
| "grad_norm": 0.5699729919433594, | |
| "learning_rate": 6.927710843373494e-05, | |
| "loss": 0.0742, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.36127575500987863, | |
| "grad_norm": 0.4265763461589813, | |
| "learning_rate": 7.228915662650602e-05, | |
| "loss": 0.0485, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3763289114686236, | |
| "grad_norm": 0.8758322596549988, | |
| "learning_rate": 7.530120481927712e-05, | |
| "loss": 0.0574, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3763289114686236, | |
| "eval_loss": 0.04959677904844284, | |
| "eval_runtime": 178.4826, | |
| "eval_samples_per_second": 6.617, | |
| "eval_steps_per_second": 6.617, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3913820679273685, | |
| "grad_norm": 0.4359073340892792, | |
| "learning_rate": 7.83132530120482e-05, | |
| "loss": 0.0393, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.40643522438611346, | |
| "grad_norm": 0.4522893726825714, | |
| "learning_rate": 8.132530120481928e-05, | |
| "loss": 0.0538, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4214883808448584, | |
| "grad_norm": 0.531626284122467, | |
| "learning_rate": 8.433734939759037e-05, | |
| "loss": 0.0481, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.43654153730360334, | |
| "grad_norm": 0.26284611225128174, | |
| "learning_rate": 8.734939759036145e-05, | |
| "loss": 0.049, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4515946937623483, | |
| "grad_norm": 0.639433741569519, | |
| "learning_rate": 9.036144578313253e-05, | |
| "loss": 0.0603, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4515946937623483, | |
| "eval_loss": 0.05344177782535553, | |
| "eval_runtime": 178.3037, | |
| "eval_samples_per_second": 6.624, | |
| "eval_steps_per_second": 6.624, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.46664785022109323, | |
| "grad_norm": 0.23814865946769714, | |
| "learning_rate": 9.337349397590361e-05, | |
| "loss": 0.0443, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4817010066798382, | |
| "grad_norm": 0.8485608100891113, | |
| "learning_rate": 9.638554216867471e-05, | |
| "loss": 0.0691, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4967541631385831, | |
| "grad_norm": 0.7342369556427002, | |
| "learning_rate": 9.939759036144579e-05, | |
| "loss": 0.0535, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5118073195973281, | |
| "grad_norm": 0.5011388063430786, | |
| "learning_rate": 9.999823129264712e-05, | |
| "loss": 0.038, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.526860476056073, | |
| "grad_norm": 0.2543235421180725, | |
| "learning_rate": 9.999104613348688e-05, | |
| "loss": 0.0432, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.526860476056073, | |
| "eval_loss": 0.043458491563797, | |
| "eval_runtime": 178.3108, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.541913632514818, | |
| "grad_norm": 0.19088613986968994, | |
| "learning_rate": 9.997833477197385e-05, | |
| "loss": 0.054, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5569667889735629, | |
| "grad_norm": 0.4376489818096161, | |
| "learning_rate": 9.996009861327077e-05, | |
| "loss": 0.043, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5720199454323078, | |
| "grad_norm": 0.33153238892555237, | |
| "learning_rate": 9.993633967327269e-05, | |
| "loss": 0.0445, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5870731018910528, | |
| "grad_norm": 0.5583683252334595, | |
| "learning_rate": 9.990706057838416e-05, | |
| "loss": 0.04, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6021262583497977, | |
| "grad_norm": 0.4868285655975342, | |
| "learning_rate": 9.987226456522884e-05, | |
| "loss": 0.0506, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6021262583497977, | |
| "eval_loss": 0.04465359449386597, | |
| "eval_runtime": 178.2721, | |
| "eval_samples_per_second": 6.625, | |
| "eval_steps_per_second": 6.625, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6171794148085427, | |
| "grad_norm": 0.4056116044521332, | |
| "learning_rate": 9.983195548029173e-05, | |
| "loss": 0.0334, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6322325712672876, | |
| "grad_norm": 0.5368084907531738, | |
| "learning_rate": 9.9786137779494e-05, | |
| "loss": 0.0537, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6472857277260325, | |
| "grad_norm": 0.2836042046546936, | |
| "learning_rate": 9.973481652770038e-05, | |
| "loss": 0.0403, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6623388841847775, | |
| "grad_norm": 0.2168349325656891, | |
| "learning_rate": 9.967799739815925e-05, | |
| "loss": 0.0322, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6773920406435224, | |
| "grad_norm": 0.5306774973869324, | |
| "learning_rate": 9.961568667187556e-05, | |
| "loss": 0.0418, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6773920406435224, | |
| "eval_loss": 0.042699169367551804, | |
| "eval_runtime": 178.1761, | |
| "eval_samples_per_second": 6.628, | |
| "eval_steps_per_second": 6.628, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6924451971022674, | |
| "grad_norm": 0.5590943098068237, | |
| "learning_rate": 9.954789123691642e-05, | |
| "loss": 0.0423, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7074983535610123, | |
| "grad_norm": 0.46472442150115967, | |
| "learning_rate": 9.947461858764978e-05, | |
| "loss": 0.0383, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7225515100197573, | |
| "grad_norm": 0.307923823595047, | |
| "learning_rate": 9.939587682391586e-05, | |
| "loss": 0.0392, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7376046664785022, | |
| "grad_norm": 0.3933861255645752, | |
| "learning_rate": 9.931167465013182e-05, | |
| "loss": 0.0519, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7526578229372471, | |
| "grad_norm": 0.31609928607940674, | |
| "learning_rate": 9.922202137432955e-05, | |
| "loss": 0.0319, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7526578229372471, | |
| "eval_loss": 0.03804313763976097, | |
| "eval_runtime": 178.3743, | |
| "eval_samples_per_second": 6.621, | |
| "eval_steps_per_second": 6.621, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7677109793959921, | |
| "grad_norm": 0.2750873863697052, | |
| "learning_rate": 9.912692690712665e-05, | |
| "loss": 0.0356, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.782764135854737, | |
| "grad_norm": 0.4082382023334503, | |
| "learning_rate": 9.902640176063103e-05, | |
| "loss": 0.0426, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.797817292313482, | |
| "grad_norm": 0.2213507443666458, | |
| "learning_rate": 9.892045704727864e-05, | |
| "loss": 0.0579, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.8128704487722269, | |
| "grad_norm": 0.24513815343379974, | |
| "learning_rate": 9.880910447860527e-05, | |
| "loss": 0.0512, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.8279236052309719, | |
| "grad_norm": 0.33264800906181335, | |
| "learning_rate": 9.869235636395177e-05, | |
| "loss": 0.0409, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8279236052309719, | |
| "eval_loss": 0.03934917971491814, | |
| "eval_runtime": 178.3094, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8429767616897168, | |
| "grad_norm": 0.4701898694038391, | |
| "learning_rate": 9.857022560910338e-05, | |
| "loss": 0.0513, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8580299181484617, | |
| "grad_norm": 0.36034145951271057, | |
| "learning_rate": 9.844272571486311e-05, | |
| "loss": 0.0375, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8730830746072067, | |
| "grad_norm": 0.1605140119791031, | |
| "learning_rate": 9.830987077555924e-05, | |
| "loss": 0.0471, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8881362310659516, | |
| "grad_norm": 0.3974078893661499, | |
| "learning_rate": 9.817167547748729e-05, | |
| "loss": 0.0588, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.9031893875246966, | |
| "grad_norm": 0.21184875071048737, | |
| "learning_rate": 9.802815509728662e-05, | |
| "loss": 0.0289, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.9031893875246966, | |
| "eval_loss": 0.0372873954474926, | |
| "eval_runtime": 178.4025, | |
| "eval_samples_per_second": 6.62, | |
| "eval_steps_per_second": 6.62, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.9182425439834415, | |
| "grad_norm": 0.3172430992126465, | |
| "learning_rate": 9.787932550025158e-05, | |
| "loss": 0.0351, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.9332957004421865, | |
| "grad_norm": 0.25452977418899536, | |
| "learning_rate": 9.772520313857775e-05, | |
| "loss": 0.0354, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.9483488569009314, | |
| "grad_norm": 0.31180253624916077, | |
| "learning_rate": 9.756580504954334e-05, | |
| "loss": 0.0408, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9634020133596763, | |
| "grad_norm": 0.2877283990383148, | |
| "learning_rate": 9.740114885362562e-05, | |
| "loss": 0.0357, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.9784551698184213, | |
| "grad_norm": 0.31497249007225037, | |
| "learning_rate": 9.723125275255325e-05, | |
| "loss": 0.0301, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9784551698184213, | |
| "eval_loss": 0.03777872771024704, | |
| "eval_runtime": 178.3928, | |
| "eval_samples_per_second": 6.62, | |
| "eval_steps_per_second": 6.62, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9935083262771662, | |
| "grad_norm": 0.18348178267478943, | |
| "learning_rate": 9.705613552729415e-05, | |
| "loss": 0.0259, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.0085614827359113, | |
| "grad_norm": 0.15709549188613892, | |
| "learning_rate": 9.68758165359794e-05, | |
| "loss": 0.0335, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.0236146391946561, | |
| "grad_norm": 0.25934895873069763, | |
| "learning_rate": 9.669031571176322e-05, | |
| "loss": 0.0353, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.0386677956534012, | |
| "grad_norm": 0.21780580282211304, | |
| "learning_rate": 9.64996535606196e-05, | |
| "loss": 0.0318, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.053720952112146, | |
| "grad_norm": 0.14777664840221405, | |
| "learning_rate": 9.630385115907545e-05, | |
| "loss": 0.0289, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.053720952112146, | |
| "eval_loss": 0.03666107356548309, | |
| "eval_runtime": 178.3948, | |
| "eval_samples_per_second": 6.62, | |
| "eval_steps_per_second": 6.62, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.068774108570891, | |
| "grad_norm": 0.09680185467004776, | |
| "learning_rate": 9.610293015188067e-05, | |
| "loss": 0.0305, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.083827265029636, | |
| "grad_norm": 0.21849073469638824, | |
| "learning_rate": 9.589691274961556e-05, | |
| "loss": 0.0241, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.098880421488381, | |
| "grad_norm": 0.23137938976287842, | |
| "learning_rate": 9.568582172623544e-05, | |
| "loss": 0.0367, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.1139335779471258, | |
| "grad_norm": 0.38276636600494385, | |
| "learning_rate": 9.546968041655326e-05, | |
| "loss": 0.0302, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.1289867344058706, | |
| "grad_norm": 0.17088480293750763, | |
| "learning_rate": 9.524851271366001e-05, | |
| "loss": 0.0283, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.1289867344058706, | |
| "eval_loss": 0.03535303846001625, | |
| "eval_runtime": 178.355, | |
| "eval_samples_per_second": 6.622, | |
| "eval_steps_per_second": 6.622, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.1440398908646157, | |
| "grad_norm": 0.236710324883461, | |
| "learning_rate": 9.502234306628355e-05, | |
| "loss": 0.0295, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.1590930473233607, | |
| "grad_norm": 0.23621885478496552, | |
| "learning_rate": 9.47911964760858e-05, | |
| "loss": 0.0272, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.1741462037821055, | |
| "grad_norm": 0.41253966093063354, | |
| "learning_rate": 9.455509849489915e-05, | |
| "loss": 0.0254, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.1891993602408506, | |
| "grad_norm": 0.2661738991737366, | |
| "learning_rate": 9.431407522190175e-05, | |
| "loss": 0.0321, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.2042525166995954, | |
| "grad_norm": 0.26856687664985657, | |
| "learning_rate": 9.406815330073244e-05, | |
| "loss": 0.0338, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.2042525166995954, | |
| "eval_loss": 0.03521593660116196, | |
| "eval_runtime": 178.2893, | |
| "eval_samples_per_second": 6.624, | |
| "eval_steps_per_second": 6.624, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.2193056731583405, | |
| "grad_norm": 0.15742696821689606, | |
| "learning_rate": 9.381735991654546e-05, | |
| "loss": 0.03, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.2343588296170853, | |
| "grad_norm": 0.38911351561546326, | |
| "learning_rate": 9.356172279300528e-05, | |
| "loss": 0.0352, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.2494119860758304, | |
| "grad_norm": 0.12868359684944153, | |
| "learning_rate": 9.330127018922194e-05, | |
| "loss": 0.0296, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.2644651425345752, | |
| "grad_norm": 0.1884261518716812, | |
| "learning_rate": 9.303603089662716e-05, | |
| "loss": 0.0368, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.2795182989933203, | |
| "grad_norm": 0.14614073932170868, | |
| "learning_rate": 9.276603423579164e-05, | |
| "loss": 0.0286, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.2795182989933203, | |
| "eval_loss": 0.035069651901721954, | |
| "eval_runtime": 178.3662, | |
| "eval_samples_per_second": 6.621, | |
| "eval_steps_per_second": 6.621, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.294571455452065, | |
| "grad_norm": 0.33718934655189514, | |
| "learning_rate": 9.249131005318387e-05, | |
| "loss": 0.0412, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.30962461191081, | |
| "grad_norm": 0.1532343029975891, | |
| "learning_rate": 9.221188871787075e-05, | |
| "loss": 0.0327, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.324677768369555, | |
| "grad_norm": 0.16133852303028107, | |
| "learning_rate": 9.192780111816047e-05, | |
| "loss": 0.0297, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.3397309248283, | |
| "grad_norm": 0.19326288998126984, | |
| "learning_rate": 9.163907865818806e-05, | |
| "loss": 0.0304, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.3547840812870449, | |
| "grad_norm": 0.2810550332069397, | |
| "learning_rate": 9.134575325444376e-05, | |
| "loss": 0.0396, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.3547840812870449, | |
| "eval_loss": 0.0357198566198349, | |
| "eval_runtime": 178.4266, | |
| "eval_samples_per_second": 6.619, | |
| "eval_steps_per_second": 6.619, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.3698372377457897, | |
| "grad_norm": 0.24054096639156342, | |
| "learning_rate": 9.104785733224496e-05, | |
| "loss": 0.0297, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.3848903942045347, | |
| "grad_norm": 0.1945008933544159, | |
| "learning_rate": 9.07454238221517e-05, | |
| "loss": 0.0297, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.3999435506632798, | |
| "grad_norm": 0.22676967084407806, | |
| "learning_rate": 9.043848615632642e-05, | |
| "loss": 0.0258, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.4149967071220246, | |
| "grad_norm": 0.12921719253063202, | |
| "learning_rate": 9.012707826483823e-05, | |
| "loss": 0.0331, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.4300498635807695, | |
| "grad_norm": 0.2984769940376282, | |
| "learning_rate": 8.98112345719122e-05, | |
| "loss": 0.0286, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.4300498635807695, | |
| "eval_loss": 0.0343579463660717, | |
| "eval_runtime": 178.3834, | |
| "eval_samples_per_second": 6.621, | |
| "eval_steps_per_second": 6.621, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.4451030200395145, | |
| "grad_norm": 0.5328255891799927, | |
| "learning_rate": 8.949098999212391e-05, | |
| "loss": 0.0325, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.4601561764982596, | |
| "grad_norm": 0.22663827240467072, | |
| "learning_rate": 8.916637992653991e-05, | |
| "loss": 0.025, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.4752093329570044, | |
| "grad_norm": 0.31026363372802734, | |
| "learning_rate": 8.883744025880428e-05, | |
| "loss": 0.0322, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.4902624894157492, | |
| "grad_norm": 0.1781584918498993, | |
| "learning_rate": 8.850420735117202e-05, | |
| "loss": 0.0239, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.5053156458744943, | |
| "grad_norm": 0.25284937024116516, | |
| "learning_rate": 8.816671804048933e-05, | |
| "loss": 0.0276, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.5053156458744943, | |
| "eval_loss": 0.03388373181223869, | |
| "eval_runtime": 178.2776, | |
| "eval_samples_per_second": 6.624, | |
| "eval_steps_per_second": 6.624, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.5203688023332393, | |
| "grad_norm": 0.18928562104701996, | |
| "learning_rate": 8.782500963412156e-05, | |
| "loss": 0.0317, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.5354219587919842, | |
| "grad_norm": 0.24282586574554443, | |
| "learning_rate": 8.747911990582912e-05, | |
| "loss": 0.0249, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.550475115250729, | |
| "grad_norm": 0.1898348331451416, | |
| "learning_rate": 8.712908709159183e-05, | |
| "loss": 0.025, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.565528271709474, | |
| "grad_norm": 0.46804437041282654, | |
| "learning_rate": 8.677494988538211e-05, | |
| "loss": 0.034, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.5805814281682191, | |
| "grad_norm": 0.21851754188537598, | |
| "learning_rate": 8.641674743488769e-05, | |
| "loss": 0.0269, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.5805814281682191, | |
| "eval_loss": 0.03368181735277176, | |
| "eval_runtime": 178.3246, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.595634584626964, | |
| "grad_norm": 0.3493017256259918, | |
| "learning_rate": 8.605451933718397e-05, | |
| "loss": 0.028, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.6106877410857088, | |
| "grad_norm": 0.3040076792240143, | |
| "learning_rate": 8.568830563435694e-05, | |
| "loss": 0.0347, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.6257408975444538, | |
| "grad_norm": 0.2517918646335602, | |
| "learning_rate": 8.531814680907664e-05, | |
| "loss": 0.0212, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.6407940540031989, | |
| "grad_norm": 0.28657108545303345, | |
| "learning_rate": 8.494408378012209e-05, | |
| "loss": 0.0253, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.6558472104619437, | |
| "grad_norm": 0.12799641489982605, | |
| "learning_rate": 8.456615789785804e-05, | |
| "loss": 0.0202, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.6558472104619437, | |
| "eval_loss": 0.033622123301029205, | |
| "eval_runtime": 178.3151, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.6709003669206886, | |
| "grad_norm": 0.220402330160141, | |
| "learning_rate": 8.418441093966385e-05, | |
| "loss": 0.0368, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.6859535233794336, | |
| "grad_norm": 0.2065258026123047, | |
| "learning_rate": 8.379888510531535e-05, | |
| "loss": 0.0253, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.7010066798381787, | |
| "grad_norm": 0.2068171501159668, | |
| "learning_rate": 8.340962301231981e-05, | |
| "loss": 0.0214, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.7160598362969235, | |
| "grad_norm": 0.12344514578580856, | |
| "learning_rate": 8.301666769120488e-05, | |
| "loss": 0.0291, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.7311129927556683, | |
| "grad_norm": 0.11504557728767395, | |
| "learning_rate": 8.262006258076187e-05, | |
| "loss": 0.0364, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.7311129927556683, | |
| "eval_loss": 0.03163878992199898, | |
| "eval_runtime": 178.2999, | |
| "eval_samples_per_second": 6.624, | |
| "eval_steps_per_second": 6.624, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.7461661492144134, | |
| "grad_norm": 0.2136172354221344, | |
| "learning_rate": 8.221985152324385e-05, | |
| "loss": 0.0284, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.7612193056731584, | |
| "grad_norm": 0.3486640453338623, | |
| "learning_rate": 8.18160787595191e-05, | |
| "loss": 0.0373, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.7762724621319033, | |
| "grad_norm": 0.2501416802406311, | |
| "learning_rate": 8.14087889241806e-05, | |
| "loss": 0.0265, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.791325618590648, | |
| "grad_norm": 0.21038861572742462, | |
| "learning_rate": 8.099802704061195e-05, | |
| "loss": 0.0337, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.8063787750493931, | |
| "grad_norm": 0.3405999541282654, | |
| "learning_rate": 8.058383851601027e-05, | |
| "loss": 0.0265, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.8063787750493931, | |
| "eval_loss": 0.032966770231723785, | |
| "eval_runtime": 178.2353, | |
| "eval_samples_per_second": 6.626, | |
| "eval_steps_per_second": 6.626, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.8214319315081382, | |
| "grad_norm": 0.5199443101882935, | |
| "learning_rate": 8.01662691363668e-05, | |
| "loss": 0.029, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.836485087966883, | |
| "grad_norm": 0.35030198097229004, | |
| "learning_rate": 7.974536506140547e-05, | |
| "loss": 0.0333, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.8515382444256279, | |
| "grad_norm": 0.27158239483833313, | |
| "learning_rate": 7.932117281948021e-05, | |
| "loss": 0.0294, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.866591400884373, | |
| "grad_norm": 0.18654458224773407, | |
| "learning_rate": 7.889373930243164e-05, | |
| "loss": 0.032, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.881644557343118, | |
| "grad_norm": 0.13273607194423676, | |
| "learning_rate": 7.846311176040331e-05, | |
| "loss": 0.0291, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.881644557343118, | |
| "eval_loss": 0.03071535751223564, | |
| "eval_runtime": 178.2583, | |
| "eval_samples_per_second": 6.625, | |
| "eval_steps_per_second": 6.625, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.8966977138018628, | |
| "grad_norm": 0.1273525208234787, | |
| "learning_rate": 7.802933779661859e-05, | |
| "loss": 0.0361, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.9117508702606076, | |
| "grad_norm": 0.09461209923028946, | |
| "learning_rate": 7.759246536211844e-05, | |
| "loss": 0.0251, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.9268040267193527, | |
| "grad_norm": 0.10459864139556885, | |
| "learning_rate": 7.715254275046062e-05, | |
| "loss": 0.0308, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.9418571831780977, | |
| "grad_norm": 0.21300184726715088, | |
| "learning_rate": 7.670961859238124e-05, | |
| "loss": 0.0272, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.9569103396368426, | |
| "grad_norm": 0.47472086548805237, | |
| "learning_rate": 7.626374185041886e-05, | |
| "loss": 0.0244, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.9569103396368426, | |
| "eval_loss": 0.031091291457414627, | |
| "eval_runtime": 178.3155, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.9719634960955874, | |
| "grad_norm": 0.29737257957458496, | |
| "learning_rate": 7.581496181350203e-05, | |
| "loss": 0.0255, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.9870166525543325, | |
| "grad_norm": 0.18417982757091522, | |
| "learning_rate": 7.536332809150067e-05, | |
| "loss": 0.0258, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.0020698090130775, | |
| "grad_norm": 0.27828750014305115, | |
| "learning_rate": 7.490889060974201e-05, | |
| "loss": 0.0272, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.0171229654718226, | |
| "grad_norm": 0.194544717669487, | |
| "learning_rate": 7.445169960349167e-05, | |
| "loss": 0.015, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.032176121930567, | |
| "grad_norm": 0.21474245190620422, | |
| "learning_rate": 7.399180561240044e-05, | |
| "loss": 0.0197, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.032176121930567, | |
| "eval_loss": 0.031300000846385956, | |
| "eval_runtime": 178.2318, | |
| "eval_samples_per_second": 6.626, | |
| "eval_steps_per_second": 6.626, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.0472292783893122, | |
| "grad_norm": 0.10497045516967773, | |
| "learning_rate": 7.352925947491746e-05, | |
| "loss": 0.0118, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.0622824348480573, | |
| "grad_norm": 0.32971543073654175, | |
| "learning_rate": 7.306411232267029e-05, | |
| "loss": 0.0199, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.0773355913068023, | |
| "grad_norm": 0.20883502066135406, | |
| "learning_rate": 7.259641557481269e-05, | |
| "loss": 0.0221, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.092388747765547, | |
| "grad_norm": 0.12358664721250534, | |
| "learning_rate": 7.212622093234049e-05, | |
| "loss": 0.0153, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.107441904224292, | |
| "grad_norm": 0.2762002646923065, | |
| "learning_rate": 7.165358037237643e-05, | |
| "loss": 0.0223, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.107441904224292, | |
| "eval_loss": 0.03041531704366207, | |
| "eval_runtime": 178.1406, | |
| "eval_samples_per_second": 6.63, | |
| "eval_steps_per_second": 6.63, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.122495060683037, | |
| "grad_norm": 0.21662060916423798, | |
| "learning_rate": 7.117854614242434e-05, | |
| "loss": 0.0208, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.137548217141782, | |
| "grad_norm": 0.2806037366390228, | |
| "learning_rate": 7.070117075459352e-05, | |
| "loss": 0.0203, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.1526013736005267, | |
| "grad_norm": 0.17611578106880188, | |
| "learning_rate": 7.022150697979384e-05, | |
| "loss": 0.0226, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.167654530059272, | |
| "grad_norm": 0.22745737433433533, | |
| "learning_rate": 6.973960784190237e-05, | |
| "loss": 0.023, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.182707686518017, | |
| "grad_norm": 0.2118409425020218, | |
| "learning_rate": 6.925552661190166e-05, | |
| "loss": 0.0197, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.182707686518017, | |
| "eval_loss": 0.03106178157031536, | |
| "eval_runtime": 178.1235, | |
| "eval_samples_per_second": 6.63, | |
| "eval_steps_per_second": 6.63, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.197760842976762, | |
| "grad_norm": 0.1830424964427948, | |
| "learning_rate": 6.876931680199121e-05, | |
| "loss": 0.0156, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.2128139994355065, | |
| "grad_norm": 0.10300840437412262, | |
| "learning_rate": 6.828103215967186e-05, | |
| "loss": 0.0203, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.2278671558942515, | |
| "grad_norm": 0.17760959267616272, | |
| "learning_rate": 6.779072666180446e-05, | |
| "loss": 0.0234, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.2429203123529966, | |
| "grad_norm": 0.05315260961651802, | |
| "learning_rate": 6.729845450864294e-05, | |
| "loss": 0.0159, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.257973468811741, | |
| "grad_norm": 0.21539124846458435, | |
| "learning_rate": 6.680427011784292e-05, | |
| "loss": 0.0204, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.257973468811741, | |
| "eval_loss": 0.0323634073138237, | |
| "eval_runtime": 178.1357, | |
| "eval_samples_per_second": 6.63, | |
| "eval_steps_per_second": 6.63, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.2730266252704863, | |
| "grad_norm": 0.24310541152954102, | |
| "learning_rate": 6.630822811844604e-05, | |
| "loss": 0.017, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.2880797817292313, | |
| "grad_norm": 0.1696529984474182, | |
| "learning_rate": 6.58103833448412e-05, | |
| "loss": 0.0222, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.3031329381879764, | |
| "grad_norm": 0.3020496666431427, | |
| "learning_rate": 6.531079083070288e-05, | |
| "loss": 0.0234, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.3181860946467214, | |
| "grad_norm": 0.12914994359016418, | |
| "learning_rate": 6.480950580290752e-05, | |
| "loss": 0.017, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.333239251105466, | |
| "grad_norm": 0.18893304467201233, | |
| "learning_rate": 6.430658367542843e-05, | |
| "loss": 0.0276, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.333239251105466, | |
| "eval_loss": 0.031088994815945625, | |
| "eval_runtime": 178.1568, | |
| "eval_samples_per_second": 6.629, | |
| "eval_steps_per_second": 6.629, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.348292407564211, | |
| "grad_norm": 0.08729825168848038, | |
| "learning_rate": 6.380208004321036e-05, | |
| "loss": 0.0247, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.363345564022956, | |
| "grad_norm": 0.12183941900730133, | |
| "learning_rate": 6.32960506760236e-05, | |
| "loss": 0.0181, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.378398720481701, | |
| "grad_norm": 0.2509489953517914, | |
| "learning_rate": 6.278855151229901e-05, | |
| "loss": 0.0158, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.393451876940446, | |
| "grad_norm": 0.0914265587925911, | |
| "learning_rate": 6.227963865294444e-05, | |
| "loss": 0.0168, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.408505033399191, | |
| "grad_norm": 0.11629947274923325, | |
| "learning_rate": 6.176936835514312e-05, | |
| "loss": 0.0146, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.408505033399191, | |
| "eval_loss": 0.031647298485040665, | |
| "eval_runtime": 178.063, | |
| "eval_samples_per_second": 6.632, | |
| "eval_steps_per_second": 6.632, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.423558189857936, | |
| "grad_norm": 0.2622014582157135, | |
| "learning_rate": 6.125779702613471e-05, | |
| "loss": 0.0176, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.438611346316681, | |
| "grad_norm": 0.177884042263031, | |
| "learning_rate": 6.074498121697983e-05, | |
| "loss": 0.0207, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.4536645027754256, | |
| "grad_norm": 0.20466935634613037, | |
| "learning_rate": 6.023097761630879e-05, | |
| "loss": 0.0254, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.4687176592341706, | |
| "grad_norm": 0.1373012661933899, | |
| "learning_rate": 5.971584304405489e-05, | |
| "loss": 0.0164, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.4837708156929157, | |
| "grad_norm": 0.23096352815628052, | |
| "learning_rate": 5.919963444517338e-05, | |
| "loss": 0.0165, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.4837708156929157, | |
| "eval_loss": 0.03288305178284645, | |
| "eval_runtime": 178.0988, | |
| "eval_samples_per_second": 6.631, | |
| "eval_steps_per_second": 6.631, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.4988239721516607, | |
| "grad_norm": 0.2017652541399002, | |
| "learning_rate": 5.868240888334653e-05, | |
| "loss": 0.0181, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.5138771286104054, | |
| "grad_norm": 0.07514988631010056, | |
| "learning_rate": 5.816422353467562e-05, | |
| "loss": 0.0148, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.5289302850691504, | |
| "grad_norm": 0.11077038943767548, | |
| "learning_rate": 5.7645135681360496e-05, | |
| "loss": 0.0183, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.5439834415278955, | |
| "grad_norm": 0.24594064056873322, | |
| "learning_rate": 5.7125202705367234e-05, | |
| "loss": 0.0228, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.5590365979866405, | |
| "grad_norm": 0.2879762649536133, | |
| "learning_rate": 5.660448208208513e-05, | |
| "loss": 0.0242, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.5590365979866405, | |
| "eval_loss": 0.03144305199384689, | |
| "eval_runtime": 178.0564, | |
| "eval_samples_per_second": 6.633, | |
| "eval_steps_per_second": 6.633, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.574089754445385, | |
| "grad_norm": 0.060580622404813766, | |
| "learning_rate": 5.608303137397294e-05, | |
| "loss": 0.0156, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.58914291090413, | |
| "grad_norm": 0.13122287392616272, | |
| "learning_rate": 5.5560908224195886e-05, | |
| "loss": 0.0218, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.6041960673628752, | |
| "grad_norm": 0.2391199916601181, | |
| "learning_rate": 5.503817035025342e-05, | |
| "loss": 0.0187, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.61924922382162, | |
| "grad_norm": 0.2503703236579895, | |
| "learning_rate": 5.4514875537598985e-05, | |
| "loss": 0.0143, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.634302380280365, | |
| "grad_norm": 0.2618468105792999, | |
| "learning_rate": 5.399108163325217e-05, | |
| "loss": 0.0273, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.634302380280365, | |
| "eval_loss": 0.032075975090265274, | |
| "eval_runtime": 178.055, | |
| "eval_samples_per_second": 6.633, | |
| "eval_steps_per_second": 6.633, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.64935553673911, | |
| "grad_norm": 0.15629079937934875, | |
| "learning_rate": 5.346684653940408e-05, | |
| "loss": 0.0217, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.664408693197855, | |
| "grad_norm": 0.09079688042402267, | |
| "learning_rate": 5.294222820701661e-05, | |
| "loss": 0.0204, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.6794618496566, | |
| "grad_norm": 0.29432734847068787, | |
| "learning_rate": 5.24172846294163e-05, | |
| "loss": 0.0148, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.6945150061153447, | |
| "grad_norm": 0.17906472086906433, | |
| "learning_rate": 5.1892073835883524e-05, | |
| "loss": 0.0189, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.7095681625740897, | |
| "grad_norm": 0.37694233655929565, | |
| "learning_rate": 5.136665388523778e-05, | |
| "loss": 0.0204, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.7095681625740897, | |
| "eval_loss": 0.03231995552778244, | |
| "eval_runtime": 177.9391, | |
| "eval_samples_per_second": 6.637, | |
| "eval_steps_per_second": 6.637, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.7246213190328348, | |
| "grad_norm": 0.15220922231674194, | |
| "learning_rate": 5.0841082859419585e-05, | |
| "loss": 0.0243, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.7396744754915794, | |
| "grad_norm": 0.1869482398033142, | |
| "learning_rate": 5.031541885706987e-05, | |
| "loss": 0.0215, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.7547276319503244, | |
| "grad_norm": 0.12930846214294434, | |
| "learning_rate": 4.9789719987107545e-05, | |
| "loss": 0.0223, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.7697807884090695, | |
| "grad_norm": 0.4493294060230255, | |
| "learning_rate": 4.926404436230596e-05, | |
| "loss": 0.0201, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.7848339448678145, | |
| "grad_norm": 0.3657067120075226, | |
| "learning_rate": 4.8738450092868785e-05, | |
| "loss": 0.021, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.7848339448678145, | |
| "eval_loss": 0.03066716156899929, | |
| "eval_runtime": 177.9619, | |
| "eval_samples_per_second": 6.636, | |
| "eval_steps_per_second": 6.636, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.7998871013265596, | |
| "grad_norm": 0.35673412680625916, | |
| "learning_rate": 4.8212995280006426e-05, | |
| "loss": 0.0186, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.814940257785304, | |
| "grad_norm": 0.20455271005630493, | |
| "learning_rate": 4.76877380095132e-05, | |
| "loss": 0.0183, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.8299934142440493, | |
| "grad_norm": 0.25493428111076355, | |
| "learning_rate": 4.7162736345346303e-05, | |
| "loss": 0.0149, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.8450465707027943, | |
| "grad_norm": 0.25855717062950134, | |
| "learning_rate": 4.663804832320726e-05, | |
| "loss": 0.0134, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.860099727161539, | |
| "grad_norm": 0.16750505566596985, | |
| "learning_rate": 4.6113731944126406e-05, | |
| "loss": 0.0195, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.860099727161539, | |
| "eval_loss": 0.033523257821798325, | |
| "eval_runtime": 178.3199, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.875152883620284, | |
| "grad_norm": 0.17557500302791595, | |
| "learning_rate": 4.558984516805118e-05, | |
| "loss": 0.0177, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.890206040079029, | |
| "grad_norm": 0.32916489243507385, | |
| "learning_rate": 4.5066445907439104e-05, | |
| "loss": 0.0132, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.905259196537774, | |
| "grad_norm": 0.3346385359764099, | |
| "learning_rate": 4.454359202085582e-05, | |
| "loss": 0.0233, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.920312352996519, | |
| "grad_norm": 0.5354165434837341, | |
| "learning_rate": 4.402134130657925e-05, | |
| "loss": 0.0163, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.9353655094552638, | |
| "grad_norm": 0.24359606206417084, | |
| "learning_rate": 4.349975149621039e-05, | |
| "loss": 0.0221, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.9353655094552638, | |
| "eval_loss": 0.030188467353582382, | |
| "eval_runtime": 177.9643, | |
| "eval_samples_per_second": 6.636, | |
| "eval_steps_per_second": 6.636, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.950418665914009, | |
| "grad_norm": 0.11338020116090775, | |
| "learning_rate": 4.297888024829126e-05, | |
| "loss": 0.0163, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.965471822372754, | |
| "grad_norm": 0.1252482682466507, | |
| "learning_rate": 4.2458785141931314e-05, | |
| "loss": 0.0181, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.9805249788314985, | |
| "grad_norm": 0.0980340763926506, | |
| "learning_rate": 4.1939523670442316e-05, | |
| "loss": 0.0171, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.9955781352902435, | |
| "grad_norm": 0.1608283817768097, | |
| "learning_rate": 4.14211532349828e-05, | |
| "loss": 0.0177, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.0106312917489886, | |
| "grad_norm": 0.07570704817771912, | |
| "learning_rate": 4.090373113821281e-05, | |
| "loss": 0.0129, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.0106312917489886, | |
| "eval_loss": 0.03050847351551056, | |
| "eval_runtime": 177.9373, | |
| "eval_samples_per_second": 6.637, | |
| "eval_steps_per_second": 6.637, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.0256844482077336, | |
| "grad_norm": 0.15597139298915863, | |
| "learning_rate": 4.0387314577959315e-05, | |
| "loss": 0.0113, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.0407376046664787, | |
| "grad_norm": 0.055645011365413666, | |
| "learning_rate": 3.987196064089346e-05, | |
| "loss": 0.0091, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.0557907611252233, | |
| "grad_norm": 0.10545477271080017, | |
| "learning_rate": 3.935772629621995e-05, | |
| "loss": 0.0079, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.0708439175839684, | |
| "grad_norm": 0.14946284890174866, | |
| "learning_rate": 3.8844668389379396e-05, | |
| "loss": 0.0102, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.0858970740427134, | |
| "grad_norm": 0.2742091119289398, | |
| "learning_rate": 3.833284363576447e-05, | |
| "loss": 0.0127, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.0858970740427134, | |
| "eval_loss": 0.03302207961678505, | |
| "eval_runtime": 178.0745, | |
| "eval_samples_per_second": 6.632, | |
| "eval_steps_per_second": 6.632, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.1009502305014585, | |
| "grad_norm": 0.19018790125846863, | |
| "learning_rate": 3.7822308614450406e-05, | |
| "loss": 0.0096, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.116003386960203, | |
| "grad_norm": 0.04711037501692772, | |
| "learning_rate": 3.7313119761940375e-05, | |
| "loss": 0.0142, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.131056543418948, | |
| "grad_norm": 0.22224950790405273, | |
| "learning_rate": 3.680533336592694e-05, | |
| "loss": 0.0091, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.146109699877693, | |
| "grad_norm": 0.09899937361478806, | |
| "learning_rate": 3.62990055590697e-05, | |
| "loss": 0.0101, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.1611628563364382, | |
| "grad_norm": 0.09017643332481384, | |
| "learning_rate": 3.579419231279023e-05, | |
| "loss": 0.0122, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.1611628563364382, | |
| "eval_loss": 0.03305284306406975, | |
| "eval_runtime": 178.3274, | |
| "eval_samples_per_second": 6.623, | |
| "eval_steps_per_second": 6.623, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.176216012795183, | |
| "grad_norm": 0.12764662504196167, | |
| "learning_rate": 3.529094943108475e-05, | |
| "loss": 0.0085, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.191269169253928, | |
| "grad_norm": 0.12719255685806274, | |
| "learning_rate": 3.478933254435534e-05, | |
| "loss": 0.0144, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.206322325712673, | |
| "grad_norm": 0.164920374751091, | |
| "learning_rate": 3.4289397103260346e-05, | |
| "loss": 0.0125, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.221375482171418, | |
| "grad_norm": 0.22153641283512115, | |
| "learning_rate": 3.3791198372584664e-05, | |
| "loss": 0.0129, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.2364286386301626, | |
| "grad_norm": 0.11729541420936584, | |
| "learning_rate": 3.329479142513051e-05, | |
| "loss": 0.0129, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.2364286386301626, | |
| "eval_loss": 0.03324244171380997, | |
| "eval_runtime": 178.3386, | |
| "eval_samples_per_second": 6.622, | |
| "eval_steps_per_second": 6.622, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.2514817950889077, | |
| "grad_norm": 0.344887912273407, | |
| "learning_rate": 3.280023113562957e-05, | |
| "loss": 0.0136, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.2665349515476527, | |
| "grad_norm": 0.1558815985918045, | |
| "learning_rate": 3.230757217467677e-05, | |
| "loss": 0.0171, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.2815881080063978, | |
| "grad_norm": 0.19974495470523834, | |
| "learning_rate": 3.1816869002686936e-05, | |
| "loss": 0.0108, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.2966412644651424, | |
| "grad_norm": 0.13594548404216766, | |
| "learning_rate": 3.1328175863874464e-05, | |
| "loss": 0.0086, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.3116944209238874, | |
| "grad_norm": 0.21001030504703522, | |
| "learning_rate": 3.084154678025692e-05, | |
| "loss": 0.0125, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.3116944209238874, | |
| "eval_loss": 0.03326999396085739, | |
| "eval_runtime": 178.6788, | |
| "eval_samples_per_second": 6.61, | |
| "eval_steps_per_second": 6.61, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.3267475773826325, | |
| "grad_norm": 0.2704913914203644, | |
| "learning_rate": 3.035703554568331e-05, | |
| "loss": 0.0126, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.3418007338413775, | |
| "grad_norm": 0.1079394668340683, | |
| "learning_rate": 2.9874695719887464e-05, | |
| "loss": 0.0109, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.356853890300122, | |
| "grad_norm": 0.3655363917350769, | |
| "learning_rate": 2.9394580622567312e-05, | |
| "loss": 0.0076, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.371907046758867, | |
| "grad_norm": 0.09557830542325974, | |
| "learning_rate": 2.8916743327490803e-05, | |
| "loss": 0.0116, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 3.3869602032176123, | |
| "grad_norm": 0.22106143832206726, | |
| "learning_rate": 2.8441236656628828e-05, | |
| "loss": 0.0132, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.3869602032176123, | |
| "eval_loss": 0.03426986560225487, | |
| "eval_runtime": 179.2113, | |
| "eval_samples_per_second": 6.59, | |
| "eval_steps_per_second": 6.59, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.4020133596763573, | |
| "grad_norm": 0.1936042606830597, | |
| "learning_rate": 2.79681131743161e-05, | |
| "loss": 0.0064, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 3.417066516135102, | |
| "grad_norm": 0.18822342157363892, | |
| "learning_rate": 2.7497425181440607e-05, | |
| "loss": 0.0131, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 3.432119672593847, | |
| "grad_norm": 0.13977766036987305, | |
| "learning_rate": 2.702922470966187e-05, | |
| "loss": 0.0067, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 3.447172829052592, | |
| "grad_norm": 0.1867881566286087, | |
| "learning_rate": 2.6563563515659306e-05, | |
| "loss": 0.0104, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 3.462225985511337, | |
| "grad_norm": 0.035174448043107986, | |
| "learning_rate": 2.6100493075410848e-05, | |
| "loss": 0.0084, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.462225985511337, | |
| "eval_loss": 0.03515166416764259, | |
| "eval_runtime": 179.2383, | |
| "eval_samples_per_second": 6.589, | |
| "eval_steps_per_second": 6.589, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.4772791419700817, | |
| "grad_norm": 0.3701817989349365, | |
| "learning_rate": 2.5640064578502497e-05, | |
| "loss": 0.0153, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 3.4923322984288268, | |
| "grad_norm": 0.25794854760169983, | |
| "learning_rate": 2.5182328922469723e-05, | |
| "loss": 0.0108, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 3.507385454887572, | |
| "grad_norm": 0.29802244901657104, | |
| "learning_rate": 2.4727336707170973e-05, | |
| "loss": 0.0102, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 3.522438611346317, | |
| "grad_norm": 0.10990750789642334, | |
| "learning_rate": 2.427513822919424e-05, | |
| "loss": 0.0115, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 3.5374917678050615, | |
| "grad_norm": 0.282993346452713, | |
| "learning_rate": 2.3825783476297087e-05, | |
| "loss": 0.0086, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.5374917678050615, | |
| "eval_loss": 0.03473927080631256, | |
| "eval_runtime": 179.3483, | |
| "eval_samples_per_second": 6.585, | |
| "eval_steps_per_second": 6.585, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.5525449242638065, | |
| "grad_norm": 0.27041804790496826, | |
| "learning_rate": 2.337932212188073e-05, | |
| "loss": 0.0109, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 3.5675980807225516, | |
| "grad_norm": 0.2581341862678528, | |
| "learning_rate": 2.2935803519499e-05, | |
| "loss": 0.0122, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 3.582651237181296, | |
| "grad_norm": 0.13770440220832825, | |
| "learning_rate": 2.2495276697402662e-05, | |
| "loss": 0.0083, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 3.5977043936400412, | |
| "grad_norm": 0.13981565833091736, | |
| "learning_rate": 2.2057790353119535e-05, | |
| "loss": 0.0105, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 3.6127575500987863, | |
| "grad_norm": 0.27494239807128906, | |
| "learning_rate": 2.1623392848071354e-05, | |
| "loss": 0.0112, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.6127575500987863, | |
| "eval_loss": 0.03513345867395401, | |
| "eval_runtime": 179.5449, | |
| "eval_samples_per_second": 6.578, | |
| "eval_steps_per_second": 6.578, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.6278107065575314, | |
| "grad_norm": 0.18222074210643768, | |
| "learning_rate": 2.1192132202227677e-05, | |
| "loss": 0.0103, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 3.6428638630162764, | |
| "grad_norm": 0.27348509430885315, | |
| "learning_rate": 2.0764056088797645e-05, | |
| "loss": 0.0096, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 3.657917019475021, | |
| "grad_norm": 0.39703041315078735, | |
| "learning_rate": 2.0339211828959904e-05, | |
| "loss": 0.0098, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 3.672970175933766, | |
| "grad_norm": 0.21773886680603027, | |
| "learning_rate": 1.9917646386631577e-05, | |
| "loss": 0.0094, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 3.688023332392511, | |
| "grad_norm": 0.16702532768249512, | |
| "learning_rate": 1.949940636327671e-05, | |
| "loss": 0.012, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.688023332392511, | |
| "eval_loss": 0.03648751974105835, | |
| "eval_runtime": 179.7341, | |
| "eval_samples_per_second": 6.571, | |
| "eval_steps_per_second": 6.571, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.7030764888512557, | |
| "grad_norm": 0.32012760639190674, | |
| "learning_rate": 1.9084537992754792e-05, | |
| "loss": 0.01, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 3.718129645310001, | |
| "grad_norm": 0.24596737325191498, | |
| "learning_rate": 1.8673087136209803e-05, | |
| "loss": 0.0076, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 3.733182801768746, | |
| "grad_norm": 0.07540785521268845, | |
| "learning_rate": 1.8265099277000614e-05, | |
| "loss": 0.0067, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 3.748235958227491, | |
| "grad_norm": 0.2560862600803375, | |
| "learning_rate": 1.7860619515673033e-05, | |
| "loss": 0.0107, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 3.763289114686236, | |
| "grad_norm": 0.09346114844083786, | |
| "learning_rate": 1.7459692564974316e-05, | |
| "loss": 0.011, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.763289114686236, | |
| "eval_loss": 0.03612372279167175, | |
| "eval_runtime": 179.868, | |
| "eval_samples_per_second": 6.566, | |
| "eval_steps_per_second": 6.566, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.7783422711449806, | |
| "grad_norm": 0.07731559872627258, | |
| "learning_rate": 1.7062362744910322e-05, | |
| "loss": 0.0059, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 3.7933954276037256, | |
| "grad_norm": 0.35534602403640747, | |
| "learning_rate": 1.6668673977846254e-05, | |
| "loss": 0.0122, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 3.8084485840624707, | |
| "grad_norm": 0.4245568513870239, | |
| "learning_rate": 1.6278669783651395e-05, | |
| "loss": 0.0089, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 3.8235017405212153, | |
| "grad_norm": 0.0772927924990654, | |
| "learning_rate": 1.589239327488812e-05, | |
| "loss": 0.0051, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 3.8385548969799603, | |
| "grad_norm": 0.1755327433347702, | |
| "learning_rate": 1.5509887152046137e-05, | |
| "loss": 0.0079, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 3.8385548969799603, | |
| "eval_loss": 0.03893940895795822, | |
| "eval_runtime": 180.0653, | |
| "eval_samples_per_second": 6.559, | |
| "eval_steps_per_second": 6.559, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 3.8536080534387054, | |
| "grad_norm": 0.23755531013011932, | |
| "learning_rate": 1.5131193698822232e-05, | |
| "loss": 0.0137, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 3.8686612098974504, | |
| "grad_norm": 0.21640031039714813, | |
| "learning_rate": 1.4756354777446001e-05, | |
| "loss": 0.0089, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 3.8837143663561955, | |
| "grad_norm": 0.3311944305896759, | |
| "learning_rate": 1.4385411824052342e-05, | |
| "loss": 0.0093, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 3.89876752281494, | |
| "grad_norm": 0.2809070646762848, | |
| "learning_rate": 1.4018405844100812e-05, | |
| "loss": 0.0123, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 3.913820679273685, | |
| "grad_norm": 0.19817812740802765, | |
| "learning_rate": 1.3655377407842812e-05, | |
| "loss": 0.0089, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.913820679273685, | |
| "eval_loss": 0.03675345331430435, | |
| "eval_runtime": 180.0963, | |
| "eval_samples_per_second": 6.558, | |
| "eval_steps_per_second": 6.558, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.92887383573243, | |
| "grad_norm": 0.15485577285289764, | |
| "learning_rate": 1.3296366645836822e-05, | |
| "loss": 0.0074, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 3.943926992191175, | |
| "grad_norm": 0.2657122313976288, | |
| "learning_rate": 1.2941413244512113e-05, | |
| "loss": 0.0128, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 3.95898014864992, | |
| "grad_norm": 0.3065131902694702, | |
| "learning_rate": 1.2590556441781725e-05, | |
| "loss": 0.0077, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 3.974033305108665, | |
| "grad_norm": 0.487378865480423, | |
| "learning_rate": 1.2243835022705003e-05, | |
| "loss": 0.0115, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 3.98908646156741, | |
| "grad_norm": 0.25239965319633484, | |
| "learning_rate": 1.1901287315199977e-05, | |
| "loss": 0.009, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 3.98908646156741, | |
| "eval_loss": 0.03644752502441406, | |
| "eval_runtime": 180.2417, | |
| "eval_samples_per_second": 6.552, | |
| "eval_steps_per_second": 6.552, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.004139618026155, | |
| "grad_norm": 0.24973592162132263, | |
| "learning_rate": 1.1562951185806676e-05, | |
| "loss": 0.0078, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 4.0191927744849, | |
| "grad_norm": 0.11293565481901169, | |
| "learning_rate": 1.1228864035501069e-05, | |
| "loss": 0.0064, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 4.034245930943645, | |
| "grad_norm": 0.24446244537830353, | |
| "learning_rate": 1.0899062795560573e-05, | |
| "loss": 0.0042, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 4.04929908740239, | |
| "grad_norm": 0.29575204849243164, | |
| "learning_rate": 1.0573583923481711e-05, | |
| "loss": 0.0087, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 4.064352243861134, | |
| "grad_norm": 0.18624891340732574, | |
| "learning_rate": 1.0252463398949792e-05, | |
| "loss": 0.0086, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.064352243861134, | |
| "eval_loss": 0.037868741899728775, | |
| "eval_runtime": 180.3651, | |
| "eval_samples_per_second": 6.548, | |
| "eval_steps_per_second": 6.548, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.07940540031988, | |
| "grad_norm": 0.10995467007160187, | |
| "learning_rate": 9.935736719861622e-06, | |
| "loss": 0.0053, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 4.0944585567786245, | |
| "grad_norm": 0.09725473076105118, | |
| "learning_rate": 9.62343889840151e-06, | |
| "loss": 0.0051, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 4.109511713237369, | |
| "grad_norm": 0.036554962396621704, | |
| "learning_rate": 9.315604457170768e-06, | |
| "loss": 0.0033, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 4.124564869696115, | |
| "grad_norm": 0.10706653445959091, | |
| "learning_rate": 9.012267425371513e-06, | |
| "loss": 0.0053, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 4.139618026154859, | |
| "grad_norm": 0.11202232539653778, | |
| "learning_rate": 8.71346133504498e-06, | |
| "loss": 0.005, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.139618026154859, | |
| "eval_loss": 0.03884652629494667, | |
| "eval_runtime": 180.4257, | |
| "eval_samples_per_second": 6.546, | |
| "eval_steps_per_second": 6.546, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.154671182613605, | |
| "grad_norm": 0.05641934648156166, | |
| "learning_rate": 8.419219217364654e-06, | |
| "loss": 0.0037, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 4.169724339072349, | |
| "grad_norm": 0.17391756176948547, | |
| "learning_rate": 8.129573598984997e-06, | |
| "loss": 0.0054, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 4.184777495531094, | |
| "grad_norm": 0.08039785176515579, | |
| "learning_rate": 7.844556498445788e-06, | |
| "loss": 0.0046, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 4.199830651989839, | |
| "grad_norm": 0.24674569070339203, | |
| "learning_rate": 7.564199422632579e-06, | |
| "loss": 0.0067, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.214883808448584, | |
| "grad_norm": 0.05541248247027397, | |
| "learning_rate": 7.288533363293959e-06, | |
| "loss": 0.0051, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.214883808448584, | |
| "eval_loss": 0.0397312231361866, | |
| "eval_runtime": 180.5412, | |
| "eval_samples_per_second": 6.541, | |
| "eval_steps_per_second": 6.541, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.229936964907329, | |
| "grad_norm": 0.30977103114128113, | |
| "learning_rate": 7.017588793615498e-06, | |
| "loss": 0.004, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.244990121366074, | |
| "grad_norm": 0.145084947347641, | |
| "learning_rate": 6.751395664851135e-06, | |
| "loss": 0.0038, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 4.260043277824819, | |
| "grad_norm": 0.1771937906742096, | |
| "learning_rate": 6.489983403012312e-06, | |
| "loss": 0.0047, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 4.275096434283564, | |
| "grad_norm": 0.30625805258750916, | |
| "learning_rate": 6.233380905615049e-06, | |
| "loss": 0.0042, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 4.290149590742309, | |
| "grad_norm": 0.15609732270240784, | |
| "learning_rate": 5.981616538485496e-06, | |
| "loss": 0.0055, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 4.290149590742309, | |
| "eval_loss": 0.04038526862859726, | |
| "eval_runtime": 180.0932, | |
| "eval_samples_per_second": 6.558, | |
| "eval_steps_per_second": 6.558, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 4.3052027472010534, | |
| "grad_norm": 0.150795578956604, | |
| "learning_rate": 5.73471813262435e-06, | |
| "loss": 0.0035, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 4.320255903659799, | |
| "grad_norm": 0.26626893877983093, | |
| "learning_rate": 5.4927129811301715e-06, | |
| "loss": 0.0035, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 4.335309060118544, | |
| "grad_norm": 0.042985837906599045, | |
| "learning_rate": 5.255627836182453e-06, | |
| "loss": 0.0024, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 4.350362216577288, | |
| "grad_norm": 0.2361251413822174, | |
| "learning_rate": 5.0234889060842176e-06, | |
| "loss": 0.0056, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 4.365415373036034, | |
| "grad_norm": 0.12534025311470032, | |
| "learning_rate": 4.796321852364877e-06, | |
| "loss": 0.0083, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 4.365415373036034, | |
| "eval_loss": 0.04092538729310036, | |
| "eval_runtime": 180.6762, | |
| "eval_samples_per_second": 6.537, | |
| "eval_steps_per_second": 6.537, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 4.380468529494778, | |
| "grad_norm": 0.10099391639232635, | |
| "learning_rate": 4.5741517869435706e-06, | |
| "loss": 0.0033, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 4.395521685953524, | |
| "grad_norm": 0.037613652646541595, | |
| "learning_rate": 4.357003269353105e-06, | |
| "loss": 0.0035, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 4.410574842412268, | |
| "grad_norm": 0.10834306478500366, | |
| "learning_rate": 4.144900304025101e-06, | |
| "loss": 0.0057, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 4.425627998871013, | |
| "grad_norm": 0.4370887577533722, | |
| "learning_rate": 3.937866337636459e-06, | |
| "loss": 0.0077, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 4.4406811553297585, | |
| "grad_norm": 0.2118665874004364, | |
| "learning_rate": 3.7359242565174423e-06, | |
| "loss": 0.0055, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 4.4406811553297585, | |
| "eval_loss": 0.04117078334093094, | |
| "eval_runtime": 180.7007, | |
| "eval_samples_per_second": 6.536, | |
| "eval_steps_per_second": 6.536, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 4.455734311788503, | |
| "grad_norm": 0.11663982272148132, | |
| "learning_rate": 3.539096384121743e-06, | |
| "loss": 0.005, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 4.470787468247248, | |
| "grad_norm": 0.1609174907207489, | |
| "learning_rate": 3.34740447855878e-06, | |
| "loss": 0.0025, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 4.485840624705993, | |
| "grad_norm": 0.2631778419017792, | |
| "learning_rate": 3.160869730188465e-06, | |
| "loss": 0.0046, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 4.500893781164738, | |
| "grad_norm": 0.05932645872235298, | |
| "learning_rate": 2.9795127592787186e-06, | |
| "loss": 0.0054, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 4.515946937623482, | |
| "grad_norm": 0.5128821134567261, | |
| "learning_rate": 2.803353613726056e-06, | |
| "loss": 0.0056, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 4.515946937623482, | |
| "eval_loss": 0.041392985731363297, | |
| "eval_runtime": 180.7978, | |
| "eval_samples_per_second": 6.532, | |
| "eval_steps_per_second": 6.532, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 4.531000094082228, | |
| "grad_norm": 0.12355174124240875, | |
| "learning_rate": 2.6324117668393877e-06, | |
| "loss": 0.0044, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 4.5460532505409725, | |
| "grad_norm": 0.32771605253219604, | |
| "learning_rate": 2.466706115187406e-06, | |
| "loss": 0.0029, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 4.561106406999718, | |
| "grad_norm": 0.15814469754695892, | |
| "learning_rate": 2.3062549765096364e-06, | |
| "loss": 0.0028, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 4.576159563458463, | |
| "grad_norm": 0.20907792448997498, | |
| "learning_rate": 2.1510760876915505e-06, | |
| "loss": 0.0071, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 4.591212719917207, | |
| "grad_norm": 0.3278658986091614, | |
| "learning_rate": 2.0011866028038617e-06, | |
| "loss": 0.01, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 4.591212719917207, | |
| "eval_loss": 0.04168102145195007, | |
| "eval_runtime": 180.8128, | |
| "eval_samples_per_second": 6.532, | |
| "eval_steps_per_second": 6.532, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 4.606265876375953, | |
| "grad_norm": 0.22570809721946716, | |
| "learning_rate": 1.8566030912062549e-06, | |
| "loss": 0.0058, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 4.621319032834697, | |
| "grad_norm": 0.134377583861351, | |
| "learning_rate": 1.717341535715733e-06, | |
| "loss": 0.0034, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 4.636372189293443, | |
| "grad_norm": 0.11749345809221268, | |
| "learning_rate": 1.5834173308397982e-06, | |
| "loss": 0.0045, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 4.6514253457521875, | |
| "grad_norm": 0.0782150849699974, | |
| "learning_rate": 1.4548452810747403e-06, | |
| "loss": 0.0064, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 4.666478502210932, | |
| "grad_norm": 0.13278253376483917, | |
| "learning_rate": 1.33163959926903e-06, | |
| "loss": 0.0041, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 4.666478502210932, | |
| "eval_loss": 0.041555680334568024, | |
| "eval_runtime": 180.8052, | |
| "eval_samples_per_second": 6.532, | |
| "eval_steps_per_second": 6.532, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 4.681531658669678, | |
| "grad_norm": 0.06097458675503731, | |
| "learning_rate": 1.2138139050522023e-06, | |
| "loss": 0.0046, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 4.696584815128422, | |
| "grad_norm": 0.0928138941526413, | |
| "learning_rate": 1.101381223329301e-06, | |
| "loss": 0.0042, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 4.711637971587168, | |
| "grad_norm": 0.11098574846982956, | |
| "learning_rate": 9.943539828410342e-07, | |
| "loss": 0.0041, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 4.726691128045912, | |
| "grad_norm": 0.02670971117913723, | |
| "learning_rate": 8.927440147898702e-07, | |
| "loss": 0.0053, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 4.741744284504657, | |
| "grad_norm": 0.16498462855815887, | |
| "learning_rate": 7.96562551532154e-07, | |
| "loss": 0.0046, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 4.741744284504657, | |
| "eval_loss": 0.041612498462200165, | |
| "eval_runtime": 180.8943, | |
| "eval_samples_per_second": 6.529, | |
| "eval_steps_per_second": 6.529, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 4.756797440963402, | |
| "grad_norm": 0.02326858416199684, | |
| "learning_rate": 7.05820225336451e-07, | |
| "loss": 0.0034, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 4.771850597422147, | |
| "grad_norm": 0.03126712515950203, | |
| "learning_rate": 6.20527067208232e-07, | |
| "loss": 0.0028, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 4.786903753880892, | |
| "grad_norm": 0.09830054640769958, | |
| "learning_rate": 5.406925057809653e-07, | |
| "loss": 0.0042, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 4.801956910339637, | |
| "grad_norm": 0.10579635947942734, | |
| "learning_rate": 4.6632536627386756e-07, | |
| "loss": 0.0054, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 4.817010066798382, | |
| "grad_norm": 0.15730389952659607, | |
| "learning_rate": 3.974338695163393e-07, | |
| "loss": 0.0029, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 4.817010066798382, | |
| "eval_loss": 0.04158816114068031, | |
| "eval_runtime": 180.8757, | |
| "eval_samples_per_second": 6.529, | |
| "eval_steps_per_second": 6.529, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 4.832063223257126, | |
| "grad_norm": 0.15249422192573547, | |
| "learning_rate": 3.3402563103916984e-07, | |
| "loss": 0.003, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 4.847116379715872, | |
| "grad_norm": 0.06642192602157593, | |
| "learning_rate": 2.7610766023271615e-07, | |
| "loss": 0.0032, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 4.8621695361746164, | |
| "grad_norm": 0.17876122891902924, | |
| "learning_rate": 2.2368635957205618e-07, | |
| "loss": 0.0049, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 4.877222692633362, | |
| "grad_norm": 0.2662655711174011, | |
| "learning_rate": 1.7676752390920482e-07, | |
| "loss": 0.0067, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 4.8922758490921066, | |
| "grad_norm": 0.3073679208755493, | |
| "learning_rate": 1.3535633983257078e-07, | |
| "loss": 0.0071, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 4.8922758490921066, | |
| "eval_loss": 0.04167798534035683, | |
| "eval_runtime": 180.9807, | |
| "eval_samples_per_second": 6.526, | |
| "eval_steps_per_second": 6.526, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 4.907329005550851, | |
| "grad_norm": 0.07359470427036285, | |
| "learning_rate": 9.945738509358205e-08, | |
| "loss": 0.0026, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 4.922382162009597, | |
| "grad_norm": 0.0783749371767044, | |
| "learning_rate": 6.907462810065158e-08, | |
| "loss": 0.0037, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 4.937435318468341, | |
| "grad_norm": 0.2075500339269638, | |
| "learning_rate": 4.4211427480500554e-08, | |
| "loss": 0.0079, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 4.952488474927087, | |
| "grad_norm": 0.035787180066108704, | |
| "learning_rate": 2.4870531706872034e-08, | |
| "loss": 0.0042, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 4.967541631385831, | |
| "grad_norm": 0.28970831632614136, | |
| "learning_rate": 1.105407879670728e-08, | |
| "loss": 0.0039, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 4.967541631385831, | |
| "eval_loss": 0.041562460362911224, | |
| "eval_runtime": 180.9472, | |
| "eval_samples_per_second": 6.527, | |
| "eval_steps_per_second": 6.527, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 4.982594787844576, | |
| "grad_norm": 0.1511860340833664, | |
| "learning_rate": 2.763596073807051e-09, | |
| "loss": 0.0041, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 4.9976479443033215, | |
| "grad_norm": 0.369547039270401, | |
| "learning_rate": 0.0, | |
| "loss": 0.0045, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 4.9976479443033215, | |
| "step": 3320, | |
| "total_flos": 8.981559806648648e+17, | |
| "train_loss": 0.048681382634620886, | |
| "train_runtime": 37834.6736, | |
| "train_samples_per_second": 1.405, | |
| "train_steps_per_second": 0.088 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3320, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.981559806648648e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
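The object above is the standard layout the Hugging Face `Trainer` writes out: `log_history` interleaves training entries (carrying `loss`, `learning_rate`, `grad_norm` every `logging_steps` = 10 steps) with evaluation entries (carrying `eval_loss`), followed by a final run summary and the trainer configuration fields. As a minimal sketch of how this file can be consumed — assuming it is saved locally as `trainer_state.json`; the file path, the `loss_curves.png` output name, and the use of matplotlib are illustrative choices, not part of the original run:

```python
import json

import matplotlib.pyplot as plt

# Load the trainer state produced by the Hugging Face Trainer.
# NOTE: the filename is an assumption for illustration; adjust to your checkpoint path.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training entries (which have "loss") and evaluation entries
# (which have "eval_loss"); the final run summary has neither and is filtered out.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train_log], [e["loss"] for e in train_log], label="train loss")
plt.plot([e["step"] for e in eval_log], [e["eval_loss"] for e in eval_log], label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.yscale("log")  # the loss spans several orders of magnitude, so a log scale reads better
plt.legend()
plt.savefig("loss_curves.png", dpi=150)

# Recompute the evaluation step with the lowest eval_loss from the log
# (the Trainer also tracks this separately as the best checkpoint).
best = min(eval_log, key=lambda e: e["eval_loss"])
print(f"lowest eval_loss {best['eval_loss']:.4f} at step {best['step']}")
```

Filtering on the presence of `loss` versus `eval_loss` is the simplest way to separate the two streams, since the Trainer emits them as distinct dictionaries rather than as separate arrays.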